vp9/encoder/vp9_rdopt.c
1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
11 #include <assert.h>
12 #include <math.h>
14 #include "./vp9_rtcd.h"
16 #include "vpx_mem/vpx_mem.h"
17 #include "vpx_ports/mem.h"
19 #include "vp9/common/vp9_common.h"
20 #include "vp9/common/vp9_entropy.h"
21 #include "vp9/common/vp9_entropymode.h"
22 #include "vp9/common/vp9_idct.h"
23 #include "vp9/common/vp9_mvref_common.h"
24 #include "vp9/common/vp9_pred_common.h"
25 #include "vp9/common/vp9_quant_common.h"
26 #include "vp9/common/vp9_reconinter.h"
27 #include "vp9/common/vp9_reconintra.h"
28 #include "vp9/common/vp9_scan.h"
29 #include "vp9/common/vp9_seg_common.h"
30 #include "vp9/common/vp9_systemdependent.h"
32 #include "vp9/encoder/vp9_cost.h"
33 #include "vp9/encoder/vp9_encodemb.h"
34 #include "vp9/encoder/vp9_encodemv.h"
35 #include "vp9/encoder/vp9_encoder.h"
36 #include "vp9/encoder/vp9_mcomp.h"
37 #include "vp9/encoder/vp9_quantize.h"
38 #include "vp9/encoder/vp9_ratectrl.h"
39 #include "vp9/encoder/vp9_rd.h"
40 #include "vp9/encoder/vp9_rdopt.h"
41 #include "vp9/encoder/vp9_variance.h"
42 #include "vp9/encoder/vp9_aq_variance.h"
44 #define LAST_FRAME_MODE_MASK ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
45 (1 << INTRA_FRAME))
46 #define GOLDEN_FRAME_MODE_MASK ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | \
47 (1 << INTRA_FRAME))
48 #define ALT_REF_MODE_MASK ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | \
49 (1 << INTRA_FRAME))
51 #define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | 0x01)
53 #define MIN_EARLY_TERM_INDEX 3
54 #define NEW_MV_DISCOUNT_FACTOR 8
56 typedef struct {
57 PREDICTION_MODE mode;
58 MV_REFERENCE_FRAME ref_frame[2];
59 } MODE_DEFINITION;
61 typedef struct {
62 MV_REFERENCE_FRAME ref_frame[2];
63 } REF_DEFINITION;
65 struct rdcost_block_args {
66 MACROBLOCK *x;
67 ENTROPY_CONTEXT t_above[16];
68 ENTROPY_CONTEXT t_left[16];
69 int rate;
70 int64_t dist;
71 int64_t sse;
72 int this_rate;
73 int64_t this_dist;
74 int64_t this_sse;
75 int64_t this_rd;
76 int64_t best_rd;
77 int skip;
78 int use_fast_coef_costing;
79 const scan_order *so;
};
82 #define LAST_NEW_MV_INDEX 6
83 static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
84 {NEARESTMV, {LAST_FRAME, NONE}},
85 {NEARESTMV, {ALTREF_FRAME, NONE}},
86 {NEARESTMV, {GOLDEN_FRAME, NONE}},
88 {DC_PRED, {INTRA_FRAME, NONE}},
90 {NEWMV, {LAST_FRAME, NONE}},
91 {NEWMV, {ALTREF_FRAME, NONE}},
92 {NEWMV, {GOLDEN_FRAME, NONE}},
94 {NEARMV, {LAST_FRAME, NONE}},
95 {NEARMV, {ALTREF_FRAME, NONE}},
96 {NEARMV, {GOLDEN_FRAME, NONE}},
98 {ZEROMV, {LAST_FRAME, NONE}},
99 {ZEROMV, {GOLDEN_FRAME, NONE}},
100 {ZEROMV, {ALTREF_FRAME, NONE}},
102 {NEARESTMV, {LAST_FRAME, ALTREF_FRAME}},
103 {NEARESTMV, {GOLDEN_FRAME, ALTREF_FRAME}},
105 {TM_PRED, {INTRA_FRAME, NONE}},
107 {NEARMV, {LAST_FRAME, ALTREF_FRAME}},
108 {NEWMV, {LAST_FRAME, ALTREF_FRAME}},
109 {NEARMV, {GOLDEN_FRAME, ALTREF_FRAME}},
110 {NEWMV, {GOLDEN_FRAME, ALTREF_FRAME}},
112 {ZEROMV, {LAST_FRAME, ALTREF_FRAME}},
113 {ZEROMV, {GOLDEN_FRAME, ALTREF_FRAME}},
115 {H_PRED, {INTRA_FRAME, NONE}},
116 {V_PRED, {INTRA_FRAME, NONE}},
117 {D135_PRED, {INTRA_FRAME, NONE}},
118 {D207_PRED, {INTRA_FRAME, NONE}},
119 {D153_PRED, {INTRA_FRAME, NONE}},
120 {D63_PRED, {INTRA_FRAME, NONE}},
121 {D117_PRED, {INTRA_FRAME, NONE}},
122 {D45_PRED, {INTRA_FRAME, NONE}},
};
125 static const REF_DEFINITION vp9_ref_order[MAX_REFS] = {
126 {{LAST_FRAME, NONE}},
127 {{GOLDEN_FRAME, NONE}},
128 {{ALTREF_FRAME, NONE}},
129 {{LAST_FRAME, ALTREF_FRAME}},
130 {{GOLDEN_FRAME, ALTREF_FRAME}},
131 {{INTRA_FRAME, NONE}},
};
134 static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
135 int m, int n, int min_plane, int max_plane) {
136 int i;
138 for (i = min_plane; i < max_plane; ++i) {
139 struct macroblock_plane *const p = &x->plane[i];
140 struct macroblockd_plane *const pd = &x->e_mbd.plane[i];
142 p->coeff = ctx->coeff_pbuf[i][m];
143 p->qcoeff = ctx->qcoeff_pbuf[i][m];
144 pd->dqcoeff = ctx->dqcoeff_pbuf[i][m];
145 p->eobs = ctx->eobs_pbuf[i][m];
147 ctx->coeff_pbuf[i][m] = ctx->coeff_pbuf[i][n];
148 ctx->qcoeff_pbuf[i][m] = ctx->qcoeff_pbuf[i][n];
149 ctx->dqcoeff_pbuf[i][m] = ctx->dqcoeff_pbuf[i][n];
150 ctx->eobs_pbuf[i][m] = ctx->eobs_pbuf[i][n];
152 ctx->coeff_pbuf[i][n] = p->coeff;
153 ctx->qcoeff_pbuf[i][n] = p->qcoeff;
154 ctx->dqcoeff_pbuf[i][n] = pd->dqcoeff;
155 ctx->eobs_pbuf[i][n] = p->eobs;
159 static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
160 MACROBLOCK *x, MACROBLOCKD *xd,
161 int *out_rate_sum, int64_t *out_dist_sum,
162 int *skip_txfm_sb, int64_t *skip_sse_sb) {
163 // Note our transform coeffs are 8 times an orthogonal transform.
164 // Hence quantizer step is also 8 times. To get effective quantizer
165 // we need to divide by 8 before sending to modeling function.
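// For example, with an AC dequantizer value of 64 in that 8x-scaled domain,
// the effective quantizer step handed to the model below is 64 >> 3 == 8.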
166 int i;
167 int64_t rate_sum = 0;
168 int64_t dist_sum = 0;
169 const int ref = xd->mi[0]->mbmi.ref_frame[0];
170 unsigned int sse;
171 unsigned int var = 0;
172 unsigned int sum_sse = 0;
173 int64_t total_sse = 0;
174 int skip_flag = 1;
175 const int shift = 6;
176 int rate;
177 int64_t dist;
179 x->pred_sse[ref] = 0;
181 for (i = 0; i < MAX_MB_PLANE; ++i) {
182 struct macroblock_plane *const p = &x->plane[i];
183 struct macroblockd_plane *const pd = &xd->plane[i];
184 const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
185 const TX_SIZE max_tx_size = max_txsize_lookup[bs];
186 const BLOCK_SIZE unit_size = txsize_to_bsize[max_tx_size];
187 const int64_t dc_thr = p->quant_thred[0] >> shift;
188 const int64_t ac_thr = p->quant_thred[1] >> shift;
189 // The low thresholds are used to measure if the prediction errors are
190 // low enough so that we can skip the mode search.
191 const int64_t low_dc_thr = MIN(50, dc_thr >> 2);
192 const int64_t low_ac_thr = MIN(80, ac_thr >> 2);
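// (dc_thr / ac_thr are the squared-error levels below which the DC / AC part
// of a transform unit is expected to quantize to zero; the low_* variants are
// the tighter margins that must also be met before the whole mode search may
// be skipped -- see the skip_txfm / low_err_skip logic below.)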
193 int bw = 1 << (b_width_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
194 int bh = 1 << (b_height_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
195 int idx, idy;
196 int lw = b_width_log2_lookup[unit_size] + 2;
197 int lh = b_height_log2_lookup[unit_size] + 2;
199 sum_sse = 0;
201 for (idy = 0; idy < bh; ++idy) {
202 for (idx = 0; idx < bw; ++idx) {
203 uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw);
204 uint8_t *dst = pd->dst.buf + (idy * pd->dst.stride << lh) + (idx << lh);
205 int block_idx = (idy << 1) + idx;
206 int low_err_skip = 0;
208 var = cpi->fn_ptr[unit_size].vf(src, p->src.stride,
209 dst, pd->dst.stride, &sse);
210 x->bsse[(i << 2) + block_idx] = sse;
211 sum_sse += sse;
213 x->skip_txfm[(i << 2) + block_idx] = 0;
214 if (!x->select_tx_size) {
215 // Check if all ac coefficients can be quantized to zero.
216 if (var < ac_thr || var == 0) {
217 x->skip_txfm[(i << 2) + block_idx] = 2;
219 // Check if dc coefficient can be quantized to zero.
220 if (sse - var < dc_thr || sse == var) {
221 x->skip_txfm[(i << 2) + block_idx] = 1;
223 if (!sse || (var < low_ac_thr && sse - var < low_dc_thr))
224 low_err_skip = 1;
229 if (skip_flag && !low_err_skip)
230 skip_flag = 0;
232 if (i == 0)
233 x->pred_sse[ref] += sse;
237 total_sse += sum_sse;
239 // Fast approximation of the modelling function.
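// Roughly: rate ~ sse * (280 - q) / 256 for q < 120 (0 otherwise) and
// dist ~ sse * q / 256, where q is the effective quantizer dequant[1] >> 3
// (further right-shifted for high bit depth), rather than calling
// vp9_model_rd_from_var_lapndz() as the slower path below does.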
240 if (cpi->oxcf.speed > 4) {
241 int64_t rate;
242 const int64_t square_error = sum_sse;
243 int quantizer = (pd->dequant[1] >> 3);
244 #if CONFIG_VP9_HIGHBITDEPTH
245 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
246 quantizer >>= (xd->bd - 8);
248 #endif // CONFIG_VP9_HIGHBITDEPTH
250 if (quantizer < 120)
251 rate = (square_error * (280 - quantizer)) >> 8;
252 else
253 rate = 0;
254 dist = (square_error * quantizer) >> 8;
255 rate_sum += rate;
256 dist_sum += dist;
257 } else {
258 #if CONFIG_VP9_HIGHBITDEPTH
259 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
260 vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
261 pd->dequant[1] >> (xd->bd - 5),
262 &rate, &dist);
263 } else {
264 vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
265 pd->dequant[1] >> 3, &rate, &dist);
267 #else
268 vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
269 pd->dequant[1] >> 3, &rate, &dist);
270 #endif // CONFIG_VP9_HIGHBITDEPTH
271 rate_sum += rate;
272 dist_sum += dist;
276 *skip_txfm_sb = skip_flag;
277 *skip_sse_sb = total_sse << 4;
278 *out_rate_sum = (int)rate_sum;
279 *out_dist_sum = dist_sum << 4;
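// Both the distortion and the skip SSE are returned scaled up by 16 (<< 4),
// presumably to match the pixel-domain distortion scale used by the callers;
// per-unit SSE and skip decisions were already stored in x->bsse[] and
// x->skip_txfm[] above.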
282 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
283 intptr_t block_size, int64_t *ssz) {
284 int i;
285 int64_t error = 0, sqcoeff = 0;
287 for (i = 0; i < block_size; i++) {
288 const int diff = coeff[i] - dqcoeff[i];
289 error += diff * diff;
290 sqcoeff += coeff[i] * coeff[i];
293 *ssz = sqcoeff;
294 return error;
297 int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
298 int block_size) {
299 int i;
300 int64_t error = 0;
302 for (i = 0; i < block_size; i++) {
303 const int diff = coeff[i] - dqcoeff[i];
304 error += diff * diff;
307 return error;
310 #if CONFIG_VP9_HIGHBITDEPTH
311 int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
312 const tran_low_t *dqcoeff,
313 intptr_t block_size,
314 int64_t *ssz, int bd) {
315 int i;
316 int64_t error = 0, sqcoeff = 0;
317 int shift = 2 * (bd - 8);
318 int rounding = shift > 0 ? 1 << (shift - 1) : 0;
320 for (i = 0; i < block_size; i++) {
321 const int64_t diff = coeff[i] - dqcoeff[i];
322 error += diff * diff;
323 sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
325 assert(error >= 0 && sqcoeff >= 0);
326 error = (error + rounding) >> shift;
327 sqcoeff = (sqcoeff + rounding) >> shift;
329 *ssz = sqcoeff;
330 return error;
332 #endif // CONFIG_VP9_HIGHBITDEPTH
334 /* The trailing '0' is a terminator which is used inside cost_coeffs() to
335 * decide whether to include the cost of a trailing EOB node or not (i.e. we
336 * can skip this if the last coefficient in this transform block, e.g. the
337 * 16th coefficient in a 4x4 block or the 64th coefficient in an 8x8 block,
338 * is non-zero). */
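/* For example, the TX_4X4 row below, { 1, 2, 3, 4, 3, 16 - 13, 0 }, splits
 * the 16 scan positions into bands of 1, 2, 3, 4, 3 and 3 coefficients, with
 * the trailing 0 acting as the terminator described above. */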
339 static const int16_t band_counts[TX_SIZES][8] = {
340 { 1, 2, 3, 4, 3, 16 - 13, 0 },
341 { 1, 2, 3, 4, 11, 64 - 21, 0 },
342 { 1, 2, 3, 4, 11, 256 - 21, 0 },
343 { 1, 2, 3, 4, 11, 1024 - 21, 0 },
};
345 static int cost_coeffs(MACROBLOCK *x,
346 int plane, int block,
347 ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
348 TX_SIZE tx_size,
349 const int16_t *scan, const int16_t *nb,
350 int use_fast_coef_costing) {
351 MACROBLOCKD *const xd = &x->e_mbd;
352 MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
353 const struct macroblock_plane *p = &x->plane[plane];
354 const struct macroblockd_plane *pd = &xd->plane[plane];
355 const PLANE_TYPE type = pd->plane_type;
356 const int16_t *band_count = &band_counts[tx_size][1];
357 const int eob = p->eobs[block];
358 const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
359 unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
360 x->token_costs[tx_size][type][is_inter_block(mbmi)];
361 uint8_t token_cache[32 * 32];
362 int pt = combine_entropy_contexts(*A, *L);
363 int c, cost;
364 #if CONFIG_VP9_HIGHBITDEPTH
365 const int16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
366 #else
367 const int16_t *cat6_high_cost = vp9_get_high_cost_table(8);
368 #endif
370 // Check for consistency of tx_size with mode info
371 assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
372 : get_uv_tx_size(mbmi, pd) == tx_size);
374 if (eob == 0) {
375 // single eob token
376 cost = token_costs[0][0][pt][EOB_TOKEN];
377 c = 0;
378 } else {
379 int band_left = *band_count++;
381 // dc token
382 int v = qcoeff[0];
383 int16_t prev_t;
384 EXTRABIT e;
385 vp9_get_token_extra(v, &prev_t, &e);
386 cost = (*token_costs)[0][pt][prev_t] +
387 vp9_get_cost(prev_t, e, cat6_high_cost);
389 token_cache[0] = vp9_pt_energy_class[prev_t];
390 ++token_costs;
392 // ac tokens
393 for (c = 1; c < eob; c++) {
394 const int rc = scan[c];
395 int16_t t;
397 v = qcoeff[rc];
398 vp9_get_token_extra(v, &t, &e);
399 if (use_fast_coef_costing) {
400 cost += (*token_costs)[!prev_t][!prev_t][t] +
401 vp9_get_cost(t, e, cat6_high_cost);
402 } else {
403 pt = get_coef_context(nb, token_cache, c);
404 cost += (*token_costs)[!prev_t][pt][t] +
405 vp9_get_cost(t, e, cat6_high_cost);
406 token_cache[rc] = vp9_pt_energy_class[t];
408 prev_t = t;
409 if (!--band_left) {
410 band_left = *band_count++;
411 ++token_costs;
415 // eob token
416 if (band_left) {
417 if (use_fast_coef_costing) {
418 cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
419 } else {
420 pt = get_coef_context(nb, token_cache, c);
421 cost += (*token_costs)[0][pt][EOB_TOKEN];
426 // Context update: is the eob past the first coefficient?
427 *A = *L = (c > 0);
429 return cost;
432 #if CONFIG_VP9_HIGHBITDEPTH
433 static void dist_block(int plane, int block, TX_SIZE tx_size,
434 struct rdcost_block_args* args, int bd) {
435 #else
436 static void dist_block(int plane, int block, TX_SIZE tx_size,
437 struct rdcost_block_args* args) {
438 #endif // CONFIG_VP9_HIGHBITDEPTH
439 const int ss_txfrm_size = tx_size << 1;
440 MACROBLOCK* const x = args->x;
441 MACROBLOCKD* const xd = &x->e_mbd;
442 const struct macroblock_plane *const p = &x->plane[plane];
443 const struct macroblockd_plane *const pd = &xd->plane[plane];
444 int64_t this_sse;
445 int shift = tx_size == TX_32X32 ? 0 : 2;
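// Note: only the sub-32x32 transform sizes have their coefficient-domain
// squared error divided by 4 (>> 2) here, presumably because the 32x32
// forward transform already applies extra down-scaling of its own.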
446 tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
447 tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
448 #if CONFIG_VP9_HIGHBITDEPTH
449 args->dist = vp9_highbd_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
450 &this_sse, bd) >> shift;
451 #else
452 args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
453 &this_sse) >> shift;
454 #endif // CONFIG_VP9_HIGHBITDEPTH
455 args->sse = this_sse >> shift;
457 if (x->skip_encode && !is_inter_block(&xd->mi[0]->mbmi)) {
458 // TODO(jingning): tune the model to better capture the distortion.
459 int64_t p = (pd->dequant[1] * pd->dequant[1] *
460 (1 << ss_txfrm_size)) >> (shift + 2);
461 #if CONFIG_VP9_HIGHBITDEPTH
462 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
463 p >>= ((xd->bd - 8) * 2);
465 #endif // CONFIG_VP9_HIGHBITDEPTH
466 args->dist += (p >> 4);
467 args->sse += p;
471 static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
472 TX_SIZE tx_size, struct rdcost_block_args* args) {
473 int x_idx, y_idx;
474 txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x_idx, &y_idx);
476 args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx,
477 args->t_left + y_idx, tx_size,
478 args->so->scan, args->so->neighbors,
479 args->use_fast_coef_costing);
482 static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
483 TX_SIZE tx_size, void *arg) {
484 struct rdcost_block_args *args = arg;
485 MACROBLOCK *const x = args->x;
486 MACROBLOCKD *const xd = &x->e_mbd;
487 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
488 int64_t rd1, rd2, rd;
490 if (args->skip)
491 return;
493 if (!is_inter_block(mbmi)) {
494 struct encode_b_args arg = {x, NULL, &mbmi->skip};
495 vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &arg);
496 #if CONFIG_VP9_HIGHBITDEPTH
497 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
498 dist_block(plane, block, tx_size, args, xd->bd);
499 } else {
500 dist_block(plane, block, tx_size, args, 8);
502 #else
503 dist_block(plane, block, tx_size, args);
504 #endif // CONFIG_VP9_HIGHBITDEPTH
505 } else if (max_txsize_lookup[plane_bsize] == tx_size) {
506 if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 0) {
507 // full forward transform and quantization
508 vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
509 #if CONFIG_VP9_HIGHBITDEPTH
510 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
511 dist_block(plane, block, tx_size, args, xd->bd);
512 } else {
513 dist_block(plane, block, tx_size, args, 8);
515 #else
516 dist_block(plane, block, tx_size, args);
517 #endif // CONFIG_VP9_HIGHBITDEPTH
518 } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) {
519 // compute DC coefficient
520 tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
521 tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
522 vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
523 args->sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
524 args->dist = args->sse;
525 if (x->plane[plane].eobs[block]) {
526 const int64_t orig_sse = (int64_t)coeff[0] * coeff[0];
527 const int64_t resd_sse = coeff[0] - dqcoeff[0];
528 int64_t dc_correct = orig_sse - resd_sse * resd_sse;
529 #if CONFIG_VP9_HIGHBITDEPTH
530 dc_correct >>= ((xd->bd - 8) * 2);
531 #endif
532 if (tx_size != TX_32X32)
533 dc_correct >>= 2;
535 args->dist = MAX(0, args->sse - dc_correct);
537 } else {
538 // skip forward transform
539 x->plane[plane].eobs[block] = 0;
540 args->sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
541 args->dist = args->sse;
543 } else {
544 // full forward transform and quantization
545 vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
546 #if CONFIG_VP9_HIGHBITDEPTH
547 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
548 dist_block(plane, block, tx_size, args, xd->bd);
549 } else {
550 dist_block(plane, block, tx_size, args, 8);
552 #else
553 dist_block(plane, block, tx_size, args);
554 #endif // CONFIG_VP9_HIGHBITDEPTH
557 rate_block(plane, block, plane_bsize, tx_size, args);
558 rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist);
559 rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse);
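// rd1 is the cost of actually coding this block (coefficient rate plus
// reconstruction distortion); rd2 is the cost of skipping it (no rate, but
// the full SSE as distortion). The cheaper of the two feeds the
// zero-coefficient decision below.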
561 // TODO(jingning): temporarily enabled only for luma component
562 rd = MIN(rd1, rd2);
563 if (plane == 0)
564 x->zcoeff_blk[tx_size][block] = !x->plane[plane].eobs[block] ||
565 (rd1 > rd2 && !xd->lossless);
567 args->this_rate += args->rate;
568 args->this_dist += args->dist;
569 args->this_sse += args->sse;
570 args->this_rd += rd;
572 if (args->this_rd > args->best_rd) {
573 args->skip = 1;
574 return;
578 static void txfm_rd_in_plane(MACROBLOCK *x,
579 int *rate, int64_t *distortion,
580 int *skippable, int64_t *sse,
581 int64_t ref_best_rd, int plane,
582 BLOCK_SIZE bsize, TX_SIZE tx_size,
583 int use_fast_coef_casting) {
584 MACROBLOCKD *const xd = &x->e_mbd;
585 const struct macroblockd_plane *const pd = &xd->plane[plane];
586 struct rdcost_block_args args;
587 vp9_zero(args);
588 args.x = x;
589 args.best_rd = ref_best_rd;
590 args.use_fast_coef_costing = use_fast_coef_casting;
592 if (plane == 0)
593 xd->mi[0]->mbmi.tx_size = tx_size;
595 vp9_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
597 args.so = get_scan(xd, tx_size, pd->plane_type, 0);
599 vp9_foreach_transformed_block_in_plane(xd, bsize, plane,
600 block_rd_txfm, &args);
601 if (args.skip) {
602 *rate = INT_MAX;
603 *distortion = INT64_MAX;
604 *sse = INT64_MAX;
605 *skippable = 0;
606 } else {
607 *distortion = args.this_dist;
608 *rate = args.this_rate;
609 *sse = args.this_sse;
610 *skippable = vp9_is_skippable_in_plane(x, bsize, plane);
614 static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x,
615 int *rate, int64_t *distortion,
616 int *skip, int64_t *sse,
617 int64_t ref_best_rd,
618 BLOCK_SIZE bs) {
619 const TX_SIZE max_tx_size = max_txsize_lookup[bs];
620 VP9_COMMON *const cm = &cpi->common;
621 const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
622 MACROBLOCKD *const xd = &x->e_mbd;
623 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
625 mbmi->tx_size = MIN(max_tx_size, largest_tx_size);
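// i.e. use the largest transform allowed by both the block size and the
// frame's tx_mode; e.g. a BLOCK_16X16 under ALLOW_32X32 is still capped at
// TX_16X16 by max_txsize_lookup.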
627 txfm_rd_in_plane(x, rate, distortion, skip,
628 sse, ref_best_rd, 0, bs,
629 mbmi->tx_size, cpi->sf.use_fast_coef_costing);
632 static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
633 int *rate,
634 int64_t *distortion,
635 int *skip,
636 int64_t *psse,
637 int64_t tx_cache[TX_MODES],
638 int64_t ref_best_rd,
639 BLOCK_SIZE bs) {
640 const TX_SIZE max_tx_size = max_txsize_lookup[bs];
641 VP9_COMMON *const cm = &cpi->common;
642 MACROBLOCKD *const xd = &x->e_mbd;
643 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
644 vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
645 int r[TX_SIZES][2], s[TX_SIZES];
646 int64_t d[TX_SIZES], sse[TX_SIZES];
647 int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
648 {INT64_MAX, INT64_MAX},
649 {INT64_MAX, INT64_MAX},
650 {INT64_MAX, INT64_MAX}};
651 int n, m;
652 int s0, s1;
653 const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
654 int64_t best_rd = INT64_MAX;
655 TX_SIZE best_tx = max_tx_size;
657 const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
658 assert(skip_prob > 0);
659 s0 = vp9_cost_bit(skip_prob, 0);
660 s1 = vp9_cost_bit(skip_prob, 1);
662 for (n = max_tx_size; n >= 0; n--) {
663 txfm_rd_in_plane(x, &r[n][0], &d[n], &s[n],
664 &sse[n], ref_best_rd, 0, bs, n,
665 cpi->sf.use_fast_coef_costing);
666 r[n][1] = r[n][0];
667 if (r[n][0] < INT_MAX) {
668 for (m = 0; m <= n - (n == (int) max_tx_size); m++) {
669 if (m == n)
670 r[n][1] += vp9_cost_zero(tx_probs[m]);
671 else
672 r[n][1] += vp9_cost_one(tx_probs[m]);
675 if (d[n] == INT64_MAX) {
676 rd[n][0] = rd[n][1] = INT64_MAX;
677 } else if (s[n]) {
678 rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
679 } else {
680 rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
681 rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
684 // Early termination in transform size search.
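// i.e. stop if this size produced no valid rd cost, if it is already worse
// than the next larger size, or if the block is fully skippable at this size.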
685 if (cpi->sf.tx_size_search_breakout &&
686 (rd[n][1] == INT64_MAX ||
687 (n < (int) max_tx_size && rd[n][1] > rd[n + 1][1]) ||
688 s[n] == 1))
689 break;
691 if (rd[n][1] < best_rd) {
692 best_tx = n;
693 best_rd = rd[n][1];
696 mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
697 best_tx : MIN(max_tx_size, max_mode_tx_size);
700 *distortion = d[mbmi->tx_size];
701 *rate = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
702 *skip = s[mbmi->tx_size];
703 *psse = sse[mbmi->tx_size];
705 tx_cache[ONLY_4X4] = rd[TX_4X4][0];
706 tx_cache[ALLOW_8X8] = rd[TX_8X8][0];
707 tx_cache[ALLOW_16X16] = rd[MIN(max_tx_size, TX_16X16)][0];
708 tx_cache[ALLOW_32X32] = rd[MIN(max_tx_size, TX_32X32)][0];
710 if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
711 tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
712 } else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) {
713 tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
714 } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
715 tx_cache[TX_MODE_SELECT] = rd[TX_8X8][1];
716 } else {
717 tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1];
721 static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
722 int64_t *distortion, int *skip,
723 int64_t *psse, BLOCK_SIZE bs,
724 int64_t txfm_cache[TX_MODES],
725 int64_t ref_best_rd) {
726 MACROBLOCKD *xd = &x->e_mbd;
727 int64_t sse;
728 int64_t *ret_sse = psse ? psse : &sse;
730 assert(bs == xd->mi[0]->mbmi.sb_type);
732 if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) {
733 memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
734 choose_largest_tx_size(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd,
735 bs);
736 } else {
737 choose_tx_size_from_rd(cpi, x, rate, distortion, skip, ret_sse,
738 txfm_cache, ref_best_rd, bs);
742 static int conditional_skipintra(PREDICTION_MODE mode,
743 PREDICTION_MODE best_intra_mode) {
744 if (mode == D117_PRED &&
745 best_intra_mode != V_PRED &&
746 best_intra_mode != D135_PRED)
747 return 1;
748 if (mode == D63_PRED &&
749 best_intra_mode != V_PRED &&
750 best_intra_mode != D45_PRED)
751 return 1;
752 if (mode == D207_PRED &&
753 best_intra_mode != H_PRED &&
754 best_intra_mode != D45_PRED)
755 return 1;
756 if (mode == D153_PRED &&
757 best_intra_mode != H_PRED &&
758 best_intra_mode != D135_PRED)
759 return 1;
760 return 0;
763 static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
764 PREDICTION_MODE *best_mode,
765 const int *bmode_costs,
766 ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
767 int *bestrate, int *bestratey,
768 int64_t *bestdistortion,
769 BLOCK_SIZE bsize, int64_t rd_thresh) {
770 PREDICTION_MODE mode;
771 MACROBLOCKD *const xd = &x->e_mbd;
772 int64_t best_rd = rd_thresh;
774 struct macroblock_plane *p = &x->plane[0];
775 struct macroblockd_plane *pd = &xd->plane[0];
776 const int src_stride = p->src.stride;
777 const int dst_stride = pd->dst.stride;
778 const uint8_t *src_init = &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, ib,
779 src_stride)];
780 uint8_t *dst_init = &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, ib,
781 dst_stride)];
782 ENTROPY_CONTEXT ta[2], tempa[2];
783 ENTROPY_CONTEXT tl[2], templ[2];
785 const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
786 const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
787 int idx, idy;
788 uint8_t best_dst[8 * 8];
789 #if CONFIG_VP9_HIGHBITDEPTH
790 uint16_t best_dst16[8 * 8];
791 #endif
793 assert(ib < 4);
795 memcpy(ta, a, sizeof(ta));
796 memcpy(tl, l, sizeof(tl));
797 xd->mi[0]->mbmi.tx_size = TX_4X4;
799 #if CONFIG_VP9_HIGHBITDEPTH
800 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
801 for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
802 int64_t this_rd;
803 int ratey = 0;
804 int64_t distortion = 0;
805 int rate = bmode_costs[mode];
807 if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
808 continue;
810 // Only do the oblique modes if the best so far is
811 // one of the neighboring directional modes
812 if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
813 if (conditional_skipintra(mode, *best_mode))
814 continue;
817 memcpy(tempa, ta, sizeof(ta));
818 memcpy(templ, tl, sizeof(tl));
820 for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
821 for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
822 const int block = ib + idy * 2 + idx;
823 const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
824 uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
825 int16_t *const src_diff = vp9_raster_block_offset_int16(BLOCK_8X8,
826 block,
827 p->src_diff);
828 tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
829 xd->mi[0]->bmi[block].as_mode = mode;
830 vp9_predict_intra_block(xd, block, 1,
831 TX_4X4, mode,
832 x->skip_encode ? src : dst,
833 x->skip_encode ? src_stride : dst_stride,
834 dst, dst_stride, idx, idy, 0);
835 vp9_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride,
836 dst, dst_stride, xd->bd);
837 if (xd->lossless) {
838 const scan_order *so = &vp9_default_scan_orders[TX_4X4];
839 vp9_highbd_fwht4x4(src_diff, coeff, 8);
840 vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
841 ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
842 so->scan, so->neighbors,
843 cpi->sf.use_fast_coef_costing);
844 if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
845 goto next_highbd;
846 vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block),
847 dst, dst_stride,
848 p->eobs[block], xd->bd);
849 } else {
850 int64_t unused;
851 const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
852 const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
853 vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type);
854 vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
855 ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
856 so->scan, so->neighbors,
857 cpi->sf.use_fast_coef_costing);
858 distortion += vp9_highbd_block_error(
859 coeff, BLOCK_OFFSET(pd->dqcoeff, block),
860 16, &unused, xd->bd) >> 2;
861 if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
862 goto next_highbd;
863 vp9_highbd_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
864 dst, dst_stride, p->eobs[block], xd->bd);
869 rate += ratey;
870 this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
872 if (this_rd < best_rd) {
873 *bestrate = rate;
874 *bestratey = ratey;
875 *bestdistortion = distortion;
876 best_rd = this_rd;
877 *best_mode = mode;
878 memcpy(a, tempa, sizeof(tempa));
879 memcpy(l, templ, sizeof(templ));
880 for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
881 memcpy(best_dst16 + idy * 8,
882 CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
883 num_4x4_blocks_wide * 4 * sizeof(uint16_t));
886 next_highbd:
889 if (best_rd >= rd_thresh || x->skip_encode)
890 return best_rd;
892 for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
893 memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
894 best_dst16 + idy * 8,
895 num_4x4_blocks_wide * 4 * sizeof(uint16_t));
898 return best_rd;
900 #endif // CONFIG_VP9_HIGHBITDEPTH
902 for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
903 int64_t this_rd;
904 int ratey = 0;
905 int64_t distortion = 0;
906 int rate = bmode_costs[mode];
908 if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
909 continue;
911 // Only do the oblique modes if the best so far is
912 // one of the neighboring directional modes
913 if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
914 if (conditional_skipintra(mode, *best_mode))
915 continue;
918 memcpy(tempa, ta, sizeof(ta));
919 memcpy(templ, tl, sizeof(tl));
921 for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
922 for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
923 const int block = ib + idy * 2 + idx;
924 const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
925 uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
926 int16_t *const src_diff =
927 vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
928 tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
929 xd->mi[0]->bmi[block].as_mode = mode;
930 vp9_predict_intra_block(xd, block, 1,
931 TX_4X4, mode,
932 x->skip_encode ? src : dst,
933 x->skip_encode ? src_stride : dst_stride,
934 dst, dst_stride, idx, idy, 0);
935 vp9_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
937 if (xd->lossless) {
938 const scan_order *so = &vp9_default_scan_orders[TX_4X4];
939 vp9_fwht4x4(src_diff, coeff, 8);
940 vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
941 ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
942 so->scan, so->neighbors,
943 cpi->sf.use_fast_coef_costing);
944 if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
945 goto next;
946 vp9_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, dst_stride,
947 p->eobs[block]);
948 } else {
949 int64_t unused;
950 const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
951 const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
952 vp9_fht4x4(src_diff, coeff, 8, tx_type);
953 vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
954 ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
955 so->scan, so->neighbors,
956 cpi->sf.use_fast_coef_costing);
957 distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
958 16, &unused) >> 2;
959 if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
960 goto next;
961 vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
962 dst, dst_stride, p->eobs[block]);
967 rate += ratey;
968 this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
970 if (this_rd < best_rd) {
971 *bestrate = rate;
972 *bestratey = ratey;
973 *bestdistortion = distortion;
974 best_rd = this_rd;
975 *best_mode = mode;
976 memcpy(a, tempa, sizeof(tempa));
977 memcpy(l, templ, sizeof(templ));
978 for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
979 memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
980 num_4x4_blocks_wide * 4);
982 next:
986 if (best_rd >= rd_thresh || x->skip_encode)
987 return best_rd;
989 for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
990 memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
991 num_4x4_blocks_wide * 4);
993 return best_rd;
996 static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb,
997 int *rate, int *rate_y,
998 int64_t *distortion,
999 int64_t best_rd) {
1000 int i, j;
1001 const MACROBLOCKD *const xd = &mb->e_mbd;
1002 MODE_INFO *const mic = xd->mi[0];
1003 const MODE_INFO *above_mi = xd->above_mi;
1004 const MODE_INFO *left_mi = xd->left_mi;
1005 const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
1006 const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
1007 const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
1008 int idx, idy;
1009 int cost = 0;
1010 int64_t total_distortion = 0;
1011 int tot_rate_y = 0;
1012 int64_t total_rd = 0;
1013 ENTROPY_CONTEXT t_above[4], t_left[4];
1014 const int *bmode_costs = cpi->mbmode_cost;
1016 memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
1017 memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
1019 // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
1020 for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
1021 for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
1022 PREDICTION_MODE best_mode = DC_PRED;
1023 int r = INT_MAX, ry = INT_MAX;
1024 int64_t d = INT64_MAX, this_rd = INT64_MAX;
1025 i = idy * 2 + idx;
1026 if (cpi->common.frame_type == KEY_FRAME) {
1027 const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, i);
1028 const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, i);
1030 bmode_costs = cpi->y_mode_costs[A][L];
1033 this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
1034 t_above + idx, t_left + idy, &r, &ry, &d,
1035 bsize, best_rd - total_rd);
1036 if (this_rd >= best_rd - total_rd)
1037 return INT64_MAX;
1039 total_rd += this_rd;
1040 cost += r;
1041 total_distortion += d;
1042 tot_rate_y += ry;
1044 mic->bmi[i].as_mode = best_mode;
1045 for (j = 1; j < num_4x4_blocks_high; ++j)
1046 mic->bmi[i + j * 2].as_mode = best_mode;
1047 for (j = 1; j < num_4x4_blocks_wide; ++j)
1048 mic->bmi[i + j].as_mode = best_mode;
1050 if (total_rd >= best_rd)
1051 return INT64_MAX;
1055 *rate = cost;
1056 *rate_y = tot_rate_y;
1057 *distortion = total_distortion;
1058 mic->mbmi.mode = mic->bmi[3].as_mode;
1060 return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion);
1063 // This function is used only for intra_only frames
1064 static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
1065 int *rate, int *rate_tokenonly,
1066 int64_t *distortion, int *skippable,
1067 BLOCK_SIZE bsize,
1068 int64_t tx_cache[TX_MODES],
1069 int64_t best_rd) {
1070 PREDICTION_MODE mode;
1071 PREDICTION_MODE mode_selected = DC_PRED;
1072 MACROBLOCKD *const xd = &x->e_mbd;
1073 MODE_INFO *const mic = xd->mi[0];
1074 int this_rate, this_rate_tokenonly, s;
1075 int64_t this_distortion, this_rd;
1076 TX_SIZE best_tx = TX_4X4;
1077 int i;
1078 int *bmode_costs;
1079 const MODE_INFO *above_mi = xd->above_mi;
1080 const MODE_INFO *left_mi = xd->left_mi;
1081 const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
1082 const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
1083 bmode_costs = cpi->y_mode_costs[A][L];
1085 if (cpi->sf.tx_size_search_method == USE_FULL_RD)
1086 for (i = 0; i < TX_MODES; i++)
1087 tx_cache[i] = INT64_MAX;
1089 memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
1090 /* Y Search for intra prediction mode */
1091 for (mode = DC_PRED; mode <= TM_PRED; mode++) {
1092 int64_t local_tx_cache[TX_MODES];
1094 if (cpi->sf.use_nonrd_pick_mode) {
1095 // These speed features are turned on in hybrid non-RD and RD mode
1096 // for key frame coding in the context of real-time settings.
1097 if (conditional_skipintra(mode, mode_selected))
1098 continue;
1099 if (*skippable)
1100 break;
1103 mic->mbmi.mode = mode;
1105 super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
1106 &s, NULL, bsize, local_tx_cache, best_rd);
1108 if (this_rate_tokenonly == INT_MAX)
1109 continue;
1111 this_rate = this_rate_tokenonly + bmode_costs[mode];
1112 this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
1114 if (this_rd < best_rd) {
1115 mode_selected = mode;
1116 best_rd = this_rd;
1117 best_tx = mic->mbmi.tx_size;
1118 *rate = this_rate;
1119 *rate_tokenonly = this_rate_tokenonly;
1120 *distortion = this_distortion;
1121 *skippable = s;
1124 if (cpi->sf.tx_size_search_method == USE_FULL_RD && this_rd < INT64_MAX) {
1125 for (i = 0; i < TX_MODES && local_tx_cache[i] < INT64_MAX; i++) {
1126 const int64_t adj_rd = this_rd + local_tx_cache[i] -
1127 local_tx_cache[cpi->common.tx_mode];
1128 if (adj_rd < tx_cache[i]) {
1129 tx_cache[i] = adj_rd;
1135 mic->mbmi.mode = mode_selected;
1136 mic->mbmi.tx_size = best_tx;
1138 return best_rd;
1141 // Return value 0: early termination triggered, no valid rd cost available;
1142 // 1: rd cost values are valid.
1143 static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
1144 int *rate, int64_t *distortion, int *skippable,
1145 int64_t *sse, BLOCK_SIZE bsize,
1146 int64_t ref_best_rd) {
1147 MACROBLOCKD *const xd = &x->e_mbd;
1148 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
1149 const TX_SIZE uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
1150 int plane;
1151 int pnrate = 0, pnskip = 1;
1152 int64_t pndist = 0, pnsse = 0;
1153 int is_cost_valid = 1;
1155 if (ref_best_rd < 0)
1156 is_cost_valid = 0;
1158 if (is_inter_block(mbmi) && is_cost_valid) {
1159 int plane;
1160 for (plane = 1; plane < MAX_MB_PLANE; ++plane)
1161 vp9_subtract_plane(x, bsize, plane);
1164 *rate = 0;
1165 *distortion = 0;
1166 *sse = 0;
1167 *skippable = 1;
1169 for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
1170 txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
1171 ref_best_rd, plane, bsize, uv_tx_size,
1172 cpi->sf.use_fast_coef_costing);
1173 if (pnrate == INT_MAX) {
1174 is_cost_valid = 0;
1175 break;
1177 *rate += pnrate;
1178 *distortion += pndist;
1179 *sse += pnsse;
1180 *skippable &= pnskip;
1183 if (!is_cost_valid) {
1184 // reset cost value
1185 *rate = INT_MAX;
1186 *distortion = INT64_MAX;
1187 *sse = INT64_MAX;
1188 *skippable = 0;
1191 return is_cost_valid;
1194 static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
1195 PICK_MODE_CONTEXT *ctx,
1196 int *rate, int *rate_tokenonly,
1197 int64_t *distortion, int *skippable,
1198 BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
1199 MACROBLOCKD *xd = &x->e_mbd;
1200 PREDICTION_MODE mode;
1201 PREDICTION_MODE mode_selected = DC_PRED;
1202 int64_t best_rd = INT64_MAX, this_rd;
1203 int this_rate_tokenonly, this_rate, s;
1204 int64_t this_distortion, this_sse;
1206 memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
1207 for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
1208 if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
1209 continue;
1211 xd->mi[0]->mbmi.uv_mode = mode;
1213 if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
1214 &this_distortion, &s, &this_sse, bsize, best_rd))
1215 continue;
1216 this_rate = this_rate_tokenonly +
1217 cpi->intra_uv_mode_cost[cpi->common.frame_type][mode];
1218 this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
1220 if (this_rd < best_rd) {
1221 mode_selected = mode;
1222 best_rd = this_rd;
1223 *rate = this_rate;
1224 *rate_tokenonly = this_rate_tokenonly;
1225 *distortion = this_distortion;
1226 *skippable = s;
1227 if (!x->select_tx_size)
1228 swap_block_ptr(x, ctx, 2, 0, 1, MAX_MB_PLANE);
1232 xd->mi[0]->mbmi.uv_mode = mode_selected;
1233 return best_rd;
1236 static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x,
1237 int *rate, int *rate_tokenonly,
1238 int64_t *distortion, int *skippable,
1239 BLOCK_SIZE bsize) {
1240 const VP9_COMMON *cm = &cpi->common;
1241 int64_t unused;
1243 x->e_mbd.mi[0]->mbmi.uv_mode = DC_PRED;
1244 memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
1245 super_block_uvrd(cpi, x, rate_tokenonly, distortion,
1246 skippable, &unused, bsize, INT64_MAX);
1247 *rate = *rate_tokenonly + cpi->intra_uv_mode_cost[cm->frame_type][DC_PRED];
1248 return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
1251 static void choose_intra_uv_mode(VP9_COMP *cpi, MACROBLOCK *const x,
1252 PICK_MODE_CONTEXT *ctx,
1253 BLOCK_SIZE bsize, TX_SIZE max_tx_size,
1254 int *rate_uv, int *rate_uv_tokenonly,
1255 int64_t *dist_uv, int *skip_uv,
1256 PREDICTION_MODE *mode_uv) {
1257 // Use an estimated rd for uv_intra based on DC_PRED if the
1258 // appropriate speed flag is set.
1259 if (cpi->sf.use_uv_intra_rd_estimate) {
1260 rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv,
1261 skip_uv, bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
1262 // Else do a proper rd search for each possible transform size that may
1263 // be considered in the main rd loop.
1264 } else {
1265 rd_pick_intra_sbuv_mode(cpi, x, ctx,
1266 rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
1267 bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, max_tx_size);
1269 *mode_uv = x->e_mbd.mi[0]->mbmi.uv_mode;
1272 static int cost_mv_ref(const VP9_COMP *cpi, PREDICTION_MODE mode,
1273 int mode_context) {
1274 assert(is_inter_mode(mode));
1275 return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
1278 static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCKD *xd, int i,
1279 PREDICTION_MODE mode, int_mv this_mv[2],
1280 int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
1281 int_mv seg_mvs[MAX_REF_FRAMES],
1282 int_mv *best_ref_mv[2], const int *mvjcost,
1283 int *mvcost[2]) {
1284 MODE_INFO *const mic = xd->mi[0];
1285 const MB_MODE_INFO *const mbmi = &mic->mbmi;
1286 int thismvcost = 0;
1287 int idx, idy;
1288 const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
1289 const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
1290 const int is_compound = has_second_ref(mbmi);
1292 switch (mode) {
1293 case NEWMV:
1294 this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
1295 thismvcost += vp9_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
1296 mvjcost, mvcost, MV_COST_WEIGHT_SUB);
1297 if (is_compound) {
1298 this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
1299 thismvcost += vp9_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
1300 mvjcost, mvcost, MV_COST_WEIGHT_SUB);
1302 break;
1303 case NEARMV:
1304 case NEARESTMV:
1305 this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int;
1306 if (is_compound)
1307 this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int;
1308 break;
1309 case ZEROMV:
1310 this_mv[0].as_int = 0;
1311 if (is_compound)
1312 this_mv[1].as_int = 0;
1313 break;
1314 default:
1315 break;
1318 mic->bmi[i].as_mv[0].as_int = this_mv[0].as_int;
1319 if (is_compound)
1320 mic->bmi[i].as_mv[1].as_int = this_mv[1].as_int;
1322 mic->bmi[i].as_mode = mode;
1324 for (idy = 0; idy < num_4x4_blocks_high; ++idy)
1325 for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
1326 memmove(&mic->bmi[i + idy * 2 + idx], &mic->bmi[i], sizeof(mic->bmi[i]));
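// The chosen mode and mvs have now been replicated into every 4x4 unit this
// partition covers (e.g. both halves of a 4x8 or 8x4 block), so any covered
// bmi entry can be read later.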
1328 return cost_mv_ref(cpi, mode, mbmi->mode_context[mbmi->ref_frame[0]]) +
1329 thismvcost;
1332 static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
1333 MACROBLOCK *x,
1334 int64_t best_yrd,
1335 int i,
1336 int *labelyrate,
1337 int64_t *distortion, int64_t *sse,
1338 ENTROPY_CONTEXT *ta,
1339 ENTROPY_CONTEXT *tl,
1340 int mi_row, int mi_col) {
1341 int k;
1342 MACROBLOCKD *xd = &x->e_mbd;
1343 struct macroblockd_plane *const pd = &xd->plane[0];
1344 struct macroblock_plane *const p = &x->plane[0];
1345 MODE_INFO *const mi = xd->mi[0];
1346 const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
1347 const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
1348 const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
1349 int idx, idy;
1351 const uint8_t *const src =
1352 &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
1353 uint8_t *const dst = &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i,
1354 pd->dst.stride)];
1355 int64_t thisdistortion = 0, thissse = 0;
1356 int thisrate = 0, ref;
1357 const scan_order *so = &vp9_default_scan_orders[TX_4X4];
1358 const int is_compound = has_second_ref(&mi->mbmi);
1359 const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
1361 for (ref = 0; ref < 1 + is_compound; ++ref) {
1362 const uint8_t *pre = &pd->pre[ref].buf[vp9_raster_block_offset(BLOCK_8X8, i,
1363 pd->pre[ref].stride)];
1364 #if CONFIG_VP9_HIGHBITDEPTH
1365 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
1366 vp9_highbd_build_inter_predictor(pre, pd->pre[ref].stride,
1367 dst, pd->dst.stride,
1368 &mi->bmi[i].as_mv[ref].as_mv,
1369 &xd->block_refs[ref]->sf, width, height,
1370 ref, kernel, MV_PRECISION_Q3,
1371 mi_col * MI_SIZE + 4 * (i % 2),
1372 mi_row * MI_SIZE + 4 * (i / 2), xd->bd);
1373 } else {
1374 vp9_build_inter_predictor(pre, pd->pre[ref].stride,
1375 dst, pd->dst.stride,
1376 &mi->bmi[i].as_mv[ref].as_mv,
1377 &xd->block_refs[ref]->sf, width, height, ref,
1378 kernel, MV_PRECISION_Q3,
1379 mi_col * MI_SIZE + 4 * (i % 2),
1380 mi_row * MI_SIZE + 4 * (i / 2));
1382 #else
1383 vp9_build_inter_predictor(pre, pd->pre[ref].stride,
1384 dst, pd->dst.stride,
1385 &mi->bmi[i].as_mv[ref].as_mv,
1386 &xd->block_refs[ref]->sf, width, height, ref,
1387 kernel, MV_PRECISION_Q3,
1388 mi_col * MI_SIZE + 4 * (i % 2),
1389 mi_row * MI_SIZE + 4 * (i / 2));
1390 #endif // CONFIG_VP9_HIGHBITDEPTH
1393 #if CONFIG_VP9_HIGHBITDEPTH
1394 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
1395 vp9_highbd_subtract_block(
1396 height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
1397 8, src, p->src.stride, dst, pd->dst.stride, xd->bd);
1398 } else {
1399 vp9_subtract_block(
1400 height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
1401 8, src, p->src.stride, dst, pd->dst.stride);
1403 #else
1404 vp9_subtract_block(height, width,
1405 vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
1406 8, src, p->src.stride, dst, pd->dst.stride);
1407 #endif // CONFIG_VP9_HIGHBITDEPTH
1409 k = i;
1410 for (idy = 0; idy < height / 4; ++idy) {
1411 for (idx = 0; idx < width / 4; ++idx) {
1412 int64_t ssz, rd, rd1, rd2;
1413 tran_low_t* coeff;
1415 k += (idy * 2 + idx);
1416 coeff = BLOCK_OFFSET(p->coeff, k);
1417 x->fwd_txm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
1418 coeff, 8);
1419 vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
1420 #if CONFIG_VP9_HIGHBITDEPTH
1421 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
1422 thisdistortion += vp9_highbd_block_error(coeff,
1423 BLOCK_OFFSET(pd->dqcoeff, k),
1424 16, &ssz, xd->bd);
1425 } else {
1426 thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
1427 16, &ssz);
1429 #else
1430 thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
1431 16, &ssz);
1432 #endif // CONFIG_VP9_HIGHBITDEPTH
1433 thissse += ssz;
1434 thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4,
1435 so->scan, so->neighbors,
1436 cpi->sf.use_fast_coef_costing);
1437 rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2);
1438 rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2);
1439 rd = MIN(rd1, rd2);
1440 if (rd >= best_yrd)
1441 return INT64_MAX;
1445 *distortion = thisdistortion >> 2;
1446 *labelyrate = thisrate;
1447 *sse = thissse >> 2;
1449 return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
1452 typedef struct {
1453 int eobs;
1454 int brate;
1455 int byrate;
1456 int64_t bdist;
1457 int64_t bsse;
1458 int64_t brdcost;
1459 int_mv mvs[2];
1460 ENTROPY_CONTEXT ta[2];
1461 ENTROPY_CONTEXT tl[2];
1462 } SEG_RDSTAT;
1464 typedef struct {
1465 int_mv *ref_mv[2];
1466 int_mv mvp;
1468 int64_t segment_rd;
1469 int r;
1470 int64_t d;
1471 int64_t sse;
1472 int segment_yrate;
1473 PREDICTION_MODE modes[4];
1474 SEG_RDSTAT rdstat[4][INTER_MODES];
1475 int mvthresh;
1476 } BEST_SEG_INFO;
1478 static INLINE int mv_check_bounds(const MACROBLOCK *x, const MV *mv) {
1479 return (mv->row >> 3) < x->mv_row_min ||
1480 (mv->row >> 3) > x->mv_row_max ||
1481 (mv->col >> 3) < x->mv_col_min ||
1482 (mv->col >> 3) > x->mv_col_max;
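// MVs are stored in 1/8-pel units, hence the >> 3 above when comparing
// against the full-pel search bounds.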
1485 static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
1486 MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0]->mbmi;
1487 struct macroblock_plane *const p = &x->plane[0];
1488 struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
1490 p->src.buf = &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i,
1491 p->src.stride)];
1492 assert(((intptr_t)pd->pre[0].buf & 0x7) == 0);
1493 pd->pre[0].buf = &pd->pre[0].buf[vp9_raster_block_offset(BLOCK_8X8, i,
1494 pd->pre[0].stride)];
1495 if (has_second_ref(mbmi))
1496 pd->pre[1].buf = &pd->pre[1].buf[vp9_raster_block_offset(BLOCK_8X8, i,
1497 pd->pre[1].stride)];
1500 static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
1501 struct buf_2d orig_pre[2]) {
1502 MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
1503 x->plane[0].src = orig_src;
1504 x->e_mbd.plane[0].pre[0] = orig_pre[0];
1505 if (has_second_ref(mbmi))
1506 x->e_mbd.plane[0].pre[1] = orig_pre[1];
1509 static INLINE int mv_has_subpel(const MV *mv) {
1510 return (mv->row & 0x0F) || (mv->col & 0x0F);
1513 // Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way to encode zero motion.
1514 // TODO(aconverse): Find out if this is still productive, then clean up or remove
1515 static int check_best_zero_mv(
1516 const VP9_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES],
1517 int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int this_mode,
1518 const MV_REFERENCE_FRAME ref_frames[2]) {
1519 if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
1520 frame_mv[this_mode][ref_frames[0]].as_int == 0 &&
1521 (ref_frames[1] == NONE ||
1522 frame_mv[this_mode][ref_frames[1]].as_int == 0)) {
1523 int rfc = mode_context[ref_frames[0]];
1524 int c1 = cost_mv_ref(cpi, NEARMV, rfc);
1525 int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
1526 int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
1528 if (this_mode == NEARMV) {
1529 if (c1 > c3) return 0;
1530 } else if (this_mode == NEARESTMV) {
1531 if (c2 > c3) return 0;
1532 } else {
1533 assert(this_mode == ZEROMV);
1534 if (ref_frames[1] == NONE) {
1535 if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) ||
1536 (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0))
1537 return 0;
1538 } else {
1539 if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0 &&
1540 frame_mv[NEARESTMV][ref_frames[1]].as_int == 0) ||
1541 (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0 &&
1542 frame_mv[NEARMV][ref_frames[1]].as_int == 0))
1543 return 0;
1547 return 1;
1550 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
1551 BLOCK_SIZE bsize,
1552 int_mv *frame_mv,
1553 int mi_row, int mi_col,
1554 int_mv single_newmv[MAX_REF_FRAMES],
1555 int *rate_mv) {
1556 const VP9_COMMON *const cm = &cpi->common;
1557 const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
1558 const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
1559 MACROBLOCKD *xd = &x->e_mbd;
1560 MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
1561 const int refs[2] = {mbmi->ref_frame[0],
1562 mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]};
1563 int_mv ref_mv[2];
1564 int ite, ref;
1565 const InterpKernel *kernel = vp9_get_interp_kernel(mbmi->interp_filter);
1566 struct scale_factors sf;
1568 // Do joint motion search in compound mode to get more accurate mv.
1569 struct buf_2d backup_yv12[2][MAX_MB_PLANE];
1570 int last_besterr[2] = {INT_MAX, INT_MAX};
1571 const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
1572 vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
1573 vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
1576 // Prediction buffer from second frame.
1577 #if CONFIG_VP9_HIGHBITDEPTH
1578 DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[64 * 64]);
1579 uint8_t *second_pred;
1580 #else
1581 DECLARE_ALIGNED(16, uint8_t, second_pred[64 * 64]);
1582 #endif // CONFIG_VP9_HIGHBITDEPTH
1584 for (ref = 0; ref < 2; ++ref) {
1585 ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0];
1587 if (scaled_ref_frame[ref]) {
1588 int i;
1589 // Swap out the reference frame for a version that's been scaled to
1590 // match the resolution of the current frame, allowing the existing
1591 // motion search code to be used without additional modifications.
1592 for (i = 0; i < MAX_MB_PLANE; i++)
1593 backup_yv12[ref][i] = xd->plane[i].pre[ref];
1594 vp9_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
1595 NULL);
1598 frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
1601 // Since we have scaled the reference frames to match the size of the current
1602 // frame we must use a unit scaling factor during mode selection.
1603 #if CONFIG_VP9_HIGHBITDEPTH
1604 vp9_setup_scale_factors_for_frame(&sf, cm->width, cm->height,
1605 cm->width, cm->height,
1606 cm->use_highbitdepth);
1607 #else
1608 vp9_setup_scale_factors_for_frame(&sf, cm->width, cm->height,
1609 cm->width, cm->height);
1610 #endif // CONFIG_VP9_HIGHBITDEPTH
1612 // Allow joint search multiple times iteratively for each reference frame
1613 // and break out of the search loop if it couldn't find a better mv.
1614 for (ite = 0; ite < 4; ite++) {
1615 struct buf_2d ref_yv12[2];
1616 int bestsme = INT_MAX;
1617 int sadpb = x->sadperbit16;
1618 MV tmp_mv;
1619 int search_range = 3;
1621 int tmp_col_min = x->mv_col_min;
1622 int tmp_col_max = x->mv_col_max;
1623 int tmp_row_min = x->mv_row_min;
1624 int tmp_row_max = x->mv_row_max;
1625 int id = ite % 2; // Even iterations search in the first reference frame,
1626 // odd iterations search in the second. The predictor
1627 // found for the 'other' reference frame is factored in.
1629 // Initialized here because of compiler problem in Visual Studio.
1630 ref_yv12[0] = xd->plane[0].pre[0];
1631 ref_yv12[1] = xd->plane[0].pre[1];
1633 // Get the prediction block from the 'other' reference frame.
1634 #if CONFIG_VP9_HIGHBITDEPTH
1635 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
1636 second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
1637 vp9_highbd_build_inter_predictor(ref_yv12[!id].buf,
1638 ref_yv12[!id].stride,
1639 second_pred, pw,
1640 &frame_mv[refs[!id]].as_mv,
1641 &sf, pw, ph, 0,
1642 kernel, MV_PRECISION_Q3,
1643 mi_col * MI_SIZE, mi_row * MI_SIZE,
1644 xd->bd);
1645 } else {
1646 second_pred = (uint8_t *)second_pred_alloc_16;
1647 vp9_build_inter_predictor(ref_yv12[!id].buf,
1648 ref_yv12[!id].stride,
1649 second_pred, pw,
1650 &frame_mv[refs[!id]].as_mv,
1651 &sf, pw, ph, 0,
1652 kernel, MV_PRECISION_Q3,
1653 mi_col * MI_SIZE, mi_row * MI_SIZE);
1655 #else
1656 vp9_build_inter_predictor(ref_yv12[!id].buf,
1657 ref_yv12[!id].stride,
1658 second_pred, pw,
1659 &frame_mv[refs[!id]].as_mv,
1660 &sf, pw, ph, 0,
1661 kernel, MV_PRECISION_Q3,
1662 mi_col * MI_SIZE, mi_row * MI_SIZE);
1663 #endif // CONFIG_VP9_HIGHBITDEPTH
1665 // Do compound motion search on the current reference frame.
1666 if (id)
1667 xd->plane[0].pre[0] = ref_yv12[id];
1668 vp9_set_mv_search_range(x, &ref_mv[id].as_mv);
1670 // Use the mv result from the single mode as mv predictor.
1671 tmp_mv = frame_mv[refs[id]].as_mv;
1673 tmp_mv.col >>= 3;
1674 tmp_mv.row >>= 3;
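// (The >> 3 above converts the 1/8-pel predictor to full-pel units for the
// integer refining search that follows.)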
1676 // Small-range full-pixel motion search.
1677 bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
1678 search_range,
1679 &cpi->fn_ptr[bsize],
1680 &ref_mv[id].as_mv, second_pred);
1681 if (bestsme < INT_MAX)
1682 bestsme = vp9_get_mvpred_av_var(x, &tmp_mv, &ref_mv[id].as_mv,
1683 second_pred, &cpi->fn_ptr[bsize], 1);
1685 x->mv_col_min = tmp_col_min;
1686 x->mv_col_max = tmp_col_max;
1687 x->mv_row_min = tmp_row_min;
1688 x->mv_row_max = tmp_row_max;
1690 if (bestsme < INT_MAX) {
1691 int dis; /* TODO: use dis in distortion calculation later. */
1692 unsigned int sse;
1693 bestsme = cpi->find_fractional_mv_step(
1694 x, &tmp_mv,
1695 &ref_mv[id].as_mv,
1696 cpi->common.allow_high_precision_mv,
1697 x->errorperbit,
1698 &cpi->fn_ptr[bsize],
1699 0, cpi->sf.mv.subpel_iters_per_step,
1700 NULL,
1701 x->nmvjointcost, x->mvcost,
1702 &dis, &sse, second_pred,
1703 pw, ph);
1706 // Restore the pointer to the first (possibly scaled) prediction buffer.
1707 if (id)
1708 xd->plane[0].pre[0] = ref_yv12[0];
1710 if (bestsme < last_besterr[id]) {
1711 frame_mv[refs[id]].as_mv = tmp_mv;
1712 last_besterr[id] = bestsme;
1713 } else {
1714 break;
1718 *rate_mv = 0;
1720 for (ref = 0; ref < 2; ++ref) {
1721 if (scaled_ref_frame[ref]) {
1722 // Restore the prediction frame pointers to their unscaled versions.
1723 int i;
1724 for (i = 0; i < MAX_MB_PLANE; i++)
1725 xd->plane[i].pre[ref] = backup_yv12[ref][i];
1728 *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
1729 &mbmi->ref_mvs[refs[ref]][0].as_mv,
1730 x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
1734 static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
1735 const TileInfo * const tile,
1736 int_mv *best_ref_mv,
1737 int_mv *second_best_ref_mv,
1738 int64_t best_rd, int *returntotrate,
1739 int *returnyrate,
1740 int64_t *returndistortion,
1741 int *skippable, int64_t *psse,
1742 int mvthresh,
1743 int_mv seg_mvs[4][MAX_REF_FRAMES],
1744 BEST_SEG_INFO *bsi_buf, int filter_idx,
1745 int mi_row, int mi_col) {
1746 int i;
1747 BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
1748 MACROBLOCKD *xd = &x->e_mbd;
1749 MODE_INFO *mi = xd->mi[0];
1750 MB_MODE_INFO *mbmi = &mi->mbmi;
1751 int mode_idx;
1752 int k, br = 0, idx, idy;
1753 int64_t bd = 0, block_sse = 0;
1754 PREDICTION_MODE this_mode;
1755 VP9_COMMON *cm = &cpi->common;
1756 struct macroblock_plane *const p = &x->plane[0];
1757 struct macroblockd_plane *const pd = &xd->plane[0];
1758 const int label_count = 4;
1759 int64_t this_segment_rd = 0;
1760 int label_mv_thresh;
1761 int segmentyrate = 0;
1762 const BLOCK_SIZE bsize = mbmi->sb_type;
1763 const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
1764 const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
1765 ENTROPY_CONTEXT t_above[2], t_left[2];
1766 int subpelmv = 1, have_ref = 0;
1767 const int has_second_rf = has_second_ref(mbmi);
1768 const int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
1770 vp9_zero(*bsi);
1772 bsi->segment_rd = best_rd;
1773 bsi->ref_mv[0] = best_ref_mv;
1774 bsi->ref_mv[1] = second_best_ref_mv;
1775 bsi->mvp.as_int = best_ref_mv->as_int;
1776 bsi->mvthresh = mvthresh;
1778 for (i = 0; i < 4; i++)
1779 bsi->modes[i] = ZEROMV;
1781 memcpy(t_above, pd->above_context, sizeof(t_above));
1782 memcpy(t_left, pd->left_context, sizeof(t_left));
1784 // A multiplier of 64 makes this threshold effectively huge, so that
1785 // motion vectors are almost never checked on segments; a multiplier
1786 // of 1 makes the mv threshold roughly equal to the one used for
1787 // whole macroblocks.
1788 label_mv_thresh = 1 * bsi->mvthresh / label_count;
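// For example, with bsi->mvthresh == 4000 and label_count == 4 the per-label
// threshold works out to 1000; a label whose best rd is already below that
// value skips the NEWMV motion search further down (see the break on
// label_mv_thresh).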
1790 // Segmentation method overheads
1791 for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
1792 for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
1793 // TODO(jingning,rbultje): rewrite the rate-distortion optimization
1794 // loop for 4x4/4x8/8x4 block coding; to be replaced with a new rd loop.
1795 int_mv mode_mv[MB_MODE_COUNT][2];
1796 int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
1797 PREDICTION_MODE mode_selected = ZEROMV;
1798 int64_t best_rd = INT64_MAX;
1799 const int i = idy * 2 + idx;
1800 int ref;
1802 for (ref = 0; ref < 1 + has_second_rf; ++ref) {
1803 const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
1804 frame_mv[ZEROMV][frame].as_int = 0;
1805 vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, i, ref, mi_row, mi_col,
1806 &frame_mv[NEARESTMV][frame],
1807 &frame_mv[NEARMV][frame]);
1810 // search for the best motion vector on this segment
1811 for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
1812 const struct buf_2d orig_src = x->plane[0].src;
1813 struct buf_2d orig_pre[2];
1815 mode_idx = INTER_OFFSET(this_mode);
1816 bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
1817 if (!(inter_mode_mask & (1 << this_mode)))
1818 continue;
1820 if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
1821 this_mode, mbmi->ref_frame))
1822 continue;
1824 memcpy(orig_pre, pd->pre, sizeof(orig_pre));
1825 memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
1826 sizeof(bsi->rdstat[i][mode_idx].ta));
1827 memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
1828 sizeof(bsi->rdstat[i][mode_idx].tl));
1830 // motion search for newmv (single predictor case only)
1831 if (!has_second_rf && this_mode == NEWMV &&
1832 seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) {
1833 MV *const new_mv = &mode_mv[NEWMV][0].as_mv;
1834 int step_param = 0;
1835 int thissme, bestsme = INT_MAX;
1836 int sadpb = x->sadperbit4;
1837 MV mvp_full;
1838 int max_mv;
1839 int cost_list[5];
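// Note: the full-pel search fills cost_list with the cost at the winning
// position and its four immediate neighbours; the sub-pel refinement can use
// it to prune its search, and it is only requested here when the sub-pel
// method is not plain SUBPEL_TREE (see the vp9_full_pixel_search call below).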
1841 /* Is the best so far sufficiently good that we can't justify doing
1842 * a new motion search? */
1843 if (best_rd < label_mv_thresh)
1844 break;
1846 if (cpi->oxcf.mode != BEST) {
1847 // use previous block's result as next block's MV predictor.
1848 if (i > 0) {
1849 bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int;
1850 if (i == 2)
1851 bsi->mvp.as_int = mi->bmi[i - 2].as_mv[0].as_int;
1854 if (i == 0)
1855 max_mv = x->max_mv_context[mbmi->ref_frame[0]];
1856 else
1857 max_mv = MAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3;
1859 if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
1860 // Take a weighted average of the step_params based on the last
1861 // frame's max mv magnitude and the best ref mvs of the current
1862 // block for the given reference.
1863 step_param = (vp9_init_search_range(max_mv) +
1864 cpi->mv_step_param) / 2;
1865 } else {
1866 step_param = cpi->mv_step_param;
1869 mvp_full.row = bsi->mvp.as_mv.row >> 3;
1870 mvp_full.col = bsi->mvp.as_mv.col >> 3;
1872 if (cpi->sf.adaptive_motion_search) {
1873 mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].row >> 3;
1874 mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].col >> 3;
1875 step_param = MAX(step_param, 8);
1878 // adjust src pointer for this block
1879 mi_buf_shift(x, i);
1881 vp9_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv);
1883 bestsme = vp9_full_pixel_search(
1884 cpi, x, bsize, &mvp_full, step_param, sadpb,
1885 cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL,
1886 &bsi->ref_mv[0]->as_mv, new_mv,
1887 INT_MAX, 1);
1889 // Should we do a full search (best quality only)?
1890 if (cpi->oxcf.mode == BEST) {
1891 int_mv *const best_mv = &mi->bmi[i].as_mv[0];
1892 /* Check if mvp_full is within the range. */
1893 clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
1894 x->mv_row_min, x->mv_row_max);
1895 thissme = cpi->full_search_sad(x, &mvp_full,
1896 sadpb, 16, &cpi->fn_ptr[bsize],
1897 &bsi->ref_mv[0]->as_mv,
1898 &best_mv->as_mv);
1899 cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
1900 if (thissme < bestsme) {
1901 bestsme = thissme;
1902 *new_mv = best_mv->as_mv;
1903 } else {
1904 // The full search result is actually worse, so reinstate the
1905 // previous best vector.
1906 best_mv->as_mv = *new_mv;
1910 if (bestsme < INT_MAX) {
1911 int distortion;
1912 cpi->find_fractional_mv_step(
1914 new_mv,
1915 &bsi->ref_mv[0]->as_mv,
1916 cm->allow_high_precision_mv,
1917 x->errorperbit, &cpi->fn_ptr[bsize],
1918 cpi->sf.mv.subpel_force_stop,
1919 cpi->sf.mv.subpel_iters_per_step,
1920 cond_cost_list(cpi, cost_list),
1921 x->nmvjointcost, x->mvcost,
1922 &distortion,
1923 &x->pred_sse[mbmi->ref_frame[0]],
1924 NULL, 0, 0);
1926 // save motion search result for use in compound prediction
1927 seg_mvs[i][mbmi->ref_frame[0]].as_mv = *new_mv;
1930 if (cpi->sf.adaptive_motion_search)
1931 x->pred_mv[mbmi->ref_frame[0]] = *new_mv;
1933 // restore src pointers
1934 mi_buf_restore(x, orig_src, orig_pre);
1937 if (has_second_rf) {
1938 if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
1939 seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
1940 continue;
1943 if (has_second_rf && this_mode == NEWMV &&
1944 mbmi->interp_filter == EIGHTTAP) {
1945 // adjust src pointers
1946 mi_buf_shift(x, i);
1947 if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
1948 int rate_mv;
1949 joint_motion_search(cpi, x, bsize, frame_mv[this_mode],
1950 mi_row, mi_col, seg_mvs[i],
1951 &rate_mv);
1952 seg_mvs[i][mbmi->ref_frame[0]].as_int =
1953 frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
1954 seg_mvs[i][mbmi->ref_frame[1]].as_int =
1955 frame_mv[this_mode][mbmi->ref_frame[1]].as_int;
1957 // restore src pointers
1958 mi_buf_restore(x, orig_src, orig_pre);
1961 bsi->rdstat[i][mode_idx].brate =
1962 set_and_cost_bmi_mvs(cpi, xd, i, this_mode, mode_mv[this_mode],
1963 frame_mv, seg_mvs[i], bsi->ref_mv,
1964 x->nmvjointcost, x->mvcost);
1966 for (ref = 0; ref < 1 + has_second_rf; ++ref) {
1967 bsi->rdstat[i][mode_idx].mvs[ref].as_int =
1968 mode_mv[this_mode][ref].as_int;
1969 if (num_4x4_blocks_wide > 1)
1970 bsi->rdstat[i + 1][mode_idx].mvs[ref].as_int =
1971 mode_mv[this_mode][ref].as_int;
1972 if (num_4x4_blocks_high > 1)
1973 bsi->rdstat[i + 2][mode_idx].mvs[ref].as_int =
1974 mode_mv[this_mode][ref].as_int;
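// For 4x8 and 8x4 partitions a single label spans two 4x4 units, so the
// chosen mvs are mirrored into the extra covered positions above.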
1977 // Trap vectors that reach beyond the UMV borders
1978 if (mv_check_bounds(x, &mode_mv[this_mode][0].as_mv) ||
1979 (has_second_rf &&
1980 mv_check_bounds(x, &mode_mv[this_mode][1].as_mv)))
1981 continue;
1983 if (filter_idx > 0) {
1984 BEST_SEG_INFO *ref_bsi = bsi_buf;
1985 subpelmv = 0;
1986 have_ref = 1;
1988 for (ref = 0; ref < 1 + has_second_rf; ++ref) {
1989 subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv);
1990 have_ref &= mode_mv[this_mode][ref].as_int ==
1991 ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
1994 if (filter_idx > 1 && !subpelmv && !have_ref) {
1995 ref_bsi = bsi_buf + 1;
1996 have_ref = 1;
1997 for (ref = 0; ref < 1 + has_second_rf; ++ref)
1998 have_ref &= mode_mv[this_mode][ref].as_int ==
1999 ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
2002 if (!subpelmv && have_ref &&
2003 ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
2004 memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
2005 sizeof(SEG_RDSTAT));
2006 if (num_4x4_blocks_wide > 1)
2007 bsi->rdstat[i + 1][mode_idx].eobs =
2008 ref_bsi->rdstat[i + 1][mode_idx].eobs;
2009 if (num_4x4_blocks_high > 1)
2010 bsi->rdstat[i + 2][mode_idx].eobs =
2011 ref_bsi->rdstat[i + 2][mode_idx].eobs;
2013 if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
2014 mode_selected = this_mode;
2015 best_rd = bsi->rdstat[i][mode_idx].brdcost;
2017 continue;
2021 bsi->rdstat[i][mode_idx].brdcost =
2022 encode_inter_mb_segment(cpi, x,
2023 bsi->segment_rd - this_segment_rd, i,
2024 &bsi->rdstat[i][mode_idx].byrate,
2025 &bsi->rdstat[i][mode_idx].bdist,
2026 &bsi->rdstat[i][mode_idx].bsse,
2027 bsi->rdstat[i][mode_idx].ta,
2028 bsi->rdstat[i][mode_idx].tl,
2029 mi_row, mi_col);
2030 if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
2031 bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv,
2032 bsi->rdstat[i][mode_idx].brate, 0);
2033 bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
2034 bsi->rdstat[i][mode_idx].eobs = p->eobs[i];
2035 if (num_4x4_blocks_wide > 1)
2036 bsi->rdstat[i + 1][mode_idx].eobs = p->eobs[i + 1];
2037 if (num_4x4_blocks_high > 1)
2038 bsi->rdstat[i + 2][mode_idx].eobs = p->eobs[i + 2];
2041 if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
2042 mode_selected = this_mode;
2043 best_rd = bsi->rdstat[i][mode_idx].brdcost;
2045 } /*for each 4x4 mode*/
2047 if (best_rd == INT64_MAX) {
2048 int iy, midx;
2049 for (iy = i + 1; iy < 4; ++iy)
2050 for (midx = 0; midx < INTER_MODES; ++midx)
2051 bsi->rdstat[iy][midx].brdcost = INT64_MAX;
2052 bsi->segment_rd = INT64_MAX;
2053 return INT64_MAX;
2056 mode_idx = INTER_OFFSET(mode_selected);
2057 memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
2058 memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
2060 set_and_cost_bmi_mvs(cpi, xd, i, mode_selected, mode_mv[mode_selected],
2061 frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost,
2062 x->mvcost);
2064 br += bsi->rdstat[i][mode_idx].brate;
2065 bd += bsi->rdstat[i][mode_idx].bdist;
2066 block_sse += bsi->rdstat[i][mode_idx].bsse;
2067 segmentyrate += bsi->rdstat[i][mode_idx].byrate;
2068 this_segment_rd += bsi->rdstat[i][mode_idx].brdcost;
2070 if (this_segment_rd > bsi->segment_rd) {
2071 int iy, midx;
2072 for (iy = i + 1; iy < 4; ++iy)
2073 for (midx = 0; midx < INTER_MODES; ++midx)
2074 bsi->rdstat[iy][midx].brdcost = INT64_MAX;
2075 bsi->segment_rd = INT64_MAX;
2076 return INT64_MAX;
2079 } /* for each label */
2081 bsi->r = br;
2082 bsi->d = bd;
2083 bsi->segment_yrate = segmentyrate;
2084 bsi->segment_rd = this_segment_rd;
2085 bsi->sse = block_sse;
2087 // update the coding decisions
2088 for (k = 0; k < 4; ++k)
2089 bsi->modes[k] = mi->bmi[k].as_mode;
2091 if (bsi->segment_rd > best_rd)
2092 return INT64_MAX;
2093 /* set it to the best */
2094 for (i = 0; i < 4; i++) {
2095 mode_idx = INTER_OFFSET(bsi->modes[i]);
2096 mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int;
2097 if (has_second_ref(mbmi))
2098 mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
2099 x->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
2100 mi->bmi[i].as_mode = bsi->modes[i];
2104 /* used to set mbmi->mv.as_int */
2106 *returntotrate = bsi->r;
2107 *returndistortion = bsi->d;
2108 *returnyrate = bsi->segment_yrate;
2109 *skippable = vp9_is_skippable_in_plane(x, BLOCK_8X8, 0);
2110 *psse = bsi->sse;
2111 mbmi->mode = bsi->modes[3];
2113 return bsi->segment_rd;
2116 static void estimate_ref_frame_costs(const VP9_COMMON *cm,
2117 const MACROBLOCKD *xd,
2118 int segment_id,
2119 unsigned int *ref_costs_single,
2120 unsigned int *ref_costs_comp,
2121 vp9_prob *comp_mode_p) {
2122 int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
2123 SEG_LVL_REF_FRAME);
2124 if (seg_ref_active) {
2125 memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
2126 memset(ref_costs_comp, 0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
2127 *comp_mode_p = 128;
2128 } else {
2129 vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
2130 vp9_prob comp_inter_p = 128;
2132 if (cm->reference_mode == REFERENCE_MODE_SELECT) {
2133 comp_inter_p = vp9_get_reference_mode_prob(cm, xd);
2134 *comp_mode_p = comp_inter_p;
2135 } else {
2136 *comp_mode_p = 128;
2139 ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
2141 if (cm->reference_mode != COMPOUND_REFERENCE) {
2142 vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
2143 vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
2144 unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
2146 if (cm->reference_mode == REFERENCE_MODE_SELECT)
2147 base_cost += vp9_cost_bit(comp_inter_p, 0);
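// The single-reference costs below follow a two-level tree:
//   ref_single_p1 : LAST vs {GOLDEN, ALTREF}
//   ref_single_p2 : GOLDEN vs ALTREF
// e.g. cost(LAST)   = base_cost + cost_bit(p1, 0)
//      cost(ALTREF) = base_cost + cost_bit(p1, 1) + cost_bit(p2, 1)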
2149 ref_costs_single[LAST_FRAME] = ref_costs_single[GOLDEN_FRAME] =
2150 ref_costs_single[ALTREF_FRAME] = base_cost;
2151 ref_costs_single[LAST_FRAME] += vp9_cost_bit(ref_single_p1, 0);
2152 ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1);
2153 ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1);
2154 ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0);
2155 ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1);
2156 } else {
2157 ref_costs_single[LAST_FRAME] = 512;
2158 ref_costs_single[GOLDEN_FRAME] = 512;
2159 ref_costs_single[ALTREF_FRAME] = 512;
2161 if (cm->reference_mode != SINGLE_REFERENCE) {
2162 vp9_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd);
2163 unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
2165 if (cm->reference_mode == REFERENCE_MODE_SELECT)
2166 base_cost += vp9_cost_bit(comp_inter_p, 1);
2168 ref_costs_comp[LAST_FRAME] = base_cost + vp9_cost_bit(ref_comp_p, 0);
2169 ref_costs_comp[GOLDEN_FRAME] = base_cost + vp9_cost_bit(ref_comp_p, 1);
2170 } else {
2171 ref_costs_comp[LAST_FRAME] = 512;
2172 ref_costs_comp[GOLDEN_FRAME] = 512;
2177 static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
2178 int mode_index,
2179 int64_t comp_pred_diff[REFERENCE_MODES],
2180 const int64_t tx_size_diff[TX_MODES],
2181 int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS],
2182 int skippable) {
2183 MACROBLOCKD *const xd = &x->e_mbd;
2185 // Take a snapshot of the coding context so it can be
2186 // restored if we decide to encode this way
2187 ctx->skip = x->skip;
2188 ctx->skippable = skippable;
2189 ctx->best_mode_index = mode_index;
2190 ctx->mic = *xd->mi[0];
2191 ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
2192 ctx->comp_pred_diff = (int)comp_pred_diff[COMPOUND_REFERENCE];
2193 ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
2195 memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
2196 memcpy(ctx->best_filter_diff, best_filter_diff,
2197 sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
2200 static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
2201 const TileInfo *const tile,
2202 MV_REFERENCE_FRAME ref_frame,
2203 BLOCK_SIZE block_size,
2204 int mi_row, int mi_col,
2205 int_mv frame_nearest_mv[MAX_REF_FRAMES],
2206 int_mv frame_near_mv[MAX_REF_FRAMES],
2207 struct buf_2d yv12_mb[4][MAX_MB_PLANE]) {
2208 const VP9_COMMON *cm = &cpi->common;
2209 const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
2210 MACROBLOCKD *const xd = &x->e_mbd;
2211 MODE_INFO *const mi = xd->mi[0];
2212 int_mv *const candidates = mi->mbmi.ref_mvs[ref_frame];
2213 const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
2215 assert(yv12 != NULL);
2217 // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
2218 // use the UV scaling factors.
2219 vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
2221 // Gets an initial list of candidate vectors from neighbours and orders them
2222 vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col,
2223 NULL, NULL);
2225 // Candidate refinement carried out at encoder and decoder
2226 vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
2227 &frame_nearest_mv[ref_frame],
2228 &frame_near_mv[ref_frame]);
2230 // Further refinement that is encode side only to test the top few candidates
2231 // in full and choose the best as the centre point for subsequent searches.
2232 // The current implementation doesn't support scaling.
2233 if (!vp9_is_scaled(sf) && block_size >= BLOCK_8X8)
2234 vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
2235 ref_frame, block_size);
2238 static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
2239 BLOCK_SIZE bsize,
2240 int mi_row, int mi_col,
2241 int_mv *tmp_mv, int *rate_mv) {
2242 MACROBLOCKD *xd = &x->e_mbd;
2243 const VP9_COMMON *cm = &cpi->common;
2244 MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
2245 struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
2246 int bestsme = INT_MAX;
2247 int step_param;
2248 int sadpb = x->sadperbit16;
2249 MV mvp_full;
2250 int ref = mbmi->ref_frame[0];
2251 MV ref_mv = mbmi->ref_mvs[ref][0].as_mv;
2253 int tmp_col_min = x->mv_col_min;
2254 int tmp_col_max = x->mv_col_max;
2255 int tmp_row_min = x->mv_row_min;
2256 int tmp_row_max = x->mv_row_max;
2257 int cost_list[5];
2259 const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
2260 ref);
2262 MV pred_mv[3];
2263 pred_mv[0] = mbmi->ref_mvs[ref][0].as_mv;
2264 pred_mv[1] = mbmi->ref_mvs[ref][1].as_mv;
2265 pred_mv[2] = x->pred_mv[ref];
2267 if (scaled_ref_frame) {
2268 int i;
2269 // Swap out the reference frame for a version that's been scaled to
2270 // match the resolution of the current frame, allowing the existing
2271 // motion search code to be used without additional modifications.
2272 for (i = 0; i < MAX_MB_PLANE; i++)
2273 backup_yv12[i] = xd->plane[i].pre[0];
2275 vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
2278 vp9_set_mv_search_range(x, &ref_mv);
2280 // Work out the size of the first step in the mv step search.
2281 // 0 here is maximum length first step. 1 is MAX >> 1 etc.
2282 if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
2283 // Take a weighted average of the step_params based on the last frame's
2284 // max mv magnitude and that based on the best ref mvs of the current
2285 // block for the given reference.
2286 step_param = (vp9_init_search_range(x->max_mv_context[ref]) +
2287 cpi->mv_step_param) / 2;
2288 } else {
2289 step_param = cpi->mv_step_param;
2292 if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) {
2293 int boffset = 2 * (b_width_log2_lookup[BLOCK_64X64] -
2294 MIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
2295 step_param = MAX(step_param, boffset);
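// boffset raises the starting step index for smaller blocks; e.g. for
// BLOCK_8X8 it works out to about 2 * (4 - 1) = 6 (assuming the usual
// b_*_log2_lookup values), so the search starts with a much shorter first
// step than it would for a 64x64 block.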
2298 if (cpi->sf.adaptive_motion_search) {
2299 int bwl = b_width_log2_lookup[bsize];
2300 int bhl = b_height_log2_lookup[bsize];
2301 int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
2303 if (tlevel < 5)
2304 step_param += 2;
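// tlevel approximates the per-pixel sad of the mv prediction; when it is
// small the reference already predicts this block well, so the first step is
// shortened further (a larger step_param means a shorter first step, per the
// comment above).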
2306 // prev_mv_sad is not set up for dynamically scaled frames.
2307 if (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) {
2308 int i;
2309 for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
2310 if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
2311 x->pred_mv[ref].row = 0;
2312 x->pred_mv[ref].col = 0;
2313 tmp_mv->as_int = INVALID_MV;
2315 if (scaled_ref_frame) {
2316 int i;
2317 for (i = 0; i < MAX_MB_PLANE; ++i)
2318 xd->plane[i].pre[0] = backup_yv12[i];
2320 return;
2326 mvp_full = pred_mv[x->mv_best_ref_index[ref]];
2328 mvp_full.col >>= 3;
2329 mvp_full.row >>= 3;
2331 bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
2332 cond_cost_list(cpi, cost_list),
2333 &ref_mv, &tmp_mv->as_mv, INT_MAX, 1);
2335 x->mv_col_min = tmp_col_min;
2336 x->mv_col_max = tmp_col_max;
2337 x->mv_row_min = tmp_row_min;
2338 x->mv_row_max = tmp_row_max;
2340 if (bestsme < INT_MAX) {
2341 int dis; /* TODO: use dis in distortion calculation later. */
2342 cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
2343 cm->allow_high_precision_mv,
2344 x->errorperbit,
2345 &cpi->fn_ptr[bsize],
2346 cpi->sf.mv.subpel_force_stop,
2347 cpi->sf.mv.subpel_iters_per_step,
2348 cond_cost_list(cpi, cost_list),
2349 x->nmvjointcost, x->mvcost,
2350 &dis, &x->pred_sse[ref], NULL, 0, 0);
2352 *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
2353 x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
2355 if (cpi->sf.adaptive_motion_search)
2356 x->pred_mv[ref] = tmp_mv->as_mv;
2358 if (scaled_ref_frame) {
2359 int i;
2360 for (i = 0; i < MAX_MB_PLANE; i++)
2361 xd->plane[i].pre[0] = backup_yv12[i];
2367 static INLINE void restore_dst_buf(MACROBLOCKD *xd,
2368 uint8_t *orig_dst[MAX_MB_PLANE],
2369 int orig_dst_stride[MAX_MB_PLANE]) {
2370 int i;
2371 for (i = 0; i < MAX_MB_PLANE; i++) {
2372 xd->plane[i].dst.buf = orig_dst[i];
2373 xd->plane[i].dst.stride = orig_dst_stride[i];
2377 // In some situations we want to discount the apparent cost of a new motion
2378 // vector. Where there is a subtle motion field and especially where there is
2379 // low spatial complexity then it can be hard to cover the cost of a new motion
2380 // vector in a single block, even if that motion vector reduces distortion.
2381 // However, once established that vector may be usable through the nearest and
2382 // near mv modes to reduce distortion in subsequent blocks and also improve
2383 // visual quality.
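// As a rough example, with NEW_MV_DISCOUNT_FACTOR == 8 a NEWMV whose motion
// vector rate is 240 is charged only MAX(240 / 8, 1) == 30 when this test
// passes (see the *rate2 updates in handle_inter_mode below).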
2384 static int discount_newmv_test(const VP9_COMP *cpi,
2385 int this_mode,
2386 int_mv this_mv,
2387 int_mv (*mode_mv)[MAX_REF_FRAMES],
2388 int ref_frame) {
2389 return (!cpi->rc.is_src_frame_alt_ref &&
2390 (this_mode == NEWMV) &&
2391 (this_mv.as_int != 0) &&
2392 ((mode_mv[NEARESTMV][ref_frame].as_int == 0) ||
2393 (mode_mv[NEARESTMV][ref_frame].as_int == INVALID_MV)) &&
2394 ((mode_mv[NEARMV][ref_frame].as_int == 0) ||
2395 (mode_mv[NEARMV][ref_frame].as_int == INVALID_MV)));
2398 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
2399 BLOCK_SIZE bsize,
2400 int64_t txfm_cache[],
2401 int *rate2, int64_t *distortion,
2402 int *skippable,
2403 int *rate_y, int *rate_uv,
2404 int *disable_skip,
2405 int_mv (*mode_mv)[MAX_REF_FRAMES],
2406 int mi_row, int mi_col,
2407 int_mv single_newmv[MAX_REF_FRAMES],
2408 INTERP_FILTER (*single_filter)[MAX_REF_FRAMES],
2409 int (*single_skippable)[MAX_REF_FRAMES],
2410 int64_t *psse,
2411 const int64_t ref_best_rd,
2412 int64_t *mask_filter,
2413 int64_t filter_cache[]) {
2414 VP9_COMMON *cm = &cpi->common;
2415 MACROBLOCKD *xd = &x->e_mbd;
2416 MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
2417 const int is_comp_pred = has_second_ref(mbmi);
2418 const int this_mode = mbmi->mode;
2419 int_mv *frame_mv = mode_mv[this_mode];
2420 int i;
2421 int refs[2] = { mbmi->ref_frame[0],
2422 (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
2423 int_mv cur_mv[2];
2424 #if CONFIG_VP9_HIGHBITDEPTH
2425 DECLARE_ALIGNED(16, uint16_t, tmp_buf16[MAX_MB_PLANE * 64 * 64]);
2426 uint8_t *tmp_buf;
2427 #else
2428 DECLARE_ALIGNED(16, uint8_t, tmp_buf[MAX_MB_PLANE * 64 * 64]);
2429 #endif // CONFIG_VP9_HIGHBITDEPTH
2430 int pred_exists = 0;
2431 int intpel_mv;
2432 int64_t rd, tmp_rd, best_rd = INT64_MAX;
2433 int best_needs_copy = 0;
2434 uint8_t *orig_dst[MAX_MB_PLANE];
2435 int orig_dst_stride[MAX_MB_PLANE];
2436 int rs = 0;
2437 INTERP_FILTER best_filter = SWITCHABLE;
2438 uint8_t skip_txfm[MAX_MB_PLANE << 2] = {0};
2439 int64_t bsse[MAX_MB_PLANE << 2] = {0};
2441 int bsl = mi_width_log2_lookup[bsize];
2442 int pred_filter_search = cpi->sf.cb_pred_filter_search ?
2443 (((mi_row + mi_col) >> bsl) +
2444 get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
2446 int skip_txfm_sb = 0;
2447 int64_t skip_sse_sb = INT64_MAX;
2448 int64_t distortion_y = 0, distortion_uv = 0;
2450 #if CONFIG_VP9_HIGHBITDEPTH
2451 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
2452 tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16);
2453 } else {
2454 tmp_buf = (uint8_t *)tmp_buf16;
2456 #endif // CONFIG_VP9_HIGHBITDEPTH
2458 if (pred_filter_search) {
2459 INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE;
2460 if (xd->up_available)
2461 af = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
2462 if (xd->left_available)
2463 lf = xd->mi[-1]->mbmi.interp_filter;
2465 if ((this_mode != NEWMV) || (af == lf))
2466 best_filter = af;
2469 if (is_comp_pred) {
2470 if (frame_mv[refs[0]].as_int == INVALID_MV ||
2471 frame_mv[refs[1]].as_int == INVALID_MV)
2472 return INT64_MAX;
2474 if (cpi->sf.adaptive_mode_search) {
2475 if (single_filter[this_mode][refs[0]] ==
2476 single_filter[this_mode][refs[1]])
2477 best_filter = single_filter[this_mode][refs[0]];
2481 if (this_mode == NEWMV) {
2482 int rate_mv;
2483 if (is_comp_pred) {
2484 // Initialize mv using single prediction mode result.
2485 frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
2486 frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
2488 if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
2489 joint_motion_search(cpi, x, bsize, frame_mv,
2490 mi_row, mi_col, single_newmv, &rate_mv);
2491 } else {
2492 rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
2493 &mbmi->ref_mvs[refs[0]][0].as_mv,
2494 x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
2495 rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv,
2496 &mbmi->ref_mvs[refs[1]][0].as_mv,
2497 x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
2499 *rate2 += rate_mv;
2500 } else {
2501 int_mv tmp_mv;
2502 single_motion_search(cpi, x, bsize, mi_row, mi_col,
2503 &tmp_mv, &rate_mv);
2504 if (tmp_mv.as_int == INVALID_MV)
2505 return INT64_MAX;
2507 frame_mv[refs[0]].as_int =
2508 xd->mi[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
2509 single_newmv[refs[0]].as_int = tmp_mv.as_int;
2511 // Estimate the rate implications of a new mv but discount this
2512 // under certain circumstances where we want to help initiate a weak
2513 // motion field, where the distortion gain for a single block may not
2514 // be enough to overcome the cost of a new mv.
2515 if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0])) {
2516 *rate2 += MAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
2517 } else {
2518 *rate2 += rate_mv;
2523 for (i = 0; i < is_comp_pred + 1; ++i) {
2524 cur_mv[i] = frame_mv[refs[i]];
2525 // Clip "next_nearest" so that it does not extend too far out of the image.
2526 if (this_mode != NEWMV)
2527 clamp_mv2(&cur_mv[i].as_mv, xd);
2529 if (mv_check_bounds(x, &cur_mv[i].as_mv))
2530 return INT64_MAX;
2531 mbmi->mv[i].as_int = cur_mv[i].as_int;
2534 // Do the first prediction into the destination buffer. Do the next
2535 // prediction into a temporary buffer. Then keep track of which one
2536 // of these currently holds the best predictor, and use the other
2537 // one for future predictions. In the end, copy from tmp_buf to
2538 // dst if necessary.
2539 for (i = 0; i < MAX_MB_PLANE; i++) {
2540 orig_dst[i] = xd->plane[i].dst.buf;
2541 orig_dst_stride[i] = xd->plane[i].dst.stride;
2544 // We don't include the cost of the second reference here, because there
2545 // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
2546 // words if you present them in that order, the second one is always known
2547 // if the first is known.
2549 // Under some circumstances we discount the cost of new mv mode to encourage
2550 // initiation of a motion field.
2551 if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]],
2552 mode_mv, refs[0])) {
2553 *rate2 += MIN(cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]),
2554 cost_mv_ref(cpi, NEARESTMV, mbmi->mode_context[refs[0]]));
2555 } else {
2556 *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]);
2559 if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd &&
2560 mbmi->mode != NEARESTMV)
2561 return INT64_MAX;
2563 pred_exists = 0;
2564 // Are all MVs integer pel for Y and UV
2565 intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv);
2566 if (is_comp_pred)
2567 intpel_mv &= !mv_has_subpel(&mbmi->mv[1].as_mv);
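// When every mv is integer-pel the interpolation filter has no effect on the
// prediction, so the modeled rate/distortion of the first filter can be
// reused for the others (the tmp_rate_sum / tmp_dist_sum path below).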
2569 // Search for best switchable filter by checking the variance of
2570 // pred error irrespective of whether the filter will be used
2571 for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
2572 filter_cache[i] = INT64_MAX;
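// filter_cache[0 .. SWITCHABLE_FILTERS - 1] hold the modeled rd for each
// individual filter; filter_cache[SWITCHABLE_FILTERS] tracks the best rd
// including the cost of signalling a switchable filter.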
2574 if (cm->interp_filter != BILINEAR) {
2575 if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
2576 best_filter = EIGHTTAP;
2577 } else if (best_filter == SWITCHABLE) {
2578 int newbest;
2579 int tmp_rate_sum = 0;
2580 int64_t tmp_dist_sum = 0;
2582 for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
2583 int j;
2584 int64_t rs_rd;
2585 int tmp_skip_sb = 0;
2586 int64_t tmp_skip_sse = INT64_MAX;
2588 mbmi->interp_filter = i;
2589 rs = vp9_get_switchable_rate(cpi, xd);
2590 rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
2592 if (i > 0 && intpel_mv) {
2593 rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum);
2594 filter_cache[i] = rd;
2595 filter_cache[SWITCHABLE_FILTERS] =
2596 MIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
2597 if (cm->interp_filter == SWITCHABLE)
2598 rd += rs_rd;
2599 *mask_filter = MAX(*mask_filter, rd);
2600 } else {
2601 int rate_sum = 0;
2602 int64_t dist_sum = 0;
2603 if (i > 0 && cpi->sf.adaptive_interp_filter_search &&
2604 (cpi->sf.interp_filter_search_mask & (1 << i))) {
2605 rate_sum = INT_MAX;
2606 dist_sum = INT64_MAX;
2607 continue;
2610 if ((cm->interp_filter == SWITCHABLE &&
2611 (!i || best_needs_copy)) ||
2612 (cm->interp_filter != SWITCHABLE &&
2613 (cm->interp_filter == mbmi->interp_filter ||
2614 (i == 0 && intpel_mv)))) {
2615 restore_dst_buf(xd, orig_dst, orig_dst_stride);
2616 } else {
2617 for (j = 0; j < MAX_MB_PLANE; j++) {
2618 xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
2619 xd->plane[j].dst.stride = 64;
2622 vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
2623 model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum,
2624 &tmp_skip_sb, &tmp_skip_sse);
2626 rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
2627 filter_cache[i] = rd;
2628 filter_cache[SWITCHABLE_FILTERS] =
2629 MIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
2630 if (cm->interp_filter == SWITCHABLE)
2631 rd += rs_rd;
2632 *mask_filter = MAX(*mask_filter, rd);
2634 if (i == 0 && intpel_mv) {
2635 tmp_rate_sum = rate_sum;
2636 tmp_dist_sum = dist_sum;
2640 if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
2641 if (rd / 2 > ref_best_rd) {
2642 restore_dst_buf(xd, orig_dst, orig_dst_stride);
2643 return INT64_MAX;
2646 newbest = i == 0 || rd < best_rd;
2648 if (newbest) {
2649 best_rd = rd;
2650 best_filter = mbmi->interp_filter;
2651 if (cm->interp_filter == SWITCHABLE && i && !intpel_mv)
2652 best_needs_copy = !best_needs_copy;
2655 if ((cm->interp_filter == SWITCHABLE && newbest) ||
2656 (cm->interp_filter != SWITCHABLE &&
2657 cm->interp_filter == mbmi->interp_filter)) {
2658 pred_exists = 1;
2659 tmp_rd = best_rd;
2661 skip_txfm_sb = tmp_skip_sb;
2662 skip_sse_sb = tmp_skip_sse;
2663 memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
2664 memcpy(bsse, x->bsse, sizeof(bsse));
2667 restore_dst_buf(xd, orig_dst, orig_dst_stride);
2670 // Set the appropriate filter
2671 mbmi->interp_filter = cm->interp_filter != SWITCHABLE ?
2672 cm->interp_filter : best_filter;
2673 rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi, xd) : 0;
2675 if (pred_exists) {
2676 if (best_needs_copy) {
2677 // again temporarily set the buffers to local memory to prevent a memcpy
2678 for (i = 0; i < MAX_MB_PLANE; i++) {
2679 xd->plane[i].dst.buf = tmp_buf + i * 64 * 64;
2680 xd->plane[i].dst.stride = 64;
2683 rd = tmp_rd + RDCOST(x->rdmult, x->rddiv, rs, 0);
2684 } else {
2685 int tmp_rate;
2686 int64_t tmp_dist;
2687 // Handles the special case when a filter that is not in the
2688 // switchable list (e.g. bilinear) is indicated at the frame level, or
2689 // the skip condition holds.
2690 vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
2691 model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist,
2692 &skip_txfm_sb, &skip_sse_sb);
2693 rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
2694 memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
2695 memcpy(bsse, x->bsse, sizeof(bsse));
2698 if (!is_comp_pred)
2699 single_filter[this_mode][refs[0]] = mbmi->interp_filter;
2701 if (cpi->sf.adaptive_mode_search)
2702 if (is_comp_pred)
2703 if (single_skippable[this_mode][refs[0]] &&
2704 single_skippable[this_mode][refs[1]])
2705 memset(skip_txfm, 1, sizeof(skip_txfm));
2707 if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
2708 // If the current pred_error modeled rd is substantially higher than the
2709 // best so far, do not bother doing the full rd.
2710 if (rd / 2 > ref_best_rd) {
2711 restore_dst_buf(xd, orig_dst, orig_dst_stride);
2712 return INT64_MAX;
2716 if (cm->interp_filter == SWITCHABLE)
2717 *rate2 += rs;
2719 memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm));
2720 memcpy(x->bsse, bsse, sizeof(bsse));
2722 if (!skip_txfm_sb) {
2723 int skippable_y, skippable_uv;
2724 int64_t sseuv = INT64_MAX;
2725 int64_t rdcosty = INT64_MAX;
2727 // Y cost and distortion
2728 vp9_subtract_plane(x, bsize, 0);
2729 super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse,
2730 bsize, txfm_cache, ref_best_rd);
2732 if (*rate_y == INT_MAX) {
2733 *rate2 = INT_MAX;
2734 *distortion = INT64_MAX;
2735 restore_dst_buf(xd, orig_dst, orig_dst_stride);
2736 return INT64_MAX;
2739 *rate2 += *rate_y;
2740 *distortion += distortion_y;
2742 rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
2743 rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
2745 if (!super_block_uvrd(cpi, x, rate_uv, &distortion_uv, &skippable_uv,
2746 &sseuv, bsize, ref_best_rd - rdcosty)) {
2747 *rate2 = INT_MAX;
2748 *distortion = INT64_MAX;
2749 restore_dst_buf(xd, orig_dst, orig_dst_stride);
2750 return INT64_MAX;
2753 *psse += sseuv;
2754 *rate2 += *rate_uv;
2755 *distortion += distortion_uv;
2756 *skippable = skippable_y && skippable_uv;
2757 } else {
2758 x->skip = 1;
2759 *disable_skip = 1;
2761 // The cost of skip bit needs to be added.
2762 *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
2764 *distortion = skip_sse_sb;
2767 if (!is_comp_pred)
2768 single_skippable[this_mode][refs[0]] = *skippable;
2770 restore_dst_buf(xd, orig_dst, orig_dst_stride);
2771 return 0; // The rate-distortion cost will be re-calculated by caller.
2774 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
2775 RD_COST *rd_cost, BLOCK_SIZE bsize,
2776 PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
2777 VP9_COMMON *const cm = &cpi->common;
2778 MACROBLOCKD *const xd = &x->e_mbd;
2779 struct macroblockd_plane *const pd = xd->plane;
2780 int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
2781 int y_skip = 0, uv_skip = 0;
2782 int64_t dist_y = 0, dist_uv = 0, tx_cache[TX_MODES] = { 0 };
2783 TX_SIZE max_uv_tx_size;
2784 x->skip_encode = 0;
2785 ctx->skip = 0;
2786 xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
2787 xd->mi[0]->mbmi.ref_frame[1] = NONE;
2789 if (bsize >= BLOCK_8X8) {
2790 if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
2791 &dist_y, &y_skip, bsize, tx_cache,
2792 best_rd) >= best_rd) {
2793 rd_cost->rate = INT_MAX;
2794 return;
2796 } else {
2797 y_skip = 0;
2798 if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
2799 &dist_y, best_rd) >= best_rd) {
2800 rd_cost->rate = INT_MAX;
2801 return;
2804 max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize,
2805 pd[1].subsampling_x,
2806 pd[1].subsampling_y);
2807 rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
2808 &dist_uv, &uv_skip, MAX(BLOCK_8X8, bsize),
2809 max_uv_tx_size);
2811 if (y_skip && uv_skip) {
2812 rd_cost->rate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
2813 vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
2814 rd_cost->dist = dist_y + dist_uv;
2815 vp9_zero(ctx->tx_rd_diff);
2816 } else {
2817 int i;
2818 rd_cost->rate = rate_y + rate_uv +
2819 vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
2820 rd_cost->dist = dist_y + dist_uv;
2821 if (cpi->sf.tx_size_search_method == USE_FULL_RD)
2822 for (i = 0; i < TX_MODES; i++) {
2823 if (tx_cache[i] < INT64_MAX && tx_cache[cm->tx_mode] < INT64_MAX)
2824 ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode];
2825 else
2826 ctx->tx_rd_diff[i] = 0;
2830 ctx->mic = *xd->mi[0];
2831 rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
2834 // This function is designed to apply a bias or adjustment to an rd value based
2835 // on the relative variance of the source and reconstruction.
2836 #define LOW_VAR_THRESH 16
2837 #define VLOW_ADJ_MAX 25
2838 #define VHIGH_ADJ_MAX 8
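// A sketch of the adjustment below: var_error is 0 when the source and recon
// variances match (200 * v * v / (2 * v * v) == 100, so 100 - 100 == 0) and
// tends towards 100 as they diverge. The resulting rd penalty is capped at
// VLOW_ADJ_MAX (25%) for the low-complexity intra case and VHIGH_ADJ_MAX (8%)
// for the very low source variance case.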
2839 static void rd_variance_adjustment(VP9_COMP *cpi,
2840 MACROBLOCK *x,
2841 BLOCK_SIZE bsize,
2842 int64_t *this_rd,
2843 MV_REFERENCE_FRAME ref_frame,
2844 unsigned int source_variance) {
2845 MACROBLOCKD *const xd = &x->e_mbd;
2846 unsigned int recon_variance;
2847 unsigned int absvar_diff = 0;
2848 int64_t var_error = 0;
2849 int64_t var_factor = 0;
2851 if (*this_rd == INT64_MAX)
2852 return;
2854 #if CONFIG_VP9_HIGHBITDEPTH
2855 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
2856 recon_variance =
2857 vp9_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize, xd->bd);
2858 } else {
2859 recon_variance =
2860 vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
2862 #else
2863 recon_variance =
2864 vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
2865 #endif // CONFIG_VP9_HIGHBITDEPTH
2867 if ((source_variance + recon_variance) > LOW_VAR_THRESH) {
2868 absvar_diff = (source_variance > recon_variance)
2869 ? (source_variance - recon_variance)
2870 : (recon_variance - source_variance);
2872 var_error = (200 * source_variance * recon_variance) /
2873 ((source_variance * source_variance) +
2874 (recon_variance * recon_variance));
2875 var_error = 100 - var_error;
2878 // Source variance above a threshold and ref frame is intra.
2879 // This case is targeted mainly at discouraging intra modes that give rise
2880 // to a predictor with a low spatial complexity compared to the source.
2881 if ((source_variance > LOW_VAR_THRESH) && (ref_frame == INTRA_FRAME) &&
2882 (source_variance > recon_variance)) {
2883 var_factor = MIN(absvar_diff, MIN(VLOW_ADJ_MAX, var_error));
2884 // A second possible case of interest is where the source variance
2885 // is very low and we wish to discourage false texture or motion trails.
2886 } else if ((source_variance < (LOW_VAR_THRESH >> 1)) &&
2887 (recon_variance > source_variance)) {
2888 var_factor = MIN(absvar_diff, MIN(VHIGH_ADJ_MAX, var_error));
2890 *this_rd += (*this_rd * var_factor) / 100;
2893 void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
2894 TileDataEnc *tile_data,
2895 MACROBLOCK *x,
2896 int mi_row, int mi_col,
2897 RD_COST *rd_cost, BLOCK_SIZE bsize,
2898 PICK_MODE_CONTEXT *ctx,
2899 int64_t best_rd_so_far) {
2900 VP9_COMMON *const cm = &cpi->common;
2901 TileInfo *const tile_info = &tile_data->tile_info;
2902 RD_OPT *const rd_opt = &cpi->rd;
2903 SPEED_FEATURES *const sf = &cpi->sf;
2904 MACROBLOCKD *const xd = &x->e_mbd;
2905 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
2906 const struct segmentation *const seg = &cm->seg;
2907 PREDICTION_MODE this_mode;
2908 MV_REFERENCE_FRAME ref_frame, second_ref_frame;
2909 unsigned char segment_id = mbmi->segment_id;
2910 int comp_pred, i, k;
2911 int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
2912 struct buf_2d yv12_mb[4][MAX_MB_PLANE];
2913 int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
2914 INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES];
2915 int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES];
2916 static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
2917 VP9_ALT_FLAG };
2918 int64_t best_rd = best_rd_so_far;
2919 int64_t best_tx_rd[TX_MODES];
2920 int64_t best_tx_diff[TX_MODES];
2921 int64_t best_pred_diff[REFERENCE_MODES];
2922 int64_t best_pred_rd[REFERENCE_MODES];
2923 int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
2924 int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
2925 MB_MODE_INFO best_mbmode;
2926 int best_mode_skippable = 0;
2927 int midx, best_mode_index = -1;
2928 unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
2929 vp9_prob comp_mode_p;
2930 int64_t best_intra_rd = INT64_MAX;
2931 unsigned int best_pred_sse = UINT_MAX;
2932 PREDICTION_MODE best_intra_mode = DC_PRED;
2933 int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
2934 int64_t dist_uv[TX_SIZES];
2935 int skip_uv[TX_SIZES];
2936 PREDICTION_MODE mode_uv[TX_SIZES];
2937 const int intra_cost_penalty = vp9_get_intra_cost_penalty(
2938 cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
2939 int best_skip2 = 0;
2940 uint8_t ref_frame_skip_mask[2] = { 0 };
2941 uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
2942 int mode_skip_start = sf->mode_skip_start + 1;
2943 const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
2944 const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
2945 int64_t mode_threshold[MAX_MODES];
2946 int *mode_map = tile_data->mode_map[bsize];
2947 const int mode_search_skip_flags = sf->mode_search_skip_flags;
2948 int64_t mask_filter = 0;
2949 int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
2951 vp9_zero(best_mbmode);
2953 x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
2955 for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
2956 filter_cache[i] = INT64_MAX;
2958 estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
2959 &comp_mode_p);
2961 for (i = 0; i < REFERENCE_MODES; ++i)
2962 best_pred_rd[i] = INT64_MAX;
2963 for (i = 0; i < TX_MODES; i++)
2964 best_tx_rd[i] = INT64_MAX;
2965 for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
2966 best_filter_rd[i] = INT64_MAX;
2967 for (i = 0; i < TX_SIZES; i++)
2968 rate_uv_intra[i] = INT_MAX;
2969 for (i = 0; i < MAX_REF_FRAMES; ++i)
2970 x->pred_sse[i] = INT_MAX;
2971 for (i = 0; i < MB_MODE_COUNT; ++i) {
2972 for (k = 0; k < MAX_REF_FRAMES; ++k) {
2973 single_inter_filter[i][k] = SWITCHABLE;
2974 single_skippable[i][k] = 0;
2978 rd_cost->rate = INT_MAX;
2980 for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
2981 x->pred_mv_sad[ref_frame] = INT_MAX;
2982 if (cpi->ref_frame_flags & flag_list[ref_frame]) {
2983 assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
2984 setup_buffer_inter(cpi, x, tile_info, ref_frame, bsize, mi_row, mi_col,
2985 frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
2987 frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
2988 frame_mv[ZEROMV][ref_frame].as_int = 0;
2991 for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
2992 if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
2993 // Skip checking missing references in both single and compound reference
2994 // modes. Note that a mode will be skipped iff both reference frames
2995 // are masked out.
2996 ref_frame_skip_mask[0] |= (1 << ref_frame);
2997 ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
2998 } else if (sf->reference_masking) {
2999 for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
3000 // Skip fixed mv modes for poor references
3001 if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
3002 mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
3003 break;
3008 // If the segment reference frame feature is enabled,
3009 // then do nothing if the current ref frame is not allowed.
3009 if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
3010 vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
3011 ref_frame_skip_mask[0] |= (1 << ref_frame);
3012 ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
3016 // Disable this drop out case if the ref frame
3017 // segment level feature is enabled for this segment. This is to
3018 // prevent the possibility that we end up unable to pick any mode.
3019 if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
3020 // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
3021 // unless ARNR filtering is enabled in which case we want
3022 // an unfiltered alternative. We allow near/nearest as well
3023 // because they may result in zero-zero MVs but be cheaper.
3024 if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
3025 ref_frame_skip_mask[0] = (1 << LAST_FRAME) | (1 << GOLDEN_FRAME);
3026 ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
3027 mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
3028 if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0)
3029 mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV);
3030 if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0)
3031 mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV);
3035 if (cpi->rc.is_src_frame_alt_ref) {
3036 if (sf->alt_ref_search_fp) {
3037 mode_skip_mask[ALTREF_FRAME] = 0;
3038 ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME);
3039 ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
3043 if (sf->alt_ref_search_fp)
3044 if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX)
3045 if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1))
3046 mode_skip_mask[ALTREF_FRAME] |= INTER_ALL;
3048 if (sf->adaptive_mode_search) {
3049 if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref &&
3050 cpi->rc.frames_since_golden >= 3)
3051 if (x->pred_mv_sad[GOLDEN_FRAME] > (x->pred_mv_sad[LAST_FRAME] << 1))
3052 mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL;
3055 if (bsize > sf->max_intra_bsize) {
3056 ref_frame_skip_mask[0] |= (1 << INTRA_FRAME);
3057 ref_frame_skip_mask[1] |= (1 << INTRA_FRAME);
3060 mode_skip_mask[INTRA_FRAME] |=
3061 ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
3063 for (i = 0; i <= LAST_NEW_MV_INDEX; ++i)
3064 mode_threshold[i] = 0;
3065 for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i)
3066 mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5;
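// The per-mode frequency factor adaptively scales the base rd threshold;
// it is nominally 32, so the >> 5 above recovers the unscaled threshold, and
// modes that rarely win accumulate a larger factor and are pruned earlier.
// (The nominal value of 32 is an assumption; the scaling logic is the point.)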
3068 midx = sf->schedule_mode_search ? mode_skip_start : 0;
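// When mode search scheduling is enabled, the loop below is a small exchange
// (bubble) sort of mode_map[5 .. mode_skip_start) by ascending rd threshold,
// so the more promising modes are examined first; the first five entries keep
// their fixed order.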
3069 while (midx > 4) {
3070 uint8_t end_pos = 0;
3071 for (i = 5; i < midx; ++i) {
3072 if (mode_threshold[mode_map[i - 1]] > mode_threshold[mode_map[i]]) {
3073 uint8_t tmp = mode_map[i];
3074 mode_map[i] = mode_map[i - 1];
3075 mode_map[i - 1] = tmp;
3076 end_pos = i;
3079 midx = end_pos;
3082 for (midx = 0; midx < MAX_MODES; ++midx) {
3083 int mode_index = mode_map[midx];
3084 int mode_excluded = 0;
3085 int64_t this_rd = INT64_MAX;
3086 int disable_skip = 0;
3087 int compmode_cost = 0;
3088 int rate2 = 0, rate_y = 0, rate_uv = 0;
3089 int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
3090 int skippable = 0;
3091 int64_t tx_cache[TX_MODES];
3092 int this_skip2 = 0;
3093 int64_t total_sse = INT64_MAX;
3094 int early_term = 0;
3096 this_mode = vp9_mode_order[mode_index].mode;
3097 ref_frame = vp9_mode_order[mode_index].ref_frame[0];
3098 second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
3100 // Look at the reference frame of the best mode so far and set the
3101 // skip mask to look at a subset of the remaining modes.
3102 if (midx == mode_skip_start && best_mode_index >= 0) {
3103 switch (best_mbmode.ref_frame[0]) {
3104 case INTRA_FRAME:
3105 break;
3106 case LAST_FRAME:
3107 ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK;
3108 ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
3109 break;
3110 case GOLDEN_FRAME:
3111 ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK;
3112 ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
3113 break;
3114 case ALTREF_FRAME:
3115 ref_frame_skip_mask[0] |= ALT_REF_MODE_MASK;
3116 break;
3117 case NONE:
3118 case MAX_REF_FRAMES:
3119 assert(0 && "Invalid Reference frame");
3120 break;
3124 if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
3125 (ref_frame_skip_mask[1] & (1 << MAX(0, second_ref_frame))))
3126 continue;
3128 if (mode_skip_mask[ref_frame] & (1 << this_mode))
3129 continue;
3131 // Test best rd so far against threshold for trying this mode.
3132 if (best_mode_skippable && sf->schedule_mode_search)
3133 mode_threshold[mode_index] <<= 1;
3135 if (best_rd < mode_threshold[mode_index])
3136 continue;
3138 if (sf->motion_field_mode_search) {
3139 const int mi_width = MIN(num_8x8_blocks_wide_lookup[bsize],
3140 tile_info->mi_col_end - mi_col);
3141 const int mi_height = MIN(num_8x8_blocks_high_lookup[bsize],
3142 tile_info->mi_row_end - mi_row);
3143 const int bsl = mi_width_log2_lookup[bsize];
3144 int cb_partition_search_ctrl = (((mi_row + mi_col) >> bsl)
3145 + get_chessboard_index(cm->current_video_frame)) & 0x1;
3146 MB_MODE_INFO *ref_mbmi;
3147 int const_motion = 1;
3148 int skip_ref_frame = !cb_partition_search_ctrl;
3149 MV_REFERENCE_FRAME rf = NONE;
3150 int_mv ref_mv;
3151 ref_mv.as_int = INVALID_MV;
3153 if ((mi_row - 1) >= tile_info->mi_row_start) {
3154 ref_mv = xd->mi[-xd->mi_stride]->mbmi.mv[0];
3155 rf = xd->mi[-xd->mi_stride]->mbmi.ref_frame[0];
3156 for (i = 0; i < mi_width; ++i) {
3157 ref_mbmi = &xd->mi[-xd->mi_stride + i]->mbmi;
3158 const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) &&
3159 (ref_frame == ref_mbmi->ref_frame[0]);
3160 skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]);
3164 if ((mi_col - 1) >= tile_info->mi_col_start) {
3165 if (ref_mv.as_int == INVALID_MV)
3166 ref_mv = xd->mi[-1]->mbmi.mv[0];
3167 if (rf == NONE)
3168 rf = xd->mi[-1]->mbmi.ref_frame[0];
3169 for (i = 0; i < mi_height; ++i) {
3170 ref_mbmi = &xd->mi[i * xd->mi_stride - 1]->mbmi;
3171 const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) &&
3172 (ref_frame == ref_mbmi->ref_frame[0]);
3173 skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]);
3177 if (skip_ref_frame && this_mode != NEARESTMV && this_mode != NEWMV)
3178 if (rf > INTRA_FRAME)
3179 if (ref_frame != rf)
3180 continue;
3182 if (const_motion)
3183 if (this_mode == NEARMV || this_mode == ZEROMV)
3184 continue;
3187 comp_pred = second_ref_frame > INTRA_FRAME;
3188 if (comp_pred) {
3189 if (!cpi->allow_comp_inter_inter)
3190 continue;
3192 // Skip compound inter modes if ARF is not available.
3193 if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
3194 continue;
3196 // Do not allow compound prediction if the segment level reference frame
3197 // feature is in use as in this case there can only be one reference.
3198 if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
3199 continue;
3201 if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
3202 best_mode_index >= 0 && best_mbmode.ref_frame[0] == INTRA_FRAME)
3203 continue;
3205 mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
3206 } else {
3207 if (ref_frame != INTRA_FRAME)
3208 mode_excluded = cm->reference_mode == COMPOUND_REFERENCE;
3211 if (ref_frame == INTRA_FRAME) {
3212 if (sf->adaptive_mode_search)
3213 if ((x->source_variance << num_pels_log2_lookup[bsize]) > best_pred_sse)
3214 continue;
3216 if (this_mode != DC_PRED) {
3217 // Disable intra modes other than DC_PRED for blocks with low variance
3218 // Threshold for intra skipping based on source variance
3219 // TODO(debargha): Specialize the threshold for super block sizes
3220 const unsigned int skip_intra_var_thresh = 64;
3221 if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
3222 x->source_variance < skip_intra_var_thresh)
3223 continue;
3224 // Only search the oblique modes if the best so far is
3225 // one of the neighboring directional modes
3226 if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
3227 (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
3228 if (best_mode_index >= 0 &&
3229 best_mbmode.ref_frame[0] > INTRA_FRAME)
3230 continue;
3232 if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
3233 if (conditional_skipintra(this_mode, best_intra_mode))
3234 continue;
3237 } else {
3238 const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame};
3239 if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
3240 this_mode, ref_frames))
3241 continue;
3244 mbmi->mode = this_mode;
3245 mbmi->uv_mode = DC_PRED;
3246 mbmi->ref_frame[0] = ref_frame;
3247 mbmi->ref_frame[1] = second_ref_frame;
3248 // Evaluate all sub-pel filters irrespective of whether we can use
3249 // them for this frame.
3250 mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
3251 : cm->interp_filter;
3252 mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
3254 x->skip = 0;
3255 set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
3257 // Select prediction reference frames.
3258 for (i = 0; i < MAX_MB_PLANE; i++) {
3259 xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
3260 if (comp_pred)
3261 xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
3264 for (i = 0; i < TX_MODES; ++i)
3265 tx_cache[i] = INT64_MAX;
3267 if (ref_frame == INTRA_FRAME) {
3268 TX_SIZE uv_tx;
3269 struct macroblockd_plane *const pd = &xd->plane[1];
3270 memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
3271 super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
3272 NULL, bsize, tx_cache, best_rd);
3273 if (rate_y == INT_MAX)
3274 continue;
3276 uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize, pd->subsampling_x,
3277 pd->subsampling_y);
3278 if (rate_uv_intra[uv_tx] == INT_MAX) {
3279 choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx,
3280 &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
3281 &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
3284 rate_uv = rate_uv_tokenonly[uv_tx];
3285 distortion_uv = dist_uv[uv_tx];
3286 skippable = skippable && skip_uv[uv_tx];
3287 mbmi->uv_mode = mode_uv[uv_tx];
3289 rate2 = rate_y + cpi->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
3290 if (this_mode != DC_PRED && this_mode != TM_PRED)
3291 rate2 += intra_cost_penalty;
3292 distortion2 = distortion_y + distortion_uv;
3293 } else {
3294 this_rd = handle_inter_mode(cpi, x, bsize,
3295 tx_cache,
3296 &rate2, &distortion2, &skippable,
3297 &rate_y, &rate_uv,
3298 &disable_skip, frame_mv,
3299 mi_row, mi_col,
3300 single_newmv, single_inter_filter,
3301 single_skippable, &total_sse, best_rd,
3302 &mask_filter, filter_cache);
3303 if (this_rd == INT64_MAX)
3304 continue;
3306 compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
3308 if (cm->reference_mode == REFERENCE_MODE_SELECT)
3309 rate2 += compmode_cost;
3312 // Estimate the reference frame signaling cost and add it
3313 // to the rolling cost variable.
3314 if (comp_pred) {
3315 rate2 += ref_costs_comp[ref_frame];
3316 } else {
3317 rate2 += ref_costs_single[ref_frame];
3320 if (!disable_skip) {
3321 if (skippable) {
3322 // Back out the coefficient coding costs
3323 rate2 -= (rate_y + rate_uv);
3325 // Cost the skip mb case
3326 rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
3327 } else if (ref_frame != INTRA_FRAME && !xd->lossless) {
3328 if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
3329 RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
3330 // Add in the cost of the no skip flag.
3331 rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
3332 } else {
3333 // FIXME(rbultje) make this work for splitmv also
3334 rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
3335 distortion2 = total_sse;
3336 assert(total_sse >= 0);
3337 rate2 -= (rate_y + rate_uv);
3338 this_skip2 = 1;
3340 } else {
3341 // Add in the cost of the no skip flag.
3342 rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
3345 // Calculate the final RD estimate for this mode.
3346 this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
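// RDCOST combines the two terms as a Lagrangian cost, roughly
// rate * rdmult (suitably scaled) + distortion; the exact fixed-point scaling
// is defined in vp9/encoder/vp9_rd.h.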
3349 // Apply an adjustment to the rd value based on the similarity of the
3350 // source variance and reconstructed variance.
3351 rd_variance_adjustment(cpi, x, bsize, &this_rd,
3352 ref_frame, x->source_variance);
3354 if (ref_frame == INTRA_FRAME) {
3355 // Keep record of best intra rd
3356 if (this_rd < best_intra_rd) {
3357 best_intra_rd = this_rd;
3358 best_intra_mode = mbmi->mode;
3362 if (!disable_skip && ref_frame == INTRA_FRAME) {
3363 for (i = 0; i < REFERENCE_MODES; ++i)
3364 best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
3365 for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
3366 best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
3369 // Did this mode help, i.e. is it the new best mode so far?
3370 if (this_rd < best_rd || x->skip) {
3371 int max_plane = MAX_MB_PLANE;
3372 if (!mode_excluded) {
3373 // Note index of best mode so far
3374 best_mode_index = mode_index;
3376 if (ref_frame == INTRA_FRAME) {
3377 /* required for left and above block mv */
3378 mbmi->mv[0].as_int = 0;
3379 max_plane = 1;
3380 } else {
3381 best_pred_sse = x->pred_sse[ref_frame];
3384 rd_cost->rate = rate2;
3385 rd_cost->dist = distortion2;
3386 rd_cost->rdcost = this_rd;
3387 best_rd = this_rd;
3388 best_mbmode = *mbmi;
3389 best_skip2 = this_skip2;
3390 best_mode_skippable = skippable;
3392 if (!x->select_tx_size)
3393 swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
3394 memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
3395 sizeof(uint8_t) * ctx->num_4x4_blk);
3397 // TODO(debargha): enhance this test with a better distortion prediction
3398 // based on qp, activity mask and history
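// Early termination heuristic: after the first few modes, stop the search
// when an inter mode's distortion is already small relative to the square
// of the quantizer step size (the threshold is slightly more permissive
// for very flat source blocks).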
3399 if ((mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
3400 (mode_index > MIN_EARLY_TERM_INDEX)) {
3401 int qstep = xd->plane[0].dequant[1];
3402 // TODO(debargha): Enhance this by specializing for each mode_index
3403 int scale = 4;
3404 #if CONFIG_VP9_HIGHBITDEPTH
3405 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
3406 qstep >>= (xd->bd - 8);
3408 #endif // CONFIG_VP9_HIGHBITDEPTH
3409 if (x->source_variance < UINT_MAX) {
3410 const int var_adjust = (x->source_variance < 16);
3411 scale -= var_adjust;
3413 if (ref_frame > INTRA_FRAME &&
3414 distortion2 * scale < qstep * qstep) {
3415 early_term = 1;
3421 /* keep record of best compound/single-only prediction */
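// single_rate excludes the compound/single selection bit while hybrid_rate
// includes it; the resulting RD costs are recorded in best_pred_rd for the
// frame-level reference-mode bookkeeping.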
3422 if (!disable_skip && ref_frame != INTRA_FRAME) {
3423 int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
3425 if (cm->reference_mode == REFERENCE_MODE_SELECT) {
3426 single_rate = rate2 - compmode_cost;
3427 hybrid_rate = rate2;
3428 } else {
3429 single_rate = rate2;
3430 hybrid_rate = rate2 + compmode_cost;
3433 single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
3434 hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
3436 if (!comp_pred) {
3437 if (single_rd < best_pred_rd[SINGLE_REFERENCE])
3438 best_pred_rd[SINGLE_REFERENCE] = single_rd;
3439 } else {
3440 if (single_rd < best_pred_rd[COMPOUND_REFERENCE])
3441 best_pred_rd[COMPOUND_REFERENCE] = single_rd;
3443 if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
3444 best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
3446 /* keep record of best filter type */
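// Estimate what this mode would have cost with each interpolation filter:
// add to this_rd the cached RD of filter i relative to the filter actually
// used; entries missing from the cache fall back to a pessimistic value
// derived from mask_filter.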
3447 if (!mode_excluded && cm->interp_filter != BILINEAR) {
3448 int64_t ref = filter_cache[cm->interp_filter == SWITCHABLE ?
3449 SWITCHABLE_FILTERS : cm->interp_filter];
3451 for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
3452 int64_t adj_rd;
3453 if (ref == INT64_MAX)
3454 adj_rd = 0;
3455 else if (filter_cache[i] == INT64_MAX)
3456 // When early termination is triggered, the encoder does not have
3457 // access to the rate-distortion cost. It only knows that the cost
3458 // should be above the maximum valid value. Hence it takes the known
3459 // maximum plus an arbitrary constant as the rate-distortion cost.
3460 adj_rd = mask_filter - ref + 10;
3461 else
3462 adj_rd = filter_cache[i] - ref;
3464 adj_rd += this_rd;
3465 best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
3470 /* keep record of best txfm size */
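// Track the best RD per transform mode. For small block sizes the larger
// transform entries simply inherit the RD of the largest transform that is
// actually available, and each entry is adjusted by its cached difference
// from the transform mode currently in use.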
3471 if (bsize < BLOCK_32X32) {
3472 if (bsize < BLOCK_16X16)
3473 tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];
3475 tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
3477 if (!mode_excluded && this_rd != INT64_MAX) {
3478 for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
3479 int64_t adj_rd = INT64_MAX;
3480 adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
3482 if (adj_rd < best_tx_rd[i])
3483 best_tx_rd[i] = adj_rd;
3487 if (early_term)
3488 break;
3490 if (x->skip && !comp_pred)
3491 break;
3494 // The inter modes' rate costs are not calculated precisely in some cases.
3495 // Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and
3496 // ZEROMV. Here, checks are added for those cases, and the mode decisions
3497 // are corrected.
3498 if (best_mbmode.mode == NEWMV) {
3499 const MV_REFERENCE_FRAME refs[2] = {best_mbmode.ref_frame[0],
3500 best_mbmode.ref_frame[1]};
3501 int comp_pred_mode = refs[1] > INTRA_FRAME;
3503 if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
3504 ((comp_pred_mode && frame_mv[NEARESTMV][refs[1]].as_int ==
3505 best_mbmode.mv[1].as_int) || !comp_pred_mode))
3506 best_mbmode.mode = NEARESTMV;
3507 else if (frame_mv[NEARMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
3508 ((comp_pred_mode && frame_mv[NEARMV][refs[1]].as_int ==
3509 best_mbmode.mv[1].as_int) || !comp_pred_mode))
3510 best_mbmode.mode = NEARMV;
3511 else if (best_mbmode.mv[0].as_int == 0 &&
3512 ((comp_pred_mode && best_mbmode.mv[1].as_int == 0) || !comp_pred_mode))
3513 best_mbmode.mode = ZEROMV;
3516 if (best_mode_index < 0 || best_rd >= best_rd_so_far) {
3517 rd_cost->rate = INT_MAX;
3518 rd_cost->rdcost = INT64_MAX;
3519 return;
3522 // If we used an estimate for the uv intra rd in the loop above...
3523 if (sf->use_uv_intra_rd_estimate) {
3524 // Redo the full intra UV RD mode selection if the best mode chosen above was intra.
3525 if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
3526 TX_SIZE uv_tx_size;
3527 *mbmi = best_mbmode;
3528 uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
3529 rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
3530 &rate_uv_tokenonly[uv_tx_size],
3531 &dist_uv[uv_tx_size],
3532 &skip_uv[uv_tx_size],
3533 bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize,
3534 uv_tx_size);
3538 assert((cm->interp_filter == SWITCHABLE) ||
3539 (cm->interp_filter == best_mbmode.interp_filter) ||
3540 !is_inter_block(&best_mbmode));
3542 if (!cpi->rc.is_src_frame_alt_ref)
3543 vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
3544 sf->adaptive_rd_thresh, bsize, best_mode_index);
3546 // macroblock modes
3547 *mbmi = best_mbmode;
3548 x->skip |= best_skip2;
3550 for (i = 0; i < REFERENCE_MODES; ++i) {
3551 if (best_pred_rd[i] == INT64_MAX)
3552 best_pred_diff[i] = INT_MIN;
3553 else
3554 best_pred_diff[i] = best_rd - best_pred_rd[i];
3557 if (!x->skip) {
3558 for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
3559 if (best_filter_rd[i] == INT64_MAX)
3560 best_filter_diff[i] = 0;
3561 else
3562 best_filter_diff[i] = best_rd - best_filter_rd[i];
3564 if (cm->interp_filter == SWITCHABLE)
3565 assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
3566 for (i = 0; i < TX_MODES; i++) {
3567 if (best_tx_rd[i] == INT64_MAX)
3568 best_tx_diff[i] = 0;
3569 else
3570 best_tx_diff[i] = best_rd - best_tx_rd[i];
3572 } else {
3573 vp9_zero(best_filter_diff);
3574 vp9_zero(best_tx_diff);
3577 // TODO(yunqingwang): Moving this line in front of the above best_filter_diff
3578 // updating code causes PSNR loss. Need to figure out the conflict.
3579 x->skip |= best_mode_skippable;
3581 if (!x->skip && !x->select_tx_size) {
3582 int has_high_freq_coeff = 0;
3583 int plane;
3584 int max_plane = is_inter_block(&xd->mi[0]->mbmi)
3585 ? MAX_MB_PLANE : 1;
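// Scan the coded planes for any non-trivial (high-frequency) coefficients;
// if none are present the best mode can additionally be flagged as skippable.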
3586 for (plane = 0; plane < max_plane; ++plane) {
3587 x->plane[plane].eobs = ctx->eobs_pbuf[plane][1];
3588 has_high_freq_coeff |= vp9_has_high_freq_in_plane(x, bsize, plane);
3591 for (plane = max_plane; plane < MAX_MB_PLANE; ++plane) {
3592 x->plane[plane].eobs = ctx->eobs_pbuf[plane][2];
3593 has_high_freq_coeff |= vp9_has_high_freq_in_plane(x, bsize, plane);
3596 best_mode_skippable |= !has_high_freq_coeff;
3599 assert(best_mode_index >= 0);
3601 store_coding_context(x, ctx, best_mode_index, best_pred_diff,
3602 best_tx_diff, best_filter_diff, best_mode_skippable);
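// RD mode selection for blocks in a segment with the SEG_LVL_SKIP feature
// active: the block is forced to ZEROMV with the skip flag set, so only the
// interpolation filter and the reference/compound signaling costs need to
// be estimated.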
3605 void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi,
3606 TileDataEnc *tile_data,
3607 MACROBLOCK *x,
3608 RD_COST *rd_cost,
3609 BLOCK_SIZE bsize,
3610 PICK_MODE_CONTEXT *ctx,
3611 int64_t best_rd_so_far) {
3612 VP9_COMMON *const cm = &cpi->common;
3613 MACROBLOCKD *const xd = &x->e_mbd;
3614 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
3615 unsigned char segment_id = mbmi->segment_id;
3616 const int comp_pred = 0;
3617 int i;
3618 int64_t best_tx_diff[TX_MODES];
3619 int64_t best_pred_diff[REFERENCE_MODES];
3620 int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
3621 unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
3622 vp9_prob comp_mode_p;
3623 INTERP_FILTER best_filter = SWITCHABLE;
3624 int64_t this_rd = INT64_MAX;
3625 int rate2 = 0;
3626 const int64_t distortion2 = 0;
3628 x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
3630 estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
3631 &comp_mode_p);
3633 for (i = 0; i < MAX_REF_FRAMES; ++i)
3634 x->pred_sse[i] = INT_MAX;
3635 for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i)
3636 x->pred_mv_sad[i] = INT_MAX;
3638 rd_cost->rate = INT_MAX;
3640 assert(vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
3642 mbmi->mode = ZEROMV;
3643 mbmi->uv_mode = DC_PRED;
3644 mbmi->ref_frame[0] = LAST_FRAME;
3645 mbmi->ref_frame[1] = NONE;
3646 mbmi->mv[0].as_int = 0;
3647 x->skip = 1;
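// distortion2 is held constant for the forced-skip block, so for a
// switchable-filter frame the filter is chosen purely by its signaling rate.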
3649 if (cm->interp_filter != BILINEAR) {
3650 best_filter = EIGHTTAP;
3651 if (cm->interp_filter == SWITCHABLE &&
3652 x->source_variance >= cpi->sf.disable_filter_search_var_thresh) {
3653 int rs;
3654 int best_rs = INT_MAX;
3655 for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
3656 mbmi->interp_filter = i;
3657 rs = vp9_get_switchable_rate(cpi, xd);
3658 if (rs < best_rs) {
3659 best_rs = rs;
3660 best_filter = mbmi->interp_filter;
3665 // Set the appropriate filter
3666 if (cm->interp_filter == SWITCHABLE) {
3667 mbmi->interp_filter = best_filter;
3668 rate2 += vp9_get_switchable_rate(cpi, xd);
3669 } else {
3670 mbmi->interp_filter = cm->interp_filter;
3673 if (cm->reference_mode == REFERENCE_MODE_SELECT)
3674 rate2 += vp9_cost_bit(comp_mode_p, comp_pred);
3676 // Estimate the reference frame signaling cost and add it
3677 // to the rolling cost variable.
3678 rate2 += ref_costs_single[LAST_FRAME];
3679 this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
3681 rd_cost->rate = rate2;
3682 rd_cost->dist = distortion2;
3683 rd_cost->rdcost = this_rd;
3685 if (this_rd >= best_rd_so_far) {
3686 rd_cost->rate = INT_MAX;
3687 rd_cost->rdcost = INT64_MAX;
3688 return;
3691 assert((cm->interp_filter == SWITCHABLE) ||
3692 (cm->interp_filter == mbmi->interp_filter));
3694 vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
3695 cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
3697 vp9_zero(best_pred_diff);
3698 vp9_zero(best_filter_diff);
3699 vp9_zero(best_tx_diff);
3701 if (!x->select_tx_size)
3702 swap_block_ptr(x, ctx, 1, 0, 0, MAX_MB_PLANE);
3703 store_coding_context(x, ctx, THR_ZEROMV,
3704 best_pred_diff, best_tx_diff, best_filter_diff, 0);
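// RD mode selection for sub-8x8 partitions. Each candidate is a
// reference-frame combination from vp9_ref_order; the per-4x4 sub-block
// modes and motion vectors are chosen by rd_pick_best_sub8x8_mode (or
// rd_pick_intra_sub_8x8_y_mode for intra), evaluated across the switchable
// interpolation filters.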
3707 void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
3708 TileDataEnc *tile_data,
3709 MACROBLOCK *x,
3710 int mi_row, int mi_col,
3711 RD_COST *rd_cost,
3712 BLOCK_SIZE bsize,
3713 PICK_MODE_CONTEXT *ctx,
3714 int64_t best_rd_so_far) {
3715 VP9_COMMON *const cm = &cpi->common;
3716 TileInfo *const tile_info = &tile_data->tile_info;
3717 RD_OPT *const rd_opt = &cpi->rd;
3718 SPEED_FEATURES *const sf = &cpi->sf;
3719 MACROBLOCKD *const xd = &x->e_mbd;
3720 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
3721 const struct segmentation *const seg = &cm->seg;
3722 MV_REFERENCE_FRAME ref_frame, second_ref_frame;
3723 unsigned char segment_id = mbmi->segment_id;
3724 int comp_pred, i;
3725 int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
3726 struct buf_2d yv12_mb[4][MAX_MB_PLANE];
3727 static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
3728 VP9_ALT_FLAG };
3729 int64_t best_rd = best_rd_so_far;
3730 int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise
3731 static const int64_t best_tx_diff[TX_MODES] = { 0 };
3732 int64_t best_pred_diff[REFERENCE_MODES];
3733 int64_t best_pred_rd[REFERENCE_MODES];
3734 int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
3735 int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
3736 MB_MODE_INFO best_mbmode;
3737 int ref_index, best_ref_index = 0;
3738 unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
3739 vp9_prob comp_mode_p;
3740 INTERP_FILTER tmp_best_filter = SWITCHABLE;
3741 int rate_uv_intra, rate_uv_tokenonly;
3742 int64_t dist_uv;
3743 int skip_uv;
3744 PREDICTION_MODE mode_uv = DC_PRED;
3745 const int intra_cost_penalty = vp9_get_intra_cost_penalty(
3746 cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
3747 int_mv seg_mvs[4][MAX_REF_FRAMES];
3748 b_mode_info best_bmodes[4];
3749 int best_skip2 = 0;
3750 int ref_frame_skip_mask[2] = { 0 };
3751 int64_t mask_filter = 0;
3752 int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
3754 x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
3755 memset(x->zcoeff_blk[TX_4X4], 0, 4);
3756 vp9_zero(best_mbmode);
3758 for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
3759 filter_cache[i] = INT64_MAX;
3761 for (i = 0; i < 4; i++) {
3762 int j;
3763 for (j = 0; j < MAX_REF_FRAMES; j++)
3764 seg_mvs[i][j].as_int = INVALID_MV;
3767 estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
3768 &comp_mode_p);
3770 for (i = 0; i < REFERENCE_MODES; ++i)
3771 best_pred_rd[i] = INT64_MAX;
3772 for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
3773 best_filter_rd[i] = INT64_MAX;
3774 rate_uv_intra = INT_MAX;
3776 rd_cost->rate = INT_MAX;
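// Gather candidate motion vectors and prediction buffers for every active
// reference frame; references that are not enabled for this frame are
// masked out of the search.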
3778 for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
3779 if (cpi->ref_frame_flags & flag_list[ref_frame]) {
3780 setup_buffer_inter(cpi, x, tile_info,
3781 ref_frame, bsize, mi_row, mi_col,
3782 frame_mv[NEARESTMV], frame_mv[NEARMV],
3783 yv12_mb);
3784 } else {
3785 ref_frame_skip_mask[0] |= (1 << ref_frame);
3786 ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
3788 frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
3789 frame_mv[ZEROMV][ref_frame].as_int = 0;
3792 for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) {
3793 int mode_excluded = 0;
3794 int64_t this_rd = INT64_MAX;
3795 int disable_skip = 0;
3796 int compmode_cost = 0;
3797 int rate2 = 0, rate_y = 0, rate_uv = 0;
3798 int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
3799 int skippable = 0;
3800 int i;
3801 int this_skip2 = 0;
3802 int64_t total_sse = INT_MAX;
3803 int early_term = 0;
3805 ref_frame = vp9_ref_order[ref_index].ref_frame[0];
3806 second_ref_frame = vp9_ref_order[ref_index].ref_frame[1];
3808 // Look at the reference frame of the best mode so far and set the
3809 // skip mask to look at a subset of the remaining modes.
3810 if (ref_index > 2 && sf->mode_skip_start < MAX_MODES) {
3811 if (ref_index == 3) {
3812 switch (best_mbmode.ref_frame[0]) {
3813 case INTRA_FRAME:
3814 break;
3815 case LAST_FRAME:
3816 ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME);
3817 ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
3818 break;
3819 case GOLDEN_FRAME:
3820 ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | (1 << ALTREF_FRAME);
3821 ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
3822 break;
3823 case ALTREF_FRAME:
3824 ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << LAST_FRAME);
3825 break;
3826 case NONE:
3827 case MAX_REF_FRAMES:
3828 assert(0 && "Invalid Reference frame");
3829 break;
3834 if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
3835 (ref_frame_skip_mask[1] & (1 << MAX(0, second_ref_frame))))
3836 continue;
3838 // Test best rd so far against threshold for trying this mode.
3839 if (rd_less_than_thresh(best_rd,
3840 rd_opt->threshes[segment_id][bsize][ref_index],
3841 tile_data->thresh_freq_fact[bsize][ref_index]))
3842 continue;
3844 comp_pred = second_ref_frame > INTRA_FRAME;
3845 if (comp_pred) {
3846 if (!cpi->allow_comp_inter_inter)
3847 continue;
3848 if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
3849 continue;
3850 // Do not allow compound prediction if the segment level reference frame
3851 // feature is in use, as in that case there can only be one reference frame.
3852 if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
3853 continue;
3855 if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
3856 best_mbmode.ref_frame[0] == INTRA_FRAME)
3857 continue;
3860 // TODO(jingning, jkoleszar): scaling reference frame not supported for
3861 // sub8x8 blocks.
3862 if (ref_frame > INTRA_FRAME &&
3863 vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf))
3864 continue;
3866 if (second_ref_frame > INTRA_FRAME &&
3867 vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf))
3868 continue;
3870 if (comp_pred)
3871 mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
3872 else if (ref_frame != INTRA_FRAME)
3873 mode_excluded = cm->reference_mode == COMPOUND_REFERENCE;
3875 // If the segment reference frame feature is enabled,
3876 // then do nothing if the current ref frame is not allowed.
3877 if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
3878 vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
3879 continue;
3880 // Disable this drop out case if the ref frame
3881 // segment level feature is enabled for this segment. This is to
3882 // prevent the possibility that we end up unable to pick any mode.
3883 } else if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
3884 // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
3885 // unless ARNR filtering is enabled in which case we want
3886 // an unfiltered alternative. We allow near/nearest as well
3887 // because they may result in zero-zero MVs but be cheaper.
3888 if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
3889 continue;
3892 mbmi->tx_size = TX_4X4;
3893 mbmi->uv_mode = DC_PRED;
3894 mbmi->ref_frame[0] = ref_frame;
3895 mbmi->ref_frame[1] = second_ref_frame;
3896 // Evaluate all sub-pel filters irrespective of whether we can use
3897 // them for this frame.
3898 mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
3899 : cm->interp_filter;
3900 x->skip = 0;
3901 set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
3903 // Select prediction reference frames.
3904 for (i = 0; i < MAX_MB_PLANE; i++) {
3905 xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
3906 if (comp_pred)
3907 xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
3910 if (ref_frame == INTRA_FRAME) {
3911 int rate;
3912 if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
3913 &distortion_y, best_rd) >= best_rd)
3914 continue;
3915 rate2 += rate;
3916 rate2 += intra_cost_penalty;
3917 distortion2 += distortion_y;
3919 if (rate_uv_intra == INT_MAX) {
3920 choose_intra_uv_mode(cpi, x, ctx, bsize, TX_4X4,
3921 &rate_uv_intra,
3922 &rate_uv_tokenonly,
3923 &dist_uv, &skip_uv,
3924 &mode_uv);
3926 rate2 += rate_uv_intra;
3927 rate_uv = rate_uv_tokenonly;
3928 distortion2 += dist_uv;
3929 distortion_uv = dist_uv;
3930 mbmi->uv_mode = mode_uv;
3931 } else {
3932 int rate;
3933 int64_t distortion;
3934 int64_t this_rd_thresh;
3935 int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
3936 int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
3937 int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse;
3938 int tmp_best_skippable = 0;
3939 int switchable_filter_index;
3940 int_mv *second_ref = comp_pred ?
3941 &mbmi->ref_mvs[second_ref_frame][0] : NULL;
3942 b_mode_info tmp_best_bmodes[16];
3943 MB_MODE_INFO tmp_best_mbmode;
3944 BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
3945 int pred_exists = 0;
3946 int uv_skippable;
3948 this_rd_thresh = (ref_frame == LAST_FRAME) ?
3949 rd_opt->threshes[segment_id][bsize][THR_LAST] :
3950 rd_opt->threshes[segment_id][bsize][THR_ALTR];
3951 this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
3952 rd_opt->threshes[segment_id][bsize][THR_GOLD] : this_rd_thresh;
3953 for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
3954 filter_cache[i] = INT64_MAX;
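// Choose the interpolation filter for this reference combination: speed
// features may force EIGHTTAP or reuse a predicted filter; otherwise the
// sub-8x8 mode search is run once per switchable filter and the filter
// with the lowest RD cost is kept, with per-filter results cached for the
// later filter-diff bookkeeping.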
3956 if (cm->interp_filter != BILINEAR) {
3957 tmp_best_filter = EIGHTTAP;
3958 if (x->source_variance < sf->disable_filter_search_var_thresh) {
3959 tmp_best_filter = EIGHTTAP;
3960 } else if (sf->adaptive_pred_interp_filter == 1 &&
3961 ctx->pred_interp_filter < SWITCHABLE) {
3962 tmp_best_filter = ctx->pred_interp_filter;
3963 } else if (sf->adaptive_pred_interp_filter == 2) {
3964 tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE ?
3965 ctx->pred_interp_filter : 0;
3966 } else {
3967 for (switchable_filter_index = 0;
3968 switchable_filter_index < SWITCHABLE_FILTERS;
3969 ++switchable_filter_index) {
3970 int newbest, rs;
3971 int64_t rs_rd;
3972 mbmi->interp_filter = switchable_filter_index;
3973 tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile_info,
3974 &mbmi->ref_mvs[ref_frame][0],
3975 second_ref, best_yrd, &rate,
3976 &rate_y, &distortion,
3977 &skippable, &total_sse,
3978 (int) this_rd_thresh, seg_mvs,
3979 bsi, switchable_filter_index,
3980 mi_row, mi_col);
3982 if (tmp_rd == INT64_MAX)
3983 continue;
3984 rs = vp9_get_switchable_rate(cpi, xd);
3985 rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
3986 filter_cache[switchable_filter_index] = tmp_rd;
3987 filter_cache[SWITCHABLE_FILTERS] =
3988 MIN(filter_cache[SWITCHABLE_FILTERS],
3989 tmp_rd + rs_rd);
3990 if (cm->interp_filter == SWITCHABLE)
3991 tmp_rd += rs_rd;
3993 mask_filter = MAX(mask_filter, tmp_rd);
3995 newbest = (tmp_rd < tmp_best_rd);
3996 if (newbest) {
3997 tmp_best_filter = mbmi->interp_filter;
3998 tmp_best_rd = tmp_rd;
4000 if ((newbest && cm->interp_filter == SWITCHABLE) ||
4001 (mbmi->interp_filter == cm->interp_filter &&
4002 cm->interp_filter != SWITCHABLE)) {
4003 tmp_best_rdu = tmp_rd;
4004 tmp_best_rate = rate;
4005 tmp_best_ratey = rate_y;
4006 tmp_best_distortion = distortion;
4007 tmp_best_sse = total_sse;
4008 tmp_best_skippable = skippable;
4009 tmp_best_mbmode = *mbmi;
4010 for (i = 0; i < 4; i++) {
4011 tmp_best_bmodes[i] = xd->mi[0]->bmi[i];
4012 x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i];
4014 pred_exists = 1;
4015 if (switchable_filter_index == 0 &&
4016 sf->use_rd_breakout &&
4017 best_rd < INT64_MAX) {
4018 if (tmp_best_rdu / 2 > best_rd) {
4019 // Skip searching the other filters if the first is
4020 // already substantially larger than the best so far.
4021 tmp_best_filter = mbmi->interp_filter;
4022 tmp_best_rdu = INT64_MAX;
4023 break;
4027 } // switchable_filter_index loop
4031 if (tmp_best_rdu == INT64_MAX && pred_exists)
4032 continue;
4034 mbmi->interp_filter = (cm->interp_filter == SWITCHABLE ?
4035 tmp_best_filter : cm->interp_filter);
4036 if (!pred_exists) {
4037 // Handles the special case when a filter that is not in the
4038 // switchable list (bilinear, 6-tap) is indicated at the frame level
4039 tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile_info,
4040 &mbmi->ref_mvs[ref_frame][0],
4041 second_ref, best_yrd, &rate, &rate_y,
4042 &distortion, &skippable, &total_sse,
4043 (int) this_rd_thresh, seg_mvs, bsi, 0,
4044 mi_row, mi_col);
4045 if (tmp_rd == INT64_MAX)
4046 continue;
4047 } else {
4048 total_sse = tmp_best_sse;
4049 rate = tmp_best_rate;
4050 rate_y = tmp_best_ratey;
4051 distortion = tmp_best_distortion;
4052 skippable = tmp_best_skippable;
4053 *mbmi = tmp_best_mbmode;
4054 for (i = 0; i < 4; i++)
4055 xd->mi[0]->bmi[i] = tmp_best_bmodes[i];
4058 rate2 += rate;
4059 distortion2 += distortion;
4061 if (cm->interp_filter == SWITCHABLE)
4062 rate2 += vp9_get_switchable_rate(cpi, xd);
4064 if (!mode_excluded)
4065 mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE
4066 : cm->reference_mode == COMPOUND_REFERENCE;
4068 compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
4070 tmp_best_rdu = best_rd -
4071 MIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2),
4072 RDCOST(x->rdmult, x->rddiv, 0, total_sse));
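// tmp_best_rdu is the RD budget remaining for chroma after the luma
// result; it is passed to super_block_uvrd as the limit beyond which the
// chroma search gives up.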
4074 if (tmp_best_rdu > 0) {
4075 // If even the 'Y' rd value of split is higher than best so far
4076 // then don't bother looking at UV.
4077 vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
4078 BLOCK_8X8);
4079 memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
4080 if (!super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
4081 &uv_sse, BLOCK_8X8, tmp_best_rdu))
4082 continue;
4084 rate2 += rate_uv;
4085 distortion2 += distortion_uv;
4086 skippable = skippable && uv_skippable;
4087 total_sse += uv_sse;
4091 if (cm->reference_mode == REFERENCE_MODE_SELECT)
4092 rate2 += compmode_cost;
4094 // Estimate the reference frame signaling cost and add it
4095 // to the rolling cost variable.
4096 if (second_ref_frame > INTRA_FRAME) {
4097 rate2 += ref_costs_comp[ref_frame];
4098 } else {
4099 rate2 += ref_costs_single[ref_frame];
4102 if (!disable_skip) {
4103 // Skip is never coded at the segment level for sub8x8 blocks and instead
4104 // always coded in the bitstream at the mode info level.
4106 if (ref_frame != INTRA_FRAME && !xd->lossless) {
4107 if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
4108 RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
4109 // Add in the cost of the no skip flag.
4110 rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
4111 } else {
4112 // FIXME(rbultje) make this work for splitmv also
4113 rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
4114 distortion2 = total_sse;
4115 assert(total_sse >= 0);
4116 rate2 -= (rate_y + rate_uv);
4117 rate_y = 0;
4118 rate_uv = 0;
4119 this_skip2 = 1;
4121 } else {
4122 // Add in the cost of the no skip flag.
4123 rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
4126 // Calculate the final RD estimate for this mode.
4127 this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
4130 if (!disable_skip && ref_frame == INTRA_FRAME) {
4131 for (i = 0; i < REFERENCE_MODES; ++i)
4132 best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
4133 for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
4134 best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
4137 // Did this mode help, i.e. is it the new best mode so far?
4138 if (this_rd < best_rd || x->skip) {
4139 if (!mode_excluded) {
4140 int max_plane = MAX_MB_PLANE;
4141 // Note index of best mode so far
4142 best_ref_index = ref_index;
4144 if (ref_frame == INTRA_FRAME) {
4145 /* required for left and above block mv */
4146 mbmi->mv[0].as_int = 0;
4147 max_plane = 1;
4150 rd_cost->rate = rate2;
4151 rd_cost->dist = distortion2;
4152 rd_cost->rdcost = this_rd;
4153 best_rd = this_rd;
4154 best_yrd = best_rd -
4155 RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
4156 best_mbmode = *mbmi;
4157 best_skip2 = this_skip2;
4158 if (!x->select_tx_size)
4159 swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
4160 memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4],
4161 sizeof(uint8_t) * ctx->num_4x4_blk);
4163 for (i = 0; i < 4; i++)
4164 best_bmodes[i] = xd->mi[0]->bmi[i];
4166 // TODO(debargha): enhance this test with a better distortion prediction
4167 // based on qp, activity mask and history
4168 if ((sf->mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
4169 (ref_index > MIN_EARLY_TERM_INDEX)) {
4170 int qstep = xd->plane[0].dequant[1];
4171 // TODO(debargha): Enhance this by specializing for each mode_index
4172 int scale = 4;
4173 #if CONFIG_VP9_HIGHBITDEPTH
4174 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
4175 qstep >>= (xd->bd - 8);
4177 #endif // CONFIG_VP9_HIGHBITDEPTH
4178 if (x->source_variance < UINT_MAX) {
4179 const int var_adjust = (x->source_variance < 16);
4180 scale -= var_adjust;
4182 if (ref_frame > INTRA_FRAME &&
4183 distortion2 * scale < qstep * qstep) {
4184 early_term = 1;
4190 /* keep record of best compound/single-only prediction */
4191 if (!disable_skip && ref_frame != INTRA_FRAME) {
4192 int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
4194 if (cm->reference_mode == REFERENCE_MODE_SELECT) {
4195 single_rate = rate2 - compmode_cost;
4196 hybrid_rate = rate2;
4197 } else {
4198 single_rate = rate2;
4199 hybrid_rate = rate2 + compmode_cost;
4202 single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
4203 hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
4205 if (!comp_pred && single_rd < best_pred_rd[SINGLE_REFERENCE])
4206 best_pred_rd[SINGLE_REFERENCE] = single_rd;
4207 else if (comp_pred && single_rd < best_pred_rd[COMPOUND_REFERENCE])
4208 best_pred_rd[COMPOUND_REFERENCE] = single_rd;
4210 if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
4211 best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
4214 /* keep record of best filter type */
4215 if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
4216 cm->interp_filter != BILINEAR) {
4217 int64_t ref = filter_cache[cm->interp_filter == SWITCHABLE ?
4218 SWITCHABLE_FILTERS : cm->interp_filter];
4219 int64_t adj_rd;
4220 for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
4221 if (ref == INT64_MAX)
4222 adj_rd = 0;
4223 else if (filter_cache[i] == INT64_MAX)
4224 // When early termination is triggered, the encoder does not have
4225 // access to the rate-distortion cost. It only knows that the cost
4226 // should be above the maximum valid value. Hence it takes the known
4227 // maximum plus an arbitrary constant as the rate-distortion cost.
4228 adj_rd = mask_filter - ref + 10;
4229 else
4230 adj_rd = filter_cache[i] - ref;
4232 adj_rd += this_rd;
4233 best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
4237 if (early_term)
4238 break;
4240 if (x->skip && !comp_pred)
4241 break;
4244 if (best_rd >= best_rd_so_far) {
4245 rd_cost->rate = INT_MAX;
4246 rd_cost->rdcost = INT64_MAX;
4247 return;
4250 // If we used an estimate for the uv intra rd in the loop above...
4251 if (sf->use_uv_intra_rd_estimate) {
4252 // Redo the full intra UV RD mode selection if the best mode chosen above was intra.
4253 if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
4254 *mbmi = best_mbmode;
4255 rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra,
4256 &rate_uv_tokenonly,
4257 &dist_uv,
4258 &skip_uv,
4259 BLOCK_8X8, TX_4X4);
4263 if (best_rd == INT64_MAX) {
4264 rd_cost->rate = INT_MAX;
4265 rd_cost->dist = INT64_MAX;
4266 rd_cost->rdcost = INT64_MAX;
4267 return;
4270 assert((cm->interp_filter == SWITCHABLE) ||
4271 (cm->interp_filter == best_mbmode.interp_filter) ||
4272 !is_inter_block(&best_mbmode));
4274 vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
4275 sf->adaptive_rd_thresh, bsize, best_ref_index);
4277 // macroblock modes
4278 *mbmi = best_mbmode;
4279 x->skip |= best_skip2;
4280 if (!is_inter_block(&best_mbmode)) {
4281 for (i = 0; i < 4; i++)
4282 xd->mi[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
4283 } else {
4284 for (i = 0; i < 4; ++i)
4285 memcpy(&xd->mi[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
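// For an inter block the mi-level motion vectors are copied from the last
// (bottom-right) 4x4 sub-block; neighboring blocks later read these as
// this block's motion vectors when building their reference MV lists.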
4287 mbmi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int;
4288 mbmi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int;
4291 for (i = 0; i < REFERENCE_MODES; ++i) {
4292 if (best_pred_rd[i] == INT64_MAX)
4293 best_pred_diff[i] = INT_MIN;
4294 else
4295 best_pred_diff[i] = best_rd - best_pred_rd[i];
4298 if (!x->skip) {
4299 for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
4300 if (best_filter_rd[i] == INT64_MAX)
4301 best_filter_diff[i] = 0;
4302 else
4303 best_filter_diff[i] = best_rd - best_filter_rd[i];
4305 if (cm->interp_filter == SWITCHABLE)
4306 assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
4307 } else {
4308 vp9_zero(best_filter_diff);
4311 store_coding_context(x, ctx, best_ref_index,
4312 best_pred_diff, best_tx_diff, best_filter_diff, 0);