Add MV refining search in warped motion experiment
[aom.git] / av1 / encoder / rdopt.c
blob d94a8c61faa6c0a12ffa9f13722b0091eb33bc18
1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
12 #include <assert.h>
13 #include <math.h>
15 #include "./aom_dsp_rtcd.h"
16 #include "./av1_rtcd.h"
18 #include "aom_dsp/aom_dsp_common.h"
19 #include "aom_dsp/blend.h"
20 #include "aom_mem/aom_mem.h"
21 #include "aom_ports/mem.h"
22 #include "aom_ports/system_state.h"
24 #include "av1/common/common.h"
25 #include "av1/common/common_data.h"
26 #include "av1/common/entropy.h"
27 #include "av1/common/entropymode.h"
28 #include "av1/common/idct.h"
29 #include "av1/common/mvref_common.h"
30 #include "av1/common/pred_common.h"
31 #include "av1/common/quant_common.h"
32 #include "av1/common/reconinter.h"
33 #include "av1/common/reconintra.h"
34 #include "av1/common/scan.h"
35 #include "av1/common/seg_common.h"
36 #if CONFIG_LV_MAP
37 #include "av1/common/txb_common.h"
38 #endif
39 #if CONFIG_WARPED_MOTION
40 #include "av1/common/warped_motion.h"
41 #endif // CONFIG_WARPED_MOTION
43 #include "av1/encoder/aq_variance.h"
44 #include "av1/encoder/av1_quantize.h"
45 #include "av1/encoder/cost.h"
46 #include "av1/encoder/encodemb.h"
47 #include "av1/encoder/encodemv.h"
48 #include "av1/encoder/encoder.h"
49 #if CONFIG_LV_MAP
50 #include "av1/encoder/encodetxb.h"
51 #endif
52 #include "av1/encoder/hybrid_fwd_txfm.h"
53 #include "av1/encoder/mcomp.h"
54 #if CONFIG_PALETTE
55 #include "av1/encoder/palette.h"
56 #endif // CONFIG_PALETTE
57 #include "av1/encoder/ratectrl.h"
58 #include "av1/encoder/rd.h"
59 #include "av1/encoder/rdopt.h"
60 #include "av1/encoder/tokenize.h"
61 #if CONFIG_PVQ
62 #include "av1/encoder/pvq_encoder.h"
63 #endif // CONFIG_PVQ
64 #if CONFIG_PVQ || CONFIG_DAALA_DIST
65 #include "av1/common/pvq.h"
66 #endif // CONFIG_PVQ || CONFIG_DAALA_DIST
67 #if CONFIG_DUAL_FILTER
68 #define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
69 static const int filter_sets[DUAL_FILTER_SET_SIZE][2] = {
70 { 0, 0 }, { 0, 1 }, { 0, 2 }, { 0, 3 }, { 1, 0 }, { 1, 1 },
71 { 1, 2 }, { 1, 3 }, { 2, 0 }, { 2, 1 }, { 2, 2 }, { 2, 3 },
72 { 3, 0 }, { 3, 1 }, { 3, 2 }, { 3, 3 },
74 #endif // CONFIG_DUAL_FILTER
76 #if CONFIG_EXT_REFS
78 #define LAST_FRAME_MODE_MASK \
79 ((1 << INTRA_FRAME) | (1 << LAST2_FRAME) | (1 << LAST3_FRAME) | \
80 (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
81 #define LAST2_FRAME_MODE_MASK \
82 ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST3_FRAME) | \
83 (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
84 #define LAST3_FRAME_MODE_MASK \
85 ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \
86 (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
87 #define GOLDEN_FRAME_MODE_MASK \
88 ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \
89 (1 << LAST3_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
90 #define BWDREF_FRAME_MODE_MASK \
91 ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \
92 (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME))
93 #define ALTREF_FRAME_MODE_MASK \
94 ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \
95 (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME))
97 #else
99 #define LAST_FRAME_MODE_MASK \
100 ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME))
101 #define GOLDEN_FRAME_MODE_MASK \
102 ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME))
103 #define ALTREF_FRAME_MODE_MASK \
104 ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | (1 << INTRA_FRAME))
106 #endif // CONFIG_EXT_REFS
108 #if CONFIG_EXT_REFS
109 #define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | (1 << BWDREF_FRAME) | 0x01)
110 #else
111 #define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | 0x01)
112 #endif // CONFIG_EXT_REFS
114 #define MIN_EARLY_TERM_INDEX 3
115 #define NEW_MV_DISCOUNT_FACTOR 8
117 #if CONFIG_EXT_INTRA
118 #define ANGLE_SKIP_THRESH 10
119 #define FILTER_FAST_SEARCH 1
120 #endif // CONFIG_EXT_INTRA
122 const double ADST_FLIP_SVM[8] = { -6.6623, -2.8062, -3.2531, 3.1671, // vert
123 -7.7051, -3.2234, -3.6193, 3.4533 }; // horz
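// Illustrative note (not in the upstream source): these eight values are the
// weights and biases of two small linear classifiers. adst_vs_flipadst()
// below forms
//   svm_proj_v = vdist[0] * ADST_FLIP_SVM[0] + vdist[1] * ADST_FLIP_SVM[1] +
//                vdist[2] * ADST_FLIP_SVM[2] + ADST_FLIP_SVM[3]
// (and the analogous horizontal projection from elements 4..7), then compares
// the result against FAST_EXT_TX_EDST_MID +/- FAST_EXT_TX_EDST_MARGIN to
// decide whether ADST or FLIPADST can be pruned in each direction.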
125 typedef struct {
126 PREDICTION_MODE mode;
127 MV_REFERENCE_FRAME ref_frame[2];
128 } MODE_DEFINITION;
130 typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } REF_DEFINITION;
132 struct rdcost_block_args {
133 const AV1_COMP *cpi;
134 MACROBLOCK *x;
135 ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE];
136 ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE];
137 RD_STATS rd_stats;
138 int64_t this_rd;
139 int64_t best_rd;
140 int exit_early;
141 int use_fast_coef_costing;
144 #define LAST_NEW_MV_INDEX 6
145 static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
146 { NEARESTMV, { LAST_FRAME, NONE_FRAME } },
147 #if CONFIG_EXT_REFS
148 { NEARESTMV, { LAST2_FRAME, NONE_FRAME } },
149 { NEARESTMV, { LAST3_FRAME, NONE_FRAME } },
150 { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } },
151 #endif // CONFIG_EXT_REFS
152 { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } },
153 { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } },
155 { DC_PRED, { INTRA_FRAME, NONE_FRAME } },
157 { NEWMV, { LAST_FRAME, NONE_FRAME } },
158 #if CONFIG_EXT_REFS
159 { NEWMV, { LAST2_FRAME, NONE_FRAME } },
160 { NEWMV, { LAST3_FRAME, NONE_FRAME } },
161 { NEWMV, { BWDREF_FRAME, NONE_FRAME } },
162 #endif // CONFIG_EXT_REFS
163 { NEWMV, { ALTREF_FRAME, NONE_FRAME } },
164 { NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
166 { NEARMV, { LAST_FRAME, NONE_FRAME } },
167 #if CONFIG_EXT_REFS
168 { NEARMV, { LAST2_FRAME, NONE_FRAME } },
169 { NEARMV, { LAST3_FRAME, NONE_FRAME } },
170 { NEARMV, { BWDREF_FRAME, NONE_FRAME } },
171 #endif // CONFIG_EXT_REFS
172 { NEARMV, { ALTREF_FRAME, NONE_FRAME } },
173 { NEARMV, { GOLDEN_FRAME, NONE_FRAME } },
175 { ZEROMV, { LAST_FRAME, NONE_FRAME } },
176 #if CONFIG_EXT_REFS
177 { ZEROMV, { LAST2_FRAME, NONE_FRAME } },
178 { ZEROMV, { LAST3_FRAME, NONE_FRAME } },
179 { ZEROMV, { BWDREF_FRAME, NONE_FRAME } },
180 #endif // CONFIG_EXT_REFS
181 { ZEROMV, { GOLDEN_FRAME, NONE_FRAME } },
182 { ZEROMV, { ALTREF_FRAME, NONE_FRAME } },
184 // TODO(zoeliu): May need to reconsider the order of the modes to check
186 #if CONFIG_EXT_INTER
187 { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
188 #if CONFIG_EXT_REFS
189 { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
190 { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
191 #endif // CONFIG_EXT_REFS
192 { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
193 #if CONFIG_EXT_REFS
194 { NEAREST_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
195 { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
196 { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
197 { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
198 #endif // CONFIG_EXT_REFS
200 #else // CONFIG_EXT_INTER
202 { NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
203 #if CONFIG_EXT_REFS
204 { NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
205 { NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
206 #endif // CONFIG_EXT_REFS
207 { NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
208 #if CONFIG_EXT_REFS
209 { NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
210 { NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
211 { NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
212 { NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
213 #endif // CONFIG_EXT_REFS
214 #endif // CONFIG_EXT_INTER
216 { TM_PRED, { INTRA_FRAME, NONE_FRAME } },
218 #if CONFIG_ALT_INTRA
219 { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } },
220 #if CONFIG_SMOOTH_HV
221 { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } },
222 { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } },
223 #endif // CONFIG_SMOOTH_HV
224 #endif // CONFIG_ALT_INTRA
226 #if CONFIG_EXT_INTER
227 { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
228 { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
229 { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
230 { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
231 { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
232 { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
233 { ZERO_ZEROMV, { LAST_FRAME, ALTREF_FRAME } },
235 #if CONFIG_EXT_REFS
236 { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
237 { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
238 { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
239 { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
240 { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
241 { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
242 { ZERO_ZEROMV, { LAST2_FRAME, ALTREF_FRAME } },
244 { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
245 { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
246 { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
247 { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
248 { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
249 { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
250 { ZERO_ZEROMV, { LAST3_FRAME, ALTREF_FRAME } },
251 #endif // CONFIG_EXT_REFS
253 { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
254 { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
255 { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
256 { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
257 { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
258 { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
259 { ZERO_ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } },
261 #if CONFIG_EXT_REFS
262 { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
263 { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
264 { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
265 { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
266 { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
267 { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
268 { ZERO_ZEROMV, { LAST_FRAME, BWDREF_FRAME } },
270 { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
271 { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
272 { NEAREST_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
273 { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
274 { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
275 { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
276 { ZERO_ZEROMV, { LAST2_FRAME, BWDREF_FRAME } },
278 { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
279 { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
280 { NEAREST_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
281 { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
282 { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
283 { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
284 { ZERO_ZEROMV, { LAST3_FRAME, BWDREF_FRAME } },
286 { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
287 { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
288 { NEAREST_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
289 { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
290 { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
291 { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
292 { ZERO_ZEROMV, { GOLDEN_FRAME, BWDREF_FRAME } },
293 #endif // CONFIG_EXT_REFS
295 #else // CONFIG_EXT_INTER
297 { NEARMV, { LAST_FRAME, ALTREF_FRAME } },
298 { NEWMV, { LAST_FRAME, ALTREF_FRAME } },
299 #if CONFIG_EXT_REFS
300 { NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
301 { NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
302 { NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
303 { NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
304 #endif // CONFIG_EXT_REFS
305 { NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
306 { NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
308 #if CONFIG_EXT_REFS
309 { NEARMV, { LAST_FRAME, BWDREF_FRAME } },
310 { NEWMV, { LAST_FRAME, BWDREF_FRAME } },
311 { NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
312 { NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
313 { NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
314 { NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
315 { NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
316 { NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
317 #endif // CONFIG_EXT_REFS
319 { ZEROMV, { LAST_FRAME, ALTREF_FRAME } },
320 #if CONFIG_EXT_REFS
321 { ZEROMV, { LAST2_FRAME, ALTREF_FRAME } },
322 { ZEROMV, { LAST3_FRAME, ALTREF_FRAME } },
323 #endif // CONFIG_EXT_REFS
324 { ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } },
326 #if CONFIG_EXT_REFS
327 { ZEROMV, { LAST_FRAME, BWDREF_FRAME } },
328 { ZEROMV, { LAST2_FRAME, BWDREF_FRAME } },
329 { ZEROMV, { LAST3_FRAME, BWDREF_FRAME } },
330 { ZEROMV, { GOLDEN_FRAME, BWDREF_FRAME } },
331 #endif // CONFIG_EXT_REFS
333 #endif // CONFIG_EXT_INTER
335 { H_PRED, { INTRA_FRAME, NONE_FRAME } },
336 { V_PRED, { INTRA_FRAME, NONE_FRAME } },
337 { D135_PRED, { INTRA_FRAME, NONE_FRAME } },
338 { D207_PRED, { INTRA_FRAME, NONE_FRAME } },
339 { D153_PRED, { INTRA_FRAME, NONE_FRAME } },
340 { D63_PRED, { INTRA_FRAME, NONE_FRAME } },
341 { D117_PRED, { INTRA_FRAME, NONE_FRAME } },
342 { D45_PRED, { INTRA_FRAME, NONE_FRAME } },
344 #if CONFIG_EXT_INTER
345 { ZEROMV, { LAST_FRAME, INTRA_FRAME } },
346 { NEARESTMV, { LAST_FRAME, INTRA_FRAME } },
347 { NEARMV, { LAST_FRAME, INTRA_FRAME } },
348 { NEWMV, { LAST_FRAME, INTRA_FRAME } },
350 #if CONFIG_EXT_REFS
351 { ZEROMV, { LAST2_FRAME, INTRA_FRAME } },
352 { NEARESTMV, { LAST2_FRAME, INTRA_FRAME } },
353 { NEARMV, { LAST2_FRAME, INTRA_FRAME } },
354 { NEWMV, { LAST2_FRAME, INTRA_FRAME } },
356 { ZEROMV, { LAST3_FRAME, INTRA_FRAME } },
357 { NEARESTMV, { LAST3_FRAME, INTRA_FRAME } },
358 { NEARMV, { LAST3_FRAME, INTRA_FRAME } },
359 { NEWMV, { LAST3_FRAME, INTRA_FRAME } },
360 #endif // CONFIG_EXT_REFS
362 { ZEROMV, { GOLDEN_FRAME, INTRA_FRAME } },
363 { NEARESTMV, { GOLDEN_FRAME, INTRA_FRAME } },
364 { NEARMV, { GOLDEN_FRAME, INTRA_FRAME } },
365 { NEWMV, { GOLDEN_FRAME, INTRA_FRAME } },
367 #if CONFIG_EXT_REFS
368 { ZEROMV, { BWDREF_FRAME, INTRA_FRAME } },
369 { NEARESTMV, { BWDREF_FRAME, INTRA_FRAME } },
370 { NEARMV, { BWDREF_FRAME, INTRA_FRAME } },
371 { NEWMV, { BWDREF_FRAME, INTRA_FRAME } },
372 #endif // CONFIG_EXT_REFS
374 { ZEROMV, { ALTREF_FRAME, INTRA_FRAME } },
375 { NEARESTMV, { ALTREF_FRAME, INTRA_FRAME } },
376 { NEARMV, { ALTREF_FRAME, INTRA_FRAME } },
377 { NEWMV, { ALTREF_FRAME, INTRA_FRAME } },
378 #endif // CONFIG_EXT_INTER
381 #if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
382 static INLINE int write_uniform_cost(int n, int v) {
383 const int l = get_unsigned_bits(n);
384 const int m = (1 << l) - n;
385 if (l == 0) return 0;
386 if (v < m)
387 return (l - 1) * av1_cost_bit(128, 0);
388 else
389 return l * av1_cost_bit(128, 0);
391 #endif // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
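// Illustrative worked example (not in the upstream source): write_uniform_cost()
// above prices a truncated binary code over n symbols, assuming
// av1_cost_bit(128, 0) is the cost of one bit coded with probability 1/2.
// For n = 5: l = get_unsigned_bits(5) = 3 and m = (1 << 3) - 5 = 3, so
// symbols v = 0..2 cost (l - 1) = 2 bits while symbols v = 3..4 cost l = 3
// bits, which is the usual truncated-binary split between short and long
// codewords.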
393 // constants for prune 1 and prune 2 decision boundaries
394 #define FAST_EXT_TX_CORR_MID 0.0
395 #define FAST_EXT_TX_EDST_MID 0.1
396 #define FAST_EXT_TX_CORR_MARGIN 0.5
397 #define FAST_EXT_TX_EDST_MARGIN 0.3
399 #if CONFIG_DAALA_DIST
400 static int od_compute_var_4x4(od_coeff *x, int stride) {
401 int sum;
402 int s2;
403 int i;
404 sum = 0;
405 s2 = 0;
406 for (i = 0; i < 4; i++) {
407 int j;
408 for (j = 0; j < 4; j++) {
409 int t;
411 t = x[i * stride + j];
412 sum += t;
413 s2 += t * t;
416 // TODO(yushin): Check whether any changes are required for high bit depth.
417 return (s2 - (sum * sum >> 4)) >> 4;
420 /* OD_DIST_LP_MID controls the frequency weighting filter used for computing
421 the distortion. For a value X, the filter is [1 X 1]/(X + 2) and
422 is applied both horizontally and vertically. For X=5, the filter is
423 a good approximation for the OD_QM8_Q4_HVS quantization matrix. */
424 #define OD_DIST_LP_MID (5)
425 #define OD_DIST_LP_NORM (OD_DIST_LP_MID + 2)
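// Illustrative note (not in the upstream source): with OD_DIST_LP_MID = 5 the
// separable low-pass filter is [1 5 1] / 7 in each direction. The code below
// applies the unnormalized [1 5 1] filter horizontally and then vertically,
// so a flat (DC) input is scaled by 7 * 7 and its energy by (7 * 7)^2. That
// is why od_compute_dist_8x8() divides the accumulated error energy by
// OD_DIST_LP_NORM^4 = 7^4 to restore a unit DC response.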
427 static double od_compute_dist_8x8(int qm, int use_activity_masking, od_coeff *x,
428 od_coeff *y, od_coeff *e_lp, int stride) {
429 double sum;
430 int min_var;
431 double mean_var;
432 double var_stat;
433 double activity;
434 double calibration;
435 int i;
436 int j;
437 double vardist;
439 vardist = 0;
440 OD_ASSERT(qm != OD_FLAT_QM);
441 (void)qm;
442 #if 1
443 min_var = INT_MAX;
444 mean_var = 0;
445 for (i = 0; i < 3; i++) {
446 for (j = 0; j < 3; j++) {
447 int varx;
448 int vary;
449 varx = od_compute_var_4x4(x + 2 * i * stride + 2 * j, stride);
450 vary = od_compute_var_4x4(y + 2 * i * stride + 2 * j, stride);
451 min_var = OD_MINI(min_var, varx);
452 mean_var += 1. / (1 + varx);
453 /* The cast to (double) is to avoid an overflow before the sqrt.*/
454 vardist += varx - 2 * sqrt(varx * (double)vary) + vary;
457 /* We use a different variance statistic depending on whether activity
458 masking is used, since the harmonic mean appeared slightly worse with
459 masking off. The calibration constant just ensures that we preserve the
460 rate compared to activity=1. */
461 if (use_activity_masking) {
462 calibration = 1.95;
463 var_stat = 9. / mean_var;
464 } else {
465 calibration = 1.62;
466 var_stat = min_var;
468 /* 1.62 is a calibration constant, 0.25 is a noise floor and 1/6 is the
469 activity masking constant. */
470 activity = calibration * pow(.25 + var_stat, -1. / 6);
471 #else
472 activity = 1;
473 #endif // 1
474 sum = 0;
475 for (i = 0; i < 8; i++) {
476 for (j = 0; j < 8; j++)
477 sum += e_lp[i * stride + j] * (double)e_lp[i * stride + j];
479 /* Normalize the filter to unit DC response. */
480 sum *= 1. / (OD_DIST_LP_NORM * OD_DIST_LP_NORM * OD_DIST_LP_NORM *
481 OD_DIST_LP_NORM);
482 return activity * activity * (sum + vardist);
485 // Note: Inputs x and y are in the pixel domain.
486 static double od_compute_dist(int qm, int activity_masking, od_coeff *x,
487 od_coeff *y, int bsize_w, int bsize_h,
488 int qindex) {
489 int i;
490 double sum;
491 sum = 0;
493 assert(bsize_w >= 8 && bsize_h >= 8);
495 if (qm == OD_FLAT_QM) {
496 for (i = 0; i < bsize_w * bsize_h; i++) {
497 double tmp;
498 tmp = x[i] - y[i];
499 sum += tmp * tmp;
501 } else {
502 int j;
503 DECLARE_ALIGNED(16, od_coeff, e[MAX_TX_SQUARE]);
504 DECLARE_ALIGNED(16, od_coeff, tmp[MAX_TX_SQUARE]);
505 DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_TX_SQUARE]);
506 int mid = OD_DIST_LP_MID;
507 for (i = 0; i < bsize_h; i++) {
508 for (j = 0; j < bsize_w; j++) {
509 e[i * bsize_w + j] = x[i * bsize_w + j] - y[i * bsize_w + j];
512 for (i = 0; i < bsize_h; i++) {
513 tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1];
514 tmp[i * bsize_w + bsize_w - 1] =
515 mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2];
516 for (j = 1; j < bsize_w - 1; j++) {
517 tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] +
518 e[i * bsize_w + j - 1] + e[i * bsize_w + j + 1];
521 for (j = 0; j < bsize_w; j++) {
522 e_lp[j] = mid * tmp[j] + 2 * tmp[bsize_w + j];
523 e_lp[(bsize_h - 1) * bsize_w + j] =
524 mid * tmp[(bsize_h - 1) * bsize_w + j] +
525 2 * tmp[(bsize_h - 2) * bsize_w + j];
527 for (i = 1; i < bsize_h - 1; i++) {
528 for (j = 0; j < bsize_w; j++) {
529 e_lp[i * bsize_w + j] = mid * tmp[i * bsize_w + j] +
530 tmp[(i - 1) * bsize_w + j] +
531 tmp[(i + 1) * bsize_w + j];
534 for (i = 0; i < bsize_h; i += 8) {
535 for (j = 0; j < bsize_w; j += 8) {
536 sum += od_compute_dist_8x8(qm, activity_masking, &x[i * bsize_w + j],
537 &y[i * bsize_w + j], &e_lp[i * bsize_w + j],
538 bsize_w);
541 /* Scale according to linear regression against SSE, for 8x8 blocks. */
542 if (activity_masking) {
543 sum *= 2.2 + (1.7 - 2.2) * (qindex - 99) / (210 - 99) +
544 (qindex < 99 ? 2.5 * (qindex - 99) / 99 * (qindex - 99) / 99 : 0);
545 } else {
546 sum *= qindex >= 128
547 ? 1.4 + (0.9 - 1.4) * (qindex - 128) / (209 - 128)
548 : qindex <= 43
549 ? 1.5 + (2.0 - 1.5) * (qindex - 43) / (16 - 43)
550 : 1.5 + (1.4 - 1.5) * (qindex - 43) / (128 - 43);
553 return sum;
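// Illustrative worked example (not in the upstream source): the
// piecewise-linear scaling above interpolates the regression fit across
// qindex. With activity masking off, qindex = 128 gives a multiplier of
// exactly 1.4, qindex = 209 gives 0.9 and qindex = 43 gives 1.5; with
// activity masking on, qindex = 99 gives 2.2 and the multiplier moves
// linearly toward 1.7 at qindex = 210.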
556 int64_t av1_daala_dist(const uint8_t *src, int src_stride, const uint8_t *dst,
557 int dst_stride, int bsw, int bsh, int qm,
558 int use_activity_masking, int qindex) {
559 int i, j;
560 int64_t d;
561 DECLARE_ALIGNED(16, od_coeff, orig[MAX_TX_SQUARE]);
562 DECLARE_ALIGNED(16, od_coeff, rec[MAX_TX_SQUARE]);
564 assert(qm == OD_HVS_QM);
566 for (j = 0; j < bsh; j++)
567 for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i];
569 for (j = 0; j < bsh; j++)
570 for (i = 0; i < bsw; i++) rec[j * bsw + i] = dst[j * dst_stride + i];
572 d = (int64_t)od_compute_dist(qm, use_activity_masking, orig, rec, bsw, bsh,
573 qindex);
574 return d;
576 #endif // CONFIG_DAALA_DIST
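// Usage sketch (not in the upstream source): av1_daala_dist() is only valid
// for blocks of at least 8x8 pixels (od_compute_dist() asserts
// bsize_w >= 8 && bsize_h >= 8), and the callers in this file always pass
// qm = OD_HVS_QM. A typical call for an 8x8 luma block therefore looks like
//   av1_daala_dist(src, src_stride, dst, dst_stride, 8, 8, OD_HVS_QM,
//                  use_activity_masking, x->qindex);
// which mirrors how av1_dist_block() and block_8x8_rd_txfm_daala_dist() use
// it below.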
578 static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize,
579 const uint8_t *src, int src_stride,
580 const uint8_t *dst, int dst_stride,
581 double *hordist, double *verdist) {
582 const int bw = block_size_wide[bsize];
583 const int bh = block_size_high[bsize];
584 unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
586 const int f_index = bsize - BLOCK_16X16;
587 if (f_index < 0) {
588 const int w_shift = bw == 8 ? 1 : 2;
589 const int h_shift = bh == 8 ? 1 : 2;
590 #if CONFIG_HIGHBITDEPTH
591 if (cpi->common.use_highbitdepth) {
592 const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
593 const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
594 for (int i = 0; i < bh; ++i)
595 for (int j = 0; j < bw; ++j) {
596 const int index = (j >> w_shift) + ((i >> h_shift) << 2);
597 esq[index] +=
598 (src16[j + i * src_stride] - dst16[j + i * dst_stride]) *
599 (src16[j + i * src_stride] - dst16[j + i * dst_stride]);
601 } else {
602 #endif // CONFIG_HIGHBITDEPTH
604 for (int i = 0; i < bh; ++i)
605 for (int j = 0; j < bw; ++j) {
606 const int index = (j >> w_shift) + ((i >> h_shift) << 2);
607 esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) *
608 (src[j + i * src_stride] - dst[j + i * dst_stride]);
610 #if CONFIG_HIGHBITDEPTH
612 #endif // CONFIG_HIGHBITDEPTH
613 } else {
614 cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[0]);
615 cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
616 &esq[1]);
617 cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
618 &esq[2]);
619 cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
620 dst_stride, &esq[3]);
621 src += bh / 4 * src_stride;
622 dst += bh / 4 * dst_stride;
624 cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[4]);
625 cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
626 &esq[5]);
627 cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
628 &esq[6]);
629 cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
630 dst_stride, &esq[7]);
631 src += bh / 4 * src_stride;
632 dst += bh / 4 * dst_stride;
634 cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[8]);
635 cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
636 &esq[9]);
637 cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
638 &esq[10]);
639 cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
640 dst_stride, &esq[11]);
641 src += bh / 4 * src_stride;
642 dst += bh / 4 * dst_stride;
644 cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[12]);
645 cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
646 &esq[13]);
647 cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
648 &esq[14]);
649 cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
650 dst_stride, &esq[15]);
653 double total = (double)esq[0] + esq[1] + esq[2] + esq[3] + esq[4] + esq[5] +
654 esq[6] + esq[7] + esq[8] + esq[9] + esq[10] + esq[11] +
655 esq[12] + esq[13] + esq[14] + esq[15];
656 if (total > 0) {
657 const double e_recip = 1.0 / total;
658 hordist[0] = ((double)esq[0] + esq[4] + esq[8] + esq[12]) * e_recip;
659 hordist[1] = ((double)esq[1] + esq[5] + esq[9] + esq[13]) * e_recip;
660 hordist[2] = ((double)esq[2] + esq[6] + esq[10] + esq[14]) * e_recip;
661 verdist[0] = ((double)esq[0] + esq[1] + esq[2] + esq[3]) * e_recip;
662 verdist[1] = ((double)esq[4] + esq[5] + esq[6] + esq[7]) * e_recip;
663 verdist[2] = ((double)esq[8] + esq[9] + esq[10] + esq[11]) * e_recip;
664 } else {
665 hordist[0] = verdist[0] = 0.25;
666 hordist[1] = verdist[1] = 0.25;
667 hordist[2] = verdist[2] = 0.25;
671 static int adst_vs_flipadst(const AV1_COMP *cpi, BLOCK_SIZE bsize,
672 const uint8_t *src, int src_stride,
673 const uint8_t *dst, int dst_stride) {
674 int prune_bitmask = 0;
675 double svm_proj_h = 0, svm_proj_v = 0;
676 double hdist[3] = { 0, 0, 0 }, vdist[3] = { 0, 0, 0 };
677 get_energy_distribution_fine(cpi, bsize, src, src_stride, dst, dst_stride,
678 hdist, vdist);
680 svm_proj_v = vdist[0] * ADST_FLIP_SVM[0] + vdist[1] * ADST_FLIP_SVM[1] +
681 vdist[2] * ADST_FLIP_SVM[2] + ADST_FLIP_SVM[3];
682 svm_proj_h = hdist[0] * ADST_FLIP_SVM[4] + hdist[1] * ADST_FLIP_SVM[5] +
683 hdist[2] * ADST_FLIP_SVM[6] + ADST_FLIP_SVM[7];
684 if (svm_proj_v > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN)
685 prune_bitmask |= 1 << FLIPADST_1D;
686 else if (svm_proj_v < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN)
687 prune_bitmask |= 1 << ADST_1D;
689 if (svm_proj_h > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN)
690 prune_bitmask |= 1 << (FLIPADST_1D + 8);
691 else if (svm_proj_h < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN)
692 prune_bitmask |= 1 << (ADST_1D + 8);
694 return prune_bitmask;
697 #if CONFIG_EXT_TX
698 static void get_horver_correlation(const int16_t *diff, int stride, int w,
699 int h, double *hcorr, double *vcorr) {
700 // Returns hor/ver correlation coefficient
701 const int num = (h - 1) * (w - 1);
702 double num_r;
703 int i, j;
704 int64_t xy_sum = 0, xz_sum = 0;
705 int64_t x_sum = 0, y_sum = 0, z_sum = 0;
706 int64_t x2_sum = 0, y2_sum = 0, z2_sum = 0;
707 double x_var_n, y_var_n, z_var_n, xy_var_n, xz_var_n;
708 *hcorr = *vcorr = 1;
710 assert(num > 0);
711 num_r = 1.0 / num;
712 for (i = 1; i < h; ++i) {
713 for (j = 1; j < w; ++j) {
714 const int16_t x = diff[i * stride + j];
715 const int16_t y = diff[i * stride + j - 1];
716 const int16_t z = diff[(i - 1) * stride + j];
717 xy_sum += x * y;
718 xz_sum += x * z;
719 x_sum += x;
720 y_sum += y;
721 z_sum += z;
722 x2_sum += x * x;
723 y2_sum += y * y;
724 z2_sum += z * z;
727 x_var_n = x2_sum - (x_sum * x_sum) * num_r;
728 y_var_n = y2_sum - (y_sum * y_sum) * num_r;
729 z_var_n = z2_sum - (z_sum * z_sum) * num_r;
730 xy_var_n = xy_sum - (x_sum * y_sum) * num_r;
731 xz_var_n = xz_sum - (x_sum * z_sum) * num_r;
732 if (x_var_n > 0 && y_var_n > 0) {
733 *hcorr = xy_var_n / sqrt(x_var_n * y_var_n);
734 *hcorr = *hcorr < 0 ? 0 : *hcorr;
736 if (x_var_n > 0 && z_var_n > 0) {
737 *vcorr = xz_var_n / sqrt(x_var_n * z_var_n);
738 *vcorr = *vcorr < 0 ? 0 : *vcorr;
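// Illustrative note (not in the upstream source): the accumulators above are
// the standard sample statistics scaled by num, so the result is the Pearson
// correlation
//   hcorr = Cov(x, y) / sqrt(Var(x) * Var(y))
// between each residual sample x and its left neighbour y; vcorr uses the
// neighbour above. Both are clamped to be non-negative before dct_vs_idtx()
// turns them into a prune decision.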
742 int dct_vs_idtx(const int16_t *diff, int stride, int w, int h) {
743 double hcorr, vcorr;
744 int prune_bitmask = 0;
745 get_horver_correlation(diff, stride, w, h, &hcorr, &vcorr);
747 if (vcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN)
748 prune_bitmask |= 1 << IDTX_1D;
749 else if (vcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN)
750 prune_bitmask |= 1 << DCT_1D;
752 if (hcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN)
753 prune_bitmask |= 1 << (IDTX_1D + 8);
754 else if (hcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN)
755 prune_bitmask |= 1 << (DCT_1D + 8);
756 return prune_bitmask;
759 // Performance drop: 0.5%, Speed improvement: 24%
760 static int prune_two_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize,
761 MACROBLOCK *x, const MACROBLOCKD *xd,
762 int adst_flipadst, int dct_idtx) {
763 int prune = 0;
765 if (adst_flipadst) {
766 const struct macroblock_plane *const p = &x->plane[0];
767 const struct macroblockd_plane *const pd = &xd->plane[0];
768 prune |= adst_vs_flipadst(cpi, bsize, p->src.buf, p->src.stride,
769 pd->dst.buf, pd->dst.stride);
771 if (dct_idtx) {
772 av1_subtract_plane(x, bsize, 0);
773 const struct macroblock_plane *const p = &x->plane[0];
774 const int bw = 4 << (b_width_log2_lookup[bsize]);
775 const int bh = 4 << (b_height_log2_lookup[bsize]);
776 prune |= dct_vs_idtx(p->src_diff, bw, bw, bh);
779 return prune;
781 #endif // CONFIG_EXT_TX
783 // Performance drop: 0.3%, Speed improvement: 5%
784 static int prune_one_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize,
785 const MACROBLOCK *x, const MACROBLOCKD *xd) {
786 const struct macroblock_plane *const p = &x->plane[0];
787 const struct macroblockd_plane *const pd = &xd->plane[0];
788 return adst_vs_flipadst(cpi, bsize, p->src.buf, p->src.stride, pd->dst.buf,
789 pd->dst.stride);
792 static int prune_tx_types(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
793 const MACROBLOCKD *const xd, int tx_set) {
794 #if CONFIG_EXT_TX
795 const int *tx_set_1D = tx_set >= 0 ? ext_tx_used_inter_1D[tx_set] : NULL;
796 #else
797 const int tx_set_1D[TX_TYPES_1D] = { 0 };
798 #endif // CONFIG_EXT_TX
800 switch (cpi->sf.tx_type_search.prune_mode) {
801 case NO_PRUNE: return 0; break;
802 case PRUNE_ONE:
803 if ((tx_set >= 0) && !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D]))
804 return 0;
805 return prune_one_for_sby(cpi, bsize, x, xd);
806 break;
807 #if CONFIG_EXT_TX
808 case PRUNE_TWO:
809 if ((tx_set >= 0) && !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) {
810 if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) return 0;
811 return prune_two_for_sby(cpi, bsize, x, xd, 0, 1);
813 if ((tx_set >= 0) && !(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D]))
814 return prune_two_for_sby(cpi, bsize, x, xd, 1, 0);
815 return prune_two_for_sby(cpi, bsize, x, xd, 1, 1);
816 break;
817 #endif // CONFIG_EXT_TX
819 assert(0);
820 return 0;
823 static int do_tx_type_search(TX_TYPE tx_type, int prune) {
824 // TODO(sarahparker): implement for non-ext-tx
825 #if CONFIG_EXT_TX
826 return !(((prune >> vtx_tab[tx_type]) & 1) |
827 ((prune >> (htx_tab[tx_type] + 8)) & 1));
828 #else
829 // temporary to avoid compiler warnings
830 (void)vtx_tab;
831 (void)htx_tab;
832 (void)tx_type;
833 (void)prune;
834 return 1;
835 #endif // CONFIG_EXT_TX
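// Illustrative note (not in the upstream source): the prune bitmask produced
// by adst_vs_flipadst() / dct_vs_idtx() keeps the vertical 1D transform flags
// in the low 8 bits and the horizontal flags in the next 8 bits. For example,
// a mask of
//   (1 << FLIPADST_1D) | (1 << (DCT_1D + 8))
// makes do_tx_type_search() return 0 for any 2D transform whose vertical
// component maps to FLIPADST_1D via vtx_tab[] or whose horizontal component
// maps to DCT_1D via htx_tab[].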
838 static void model_rd_from_sse(const AV1_COMP *const cpi,
839 const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
840 int plane, int64_t sse, int *rate,
841 int64_t *dist) {
842 const struct macroblockd_plane *const pd = &xd->plane[plane];
843 const int dequant_shift =
844 #if CONFIG_HIGHBITDEPTH
845 (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 :
846 #endif // CONFIG_HIGHBITDEPTH
847 3;
849 // Fast approximation of the modelling function.
850 if (cpi->sf.simple_model_rd_from_var) {
851 const int64_t square_error = sse;
852 int quantizer = (pd->dequant[1] >> dequant_shift);
854 if (quantizer < 120)
855 *rate = (int)((square_error * (280 - quantizer)) >>
856 (16 - AV1_PROB_COST_SHIFT));
857 else
858 *rate = 0;
859 *dist = (square_error * quantizer) >> 8;
860 } else {
861 av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[bsize],
862 pd->dequant[1] >> dequant_shift, rate, dist);
865 *dist <<= 4;
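// Illustrative worked example (not in the upstream source): the dequant table
// is scaled by 8 relative to an orthonormal transform, so dequant[1] >> 3
// (or >> (bd - 5) for high bit depth) recovers the effective AC quantizer
// step. With, say, sse = 4096 and quantizer = 40, the fast path above gives
// dist = (4096 * 40) >> 8 = 640 and a rate of 4096 * (280 - 40) shifted right
// by (16 - AV1_PROB_COST_SHIFT); the final distortion is then shifted left by
// 4 to match the x16 distortion scale used elsewhere in this file.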
868 static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
869 MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
870 int plane_to, int *out_rate_sum,
871 int64_t *out_dist_sum, int *skip_txfm_sb,
872 int64_t *skip_sse_sb) {
873 // Note our transform coeffs are 8 times those of an orthogonal transform.
874 // Hence the quantizer step is also scaled by 8. To get the effective
875 // quantizer we need to divide by 8 before sending it to the modeling function.
876 int plane;
877 const int ref = xd->mi[0]->mbmi.ref_frame[0];
879 int64_t rate_sum = 0;
880 int64_t dist_sum = 0;
881 int64_t total_sse = 0;
883 x->pred_sse[ref] = 0;
885 for (plane = plane_from; plane <= plane_to; ++plane) {
886 struct macroblock_plane *const p = &x->plane[plane];
887 struct macroblockd_plane *const pd = &xd->plane[plane];
888 #if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
889 const BLOCK_SIZE bs = AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
890 #else
891 const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
892 #endif // CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
894 unsigned int sse;
895 int rate;
896 int64_t dist;
898 #if CONFIG_CB4X4
899 if (x->skip_chroma_rd && plane) continue;
900 #endif // CONFIG_CB4X4
902 // TODO(geza): Write direct sse functions that do not compute
903 // variance as well.
904 cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
905 &sse);
907 if (plane == 0) x->pred_sse[ref] = sse;
909 total_sse += sse;
911 model_rd_from_sse(cpi, xd, bs, plane, sse, &rate, &dist);
913 rate_sum += rate;
914 dist_sum += dist;
917 *skip_txfm_sb = total_sse == 0;
918 *skip_sse_sb = total_sse << 4;
919 *out_rate_sum = (int)rate_sum;
920 *out_dist_sum = dist_sum;
923 int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
924 intptr_t block_size, int64_t *ssz) {
925 int i;
926 int64_t error = 0, sqcoeff = 0;
928 for (i = 0; i < block_size; i++) {
929 const int diff = coeff[i] - dqcoeff[i];
930 error += diff * diff;
931 sqcoeff += coeff[i] * coeff[i];
934 *ssz = sqcoeff;
935 return error;
938 int64_t av1_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
939 int block_size) {
940 int i;
941 int64_t error = 0;
943 for (i = 0; i < block_size; i++) {
944 const int diff = coeff[i] - dqcoeff[i];
945 error += diff * diff;
948 return error;
951 #if CONFIG_HIGHBITDEPTH
952 int64_t av1_highbd_block_error_c(const tran_low_t *coeff,
953 const tran_low_t *dqcoeff, intptr_t block_size,
954 int64_t *ssz, int bd) {
955 int i;
956 int64_t error = 0, sqcoeff = 0;
957 int shift = 2 * (bd - 8);
958 int rounding = shift > 0 ? 1 << (shift - 1) : 0;
960 for (i = 0; i < block_size; i++) {
961 const int64_t diff = coeff[i] - dqcoeff[i];
962 error += diff * diff;
963 sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
965 assert(error >= 0 && sqcoeff >= 0);
966 error = (error + rounding) >> shift;
967 sqcoeff = (sqcoeff + rounding) >> shift;
969 *ssz = sqcoeff;
970 return error;
972 #endif // CONFIG_HIGHBITDEPTH
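// Illustrative worked example (not in the upstream source): the shift above
// renormalizes the high-bit-depth error to the 8-bit scale. For bd = 10,
// shift = 2 * (10 - 8) = 4 and rounding = 8, so an accumulated error of 1000
// becomes (1000 + 8) >> 4 = 63, i.e. roughly error / 16 with round-to-nearest.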
974 #if CONFIG_PVQ
975 // Without PVQ, av1_block_error_c() returns two kinds of errors:
976 // 1) the reconstruction (i.e. decoded) error and
977 // 2) the squared sum of the transformed residue (i.e. 'coeff').
978 // However, if PVQ is enabled, coeff does not hold the transformed residue
979 // but the transformed original instead.
980 // Hence, a new parameter, the ref vector (i.e. the transformed predicted
981 // signal), is required to derive the residue signal,
982 // i.e. coeff - ref = residue (all in the transform domain).
984 #if CONFIG_HIGHBITDEPTH
985 static int64_t av1_highbd_block_error2_c(const tran_low_t *coeff,
986 const tran_low_t *dqcoeff,
987 const tran_low_t *ref,
988 intptr_t block_size, int64_t *ssz,
989 int bd) {
990 int64_t error;
991 int64_t sqcoeff;
992 int shift = 2 * (bd - 8);
993 int rounding = shift > 0 ? 1 << (shift - 1) : 0;
994 // Use the existing SSE code to calculate the distortion of the decoded signal:
995 // i.e. (orig - decoded)^2
996 // For high bit depth, throw away ssz until a 32-bit version of
997 // av1_block_error_fp is written.
998 int64_t ssz_trash;
999 error = av1_block_error(coeff, dqcoeff, block_size, &ssz_trash);
1000 // prediction residue^2 = (orig - ref)^2
1001 sqcoeff = av1_block_error(coeff, ref, block_size, &ssz_trash);
1002 error = (error + rounding) >> shift;
1003 sqcoeff = (sqcoeff + rounding) >> shift;
1004 *ssz = sqcoeff;
1005 return error;
1007 #else
1008 // TODO(yushin): Since the 4x4 case does not need ssz, it would be better to
1009 // refactor this into a separate function that skips the extra ssz computation.
1010 static int64_t av1_block_error2_c(const tran_low_t *coeff,
1011 const tran_low_t *dqcoeff,
1012 const tran_low_t *ref, intptr_t block_size,
1013 int64_t *ssz) {
1014 int64_t error;
1015 // Use the existing SSE code to calculate the distortion of the decoded signal:
1016 // i.e. (orig - decoded)^2
1017 error = av1_block_error_fp(coeff, dqcoeff, block_size);
1018 // prediction residue^2 = (orig - ref)^2
1019 *ssz = av1_block_error_fp(coeff, ref, block_size);
1020 return error;
1022 #endif // CONFIG_HIGHBITDEPTH
1023 #endif // CONFIG_PVQ
1025 #if !CONFIG_PVQ || CONFIG_VAR_TX
1026 /* The trailing '0' is a terminator which is used inside av1_cost_coeffs() to
1027 * decide whether to include cost of a trailing EOB node or not (i.e. we
1028 * can skip this if the last coefficient in this transform block, e.g. the
1029 * 16th coefficient in a 4x4 block or the 64th coefficient in an 8x8 block,
1030 * is non-zero). */
1031 #if !CONFIG_LV_MAP
1032 static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
1033 int block, TX_SIZE tx_size, const SCAN_ORDER *scan_order,
1034 const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l,
1035 int use_fast_coef_costing) {
1036 MACROBLOCKD *const xd = &x->e_mbd;
1037 MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
1038 const struct macroblock_plane *p = &x->plane[plane];
1039 const struct macroblockd_plane *pd = &xd->plane[plane];
1040 const PLANE_TYPE type = pd->plane_type;
1041 const uint16_t *band_count = &band_count_table[tx_size][1];
1042 const int eob = p->eobs[block];
1043 const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
1044 const int tx_size_ctx = txsize_sqr_map[tx_size];
1045 unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
1046 x->token_costs[tx_size_ctx][type][is_inter_block(mbmi)];
1047 uint8_t token_cache[MAX_TX_SQUARE];
1048 int pt = combine_entropy_contexts(*a, *l);
1049 int c, cost;
1050 const int16_t *scan = scan_order->scan;
1051 const int16_t *nb = scan_order->neighbors;
1052 const int ref = is_inter_block(mbmi);
1053 aom_prob *blockz_probs =
1054 cm->fc->blockzero_probs[txsize_sqr_map[tx_size]][type][ref];
1056 #if CONFIG_HIGHBITDEPTH
1057 const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, xd->bd);
1058 #else
1059 const int cat6_bits = av1_get_cat6_extrabits_size(tx_size, 8);
1060 #endif // CONFIG_HIGHBITDEPTH
1062 #if !CONFIG_VAR_TX && !CONFIG_SUPERTX
1063 // Check for consistency of tx_size with mode info
1064 assert(tx_size == get_tx_size(plane, xd));
1065 #endif // !CONFIG_VAR_TX && !CONFIG_SUPERTX
1066 (void)cm;
1068 if (eob == 0) {
1069 // single eob token
1070 cost = av1_cost_bit(blockz_probs[pt], 0);
1071 } else {
1072 if (use_fast_coef_costing) {
1073 int band_left = *band_count++;
1075 // dc token
1076 int v = qcoeff[0];
1077 int16_t prev_t;
1078 cost = av1_get_token_cost(v, &prev_t, cat6_bits);
1079 cost += (*token_costs)[!prev_t][pt][prev_t];
1081 token_cache[0] = av1_pt_energy_class[prev_t];
1082 ++token_costs;
1084 // ac tokens
1085 for (c = 1; c < eob; c++) {
1086 const int rc = scan[c];
1087 int16_t t;
1089 v = qcoeff[rc];
1090 cost += av1_get_token_cost(v, &t, cat6_bits);
1091 cost += (*token_costs)[!t][!prev_t][t];
1092 prev_t = t;
1093 if (!--band_left) {
1094 band_left = *band_count++;
1095 ++token_costs;
1099 // eob token
1100 cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
1102 } else { // !use_fast_coef_costing
1103 int band_left = *band_count++;
1105 // dc token
1106 int v = qcoeff[0];
1107 int16_t tok;
1108 cost = av1_get_token_cost(v, &tok, cat6_bits);
1109 cost += (*token_costs)[!tok][pt][tok];
1111 token_cache[0] = av1_pt_energy_class[tok];
1112 ++token_costs;
1114 // ac tokens
1115 for (c = 1; c < eob; c++) {
1116 const int rc = scan[c];
1118 v = qcoeff[rc];
1119 cost += av1_get_token_cost(v, &tok, cat6_bits);
1120 pt = get_coef_context(nb, token_cache, c);
1121 cost += (*token_costs)[!tok][pt][tok];
1122 token_cache[rc] = av1_pt_energy_class[tok];
1123 if (!--band_left) {
1124 band_left = *band_count++;
1125 ++token_costs;
1129 // eob token
1130 pt = get_coef_context(nb, token_cache, c);
1131 cost += (*token_costs)[0][pt][EOB_TOKEN];
1135 return cost;
1137 #endif // !CONFIG_LV_MAP
1139 int av1_cost_coeffs(const AV1_COMP *const cpi, MACROBLOCK *x, int plane,
1140 int block, TX_SIZE tx_size, const SCAN_ORDER *scan_order,
1141 const ENTROPY_CONTEXT *a, const ENTROPY_CONTEXT *l,
1142 int use_fast_coef_costing) {
1143 #if !CONFIG_LV_MAP
1144 const AV1_COMMON *const cm = &cpi->common;
1145 return cost_coeffs(cm, x, plane, block, tx_size, scan_order, a, l,
1146 use_fast_coef_costing);
1147 #else // !CONFIG_LV_MAP
1148 (void)scan_order;
1149 (void)use_fast_coef_costing;
1150 const MACROBLOCKD *xd = &x->e_mbd;
1151 const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
1152 const struct macroblockd_plane *pd = &xd->plane[plane];
1153 const BLOCK_SIZE bsize = mbmi->sb_type;
1154 #if CONFIG_CB4X4
1155 #if CONFIG_CHROMA_2X2
1156 const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
1157 #else
1158 const BLOCK_SIZE plane_bsize =
1159 AOMMAX(BLOCK_4X4, get_plane_block_size(bsize, pd));
1160 #endif // CONFIG_CHROMA_2X2
1161 #else // CONFIG_CB4X4
1162 const BLOCK_SIZE plane_bsize =
1163 get_plane_block_size(AOMMAX(BLOCK_8X8, bsize), pd);
1164 #endif // CONFIG_CB4X4
1166 TXB_CTX txb_ctx;
1167 get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
1168 return av1_cost_coeffs_txb(cpi, x, plane, block, &txb_ctx);
1169 #endif // !CONFIG_LV_MAP
1171 #endif // !CONFIG_PVQ || CONFIG_VAR_TX
1173 // Get the visible dimensions of a transform block, cropped to the MI units.
1174 static void get_txb_dimensions(const MACROBLOCKD *xd, int plane,
1175 BLOCK_SIZE plane_bsize, int blk_row, int blk_col,
1176 BLOCK_SIZE tx_bsize, int *width, int *height,
1177 int *visible_width, int *visible_height) {
1178 #if !(CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT)
1179 assert(tx_bsize <= plane_bsize);
1180 #endif // !(CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT)
1181 int txb_height = block_size_high[tx_bsize];
1182 int txb_width = block_size_wide[tx_bsize];
1183 const int block_height = block_size_high[plane_bsize];
1184 const int block_width = block_size_wide[plane_bsize];
1185 const struct macroblockd_plane *const pd = &xd->plane[plane];
1186 // TODO(aconverse@google.com): Investigate using crop_width/height here rather
1187 // than the MI size
1188 const int block_rows =
1189 (xd->mb_to_bottom_edge >= 0)
1190 ? block_height
1191 : (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + block_height;
1192 const int block_cols =
1193 (xd->mb_to_right_edge >= 0)
1194 ? block_width
1195 : (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + block_width;
1196 const int tx_unit_size = tx_size_wide_log2[0];
1197 if (width) *width = txb_width;
1198 if (height) *height = txb_height;
1199 *visible_width = clamp(block_cols - (blk_col << tx_unit_size), 0, txb_width);
1200 *visible_height =
1201 clamp(block_rows - (blk_row << tx_unit_size), 0, txb_height);
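// Illustrative worked example (not in the upstream source): suppose a 16x16
// luma block whose last 4 pixel columns fall outside the frame, so
// xd->mb_to_right_edge is -32 in its 1/8-pel units (-4 pixels after >> 3).
// Then block_cols = 16 - 4 = 12, and for the transform block at blk_col = 2
// (in 4-pixel units) with an 8-pixel-wide tx_bsize the visible width is
// clamp(12 - (2 << 2), 0, 8) = 4, i.e. only the left half of that transform
// block contributes to the visible-area SSE computed below.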
1204 // Compute the pixel-domain sum of squared error over all visible 4x4s in the
1205 // transform block.
1206 static unsigned pixel_sse(const AV1_COMP *const cpi, const MACROBLOCKD *xd,
1207 int plane, const uint8_t *src, const int src_stride,
1208 const uint8_t *dst, const int dst_stride, int blk_row,
1209 int blk_col, const BLOCK_SIZE plane_bsize,
1210 const BLOCK_SIZE tx_bsize) {
1211 int txb_rows, txb_cols, visible_rows, visible_cols;
1212 get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize,
1213 &txb_cols, &txb_rows, &visible_cols, &visible_rows);
1214 assert(visible_rows > 0);
1215 assert(visible_cols > 0);
1216 #if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT
1217 if ((txb_rows == visible_rows && txb_cols == visible_cols) &&
1218 tx_bsize < BLOCK_SIZES) {
1219 #else
1220 if (txb_rows == visible_rows && txb_cols == visible_cols) {
1221 #endif
1222 unsigned sse;
1223 cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
1224 return sse;
1226 #if CONFIG_HIGHBITDEPTH
1227 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
1228 uint64_t sse = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride,
1229 visible_cols, visible_rows);
1230 return (unsigned int)ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
1232 #endif // CONFIG_HIGHBITDEPTH
1233 unsigned sse = aom_sse_odd_size(src, src_stride, dst, dst_stride,
1234 visible_cols, visible_rows);
1235 return sse;
1238 // Compute the sum of squares over all visible 4x4s in the transform block.
1239 static int64_t sum_squares_visible(const MACROBLOCKD *xd, int plane,
1240 const int16_t *diff, const int diff_stride,
1241 int blk_row, int blk_col,
1242 const BLOCK_SIZE plane_bsize,
1243 const BLOCK_SIZE tx_bsize) {
1244 int visible_rows, visible_cols;
1245 get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
1246 NULL, &visible_cols, &visible_rows);
1247 return aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows);
1250 void av1_dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
1251 BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col,
1252 TX_SIZE tx_size, int64_t *out_dist, int64_t *out_sse,
1253 OUTPUT_STATUS output_status) {
1254 MACROBLOCKD *const xd = &x->e_mbd;
1255 const struct macroblock_plane *const p = &x->plane[plane];
1256 #if CONFIG_DAALA_DIST
1257 int qm = OD_HVS_QM;
1258 int use_activity_masking = 0;
1259 #if CONFIG_PVQ
1260 use_activity_masking = x->daala_enc.use_activity_masking;
1261 #endif // CONFIG_PVQ
1262 struct macroblockd_plane *const pd = &xd->plane[plane];
1263 #else // CONFIG_DAALA_DIST
1264 const struct macroblockd_plane *const pd = &xd->plane[plane];
1265 #endif // CONFIG_DAALA_DIST
1267 if (cpi->sf.use_transform_domain_distortion && !CONFIG_DAALA_DIST) {
1268 // Transform domain distortion computation is more efficient as it does
1269 // not involve an inverse transform, but it is less accurate.
1270 const int buffer_length = tx_size_2d[tx_size];
1271 int64_t this_sse;
1272 int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
1273 tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
1274 tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
1275 #if CONFIG_PVQ
1276 tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block);
1278 #if CONFIG_HIGHBITDEPTH
1279 const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
1280 *out_dist = av1_highbd_block_error2_c(coeff, dqcoeff, ref_coeff,
1281 buffer_length, &this_sse, bd) >>
1282 shift;
1283 #else
1284 *out_dist = av1_block_error2_c(coeff, dqcoeff, ref_coeff, buffer_length,
1285 &this_sse) >>
1286 shift;
1287 #endif // CONFIG_HIGHBITDEPTH
1288 #elif CONFIG_HIGHBITDEPTH
1289 const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
1290 *out_dist =
1291 av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse, bd) >>
1292 shift;
1293 #else
1294 *out_dist =
1295 av1_block_error(coeff, dqcoeff, buffer_length, &this_sse) >> shift;
1296 #endif // CONFIG_PVQ
1297 *out_sse = this_sse >> shift;
1298 } else {
1299 const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
1300 #if !CONFIG_PVQ || CONFIG_DAALA_DIST
1301 const int bsw = block_size_wide[tx_bsize];
1302 const int bsh = block_size_high[tx_bsize];
1303 #endif
1304 const int src_stride = x->plane[plane].src.stride;
1305 const int dst_stride = xd->plane[plane].dst.stride;
1306 // Scale the transform block index to pixel units.
1307 const int src_idx = (blk_row * src_stride + blk_col)
1308 << tx_size_wide_log2[0];
1309 const int dst_idx = (blk_row * dst_stride + blk_col)
1310 << tx_size_wide_log2[0];
1311 const uint8_t *src = &x->plane[plane].src.buf[src_idx];
1312 const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx];
1313 const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
1314 const uint16_t eob = p->eobs[block];
1316 assert(cpi != NULL);
1317 assert(tx_size_wide_log2[0] == tx_size_high_log2[0]);
1319 #if CONFIG_DAALA_DIST
1320 if (plane == 0 && bsw >= 8 && bsh >= 8) {
1321 if (output_status == OUTPUT_HAS_DECODED_PIXELS) {
1322 const int pred_stride = block_size_wide[plane_bsize];
1323 const int pred_idx = (blk_row * pred_stride + blk_col)
1324 << tx_size_wide_log2[0];
1325 const int16_t *pred = &pd->pred[pred_idx];
1326 int i, j;
1327 DECLARE_ALIGNED(16, uint8_t, pred8[MAX_TX_SQUARE]);
1329 for (j = 0; j < bsh; j++)
1330 for (i = 0; i < bsw; i++)
1331 pred8[j * bsw + i] = pred[j * pred_stride + i];
1332 *out_sse = av1_daala_dist(src, src_stride, pred8, bsw, bsw, bsh, qm,
1333 use_activity_masking, x->qindex);
1334 } else {
1335 *out_sse = av1_daala_dist(src, src_stride, dst, dst_stride, bsw, bsh,
1336 qm, use_activity_masking, x->qindex);
1338 } else
1339 #endif // CONFIG_DAALA_DIST
1341 const int diff_stride = block_size_wide[plane_bsize];
1342 const int diff_idx = (blk_row * diff_stride + blk_col)
1343 << tx_size_wide_log2[0];
1344 const int16_t *diff = &p->src_diff[diff_idx];
1345 *out_sse = sum_squares_visible(xd, plane, diff, diff_stride, blk_row,
1346 blk_col, plane_bsize, tx_bsize);
1347 #if CONFIG_HIGHBITDEPTH
1348 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
1349 *out_sse = ROUND_POWER_OF_TWO(*out_sse, (xd->bd - 8) * 2);
1350 #endif // CONFIG_HIGHBITDEPTH
1352 *out_sse *= 16;
1354 if (eob) {
1355 if (output_status == OUTPUT_HAS_DECODED_PIXELS) {
1356 #if CONFIG_DAALA_DIST
1357 if (plane == 0 && bsw >= 8 && bsh >= 8)
1358 *out_dist = av1_daala_dist(src, src_stride, dst, dst_stride, bsw, bsh,
1359 qm, use_activity_masking, x->qindex);
1360 else
1361 #endif // CONFIG_DAALA_DIST
1362 *out_dist =
1363 pixel_sse(cpi, xd, plane, src, src_stride, dst, dst_stride,
1364 blk_row, blk_col, plane_bsize, tx_bsize);
1365 } else {
1366 #if CONFIG_HIGHBITDEPTH
1367 uint8_t *recon;
1368 DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]);
1370 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
1371 recon = CONVERT_TO_BYTEPTR(recon16);
1372 else
1373 recon = (uint8_t *)recon16;
1374 #else
1375 DECLARE_ALIGNED(16, uint8_t, recon[MAX_TX_SQUARE]);
1376 #endif // CONFIG_HIGHBITDEPTH
1378 #if !CONFIG_PVQ
1379 #if CONFIG_HIGHBITDEPTH
1380 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
1381 aom_highbd_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, NULL, 0,
1382 NULL, 0, bsw, bsh, xd->bd);
1383 } else {
1384 #endif // CONFIG_HIGHBITDEPTH
1385 aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, NULL, 0, NULL,
1386 0, bsw, bsh);
1387 #if CONFIG_HIGHBITDEPTH
1389 #endif // CONFIG_HIGHBITDEPTH
1390 #else
1391 (void)dst;
1392 #endif // !CONFIG_PVQ
1394 const PLANE_TYPE plane_type = get_plane_type(plane);
1395 TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
1397 av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, recon,
1398 MAX_TX_SIZE, eob);
1400 #if CONFIG_DAALA_DIST
1401 if (plane == 0 && bsw >= 8 && bsh >= 8) {
1402 *out_dist = av1_daala_dist(src, src_stride, recon, MAX_TX_SIZE, bsw,
1403 bsh, qm, use_activity_masking, x->qindex);
1404 } else {
1405 if (plane == 0) {
1406 // Save the decoded pixels of an inter block in pd->pred so that
1407 // block_8x8_rd_txfm_daala_dist() does not need to reproduce them
1408 // by calling av1_inverse_transform_block() again.
1409 const int pred_stride = block_size_wide[plane_bsize];
1410 const int pred_idx = (blk_row * pred_stride + blk_col)
1411 << tx_size_wide_log2[0];
1412 int16_t *pred = &pd->pred[pred_idx];
1413 int i, j;
1415 for (j = 0; j < bsh; j++)
1416 for (i = 0; i < bsw; i++)
1417 pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i];
1419 #endif // CONFIG_DAALA_DIST
1420 *out_dist =
1421 pixel_sse(cpi, xd, plane, src, src_stride, recon, MAX_TX_SIZE,
1422 blk_row, blk_col, plane_bsize, tx_bsize);
1423 #if CONFIG_DAALA_DIST
1425 #endif // CONFIG_DAALA_DIST
1427 *out_dist *= 16;
1428 } else {
1429 *out_dist = *out_sse;
1434 static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
1435 BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
1436 struct rdcost_block_args *args = arg;
1437 MACROBLOCK *const x = args->x;
1438 MACROBLOCKD *const xd = &x->e_mbd;
1439 const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
1440 const AV1_COMP *cpi = args->cpi;
1441 ENTROPY_CONTEXT *a = args->t_above + blk_col;
1442 ENTROPY_CONTEXT *l = args->t_left + blk_row;
1443 #if !CONFIG_TXK_SEL
1444 const AV1_COMMON *cm = &cpi->common;
1445 #endif
1446 int64_t rd1, rd2, rd;
1447 RD_STATS this_rd_stats;
1449 assert(tx_size == get_tx_size(plane, xd));
1451 av1_init_rd_stats(&this_rd_stats);
1453 if (args->exit_early) return;
1455 if (!is_inter_block(mbmi)) {
1456 #if CONFIG_CFL
1458 #if CONFIG_EC_ADAPT
1459 FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
1460 #else
1461 FRAME_CONTEXT *const ec_ctx = cm->fc;
1462 #endif // CONFIG_EC_ADAPT
1464 av1_predict_intra_block_encoder_facade(x, ec_ctx, plane, block, blk_col,
1465 blk_row, tx_size, plane_bsize);
1466 #else
1467 av1_predict_intra_block_facade(xd, plane, block, blk_col, blk_row, tx_size);
1468 #endif
1469 #if CONFIG_DPCM_INTRA
1470 const int block_raster_idx =
1471 av1_block_index_to_raster_order(tx_size, block);
1472 const PREDICTION_MODE mode =
1473 (plane == 0) ? get_y_mode(xd->mi[0], block_raster_idx) : mbmi->uv_mode;
1474 TX_TYPE tx_type = get_tx_type((plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV,
1475 xd, block, tx_size);
1476 if (av1_use_dpcm_intra(plane, mode, tx_type, mbmi)) {
1477 int8_t skip;
1478 av1_encode_block_intra_dpcm(cm, x, mode, plane, block, blk_row, blk_col,
1479 plane_bsize, tx_size, tx_type, a, l, &skip);
1480 av1_dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
1481 tx_size, &this_rd_stats.dist, &this_rd_stats.sse,
1482 OUTPUT_HAS_DECODED_PIXELS);
1483 goto CALCULATE_RD;
1485 #endif // CONFIG_DPCM_INTRA
1486 av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
1489 #if !CONFIG_TXK_SEL
1490 // full forward transform and quantization
1491 const int coeff_ctx = combine_entropy_contexts(*a, *l);
1492 av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
1493 coeff_ctx, AV1_XFORM_QUANT_FP);
1494 av1_optimize_b(cm, x, plane, block, tx_size, coeff_ctx);
1496 if (!is_inter_block(mbmi)) {
1497 struct macroblock_plane *const p = &x->plane[plane];
1498 av1_inverse_transform_block_facade(xd, plane, block, blk_row, blk_col,
1499 p->eobs[block]);
1500 av1_dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
1501 tx_size, &this_rd_stats.dist, &this_rd_stats.sse,
1502 OUTPUT_HAS_DECODED_PIXELS);
1503 } else {
1504 av1_dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
1505 tx_size, &this_rd_stats.dist, &this_rd_stats.sse,
1506 OUTPUT_HAS_PREDICTED_PIXELS);
1508 #if CONFIG_CFL
1509 if (plane == AOM_PLANE_Y && x->cfl_store_y) {
1510 struct macroblockd_plane *const pd = &xd->plane[plane];
1511 const int dst_stride = pd->dst.stride;
1512 uint8_t *dst =
1513 &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
1514 cfl_store(xd->cfl, dst, dst_stride, blk_row, blk_col, tx_size);
1516 #endif
1517 #if CONFIG_DPCM_INTRA
1518 CALCULATE_RD : {}
1519 #endif // CONFIG_DPCM_INTRA
1520 rd = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.dist);
1521 if (args->this_rd + rd > args->best_rd) {
1522 args->exit_early = 1;
1523 return;
1525 #if !CONFIG_PVQ
1526 const PLANE_TYPE plane_type = get_plane_type(plane);
1527 const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
1528 const SCAN_ORDER *scan_order =
1529 get_scan(cm, tx_size, tx_type, is_inter_block(mbmi));
1530 this_rd_stats.rate =
1531 av1_cost_coeffs(cpi, x, plane, block, tx_size, scan_order, a, l,
1532 args->use_fast_coef_costing);
1533 #else // !CONFIG_PVQ
1534 this_rd_stats.rate = x->rate;
1535 #endif // !CONFIG_PVQ
1536 #else // !CONFIG_TXK_SEL
1537 av1_search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize,
1538 tx_size, a, l, args->use_fast_coef_costing,
1539 &this_rd_stats);
1540 #endif // !CONFIG_TXK_SEL
1542 #if !CONFIG_PVQ
1543 #if CONFIG_RD_DEBUG
1544 av1_update_txb_coeff_cost(&this_rd_stats, plane, tx_size, blk_row, blk_col,
1545 this_rd_stats.rate);
1546 #endif // CONFIG_RD_DEBUG
1547 av1_set_txb_context(x, plane, block, tx_size, a, l);
1548 #endif // !CONFIG_PVQ
1550 rd1 = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate, this_rd_stats.dist);
1551 rd2 = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.sse);
1553 // TODO(jingning): temporarily enabled only for luma component
1554 rd = AOMMIN(rd1, rd2);
1556 #if CONFIG_DAALA_DIST
1557 if (plane == 0 && plane_bsize >= BLOCK_8X8 &&
1558 (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4)) {
1559 this_rd_stats.dist = 0;
1560 this_rd_stats.sse = 0;
1561 rd = 0;
1562 x->rate_4x4[block] = this_rd_stats.rate;
1564 #endif // CONFIG_DAALA_DIST
1566 #if !CONFIG_PVQ
1567 this_rd_stats.skip &= !x->plane[plane].eobs[block];
1568 #else
1569 this_rd_stats.skip &= x->pvq_skip[plane];
1570 #endif // !CONFIG_PVQ
1571 av1_merge_rd_stats(&args->rd_stats, &this_rd_stats);
1573 args->this_rd += rd;
1575 if (args->this_rd > args->best_rd) {
1576 args->exit_early = 1;
1577 return;
1581 #if CONFIG_DAALA_DIST
1582 static void block_8x8_rd_txfm_daala_dist(int plane, int block, int blk_row,
1583 int blk_col, BLOCK_SIZE plane_bsize,
1584 TX_SIZE tx_size, void *arg) {
1585 struct rdcost_block_args *args = arg;
1586 MACROBLOCK *const x = args->x;
1587 MACROBLOCKD *const xd = &x->e_mbd;
1588 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
1589 int64_t rd, rd1, rd2;
1590 RD_STATS this_rd_stats;
1591 int qm = OD_HVS_QM;
1592 int use_activity_masking = 0;
1594 (void)tx_size;
1596 assert(plane == 0);
1597 assert(plane_bsize >= BLOCK_8X8);
1598 #if CONFIG_PVQ
1599 use_activity_masking = x->daala_enc.use_activity_masking;
1600 #endif // CONFIG_PVQ
1601 av1_init_rd_stats(&this_rd_stats);
1603 if (args->exit_early) return;
1606 const struct macroblock_plane *const p = &x->plane[plane];
1607 struct macroblockd_plane *const pd = &xd->plane[plane];
1609 const int src_stride = p->src.stride;
1610 const int dst_stride = pd->dst.stride;
1611 const int diff_stride = block_size_wide[plane_bsize];
1613 const uint8_t *src =
1614 &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
1615 const uint8_t *dst =
1616 &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
1618 unsigned int tmp1, tmp2;
1619 int qindex = x->qindex;
1620 const int pred_stride = block_size_wide[plane_bsize];
1621 const int pred_idx = (blk_row * pred_stride + blk_col)
1622 << tx_size_wide_log2[0];
1623 int16_t *pred = &pd->pred[pred_idx];
1624 int i, j;
1625 const int tx_blk_size = 8;
1627 DECLARE_ALIGNED(16, uint8_t, pred8[8 * 8]);
1629 for (j = 0; j < tx_blk_size; j++)
1630 for (i = 0; i < tx_blk_size; i++)
1631 pred8[j * tx_blk_size + i] = pred[j * diff_stride + i];
1633 tmp1 = av1_daala_dist(src, src_stride, pred8, tx_blk_size, 8, 8, qm,
1634 use_activity_masking, qindex);
1635 tmp2 = av1_daala_dist(src, src_stride, dst, dst_stride, 8, 8, qm,
1636 use_activity_masking, qindex);
1638 if (!is_inter_block(mbmi)) {
1639 this_rd_stats.sse = (int64_t)tmp1 * 16;
1640 this_rd_stats.dist = (int64_t)tmp2 * 16;
1641 } else {
1642 // For inter mode, the decoded pixels are provided in pd->pred,
1643 // while the predicted pixels are in dst.
1644 this_rd_stats.sse = (int64_t)tmp2 * 16;
1645 this_rd_stats.dist = (int64_t)tmp1 * 16;
1649 rd = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.dist);
1650 if (args->this_rd + rd > args->best_rd) {
1651 args->exit_early = 1;
1652 return;
1656 const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
1657 const uint8_t txw_unit = tx_size_wide_unit[tx_size];
1658 const uint8_t txh_unit = tx_size_high_unit[tx_size];
1659 const int step = txw_unit * txh_unit;
1660 int offset_h = tx_size_high_unit[TX_4X4];
1661 // The rate of the current 8x8 block is the sum of the rates of its four 4x4 blocks.
1662 this_rd_stats.rate =
1663 x->rate_4x4[block - max_blocks_wide * offset_h - step] +
1664 x->rate_4x4[block - max_blocks_wide * offset_h] +
1665 x->rate_4x4[block - step] + x->rate_4x4[block];
1667 rd1 = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate, this_rd_stats.dist);
1668 rd2 = RDCOST(x->rdmult, x->rddiv, 0, this_rd_stats.sse);
1669 rd = AOMMIN(rd1, rd2);
1671 args->rd_stats.dist += this_rd_stats.dist;
1672 args->rd_stats.sse += this_rd_stats.sse;
1674 args->this_rd += rd;
1676 if (args->this_rd > args->best_rd) {
1677 args->exit_early = 1;
1678 return;
1681 #endif // CONFIG_DAALA_DIST
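// Computes the rate-distortion statistics for all transform blocks of one
// plane at a fixed transform size. Each block is visited with block_rd_txfm
// (for 4x4/4x8/8x4 transforms on luma blocks of at least 8x8 under
// CONFIG_DAALA_DIST, the 8x8 helper above is used as well), and rd_stats is
// marked invalid if the early-exit threshold is hit.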
1683 static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
1684 RD_STATS *rd_stats, int64_t ref_best_rd, int plane,
1685 BLOCK_SIZE bsize, TX_SIZE tx_size,
1686 int use_fast_coef_casting) {
1687 MACROBLOCKD *const xd = &x->e_mbd;
1688 const struct macroblockd_plane *const pd = &xd->plane[plane];
1689 struct rdcost_block_args args;
1690 av1_zero(args);
1691 args.x = x;
1692 args.cpi = cpi;
1693 args.best_rd = ref_best_rd;
1694 args.use_fast_coef_costing = use_fast_coef_casting;
1695 av1_init_rd_stats(&args.rd_stats);
1697 if (plane == 0) xd->mi[0]->mbmi.tx_size = tx_size;
1699 av1_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
1701 #if CONFIG_DAALA_DIST
1702 if (plane == 0 && bsize >= BLOCK_8X8 &&
1703 (tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4))
1704 av1_foreach_8x8_transformed_block_in_yplane(
1705 xd, bsize, block_rd_txfm, block_8x8_rd_txfm_daala_dist, &args);
1706 else
1707 #endif // CONFIG_DAALA_DIST
1708 av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm,
1709 &args);
1711 if (args.exit_early) {
1712 av1_invalid_rd_stats(rd_stats);
1713 } else {
1714 *rd_stats = args.rd_stats;
1718 #if CONFIG_SUPERTX
1719 void av1_txfm_rd_in_plane_supertx(MACROBLOCK *x, const AV1_COMP *cpi, int *rate,
1720 int64_t *distortion, int *skippable,
1721 int64_t *sse, int64_t ref_best_rd, int plane,
1722 BLOCK_SIZE bsize, TX_SIZE tx_size,
1723 int use_fast_coef_casting) {
1724 MACROBLOCKD *const xd = &x->e_mbd;
1725 const struct macroblockd_plane *const pd = &xd->plane[plane];
1726 struct rdcost_block_args args;
1727 av1_zero(args);
1728 args.cpi = cpi;
1729 args.x = x;
1730 args.best_rd = ref_best_rd;
1731 args.use_fast_coef_costing = use_fast_coef_casting;
1733 #if CONFIG_EXT_TX
1734 assert(tx_size < TX_SIZES);
1735 #endif // CONFIG_EXT_TX
1737 if (plane == 0) xd->mi[0]->mbmi.tx_size = tx_size;
1739 av1_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
1741 block_rd_txfm(plane, 0, 0, 0, get_plane_block_size(bsize, pd), tx_size,
1742 &args);
1744 if (args.exit_early) {
1745 *rate = INT_MAX;
1746 *distortion = INT64_MAX;
1747 *sse = INT64_MAX;
1748 *skippable = 0;
1749 } else {
1750 *distortion = args.rd_stats.dist;
1751 *rate = args.rd_stats.rate;
1752 *sse = args.rd_stats.sse;
1753 *skippable = !x->plane[plane].eobs[0];
1756 #endif // CONFIG_SUPERTX
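// Returns the number of bits needed to signal the transform size. The cost is
// nonzero only when the frame uses TX_MODE_SELECT and the block is at least
// BLOCK_8X8; otherwise the transform size is implied and costs nothing.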
1758 static int tx_size_cost(const AV1_COMP *const cpi, const MACROBLOCK *const x,
1759 BLOCK_SIZE bsize, TX_SIZE tx_size) {
1760 const AV1_COMMON *const cm = &cpi->common;
1761 const MACROBLOCKD *const xd = &x->e_mbd;
1762 const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
1764 const int tx_select =
1765 cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8;
1767 if (tx_select) {
1768 const int is_inter = is_inter_block(mbmi);
1769 const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
1770 : intra_tx_size_cat_lookup[bsize];
1771 const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
1772 const int depth = tx_size_to_depth(coded_tx_size);
1773 const int tx_size_ctx = get_tx_size_context(xd);
1774 int r_tx_size = cpi->tx_size_cost[tx_size_cat][tx_size_ctx][depth];
1775 #if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT
1776 if (is_quarter_tx_allowed(xd, mbmi, is_inter) && tx_size != coded_tx_size)
1777 r_tx_size += av1_cost_bit(cm->fc->quarter_tx_size_prob,
1778 tx_size == quarter_txsize_lookup[bsize]);
1779 #endif // CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_RECT_TX_EXT
1780 return r_tx_size;
1781 } else {
1782 return 0;
1786 // TODO(angiebird): use this function whenever possible.
1787 int av1_tx_type_cost(const AV1_COMP *cpi, const MACROBLOCKD *xd,
1788 BLOCK_SIZE bsize, int plane, TX_SIZE tx_size,
1789 TX_TYPE tx_type) {
1790 if (plane > 0) return 0;
1792 const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
1793 const int is_inter = is_inter_block(mbmi);
1794 #if CONFIG_EXT_TX
1795 const AV1_COMMON *cm = &cpi->common;
1796 if (get_ext_tx_types(tx_size, bsize, is_inter, cm->reduced_tx_set_used) > 1 &&
1797 !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
1798 const int ext_tx_set =
1799 get_ext_tx_set(tx_size, bsize, is_inter, cm->reduced_tx_set_used);
1800 if (is_inter) {
1801 if (ext_tx_set > 0)
1802 return cpi
1803 ->inter_tx_type_costs[ext_tx_set][txsize_sqr_map[tx_size]][tx_type];
1804 } else {
1805 if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
1806 return cpi->intra_tx_type_costs[ext_tx_set][txsize_sqr_map[tx_size]]
1807 [mbmi->mode][tx_type];
1810 #else
1811 (void)bsize;
1812 if (tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
1813 !FIXED_TX_TYPE) {
1814 if (is_inter) {
1815 return cpi->inter_tx_type_costs[tx_size][tx_type];
1816 } else {
1817 return cpi->intra_tx_type_costs[tx_size]
1818 [intra_mode_to_tx_type_context[mbmi->mode]]
1819 [tx_type];
1822 #endif // CONFIG_EXT_TX
1823 return 0;
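// Computes the luma RD cost of one (tx_type, tx_size) pair: runs the
// per-plane transform search, adds the transform-type and (when selectable)
// transform-size signaling bits, and folds in the skip flag. For inter,
// non-lossless blocks the cost of coding the block as skipped is also
// considered and the smaller of the two costs is returned.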
1825 static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
1826 RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs,
1827 TX_TYPE tx_type, int tx_size) {
1828 const AV1_COMMON *const cm = &cpi->common;
1829 MACROBLOCKD *const xd = &x->e_mbd;
1830 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
1831 int64_t rd = INT64_MAX;
1832 aom_prob skip_prob = av1_get_skip_prob(cm, xd);
1833 int s0, s1;
1834 const int is_inter = is_inter_block(mbmi);
1835 const int tx_select =
1836 cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8;
1838 const int r_tx_size = tx_size_cost(cpi, x, bs, tx_size);
1840 assert(skip_prob > 0);
1841 #if CONFIG_EXT_TX && CONFIG_RECT_TX
1842 assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs)));
1843 #endif // CONFIG_EXT_TX && CONFIG_RECT_TX
1845 s0 = av1_cost_bit(skip_prob, 0);
1846 s1 = av1_cost_bit(skip_prob, 1);
1848 mbmi->tx_type = tx_type;
1849 mbmi->tx_size = tx_size;
1850 txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, tx_size,
1851 cpi->sf.use_fast_coef_costing);
1852 if (rd_stats->rate == INT_MAX) return INT64_MAX;
1853 #if !CONFIG_TXK_SEL
1854 int plane = 0;
1855 rd_stats->rate += av1_tx_type_cost(cpi, xd, bs, plane, tx_size, tx_type);
1856 #endif
1858 if (rd_stats->skip) {
1859 if (is_inter) {
1860 rd = RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse);
1861 } else {
1862 rd = RDCOST(x->rdmult, x->rddiv, s1 + r_tx_size * tx_select,
1863 rd_stats->sse);
1865 } else {
1866 rd = RDCOST(x->rdmult, x->rddiv,
1867 rd_stats->rate + s0 + r_tx_size * tx_select, rd_stats->dist);
1870 if (tx_select) rd_stats->rate += r_tx_size;
1872 if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
1873 !(rd_stats->skip))
1874 rd = AOMMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse));
1876 return rd;
1879 static int skip_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
1880 TX_TYPE tx_type, TX_SIZE tx_size) {
1881 const MACROBLOCKD *const xd = &x->e_mbd;
1882 const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
1883 const TX_SIZE max_tx_size = max_txsize_lookup[bs];
1884 const int is_inter = is_inter_block(mbmi);
1885 int prune = 0;
1886 if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
1887 // passing -1 in for tx_type indicates that all 1D
1888 // transforms should be considered for pruning
1889 prune = prune_tx_types(cpi, bs, x, xd, -1);
1891 if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) return 1;
1892 if (FIXED_TX_TYPE && tx_type != get_default_tx_type(0, xd, 0, tx_size))
1893 return 1;
1894 if (!is_inter && x->use_default_intra_tx_type &&
1895 tx_type != get_default_tx_type(0, xd, 0, tx_size))
1896 return 1;
1897 if (is_inter && x->use_default_inter_tx_type &&
1898 tx_type != get_default_tx_type(0, xd, 0, tx_size))
1899 return 1;
1900 if (max_tx_size >= TX_32X32 && tx_size == TX_4X4) return 1;
1901 #if CONFIG_EXT_TX
1902 const AV1_COMMON *const cm = &cpi->common;
1903 int ext_tx_set =
1904 get_ext_tx_set(tx_size, bs, is_inter, cm->reduced_tx_set_used);
1905 if (is_inter) {
1906 if (!ext_tx_used_inter[ext_tx_set][tx_type]) return 1;
1907 if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
1908 if (!do_tx_type_search(tx_type, prune)) return 1;
1910 } else {
1911 if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
1912 if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) return 1;
1914 if (!ext_tx_used_intra[ext_tx_set][tx_type]) return 1;
1916 #else // CONFIG_EXT_TX
1917 if (tx_size >= TX_32X32 && tx_type != DCT_DCT) return 1;
1918 if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
1919 !do_tx_type_search(tx_type, prune))
1920 return 1;
1921 #endif // CONFIG_EXT_TX
1922 return 0;
1925 #if CONFIG_EXT_INTER && (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT)
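// Fast luma RD estimate used by the wedge / compound-segment search:
// evaluates only DCT_DCT at the largest transform size for the block and
// returns the resulting rate, distortion, skip flag and SSE.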
1926 static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs,
1927 MACROBLOCK *x, int *r, int64_t *d, int *s,
1928 int64_t *sse, int64_t ref_best_rd) {
1929 RD_STATS rd_stats;
1930 int64_t rd = txfm_yrd(cpi, x, &rd_stats, ref_best_rd, bs, DCT_DCT,
1931 max_txsize_lookup[bs]);
1932 *r = rd_stats.rate;
1933 *d = rd_stats.dist;
1934 *s = rd_stats.skip;
1935 *sse = rd_stats.sse;
1936 return rd;
1938 #endif // CONFIG_EXT_INTER && (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT)
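// Fixes the transform size to the one implied by the frame's tx_mode and then
// searches only over transform types at that size; the type search is skipped
// for lossless blocks and for sizes where only DCT_DCT is permitted.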
1940 static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
1941 RD_STATS *rd_stats, int64_t ref_best_rd,
1942 BLOCK_SIZE bs) {
1943 const AV1_COMMON *const cm = &cpi->common;
1944 MACROBLOCKD *const xd = &x->e_mbd;
1945 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
1946 TX_TYPE tx_type, best_tx_type = DCT_DCT;
1947 int64_t this_rd, best_rd = INT64_MAX;
1948 aom_prob skip_prob = av1_get_skip_prob(cm, xd);
1949 int s0 = av1_cost_bit(skip_prob, 0);
1950 int s1 = av1_cost_bit(skip_prob, 1);
1951 const int is_inter = is_inter_block(mbmi);
1952 int prune = 0;
1953 const int plane = 0;
1954 #if CONFIG_EXT_TX
1955 int ext_tx_set;
1956 #endif // CONFIG_EXT_TX
1957 av1_invalid_rd_stats(rd_stats);
1959 mbmi->tx_size = tx_size_from_tx_mode(bs, cm->tx_mode, is_inter);
1960 #if CONFIG_VAR_TX
1961 mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
1962 #endif // CONFIG_VAR_TX
1963 #if CONFIG_EXT_TX
1964 ext_tx_set =
1965 get_ext_tx_set(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used);
1966 #endif // CONFIG_EXT_TX
1968 if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
1969 #if CONFIG_EXT_TX
1970 prune = prune_tx_types(cpi, bs, x, xd, ext_tx_set);
1971 #else
1972 prune = prune_tx_types(cpi, bs, x, xd, 0);
1973 #endif // CONFIG_EXT_TX
1974 #if CONFIG_EXT_TX
1975 if (get_ext_tx_types(mbmi->tx_size, bs, is_inter, cm->reduced_tx_set_used) >
1976 1 &&
1977 !xd->lossless[mbmi->segment_id]) {
1978 #if CONFIG_PVQ
1979 od_rollback_buffer pre_buf, post_buf;
1981 od_encode_checkpoint(&x->daala_enc, &pre_buf);
1982 od_encode_checkpoint(&x->daala_enc, &post_buf);
1983 #endif // CONFIG_PVQ
1985 for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
1986 RD_STATS this_rd_stats;
1987 if (is_inter) {
1988 if (x->use_default_inter_tx_type &&
1989 tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
1990 continue;
1991 if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue;
1992 if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
1993 if (!do_tx_type_search(tx_type, prune)) continue;
1995 } else {
1996 if (x->use_default_intra_tx_type &&
1997 tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
1998 continue;
1999 if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
2000 if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) continue;
2002 if (!ext_tx_used_intra[ext_tx_set][tx_type]) continue;
2005 mbmi->tx_type = tx_type;
2007 txfm_rd_in_plane(x, cpi, &this_rd_stats, ref_best_rd, 0, bs,
2008 mbmi->tx_size, cpi->sf.use_fast_coef_costing);
2009 #if CONFIG_PVQ
2010 od_encode_rollback(&x->daala_enc, &pre_buf);
2011 #endif // CONFIG_PVQ
2012 if (this_rd_stats.rate == INT_MAX) continue;
2013 av1_tx_type_cost(cpi, xd, bs, plane, mbmi->tx_size, tx_type);
2015 if (this_rd_stats.skip)
2016 this_rd = RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse);
2017 else
2018 this_rd = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate + s0,
2019 this_rd_stats.dist);
2020 if (is_inter_block(mbmi) && !xd->lossless[mbmi->segment_id] &&
2021 !this_rd_stats.skip)
2022 this_rd =
2023 AOMMIN(this_rd, RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse));
2025 if (this_rd < best_rd) {
2026 best_rd = this_rd;
2027 best_tx_type = mbmi->tx_type;
2028 *rd_stats = this_rd_stats;
2029 #if CONFIG_PVQ
2030 od_encode_checkpoint(&x->daala_enc, &post_buf);
2031 #endif // CONFIG_PVQ
2034 #if CONFIG_PVQ
2035 od_encode_rollback(&x->daala_enc, &post_buf);
2036 #endif // CONFIG_PVQ
2037 } else {
2038 mbmi->tx_type = DCT_DCT;
2039 txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size,
2040 cpi->sf.use_fast_coef_costing);
2042 #else // CONFIG_EXT_TX
2043 if (mbmi->tx_size < TX_32X32 && !xd->lossless[mbmi->segment_id]) {
2044 for (tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
2045 RD_STATS this_rd_stats;
2046 if (!is_inter && x->use_default_intra_tx_type &&
2047 tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
2048 continue;
2049 if (is_inter && x->use_default_inter_tx_type &&
2050 tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
2051 continue;
2052 mbmi->tx_type = tx_type;
2053 txfm_rd_in_plane(x, cpi, &this_rd_stats, ref_best_rd, 0, bs,
2054 mbmi->tx_size, cpi->sf.use_fast_coef_costing);
2055 if (this_rd_stats.rate == INT_MAX) continue;
2057 av1_tx_type_cost(cpi, xd, bs, plane, mbmi->tx_size, tx_type);
2058 if (is_inter) {
2059 if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
2060 !do_tx_type_search(tx_type, prune))
2061 continue;
2063 if (this_rd_stats.skip)
2064 this_rd = RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse);
2065 else
2066 this_rd = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate + s0,
2067 this_rd_stats.dist);
2068 if (is_inter && !xd->lossless[mbmi->segment_id] && !this_rd_stats.skip)
2069 this_rd =
2070 AOMMIN(this_rd, RDCOST(x->rdmult, x->rddiv, s1, this_rd_stats.sse));
2072 if (this_rd < best_rd) {
2073 best_rd = this_rd;
2074 best_tx_type = mbmi->tx_type;
2075 *rd_stats = this_rd_stats;
2078 } else {
2079 mbmi->tx_type = DCT_DCT;
2080 txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size,
2081 cpi->sf.use_fast_coef_costing);
2083 #endif // CONFIG_EXT_TX
2084 mbmi->tx_type = best_tx_type;
2087 static void choose_smallest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
2088 RD_STATS *rd_stats, int64_t ref_best_rd,
2089 BLOCK_SIZE bs) {
2090 MACROBLOCKD *const xd = &x->e_mbd;
2091 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
2093 mbmi->tx_size = TX_4X4;
2094 mbmi->tx_type = DCT_DCT;
2095 #if CONFIG_VAR_TX
2096 mbmi->min_tx_size = get_min_tx_size(TX_4X4);
2097 #endif // CONFIG_VAR_TX
2099 txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size,
2100 cpi->sf.use_fast_coef_costing);
2103 #if CONFIG_TXK_SEL || CONFIG_VAR_TX
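// Number of minimum-sized (4x4) transform blocks contained in 'bsize'; for
// example, BLOCK_16X16 yields 1 << (8 - 4) = 16 blocks.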
2104 static INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) {
2105 int num_blk = 1 << (num_pels_log2_lookup[bsize] - 2 * tx_size_wide_log2[0]);
2106 return num_blk;
2108 #endif // CONFIG_TXK_SEL || CONFIG_VAR_TX
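// Joint search over transform size and transform type for the luma plane.
// Rectangular transforms (and, with CONFIG_RECT_TX_EXT, quarter-size
// transforms) are tried first where allowed, then square sizes from the
// largest permitted size downwards, with skip_txfm_search() pruning
// candidates and an optional early breakout when the RD cost stops improving.
// The winning size/type pair is written back into the mode info.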
2110 static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
2111 MACROBLOCK *x, RD_STATS *rd_stats,
2112 int64_t ref_best_rd, BLOCK_SIZE bs) {
2113 const AV1_COMMON *const cm = &cpi->common;
2114 MACROBLOCKD *const xd = &x->e_mbd;
2115 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
2116 int64_t rd = INT64_MAX;
2117 int n;
2118 int start_tx, end_tx;
2119 int64_t best_rd = INT64_MAX, last_rd = INT64_MAX;
2120 const TX_SIZE max_tx_size = max_txsize_lookup[bs];
2121 TX_SIZE best_tx_size = max_tx_size;
2122 TX_TYPE best_tx_type = DCT_DCT;
2123 #if CONFIG_TXK_SEL
2124 TX_TYPE best_txk_type[MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
2125 const int num_blk = bsize_to_num_blk(bs);
2126 #endif // CONFIG_TXK_SEL
2127 const int tx_select = cm->tx_mode == TX_MODE_SELECT;
2128 const int is_inter = is_inter_block(mbmi);
2129 #if CONFIG_PVQ
2130 od_rollback_buffer buf;
2131 od_encode_checkpoint(&x->daala_enc, &buf);
2132 #endif // CONFIG_PVQ
2134 av1_invalid_rd_stats(rd_stats);
2136 #if CONFIG_EXT_TX && CONFIG_RECT_TX
2137 int evaluate_rect_tx = 0;
2138 if (tx_select) {
2139 evaluate_rect_tx = is_rect_tx_allowed(xd, mbmi);
2140 } else {
2141 const TX_SIZE chosen_tx_size =
2142 tx_size_from_tx_mode(bs, cm->tx_mode, is_inter);
2143 evaluate_rect_tx = is_rect_tx(chosen_tx_size);
2144 assert(IMPLIES(evaluate_rect_tx, is_rect_tx_allowed(xd, mbmi)));
2146 if (evaluate_rect_tx) {
2147 TX_TYPE tx_start = DCT_DCT;
2148 TX_TYPE tx_end = TX_TYPES;
2149 #if CONFIG_TXK_SEL
2150 // The tx_type is a dummy when lv_map is on; the actual tx_type search is
2151 // performed in av1_search_txk_type().
2152 tx_end = DCT_DCT + 1;
2153 #endif
2154 TX_TYPE tx_type;
2155 for (tx_type = tx_start; tx_type < tx_end; ++tx_type) {
2156 if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) continue;
2157 const TX_SIZE rect_tx_size = max_txsize_rect_lookup[bs];
2158 RD_STATS this_rd_stats;
2159 int ext_tx_set =
2160 get_ext_tx_set(rect_tx_size, bs, is_inter, cm->reduced_tx_set_used);
2161 if ((is_inter && ext_tx_used_inter[ext_tx_set][tx_type]) ||
2162 (!is_inter && ext_tx_used_intra[ext_tx_set][tx_type])) {
2163 rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type,
2164 rect_tx_size);
2165 if (rd < best_rd) {
2166 #if CONFIG_TXK_SEL
2167 memcpy(best_txk_type, mbmi->txk_type,
2168 sizeof(best_txk_type[0]) * num_blk);
2169 #endif
2170 best_tx_type = tx_type;
2171 best_tx_size = rect_tx_size;
2172 best_rd = rd;
2173 *rd_stats = this_rd_stats;
2176 #if CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
2177 const int is_inter = is_inter_block(mbmi);
2178 if (mbmi->sb_type < BLOCK_8X8 && is_inter) break;
2179 #endif // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
2183 #if CONFIG_RECT_TX_EXT
2184 // test 1:4/4:1 tx
2185 int evaluate_quarter_tx = 0;
2186 if (is_quarter_tx_allowed(xd, mbmi, is_inter)) {
2187 if (tx_select) {
2188 evaluate_quarter_tx = 1;
2189 } else {
2190 const TX_SIZE chosen_tx_size =
2191 tx_size_from_tx_mode(bs, cm->tx_mode, is_inter);
2192 evaluate_quarter_tx = chosen_tx_size == quarter_txsize_lookup[bs];
2195 if (evaluate_quarter_tx) {
2196 TX_TYPE tx_start = DCT_DCT;
2197 TX_TYPE tx_end = TX_TYPES;
2198 #if CONFIG_TXK_SEL
2199 // The tx_type is a dummy when lv_map is on; the actual tx_type search is
2200 // performed in av1_search_txk_type().
2201 tx_end = DCT_DCT + 1;
2202 #endif
2203 TX_TYPE tx_type;
2204 for (tx_type = tx_start; tx_type < tx_end; ++tx_type) {
2205 if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) continue;
2206 const TX_SIZE tx_size = quarter_txsize_lookup[bs];
2207 RD_STATS this_rd_stats;
2208 int ext_tx_set =
2209 get_ext_tx_set(tx_size, bs, is_inter, cm->reduced_tx_set_used);
2210 if ((is_inter && ext_tx_used_inter[ext_tx_set][tx_type]) ||
2211 (!is_inter && ext_tx_used_intra[ext_tx_set][tx_type])) {
2212 rd =
2213 txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type, tx_size);
2214 if (rd < best_rd) {
2215 #if CONFIG_TXK_SEL
2216 memcpy(best_txk_type, mbmi->txk_type,
2217 sizeof(best_txk_type[0]) * num_blk);
2218 #endif
2219 best_tx_type = tx_type;
2220 best_tx_size = tx_size;
2221 best_rd = rd;
2222 *rd_stats = this_rd_stats;
2225 #if CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
2226 const int is_inter = is_inter_block(mbmi);
2227 if (mbmi->sb_type < BLOCK_8X8 && is_inter) break;
2228 #endif // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
2231 #endif // CONFIG_RECT_TX_EXT
2232 #endif // CONFIG_EXT_TX && CONFIG_RECT_TX
2234 if (tx_select) {
2235 start_tx = max_tx_size;
2236 end_tx = (max_tx_size >= TX_32X32) ? TX_8X8 : TX_4X4;
2237 } else {
2238 const TX_SIZE chosen_tx_size =
2239 tx_size_from_tx_mode(bs, cm->tx_mode, is_inter);
2240 start_tx = chosen_tx_size;
2241 end_tx = chosen_tx_size;
2244 last_rd = INT64_MAX;
2245 for (n = start_tx; n >= end_tx; --n) {
2246 #if CONFIG_EXT_TX && CONFIG_RECT_TX
2247 if (is_rect_tx(n)) break;
2248 #endif // CONFIG_EXT_TX && CONFIG_RECT_TX
2249 TX_TYPE tx_start = DCT_DCT;
2250 TX_TYPE tx_end = TX_TYPES;
2251 #if CONFIG_TXK_SEL
2252 // The tx_type is a dummy when lv_map is on; the actual tx_type search is
2253 // performed in av1_search_txk_type().
2254 tx_end = DCT_DCT + 1;
2255 #endif
2256 TX_TYPE tx_type;
2257 for (tx_type = tx_start; tx_type < tx_end; ++tx_type) {
2258 RD_STATS this_rd_stats;
2259 if (skip_txfm_search(cpi, x, bs, tx_type, n)) continue;
2260 rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, tx_type, n);
2261 #if CONFIG_PVQ
2262 od_encode_rollback(&x->daala_enc, &buf);
2263 #endif // CONFIG_PVQ
2264 // Early termination in transform size search.
2265 if (cpi->sf.tx_size_search_breakout &&
2266 (rd == INT64_MAX ||
2267 (this_rd_stats.skip == 1 && tx_type != DCT_DCT && n < start_tx) ||
2268 (n < (int)max_tx_size && rd > last_rd)))
2269 break;
2271 last_rd = rd;
2272 if (rd < best_rd) {
2273 #if CONFIG_TXK_SEL
2274 memcpy(best_txk_type, mbmi->txk_type,
2275 sizeof(best_txk_type[0]) * num_blk);
2276 #endif
2277 best_tx_type = tx_type;
2278 best_tx_size = n;
2279 best_rd = rd;
2280 *rd_stats = this_rd_stats;
2282 #if CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
2283 const int is_inter = is_inter_block(mbmi);
2284 if (mbmi->sb_type < BLOCK_8X8 && is_inter) break;
2285 #endif // CONFIG_CB4X4 && !USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
2288 mbmi->tx_size = best_tx_size;
2289 mbmi->tx_type = best_tx_type;
2290 #if CONFIG_TXK_SEL
2291 memcpy(mbmi->txk_type, best_txk_type, sizeof(best_txk_type[0]) * num_blk);
2292 #endif
2294 #if CONFIG_VAR_TX
2295 mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
2296 #endif // CONFIG_VAR_TX
2298 #if !CONFIG_EXT_TX
2299 if (mbmi->tx_size >= TX_32X32) assert(mbmi->tx_type == DCT_DCT);
2300 #endif // !CONFIG_EXT_TX
2301 #if CONFIG_PVQ
2302 if (best_rd != INT64_MAX) {
2303 txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs, best_tx_type, best_tx_size);
2305 #endif // CONFIG_PVQ
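// Top-level luma transform search for a block: lossless blocks always use
// 4x4, USE_LARGESTALL keeps the largest size and searches only the type, and
// otherwise the full size/type search above is run.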
2308 static void super_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
2309 RD_STATS *rd_stats, BLOCK_SIZE bs,
2310 int64_t ref_best_rd) {
2311 MACROBLOCKD *xd = &x->e_mbd;
2312 av1_init_rd_stats(rd_stats);
2314 assert(bs == xd->mi[0]->mbmi.sb_type);
2316 if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
2317 choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
2318 } else if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
2319 choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
2320 } else {
2321 choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs);
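// Returns 1 if 'mode' is an oblique direction that can be skipped because the
// best intra mode found so far is not one of its neighboring directional
// modes; only consulted when FLAG_SKIP_INTRA_DIRMISMATCH is set.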
2325 static int conditional_skipintra(PREDICTION_MODE mode,
2326 PREDICTION_MODE best_intra_mode) {
2327 if (mode == D117_PRED && best_intra_mode != V_PRED &&
2328 best_intra_mode != D135_PRED)
2329 return 1;
2330 if (mode == D63_PRED && best_intra_mode != V_PRED &&
2331 best_intra_mode != D45_PRED)
2332 return 1;
2333 if (mode == D207_PRED && best_intra_mode != H_PRED &&
2334 best_intra_mode != D45_PRED)
2335 return 1;
2336 if (mode == D153_PRED && best_intra_mode != H_PRED &&
2337 best_intra_mode != D135_PRED)
2338 return 1;
2339 return 0;
2342 // Model-based RD estimation for luma intra blocks.
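// Builds the intra prediction for the whole block at the transform size
// implied by the frame's tx_mode and estimates rate and distortion with
// model_rd_for_sb() instead of a full transform search; used to prune
// palette, filter-intra and angle-delta candidates before super_block_yrd().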
2343 static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
2344 BLOCK_SIZE bsize, int mode_cost) {
2345 MACROBLOCKD *const xd = &x->e_mbd;
2346 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
2347 assert(!is_inter_block(mbmi));
2348 RD_STATS this_rd_stats;
2349 int row, col;
2350 int64_t temp_sse, this_rd;
2351 const TX_SIZE tx_size = tx_size_from_tx_mode(bsize, cpi->common.tx_mode, 0);
2352 const int stepr = tx_size_high_unit[tx_size];
2353 const int stepc = tx_size_wide_unit[tx_size];
2354 const int max_blocks_wide = max_block_wide(xd, bsize, 0);
2355 const int max_blocks_high = max_block_high(xd, bsize, 0);
2356 mbmi->tx_size = tx_size;
2357 // Prediction.
2358 const int step = stepr * stepc;
2359 int block = 0;
2360 for (row = 0; row < max_blocks_high; row += stepr) {
2361 for (col = 0; col < max_blocks_wide; col += stepc) {
2362 #if CONFIG_CFL
2363 const struct macroblockd_plane *const pd = &xd->plane[0];
2364 const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
2366 #if CONFIG_EC_ADAPT
2367 FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
2368 #else
2369 FRAME_CONTEXT *const ec_ctx = cpi->common.fc;
2370 #endif // CONFIG_EC_ADAPT
2372 av1_predict_intra_block_encoder_facade(x, ec_ctx, 0, block, col, row,
2373 tx_size, plane_bsize);
2374 #else
2375 av1_predict_intra_block_facade(xd, 0, block, col, row, tx_size);
2376 #endif
2377 block += step;
2380 // RD estimation.
2381 model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate,
2382 &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse);
2383 #if CONFIG_EXT_INTRA
2384 if (av1_is_directional_mode(mbmi->mode, bsize)) {
2385 mode_cost += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
2386 MAX_ANGLE_DELTA + mbmi->angle_delta[0]);
2388 #endif // CONFIG_EXT_INTRA
2389 #if CONFIG_FILTER_INTRA
2390 if (mbmi->mode == DC_PRED) {
2391 const aom_prob prob = cpi->common.fc->filter_intra_probs[0];
2392 if (mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) {
2393 const int mode = mbmi->filter_intra_mode_info.filter_intra_mode[0];
2394 mode_cost += (av1_cost_bit(prob, 1) +
2395 write_uniform_cost(FILTER_INTRA_MODES, mode));
2396 } else {
2397 mode_cost += av1_cost_bit(prob, 0);
2400 #endif // CONFIG_FILTER_INTRA
2401 this_rd = RDCOST(x->rdmult, x->rddiv, this_rd_stats.rate + mode_cost,
2402 this_rd_stats.dist);
2403 return this_rd;
2406 #if CONFIG_PALETTE
2407 // Extends 'color_map' array from 'orig_width x orig_height' to 'new_width x
2408 // new_height'. Extra rows and columns are filled in by copying last valid
2409 // row/column.
2410 static void extend_palette_color_map(uint8_t *const color_map, int orig_width,
2411 int orig_height, int new_width,
2412 int new_height) {
2413 int j;
2414 assert(new_width >= orig_width);
2415 assert(new_height >= orig_height);
2416 if (new_width == orig_width && new_height == orig_height) return;
2418 for (j = orig_height - 1; j >= 0; --j) {
2419 memmove(color_map + j * new_width, color_map + j * orig_width, orig_width);
2420 // Copy last column to extra columns.
2421 memset(color_map + j * new_width + orig_width,
2422 color_map[j * new_width + orig_width - 1], new_width - orig_width);
2424 // Copy last row to extra rows.
2425 for (j = orig_height; j < new_height; ++j) {
2426 memcpy(color_map + j * new_width, color_map + (orig_height - 1) * new_width,
2427 new_width);
2431 #if CONFIG_PALETTE_DELTA_ENCODING
2432 // Bias toward using colors in the cache.
2433 // TODO(huisu): Try other schemes to improve compression.
2434 static void optimize_palette_colors(uint16_t *color_cache, int n_cache,
2435 int n_colors, int stride,
2436 float *centroids) {
2437 if (n_cache <= 0) return;
2438 for (int i = 0; i < n_colors * stride; i += stride) {
2439 float min_diff = fabsf(centroids[i] - color_cache[0]);
2440 int idx = 0;
2441 for (int j = 1; j < n_cache; ++j) {
2442 float this_diff = fabsf(centroids[i] - color_cache[j]);
2443 if (this_diff < min_diff) {
2444 min_diff = this_diff;
2445 idx = j;
2448 if (min_diff < 1.5) centroids[i] = color_cache[idx];
2451 #endif // CONFIG_PALETTE_DELTA_ENCODING
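// Searches palette mode for the luma plane: counts the distinct colors in the
// source block, runs k-means for palette sizes from the maximum down to 2,
// costs the palette signaling plus the color-index tokens, and keeps the best
// candidate in best_mbmi / best_palette_color_map. Returns the rate overhead
// of the winning palette, or 0 if no palette beats *best_rd.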
2453 static int rd_pick_palette_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
2454 BLOCK_SIZE bsize, int palette_ctx,
2455 int dc_mode_cost, MB_MODE_INFO *best_mbmi,
2456 uint8_t *best_palette_color_map,
2457 int64_t *best_rd, int64_t *best_model_rd,
2458 int *rate, int *rate_tokenonly,
2459 int64_t *distortion, int *skippable) {
2460 int rate_overhead = 0;
2461 MACROBLOCKD *const xd = &x->e_mbd;
2462 MODE_INFO *const mic = xd->mi[0];
2463 MB_MODE_INFO *const mbmi = &mic->mbmi;
2464 assert(!is_inter_block(mbmi));
2465 int this_rate, colors, n;
2466 const int src_stride = x->plane[0].src.stride;
2467 const uint8_t *const src = x->plane[0].src.buf;
2468 uint8_t *const color_map = xd->plane[0].color_index_map;
2469 int block_width, block_height, rows, cols;
2470 av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
2471 &cols);
2473 assert(cpi->common.allow_screen_content_tools);
2475 #if CONFIG_HIGHBITDEPTH
2476 if (cpi->common.use_highbitdepth)
2477 colors = av1_count_colors_highbd(src, src_stride, rows, cols,
2478 cpi->common.bit_depth);
2479 else
2480 #endif // CONFIG_HIGHBITDEPTH
2481 colors = av1_count_colors(src, src_stride, rows, cols);
2482 #if CONFIG_FILTER_INTRA
2483 mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
2484 #endif // CONFIG_FILTER_INTRA
2486 if (colors > 1 && colors <= 64) {
2487 int r, c, i, j, k, palette_mode_cost;
2488 const int max_itr = 50;
2489 uint8_t color_order[PALETTE_MAX_SIZE];
2490 float *const data = x->palette_buffer->kmeans_data_buf;
2491 float centroids[PALETTE_MAX_SIZE];
2492 float lb, ub, val;
2493 RD_STATS tokenonly_rd_stats;
2494 int64_t this_rd, this_model_rd;
2495 PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
2496 #if CONFIG_HIGHBITDEPTH
2497 uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
2498 if (cpi->common.use_highbitdepth)
2499 lb = ub = src16[0];
2500 else
2501 #endif // CONFIG_HIGHBITDEPTH
2502 lb = ub = src[0];
2504 #if CONFIG_HIGHBITDEPTH
2505 if (cpi->common.use_highbitdepth) {
2506 for (r = 0; r < rows; ++r) {
2507 for (c = 0; c < cols; ++c) {
2508 val = src16[r * src_stride + c];
2509 data[r * cols + c] = val;
2510 if (val < lb)
2511 lb = val;
2512 else if (val > ub)
2513 ub = val;
2516 } else {
2517 #endif // CONFIG_HIGHBITDEPTH
2518 for (r = 0; r < rows; ++r) {
2519 for (c = 0; c < cols; ++c) {
2520 val = src[r * src_stride + c];
2521 data[r * cols + c] = val;
2522 if (val < lb)
2523 lb = val;
2524 else if (val > ub)
2525 ub = val;
2528 #if CONFIG_HIGHBITDEPTH
2530 #endif // CONFIG_HIGHBITDEPTH
2532 mbmi->mode = DC_PRED;
2533 #if CONFIG_FILTER_INTRA
2534 mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
2535 #endif // CONFIG_FILTER_INTRA
2537 if (rows * cols > PALETTE_MAX_BLOCK_SIZE) return 0;
2539 #if CONFIG_PALETTE_DELTA_ENCODING
2540 const MODE_INFO *above_mi = xd->above_mi;
2541 const MODE_INFO *left_mi = xd->left_mi;
2542 uint16_t color_cache[2 * PALETTE_MAX_SIZE];
2543 const int n_cache =
2544 av1_get_palette_cache(above_mi, left_mi, 0, color_cache);
2545 #endif // CONFIG_PALETTE_DELTA_ENCODING
2547 for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2;
2548 --n) {
2549 if (colors == PALETTE_MIN_SIZE) {
2550 // Special case: These colors automatically become the centroids.
2551 assert(colors == n);
2552 assert(colors == 2);
2553 centroids[0] = lb;
2554 centroids[1] = ub;
2555 k = 2;
2556 } else {
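// Seed the k-means centroids at the midpoints of n equal sub-ranges of
// [lb, ub]; e.g. n = 2 places them at lb + (ub - lb) / 4 and
// lb + 3 * (ub - lb) / 4.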
2557 for (i = 0; i < n; ++i) {
2558 centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
2560 av1_k_means(data, centroids, color_map, rows * cols, n, 1, max_itr);
2561 #if CONFIG_PALETTE_DELTA_ENCODING
2562 optimize_palette_colors(color_cache, n_cache, n, 1, centroids);
2563 #endif // CONFIG_PALETTE_DELTA_ENCODING
2564 k = av1_remove_duplicates(centroids, n);
2565 if (k < PALETTE_MIN_SIZE) {
2566 // Too few unique colors to create a palette. And DC_PRED will work
2567 // well for that case anyway. So skip.
2568 continue;
2572 #if CONFIG_HIGHBITDEPTH
2573 if (cpi->common.use_highbitdepth)
2574 for (i = 0; i < k; ++i)
2575 pmi->palette_colors[i] =
2576 clip_pixel_highbd((int)centroids[i], cpi->common.bit_depth);
2577 else
2578 #endif // CONFIG_HIGHBITDEPTH
2579 for (i = 0; i < k; ++i)
2580 pmi->palette_colors[i] = clip_pixel((int)centroids[i]);
2581 pmi->palette_size[0] = k;
2583 av1_calc_indices(data, centroids, color_map, rows * cols, k, 1);
2584 extend_palette_color_map(color_map, cols, rows, block_width,
2585 block_height);
2586 palette_mode_cost =
2587 dc_mode_cost +
2588 cpi->palette_y_size_cost[bsize - BLOCK_8X8][k - PALETTE_MIN_SIZE] +
2589 write_uniform_cost(k, color_map[0]) +
2590 av1_cost_bit(
2591 av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx],
2592 1);
2593 palette_mode_cost += av1_palette_color_cost_y(pmi,
2594 #if CONFIG_PALETTE_DELTA_ENCODING
2595 color_cache, n_cache,
2596 #endif // CONFIG_PALETTE_DELTA_ENCODING
2597 cpi->common.bit_depth);
2598 for (i = 0; i < rows; ++i) {
2599 for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
2600 int color_idx;
2601 const int color_ctx = av1_get_palette_color_index_context(
2602 color_map, block_width, i, j, k, color_order, &color_idx);
2603 assert(color_idx >= 0 && color_idx < k);
2604 palette_mode_cost += cpi->palette_y_color_cost[k - PALETTE_MIN_SIZE]
2605 [color_ctx][color_idx];
2608 this_model_rd = intra_model_yrd(cpi, x, bsize, palette_mode_cost);
2609 if (*best_model_rd != INT64_MAX &&
2610 this_model_rd > *best_model_rd + (*best_model_rd >> 1))
2611 continue;
2612 if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
2613 super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
2614 if (tokenonly_rd_stats.rate == INT_MAX) continue;
2615 this_rate = tokenonly_rd_stats.rate + palette_mode_cost;
2616 this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist);
2617 if (!xd->lossless[mbmi->segment_id] && mbmi->sb_type >= BLOCK_8X8) {
2618 tokenonly_rd_stats.rate -= tx_size_cost(cpi, x, bsize, mbmi->tx_size);
2620 if (this_rd < *best_rd) {
2621 *best_rd = this_rd;
2622 memcpy(best_palette_color_map, color_map,
2623 block_width * block_height * sizeof(color_map[0]));
2624 *best_mbmi = *mbmi;
2625 rate_overhead = this_rate - tokenonly_rd_stats.rate;
2626 if (rate) *rate = this_rate;
2627 if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate;
2628 if (distortion) *distortion = tokenonly_rd_stats.dist;
2629 if (skippable) *skippable = tokenonly_rd_stats.skip;
2634 if (best_mbmi->palette_mode_info.palette_size[0] > 0) {
2635 memcpy(color_map, best_palette_color_map,
2636 rows * cols * sizeof(best_palette_color_map[0]));
2638 *mbmi = *best_mbmi;
2639 return rate_overhead;
2641 #endif // CONFIG_PALETTE
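// Picks the best intra prediction mode for one sub-8x8 prediction block:
// every mode from DC_PRED to TM_PRED (subject to the speed-feature mode mask)
// is predicted, transformed and costed per transform block, with separate
// high-bitdepth and 8-bit paths, and the reconstruction of the winning mode
// is copied back into the destination buffer so later blocks predict from it.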
2643 static int64_t rd_pick_intra_sub_8x8_y_subblock_mode(
2644 const AV1_COMP *const cpi, MACROBLOCK *x, int row, int col,
2645 PREDICTION_MODE *best_mode, const int *bmode_costs, ENTROPY_CONTEXT *a,
2646 ENTROPY_CONTEXT *l, int *bestrate, int *bestratey, int64_t *bestdistortion,
2647 BLOCK_SIZE bsize, TX_SIZE tx_size, int *y_skip, int64_t rd_thresh) {
2648 const AV1_COMMON *const cm = &cpi->common;
2649 PREDICTION_MODE mode;
2650 MACROBLOCKD *const xd = &x->e_mbd;
2651 assert(!is_inter_block(&xd->mi[0]->mbmi));
2652 int64_t best_rd = rd_thresh;
2653 struct macroblock_plane *p = &x->plane[0];
2654 struct macroblockd_plane *pd = &xd->plane[0];
2655 const int src_stride = p->src.stride;
2656 const int dst_stride = pd->dst.stride;
2657 const uint8_t *src_init = &p->src.buf[row * 4 * src_stride + col * 4];
2658 uint8_t *dst_init = &pd->dst.buf[row * 4 * dst_stride + col * 4];
2659 #if CONFIG_CHROMA_2X2
2660 // TODO(jingning): This is a temporary change. The whole function should be
2661 // removed when cb4x4 is enabled.
2662 ENTROPY_CONTEXT ta[4], tempa[4];
2663 ENTROPY_CONTEXT tl[4], templ[4];
2664 #else
2665 ENTROPY_CONTEXT ta[2], tempa[2];
2666 ENTROPY_CONTEXT tl[2], templ[2];
2667 #endif // CONFIG_CHROMA_2X2
2669 const int pred_width_in_4x4_blocks = num_4x4_blocks_wide_lookup[bsize];
2670 const int pred_height_in_4x4_blocks = num_4x4_blocks_high_lookup[bsize];
2671 const int tx_width_unit = tx_size_wide_unit[tx_size];
2672 const int tx_height_unit = tx_size_high_unit[tx_size];
2673 const int pred_block_width = block_size_wide[bsize];
2674 const int pred_block_height = block_size_high[bsize];
2675 const int tx_width = tx_size_wide[tx_size];
2676 const int tx_height = tx_size_high[tx_size];
2677 const int pred_width_in_transform_blocks = pred_block_width / tx_width;
2678 const int pred_height_in_transform_blocks = pred_block_height / tx_height;
2679 int idx, idy;
2680 int best_can_skip = 0;
2681 uint8_t best_dst[8 * 8];
2682 #if CONFIG_HIGHBITDEPTH
2683 uint16_t best_dst16[8 * 8];
2684 #endif // CONFIG_HIGHBITDEPTH
2685 const int is_lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
2686 #if CONFIG_EXT_TX && CONFIG_RECT_TX
2687 const int sub_bsize = bsize;
2688 #else
2689 const int sub_bsize = BLOCK_4X4;
2690 #endif // CONFIG_EXT_TX && CONFIG_RECT_TX
2692 #if CONFIG_PVQ
2693 od_rollback_buffer pre_buf, post_buf;
2694 od_encode_checkpoint(&x->daala_enc, &pre_buf);
2695 od_encode_checkpoint(&x->daala_enc, &post_buf);
2696 #endif // CONFIG_PVQ
2698 assert(bsize < BLOCK_8X8);
2699 assert(tx_width < 8 || tx_height < 8);
2700 #if CONFIG_EXT_TX && CONFIG_RECT_TX
2701 if (is_lossless)
2702 assert(tx_width == 4 && tx_height == 4);
2703 else
2704 assert(tx_width == pred_block_width && tx_height == pred_block_height);
2705 #else
2706 assert(tx_width == 4 && tx_height == 4);
2707 #endif // CONFIG_EXT_TX && CONFIG_RECT_TX
2709 memcpy(ta, a, pred_width_in_transform_blocks * sizeof(a[0]));
2710 memcpy(tl, l, pred_height_in_transform_blocks * sizeof(l[0]));
2712 xd->mi[0]->mbmi.tx_size = tx_size;
2714 #if CONFIG_PALETTE
2715 xd->mi[0]->mbmi.palette_mode_info.palette_size[0] = 0;
2716 #endif // CONFIG_PALETTE
2718 #if CONFIG_HIGHBITDEPTH
2719 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
2720 #if CONFIG_PVQ
2721 od_encode_checkpoint(&x->daala_enc, &pre_buf);
2722 #endif
2723 for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
2724 int64_t this_rd;
2725 int ratey = 0;
2726 int64_t distortion = 0;
2727 int rate = bmode_costs[mode];
2728 int can_skip = 1;
2730 if (!(cpi->sf.intra_y_mode_mask[txsize_sqr_up_map[tx_size]] &
2731 (1 << mode)))
2732 continue;
2734 // Only do the oblique modes if the best so far is
2735 // one of the neighboring directional modes
2736 if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
2737 if (conditional_skipintra(mode, *best_mode)) continue;
2740 memcpy(tempa, ta, pred_width_in_transform_blocks * sizeof(ta[0]));
2741 memcpy(templ, tl, pred_height_in_transform_blocks * sizeof(tl[0]));
2743 for (idy = 0; idy < pred_height_in_transform_blocks; ++idy) {
2744 for (idx = 0; idx < pred_width_in_transform_blocks; ++idx) {
2745 const int block_raster_idx = (row + idy) * 2 + (col + idx);
2746 const int block =
2747 av1_raster_order_to_block_index(tx_size, block_raster_idx);
2748 const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
2749 uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
2750 #if !CONFIG_PVQ
2751 int16_t *const src_diff = av1_raster_block_offset_int16(
2752 BLOCK_8X8, block_raster_idx, p->src_diff);
2753 #endif
2754 int skip;
2755 assert(block < 4);
2756 assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4,
2757 idx == 0 && idy == 0));
2758 assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4,
2759 block == 0 || block == 2));
2760 xd->mi[0]->bmi[block_raster_idx].as_mode = mode;
2761 av1_predict_intra_block(
2762 xd, pd->width, pd->height, txsize_to_bsize[tx_size], mode, dst,
2763 dst_stride, dst, dst_stride, col + idx, row + idy, 0);
2764 #if !CONFIG_PVQ
2765 aom_highbd_subtract_block(tx_height, tx_width, src_diff, 8, src,
2766 src_stride, dst, dst_stride, xd->bd);
2767 #endif
2768 if (is_lossless) {
2769 TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, tx_size);
2770 const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 0);
2771 const int coeff_ctx =
2772 combine_entropy_contexts(tempa[idx], templ[idy]);
2773 #if !CONFIG_PVQ
2774 av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
2775 tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
2776 ratey += av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order,
2777 tempa + idx, templ + idy,
2778 cpi->sf.use_fast_coef_costing);
2779 skip = (p->eobs[block] == 0);
2780 can_skip &= skip;
2781 tempa[idx] = !skip;
2782 templ[idy] = !skip;
2783 #if CONFIG_EXT_TX
2784 if (tx_size == TX_8X4) {
2785 tempa[idx + 1] = tempa[idx];
2786 } else if (tx_size == TX_4X8) {
2787 templ[idy + 1] = templ[idy];
2789 #endif // CONFIG_EXT_TX
2790 #else
2791 (void)scan_order;
2793 av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
2794 tx_size, coeff_ctx, AV1_XFORM_QUANT_B);
2796 ratey += x->rate;
2797 skip = x->pvq_skip[0];
2798 tempa[idx] = !skip;
2799 templ[idy] = !skip;
2800 can_skip &= skip;
2801 #endif
2802 if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
2803 goto next_highbd;
2804 #if CONFIG_PVQ
2805 if (!skip)
2806 #endif
2807 av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
2808 DCT_DCT, tx_size, dst, dst_stride,
2809 p->eobs[block]);
2810 } else {
2811 int64_t dist;
2812 unsigned int tmp;
2813 TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, tx_size);
2814 const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 0);
2815 const int coeff_ctx =
2816 combine_entropy_contexts(tempa[idx], templ[idy]);
2817 #if !CONFIG_PVQ
2818 av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
2819 tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
2820 av1_optimize_b(cm, x, 0, block, tx_size, coeff_ctx);
2821 ratey += av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order,
2822 tempa + idx, templ + idy,
2823 cpi->sf.use_fast_coef_costing);
2824 skip = (p->eobs[block] == 0);
2825 can_skip &= skip;
2826 tempa[idx] = !skip;
2827 templ[idy] = !skip;
2828 #if CONFIG_EXT_TX
2829 if (tx_size == TX_8X4) {
2830 tempa[idx + 1] = tempa[idx];
2831 } else if (tx_size == TX_4X8) {
2832 templ[idy + 1] = templ[idy];
2834 #endif // CONFIG_EXT_TX
2835 #else
2836 (void)scan_order;
2838 av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
2839 tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
2840 ratey += x->rate;
2841 skip = x->pvq_skip[0];
2842 tempa[idx] = !skip;
2843 templ[idy] = !skip;
2844 can_skip &= skip;
2845 #endif
2846 #if CONFIG_PVQ
2847 if (!skip)
2848 #endif
2849 av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
2850 tx_type, tx_size, dst, dst_stride,
2851 p->eobs[block]);
2852 cpi->fn_ptr[sub_bsize].vf(src, src_stride, dst, dst_stride, &tmp);
2853 dist = (int64_t)tmp << 4;
2854 distortion += dist;
2855 if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
2856 goto next_highbd;
2861 rate += ratey;
2862 this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
2864 if (this_rd < best_rd) {
2865 *bestrate = rate;
2866 *bestratey = ratey;
2867 *bestdistortion = distortion;
2868 best_rd = this_rd;
2869 best_can_skip = can_skip;
2870 *best_mode = mode;
2871 memcpy(a, tempa, pred_width_in_transform_blocks * sizeof(tempa[0]));
2872 memcpy(l, templ, pred_height_in_transform_blocks * sizeof(templ[0]));
2873 #if CONFIG_PVQ
2874 od_encode_checkpoint(&x->daala_enc, &post_buf);
2875 #endif
2876 for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy) {
2877 memcpy(best_dst16 + idy * 8,
2878 CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
2879 pred_width_in_transform_blocks * 4 * sizeof(uint16_t));
2882 next_highbd : {}
2883 #if CONFIG_PVQ
2884 od_encode_rollback(&x->daala_enc, &pre_buf);
2885 #endif
2888 if (best_rd >= rd_thresh) return best_rd;
2890 #if CONFIG_PVQ
2891 od_encode_rollback(&x->daala_enc, &post_buf);
2892 #endif
2894 if (y_skip) *y_skip &= best_can_skip;
2896 for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy) {
2897 memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
2898 best_dst16 + idy * 8,
2899 pred_width_in_transform_blocks * 4 * sizeof(uint16_t));
2902 return best_rd;
2904 #endif // CONFIG_HIGHBITDEPTH
2906 #if CONFIG_PVQ
2907 od_encode_checkpoint(&x->daala_enc, &pre_buf);
2908 #endif // CONFIG_PVQ
2910 for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
2911 int64_t this_rd;
2912 int ratey = 0;
2913 int64_t distortion = 0;
2914 int rate = bmode_costs[mode];
2915 int can_skip = 1;
2917 if (!(cpi->sf.intra_y_mode_mask[txsize_sqr_up_map[tx_size]] &
2918 (1 << mode))) {
2919 continue;
2922 // Only do the oblique modes if the best so far is
2923 // one of the neighboring directional modes
2924 if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
2925 if (conditional_skipintra(mode, *best_mode)) continue;
2928 memcpy(tempa, ta, pred_width_in_transform_blocks * sizeof(ta[0]));
2929 memcpy(templ, tl, pred_height_in_transform_blocks * sizeof(tl[0]));
2931 for (idy = 0; idy < pred_height_in_4x4_blocks; idy += tx_height_unit) {
2932 for (idx = 0; idx < pred_width_in_4x4_blocks; idx += tx_width_unit) {
2933 const int block_raster_idx = (row + idy) * 2 + (col + idx);
2934 int block = av1_raster_order_to_block_index(tx_size, block_raster_idx);
2935 const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
2936 uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
2937 #if !CONFIG_PVQ
2938 int16_t *const src_diff = av1_raster_block_offset_int16(
2939 BLOCK_8X8, block_raster_idx, p->src_diff);
2940 #endif // !CONFIG_PVQ
2941 int skip;
2942 assert(block < 4);
2943 assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4,
2944 idx == 0 && idy == 0));
2945 assert(IMPLIES(tx_size == TX_4X8 || tx_size == TX_8X4,
2946 block == 0 || block == 2));
2947 xd->mi[0]->bmi[block_raster_idx].as_mode = mode;
2948 av1_predict_intra_block(xd, pd->width, pd->height,
2949 txsize_to_bsize[tx_size], mode, dst, dst_stride,
2950 dst, dst_stride,
2951 #if CONFIG_CB4X4
2952 2 * (col + idx), 2 * (row + idy),
2953 #else
2954 col + idx, row + idy,
2955 #endif // CONFIG_CB4X4
2956 0);
2957 #if !CONFIG_PVQ
2958 aom_subtract_block(tx_height, tx_width, src_diff, 8, src, src_stride,
2959 dst, dst_stride);
2960 #endif // !CONFIG_PVQ
2962 TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, tx_size);
2963 const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 0);
2964 const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]);
2965 #if CONFIG_CB4X4
2966 block = 4 * block;
2967 #endif // CONFIG_CB4X4
2968 #if !CONFIG_PVQ
2969 const AV1_XFORM_QUANT xform_quant =
2970 is_lossless ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP;
2971 av1_xform_quant(cm, x, 0, block,
2972 #if CONFIG_CB4X4
2973 2 * (row + idy), 2 * (col + idx),
2974 #else
2975 row + idy, col + idx,
2976 #endif // CONFIG_CB4X4
2977 BLOCK_8X8, tx_size, coeff_ctx, xform_quant);
2979 av1_optimize_b(cm, x, 0, block, tx_size, coeff_ctx);
2981 ratey +=
2982 av1_cost_coeffs(cpi, x, 0, block, tx_size, scan_order, tempa + idx,
2983 templ + idy, cpi->sf.use_fast_coef_costing);
2984 skip = (p->eobs[block] == 0);
2985 can_skip &= skip;
2986 tempa[idx] = !skip;
2987 templ[idy] = !skip;
2988 #if CONFIG_EXT_TX
2989 if (tx_size == TX_8X4) {
2990 tempa[idx + 1] = tempa[idx];
2991 } else if (tx_size == TX_4X8) {
2992 templ[idy + 1] = templ[idy];
2994 #endif // CONFIG_EXT_TX
2995 #else
2996 (void)scan_order;
2998 av1_xform_quant(cm, x, 0, block,
2999 #if CONFIG_CB4X4
3000 2 * (row + idy), 2 * (col + idx),
3001 #else
3002 row + idy, col + idx,
3003 #endif // CONFIG_CB4X4
3004 BLOCK_8X8, tx_size, coeff_ctx, AV1_XFORM_QUANT_FP);
3006 ratey += x->rate;
3007 skip = x->pvq_skip[0];
3008 tempa[idx] = !skip;
3009 templ[idy] = !skip;
3010 can_skip &= skip;
3011 #endif // !CONFIG_PVQ
3013 if (!is_lossless) { // To use the pixel-domain distortion, the inverse
3014 // transform must be applied *before* the RD cost is
3015 // calculated. Compared to computing the distortion
3016 // in the frequency domain, the extra encoding
3017 // effort is small.
3018 #if CONFIG_PVQ
3019 if (!skip)
3020 #endif // CONFIG_PVQ
3021 av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
3022 tx_type, tx_size, dst, dst_stride,
3023 p->eobs[block]);
3024 unsigned int tmp;
3025 cpi->fn_ptr[sub_bsize].vf(src, src_stride, dst, dst_stride, &tmp);
3026 const int64_t dist = (int64_t)tmp << 4;
3027 distortion += dist;
3030 if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
3031 goto next;
3033 if (is_lossless) { // Calculate inverse txfm *after* RD cost.
3034 #if CONFIG_PVQ
3035 if (!skip)
3036 #endif // CONFIG_PVQ
3037 av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block),
3038 DCT_DCT, tx_size, dst, dst_stride,
3039 p->eobs[block]);
3044 rate += ratey;
3045 this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
3047 if (this_rd < best_rd) {
3048 *bestrate = rate;
3049 *bestratey = ratey;
3050 *bestdistortion = distortion;
3051 best_rd = this_rd;
3052 best_can_skip = can_skip;
3053 *best_mode = mode;
3054 memcpy(a, tempa, pred_width_in_transform_blocks * sizeof(tempa[0]));
3055 memcpy(l, templ, pred_height_in_transform_blocks * sizeof(templ[0]));
3056 #if CONFIG_PVQ
3057 od_encode_checkpoint(&x->daala_enc, &post_buf);
3058 #endif // CONFIG_PVQ
3059 for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy)
3060 memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
3061 pred_width_in_transform_blocks * 4);
3063 next : {}
3064 #if CONFIG_PVQ
3065 od_encode_rollback(&x->daala_enc, &pre_buf);
3066 #endif // CONFIG_PVQ
3067 } // mode decision loop
3069 if (best_rd >= rd_thresh) return best_rd;
3071 #if CONFIG_PVQ
3072 od_encode_rollback(&x->daala_enc, &post_buf);
3073 #endif // CONFIG_PVQ
3075 if (y_skip) *y_skip &= best_can_skip;
3077 for (idy = 0; idy < pred_height_in_transform_blocks * 4; ++idy)
3078 memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
3079 pred_width_in_transform_blocks * 4);
3081 return best_rd;
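// Intra mode search for blocks smaller than 8x8: chooses a mode for each
// prediction sub-block (4x4, 4x8 or 8x4) of the 8x8 area with the helper
// above, sums their rates and distortions (optionally replacing the summed
// distortion with the Daala 8x8 distortion), and finally adds the cost of
// signaling the DCT_DCT transform type.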
3084 static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi,
3085 MACROBLOCK *mb, int *rate,
3086 int *rate_y, int64_t *distortion,
3087 int *y_skip, int64_t best_rd) {
3088 const MACROBLOCKD *const xd = &mb->e_mbd;
3089 MODE_INFO *const mic = xd->mi[0];
3090 const MODE_INFO *above_mi = xd->above_mi;
3091 const MODE_INFO *left_mi = xd->left_mi;
3092 MB_MODE_INFO *const mbmi = &mic->mbmi;
3093 assert(!is_inter_block(mbmi));
3094 const BLOCK_SIZE bsize = mbmi->sb_type;
3095 const int pred_width_in_4x4_blocks = num_4x4_blocks_wide_lookup[bsize];
3096 const int pred_height_in_4x4_blocks = num_4x4_blocks_high_lookup[bsize];
3097 int idx, idy;
3098 int cost = 0;
3099 int64_t total_distortion = 0;
3100 int tot_rate_y = 0;
3101 int64_t total_rd = 0;
3102 const int *bmode_costs = cpi->mbmode_cost[0];
3103 const int is_lossless = xd->lossless[mbmi->segment_id];
3104 #if CONFIG_EXT_TX && CONFIG_RECT_TX
3105 const TX_SIZE tx_size = is_lossless ? TX_4X4 : max_txsize_rect_lookup[bsize];
3106 #else
3107 const TX_SIZE tx_size = TX_4X4;
3108 #endif // CONFIG_EXT_TX && CONFIG_RECT_TX
3110 #if CONFIG_EXT_INTRA
3111 #if CONFIG_INTRA_INTERP
3112 mbmi->intra_filter = INTRA_FILTER_LINEAR;
3113 #endif // CONFIG_INTRA_INTERP
3114 #endif // CONFIG_EXT_INTRA
3115 #if CONFIG_FILTER_INTRA
3116 mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
3117 #endif // CONFIG_FILTER_INTRA
3119 // TODO(any): Add a search over tx_type to improve RD performance at the
3120 // expense of speed.
3121 mbmi->tx_type = DCT_DCT;
3122 mbmi->tx_size = tx_size;
3124 if (y_skip) *y_skip = 1;
3126 // Pick modes for each prediction sub-block (of size 4x4, 4x8, or 8x4) in this
3127 // 8x8 coding block.
3128 for (idy = 0; idy < 2; idy += pred_height_in_4x4_blocks) {
3129 for (idx = 0; idx < 2; idx += pred_width_in_4x4_blocks) {
3130 PREDICTION_MODE best_mode = DC_PRED;
3131 int r = INT_MAX, ry = INT_MAX;
3132 int64_t d = INT64_MAX, this_rd = INT64_MAX;
3133 int j;
3134 const int pred_block_idx = idy * 2 + idx;
3135 if (cpi->common.frame_type == KEY_FRAME) {
3136 const PREDICTION_MODE A =
3137 av1_above_block_mode(mic, above_mi, pred_block_idx);
3138 const PREDICTION_MODE L =
3139 av1_left_block_mode(mic, left_mi, pred_block_idx);
3141 bmode_costs = cpi->y_mode_costs[A][L];
3143 this_rd = rd_pick_intra_sub_8x8_y_subblock_mode(
3144 cpi, mb, idy, idx, &best_mode, bmode_costs,
3145 xd->plane[0].above_context + idx, xd->plane[0].left_context + idy, &r,
3146 &ry, &d, bsize, tx_size, y_skip, best_rd - total_rd);
3147 #if !CONFIG_DAALA_DIST
3148 if (this_rd >= best_rd - total_rd) return INT64_MAX;
3149 #endif // !CONFIG_DAALA_DIST
3150 total_rd += this_rd;
3151 cost += r;
3152 total_distortion += d;
3153 tot_rate_y += ry;
3155 mic->bmi[pred_block_idx].as_mode = best_mode;
3156 for (j = 1; j < pred_height_in_4x4_blocks; ++j)
3157 mic->bmi[pred_block_idx + j * 2].as_mode = best_mode;
3158 for (j = 1; j < pred_width_in_4x4_blocks; ++j)
3159 mic->bmi[pred_block_idx + j].as_mode = best_mode;
3161 if (total_rd >= best_rd) return INT64_MAX;
3164 mbmi->mode = mic->bmi[3].as_mode;
3166 #if CONFIG_DAALA_DIST
3168 const struct macroblock_plane *p = &mb->plane[0];
3169 const struct macroblockd_plane *pd = &xd->plane[0];
3170 const int src_stride = p->src.stride;
3171 const int dst_stride = pd->dst.stride;
3172 uint8_t *src = p->src.buf;
3173 uint8_t *dst = pd->dst.buf;
3174 int use_activity_masking = 0;
3175 int qm = OD_HVS_QM;
3177 #if CONFIG_PVQ
3178 use_activity_masking = mb->daala_enc.use_activity_masking;
3179 #endif // CONFIG_PVQ
3180 // Daala-defined distortion computed for the block of 8x8 pixels
3181 total_distortion = av1_daala_dist(src, src_stride, dst, dst_stride, 8, 8,
3182 qm, use_activity_masking, mb->qindex)
3183 << 4;
3185 #endif // CONFIG_DAALA_DIST
3186 // Add in the cost of the transform type
3187 if (!is_lossless) {
3188 int rate_tx_type = 0;
3189 #if CONFIG_EXT_TX
3190 if (get_ext_tx_types(tx_size, bsize, 0, cpi->common.reduced_tx_set_used) >
3191 1) {
3192 const int eset =
3193 get_ext_tx_set(tx_size, bsize, 0, cpi->common.reduced_tx_set_used);
3194 rate_tx_type = cpi->intra_tx_type_costs[eset][txsize_sqr_map[tx_size]]
3195 [mbmi->mode][mbmi->tx_type];
3197 #else
3198 rate_tx_type =
3199 cpi->intra_tx_type_costs[txsize_sqr_map[tx_size]]
3200 [intra_mode_to_tx_type_context[mbmi->mode]]
3201 [mbmi->tx_type];
3202 #endif // CONFIG_EXT_TX
3203 assert(mbmi->tx_size == tx_size);
3204 cost += rate_tx_type;
3205 tot_rate_y += rate_tx_type;
3208 *rate = cost;
3209 *rate_y = tot_rate_y;
3210 *distortion = total_distortion;
3212 return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion);
3215 #if CONFIG_FILTER_INTRA
3216 // Return 1 if a filter intra mode is selected; return 0 otherwise.
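// Each candidate mode not excluded by skip_mask is first screened with the
// model RD estimate and then evaluated with super_block_yrd(); when a mode
// wins, the chosen mode, transform size and transform type are written back
// into the mode info.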
3217 static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
3218 int *rate, int *rate_tokenonly,
3219 int64_t *distortion, int *skippable,
3220 BLOCK_SIZE bsize, int mode_cost,
3221 int64_t *best_rd, int64_t *best_model_rd,
3222 uint16_t skip_mask) {
3223 MACROBLOCKD *const xd = &x->e_mbd;
3224 MODE_INFO *const mic = xd->mi[0];
3225 MB_MODE_INFO *mbmi = &mic->mbmi;
3226 int filter_intra_selected_flag = 0;
3227 FILTER_INTRA_MODE mode;
3228 TX_SIZE best_tx_size = TX_4X4;
3229 FILTER_INTRA_MODE_INFO filter_intra_mode_info;
3230 TX_TYPE best_tx_type;
3232 av1_zero(filter_intra_mode_info);
3233 mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 1;
3234 mbmi->mode = DC_PRED;
3235 #if CONFIG_PALETTE
3236 mbmi->palette_mode_info.palette_size[0] = 0;
3237 #endif // CONFIG_PALETTE
3239 for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
3240 int this_rate;
3241 int64_t this_rd, this_model_rd;
3242 RD_STATS tokenonly_rd_stats;
3243 if (skip_mask & (1 << mode)) continue;
3244 mbmi->filter_intra_mode_info.filter_intra_mode[0] = mode;
3245 this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost);
3246 if (*best_model_rd != INT64_MAX &&
3247 this_model_rd > *best_model_rd + (*best_model_rd >> 1))
3248 continue;
3249 if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
3250 super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
3251 if (tokenonly_rd_stats.rate == INT_MAX) continue;
3252 this_rate = tokenonly_rd_stats.rate +
3253 av1_cost_bit(cpi->common.fc->filter_intra_probs[0], 1) +
3254 write_uniform_cost(FILTER_INTRA_MODES, mode) + mode_cost;
3255 this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist);
3257 if (this_rd < *best_rd) {
3258 *best_rd = this_rd;
3259 best_tx_size = mic->mbmi.tx_size;
3260 filter_intra_mode_info = mbmi->filter_intra_mode_info;
3261 best_tx_type = mic->mbmi.tx_type;
3262 *rate = this_rate;
3263 *rate_tokenonly = tokenonly_rd_stats.rate;
3264 *distortion = tokenonly_rd_stats.dist;
3265 *skippable = tokenonly_rd_stats.skip;
3266 filter_intra_selected_flag = 1;
3270 if (filter_intra_selected_flag) {
3271 mbmi->mode = DC_PRED;
3272 mbmi->tx_size = best_tx_size;
3273 mbmi->filter_intra_mode_info.use_filter_intra_mode[0] =
3274 filter_intra_mode_info.use_filter_intra_mode[0];
3275 mbmi->filter_intra_mode_info.filter_intra_mode[0] =
3276 filter_intra_mode_info.filter_intra_mode[0];
3277 mbmi->tx_type = best_tx_type;
3278 return 1;
3279 } else {
3280 return 0;
3283 #endif // CONFIG_FILTER_INTRA
3285 #if CONFIG_EXT_INTRA
3286 // Run RD calculation with given luma intra prediction angle, and return
3287 // the RD cost. Update the best mode info if the RD cost is the best so far.
3288 static int64_t calc_rd_given_intra_angle(
3289 const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mode_cost,
3290 int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate,
3291 RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size,
3292 TX_TYPE *best_tx_type,
3293 #if CONFIG_INTRA_INTERP
3294 INTRA_FILTER *best_filter,
3295 #endif // CONFIG_INTRA_INTERP
3296 int64_t *best_rd, int64_t *best_model_rd) {
3297 int this_rate;
3298 RD_STATS tokenonly_rd_stats;
3299 int64_t this_rd, this_model_rd;
3300 MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
3301 assert(!is_inter_block(mbmi));
3303 mbmi->angle_delta[0] = angle_delta;
3304 this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost);
3305 if (*best_model_rd != INT64_MAX &&
3306 this_model_rd > *best_model_rd + (*best_model_rd >> 1))
3307 return INT64_MAX;
3308 if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
3309 super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in);
3310 if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX;
3312 this_rate = tokenonly_rd_stats.rate + mode_cost +
3313 write_uniform_cost(2 * max_angle_delta + 1,
3314 mbmi->angle_delta[0] + max_angle_delta);
3315 this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist);
3317 if (this_rd < *best_rd) {
3318 *best_rd = this_rd;
3319 *best_angle_delta = mbmi->angle_delta[0];
3320 *best_tx_size = mbmi->tx_size;
3321 #if CONFIG_INTRA_INTERP
3322 *best_filter = mbmi->intra_filter;
3323 #endif // CONFIG_INTRA_INTERP
3324 *best_tx_type = mbmi->tx_type;
3325 *rate = this_rate;
3326 rd_stats->rate = tokenonly_rd_stats.rate;
3327 rd_stats->dist = tokenonly_rd_stats.dist;
3328 rd_stats->skip = tokenonly_rd_stats.skip;
3330 return this_rd;
3333 // With given luma directional intra prediction mode, pick the best angle delta.
3334 // Return the RD cost corresponding to the best angle delta.
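// Search order (as implemented below): even angle deltas (0, 2, 4, ...) are
// tried first for both signs, with the very first evaluation allowed a
// slightly larger RD threshold (best_rd + (best_rd >> 3) rather than >> 5);
// odd deltas are then refined only when at least one of their even neighbors
// produced an RD cost within best_rd + (best_rd >> 5).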
3335 static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
3336 int *rate, RD_STATS *rd_stats,
3337 BLOCK_SIZE bsize, int mode_cost,
3338 int64_t best_rd,
3339 int64_t *best_model_rd) {
3340 MACROBLOCKD *const xd = &x->e_mbd;
3341 MODE_INFO *const mic = xd->mi[0];
3342 MB_MODE_INFO *mbmi = &mic->mbmi;
3343 assert(!is_inter_block(mbmi));
3344 int i, angle_delta, best_angle_delta = 0;
3345 int first_try = 1;
3346 #if CONFIG_INTRA_INTERP
3347 int p_angle;
3348 const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
3349 INTRA_FILTER filter, best_filter = INTRA_FILTER_LINEAR;
3350 #endif // CONFIG_INTRA_INTERP
3351 int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
3352 TX_SIZE best_tx_size = mic->mbmi.tx_size;
3353 TX_TYPE best_tx_type = mbmi->tx_type;
3355 for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
3357 for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
3358 #if CONFIG_INTRA_INTERP
3359 for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) {
3360 if (FILTER_FAST_SEARCH && filter != INTRA_FILTER_LINEAR) continue;
3361 mic->mbmi.intra_filter = filter;
3362 #endif // CONFIG_INTRA_INTERP
3363 for (i = 0; i < 2; ++i) {
3364 best_rd_in = (best_rd == INT64_MAX)
3365 ? INT64_MAX
3366 : (best_rd + (best_rd >> (first_try ? 3 : 5)));
3367 this_rd = calc_rd_given_intra_angle(
3368 cpi, x, bsize,
3369 #if CONFIG_INTRA_INTERP
3370 mode_cost + cpi->intra_filter_cost[intra_filter_ctx][filter],
3371 #else
3372 mode_cost,
3373 #endif // CONFIG_INTRA_INTERP
3374 best_rd_in, (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate,
3375 rd_stats, &best_angle_delta, &best_tx_size, &best_tx_type,
3376 #if CONFIG_INTRA_INTERP
3377 &best_filter,
3378 #endif // CONFIG_INTRA_INTERP
3379 &best_rd, best_model_rd);
3380 rd_cost[2 * angle_delta + i] = this_rd;
3381 if (first_try && this_rd == INT64_MAX) return best_rd;
3382 first_try = 0;
3383 if (angle_delta == 0) {
3384 rd_cost[1] = this_rd;
3385 break;
3388 #if CONFIG_INTRA_INTERP
3390 #endif // CONFIG_INTRA_INTERP
3393 assert(best_rd != INT64_MAX);
3394 for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
3395 int64_t rd_thresh;
3396 #if CONFIG_INTRA_INTERP
3397 for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) {
3398 if (FILTER_FAST_SEARCH && filter != INTRA_FILTER_LINEAR) continue;
3399 mic->mbmi.intra_filter = filter;
3400 #endif // CONFIG_INTRA_INTERP
3401 for (i = 0; i < 2; ++i) {
3402 int skip_search = 0;
3403 rd_thresh = best_rd + (best_rd >> 5);
3404 if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
3405 rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
3406 skip_search = 1;
3407 if (!skip_search) {
3408 calc_rd_given_intra_angle(
3409 cpi, x, bsize,
3410 #if CONFIG_INTRA_INTERP
3411 mode_cost + cpi->intra_filter_cost[intra_filter_ctx][filter],
3412 #else
3413 mode_cost,
3414 #endif // CONFIG_INTRA_INTERP
3415 best_rd, (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate,
3416 rd_stats, &best_angle_delta, &best_tx_size, &best_tx_type,
3417 #if CONFIG_INTRA_INTERP
3418 &best_filter,
3419 #endif // CONFIG_INTRA_INTERP
3420 &best_rd, best_model_rd);
3423 #if CONFIG_INTRA_INTERP
3425 #endif // CONFIG_INTRA_INTERP
3428 #if CONFIG_INTRA_INTERP
3429 if (FILTER_FAST_SEARCH && rd_stats->rate < INT_MAX) {
3430 p_angle = mode_to_angle_map[mbmi->mode] + best_angle_delta * ANGLE_STEP;
3431 if (av1_is_intra_filter_switchable(p_angle)) {
3432 for (filter = INTRA_FILTER_LINEAR + 1; filter < INTRA_FILTERS; ++filter) {
3433 mic->mbmi.intra_filter = filter;
3434 this_rd = calc_rd_given_intra_angle(
3435 cpi, x, bsize,
3436 mode_cost + cpi->intra_filter_cost[intra_filter_ctx][filter],
3437 best_rd, best_angle_delta, MAX_ANGLE_DELTA, rate, rd_stats,
3438 &best_angle_delta, &best_tx_size, &best_tx_type, &best_filter,
3439 &best_rd, best_model_rd);
3443 #endif // CONFIG_INTRA_INTERP
3445 mbmi->tx_size = best_tx_size;
3446 mbmi->angle_delta[0] = best_angle_delta;
3447 #if CONFIG_INTRA_INTERP
3448 mic->mbmi.intra_filter = best_filter;
3449 #endif // CONFIG_INTRA_INTERP
3450 mbmi->tx_type = best_tx_type;
3451 return best_rd;
3454 // Indices are sign, integer, and fractional part of the gradient value
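// Concretely (see angle_estimation() below): the first index is
// (dx > 0) ^ (dy > 0), the second is the integer part of |dx| / |dy| capped
// at 6, and the third is the fractional part of |dx| / |dy| scaled by 16 and
// capped at 15. The table value is one of the 8 directional angle bins.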
3455 static const uint8_t gradient_to_angle_bin[2][7][16] = {
3457 { 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0 },
3458 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },
3459 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
3460 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
3461 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
3462 { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
3463 { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
3466 { 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4 },
3467 { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3 },
3468 { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
3469 { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
3470 { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
3471 { 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
3472 { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
3476 /* clang-format off */
3477 static const uint8_t mode_to_angle_bin[INTRA_MODES] = {
3478 0, 2, 6, 0, 4, 3, 5, 7, 1, 0,
3479 #if CONFIG_ALT_INTRA
3481 #endif // CONFIG_ALT_INTRA
3483 /* clang-format on */
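// angle_estimation() builds a histogram of gradient orientations over the
// source block: each pixel adds its squared gradient magnitude
// (dx * dx + dy * dy) to the bin given by gradient_to_angle_bin. A
// directional mode is then flagged for skipping when its own bin plus the
// immediately neighboring bins capture too small a share of the total
// gradient energy (relative to ANGLE_SKIP_THRESH).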
3485 static void angle_estimation(const uint8_t *src, int src_stride, int rows,
3486 int cols, BLOCK_SIZE bsize,
3487 uint8_t *directional_mode_skip_mask) {
3488 memset(directional_mode_skip_mask, 0,
3489 INTRA_MODES * sizeof(*directional_mode_skip_mask));
3490 // Sub-8x8 blocks do not use extra directions.
3491 if (bsize < BLOCK_8X8) return;
3492 uint64_t hist[DIRECTIONAL_MODES];
3493 memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0]));
3494 src += src_stride;
3495 int r, c, dx, dy;
3496 for (r = 1; r < rows; ++r) {
3497 for (c = 1; c < cols; ++c) {
3498 dx = src[c] - src[c - 1];
3499 dy = src[c] - src[c - src_stride];
3500 int index;
3501 const int temp = dx * dx + dy * dy;
3502 if (dy == 0) {
3503 index = 2;
3504 } else {
3505 const int sn = (dx > 0) ^ (dy > 0);
3506 dx = abs(dx);
3507 dy = abs(dy);
3508 const int remd = (dx % dy) * 16 / dy;
3509 const int quot = dx / dy;
3510 index = gradient_to_angle_bin[sn][AOMMIN(quot, 6)][AOMMIN(remd, 15)];
3512 hist[index] += temp;
3514 src += src_stride;
3517 int i;
3518 uint64_t hist_sum = 0;
3519 for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i];
3520 for (i = 0; i < INTRA_MODES; ++i) {
3521 if (av1_is_directional_mode(i, bsize)) {
3522 const uint8_t angle_bin = mode_to_angle_bin[i];
3523 uint64_t score = 2 * hist[angle_bin];
3524 int weight = 2;
3525 if (angle_bin > 0) {
3526 score += hist[angle_bin - 1];
3527 ++weight;
3529 if (angle_bin < DIRECTIONAL_MODES - 1) {
3530 score += hist[angle_bin + 1];
3531 ++weight;
3533 if (score * ANGLE_SKIP_THRESH < hist_sum * weight)
3534 directional_mode_skip_mask[i] = 1;
3539 #if CONFIG_HIGHBITDEPTH
3540 static void highbd_angle_estimation(const uint8_t *src8, int src_stride,
3541 int rows, int cols, BLOCK_SIZE bsize,
3542 uint8_t *directional_mode_skip_mask) {
3543 memset(directional_mode_skip_mask, 0,
3544 INTRA_MODES * sizeof(*directional_mode_skip_mask));
3545 // Sub-8x8 blocks do not use extra directions.
3546 if (bsize < BLOCK_8X8) return;
3547 uint16_t *src = CONVERT_TO_SHORTPTR(src8);
3548 uint64_t hist[DIRECTIONAL_MODES];
3549 memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0]));
3550 src += src_stride;
3551 int r, c, dx, dy;
3552 for (r = 1; r < rows; ++r) {
3553 for (c = 1; c < cols; ++c) {
3554 dx = src[c] - src[c - 1];
3555 dy = src[c] - src[c - src_stride];
3556 int index;
3557 const int temp = dx * dx + dy * dy;
3558 if (dy == 0) {
3559 index = 2;
3560 } else {
3561 const int sn = (dx > 0) ^ (dy > 0);
3562 dx = abs(dx);
3563 dy = abs(dy);
3564 const int remd = (dx % dy) * 16 / dy;
3565 const int quot = dx / dy;
3566 index = gradient_to_angle_bin[sn][AOMMIN(quot, 6)][AOMMIN(remd, 15)];
3568 hist[index] += temp;
3570 src += src_stride;
3573 int i;
3574 uint64_t hist_sum = 0;
3575 for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i];
3576 for (i = 0; i < INTRA_MODES; ++i) {
3577 if (av1_is_directional_mode(i, bsize)) {
3578 const uint8_t angle_bin = mode_to_angle_bin[i];
3579 uint64_t score = 2 * hist[angle_bin];
3580 int weight = 2;
3581 if (angle_bin > 0) {
3582 score += hist[angle_bin - 1];
3583 ++weight;
3585 if (angle_bin < DIRECTIONAL_MODES - 1) {
3586 score += hist[angle_bin + 1];
3587 ++weight;
3589 if (score * ANGLE_SKIP_THRESH < hist_sum * weight)
3590 directional_mode_skip_mask[i] = 1;
3594 #endif // CONFIG_HIGHBITDEPTH
3595 #endif // CONFIG_EXT_INTRA
3597 // This function is used only for intra_only frames
3598 static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
3599 int *rate, int *rate_tokenonly,
3600 int64_t *distortion, int *skippable,
3601 BLOCK_SIZE bsize, int64_t best_rd) {
3602 uint8_t mode_idx;
3603 MACROBLOCKD *const xd = &x->e_mbd;
3604 MODE_INFO *const mic = xd->mi[0];
3605 MB_MODE_INFO *const mbmi = &mic->mbmi;
3606 assert(!is_inter_block(mbmi));
3607 MB_MODE_INFO best_mbmi = *mbmi;
3608 int64_t best_model_rd = INT64_MAX;
3609 #if CONFIG_EXT_INTRA
3610 const int rows = block_size_high[bsize];
3611 const int cols = block_size_wide[bsize];
3612 #if CONFIG_INTRA_INTERP
3613 const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
3614 #endif // CONFIG_INTRA_INTERP
3615 int is_directional_mode;
3616 uint8_t directional_mode_skip_mask[INTRA_MODES];
3617 const int src_stride = x->plane[0].src.stride;
3618 const uint8_t *src = x->plane[0].src.buf;
3619 #endif // CONFIG_EXT_INTRA
3620 #if CONFIG_FILTER_INTRA
3621 int beat_best_rd = 0;
3622 uint16_t filter_intra_mode_skip_mask = (1 << FILTER_INTRA_MODES) - 1;
3623 #endif // CONFIG_FILTER_INTRA
3624 const int *bmode_costs;
3625 #if CONFIG_PALETTE
3626 PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
3627 uint8_t *best_palette_color_map =
3628 cpi->common.allow_screen_content_tools
3629 ? x->palette_buffer->best_palette_color_map
3630 : NULL;
3631 int palette_y_mode_ctx = 0;
3632 const int try_palette =
3633 cpi->common.allow_screen_content_tools && bsize >= BLOCK_8X8;
3634 #endif // CONFIG_PALETTE
3635 const MODE_INFO *above_mi = xd->above_mi;
3636 const MODE_INFO *left_mi = xd->left_mi;
3637 const PREDICTION_MODE A = av1_above_block_mode(mic, above_mi, 0);
3638 const PREDICTION_MODE L = av1_left_block_mode(mic, left_mi, 0);
3639 const PREDICTION_MODE FINAL_MODE_SEARCH = TM_PRED + 1;
3640 #if CONFIG_PVQ
3641 od_rollback_buffer pre_buf, post_buf;
3643 od_encode_checkpoint(&x->daala_enc, &pre_buf);
3644 od_encode_checkpoint(&x->daala_enc, &post_buf);
3645 #endif // CONFIG_PVQ
3646 bmode_costs = cpi->y_mode_costs[A][L];
3648 #if CONFIG_EXT_INTRA
3649 mbmi->angle_delta[0] = 0;
3650 #if CONFIG_HIGHBITDEPTH
3651 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
3652 highbd_angle_estimation(src, src_stride, rows, cols, bsize,
3653 directional_mode_skip_mask);
3654 else
3655 #endif // CONFIG_HIGHBITDEPTH
3656 angle_estimation(src, src_stride, rows, cols, bsize,
3657 directional_mode_skip_mask);
3658 #endif // CONFIG_EXT_INTRA
3659 #if CONFIG_FILTER_INTRA
3660 mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
3661 #endif // CONFIG_FILTER_INTRA
3662 #if CONFIG_PALETTE
3663 pmi->palette_size[0] = 0;
3664 if (above_mi)
3665 palette_y_mode_ctx +=
3666 (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
3667 if (left_mi)
3668 palette_y_mode_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
3669 #endif // CONFIG_PALETTE
3671 if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
3672 x->use_default_intra_tx_type = 1;
3673 else
3674 x->use_default_intra_tx_type = 0;
3676 /* Y Search for intra prediction mode */
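  // When the fast_intra_tx_type_search speed feature is on, every mode is
  // first evaluated with the default intra tx type only; the extra
  // FINAL_MODE_SEARCH iteration then re-runs the best mode found with the
  // full tx-type search.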
3677 for (mode_idx = DC_PRED; mode_idx <= FINAL_MODE_SEARCH; ++mode_idx) {
3678 RD_STATS this_rd_stats;
3679 int this_rate, this_rate_tokenonly, s;
3680 int64_t this_distortion, this_rd, this_model_rd;
3681 if (mode_idx == FINAL_MODE_SEARCH) {
3682 if (x->use_default_intra_tx_type == 0) break;
3683 mbmi->mode = best_mbmi.mode;
3684 x->use_default_intra_tx_type = 0;
3685 } else {
3686 mbmi->mode = mode_idx;
3688 #if CONFIG_PVQ
3689 od_encode_rollback(&x->daala_enc, &pre_buf);
3690 #endif // CONFIG_PVQ
3691 #if CONFIG_EXT_INTRA
3692 mbmi->angle_delta[0] = 0;
3693 #endif // CONFIG_EXT_INTRA
3694 this_model_rd = intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode]);
3695 if (best_model_rd != INT64_MAX &&
3696 this_model_rd > best_model_rd + (best_model_rd >> 1))
3697 continue;
3698 if (this_model_rd < best_model_rd) best_model_rd = this_model_rd;
3699 #if CONFIG_EXT_INTRA
3700 is_directional_mode = av1_is_directional_mode(mbmi->mode, bsize);
3701 if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue;
3702 if (is_directional_mode) {
3703 this_rd_stats.rate = INT_MAX;
3704 rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rd_stats, bsize,
3705 bmode_costs[mbmi->mode], best_rd, &best_model_rd);
3706 } else {
3707 super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
3709 #else
3710 super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
3711 #endif // CONFIG_EXT_INTRA
3712 this_rate_tokenonly = this_rd_stats.rate;
3713 this_distortion = this_rd_stats.dist;
3714 s = this_rd_stats.skip;
3716 if (this_rate_tokenonly == INT_MAX) continue;
3718 this_rate = this_rate_tokenonly + bmode_costs[mbmi->mode];
3720 if (!xd->lossless[mbmi->segment_id] && mbmi->sb_type >= BLOCK_8X8) {
3721 // super_block_yrd above includes the cost of the tx_size in the
3722 // tokenonly rate, but for intra blocks, tx_size is always coded
3723 // (prediction granularity), so we account for it in the full rate,
3724 // not the tokenonly rate.
3725 this_rate_tokenonly -= tx_size_cost(cpi, x, bsize, mbmi->tx_size);
3727 #if CONFIG_PALETTE
3728 if (try_palette && mbmi->mode == DC_PRED) {
3729 this_rate +=
3730 av1_cost_bit(av1_default_palette_y_mode_prob[bsize - BLOCK_8X8]
3731 [palette_y_mode_ctx],
3734 #endif // CONFIG_PALETTE
3735 #if CONFIG_FILTER_INTRA
3736 if (mbmi->mode == DC_PRED)
3737 this_rate += av1_cost_bit(cpi->common.fc->filter_intra_probs[0], 0);
3738 #endif // CONFIG_FILTER_INTRA
3739 #if CONFIG_EXT_INTRA
3740 if (is_directional_mode) {
3741 #if CONFIG_INTRA_INTERP
3742 const int p_angle =
3743 mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
3744 if (av1_is_intra_filter_switchable(p_angle))
3745 this_rate +=
3746 cpi->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter];
3747 #endif // CONFIG_INTRA_INTERP
3748 this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
3749 MAX_ANGLE_DELTA + mbmi->angle_delta[0]);
3751 #endif // CONFIG_EXT_INTRA
3752 this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
3753 #if CONFIG_FILTER_INTRA
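    // A set bit in filter_intra_mode_skip_mask causes the filter-intra mode
    // at that index to be skipped later in rd_pick_filter_intra_sby(). Modes
    // whose RD cost lands close to the current best get their bit toggled
    // (in effect cleared) here so that the corresponding filter-intra mode is
    // tried.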
3754 if (best_rd == INT64_MAX || this_rd - best_rd < (best_rd >> 4)) {
3755 filter_intra_mode_skip_mask ^= (1 << mbmi->mode);
3757 #endif // CONFIG_FILTER_INTRA
3759 if (this_rd < best_rd) {
3760 best_mbmi = *mbmi;
3761 best_rd = this_rd;
3762 #if CONFIG_FILTER_INTRA
3763 beat_best_rd = 1;
3764 #endif // CONFIG_FILTER_INTRA
3765 *rate = this_rate;
3766 *rate_tokenonly = this_rate_tokenonly;
3767 *distortion = this_distortion;
3768 *skippable = s;
3769 #if CONFIG_PVQ
3770 od_encode_checkpoint(&x->daala_enc, &post_buf);
3771 #endif // CONFIG_PVQ
3775 #if CONFIG_PVQ
3776 od_encode_rollback(&x->daala_enc, &post_buf);
3777 #endif // CONFIG_PVQ
3779 #if CONFIG_CFL
3780 // Perform one extra txfm_rd_in_plane() call, this time with the best value, so
3781 // we can store the reconstructed luma values needed by CfL.
3782 RD_STATS this_rd_stats;
3783 x->cfl_store_y = 1;
3784 txfm_rd_in_plane(x, cpi, &this_rd_stats, INT64_MAX, 0, bsize,
3785 mic->mbmi.tx_size, cpi->sf.use_fast_coef_costing);
3786 x->cfl_store_y = 0;
3787 #endif
3789 #if CONFIG_PALETTE
3790 if (try_palette) {
3791 rd_pick_palette_intra_sby(cpi, x, bsize, palette_y_mode_ctx,
3792 bmode_costs[DC_PRED], &best_mbmi,
3793 best_palette_color_map, &best_rd, &best_model_rd,
3794 rate, rate_tokenonly, distortion, skippable);
3796 #endif // CONFIG_PALETTE
3798 #if CONFIG_FILTER_INTRA
3799 if (beat_best_rd) {
3800 if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion,
3801 skippable, bsize, bmode_costs[DC_PRED],
3802 &best_rd, &best_model_rd,
3803 filter_intra_mode_skip_mask)) {
3804 best_mbmi = *mbmi;
3807 #endif // CONFIG_FILTER_INTRA
3809 *mbmi = best_mbmi;
3810 return best_rd;
3813 // Return value 0: early termination triggered, no valid rd cost available;
3814 // 1: rd cost values are valid.
3815 static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x,
3816 RD_STATS *rd_stats, BLOCK_SIZE bsize,
3817 int64_t ref_best_rd) {
3818 MACROBLOCKD *const xd = &x->e_mbd;
3819 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
3820 const TX_SIZE uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
3821 int plane;
3822 int is_cost_valid = 1;
3823 av1_init_rd_stats(rd_stats);
3825 if (ref_best_rd < 0) is_cost_valid = 0;
3827 #if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
3828 if (x->skip_chroma_rd) return is_cost_valid;
3830 bsize = scale_chroma_bsize(bsize, xd->plane[1].subsampling_x,
3831 xd->plane[1].subsampling_y);
3832 #endif // CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
3834 #if !CONFIG_PVQ
3835 if (is_inter_block(mbmi) && is_cost_valid) {
3836 for (plane = 1; plane < MAX_MB_PLANE; ++plane)
3837 av1_subtract_plane(x, bsize, plane);
3839 #endif // !CONFIG_PVQ
3841 if (is_cost_valid) {
3842 for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
3843 RD_STATS pn_rd_stats;
3844 txfm_rd_in_plane(x, cpi, &pn_rd_stats, ref_best_rd, plane, bsize,
3845 uv_tx_size, cpi->sf.use_fast_coef_costing);
3846 if (pn_rd_stats.rate == INT_MAX) {
3847 is_cost_valid = 0;
3848 break;
3850 av1_merge_rd_stats(rd_stats, &pn_rd_stats);
3851 if (RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist) >
3852 ref_best_rd &&
3853 RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse) > ref_best_rd) {
3854 is_cost_valid = 0;
3855 break;
3860 if (!is_cost_valid) {
3861 // reset cost value
3862 av1_invalid_rd_stats(rd_stats);
3865 return is_cost_valid;
3868 #if CONFIG_VAR_TX
3869 // FIXME crop these calls
3870 static uint64_t sum_squares_2d(const int16_t *diff, int diff_stride,
3871 TX_SIZE tx_size) {
3872 return aom_sum_squares_2d_i16(diff, diff_stride, tx_size_wide[tx_size],
3873 tx_size_high[tx_size]);
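// av1_tx_block_rd_b(): rate/distortion for one transform block. The residual
// is transformed and quantized (AV1_XFORM_QUANT_FP), the coefficients are
// optimized, the prediction is copied into a scratch buffer and the inverse
// transform is added on top, and the distortion is the pixel-domain SSE
// against the source (scaled by 16), with rd_stats->sse taken from the
// prediction residual.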
3876 void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
3877 int blk_row, int blk_col, int plane, int block,
3878 int plane_bsize, const ENTROPY_CONTEXT *a,
3879 const ENTROPY_CONTEXT *l, RD_STATS *rd_stats) {
3880 const AV1_COMMON *const cm = &cpi->common;
3881 MACROBLOCKD *xd = &x->e_mbd;
3882 const struct macroblock_plane *const p = &x->plane[plane];
3883 struct macroblockd_plane *const pd = &xd->plane[plane];
3884 int64_t tmp;
3885 tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
3886 PLANE_TYPE plane_type = get_plane_type(plane);
3887 TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
3888 const SCAN_ORDER *const scan_order =
3889 get_scan(cm, tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
3890 BLOCK_SIZE txm_bsize = txsize_to_bsize[tx_size];
3891 int bh = block_size_high[txm_bsize];
3892 int bw = block_size_wide[txm_bsize];
3893 int txb_h = tx_size_high_unit[tx_size];
3894 int txb_w = tx_size_wide_unit[tx_size];
3896 int src_stride = p->src.stride;
3897 uint8_t *src =
3898 &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
3899 uint8_t *dst =
3900 &pd->dst
3901 .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
3902 #if CONFIG_HIGHBITDEPTH
3903 DECLARE_ALIGNED(16, uint16_t, rec_buffer16[MAX_TX_SQUARE]);
3904 uint8_t *rec_buffer;
3905 #else
3906 DECLARE_ALIGNED(16, uint8_t, rec_buffer[MAX_TX_SQUARE]);
3907 #endif // CONFIG_HIGHBITDEPTH
3908 int max_blocks_high = block_size_high[plane_bsize];
3909 int max_blocks_wide = block_size_wide[plane_bsize];
3910 const int diff_stride = max_blocks_wide;
3911 const int16_t *diff =
3912 &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
3913 int txb_coeff_cost;
3915 assert(tx_size < TX_SIZES_ALL);
3917 if (xd->mb_to_bottom_edge < 0)
3918 max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
3919 if (xd->mb_to_right_edge < 0)
3920 max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
3922 max_blocks_high >>= tx_size_wide_log2[0];
3923 max_blocks_wide >>= tx_size_wide_log2[0];
3925 int coeff_ctx = get_entropy_context(tx_size, a, l);
3927 av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
3928 coeff_ctx, AV1_XFORM_QUANT_FP);
3930 av1_optimize_b(cm, x, plane, block, tx_size, coeff_ctx);
3932 // TODO(any): Use av1_dist_block to compute distortion
3933 #if CONFIG_HIGHBITDEPTH
3934 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
3935 rec_buffer = CONVERT_TO_BYTEPTR(rec_buffer16);
3936 aom_highbd_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL,
3937 0, NULL, 0, bw, bh, xd->bd);
3938 } else {
3939 rec_buffer = (uint8_t *)rec_buffer16;
3940 aom_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0,
3941 NULL, 0, bw, bh);
3943 #else
3944 aom_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0, NULL,
3945 0, bw, bh);
3946 #endif // CONFIG_HIGHBITDEPTH
3948 if (blk_row + txb_h > max_blocks_high || blk_col + txb_w > max_blocks_wide) {
3949 int idx, idy;
3950 int blocks_height = AOMMIN(txb_h, max_blocks_high - blk_row);
3951 int blocks_width = AOMMIN(txb_w, max_blocks_wide - blk_col);
3952 tmp = 0;
3953 for (idy = 0; idy < blocks_height; ++idy) {
3954 for (idx = 0; idx < blocks_width; ++idx) {
3955 const int16_t *d =
3956 diff + ((idy * diff_stride + idx) << tx_size_wide_log2[0]);
3957 tmp += sum_squares_2d(d, diff_stride, 0);
3960 } else {
3961 tmp = sum_squares_2d(diff, diff_stride, tx_size);
3964 #if CONFIG_HIGHBITDEPTH
3965 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
3966 tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2);
3967 #endif // CONFIG_HIGHBITDEPTH
3968 rd_stats->sse += tmp * 16;
3969 const int eob = p->eobs[block];
3971 av1_inverse_transform_block(xd, dqcoeff, tx_type, tx_size, rec_buffer,
3972 MAX_TX_SIZE, eob);
3973 if (eob > 0) {
3974 if (txb_w + blk_col > max_blocks_wide ||
3975 txb_h + blk_row > max_blocks_high) {
3976 int idx, idy;
3977 unsigned int this_dist;
3978 int blocks_height = AOMMIN(txb_h, max_blocks_high - blk_row);
3979 int blocks_width = AOMMIN(txb_w, max_blocks_wide - blk_col);
3980 tmp = 0;
3981 for (idy = 0; idy < blocks_height; ++idy) {
3982 for (idx = 0; idx < blocks_width; ++idx) {
3983 uint8_t *const s =
3984 src + ((idy * src_stride + idx) << tx_size_wide_log2[0]);
3985 uint8_t *const r =
3986 rec_buffer + ((idy * MAX_TX_SIZE + idx) << tx_size_wide_log2[0]);
3987 cpi->fn_ptr[0].vf(s, src_stride, r, MAX_TX_SIZE, &this_dist);
3988 tmp += this_dist;
3991 } else {
3992 uint32_t this_dist;
3993 cpi->fn_ptr[txm_bsize].vf(src, src_stride, rec_buffer, MAX_TX_SIZE,
3994 &this_dist);
3995 tmp = this_dist;
3998 rd_stats->dist += tmp * 16;
3999 txb_coeff_cost =
4000 av1_cost_coeffs(cpi, x, plane, block, tx_size, scan_order, a, l, 0);
4001 rd_stats->rate += txb_coeff_cost;
4002 rd_stats->skip &= (eob == 0);
4004 #if CONFIG_RD_DEBUG
4005 av1_update_txb_coeff_cost(rd_stats, plane, tx_size, blk_row, blk_col,
4006 txb_coeff_cost);
4007 #endif // CONFIG_RD_DEBUG
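// select_tx_block(): recursive RD search over the transform partition tree.
// The block is first costed at the current tx_size (replaced by the all-zero
// coding whenever that is cheaper); then, if splitting is allowed, the four
// sub-transforms are searched recursively. The cheaper of the two options is
// kept and recorded in mbmi->inter_tx_size, the entropy/TXFM contexts and
// x->blk_skip.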
4010 static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
4011 int blk_col, int plane, int block, int block32,
4012 TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize,
4013 ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl,
4014 TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
4015 RD_STATS *rd_stats, int64_t ref_best_rd,
4016 int *is_cost_valid, RD_STATS *rd_stats_stack) {
4017 MACROBLOCKD *const xd = &x->e_mbd;
4018 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
4019 struct macroblock_plane *const p = &x->plane[plane];
4020 struct macroblockd_plane *const pd = &xd->plane[plane];
4021 const int tx_row = blk_row >> (1 - pd->subsampling_y);
4022 const int tx_col = blk_col >> (1 - pd->subsampling_x);
4023 TX_SIZE(*const inter_tx_size)
4024 [MAX_MIB_SIZE] =
4025 (TX_SIZE(*)[MAX_MIB_SIZE]) & mbmi->inter_tx_size[tx_row][tx_col];
4026 const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
4027 const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
4028 const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
4029 int64_t this_rd = INT64_MAX;
4030 ENTROPY_CONTEXT *pta = ta + blk_col;
4031 ENTROPY_CONTEXT *ptl = tl + blk_row;
4032 int coeff_ctx, i;
4033 int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
4034 mbmi->sb_type, tx_size);
4035 int64_t sum_rd = INT64_MAX;
4036 int tmp_eob = 0;
4037 int zero_blk_rate;
4038 RD_STATS sum_rd_stats;
4039 const int tx_size_ctx = txsize_sqr_map[tx_size];
4041 av1_init_rd_stats(&sum_rd_stats);
4043 assert(tx_size < TX_SIZES_ALL);
4045 if (ref_best_rd < 0) {
4046 *is_cost_valid = 0;
4047 return;
4050 coeff_ctx = get_entropy_context(tx_size, pta, ptl);
4052 av1_init_rd_stats(rd_stats);
4054 if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
4056 zero_blk_rate = x->token_costs[tx_size_ctx][pd->plane_type][1][0][0]
4057 [coeff_ctx][EOB_TOKEN];
4059 if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) {
4060 inter_tx_size[0][0] = tx_size;
4062 if (tx_size == TX_32X32 && mbmi->tx_type != DCT_DCT &&
4063 rd_stats_stack[block32].rate != INT_MAX) {
4064 *rd_stats = rd_stats_stack[block32];
4065 p->eobs[block] = !rd_stats->skip;
4066 x->blk_skip[plane][blk_row * bw + blk_col] = rd_stats->skip;
4067 } else {
4068 av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
4069 plane_bsize, pta, ptl, rd_stats);
4070 if (tx_size == TX_32X32) {
4071 rd_stats_stack[block32] = *rd_stats;
4075 if ((RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist) >=
4076 RDCOST(x->rdmult, x->rddiv, zero_blk_rate, rd_stats->sse) ||
4077 rd_stats->skip == 1) &&
4078 !xd->lossless[mbmi->segment_id]) {
4079 #if CONFIG_RD_DEBUG
4080 av1_update_txb_coeff_cost(rd_stats, plane, tx_size, blk_row, blk_col,
4081 zero_blk_rate - rd_stats->rate);
4082 #endif // CONFIG_RD_DEBUG
4083 rd_stats->rate = zero_blk_rate;
4084 rd_stats->dist = rd_stats->sse;
4085 rd_stats->skip = 1;
4086 x->blk_skip[plane][blk_row * bw + blk_col] = 1;
4087 p->eobs[block] = 0;
4088 } else {
4089 x->blk_skip[plane][blk_row * bw + blk_col] = 0;
4090 rd_stats->skip = 0;
4093 if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
4094 rd_stats->rate +=
4095 av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0);
4096 this_rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist);
4097 tmp_eob = p->eobs[block];
4100 if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) {
4101 const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
4102 const int bsl = tx_size_wide_unit[sub_txs];
4103 int sub_step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
4104 RD_STATS this_rd_stats;
4105 int this_cost_valid = 1;
4106 int64_t tmp_rd = 0;
4108 sum_rd_stats.rate =
4109 av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 1);
4111 assert(tx_size < TX_SIZES_ALL);
4113 for (i = 0; i < 4 && this_cost_valid; ++i) {
4114 int offsetr = blk_row + (i >> 1) * bsl;
4115 int offsetc = blk_col + (i & 0x01) * bsl;
4117 if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
4119 select_tx_block(cpi, x, offsetr, offsetc, plane, block, block32, sub_txs,
4120 depth + 1, plane_bsize, ta, tl, tx_above, tx_left,
4121 &this_rd_stats, ref_best_rd - tmp_rd, &this_cost_valid,
4122 rd_stats_stack);
4124 av1_merge_rd_stats(&sum_rd_stats, &this_rd_stats);
4126 tmp_rd =
4127 RDCOST(x->rdmult, x->rddiv, sum_rd_stats.rate, sum_rd_stats.dist);
4128 if (this_rd < tmp_rd) break;
4129 block += sub_step;
4131 if (this_cost_valid) sum_rd = tmp_rd;
4134 if (this_rd < sum_rd) {
4135 int idx, idy;
4136 for (i = 0; i < tx_size_wide_unit[tx_size]; ++i) pta[i] = !(tmp_eob == 0);
4137 for (i = 0; i < tx_size_high_unit[tx_size]; ++i) ptl[i] = !(tmp_eob == 0);
4138 txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
4139 tx_size);
4140 inter_tx_size[0][0] = tx_size;
4141 for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy)
4142 for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx)
4143 inter_tx_size[idy][idx] = tx_size;
4144 mbmi->tx_size = tx_size;
4145 if (this_rd == INT64_MAX) *is_cost_valid = 0;
4146 x->blk_skip[plane][blk_row * bw + blk_col] = rd_stats->skip;
4147 } else {
4148 *rd_stats = sum_rd_stats;
4149 if (sum_rd == INT64_MAX) *is_cost_valid = 0;
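// inter_block_yrd(): tiles the luma plane with the largest rectangular
// transform size and runs select_tx_block() on each tile, accumulating the
// per-tile RD stats; the result is invalidated if the total RD cost ends up
// above ref_best_rd.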
4153 static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
4154 RD_STATS *rd_stats, BLOCK_SIZE bsize,
4155 int64_t ref_best_rd, RD_STATS *rd_stats_stack) {
4156 MACROBLOCKD *const xd = &x->e_mbd;
4157 int is_cost_valid = 1;
4158 int64_t this_rd = 0;
4160 if (ref_best_rd < 0) is_cost_valid = 0;
4162 av1_init_rd_stats(rd_stats);
4164 if (is_cost_valid) {
4165 const struct macroblockd_plane *const pd = &xd->plane[0];
4166 const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
4167 const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
4168 const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
4169 const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize];
4170 const int bh = tx_size_high_unit[max_tx_size];
4171 const int bw = tx_size_wide_unit[max_tx_size];
4172 int idx, idy;
4173 int block = 0;
4174 int block32 = 0;
4175 int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
4176 ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
4177 ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
4178 TXFM_CONTEXT tx_above[MAX_MIB_SIZE * 2];
4179 TXFM_CONTEXT tx_left[MAX_MIB_SIZE * 2];
4181 RD_STATS pn_rd_stats;
4182 av1_init_rd_stats(&pn_rd_stats);
4184 av1_get_entropy_contexts(bsize, 0, pd, ctxa, ctxl);
4185 memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
4186 memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
4188 for (idy = 0; idy < mi_height; idy += bh) {
4189 for (idx = 0; idx < mi_width; idx += bw) {
4190 select_tx_block(cpi, x, idy, idx, 0, block, block32, max_tx_size,
4191 mi_height != mi_width, plane_bsize, ctxa, ctxl,
4192 tx_above, tx_left, &pn_rd_stats, ref_best_rd - this_rd,
4193 &is_cost_valid, rd_stats_stack);
4194 av1_merge_rd_stats(rd_stats, &pn_rd_stats);
4195 this_rd += AOMMIN(
4196 RDCOST(x->rdmult, x->rddiv, pn_rd_stats.rate, pn_rd_stats.dist),
4197 RDCOST(x->rdmult, x->rddiv, 0, pn_rd_stats.sse));
4198 block += step;
4199 ++block32;
4204 this_rd = AOMMIN(RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist),
4205 RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse));
4206 if (this_rd > ref_best_rd) is_cost_valid = 0;
4208 if (!is_cost_valid) {
4209 // reset cost value
4210 av1_invalid_rd_stats(rd_stats);
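// select_tx_size_fix_type(): with tx_type held fixed, run the recursive
// transform size search above, add the tx_type signaling cost where it
// applies, fold in the skip-flag cost, and return the resulting RD cost.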
4214 static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x,
4215 RD_STATS *rd_stats, BLOCK_SIZE bsize,
4216 int64_t ref_best_rd, TX_TYPE tx_type,
4217 RD_STATS *rd_stats_stack) {
4218 const AV1_COMMON *const cm = &cpi->common;
4219 MACROBLOCKD *const xd = &x->e_mbd;
4220 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
4221 const int is_inter = is_inter_block(mbmi);
4222 aom_prob skip_prob = av1_get_skip_prob(cm, xd);
4223 int s0 = av1_cost_bit(skip_prob, 0);
4224 int s1 = av1_cost_bit(skip_prob, 1);
4225 int64_t rd;
4226 int row, col;
4227 const int max_blocks_high = max_block_high(xd, bsize, 0);
4228 const int max_blocks_wide = max_block_wide(xd, bsize, 0);
4230 mbmi->tx_type = tx_type;
4231 inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, rd_stats_stack);
4232 mbmi->min_tx_size = get_min_tx_size(mbmi->inter_tx_size[0][0]);
4234 if (rd_stats->rate == INT_MAX) return INT64_MAX;
4236 for (row = 0; row < max_blocks_high / 2; ++row)
4237 for (col = 0; col < max_blocks_wide / 2; ++col)
4238 mbmi->min_tx_size = AOMMIN(
4239 mbmi->min_tx_size, get_min_tx_size(mbmi->inter_tx_size[row][col]));
4241 #if CONFIG_EXT_TX
4242 if (get_ext_tx_types(mbmi->min_tx_size, bsize, is_inter,
4243 cm->reduced_tx_set_used) > 1 &&
4244 !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
4245 const int ext_tx_set = get_ext_tx_set(mbmi->min_tx_size, bsize, is_inter,
4246 cm->reduced_tx_set_used);
4247 if (is_inter) {
4248 if (ext_tx_set > 0)
4249 rd_stats->rate +=
4250 cpi->inter_tx_type_costs[ext_tx_set]
4251 [txsize_sqr_map[mbmi->min_tx_size]]
4252 [mbmi->tx_type];
4253 } else {
4254 if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
4255 rd_stats->rate +=
4256 cpi->intra_tx_type_costs[ext_tx_set][mbmi->min_tx_size][mbmi->mode]
4257 [mbmi->tx_type];
4260 #else // CONFIG_EXT_TX
4261 if (mbmi->min_tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id])
4262 rd_stats->rate +=
4263 cpi->inter_tx_type_costs[mbmi->min_tx_size][mbmi->tx_type];
4264 #endif // CONFIG_EXT_TX
4266 if (rd_stats->skip)
4267 rd = RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse);
4268 else
4269 rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate + s0, rd_stats->dist);
4271 if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
4272 !(rd_stats->skip))
4273 rd = AOMMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse));
4275 return rd;
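// select_tx_type_yrd(): loops over all transform types, pruning candidates
// via prune_tx_types(), the default-tx-type speed feature and the lossless
// DCT_DCT restriction, and keeps the tx type together with the per-block
// transform sizes and skip flags that give the lowest RD cost. TX_32X32
// results are cached in rd_stats_stack and reused across tx types,
// presumably because 32x32 transforms are restricted to DCT_DCT.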
4278 static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
4279 RD_STATS *rd_stats, BLOCK_SIZE bsize,
4280 int64_t ref_best_rd) {
4281 const AV1_COMMON *cm = &cpi->common;
4282 const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
4283 MACROBLOCKD *const xd = &x->e_mbd;
4284 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
4285 int64_t rd = INT64_MAX;
4286 int64_t best_rd = INT64_MAX;
4287 TX_TYPE tx_type, best_tx_type = DCT_DCT;
4288 const int is_inter = is_inter_block(mbmi);
4289 TX_SIZE best_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE];
4290 TX_SIZE best_tx = max_txsize_lookup[bsize];
4291 TX_SIZE best_min_tx_size = TX_SIZES_ALL;
4292 uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 8];
4293 const int n4 = bsize_to_num_blk(bsize);
4294 int idx, idy;
4295 int prune = 0;
4296 const int count32 =
4297 1 << (2 * (cm->mib_size_log2 - mi_width_log2_lookup[BLOCK_32X32]));
4298 #if CONFIG_EXT_PARTITION
4299 RD_STATS rd_stats_stack[16];
4300 #else
4301 RD_STATS rd_stats_stack[4];
4302 #endif // CONFIG_EXT_PARTITION
4303 #if CONFIG_EXT_TX
4304 const int ext_tx_set =
4305 get_ext_tx_set(max_tx_size, bsize, is_inter, cm->reduced_tx_set_used);
4306 #endif // CONFIG_EXT_TX
4308 if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
4309 #if CONFIG_EXT_TX
4310 prune = prune_tx_types(cpi, bsize, x, xd, ext_tx_set);
4311 #else
4312 prune = prune_tx_types(cpi, bsize, x, xd, 0);
4313 #endif // CONFIG_EXT_TX
4315 av1_invalid_rd_stats(rd_stats);
4317 for (idx = 0; idx < count32; ++idx)
4318 av1_invalid_rd_stats(&rd_stats_stack[idx]);
4320 for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
4321 RD_STATS this_rd_stats;
4322 av1_init_rd_stats(&this_rd_stats);
4323 #if CONFIG_EXT_TX
4324 if (is_inter) {
4325 if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue;
4326 if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
4327 if (!do_tx_type_search(tx_type, prune)) continue;
4329 } else {
4330 if (!ALLOW_INTRA_EXT_TX && bsize >= BLOCK_8X8) {
4331 if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) continue;
4333 if (!ext_tx_used_intra[ext_tx_set][tx_type]) continue;
4335 #else // CONFIG_EXT_TX
4336 if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
4337 !do_tx_type_search(tx_type, prune))
4338 continue;
4339 #endif // CONFIG_EXT_TX
4340 if (is_inter && x->use_default_inter_tx_type &&
4341 tx_type != get_default_tx_type(0, xd, 0, max_tx_size))
4342 continue;
4344 if (xd->lossless[mbmi->segment_id])
4345 if (tx_type != DCT_DCT) continue;
4347 rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd,
4348 tx_type, rd_stats_stack);
4350 if (rd < best_rd) {
4351 best_rd = rd;
4352 *rd_stats = this_rd_stats;
4353 best_tx_type = mbmi->tx_type;
4354 best_tx = mbmi->tx_size;
4355 best_min_tx_size = mbmi->min_tx_size;
4356 memcpy(best_blk_skip, x->blk_skip[0], sizeof(best_blk_skip[0]) * n4);
4357 for (idy = 0; idy < xd->n8_h; ++idy)
4358 for (idx = 0; idx < xd->n8_w; ++idx)
4359 best_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx];
4363 mbmi->tx_type = best_tx_type;
4364 for (idy = 0; idy < xd->n8_h; ++idy)
4365 for (idx = 0; idx < xd->n8_w; ++idx)
4366 mbmi->inter_tx_size[idy][idx] = best_tx_size[idy][idx];
4367 mbmi->tx_size = best_tx;
4368 mbmi->min_tx_size = best_min_tx_size;
4369 memcpy(x->blk_skip[0], best_blk_skip, sizeof(best_blk_skip[0]) * n4);
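// tx_block_rd(): walks the transform partition previously selected in
// mbmi->inter_tx_size and accumulates rate/distortion for each leaf
// transform block; for chroma planes the luma transform size is mapped
// through uv_txsize_lookup.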
4372 static void tx_block_rd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
4373 int blk_col, int plane, int block, TX_SIZE tx_size,
4374 BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *above_ctx,
4375 ENTROPY_CONTEXT *left_ctx, RD_STATS *rd_stats) {
4376 MACROBLOCKD *const xd = &x->e_mbd;
4377 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
4378 struct macroblock_plane *const p = &x->plane[plane];
4379 struct macroblockd_plane *const pd = &xd->plane[plane];
4380 BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
4381 const int tx_row = blk_row >> (1 - pd->subsampling_y);
4382 const int tx_col = blk_col >> (1 - pd->subsampling_x);
4383 TX_SIZE plane_tx_size;
4384 const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
4385 const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
4387 assert(tx_size < TX_SIZES_ALL);
4389 if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
4391 plane_tx_size =
4392 plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
4393 : mbmi->inter_tx_size[tx_row][tx_col];
4395 if (tx_size == plane_tx_size) {
4396 int i;
4397 ENTROPY_CONTEXT *ta = above_ctx + blk_col;
4398 ENTROPY_CONTEXT *tl = left_ctx + blk_row;
4399 av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
4400 plane_bsize, ta, tl, rd_stats);
4402 for (i = 0; i < tx_size_wide_unit[tx_size]; ++i)
4403 ta[i] = !(p->eobs[block] == 0);
4404 for (i = 0; i < tx_size_high_unit[tx_size]; ++i)
4405 tl[i] = !(p->eobs[block] == 0);
4406 } else {
4407 const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
4408 const int bsl = tx_size_wide_unit[sub_txs];
4409 int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
4410 int i;
4412 assert(bsl > 0);
4414 for (i = 0; i < 4; ++i) {
4415 int offsetr = blk_row + (i >> 1) * bsl;
4416 int offsetc = blk_col + (i & 0x01) * bsl;
4418 if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
4420 tx_block_rd(cpi, x, offsetr, offsetc, plane, block, sub_txs, plane_bsize,
4421 above_ctx, left_ctx, rd_stats);
4422 block += step;
4427 // Return value 0: early termination triggered, no valid rd cost available;
4428 // 1: rd cost values are valid.
4429 static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
4430 RD_STATS *rd_stats, BLOCK_SIZE bsize,
4431 int64_t ref_best_rd) {
4432 MACROBLOCKD *const xd = &x->e_mbd;
4433 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
4434 int plane;
4435 int is_cost_valid = 1;
4436 int64_t this_rd;
4438 if (ref_best_rd < 0) is_cost_valid = 0;
4440 av1_init_rd_stats(rd_stats);
4442 #if CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
4443 if (x->skip_chroma_rd) return is_cost_valid;
4444 bsize = scale_chroma_bsize(mbmi->sb_type, xd->plane[1].subsampling_x,
4445 xd->plane[1].subsampling_y);
4446 #endif // CONFIG_CB4X4 && !CONFIG_CHROMA_2X2
4448 #if CONFIG_EXT_TX && CONFIG_RECT_TX
4449 if (is_rect_tx(mbmi->tx_size)) {
4450 return super_block_uvrd(cpi, x, rd_stats, bsize, ref_best_rd);
4452 #endif // CONFIG_EXT_TX && CONFIG_RECT_TX
4454 if (is_inter_block(mbmi) && is_cost_valid) {
4455 for (plane = 1; plane < MAX_MB_PLANE; ++plane)
4456 av1_subtract_plane(x, bsize, plane);
4459 for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
4460 const struct macroblockd_plane *const pd = &xd->plane[plane];
4461 const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
4462 const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
4463 const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
4464 const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize];
4465 const int bh = tx_size_high_unit[max_tx_size];
4466 const int bw = tx_size_wide_unit[max_tx_size];
4467 int idx, idy;
4468 int block = 0;
4469 const int step = bh * bw;
4470 ENTROPY_CONTEXT ta[2 * MAX_MIB_SIZE];
4471 ENTROPY_CONTEXT tl[2 * MAX_MIB_SIZE];
4472 RD_STATS pn_rd_stats;
4473 av1_init_rd_stats(&pn_rd_stats);
4475 av1_get_entropy_contexts(bsize, 0, pd, ta, tl);
4477 for (idy = 0; idy < mi_height; idy += bh) {
4478 for (idx = 0; idx < mi_width; idx += bw) {
4479 tx_block_rd(cpi, x, idy, idx, plane, block, max_tx_size, plane_bsize,
4480 ta, tl, &pn_rd_stats);
4481 block += step;
4485 if (pn_rd_stats.rate == INT_MAX) {
4486 is_cost_valid = 0;
4487 break;
4490 av1_merge_rd_stats(rd_stats, &pn_rd_stats);
4492 this_rd =
4493 AOMMIN(RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist),
4494 RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse));
4496 if (this_rd > ref_best_rd) {
4497 is_cost_valid = 0;
4498 break;
4502 if (!is_cost_valid) {
4503 // reset cost value
4504 av1_invalid_rd_stats(rd_stats);
4507 return is_cost_valid;
4509 #endif // CONFIG_VAR_TX
4511 #if CONFIG_PALETTE
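// Chroma palette search: count the distinct colors in the U and V source
// blocks, then for each candidate palette size n (from min(colors,
// PALETTE_MAX_SIZE) down to 2) run 2-D k-means over the (U, V) pairs, build
// the color index map, and evaluate the full RD cost including the palette
// colors and the per-pixel index signaling.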
4512 static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
4513 int dc_mode_cost,
4514 uint8_t *best_palette_color_map,
4515 MB_MODE_INFO *const best_mbmi,
4516 int64_t *best_rd, int *rate,
4517 int *rate_tokenonly, int64_t *distortion,
4518 int *skippable) {
4519 MACROBLOCKD *const xd = &x->e_mbd;
4520 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
4521 assert(!is_inter_block(mbmi));
4522 PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
4523 const BLOCK_SIZE bsize = mbmi->sb_type;
4524 int this_rate;
4525 int64_t this_rd;
4526 int colors_u, colors_v, colors;
4527 const int src_stride = x->plane[1].src.stride;
4528 const uint8_t *const src_u = x->plane[1].src.buf;
4529 const uint8_t *const src_v = x->plane[2].src.buf;
4530 uint8_t *const color_map = xd->plane[1].color_index_map;
4531 RD_STATS tokenonly_rd_stats;
4532 int plane_block_width, plane_block_height, rows, cols;
4533 av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
4534 &plane_block_height, &rows, &cols);
4535 if (rows * cols > PALETTE_MAX_BLOCK_SIZE) return;
4537 mbmi->uv_mode = DC_PRED;
4538 #if CONFIG_FILTER_INTRA
4539 mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
4540 #endif // CONFIG_FILTER_INTRA
4542 #if CONFIG_HIGHBITDEPTH
4543 if (cpi->common.use_highbitdepth) {
4544 colors_u = av1_count_colors_highbd(src_u, src_stride, rows, cols,
4545 cpi->common.bit_depth);
4546 colors_v = av1_count_colors_highbd(src_v, src_stride, rows, cols,
4547 cpi->common.bit_depth);
4548 } else {
4549 #endif // CONFIG_HIGHBITDEPTH
4550 colors_u = av1_count_colors(src_u, src_stride, rows, cols);
4551 colors_v = av1_count_colors(src_v, src_stride, rows, cols);
4552 #if CONFIG_HIGHBITDEPTH
4554 #endif // CONFIG_HIGHBITDEPTH
4556 #if CONFIG_PALETTE_DELTA_ENCODING
4557 const MODE_INFO *above_mi = xd->above_mi;
4558 const MODE_INFO *left_mi = xd->left_mi;
4559 uint16_t color_cache[2 * PALETTE_MAX_SIZE];
4560 const int n_cache = av1_get_palette_cache(above_mi, left_mi, 1, color_cache);
4561 #endif // CONFIG_PALETTE_DELTA_ENCODING
4563 colors = colors_u > colors_v ? colors_u : colors_v;
4564 if (colors > 1 && colors <= 64) {
4565 int r, c, n, i, j;
4566 const int max_itr = 50;
4567 uint8_t color_order[PALETTE_MAX_SIZE];
4568 float lb_u, ub_u, val_u;
4569 float lb_v, ub_v, val_v;
4570 float *const data = x->palette_buffer->kmeans_data_buf;
4571 float centroids[2 * PALETTE_MAX_SIZE];
4573 #if CONFIG_HIGHBITDEPTH
4574 uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
4575 uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
4576 if (cpi->common.use_highbitdepth) {
4577 lb_u = src_u16[0];
4578 ub_u = src_u16[0];
4579 lb_v = src_v16[0];
4580 ub_v = src_v16[0];
4581 } else {
4582 #endif // CONFIG_HIGHBITDEPTH
4583 lb_u = src_u[0];
4584 ub_u = src_u[0];
4585 lb_v = src_v[0];
4586 ub_v = src_v[0];
4587 #if CONFIG_HIGHBITDEPTH
4589 #endif // CONFIG_HIGHBITDEPTH
4591 for (r = 0; r < rows; ++r) {
4592 for (c = 0; c < cols; ++c) {
4593 #if CONFIG_HIGHBITDEPTH
4594 if (cpi->common.use_highbitdepth) {
4595 val_u = src_u16[r * src_stride + c];
4596 val_v = src_v16[r * src_stride + c];
4597 data[(r * cols + c) * 2] = val_u;
4598 data[(r * cols + c) * 2 + 1] = val_v;
4599 } else {
4600 #endif // CONFIG_HIGHBITDEPTH
4601 val_u = src_u[r * src_stride + c];
4602 val_v = src_v[r * src_stride + c];
4603 data[(r * cols + c) * 2] = val_u;
4604 data[(r * cols + c) * 2 + 1] = val_v;
4605 #if CONFIG_HIGHBITDEPTH
4607 #endif // CONFIG_HIGHBITDEPTH
4608 if (val_u < lb_u)
4609 lb_u = val_u;
4610 else if (val_u > ub_u)
4611 ub_u = val_u;
4612 if (val_v < lb_v)
4613 lb_v = val_v;
4614 else if (val_v > ub_v)
4615 ub_v = val_v;
4619 for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2;
4620 --n) {
4621 for (i = 0; i < n; ++i) {
4622 centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2;
4623 centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;
4625 av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr);
4626 #if CONFIG_PALETTE_DELTA_ENCODING
4627 optimize_palette_colors(color_cache, n_cache, n, 2, centroids);
4628 // Sort the U channel colors in ascending order.
4629 for (i = 0; i < 2 * (n - 1); i += 2) {
4630 int min_idx = i;
4631 float min_val = centroids[i];
4632 for (j = i + 2; j < 2 * n; j += 2)
4633 if (centroids[j] < min_val) min_val = centroids[j], min_idx = j;
4634 if (min_idx != i) {
4635 float temp_u = centroids[i], temp_v = centroids[i + 1];
4636 centroids[i] = centroids[min_idx];
4637 centroids[i + 1] = centroids[min_idx + 1];
4638 centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v;
4641 av1_calc_indices(data, centroids, color_map, rows * cols, n, 2);
4642 #endif // CONFIG_PALETTE_DELTA_ENCODING
4643 extend_palette_color_map(color_map, cols, rows, plane_block_width,
4644 plane_block_height);
4645 pmi->palette_size[1] = n;
4646 for (i = 1; i < 3; ++i) {
4647 for (j = 0; j < n; ++j) {
4648 #if CONFIG_HIGHBITDEPTH
4649 if (cpi->common.use_highbitdepth)
4650 pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
4651 (int)centroids[j * 2 + i - 1], cpi->common.bit_depth);
4652 else
4653 #endif // CONFIG_HIGHBITDEPTH
4654 pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
4655 clip_pixel((int)centroids[j * 2 + i - 1]);
4659 super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
4660 if (tokenonly_rd_stats.rate == INT_MAX) continue;
4661 this_rate =
4662 tokenonly_rd_stats.rate + dc_mode_cost +
4663 cpi->palette_uv_size_cost[bsize - BLOCK_8X8][n - PALETTE_MIN_SIZE] +
4664 write_uniform_cost(n, color_map[0]) +
4665 av1_cost_bit(
4666 av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 1);
4667 this_rate += av1_palette_color_cost_uv(pmi,
4668 #if CONFIG_PALETTE_DELTA_ENCODING
4669 color_cache, n_cache,
4670 #endif // CONFIG_PALETTE_DELTA_ENCODING
4671 cpi->common.bit_depth);
4672 for (i = 0; i < rows; ++i) {
4673 for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
4674 int color_idx;
4675 const int color_ctx = av1_get_palette_color_index_context(
4676 color_map, plane_block_width, i, j, n, color_order, &color_idx);
4677 assert(color_idx >= 0 && color_idx < n);
4678 this_rate += cpi->palette_uv_color_cost[n - PALETTE_MIN_SIZE]
4679 [color_ctx][color_idx];
4683 this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist);
4684 if (this_rd < *best_rd) {
4685 *best_rd = this_rd;
4686 *best_mbmi = *mbmi;
4687 memcpy(best_palette_color_map, color_map,
4688 plane_block_width * plane_block_height *
4689 sizeof(best_palette_color_map[0]));
4690 *rate = this_rate;
4691 *distortion = tokenonly_rd_stats.dist;
4692 *rate_tokenonly = tokenonly_rd_stats.rate;
4693 *skippable = tokenonly_rd_stats.skip;
4697 if (best_mbmi->palette_mode_info.palette_size[1] > 0) {
4698 memcpy(color_map, best_palette_color_map,
4699 rows * cols * sizeof(best_palette_color_map[0]));
4702 #endif // CONFIG_PALETTE
4704 #if CONFIG_FILTER_INTRA
4705 // Return 1 if a filter intra mode is selected; return 0 otherwise.
4706 static int rd_pick_filter_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
4707 int *rate, int *rate_tokenonly,
4708 int64_t *distortion, int *skippable,
4709 BLOCK_SIZE bsize, int64_t *best_rd) {
4710 MACROBLOCKD *const xd = &x->e_mbd;
4711 MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
4712 int filter_intra_selected_flag = 0;
4713 int this_rate;
4714 int64_t this_rd;
4715 FILTER_INTRA_MODE mode;
4716 FILTER_INTRA_MODE_INFO filter_intra_mode_info;
4717 RD_STATS tokenonly_rd_stats;
4719 av1_zero(filter_intra_mode_info);
4720 mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 1;
4721 mbmi->uv_mode = DC_PRED;
4722 #if CONFIG_PALETTE
4723 mbmi->palette_mode_info.palette_size[1] = 0;
4724 #endif // CONFIG_PALETTE
4726 for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
4727 mbmi->filter_intra_mode_info.filter_intra_mode[1] = mode;
4728 if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd))
4729 continue;
4731 this_rate = tokenonly_rd_stats.rate +
4732 av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 1) +
4733 cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode] +
4734 write_uniform_cost(FILTER_INTRA_MODES, mode);
4735 this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist);
4736 if (this_rd < *best_rd) {
4737 *best_rd = this_rd;
4738 *rate = this_rate;
4739 *rate_tokenonly = tokenonly_rd_stats.rate;
4740 *distortion = tokenonly_rd_stats.dist;
4741 *skippable = tokenonly_rd_stats.skip;
4742 filter_intra_mode_info = mbmi->filter_intra_mode_info;
4743 filter_intra_selected_flag = 1;
4747 if (filter_intra_selected_flag) {
4748 mbmi->uv_mode = DC_PRED;
4749 mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
4750 filter_intra_mode_info.use_filter_intra_mode[1];
4751 mbmi->filter_intra_mode_info.filter_intra_mode[1] =
4752 filter_intra_mode_info.filter_intra_mode[1];
4753 return 1;
4754 } else {
4755 return 0;
4758 #endif // CONFIG_FILTER_INTRA
4760 #if CONFIG_EXT_INTRA
4761 // Run RD calculation with given chroma intra prediction angle, and return
4762 // the RD cost. Update the best mode info if the RD cost is the best so far.
4763 static int64_t pick_intra_angle_routine_sbuv(
4764 const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
4765 int rate_overhead, int64_t best_rd_in, int *rate, RD_STATS *rd_stats,
4766 int *best_angle_delta, int64_t *best_rd) {
4767 MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
4768 assert(!is_inter_block(mbmi));
4769 int this_rate;
4770 int64_t this_rd;
4771 RD_STATS tokenonly_rd_stats;
4773 if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in))
4774 return INT64_MAX;
4775 this_rate = tokenonly_rd_stats.rate + rate_overhead;
4776 this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist);
4777 if (this_rd < *best_rd) {
4778 *best_rd = this_rd;
4779 *best_angle_delta = mbmi->angle_delta[1];
4780 *rate = this_rate;
4781 rd_stats->rate = tokenonly_rd_stats.rate;
4782 rd_stats->dist = tokenonly_rd_stats.dist;
4783 rd_stats->skip = tokenonly_rd_stats.skip;
4785 return this_rd;
4788 // With given chroma directional intra prediction mode, pick the best angle
4789 // delta. Return true if an RD cost that is smaller than the input one is found.
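// The search mirrors the luma version above: even angle deltas are tried
// first (both signs), then odd deltas are refined only where an even
// neighbor was competitive.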
4790 static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
4791 BLOCK_SIZE bsize, int rate_overhead,
4792 int64_t best_rd, int *rate,
4793 RD_STATS *rd_stats) {
4794 MACROBLOCKD *const xd = &x->e_mbd;
4795 MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
4796 assert(!is_inter_block(mbmi));
4797 int i, angle_delta, best_angle_delta = 0;
4798 int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
4800 rd_stats->rate = INT_MAX;
4801 rd_stats->skip = 0;
4802 rd_stats->dist = INT64_MAX;
4803 for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
4805 for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
4806 for (i = 0; i < 2; ++i) {
4807 best_rd_in = (best_rd == INT64_MAX)
4808 ? INT64_MAX
4809 : (best_rd + (best_rd >> ((angle_delta == 0) ? 3 : 5)));
4810 mbmi->angle_delta[1] = (1 - 2 * i) * angle_delta;
4811 this_rd = pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead,
4812 best_rd_in, rate, rd_stats,
4813 &best_angle_delta, &best_rd);
4814 rd_cost[2 * angle_delta + i] = this_rd;
4815 if (angle_delta == 0) {
4816 if (this_rd == INT64_MAX) return 0;
4817 rd_cost[1] = this_rd;
4818 break;
4823 assert(best_rd != INT64_MAX);
4824 for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
4825 int64_t rd_thresh;
4826 for (i = 0; i < 2; ++i) {
4827 int skip_search = 0;
4828 rd_thresh = best_rd + (best_rd >> 5);
4829 if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
4830 rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
4831 skip_search = 1;
4832 if (!skip_search) {
4833 mbmi->angle_delta[1] = (1 - 2 * i) * angle_delta;
4834 pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, best_rd,
4835 rate, rd_stats, &best_angle_delta,
4836 &best_rd);
4841 mbmi->angle_delta[1] = best_angle_delta;
4842 return rd_stats->rate != INT_MAX;
4844 #endif // CONFIG_EXT_INTRA
4846 static void init_sbuv_mode(MB_MODE_INFO *const mbmi) {
4847 mbmi->uv_mode = DC_PRED;
4848 #if CONFIG_PALETTE
4849 mbmi->palette_mode_info.palette_size[1] = 0;
4850 #endif // CONFIG_PALETTE
4851 #if CONFIG_FILTER_INTRA
4852 mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
4853 #endif // CONFIG_FILTER_INTRA
4856 static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
4857 int *rate, int *rate_tokenonly,
4858 int64_t *distortion, int *skippable,
4859 BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
4860 MACROBLOCKD *xd = &x->e_mbd;
4861 MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
4862 assert(!is_inter_block(mbmi));
4863 MB_MODE_INFO best_mbmi = *mbmi;
4864 PREDICTION_MODE mode;
4865 int64_t best_rd = INT64_MAX, this_rd;
4866 int this_rate;
4867 RD_STATS tokenonly_rd_stats;
4868 #if CONFIG_PVQ
4869 od_rollback_buffer buf;
4870 od_encode_checkpoint(&x->daala_enc, &buf);
4871 #endif // CONFIG_PVQ
4872 #if CONFIG_PALETTE
4873 PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
4874 uint8_t *best_palette_color_map = NULL;
4875 #endif // CONFIG_PALETTE
4877 for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
4878 #if CONFIG_EXT_INTRA
4879 const int is_directional_mode =
4880 av1_is_directional_mode(mode, mbmi->sb_type);
4881 #endif // CONFIG_EXT_INTRA
4882 if (!(cpi->sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] &
4883 (1 << mode)))
4884 continue;
4886 mbmi->uv_mode = mode;
4887 #if CONFIG_EXT_INTRA
4888 mbmi->angle_delta[1] = 0;
4889 if (is_directional_mode) {
4890 const int rate_overhead = cpi->intra_uv_mode_cost[mbmi->mode][mode] +
4891 write_uniform_cost(2 * MAX_ANGLE_DELTA + 1, 0);
4892 if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd,
4893 &this_rate, &tokenonly_rd_stats))
4894 continue;
4895 } else {
4896 #endif // CONFIG_EXT_INTRA
4897 if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd)) {
4898 #if CONFIG_PVQ
4899 od_encode_rollback(&x->daala_enc, &buf);
4900 #endif // CONFIG_PVQ
4901 continue;
4903 #if CONFIG_EXT_INTRA
4905 #endif // CONFIG_EXT_INTRA
4906 this_rate =
4907 tokenonly_rd_stats.rate + cpi->intra_uv_mode_cost[mbmi->mode][mode];
4909 #if CONFIG_EXT_INTRA
4910 if (is_directional_mode) {
4911 this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
4912 MAX_ANGLE_DELTA + mbmi->angle_delta[1]);
4914 #endif // CONFIG_EXT_INTRA
4915 #if CONFIG_FILTER_INTRA
4916 if (mbmi->sb_type >= BLOCK_8X8 && mode == DC_PRED)
4917 this_rate += av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 0);
4918 #endif // CONFIG_FILTER_INTRA
4919 #if CONFIG_PALETTE
4920 if (cpi->common.allow_screen_content_tools && mbmi->sb_type >= BLOCK_8X8 &&
4921 mode == DC_PRED)
4922 this_rate += av1_cost_bit(
4923 av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0], 0);
4924 #endif // CONFIG_PALETTE
4926 #if CONFIG_PVQ
4927 od_encode_rollback(&x->daala_enc, &buf);
4928 #endif // CONFIG_PVQ
4929 this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, tokenonly_rd_stats.dist);
4931 if (this_rd < best_rd) {
4932 best_mbmi = *mbmi;
4933 best_rd = this_rd;
4934 *rate = this_rate;
4935 *rate_tokenonly = tokenonly_rd_stats.rate;
4936 *distortion = tokenonly_rd_stats.dist;
4937 *skippable = tokenonly_rd_stats.skip;
4941 #if CONFIG_PALETTE
4942 if (cpi->common.allow_screen_content_tools && mbmi->sb_type >= BLOCK_8X8) {
4943 best_palette_color_map = x->palette_buffer->best_palette_color_map;
4944 rd_pick_palette_intra_sbuv(cpi, x,
4945 cpi->intra_uv_mode_cost[mbmi->mode][DC_PRED],
4946 best_palette_color_map, &best_mbmi, &best_rd,
4947 rate, rate_tokenonly, distortion, skippable);
4949 #endif // CONFIG_PALETTE
4951 #if CONFIG_FILTER_INTRA
4952 if (mbmi->sb_type >= BLOCK_8X8) {
4953 if (rd_pick_filter_intra_sbuv(cpi, x, rate, rate_tokenonly, distortion,
4954 skippable, bsize, &best_rd))
4955 best_mbmi = *mbmi;
4957 #endif // CONFIG_FILTER_INTRA
4959 *mbmi = best_mbmi;
4960 // Make sure we actually chose a mode
4961 assert(best_rd < INT64_MAX);
4962 return best_rd;
4965 static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
4966 PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize,
4967 TX_SIZE max_tx_size, int *rate_uv,
4968 int *rate_uv_tokenonly, int64_t *dist_uv,
4969 int *skip_uv, PREDICTION_MODE *mode_uv) {
4970 // Use an estimated rd for uv_intra based on DC_PRED if the
4971 // appropriate speed flag is set.
4972 (void)ctx;
4973 init_sbuv_mode(&x->e_mbd.mi[0]->mbmi);
4974 #if CONFIG_CB4X4
4975 #if CONFIG_CHROMA_2X2
4976 rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
4977 bsize, max_tx_size);
4978 #else
4979 if (x->skip_chroma_rd) {
4980 *rate_uv = 0;
4981 *rate_uv_tokenonly = 0;
4982 *dist_uv = 0;
4983 *skip_uv = 1;
4984 *mode_uv = DC_PRED;
4985 return;
4987 BLOCK_SIZE bs = scale_chroma_bsize(bsize, x->e_mbd.plane[1].subsampling_x,
4988 x->e_mbd.plane[1].subsampling_y);
4989 rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
4990 bs, max_tx_size);
4991 #endif // CONFIG_CHROMA_2X2
4992 #else
4993 rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
4994 bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, max_tx_size);
4995 #endif // CONFIG_CB4X4
4996 *mode_uv = x->e_mbd.mi[0]->mbmi.uv_mode;
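// cost_mv_ref() below mirrors the cascade of binary decisions used to signal
// an inter mode: a NEWMV/non-NEWMV bit first, then (unless every candidate MV
// is zero) a ZEROMV bit, and finally a NEARESTMV/NEARMV bit whose context can
// be overridden by the SKIP_NEAREST/NEAR flags packed into mode_context.
// For example, a NEARMV cost accumulates roughly
//   newmv_mode_cost[ctx][1] + zeromv_mode_cost[ctx'][1] + refmv_mode_cost[ctx''][1].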
4999 static int cost_mv_ref(const AV1_COMP *const cpi, PREDICTION_MODE mode,
5000 int16_t mode_context) {
5001 #if CONFIG_EXT_INTER
5002 if (is_inter_compound_mode(mode)) {
5003 return cpi
5004 ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)];
5006 #endif
5008 int mode_cost = 0;
5009 int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
5010 int16_t is_all_zero_mv = mode_context & (1 << ALL_ZERO_FLAG_OFFSET);
5012 assert(is_inter_mode(mode));
5014 if (mode == NEWMV) {
5015 mode_cost = cpi->newmv_mode_cost[mode_ctx][0];
5016 return mode_cost;
5017 } else {
5018 mode_cost = cpi->newmv_mode_cost[mode_ctx][1];
5019 mode_ctx = (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
5021 if (is_all_zero_mv) return mode_cost;
5023 if (mode == ZEROMV) {
5024 mode_cost += cpi->zeromv_mode_cost[mode_ctx][0];
5025 return mode_cost;
5026 } else {
5027 mode_cost += cpi->zeromv_mode_cost[mode_ctx][1];
5028 mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
5030 if (mode_context & (1 << SKIP_NEARESTMV_OFFSET)) mode_ctx = 6;
5031 if (mode_context & (1 << SKIP_NEARMV_OFFSET)) mode_ctx = 7;
5032 if (mode_context & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) mode_ctx = 8;
5034 mode_cost += cpi->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
5035 return mode_cost;
5040 #if CONFIG_EXT_INTER && (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT)
5041 static int get_interinter_compound_type_bits(BLOCK_SIZE bsize,
5042 COMPOUND_TYPE comp_type) {
5043 (void)bsize;
5044 switch (comp_type) {
5045 case COMPOUND_AVERAGE: return 0;
5046 #if CONFIG_WEDGE
5047 case COMPOUND_WEDGE: return get_interinter_wedge_bits(bsize);
5048 #endif // CONFIG_WEDGE
5049 #if CONFIG_COMPOUND_SEGMENT
5050 case COMPOUND_SEG: return 1;
5051 #endif // CONFIG_COMPOUND_SEGMENT
5052 default: assert(0); return 0;
5055 #endif // CONFIG_EXT_INTER && (CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT)
5057 typedef struct {
5058 int eobs;
5059 int brate;
5060 int byrate;
5061 int64_t bdist;
5062 int64_t bsse;
5063 int64_t brdcost;
5064 int_mv mvs[2];
5065 int_mv pred_mv[2];
5066 #if CONFIG_EXT_INTER
5067 int_mv ref_mv[2];
5068 #endif // CONFIG_EXT_INTER
5070 #if CONFIG_CHROMA_2X2
5071 ENTROPY_CONTEXT ta[4];
5072 ENTROPY_CONTEXT tl[4];
5073 #else
5074 ENTROPY_CONTEXT ta[2];
5075 ENTROPY_CONTEXT tl[2];
5076 #endif // CONFIG_CHROMA_2X2
5077 } SEG_RDSTAT;
5079 typedef struct {
5080 int_mv *ref_mv[2];
5081 int_mv mvp;
5083 int64_t segment_rd;
5084 int r;
5085 int64_t d;
5086 int64_t sse;
5087 int segment_yrate;
5088 PREDICTION_MODE modes[4];
5089 #if CONFIG_EXT_INTER
5090 SEG_RDSTAT rdstat[4][INTER_MODES + INTER_COMPOUND_MODES];
5091 #else
5092 SEG_RDSTAT rdstat[4][INTER_MODES];
5093 #endif // CONFIG_EXT_INTER
5094 int mvthresh;
5095 } BEST_SEG_INFO;
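// Motion vectors are stored in 1/8-pel units, so mv_check_bounds() shifts by
// 3 to get the full-pel position before comparing against the full-pel search
// limits; a nonzero return means the vector is out of range.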
5097 static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) {
5098 return (mv->row >> 3) < mv_limits->row_min ||
5099 (mv->row >> 3) > mv_limits->row_max ||
5100 (mv->col >> 3) < mv_limits->col_min ||
5101 (mv->col >> 3) > mv_limits->col_max;
5104 // Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way to encode zero motion.
5105 // TODO(aconverse): Find out if this is still productive then clean up or remove
5106 static int check_best_zero_mv(
5107 const AV1_COMP *const cpi, const int16_t mode_context[TOTAL_REFS_PER_FRAME],
5108 #if CONFIG_EXT_INTER
5109 const int16_t compound_mode_context[TOTAL_REFS_PER_FRAME],
5110 #endif // CONFIG_EXT_INTER
5111 int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME], int this_mode,
5112 const MV_REFERENCE_FRAME ref_frames[2], const BLOCK_SIZE bsize, int block,
5113 int mi_row, int mi_col) {
5114 int_mv zeromv[2];
5115 int comp_pred_mode = ref_frames[1] > INTRA_FRAME;
5116 int cur_frm;
5117 (void)mi_row;
5118 (void)mi_col;
5119 for (cur_frm = 0; cur_frm < 1 + comp_pred_mode; cur_frm++) {
5120 #if CONFIG_GLOBAL_MOTION
5121 if (this_mode == ZEROMV
5122 #if CONFIG_EXT_INTER
5123 || this_mode == ZERO_ZEROMV
5124 #endif // CONFIG_EXT_INTER
5126 zeromv[cur_frm].as_int =
5127 gm_get_motion_vector(&cpi->common.global_motion[ref_frames[cur_frm]],
5128 cpi->common.allow_high_precision_mv, bsize,
5129 mi_col, mi_row, block)
5130 .as_int;
5131 else
5132 #endif // CONFIG_GLOBAL_MOTION
5133 zeromv[cur_frm].as_int = 0;
5135 #if !CONFIG_EXT_INTER
5136 assert(ref_frames[1] != INTRA_FRAME); // Just sanity check
5137 #endif // !CONFIG_EXT_INTER
5138 if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
5139 frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int &&
5140 (ref_frames[1] <= INTRA_FRAME ||
5141 frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int)) {
5142 int16_t rfc =
5143 av1_mode_context_analyzer(mode_context, ref_frames, bsize, block);
5144 int c1 = cost_mv_ref(cpi, NEARMV, rfc);
5145 int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
5146 int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
5148 if (this_mode == NEARMV) {
5149 if (c1 > c3) return 0;
5150 } else if (this_mode == NEARESTMV) {
5151 if (c2 > c3) return 0;
5152 } else {
5153 assert(this_mode == ZEROMV);
5154 if (ref_frames[1] <= INTRA_FRAME) {
5155 if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) ||
5156 (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0))
5157 return 0;
5158 } else {
5159 if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0 &&
5160 frame_mv[NEARESTMV][ref_frames[1]].as_int == 0) ||
5161 (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0 &&
5162 frame_mv[NEARMV][ref_frames[1]].as_int == 0))
5163 return 0;
5167 #if CONFIG_EXT_INTER
5168 else if ((this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV ||
5169 this_mode == ZERO_ZEROMV) &&
5170 frame_mv[this_mode][ref_frames[0]].as_int == zeromv[0].as_int &&
5171 frame_mv[this_mode][ref_frames[1]].as_int == zeromv[1].as_int) {
5172 int16_t rfc = compound_mode_context[ref_frames[0]];
5173 int c2 = cost_mv_ref(cpi, NEAREST_NEARESTMV, rfc);
5174 int c3 = cost_mv_ref(cpi, ZERO_ZEROMV, rfc);
5175 int c5 = cost_mv_ref(cpi, NEAR_NEARMV, rfc);
5177 if (this_mode == NEAREST_NEARESTMV) {
5178 if (c2 > c3) return 0;
5179 } else if (this_mode == NEAR_NEARMV) {
5180 if (c5 > c3) return 0;
5181 } else {
5182 assert(this_mode == ZERO_ZEROMV);
5183 if ((c3 >= c2 && frame_mv[NEAREST_NEARESTMV][ref_frames[0]].as_int == 0 &&
5184 frame_mv[NEAREST_NEARESTMV][ref_frames[1]].as_int == 0) ||
5185 (c3 >= c5 && frame_mv[NEAR_NEARMV][ref_frames[0]].as_int == 0 &&
5186 frame_mv[NEAR_NEARMV][ref_frames[1]].as_int == 0))
5187 return 0;
5190 #endif // CONFIG_EXT_INTER
5191 return 1;
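// joint_motion_search() alternately refines the two MVs of a compound mode,
// up to four passes. On each pass the other reference's prediction is rebuilt
// at its current MV and passed in as second_pred, so both the refining search
// and the fractional step minimise the error of the combined (averaged or
// masked) prediction rather than of either reference alone; the loop stops as
// soon as a pass fails to improve on last_besterr[] for that reference.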
5194 static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
5195 BLOCK_SIZE bsize, int_mv *frame_mv, int mi_row,
5196 int mi_col,
5197 #if CONFIG_EXT_INTER
5198 int_mv *ref_mv_sub8x8[2], const uint8_t *mask,
5199 int mask_stride,
5200 #endif // CONFIG_EXT_INTER
5201 int *rate_mv, const int block) {
5202 const AV1_COMMON *const cm = &cpi->common;
5203 const int pw = block_size_wide[bsize];
5204 const int ph = block_size_high[bsize];
5205 MACROBLOCKD *xd = &x->e_mbd;
5206 MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
5207 // This function should only ever be called for compound modes
5208 assert(has_second_ref(mbmi));
5209 const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] };
5210 int_mv ref_mv[2];
5211 int ite, ref;
5212 #if CONFIG_DUAL_FILTER
5213 InterpFilter interp_filter[4] = {
5214 mbmi->interp_filter[0], mbmi->interp_filter[1], mbmi->interp_filter[2],
5215 mbmi->interp_filter[3],
5217 #else
5218 const InterpFilter interp_filter = mbmi->interp_filter;
5219 #endif // CONFIG_DUAL_FILTER
5220 struct scale_factors sf;
5221 struct macroblockd_plane *const pd = &xd->plane[0];
5222 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
5223 // ic and ir are the 4x4 coordinates of the sub8x8 at index "block"
5224 const int ic = block & 1;
5225 const int ir = (block - ic) >> 1;
5226 const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic;
5227 const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
5228 #if CONFIG_GLOBAL_MOTION
5229 int is_global[2];
5230 for (ref = 0; ref < 2; ++ref) {
5231 WarpedMotionParams *const wm =
5232 &xd->global_motion[xd->mi[0]->mbmi.ref_frame[ref]];
5233 is_global[ref] = is_global_mv_block(xd->mi[0], block, wm->wmtype);
5235 #endif // CONFIG_GLOBAL_MOTION
5236 #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
5238 // Do joint motion search in compound mode to get more accurate mv.
5239 struct buf_2d backup_yv12[2][MAX_MB_PLANE];
5240 int last_besterr[2] = { INT_MAX, INT_MAX };
5241 const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
5242 av1_get_scaled_ref_frame(cpi, refs[0]),
5243 av1_get_scaled_ref_frame(cpi, refs[1])
5246 // Prediction buffer from second frame.
5247 #if CONFIG_HIGHBITDEPTH
5248 DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
5249 uint8_t *second_pred;
5250 #else
5251 DECLARE_ALIGNED(16, uint8_t, second_pred[MAX_SB_SQUARE]);
5252 #endif // CONFIG_HIGHBITDEPTH
5254 #if CONFIG_EXT_INTER && CONFIG_CB4X4
5255 (void)ref_mv_sub8x8;
5256 #endif // CONFIG_EXT_INTER && CONFIG_CB4X4
5258 for (ref = 0; ref < 2; ++ref) {
5259 #if CONFIG_EXT_INTER && !CONFIG_CB4X4
5260 if (bsize < BLOCK_8X8 && ref_mv_sub8x8 != NULL)
5261 ref_mv[ref].as_int = ref_mv_sub8x8[ref]->as_int;
5262 else
5263 #endif // CONFIG_EXT_INTER && !CONFIG_CB4X4
5264 ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0];
5266 if (scaled_ref_frame[ref]) {
5267 int i;
5268 // Swap out the reference frame for a version that's been scaled to
5269 // match the resolution of the current frame, allowing the existing
5270 // motion search code to be used without additional modifications.
5271 for (i = 0; i < MAX_MB_PLANE; i++)
5272 backup_yv12[ref][i] = xd->plane[i].pre[ref];
5273 av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
5274 NULL);
5278 // Since we have scaled the reference frames to match the size of the current
5279 // frame we must use a unit scaling factor during mode selection.
5280 #if CONFIG_HIGHBITDEPTH
5281 av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width,
5282 cm->height, cm->use_highbitdepth);
5283 #else
5284 av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width,
5285 cm->height);
5286 #endif // CONFIG_HIGHBITDEPTH
5288 // Allow the joint search to iterate several times over each reference frame,
5289 // and break out of the search loop if it cannot find a better mv.
5290 for (ite = 0; ite < 4; ite++) {
5291 struct buf_2d ref_yv12[2];
5292 int bestsme = INT_MAX;
5293 int sadpb = x->sadperbit16;
5294 MV *const best_mv = &x->best_mv.as_mv;
5295 int search_range = 3;
5297 MvLimits tmp_mv_limits = x->mv_limits;
5298 int id = ite % 2; // Even iterations search in the first reference frame,
5299 // odd iterations search in the second. The predictor
5300 // found for the 'other' reference frame is factored in.
5301 const int plane = 0;
5302 ConvolveParams conv_params = get_conv_params(0, plane);
5303 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
5304 WarpTypesAllowed warp_types;
5305 #if CONFIG_GLOBAL_MOTION
5306 warp_types.global_warp_allowed = is_global[!id];
5307 #endif // CONFIG_GLOBAL_MOTION
5308 #if CONFIG_WARPED_MOTION
5309 warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
5310 #endif // CONFIG_WARPED_MOTION
5311 #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
5313 // Initialized here because of compiler problem in Visual Studio.
5314 ref_yv12[0] = xd->plane[plane].pre[0];
5315 ref_yv12[1] = xd->plane[plane].pre[1];
5317 #if CONFIG_DUAL_FILTER
5318 // reload the filter types
5319 interp_filter[0] =
5320 (id == 0) ? mbmi->interp_filter[2] : mbmi->interp_filter[0];
5321 interp_filter[1] =
5322 (id == 0) ? mbmi->interp_filter[3] : mbmi->interp_filter[1];
5323 #endif // CONFIG_DUAL_FILTER
5325 // Get the prediction block from the 'other' reference frame.
5326 #if CONFIG_HIGHBITDEPTH
5327 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
5328 second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
5329 av1_highbd_build_inter_predictor(
5330 ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw,
5331 &frame_mv[refs[!id]].as_mv, &sf, pw, ph, 0, interp_filter,
5332 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
5333 &warp_types, p_col, p_row,
5334 #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
5335 plane, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd);
5336 } else {
5337 second_pred = (uint8_t *)second_pred_alloc_16;
5338 #endif // CONFIG_HIGHBITDEPTH
5339 av1_build_inter_predictor(
5340 ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw,
5341 &frame_mv[refs[!id]].as_mv, &sf, pw, ph, &conv_params, interp_filter,
5342 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
5343 &warp_types, p_col, p_row, plane, !id,
5344 #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
5345 MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd);
5346 #if CONFIG_HIGHBITDEPTH
5348 #endif // CONFIG_HIGHBITDEPTH
5350 // Do compound motion search on the current reference frame.
5351 if (id) xd->plane[plane].pre[0] = ref_yv12[id];
5352 av1_set_mv_search_range(&x->mv_limits, &ref_mv[id].as_mv);
5354 // Use the mv result from the single mode as mv predictor.
5355 *best_mv = frame_mv[refs[id]].as_mv;
5357 best_mv->col >>= 3;
5358 best_mv->row >>= 3;
5360 av1_set_mvcost(x, refs[id], id, mbmi->ref_mv_idx);
5362 // Small-range full-pixel motion search.
5363 bestsme =
5364 av1_refining_search_8p_c(x, sadpb, search_range, &cpi->fn_ptr[bsize],
5365 #if CONFIG_EXT_INTER
5366 mask, mask_stride, id,
5367 #endif
5368 &ref_mv[id].as_mv, second_pred);
5369 if (bestsme < INT_MAX) {
5370 #if CONFIG_EXT_INTER
5371 if (mask)
5372 bestsme = av1_get_mvpred_mask_var(x, best_mv, &ref_mv[id].as_mv,
5373 second_pred, mask, mask_stride, id,
5374 &cpi->fn_ptr[bsize], 1);
5375 else
5376 #endif
5377 bestsme = av1_get_mvpred_av_var(x, best_mv, &ref_mv[id].as_mv,
5378 second_pred, &cpi->fn_ptr[bsize], 1);
5381 x->mv_limits = tmp_mv_limits;
5383 if (bestsme < INT_MAX) {
5384 int dis; /* TODO: use dis in distortion calculation later. */
5385 unsigned int sse;
5386 if (cpi->sf.use_upsampled_references) {
5387 // Use up-sampled reference frames.
5388 struct buf_2d backup_pred = pd->pre[0];
5389 const YV12_BUFFER_CONFIG *upsampled_ref =
5390 get_upsampled_ref(cpi, refs[id]);
5392 // Set pred for Y plane
5393 setup_pred_plane(&pd->pre[0], bsize, upsampled_ref->y_buffer,
5394 upsampled_ref->y_crop_width,
5395 upsampled_ref->y_crop_height, upsampled_ref->y_stride,
5396 (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x,
5397 pd->subsampling_y);
5399 // If bsize < BLOCK_8X8, adjust pred pointer for this block
5400 #if !CONFIG_CB4X4
5401 if (bsize < BLOCK_8X8)
5402 pd->pre[0].buf =
5403 &pd->pre[0].buf[(av1_raster_block_offset(BLOCK_8X8, block,
5404 pd->pre[0].stride))
5405 << 3];
5406 #endif // !CONFIG_CB4X4
5408 bestsme = cpi->find_fractional_mv_step(
5409 x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv,
5410 x->errorperbit, &cpi->fn_ptr[bsize], 0,
5411 cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost,
5412 &dis, &sse, second_pred,
5413 #if CONFIG_EXT_INTER
5414 mask, mask_stride, id,
5415 #endif
5416 pw, ph, 1);
5418 // Restore the reference frames.
5419 pd->pre[0] = backup_pred;
5420 } else {
5421 (void)block;
5422 bestsme = cpi->find_fractional_mv_step(
5423 x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv,
5424 x->errorperbit, &cpi->fn_ptr[bsize], 0,
5425 cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost,
5426 &dis, &sse, second_pred,
5427 #if CONFIG_EXT_INTER
5428 mask, mask_stride, id,
5429 #endif
5430 pw, ph, 0);
5434 // Restore the pointer to the first (possibly scaled) prediction buffer.
5435 if (id) xd->plane[plane].pre[0] = ref_yv12[0];
5437 if (bestsme < last_besterr[id]) {
5438 frame_mv[refs[id]].as_mv = *best_mv;
5439 last_besterr[id] = bestsme;
5440 } else {
5441 break;
5445 *rate_mv = 0;
5447 for (ref = 0; ref < 2; ++ref) {
5448 if (scaled_ref_frame[ref]) {
5449 // Restore the prediction frame pointers to their unscaled versions.
5450 int i;
5451 for (i = 0; i < MAX_MB_PLANE; i++)
5452 xd->plane[i].pre[ref] = backup_yv12[ref][i];
5454 av1_set_mvcost(x, refs[ref], ref, mbmi->ref_mv_idx);
5455 #if CONFIG_EXT_INTER && !CONFIG_CB4X4
5456 if (bsize >= BLOCK_8X8)
5457 #endif // CONFIG_EXT_INTER && !CONFIG_CB4X4
5458 *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
5459 &x->mbmi_ext->ref_mvs[refs[ref]][0].as_mv,
5460 x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
5461 #if CONFIG_EXT_INTER && !CONFIG_CB4X4
5462 else
5463 *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
5464 &ref_mv_sub8x8[ref]->as_mv, x->nmvjointcost,
5465 x->mvcost, MV_COST_WEIGHT);
5466 #endif // CONFIG_EXT_INTER && !CONFIG_CB4X4
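// estimate_ref_frame_costs() accumulates, per reference frame, the bit costs
// along the reference signalling trees: the intra/inter bit plus one
// av1_cost_bit() term per node on the path to that reference. When the
// segment pins the reference the costs are zeroed, and a tree the current
// reference mode cannot use gets a flat 512 placeholder (roughly the cost of
// a single even-odds bit) instead.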
5470 static void estimate_ref_frame_costs(const AV1_COMMON *cm,
5471 const MACROBLOCKD *xd, int segment_id,
5472 unsigned int *ref_costs_single,
5473 unsigned int *ref_costs_comp,
5474 aom_prob *comp_mode_p) {
5475 int seg_ref_active =
5476 segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
5477 if (seg_ref_active) {
5478 memset(ref_costs_single, 0,
5479 TOTAL_REFS_PER_FRAME * sizeof(*ref_costs_single));
5480 memset(ref_costs_comp, 0, TOTAL_REFS_PER_FRAME * sizeof(*ref_costs_comp));
5481 *comp_mode_p = 128;
5482 } else {
5483 aom_prob intra_inter_p = av1_get_intra_inter_prob(cm, xd);
5484 aom_prob comp_inter_p = 128;
5486 if (cm->reference_mode == REFERENCE_MODE_SELECT) {
5487 comp_inter_p = av1_get_reference_mode_prob(cm, xd);
5488 *comp_mode_p = comp_inter_p;
5489 } else {
5490 *comp_mode_p = 128;
5493 ref_costs_single[INTRA_FRAME] = av1_cost_bit(intra_inter_p, 0);
5495 if (cm->reference_mode != COMPOUND_REFERENCE) {
5496 aom_prob ref_single_p1 = av1_get_pred_prob_single_ref_p1(cm, xd);
5497 aom_prob ref_single_p2 = av1_get_pred_prob_single_ref_p2(cm, xd);
5498 #if CONFIG_EXT_REFS
5499 aom_prob ref_single_p3 = av1_get_pred_prob_single_ref_p3(cm, xd);
5500 aom_prob ref_single_p4 = av1_get_pred_prob_single_ref_p4(cm, xd);
5501 aom_prob ref_single_p5 = av1_get_pred_prob_single_ref_p5(cm, xd);
5502 #endif // CONFIG_EXT_REFS
5504 unsigned int base_cost = av1_cost_bit(intra_inter_p, 1);
5506 ref_costs_single[LAST_FRAME] =
5507 #if CONFIG_EXT_REFS
5508 ref_costs_single[LAST2_FRAME] = ref_costs_single[LAST3_FRAME] =
5509 ref_costs_single[BWDREF_FRAME] =
5510 #endif // CONFIG_EXT_REFS
5511 ref_costs_single[GOLDEN_FRAME] =
5512 ref_costs_single[ALTREF_FRAME] = base_cost;
5514 #if CONFIG_EXT_REFS
5515 ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p1, 0);
5516 ref_costs_single[LAST2_FRAME] += av1_cost_bit(ref_single_p1, 0);
5517 ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p1, 0);
5518 ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p1, 0);
5519 ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p1, 1);
5520 ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p1, 1);
5522 ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p3, 0);
5523 ref_costs_single[LAST2_FRAME] += av1_cost_bit(ref_single_p3, 0);
5524 ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p3, 1);
5525 ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p3, 1);
5527 ref_costs_single[BWDREF_FRAME] += av1_cost_bit(ref_single_p2, 0);
5528 ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p2, 1);
5530 ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p4, 0);
5531 ref_costs_single[LAST2_FRAME] += av1_cost_bit(ref_single_p4, 1);
5533 ref_costs_single[LAST3_FRAME] += av1_cost_bit(ref_single_p5, 0);
5534 ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p5, 1);
5535 #else
5536 ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p1, 0);
5537 ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p1, 1);
5538 ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p1, 1);
5540 ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p2, 0);
5541 ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p2, 1);
5542 #endif // CONFIG_EXT_REFS
5543 } else {
5544 ref_costs_single[LAST_FRAME] = 512;
5545 #if CONFIG_EXT_REFS
5546 ref_costs_single[LAST2_FRAME] = 512;
5547 ref_costs_single[LAST3_FRAME] = 512;
5548 ref_costs_single[BWDREF_FRAME] = 512;
5549 #endif // CONFIG_EXT_REFS
5550 ref_costs_single[GOLDEN_FRAME] = 512;
5551 ref_costs_single[ALTREF_FRAME] = 512;
5554 if (cm->reference_mode != SINGLE_REFERENCE) {
5555 aom_prob ref_comp_p = av1_get_pred_prob_comp_ref_p(cm, xd);
5556 #if CONFIG_EXT_REFS
5557 aom_prob ref_comp_p1 = av1_get_pred_prob_comp_ref_p1(cm, xd);
5558 aom_prob ref_comp_p2 = av1_get_pred_prob_comp_ref_p2(cm, xd);
5559 aom_prob bwdref_comp_p = av1_get_pred_prob_comp_bwdref_p(cm, xd);
5560 #endif // CONFIG_EXT_REFS
5562 unsigned int base_cost = av1_cost_bit(intra_inter_p, 1);
5564 ref_costs_comp[LAST_FRAME] =
5565 #if CONFIG_EXT_REFS
5566 ref_costs_comp[LAST2_FRAME] = ref_costs_comp[LAST3_FRAME] =
5567 #endif // CONFIG_EXT_REFS
5568 ref_costs_comp[GOLDEN_FRAME] = base_cost;
5570 #if CONFIG_EXT_REFS
5571 ref_costs_comp[BWDREF_FRAME] = ref_costs_comp[ALTREF_FRAME] = 0;
5572 #endif // CONFIG_EXT_REFS
5574 #if CONFIG_EXT_REFS
5575 ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0);
5576 ref_costs_comp[LAST2_FRAME] += av1_cost_bit(ref_comp_p, 0);
5577 ref_costs_comp[LAST3_FRAME] += av1_cost_bit(ref_comp_p, 1);
5578 ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1);
5580 ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p1, 1);
5581 ref_costs_comp[LAST2_FRAME] += av1_cost_bit(ref_comp_p1, 0);
5583 ref_costs_comp[LAST3_FRAME] += av1_cost_bit(ref_comp_p2, 0);
5584 ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p2, 1);
5586 // NOTE(zoeliu): BWDREF and ALTREF each add an extra cost by coding 1
5587 // more bit.
5588 ref_costs_comp[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p, 0);
5589 ref_costs_comp[ALTREF_FRAME] += av1_cost_bit(bwdref_comp_p, 1);
5590 #else
5591 ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0);
5592 ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1);
5593 #endif // CONFIG_EXT_REFS
5594 } else {
5595 ref_costs_comp[LAST_FRAME] = 512;
5596 #if CONFIG_EXT_REFS
5597 ref_costs_comp[LAST2_FRAME] = 512;
5598 ref_costs_comp[LAST3_FRAME] = 512;
5599 ref_costs_comp[BWDREF_FRAME] = 512;
5600 ref_costs_comp[ALTREF_FRAME] = 512;
5601 #endif // CONFIG_EXT_REFS
5602 ref_costs_comp[GOLDEN_FRAME] = 512;
5607 static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
5608 int mode_index,
5609 int64_t comp_pred_diff[REFERENCE_MODES],
5610 int skippable) {
5611 MACROBLOCKD *const xd = &x->e_mbd;
5613 // Take a snapshot of the coding context so it can be
5614 // restored if we decide to encode this way
5615 ctx->skip = x->skip;
5616 ctx->skippable = skippable;
5617 ctx->best_mode_index = mode_index;
5618 ctx->mic = *xd->mi[0];
5619 ctx->mbmi_ext = *x->mbmi_ext;
5620 ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
5621 ctx->comp_pred_diff = (int)comp_pred_diff[COMPOUND_REFERENCE];
5622 ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
5625 static void setup_buffer_inter(
5626 const AV1_COMP *const cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
5627 BLOCK_SIZE block_size, int mi_row, int mi_col,
5628 int_mv frame_nearest_mv[TOTAL_REFS_PER_FRAME],
5629 int_mv frame_near_mv[TOTAL_REFS_PER_FRAME],
5630 struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE]) {
5631 const AV1_COMMON *cm = &cpi->common;
5632 const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
5633 MACROBLOCKD *const xd = &x->e_mbd;
5634 MODE_INFO *const mi = xd->mi[0];
5635 int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
5636 const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
5637 MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
5639 assert(yv12 != NULL);
5641 // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
5642 // use the UV scaling factors.
5643 av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
5645 // Gets an initial list of candidate vectors from neighbours and orders them
5646 av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame],
5647 mbmi_ext->ref_mv_stack[ref_frame],
5648 #if CONFIG_EXT_INTER
5649 mbmi_ext->compound_mode_context,
5650 #endif // CONFIG_EXT_INTER
5651 candidates, mi_row, mi_col, NULL, NULL,
5652 mbmi_ext->mode_context);
5654 // Candidate refinement carried out at encoder and decoder
5655 av1_find_best_ref_mvs(cm->allow_high_precision_mv, candidates,
5656 &frame_nearest_mv[ref_frame],
5657 &frame_near_mv[ref_frame]);
5659 // Further refinement that is encode side only to test the top few candidates
5660 // in full and choose the best as the centre point for subsequent searches.
5661 // The current implementation doesn't support scaling.
5662 #if CONFIG_CB4X4
5663 av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
5664 block_size);
5665 #else
5666 if (!av1_is_scaled(sf) && block_size >= BLOCK_8X8)
5667 av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
5668 block_size);
5669 #endif // CONFIG_CB4X4
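// single_motion_search(): full-pel search followed by sub-pel refinement for
// a single reference. The initial step size is derived from the previous
// frame's MV magnitudes and the block size; with adaptive_motion_search the
// search may be skipped when this reference's predicted-MV SAD is far worse
// than another reference's. When up-sampled references are enabled the
// fractional step runs against the up-sampled buffer, and the second-best
// full-pel candidate is also given a chance.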
5672 static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
5673 BLOCK_SIZE bsize, int mi_row, int mi_col,
5674 #if CONFIG_EXT_INTER
5675 int ref_idx,
5676 #endif // CONFIG_EXT_INTER
5677 int *rate_mv) {
5678 MACROBLOCKD *xd = &x->e_mbd;
5679 const AV1_COMMON *cm = &cpi->common;
5680 MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
5681 struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
5682 int bestsme = INT_MAX;
5683 int step_param;
5684 int sadpb = x->sadperbit16;
5685 MV mvp_full;
5686 #if CONFIG_EXT_INTER
5687 int ref = mbmi->ref_frame[ref_idx];
5688 #else
5689 int ref = mbmi->ref_frame[0];
5690 int ref_idx = 0;
5691 #endif // CONFIG_EXT_INTER
5692 MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
5694 MvLimits tmp_mv_limits = x->mv_limits;
5695 int cost_list[5];
5697 const YV12_BUFFER_CONFIG *scaled_ref_frame =
5698 av1_get_scaled_ref_frame(cpi, ref);
5700 MV pred_mv[3];
5701 pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv;
5702 pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv;
5703 pred_mv[2] = x->pred_mv[ref];
5705 if (scaled_ref_frame) {
5706 int i;
5707 // Swap out the reference frame for a version that's been scaled to
5708 // match the resolution of the current frame, allowing the existing
5709 // motion search code to be used without additional modifications.
5710 for (i = 0; i < MAX_MB_PLANE; i++)
5711 backup_yv12[i] = xd->plane[i].pre[ref_idx];
5713 av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL);
5716 av1_set_mv_search_range(&x->mv_limits, &ref_mv);
5718 av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx);
5720 // Work out the size of the first step in the mv step search.
5721 // 0 here means the maximum-length first step; 1 means half that, etc.
5722 if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
5723 // Take a weighted average of the step_param based on the last frame's
5724 // max mv magnitude and the step_param based on the best ref mvs of the
5725 // current block for the given reference.
5726 step_param =
5727 (av1_init_search_range(x->max_mv_context[ref]) + cpi->mv_step_param) /
5729 } else {
5730 step_param = cpi->mv_step_param;
5733 if (cpi->sf.adaptive_motion_search && bsize < cm->sb_size) {
5734 int boffset =
5735 2 * (b_width_log2_lookup[cm->sb_size] -
5736 AOMMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
5737 step_param = AOMMAX(step_param, boffset);
5740 if (cpi->sf.adaptive_motion_search) {
5741 int bwl = b_width_log2_lookup[bsize];
5742 int bhl = b_height_log2_lookup[bsize];
5743 int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
5745 if (tlevel < 5) step_param += 2;
5747 // prev_mv_sad is not setup for dynamically scaled frames.
5748 if (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) {
5749 int i;
5750 for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
5751 if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
5752 x->pred_mv[ref].row = 0;
5753 x->pred_mv[ref].col = 0;
5754 x->best_mv.as_int = INVALID_MV;
5756 if (scaled_ref_frame) {
5757 int j;
5758 for (j = 0; j < MAX_MB_PLANE; ++j)
5759 xd->plane[j].pre[ref_idx] = backup_yv12[j];
5761 return;
5767 av1_set_mv_search_range(&x->mv_limits, &ref_mv);
5769 #if CONFIG_MOTION_VAR
5770 if (mbmi->motion_mode != SIMPLE_TRANSLATION)
5771 mvp_full = mbmi->mv[0].as_mv;
5772 else
5773 #endif // CONFIG_MOTION_VAR
5774 mvp_full = pred_mv[x->mv_best_ref_index[ref]];
5776 mvp_full.col >>= 3;
5777 mvp_full.row >>= 3;
5779 x->best_mv.as_int = x->second_best_mv.as_int = INVALID_MV;
5781 #if CONFIG_MOTION_VAR
5782 switch (mbmi->motion_mode) {
5783 case SIMPLE_TRANSLATION:
5784 #endif // CONFIG_MOTION_VAR
5785 bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
5786 sadpb, cond_cost_list(cpi, cost_list),
5787 &ref_mv, INT_MAX, 1);
5788 #if CONFIG_MOTION_VAR
5789 break;
5790 case OBMC_CAUSAL:
5791 bestsme = av1_obmc_full_pixel_diamond(
5792 cpi, x, &mvp_full, step_param, sadpb,
5793 MAX_MVSEARCH_STEPS - 1 - step_param, 1, &cpi->fn_ptr[bsize], &ref_mv,
5794 &(x->best_mv.as_mv), 0);
5795 break;
5796 default: assert(0 && "Invalid motion mode!\n");
5798 #endif // CONFIG_MOTION_VAR
5800 x->mv_limits = tmp_mv_limits;
5802 if (bestsme < INT_MAX) {
5803 int dis; /* TODO: use dis in distortion calculation later. */
5804 #if CONFIG_MOTION_VAR
5805 switch (mbmi->motion_mode) {
5806 case SIMPLE_TRANSLATION:
5807 #endif // CONFIG_MOTION_VAR
5808 if (cpi->sf.use_upsampled_references) {
5809 int best_mv_var;
5810 const int try_second = x->second_best_mv.as_int != INVALID_MV &&
5811 x->second_best_mv.as_int != x->best_mv.as_int;
5812 const int pw = block_size_wide[bsize];
5813 const int ph = block_size_high[bsize];
5814 // Use up-sampled reference frames.
5815 struct macroblockd_plane *const pd = &xd->plane[0];
5816 struct buf_2d backup_pred = pd->pre[ref_idx];
5817 const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
5819 // Set pred for Y plane
5820 setup_pred_plane(
5821 &pd->pre[ref_idx], bsize, upsampled_ref->y_buffer,
5822 upsampled_ref->y_crop_width, upsampled_ref->y_crop_height,
5823 upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3), NULL,
5824 pd->subsampling_x, pd->subsampling_y);
5826 best_mv_var = cpi->find_fractional_mv_step(
5827 x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
5828 &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
5829 cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
5830 x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL,
5831 #if CONFIG_EXT_INTER
5832 NULL, 0, 0,
5833 #endif
5834 pw, ph, 1);
5836 if (try_second) {
5837 const int minc =
5838 AOMMAX(x->mv_limits.col_min * 8, ref_mv.col - MV_MAX);
5839 const int maxc =
5840 AOMMIN(x->mv_limits.col_max * 8, ref_mv.col + MV_MAX);
5841 const int minr =
5842 AOMMAX(x->mv_limits.row_min * 8, ref_mv.row - MV_MAX);
5843 const int maxr =
5844 AOMMIN(x->mv_limits.row_max * 8, ref_mv.row + MV_MAX);
5845 int this_var;
5846 MV best_mv = x->best_mv.as_mv;
5848 x->best_mv = x->second_best_mv;
5849 if (x->best_mv.as_mv.row * 8 <= maxr &&
5850 x->best_mv.as_mv.row * 8 >= minr &&
5851 x->best_mv.as_mv.col * 8 <= maxc &&
5852 x->best_mv.as_mv.col * 8 >= minc) {
5853 this_var = cpi->find_fractional_mv_step(
5854 x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
5855 &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
5856 cpi->sf.mv.subpel_iters_per_step,
5857 cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost,
5858 &dis, &x->pred_sse[ref], NULL,
5859 #if CONFIG_EXT_INTER
5860 NULL, 0, 0,
5861 #endif
5862 pw, ph, 1);
5863 if (this_var < best_mv_var) best_mv = x->best_mv.as_mv;
5864 x->best_mv.as_mv = best_mv;
5868 // Restore the reference frames.
5869 pd->pre[ref_idx] = backup_pred;
5870 } else {
5871 cpi->find_fractional_mv_step(
5872 x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
5873 &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
5874 cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
5875 x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL,
5876 #if CONFIG_EXT_INTER
5877 NULL, 0, 0,
5878 #endif
5879 0, 0, 0);
5881 #if CONFIG_MOTION_VAR
5882 break;
5883 case OBMC_CAUSAL:
5884 av1_find_best_obmc_sub_pixel_tree_up(
5885 cpi, x, mi_row, mi_col, &x->best_mv.as_mv, &ref_mv,
5886 cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize],
5887 cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step,
5888 x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], 0,
5889 cpi->sf.use_upsampled_references);
5890 break;
5891 default: assert(0 && "Invalid motion mode!\n");
5893 #endif // CONFIG_MOTION_VAR
5895 *rate_mv = av1_mv_bit_cost(&x->best_mv.as_mv, &ref_mv, x->nmvjointcost,
5896 x->mvcost, MV_COST_WEIGHT);
5898 #if CONFIG_MOTION_VAR
5899 if (cpi->sf.adaptive_motion_search && mbmi->motion_mode == SIMPLE_TRANSLATION)
5900 #else
5901 if (cpi->sf.adaptive_motion_search)
5902 #endif // CONFIG_MOTION_VAR
5903 x->pred_mv[ref] = x->best_mv.as_mv;
5905 if (scaled_ref_frame) {
5906 int i;
5907 for (i = 0; i < MAX_MB_PLANE; i++)
5908 xd->plane[i].pre[ref_idx] = backup_yv12[i];
5912 static INLINE void restore_dst_buf(MACROBLOCKD *xd, BUFFER_SET dst) {
5913 int i;
5914 for (i = 0; i < MAX_MB_PLANE; i++) {
5915 xd->plane[i].dst.buf = dst.plane[i];
5916 xd->plane[i].dst.stride = dst.stride[i];
5920 #if CONFIG_EXT_INTER
5921 static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x,
5922 BLOCK_SIZE bsize, const MV *other_mv,
5923 int mi_row, int mi_col, const int block,
5924 int ref_idx, uint8_t *second_pred) {
5925 const AV1_COMMON *const cm = &cpi->common;
5926 const int pw = block_size_wide[bsize];
5927 const int ph = block_size_high[bsize];
5928 MACROBLOCKD *xd = &x->e_mbd;
5929 MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
5930 const int other_ref = mbmi->ref_frame[!ref_idx];
5931 #if CONFIG_DUAL_FILTER
5932 InterpFilter interp_filter[2] = {
5933 (ref_idx == 0) ? mbmi->interp_filter[2] : mbmi->interp_filter[0],
5934 (ref_idx == 0) ? mbmi->interp_filter[3] : mbmi->interp_filter[1]
5936 #else
5937 const InterpFilter interp_filter = mbmi->interp_filter;
5938 #endif // CONFIG_DUAL_FILTER
5939 struct scale_factors sf;
5940 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
5941 struct macroblockd_plane *const pd = &xd->plane[0];
5942 // ic and ir are the 4x4 coordinates of the sub8x8 at index "block"
5943 const int ic = block & 1;
5944 const int ir = (block - ic) >> 1;
5945 const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic;
5946 const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
5947 #if CONFIG_GLOBAL_MOTION
5948 WarpedMotionParams *const wm = &xd->global_motion[other_ref];
5949 int is_global = is_global_mv_block(xd->mi[0], block, wm->wmtype);
5950 #endif // CONFIG_GLOBAL_MOTION
5951 #else
5952 (void)block;
5953 #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
5955 // This function should only ever be called for compound modes
5956 assert(has_second_ref(mbmi));
5958 struct buf_2d backup_yv12[MAX_MB_PLANE];
5959 const YV12_BUFFER_CONFIG *const scaled_ref_frame =
5960 av1_get_scaled_ref_frame(cpi, other_ref);
5962 if (scaled_ref_frame) {
5963 int i;
5964 // Swap out the reference frame for a version that's been scaled to
5965 // match the resolution of the current frame, allowing the existing
5966 // motion search code to be used without additional modifications.
5967 for (i = 0; i < MAX_MB_PLANE; i++)
5968 backup_yv12[i] = xd->plane[i].pre[!ref_idx];
5969 av1_setup_pre_planes(xd, !ref_idx, scaled_ref_frame, mi_row, mi_col, NULL);
5972 // Since we have scaled the reference frames to match the size of the current
5973 // frame we must use a unit scaling factor during mode selection.
5974 #if CONFIG_HIGHBITDEPTH
5975 av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width,
5976 cm->height, cm->use_highbitdepth);
5977 #else
5978 av1_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width,
5979 cm->height);
5980 #endif // CONFIG_HIGHBITDEPTH
5982 struct buf_2d ref_yv12;
5984 const int plane = 0;
5985 ConvolveParams conv_params = get_conv_params(0, plane);
5986 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
5987 WarpTypesAllowed warp_types;
5988 #if CONFIG_GLOBAL_MOTION
5989 warp_types.global_warp_allowed = is_global;
5990 #endif // CONFIG_GLOBAL_MOTION
5991 #if CONFIG_WARPED_MOTION
5992 warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
5993 #endif // CONFIG_WARPED_MOTION
5994 #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
5996 // Initialized here because of compiler problem in Visual Studio.
5997 ref_yv12 = xd->plane[plane].pre[!ref_idx];
5999 // Get the prediction block from the 'other' reference frame.
6000 #if CONFIG_HIGHBITDEPTH
6001 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
6002 av1_highbd_build_inter_predictor(
6003 ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph,
6004 0, interp_filter,
6005 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
6006 &warp_types, p_col, p_row,
6007 #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
6008 plane, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd);
6009 } else {
6010 #endif // CONFIG_HIGHBITDEPTH
6011 av1_build_inter_predictor(
6012 ref_yv12.buf, ref_yv12.stride, second_pred, pw, other_mv, &sf, pw, ph,
6013 &conv_params, interp_filter,
6014 #if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
6015 &warp_types, p_col, p_row, plane, !ref_idx,
6016 #endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
6017 MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd);
6018 #if CONFIG_HIGHBITDEPTH
6020 #endif // CONFIG_HIGHBITDEPTH
6022 if (scaled_ref_frame) {
6023 // Restore the prediction frame pointers to their unscaled versions.
6024 int i;
6025 for (i = 0; i < MAX_MB_PLANE; i++)
6026 xd->plane[i].pre[!ref_idx] = backup_yv12[i];
6030 // Search for the best mv for one component of a compound,
6031 // given that the other component is fixed.
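// The fixed component arrives as second_pred (built by the caller); the
// search is a small-range (+/- 3 full-pel) refining search around the
// single-mode MV followed by a fractional-MV step, both scoring the compound
// (averaged or mask-blended) prediction against the source.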
6032 static void compound_single_motion_search(
6033 const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV *this_mv,
6034 int mi_row, int mi_col, const uint8_t *second_pred, const uint8_t *mask,
6035 int mask_stride, int *rate_mv, const int block, int ref_idx) {
6036 const int pw = block_size_wide[bsize];
6037 const int ph = block_size_high[bsize];
6038 MACROBLOCKD *xd = &x->e_mbd;
6039 MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
6040 const int ref = mbmi->ref_frame[ref_idx];
6041 int_mv ref_mv = x->mbmi_ext->ref_mvs[ref][0];
6042 struct macroblockd_plane *const pd = &xd->plane[0];
6044 struct buf_2d backup_yv12[MAX_MB_PLANE];
6045 const YV12_BUFFER_CONFIG *const scaled_ref_frame =
6046 av1_get_scaled_ref_frame(cpi, ref);
6048 // Check that this is either an interinter or an interintra block
6049 assert(has_second_ref(mbmi) ||
6050 (ref_idx == 0 && mbmi->ref_frame[1] == INTRA_FRAME));
6052 if (scaled_ref_frame) {
6053 int i;
6054 // Swap out the reference frame for a version that's been scaled to
6055 // match the resolution of the current frame, allowing the existing
6056 // motion search code to be used without additional modifications.
6057 for (i = 0; i < MAX_MB_PLANE; i++)
6058 backup_yv12[i] = xd->plane[i].pre[ref_idx];
6059 av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL);
6062 struct buf_2d orig_yv12;
6063 int bestsme = INT_MAX;
6064 int sadpb = x->sadperbit16;
6065 MV *const best_mv = &x->best_mv.as_mv;
6066 int search_range = 3;
6068 MvLimits tmp_mv_limits = x->mv_limits;
6070 // Initialized here because of compiler problem in Visual Studio.
6071 if (ref_idx) {
6072 orig_yv12 = pd->pre[0];
6073 pd->pre[0] = pd->pre[ref_idx];
6076 // Do compound motion search on the current reference frame.
6077 av1_set_mv_search_range(&x->mv_limits, &ref_mv.as_mv);
6079 // Use the mv result from the single mode as mv predictor.
6080 *best_mv = *this_mv;
6082 best_mv->col >>= 3;
6083 best_mv->row >>= 3;
6085 av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx);
6087 // Small-range full-pixel motion search.
6088 bestsme = av1_refining_search_8p_c(x, sadpb, search_range,
6089 &cpi->fn_ptr[bsize], mask, mask_stride,
6090 ref_idx, &ref_mv.as_mv, second_pred);
6091 if (bestsme < INT_MAX) {
6092 if (mask)
6093 bestsme =
6094 av1_get_mvpred_mask_var(x, best_mv, &ref_mv.as_mv, second_pred, mask,
6095 mask_stride, ref_idx, &cpi->fn_ptr[bsize], 1);
6096 else
6097 bestsme = av1_get_mvpred_av_var(x, best_mv, &ref_mv.as_mv, second_pred,
6098 &cpi->fn_ptr[bsize], 1);
6101 x->mv_limits = tmp_mv_limits;
6103 if (bestsme < INT_MAX) {
6104 int dis; /* TODO: use dis in distortion calculation later. */
6105 unsigned int sse;
6106 if (cpi->sf.use_upsampled_references) {
6107 // Use up-sampled reference frames.
6108 struct buf_2d backup_pred = pd->pre[0];
6109 const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
6111 // Set pred for Y plane
6112 setup_pred_plane(&pd->pre[0], bsize, upsampled_ref->y_buffer,
6113 upsampled_ref->y_crop_width,
6114 upsampled_ref->y_crop_height, upsampled_ref->y_stride,
6115 (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x,
6116 pd->subsampling_y);
6118 // If bsize < BLOCK_8X8, adjust pred pointer for this block
6119 #if !CONFIG_CB4X4
6120 if (bsize < BLOCK_8X8)
6121 pd->pre[0].buf =
6122 &pd->pre[0].buf[(av1_raster_block_offset(BLOCK_8X8, block,
6123 pd->pre[0].stride))
6124 << 3];
6125 #endif // !CONFIG_CB4X4
6127 bestsme = cpi->find_fractional_mv_step(
6128 x, &ref_mv.as_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
6129 &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL,
6130 x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask,
6131 mask_stride, ref_idx, pw, ph, 1);
6133 // Restore the reference frames.
6134 pd->pre[0] = backup_pred;
6135 } else {
6136 (void)block;
6137 bestsme = cpi->find_fractional_mv_step(
6138 x, &ref_mv.as_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
6139 &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL,
6140 x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask,
6141 mask_stride, ref_idx, pw, ph, 0);
6145 // Restore the pointer to the first (possibly scaled) prediction buffer.
6146 if (ref_idx) pd->pre[0] = orig_yv12;
6148 if (bestsme < INT_MAX) *this_mv = *best_mv;
6150 *rate_mv = 0;
6152 if (scaled_ref_frame) {
6153 // Restore the prediction frame pointers to their unscaled versions.
6154 int i;
6155 for (i = 0; i < MAX_MB_PLANE; i++)
6156 xd->plane[i].pre[ref_idx] = backup_yv12[i];
6159 av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx);
6160 *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, x->nmvjointcost,
6161 x->mvcost, MV_COST_WEIGHT);
6164 // Wrapper for compound_single_motion_search, for the common case
6165 // where the second prediction is also an inter mode.
6166 static void compound_single_motion_search_interinter(
6167 const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *frame_mv,
6168 int mi_row, int mi_col, const uint8_t *mask, int mask_stride, int *rate_mv,
6169 const int block, int ref_idx) {
6170 MACROBLOCKD *xd = &x->e_mbd;
6171 MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
6173 // This function should only ever be called for compound modes
6174 assert(has_second_ref(mbmi));
6176 // Prediction buffer from second frame.
6177 #if CONFIG_HIGHBITDEPTH
6178 DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
6179 uint8_t *second_pred;
6180 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
6181 second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
6182 else
6183 second_pred = (uint8_t *)second_pred_alloc_16;
6184 #else
6185 DECLARE_ALIGNED(16, uint8_t, second_pred[MAX_SB_SQUARE]);
6186 #endif // CONFIG_HIGHBITDEPTH
6188 MV *this_mv = &frame_mv[mbmi->ref_frame[ref_idx]].as_mv;
6189 const MV *other_mv = &frame_mv[mbmi->ref_frame[!ref_idx]].as_mv;
6191 build_second_inter_pred(cpi, x, bsize, other_mv, mi_row, mi_col, block,
6192 ref_idx, second_pred);
6194 compound_single_motion_search(cpi, x, bsize, this_mv, mi_row, mi_col,
6195 second_pred, mask, mask_stride, rate_mv, block,
6196 ref_idx);
6199 #if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
6200 static void do_masked_motion_search_indexed(
6201 const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
6202 const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE bsize,
6203 int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv, int which) {
6204 // NOTE: 'which' selects the reference(s) to search: 0 - ref 0 only, 1 - ref 1 only, 2 - both (joint search)
6205 MACROBLOCKD *xd = &x->e_mbd;
6206 MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
6207 BLOCK_SIZE sb_type = mbmi->sb_type;
6208 const uint8_t *mask;
6209 const int mask_stride = block_size_wide[bsize];
6211 mask = av1_get_compound_type_mask(comp_data, sb_type);
6213 int_mv frame_mv[TOTAL_REFS_PER_FRAME];
6214 MV_REFERENCE_FRAME rf[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] };
6215 assert(bsize >= BLOCK_8X8 || CONFIG_CB4X4);
6217 frame_mv[rf[0]].as_int = cur_mv[0].as_int;
6218 frame_mv[rf[1]].as_int = cur_mv[1].as_int;
6219 if (which == 0 || which == 1) {
6220 compound_single_motion_search_interinter(cpi, x, bsize, frame_mv, mi_row,
6221 mi_col, mask, mask_stride, rate_mv,
6222 0, which);
6223 } else if (which == 2) {
6224 joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, NULL, mask,
6225 mask_stride, rate_mv, 0);
6227 tmp_mv[0].as_int = frame_mv[rf[0]].as_int;
6228 tmp_mv[1].as_int = frame_mv[rf[1]].as_int;
6230 #endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
6231 #endif // CONFIG_EXT_INTER
6233 // In some situations we want to discount the apparent cost of a new motion
6234 // vector. Where there is a subtle motion field, and especially where there is
6235 // low spatial complexity, it can be hard to cover the cost of a new motion
6236 // vector in a single block, even if that motion vector reduces distortion.
6237 // However, once established that vector may be usable through the nearest and
6238 // near mv modes to reduce distortion in subsequent blocks and also improve
6239 // visual quality.
6240 static int discount_newmv_test(const AV1_COMP *const cpi, int this_mode,
6241 int_mv this_mv,
6242 int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME],
6243 int ref_frame) {
6244 return (!cpi->rc.is_src_frame_alt_ref && (this_mode == NEWMV) &&
6245 (this_mv.as_int != 0) &&
6246 ((mode_mv[NEARESTMV][ref_frame].as_int == 0) ||
6247 (mode_mv[NEARESTMV][ref_frame].as_int == INVALID_MV)) &&
6248 ((mode_mv[NEARMV][ref_frame].as_int == 0) ||
6249 (mode_mv[NEARMV][ref_frame].as_int == INVALID_MV)));
6252 #define LEFT_TOP_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
6253 #define RIGHT_BOTTOM_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
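// The margins are expressed in 1/8-pel units (hence the << 3): clamping keeps
// a fractional MV far enough inside the padded reference border that the
// interpolation filter taps should never read outside the allocated frame.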
6255 // TODO(jingning): this mv clamping function should be block size dependent.
6256 static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
6257 clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN,
6258 xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
6259 xd->mb_to_top_edge - LEFT_TOP_MARGIN,
6260 xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
6263 #if CONFIG_EXT_INTER
6264 #if CONFIG_WEDGE
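// estimate_wedge_sign() splits the block into four quadrants and measures,
// per quadrant, how well each of the two single predictions matches the
// source. The returned sign roughly picks the orientation in which each
// predictor is kept over the region it already fits better, avoiding an
// explicit RD search over both signs at this stage.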
6265 static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
6266 const BLOCK_SIZE bsize, const uint8_t *pred0,
6267 int stride0, const uint8_t *pred1, int stride1) {
6268 const struct macroblock_plane *const p = &x->plane[0];
6269 const uint8_t *src = p->src.buf;
6270 int src_stride = p->src.stride;
6271 const int f_index = bsize - BLOCK_8X8;
6272 const int bw = block_size_wide[bsize];
6273 const int bh = block_size_high[bsize];
6274 uint32_t esq[2][4];
6275 int64_t tl, br;
6277 #if CONFIG_HIGHBITDEPTH
6278 if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
6279 pred0 = CONVERT_TO_BYTEPTR(pred0);
6280 pred1 = CONVERT_TO_BYTEPTR(pred1);
6282 #endif // CONFIG_HIGHBITDEPTH
6284 cpi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]);
6285 cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred0 + bw / 2, stride0,
6286 &esq[0][1]);
6287 cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride, src_stride,
6288 pred0 + bh / 2 * stride0, stride0, &esq[0][2]);
6289 cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride + bw / 2, src_stride,
6290 pred0 + bh / 2 * stride0 + bw / 2, stride0,
6291 &esq[0][3]);
6292 cpi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]);
6293 cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred1 + bw / 2, stride1,
6294 &esq[1][1]);
6295 cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride, src_stride,
6296 pred1 + bh / 2 * stride1, stride1, &esq[1][2]);
6297 cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride + bw / 2, src_stride,
6298 pred1 + bh / 2 * stride1 + bw / 2, stride1,
6299 &esq[1][3]);
6301 tl = (int64_t)(esq[0][0] + esq[0][1] + esq[0][2]) -
6302 (int64_t)(esq[1][0] + esq[1][1] + esq[1][2]);
6303 br = (int64_t)(esq[1][3] + esq[1][1] + esq[1][2]) -
6304 (int64_t)(esq[0][3] + esq[0][1] + esq[0][2]);
6305 return (tl + br > 0);
6307 #endif // CONFIG_WEDGE
6308 #endif // CONFIG_EXT_INTER
6310 #if !CONFIG_DUAL_FILTER
6311 static InterpFilter predict_interp_filter(
6312 const AV1_COMP *cpi, const MACROBLOCK *x, const BLOCK_SIZE bsize,
6313 const int mi_row, const int mi_col,
6314 InterpFilter (*single_filter)[TOTAL_REFS_PER_FRAME]) {
6315 InterpFilter best_filter = SWITCHABLE;
6316 const AV1_COMMON *cm = &cpi->common;
6317 const MACROBLOCKD *xd = &x->e_mbd;
6318 int bsl = mi_width_log2_lookup[bsize];
6319 int pred_filter_search =
6320 cpi->sf.cb_pred_filter_search
6321 ? (((mi_row + mi_col) >> bsl) +
6322 get_chessboard_index(cm->current_video_frame)) &
6324 : 0;
6325 MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
6326 const int is_comp_pred = has_second_ref(mbmi);
6327 const int this_mode = mbmi->mode;
6328 int refs[2] = { mbmi->ref_frame[0],
6329 (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
6330 if (pred_filter_search) {
6331 InterpFilter af = SWITCHABLE, lf = SWITCHABLE;
6332 if (xd->up_available) af = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
6333 if (xd->left_available) lf = xd->mi[-1]->mbmi.interp_filter;
6335 #if CONFIG_EXT_INTER
6336 if ((this_mode != NEWMV && this_mode != NEW_NEWMV) || (af == lf))
6337 #else
6338 if ((this_mode != NEWMV) || (af == lf))
6339 #endif // CONFIG_EXT_INTER
6340 best_filter = af;
6342 if (is_comp_pred) {
6343 if (cpi->sf.adaptive_mode_search) {
6344 #if CONFIG_EXT_INTER
6345 switch (this_mode) {
6346 case NEAREST_NEARESTMV:
6347 if (single_filter[NEARESTMV][refs[0]] ==
6348 single_filter[NEARESTMV][refs[1]])
6349 best_filter = single_filter[NEARESTMV][refs[0]];
6350 break;
6351 case NEAR_NEARMV:
6352 if (single_filter[NEARMV][refs[0]] == single_filter[NEARMV][refs[1]])
6353 best_filter = single_filter[NEARMV][refs[0]];
6354 break;
6355 case ZERO_ZEROMV:
6356 if (single_filter[ZEROMV][refs[0]] == single_filter[ZEROMV][refs[1]])
6357 best_filter = single_filter[ZEROMV][refs[0]];
6358 break;
6359 case NEW_NEWMV:
6360 if (single_filter[NEWMV][refs[0]] == single_filter[NEWMV][refs[1]])
6361 best_filter = single_filter[NEWMV][refs[0]];
6362 break;
6363 case NEAREST_NEWMV:
6364 if (single_filter[NEARESTMV][refs[0]] ==
6365 single_filter[NEWMV][refs[1]])
6366 best_filter = single_filter[NEARESTMV][refs[0]];
6367 break;
6368 case NEAR_NEWMV:
6369 if (single_filter[NEARMV][refs[0]] == single_filter[NEWMV][refs[1]])
6370 best_filter = single_filter[NEARMV][refs[0]];
6371 break;
6372 case NEW_NEARESTMV:
6373 if (single_filter[NEWMV][refs[0]] ==
6374 single_filter[NEARESTMV][refs[1]])
6375 best_filter = single_filter[NEWMV][refs[0]];
6376 break;
6377 case NEW_NEARMV:
6378 if (single_filter[NEWMV][refs[0]] == single_filter[NEARMV][refs[1]])
6379 best_filter = single_filter[NEWMV][refs[0]];
6380 break;
6381 default:
6382 if (single_filter[this_mode][refs[0]] ==
6383 single_filter[this_mode][refs[1]])
6384 best_filter = single_filter[this_mode][refs[0]];
6385 break;
6387 #else
6388 if (single_filter[this_mode][refs[0]] ==
6389 single_filter[this_mode][refs[1]])
6390 best_filter = single_filter[this_mode][refs[0]];
6391 #endif // CONFIG_EXT_INTER
6394 if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
6395 best_filter = EIGHTTAP_REGULAR;
6397 return best_filter;
6399 #endif // !CONFIG_DUAL_FILTER
6401 #if CONFIG_EXT_INTER
6402 // Choose the best wedge index and sign
6403 #if CONFIG_WEDGE
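// For each candidate wedge index, derive the wedge sign from the residual
// energy split (sign_limit), compute the masked SSE directly from the
// residuals r1 and d10 = p1 - p0, convert that to a modeled rate/distortion,
// and keep the index/sign pair with the lowest RD cost.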
6404 static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
6405 const BLOCK_SIZE bsize, const uint8_t *const p0,
6406 const uint8_t *const p1, int *const best_wedge_sign,
6407 int *const best_wedge_index) {
6408 const MACROBLOCKD *const xd = &x->e_mbd;
6409 const struct buf_2d *const src = &x->plane[0].src;
6410 const int bw = block_size_wide[bsize];
6411 const int bh = block_size_high[bsize];
6412 const int N = bw * bh;
6413 int rate;
6414 int64_t dist;
6415 int64_t rd, best_rd = INT64_MAX;
6416 int wedge_index;
6417 int wedge_sign;
6418 int wedge_types = (1 << get_wedge_bits_lookup(bsize));
6419 const uint8_t *mask;
6420 uint64_t sse;
6421 #if CONFIG_HIGHBITDEPTH
6422 const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
6423 const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
6424 #else
6425 const int bd_round = 0;
6426 #endif // CONFIG_HIGHBITDEPTH
6428 DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
6429 DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
6430 DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
6431 DECLARE_ALIGNED(32, int16_t, ds[MAX_SB_SQUARE]);
6433 int64_t sign_limit;
6435 #if CONFIG_HIGHBITDEPTH
6436 if (hbd) {
6437 aom_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride,
6438 CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
6439 aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
6440 CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
6441 aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
6442 CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
6443 } else // NOLINT
6444 #endif // CONFIG_HIGHBITDEPTH
6446 aom_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw);
6447 aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
6448 aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
6451 sign_limit = ((int64_t)aom_sum_squares_i16(r0, N) -
6452 (int64_t)aom_sum_squares_i16(r1, N)) *
6453 (1 << WEDGE_WEIGHT_BITS) / 2;
6455 if (N < 64)
6456 av1_wedge_compute_delta_squares_c(ds, r0, r1, N);
6457 else
6458 av1_wedge_compute_delta_squares(ds, r0, r1, N);
6460 for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
6461 mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize);
6463 // TODO(jingning): Make sse2 functions support N = 16 case
6464 if (N < 64)
6465 wedge_sign = av1_wedge_sign_from_residuals_c(ds, mask, N, sign_limit);
6466 else
6467 wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit);
6469 mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
6470 if (N < 64)
6471 sse = av1_wedge_sse_from_residuals_c(r1, d10, mask, N);
6472 else
6473 sse = av1_wedge_sse_from_residuals(r1, d10, mask, N);
6474 sse = ROUND_POWER_OF_TWO(sse, bd_round);
6476 model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
6477 rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
6479 if (rd < best_rd) {
6480 *best_wedge_index = wedge_index;
6481 *best_wedge_sign = wedge_sign;
6482 best_rd = rd;
6486 return best_rd;
6489 // Choose the best wedge index for the specified sign
6490 static int64_t pick_wedge_fixed_sign(
6491 const AV1_COMP *const cpi, const MACROBLOCK *const x,
6492 const BLOCK_SIZE bsize, const uint8_t *const p0, const uint8_t *const p1,
6493 const int wedge_sign, int *const best_wedge_index) {
6494 const MACROBLOCKD *const xd = &x->e_mbd;
6495 const struct buf_2d *const src = &x->plane[0].src;
6496 const int bw = block_size_wide[bsize];
6497 const int bh = block_size_high[bsize];
6498 const int N = bw * bh;
6499 int rate;
6500 int64_t dist;
6501 int64_t rd, best_rd = INT64_MAX;
6502 int wedge_index;
6503 int wedge_types = (1 << get_wedge_bits_lookup(bsize));
6504 const uint8_t *mask;
6505 uint64_t sse;
6506 #if CONFIG_HIGHBITDEPTH
6507 const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
6508 const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
6509 #else
6510 const int bd_round = 0;
6511 #endif // CONFIG_HIGHBITDEPTH
6513 DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
6514 DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
6516 #if CONFIG_HIGHBITDEPTH
6517 if (hbd) {
6518 aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
6519 CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
6520 aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
6521 CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
6522 } else // NOLINT
6523 #endif // CONFIG_HIGHBITDEPTH
6525 aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
6526 aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
6529 for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
6530 mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
6531 if (N < 64)
6532 sse = av1_wedge_sse_from_residuals_c(r1, d10, mask, N);
6533 else
6534 sse = av1_wedge_sse_from_residuals(r1, d10, mask, N);
6535 sse = ROUND_POWER_OF_TWO(sse, bd_round);
6537 model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
6538 rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
6540 if (rd < best_rd) {
6541 *best_wedge_index = wedge_index;
6542 best_rd = rd;
6546 return best_rd;
6549 static int64_t pick_interinter_wedge(const AV1_COMP *const cpi,
6550 MACROBLOCK *const x,
6551 const BLOCK_SIZE bsize,
6552 const uint8_t *const p0,
6553 const uint8_t *const p1) {
6554 MACROBLOCKD *const xd = &x->e_mbd;
6555 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
6556 const int bw = block_size_wide[bsize];
6558 int64_t rd;
6559 int wedge_index = -1;
6560 int wedge_sign = 0;
6562 assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
6563 assert(cpi->common.allow_masked_compound);
6565 if (cpi->sf.fast_wedge_sign_estimate) {
6566 wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw);
6567 rd = pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, wedge_sign, &wedge_index);
6568 } else {
6569 rd = pick_wedge(cpi, x, bsize, p0, p1, &wedge_sign, &wedge_index);
6572 mbmi->wedge_sign = wedge_sign;
6573 mbmi->wedge_index = wedge_index;
6574 return rd;
6576 #endif // CONFIG_WEDGE
6578 #if CONFIG_COMPOUND_SEGMENT
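// Try every compound segmentation mask type: build the mask from the two
// single-reference predictors, estimate the masked SSE from the residuals,
// and keep the mask type with the lowest modeled RD cost. The winning mask
// is rebuilt into xd->seg_mask before returning.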
6579 static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
6580 MACROBLOCK *const x, const BLOCK_SIZE bsize,
6581 const uint8_t *const p0,
6582 const uint8_t *const p1) {
6583 MACROBLOCKD *const xd = &x->e_mbd;
6584 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
6585 const struct buf_2d *const src = &x->plane[0].src;
6586 const int bw = block_size_wide[bsize];
6587 const int bh = block_size_high[bsize];
6588 const int N = bw * bh;
6589 int rate;
6590 uint64_t sse;
6591 int64_t dist;
6592 int64_t rd0;
6593 SEG_MASK_TYPE cur_mask_type;
6594 int64_t best_rd = INT64_MAX;
6595 SEG_MASK_TYPE best_mask_type = 0;
6596 #if CONFIG_HIGHBITDEPTH
6597 const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
6598 const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
6599 #else
6600 const int bd_round = 0;
6601 #endif // CONFIG_HIGHBITDEPTH
6602 DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
6603 DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
6604 DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
6606 #if CONFIG_HIGHBITDEPTH
6607 if (hbd) {
6608 aom_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride,
6609 CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
6610 aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
6611 CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
6612 aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
6613 CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
6614 } else // NOLINT
6615 #endif // CONFIG_HIGHBITDEPTH
6617 aom_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw);
6618 aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
6619 aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
6622 // try each mask type and its inverse
6623 for (cur_mask_type = 0; cur_mask_type < SEG_MASK_TYPES; cur_mask_type++) {
6624 // build mask and inverse
6625 #if CONFIG_HIGHBITDEPTH
6626 if (hbd)
6627 build_compound_seg_mask_highbd(
6628 xd->seg_mask, cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw,
6629 CONVERT_TO_BYTEPTR(p1), bw, bsize, bh, bw, xd->bd);
6630 else
6631 #endif // CONFIG_HIGHBITDEPTH
6632 build_compound_seg_mask(xd->seg_mask, cur_mask_type, p0, bw, p1, bw,
6633 bsize, bh, bw);
6635 // compute rd for mask
6636 sse = av1_wedge_sse_from_residuals(r1, d10, xd->seg_mask, N);
6637 sse = ROUND_POWER_OF_TWO(sse, bd_round);
6639 model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
6640 rd0 = RDCOST(x->rdmult, x->rddiv, rate, dist);
6642 if (rd0 < best_rd) {
6643 best_mask_type = cur_mask_type;
6644 best_rd = rd0;
6648 // make final mask
6649 mbmi->mask_type = best_mask_type;
6650 #if CONFIG_HIGHBITDEPTH
6651 if (hbd)
6652 build_compound_seg_mask_highbd(
6653 xd->seg_mask, mbmi->mask_type, CONVERT_TO_BYTEPTR(p0), bw,
6654 CONVERT_TO_BYTEPTR(p1), bw, bsize, bh, bw, xd->bd);
6655 else
6656 #endif // CONFIG_HIGHBITDEPTH
6657 build_compound_seg_mask(xd->seg_mask, mbmi->mask_type, p0, bw, p1, bw,
6658 bsize, bh, bw);
6660 return best_rd;
6662 #endif // CONFIG_COMPOUND_SEGMENT
6664 #if CONFIG_WEDGE && CONFIG_INTERINTRA
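// The inter-intra wedge sign is always 0, so only the wedge index is
// searched here.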
6665 static int64_t pick_interintra_wedge(const AV1_COMP *const cpi,
6666 const MACROBLOCK *const x,
6667 const BLOCK_SIZE bsize,
6668 const uint8_t *const p0,
6669 const uint8_t *const p1) {
6670 const MACROBLOCKD *const xd = &x->e_mbd;
6671 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
6673 int64_t rd;
6674 int wedge_index = -1;
6676 assert(is_interintra_wedge_used(bsize));
6677 assert(cpi->common.allow_interintra_compound);
6679 rd = pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, 0, &wedge_index);
6681 mbmi->interintra_wedge_sign = 0;
6682 mbmi->interintra_wedge_index = wedge_index;
6683 return rd;
6685 #endif // CONFIG_WEDGE && CONFIG_INTERINTRA
6687 #if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
6688 static int64_t pick_interinter_mask(const AV1_COMP *const cpi, MACROBLOCK *x,
6689 const BLOCK_SIZE bsize,
6690 const uint8_t *const p0,
6691 const uint8_t *const p1) {
6692 const COMPOUND_TYPE compound_type =
6693 x->e_mbd.mi[0]->mbmi.interinter_compound_type;
6694 switch (compound_type) {
6695 #if CONFIG_WEDGE
6696 case COMPOUND_WEDGE: return pick_interinter_wedge(cpi, x, bsize, p0, p1);
6697 #endif // CONFIG_WEDGE
6698 #if CONFIG_COMPOUND_SEGMENT
6699 case COMPOUND_SEG: return pick_interinter_seg(cpi, x, bsize, p0, p1);
6700 #endif // CONFIG_COMPOUND_SEGMENT
6701 default: assert(0); return 0;
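// Re-run the masked (wedge/segment) motion search for the references that
// carry a NEWMV component: both MVs for NEW_NEWMV, only ref 0 for
// NEW_NEARESTMV/NEW_NEARMV, only ref 1 for NEAREST_NEWMV/NEAR_NEWMV.
// Returns the updated MV signaling rate.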
6705 static int interinter_compound_motion_search(
6706 const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
6707 const BLOCK_SIZE bsize, const int this_mode, int mi_row, int mi_col) {
6708 MACROBLOCKD *const xd = &x->e_mbd;
6709 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
6710 int_mv tmp_mv[2];
6711 int tmp_rate_mv = 0;
6712 const INTERINTER_COMPOUND_DATA compound_data = {
6713 #if CONFIG_WEDGE
6714 mbmi->wedge_index,
6715 mbmi->wedge_sign,
6716 #endif // CONFIG_WEDGE
6717 #if CONFIG_COMPOUND_SEGMENT
6718 mbmi->mask_type,
6719 xd->seg_mask,
6720 #endif // CONFIG_COMPOUND_SEGMENT
6721 mbmi->interinter_compound_type
6723 if (this_mode == NEW_NEWMV) {
6724 do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize,
6725 mi_row, mi_col, tmp_mv, &tmp_rate_mv, 2);
6726 mbmi->mv[0].as_int = tmp_mv[0].as_int;
6727 mbmi->mv[1].as_int = tmp_mv[1].as_int;
6728 } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) {
6729 do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize,
6730 mi_row, mi_col, tmp_mv, &tmp_rate_mv, 0);
6731 mbmi->mv[0].as_int = tmp_mv[0].as_int;
6732 } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
6733 do_masked_motion_search_indexed(cpi, x, cur_mv, &compound_data, bsize,
6734 mi_row, mi_col, tmp_mv, &tmp_rate_mv, 1);
6735 mbmi->mv[1].as_int = tmp_mv[1].as_int;
6737 return tmp_rate_mv;
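// Pick the best mask for the current compound type; for NEWMV modes the
// MV(s) may be refined with a masked motion search, and the refinement is
// kept only if its modeled RD beats the unrefined estimate. The RD of the
// resulting prediction is then re-estimated with estimate_yrd_for_sb().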
6740 static int64_t build_and_cost_compound_type(
6741 const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
6742 const BLOCK_SIZE bsize, const int this_mode, int rs2, int rate_mv,
6743 BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, uint8_t **preds1,
6744 int *strides, int mi_row, int mi_col) {
6745 const AV1_COMMON *const cm = &cpi->common;
6746 MACROBLOCKD *xd = &x->e_mbd;
6747 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
6748 int rate_sum;
6749 int64_t dist_sum;
6750 int64_t best_rd_cur = INT64_MAX;
6751 int64_t rd = INT64_MAX;
6752 int tmp_skip_txfm_sb;
6753 int64_t tmp_skip_sse_sb;
6754 const COMPOUND_TYPE compound_type = mbmi->interinter_compound_type;
6756 best_rd_cur = pick_interinter_mask(cpi, x, bsize, *preds0, *preds1);
6757 best_rd_cur += RDCOST(x->rdmult, x->rddiv, rs2 + rate_mv, 0);
6759 if (have_newmv_in_inter_mode(this_mode) &&
6760 use_masked_motion_search(compound_type)) {
6761 *out_rate_mv = interinter_compound_motion_search(cpi, x, cur_mv, bsize,
6762 this_mode, mi_row, mi_col);
6763 av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
6764 model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
6765 &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
6766 rd = RDCOST(x->rdmult, x->rddiv, rs2 + *out_rate_mv + rate_sum, dist_sum);
6767 if (rd >= best_rd_cur) {
6768 mbmi->mv[0].as_int = cur_mv[0].as_int;
6769 mbmi->mv[1].as_int = cur_mv[1].as_int;
6770 *out_rate_mv = rate_mv;
6771 av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0,
6772 #if CONFIG_SUPERTX
6773 0, 0,
6774 #endif // CONFIG_SUPERTX
6775 preds0, strides, preds1,
6776 strides);
6778 av1_subtract_plane(x, bsize, 0);
6779 rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
6780 &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
6781 if (rd != INT64_MAX)
6782 rd = RDCOST(x->rdmult, x->rddiv, rs2 + *out_rate_mv + rate_sum, dist_sum);
6783 best_rd_cur = rd;
6785 } else {
6786 av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0,
6787 #if CONFIG_SUPERTX
6788 0, 0,
6789 #endif // CONFIG_SUPERTX
6790 preds0, strides, preds1, strides);
6791 av1_subtract_plane(x, bsize, 0);
6792 rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
6793 &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
6794 if (rd != INT64_MAX)
6795 rd = RDCOST(x->rdmult, x->rddiv, rs2 + rate_mv + rate_sum, dist_sum);
6796 best_rd_cur = rd;
6798 return best_rd_cur;
6800 #endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
6801 #endif // CONFIG_EXT_INTER
6803 typedef struct {
6804 #if CONFIG_MOTION_VAR
6805 // Inter prediction buffers and respective strides
6806 uint8_t *above_pred_buf[MAX_MB_PLANE];
6807 int above_pred_stride[MAX_MB_PLANE];
6808 uint8_t *left_pred_buf[MAX_MB_PLANE];
6809 int left_pred_stride[MAX_MB_PLANE];
6810 #endif // CONFIG_MOTION_VAR
6811 int_mv *single_newmv;
6812 #if CONFIG_EXT_INTER
6813 // Pointer to array of motion vectors to use for each ref and their rates
6814 // Should point to first of 2 arrays in 2D array
6815 int *single_newmv_rate;
6816 // Pointer to array of predicted rate-distortion
6817 // Should point to first of 2 arrays in 2D array
6818 int64_t (*modelled_rd)[TOTAL_REFS_PER_FRAME];
6819 #endif // CONFIG_EXT_INTER
6820 InterpFilter single_filter[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
6821 } HandleInterModeArgs;
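// Perform the motion search for NEWMV-class modes: a joint or per-reference
// compound search for two-reference modes, or a single-reference search
// otherwise. *rate_mv receives the signaling cost of the resulting motion
// vector(s); returns INT64_MAX if no valid MV is found and 0 otherwise.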
6823 static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
6824 const BLOCK_SIZE bsize,
6825 int_mv (*const mode_mv)[TOTAL_REFS_PER_FRAME],
6826 const int mi_row, const int mi_col,
6827 int *const rate_mv, int_mv *const single_newmv,
6828 HandleInterModeArgs *const args) {
6829 const MACROBLOCKD *const xd = &x->e_mbd;
6830 const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
6831 const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
6832 const int is_comp_pred = has_second_ref(mbmi);
6833 const PREDICTION_MODE this_mode = mbmi->mode;
6834 #if CONFIG_EXT_INTER
6835 const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME);
6836 #endif // CONFIG_EXT_INTER
6837 int_mv *const frame_mv = mode_mv[this_mode];
6838 const int refs[2] = { mbmi->ref_frame[0],
6839 mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
6840 int i;
6842 (void)args;
6844 if (is_comp_pred) {
6845 #if CONFIG_EXT_INTER
6846 for (i = 0; i < 2; ++i) {
6847 single_newmv[refs[i]].as_int = args->single_newmv[refs[i]].as_int;
6850 if (this_mode == NEW_NEWMV) {
6851 frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
6852 frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
6854 if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
6855 joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, NULL, NULL,
6856 0, rate_mv, 0);
6857 } else {
6858 *rate_mv = 0;
6859 for (i = 0; i < 2; ++i) {
6860 av1_set_mvcost(x, refs[i], i, mbmi->ref_mv_idx);
6861 *rate_mv += av1_mv_bit_cost(
6862 &frame_mv[refs[i]].as_mv, &mbmi_ext->ref_mvs[refs[i]][0].as_mv,
6863 x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
6866 } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
6867 frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
6868 if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
6869 frame_mv[refs[0]].as_int =
6870 mode_mv[compound_ref0_mode(this_mode)][refs[0]].as_int;
6871 compound_single_motion_search_interinter(
6872 cpi, x, bsize, frame_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 1);
6873 } else {
6874 av1_set_mvcost(x, refs[1], 1, mbmi->ref_mv_idx);
6875 *rate_mv = av1_mv_bit_cost(&frame_mv[refs[1]].as_mv,
6876 &mbmi_ext->ref_mvs[refs[1]][0].as_mv,
6877 x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
6879 } else {
6880 assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV);
6881 frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
6882 if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
6883 frame_mv[refs[1]].as_int =
6884 mode_mv[compound_ref1_mode(this_mode)][refs[1]].as_int;
6885 compound_single_motion_search_interinter(
6886 cpi, x, bsize, frame_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 0);
6887 } else {
6888 av1_set_mvcost(x, refs[0], 0, mbmi->ref_mv_idx);
6889 *rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv,
6890 &mbmi_ext->ref_mvs[refs[0]][0].as_mv,
6891 x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
6894 #else
6895 // Initialize mv using single prediction mode result.
6896 frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
6897 frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
6899 if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
6900 joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, rate_mv, 0);
6901 } else {
6902 *rate_mv = 0;
6903 for (i = 0; i < 2; ++i) {
6904 av1_set_mvcost(x, refs[i], i, mbmi->ref_mv_idx);
6905 *rate_mv += av1_mv_bit_cost(&frame_mv[refs[i]].as_mv,
6906 &mbmi_ext->ref_mvs[refs[i]][0].as_mv,
6907 x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
6910 #endif // CONFIG_EXT_INTER
6911 } else {
6912 #if CONFIG_EXT_INTER
6913 if (is_comp_interintra_pred) {
6914 x->best_mv = args->single_newmv[refs[0]];
6915 *rate_mv = args->single_newmv_rate[refs[0]];
6916 } else {
6917 single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, rate_mv);
6918 args->single_newmv[refs[0]] = x->best_mv;
6919 args->single_newmv_rate[refs[0]] = *rate_mv;
6921 #else
6922 single_motion_search(cpi, x, bsize, mi_row, mi_col, rate_mv);
6923 single_newmv[refs[0]] = x->best_mv;
6924 #endif // CONFIG_EXT_INTER
6926 if (x->best_mv.as_int == INVALID_MV) return INT64_MAX;
6928 frame_mv[refs[0]] = x->best_mv;
6929 xd->mi[0]->bmi[0].as_mv[0] = x->best_mv;
6931 // Estimate the rate implications of a new mv but discount this
6932 // under certain circumstances where we want to help initiate a weak
6933 // motion field, where the distortion gain for a single block may not
6934 // be enough to overcome the cost of a new mv.
6935 if (discount_newmv_test(cpi, this_mode, x->best_mv, mode_mv, refs[0])) {
6936 *rate_mv = AOMMAX(*rate_mv / NEW_MV_DISCOUNT_FACTOR, 1);
6940 return 0;
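// Search the switchable interpolation filters (filter pairs when dual
// filters are enabled). The default EIGHTTAP_REGULAR combination is
// evaluated before the loop, so the search starts at index 1 and keeps the
// candidate with the lowest modeled RD, ping-ponging the prediction between
// orig_dst and tmp_dst.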
6943 int64_t interpolation_filter_search(
6944 MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
6945 int mi_row, int mi_col, const BUFFER_SET *const tmp_dst,
6946 BUFFER_SET *const orig_dst,
6947 InterpFilter (*const single_filter)[TOTAL_REFS_PER_FRAME],
6948 int64_t *const rd, int *const switchable_rate, int *const skip_txfm_sb,
6949 int64_t *const skip_sse_sb) {
6950 const AV1_COMMON *cm = &cpi->common;
6951 MACROBLOCKD *const xd = &x->e_mbd;
6952 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
6953 int i;
6954 int tmp_rate;
6955 int64_t tmp_dist;
6957 (void)single_filter;
6959 InterpFilter assign_filter = SWITCHABLE;
6961 if (cm->interp_filter == SWITCHABLE) {
6962 #if !CONFIG_DUAL_FILTER
6963 assign_filter = av1_is_interp_needed(xd)
6964 ? predict_interp_filter(cpi, x, bsize, mi_row, mi_col,
6965 single_filter)
6966 : cm->interp_filter;
6967 #endif // !CONFIG_DUAL_FILTER
6968 } else {
6969 assign_filter = cm->interp_filter;
6972 set_default_interp_filters(mbmi, assign_filter);
6974 *switchable_rate = av1_get_switchable_rate(cpi, xd);
6975 av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
6976 model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate, &tmp_dist,
6977 skip_txfm_sb, skip_sse_sb);
6978 *rd = RDCOST(x->rdmult, x->rddiv, *switchable_rate + tmp_rate, tmp_dist);
6980 if (assign_filter == SWITCHABLE) {
6981 // do interp_filter search
6982 if (av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd)) {
6983 #if CONFIG_DUAL_FILTER
6984 const int filter_set_size = DUAL_FILTER_SET_SIZE;
6985 #else
6986 const int filter_set_size = SWITCHABLE_FILTERS;
6987 #endif // CONFIG_DUAL_FILTER
6988 int best_in_temp = 0;
6989 #if CONFIG_DUAL_FILTER
6990 InterpFilter best_filter[4];
6991 av1_copy(best_filter, mbmi->interp_filter);
6992 #else
6993 InterpFilter best_filter = mbmi->interp_filter;
6994 #endif // CONFIG_DUAL_FILTER
6995 restore_dst_buf(xd, *tmp_dst);
6996 // EIGHTTAP_REGULAR mode is calculated beforehand
6997 for (i = 1; i < filter_set_size; ++i) {
6998 int tmp_skip_sb = 0;
6999 int64_t tmp_skip_sse = INT64_MAX;
7000 int tmp_rs;
7001 int64_t tmp_rd;
7002 #if CONFIG_DUAL_FILTER
7003 mbmi->interp_filter[0] = filter_sets[i][0];
7004 mbmi->interp_filter[1] = filter_sets[i][1];
7005 mbmi->interp_filter[2] = filter_sets[i][0];
7006 mbmi->interp_filter[3] = filter_sets[i][1];
7007 #else
7008 mbmi->interp_filter = (InterpFilter)i;
7009 #endif // CONFIG_DUAL_FILTER
7010 tmp_rs = av1_get_switchable_rate(cpi, xd);
7011 av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
7012 model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
7013 &tmp_dist, &tmp_skip_sb, &tmp_skip_sse);
7014 tmp_rd = RDCOST(x->rdmult, x->rddiv, tmp_rs + tmp_rate, tmp_dist);
7016 if (tmp_rd < *rd) {
7017 *rd = tmp_rd;
7018 *switchable_rate = av1_get_switchable_rate(cpi, xd);
7019 #if CONFIG_DUAL_FILTER
7020 av1_copy(best_filter, mbmi->interp_filter);
7021 #else
7022 best_filter = mbmi->interp_filter;
7023 #endif // CONFIG_DUAL_FILTER
7024 *skip_txfm_sb = tmp_skip_sb;
7025 *skip_sse_sb = tmp_skip_sse;
7026 best_in_temp = !best_in_temp;
7027 if (best_in_temp) {
7028 restore_dst_buf(xd, *orig_dst);
7029 } else {
7030 restore_dst_buf(xd, *tmp_dst);
7034 if (best_in_temp) {
7035 restore_dst_buf(xd, *tmp_dst);
7036 } else {
7037 restore_dst_buf(xd, *orig_dst);
7039 #if CONFIG_DUAL_FILTER
7040 av1_copy(mbmi->interp_filter, best_filter);
7041 #else
7042 mbmi->interp_filter = best_filter;
7043 #endif // CONFIG_DUAL_FILTER
7044 } else {
7045 #if CONFIG_DUAL_FILTER
7046 for (i = 0; i < 4; ++i)
7047 assert(mbmi->interp_filter[i] == EIGHTTAP_REGULAR);
7048 #else
7049 assert(mbmi->interp_filter == EIGHTTAP_REGULAR);
7050 #endif // CONFIG_DUAL_FILTER
7054 return 0;
7057 // TODO(afergs): Refactor the MBMI references in here - there are four
7058 // TODO(afergs): Refactor optional args - add them to a struct or remove
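// Evaluate the allowed motion modes (SIMPLE_TRANSLATION, OBMC_CAUSAL,
// WARPED_CAUSAL) for the current inter mode, re-running the MV search where
// needed, and keep the one with the lowest RD cost. The original destination
// buffer is restored before returning.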
7059 static int64_t motion_mode_rd(
7060 const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
7061 RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
7062 int *disable_skip, int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME], int mi_row,
7063 int mi_col, HandleInterModeArgs *const args, const int64_t ref_best_rd,
7064 const int *refs, int rate_mv,
7065 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7066 int_mv *const single_newmv,
7067 #if CONFIG_EXT_INTER
7068 int rate2_bmc_nocoeff, MB_MODE_INFO *best_bmc_mbmi,
7069 #if CONFIG_MOTION_VAR
7070 int rate_mv_bmc,
7071 #endif // CONFIG_MOTION_VAR
7072 #endif // CONFIG_EXT_INTER
7073 #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7074 int rs, int *skip_txfm_sb, int64_t *skip_sse_sb, BUFFER_SET *orig_dst) {
7075 const AV1_COMMON *const cm = &cpi->common;
7076 MACROBLOCKD *xd = &x->e_mbd;
7077 MODE_INFO *mi = xd->mi[0];
7078 MB_MODE_INFO *mbmi = &mi->mbmi;
7079 const int is_comp_pred = has_second_ref(mbmi);
7080 const PREDICTION_MODE this_mode = mbmi->mode;
7082 (void)mode_mv;
7083 (void)mi_row;
7084 (void)mi_col;
7085 (void)args;
7086 (void)refs;
7087 (void)rate_mv;
7088 (void)is_comp_pred;
7089 (void)this_mode;
7091 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7092 MOTION_MODE motion_mode, last_motion_mode_allowed;
7093 int rate2_nocoeff = 0, best_xskip, best_disable_skip = 0;
7094 RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
7095 MB_MODE_INFO base_mbmi, best_mbmi;
7096 #if CONFIG_VAR_TX
7097 uint8_t best_blk_skip[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
7098 #endif // CONFIG_VAR_TX
7099 #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7101 #if CONFIG_WARPED_MOTION
7102 int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
7103 #endif // CONFIG_WARPED_MOTION
7105 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7106 av1_invalid_rd_stats(&best_rd_stats);
7107 #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7109 if (cm->interp_filter == SWITCHABLE) rd_stats->rate += rs;
7110 #if CONFIG_WARPED_MOTION
7111 aom_clear_system_state();
7112 mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
7113 #if CONFIG_EXT_INTER
7114 best_bmc_mbmi->num_proj_ref[0] = mbmi->num_proj_ref[0];
7115 #endif // CONFIG_EXT_INTER
7116 #endif // CONFIG_WARPED_MOTION
7117 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7118 rate2_nocoeff = rd_stats->rate;
7119 last_motion_mode_allowed = motion_mode_allowed(
7120 #if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
7121 0, xd->global_motion,
7122 #endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
7123 mi);
7124 base_mbmi = *mbmi;
7125 #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7127 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7128 int64_t best_rd = INT64_MAX;
7129 for (motion_mode = SIMPLE_TRANSLATION;
7130 motion_mode <= last_motion_mode_allowed; motion_mode++) {
7131 int64_t tmp_rd = INT64_MAX;
7132 int tmp_rate;
7133 int64_t tmp_dist;
7134 #if CONFIG_EXT_INTER
7135 int tmp_rate2 =
7136 motion_mode != SIMPLE_TRANSLATION ? rate2_bmc_nocoeff : rate2_nocoeff;
7137 #else
7138 int tmp_rate2 = rate2_nocoeff;
7139 #endif // CONFIG_EXT_INTER
7141 *mbmi = base_mbmi;
7142 mbmi->motion_mode = motion_mode;
7143 #if CONFIG_MOTION_VAR
7144 if (mbmi->motion_mode == OBMC_CAUSAL) {
7145 #if CONFIG_EXT_INTER
7146 *mbmi = *best_bmc_mbmi;
7147 mbmi->motion_mode = OBMC_CAUSAL;
7148 #endif // CONFIG_EXT_INTER
7149 if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) {
7150 int tmp_rate_mv = 0;
7152 single_motion_search(cpi, x, bsize, mi_row, mi_col,
7153 #if CONFIG_EXT_INTER
7154 0,
7155 #endif // CONFIG_EXT_INTER
7156 &tmp_rate_mv);
7157 mbmi->mv[0].as_int = x->best_mv.as_int;
7158 if (discount_newmv_test(cpi, this_mode, mbmi->mv[0], mode_mv,
7159 refs[0])) {
7160 tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
7162 #if CONFIG_EXT_INTER
7163 tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv;
7164 #else
7165 tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv;
7166 #endif // CONFIG_EXT_INTER
7167 #if CONFIG_DUAL_FILTER
7168 if (!has_subpel_mv_component(xd->mi[0], xd, 0))
7169 mbmi->interp_filter[0] = EIGHTTAP_REGULAR;
7170 if (!has_subpel_mv_component(xd->mi[0], xd, 1))
7171 mbmi->interp_filter[1] = EIGHTTAP_REGULAR;
7172 #endif // CONFIG_DUAL_FILTER
7173 av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
7174 #if CONFIG_EXT_INTER
7175 } else {
7176 av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
7177 #endif // CONFIG_EXT_INTER
7179 av1_build_obmc_inter_prediction(
7180 cm, xd, mi_row, mi_col, args->above_pred_buf, args->above_pred_stride,
7181 args->left_pred_buf, args->left_pred_stride);
7182 model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
7183 &tmp_dist, skip_txfm_sb, skip_sse_sb);
7185 #endif // CONFIG_MOTION_VAR
7187 #if CONFIG_WARPED_MOTION
7188 if (mbmi->motion_mode == WARPED_CAUSAL) {
7189 #if CONFIG_EXT_INTER
7190 *mbmi = *best_bmc_mbmi;
7191 mbmi->motion_mode = WARPED_CAUSAL;
7192 #endif // CONFIG_EXT_INTER
7193 mbmi->wm_params[0].wmtype = DEFAULT_WMTYPE;
7194 #if CONFIG_DUAL_FILTER
7195 for (int dir = 0; dir < 4; ++dir)
7196 mbmi->interp_filter[dir] = cm->interp_filter == SWITCHABLE
7197 ? EIGHTTAP_REGULAR
7198 : cm->interp_filter;
7199 #else
7200 mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP_REGULAR
7201 : cm->interp_filter;
7202 #endif // CONFIG_DUAL_FILTER
7204 if (!find_projection(mbmi->num_proj_ref[0], pts, pts_inref, bsize,
7205 mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
7206 &mbmi->wm_params[0], mi_row, mi_col)) {
7207 // Refine MV for NEWMV mode
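// The MV is refined in a small range around the warped-model result
// (av1_refine_warped_mv). If the refinement changes the MV, its rate is
// recomputed and the new-MV discount re-applied; otherwise the original MV
// and warp parameters are restored.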
7208 if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) {
7209 int tmp_rate_mv = 0;
7210 const int_mv mv0 = mbmi->mv[0];
7211 WarpedMotionParams wm_params0 = mbmi->wm_params[0];
7213 // Refine MV in a small range.
7214 av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts, pts_inref);
7216 // Keep the refined MV and WM parameters.
7217 if (mv0.as_int != mbmi->mv[0].as_int) {
7218 const int ref = refs[0];
7219 const MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
7221 tmp_rate_mv =
7222 av1_mv_bit_cost(&mbmi->mv[0].as_mv, &ref_mv, x->nmvjointcost,
7223 x->mvcost, MV_COST_WEIGHT);
7225 if (cpi->sf.adaptive_motion_search)
7226 x->pred_mv[ref] = mbmi->mv[0].as_mv;
7228 single_newmv[ref] = mbmi->mv[0];
7230 if (discount_newmv_test(cpi, this_mode, mbmi->mv[0], mode_mv,
7231 refs[0])) {
7232 tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
7234 #if CONFIG_EXT_INTER
7235 tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv;
7236 #else
7237 tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv;
7238 #endif // CONFIG_EXT_INTER
7239 #if CONFIG_DUAL_FILTER
7240 if (!has_subpel_mv_component(xd->mi[0], xd, 0))
7241 mbmi->interp_filter[0] = EIGHTTAP_REGULAR;
7242 if (!has_subpel_mv_component(xd->mi[0], xd, 1))
7243 mbmi->interp_filter[1] = EIGHTTAP_REGULAR;
7244 #endif // CONFIG_DUAL_FILTER
7245 } else {
7246 // Restore the old MV and WM parameters.
7247 mbmi->mv[0] = mv0;
7248 mbmi->wm_params[0] = wm_params0;
7252 av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
7253 model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
7254 &tmp_dist, skip_txfm_sb, skip_sse_sb);
7255 } else {
7256 continue;
7259 #endif // CONFIG_WARPED_MOTION
7260 x->skip = 0;
7262 rd_stats->dist = 0;
7263 rd_stats->sse = 0;
7264 rd_stats->skip = 1;
7265 rd_stats->rate = tmp_rate2;
7266 if (last_motion_mode_allowed > SIMPLE_TRANSLATION) {
7267 #if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
7268 if (last_motion_mode_allowed == WARPED_CAUSAL)
7269 #endif // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
7270 rd_stats->rate += cpi->motion_mode_cost[bsize][mbmi->motion_mode];
7271 #if CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
7272 else
7273 rd_stats->rate += cpi->motion_mode_cost1[bsize][mbmi->motion_mode];
7274 #endif // CONFIG_WARPED_MOTION && CONFIG_MOTION_VAR
7276 #if CONFIG_WARPED_MOTION
7277 if (mbmi->motion_mode == WARPED_CAUSAL) {
7278 rd_stats->rate -= rs;
7280 #endif // CONFIG_WARPED_MOTION
7281 #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7282 if (!*skip_txfm_sb) {
7283 int64_t rdcosty = INT64_MAX;
7284 int is_cost_valid_uv = 0;
7286 // cost and distortion
7287 av1_subtract_plane(x, bsize, 0);
7288 #if CONFIG_VAR_TX
7289 if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
7290 select_tx_type_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd);
7291 } else {
7292 int idx, idy;
7293 super_block_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd);
7294 for (idy = 0; idy < xd->n8_h; ++idy)
7295 for (idx = 0; idx < xd->n8_w; ++idx)
7296 mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
7297 memset(x->blk_skip[0], rd_stats_y->skip,
7298 sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
7300 #else
7301 /* clang-format off */
7302 super_block_yrd(cpi, x, rd_stats_y, bsize, ref_best_rd);
7303 /* clang-format on */
7304 #endif // CONFIG_VAR_TX
7306 if (rd_stats_y->rate == INT_MAX) {
7307 av1_invalid_rd_stats(rd_stats);
7308 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7309 if (mbmi->motion_mode != SIMPLE_TRANSLATION) {
7310 continue;
7311 } else {
7312 #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7313 restore_dst_buf(xd, *orig_dst);
7314 return INT64_MAX;
7315 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7317 #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7320 av1_merge_rd_stats(rd_stats, rd_stats_y);
7322 rdcosty = RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist);
7323 rdcosty = AOMMIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse));
7324 /* clang-format off */
7325 #if CONFIG_VAR_TX
7326 is_cost_valid_uv =
7327 inter_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty);
7328 #else
7329 is_cost_valid_uv =
7330 super_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_rd - rdcosty);
7331 #endif // CONFIG_VAR_TX
7332 if (!is_cost_valid_uv) {
7333 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7334 continue;
7335 #else
7336 restore_dst_buf(xd, *orig_dst);
7337 return INT64_MAX;
7338 #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7340 /* clang-format on */
7341 av1_merge_rd_stats(rd_stats, rd_stats_uv);
7342 #if CONFIG_RD_DEBUG
7343 // record transform block coefficient cost
7344 // TODO(angiebird): So far the rd_debug tool only detects discrepancies in
7345 // the coefficient cost. Therefore it is fine to copy rd_stats into mbmi
7346 // here, because we have already collected the coefficient cost. Move this
7347 // elsewhere when we need to compare non-coefficient costs.
7348 mbmi->rd_stats = *rd_stats;
7349 #endif // CONFIG_RD_DEBUG
7350 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7351 if (rd_stats->skip) {
7352 rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
7353 rd_stats_y->rate = 0;
7354 rd_stats_uv->rate = 0;
7355 rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
7356 mbmi->skip = 0;
7357 // Here mbmi->skip temporarily plays the role that this_skip2 plays elsewhere.
7358 } else if (!xd->lossless[mbmi->segment_id] &&
7359 (RDCOST(x->rdmult, x->rddiv,
7360 rd_stats_y->rate + rd_stats_uv->rate +
7361 av1_cost_bit(av1_get_skip_prob(cm, xd), 0),
7362 rd_stats->dist) >=
7363 RDCOST(x->rdmult, x->rddiv,
7364 av1_cost_bit(av1_get_skip_prob(cm, xd), 1),
7365 rd_stats->sse))) {
7366 rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
7367 rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
7368 rd_stats->dist = rd_stats->sse;
7369 rd_stats_y->rate = 0;
7370 rd_stats_uv->rate = 0;
7371 mbmi->skip = 1;
7372 } else {
7373 rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
7374 mbmi->skip = 0;
7376 *disable_skip = 0;
7377 #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7378 } else {
7379 x->skip = 1;
7380 *disable_skip = 1;
7381 mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, 1);
7383 // The cost of the skip bit needs to be added.
7384 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7385 mbmi->skip = 0;
7386 #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7387 rd_stats->rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
7389 rd_stats->dist = *skip_sse_sb;
7390 rd_stats->sse = *skip_sse_sb;
7391 rd_stats_y->rate = 0;
7392 rd_stats_uv->rate = 0;
7393 rd_stats->skip = 1;
7396 #if CONFIG_GLOBAL_MOTION
7397 if (this_mode == ZEROMV
7398 #if CONFIG_EXT_INTER
7399 || this_mode == ZERO_ZEROMV
7400 #endif // CONFIG_EXT_INTER
7401 ) {
7402 if (is_nontrans_global_motion(xd)) {
7403 rd_stats->rate -= rs;
7404 #if CONFIG_DUAL_FILTER
7405 mbmi->interp_filter[0] = cm->interp_filter == SWITCHABLE
7406 ? EIGHTTAP_REGULAR
7407 : cm->interp_filter;
7408 mbmi->interp_filter[1] = cm->interp_filter == SWITCHABLE
7409 ? EIGHTTAP_REGULAR
7410 : cm->interp_filter;
7411 #else
7412 mbmi->interp_filter = cm->interp_filter == SWITCHABLE
7413 ? EIGHTTAP_REGULAR
7414 : cm->interp_filter;
7415 #endif // CONFIG_DUAL_FILTER
7418 #endif // CONFIG_GLOBAL_MOTION
7420 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7421 tmp_rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist);
7422 if (mbmi->motion_mode == SIMPLE_TRANSLATION || (tmp_rd < best_rd)) {
7423 best_mbmi = *mbmi;
7424 best_rd = tmp_rd;
7425 best_rd_stats = *rd_stats;
7426 best_rd_stats_y = *rd_stats_y;
7427 best_rd_stats_uv = *rd_stats_uv;
7428 #if CONFIG_VAR_TX
7429 for (int i = 0; i < MAX_MB_PLANE; ++i)
7430 memcpy(best_blk_skip[i], x->blk_skip[i],
7431 sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
7432 #endif // CONFIG_VAR_TX
7433 best_xskip = x->skip;
7434 best_disable_skip = *disable_skip;
7438 if (best_rd == INT64_MAX) {
7439 av1_invalid_rd_stats(rd_stats);
7440 restore_dst_buf(xd, *orig_dst);
7441 return INT64_MAX;
7443 *mbmi = best_mbmi;
7444 *rd_stats = best_rd_stats;
7445 *rd_stats_y = best_rd_stats_y;
7446 *rd_stats_uv = best_rd_stats_uv;
7447 #if CONFIG_VAR_TX
7448 for (int i = 0; i < MAX_MB_PLANE; ++i)
7449 memcpy(x->blk_skip[i], best_blk_skip[i],
7450 sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
7451 #endif // CONFIG_VAR_TX
7452 x->skip = best_xskip;
7453 *disable_skip = best_disable_skip;
7454 #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7456 restore_dst_buf(xd, *orig_dst);
7457 return 0;
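// Full rate-distortion evaluation of one inter mode for the current
// reference frame (or reference pair): new-MV handling, interpolation
// filter search, compound type search (average/wedge/segment) and
// inter-intra search.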
7460 static int64_t handle_inter_mode(
7461 const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
7462 RD_STATS *rd_stats, RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
7463 int *disable_skip, int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME], int mi_row,
7464 int mi_col, HandleInterModeArgs *args, const int64_t ref_best_rd) {
7465 const AV1_COMMON *cm = &cpi->common;
7466 (void)cm;
7467 MACROBLOCKD *xd = &x->e_mbd;
7468 MODE_INFO *mi = xd->mi[0];
7469 MB_MODE_INFO *mbmi = &mi->mbmi;
7470 MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
7471 const int is_comp_pred = has_second_ref(mbmi);
7472 const int this_mode = mbmi->mode;
7473 int_mv *frame_mv = mode_mv[this_mode];
7474 int i;
7475 int refs[2] = { mbmi->ref_frame[0],
7476 (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
7477 int_mv cur_mv[2];
7478 int rate_mv = 0;
7479 #if CONFIG_EXT_INTER
7480 int pred_exists = 1;
7481 #if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
7482 const int bw = block_size_wide[bsize];
7483 #endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
7484 int_mv single_newmv[TOTAL_REFS_PER_FRAME];
7485 #if CONFIG_INTERINTRA
7486 const unsigned int *const interintra_mode_cost =
7487 cpi->interintra_mode_cost[size_group_lookup[bsize]];
7488 #endif // CONFIG_INTERINTRA
7489 const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME);
7490 uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
7491 #else
7492 int_mv *const single_newmv = args->single_newmv;
7493 #endif // CONFIG_EXT_INTER
7494 #if CONFIG_HIGHBITDEPTH
7495 DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
7496 #else
7497 DECLARE_ALIGNED(16, uint8_t, tmp_buf_[MAX_MB_PLANE * MAX_SB_SQUARE]);
7498 #endif // CONFIG_HIGHBITDEPTH
7499 uint8_t *tmp_buf;
7501 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7502 #if CONFIG_EXT_INTER
7503 int rate2_bmc_nocoeff;
7504 MB_MODE_INFO best_bmc_mbmi;
7505 #if CONFIG_MOTION_VAR
7506 int rate_mv_bmc;
7507 #endif // CONFIG_MOTION_VAR
7508 #endif // CONFIG_EXT_INTER
7509 #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7510 int64_t rd = INT64_MAX;
7511 BUFFER_SET orig_dst, tmp_dst;
7512 int rs = 0;
7514 int skip_txfm_sb = 0;
7515 int64_t skip_sse_sb = INT64_MAX;
7516 int16_t mode_ctx;
7518 #if CONFIG_EXT_INTER
7519 #if CONFIG_INTERINTRA
7520 int compmode_interintra_cost = 0;
7521 mbmi->use_wedge_interintra = 0;
7522 #endif
7523 #if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
7524 int compmode_interinter_cost = 0;
7525 mbmi->interinter_compound_type = COMPOUND_AVERAGE;
7526 #endif
7528 #if CONFIG_INTERINTRA
7529 if (!cm->allow_interintra_compound && is_comp_interintra_pred)
7530 return INT64_MAX;
7531 #endif // CONFIG_INTERINTRA
7533 // is_comp_interintra_pred implies !is_comp_pred
7534 assert(!is_comp_interintra_pred || (!is_comp_pred));
7535 // is_comp_interintra_pred implies is_interintra_allowed(mbmi->sb_type)
7536 assert(!is_comp_interintra_pred || is_interintra_allowed(mbmi));
7537 #endif // CONFIG_EXT_INTER
7539 #if CONFIG_EXT_INTER
7540 if (is_comp_pred)
7541 mode_ctx = mbmi_ext->compound_mode_context[refs[0]];
7542 else
7543 #endif // CONFIG_EXT_INTER
7544 mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
7545 mbmi->ref_frame, bsize, -1);
7547 #if CONFIG_HIGHBITDEPTH
7548 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
7549 tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_);
7550 else
7551 #endif // CONFIG_HIGHBITDEPTH
7552 tmp_buf = tmp_buf_;
7553 // Make sure that we didn't leave the plane destination buffers set
7554 // to tmp_buf at the end of the last iteration
7555 assert(xd->plane[0].dst.buf != tmp_buf);
7557 #if CONFIG_WARPED_MOTION
7558 mbmi->num_proj_ref[0] = 0;
7559 mbmi->num_proj_ref[1] = 0;
7560 #endif // CONFIG_WARPED_MOTION
7562 if (is_comp_pred) {
7563 if (frame_mv[refs[0]].as_int == INVALID_MV ||
7564 frame_mv[refs[1]].as_int == INVALID_MV)
7565 return INT64_MAX;
7568 mbmi->motion_mode = SIMPLE_TRANSLATION;
7569 if (have_newmv_in_inter_mode(this_mode)) {
7570 const int64_t ret_val = handle_newmv(cpi, x, bsize, mode_mv, mi_row, mi_col,
7571 &rate_mv, single_newmv, args);
7572 if (ret_val != 0)
7573 return ret_val;
7574 else
7575 rd_stats->rate += rate_mv;
7577 for (i = 0; i < is_comp_pred + 1; ++i) {
7578 cur_mv[i] = frame_mv[refs[i]];
7579 // Clip "next_nearest" so that it does not extend to far out of image
7580 if (this_mode != NEWMV) clamp_mv2(&cur_mv[i].as_mv, xd);
7581 if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX;
7582 mbmi->mv[i].as_int = cur_mv[i].as_int;
7585 #if CONFIG_EXT_INTER
7586 if (this_mode == NEAREST_NEARESTMV)
7587 #else
7588 if (this_mode == NEARESTMV && is_comp_pred)
7589 #endif // CONFIG_EXT_INTER
7591 #if !CONFIG_EXT_INTER
7592 uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
7593 #endif // !CONFIG_EXT_INTER
7594 if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) {
7595 cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
7596 cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
7598 for (i = 0; i < 2; ++i) {
7599 clamp_mv2(&cur_mv[i].as_mv, xd);
7600 if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX;
7601 mbmi->mv[i].as_int = cur_mv[i].as_int;
7606 #if CONFIG_EXT_INTER
7607 if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) {
7608 if (this_mode == NEAREST_NEWMV) {
7609 cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
7611 lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv);
7612 clamp_mv2(&cur_mv[0].as_mv, xd);
7613 if (mv_check_bounds(&x->mv_limits, &cur_mv[0].as_mv)) return INT64_MAX;
7614 mbmi->mv[0].as_int = cur_mv[0].as_int;
7617 if (this_mode == NEW_NEARESTMV) {
7618 cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
7620 lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv);
7621 clamp_mv2(&cur_mv[1].as_mv, xd);
7622 if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX;
7623 mbmi->mv[1].as_int = cur_mv[1].as_int;
7627 if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
7628 int ref_mv_idx = mbmi->ref_mv_idx + 1;
7629 if (this_mode == NEAR_NEWMV || this_mode == NEAR_NEARMV) {
7630 cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
7632 lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv);
7633 clamp_mv2(&cur_mv[0].as_mv, xd);
7634 if (mv_check_bounds(&x->mv_limits, &cur_mv[0].as_mv)) return INT64_MAX;
7635 mbmi->mv[0].as_int = cur_mv[0].as_int;
7638 if (this_mode == NEW_NEARMV || this_mode == NEAR_NEARMV) {
7639 cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
7641 lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv);
7642 clamp_mv2(&cur_mv[1].as_mv, xd);
7643 if (mv_check_bounds(&x->mv_limits, &cur_mv[1].as_mv)) return INT64_MAX;
7644 mbmi->mv[1].as_int = cur_mv[1].as_int;
7647 #else
7648 if (this_mode == NEARMV && is_comp_pred) {
7649 uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
7650 if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
7651 int ref_mv_idx = mbmi->ref_mv_idx + 1;
7652 cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
7653 cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
7655 for (i = 0; i < 2; ++i) {
7656 clamp_mv2(&cur_mv[i].as_mv, xd);
7657 if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX;
7658 mbmi->mv[i].as_int = cur_mv[i].as_int;
7662 #endif // CONFIG_EXT_INTER
7664 // Do the first prediction into the destination buffer, and the next
7665 // prediction into a temporary buffer. Then keep track of which one
7666 // of these currently holds the best predictor, and use the other
7667 // one for future predictions. In the end, copy from tmp_buf to
7668 // dst if necessary.
7669 for (i = 0; i < MAX_MB_PLANE; i++) {
7670 tmp_dst.plane[i] = tmp_buf + i * MAX_SB_SQUARE;
7671 tmp_dst.stride[i] = MAX_SB_SIZE;
7673 for (i = 0; i < MAX_MB_PLANE; i++) {
7674 orig_dst.plane[i] = xd->plane[i].dst.buf;
7675 orig_dst.stride[i] = xd->plane[i].dst.stride;
7678 // We don't include the cost of the second reference here, because there
7679 // are only three options: Last/Golden, ARF/Last or Golden/ARF. In other
7680 // words, if the references are presented in that order, the second one is
7681 // always known once the first is known.
7683 // Under some circumstances we discount the cost of new mv mode to encourage
7684 // initiation of a motion field.
7685 if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]], mode_mv,
7686 refs[0])) {
7687 #if CONFIG_EXT_INTER
7688 rd_stats->rate +=
7689 AOMMIN(cost_mv_ref(cpi, this_mode, mode_ctx),
7690 cost_mv_ref(cpi, is_comp_pred ? NEAREST_NEARESTMV : NEARESTMV,
7691 mode_ctx));
7692 #else
7693 rd_stats->rate += AOMMIN(cost_mv_ref(cpi, this_mode, mode_ctx),
7694 cost_mv_ref(cpi, NEARESTMV, mode_ctx));
7695 #endif // CONFIG_EXT_INTER
7696 } else {
7697 rd_stats->rate += cost_mv_ref(cpi, this_mode, mode_ctx);
7700 if (RDCOST(x->rdmult, x->rddiv, rd_stats->rate, 0) > ref_best_rd &&
7701 #if CONFIG_EXT_INTER
7702 mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV
7703 #else
7704 mbmi->mode != NEARESTMV
7705 #endif // CONFIG_EXT_INTER
7707 return INT64_MAX;
7709 int64_t ret_val = interpolation_filter_search(
7710 x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst, args->single_filter,
7711 &rd, &rs, &skip_txfm_sb, &skip_sse_sb);
7712 if (ret_val != 0) return ret_val;
7714 #if CONFIG_EXT_INTER
7715 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7716 best_bmc_mbmi = *mbmi;
7717 rate2_bmc_nocoeff = rd_stats->rate;
7718 if (cm->interp_filter == SWITCHABLE) rate2_bmc_nocoeff += rs;
7719 #if CONFIG_MOTION_VAR
7720 rate_mv_bmc = rate_mv;
7721 #endif // CONFIG_MOTION_VAR
7722 #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
7724 #if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
7725 if (is_comp_pred) {
7726 int rate_sum, rs2;
7727 int64_t dist_sum;
7728 int64_t best_rd_compound = INT64_MAX, best_rd_cur = INT64_MAX;
7729 INTERINTER_COMPOUND_DATA best_compound_data;
7730 int_mv best_mv[2];
7731 int best_tmp_rate_mv = rate_mv;
7732 int tmp_skip_txfm_sb;
7733 int64_t tmp_skip_sse_sb;
7734 int compound_type_cost[COMPOUND_TYPES];
7735 uint8_t pred0[2 * MAX_SB_SQUARE];
7736 uint8_t pred1[2 * MAX_SB_SQUARE];
7737 uint8_t *preds0[1] = { pred0 };
7738 uint8_t *preds1[1] = { pred1 };
7739 int strides[1] = { bw };
7740 int tmp_rate_mv;
7741 int masked_compound_used = is_any_masked_compound_used(bsize);
7742 #if CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
7743 masked_compound_used = masked_compound_used && cm->allow_masked_compound;
7744 #endif // CONFIG_COMPOUND_SEGMENT || CONFIG_WEDGE
7745 COMPOUND_TYPE cur_type;
7747 best_mv[0].as_int = cur_mv[0].as_int;
7748 best_mv[1].as_int = cur_mv[1].as_int;
7749 memset(&best_compound_data, 0, sizeof(best_compound_data));
7750 #if CONFIG_COMPOUND_SEGMENT
7751 uint8_t tmp_mask_buf[2 * MAX_SB_SQUARE];
7752 best_compound_data.seg_mask = tmp_mask_buf;
7753 #endif // CONFIG_COMPOUND_SEGMENT
7755 if (masked_compound_used) {
7756 av1_cost_tokens(compound_type_cost, cm->fc->compound_type_prob[bsize],
7757 av1_compound_type_tree);
7758 // get inter predictors to use for masked compound modes
7759 av1_build_inter_predictors_for_planes_single_buf(
7760 xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides);
7761 av1_build_inter_predictors_for_planes_single_buf(
7762 xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides);
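// Try each compound type in turn (COMPOUND_AVERAGE first, then the masked
// types when allowed), keeping the type, mask and refined MVs that give the
// lowest estimated RD cost.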
7765 for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) {
7766 if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break;
7767 if (!is_interinter_compound_used(cur_type, bsize)) break;
7768 tmp_rate_mv = rate_mv;
7769 best_rd_cur = INT64_MAX;
7770 mbmi->interinter_compound_type = cur_type;
7771 rs2 = av1_cost_literal(get_interinter_compound_type_bits(
7772 bsize, mbmi->interinter_compound_type)) +
7773 (masked_compound_used
7774 ? compound_type_cost[mbmi->interinter_compound_type]
7775 : 0);
7777 switch (cur_type) {
7778 case COMPOUND_AVERAGE:
7779 av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst,
7780 bsize);
7781 av1_subtract_plane(x, bsize, 0);
7782 rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
7783 &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
7784 INT64_MAX);
7785 if (rd != INT64_MAX)
7786 best_rd_cur =
7787 RDCOST(x->rdmult, x->rddiv, rs2 + rate_mv + rate_sum, dist_sum);
7788 best_rd_compound = best_rd_cur;
7789 break;
7790 #if CONFIG_WEDGE
7791 case COMPOUND_WEDGE:
7792 if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
7793 best_rd_compound / 3 < ref_best_rd) {
7794 best_rd_cur = build_and_cost_compound_type(
7795 cpi, x, cur_mv, bsize, this_mode, rs2, rate_mv, &orig_dst,
7796 &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col);
7798 break;
7799 #endif // CONFIG_WEDGE
7800 #if CONFIG_COMPOUND_SEGMENT
7801 case COMPOUND_SEG:
7802 if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
7803 best_rd_compound / 3 < ref_best_rd) {
7804 best_rd_cur = build_and_cost_compound_type(
7805 cpi, x, cur_mv, bsize, this_mode, rs2, rate_mv, &orig_dst,
7806 &tmp_rate_mv, preds0, preds1, strides, mi_row, mi_col);
7808 break;
7809 #endif // CONFIG_COMPOUND_SEGMENT
7810 default: assert(0); return 0;
7813 if (best_rd_cur < best_rd_compound) {
7814 best_rd_compound = best_rd_cur;
7815 #if CONFIG_WEDGE
7816 best_compound_data.wedge_index = mbmi->wedge_index;
7817 best_compound_data.wedge_sign = mbmi->wedge_sign;
7818 #endif // CONFIG_WEDGE
7819 #if CONFIG_COMPOUND_SEGMENT
7820 best_compound_data.mask_type = mbmi->mask_type;
7821 memcpy(best_compound_data.seg_mask, xd->seg_mask,
7822 2 * MAX_SB_SQUARE * sizeof(uint8_t));
7823 #endif // CONFIG_COMPOUND_SEGMENT
7824 best_compound_data.interinter_compound_type =
7825 mbmi->interinter_compound_type;
7826 if (have_newmv_in_inter_mode(this_mode)) {
7827 if (use_masked_motion_search(cur_type)) {
7828 best_tmp_rate_mv = tmp_rate_mv;
7829 best_mv[0].as_int = mbmi->mv[0].as_int;
7830 best_mv[1].as_int = mbmi->mv[1].as_int;
7831 } else {
7832 best_mv[0].as_int = cur_mv[0].as_int;
7833 best_mv[1].as_int = cur_mv[1].as_int;
7837 // reset to original mvs for next iteration
7838 mbmi->mv[0].as_int = cur_mv[0].as_int;
7839 mbmi->mv[1].as_int = cur_mv[1].as_int;
7841 #if CONFIG_WEDGE
7842 mbmi->wedge_index = best_compound_data.wedge_index;
7843 mbmi->wedge_sign = best_compound_data.wedge_sign;
7844 #endif // CONFIG_WEDGE
7845 #if CONFIG_COMPOUND_SEGMENT
7846 mbmi->mask_type = best_compound_data.mask_type;
7847 memcpy(xd->seg_mask, best_compound_data.seg_mask,
7848 2 * MAX_SB_SQUARE * sizeof(uint8_t));
7849 #endif // CONFIG_COMPOUND_SEGMENT
7850 mbmi->interinter_compound_type =
7851 best_compound_data.interinter_compound_type;
7852 if (have_newmv_in_inter_mode(this_mode)) {
7853 mbmi->mv[0].as_int = best_mv[0].as_int;
7854 mbmi->mv[1].as_int = best_mv[1].as_int;
7855 xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
7856 xd->mi[0]->bmi[0].as_mv[1].as_int = mbmi->mv[1].as_int;
7857 if (use_masked_motion_search(mbmi->interinter_compound_type)) {
7858 rd_stats->rate += best_tmp_rate_mv - rate_mv;
7859 rate_mv = best_tmp_rate_mv;
7863 if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) {
7864 restore_dst_buf(xd, orig_dst);
7865 return INT64_MAX;
7868 pred_exists = 0;
7870 compmode_interinter_cost =
7871 av1_cost_literal(get_interinter_compound_type_bits(
7872 bsize, mbmi->interinter_compound_type)) +
7873 (masked_compound_used
7874 ? compound_type_cost[mbmi->interinter_compound_type]
7875 : 0);
7877 #endif // CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
7879 #if CONFIG_INTERINTRA
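// Inter-intra search: build the inter predictor once, test every
// INTERINTRA_MODE with a modeled RD cost, re-estimate the best mode with a
// real transform search, and optionally add a wedge (with MV refinement for
// NEWMV modes).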
7880 if (is_comp_interintra_pred) {
7881 INTERINTRA_MODE best_interintra_mode = II_DC_PRED;
7882 int64_t best_interintra_rd = INT64_MAX;
7883 int rmode, rate_sum;
7884 int64_t dist_sum;
7885 int j;
7886 int tmp_rate_mv = 0;
7887 int tmp_skip_txfm_sb;
7888 int64_t tmp_skip_sse_sb;
7889 DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_SB_SQUARE]);
7890 uint8_t *intrapred;
7892 #if CONFIG_HIGHBITDEPTH
7893 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
7894 intrapred = CONVERT_TO_BYTEPTR(intrapred_);
7895 else
7896 #endif // CONFIG_HIGHBITDEPTH
7897 intrapred = intrapred_;
7899 mbmi->ref_frame[1] = NONE_FRAME;
7900 for (j = 0; j < MAX_MB_PLANE; j++) {
7901 xd->plane[j].dst.buf = tmp_buf + j * MAX_SB_SQUARE;
7902 xd->plane[j].dst.stride = bw;
7904 av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst, bsize);
7905 restore_dst_buf(xd, orig_dst);
7906 mbmi->ref_frame[1] = INTRA_FRAME;
7907 mbmi->use_wedge_interintra = 0;
7909 for (j = 0; j < INTERINTRA_MODES; ++j) {
7910 mbmi->interintra_mode = (INTERINTRA_MODE)j;
7911 rmode = interintra_mode_cost[mbmi->interintra_mode];
7912 av1_build_intra_predictors_for_interintra(xd, bsize, 0, &orig_dst,
7913 intrapred, bw);
7914 av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
7915 model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
7916 &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
7917 rd =
7918 RDCOST(x->rdmult, x->rddiv, tmp_rate_mv + rate_sum + rmode, dist_sum);
7919 if (rd < best_interintra_rd) {
7920 best_interintra_rd = rd;
7921 best_interintra_mode = mbmi->interintra_mode;
7924 mbmi->interintra_mode = best_interintra_mode;
7925 rmode = interintra_mode_cost[mbmi->interintra_mode];
7926 av1_build_intra_predictors_for_interintra(xd, bsize, 0, &orig_dst,
7927 intrapred, bw);
7928 av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
7929 av1_subtract_plane(x, bsize, 0);
7930 rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
7931 &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
7932 if (rd != INT64_MAX)
7933 rd = RDCOST(x->rdmult, x->rddiv, rate_mv + rmode + rate_sum, dist_sum);
7934 best_interintra_rd = rd;
7936 if (ref_best_rd < INT64_MAX && best_interintra_rd > 2 * ref_best_rd) {
7937 // Don't need to call restore_dst_buf here
7938 return INT64_MAX;
7940 #if CONFIG_WEDGE
7941 if (is_interintra_wedge_used(bsize)) {
7942 int64_t best_interintra_rd_nowedge = INT64_MAX;
7943 int64_t best_interintra_rd_wedge = INT64_MAX;
7944 int_mv tmp_mv;
7945 int rwedge = av1_cost_bit(cm->fc->wedge_interintra_prob[bsize], 0);
7946 if (rd != INT64_MAX)
7947 rd = RDCOST(x->rdmult, x->rddiv, rmode + rate_mv + rwedge + rate_sum,
7948 dist_sum);
7949 best_interintra_rd_nowedge = best_interintra_rd;
7951 // Disable wedge search if source variance is small
7952 if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) {
7953 mbmi->use_wedge_interintra = 1;
7955 rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) +
7956 av1_cost_bit(cm->fc->wedge_interintra_prob[bsize], 1);
7958 best_interintra_rd_wedge =
7959 pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
7961 best_interintra_rd_wedge +=
7962 RDCOST(x->rdmult, x->rddiv, rmode + rate_mv + rwedge, 0);
7963 // Refine motion vector.
7964 if (have_newmv_in_inter_mode(this_mode)) {
7965 // get negative of mask
7966 const uint8_t *mask = av1_get_contiguous_soft_mask(
7967 mbmi->interintra_wedge_index, 1, bsize);
7968 tmp_mv.as_int = x->mbmi_ext->ref_mvs[refs[0]][0].as_int;
7969 compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row,
7970 mi_col, intrapred, mask, bw,
7971 &tmp_rate_mv, 0, 0);
7972 mbmi->mv[0].as_int = tmp_mv.as_int;
7973 av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, &orig_dst,
7974 bsize);
7975 model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
7976 &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
7977 rd = RDCOST(x->rdmult, x->rddiv,
7978 rmode + tmp_rate_mv + rwedge + rate_sum, dist_sum);
7979 if (rd >= best_interintra_rd_wedge) {
7980 tmp_mv.as_int = cur_mv[0].as_int;
7981 tmp_rate_mv = rate_mv;
7983 } else {
7984 tmp_mv.as_int = cur_mv[0].as_int;
7985 tmp_rate_mv = rate_mv;
7986 av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
7988 // Evaluate closer to the true RD cost.
7989 av1_subtract_plane(x, bsize, 0);
7990 rd =
7991 estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
7992 &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
7993 if (rd != INT64_MAX)
7994 rd = RDCOST(x->rdmult, x->rddiv,
7995 rmode + tmp_rate_mv + rwedge + rate_sum, dist_sum);
7996 best_interintra_rd_wedge = rd;
7997 if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
7998 mbmi->use_wedge_interintra = 1;
7999 mbmi->mv[0].as_int = tmp_mv.as_int;
8000 rd_stats->rate += tmp_rate_mv - rate_mv;
8001 rate_mv = tmp_rate_mv;
8002 } else {
8003 mbmi->use_wedge_interintra = 0;
8004 mbmi->mv[0].as_int = cur_mv[0].as_int;
8006 } else {
8007 mbmi->use_wedge_interintra = 0;
8010 #endif // CONFIG_WEDGE
8012 pred_exists = 0;
8013 compmode_interintra_cost =
8014 av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 1) +
8015 interintra_mode_cost[mbmi->interintra_mode];
8016 if (is_interintra_wedge_used(bsize)) {
8017 compmode_interintra_cost += av1_cost_bit(
8018 cm->fc->wedge_interintra_prob[bsize], mbmi->use_wedge_interintra);
8019 if (mbmi->use_wedge_interintra) {
8020 compmode_interintra_cost +=
8021 av1_cost_literal(get_interintra_wedge_bits(bsize));
8024 } else if (is_interintra_allowed(mbmi)) {
8025 compmode_interintra_cost =
8026 av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 0);
8028 #endif // CONFIG_INTERINTRA
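// If the destination buffers no longer hold the final prediction
// (pred_exists == 0), rebuild the inter predictor for all planes and refresh
// the model-based rate/distortion estimate.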
8030 if (pred_exists == 0) {
8031 int tmp_rate;
8032 int64_t tmp_dist;
8033 av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, &orig_dst, bsize);
8034 model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
8035 &tmp_dist, &skip_txfm_sb, &skip_sse_sb);
8036 rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
8038 #endif // CONFIG_EXT_INTER
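// Record the interpolation filter selected for this single-reference mode
// for possible reuse in later searches.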
8040 if (!is_comp_pred)
8041 #if CONFIG_DUAL_FILTER
8042 args->single_filter[this_mode][refs[0]] = mbmi->interp_filter[0];
8043 #else
8044 args->single_filter[this_mode][refs[0]] = mbmi->interp_filter;
8045 #endif // CONFIG_DUAL_FILTER
8047 #if CONFIG_EXT_INTER
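// Prune a compound mode early when its modeled RD exceeds 4/3 of the better
// of the two corresponding single-reference modeled RDs
// (rd / 4 * 3 > mrd is equivalent to rd > 4 * mrd / 3).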
8048 if (args->modelled_rd != NULL) {
8049 if (is_comp_pred) {
8050 const int mode0 = compound_ref0_mode(this_mode);
8051 const int mode1 = compound_ref1_mode(this_mode);
8052 const int64_t mrd = AOMMIN(args->modelled_rd[mode0][refs[0]],
8053 args->modelled_rd[mode1][refs[1]]);
8054 if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) {
8055 restore_dst_buf(xd, orig_dst);
8056 return INT64_MAX;
8058 } else if (!is_comp_interintra_pred) {
8059 args->modelled_rd[this_mode][refs[0]] = rd;
8062 #endif // CONFIG_EXT_INTER
8064 if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
8065 // If the current modeled RD (based on prediction error) is substantially
8066 // worse than the best so far, do not bother doing a full RD evaluation.
8067 if (rd / 2 > ref_best_rd) {
8068 restore_dst_buf(xd, orig_dst);
8069 return INT64_MAX;
8073 #if CONFIG_EXT_INTER
8074 #if CONFIG_INTERINTRA
8075 rd_stats->rate += compmode_interintra_cost;
8076 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
8077 rate2_bmc_nocoeff += compmode_interintra_cost;
8078 #endif
8079 #endif
8080 #if CONFIG_WEDGE || CONFIG_COMPOUND_SEGMENT
8081 rd_stats->rate += compmode_interinter_cost;
8082 #endif
8083 #endif
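// Finally, hand off to motion_mode_rd() to evaluate the available motion
// modes (e.g. OBMC or warped motion when enabled) and compute the full
// rate-distortion cost of this inter mode.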
8085 ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv,
8086 disable_skip, mode_mv, mi_row, mi_col, args,
8087 ref_best_rd, refs, rate_mv,
8088 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
8089 single_newmv,
8090 #if CONFIG_EXT_INTER
8091 rate2_bmc_nocoeff, &best_bmc_mbmi,
8092 #if CONFIG_MOTION_VAR
8093 rate_mv_bmc,
8094 #endif // CONFIG_MOTION_VAR
8095 #endif // CONFIG_EXT_INTER
8096 #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
8097 rs, &skip_txfm_sb, &skip_sse_sb, &orig_dst);
8098 if (ret_val != 0) return ret_val;
8100 return 0; // The rate-distortion cost will be re-calculated by the caller.
8103 #if CONFIG_INTRABC
8104 static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
8105 RD_STATS *rd_cost, BLOCK_SIZE bsize,
8106 int64_t best_rd) {
8107 const AV1_COMMON *const cm = &cpi->common;
8108 if (bsize < BLOCK_8X8 || !cm->allow_screen_content_tools) return INT64_MAX;
8110 MACROBLOCKD *const xd = &x->e_mbd;
8111 const TileInfo *tile = &xd->tile;
8112 MODE_INFO *const mi = xd->mi[0];
8113 const int mi_row = -xd->mb_to_top_edge / (8 * MI_SIZE);
8114 const int mi_col = -xd->mb_to_left_edge / (8 * MI_SIZE);
8115 const int w = block_size_wide[bsize];
8116 const int h = block_size_high[bsize];
8117 const int sb_row = mi_row / MAX_MIB_SIZE;
8118 const int sb_col = mi_col / MAX_MIB_SIZE;
8120 MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
8121 MV_REFERENCE_FRAME ref_frame = INTRA_FRAME;
8122 int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
8123 av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame],
8124 mbmi_ext->ref_mv_stack[ref_frame],
8125 #if CONFIG_EXT_INTER
8126 mbmi_ext->compound_mode_context,
8127 #endif // CONFIG_EXT_INTER
8128 candidates, mi_row, mi_col, NULL, NULL,
8129 mbmi_ext->mode_context);
8131 int_mv nearestmv, nearmv;
8132 av1_find_best_ref_mvs(0, candidates, &nearestmv, &nearmv);
8134 int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv;
8135 if (dv_ref.as_int == 0) av1_find_ref_dv(&dv_ref, mi_row, mi_col);
8136 mbmi_ext->ref_mvs[INTRA_FRAME][0] = dv_ref;
8138 struct buf_2d yv12_mb[MAX_MB_PLANE];
8139 av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, mi_row, mi_col, NULL, NULL);
8140 for (int i = 0; i < MAX_MB_PLANE; ++i) {
8141 xd->plane[i].pre[0] = yv12_mb[i];
8144 enum IntrabcMotionDirection {
8145 IBC_MOTION_ABOVE,
8146 IBC_MOTION_LEFT,
8147 IBC_MOTION_DIRECTIONS
8150 MB_MODE_INFO *mbmi = &mi->mbmi;
8151 MB_MODE_INFO best_mbmi = *mbmi;
8152 RD_STATS best_rdcost = *rd_cost;
8153 int best_skip = x->skip;
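// Run the displacement-vector search twice: once restricted to the region
// above the current superblock and once restricted to the region to its
// left, keeping whichever candidate gives the lower RD cost.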
8155 for (enum IntrabcMotionDirection dir = IBC_MOTION_ABOVE;
8156 dir < IBC_MOTION_DIRECTIONS; ++dir) {
8157 const MvLimits tmp_mv_limits = x->mv_limits;
8158 switch (dir) {
8159 case IBC_MOTION_ABOVE:
8160 x->mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE;
8161 x->mv_limits.col_max = (tile->mi_col_end - mi_col) * MI_SIZE - w;
8162 x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE;
8163 x->mv_limits.row_max = (sb_row * MAX_MIB_SIZE - mi_row) * MI_SIZE - h;
8164 break;
8165 case IBC_MOTION_LEFT:
8166 x->mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE;
8167 x->mv_limits.col_max = (sb_col * MAX_MIB_SIZE - mi_col) * MI_SIZE - w;
8168 // TODO(aconverse@google.com): Minimize the overlap between above and
8169 // left areas.
8170 x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE;
8171 int bottom_coded_mi_edge =
8172 AOMMIN((sb_row + 1) * MAX_MIB_SIZE, tile->mi_row_end);
8173 x->mv_limits.row_max = (bottom_coded_mi_edge - mi_row) * MI_SIZE - h;
8174 break;
8175 default: assert(0);
8177 assert(x->mv_limits.col_min >= tmp_mv_limits.col_min);
8178 assert(x->mv_limits.col_max <= tmp_mv_limits.col_max);
8179 assert(x->mv_limits.row_min >= tmp_mv_limits.row_min);
8180 assert(x->mv_limits.row_max <= tmp_mv_limits.row_max);
8181 av1_set_mv_search_range(&x->mv_limits, &dv_ref.as_mv);
8183 if (x->mv_limits.col_max < x->mv_limits.col_min ||
8184 x->mv_limits.row_max < x->mv_limits.row_min) {
8185 x->mv_limits = tmp_mv_limits;
8186 continue;
8189 int step_param = cpi->mv_step_param;
8190 MV mvp_full = dv_ref.as_mv;
8191 mvp_full.col >>= 3;
8192 mvp_full.row >>= 3;
8193 int sadpb = x->sadperbit16;
8194 int cost_list[5];
8195 int bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
8196 sadpb, cond_cost_list(cpi, cost_list),
8197 &dv_ref.as_mv, INT_MAX, 1);
8199 x->mv_limits = tmp_mv_limits;
8200 if (bestsme == INT_MAX) continue;
8201 mvp_full = x->best_mv.as_mv;
8202 MV dv = {.row = mvp_full.row * 8, .col = mvp_full.col * 8 };
8203 if (mv_check_bounds(&x->mv_limits, &dv)) continue;
8204 if (!is_dv_valid(dv, tile, mi_row, mi_col, bsize)) continue;
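// A valid DV was found: code the block as DC_PRED with intrabc, build the
// prediction, and compare the RD cost with and without coefficient skip.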
8206 #if CONFIG_PALETTE
8207 memset(&mbmi->palette_mode_info, 0, sizeof(mbmi->palette_mode_info));
8208 #endif
8209 mbmi->use_intrabc = 1;
8210 mbmi->mode = DC_PRED;
8211 mbmi->uv_mode = DC_PRED;
8212 mbmi->mv[0].as_mv = dv;
8213 #if CONFIG_DUAL_FILTER
8214 for (int idx = 0; idx < 4; ++idx) mbmi->interp_filter[idx] = BILINEAR;
8215 #else
8216 mbmi->interp_filter = BILINEAR;
8217 #endif
8218 mbmi->skip = 0;
8219 x->skip = 0;
8220 av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
8222 int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, x->nmvjointcost,
8223 x->mvcost, MV_COST_WEIGHT);
8224 const PREDICTION_MODE A = av1_above_block_mode(mi, xd->above_mi, 0);
8225 const PREDICTION_MODE L = av1_left_block_mode(mi, xd->left_mi, 0);
8226 const int rate_mode =
8227 cpi->y_mode_costs[A][L][DC_PRED] + av1_cost_bit(INTRABC_PROB, 1);
8229 RD_STATS rd_stats, rd_stats_uv;
8230 av1_subtract_plane(x, bsize, 0);
8231 super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX);
8232 super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
8233 av1_merge_rd_stats(&rd_stats, &rd_stats_uv);
8234 #if CONFIG_RD_DEBUG
8235 mbmi->rd_stats = rd_stats;
8236 #endif
8238 #if CONFIG_VAR_TX
8239 // TODO(aconverse@google.com): Evaluate allowing VAR TX on intrabc blocks
8240 const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
8241 const int height = block_size_high[bsize] >> tx_size_high_log2[0];
8242 int idx, idy;
8243 for (idy = 0; idy < height; ++idy)
8244 for (idx = 0; idx < width; ++idx)
8245 mbmi->inter_tx_size[idy >> 1][idx >> 1] = mbmi->tx_size;
8246 mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
8247 #endif // CONFIG_VAR_TX
8249 const aom_prob skip_prob = av1_get_skip_prob(cm, xd);
8251 RD_STATS rdc_noskip;
8252 av1_init_rd_stats(&rdc_noskip);
8253 rdc_noskip.rate =
8254 rate_mode + rate_mv + rd_stats.rate + av1_cost_bit(skip_prob, 0);
8255 rdc_noskip.dist = rd_stats.dist;
8256 rdc_noskip.rdcost =
8257 RDCOST(x->rdmult, x->rddiv, rdc_noskip.rate, rdc_noskip.dist);
8258 if (rdc_noskip.rdcost < best_rd) {
8259 best_rd = rdc_noskip.rdcost;
8260 best_mbmi = *mbmi;
8261 best_skip = x->skip;
8262 best_rdcost = rdc_noskip;
8265 x->skip = 1;
8266 mbmi->skip = 1;
8267 RD_STATS rdc_skip;
8268 av1_init_rd_stats(&rdc_skip);
8269 rdc_skip.rate = rate_mode + rate_mv + av1_cost_bit(skip_prob, 1);
8270 rdc_skip.dist = rd_stats.sse;
8271 rdc_skip.rdcost = RDCOST(x->rdmult, x->rddiv, rdc_skip.rate, rdc_skip.dist);
8272 if (rdc_skip.rdcost < best_rd) {
8273 best_rd = rdc_skip.rdcost;
8274 best_mbmi = *mbmi;
8275 best_skip = x->skip;
8276 best_rdcost = rdc_skip;
8279 *mbmi = best_mbmi;
8280 *rd_cost = best_rdcost;
8281 x->skip = best_skip;
8282 return best_rd;
8284 #endif // CONFIG_INTRABC
8286 void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
8287 RD_STATS *rd_cost, BLOCK_SIZE bsize,
8288 PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
8289 const AV1_COMMON *const cm = &cpi->common;
8290 MACROBLOCKD *const xd = &x->e_mbd;
8291 struct macroblockd_plane *const pd = xd->plane;
8292 int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
8293 int y_skip = 0, uv_skip = 0;
8294 int64_t dist_y = 0, dist_uv = 0;
8295 TX_SIZE max_uv_tx_size;
8296 const int unify_bsize = CONFIG_CB4X4;
8298 ctx->skip = 0;
8299 xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
8300 xd->mi[0]->mbmi.ref_frame[1] = NONE_FRAME;
8301 #if CONFIG_INTRABC
8302 xd->mi[0]->mbmi.use_intrabc = 0;
8303 xd->mi[0]->mbmi.mv[0].as_int = 0;
8304 #endif // CONFIG_INTRABC
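// Pick the best luma intra mode first; the chroma search below only runs if
// the luma-only RD cost already beats best_rd.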
8306 const int64_t intra_yrd =
8307 (bsize >= BLOCK_8X8 || unify_bsize)
8308 ? rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y,
8309 &y_skip, bsize, best_rd)
8310 : rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
8311 &dist_y, &y_skip, best_rd);
8313 if (intra_yrd < best_rd) {
8314 max_uv_tx_size = uv_txsize_lookup[bsize][xd->mi[0]->mbmi.tx_size]
8315 [pd[1].subsampling_x][pd[1].subsampling_y];
8316 init_sbuv_mode(&xd->mi[0]->mbmi);
8317 #if CONFIG_CB4X4
8318 if (!x->skip_chroma_rd)
8319 rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv,
8320 &uv_skip, bsize, max_uv_tx_size);
8321 #else
8322 rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv,
8323 &uv_skip, AOMMAX(BLOCK_8X8, bsize), max_uv_tx_size);
8324 #endif // CONFIG_CB4X4
8326 if (y_skip && uv_skip) {
8327 rd_cost->rate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
8328 av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
8329 rd_cost->dist = dist_y + dist_uv;
8330 } else {
8331 rd_cost->rate =
8332 rate_y + rate_uv + av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
8333 rd_cost->dist = dist_y + dist_uv;
8335 rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
8336 #if CONFIG_DAALA_DIST && CONFIG_CB4X4
8337 rd_cost->dist_y = dist_y;
8338 #endif
8339 } else {
8340 rd_cost->rate = INT_MAX;
8343 #if CONFIG_INTRABC
8344 if (rd_cost->rate != INT_MAX && rd_cost->rdcost < best_rd)
8345 best_rd = rd_cost->rdcost;
8346 if (rd_pick_intrabc_mode_sb(cpi, x, rd_cost, bsize, best_rd) < best_rd) {
8347 ctx->skip = x->skip; // FIXME where is the proper place to set this?!
8348 assert(rd_cost->rate != INT_MAX);
8349 rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
8351 #endif
8352 if (rd_cost->rate == INT_MAX) return;
8354 ctx->mic = *xd->mi[0];
8355 ctx->mbmi_ext = *x->mbmi_ext;
8358 // Do we have an internal image edge (e.g. formatting bars)?
8359 int av1_internal_image_edge(const AV1_COMP *cpi) {
8360 return (cpi->oxcf.pass == 2) &&
8361 ((cpi->twopass.this_frame_stats.inactive_zone_rows > 0) ||
8362 (cpi->twopass.this_frame_stats.inactive_zone_cols > 0));
8365 // Checks to see if a super block is on a horizontal image edge.
8366 // In most cases this is the "real" edge unless there are formatting
8367 // bars embedded in the stream.
8368 int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) {
8369 int top_edge = 0;
8370 int bottom_edge = cpi->common.mi_rows;
8371 int is_active_h_edge = 0;
8373 // For two-pass encoding, account for any formatting bars detected.
8374 if (cpi->oxcf.pass == 2) {
8375 const TWO_PASS *const twopass = &cpi->twopass;
8377 // The inactive region is specified in MBs not mi units.
8378 // The image edge is in the following MB row.
8379 top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
8381 bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
8382 bottom_edge = AOMMAX(top_edge, bottom_edge);
8385 if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) ||
8386 ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) {
8387 is_active_h_edge = 1;
8389 return is_active_h_edge;
8392 // Checks to see if a super block is on a vertical image edge.
8393 // In most cases this is the "real" edge unless there are formatting
8394 // bars embedded in the stream.
8395 int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) {
8396 int left_edge = 0;
8397 int right_edge = cpi->common.mi_cols;
8398 int is_active_v_edge = 0;
8400 // For two-pass encoding, account for any formatting bars detected.
8401 if (cpi->oxcf.pass == 2) {
8402 const TWO_PASS *const twopass = &cpi->twopass;
8404 // The inactive region is specified in MBs not mi units.
8405 // The image edge is in the following MB column.
8406 left_edge += (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
8408 right_edge -= (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
8409 right_edge = AOMMAX(left_edge, right_edge);
8412 if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) ||
8413 ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) {
8414 is_active_v_edge = 1;
8416 return is_active_v_edge;
8419 // Checks to see if a super block is at the edge of the active image.
8420 // In most cases this is the "real" edge unless there are formatting
8421 // bars embedded in the stream.
8422 int av1_active_edge_sb(const AV1_COMP *cpi, int mi_row, int mi_col) {
8423 return av1_active_h_edge(cpi, mi_row, cpi->common.mib_size) ||
8424 av1_active_v_edge(cpi, mi_col, cpi->common.mib_size);
8427 #if CONFIG_PALETTE
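// Rebuilds the chroma color_index_map for the already-selected U/V palette by
// re-running the nearest-centroid assignment on the source chroma samples.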
8428 static void restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) {
8429 MACROBLOCKD *const xd = &x->e_mbd;
8430 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
8431 PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
8432 const BLOCK_SIZE bsize = mbmi->sb_type;
8433 int src_stride = x->plane[1].src.stride;
8434 const uint8_t *const src_u = x->plane[1].src.buf;
8435 const uint8_t *const src_v = x->plane[2].src.buf;
8436 float *const data = x->palette_buffer->kmeans_data_buf;
8437 float centroids[2 * PALETTE_MAX_SIZE];
8438 uint8_t *const color_map = xd->plane[1].color_index_map;
8439 int r, c;
8440 #if CONFIG_HIGHBITDEPTH
8441 const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
8442 const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v);
8443 #endif // CONFIG_HIGHBITDEPTH
8444 int plane_block_width, plane_block_height, rows, cols;
8445 av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
8446 &plane_block_height, &rows, &cols);
8447 (void)cpi;
8449 for (r = 0; r < rows; ++r) {
8450 for (c = 0; c < cols; ++c) {
8451 #if CONFIG_HIGHBITDEPTH
8452 if (cpi->common.use_highbitdepth) {
8453 data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
8454 data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
8455 } else {
8456 #endif // CONFIG_HIGHBITDEPTH
8457 data[(r * cols + c) * 2] = src_u[r * src_stride + c];
8458 data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c];
8459 #if CONFIG_HIGHBITDEPTH
8461 #endif // CONFIG_HIGHBITDEPTH
8465 for (r = 1; r < 3; ++r) {
8466 for (c = 0; c < pmi->palette_size[1]; ++c) {
8467 centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c];
8471 av1_calc_indices(data, centroids, color_map, rows * cols,
8472 pmi->palette_size[1], 2);
8473 extend_palette_color_map(color_map, cols, rows, plane_block_width,
8474 plane_block_height);
8476 #endif // CONFIG_PALETTE
8478 #if CONFIG_FILTER_INTRA
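// Evaluates the DC_PRED + filter-intra hypothesis during the inter-frame mode
// search and updates the best-mode bookkeeping if its RD cost wins.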
8479 static void pick_filter_intra_interframe(
8480 const AV1_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
8481 BLOCK_SIZE bsize, int mi_row, int mi_col, int *rate_uv_intra,
8482 int *rate_uv_tokenonly, int64_t *dist_uv, int *skip_uv,
8483 PREDICTION_MODE *mode_uv, FILTER_INTRA_MODE_INFO *filter_intra_mode_info_uv,
8484 #if CONFIG_EXT_INTRA
8485 int8_t *uv_angle_delta,
8486 #endif // CONFIG_EXT_INTRA
8487 #if CONFIG_PALETTE
8488 PALETTE_MODE_INFO *pmi_uv, int palette_ctx,
8489 #endif // CONFIG_PALETTE
8490 int skip_mask, unsigned int *ref_costs_single, int64_t *best_rd,
8491 int64_t *best_intra_rd, PREDICTION_MODE *best_intra_mode,
8492 int *best_mode_index, int *best_skip2, int *best_mode_skippable,
8493 #if CONFIG_SUPERTX
8494 int *returnrate_nocoef,
8495 #endif // CONFIG_SUPERTX
8496 int64_t *best_pred_rd, MB_MODE_INFO *best_mbmode, RD_STATS *rd_cost) {
8497 const AV1_COMMON *const cm = &cpi->common;
8498 MACROBLOCKD *const xd = &x->e_mbd;
8499 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
8500 #if CONFIG_PALETTE
8501 PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
8502 #endif // CONFIG_PALETTE
8503 int rate2 = 0, rate_y = INT_MAX, skippable = 0, rate_uv, rate_dummy, i;
8504 int dc_mode_index;
8505 const int *const intra_mode_cost = cpi->mbmode_cost[size_group_lookup[bsize]];
8506 int64_t distortion2 = 0, distortion_y = 0, this_rd = *best_rd;
8507 int64_t distortion_uv, model_rd = INT64_MAX;
8508 TX_SIZE uv_tx;
8510 for (i = 0; i < MAX_MODES; ++i)
8511 if (av1_mode_order[i].mode == DC_PRED &&
8512 av1_mode_order[i].ref_frame[0] == INTRA_FRAME)
8513 break;
8514 dc_mode_index = i;
8515 assert(i < MAX_MODES);
8517 // TODO(huisu): use skip_mask for further speedup.
8518 (void)skip_mask;
8519 mbmi->mode = DC_PRED;
8520 mbmi->uv_mode = DC_PRED;
8521 mbmi->ref_frame[0] = INTRA_FRAME;
8522 mbmi->ref_frame[1] = NONE_FRAME;
8523 if (!rd_pick_filter_intra_sby(cpi, x, &rate_dummy, &rate_y, &distortion_y,
8524 &skippable, bsize, intra_mode_cost[mbmi->mode],
8525 &this_rd, &model_rd, 0)) {
8526 return;
8528 if (rate_y == INT_MAX) return;
8530 uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][xd->plane[1].subsampling_x]
8531 [xd->plane[1].subsampling_y];
8532 if (rate_uv_intra[uv_tx] == INT_MAX) {
8533 choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx],
8534 &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx],
8535 &skip_uv[uv_tx], &mode_uv[uv_tx]);
8536 #if CONFIG_PALETTE
8537 if (cm->allow_screen_content_tools) pmi_uv[uv_tx] = *pmi;
8538 #endif // CONFIG_PALETTE
8539 filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info;
8540 #if CONFIG_EXT_INTRA
8541 uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
8542 #endif // CONFIG_EXT_INTRA
8545 rate_uv = rate_uv_tokenonly[uv_tx];
8546 distortion_uv = dist_uv[uv_tx];
8547 skippable = skippable && skip_uv[uv_tx];
8548 mbmi->uv_mode = mode_uv[uv_tx];
8549 #if CONFIG_PALETTE
8550 if (cm->allow_screen_content_tools) {
8551 pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
8552 memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
8553 pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
8554 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
8556 #endif // CONFIG_PALETTE
8557 #if CONFIG_EXT_INTRA
8558 mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
8559 #endif // CONFIG_EXT_INTRA
8560 mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
8561 filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1];
8562 if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) {
8563 mbmi->filter_intra_mode_info.filter_intra_mode[1] =
8564 filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1];
8567 rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv +
8568 cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
8569 #if CONFIG_PALETTE
8570 if (cpi->common.allow_screen_content_tools && mbmi->mode == DC_PRED &&
8571 bsize >= BLOCK_8X8)
8572 rate2 += av1_cost_bit(
8573 av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 0);
8574 #endif // CONFIG_PALETTE
8576 if (!xd->lossless[mbmi->segment_id]) {
8577 // super_block_yrd above includes the cost of the tx_size in the
8578 // tokenonly rate, but for intra blocks, tx_size is always coded
8579 // (prediction granularity), so we account for it in the full rate,
8580 // not the tokenonly rate.
8581 rate_y -= tx_size_cost(cpi, x, bsize, mbmi->tx_size);
8584 rate2 += av1_cost_bit(cm->fc->filter_intra_probs[0],
8585 mbmi->filter_intra_mode_info.use_filter_intra_mode[0]);
8586 rate2 += write_uniform_cost(
8587 FILTER_INTRA_MODES, mbmi->filter_intra_mode_info.filter_intra_mode[0]);
8588 #if CONFIG_EXT_INTRA
8589 if (av1_is_directional_mode(mbmi->uv_mode, bsize)) {
8590 rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
8591 MAX_ANGLE_DELTA + mbmi->angle_delta[1]);
8593 #endif // CONFIG_EXT_INTRA
8594 if (mbmi->mode == DC_PRED) {
8595 rate2 +=
8596 av1_cost_bit(cpi->common.fc->filter_intra_probs[1],
8597 mbmi->filter_intra_mode_info.use_filter_intra_mode[1]);
8598 if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1])
8599 rate2 +=
8600 write_uniform_cost(FILTER_INTRA_MODES,
8601 mbmi->filter_intra_mode_info.filter_intra_mode[1]);
8603 distortion2 = distortion_y + distortion_uv;
8604 av1_encode_intra_block_plane((AV1_COMMON *)cm, x, bsize, 0, 0, mi_row,
8605 mi_col);
8607 rate2 += ref_costs_single[INTRA_FRAME];
8609 if (skippable) {
8610 rate2 -= (rate_y + rate_uv);
8611 rate_y = 0;
8612 rate_uv = 0;
8613 rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
8614 } else {
8615 rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
8617 this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
8619 if (this_rd < *best_intra_rd) {
8620 *best_intra_rd = this_rd;
8621 *best_intra_mode = mbmi->mode;
8623 for (i = 0; i < REFERENCE_MODES; ++i)
8624 best_pred_rd[i] = AOMMIN(best_pred_rd[i], this_rd);
8626 if (this_rd < *best_rd) {
8627 *best_mode_index = dc_mode_index;
8628 mbmi->mv[0].as_int = 0;
8629 rd_cost->rate = rate2;
8630 #if CONFIG_SUPERTX
8631 if (x->skip)
8632 *returnrate_nocoef = rate2;
8633 else
8634 *returnrate_nocoef = rate2 - rate_y - rate_uv;
8635 *returnrate_nocoef -= av1_cost_bit(av1_get_skip_prob(cm, xd), skippable);
8636 *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd),
8637 mbmi->ref_frame[0] != INTRA_FRAME);
8638 #endif // CONFIG_SUPERTX
8639 rd_cost->dist = distortion2;
8640 rd_cost->rdcost = this_rd;
8641 *best_rd = this_rd;
8642 *best_mbmode = *mbmi;
8643 *best_skip2 = 0;
8644 *best_mode_skippable = skippable;
8647 #endif // CONFIG_FILTER_INTRA
8649 #if CONFIG_MOTION_VAR
8650 static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
8651 const MACROBLOCKD *xd, int mi_row,
8652 int mi_col, const uint8_t *above,
8653 int above_stride, const uint8_t *left,
8654 int left_stride);
8655 #endif // CONFIG_MOTION_VAR
8657 void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
8658 MACROBLOCK *x, int mi_row, int mi_col,
8659 RD_STATS *rd_cost,
8660 #if CONFIG_SUPERTX
8661 int *returnrate_nocoef,
8662 #endif // CONFIG_SUPERTX
8663 BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
8664 int64_t best_rd_so_far) {
8665 const AV1_COMMON *const cm = &cpi->common;
8666 const RD_OPT *const rd_opt = &cpi->rd;
8667 const SPEED_FEATURES *const sf = &cpi->sf;
8668 MACROBLOCKD *const xd = &x->e_mbd;
8669 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
8670 #if CONFIG_PALETTE
8671 const int try_palette =
8672 cpi->common.allow_screen_content_tools && bsize >= BLOCK_8X8;
8673 PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
8674 #endif // CONFIG_PALETTE
8675 MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
8676 const struct segmentation *const seg = &cm->seg;
8677 PREDICTION_MODE this_mode;
8678 MV_REFERENCE_FRAME ref_frame, second_ref_frame;
8679 unsigned char segment_id = mbmi->segment_id;
8680 int comp_pred, i, k;
8681 int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
8682 struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE];
8683 int_mv single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } };
8684 #if CONFIG_EXT_INTER
8685 int single_newmv_rate[TOTAL_REFS_PER_FRAME] = { 0 };
8686 int64_t modelled_rd[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
8687 #endif // CONFIG_EXT_INTER
8688 static const int flag_list[TOTAL_REFS_PER_FRAME] = {
8690 AOM_LAST_FLAG,
8691 #if CONFIG_EXT_REFS
8692 AOM_LAST2_FLAG,
8693 AOM_LAST3_FLAG,
8694 #endif // CONFIG_EXT_REFS
8695 AOM_GOLD_FLAG,
8696 #if CONFIG_EXT_REFS
8697 AOM_BWD_FLAG,
8698 #endif // CONFIG_EXT_REFS
8699 AOM_ALT_FLAG
8701 int64_t best_rd = best_rd_so_far;
8702 int best_rate_y = INT_MAX, best_rate_uv = INT_MAX;
8703 int64_t best_pred_diff[REFERENCE_MODES];
8704 int64_t best_pred_rd[REFERENCE_MODES];
8705 MB_MODE_INFO best_mbmode;
8706 int rate_skip0 = av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
8707 int rate_skip1 = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
8708 int best_mode_skippable = 0;
8709 int midx, best_mode_index = -1;
8710 unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME];
8711 unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME];
8712 aom_prob comp_mode_p;
8713 int64_t best_intra_rd = INT64_MAX;
8714 unsigned int best_pred_sse = UINT_MAX;
8715 PREDICTION_MODE best_intra_mode = DC_PRED;
8716 int rate_uv_intra[TX_SIZES_ALL], rate_uv_tokenonly[TX_SIZES_ALL];
8717 int64_t dist_uvs[TX_SIZES_ALL];
8718 int skip_uvs[TX_SIZES_ALL];
8719 PREDICTION_MODE mode_uv[TX_SIZES_ALL];
8720 #if CONFIG_PALETTE
8721 PALETTE_MODE_INFO pmi_uv[TX_SIZES_ALL];
8722 #endif // CONFIG_PALETTE
8723 #if CONFIG_EXT_INTRA
8724 int8_t uv_angle_delta[TX_SIZES_ALL];
8725 int is_directional_mode, angle_stats_ready = 0;
8726 uint8_t directional_mode_skip_mask[INTRA_MODES];
8727 #endif // CONFIG_EXT_INTRA
8728 #if CONFIG_FILTER_INTRA
8729 int8_t dc_skipped = 1;
8730 FILTER_INTRA_MODE_INFO filter_intra_mode_info_uv[TX_SIZES_ALL];
8731 #endif // CONFIG_FILTER_INTRA
8732 const int intra_cost_penalty = av1_get_intra_cost_penalty(
8733 cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
8734 const int *const intra_mode_cost = cpi->mbmode_cost[size_group_lookup[bsize]];
8735 int best_skip2 = 0;
8736 uint8_t ref_frame_skip_mask[2] = { 0 };
8737 uint32_t mode_skip_mask[TOTAL_REFS_PER_FRAME] = { 0 };
8738 #if CONFIG_EXT_INTER && CONFIG_INTERINTRA
8739 MV_REFERENCE_FRAME best_single_inter_ref = LAST_FRAME;
8740 int64_t best_single_inter_rd = INT64_MAX;
8741 #endif // CONFIG_EXT_INTER && CONFIG_INTERINTRA
8742 int mode_skip_start = sf->mode_skip_start + 1;
8743 const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
8744 const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
8745 int64_t mode_threshold[MAX_MODES];
8746 int *mode_map = tile_data->mode_map[bsize];
8747 const int mode_search_skip_flags = sf->mode_search_skip_flags;
8748 #if CONFIG_PVQ
8749 od_rollback_buffer pre_buf;
8750 #endif // CONFIG_PVQ
8752 HandleInterModeArgs args = {
8753 #if CONFIG_MOTION_VAR
8754 { NULL },
8755 { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
8756 { NULL },
8757 { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
8758 #endif // CONFIG_MOTION_VAR
8759 #if CONFIG_EXT_INTER
8760 NULL,
8761 NULL,
8762 NULL,
8763 #else // CONFIG_EXT_INTER
8764 NULL,
8765 #endif // CONFIG_EXT_INTER
8766 { { 0 } },
8769 #if CONFIG_PALETTE || CONFIG_EXT_INTRA
8770 const int rows = block_size_high[bsize];
8771 const int cols = block_size_wide[bsize];
8772 #endif // CONFIG_PALETTE || CONFIG_EXT_INTRA
8773 #if CONFIG_PALETTE
8774 int palette_ctx = 0;
8775 const MODE_INFO *above_mi = xd->above_mi;
8776 const MODE_INFO *left_mi = xd->left_mi;
8777 #endif // CONFIG_PALETTE
8778 #if CONFIG_MOTION_VAR
8779 int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
8780 int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
8781 int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
8782 int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
8784 #if CONFIG_HIGHBITDEPTH
8785 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
8786 int len = sizeof(uint16_t);
8787 args.above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf);
8788 args.above_pred_buf[1] =
8789 CONVERT_TO_BYTEPTR(x->above_pred_buf + MAX_SB_SQUARE * len);
8790 args.above_pred_buf[2] =
8791 CONVERT_TO_BYTEPTR(x->above_pred_buf + 2 * MAX_SB_SQUARE * len);
8792 args.left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf);
8793 args.left_pred_buf[1] =
8794 CONVERT_TO_BYTEPTR(x->left_pred_buf + MAX_SB_SQUARE * len);
8795 args.left_pred_buf[2] =
8796 CONVERT_TO_BYTEPTR(x->left_pred_buf + 2 * MAX_SB_SQUARE * len);
8797 } else {
8798 #endif // CONFIG_HIGHBITDEPTH
8799 args.above_pred_buf[0] = x->above_pred_buf;
8800 args.above_pred_buf[1] = x->above_pred_buf + MAX_SB_SQUARE;
8801 args.above_pred_buf[2] = x->above_pred_buf + 2 * MAX_SB_SQUARE;
8802 args.left_pred_buf[0] = x->left_pred_buf;
8803 args.left_pred_buf[1] = x->left_pred_buf + MAX_SB_SQUARE;
8804 args.left_pred_buf[2] = x->left_pred_buf + 2 * MAX_SB_SQUARE;
8805 #if CONFIG_HIGHBITDEPTH
8807 #endif // CONFIG_HIGHBITDEPTH
8808 #endif // CONFIG_MOTION_VAR
8810 av1_zero(best_mbmode);
8812 #if CONFIG_PALETTE
8813 av1_zero(pmi_uv);
8814 if (try_palette) {
8815 if (above_mi)
8816 palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
8817 if (left_mi)
8818 palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
8820 #endif // CONFIG_PALETTE
8822 estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
8823 &comp_mode_p);
8825 for (i = 0; i < REFERENCE_MODES; ++i) best_pred_rd[i] = INT64_MAX;
8826 for (i = 0; i < TX_SIZES_ALL; i++) rate_uv_intra[i] = INT_MAX;
8827 for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) x->pred_sse[i] = INT_MAX;
8828 for (i = 0; i < MB_MODE_COUNT; ++i) {
8829 for (k = 0; k < TOTAL_REFS_PER_FRAME; ++k) {
8830 args.single_filter[i][k] = SWITCHABLE;
8834 rd_cost->rate = INT_MAX;
8835 #if CONFIG_SUPERTX
8836 *returnrate_nocoef = INT_MAX;
8837 #endif // CONFIG_SUPERTX
8839 for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
8840 x->pred_mv_sad[ref_frame] = INT_MAX;
8841 x->mbmi_ext->mode_context[ref_frame] = 0;
8842 #if CONFIG_EXT_INTER
8843 x->mbmi_ext->compound_mode_context[ref_frame] = 0;
8844 #endif // CONFIG_EXT_INTER
8845 if (cpi->ref_frame_flags & flag_list[ref_frame]) {
8846 assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
8847 setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
8848 frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
8850 frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
8851 #if CONFIG_GLOBAL_MOTION
8852 frame_mv[ZEROMV][ref_frame].as_int =
8853 gm_get_motion_vector(&cm->global_motion[ref_frame],
8854 cm->allow_high_precision_mv, bsize, mi_col, mi_row,
8856 .as_int;
8857 #else // CONFIG_GLOBAL_MOTION
8858 frame_mv[ZEROMV][ref_frame].as_int = 0;
8859 #endif // CONFIG_GLOBAL_MOTION
8860 #if CONFIG_EXT_INTER
8861 frame_mv[NEW_NEWMV][ref_frame].as_int = INVALID_MV;
8862 #if CONFIG_GLOBAL_MOTION
8863 frame_mv[ZERO_ZEROMV][ref_frame].as_int =
8864 gm_get_motion_vector(&cm->global_motion[ref_frame],
8865 cm->allow_high_precision_mv, bsize, mi_col, mi_row,
8867 .as_int;
8868 #else // CONFIG_GLOBAL_MOTION
8869 frame_mv[ZERO_ZEROMV][ref_frame].as_int = 0;
8870 #endif // CONFIG_GLOBAL_MOTION
8871 #endif // CONFIG_EXT_INTER
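// Continue over the compound reference-frame types: gather their MV
// candidates and, when fewer than two stack candidates exist, clear the
// all-zero-MV mode context flag if any reference MV differs from the
// (global-motion) zero MV.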
8874 for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
8875 MODE_INFO *const mi = xd->mi[0];
8876 int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
8877 x->mbmi_ext->mode_context[ref_frame] = 0;
8878 av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame],
8879 mbmi_ext->ref_mv_stack[ref_frame],
8880 #if CONFIG_EXT_INTER
8881 mbmi_ext->compound_mode_context,
8882 #endif // CONFIG_EXT_INTER
8883 candidates, mi_row, mi_col, NULL, NULL,
8884 mbmi_ext->mode_context);
8885 if (mbmi_ext->ref_mv_count[ref_frame] < 2) {
8886 MV_REFERENCE_FRAME rf[2];
8887 av1_set_ref_frame(rf, ref_frame);
8888 if (mbmi_ext->ref_mvs[rf[0]][0].as_int !=
8889 frame_mv[ZEROMV][rf[0]].as_int ||
8890 mbmi_ext->ref_mvs[rf[0]][1].as_int !=
8891 frame_mv[ZEROMV][rf[0]].as_int ||
8892 mbmi_ext->ref_mvs[rf[1]][0].as_int !=
8893 frame_mv[ZEROMV][rf[1]].as_int ||
8894 mbmi_ext->ref_mvs[rf[1]][1].as_int != frame_mv[ZEROMV][rf[1]].as_int)
8895 mbmi_ext->mode_context[ref_frame] &= ~(1 << ALL_ZERO_FLAG_OFFSET);
8899 #if CONFIG_MOTION_VAR
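// Pre-compute the above/left neighbour predictions and the target weighted
// prediction used by OBMC so the mode evaluations below can reuse them.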
8900 av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
8902 if (check_num_overlappable_neighbors(mbmi) &&
8903 is_motion_variation_allowed_bsize(bsize)) {
8904 av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col,
8905 args.above_pred_buf, dst_width1,
8906 dst_height1, args.above_pred_stride);
8907 av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col,
8908 args.left_pred_buf, dst_width2,
8909 dst_height2, args.left_pred_stride);
8910 av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
8911 mi_col);
8912 calc_target_weighted_pred(cm, x, xd, mi_row, mi_col, args.above_pred_buf[0],
8913 args.above_pred_stride[0], args.left_pred_buf[0],
8914 args.left_pred_stride[0]);
8916 #endif // CONFIG_MOTION_VAR
8918 for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
8919 if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
8920 // Skip checking missing references in both single and compound reference
8921 // modes. Note that a mode will be skipped iff both reference frames
8922 // are masked out.
8923 #if CONFIG_EXT_REFS
8924 if (ref_frame == BWDREF_FRAME || ref_frame == ALTREF_FRAME) {
8925 ref_frame_skip_mask[0] |= (1 << ref_frame);
8926 ref_frame_skip_mask[1] |= ((1 << ref_frame) | 0x01);
8927 } else {
8928 #endif // CONFIG_EXT_REFS
8929 ref_frame_skip_mask[0] |= (1 << ref_frame);
8930 ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
8931 #if CONFIG_EXT_REFS
8933 #endif // CONFIG_EXT_REFS
8934 } else {
8935 for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
8936 // Skip fixed mv modes for poor references
8937 if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
8938 mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
8939 break;
8943 // If the segment reference frame feature is enabled, then do nothing if
8944 // the current ref frame is not allowed.
8945 if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
8946 get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
8947 ref_frame_skip_mask[0] |= (1 << ref_frame);
8948 ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
8952 // Disable this drop out case if the ref frame
8953 // segment level feature is enabled for this segment. This is to
8954 // prevent the possibility that we end up unable to pick any mode.
8955 if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
8956 // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
8957 // unless ARNR filtering is enabled in which case we want
8958 // an unfiltered alternative. We allow near/nearest as well
8959 // because they may result in zero-zero MVs but be cheaper.
8960 if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
8961 int_mv zeromv;
8962 ref_frame_skip_mask[0] = (1 << LAST_FRAME) |
8963 #if CONFIG_EXT_REFS
8964 (1 << LAST2_FRAME) | (1 << LAST3_FRAME) |
8965 (1 << BWDREF_FRAME) |
8966 #endif // CONFIG_EXT_REFS
8967 (1 << GOLDEN_FRAME);
8968 ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
8969 // TODO(zoeliu): Further explore whether the following needs to be done for
8970 // BWDREF_FRAME as well.
8971 mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
8972 #if CONFIG_GLOBAL_MOTION
8973 zeromv.as_int = gm_get_motion_vector(&cm->global_motion[ALTREF_FRAME],
8974 cm->allow_high_precision_mv, bsize,
8975 mi_col, mi_row, 0)
8976 .as_int;
8977 #else
8978 zeromv.as_int = 0;
8979 #endif // CONFIG_GLOBAL_MOTION
8980 if (frame_mv[NEARMV][ALTREF_FRAME].as_int != zeromv.as_int)
8981 mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV);
8982 if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int)
8983 mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV);
8984 #if CONFIG_EXT_INTER
8985 if (frame_mv[NEAREST_NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int)
8986 mode_skip_mask[ALTREF_FRAME] |= (1 << NEAREST_NEARESTMV);
8987 if (frame_mv[NEAR_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int)
8988 mode_skip_mask[ALTREF_FRAME] |= (1 << NEAR_NEARMV);
8989 #endif // CONFIG_EXT_INTER
8993 if (cpi->rc.is_src_frame_alt_ref) {
8994 if (sf->alt_ref_search_fp) {
8995 assert(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]);
8996 mode_skip_mask[ALTREF_FRAME] = 0;
8997 ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME);
8998 ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
9002 if (sf->alt_ref_search_fp)
9003 if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX)
9004 if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1))
9005 mode_skip_mask[ALTREF_FRAME] |= INTER_ALL;
9007 if (sf->adaptive_mode_search) {
9008 if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref &&
9009 cpi->rc.frames_since_golden >= 3)
9010 if ((x->pred_mv_sad[GOLDEN_FRAME] >> 1) > x->pred_mv_sad[LAST_FRAME])
9011 mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL;
9014 if (bsize > sf->max_intra_bsize) {
9015 ref_frame_skip_mask[0] |= (1 << INTRA_FRAME);
9016 ref_frame_skip_mask[1] |= (1 << INTRA_FRAME);
9019 mode_skip_mask[INTRA_FRAME] |=
9020 ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
9022 for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0;
9023 for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i)
9024 mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5;
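// When schedule_mode_search is enabled, bubble-sort the leading entries of
// mode_map into ascending threshold order so the modes least likely to be
// pruned are tried first.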
9026 midx = sf->schedule_mode_search ? mode_skip_start : 0;
9027 while (midx > 4) {
9028 uint8_t end_pos = 0;
9029 for (i = 5; i < midx; ++i) {
9030 if (mode_threshold[mode_map[i - 1]] > mode_threshold[mode_map[i]]) {
9031 uint8_t tmp = mode_map[i];
9032 mode_map[i] = mode_map[i - 1];
9033 mode_map[i - 1] = tmp;
9034 end_pos = i;
9037 midx = end_pos;
9040 if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
9041 x->use_default_intra_tx_type = 1;
9042 else
9043 x->use_default_intra_tx_type = 0;
9045 if (cpi->sf.tx_type_search.fast_inter_tx_type_search)
9046 x->use_default_inter_tx_type = 1;
9047 else
9048 x->use_default_inter_tx_type = 0;
9049 #if CONFIG_PVQ
9050 od_encode_checkpoint(&x->daala_enc, &pre_buf);
9051 #endif // CONFIG_PVQ
9052 #if CONFIG_EXT_INTER
9053 for (i = 0; i < MB_MODE_COUNT; ++i)
9054 for (ref_frame = 0; ref_frame < TOTAL_REFS_PER_FRAME; ++ref_frame)
9055 modelled_rd[i][ref_frame] = INT64_MAX;
9056 #endif // CONFIG_EXT_INTER
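// Main mode-decision loop: iterate over av1_mode_order via the (possibly
// re-ordered) mode_map, applying the reference/mode skip masks and RD
// thresholds set up above, and track the best mode found so far.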
9058 for (midx = 0; midx < MAX_MODES; ++midx) {
9059 int mode_index;
9060 int mode_excluded = 0;
9061 int64_t this_rd = INT64_MAX;
9062 int disable_skip = 0;
9063 int compmode_cost = 0;
9064 int rate2 = 0, rate_y = 0, rate_uv = 0;
9065 int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
9066 #if CONFIG_DAALA_DIST && CONFIG_CB4X4
9067 int64_t distortion2_y = 0;
9068 int64_t total_sse_y = INT64_MAX;
9069 #endif
9070 int skippable = 0;
9071 int this_skip2 = 0;
9072 int64_t total_sse = INT64_MAX;
9073 uint8_t ref_frame_type;
9074 #if CONFIG_PVQ
9075 od_encode_rollback(&x->daala_enc, &pre_buf);
9076 #endif // CONFIG_PVQ
9077 mode_index = mode_map[midx];
9078 this_mode = av1_mode_order[mode_index].mode;
9079 ref_frame = av1_mode_order[mode_index].ref_frame[0];
9080 second_ref_frame = av1_mode_order[mode_index].ref_frame[1];
9081 mbmi->ref_mv_idx = 0;
9083 #if CONFIG_EXT_INTER
9084 if (ref_frame > INTRA_FRAME && second_ref_frame == INTRA_FRAME) {
9085 // Mode must be compatible with interintra prediction.
9086 if (!is_interintra_allowed_mode(this_mode)) continue;
9087 if (!is_interintra_allowed_bsize(bsize)) continue;
9090 if (is_inter_compound_mode(this_mode)) {
9091 frame_mv[this_mode][ref_frame].as_int =
9092 frame_mv[compound_ref0_mode(this_mode)][ref_frame].as_int;
9093 frame_mv[this_mode][second_ref_frame].as_int =
9094 frame_mv[compound_ref1_mode(this_mode)][second_ref_frame].as_int;
9096 #endif // CONFIG_EXT_INTER
9098 // Look at the reference frame of the best mode so far and set the
9099 // skip mask to look at a subset of the remaining modes.
9100 if (midx == mode_skip_start && best_mode_index >= 0) {
9101 switch (best_mbmode.ref_frame[0]) {
9102 case INTRA_FRAME: break;
9103 case LAST_FRAME:
9104 ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK;
9105 ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
9106 break;
9107 #if CONFIG_EXT_REFS
9108 case LAST2_FRAME:
9109 ref_frame_skip_mask[0] |= LAST2_FRAME_MODE_MASK;
9110 ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
9111 break;
9112 case LAST3_FRAME:
9113 ref_frame_skip_mask[0] |= LAST3_FRAME_MODE_MASK;
9114 ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
9115 break;
9116 #endif // CONFIG_EXT_REFS
9117 case GOLDEN_FRAME:
9118 ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK;
9119 ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
9120 break;
9121 #if CONFIG_EXT_REFS
9122 case BWDREF_FRAME:
9123 ref_frame_skip_mask[0] |= BWDREF_FRAME_MODE_MASK;
9124 ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
9125 break;
9126 #endif // CONFIG_EXT_REFS
9127 case ALTREF_FRAME: ref_frame_skip_mask[0] |= ALTREF_FRAME_MODE_MASK;
9128 #if CONFIG_EXT_REFS
9129 ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
9130 #endif // CONFIG_EXT_REFS
9131 break;
9132 case NONE_FRAME:
9133 case TOTAL_REFS_PER_FRAME:
9134 assert(0 && "Invalid Reference frame");
9135 break;
9139 if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
9140 (ref_frame_skip_mask[1] & (1 << AOMMAX(0, second_ref_frame))))
9141 continue;
9143 if (mode_skip_mask[ref_frame] & (1 << this_mode)) continue;
9145 // Test best rd so far against threshold for trying this mode.
9146 if (best_mode_skippable && sf->schedule_mode_search)
9147 mode_threshold[mode_index] <<= 1;
9149 if (best_rd < mode_threshold[mode_index]) continue;
9151 // This is only used in the motion vector unit test.
9152 if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue;
9154 #if CONFIG_ONE_SIDED_COMPOUND // Changes LL bitstream
9155 #if CONFIG_EXT_REFS
9156 if (cpi->oxcf.pass == 0) {
9157 // Complexity-compression trade-offs
9158 // if (ref_frame == ALTREF_FRAME) continue;
9159 // if (ref_frame == BWDREF_FRAME) continue;
9160 if (second_ref_frame == ALTREF_FRAME) continue;
9161 // if (second_ref_frame == BWDREF_FRAME) continue;
9163 #endif
9164 #endif
9165 comp_pred = second_ref_frame > INTRA_FRAME;
9166 if (comp_pred) {
9167 if (!cpi->allow_comp_inter_inter) continue;
9169 // Skip compound inter modes if ARF is not available.
9170 if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue;
9172 // Do not allow compound prediction if the segment level reference frame
9173 // feature is in use as in this case there can only be one reference.
9174 if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue;
9176 if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
9177 best_mode_index >= 0 && best_mbmode.ref_frame[0] == INTRA_FRAME)
9178 continue;
9180 mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
9181 } else {
9182 if (ref_frame != INTRA_FRAME)
9183 mode_excluded = cm->reference_mode == COMPOUND_REFERENCE;
9186 if (ref_frame == INTRA_FRAME) {
9187 if (sf->adaptive_mode_search)
9188 if ((x->source_variance << num_pels_log2_lookup[bsize]) > best_pred_sse)
9189 continue;
9191 if (this_mode != DC_PRED) {
9192 // Disable intra modes other than DC_PRED for blocks with low variance.
9193 // The threshold for skipping intra is based on the source variance.
9194 // TODO(debargha): Specialize the threshold for super block sizes
9195 const unsigned int skip_intra_var_thresh = 64;
9196 if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
9197 x->source_variance < skip_intra_var_thresh)
9198 continue;
9199 // Only search the oblique modes if the best so far is
9200 // one of the neighboring directional modes
9201 if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
9202 (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
9203 if (best_mode_index >= 0 && best_mbmode.ref_frame[0] > INTRA_FRAME)
9204 continue;
9206 if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
9207 if (conditional_skipintra(this_mode, best_intra_mode)) continue;
9210 #if CONFIG_GLOBAL_MOTION
9211 } else if (cm->global_motion[ref_frame].wmtype == IDENTITY &&
9212 (!comp_pred ||
9213 cm->global_motion[second_ref_frame].wmtype == IDENTITY)) {
9214 #else // CONFIG_GLOBAL_MOTION
9215 } else {
9216 #endif // CONFIG_GLOBAL_MOTION
9217 const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, second_ref_frame };
9218 if (!check_best_zero_mv(cpi, mbmi_ext->mode_context,
9219 #if CONFIG_EXT_INTER
9220 mbmi_ext->compound_mode_context,
9221 #endif // CONFIG_EXT_INTER
9222 frame_mv, this_mode, ref_frames, bsize, -1,
9223 mi_row, mi_col))
9224 continue;
9227 mbmi->mode = this_mode;
9228 mbmi->uv_mode = DC_PRED;
9229 mbmi->ref_frame[0] = ref_frame;
9230 mbmi->ref_frame[1] = second_ref_frame;
9231 #if CONFIG_PALETTE
9232 pmi->palette_size[0] = 0;
9233 pmi->palette_size[1] = 0;
9234 #endif // CONFIG_PALETTE
9235 #if CONFIG_FILTER_INTRA
9236 mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
9237 mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
9238 #endif // CONFIG_FILTER_INTRA
9239 // Evaluate all sub-pel filters irrespective of whether we can use
9240 // them for this frame.
9242 set_default_interp_filters(mbmi, cm->interp_filter);
9244 mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
9245 mbmi->motion_mode = SIMPLE_TRANSLATION;
9247 x->skip = 0;
9248 set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
9250 // Select prediction reference frames.
9251 for (i = 0; i < MAX_MB_PLANE; i++) {
9252 xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
9253 if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
9256 #if CONFIG_EXT_INTER && CONFIG_INTERINTRA
9257 mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1);
9258 #endif // CONFIG_EXT_INTER && CONFIG_INTERINTRA
9260 if (ref_frame == INTRA_FRAME) {
9261 RD_STATS rd_stats_y;
9262 TX_SIZE uv_tx;
9263 struct macroblockd_plane *const pd = &xd->plane[1];
9264 #if CONFIG_EXT_INTRA
9265 is_directional_mode = av1_is_directional_mode(mbmi->mode, bsize);
9266 if (is_directional_mode) {
9267 int rate_dummy;
9268 int64_t model_rd = INT64_MAX;
9269 if (!angle_stats_ready) {
9270 const int src_stride = x->plane[0].src.stride;
9271 const uint8_t *src = x->plane[0].src.buf;
9272 #if CONFIG_HIGHBITDEPTH
9273 if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
9274 highbd_angle_estimation(src, src_stride, rows, cols, bsize,
9275 directional_mode_skip_mask);
9276 else
9277 #endif // CONFIG_HIGHBITDEPTH
9278 angle_estimation(src, src_stride, rows, cols, bsize,
9279 directional_mode_skip_mask);
9280 angle_stats_ready = 1;
9282 if (directional_mode_skip_mask[mbmi->mode]) continue;
9283 rd_stats_y.rate = INT_MAX;
9284 rd_pick_intra_angle_sby(cpi, x, &rate_dummy, &rd_stats_y, bsize,
9285 intra_mode_cost[mbmi->mode], best_rd,
9286 &model_rd);
9287 } else {
9288 mbmi->angle_delta[0] = 0;
9289 super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd);
9291 #else
9292 super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd);
9293 #endif // CONFIG_EXT_INTRA
9294 rate_y = rd_stats_y.rate;
9295 distortion_y = rd_stats_y.dist;
9296 skippable = rd_stats_y.skip;
9298 if (rate_y == INT_MAX) continue;
9300 #if CONFIG_FILTER_INTRA
9301 if (mbmi->mode == DC_PRED) dc_skipped = 0;
9302 #endif // CONFIG_FILTER_INTRA
9304 uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][pd->subsampling_x]
9305 [pd->subsampling_y];
9306 if (rate_uv_intra[uv_tx] == INT_MAX) {
9307 choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx],
9308 &rate_uv_tokenonly[uv_tx], &dist_uvs[uv_tx],
9309 &skip_uvs[uv_tx], &mode_uv[uv_tx]);
9310 #if CONFIG_PALETTE
9311 if (try_palette) pmi_uv[uv_tx] = *pmi;
9312 #endif // CONFIG_PALETTE
9314 #if CONFIG_EXT_INTRA
9315 uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
9316 #endif // CONFIG_EXT_INTRA
9317 #if CONFIG_FILTER_INTRA
9318 filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info;
9319 #endif // CONFIG_FILTER_INTRA
9322 rate_uv = rate_uv_tokenonly[uv_tx];
9323 distortion_uv = dist_uvs[uv_tx];
9324 skippable = skippable && skip_uvs[uv_tx];
9325 mbmi->uv_mode = mode_uv[uv_tx];
9326 #if CONFIG_PALETTE
9327 if (try_palette) {
9328 pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
9329 memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
9330 pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
9331 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
9333 #endif // CONFIG_PALETTE
9335 #if CONFIG_EXT_INTRA
9336 mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
9337 #endif // CONFIG_EXT_INTRA
9338 #if CONFIG_FILTER_INTRA
9339 mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
9340 filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1];
9341 if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) {
9342 mbmi->filter_intra_mode_info.filter_intra_mode[1] =
9343 filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1];
9345 #endif // CONFIG_FILTER_INTRA
9347 #if CONFIG_CB4X4
9348 rate2 = rate_y + intra_mode_cost[mbmi->mode];
9349 if (!x->skip_chroma_rd)
9350 rate2 += rate_uv + cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
9351 #else
9352 rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv +
9353 cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
9354 #endif // CONFIG_CB4X4
9356 #if CONFIG_PALETTE
9357 if (try_palette && mbmi->mode == DC_PRED) {
9358 rate2 += av1_cost_bit(
9359 av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 0);
9361 #endif // CONFIG_PALETTE
9363 if (!xd->lossless[mbmi->segment_id] && bsize >= BLOCK_8X8) {
9364 // super_block_yrd above includes the cost of the tx_size in the
9365 // tokenonly rate, but for intra blocks, tx_size is always coded
9366 // (prediction granularity), so we account for it in the full rate,
9367 // not the tokenonly rate.
9368 rate_y -= tx_size_cost(cpi, x, bsize, mbmi->tx_size);
9370 #if CONFIG_EXT_INTRA
9371 if (is_directional_mode) {
9372 #if CONFIG_INTRA_INTERP
9373 const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
9374 const int p_angle =
9375 mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
9376 if (av1_is_intra_filter_switchable(p_angle))
9377 rate2 += cpi->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter];
9378 #endif // CONFIG_INTRA_INTERP
9379 rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
9380 MAX_ANGLE_DELTA + mbmi->angle_delta[0]);
9382 if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED) {
9383 rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTA + 1,
9384 MAX_ANGLE_DELTA + mbmi->angle_delta[1]);
9386 #endif // CONFIG_EXT_INTRA
9387 #if CONFIG_FILTER_INTRA
9388 if (mbmi->mode == DC_PRED) {
9389 rate2 +=
9390 av1_cost_bit(cm->fc->filter_intra_probs[0],
9391 mbmi->filter_intra_mode_info.use_filter_intra_mode[0]);
9392 if (mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) {
9393 rate2 += write_uniform_cost(
9394 FILTER_INTRA_MODES,
9395 mbmi->filter_intra_mode_info.filter_intra_mode[0]);
9398 if (mbmi->uv_mode == DC_PRED) {
9399 rate2 +=
9400 av1_cost_bit(cpi->common.fc->filter_intra_probs[1],
9401 mbmi->filter_intra_mode_info.use_filter_intra_mode[1]);
9402 if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1])
9403 rate2 += write_uniform_cost(
9404 FILTER_INTRA_MODES,
9405 mbmi->filter_intra_mode_info.filter_intra_mode[1]);
9407 #endif // CONFIG_FILTER_INTRA
9408 if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
9409 rate2 += intra_cost_penalty;
9410 distortion2 = distortion_y + distortion_uv;
9411 #if CONFIG_DAALA_DIST && CONFIG_CB4X4
9412 if (bsize < BLOCK_8X8) distortion2_y = distortion_y;
9413 #endif
9414 } else {
9415 int_mv backup_ref_mv[2];
9417 #if !SUB8X8_COMP_REF
9418 if (bsize == BLOCK_4X4 && mbmi->ref_frame[1] > INTRA_FRAME) continue;
9419 #endif // !SUB8X8_COMP_REF
9421 backup_ref_mv[0] = mbmi_ext->ref_mvs[ref_frame][0];
9422 if (comp_pred) backup_ref_mv[1] = mbmi_ext->ref_mvs[second_ref_frame][0];
9423 #if CONFIG_EXT_INTER && CONFIG_INTERINTRA
9424 if (second_ref_frame == INTRA_FRAME) {
9425 if (best_single_inter_ref != ref_frame) continue;
9426 mbmi->interintra_mode = intra_to_interintra_mode[best_intra_mode];
9427 // TODO(debargha|geza.lore):
9428 // Should we use ext_intra modes for interintra?
9429 #if CONFIG_EXT_INTRA
9430 mbmi->angle_delta[0] = 0;
9431 mbmi->angle_delta[1] = 0;
9432 #if CONFIG_INTRA_INTERP
9433 mbmi->intra_filter = INTRA_FILTER_LINEAR;
9434 #endif // CONFIG_INTRA_INTERP
9435 #endif // CONFIG_EXT_INTRA
9436 #if CONFIG_FILTER_INTRA
9437 mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
9438 mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
9439 #endif // CONFIG_FILTER_INTRA
9441 #endif // CONFIG_EXT_INTER && CONFIG_INTERINTRA
9442 mbmi->ref_mv_idx = 0;
9443 ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
9445 #if CONFIG_EXT_INTER
9446 if (comp_pred) {
9447 if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
9448 int ref_mv_idx = 0;
9449 // Special case: NEAR_NEWMV and NEW_NEARMV modes use
9450 // 1 + mbmi->ref_mv_idx (like NEARMV) instead of
9451 // mbmi->ref_mv_idx (like NEWMV)
9452 if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV)
9453 ref_mv_idx = 1;
9455 if (compound_ref0_mode(mbmi->mode) == NEWMV) {
9456 int_mv this_mv =
9457 mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
9458 clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
9459 xd->n8_h << MI_SIZE_LOG2, xd);
9460 mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
9462 if (compound_ref1_mode(mbmi->mode) == NEWMV) {
9463 int_mv this_mv =
9464 mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
9465 clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
9466 xd->n8_h << MI_SIZE_LOG2, xd);
9467 mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv;
9470 } else {
9471 #endif // CONFIG_EXT_INTER
9472 if (mbmi->mode == NEWMV && mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
9473 int ref;
9474 for (ref = 0; ref < 1 + comp_pred; ++ref) {
9475 int_mv this_mv =
9476 (ref == 0) ? mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv
9477 : mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
9478 clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
9479 xd->n8_h << MI_SIZE_LOG2, xd);
9480 mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv;
9483 #if CONFIG_EXT_INTER
9485 #endif // CONFIG_EXT_INTER
9487 RD_STATS rd_stats, rd_stats_y, rd_stats_uv;
9488 av1_init_rd_stats(&rd_stats);
9489 rd_stats.rate = rate2;
9491 // Point to variables that are maintained between loop iterations
9492 args.single_newmv = single_newmv;
9493 #if CONFIG_EXT_INTER
9494 args.single_newmv_rate = single_newmv_rate;
9495 args.modelled_rd = modelled_rd;
9496 #endif // CONFIG_EXT_INTER
9497 this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y,
9498 &rd_stats_uv, &disable_skip, frame_mv,
9499 mi_row, mi_col, &args, best_rd);
9501 rate2 = rd_stats.rate;
9502 skippable = rd_stats.skip;
9503 distortion2 = rd_stats.dist;
9504 total_sse = rd_stats.sse;
9505 rate_y = rd_stats_y.rate;
9506 rate_uv = rd_stats_uv.rate;
9507 #if CONFIG_DAALA_DIST && CONFIG_CB4X4
9508 if (bsize < BLOCK_8X8) distortion2_y = rd_stats_y.dist;
9509 #endif
9512 // TODO(jingning): This needs some refactoring to improve code quality
9513 // and reduce redundant steps.
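// If additional reference-MV candidates are available in the ref_mv_stack,
// the current mode is re-evaluated below with ref_mv_idx > 0 and the
// candidate with the lowest rate-distortion cost is kept.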
9514 #if CONFIG_EXT_INTER
9515 if ((have_nearmv_in_inter_mode(mbmi->mode) &&
9516 mbmi_ext->ref_mv_count[ref_frame_type] > 2) ||
9517 ((mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) &&
9518 mbmi_ext->ref_mv_count[ref_frame_type] > 1)) {
9519 #else
9520 if ((mbmi->mode == NEARMV &&
9521 mbmi_ext->ref_mv_count[ref_frame_type] > 2) ||
9522 (mbmi->mode == NEWMV && mbmi_ext->ref_mv_count[ref_frame_type] > 1)) {
9523 #endif
9524 int_mv backup_mv = frame_mv[NEARMV][ref_frame];
9525 MB_MODE_INFO backup_mbmi = *mbmi;
9526 int backup_skip = x->skip;
9527 int64_t tmp_ref_rd = this_rd;
9528 int ref_idx;
9530 // TODO(jingning): This should be deprecated shortly.
9531 #if CONFIG_EXT_INTER
9532 int idx_offset = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
9533 #else
9534 int idx_offset = (mbmi->mode == NEARMV) ? 1 : 0;
9535 #endif // CONFIG_EXT_INTER
9536 int ref_set =
9537 AOMMIN(2, mbmi_ext->ref_mv_count[ref_frame_type] - 1 - idx_offset);
9539 uint8_t drl_ctx =
9540 av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx_offset);
9541 // Back up the NEWMV motion vectors; they are restored after the ref_mv_idx search below.
9542 int_mv backup_fmv[2];
9543 backup_fmv[0] = frame_mv[NEWMV][ref_frame];
9544 if (comp_pred) backup_fmv[1] = frame_mv[NEWMV][second_ref_frame];
9546 rate2 += (rate2 < INT_MAX ? cpi->drl_mode_cost0[drl_ctx][0] : 0);
9548 if (this_rd < INT64_MAX) {
9549 if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
9550 RDCOST(x->rdmult, x->rddiv, 0, total_sse))
9551 tmp_ref_rd =
9552 RDCOST(x->rdmult, x->rddiv,
9553 rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 0),
9554 distortion2);
9555 else
9556 tmp_ref_rd =
9557 RDCOST(x->rdmult, x->rddiv,
9558 rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 1) -
9559 rate_y - rate_uv,
9560 total_sse);
9562 #if CONFIG_VAR_TX
9563 for (i = 0; i < MAX_MB_PLANE; ++i)
9564 memcpy(x->blk_skip_drl[i], x->blk_skip[i],
9565 sizeof(uint8_t) * ctx->num_4x4_blk);
9566 #endif // CONFIG_VAR_TX
9568 for (ref_idx = 0; ref_idx < ref_set; ++ref_idx) {
9569 int64_t tmp_alt_rd = INT64_MAX;
9570 int dummy_disable_skip = 0;
9571 int ref;
9572 int_mv cur_mv;
9573 RD_STATS tmp_rd_stats, tmp_rd_stats_y, tmp_rd_stats_uv;
9575 av1_invalid_rd_stats(&tmp_rd_stats);
9576 x->skip = 0;
9578 mbmi->ref_mv_idx = 1 + ref_idx;
9580 #if CONFIG_EXT_INTER
9581 if (comp_pred) {
9582 int ref_mv_idx = mbmi->ref_mv_idx;
9583 // Special case: NEAR_NEWMV and NEW_NEARMV modes use
9584 // 1 + mbmi->ref_mv_idx (like NEARMV) instead of
9585 // mbmi->ref_mv_idx (like NEWMV)
9586 if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV)
9587 ref_mv_idx = 1 + mbmi->ref_mv_idx;
9589 if (compound_ref0_mode(mbmi->mode) == NEWMV) {
9590 int_mv this_mv =
9591 mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
9592 clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
9593 xd->n8_h << MI_SIZE_LOG2, xd);
9594 mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
9595 } else if (compound_ref0_mode(mbmi->mode) == NEARESTMV) {
9596 int_mv this_mv =
9597 mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
9598 clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
9599 xd->n8_h << MI_SIZE_LOG2, xd);
9600 mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0] = this_mv;
9603 if (compound_ref1_mode(mbmi->mode) == NEWMV) {
9604 int_mv this_mv =
9605 mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
9606 clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
9607 xd->n8_h << MI_SIZE_LOG2, xd);
9608 mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv;
9609 } else if (compound_ref1_mode(mbmi->mode) == NEARESTMV) {
9610 int_mv this_mv =
9611 mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
9612 clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
9613 xd->n8_h << MI_SIZE_LOG2, xd);
9614 mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0] = this_mv;
9616 } else {
9617 #endif // CONFIG_EXT_INTER
9618 for (ref = 0; ref < 1 + comp_pred; ++ref) {
9619 int_mv this_mv =
9620 (ref == 0)
9621 ? mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
9622 .this_mv
9623 : mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
9624 .comp_mv;
9625 clamp_mv_ref(&this_mv.as_mv, xd->n8_w << MI_SIZE_LOG2,
9626 xd->n8_h << MI_SIZE_LOG2, xd);
9627 mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv;
9629 #if CONFIG_EXT_INTER
9631 #endif
9633 cur_mv =
9634 mbmi_ext->ref_mv_stack[ref_frame][mbmi->ref_mv_idx + idx_offset]
9635 .this_mv;
9636 clamp_mv2(&cur_mv.as_mv, xd);
9638 if (!mv_check_bounds(&x->mv_limits, &cur_mv.as_mv)) {
9639 int_mv dummy_single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } };
9640 #if CONFIG_EXT_INTER
9641 int dummy_single_newmv_rate[TOTAL_REFS_PER_FRAME] = { 0 };
9642 #endif // CONFIG_EXT_INTER
9644 frame_mv[NEARMV][ref_frame] = cur_mv;
9645 av1_init_rd_stats(&tmp_rd_stats);
9647 // Point to variables that are not maintained between iterations
9648 args.single_newmv = dummy_single_newmv;
9649 #if CONFIG_EXT_INTER
9650 args.single_newmv_rate = dummy_single_newmv_rate;
9651 args.modelled_rd = NULL;
9652 #endif // CONFIG_EXT_INTER
9653 tmp_alt_rd = handle_inter_mode(
9654 cpi, x, bsize, &tmp_rd_stats, &tmp_rd_stats_y, &tmp_rd_stats_uv,
9655 &dummy_disable_skip, frame_mv, mi_row, mi_col, &args, best_rd);
9656 // Prevent pointers from escaping local scope
9657 args.single_newmv = NULL;
9658 #if CONFIG_EXT_INTER
9659 args.single_newmv_rate = NULL;
9660 #endif // CONFIG_EXT_INTER
9663 for (i = 0; i < mbmi->ref_mv_idx; ++i) {
9664 uint8_t drl1_ctx = 0;
9665 drl1_ctx = av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type],
9666 i + idx_offset);
9667 tmp_rd_stats.rate +=
9668 (tmp_rd_stats.rate < INT_MAX ? cpi->drl_mode_cost0[drl1_ctx][1]
9669 : 0);
9672 if (mbmi_ext->ref_mv_count[ref_frame_type] >
9673 mbmi->ref_mv_idx + idx_offset + 1 &&
9674 ref_idx < ref_set - 1) {
9675 uint8_t drl1_ctx =
9676 av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type],
9677 mbmi->ref_mv_idx + idx_offset);
9678 tmp_rd_stats.rate +=
9679 (tmp_rd_stats.rate < INT_MAX ? cpi->drl_mode_cost0[drl1_ctx][0]
9680 : 0);
9683 if (tmp_alt_rd < INT64_MAX) {
9684 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9685 tmp_alt_rd = RDCOST(x->rdmult, x->rddiv, tmp_rd_stats.rate,
9686 tmp_rd_stats.dist);
9687 #else
9688 if (RDCOST(x->rdmult, x->rddiv,
9689 tmp_rd_stats_y.rate + tmp_rd_stats_uv.rate,
9690 tmp_rd_stats.dist) <
9691 RDCOST(x->rdmult, x->rddiv, 0, tmp_rd_stats.sse))
9692 tmp_alt_rd =
9693 RDCOST(x->rdmult, x->rddiv,
9694 tmp_rd_stats.rate +
9695 av1_cost_bit(av1_get_skip_prob(cm, xd), 0),
9696 tmp_rd_stats.dist);
9697 else
9698 tmp_alt_rd =
9699 RDCOST(x->rdmult, x->rddiv,
9700 tmp_rd_stats.rate +
9701 av1_cost_bit(av1_get_skip_prob(cm, xd), 1) -
9702 tmp_rd_stats_y.rate - tmp_rd_stats_uv.rate,
9703 tmp_rd_stats.sse);
9704 #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9707 if (tmp_ref_rd > tmp_alt_rd) {
9708 rate2 = tmp_rd_stats.rate;
9709 disable_skip = dummy_disable_skip;
9710 distortion2 = tmp_rd_stats.dist;
9711 skippable = tmp_rd_stats.skip;
9712 rate_y = tmp_rd_stats_y.rate;
9713 rate_uv = tmp_rd_stats_uv.rate;
9714 total_sse = tmp_rd_stats.sse;
9715 this_rd = tmp_alt_rd;
9716 tmp_ref_rd = tmp_alt_rd;
9717 backup_mbmi = *mbmi;
9718 backup_skip = x->skip;
9719 #if CONFIG_DAALA_DIST && CONFIG_CB4X4
9720 if (bsize < BLOCK_8X8) {
9721 total_sse_y = tmp_rd_stats_y.sse;
9722 distortion2_y = tmp_rd_stats_y.dist;
9724 #endif
9725 #if CONFIG_VAR_TX
9726 for (i = 0; i < MAX_MB_PLANE; ++i)
9727 memcpy(x->blk_skip_drl[i], x->blk_skip[i],
9728 sizeof(uint8_t) * ctx->num_4x4_blk);
9729 #endif // CONFIG_VAR_TX
9730 } else {
9731 *mbmi = backup_mbmi;
9732 x->skip = backup_skip;
9736 frame_mv[NEARMV][ref_frame] = backup_mv;
9737 frame_mv[NEWMV][ref_frame] = backup_fmv[0];
9738 if (comp_pred) frame_mv[NEWMV][second_ref_frame] = backup_fmv[1];
9739 #if CONFIG_VAR_TX
9740 for (i = 0; i < MAX_MB_PLANE; ++i)
9741 memcpy(x->blk_skip[i], x->blk_skip_drl[i],
9742 sizeof(uint8_t) * ctx->num_4x4_blk);
9743 #endif // CONFIG_VAR_TX
9745 mbmi_ext->ref_mvs[ref_frame][0] = backup_ref_mv[0];
9746 if (comp_pred) mbmi_ext->ref_mvs[second_ref_frame][0] = backup_ref_mv[1];
9748 if (this_rd == INT64_MAX) continue;
9750 #if SUB8X8_COMP_REF
9751 compmode_cost = av1_cost_bit(comp_mode_p, comp_pred);
9752 #else
9753 if (mbmi->sb_type != BLOCK_4X4)
9754 compmode_cost = av1_cost_bit(comp_mode_p, comp_pred);
9755 #endif // SUB8X8_COMP_REF
9757 if (cm->reference_mode == REFERENCE_MODE_SELECT) rate2 += compmode_cost;
9760 // Estimate the reference frame signaling cost and add it
9761 // to the rolling cost variable.
9762 if (comp_pred) {
9763 rate2 += ref_costs_comp[ref_frame];
9764 #if CONFIG_EXT_REFS
9765 rate2 += ref_costs_comp[second_ref_frame];
9766 #endif // CONFIG_EXT_REFS
9767 } else {
9768 rate2 += ref_costs_single[ref_frame];
9771 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9772 if (ref_frame == INTRA_FRAME) {
9773 #else
9774 if (!disable_skip) {
9775 #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9776 if (skippable) {
9777 // Back out the coefficient coding costs
9778 rate2 -= (rate_y + rate_uv);
9779 rate_y = 0;
9780 rate_uv = 0;
9781 // Cost the skip mb case
9782 rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
9783 } else if (ref_frame != INTRA_FRAME && !xd->lossless[mbmi->segment_id]) {
9784 if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv + rate_skip0,
9785 distortion2) <
9786 RDCOST(x->rdmult, x->rddiv, rate_skip1, total_sse)) {
9787 // Add in the cost of the no skip flag.
9788 rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
9789 } else {
9790 // FIXME(rbultje) make this work for splitmv also
9791 rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
9792 distortion2 = total_sse;
9793 assert(total_sse >= 0);
9794 rate2 -= (rate_y + rate_uv);
9795 this_skip2 = 1;
9796 rate_y = 0;
9797 rate_uv = 0;
9798 #if CONFIG_DAALA_DIST && CONFIG_CB4X4
9799 if (bsize < BLOCK_8X8) distortion2_y = total_sse_y;
9800 #endif
9802 } else {
9803 // Add in the cost of the no skip flag.
9804 rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
9807 // Calculate the final RD estimate for this mode.
9808 this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
9809 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9810 } else {
9811 this_skip2 = mbmi->skip;
9812 this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
9813 if (this_skip2) {
9814 rate_y = 0;
9815 rate_uv = 0;
9817 #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9820 if (ref_frame == INTRA_FRAME) {
9821 // Keep record of best intra rd
9822 if (this_rd < best_intra_rd) {
9823 best_intra_rd = this_rd;
9824 best_intra_mode = mbmi->mode;
9826 #if CONFIG_EXT_INTER && CONFIG_INTERINTRA
9827 } else if (second_ref_frame == NONE_FRAME) {
9828 if (this_rd < best_single_inter_rd) {
9829 best_single_inter_rd = this_rd;
9830 best_single_inter_ref = mbmi->ref_frame[0];
9832 #endif // CONFIG_EXT_INTER && CONFIG_INTERINTRA
9835 if (!disable_skip && ref_frame == INTRA_FRAME) {
9836 for (i = 0; i < REFERENCE_MODES; ++i)
9837 best_pred_rd[i] = AOMMIN(best_pred_rd[i], this_rd);
9840 // Did this mode help, i.e. is it the new best mode so far?
9841 if (this_rd < best_rd || x->skip) {
9842 if (!mode_excluded) {
9843 // Note index of best mode so far
9844 best_mode_index = mode_index;
9846 if (ref_frame == INTRA_FRAME) {
9847 /* required for left and above block mv */
9848 mbmi->mv[0].as_int = 0;
9849 } else {
9850 best_pred_sse = x->pred_sse[ref_frame];
9853 rd_cost->rate = rate2;
9854 #if CONFIG_SUPERTX
9855 if (x->skip)
9856 *returnrate_nocoef = rate2;
9857 else
9858 *returnrate_nocoef = rate2 - rate_y - rate_uv;
9859 *returnrate_nocoef -= av1_cost_bit(
9860 av1_get_skip_prob(cm, xd), disable_skip || skippable || this_skip2);
9861 *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd),
9862 mbmi->ref_frame[0] != INTRA_FRAME);
9863 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9864 #if CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
9865 MODE_INFO *const mi = xd->mi[0];
9866 const MOTION_MODE motion_allowed = motion_mode_allowed(
9867 #if CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
9868 0, xd->global_motion,
9869 #endif // CONFIG_GLOBAL_MOTION && SEPARATE_GLOBAL_MOTION
9870 mi);
9871 if (motion_allowed == WARPED_CAUSAL)
9872 *returnrate_nocoef -= cpi->motion_mode_cost[bsize][mbmi->motion_mode];
9873 else if (motion_allowed == OBMC_CAUSAL)
9874 *returnrate_nocoef -=
9875 cpi->motion_mode_cost1[bsize][mbmi->motion_mode];
9876 #else
9877 *returnrate_nocoef -= cpi->motion_mode_cost[bsize][mbmi->motion_mode];
9878 #endif // CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
9879 #endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
9880 #endif // CONFIG_SUPERTX
9881 rd_cost->dist = distortion2;
9882 rd_cost->rdcost = this_rd;
9883 best_rd = this_rd;
9884 best_mbmode = *mbmi;
9885 best_skip2 = this_skip2;
9886 best_mode_skippable = skippable;
9887 best_rate_y = rate_y + av1_cost_bit(av1_get_skip_prob(cm, xd),
9888 this_skip2 || skippable);
9889 best_rate_uv = rate_uv;
9890 #if CONFIG_DAALA_DIST && CONFIG_CB4X4
9891 if (bsize < BLOCK_8X8) rd_cost->dist_y = distortion2_y;
9892 #endif
9893 #if CONFIG_VAR_TX
9894 for (i = 0; i < MAX_MB_PLANE; ++i)
9895 memcpy(ctx->blk_skip[i], x->blk_skip[i],
9896 sizeof(uint8_t) * ctx->num_4x4_blk);
9897 #endif // CONFIG_VAR_TX
9901 /* keep record of best compound/single-only prediction */
9902 if (!disable_skip && ref_frame != INTRA_FRAME) {
9903 int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
9905 if (cm->reference_mode == REFERENCE_MODE_SELECT) {
9906 single_rate = rate2 - compmode_cost;
9907 hybrid_rate = rate2;
9908 } else {
9909 single_rate = rate2;
9910 hybrid_rate = rate2 + compmode_cost;
9913 single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
9914 hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
9916 if (!comp_pred) {
9917 if (single_rd < best_pred_rd[SINGLE_REFERENCE])
9918 best_pred_rd[SINGLE_REFERENCE] = single_rd;
9919 } else {
9920 if (single_rd < best_pred_rd[COMPOUND_REFERENCE])
9921 best_pred_rd[COMPOUND_REFERENCE] = single_rd;
9923 if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
9924 best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
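// Early termination: once a single-reference mode yields a skippable block,
// stop searching further modes.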
9927 if (x->skip && !comp_pred) break;
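// If a fast transform-type search was used during the mode loop, redo the
// full transform-type selection for the winning mode and adopt the result
// if it lowers the RD cost.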
9930 if (xd->lossless[mbmi->segment_id] == 0 && best_mode_index >= 0 &&
9931 ((sf->tx_type_search.fast_inter_tx_type_search == 1 &&
9932 is_inter_mode(best_mbmode.mode)) ||
9933 (sf->tx_type_search.fast_intra_tx_type_search == 1 &&
9934 !is_inter_mode(best_mbmode.mode)))) {
9935 int skip_blk = 0;
9936 RD_STATS rd_stats_y, rd_stats_uv;
9938 x->use_default_inter_tx_type = 0;
9939 x->use_default_intra_tx_type = 0;
9941 *mbmi = best_mbmode;
9943 set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
9945 // Select prediction reference frames.
9946 for (i = 0; i < MAX_MB_PLANE; i++) {
9947 xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
9948 if (has_second_ref(mbmi))
9949 xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
9952 if (is_inter_mode(mbmi->mode)) {
9953 av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
9954 #if CONFIG_MOTION_VAR
9955 if (mbmi->motion_mode == OBMC_CAUSAL) {
9956 av1_build_obmc_inter_prediction(
9957 cm, xd, mi_row, mi_col, args.above_pred_buf, args.above_pred_stride,
9958 args.left_pred_buf, args.left_pred_stride);
9960 #endif // CONFIG_MOTION_VAR
9961 av1_subtract_plane(x, bsize, 0);
9962 #if CONFIG_VAR_TX
9963 if (cm->tx_mode == TX_MODE_SELECT || xd->lossless[mbmi->segment_id]) {
9964 select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
9965 } else {
9966 int idx, idy;
9967 super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
9968 for (idy = 0; idy < xd->n8_h; ++idy)
9969 for (idx = 0; idx < xd->n8_w; ++idx)
9970 mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
9971 memset(x->blk_skip[0], rd_stats_y.skip,
9972 sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
9975 inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
9976 #else
9977 super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
9978 super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
9979 #endif // CONFIG_VAR_TX
9980 } else {
9981 super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
9982 super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
9985 if (RDCOST(x->rdmult, x->rddiv, rd_stats_y.rate + rd_stats_uv.rate,
9986 (rd_stats_y.dist + rd_stats_uv.dist)) >
9987 RDCOST(x->rdmult, x->rddiv, 0, (rd_stats_y.sse + rd_stats_uv.sse))) {
9988 skip_blk = 1;
9989 rd_stats_y.rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
9990 rd_stats_uv.rate = 0;
9991 rd_stats_y.dist = rd_stats_y.sse;
9992 rd_stats_uv.dist = rd_stats_uv.sse;
9993 } else {
9994 skip_blk = 0;
9995 rd_stats_y.rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
9998 if (RDCOST(x->rdmult, x->rddiv, best_rate_y + best_rate_uv, rd_cost->dist) >
9999 RDCOST(x->rdmult, x->rddiv, rd_stats_y.rate + rd_stats_uv.rate,
10000 (rd_stats_y.dist + rd_stats_uv.dist))) {
10001 #if CONFIG_VAR_TX
10002 int idx, idy;
10003 #endif // CONFIG_VAR_TX
10004 best_mbmode.tx_type = mbmi->tx_type;
10005 best_mbmode.tx_size = mbmi->tx_size;
10006 #if CONFIG_VAR_TX
10007 for (idy = 0; idy < xd->n8_h; ++idy)
10008 for (idx = 0; idx < xd->n8_w; ++idx)
10009 best_mbmode.inter_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx];
10011 for (i = 0; i < MAX_MB_PLANE; ++i)
10012 memcpy(ctx->blk_skip[i], x->blk_skip[i],
10013 sizeof(uint8_t) * ctx->num_4x4_blk);
10015 best_mbmode.min_tx_size = mbmi->min_tx_size;
10016 #endif // CONFIG_VAR_TX
10017 rd_cost->rate +=
10018 (rd_stats_y.rate + rd_stats_uv.rate - best_rate_y - best_rate_uv);
10019 rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist;
10020 #if CONFIG_DAALA_DIST && CONFIG_CB4X4
10021 if (bsize < BLOCK_8X8) rd_cost->dist_y = rd_stats_y.dist;
10022 #endif
10023 rd_cost->rdcost =
10024 RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
10025 best_skip2 = skip_blk;
10029 #if CONFIG_PALETTE
10030 // Only try palette mode when the best mode so far is an intra mode.
10031 if (try_palette && !is_inter_mode(best_mbmode.mode)) {
10032 int rate2 = 0;
10033 #if CONFIG_SUPERTX
10034 int best_rate_nocoef;
10035 #endif // CONFIG_SUPERTX
10036 int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd,
10037 best_model_rd_palette = INT64_MAX;
10038 int skippable = 0, rate_overhead_palette = 0;
10039 RD_STATS rd_stats_y;
10040 TX_SIZE uv_tx;
10041 uint8_t *const best_palette_color_map =
10042 x->palette_buffer->best_palette_color_map;
10043 uint8_t *const color_map = xd->plane[0].color_index_map;
10044 MB_MODE_INFO best_mbmi_palette = best_mbmode;
10046 mbmi->mode = DC_PRED;
10047 mbmi->uv_mode = DC_PRED;
10048 mbmi->ref_frame[0] = INTRA_FRAME;
10049 mbmi->ref_frame[1] = NONE_FRAME;
10050 rate_overhead_palette = rd_pick_palette_intra_sby(
10051 cpi, x, bsize, palette_ctx, intra_mode_cost[DC_PRED],
10052 &best_mbmi_palette, best_palette_color_map, &best_rd_palette,
10053 &best_model_rd_palette, NULL, NULL, NULL, NULL);
10054 if (pmi->palette_size[0] == 0) goto PALETTE_EXIT;
10055 memcpy(color_map, best_palette_color_map,
10056 rows * cols * sizeof(best_palette_color_map[0]));
10057 super_block_yrd(cpi, x, &rd_stats_y, bsize, best_rd);
10058 if (rd_stats_y.rate == INT_MAX) goto PALETTE_EXIT;
10059 uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][xd->plane[1].subsampling_x]
10060 [xd->plane[1].subsampling_y];
10061 if (rate_uv_intra[uv_tx] == INT_MAX) {
10062 choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx],
10063 &rate_uv_tokenonly[uv_tx], &dist_uvs[uv_tx],
10064 &skip_uvs[uv_tx], &mode_uv[uv_tx]);
10065 pmi_uv[uv_tx] = *pmi;
10066 #if CONFIG_EXT_INTRA
10067 uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
10068 #endif // CONFIG_EXT_INTRA
10069 #if CONFIG_FILTER_INTRA
10070 filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info;
10071 #endif // CONFIG_FILTER_INTRA
10073 mbmi->uv_mode = mode_uv[uv_tx];
10074 pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
10075 if (pmi->palette_size[1] > 0) {
10076 memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
10077 pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
10078 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
10080 #if CONFIG_EXT_INTRA
10081 mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
10082 #endif // CONFIG_EXT_INTRA
10083 #if CONFIG_FILTER_INTRA
10084 mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
10085 filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1];
10086 if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) {
10087 mbmi->filter_intra_mode_info.filter_intra_mode[1] =
10088 filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1];
10090 #endif // CONFIG_FILTER_INTRA
10091 skippable = rd_stats_y.skip && skip_uvs[uv_tx];
10092 distortion2 = rd_stats_y.dist + dist_uvs[uv_tx];
10093 rate2 = rd_stats_y.rate + rate_overhead_palette + rate_uv_intra[uv_tx];
10094 rate2 += ref_costs_single[INTRA_FRAME];
10096 if (skippable) {
10097 rate2 -= (rd_stats_y.rate + rate_uv_tokenonly[uv_tx]);
10098 #if CONFIG_SUPERTX
10099 best_rate_nocoef = rate2;
10100 #endif // CONFIG_SUPERTX
10101 rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
10102 } else {
10103 #if CONFIG_SUPERTX
10104 best_rate_nocoef = rate2 - (rd_stats_y.rate + rate_uv_tokenonly[uv_tx]);
10105 #endif // CONFIG_SUPERTX
10106 rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
10108 this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
10109 if (this_rd < best_rd) {
10110 best_mode_index = 3;
10111 mbmi->mv[0].as_int = 0;
10112 rd_cost->rate = rate2;
10113 #if CONFIG_SUPERTX
10114 *returnrate_nocoef = best_rate_nocoef;
10115 #endif // CONFIG_SUPERTX
10116 rd_cost->dist = distortion2;
10117 rd_cost->rdcost = this_rd;
10118 best_rd = this_rd;
10119 best_mbmode = *mbmi;
10120 best_skip2 = 0;
10121 best_mode_skippable = skippable;
10124 PALETTE_EXIT:
10125 #endif // CONFIG_PALETTE
10127 #if CONFIG_FILTER_INTRA
10128 // TODO(huisu): filter-intra is turned off in lossless mode for now to
10129 // avoid a unit test failure
10130 if (!xd->lossless[mbmi->segment_id] &&
10131 #if CONFIG_PALETTE
10132 pmi->palette_size[0] == 0 &&
10133 #endif // CONFIG_PALETTE
10134 !dc_skipped && best_mode_index >= 0 &&
10135 best_intra_rd < (best_rd + (best_rd >> 3))) {
10136 pick_filter_intra_interframe(
10137 cpi, x, ctx, bsize, mi_row, mi_col, rate_uv_intra, rate_uv_tokenonly,
10138 dist_uvs, skip_uvs, mode_uv, filter_intra_mode_info_uv,
10139 #if CONFIG_EXT_INTRA
10140 uv_angle_delta,
10141 #endif // CONFIG_EXT_INTRA
10142 #if CONFIG_PALETTE
10143 pmi_uv, palette_ctx,
10144 #endif // CONFIG_PALETTE
10145 0, ref_costs_single, &best_rd, &best_intra_rd, &best_intra_mode,
10146 &best_mode_index, &best_skip2, &best_mode_skippable,
10147 #if CONFIG_SUPERTX
10148 returnrate_nocoef,
10149 #endif // CONFIG_SUPERTX
10150 best_pred_rd, &best_mbmode, rd_cost);
10152 #endif // CONFIG_FILTER_INTRA
10154 // The inter modes' rate costs are not calculated precisely in some cases.
10155 // Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and
10156 // ZEROMV. Here, checks are added for those cases, and the mode decisions
10157 // are corrected.
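// For example, a NEWMV whose best motion vector coincides with the NEARESTMV
// (or a DRL NEARMV) candidate is relabeled below so that the cheaper mode
// signaling is used.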
10158 if (best_mbmode.mode == NEWMV
10159 #if CONFIG_EXT_INTER
10160 || best_mbmode.mode == NEW_NEWMV
10161 #endif // CONFIG_EXT_INTER
10163 const MV_REFERENCE_FRAME refs[2] = { best_mbmode.ref_frame[0],
10164 best_mbmode.ref_frame[1] };
10165 int comp_pred_mode = refs[1] > INTRA_FRAME;
10166 int_mv zeromv[2];
10167 const uint8_t rf_type = av1_ref_frame_type(best_mbmode.ref_frame);
10168 #if CONFIG_GLOBAL_MOTION
10169 zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[refs[0]],
10170 cm->allow_high_precision_mv, bsize,
10171 mi_col, mi_row, 0)
10172 .as_int;
10173 zeromv[1].as_int = comp_pred_mode
10174 ? gm_get_motion_vector(&cm->global_motion[refs[1]],
10175 cm->allow_high_precision_mv,
10176 bsize, mi_col, mi_row, 0)
10177 .as_int
10178 : 0;
10179 #else
10180 zeromv[0].as_int = 0;
10181 zeromv[1].as_int = 0;
10182 #endif // CONFIG_GLOBAL_MOTION
10183 if (!comp_pred_mode) {
10184 int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2)
10185 ? AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2)
10186 : INT_MAX;
10188 for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) {
10189 int_mv cur_mv = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv;
10190 if (cur_mv.as_int == best_mbmode.mv[0].as_int) {
10191 best_mbmode.mode = NEARMV;
10192 best_mbmode.ref_mv_idx = i;
10196 if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int)
10197 best_mbmode.mode = NEARESTMV;
10198 else if (best_mbmode.mv[0].as_int == zeromv[0].as_int)
10199 best_mbmode.mode = ZEROMV;
10200 } else {
10201 int_mv nearestmv[2];
10202 int_mv nearmv[2];
10204 #if CONFIG_EXT_INTER
10205 if (mbmi_ext->ref_mv_count[rf_type] > 1) {
10206 nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][1].this_mv;
10207 nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][1].comp_mv;
10208 } else {
10209 nearmv[0] = frame_mv[NEARMV][refs[0]];
10210 nearmv[1] = frame_mv[NEARMV][refs[1]];
10212 #else
10213 int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2)
10214 ? AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2)
10215 : INT_MAX;
10217 for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) {
10218 nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv;
10219 nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][i + 1].comp_mv;
10221 if (nearmv[0].as_int == best_mbmode.mv[0].as_int &&
10222 nearmv[1].as_int == best_mbmode.mv[1].as_int) {
10223 best_mbmode.mode = NEARMV;
10224 best_mbmode.ref_mv_idx = i;
10227 #endif // CONFIG_EXT_INTER
10228 if (mbmi_ext->ref_mv_count[rf_type] >= 1) {
10229 nearestmv[0] = mbmi_ext->ref_mv_stack[rf_type][0].this_mv;
10230 nearestmv[1] = mbmi_ext->ref_mv_stack[rf_type][0].comp_mv;
10231 } else {
10232 nearestmv[0] = frame_mv[NEARESTMV][refs[0]];
10233 nearestmv[1] = frame_mv[NEARESTMV][refs[1]];
10236 if (nearestmv[0].as_int == best_mbmode.mv[0].as_int &&
10237 nearestmv[1].as_int == best_mbmode.mv[1].as_int) {
10238 #if CONFIG_EXT_INTER
10239 best_mbmode.mode = NEAREST_NEARESTMV;
10240 } else {
10241 int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2)
10242 ? AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2)
10243 : INT_MAX;
10245 for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) {
10246 nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv;
10247 nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][i + 1].comp_mv;
10249 // Try switching to the NEAR_NEARMV mode
10250 if (nearmv[0].as_int == best_mbmode.mv[0].as_int &&
10251 nearmv[1].as_int == best_mbmode.mv[1].as_int) {
10252 best_mbmode.mode = NEAR_NEARMV;
10253 best_mbmode.ref_mv_idx = i;
10257 if (best_mbmode.mode == NEW_NEWMV &&
10258 best_mbmode.mv[0].as_int == zeromv[0].as_int &&
10259 best_mbmode.mv[1].as_int == zeromv[1].as_int)
10260 best_mbmode.mode = ZERO_ZEROMV;
10262 #else
10263 best_mbmode.mode = NEARESTMV;
10264 } else if (best_mbmode.mv[0].as_int == zeromv[0].as_int &&
10265 best_mbmode.mv[1].as_int == zeromv[1].as_int) {
10266 best_mbmode.mode = ZEROMV;
10268 #endif // CONFIG_EXT_INTER
10272 // Make sure that the ref_mv_idx is only nonzero when we're
10273 // using a mode which can support ref_mv_idx
10274 if (best_mbmode.ref_mv_idx != 0 &&
10275 #if CONFIG_EXT_INTER
10276 !(best_mbmode.mode == NEWMV || best_mbmode.mode == NEW_NEWMV ||
10277 have_nearmv_in_inter_mode(best_mbmode.mode))) {
10278 #else
10279 !(best_mbmode.mode == NEARMV || best_mbmode.mode == NEWMV)) {
10280 #endif
10281 best_mbmode.ref_mv_idx = 0;
10285 int8_t ref_frame_type = av1_ref_frame_type(best_mbmode.ref_frame);
10286 int16_t mode_ctx = mbmi_ext->mode_context[ref_frame_type];
10287 if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) {
10288 int_mv zeromv[2];
10289 #if CONFIG_GLOBAL_MOTION
10290 const MV_REFERENCE_FRAME refs[2] = { best_mbmode.ref_frame[0],
10291 best_mbmode.ref_frame[1] };
10292 zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[refs[0]],
10293 cm->allow_high_precision_mv,
10294 bsize, mi_col, mi_row, 0)
10295 .as_int;
10296 zeromv[1].as_int = (refs[1] != NONE_FRAME)
10297 ? gm_get_motion_vector(&cm->global_motion[refs[1]],
10298 cm->allow_high_precision_mv,
10299 bsize, mi_col, mi_row, 0)
10300 .as_int
10301 : 0;
10302 lower_mv_precision(&zeromv[0].as_mv, cm->allow_high_precision_mv);
10303 lower_mv_precision(&zeromv[1].as_mv, cm->allow_high_precision_mv);
10304 #else
10305 zeromv[0].as_int = zeromv[1].as_int = 0;
10306 #endif // CONFIG_GLOBAL_MOTION
10307 if (best_mbmode.ref_frame[0] > INTRA_FRAME &&
10308 best_mbmode.mv[0].as_int == zeromv[0].as_int &&
10309 #if CONFIG_EXT_INTER
10310 (best_mbmode.ref_frame[1] <= INTRA_FRAME)
10311 #else
10312 (best_mbmode.ref_frame[1] == NONE_FRAME ||
10313 best_mbmode.mv[1].as_int == zeromv[1].as_int)
10314 #endif // CONFIG_EXT_INTER
10316 best_mbmode.mode = ZEROMV;
10321 if (best_mode_index < 0 || best_rd >= best_rd_so_far) {
10322 rd_cost->rate = INT_MAX;
10323 rd_cost->rdcost = INT64_MAX;
10324 return;
10327 #if CONFIG_DUAL_FILTER
10328 assert((cm->interp_filter == SWITCHABLE) ||
10329 (cm->interp_filter == best_mbmode.interp_filter[0]) ||
10330 !is_inter_block(&best_mbmode));
10331 assert((cm->interp_filter == SWITCHABLE) ||
10332 (cm->interp_filter == best_mbmode.interp_filter[1]) ||
10333 !is_inter_block(&best_mbmode));
10334 if (best_mbmode.ref_frame[1] > INTRA_FRAME) {
10335 assert((cm->interp_filter == SWITCHABLE) ||
10336 (cm->interp_filter == best_mbmode.interp_filter[2]) ||
10337 !is_inter_block(&best_mbmode));
10338 assert((cm->interp_filter == SWITCHABLE) ||
10339 (cm->interp_filter == best_mbmode.interp_filter[3]) ||
10340 !is_inter_block(&best_mbmode));
10342 #else
10343 assert((cm->interp_filter == SWITCHABLE) ||
10344 (cm->interp_filter == best_mbmode.interp_filter) ||
10345 !is_inter_block(&best_mbmode));
10346 #endif // CONFIG_DUAL_FILTER
10348 if (!cpi->rc.is_src_frame_alt_ref)
10349 av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
10350 sf->adaptive_rd_thresh, bsize, best_mode_index);
10352 // macroblock modes
10353 *mbmi = best_mbmode;
10354 x->skip |= best_skip2;
10356 // Note: this section is needed since the mode may have been forced to
10357 // ZEROMV by the all-zero mode handling of ref-mv.
10358 #if CONFIG_GLOBAL_MOTION
10359 if (mbmi->mode == ZEROMV
10360 #if CONFIG_EXT_INTER
10361 || mbmi->mode == ZERO_ZEROMV
10362 #endif // CONFIG_EXT_INTER
10364 #if CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR
10365 // Correct the motion mode for ZEROMV
10366 const MOTION_MODE last_motion_mode_allowed = motion_mode_allowed(
10367 #if SEPARATE_GLOBAL_MOTION
10368 0, xd->global_motion,
10369 #endif // SEPARATE_GLOBAL_MOTION
10370 xd->mi[0]);
10371 if (mbmi->motion_mode > last_motion_mode_allowed)
10372 mbmi->motion_mode = last_motion_mode_allowed;
10373 #endif // CONFIG_WARPED_MOTION || CONFIG_MOTION_VAR
10375 // Correct the interpolation filter for ZEROMV
10376 if (is_nontrans_global_motion(xd)) {
10377 #if CONFIG_DUAL_FILTER
10378 mbmi->interp_filter[0] = cm->interp_filter == SWITCHABLE
10379 ? EIGHTTAP_REGULAR
10380 : cm->interp_filter;
10381 mbmi->interp_filter[1] = cm->interp_filter == SWITCHABLE
10382 ? EIGHTTAP_REGULAR
10383 : cm->interp_filter;
10384 #else
10385 mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP_REGULAR
10386 : cm->interp_filter;
10387 #endif // CONFIG_DUAL_FILTER
10390 #endif // CONFIG_GLOBAL_MOTION
10392 for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
10393 if (mbmi->mode != NEWMV)
10394 mbmi->pred_mv[i].as_int = mbmi->mv[i].as_int;
10395 else
10396 mbmi->pred_mv[i].as_int = mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_int;
10399 for (i = 0; i < REFERENCE_MODES; ++i) {
10400 if (best_pred_rd[i] == INT64_MAX)
10401 best_pred_diff[i] = INT_MIN;
10402 else
10403 best_pred_diff[i] = best_rd - best_pred_rd[i];
10406 x->skip |= best_mode_skippable;
10408 assert(best_mode_index >= 0);
10410 store_coding_context(x, ctx, best_mode_index, best_pred_diff,
10411 best_mode_skippable);
10413 #if CONFIG_PALETTE
10414 if (cm->allow_screen_content_tools && pmi->palette_size[1] > 0) {
10415 restore_uv_color_map(cpi, x);
10417 #endif // CONFIG_PALETTE
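// Simplified RD pick for blocks whose segment has SEG_LVL_SKIP active: the
// block is forced to ZEROMV on LAST_FRAME with the skip flag set, so the
// search reduces to choosing an interpolation filter and adding the
// reference-frame signaling cost.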
10420 void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
10421 TileDataEnc *tile_data, MACROBLOCK *x,
10422 int mi_row, int mi_col,
10423 RD_STATS *rd_cost, BLOCK_SIZE bsize,
10424 PICK_MODE_CONTEXT *ctx,
10425 int64_t best_rd_so_far) {
10426 const AV1_COMMON *const cm = &cpi->common;
10427 MACROBLOCKD *const xd = &x->e_mbd;
10428 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
10429 unsigned char segment_id = mbmi->segment_id;
10430 const int comp_pred = 0;
10431 int i;
10432 int64_t best_pred_diff[REFERENCE_MODES];
10433 unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME];
10434 unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME];
10435 aom_prob comp_mode_p;
10436 InterpFilter best_filter = SWITCHABLE;
10437 int64_t this_rd = INT64_MAX;
10438 int rate2 = 0;
10439 const int64_t distortion2 = 0;
10440 (void)mi_row;
10441 (void)mi_col;
10443 estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
10444 &comp_mode_p);
10446 for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) x->pred_sse[i] = INT_MAX;
10447 for (i = LAST_FRAME; i < TOTAL_REFS_PER_FRAME; ++i)
10448 x->pred_mv_sad[i] = INT_MAX;
10450 rd_cost->rate = INT_MAX;
10452 assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
10454 #if CONFIG_PALETTE
10455 mbmi->palette_mode_info.palette_size[0] = 0;
10456 mbmi->palette_mode_info.palette_size[1] = 0;
10457 #endif // CONFIG_PALETTE
10459 #if CONFIG_FILTER_INTRA
10460 mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
10461 mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
10462 #endif // CONFIG_FILTER_INTRA
10463 mbmi->mode = ZEROMV;
10464 mbmi->motion_mode = SIMPLE_TRANSLATION;
10465 mbmi->uv_mode = DC_PRED;
10466 mbmi->ref_frame[0] = LAST_FRAME;
10467 mbmi->ref_frame[1] = NONE_FRAME;
10468 #if CONFIG_GLOBAL_MOTION
10469 mbmi->mv[0].as_int =
10470 gm_get_motion_vector(&cm->global_motion[mbmi->ref_frame[0]],
10471 cm->allow_high_precision_mv, bsize, mi_col, mi_row,
10473 .as_int;
10474 #else // CONFIG_GLOBAL_MOTION
10475 mbmi->mv[0].as_int = 0;
10476 #endif // CONFIG_GLOBAL_MOTION
10477 mbmi->tx_size = max_txsize_lookup[bsize];
10478 x->skip = 1;
10480 mbmi->ref_mv_idx = 0;
10481 mbmi->pred_mv[0].as_int = 0;
10483 mbmi->motion_mode = SIMPLE_TRANSLATION;
10484 #if CONFIG_MOTION_VAR
10485 av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
10486 #endif
10487 #if CONFIG_WARPED_MOTION
10488 if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) {
10489 int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
10490 mbmi->num_proj_ref[0] = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
10492 #endif
10494 set_default_interp_filters(mbmi, cm->interp_filter);
10496 if (cm->interp_filter != SWITCHABLE) {
10497 best_filter = cm->interp_filter;
10498 } else {
10499 best_filter = EIGHTTAP_REGULAR;
10500 if (av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd) &&
10501 x->source_variance >= cpi->sf.disable_filter_search_var_thresh) {
10502 int rs;
10503 int best_rs = INT_MAX;
10504 for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
10505 #if CONFIG_DUAL_FILTER
10506 int k;
10507 for (k = 0; k < 4; ++k) mbmi->interp_filter[k] = i;
10508 #else
10509 mbmi->interp_filter = i;
10510 #endif // CONFIG_DUAL_FILTER
10511 rs = av1_get_switchable_rate(cpi, xd);
10512 if (rs < best_rs) {
10513 best_rs = rs;
10514 #if CONFIG_DUAL_FILTER
10515 best_filter = mbmi->interp_filter[0];
10516 #else
10517 best_filter = mbmi->interp_filter;
10518 #endif // CONFIG_DUAL_FILTER
10523 // Set the appropriate filter
10524 #if CONFIG_DUAL_FILTER
10525 for (i = 0; i < 4; ++i) mbmi->interp_filter[i] = best_filter;
10526 #else
10527 mbmi->interp_filter = best_filter;
10528 #endif // CONFIG_DUAL_FILTER
10529 rate2 += av1_get_switchable_rate(cpi, xd);
10531 if (cm->reference_mode == REFERENCE_MODE_SELECT)
10532 rate2 += av1_cost_bit(comp_mode_p, comp_pred);
10534 // Estimate the reference frame signaling cost and add it
10535 // to the rolling cost variable.
10536 rate2 += ref_costs_single[LAST_FRAME];
10537 this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
10539 rd_cost->rate = rate2;
10540 rd_cost->dist = distortion2;
10541 rd_cost->rdcost = this_rd;
10542 #if CONFIG_DAALA_DIST && CONFIG_CB4X4
10543 if (bsize < BLOCK_8X8) rd_cost->dist_y = distortion2;
10544 #endif
10545 if (this_rd >= best_rd_so_far) {
10546 rd_cost->rate = INT_MAX;
10547 rd_cost->rdcost = INT64_MAX;
10548 return;
10551 #if CONFIG_DUAL_FILTER
10552 assert((cm->interp_filter == SWITCHABLE) ||
10553 (cm->interp_filter == mbmi->interp_filter[0]));
10554 #else
10555 assert((cm->interp_filter == SWITCHABLE) ||
10556 (cm->interp_filter == mbmi->interp_filter));
10557 #endif // CONFIG_DUAL_FILTER
10559 av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
10560 cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
10562 av1_zero(best_pred_diff);
10564 store_coding_context(x, ctx, THR_ZEROMV, best_pred_diff, 0);
10567 #if CONFIG_MOTION_VAR
10568 // This function has a structure similar to av1_build_obmc_inter_prediction
10570 // The OBMC predictor is computed as:
10572 // PObmc(x,y) =
10573 // AOM_BLEND_A64(Mh(x),
10574 // AOM_BLEND_A64(Mv(y), P(x,y), PAbove(x,y)),
10575 // PLeft(x, y))
10577 // Scaling up by AOM_BLEND_A64_MAX_ALPHA ** 2 and omitting the intermediate
10578 // rounding, this can be written as:
10580 // AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * Pobmc(x,y) =
10581 // Mh(x) * Mv(y) * P(x,y) +
10582 // Mh(x) * Cv(y) * Pabove(x,y) +
10583 // AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
10585 // Where :
10587 // Cv(y) = AOM_BLEND_A64_MAX_ALPHA - Mv(y)
10588 //  Ch(x) = AOM_BLEND_A64_MAX_ALPHA - Mh(x)
10590 // This function computes 'wsrc' and 'mask' as:
10592 // wsrc(x, y) =
10593 // AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * src(x, y) -
10594 //      Mh(x) * Cv(y) * Pabove(x,y) -
10595 // AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
10597 // mask(x, y) = Mh(x) * Mv(y)
10599 // These can then be used to efficiently approximate the error for any
10600 // predictor P in the context of the provided neighbouring predictors by
10601 // computing:
10603 // error(x, y) =
10604 // wsrc(x, y) - mask(x, y) * P(x, y) / (AOM_BLEND_A64_MAX_ALPHA ** 2)
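//
// A minimal illustrative sketch (not part of the encoder; 'pred',
// 'pred_stride' and 'acc' are hypothetical names) of how 'wsrc' and 'mask'
// would be combined with a candidate predictor to approximate that error:
//
//   int64_t acc = 0;
//   for (int r = 0; r < bh; ++r) {
//     for (int c = 0; c < bw; ++c) {
//       // diff is scaled by AOM_BLEND_A64_MAX_ALPHA ** 2
//       const int64_t diff =
//           (int64_t)wsrc[r * bw + c] -
//           (int64_t)mask[r * bw + c] * pred[r * pred_stride + c];
//       acc += diff * diff;
//     }
//   }
//
// Up to rounding and rescaling, this mirrors what the OBMC variance kernels
// compute during the OBMC motion search (the OBMC SAD kernels accumulate the
// absolute value of diff instead of its square).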
10606 static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
10607 const MACROBLOCKD *xd, int mi_row,
10608 int mi_col, const uint8_t *above,
10609 int above_stride, const uint8_t *left,
10610 int left_stride) {
10611 const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
10612 int row, col, i;
10613 const int bw = xd->n8_w << MI_SIZE_LOG2;
10614 const int bh = xd->n8_h << MI_SIZE_LOG2;
10615 int32_t *mask_buf = x->mask_buf;
10616 int32_t *wsrc_buf = x->wsrc_buf;
10617 const int wsrc_stride = bw;
10618 const int mask_stride = bw;
10619 const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
10620 #if CONFIG_HIGHBITDEPTH
10621 const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
10622 #else
10623 const int is_hbd = 0;
10624 #endif // CONFIG_HIGHBITDEPTH
10626 // plane 0 should not be subsampled
10627 assert(xd->plane[0].subsampling_x == 0);
10628 assert(xd->plane[0].subsampling_y == 0);
10630 av1_zero_array(wsrc_buf, bw * bh);
10631 for (i = 0; i < bw * bh; ++i) mask_buf[i] = AOM_BLEND_A64_MAX_ALPHA;
10633 // handle above row
10634 if (xd->up_available) {
10635 const int overlap = num_4x4_blocks_high_lookup[bsize] * 2;
10636 const int miw = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
10637 const int mi_row_offset = -1;
10638 const uint8_t *const mask1d = av1_get_obmc_mask(overlap);
10639 const int neighbor_limit = max_neighbor_obmc[b_width_log2_lookup[bsize]];
10640 int neighbor_count = 0;
10642 assert(miw > 0);
10644 i = 0;
10645 do { // for each mi in the above row
10646 const int mi_col_offset = i;
10647 const MB_MODE_INFO *above_mbmi =
10648 &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
10649 #if CONFIG_CHROMA_SUB8X8
10650 if (above_mbmi->sb_type < BLOCK_8X8)
10651 above_mbmi =
10652 &xd->mi[mi_col_offset + 1 + mi_row_offset * xd->mi_stride]->mbmi;
10653 #endif
10654 const BLOCK_SIZE a_bsize = AOMMAX(above_mbmi->sb_type, BLOCK_8X8);
10655 const int mi_step = AOMMIN(xd->n8_w, mi_size_wide[a_bsize]);
10656 const int neighbor_bw = mi_step * MI_SIZE;
10658 if (is_neighbor_overlappable(above_mbmi)) {
10659 if (!CONFIG_CB4X4 && (a_bsize == BLOCK_4X4 || a_bsize == BLOCK_4X8))
10660 neighbor_count += 2;
10661 else
10662 neighbor_count++;
10663 if (neighbor_count > neighbor_limit) break;
10665 const int tmp_stride = above_stride;
10666 int32_t *wsrc = wsrc_buf + (i * MI_SIZE);
10667 int32_t *mask = mask_buf + (i * MI_SIZE);
10669 if (!is_hbd) {
10670 const uint8_t *tmp = above;
10672 for (row = 0; row < overlap; ++row) {
10673 const uint8_t m0 = mask1d[row];
10674 const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
10675 for (col = 0; col < neighbor_bw; ++col) {
10676 wsrc[col] = m1 * tmp[col];
10677 mask[col] = m0;
10679 wsrc += wsrc_stride;
10680 mask += mask_stride;
10681 tmp += tmp_stride;
10683 #if CONFIG_HIGHBITDEPTH
10684 } else {
10685 const uint16_t *tmp = CONVERT_TO_SHORTPTR(above);
10687 for (row = 0; row < overlap; ++row) {
10688 const uint8_t m0 = mask1d[row];
10689 const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
10690 for (col = 0; col < neighbor_bw; ++col) {
10691 wsrc[col] = m1 * tmp[col];
10692 mask[col] = m0;
10694 wsrc += wsrc_stride;
10695 mask += mask_stride;
10696 tmp += tmp_stride;
10698 #endif // CONFIG_HIGHBITDEPTH
10702 above += neighbor_bw;
10703 i += mi_step;
10704 } while (i < miw);
10707 for (i = 0; i < bw * bh; ++i) {
10708 wsrc_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
10709 mask_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
10712 // handle left column
10713 if (xd->left_available) {
10714 const int overlap = num_4x4_blocks_wide_lookup[bsize] * 2;
10715 const int mih = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
10716 const int mi_col_offset = -1;
10717 const uint8_t *const mask1d = av1_get_obmc_mask(overlap);
10718 const int neighbor_limit = max_neighbor_obmc[b_height_log2_lookup[bsize]];
10719 int neighbor_count = 0;
10721 assert(mih > 0);
10723 i = 0;
10724 do { // for each mi in the left column
10725 const int mi_row_offset = i;
10726 MB_MODE_INFO *left_mbmi =
10727 &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
10729 #if CONFIG_CHROMA_SUB8X8
10730 if (left_mbmi->sb_type < BLOCK_8X8)
10731 left_mbmi =
10732 &xd->mi[mi_col_offset + (mi_row_offset + 1) * xd->mi_stride]->mbmi;
10733 #endif
10734 const BLOCK_SIZE l_bsize = AOMMAX(left_mbmi->sb_type, BLOCK_8X8);
10735 const int mi_step = AOMMIN(xd->n8_h, mi_size_high[l_bsize]);
10736 const int neighbor_bh = mi_step * MI_SIZE;
10738 if (is_neighbor_overlappable(left_mbmi)) {
10739 if (!CONFIG_CB4X4 && (l_bsize == BLOCK_4X4 || l_bsize == BLOCK_8X4))
10740 neighbor_count += 2;
10741 else
10742 neighbor_count++;
10743 if (neighbor_count > neighbor_limit) break;
10745 const int tmp_stride = left_stride;
10746 int32_t *wsrc = wsrc_buf + (i * MI_SIZE * wsrc_stride);
10747 int32_t *mask = mask_buf + (i * MI_SIZE * mask_stride);
10749 if (!is_hbd) {
10750 const uint8_t *tmp = left;
10752 for (row = 0; row < neighbor_bh; ++row) {
10753 for (col = 0; col < overlap; ++col) {
10754 const uint8_t m0 = mask1d[col];
10755 const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
10756 wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
10757 (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
10758 mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
10760 wsrc += wsrc_stride;
10761 mask += mask_stride;
10762 tmp += tmp_stride;
10764 #if CONFIG_HIGHBITDEPTH
10765 } else {
10766 const uint16_t *tmp = CONVERT_TO_SHORTPTR(left);
10768 for (row = 0; row < neighbor_bh; ++row) {
10769 for (col = 0; col < overlap; ++col) {
10770 const uint8_t m0 = mask1d[col];
10771 const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
10772 wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
10773 (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
10774 mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
10776 wsrc += wsrc_stride;
10777 mask += mask_stride;
10778 tmp += tmp_stride;
10780 #endif // CONFIG_HIGHBITDEPTH
10784 left += neighbor_bh * left_stride;
10785 i += mi_step;
10786 } while (i < mih);
10789 if (!is_hbd) {
10790 const uint8_t *src = x->plane[0].src.buf;
10792 for (row = 0; row < bh; ++row) {
10793 for (col = 0; col < bw; ++col) {
10794 wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
10796 wsrc_buf += wsrc_stride;
10797 src += x->plane[0].src.stride;
10799 #if CONFIG_HIGHBITDEPTH
10800 } else {
10801 const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
10803 for (row = 0; row < bh; ++row) {
10804 for (col = 0; col < bw; ++col) {
10805 wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
10807 wsrc_buf += wsrc_stride;
10808 src += x->plane[0].src.stride;
10810 #endif // CONFIG_HIGHBITDEPTH
10814 #if CONFIG_NCOBMC
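// Compare the RD cost of the selected causal prediction (SIMPLE_TRANSLATION)
// against the non-causal OBMC prediction for the already-coded block, and
// keep whichever is cheaper, updating *mbmi and x->skip accordingly.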
10815 void av1_check_ncobmc_rd(const struct AV1_COMP *cpi, struct macroblock *x,
10816 int mi_row, int mi_col) {
10817 const AV1_COMMON *const cm = &cpi->common;
10818 MACROBLOCKD *const xd = &x->e_mbd;
10819 MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
10820 MB_MODE_INFO backup_mbmi;
10821 BLOCK_SIZE bsize = mbmi->sb_type;
10822 int ref, skip_blk, backup_skip = x->skip;
10823 int64_t rd_causal;
10824 RD_STATS rd_stats_y, rd_stats_uv;
10825 int rate_skip0 = av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
10826 int rate_skip1 = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
10828 // Recompute the best causal predictor and rd
10829 mbmi->motion_mode = SIMPLE_TRANSLATION;
10830 set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
10831 for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
10832 YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]);
10833 assert(cfg != NULL);
10834 av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
10835 &xd->block_refs[ref]->sf);
10837 av1_setup_dst_planes(x->e_mbd.plane, bsize,
10838 get_frame_new_buffer(&cpi->common), mi_row, mi_col);
10840 av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
10842 av1_subtract_plane(x, bsize, 0);
10843 super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
10844 super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
10845 assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX);
10846 if (rd_stats_y.skip && rd_stats_uv.skip) {
10847 rd_stats_y.rate = rate_skip1;
10848 rd_stats_uv.rate = 0;
10849 rd_stats_y.dist = rd_stats_y.sse;
10850 rd_stats_uv.dist = rd_stats_uv.sse;
10851 skip_blk = 0;
10852 } else if (RDCOST(x->rdmult, x->rddiv,
10853 (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0),
10854 (rd_stats_y.dist + rd_stats_uv.dist)) >
10855 RDCOST(x->rdmult, x->rddiv, rate_skip1,
10856 (rd_stats_y.sse + rd_stats_uv.sse))) {
10857 rd_stats_y.rate = rate_skip1;
10858 rd_stats_uv.rate = 0;
10859 rd_stats_y.dist = rd_stats_y.sse;
10860 rd_stats_uv.dist = rd_stats_uv.sse;
10861 skip_blk = 1;
10862 } else {
10863 rd_stats_y.rate += rate_skip0;
10864 skip_blk = 0;
10866 backup_skip = skip_blk;
10867 backup_mbmi = *mbmi;
10868 rd_causal = RDCOST(x->rdmult, x->rddiv, (rd_stats_y.rate + rd_stats_uv.rate),
10869 (rd_stats_y.dist + rd_stats_uv.dist));
10870 rd_causal += RDCOST(x->rdmult, x->rddiv,
10871 av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 0), 0);
10873 // Check non-causal mode
10874 mbmi->motion_mode = OBMC_CAUSAL;
10875 av1_build_ncobmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
10877 av1_subtract_plane(x, bsize, 0);
10878 super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
10879 super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
10880 assert(rd_stats_y.rate != INT_MAX && rd_stats_uv.rate != INT_MAX);
10881 if (rd_stats_y.skip && rd_stats_uv.skip) {
10882 rd_stats_y.rate = rate_skip1;
10883 rd_stats_uv.rate = 0;
10884 rd_stats_y.dist = rd_stats_y.sse;
10885 rd_stats_uv.dist = rd_stats_uv.sse;
10886 skip_blk = 0;
10887 } else if (RDCOST(x->rdmult, x->rddiv,
10888 (rd_stats_y.rate + rd_stats_uv.rate + rate_skip0),
10889 (rd_stats_y.dist + rd_stats_uv.dist)) >
10890 RDCOST(x->rdmult, x->rddiv, rate_skip1,
10891 (rd_stats_y.sse + rd_stats_uv.sse))) {
10892 rd_stats_y.rate = rate_skip1;
10893 rd_stats_uv.rate = 0;
10894 rd_stats_y.dist = rd_stats_y.sse;
10895 rd_stats_uv.dist = rd_stats_uv.sse;
10896 skip_blk = 1;
10897 } else {
10898 rd_stats_y.rate += rate_skip0;
10899 skip_blk = 0;
10902 if (rd_causal >
10903 RDCOST(x->rdmult, x->rddiv,
10904 rd_stats_y.rate + rd_stats_uv.rate +
10905 av1_cost_bit(cm->fc->motion_mode_prob[bsize][0], 1),
10906 (rd_stats_y.dist + rd_stats_uv.dist))) {
10907 x->skip = skip_blk;
10908 } else {
10909 *mbmi = backup_mbmi;
10910 x->skip = backup_skip;
10913 #endif // CONFIG_NCOBMC
10914 #endif // CONFIG_MOTION_VAR