vp9/encoder/vp9_ethread.c
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include "vp9/encoder/vp9_encodeframe.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_ethread.h"
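// Adds the frame counts gathered by one worker thread (td->counts) into the
// frame-level counts in cm->counts once that thread has finished encoding
// its tiles.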
static void accumulate_frame_counts(VP9_COMMON *cm, ThreadData *td) {
  int i, j, k, l, m;

  for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
    for (j = 0; j < INTRA_MODES; j++)
      cm->counts.y_mode[i][j] += td->counts->y_mode[i][j];

  for (i = 0; i < INTRA_MODES; i++)
    for (j = 0; j < INTRA_MODES; j++)
      cm->counts.uv_mode[i][j] += td->counts->uv_mode[i][j];

  for (i = 0; i < PARTITION_CONTEXTS; i++)
    for (j = 0; j < PARTITION_TYPES; j++)
      cm->counts.partition[i][j] += td->counts->partition[i][j];

  for (i = 0; i < TX_SIZES; i++)
    for (j = 0; j < PLANE_TYPES; j++)
      for (k = 0; k < REF_TYPES; k++)
        for (l = 0; l < COEF_BANDS; l++)
          for (m = 0; m < COEFF_CONTEXTS; m++)
            cm->counts.eob_branch[i][j][k][l][m] +=
                td->counts->eob_branch[i][j][k][l][m];
  // cm->counts.coef is only updated at frame level, so there is no need
  // to accumulate it here.
  // for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
  //   cm->counts.coef[i][j][k][l][m][n] +=
  //       td->counts->coef[i][j][k][l][m][n];
  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
    for (j = 0; j < SWITCHABLE_FILTERS; j++)
      cm->counts.switchable_interp[i][j] += td->counts->switchable_interp[i][j];

  for (i = 0; i < INTER_MODE_CONTEXTS; i++)
    for (j = 0; j < INTER_MODES; j++)
      cm->counts.inter_mode[i][j] += td->counts->inter_mode[i][j];

  for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
    for (j = 0; j < 2; j++)
      cm->counts.intra_inter[i][j] += td->counts->intra_inter[i][j];

  for (i = 0; i < COMP_INTER_CONTEXTS; i++)
    for (j = 0; j < 2; j++)
      cm->counts.comp_inter[i][j] += td->counts->comp_inter[i][j];

  for (i = 0; i < REF_CONTEXTS; i++)
    for (j = 0; j < 2; j++)
      for (k = 0; k < 2; k++)
        cm->counts.single_ref[i][j][k] += td->counts->single_ref[i][j][k];

  for (i = 0; i < REF_CONTEXTS; i++)
    for (j = 0; j < 2; j++)
      cm->counts.comp_ref[i][j] += td->counts->comp_ref[i][j];

  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
    for (j = 0; j < TX_SIZES; j++)
      cm->counts.tx.p32x32[i][j] += td->counts->tx.p32x32[i][j];

    for (j = 0; j < TX_SIZES - 1; j++)
      cm->counts.tx.p16x16[i][j] += td->counts->tx.p16x16[i][j];

    for (j = 0; j < TX_SIZES - 2; j++)
      cm->counts.tx.p8x8[i][j] += td->counts->tx.p8x8[i][j];
  }

  for (i = 0; i < SKIP_CONTEXTS; i++)
    for (j = 0; j < 2; j++)
      cm->counts.skip[i][j] += td->counts->skip[i][j];

  for (i = 0; i < MV_JOINTS; i++)
    cm->counts.mv.joints[i] += td->counts->mv.joints[i];

  for (k = 0; k < 2; k++) {
    nmv_component_counts *comps = &cm->counts.mv.comps[k];
    nmv_component_counts *comps_t = &td->counts->mv.comps[k];

    for (i = 0; i < 2; i++) {
      comps->sign[i] += comps_t->sign[i];
      comps->class0_hp[i] += comps_t->class0_hp[i];
      comps->hp[i] += comps_t->hp[i];
    }

    for (i = 0; i < MV_CLASSES; i++)
      comps->classes[i] += comps_t->classes[i];

    for (i = 0; i < CLASS0_SIZE; i++) {
      comps->class0[i] += comps_t->class0[i];
      for (j = 0; j < MV_FP_SIZE; j++)
        comps->class0_fp[i][j] += comps_t->class0_fp[i][j];
    }

    for (i = 0; i < MV_OFFSET_BITS; i++)
      for (j = 0; j < 2; j++)
        comps->bits[i][j] += comps_t->bits[i][j];

    for (i = 0; i < MV_FP_SIZE; i++)
      comps->fp[i] += comps_t->fp[i];
  }
}
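// Adds the rate-distortion counters collected by a worker thread (td_t)
// into the main thread's counters (td).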
static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
  int i, j, k, l, m, n;

  for (i = 0; i < REFERENCE_MODES; i++)
    td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i];

  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
    td->rd_counts.filter_diff[i] += td_t->rd_counts.filter_diff[i];

  for (i = 0; i < TX_MODES; i++)
    td->rd_counts.tx_select_diff[i] += td_t->rd_counts.tx_select_diff[i];

  for (i = 0; i < TX_SIZES; i++)
    for (j = 0; j < PLANE_TYPES; j++)
      for (k = 0; k < REF_TYPES; k++)
        for (l = 0; l < COEF_BANDS; l++)
          for (m = 0; m < COEFF_CONTEXTS; m++)
            for (n = 0; n < ENTROPY_TOKENS; n++)
              td->rd_counts.coef_counts[i][j][k][l][m][n] +=
                  td_t->rd_counts.coef_counts[i][j][k][l][m][n];
}
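// Worker hook: each thread encodes every cpi->num_workers-th tile of the
// frame, starting from its own start index, until all tiles are covered.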
static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
  VP9_COMP *const cpi = thread_data->cpi;
  const VP9_COMMON *const cm = &cpi->common;
  const int tile_cols = 1 << cm->log2_tile_cols;
  const int tile_rows = 1 << cm->log2_tile_rows;
  int t;

  (void) unused;

  for (t = thread_data->start; t < tile_rows * tile_cols;
       t += cpi->num_workers) {
    int tile_row = t / tile_cols;
    int tile_col = t % tile_cols;

    vp9_encode_tile(cpi, thread_data->td, tile_row, tile_col);
  }

  return 0;
}
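// Encodes the tiles of the current frame using multiple threads: one worker
// per tile column, capped at cpi->oxcf.max_threads. The main thread acts as
// the last worker and reuses cpi->td as its thread data.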
void vp9_encode_tiles_mt(VP9_COMP *cpi) {
  VP9_COMMON *const cm = &cpi->common;
  const int tile_cols = 1 << cm->log2_tile_cols;
  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
  const int num_workers = MIN(cpi->oxcf.max_threads, tile_cols);
  int i;

  vp9_init_tile_data(cpi);

  // Only run once to create threads and allocate thread data.
  if (cpi->num_workers == 0) {
    CHECK_MEM_ERROR(cm, cpi->workers,
                    vpx_malloc(num_workers * sizeof(*cpi->workers)));

    CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
                    vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data)));

    for (i = 0; i < num_workers; i++) {
      VP9Worker *const worker = &cpi->workers[i];
      EncWorkerData *thread_data = &cpi->tile_thr_data[i];

      ++cpi->num_workers;
      winterface->init(worker);

      if (i < num_workers - 1) {
        thread_data->cpi = cpi;

        // Allocate thread data.
        CHECK_MEM_ERROR(cm, thread_data->td,
                        vpx_memalign(32, sizeof(*thread_data->td)));
        vp9_zero(*thread_data->td);

        // Set up pc_tree.
        thread_data->td->leaf_tree = NULL;
        thread_data->td->pc_tree = NULL;
        vp9_setup_pc_tree(cm, thread_data->td);

        // Allocate frame counters in thread data.
        CHECK_MEM_ERROR(cm, thread_data->td->counts,
                        vpx_calloc(1, sizeof(*thread_data->td->counts)));

        // Create threads
        if (!winterface->reset(worker))
          vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                             "Tile encoder thread creation failed");
      } else {
        // Main thread acts as a worker and uses the thread data in cpi.
        thread_data->cpi = cpi;
        thread_data->td = &cpi->td;
      }

      winterface->sync(worker);
    }
  }

  for (i = 0; i < num_workers; i++) {
    VP9Worker *const worker = &cpi->workers[i];
    EncWorkerData *thread_data;

    worker->hook = (VP9WorkerHook)enc_worker_hook;
    worker->data1 = &cpi->tile_thr_data[i];
    worker->data2 = NULL;
    thread_data = (EncWorkerData*)worker->data1;

    // Before encoding a frame, copy the thread data from cpi.
    thread_data->td->mb = cpi->td.mb;
    thread_data->td->rd_counts = cpi->td.rd_counts;
    vpx_memcpy(thread_data->td->counts, &cpi->common.counts,
               sizeof(cpi->common.counts));

    // Handle use_nonrd_pick_mode case.
    if (cpi->sf.use_nonrd_pick_mode) {
      MACROBLOCK *const x = &thread_data->td->mb;
      MACROBLOCKD *const xd = &x->e_mbd;
      struct macroblock_plane *const p = x->plane;
      struct macroblockd_plane *const pd = xd->plane;
      PICK_MODE_CONTEXT *ctx = &thread_data->td->pc_root->none;
      int j;

      for (j = 0; j < MAX_MB_PLANE; ++j) {
        p[j].coeff = ctx->coeff_pbuf[j][0];
        p[j].qcoeff = ctx->qcoeff_pbuf[j][0];
        pd[j].dqcoeff = ctx->dqcoeff_pbuf[j][0];
        p[j].eobs = ctx->eobs_pbuf[j][0];
      }
    }
  }

  // Encode a frame
  for (i = 0; i < num_workers; i++) {
    VP9Worker *const worker = &cpi->workers[i];
    EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;

    // Set the starting tile for each thread.
    thread_data->start = i;

    if (i == num_workers - 1)
      winterface->execute(worker);
    else
      winterface->launch(worker);
  }

  // Encoding ends.
  for (i = 0; i < num_workers; i++) {
    VP9Worker *const worker = &cpi->workers[i];
    winterface->sync(worker);
  }
  for (i = 0; i < num_workers; i++) {
    VP9Worker *const worker = &cpi->workers[i];
    EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;

    // Accumulate counters.
    if (i < num_workers - 1) {
      accumulate_frame_counts(&cpi->common, thread_data->td);
      accumulate_rd_opt(&cpi->td, thread_data->td);
    }
  }
}