2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
11 #include "./vpx_config.h"
12 #include "vpx_mem/vpx_mem.h"
13 #include "vp10/common/entropymode.h"
14 #include "vp10/common/thread_common.h"
15 #include "vp10/common/reconinter.h"
16 #include "vp10/common/loopfilter.h"
18 #if CONFIG_MULTITHREAD
// Acquires `mutex`. The visible code calls pthread_mutex_trylock() up to
// kMaxTryLocks (4000) times and also contains a blocking pthread_mutex_lock()
// call.
// NOTE(review): the statements between the trylock test and the blocking call
// (and the declaration of `i`) are not present in this listing, so the
// condition under which the blocking call runs is not visible; presumably it
// is the fallback when no try succeeds -- confirm against the complete source.
19 static INLINE
void mutex_lock(pthread_mutex_t
*const mutex
) {
20 const int kMaxTryLocks
= 4000;
24 for (i
= 0; i
< kMaxTryLocks
; ++i
) {
// pthread_mutex_trylock() returns 0 when the lock was acquired.
25 if (!pthread_mutex_trylock(mutex
)) {
32 pthread_mutex_lock(mutex
);
34 #endif // CONFIG_MULTITHREAD
// Makes the caller working on superblock row `r`, column `c` wait for the
// previous row (r - 1) to have progressed far enough.
// The check only happens when r is nonzero and c has none of the bits in
// (nsync - 1) set, where nsync is lf_sync->sync_range. In that case the code
// waits on cond_[r - 1] while c > cur_sb_col[r - 1] - nsync and then unlocks
// mutex_[r - 1].
// When CONFIG_MULTITHREAD is off none of the code shown here is compiled.
// NOTE(review): pthread_cond_wait() requires the mutex to be held by the
// caller, but no lock acquisition is visible in this listing; presumably it is
// on an omitted line -- confirm against the complete source.
36 static INLINE
void sync_read(VP9LfSync
*const lf_sync
, int r
, int c
) {
37 #if CONFIG_MULTITHREAD
38 const int nsync
= lf_sync
->sync_range
;
// `c & (nsync - 1)` is a multiple-of-nsync test only if nsync is a power of
// two -- TODO confirm that sync_range is always a power of two.
40 if (r
&& !(c
& (nsync
- 1))) {
41 pthread_mutex_t
*const mutex
= &lf_sync
->mutex_
[r
- 1];
// Loop rather than a single wait: the predicate is re-checked after every
// wake-up.
44 while (c
> lf_sync
->cur_sb_col
[r
- 1] - nsync
) {
45 pthread_cond_wait(&lf_sync
->cond_
[r
- 1], mutex
);
47 pthread_mutex_unlock(mutex
);
53 #endif // CONFIG_MULTITHREAD
// Publishes the progress of superblock row `r`: while holding mutex_[r] it
// stores `cur` into cur_sb_col[r] and signals cond_[r].
// On the one assignment visible here `cur` is sb_cols + nsync.
// NOTE(review): the end of the parameter list (the declaration of sb_cols),
// the declaration of `cur`, the value assigned to `cur` when c < sb_cols - 1,
// and any condition guarding the signal are not present in this listing --
// confirm against the complete source.
56 static INLINE
void sync_write(VP9LfSync
*const lf_sync
, int r
, int c
,
58 #if CONFIG_MULTITHREAD
59 const int nsync
= lf_sync
->sync_range
;
61 // Only signal when there are enough filtered SB for next row to run.
64 if (c
< sb_cols
- 1) {
// Presumably the branch taken for the last column: sb_cols + nsync is large
// enough that the wait condition in sync_read() (c > cur_sb_col - nsync)
// cannot hold for any c below sb_cols -- confirm.
69 cur
= sb_cols
+ nsync
;
73 mutex_lock(&lf_sync
->mutex_
[r
]);
75 lf_sync
->cur_sb_col
[r
] = cur
;
77 pthread_cond_signal(&lf_sync
->cond_
[r
]);
78 pthread_mutex_unlock(&lf_sync
->mutex_
[r
]);
85 #endif // CONFIG_MULTITHREAD
88 // Implement row loopfiltering for each thread.
// Runs the loop filter over the superblock rows assigned to one worker:
// mi_row starts at `start` and advances by num_workers * MI_BLOCK_SIZE while
// it is below `stop`. For every superblock column the visible code waits via
// sync_read(), sets up the destination planes and the filter mask, filters
// plane 0, handles planes 1..num_planes-1, and reports progress via
// sync_write().
// y_only: when nonzero only plane 0 is processed (num_planes is 1).
// NOTE(review): this listing omits several lines (local declarations, the
// statements selected by the subsampling checks, the remaining arguments of
// vp10_setup_mask / vp10_filter_block_plane_non420, and the selection between
// the ss11 / ss00 / non420 calls inside the plane loop) -- confirm against the
// complete source before relying on the details.
90 void thread_loop_filter_rows(const YV12_BUFFER_CONFIG
*const frame_buffer
,
91 VP10_COMMON
*const cm
,
92 struct macroblockd_plane planes
[MAX_MB_PLANE
],
93 int start
, int stop
, int y_only
,
94 VP9LfSync
*const lf_sync
) {
95 const int num_planes
= y_only
? 1 : MAX_MB_PLANE
;
// Frame width in superblock columns.
96 const int sb_cols
= mi_cols_aligned_to_sb(cm
->mi_cols
) >> MI_BLOCK_SIZE_LOG2
;
// The checks below look at plane 1's subsampling (1/1 and 0/0); what they
// select is not visible here.
101 else if (planes
[1].subsampling_y
== 1 && planes
[1].subsampling_x
== 1)
103 else if (planes
[1].subsampling_y
== 0 && planes
[1].subsampling_x
== 0)
108 for (mi_row
= start
; mi_row
< stop
;
109 mi_row
+= lf_sync
->num_workers
* MI_BLOCK_SIZE
) {
110 MODE_INFO
**const mi
= cm
->mi_grid_visible
+ mi_row
* cm
->mi_stride
;
112 for (mi_col
= 0; mi_col
< cm
->mi_cols
; mi_col
+= MI_BLOCK_SIZE
) {
// r / c: superblock row and column indices used for synchronization.
113 const int r
= mi_row
>> MI_BLOCK_SIZE_LOG2
;
114 const int c
= mi_col
>> MI_BLOCK_SIZE_LOG2
;
115 LOOP_FILTER_MASK lfm
;
// Wait until row r - 1 has progressed far enough (see sync_read()).
118 sync_read(lf_sync
, r
, c
);
120 vp10_setup_dst_planes(planes
, frame_buffer
, mi_row
, mi_col
);
122 // TODO(JBB): Make setup_mask work for non 420.
123 vp10_setup_mask(cm
, mi_row
, mi_col
, mi
+ mi_col
, cm
->mi_stride
,
126 vp10_filter_block_plane_ss00(cm
, &planes
[0], mi_row
, &lfm
);
127 for (plane
= 1; plane
< num_planes
; ++plane
) {
130 vp10_filter_block_plane_ss11(cm
, &planes
[plane
], mi_row
, &lfm
);
133 vp10_filter_block_plane_ss00(cm
, &planes
[plane
], mi_row
, &lfm
);
136 vp10_filter_block_plane_non420(cm
, &planes
[plane
], mi
+ mi_col
,
// Report that column c of row r is done so the next row may proceed.
142 sync_write(lf_sync
, r
, c
, sb_cols
);
147 // Row-based multi-threaded loopfilter hook
// Forwards the fields stored in lf_data (frame_buffer, cm, planes, start,
// stop, y_only) to thread_loop_filter_rows(). Elsewhere in this file it is
// installed as the `hook` of each loop-filter worker.
// NOTE(review): the final argument of the call and the return statement are
// not present in this listing -- confirm against the complete source.
148 static int loop_filter_row_worker(VP9LfSync
*const lf_sync
,
149 LFWorkerData
*const lf_data
) {
150 thread_loop_filter_rows(lf_data
->frame_buffer
, lf_data
->cm
, lf_data
->planes
,
151 lf_data
->start
, lf_data
->stop
, lf_data
->y_only
,
// Distributes loop filtering of the mi rows from `start` up to `stop` across
// worker threads.
// Uses MIN(nworkers, tile_cols) workers; worker i begins at
// start + i * MI_BLOCK_SIZE and every worker shares the same `stop` and
// `y_only`. lf_sync is (re)allocated when it has no sync_range yet, when its
// row count differs from sb_rows, or when it was sized for fewer workers.
// After starting the workers the function waits for all of them via sync().
// NOTE(review): `cm` is used below but its parameter declaration, the
// declaration of `i`, and the branch keyword between execute() and launch()
// are not present in this listing -- confirm against the complete source.
156 static void loop_filter_rows_mt(YV12_BUFFER_CONFIG
*frame
,
158 struct macroblockd_plane planes
[MAX_MB_PLANE
],
159 int start
, int stop
, int y_only
,
160 VPxWorker
*workers
, int nworkers
,
161 VP9LfSync
*lf_sync
) {
162 const VPxWorkerInterface
*const winterface
= vpx_get_worker_interface();
163 // Number of superblock rows and cols
// Note: the column-alignment helper is applied to mi_rows here.
164 const int sb_rows
= mi_cols_aligned_to_sb(cm
->mi_rows
) >> MI_BLOCK_SIZE_LOG2
;
165 // Decoder may allocate more threads than number of tiles based on user's
167 const int tile_cols
= 1 << cm
->log2_tile_cols
;
168 const int num_workers
= MIN(nworkers
, tile_cols
);
171 if (!lf_sync
->sync_range
|| sb_rows
!= lf_sync
->rows
||
172 num_workers
> lf_sync
->num_workers
) {
173 vp10_loop_filter_dealloc(lf_sync
);
174 vp10_loop_filter_alloc(lf_sync
, cm
, sb_rows
, cm
->width
, num_workers
);
177 // Initialize cur_sb_col to -1 for all SB rows.
// memset writes the byte 0xFF everywhere; presumably cur_sb_col holds ints so
// each element reads back as -1 -- confirm the element type.
178 memset(lf_sync
->cur_sb_col
, -1, sizeof(*lf_sync
->cur_sb_col
) * sb_rows
);
180 // Set up loopfilter thread data.
181 // The decoder is capping num_workers because it has been observed that using
182 // more threads on the loopfilter than there are cores will hurt performance
183 // on Android. This is because the system will only schedule the tile decode
184 // workers on cores equal to the number of tile columns. Then if the decoder
185 // tries to use more threads for the loopfilter, it will hurt performance
186 // because of contention. If the multithreading code changes in the future
187 // then the number of workers used by the loopfilter should be revisited.
188 for (i
= 0; i
< num_workers
; ++i
) {
189 VPxWorker
*const worker
= &workers
[i
];
190 LFWorkerData
*const lf_data
= &lf_sync
->lfdata
[i
];
// data1 / data2 are presumably handed to the hook as its two arguments
// (lf_sync, lf_data) -- confirm against the VPxWorker documentation.
192 worker
->hook
= (VPxWorkerHook
)loop_filter_row_worker
;
193 worker
->data1
= lf_sync
;
194 worker
->data2
= lf_data
;
197 vp10_loop_filter_data_reset(lf_data
, frame
, cm
, planes
);
// Worker i starts i superblock rows below `start`.
198 lf_data
->start
= start
+ i
* MI_BLOCK_SIZE
;
199 lf_data
->stop
= stop
;
200 lf_data
->y_only
= y_only
;
202 // Start loopfiltering
// The last worker is started with execute(), the call after it is launch();
// presumably execute() runs the hook on the calling thread while launch()
// starts it asynchronously -- confirm against VPxWorkerInterface.
203 if (i
== num_workers
- 1) {
204 winterface
->execute(worker
);
206 winterface
->launch(worker
);
210 // Wait till all rows are finished
211 for (i
= 0; i
< num_workers
; ++i
) {
212 winterface
->sync(&workers
[i
]);
// Entry point for multi-threaded loop filtering of one frame.
// Returns without doing anything when frame_filter_level is 0. Otherwise it
// calls vp10_loop_filter_frame_init() with that level and passes the rows
// from start_mi_row up to end_mi_row to loop_filter_rows_mt().
// partial_frame: when nonzero and the frame has more than 8 mi rows,
// start_mi_row becomes half of mi_rows rounded down to a multiple of 8 and
// mi_rows_to_filter becomes MAX(mi_rows / 8, 8); otherwise mi_rows_to_filter
// stays at cm->mi_rows.
// NOTE(review): the `cm` parameter and the initial value of start_mi_row are
// not present in this listing (presumably 0) -- confirm against the complete
// source.
216 void vp10_loop_filter_frame_mt(YV12_BUFFER_CONFIG
*frame
,
218 struct macroblockd_plane planes
[MAX_MB_PLANE
],
219 int frame_filter_level
,
220 int y_only
, int partial_frame
,
221 VPxWorker
*workers
, int num_workers
,
222 VP9LfSync
*lf_sync
) {
223 int start_mi_row
, end_mi_row
, mi_rows_to_filter
;
225 if (!frame_filter_level
) return;
228 mi_rows_to_filter
= cm
->mi_rows
;
229 if (partial_frame
&& cm
->mi_rows
> 8) {
230 start_mi_row
= cm
->mi_rows
>> 1;
// Clear the low three bits: round down to a multiple of 8.
231 start_mi_row
&= 0xfffffff8;
232 mi_rows_to_filter
= MAX(cm
->mi_rows
/ 8, 8);
234 end_mi_row
= start_mi_row
+ mi_rows_to_filter
;
235 vp10_loop_filter_frame_init(cm
, frame_filter_level
);
237 loop_filter_rows_mt(frame
, cm
, planes
, start_mi_row
, end_mi_row
,
238 y_only
, workers
, num_workers
, lf_sync
);
241 // Set up nsync by width.
// Chooses the sync range (nsync) from the frame width through a chain of
// width thresholds; the comparisons against 1280 and 4096 are visible.
// NOTE(review): the first comparison and every returned value are missing
// from this listing -- take them from the complete source.
242 static INLINE
int get_sync_range(int width
) {
243 // nsync numbers are picked by testing. For example, for 4k
244 // video, using 4 gives best performance.
247 else if (width
<= 1280)
249 else if (width
<= 4096)
255 // Allocate memory for lf row synchronization
// Allocates the row-synchronization state held in lf_sync:
//   - `rows` mutexes and `rows` condition variables, each initialized with
//     default attributes (only when CONFIG_MULTITHREAD is enabled),
//   - `num_workers` lfdata entries,
//   - `rows` cur_sb_col entries.
// It also records rows and num_workers in lf_sync and sets sync_range from
// get_sync_range(width).
// Every allocation goes through CHECK_MEM_ERROR(cm, ...).
// NOTE(review): that macro's failure behavior is defined elsewhere -- confirm
// how an allocation failure is reported through cm. The declaration of `i` is
// not present in this listing.
256 void vp10_loop_filter_alloc(VP9LfSync
*lf_sync
, VP10_COMMON
*cm
, int rows
,
257 int width
, int num_workers
) {
258 lf_sync
->rows
= rows
;
259 #if CONFIG_MULTITHREAD
263 CHECK_MEM_ERROR(cm
, lf_sync
->mutex_
,
264 vpx_malloc(sizeof(*lf_sync
->mutex_
) * rows
));
265 if (lf_sync
->mutex_
) {
266 for (i
= 0; i
< rows
; ++i
) {
267 pthread_mutex_init(&lf_sync
->mutex_
[i
], NULL
);
271 CHECK_MEM_ERROR(cm
, lf_sync
->cond_
,
272 vpx_malloc(sizeof(*lf_sync
->cond_
) * rows
));
273 if (lf_sync
->cond_
) {
274 for (i
= 0; i
< rows
; ++i
) {
275 pthread_cond_init(&lf_sync
->cond_
[i
], NULL
);
279 #endif // CONFIG_MULTITHREAD
281 CHECK_MEM_ERROR(cm
, lf_sync
->lfdata
,
282 vpx_malloc(num_workers
* sizeof(*lf_sync
->lfdata
)));
283 lf_sync
->num_workers
= num_workers
;
285 CHECK_MEM_ERROR(cm
, lf_sync
->cur_sb_col
,
286 vpx_malloc(sizeof(*lf_sync
->cur_sb_col
) * rows
));
289 lf_sync
->sync_range
= get_sync_range(width
);
292 // Deallocate lf synchronization related mutex and data
// Releases everything vp10_loop_filter_alloc() set up. Does nothing when
// lf_sync is NULL.
// With CONFIG_MULTITHREAD enabled it destroys the lf_sync->rows mutexes and
// condition variables (when the respective array is non-NULL) and frees both
// arrays; it then frees lfdata and cur_sb_col.
// NOTE(review): the trailing comment refers to clearing the structure, but the
// statement that does so (and the declaration of `i`) is not present in this
// listing -- confirm against the complete source.
293 void vp10_loop_filter_dealloc(VP9LfSync
*lf_sync
) {
294 if (lf_sync
!= NULL
) {
295 #if CONFIG_MULTITHREAD
298 if (lf_sync
->mutex_
!= NULL
) {
299 for (i
= 0; i
< lf_sync
->rows
; ++i
) {
300 pthread_mutex_destroy(&lf_sync
->mutex_
[i
]);
302 vpx_free(lf_sync
->mutex_
);
304 if (lf_sync
->cond_
!= NULL
) {
305 for (i
= 0; i
< lf_sync
->rows
; ++i
) {
306 pthread_cond_destroy(&lf_sync
->cond_
[i
]);
308 vpx_free(lf_sync
->cond_
);
310 #endif // CONFIG_MULTITHREAD
311 vpx_free(lf_sync
->lfdata
);
312 vpx_free(lf_sync
->cur_sb_col
);
313 // clear the structure as the source of this call may be a resize in which
314 // case this call will be followed by an _alloc() which may fail.
319 // Accumulate frame counts.
320 void vp10_accumulate_frame_counts(VP10_COMMON
*cm
, FRAME_COUNTS
*counts
,
324 for (i
= 0; i
< BLOCK_SIZE_GROUPS
; i
++)
325 for (j
= 0; j
< INTRA_MODES
; j
++)
326 cm
->counts
.y_mode
[i
][j
] += counts
->y_mode
[i
][j
];
328 for (i
= 0; i
< INTRA_MODES
; i
++)
329 for (j
= 0; j
< INTRA_MODES
; j
++)
330 cm
->counts
.uv_mode
[i
][j
] += counts
->uv_mode
[i
][j
];
332 for (i
= 0; i
< PARTITION_CONTEXTS
; i
++)
333 for (j
= 0; j
< PARTITION_TYPES
; j
++)
334 cm
->counts
.partition
[i
][j
] += counts
->partition
[i
][j
];
338 for (i
= 0; i
< TX_SIZES
; i
++)
339 for (j
= 0; j
< PLANE_TYPES
; j
++)
340 for (k
= 0; k
< REF_TYPES
; k
++)
341 for (l
= 0; l
< COEF_BANDS
; l
++)
342 for (m
= 0; m
< COEFF_CONTEXTS
; m
++) {
343 cm
->counts
.eob_branch
[i
][j
][k
][l
][m
] +=
344 counts
->eob_branch
[i
][j
][k
][l
][m
];
345 for (n
= 0; n
< UNCONSTRAINED_NODES
+ 1; n
++)
346 cm
->counts
.coef
[i
][j
][k
][l
][m
][n
] +=
347 counts
->coef
[i
][j
][k
][l
][m
][n
];
350 for (i
= 0; i
< TX_SIZES
; i
++)
351 for (j
= 0; j
< PLANE_TYPES
; j
++)
352 for (k
= 0; k
< REF_TYPES
; k
++)
353 for (l
= 0; l
< COEF_BANDS
; l
++)
354 for (m
= 0; m
< COEFF_CONTEXTS
; m
++)
355 cm
->counts
.eob_branch
[i
][j
][k
][l
][m
] +=
356 counts
->eob_branch
[i
][j
][k
][l
][m
];
357 // In the encoder, cm->counts.coef is only updated at frame
358 // level, so there is no need to accumulate it here.
359 // for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
360 // cm->counts.coef[i][j][k][l][m][n] +=
361 // counts->coef[i][j][k][l][m][n];
364 for (i
= 0; i
< SWITCHABLE_FILTER_CONTEXTS
; i
++)
365 for (j
= 0; j
< SWITCHABLE_FILTERS
; j
++)
366 cm
->counts
.switchable_interp
[i
][j
] += counts
->switchable_interp
[i
][j
];
368 for (i
= 0; i
< INTER_MODE_CONTEXTS
; i
++)
369 for (j
= 0; j
< INTER_MODES
; j
++)
370 cm
->counts
.inter_mode
[i
][j
] += counts
->inter_mode
[i
][j
];
372 for (i
= 0; i
< INTRA_INTER_CONTEXTS
; i
++)
373 for (j
= 0; j
< 2; j
++)
374 cm
->counts
.intra_inter
[i
][j
] += counts
->intra_inter
[i
][j
];
376 for (i
= 0; i
< COMP_INTER_CONTEXTS
; i
++)
377 for (j
= 0; j
< 2; j
++)
378 cm
->counts
.comp_inter
[i
][j
] += counts
->comp_inter
[i
][j
];
380 for (i
= 0; i
< REF_CONTEXTS
; i
++)
381 for (j
= 0; j
< 2; j
++)
382 for (k
= 0; k
< 2; k
++)
383 cm
->counts
.single_ref
[i
][j
][k
] += counts
->single_ref
[i
][j
][k
];
385 for (i
= 0; i
< REF_CONTEXTS
; i
++)
386 for (j
= 0; j
< 2; j
++)
387 cm
->counts
.comp_ref
[i
][j
] += counts
->comp_ref
[i
][j
];
389 for (i
= 0; i
< TX_SIZE_CONTEXTS
; i
++) {
390 for (j
= 0; j
< TX_SIZES
; j
++)
391 cm
->counts
.tx
.p32x32
[i
][j
] += counts
->tx
.p32x32
[i
][j
];
393 for (j
= 0; j
< TX_SIZES
- 1; j
++)
394 cm
->counts
.tx
.p16x16
[i
][j
] += counts
->tx
.p16x16
[i
][j
];
396 for (j
= 0; j
< TX_SIZES
- 2; j
++)
397 cm
->counts
.tx
.p8x8
[i
][j
] += counts
->tx
.p8x8
[i
][j
];
400 for (i
= 0; i
< TX_SIZES
; i
++)
401 cm
->counts
.tx
.tx_totals
[i
] += counts
->tx
.tx_totals
[i
];
403 for (i
= 0; i
< SKIP_CONTEXTS
; i
++)
404 for (j
= 0; j
< 2; j
++)
405 cm
->counts
.skip
[i
][j
] += counts
->skip
[i
][j
];
407 for (i
= 0; i
< MV_JOINTS
; i
++)
408 cm
->counts
.mv
.joints
[i
] += counts
->mv
.joints
[i
];
410 for (k
= 0; k
< 2; k
++) {
411 nmv_component_counts
*comps
= &cm
->counts
.mv
.comps
[k
];
412 nmv_component_counts
*comps_t
= &counts
->mv
.comps
[k
];
414 for (i
= 0; i
< 2; i
++) {
415 comps
->sign
[i
] += comps_t
->sign
[i
];
416 comps
->class0_hp
[i
] += comps_t
->class0_hp
[i
];
417 comps
->hp
[i
] += comps_t
->hp
[i
];
420 for (i
= 0; i
< MV_CLASSES
; i
++)
421 comps
->classes
[i
] += comps_t
->classes
[i
];
423 for (i
= 0; i
< CLASS0_SIZE
; i
++) {
424 comps
->class0
[i
] += comps_t
->class0
[i
];
425 for (j
= 0; j
< MV_FP_SIZE
; j
++)
426 comps
->class0_fp
[i
][j
] += comps_t
->class0_fp
[i
][j
];
429 for (i
= 0; i
< MV_OFFSET_BITS
; i
++)
430 for (j
= 0; j
< 2; j
++)
431 comps
->bits
[i
][j
] += comps_t
->bits
[i
][j
];
433 for (i
= 0; i
< MV_FP_SIZE
; i
++)
434 comps
->fp
[i
] += comps_t
->fp
[i
];