3 * Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
5 * VC-3 encoder funded by the British Broadcasting Corporation
7 * This file is part of FFmpeg.
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 #define RC_VARIANCE 1 // use variance or ssd for fast rc
29 #include "mpegvideo.h"
// Forward declaration of the C reference DCT quantizer from mpegvideo;
// used as the fallback when no SIMD dct_quantize was installed (see encode_init).
// NOTE(review): this extract embeds original line numbers and skips some lines —
// verify against the complete file before editing.
32 int dct_quantize_c(MpegEncContext
*s
, DCTELEM
*block
, int n
, int qscale
, int *overflow
);
// Fixed-point fractional bits used for the rate-control lambda values below.
34 #define LAMBDA_FRAC_BITS 10
// Reads an 8x4 pixel strip into the top 4 rows of an 8x8 DCT block, then
// mirrors those rows into the bottom half via the memcpy calls (used for the
// symmetric bottom-of-frame case in interlaced 1080 content — see dnxhd_get_blocks).
// NOTE(review): the extract skips original lines (e.g. 43 -> 47), so the loop
// body/advance and braces are partially missing here; confirm against the full file.
36 static av_always_inline
void dnxhd_get_pixels_8x4(DCTELEM
*restrict block
, const uint8_t *pixels
, int line_size
)
39 for (i
= 0; i
< 4; i
++) {
40 block
[0] = pixels
[0]; block
[1] = pixels
[1];
41 block
[2] = pixels
[2]; block
[3] = pixels
[3];
42 block
[4] = pixels
[4]; block
[5] = pixels
[5];
43 block
[6] = pixels
[6]; block
[7] = pixels
[7];
// Mirror the four stored rows downward (block points past the stored rows here).
47 memcpy(block
, block
- 8, sizeof(*block
)*8);
48 memcpy(block
+ 8, block
-16, sizeof(*block
)*8);
49 memcpy(block
+16, block
-24, sizeof(*block
)*8);
50 memcpy(block
+24, block
-32, sizeof(*block
)*8);
// Builds per-(level,run-flag) AC VLC code/bit tables and the run-length tables
// from the CID table, so encoding can index codes directly instead of searching.
// vlc_codes/vlc_bits are biased by +max_level*2 so negative levels index them.
// NOTE(review): original line numbers skip (68 -> 70, 85 -> 90, 101 -> 108), so
// braces, else-branches and the return/fail paths are missing from this extract.
53 static int dnxhd_init_vlc(DNXHDEncContext
*ctx
)
// max_level covers all quantized AC levels for this bit depth.
56 int max_level
= 1<<(ctx
->cid_table
->bit_depth
+2);
58 FF_ALLOCZ_OR_GOTO(ctx
->m
.avctx
, ctx
->vlc_codes
, max_level
*4*sizeof(*ctx
->vlc_codes
), fail
);
59 FF_ALLOCZ_OR_GOTO(ctx
->m
.avctx
, ctx
->vlc_bits
, max_level
*4*sizeof(*ctx
->vlc_bits
), fail
);
60 FF_ALLOCZ_OR_GOTO(ctx
->m
.avctx
, ctx
->run_codes
, 63*2 , fail
);
61 FF_ALLOCZ_OR_GOTO(ctx
->m
.avctx
, ctx
->run_bits
, 63 , fail
);
// Bias the table pointers so index = (level<<1)|run works for negative levels.
63 ctx
->vlc_codes
+= max_level
*2;
64 ctx
->vlc_bits
+= max_level
*2;
65 for (level
= -max_level
; level
< max_level
; level
++) {
66 for (run
= 0; run
< 2; run
++) {
67 int index
= (level
<<1)|run
;
68 int sign
, offset
= 0, alevel
= level
;
// MASK_ABS yields sign mask and absolute level.
70 MASK_ABS(sign
, alevel
);
72 offset
= (alevel
-1)>>6;
// Linear search of the CID AC tables for a matching level/index-flag/run-flag entry.
75 for (j
= 0; j
< 257; j
++) {
76 if (ctx
->cid_table
->ac_level
[j
] == alevel
&&
77 (!offset
|| (ctx
->cid_table
->ac_index_flag
[j
] && offset
)) &&
78 (!run
|| (ctx
->cid_table
->ac_run_flag
[j
] && run
))) {
79 assert(!ctx
->vlc_codes
[index
]);
// Signed levels append the sign bit to the code; line numbering suggests an
// unsigned/else branch around here was dropped from the extract.
81 ctx
->vlc_codes
[index
] = (ctx
->cid_table
->ac_codes
[j
]<<1)|(sign
&1);
82 ctx
->vlc_bits
[index
] = ctx
->cid_table
->ac_bits
[j
]+1;
84 ctx
->vlc_codes
[index
] = ctx
->cid_table
->ac_codes
[j
];
85 ctx
->vlc_bits
[index
] = ctx
->cid_table
->ac_bits
[j
];
90 assert(!alevel
|| j
< 257);
// Fold the index-offset bits into the stored code/length.
92 ctx
->vlc_codes
[index
] = (ctx
->vlc_codes
[index
]<<ctx
->cid_table
->index_bits
)|offset
;
93 ctx
->vlc_bits
[index
]+= ctx
->cid_table
->index_bits
;
// Scatter the 62 CID run codes into a direct run-length-indexed table.
97 for (i
= 0; i
< 62; i
++) {
98 int run
= ctx
->cid_table
->run
[i
];
100 ctx
->run_codes
[run
] = ctx
->cid_table
->run_codes
[i
];
101 ctx
->run_bits
[run
] = ctx
->cid_table
->run_bits
[i
];
// Allocates and fills the per-qscale luma/chroma quantization matrices (int and
// 16-bit variants) from the CID weight tables via ff_convert_matrix, then scales
// all entries by 4 (<<= 2). lbias/cbias are accepted but the visible body uses
// ctx->m.intra_quant_bias for both calls.
// NOTE(review): extract skips lines (111 -> 114, 135 -> 143): declarations of
// i/qscale, closing braces and the return/fail path are not visible here.
108 static int dnxhd_init_qmat(DNXHDEncContext
*ctx
, int lbias
, int cbias
)
110 // init first elem to 1 to avoid div by 0 in convert_matrix
111 uint16_t weight_matrix
[64] = {1,}; // convert_matrix needs uint16_t*
114 FF_ALLOCZ_OR_GOTO(ctx
->m
.avctx
, ctx
->qmatrix_l
, (ctx
->m
.avctx
->qmax
+1) * 64 * sizeof(int) , fail
);
115 FF_ALLOCZ_OR_GOTO(ctx
->m
.avctx
, ctx
->qmatrix_c
, (ctx
->m
.avctx
->qmax
+1) * 64 * sizeof(int) , fail
);
116 FF_ALLOCZ_OR_GOTO(ctx
->m
.avctx
, ctx
->qmatrix_l16
, (ctx
->m
.avctx
->qmax
+1) * 64 * 2 * sizeof(uint16_t), fail
);
117 FF_ALLOCZ_OR_GOTO(ctx
->m
.avctx
, ctx
->qmatrix_c16
, (ctx
->m
.avctx
->qmax
+1) * 64 * 2 * sizeof(uint16_t), fail
);
// Permute the luma weights into IDCT order (element 0 stays 1).
119 for (i
= 1; i
< 64; i
++) {
120 int j
= ctx
->m
.dsp
.idct_permutation
[ff_zigzag_direct
[i
]];
121 weight_matrix
[j
] = ctx
->cid_table
->luma_weight
[i
];
123 ff_convert_matrix(&ctx
->m
.dsp
, ctx
->qmatrix_l
, ctx
->qmatrix_l16
, weight_matrix
,
124 ctx
->m
.intra_quant_bias
, 1, ctx
->m
.avctx
->qmax
, 1);
// Same for chroma weights, reusing the same stack matrix.
125 for (i
= 1; i
< 64; i
++) {
126 int j
= ctx
->m
.dsp
.idct_permutation
[ff_zigzag_direct
[i
]];
127 weight_matrix
[j
] = ctx
->cid_table
->chroma_weight
[i
];
129 ff_convert_matrix(&ctx
->m
.dsp
, ctx
->qmatrix_c
, ctx
->qmatrix_c16
, weight_matrix
,
130 ctx
->m
.intra_quant_bias
, 1, ctx
->m
.avctx
->qmax
, 1);
// Scale every matrix entry by 4 for all qscales.
131 for (qscale
= 1; qscale
<= ctx
->m
.avctx
->qmax
; qscale
++) {
132 for (i
= 0; i
< 64; i
++) {
133 ctx
->qmatrix_l
[qscale
] [i
] <<= 2; ctx
->qmatrix_c
[qscale
] [i
] <<= 2;
134 ctx
->qmatrix_l16
[qscale
][0][i
] <<= 2; ctx
->qmatrix_l16
[qscale
][1][i
] <<= 2;
135 ctx
->qmatrix_c16
[qscale
][0][i
] <<= 2; ctx
->qmatrix_c16
[qscale
][1][i
] <<= 2;
// Allocates rate-control state: the per-qscale/per-MB RCEntry table (8160 MBs
// worst case), plus the fast-path comparison table when not in RD mode. Sets the
// frame bit budget (coding unit minus 640-byte header and 4-byte EOF marker)
// and the starting lambda (qscale 2 in LAMBDA_FRAC_BITS fixed point).
// NOTE(review): return/fail path not visible in this extract (151 -> 157).
143 static int dnxhd_init_rc(DNXHDEncContext
*ctx
)
145 FF_ALLOCZ_OR_GOTO(ctx
->m
.avctx
, ctx
->mb_rc
, 8160*ctx
->m
.avctx
->qmax
*sizeof(RCEntry
), fail
);
146 if (ctx
->m
.avctx
->mb_decision
!= FF_MB_DECISION_RD
)
147 FF_ALLOCZ_OR_GOTO(ctx
->m
.avctx
, ctx
->mb_cmp
, ctx
->m
.mb_num
*sizeof(RCCMPEntry
), fail
);
149 ctx
->frame_bits
= (ctx
->cid_table
->coding_unit_size
- 640 - 4) * 8;
151 ctx
->lambda
= 2<<LAMBDA_FRAC_BITS
; // qscale 2
// Encoder init: resolves the compression ID (CID) from the AVCodecContext,
// validates the pixel format, wires up DSP/DCT helpers, computes macroblock
// geometry (halved per field for interlaced DCT), builds quant matrices, VLC
// tables and rate-control state, allocates per-slice/per-MB arrays, and clones
// the context per thread with evenly divided start/end MB rows.
// NOTE(review): many original lines are missing from this extract (e.g. error
// returns after the av_log calls, interlaced flag setup at 189-191, the success
// return before the fail: label); verify against the full file.
157 static int dnxhd_encode_init(AVCodecContext
*avctx
)
159 DNXHDEncContext
*ctx
= avctx
->priv_data
;
162 ctx
->cid
= ff_dnxhd_find_cid(avctx
);
// Only 4:2:2 planar input with a known CID is supported.
163 if (!ctx
->cid
|| avctx
->pix_fmt
!= PIX_FMT_YUV422P
) {
164 av_log(avctx
, AV_LOG_ERROR
, "video parameters incompatible with DNxHD\n");
167 av_log(avctx
, AV_LOG_DEBUG
, "cid %d\n", ctx
->cid
);
169 index
= ff_dnxhd_get_cid_table(ctx
->cid
);
170 ctx
->cid_table
= &ff_dnxhd_cid_table
[index
];
172 ctx
->m
.avctx
= avctx
;
176 ctx
->get_pixels_8x4_sym
= dnxhd_get_pixels_8x4
;
178 dsputil_init(&ctx
->m
.dsp
, avctx
);
179 ff_dct_common_init(&ctx
->m
);
181 ff_dnxhd_init_mmx(ctx
);
// Fall back to the C quantizer if no SIMD version was installed.
183 if (!ctx
->m
.dct_quantize
)
184 ctx
->m
.dct_quantize
= dct_quantize_c
;
186 ctx
->m
.mb_height
= (avctx
->height
+ 15) / 16;
187 ctx
->m
.mb_width
= (avctx
->width
+ 15) / 16;
// Interlaced: each coded field covers half the MB rows.
189 if (avctx
->flags
& CODEC_FLAG_INTERLACED_DCT
) {
191 ctx
->m
.mb_height
/= 2;
194 ctx
->m
.mb_num
= ctx
->m
.mb_height
* ctx
->m
.mb_width
;
196 if (avctx
->intra_quant_bias
!= FF_DEFAULT_QUANT_BIAS
)
197 ctx
->m
.intra_quant_bias
= avctx
->intra_quant_bias
;
198 if (dnxhd_init_qmat(ctx
, ctx
->m
.intra_quant_bias
, 0) < 0) // XXX tune lbias/cbias
201 if (dnxhd_init_vlc(ctx
) < 0)
203 if (dnxhd_init_rc(ctx
) < 0)
206 FF_ALLOCZ_OR_GOTO(ctx
->m
.avctx
, ctx
->slice_size
, ctx
->m
.mb_height
*sizeof(uint32_t), fail
);
207 FF_ALLOCZ_OR_GOTO(ctx
->m
.avctx
, ctx
->mb_bits
, ctx
->m
.mb_num
*sizeof(uint16_t), fail
);
208 FF_ALLOCZ_OR_GOTO(ctx
->m
.avctx
, ctx
->mb_qscale
, ctx
->m
.mb_num
*sizeof(uint8_t) , fail
);
// All DNxHD frames are intra-coded key frames.
210 ctx
->frame
.key_frame
= 1;
211 ctx
->frame
.pict_type
= FF_I_TYPE
;
212 ctx
->m
.avctx
->coded_frame
= &ctx
->frame
;
214 if (avctx
->thread_count
> MAX_THREADS
|| (avctx
->thread_count
> ctx
->m
.mb_height
)) {
215 av_log(avctx
, AV_LOG_ERROR
, "too many threads\n");
// Thread 0 is the main context; others are shallow copies.
// NOTE(review): no null check visible after av_malloc in this extract.
219 ctx
->thread
[0] = ctx
;
220 for (i
= 1; i
< avctx
->thread_count
; i
++) {
221 ctx
->thread
[i
] = av_malloc(sizeof(DNXHDEncContext
));
222 memcpy(ctx
->thread
[i
], ctx
, sizeof(DNXHDEncContext
));
// Divide MB rows evenly (with rounding) among threads.
225 for (i
= 0; i
< avctx
->thread_count
; i
++) {
226 ctx
->thread
[i
]->m
.start_mb_y
= (ctx
->m
.mb_height
*(i
) + avctx
->thread_count
/2) / avctx
->thread_count
;
227 ctx
->thread
[i
]->m
.end_mb_y
= (ctx
->m
.mb_height
*(i
+1) + avctx
->thread_count
/2) / avctx
->thread_count
;
231 fail
: //for FF_ALLOCZ_OR_GOTO
// Writes the fixed 640-byte DNxHD frame header into buf: signature prefix,
// field/progressive flag, geometry (ALPF/SPL/NAL), bit depth, CID, and the
// macroblock slice index preamble. Stores the MSIP slice-offset table pointer
// (buf+0x170) for dnxhd_encode_picture to fill in later.
// NOTE(review): the memset that zeroes the header body (original ~line 240)
// and the return are not visible in this extract.
235 static int dnxhd_write_header(AVCodecContext
*avctx
, uint8_t *buf
)
237 DNXHDEncContext
*ctx
= avctx
->priv_data
;
238 const uint8_t header_prefix
[5] = { 0x00,0x00,0x02,0x80,0x01 };
242 memcpy(buf
, header_prefix
, 5);
// 0x01 = progressive frame; 2/3 = first/second field when interlaced.
243 buf
[5] = ctx
->interlaced
? ctx
->cur_field
+2 : 0x01;
244 buf
[6] = 0x80; // crc flag off
245 buf
[7] = 0xa0; // reserved
246 AV_WB16(buf
+ 0x18, avctx
->height
); // ALPF
247 AV_WB16(buf
+ 0x1a, avctx
->width
); // SPL
248 AV_WB16(buf
+ 0x1d, avctx
->height
); // NAL
250 buf
[0x21] = 0x38; // FIXME 8 bit per comp
251 buf
[0x22] = 0x88 + (ctx
->frame
.interlaced_frame
<<2);
252 AV_WB32(buf
+ 0x28, ctx
->cid
); // CID
253 buf
[0x2c] = ctx
->interlaced
? 0 : 0x80;
255 buf
[0x5f] = 0x01; // UDL
257 buf
[0x167] = 0x02; // reserved
258 AV_WB16(buf
+ 0x16a, ctx
->m
.mb_height
* 4 + 4); // MSIPS
259 buf
[0x16d] = ctx
->m
.mb_height
; // Ns
260 buf
[0x16f] = 0x10; // reserved
// Slice-offset table starts here; filled in by dnxhd_encode_picture.
262 ctx
->msip
= buf
+ 0x170;
// Emits the DC coefficient delta: size category nbits = log2(2*|diff|), then
// the CID DC prefix code followed by the low nbits of diff.
// NOTE(review): the if/else around the two nbits assignments (negative vs
// non-negative diff) is implied by the skipped line numbers (266 -> 270 -> 273)
// but not visible in this extract.
266 static av_always_inline
void dnxhd_encode_dc(DNXHDEncContext
*ctx
, int diff
)
270 nbits
= av_log2_16bit(-2*diff
);
273 nbits
= av_log2_16bit(2*diff
);
275 put_bits(&ctx
->m
.pb
, ctx
->cid_table
->dc_bits
[nbits
] + nbits
,
276 (ctx
->cid_table
->dc_codes
[nbits
]<<nbits
) + (diff
& ((1 << nbits
) - 1)));
// Writes one quantized 8x8 block to the bitstream: DC delta against the per-
// component predictor, then run/level VLCs for nonzero ACs in scan order, then
// the EOB code. rlevel packs (signed level << 1) | run-present flag, matching
// the table layout built in dnxhd_init_vlc.
// NOTE(review): the zigzag read of slevel from block[j], the run>1 condition
// guarding the run_codes emission, and loop braces are missing from this extract
// (line numbers skip 288 -> 291 and 293 -> 295).
279 static av_always_inline
void dnxhd_encode_block(DNXHDEncContext
*ctx
, DCTELEM
*block
, int last_index
, int n
)
281 int last_non_zero
= 0;
284 dnxhd_encode_dc(ctx
, block
[0] - ctx
->m
.last_dc
[n
]);
285 ctx
->m
.last_dc
[n
] = block
[0];
287 for (i
= 1; i
<= last_index
; i
++) {
288 j
= ctx
->m
.intra_scantable
.permutated
[i
];
291 int run_level
= i
- last_non_zero
- 1;
292 int rlevel
= (slevel
<<1)|!!run_level
;
293 put_bits(&ctx
->m
.pb
, ctx
->vlc_bits
[rlevel
], ctx
->vlc_codes
[rlevel
]);
295 put_bits(&ctx
->m
.pb
, ctx
->run_bits
[run_level
], ctx
->run_codes
[run_level
]);
299 put_bits(&ctx
->m
.pb
, ctx
->vlc_bits
[0], ctx
->vlc_codes
[0]); // EOB
// Dequantizes a block in place so the RD path can IDCT it and measure SSD
// against the source. Chroma blocks (n&2) use the chroma weight table.
// The two level formulas correspond to negative ((1-2*level)) and positive
// ((2*level+1)) quantized values.
// NOTE(review): the branches selecting between them, the weight!=32 correction
// bodies, and the block[j] stores are missing from this extract (311 -> 315,
// 316 -> 321, 322 -> 331).
302 static av_always_inline
void dnxhd_unquantize_c(DNXHDEncContext
*ctx
, DCTELEM
*block
, int n
, int qscale
, int last_index
)
304 const uint8_t *weight_matrix
;
308 weight_matrix
= (n
&2) ? ctx
->cid_table
->chroma_weight
: ctx
->cid_table
->luma_weight
;
310 for (i
= 1; i
<= last_index
; i
++) {
311 int j
= ctx
->m
.intra_scantable
.permutated
[i
];
315 level
= (1-2*level
) * qscale
* weight_matrix
[i
];
316 if (weight_matrix
[i
] != 32)
321 level
= (2*level
+1) * qscale
* weight_matrix
[i
];
322 if (weight_matrix
[i
] != 32)
// Sum of squared differences between a reconstructed (qblock) and source
// (block) 8x8 block — the distortion term of the RD cost.
// NOTE(review): the declaration/initialization of score and its return are
// not visible in this extract (331 -> 335, 336 -> 340).
331 static av_always_inline
int dnxhd_ssd_block(DCTELEM
*qblock
, DCTELEM
*block
)
335 for (i
= 0; i
< 64; i
++)
336 score
+= (block
[i
]-qblock
[i
])*(block
[i
]-qblock
[i
]);
// Counts the AC bits dnxhd_encode_block would emit for this quantized block,
// using the same (level<<1)|run-flag table index, without writing anything —
// used by the rate-control bit estimation.
// NOTE(review): the level read from block[j], the nonzero check, the run>1
// handling and last_non_zero update, and the return of bits are not visible
// in this extract (346 -> 349, 350 -> 357).
340 static av_always_inline
int dnxhd_calc_ac_bits(DNXHDEncContext
*ctx
, DCTELEM
*block
, int last_index
)
342 int last_non_zero
= 0;
345 for (i
= 1; i
<= last_index
; i
++) {
346 j
= ctx
->m
.intra_scantable
.permutated
[i
];
349 int run_level
= i
- last_non_zero
- 1;
350 bits
+= ctx
->vlc_bits
[(level
<<1)|!!run_level
]+ctx
->run_bits
[run_level
];
// Loads the 8 DCT blocks of one 16x16 macroblock (4 luma-position blocks 0/1/4/5
// and chroma 2/3/6/7 for 4:2:2) from the source planes. For the last MB row of
// 1080-line content: interlaced frames use the symmetric 8x4 loader for the
// bottom blocks, otherwise they are cleared; all other rows use plain
// get_pixels with the dct_y/uv_offset for the lower half.
// NOTE(review): the else keywords and closing braces between the three branches
// are implied by the skipped numbering (374 -> 376 -> 380) but not visible here.
357 static av_always_inline
void dnxhd_get_blocks(DNXHDEncContext
*ctx
, int mb_x
, int mb_y
)
359 const uint8_t *ptr_y
= ctx
->thread
[0]->src
[0] + ((mb_y
<< 4) * ctx
->m
.linesize
) + (mb_x
<< 4);
360 const uint8_t *ptr_u
= ctx
->thread
[0]->src
[1] + ((mb_y
<< 4) * ctx
->m
.uvlinesize
) + (mb_x
<< 3);
361 const uint8_t *ptr_v
= ctx
->thread
[0]->src
[2] + ((mb_y
<< 4) * ctx
->m
.uvlinesize
) + (mb_x
<< 3);
362 DSPContext
*dsp
= &ctx
->m
.dsp
;
// Top half of the macroblock.
364 dsp
->get_pixels(ctx
->blocks
[0], ptr_y
, ctx
->m
.linesize
);
365 dsp
->get_pixels(ctx
->blocks
[1], ptr_y
+ 8, ctx
->m
.linesize
);
366 dsp
->get_pixels(ctx
->blocks
[2], ptr_u
, ctx
->m
.uvlinesize
);
367 dsp
->get_pixels(ctx
->blocks
[3], ptr_v
, ctx
->m
.uvlinesize
);
// Last MB row of 1080 content: only 8 source lines remain (1080 = 67.5 MBs).
369 if (mb_y
+1 == ctx
->m
.mb_height
&& ctx
->m
.avctx
->height
== 1080) {
370 if (ctx
->interlaced
) {
371 ctx
->get_pixels_8x4_sym(ctx
->blocks
[4], ptr_y
+ ctx
->dct_y_offset
, ctx
->m
.linesize
);
372 ctx
->get_pixels_8x4_sym(ctx
->blocks
[5], ptr_y
+ ctx
->dct_y_offset
+ 8, ctx
->m
.linesize
);
373 ctx
->get_pixels_8x4_sym(ctx
->blocks
[6], ptr_u
+ ctx
->dct_uv_offset
, ctx
->m
.uvlinesize
);
374 ctx
->get_pixels_8x4_sym(ctx
->blocks
[7], ptr_v
+ ctx
->dct_uv_offset
, ctx
->m
.uvlinesize
);
376 dsp
->clear_block(ctx
->blocks
[4]); dsp
->clear_block(ctx
->blocks
[5]);
377 dsp
->clear_block(ctx
->blocks
[6]); dsp
->clear_block(ctx
->blocks
[7]);
// Normal case: bottom half read directly at the row offset.
380 dsp
->get_pixels(ctx
->blocks
[4], ptr_y
+ ctx
->dct_y_offset
, ctx
->m
.linesize
);
381 dsp
->get_pixels(ctx
->blocks
[5], ptr_y
+ ctx
->dct_y_offset
+ 8, ctx
->m
.linesize
);
382 dsp
->get_pixels(ctx
->blocks
[6], ptr_u
+ ctx
->dct_uv_offset
, ctx
->m
.uvlinesize
);
383 dsp
->get_pixels(ctx
->blocks
[7], ptr_v
+ ctx
->dct_uv_offset
, ctx
->m
.uvlinesize
);
// Points the shared MpegEncContext quant-matrix pointers at the chroma or luma
// matrices depending on the block index i, for use by dct_quantize.
// NOTE(review): the condition selecting chroma vs luma (original ~line 389) and
// the returned component index are not visible in this extract (387 -> 390,
// 395 -> 400).
387 static av_always_inline
int dnxhd_switch_matrix(DNXHDEncContext
*ctx
, int i
)
390 ctx
->m
.q_intra_matrix16
= ctx
->qmatrix_c16
;
391 ctx
->m
.q_intra_matrix
= ctx
->qmatrix_c
;
394 ctx
->m
.q_intra_matrix16
= ctx
->qmatrix_l16
;
395 ctx
->m
.q_intra_matrix
= ctx
->qmatrix_l
;
// Per-thread rate-control pass: for each MB in this thread's row span, quantize
// every block at the shared qscale on a scratch copy, accumulate AC bits
// (dnxhd_calc_ac_bits) and DC bits, and — in RD mode or when RC_VARIANCE is
// disabled — dequantize/IDCT to accumulate SSD. Results are stored per
// (qscale, mb) in ctx->mb_rc; the stored bit count includes the 12-bit MB
// qscale field and 8 EOB codes.
// NOTE(review): declarations of mb_x/mb_y/i, the per-MB ssd/ac_bits/dc_bits
// resets, last_dc initialization for indices 0/1, and the trailing return are
// missing from this extract (404 -> 406 -> 409, 412 -> 418, 444 -> 450).
400 static int dnxhd_calc_bits_thread(AVCodecContext
*avctx
, void *arg
)
402 DNXHDEncContext
*ctx
= *(void**)arg
;
// All threads evaluate the qscale chosen on thread 0.
404 int qscale
= ctx
->thread
[0]->qscale
;
406 for (mb_y
= ctx
->m
.start_mb_y
; mb_y
< ctx
->m
.end_mb_y
; mb_y
++) {
// Reset DC prediction at the start of each MB row (slice).
409 ctx
->m
.last_dc
[2] = 1024;
411 for (mb_x
= 0; mb_x
< ctx
->m
.mb_width
; mb_x
++) {
412 unsigned mb
= mb_y
* ctx
->m
.mb_width
+ mb_x
;
418 dnxhd_get_blocks(ctx
, mb_x
, mb_y
);
420 for (i
= 0; i
< 8; i
++) {
421 DECLARE_ALIGNED_16(DCTELEM
, block
[64]);
422 DCTELEM
*src_block
= ctx
->blocks
[i
];
423 int overflow
, nbits
, diff
, last_index
;
424 int n
= dnxhd_switch_matrix(ctx
, i
);
// Quantize a copy; src_block is kept for SSD measurement.
426 memcpy(block
, src_block
, sizeof(block
));
427 last_index
= ctx
->m
.dct_quantize((MpegEncContext
*)ctx
, block
, i
, qscale
, &overflow
);
428 ac_bits
+= dnxhd_calc_ac_bits(ctx
, block
, last_index
);
// Mirror dnxhd_encode_dc's size computation to count DC bits.
430 diff
= block
[0] - ctx
->m
.last_dc
[n
];
431 if (diff
< 0) nbits
= av_log2_16bit(-2*diff
);
432 else nbits
= av_log2_16bit( 2*diff
);
433 dc_bits
+= ctx
->cid_table
->dc_bits
[nbits
] + nbits
;
435 ctx
->m
.last_dc
[n
] = block
[0];
// Distortion is only measured when RD decisions (or exact SSD RC) are needed.
437 if (avctx
->mb_decision
== FF_MB_DECISION_RD
|| !RC_VARIANCE
) {
438 dnxhd_unquantize_c(ctx
, block
, i
, qscale
, last_index
);
439 ctx
->m
.dsp
.idct(block
);
440 ssd
+= dnxhd_ssd_block(block
, src_block
);
443 ctx
->mb_rc
[qscale
][mb
].ssd
= ssd
;
// 12 = MB qscale header bits; 8 EOB codes (one per block).
444 ctx
->mb_rc
[qscale
][mb
].bits
= ac_bits
+dc_bits
+12+8*ctx
->vlc_bits
[0];
// Per-thread final encoding pass: for each MB in this thread's rows, write the
// 12-bit qscale field, quantize each of the 8 blocks at its decided per-MB
// qscale, and emit them with dnxhd_encode_block. Slices are padded to a 32-bit
// boundary and the bitstream is flushed at the end.
// NOTE(review): declarations of mb_x/mb_y/i, last_dc[0]/[1] initialization, and
// the return are not visible in this extract (452 -> 455 -> 458, 475 -> 478).
450 static int dnxhd_encode_thread(AVCodecContext
*avctx
, void *arg
)
452 DNXHDEncContext
*ctx
= *(void**)arg
;
455 for (mb_y
= ctx
->m
.start_mb_y
; mb_y
< ctx
->m
.end_mb_y
; mb_y
++) {
// Reset DC prediction at each MB row (slice) boundary.
458 ctx
->m
.last_dc
[2] = 1024;
459 for (mb_x
= 0; mb_x
< ctx
->m
.mb_width
; mb_x
++) {
460 unsigned mb
= mb_y
* ctx
->m
.mb_width
+ mb_x
;
461 int qscale
= ctx
->mb_qscale
[mb
];
// 12-bit MB header: qscale<<1 (matches the 12 bits counted by the RC pass).
464 put_bits(&ctx
->m
.pb
, 12, qscale
<<1);
466 dnxhd_get_blocks(ctx
, mb_x
, mb_y
);
468 for (i
= 0; i
< 8; i
++) {
469 DCTELEM
*block
= ctx
->blocks
[i
];
470 int last_index
, overflow
;
471 int n
= dnxhd_switch_matrix(ctx
, i
);
472 last_index
= ctx
->m
.dct_quantize((MpegEncContext
*)ctx
, block
, i
, qscale
, &overflow
);
474 dnxhd_encode_block(ctx
, block
, last_index
, n
);
475 //STOP_TIMER("encode_block");
// Pad each slice to a 32-bit boundary, as assumed by dnxhd_setup_threads_slices.
478 if (put_bits_count(&ctx
->m
.pb
)&31)
479 put_bits(&ctx
->m
.pb
, 32-(put_bits_count(&ctx
->m
.pb
)&31), 0);
481 flush_put_bits(&ctx
->m
.pb
);
// Computes each slice's (MB row's) byte size from the per-MB bit estimates
// (rounded up to 32 bits, then converted to bytes), and gives every thread its
// own PutBitContext window into buf starting after the 640-byte header, laid
// out back-to-back by cumulative offset.
// NOTE(review): declarations of i/mb_x/mb_y/offset and the per-thread
// thread_size reset are not visible here (485 -> 489, 499 -> 501).
485 static void dnxhd_setup_threads_slices(DNXHDEncContext
*ctx
, uint8_t *buf
)
489 for (i
= 0; i
< ctx
->m
.avctx
->thread_count
; i
++) {
491 for (mb_y
= ctx
->thread
[i
]->m
.start_mb_y
; mb_y
< ctx
->thread
[i
]->m
.end_mb_y
; mb_y
++) {
492 ctx
->slice_size
[mb_y
] = 0;
493 for (mb_x
= 0; mb_x
< ctx
->m
.mb_width
; mb_x
++) {
494 unsigned mb
= mb_y
* ctx
->m
.mb_width
+ mb_x
;
495 ctx
->slice_size
[mb_y
] += ctx
->mb_bits
[mb
];
// Round each slice up to a whole number of 32-bit words, then to bytes.
497 ctx
->slice_size
[mb_y
] = (ctx
->slice_size
[mb_y
]+31)&~31;
498 ctx
->slice_size
[mb_y
] >>= 3;
499 thread_size
+= ctx
->slice_size
[mb_y
];
// Each thread writes into its own contiguous window after the 640-byte header.
501 init_put_bits(&ctx
->thread
[i
]->m
.pb
, buf
+ 640 + offset
, thread_size
);
502 offset
+= thread_size
;
// Per-thread variance pass for the fast rate-control path: computes a luma
// variance proxy per macroblock (pix_norm1 - mean^2, scaled) and records it in
// mb_cmp so dnxhd_encode_fast can radix-sort MBs by activity.
// NOTE(review): mb_x/mb_y declarations, closing braces and the return are not
// visible in this extract (508 -> 510, 517 -> 523).
506 static int dnxhd_mb_var_thread(AVCodecContext
*avctx
, void *arg
)
508 DNXHDEncContext
*ctx
= *(void**)arg
;
510 for (mb_y
= ctx
->m
.start_mb_y
; mb_y
< ctx
->m
.end_mb_y
; mb_y
++) {
511 for (mb_x
= 0; mb_x
< ctx
->m
.mb_width
; mb_x
++) {
512 unsigned mb
= mb_y
* ctx
->m
.mb_width
+ mb_x
;
513 uint8_t *pix
= ctx
->thread
[0]->src
[0] + ((mb_y
<<4) * ctx
->m
.linesize
) + (mb_x
<<4);
514 int sum
= ctx
->m
.dsp
.pix_sum(pix
, ctx
->m
.linesize
);
// varc ~ (sum of squares - mean^2*256)/256, rounded: a 16x16 variance estimate.
515 int varc
= (ctx
->m
.dsp
.pix_norm1(pix
, ctx
->m
.linesize
) - (((unsigned)(sum
*sum
))>>8)+128)>>8;
516 ctx
->mb_cmp
[mb
].value
= varc
;
517 ctx
->mb_cmp
[mb
].mb
= mb
;
// Rate-distortion rate control: precomputes bits+SSD for every qscale and MB
// (dnxhd_calc_bits_thread per qscale), then binary/step-searches lambda so that
// choosing, per MB, the qscale minimizing bits*lambda + (ssd<<LAMBDA_FRAC_BITS)
// fits the frame bit budget. The converged lambda is kept in ctx->lambda as the
// starting point for the next frame.
// NOTE(review): this extract is missing the outer iteration loop, the per-MB
// best-qscale selection inside the q loop (qscale assignment on score < min),
// several up_step adjustments, break/return statements, and closing braces —
// the skipped numbering (534 -> 539, 549 -> 555, 560 -> 563, 584 -> 588) marks
// the gaps. Do not modify without the full file.
523 static int dnxhd_encode_rdo(AVCodecContext
*avctx
, DNXHDEncContext
*ctx
)
525 int lambda
, up_step
, down_step
;
526 int last_lower
= INT_MAX
, last_higher
= 0;
// Fill mb_rc for every candidate qscale.
529 for (q
= 1; q
< avctx
->qmax
; q
++) {
531 avctx
->execute(avctx
, dnxhd_calc_bits_thread
, (void**)&ctx
->thread
[0], NULL
, avctx
->thread_count
, sizeof(void*));
533 up_step
= down_step
= 2<<LAMBDA_FRAC_BITS
;
534 lambda
= ctx
->lambda
;
539 if (lambda
== last_higher
) {
541 end
= 1; // need to set final qscales/bits
543 for (y
= 0; y
< ctx
->m
.mb_height
; y
++) {
544 for (x
= 0; x
< ctx
->m
.mb_width
; x
++) {
545 unsigned min
= UINT_MAX
;
547 int mb
= y
*ctx
->m
.mb_width
+x
;
548 for (q
= 1; q
< avctx
->qmax
; q
++) {
// RD cost in LAMBDA_FRAC_BITS fixed point.
549 unsigned score
= ctx
->mb_rc
[q
][mb
].bits
*lambda
+(ctx
->mb_rc
[q
][mb
].ssd
<<LAMBDA_FRAC_BITS
);
555 bits
+= ctx
->mb_rc
[qscale
][mb
].bits
;
556 ctx
->mb_qscale
[mb
] = qscale
;
557 ctx
->mb_bits
[mb
] = ctx
->mb_rc
[qscale
][mb
].bits
;
559 bits
= (bits
+31)&~31; // padding
560 if (bits
> ctx
->frame_bits
)
563 //dprintf(ctx->m.avctx, "lambda %d, up %u, down %u, bits %d, frame %d\n",
564 // lambda, last_higher, last_lower, bits, ctx->frame_bits);
566 if (bits
> ctx
->frame_bits
)
// Under budget: lambda was high enough; bisect downward toward last_higher.
570 if (bits
< ctx
->frame_bits
) {
571 last_lower
= FFMIN(lambda
, last_lower
);
572 if (last_higher
!= 0)
573 lambda
= (lambda
+last_higher
)>>1;
576 down_step
*= 5; // XXX tune ?
577 up_step
= 1<<LAMBDA_FRAC_BITS
;
578 lambda
= FFMAX(1, lambda
);
579 if (lambda
== last_lower
)
// Over budget: raise lambda, bisecting toward last_lower when known.
582 last_higher
= FFMAX(lambda
, last_higher
);
583 if (last_lower
!= INT_MAX
)
584 lambda
= (lambda
+last_lower
)>>1;
588 down_step
= 1<<LAMBDA_FRAC_BITS
;
591 //dprintf(ctx->m.avctx, "out lambda %d\n", lambda);
// Persist for the next frame's starting point.
592 ctx
->lambda
= lambda
;
// Fast rate control: searches for the single frame-wide qscale whose total
// estimated bits (summed from mb_rc, padded per frame) fits the budget.
// Bisection between last_higher (over budget) and last_lower (under budget),
// with an accelerating down_step while no upper bound is known. The converged
// qscale is stored in ctx->qscale for reuse next frame.
// NOTE(review): the outer search loop, bits reset, several break statements,
// the over-qmax error return and the final return are missing from this
// extract (596 -> 602, 616 -> 619, 632 -> 637, 645 -> 649); treat the visible
// control flow as partial.
596 static int dnxhd_find_qscale(DNXHDEncContext
*ctx
)
602 int last_lower
= INT_MAX
;
606 qscale
= ctx
->qscale
;
609 ctx
->qscale
= qscale
;
610 // XXX avoid recalculating bits
611 ctx
->m
.avctx
->execute(ctx
->m
.avctx
, dnxhd_calc_bits_thread
, (void**)&ctx
->thread
[0], NULL
, ctx
->m
.avctx
->thread_count
, sizeof(void*));
612 for (y
= 0; y
< ctx
->m
.mb_height
; y
++) {
613 for (x
= 0; x
< ctx
->m
.mb_width
; x
++)
614 bits
+= ctx
->mb_rc
[qscale
][y
*ctx
->m
.mb_width
+x
].bits
;
615 bits
= (bits
+31)&~31; // padding
616 if (bits
> ctx
->frame_bits
)
619 //dprintf(ctx->m.avctx, "%d, qscale %d, bits %d, frame %d, higher %d, lower %d\n",
620 // ctx->m.avctx->frame_number, qscale, bits, ctx->frame_bits, last_higher, last_lower);
// Under budget: try a lower qscale (bisect toward last_higher if known).
621 if (bits
< ctx
->frame_bits
) {
624 if (last_higher
== qscale
- 1) {
625 qscale
= last_higher
;
628 last_lower
= FFMIN(qscale
, last_lower
);
629 if (last_higher
!= 0)
630 qscale
= (qscale
+last_higher
)>>1;
632 qscale
-= down_step
++;
// Over budget: raise qscale (bisect toward last_lower if known).
637 if (last_lower
== qscale
+ 1)
639 last_higher
= FFMAX(qscale
, last_higher
);
640 if (last_lower
!= INT_MAX
)
641 qscale
= (qscale
+last_lower
)>>1;
645 if (qscale
>= ctx
->m
.avctx
->qmax
)
649 //dprintf(ctx->m.avctx, "out qscale %d\n", qscale);
650 ctx
->qscale
= qscale
;
// Radix sort over RCCMPEntry.value: 4 passes of 8 bits each (32-bit keys).
654 #define BUCKET_BITS 8
655 #define RADIX_PASSES 4
656 #define NBUCKETS (1 << BUCKET_BITS)
// Extracts the bucket for one radix pass, inverted (NBUCKETS-1-value) so the
// overall sort order is descending by value.
// NOTE(review): the shift by `shift` (original line 660) is missing from this
// extract — as visible, the shift parameter is unused; verify the full file.
658 static inline int get_bucket(int value
, int shift
)
661 value
&= NBUCKETS
- 1;
662 return NBUCKETS
- 1 - value
;
// Histogram phase of the radix sort: counts bucket occupancies for every pass,
// then converts each pass's counts into starting offsets by a reverse prefix
// subtraction from the total.
// NOTE(review): the declaration/initialization of offset (to size), the shift
// of v between passes, and closing braces are missing from this extract
// (672 -> 677, 677 -> 679) — note get_bucket is called with shift 0 for every
// pass here; confirm against the full file.
665 static void radix_count(const RCCMPEntry
*data
, int size
, int buckets
[RADIX_PASSES
][NBUCKETS
])
668 memset(buckets
, 0, sizeof(buckets
[0][0]) * RADIX_PASSES
* NBUCKETS
);
669 for (i
= 0; i
< size
; i
++) {
670 int v
= data
[i
].value
;
671 for (j
= 0; j
< RADIX_PASSES
; j
++) {
672 buckets
[j
][get_bucket(v
, 0)]++;
// Convert counts to start offsets (reverse cumulative from the end).
677 for (j
= 0; j
< RADIX_PASSES
; j
++) {
679 for (i
= NBUCKETS
- 1; i
>= 0; i
--)
680 buckets
[j
][i
] = offset
-= buckets
[j
][i
];
681 assert(!buckets
[j
][0]);
// One stable distribution pass of the radix sort: places each entry of data
// into dst at the next position of its bucket (for the byte selected by pass).
// NOTE(review): the dst[pos] = data[i] store and closing brace are not visible
// in this extract (691 -> 696).
685 static void radix_sort_pass(RCCMPEntry
*dst
, const RCCMPEntry
*data
, int size
, int buckets
[NBUCKETS
], int pass
)
687 int shift
= pass
* BUCKET_BITS
;
689 for (i
= 0; i
< size
; i
++) {
690 int v
= get_bucket(data
[i
].value
, shift
);
691 int pos
= buckets
[v
]++;
// LSB-first radix sort of the MB comparison entries using a temporary buffer;
// passes 2 and 3 are skipped when the upper bytes of all keys are zero (their
// last bucket holding the full count indicates nothing to distribute).
// NOTE(review): no null check on av_malloc and no av_free(tmp) are visible in
// this extract (705 -> 710); verify against the full file.
696 static void radix_sort(RCCMPEntry
*data
, int size
)
698 int buckets
[RADIX_PASSES
][NBUCKETS
];
699 RCCMPEntry
*tmp
= av_malloc(sizeof(*tmp
) * size
);
700 radix_count(data
, size
, buckets
);
701 radix_sort_pass(tmp
, data
, size
, buckets
[0], 0);
702 radix_sort_pass(data
, tmp
, size
, buckets
[1], 1);
// Only sort on the high 16 bits if any key actually uses them.
703 if (buckets
[2][NBUCKETS
- 1] || buckets
[3][NBUCKETS
- 1]) {
704 radix_sort_pass(tmp
, data
, size
, buckets
[2], 2);
705 radix_sort_pass(data
, tmp
, size
, buckets
[3], 3);
// Fast rate-control path: finds a frame-wide qscale (dnxhd_find_qscale),
// assigns it to every MB, and if the padded worst-case total still exceeds the
// budget, ranks MBs by SSD-saved-per-bit (or variance via dnxhd_mb_var_thread
// under RC_VARIANCE) and bumps the best candidates to qscale+1 until the frame
// fits.
// NOTE(review): this extract is missing the declarations of x/y/max_bits/ret,
// the early return after find_qscale, per-slice padding accumulation, the
// RC_VARIANCE compile-time selection around the variance pass, and the final
// return (714 -> 716, 728 -> 731, 731 -> 735, 741 -> 747).
710 static int dnxhd_encode_fast(AVCodecContext
*avctx
, DNXHDEncContext
*ctx
)
714 if ((ret
= dnxhd_find_qscale(ctx
)) < 0)
716 for (y
= 0; y
< ctx
->m
.mb_height
; y
++) {
717 for (x
= 0; x
< ctx
->m
.mb_width
; x
++) {
718 int mb
= y
*ctx
->m
.mb_width
+x
;
720 ctx
->mb_qscale
[mb
] = ctx
->qscale
;
721 ctx
->mb_bits
[mb
] = ctx
->mb_rc
[ctx
->qscale
][mb
].bits
;
722 max_bits
+= ctx
->mb_rc
[ctx
->qscale
][mb
].bits
;
// Rank MBs by SSD gained per bit saved when moving to qscale+1.
724 delta_bits
= ctx
->mb_rc
[ctx
->qscale
][mb
].bits
-ctx
->mb_rc
[ctx
->qscale
+1][mb
].bits
;
725 ctx
->mb_cmp
[mb
].mb
= mb
;
726 ctx
->mb_cmp
[mb
].value
= delta_bits
?
727 ((ctx
->mb_rc
[ctx
->qscale
][mb
].ssd
-ctx
->mb_rc
[ctx
->qscale
+1][mb
].ssd
)*100)/delta_bits
728 : INT_MIN
; //avoid increasing qscale
731 max_bits
+= 31; //worst padding
// Variance-based ranking (fast path), then bump the cheapest MBs to qscale+1
// until the estimate fits the frame budget.
735 avctx
->execute(avctx
, dnxhd_mb_var_thread
, (void**)&ctx
->thread
[0], NULL
, avctx
->thread_count
, sizeof(void*));
736 radix_sort(ctx
->mb_cmp
, ctx
->m
.mb_num
);
737 for (x
= 0; x
< ctx
->m
.mb_num
&& max_bits
> ctx
->frame_bits
; x
++) {
738 int mb
= ctx
->mb_cmp
[x
].mb
;
739 max_bits
-= ctx
->mb_rc
[ctx
->qscale
][mb
].bits
- ctx
->mb_rc
[ctx
->qscale
+1][mb
].bits
;
740 ctx
->mb_qscale
[mb
] = ctx
->qscale
+1;
741 ctx
->mb_bits
[mb
] = ctx
->mb_rc
[ctx
->qscale
+1][mb
].bits
;
// Copies the input frame's plane pointers/linesizes into the encoder's frame,
// propagates (field-doubled when interlaced) linesizes and the bottom-half DCT
// offsets to every thread context, and records interlacing state. cur_field
// starts at the second field when the frame is bottom-field-first.
// NOTE(review): declaration of i and closing braces are missing (747 -> 751,
// 760 -> 763 in the embedded numbering).
747 static void dnxhd_load_picture(DNXHDEncContext
*ctx
, const AVFrame
*frame
)
751 for (i
= 0; i
< 3; i
++) {
752 ctx
->frame
.data
[i
] = frame
->data
[i
];
753 ctx
->frame
.linesize
[i
] = frame
->linesize
[i
];
// Interlaced coding doubles the stride so each pass walks one field.
756 for (i
= 0; i
< ctx
->m
.avctx
->thread_count
; i
++) {
757 ctx
->thread
[i
]->m
.linesize
= ctx
->frame
.linesize
[0]<<ctx
->interlaced
;
758 ctx
->thread
[i
]->m
.uvlinesize
= ctx
->frame
.linesize
[1]<<ctx
->interlaced
;
759 ctx
->thread
[i
]->dct_y_offset
= ctx
->m
.linesize
*8;
760 ctx
->thread
[i
]->dct_uv_offset
= ctx
->m
.uvlinesize
*8;
763 ctx
->frame
.interlaced_frame
= frame
->interlaced_frame
;
764 ctx
->cur_field
= frame
->interlaced_frame
&& !frame
->top_field_first
;
// Top-level per-frame encode: validates the output buffer size, loads the
// picture, selects field data when interlaced, writes the 640-byte header, runs
// RD or fast rate control, lays out per-thread slice windows, fills the MSIP
// slice-offset table, encodes all slices in parallel, zero-pads the coding unit
// and appends the 0x600DC0DE EOF marker. Interlaced frames loop back (via
// encode_coding_unit) to code the second field into the next coding unit.
// NOTE(review): the extract omits the error returns, the encode_coding_unit
// label, the first_field/cur_field updates around line 814-816, the offset
// reset, and the else before the fast path (790 -> 792); verify before editing.
767 static int dnxhd_encode_picture(AVCodecContext
*avctx
, unsigned char *buf
, int buf_size
, void *data
)
769 DNXHDEncContext
*ctx
= avctx
->priv_data
;
773 if (buf_size
< ctx
->cid_table
->frame_size
) {
774 av_log(avctx
, AV_LOG_ERROR
, "output buffer is too small to compress picture\n");
778 dnxhd_load_picture(ctx
, data
);
// Point src at the current field (second field starts one line down).
781 for (i
= 0; i
< 3; i
++) {
782 ctx
->src
[i
] = ctx
->frame
.data
[i
];
783 if (ctx
->interlaced
&& ctx
->cur_field
)
784 ctx
->src
[i
] += ctx
->frame
.linesize
[i
];
787 dnxhd_write_header(avctx
, buf
);
789 if (avctx
->mb_decision
== FF_MB_DECISION_RD
)
790 ret
= dnxhd_encode_rdo(avctx
, ctx
);
792 ret
= dnxhd_encode_fast(avctx
, ctx
);
794 av_log(avctx
, AV_LOG_ERROR
, "picture could not fit ratecontrol constraints\n");
798 dnxhd_setup_threads_slices(ctx
, buf
);
// Fill the MSIP table with each slice's byte offset; slices are 32-bit aligned.
801 for (i
= 0; i
< ctx
->m
.mb_height
; i
++) {
802 AV_WB32(ctx
->msip
+ i
* 4, offset
);
803 offset
+= ctx
->slice_size
[i
];
804 assert(!(ctx
->slice_size
[i
] & 3));
807 avctx
->execute(avctx
, dnxhd_encode_thread
, (void**)&ctx
->thread
[0], NULL
, avctx
->thread_count
, sizeof(void*));
809 assert(640 + offset
+ 4 <= ctx
->cid_table
->coding_unit_size
);
// Zero-fill the rest of the coding unit, then write the EOF marker.
810 memset(buf
+ 640 + offset
, 0, ctx
->cid_table
->coding_unit_size
- 4 - offset
- 640);
812 AV_WB32(buf
+ ctx
->cid_table
->coding_unit_size
- 4, 0x600DC0DE); // EOF
// Interlaced: advance past this coding unit and encode the second field.
814 if (ctx
->interlaced
&& first_field
) {
817 buf
+= ctx
->cid_table
->coding_unit_size
;
818 buf_size
-= ctx
->cid_table
->coding_unit_size
;
819 goto encode_coding_unit
;
822 ctx
->frame
.quality
= ctx
->qscale
*FF_QP2LAMBDA
;
824 return ctx
->cid_table
->frame_size
;
// Encoder teardown: frees the VLC tables (undoing the +max_level*2 pointer bias
// applied in dnxhd_init_vlc before freeing), the run tables, rate-control and
// per-MB arrays, all quant matrices, and the per-thread context copies
// (thread[0] is avctx->priv_data and is not freed here).
// NOTE(review): the final return is not visible in this extract (850 -> 855).
827 static int dnxhd_encode_end(AVCodecContext
*avctx
)
829 DNXHDEncContext
*ctx
= avctx
->priv_data
;
830 int max_level
= 1<<(ctx
->cid_table
->bit_depth
+2);
// vlc_codes/vlc_bits were advanced by max_level*2 at init; free the true base.
833 av_free(ctx
->vlc_codes
-max_level
*2);
834 av_free(ctx
->vlc_bits
-max_level
*2);
835 av_freep(&ctx
->run_codes
);
836 av_freep(&ctx
->run_bits
);
838 av_freep(&ctx
->mb_bits
);
839 av_freep(&ctx
->mb_qscale
);
840 av_freep(&ctx
->mb_rc
);
841 av_freep(&ctx
->mb_cmp
);
842 av_freep(&ctx
->slice_size
);
844 av_freep(&ctx
->qmatrix_c
);
845 av_freep(&ctx
->qmatrix_l
);
846 av_freep(&ctx
->qmatrix_c16
);
847 av_freep(&ctx
->qmatrix_l16
);
// Thread 0 is the main context (freed with priv_data); only copies are freed.
849 for (i
= 1; i
< avctx
->thread_count
; i
++)
850 av_freep(&ctx
->thread
[i
]);
855 AVCodec dnxhd_encoder
= {
859 sizeof(DNXHDEncContext
),
861 dnxhd_encode_picture
,
863 .pix_fmts
= (const enum PixelFormat
[]){PIX_FMT_YUV422P
, PIX_FMT_NONE
},
864 .long_name
= NULL_IF_CONFIG_SMALL("VC3/DNxHD"),