 * Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
 * Copyright (c) 2011 MirriAd Ltd
 *
 * VC-3 encoder funded by the British Broadcasting Corporation
 * 10 bit support added by MirriAd Ltd, Joseph Artsimovich <joseph@mirriad.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 #include "libavutil/attributes.h"
27 #include "libavutil/internal.h"
28 #include "libavutil/opt.h"
29 #include "libavutil/timer.h"
35 #include "mpegvideo.h"
36 #include "pixblockdsp.h"
40 // The largest value that will not lead to overflow for 10bit samples.
41 #define DNX10BIT_QMAT_SHIFT 18
42 #define RC_VARIANCE 1 // use variance or ssd for fast rc
43 #define LAMBDA_FRAC_BITS 10
45 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
46 static const AVOption options
[] = {
47 { "nitris_compat", "encode with Avid Nitris compatibility",
48 offsetof(DNXHDEncContext
, nitris_compat
), AV_OPT_TYPE_INT
, { .i64
= 0 }, 0, 1, VE
},
52 static const AVClass dnxhd_class
= {
53 .class_name
= "dnxhd",
54 .item_name
= av_default_item_name
,
56 .version
= LIBAVUTIL_VERSION_INT
,
59 static void dnxhd_8bit_get_pixels_8x4_sym(int16_t *av_restrict block
,
60 const uint8_t *pixels
,
64 for (i
= 0; i
< 4; i
++) {
76 memcpy(block
, block
- 8, sizeof(*block
) * 8);
77 memcpy(block
+ 8, block
- 16, sizeof(*block
) * 8);
78 memcpy(block
+ 16, block
- 24, sizeof(*block
) * 8);
79 memcpy(block
+ 24, block
- 32, sizeof(*block
) * 8);
82 static av_always_inline
83 void dnxhd_10bit_get_pixels_8x4_sym(int16_t *av_restrict block
,
84 const uint8_t *pixels
,
88 const uint16_t* pixels16
= (const uint16_t*)pixels
;
91 for (i
= 0; i
< 4; i
++) {
92 block
[0] = pixels16
[0]; block
[1] = pixels16
[1];
93 block
[2] = pixels16
[2]; block
[3] = pixels16
[3];
94 block
[4] = pixels16
[4]; block
[5] = pixels16
[5];
95 block
[6] = pixels16
[6]; block
[7] = pixels16
[7];
96 pixels16
+= line_size
;
99 memcpy(block
, block
- 8, sizeof(*block
) * 8);
100 memcpy(block
+ 8, block
- 16, sizeof(*block
) * 8);
101 memcpy(block
+ 16, block
- 24, sizeof(*block
) * 8);
102 memcpy(block
+ 24, block
- 32, sizeof(*block
) * 8);
105 static int dnxhd_10bit_dct_quantize(MpegEncContext
*ctx
, int16_t *block
,
106 int n
, int qscale
, int *overflow
)
108 const uint8_t *scantable
= ctx
->intra_scantable
.scantable
;
109 const int *qmat
= n
<4 ? ctx
->q_intra_matrix
[qscale
] : ctx
->q_chroma_intra_matrix
[qscale
];
110 int last_non_zero
= 0;
113 ctx
->fdsp
.fdct(block
);
115 // Divide by 4 with rounding, to compensate scaling of DCT coefficients
116 block
[0] = (block
[0] + 2) >> 2;
118 for (i
= 1; i
< 64; ++i
) {
119 int j
= scantable
[i
];
120 int sign
= block
[j
] >> 31;
121 int level
= (block
[j
] ^ sign
) - sign
;
122 level
= level
* qmat
[j
] >> DNX10BIT_QMAT_SHIFT
;
123 block
[j
] = (level
^ sign
) - sign
;
128 return last_non_zero
;
131 static av_cold
int dnxhd_init_vlc(DNXHDEncContext
*ctx
)
133 int i
, j
, level
, run
;
134 int max_level
= 1 << (ctx
->cid_table
->bit_depth
+ 2);
136 FF_ALLOCZ_ARRAY_OR_GOTO(ctx
->m
.avctx
, ctx
->vlc_codes
,
137 max_level
, 4 * sizeof(*ctx
->vlc_codes
), fail
);
138 FF_ALLOCZ_ARRAY_OR_GOTO(ctx
->m
.avctx
, ctx
->vlc_bits
,
139 max_level
, 4 * sizeof(*ctx
->vlc_bits
), fail
);
140 FF_ALLOCZ_OR_GOTO(ctx
->m
.avctx
, ctx
->run_codes
,
142 FF_ALLOCZ_OR_GOTO(ctx
->m
.avctx
, ctx
->run_bits
,
145 ctx
->vlc_codes
+= max_level
* 2;
146 ctx
->vlc_bits
+= max_level
* 2;
147 for (level
= -max_level
; level
< max_level
; level
++) {
148 for (run
= 0; run
< 2; run
++) {
149 int index
= (level
<< 1) | run
;
150 int sign
, offset
= 0, alevel
= level
;
152 MASK_ABS(sign
, alevel
);
154 offset
= (alevel
- 1) >> 6;
155 alevel
-= offset
<< 6;
157 for (j
= 0; j
< 257; j
++) {
158 if (ctx
->cid_table
->ac_level
[j
] >> 1 == alevel
&&
159 (!offset
|| (ctx
->cid_table
->ac_flags
[j
] & 1) && offset
) &&
160 (!run
|| (ctx
->cid_table
->ac_flags
[j
] & 2) && run
)) {
161 av_assert1(!ctx
->vlc_codes
[index
]);
163 ctx
->vlc_codes
[index
] =
164 (ctx
->cid_table
->ac_codes
[j
] << 1) | (sign
& 1);
165 ctx
->vlc_bits
[index
] = ctx
->cid_table
->ac_bits
[j
] + 1;
167 ctx
->vlc_codes
[index
] = ctx
->cid_table
->ac_codes
[j
];
168 ctx
->vlc_bits
[index
] = ctx
->cid_table
->ac_bits
[j
];
173 av_assert0(!alevel
|| j
< 257);
175 ctx
->vlc_codes
[index
] =
176 (ctx
->vlc_codes
[index
] << ctx
->cid_table
->index_bits
) | offset
;
177 ctx
->vlc_bits
[index
] += ctx
->cid_table
->index_bits
;
181 for (i
= 0; i
< 62; i
++) {
182 int run
= ctx
->cid_table
->run
[i
];
183 av_assert0(run
< 63);
184 ctx
->run_codes
[run
] = ctx
->cid_table
->run_codes
[i
];
185 ctx
->run_bits
[run
] = ctx
->cid_table
->run_bits
[i
];
189 return AVERROR(ENOMEM
);
192 static av_cold
int dnxhd_init_qmat(DNXHDEncContext
*ctx
, int lbias
, int cbias
)
194 // init first elem to 1 to avoid div by 0 in convert_matrix
195 uint16_t weight_matrix
[64] = { 1, }; // convert_matrix needs uint16_t*
197 const uint8_t *luma_weight_table
= ctx
->cid_table
->luma_weight
;
198 const uint8_t *chroma_weight_table
= ctx
->cid_table
->chroma_weight
;
200 FF_ALLOCZ_ARRAY_OR_GOTO(ctx
->m
.avctx
, ctx
->qmatrix_l
,
201 (ctx
->m
.avctx
->qmax
+ 1), 64 * sizeof(int), fail
);
202 FF_ALLOCZ_ARRAY_OR_GOTO(ctx
->m
.avctx
, ctx
->qmatrix_c
,
203 (ctx
->m
.avctx
->qmax
+ 1), 64 * sizeof(int), fail
);
204 FF_ALLOCZ_ARRAY_OR_GOTO(ctx
->m
.avctx
, ctx
->qmatrix_l16
,
205 (ctx
->m
.avctx
->qmax
+ 1), 64 * 2 * sizeof(uint16_t),
207 FF_ALLOCZ_ARRAY_OR_GOTO(ctx
->m
.avctx
, ctx
->qmatrix_c16
,
208 (ctx
->m
.avctx
->qmax
+ 1), 64 * 2 * sizeof(uint16_t),
211 if (ctx
->cid_table
->bit_depth
== 8) {
212 for (i
= 1; i
< 64; i
++) {
213 int j
= ctx
->m
.idsp
.idct_permutation
[ff_zigzag_direct
[i
]];
214 weight_matrix
[j
] = ctx
->cid_table
->luma_weight
[i
];
216 ff_convert_matrix(&ctx
->m
, ctx
->qmatrix_l
, ctx
->qmatrix_l16
,
217 weight_matrix
, ctx
->m
.intra_quant_bias
, 1,
218 ctx
->m
.avctx
->qmax
, 1);
219 for (i
= 1; i
< 64; i
++) {
220 int j
= ctx
->m
.idsp
.idct_permutation
[ff_zigzag_direct
[i
]];
221 weight_matrix
[j
] = ctx
->cid_table
->chroma_weight
[i
];
223 ff_convert_matrix(&ctx
->m
, ctx
->qmatrix_c
, ctx
->qmatrix_c16
,
224 weight_matrix
, ctx
->m
.intra_quant_bias
, 1,
225 ctx
->m
.avctx
->qmax
, 1);
227 for (qscale
= 1; qscale
<= ctx
->m
.avctx
->qmax
; qscale
++) {
228 for (i
= 0; i
< 64; i
++) {
229 ctx
->qmatrix_l
[qscale
][i
] <<= 2;
230 ctx
->qmatrix_c
[qscale
][i
] <<= 2;
231 ctx
->qmatrix_l16
[qscale
][0][i
] <<= 2;
232 ctx
->qmatrix_l16
[qscale
][1][i
] <<= 2;
233 ctx
->qmatrix_c16
[qscale
][0][i
] <<= 2;
234 ctx
->qmatrix_c16
[qscale
][1][i
] <<= 2;
239 for (qscale
= 1; qscale
<= ctx
->m
.avctx
->qmax
; qscale
++) {
240 for (i
= 1; i
< 64; i
++) {
241 int j
= ctx
->m
.idsp
.idct_permutation
[ff_zigzag_direct
[i
]];
243 /* The quantization formula from the VC-3 standard is:
244 * quantized = sign(block[i]) * floor(abs(block[i]/s) * p /
245 * (qscale * weight_table[i]))
246 * Where p is 32 for 8-bit samples and 8 for 10-bit ones.
247 * The s factor compensates scaling of DCT coefficients done by
248 * the DCT routines, and therefore is not present in standard.
249 * It's 8 for 8-bit samples and 4 for 10-bit ones.
250 * We want values of ctx->qtmatrix_l and ctx->qtmatrix_r to be:
251 * ((1 << DNX10BIT_QMAT_SHIFT) * (p / s)) /
252 * (qscale * weight_table[i])
253 * For 10-bit samples, p / s == 2 */
254 ctx
->qmatrix_l
[qscale
][j
] = (1 << (DNX10BIT_QMAT_SHIFT
+ 1)) /
255 (qscale
* luma_weight_table
[i
]);
256 ctx
->qmatrix_c
[qscale
][j
] = (1 << (DNX10BIT_QMAT_SHIFT
+ 1)) /
257 (qscale
* chroma_weight_table
[i
]);
262 ctx
->m
.q_chroma_intra_matrix16
= ctx
->qmatrix_c16
;
263 ctx
->m
.q_chroma_intra_matrix
= ctx
->qmatrix_c
;
264 ctx
->m
.q_intra_matrix16
= ctx
->qmatrix_l16
;
265 ctx
->m
.q_intra_matrix
= ctx
->qmatrix_l
;
269 return AVERROR(ENOMEM
);
272 static av_cold
int dnxhd_init_rc(DNXHDEncContext
*ctx
)
274 FF_ALLOCZ_ARRAY_OR_GOTO(ctx
->m
.avctx
, ctx
->mb_rc
, (ctx
->m
.avctx
->qmax
+ 1), 8160 * sizeof(RCEntry
), fail
);
275 if (ctx
->m
.avctx
->mb_decision
!= FF_MB_DECISION_RD
)
276 FF_ALLOCZ_ARRAY_OR_GOTO(ctx
->m
.avctx
, ctx
->mb_cmp
,
277 ctx
->m
.mb_num
, sizeof(RCCMPEntry
), fail
);
279 ctx
->frame_bits
= (ctx
->cid_table
->coding_unit_size
-
280 640 - 4 - ctx
->min_padding
) * 8;
282 ctx
->lambda
= 2 << LAMBDA_FRAC_BITS
; // qscale 2
285 return AVERROR(ENOMEM
);
288 static av_cold
int dnxhd_encode_init(AVCodecContext
*avctx
)
290 DNXHDEncContext
*ctx
= avctx
->priv_data
;
291 int i
, index
, bit_depth
, ret
;
293 switch (avctx
->pix_fmt
) {
294 case AV_PIX_FMT_YUV422P
:
297 case AV_PIX_FMT_YUV422P10
:
301 av_log(avctx
, AV_LOG_ERROR
,
302 "pixel format is incompatible with DNxHD\n");
303 return AVERROR(EINVAL
);
306 ctx
->cid
= ff_dnxhd_find_cid(avctx
, bit_depth
);
308 av_log(avctx
, AV_LOG_ERROR
,
309 "video parameters incompatible with DNxHD. Valid DNxHD profiles:\n");
310 ff_dnxhd_print_profiles(avctx
, AV_LOG_ERROR
);
311 return AVERROR(EINVAL
);
313 av_log(avctx
, AV_LOG_DEBUG
, "cid %d\n", ctx
->cid
);
315 index
= ff_dnxhd_get_cid_table(ctx
->cid
);
316 av_assert0(index
>= 0);
317 ctx
->cid_table
= &ff_dnxhd_cid_table
[index
];
319 ctx
->m
.avctx
= avctx
;
323 avctx
->bits_per_raw_sample
= ctx
->cid_table
->bit_depth
;
325 ff_blockdsp_init(&ctx
->bdsp
, avctx
);
326 ff_fdctdsp_init(&ctx
->m
.fdsp
, avctx
);
327 ff_mpv_idct_init(&ctx
->m
);
328 ff_mpegvideoencdsp_init(&ctx
->m
.mpvencdsp
, avctx
);
329 ff_pixblockdsp_init(&ctx
->m
.pdsp
, avctx
);
330 ff_dct_encode_init(&ctx
->m
);
332 if (!ctx
->m
.dct_quantize
)
333 ctx
->m
.dct_quantize
= ff_dct_quantize_c
;
335 if (ctx
->cid_table
->bit_depth
== 10) {
336 ctx
->m
.dct_quantize
= dnxhd_10bit_dct_quantize
;
337 ctx
->get_pixels_8x4_sym
= dnxhd_10bit_get_pixels_8x4_sym
;
338 ctx
->block_width_l2
= 4;
340 ctx
->get_pixels_8x4_sym
= dnxhd_8bit_get_pixels_8x4_sym
;
341 ctx
->block_width_l2
= 3;
345 ff_dnxhdenc_init_x86(ctx
);
347 ctx
->m
.mb_height
= (avctx
->height
+ 15) / 16;
348 ctx
->m
.mb_width
= (avctx
->width
+ 15) / 16;
350 if (avctx
->flags
& CODEC_FLAG_INTERLACED_DCT
) {
352 ctx
->m
.mb_height
/= 2;
355 ctx
->m
.mb_num
= ctx
->m
.mb_height
* ctx
->m
.mb_width
;
357 if (avctx
->intra_quant_bias
!= FF_DEFAULT_QUANT_BIAS
)
358 ctx
->m
.intra_quant_bias
= avctx
->intra_quant_bias
;
359 // XXX tune lbias/cbias
360 if ((ret
= dnxhd_init_qmat(ctx
, ctx
->m
.intra_quant_bias
, 0)) < 0)
363 /* Avid Nitris hardware decoder requires a minimum amount of padding
364 * in the coding unit payload */
365 if (ctx
->nitris_compat
)
366 ctx
->min_padding
= 1600;
368 if ((ret
= dnxhd_init_vlc(ctx
)) < 0)
370 if ((ret
= dnxhd_init_rc(ctx
)) < 0)
373 FF_ALLOCZ_OR_GOTO(ctx
->m
.avctx
, ctx
->slice_size
,
374 ctx
->m
.mb_height
* sizeof(uint32_t), fail
);
375 FF_ALLOCZ_OR_GOTO(ctx
->m
.avctx
, ctx
->slice_offs
,
376 ctx
->m
.mb_height
* sizeof(uint32_t), fail
);
377 FF_ALLOCZ_OR_GOTO(ctx
->m
.avctx
, ctx
->mb_bits
,
378 ctx
->m
.mb_num
* sizeof(uint16_t), fail
);
379 FF_ALLOCZ_OR_GOTO(ctx
->m
.avctx
, ctx
->mb_qscale
,
380 ctx
->m
.mb_num
* sizeof(uint8_t), fail
);
382 avctx
->coded_frame
= av_frame_alloc();
383 if (!avctx
->coded_frame
)
384 return AVERROR(ENOMEM
);
386 avctx
->coded_frame
->key_frame
= 1;
387 avctx
->coded_frame
->pict_type
= AV_PICTURE_TYPE_I
;
389 if (avctx
->thread_count
> MAX_THREADS
) {
390 av_log(avctx
, AV_LOG_ERROR
, "too many threads\n");
391 return AVERROR(EINVAL
);
394 if (avctx
->qmax
<= 1) {
395 av_log(avctx
, AV_LOG_ERROR
, "qmax must be at least 2\n");
396 return AVERROR(EINVAL
);
399 ctx
->thread
[0] = ctx
;
400 for (i
= 1; i
< avctx
->thread_count
; i
++) {
401 ctx
->thread
[i
] = av_malloc(sizeof(DNXHDEncContext
));
402 memcpy(ctx
->thread
[i
], ctx
, sizeof(DNXHDEncContext
));
406 fail
: // for FF_ALLOCZ_OR_GOTO
407 return AVERROR(ENOMEM
);
410 static int dnxhd_write_header(AVCodecContext
*avctx
, uint8_t *buf
)
412 DNXHDEncContext
*ctx
= avctx
->priv_data
;
413 static const uint8_t header_prefix
[5] = { 0x00, 0x00, 0x02, 0x80, 0x01 };
417 memcpy(buf
, header_prefix
, 5);
418 buf
[5] = ctx
->interlaced
? ctx
->cur_field
+ 2 : 0x01;
419 buf
[6] = 0x80; // crc flag off
420 buf
[7] = 0xa0; // reserved
421 AV_WB16(buf
+ 0x18, avctx
->height
>> ctx
->interlaced
); // ALPF
422 AV_WB16(buf
+ 0x1a, avctx
->width
); // SPL
423 AV_WB16(buf
+ 0x1d, avctx
->height
>> ctx
->interlaced
); // NAL
425 buf
[0x21] = ctx
->cid_table
->bit_depth
== 10 ? 0x58 : 0x38;
426 buf
[0x22] = 0x88 + (ctx
->interlaced
<< 2);
427 AV_WB32(buf
+ 0x28, ctx
->cid
); // CID
428 buf
[0x2c] = ctx
->interlaced
? 0 : 0x80;
430 buf
[0x5f] = 0x01; // UDL
432 buf
[0x167] = 0x02; // reserved
433 AV_WB16(buf
+ 0x16a, ctx
->m
.mb_height
* 4 + 4); // MSIPS
434 buf
[0x16d] = ctx
->m
.mb_height
; // Ns
435 buf
[0x16f] = 0x10; // reserved
437 ctx
->msip
= buf
+ 0x170;
441 static av_always_inline
void dnxhd_encode_dc(DNXHDEncContext
*ctx
, int diff
)
445 nbits
= av_log2_16bit(-2 * diff
);
448 nbits
= av_log2_16bit(2 * diff
);
450 put_bits(&ctx
->m
.pb
, ctx
->cid_table
->dc_bits
[nbits
] + nbits
,
451 (ctx
->cid_table
->dc_codes
[nbits
] << nbits
) +
452 (diff
& ((1 << nbits
) - 1)));
455 static av_always_inline
456 void dnxhd_encode_block(DNXHDEncContext
*ctx
, int16_t *block
,
457 int last_index
, int n
)
459 int last_non_zero
= 0;
462 dnxhd_encode_dc(ctx
, block
[0] - ctx
->m
.last_dc
[n
]);
463 ctx
->m
.last_dc
[n
] = block
[0];
465 for (i
= 1; i
<= last_index
; i
++) {
466 j
= ctx
->m
.intra_scantable
.permutated
[i
];
469 int run_level
= i
- last_non_zero
- 1;
470 int rlevel
= (slevel
<< 1) | !!run_level
;
471 put_bits(&ctx
->m
.pb
, ctx
->vlc_bits
[rlevel
], ctx
->vlc_codes
[rlevel
]);
473 put_bits(&ctx
->m
.pb
, ctx
->run_bits
[run_level
],
474 ctx
->run_codes
[run_level
]);
478 put_bits(&ctx
->m
.pb
, ctx
->vlc_bits
[0], ctx
->vlc_codes
[0]); // EOB
481 static av_always_inline
482 void dnxhd_unquantize_c(DNXHDEncContext
*ctx
, int16_t *block
, int n
,
483 int qscale
, int last_index
)
485 const uint8_t *weight_matrix
;
489 weight_matrix
= (n
& 2) ? ctx
->cid_table
->chroma_weight
490 : ctx
->cid_table
->luma_weight
;
492 for (i
= 1; i
<= last_index
; i
++) {
493 int j
= ctx
->m
.intra_scantable
.permutated
[i
];
497 level
= (1 - 2 * level
) * qscale
* weight_matrix
[i
];
498 if (ctx
->cid_table
->bit_depth
== 10) {
499 if (weight_matrix
[i
] != 8)
503 if (weight_matrix
[i
] != 32)
509 level
= (2 * level
+ 1) * qscale
* weight_matrix
[i
];
510 if (ctx
->cid_table
->bit_depth
== 10) {
511 if (weight_matrix
[i
] != 8)
515 if (weight_matrix
[i
] != 32)
525 static av_always_inline
int dnxhd_ssd_block(int16_t *qblock
, int16_t *block
)
529 for (i
= 0; i
< 64; i
++)
530 score
+= (block
[i
] - qblock
[i
]) * (block
[i
] - qblock
[i
]);
534 static av_always_inline
535 int dnxhd_calc_ac_bits(DNXHDEncContext
*ctx
, int16_t *block
, int last_index
)
537 int last_non_zero
= 0;
540 for (i
= 1; i
<= last_index
; i
++) {
541 j
= ctx
->m
.intra_scantable
.permutated
[i
];
544 int run_level
= i
- last_non_zero
- 1;
545 bits
+= ctx
->vlc_bits
[(level
<< 1) |
546 !!run_level
] + ctx
->run_bits
[run_level
];
553 static av_always_inline
554 void dnxhd_get_blocks(DNXHDEncContext
*ctx
, int mb_x
, int mb_y
)
556 const int bs
= ctx
->block_width_l2
;
557 const int bw
= 1 << bs
;
558 const uint8_t *ptr_y
= ctx
->thread
[0]->src
[0] +
559 ((mb_y
<< 4) * ctx
->m
.linesize
) + (mb_x
<< bs
+ 1);
560 const uint8_t *ptr_u
= ctx
->thread
[0]->src
[1] +
561 ((mb_y
<< 4) * ctx
->m
.uvlinesize
) + (mb_x
<< bs
);
562 const uint8_t *ptr_v
= ctx
->thread
[0]->src
[2] +
563 ((mb_y
<< 4) * ctx
->m
.uvlinesize
) + (mb_x
<< bs
);
564 PixblockDSPContext
*pdsp
= &ctx
->m
.pdsp
;
566 pdsp
->get_pixels(ctx
->blocks
[0], ptr_y
, ctx
->m
.linesize
);
567 pdsp
->get_pixels(ctx
->blocks
[1], ptr_y
+ bw
, ctx
->m
.linesize
);
568 pdsp
->get_pixels(ctx
->blocks
[2], ptr_u
, ctx
->m
.uvlinesize
);
569 pdsp
->get_pixels(ctx
->blocks
[3], ptr_v
, ctx
->m
.uvlinesize
);
571 if (mb_y
+ 1 == ctx
->m
.mb_height
&& ctx
->m
.avctx
->height
== 1080) {
572 if (ctx
->interlaced
) {
573 ctx
->get_pixels_8x4_sym(ctx
->blocks
[4],
574 ptr_y
+ ctx
->dct_y_offset
,
576 ctx
->get_pixels_8x4_sym(ctx
->blocks
[5],
577 ptr_y
+ ctx
->dct_y_offset
+ bw
,
579 ctx
->get_pixels_8x4_sym(ctx
->blocks
[6],
580 ptr_u
+ ctx
->dct_uv_offset
,
582 ctx
->get_pixels_8x4_sym(ctx
->blocks
[7],
583 ptr_v
+ ctx
->dct_uv_offset
,
586 ctx
->bdsp
.clear_block(ctx
->blocks
[4]);
587 ctx
->bdsp
.clear_block(ctx
->blocks
[5]);
588 ctx
->bdsp
.clear_block(ctx
->blocks
[6]);
589 ctx
->bdsp
.clear_block(ctx
->blocks
[7]);
592 pdsp
->get_pixels(ctx
->blocks
[4],
593 ptr_y
+ ctx
->dct_y_offset
, ctx
->m
.linesize
);
594 pdsp
->get_pixels(ctx
->blocks
[5],
595 ptr_y
+ ctx
->dct_y_offset
+ bw
, ctx
->m
.linesize
);
596 pdsp
->get_pixels(ctx
->blocks
[6],
597 ptr_u
+ ctx
->dct_uv_offset
, ctx
->m
.uvlinesize
);
598 pdsp
->get_pixels(ctx
->blocks
[7],
599 ptr_v
+ ctx
->dct_uv_offset
, ctx
->m
.uvlinesize
);
603 static av_always_inline
604 int dnxhd_switch_matrix(DNXHDEncContext
*ctx
, int i
)
606 const static uint8_t component
[8]={0,0,1,2,0,0,1,2};
610 static int dnxhd_calc_bits_thread(AVCodecContext
*avctx
, void *arg
,
611 int jobnr
, int threadnr
)
613 DNXHDEncContext
*ctx
= avctx
->priv_data
;
614 int mb_y
= jobnr
, mb_x
;
615 int qscale
= ctx
->qscale
;
616 LOCAL_ALIGNED_16(int16_t, block
, [64]);
617 ctx
= ctx
->thread
[threadnr
];
621 ctx
->m
.last_dc
[2] = 1 << (ctx
->cid_table
->bit_depth
+ 2);
623 for (mb_x
= 0; mb_x
< ctx
->m
.mb_width
; mb_x
++) {
624 unsigned mb
= mb_y
* ctx
->m
.mb_width
+ mb_x
;
630 dnxhd_get_blocks(ctx
, mb_x
, mb_y
);
632 for (i
= 0; i
< 8; i
++) {
633 int16_t *src_block
= ctx
->blocks
[i
];
634 int overflow
, nbits
, diff
, last_index
;
635 int n
= dnxhd_switch_matrix(ctx
, i
);
637 memcpy(block
, src_block
, 64 * sizeof(*block
));
638 last_index
= ctx
->m
.dct_quantize(&ctx
->m
, block
, 4 & (2*i
),
640 ac_bits
+= dnxhd_calc_ac_bits(ctx
, block
, last_index
);
642 diff
= block
[0] - ctx
->m
.last_dc
[n
];
644 nbits
= av_log2_16bit(-2 * diff
);
646 nbits
= av_log2_16bit(2 * diff
);
648 av_assert1(nbits
< ctx
->cid_table
->bit_depth
+ 4);
649 dc_bits
+= ctx
->cid_table
->dc_bits
[nbits
] + nbits
;
651 ctx
->m
.last_dc
[n
] = block
[0];
653 if (avctx
->mb_decision
== FF_MB_DECISION_RD
|| !RC_VARIANCE
) {
654 dnxhd_unquantize_c(ctx
, block
, i
, qscale
, last_index
);
655 ctx
->m
.idsp
.idct(block
);
656 ssd
+= dnxhd_ssd_block(block
, src_block
);
659 ctx
->mb_rc
[qscale
][mb
].ssd
= ssd
;
660 ctx
->mb_rc
[qscale
][mb
].bits
= ac_bits
+ dc_bits
+ 12 +
661 8 * ctx
->vlc_bits
[0];
666 static int dnxhd_encode_thread(AVCodecContext
*avctx
, void *arg
,
667 int jobnr
, int threadnr
)
669 DNXHDEncContext
*ctx
= avctx
->priv_data
;
670 int mb_y
= jobnr
, mb_x
;
671 ctx
= ctx
->thread
[threadnr
];
672 init_put_bits(&ctx
->m
.pb
, (uint8_t *)arg
+ 640 + ctx
->slice_offs
[jobnr
],
673 ctx
->slice_size
[jobnr
]);
677 ctx
->m
.last_dc
[2] = 1 << (ctx
->cid_table
->bit_depth
+ 2);
678 for (mb_x
= 0; mb_x
< ctx
->m
.mb_width
; mb_x
++) {
679 unsigned mb
= mb_y
* ctx
->m
.mb_width
+ mb_x
;
680 int qscale
= ctx
->mb_qscale
[mb
];
683 put_bits(&ctx
->m
.pb
, 12, qscale
<< 1);
685 dnxhd_get_blocks(ctx
, mb_x
, mb_y
);
687 for (i
= 0; i
< 8; i
++) {
688 int16_t *block
= ctx
->blocks
[i
];
689 int overflow
, n
= dnxhd_switch_matrix(ctx
, i
);
690 int last_index
= ctx
->m
.dct_quantize(&ctx
->m
, block
, 4 & (2*i
),
693 dnxhd_encode_block(ctx
, block
, last_index
, n
);
694 // STOP_TIMER("encode_block");
697 if (put_bits_count(&ctx
->m
.pb
) & 31)
698 put_bits(&ctx
->m
.pb
, 32 - (put_bits_count(&ctx
->m
.pb
) & 31), 0);
699 flush_put_bits(&ctx
->m
.pb
);
703 static void dnxhd_setup_threads_slices(DNXHDEncContext
*ctx
)
707 for (mb_y
= 0; mb_y
< ctx
->m
.mb_height
; mb_y
++) {
709 ctx
->slice_offs
[mb_y
] = offset
;
710 ctx
->slice_size
[mb_y
] = 0;
711 for (mb_x
= 0; mb_x
< ctx
->m
.mb_width
; mb_x
++) {
712 unsigned mb
= mb_y
* ctx
->m
.mb_width
+ mb_x
;
713 ctx
->slice_size
[mb_y
] += ctx
->mb_bits
[mb
];
715 ctx
->slice_size
[mb_y
] = (ctx
->slice_size
[mb_y
] + 31) & ~31;
716 ctx
->slice_size
[mb_y
] >>= 3;
717 thread_size
= ctx
->slice_size
[mb_y
];
718 offset
+= thread_size
;
722 static int dnxhd_mb_var_thread(AVCodecContext
*avctx
, void *arg
,
723 int jobnr
, int threadnr
)
725 DNXHDEncContext
*ctx
= avctx
->priv_data
;
726 int mb_y
= jobnr
, mb_x
, x
, y
;
727 int partial_last_row
= (mb_y
== ctx
->m
.mb_height
- 1) &&
728 ((avctx
->height
>> ctx
->interlaced
) & 0xF);
730 ctx
= ctx
->thread
[threadnr
];
731 if (ctx
->cid_table
->bit_depth
== 8) {
732 uint8_t *pix
= ctx
->thread
[0]->src
[0] + ((mb_y
<< 4) * ctx
->m
.linesize
);
733 for (mb_x
= 0; mb_x
< ctx
->m
.mb_width
; ++mb_x
, pix
+= 16) {
734 unsigned mb
= mb_y
* ctx
->m
.mb_width
+ mb_x
;
738 if (!partial_last_row
&& mb_x
* 16 <= avctx
->width
- 16) {
739 sum
= ctx
->m
.mpvencdsp
.pix_sum(pix
, ctx
->m
.linesize
);
740 varc
= ctx
->m
.mpvencdsp
.pix_norm1(pix
, ctx
->m
.linesize
);
742 int bw
= FFMIN(avctx
->width
- 16 * mb_x
, 16);
743 int bh
= FFMIN((avctx
->height
>> ctx
->interlaced
) - 16 * mb_y
, 16);
745 for (y
= 0; y
< bh
; y
++) {
746 for (x
= 0; x
< bw
; x
++) {
747 uint8_t val
= pix
[x
+ y
* ctx
->m
.linesize
];
753 varc
= (varc
- (((unsigned) sum
* sum
) >> 8) + 128) >> 8;
755 ctx
->mb_cmp
[mb
].value
= varc
;
756 ctx
->mb_cmp
[mb
].mb
= mb
;
759 int const linesize
= ctx
->m
.linesize
>> 1;
760 for (mb_x
= 0; mb_x
< ctx
->m
.mb_width
; ++mb_x
) {
761 uint16_t *pix
= (uint16_t *)ctx
->thread
[0]->src
[0] +
762 ((mb_y
<< 4) * linesize
) + (mb_x
<< 4);
763 unsigned mb
= mb_y
* ctx
->m
.mb_width
+ mb_x
;
768 // Macroblocks are 16x16 pixels, unlike DCT blocks which are 8x8.
769 for (i
= 0; i
< 16; ++i
) {
770 for (j
= 0; j
< 16; ++j
) {
771 // Turn 16-bit pixels into 10-bit ones.
772 int const sample
= (unsigned) pix
[j
] >> 6;
774 sqsum
+= sample
* sample
;
775 // 2^10 * 2^10 * 16 * 16 = 2^28, which is less than INT_MAX
779 mean
= sum
>> 8; // 16*16 == 2^8
781 ctx
->mb_cmp
[mb
].value
= sqmean
- mean
* mean
;
782 ctx
->mb_cmp
[mb
].mb
= mb
;
788 static int dnxhd_encode_rdo(AVCodecContext
*avctx
, DNXHDEncContext
*ctx
)
790 int lambda
, up_step
, down_step
;
791 int last_lower
= INT_MAX
, last_higher
= 0;
794 for (q
= 1; q
< avctx
->qmax
; q
++) {
796 avctx
->execute2(avctx
, dnxhd_calc_bits_thread
,
797 NULL
, NULL
, ctx
->m
.mb_height
);
799 up_step
= down_step
= 2 << LAMBDA_FRAC_BITS
;
800 lambda
= ctx
->lambda
;
805 if (lambda
== last_higher
) {
807 end
= 1; // need to set final qscales/bits
809 for (y
= 0; y
< ctx
->m
.mb_height
; y
++) {
810 for (x
= 0; x
< ctx
->m
.mb_width
; x
++) {
811 unsigned min
= UINT_MAX
;
813 int mb
= y
* ctx
->m
.mb_width
+ x
;
814 for (q
= 1; q
< avctx
->qmax
; q
++) {
815 unsigned score
= ctx
->mb_rc
[q
][mb
].bits
* lambda
+
816 ((unsigned) ctx
->mb_rc
[q
][mb
].ssd
<< LAMBDA_FRAC_BITS
);
822 bits
+= ctx
->mb_rc
[qscale
][mb
].bits
;
823 ctx
->mb_qscale
[mb
] = qscale
;
824 ctx
->mb_bits
[mb
] = ctx
->mb_rc
[qscale
][mb
].bits
;
826 bits
= (bits
+ 31) & ~31; // padding
827 if (bits
> ctx
->frame_bits
)
830 // av_dlog(ctx->m.avctx,
831 // "lambda %d, up %u, down %u, bits %d, frame %d\n",
832 // lambda, last_higher, last_lower, bits, ctx->frame_bits);
834 if (bits
> ctx
->frame_bits
)
835 return AVERROR(EINVAL
);
838 if (bits
< ctx
->frame_bits
) {
839 last_lower
= FFMIN(lambda
, last_lower
);
840 if (last_higher
!= 0)
841 lambda
= (lambda
+last_higher
)>>1;
844 down_step
= FFMIN((int64_t)down_step
*5, INT_MAX
);
845 up_step
= 1<<LAMBDA_FRAC_BITS
;
846 lambda
= FFMAX(1, lambda
);
847 if (lambda
== last_lower
)
850 last_higher
= FFMAX(lambda
, last_higher
);
851 if (last_lower
!= INT_MAX
)
852 lambda
= (lambda
+last_lower
)>>1;
853 else if ((int64_t)lambda
+ up_step
> INT_MAX
)
854 return AVERROR(EINVAL
);
857 up_step
= FFMIN((int64_t)up_step
*5, INT_MAX
);
858 down_step
= 1<<LAMBDA_FRAC_BITS
;
861 //av_dlog(ctx->m.avctx, "out lambda %d\n", lambda);
862 ctx
->lambda
= lambda
;
866 static int dnxhd_find_qscale(DNXHDEncContext
*ctx
)
872 int last_lower
= INT_MAX
;
876 qscale
= ctx
->qscale
;
879 ctx
->qscale
= qscale
;
880 // XXX avoid recalculating bits
881 ctx
->m
.avctx
->execute2(ctx
->m
.avctx
, dnxhd_calc_bits_thread
,
882 NULL
, NULL
, ctx
->m
.mb_height
);
883 for (y
= 0; y
< ctx
->m
.mb_height
; y
++) {
884 for (x
= 0; x
< ctx
->m
.mb_width
; x
++)
885 bits
+= ctx
->mb_rc
[qscale
][y
*ctx
->m
.mb_width
+x
].bits
;
886 bits
= (bits
+31)&~31; // padding
887 if (bits
> ctx
->frame_bits
)
890 // av_dlog(ctx->m.avctx,
891 // "%d, qscale %d, bits %d, frame %d, higher %d, lower %d\n",
892 // ctx->m.avctx->frame_number, qscale, bits, ctx->frame_bits,
893 // last_higher, last_lower);
894 if (bits
< ctx
->frame_bits
) {
897 if (last_higher
== qscale
- 1) {
898 qscale
= last_higher
;
901 last_lower
= FFMIN(qscale
, last_lower
);
902 if (last_higher
!= 0)
903 qscale
= (qscale
+ last_higher
) >> 1;
905 qscale
-= down_step
++;
910 if (last_lower
== qscale
+ 1)
912 last_higher
= FFMAX(qscale
, last_higher
);
913 if (last_lower
!= INT_MAX
)
914 qscale
= (qscale
+ last_lower
) >> 1;
918 if (qscale
>= ctx
->m
.avctx
->qmax
)
919 return AVERROR(EINVAL
);
922 //av_dlog(ctx->m.avctx, "out qscale %d\n", qscale);
923 ctx
->qscale
= qscale
;
927 #define BUCKET_BITS 8
928 #define RADIX_PASSES 4
929 #define NBUCKETS (1 << BUCKET_BITS)
931 static inline int get_bucket(int value
, int shift
)
934 value
&= NBUCKETS
- 1;
935 return NBUCKETS
- 1 - value
;
938 static void radix_count(const RCCMPEntry
*data
, int size
,
939 int buckets
[RADIX_PASSES
][NBUCKETS
])
942 memset(buckets
, 0, sizeof(buckets
[0][0]) * RADIX_PASSES
* NBUCKETS
);
943 for (i
= 0; i
< size
; i
++) {
944 int v
= data
[i
].value
;
945 for (j
= 0; j
< RADIX_PASSES
; j
++) {
946 buckets
[j
][get_bucket(v
, 0)]++;
951 for (j
= 0; j
< RADIX_PASSES
; j
++) {
953 for (i
= NBUCKETS
- 1; i
>= 0; i
--)
954 buckets
[j
][i
] = offset
-= buckets
[j
][i
];
955 av_assert1(!buckets
[j
][0]);
959 static void radix_sort_pass(RCCMPEntry
*dst
, const RCCMPEntry
*data
,
960 int size
, int buckets
[NBUCKETS
], int pass
)
962 int shift
= pass
* BUCKET_BITS
;
964 for (i
= 0; i
< size
; i
++) {
965 int v
= get_bucket(data
[i
].value
, shift
);
966 int pos
= buckets
[v
]++;
971 static void radix_sort(RCCMPEntry
*data
, int size
)
973 int buckets
[RADIX_PASSES
][NBUCKETS
];
974 RCCMPEntry
*tmp
= av_malloc_array(size
, sizeof(*tmp
));
975 radix_count(data
, size
, buckets
);
976 radix_sort_pass(tmp
, data
, size
, buckets
[0], 0);
977 radix_sort_pass(data
, tmp
, size
, buckets
[1], 1);
978 if (buckets
[2][NBUCKETS
- 1] || buckets
[3][NBUCKETS
- 1]) {
979 radix_sort_pass(tmp
, data
, size
, buckets
[2], 2);
980 radix_sort_pass(data
, tmp
, size
, buckets
[3], 3);
985 static int dnxhd_encode_fast(AVCodecContext
*avctx
, DNXHDEncContext
*ctx
)
989 if ((ret
= dnxhd_find_qscale(ctx
)) < 0)
991 for (y
= 0; y
< ctx
->m
.mb_height
; y
++) {
992 for (x
= 0; x
< ctx
->m
.mb_width
; x
++) {
993 int mb
= y
* ctx
->m
.mb_width
+ x
;
995 ctx
->mb_qscale
[mb
] = ctx
->qscale
;
996 ctx
->mb_bits
[mb
] = ctx
->mb_rc
[ctx
->qscale
][mb
].bits
;
997 max_bits
+= ctx
->mb_rc
[ctx
->qscale
][mb
].bits
;
999 delta_bits
= ctx
->mb_rc
[ctx
->qscale
][mb
].bits
-
1000 ctx
->mb_rc
[ctx
->qscale
+ 1][mb
].bits
;
1001 ctx
->mb_cmp
[mb
].mb
= mb
;
1002 ctx
->mb_cmp
[mb
].value
=
1003 delta_bits
? ((ctx
->mb_rc
[ctx
->qscale
][mb
].ssd
-
1004 ctx
->mb_rc
[ctx
->qscale
+ 1][mb
].ssd
) * 100) /
1006 : INT_MIN
; // avoid increasing qscale
1009 max_bits
+= 31; // worst padding
1013 avctx
->execute2(avctx
, dnxhd_mb_var_thread
,
1014 NULL
, NULL
, ctx
->m
.mb_height
);
1015 radix_sort(ctx
->mb_cmp
, ctx
->m
.mb_num
);
1016 for (x
= 0; x
< ctx
->m
.mb_num
&& max_bits
> ctx
->frame_bits
; x
++) {
1017 int mb
= ctx
->mb_cmp
[x
].mb
;
1018 max_bits
-= ctx
->mb_rc
[ctx
->qscale
][mb
].bits
-
1019 ctx
->mb_rc
[ctx
->qscale
+ 1][mb
].bits
;
1020 ctx
->mb_qscale
[mb
] = ctx
->qscale
+ 1;
1021 ctx
->mb_bits
[mb
] = ctx
->mb_rc
[ctx
->qscale
+ 1][mb
].bits
;
1027 static void dnxhd_load_picture(DNXHDEncContext
*ctx
, const AVFrame
*frame
)
1031 for (i
= 0; i
< ctx
->m
.avctx
->thread_count
; i
++) {
1032 ctx
->thread
[i
]->m
.linesize
= frame
->linesize
[0] << ctx
->interlaced
;
1033 ctx
->thread
[i
]->m
.uvlinesize
= frame
->linesize
[1] << ctx
->interlaced
;
1034 ctx
->thread
[i
]->dct_y_offset
= ctx
->m
.linesize
*8;
1035 ctx
->thread
[i
]->dct_uv_offset
= ctx
->m
.uvlinesize
*8;
1038 ctx
->m
.avctx
->coded_frame
->interlaced_frame
= frame
->interlaced_frame
;
1039 ctx
->cur_field
= frame
->interlaced_frame
&& !frame
->top_field_first
;
1042 static int dnxhd_encode_picture(AVCodecContext
*avctx
, AVPacket
*pkt
,
1043 const AVFrame
*frame
, int *got_packet
)
1045 DNXHDEncContext
*ctx
= avctx
->priv_data
;
1046 int first_field
= 1;
1050 if ((ret
= ff_alloc_packet2(avctx
, pkt
, ctx
->cid_table
->frame_size
)) < 0)
1054 dnxhd_load_picture(ctx
, frame
);
1057 for (i
= 0; i
< 3; i
++) {
1058 ctx
->src
[i
] = frame
->data
[i
];
1059 if (ctx
->interlaced
&& ctx
->cur_field
)
1060 ctx
->src
[i
] += frame
->linesize
[i
];
1063 dnxhd_write_header(avctx
, buf
);
1065 if (avctx
->mb_decision
== FF_MB_DECISION_RD
)
1066 ret
= dnxhd_encode_rdo(avctx
, ctx
);
1068 ret
= dnxhd_encode_fast(avctx
, ctx
);
1070 av_log(avctx
, AV_LOG_ERROR
,
1071 "picture could not fit ratecontrol constraints, increase qmax\n");
1075 dnxhd_setup_threads_slices(ctx
);
1078 for (i
= 0; i
< ctx
->m
.mb_height
; i
++) {
1079 AV_WB32(ctx
->msip
+ i
* 4, offset
);
1080 offset
+= ctx
->slice_size
[i
];
1081 av_assert1(!(ctx
->slice_size
[i
] & 3));
1084 avctx
->execute2(avctx
, dnxhd_encode_thread
, buf
, NULL
, ctx
->m
.mb_height
);
1086 av_assert1(640 + offset
+ 4 <= ctx
->cid_table
->coding_unit_size
);
1087 memset(buf
+ 640 + offset
, 0,
1088 ctx
->cid_table
->coding_unit_size
- 4 - offset
- 640);
1090 AV_WB32(buf
+ ctx
->cid_table
->coding_unit_size
- 4, 0x600DC0DE); // EOF
1092 if (ctx
->interlaced
&& first_field
) {
1094 ctx
->cur_field
^= 1;
1095 buf
+= ctx
->cid_table
->coding_unit_size
;
1096 goto encode_coding_unit
;
1099 avctx
->coded_frame
->quality
= ctx
->qscale
* FF_QP2LAMBDA
;
1101 pkt
->flags
|= AV_PKT_FLAG_KEY
;
1106 static av_cold
int dnxhd_encode_end(AVCodecContext
*avctx
)
1108 DNXHDEncContext
*ctx
= avctx
->priv_data
;
1109 int max_level
= 1 << (ctx
->cid_table
->bit_depth
+ 2);
1112 av_free(ctx
->vlc_codes
- max_level
* 2);
1113 av_free(ctx
->vlc_bits
- max_level
* 2);
1114 av_freep(&ctx
->run_codes
);
1115 av_freep(&ctx
->run_bits
);
1117 av_freep(&ctx
->mb_bits
);
1118 av_freep(&ctx
->mb_qscale
);
1119 av_freep(&ctx
->mb_rc
);
1120 av_freep(&ctx
->mb_cmp
);
1121 av_freep(&ctx
->slice_size
);
1122 av_freep(&ctx
->slice_offs
);
1124 av_freep(&ctx
->qmatrix_c
);
1125 av_freep(&ctx
->qmatrix_l
);
1126 av_freep(&ctx
->qmatrix_c16
);
1127 av_freep(&ctx
->qmatrix_l16
);
1129 for (i
= 1; i
< avctx
->thread_count
; i
++)
1130 av_freep(&ctx
->thread
[i
]);
1132 av_frame_free(&avctx
->coded_frame
);
1137 static const AVCodecDefault dnxhd_defaults
[] = {
1138 { "qmax", "1024" }, /* Maximum quantization scale factor allowed for VC-3 */
1142 AVCodec ff_dnxhd_encoder
= {
1144 .long_name
= NULL_IF_CONFIG_SMALL("VC3/DNxHD"),
1145 .type
= AVMEDIA_TYPE_VIDEO
,
1146 .id
= AV_CODEC_ID_DNXHD
,
1147 .priv_data_size
= sizeof(DNXHDEncContext
),
1148 .init
= dnxhd_encode_init
,
1149 .encode2
= dnxhd_encode_picture
,
1150 .close
= dnxhd_encode_end
,
1151 .capabilities
= CODEC_CAP_SLICE_THREADS
,
1152 .pix_fmts
= (const enum AVPixelFormat
[]) {
1154 AV_PIX_FMT_YUV422P10
,
1157 .priv_class
= &dnxhd_class
,
1158 .defaults
= dnxhd_defaults
,