1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
4 * Authors: Chung Shin Yee <shinyee@multicorewareinc.com>
5 * Min Chen <chenm003@163.com>
6 * Steve Borho <steve@borho.org>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 * This program is also available under a commercial proprietary license.
23 * For more information, contact us at license @ x265.com.
24 *****************************************************************************/
28 #include "framedata.h"
29 #include "wavefront.h"
33 #include "frameencoder.h"
35 #include "slicetype.h"
// Forward declaration; the definition is not part of this listing.
// compressFrame() calls it for P and B slices whose PPS enables weighted prediction.
39 void weightAnalyse(Slice
& slice
, Frame
& frame
, x265_param
& param
);
// Constructor. Visible effects: marks the worker thread as active, clears the
// stop-all-rows flag, sets the VBV reset trigger row to -1, nulls the substream
// size buffer, and zero-fills the frame statistics and the rate control entry.
// NOTE(review): this listing omits some source lines (braces and possibly other
// initializers), so this comment covers only the statements shown.
41 FrameEncoder::FrameEncoder()
43 , m_threadActive(true)
47 m_bAllRowsStop
= false;
// -1 is also the value compressCTURows() resets this to before each frame
48 m_vbvResetTriggerRow
= -1;
50 m_substreamSizes
= NULL
;
59 memset(&m_frameStats
, 0, sizeof(m_frameStats
));
60 memset(&m_rce
, 0, sizeof(RateControlEntry
));
// Releases resources held by this frame encoder. Visible steps: flush the job
// provider, clear the thread-active flag, delete the output streams, free the
// CTU geometry map and the substream size buffer, destroy the frame filter, and
// delete the SEI timing objects when HRD SEI emission or interlace mode is set.
63 void FrameEncoder::destroy()
66 JobProvider::flush(); // ensure no worker threads are using this frame
// threadMain() keeps looping while this flag is true
68 m_threadActive
= false;
72 delete[] m_outStreams
;
74 X265_FREE(m_ctuGeomMap
);
75 X265_FREE(m_substreamSizes
);
78 m_frameFilter
.destroy();
// Same condition as the one guarding the matching allocations in init()
80 if (m_param
->bEmitHRDSEI
|| !!m_param
->interlaceMode
)
82 delete m_rce
.picTimingSEI
;
83 delete m_rce
.hrdTiming
;
// NOTE(review): the statement that performs this wait is not visible in this listing
86 // wait for worker thread to exit
// Binds this frame encoder to the top-level encoder and sizes its per-row state.
// Parameters:
//   top     - owning Encoder; its m_param is adopted as m_param
//   numRows - row count, also handed to the frame filter
//   numCols - column count, used to express the filter delay in CTUs
//   id      - stored as m_frameEncoderID
// The local ok flag starts as (m_numRows != 0) and is and-ed with the result of
// the SEI timing allocations.
// NOTE(review): the return statement and the assignments of m_numRows and
// m_numCols are not visible in this listing; presumably ok is returned.
90 bool FrameEncoder::init(Encoder
*top
, int numRows
, int numCols
, int id
)
93 m_param
= top
->m_param
;
// Filter lag in rows: 2 when SAO is enabled together with bSaoNonDeblocked,
// 1 when either SAO or the loop filter is enabled, 0 when neither is.
96 m_filterRowDelay
= (m_param
->bEnableSAO
&& m_param
->bSaoNonDeblocked
) ?
97 2 : (m_param
->bEnableSAO
|| m_param
->bEnableLoopFilter
? 1 : 0);
98 m_filterRowDelayCus
= m_filterRowDelay
* numCols
;
99 m_frameEncoderID
= id
;
100 m_rows
= new CTURow
[m_numRows
];
101 bool ok
= !!m_numRows
;
// m_refLagRows: the motion search reach accumulated below, rounded up to a
// multiple of g_maxCUSize and expressed in units of g_maxCUSize, plus one.
103 int range
= m_param
->searchRange
; /* fpel search */
104 range
+= 1; /* diamond search range check lag */
105 range
+= 2; /* subpel refine */
106 range
+= NTAPS_LUMA
/ 2; /* subpel filter half-length */
107 m_refLagRows
= 1 + ((range
+ g_maxCUSize
- 1) / g_maxCUSize
);
109 // NOTE: 2 times of numRows because both Encoder and Filter in same queue
110 if (!WaveFront::init(m_numRows
* 2))
112 x265_log(m_param
, X265_LOG_ERROR
, "unable to initialize wavefront queue\n");
116 m_frameFilter
.init(top
, this, numRows
);
118 // initialize HRD parameters of SPS
119 if (m_param
->bEmitHRDSEI
|| !!m_param
->interlaceMode
)
121 m_rce
.picTimingSEI
= new SEIPictureTiming
;
122 m_rce
.hrdTiming
= new HRDTiming
;
124 ok
&= m_rce
.picTimingSEI
&& m_rce
.hrdTiming
;
// Noise reduction state is allocated only when intra or inter noise reduction is requested
127 if (m_param
->noiseReductionIntra
|| m_param
->noiseReductionInter
)
128 m_nr
= X265_MALLOC(NoiseReduction
, 1);
130 memset(m_nr
, 0, sizeof(NoiseReduction
));
// NOTE(review): presumably the allocation-failure fallback that turns noise
// reduction off; the guarding condition is not visible in this listing
132 m_param
->noiseReductionIntra
= m_param
->noiseReductionInter
= 0;
138 /* Generate a complete list of unique geom sets for the current picture dimensions */
// Builds up to four CUGeom sets (body, right edge, bottom edge, corner) in
// m_cuGeoms and fills m_ctuGeomMap with, for each CTU address, the offset of the
// set that CTU uses.
// NOTE(review): the declaration and increments of countGeoms, the guards around
// the edge cases, and the return statements are not visible in this listing.
139 bool FrameEncoder::initializeGeoms()
141 /* Geoms only vary between CTUs in the presence of picture edges */
142 int maxCUSize
= m_param
->maxCUSize
;
// Remainders of the picture dimensions modulo the CTU size; the mask form is
// only a modulo when maxCUSize is a power of two
143 int heightRem
= m_param
->sourceHeight
& (maxCUSize
- 1);
144 int widthRem
= m_param
->sourceWidth
& (maxCUSize
- 1);
145 int allocGeoms
= 1; // body
146 if (heightRem
&& widthRem
)
147 allocGeoms
= 4; // body, right, bottom, corner
148 else if (heightRem
|| widthRem
)
149 allocGeoms
= 2; // body, right or bottom
151 m_ctuGeomMap
= X265_MALLOC(uint32_t, m_numRows
* m_numCols
);
152 m_cuGeoms
= X265_MALLOC(CUGeom
, allocGeoms
* CUGeom::MAX_GEOMS
);
153 if (!m_cuGeoms
|| !m_ctuGeomMap
)
// Body set: full-size CTU stored at offset 0; every CTU initially maps to it
157 CUData::calcCTUGeoms(maxCUSize
, maxCUSize
, maxCUSize
, m_cuGeoms
);
158 memset(m_ctuGeomMap
, 0, sizeof(uint32_t) * m_numRows
* m_numCols
);
// Right-edge set (width widthRem): assigned to the last CTU of every row
166 CUData::calcCTUGeoms(widthRem
, maxCUSize
, maxCUSize
, m_cuGeoms
+ countGeoms
* CUGeom::MAX_GEOMS
);
167 for (int i
= 0; i
< m_numRows
; i
++)
169 uint32_t ctuAddr
= m_numCols
* (i
+ 1) - 1;
170 m_ctuGeomMap
[ctuAddr
] = countGeoms
* CUGeom::MAX_GEOMS
;
// Bottom-edge set (height heightRem): assigned to every CTU of the last row
177 CUData::calcCTUGeoms(maxCUSize
, heightRem
, maxCUSize
, m_cuGeoms
+ countGeoms
* CUGeom::MAX_GEOMS
);
178 for (uint32_t i
= 0; i
< m_numCols
; i
++)
180 uint32_t ctuAddr
= m_numCols
* (m_numRows
- 1) + i
;
181 m_ctuGeomMap
[ctuAddr
] = countGeoms
* CUGeom::MAX_GEOMS
;
// Corner set (widthRem by heightRem): assigned to the final CTU of the picture
188 CUData::calcCTUGeoms(widthRem
, heightRem
, maxCUSize
, m_cuGeoms
+ countGeoms
* CUGeom::MAX_GEOMS
);
190 uint32_t ctuAddr
= m_numCols
* m_numRows
- 1;
191 m_ctuGeomMap
[ctuAddr
] = countGeoms
* CUGeom::MAX_GEOMS
;
// Sanity check that the number of sets generated equals the number allocated
194 X265_CHECK(countGeoms
== allocGeoms
, "geometry match check failure\n");
// Prepares curFrame for encoding by this frame encoder: records this encoder ID
// in the frame encode data and points the frame slice at this encoder motion
// reference array. The result of initializeGeoms() is checked.
// NOTE(review): the guard around the initializeGeoms() call, the hand-off to the
// worker thread and the return statements are not visible in this listing.
200 bool FrameEncoder::startCompressFrame(Frame
* curFrame
)
203 curFrame
->m_encData
->m_frameEncoderID
= m_frameEncoderID
; // Each Frame knows the ID of the FrameEncoder encoding it
204 curFrame
->m_encData
->m_slice
->m_mref
= m_mref
;
208 if (!initializeGeoms())
// Thread body: waits on m_enable, later signals m_done, and repeats for as long
// as m_threadActive remains true (destroy() clears that flag).
// NOTE(review): the work done between the wait and the trigger is not visible
// in this listing; presumably it is the call to compressFrame().
216 void FrameEncoder::threadMain()
218 // worker thread routine for FrameEncoder
221 m_enable
.wait(); // Encoder::encode() triggers this event
225 m_done
.trigger(); // FrameEncoder::getEncodedPicture() blocks for this event
228 while (m_threadActive
);
// Encodes m_frame into m_nalList. Visible stages, in order: optional access unit
// delimiter and repeated stream headers, weighted prediction analysis, motion
// reference setup, rate control start, entropy coder and substream setup,
// buffering period / recovery point / picture timing SEI, two-pass statistics,
// slice header and substream serialization, optional decoded picture hash SEI,
// access unit size accounting, rate control end, noise reduction bookkeeping,
// and release of the reference frames.
// NOTE(review): several lines are missing from this listing (braces, some
// guards, the declaration of bytes, and the call that compresses the CTU rows).
231 void FrameEncoder::compressFrame()
233 //ProfileScopeEvent(frameThread);
234 int64_t startCompressTime
= x265_mdate();
235 Slice
* slice
= m_frame
->m_encData
->m_slice
;
237 /* Emit access unit delimiter unless this is the first frame and the user is
238 * not repeating headers (since AUD is supposed to be the first NAL in the access
240 if (m_param
->bEnableAccessUnitDelimiters
&& (m_frame
->m_poc
|| m_param
->bRepeatHeaders
))
243 m_entropyCoder
.setBitstream(&m_bs
);
244 m_entropyCoder
.codeAUD(*slice
);
245 m_bs
.writeByteAlignment();
246 m_nalList
.serialize(NAL_UNIT_ACCESS_UNIT_DELIMITER
, m_bs
);
// Keyframes re-emit the stream headers when header repetition is requested
248 if (m_frame
->m_lowres
.bKeyframe
&& m_param
->bRepeatHeaders
)
249 m_top
->getStreamHeaders(m_nalList
, m_entropyCoder
, m_bs
);
251 // Weighted Prediction parameters estimation.
252 bool bUseWeightP
= slice
->m_sliceType
== P_SLICE
&& slice
->m_pps
->bUseWeightPred
;
253 bool bUseWeightB
= slice
->m_sliceType
== B_SLICE
&& slice
->m_pps
->bUseWeightedBiPred
;
254 if (bUseWeightP
|| bUseWeightB
)
255 weightAnalyse(*slice
, *m_frame
, *m_param
);
// NOTE(review): presumably the else branch of the check above; the else line is not visible
257 slice
->disableWeights();
259 // Generate motion references
// One prediction list for P slices, two for B slices, none otherwise
260 int numPredDir
= slice
->isInterP() ? 1 : slice
->isInterB() ? 2 : 0;
261 for (int l
= 0; l
< numPredDir
; l
++)
263 for (int ref
= 0; ref
< slice
->m_numRefIdx
[l
]; ref
++)
// w stays NULL unless weighting is in use and the first weight entry is flagged present
265 WeightParam
*w
= NULL
;
266 if ((bUseWeightP
|| bUseWeightB
) && slice
->m_weightPredTable
[l
][ref
][0].bPresentFlag
)
267 w
= slice
->m_weightPredTable
[l
][ref
];
268 m_mref
[l
][ref
].init(slice
->m_refPicList
[l
][ref
]->m_reconPic
, w
, *m_param
);
272 /* Get the QP for this frame from rate control. This call may block until
273 * frames ahead of it in encode order have called rateControlEnd() */
274 int qp
= m_top
->m_rateControl
->rateControlStart(m_frame
, &m_rce
, m_top
);
277 /* Clip slice QP to the [-QP_BD_OFFSET, QP_MAX_SPEC] range before encoding */
278 slice
->m_sliceQp
= Clip3(-QP_BD_OFFSET
, QP_MAX_SPEC
, qp
);
280 m_initSliceContext
.resetEntropy(*slice
);
// Note that the frame filter receives the unclipped qp from rate control
282 m_frameFilter
.start(m_frame
, m_initSliceContext
, qp
);
284 // reset entropy coders
285 m_entropyCoder
.load(m_initSliceContext
);
286 for (int i
= 0; i
< m_numRows
; i
++)
287 m_rows
[i
].init(m_initSliceContext
);
// One substream per CTU row when wavefront is enabled, otherwise a single substream
289 uint32_t numSubstreams
= m_param
->bEnableWavefront
? slice
->m_sps
->numCuInHeight
: 1;
// NOTE(review): the guard around these allocations is not visible in this
// listing; presumably they happen only once, while m_outStreams is still unset
292 m_outStreams
= new Bitstream
[numSubstreams
];
293 m_substreamSizes
= X265_MALLOC(uint32_t, numSubstreams
);
// Without SAO the row coders write straight into the output substreams
294 if (!m_param
->bEnableSAO
)
295 for (uint32_t i
= 0; i
< numSubstreams
; i
++)
296 m_rows
[i
].rowGoOnCoder
.setBitstream(&m_outStreams
[i
]);
299 for (uint32_t i
= 0; i
< numSubstreams
; i
++)
300 m_outStreams
[i
].resetBits();
302 if (m_frame
->m_lowres
.bKeyframe
)
304 if (m_param
->bEmitHRDSEI
)
306 SEIBufferingPeriod
* bpSei
= &m_top
->m_rateControl
->m_bufPeriodSEI
;
308 // since the temporal layer HRD is not ready, we assumed it is fixed
309 bpSei
->m_auCpbRemovalDelayDelta
= 1;
310 bpSei
->m_cpbDelayOffset
= 0;
311 bpSei
->m_dpbDelayOffset
= 0;
313 // hrdFullness() calculates the initial CPB removal delay and offset
314 m_top
->m_rateControl
->hrdFullness(bpSei
);
317 bpSei
->write(m_bs
, *slice
->m_sps
);
318 m_bs
.writeByteAlignment();
320 m_nalList
.serialize(NAL_UNIT_PREFIX_SEI
, m_bs
);
// Remember the encode order of this buffering period SEI; the picture timing
// SEI below measures its CPB removal delay from it
322 m_top
->m_lastBPSEI
= m_rce
.encodeOrder
;
325 // The recovery point SEI message assists a decoder in determining when the decoding
326 // process will produce acceptable pictures for display after the decoder initiates
327 // random access. The m_recoveryPocCnt is in units of POC(picture order count) which
328 // means pictures encoded after the CRA but precede it in display order(leading) are
329 // implicitly discarded after a random access seek regardless of the value of
330 // m_recoveryPocCnt. Our encoder does not use references prior to the most recent CRA,
331 // so all pictures following the CRA in POC order are guaranteed to be displayable,
332 // so m_recoveryPocCnt is always 0.
333 SEIRecoveryPoint sei_recovery_point
;
334 sei_recovery_point
.m_recoveryPocCnt
= 0;
335 sei_recovery_point
.m_exactMatchingFlag
= true;
336 sei_recovery_point
.m_brokenLinkFlag
= false;
339 sei_recovery_point
.write(m_bs
, *slice
->m_sps
);
340 m_bs
.writeByteAlignment();
342 m_nalList
.serialize(NAL_UNIT_PREFIX_SEI
, m_bs
);
// Picture timing SEI, written under the same condition that makes init() allocate it
345 if (m_param
->bEmitHRDSEI
|| !!m_param
->interlaceMode
)
347 SEIPictureTiming
*sei
= m_rce
.picTimingSEI
;
348 const VUI
*vui
= &slice
->m_sps
->vuiParameters
;
349 const HRDInfo
*hrd
= &vui
->hrdParameters
;
350 int poc
= slice
->m_poc
;
352 if (vui
->frameFieldInfoPresentFlag
)
// interlaceMode 2: odd POC is a top field; interlaceMode 1: odd POC is a bottom field
354 if (m_param
->interlaceMode
== 2)
355 sei
->m_picStruct
= (poc
& 1) ? 1 /* top */ : 2 /* bottom */;
356 else if (m_param
->interlaceMode
== 1)
357 sei
->m_picStruct
= (poc
& 1) ? 2 /* bottom */ : 1 /* top */;
// NOTE(review): presumably the final else of the chain above; the else line is not visible
359 sei
->m_picStruct
= 0;
360 sei
->m_sourceScanType
= 0;
361 sei
->m_duplicateFlag
= false;
364 if (vui
->hrdParametersPresentFlag
)
366 // The m_aucpbremoval delay specifies how many clock ticks the
367 // access unit associated with the picture timing SEI message has to
368 // wait after removal of the access unit with the most recent
369 // buffering period SEI message
// The delay is clamped to the range 1 .. (1 << cpbRemovalDelayLength)
370 sei
->m_auCpbRemovalDelay
= X265_MIN(X265_MAX(1, m_rce
.encodeOrder
- m_top
->m_lastBPSEI
), (1 << hrd
->cpbRemovalDelayLength
));
371 sei
->m_picDpbOutputDelay
= slice
->m_sps
->numReorderPics
+ poc
- m_rce
.encodeOrder
;
375 sei
->write(m_bs
, *slice
->m_sps
);
376 m_bs
.writeByteAlignment();
377 m_nalList
.serialize(NAL_UNIT_PREFIX_SEI
, m_bs
);
// NOTE(review): the call that performs the row compression described below is
// not visible in this listing
380 // Analyze CTU rows, most of the hard work is done here
381 // frame is compressed in a wave-front pattern if WPP is enabled. Loop filter runs as a
382 // wave-front behind the CU compression and reconstruction
385 if (m_param
->rc
.bStatWrite
)
387 int totalI
= 0, totalP
= 0, totalSkip
= 0;
389 // accumulate intra,inter,skip cu count per frame for 2 pass
390 for (int i
= 0; i
< m_numRows
; i
++)
392 m_frameStats
.mvBits
+= m_rows
[i
].rowStats
.mvBits
;
393 m_frameStats
.coeffBits
+= m_rows
[i
].rowStats
.coeffBits
;
394 m_frameStats
.miscBits
+= m_rows
[i
].rowStats
.miscBits
;
395 totalI
+= m_rows
[i
].rowStats
.iCuCnt
;
396 totalP
+= m_rows
[i
].rowStats
.pCuCnt
;
397 totalSkip
+= m_rows
[i
].rowStats
.skipCuCnt
;
// Each share is stored as a fraction of the total CU count.
// NOTE(review): no guard against totalCuCount being zero is visible here
399 int totalCuCount
= totalI
+ totalP
+ totalSkip
;
400 m_frameStats
.percentIntra
= (double)totalI
/ totalCuCount
;
401 m_frameStats
.percentInter
= (double)totalP
/ totalCuCount
;
402 m_frameStats
.percentSkip
= (double)totalSkip
/ totalCuCount
;
// Slice header is coded into m_bs starting from the initial slice context
406 m_entropyCoder
.load(m_initSliceContext
);
407 m_entropyCoder
.setBitstream(&m_bs
);
408 m_entropyCoder
.codeSliceHeader(*slice
, *m_frame
->m_encData
);
// NOTE(review): the statement guarded by this check is not visible; presumably encodeSlice()
410 // finish encode of each CTU row, only required when SAO is enabled
411 if (m_param
->bEnableSAO
)
414 // serialize each row, record final lengths in slice header
415 uint32_t maxStreamSize
= m_nalList
.serializeSubstreams(m_substreamSizes
, numSubstreams
, m_outStreams
);
417 // complete the slice header by writing WPP row-starts
418 m_entropyCoder
.setBitstream(&m_bs
);
419 if (slice
->m_pps
->bEntropyCodingSyncEnabled
)
420 m_entropyCoder
.codeSliceHeaderWPPEntryPoints(*slice
, m_substreamSizes
, maxStreamSize
);
421 m_bs
.writeByteAlignment();
423 m_nalList
.serialize(slice
->m_nalUnitType
, m_bs
);
// Decoded picture hash SEI: setting 1 selects MD5, 2 selects CRC, 3 selects checksum;
// each method finalizes three digests (indices 0 to 2)
425 if (m_param
->decodedPictureHashSEI
)
427 if (m_param
->decodedPictureHashSEI
== 1)
429 m_seiReconPictureDigest
.m_method
= SEIDecodedPictureHash::MD5
;
430 for (int i
= 0; i
< 3; i
++)
431 MD5Final(&m_state
[i
], m_seiReconPictureDigest
.m_digest
[i
]);
433 else if (m_param
->decodedPictureHashSEI
== 2)
435 m_seiReconPictureDigest
.m_method
= SEIDecodedPictureHash::CRC
;
436 for (int i
= 0; i
< 3; i
++)
437 crcFinish(m_crc
[i
], m_seiReconPictureDigest
.m_digest
[i
]);
439 else if (m_param
->decodedPictureHashSEI
== 3)
441 m_seiReconPictureDigest
.m_method
= SEIDecodedPictureHash::CHECKSUM
;
442 for (int i
= 0; i
< 3; i
++)
443 checksumFinish(m_checksum
[i
], m_seiReconPictureDigest
.m_digest
[i
]);
447 m_seiReconPictureDigest
.write(m_bs
, *slice
->m_sps
);
448 m_bs
.writeByteAlignment();
450 m_nalList
.serialize(NAL_UNIT_SUFFIX_SEI
, m_bs
);
// Access unit size for rate control: sum the sizes of all NAL units that are
// not SEI, minus their start code prefix (4 bytes for the first NAL and for
// SPS or PPS, otherwise 3), then convert bytes to bits.
// NOTE(review): the declaration of bytes is not visible in this listing
454 for (uint32_t i
= 0; i
< m_nalList
.m_numNal
; i
++)
456 int type
= m_nalList
.m_nal
[i
].type
;
459 if (type
!= NAL_UNIT_PREFIX_SEI
&& type
!= NAL_UNIT_SUFFIX_SEI
)
461 bytes
+= m_nalList
.m_nal
[i
].sizeBytes
;
462 // and exclude start code prefix
463 bytes
-= (!i
|| type
== NAL_UNIT_SPS
|| type
== NAL_UNIT_PPS
) ? 4 : 3;
466 m_accessUnitBits
= bytes
<< 3;
// x265_mdate() difference divided by 1000000 (presumably microseconds to seconds)
468 m_elapsedCompressTime
= (double)(x265_mdate() - startCompressTime
) / 1000000;
469 /* rateControlEnd may also block for earlier frames to call rateControlUpdateStats */
470 if (m_top
->m_rateControl
->rateControlEnd(m_frame
, m_accessUnitBits
, &m_rce
, &m_frameStats
) < 0)
471 m_top
->m_aborted
= true;
// NOTE(review): the guards around the two noise reduction loops below are not
// visible; presumably they run only when m_nr was allocated in init()
473 /* Accumulate NR statistics from all worker threads */
476 for (int i
= 0; i
< m_top
->m_numThreadLocalData
; i
++)
478 NoiseReduction
* nr
= &m_top
->m_threadLocalData
[i
].analysis
.m_quant
.m_frameNr
[m_frameEncoderID
];
479 for (int cat
= 0; cat
< MAX_NUM_TR_CATEGORIES
; cat
++)
481 for(int coeff
= 0; coeff
< MAX_NUM_TR_COEFFS
; coeff
++)
482 m_nr
->residualSum
[cat
][coeff
] += nr
->residualSum
[cat
][coeff
];
484 m_nr
->count
[cat
] += nr
->count
[cat
];
489 noiseReductionUpdate();
491 /* Copy updated NR coefficients back to all worker threads */
494 for (int i
= 0; i
< m_top
->m_numThreadLocalData
; i
++)
496 NoiseReduction
* nr
= &m_top
->m_threadLocalData
[i
].analysis
.m_quant
.m_frameNr
[m_frameEncoderID
];
// Per-thread offsets are refreshed and the per-thread accumulators are cleared
497 memcpy(nr
->offsetDenoise
, m_nr
->offsetDenoise
, sizeof(uint16_t) * MAX_NUM_TR_CATEGORIES
* MAX_NUM_TR_COEFFS
);
498 memset(nr
->count
, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES
);
499 memset(nr
->residualSum
, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES
* MAX_NUM_TR_COEFFS
);
503 // Decrement referenced frame reference counts, allow them to be recycled
504 for (int l
= 0; l
< numPredDir
; l
++)
506 for (int ref
= 0; ref
< slice
->m_numRefIdx
[l
]; ref
++)
508 Frame
*refpic
= slice
->m_refPicList
[l
][ref
];
509 ATOMIC_DEC(&refpic
->m_countRefEncoders
);
// Writes the final bitstream of every CTU of the slice through m_entropyCoder,
// sending each CTU row to its substream and coding SAO parameters per CTU.
// compressFrame() notes this final pass is only required when SAO is enabled.
// NOTE(review): braces and some guards are missing from this listing.
514 void FrameEncoder::encodeSlice()
516 Slice
* slice
= m_frame
->m_encData
->m_slice
;
517 const uint32_t widthInLCUs
= slice
->m_sps
->numCuInWidth
;
// m_endCUAddr divided by NUM_CU_PARTITIONS, rounded up
518 const uint32_t lastCUAddr
= (slice
->m_endCUAddr
+ NUM_CU_PARTITIONS
- 1) / NUM_CU_PARTITIONS
;
519 const uint32_t numSubstreams
= m_param
->bEnableWavefront
? slice
->m_sps
->numCuInHeight
: 1;
// saoParam is NULL when the SPS does not enable SAO
521 SAOParam
* saoParam
= slice
->m_sps
->bUseSAO
? m_frame
->m_encData
->m_saoParam
: NULL
;
522 for (uint32_t cuAddr
= 0; cuAddr
< lastCUAddr
; cuAddr
++)
// col and lin are the CTU column and row; subStrm is the substream for that row
524 uint32_t col
= cuAddr
% widthInLCUs
;
525 uint32_t lin
= cuAddr
/ widthInLCUs
;
526 uint32_t subStrm
= lin
% numSubstreams
;
527 CUData
* ctu
= m_frame
->m_encData
->getPicCTU(cuAddr
);
529 m_entropyCoder
.setBitstream(&m_outStreams
[subStrm
]);
531 // Synchronize cabac probabilities with upper-right CTU if it's available and we're at the start of a line.
532 if (m_param
->bEnableWavefront
&& !col
&& lin
)
534 m_entropyCoder
.copyState(m_initSliceContext
);
535 m_entropyCoder
.loadContexts(m_rows
[lin
- 1].bufferedEntropy
);
// NOTE(review): saoParam can be NULL per its initialization above; a null guard
// is not visible in this listing - confirm against the full source
540 if (saoParam
->bSaoFlag
[0] || saoParam
->bSaoFlag
[1])
// Merge-left is only possible past column 0, merge-up only past row 0
542 int mergeLeft
= col
&& saoParam
->ctuParam
[0][cuAddr
].mergeMode
== SAO_MERGE_LEFT
;
543 int mergeUp
= lin
&& saoParam
->ctuParam
[0][cuAddr
].mergeMode
== SAO_MERGE_UP
;
// NOTE(review): the guard on the merge-left flag write is not visible in this listing
545 m_entropyCoder
.codeSaoMerge(mergeLeft
);
546 if (lin
&& !mergeLeft
)
547 m_entropyCoder
.codeSaoMerge(mergeUp
);
// Explicit offsets are coded only when neither merge applies; flag 0 covers
// plane 0 and flag 1 covers planes 1 and 2
548 if (!mergeLeft
&& !mergeUp
)
550 if (saoParam
->bSaoFlag
[0])
551 m_entropyCoder
.codeSaoOffset(saoParam
->ctuParam
[0][cuAddr
], 0);
552 if (saoParam
->bSaoFlag
[1])
554 m_entropyCoder
.codeSaoOffset(saoParam
->ctuParam
[1][cuAddr
], 1);
555 m_entropyCoder
.codeSaoOffset(saoParam
->ctuParam
[2][cuAddr
], 2);
// NOTE(review): presumably the branch taken when both SAO flags are off; the
// else line is not visible in this listing
561 for (int i
= 0; i
< 3; i
++)
562 saoParam
->ctuParam
[i
][cuAddr
].reset();
566 // final coding (bitstream generation) for this CU
567 m_entropyCoder
.encodeCTU(*ctu
, m_cuGeoms
[m_ctuGeomMap
[cuAddr
]]);
569 if (m_param
->bEnableWavefront
)
// NOTE(review): the condition selecting the second CTU is not visible in this listing
572 // Store probabilities of second CTU in line into buffer
573 m_rows
[lin
].bufferedEntropy
.loadContexts(m_entropyCoder
);
// With wavefront each row substream is finished at the last column
575 if (col
== widthInLCUs
- 1)
576 m_entropyCoder
.finishSlice();
// Without wavefront the single substream is finished after the last CTU
579 if (!m_param
->bEnableWavefront
)
580 m_entropyCoder
.finishSlice();
// Drives the encoding of all CTU rows of m_frame. Two paths are visible:
//  - with a thread pool and wavefront enabled, rows are enabled one at a time
//    once every reference frame has reconstructed enough rows, then the function
//    waits on m_completionEvent and dequeues itself from the wavefront;
//  - otherwise rows are processed inline, followed by the delayed filter rows.
// NOTE(review): braces, the else between the two paths and a few declarations
// (for example list) are missing from this listing.
583 void FrameEncoder::compressCTURows()
585 Slice
* slice
= m_frame
->m_encData
->m_slice
;
// Reset per-frame VBV restart state, distortion sums and frame statistics
587 m_bAllRowsStop
= false;
588 m_vbvResetTriggerRow
= -1;
590 m_SSDY
= m_SSDU
= m_SSDV
= 0;
593 memset(&m_frameStats
, 0, sizeof(m_frameStats
));
595 bool bUseWeightP
= slice
->m_pps
->bUseWeightPred
&& slice
->m_sliceType
== P_SLICE
;
596 bool bUseWeightB
= slice
->m_pps
->bUseWeightedBiPred
&& slice
->m_sliceType
== B_SLICE
;
597 int numPredDir
= slice
->isInterP() ? 1 : slice
->isInterB() ? 2 : 0;
599 m_rows
[0].active
= true;
600 if (m_pool
&& m_param
->bEnableWavefront
)
602 WaveFront::clearEnabledRowMask();
603 WaveFront::enqueue();
605 for (int row
= 0; row
< m_numRows
; row
++)
607 // block until all reference frames have reconstructed the rows we need
608 for (int l
= 0; l
< numPredDir
; l
++)
610 for (int ref
= 0; ref
< slice
->m_numRefIdx
[l
]; ref
++)
612 Frame
*refpic
= slice
->m_refPicList
[l
][ref
];
// Wait until the reference has either finished all rows or reconstructed at
// least row + m_refLagRows rows
614 int reconRowCount
= refpic
->m_reconRowCount
.get();
615 while ((reconRowCount
!= m_numRows
) && (reconRowCount
< row
+ m_refLagRows
))
616 reconRowCount
= refpic
->m_reconRowCount
.waitForChange(reconRowCount
);
618 if ((bUseWeightP
|| bUseWeightB
) && m_mref
[l
][ref
].isWeighted
)
619 m_mref
[l
][ref
].applyWeight(row
+ m_refLagRows
, m_numRows
);
623 enableRowEncoder(row
);
// NOTE(review): the guard on the next two calls is not visible; presumably they
// kick off row 0 only on the first loop iteration
625 enqueueRowEncoder(0);
627 m_pool
->pokeIdleThread();
630 m_completionEvent
.wait();
632 WaveFront::dequeue();
// Inline path: the loop runs m_filterRowDelay extra iterations so the delayed
// filter rows are also processed
636 for (int i
= 0; i
< this->m_numRows
+ m_filterRowDelay
; i
++)
641 // block until all reference frames have reconstructed the rows we need
642 for (int l
= 0; l
< numPredDir
; l
++)
// NOTE(review): list is declared on a line not visible in this listing;
// presumably it equals l
645 for (int ref
= 0; ref
< slice
->m_numRefIdx
[list
]; ref
++)
647 Frame
*refpic
= slice
->m_refPicList
[list
][ref
];
649 int reconRowCount
= refpic
->m_reconRowCount
.get();
650 while ((reconRowCount
!= m_numRows
) && (reconRowCount
< i
+ m_refLagRows
))
651 reconRowCount
= refpic
->m_reconRowCount
.waitForChange(reconRowCount
);
653 if ((bUseWeightP
|| bUseWeightB
) && m_mref
[l
][ref
].isWeighted
)
654 m_mref
[list
][ref
].applyWeight(i
+ m_refLagRows
, m_numRows
);
658 processRowEncoder(i
, *m_tld
);
// The filter for a row runs m_filterRowDelay iterations after that row is encoded
662 if (i
>= m_filterRowDelay
)
663 m_frameFilter
.processRow(i
- m_filterRowDelay
);
// m_totalTime divided by 1000000 (presumably microseconds to seconds)
666 m_frameTime
= (double)m_totalTime
/ 1000000;
// Wavefront job entry point. The job index packs two values: the low bit is a
// job type and the remaining bits are the CTU row, which matches init()
// registering m_numRows * 2 jobs for the encoder and the filter.
// Parameters:
//   row      - packed job index
//   threadId - index into the per-thread data of the top-level encoder; a
//              negative value selects m_tld instead
// NOTE(review): the branches on typeNum are not visible in this listing;
// presumably one type runs processRowEncoder and the other the row filter.
670 void FrameEncoder::processRow(int row
, int threadId
)
672 const int realRow
= row
>> 1;
673 const int typeNum
= row
& 1;
675 ThreadLocalData
& tld
= threadId
>= 0 ? m_top
->m_threadLocalData
[threadId
] : *m_tld
;
678 processRowEncoder(realRow
, tld
);
681 m_frameFilter
.processRow(realRow
);
683 // NOTE: Active next row
684 if (realRow
!= m_numRows
- 1)
685 enqueueRowFilter(realRow
+ 1);
// NOTE(review): presumably reached only for the last row (else branch not
// visible); compressCTURows() waits on this event
687 m_completionEvent
.trigger();
691 // Called by worker threads
// Encodes the CTUs of one row, resuming from curRow.completed. For each CTU it
// picks a base QP, runs analysis, advances the row entropy coder, gathers
// statistics, optionally consults VBV row rate control (which may trigger a
// restart of this and later rows), and activates the row below when it has
// enough lead. After the row completes it flushes the row bitstream, feeds ABR
// statistics to rate control and triggers the row filters.
// Parameters:
//   row - CTU row index within the frame
//   tld - thread local data whose analysis object is used for this row
// NOTE(review): many lines are missing from this listing (braces, several
// guards, early returns and declarations such as rowCount).
692 void FrameEncoder::processRowEncoder(int row
, ThreadLocalData
& tld
)
694 CTURow
& curRow
= m_rows
[row
];
697 ScopedLock
self(curRow
.lock
);
699 /* VBV restart is in progress, exit out */
703 /* On multi-socket Windows servers, we have seen problems with
704 * ATOMIC_CAS which resulted in multiple worker threads processing
705 * the same CU row, which often resulted in bad pointer accesses. We
706 * believe the problem is fixed, but are leaving this check in place
707 * to prevent crashes in case it is not */
708 x265_log(m_param
, X265_LOG_WARNING
,
709 "internal error - simultaneous row access detected. Please report HW to x265-devel@videolan.org\n");
715 /* When WPP is enabled, every row has its own row coder instance. Otherwise
716 * they share row 0 */
717 Entropy
& rowCoder
= m_param
->bEnableWavefront
? m_rows
[row
].rowGoOnCoder
: m_rows
[0].rowGoOnCoder
;
718 FrameData
& curEncData
= *m_frame
->m_encData
;
719 Slice
*slice
= curEncData
.m_slice
;
721 int64_t startTime
= x265_mdate();
722 const uint32_t numCols
= m_numCols
;
723 const uint32_t lineStartCUAddr
= row
* numCols
;
// VBV is considered active when both the buffer size and the max bitrate are positive
724 bool bIsVbv
= m_param
->rc
.vbvBufferSize
> 0 && m_param
->rc
.vbvMaxBitrate
> 0;
726 while (curRow
.completed
< numCols
)
728 ProfileScopeEvent(encodeCTU
);
730 int col
= curRow
.completed
;
731 const uint32_t cuAddr
= lineStartCUAddr
+ col
;
732 CUData
* ctu
= curEncData
.getPicCTU(cuAddr
);
733 ctu
->initCTU(*m_frame
, cuAddr
, slice
->m_sliceQp
);
// NOTE(review): the guards selecting between the base QP assignments below are
// only partly visible in this listing
739 curEncData
.m_rowStat
[row
].diagQp
= curEncData
.m_avgQpRc
;
740 curEncData
.m_rowStat
[row
].diagQpScale
= x265_qp2qScale(curEncData
.m_avgQpRc
);
// cuAddr - numCols + 1 addresses the CTU one row up and one column to the right
743 if (row
>= col
&& row
&& m_vbvResetTriggerRow
!= row
)
744 curEncData
.m_cuStat
[cuAddr
].baseQp
= curEncData
.m_cuStat
[cuAddr
- numCols
+ 1].baseQp
;
746 curEncData
.m_cuStat
[cuAddr
].baseQp
= curEncData
.m_rowStat
[row
].diagQp
;
749 curEncData
.m_cuStat
[cuAddr
].baseQp
= curEncData
.m_avgQpRc
;
// With adaptive quantization or VBV the CTU gets its own QP from calcQpForCu();
// analysis receives that value before it is clipped to QP_MAX_SPEC
751 if (m_param
->rc
.aqMode
|| bIsVbv
)
753 int qp
= calcQpForCu(cuAddr
, curEncData
.m_cuStat
[cuAddr
].baseQp
);
754 tld
.analysis
.setQP(*slice
, qp
);
755 qp
= Clip3(QP_MIN
, QP_MAX_SPEC
, qp
);
756 ctu
->setQPSubParts((int8_t)qp
, 0, 0);
757 curEncData
.m_rowStat
[row
].sumQpAq
+= qp
;
// NOTE(review): presumably the else branch, which uses the slice QP unchanged
760 tld
.analysis
.setQP(*slice
, slice
->m_sliceQp
);
762 if (m_param
->bEnableWavefront
&& !col
&& row
)
764 // Load SBAC coder context from previous row and initialize row state.
765 rowCoder
.copyState(m_initSliceContext
);
766 rowCoder
.loadContexts(m_rows
[row
- 1].bufferedEntropy
);
769 // Does all the CU analysis, returns best top level mode decision
770 Mode
& best
= tld
.analysis
.compressCTU(*ctu
, *m_frame
, m_cuGeoms
[m_ctuGeomMap
[cuAddr
]], rowCoder
);
772 /* advance top-level row coder to include the context of this CTU.
773 * if SAO is disabled, rowCoder writes the final CTU bitstream */
774 rowCoder
.encodeCTU(*ctu
, m_cuGeoms
[m_ctuGeomMap
[cuAddr
]]);
// The state saved after the second CTU (col 1) is what the next row loads at its start
776 if (m_param
->bEnableWavefront
&& col
== 1)
777 // Save CABAC state for next row
778 curRow
.bufferedEntropy
.loadContexts(rowCoder
);
// NOTE(review): the statement that advances curRow.completed is not visible in this listing
780 // Completed CU processing
783 if (m_param
->bLogCuStats
|| m_param
->rc
.bStatWrite
)
784 collectCTUStatistics(*ctu
);
786 // copy no. of intra, inter Cu cnt per row into frame stats for 2 pass
787 if (m_param
->rc
.bStatWrite
)
789 curRow
.rowStats
.mvBits
+= best
.mvBits
;
790 curRow
.rowStats
.coeffBits
+= best
.coeffBits
;
// misc bits are whatever remains of the total after mv and coefficient bits
791 curRow
.rowStats
.miscBits
+= best
.totalBits
- (best
.mvBits
+ best
.coeffBits
);
792 StatisticLog
* log
= &m_sliceTypeLog
[slice
->m_sliceType
];
794 for (uint32_t depth
= 0; depth
<= g_maxCUDepth
; depth
++)
796 /* 1 << shift == number of 8x8 blocks at current depth */
797 int shift
= 2 * (g_maxCUDepth
- depth
);
798 curRow
.rowStats
.iCuCnt
+= log
->qTreeIntraCnt
[depth
] << shift
;
799 curRow
.rowStats
.pCuCnt
+= log
->qTreeInterCnt
[depth
] << shift
;
800 curRow
.rowStats
.skipCuCnt
+= log
->qTreeSkipCnt
[depth
] << shift
;
802 // clear the row cu data from thread local object
803 log
->qTreeIntraCnt
[depth
] = log
->qTreeInterCnt
[depth
] = log
->qTreeSkipCnt
[depth
] = 0;
807 curEncData
.m_cuStat
[cuAddr
].totalBits
= best
.totalBits
;
812 // Update encoded bits, satdCost, baseQP for each CU
813 curEncData
.m_rowStat
[row
].diagSatd
+= curEncData
.m_cuStat
[cuAddr
].vbvCost
;
814 curEncData
.m_rowStat
[row
].diagIntraSatd
+= curEncData
.m_cuStat
[cuAddr
].intraVbvCost
;
815 curEncData
.m_rowStat
[row
].encodedBits
+= curEncData
.m_cuStat
[cuAddr
].totalBits
;
816 curEncData
.m_rowStat
[row
].sumQpRc
+= curEncData
.m_cuStat
[cuAddr
].baseQp
;
817 curEncData
.m_rowStat
[row
].numEncodedCUs
= cuAddr
;
819 // If current block is at row diagonal checkpoint, call vbv ratecontrol.
// The checkpoint is the CTU whose column equals its row, skipping row 0
821 if (row
== col
&& row
)
823 double qpBase
= curEncData
.m_cuStat
[cuAddr
].baseQp
;
824 int reEncode
= m_top
->m_rateControl
->rowDiagonalVbvRateControl(m_frame
, row
, &m_rce
, qpBase
);
825 qpBase
= Clip3((double)QP_MIN
, (double)QP_MAX_MAX
, qpBase
);
826 curEncData
.m_rowStat
[row
].diagQp
= qpBase
;
827 curEncData
.m_rowStat
[row
].diagQpScale
= x265_qp2qScale(qpBase
);
// NOTE(review): the check on reEncode that guards the restart sequence below is
// not visible in this listing
831 x265_log(m_param
, X265_LOG_DEBUG
, "POC %d row %d - encode restart required for VBV, to %.2f from %.2f\n",
832 m_frame
->m_poc
, row
, qpBase
, curEncData
.m_cuStat
[cuAddr
].baseQp
);
834 // prevent the WaveFront::findJob() method from providing new jobs
835 m_vbvResetTriggerRow
= row
;
836 m_bAllRowsStop
= true;
// Walk from the last row back up to this row, stopping and resetting each one
838 for (int r
= m_numRows
- 1; r
>= row
; r
--)
840 CTURow
& stopRow
= m_rows
[r
];
844 /* if row was active (ready to be run) clear active bit and bitmap bit for this row */
845 stopRow
.lock
.acquire();
846 while (stopRow
.active
)
848 if (dequeueRow(r
* 2))
849 stopRow
.active
= false;
852 /* we must release the row lock to allow the thread to exit */
853 stopRow
.lock
.release();
855 stopRow
.lock
.acquire();
858 stopRow
.lock
.release();
// bRowBusy is sampled under the row lock
// NOTE(review): the loop that repeats this sampling is not visible in this listing
860 bool bRowBusy
= true;
863 stopRow
.lock
.acquire();
864 bRowBusy
= stopRow
.busy
;
865 stopRow
.lock
.release();
// Discard everything the stopped row produced so far
875 m_outStreams
[r
].resetBits();
876 stopRow
.completed
= 0;
877 memset(&stopRow
.rowStats
, 0, sizeof(stopRow
.rowStats
));
878 curEncData
.m_rowStat
[r
].numEncodedCUs
= 0;
879 curEncData
.m_rowStat
[r
].encodedBits
= 0;
880 curEncData
.m_rowStat
[r
].diagSatd
= 0;
881 curEncData
.m_rowStat
[r
].diagIntraSatd
= 0;
882 curEncData
.m_rowStat
[r
].sumQpRc
= 0;
883 curEncData
.m_rowStat
[r
].sumQpAq
= 0;
886 m_bAllRowsStop
= false;
891 // NOTE: do CU level Filter
892 if (m_param
->bEnableSAO
&& m_param
->bSaoNonDeblocked
)
893 // SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas
894 m_frameFilter
.m_sao
.calcSaoStatsCu_BeforeDblk(m_frame
, col
, row
);
896 // NOTE: active next row
// The row below is activated once this row is at least two CTUs ahead of it,
// unless a VBV stop is in effect for that row
897 if (curRow
.completed
>= 2 && row
< m_numRows
- 1)
899 ScopedLock
below(m_rows
[row
+ 1].lock
);
900 if (m_rows
[row
+ 1].active
== false &&
901 m_rows
[row
+ 1].completed
+ 2 <= curRow
.completed
&&
902 (!m_bAllRowsStop
|| row
+ 1 < m_vbvResetTriggerRow
))
904 m_rows
[row
+ 1].active
= true;
905 enqueueRowEncoder(row
+ 1);
// Deactivate this row when a VBV stop applies to it, or when the row above is
// less than two CTUs ahead of it
909 ScopedLock
self(curRow
.lock
);
910 if ((m_bAllRowsStop
&& row
> m_vbvResetTriggerRow
) ||
911 (row
> 0 && curRow
.completed
< numCols
- 1 && m_rows
[row
- 1].completed
< m_rows
[row
].completed
+ 2))
913 curRow
.active
= false;
915 m_totalTime
+= x265_mdate() - startTime
;
920 /* *this row of CTUs has been encoded* */
922 /* flush row bitstream (if WPP and no SAO) or flush frame if no WPP and no SAO */
923 if (!m_param
->bEnableSAO
&& (m_param
->bEnableWavefront
|| row
== m_numRows
- 1))
924 rowCoder
.finishSlice();
926 /* If encoding with ABR, update bits and complexity in rate control
927 * after a number of rows so the next frame's rateControlStart has more
928 * accurate data for estimation. At the start of the encode we update stats
929 * after half the frame is encoded, but after this initial period we update
930 * after refLagRows (the number of rows reference frames must have completed
931 * before referencing frames may begin encoding) */
933 if (m_param
->rc
.rateControlMode
== X265_RC_ABR
)
// The initial period is the first 2 * (fpsNum / fpsDenom) frames in encode
// order, using integer division
935 if ((uint32_t)m_rce
.encodeOrder
<= 2 * (m_param
->fpsNum
/ m_param
->fpsDenom
))
936 rowCount
= X265_MIN((m_numRows
+ 1) / 2, m_numRows
- 1);
938 rowCount
= X265_MIN(m_refLagRows
, m_numRows
- 1);
// NOTE(review): the guards choosing between the per-row sum and the per-CTU sum
// below are not visible in this listing
942 m_rce
.rowTotalBits
= 0;
944 for (int i
= 0; i
< rowCount
; i
++)
945 m_rce
.rowTotalBits
+= curEncData
.m_rowStat
[i
].encodedBits
;
947 for (uint32_t cuAddr
= 0; cuAddr
< rowCount
* numCols
; cuAddr
++)
948 m_rce
.rowTotalBits
+= curEncData
.m_cuStat
[cuAddr
].totalBits
;
950 m_top
->m_rateControl
->rateControlUpdateStats(&m_rce
);
953 if (m_param
->bEnableWavefront
)
955 /* trigger row-wise loop filters */
956 if (row
>= m_filterRowDelay
)
958 enableRowFilter(row
- m_filterRowDelay
);
960 /* NOTE: Activate filter if first row (row 0) */
961 if (row
== m_filterRowDelay
)
// After the last row is encoded, the remaining m_filterRowDelay filter rows
// are handled (loop body not visible in this listing)
964 if (row
== m_numRows
- 1)
966 for (int i
= m_numRows
- m_filterRowDelay
; i
< m_numRows
; i
++)
971 m_totalTime
+= x265_mdate() - startTime
;
// Walks the coding units of one CTU and updates the per-slice-type counters in
// m_sliceTypeLog: CU counts per depth, quad-tree counts used by the two-pass
// statistics, and intra / inter mode distributions.
// Parameter:
//   ctu - the CTU to account for; its slice type selects the log entry
// NOTE(review): the declaration of depth, braces and a few else lines are
// missing from this listing.
975 void FrameEncoder::collectCTUStatistics(CUData
& ctu
)
977 StatisticLog
* log
= &m_sliceTypeLog
[ctu
.m_slice
->m_sliceType
];
// I slices: every CU is first counted as intra, then corrected below
979 if (ctu
.m_slice
->m_sliceType
== I_SLICE
)
// The step uses the depth read inside the body: m_numPartitions >> (depth * 2)
// is the number of partitions covered by one CU at that depth
982 for (uint32_t absPartIdx
= 0; absPartIdx
< ctu
.m_numPartitions
; absPartIdx
+= ctu
.m_numPartitions
>> (depth
* 2))
984 depth
= ctu
.m_cuDepth
[absPartIdx
];
987 log
->cntIntra
[depth
]++;
988 log
->qTreeIntraCnt
[depth
]++;
// MODE_NONE entries are not real CUs, so the increments above are undone
990 if (ctu
.m_predMode
[absPartIdx
] == MODE_NONE
)
993 log
->cntIntra
[depth
]--;
994 log
->qTreeIntraCnt
[depth
]--;
996 else if (ctu
.m_partSize
[absPartIdx
] != SIZE_2Nx2N
)
998 /* TODO: log intra modes at absPartIdx +0 to +3 */
999 X265_CHECK(depth
== g_maxCUDepth
, "Intra NxN found at improbable depth\n");
1001 log
->cntIntra
[depth
]--;
// Luma directions above 1 are pooled under ANGULAR_MODE_ID; directions 0 and 1
// keep their own slot (the else line is not visible in this listing)
1003 else if (ctu
.m_lumaIntraDir
[absPartIdx
] > 1)
1004 log
->cuIntraDistribution
[depth
][ANGULAR_MODE_ID
]++;
1006 log
->cuIntraDistribution
[depth
][ctu
.m_lumaIntraDir
[absPartIdx
]]++;
// Other slice types: classify each CU as skip, inter or intra
1012 for (uint32_t absPartIdx
= 0; absPartIdx
< ctu
.m_numPartitions
; absPartIdx
+= ctu
.m_numPartitions
>> (depth
* 2))
1014 depth
= ctu
.m_cuDepth
[absPartIdx
];
1017 log
->cntTotalCu
[depth
]++;
1019 if (ctu
.m_predMode
[absPartIdx
] == MODE_NONE
)
1022 log
->cntTotalCu
[depth
]--;
1024 else if (ctu
.isSkipped(absPartIdx
))
1027 log
->cntSkipCu
[depth
]++;
1028 log
->qTreeSkipCnt
[depth
]++;
1030 else if (ctu
.isInter(absPartIdx
))
1032 log
->cntInter
[depth
]++;
1033 log
->qTreeInterCnt
[depth
]++;
// Partition sizes below AMP_ID keep their own slot; the rest are pooled under AMP_ID
1035 if (ctu
.m_partSize
[absPartIdx
] < AMP_ID
)
1036 log
->cuInterDistribution
[depth
][ctu
.m_partSize
[absPartIdx
]]++;
1038 log
->cuInterDistribution
[depth
][AMP_ID
]++;
1040 else if (ctu
.isIntra(absPartIdx
))
1042 log
->cntIntra
[depth
]++;
1043 log
->qTreeIntraCnt
[depth
]++;
1045 if (ctu
.m_partSize
[absPartIdx
] != SIZE_2Nx2N
)
1047 X265_CHECK(depth
== g_maxCUDepth
, "Intra NxN found at improbable depth\n");
1049 /* TODO: log intra modes at absPartIdx +0 to +3 */
1051 else if (ctu
.m_lumaIntraDir
[absPartIdx
] > 1)
1052 log
->cuIntraDistribution
[depth
][ANGULAR_MODE_ID
]++;
1054 log
->cuIntraDistribution
[depth
][ctu
.m_lumaIntraDir
[absPartIdx
]]++;
1060 /* DCT-domain noise reduction / adaptive deadzone from libavcodec */
// Recomputes m_nr->offsetDenoise for every transform category from the
// accumulated residual sums and block counts. compressFrame() calls this after
// merging the per-thread statistics into m_nr.
// NOTE(review): an early-out guard, if any, is not visible in this listing.
1061 void FrameEncoder::noiseReductionUpdate()
// Block count limits indexed by trSize; larger transforms get smaller limits
1066 static const uint32_t maxBlocksPerTrSize
[4] = {1 << 18, 1 << 16, 1 << 14, 1 << 12};
1068 for (int cat
= 0; cat
< MAX_NUM_TR_CATEGORIES
; cat
++)
// The low two bits of the category select the transform size; coefCount is
// 16, 64, 256 or 1024 for trSize 0, 1, 2 or 3
1070 int trSize
= cat
& 3;
1071 int coefCount
= 1 << ((trSize
+ 2) * 2);
// Once the block count exceeds its limit, halve both the sums and the count so
// older statistics decay
1073 if (m_nr
->count
[cat
] > maxBlocksPerTrSize
[trSize
])
1075 for (int i
= 0; i
< coefCount
; i
++)
1076 m_nr
->residualSum
[cat
][i
] >>= 1;
1077 m_nr
->count
[cat
] >>= 1;
// Categories below 8 use the intra strength, the others the inter strength
1080 int nrStrength
= cat
< 8 ? m_param
->noiseReductionIntra
: m_param
->noiseReductionInter
;
1081 uint64_t scaledCount
= (uint64_t)nrStrength
* m_nr
->count
[cat
];
1083 for (int i
= 0; i
< coefCount
; i
++)
// offset = (strength * count + sum / 2) / (sum + 1); the + 1 avoids a zero divisor
1085 uint64_t value
= scaledCount
+ m_nr
->residualSum
[cat
][i
] / 2;
1086 uint64_t denom
= m_nr
->residualSum
[cat
][i
] + 1;
1087 m_nr
->offsetDenoise
[cat
][i
] = (uint16_t)(value
/ denom
);
1090 // Don't denoise DC coefficients
1091 m_nr
->offsetDenoise
[cat
][0] = 0;
// Computes the QP for one CTU and, as a side effect, accumulates the VBV cost
// fields of that CTU from the lowres data of the frame.
// Parameters:
//   ctuAddr - CTU address within the frame
//   baseQp  - starting QP chosen by the caller
// Returns the rounded result clipped to [QP_MIN, QP_MAX_MAX].
// NOTE(review): the declaration of qp and the statements that fold qp_offset
// into it are not visible in this listing; presumably qp starts from baseQp and
// the averaged offset is added.
1095 int FrameEncoder::calcQpForCu(uint32_t ctuAddr
, double baseQp
)
1100 FrameData
& curEncData
= *m_frame
->m_encData
;
1101 /* clear cuCostsForVbv from when vbv row reset was triggered */
1102 bool bIsVbv
= m_param
->rc
.vbvBufferSize
> 0 && m_param
->rc
.vbvMaxBitrate
> 0;
1105 curEncData
.m_cuStat
[ctuAddr
].vbvCost
= 0;
1106 curEncData
.m_cuStat
[ctuAddr
].intraVbvCost
= 0;
1109 /* Derive qpOffset for each CU by averaging offsets for all 16x16 blocks in the cu. */
1110 double qp_offset
= 0;
// Picture size in 16x16 blocks, rounded up
1111 uint32_t maxBlockCols
= (m_frame
->m_fencPic
->m_picWidth
+ (16 - 1)) / 16;
1112 uint32_t maxBlockRows
= (m_frame
->m_fencPic
->m_picHeight
+ (16 - 1)) / 16;
// Number of 16x16 blocks along one side of a CTU
1113 uint32_t noOfBlocks
= g_maxCUSize
/ 16;
// block_y: first 16x16 block row of this CTU (CTU row times noOfBlocks).
// block_x: first 16x16 block column; the expression reduces to the CTU column
// times noOfBlocks
1114 uint32_t block_y
= (ctuAddr
/ curEncData
.m_slice
->m_sps
->numCuInWidth
) * noOfBlocks
;
1115 uint32_t block_x
= (ctuAddr
* noOfBlocks
) - block_y
* curEncData
.m_slice
->m_sps
->numCuInWidth
;
1117 /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
1118 bool isReferenced
= IS_REFERENCED(m_frame
);
1119 double *qpoffs
= (isReferenced
&& m_param
->rc
.cuTree
) ? m_frame
->m_lowres
.qpCuTreeOffset
: m_frame
->m_lowres
.qpAqOffset
;
1121 uint32_t cnt
= 0, idx
= 0;
// Both loops stop at the picture edge, so partial CTUs only visit blocks that
// exist; note that the outer loop advances block_y itself
1122 for (uint32_t h
= 0; h
< noOfBlocks
&& block_y
< maxBlockRows
; h
++, block_y
++)
1124 for (uint32_t w
= 0; w
< noOfBlocks
&& (block_x
+ w
) < maxBlockCols
; w
++)
1126 idx
= block_x
+ w
+ (block_y
* maxBlockCols
);
1127 if (m_param
->rc
.aqMode
)
1128 qp_offset
+= qpoffs
[idx
];
// NOTE(review): the guard on the two cost accumulations is not visible in this
// listing; presumably they run only when bIsVbv is set
1131 curEncData
.m_cuStat
[ctuAddr
].vbvCost
+= m_frame
->m_lowres
.lowresCostForRc
[idx
] & LOWRES_COST_MASK
;
1132 curEncData
.m_cuStat
[ctuAddr
].intraVbvCost
+= m_frame
->m_lowres
.intraCost
[idx
];
// Round to nearest by adding 0.5 before the truncating cast, then clip
1141 return Clip3(QP_MIN
, QP_MAX_MAX
, (int)(qp
+ 0.5));
// Hands the finished frame and its NAL units back to the caller.
// Parameter:
//   output - receives the contents of m_nalList through takeContents()
// NOTE(review): the guard, the wait call and the return statements are not
// visible in this listing; threadMain() documents that this function blocks on
// m_done, and presumably ret is returned.
1144 Frame
*FrameEncoder::getEncodedPicture(NALList
& output
)
1148 /* block here until worker thread completes */
1151 Frame
*ret
= m_frame
;
1153 output
.takeContents(m_nalList
);