1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
4 * Authors: Chung Shin Yee <shinyee@multicorewareinc.com>
5 * Min Chen <chenm003@163.com>
6 * Steve Borho <steve@borho.org>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 * This program is also available under a commercial proprietary license.
23 * For more information, contact us at license @ x265.com.
24 *****************************************************************************/
28 #include "framedata.h"
29 #include "wavefront.h"
35 #include "frameencoder.h"
37 #include "slicetype.h"
// Forward declaration of the weighted prediction analysis routine. It is called
// from FrameEncoder::compressFrame() for P/B slices whose PPS enables weighted
// (bi-)prediction.
41 void weightAnalyse(Slice
& slice
, Frame
& frame
, x265_param
& param
);
// Constructor: marks the worker thread as active, clears the row-stop flag and
// the VBV reset trigger row, nulls the substream size table and zero-fills the
// per-frame statistics and the rate control entry.
43 FrameEncoder::FrameEncoder()
45 , m_threadActive(true)
49 m_bAllRowsStop
= false;
50 m_vbvResetTriggerRow
= -1;
52 m_substreamSizes
= NULL
;
61 memset(&m_frameStats
, 0, sizeof(m_frameStats
));
62 memset(&m_rce
, 0, sizeof(RateControlEntry
));
// Releases what this frame encoder owns: flushes the job provider, clears
// m_threadActive (the flag threadMain() loops on), frees the output streams,
// the CTU geometry map and the substream size table, destroys the frame filter
// and deletes the SEI timing objects that init() allocates under the same
// bEmitHRDSEI / interlaceMode condition.
65 void FrameEncoder::destroy()
68 JobProvider::flush(); // ensure no worker threads are using this frame
70 m_threadActive
= false;
74 delete[] m_outStreams
;
76 X265_FREE(m_ctuGeomMap
);
77 X265_FREE(m_substreamSizes
);
80 m_frameFilter
.destroy();
82 if (m_param
->bEmitHRDSEI
|| !!m_param
->interlaceMode
)
84 delete m_rce
.picTimingSEI
;
85 delete m_rce
.hrdTiming
;
88 // wait for worker thread to exit
// Prepares this frame encoder for use.
//   top     - owning encoder; supplies m_param and is passed to the frame filter
//   numRows - number of CTU rows, passed to the frame filter
//   numCols - number of CTU columns, used to scale the filter row delay into CTUs
//   id      - stored as m_frameEncoderID
// Allocation success is accumulated in the local `ok`.
92 bool FrameEncoder::init(Encoder
*top
, int numRows
, int numCols
, int id
)
95 m_param
= top
->m_param
;
// Filter lag in CTU rows: 2 when SAO uses non-deblocked pixels, 1 when SAO or
// the loop filter is enabled, 0 otherwise.
98 m_filterRowDelay
= (m_param
->bEnableSAO
&& m_param
->bSaoNonDeblocked
) ?
99 2 : (m_param
->bEnableSAO
|| m_param
->bEnableLoopFilter
? 1 : 0);
100 m_filterRowDelayCus
= m_filterRowDelay
* numCols
;
101 m_frameEncoderID
= id
;
102 m_rows
= new CTURow
[m_numRows
];
103 bool ok
= !!m_numRows
;
// Vertical reach of motion search in pixels, converted below into the number
// of CTU rows a reference frame must have reconstructed (rounded up, plus one).
105 int range
= m_param
->searchRange
; /* fpel search */
106 range
+= 1; /* diamond search range check lag */
107 range
+= 2; /* subpel refine */
108 range
+= NTAPS_LUMA
/ 2; /* subpel filter half-length */
109 m_refLagRows
= 1 + ((range
+ g_maxCUSize
- 1) / g_maxCUSize
);
111 // NOTE: 2 times of numRows because both Encoder and Filter in same queue
112 if (!WaveFront::init(m_numRows
* 2))
114 x265_log(m_param
, X265_LOG_ERROR
, "unable to initialize wavefront queue\n");
118 m_frameFilter
.init(top
, this, numRows
);
120 // initialize HRD parameters of SPS
121 if (m_param
->bEmitHRDSEI
|| !!m_param
->interlaceMode
)
123 m_rce
.picTimingSEI
= new SEIPictureTiming
;
124 m_rce
.hrdTiming
= new HRDTiming
;
126 ok
&= m_rce
.picTimingSEI
&& m_rce
.hrdTiming
;
// Noise reduction state is allocated and zeroed only when the feature is requested.
129 if (m_param
->noiseReduction
)
130 m_nr
= X265_MALLOC(NoiseReduction
, 1);
132 memset(m_nr
, 0, sizeof(NoiseReduction
));
// NOTE(review): this clears the noiseReduction setting; presumably the fallback
// taken when the allocation above fails - confirm.
134 m_param
->noiseReduction
= 0;
140 /* Generate a complete list of unique geom sets for the current picture dimensions */
// Fills m_cuGeoms with the distinct CTU geometry sets of the picture and
// m_ctuGeomMap with, for every CTU address, the offset of its set in m_cuGeoms.
//   encData - frame data whose m_picCTU entries are initialized and queried
141 bool FrameEncoder::initializeGeoms(const FrameData
& encData
)
143 /* Geoms only vary between CTUs in the presence of picture edges */
// Non-zero remainders mean the picture is not a multiple of maxCUSize in that
// dimension (the mask form relies on maxCUSize being a power of two).
144 int heightRem
= m_param
->sourceHeight
& (m_param
->maxCUSize
- 1);
145 int widthRem
= m_param
->sourceWidth
& (m_param
->maxCUSize
- 1);
146 int allocGeoms
= 1; // body
147 if (heightRem
&& widthRem
)
148 allocGeoms
= 4; // body, right, bottom, corner
149 else if (heightRem
|| widthRem
)
150 allocGeoms
= 2; // body, right or bottom
152 m_ctuGeomMap
= X265_MALLOC(uint32_t, m_numRows
* m_numCols
);
153 m_cuGeoms
= X265_MALLOC(CUGeom
, allocGeoms
* CUGeom::MAX_GEOMS
);
154 if (!m_cuGeoms
|| !m_ctuGeomMap
)
157 CUGeom cuLocalData
[CUGeom::MAX_GEOMS
];
158 memset(cuLocalData
, 0, sizeof(cuLocalData
)); // temporary fix: zero the whole buffer before it is compared with memcmp below
// For every CTU: compute its geometry set, look for an identical stored set,
// and append the set when none matches.
161 for (uint32_t ctuAddr
= 0; ctuAddr
< m_numRows
* m_numCols
; ctuAddr
++)
163 /* TODO: detach this logic from TComDataCU */
164 encData
.m_picCTU
[ctuAddr
].initCTU(*m_frame
, ctuAddr
, 0);
165 encData
.m_picCTU
[ctuAddr
].calcCTUGeoms(m_param
->sourceWidth
, m_param
->sourceHeight
, m_param
->maxCUSize
, cuLocalData
);
// MAX_INT marks "no matching set found yet".
167 m_ctuGeomMap
[ctuAddr
] = MAX_INT
;
168 for (int i
= 0; i
< countGeoms
; i
++)
170 if (!memcmp(cuLocalData
, m_cuGeoms
+ i
* CUGeom::MAX_GEOMS
, sizeof(CUGeom
) * CUGeom::MAX_GEOMS
))
172 m_ctuGeomMap
[ctuAddr
] = i
* CUGeom::MAX_GEOMS
;
// No stored set matched: store this one at the next free slot.
177 if (m_ctuGeomMap
[ctuAddr
] == MAX_INT
)
179 X265_CHECK(countGeoms
< allocGeoms
, "geometry match check failure\n");
180 m_ctuGeomMap
[ctuAddr
] = countGeoms
* CUGeom::MAX_GEOMS
;
181 memcpy(m_cuGeoms
+ countGeoms
* CUGeom::MAX_GEOMS
, cuLocalData
, sizeof(CUGeom
) * CUGeom::MAX_GEOMS
);
// Binds curFrame to this frame encoder before compression starts: records the
// encoder ID in the frame's encode data, points the slice at this encoder's
// motion reference table and checks the result of initializeGeoms().
189 bool FrameEncoder::startCompressFrame(Frame
* curFrame
)
192 curFrame
->m_encData
->m_frameEncoderID
= m_frameEncoderID
; // Each Frame knows the ID of the FrameEncoder encoding it
193 curFrame
->m_encData
->m_slice
->m_mref
= m_mref
;
196 if (!initializeGeoms(*curFrame
->m_encData
))
// Thread entry point: waits on m_enable, later triggers m_done, and keeps
// looping for as long as m_threadActive is set (destroy() clears it).
203 void FrameEncoder::threadMain()
205 // worker thread routine for FrameEncoder
208 m_enable
.wait(); // Encoder::encode() triggers this event
212 m_done
.trigger(); // FrameEncoder::getEncodedPicture() blocks for this event
215 while (m_threadActive
);
// Produces the NAL units for m_frame. In order, the visible code: emits the
// optional access unit delimiter and stream headers, runs weighted prediction
// analysis, initializes the motion references, obtains the frame QP from rate
// control, prepares the per-row entropy coders and substreams, writes the
// buffering period / recovery point / picture timing SEI messages, gathers
// 2-pass statistics, codes the slice header and substreams, writes the decoded
// picture hash SEI, reports the access unit size to rate control, folds the
// noise reduction statistics and finally drops its references on the
// reference frames.
218 void FrameEncoder::compressFrame()
220 PPAScopeEvent(FrameEncoder_compressFrame
);
221 int64_t startCompressTime
= x265_mdate();
222 Slice
* slice
= m_frame
->m_encData
->m_slice
;
224 /* Emit access unit delimiter unless this is the first frame and the user is
225 * not repeating headers (since AUD is supposed to be the first NAL in the access
227 if (m_param
->bEnableAccessUnitDelimiters
&& (m_frame
->m_poc
|| m_param
->bRepeatHeaders
))
230 m_entropyCoder
.setBitstream(&m_bs
);
231 m_entropyCoder
.codeAUD(*slice
);
232 m_bs
.writeByteAlignment();
233 m_nalList
.serialize(NAL_UNIT_ACCESS_UNIT_DELIMITER
, m_bs
);
// Keyframes repeat the stream headers when bRepeatHeaders is set.
235 if (m_frame
->m_lowres
.bKeyframe
&& m_param
->bRepeatHeaders
)
236 m_top
->getStreamHeaders(m_nalList
, m_entropyCoder
, m_bs
);
238 // Weighted Prediction parameters estimation.
239 bool bUseWeightP
= slice
->m_sliceType
== P_SLICE
&& slice
->m_pps
->bUseWeightPred
;
240 bool bUseWeightB
= slice
->m_sliceType
== B_SLICE
&& slice
->m_pps
->bUseWeightedBiPred
;
241 if (bUseWeightP
|| bUseWeightB
)
242 weightAnalyse(*slice
, *m_frame
, *m_param
);
244 slice
->disableWeights();
246 // Generate motion references
// One prediction list for P slices, two for B slices, none otherwise.
247 int numPredDir
= slice
->isInterP() ? 1 : slice
->isInterB() ? 2 : 0;
248 for (int l
= 0; l
< numPredDir
; l
++)
250 for (int ref
= 0; ref
< slice
->m_numRefIdx
[l
]; ref
++)
// Pass the weight table only when weighting is in use and flagged present
// for this reference; otherwise the reference is initialized with NULL.
252 WeightParam
*w
= NULL
;
253 if ((bUseWeightP
|| bUseWeightB
) && slice
->m_weightPredTable
[l
][ref
][0].bPresentFlag
)
254 w
= slice
->m_weightPredTable
[l
][ref
];
255 m_mref
[l
][ref
].init(slice
->m_refPicList
[l
][ref
]->m_reconPicYuv
, w
);
259 /* Get the QP for this frame from rate control. This call may block until
260 * frames ahead of it in encode order have called rateControlEnd() */
261 int qp
= m_top
->m_rateControl
->rateControlStart(m_frame
, &m_rce
, m_top
);
264 /* Clip slice QP to the [-QP_BD_OFFSET, QP_MAX_SPEC] range before encoding */
265 slice
->m_sliceQp
= Clip3(-QP_BD_OFFSET
, QP_MAX_SPEC
, qp
);
267 m_initSliceContext
.resetEntropy(*slice
);
269 m_frameFilter
.start(m_frame
, m_initSliceContext
, qp
);
271 // reset entropy coders
272 m_entropyCoder
.load(m_initSliceContext
);
273 for (int i
= 0; i
< m_numRows
; i
++)
274 m_rows
[i
].init(m_initSliceContext
);
// One substream per CTU row when WPP is enabled, otherwise a single substream.
276 uint32_t numSubstreams
= m_param
->bEnableWavefront
? slice
->m_sps
->numCuInHeight
: 1;
279 m_outStreams
= new Bitstream
[numSubstreams
];
280 m_substreamSizes
= X265_MALLOC(uint32_t, numSubstreams
);
// Without SAO the row coders are pointed directly at the output substreams.
281 if (!m_param
->bEnableSAO
)
282 for (uint32_t i
= 0; i
< numSubstreams
; i
++)
283 m_rows
[i
].rowGoOnCoder
.setBitstream(&m_outStreams
[i
]);
286 for (uint32_t i
= 0; i
< numSubstreams
; i
++)
287 m_outStreams
[i
].resetBits();
// Keyframes carry a buffering period SEI (when HRD SEI is enabled) and a
// recovery point SEI.
289 if (m_frame
->m_lowres
.bKeyframe
)
291 if (m_param
->bEmitHRDSEI
)
293 SEIBufferingPeriod
* bpSei
= &m_top
->m_rateControl
->m_bufPeriodSEI
;
295 // since the temporal layer HRD is not ready, we assumed it is fixed
296 bpSei
->m_auCpbRemovalDelayDelta
= 1;
297 bpSei
->m_cpbDelayOffset
= 0;
298 bpSei
->m_dpbDelayOffset
= 0;
300 // hrdFullness() calculates the initial CPB removal delay and offset
301 m_top
->m_rateControl
->hrdFullness(bpSei
);
304 bpSei
->write(m_bs
, *slice
->m_sps
);
305 m_bs
.writeByteAlignment();
307 m_nalList
.serialize(NAL_UNIT_PREFIX_SEI
, m_bs
);
// Remember which frame (in encode order) carried the latest buffering period
// SEI; the picture timing SEI below measures its removal delay from it.
309 m_top
->m_lastBPSEI
= m_rce
.encodeOrder
;
312 // The recovery point SEI message assists a decoder in determining when the decoding
313 // process will produce acceptable pictures for display after the decoder initiates
314 // random access. The m_recoveryPocCnt is in units of POC(picture order count) which
315 // means pictures encoded after the CRA but precede it in display order(leading) are
316 // implicitly discarded after a random access seek regardless of the value of
317 // m_recoveryPocCnt. Our encoder does not use references prior to the most recent CRA,
318 // so all pictures following the CRA in POC order are guaranteed to be displayable,
319 // so m_recoveryPocCnt is always 0.
320 SEIRecoveryPoint sei_recovery_point
;
321 sei_recovery_point
.m_recoveryPocCnt
= 0;
322 sei_recovery_point
.m_exactMatchingFlag
= true;
323 sei_recovery_point
.m_brokenLinkFlag
= false;
326 sei_recovery_point
.write(m_bs
, *slice
->m_sps
);
327 m_bs
.writeByteAlignment();
329 m_nalList
.serialize(NAL_UNIT_PREFIX_SEI
, m_bs
);
// Picture timing SEI, emitted for HRD signalling and/or interlaced sources.
332 if (m_param
->bEmitHRDSEI
|| !!m_param
->interlaceMode
)
334 SEIPictureTiming
*sei
= m_rce
.picTimingSEI
;
335 const VUI
*vui
= &slice
->m_sps
->vuiParameters
;
336 const HRDInfo
*hrd
= &vui
->hrdParameters
;
337 int poc
= slice
->m_poc
;
339 if (vui
->frameFieldInfoPresentFlag
)
// The field parity alternates with the POC; which parity comes first depends
// on interlaceMode.
341 if (m_param
->interlaceMode
== 2)
342 sei
->m_picStruct
= (poc
& 1) ? 1 /* top */ : 2 /* bottom */;
343 else if (m_param
->interlaceMode
== 1)
344 sei
->m_picStruct
= (poc
& 1) ? 2 /* bottom */ : 1 /* top */;
346 sei
->m_picStruct
= 0;
347 sei
->m_sourceScanType
= 0;
348 sei
->m_duplicateFlag
= false;
351 if (vui
->hrdParametersPresentFlag
)
353 // The m_aucpbremoval delay specifies how many clock ticks the
354 // access unit associated with the picture timing SEI message has to
355 // wait after removal of the access unit with the most recent
356 // buffering period SEI message
357 sei
->m_auCpbRemovalDelay
= X265_MIN(X265_MAX(1, m_rce
.encodeOrder
- m_top
->m_lastBPSEI
), (1 << hrd
->cpbRemovalDelayLength
));
358 sei
->m_picDpbOutputDelay
= slice
->m_sps
->numReorderPics
+ poc
- m_rce
.encodeOrder
;
362 sei
->write(m_bs
, *slice
->m_sps
);
363 m_bs
.writeByteAlignment();
364 m_nalList
.serialize(NAL_UNIT_PREFIX_SEI
, m_bs
);
367 // Analyze CTU rows, most of the hard work is done here
368 // frame is compressed in a wave-front pattern if WPP is enabled. Loop filter runs as a
369 // wave-front behind the CU compression and reconstruction
372 if (m_param
->rc
.bStatWrite
)
374 int totalI
= 0, totalP
= 0, totalSkip
= 0;
376 // accumulate intra,inter,skip cu count per frame for 2 pass
377 for (int i
= 0; i
< m_numRows
; i
++)
379 m_frameStats
.mvBits
+= m_rows
[i
].rowStats
.mvBits
;
380 m_frameStats
.coeffBits
+= m_rows
[i
].rowStats
.coeffBits
;
381 m_frameStats
.miscBits
+= m_rows
[i
].rowStats
.miscBits
;
382 totalI
+= m_rows
[i
].rowStats
.iCuCnt
;
383 totalP
+= m_rows
[i
].rowStats
.pCuCnt
;
384 totalSkip
+= m_rows
[i
].rowStats
.skipCuCnt
;
// NOTE(review): totalCuCount is used as a divisor below without a zero check -
// confirm it cannot be 0 at this point.
386 int totalCuCount
= totalI
+ totalP
+ totalSkip
;
387 m_frameStats
.percentIntra
= (double)totalI
/ totalCuCount
;
388 m_frameStats
.percentInter
= (double)totalP
/ totalCuCount
;
389 m_frameStats
.percentSkip
= (double)totalSkip
/ totalCuCount
;
// Code the slice header into m_bs, starting from the initial slice context.
393 m_entropyCoder
.load(m_initSliceContext
);
394 m_entropyCoder
.setBitstream(&m_bs
);
395 m_entropyCoder
.codeSliceHeader(*slice
, *m_frame
->m_encData
);
397 // finish encode of each CTU row, only required when SAO is enabled
398 if (m_param
->bEnableSAO
)
401 // serialize each row, record final lengths in slice header
402 uint32_t maxStreamSize
= m_nalList
.serializeSubstreams(m_substreamSizes
, numSubstreams
, m_outStreams
);
404 // complete the slice header by writing WPP row-starts
405 m_entropyCoder
.setBitstream(&m_bs
);
406 if (slice
->m_pps
->bEntropyCodingSyncEnabled
)
407 m_entropyCoder
.codeSliceHeaderWPPEntryPoints(*slice
, m_substreamSizes
, maxStreamSize
);
408 m_bs
.writeByteAlignment();
410 m_nalList
.serialize(slice
->m_nalUnitType
, m_bs
);
// Decoded picture hash SEI: decodedPictureHashSEI selects MD5 (1), CRC (2) or
// checksum (3); the digest of each of the three planes is finalized first.
412 if (m_param
->decodedPictureHashSEI
)
414 if (m_param
->decodedPictureHashSEI
== 1)
416 m_seiReconPictureDigest
.m_method
= SEIDecodedPictureHash::MD5
;
417 for (int i
= 0; i
< 3; i
++)
418 MD5Final(&m_state
[i
], m_seiReconPictureDigest
.m_digest
[i
]);
420 else if (m_param
->decodedPictureHashSEI
== 2)
422 m_seiReconPictureDigest
.m_method
= SEIDecodedPictureHash::CRC
;
423 for (int i
= 0; i
< 3; i
++)
424 crcFinish(m_crc
[i
], m_seiReconPictureDigest
.m_digest
[i
]);
426 else if (m_param
->decodedPictureHashSEI
== 3)
428 m_seiReconPictureDigest
.m_method
= SEIDecodedPictureHash::CHECKSUM
;
429 for (int i
= 0; i
< 3; i
++)
430 checksumFinish(m_checksum
[i
], m_seiReconPictureDigest
.m_digest
[i
]);
434 m_seiReconPictureDigest
.write(m_bs
, *slice
->m_sps
);
435 m_bs
.writeByteAlignment();
437 m_nalList
.serialize(NAL_UNIT_SUFFIX_SEI
, m_bs
);
// Size of the access unit as reported to rate control: NAL payload bytes with
// prefix/suffix SEI NALs left out and the start code prefix subtracted.
441 for (uint32_t i
= 0; i
< m_nalList
.m_numNal
; i
++)
443 int type
= m_nalList
.m_nal
[i
].type
;
446 if (type
!= NAL_UNIT_PREFIX_SEI
&& type
!= NAL_UNIT_SUFFIX_SEI
)
448 bytes
+= m_nalList
.m_nal
[i
].sizeBytes
;
449 // and exclude start code prefix
// 4 bytes for the first NAL and for SPS/PPS NALs, 3 bytes for all others.
450 bytes
-= (!i
|| type
== NAL_UNIT_SPS
|| type
== NAL_UNIT_PPS
) ? 4 : 3;
453 m_accessUnitBits
= bytes
<< 3;
// x265_mdate() differences are divided by 1000000 to store seconds.
455 m_elapsedCompressTime
= (double)(x265_mdate() - startCompressTime
) / 1000000;
456 /* rateControlEnd may also block for earlier frames to call rateControlUpdateStats */
457 if (m_top
->m_rateControl
->rateControlEnd(m_frame
, m_accessUnitBits
, &m_rce
, &m_frameStats
) < 0)
458 m_top
->m_aborted
= true;
460 /* Accumulate NR statistics from all worker threads */
463 for (int i
= 0; i
< m_top
->m_numThreadLocalData
; i
++)
465 NoiseReduction
* nr
= &m_top
->m_threadLocalData
[i
].analysis
.m_quant
.m_frameNr
[m_frameEncoderID
];
466 for (int cat
= 0; cat
< MAX_NUM_TR_CATEGORIES
; cat
++)
468 for(int coeff
= 0; coeff
< MAX_NUM_TR_COEFFS
; coeff
++)
469 m_nr
->residualSum
[cat
][coeff
] += nr
->residualSum
[cat
][coeff
];
471 m_nr
->count
[cat
] += nr
->count
[cat
];
476 noiseReductionUpdate();
478 /* Copy updated NR coefficients back to all worker threads */
481 for (int i
= 0; i
< m_top
->m_numThreadLocalData
; i
++)
483 NoiseReduction
* nr
= &m_top
->m_threadLocalData
[i
].analysis
.m_quant
.m_frameNr
[m_frameEncoderID
];
// NOTE(review): the sizes below assume the offsetDenoise, count and residualSum
// elements are all uint32_t - confirm against the NoiseReduction declaration.
484 memcpy(nr
->offsetDenoise
, m_nr
->offsetDenoise
, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES
* MAX_NUM_TR_COEFFS
);
485 memset(nr
->count
, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES
);
486 memset(nr
->residualSum
, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES
* MAX_NUM_TR_COEFFS
);
490 // Decrement referenced frame reference counts, allow them to be recycled
491 for (int l
= 0; l
< numPredDir
; l
++)
493 for (int ref
= 0; ref
< slice
->m_numRefIdx
[l
]; ref
++)
495 Frame
*refpic
= slice
->m_refPicList
[l
][ref
];
496 ATOMIC_DEC(&refpic
->m_countRefEncoders
);
// Writes the final bitstream of every CTU in the slice. For each CTU the
// entropy coder is pointed at the substream of the CTU's row, SAO parameters
// are coded (or reset) and the CTU itself is entropy-coded; with WPP enabled
// the CABAC contexts are synchronized between rows and each row is finished
// separately.
501 void FrameEncoder::encodeSlice()
503 Slice
* slice
= m_frame
->m_encData
->m_slice
;
504 const uint32_t widthInLCUs
= slice
->m_sps
->numCuInWidth
;
// Number of CTUs to code: m_endCUAddr divided by NUM_CU_PARTITIONS, rounded up.
505 const uint32_t lastCUAddr
= (slice
->m_endCUAddr
+ NUM_CU_PARTITIONS
- 1) / NUM_CU_PARTITIONS
;
506 const uint32_t numSubstreams
= m_param
->bEnableWavefront
? slice
->m_sps
->numCuInHeight
: 1;
// saoParam stays NULL when the SPS does not enable SAO.
508 SAOParam
* saoParam
= slice
->m_sps
->bUseSAO
? m_frame
->m_encData
->m_saoParam
: NULL
;
509 for (uint32_t cuAddr
= 0; cuAddr
< lastCUAddr
; cuAddr
++)
// col/lin are the CTU's column and row; the row selects the substream.
511 uint32_t col
= cuAddr
% widthInLCUs
;
512 uint32_t lin
= cuAddr
/ widthInLCUs
;
513 uint32_t subStrm
= lin
% numSubstreams
;
514 CUData
* ctu
= m_frame
->m_encData
->getPicCTU(cuAddr
);
516 m_entropyCoder
.setBitstream(&m_outStreams
[subStrm
]);
518 // Synchronize cabac probabilities with upper-right CTU if it's available and we're at the start of a line.
519 if (m_param
->bEnableWavefront
&& !col
&& lin
)
521 m_entropyCoder
.copyState(m_initSliceContext
);
522 m_entropyCoder
.loadContexts(m_rows
[lin
- 1].bufferedEntropy
);
// SAO syntax: merge-left is coded first, merge-up only when there is a row
// above and merge-left is not used, explicit offsets only when neither merge
// applies.
527 if (saoParam
->bSaoFlag
[0] || saoParam
->bSaoFlag
[1])
529 int mergeLeft
= col
&& saoParam
->ctuParam
[0][cuAddr
].mergeMode
== SAO_MERGE_LEFT
;
530 int mergeUp
= lin
&& saoParam
->ctuParam
[0][cuAddr
].mergeMode
== SAO_MERGE_UP
;
532 m_entropyCoder
.codeSaoMerge(mergeLeft
);
533 if (lin
&& !mergeLeft
)
534 m_entropyCoder
.codeSaoMerge(mergeUp
);
535 if (!mergeLeft
&& !mergeUp
)
// bSaoFlag[0] gates component 0; bSaoFlag[1] gates components 1 and 2 together.
537 if (saoParam
->bSaoFlag
[0])
538 m_entropyCoder
.codeSaoOffset(saoParam
->ctuParam
[0][cuAddr
], 0);
539 if (saoParam
->bSaoFlag
[1])
541 m_entropyCoder
.codeSaoOffset(saoParam
->ctuParam
[1][cuAddr
], 1);
542 m_entropyCoder
.codeSaoOffset(saoParam
->ctuParam
[2][cuAddr
], 2);
548 for (int i
= 0; i
< 3; i
++)
549 saoParam
->ctuParam
[i
][cuAddr
].reset();
553 // final coding (bitstream generation) for this CU
554 m_entropyCoder
.encodeCTU(*ctu
, m_cuGeoms
[m_ctuGeomMap
[cuAddr
]]);
556 if (m_param
->bEnableWavefront
)
559 // Store probabilities of second CTU in line into buffer
560 m_rows
[lin
].bufferedEntropy
.loadContexts(m_entropyCoder
);
// With WPP every row is terminated after its last CTU.
562 if (col
== widthInLCUs
- 1)
563 m_entropyCoder
.finishSlice();
// Without WPP the slice is terminated once.
566 if (!m_param
->bEnableWavefront
)
567 m_entropyCoder
.finishSlice();
// Drives analysis of all CTU rows of the frame. With a thread pool and WPP
// enabled, rows are enabled one by one as the reference frames reconstruct the
// rows they depend on, and the work is done through the wavefront queue until
// m_completionEvent fires. Otherwise every row is encoded, and filtered
// m_filterRowDelay rows behind, by direct processRow() calls.
570 void FrameEncoder::compressCTURows()
572 PPAScopeEvent(FrameEncoder_compressRows
);
573 Slice
* slice
= m_frame
->m_encData
->m_slice
;
// Reset the per-frame state left over from the previous frame.
575 m_bAllRowsStop
= false;
576 m_vbvResetTriggerRow
= -1;
578 m_SSDY
= m_SSDU
= m_SSDV
= 0;
581 memset(&m_frameStats
, 0, sizeof(m_frameStats
));
583 bool bUseWeightP
= slice
->m_pps
->bUseWeightPred
&& slice
->m_sliceType
== P_SLICE
;
584 bool bUseWeightB
= slice
->m_pps
->bUseWeightedBiPred
&& slice
->m_sliceType
== B_SLICE
;
585 int numPredDir
= slice
->isInterP() ? 1 : slice
->isInterB() ? 2 : 0;
587 m_rows
[0].active
= true;
588 if (m_pool
&& m_param
->bEnableWavefront
)
590 WaveFront::clearEnabledRowMask();
591 WaveFront::enqueue();
593 for (int row
= 0; row
< m_numRows
; row
++)
595 // block until all reference frames have reconstructed the rows we need
596 for (int l
= 0; l
< numPredDir
; l
++)
598 for (int ref
= 0; ref
< slice
->m_numRefIdx
[l
]; ref
++)
600 Frame
*refpic
= slice
->m_refPicList
[l
][ref
];
// Wait until the reference is fully reconstructed or is at least m_refLagRows
// rows ahead of the row about to be enabled.
602 int reconRowCount
= refpic
->m_reconRowCount
.get();
603 while ((reconRowCount
!= m_numRows
) && (reconRowCount
< row
+ m_refLagRows
))
604 reconRowCount
= refpic
->m_reconRowCount
.waitForChange(reconRowCount
);
606 if ((bUseWeightP
|| bUseWeightB
) && m_mref
[l
][ref
].isWeighted
)
607 m_mref
[l
][ref
].applyWeight(row
+ m_refLagRows
, m_numRows
);
611 enableRowEncoder(row
);
613 enqueueRowEncoder(0);
615 m_pool
->pokeIdleThread();
618 m_completionEvent
.wait();
620 WaveFront::dequeue();
// Direct path: the loop runs m_filterRowDelay iterations past the last row so
// that the trailing rows still get filtered.
624 for (int i
= 0; i
< this->m_numRows
+ m_filterRowDelay
; i
++)
629 // block until all reference frames have reconstructed the rows we need
630 for (int l
= 0; l
< numPredDir
; l
++)
633 for (int ref
= 0; ref
< slice
->m_numRefIdx
[list
]; ref
++)
635 Frame
*refpic
= slice
->m_refPicList
[list
][ref
];
637 int reconRowCount
= refpic
->m_reconRowCount
.get();
638 while ((reconRowCount
!= m_numRows
) && (reconRowCount
< i
+ m_refLagRows
))
639 reconRowCount
= refpic
->m_reconRowCount
.waitForChange(reconRowCount
);
// NOTE(review): this loop indexes with both `l` and `list`; presumably `list`
// is a local alias of `l` - confirm.
641 if ((bUseWeightP
|| bUseWeightB
) && m_mref
[l
][ref
].isWeighted
)
642 m_mref
[list
][ref
].applyWeight(i
+ m_refLagRows
, m_numRows
);
// Job ids are row * 2 for the row encoder and row * 2 + 1 for the row filter
// (see processRow()); threadId -1 means "not a pool worker".
646 processRow(i
* 2 + 0, -1);
650 if (i
>= m_filterRowDelay
)
651 processRow((i
- m_filterRowDelay
) * 2 + 1, -1);
654 m_frameTime
= (double)m_totalTime
/ 1000000;
// Runs one wavefront job.
//   row      - job id: the CTU row is row >> 1 and the low bit is the job type
//              (compressCTURows() issues row * 2 for encoding and row * 2 + 1
//              for filtering)
//   threadId - index into the encoder's thread local data; a negative value
//              selects this frame encoder's own m_tld
658 void FrameEncoder::processRow(int row
, int threadId
)
660 const int realRow
= row
>> 1;
661 const int typeNum
= row
& 1;
663 ThreadLocalData
& tld
= threadId
>= 0 ? m_top
->m_threadLocalData
[threadId
] : *m_tld
;
666 processRowEncoder(realRow
, tld
);
669 processRowFilter(realRow
);
671 // NOTE: Active next row
672 if (realRow
!= m_numRows
- 1)
673 enqueueRowFilter(realRow
+ 1);
675 m_completionEvent
.trigger();
679 // Called by worker threads
// Analyzes and entropy-codes the CTUs of one row, resuming at
// curRow.completed. Along the way it assigns the per-CTU QP, collects
// statistics, runs the diagonal VBV check (which can reset rows for a
// re-encode), activates the row below once this row is two CTUs ahead, and on
// row completion feeds rate control and enables the row filters.
//   row - CTU row to encode
//   tld - thread local analysis state used for this row
680 void FrameEncoder::processRowEncoder(int row
, ThreadLocalData
& tld
)
682 PPAScopeEvent(Thread_ProcessRow
);
684 CTURow
& curRow
= m_rows
[row
];
687 ScopedLock
self(curRow
.lock
);
689 /* VBV restart is in progress, exit out */
693 /* On multi-socket Windows servers, we have seen problems with
694 * ATOMIC_CAS which resulted in multiple worker threads processing
695 * the same CU row, which often resulted in bad pointer accesses. We
696 * believe the problem is fixed, but are leaving this check in place
697 * to prevent crashes in case it is not */
698 x265_log(m_param
, X265_LOG_WARNING
,
699 "internal error - simultaneous row access detected. Please report HW to x265-devel@videolan.org\n");
705 /* When WPP is enabled, every row has its own row coder instance. Otherwise
706 * they share row 0 */
707 Entropy
& rowCoder
= m_param
->bEnableWavefront
? m_rows
[row
].rowGoOnCoder
: m_rows
[0].rowGoOnCoder
;
708 FrameData
& curEncData
= *m_frame
->m_encData
;
709 Slice
*slice
= curEncData
.m_slice
;
710 PicYuv
* fencPic
= m_frame
->m_origPicYuv
;
712 tld
.analysis
.m_me
.setSourcePlane(fencPic
->m_picOrg
[0], fencPic
->m_stride
);
714 int64_t startTime
= x265_mdate();
715 const uint32_t numCols
= m_numCols
;
716 const uint32_t lineStartCUAddr
= row
* numCols
;
// VBV is considered active only when both a buffer size and a max bitrate are set.
717 bool bIsVbv
= m_param
->rc
.vbvBufferSize
> 0 && m_param
->rc
.vbvMaxBitrate
> 0;
719 while (curRow
.completed
< numCols
)
721 int col
= curRow
.completed
;
722 const uint32_t cuAddr
= lineStartCUAddr
+ col
;
723 CUData
* ctu
= curEncData
.getPicCTU(cuAddr
);
724 ctu
->initCTU(*m_frame
, cuAddr
, slice
->m_sliceQp
);
730 curEncData
.m_rowStat
[row
].diagQp
= curEncData
.m_avgQpRc
;
731 curEncData
.m_rowStat
[row
].diagQpScale
= x265_qp2qScale(curEncData
.m_avgQpRc
);
// Base QP source: at or before the row diagonal (and not on the VBV reset
// row) it is inherited from the CTU above-right; the other assignments below
// use the row's diagonal QP or the frame average QP.
734 if (row
>= col
&& row
&& m_vbvResetTriggerRow
!= row
)
735 curEncData
.m_cuStat
[cuAddr
].baseQp
= curEncData
.m_cuStat
[cuAddr
- numCols
+ 1].baseQp
;
737 curEncData
.m_cuStat
[cuAddr
].baseQp
= curEncData
.m_rowStat
[row
].diagQp
;
740 curEncData
.m_cuStat
[cuAddr
].baseQp
= curEncData
.m_avgQpRc
;
742 if (m_param
->rc
.aqMode
|| bIsVbv
)
// The analysis QP is set from the value returned by calcQpForCu(); the QP
// stored in the CTU is additionally clipped to [QP_MIN, QP_MAX_SPEC].
744 int qp
= calcQpForCu(cuAddr
, curEncData
.m_cuStat
[cuAddr
].baseQp
);
745 tld
.analysis
.setQP(*slice
, qp
);
746 qp
= Clip3(QP_MIN
, QP_MAX_SPEC
, qp
);
747 ctu
->setQPSubParts((char)qp
, 0, 0);
748 curEncData
.m_rowStat
[row
].sumQpAq
+= qp
;
751 tld
.analysis
.setQP(*slice
, slice
->m_sliceQp
);
753 if (m_param
->bEnableWavefront
&& !col
&& row
)
755 // Load SBAC coder context from previous row and initialize row state.
756 rowCoder
.copyState(m_initSliceContext
);
757 rowCoder
.loadContexts(m_rows
[row
- 1].bufferedEntropy
);
760 // Does all the CU analysis, returns best top level mode decision
761 Search::Mode
& best
= tld
.analysis
.compressCTU(*ctu
, *m_frame
, m_cuGeoms
[m_ctuGeomMap
[cuAddr
]], rowCoder
);
763 /* advance top-level row coder to include the context of this CTU.
764 * if SAO is disabled, rowCoder writes the final CTU bitstream */
765 rowCoder
.encodeCTU(*ctu
, m_cuGeoms
[m_ctuGeomMap
[cuAddr
]]);
767 if (m_param
->bEnableWavefront
&& col
== 1)
768 // Save CABAC state for next row
769 curRow
.bufferedEntropy
.loadContexts(rowCoder
);
771 // Completed CU processing
774 if (m_param
->bLogCuStats
|| m_param
->rc
.bStatWrite
)
775 collectCTUStatistics(*ctu
);
777 // copy no. of intra, inter Cu cnt per row into frame stats for 2 pass
778 if (m_param
->rc
.bStatWrite
)
780 curRow
.rowStats
.mvBits
+= best
.mvBits
;
781 curRow
.rowStats
.coeffBits
+= best
.coeffBits
;
// Misc bits are whatever remains of the total after mv and coefficient bits.
782 curRow
.rowStats
.miscBits
+= best
.totalBits
- (best
.mvBits
+ best
.coeffBits
);
783 StatisticLog
* log
= &m_sliceTypeLog
[slice
->m_sliceType
];
785 for (uint32_t depth
= 0; depth
<= g_maxCUDepth
; depth
++)
787 /* 1 << shift == number of 8x8 blocks at current depth */
788 int shift
= 2 * (g_maxCUDepth
- depth
);
789 curRow
.rowStats
.iCuCnt
+= log
->qTreeIntraCnt
[depth
] << shift
;
790 curRow
.rowStats
.pCuCnt
+= log
->qTreeInterCnt
[depth
] << shift
;
791 curRow
.rowStats
.skipCuCnt
+= log
->qTreeSkipCnt
[depth
] << shift
;
793 // clear the row cu data from thread local object
794 log
->qTreeIntraCnt
[depth
] = log
->qTreeInterCnt
[depth
] = log
->qTreeSkipCnt
[depth
] = 0;
798 curEncData
.m_cuStat
[cuAddr
].totalBits
= best
.totalBits
;
803 // Update encoded bits, satdCost, baseQP for each CU
804 curEncData
.m_rowStat
[row
].diagSatd
+= curEncData
.m_cuStat
[cuAddr
].vbvCost
;
805 curEncData
.m_rowStat
[row
].diagIntraSatd
+= curEncData
.m_cuStat
[cuAddr
].intraVbvCost
;
806 curEncData
.m_rowStat
[row
].encodedBits
+= curEncData
.m_cuStat
[cuAddr
].totalBits
;
807 curEncData
.m_rowStat
[row
].sumQpRc
+= curEncData
.m_cuStat
[cuAddr
].baseQp
;
808 curEncData
.m_rowStat
[row
].numEncodedCUs
= cuAddr
;
810 // If current block is at row diagonal checkpoint, call vbv ratecontrol.
812 if (row
== col
&& row
)
// rowDiagonalVbvRateControl() receives qpBase and its result is kept in
// reEncode; qpBase is then clipped and stored as the row's diagonal QP.
814 double qpBase
= curEncData
.m_cuStat
[cuAddr
].baseQp
;
815 int reEncode
= m_top
->m_rateControl
->rowDiagonalVbvRateControl(m_frame
, row
, &m_rce
, qpBase
);
816 qpBase
= Clip3((double)QP_MIN
, (double)QP_MAX_MAX
, qpBase
);
817 curEncData
.m_rowStat
[row
].diagQp
= qpBase
;
818 curEncData
.m_rowStat
[row
].diagQpScale
= x265_qp2qScale(qpBase
);
822 x265_log(m_param
, X265_LOG_DEBUG
, "POC %d row %d - encode restart required for VBV, to %.2f from %.2f\n",
823 m_frame
->m_poc
, row
, qpBase
, curEncData
.m_cuStat
[cuAddr
].baseQp
);
825 // prevent the WaveFront::findJob() method from providing new jobs
826 m_vbvResetTriggerRow
= row
;
827 m_bAllRowsStop
= true;
// Walk from the last row up to this one, stopping each row and clearing its
// progress so it can be encoded again.
829 for (int r
= m_numRows
- 1; r
>= row
; r
--)
831 CTURow
& stopRow
= m_rows
[r
];
835 /* if row was active (ready to be run) clear active bit and bitmap bit for this row */
836 stopRow
.lock
.acquire();
837 while (stopRow
.active
)
839 if (dequeueRow(r
* 2))
840 stopRow
.active
= false;
845 stopRow
.lock
.release();
// The busy flag is sampled under the row lock.
847 bool bRowBusy
= true;
850 stopRow
.lock
.acquire();
851 bRowBusy
= stopRow
.busy
;
852 stopRow
.lock
.release();
// Discard the row's bitstream, progress and accumulated statistics.
862 m_outStreams
[r
].resetBits();
863 stopRow
.completed
= 0;
864 memset(&stopRow
.rowStats
, 0, sizeof(stopRow
.rowStats
));
865 curEncData
.m_rowStat
[r
].numEncodedCUs
= 0;
866 curEncData
.m_rowStat
[r
].encodedBits
= 0;
867 curEncData
.m_rowStat
[r
].diagSatd
= 0;
868 curEncData
.m_rowStat
[r
].diagIntraSatd
= 0;
869 curEncData
.m_rowStat
[r
].sumQpRc
= 0;
870 curEncData
.m_rowStat
[r
].sumQpAq
= 0;
873 m_bAllRowsStop
= false;
878 // NOTE: do CU level Filter
879 if (m_param
->bEnableSAO
&& m_param
->bSaoNonDeblocked
)
880 // SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas
881 m_frameFilter
.m_sao
.calcSaoStatsCu_BeforeDblk(m_frame
, col
, row
);
883 // NOTE: active next row
// The row below is activated once this row is at least two CTUs ahead of it,
// unless a VBV stop is pending for that row.
884 if (curRow
.completed
>= 2 && row
< m_numRows
- 1)
886 ScopedLock
below(m_rows
[row
+ 1].lock
);
887 if (m_rows
[row
+ 1].active
== false &&
888 m_rows
[row
+ 1].completed
+ 2 <= curRow
.completed
&&
889 (!m_bAllRowsStop
|| row
+ 1 < m_vbvResetTriggerRow
))
891 m_rows
[row
+ 1].active
= true;
892 enqueueRowEncoder(row
+ 1);
// This row goes inactive when a VBV stop applies to it or when it has caught
// up to within two CTUs of the row above.
896 ScopedLock
self(curRow
.lock
);
897 if ((m_bAllRowsStop
&& row
> m_vbvResetTriggerRow
) ||
898 (row
> 0 && curRow
.completed
< numCols
- 1 && m_rows
[row
- 1].completed
< m_rows
[row
].completed
+ 2))
900 curRow
.active
= false;
902 m_totalTime
+= x265_mdate() - startTime
;
907 /* *this row of CTUs has been encoded* */
909 /* flush row bitstream (if WPP and no SAO) or flush frame if no WPP and no SAO */
910 if (!m_param
->bEnableSAO
&& (m_param
->bEnableWavefront
|| row
== m_numRows
- 1))
911 rowCoder
.finishSlice();
913 /* If encoding with ABR, update bits and complexity in rate control
914 * after a number of rows so the next frame's rateControlStart has more
915 * accurate data for estimation. At the start of the encode we update stats
916 * after half the frame is encoded, but after this initial period we update
917 * after refLagRows (the number of rows reference frames must have completed
918 * before referencing frames may begin encoding) */
920 if (m_param
->rc
.rateControlMode
== X265_RC_ABR
)
// "Start of the encode" is the first 2 * (fpsNum / fpsDenom) frames in encode
// order; note the frame rate division is an integer division.
922 if ((uint32_t)m_rce
.encodeOrder
<= 2 * (m_param
->fpsNum
/ m_param
->fpsDenom
))
923 rowCount
= X265_MIN((m_numRows
+ 1) / 2, m_numRows
- 1);
925 rowCount
= X265_MIN(m_refLagRows
, m_numRows
- 1);
929 m_rce
.rowTotalBits
= 0;
931 for (int i
= 0; i
< rowCount
; i
++)
932 m_rce
.rowTotalBits
+= curEncData
.m_rowStat
[i
].encodedBits
;
934 for (uint32_t cuAddr
= 0; cuAddr
< rowCount
* numCols
; cuAddr
++)
935 m_rce
.rowTotalBits
+= curEncData
.m_cuStat
[cuAddr
].totalBits
;
937 m_top
->m_rateControl
->rateControlUpdateStats(&m_rce
);
940 // trigger row-wise loop filters
941 if (row
>= m_filterRowDelay
)
943 enableRowFilter(row
- m_filterRowDelay
);
945 // NOTE: Active Filter to first row (row 0)
946 if (row
== m_filterRowDelay
)
// After the last row is encoded, the loop below walks the final
// m_filterRowDelay rows, which no later row will trigger.
949 if (row
== m_numRows
- 1)
951 for (int i
= m_numRows
- m_filterRowDelay
; i
< m_numRows
; i
++)
955 m_totalTime
+= x265_mdate() - startTime
;
// Tallies per-depth CU statistics of one CTU into the log of its slice type.
// The CTU is walked partition by partition, advancing by the partition count
// of the CU found at each position. I slices only count intra CUs; other
// slices distinguish skipped, inter and intra CUs.
//   ctu - the analyzed CTU whose depth, partition size and mode arrays are read
959 void FrameEncoder::collectCTUStatistics(CUData
& ctu
)
961 StatisticLog
* log
= &m_sliceTypeLog
[ctu
.m_slice
->m_sliceType
];
963 if (ctu
.m_slice
->m_sliceType
== I_SLICE
)
// Step size: a CU at depth d covers m_numPartitions >> (2 * d) partitions.
966 for (uint32_t absPartIdx
= 0; absPartIdx
< ctu
.m_numPartitions
; absPartIdx
+= ctu
.m_numPartitions
>> (depth
* 2))
968 depth
= ctu
.m_cuDepth
[absPartIdx
];
971 log
->cntIntra
[depth
]++;
972 log
->qTreeIntraCnt
[depth
]++;
// Positions with no partition size assigned are not counted: undo the increments.
974 if (ctu
.m_partSize
[absPartIdx
] == SIZE_NONE
)
977 log
->cntIntra
[depth
]--;
978 log
->qTreeIntraCnt
[depth
]--;
980 else if (ctu
.m_partSize
[absPartIdx
] == SIZE_NxN
)
982 /* TODO: log intra modes at absPartIdx +0 to +3 */
983 X265_CHECK(depth
== g_maxCUDepth
, "Intra NxN found at improbable depth\n");
985 log
->cntIntra
[depth
]--;
// Luma modes above 1 are grouped under ANGULAR_MODE_ID; modes 0 and 1 are
// logged individually.
987 else if (ctu
.m_lumaIntraDir
[absPartIdx
] > 1)
988 log
->cuIntraDistribution
[depth
][ANGULAR_MODE_ID
]++;
990 log
->cuIntraDistribution
[depth
][ctu
.m_lumaIntraDir
[absPartIdx
]]++;
// Non-I slices: classify every CU as skipped, inter or intra.
996 for (uint32_t absPartIdx
= 0; absPartIdx
< ctu
.m_numPartitions
; absPartIdx
+= ctu
.m_numPartitions
>> (depth
* 2))
998 depth
= ctu
.m_cuDepth
[absPartIdx
];
1001 log
->cntTotalCu
[depth
]++;
1003 if (ctu
.m_partSize
[absPartIdx
] == SIZE_NONE
)
1006 log
->cntTotalCu
[depth
]--;
1008 else if (ctu
.isSkipped(absPartIdx
))
1011 log
->cntSkipCu
[depth
]++;
1012 log
->qTreeSkipCnt
[depth
]++;
1014 else if (ctu
.m_predMode
[absPartIdx
] == MODE_INTER
)
1016 log
->cntInter
[depth
]++;
1017 log
->qTreeInterCnt
[depth
]++;
// Partition sizes below AMP_ID are logged individually; the rest share AMP_ID.
1019 if (ctu
.m_partSize
[absPartIdx
] < AMP_ID
)
1020 log
->cuInterDistribution
[depth
][ctu
.m_partSize
[absPartIdx
]]++;
1022 log
->cuInterDistribution
[depth
][AMP_ID
]++;
1024 else if (ctu
.m_predMode
[absPartIdx
] == MODE_INTRA
)
1026 log
->cntIntra
[depth
]++;
1027 log
->qTreeIntraCnt
[depth
]++;
1029 if (ctu
.m_partSize
[absPartIdx
] == SIZE_NxN
)
1031 X265_CHECK(depth
== g_maxCUDepth
, "Intra NxN found at improbable depth\n");
1033 /* TODO: log intra modes at absPartIdx +0 to +3 */
1035 else if (ctu
.m_lumaIntraDir
[absPartIdx
] > 1)
1036 log
->cuIntraDistribution
[depth
][ANGULAR_MODE_ID
]++;
1038 log
->cuIntraDistribution
[depth
][ctu
.m_lumaIntraDir
[absPartIdx
]]++;
1044 /* DCT-domain noise reduction / adaptive deadzone from libavcodec */
// Recomputes m_nr->offsetDenoise from the accumulated residual sums and block
// counts. For every transform category the accumulators are halved once the
// block count exceeds the budget of that transform size, then each offset is
// set to (noiseReduction * count + sum / 2) / (sum + 1).
1045 void FrameEncoder::noiseReductionUpdate()
// Block count limits, indexed by trSize (the low two bits of the category).
1050 static const uint32_t maxBlocksPerTrSize
[4] = {1 << 18, 1 << 16, 1 << 14, 1 << 12};
1052 for (int cat
= 0; cat
< MAX_NUM_TR_CATEGORIES
; cat
++)
// Coefficient count per block: 16, 64, 256 or 1024 for trSize 0..3.
1054 int trSize
= cat
& 3;
1055 int coefCount
= 1 << ((trSize
+ 2) * 2);
1057 if (m_nr
->count
[cat
] > maxBlocksPerTrSize
[trSize
])
1059 for (int i
= 0; i
< coefCount
; i
++)
1060 m_nr
->residualSum
[cat
][i
] >>= 1;
1061 m_nr
->count
[cat
] >>= 1;
1064 uint64_t scaledCount
= (uint64_t)m_param
->noiseReduction
* m_nr
->count
[cat
];
1066 for (int i
= 0; i
< coefCount
; i
++)
// Adding half the sum rounds the division; the +1 keeps the divisor non-zero.
1068 uint64_t value
= scaledCount
+ m_nr
->residualSum
[cat
][i
] / 2;
1069 uint64_t denom
= m_nr
->residualSum
[cat
][i
] + 1;
1070 m_nr
->offsetDenoise
[cat
][i
] = (uint16_t)(value
/ denom
);
1073 // Don't denoise DC coefficients
1074 m_nr
->offsetDenoise
[cat
][0] = 0;
// Derives the QP for one CTU by walking the 16x16 blocks the CTU covers.
// AQ or cuTree offsets are accumulated when aqMode is set, and the lowres
// costs of the blocks are summed into the CTU's vbvCost / intraVbvCost.
//   ctuAddr - raster address of the CTU
//   baseQp  - QP the offsets are applied to
// Returns the rounded QP clipped to [QP_MIN, QP_MAX_MAX].
1078 int FrameEncoder::calcQpForCu(uint32_t ctuAddr
, double baseQp
)
1083 FrameData
& curEncData
= *m_frame
->m_encData
;
1084 /* clear cuCostsForVbv from when vbv row reset was triggered */
1085 bool bIsVbv
= m_param
->rc
.vbvBufferSize
> 0 && m_param
->rc
.vbvMaxBitrate
> 0;
1088 curEncData
.m_cuStat
[ctuAddr
].vbvCost
= 0;
1089 curEncData
.m_cuStat
[ctuAddr
].intraVbvCost
= 0;
1092 /* Derive qpOffset for each CU by averaging offsets for all 16x16 blocks in the cu. */
1093 double qp_offset
= 0;
// Picture size in 16x16 blocks, rounded up.
1094 uint32_t maxBlockCols
= (m_frame
->m_origPicYuv
->m_picWidth
+ (16 - 1)) / 16;
1095 uint32_t maxBlockRows
= (m_frame
->m_origPicYuv
->m_picHeight
+ (16 - 1)) / 16;
1096 uint32_t noOfBlocks
= g_maxCUSize
/ 16;
// block_y = CTU row * noOfBlocks; block_x works out to CTU column * noOfBlocks.
1097 uint32_t block_y
= (ctuAddr
/ curEncData
.m_slice
->m_sps
->numCuInWidth
) * noOfBlocks
;
1098 uint32_t block_x
= (ctuAddr
* noOfBlocks
) - block_y
* curEncData
.m_slice
->m_sps
->numCuInWidth
;
1100 /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
1101 bool isReferenced
= IS_REFERENCED(m_frame
);
1102 double *qpoffs
= (isReferenced
&& m_param
->rc
.cuTree
) ? m_frame
->m_lowres
.qpCuTreeOffset
: m_frame
->m_lowres
.qpAqOffset
;
1104 uint32_t cnt
= 0, idx
= 0;
// Both loops also stop at the picture edge, so CTUs on the right/bottom
// border visit fewer blocks.
1105 for (uint32_t h
= 0; h
< noOfBlocks
&& block_y
< maxBlockRows
; h
++, block_y
++)
1107 for (uint32_t w
= 0; w
< noOfBlocks
&& (block_x
+ w
) < maxBlockCols
; w
++)
1109 idx
= block_x
+ w
+ (block_y
* maxBlockCols
);
1110 if (m_param
->rc
.aqMode
)
1111 qp_offset
+= qpoffs
[idx
];
1114 curEncData
.m_cuStat
[ctuAddr
].vbvCost
+= m_frame
->m_lowres
.lowresCostForRc
[idx
] & LOWRES_COST_MASK
;
1115 curEncData
.m_cuStat
[ctuAddr
].intraVbvCost
+= m_frame
->m_lowres
.intraCost
[idx
];
1124 return Clip3(QP_MIN
, QP_MAX_MAX
, (int)(qp
+ 0.5));
1127 Frame
*FrameEncoder::getEncodedPicture(NALList
& output
)
1131 /* block here until worker thread completes */
1134 Frame
*ret
= m_frame
;
1136 output
.takeContents(m_nalList
);