5f4d2f7cf56847f3ca091711dc04615a60929aa2
[deb_x265.git] / source / encoder / frameencoder.cpp
1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
3 *
4 * Authors: Chung Shin Yee <shinyee@multicorewareinc.com>
5 * Min Chen <chenm003@163.com>
6 * Steve Borho <steve@borho.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21 *
22 * This program is also available under a commercial proprietary license.
23 * For more information, contact us at license @ x265.com.
24 *****************************************************************************/
25
26 #include "common.h"
27 #include "frame.h"
28 #include "framedata.h"
29 #include "wavefront.h"
30 #include "param.h"
31
32 #include "encoder.h"
33 #include "frameencoder.h"
34 #include "common.h"
35 #include "slicetype.h"
36 #include "nal.h"
37
38 namespace x265 {
39 void weightAnalyse(Slice& slice, Frame& frame, x265_param& param);
40
41 FrameEncoder::FrameEncoder()
42 : WaveFront(NULL)
43 , m_threadActive(true)
44 {
45 m_totalTime = 0;
46 m_frameEncoderID = 0;
47 m_bAllRowsStop = false;
48 m_vbvResetTriggerRow = -1;
49 m_outStreams = NULL;
50 m_substreamSizes = NULL;
51 m_nr = NULL;
52 m_tld = NULL;
53 m_rows = NULL;
54 m_top = NULL;
55 m_param = NULL;
56 m_frame = NULL;
57 m_cuGeoms = NULL;
58 m_ctuGeomMap = NULL;
59 memset(&m_frameStats, 0, sizeof(m_frameStats));
60 memset(&m_rce, 0, sizeof(RateControlEntry));
61 }
62
63 void FrameEncoder::destroy()
64 {
65 if (m_pool)
66 JobProvider::flush(); // ensure no worker threads are using this frame
67
68 m_threadActive = false;
69 m_enable.trigger();
70
71 delete[] m_rows;
72 delete[] m_outStreams;
73 X265_FREE(m_cuGeoms);
74 X265_FREE(m_ctuGeomMap);
75 X265_FREE(m_substreamSizes);
76 X265_FREE(m_nr);
77
78 m_frameFilter.destroy();
79
80 if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
81 {
82 delete m_rce.picTimingSEI;
83 delete m_rce.hrdTiming;
84 }
85
86 // wait for worker thread to exit
87 stop();
88 }
89
90 bool FrameEncoder::init(Encoder *top, int numRows, int numCols, int id)
91 {
92 m_top = top;
93 m_param = top->m_param;
94 m_numRows = numRows;
95 m_numCols = numCols;
96 m_filterRowDelay = (m_param->bEnableSAO && m_param->bSaoNonDeblocked) ?
97 2 : (m_param->bEnableSAO || m_param->bEnableLoopFilter ? 1 : 0);
98 m_filterRowDelayCus = m_filterRowDelay * numCols;
99 m_frameEncoderID = id;
100 m_rows = new CTURow[m_numRows];
101 bool ok = !!m_numRows;
102
103 int range = m_param->searchRange; /* fpel search */
104 range += 1; /* diamond search range check lag */
105 range += 2; /* subpel refine */
106 range += NTAPS_LUMA / 2; /* subpel filter half-length */
107 m_refLagRows = 1 + ((range + g_maxCUSize - 1) / g_maxCUSize);
108
109 // NOTE: 2 times of numRows because both Encoder and Filter in same queue
110 if (!WaveFront::init(m_numRows * 2))
111 {
112 x265_log(m_param, X265_LOG_ERROR, "unable to initialize wavefront queue\n");
113 m_pool = NULL;
114 }
115
116 m_frameFilter.init(top, this, numRows);
117
118 // initialize HRD parameters of SPS
119 if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
120 {
121 m_rce.picTimingSEI = new SEIPictureTiming;
122 m_rce.hrdTiming = new HRDTiming;
123
124 ok &= m_rce.picTimingSEI && m_rce.hrdTiming;
125 }
126
127 if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
128 m_nr = X265_MALLOC(NoiseReduction, 1);
129 if (m_nr)
130 memset(m_nr, 0, sizeof(NoiseReduction));
131 else
132 m_param->noiseReductionIntra = m_param->noiseReductionInter = 0;
133
134 start();
135 return ok;
136 }
137
138 /* Generate a complete list of unique geom sets for the current picture dimensions */
139 bool FrameEncoder::initializeGeoms()
140 {
141 /* Geoms only vary between CTUs in the presence of picture edges */
142 int maxCUSize = m_param->maxCUSize;
143 int heightRem = m_param->sourceHeight & (maxCUSize - 1);
144 int widthRem = m_param->sourceWidth & (maxCUSize - 1);
145 int allocGeoms = 1; // body
146 if (heightRem && widthRem)
147 allocGeoms = 4; // body, right, bottom, corner
148 else if (heightRem || widthRem)
149 allocGeoms = 2; // body, right or bottom
150
151 m_ctuGeomMap = X265_MALLOC(uint32_t, m_numRows * m_numCols);
152 m_cuGeoms = X265_MALLOC(CUGeom, allocGeoms * CUGeom::MAX_GEOMS);
153 if (!m_cuGeoms || !m_ctuGeomMap)
154 return false;
155
156 // body
157 CUData::calcCTUGeoms(maxCUSize, maxCUSize, maxCUSize, m_cuGeoms);
158 memset(m_ctuGeomMap, 0, sizeof(uint32_t) * m_numRows * m_numCols);
159 if (allocGeoms == 1)
160 return true;
161
162 int countGeoms = 1;
163 if (widthRem)
164 {
165 // right
166 CUData::calcCTUGeoms(widthRem, maxCUSize, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
167 for (int i = 0; i < m_numRows; i++)
168 {
169 uint32_t ctuAddr = m_numCols * (i + 1) - 1;
170 m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
171 }
172 countGeoms++;
173 }
174 if (heightRem)
175 {
176 // bottom
177 CUData::calcCTUGeoms(maxCUSize, heightRem, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
178 for (uint32_t i = 0; i < m_numCols; i++)
179 {
180 uint32_t ctuAddr = m_numCols * (m_numRows - 1) + i;
181 m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
182 }
183 countGeoms++;
184
185 if (widthRem)
186 {
187 // corner
188 CUData::calcCTUGeoms(widthRem, heightRem, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
189
190 uint32_t ctuAddr = m_numCols * m_numRows - 1;
191 m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
192 countGeoms++;
193 }
194 X265_CHECK(countGeoms == allocGeoms, "geometry match check failure\n");
195 }
196
197 return true;
198 }
199
200 bool FrameEncoder::startCompressFrame(Frame* curFrame)
201 {
202 m_frame = curFrame;
203 curFrame->m_encData->m_frameEncoderID = m_frameEncoderID; // Each Frame knows the ID of the FrameEncoder encoding it
204 curFrame->m_encData->m_slice->m_mref = m_mref;
205
206 if (!m_cuGeoms)
207 {
208 if (!initializeGeoms())
209 return false;
210 }
211
212 m_enable.trigger();
213 return true;
214 }
215
216 void FrameEncoder::threadMain()
217 {
218 // worker thread routine for FrameEncoder
219 do
220 {
221 m_enable.wait(); // Encoder::encode() triggers this event
222 if (m_threadActive)
223 {
224 compressFrame();
225 m_done.trigger(); // FrameEncoder::getEncodedPicture() blocks for this event
226 }
227 }
228 while (m_threadActive);
229 }
230
231 void FrameEncoder::compressFrame()
232 {
233 //ProfileScopeEvent(frameThread);
234 int64_t startCompressTime = x265_mdate();
235 Slice* slice = m_frame->m_encData->m_slice;
236
237 /* Emit access unit delimiter unless this is the first frame and the user is
238 * not repeating headers (since AUD is supposed to be the first NAL in the access
239 * unit) */
240 if (m_param->bEnableAccessUnitDelimiters && (m_frame->m_poc || m_param->bRepeatHeaders))
241 {
242 m_bs.resetBits();
243 m_entropyCoder.setBitstream(&m_bs);
244 m_entropyCoder.codeAUD(*slice);
245 m_bs.writeByteAlignment();
246 m_nalList.serialize(NAL_UNIT_ACCESS_UNIT_DELIMITER, m_bs);
247 }
248 if (m_frame->m_lowres.bKeyframe && m_param->bRepeatHeaders)
249 m_top->getStreamHeaders(m_nalList, m_entropyCoder, m_bs);
250
251 // Weighted Prediction parameters estimation.
252 bool bUseWeightP = slice->m_sliceType == P_SLICE && slice->m_pps->bUseWeightPred;
253 bool bUseWeightB = slice->m_sliceType == B_SLICE && slice->m_pps->bUseWeightedBiPred;
254 if (bUseWeightP || bUseWeightB)
255 weightAnalyse(*slice, *m_frame, *m_param);
256 else
257 slice->disableWeights();
258
259 // Generate motion references
260 int numPredDir = slice->isInterP() ? 1 : slice->isInterB() ? 2 : 0;
261 for (int l = 0; l < numPredDir; l++)
262 {
263 for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
264 {
265 WeightParam *w = NULL;
266 if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].bPresentFlag)
267 w = slice->m_weightPredTable[l][ref];
268 m_mref[l][ref].init(slice->m_refPicList[l][ref]->m_reconPic, w, *m_param);
269 }
270 }
271
272 /* Get the QP for this frame from rate control. This call may block until
273 * frames ahead of it in encode order have called rateControlEnd() */
274 int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
275 m_rce.newQp = qp;
276
277 /* Clip slice QP to 0-51 spec range before encoding */
278 slice->m_sliceQp = Clip3(-QP_BD_OFFSET, QP_MAX_SPEC, qp);
279
280 m_initSliceContext.resetEntropy(*slice);
281
282 m_frameFilter.start(m_frame, m_initSliceContext, qp);
283
284 // reset entropy coders
285 m_entropyCoder.load(m_initSliceContext);
286 for (int i = 0; i < m_numRows; i++)
287 m_rows[i].init(m_initSliceContext);
288
289 uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : 1;
290 if (!m_outStreams)
291 {
292 m_outStreams = new Bitstream[numSubstreams];
293 m_substreamSizes = X265_MALLOC(uint32_t, numSubstreams);
294 if (!m_param->bEnableSAO)
295 for (uint32_t i = 0; i < numSubstreams; i++)
296 m_rows[i].rowGoOnCoder.setBitstream(&m_outStreams[i]);
297 }
298 else
299 for (uint32_t i = 0; i < numSubstreams; i++)
300 m_outStreams[i].resetBits();
301
302 if (m_frame->m_lowres.bKeyframe)
303 {
304 if (m_param->bEmitHRDSEI)
305 {
306 SEIBufferingPeriod* bpSei = &m_top->m_rateControl->m_bufPeriodSEI;
307
308 // since the temporal layer HRD is not ready, we assumed it is fixed
309 bpSei->m_auCpbRemovalDelayDelta = 1;
310 bpSei->m_cpbDelayOffset = 0;
311 bpSei->m_dpbDelayOffset = 0;
312
313 // hrdFullness() calculates the initial CPB removal delay and offset
314 m_top->m_rateControl->hrdFullness(bpSei);
315
316 m_bs.resetBits();
317 bpSei->write(m_bs, *slice->m_sps);
318 m_bs.writeByteAlignment();
319
320 m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
321
322 m_top->m_lastBPSEI = m_rce.encodeOrder;
323 }
324
325 // The recovery point SEI message assists a decoder in determining when the decoding
326 // process will produce acceptable pictures for display after the decoder initiates
327 // random access. The m_recoveryPocCnt is in units of POC(picture order count) which
328 // means pictures encoded after the CRA but precede it in display order(leading) are
329 // implicitly discarded after a random access seek regardless of the value of
330 // m_recoveryPocCnt. Our encoder does not use references prior to the most recent CRA,
331 // so all pictures following the CRA in POC order are guaranteed to be displayable,
332 // so m_recoveryPocCnt is always 0.
333 SEIRecoveryPoint sei_recovery_point;
334 sei_recovery_point.m_recoveryPocCnt = 0;
335 sei_recovery_point.m_exactMatchingFlag = true;
336 sei_recovery_point.m_brokenLinkFlag = false;
337
338 m_bs.resetBits();
339 sei_recovery_point.write(m_bs, *slice->m_sps);
340 m_bs.writeByteAlignment();
341
342 m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
343 }
344
345 if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
346 {
347 SEIPictureTiming *sei = m_rce.picTimingSEI;
348 const VUI *vui = &slice->m_sps->vuiParameters;
349 const HRDInfo *hrd = &vui->hrdParameters;
350 int poc = slice->m_poc;
351
352 if (vui->frameFieldInfoPresentFlag)
353 {
354 if (m_param->interlaceMode == 2)
355 sei->m_picStruct = (poc & 1) ? 1 /* top */ : 2 /* bottom */;
356 else if (m_param->interlaceMode == 1)
357 sei->m_picStruct = (poc & 1) ? 2 /* bottom */ : 1 /* top */;
358 else
359 sei->m_picStruct = 0;
360 sei->m_sourceScanType = 0;
361 sei->m_duplicateFlag = false;
362 }
363
364 if (vui->hrdParametersPresentFlag)
365 {
366 // The m_aucpbremoval delay specifies how many clock ticks the
367 // access unit associated with the picture timing SEI message has to
368 // wait after removal of the access unit with the most recent
369 // buffering period SEI message
370 sei->m_auCpbRemovalDelay = X265_MIN(X265_MAX(1, m_rce.encodeOrder - m_top->m_lastBPSEI), (1 << hrd->cpbRemovalDelayLength));
371 sei->m_picDpbOutputDelay = slice->m_sps->numReorderPics + poc - m_rce.encodeOrder;
372 }
373
374 m_bs.resetBits();
375 sei->write(m_bs, *slice->m_sps);
376 m_bs.writeByteAlignment();
377 m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
378 }
379
380 // Analyze CTU rows, most of the hard work is done here
381 // frame is compressed in a wave-front pattern if WPP is enabled. Loop filter runs as a
382 // wave-front behind the CU compression and reconstruction
383 compressCTURows();
384
385 if (m_param->rc.bStatWrite)
386 {
387 int totalI = 0, totalP = 0, totalSkip = 0;
388
389 // accumulate intra,inter,skip cu count per frame for 2 pass
390 for (int i = 0; i < m_numRows; i++)
391 {
392 m_frameStats.mvBits += m_rows[i].rowStats.mvBits;
393 m_frameStats.coeffBits += m_rows[i].rowStats.coeffBits;
394 m_frameStats.miscBits += m_rows[i].rowStats.miscBits;
395 totalI += m_rows[i].rowStats.iCuCnt;
396 totalP += m_rows[i].rowStats.pCuCnt;
397 totalSkip += m_rows[i].rowStats.skipCuCnt;
398 }
399 int totalCuCount = totalI + totalP + totalSkip;
400 m_frameStats.percentIntra = (double)totalI / totalCuCount;
401 m_frameStats.percentInter = (double)totalP / totalCuCount;
402 m_frameStats.percentSkip = (double)totalSkip / totalCuCount;
403 }
404
405 m_bs.resetBits();
406 m_entropyCoder.load(m_initSliceContext);
407 m_entropyCoder.setBitstream(&m_bs);
408 m_entropyCoder.codeSliceHeader(*slice, *m_frame->m_encData);
409
410 // finish encode of each CTU row, only required when SAO is enabled
411 if (m_param->bEnableSAO)
412 encodeSlice();
413
414 // serialize each row, record final lengths in slice header
415 uint32_t maxStreamSize = m_nalList.serializeSubstreams(m_substreamSizes, numSubstreams, m_outStreams);
416
417 // complete the slice header by writing WPP row-starts
418 m_entropyCoder.setBitstream(&m_bs);
419 if (slice->m_pps->bEntropyCodingSyncEnabled)
420 m_entropyCoder.codeSliceHeaderWPPEntryPoints(*slice, m_substreamSizes, maxStreamSize);
421 m_bs.writeByteAlignment();
422
423 m_nalList.serialize(slice->m_nalUnitType, m_bs);
424
425 if (m_param->decodedPictureHashSEI)
426 {
427 if (m_param->decodedPictureHashSEI == 1)
428 {
429 m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::MD5;
430 for (int i = 0; i < 3; i++)
431 MD5Final(&m_state[i], m_seiReconPictureDigest.m_digest[i]);
432 }
433 else if (m_param->decodedPictureHashSEI == 2)
434 {
435 m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CRC;
436 for (int i = 0; i < 3; i++)
437 crcFinish(m_crc[i], m_seiReconPictureDigest.m_digest[i]);
438 }
439 else if (m_param->decodedPictureHashSEI == 3)
440 {
441 m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CHECKSUM;
442 for (int i = 0; i < 3; i++)
443 checksumFinish(m_checksum[i], m_seiReconPictureDigest.m_digest[i]);
444 }
445
446 m_bs.resetBits();
447 m_seiReconPictureDigest.write(m_bs, *slice->m_sps);
448 m_bs.writeByteAlignment();
449
450 m_nalList.serialize(NAL_UNIT_SUFFIX_SEI, m_bs);
451 }
452
453 uint64_t bytes = 0;
454 for (uint32_t i = 0; i < m_nalList.m_numNal; i++)
455 {
456 int type = m_nalList.m_nal[i].type;
457
458 // exclude SEI
459 if (type != NAL_UNIT_PREFIX_SEI && type != NAL_UNIT_SUFFIX_SEI)
460 {
461 bytes += m_nalList.m_nal[i].sizeBytes;
462 // and exclude start code prefix
463 bytes -= (!i || type == NAL_UNIT_SPS || type == NAL_UNIT_PPS) ? 4 : 3;
464 }
465 }
466 m_accessUnitBits = bytes << 3;
467
468 m_elapsedCompressTime = (double)(x265_mdate() - startCompressTime) / 1000000;
469 /* rateControlEnd may also block for earlier frames to call rateControlUpdateStats */
470 if (m_top->m_rateControl->rateControlEnd(m_frame, m_accessUnitBits, &m_rce, &m_frameStats) < 0)
471 m_top->m_aborted = true;
472
473 /* Accumulate NR statistics from all worker threads */
474 if (m_nr)
475 {
476 for (int i = 0; i < m_top->m_numThreadLocalData; i++)
477 {
478 NoiseReduction* nr = &m_top->m_threadLocalData[i].analysis.m_quant.m_frameNr[m_frameEncoderID];
479 for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
480 {
481 for(int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++)
482 m_nr->residualSum[cat][coeff] += nr->residualSum[cat][coeff];
483
484 m_nr->count[cat] += nr->count[cat];
485 }
486 }
487 }
488
489 noiseReductionUpdate();
490
491 /* Copy updated NR coefficients back to all worker threads */
492 if (m_nr)
493 {
494 for (int i = 0; i < m_top->m_numThreadLocalData; i++)
495 {
496 NoiseReduction* nr = &m_top->m_threadLocalData[i].analysis.m_quant.m_frameNr[m_frameEncoderID];
497 memcpy(nr->offsetDenoise, m_nr->offsetDenoise, sizeof(uint16_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
498 memset(nr->count, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES);
499 memset(nr->residualSum, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
500 }
501 }
502
503 // Decrement referenced frame reference counts, allow them to be recycled
504 for (int l = 0; l < numPredDir; l++)
505 {
506 for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
507 {
508 Frame *refpic = slice->m_refPicList[l][ref];
509 ATOMIC_DEC(&refpic->m_countRefEncoders);
510 }
511 }
512 }
513
514 void FrameEncoder::encodeSlice()
515 {
516 Slice* slice = m_frame->m_encData->m_slice;
517 const uint32_t widthInLCUs = slice->m_sps->numCuInWidth;
518 const uint32_t lastCUAddr = (slice->m_endCUAddr + NUM_CU_PARTITIONS - 1) / NUM_CU_PARTITIONS;
519 const uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : 1;
520
521 SAOParam* saoParam = slice->m_sps->bUseSAO ? m_frame->m_encData->m_saoParam : NULL;
522 for (uint32_t cuAddr = 0; cuAddr < lastCUAddr; cuAddr++)
523 {
524 uint32_t col = cuAddr % widthInLCUs;
525 uint32_t lin = cuAddr / widthInLCUs;
526 uint32_t subStrm = lin % numSubstreams;
527 CUData* ctu = m_frame->m_encData->getPicCTU(cuAddr);
528
529 m_entropyCoder.setBitstream(&m_outStreams[subStrm]);
530
531 // Synchronize cabac probabilities with upper-right CTU if it's available and we're at the start of a line.
532 if (m_param->bEnableWavefront && !col && lin)
533 {
534 m_entropyCoder.copyState(m_initSliceContext);
535 m_entropyCoder.loadContexts(m_rows[lin - 1].bufferedEntropy);
536 }
537
538 if (saoParam)
539 {
540 if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1])
541 {
542 int mergeLeft = col && saoParam->ctuParam[0][cuAddr].mergeMode == SAO_MERGE_LEFT;
543 int mergeUp = lin && saoParam->ctuParam[0][cuAddr].mergeMode == SAO_MERGE_UP;
544 if (col)
545 m_entropyCoder.codeSaoMerge(mergeLeft);
546 if (lin && !mergeLeft)
547 m_entropyCoder.codeSaoMerge(mergeUp);
548 if (!mergeLeft && !mergeUp)
549 {
550 if (saoParam->bSaoFlag[0])
551 m_entropyCoder.codeSaoOffset(saoParam->ctuParam[0][cuAddr], 0);
552 if (saoParam->bSaoFlag[1])
553 {
554 m_entropyCoder.codeSaoOffset(saoParam->ctuParam[1][cuAddr], 1);
555 m_entropyCoder.codeSaoOffset(saoParam->ctuParam[2][cuAddr], 2);
556 }
557 }
558 }
559 else
560 {
561 for (int i = 0; i < 3; i++)
562 saoParam->ctuParam[i][cuAddr].reset();
563 }
564 }
565
566 // final coding (bitstream generation) for this CU
567 m_entropyCoder.encodeCTU(*ctu, m_cuGeoms[m_ctuGeomMap[cuAddr]]);
568
569 if (m_param->bEnableWavefront)
570 {
571 if (col == 1)
572 // Store probabilities of second CTU in line into buffer
573 m_rows[lin].bufferedEntropy.loadContexts(m_entropyCoder);
574
575 if (col == widthInLCUs - 1)
576 m_entropyCoder.finishSlice();
577 }
578 }
579 if (!m_param->bEnableWavefront)
580 m_entropyCoder.finishSlice();
581 }
582
583 void FrameEncoder::compressCTURows()
584 {
585 Slice* slice = m_frame->m_encData->m_slice;
586
587 m_bAllRowsStop = false;
588 m_vbvResetTriggerRow = -1;
589
590 m_SSDY = m_SSDU = m_SSDV = 0;
591 m_ssim = 0;
592 m_ssimCnt = 0;
593 memset(&m_frameStats, 0, sizeof(m_frameStats));
594
595 bool bUseWeightP = slice->m_pps->bUseWeightPred && slice->m_sliceType == P_SLICE;
596 bool bUseWeightB = slice->m_pps->bUseWeightedBiPred && slice->m_sliceType == B_SLICE;
597 int numPredDir = slice->isInterP() ? 1 : slice->isInterB() ? 2 : 0;
598
599 m_rows[0].active = true;
600 if (m_pool && m_param->bEnableWavefront)
601 {
602 WaveFront::clearEnabledRowMask();
603 WaveFront::enqueue();
604
605 for (int row = 0; row < m_numRows; row++)
606 {
607 // block until all reference frames have reconstructed the rows we need
608 for (int l = 0; l < numPredDir; l++)
609 {
610 for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
611 {
612 Frame *refpic = slice->m_refPicList[l][ref];
613
614 int reconRowCount = refpic->m_reconRowCount.get();
615 while ((reconRowCount != m_numRows) && (reconRowCount < row + m_refLagRows))
616 reconRowCount = refpic->m_reconRowCount.waitForChange(reconRowCount);
617
618 if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted)
619 m_mref[l][ref].applyWeight(row + m_refLagRows, m_numRows);
620 }
621 }
622
623 enableRowEncoder(row);
624 if (row == 0)
625 enqueueRowEncoder(0);
626 else
627 m_pool->pokeIdleThread();
628 }
629
630 m_completionEvent.wait();
631
632 WaveFront::dequeue();
633 }
634 else
635 {
636 for (int i = 0; i < this->m_numRows + m_filterRowDelay; i++)
637 {
638 // Encode
639 if (i < m_numRows)
640 {
641 // block until all reference frames have reconstructed the rows we need
642 for (int l = 0; l < numPredDir; l++)
643 {
644 int list = l;
645 for (int ref = 0; ref < slice->m_numRefIdx[list]; ref++)
646 {
647 Frame *refpic = slice->m_refPicList[list][ref];
648
649 int reconRowCount = refpic->m_reconRowCount.get();
650 while ((reconRowCount != m_numRows) && (reconRowCount < i + m_refLagRows))
651 reconRowCount = refpic->m_reconRowCount.waitForChange(reconRowCount);
652
653 if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted)
654 m_mref[list][ref].applyWeight(i + m_refLagRows, m_numRows);
655 }
656 }
657
658 processRowEncoder(i, *m_tld);
659 }
660
661 // Filter
662 if (i >= m_filterRowDelay)
663 m_frameFilter.processRow(i - m_filterRowDelay);
664 }
665 }
666 m_frameTime = (double)m_totalTime / 1000000;
667 m_totalTime = 0;
668 }
669
670 void FrameEncoder::processRow(int row, int threadId)
671 {
672 const int realRow = row >> 1;
673 const int typeNum = row & 1;
674
675 ThreadLocalData& tld = threadId >= 0 ? m_top->m_threadLocalData[threadId] : *m_tld;
676
677 if (!typeNum)
678 processRowEncoder(realRow, tld);
679 else
680 {
681 m_frameFilter.processRow(realRow);
682
683 // NOTE: Active next row
684 if (realRow != m_numRows - 1)
685 enqueueRowFilter(realRow + 1);
686 else
687 m_completionEvent.trigger();
688 }
689 }
690
691 // Called by worker threads
692 void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld)
693 {
694 CTURow& curRow = m_rows[row];
695
696 {
697 ScopedLock self(curRow.lock);
698 if (!curRow.active)
699 /* VBV restart is in progress, exit out */
700 return;
701 if (curRow.busy)
702 {
703 /* On multi-socket Windows servers, we have seen problems with
704 * ATOMIC_CAS which resulted in multiple worker threads processing
705 * the same CU row, which often resulted in bad pointer accesses. We
706 * believe the problem is fixed, but are leaving this check in place
707 * to prevent crashes in case it is not */
708 x265_log(m_param, X265_LOG_WARNING,
709 "internal error - simultaneous row access detected. Please report HW to x265-devel@videolan.org\n");
710 return;
711 }
712 curRow.busy = true;
713 }
714
715 /* When WPP is enabled, every row has its own row coder instance. Otherwise
716 * they share row 0 */
717 Entropy& rowCoder = m_param->bEnableWavefront ? m_rows[row].rowGoOnCoder : m_rows[0].rowGoOnCoder;
718 FrameData& curEncData = *m_frame->m_encData;
719 Slice *slice = curEncData.m_slice;
720
721 int64_t startTime = x265_mdate();
722 const uint32_t numCols = m_numCols;
723 const uint32_t lineStartCUAddr = row * numCols;
724 bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
725
726 while (curRow.completed < numCols)
727 {
728 ProfileScopeEvent(encodeCTU);
729
730 int col = curRow.completed;
731 const uint32_t cuAddr = lineStartCUAddr + col;
732 CUData* ctu = curEncData.getPicCTU(cuAddr);
733 ctu->initCTU(*m_frame, cuAddr, slice->m_sliceQp);
734
735 if (bIsVbv)
736 {
737 if (!row)
738 {
739 curEncData.m_rowStat[row].diagQp = curEncData.m_avgQpRc;
740 curEncData.m_rowStat[row].diagQpScale = x265_qp2qScale(curEncData.m_avgQpRc);
741 }
742
743 if (row >= col && row && m_vbvResetTriggerRow != row)
744 curEncData.m_cuStat[cuAddr].baseQp = curEncData.m_cuStat[cuAddr - numCols + 1].baseQp;
745 else
746 curEncData.m_cuStat[cuAddr].baseQp = curEncData.m_rowStat[row].diagQp;
747 }
748 else
749 curEncData.m_cuStat[cuAddr].baseQp = curEncData.m_avgQpRc;
750
751 if (m_param->rc.aqMode || bIsVbv)
752 {
753 int qp = calcQpForCu(cuAddr, curEncData.m_cuStat[cuAddr].baseQp);
754 tld.analysis.setQP(*slice, qp);
755 qp = Clip3(QP_MIN, QP_MAX_SPEC, qp);
756 ctu->setQPSubParts((int8_t)qp, 0, 0);
757 curEncData.m_rowStat[row].sumQpAq += qp;
758 }
759 else
760 tld.analysis.setQP(*slice, slice->m_sliceQp);
761
762 if (m_param->bEnableWavefront && !col && row)
763 {
764 // Load SBAC coder context from previous row and initialize row state.
765 rowCoder.copyState(m_initSliceContext);
766 rowCoder.loadContexts(m_rows[row - 1].bufferedEntropy);
767 }
768
769 // Does all the CU analysis, returns best top level mode decision
770 Mode& best = tld.analysis.compressCTU(*ctu, *m_frame, m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder);
771
772 /* advance top-level row coder to include the context of this CTU.
773 * if SAO is disabled, rowCoder writes the final CTU bitstream */
774 rowCoder.encodeCTU(*ctu, m_cuGeoms[m_ctuGeomMap[cuAddr]]);
775
776 if (m_param->bEnableWavefront && col == 1)
777 // Save CABAC state for next row
778 curRow.bufferedEntropy.loadContexts(rowCoder);
779
780 // Completed CU processing
781 curRow.completed++;
782
783 if (m_param->bLogCuStats || m_param->rc.bStatWrite)
784 collectCTUStatistics(*ctu);
785
786 // copy no. of intra, inter Cu cnt per row into frame stats for 2 pass
787 if (m_param->rc.bStatWrite)
788 {
789 curRow.rowStats.mvBits += best.mvBits;
790 curRow.rowStats.coeffBits += best.coeffBits;
791 curRow.rowStats.miscBits += best.totalBits - (best.mvBits + best.coeffBits);
792 StatisticLog* log = &m_sliceTypeLog[slice->m_sliceType];
793
794 for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
795 {
796 /* 1 << shift == number of 8x8 blocks at current depth */
797 int shift = 2 * (g_maxCUDepth - depth);
798 curRow.rowStats.iCuCnt += log->qTreeIntraCnt[depth] << shift;
799 curRow.rowStats.pCuCnt += log->qTreeInterCnt[depth] << shift;
800 curRow.rowStats.skipCuCnt += log->qTreeSkipCnt[depth] << shift;
801
802 // clear the row cu data from thread local object
803 log->qTreeIntraCnt[depth] = log->qTreeInterCnt[depth] = log->qTreeSkipCnt[depth] = 0;
804 }
805 }
806
807 curEncData.m_cuStat[cuAddr].totalBits = best.totalBits;
808 x265_emms();
809
810 if (bIsVbv)
811 {
812 // Update encoded bits, satdCost, baseQP for each CU
813 curEncData.m_rowStat[row].diagSatd += curEncData.m_cuStat[cuAddr].vbvCost;
814 curEncData.m_rowStat[row].diagIntraSatd += curEncData.m_cuStat[cuAddr].intraVbvCost;
815 curEncData.m_rowStat[row].encodedBits += curEncData.m_cuStat[cuAddr].totalBits;
816 curEncData.m_rowStat[row].sumQpRc += curEncData.m_cuStat[cuAddr].baseQp;
817 curEncData.m_rowStat[row].numEncodedCUs = cuAddr;
818
819 // If current block is at row diagonal checkpoint, call vbv ratecontrol.
820
821 if (row == col && row)
822 {
823 double qpBase = curEncData.m_cuStat[cuAddr].baseQp;
824 int reEncode = m_top->m_rateControl->rowDiagonalVbvRateControl(m_frame, row, &m_rce, qpBase);
825 qpBase = Clip3((double)QP_MIN, (double)QP_MAX_MAX, qpBase);
826 curEncData.m_rowStat[row].diagQp = qpBase;
827 curEncData.m_rowStat[row].diagQpScale = x265_qp2qScale(qpBase);
828
829 if (reEncode < 0)
830 {
831 x265_log(m_param, X265_LOG_DEBUG, "POC %d row %d - encode restart required for VBV, to %.2f from %.2f\n",
832 m_frame->m_poc, row, qpBase, curEncData.m_cuStat[cuAddr].baseQp);
833
834 // prevent the WaveFront::findJob() method from providing new jobs
835 m_vbvResetTriggerRow = row;
836 m_bAllRowsStop = true;
837
838 for (int r = m_numRows - 1; r >= row; r--)
839 {
840 CTURow& stopRow = m_rows[r];
841
842 if (r != row)
843 {
844 /* if row was active (ready to be run) clear active bit and bitmap bit for this row */
845 stopRow.lock.acquire();
846 while (stopRow.active)
847 {
848 if (dequeueRow(r * 2))
849 stopRow.active = false;
850 else
851 {
852 /* we must release the row lock to allow the thread to exit */
853 stopRow.lock.release();
854 GIVE_UP_TIME();
855 stopRow.lock.acquire();
856 }
857 }
858 stopRow.lock.release();
859
860 bool bRowBusy = true;
861 do
862 {
863 stopRow.lock.acquire();
864 bRowBusy = stopRow.busy;
865 stopRow.lock.release();
866
867 if (bRowBusy)
868 {
869 GIVE_UP_TIME();
870 }
871 }
872 while (bRowBusy);
873 }
874
875 m_outStreams[r].resetBits();
876 stopRow.completed = 0;
877 memset(&stopRow.rowStats, 0, sizeof(stopRow.rowStats));
878 curEncData.m_rowStat[r].numEncodedCUs = 0;
879 curEncData.m_rowStat[r].encodedBits = 0;
880 curEncData.m_rowStat[r].diagSatd = 0;
881 curEncData.m_rowStat[r].diagIntraSatd = 0;
882 curEncData.m_rowStat[r].sumQpRc = 0;
883 curEncData.m_rowStat[r].sumQpAq = 0;
884 }
885
886 m_bAllRowsStop = false;
887 }
888 }
889 }
890
891 // NOTE: do CU level Filter
892 if (m_param->bEnableSAO && m_param->bSaoNonDeblocked)
893 // SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas
894 m_frameFilter.m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row);
895
896 // NOTE: active next row
897 if (curRow.completed >= 2 && row < m_numRows - 1)
898 {
899 ScopedLock below(m_rows[row + 1].lock);
900 if (m_rows[row + 1].active == false &&
901 m_rows[row + 1].completed + 2 <= curRow.completed &&
902 (!m_bAllRowsStop || row + 1 < m_vbvResetTriggerRow))
903 {
904 m_rows[row + 1].active = true;
905 enqueueRowEncoder(row + 1);
906 }
907 }
908
909 ScopedLock self(curRow.lock);
910 if ((m_bAllRowsStop && row > m_vbvResetTriggerRow) ||
911 (row > 0 && curRow.completed < numCols - 1 && m_rows[row - 1].completed < m_rows[row].completed + 2))
912 {
913 curRow.active = false;
914 curRow.busy = false;
915 m_totalTime += x265_mdate() - startTime;
916 return;
917 }
918 }
919
920 /* *this row of CTUs has been encoded* */
921
922 /* flush row bitstream (if WPP and no SAO) or flush frame if no WPP and no SAO */
923 if (!m_param->bEnableSAO && (m_param->bEnableWavefront || row == m_numRows - 1))
924 rowCoder.finishSlice();
925
926 /* If encoding with ABR, update update bits and complexity in rate control
927 * after a number of rows so the next frame's rateControlStart has more
928 * accurate data for estimation. At the start of the encode we update stats
929 * after half the frame is encoded, but after this initial period we update
930 * after refLagRows (the number of rows reference frames must have completed
931 * before referencees may begin encoding) */
932 int rowCount = 0;
933 if (m_param->rc.rateControlMode == X265_RC_ABR)
934 {
935 if ((uint32_t)m_rce.encodeOrder <= 2 * (m_param->fpsNum / m_param->fpsDenom))
936 rowCount = X265_MIN((m_numRows + 1) / 2, m_numRows - 1);
937 else
938 rowCount = X265_MIN(m_refLagRows, m_numRows - 1);
939 }
940 if (row == rowCount)
941 {
942 m_rce.rowTotalBits = 0;
943 if (bIsVbv)
944 for (int i = 0; i < rowCount; i++)
945 m_rce.rowTotalBits += curEncData.m_rowStat[i].encodedBits;
946 else
947 for (uint32_t cuAddr = 0; cuAddr < rowCount * numCols; cuAddr++)
948 m_rce.rowTotalBits += curEncData.m_cuStat[cuAddr].totalBits;
949
950 m_top->m_rateControl->rateControlUpdateStats(&m_rce);
951 }
952
953 if (m_param->bEnableWavefront)
954 {
955 /* trigger row-wise loop filters */
956 if (row >= m_filterRowDelay)
957 {
958 enableRowFilter(row - m_filterRowDelay);
959
960 /* NOTE: Activate filter if first row (row 0) */
961 if (row == m_filterRowDelay)
962 enqueueRowFilter(0);
963 }
964 if (row == m_numRows - 1)
965 {
966 for (int i = m_numRows - m_filterRowDelay; i < m_numRows; i++)
967 enableRowFilter(i);
968 }
969 }
970
971 m_totalTime += x265_mdate() - startTime;
972 curRow.busy = false;
973 }
974
975 void FrameEncoder::collectCTUStatistics(CUData& ctu)
976 {
977 StatisticLog* log = &m_sliceTypeLog[ctu.m_slice->m_sliceType];
978
979 if (ctu.m_slice->m_sliceType == I_SLICE)
980 {
981 uint32_t depth = 0;
982 for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2))
983 {
984 depth = ctu.m_cuDepth[absPartIdx];
985
986 log->totalCu++;
987 log->cntIntra[depth]++;
988 log->qTreeIntraCnt[depth]++;
989
990 if (ctu.m_predMode[absPartIdx] == MODE_NONE)
991 {
992 log->totalCu--;
993 log->cntIntra[depth]--;
994 log->qTreeIntraCnt[depth]--;
995 }
996 else if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N)
997 {
998 /* TODO: log intra modes at absPartIdx +0 to +3 */
999 X265_CHECK(depth == g_maxCUDepth, "Intra NxN found at improbable depth\n");
1000 log->cntIntraNxN++;
1001 log->cntIntra[depth]--;
1002 }
1003 else if (ctu.m_lumaIntraDir[absPartIdx] > 1)
1004 log->cuIntraDistribution[depth][ANGULAR_MODE_ID]++;
1005 else
1006 log->cuIntraDistribution[depth][ctu.m_lumaIntraDir[absPartIdx]]++;
1007 }
1008 }
1009 else
1010 {
1011 uint32_t depth = 0;
1012 for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2))
1013 {
1014 depth = ctu.m_cuDepth[absPartIdx];
1015
1016 log->totalCu++;
1017 log->cntTotalCu[depth]++;
1018
1019 if (ctu.m_predMode[absPartIdx] == MODE_NONE)
1020 {
1021 log->totalCu--;
1022 log->cntTotalCu[depth]--;
1023 }
1024 else if (ctu.isSkipped(absPartIdx))
1025 {
1026 log->totalCu--;
1027 log->cntSkipCu[depth]++;
1028 log->qTreeSkipCnt[depth]++;
1029 }
1030 else if (ctu.isInter(absPartIdx))
1031 {
1032 log->cntInter[depth]++;
1033 log->qTreeInterCnt[depth]++;
1034
1035 if (ctu.m_partSize[absPartIdx] < AMP_ID)
1036 log->cuInterDistribution[depth][ctu.m_partSize[absPartIdx]]++;
1037 else
1038 log->cuInterDistribution[depth][AMP_ID]++;
1039 }
1040 else if (ctu.isIntra(absPartIdx))
1041 {
1042 log->cntIntra[depth]++;
1043 log->qTreeIntraCnt[depth]++;
1044
1045 if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N)
1046 {
1047 X265_CHECK(depth == g_maxCUDepth, "Intra NxN found at improbable depth\n");
1048 log->cntIntraNxN++;
1049 /* TODO: log intra modes at absPartIdx +0 to +3 */
1050 }
1051 else if (ctu.m_lumaIntraDir[absPartIdx] > 1)
1052 log->cuIntraDistribution[depth][ANGULAR_MODE_ID]++;
1053 else
1054 log->cuIntraDistribution[depth][ctu.m_lumaIntraDir[absPartIdx]]++;
1055 }
1056 }
1057 }
1058 }
1059
1060 /* DCT-domain noise reduction / adaptive deadzone from libavcodec */
1061 void FrameEncoder::noiseReductionUpdate()
1062 {
1063 if (!m_nr)
1064 return;
1065
1066 static const uint32_t maxBlocksPerTrSize[4] = {1 << 18, 1 << 16, 1 << 14, 1 << 12};
1067
1068 for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
1069 {
1070 int trSize = cat & 3;
1071 int coefCount = 1 << ((trSize + 2) * 2);
1072
1073 if (m_nr->count[cat] > maxBlocksPerTrSize[trSize])
1074 {
1075 for (int i = 0; i < coefCount; i++)
1076 m_nr->residualSum[cat][i] >>= 1;
1077 m_nr->count[cat] >>= 1;
1078 }
1079
1080 int nrStrength = cat < 8 ? m_param->noiseReductionIntra : m_param->noiseReductionInter;
1081 uint64_t scaledCount = (uint64_t)nrStrength * m_nr->count[cat];
1082
1083 for (int i = 0; i < coefCount; i++)
1084 {
1085 uint64_t value = scaledCount + m_nr->residualSum[cat][i] / 2;
1086 uint64_t denom = m_nr->residualSum[cat][i] + 1;
1087 m_nr->offsetDenoise[cat][i] = (uint16_t)(value / denom);
1088 }
1089
1090 // Don't denoise DC coefficients
1091 m_nr->offsetDenoise[cat][0] = 0;
1092 }
1093 }
1094
1095 int FrameEncoder::calcQpForCu(uint32_t ctuAddr, double baseQp)
1096 {
1097 x265_emms();
1098 double qp = baseQp;
1099
1100 FrameData& curEncData = *m_frame->m_encData;
1101 /* clear cuCostsForVbv from when vbv row reset was triggered */
1102 bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
1103 if (bIsVbv)
1104 {
1105 curEncData.m_cuStat[ctuAddr].vbvCost = 0;
1106 curEncData.m_cuStat[ctuAddr].intraVbvCost = 0;
1107 }
1108
1109 /* Derive qpOffet for each CU by averaging offsets for all 16x16 blocks in the cu. */
1110 double qp_offset = 0;
1111 uint32_t maxBlockCols = (m_frame->m_fencPic->m_picWidth + (16 - 1)) / 16;
1112 uint32_t maxBlockRows = (m_frame->m_fencPic->m_picHeight + (16 - 1)) / 16;
1113 uint32_t noOfBlocks = g_maxCUSize / 16;
1114 uint32_t block_y = (ctuAddr / curEncData.m_slice->m_sps->numCuInWidth) * noOfBlocks;
1115 uint32_t block_x = (ctuAddr * noOfBlocks) - block_y * curEncData.m_slice->m_sps->numCuInWidth;
1116
1117 /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
1118 bool isReferenced = IS_REFERENCED(m_frame);
1119 double *qpoffs = (isReferenced && m_param->rc.cuTree) ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
1120
1121 uint32_t cnt = 0, idx = 0;
1122 for (uint32_t h = 0; h < noOfBlocks && block_y < maxBlockRows; h++, block_y++)
1123 {
1124 for (uint32_t w = 0; w < noOfBlocks && (block_x + w) < maxBlockCols; w++)
1125 {
1126 idx = block_x + w + (block_y * maxBlockCols);
1127 if (m_param->rc.aqMode)
1128 qp_offset += qpoffs[idx];
1129 if (bIsVbv)
1130 {
1131 curEncData.m_cuStat[ctuAddr].vbvCost += m_frame->m_lowres.lowresCostForRc[idx] & LOWRES_COST_MASK;
1132 curEncData.m_cuStat[ctuAddr].intraVbvCost += m_frame->m_lowres.intraCost[idx];
1133 }
1134 cnt++;
1135 }
1136 }
1137
1138 qp_offset /= cnt;
1139 qp += qp_offset;
1140
1141 return Clip3(QP_MIN, QP_MAX_MAX, (int)(qp + 0.5));
1142 }
1143
1144 Frame *FrameEncoder::getEncodedPicture(NALList& output)
1145 {
1146 if (m_frame)
1147 {
1148 /* block here until worker thread completes */
1149 m_done.wait();
1150
1151 Frame *ret = m_frame;
1152 m_frame = NULL;
1153 output.takeContents(m_nalList);
1154 return ret;
1155 }
1156
1157 return NULL;
1158 }
1159 }