Imported Upstream version 1.4
[deb_x265.git] / source / encoder / frameencoder.cpp
1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
3 *
4 * Authors: Chung Shin Yee <shinyee@multicorewareinc.com>
5 * Min Chen <chenm003@163.com>
6 * Steve Borho <steve@borho.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21 *
22 * This program is also available under a commercial proprietary license.
23 * For more information, contact us at license @ x265.com.
24 *****************************************************************************/
25
26 #include "common.h"
27 #include "frame.h"
28 #include "framedata.h"
29 #include "wavefront.h"
30 #include "param.h"
31
32 #include "PPA/ppa.h"
33
34 #include "encoder.h"
35 #include "frameencoder.h"
36 #include "common.h"
37 #include "slicetype.h"
38 #include "nal.h"
39
40 namespace x265 {
// Forward declaration only; no definition is visible in this part of the file.
// compressFrame() calls it when weighted prediction is enabled for the current P or B slice.
41 void weightAnalyse(Slice& slice, Frame& frame, x265_param& param);
42
43 FrameEncoder::FrameEncoder()
44 : WaveFront(NULL)
45 , m_threadActive(true)
46 {
47 m_totalTime = 0;
48 m_frameEncoderID = 0;
49 m_bAllRowsStop = false;
50 m_vbvResetTriggerRow = -1;
51 m_outStreams = NULL;
52 m_substreamSizes = NULL;
53 m_nr = NULL;
54 m_tld = NULL;
55 m_rows = NULL;
56 m_top = NULL;
57 m_param = NULL;
58 m_frame = NULL;
59 m_cuGeoms = NULL;
60 m_ctuGeomMap = NULL;
61 memset(&m_frameStats, 0, sizeof(m_frameStats));
62 memset(&m_rce, 0, sizeof(RateControlEntry));
63 }
64
65 void FrameEncoder::destroy()
66 {
67 if (m_pool)
68 JobProvider::flush(); // ensure no worker threads are using this frame
69
70 m_threadActive = false;
71 m_enable.trigger();
72
73 delete[] m_rows;
74 delete[] m_outStreams;
75 X265_FREE(m_cuGeoms);
76 X265_FREE(m_ctuGeomMap);
77 X265_FREE(m_substreamSizes);
78 X265_FREE(m_nr);
79
80 m_frameFilter.destroy();
81
82 if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
83 {
84 delete m_rce.picTimingSEI;
85 delete m_rce.hrdTiming;
86 }
87
88 // wait for worker thread to exit
89 stop();
90 }
91
92 bool FrameEncoder::init(Encoder *top, int numRows, int numCols, int id)
93 {
94 m_top = top;
95 m_param = top->m_param;
96 m_numRows = numRows;
97 m_numCols = numCols;
98 m_filterRowDelay = (m_param->bEnableSAO && m_param->bSaoNonDeblocked) ?
99 2 : (m_param->bEnableSAO || m_param->bEnableLoopFilter ? 1 : 0);
100 m_filterRowDelayCus = m_filterRowDelay * numCols;
101 m_frameEncoderID = id;
102 m_rows = new CTURow[m_numRows];
103 bool ok = !!m_numRows;
104
105 int range = m_param->searchRange; /* fpel search */
106 range += 1; /* diamond search range check lag */
107 range += 2; /* subpel refine */
108 range += NTAPS_LUMA / 2; /* subpel filter half-length */
109 m_refLagRows = 1 + ((range + g_maxCUSize - 1) / g_maxCUSize);
110
111 // NOTE: 2 times of numRows because both Encoder and Filter in same queue
112 if (!WaveFront::init(m_numRows * 2))
113 {
114 x265_log(m_param, X265_LOG_ERROR, "unable to initialize wavefront queue\n");
115 m_pool = NULL;
116 }
117
118 m_frameFilter.init(top, this, numRows);
119
120 // initialize HRD parameters of SPS
121 if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
122 {
123 m_rce.picTimingSEI = new SEIPictureTiming;
124 m_rce.hrdTiming = new HRDTiming;
125
126 ok &= m_rce.picTimingSEI && m_rce.hrdTiming;
127 }
128
129 if (m_param->noiseReduction)
130 m_nr = X265_MALLOC(NoiseReduction, 1);
131 if (m_nr)
132 memset(m_nr, 0, sizeof(NoiseReduction));
133 else
134 m_param->noiseReduction = 0;
135
136 start();
137 return ok;
138 }
139
140 /* Generate a complete list of unique geom sets for the current picture dimensions */
141 bool FrameEncoder::initializeGeoms(const FrameData& encData)
142 {
143 /* Geoms only vary between CTUs in the presence of picture edges */
144 int heightRem = m_param->sourceHeight & (m_param->maxCUSize - 1);
145 int widthRem = m_param->sourceWidth & (m_param->maxCUSize - 1);
146 int allocGeoms = 1; // body
147 if (heightRem && widthRem)
148 allocGeoms = 4; // body, right, bottom, corner
149 else if (heightRem || widthRem)
150 allocGeoms = 2; // body, right or bottom
151
152 m_ctuGeomMap = X265_MALLOC(uint32_t, m_numRows * m_numCols);
153 m_cuGeoms = X265_MALLOC(CUGeom, allocGeoms * CUGeom::MAX_GEOMS);
154 if (!m_cuGeoms || !m_ctuGeomMap)
155 return false;
156
157 CUGeom cuLocalData[CUGeom::MAX_GEOMS];
158 memset(cuLocalData, 0, sizeof(cuLocalData)); // temporal fix for memcmp
159
160 int countGeoms = 0;
161 for (uint32_t ctuAddr = 0; ctuAddr < m_numRows * m_numCols; ctuAddr++)
162 {
163 /* TODO: detach this logic from TComDataCU */
164 encData.m_picCTU[ctuAddr].initCTU(*m_frame, ctuAddr, 0);
165 encData.m_picCTU[ctuAddr].calcCTUGeoms(m_param->sourceWidth, m_param->sourceHeight, m_param->maxCUSize, cuLocalData);
166
167 m_ctuGeomMap[ctuAddr] = MAX_INT;
168 for (int i = 0; i < countGeoms; i++)
169 {
170 if (!memcmp(cuLocalData, m_cuGeoms + i * CUGeom::MAX_GEOMS, sizeof(CUGeom) * CUGeom::MAX_GEOMS))
171 {
172 m_ctuGeomMap[ctuAddr] = i * CUGeom::MAX_GEOMS;
173 break;
174 }
175 }
176
177 if (m_ctuGeomMap[ctuAddr] == MAX_INT)
178 {
179 X265_CHECK(countGeoms < allocGeoms, "geometry match check failure\n");
180 m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
181 memcpy(m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS, cuLocalData, sizeof(CUGeom) * CUGeom::MAX_GEOMS);
182 countGeoms++;
183 }
184 }
185
186 return true;
187 }
188
189 bool FrameEncoder::startCompressFrame(Frame* curFrame)
190 {
191 m_frame = curFrame;
192 curFrame->m_encData->m_frameEncoderID = m_frameEncoderID; // Each Frame knows the ID of the FrameEncoder encoding it
193 curFrame->m_encData->m_slice->m_mref = m_mref;
194 if (!m_cuGeoms)
195 {
196 if (!initializeGeoms(*curFrame->m_encData))
197 return false;
198 }
199 m_enable.trigger();
200 return true;
201 }
202
203 void FrameEncoder::threadMain()
204 {
205 // worker thread routine for FrameEncoder
206 do
207 {
208 m_enable.wait(); // Encoder::encode() triggers this event
209 if (m_threadActive)
210 {
211 compressFrame();
212 m_done.trigger(); // FrameEncoder::getEncodedPicture() blocks for this event
213 }
214 }
215 while (m_threadActive);
216 }
217
// Encodes the frame held in m_frame and appends the resulting NAL units to m_nalList.
//
// Sequence, as implemented below:
//   1. optional access unit delimiter and (on keyframes) repeated stream headers
//   2. weighted prediction analysis and motion reference setup
//   3. rateControlStart() to obtain the frame QP, entropy/context reset
//   4. prefix SEI messages (buffering period, recovery point, picture timing)
//   5. compressCTURows() - the actual CTU analysis and coding
//   6. slice header, substream serialization, optional decoded picture hash SEI
//   7. rateControlEnd() with the access unit size, noise reduction bookkeeping
//   8. release of the reference pictures (m_countRefEncoders decrement)
218 void FrameEncoder::compressFrame()
219 {
220 PPAScopeEvent(FrameEncoder_compressFrame);
221 int64_t startCompressTime = x265_mdate();
222 Slice* slice = m_frame->m_encData->m_slice;
223
224 /* Emit access unit delimiter unless this is the first frame and the user is
225 * not repeating headers (since AUD is supposed to be the first NAL in the access
226 * unit) */
227 if (m_param->bEnableAccessUnitDelimiters && (m_frame->m_poc || m_param->bRepeatHeaders))
228 {
229 m_bs.resetBits();
230 m_entropyCoder.setBitstream(&m_bs);
231 m_entropyCoder.codeAUD(*slice);
232 m_bs.writeByteAlignment();
233 m_nalList.serialize(NAL_UNIT_ACCESS_UNIT_DELIMITER, m_bs);
234 }
// keyframes carry the stream headers again when bRepeatHeaders is set
235 if (m_frame->m_lowres.bKeyframe && m_param->bRepeatHeaders)
236 m_top->getStreamHeaders(m_nalList, m_entropyCoder, m_bs);
237
238 // Weighted Prediction parameters estimation.
239 bool bUseWeightP = slice->m_sliceType == P_SLICE && slice->m_pps->bUseWeightPred;
240 bool bUseWeightB = slice->m_sliceType == B_SLICE && slice->m_pps->bUseWeightedBiPred;
241 if (bUseWeightP || bUseWeightB)
242 weightAnalyse(*slice, *m_frame, *m_param);
243 else
244 slice->disableWeights();
245
246 // Generate motion references
// one list for P slices, two for B slices, none otherwise
247 int numPredDir = slice->isInterP() ? 1 : slice->isInterB() ? 2 : 0;
248 for (int l = 0; l < numPredDir; l++)
249 {
250 for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
251 {
// pass weights only when weighting is in use and the table entry is flagged present
252 WeightParam *w = NULL;
253 if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].bPresentFlag)
254 w = slice->m_weightPredTable[l][ref];
255 m_mref[l][ref].init(slice->m_refPicList[l][ref]->m_reconPicYuv, w);
256 }
257 }
258
259 /* Get the QP for this frame from rate control. This call may block until
260 * frames ahead of it in encode order have called rateControlEnd() */
261 int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
262 m_rce.newQp = qp;
263
264 /* Clip slice QP to 0-51 spec range before encoding */
265 slice->m_sliceQp = Clip3(-QP_BD_OFFSET, QP_MAX_SPEC, qp);
266
267 m_initSliceContext.resetEntropy(*slice);
268
// note: the frame filter receives the unclipped qp, not slice->m_sliceQp
269 m_frameFilter.start(m_frame, m_initSliceContext, qp);
270
271 // reset entropy coders
272 m_entropyCoder.load(m_initSliceContext);
273 for (int i = 0; i < m_numRows; i++)
274 m_rows[i].init(m_initSliceContext);
275
// one output substream per CTU row with WPP, a single substream otherwise
276 uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : 1;
// the substream buffers are allocated on the first frame and only reset afterwards
// NOTE(review): the m_substreamSizes allocation result is not checked here
277 if (!m_outStreams)
278 {
279 m_outStreams = new Bitstream[numSubstreams];
280 m_substreamSizes = X265_MALLOC(uint32_t, numSubstreams);
// without SAO the row coders are pointed directly at the substreams;
// with SAO the substreams are written later by encodeSlice()
281 if (!m_param->bEnableSAO)
282 for (uint32_t i = 0; i < numSubstreams; i++)
283 m_rows[i].rowGoOnCoder.setBitstream(&m_outStreams[i]);
284 }
285 else
286 for (uint32_t i = 0; i < numSubstreams; i++)
287 m_outStreams[i].resetBits();
288
289 if (m_frame->m_lowres.bKeyframe)
290 {
291 if (m_param->bEmitHRDSEI)
292 {
293 SEIBufferingPeriod* bpSei = &m_top->m_rateControl->m_bufPeriodSEI;
294
295 // since the temporal layer HRD is not ready, we assumed it is fixed
296 bpSei->m_auCpbRemovalDelayDelta = 1;
297 bpSei->m_cpbDelayOffset = 0;
298 bpSei->m_dpbDelayOffset = 0;
299
300 // hrdFullness() calculates the initial CPB removal delay and offset
301 m_top->m_rateControl->hrdFullness(bpSei);
302
303 m_bs.resetBits();
304 bpSei->write(m_bs, *slice->m_sps);
305 m_bs.writeByteAlignment();
306
307 m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
308
// remembered so the picture timing SEI below can count access units since this one
309 m_top->m_lastBPSEI = m_rce.encodeOrder;
310 }
311
312 // The recovery point SEI message assists a decoder in determining when the decoding
313 // process will produce acceptable pictures for display after the decoder initiates
314 // random access. The m_recoveryPocCnt is in units of POC(picture order count) which
315 // means pictures encoded after the CRA but precede it in display order(leading) are
316 // implicitly discarded after a random access seek regardless of the value of
317 // m_recoveryPocCnt. Our encoder does not use references prior to the most recent CRA,
318 // so all pictures following the CRA in POC order are guaranteed to be displayable,
319 // so m_recoveryPocCnt is always 0.
320 SEIRecoveryPoint sei_recovery_point;
321 sei_recovery_point.m_recoveryPocCnt = 0;
322 sei_recovery_point.m_exactMatchingFlag = true;
323 sei_recovery_point.m_brokenLinkFlag = false;
324
325 m_bs.resetBits();
326 sei_recovery_point.write(m_bs, *slice->m_sps);
327 m_bs.writeByteAlignment();
328
329 m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
330 }
331
// picture timing SEI; same enabling condition under which init() allocates m_rce.picTimingSEI
332 if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
333 {
334 SEIPictureTiming *sei = m_rce.picTimingSEI;
335 const VUI *vui = &slice->m_sps->vuiParameters;
336 const HRDInfo *hrd = &vui->hrdParameters;
337 int poc = slice->m_poc;
338
339 if (vui->frameFieldInfoPresentFlag)
340 {
// field parity alternates with POC; interlaceMode selects which parity the even POCs get
341 if (m_param->interlaceMode == 2)
342 sei->m_picStruct = (poc & 1) ? 1 /* top */ : 2 /* bottom */;
343 else if (m_param->interlaceMode == 1)
344 sei->m_picStruct = (poc & 1) ? 2 /* bottom */ : 1 /* top */;
345 else
346 sei->m_picStruct = 0;
347 sei->m_sourceScanType = 0;
348 sei->m_duplicateFlag = false;
349 }
350
351 if (vui->hrdParametersPresentFlag)
352 {
353 // The m_aucpbremoval delay specifies how many clock ticks the
354 // access unit associated with the picture timing SEI message has to
355 // wait after removal of the access unit with the most recent
356 // buffering period SEI message
357 sei->m_auCpbRemovalDelay = X265_MIN(X265_MAX(1, m_rce.encodeOrder - m_top->m_lastBPSEI), (1 << hrd->cpbRemovalDelayLength));
358 sei->m_picDpbOutputDelay = slice->m_sps->numReorderPics + poc - m_rce.encodeOrder;
359 }
360
361 m_bs.resetBits();
362 sei->write(m_bs, *slice->m_sps);
363 m_bs.writeByteAlignment();
364 m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
365 }
366
367 // Analyze CTU rows, most of the hard work is done here
368 // frame is compressed in a wave-front pattern if WPP is enabled. Loop filter runs as a
369 // wave-front behind the CU compression and reconstruction
370 compressCTURows();
371
372 if (m_param->rc.bStatWrite)
373 {
374 int totalI = 0, totalP = 0, totalSkip = 0;
375
376 // accumulate intra,inter,skip cu count per frame for 2 pass
377 for (int i = 0; i < m_numRows; i++)
378 {
379 m_frameStats.mvBits += m_rows[i].rowStats.mvBits;
380 m_frameStats.coeffBits += m_rows[i].rowStats.coeffBits;
381 m_frameStats.miscBits += m_rows[i].rowStats.miscBits;
382 totalI += m_rows[i].rowStats.iCuCnt;
383 totalP += m_rows[i].rowStats.pCuCnt;
384 totalSkip += m_rows[i].rowStats.skipCuCnt;
385 }
// NOTE(review): totalCuCount is used as a divisor without a zero check - confirm
// that at least one CU is always counted when bStatWrite is set
386 int totalCuCount = totalI + totalP + totalSkip;
387 m_frameStats.percentIntra = (double)totalI / totalCuCount;
388 m_frameStats.percentInter = (double)totalP / totalCuCount;
389 m_frameStats.percentSkip = (double)totalSkip / totalCuCount;
390 }
391
// the slice header is coded from the initial slice context into m_bs
392 m_bs.resetBits();
393 m_entropyCoder.load(m_initSliceContext);
394 m_entropyCoder.setBitstream(&m_bs);
395 m_entropyCoder.codeSliceHeader(*slice, *m_frame->m_encData);
396
397 // finish encode of each CTU row, only required when SAO is enabled
398 if (m_param->bEnableSAO)
399 encodeSlice();
400
401 // serialize each row, record final lengths in slice header
402 uint32_t maxStreamSize = m_nalList.serializeSubstreams(m_substreamSizes, numSubstreams, m_outStreams);
403
404 // complete the slice header by writing WPP row-starts
// encodeSlice() redirects m_entropyCoder to the substreams, so point it back at m_bs
405 m_entropyCoder.setBitstream(&m_bs);
406 if (slice->m_pps->bEntropyCodingSyncEnabled)
407 m_entropyCoder.codeSliceHeaderWPPEntryPoints(*slice, m_substreamSizes, maxStreamSize);
408 m_bs.writeByteAlignment();
409
410 m_nalList.serialize(slice->m_nalUnitType, m_bs);
411
// optional decoded picture hash: 1 = MD5, 2 = CRC, 3 = checksum, one digest per plane index 0..2
412 if (m_param->decodedPictureHashSEI)
413 {
414 if (m_param->decodedPictureHashSEI == 1)
415 {
416 m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::MD5;
417 for (int i = 0; i < 3; i++)
418 MD5Final(&m_state[i], m_seiReconPictureDigest.m_digest[i]);
419 }
420 else if (m_param->decodedPictureHashSEI == 2)
421 {
422 m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CRC;
423 for (int i = 0; i < 3; i++)
424 crcFinish(m_crc[i], m_seiReconPictureDigest.m_digest[i]);
425 }
426 else if (m_param->decodedPictureHashSEI == 3)
427 {
428 m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CHECKSUM;
429 for (int i = 0; i < 3; i++)
430 checksumFinish(m_checksum[i], m_seiReconPictureDigest.m_digest[i]);
431 }
432
433 m_bs.resetBits();
434 m_seiReconPictureDigest.write(m_bs, *slice->m_sps);
435 m_bs.writeByteAlignment();
436
437 m_nalList.serialize(NAL_UNIT_SUFFIX_SEI, m_bs);
438 }
439
// size of the access unit handed to rate control: SEI NALs and start code prefixes are left out
440 uint64_t bytes = 0;
441 for (uint32_t i = 0; i < m_nalList.m_numNal; i++)
442 {
443 int type = m_nalList.m_nal[i].type;
444
445 // exclude SEI
446 if (type != NAL_UNIT_PREFIX_SEI && type != NAL_UNIT_SUFFIX_SEI)
447 {
448 bytes += m_nalList.m_nal[i].sizeBytes;
449 // and exclude start code prefix
// 4 bytes are subtracted for the first NAL and for SPS/PPS, 3 bytes for every other NAL
450 bytes -= (!i || type == NAL_UNIT_SPS || type == NAL_UNIT_PPS) ? 4 : 3;
451 }
452 }
453 m_accessUnitBits = bytes << 3;
454
// presumably seconds, assuming x265_mdate() returns microseconds - TODO confirm
455 m_elapsedCompressTime = (double)(x265_mdate() - startCompressTime) / 1000000;
456 /* rateControlEnd may also block for earlier frames to call rateControlUpdateStats */
457 if (m_top->m_rateControl->rateControlEnd(m_frame, m_accessUnitBits, &m_rce, &m_frameStats) < 0)
458 m_top->m_aborted = true;
459
460 /* Accumulate NR statistics from all worker threads */
461 if (m_nr)
462 {
463 for (int i = 0; i < m_top->m_numThreadLocalData; i++)
464 {
// each thread-local quantizer keeps one NoiseReduction per frame encoder, indexed by m_frameEncoderID
465 NoiseReduction* nr = &m_top->m_threadLocalData[i].analysis.m_quant.m_frameNr[m_frameEncoderID];
466 for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
467 {
468 for(int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++)
469 m_nr->residualSum[cat][coeff] += nr->residualSum[cat][coeff];
470
471 m_nr->count[cat] += nr->count[cat];
472 }
473 }
474 }
475
// called unconditionally, even when m_nr is NULL
476 noiseReductionUpdate();
477
478 /* Copy updated NR coefficients back to all worker threads */
479 if (m_nr)
480 {
481 for (int i = 0; i < m_top->m_numThreadLocalData; i++)
482 {
483 NoiseReduction* nr = &m_top->m_threadLocalData[i].analysis.m_quant.m_frameNr[m_frameEncoderID];
// publish the new offsets and clear the per-thread accumulators for the next frame
484 memcpy(nr->offsetDenoise, m_nr->offsetDenoise, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
485 memset(nr->count, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES);
486 memset(nr->residualSum, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
487 }
488 }
489
490 // Decrement referenced frame reference counts, allow them to be recycled
491 for (int l = 0; l < numPredDir; l++)
492 {
493 for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
494 {
495 Frame *refpic = slice->m_refPicList[l][ref];
496 ATOMIC_DEC(&refpic->m_countRefEncoders);
497 }
498 }
499 }
500
501 void FrameEncoder::encodeSlice()
502 {
503 Slice* slice = m_frame->m_encData->m_slice;
504 const uint32_t widthInLCUs = slice->m_sps->numCuInWidth;
505 const uint32_t lastCUAddr = (slice->m_endCUAddr + NUM_CU_PARTITIONS - 1) / NUM_CU_PARTITIONS;
506 const uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : 1;
507
508 SAOParam* saoParam = slice->m_sps->bUseSAO ? m_frame->m_encData->m_saoParam : NULL;
509 for (uint32_t cuAddr = 0; cuAddr < lastCUAddr; cuAddr++)
510 {
511 uint32_t col = cuAddr % widthInLCUs;
512 uint32_t lin = cuAddr / widthInLCUs;
513 uint32_t subStrm = lin % numSubstreams;
514 CUData* ctu = m_frame->m_encData->getPicCTU(cuAddr);
515
516 m_entropyCoder.setBitstream(&m_outStreams[subStrm]);
517
518 // Synchronize cabac probabilities with upper-right CTU if it's available and we're at the start of a line.
519 if (m_param->bEnableWavefront && !col && lin)
520 {
521 m_entropyCoder.copyState(m_initSliceContext);
522 m_entropyCoder.loadContexts(m_rows[lin - 1].bufferedEntropy);
523 }
524
525 if (saoParam)
526 {
527 if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1])
528 {
529 int mergeLeft = col && saoParam->ctuParam[0][cuAddr].mergeMode == SAO_MERGE_LEFT;
530 int mergeUp = lin && saoParam->ctuParam[0][cuAddr].mergeMode == SAO_MERGE_UP;
531 if (col)
532 m_entropyCoder.codeSaoMerge(mergeLeft);
533 if (lin && !mergeLeft)
534 m_entropyCoder.codeSaoMerge(mergeUp);
535 if (!mergeLeft && !mergeUp)
536 {
537 if (saoParam->bSaoFlag[0])
538 m_entropyCoder.codeSaoOffset(saoParam->ctuParam[0][cuAddr], 0);
539 if (saoParam->bSaoFlag[1])
540 {
541 m_entropyCoder.codeSaoOffset(saoParam->ctuParam[1][cuAddr], 1);
542 m_entropyCoder.codeSaoOffset(saoParam->ctuParam[2][cuAddr], 2);
543 }
544 }
545 }
546 else
547 {
548 for (int i = 0; i < 3; i++)
549 saoParam->ctuParam[i][cuAddr].reset();
550 }
551 }
552
553 // final coding (bitstream generation) for this CU
554 m_entropyCoder.encodeCTU(*ctu, m_cuGeoms[m_ctuGeomMap[cuAddr]]);
555
556 if (m_param->bEnableWavefront)
557 {
558 if (col == 1)
559 // Store probabilities of second CTU in line into buffer
560 m_rows[lin].bufferedEntropy.loadContexts(m_entropyCoder);
561
562 if (col == widthInLCUs - 1)
563 m_entropyCoder.finishSlice();
564 }
565 }
566 if (!m_param->bEnableWavefront)
567 m_entropyCoder.finishSlice();
568 }
569
570 void FrameEncoder::compressCTURows()
571 {
572 PPAScopeEvent(FrameEncoder_compressRows);
573 Slice* slice = m_frame->m_encData->m_slice;
574
575 m_bAllRowsStop = false;
576 m_vbvResetTriggerRow = -1;
577
578 m_SSDY = m_SSDU = m_SSDV = 0;
579 m_ssim = 0;
580 m_ssimCnt = 0;
581 memset(&m_frameStats, 0, sizeof(m_frameStats));
582
583 bool bUseWeightP = slice->m_pps->bUseWeightPred && slice->m_sliceType == P_SLICE;
584 bool bUseWeightB = slice->m_pps->bUseWeightedBiPred && slice->m_sliceType == B_SLICE;
585 int numPredDir = slice->isInterP() ? 1 : slice->isInterB() ? 2 : 0;
586
587 m_rows[0].active = true;
588 if (m_pool && m_param->bEnableWavefront)
589 {
590 WaveFront::clearEnabledRowMask();
591 WaveFront::enqueue();
592
593 for (int row = 0; row < m_numRows; row++)
594 {
595 // block until all reference frames have reconstructed the rows we need
596 for (int l = 0; l < numPredDir; l++)
597 {
598 for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
599 {
600 Frame *refpic = slice->m_refPicList[l][ref];
601
602 int reconRowCount = refpic->m_reconRowCount.get();
603 while ((reconRowCount != m_numRows) && (reconRowCount < row + m_refLagRows))
604 reconRowCount = refpic->m_reconRowCount.waitForChange(reconRowCount);
605
606 if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted)
607 m_mref[l][ref].applyWeight(row + m_refLagRows, m_numRows);
608 }
609 }
610
611 enableRowEncoder(row);
612 if (row == 0)
613 enqueueRowEncoder(0);
614 else
615 m_pool->pokeIdleThread();
616 }
617
618 m_completionEvent.wait();
619
620 WaveFront::dequeue();
621 }
622 else
623 {
624 for (int i = 0; i < this->m_numRows + m_filterRowDelay; i++)
625 {
626 // Encode
627 if (i < m_numRows)
628 {
629 // block until all reference frames have reconstructed the rows we need
630 for (int l = 0; l < numPredDir; l++)
631 {
632 int list = l;
633 for (int ref = 0; ref < slice->m_numRefIdx[list]; ref++)
634 {
635 Frame *refpic = slice->m_refPicList[list][ref];
636
637 int reconRowCount = refpic->m_reconRowCount.get();
638 while ((reconRowCount != m_numRows) && (reconRowCount < i + m_refLagRows))
639 reconRowCount = refpic->m_reconRowCount.waitForChange(reconRowCount);
640
641 if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted)
642 m_mref[list][ref].applyWeight(i + m_refLagRows, m_numRows);
643 }
644 }
645
646 processRow(i * 2 + 0, -1);
647 }
648
649 // Filter
650 if (i >= m_filterRowDelay)
651 processRow((i - m_filterRowDelay) * 2 + 1, -1);
652 }
653 }
654 m_frameTime = (double)m_totalTime / 1000000;
655 m_totalTime = 0;
656 }
657
658 void FrameEncoder::processRow(int row, int threadId)
659 {
660 const int realRow = row >> 1;
661 const int typeNum = row & 1;
662
663 ThreadLocalData& tld = threadId >= 0 ? m_top->m_threadLocalData[threadId] : *m_tld;
664
665 if (!typeNum)
666 processRowEncoder(realRow, tld);
667 else
668 {
669 processRowFilter(realRow);
670
671 // NOTE: Active next row
672 if (realRow != m_numRows - 1)
673 enqueueRowFilter(realRow + 1);
674 else
675 m_completionEvent.trigger();
676 }
677 }
678
679 // Called by worker threads
// Encodes CTUs of one row, resuming at curRow.completed, until the row is finished
// or has to yield (a VBV restart is in progress, or the row above is not far enough
// ahead). The row is flagged busy for the duration so that a second thread entering
// for the same row bails out.
//
// row  index of the CTU row to encode
// tld  thread local data providing the analysis object used for this row
//
// Per CTU: choose the base QP, run analysis, advance the row entropy coder, collect
// statistics, run the VBV diagonal check (which may rewind this and lower rows), and
// activate the row below. Once the row is complete: finish the row bitstream when SAO
// is off, update rate control statistics at the selected row, and enable row filters.
680 void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld)
681 {
682 PPAScopeEvent(Thread_ProcessRow);
683
684 CTURow& curRow = m_rows[row];
685
// claim the row under its lock; the scoped lock is released at the closing brace
686 {
687 ScopedLock self(curRow.lock);
688 if (!curRow.active)
689 /* VBV restart is in progress, exit out */
690 return;
691 if (curRow.busy)
692 {
693 /* On multi-socket Windows servers, we have seen problems with
694 * ATOMIC_CAS which resulted in multiple worker threads processing
695 * the same CU row, which often resulted in bad pointer accesses. We
696 * believe the problem is fixed, but are leaving this check in place
697 * to prevent crashes in case it is not */
698 x265_log(m_param, X265_LOG_WARNING,
699 "internal error - simultaneous row access detected. Please report HW to x265-devel@videolan.org\n");
700 return;
701 }
702 curRow.busy = true;
703 }
704
705 /* When WPP is enabled, every row has its own row coder instance. Otherwise
706 * they share row 0 */
707 Entropy& rowCoder = m_param->bEnableWavefront ? m_rows[row].rowGoOnCoder : m_rows[0].rowGoOnCoder;
708 FrameData& curEncData = *m_frame->m_encData;
709 Slice *slice = curEncData.m_slice;
710 PicYuv* fencPic = m_frame->m_origPicYuv;
711
712 tld.analysis.m_me.setSourcePlane(fencPic->m_picOrg[0], fencPic->m_stride);
713
714 int64_t startTime = x265_mdate();
715 const uint32_t numCols = m_numCols;
716 const uint32_t lineStartCUAddr = row * numCols;
// VBV is considered active only when both a buffer size and a max bitrate are configured
717 bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
718
// resume at the first CTU of this row that has not been encoded yet
719 while (curRow.completed < numCols)
720 {
721 int col = curRow.completed;
722 const uint32_t cuAddr = lineStartCUAddr + col;
723 CUData* ctu = curEncData.getPicCTU(cuAddr);
724 ctu->initCTU(*m_frame, cuAddr, slice->m_sliceQp);
725
726 if (bIsVbv)
727 {
// the first row has no row above it, so its diagonal QP is seeded from the frame average
728 if (!row)
729 {
730 curEncData.m_rowStat[row].diagQp = curEncData.m_avgQpRc;
731 curEncData.m_rowStat[row].diagQpScale = x265_qp2qScale(curEncData.m_avgQpRc);
732 }
733
// CTUs on or left of the diagonal (row >= col) inherit baseQp from the CTU above-right,
// unless this row is the one that triggered the current VBV reset; all others use the row's diagQp
734 if (row >= col && row && m_vbvResetTriggerRow != row)
735 curEncData.m_cuStat[cuAddr].baseQp = curEncData.m_cuStat[cuAddr - numCols + 1].baseQp;
736 else
737 curEncData.m_cuStat[cuAddr].baseQp = curEncData.m_rowStat[row].diagQp;
738 }
739 else
740 curEncData.m_cuStat[cuAddr].baseQp = curEncData.m_avgQpRc;
741
742 if (m_param->rc.aqMode || bIsVbv)
743 {
// analysis gets the unclipped CU QP; the value stored in the CTU and summed is clipped
744 int qp = calcQpForCu(cuAddr, curEncData.m_cuStat[cuAddr].baseQp);
745 tld.analysis.setQP(*slice, qp);
746 qp = Clip3(QP_MIN, QP_MAX_SPEC, qp);
747 ctu->setQPSubParts((char)qp, 0, 0);
748 curEncData.m_rowStat[row].sumQpAq += qp;
749 }
750 else
751 tld.analysis.setQP(*slice, slice->m_sliceQp);
752
753 if (m_param->bEnableWavefront && !col && row)
754 {
755 // Load SBAC coder context from previous row and initialize row state.
756 rowCoder.copyState(m_initSliceContext);
757 rowCoder.loadContexts(m_rows[row - 1].bufferedEntropy);
758 }
759
760 // Does all the CU analysis, returns best top level mode decision
761 Search::Mode& best = tld.analysis.compressCTU(*ctu, *m_frame, m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder);
762
763 /* advance top-level row coder to include the context of this CTU.
764 * if SAO is disabled, rowCoder writes the final CTU bitstream */
765 rowCoder.encodeCTU(*ctu, m_cuGeoms[m_ctuGeomMap[cuAddr]]);
766
767 if (m_param->bEnableWavefront && col == 1)
768 // Save CABAC state for next row
769 curRow.bufferedEntropy.loadContexts(rowCoder);
770
771 // Completed CU processing
772 curRow.completed++;
773
774 if (m_param->bLogCuStats || m_param->rc.bStatWrite)
775 collectCTUStatistics(*ctu);
776
777 // copy no. of intra, inter Cu cnt per row into frame stats for 2 pass
778 if (m_param->rc.bStatWrite)
779 {
780 curRow.rowStats.mvBits += best.mvBits;
781 curRow.rowStats.coeffBits += best.coeffBits;
782 curRow.rowStats.miscBits += best.totalBits - (best.mvBits + best.coeffBits);
783 StatisticLog* log = &m_sliceTypeLog[slice->m_sliceType];
784
785 for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
786 {
787 /* 1 << shift == number of 8x8 blocks at current depth */
788 int shift = 2 * (g_maxCUDepth - depth);
789 curRow.rowStats.iCuCnt += log->qTreeIntraCnt[depth] << shift;
790 curRow.rowStats.pCuCnt += log->qTreeInterCnt[depth] << shift;
791 curRow.rowStats.skipCuCnt += log->qTreeSkipCnt[depth] << shift;
792
793 // clear the row cu data from thread local object
794 log->qTreeIntraCnt[depth] = log->qTreeInterCnt[depth] = log->qTreeSkipCnt[depth] = 0;
795 }
796 }
797
798 curEncData.m_cuStat[cuAddr].totalBits = best.totalBits;
799 x265_emms();
800
801 if (bIsVbv)
802 {
803 // Update encoded bits, satdCost, baseQP for each CU
804 curEncData.m_rowStat[row].diagSatd += curEncData.m_cuStat[cuAddr].vbvCost;
805 curEncData.m_rowStat[row].diagIntraSatd += curEncData.m_cuStat[cuAddr].intraVbvCost;
806 curEncData.m_rowStat[row].encodedBits += curEncData.m_cuStat[cuAddr].totalBits;
807 curEncData.m_rowStat[row].sumQpRc += curEncData.m_cuStat[cuAddr].baseQp;
808 curEncData.m_rowStat[row].numEncodedCUs = cuAddr;
809
810 // If current block is at row diagonal checkpoint, call vbv ratecontrol.
811
812 if (row == col && row)
813 {
// qpBase is handed to rate control and then clipped and stored as this row's new diagonal QP
814 double qpBase = curEncData.m_cuStat[cuAddr].baseQp;
815 int reEncode = m_top->m_rateControl->rowDiagonalVbvRateControl(m_frame, row, &m_rce, qpBase);
816 qpBase = Clip3((double)QP_MIN, (double)QP_MAX_MAX, qpBase);
817 curEncData.m_rowStat[row].diagQp = qpBase;
818 curEncData.m_rowStat[row].diagQpScale = x265_qp2qScale(qpBase);
819
// a negative result requests that this row and every row below it be encoded again
820 if (reEncode < 0)
821 {
822 x265_log(m_param, X265_LOG_DEBUG, "POC %d row %d - encode restart required for VBV, to %.2f from %.2f\n",
823 m_frame->m_poc, row, qpBase, curEncData.m_cuStat[cuAddr].baseQp);
824
825 // prevent the WaveFront::findJob() method from providing new jobs
826 m_vbvResetTriggerRow = row;
827 m_bAllRowsStop = true;
828
// walk from the bottom row up to this one, stopping and rewinding each
829 for (int r = m_numRows - 1; r >= row; r--)
830 {
831 CTURow& stopRow = m_rows[r];
832
833 if (r != row)
834 {
835 /* if row was active (ready to be run) clear active bit and bitmap bit for this row */
836 stopRow.lock.acquire();
837 while (stopRow.active)
838 {
839 if (dequeueRow(r * 2))
840 stopRow.active = false;
841 else
842 GIVE_UP_TIME();
843 }
844
845 stopRow.lock.release();
846
// then poll until whichever thread is inside that row has cleared its busy flag
847 bool bRowBusy = true;
848 do
849 {
850 stopRow.lock.acquire();
851 bRowBusy = stopRow.busy;
852 stopRow.lock.release();
853
854 if (bRowBusy)
855 {
856 GIVE_UP_TIME();
857 }
858 }
859 while (bRowBusy);
860 }
861
// rewind the row: discard its bitstream, progress counter and accumulated statistics
862 m_outStreams[r].resetBits();
863 stopRow.completed = 0;
864 memset(&stopRow.rowStats, 0, sizeof(stopRow.rowStats));
865 curEncData.m_rowStat[r].numEncodedCUs = 0;
866 curEncData.m_rowStat[r].encodedBits = 0;
867 curEncData.m_rowStat[r].diagSatd = 0;
868 curEncData.m_rowStat[r].diagIntraSatd = 0;
869 curEncData.m_rowStat[r].sumQpRc = 0;
870 curEncData.m_rowStat[r].sumQpAq = 0;
871 }
872
873 m_bAllRowsStop = false;
874 }
875 }
876 }
877
878 // NOTE: do CU level Filter
879 if (m_param->bEnableSAO && m_param->bSaoNonDeblocked)
880 // SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas
881 m_frameFilter.m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row);
882
883 // NOTE: active next row
// the row below may start once this row is at least two CTUs ahead of it
884 if (curRow.completed >= 2 && row < m_numRows - 1)
885 {
886 ScopedLock below(m_rows[row + 1].lock);
887 if (m_rows[row + 1].active == false &&
888 m_rows[row + 1].completed + 2 <= curRow.completed &&
889 (!m_bAllRowsStop || row + 1 < m_vbvResetTriggerRow))
890 {
891 m_rows[row + 1].active = true;
892 enqueueRowEncoder(row + 1);
893 }
894 }
895
// yield this row when a VBV restart was triggered by a row above it, or when the row
// above is less than two CTUs ahead (unless this row is on its final CTU)
896 ScopedLock self(curRow.lock);
897 if ((m_bAllRowsStop && row > m_vbvResetTriggerRow) ||
898 (row > 0 && curRow.completed < numCols - 1 && m_rows[row - 1].completed < m_rows[row].completed + 2))
899 {
900 curRow.active = false;
901 curRow.busy = false;
902 m_totalTime += x265_mdate() - startTime;
903 return;
904 }
905 }
906
907 /* *this row of CTUs has been encoded* */
908
909 /* flush row bitstream (if WPP and no SAO) or flush frame if no WPP and no SAO */
910 if (!m_param->bEnableSAO && (m_param->bEnableWavefront || row == m_numRows - 1))
911 rowCoder.finishSlice();
912
913 /* If encoding with ABR, update bits and complexity in rate control
914 * after a number of rows so the next frame's rateControlStart has more
915 * accurate data for estimation. At the start of the encode we update stats
916 * after half the frame is encoded, but after this initial period we update
917 * after refLagRows (the number of rows reference frames must have completed
918 * before referencees may begin encoding) */
// rowCount stays 0 for every rate control mode other than ABR, so the update then fires on row 0
919 int rowCount = 0;
920 if (m_param->rc.rateControlMode == X265_RC_ABR)
921 {
922 if ((uint32_t)m_rce.encodeOrder <= 2 * (m_param->fpsNum / m_param->fpsDenom))
923 rowCount = X265_MIN((m_numRows + 1) / 2, m_numRows - 1);
924 else
925 rowCount = X265_MIN(m_refLagRows, m_numRows - 1);
926 }
927 if (row == rowCount)
928 {
// total the bits of the rows preceding rowCount, from row stats under VBV or per-CU stats otherwise
929 m_rce.rowTotalBits = 0;
930 if (bIsVbv)
931 for (int i = 0; i < rowCount; i++)
932 m_rce.rowTotalBits += curEncData.m_rowStat[i].encodedBits;
933 else
934 for (uint32_t cuAddr = 0; cuAddr < rowCount * numCols; cuAddr++)
935 m_rce.rowTotalBits += curEncData.m_cuStat[cuAddr].totalBits;
936
937 m_top->m_rateControl->rateControlUpdateStats(&m_rce);
938 }
939
940 // trigger row-wise loop filters
941 if (row >= m_filterRowDelay)
942 {
943 enableRowFilter(row - m_filterRowDelay);
944
945 // NOTE: Active Filter to first row (row 0)
946 if (row == m_filterRowDelay)
947 enqueueRowFilter(0);
948 }
// the last encoded row also enables the trailing filter rows that no later row would enable
949 if (row == m_numRows - 1)
950 {
951 for (int i = m_numRows - m_filterRowDelay; i < m_numRows; i++)
952 enableRowFilter(i);
953 }
954
955 m_totalTime += x265_mdate() - startTime;
// NOTE(review): busy is cleared here without holding curRow.lock, unlike the yield path above - confirm intended
956 curRow.busy = false;
957 }
958
959 void FrameEncoder::collectCTUStatistics(CUData& ctu)
960 {
961 StatisticLog* log = &m_sliceTypeLog[ctu.m_slice->m_sliceType];
962
963 if (ctu.m_slice->m_sliceType == I_SLICE)
964 {
965 uint32_t depth = 0;
966 for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2))
967 {
968 depth = ctu.m_cuDepth[absPartIdx];
969
970 log->totalCu++;
971 log->cntIntra[depth]++;
972 log->qTreeIntraCnt[depth]++;
973
974 if (ctu.m_partSize[absPartIdx] == SIZE_NONE)
975 {
976 log->totalCu--;
977 log->cntIntra[depth]--;
978 log->qTreeIntraCnt[depth]--;
979 }
980 else if (ctu.m_partSize[absPartIdx] == SIZE_NxN)
981 {
982 /* TODO: log intra modes at absPartIdx +0 to +3 */
983 X265_CHECK(depth == g_maxCUDepth, "Intra NxN found at improbable depth\n");
984 log->cntIntraNxN++;
985 log->cntIntra[depth]--;
986 }
987 else if (ctu.m_lumaIntraDir[absPartIdx] > 1)
988 log->cuIntraDistribution[depth][ANGULAR_MODE_ID]++;
989 else
990 log->cuIntraDistribution[depth][ctu.m_lumaIntraDir[absPartIdx]]++;
991 }
992 }
993 else
994 {
995 uint32_t depth = 0;
996 for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2))
997 {
998 depth = ctu.m_cuDepth[absPartIdx];
999
1000 log->totalCu++;
1001 log->cntTotalCu[depth]++;
1002
1003 if (ctu.m_partSize[absPartIdx] == SIZE_NONE)
1004 {
1005 log->totalCu--;
1006 log->cntTotalCu[depth]--;
1007 }
1008 else if (ctu.isSkipped(absPartIdx))
1009 {
1010 log->totalCu--;
1011 log->cntSkipCu[depth]++;
1012 log->qTreeSkipCnt[depth]++;
1013 }
1014 else if (ctu.m_predMode[absPartIdx] == MODE_INTER)
1015 {
1016 log->cntInter[depth]++;
1017 log->qTreeInterCnt[depth]++;
1018
1019 if (ctu.m_partSize[absPartIdx] < AMP_ID)
1020 log->cuInterDistribution[depth][ctu.m_partSize[absPartIdx]]++;
1021 else
1022 log->cuInterDistribution[depth][AMP_ID]++;
1023 }
1024 else if (ctu.m_predMode[absPartIdx] == MODE_INTRA)
1025 {
1026 log->cntIntra[depth]++;
1027 log->qTreeIntraCnt[depth]++;
1028
1029 if (ctu.m_partSize[absPartIdx] == SIZE_NxN)
1030 {
1031 X265_CHECK(depth == g_maxCUDepth, "Intra NxN found at improbable depth\n");
1032 log->cntIntraNxN++;
1033 /* TODO: log intra modes at absPartIdx +0 to +3 */
1034 }
1035 else if (ctu.m_lumaIntraDir[absPartIdx] > 1)
1036 log->cuIntraDistribution[depth][ANGULAR_MODE_ID]++;
1037 else
1038 log->cuIntraDistribution[depth][ctu.m_lumaIntraDir[absPartIdx]]++;
1039 }
1040 }
1041 }
1042 }
1043
1044 /* DCT-domain noise reduction / adaptive deadzone from libavcodec */
1045 void FrameEncoder::noiseReductionUpdate()
1046 {
1047 if (!m_nr)
1048 return;
1049
1050 static const uint32_t maxBlocksPerTrSize[4] = {1 << 18, 1 << 16, 1 << 14, 1 << 12};
1051
1052 for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
1053 {
1054 int trSize = cat & 3;
1055 int coefCount = 1 << ((trSize + 2) * 2);
1056
1057 if (m_nr->count[cat] > maxBlocksPerTrSize[trSize])
1058 {
1059 for (int i = 0; i < coefCount; i++)
1060 m_nr->residualSum[cat][i] >>= 1;
1061 m_nr->count[cat] >>= 1;
1062 }
1063
1064 uint64_t scaledCount = (uint64_t)m_param->noiseReduction * m_nr->count[cat];
1065
1066 for (int i = 0; i < coefCount; i++)
1067 {
1068 uint64_t value = scaledCount + m_nr->residualSum[cat][i] / 2;
1069 uint64_t denom = m_nr->residualSum[cat][i] + 1;
1070 m_nr->offsetDenoise[cat][i] = (uint16_t)(value / denom);
1071 }
1072
1073 // Don't denoise DC coefficients
1074 m_nr->offsetDenoise[cat][0] = 0;
1075 }
1076 }
1077
1078 int FrameEncoder::calcQpForCu(uint32_t ctuAddr, double baseQp)
1079 {
1080 x265_emms();
1081 double qp = baseQp;
1082
1083 FrameData& curEncData = *m_frame->m_encData;
1084 /* clear cuCostsForVbv from when vbv row reset was triggered */
1085 bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
1086 if (bIsVbv)
1087 {
1088 curEncData.m_cuStat[ctuAddr].vbvCost = 0;
1089 curEncData.m_cuStat[ctuAddr].intraVbvCost = 0;
1090 }
1091
1092 /* Derive qpOffet for each CU by averaging offsets for all 16x16 blocks in the cu. */
1093 double qp_offset = 0;
1094 uint32_t maxBlockCols = (m_frame->m_origPicYuv->m_picWidth + (16 - 1)) / 16;
1095 uint32_t maxBlockRows = (m_frame->m_origPicYuv->m_picHeight + (16 - 1)) / 16;
1096 uint32_t noOfBlocks = g_maxCUSize / 16;
1097 uint32_t block_y = (ctuAddr / curEncData.m_slice->m_sps->numCuInWidth) * noOfBlocks;
1098 uint32_t block_x = (ctuAddr * noOfBlocks) - block_y * curEncData.m_slice->m_sps->numCuInWidth;
1099
1100 /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
1101 bool isReferenced = IS_REFERENCED(m_frame);
1102 double *qpoffs = (isReferenced && m_param->rc.cuTree) ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
1103
1104 uint32_t cnt = 0, idx = 0;
1105 for (uint32_t h = 0; h < noOfBlocks && block_y < maxBlockRows; h++, block_y++)
1106 {
1107 for (uint32_t w = 0; w < noOfBlocks && (block_x + w) < maxBlockCols; w++)
1108 {
1109 idx = block_x + w + (block_y * maxBlockCols);
1110 if (m_param->rc.aqMode)
1111 qp_offset += qpoffs[idx];
1112 if (bIsVbv)
1113 {
1114 curEncData.m_cuStat[ctuAddr].vbvCost += m_frame->m_lowres.lowresCostForRc[idx] & LOWRES_COST_MASK;
1115 curEncData.m_cuStat[ctuAddr].intraVbvCost += m_frame->m_lowres.intraCost[idx];
1116 }
1117 cnt++;
1118 }
1119 }
1120
1121 qp_offset /= cnt;
1122 qp += qp_offset;
1123
1124 return Clip3(QP_MIN, QP_MAX_MAX, (int)(qp + 0.5));
1125 }
1126
1127 Frame *FrameEncoder::getEncodedPicture(NALList& output)
1128 {
1129 if (m_frame)
1130 {
1131 /* block here until worker thread completes */
1132 m_done.wait();
1133
1134 Frame *ret = m_frame;
1135 m_frame = NULL;
1136 output.takeContents(m_nalList);
1137 return ret;
1138 }
1139
1140 return NULL;
1141 }
1142 }