Commit | Line | Data |
---|---|---|
72b9787e JB |
1 | /***************************************************************************** |
2 | * Copyright (C) 2013 x265 project | |
3 | * | |
4 | * Authors: Chung Shin Yee <shinyee@multicorewareinc.com> | |
5 | * Min Chen <chenm003@163.com> | |
6 | * Steve Borho <steve@borho.org> | |
7 | * | |
8 | * This program is free software; you can redistribute it and/or modify | |
9 | * it under the terms of the GNU General Public License as published by | |
10 | * the Free Software Foundation; either version 2 of the License, or | |
11 | * (at your option) any later version. | |
12 | * | |
13 | * This program is distributed in the hope that it will be useful, | |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | * GNU General Public License for more details. | |
17 | * | |
18 | * You should have received a copy of the GNU General Public License | |
19 | * along with this program; if not, write to the Free Software | |
20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
21 | * | |
22 | * This program is also available under a commercial proprietary license. | |
23 | * For more information, contact us at license @ x265.com. | |
24 | *****************************************************************************/ | |
25 | ||
26 | #include "common.h" | |
27 | #include "frame.h" | |
28 | #include "framedata.h" | |
29 | #include "wavefront.h" | |
30 | #include "param.h" | |
31 | ||
32 | #include "PPA/ppa.h" | |
33 | ||
34 | #include "encoder.h" | |
35 | #include "frameencoder.h" | |
36 | #include "common.h" | |
37 | #include "slicetype.h" | |
38 | #include "nal.h" | |
39 | ||
40 | namespace x265 { | |
41 | void weightAnalyse(Slice& slice, Frame& frame, x265_param& param); | |
42 | ||
43 | FrameEncoder::FrameEncoder() | |
44 | : WaveFront(NULL) | |
45 | , m_threadActive(true) | |
46 | { | |
47 | m_totalTime = 0; | |
48 | m_frameEncoderID = 0; | |
49 | m_bAllRowsStop = false; | |
50 | m_vbvResetTriggerRow = -1; | |
51 | m_outStreams = NULL; | |
52 | m_substreamSizes = NULL; | |
53 | m_nr = NULL; | |
54 | m_tld = NULL; | |
55 | m_rows = NULL; | |
56 | m_top = NULL; | |
57 | m_param = NULL; | |
58 | m_frame = NULL; | |
59 | m_cuGeoms = NULL; | |
60 | m_ctuGeomMap = NULL; | |
61 | memset(&m_frameStats, 0, sizeof(m_frameStats)); | |
62 | memset(&m_rce, 0, sizeof(RateControlEntry)); | |
63 | } | |
64 | ||
65 | void FrameEncoder::destroy() | |
66 | { | |
67 | if (m_pool) | |
68 | JobProvider::flush(); // ensure no worker threads are using this frame | |
69 | ||
70 | m_threadActive = false; | |
71 | m_enable.trigger(); | |
72 | ||
73 | delete[] m_rows; | |
74 | delete[] m_outStreams; | |
75 | X265_FREE(m_cuGeoms); | |
76 | X265_FREE(m_ctuGeomMap); | |
77 | X265_FREE(m_substreamSizes); | |
78 | X265_FREE(m_nr); | |
79 | ||
80 | m_frameFilter.destroy(); | |
81 | ||
82 | if (m_param->bEmitHRDSEI || !!m_param->interlaceMode) | |
83 | { | |
84 | delete m_rce.picTimingSEI; | |
85 | delete m_rce.hrdTiming; | |
86 | } | |
87 | ||
88 | // wait for worker thread to exit | |
89 | stop(); | |
90 | } | |
91 | ||
92 | bool FrameEncoder::init(Encoder *top, int numRows, int numCols, int id) | |
93 | { | |
94 | m_top = top; | |
95 | m_param = top->m_param; | |
96 | m_numRows = numRows; | |
97 | m_numCols = numCols; | |
98 | m_filterRowDelay = (m_param->bEnableSAO && m_param->bSaoNonDeblocked) ? | |
99 | 2 : (m_param->bEnableSAO || m_param->bEnableLoopFilter ? 1 : 0); | |
100 | m_filterRowDelayCus = m_filterRowDelay * numCols; | |
101 | m_frameEncoderID = id; | |
102 | m_rows = new CTURow[m_numRows]; | |
103 | bool ok = !!m_numRows; | |
104 | ||
105 | int range = m_param->searchRange; /* fpel search */ | |
106 | range += 1; /* diamond search range check lag */ | |
107 | range += 2; /* subpel refine */ | |
108 | range += NTAPS_LUMA / 2; /* subpel filter half-length */ | |
109 | m_refLagRows = 1 + ((range + g_maxCUSize - 1) / g_maxCUSize); | |
110 | ||
111 | // NOTE: 2 times of numRows because both Encoder and Filter in same queue | |
112 | if (!WaveFront::init(m_numRows * 2)) | |
113 | { | |
114 | x265_log(m_param, X265_LOG_ERROR, "unable to initialize wavefront queue\n"); | |
115 | m_pool = NULL; | |
116 | } | |
117 | ||
118 | m_frameFilter.init(top, this, numRows); | |
119 | ||
120 | // initialize HRD parameters of SPS | |
121 | if (m_param->bEmitHRDSEI || !!m_param->interlaceMode) | |
122 | { | |
123 | m_rce.picTimingSEI = new SEIPictureTiming; | |
124 | m_rce.hrdTiming = new HRDTiming; | |
125 | ||
126 | ok &= m_rce.picTimingSEI && m_rce.hrdTiming; | |
127 | } | |
128 | ||
129 | if (m_param->noiseReduction) | |
130 | m_nr = X265_MALLOC(NoiseReduction, 1); | |
131 | if (m_nr) | |
132 | memset(m_nr, 0, sizeof(NoiseReduction)); | |
133 | else | |
134 | m_param->noiseReduction = 0; | |
135 | ||
136 | start(); | |
137 | return ok; | |
138 | } | |
139 | ||
140 | /* Generate a complete list of unique geom sets for the current picture dimensions */ | |
141 | bool FrameEncoder::initializeGeoms(const FrameData& encData) | |
142 | { | |
143 | /* Geoms only vary between CTUs in the presence of picture edges */ | |
144 | int heightRem = m_param->sourceHeight & (m_param->maxCUSize - 1); | |
145 | int widthRem = m_param->sourceWidth & (m_param->maxCUSize - 1); | |
146 | int allocGeoms = 1; // body | |
147 | if (heightRem && widthRem) | |
148 | allocGeoms = 4; // body, right, bottom, corner | |
149 | else if (heightRem || widthRem) | |
150 | allocGeoms = 2; // body, right or bottom | |
151 | ||
152 | m_ctuGeomMap = X265_MALLOC(uint32_t, m_numRows * m_numCols); | |
153 | m_cuGeoms = X265_MALLOC(CUGeom, allocGeoms * CUGeom::MAX_GEOMS); | |
154 | if (!m_cuGeoms || !m_ctuGeomMap) | |
155 | return false; | |
156 | ||
157 | CUGeom cuLocalData[CUGeom::MAX_GEOMS]; | |
158 | memset(cuLocalData, 0, sizeof(cuLocalData)); // temporal fix for memcmp | |
159 | ||
160 | int countGeoms = 0; | |
161 | for (uint32_t ctuAddr = 0; ctuAddr < m_numRows * m_numCols; ctuAddr++) | |
162 | { | |
163 | /* TODO: detach this logic from TComDataCU */ | |
164 | encData.m_picCTU[ctuAddr].initCTU(*m_frame, ctuAddr, 0); | |
165 | encData.m_picCTU[ctuAddr].calcCTUGeoms(m_param->sourceWidth, m_param->sourceHeight, m_param->maxCUSize, cuLocalData); | |
166 | ||
167 | m_ctuGeomMap[ctuAddr] = MAX_INT; | |
168 | for (int i = 0; i < countGeoms; i++) | |
169 | { | |
170 | if (!memcmp(cuLocalData, m_cuGeoms + i * CUGeom::MAX_GEOMS, sizeof(CUGeom) * CUGeom::MAX_GEOMS)) | |
171 | { | |
172 | m_ctuGeomMap[ctuAddr] = i * CUGeom::MAX_GEOMS; | |
173 | break; | |
174 | } | |
175 | } | |
176 | ||
177 | if (m_ctuGeomMap[ctuAddr] == MAX_INT) | |
178 | { | |
179 | X265_CHECK(countGeoms < allocGeoms, "geometry match check failure\n"); | |
180 | m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS; | |
181 | memcpy(m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS, cuLocalData, sizeof(CUGeom) * CUGeom::MAX_GEOMS); | |
182 | countGeoms++; | |
183 | } | |
184 | } | |
185 | ||
186 | return true; | |
187 | } | |
188 | ||
189 | bool FrameEncoder::startCompressFrame(Frame* curFrame) | |
190 | { | |
191 | m_frame = curFrame; | |
192 | curFrame->m_encData->m_frameEncoderID = m_frameEncoderID; // Each Frame knows the ID of the FrameEncoder encoding it | |
193 | curFrame->m_encData->m_slice->m_mref = m_mref; | |
194 | if (!m_cuGeoms) | |
195 | { | |
196 | if (!initializeGeoms(*curFrame->m_encData)) | |
197 | return false; | |
198 | } | |
199 | m_enable.trigger(); | |
200 | return true; | |
201 | } | |
202 | ||
203 | void FrameEncoder::threadMain() | |
204 | { | |
205 | // worker thread routine for FrameEncoder | |
206 | do | |
207 | { | |
208 | m_enable.wait(); // Encoder::encode() triggers this event | |
209 | if (m_threadActive) | |
210 | { | |
211 | compressFrame(); | |
212 | m_done.trigger(); // FrameEncoder::getEncodedPicture() blocks for this event | |
213 | } | |
214 | } | |
215 | while (m_threadActive); | |
216 | } | |
217 | ||
218 | void FrameEncoder::compressFrame() | |
219 | { | |
220 | PPAScopeEvent(FrameEncoder_compressFrame); | |
221 | int64_t startCompressTime = x265_mdate(); | |
222 | Slice* slice = m_frame->m_encData->m_slice; | |
223 | ||
224 | /* Emit access unit delimiter unless this is the first frame and the user is | |
225 | * not repeating headers (since AUD is supposed to be the first NAL in the access | |
226 | * unit) */ | |
227 | if (m_param->bEnableAccessUnitDelimiters && (m_frame->m_poc || m_param->bRepeatHeaders)) | |
228 | { | |
229 | m_bs.resetBits(); | |
230 | m_entropyCoder.setBitstream(&m_bs); | |
231 | m_entropyCoder.codeAUD(*slice); | |
232 | m_bs.writeByteAlignment(); | |
233 | m_nalList.serialize(NAL_UNIT_ACCESS_UNIT_DELIMITER, m_bs); | |
234 | } | |
235 | if (m_frame->m_lowres.bKeyframe && m_param->bRepeatHeaders) | |
236 | m_top->getStreamHeaders(m_nalList, m_entropyCoder, m_bs); | |
237 | ||
238 | // Weighted Prediction parameters estimation. | |
239 | bool bUseWeightP = slice->m_sliceType == P_SLICE && slice->m_pps->bUseWeightPred; | |
240 | bool bUseWeightB = slice->m_sliceType == B_SLICE && slice->m_pps->bUseWeightedBiPred; | |
241 | if (bUseWeightP || bUseWeightB) | |
242 | weightAnalyse(*slice, *m_frame, *m_param); | |
243 | else | |
244 | slice->disableWeights(); | |
245 | ||
246 | // Generate motion references | |
247 | int numPredDir = slice->isInterP() ? 1 : slice->isInterB() ? 2 : 0; | |
248 | for (int l = 0; l < numPredDir; l++) | |
249 | { | |
250 | for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++) | |
251 | { | |
252 | WeightParam *w = NULL; | |
253 | if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].bPresentFlag) | |
254 | w = slice->m_weightPredTable[l][ref]; | |
255 | m_mref[l][ref].init(slice->m_refPicList[l][ref]->m_reconPicYuv, w); | |
256 | } | |
257 | } | |
258 | ||
259 | /* Get the QP for this frame from rate control. This call may block until | |
260 | * frames ahead of it in encode order have called rateControlEnd() */ | |
261 | int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top); | |
262 | m_rce.newQp = qp; | |
263 | ||
264 | /* Clip slice QP to 0-51 spec range before encoding */ | |
265 | slice->m_sliceQp = Clip3(-QP_BD_OFFSET, QP_MAX_SPEC, qp); | |
266 | ||
267 | m_initSliceContext.resetEntropy(*slice); | |
268 | ||
269 | m_frameFilter.start(m_frame, m_initSliceContext, qp); | |
270 | ||
271 | // reset entropy coders | |
272 | m_entropyCoder.load(m_initSliceContext); | |
273 | for (int i = 0; i < m_numRows; i++) | |
274 | m_rows[i].init(m_initSliceContext); | |
275 | ||
276 | uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : 1; | |
277 | if (!m_outStreams) | |
278 | { | |
279 | m_outStreams = new Bitstream[numSubstreams]; | |
280 | m_substreamSizes = X265_MALLOC(uint32_t, numSubstreams); | |
281 | if (!m_param->bEnableSAO) | |
282 | for (uint32_t i = 0; i < numSubstreams; i++) | |
283 | m_rows[i].rowGoOnCoder.setBitstream(&m_outStreams[i]); | |
284 | } | |
285 | else | |
286 | for (uint32_t i = 0; i < numSubstreams; i++) | |
287 | m_outStreams[i].resetBits(); | |
288 | ||
289 | if (m_frame->m_lowres.bKeyframe) | |
290 | { | |
291 | if (m_param->bEmitHRDSEI) | |
292 | { | |
293 | SEIBufferingPeriod* bpSei = &m_top->m_rateControl->m_bufPeriodSEI; | |
294 | ||
295 | // since the temporal layer HRD is not ready, we assumed it is fixed | |
296 | bpSei->m_auCpbRemovalDelayDelta = 1; | |
297 | bpSei->m_cpbDelayOffset = 0; | |
298 | bpSei->m_dpbDelayOffset = 0; | |
299 | ||
300 | // hrdFullness() calculates the initial CPB removal delay and offset | |
301 | m_top->m_rateControl->hrdFullness(bpSei); | |
302 | ||
303 | m_bs.resetBits(); | |
304 | bpSei->write(m_bs, *slice->m_sps); | |
305 | m_bs.writeByteAlignment(); | |
306 | ||
307 | m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs); | |
308 | ||
309 | m_top->m_lastBPSEI = m_rce.encodeOrder; | |
310 | } | |
311 | ||
312 | // The recovery point SEI message assists a decoder in determining when the decoding | |
313 | // process will produce acceptable pictures for display after the decoder initiates | |
314 | // random access. The m_recoveryPocCnt is in units of POC(picture order count) which | |
315 | // means pictures encoded after the CRA but precede it in display order(leading) are | |
316 | // implicitly discarded after a random access seek regardless of the value of | |
317 | // m_recoveryPocCnt. Our encoder does not use references prior to the most recent CRA, | |
318 | // so all pictures following the CRA in POC order are guaranteed to be displayable, | |
319 | // so m_recoveryPocCnt is always 0. | |
320 | SEIRecoveryPoint sei_recovery_point; | |
321 | sei_recovery_point.m_recoveryPocCnt = 0; | |
322 | sei_recovery_point.m_exactMatchingFlag = true; | |
323 | sei_recovery_point.m_brokenLinkFlag = false; | |
324 | ||
325 | m_bs.resetBits(); | |
326 | sei_recovery_point.write(m_bs, *slice->m_sps); | |
327 | m_bs.writeByteAlignment(); | |
328 | ||
329 | m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs); | |
330 | } | |
331 | ||
332 | if (m_param->bEmitHRDSEI || !!m_param->interlaceMode) | |
333 | { | |
334 | SEIPictureTiming *sei = m_rce.picTimingSEI; | |
335 | const VUI *vui = &slice->m_sps->vuiParameters; | |
336 | const HRDInfo *hrd = &vui->hrdParameters; | |
337 | int poc = slice->m_poc; | |
338 | ||
339 | if (vui->frameFieldInfoPresentFlag) | |
340 | { | |
341 | if (m_param->interlaceMode == 2) | |
342 | sei->m_picStruct = (poc & 1) ? 1 /* top */ : 2 /* bottom */; | |
343 | else if (m_param->interlaceMode == 1) | |
344 | sei->m_picStruct = (poc & 1) ? 2 /* bottom */ : 1 /* top */; | |
345 | else | |
346 | sei->m_picStruct = 0; | |
347 | sei->m_sourceScanType = 0; | |
348 | sei->m_duplicateFlag = false; | |
349 | } | |
350 | ||
351 | if (vui->hrdParametersPresentFlag) | |
352 | { | |
353 | // The m_aucpbremoval delay specifies how many clock ticks the | |
354 | // access unit associated with the picture timing SEI message has to | |
355 | // wait after removal of the access unit with the most recent | |
356 | // buffering period SEI message | |
357 | sei->m_auCpbRemovalDelay = X265_MIN(X265_MAX(1, m_rce.encodeOrder - m_top->m_lastBPSEI), (1 << hrd->cpbRemovalDelayLength)); | |
358 | sei->m_picDpbOutputDelay = slice->m_sps->numReorderPics + poc - m_rce.encodeOrder; | |
359 | } | |
360 | ||
361 | m_bs.resetBits(); | |
362 | sei->write(m_bs, *slice->m_sps); | |
363 | m_bs.writeByteAlignment(); | |
364 | m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs); | |
365 | } | |
366 | ||
367 | // Analyze CTU rows, most of the hard work is done here | |
368 | // frame is compressed in a wave-front pattern if WPP is enabled. Loop filter runs as a | |
369 | // wave-front behind the CU compression and reconstruction | |
370 | compressCTURows(); | |
371 | ||
372 | if (m_param->rc.bStatWrite) | |
373 | { | |
374 | int totalI = 0, totalP = 0, totalSkip = 0; | |
375 | ||
376 | // accumulate intra,inter,skip cu count per frame for 2 pass | |
377 | for (int i = 0; i < m_numRows; i++) | |
378 | { | |
379 | m_frameStats.mvBits += m_rows[i].rowStats.mvBits; | |
380 | m_frameStats.coeffBits += m_rows[i].rowStats.coeffBits; | |
381 | m_frameStats.miscBits += m_rows[i].rowStats.miscBits; | |
382 | totalI += m_rows[i].rowStats.iCuCnt; | |
383 | totalP += m_rows[i].rowStats.pCuCnt; | |
384 | totalSkip += m_rows[i].rowStats.skipCuCnt; | |
385 | } | |
386 | int totalCuCount = totalI + totalP + totalSkip; | |
387 | m_frameStats.percentIntra = (double)totalI / totalCuCount; | |
388 | m_frameStats.percentInter = (double)totalP / totalCuCount; | |
389 | m_frameStats.percentSkip = (double)totalSkip / totalCuCount; | |
390 | } | |
391 | ||
392 | m_bs.resetBits(); | |
393 | m_entropyCoder.load(m_initSliceContext); | |
394 | m_entropyCoder.setBitstream(&m_bs); | |
395 | m_entropyCoder.codeSliceHeader(*slice, *m_frame->m_encData); | |
396 | ||
397 | // finish encode of each CTU row, only required when SAO is enabled | |
398 | if (m_param->bEnableSAO) | |
399 | encodeSlice(); | |
400 | ||
401 | // serialize each row, record final lengths in slice header | |
402 | uint32_t maxStreamSize = m_nalList.serializeSubstreams(m_substreamSizes, numSubstreams, m_outStreams); | |
403 | ||
404 | // complete the slice header by writing WPP row-starts | |
405 | m_entropyCoder.setBitstream(&m_bs); | |
406 | if (slice->m_pps->bEntropyCodingSyncEnabled) | |
407 | m_entropyCoder.codeSliceHeaderWPPEntryPoints(*slice, m_substreamSizes, maxStreamSize); | |
408 | m_bs.writeByteAlignment(); | |
409 | ||
410 | m_nalList.serialize(slice->m_nalUnitType, m_bs); | |
411 | ||
412 | if (m_param->decodedPictureHashSEI) | |
413 | { | |
414 | if (m_param->decodedPictureHashSEI == 1) | |
415 | { | |
416 | m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::MD5; | |
417 | for (int i = 0; i < 3; i++) | |
418 | MD5Final(&m_state[i], m_seiReconPictureDigest.m_digest[i]); | |
419 | } | |
420 | else if (m_param->decodedPictureHashSEI == 2) | |
421 | { | |
422 | m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CRC; | |
423 | for (int i = 0; i < 3; i++) | |
424 | crcFinish(m_crc[i], m_seiReconPictureDigest.m_digest[i]); | |
425 | } | |
426 | else if (m_param->decodedPictureHashSEI == 3) | |
427 | { | |
428 | m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CHECKSUM; | |
429 | for (int i = 0; i < 3; i++) | |
430 | checksumFinish(m_checksum[i], m_seiReconPictureDigest.m_digest[i]); | |
431 | } | |
432 | ||
433 | m_bs.resetBits(); | |
434 | m_seiReconPictureDigest.write(m_bs, *slice->m_sps); | |
435 | m_bs.writeByteAlignment(); | |
436 | ||
437 | m_nalList.serialize(NAL_UNIT_SUFFIX_SEI, m_bs); | |
438 | } | |
439 | ||
440 | uint64_t bytes = 0; | |
441 | for (uint32_t i = 0; i < m_nalList.m_numNal; i++) | |
442 | { | |
443 | int type = m_nalList.m_nal[i].type; | |
444 | ||
445 | // exclude SEI | |
446 | if (type != NAL_UNIT_PREFIX_SEI && type != NAL_UNIT_SUFFIX_SEI) | |
447 | { | |
448 | bytes += m_nalList.m_nal[i].sizeBytes; | |
449 | // and exclude start code prefix | |
450 | bytes -= (!i || type == NAL_UNIT_SPS || type == NAL_UNIT_PPS) ? 4 : 3; | |
451 | } | |
452 | } | |
453 | m_accessUnitBits = bytes << 3; | |
454 | ||
455 | m_elapsedCompressTime = (double)(x265_mdate() - startCompressTime) / 1000000; | |
456 | /* rateControlEnd may also block for earlier frames to call rateControlUpdateStats */ | |
457 | if (m_top->m_rateControl->rateControlEnd(m_frame, m_accessUnitBits, &m_rce, &m_frameStats) < 0) | |
458 | m_top->m_aborted = true; | |
459 | ||
460 | /* Accumulate NR statistics from all worker threads */ | |
461 | if (m_nr) | |
462 | { | |
463 | for (int i = 0; i < m_top->m_numThreadLocalData; i++) | |
464 | { | |
465 | NoiseReduction* nr = &m_top->m_threadLocalData[i].analysis.m_quant.m_frameNr[m_frameEncoderID]; | |
466 | for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++) | |
467 | { | |
468 | for(int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++) | |
469 | m_nr->residualSum[cat][coeff] += nr->residualSum[cat][coeff]; | |
470 | ||
471 | m_nr->count[cat] += nr->count[cat]; | |
472 | } | |
473 | } | |
474 | } | |
475 | ||
476 | noiseReductionUpdate(); | |
477 | ||
478 | /* Copy updated NR coefficients back to all worker threads */ | |
479 | if (m_nr) | |
480 | { | |
481 | for (int i = 0; i < m_top->m_numThreadLocalData; i++) | |
482 | { | |
483 | NoiseReduction* nr = &m_top->m_threadLocalData[i].analysis.m_quant.m_frameNr[m_frameEncoderID]; | |
484 | memcpy(nr->offsetDenoise, m_nr->offsetDenoise, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS); | |
485 | memset(nr->count, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES); | |
486 | memset(nr->residualSum, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS); | |
487 | } | |
488 | } | |
489 | ||
490 | // Decrement referenced frame reference counts, allow them to be recycled | |
491 | for (int l = 0; l < numPredDir; l++) | |
492 | { | |
493 | for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++) | |
494 | { | |
495 | Frame *refpic = slice->m_refPicList[l][ref]; | |
496 | ATOMIC_DEC(&refpic->m_countRefEncoders); | |
497 | } | |
498 | } | |
499 | } | |
500 | ||
501 | void FrameEncoder::encodeSlice() | |
502 | { | |
503 | Slice* slice = m_frame->m_encData->m_slice; | |
504 | const uint32_t widthInLCUs = slice->m_sps->numCuInWidth; | |
505 | const uint32_t lastCUAddr = (slice->m_endCUAddr + NUM_CU_PARTITIONS - 1) / NUM_CU_PARTITIONS; | |
506 | const uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : 1; | |
507 | ||
508 | SAOParam* saoParam = slice->m_sps->bUseSAO ? m_frame->m_encData->m_saoParam : NULL; | |
509 | for (uint32_t cuAddr = 0; cuAddr < lastCUAddr; cuAddr++) | |
510 | { | |
511 | uint32_t col = cuAddr % widthInLCUs; | |
512 | uint32_t lin = cuAddr / widthInLCUs; | |
513 | uint32_t subStrm = lin % numSubstreams; | |
514 | CUData* ctu = m_frame->m_encData->getPicCTU(cuAddr); | |
515 | ||
516 | m_entropyCoder.setBitstream(&m_outStreams[subStrm]); | |
517 | ||
518 | // Synchronize cabac probabilities with upper-right CTU if it's available and we're at the start of a line. | |
519 | if (m_param->bEnableWavefront && !col && lin) | |
520 | { | |
521 | m_entropyCoder.copyState(m_initSliceContext); | |
522 | m_entropyCoder.loadContexts(m_rows[lin - 1].bufferedEntropy); | |
523 | } | |
524 | ||
525 | if (saoParam) | |
526 | { | |
527 | if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1]) | |
528 | { | |
529 | int mergeLeft = col && saoParam->ctuParam[0][cuAddr].mergeMode == SAO_MERGE_LEFT; | |
530 | int mergeUp = lin && saoParam->ctuParam[0][cuAddr].mergeMode == SAO_MERGE_UP; | |
531 | if (col) | |
532 | m_entropyCoder.codeSaoMerge(mergeLeft); | |
533 | if (lin && !mergeLeft) | |
534 | m_entropyCoder.codeSaoMerge(mergeUp); | |
535 | if (!mergeLeft && !mergeUp) | |
536 | { | |
537 | if (saoParam->bSaoFlag[0]) | |
538 | m_entropyCoder.codeSaoOffset(saoParam->ctuParam[0][cuAddr], 0); | |
539 | if (saoParam->bSaoFlag[1]) | |
540 | { | |
541 | m_entropyCoder.codeSaoOffset(saoParam->ctuParam[1][cuAddr], 1); | |
542 | m_entropyCoder.codeSaoOffset(saoParam->ctuParam[2][cuAddr], 2); | |
543 | } | |
544 | } | |
545 | } | |
546 | else | |
547 | { | |
548 | for (int i = 0; i < 3; i++) | |
549 | saoParam->ctuParam[i][cuAddr].reset(); | |
550 | } | |
551 | } | |
552 | ||
553 | // final coding (bitstream generation) for this CU | |
554 | m_entropyCoder.encodeCTU(*ctu, m_cuGeoms[m_ctuGeomMap[cuAddr]]); | |
555 | ||
556 | if (m_param->bEnableWavefront) | |
557 | { | |
558 | if (col == 1) | |
559 | // Store probabilities of second CTU in line into buffer | |
560 | m_rows[lin].bufferedEntropy.loadContexts(m_entropyCoder); | |
561 | ||
562 | if (col == widthInLCUs - 1) | |
563 | m_entropyCoder.finishSlice(); | |
564 | } | |
565 | } | |
566 | if (!m_param->bEnableWavefront) | |
567 | m_entropyCoder.finishSlice(); | |
568 | } | |
569 | ||
570 | void FrameEncoder::compressCTURows() | |
571 | { | |
572 | PPAScopeEvent(FrameEncoder_compressRows); | |
573 | Slice* slice = m_frame->m_encData->m_slice; | |
574 | ||
575 | m_bAllRowsStop = false; | |
576 | m_vbvResetTriggerRow = -1; | |
577 | ||
578 | m_SSDY = m_SSDU = m_SSDV = 0; | |
579 | m_ssim = 0; | |
580 | m_ssimCnt = 0; | |
581 | memset(&m_frameStats, 0, sizeof(m_frameStats)); | |
582 | ||
583 | bool bUseWeightP = slice->m_pps->bUseWeightPred && slice->m_sliceType == P_SLICE; | |
584 | bool bUseWeightB = slice->m_pps->bUseWeightedBiPred && slice->m_sliceType == B_SLICE; | |
585 | int numPredDir = slice->isInterP() ? 1 : slice->isInterB() ? 2 : 0; | |
586 | ||
587 | m_rows[0].active = true; | |
588 | if (m_pool && m_param->bEnableWavefront) | |
589 | { | |
590 | WaveFront::clearEnabledRowMask(); | |
591 | WaveFront::enqueue(); | |
592 | ||
593 | for (int row = 0; row < m_numRows; row++) | |
594 | { | |
595 | // block until all reference frames have reconstructed the rows we need | |
596 | for (int l = 0; l < numPredDir; l++) | |
597 | { | |
598 | for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++) | |
599 | { | |
600 | Frame *refpic = slice->m_refPicList[l][ref]; | |
601 | ||
602 | int reconRowCount = refpic->m_reconRowCount.get(); | |
603 | while ((reconRowCount != m_numRows) && (reconRowCount < row + m_refLagRows)) | |
604 | reconRowCount = refpic->m_reconRowCount.waitForChange(reconRowCount); | |
605 | ||
606 | if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted) | |
607 | m_mref[l][ref].applyWeight(row + m_refLagRows, m_numRows); | |
608 | } | |
609 | } | |
610 | ||
611 | enableRowEncoder(row); | |
612 | if (row == 0) | |
613 | enqueueRowEncoder(0); | |
614 | else | |
615 | m_pool->pokeIdleThread(); | |
616 | } | |
617 | ||
618 | m_completionEvent.wait(); | |
619 | ||
620 | WaveFront::dequeue(); | |
621 | } | |
622 | else | |
623 | { | |
624 | for (int i = 0; i < this->m_numRows + m_filterRowDelay; i++) | |
625 | { | |
626 | // Encode | |
627 | if (i < m_numRows) | |
628 | { | |
629 | // block until all reference frames have reconstructed the rows we need | |
630 | for (int l = 0; l < numPredDir; l++) | |
631 | { | |
632 | int list = l; | |
633 | for (int ref = 0; ref < slice->m_numRefIdx[list]; ref++) | |
634 | { | |
635 | Frame *refpic = slice->m_refPicList[list][ref]; | |
636 | ||
637 | int reconRowCount = refpic->m_reconRowCount.get(); | |
638 | while ((reconRowCount != m_numRows) && (reconRowCount < i + m_refLagRows)) | |
639 | reconRowCount = refpic->m_reconRowCount.waitForChange(reconRowCount); | |
640 | ||
641 | if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted) | |
642 | m_mref[list][ref].applyWeight(i + m_refLagRows, m_numRows); | |
643 | } | |
644 | } | |
645 | ||
646 | processRow(i * 2 + 0, -1); | |
647 | } | |
648 | ||
649 | // Filter | |
650 | if (i >= m_filterRowDelay) | |
651 | processRow((i - m_filterRowDelay) * 2 + 1, -1); | |
652 | } | |
653 | } | |
654 | m_frameTime = (double)m_totalTime / 1000000; | |
655 | m_totalTime = 0; | |
656 | } | |
657 | ||
658 | void FrameEncoder::processRow(int row, int threadId) | |
659 | { | |
660 | const int realRow = row >> 1; | |
661 | const int typeNum = row & 1; | |
662 | ||
663 | ThreadLocalData& tld = threadId >= 0 ? m_top->m_threadLocalData[threadId] : *m_tld; | |
664 | ||
665 | if (!typeNum) | |
666 | processRowEncoder(realRow, tld); | |
667 | else | |
668 | { | |
669 | processRowFilter(realRow); | |
670 | ||
671 | // NOTE: Active next row | |
672 | if (realRow != m_numRows - 1) | |
673 | enqueueRowFilter(realRow + 1); | |
674 | else | |
675 | m_completionEvent.trigger(); | |
676 | } | |
677 | } | |
678 | ||
679 | // Called by worker threads | |
680 | void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld) | |
681 | { | |
682 | PPAScopeEvent(Thread_ProcessRow); | |
683 | ||
684 | CTURow& curRow = m_rows[row]; | |
685 | ||
686 | { | |
687 | ScopedLock self(curRow.lock); | |
688 | if (!curRow.active) | |
689 | /* VBV restart is in progress, exit out */ | |
690 | return; | |
691 | if (curRow.busy) | |
692 | { | |
693 | /* On multi-socket Windows servers, we have seen problems with | |
694 | * ATOMIC_CAS which resulted in multiple worker threads processing | |
695 | * the same CU row, which often resulted in bad pointer accesses. We | |
696 | * believe the problem is fixed, but are leaving this check in place | |
697 | * to prevent crashes in case it is not */ | |
698 | x265_log(m_param, X265_LOG_WARNING, | |
699 | "internal error - simultaneous row access detected. Please report HW to x265-devel@videolan.org\n"); | |
700 | return; | |
701 | } | |
702 | curRow.busy = true; | |
703 | } | |
704 | ||
705 | /* When WPP is enabled, every row has its own row coder instance. Otherwise | |
706 | * they share row 0 */ | |
707 | Entropy& rowCoder = m_param->bEnableWavefront ? m_rows[row].rowGoOnCoder : m_rows[0].rowGoOnCoder; | |
708 | FrameData& curEncData = *m_frame->m_encData; | |
709 | Slice *slice = curEncData.m_slice; | |
710 | PicYuv* fencPic = m_frame->m_origPicYuv; | |
711 | ||
712 | tld.analysis.m_me.setSourcePlane(fencPic->m_picOrg[0], fencPic->m_stride); | |
713 | ||
714 | int64_t startTime = x265_mdate(); | |
715 | const uint32_t numCols = m_numCols; | |
716 | const uint32_t lineStartCUAddr = row * numCols; | |
717 | bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0; | |
718 | ||
719 | while (curRow.completed < numCols) | |
720 | { | |
721 | int col = curRow.completed; | |
722 | const uint32_t cuAddr = lineStartCUAddr + col; | |
723 | CUData* ctu = curEncData.getPicCTU(cuAddr); | |
724 | ctu->initCTU(*m_frame, cuAddr, slice->m_sliceQp); | |
725 | ||
726 | if (bIsVbv) | |
727 | { | |
728 | if (!row) | |
729 | { | |
730 | curEncData.m_rowStat[row].diagQp = curEncData.m_avgQpRc; | |
731 | curEncData.m_rowStat[row].diagQpScale = x265_qp2qScale(curEncData.m_avgQpRc); | |
732 | } | |
733 | ||
734 | if (row >= col && row && m_vbvResetTriggerRow != row) | |
735 | curEncData.m_cuStat[cuAddr].baseQp = curEncData.m_cuStat[cuAddr - numCols + 1].baseQp; | |
736 | else | |
737 | curEncData.m_cuStat[cuAddr].baseQp = curEncData.m_rowStat[row].diagQp; | |
738 | } | |
739 | else | |
740 | curEncData.m_cuStat[cuAddr].baseQp = curEncData.m_avgQpRc; | |
741 | ||
742 | if (m_param->rc.aqMode || bIsVbv) | |
743 | { | |
744 | int qp = calcQpForCu(cuAddr, curEncData.m_cuStat[cuAddr].baseQp); | |
745 | tld.analysis.setQP(*slice, qp); | |
746 | qp = Clip3(QP_MIN, QP_MAX_SPEC, qp); | |
747 | ctu->setQPSubParts((char)qp, 0, 0); | |
748 | curEncData.m_rowStat[row].sumQpAq += qp; | |
749 | } | |
750 | else | |
751 | tld.analysis.setQP(*slice, slice->m_sliceQp); | |
752 | ||
753 | if (m_param->bEnableWavefront && !col && row) | |
754 | { | |
755 | // Load SBAC coder context from previous row and initialize row state. | |
756 | rowCoder.copyState(m_initSliceContext); | |
757 | rowCoder.loadContexts(m_rows[row - 1].bufferedEntropy); | |
758 | } | |
759 | ||
760 | // Does all the CU analysis, returns best top level mode decision | |
761 | Search::Mode& best = tld.analysis.compressCTU(*ctu, *m_frame, m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder); | |
762 | ||
763 | /* advance top-level row coder to include the context of this CTU. | |
764 | * if SAO is disabled, rowCoder writes the final CTU bitstream */ | |
765 | rowCoder.encodeCTU(*ctu, m_cuGeoms[m_ctuGeomMap[cuAddr]]); | |
766 | ||
767 | if (m_param->bEnableWavefront && col == 1) | |
768 | // Save CABAC state for next row | |
769 | curRow.bufferedEntropy.loadContexts(rowCoder); | |
770 | ||
771 | // Completed CU processing | |
772 | curRow.completed++; | |
773 | ||
774 | if (m_param->bLogCuStats || m_param->rc.bStatWrite) | |
775 | collectCTUStatistics(*ctu); | |
776 | ||
777 | // copy no. of intra, inter Cu cnt per row into frame stats for 2 pass | |
778 | if (m_param->rc.bStatWrite) | |
779 | { | |
780 | curRow.rowStats.mvBits += best.mvBits; | |
781 | curRow.rowStats.coeffBits += best.coeffBits; | |
782 | curRow.rowStats.miscBits += best.totalBits - (best.mvBits + best.coeffBits); | |
783 | StatisticLog* log = &m_sliceTypeLog[slice->m_sliceType]; | |
784 | ||
785 | for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++) | |
786 | { | |
787 | /* 1 << shift == number of 8x8 blocks at current depth */ | |
788 | int shift = 2 * (g_maxCUDepth - depth); | |
789 | curRow.rowStats.iCuCnt += log->qTreeIntraCnt[depth] << shift; | |
790 | curRow.rowStats.pCuCnt += log->qTreeInterCnt[depth] << shift; | |
791 | curRow.rowStats.skipCuCnt += log->qTreeSkipCnt[depth] << shift; | |
792 | ||
793 | // clear the row cu data from thread local object | |
794 | log->qTreeIntraCnt[depth] = log->qTreeInterCnt[depth] = log->qTreeSkipCnt[depth] = 0; | |
795 | } | |
796 | } | |
797 | ||
798 | curEncData.m_cuStat[cuAddr].totalBits = best.totalBits; | |
799 | x265_emms(); | |
800 | ||
801 | if (bIsVbv) | |
802 | { | |
803 | // Update encoded bits, satdCost, baseQP for each CU | |
804 | curEncData.m_rowStat[row].diagSatd += curEncData.m_cuStat[cuAddr].vbvCost; | |
805 | curEncData.m_rowStat[row].diagIntraSatd += curEncData.m_cuStat[cuAddr].intraVbvCost; | |
806 | curEncData.m_rowStat[row].encodedBits += curEncData.m_cuStat[cuAddr].totalBits; | |
807 | curEncData.m_rowStat[row].sumQpRc += curEncData.m_cuStat[cuAddr].baseQp; | |
808 | curEncData.m_rowStat[row].numEncodedCUs = cuAddr; | |
809 | ||
810 | // If current block is at row diagonal checkpoint, call vbv ratecontrol. | |
811 | ||
812 | if (row == col && row) | |
813 | { | |
814 | double qpBase = curEncData.m_cuStat[cuAddr].baseQp; | |
815 | int reEncode = m_top->m_rateControl->rowDiagonalVbvRateControl(m_frame, row, &m_rce, qpBase); | |
816 | qpBase = Clip3((double)QP_MIN, (double)QP_MAX_MAX, qpBase); | |
817 | curEncData.m_rowStat[row].diagQp = qpBase; | |
818 | curEncData.m_rowStat[row].diagQpScale = x265_qp2qScale(qpBase); | |
819 | ||
820 | if (reEncode < 0) | |
821 | { | |
822 | x265_log(m_param, X265_LOG_DEBUG, "POC %d row %d - encode restart required for VBV, to %.2f from %.2f\n", | |
823 | m_frame->m_poc, row, qpBase, curEncData.m_cuStat[cuAddr].baseQp); | |
824 | ||
825 | // prevent the WaveFront::findJob() method from providing new jobs | |
826 | m_vbvResetTriggerRow = row; | |
827 | m_bAllRowsStop = true; | |
828 | ||
829 | for (int r = m_numRows - 1; r >= row; r--) | |
830 | { | |
831 | CTURow& stopRow = m_rows[r]; | |
832 | ||
833 | if (r != row) | |
834 | { | |
835 | /* if row was active (ready to be run) clear active bit and bitmap bit for this row */ | |
836 | stopRow.lock.acquire(); | |
837 | while (stopRow.active) | |
838 | { | |
839 | if (dequeueRow(r * 2)) | |
840 | stopRow.active = false; | |
841 | else | |
842 | GIVE_UP_TIME(); | |
843 | } | |
844 | ||
845 | stopRow.lock.release(); | |
846 | ||
847 | bool bRowBusy = true; | |
848 | do | |
849 | { | |
850 | stopRow.lock.acquire(); | |
851 | bRowBusy = stopRow.busy; | |
852 | stopRow.lock.release(); | |
853 | ||
854 | if (bRowBusy) | |
855 | { | |
856 | GIVE_UP_TIME(); | |
857 | } | |
858 | } | |
859 | while (bRowBusy); | |
860 | } | |
861 | ||
862 | m_outStreams[r].resetBits(); | |
863 | stopRow.completed = 0; | |
864 | memset(&stopRow.rowStats, 0, sizeof(stopRow.rowStats)); | |
865 | curEncData.m_rowStat[r].numEncodedCUs = 0; | |
866 | curEncData.m_rowStat[r].encodedBits = 0; | |
867 | curEncData.m_rowStat[r].diagSatd = 0; | |
868 | curEncData.m_rowStat[r].diagIntraSatd = 0; | |
869 | curEncData.m_rowStat[r].sumQpRc = 0; | |
870 | curEncData.m_rowStat[r].sumQpAq = 0; | |
871 | } | |
872 | ||
873 | m_bAllRowsStop = false; | |
874 | } | |
875 | } | |
876 | } | |
877 | ||
878 | // NOTE: do CU level Filter | |
879 | if (m_param->bEnableSAO && m_param->bSaoNonDeblocked) | |
880 | // SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas | |
881 | m_frameFilter.m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row); | |
882 | ||
883 | // NOTE: active next row | |
884 | if (curRow.completed >= 2 && row < m_numRows - 1) | |
885 | { | |
886 | ScopedLock below(m_rows[row + 1].lock); | |
887 | if (m_rows[row + 1].active == false && | |
888 | m_rows[row + 1].completed + 2 <= curRow.completed && | |
889 | (!m_bAllRowsStop || row + 1 < m_vbvResetTriggerRow)) | |
890 | { | |
891 | m_rows[row + 1].active = true; | |
892 | enqueueRowEncoder(row + 1); | |
893 | } | |
894 | } | |
895 | ||
896 | ScopedLock self(curRow.lock); | |
897 | if ((m_bAllRowsStop && row > m_vbvResetTriggerRow) || | |
898 | (row > 0 && curRow.completed < numCols - 1 && m_rows[row - 1].completed < m_rows[row].completed + 2)) | |
899 | { | |
900 | curRow.active = false; | |
901 | curRow.busy = false; | |
902 | m_totalTime += x265_mdate() - startTime; | |
903 | return; | |
904 | } | |
905 | } | |
906 | ||
907 | /* *this row of CTUs has been encoded* */ | |
908 | ||
909 | /* flush row bitstream (if WPP and no SAO) or flush frame if no WPP and no SAO */ | |
910 | if (!m_param->bEnableSAO && (m_param->bEnableWavefront || row == m_numRows - 1)) | |
911 | rowCoder.finishSlice(); | |
912 | ||
913 | /* If encoding with ABR, update update bits and complexity in rate control | |
914 | * after a number of rows so the next frame's rateControlStart has more | |
915 | * accurate data for estimation. At the start of the encode we update stats | |
916 | * after half the frame is encoded, but after this initial period we update | |
917 | * after refLagRows (the number of rows reference frames must have completed | |
918 | * before referencees may begin encoding) */ | |
919 | int rowCount = 0; | |
920 | if (m_param->rc.rateControlMode == X265_RC_ABR) | |
921 | { | |
922 | if ((uint32_t)m_rce.encodeOrder <= 2 * (m_param->fpsNum / m_param->fpsDenom)) | |
923 | rowCount = X265_MIN((m_numRows + 1) / 2, m_numRows - 1); | |
924 | else | |
925 | rowCount = X265_MIN(m_refLagRows, m_numRows - 1); | |
926 | } | |
927 | if (row == rowCount) | |
928 | { | |
929 | m_rce.rowTotalBits = 0; | |
930 | if (bIsVbv) | |
931 | for (int i = 0; i < rowCount; i++) | |
932 | m_rce.rowTotalBits += curEncData.m_rowStat[i].encodedBits; | |
933 | else | |
934 | for (uint32_t cuAddr = 0; cuAddr < rowCount * numCols; cuAddr++) | |
935 | m_rce.rowTotalBits += curEncData.m_cuStat[cuAddr].totalBits; | |
936 | ||
937 | m_top->m_rateControl->rateControlUpdateStats(&m_rce); | |
938 | } | |
939 | ||
940 | // trigger row-wise loop filters | |
941 | if (row >= m_filterRowDelay) | |
942 | { | |
943 | enableRowFilter(row - m_filterRowDelay); | |
944 | ||
945 | // NOTE: Active Filter to first row (row 0) | |
946 | if (row == m_filterRowDelay) | |
947 | enqueueRowFilter(0); | |
948 | } | |
949 | if (row == m_numRows - 1) | |
950 | { | |
951 | for (int i = m_numRows - m_filterRowDelay; i < m_numRows; i++) | |
952 | enableRowFilter(i); | |
953 | } | |
954 | ||
955 | m_totalTime += x265_mdate() - startTime; | |
956 | curRow.busy = false; | |
957 | } | |
958 | ||
959 | void FrameEncoder::collectCTUStatistics(CUData& ctu) | |
960 | { | |
961 | StatisticLog* log = &m_sliceTypeLog[ctu.m_slice->m_sliceType]; | |
962 | ||
963 | if (ctu.m_slice->m_sliceType == I_SLICE) | |
964 | { | |
965 | uint32_t depth = 0; | |
966 | for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2)) | |
967 | { | |
968 | depth = ctu.m_cuDepth[absPartIdx]; | |
969 | ||
970 | log->totalCu++; | |
971 | log->cntIntra[depth]++; | |
972 | log->qTreeIntraCnt[depth]++; | |
973 | ||
974 | if (ctu.m_partSize[absPartIdx] == SIZE_NONE) | |
975 | { | |
976 | log->totalCu--; | |
977 | log->cntIntra[depth]--; | |
978 | log->qTreeIntraCnt[depth]--; | |
979 | } | |
980 | else if (ctu.m_partSize[absPartIdx] == SIZE_NxN) | |
981 | { | |
982 | /* TODO: log intra modes at absPartIdx +0 to +3 */ | |
983 | X265_CHECK(depth == g_maxCUDepth, "Intra NxN found at improbable depth\n"); | |
984 | log->cntIntraNxN++; | |
985 | log->cntIntra[depth]--; | |
986 | } | |
987 | else if (ctu.m_lumaIntraDir[absPartIdx] > 1) | |
988 | log->cuIntraDistribution[depth][ANGULAR_MODE_ID]++; | |
989 | else | |
990 | log->cuIntraDistribution[depth][ctu.m_lumaIntraDir[absPartIdx]]++; | |
991 | } | |
992 | } | |
993 | else | |
994 | { | |
995 | uint32_t depth = 0; | |
996 | for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2)) | |
997 | { | |
998 | depth = ctu.m_cuDepth[absPartIdx]; | |
999 | ||
1000 | log->totalCu++; | |
1001 | log->cntTotalCu[depth]++; | |
1002 | ||
1003 | if (ctu.m_partSize[absPartIdx] == SIZE_NONE) | |
1004 | { | |
1005 | log->totalCu--; | |
1006 | log->cntTotalCu[depth]--; | |
1007 | } | |
1008 | else if (ctu.isSkipped(absPartIdx)) | |
1009 | { | |
1010 | log->totalCu--; | |
1011 | log->cntSkipCu[depth]++; | |
1012 | log->qTreeSkipCnt[depth]++; | |
1013 | } | |
1014 | else if (ctu.m_predMode[absPartIdx] == MODE_INTER) | |
1015 | { | |
1016 | log->cntInter[depth]++; | |
1017 | log->qTreeInterCnt[depth]++; | |
1018 | ||
1019 | if (ctu.m_partSize[absPartIdx] < AMP_ID) | |
1020 | log->cuInterDistribution[depth][ctu.m_partSize[absPartIdx]]++; | |
1021 | else | |
1022 | log->cuInterDistribution[depth][AMP_ID]++; | |
1023 | } | |
1024 | else if (ctu.m_predMode[absPartIdx] == MODE_INTRA) | |
1025 | { | |
1026 | log->cntIntra[depth]++; | |
1027 | log->qTreeIntraCnt[depth]++; | |
1028 | ||
1029 | if (ctu.m_partSize[absPartIdx] == SIZE_NxN) | |
1030 | { | |
1031 | X265_CHECK(depth == g_maxCUDepth, "Intra NxN found at improbable depth\n"); | |
1032 | log->cntIntraNxN++; | |
1033 | /* TODO: log intra modes at absPartIdx +0 to +3 */ | |
1034 | } | |
1035 | else if (ctu.m_lumaIntraDir[absPartIdx] > 1) | |
1036 | log->cuIntraDistribution[depth][ANGULAR_MODE_ID]++; | |
1037 | else | |
1038 | log->cuIntraDistribution[depth][ctu.m_lumaIntraDir[absPartIdx]]++; | |
1039 | } | |
1040 | } | |
1041 | } | |
1042 | } | |
1043 | ||
1044 | /* DCT-domain noise reduction / adaptive deadzone from libavcodec */ | |
1045 | void FrameEncoder::noiseReductionUpdate() | |
1046 | { | |
1047 | if (!m_nr) | |
1048 | return; | |
1049 | ||
1050 | static const uint32_t maxBlocksPerTrSize[4] = {1 << 18, 1 << 16, 1 << 14, 1 << 12}; | |
1051 | ||
1052 | for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++) | |
1053 | { | |
1054 | int trSize = cat & 3; | |
1055 | int coefCount = 1 << ((trSize + 2) * 2); | |
1056 | ||
1057 | if (m_nr->count[cat] > maxBlocksPerTrSize[trSize]) | |
1058 | { | |
1059 | for (int i = 0; i < coefCount; i++) | |
1060 | m_nr->residualSum[cat][i] >>= 1; | |
1061 | m_nr->count[cat] >>= 1; | |
1062 | } | |
1063 | ||
1064 | uint64_t scaledCount = (uint64_t)m_param->noiseReduction * m_nr->count[cat]; | |
1065 | ||
1066 | for (int i = 0; i < coefCount; i++) | |
1067 | { | |
1068 | uint64_t value = scaledCount + m_nr->residualSum[cat][i] / 2; | |
1069 | uint64_t denom = m_nr->residualSum[cat][i] + 1; | |
1070 | m_nr->offsetDenoise[cat][i] = (uint16_t)(value / denom); | |
1071 | } | |
1072 | ||
1073 | // Don't denoise DC coefficients | |
1074 | m_nr->offsetDenoise[cat][0] = 0; | |
1075 | } | |
1076 | } | |
1077 | ||
1078 | int FrameEncoder::calcQpForCu(uint32_t ctuAddr, double baseQp) | |
1079 | { | |
1080 | x265_emms(); | |
1081 | double qp = baseQp; | |
1082 | ||
1083 | FrameData& curEncData = *m_frame->m_encData; | |
1084 | /* clear cuCostsForVbv from when vbv row reset was triggered */ | |
1085 | bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0; | |
1086 | if (bIsVbv) | |
1087 | { | |
1088 | curEncData.m_cuStat[ctuAddr].vbvCost = 0; | |
1089 | curEncData.m_cuStat[ctuAddr].intraVbvCost = 0; | |
1090 | } | |
1091 | ||
1092 | /* Derive qpOffet for each CU by averaging offsets for all 16x16 blocks in the cu. */ | |
1093 | double qp_offset = 0; | |
1094 | uint32_t maxBlockCols = (m_frame->m_origPicYuv->m_picWidth + (16 - 1)) / 16; | |
1095 | uint32_t maxBlockRows = (m_frame->m_origPicYuv->m_picHeight + (16 - 1)) / 16; | |
1096 | uint32_t noOfBlocks = g_maxCUSize / 16; | |
1097 | uint32_t block_y = (ctuAddr / curEncData.m_slice->m_sps->numCuInWidth) * noOfBlocks; | |
1098 | uint32_t block_x = (ctuAddr * noOfBlocks) - block_y * curEncData.m_slice->m_sps->numCuInWidth; | |
1099 | ||
1100 | /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */ | |
1101 | bool isReferenced = IS_REFERENCED(m_frame); | |
1102 | double *qpoffs = (isReferenced && m_param->rc.cuTree) ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset; | |
1103 | ||
1104 | uint32_t cnt = 0, idx = 0; | |
1105 | for (uint32_t h = 0; h < noOfBlocks && block_y < maxBlockRows; h++, block_y++) | |
1106 | { | |
1107 | for (uint32_t w = 0; w < noOfBlocks && (block_x + w) < maxBlockCols; w++) | |
1108 | { | |
1109 | idx = block_x + w + (block_y * maxBlockCols); | |
1110 | if (m_param->rc.aqMode) | |
1111 | qp_offset += qpoffs[idx]; | |
1112 | if (bIsVbv) | |
1113 | { | |
1114 | curEncData.m_cuStat[ctuAddr].vbvCost += m_frame->m_lowres.lowresCostForRc[idx] & LOWRES_COST_MASK; | |
1115 | curEncData.m_cuStat[ctuAddr].intraVbvCost += m_frame->m_lowres.intraCost[idx]; | |
1116 | } | |
1117 | cnt++; | |
1118 | } | |
1119 | } | |
1120 | ||
1121 | qp_offset /= cnt; | |
1122 | qp += qp_offset; | |
1123 | ||
1124 | return Clip3(QP_MIN, QP_MAX_MAX, (int)(qp + 0.5)); | |
1125 | } | |
1126 | ||
1127 | Frame *FrameEncoder::getEncodedPicture(NALList& output) | |
1128 | { | |
1129 | if (m_frame) | |
1130 | { | |
1131 | /* block here until worker thread completes */ | |
1132 | m_done.wait(); | |
1133 | ||
1134 | Frame *ret = m_frame; | |
1135 | m_frame = NULL; | |
1136 | output.takeContents(m_nalList); | |
1137 | return ret; | |
1138 | } | |
1139 | ||
1140 | return NULL; | |
1141 | } | |
1142 | } |