Commit | Line | Data |
---|---|---|
72b9787e JB |
1 | /***************************************************************************** |
2 | * Copyright (C) 2013 x265 project | |
3 | * | |
4 | * Authors: Chung Shin Yee <shinyee@multicorewareinc.com> | |
5 | * Min Chen <chenm003@163.com> | |
6 | * Steve Borho <steve@borho.org> | |
7 | * | |
8 | * This program is free software; you can redistribute it and/or modify | |
9 | * it under the terms of the GNU General Public License as published by | |
10 | * the Free Software Foundation; either version 2 of the License, or | |
11 | * (at your option) any later version. | |
12 | * | |
13 | * This program is distributed in the hope that it will be useful, | |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | * GNU General Public License for more details. | |
17 | * | |
18 | * You should have received a copy of the GNU General Public License | |
19 | * along with this program; if not, write to the Free Software | |
20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
21 | * | |
22 | * This program is also available under a commercial proprietary license. | |
23 | * For more information, contact us at license @ x265.com. | |
24 | *****************************************************************************/ | |
25 | ||
26 | #include "common.h" | |
27 | #include "frame.h" | |
28 | #include "framedata.h" | |
29 | #include "wavefront.h" | |
30 | #include "param.h" | |
31 | ||
72b9787e JB |
32 | #include "encoder.h" |
33 | #include "frameencoder.h" | |
34 | #include "common.h" | |
35 | #include "slicetype.h" | |
36 | #include "nal.h" | |
37 | ||
38 | namespace x265 { | |
39 | void weightAnalyse(Slice& slice, Frame& frame, x265_param& param); | |
40 | ||
41 | FrameEncoder::FrameEncoder() | |
42 | : WaveFront(NULL) | |
43 | , m_threadActive(true) | |
44 | { | |
45 | m_totalTime = 0; | |
46 | m_frameEncoderID = 0; | |
47 | m_bAllRowsStop = false; | |
48 | m_vbvResetTriggerRow = -1; | |
49 | m_outStreams = NULL; | |
50 | m_substreamSizes = NULL; | |
51 | m_nr = NULL; | |
52 | m_tld = NULL; | |
53 | m_rows = NULL; | |
54 | m_top = NULL; | |
55 | m_param = NULL; | |
56 | m_frame = NULL; | |
57 | m_cuGeoms = NULL; | |
58 | m_ctuGeomMap = NULL; | |
59 | memset(&m_frameStats, 0, sizeof(m_frameStats)); | |
60 | memset(&m_rce, 0, sizeof(RateControlEntry)); | |
61 | } | |
62 | ||
63 | void FrameEncoder::destroy() | |
64 | { | |
65 | if (m_pool) | |
66 | JobProvider::flush(); // ensure no worker threads are using this frame | |
67 | ||
68 | m_threadActive = false; | |
69 | m_enable.trigger(); | |
70 | ||
71 | delete[] m_rows; | |
72 | delete[] m_outStreams; | |
73 | X265_FREE(m_cuGeoms); | |
74 | X265_FREE(m_ctuGeomMap); | |
75 | X265_FREE(m_substreamSizes); | |
76 | X265_FREE(m_nr); | |
77 | ||
78 | m_frameFilter.destroy(); | |
79 | ||
80 | if (m_param->bEmitHRDSEI || !!m_param->interlaceMode) | |
81 | { | |
82 | delete m_rce.picTimingSEI; | |
83 | delete m_rce.hrdTiming; | |
84 | } | |
85 | ||
86 | // wait for worker thread to exit | |
87 | stop(); | |
88 | } | |
89 | ||
90 | bool FrameEncoder::init(Encoder *top, int numRows, int numCols, int id) | |
91 | { | |
92 | m_top = top; | |
93 | m_param = top->m_param; | |
94 | m_numRows = numRows; | |
95 | m_numCols = numCols; | |
96 | m_filterRowDelay = (m_param->bEnableSAO && m_param->bSaoNonDeblocked) ? | |
97 | 2 : (m_param->bEnableSAO || m_param->bEnableLoopFilter ? 1 : 0); | |
98 | m_filterRowDelayCus = m_filterRowDelay * numCols; | |
99 | m_frameEncoderID = id; | |
100 | m_rows = new CTURow[m_numRows]; | |
101 | bool ok = !!m_numRows; | |
102 | ||
103 | int range = m_param->searchRange; /* fpel search */ | |
104 | range += 1; /* diamond search range check lag */ | |
105 | range += 2; /* subpel refine */ | |
106 | range += NTAPS_LUMA / 2; /* subpel filter half-length */ | |
107 | m_refLagRows = 1 + ((range + g_maxCUSize - 1) / g_maxCUSize); | |
108 | ||
109 | // NOTE: 2 times of numRows because both Encoder and Filter in same queue | |
110 | if (!WaveFront::init(m_numRows * 2)) | |
111 | { | |
112 | x265_log(m_param, X265_LOG_ERROR, "unable to initialize wavefront queue\n"); | |
113 | m_pool = NULL; | |
114 | } | |
115 | ||
116 | m_frameFilter.init(top, this, numRows); | |
117 | ||
118 | // initialize HRD parameters of SPS | |
119 | if (m_param->bEmitHRDSEI || !!m_param->interlaceMode) | |
120 | { | |
121 | m_rce.picTimingSEI = new SEIPictureTiming; | |
122 | m_rce.hrdTiming = new HRDTiming; | |
123 | ||
124 | ok &= m_rce.picTimingSEI && m_rce.hrdTiming; | |
125 | } | |
126 | ||
b53f7c52 | 127 | if (m_param->noiseReductionIntra || m_param->noiseReductionInter) |
72b9787e JB |
128 | m_nr = X265_MALLOC(NoiseReduction, 1); |
129 | if (m_nr) | |
130 | memset(m_nr, 0, sizeof(NoiseReduction)); | |
131 | else | |
b53f7c52 | 132 | m_param->noiseReductionIntra = m_param->noiseReductionInter = 0; |
72b9787e JB |
133 | |
134 | start(); | |
135 | return ok; | |
136 | } | |
137 | ||
138 | /* Generate a complete list of unique geom sets for the current picture dimensions */ | |
b53f7c52 | 139 | bool FrameEncoder::initializeGeoms() |
72b9787e JB |
140 | { |
141 | /* Geoms only vary between CTUs in the presence of picture edges */ | |
b53f7c52 JB |
142 | int maxCUSize = m_param->maxCUSize; |
143 | int heightRem = m_param->sourceHeight & (maxCUSize - 1); | |
144 | int widthRem = m_param->sourceWidth & (maxCUSize - 1); | |
72b9787e JB |
145 | int allocGeoms = 1; // body |
146 | if (heightRem && widthRem) | |
147 | allocGeoms = 4; // body, right, bottom, corner | |
148 | else if (heightRem || widthRem) | |
149 | allocGeoms = 2; // body, right or bottom | |
150 | ||
151 | m_ctuGeomMap = X265_MALLOC(uint32_t, m_numRows * m_numCols); | |
152 | m_cuGeoms = X265_MALLOC(CUGeom, allocGeoms * CUGeom::MAX_GEOMS); | |
153 | if (!m_cuGeoms || !m_ctuGeomMap) | |
154 | return false; | |
155 | ||
b53f7c52 JB |
156 | // body |
157 | CUData::calcCTUGeoms(maxCUSize, maxCUSize, maxCUSize, m_cuGeoms); | |
158 | memset(m_ctuGeomMap, 0, sizeof(uint32_t) * m_numRows * m_numCols); | |
159 | if (allocGeoms == 1) | |
160 | return true; | |
72b9787e | 161 | |
b53f7c52 JB |
162 | int countGeoms = 1; |
163 | if (widthRem) | |
72b9787e | 164 | { |
b53f7c52 JB |
165 | // right |
166 | CUData::calcCTUGeoms(widthRem, maxCUSize, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS); | |
167 | for (int i = 0; i < m_numRows; i++) | |
72b9787e | 168 | { |
b53f7c52 JB |
169 | uint32_t ctuAddr = m_numCols * (i + 1) - 1; |
170 | m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS; | |
72b9787e | 171 | } |
b53f7c52 JB |
172 | countGeoms++; |
173 | } | |
174 | if (heightRem) | |
175 | { | |
176 | // bottom | |
177 | CUData::calcCTUGeoms(maxCUSize, heightRem, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS); | |
178 | for (uint32_t i = 0; i < m_numCols; i++) | |
179 | { | |
180 | uint32_t ctuAddr = m_numCols * (m_numRows - 1) + i; | |
181 | m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS; | |
182 | } | |
183 | countGeoms++; | |
72b9787e | 184 | |
b53f7c52 | 185 | if (widthRem) |
72b9787e | 186 | { |
b53f7c52 JB |
187 | // corner |
188 | CUData::calcCTUGeoms(widthRem, heightRem, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS); | |
189 | ||
190 | uint32_t ctuAddr = m_numCols * m_numRows - 1; | |
72b9787e | 191 | m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS; |
72b9787e JB |
192 | countGeoms++; |
193 | } | |
b53f7c52 | 194 | X265_CHECK(countGeoms == allocGeoms, "geometry match check failure\n"); |
72b9787e JB |
195 | } |
196 | ||
197 | return true; | |
198 | } | |
199 | ||
200 | bool FrameEncoder::startCompressFrame(Frame* curFrame) | |
201 | { | |
202 | m_frame = curFrame; | |
203 | curFrame->m_encData->m_frameEncoderID = m_frameEncoderID; // Each Frame knows the ID of the FrameEncoder encoding it | |
204 | curFrame->m_encData->m_slice->m_mref = m_mref; | |
b53f7c52 | 205 | |
72b9787e JB |
206 | if (!m_cuGeoms) |
207 | { | |
b53f7c52 | 208 | if (!initializeGeoms()) |
72b9787e JB |
209 | return false; |
210 | } | |
b53f7c52 | 211 | |
72b9787e JB |
212 | m_enable.trigger(); |
213 | return true; | |
214 | } | |
215 | ||
216 | void FrameEncoder::threadMain() | |
217 | { | |
218 | // worker thread routine for FrameEncoder | |
219 | do | |
220 | { | |
221 | m_enable.wait(); // Encoder::encode() triggers this event | |
222 | if (m_threadActive) | |
223 | { | |
224 | compressFrame(); | |
225 | m_done.trigger(); // FrameEncoder::getEncodedPicture() blocks for this event | |
226 | } | |
227 | } | |
228 | while (m_threadActive); | |
229 | } | |
230 | ||
231 | void FrameEncoder::compressFrame() | |
232 | { | |
b53f7c52 | 233 | //ProfileScopeEvent(frameThread); |
72b9787e JB |
234 | int64_t startCompressTime = x265_mdate(); |
235 | Slice* slice = m_frame->m_encData->m_slice; | |
236 | ||
237 | /* Emit access unit delimiter unless this is the first frame and the user is | |
238 | * not repeating headers (since AUD is supposed to be the first NAL in the access | |
239 | * unit) */ | |
240 | if (m_param->bEnableAccessUnitDelimiters && (m_frame->m_poc || m_param->bRepeatHeaders)) | |
241 | { | |
242 | m_bs.resetBits(); | |
243 | m_entropyCoder.setBitstream(&m_bs); | |
244 | m_entropyCoder.codeAUD(*slice); | |
245 | m_bs.writeByteAlignment(); | |
246 | m_nalList.serialize(NAL_UNIT_ACCESS_UNIT_DELIMITER, m_bs); | |
247 | } | |
248 | if (m_frame->m_lowres.bKeyframe && m_param->bRepeatHeaders) | |
249 | m_top->getStreamHeaders(m_nalList, m_entropyCoder, m_bs); | |
250 | ||
251 | // Weighted Prediction parameters estimation. | |
252 | bool bUseWeightP = slice->m_sliceType == P_SLICE && slice->m_pps->bUseWeightPred; | |
253 | bool bUseWeightB = slice->m_sliceType == B_SLICE && slice->m_pps->bUseWeightedBiPred; | |
254 | if (bUseWeightP || bUseWeightB) | |
255 | weightAnalyse(*slice, *m_frame, *m_param); | |
256 | else | |
257 | slice->disableWeights(); | |
258 | ||
259 | // Generate motion references | |
260 | int numPredDir = slice->isInterP() ? 1 : slice->isInterB() ? 2 : 0; | |
261 | for (int l = 0; l < numPredDir; l++) | |
262 | { | |
263 | for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++) | |
264 | { | |
265 | WeightParam *w = NULL; | |
266 | if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].bPresentFlag) | |
267 | w = slice->m_weightPredTable[l][ref]; | |
b53f7c52 | 268 | m_mref[l][ref].init(slice->m_refPicList[l][ref]->m_reconPic, w, *m_param); |
72b9787e JB |
269 | } |
270 | } | |
271 | ||
272 | /* Get the QP for this frame from rate control. This call may block until | |
273 | * frames ahead of it in encode order have called rateControlEnd() */ | |
274 | int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top); | |
275 | m_rce.newQp = qp; | |
276 | ||
277 | /* Clip slice QP to 0-51 spec range before encoding */ | |
278 | slice->m_sliceQp = Clip3(-QP_BD_OFFSET, QP_MAX_SPEC, qp); | |
279 | ||
280 | m_initSliceContext.resetEntropy(*slice); | |
281 | ||
282 | m_frameFilter.start(m_frame, m_initSliceContext, qp); | |
283 | ||
284 | // reset entropy coders | |
285 | m_entropyCoder.load(m_initSliceContext); | |
286 | for (int i = 0; i < m_numRows; i++) | |
287 | m_rows[i].init(m_initSliceContext); | |
288 | ||
289 | uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : 1; | |
290 | if (!m_outStreams) | |
291 | { | |
292 | m_outStreams = new Bitstream[numSubstreams]; | |
293 | m_substreamSizes = X265_MALLOC(uint32_t, numSubstreams); | |
294 | if (!m_param->bEnableSAO) | |
295 | for (uint32_t i = 0; i < numSubstreams; i++) | |
296 | m_rows[i].rowGoOnCoder.setBitstream(&m_outStreams[i]); | |
297 | } | |
298 | else | |
299 | for (uint32_t i = 0; i < numSubstreams; i++) | |
300 | m_outStreams[i].resetBits(); | |
301 | ||
302 | if (m_frame->m_lowres.bKeyframe) | |
303 | { | |
304 | if (m_param->bEmitHRDSEI) | |
305 | { | |
306 | SEIBufferingPeriod* bpSei = &m_top->m_rateControl->m_bufPeriodSEI; | |
307 | ||
308 | // since the temporal layer HRD is not ready, we assumed it is fixed | |
309 | bpSei->m_auCpbRemovalDelayDelta = 1; | |
310 | bpSei->m_cpbDelayOffset = 0; | |
311 | bpSei->m_dpbDelayOffset = 0; | |
312 | ||
313 | // hrdFullness() calculates the initial CPB removal delay and offset | |
314 | m_top->m_rateControl->hrdFullness(bpSei); | |
315 | ||
316 | m_bs.resetBits(); | |
317 | bpSei->write(m_bs, *slice->m_sps); | |
318 | m_bs.writeByteAlignment(); | |
319 | ||
320 | m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs); | |
321 | ||
322 | m_top->m_lastBPSEI = m_rce.encodeOrder; | |
323 | } | |
324 | ||
325 | // The recovery point SEI message assists a decoder in determining when the decoding | |
326 | // process will produce acceptable pictures for display after the decoder initiates | |
327 | // random access. The m_recoveryPocCnt is in units of POC(picture order count) which | |
328 | // means pictures encoded after the CRA but precede it in display order(leading) are | |
329 | // implicitly discarded after a random access seek regardless of the value of | |
330 | // m_recoveryPocCnt. Our encoder does not use references prior to the most recent CRA, | |
331 | // so all pictures following the CRA in POC order are guaranteed to be displayable, | |
332 | // so m_recoveryPocCnt is always 0. | |
333 | SEIRecoveryPoint sei_recovery_point; | |
334 | sei_recovery_point.m_recoveryPocCnt = 0; | |
335 | sei_recovery_point.m_exactMatchingFlag = true; | |
336 | sei_recovery_point.m_brokenLinkFlag = false; | |
337 | ||
338 | m_bs.resetBits(); | |
339 | sei_recovery_point.write(m_bs, *slice->m_sps); | |
340 | m_bs.writeByteAlignment(); | |
341 | ||
342 | m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs); | |
343 | } | |
344 | ||
345 | if (m_param->bEmitHRDSEI || !!m_param->interlaceMode) | |
346 | { | |
347 | SEIPictureTiming *sei = m_rce.picTimingSEI; | |
348 | const VUI *vui = &slice->m_sps->vuiParameters; | |
349 | const HRDInfo *hrd = &vui->hrdParameters; | |
350 | int poc = slice->m_poc; | |
351 | ||
352 | if (vui->frameFieldInfoPresentFlag) | |
353 | { | |
354 | if (m_param->interlaceMode == 2) | |
355 | sei->m_picStruct = (poc & 1) ? 1 /* top */ : 2 /* bottom */; | |
356 | else if (m_param->interlaceMode == 1) | |
357 | sei->m_picStruct = (poc & 1) ? 2 /* bottom */ : 1 /* top */; | |
358 | else | |
359 | sei->m_picStruct = 0; | |
360 | sei->m_sourceScanType = 0; | |
361 | sei->m_duplicateFlag = false; | |
362 | } | |
363 | ||
364 | if (vui->hrdParametersPresentFlag) | |
365 | { | |
366 | // The m_aucpbremoval delay specifies how many clock ticks the | |
367 | // access unit associated with the picture timing SEI message has to | |
368 | // wait after removal of the access unit with the most recent | |
369 | // buffering period SEI message | |
370 | sei->m_auCpbRemovalDelay = X265_MIN(X265_MAX(1, m_rce.encodeOrder - m_top->m_lastBPSEI), (1 << hrd->cpbRemovalDelayLength)); | |
371 | sei->m_picDpbOutputDelay = slice->m_sps->numReorderPics + poc - m_rce.encodeOrder; | |
372 | } | |
373 | ||
374 | m_bs.resetBits(); | |
375 | sei->write(m_bs, *slice->m_sps); | |
376 | m_bs.writeByteAlignment(); | |
377 | m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs); | |
378 | } | |
379 | ||
380 | // Analyze CTU rows, most of the hard work is done here | |
381 | // frame is compressed in a wave-front pattern if WPP is enabled. Loop filter runs as a | |
382 | // wave-front behind the CU compression and reconstruction | |
383 | compressCTURows(); | |
384 | ||
385 | if (m_param->rc.bStatWrite) | |
386 | { | |
387 | int totalI = 0, totalP = 0, totalSkip = 0; | |
388 | ||
389 | // accumulate intra,inter,skip cu count per frame for 2 pass | |
390 | for (int i = 0; i < m_numRows; i++) | |
391 | { | |
392 | m_frameStats.mvBits += m_rows[i].rowStats.mvBits; | |
393 | m_frameStats.coeffBits += m_rows[i].rowStats.coeffBits; | |
394 | m_frameStats.miscBits += m_rows[i].rowStats.miscBits; | |
395 | totalI += m_rows[i].rowStats.iCuCnt; | |
396 | totalP += m_rows[i].rowStats.pCuCnt; | |
397 | totalSkip += m_rows[i].rowStats.skipCuCnt; | |
398 | } | |
399 | int totalCuCount = totalI + totalP + totalSkip; | |
400 | m_frameStats.percentIntra = (double)totalI / totalCuCount; | |
401 | m_frameStats.percentInter = (double)totalP / totalCuCount; | |
402 | m_frameStats.percentSkip = (double)totalSkip / totalCuCount; | |
403 | } | |
404 | ||
405 | m_bs.resetBits(); | |
406 | m_entropyCoder.load(m_initSliceContext); | |
407 | m_entropyCoder.setBitstream(&m_bs); | |
408 | m_entropyCoder.codeSliceHeader(*slice, *m_frame->m_encData); | |
409 | ||
410 | // finish encode of each CTU row, only required when SAO is enabled | |
411 | if (m_param->bEnableSAO) | |
412 | encodeSlice(); | |
413 | ||
414 | // serialize each row, record final lengths in slice header | |
415 | uint32_t maxStreamSize = m_nalList.serializeSubstreams(m_substreamSizes, numSubstreams, m_outStreams); | |
416 | ||
417 | // complete the slice header by writing WPP row-starts | |
418 | m_entropyCoder.setBitstream(&m_bs); | |
419 | if (slice->m_pps->bEntropyCodingSyncEnabled) | |
420 | m_entropyCoder.codeSliceHeaderWPPEntryPoints(*slice, m_substreamSizes, maxStreamSize); | |
421 | m_bs.writeByteAlignment(); | |
422 | ||
423 | m_nalList.serialize(slice->m_nalUnitType, m_bs); | |
424 | ||
425 | if (m_param->decodedPictureHashSEI) | |
426 | { | |
427 | if (m_param->decodedPictureHashSEI == 1) | |
428 | { | |
429 | m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::MD5; | |
430 | for (int i = 0; i < 3; i++) | |
431 | MD5Final(&m_state[i], m_seiReconPictureDigest.m_digest[i]); | |
432 | } | |
433 | else if (m_param->decodedPictureHashSEI == 2) | |
434 | { | |
435 | m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CRC; | |
436 | for (int i = 0; i < 3; i++) | |
437 | crcFinish(m_crc[i], m_seiReconPictureDigest.m_digest[i]); | |
438 | } | |
439 | else if (m_param->decodedPictureHashSEI == 3) | |
440 | { | |
441 | m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CHECKSUM; | |
442 | for (int i = 0; i < 3; i++) | |
443 | checksumFinish(m_checksum[i], m_seiReconPictureDigest.m_digest[i]); | |
444 | } | |
445 | ||
446 | m_bs.resetBits(); | |
447 | m_seiReconPictureDigest.write(m_bs, *slice->m_sps); | |
448 | m_bs.writeByteAlignment(); | |
449 | ||
450 | m_nalList.serialize(NAL_UNIT_SUFFIX_SEI, m_bs); | |
451 | } | |
452 | ||
453 | uint64_t bytes = 0; | |
454 | for (uint32_t i = 0; i < m_nalList.m_numNal; i++) | |
455 | { | |
456 | int type = m_nalList.m_nal[i].type; | |
457 | ||
458 | // exclude SEI | |
459 | if (type != NAL_UNIT_PREFIX_SEI && type != NAL_UNIT_SUFFIX_SEI) | |
460 | { | |
461 | bytes += m_nalList.m_nal[i].sizeBytes; | |
462 | // and exclude start code prefix | |
463 | bytes -= (!i || type == NAL_UNIT_SPS || type == NAL_UNIT_PPS) ? 4 : 3; | |
464 | } | |
465 | } | |
466 | m_accessUnitBits = bytes << 3; | |
467 | ||
468 | m_elapsedCompressTime = (double)(x265_mdate() - startCompressTime) / 1000000; | |
469 | /* rateControlEnd may also block for earlier frames to call rateControlUpdateStats */ | |
470 | if (m_top->m_rateControl->rateControlEnd(m_frame, m_accessUnitBits, &m_rce, &m_frameStats) < 0) | |
471 | m_top->m_aborted = true; | |
472 | ||
473 | /* Accumulate NR statistics from all worker threads */ | |
474 | if (m_nr) | |
475 | { | |
476 | for (int i = 0; i < m_top->m_numThreadLocalData; i++) | |
477 | { | |
478 | NoiseReduction* nr = &m_top->m_threadLocalData[i].analysis.m_quant.m_frameNr[m_frameEncoderID]; | |
479 | for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++) | |
480 | { | |
481 | for(int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++) | |
482 | m_nr->residualSum[cat][coeff] += nr->residualSum[cat][coeff]; | |
483 | ||
484 | m_nr->count[cat] += nr->count[cat]; | |
485 | } | |
486 | } | |
487 | } | |
488 | ||
489 | noiseReductionUpdate(); | |
490 | ||
491 | /* Copy updated NR coefficients back to all worker threads */ | |
492 | if (m_nr) | |
493 | { | |
494 | for (int i = 0; i < m_top->m_numThreadLocalData; i++) | |
495 | { | |
496 | NoiseReduction* nr = &m_top->m_threadLocalData[i].analysis.m_quant.m_frameNr[m_frameEncoderID]; | |
b53f7c52 | 497 | memcpy(nr->offsetDenoise, m_nr->offsetDenoise, sizeof(uint16_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS); |
72b9787e JB |
498 | memset(nr->count, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES); |
499 | memset(nr->residualSum, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS); | |
500 | } | |
501 | } | |
502 | ||
503 | // Decrement referenced frame reference counts, allow them to be recycled | |
504 | for (int l = 0; l < numPredDir; l++) | |
505 | { | |
506 | for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++) | |
507 | { | |
508 | Frame *refpic = slice->m_refPicList[l][ref]; | |
509 | ATOMIC_DEC(&refpic->m_countRefEncoders); | |
510 | } | |
511 | } | |
512 | } | |
513 | ||
514 | void FrameEncoder::encodeSlice() | |
515 | { | |
516 | Slice* slice = m_frame->m_encData->m_slice; | |
517 | const uint32_t widthInLCUs = slice->m_sps->numCuInWidth; | |
518 | const uint32_t lastCUAddr = (slice->m_endCUAddr + NUM_CU_PARTITIONS - 1) / NUM_CU_PARTITIONS; | |
519 | const uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : 1; | |
520 | ||
521 | SAOParam* saoParam = slice->m_sps->bUseSAO ? m_frame->m_encData->m_saoParam : NULL; | |
522 | for (uint32_t cuAddr = 0; cuAddr < lastCUAddr; cuAddr++) | |
523 | { | |
524 | uint32_t col = cuAddr % widthInLCUs; | |
525 | uint32_t lin = cuAddr / widthInLCUs; | |
526 | uint32_t subStrm = lin % numSubstreams; | |
527 | CUData* ctu = m_frame->m_encData->getPicCTU(cuAddr); | |
528 | ||
529 | m_entropyCoder.setBitstream(&m_outStreams[subStrm]); | |
530 | ||
531 | // Synchronize cabac probabilities with upper-right CTU if it's available and we're at the start of a line. | |
532 | if (m_param->bEnableWavefront && !col && lin) | |
533 | { | |
534 | m_entropyCoder.copyState(m_initSliceContext); | |
535 | m_entropyCoder.loadContexts(m_rows[lin - 1].bufferedEntropy); | |
536 | } | |
537 | ||
538 | if (saoParam) | |
539 | { | |
540 | if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1]) | |
541 | { | |
542 | int mergeLeft = col && saoParam->ctuParam[0][cuAddr].mergeMode == SAO_MERGE_LEFT; | |
543 | int mergeUp = lin && saoParam->ctuParam[0][cuAddr].mergeMode == SAO_MERGE_UP; | |
544 | if (col) | |
545 | m_entropyCoder.codeSaoMerge(mergeLeft); | |
546 | if (lin && !mergeLeft) | |
547 | m_entropyCoder.codeSaoMerge(mergeUp); | |
548 | if (!mergeLeft && !mergeUp) | |
549 | { | |
550 | if (saoParam->bSaoFlag[0]) | |
551 | m_entropyCoder.codeSaoOffset(saoParam->ctuParam[0][cuAddr], 0); | |
552 | if (saoParam->bSaoFlag[1]) | |
553 | { | |
554 | m_entropyCoder.codeSaoOffset(saoParam->ctuParam[1][cuAddr], 1); | |
555 | m_entropyCoder.codeSaoOffset(saoParam->ctuParam[2][cuAddr], 2); | |
556 | } | |
557 | } | |
558 | } | |
559 | else | |
560 | { | |
561 | for (int i = 0; i < 3; i++) | |
562 | saoParam->ctuParam[i][cuAddr].reset(); | |
563 | } | |
564 | } | |
565 | ||
566 | // final coding (bitstream generation) for this CU | |
567 | m_entropyCoder.encodeCTU(*ctu, m_cuGeoms[m_ctuGeomMap[cuAddr]]); | |
568 | ||
569 | if (m_param->bEnableWavefront) | |
570 | { | |
571 | if (col == 1) | |
572 | // Store probabilities of second CTU in line into buffer | |
573 | m_rows[lin].bufferedEntropy.loadContexts(m_entropyCoder); | |
574 | ||
575 | if (col == widthInLCUs - 1) | |
576 | m_entropyCoder.finishSlice(); | |
577 | } | |
578 | } | |
579 | if (!m_param->bEnableWavefront) | |
580 | m_entropyCoder.finishSlice(); | |
581 | } | |
582 | ||
583 | void FrameEncoder::compressCTURows() | |
584 | { | |
72b9787e JB |
585 | Slice* slice = m_frame->m_encData->m_slice; |
586 | ||
587 | m_bAllRowsStop = false; | |
588 | m_vbvResetTriggerRow = -1; | |
589 | ||
590 | m_SSDY = m_SSDU = m_SSDV = 0; | |
591 | m_ssim = 0; | |
592 | m_ssimCnt = 0; | |
593 | memset(&m_frameStats, 0, sizeof(m_frameStats)); | |
594 | ||
595 | bool bUseWeightP = slice->m_pps->bUseWeightPred && slice->m_sliceType == P_SLICE; | |
596 | bool bUseWeightB = slice->m_pps->bUseWeightedBiPred && slice->m_sliceType == B_SLICE; | |
597 | int numPredDir = slice->isInterP() ? 1 : slice->isInterB() ? 2 : 0; | |
598 | ||
599 | m_rows[0].active = true; | |
600 | if (m_pool && m_param->bEnableWavefront) | |
601 | { | |
602 | WaveFront::clearEnabledRowMask(); | |
603 | WaveFront::enqueue(); | |
604 | ||
605 | for (int row = 0; row < m_numRows; row++) | |
606 | { | |
607 | // block until all reference frames have reconstructed the rows we need | |
608 | for (int l = 0; l < numPredDir; l++) | |
609 | { | |
610 | for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++) | |
611 | { | |
612 | Frame *refpic = slice->m_refPicList[l][ref]; | |
613 | ||
614 | int reconRowCount = refpic->m_reconRowCount.get(); | |
615 | while ((reconRowCount != m_numRows) && (reconRowCount < row + m_refLagRows)) | |
616 | reconRowCount = refpic->m_reconRowCount.waitForChange(reconRowCount); | |
617 | ||
618 | if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted) | |
619 | m_mref[l][ref].applyWeight(row + m_refLagRows, m_numRows); | |
620 | } | |
621 | } | |
622 | ||
623 | enableRowEncoder(row); | |
624 | if (row == 0) | |
625 | enqueueRowEncoder(0); | |
626 | else | |
627 | m_pool->pokeIdleThread(); | |
628 | } | |
629 | ||
630 | m_completionEvent.wait(); | |
631 | ||
632 | WaveFront::dequeue(); | |
633 | } | |
634 | else | |
635 | { | |
636 | for (int i = 0; i < this->m_numRows + m_filterRowDelay; i++) | |
637 | { | |
638 | // Encode | |
639 | if (i < m_numRows) | |
640 | { | |
641 | // block until all reference frames have reconstructed the rows we need | |
642 | for (int l = 0; l < numPredDir; l++) | |
643 | { | |
644 | int list = l; | |
645 | for (int ref = 0; ref < slice->m_numRefIdx[list]; ref++) | |
646 | { | |
647 | Frame *refpic = slice->m_refPicList[list][ref]; | |
648 | ||
649 | int reconRowCount = refpic->m_reconRowCount.get(); | |
650 | while ((reconRowCount != m_numRows) && (reconRowCount < i + m_refLagRows)) | |
651 | reconRowCount = refpic->m_reconRowCount.waitForChange(reconRowCount); | |
652 | ||
653 | if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted) | |
654 | m_mref[list][ref].applyWeight(i + m_refLagRows, m_numRows); | |
655 | } | |
656 | } | |
657 | ||
b53f7c52 | 658 | processRowEncoder(i, *m_tld); |
72b9787e JB |
659 | } |
660 | ||
661 | // Filter | |
662 | if (i >= m_filterRowDelay) | |
b53f7c52 | 663 | m_frameFilter.processRow(i - m_filterRowDelay); |
72b9787e JB |
664 | } |
665 | } | |
666 | m_frameTime = (double)m_totalTime / 1000000; | |
667 | m_totalTime = 0; | |
668 | } | |
669 | ||
670 | void FrameEncoder::processRow(int row, int threadId) | |
671 | { | |
672 | const int realRow = row >> 1; | |
673 | const int typeNum = row & 1; | |
674 | ||
675 | ThreadLocalData& tld = threadId >= 0 ? m_top->m_threadLocalData[threadId] : *m_tld; | |
676 | ||
677 | if (!typeNum) | |
678 | processRowEncoder(realRow, tld); | |
679 | else | |
680 | { | |
b53f7c52 | 681 | m_frameFilter.processRow(realRow); |
72b9787e JB |
682 | |
683 | // NOTE: Active next row | |
684 | if (realRow != m_numRows - 1) | |
685 | enqueueRowFilter(realRow + 1); | |
686 | else | |
687 | m_completionEvent.trigger(); | |
688 | } | |
689 | } | |
690 | ||
691 | // Called by worker threads | |
692 | void FrameEncoder::processRowEncoder(int row, ThreadLocalData& tld) | |
693 | { | |
72b9787e JB |
694 | CTURow& curRow = m_rows[row]; |
695 | ||
696 | { | |
697 | ScopedLock self(curRow.lock); | |
698 | if (!curRow.active) | |
699 | /* VBV restart is in progress, exit out */ | |
700 | return; | |
701 | if (curRow.busy) | |
702 | { | |
703 | /* On multi-socket Windows servers, we have seen problems with | |
704 | * ATOMIC_CAS which resulted in multiple worker threads processing | |
705 | * the same CU row, which often resulted in bad pointer accesses. We | |
706 | * believe the problem is fixed, but are leaving this check in place | |
707 | * to prevent crashes in case it is not */ | |
708 | x265_log(m_param, X265_LOG_WARNING, | |
709 | "internal error - simultaneous row access detected. Please report HW to x265-devel@videolan.org\n"); | |
710 | return; | |
711 | } | |
712 | curRow.busy = true; | |
713 | } | |
714 | ||
715 | /* When WPP is enabled, every row has its own row coder instance. Otherwise | |
716 | * they share row 0 */ | |
717 | Entropy& rowCoder = m_param->bEnableWavefront ? m_rows[row].rowGoOnCoder : m_rows[0].rowGoOnCoder; | |
718 | FrameData& curEncData = *m_frame->m_encData; | |
719 | Slice *slice = curEncData.m_slice; | |
72b9787e JB |
720 | |
721 | int64_t startTime = x265_mdate(); | |
722 | const uint32_t numCols = m_numCols; | |
723 | const uint32_t lineStartCUAddr = row * numCols; | |
724 | bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0; | |
725 | ||
726 | while (curRow.completed < numCols) | |
727 | { | |
b53f7c52 JB |
728 | ProfileScopeEvent(encodeCTU); |
729 | ||
72b9787e JB |
730 | int col = curRow.completed; |
731 | const uint32_t cuAddr = lineStartCUAddr + col; | |
732 | CUData* ctu = curEncData.getPicCTU(cuAddr); | |
733 | ctu->initCTU(*m_frame, cuAddr, slice->m_sliceQp); | |
734 | ||
735 | if (bIsVbv) | |
736 | { | |
737 | if (!row) | |
738 | { | |
739 | curEncData.m_rowStat[row].diagQp = curEncData.m_avgQpRc; | |
740 | curEncData.m_rowStat[row].diagQpScale = x265_qp2qScale(curEncData.m_avgQpRc); | |
741 | } | |
742 | ||
743 | if (row >= col && row && m_vbvResetTriggerRow != row) | |
744 | curEncData.m_cuStat[cuAddr].baseQp = curEncData.m_cuStat[cuAddr - numCols + 1].baseQp; | |
745 | else | |
746 | curEncData.m_cuStat[cuAddr].baseQp = curEncData.m_rowStat[row].diagQp; | |
747 | } | |
748 | else | |
749 | curEncData.m_cuStat[cuAddr].baseQp = curEncData.m_avgQpRc; | |
750 | ||
751 | if (m_param->rc.aqMode || bIsVbv) | |
752 | { | |
753 | int qp = calcQpForCu(cuAddr, curEncData.m_cuStat[cuAddr].baseQp); | |
754 | tld.analysis.setQP(*slice, qp); | |
755 | qp = Clip3(QP_MIN, QP_MAX_SPEC, qp); | |
b53f7c52 | 756 | ctu->setQPSubParts((int8_t)qp, 0, 0); |
72b9787e JB |
757 | curEncData.m_rowStat[row].sumQpAq += qp; |
758 | } | |
759 | else | |
760 | tld.analysis.setQP(*slice, slice->m_sliceQp); | |
761 | ||
762 | if (m_param->bEnableWavefront && !col && row) | |
763 | { | |
764 | // Load SBAC coder context from previous row and initialize row state. | |
765 | rowCoder.copyState(m_initSliceContext); | |
766 | rowCoder.loadContexts(m_rows[row - 1].bufferedEntropy); | |
767 | } | |
768 | ||
769 | // Does all the CU analysis, returns best top level mode decision | |
b53f7c52 | 770 | Mode& best = tld.analysis.compressCTU(*ctu, *m_frame, m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder); |
72b9787e JB |
771 | |
772 | /* advance top-level row coder to include the context of this CTU. | |
773 | * if SAO is disabled, rowCoder writes the final CTU bitstream */ | |
774 | rowCoder.encodeCTU(*ctu, m_cuGeoms[m_ctuGeomMap[cuAddr]]); | |
775 | ||
776 | if (m_param->bEnableWavefront && col == 1) | |
777 | // Save CABAC state for next row | |
778 | curRow.bufferedEntropy.loadContexts(rowCoder); | |
779 | ||
780 | // Completed CU processing | |
781 | curRow.completed++; | |
782 | ||
783 | if (m_param->bLogCuStats || m_param->rc.bStatWrite) | |
784 | collectCTUStatistics(*ctu); | |
785 | ||
786 | // copy no. of intra, inter Cu cnt per row into frame stats for 2 pass | |
787 | if (m_param->rc.bStatWrite) | |
788 | { | |
789 | curRow.rowStats.mvBits += best.mvBits; | |
790 | curRow.rowStats.coeffBits += best.coeffBits; | |
791 | curRow.rowStats.miscBits += best.totalBits - (best.mvBits + best.coeffBits); | |
792 | StatisticLog* log = &m_sliceTypeLog[slice->m_sliceType]; | |
793 | ||
794 | for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++) | |
795 | { | |
796 | /* 1 << shift == number of 8x8 blocks at current depth */ | |
797 | int shift = 2 * (g_maxCUDepth - depth); | |
798 | curRow.rowStats.iCuCnt += log->qTreeIntraCnt[depth] << shift; | |
799 | curRow.rowStats.pCuCnt += log->qTreeInterCnt[depth] << shift; | |
800 | curRow.rowStats.skipCuCnt += log->qTreeSkipCnt[depth] << shift; | |
801 | ||
802 | // clear the row cu data from thread local object | |
803 | log->qTreeIntraCnt[depth] = log->qTreeInterCnt[depth] = log->qTreeSkipCnt[depth] = 0; | |
804 | } | |
805 | } | |
806 | ||
807 | curEncData.m_cuStat[cuAddr].totalBits = best.totalBits; | |
808 | x265_emms(); | |
809 | ||
810 | if (bIsVbv) | |
811 | { | |
812 | // Update encoded bits, satdCost, baseQP for each CU | |
813 | curEncData.m_rowStat[row].diagSatd += curEncData.m_cuStat[cuAddr].vbvCost; | |
814 | curEncData.m_rowStat[row].diagIntraSatd += curEncData.m_cuStat[cuAddr].intraVbvCost; | |
815 | curEncData.m_rowStat[row].encodedBits += curEncData.m_cuStat[cuAddr].totalBits; | |
816 | curEncData.m_rowStat[row].sumQpRc += curEncData.m_cuStat[cuAddr].baseQp; | |
817 | curEncData.m_rowStat[row].numEncodedCUs = cuAddr; | |
818 | ||
819 | // If current block is at row diagonal checkpoint, call vbv ratecontrol. | |
820 | ||
821 | if (row == col && row) | |
822 | { | |
823 | double qpBase = curEncData.m_cuStat[cuAddr].baseQp; | |
824 | int reEncode = m_top->m_rateControl->rowDiagonalVbvRateControl(m_frame, row, &m_rce, qpBase); | |
825 | qpBase = Clip3((double)QP_MIN, (double)QP_MAX_MAX, qpBase); | |
826 | curEncData.m_rowStat[row].diagQp = qpBase; | |
827 | curEncData.m_rowStat[row].diagQpScale = x265_qp2qScale(qpBase); | |
828 | ||
829 | if (reEncode < 0) | |
830 | { | |
831 | x265_log(m_param, X265_LOG_DEBUG, "POC %d row %d - encode restart required for VBV, to %.2f from %.2f\n", | |
832 | m_frame->m_poc, row, qpBase, curEncData.m_cuStat[cuAddr].baseQp); | |
833 | ||
834 | // prevent the WaveFront::findJob() method from providing new jobs | |
835 | m_vbvResetTriggerRow = row; | |
836 | m_bAllRowsStop = true; | |
837 | ||
838 | for (int r = m_numRows - 1; r >= row; r--) | |
839 | { | |
840 | CTURow& stopRow = m_rows[r]; | |
841 | ||
842 | if (r != row) | |
843 | { | |
844 | /* if row was active (ready to be run) clear active bit and bitmap bit for this row */ | |
845 | stopRow.lock.acquire(); | |
846 | while (stopRow.active) | |
847 | { | |
848 | if (dequeueRow(r * 2)) | |
849 | stopRow.active = false; | |
850 | else | |
b53f7c52 JB |
851 | { |
852 | /* we must release the row lock to allow the thread to exit */ | |
853 | stopRow.lock.release(); | |
72b9787e | 854 | GIVE_UP_TIME(); |
b53f7c52 JB |
855 | stopRow.lock.acquire(); |
856 | } | |
72b9787e | 857 | } |
72b9787e JB |
858 | stopRow.lock.release(); |
859 | ||
860 | bool bRowBusy = true; | |
861 | do | |
862 | { | |
863 | stopRow.lock.acquire(); | |
864 | bRowBusy = stopRow.busy; | |
865 | stopRow.lock.release(); | |
866 | ||
867 | if (bRowBusy) | |
868 | { | |
869 | GIVE_UP_TIME(); | |
870 | } | |
871 | } | |
872 | while (bRowBusy); | |
873 | } | |
874 | ||
875 | m_outStreams[r].resetBits(); | |
876 | stopRow.completed = 0; | |
877 | memset(&stopRow.rowStats, 0, sizeof(stopRow.rowStats)); | |
878 | curEncData.m_rowStat[r].numEncodedCUs = 0; | |
879 | curEncData.m_rowStat[r].encodedBits = 0; | |
880 | curEncData.m_rowStat[r].diagSatd = 0; | |
881 | curEncData.m_rowStat[r].diagIntraSatd = 0; | |
882 | curEncData.m_rowStat[r].sumQpRc = 0; | |
883 | curEncData.m_rowStat[r].sumQpAq = 0; | |
884 | } | |
885 | ||
886 | m_bAllRowsStop = false; | |
887 | } | |
888 | } | |
889 | } | |
890 | ||
891 | // NOTE: do CU level Filter | |
892 | if (m_param->bEnableSAO && m_param->bSaoNonDeblocked) | |
893 | // SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas | |
894 | m_frameFilter.m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row); | |
895 | ||
896 | // NOTE: active next row | |
897 | if (curRow.completed >= 2 && row < m_numRows - 1) | |
898 | { | |
899 | ScopedLock below(m_rows[row + 1].lock); | |
900 | if (m_rows[row + 1].active == false && | |
901 | m_rows[row + 1].completed + 2 <= curRow.completed && | |
902 | (!m_bAllRowsStop || row + 1 < m_vbvResetTriggerRow)) | |
903 | { | |
904 | m_rows[row + 1].active = true; | |
905 | enqueueRowEncoder(row + 1); | |
906 | } | |
907 | } | |
908 | ||
909 | ScopedLock self(curRow.lock); | |
910 | if ((m_bAllRowsStop && row > m_vbvResetTriggerRow) || | |
911 | (row > 0 && curRow.completed < numCols - 1 && m_rows[row - 1].completed < m_rows[row].completed + 2)) | |
912 | { | |
913 | curRow.active = false; | |
914 | curRow.busy = false; | |
915 | m_totalTime += x265_mdate() - startTime; | |
916 | return; | |
917 | } | |
918 | } | |
919 | ||
920 | /* *this row of CTUs has been encoded* */ | |
921 | ||
922 | /* flush row bitstream (if WPP and no SAO) or flush frame if no WPP and no SAO */ | |
923 | if (!m_param->bEnableSAO && (m_param->bEnableWavefront || row == m_numRows - 1)) | |
924 | rowCoder.finishSlice(); | |
925 | ||
926 | /* If encoding with ABR, update bits and complexity in rate control | |
927 | * after a number of rows so the next frame's rateControlStart has more | |
928 | * accurate data for estimation. At the start of the encode we update stats | |
929 | * after half the frame is encoded, but after this initial period we update | |
930 | * after refLagRows (the number of rows reference frames must have completed | |
931 | * before referencing frames may begin encoding) */ | |
932 | int rowCount = 0; | |
933 | if (m_param->rc.rateControlMode == X265_RC_ABR) | |
934 | { | |
935 | if ((uint32_t)m_rce.encodeOrder <= 2 * (m_param->fpsNum / m_param->fpsDenom)) | |
936 | rowCount = X265_MIN((m_numRows + 1) / 2, m_numRows - 1); | |
937 | else | |
938 | rowCount = X265_MIN(m_refLagRows, m_numRows - 1); | |
939 | } | |
940 | if (row == rowCount) | |
941 | { | |
942 | m_rce.rowTotalBits = 0; | |
943 | if (bIsVbv) | |
944 | for (int i = 0; i < rowCount; i++) | |
945 | m_rce.rowTotalBits += curEncData.m_rowStat[i].encodedBits; | |
946 | else | |
947 | for (uint32_t cuAddr = 0; cuAddr < rowCount * numCols; cuAddr++) | |
948 | m_rce.rowTotalBits += curEncData.m_cuStat[cuAddr].totalBits; | |
949 | ||
950 | m_top->m_rateControl->rateControlUpdateStats(&m_rce); | |
951 | } | |
952 | ||
b53f7c52 | 953 | if (m_param->bEnableWavefront) |
72b9787e | 954 | { |
b53f7c52 JB |
955 | /* trigger row-wise loop filters */ |
956 | if (row >= m_filterRowDelay) | |
957 | { | |
958 | enableRowFilter(row - m_filterRowDelay); | |
72b9787e | 959 | |
b53f7c52 JB |
960 | /* NOTE: Activate filter if first row (row 0) */ |
961 | if (row == m_filterRowDelay) | |
962 | enqueueRowFilter(0); | |
963 | } | |
964 | if (row == m_numRows - 1) | |
965 | { | |
966 | for (int i = m_numRows - m_filterRowDelay; i < m_numRows; i++) | |
967 | enableRowFilter(i); | |
968 | } | |
72b9787e JB |
969 | } |
970 | ||
971 | m_totalTime += x265_mdate() - startTime; | |
972 | curRow.busy = false; | |
973 | } | |
974 | ||
975 | void FrameEncoder::collectCTUStatistics(CUData& ctu) | |
976 | { | |
977 | StatisticLog* log = &m_sliceTypeLog[ctu.m_slice->m_sliceType]; | |
978 | ||
979 | if (ctu.m_slice->m_sliceType == I_SLICE) | |
980 | { | |
981 | uint32_t depth = 0; | |
982 | for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2)) | |
983 | { | |
984 | depth = ctu.m_cuDepth[absPartIdx]; | |
985 | ||
986 | log->totalCu++; | |
987 | log->cntIntra[depth]++; | |
988 | log->qTreeIntraCnt[depth]++; | |
989 | ||
b53f7c52 | 990 | if (ctu.m_predMode[absPartIdx] == MODE_NONE) |
72b9787e JB |
991 | { |
992 | log->totalCu--; | |
993 | log->cntIntra[depth]--; | |
994 | log->qTreeIntraCnt[depth]--; | |
995 | } | |
b53f7c52 | 996 | else if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N) |
72b9787e JB |
997 | { |
998 | /* TODO: log intra modes at absPartIdx +0 to +3 */ | |
999 | X265_CHECK(depth == g_maxCUDepth, "Intra NxN found at improbable depth\n"); | |
1000 | log->cntIntraNxN++; | |
1001 | log->cntIntra[depth]--; | |
1002 | } | |
1003 | else if (ctu.m_lumaIntraDir[absPartIdx] > 1) | |
1004 | log->cuIntraDistribution[depth][ANGULAR_MODE_ID]++; | |
1005 | else | |
1006 | log->cuIntraDistribution[depth][ctu.m_lumaIntraDir[absPartIdx]]++; | |
1007 | } | |
1008 | } | |
1009 | else | |
1010 | { | |
1011 | uint32_t depth = 0; | |
1012 | for (uint32_t absPartIdx = 0; absPartIdx < ctu.m_numPartitions; absPartIdx += ctu.m_numPartitions >> (depth * 2)) | |
1013 | { | |
1014 | depth = ctu.m_cuDepth[absPartIdx]; | |
1015 | ||
1016 | log->totalCu++; | |
1017 | log->cntTotalCu[depth]++; | |
1018 | ||
b53f7c52 | 1019 | if (ctu.m_predMode[absPartIdx] == MODE_NONE) |
72b9787e JB |
1020 | { |
1021 | log->totalCu--; | |
1022 | log->cntTotalCu[depth]--; | |
1023 | } | |
1024 | else if (ctu.isSkipped(absPartIdx)) | |
1025 | { | |
1026 | log->totalCu--; | |
1027 | log->cntSkipCu[depth]++; | |
1028 | log->qTreeSkipCnt[depth]++; | |
1029 | } | |
b53f7c52 | 1030 | else if (ctu.isInter(absPartIdx)) |
72b9787e JB |
1031 | { |
1032 | log->cntInter[depth]++; | |
1033 | log->qTreeInterCnt[depth]++; | |
1034 | ||
1035 | if (ctu.m_partSize[absPartIdx] < AMP_ID) | |
1036 | log->cuInterDistribution[depth][ctu.m_partSize[absPartIdx]]++; | |
1037 | else | |
1038 | log->cuInterDistribution[depth][AMP_ID]++; | |
1039 | } | |
b53f7c52 | 1040 | else if (ctu.isIntra(absPartIdx)) |
72b9787e JB |
1041 | { |
1042 | log->cntIntra[depth]++; | |
1043 | log->qTreeIntraCnt[depth]++; | |
1044 | ||
b53f7c52 | 1045 | if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N) |
72b9787e JB |
1046 | { |
1047 | X265_CHECK(depth == g_maxCUDepth, "Intra NxN found at improbable depth\n"); | |
1048 | log->cntIntraNxN++; | |
1049 | /* TODO: log intra modes at absPartIdx +0 to +3 */ | |
1050 | } | |
1051 | else if (ctu.m_lumaIntraDir[absPartIdx] > 1) | |
1052 | log->cuIntraDistribution[depth][ANGULAR_MODE_ID]++; | |
1053 | else | |
1054 | log->cuIntraDistribution[depth][ctu.m_lumaIntraDir[absPartIdx]]++; | |
1055 | } | |
1056 | } | |
1057 | } | |
1058 | } | |
1059 | ||
1060 | /* DCT-domain noise reduction / adaptive deadzone from libavcodec */ | |
1061 | void FrameEncoder::noiseReductionUpdate() | |
1062 | { | |
1063 | if (!m_nr) | |
1064 | return; | |
1065 | ||
1066 | static const uint32_t maxBlocksPerTrSize[4] = {1 << 18, 1 << 16, 1 << 14, 1 << 12}; | |
1067 | ||
1068 | for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++) | |
1069 | { | |
1070 | int trSize = cat & 3; | |
1071 | int coefCount = 1 << ((trSize + 2) * 2); | |
1072 | ||
1073 | if (m_nr->count[cat] > maxBlocksPerTrSize[trSize]) | |
1074 | { | |
1075 | for (int i = 0; i < coefCount; i++) | |
1076 | m_nr->residualSum[cat][i] >>= 1; | |
1077 | m_nr->count[cat] >>= 1; | |
1078 | } | |
1079 | ||
b53f7c52 JB |
1080 | int nrStrength = cat < 8 ? m_param->noiseReductionIntra : m_param->noiseReductionInter; |
1081 | uint64_t scaledCount = (uint64_t)nrStrength * m_nr->count[cat]; | |
72b9787e JB |
1082 | |
1083 | for (int i = 0; i < coefCount; i++) | |
1084 | { | |
1085 | uint64_t value = scaledCount + m_nr->residualSum[cat][i] / 2; | |
1086 | uint64_t denom = m_nr->residualSum[cat][i] + 1; | |
1087 | m_nr->offsetDenoise[cat][i] = (uint16_t)(value / denom); | |
1088 | } | |
1089 | ||
1090 | // Don't denoise DC coefficients | |
1091 | m_nr->offsetDenoise[cat][0] = 0; | |
1092 | } | |
1093 | } | |
1094 | ||
1095 | int FrameEncoder::calcQpForCu(uint32_t ctuAddr, double baseQp) | |
1096 | { | |
1097 | x265_emms(); | |
1098 | double qp = baseQp; | |
1099 | ||
1100 | FrameData& curEncData = *m_frame->m_encData; | |
1101 | /* clear cuCostsForVbv from when vbv row reset was triggered */ | |
1102 | bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0; | |
1103 | if (bIsVbv) | |
1104 | { | |
1105 | curEncData.m_cuStat[ctuAddr].vbvCost = 0; | |
1106 | curEncData.m_cuStat[ctuAddr].intraVbvCost = 0; | |
1107 | } | |
1108 | ||
1109 | /* Derive qpOffet for each CU by averaging offsets for all 16x16 blocks in the cu. */ | |
1110 | double qp_offset = 0; | |
b53f7c52 JB |
1111 | uint32_t maxBlockCols = (m_frame->m_fencPic->m_picWidth + (16 - 1)) / 16; |
1112 | uint32_t maxBlockRows = (m_frame->m_fencPic->m_picHeight + (16 - 1)) / 16; | |
72b9787e JB |
1113 | uint32_t noOfBlocks = g_maxCUSize / 16; |
1114 | uint32_t block_y = (ctuAddr / curEncData.m_slice->m_sps->numCuInWidth) * noOfBlocks; | |
1115 | uint32_t block_x = (ctuAddr * noOfBlocks) - block_y * curEncData.m_slice->m_sps->numCuInWidth; | |
1116 | ||
1117 | /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */ | |
1118 | bool isReferenced = IS_REFERENCED(m_frame); | |
1119 | double *qpoffs = (isReferenced && m_param->rc.cuTree) ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset; | |
1120 | ||
1121 | uint32_t cnt = 0, idx = 0; | |
1122 | for (uint32_t h = 0; h < noOfBlocks && block_y < maxBlockRows; h++, block_y++) | |
1123 | { | |
1124 | for (uint32_t w = 0; w < noOfBlocks && (block_x + w) < maxBlockCols; w++) | |
1125 | { | |
1126 | idx = block_x + w + (block_y * maxBlockCols); | |
1127 | if (m_param->rc.aqMode) | |
1128 | qp_offset += qpoffs[idx]; | |
1129 | if (bIsVbv) | |
1130 | { | |
1131 | curEncData.m_cuStat[ctuAddr].vbvCost += m_frame->m_lowres.lowresCostForRc[idx] & LOWRES_COST_MASK; | |
1132 | curEncData.m_cuStat[ctuAddr].intraVbvCost += m_frame->m_lowres.intraCost[idx]; | |
1133 | } | |
1134 | cnt++; | |
1135 | } | |
1136 | } | |
1137 | ||
1138 | qp_offset /= cnt; | |
1139 | qp += qp_offset; | |
1140 | ||
1141 | return Clip3(QP_MIN, QP_MAX_MAX, (int)(qp + 0.5)); | |
1142 | } | |
1143 | ||
1144 | Frame *FrameEncoder::getEncodedPicture(NALList& output) | |
1145 | { | |
1146 | if (m_frame) | |
1147 | { | |
1148 | /* block here until worker thread completes */ | |
1149 | m_done.wait(); | |
1150 | ||
1151 | Frame *ret = m_frame; | |
1152 | m_frame = NULL; | |
1153 | output.takeContents(m_nalList); | |
1154 | return ret; | |
1155 | } | |
1156 | ||
1157 | return NULL; | |
1158 | } | |
1159 | } |