Commit | Line | Data |
---|---|---|
72b9787e JB |
1 | /***************************************************************************** |
2 | * Copyright (C) 2013 x265 project | |
3 | * | |
4 | * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com> | |
5 | * Steve Borho <steve@borho.org> | |
6 | * | |
7 | * This program is free software; you can redistribute it and/or modify | |
8 | * it under the terms of the GNU General Public License as published by | |
9 | * the Free Software Foundation; either version 2 of the License, or | |
10 | * (at your option) any later version. | |
11 | * | |
12 | * This program is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | * GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with this program; if not, write to the Free Software | |
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
20 | * | |
21 | * This program is also available under a commercial proprietary license. | |
22 | * For more information, contact us at license @ x265.com. | |
23 | *****************************************************************************/ | |
24 | ||
25 | #include "common.h" | |
26 | #include "frame.h" | |
27 | #include "framedata.h" | |
28 | #include "picyuv.h" | |
29 | #include "primitives.h" | |
30 | #include "threading.h" | |
31 | ||
32 | #include "analysis.h" | |
33 | #include "rdcost.h" | |
34 | #include "encoder.h" | |
35 | ||
36 | #include "PPA/ppa.h" | |
37 | ||
38 | using namespace x265; | |
39 | ||
40 | /* An explanation of rate distortion levels (--rd-level) | |
41 | * | |
42 | * rd-level 0 generates no recon per CU (NO RDO or Quant) | |
43 | * | |
44 | * sa8d selection between merge / skip / inter / intra and split | |
45 | * no recon pixels generated until CTU analysis is complete, requiring | |
46 | * intra predictions to use source pixels | |
47 | * | |
48 | * rd-level 1 uses RDO for merge and skip, sa8d for all else | |
49 | * | |
50 | * RDO selection between merge and skip | |
51 | * sa8d selection between (merge/skip) / inter modes / intra and split | |
52 | * intra prediction uses reconstructed pixels | |
53 | * | |
54 | * rd-level 2 uses RDO for merge/skip and split | |
55 | * | |
56 | * RDO selection between merge and skip | |
57 | * sa8d selection between (merge/skip) / inter modes / intra | |
58 | * RDO split decisions | |
59 | * | |
60 | * rd-level 3 uses RDO for merge/skip/best inter/intra | |
61 | * | |
62 | * RDO selection between merge and skip | |
63 | * sa8d selection of best inter mode | |
64 | * RDO selection between (merge/skip) / best inter mode / intra / split | |
65 | * | |
66 | * rd-level 4 enables RDOQuant | |
67 | * | |
68 | * rd-level 5,6 does RDO for each inter mode | |
69 | */ | |
70 | ||
71 | Analysis::Analysis() | |
72 | { | |
73 | m_totalNumJobs = m_numAcquiredJobs = m_numCompletedJobs = 0; | |
74 | } | |
75 | ||
76 | bool Analysis::create(ThreadLocalData *tld) | |
77 | { | |
78 | m_tld = tld; | |
79 | m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2; | |
80 | ||
81 | int csp = m_param->internalCsp; | |
82 | uint32_t cuSize = g_maxCUSize; | |
83 | ||
84 | bool ok = true; | |
85 | for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++, cuSize >>= 1) | |
86 | { | |
87 | ModeDepth &md = m_modeDepth[depth]; | |
88 | ||
89 | md.cuMemPool.create(depth, csp, MAX_PRED_TYPES); | |
90 | ok &= md.fencYuv.create(cuSize, csp); | |
91 | ||
92 | for (int j = 0; j < MAX_PRED_TYPES; j++) | |
93 | { | |
94 | md.pred[j].cu.initialize(md.cuMemPool, depth, csp, j); | |
95 | ok &= md.pred[j].predYuv.create(cuSize, csp); | |
96 | ok &= md.pred[j].reconYuv.create(cuSize, csp); | |
97 | md.pred[j].fencYuv = &md.fencYuv; | |
98 | } | |
99 | } | |
100 | ||
101 | return ok; | |
102 | } | |
103 | ||
104 | void Analysis::destroy() | |
105 | { | |
106 | for (uint32_t i = 0; i <= g_maxCUDepth; i++) | |
107 | { | |
108 | m_modeDepth[i].cuMemPool.destroy(); | |
109 | m_modeDepth[i].fencYuv.destroy(); | |
110 | ||
111 | for (int j = 0; j < MAX_PRED_TYPES; j++) | |
112 | { | |
113 | m_modeDepth[i].pred[j].predYuv.destroy(); | |
114 | m_modeDepth[i].pred[j].reconYuv.destroy(); | |
115 | } | |
116 | } | |
117 | } | |
118 | ||
119 | Search::Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext) | |
120 | { | |
121 | m_slice = ctu.m_slice; | |
122 | m_frame = &frame; | |
123 | ||
124 | invalidateContexts(0); | |
125 | m_quant.setQPforQuant(ctu); | |
126 | m_rqt[0].cur.load(initialContext); | |
127 | m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_origPicYuv, ctu.m_cuAddr, 0); | |
128 | ||
129 | uint32_t numPartition = ctu.m_numPartitions; | |
130 | if (m_slice->m_sliceType == I_SLICE) | |
131 | { | |
132 | uint32_t zOrder = 0; | |
133 | if (m_param->analysisMode == X265_ANALYSIS_LOAD) | |
134 | compressIntraCU(ctu, cuGeom, m_frame->m_intraData, zOrder); | |
135 | else | |
136 | { | |
137 | compressIntraCU(ctu, cuGeom, NULL, zOrder); | |
138 | ||
139 | if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_intraData) | |
140 | { | |
141 | CUData *bestCU = &m_modeDepth[0].bestMode->cu; | |
142 | memcpy(&m_frame->m_intraData->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition); | |
143 | memcpy(&m_frame->m_intraData->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition); | |
144 | memcpy(&m_frame->m_intraData->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition); | |
145 | m_frame->m_intraData->cuAddr[ctu.m_cuAddr] = ctu.m_cuAddr; | |
146 | m_frame->m_intraData->poc[ctu.m_cuAddr] = m_frame->m_poc; | |
147 | } | |
148 | } | |
149 | } | |
150 | else | |
151 | { | |
152 | if (!m_param->rdLevel) | |
153 | { | |
154 | /* In RD Level 0/1, copy source pixels into the reconstructed block so | |
155 | * they are available for intra predictions */ | |
156 | m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPicYuv, ctu.m_cuAddr, 0); | |
157 | ||
158 | compressInterCU_rd0_4(ctu, cuGeom); // TODO: this really wants to be compressInterCU_rd0_1 | |
159 | ||
160 | /* generate residual for entire CTU at once and copy to reconPic */ | |
161 | encodeResidue(ctu, cuGeom); | |
162 | } | |
163 | else if (m_param->bDistributeModeAnalysis && m_param->rdLevel >= 2) | |
164 | compressInterCU_dist(ctu, cuGeom); | |
165 | else if (m_param->rdLevel <= 4) | |
166 | compressInterCU_rd0_4(ctu, cuGeom); | |
167 | else | |
168 | compressInterCU_rd5_6(ctu, cuGeom); | |
169 | } | |
170 | ||
171 | return *m_modeDepth[0].bestMode; | |
172 | } | |
173 | ||
174 | void Analysis::tryLossless(const CUGeom& cuGeom) | |
175 | { | |
176 | ModeDepth& md = m_modeDepth[cuGeom.depth]; | |
177 | ||
178 | if (!md.bestMode->distortion) | |
179 | /* already lossless */ | |
180 | return; | |
181 | else if (md.bestMode->cu.m_predMode[0] == MODE_INTRA) | |
182 | { | |
183 | md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom); | |
184 | PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0]; | |
185 | uint8_t* modes = md.pred[PRED_LOSSLESS].cu.m_lumaIntraDir; | |
186 | checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size, modes); | |
187 | checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth); | |
188 | } | |
189 | else | |
190 | { | |
191 | md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom); | |
192 | md.pred[PRED_LOSSLESS].predYuv.copyFromYuv(md.bestMode->predYuv); | |
193 | encodeResAndCalcRdInterCU(md.pred[PRED_LOSSLESS], cuGeom); | |
194 | checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth); | |
195 | } | |
196 | } | |
197 | ||
/* Recursively analyse one CU of an intra coded CTU, leaving the winning mode in
 * m_modeDepth[cuGeom.depth].bestMode.
 *
 * parentCTU - CTU that contains the CU
 * cuGeom    - geometry of the CU (depth, flags, encodeIdx, childOffset)
 * shared    - when non-NULL, previously recorded depth / partition size / intra
 *             mode decisions for the frame; they are evaluated instead of
 *             searching the 2Nx2N and NxN candidates
 * zOrder    - in/out running offset into the shared buffers, advanced as CUs
 *             and non-present sub-CUs are consumed
 *
 * On return the best CU data has been copied to the picture and, unless the
 * split mode won, its reconstruction has been copied into the recon picture */
void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x265_intra_data* shared, uint32_t& zOrder)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);

    if (shared)
    {
        /* locate this CTU's section of the frame-wide shared buffers */
        uint8_t* sharedDepth = &shared->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
        char* sharedPartSizes = &shared->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
        uint8_t* sharedModes = &shared->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];

        /* the recorded depth at the current offset matches this CU: evaluate
         * only the recorded partition size, then stop recursing */
        if (mightNotSplit && depth == sharedDepth[zOrder] && zOrder == cuGeom.encodeIdx)
        {
            m_quant.setQPforQuant(parentCTU);

            PartSize size = (PartSize)sharedPartSizes[zOrder];
            Mode& mode = size == SIZE_2Nx2N ? md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN];
            mode.cu.initSubCU(parentCTU, cuGeom);
            checkIntra(mode, cuGeom, size, sharedModes);
            checkBestMode(mode, depth);

            if (m_bTryLossless)
                tryLossless(cuGeom);

            if (mightSplit)
                addSplitFlagCost(*md.bestMode, cuGeom.depth);

            // increment zOrder offset to point to next best depth in sharedDepth buffer
            zOrder += g_depthInc[g_maxCUDepth - 1][sharedDepth[zOrder]];
            mightSplit = false;
        }
    }
    else if (mightNotSplit)
    {
        m_quant.setQPforQuant(parentCTU);

        md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL);
        checkBestMode(md.pred[PRED_INTRA], depth);

        /* NxN partitions are only evaluated at the maximum CU depth */
        if (depth == g_maxCUDepth)
        {
            md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL);
            checkBestMode(md.pred[PRED_INTRA_NxN], depth);
        }

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    if (mightSplit)
    {
        /* accumulate the four sub-CUs into the PRED_SPLIT mode of this depth */
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);

        /* the first child starts from this depth's current context; each later
         * child starts from the context of the previous child's best mode */
        Entropy* nextContext = &m_rqt[depth].cur;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childCuData.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
                m_rqt[nextDepth].cur.load(*nextContext);
                compressIntraCU(parentCTU, childCuData, shared, zOrder);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);
                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
            {
                /* record the depth of this non-present sub-CU */
                splitCU->setEmptyPart(childCuData, subPartIdx);
                zOrder += g_depthInc[g_maxCUDepth - 1][nextDepth];
            }
        }
        /* hand the context left by the last coded child to the split mode
         * (store() presumably copies into its argument - confirm in Entropy) */
        nextContext->store(splitPred->contexts);

        /* the split flag is only costed when not splitting was also allowed */
        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            updateModeCost(*splitPred);
        checkBestMode(*splitPred, depth);
    }

    /* NOTE(review): md.bestMode is dereferenced unconditionally from here on;
     * this assumes one of the paths above always selected a mode - confirm for
     * the shared (analysis load) path on LEAF CUs */
    checkDQP(md.bestMode->cu, cuGeom);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT])
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, parentCTU.m_cuAddr, cuGeom.encodeIdx);
}
305 | ||
306 | bool Analysis::findJob(int threadId) | |
307 | { | |
308 | /* try to acquire a CU mode to analyze */ | |
309 | if (m_totalNumJobs > m_numAcquiredJobs) | |
310 | { | |
311 | /* ATOMIC_INC returns the incremented value */ | |
312 | int id = ATOMIC_INC(&m_numAcquiredJobs); | |
313 | if (m_totalNumJobs >= id) | |
314 | { | |
315 | parallelModeAnalysis(threadId, id - 1); | |
316 | ||
317 | if (ATOMIC_INC(&m_numCompletedJobs) == m_totalNumJobs) | |
318 | m_modeCompletionEvent.trigger(); | |
319 | return true; | |
320 | } | |
321 | } | |
322 | ||
323 | if (m_totalNumME > m_numAcquiredME) | |
324 | { | |
325 | int id = ATOMIC_INC(&m_numAcquiredME); | |
326 | if (m_totalNumME >= id) | |
327 | { | |
328 | parallelME(threadId, id - 1); | |
329 | ||
330 | if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME) | |
331 | m_meCompletionEvent.trigger(); | |
332 | return true; | |
333 | } | |
334 | } | |
335 | ||
336 | return false; | |
337 | } | |
338 | ||
339 | void Analysis::parallelME(int threadId, int meId) | |
340 | { | |
341 | Analysis* slave; | |
342 | ||
343 | if (threadId == -1) | |
344 | slave = this; | |
345 | else | |
346 | { | |
347 | slave = &m_tld[threadId].analysis; | |
348 | slave->setQP(*m_slice, m_rdCost.m_qp); | |
349 | slave->m_slice = m_slice; | |
350 | slave->m_frame = m_frame; | |
351 | ||
352 | PicYuv* fencPic = m_frame->m_origPicYuv; | |
353 | pixel* pu = fencPic->getLumaAddr(m_curMECu->m_cuAddr, m_curGeom->encodeIdx + m_puAbsPartIdx); | |
354 | slave->m_me.setSourcePlane(fencPic->m_picOrg[0], fencPic->m_stride); | |
355 | slave->m_me.setSourcePU(pu - fencPic->m_picOrg[0], m_puWidth, m_puHeight); | |
356 | ||
357 | slave->prepMotionCompensation(*m_curMECu, *m_curGeom, m_curPart); | |
358 | } | |
359 | ||
360 | if (meId < m_slice->m_numRefIdx[0]) | |
361 | slave->singleMotionEstimation(*this, *m_curMECu, *m_curGeom, m_curPart, 0, meId); | |
362 | else | |
363 | slave->singleMotionEstimation(*this, *m_curMECu, *m_curGeom, m_curPart, 1, meId - m_slice->m_numRefIdx[0]); | |
364 | } | |
365 | ||
366 | void Analysis::parallelModeAnalysis(int threadId, int jobId) | |
367 | { | |
368 | Analysis* slave; | |
369 | ||
370 | if (threadId == -1) | |
371 | slave = this; | |
372 | else | |
373 | { | |
374 | slave = &m_tld[threadId].analysis; | |
375 | slave->m_slice = m_slice; | |
376 | slave->m_frame = m_frame; | |
377 | slave->setQP(*m_slice, m_rdCost.m_qp); | |
378 | slave->invalidateContexts(0); | |
379 | if (jobId) | |
380 | slave->m_me.setSourcePlane(m_frame->m_origPicYuv->m_picOrg[0], m_frame->m_origPicYuv->m_stride); | |
381 | } | |
382 | ||
383 | ModeDepth& md = m_modeDepth[m_curGeom->depth]; | |
384 | ||
385 | if (m_param->rdLevel <= 4) | |
386 | { | |
387 | switch (jobId) | |
388 | { | |
389 | case 0: | |
390 | if (slave != this) | |
391 | slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur); | |
392 | slave->checkIntraInInter_rd0_4(md.pred[PRED_INTRA], *m_curGeom); | |
393 | if (m_param->rdLevel > 2) | |
394 | slave->encodeIntraInInter(md.pred[PRED_INTRA], *m_curGeom); | |
395 | break; | |
396 | ||
397 | case 1: | |
398 | slave->checkInter_rd0_4(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N); | |
399 | break; | |
400 | ||
401 | case 2: | |
402 | slave->checkInter_rd0_4(md.pred[PRED_Nx2N], *m_curGeom, SIZE_Nx2N); | |
403 | break; | |
404 | ||
405 | case 3: | |
406 | slave->checkInter_rd0_4(md.pred[PRED_2NxN], *m_curGeom, SIZE_2NxN); | |
407 | break; | |
408 | ||
409 | case 4: | |
410 | slave->checkInter_rd0_4(md.pred[PRED_2NxnU], *m_curGeom, SIZE_2NxnU); | |
411 | break; | |
412 | ||
413 | case 5: | |
414 | slave->checkInter_rd0_4(md.pred[PRED_2NxnD], *m_curGeom, SIZE_2NxnD); | |
415 | break; | |
416 | ||
417 | case 6: | |
418 | slave->checkInter_rd0_4(md.pred[PRED_nLx2N], *m_curGeom, SIZE_nLx2N); | |
419 | break; | |
420 | ||
421 | case 7: | |
422 | slave->checkInter_rd0_4(md.pred[PRED_nRx2N], *m_curGeom, SIZE_nRx2N); | |
423 | break; | |
424 | ||
425 | default: | |
426 | X265_CHECK(0, "invalid job ID for parallel mode analysis\n"); | |
427 | break; | |
428 | } | |
429 | } | |
430 | else | |
431 | { | |
432 | bool bMergeOnly = m_curGeom->log2CUSize == 6; | |
433 | if (slave != this) | |
434 | { | |
435 | slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur); | |
436 | slave->m_quant.setQPforQuant(md.pred[PRED_2Nx2N].cu); | |
437 | } | |
438 | ||
439 | switch (jobId) | |
440 | { | |
441 | case 0: | |
442 | slave->checkIntra(md.pred[PRED_INTRA], *m_curGeom, SIZE_2Nx2N, NULL); | |
443 | if (m_curGeom->depth == g_maxCUDepth && m_curGeom->log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize) | |
444 | slave->checkIntra(md.pred[PRED_INTRA_NxN], *m_curGeom, SIZE_NxN, NULL); | |
445 | break; | |
446 | ||
447 | case 1: | |
448 | slave->checkInter_rd5_6(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N, false); | |
449 | break; | |
450 | ||
451 | case 2: | |
452 | slave->checkInter_rd5_6(md.pred[PRED_Nx2N], *m_curGeom, SIZE_Nx2N, false); | |
453 | break; | |
454 | ||
455 | case 3: | |
456 | slave->checkInter_rd5_6(md.pred[PRED_2NxN], *m_curGeom, SIZE_2NxN, false); | |
457 | break; | |
458 | ||
459 | case 4: | |
460 | slave->checkInter_rd5_6(md.pred[PRED_2NxnU], *m_curGeom, SIZE_2NxnU, bMergeOnly); | |
461 | break; | |
462 | ||
463 | case 5: | |
464 | slave->checkInter_rd5_6(md.pred[PRED_2NxnD], *m_curGeom, SIZE_2NxnD, bMergeOnly); | |
465 | break; | |
466 | ||
467 | case 6: | |
468 | slave->checkInter_rd5_6(md.pred[PRED_nLx2N], *m_curGeom, SIZE_nLx2N, bMergeOnly); | |
469 | break; | |
470 | ||
471 | case 7: | |
472 | slave->checkInter_rd5_6(md.pred[PRED_nRx2N], *m_curGeom, SIZE_nRx2N, bMergeOnly); | |
473 | break; | |
474 | ||
475 | default: | |
476 | X265_CHECK(0, "invalid job ID for parallel mode analysis\n"); | |
477 | break; | |
478 | } | |
479 | } | |
480 | } | |
481 | ||
/* Recursive inter CU analysis in which the candidate modes of each CU are
 * queued as jobs for pool threads (see findJob / parallelModeAnalysis), while
 * this thread helps with the jobs and performs merge/skip analysis itself.
 * Requires rd-level 2 or above.
 *
 * parentCTU - CTU that contains the CU
 * cuGeom    - geometry of the CU being analysed
 *
 * On return m_modeDepth[cuGeom.depth].bestMode holds the winner, its CU data
 * has been copied to the picture and, unless the split mode won, its
 * reconstruction has been copied into the recon picture */
void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    uint32_t cuAddr = parentCTU.m_cuAddr;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
    /* the top-skip minimum depth is only applied at rd-levels up to 4 */
    uint32_t minDepth = m_param->rdLevel <= 4 ? topSkipMinDepth(parentCTU, cuGeom) : 0;

    X265_CHECK(m_param->rdLevel >= 2, "compressInterCU_dist does not support RD 0 or 1\n");

    if (mightNotSplit && depth >= minDepth)
    {
        /* at rd-levels up to 4, AMP is not tried when log2CUSize is 6 or more */
        int bTryAmp = m_slice->m_sps->maxAMPDepth > depth && (cuGeom.log2CUSize < 6 || m_param->rdLevel > 4);
        int bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;

        /* Initialize all prediction CUs based on parentCTU */
        md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
        if (m_param->bEnableRectInter)
        {
            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
        }
        if (bTryAmp)
        {
            md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
        }
        if (bTryIntra)
        {
            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
            if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
        }

        /* Job ids are handed out sequentially by findJob(): 0 = intra,
         * 1 = 2Nx2N, 2-3 = rect, 4-7 = AMP (see parallelModeAnalysis).
         * Starting the acquired/completed counters at 1 skips the intra job
         * when intra is not tried.
         * NOTE(review): because ids are sequential, the AMP jobs are only
         * reached when rect is enabled too - presumably AMP implies rect;
         * confirm against the parameter validation */
        m_totalNumJobs = 2 + m_param->bEnableRectInter * 2 + bTryAmp * 4;
        m_numAcquiredJobs = !bTryIntra;
        m_numCompletedJobs = m_numAcquiredJobs;
        m_curGeom = &cuGeom;
        m_bJobsQueued = true;
        JobProvider::enqueue();

        /* poke one idle thread per outstanding job */
        for (int i = 0; i < m_totalNumJobs - m_numCompletedJobs; i++)
            m_pool->pokeIdleThread();

        /* participate in processing jobs, until all are distributed */
        while (findJob(-1))
            ;

        JobProvider::dequeue();
        m_bJobsQueued = false;

        /* the master worker thread (this one) does merge analysis. By doing
         * merge after all the other jobs are at least started, we usually avoid
         * blocking on another thread */

        if (m_param->rdLevel <= 4)
        {
            checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);

            /* block until every queued job has completed */
            m_modeCompletionEvent.wait();

            /* select best inter mode based on sa8d cost */
            Mode *bestInter = &md.pred[PRED_2Nx2N];

            if (m_param->bEnableRectInter)
            {
                if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_Nx2N];
                if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxN];
            }

            if (bTryAmp)
            {
                if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxnU];
                if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxnD];
                if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_nLx2N];
                if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_nRx2N];
            }

            if (m_param->rdLevel > 2)
            {
                /* encode best inter */
                for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
                {
                    prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
                    motionCompensation(bestInter->predYuv, false, true);
                }
                encodeResAndCalcRdInterCU(*bestInter, cuGeom);

                /* RD selection between merge, inter and intra */
                checkBestMode(*bestInter, depth);

                /* the intra candidate was already encoded by job 0 at this rd-level */
                if (bTryIntra)
                    checkBestMode(md.pred[PRED_INTRA], depth);
            }
            else /* m_param->rdLevel == 2 */
            {
                /* sa8d selection between the current best (if any), inter and intra */
                if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = bestInter;

                if (bTryIntra && md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
                {
                    md.bestMode = &md.pred[PRED_INTRA];
                    encodeIntraInInter(*md.bestMode, cuGeom);
                }
                else if (!md.bestMode->cu.m_mergeFlag[0])
                {
                    /* finally code the best mode selected from SA8D costs */
                    for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++)
                    {
                        prepMotionCompensation(md.bestMode->cu, cuGeom, puIdx);
                        motionCompensation(md.bestMode->predYuv, false, true);
                    }
                    encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
                }
            }
        }
        else
        {
            /* rd-levels above 4: every candidate is compared through checkBestMode() */
            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
            m_modeCompletionEvent.wait();

            checkBestMode(md.pred[PRED_2Nx2N], depth);

            if (m_param->bEnableRectInter)
            {
                checkBestMode(md.pred[PRED_Nx2N], depth);
                checkBestMode(md.pred[PRED_2NxN], depth);
            }

            if (bTryAmp)
            {
                checkBestMode(md.pred[PRED_2NxnU], depth);
                checkBestMode(md.pred[PRED_2NxnD], depth);
                checkBestMode(md.pred[PRED_nLx2N], depth);
                checkBestMode(md.pred[PRED_nRx2N], depth);
            }

            if (bTryIntra)
            {
                checkBestMode(md.pred[PRED_INTRA], depth);
                if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                    checkBestMode(md.pred[PRED_INTRA_NxN], depth);
            }
        }

        /* the best mode still has the MAX_INT64 cost and intra was not tried:
         * evaluate intra now as a fallback */
        if (md.bestMode->rdCost == MAX_INT64 && !bTryIntra)
        {
            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
            checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom);
            encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
            checkBestMode(md.pred[PRED_INTRA], depth);
        }

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    /* recursion is skipped when the best mode is a skip, or (rd-level <= 4,
     * depth > 0) when recursionDepthCheck() says so */
    bool bNoSplit = false;
    if (md.bestMode)
    {
        bNoSplit = !!md.bestMode->cu.isSkipped(0);
        if (mightSplit && depth && depth >= minDepth && !bNoSplit && m_param->rdLevel <= 4)
            bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
    }

    if (mightSplit && !bNoSplit)
    {
        /* accumulate the four sub-CUs into the PRED_SPLIT mode of this depth */
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);

        /* the first child starts from this depth's current context; each later
         * child starts from the context of the previous child's best mode */
        Entropy* nextContext = &m_rqt[depth].cur;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childCuData.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
                m_rqt[nextDepth].cur.load(*nextContext);
                compressInterCU_dist(parentCTU, childCuData);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);

                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
                splitCU->setEmptyPart(childCuData, subPartIdx);
        }
        /* hand the context left by the last coded child to the split mode
         * (store() presumably copies into its argument - confirm in Entropy) */
        nextContext->store(splitPred->contexts);

        /* the split flag is only costed when not splitting was also allowed */
        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            updateModeCost(*splitPred);

        checkBestMode(*splitPred, depth);
    }

    if (!depth || md.bestMode->cu.m_predMode[0] != MODE_INTRA)
    {
        /* early-out statistics */
        /* fold the best rdCost into this CTU's running average for the depth */
        FrameData& curEncData = const_cast<FrameData&>(*m_frame->m_encData);
        FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
        uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
        cuStat.count[depth] += 1;
        cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
    }

    checkDQP(md.bestMode->cu, cuGeom);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT])
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cuAddr, cuGeom.encodeIdx);
}
721 | ||
722 | void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom) | |
723 | { | |
724 | uint32_t depth = cuGeom.depth; | |
725 | uint32_t cuAddr = parentCTU.m_cuAddr; | |
726 | ModeDepth& md = m_modeDepth[depth]; | |
727 | md.bestMode = NULL; | |
728 | ||
729 | bool mightSplit = !(cuGeom.flags & CUGeom::LEAF); | |
730 | bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY); | |
731 | uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom); | |
732 | ||
733 | if (mightNotSplit && depth >= minDepth) | |
734 | { | |
735 | bool bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames; | |
736 | ||
737 | /* Initialize all prediction CUs based on parentCTU */ | |
738 | md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom); | |
739 | md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom); | |
740 | md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom); | |
741 | if (m_param->bEnableRectInter) | |
742 | { | |
743 | md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom); | |
744 | md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom); | |
745 | } | |
746 | if (m_slice->m_sps->maxAMPDepth > depth && cuGeom.log2CUSize < 6) | |
747 | { | |
748 | md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom); | |
749 | md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom); | |
750 | md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom); | |
751 | md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom); | |
752 | } | |
753 | ||
754 | /* Compute Merge Cost */ | |
755 | checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); | |
756 | ||
757 | bool earlyskip = false; | |
758 | if (m_param->rdLevel) | |
759 | earlyskip = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth | |
760 | ||
761 | if (!earlyskip) | |
762 | { | |
763 | checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N); | |
764 | Mode *bestInter = &md.pred[PRED_2Nx2N]; | |
765 | ||
766 | if (m_param->bEnableRectInter) | |
767 | { | |
768 | checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N); | |
769 | if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost) | |
770 | bestInter = &md.pred[PRED_Nx2N]; | |
771 | checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN); | |
772 | if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost) | |
773 | bestInter = &md.pred[PRED_2NxN]; | |
774 | } | |
775 | ||
776 | if (m_slice->m_sps->maxAMPDepth > depth && cuGeom.log2CUSize < 6) | |
777 | { | |
778 | bool bHor = false, bVer = false; | |
779 | if (bestInter->cu.m_partSize[0] == SIZE_2NxN) | |
780 | bHor = true; | |
781 | else if (bestInter->cu.m_partSize[0] == SIZE_Nx2N) | |
782 | bVer = true; | |
783 | else if (bestInter->cu.m_partSize[0] == SIZE_2Nx2N && | |
784 | md.bestMode && md.bestMode->cu.getQtRootCbf(0)) | |
785 | { | |
786 | bHor = true; | |
787 | bVer = true; | |
788 | } | |
789 | ||
790 | if (bHor) | |
791 | { | |
792 | checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU); | |
793 | if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost) | |
794 | bestInter = &md.pred[PRED_2NxnU]; | |
795 | checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD); | |
796 | if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost) | |
797 | bestInter = &md.pred[PRED_2NxnD]; | |
798 | } | |
799 | if (bVer) | |
800 | { | |
801 | checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N); | |
802 | if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost) | |
803 | bestInter = &md.pred[PRED_nLx2N]; | |
804 | checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N); | |
805 | if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost) | |
806 | bestInter = &md.pred[PRED_nRx2N]; | |
807 | } | |
808 | } | |
809 | ||
810 | if (m_param->rdLevel >= 3) | |
811 | { | |
812 | /* Calculate RD cost of best inter option */ | |
813 | for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++) | |
814 | { | |
815 | prepMotionCompensation(bestInter->cu, cuGeom, puIdx); | |
816 | motionCompensation(bestInter->predYuv, false, true); | |
817 | } | |
818 | ||
819 | encodeResAndCalcRdInterCU(*bestInter, cuGeom); | |
820 | ||
821 | if (!md.bestMode || bestInter->rdCost < md.bestMode->rdCost) | |
822 | md.bestMode = bestInter; | |
823 | ||
824 | if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) || | |
825 | md.bestMode->sa8dCost == MAX_INT64) | |
826 | { | |
827 | md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom); | |
828 | checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom); | |
829 | encodeIntraInInter(md.pred[PRED_INTRA], cuGeom); | |
830 | if (md.pred[PRED_INTRA].rdCost < md.bestMode->rdCost) | |
831 | md.bestMode = &md.pred[PRED_INTRA]; | |
832 | } | |
833 | } | |
834 | else | |
835 | { | |
836 | /* SA8D choice between merge/skip, inter, and intra */ | |
837 | if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost) | |
838 | md.bestMode = bestInter; | |
839 | ||
840 | if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64) | |
841 | { | |
842 | md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom); | |
843 | checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom); | |
844 | if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost) | |
845 | md.bestMode = &md.pred[PRED_INTRA]; | |
846 | } | |
847 | ||
848 | /* finally code the best mode selected by SA8D costs: | |
849 | * RD level 2 - fully encode the best mode | |
850 | * RD level 1 - generate recon pixels | |
851 | * RD level 0 - generate chroma prediction */ | |
852 | if (md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N) | |
853 | { | |
854 | /* prediction already generated for this CU, and if rd level | |
855 | * is not 0, it is already fully encoded */ | |
856 | } | |
857 | else if (md.bestMode->cu.m_predMode[0] == MODE_INTER) | |
858 | { | |
859 | for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++) | |
860 | { | |
861 | prepMotionCompensation(md.bestMode->cu, cuGeom, puIdx); | |
862 | motionCompensation(md.bestMode->predYuv, false, true); | |
863 | } | |
864 | if (m_param->rdLevel == 2) | |
865 | encodeResAndCalcRdInterCU(*md.bestMode, cuGeom); | |
866 | else if (m_param->rdLevel == 1) | |
867 | { | |
868 | m_rqt[cuGeom.depth].tmpResiYuv.subtract(md.fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize); | |
869 | generateCoeffRecon(*md.bestMode, cuGeom); | |
870 | } | |
871 | } | |
872 | else | |
873 | { | |
874 | if (m_param->rdLevel == 2) | |
875 | encodeIntraInInter(*md.bestMode, cuGeom); | |
876 | else if (m_param->rdLevel == 1) | |
877 | generateCoeffRecon(*md.bestMode, cuGeom); | |
878 | } | |
879 | } | |
880 | } // !earlyskip | |
881 | ||
882 | if (m_bTryLossless) | |
883 | tryLossless(cuGeom); | |
884 | ||
885 | if (mightSplit) | |
886 | addSplitFlagCost(*md.bestMode, cuGeom.depth); | |
887 | } | |
888 | ||
889 | bool bNoSplit = false; | |
890 | if (md.bestMode) | |
891 | { | |
892 | bNoSplit = !!md.bestMode->cu.isSkipped(0); | |
893 | if (mightSplit && depth && depth >= minDepth && !bNoSplit) | |
894 | bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode); | |
895 | } | |
896 | ||
897 | if (mightSplit && !bNoSplit) | |
898 | { | |
899 | Mode* splitPred = &md.pred[PRED_SPLIT]; | |
900 | splitPred->initCosts(); | |
901 | CUData* splitCU = &splitPred->cu; | |
902 | splitCU->initSubCU(parentCTU, cuGeom); | |
903 | ||
904 | uint32_t nextDepth = depth + 1; | |
905 | ModeDepth& nd = m_modeDepth[nextDepth]; | |
906 | invalidateContexts(nextDepth); | |
907 | Entropy* nextContext = &m_rqt[depth].cur; | |
908 | ||
909 | for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) | |
910 | { | |
911 | const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx); | |
912 | if (childCuData.flags & CUGeom::PRESENT) | |
913 | { | |
914 | m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx); | |
915 | m_rqt[nextDepth].cur.load(*nextContext); | |
916 | compressInterCU_rd0_4(parentCTU, childCuData); | |
917 | ||
918 | // Save best CU and pred data for this sub CU | |
919 | splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx); | |
920 | splitPred->addSubCosts(*nd.bestMode); | |
921 | ||
922 | if (m_param->rdLevel) | |
923 | nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx); | |
924 | else | |
925 | nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childCuData.numPartitions * subPartIdx); | |
926 | if (m_param->rdLevel > 1) | |
927 | nextContext = &nd.bestMode->contexts; | |
928 | } | |
929 | else | |
930 | splitCU->setEmptyPart(childCuData, subPartIdx); | |
931 | } | |
932 | nextContext->store(splitPred->contexts); | |
933 | ||
934 | if (mightNotSplit) | |
935 | addSplitFlagCost(*splitPred, cuGeom.depth); | |
936 | else if (m_param->rdLevel <= 1) | |
937 | splitPred->sa8dCost = m_rdCost.calcRdSADCost(splitPred->distortion, splitPred->sa8dBits); | |
938 | else | |
939 | updateModeCost(*splitPred); | |
940 | ||
941 | if (!md.bestMode) | |
942 | md.bestMode = splitPred; | |
943 | else if (m_param->rdLevel >= 1) | |
944 | { | |
945 | if (splitPred->rdCost < md.bestMode->rdCost) | |
946 | md.bestMode = splitPred; | |
947 | } | |
948 | else | |
949 | { | |
950 | if (splitPred->sa8dCost < md.bestMode->sa8dCost) | |
951 | md.bestMode = splitPred; | |
952 | } | |
953 | } | |
954 | ||
955 | if (!depth || md.bestMode->cu.m_predMode[0] != MODE_INTRA) | |
956 | { | |
957 | /* early-out statistics */ | |
958 | FrameData& curEncData = const_cast<FrameData&>(*m_frame->m_encData); | |
959 | FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr]; | |
960 | uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth]; | |
961 | cuStat.count[depth] += 1; | |
962 | cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth]; | |
963 | } | |
964 | ||
965 | checkDQP(md.bestMode->cu, cuGeom); | |
966 | ||
967 | /* Copy best data to encData CTU and recon */ | |
968 | md.bestMode->cu.copyToPic(depth); | |
969 | if (md.bestMode != &md.pred[PRED_SPLIT] && m_param->rdLevel) | |
970 | md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cuAddr, cuGeom.encodeIdx); | |
971 | } | |
972 | ||
973 | void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom) | |
974 | { | |
975 | uint32_t depth = cuGeom.depth; | |
976 | ModeDepth& md = m_modeDepth[depth]; | |
977 | md.bestMode = NULL; | |
978 | ||
979 | bool mightSplit = !(cuGeom.flags & CUGeom::LEAF); | |
980 | bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY); | |
981 | ||
982 | if (mightNotSplit) | |
983 | { | |
984 | for (int i = 0; i < MAX_PRED_TYPES; i++) | |
985 | md.pred[i].cu.initSubCU(parentCTU, cuGeom); | |
986 | ||
987 | checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); | |
988 | bool earlySkip = m_param->bEnableEarlySkip && md.bestMode && !md.bestMode->cu.getQtRootCbf(0); | |
989 | ||
990 | if (!earlySkip) | |
991 | { | |
992 | checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, false); | |
993 | checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth); | |
994 | ||
995 | if (m_param->bEnableRectInter) | |
996 | { | |
997 | // Nx2N rect | |
998 | if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) | |
999 | { | |
1000 | checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, false); | |
1001 | checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth); | |
1002 | } | |
1003 | if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) | |
1004 | { | |
1005 | checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, false); | |
1006 | checkBestMode(md.pred[PRED_2NxN], cuGeom.depth); | |
1007 | } | |
1008 | } | |
1009 | ||
1010 | // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N) | |
1011 | if (m_slice->m_sps->maxAMPDepth > depth) | |
1012 | { | |
1013 | bool bMergeOnly = cuGeom.log2CUSize == 6; | |
1014 | ||
1015 | bool bHor = false, bVer = false; | |
1016 | if (md.bestMode->cu.m_partSize[0] == SIZE_2NxN) | |
1017 | bHor = true; | |
1018 | else if (md.bestMode->cu.m_partSize[0] == SIZE_Nx2N) | |
1019 | bVer = true; | |
1020 | else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0] && !md.bestMode->cu.isSkipped(0)) | |
1021 | { | |
1022 | bHor = true; | |
1023 | bVer = true; | |
1024 | } | |
1025 | ||
1026 | if (bHor) | |
1027 | { | |
1028 | if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) | |
1029 | { | |
1030 | checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, bMergeOnly); | |
1031 | checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth); | |
1032 | } | |
1033 | if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) | |
1034 | { | |
1035 | checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, bMergeOnly); | |
1036 | checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth); | |
1037 | } | |
1038 | } | |
1039 | if (bVer) | |
1040 | { | |
1041 | if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) | |
1042 | { | |
1043 | checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, bMergeOnly); | |
1044 | checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth); | |
1045 | } | |
1046 | if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) | |
1047 | { | |
1048 | checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, bMergeOnly); | |
1049 | checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth); | |
1050 | } | |
1051 | } | |
1052 | } | |
1053 | ||
1054 | if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && | |
1055 | (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))) | |
1056 | { | |
1057 | checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL); | |
1058 | checkBestMode(md.pred[PRED_INTRA], depth); | |
1059 | ||
1060 | if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize) | |
1061 | { | |
1062 | checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL); | |
1063 | checkBestMode(md.pred[PRED_INTRA_NxN], depth); | |
1064 | } | |
1065 | } | |
1066 | } | |
1067 | ||
1068 | if (m_bTryLossless) | |
1069 | tryLossless(cuGeom); | |
1070 | ||
1071 | if (mightSplit) | |
1072 | addSplitFlagCost(*md.bestMode, cuGeom.depth); | |
1073 | } | |
1074 | ||
1075 | // estimate split cost | |
1076 | if (mightSplit && (!md.bestMode || !md.bestMode->cu.isSkipped(0))) | |
1077 | { | |
1078 | Mode* splitPred = &md.pred[PRED_SPLIT]; | |
1079 | splitPred->initCosts(); | |
1080 | CUData* splitCU = &splitPred->cu; | |
1081 | splitCU->initSubCU(parentCTU, cuGeom); | |
1082 | ||
1083 | uint32_t nextDepth = depth + 1; | |
1084 | ModeDepth& nd = m_modeDepth[nextDepth]; | |
1085 | invalidateContexts(nextDepth); | |
1086 | Entropy* nextContext = &m_rqt[depth].cur; | |
1087 | ||
1088 | for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) | |
1089 | { | |
1090 | const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx); | |
1091 | if (childCuData.flags & CUGeom::PRESENT) | |
1092 | { | |
1093 | m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx); | |
1094 | m_rqt[nextDepth].cur.load(*nextContext); | |
1095 | compressInterCU_rd5_6(parentCTU, childCuData); | |
1096 | ||
1097 | // Save best CU and pred data for this sub CU | |
1098 | splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx); | |
1099 | splitPred->addSubCosts(*nd.bestMode); | |
1100 | nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx); | |
1101 | nextContext = &nd.bestMode->contexts; | |
1102 | } | |
1103 | else | |
1104 | splitCU->setEmptyPart(childCuData, subPartIdx); | |
1105 | } | |
1106 | nextContext->store(splitPred->contexts); | |
1107 | if (mightNotSplit) | |
1108 | addSplitFlagCost(*splitPred, cuGeom.depth); | |
1109 | else | |
1110 | updateModeCost(*splitPred); | |
1111 | ||
1112 | checkBestMode(*splitPred, depth); | |
1113 | } | |
1114 | ||
1115 | checkDQP(md.bestMode->cu, cuGeom); | |
1116 | ||
1117 | /* Copy best data to encData CTU and recon */ | |
1118 | md.bestMode->cu.copyToPic(depth); | |
1119 | if (md.bestMode != &md.pred[PRED_SPLIT]) | |
1120 | md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, parentCTU.m_cuAddr, cuGeom.encodeIdx); | |
1121 | } | |
1122 | ||
1123 | /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */ | |
1124 | void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom) | |
1125 | { | |
1126 | uint32_t depth = cuGeom.depth; | |
1127 | ModeDepth& md = m_modeDepth[depth]; | |
1128 | Yuv *fencYuv = &md.fencYuv; | |
1129 | ||
1130 | /* Note that these two Mode instances are named MERGE and SKIP but they may | |
1131 | * hold the reverse when the function returns. We toggle between the two modes */ | |
1132 | Mode* tempPred = &merge; | |
1133 | Mode* bestPred = &skip; | |
1134 | ||
1135 | X265_CHECK(m_slice->m_sliceType != I_SLICE, "Evaluating merge in I slice\n"); | |
1136 | ||
1137 | tempPred->cu.setPartSizeSubParts(SIZE_2Nx2N); | |
1138 | tempPred->cu.setPredModeSubParts(MODE_INTER); | |
1139 | tempPred->cu.m_mergeFlag[0] = true; | |
1140 | ||
1141 | bestPred->cu.setPartSizeSubParts(SIZE_2Nx2N); | |
1142 | bestPred->cu.setPredModeSubParts(MODE_INTER); | |
1143 | bestPred->cu.m_mergeFlag[0] = true; | |
1144 | ||
1145 | MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists | |
1146 | uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS]; | |
1147 | uint32_t maxNumMergeCand = tempPred->cu.getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours); | |
1148 | ||
1149 | bestPred->sa8dCost = MAX_INT64; | |
1150 | int bestSadCand = -1; | |
1151 | int sizeIdx = cuGeom.log2CUSize - 2; | |
1152 | for (uint32_t i = 0; i < maxNumMergeCand; ++i) | |
1153 | { | |
1154 | if (m_bFrameParallel && | |
1155 | (mvFieldNeighbours[i][0].mv.y >= (m_param->searchRange + 1) * 4 || | |
1156 | mvFieldNeighbours[i][1].mv.y >= (m_param->searchRange + 1) * 4)) | |
1157 | continue; | |
1158 | ||
1159 | tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx | |
1160 | tempPred->cu.m_interDir[0] = interDirNeighbours[i]; | |
1161 | tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv; | |
1162 | tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx; | |
1163 | tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv; | |
1164 | tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx; | |
1165 | ||
1166 | // do MC only for Luma part | |
1167 | prepMotionCompensation(tempPred->cu, cuGeom, 0); | |
1168 | motionCompensation(tempPred->predYuv, true, false); | |
1169 | ||
1170 | tempPred->sa8dBits = getTUBits(i, maxNumMergeCand); | |
1171 | tempPred->distortion = primitives.sa8d[sizeIdx](fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size); | |
1172 | tempPred->sa8dCost = m_rdCost.calcRdSADCost(tempPred->distortion, tempPred->sa8dBits); | |
1173 | ||
1174 | if (tempPred->sa8dCost < bestPred->sa8dCost) | |
1175 | { | |
1176 | bestSadCand = i; | |
1177 | std::swap(tempPred, bestPred); | |
1178 | } | |
1179 | } | |
1180 | ||
1181 | /* force mode decision to take inter or intra */ | |
1182 | if (bestSadCand < 0) | |
1183 | return; | |
1184 | ||
1185 | /* calculate the motion compensation for chroma for the best mode selected */ | |
1186 | prepMotionCompensation(bestPred->cu, cuGeom, 0); | |
1187 | motionCompensation(bestPred->predYuv, false, true); | |
1188 | ||
1189 | if (m_param->rdLevel) | |
1190 | { | |
1191 | if (m_param->bLossless) | |
1192 | bestPred->rdCost = MAX_INT64; | |
1193 | else | |
1194 | encodeResAndCalcRdSkipCU(*bestPred); | |
1195 | ||
1196 | /* Encode with residual */ | |
1197 | tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand; | |
1198 | tempPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0); | |
1199 | tempPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0); | |
1200 | tempPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0); | |
1201 | tempPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0); | |
1202 | tempPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0); | |
1203 | tempPred->sa8dCost = bestPred->sa8dCost; | |
1204 | tempPred->predYuv.copyFromYuv(bestPred->predYuv); | |
1205 | ||
1206 | encodeResAndCalcRdInterCU(*tempPred, cuGeom); | |
1207 | ||
1208 | md.bestMode = tempPred->rdCost < bestPred->rdCost ? tempPred : bestPred; | |
1209 | } | |
1210 | else | |
1211 | md.bestMode = bestPred; | |
1212 | ||
1213 | /* broadcast sets of MV field data */ | |
1214 | bestPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0); | |
1215 | bestPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0); | |
1216 | bestPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0); | |
1217 | bestPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0); | |
1218 | bestPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0); | |
1219 | } | |
1220 | ||
1221 | /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */ | |
1222 | void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom) | |
1223 | { | |
1224 | uint32_t depth = cuGeom.depth; | |
1225 | ||
1226 | /* Note that these two Mode instances are named MERGE and SKIP but they may | |
1227 | * hold the reverse when the function returns. We toggle between the two modes */ | |
1228 | Mode* tempPred = &merge; | |
1229 | Mode* bestPred = &skip; | |
1230 | ||
1231 | merge.cu.setPredModeSubParts(MODE_INTER); | |
1232 | merge.cu.setPartSizeSubParts(SIZE_2Nx2N); | |
1233 | merge.cu.m_mergeFlag[0] = true; | |
1234 | ||
1235 | skip.cu.setPredModeSubParts(MODE_INTER); | |
1236 | skip.cu.setPartSizeSubParts(SIZE_2Nx2N); | |
1237 | skip.cu.m_mergeFlag[0] = true; | |
1238 | ||
1239 | MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists | |
1240 | uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS]; | |
1241 | uint32_t maxNumMergeCand = merge.cu.getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours); | |
1242 | ||
1243 | bool foundCbf0Merge = false; | |
1244 | bool triedPZero = false, triedBZero = false; | |
1245 | bestPred->rdCost = MAX_INT64; | |
1246 | for (uint32_t i = 0; i < maxNumMergeCand; i++) | |
1247 | { | |
1248 | if (m_bFrameParallel && | |
1249 | (mvFieldNeighbours[i][0].mv.y >= (m_param->searchRange + 1) * 4 || | |
1250 | mvFieldNeighbours[i][1].mv.y >= (m_param->searchRange + 1) * 4)) | |
1251 | continue; | |
1252 | ||
1253 | /* the merge candidate list is packed with MV(0,0) ref 0 when it is not full */ | |
1254 | if (interDirNeighbours[i] == 1 && !mvFieldNeighbours[i][0].mv.word && !mvFieldNeighbours[i][0].refIdx) | |
1255 | { | |
1256 | if (triedPZero) | |
1257 | continue; | |
1258 | triedPZero = true; | |
1259 | } | |
1260 | else if (interDirNeighbours[i] == 3 && | |
1261 | !mvFieldNeighbours[i][0].mv.word && !mvFieldNeighbours[i][0].refIdx && | |
1262 | !mvFieldNeighbours[i][1].mv.word && !mvFieldNeighbours[i][1].refIdx) | |
1263 | { | |
1264 | if (triedBZero) | |
1265 | continue; | |
1266 | triedBZero = true; | |
1267 | } | |
1268 | ||
1269 | tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; /* merge candidate ID is stored in L0 MVP idx */ | |
1270 | tempPred->cu.m_interDir[0] = interDirNeighbours[i]; | |
1271 | tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv; | |
1272 | tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx; | |
1273 | tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv; | |
1274 | tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx; | |
1275 | tempPred->cu.setSkipFlagSubParts(false); /* must be cleared between encode iterations */ | |
1276 | ||
1277 | prepMotionCompensation(tempPred->cu, cuGeom, 0); | |
1278 | motionCompensation(tempPred->predYuv, true, true); | |
1279 | ||
1280 | uint8_t hasCbf = true; | |
1281 | bool swapped = false; | |
1282 | if (!foundCbf0Merge) | |
1283 | { | |
1284 | /* if the best prediction has CBF (not a skip) then try merge with residual */ | |
1285 | ||
1286 | encodeResAndCalcRdInterCU(*tempPred, cuGeom); | |
1287 | hasCbf = tempPred->cu.getQtRootCbf(0); | |
1288 | foundCbf0Merge = !hasCbf; | |
1289 | ||
1290 | if (tempPred->rdCost < bestPred->rdCost) | |
1291 | { | |
1292 | std::swap(tempPred, bestPred); | |
1293 | swapped = true; | |
1294 | } | |
1295 | } | |
1296 | if (!m_param->bLossless && hasCbf) | |
1297 | { | |
1298 | /* try merge without residual (skip), if not lossless coding */ | |
1299 | ||
1300 | if (swapped) | |
1301 | { | |
1302 | tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; | |
1303 | tempPred->cu.m_interDir[0] = interDirNeighbours[i]; | |
1304 | tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv; | |
1305 | tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx; | |
1306 | tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv; | |
1307 | tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx; | |
1308 | tempPred->cu.setSkipFlagSubParts(false); | |
1309 | tempPred->predYuv.copyFromYuv(bestPred->predYuv); | |
1310 | } | |
1311 | ||
1312 | encodeResAndCalcRdSkipCU(*tempPred); | |
1313 | ||
1314 | if (tempPred->rdCost < bestPred->rdCost) | |
1315 | std::swap(tempPred, bestPred); | |
1316 | } | |
1317 | } | |
1318 | ||
1319 | if (bestPred->rdCost < MAX_INT64) | |
1320 | { | |
1321 | m_modeDepth[depth].bestMode = bestPred; | |
1322 | ||
1323 | /* broadcast sets of MV field data */ | |
1324 | uint32_t bestCand = bestPred->cu.m_mvpIdx[0][0]; | |
1325 | bestPred->cu.setPUInterDir(interDirNeighbours[bestCand], 0, 0); | |
1326 | bestPred->cu.setPUMv(0, mvFieldNeighbours[bestCand][0].mv, 0, 0); | |
1327 | bestPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestCand][0].refIdx, 0, 0); | |
1328 | bestPred->cu.setPUMv(1, mvFieldNeighbours[bestCand][1].mv, 0, 0); | |
1329 | bestPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestCand][1].refIdx, 0, 0); | |
1330 | } | |
1331 | } | |
1332 | ||
1333 | void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize) | |
1334 | { | |
1335 | interMode.initCosts(); | |
1336 | interMode.cu.setPartSizeSubParts(partSize); | |
1337 | interMode.cu.setPredModeSubParts(MODE_INTER); | |
1338 | ||
1339 | if (predInterSearch(interMode, cuGeom, false, false)) | |
1340 | { | |
1341 | /* predInterSearch sets interMode.sa8dBits */ | |
1342 | const Yuv& fencYuv = *interMode.fencYuv; | |
1343 | Yuv& predYuv = interMode.predYuv; | |
1344 | interMode.distortion = primitives.sa8d[cuGeom.log2CUSize - 2](fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size); | |
1345 | interMode.sa8dCost = m_rdCost.calcRdSADCost(interMode.distortion, interMode.sa8dBits); | |
1346 | } | |
1347 | else | |
1348 | { | |
1349 | interMode.distortion = MAX_UINT; | |
1350 | interMode.sa8dCost = MAX_INT64; | |
1351 | } | |
1352 | } | |
1353 | ||
1354 | void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, bool bMergeOnly) | |
1355 | { | |
1356 | interMode.initCosts(); | |
1357 | interMode.cu.setPartSizeSubParts(partSize); | |
1358 | interMode.cu.setPredModeSubParts(MODE_INTER); | |
1359 | ||
1360 | if (predInterSearch(interMode, cuGeom, bMergeOnly, true)) | |
1361 | { | |
1362 | /* predInterSearch sets interMode.sa8dBits, but this is ignored */ | |
1363 | encodeResAndCalcRdInterCU(interMode, cuGeom); | |
1364 | } | |
1365 | else | |
1366 | { | |
1367 | interMode.distortion = MAX_UINT; | |
1368 | interMode.rdCost = MAX_INT64; | |
1369 | } | |
1370 | } | |
1371 | ||
1372 | /* Note that this function does not save the best intra prediction, it must | |
1373 | * be generated later. It records the best mode in the cu */ | |
1374 | void Analysis::checkIntraInInter_rd0_4(Mode& intraMode, const CUGeom& cuGeom) | |
1375 | { | |
1376 | CUData& cu = intraMode.cu; | |
1377 | uint32_t depth = cu.m_cuDepth[0]; | |
1378 | ||
1379 | cu.setPartSizeSubParts(SIZE_2Nx2N); | |
1380 | cu.setPredModeSubParts(MODE_INTRA); | |
1381 | ||
1382 | uint32_t initTrDepth = 0; | |
1383 | uint32_t log2TrSize = cu.m_log2CUSize[0] - initTrDepth; | |
1384 | uint32_t tuSize = 1 << log2TrSize; | |
1385 | const uint32_t absPartIdx = 0; | |
1386 | ||
1387 | // Reference sample smoothing | |
1388 | initAdiPattern(cu, cuGeom, absPartIdx, initTrDepth, ALL_IDX); | |
1389 | ||
1390 | pixel* fenc = m_modeDepth[depth].fencYuv.m_buf[0]; | |
1391 | uint32_t stride = m_modeDepth[depth].fencYuv.m_size; | |
1392 | ||
1393 | pixel *above = m_refAbove + tuSize - 1; | |
1394 | pixel *aboveFiltered = m_refAboveFlt + tuSize - 1; | |
1395 | pixel *left = m_refLeft + tuSize - 1; | |
1396 | pixel *leftFiltered = m_refLeftFlt + tuSize - 1; | |
1397 | int sad, bsad; | |
1398 | uint32_t bits, bbits, mode, bmode; | |
1399 | uint64_t cost, bcost; | |
1400 | ||
1401 | // 33 Angle modes once | |
1402 | ALIGN_VAR_32(pixel, bufScale[32 * 32]); | |
1403 | ALIGN_VAR_32(pixel, bufTrans[32 * 32]); | |
1404 | ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]); | |
1405 | int scaleTuSize = tuSize; | |
1406 | int scaleStride = stride; | |
1407 | int costShift = 0; | |
1408 | int sizeIdx = log2TrSize - 2; | |
1409 | ||
1410 | if (tuSize > 32) | |
1411 | { | |
1412 | // origin is 64x64, we scale to 32x32 and setup required parameters | |
1413 | primitives.scale2D_64to32(bufScale, fenc, stride); | |
1414 | fenc = bufScale; | |
1415 | ||
1416 | // reserve space in case primitives need to store data in above | |
1417 | // or left buffers | |
1418 | pixel _above[4 * 32 + 1]; | |
1419 | pixel _left[4 * 32 + 1]; | |
1420 | pixel *aboveScale = _above + 2 * 32; | |
1421 | pixel *leftScale = _left + 2 * 32; | |
1422 | aboveScale[0] = leftScale[0] = above[0]; | |
1423 | primitives.scale1D_128to64(aboveScale + 1, above + 1, 0); | |
1424 | primitives.scale1D_128to64(leftScale + 1, left + 1, 0); | |
1425 | ||
1426 | scaleTuSize = 32; | |
1427 | scaleStride = 32; | |
1428 | costShift = 2; | |
1429 | sizeIdx = 5 - 2; // log2(scaleTuSize) - 2 | |
1430 | ||
1431 | // Filtered and Unfiltered refAbove and refLeft pointing to above and left. | |
1432 | above = aboveScale; | |
1433 | left = leftScale; | |
1434 | aboveFiltered = aboveScale; | |
1435 | leftFiltered = leftScale; | |
1436 | } | |
1437 | ||
1438 | pixelcmp_t sa8d = primitives.sa8d[sizeIdx]; | |
1439 | int predsize = scaleTuSize * scaleTuSize; | |
1440 | ||
1441 | m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur); | |
1442 | ||
1443 | /* there are three cost tiers for intra modes: | |
1444 | * pred[0] - mode probable, least cost | |
1445 | * pred[1], pred[2] - less probable, slightly more cost | |
1446 | * non-mpm modes - all cost the same (rbits) */ | |
1447 | uint64_t mpms; | |
1448 | uint32_t preds[3]; | |
1449 | uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms); | |
1450 | ||
1451 | // DC | |
1452 | primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16)); | |
1453 | bsad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift; | |
1454 | bmode = mode = DC_IDX; | |
1455 | bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits; | |
1456 | bcost = m_rdCost.calcRdSADCost(bsad, bbits); | |
1457 | ||
1458 | pixel *abovePlanar = above; | |
1459 | pixel *leftPlanar = left; | |
1460 | ||
1461 | if (tuSize & (8 | 16 | 32)) | |
1462 | { | |
1463 | abovePlanar = aboveFiltered; | |
1464 | leftPlanar = leftFiltered; | |
1465 | } | |
1466 | ||
1467 | // PLANAR | |
1468 | primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0); | |
1469 | sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift; | |
1470 | mode = PLANAR_IDX; | |
1471 | bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits; | |
1472 | cost = m_rdCost.calcRdSADCost(sad, bits); | |
1473 | COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits); | |
1474 | ||
1475 | // Transpose NxN | |
1476 | primitives.transpose[sizeIdx](bufTrans, fenc, scaleStride); | |
1477 | ||
1478 | primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16)); | |
1479 | ||
1480 | bool modeHor; | |
1481 | pixel *cmp; | |
1482 | intptr_t srcStride; | |
1483 | ||
1484 | #define TRY_ANGLE(angle) \ | |
1485 | modeHor = angle < 18; \ | |
1486 | cmp = modeHor ? bufTrans : fenc; \ | |
1487 | srcStride = modeHor ? scaleTuSize : scaleStride; \ | |
1488 | sad = sa8d(cmp, srcStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \ | |
1489 | bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \ | |
1490 | cost = m_rdCost.calcRdSADCost(sad, bits) | |
1491 | ||
1492 | if (m_param->bEnableFastIntra) | |
1493 | { | |
1494 | int asad = 0; | |
1495 | uint32_t lowmode, highmode, amode = 5, abits = 0; | |
1496 | uint64_t acost = MAX_INT64; | |
1497 | ||
1498 | /* pick the best angle, sampling at distance of 5 */ | |
1499 | for (mode = 5; mode < 35; mode += 5) | |
1500 | { | |
1501 | TRY_ANGLE(mode); | |
1502 | COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits); | |
1503 | } | |
1504 | ||
1505 | /* refine best angle at distance 2, then distance 1 */ | |
1506 | for (uint32_t dist = 2; dist >= 1; dist--) | |
1507 | { | |
1508 | lowmode = amode - dist; | |
1509 | highmode = amode + dist; | |
1510 | ||
1511 | X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n"); | |
1512 | TRY_ANGLE(lowmode); | |
1513 | COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits); | |
1514 | ||
1515 | X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n"); | |
1516 | TRY_ANGLE(highmode); | |
1517 | COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits); | |
1518 | } | |
1519 | ||
1520 | if (amode == 33) | |
1521 | { | |
1522 | TRY_ANGLE(34); | |
1523 | COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits); | |
1524 | } | |
1525 | ||
1526 | COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits); | |
1527 | } | |
1528 | else // calculate and search all intra prediction angles for lowest cost | |
1529 | { | |
1530 | for (mode = 2; mode < 35; mode++) | |
1531 | { | |
1532 | TRY_ANGLE(mode); | |
1533 | COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits); | |
1534 | } | |
1535 | } | |
1536 | ||
1537 | cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTrDepth); | |
1538 | intraMode.initCosts(); | |
1539 | intraMode.totalBits = bbits; | |
1540 | intraMode.distortion = bsad; | |
1541 | intraMode.sa8dCost = bcost; | |
1542 | intraMode.sa8dBits = bbits; | |
1543 | } | |
1544 | ||
1545 | void Analysis::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom) | |
1546 | { | |
1547 | CUData& cu = intraMode.cu; | |
1548 | Yuv* reconYuv = &intraMode.reconYuv; | |
1549 | Yuv* fencYuv = &m_modeDepth[cuGeom.depth].fencYuv; | |
1550 | ||
1551 | X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n"); | |
1552 | X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n"); | |
1553 | ||
1554 | m_quant.setQPforQuant(cu); | |
1555 | ||
1556 | uint32_t tuDepthRange[2]; | |
1557 | cu.getIntraTUQtDepthRange(tuDepthRange, 0); | |
1558 | ||
1559 | m_entropyCoder.load(m_rqt[cuGeom.depth].cur); | |
1560 | ||
1561 | Cost icosts; | |
1562 | codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange); | |
1563 | extractIntraResultQT(cu, *reconYuv, 0, 0); | |
1564 | ||
1565 | intraMode.distortion = icosts.distortion; | |
1566 | intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom); | |
1567 | ||
1568 | m_entropyCoder.resetBits(); | |
1569 | if (m_slice->m_pps->bTransquantBypassEnabled) | |
1570 | m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]); | |
1571 | m_entropyCoder.codeSkipFlag(cu, 0); | |
1572 | m_entropyCoder.codePredMode(cu.m_predMode[0]); | |
1573 | m_entropyCoder.codePartSize(cu, 0, cuGeom.depth); | |
1574 | m_entropyCoder.codePredInfo(cu, 0); | |
1575 | intraMode.mvBits += m_entropyCoder.getNumberOfWrittenBits(); | |
1576 | ||
1577 | bool bCodeDQP = m_slice->m_pps->bUseDQP; | |
1578 | m_entropyCoder.codeCoeff(cu, 0, cuGeom.depth, bCodeDQP, tuDepthRange); | |
1579 | ||
1580 | intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits(); | |
1581 | intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits; | |
1582 | if (m_rdCost.m_psyRd) | |
1583 | intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); | |
1584 | ||
1585 | m_entropyCoder.store(intraMode.contexts); | |
1586 | updateModeCost(intraMode); | |
1587 | } | |
1588 | ||
1589 | void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom) | |
1590 | { | |
1591 | if (cuGeom.depth < ctu.m_cuDepth[cuGeom.encodeIdx] && cuGeom.depth < g_maxCUDepth) | |
1592 | { | |
1593 | for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) | |
1594 | { | |
1595 | const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx); | |
1596 | if (childCuData.flags & CUGeom::PRESENT) | |
1597 | encodeResidue(ctu, childCuData); | |
1598 | } | |
1599 | return; | |
1600 | } | |
1601 | ||
1602 | uint32_t absPartIdx = cuGeom.encodeIdx; | |
1603 | int sizeIdx = cuGeom.log2CUSize - 2; | |
1604 | ||
1605 | Yuv& fencYuv = m_modeDepth[0].fencYuv; | |
1606 | ||
1607 | /* reuse the bestMode data structures at the current depth */ | |
1608 | Mode *bestMode = m_modeDepth[cuGeom.depth].bestMode; | |
1609 | Yuv& reconYuv = bestMode->reconYuv; | |
1610 | CUData& cu = bestMode->cu; | |
1611 | ||
1612 | cu.copyFromPic(ctu, cuGeom); | |
1613 | m_quant.setQPforQuant(cu); | |
1614 | ||
1615 | if (cu.m_predMode[0] == MODE_INTRA) | |
1616 | { | |
1617 | uint32_t tuDepthRange[2]; | |
1618 | cu.getIntraTUQtDepthRange(tuDepthRange, 0); | |
1619 | ||
1620 | uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN; | |
1621 | residualTransformQuantIntra(*bestMode, cuGeom, initTrDepth, 0, tuDepthRange); | |
1622 | getBestIntraModeChroma(*bestMode, cuGeom); | |
1623 | residualQTIntraChroma(*bestMode, cuGeom, 0, 0); | |
1624 | } | |
1625 | else if (cu.m_predMode[0] == MODE_INTER) | |
1626 | { | |
1627 | X265_CHECK(!ctu.m_skipFlag[absPartIdx], "skip not expected prior to transform\n"); | |
1628 | ||
1629 | /* Calculate residual for current CU part into depth sized resiYuv */ | |
1630 | ||
1631 | ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv; | |
1632 | ||
1633 | /* at RD 0, the prediction pixels are accumulated into the top depth predYuv */ | |
1634 | Yuv& predYuv = m_modeDepth[0].bestMode->predYuv; | |
1635 | pixel* predY = predYuv.getLumaAddr(absPartIdx); | |
1636 | pixel* predU = predYuv.getCbAddr(absPartIdx); | |
1637 | pixel* predV = predYuv.getCrAddr(absPartIdx); | |
1638 | ||
1639 | primitives.luma_sub_ps[sizeIdx](resiYuv.m_buf[0], resiYuv.m_size, | |
1640 | fencYuv.getLumaAddr(absPartIdx), predY, | |
1641 | fencYuv.m_size, predYuv.m_size); | |
1642 | ||
1643 | primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[1], resiYuv.m_csize, | |
1644 | fencYuv.getCbAddr(absPartIdx), predU, | |
1645 | fencYuv.m_csize, predYuv.m_csize); | |
1646 | ||
1647 | primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[2], resiYuv.m_csize, | |
1648 | fencYuv.getCrAddr(absPartIdx), predV, | |
1649 | fencYuv.m_csize, predYuv.m_csize); | |
1650 | ||
1651 | uint32_t tuDepthRange[2]; | |
1652 | cu.getInterTUQtDepthRange(tuDepthRange, 0); | |
1653 | ||
1654 | residualTransformQuantInter(*bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange); | |
1655 | ||
1656 | if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0)) | |
1657 | cu.setSkipFlagSubParts(true); | |
1658 | ||
1659 | PicYuv& reconPicYuv = *m_frame->m_reconPicYuv; | |
1660 | if (cu.getQtRootCbf(0)) // TODO: split to each component | |
1661 | { | |
1662 | /* residualTransformQuantInter() wrote transformed residual back into | |
1663 | * resiYuv. Generate the recon pixels by adding it to the prediction */ | |
1664 | ||
1665 | primitives.luma_add_ps[sizeIdx](reconYuv.m_buf[0], reconYuv.m_size, | |
1666 | predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size); | |
1667 | primitives.chroma[m_csp].add_ps[sizeIdx](reconYuv.m_buf[1], reconYuv.m_csize, | |
1668 | predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize); | |
1669 | primitives.chroma[m_csp].add_ps[sizeIdx](reconYuv.m_buf[2], reconYuv.m_csize, | |
1670 | predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize); | |
1671 | ||
1672 | /* copy the reconstructed part to the recon pic for later intra | |
1673 | * predictions */ | |
1674 | reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cu.m_cuAddr, absPartIdx); | |
1675 | } | |
1676 | else | |
1677 | { | |
1678 | /* copy the prediction pixels to the recon pic for later intra | |
1679 | * predictions */ | |
1680 | ||
1681 | primitives.luma_copy_pp[sizeIdx](reconPicYuv.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_stride, | |
1682 | predY, predYuv.m_size); | |
1683 | primitives.chroma[m_csp].copy_pp[sizeIdx](reconPicYuv.getCbAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_strideC, | |
1684 | predU, predYuv.m_csize); | |
1685 | primitives.chroma[m_csp].copy_pp[sizeIdx](reconPicYuv.getCrAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_strideC, | |
1686 | predV, predYuv.m_csize); | |
1687 | } | |
1688 | } | |
1689 | /* else if (cu.m_predMode[0] == MODE_NONE) {} */ | |
1690 | ||
1691 | checkDQP(cu, cuGeom); | |
1692 | cu.updatePic(cuGeom.depth); | |
1693 | } | |
1694 | ||
1695 | /* check whether current try is the best with identifying the depth of current try */ | |
1696 | void Analysis::checkBestMode(Mode& mode, uint32_t depth) | |
1697 | { | |
1698 | ModeDepth& md = m_modeDepth[depth]; | |
1699 | if (md.bestMode) | |
1700 | { | |
1701 | if (mode.rdCost < md.bestMode->rdCost) | |
1702 | md.bestMode = &mode; | |
1703 | } | |
1704 | else | |
1705 | md.bestMode = &mode; | |
1706 | } | |
1707 | ||
1708 | void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth) | |
1709 | { | |
1710 | if (m_param->rdLevel >= 3) | |
1711 | { | |
1712 | /* code the split flag (0 or 1) and update bit costs */ | |
1713 | mode.contexts.resetBits(); | |
1714 | mode.contexts.codeSplitFlag(mode.cu, 0, depth); | |
1715 | uint32_t bits = mode.contexts.getNumberOfWrittenBits(); | |
1716 | mode.mvBits += bits; | |
1717 | mode.totalBits += bits; | |
1718 | updateModeCost(mode); | |
1719 | } | |
1720 | else if (m_param->rdLevel <= 1) | |
1721 | { | |
1722 | mode.sa8dBits++; | |
1723 | mode.sa8dCost = m_rdCost.calcRdSADCost(mode.distortion, mode.sa8dBits); | |
1724 | } | |
1725 | else | |
1726 | { | |
1727 | mode.mvBits++; | |
1728 | mode.totalBits++; | |
1729 | updateModeCost(mode); | |
1730 | } | |
1731 | } | |
1732 | ||
1733 | void Analysis::checkDQP(CUData& cu, const CUGeom& cuGeom) | |
1734 | { | |
1735 | if (m_slice->m_pps->bUseDQP && cuGeom.depth <= m_slice->m_pps->maxCuDQPDepth) | |
1736 | { | |
1737 | if (cu.m_cuDepth[0] > cuGeom.depth) // detect splits | |
1738 | { | |
1739 | bool hasResidual = false; | |
1740 | for (uint32_t absPartIdx = 0; absPartIdx < cu.m_numPartitions; absPartIdx++) | |
1741 | { | |
1742 | if (cu.getQtRootCbf(absPartIdx)) | |
1743 | { | |
1744 | hasResidual = true; | |
1745 | break; | |
1746 | } | |
1747 | } | |
1748 | if (hasResidual) | |
1749 | cu.setQPSubCUs(cu.getRefQP(0), 0, cuGeom.depth); | |
1750 | else | |
1751 | cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth); | |
1752 | } | |
1753 | else | |
1754 | { | |
1755 | if (!cu.getCbf(0, TEXT_LUMA, 0) && !cu.getCbf(0, TEXT_CHROMA_U, 0) && !cu.getCbf(0, TEXT_CHROMA_V, 0)) | |
1756 | cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth); | |
1757 | } | |
1758 | } | |
1759 | } | |
1760 | ||
1761 | uint32_t Analysis::topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom) | |
1762 | { | |
1763 | /* Do not attempt to code a block larger than the largest block in the | |
1764 | * co-located CTUs in L0 and L1 */ | |
1765 | int currentQP = parentCTU.m_qp[0]; | |
1766 | int previousQP = currentQP; | |
1767 | uint32_t minDepth0 = 4, minDepth1 = 4; | |
1768 | uint32_t sum = 0; | |
1769 | int numRefs = 0; | |
1770 | if (m_slice->m_numRefIdx[0]) | |
1771 | { | |
1772 | numRefs++; | |
1773 | const CUData& cu = *m_slice->m_refPicList[0][0]->m_encData->getPicCTU(parentCTU.m_cuAddr); | |
1774 | previousQP = cu.m_qp[0]; | |
1775 | if (!cu.m_cuDepth[cuGeom.encodeIdx]) | |
1776 | return 0; | |
1777 | for (uint32_t i = 0; i < cuGeom.numPartitions && minDepth0; i += 4) | |
1778 | { | |
1779 | uint32_t d = cu.m_cuDepth[cuGeom.encodeIdx + i]; | |
1780 | minDepth0 = X265_MIN(d, minDepth0); | |
1781 | sum += d; | |
1782 | } | |
1783 | } | |
1784 | if (m_slice->m_numRefIdx[1]) | |
1785 | { | |
1786 | numRefs++; | |
1787 | const CUData& cu = *m_slice->m_refPicList[1][0]->m_encData->getPicCTU(parentCTU.m_cuAddr); | |
1788 | if (!cu.m_cuDepth[cuGeom.encodeIdx]) | |
1789 | return 0; | |
1790 | for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4) | |
1791 | { | |
1792 | uint32_t d = cu.m_cuDepth[cuGeom.encodeIdx + i]; | |
1793 | minDepth1 = X265_MIN(d, minDepth1); | |
1794 | sum += d; | |
1795 | } | |
1796 | } | |
1797 | if (!numRefs) | |
1798 | return 0; | |
1799 | ||
1800 | uint32_t minDepth = X265_MIN(minDepth0, minDepth1); | |
1801 | uint32_t thresh = minDepth * numRefs * (cuGeom.numPartitions >> 2); | |
1802 | ||
1803 | /* allow block size growth if QP is raising or avg depth is | |
1804 | * less than 1.5 of min depth */ | |
1805 | if (minDepth && currentQP >= previousQP && (sum <= thresh + (thresh >> 1))) | |
1806 | minDepth -= 1; | |
1807 | ||
1808 | return minDepth; | |
1809 | } | |
1810 | ||
1811 | /* returns true if recursion should be stopped */ | |
1812 | bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode) | |
1813 | { | |
1814 | /* early exit when the RD cost of best mode at depth n is less than the sum | |
1815 | * of average of RD cost of the neighbor CU's(above, aboveleft, aboveright, | |
1816 | * left, colocated) and avg cost of that CU at depth "n" with weightage for | |
1817 | * each quantity */ | |
1818 | ||
1819 | uint32_t depth = cuGeom.depth; | |
1820 | FrameData& curEncData = const_cast<FrameData&>(*m_frame->m_encData); | |
1821 | FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr]; | |
1822 | uint64_t cuCost = cuStat.avgCost[depth] * cuStat.count[depth]; | |
1823 | uint64_t cuCount = cuStat.count[depth]; | |
1824 | ||
1825 | uint64_t neighCost = 0, neighCount = 0; | |
1826 | const CUData* above = parentCTU.m_cuAbove; | |
1827 | if (above) | |
1828 | { | |
1829 | FrameData::RCStatCU& astat = curEncData.m_cuStat[above->m_cuAddr]; | |
1830 | neighCost += astat.avgCost[depth] * astat.count[depth]; | |
1831 | neighCount += astat.count[depth]; | |
1832 | ||
1833 | const CUData* aboveLeft = parentCTU.m_cuAboveLeft; | |
1834 | if (aboveLeft) | |
1835 | { | |
1836 | FrameData::RCStatCU& lstat = curEncData.m_cuStat[aboveLeft->m_cuAddr]; | |
1837 | neighCost += lstat.avgCost[depth] * lstat.count[depth]; | |
1838 | neighCount += lstat.count[depth]; | |
1839 | } | |
1840 | ||
1841 | const CUData* aboveRight = parentCTU.m_cuAboveRight; | |
1842 | if (aboveRight) | |
1843 | { | |
1844 | FrameData::RCStatCU& rstat = curEncData.m_cuStat[aboveRight->m_cuAddr]; | |
1845 | neighCost += rstat.avgCost[depth] * rstat.count[depth]; | |
1846 | neighCount += rstat.count[depth]; | |
1847 | } | |
1848 | } | |
1849 | const CUData* left = parentCTU.m_cuLeft; | |
1850 | if (left) | |
1851 | { | |
1852 | FrameData::RCStatCU& nstat = curEncData.m_cuStat[left->m_cuAddr]; | |
1853 | neighCost += nstat.avgCost[depth] * nstat.count[depth]; | |
1854 | neighCount += nstat.count[depth]; | |
1855 | } | |
1856 | ||
1857 | // give 60% weight to all CU's and 40% weight to neighbour CU's | |
1858 | if (neighCost + cuCount) | |
1859 | { | |
1860 | uint64_t avgCost = ((3 * cuCost) + (2 * neighCost)) / ((3 * cuCount) + (2 * neighCount)); | |
1861 | uint64_t curCost = m_param->rdLevel > 1 ? bestMode.rdCost : bestMode.sa8dCost; | |
1862 | if (curCost < avgCost && avgCost) | |
1863 | return true; | |
1864 | } | |
1865 | ||
1866 | return false; | |
1867 | } |