/* deb_x265.git: source/encoder/analysis.cpp (Imported Upstream version 1.4+222+hg5f9f7194267b) */
/*****************************************************************************
 * Copyright (C) 2013 x265 project
 *
 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
 *          Steve Borho <steve@borho.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

#include "common.h"
#include "frame.h"
#include "framedata.h"
#include "picyuv.h"
#include "primitives.h"
#include "threading.h"

#include "analysis.h"
#include "rdcost.h"
#include "encoder.h"

using namespace x265;

/* An explanation of rate distortion levels (--rd-level)
 *
 * rd-level 0 generates no recon per CU (NO RDO or Quant)
 *
 *   sa8d selection between merge / skip / inter / intra and split
 *   no recon pixels generated until CTU analysis is complete, requiring
 *   intra predictions to use source pixels
 *
 * rd-level 1 uses RDO for merge and skip, sa8d for all else
 *
 *   RDO selection between merge and skip
 *   sa8d selection between (merge/skip) / inter modes / intra and split
 *   intra prediction uses reconstructed pixels
 *
 * rd-level 2 uses RDO for merge/skip and split
 *
 *   RDO selection between merge and skip
 *   sa8d selection between (merge/skip) / inter modes / intra
 *   RDO split decisions
 *
 * rd-level 3 uses RDO for merge/skip/best inter/intra
 *
 *   RDO selection between merge and skip
 *   sa8d selection of best inter mode
 *   sa8d decisions include chroma residual cost
 *   RDO selection between (merge/skip) / best inter mode / intra / split
 *
 * rd-level 4 enables RDOQuant
 *   chroma residual cost included in satd decisions, including subpel refine
 *   (as a result of --subme 3 being used by preset slow)
 *
 * rd-levels 5 and 6 do RDO for each inter mode
 */
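
/* Illustrative summary (not part of the build): how these rd-levels select
 * an analysis path. This condenses the dispatch in Analysis::compressCTU()
 * below; the function names are the real members of this class:
 *
 *     if (sliceType == I_SLICE)
 *         compressIntraCU(ctu, cuGeom, zOrder);
 *     else if (!rdLevel)                        // rd 0: recon deferred
 *         { compressInterCU_rd0_4(ctu, cuGeom); encodeResidue(ctu, cuGeom); }
 *     else if (bDistributeModeAnalysis && rdLevel >= 2)
 *         compressInterCU_dist(ctu, cuGeom);    // --pmode worker jobs
 *     else if (rdLevel <= 4)
 *         compressInterCU_rd0_4(ctu, cuGeom);
 *     else                                      // rd 5/6
 *         compressInterCU_rd5_6(ctu, cuGeom);
 */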

Analysis::Analysis()
{
    m_totalNumJobs = m_numAcquiredJobs = m_numCompletedJobs = 0;
    m_reuseIntraDataCTU = NULL;
    m_reuseInterDataCTU = NULL;
}

bool Analysis::create(ThreadLocalData *tld)
{
    m_tld = tld;
    m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2;
    m_bChromaSa8d = m_param->rdLevel >= 3;

    int csp = m_param->internalCsp;
    uint32_t cuSize = g_maxCUSize;

    bool ok = true;
    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++, cuSize >>= 1)
    {
        ModeDepth &md = m_modeDepth[depth];

        md.cuMemPool.create(depth, csp, MAX_PRED_TYPES);
        ok &= md.fencYuv.create(cuSize, csp);

        for (int j = 0; j < MAX_PRED_TYPES; j++)
        {
            md.pred[j].cu.initialize(md.cuMemPool, depth, csp, j);
            ok &= md.pred[j].predYuv.create(cuSize, csp);
            ok &= md.pred[j].reconYuv.create(cuSize, csp);
            md.pred[j].fencYuv = &md.fencYuv;
        }
    }

    return ok;
}

void Analysis::destroy()
{
    for (uint32_t i = 0; i <= g_maxCUDepth; i++)
    {
        m_modeDepth[i].cuMemPool.destroy();
        m_modeDepth[i].fencYuv.destroy();

        for (int j = 0; j < MAX_PRED_TYPES; j++)
        {
            m_modeDepth[i].pred[j].predYuv.destroy();
            m_modeDepth[i].pred[j].reconYuv.destroy();
        }
    }
}

Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
{
    m_slice = ctu.m_slice;
    m_frame = &frame;

    invalidateContexts(0);
    m_quant.setQPforQuant(ctu);
    m_rqt[0].cur.load(initialContext);
    m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);

    uint32_t numPartition = ctu.m_numPartitions;
    if (m_param->analysisMode)
    {
        m_reuseIntraDataCTU = (analysis_intra_data *)m_frame->m_analysisData.intraData;
        int numPredDir = m_slice->isInterP() ? 1 : 2;
        m_reuseInterDataCTU = (analysis_inter_data *)m_frame->m_analysisData.interData + ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir;
    }

    if (m_slice->m_sliceType == I_SLICE)
    {
        uint32_t zOrder = 0;
        compressIntraCU(ctu, cuGeom, zOrder);
        if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.intraData)
        {
            CUData *bestCU = &m_modeDepth[0].bestMode->cu;
            memcpy(&m_reuseIntraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
            memcpy(&m_reuseIntraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
            memcpy(&m_reuseIntraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
        }
    }
    else
    {
        if (!m_param->rdLevel)
        {
            /* In RD Level 0/1, copy source pixels into the reconstructed block so
             * they are available for intra predictions */
            m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);

            compressInterCU_rd0_4(ctu, cuGeom);

            /* generate residual for entire CTU at once and copy to reconPic */
            encodeResidue(ctu, cuGeom);
        }
        else if (m_param->bDistributeModeAnalysis && m_param->rdLevel >= 2)
            compressInterCU_dist(ctu, cuGeom);
        else if (m_param->rdLevel <= 4)
            compressInterCU_rd0_4(ctu, cuGeom);
        else
            compressInterCU_rd5_6(ctu, cuGeom);
    }

    return *m_modeDepth[0].bestMode;
}

void Analysis::tryLossless(const CUGeom& cuGeom)
{
    ModeDepth& md = m_modeDepth[cuGeom.depth];

    if (!md.bestMode->distortion)
        /* already lossless */
        return;
    else if (md.bestMode->cu.isIntra(0))
    {
        md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
        PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
        uint8_t* modes = md.pred[PRED_LOSSLESS].cu.m_lumaIntraDir;
        checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size, modes);
        checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
    }
    else
    {
        md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
        md.pred[PRED_LOSSLESS].predYuv.copyFromYuv(md.bestMode->predYuv);
        encodeResAndCalcRdInterCU(md.pred[PRED_LOSSLESS], cuGeom);
        checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
    }
}

void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t& zOrder)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);

    if (m_param->analysisMode == X265_ANALYSIS_LOAD)
    {
        uint8_t* reuseDepth = &m_reuseIntraDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
        uint8_t* reuseModes = &m_reuseIntraDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
        char* reusePartSizes = &m_reuseIntraDataCTU->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];

        if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.encodeIdx)
        {
            m_quant.setQPforQuant(parentCTU);

            PartSize size = (PartSize)reusePartSizes[zOrder];
            Mode& mode = size == SIZE_2Nx2N ? md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN];
            mode.cu.initSubCU(parentCTU, cuGeom);
            checkIntra(mode, cuGeom, size, &reuseModes[zOrder]);
            checkBestMode(mode, depth);

            if (m_bTryLossless)
                tryLossless(cuGeom);

            if (mightSplit)
                addSplitFlagCost(*md.bestMode, cuGeom.depth);

            // increment zOrder offset to point to next best depth in sharedDepth buffer
            zOrder += g_depthInc[g_maxCUDepth - 1][reuseDepth[zOrder]];
            mightSplit = false;
        }
    }
    else if (mightNotSplit)
    {
        m_quant.setQPforQuant(parentCTU);

        md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL);
        checkBestMode(md.pred[PRED_INTRA], depth);

        if (depth == g_maxCUDepth)
        {
            md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL);
            checkBestMode(md.pred[PRED_INTRA_NxN], depth);
        }

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    if (mightSplit)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                m_rqt[nextDepth].cur.load(*nextContext);
                compressIntraCU(parentCTU, childGeom, zOrder);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);
                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
            {
                /* record the depth of this non-present sub-CU */
                splitCU->setEmptyPart(childGeom, subPartIdx);
                zOrder += g_depthInc[g_maxCUDepth - 1][nextDepth];
            }
        }
        nextContext->store(splitPred->contexts);
        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            updateModeCost(*splitPred);
        checkBestMode(*splitPred, depth);
    }

    checkDQP(md.bestMode->cu, cuGeom);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT])
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.encodeIdx);
}

bool Analysis::findJob(int threadId)
{
    /* try to acquire a CU mode to analyze */
    m_pmodeLock.acquire();
    if (m_totalNumJobs > m_numAcquiredJobs)
    {
        int id = m_numAcquiredJobs++;
        m_pmodeLock.release();

        parallelModeAnalysis(threadId, id);

        m_pmodeLock.acquire();
        if (++m_numCompletedJobs == m_totalNumJobs)
            m_modeCompletionEvent.trigger();
        m_pmodeLock.release();
        return true;
    }
    else
        m_pmodeLock.release();

    m_meLock.acquire();
    if (m_totalNumME > m_numAcquiredME)
    {
        int id = m_numAcquiredME++;
        m_meLock.release();

        parallelME(threadId, id);

        m_meLock.acquire();
        if (++m_numCompletedME == m_totalNumME)
            m_meCompletionEvent.trigger();
        m_meLock.release();
        return true;
    }
    else
        m_meLock.release();

    return false;
}
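
/* Usage sketch (illustrative, derived from this file): pool worker threads
 * call findJob() with their thread id, while the master thread that queued
 * the jobs participates with threadId == -1 and then blocks on the
 * completion event, exactly as compressInterCU_dist() does below:
 *
 *     while (findJob(-1))
 *         ;
 *     ...
 *     m_modeCompletionEvent.wait();
 */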

void Analysis::parallelME(int threadId, int meId)
{
    Analysis* slave;

    if (threadId == -1)
        slave = this;
    else
    {
        slave = &m_tld[threadId].analysis;
        slave->setQP(*m_slice, m_rdCost.m_qp);
        slave->m_slice = m_slice;
        slave->m_frame = m_frame;

        slave->m_me.setSourcePU(*m_curInterMode->fencYuv, m_curInterMode->cu.m_cuAddr, m_curGeom->encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
        slave->prepMotionCompensation(m_curInterMode->cu, *m_curGeom, m_curPart);
    }

    if (meId < m_slice->m_numRefIdx[0])
        slave->singleMotionEstimation(*this, *m_curInterMode, *m_curGeom, m_curPart, 0, meId);
    else
        slave->singleMotionEstimation(*this, *m_curInterMode, *m_curGeom, m_curPart, 1, meId - m_slice->m_numRefIdx[0]);
}

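/* Job IDs handed to parallelModeAnalysis() by compressInterCU_dist(), as
 * summarized from the switch statements below: 0 = intra (skipped when
 * intra is not tried), 1 = 2Nx2N inter (plus bidir in B slices),
 * 2 = Nx2N, 3 = 2NxN, 4..7 = the four AMP shapes. */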
void Analysis::parallelModeAnalysis(int threadId, int jobId)
{
    Analysis* slave;

    if (threadId == -1)
        slave = this;
    else
    {
        slave = &m_tld[threadId].analysis;
        slave->m_slice = m_slice;
        slave->m_frame = m_frame;
        slave->setQP(*m_slice, m_rdCost.m_qp);
        slave->invalidateContexts(0);
    }

    ModeDepth& md = m_modeDepth[m_curGeom->depth];

    if (m_param->rdLevel <= 4)
    {
        switch (jobId)
        {
        case 0:
            if (slave != this)
                slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur);
            slave->checkIntraInInter(md.pred[PRED_INTRA], *m_curGeom);
            if (m_param->rdLevel > 2)
                slave->encodeIntraInInter(md.pred[PRED_INTRA], *m_curGeom);
            break;

        case 1:
            slave->checkInter_rd0_4(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N);
            if (m_slice->m_sliceType == B_SLICE)
                slave->checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], *m_curGeom);
            break;

        case 2:
            slave->checkInter_rd0_4(md.pred[PRED_Nx2N], *m_curGeom, SIZE_Nx2N);
            break;

        case 3:
            slave->checkInter_rd0_4(md.pred[PRED_2NxN], *m_curGeom, SIZE_2NxN);
            break;

        case 4:
            slave->checkInter_rd0_4(md.pred[PRED_2NxnU], *m_curGeom, SIZE_2NxnU);
            break;

        case 5:
            slave->checkInter_rd0_4(md.pred[PRED_2NxnD], *m_curGeom, SIZE_2NxnD);
            break;

        case 6:
            slave->checkInter_rd0_4(md.pred[PRED_nLx2N], *m_curGeom, SIZE_nLx2N);
            break;

        case 7:
            slave->checkInter_rd0_4(md.pred[PRED_nRx2N], *m_curGeom, SIZE_nRx2N);
            break;

        default:
            X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
            break;
        }
    }
    else
    {
        bool bMergeOnly = m_curGeom->log2CUSize == 6;
        if (slave != this)
        {
            slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur);
            slave->m_quant.setQPforQuant(md.pred[PRED_2Nx2N].cu);
        }

        switch (jobId)
        {
        case 0:
            slave->checkIntra(md.pred[PRED_INTRA], *m_curGeom, SIZE_2Nx2N, NULL);
            if (m_curGeom->depth == g_maxCUDepth && m_curGeom->log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                slave->checkIntra(md.pred[PRED_INTRA_NxN], *m_curGeom, SIZE_NxN, NULL);
            break;

        case 1:
            slave->checkInter_rd5_6(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N, false);
            md.pred[PRED_BIDIR].rdCost = MAX_INT64;
            if (m_slice->m_sliceType == B_SLICE)
            {
                slave->checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], *m_curGeom);
                if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
                    slave->encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], *m_curGeom);
            }
            break;

        case 2:
            slave->checkInter_rd5_6(md.pred[PRED_Nx2N], *m_curGeom, SIZE_Nx2N, false);
            break;

        case 3:
            slave->checkInter_rd5_6(md.pred[PRED_2NxN], *m_curGeom, SIZE_2NxN, false);
            break;

        case 4:
            slave->checkInter_rd5_6(md.pred[PRED_2NxnU], *m_curGeom, SIZE_2NxnU, bMergeOnly);
            break;

        case 5:
            slave->checkInter_rd5_6(md.pred[PRED_2NxnD], *m_curGeom, SIZE_2NxnD, bMergeOnly);
            break;

        case 6:
            slave->checkInter_rd5_6(md.pred[PRED_nLx2N], *m_curGeom, SIZE_nLx2N, bMergeOnly);
            break;

        case 7:
            slave->checkInter_rd5_6(md.pred[PRED_nRx2N], *m_curGeom, SIZE_nRx2N, bMergeOnly);
            break;

        default:
            X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
            break;
        }
    }
}

void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    uint32_t cuAddr = parentCTU.m_cuAddr;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
    uint32_t minDepth = m_param->rdLevel <= 4 ? topSkipMinDepth(parentCTU, cuGeom) : 0;

    X265_CHECK(m_param->rdLevel >= 2, "compressInterCU_dist does not support RD 0 or 1\n");

    if (mightNotSplit && depth >= minDepth)
    {
        int bTryAmp = m_slice->m_sps->maxAMPDepth > depth && (cuGeom.log2CUSize < 6 || m_param->rdLevel > 4);
        int bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;

        /* Initialize all prediction CUs based on parentCTU */
        md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
        if (m_param->bEnableRectInter)
        {
            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
        }
        if (bTryAmp)
        {
            md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
        }
        if (bTryIntra)
        {
            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
            if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
        }

        m_pmodeLock.acquire();
        m_totalNumJobs = 2 + m_param->bEnableRectInter * 2 + bTryAmp * 4;
        m_numAcquiredJobs = !bTryIntra;
        m_numCompletedJobs = m_numAcquiredJobs;
        m_curGeom = &cuGeom;
        m_bJobsQueued = true;
        JobProvider::enqueue();
        m_pmodeLock.release();

        for (int i = 0; i < m_totalNumJobs - m_numCompletedJobs; i++)
            m_pool->pokeIdleThread();

        /* participate in processing jobs, until all are distributed */
        while (findJob(-1))
            ;

        JobProvider::dequeue();
        m_bJobsQueued = false;

        /* the master worker thread (this one) does merge analysis. By doing
         * merge after all the other jobs are at least started, we usually avoid
         * blocking on another thread */

        if (m_param->rdLevel <= 4)
        {
            checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);

            m_modeCompletionEvent.wait();

            /* select best inter mode based on sa8d cost */
            Mode *bestInter = &md.pred[PRED_2Nx2N];

            if (m_param->bEnableRectInter)
            {
                if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_Nx2N];
                if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxN];
            }

            if (bTryAmp)
            {
                if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxnU];
                if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxnD];
                if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_nLx2N];
                if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_nRx2N];
            }

            if (m_param->rdLevel > 2)
            {
                /* RD selection between merge, inter, bidir and intra */
                if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
                {
                    for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
                    {
                        prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
                        motionCompensation(bestInter->predYuv, false, true);
                    }
                }
                encodeResAndCalcRdInterCU(*bestInter, cuGeom);
                checkBestMode(*bestInter, depth);

                /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
                    md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
                {
                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
                    checkBestMode(md.pred[PRED_BIDIR], depth);
                }

                if (bTryIntra)
                    checkBestMode(md.pred[PRED_INTRA], depth);
            }
            else /* m_param->rdLevel == 2 */
            {
                if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = bestInter;

                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = &md.pred[PRED_BIDIR];

                if (bTryIntra && md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
                {
                    md.bestMode = &md.pred[PRED_INTRA];
                    encodeIntraInInter(*md.bestMode, cuGeom);
                }
                else if (!md.bestMode->cu.m_mergeFlag[0])
                {
                    /* finally code the best mode selected from SA8D costs */
                    for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++)
                    {
                        prepMotionCompensation(md.bestMode->cu, cuGeom, puIdx);
                        motionCompensation(md.bestMode->predYuv, false, true);
                    }
                    encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
                }
            }
        }
        else
        {
            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
            m_modeCompletionEvent.wait();

            checkBestMode(md.pred[PRED_2Nx2N], depth);
            checkBestMode(md.pred[PRED_BIDIR], depth);

            if (m_param->bEnableRectInter)
            {
                checkBestMode(md.pred[PRED_Nx2N], depth);
                checkBestMode(md.pred[PRED_2NxN], depth);
            }

            if (bTryAmp)
            {
                checkBestMode(md.pred[PRED_2NxnU], depth);
                checkBestMode(md.pred[PRED_2NxnD], depth);
                checkBestMode(md.pred[PRED_nLx2N], depth);
                checkBestMode(md.pred[PRED_nRx2N], depth);
            }

            if (bTryIntra)
            {
                checkBestMode(md.pred[PRED_INTRA], depth);
                if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                    checkBestMode(md.pred[PRED_INTRA_NxN], depth);
            }
        }

        if (md.bestMode->rdCost == MAX_INT64 && !bTryIntra)
        {
            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
            checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
            encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
            checkBestMode(md.pred[PRED_INTRA], depth);
        }

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    bool bNoSplit = false;
    if (md.bestMode)
    {
        bNoSplit = md.bestMode->cu.isSkipped(0);
        if (mightSplit && depth && depth >= minDepth && !bNoSplit && m_param->rdLevel <= 4)
            bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
    }

    if (mightSplit && !bNoSplit)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                m_rqt[nextDepth].cur.load(*nextContext);
                compressInterCU_dist(parentCTU, childGeom);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);

                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
                splitCU->setEmptyPart(childGeom, subPartIdx);
        }
        nextContext->store(splitPred->contexts);

        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            updateModeCost(*splitPred);

        checkBestMode(*splitPred, depth);
    }

    if (mightNotSplit)
    {
        /* early-out statistics */
        FrameData& curEncData = *m_frame->m_encData;
        FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
        uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
        cuStat.count[depth] += 1;
        cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
    }

    checkDQP(md.bestMode->cu, cuGeom);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT])
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.encodeIdx);
}

void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    uint32_t cuAddr = parentCTU.m_cuAddr;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
    uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);

    if (mightNotSplit && depth >= minDepth)
    {
        bool bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;

        /* Compute Merge Cost */
        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
        checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);

        bool earlyskip = false;
        if (m_param->rdLevel)
            earlyskip = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth

        if (!earlyskip)
        {
            md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
            checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N);

            if (m_slice->m_sliceType == B_SLICE)
            {
                md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
                checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
            }

            Mode *bestInter = &md.pred[PRED_2Nx2N];
            if (m_param->bEnableRectInter)
            {
                md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
                checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N);
                if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_Nx2N];

                md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
                checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN);
                if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxN];
            }

            if (m_slice->m_sps->maxAMPDepth > depth && cuGeom.log2CUSize < 6)
            {
                bool bHor = false, bVer = false;
                if (bestInter->cu.m_partSize[0] == SIZE_2NxN)
                    bHor = true;
                else if (bestInter->cu.m_partSize[0] == SIZE_Nx2N)
                    bVer = true;
                else if (bestInter->cu.m_partSize[0] == SIZE_2Nx2N &&
                         md.bestMode && md.bestMode->cu.getQtRootCbf(0))
                {
                    bHor = true;
                    bVer = true;
                }

                if (bHor)
                {
                    md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU);
                    if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_2NxnU];

                    md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD);
                    if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_2NxnD];
                }
                if (bVer)
                {
                    md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N);
                    if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_nLx2N];

                    md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N);
                    if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_nRx2N];
                }
            }

            if (m_param->rdLevel >= 3)
            {
                /* Calculate RD cost of best inter option */
                if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
                {
                    for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
                    {
                        prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
                        motionCompensation(bestInter->predYuv, false, true);
                    }
                }
                encodeResAndCalcRdInterCU(*bestInter, cuGeom);
                checkBestMode(*bestInter, depth);

                /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
                    md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
                {
                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
                    checkBestMode(md.pred[PRED_BIDIR], depth);
                }

                if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) ||
                    md.bestMode->sa8dCost == MAX_INT64)
                {
                    md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
                    checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
                    encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
                    checkBestMode(md.pred[PRED_INTRA], depth);
                }
            }
            else
            {
                /* SA8D choice between merge/skip, inter, bidir, and intra */
                if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = bestInter;

                if (m_slice->m_sliceType == B_SLICE &&
                    md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = &md.pred[PRED_BIDIR];

                if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64)
                {
                    md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
                    checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
                    if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
                        md.bestMode = &md.pred[PRED_INTRA];
                }

                /* finally code the best mode selected by SA8D costs:
                 * RD level 2 - fully encode the best mode
                 * RD level 1 - generate recon pixels
                 * RD level 0 - generate chroma prediction */
                if (md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)
                {
                    /* prediction already generated for this CU, and if rd level
                     * is not 0, it is already fully encoded */
                }
                else if (md.bestMode->cu.isInter(0))
                {
                    for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++)
                    {
                        prepMotionCompensation(md.bestMode->cu, cuGeom, puIdx);
                        motionCompensation(md.bestMode->predYuv, false, true);
                    }
                    if (m_param->rdLevel == 2)
                        encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
                    else if (m_param->rdLevel == 1)
                    {
                        /* generate recon pixels with no rate distortion considerations */
                        CUData& cu = md.bestMode->cu;
                        m_quant.setQPforQuant(cu);

                        uint32_t tuDepthRange[2];
                        cu.getInterTUQtDepthRange(tuDepthRange, 0);

                        m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize);
                        residualTransformQuantInter(*md.bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange);
                        if (cu.getQtRootCbf(0))
                            md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]);
                        else
                        {
                            md.bestMode->reconYuv.copyFromYuv(md.bestMode->predYuv);
                            if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
                                cu.setPredModeSubParts(MODE_SKIP);
                        }
                    }
                }
                else
                {
                    if (m_param->rdLevel == 2)
                        encodeIntraInInter(*md.bestMode, cuGeom);
                    else if (m_param->rdLevel == 1)
                    {
                        /* generate recon pixels with no rate distortion considerations */
                        CUData& cu = md.bestMode->cu;
                        m_quant.setQPforQuant(cu);

                        uint32_t tuDepthRange[2];
                        cu.getIntraTUQtDepthRange(tuDepthRange, 0);

                        uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
                        residualTransformQuantIntra(*md.bestMode, cuGeom, initTuDepth, 0, tuDepthRange);
                        getBestIntraModeChroma(*md.bestMode, cuGeom);
                        residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
                        md.bestMode->reconYuv.copyFromPicYuv(*m_frame->m_reconPic, cu.m_cuAddr, cuGeom.encodeIdx); // TODO:
                    }
                }
            }
        } // !earlyskip

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    bool bNoSplit = false;
    if (md.bestMode)
    {
        bNoSplit = md.bestMode->cu.isSkipped(0);
        if (mightSplit && depth && depth >= minDepth && !bNoSplit)
            bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
    }

    if (mightSplit && !bNoSplit)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                m_rqt[nextDepth].cur.load(*nextContext);
                compressInterCU_rd0_4(parentCTU, childGeom);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);

                if (m_param->rdLevel)
                    nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                else
                    nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childGeom.numPartitions * subPartIdx);
                if (m_param->rdLevel > 1)
                    nextContext = &nd.bestMode->contexts;
            }
            else
                splitCU->setEmptyPart(childGeom, subPartIdx);
        }
        nextContext->store(splitPred->contexts);

        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else if (m_param->rdLevel > 1)
            updateModeCost(*splitPred);
        else
            splitPred->sa8dCost = m_rdCost.calcRdSADCost(splitPred->distortion, splitPred->sa8dBits);

        if (!md.bestMode)
            md.bestMode = splitPred;
        else if (m_param->rdLevel > 1)
            checkBestMode(*splitPred, cuGeom.depth);
        else if (splitPred->sa8dCost < md.bestMode->sa8dCost)
            md.bestMode = splitPred;
    }

    if (mightNotSplit)
    {
        /* early-out statistics */
        FrameData& curEncData = *m_frame->m_encData;
        FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
        uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
        cuStat.count[depth] += 1;
        cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
    }

    checkDQP(md.bestMode->cu, cuGeom);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT] && m_param->rdLevel)
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.encodeIdx);
}

void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);

    if (mightNotSplit)
    {
        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
        checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
        bool earlySkip = m_param->bEnableEarlySkip && md.bestMode && !md.bestMode->cu.getQtRootCbf(0);

        if (!earlySkip)
        {
            md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
            checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, false);
            checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);

            if (m_slice->m_sliceType == B_SLICE)
            {
                md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
                checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
                if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
                {
                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
                    checkBestMode(md.pred[PRED_BIDIR], cuGeom.depth);
                }
            }

            if (m_param->bEnableRectInter)
            {
                if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                {
                    md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, false);
                    checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);
                }
                if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                {
                    md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, false);
                    checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
                }
            }

            // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N)
            if (m_slice->m_sps->maxAMPDepth > depth)
            {
                bool bMergeOnly = cuGeom.log2CUSize == 6;

                bool bHor = false, bVer = false;
                if (md.bestMode->cu.m_partSize[0] == SIZE_2NxN)
                    bHor = true;
                else if (md.bestMode->cu.m_partSize[0] == SIZE_Nx2N)
                    bVer = true;
                else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0] && !md.bestMode->cu.isSkipped(0))
                {
                    bHor = true;
                    bVer = true;
                }

                if (bHor)
                {
                    if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                    {
                        md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
                        checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, bMergeOnly);
                        checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);
                    }
                    if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                    {
                        md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
                        checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, bMergeOnly);
                        checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
                    }
                }
                if (bVer)
                {
                    if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                    {
                        md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
                        checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, bMergeOnly);
                        checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);
                    }
                    if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                    {
                        md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
                        checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, bMergeOnly);
                        checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
                    }
                }
            }

            if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) &&
                (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)))
            {
                md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
                checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL);
                checkBestMode(md.pred[PRED_INTRA], depth);

                if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                {
                    md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
                    checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL);
                    checkBestMode(md.pred[PRED_INTRA_NxN], depth);
                }
            }
        }

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    // estimate split cost
    if (mightSplit && (!md.bestMode || !md.bestMode->cu.isSkipped(0)))
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                m_rqt[nextDepth].cur.load(*nextContext);
                compressInterCU_rd5_6(parentCTU, childGeom);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);
                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
                splitCU->setEmptyPart(childGeom, subPartIdx);
        }
        nextContext->store(splitPred->contexts);
        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            updateModeCost(*splitPred);

        checkBestMode(*splitPred, depth);
    }

    checkDQP(md.bestMode->cu, cuGeom);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT])
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.encodeIdx);
}

/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    Yuv *fencYuv = &md.fencYuv;

    /* Note that these two Mode instances are named MERGE and SKIP but they may
     * hold the reverse when the function returns. We toggle between the two modes */
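    /* Illustrative sketch of the toggle idiom used in the candidate loop
     * below: each candidate is predicted into *tempPred, and when it beats
     * the current best the two pointers are exchanged, so the loser becomes
     * the scratch buffer for the next candidate:
     *
     *     if (tempPred->sa8dCost < bestPred->sa8dCost)
     *         std::swap(tempPred, bestPred);
     */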
1213 Mode* tempPred = &merge;
1214 Mode* bestPred = &skip;
1215
1216 X265_CHECK(m_slice->m_sliceType != I_SLICE, "Evaluating merge in I slice\n");
1217
1218 tempPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
1219 tempPred->cu.setPredModeSubParts(MODE_INTER);
1220 tempPred->cu.m_mergeFlag[0] = true;
1221
1222 bestPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
1223 bestPred->cu.setPredModeSubParts(MODE_INTER);
1224 bestPred->cu.m_mergeFlag[0] = true;
1225
1226 MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
1227 uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS];
1228 uint32_t maxNumMergeCand = tempPred->cu.getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours);
1229
1230 bestPred->sa8dCost = MAX_INT64;
1231 int bestSadCand = -1;
1232 int cpart, sizeIdx = cuGeom.log2CUSize - 2;
1233 if (m_bChromaSa8d)
1234 {
1235 int cuSize = 1 << cuGeom.log2CUSize;
1236 cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
1237 }
1238 for (uint32_t i = 0; i < maxNumMergeCand; ++i)
1239 {
1240 if (m_bFrameParallel &&
1241 (mvFieldNeighbours[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
1242 mvFieldNeighbours[i][1].mv.y >= (m_param->searchRange + 1) * 4))
1243 continue;
1244
1245 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx
1246 tempPred->cu.m_interDir[0] = interDirNeighbours[i];
1247 tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
1248 tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
1249 tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
1250 tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
1251
1252 prepMotionCompensation(tempPred->cu, cuGeom, 0);
1253 motionCompensation(tempPred->predYuv, true, m_bChromaSa8d);
1254
1255 tempPred->sa8dBits = getTUBits(i, maxNumMergeCand);
1256 tempPred->distortion = primitives.sa8d[sizeIdx](fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
1257 if (m_bChromaSa8d)
1258 {
1259 tempPred->distortion += primitives.sa8d_inter[cpart](fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
1260 tempPred->distortion += primitives.sa8d_inter[cpart](fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
1261 }
1262 tempPred->sa8dCost = m_rdCost.calcRdSADCost(tempPred->distortion, tempPred->sa8dBits);
1263
1264 if (tempPred->sa8dCost < bestPred->sa8dCost)
1265 {
1266 bestSadCand = i;
1267 std::swap(tempPred, bestPred);
1268 }
1269 }
1270
1271 /* force mode decision to take inter or intra */
1272 if (bestSadCand < 0)
1273 return;
1274
1275 /* calculate the motion compensation for chroma for the best mode selected */
1276 if (!m_bChromaSa8d) /* Chroma MC was done above */
1277 {
1278 prepMotionCompensation(bestPred->cu, cuGeom, 0);
1279 motionCompensation(bestPred->predYuv, false, true);
1280 }
1281
1282 if (m_param->rdLevel)
1283 {
1284 if (m_param->bLossless)
1285 bestPred->rdCost = MAX_INT64;
1286 else
1287 encodeResAndCalcRdSkipCU(*bestPred);
1288
1289 /* Encode with residual */
1290 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand;
1291 tempPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0);
1292 tempPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0);
1293 tempPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
1294 tempPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0);
1295 tempPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
1296 tempPred->sa8dCost = bestPred->sa8dCost;
1297 tempPred->predYuv.copyFromYuv(bestPred->predYuv);
1298
1299 encodeResAndCalcRdInterCU(*tempPred, cuGeom);
1300
1301 md.bestMode = tempPred->rdCost < bestPred->rdCost ? tempPred : bestPred;
1302 }
1303 else
1304 md.bestMode = bestPred;
1305
1306 /* broadcast sets of MV field data */
1307 bestPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0);
1308 bestPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0);
1309 bestPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
1310 bestPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0);
1311 bestPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
1312 }
1313
1314 /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
1315 void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom)
1316 {
1317 uint32_t depth = cuGeom.depth;
1318
1319 /* Note that these two Mode instances are named MERGE and SKIP but they may
1320 * hold the reverse when the function returns. We toggle between the two modes */
1321 Mode* tempPred = &merge;
1322 Mode* bestPred = &skip;
1323
1324 merge.cu.setPredModeSubParts(MODE_INTER);
1325 merge.cu.setPartSizeSubParts(SIZE_2Nx2N);
1326 merge.cu.m_mergeFlag[0] = true;
1327
1328 skip.cu.setPredModeSubParts(MODE_INTER);
1329 skip.cu.setPartSizeSubParts(SIZE_2Nx2N);
1330 skip.cu.m_mergeFlag[0] = true;
1331
1332 MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
1333 uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS];
1334 uint32_t maxNumMergeCand = merge.cu.getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours);
1335
1336 bool foundCbf0Merge = false;
1337 bool triedPZero = false, triedBZero = false;
1338 bestPred->rdCost = MAX_INT64;
1339 for (uint32_t i = 0; i < maxNumMergeCand; i++)
1340 {
1341 if (m_bFrameParallel &&
1342 (mvFieldNeighbours[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
1343 mvFieldNeighbours[i][1].mv.y >= (m_param->searchRange + 1) * 4))
1344 continue;
1345
1346 /* the merge candidate list is packed with MV(0,0) ref 0 when it is not full */
1347 if (interDirNeighbours[i] == 1 && !mvFieldNeighbours[i][0].mv.word && !mvFieldNeighbours[i][0].refIdx)
1348 {
1349 if (triedPZero)
1350 continue;
1351 triedPZero = true;
1352 }
1353 else if (interDirNeighbours[i] == 3 &&
1354 !mvFieldNeighbours[i][0].mv.word && !mvFieldNeighbours[i][0].refIdx &&
1355 !mvFieldNeighbours[i][1].mv.word && !mvFieldNeighbours[i][1].refIdx)
1356 {
1357 if (triedBZero)
1358 continue;
1359 triedBZero = true;
1360 }
1361
1362 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; /* merge candidate ID is stored in L0 MVP idx */
1363 tempPred->cu.m_interDir[0] = interDirNeighbours[i];
1364 tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
1365 tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
1366 tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
1367 tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
1368 tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */
1369
1370 prepMotionCompensation(tempPred->cu, cuGeom, 0);
1371 motionCompensation(tempPred->predYuv, true, true);
1372
1373 uint8_t hasCbf = true;
1374 bool swapped = false;
1375 if (!foundCbf0Merge)
1376 {
1377 /* if the best prediction has CBF (not a skip) then try merge with residual */
1378
1379 encodeResAndCalcRdInterCU(*tempPred, cuGeom);
1380 hasCbf = tempPred->cu.getQtRootCbf(0);
1381 foundCbf0Merge = !hasCbf;
1382
1383 if (tempPred->rdCost < bestPred->rdCost)
1384 {
1385 std::swap(tempPred, bestPred);
1386 swapped = true;
1387 }
1388 }
1389 if (!m_param->bLossless && hasCbf)
1390 {
1391 /* try merge without residual (skip), if not lossless coding */
1392
1393 if (swapped)
1394 {
1395 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;
1396 tempPred->cu.m_interDir[0] = interDirNeighbours[i];
1397 tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
1398 tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
1399 tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
1400 tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
1401 tempPred->cu.setPredModeSubParts(MODE_INTER);
1402 tempPred->predYuv.copyFromYuv(bestPred->predYuv);
1403 }
1404
1405 encodeResAndCalcRdSkipCU(*tempPred);
1406
1407 if (tempPred->rdCost < bestPred->rdCost)
1408 std::swap(tempPred, bestPred);
1409 }
1410 }
1411
1412 if (bestPred->rdCost < MAX_INT64)
1413 {
1414 m_modeDepth[depth].bestMode = bestPred;
1415
1416 /* broadcast sets of MV field data */
1417 uint32_t bestCand = bestPred->cu.m_mvpIdx[0][0];
1418 bestPred->cu.setPUInterDir(interDirNeighbours[bestCand], 0, 0);
1419 bestPred->cu.setPUMv(0, mvFieldNeighbours[bestCand][0].mv, 0, 0);
1420 bestPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestCand][0].refIdx, 0, 0);
1421 bestPred->cu.setPUMv(1, mvFieldNeighbours[bestCand][1].mv, 0, 0);
1422 bestPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestCand][1].refIdx, 0, 0);
1423 }
1424 }
1425
1426 void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize)
1427 {
1428 interMode.initCosts();
1429 interMode.cu.setPartSizeSubParts(partSize);
1430 interMode.cu.setPredModeSubParts(MODE_INTER);
1431 int numPredDir = m_slice->isInterP() ? 1 : 2;
1432
1433 if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU)
1434 {
1435 for (uint32_t part = 0; part < interMode.cu.getNumPartInter(); part++)
1436 {
1437 MotionData* bestME = interMode.bestME[part];
1438 for (int32_t i = 0; i < numPredDir; i++)
1439 {
1440 bestME[i].ref = m_reuseInterDataCTU->ref;
1441 m_reuseInterDataCTU++;
1442 }
1443 }
1444 }
1445 if (predInterSearch(interMode, cuGeom, false, m_bChromaSa8d))
1446 {
1447 /* predInterSearch sets interMode.sa8dBits */
1448 const Yuv& fencYuv = *interMode.fencYuv;
1449 Yuv& predYuv = interMode.predYuv;
1450 int part = partitionFromLog2Size(cuGeom.log2CUSize);
1451 interMode.distortion = primitives.sa8d[part](fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
1452 if (m_bChromaSa8d)
1453 {
1454 uint32_t cuSize = 1 << cuGeom.log2CUSize;
1455 int cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
1456 interMode.distortion += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
1457 interMode.distortion += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
1458 }
1459 interMode.sa8dCost = m_rdCost.calcRdSADCost(interMode.distortion, interMode.sa8dBits);
1460
1461 if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
1462 {
1463 for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
1464 {
1465 MotionData* bestME = interMode.bestME[puIdx];
1466 for (int32_t i = 0; i < numPredDir; i++)
1467 {
1468 m_reuseInterDataCTU->ref = bestME[i].ref;
1469 m_reuseInterDataCTU++;
1470 }
1471 }
1472 }
1473 }
1474 else
1475 {
1476 interMode.distortion = MAX_UINT;
1477 interMode.sa8dCost = MAX_INT64;
1478 }
1479 }
1480
1481 void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, bool bMergeOnly)
1482 {
1483 interMode.initCosts();
1484 interMode.cu.setPartSizeSubParts(partSize);
1485 interMode.cu.setPredModeSubParts(MODE_INTER);
1486 int numPredDir = m_slice->isInterP() ? 1 : 2;
1487
1488 if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU)
1489 {
1490 for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
1491 {
1492 MotionData* bestME = interMode.bestME[puIdx];
1493 for (int32_t i = 0; i < numPredDir; i++)
1494 {
1495 bestME[i].ref = m_reuseInterDataCTU->ref;
1496 m_reuseInterDataCTU++;
1497 }
1498 }
1499 }
1500 if (predInterSearch(interMode, cuGeom, bMergeOnly, true))
1501 {
1502 /* predInterSearch sets interMode.sa8dBits, but this is ignored */
1503 encodeResAndCalcRdInterCU(interMode, cuGeom);
1504
1505 if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
1506 {
1507 for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
1508 {
1509 MotionData* bestME = interMode.bestME[puIdx];
1510 for (int32_t i = 0; i < numPredDir; i++)
1511 {
1512 m_reuseInterDataCTU->ref = bestME[i].ref;
1513 m_reuseInterDataCTU++;
1514 }
1515 }
1516 }
1517 }
1518 else
1519 {
1520 interMode.distortion = MAX_UINT;
1521 interMode.rdCost = MAX_INT64;
1522 }
1523 }
1524
1525 void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom)
1526 {
1527 CUData& cu = bidir2Nx2N.cu;
1528
1529 if (cu.isBipredRestriction() || inter2Nx2N.bestME[0][0].cost == MAX_UINT || inter2Nx2N.bestME[0][1].cost == MAX_UINT)
1530 {
1531 bidir2Nx2N.sa8dCost = MAX_INT64;
1532 bidir2Nx2N.rdCost = MAX_INT64;
1533 return;
1534 }
1535
1536 const Yuv& fencYuv = *bidir2Nx2N.fencYuv;
1537 MV mvzero(0, 0);
1538 int cpart, partEnum = cuGeom.log2CUSize - 2;
1539
1540 if (m_bChromaSa8d)
1541 {
1542 int cuSize = 1 << cuGeom.log2CUSize;
1543 cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
1544 }
1545
1546 bidir2Nx2N.bestME[0][0] = inter2Nx2N.bestME[0][0];
1547 bidir2Nx2N.bestME[0][1] = inter2Nx2N.bestME[0][1];
1548 MotionData* bestME = bidir2Nx2N.bestME[0];
1549 int ref0 = bestME[0].ref;
1550 MV mvp0 = bestME[0].mvp;
1551 int mvpIdx0 = bestME[0].mvpIdx;
1552 int ref1 = bestME[1].ref;
1553 MV mvp1 = bestME[1].mvp;
1554 int mvpIdx1 = bestME[1].mvpIdx;
1555
1556 bidir2Nx2N.initCosts();
1557 cu.setPartSizeSubParts(SIZE_2Nx2N);
1558 cu.setPredModeSubParts(MODE_INTER);
1559 cu.setPUInterDir(3, 0, 0);
1560 cu.setPURefIdx(0, (int8_t)ref0, 0, 0);
1561 cu.setPURefIdx(1, (int8_t)ref1, 0, 0);
1562 cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;
1563 cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
1564 cu.m_mergeFlag[0] = 0;
1565
1566 /* Estimate cost of BIDIR using best 2Nx2N L0 and L1 motion vectors */
1567 cu.setPUMv(0, bestME[0].mv, 0, 0);
1568 cu.m_mvd[0][0] = bestME[0].mv - mvp0;
1569
1570 cu.setPUMv(1, bestME[1].mv, 0, 0);
1571 cu.m_mvd[1][0] = bestME[1].mv - mvp1;
1572
1573 prepMotionCompensation(cu, cuGeom, 0);
1574 motionCompensation(bidir2Nx2N.predYuv, true, m_bChromaSa8d);
1575
1576 int sa8d = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size);
1577 if (m_bChromaSa8d)
1578 {
1579 /* Add in chroma distortion */
1580 sa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[1], bidir2Nx2N.predYuv.m_csize);
1581 sa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[2], bidir2Nx2N.predYuv.m_csize);
1582 }
1583 bidir2Nx2N.sa8dBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
1584 bidir2Nx2N.sa8dCost = sa8d + m_rdCost.getCost(bidir2Nx2N.sa8dBits);
1585
1586 bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
1587 if (bTryZero)
1588 {
1589 /* Do not try zero MV if unidir motion predictors are beyond
1590 * valid search area */
1591 MV mvmin, mvmax;
1592 int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
1593 setSearchRange(cu, mvzero, merange, mvmin, mvmax);
1594 mvmax.y += 2; // there is some pad for subpel refine
1595 mvmin <<= 2;
1596 mvmax <<= 2;
1597
1598 bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
1599 bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
1600 }
1601 if (bTryZero)
1602 {
1603 /* Estimate cost of BIDIR using coincident blocks */
1604 Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
1605
1606 int zsa8d;
1607
1608 if (m_bChromaSa8d)
1609 {
1610 cu.m_mv[0][0] = mvzero;
1611 cu.m_mv[1][0] = mvzero;
1612
1613 prepMotionCompensation(cu, cuGeom, 0);
1614 motionCompensation(tmpPredYuv, true, true);
1615
1616 zsa8d = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
1617 zsa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize);
1618 zsa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, tmpPredYuv.m_buf[2], tmpPredYuv.m_csize);
1619 }
1620 else
1621 {
            pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx);
            pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx);
            intptr_t refStride = m_slice->m_mref[0][0].lumaStride;

            primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
            zsa8d = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
        }

        uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
        uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
        uint32_t zcost = zsa8d + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
        /* refine MVP selection for the zero MV; updates mvp, mvpIdx, bits and cost */
        checkBestMVP(inter2Nx2N.amvpCand[0][ref0], mvzero, mvp0, mvpIdx0, bits0, zcost);
        checkBestMVP(inter2Nx2N.amvpCand[1][ref1], mvzero, mvp1, mvpIdx1, bits1, zcost);

        uint32_t zbits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
        zcost = zsa8d + m_rdCost.getCost(zbits);

        if (zcost < bidir2Nx2N.sa8dCost)
        {
            bidir2Nx2N.sa8dBits = zbits;
            bidir2Nx2N.sa8dCost = zcost;

            cu.setPUMv(0, mvzero, 0, 0);
            cu.m_mvd[0][0] = mvzero - mvp0;
            cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;

            cu.setPUMv(1, mvzero, 0, 0);
            cu.m_mvd[1][0] = mvzero - mvp1;
            cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;

            if (m_bChromaSa8d)
                /* real MC was already performed */
                bidir2Nx2N.predYuv.copyFromYuv(tmpPredYuv);
            else
            {
                prepMotionCompensation(cu, cuGeom, 0);
                motionCompensation(bidir2Nx2N.predYuv, true, true);
            }
        }
        else if (m_bChromaSa8d)
        {
            /* recover overwritten motion vectors */
            cu.m_mv[0][0] = bestME[0].mv;
            cu.m_mv[1][0] = bestME[1].mv;
        }
    }
}
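
/* Used at rd-level 0, where analysis made its decisions from sa8d costs alone
 * and generated no reconstruction; this pass re-walks the chosen CTU
 * partitioning and performs the actual transform, quant and recon */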
void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
{
    if (cuGeom.depth < ctu.m_cuDepth[cuGeom.encodeIdx] && cuGeom.depth < g_maxCUDepth)
    {
        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
                encodeResidue(ctu, childGeom);
        }
        return;
    }
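
    /* past this point cuGeom matches the depth at which this CU was coded */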
    uint32_t absPartIdx = cuGeom.encodeIdx;
    int sizeIdx = cuGeom.log2CUSize - 2;

    /* reuse the bestMode data structures at the current depth */
    Mode *bestMode = m_modeDepth[cuGeom.depth].bestMode;
    CUData& cu = bestMode->cu;

    cu.copyFromPic(ctu, cuGeom);
    m_quant.setQPforQuant(cu);

    Yuv& fencYuv = m_modeDepth[cuGeom.depth].fencYuv;
    if (cuGeom.depth)
        m_modeDepth[0].fencYuv.copyPartToYuv(fencYuv, absPartIdx);
    X265_CHECK(bestMode->fencYuv == &fencYuv, "invalid fencYuv\n");

    if (cu.isIntra(0))
    {
        uint32_t tuDepthRange[2];
        cu.getIntraTUQtDepthRange(tuDepthRange, 0);

        uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
        residualTransformQuantIntra(*bestMode, cuGeom, initTuDepth, 0, tuDepthRange);
        getBestIntraModeChroma(*bestMode, cuGeom);
        residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
    }
    else // if (cu.isInter(0))
    {
        X265_CHECK(!ctu.isSkipped(absPartIdx), "skip not expected prior to transform\n");

        /* Calculate residual for current CU part into depth sized resiYuv */

        ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;

        /* at RD 0, the prediction pixels are accumulated into the top depth predYuv */
        Yuv& predYuv = m_modeDepth[0].bestMode->predYuv;
        pixel* predY = predYuv.getLumaAddr(absPartIdx);
        pixel* predU = predYuv.getCbAddr(absPartIdx);
        pixel* predV = predYuv.getCrAddr(absPartIdx);

        primitives.luma_sub_ps[sizeIdx](resiYuv.m_buf[0], resiYuv.m_size,
                                        fencYuv.m_buf[0], predY,
                                        fencYuv.m_size, predYuv.m_size);

        primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[1], resiYuv.m_csize,
                                                 fencYuv.m_buf[1], predU,
                                                 fencYuv.m_csize, predYuv.m_csize);

        primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[2], resiYuv.m_csize,
                                                 fencYuv.m_buf[2], predV,
                                                 fencYuv.m_csize, predYuv.m_csize);

        uint32_t tuDepthRange[2];
        cu.getInterTUQtDepthRange(tuDepthRange, 0);

        residualTransformQuantInter(*bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange);
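
        /* a 2Nx2N merge CU with no coded residual is signalled as a skip */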
        if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
            cu.setPredModeSubParts(MODE_SKIP);

        /* residualTransformQuantInter() wrote transformed residual back into
         * resiYuv. Generate the recon pixels by adding it to the prediction */

        PicYuv& reconPic = *m_frame->m_reconPic;
        if (cu.m_cbf[0][0])
            primitives.luma_add_ps[sizeIdx](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
                                            predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
        else
            primitives.luma_copy_pp[sizeIdx](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
                                             predY, predYuv.m_size);

        if (cu.m_cbf[1][0])
            primitives.chroma[m_csp].add_ps[sizeIdx](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                     predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
        else
            primitives.chroma[m_csp].copy_pp[sizeIdx](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                      predU, predYuv.m_csize);

        if (cu.m_cbf[2][0])
            primitives.chroma[m_csp].add_ps[sizeIdx](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                     predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
        else
            primitives.chroma[m_csp].copy_pp[sizeIdx](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                      predV, predYuv.m_csize);
    }

    checkDQP(cu, cuGeom);
    cu.updatePic(cuGeom.depth);
}
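
/* Add the cost of the CU split flag to the mode's totals: full CABAC bit
 * estimation at rd-levels 3 and above, a one-bit approximation otherwise */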
void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth)
{
    if (m_param->rdLevel >= 3)
    {
        /* code the split flag (0 or 1) and update bit costs */
        mode.contexts.resetBits();
        mode.contexts.codeSplitFlag(mode.cu, 0, depth);
        uint32_t bits = mode.contexts.getNumberOfWrittenBits();
        mode.mvBits += bits;
        mode.totalBits += bits;
        updateModeCost(mode);
    }
    else if (m_param->rdLevel <= 1)
    {
        mode.sa8dBits++;
        mode.sa8dCost = m_rdCost.calcRdSADCost(mode.distortion, mode.sa8dBits);
    }
    else
    {
        mode.mvBits++;
        mode.totalBits++;
        updateModeCost(mode);
    }
}
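
/* A CU that codes no residual signals no delta-QP, so its QP must be reset to
 * the reference QP that a decoder would infer */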
void Analysis::checkDQP(CUData& cu, const CUGeom& cuGeom)
{
    if (m_slice->m_pps->bUseDQP && cuGeom.depth <= m_slice->m_pps->maxCuDQPDepth)
    {
        if (cu.m_cuDepth[0] > cuGeom.depth) // detect splits
        {
            bool hasResidual = false;
            for (uint32_t absPartIdx = 0; absPartIdx < cu.m_numPartitions; absPartIdx++)
            {
                if (cu.getQtRootCbf(absPartIdx))
                {
                    hasResidual = true;
                    break;
                }
            }
            if (hasResidual)
                cu.setQPSubCUs(cu.getRefQP(0), 0, cuGeom.depth);
            else
                cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
        }
        else
        {
            if (!cu.getCbf(0, TEXT_LUMA, 0) && !cu.getCbf(0, TEXT_CHROMA_U, 0) && !cu.getCbf(0, TEXT_CHROMA_V, 0))
                cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
        }
    }
}

uint32_t Analysis::topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom)
{
    /* Do not attempt to code a block larger than the largest block in the
     * co-located CTUs in L0 and L1 */
    int currentQP = parentCTU.m_qp[0];
    int previousQP = currentQP;
    uint32_t minDepth0 = 4, minDepth1 = 4;
    uint32_t sum = 0;
    int numRefs = 0;
    if (m_slice->m_numRefIdx[0])
    {
        numRefs++;
        const CUData& cu = *m_slice->m_refPicList[0][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
        previousQP = cu.m_qp[0];
        if (!cu.m_cuDepth[cuGeom.encodeIdx])
            return 0;
        for (uint32_t i = 0; i < cuGeom.numPartitions && minDepth0; i += 4)
        {
            uint32_t d = cu.m_cuDepth[cuGeom.encodeIdx + i];
            minDepth0 = X265_MIN(d, minDepth0);
            sum += d;
        }
    }
    if (m_slice->m_numRefIdx[1])
    {
        numRefs++;
        const CUData& cu = *m_slice->m_refPicList[1][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
        if (!cu.m_cuDepth[cuGeom.encodeIdx])
            return 0;
        for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
        {
            uint32_t d = cu.m_cuDepth[cuGeom.encodeIdx + i];
            minDepth1 = X265_MIN(d, minDepth1);
            sum += d;
        }
    }
    if (!numRefs)
        return 0;

    uint32_t minDepth = X265_MIN(minDepth0, minDepth1);
    uint32_t thresh = minDepth * numRefs * (cuGeom.numPartitions >> 2);
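    /* each reference contributed numPartitions/4 depth samples to 'sum', so
     * 'thresh' is the value sum would take if every sample sat at minDepth */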

    /* allow block size growth if QP is rising or the average co-located depth
     * is within 1.5x of the minimum depth */
    if (minDepth && currentQP >= previousQP && (sum <= thresh + (thresh >> 1)))
        minDepth -= 1;

    return minDepth;
}

/* returns true if recursion should be stopped */
bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode)
{
    /* early exit when the RD cost of the best mode at depth "n" is below the
     * weighted average of the RD costs previously measured at depth "n" for
     * the neighbouring CTUs (above, above-left, above-right, left) and for
     * this CTU's own co-located history */

    uint32_t depth = cuGeom.depth;
    FrameData& curEncData = *m_frame->m_encData;
    FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
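    /* avgCost[] is a running average per depth; multiply by the sample count
     * to recover totals so a combined weighted average can be formed below */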
    uint64_t cuCost = cuStat.avgCost[depth] * cuStat.count[depth];
    uint64_t cuCount = cuStat.count[depth];

    uint64_t neighCost = 0, neighCount = 0;
    const CUData* above = parentCTU.m_cuAbove;
    if (above)
    {
        FrameData::RCStatCU& astat = curEncData.m_cuStat[above->m_cuAddr];
        neighCost += astat.avgCost[depth] * astat.count[depth];
        neighCount += astat.count[depth];

        const CUData* aboveLeft = parentCTU.m_cuAboveLeft;
        if (aboveLeft)
        {
            FrameData::RCStatCU& lstat = curEncData.m_cuStat[aboveLeft->m_cuAddr];
            neighCost += lstat.avgCost[depth] * lstat.count[depth];
            neighCount += lstat.count[depth];
        }

        const CUData* aboveRight = parentCTU.m_cuAboveRight;
        if (aboveRight)
        {
            FrameData::RCStatCU& rstat = curEncData.m_cuStat[aboveRight->m_cuAddr];
            neighCost += rstat.avgCost[depth] * rstat.count[depth];
            neighCount += rstat.count[depth];
        }
    }
    const CUData* left = parentCTU.m_cuLeft;
    if (left)
    {
        FrameData::RCStatCU& nstat = curEncData.m_cuStat[left->m_cuAddr];
        neighCost += nstat.avgCost[depth] * nstat.count[depth];
        neighCount += nstat.count[depth];
    }

    // 3:2 weighting gives 60% weight to this CTU's own history and 40% to the neighbour CTUs
    if (neighCount + cuCount)
    {
        uint64_t avgCost = ((3 * cuCost) + (2 * neighCost)) / ((3 * cuCount) + (2 * neighCount));
        uint64_t curCost = m_param->rdLevel > 1 ? bestMode.rdCost : bestMode.sa8dCost;
        if (curCost < avgCost && avgCost)
            return true;
    }

    return false;
}