/* deb_x265.git: source/encoder/analysis.cpp (Imported Upstream version 1.4+222+hg5f9f7194267b) */
/*****************************************************************************
 * Copyright (C) 2013 x265 project
 *
 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
 *          Steve Borho <steve@borho.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

#include "common.h"
#include "frame.h"
#include "framedata.h"
#include "picyuv.h"
#include "primitives.h"
#include "threading.h"

#include "analysis.h"
#include "rdcost.h"
#include "encoder.h"

using namespace x265;

/* An explanation of rate distortion levels (--rd-level)
 *
 * rd-level 0 generates no recon per CU (NO RDO or Quant)
 *
 *   sa8d selection between merge / skip / inter / intra and split
 *   no recon pixels generated until CTU analysis is complete, requiring
 *   intra predictions to use source pixels
 *
 * rd-level 1 uses RDO for merge and skip, sa8d for all else
 *
 *   RDO selection between merge and skip
 *   sa8d selection between (merge/skip) / inter modes / intra and split
 *   intra prediction uses reconstructed pixels
 *
 * rd-level 2 uses RDO for merge/skip and split
 *
 *   RDO selection between merge and skip
 *   sa8d selection between (merge/skip) / inter modes / intra
 *   RDO split decisions
 *
 * rd-level 3 uses RDO for merge/skip/best inter/intra
 *
 *   RDO selection between merge and skip
 *   sa8d selection of best inter mode
 *   sa8d decisions include chroma residual cost
 *   RDO selection between (merge/skip) / best inter mode / intra / split
 *
 * rd-level 4 enables RDOQuant
 *   chroma residual cost included in satd decisions, including subpel refine
 *   (as a result of --subme 3 being used by preset slow)
 *
 * rd-levels 5 and 6 do RDO for each inter mode
 */
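
/* Illustrative summary (not part of the build): how these rd-levels select
 * an analysis path. This condenses the dispatch in Analysis::compressCTU()
 * below; the function names are the real members of this class:
 *
 *     if (sliceType == I_SLICE)
 *         compressIntraCU(ctu, cuGeom, zOrder);
 *     else if (!rdLevel)                        // rd 0: recon deferred
 *         { compressInterCU_rd0_4(ctu, cuGeom); encodeResidue(ctu, cuGeom); }
 *     else if (bDistributeModeAnalysis && rdLevel >= 2)
 *         compressInterCU_dist(ctu, cuGeom);    // --pmode worker jobs
 *     else if (rdLevel <= 4)
 *         compressInterCU_rd0_4(ctu, cuGeom);
 *     else                                      // rd 5/6
 *         compressInterCU_rd5_6(ctu, cuGeom);
 */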

Analysis::Analysis()
{
    m_totalNumJobs = m_numAcquiredJobs = m_numCompletedJobs = 0;
    m_reuseIntraDataCTU = NULL;
    m_reuseInterDataCTU = NULL;
}

bool Analysis::create(ThreadLocalData *tld)
{
    m_tld = tld;
    m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2;
    m_bChromaSa8d = m_param->rdLevel >= 3;

    int csp = m_param->internalCsp;
    uint32_t cuSize = g_maxCUSize;

    bool ok = true;
    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++, cuSize >>= 1)
    {
        ModeDepth &md = m_modeDepth[depth];

        md.cuMemPool.create(depth, csp, MAX_PRED_TYPES);
        ok &= md.fencYuv.create(cuSize, csp);

        for (int j = 0; j < MAX_PRED_TYPES; j++)
        {
            md.pred[j].cu.initialize(md.cuMemPool, depth, csp, j);
            ok &= md.pred[j].predYuv.create(cuSize, csp);
            ok &= md.pred[j].reconYuv.create(cuSize, csp);
            md.pred[j].fencYuv = &md.fencYuv;
        }
    }

    return ok;
}

void Analysis::destroy()
{
    for (uint32_t i = 0; i <= g_maxCUDepth; i++)
    {
        m_modeDepth[i].cuMemPool.destroy();
        m_modeDepth[i].fencYuv.destroy();

        for (int j = 0; j < MAX_PRED_TYPES; j++)
        {
            m_modeDepth[i].pred[j].predYuv.destroy();
            m_modeDepth[i].pred[j].reconYuv.destroy();
        }
    }
}

Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
{
    m_slice = ctu.m_slice;
    m_frame = &frame;

    invalidateContexts(0);
    m_quant.setQPforQuant(ctu);
    m_rqt[0].cur.load(initialContext);
    m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);

    uint32_t numPartition = ctu.m_numPartitions;
    if (m_param->analysisMode)
    {
        m_reuseIntraDataCTU = (analysis_intra_data *)m_frame->m_analysisData.intraData;
        int numPredDir = m_slice->isInterP() ? 1 : 2;
        m_reuseInterDataCTU = (analysis_inter_data *)m_frame->m_analysisData.interData + ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir;
    }

    if (m_slice->m_sliceType == I_SLICE)
    {
        uint32_t zOrder = 0;
        compressIntraCU(ctu, cuGeom, zOrder);
        if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.intraData)
        {
            CUData *bestCU = &m_modeDepth[0].bestMode->cu;
            memcpy(&m_reuseIntraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
            memcpy(&m_reuseIntraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
            memcpy(&m_reuseIntraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
        }
    }
    else
    {
        if (!m_param->rdLevel)
        {
            /* In RD Level 0/1, copy source pixels into the reconstructed block so
             * they are available for intra predictions */
            m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);

            compressInterCU_rd0_4(ctu, cuGeom);

            /* generate residual for entire CTU at once and copy to reconPic */
            encodeResidue(ctu, cuGeom);
        }
        else if (m_param->bDistributeModeAnalysis && m_param->rdLevel >= 2)
            compressInterCU_dist(ctu, cuGeom);
        else if (m_param->rdLevel <= 4)
            compressInterCU_rd0_4(ctu, cuGeom);
        else
            compressInterCU_rd5_6(ctu, cuGeom);
    }

    return *m_modeDepth[0].bestMode;
}

void Analysis::tryLossless(const CUGeom& cuGeom)
{
    ModeDepth& md = m_modeDepth[cuGeom.depth];

    if (!md.bestMode->distortion)
        /* already lossless */
        return;
    else if (md.bestMode->cu.isIntra(0))
    {
        md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
        PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
        uint8_t* modes = md.pred[PRED_LOSSLESS].cu.m_lumaIntraDir;
        checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size, modes);
        checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
    }
    else
    {
        md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
        md.pred[PRED_LOSSLESS].predYuv.copyFromYuv(md.bestMode->predYuv);
        encodeResAndCalcRdInterCU(md.pred[PRED_LOSSLESS], cuGeom);
        checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
    }
}

void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t& zOrder)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);

    if (m_param->analysisMode == X265_ANALYSIS_LOAD)
    {
        uint8_t* reuseDepth = &m_reuseIntraDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
        uint8_t* reuseModes = &m_reuseIntraDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
        char* reusePartSizes = &m_reuseIntraDataCTU->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];

        if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.encodeIdx)
        {
            m_quant.setQPforQuant(parentCTU);

            PartSize size = (PartSize)reusePartSizes[zOrder];
            Mode& mode = size == SIZE_2Nx2N ? md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN];
            mode.cu.initSubCU(parentCTU, cuGeom);
            checkIntra(mode, cuGeom, size, &reuseModes[zOrder]);
            checkBestMode(mode, depth);

            if (m_bTryLossless)
                tryLossless(cuGeom);

            if (mightSplit)
                addSplitFlagCost(*md.bestMode, cuGeom.depth);

            // increment zOrder offset to point to next best depth in sharedDepth buffer
            zOrder += g_depthInc[g_maxCUDepth - 1][reuseDepth[zOrder]];
            mightSplit = false;
        }
    }
    else if (mightNotSplit)
    {
        m_quant.setQPforQuant(parentCTU);

        md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL);
        checkBestMode(md.pred[PRED_INTRA], depth);

        if (depth == g_maxCUDepth)
        {
            md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL);
            checkBestMode(md.pred[PRED_INTRA_NxN], depth);
        }

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    if (mightSplit)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                m_rqt[nextDepth].cur.load(*nextContext);
                compressIntraCU(parentCTU, childGeom, zOrder);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);
                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
            {
                /* record the depth of this non-present sub-CU */
                splitCU->setEmptyPart(childGeom, subPartIdx);
                zOrder += g_depthInc[g_maxCUDepth - 1][nextDepth];
            }
        }
        nextContext->store(splitPred->contexts);
        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            updateModeCost(*splitPred);
        checkBestMode(*splitPred, depth);
    }

    checkDQP(md.bestMode->cu, cuGeom);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT])
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.encodeIdx);
}

bool Analysis::findJob(int threadId)
{
    /* try to acquire a CU mode to analyze */
    m_pmodeLock.acquire();
    if (m_totalNumJobs > m_numAcquiredJobs)
    {
        int id = m_numAcquiredJobs++;
        m_pmodeLock.release();

        parallelModeAnalysis(threadId, id);

        m_pmodeLock.acquire();
        if (++m_numCompletedJobs == m_totalNumJobs)
            m_modeCompletionEvent.trigger();
        m_pmodeLock.release();
        return true;
    }
    else
        m_pmodeLock.release();

    m_meLock.acquire();
    if (m_totalNumME > m_numAcquiredME)
    {
        int id = m_numAcquiredME++;
        m_meLock.release();

        parallelME(threadId, id);

        m_meLock.acquire();
        if (++m_numCompletedME == m_totalNumME)
            m_meCompletionEvent.trigger();
        m_meLock.release();
        return true;
    }
    else
        m_meLock.release();

    return false;
}
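
/* Usage sketch (illustrative, derived from this file): pool worker threads
 * call findJob() with their thread id, while the master thread that queued
 * the jobs participates with threadId == -1 and then blocks on the
 * completion event, exactly as compressInterCU_dist() does below:
 *
 *     while (findJob(-1))
 *         ;
 *     ...
 *     m_modeCompletionEvent.wait();
 */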

void Analysis::parallelME(int threadId, int meId)
{
    Analysis* slave;

    if (threadId == -1)
        slave = this;
    else
    {
        slave = &m_tld[threadId].analysis;
        slave->setQP(*m_slice, m_rdCost.m_qp);
        slave->m_slice = m_slice;
        slave->m_frame = m_frame;

        slave->m_me.setSourcePU(*m_curInterMode->fencYuv, m_curInterMode->cu.m_cuAddr, m_curGeom->encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
        slave->prepMotionCompensation(m_curInterMode->cu, *m_curGeom, m_curPart);
    }

    if (meId < m_slice->m_numRefIdx[0])
        slave->singleMotionEstimation(*this, *m_curInterMode, *m_curGeom, m_curPart, 0, meId);
    else
        slave->singleMotionEstimation(*this, *m_curInterMode, *m_curGeom, m_curPart, 1, meId - m_slice->m_numRefIdx[0]);
}

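/* Job IDs handed to parallelModeAnalysis() by compressInterCU_dist(), as
 * summarized from the switch statements below: 0 = intra (skipped when
 * intra is not tried), 1 = 2Nx2N inter (plus bidir in B slices),
 * 2 = Nx2N, 3 = 2NxN, 4..7 = the four AMP shapes. */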
void Analysis::parallelModeAnalysis(int threadId, int jobId)
{
    Analysis* slave;

    if (threadId == -1)
        slave = this;
    else
    {
        slave = &m_tld[threadId].analysis;
        slave->m_slice = m_slice;
        slave->m_frame = m_frame;
        slave->setQP(*m_slice, m_rdCost.m_qp);
        slave->invalidateContexts(0);
    }

    ModeDepth& md = m_modeDepth[m_curGeom->depth];

    if (m_param->rdLevel <= 4)
    {
        switch (jobId)
        {
        case 0:
            if (slave != this)
                slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur);
            slave->checkIntraInInter(md.pred[PRED_INTRA], *m_curGeom);
            if (m_param->rdLevel > 2)
                slave->encodeIntraInInter(md.pred[PRED_INTRA], *m_curGeom);
            break;

        case 1:
            slave->checkInter_rd0_4(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N);
            if (m_slice->m_sliceType == B_SLICE)
                slave->checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], *m_curGeom);
            break;

        case 2:
            slave->checkInter_rd0_4(md.pred[PRED_Nx2N], *m_curGeom, SIZE_Nx2N);
            break;

        case 3:
            slave->checkInter_rd0_4(md.pred[PRED_2NxN], *m_curGeom, SIZE_2NxN);
            break;

        case 4:
            slave->checkInter_rd0_4(md.pred[PRED_2NxnU], *m_curGeom, SIZE_2NxnU);
            break;

        case 5:
            slave->checkInter_rd0_4(md.pred[PRED_2NxnD], *m_curGeom, SIZE_2NxnD);
            break;

        case 6:
            slave->checkInter_rd0_4(md.pred[PRED_nLx2N], *m_curGeom, SIZE_nLx2N);
            break;

        case 7:
            slave->checkInter_rd0_4(md.pred[PRED_nRx2N], *m_curGeom, SIZE_nRx2N);
            break;

        default:
            X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
            break;
        }
    }
    else
    {
        bool bMergeOnly = m_curGeom->log2CUSize == 6;
        if (slave != this)
        {
            slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur);
            slave->m_quant.setQPforQuant(md.pred[PRED_2Nx2N].cu);
        }

        switch (jobId)
        {
        case 0:
            slave->checkIntra(md.pred[PRED_INTRA], *m_curGeom, SIZE_2Nx2N, NULL);
            if (m_curGeom->depth == g_maxCUDepth && m_curGeom->log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                slave->checkIntra(md.pred[PRED_INTRA_NxN], *m_curGeom, SIZE_NxN, NULL);
            break;

        case 1:
            slave->checkInter_rd5_6(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N, false);
            md.pred[PRED_BIDIR].rdCost = MAX_INT64;
            if (m_slice->m_sliceType == B_SLICE)
            {
                slave->checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], *m_curGeom);
                if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
                    slave->encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], *m_curGeom);
            }
            break;

        case 2:
            slave->checkInter_rd5_6(md.pred[PRED_Nx2N], *m_curGeom, SIZE_Nx2N, false);
            break;

        case 3:
            slave->checkInter_rd5_6(md.pred[PRED_2NxN], *m_curGeom, SIZE_2NxN, false);
            break;

        case 4:
            slave->checkInter_rd5_6(md.pred[PRED_2NxnU], *m_curGeom, SIZE_2NxnU, bMergeOnly);
            break;

        case 5:
            slave->checkInter_rd5_6(md.pred[PRED_2NxnD], *m_curGeom, SIZE_2NxnD, bMergeOnly);
            break;

        case 6:
            slave->checkInter_rd5_6(md.pred[PRED_nLx2N], *m_curGeom, SIZE_nLx2N, bMergeOnly);
            break;

        case 7:
            slave->checkInter_rd5_6(md.pred[PRED_nRx2N], *m_curGeom, SIZE_nRx2N, bMergeOnly);
            break;

        default:
            X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
            break;
        }
    }
}

void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    uint32_t cuAddr = parentCTU.m_cuAddr;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
    uint32_t minDepth = m_param->rdLevel <= 4 ? topSkipMinDepth(parentCTU, cuGeom) : 0;

    X265_CHECK(m_param->rdLevel >= 2, "compressInterCU_dist does not support RD 0 or 1\n");

    if (mightNotSplit && depth >= minDepth)
    {
        int bTryAmp = m_slice->m_sps->maxAMPDepth > depth && (cuGeom.log2CUSize < 6 || m_param->rdLevel > 4);
        int bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;

        /* Initialize all prediction CUs based on parentCTU */
        md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
        if (m_param->bEnableRectInter)
        {
            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
        }
        if (bTryAmp)
        {
            md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
        }
        if (bTryIntra)
        {
            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
            if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
        }

        m_pmodeLock.acquire();
        m_totalNumJobs = 2 + m_param->bEnableRectInter * 2 + bTryAmp * 4;
        m_numAcquiredJobs = !bTryIntra;
        m_numCompletedJobs = m_numAcquiredJobs;
        m_curGeom = &cuGeom;
        m_bJobsQueued = true;
        JobProvider::enqueue();
        m_pmodeLock.release();

        for (int i = 0; i < m_totalNumJobs - m_numCompletedJobs; i++)
            m_pool->pokeIdleThread();

        /* participate in processing jobs, until all are distributed */
        while (findJob(-1))
            ;

        JobProvider::dequeue();
        m_bJobsQueued = false;

        /* the master worker thread (this one) does merge analysis. By doing
         * merge after all the other jobs are at least started, we usually avoid
         * blocking on another thread */

        if (m_param->rdLevel <= 4)
        {
            checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);

            m_modeCompletionEvent.wait();

            /* select best inter mode based on sa8d cost */
            Mode *bestInter = &md.pred[PRED_2Nx2N];

            if (m_param->bEnableRectInter)
            {
                if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_Nx2N];
                if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxN];
            }

            if (bTryAmp)
            {
                if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxnU];
                if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxnD];
                if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_nLx2N];
                if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_nRx2N];
            }

            if (m_param->rdLevel > 2)
            {
                /* RD selection between merge, inter, bidir and intra */
                if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
                {
                    for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
                    {
                        prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
                        motionCompensation(bestInter->predYuv, false, true);
                    }
                }
                encodeResAndCalcRdInterCU(*bestInter, cuGeom);
                checkBestMode(*bestInter, depth);

                /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
                    md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
                {
                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
                    checkBestMode(md.pred[PRED_BIDIR], depth);
                }

                if (bTryIntra)
                    checkBestMode(md.pred[PRED_INTRA], depth);
            }
            else /* m_param->rdLevel == 2 */
            {
                if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = bestInter;

                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = &md.pred[PRED_BIDIR];

                if (bTryIntra && md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
                {
                    md.bestMode = &md.pred[PRED_INTRA];
                    encodeIntraInInter(*md.bestMode, cuGeom);
                }
                else if (!md.bestMode->cu.m_mergeFlag[0])
                {
                    /* finally code the best mode selected from SA8D costs */
                    for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++)
                    {
                        prepMotionCompensation(md.bestMode->cu, cuGeom, puIdx);
                        motionCompensation(md.bestMode->predYuv, false, true);
                    }
                    encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
                }
            }
        }
        else
        {
            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
            m_modeCompletionEvent.wait();

            checkBestMode(md.pred[PRED_2Nx2N], depth);
            checkBestMode(md.pred[PRED_BIDIR], depth);

            if (m_param->bEnableRectInter)
            {
                checkBestMode(md.pred[PRED_Nx2N], depth);
                checkBestMode(md.pred[PRED_2NxN], depth);
            }

            if (bTryAmp)
            {
                checkBestMode(md.pred[PRED_2NxnU], depth);
                checkBestMode(md.pred[PRED_2NxnD], depth);
                checkBestMode(md.pred[PRED_nLx2N], depth);
                checkBestMode(md.pred[PRED_nRx2N], depth);
            }

            if (bTryIntra)
            {
                checkBestMode(md.pred[PRED_INTRA], depth);
                if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                    checkBestMode(md.pred[PRED_INTRA_NxN], depth);
            }
        }

        if (md.bestMode->rdCost == MAX_INT64 && !bTryIntra)
        {
            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
            checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
            encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
            checkBestMode(md.pred[PRED_INTRA], depth);
        }

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    bool bNoSplit = false;
    if (md.bestMode)
    {
        bNoSplit = md.bestMode->cu.isSkipped(0);
        if (mightSplit && depth && depth >= minDepth && !bNoSplit && m_param->rdLevel <= 4)
            bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
    }

    if (mightSplit && !bNoSplit)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                m_rqt[nextDepth].cur.load(*nextContext);
                compressInterCU_dist(parentCTU, childGeom);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);

                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
                splitCU->setEmptyPart(childGeom, subPartIdx);
        }
        nextContext->store(splitPred->contexts);

        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            updateModeCost(*splitPred);

        checkBestMode(*splitPred, depth);
    }

    if (mightNotSplit)
    {
        /* early-out statistics */
        FrameData& curEncData = *m_frame->m_encData;
        FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
        uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
        cuStat.count[depth] += 1;
        cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
    }

    checkDQP(md.bestMode->cu, cuGeom);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT])
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.encodeIdx);
}

void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    uint32_t cuAddr = parentCTU.m_cuAddr;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
    uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);

    if (mightNotSplit && depth >= minDepth)
    {
        bool bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;

        /* Compute Merge Cost */
        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
        checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);

        bool earlyskip = false;
        if (m_param->rdLevel)
            earlyskip = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth

        if (!earlyskip)
        {
            md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
            checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N);

            if (m_slice->m_sliceType == B_SLICE)
            {
                md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
                checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
            }

            Mode *bestInter = &md.pred[PRED_2Nx2N];
            if (m_param->bEnableRectInter)
            {
                md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
                checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N);
                if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_Nx2N];

                md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
                checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN);
                if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxN];
            }

            if (m_slice->m_sps->maxAMPDepth > depth && cuGeom.log2CUSize < 6)
            {
                bool bHor = false, bVer = false;
                if (bestInter->cu.m_partSize[0] == SIZE_2NxN)
                    bHor = true;
                else if (bestInter->cu.m_partSize[0] == SIZE_Nx2N)
                    bVer = true;
                else if (bestInter->cu.m_partSize[0] == SIZE_2Nx2N &&
                         md.bestMode && md.bestMode->cu.getQtRootCbf(0))
                {
                    bHor = true;
                    bVer = true;
                }

                if (bHor)
                {
                    md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU);
                    if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_2NxnU];

                    md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD);
                    if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_2NxnD];
                }
                if (bVer)
                {
                    md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N);
                    if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_nLx2N];

                    md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N);
                    if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_nRx2N];
                }
            }

            if (m_param->rdLevel >= 3)
            {
                /* Calculate RD cost of best inter option */
                if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
                {
                    for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
                    {
                        prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
                        motionCompensation(bestInter->predYuv, false, true);
                    }
                }
                encodeResAndCalcRdInterCU(*bestInter, cuGeom);
                checkBestMode(*bestInter, depth);

                /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
                    md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
                {
                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
                    checkBestMode(md.pred[PRED_BIDIR], depth);
                }

                if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) ||
                    md.bestMode->sa8dCost == MAX_INT64)
                {
                    md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
                    checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
                    encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
                    checkBestMode(md.pred[PRED_INTRA], depth);
                }
            }
            else
            {
                /* SA8D choice between merge/skip, inter, bidir, and intra */
                if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = bestInter;

                if (m_slice->m_sliceType == B_SLICE &&
                    md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = &md.pred[PRED_BIDIR];

                if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64)
                {
                    md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
                    checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
                    if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
                        md.bestMode = &md.pred[PRED_INTRA];
                }

                /* finally code the best mode selected by SA8D costs:
                 * RD level 2 - fully encode the best mode
                 * RD level 1 - generate recon pixels
                 * RD level 0 - generate chroma prediction */
                if (md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)
                {
                    /* prediction already generated for this CU, and if rd level
                     * is not 0, it is already fully encoded */
                }
                else if (md.bestMode->cu.isInter(0))
                {
                    for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++)
                    {
                        prepMotionCompensation(md.bestMode->cu, cuGeom, puIdx);
                        motionCompensation(md.bestMode->predYuv, false, true);
                    }
                    if (m_param->rdLevel == 2)
                        encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
                    else if (m_param->rdLevel == 1)
                    {
                        /* generate recon pixels with no rate distortion considerations */
                        CUData& cu = md.bestMode->cu;
                        m_quant.setQPforQuant(cu);

                        uint32_t tuDepthRange[2];
                        cu.getInterTUQtDepthRange(tuDepthRange, 0);

                        m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize);
                        residualTransformQuantInter(*md.bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange);
                        if (cu.getQtRootCbf(0))
                            md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]);
                        else
                        {
                            md.bestMode->reconYuv.copyFromYuv(md.bestMode->predYuv);
                            if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
                                cu.setPredModeSubParts(MODE_SKIP);
                        }
                    }
                }
                else
                {
                    if (m_param->rdLevel == 2)
                        encodeIntraInInter(*md.bestMode, cuGeom);
                    else if (m_param->rdLevel == 1)
                    {
                        /* generate recon pixels with no rate distortion considerations */
                        CUData& cu = md.bestMode->cu;
                        m_quant.setQPforQuant(cu);

                        uint32_t tuDepthRange[2];
                        cu.getIntraTUQtDepthRange(tuDepthRange, 0);

                        uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
                        residualTransformQuantIntra(*md.bestMode, cuGeom, initTuDepth, 0, tuDepthRange);
                        getBestIntraModeChroma(*md.bestMode, cuGeom);
                        residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
                        md.bestMode->reconYuv.copyFromPicYuv(*m_frame->m_reconPic, cu.m_cuAddr, cuGeom.encodeIdx); // TODO:
                    }
                }
            }
        } // !earlyskip

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    bool bNoSplit = false;
    if (md.bestMode)
    {
        bNoSplit = md.bestMode->cu.isSkipped(0);
        if (mightSplit && depth && depth >= minDepth && !bNoSplit)
            bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
    }

    if (mightSplit && !bNoSplit)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                m_rqt[nextDepth].cur.load(*nextContext);
                compressInterCU_rd0_4(parentCTU, childGeom);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);

                if (m_param->rdLevel)
                    nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                else
                    nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childGeom.numPartitions * subPartIdx);
                if (m_param->rdLevel > 1)
                    nextContext = &nd.bestMode->contexts;
            }
            else
                splitCU->setEmptyPart(childGeom, subPartIdx);
        }
        nextContext->store(splitPred->contexts);

        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else if (m_param->rdLevel > 1)
            updateModeCost(*splitPred);
        else
            splitPred->sa8dCost = m_rdCost.calcRdSADCost(splitPred->distortion, splitPred->sa8dBits);

        if (!md.bestMode)
            md.bestMode = splitPred;
        else if (m_param->rdLevel > 1)
            checkBestMode(*splitPred, cuGeom.depth);
        else if (splitPred->sa8dCost < md.bestMode->sa8dCost)
            md.bestMode = splitPred;
    }

    if (mightNotSplit)
    {
        /* early-out statistics */
        FrameData& curEncData = *m_frame->m_encData;
        FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
        uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
        cuStat.count[depth] += 1;
        cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
    }

    checkDQP(md.bestMode->cu, cuGeom);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT] && m_param->rdLevel)
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.encodeIdx);
}

void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);

    if (mightNotSplit)
    {
        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
        checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
        bool earlySkip = m_param->bEnableEarlySkip && md.bestMode && !md.bestMode->cu.getQtRootCbf(0);

        if (!earlySkip)
        {
            md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
            checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, false);
            checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);

            if (m_slice->m_sliceType == B_SLICE)
            {
                md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
                checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
                if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
                {
                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
                    checkBestMode(md.pred[PRED_BIDIR], cuGeom.depth);
                }
            }

            if (m_param->bEnableRectInter)
            {
                if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                {
                    md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, false);
                    checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);
                }
                if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                {
                    md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, false);
                    checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
                }
            }

            // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N)
            if (m_slice->m_sps->maxAMPDepth > depth)
            {
                bool bMergeOnly = cuGeom.log2CUSize == 6;

                bool bHor = false, bVer = false;
                if (md.bestMode->cu.m_partSize[0] == SIZE_2NxN)
                    bHor = true;
                else if (md.bestMode->cu.m_partSize[0] == SIZE_Nx2N)
                    bVer = true;
                else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0] && !md.bestMode->cu.isSkipped(0))
                {
                    bHor = true;
                    bVer = true;
                }

                if (bHor)
                {
                    if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                    {
                        md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
                        checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, bMergeOnly);
                        checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);
                    }
                    if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                    {
                        md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
                        checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, bMergeOnly);
                        checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
                    }
                }
                if (bVer)
                {
                    if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                    {
                        md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
                        checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, bMergeOnly);
                        checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);
                    }
                    if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                    {
                        md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
                        checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, bMergeOnly);
                        checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
                    }
                }
            }

            if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) &&
                (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)))
            {
                md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
                checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL);
                checkBestMode(md.pred[PRED_INTRA], depth);

                if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                {
                    md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
                    checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL);
                    checkBestMode(md.pred[PRED_INTRA_NxN], depth);
                }
            }
        }

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    // estimate split cost
    if (mightSplit && (!md.bestMode || !md.bestMode->cu.isSkipped(0)))
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                m_rqt[nextDepth].cur.load(*nextContext);
                compressInterCU_rd5_6(parentCTU, childGeom);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);
                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
                splitCU->setEmptyPart(childGeom, subPartIdx);
        }
        nextContext->store(splitPred->contexts);
        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            updateModeCost(*splitPred);

        checkBestMode(*splitPred, depth);
    }

    checkDQP(md.bestMode->cu, cuGeom);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT])
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.encodeIdx);
}

/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    Yuv *fencYuv = &md.fencYuv;

    /* Note that these two Mode instances are named MERGE and SKIP but they may
     * hold the reverse when the function returns. We toggle between the two modes */
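    /* Illustrative sketch of the toggle idiom used in the candidate loop
     * below: each candidate is predicted into *tempPred, and when it beats
     * the current best the two pointers are exchanged, so the loser becomes
     * the scratch buffer for the next candidate:
     *
     *     if (tempPred->sa8dCost < bestPred->sa8dCost)
     *         std::swap(tempPred, bestPred);
     */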
1213 Mode* tempPred = &merge;
1214 Mode* bestPred = &skip;
1215
1216 X265_CHECK(m_slice->m_sliceType != I_SLICE, "Evaluating merge in I slice\n");
1217
1218 tempPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
1219 tempPred->cu.setPredModeSubParts(MODE_INTER);
1220 tempPred->cu.m_mergeFlag[0] = true;
1221
1222 bestPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
1223 bestPred->cu.setPredModeSubParts(MODE_INTER);
1224 bestPred->cu.m_mergeFlag[0] = true;
1225
1226 MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
1227 uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS];
1228 uint32_t maxNumMergeCand = tempPred->cu.getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours);
1229
1230 bestPred->sa8dCost = MAX_INT64;
1231 int bestSadCand = -1;
1232 int cpart, sizeIdx = cuGeom.log2CUSize - 2;
1233 if (m_bChromaSa8d)
1234 {
1235 int cuSize = 1 << cuGeom.log2CUSize;
1236 cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
1237 }
1238 for (uint32_t i = 0; i < maxNumMergeCand; ++i)
1239 {
1240 if (m_bFrameParallel &&
1241 (mvFieldNeighbours[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
1242 mvFieldNeighbours[i][1].mv.y >= (m_param->searchRange + 1) * 4))
1243 continue;
1244
1245 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx
1246 tempPred->cu.m_interDir[0] = interDirNeighbours[i];
1247 tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
1248 tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
1249 tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
1250 tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
1251
1252 prepMotionCompensation(tempPred->cu, cuGeom, 0);
1253 motionCompensation(tempPred->predYuv, true, m_bChromaSa8d);
1254
1255 tempPred->sa8dBits = getTUBits(i, maxNumMergeCand);
1256 tempPred->distortion = primitives.sa8d[sizeIdx](fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
1257 if (m_bChromaSa8d)
1258 {
1259 tempPred->distortion += primitives.sa8d_inter[cpart](fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
1260 tempPred->distortion += primitives.sa8d_inter[cpart](fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
1261 }
1262 tempPred->sa8dCost = m_rdCost.calcRdSADCost(tempPred->distortion, tempPred->sa8dBits);
1263
1264 if (tempPred->sa8dCost < bestPred->sa8dCost)
1265 {
1266 bestSadCand = i;
1267 std::swap(tempPred, bestPred);
1268 }
1269 }
1270
1271 /* force mode decision to take inter or intra */
1272 if (bestSadCand < 0)
1273 return;
1274
1275 /* calculate the motion compensation for chroma for the best mode selected */
1276 if (!m_bChromaSa8d) /* Chroma MC was done above */
1277 {
1278 prepMotionCompensation(bestPred->cu, cuGeom, 0);
1279 motionCompensation(bestPred->predYuv, false, true);
1280 }
1281
1282 if (m_param->rdLevel)
1283 {
1284 if (m_param->bLossless)
1285 bestPred->rdCost = MAX_INT64;
1286 else
1287 encodeResAndCalcRdSkipCU(*bestPred);
1288
1289 /* Encode with residual */
1290 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand;
1291 tempPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0);
1292 tempPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0);
1293 tempPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
1294 tempPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0);
1295 tempPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
1296 tempPred->sa8dCost = bestPred->sa8dCost;
1297 tempPred->predYuv.copyFromYuv(bestPred->predYuv);
1298
1299 encodeResAndCalcRdInterCU(*tempPred, cuGeom);
1300
1301 md.bestMode = tempPred->rdCost < bestPred->rdCost ? tempPred : bestPred;
1302 }
1303 else
1304 md.bestMode = bestPred;
1305
1306 /* broadcast sets of MV field data */
1307 bestPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0);
1308 bestPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0);
1309 bestPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
1310 bestPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0);
1311 bestPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
1312 }
1313
1314 /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
1315 void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom)
1316 {
1317 uint32_t depth = cuGeom.depth;
1318
1319 /* Note that these two Mode instances are named MERGE and SKIP but they may
1320 * hold the reverse when the function returns. We toggle between the two modes */
1321 Mode* tempPred = &merge;
1322 Mode* bestPred = &skip;
1323
1324 merge.cu.setPredModeSubParts(MODE_INTER);
1325 merge.cu.setPartSizeSubParts(SIZE_2Nx2N);
1326 merge.cu.m_mergeFlag[0] = true;
1327
1328 skip.cu.setPredModeSubParts(MODE_INTER);
1329 skip.cu.setPartSizeSubParts(SIZE_2Nx2N);
1330 skip.cu.m_mergeFlag[0] = true;
1331
1332 MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
1333 uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS];
1334 uint32_t maxNumMergeCand = merge.cu.getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours);
1335
1336 bool foundCbf0Merge = false;
1337 bool triedPZero = false, triedBZero = false;
1338 bestPred->rdCost = MAX_INT64;
1339 for (uint32_t i = 0; i < maxNumMergeCand; i++)
1340 {
1341 if (m_bFrameParallel &&
1342 (mvFieldNeighbours[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
1343 mvFieldNeighbours[i][1].mv.y >= (m_param->searchRange + 1) * 4))
1344 continue;
1345
1346 /* the merge candidate list is packed with MV(0,0) ref 0 when it is not full */
1347 if (interDirNeighbours[i] == 1 && !mvFieldNeighbours[i][0].mv.word && !mvFieldNeighbours[i][0].refIdx)
1348 {
1349 if (triedPZero)
1350 continue;
1351 triedPZero = true;
1352 }
1353 else if (interDirNeighbours[i] == 3 &&
1354 !mvFieldNeighbours[i][0].mv.word && !mvFieldNeighbours[i][0].refIdx &&
1355 !mvFieldNeighbours[i][1].mv.word && !mvFieldNeighbours[i][1].refIdx)
1356 {
1357 if (triedBZero)
1358 continue;
1359 triedBZero = true;
1360 }
1361
1362 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; /* merge candidate ID is stored in L0 MVP idx */
1363 tempPred->cu.m_interDir[0] = interDirNeighbours[i];
1364 tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
1365 tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
1366 tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
1367 tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
1368 tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */
1369
1370 prepMotionCompensation(tempPred->cu, cuGeom, 0);
1371 motionCompensation(tempPred->predYuv, true, true);
1372
1373 uint8_t hasCbf = true;
1374 bool swapped = false;
1375 if (!foundCbf0Merge)
1376 {
1377 /* if the best prediction has CBF (not a skip) then try merge with residual */
1378
1379 encodeResAndCalcRdInterCU(*tempPred, cuGeom);
1380 hasCbf = tempPred->cu.getQtRootCbf(0);
1381 foundCbf0Merge = !hasCbf;
1382
1383 if (tempPred->rdCost < bestPred->rdCost)
1384 {
1385 std::swap(tempPred, bestPred);
1386 swapped = true;
1387 }
1388 }
1389 if (!m_param->bLossless && hasCbf)
1390 {
1391 /* try merge without residual (skip), if not lossless coding */
1392
1393 if (swapped)
1394 {
1395 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;
1396 tempPred->cu.m_interDir[0] = interDirNeighbours[i];
1397 tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
1398 tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
1399 tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
1400 tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
1401 tempPred->cu.setPredModeSubParts(MODE_INTER);
1402 tempPred->predYuv.copyFromYuv(bestPred->predYuv);
1403 }
1404
1405 encodeResAndCalcRdSkipCU(*tempPred);
1406
1407 if (tempPred->rdCost < bestPred->rdCost)
1408 std::swap(tempPred, bestPred);
1409 }
1410 }
1411
1412 if (bestPred->rdCost < MAX_INT64)
1413 {
1414 m_modeDepth[depth].bestMode = bestPred;
1415
1416 /* broadcast sets of MV field data */
1417 uint32_t bestCand = bestPred->cu.m_mvpIdx[0][0];
1418 bestPred->cu.setPUInterDir(interDirNeighbours[bestCand], 0, 0);
1419 bestPred->cu.setPUMv(0, mvFieldNeighbours[bestCand][0].mv, 0, 0);
1420 bestPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestCand][0].refIdx, 0, 0);
1421 bestPred->cu.setPUMv(1, mvFieldNeighbours[bestCand][1].mv, 0, 0);
1422 bestPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestCand][1].refIdx, 0, 0);
1423 }
1424 }
1425
1426 void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize)
1427 {
1428 interMode.initCosts();
1429 interMode.cu.setPartSizeSubParts(partSize);
1430 interMode.cu.setPredModeSubParts(MODE_INTER);
1431 int numPredDir = m_slice->isInterP() ? 1 : 2;
1432
1433 if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU)
1434 {
1435 for (uint32_t part = 0; part < interMode.cu.getNumPartInter(); part++)
1436 {
1437 MotionData* bestME = interMode.bestME[part];
1438 for (int32_t i = 0; i < numPredDir; i++)
1439 {
1440 bestME[i].ref = m_reuseInterDataCTU->ref;
1441 m_reuseInterDataCTU++;
1442 }
1443 }
1444 }
1445 if (predInterSearch(interMode, cuGeom, false, m_bChromaSa8d))
1446 {
1447 /* predInterSearch sets interMode.sa8dBits */
1448 const Yuv& fencYuv = *interMode.fencYuv;
1449 Yuv& predYuv = interMode.predYuv;
1450 int part = partitionFromLog2Size(cuGeom.log2CUSize);
1451 interMode.distortion = primitives.sa8d[part](fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
1452 if (m_bChromaSa8d)
1453 {
1454 uint32_t cuSize = 1 << cuGeom.log2CUSize;
1455 int cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
1456 interMode.distortion += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
1457 interMode.distortion += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
1458 }
1459 interMode.sa8dCost = m_rdCost.calcRdSADCost(interMode.distortion, interMode.sa8dBits);
1460
1461 if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
1462 {
1463 for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
1464 {
1465 MotionData* bestME = interMode.bestME[puIdx];
1466 for (int32_t i = 0; i < numPredDir; i++)
1467 {
1468 m_reuseInterDataCTU->ref = bestME[i].ref;
1469 m_reuseInterDataCTU++;
1470 }
1471 }
1472 }
1473 }
1474 else
1475 {
1476 interMode.distortion = MAX_UINT;
1477 interMode.sa8dCost = MAX_INT64;
1478 }
1479 }
1480
1481 void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, bool bMergeOnly)
1482 {
1483 interMode.initCosts();
1484 interMode.cu.setPartSizeSubParts(partSize);
1485 interMode.cu.setPredModeSubParts(MODE_INTER);
1486 int numPredDir = m_slice->isInterP() ? 1 : 2;
1487
1488 if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU)
1489 {
1490 for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
1491 {
1492 MotionData* bestME = interMode.bestME[puIdx];
1493 for (int32_t i = 0; i < numPredDir; i++)
1494 {
1495 bestME[i].ref = m_reuseInterDataCTU->ref;
1496 m_reuseInterDataCTU++;
1497 }
1498 }
1499 }
1500 if (predInterSearch(interMode, cuGeom, bMergeOnly, true))
1501 {
1502 /* predInterSearch sets interMode.sa8dBits, but this is ignored */
1503 encodeResAndCalcRdInterCU(interMode, cuGeom);
1504
1505 if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
1506 {
1507 for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
1508 {
1509 MotionData* bestME = interMode.bestME[puIdx];
1510 for (int32_t i = 0; i < numPredDir; i++)
1511 {
1512 m_reuseInterDataCTU->ref = bestME[i].ref;
1513 m_reuseInterDataCTU++;
1514 }
1515 }
1516 }
1517 }
1518 else
1519 {
1520 interMode.distortion = MAX_UINT;
1521 interMode.rdCost = MAX_INT64;
1522 }
1523 }
1524
1525 void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom)
1526 {
1527 CUData& cu = bidir2Nx2N.cu;
1528
1529 if (cu.isBipredRestriction() || inter2Nx2N.bestME[0][0].cost == MAX_UINT || inter2Nx2N.bestME[0][1].cost == MAX_UINT)
1530 {
1531 bidir2Nx2N.sa8dCost = MAX_INT64;
1532 bidir2Nx2N.rdCost = MAX_INT64;
1533 return;
1534 }
1535
1536 const Yuv& fencYuv = *bidir2Nx2N.fencYuv;
1537 MV mvzero(0, 0);
1538 int cpart, partEnum = cuGeom.log2CUSize - 2;
1539
1540 if (m_bChromaSa8d)
1541 {
1542 int cuSize = 1 << cuGeom.log2CUSize;
1543 cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
1544 }
1545
1546 bidir2Nx2N.bestME[0][0] = inter2Nx2N.bestME[0][0];
1547 bidir2Nx2N.bestME[0][1] = inter2Nx2N.bestME[0][1];
1548 MotionData* bestME = bidir2Nx2N.bestME[0];
1549 int ref0 = bestME[0].ref;
1550 MV mvp0 = bestME[0].mvp;
1551 int mvpIdx0 = bestME[0].mvpIdx;
1552 int ref1 = bestME[1].ref;
1553 MV mvp1 = bestME[1].mvp;
1554 int mvpIdx1 = bestME[1].mvpIdx;
1555
1556 bidir2Nx2N.initCosts();
1557 cu.setPartSizeSubParts(SIZE_2Nx2N);
1558 cu.setPredModeSubParts(MODE_INTER);
1559 cu.setPUInterDir(3, 0, 0);
1560 cu.setPURefIdx(0, (int8_t)ref0, 0, 0);
1561 cu.setPURefIdx(1, (int8_t)ref1, 0, 0);
1562 cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;
1563 cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
1564 cu.m_mergeFlag[0] = 0;
1565
1566 /* Estimate cost of BIDIR using best 2Nx2N L0 and L1 motion vectors */
1567 cu.setPUMv(0, bestME[0].mv, 0, 0);
1568 cu.m_mvd[0][0] = bestME[0].mv - mvp0;
1569
1570 cu.setPUMv(1, bestME[1].mv, 0, 0);
1571 cu.m_mvd[1][0] = bestME[1].mv - mvp1;
1572
1573 prepMotionCompensation(cu, cuGeom, 0);
1574 motionCompensation(bidir2Nx2N.predYuv, true, m_bChromaSa8d);
1575
1576 int sa8d = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size);
1577 if (m_bChromaSa8d)
1578 {
1579 /* Add in chroma distortion */
1580 sa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[1], bidir2Nx2N.predYuv.m_csize);
1581 sa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[2], bidir2Nx2N.predYuv.m_csize);
1582 }
1583 bidir2Nx2N.sa8dBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
1584 bidir2Nx2N.sa8dCost = sa8d + m_rdCost.getCost(bidir2Nx2N.sa8dBits);
1585
1586 bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
1587 if (bTryZero)
1588 {
1589 /* Do not try zero MV if unidir motion predictors are beyond
1590 * valid search area */
1591 MV mvmin, mvmax;
1592 int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
1593 setSearchRange(cu, mvzero, merange, mvmin, mvmax);
1594 mvmax.y += 2; // there is some pad for subpel refine
1595 mvmin <<= 2;
1596 mvmax <<= 2;
1597
1598 bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
1599 bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
1600 }
1601 if (bTryZero)
1602 {
1603 /* Estimate cost of BIDIR using coincident blocks */
1604 Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
1605
1606 int zsa8d;
1607
1608 if (m_bChromaSa8d)
1609 {
1610 cu.m_mv[0][0] = mvzero;
1611 cu.m_mv[1][0] = mvzero;
1612
1613 prepMotionCompensation(cu, cuGeom, 0);
1614 motionCompensation(tmpPredYuv, true, true);
1615
1616 zsa8d = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
1617 zsa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize);
1618 zsa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, tmpPredYuv.m_buf[2], tmpPredYuv.m_csize);
1619 }
1620 else
1621 {
            pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx);
            pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx);
            intptr_t refStride = m_slice->m_mref[0][0].lumaStride;

            primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
            zsa8d = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
        }

        uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
        uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
        uint32_t zcost = zsa8d + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
        /* refine MVP selection for the zero MV; updates mvp, mvpIdx, bits and cost */
        checkBestMVP(inter2Nx2N.amvpCand[0][ref0], mvzero, mvp0, mvpIdx0, bits0, zcost);
        checkBestMVP(inter2Nx2N.amvpCand[1][ref1], mvzero, mvp1, mvpIdx1, bits1, zcost);

        uint32_t zbits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
        zcost = zsa8d + m_rdCost.getCost(zbits);

        if (zcost < bidir2Nx2N.sa8dCost)
        {
            bidir2Nx2N.sa8dBits = zbits;
            bidir2Nx2N.sa8dCost = zcost;

            cu.setPUMv(0, mvzero, 0, 0);
            cu.m_mvd[0][0] = mvzero - mvp0;
            cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;

            cu.setPUMv(1, mvzero, 0, 0);
            cu.m_mvd[1][0] = mvzero - mvp1;
            cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;

            if (m_bChromaSa8d)
                /* real MC was already performed */
                bidir2Nx2N.predYuv.copyFromYuv(tmpPredYuv);
            else
            {
                prepMotionCompensation(cu, cuGeom, 0);
                motionCompensation(bidir2Nx2N.predYuv, true, true);
            }
        }
        else if (m_bChromaSa8d)
        {
            /* recover overwritten motion vectors */
            cu.m_mv[0][0] = bestME[0].mv;
            cu.m_mv[1][0] = bestME[1].mv;
        }
    }
}
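
/* Used at rd-level 0, where analysis made its decisions from sa8d costs alone
 * and generated no reconstruction; this pass re-walks the chosen CTU
 * partitioning and performs the actual transform, quant and recon */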
void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
{
    if (cuGeom.depth < ctu.m_cuDepth[cuGeom.encodeIdx] && cuGeom.depth < g_maxCUDepth)
    {
        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
                encodeResidue(ctu, childGeom);
        }
        return;
    }
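
    /* past this point cuGeom matches the depth at which this CU was coded */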
    uint32_t absPartIdx = cuGeom.encodeIdx;
    int sizeIdx = cuGeom.log2CUSize - 2;

    /* reuse the bestMode data structures at the current depth */
    Mode *bestMode = m_modeDepth[cuGeom.depth].bestMode;
    CUData& cu = bestMode->cu;

    cu.copyFromPic(ctu, cuGeom);
    m_quant.setQPforQuant(cu);

    Yuv& fencYuv = m_modeDepth[cuGeom.depth].fencYuv;
    if (cuGeom.depth)
        m_modeDepth[0].fencYuv.copyPartToYuv(fencYuv, absPartIdx);
    X265_CHECK(bestMode->fencYuv == &fencYuv, "invalid fencYuv\n");

    if (cu.isIntra(0))
    {
        uint32_t tuDepthRange[2];
        cu.getIntraTUQtDepthRange(tuDepthRange, 0);

        uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
        residualTransformQuantIntra(*bestMode, cuGeom, initTuDepth, 0, tuDepthRange);
        getBestIntraModeChroma(*bestMode, cuGeom);
        residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
    }
    else // if (cu.isInter(0))
    {
        X265_CHECK(!ctu.isSkipped(absPartIdx), "skip not expected prior to transform\n");

        /* Calculate residual for current CU part into depth sized resiYuv */

        ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;

        /* at RD 0, the prediction pixels are accumulated into the top depth predYuv */
        Yuv& predYuv = m_modeDepth[0].bestMode->predYuv;
        pixel* predY = predYuv.getLumaAddr(absPartIdx);
        pixel* predU = predYuv.getCbAddr(absPartIdx);
        pixel* predV = predYuv.getCrAddr(absPartIdx);

        primitives.luma_sub_ps[sizeIdx](resiYuv.m_buf[0], resiYuv.m_size,
                                        fencYuv.m_buf[0], predY,
                                        fencYuv.m_size, predYuv.m_size);

        primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[1], resiYuv.m_csize,
                                                 fencYuv.m_buf[1], predU,
                                                 fencYuv.m_csize, predYuv.m_csize);

        primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[2], resiYuv.m_csize,
                                                 fencYuv.m_buf[2], predV,
                                                 fencYuv.m_csize, predYuv.m_csize);

        uint32_t tuDepthRange[2];
        cu.getInterTUQtDepthRange(tuDepthRange, 0);

        residualTransformQuantInter(*bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange);
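
        /* a 2Nx2N merge CU with no coded residual is signalled as a skip */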
        if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
            cu.setPredModeSubParts(MODE_SKIP);

        /* residualTransformQuantInter() wrote transformed residual back into
         * resiYuv. Generate the recon pixels by adding it to the prediction */

        PicYuv& reconPic = *m_frame->m_reconPic;
        if (cu.m_cbf[0][0])
            primitives.luma_add_ps[sizeIdx](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
                                            predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
        else
            primitives.luma_copy_pp[sizeIdx](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
                                             predY, predYuv.m_size);

        if (cu.m_cbf[1][0])
            primitives.chroma[m_csp].add_ps[sizeIdx](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                     predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
        else
            primitives.chroma[m_csp].copy_pp[sizeIdx](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                      predU, predYuv.m_csize);

        if (cu.m_cbf[2][0])
            primitives.chroma[m_csp].add_ps[sizeIdx](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                     predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
        else
            primitives.chroma[m_csp].copy_pp[sizeIdx](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                      predV, predYuv.m_csize);
    }

    checkDQP(cu, cuGeom);
    cu.updatePic(cuGeom.depth);
}
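
/* Add the cost of the CU split flag to the mode's totals: full CABAC bit
 * estimation at rd-levels 3 and above, a one-bit approximation otherwise */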
void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth)
{
    if (m_param->rdLevel >= 3)
    {
        /* code the split flag (0 or 1) and update bit costs */
        mode.contexts.resetBits();
        mode.contexts.codeSplitFlag(mode.cu, 0, depth);
        uint32_t bits = mode.contexts.getNumberOfWrittenBits();
        mode.mvBits += bits;
        mode.totalBits += bits;
        updateModeCost(mode);
    }
    else if (m_param->rdLevel <= 1)
    {
        mode.sa8dBits++;
        mode.sa8dCost = m_rdCost.calcRdSADCost(mode.distortion, mode.sa8dBits);
    }
    else
    {
        mode.mvBits++;
        mode.totalBits++;
        updateModeCost(mode);
    }
}
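
/* A CU that codes no residual signals no delta-QP, so its QP must be reset to
 * the reference QP that a decoder would infer */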
void Analysis::checkDQP(CUData& cu, const CUGeom& cuGeom)
{
    if (m_slice->m_pps->bUseDQP && cuGeom.depth <= m_slice->m_pps->maxCuDQPDepth)
    {
        if (cu.m_cuDepth[0] > cuGeom.depth) // detect splits
        {
            bool hasResidual = false;
            for (uint32_t absPartIdx = 0; absPartIdx < cu.m_numPartitions; absPartIdx++)
            {
                if (cu.getQtRootCbf(absPartIdx))
                {
                    hasResidual = true;
                    break;
                }
            }
            if (hasResidual)
                cu.setQPSubCUs(cu.getRefQP(0), 0, cuGeom.depth);
            else
                cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
        }
        else
        {
            if (!cu.getCbf(0, TEXT_LUMA, 0) && !cu.getCbf(0, TEXT_CHROMA_U, 0) && !cu.getCbf(0, TEXT_CHROMA_V, 0))
                cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
        }
    }
}

uint32_t Analysis::topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom)
{
    /* Do not attempt to code a block larger than the largest block in the
     * co-located CTUs in L0 and L1 */
    int currentQP = parentCTU.m_qp[0];
    int previousQP = currentQP;
    uint32_t minDepth0 = 4, minDepth1 = 4;
    uint32_t sum = 0;
    int numRefs = 0;
    if (m_slice->m_numRefIdx[0])
    {
        numRefs++;
        const CUData& cu = *m_slice->m_refPicList[0][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
        previousQP = cu.m_qp[0];
        if (!cu.m_cuDepth[cuGeom.encodeIdx])
            return 0;
        for (uint32_t i = 0; i < cuGeom.numPartitions && minDepth0; i += 4)
        {
            uint32_t d = cu.m_cuDepth[cuGeom.encodeIdx + i];
            minDepth0 = X265_MIN(d, minDepth0);
            sum += d;
        }
    }
    if (m_slice->m_numRefIdx[1])
    {
        numRefs++;
        const CUData& cu = *m_slice->m_refPicList[1][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
        if (!cu.m_cuDepth[cuGeom.encodeIdx])
            return 0;
        for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
        {
            uint32_t d = cu.m_cuDepth[cuGeom.encodeIdx + i];
            minDepth1 = X265_MIN(d, minDepth1);
            sum += d;
        }
    }
    if (!numRefs)
        return 0;

    uint32_t minDepth = X265_MIN(minDepth0, minDepth1);
    uint32_t thresh = minDepth * numRefs * (cuGeom.numPartitions >> 2);
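    /* each reference contributed numPartitions/4 depth samples to 'sum', so
     * 'thresh' is the value sum would take if every sample sat at minDepth */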

    /* allow block size growth if QP is rising or the average co-located depth
     * is within 1.5x of the minimum depth */
    if (minDepth && currentQP >= previousQP && (sum <= thresh + (thresh >> 1)))
        minDepth -= 1;

    return minDepth;
}

/* returns true if recursion should be stopped */
bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode)
{
    /* early exit when the RD cost of the best mode at depth "n" is below the
     * weighted average of the RD costs previously measured at depth "n" for
     * the neighbouring CTUs (above, above-left, above-right, left) and for
     * this CTU's own co-located history */

    uint32_t depth = cuGeom.depth;
    FrameData& curEncData = *m_frame->m_encData;
    FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
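    /* avgCost[] is a running average per depth; multiply by the sample count
     * to recover totals so a combined weighted average can be formed below */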
    uint64_t cuCost = cuStat.avgCost[depth] * cuStat.count[depth];
    uint64_t cuCount = cuStat.count[depth];

    uint64_t neighCost = 0, neighCount = 0;
    const CUData* above = parentCTU.m_cuAbove;
    if (above)
    {
        FrameData::RCStatCU& astat = curEncData.m_cuStat[above->m_cuAddr];
        neighCost += astat.avgCost[depth] * astat.count[depth];
        neighCount += astat.count[depth];

        const CUData* aboveLeft = parentCTU.m_cuAboveLeft;
        if (aboveLeft)
        {
            FrameData::RCStatCU& lstat = curEncData.m_cuStat[aboveLeft->m_cuAddr];
            neighCost += lstat.avgCost[depth] * lstat.count[depth];
            neighCount += lstat.count[depth];
        }

        const CUData* aboveRight = parentCTU.m_cuAboveRight;
        if (aboveRight)
        {
            FrameData::RCStatCU& rstat = curEncData.m_cuStat[aboveRight->m_cuAddr];
            neighCost += rstat.avgCost[depth] * rstat.count[depth];
            neighCount += rstat.count[depth];
        }
    }
    const CUData* left = parentCTU.m_cuLeft;
    if (left)
    {
        FrameData::RCStatCU& nstat = curEncData.m_cuStat[left->m_cuAddr];
        neighCost += nstat.avgCost[depth] * nstat.count[depth];
        neighCount += nstat.count[depth];
    }

    // 3:2 weighting gives 60% weight to this CTU's own history and 40% to the neighbour CTUs
    if (neighCount + cuCount)
    {
        uint64_t avgCost = ((3 * cuCost) + (2 * neighCost)) / ((3 * cuCount) + (2 * neighCount));
        uint64_t curCost = m_param->rdLevel > 1 ? bestMode.rdCost : bestMode.sa8dCost;
        if (curCost < avgCost && avgCost)
            return true;
    }

    return false;
}