/*****************************************************************************
 * Copyright (C) 2013 x265 project
 *
 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
 *          Steve Borho <steve@borho.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/
#include "common.h"
#include "frame.h"
#include "framedata.h"
#include "picyuv.h"
#include "primitives.h"
#include "threading.h"

#include "analysis.h"
#include "rdcost.h"
#include "encoder.h"

using namespace x265;
/* An explanation of rate distortion levels (--rd-level)
 *
 * rd-level 0 generates no recon per CU (NO RDO or Quant)
 *
 *   sa8d selection between merge / skip / inter / intra and split
 *   no recon pixels generated until CTU analysis is complete, requiring
 *   intra predictions to use source pixels
 *
 * rd-level 1 uses RDO for merge and skip, sa8d for all else
 *
 *   RDO selection between merge and skip
 *   sa8d selection between (merge/skip) / inter modes / intra and split
 *   intra prediction uses reconstructed pixels
 *
 * rd-level 2 uses RDO for merge/skip and split
 *
 *   RDO selection between merge and skip
 *   sa8d selection between (merge/skip) / inter modes / intra
 *
 * rd-level 3 uses RDO for merge/skip/best inter/intra
 *
 *   RDO selection between merge and skip
 *   sa8d selection of best inter mode
 *   RDO selection between (merge/skip) / best inter mode / intra / split
 *
 * rd-level 4 enables RDOQuant
 *
 * rd-level 5,6 does RDO for each inter mode
 */
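/* The dispatch between these alternatives happens in compressCTU() below:
 * I slices always go through compressIntraCU(), while P/B slices select
 * compressInterCU_rd0_4(), compressInterCU_dist() (--pmode) or
 * compressInterCU_rd5_6() based on the configured rdLevel */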
Analysis::Analysis()
{
    m_totalNumJobs = m_numAcquiredJobs = m_numCompletedJobs = 0;
    m_totalNumME = m_numAcquiredME = m_numCompletedME = 0;
}
bool Analysis::create(ThreadLocalData *tld)
{
    m_tld = tld;
    m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2;

    int csp = m_param->internalCsp;
    uint32_t cuSize = g_maxCUSize;

    bool ok = true;
    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++, cuSize >>= 1)
    {
        ModeDepth &md = m_modeDepth[depth];

        md.cuMemPool.create(depth, csp, MAX_PRED_TYPES);
        ok &= md.fencYuv.create(cuSize, csp);

        for (int j = 0; j < MAX_PRED_TYPES; j++)
        {
            md.pred[j].cu.initialize(md.cuMemPool, depth, csp, j);
            ok &= md.pred[j].predYuv.create(cuSize, csp);
            ok &= md.pred[j].reconYuv.create(cuSize, csp);
            md.pred[j].fencYuv = &md.fencYuv;
        }
    }

    return ok;
}
void Analysis::destroy()
{
    for (uint32_t i = 0; i <= g_maxCUDepth; i++)
    {
        m_modeDepth[i].cuMemPool.destroy();
        m_modeDepth[i].fencYuv.destroy();

        for (int j = 0; j < MAX_PRED_TYPES; j++)
        {
            m_modeDepth[i].pred[j].predYuv.destroy();
            m_modeDepth[i].pred[j].reconYuv.destroy();
        }
    }
}
Search::Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
{
    m_slice = ctu.m_slice;
    m_frame = &frame;

    invalidateContexts(0);
    m_quant.setQPforQuant(ctu);
    m_rqt[0].cur.load(initialContext);
    m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_origPicYuv, ctu.m_cuAddr, 0);

    uint32_t numPartition = ctu.m_numPartitions;
    if (m_slice->m_sliceType == I_SLICE)
    {
        uint32_t zOrder = 0;
        if (m_param->analysisMode == X265_ANALYSIS_LOAD)
            compressIntraCU(ctu, cuGeom, m_frame->m_intraData, zOrder);
        else
        {
            compressIntraCU(ctu, cuGeom, NULL, zOrder);

            if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_intraData)
            {
                CUData *bestCU = &m_modeDepth[0].bestMode->cu;
                memcpy(&m_frame->m_intraData->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
                memcpy(&m_frame->m_intraData->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
                memcpy(&m_frame->m_intraData->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
                m_frame->m_intraData->cuAddr[ctu.m_cuAddr] = ctu.m_cuAddr;
                m_frame->m_intraData->poc[ctu.m_cuAddr] = m_frame->m_poc;
            }
        }
    }
    else
    {
        if (!m_param->rdLevel)
        {
            /* In RD Level 0/1, copy source pixels into the reconstructed block so
             * they are available for intra predictions */
            m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPicYuv, ctu.m_cuAddr, 0);

            compressInterCU_rd0_4(ctu, cuGeom); // TODO: this really wants to be compressInterCU_rd0_1

            /* generate residual for entire CTU at once and copy to reconPic */
            encodeResidue(ctu, cuGeom);
        }
        else if (m_param->bDistributeModeAnalysis && m_param->rdLevel >= 2)
            compressInterCU_dist(ctu, cuGeom);
        else if (m_param->rdLevel <= 4)
            compressInterCU_rd0_4(ctu, cuGeom);
        else
            compressInterCU_rd5_6(ctu, cuGeom);
    }

    return *m_modeDepth[0].bestMode;
}
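/* tryLossless() re-encodes the current best mode of a CU with transquant
 * bypass (lossless) coding and lets checkBestMode() keep whichever version
 * has the lower RD cost; it is only reached when --cu-lossless enabled
 * m_bTryLossless in create() */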
void Analysis::tryLossless(const CUGeom& cuGeom)
{
    ModeDepth& md = m_modeDepth[cuGeom.depth];

    if (!md.bestMode->distortion)
    {
        /* already lossless */
    }
    else if (md.bestMode->cu.m_predMode[0] == MODE_INTRA)
    {
        md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
        PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
        uint8_t* modes = md.pred[PRED_LOSSLESS].cu.m_lumaIntraDir;
        checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size, modes);
        checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
    }
    else
    {
        md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
        md.pred[PRED_LOSSLESS].predYuv.copyFromYuv(md.bestMode->predYuv);
        encodeResAndCalcRdInterCU(md.pred[PRED_LOSSLESS], cuGeom);
        checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
    }
}
void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x265_intra_data* shared, uint32_t& zOrder)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);

    if (shared)
    {
        uint8_t* sharedDepth = &shared->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
        char* sharedPartSizes = &shared->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
        uint8_t* sharedModes = &shared->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];

        if (mightNotSplit && depth == sharedDepth[zOrder] && zOrder == cuGeom.encodeIdx)
        {
            m_quant.setQPforQuant(parentCTU);

            PartSize size = (PartSize)sharedPartSizes[zOrder];
            Mode& mode = size == SIZE_2Nx2N ? md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN];
            mode.cu.initSubCU(parentCTU, cuGeom);
            checkIntra(mode, cuGeom, size, sharedModes);
            checkBestMode(mode, depth);

            if (mightSplit)
                addSplitFlagCost(*md.bestMode, cuGeom.depth);

            // increment zOrder offset to point to next best depth in sharedDepth buffer
            zOrder += g_depthInc[g_maxCUDepth - 1][sharedDepth[zOrder]];

            mightSplit = false;
        }
    }
    else if (mightNotSplit)
    {
        m_quant.setQPforQuant(parentCTU);

        md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL);
        checkBestMode(md.pred[PRED_INTRA], depth);

        if (depth == g_maxCUDepth)
        {
            md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL);
            checkBestMode(md.pred[PRED_INTRA_NxN], depth);
        }

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    if (mightSplit)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childCuData.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
                m_rqt[nextDepth].cur.load(*nextContext);
                compressIntraCU(parentCTU, childCuData, shared, zOrder);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);
                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
            {
                /* record the depth of this non-present sub-CU */
                splitCU->setEmptyPart(childCuData, subPartIdx);
                zOrder += g_depthInc[g_maxCUDepth - 1][nextDepth];
            }
        }
        nextContext->store(splitPred->contexts);

        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            updateModeCost(*splitPred);

        checkBestMode(*splitPred, depth);
    }

    checkDQP(md.bestMode->cu, cuGeom);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT])
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, parentCTU.m_cuAddr, cuGeom.encodeIdx);
}
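/* findJob() is the JobProvider entry point used by idle worker threads while
 * --pmode jobs are queued. Work is claimed through atomic counters: first the
 * per-partition mode analysis jobs, then the per-reference motion estimation
 * tasks. The thread that completes the last job of a batch signals the
 * waiting master through a completion event */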
bool Analysis::findJob(int threadId)
{
    /* try to acquire a CU mode to analyze */
    if (m_totalNumJobs > m_numAcquiredJobs)
    {
        /* ATOMIC_INC returns the incremented value */
        int id = ATOMIC_INC(&m_numAcquiredJobs);
        if (m_totalNumJobs >= id)
        {
            parallelModeAnalysis(threadId, id - 1);

            if (ATOMIC_INC(&m_numCompletedJobs) == m_totalNumJobs)
                m_modeCompletionEvent.trigger();
            return true;
        }
    }

    /* else try to acquire a motion estimation task */
    if (m_totalNumME > m_numAcquiredME)
    {
        int id = ATOMIC_INC(&m_numAcquiredME);
        if (m_totalNumME >= id)
        {
            parallelME(threadId, id - 1);

            if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME)
                m_meCompletionEvent.trigger();
            return true;
        }
    }

    return false;
}
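/* parallelME() performs the motion search against a single reference; meId
 * indexes the combined L0+L1 reference list, and a threadId of -1 means the
 * master thread is executing the job itself from within findJob() */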
void Analysis::parallelME(int threadId, int meId)
{
    Analysis* slave;

    if (threadId == -1)
        slave = this;
    else
    {
        slave = &m_tld[threadId].analysis;
        slave->setQP(*m_slice, m_rdCost.m_qp);
        slave->m_slice = m_slice;
        slave->m_frame = m_frame;

        PicYuv* fencPic = m_frame->m_origPicYuv;
        pixel* pu = fencPic->getLumaAddr(m_curMECu->m_cuAddr, m_curGeom->encodeIdx + m_puAbsPartIdx);
        slave->m_me.setSourcePlane(fencPic->m_picOrg[0], fencPic->m_stride);
        slave->m_me.setSourcePU(pu - fencPic->m_picOrg[0], m_puWidth, m_puHeight);

        slave->prepMotionCompensation(*m_curMECu, *m_curGeom, m_curPart);
    }

    if (meId < m_slice->m_numRefIdx[0])
        slave->singleMotionEstimation(*this, *m_curMECu, *m_curGeom, m_curPart, 0, meId);
    else
        slave->singleMotionEstimation(*this, *m_curMECu, *m_curGeom, m_curPart, 1, meId - m_slice->m_numRefIdx[0]);
}
void Analysis::parallelModeAnalysis(int threadId, int jobId)
{
    Analysis* slave;

    if (threadId == -1)
        slave = this;
    else
    {
        slave = &m_tld[threadId].analysis;
        slave->m_slice = m_slice;
        slave->m_frame = m_frame;
        slave->setQP(*m_slice, m_rdCost.m_qp);
        slave->invalidateContexts(0);
        if (jobId)
            slave->m_me.setSourcePlane(m_frame->m_origPicYuv->m_picOrg[0], m_frame->m_origPicYuv->m_stride);
    }

    ModeDepth& md = m_modeDepth[m_curGeom->depth];

    if (m_param->rdLevel <= 4)
    {
        switch (jobId)
        {
        case 0:
            if (slave != this)
                slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur);
            slave->checkIntraInInter_rd0_4(md.pred[PRED_INTRA], *m_curGeom);
            if (m_param->rdLevel > 2)
                slave->encodeIntraInInter(md.pred[PRED_INTRA], *m_curGeom);
            break;

        case 1:
            slave->checkInter_rd0_4(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N);
            break;

        case 2:
            slave->checkInter_rd0_4(md.pred[PRED_Nx2N], *m_curGeom, SIZE_Nx2N);
            break;

        case 3:
            slave->checkInter_rd0_4(md.pred[PRED_2NxN], *m_curGeom, SIZE_2NxN);
            break;

        case 4:
            slave->checkInter_rd0_4(md.pred[PRED_2NxnU], *m_curGeom, SIZE_2NxnU);
            break;

        case 5:
            slave->checkInter_rd0_4(md.pred[PRED_2NxnD], *m_curGeom, SIZE_2NxnD);
            break;

        case 6:
            slave->checkInter_rd0_4(md.pred[PRED_nLx2N], *m_curGeom, SIZE_nLx2N);
            break;

        case 7:
            slave->checkInter_rd0_4(md.pred[PRED_nRx2N], *m_curGeom, SIZE_nRx2N);
            break;

        default:
            X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
            break;
        }
    }
    else
    {
        bool bMergeOnly = m_curGeom->log2CUSize == 6;
        if (slave != this)
        {
            slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur);
            slave->m_quant.setQPforQuant(md.pred[PRED_2Nx2N].cu);
        }

        switch (jobId)
        {
        case 0:
            slave->checkIntra(md.pred[PRED_INTRA], *m_curGeom, SIZE_2Nx2N, NULL);
            if (m_curGeom->depth == g_maxCUDepth && m_curGeom->log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                slave->checkIntra(md.pred[PRED_INTRA_NxN], *m_curGeom, SIZE_NxN, NULL);
            break;

        case 1:
            slave->checkInter_rd5_6(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N, false);
            break;

        case 2:
            slave->checkInter_rd5_6(md.pred[PRED_Nx2N], *m_curGeom, SIZE_Nx2N, false);
            break;

        case 3:
            slave->checkInter_rd5_6(md.pred[PRED_2NxN], *m_curGeom, SIZE_2NxN, false);
            break;

        case 4:
            slave->checkInter_rd5_6(md.pred[PRED_2NxnU], *m_curGeom, SIZE_2NxnU, bMergeOnly);
            break;

        case 5:
            slave->checkInter_rd5_6(md.pred[PRED_2NxnD], *m_curGeom, SIZE_2NxnD, bMergeOnly);
            break;

        case 6:
            slave->checkInter_rd5_6(md.pred[PRED_nLx2N], *m_curGeom, SIZE_nLx2N, bMergeOnly);
            break;

        case 7:
            slave->checkInter_rd5_6(md.pred[PRED_nRx2N], *m_curGeom, SIZE_nRx2N, bMergeOnly);
            break;

        default:
            X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
            break;
        }
    }
}
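/* compressInterCU_dist() distributes the mode decisions of one CU across the
 * thread pool (--pmode). The master enqueues up to eight jobs, helps drain
 * the queue via findJob(), and keeps the merge analysis for itself so it
 * rarely has to block on another thread */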
void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    uint32_t cuAddr = parentCTU.m_cuAddr;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
    uint32_t minDepth = m_param->rdLevel <= 4 ? topSkipMinDepth(parentCTU, cuGeom) : 0;

    X265_CHECK(m_param->rdLevel >= 2, "compressInterCU_dist does not support RD 0 or 1\n");

    if (mightNotSplit && depth >= minDepth)
    {
        int bTryAmp = m_slice->m_sps->maxAMPDepth > depth && (cuGeom.log2CUSize < 6 || m_param->rdLevel > 4);
        int bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;

        /* Initialize all prediction CUs based on parentCTU */
        md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
        if (m_param->bEnableRectInter)
        {
            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
        }
        if (bTryAmp)
        {
            md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
        }
        if (bTryIntra)
        {
            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
            if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
        }

        m_totalNumJobs = 2 + m_param->bEnableRectInter * 2 + bTryAmp * 4;
        m_numAcquiredJobs = !bTryIntra;
        m_numCompletedJobs = m_numAcquiredJobs;
        m_curGeom = &cuGeom;
        m_bJobsQueued = true;
        JobProvider::enqueue();

        for (int i = 0; i < m_totalNumJobs - m_numCompletedJobs; i++)
            m_pool->pokeIdleThread();

        /* participate in processing jobs, until all are distributed */
        while (findJob(-1))
            ;

        JobProvider::dequeue();
        m_bJobsQueued = false;

        /* the master worker thread (this one) does merge analysis. By doing
         * merge after all the other jobs are at least started, we usually avoid
         * blocking on another thread */

        if (m_param->rdLevel <= 4)
        {
            checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);

            m_modeCompletionEvent.wait();

            /* select best inter mode based on sa8d cost */
            Mode *bestInter = &md.pred[PRED_2Nx2N];

            if (m_param->bEnableRectInter)
            {
                if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_Nx2N];
                if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxN];
            }

            if (bTryAmp)
            {
                if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxnU];
                if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxnD];
                if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_nLx2N];
                if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_nRx2N];
            }

            if (m_param->rdLevel > 2)
            {
                /* encode best inter */
                for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
                {
                    prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
                    motionCompensation(bestInter->predYuv, false, true);
                }
                encodeResAndCalcRdInterCU(*bestInter, cuGeom);

                /* RD selection between merge, inter and intra */
                checkBestMode(*bestInter, depth);

                if (bTryIntra)
                    checkBestMode(md.pred[PRED_INTRA], depth);
            }
            else /* m_param->rdLevel == 2 */
            {
                if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = bestInter;

                if (bTryIntra && md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
                {
                    md.bestMode = &md.pred[PRED_INTRA];
                    encodeIntraInInter(*md.bestMode, cuGeom);
                }
                else if (!md.bestMode->cu.m_mergeFlag[0])
                {
                    /* finally code the best mode selected from SA8D costs */
                    for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++)
                    {
                        prepMotionCompensation(md.bestMode->cu, cuGeom, puIdx);
                        motionCompensation(md.bestMode->predYuv, false, true);
                    }
                    encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
                }
            }
        }
        else
        {
            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
            m_modeCompletionEvent.wait();

            checkBestMode(md.pred[PRED_2Nx2N], depth);

            if (m_param->bEnableRectInter)
            {
                checkBestMode(md.pred[PRED_Nx2N], depth);
                checkBestMode(md.pred[PRED_2NxN], depth);
            }

            if (bTryAmp)
            {
                checkBestMode(md.pred[PRED_2NxnU], depth);
                checkBestMode(md.pred[PRED_2NxnD], depth);
                checkBestMode(md.pred[PRED_nLx2N], depth);
                checkBestMode(md.pred[PRED_nRx2N], depth);
            }

            if (bTryIntra)
            {
                checkBestMode(md.pred[PRED_INTRA], depth);
                if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                    checkBestMode(md.pred[PRED_INTRA_NxN], depth);
            }
        }

        if (md.bestMode->rdCost == MAX_INT64 && !bTryIntra)
        {
            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
            checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom);
            encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
            checkBestMode(md.pred[PRED_INTRA], depth);
        }

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    bool bNoSplit = false;
    if (md.bestMode)
    {
        bNoSplit = !!md.bestMode->cu.isSkipped(0);
        if (mightSplit && depth && depth >= minDepth && !bNoSplit && m_param->rdLevel <= 4)
            bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
    }

    if (mightSplit && !bNoSplit)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childCuData.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
                m_rqt[nextDepth].cur.load(*nextContext);
                compressInterCU_dist(parentCTU, childCuData);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);

                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
                splitCU->setEmptyPart(childCuData, subPartIdx);
        }
        nextContext->store(splitPred->contexts);

        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            updateModeCost(*splitPred);

        checkBestMode(*splitPred, depth);
    }

    if (!depth || md.bestMode->cu.m_predMode[0] != MODE_INTRA)
    {
        /* early-out statistics */
        FrameData& curEncData = const_cast<FrameData&>(*m_frame->m_encData);
        FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
        uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
        cuStat.count[depth] += 1;
        cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
    }

    checkDQP(md.bestMode->cu, cuGeom);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT])
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cuAddr, cuGeom.encodeIdx);
}
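/* compressInterCU_rd0_4() is the serial analysis path for --rd 0 through 4.
 * Mode decisions are primarily sa8d-based; full RD cost is computed only for
 * the surviving candidates as the rdLevel rises (see the rate distortion
 * level table at the top of this file) */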
void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    uint32_t cuAddr = parentCTU.m_cuAddr;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
    uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);

    if (mightNotSplit && depth >= minDepth)
    {
        bool bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;

        /* Initialize all prediction CUs based on parentCTU */
        md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
        if (m_param->bEnableRectInter)
        {
            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
        }
        if (m_slice->m_sps->maxAMPDepth > depth && cuGeom.log2CUSize < 6)
        {
            md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
        }

        /* Compute Merge Cost */
        checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);

        bool earlyskip = false;
        if (m_param->rdLevel)
            earlyskip = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth

        if (!earlyskip)
        {
            checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N);
            Mode *bestInter = &md.pred[PRED_2Nx2N];

            if (m_param->bEnableRectInter)
            {
                checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N);
                if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_Nx2N];
                checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN);
                if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxN];
            }

            if (m_slice->m_sps->maxAMPDepth > depth && cuGeom.log2CUSize < 6)
            {
                bool bHor = false, bVer = false;
                if (bestInter->cu.m_partSize[0] == SIZE_2NxN)
                    bHor = true;
                else if (bestInter->cu.m_partSize[0] == SIZE_Nx2N)
                    bVer = true;
                else if (bestInter->cu.m_partSize[0] == SIZE_2Nx2N &&
                         md.bestMode && md.bestMode->cu.getQtRootCbf(0))
                {
                    bHor = true;
                    bVer = true;
                }

                if (bHor)
                {
                    checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU);
                    if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_2NxnU];
                    checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD);
                    if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_2NxnD];
                }
                if (bVer)
                {
                    checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N);
                    if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_nLx2N];
                    checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N);
                    if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_nRx2N];
                }
            }

            if (m_param->rdLevel >= 3)
            {
                /* Calculate RD cost of best inter option */
                for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
                {
                    prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
                    motionCompensation(bestInter->predYuv, false, true);
                }
                encodeResAndCalcRdInterCU(*bestInter, cuGeom);

                if (!md.bestMode || bestInter->rdCost < md.bestMode->rdCost)
                    md.bestMode = bestInter;

                if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) ||
                    md.bestMode->sa8dCost == MAX_INT64)
                {
                    md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
                    checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom);
                    encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
                    if (md.pred[PRED_INTRA].rdCost < md.bestMode->rdCost)
                        md.bestMode = &md.pred[PRED_INTRA];
                }
            }
            else
            {
                /* SA8D choice between merge/skip, inter, and intra */
                if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = bestInter;

                if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64)
                {
                    md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
                    checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom);
                    if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
                        md.bestMode = &md.pred[PRED_INTRA];
                }

                /* finally code the best mode selected by SA8D costs:
                 * RD level 2 - fully encode the best mode
                 * RD level 1 - generate recon pixels
                 * RD level 0 - generate chroma prediction */
                if (md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)
                {
                    /* prediction already generated for this CU, and if rd level
                     * is not 0, it is already fully encoded */
                }
                else if (md.bestMode->cu.m_predMode[0] == MODE_INTER)
                {
                    for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++)
                    {
                        prepMotionCompensation(md.bestMode->cu, cuGeom, puIdx);
                        motionCompensation(md.bestMode->predYuv, false, true);
                    }
                    if (m_param->rdLevel == 2)
                        encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
                    else if (m_param->rdLevel == 1)
                    {
                        m_rqt[cuGeom.depth].tmpResiYuv.subtract(md.fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize);
                        generateCoeffRecon(*md.bestMode, cuGeom);
                    }
                }
                else
                {
                    if (m_param->rdLevel == 2)
                        encodeIntraInInter(*md.bestMode, cuGeom);
                    else if (m_param->rdLevel == 1)
                        generateCoeffRecon(*md.bestMode, cuGeom);
                }
            }
        }

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    bool bNoSplit = false;
    if (md.bestMode)
    {
        bNoSplit = !!md.bestMode->cu.isSkipped(0);
        if (mightSplit && depth && depth >= minDepth && !bNoSplit)
            bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
    }

    if (mightSplit && !bNoSplit)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childCuData.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
                m_rqt[nextDepth].cur.load(*nextContext);
                compressInterCU_rd0_4(parentCTU, childCuData);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);

                if (m_param->rdLevel)
                    nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
                else
                    nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childCuData.numPartitions * subPartIdx);
                if (m_param->rdLevel > 1)
                    nextContext = &nd.bestMode->contexts;
            }
            else
                splitCU->setEmptyPart(childCuData, subPartIdx);
        }
        nextContext->store(splitPred->contexts);

        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else if (m_param->rdLevel <= 1)
            splitPred->sa8dCost = m_rdCost.calcRdSADCost(splitPred->distortion, splitPred->sa8dBits);
        else
            updateModeCost(*splitPred);

        if (!md.bestMode)
            md.bestMode = splitPred;
        else if (m_param->rdLevel >= 1)
        {
            if (splitPred->rdCost < md.bestMode->rdCost)
                md.bestMode = splitPred;
        }
        else
        {
            if (splitPred->sa8dCost < md.bestMode->sa8dCost)
                md.bestMode = splitPred;
        }
    }

    if (!depth || md.bestMode->cu.m_predMode[0] != MODE_INTRA)
    {
        /* early-out statistics */
        FrameData& curEncData = const_cast<FrameData&>(*m_frame->m_encData);
        FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
        uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
        cuStat.count[depth] += 1;
        cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
    }

    checkDQP(md.bestMode->cu, cuGeom);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT] && m_param->rdLevel)
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cuAddr, cuGeom.encodeIdx);
}
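/* compressInterCU_rd5_6() is the exhaustive path for --rd 5 and 6: every
 * enabled partition size is fully encoded and compared purely by RD cost
 * through checkBestMode() */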
void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);

    if (mightNotSplit)
    {
        for (int i = 0; i < MAX_PRED_TYPES; i++)
            md.pred[i].cu.initSubCU(parentCTU, cuGeom);

        checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
        bool earlySkip = m_param->bEnableEarlySkip && md.bestMode && !md.bestMode->cu.getQtRootCbf(0);

        if (!earlySkip)
        {
            checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, false);
            checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);

            if (m_param->bEnableRectInter)
            {
                if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                {
                    checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, false);
                    checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);
                }
                if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                {
                    checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, false);
                    checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
                }
            }

            // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N)
            if (m_slice->m_sps->maxAMPDepth > depth)
            {
                bool bMergeOnly = cuGeom.log2CUSize == 6;

                bool bHor = false, bVer = false;
                if (md.bestMode->cu.m_partSize[0] == SIZE_2NxN)
                    bHor = true;
                else if (md.bestMode->cu.m_partSize[0] == SIZE_Nx2N)
                    bVer = true;
                else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0] && !md.bestMode->cu.isSkipped(0))
                {
                    bHor = true;
                    bVer = true;
                }

                if (bHor)
                {
                    if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                    {
                        checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, bMergeOnly);
                        checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);
                    }
                    if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                    {
                        checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, bMergeOnly);
                        checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
                    }
                }
                if (bVer)
                {
                    if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                    {
                        checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, bMergeOnly);
                        checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);
                    }
                    if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                    {
                        checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, bMergeOnly);
                        checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
                    }
                }
            }

            if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) &&
                (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)))
            {
                checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL);
                checkBestMode(md.pred[PRED_INTRA], depth);

                if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                {
                    checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL);
                    checkBestMode(md.pred[PRED_INTRA_NxN], depth);
                }
            }
        }

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    // estimate split cost
    if (mightSplit && (!md.bestMode || !md.bestMode->cu.isSkipped(0)))
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childCuData.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
                m_rqt[nextDepth].cur.load(*nextContext);
                compressInterCU_rd5_6(parentCTU, childCuData);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);
                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
                splitCU->setEmptyPart(childCuData, subPartIdx);
        }
        nextContext->store(splitPred->contexts);

        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            updateModeCost(*splitPred);

        checkBestMode(*splitPred, depth);
    }

    checkDQP(md.bestMode->cu, cuGeom);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT])
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, parentCTU.m_cuAddr, cuGeom.encodeIdx);
}
/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    Yuv *fencYuv = &md.fencYuv;

    /* Note that these two Mode instances are named MERGE and SKIP but they may
     * hold the reverse when the function returns. We toggle between the two modes */
    Mode* tempPred = &merge;
    Mode* bestPred = &skip;

    X265_CHECK(m_slice->m_sliceType != I_SLICE, "Evaluating merge in I slice\n");

    tempPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
    tempPred->cu.setPredModeSubParts(MODE_INTER);
    tempPred->cu.m_mergeFlag[0] = true;

    bestPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
    bestPred->cu.setPredModeSubParts(MODE_INTER);
    bestPred->cu.m_mergeFlag[0] = true;

    MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
    uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS];
    uint32_t maxNumMergeCand = tempPred->cu.getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours);

    bestPred->sa8dCost = MAX_INT64;
    int bestSadCand = -1;
    int sizeIdx = cuGeom.log2CUSize - 2;
    for (uint32_t i = 0; i < maxNumMergeCand; ++i)
    {
        if (m_bFrameParallel &&
            (mvFieldNeighbours[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
             mvFieldNeighbours[i][1].mv.y >= (m_param->searchRange + 1) * 4))
            continue;

        tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx
        tempPred->cu.m_interDir[0] = interDirNeighbours[i];
        tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
        tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx;
        tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
        tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx;

        // do MC only for Luma part
        prepMotionCompensation(tempPred->cu, cuGeom, 0);
        motionCompensation(tempPred->predYuv, true, false);

        tempPred->sa8dBits = getTUBits(i, maxNumMergeCand);
        tempPred->distortion = primitives.sa8d[sizeIdx](fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
        tempPred->sa8dCost = m_rdCost.calcRdSADCost(tempPred->distortion, tempPred->sa8dBits);

        if (tempPred->sa8dCost < bestPred->sa8dCost)
        {
            bestSadCand = i;
            std::swap(tempPred, bestPred);
        }
    }

    /* force mode decision to take inter or intra */
    if (bestSadCand < 0)
        return;

    /* calculate the motion compensation for chroma for the best mode selected */
    prepMotionCompensation(bestPred->cu, cuGeom, 0);
    motionCompensation(bestPred->predYuv, false, true);

    if (m_param->rdLevel)
    {
        if (m_param->bLossless)
            bestPred->rdCost = MAX_INT64;
        else
            encodeResAndCalcRdSkipCU(*bestPred);

        /* Encode with residual */
        tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand;
        tempPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0);
        tempPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0);
        tempPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
        tempPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0);
        tempPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
        tempPred->sa8dCost = bestPred->sa8dCost;
        tempPred->predYuv.copyFromYuv(bestPred->predYuv);

        encodeResAndCalcRdInterCU(*tempPred, cuGeom);

        md.bestMode = tempPred->rdCost < bestPred->rdCost ? tempPred : bestPred;
    }
    else
        md.bestMode = bestPred;

    /* broadcast sets of MV field data */
    bestPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0);
    bestPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0);
    bestPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
    bestPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0);
    bestPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
}
/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;

    /* Note that these two Mode instances are named MERGE and SKIP but they may
     * hold the reverse when the function returns. We toggle between the two modes */
    Mode* tempPred = &merge;
    Mode* bestPred = &skip;

    merge.cu.setPredModeSubParts(MODE_INTER);
    merge.cu.setPartSizeSubParts(SIZE_2Nx2N);
    merge.cu.m_mergeFlag[0] = true;

    skip.cu.setPredModeSubParts(MODE_INTER);
    skip.cu.setPartSizeSubParts(SIZE_2Nx2N);
    skip.cu.m_mergeFlag[0] = true;

    MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
    uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS];
    uint32_t maxNumMergeCand = merge.cu.getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours);

    bool foundCbf0Merge = false;
    bool triedPZero = false, triedBZero = false;
    bestPred->rdCost = MAX_INT64;
    for (uint32_t i = 0; i < maxNumMergeCand; i++)
    {
        if (m_bFrameParallel &&
            (mvFieldNeighbours[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
             mvFieldNeighbours[i][1].mv.y >= (m_param->searchRange + 1) * 4))
            continue;

        /* the merge candidate list is packed with MV(0,0) ref 0 when it is not full */
        if (interDirNeighbours[i] == 1 && !mvFieldNeighbours[i][0].mv.word && !mvFieldNeighbours[i][0].refIdx)
        {
            if (triedPZero)
                continue;
            triedPZero = true;
        }
        else if (interDirNeighbours[i] == 3 &&
                 !mvFieldNeighbours[i][0].mv.word && !mvFieldNeighbours[i][0].refIdx &&
                 !mvFieldNeighbours[i][1].mv.word && !mvFieldNeighbours[i][1].refIdx)
        {
            if (triedBZero)
                continue;
            triedBZero = true;
        }

        tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; /* merge candidate ID is stored in L0 MVP idx */
        tempPred->cu.m_interDir[0] = interDirNeighbours[i];
        tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
        tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx;
        tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
        tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx;
        tempPred->cu.setSkipFlagSubParts(false); /* must be cleared between encode iterations */

        prepMotionCompensation(tempPred->cu, cuGeom, 0);
        motionCompensation(tempPred->predYuv, true, true);

        uint8_t hasCbf = true;
        bool swapped = false;
        if (!foundCbf0Merge)
        {
            /* if the best prediction has CBF (not a skip) then try merge with residual */
            encodeResAndCalcRdInterCU(*tempPred, cuGeom);
            hasCbf = tempPred->cu.getQtRootCbf(0);
            foundCbf0Merge = !hasCbf;

            if (tempPred->rdCost < bestPred->rdCost)
            {
                std::swap(tempPred, bestPred);
                swapped = true;
            }
        }
        if (!m_param->bLossless && hasCbf)
        {
            /* try merge without residual (skip), if not lossless coding */
            if (swapped)
            {
                tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;
                tempPred->cu.m_interDir[0] = interDirNeighbours[i];
                tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
                tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx;
                tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
                tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx;
                tempPred->cu.setSkipFlagSubParts(false);
                tempPred->predYuv.copyFromYuv(bestPred->predYuv);
            }

            encodeResAndCalcRdSkipCU(*tempPred);

            if (tempPred->rdCost < bestPred->rdCost)
                std::swap(tempPred, bestPred);
        }
    }

    if (bestPred->rdCost < MAX_INT64)
    {
        m_modeDepth[depth].bestMode = bestPred;

        /* broadcast sets of MV field data */
        uint32_t bestCand = bestPred->cu.m_mvpIdx[0][0];
        bestPred->cu.setPUInterDir(interDirNeighbours[bestCand], 0, 0);
        bestPred->cu.setPUMv(0, mvFieldNeighbours[bestCand][0].mv, 0, 0);
        bestPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestCand][0].refIdx, 0, 0);
        bestPred->cu.setPUMv(1, mvFieldNeighbours[bestCand][1].mv, 0, 0);
        bestPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestCand][1].refIdx, 0, 0);
    }
}
void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize)
{
    interMode.initCosts();
    interMode.cu.setPartSizeSubParts(partSize);
    interMode.cu.setPredModeSubParts(MODE_INTER);

    if (predInterSearch(interMode, cuGeom, false, false))
    {
        /* predInterSearch sets interMode.sa8dBits */
        const Yuv& fencYuv = *interMode.fencYuv;
        Yuv& predYuv = interMode.predYuv;
        interMode.distortion = primitives.sa8d[cuGeom.log2CUSize - 2](fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
        interMode.sa8dCost = m_rdCost.calcRdSADCost(interMode.distortion, interMode.sa8dBits);
    }
    else
    {
        interMode.distortion = MAX_UINT;
        interMode.sa8dCost = MAX_INT64;
    }
}
void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, bool bMergeOnly)
{
    interMode.initCosts();
    interMode.cu.setPartSizeSubParts(partSize);
    interMode.cu.setPredModeSubParts(MODE_INTER);

    if (predInterSearch(interMode, cuGeom, bMergeOnly, true))
    {
        /* predInterSearch sets interMode.sa8dBits, but this is ignored */
        encodeResAndCalcRdInterCU(interMode, cuGeom);
    }
    else
    {
        interMode.distortion = MAX_UINT;
        interMode.rdCost = MAX_INT64;
    }
}
/* Note that this function does not save the best intra prediction, it must
 * be generated later. It records the best mode in the cu */
void Analysis::checkIntraInInter_rd0_4(Mode& intraMode, const CUGeom& cuGeom)
{
    CUData& cu = intraMode.cu;
    uint32_t depth = cu.m_cuDepth[0];

    cu.setPartSizeSubParts(SIZE_2Nx2N);
    cu.setPredModeSubParts(MODE_INTRA);

    uint32_t initTrDepth = 0;
    uint32_t log2TrSize = cu.m_log2CUSize[0] - initTrDepth;
    uint32_t tuSize = 1 << log2TrSize;
    const uint32_t absPartIdx = 0;

    // Reference sample smoothing
    initAdiPattern(cu, cuGeom, absPartIdx, initTrDepth, ALL_IDX);

    pixel* fenc = m_modeDepth[depth].fencYuv.m_buf[0];
    uint32_t stride = m_modeDepth[depth].fencYuv.m_size;

    pixel *above = m_refAbove + tuSize - 1;
    pixel *aboveFiltered = m_refAboveFlt + tuSize - 1;
    pixel *left = m_refLeft + tuSize - 1;
    pixel *leftFiltered = m_refLeftFlt + tuSize - 1;

    uint32_t sad, bsad;
    uint32_t bits, bbits, mode, bmode;
    uint64_t cost, bcost;

    // 33 Angle modes once
    ALIGN_VAR_32(pixel, bufScale[32 * 32]);
    ALIGN_VAR_32(pixel, bufTrans[32 * 32]);
    ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]);
    int scaleTuSize = tuSize;
    int scaleStride = stride;
    int costShift = 0;
    int sizeIdx = log2TrSize - 2;

    if (tuSize > 32)
    {
        // origin is 64x64, we scale to 32x32 and setup required parameters
        primitives.scale2D_64to32(bufScale, fenc, stride);
        fenc = bufScale;

        // reserve space in case primitives need to store data in above
        // or left buffers
        pixel _above[4 * 32 + 1];
        pixel _left[4 * 32 + 1];
        pixel *aboveScale = _above + 2 * 32;
        pixel *leftScale = _left + 2 * 32;
        aboveScale[0] = leftScale[0] = above[0];
        primitives.scale1D_128to64(aboveScale + 1, above + 1, 0);
        primitives.scale1D_128to64(leftScale + 1, left + 1, 0);

        scaleTuSize = 32;
        scaleStride = 32;
        costShift = 2;
        sizeIdx = 5 - 2; // log2(scaleTuSize) - 2

        // Filtered and Unfiltered refAbove and refLeft pointing to above and left.
        above = aboveScale;
        left = leftScale;
        aboveFiltered = aboveScale;
        leftFiltered = leftScale;
    }

    pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
    int predsize = scaleTuSize * scaleTuSize;

    m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);

    /* there are three cost tiers for intra modes:
     *  pred[0]          - mode probable, least cost
     *  pred[1], pred[2] - less probable, slightly more cost
     *  non-mpm modes    - all cost the same (rbits) */
    uint64_t mpms;
    uint32_t preds[3];
    uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms);

    // DC
    primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
    bsad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
    bmode = mode = DC_IDX;
    bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
    bcost = m_rdCost.calcRdSADCost(bsad, bbits);

    // PLANAR
    pixel *abovePlanar = above;
    pixel *leftPlanar = left;

    if (tuSize & (8 | 16 | 32))
    {
        abovePlanar = aboveFiltered;
        leftPlanar = leftFiltered;
    }

    primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
    sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
    mode = PLANAR_IDX;
    bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
    cost = m_rdCost.calcRdSADCost(sad, bits);
    COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);

    // Transpose NxN
    primitives.transpose[sizeIdx](bufTrans, fenc, scaleStride);

    primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));

    bool modeHor;
    pixel *cmp;
    intptr_t srcStride;

#define TRY_ANGLE(angle) \
    modeHor = angle < 18; \
    cmp = modeHor ? bufTrans : fenc; \
    srcStride = modeHor ? scaleTuSize : scaleStride; \
    sad = sa8d(cmp, srcStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \
    bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
    cost = m_rdCost.calcRdSADCost(sad, bits)

    if (m_param->bEnableFastIntra)
    {
        uint32_t asad = 0;
        uint32_t lowmode, highmode, amode = 5, abits = 0;
        uint64_t acost = MAX_INT64;

        /* pick the best angle, sampling at distance of 5 */
        for (mode = 5; mode < 35; mode += 5)
        {
            TRY_ANGLE(mode);
            COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
        }

        /* refine best angle at distance 2, then distance 1 */
        for (uint32_t dist = 2; dist >= 1; dist--)
        {
            lowmode = amode - dist;
            highmode = amode + dist;

            X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n");
            TRY_ANGLE(lowmode);
            COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits);

            X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n");
            TRY_ANGLE(highmode);
            COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits);
        }

        if (amode == 33)
        {
            TRY_ANGLE(34);
            COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits);
        }

        COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
    }
    else // calculate and search all intra prediction angles for lowest cost
    {
        for (mode = 2; mode < 35; mode++)
        {
            TRY_ANGLE(mode);
            COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
        }
    }

    cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTrDepth);
    intraMode.initCosts();
    intraMode.totalBits = bbits;
    intraMode.distortion = bsad;
    intraMode.sa8dCost = bcost;
    intraMode.sa8dBits = bbits;
}
void Analysis::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
{
    CUData& cu = intraMode.cu;
    Yuv* reconYuv = &intraMode.reconYuv;
    Yuv* fencYuv = &m_modeDepth[cuGeom.depth].fencYuv;

    X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
    X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");

    m_quant.setQPforQuant(cu);

    uint32_t tuDepthRange[2];
    cu.getIntraTUQtDepthRange(tuDepthRange, 0);

    m_entropyCoder.load(m_rqt[cuGeom.depth].cur);

    Cost icosts;
    codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange);
    extractIntraResultQT(cu, *reconYuv, 0, 0);

    intraMode.distortion = icosts.distortion;
    intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom);

    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
    m_entropyCoder.codeSkipFlag(cu, 0);
    m_entropyCoder.codePredMode(cu.m_predMode[0]);
    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
    m_entropyCoder.codePredInfo(cu, 0);
    intraMode.mvBits += m_entropyCoder.getNumberOfWrittenBits();

    bool bCodeDQP = m_slice->m_pps->bUseDQP;
    m_entropyCoder.codeCoeff(cu, 0, cuGeom.depth, bCodeDQP, tuDepthRange);

    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
    if (m_rdCost.m_psyRd)
        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);

    m_entropyCoder.store(intraMode.contexts);
    updateModeCost(intraMode);
}
void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
{
    if (cuGeom.depth < ctu.m_cuDepth[cuGeom.encodeIdx] && cuGeom.depth < g_maxCUDepth)
    {
        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childCuData.flags & CUGeom::PRESENT)
                encodeResidue(ctu, childCuData);
        }
        return;
    }

    uint32_t absPartIdx = cuGeom.encodeIdx;
    int sizeIdx = cuGeom.log2CUSize - 2;

    Yuv& fencYuv = m_modeDepth[0].fencYuv;

    /* reuse the bestMode data structures at the current depth */
    Mode *bestMode = m_modeDepth[cuGeom.depth].bestMode;
    Yuv& reconYuv = bestMode->reconYuv;
    CUData& cu = bestMode->cu;

    cu.copyFromPic(ctu, cuGeom);
    m_quant.setQPforQuant(cu);

    if (cu.m_predMode[0] == MODE_INTRA)
    {
        uint32_t tuDepthRange[2];
        cu.getIntraTUQtDepthRange(tuDepthRange, 0);

        uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN;
        residualTransformQuantIntra(*bestMode, cuGeom, initTrDepth, 0, tuDepthRange);
        getBestIntraModeChroma(*bestMode, cuGeom);
        residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
    }
    else if (cu.m_predMode[0] == MODE_INTER)
    {
        X265_CHECK(!ctu.m_skipFlag[absPartIdx], "skip not expected prior to transform\n");

        /* Calculate residual for current CU part into depth sized resiYuv */
        ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;

        /* at RD 0, the prediction pixels are accumulated into the top depth predYuv */
        Yuv& predYuv = m_modeDepth[0].bestMode->predYuv;
        pixel* predY = predYuv.getLumaAddr(absPartIdx);
        pixel* predU = predYuv.getCbAddr(absPartIdx);
        pixel* predV = predYuv.getCrAddr(absPartIdx);

        primitives.luma_sub_ps[sizeIdx](resiYuv.m_buf[0], resiYuv.m_size,
                                        fencYuv.getLumaAddr(absPartIdx), predY,
                                        fencYuv.m_size, predYuv.m_size);

        primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[1], resiYuv.m_csize,
                                                 fencYuv.getCbAddr(absPartIdx), predU,
                                                 fencYuv.m_csize, predYuv.m_csize);

        primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[2], resiYuv.m_csize,
                                                 fencYuv.getCrAddr(absPartIdx), predV,
                                                 fencYuv.m_csize, predYuv.m_csize);

        uint32_t tuDepthRange[2];
        cu.getInterTUQtDepthRange(tuDepthRange, 0);

        residualTransformQuantInter(*bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange);

        if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
            cu.setSkipFlagSubParts(true);

        PicYuv& reconPicYuv = *m_frame->m_reconPicYuv;
        if (cu.getQtRootCbf(0)) // TODO: split to each component
        {
            /* residualTransformQuantInter() wrote transformed residual back into
             * resiYuv. Generate the recon pixels by adding it to the prediction */
            primitives.luma_add_ps[sizeIdx](reconYuv.m_buf[0], reconYuv.m_size,
                                            predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
            primitives.chroma[m_csp].add_ps[sizeIdx](reconYuv.m_buf[1], reconYuv.m_csize,
                                                     predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
            primitives.chroma[m_csp].add_ps[sizeIdx](reconYuv.m_buf[2], reconYuv.m_csize,
                                                     predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);

            /* copy the reconstructed part to the recon pic for later intra
             * predictions within the CTU */
            reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cu.m_cuAddr, absPartIdx);
        }
        else
        {
            /* copy the prediction pixels to the recon pic for later intra
             * predictions within the CTU */
            primitives.luma_copy_pp[sizeIdx](reconPicYuv.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_stride,
                                             predY, predYuv.m_size);
            primitives.chroma[m_csp].copy_pp[sizeIdx](reconPicYuv.getCbAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_strideC,
                                                      predU, predYuv.m_csize);
            primitives.chroma[m_csp].copy_pp[sizeIdx](reconPicYuv.getCrAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_strideC,
                                                      predV, predYuv.m_csize);
        }
    }
    /* else if (cu.m_predMode[0] == MODE_NONE) {} */

    checkDQP(cu, cuGeom);
    cu.updatePic(cuGeom.depth);
}
/* check whether current try is the best with identifying the depth of current try */
void Analysis::checkBestMode(Mode& mode, uint32_t depth)
{
    ModeDepth& md = m_modeDepth[depth];
    if (md.bestMode)
    {
        if (mode.rdCost < md.bestMode->rdCost)
            md.bestMode = &mode;
    }
    else
        md.bestMode = &mode;
}
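/* addSplitFlagCost() charges the CU split flag to a mode using the cost
 * metric that matches the rdLevel: entropy-coded bits at rdLevel >= 3, a
 * flat one-bit charge folded into the sa8d or total cost otherwise */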
void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth)
{
    if (m_param->rdLevel >= 3)
    {
        /* code the split flag (0 or 1) and update bit costs */
        mode.contexts.resetBits();
        mode.contexts.codeSplitFlag(mode.cu, 0, depth);
        uint32_t bits = mode.contexts.getNumberOfWrittenBits();
        mode.mvBits += bits;
        mode.totalBits += bits;
        updateModeCost(mode);
    }
    else if (m_param->rdLevel <= 1)
    {
        mode.sa8dBits++;
        mode.sa8dCost = m_rdCost.calcRdSADCost(mode.distortion, mode.sa8dBits);
    }
    else
    {
        mode.mvBits++;
        mode.totalBits++;
        updateModeCost(mode);
    }
}
void Analysis::checkDQP(CUData& cu, const CUGeom& cuGeom)
{
    if (m_slice->m_pps->bUseDQP && cuGeom.depth <= m_slice->m_pps->maxCuDQPDepth)
    {
        if (cu.m_cuDepth[0] > cuGeom.depth) // detect splits
        {
            bool hasResidual = false;
            for (uint32_t absPartIdx = 0; absPartIdx < cu.m_numPartitions; absPartIdx++)
            {
                if (cu.getQtRootCbf(absPartIdx))
                {
                    hasResidual = true;
                    break;
                }
            }
            if (hasResidual)
                cu.setQPSubCUs(cu.getRefQP(0), 0, cuGeom.depth);
            else
                cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
        }
        else
        {
            if (!cu.getCbf(0, TEXT_LUMA, 0) && !cu.getCbf(0, TEXT_CHROMA_U, 0) && !cu.getCbf(0, TEXT_CHROMA_V, 0))
                cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
        }
    }
}
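/* topSkipMinDepth() derives a minimum analysis depth from the co-located
 * CTUs of the L0/L1 reference pictures, so the encoder does not evaluate
 * blocks larger than anything its references chose in the same area */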
uint32_t Analysis::topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom)
{
    /* Do not attempt to code a block larger than the largest block in the
     * co-located CTUs in L0 and L1 */
    int currentQP = parentCTU.m_qp[0];
    int previousQP = currentQP;
    uint32_t minDepth0 = 4, minDepth1 = 4;
    uint32_t sum = 0;
    int numRefs = 0;
    if (m_slice->m_numRefIdx[0])
    {
        numRefs++;
        const CUData& cu = *m_slice->m_refPicList[0][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
        previousQP = cu.m_qp[0];
        if (!cu.m_cuDepth[cuGeom.encodeIdx])
            return 0;
        for (uint32_t i = 0; i < cuGeom.numPartitions && minDepth0; i += 4)
        {
            uint32_t d = cu.m_cuDepth[cuGeom.encodeIdx + i];
            minDepth0 = X265_MIN(d, minDepth0);
            sum += d;
        }
    }
    if (m_slice->m_numRefIdx[1])
    {
        numRefs++;
        const CUData& cu = *m_slice->m_refPicList[1][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
        if (!cu.m_cuDepth[cuGeom.encodeIdx])
            return 0;
        for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
        {
            uint32_t d = cu.m_cuDepth[cuGeom.encodeIdx + i];
            minDepth1 = X265_MIN(d, minDepth1);
            sum += d;
        }
    }
    if (!numRefs)
        return 0;

    uint32_t minDepth = X265_MIN(minDepth0, minDepth1);
    uint32_t thresh = minDepth * numRefs * (cuGeom.numPartitions >> 2);

    /* allow block size growth if QP is raising or avg depth is
     * less than 1.5 of min depth */
    if (minDepth && currentQP >= previousQP && (sum <= thresh + (thresh >> 1)))
        minDepth -= 1;

    return minDepth;
}
/* returns true if recursion should be stopped */
bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode)
{
    /* early exit when the RD cost of best mode at depth n is less than the sum
     * of average of RD cost of the neighbor CU's(above, aboveleft, aboveright,
     * left, colocated) and avg cost of that CU at depth "n" with weightage for
     * each quantity */

    uint32_t depth = cuGeom.depth;
    FrameData& curEncData = const_cast<FrameData&>(*m_frame->m_encData);
    FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
    uint64_t cuCost = cuStat.avgCost[depth] * cuStat.count[depth];
    uint64_t cuCount = cuStat.count[depth];

    uint64_t neighCost = 0, neighCount = 0;
    const CUData* above = parentCTU.m_cuAbove;
    if (above)
    {
        FrameData::RCStatCU& astat = curEncData.m_cuStat[above->m_cuAddr];
        neighCost += astat.avgCost[depth] * astat.count[depth];
        neighCount += astat.count[depth];
    }
    const CUData* aboveLeft = parentCTU.m_cuAboveLeft;
    if (aboveLeft)
    {
        FrameData::RCStatCU& lstat = curEncData.m_cuStat[aboveLeft->m_cuAddr];
        neighCost += lstat.avgCost[depth] * lstat.count[depth];
        neighCount += lstat.count[depth];
    }
    const CUData* aboveRight = parentCTU.m_cuAboveRight;
    if (aboveRight)
    {
        FrameData::RCStatCU& rstat = curEncData.m_cuStat[aboveRight->m_cuAddr];
        neighCost += rstat.avgCost[depth] * rstat.count[depth];
        neighCount += rstat.count[depth];
    }
    const CUData* left = parentCTU.m_cuLeft;
    if (left)
    {
        FrameData::RCStatCU& nstat = curEncData.m_cuStat[left->m_cuAddr];
        neighCost += nstat.avgCost[depth] * nstat.count[depth];
        neighCount += nstat.count[depth];
    }

    // give 60% weight to all CU's and 40% weight to neighbour CU's
    if (neighCost + cuCount)
    {
        uint64_t avgCost = ((3 * cuCost) + (2 * neighCost)) / ((3 * cuCount) + (2 * neighCount));
        uint64_t curCost = m_param->rdLevel > 1 ? bestMode.rdCost : bestMode.sa8dCost;
        if (curCost < avgCost && avgCost)
            return true;
    }

    return false;
}