/*****************************************************************************
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

#include "common.h"
#include "primitives.h"
#include "picyuv.h"
#include "cudata.h"

#include "search.h"
#include "entropy.h"
#include "rdcost.h"

using namespace x265;

#if _MSC_VER
#pragma warning(disable: 4800) // 'uint8_t' : forcing value to bool 'true' or 'false' (performance warning)
#pragma warning(disable: 4244) // '=' : conversion from 'int' to 'uint8_t', possible loss of data
#endif

#define MVP_IDX_BITS 1

ALIGN_VAR_32(const pixel, Search::zeroPixel[MAX_CU_SIZE]) = { 0 };
ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 };

Search::Search() : JobProvider(NULL)
{
    memset(m_rqt, 0, sizeof(m_rqt));

    for (int i = 0; i < 3; i++)
    {
        m_qtTempTransformSkipFlag[i] = NULL;
        m_qtTempCbf[i] = NULL;
    }

    m_numLayers = 0;
    m_param = NULL;
    m_slice = NULL;
    m_frame = NULL;
    m_bJobsQueued = false;
    m_totalNumME = m_numAcquiredME = m_numCompletedME = 0;
}

bool Search::initSearch(const x265_param& param, ScalingList& scalingList)
{
    m_param = &param;
    m_bEnableRDOQ = param.rdLevel >= 4;
    m_bFrameParallel = param.frameNumThreads > 1;
    m_numLayers = g_log2Size[param.maxCUSize] - 2;

    m_rdCost.setPsyRdScale(param.psyRd);
    m_me.init(param.searchMethod, param.subpelRefine, param.internalCsp);

    bool ok = m_quant.init(m_bEnableRDOQ, param.psyRdoq, scalingList, m_entropyCoder);
    if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
        ok &= m_quant.allocNoiseReduction(param);

    ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */

    /* When frame parallelism is active, only 'refLagPixels' of reference frames will be guaranteed
     * available for motion reference. See refLagRows in FrameEncoder::compressCTURows() */
    m_refLagPixels = m_bFrameParallel ? param.searchRange : param.sourceHeight;

    uint32_t sizeL = 1 << (g_maxLog2CUSize * 2);
    uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
    uint32_t numPartitions = NUM_CU_PARTITIONS;

    /* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32
     * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
     * which are reconstructed at each depth are valid. At the end, the transform depth table
     * is walked and the coeff and recon at the correct depths are collected */
    for (uint32_t i = 0; i <= m_numLayers; i++)
    {
        CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
        m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
        m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
        ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
        ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
    }

    /* the rest of these buffers are indexed per-depth */
    for (uint32_t i = 0; i <= g_maxCUDepth; i++)
    {
        int cuSize = g_maxCUSize >> i;
        ok &= m_rqt[i].tmpResiYuv.create(cuSize, param.internalCsp);
        ok &= m_rqt[i].tmpPredYuv.create(cuSize, param.internalCsp);
        ok &= m_rqt[i].bidirPredYuv[0].create(cuSize, param.internalCsp);
        ok &= m_rqt[i].bidirPredYuv[1].create(cuSize, param.internalCsp);
    }

    CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
    m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
    m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
    CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
    m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
    m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;

    return ok;

fail:
    return false;
}
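
/* Allocation sketch (added note, not in the upstream source): CHECKED_MALLOC
 * is used above in the usual x265 pattern of jumping to the fail: label when
 * an allocation fails. As a worked example of the buffer sizing, for a 64x64
 * CTU (g_maxLog2CUSize == 6) with 4:2:0 chroma (both chroma shifts == 1):
 *   sizeL = 1 << (6 * 2)     = 4096 coefficients
 *   sizeC = 4096 >> (1 + 1)  = 1024 coefficients per chroma plane
 * so each RQT layer allocates sizeL + 2 * sizeC = 6144 coeff_t entries. */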

Search::~Search()
{
    for (uint32_t i = 0; i <= m_numLayers; i++)
    {
        X265_FREE(m_rqt[i].coeffRQT[0]);
        m_rqt[i].reconQtYuv.destroy();
        m_rqt[i].resiQtYuv.destroy();
    }

    for (uint32_t i = 0; i <= g_maxCUDepth; i++)
    {
        m_rqt[i].tmpResiYuv.destroy();
        m_rqt[i].tmpPredYuv.destroy();
        m_rqt[i].bidirPredYuv[0].destroy();
        m_rqt[i].bidirPredYuv[1].destroy();
    }

    X265_FREE(m_qtTempCbf[0]);
    X265_FREE(m_qtTempTransformSkipFlag[0]);
}

void Search::setQP(const Slice& slice, int qp)
{
    x265_emms(); /* TODO: if the lambda tables were ints, this would not be necessary */
    m_me.setQP(qp);
    m_rdCost.setQP(slice, qp);
}

#if CHECKED_BUILD || _DEBUG
void Search::invalidateContexts(int fromDepth)
{
    /* catch reads without previous writes */
    for (int d = fromDepth; d < NUM_FULL_DEPTH; d++)
    {
        m_rqt[d].cur.markInvalid();
        m_rqt[d].rqtTemp.markInvalid();
        m_rqt[d].rqtRoot.markInvalid();
        m_rqt[d].rqtTest.markInvalid();
    }
}
#else
void Search::invalidateContexts(int) {}
#endif

void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx)
{
    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
    uint32_t subdiv = tuDepth < cu.m_tuDepth[absPartIdx];
    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;

    if (!(log2TrSize - m_hChromaShift < 2))
    {
        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv);
        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv);
    }

    if (subdiv)
    {
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
            codeSubdivCbfQTChroma(cu, tuDepth + 1, absPartIdx);
    }
}
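
/* Illustrative note (added): with LOG2_UNIT_SIZE == 2 (4x4 partition units),
 * a 16x16 TU (log2TrSize == 4) that subdivides has
 * qNumParts = 1 << ((4 - 1 - 2) * 2) = 4, i.e. each 8x8 quadrant spans four
 * 4x4 partitions, so absPartIdx advances by four per quadrant in the
 * recursion above. */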

void Search::codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype)
{
    if (!cu.getCbf(absPartIdx, ttype, tuDepth))
        return;

    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;

    if (tuDepth < cu.m_tuDepth[absPartIdx])
    {
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
            codeCoeffQTChroma(cu, tuDepth + 1, absPartIdx, ttype);

        return;
    }

    uint32_t tuDepthC = tuDepth;
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;

    if (log2TrSizeC < 2)
    {
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        if (absPartIdx & 3)
            return;
        log2TrSizeC = 2;
        tuDepthC--;
    }

    uint32_t qtLayer = log2TrSize - 2;

    if (m_csp != X265_CSP_I422)
    {
        uint32_t shift = (m_csp == X265_CSP_I420) ? 2 : 0;
        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - shift);
        coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
        m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
    }
    else
    {
        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - 1);
        coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
        uint32_t subTUSize = 1 << (log2TrSizeC * 2);
        uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);
        if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
            m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
        if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
            m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, ttype);
    }
}
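
/* Note on the 4:2:2 path above (added): a chroma TU is twice as tall as it
 * is wide, so it is coded as two square sub-TUs stacked vertically.
 * tuNumParts is the number of 4x4 partitions one square half spans; e.g.
 * for log2TrSizeC == 3 it is 2 << ((3 - 2) * 2) = 8 partitions per half. */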

void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2])
{
    uint32_t fullDepth = mode.cu.m_cuDepth[0] + tuDepth;
    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
    uint32_t qtLayer = log2TrSize - 2;
    uint32_t sizeIdx = log2TrSize - 2;
    bool mightNotSplit = log2TrSize <= depthRange[1];
    bool mightSplit = (log2TrSize > depthRange[0]) && (bAllowSplit || !mightNotSplit);
    /* If maximum RD penalty, force splits at TU size 32x32 if the SPS allows TUs of 16x16 */
    if (m_param->rdPenalty == 2 && m_slice->m_sliceType != I_SLICE && log2TrSize == 5 && depthRange[0] <= 4)
    {
        mightNotSplit = false;
        mightSplit = true;
    }

    CUData& cu = mode.cu;

    Cost fullCost;
    uint32_t bCBF = 0;

    pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
    uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;

    if (mightNotSplit)
    {
        if (mightSplit)
            m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);

        const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
        pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
        int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
        uint32_t stride = mode.fencYuv->m_size;

        // init availability pattern
        uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
        initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);

        // get prediction signal
        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);

        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);

        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
        coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;

        // store original entropy coding status
        if (m_bEnableRDOQ)
            m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);

        primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);

        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
        if (numSig)
        {
            m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
            primitives.luma_add_ps[sizeIdx](reconQt, reconQtStride, pred, residual, stride, stride);
        }
        else
            // no coded residual, recon = pred
            primitives.luma_copy_pp[sizeIdx](reconQt, reconQtStride, pred, stride);

        bCBF = !!numSig << tuDepth;
        cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
        fullCost.distortion = primitives.sse_pp[sizeIdx](reconQt, reconQtStride, fenc, stride);

        m_entropyCoder.resetBits();
        if (!absPartIdx)
        {
            if (!cu.m_slice->isIntra())
            {
                if (cu.m_slice->m_pps->bTransquantBypassEnabled)
                    m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
                m_entropyCoder.codeSkipFlag(cu, 0);
                m_entropyCoder.codePredMode(cu.m_predMode[0]);
            }

            m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]);
        }
        if (cu.m_partSize[0] == SIZE_2Nx2N)
        {
            if (!absPartIdx)
                m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
        }
        else
        {
            uint32_t qNumParts = cuGeom.numPartitions >> 2;
            if (!tuDepth)
            {
                for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
                    m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
            }
            else if (!(absPartIdx & (qNumParts - 1)))
                m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
        }
        if (log2TrSize != depthRange[0])
            m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);

        m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);

        if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
            m_entropyCoder.codeCoeffNxN(cu, coeffY, absPartIdx, log2TrSize, TEXT_LUMA);

        fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();

        if (m_param->rdPenalty && log2TrSize == 5 && m_slice->m_sliceType != I_SLICE)
            fullCost.bits *= 4;

        if (m_rdCost.m_psyRd)
        {
            fullCost.energy = m_rdCost.psyCost(sizeIdx, fenc, mode.fencYuv->m_size, reconQt, reconQtStride);
            fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
        }
        else
            fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
    }
    else
        fullCost.rdcost = MAX_INT64;

    if (mightSplit)
    {
        if (mightNotSplit)
        {
            m_entropyCoder.store(m_rqt[fullDepth].rqtTest); // save state after full TU encode
            m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);  // prep state of split encode
        }

        // code split block
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;

        int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
        if (m_param->bEnableTSkipFast)
            checkTransformSkip &= cu.m_partSize[0] != SIZE_2Nx2N;

        Cost splitCost;
        uint32_t cbf = 0;
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
        {
            if (checkTransformSkip)
                codeIntraLumaTSkip(mode, cuGeom, tuDepth + 1, qPartIdx, splitCost);
            else
                codeIntraLumaQT(mode, cuGeom, tuDepth + 1, qPartIdx, bAllowSplit, splitCost, depthRange);

            cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
        }
        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
            cu.m_cbf[0][absPartIdx + offs] |= (cbf << tuDepth);

        if (mightNotSplit && log2TrSize != depthRange[0])
        {
            /* If we could have coded this TU depth, include cost of subdiv flag */
            m_entropyCoder.resetBits();
            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
            splitCost.bits += m_entropyCoder.getNumberOfWrittenBits();

            if (m_rdCost.m_psyRd)
                splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
            else
                splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
        }

        if (splitCost.rdcost < fullCost.rdcost)
        {
            outCost.rdcost += splitCost.rdcost;
            outCost.distortion += splitCost.distortion;
            outCost.bits += splitCost.bits;
            outCost.energy += splitCost.energy;
            return;
        }
        else
        {
            // recover entropy state of full-size TU encode
            m_entropyCoder.load(m_rqt[fullDepth].rqtTest);

            // recover transform index and Cbf values
            cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
            cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
            cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
        }
    }

    // set reconstruction for next intra prediction blocks if full TU prediction won
    pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
    intptr_t picStride = m_frame->m_reconPic->m_stride;
    primitives.luma_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);

    outCost.rdcost += fullCost.rdcost;
    outCost.distortion += fullCost.distortion;
    outCost.bits += fullCost.bits;
    outCost.energy += fullCost.energy;
}
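
/* Summary note (added): the full-TU encode and the four-way split above are
 * both measured from the same starting entropy state (saved in rqtRoot).
 * Whichever alternative wins contributes its bits, distortion and optional
 * psy-energy to outCost; when the full TU wins, the entropy state captured
 * in rqtTest right after the full encode is restored. */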

void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
{
    uint32_t fullDepth = mode.cu.m_cuDepth[0] + tuDepth;
    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
    uint32_t tuSize = 1 << log2TrSize;

    X265_CHECK(tuSize == MAX_TS_SIZE, "transform skip is only possible at 4x4 TUs\n");

    CUData& cu = mode.cu;
    Yuv* predYuv = &mode.predYuv;
    const Yuv* fencYuv = mode.fencYuv;

    Cost fullCost;
    fullCost.rdcost = MAX_INT64;
    int bTSkip = 0;
    uint32_t bCBF = 0;

    const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
    pixel* pred = predYuv->getLumaAddr(absPartIdx);
    int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
    uint32_t stride = fencYuv->m_size;
    int sizeIdx = log2TrSize - 2;

    // init availability pattern
    uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
    initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);

    // get prediction signal
    predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);

    cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);

    uint32_t qtLayer = log2TrSize - 2;
    uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
    coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
    pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
    uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;

    // store original entropy coding status
    m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);

    if (m_bEnableRDOQ)
        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);

    ALIGN_VAR_32(coeff_t, tsCoeffY[MAX_TS_SIZE * MAX_TS_SIZE]);
    ALIGN_VAR_32(pixel, tsReconY[MAX_TS_SIZE * MAX_TS_SIZE]);

    int checkTransformSkip = 1;
    for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
    {
        uint64_t tmpCost;
        uint32_t tmpEnergy = 0;

        coeff_t* coeff = (useTSkip ? tsCoeffY : coeffY);
        pixel* tmpRecon = (useTSkip ? tsReconY : reconQt);
        uint32_t tmpReconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);

        primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);

        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip);
        if (numSig)
        {
            m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);
            primitives.luma_add_ps[sizeIdx](tmpRecon, tmpReconStride, pred, residual, stride, stride);
        }
        else if (useTSkip)
        {
            /* do not allow tskip if CBF=0, pretend we did not try tskip */
            checkTransformSkip = 0;
            break;
        }
        else
            // no residual coded, recon = pred
            primitives.luma_copy_pp[sizeIdx](tmpRecon, tmpReconStride, pred, stride);

        uint32_t tmpDist = primitives.sse_pp[sizeIdx](tmpRecon, tmpReconStride, fenc, stride);

        cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);

        if (useTSkip)
            m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);

        m_entropyCoder.resetBits();
        if (!absPartIdx)
        {
            if (!cu.m_slice->isIntra())
            {
                if (cu.m_slice->m_pps->bTransquantBypassEnabled)
                    m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
                m_entropyCoder.codeSkipFlag(cu, 0);
                m_entropyCoder.codePredMode(cu.m_predMode[0]);
            }

            m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]);
        }
        if (cu.m_partSize[0] == SIZE_2Nx2N)
        {
            if (!absPartIdx)
                m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
        }
        else
        {
            uint32_t qNumParts = cuGeom.numPartitions >> 2;
            if (!tuDepth)
            {
                for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
                    m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
            }
            else if (!(absPartIdx & (qNumParts - 1)))
                m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
        }
        m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);

        m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);

        if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
            m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize, TEXT_LUMA);

        uint32_t tmpBits = m_entropyCoder.getNumberOfWrittenBits();

        if (!useTSkip)
            m_entropyCoder.store(m_rqt[fullDepth].rqtTemp);

        if (m_rdCost.m_psyRd)
        {
            tmpEnergy = m_rdCost.psyCost(sizeIdx, fenc, fencYuv->m_size, tmpRecon, tmpReconStride);
            tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
        }
        else
            tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);

        if (tmpCost < fullCost.rdcost)
        {
            bTSkip = useTSkip;
            bCBF = !!numSig;
            fullCost.rdcost = tmpCost;
            fullCost.distortion = tmpDist;
            fullCost.bits = tmpBits;
            fullCost.energy = tmpEnergy;
        }
    }

    if (bTSkip)
    {
        memcpy(coeffY, tsCoeffY, sizeof(coeff_t) << (log2TrSize * 2));
        primitives.luma_copy_pp[sizeIdx](reconQt, reconQtStride, tsReconY, tuSize);
    }
    else if (checkTransformSkip)
    {
        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setCbfSubParts(bCBF << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
        m_entropyCoder.load(m_rqt[fullDepth].rqtTemp);
    }

    // set reconstruction for next intra prediction blocks
    pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
    intptr_t picStride = m_frame->m_reconPic->m_stride;
    primitives.luma_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);

    outCost.rdcost += fullCost.rdcost;
    outCost.distortion += fullCost.distortion;
    outCost.bits += fullCost.bits;
    outCost.energy += fullCost.energy;
}
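
/* Note (added): the useTSkip loop above RD-measures the same 4x4 residual
 * twice, once through the DCT and once with transform skip, writing the
 * tskip trial into the local tsCoeffY/tsReconY buffers. Only if the tskip
 * pass wins are those buffers committed over the DCT results. */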

/* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */
void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, const uint32_t depthRange[2])
{
    CUData& cu = mode.cu;

    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
    bool bCheckFull = log2TrSize <= depthRange[1];

    X265_CHECK(m_slice->m_sliceType != I_SLICE, "residualTransformQuantIntra not intended for I slices\n");

    /* we still respect rdPenalty == 2, which forbids 32x32 intra TUs; rdPenalty == 1
     * does not apply here since we are not measuring RD cost */
    if (m_param->rdPenalty == 2 && log2TrSize == 5 && depthRange[0] <= 4)
        bCheckFull = false;

    if (bCheckFull)
    {
        const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
        pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
        int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
        pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
        intptr_t picStride = m_frame->m_reconPic->m_stride;
        uint32_t stride = mode.fencYuv->m_size;
        uint32_t sizeIdx = log2TrSize - 2;
        uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
        coeff_t* coeff = cu.m_trCoeff[TEXT_LUMA] + coeffOffsetY;

        initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);
        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);

        X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n");
        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);

        primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, false);
        if (numSig)
        {
            m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], residual, stride, coeff, log2TrSize, TEXT_LUMA, true, false, numSig);
            primitives.luma_add_ps[sizeIdx](picReconY, picStride, pred, residual, stride, stride);
            cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
        }
        else
        {
            primitives.luma_copy_pp[sizeIdx](picReconY, picStride, pred, stride);
            cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
        }
    }
    else
    {
        X265_CHECK(log2TrSize > depthRange[0], "intra luma split state failure\n");

        /* code split block */
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        uint32_t cbf = 0;
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
        {
            residualTransformQuantIntra(mode, cuGeom, tuDepth + 1, qPartIdx, depthRange);
            cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
        }
        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
            cu.m_cbf[TEXT_LUMA][absPartIdx + offs] |= (cbf << tuDepth);
    }
}

void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx)
{
    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;

    if (tuDepth == cu.m_tuDepth[absPartIdx])
    {
        uint32_t qtLayer = log2TrSize - 2;

        // copy transform coefficients
        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
        coeff_t* coeffSrcY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
        coeff_t* coeffDestY = cu.m_trCoeff[0] + coeffOffsetY;
        memcpy(coeffDestY, coeffSrcY, sizeof(coeff_t) << (log2TrSize * 2));

        // copy reconstruction
        m_rqt[qtLayer].reconQtYuv.copyPartToPartLuma(reconYuv, absPartIdx, log2TrSize);
    }
    else
    {
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
            extractIntraResultQT(cu, reconYuv, tuDepth + 1, absPartIdx);
    }
}

inline void offsetCBFs(uint8_t subTUCBF[2])
{
    uint8_t combinedCBF = subTUCBF[0] | subTUCBF[1];
    subTUCBF[0] = subTUCBF[0] << 1 | combinedCBF;
    subTUCBF[1] = subTUCBF[1] << 1 | combinedCBF;
}
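
/* Worked example (added): if the two 4:2:2 sub-TUs arrive with CBFs {1, 0},
 * combinedCBF is 1 and the outputs become {1 << 1 | 1, 0 << 1 | 1} = {3, 1}:
 * bit 0 now carries the shared parent-level CBF while bit 1 preserves each
 * sub-TU's own flag at the next depth. */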

/* 4:2:2 post-TU split processing */
void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx)
{
    uint32_t depth = cu.m_cuDepth[0];
    uint32_t fullDepth = depth + tuDepth;
    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;

    if (log2TrSize == 2)
    {
        X265_CHECK(m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        ++log2TrSize;
    }

    uint32_t tuNumParts = 1 << ((log2TrSize - LOG2_UNIT_SIZE) * 2 - 1);

    // move the CBFs down a level and set the parent CBF
    uint8_t subTUCBF[2];
    subTUCBF[0] = cu.getCbf(absPartIdx, ttype, tuDepth);
    subTUCBF[1] = cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth);
    offsetCBFs(subTUCBF);

    cu.setCbfPartRange(subTUCBF[0] << tuDepth, ttype, absPartIdx, tuNumParts);
    cu.setCbfPartRange(subTUCBF[1] << tuDepth, ttype, absPartIdx + tuNumParts, tuNumParts);
}

/* returns distortion */
uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy)
{
    CUData& cu = mode.cu;
    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;

    if (tuDepth < cu.m_tuDepth[absPartIdx])
    {
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        uint32_t outDist = 0, splitCbfU = 0, splitCbfV = 0;
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
        {
            outDist += codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, psyEnergy);
            splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
            splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
        }
        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
        {
            cu.m_cbf[TEXT_CHROMA_U][absPartIdx + offs] |= (splitCbfU << tuDepth);
            cu.m_cbf[TEXT_CHROMA_V][absPartIdx + offs] |= (splitCbfV << tuDepth);
        }

        return outDist;
    }

    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;

    uint32_t tuDepthC = tuDepth;
    if (log2TrSizeC < 2)
    {
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        if (absPartIdx & 3)
            return 0;
        log2TrSizeC = 2;
        tuDepthC--;
    }

    if (m_bEnableRDOQ)
        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);

    bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
    checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]);
    if (checkTransformSkip)
        return codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, psyEnergy);

    uint32_t qtLayer = log2TrSize - 2;
    uint32_t tuSize = 1 << log2TrSizeC;
    uint32_t outDist = 0;

    uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;

    for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
    {
        TextType ttype = (TextType)chromaId;

        TURecurse tuIterator(splitType, curPartNum, absPartIdx);
        do
        {
            uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;

            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
            pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
            int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
            uint32_t stride = mode.fencYuv->m_csize;
            uint32_t sizeIdxC = log2TrSizeC - 2;

            uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
            coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
            pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
            uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;

            pixel* picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
            intptr_t picStride = m_frame->m_reconPic->m_strideC;

            // init availability pattern
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
            pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);

            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
            if (chromaPredMode == DM_CHROMA_IDX)
                chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
            if (m_csp == X265_CSP_I422)
                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];

            // get prediction signal
            predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);

            cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);

            primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
            uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
            if (numSig)
            {
                m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
                primitives.luma_add_ps[sizeIdxC](reconQt, reconQtStride, pred, residual, stride, stride);
                cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
            }
            else
            {
                // no coded residual, recon = pred
                primitives.luma_copy_pp[sizeIdxC](reconQt, reconQtStride, pred, stride);
                cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
            }

            outDist += m_rdCost.scaleChromaDist(chromaId, primitives.sse_pp[sizeIdxC](reconQt, reconQtStride, fenc, stride));

            if (m_rdCost.m_psyRd)
                psyEnergy += m_rdCost.psyCost(sizeIdxC, fenc, stride, picReconC, picStride);

            primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, reconQt, reconQtStride);
        }
        while (tuIterator.isNextSection());

        if (splitType == VERTICAL_SPLIT)
            offsetSubTUCBFs(cu, ttype, tuDepth, absPartIdx);
    }

    return outDist;
}
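
/* Note on coeffOffsetC above (added, illustrative): for 4:2:0 both chroma
 * shifts are 1, so the shift amount is LOG2_UNIT_SIZE * 2 - 2 == 2 and each
 * 4x4 luma partition maps to four chroma coefficients (a 2x2 block); for
 * 4:4:4 the shift is 4 and the chroma mapping is one-to-one with luma. */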

/* returns distortion */
uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy)
{
    CUData& cu = mode.cu;
    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
    const uint32_t log2TrSizeC = 2;
    uint32_t tuSize = 4;
    uint32_t qtLayer = log2TrSize - 2;
    uint32_t outDist = 0;

    /* At the TU layers above this one, no RDO is performed, only distortion is being measured,
     * so the entropy coder is not very accurate. The best we can do is return it in the same
     * condition as it arrived, and to do all bit estimates from the same state. */
    m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);

    ALIGN_VAR_32(coeff_t, tskipCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
    ALIGN_VAR_32(pixel, tskipReconC[MAX_TS_SIZE * MAX_TS_SIZE]);

    uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;

    for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
    {
        TextType ttype = (TextType)chromaId;

        TURecurse tuIterator(splitType, curPartNum, absPartIdx);
        do
        {
            uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;

            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
            pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
            int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
            uint32_t stride = mode.fencYuv->m_csize;
            const uint32_t sizeIdxC = log2TrSizeC - 2;

            uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
            coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
            pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
            uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;

            // init availability pattern
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
            pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);

            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
            if (chromaPredMode == DM_CHROMA_IDX)
                chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
            if (m_csp == X265_CSP_I422)
                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];

            // get prediction signal
            predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);

            uint64_t bCost = MAX_INT64;
            uint32_t bDist = 0;
            uint32_t bCbf = 0;
            uint32_t bEnergy = 0;
            int bTSkip = 0;

            int checkTransformSkip = 1;
            for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
            {
                coeff_t* coeff = (useTSkip ? tskipCoeffC : coeffC);
                pixel* recon = (useTSkip ? tskipReconC : reconQt);
                uint32_t reconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);

                primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);

                uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip);
                if (numSig)
                {
                    m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
                    primitives.luma_add_ps[sizeIdxC](recon, reconStride, pred, residual, stride, stride);
                    cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
                }
                else if (useTSkip)
                {
                    checkTransformSkip = 0;
                    break;
                }
                else
                {
                    primitives.luma_copy_pp[sizeIdxC](recon, reconStride, pred, stride);
                    cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
                }
                uint32_t tmpDist = primitives.sse_pp[sizeIdxC](recon, reconStride, fenc, stride);
                tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist);

                cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);

                uint32_t tmpBits = 0, tmpEnergy = 0;
                if (numSig)
                {
                    m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
                    m_entropyCoder.resetBits();
                    m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
                    tmpBits = m_entropyCoder.getNumberOfWrittenBits();
                }

                uint64_t tmpCost;
                if (m_rdCost.m_psyRd)
                {
                    tmpEnergy = m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
                    tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
                }
                else
                    tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);

                if (tmpCost < bCost)
                {
                    bCost = tmpCost;
                    bDist = tmpDist;
                    bTSkip = useTSkip;
                    bCbf = !!numSig;
                    bEnergy = tmpEnergy;
                }
            }

            if (bTSkip)
            {
                memcpy(coeffC, tskipCoeffC, sizeof(coeff_t) << (log2TrSizeC * 2));
                primitives.luma_copy_pp[sizeIdxC](reconQt, reconQtStride, tskipReconC, MAX_TS_SIZE);
            }

            cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
            cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);

            pixel* reconPicC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
            intptr_t picStride = m_frame->m_reconPic->m_strideC;
            primitives.luma_copy_pp[sizeIdxC](reconPicC, picStride, reconQt, reconQtStride);

            outDist += bDist;
            psyEnergy += bEnergy;
        }
        while (tuIterator.isNextSection());

        if (splitType == VERTICAL_SPLIT)
            offsetSubTUCBFs(cu, ttype, tuDepth, absPartIdx);
    }

    m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
    return outDist;
}

void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth)
{
    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
    uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;

    if (tuDepthL == tuDepth || log2TrSizeC == 2)
    {
        // copy transform coefficients
        uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
        uint32_t coeffOffsetC = absPartIdx << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));

        uint32_t qtLayer = log2TrSize - 2 - (tuDepthL - tuDepth);
        coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
        coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
        coeff_t* coeffDstU = cu.m_trCoeff[1] + coeffOffsetC;
        coeff_t* coeffDstV = cu.m_trCoeff[2] + coeffOffsetC;
        memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
        memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);

        // copy reconstruction
        m_rqt[qtLayer].reconQtYuv.copyPartToPartChroma(reconYuv, absPartIdx, log2TrSizeC + m_hChromaShift);
    }
    else
    {
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
            extractIntraResultChromaQT(cu, reconYuv, absPartIdx, tuDepth + 1);
    }
}

void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx)
{
    CUData& cu = mode.cu;
    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;

    if (tuDepth == cu.m_tuDepth[absPartIdx])
    {
        uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
        uint32_t tuDepthC = tuDepth;
        if (log2TrSizeC < 2)
        {
            X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
            if (absPartIdx & 3)
                return;
            log2TrSizeC = 2;
            tuDepthC--;
        }

        ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
        uint32_t tuSize = 1 << log2TrSizeC;
        uint32_t stride = mode.fencYuv->m_csize;
        const int sizeIdxC = log2TrSizeC - 2;

        uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
        const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;

        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
        {
            TextType ttype = (TextType)chromaId;

            TURecurse tuIterator(splitType, curPartNum, absPartIdx);
            do
            {
                uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;

                const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
                pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
                int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
                pixel* recon = mode.reconYuv.getChromaAddr(chromaId, absPartIdxC); // TODO: needed?
                uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
                coeff_t* coeff = cu.m_trCoeff[ttype] + coeffOffsetC;
                pixel* picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
                uint32_t picStride = m_frame->m_reconPic->m_strideC;

                uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
                if (chromaPredMode == DM_CHROMA_IDX)
                    chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
                chromaPredMode = (m_csp == X265_CSP_I422) ? g_chroma422IntraAngleMappingTable[chromaPredMode] : chromaPredMode;
                initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
                pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);

                predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);

                X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");

                primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
                uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, false);
                if (numSig)
                {
                    m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], residual, stride, coeff, log2TrSizeC, ttype, true, false, numSig);
                    primitives.luma_add_ps[sizeIdxC](recon, stride, pred, residual, stride, stride);
                    primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, recon, stride);
                    cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
                }
                else
                {
                    primitives.luma_copy_pp[sizeIdxC](recon, stride, pred, stride);
                    primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, pred, stride);
                    cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
                }
            }
            while (tuIterator.isNextSection());

            if (splitType == VERTICAL_SPLIT)
                offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx);
        }
    }
    else
    {
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        uint32_t splitCbfU = 0, splitCbfV = 0;
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
        {
            residualQTIntraChroma(mode, cuGeom, tuDepth + 1, qPartIdx);
            splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
            splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
        }
        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
        {
            cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
            cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
        }
    }
}

void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes)
{
    uint32_t depth = cuGeom.depth;
    CUData& cu = intraMode.cu;

    cu.setPartSizeSubParts(partSize);
    cu.setPredModeSubParts(MODE_INTRA);

    uint32_t tuDepthRange[2];
    cu.getIntraTUQtDepthRange(tuDepthRange, 0);

    intraMode.initCosts();
    intraMode.distortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange, sharedModes);
    intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom);

    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);

    if (!m_slice->isIntra())
    {
        m_entropyCoder.codeSkipFlag(cu, 0);
        m_entropyCoder.codePredMode(cu.m_predMode[0]);
    }

    m_entropyCoder.codePartSize(cu, 0, depth);
    m_entropyCoder.codePredInfo(cu, 0);
    intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits();

    bool bCodeDQP = m_slice->m_pps->bUseDQP;
    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
    m_entropyCoder.store(intraMode.contexts);
    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
    if (m_rdCost.m_psyRd)
        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, intraMode.fencYuv->m_buf[0], intraMode.fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size);

    updateModeCost(intraMode);
}

/* Note that this function does not save the best intra prediction, it must
 * be generated later. It records the best mode in the cu */
void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
{
    CUData& cu = intraMode.cu;
    uint32_t depth = cu.m_cuDepth[0];

    cu.setPartSizeSubParts(SIZE_2Nx2N);
    cu.setPredModeSubParts(MODE_INTRA);

    const uint32_t initTuDepth = 0;
    uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth;
    uint32_t tuSize = 1 << log2TrSize;
    const uint32_t absPartIdx = 0;

    // Reference sample smoothing
    initAdiPattern(cu, cuGeom, absPartIdx, initTuDepth, ALL_IDX);

    const pixel* fenc = intraMode.fencYuv->m_buf[0];
    uint32_t stride = intraMode.fencYuv->m_size;

    pixel* above = m_refAbove + tuSize - 1;
    pixel* aboveFiltered = m_refAboveFlt + tuSize - 1;
    pixel* left = m_refLeft + tuSize - 1;
    pixel* leftFiltered = m_refLeftFlt + tuSize - 1;
    int sad, bsad;
    uint32_t bits, bbits, mode, bmode;
    uint64_t cost, bcost;

    // buffers for all 33 angular-mode predictions, generated in one pass
    ALIGN_VAR_32(pixel, bufScale[32 * 32]);
    ALIGN_VAR_32(pixel, bufTrans[32 * 32]);
    ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]);
    int scaleTuSize = tuSize;
    int scaleStride = stride;
    int costShift = 0;
    int sizeIdx = log2TrSize - 2;

    if (tuSize > 32)
    {
        // origin is 64x64, we scale to 32x32 and setup required parameters
        primitives.scale2D_64to32(bufScale, fenc, stride);
        fenc = bufScale;

        // reserve space in case primitives need to store data in above
        // or left buffers
        pixel _above[4 * 32 + 1];
        pixel _left[4 * 32 + 1];
        pixel* aboveScale = _above + 2 * 32;
        pixel* leftScale = _left + 2 * 32;
        aboveScale[0] = leftScale[0] = above[0];
        primitives.scale1D_128to64(aboveScale + 1, above + 1, 0);
        primitives.scale1D_128to64(leftScale + 1, left + 1, 0);

        scaleTuSize = 32;
        scaleStride = 32;
        costShift = 2;
        sizeIdx = 5 - 2; // log2(scaleTuSize) - 2

        // point both the filtered and unfiltered reference rows at the scaled buffers
        above = aboveScale;
        left = leftScale;
        aboveFiltered = aboveScale;
        leftFiltered = leftScale;
    }

    pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
    int predsize = scaleTuSize * scaleTuSize;

    m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);

    /* there are three cost tiers for intra modes:
     * pred[0] - most probable, least cost
     * pred[1], pred[2] - less probable, slightly more cost
     * non-mpm modes - all cost the same (rbits) */
    uint64_t mpms;
    uint32_t preds[3];
    uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms);

    // DC
    primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
    bsad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
    bmode = mode = DC_IDX;
    bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
    bcost = m_rdCost.calcRdSADCost(bsad, bbits);

    pixel* abovePlanar = above;
    pixel* leftPlanar = left;

    if (tuSize & (8 | 16 | 32))
    {
        abovePlanar = aboveFiltered;
        leftPlanar = leftFiltered;
    }

    // PLANAR
    primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
    sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
    mode = PLANAR_IDX;
    bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
    cost = m_rdCost.calcRdSADCost(sad, bits);
    COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);

    // Transpose NxN
    primitives.transpose[sizeIdx](bufTrans, fenc, scaleStride);

    primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));

    bool modeHor;
    const pixel* cmp;
    intptr_t srcStride;

#define TRY_ANGLE(angle) \
    modeHor = angle < 18; \
    cmp = modeHor ? bufTrans : fenc; \
    srcStride = modeHor ? scaleTuSize : scaleStride; \
    sad = sa8d(cmp, srcStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \
    bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
    cost = m_rdCost.calcRdSADCost(sad, bits)
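
/* Note on TRY_ANGLE (added): intra_pred_allangs stores the predictions for
 * the horizontal modes (2..17) transposed, so the macro compares them
 * against bufTrans, a transposed copy of the source block; this lets one
 * sa8d call with stride scaleTuSize serve both orientations. */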

    if (m_param->bEnableFastIntra)
    {
        int asad = 0;
        uint32_t lowmode, highmode, amode = 5, abits = 0;
        uint64_t acost = MAX_INT64;

        /* pick the best angle, sampling at distance of 5 */
        for (mode = 5; mode < 35; mode += 5)
        {
            TRY_ANGLE(mode);
            COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
        }

        /* refine best angle at distance 2, then distance 1 */
        for (uint32_t dist = 2; dist >= 1; dist--)
        {
            lowmode = amode - dist;
            highmode = amode + dist;

            X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n");
            TRY_ANGLE(lowmode);
            COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits);

            X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n");
            TRY_ANGLE(highmode);
            COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits);
        }
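
        /* Worked example (added): if the coarse pass picks amode == 20, the
         * distance-2 pass tests modes 18 and 22, then distance-1 tests the
         * refined winner's immediate neighbours. Mode 34 is only reachable
         * when the refined winner is 33, hence the special case below. */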

        if (amode == 33)
        {
            TRY_ANGLE(34);
            COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits);
        }

        COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
    }
    else // calculate and search all intra prediction angles for lowest cost
    {
        for (mode = 2; mode < 35; mode++)
        {
            TRY_ANGLE(mode);
            COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
        }
    }

    cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTuDepth);
    intraMode.initCosts();
    intraMode.totalBits = bbits;
    intraMode.distortion = bsad;
    intraMode.sa8dCost = bcost;
    intraMode.sa8dBits = bbits;
}

void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
{
    CUData& cu = intraMode.cu;
    Yuv* reconYuv = &intraMode.reconYuv;
    const Yuv* fencYuv = intraMode.fencYuv;

    X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
    X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");

    m_quant.setQPforQuant(cu);

    uint32_t tuDepthRange[2];
    cu.getIntraTUQtDepthRange(tuDepthRange, 0);

    m_entropyCoder.load(m_rqt[cuGeom.depth].cur);

    Cost icosts;
    codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange);
    extractIntraResultQT(cu, *reconYuv, 0, 0);

    intraMode.distortion = icosts.distortion;
    intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom);

    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
    m_entropyCoder.codeSkipFlag(cu, 0);
    m_entropyCoder.codePredMode(cu.m_predMode[0]);
    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
    m_entropyCoder.codePredInfo(cu, 0);
    intraMode.mvBits += m_entropyCoder.getNumberOfWrittenBits();

    bool bCodeDQP = m_slice->m_pps->bUseDQP;
    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);

    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
    if (m_rdCost.m_psyRd)
        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);

    m_entropyCoder.store(intraMode.contexts);
    updateModeCost(intraMode);
}

uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes)
{
    CUData& cu = intraMode.cu;
    Yuv* reconYuv = &intraMode.reconYuv;
    Yuv* predYuv = &intraMode.predYuv;
    const Yuv* fencYuv = intraMode.fencYuv;

    uint32_t depth = cu.m_cuDepth[0];
    uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
    uint32_t numPU = 1 << (2 * initTuDepth);
    uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth;
    uint32_t tuSize = 1 << log2TrSize;
    uint32_t qNumParts = cuGeom.numPartitions >> 2;
    uint32_t sizeIdx = log2TrSize - 2;
    uint32_t absPartIdx = 0;
    uint32_t totalDistortion = 0;

    int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N;

    // loop over partitions
    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts)
    {
        uint32_t bmode = 0;

        if (sharedModes)
            bmode = sharedModes[puIdx];
        else
        {
            // Reference sample smoothing
            initAdiPattern(cu, cuGeom, absPartIdx, initTuDepth, ALL_IDX);

            // determine set of modes to be tested (using prediction signal only)
            const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
            uint32_t stride = predYuv->m_size;

            pixel* above = m_refAbove + tuSize - 1;
            pixel* aboveFiltered = m_refAboveFlt + tuSize - 1;
            pixel* left = m_refLeft + tuSize - 1;
            pixel* leftFiltered = m_refLeftFlt + tuSize - 1;

            // buffers for all 33 angular-mode predictions, generated in one pass
            ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
            ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]);
            ALIGN_VAR_32(pixel, bufScale[32 * 32]);
            pixel _above[4 * 32 + 1];
            pixel _left[4 * 32 + 1];
            int scaleTuSize = tuSize;
            int scaleStride = stride;
            int costShift = 0;

            if (tuSize > 32)
            {
                pixel* aboveScale = _above + 2 * 32;
                pixel* leftScale = _left + 2 * 32;

                // origin is 64x64, we scale to 32x32 and setup required parameters
                primitives.scale2D_64to32(bufScale, fenc, stride);
                fenc = bufScale;

                // reserve space in case primitives need to store data in above
                // or left buffers
                aboveScale[0] = leftScale[0] = above[0];
                primitives.scale1D_128to64(aboveScale + 1, above + 1, 0);
                primitives.scale1D_128to64(leftScale + 1, left + 1, 0);

                scaleTuSize = 32;
                scaleStride = 32;
                costShift = 2;
                sizeIdx = 5 - 2; // log2(scaleTuSize) - 2

                // point both the filtered and unfiltered reference rows at the scaled buffers
                above = aboveScale;
                left = leftScale;
                aboveFiltered = aboveScale;
                leftFiltered = leftScale;
            }

            m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);

            /* there are three cost tiers for intra modes:
             * pred[0] - most probable, least cost
             * pred[1], pred[2] - less probable, slightly more cost
             * non-mpm modes - all cost the same (rbits) */
1459 uint64_t mpms;
1460 uint32_t preds[3];
1461 uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms);
1462
1463 pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
1464 uint64_t modeCosts[35];
1465 uint64_t bcost;
1466
1467 // DC
1468 primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
1469 uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(preds, DC_IDX) : rbits;
1470 uint32_t sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
1471 modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
1472
1473 // PLANAR
1474 pixel* abovePlanar = above;
1475 pixel* leftPlanar = left;
1476 if (tuSize >= 8 && tuSize <= 32)
1477 {
1478 abovePlanar = aboveFiltered;
1479 leftPlanar = leftFiltered;
1480 }
1481 primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
1482 bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(preds, PLANAR_IDX) : rbits;
1483 sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
1484 modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
1485 COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
1486
1487 // angular predictions
1488 primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
1489
1490 primitives.transpose[sizeIdx](buf_trans, fenc, scaleStride);
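/* intra_pred_allangs stores the horizontal modes (2..17) in transposed
 * form, so the source block is transposed once here and those modes are
 * compared against buf_trans rather than fenc */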
1491 for (int mode = 2; mode < 35; mode++)
1492 {
1493 bool modeHor = (mode < 18);
1494 const pixel* cmp = (modeHor ? buf_trans : fenc);
1495 intptr_t srcStride = (modeHor ? scaleTuSize : scaleStride);
1496 bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
1497 sad = sa8d(cmp, srcStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
1498 modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
1499 COPY1_IF_LT(bcost, modeCosts[mode]);
1500 }
1501
1502 /* Find the top maxCandCount candidate modes with cost within 12.5% of the
1503 * best mode, or among the most probable modes. maxCandCount is derived from
1504 * the rdLevel and depth. In general we want to try more modes at slower RD
1505 * levels and at higher depths */
1506 uint64_t candCostList[MAX_RD_INTRA_MODES];
1507 uint32_t rdModeList[MAX_RD_INTRA_MODES];
1508 int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1);
1509 for (int i = 0; i < maxCandCount; i++)
1510 candCostList[i] = MAX_INT64;
1511
1512 uint64_t paddedBcost = bcost + (bcost >> 3); // bcost * 1.125, i.e. a 12.5% margin
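/* updateCandList() inserts a mode into rdModeList[]/candCostList[] keeping
 * the lists sorted by cost, so they end up holding the maxCandCount
 * cheapest modes */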
1513 for (int mode = 0; mode < 35; mode++)
1514 if (modeCosts[mode] < paddedBcost || (mpms & ((uint64_t)1 << mode)))
1515 updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList);
1516
1517 /* measure best candidates using simple RDO (no TU splits) */
1518 bcost = MAX_INT64;
1519 for (int i = 0; i < maxCandCount; i++)
1520 {
1521 if (candCostList[i] == MAX_INT64)
1522 break;
1523 m_entropyCoder.load(m_rqt[depth].cur);
1524 cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth);
1525
1526 Cost icosts;
1527 if (checkTransformSkip)
1528 codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
1529 else
1530 codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, icosts, depthRange);
1531 COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]);
1532 }
1533 }
1534
1535 /* remeasure best mode, allowing TU splits */
1536 cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth);
1537 m_entropyCoder.load(m_rqt[depth].cur);
1538
1539 Cost icosts;
1540 if (checkTransformSkip)
1541 codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
1542 else
1543 codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange);
1544 totalDistortion += icosts.distortion;
1545
1546 extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx);
1547
1548 // set reconstruction for next intra prediction blocks
1549 if (puIdx != numPU - 1)
1550 {
1551 /* This has important implications for parallelism and RDO. It is writing intermediate results into the
1552 * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
1553 * it is not updating m_rqt[depth].cur for the later PUs, which I suspect is slightly wrong. I think
1554 * that the contexts should be tracked through each PU */
1555 pixel* dst = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
1556 uint32_t dststride = m_frame->m_reconPic->m_stride;
1557 const pixel* src = reconYuv->getLumaAddr(absPartIdx);
1558 uint32_t srcstride = reconYuv->m_size;
1559 primitives.luma_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride);
1560 }
1561 }
1562
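/* for INTRA_NxN the depth-0 (CU level) luma cbf must be the OR of the four
 * PUs' depth-1 cbfs; propagate the combined flag into every partition */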
1563 if (numPU > 1)
1564 {
1565 uint32_t combCbfY = 0;
1566 for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
1567 combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1);
1568
1569 for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
1570 cu.m_cbf[0][offs] |= combCbfY;
1571 }
1572
1573 // TODO: remove this
1574 m_entropyCoder.load(m_rqt[depth].cur);
1575
1576 return totalDistortion;
1577 }
1578
1579 void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom)
1580 {
1581 CUData& cu = intraMode.cu;
1582 const Yuv* fencYuv = intraMode.fencYuv;
1583 Yuv* predYuv = &intraMode.predYuv;
1584
1585 uint32_t bestMode = 0;
1586 uint64_t bestCost = MAX_INT64;
1587 uint32_t modeList[NUM_CHROMA_MODE];
1588
1589 uint32_t log2TrSizeC = cu.m_log2CUSize[0] - m_hChromaShift;
1590 uint32_t tuSize = 1 << log2TrSizeC;
1591 int32_t scaleTuSize = tuSize;
1592 uint32_t tuDepth = 0;
1593 int32_t costShift = 0;
1594
1595 if (tuSize > 32)
1596 {
1597 scaleTuSize = 32;
1598 tuDepth = 1;
1599 costShift = 2;
1600 log2TrSizeC = 5;
1601 }
1602
1603 Predict::initAdiPatternChroma(cu, cuGeom, 0, tuDepth, 1);
1604 Predict::initAdiPatternChroma(cu, cuGeom, 0, tuDepth, 2);
1605 cu.getAllowedChromaDir(0, modeList);
1606
1607 // check chroma modes
1608 for (uint32_t mode = 0; mode < NUM_CHROMA_MODE; mode++)
1609 {
1610 uint32_t chromaPredMode = modeList[mode];
1611 if (chromaPredMode == DM_CHROMA_IDX)
1612 chromaPredMode = cu.m_lumaIntraDir[0];
1613 if (m_csp == X265_CSP_I422)
1614 chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
1615
1616 uint64_t cost = 0;
1617 for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
1618 {
1619 const pixel* fenc = fencYuv->m_buf[chromaId];
1620 pixel* pred = predYuv->m_buf[chromaId];
1621 pixel* chromaPred = getAdiChromaBuf(chromaId, scaleTuSize);
1622
1623 // get prediction signal
1624 predIntraChromaAng(chromaPred, chromaPredMode, pred, predYuv->m_csize, log2TrSizeC, m_csp);
1625 cost += primitives.sa8d[log2TrSizeC - 2](fenc, fencYuv->m_csize, pred, predYuv->m_csize) << costShift; // each buffer paired with its own (equal) stride
1626 }
1627
1628 if (cost < bestCost)
1629 {
1630 bestCost = cost;
1631 bestMode = modeList[mode];
1632 }
1633 }
1634
1635 cu.setChromIntraDirSubParts(bestMode, 0, cu.m_cuDepth[0]);
1636 }
1637
1638 uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
1639 {
1640 CUData& cu = intraMode.cu;
1641 Yuv& reconYuv = intraMode.reconYuv;
1642
1643 uint32_t depth = cu.m_cuDepth[0];
1644 uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N && m_csp == X265_CSP_I444;
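/* chroma is split along with an NxN luma partition only in 4:4:4; for
 * 4:2:0 and 4:2:2 a single chroma TU covers the whole CU at this level */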
1645 uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth;
1646 uint32_t absPartStep = (NUM_CU_PARTITIONS >> (depth << 1));
1647 uint32_t totalDistortion = 0;
1648
1649 int part = partitionFromLog2Size(log2TrSize);
1650
1651 TURecurse tuIterator((initTuDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0);
1652
1653 do
1654 {
1655 uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
1656
1657 uint32_t bestMode = 0;
1658 uint32_t bestDist = 0;
1659 uint64_t bestCost = MAX_INT64;
1660
1661 // init mode list
1662 uint32_t minMode = 0;
1663 uint32_t maxMode = NUM_CHROMA_MODE;
1664 uint32_t modeList[NUM_CHROMA_MODE];
1665
1666 cu.getAllowedChromaDir(absPartIdxC, modeList);
1667
1668 // check chroma modes
1669 for (uint32_t mode = minMode; mode < maxMode; mode++)
1670 {
1671 // restore context models
1672 m_entropyCoder.load(m_rqt[depth].cur);
1673
1674 cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTuDepth);
1675 uint32_t psyEnergy = 0;
1676 uint32_t dist = codeIntraChromaQt(intraMode, cuGeom, initTuDepth, absPartIdxC, psyEnergy);
1677
1678 if (m_slice->m_pps->bTransformSkipEnabled)
1679 m_entropyCoder.load(m_rqt[depth].cur);
1680
1681 m_entropyCoder.resetBits();
1682 // chroma prediction mode
1683 if (cu.m_partSize[0] == SIZE_2Nx2N || m_csp != X265_CSP_I444)
1684 {
1685 if (!absPartIdxC)
1686 m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);
1687 }
1688 else
1689 {
1690 uint32_t qNumParts = cuGeom.numPartitions >> 2;
1691 if (!(absPartIdxC & (qNumParts - 1)))
1692 m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);
1693 }
1694
1695 codeSubdivCbfQTChroma(cu, initTuDepth, absPartIdxC);
1696 codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_U);
1697 codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_V);
1698 uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
1699 uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(dist, bits, psyEnergy) : m_rdCost.calcRdCost(dist, bits);
1700
1701 if (cost < bestCost)
1702 {
1703 bestCost = cost;
1704 bestDist = dist;
1705 bestMode = modeList[mode];
1706 extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTuDepth);
1707 memcpy(m_qtTempCbf[1], cu.m_cbf[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
1708 memcpy(m_qtTempCbf[2], cu.m_cbf[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
1709 memcpy(m_qtTempTransformSkipFlag[1], cu.m_transformSkip[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
1710 memcpy(m_qtTempTransformSkipFlag[2], cu.m_transformSkip[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
1711 }
1712 }
1713
1714 if (!tuIterator.isLastSection())
1715 {
1716 uint32_t zorder = cuGeom.encodeIdx + absPartIdxC;
1717 uint32_t dststride = m_frame->m_reconPic->m_strideC;
1718 const pixel* src;
1719 pixel* dst;
1720
1721 dst = m_frame->m_reconPic->getCbAddr(cu.m_cuAddr, zorder);
1722 src = reconYuv.getCbAddr(absPartIdxC);
1723 primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize);
1724
1725 dst = m_frame->m_reconPic->getCrAddr(cu.m_cuAddr, zorder);
1726 src = reconYuv.getCrAddr(absPartIdxC);
1727 primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize);
1728 }
1729
1730 memcpy(cu.m_cbf[1] + absPartIdxC, m_qtTempCbf[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
1731 memcpy(cu.m_cbf[2] + absPartIdxC, m_qtTempCbf[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
1732 memcpy(cu.m_transformSkip[1] + absPartIdxC, m_qtTempTransformSkipFlag[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
1733 memcpy(cu.m_transformSkip[2] + absPartIdxC, m_qtTempTransformSkipFlag[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
1734 cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTuDepth);
1735 totalDistortion += bestDist;
1736 }
1737 while (tuIterator.isNextSection());
1738
1739 if (initTuDepth != 0)
1740 {
1741 uint32_t combCbfU = 0;
1742 uint32_t combCbfV = 0;
1743 uint32_t qNumParts = tuIterator.absPartIdxStep;
1744 for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
1745 {
1746 combCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, 1);
1747 combCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, 1);
1748 }
1749
1750 for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
1751 {
1752 cu.m_cbf[1][offs] |= combCbfU;
1753 cu.m_cbf[2][offs] |= combCbfV;
1754 }
1755 }
1756
1757 /* TODO: remove this */
1758 m_entropyCoder.load(m_rqt[depth].cur);
1759 return totalDistortion;
1760 }
1761
1762 /* estimation of best merge coding of an inter PU (not a merge CU) */
1763 uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, int puIdx, MergeData& m)
1764 {
1765 X265_CHECK(cu.m_partSize[0] != SIZE_2Nx2N, "merge tested on 2Nx2N partition\n");
1766
1767 m.maxNumMergeCand = cu.getInterMergeCandidates(m.absPartIdx, puIdx, m.mvFieldNeighbours, m.interDirNeighbours);
1768
1769 if (cu.isBipredRestriction())
1770 {
1771 /* in 8x8 CUs do not allow bidir merge candidates if not 2Nx2N */
1772 for (uint32_t mergeCand = 0; mergeCand < m.maxNumMergeCand; ++mergeCand)
1773 {
1774 if (m.interDirNeighbours[mergeCand] == 3)
1775 {
1776 m.interDirNeighbours[mergeCand] = 1;
1777 m.mvFieldNeighbours[mergeCand][1].refIdx = REF_NOT_VALID;
1778 }
1779 }
1780 }
1781
1782 Yuv& tempYuv = m_rqt[cuGeom.depth].tmpPredYuv;
1783
1784 uint32_t outCost = MAX_UINT;
1785 for (uint32_t mergeCand = 0; mergeCand < m.maxNumMergeCand; ++mergeCand)
1786 {
1787 /* Prevent TMVP candidates from using unavailable reference pixels */
1788 if (m_bFrameParallel &&
1789 (m.mvFieldNeighbours[mergeCand][0].mv.y >= (m_param->searchRange + 1) * 4 ||
1790 m.mvFieldNeighbours[mergeCand][1].mv.y >= (m_param->searchRange + 1) * 4))
1791 continue;
1792
1793 cu.m_mv[0][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][0].mv;
1794 cu.m_refIdx[0][m.absPartIdx] = (int8_t)m.mvFieldNeighbours[mergeCand][0].refIdx;
1795 cu.m_mv[1][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][1].mv;
1796 cu.m_refIdx[1][m.absPartIdx] = (int8_t)m.mvFieldNeighbours[mergeCand][1].refIdx;
1797
1798 prepMotionCompensation(cu, cuGeom, puIdx);
1799 motionCompensation(tempYuv, true, m_me.bChromaSATD);
1800
1801 uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(m.absPartIdx), tempYuv.m_size);
1802 if (m_me.bChromaSATD)
1803 costCand += m_me.bufChromaSATD(tempYuv, m.absPartIdx);
1804
1805 uint32_t bitsCand = getTUBits(mergeCand, m.maxNumMergeCand);
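/* the merge candidate index is coded truncated-unary; getTUBits() returns
 * its code length given maxNumMergeCand */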
1806 costCand += m_rdCost.getCost(bitsCand);
1807 if (costCand < outCost)
1808 {
1809 outCost = costCand;
1810 m.bits = bitsCand;
1811 m.index = mergeCand;
1812 }
1813 }
1814
1815 m.mvField[0] = m.mvFieldNeighbours[m.index][0];
1816 m.mvField[1] = m.mvFieldNeighbours[m.index][1];
1817 m.interDir = m.interDirNeighbours[m.index];
1818
1819 return outCost;
1820 }
1821
1822 /* this function assumes the caller has configured its MotionEstimation engine with the
1823 * correct source plane and source PU, and has called prepMotionCompensation() to set
1824 * m_puAbsPartIdx, m_puWidth, and m_puHeight */
1825 void Search::singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, int part, int list, int ref)
1826 {
1827 uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS;
1828 bits += getTUBits(ref, m_slice->m_numRefIdx[list]);
1829
1830 MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
1831 int numMvc = interMode.cu.fillMvpCand(part, m_puAbsPartIdx, list, ref, interMode.amvpCand[list][ref], mvc);
1832
1833 int mvpIdx = 0;
1834 int merange = m_param->searchRange;
1835 MotionData* bestME = interMode.bestME[part];
1836
1837 if (interMode.amvpCand[list][ref][0] != interMode.amvpCand[list][ref][1])
1838 {
1839 uint32_t bestCost = MAX_INT;
1840 for (int i = 0; i < AMVP_NUM_CANDS; i++)
1841 {
1842 MV mvCand = interMode.amvpCand[list][ref][i];
1843
1844 // NOTE: skip mvCand if its Y component exceeds the search range while frame parallelism (--frame-threads > 1) is active
1845 if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
1846 continue;
1847
1848 interMode.cu.clipMv(mvCand);
1849
1850 Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
1851 predInterLumaPixel(tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPic, mvCand);
1852 uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
1853
1854 if (bestCost > cost)
1855 {
1856 bestCost = cost;
1857 mvpIdx = i;
1858 }
1859 }
1860 }
1861
1862 MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[list][ref][mvpIdx];
1863 setSearchRange(interMode.cu, mvp, merange, mvmin, mvmax);
1864
1865 int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
1866
1867 /* Get total cost of partition, but only include MV bit cost once */
1868 bits += m_me.bitcost(outmv);
1869 uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
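/* the cost returned by motionEstimate() already contains a lambda-weighted
 * MV cost; it is removed here and the MV rate is instead folded into
 * 'bits', so the rate term is charged exactly once via getCost(bits) */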
1870
1871 /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
1872 checkBestMVP(interMode.amvpCand[list][ref], outmv, mvp, mvpIdx, bits, cost);
1873
1874 /* tie goes to the smallest ref ID, just like --no-pme */
1875 ScopedLock _lock(master.m_meLock);
1876 if (cost < bestME[list].cost ||
1877 (cost == bestME[list].cost && ref < bestME[list].ref))
1878 {
1879 bestME[list].mv = outmv;
1880 bestME[list].mvp = mvp;
1881 bestME[list].mvpIdx = mvpIdx;
1882 bestME[list].ref = ref;
1883 bestME[list].cost = cost;
1884 bestME[list].bits = bits;
1885 }
1886 }
1887
1888 /* search of the best candidate for inter prediction
1889 * returns true if predYuv was filled with a motion compensated prediction */
1890 bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChromaSA8D)
1891 {
1892 CUData& cu = interMode.cu;
1893 Yuv* predYuv = &interMode.predYuv;
1894
1895 MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
1896
1897 const Slice *slice = m_slice;
1898 int numPart = cu.getNumPartInter();
1899 int numPredDir = slice->isInterP() ? 1 : 2;
1900 const int* numRefIdx = slice->m_numRefIdx;
1901 uint32_t lastMode = 0;
1902 int totalmebits = 0;
1903 bool bDistributed = m_param->bDistributeMotionEstimation && (numRefIdx[0] + numRefIdx[1]) > 2;
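/* distribute ME to worker threads only when more than two (list, ref)
 * pairs must be searched for each PU */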
1904 MV mvzero(0, 0);
1905 Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
1906
1907 MergeData merge;
1908 memset(&merge, 0, sizeof(merge));
1909
1910 for (int puIdx = 0; puIdx < numPart; puIdx++)
1911 {
1912 MotionData* bestME = interMode.bestME[puIdx];
1913
1914 /* sets m_puAbsPartIdx, m_puWidth, m_puHeight */
1915 initMotionCompensation(cu, cuGeom, puIdx);
1916
1917 m_me.setSourcePU(*interMode.fencYuv, cu.m_cuAddr, cuGeom.encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
1918
1919 uint32_t mrgCost = MAX_UINT;
1920
1921 /* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
1922 if (cu.m_partSize[0] != SIZE_2Nx2N)
1923 {
1924 merge.absPartIdx = m_puAbsPartIdx;
1925 merge.width = m_puWidth;
1926 merge.height = m_puHeight;
1927 mrgCost = mergeEstimation(cu, cuGeom, puIdx, merge);
1928
1929 if (bMergeOnly)
1930 {
1931 if (mrgCost == MAX_UINT)
1932 {
1933 /* No valid merge modes were found; there is no way to produce a
1934 * valid motion compensated prediction, so early-exit */
1935 return false;
1936 }
1937 // set merge result
1938 cu.m_mergeFlag[m_puAbsPartIdx] = true;
1939 cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx
1940 cu.setPUInterDir(merge.interDir, m_puAbsPartIdx, puIdx);
1941 cu.setPUMv(0, merge.mvField[0].mv, m_puAbsPartIdx, puIdx);
1942 cu.setPURefIdx(0, merge.mvField[0].refIdx, m_puAbsPartIdx, puIdx);
1943 cu.setPUMv(1, merge.mvField[1].mv, m_puAbsPartIdx, puIdx);
1944 cu.setPURefIdx(1, merge.mvField[1].refIdx, m_puAbsPartIdx, puIdx);
1945 totalmebits += merge.bits;
1946
1947 prepMotionCompensation(cu, cuGeom, puIdx);
1948 motionCompensation(*predYuv, true, bChromaSA8D);
1949 continue;
1950 }
1951 }
1952
1953 bestME[0].cost = MAX_UINT;
1954 bestME[1].cost = MAX_UINT;
1955
1956 getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits);
1957
1958 /* Uni-directional prediction */
1959 if (m_param->analysisMode == X265_ANALYSIS_LOAD && bestME[0].ref >= 0)
1960 {
1961 for (int l = 0; l < numPredDir; l++)
1962 {
1963 int ref = bestME[l].ref;
1964 uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS;
1965 bits += getTUBits(ref, numRefIdx[l]);
1966
1967 int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc);
1968
1969 // Pick the best possible MVP from AMVP candidates based on least residual
1970 int mvpIdx = 0;
1971 int merange = m_param->searchRange;
1972
1973 if (interMode.amvpCand[l][ref][0] != interMode.amvpCand[l][ref][1])
1974 {
1975 uint32_t bestCost = MAX_INT;
1976 for (int i = 0; i < AMVP_NUM_CANDS; i++)
1977 {
1978 MV mvCand = interMode.amvpCand[l][ref][i];
1979
1980 // NOTE: skip mvCand if its Y component exceeds the search range while frame parallelism (--frame-threads > 1) is active
1981 if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
1982 continue;
1983
1984 cu.clipMv(mvCand);
1985 predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand);
1986 uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
1987
1988 if (bestCost > cost)
1989 {
1990 bestCost = cost;
1991 mvpIdx = i;
1992 }
1993 }
1994 }
1995
1996 MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[l][ref][mvpIdx];
1997
1998 int satdCost;
1999 setSearchRange(cu, mvp, merange, mvmin, mvmax);
2000 satdCost = m_me.motionEstimate(&slice->m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
2001
2002 /* Get total cost of partition, but only include MV bit cost once */
2003 bits += m_me.bitcost(outmv);
2004 uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
2005
2006 /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
2007 checkBestMVP(interMode.amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost);
2008
2009 if (cost < bestME[l].cost)
2010 {
2011 bestME[l].mv = outmv;
2012 bestME[l].mvp = mvp;
2013 bestME[l].mvpIdx = mvpIdx;
2014 bestME[l].cost = cost;
2015 bestME[l].bits = bits;
2016 }
2017 }
2018 }
2019 else if (bDistributed)
2020 {
2021 m_meLock.acquire();
2022 m_curInterMode = &interMode;
2023 m_curGeom = &cuGeom;
2024 m_curPart = puIdx;
2025 m_totalNumME = 0;
2026 m_numAcquiredME = 1;
2027 m_numCompletedME = 0;
2028 m_totalNumME = numRefIdx[0] + numRefIdx[1];
2029 m_meLock.release();
2030
2031 if (!m_bJobsQueued)
2032 JobProvider::enqueue();
2033
2034 for (int i = 1; i < m_totalNumME; i++)
2035 m_pool->pokeIdleThread();
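/* workers (and this thread, below) atomically claim jobs from a flattened
 * (list, ref) index space: ids 0..numRefIdx[0]-1 map to L0, the remainder
 * to L1; m_numAcquiredME starts at 1 because L0-0 is reserved for this
 * thread */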
2036
2037 do
2038 {
2039 m_meLock.acquire();
2040 if (m_totalNumME > m_numAcquiredME)
2041 {
2042 int id = m_numAcquiredME++;
2043 m_meLock.release();
2044
2045 if (id < numRefIdx[0])
2046 singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, id);
2047 else
2048 singleMotionEstimation(*this, interMode, cuGeom, puIdx, 1, id - numRefIdx[0]);
2049
2050 m_meLock.acquire();
2051 m_numCompletedME++;
2052 m_meLock.release();
2053 }
2054 else
2055 m_meLock.release();
2056 }
2057 while (m_totalNumME > m_numAcquiredME);
2058
2059 if (!m_bJobsQueued)
2060 JobProvider::dequeue();
2061
2062 /* we saved L0-0 for ourselves */
2063 singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, 0);
2064
2065 m_meLock.acquire();
2066 if (++m_numCompletedME == m_totalNumME)
2067 m_meCompletionEvent.trigger();
2068 m_meLock.release();
2069
2070 m_meCompletionEvent.wait();
2071 }
2072 else
2073 {
2074 for (int l = 0; l < numPredDir; l++)
2075 {
2076 for (int ref = 0; ref < numRefIdx[l]; ref++)
2077 {
2078 uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS;
2079 bits += getTUBits(ref, numRefIdx[l]);
2080
2081 int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc);
2082
2083 // Pick the best possible MVP from AMVP candidates based on least residual
2084 int mvpIdx = 0;
2085 int merange = m_param->searchRange;
2086
2087 if (interMode.amvpCand[l][ref][0] != interMode.amvpCand[l][ref][1])
2088 {
2089 uint32_t bestCost = MAX_INT;
2090 for (int i = 0; i < AMVP_NUM_CANDS; i++)
2091 {
2092 MV mvCand = interMode.amvpCand[l][ref][i];
2093
2094 // NOTE: skip mvCand if its Y component exceeds the search range while frame parallelism (--frame-threads > 1) is active
2095 if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
2096 continue;
2097
2098 cu.clipMv(mvCand);
2099 predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand);
2100 uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
2101
2102 if (bestCost > cost)
2103 {
2104 bestCost = cost;
2105 mvpIdx = i;
2106 }
2107 }
2108 }
2109
2110 MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[l][ref][mvpIdx];
2111
2112 setSearchRange(cu, mvp, merange, mvmin, mvmax);
2113 int satdCost = m_me.motionEstimate(&slice->m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
2114
2115 /* Get total cost of partition, but only include MV bit cost once */
2116 bits += m_me.bitcost(outmv);
2117 uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
2118
2119 /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
2120 checkBestMVP(interMode.amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost);
2121
2122 if (cost < bestME[l].cost)
2123 {
2124 bestME[l].mv = outmv;
2125 bestME[l].mvp = mvp;
2126 bestME[l].mvpIdx = mvpIdx;
2127 bestME[l].ref = ref;
2128 bestME[l].cost = cost;
2129 bestME[l].bits = bits;
2130 }
2131 }
2132 }
2133 }
2134
2135 /* Bi-directional prediction */
2136 MotionData bidir[2];
2137 uint32_t bidirCost = MAX_UINT;
2138 int bidirBits = 0;
2139
2140 if (slice->isInterB() && !cu.isBipredRestriction() && /* biprediction is possible for this PU */
2141 cu.m_partSize[m_puAbsPartIdx] != SIZE_2Nx2N && /* 2Nx2N biprediction is handled elsewhere */
2142 bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT)
2143 {
2144 bidir[0] = bestME[0];
2145 bidir[1] = bestME[1];
2146
2147 int satdCost;
2148
2149 if (m_me.bChromaSATD)
2150 {
2151 cu.m_mv[0][m_puAbsPartIdx] = bidir[0].mv;
2152 cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref;
2153 cu.m_mv[1][m_puAbsPartIdx] = bidir[1].mv;
2154 cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref;
2155
2156 prepMotionCompensation(cu, cuGeom, puIdx);
2157 motionCompensation(tmpPredYuv, true, true);
2158
2159 satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) +
2160 m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx);
2161 }
2162 else
2163 {
2164 PicYuv* refPic0 = slice->m_refPicList[0][bestME[0].ref]->m_reconPic;
2165 PicYuv* refPic1 = slice->m_refPicList[1][bestME[1].ref]->m_reconPic;
2166 Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
2167
2168 /* Generate reference subpels */
2169 predInterLumaPixel(bidirYuv[0], *refPic0, bestME[0].mv);
2170 predInterLumaPixel(bidirYuv[1], *refPic1, bestME[1].mv);
2171
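/* average the two unidirectional luma predictions into a single bidir
 * estimate; weight 32 (of 64) selects the equal-weight path */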
2172 primitives.pixelavg_pp[m_me.partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(m_puAbsPartIdx), bidirYuv[0].m_size,
2173 bidirYuv[1].getLumaAddr(m_puAbsPartIdx), bidirYuv[1].m_size, 32);
2174 satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
2175 }
2176
2177 bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
2178 bidirCost = satdCost + m_rdCost.getCost(bidirBits);
2179
2180 bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
2181 if (bTryZero)
2182 {
2183 /* Do not try zero MV if unidir motion predictors are beyond
2184 * valid search area */
2185 MV mvmin, mvmax;
2186 int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
2187 setSearchRange(cu, mvzero, merange, mvmin, mvmax);
2188 mvmax.y += 2; // there is some pad for subpel refine
2189 mvmin <<= 2;
2190 mvmax <<= 2;
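/* setSearchRange() returns full-pel limits; scale back to quarter-pel so
 * the (quarter-pel) MVPs can be range-checked below */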
2191
2192 bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
2193 bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
2194 }
2195 if (bTryZero)
2196 {
2197 /* coincident blocks of the two reference pictures */
2198 if (m_me.bChromaSATD)
2199 {
2200 cu.m_mv[0][m_puAbsPartIdx] = mvzero;
2201 cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref;
2202 cu.m_mv[1][m_puAbsPartIdx] = mvzero;
2203 cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref;
2204
2205 prepMotionCompensation(cu, cuGeom, puIdx);
2206 motionCompensation(tmpPredYuv, true, true);
2207
2208 satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) +
2209 m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx);
2210 }
2211 else
2212 {
2213 const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
2214 const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
2215 intptr_t refStride = slice->m_mref[0][0].lumaStride;
2216
2217 primitives.pixelavg_pp[m_me.partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
2218 satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
2219 }
2220
2221 MV mvp0 = bestME[0].mvp;
2222 int mvpIdx0 = bestME[0].mvpIdx;
2223 uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
2224
2225 MV mvp1 = bestME[1].mvp;
2226 int mvpIdx1 = bestME[1].mvpIdx;
2227 uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
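/* re-derive each direction's rate with a zero MV: remove the MVD cost of
 * the searched MV and add the cost of signalling (0,0) relative to the
 * predictor */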
2228
2229 uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
2230
2231 /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
2232 checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvp0, mvpIdx0, bits0, cost);
2233 checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvp1, mvpIdx1, bits1, cost);
2234
2235 if (cost < bidirCost)
2236 {
2237 bidir[0].mv = mvzero;
2238 bidir[1].mv = mvzero;
2239 bidir[0].mvp = mvp0;
2240 bidir[1].mvp = mvp1;
2241 bidir[0].mvpIdx = mvpIdx0;
2242 bidir[1].mvpIdx = mvpIdx1;
2243 bidirCost = cost;
2244 bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
2245 }
2246 }
2247 }
2248
2249 /* select best option and store into CU */
2250 if (mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost)
2251 {
2252 cu.m_mergeFlag[m_puAbsPartIdx] = true;
2253 cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx
2254 cu.setPUInterDir(merge.interDir, m_puAbsPartIdx, puIdx);
2255 cu.setPUMv(0, merge.mvField[0].mv, m_puAbsPartIdx, puIdx);
2256 cu.setPURefIdx(0, merge.mvField[0].refIdx, m_puAbsPartIdx, puIdx);
2257 cu.setPUMv(1, merge.mvField[1].mv, m_puAbsPartIdx, puIdx);
2258 cu.setPURefIdx(1, merge.mvField[1].refIdx, m_puAbsPartIdx, puIdx);
2259
2260 totalmebits += merge.bits;
2261 }
2262 else if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost)
2263 {
2264 lastMode = 2;
2265
2266 cu.m_mergeFlag[m_puAbsPartIdx] = false;
2267 cu.setPUInterDir(3, m_puAbsPartIdx, puIdx);
2268 cu.setPUMv(0, bidir[0].mv, m_puAbsPartIdx, puIdx);
2269 cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx);
2270 cu.m_mvd[0][m_puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
2271 cu.m_mvpIdx[0][m_puAbsPartIdx] = bidir[0].mvpIdx;
2272
2273 cu.setPUMv(1, bidir[1].mv, m_puAbsPartIdx, puIdx);
2274 cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx);
2275 cu.m_mvd[1][m_puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
2276 cu.m_mvpIdx[1][m_puAbsPartIdx] = bidir[1].mvpIdx;
2277
2278 totalmebits += bidirBits;
2279 }
2280 else if (bestME[0].cost <= bestME[1].cost)
2281 {
2282 lastMode = 0;
2283
2284 cu.m_mergeFlag[m_puAbsPartIdx] = false;
2285 cu.setPUInterDir(1, m_puAbsPartIdx, puIdx);
2286 cu.setPUMv(0, bestME[0].mv, m_puAbsPartIdx, puIdx);
2287 cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx);
2288 cu.m_mvd[0][m_puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;
2289 cu.m_mvpIdx[0][m_puAbsPartIdx] = bestME[0].mvpIdx;
2290
2291 cu.setPURefIdx(1, REF_NOT_VALID, m_puAbsPartIdx, puIdx);
2292 cu.setPUMv(1, mvzero, m_puAbsPartIdx, puIdx);
2293
2294 totalmebits += bestME[0].bits;
2295 }
2296 else
2297 {
2298 lastMode = 1;
2299
2300 cu.m_mergeFlag[m_puAbsPartIdx] = false;
2301 cu.setPUInterDir(2, m_puAbsPartIdx, puIdx);
2302 cu.setPUMv(1, bestME[1].mv, m_puAbsPartIdx, puIdx);
2303 cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx);
2304 cu.m_mvd[1][m_puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;
2305 cu.m_mvpIdx[1][m_puAbsPartIdx] = bestME[1].mvpIdx;
2306
2307 cu.setPURefIdx(0, REF_NOT_VALID, m_puAbsPartIdx, puIdx);
2308 cu.setPUMv(0, mvzero, m_puAbsPartIdx, puIdx);
2309
2310 totalmebits += bestME[1].bits;
2311 }
2312
2313 prepMotionCompensation(cu, cuGeom, puIdx);
2314 motionCompensation(*predYuv, true, bChromaSA8D);
2315 }
2316
2317 interMode.sa8dBits += totalmebits;
2318 return true;
2319 }
2320
2321 void Search::getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3])
2322 {
2323 if (cuMode == SIZE_2Nx2N)
2324 {
2325 blockBit[0] = (!bPSlice) ? 3 : 1;
2326 blockBit[1] = 3;
2327 blockBit[2] = 5;
2328 }
2329 else if (cuMode == SIZE_2NxN || cuMode == SIZE_2NxnU || cuMode == SIZE_2NxnD)
2330 {
2331 static const uint32_t listBits[2][3][3] =
2332 {
2333 { { 0, 0, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
2334 { { 5, 7, 7 }, { 7, 5, 7 }, { 9 - 3, 9 - 3, 9 - 3 } }
2335 };
2336 if (bPSlice)
2337 {
2338 blockBit[0] = 3;
2339 blockBit[1] = 0;
2340 blockBit[2] = 0;
2341 }
2342 else
2343 memcpy(blockBit, listBits[partIdx][lastMode], 3 * sizeof(uint32_t));
2344 }
2345 else if (cuMode == SIZE_Nx2N || cuMode == SIZE_nLx2N || cuMode == SIZE_nRx2N)
2346 {
2347 static const uint32_t listBits[2][3][3] =
2348 {
2349 { { 0, 2, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
2350 { { 5, 7, 7 }, { 7 - 2, 7 - 2, 9 - 2 }, { 9 - 3, 9 - 3, 9 - 3 } }
2351 };
2352 if (bPSlice)
2353 {
2354 blockBit[0] = 3;
2355 blockBit[1] = 0;
2356 blockBit[2] = 0;
2357 }
2358 else
2359 memcpy(blockBit, listBits[partIdx][lastMode], 3 * sizeof(uint32_t));
2360 }
2361 else if (cuMode == SIZE_NxN)
2362 {
2363 blockBit[0] = (!bPSlice) ? 3 : 1;
2364 blockBit[1] = 3;
2365 blockBit[2] = 5;
2366 }
2367 else
2368 {
2369 X265_CHECK(0, "getBlkBits: unknown cuMode\n");
2370 }
2371 }
2372
2373 /* Check if using an alternative MVP would result in a smaller MVD + signal bits */
2374 void Search::checkBestMVP(MV* amvpCand, MV mv, MV& mvPred, int& outMvpIdx, uint32_t& outBits, uint32_t& outCost) const
2375 {
2376 X265_CHECK(amvpCand[outMvpIdx] == mvPred, "checkBestMVP: unexpected mvPred\n");
2377
2378 int mvpIdx = !outMvpIdx;
2379 MV mvp = amvpCand[mvpIdx];
2380 int diffBits = m_me.bitcost(mv, mvp) - m_me.bitcost(mv, mvPred);
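/* AMVP always holds exactly two candidates, so !outMvpIdx selects the
 * alternative predictor; diffBits is the resulting change in MVD rate */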
2381 if (diffBits < 0)
2382 {
2383 outMvpIdx = mvpIdx;
2384 mvPred = mvp;
2385 uint32_t origOutBits = outBits;
2386 outBits = origOutBits + diffBits;
2387 outCost = (outCost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(outBits);
2388 }
2389 }
2390
2391 void Search::setSearchRange(const CUData& cu, MV mvp, int merange, MV& mvmin, MV& mvmax) const
2392 {
2393 cu.clipMv(mvp);
2394
2395 MV dist((int16_t)merange << 2, (int16_t)merange << 2);
2396 mvmin = mvp - dist;
2397 mvmax = mvp + dist;
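/* MVs are in quarter-pel units, so the full-pel merange is scaled by four
 * when forming the search window around the clipped predictor */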
2398
2399 cu.clipMv(mvmin);
2400 cu.clipMv(mvmax);
2401
2402 /* Clip search range to signaled maximum MV length.
2403 * We do not support this VUI field being changed from the default */
2404 const int maxMvLen = (1 << 15) - 1;
2405 mvmin.x = X265_MAX(mvmin.x, -maxMvLen);
2406 mvmin.y = X265_MAX(mvmin.y, -maxMvLen);
2407 mvmax.x = X265_MIN(mvmax.x, maxMvLen);
2408 mvmax.y = X265_MIN(mvmax.y, maxMvLen);
2409
2410 mvmin >>= 2;
2411 mvmax >>= 2;
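/* the limits are returned in full-pel units, as expected by the full-pel
 * motion search; callers rescale to quarter-pel where needed */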
2412
2413 /* conditional clipping for frame parallelism */
2414 mvmin.y = X265_MIN(mvmin.y, (int16_t)m_refLagPixels);
2415 mvmax.y = X265_MIN(mvmax.y, (int16_t)m_refLagPixels);
2416 }
2417
2418 /* Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
2419 void Search::encodeResAndCalcRdSkipCU(Mode& interMode)
2420 {
2421 CUData& cu = interMode.cu;
2422 Yuv* reconYuv = &interMode.reconYuv;
2423 const Yuv* fencYuv = interMode.fencYuv;
2424
2425 X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
2426
2427 uint32_t cuSize = 1 << cu.m_log2CUSize[0];
2428 uint32_t depth = cu.m_cuDepth[0];
2429
2430 // No residual coding : SKIP mode
2431
2432 cu.setPredModeSubParts(MODE_SKIP);
2433 cu.clearCbf();
2434 cu.setTUDepthSubParts(0, 0, depth);
2435
2436 reconYuv->copyFromYuv(interMode.predYuv);
2437
2438 // Luma
2439 int part = partitionFromLog2Size(cu.m_log2CUSize[0]);
2440 interMode.distortion = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
2441 // Chroma
2442 part = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
2443 interMode.distortion += m_rdCost.scaleChromaDist(1, primitives.sse_pp[part](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
2444 interMode.distortion += m_rdCost.scaleChromaDist(2, primitives.sse_pp[part](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
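/* scaleChromaDist() weights chroma SSE by the chroma lambda ratio so it
 * can be accumulated with luma distortion in a single metric */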
2445
2446 m_entropyCoder.load(m_rqt[depth].cur);
2447 m_entropyCoder.resetBits();
2448 if (m_slice->m_pps->bTransquantBypassEnabled)
2449 m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
2450 m_entropyCoder.codeSkipFlag(cu, 0);
2451 m_entropyCoder.codeMergeIndex(cu, 0);
2452
2453 interMode.mvBits = m_entropyCoder.getNumberOfWrittenBits();
2454 interMode.coeffBits = 0;
2455 interMode.totalBits = interMode.mvBits;
2456 if (m_rdCost.m_psyRd)
2457 interMode.psyEnergy = m_rdCost.psyCost(cu.m_log2CUSize[0] - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
2458
2459 updateModeCost(interMode);
2460 m_entropyCoder.store(interMode.contexts);
2461 }
2462
2463 /* encode residual and calculate rate-distortion for a CU block.
2464 * Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
2465 void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
2466 {
2467 CUData& cu = interMode.cu;
2468 Yuv* reconYuv = &interMode.reconYuv;
2469 Yuv* predYuv = &interMode.predYuv;
2470 ShortYuv* resiYuv = &m_rqt[cuGeom.depth].tmpResiYuv;
2471 const Yuv* fencYuv = interMode.fencYuv;
2472
2473 X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
2474
2475 uint32_t log2CUSize = cu.m_log2CUSize[0];
2476 uint32_t cuSize = 1 << log2CUSize;
2477 uint32_t depth = cu.m_cuDepth[0];
2478
2479 int part = partitionFromLog2Size(log2CUSize);
2480 int cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
2481
2482 m_quant.setQPforQuant(interMode.cu);
2483
2484 resiYuv->subtract(*fencYuv, *predYuv, log2CUSize);
2485
2486 uint32_t tuDepthRange[2];
2487 cu.getInterTUQtDepthRange(tuDepthRange, 0);
2488
2489 m_entropyCoder.load(m_rqt[depth].cur);
2490
2491 Cost costs;
2492 estimateResidualQT(interMode, cuGeom, 0, depth, *resiYuv, costs, tuDepthRange);
2493
2494 if (!cu.m_tqBypass[0])
2495 {
2496 uint32_t cbf0Dist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
2497 cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.sse_pp[cpart](fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
2498 cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.sse_pp[cpart](fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
2499
2500 /* Consider the RD cost of not signaling any residual */
2501 m_entropyCoder.load(m_rqt[depth].cur);
2502 m_entropyCoder.resetBits();
2503 m_entropyCoder.codeQtRootCbfZero();
2504 uint32_t cbf0Bits = m_entropyCoder.getNumberOfWrittenBits();
2505
2506 uint64_t cbf0Cost;
2507 uint32_t cbf0Energy;
2508 if (m_rdCost.m_psyRd)
2509 {
2510 cbf0Energy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
2511 cbf0Cost = m_rdCost.calcPsyRdCost(cbf0Dist, cbf0Bits, cbf0Energy);
2512 }
2513 else
2514 cbf0Cost = m_rdCost.calcRdCost(cbf0Dist, cbf0Bits);
2515
2516 if (cbf0Cost < costs.rdcost)
2517 {
2518 cu.clearCbf();
2519 cu.setTUDepthSubParts(0, 0, depth);
2520 }
2521 }
2522
2523 if (cu.getQtRootCbf(0))
2524 saveResidualQTData(cu, *resiYuv, 0, depth);
2525
2526 /* calculate signal bits for inter/merge/skip coded CU */
2527 m_entropyCoder.load(m_rqt[depth].cur);
2528
2529 uint32_t coeffBits, bits;
2530 if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
2531 {
2532 cu.setPredModeSubParts(MODE_SKIP);
2533
2534 /* Merge/Skip */
2535 m_entropyCoder.resetBits();
2536 if (m_slice->m_pps->bTransquantBypassEnabled)
2537 m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
2538 m_entropyCoder.codeSkipFlag(cu, 0);
2539 m_entropyCoder.codeMergeIndex(cu, 0);
2540 coeffBits = 0;
2541 bits = m_entropyCoder.getNumberOfWrittenBits();
2542 }
2543 else
2544 {
2545 m_entropyCoder.resetBits();
2546 if (m_slice->m_pps->bTransquantBypassEnabled)
2547 m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
2548 m_entropyCoder.codeSkipFlag(cu, 0);
2549 m_entropyCoder.codePredMode(cu.m_predMode[0]);
2550 m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]);
2551 m_entropyCoder.codePredInfo(cu, 0);
2552 uint32_t mvBits = m_entropyCoder.getNumberOfWrittenBits();
2553
2554 bool bCodeDQP = m_slice->m_pps->bUseDQP;
2555 m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
2556 bits = m_entropyCoder.getNumberOfWrittenBits();
2557
2558 coeffBits = bits - mvBits;
2559 }
2560
2561 m_entropyCoder.store(interMode.contexts);
2562
2563 if (cu.getQtRootCbf(0))
2564 reconYuv->addClip(*predYuv, *resiYuv, log2CUSize);
2565 else
2566 reconYuv->copyFromYuv(*predYuv);
2567
2568 // update with clipped distortion and cost (qp estimation loop uses unclipped values)
2569 uint32_t bestDist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
2570 bestDist += m_rdCost.scaleChromaDist(1, primitives.sse_pp[cpart](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
2571 bestDist += m_rdCost.scaleChromaDist(2, primitives.sse_pp[cpart](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
2572 if (m_rdCost.m_psyRd)
2573 interMode.psyEnergy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
2574
2575 interMode.totalBits = bits;
2576 interMode.distortion = bestDist;
2577 interMode.coeffBits = coeffBits;
2578 interMode.mvBits = bits - coeffBits;
2579 updateModeCost(interMode);
2580 }
2581
2582 void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, const uint32_t depthRange[2])
2583 {
2584 CUData& cu = mode.cu;
2585 X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "invalid depth\n");
2586
2587 uint32_t log2TrSize = g_maxLog2CUSize - depth;
2588 uint32_t tuDepth = depth - cu.m_cuDepth[0];
2589
2590 bool bCheckFull = log2TrSize <= depthRange[1];
2591 if (cu.m_partSize[0] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && log2TrSize > depthRange[0])
2592 bCheckFull = false;
2593
2594 if (bCheckFull)
2595 {
2596 // code full block
2597 uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
2598 bool bCodeChroma = true;
2599 uint32_t tuDepthC = tuDepth;
2600 if (log2TrSizeC < 2)
2601 {
2602 X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
2603 log2TrSizeC = 2;
2604 tuDepthC--;
2605 bCodeChroma = !(absPartIdx & 3);
2606 }
2607
2608 uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
2609 uint32_t setCbf = 1 << tuDepth;
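/* cbf flags are stored as one bit per TU depth; a coded block at this
 * depth sets bit 'tuDepth' of the partition's cbf byte */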
2610
2611 uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
2612 coeff_t *coeffCurY = cu.m_trCoeff[0] + coeffOffsetY;
2613
2614 uint32_t sizeIdx = log2TrSize - 2;
2615
2616 cu.setTUDepthSubParts(depth - cu.m_cuDepth[0], absPartIdx, depth);
2617 cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
2618
2619 ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
2620 const Yuv* fencYuv = mode.fencYuv;
2621
2622 int16_t* curResiY = resiYuv.getLumaAddr(absPartIdx);
2623 uint32_t strideResiY = resiYuv.m_size;
2624
2625 const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
2626 uint32_t numSigY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
2627
2628 if (numSigY)
2629 {
2630 m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSigY);
2631 cu.setCbfSubParts(setCbf, TEXT_LUMA, absPartIdx, depth);
2632 }
2633 else
2634 {
2635 primitives.blockfill_s[sizeIdx](curResiY, strideResiY, 0);
2636 cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, depth);
2637 }
2638
2639 if (bCodeChroma)
2640 {
2641 uint32_t sizeIdxC = log2TrSizeC - 2;
2642 uint32_t strideResiC = resiYuv.m_csize;
2643
2644 uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
2645 coeff_t *coeffCurU = cu.m_trCoeff[1] + coeffOffsetC;
2646 coeff_t *coeffCurV = cu.m_trCoeff[2] + coeffOffsetC;
2647 bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
2648
2649 TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
2650 do
2651 {
2652 uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
2653 uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
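/* in 4:2:2 a chroma TU is split vertically into two square sub-TUs whose
 * coefficients are stored back to back, hence the offset of
 * section * trSizeC * trSizeC */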
2654
2655 cu.setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
2656 cu.setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
2657
2658 int16_t* curResiU = resiYuv.getCbAddr(absPartIdxC);
2659 const pixel* fencCb = fencYuv->getCbAddr(absPartIdxC);
2660 uint32_t numSigU = m_quant.transformNxN(cu, fencCb, fencYuv->m_csize, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false);
2661 if (numSigU)
2662 {
2663 m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, false, false, numSigU);
2664 cu.setCbfPartRange(setCbf, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
2665 }
2666 else
2667 {
2668 primitives.blockfill_s[sizeIdxC](curResiU, strideResiC, 0);
2669 cu.setCbfPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
2670 }
2671
2672 int16_t* curResiV = resiYuv.getCrAddr(absPartIdxC);
2673 const pixel* fencCr = fencYuv->getCrAddr(absPartIdxC);
2674 uint32_t numSigV = m_quant.transformNxN(cu, fencCr, fencYuv->m_csize, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false);
2675 if (numSigV)
2676 {
2677 m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, false, false, numSigV);
2678 cu.setCbfPartRange(setCbf, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
2679 }
2680 else
2681 {
2682 primitives.blockfill_s[sizeIdxC](curResiV, strideResiC, 0);
2683 cu.setCbfPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
2684 }
2685 }
2686 while (tuIterator.isNextSection());
2687
2688 if (splitIntoSubTUs)
2689 {
2690 offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
2691 offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
2692 }
2693 }
2694 }
2695 else
2696 {
2697 X265_CHECK(log2TrSize > depthRange[0], "residualTransformQuantInter recursion check failure\n");
2698
2699 uint32_t qNumParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
2700 uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
2701 for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
2702 {
2703 residualTransformQuantInter(mode, cuGeom, qPartIdx, depth + 1, depthRange);
2704 ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
2705 ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
2706 vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
2707 }
2708 for (uint32_t i = 0; i < 4 * qNumParts; i++)
2709 {
2710 cu.m_cbf[TEXT_LUMA][absPartIdx + i] |= ycbf << tuDepth;
2711 cu.m_cbf[TEXT_CHROMA_U][absPartIdx + i] |= ucbf << tuDepth;
2712 cu.m_cbf[TEXT_CHROMA_V][absPartIdx + i] |= vcbf << tuDepth;
2713 }
2714 }
2715 }
2716
2717 uint64_t Search::estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId)
2718 {
2719 uint32_t nullBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth);
2720
2721 if (m_rdCost.m_psyRd)
2722 return m_rdCost.calcPsyRdCost(dist, nullBits, psyEnergy);
2723 else
2724 return m_rdCost.calcRdCost(dist, nullBits);
2725 }
2726
2727 void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2])
2728 {
2729 CUData& cu = mode.cu;
2730 uint32_t log2TrSize = g_maxLog2CUSize - depth;
2731
2732 bool bCheckSplit = log2TrSize > depthRange[0];
2733 bool bCheckFull = log2TrSize <= depthRange[1];
2734 bool bSplitPresentFlag = bCheckSplit && bCheckFull;
2735
2736 if (cu.m_partSize[0] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && bCheckSplit)
2737 bCheckFull = false;
2738
2739 X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
2740 X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
2741
2742 uint32_t tuDepth = depth - cu.m_cuDepth[0];
2743 uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
2744 bool bCodeChroma = true;
2745 uint32_t tuDepthC = tuDepth;
2746 if (log2TrSizeC < 2)
2747 {
2748 X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
2749 log2TrSizeC = 2;
2750 tuDepthC--;
2751 bCodeChroma = !(absPartIdx & 3);
2752 }
2753
2754 // code full block
2755 Cost fullCost;
2756 fullCost.rdcost = MAX_INT64;
2757
2758 uint8_t cbfFlag[MAX_NUM_COMPONENT][2 /* 0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU */] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2759 uint32_t numSig[MAX_NUM_COMPONENT][2] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2760 uint32_t singleBits[MAX_NUM_COMPONENT][2] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2761 uint32_t singleDist[MAX_NUM_COMPONENT][2] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2762 uint32_t singlePsyEnergy[MAX_NUM_COMPONENT][2] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2763 uint32_t bestTransformMode[MAX_NUM_COMPONENT][2] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2764 uint64_t minCost[MAX_NUM_COMPONENT][2] = { { MAX_INT64, MAX_INT64 }, { MAX_INT64, MAX_INT64 }, { MAX_INT64, MAX_INT64 } };
2765
2766 m_entropyCoder.store(m_rqt[depth].rqtRoot);
2767
2768 uint32_t trSize = 1 << log2TrSize;
2769 const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
2770 uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
2771 const Yuv* fencYuv = mode.fencYuv;
2772
2773 // code full block
2774 if (bCheckFull)
2775 {
2776 uint32_t trSizeC = 1 << log2TrSizeC;
2777 int partSize = partitionFromLog2Size(log2TrSize);
2778 int partSizeC = partitionFromLog2Size(log2TrSizeC);
2779 const uint32_t qtLayer = log2TrSize - 2;
2780 uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
2781 coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
2782
2783 bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0];
2784 bool checkTransformSkipY = checkTransformSkip && log2TrSize <= MAX_LOG2_TS_SIZE;
2785 bool checkTransformSkipC = checkTransformSkip && log2TrSizeC <= MAX_LOG2_TS_SIZE;
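/* transform skip is allowed only up to MAX_LOG2_TS_SIZE (4x4 in the
 * standard profiles); luma and chroma are gated separately since their
 * transform sizes differ */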
2786
2787 cu.setTUDepthSubParts(depth - cu.m_cuDepth[0], absPartIdx, depth);
2788 cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
2789
2790 if (m_bEnableRDOQ)
2791 m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
2792
2793 const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
2794 int16_t* resi = resiYuv.getLumaAddr(absPartIdx);
2795 numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
2796 cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0];
2797
2798 m_entropyCoder.resetBits();
2799
2800 if (bSplitPresentFlag && log2TrSize > depthRange[0])
2801 m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
2802 fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
2803
2804 // Coding of the luma cbf flag has been removed from here: the cbf context differs at each depth,
2805 // so (at least for analysis) it is valid to encode the coefficients first and the cbf afterwards.
2806 // m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
2807 if (cbfFlag[TEXT_LUMA][0])
2808 m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
2809
2810 uint32_t singleBitsPrev = m_entropyCoder.getNumberOfWrittenBits();
2811 singleBits[TEXT_LUMA][0] = singleBitsPrev - fullCost.bits;
2812
2813 X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n");
2814 uint32_t distY = primitives.ssd_s[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size);
2815 uint32_t psyEnergyY = 0;
2816 if (m_rdCost.m_psyRd)
2817 psyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, (int16_t*)zeroShort, 0);
2818
2819 int16_t* curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
2820 uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size;
2821
2822 if (cbfFlag[TEXT_LUMA][0])
2823 {
2824 m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSig[TEXT_LUMA][0]); //this is for inter mode only
2825
2826 // non-zero cost calculation for luma - this is an approximation;
2827 // the correct cbf is encoded only after comparing against the null cost
2828 const uint32_t nonZeroDistY = primitives.sse_ss[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
2829 uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
2830 uint32_t nonZeroPsyEnergyY = 0; uint64_t singleCostY = 0;
2831 if (m_rdCost.m_psyRd)
2832 {
2833 nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
2834 singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroPsyEnergyY);
2835 }
2836 else
2837 singleCostY = m_rdCost.calcRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0]);
2838
2839 if (cu.m_tqBypass[0])
2840 {
2841 singleDist[TEXT_LUMA][0] = nonZeroDistY;
2842 singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
2843 }
2844 else
2845 {
2846 // zero-cost calculation for luma; this is an approximation, as was the initial cost
2847 // calculation. The zero cbf is encoded via the bit estimator without writing to the
2848 // bitstream, leaving m_fracBits unchanged. The same holds for chroma below.
2849 uint64_t nullCostY = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
2850
2851 if (nullCostY < singleCostY)
2852 {
2853 cbfFlag[TEXT_LUMA][0] = 0;
2854 singleBits[TEXT_LUMA][0] = 0;
2855 primitives.blockfill_s[partSize](curResiY, strideResiY, 0);
2856 #if CHECKED_BUILD || _DEBUG
2857 uint32_t numCoeffY = 1 << (log2TrSize << 1);
2858 memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY);
2859 #endif
2860 if (checkTransformSkipY)
2861 minCost[TEXT_LUMA][0] = nullCostY;
2862 singleDist[TEXT_LUMA][0] = distY;
2863 singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY;
2864 }
2865 else
2866 {
2867 if (checkTransformSkipY)
2868 minCost[TEXT_LUMA][0] = singleCostY;
2869 singleDist[TEXT_LUMA][0] = nonZeroDistY;
2870 singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
2871 }
2872 }
2873 }
2874 else
2875 {
2876 if (checkTransformSkipY)
2877 minCost[TEXT_LUMA][0] = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
2878 primitives.blockfill_s[partSize](curResiY, strideResiY, 0);
2879 singleDist[TEXT_LUMA][0] = distY;
2880 singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY;
2881 }
2882
2883 cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
2884
2885 if (bCodeChroma)
2886 {
2887 uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
2888 uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
2889 for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
2890 {
2891 uint32_t distC = 0, psyEnergyC = 0;
2892 coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
2893 TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
2894
2895 do
2896 {
2897 uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
2898 uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
2899
2900 cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
2901
2902 if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
2903 m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
2904
2905 fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
2906 resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
2907 numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
2908 cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
2909
2910 // Coding of the chroma cbf flags has been removed from here, as for luma above
2911 // m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][tuIterator.section], tuDepth);
2912 if (cbfFlag[chromaId][tuIterator.section])
2913 m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
2914 uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits();
2915 singleBits[chromaId][tuIterator.section] = newBits - singleBitsPrev;
2916 singleBitsPrev = newBits;
2917
2918 int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
2919 distC = m_rdCost.scaleChromaDist(chromaId, primitives.ssd_s[log2TrSizeC - 2](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize));
2920
2921 if (cbfFlag[chromaId][tuIterator.section])
2922 {
2923 m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiC, strideResiC, coeffCurC + subTUOffset,
2924 log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]);
2925
2926 // non-zero cost calculation for chroma, analogous to luma - this is an approximation;
2927 // the correct cbf is encoded only after comparing against the null cost
2928 uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
2929 uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
2930 uint32_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
2931 uint32_t nonZeroPsyEnergyC = 0; uint64_t singleCostC = 0;
2932 if (m_rdCost.m_psyRd)
2933 {
2934 nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
2935 singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
2936 }
2937 else
2938 singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]);
2939
2940 if (cu.m_tqBypass[0])
2941 {
2942 singleDist[chromaId][tuIterator.section] = nonZeroDistC;
2943 singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
2944 }
2945 else
2946 {
                            // Zero-cost calculation for chroma - this is an approximation
                            uint64_t nullCostC = estimateNullCbfCost(distC, psyEnergyC, tuDepth, (TextType)chromaId);

                            if (nullCostC < singleCostC)
                            {
                                cbfFlag[chromaId][tuIterator.section] = 0;
                                singleBits[chromaId][tuIterator.section] = 0;
                                primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0);
#if CHECKED_BUILD || _DEBUG
                                uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
                                memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
#endif
                                if (checkTransformSkipC)
                                    minCost[chromaId][tuIterator.section] = nullCostC;
                                singleDist[chromaId][tuIterator.section] = distC;
                                singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC;
                            }
                            else
                            {
                                if (checkTransformSkipC)
                                    minCost[chromaId][tuIterator.section] = singleCostC;
                                singleDist[chromaId][tuIterator.section] = nonZeroDistC;
                                singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
                            }
                        }
                    }
                    else
                    {
                        if (checkTransformSkipC)
                            minCost[chromaId][tuIterator.section] = estimateNullCbfCost(distC, psyEnergyC, tuDepthC, (TextType)chromaId);
                        primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0);
                        singleDist[chromaId][tuIterator.section] = distC;
                        singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC;
                    }

                    cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
                }
                while (tuIterator.isNextSection());
            }
        }

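        /* Transform-skip evaluation for luma: re-run the forward transform with the skip
         * flag set, code the coefficients into temporary buffers, and keep skip mode only
         * if its RD cost beats the best transform cost found above (minCost). */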
        if (checkTransformSkipY)
        {
            uint32_t nonZeroDistY = 0;
            uint32_t nonZeroPsyEnergyY = 0;
            uint64_t singleCostY = MAX_INT64;

            ALIGN_VAR_32(coeff_t, tsCoeffY[MAX_TS_SIZE * MAX_TS_SIZE]);
            ALIGN_VAR_32(int16_t, tsResiY[MAX_TS_SIZE * MAX_TS_SIZE]);

            m_entropyCoder.load(m_rqt[depth].rqtRoot);

            cu.setTransformSkipSubParts(1, TEXT_LUMA, absPartIdx, depth);

            if (m_bEnableRDOQ)
                m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);

            fenc = fencYuv->getLumaAddr(absPartIdx);
            resi = resiYuv.getLumaAddr(absPartIdx);
            uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, tsCoeffY, log2TrSize, TEXT_LUMA, absPartIdx, true);

            if (numSigTSkipY)
            {
                m_entropyCoder.resetBits();
                m_entropyCoder.codeQtCbfLuma(!!numSigTSkipY, tuDepth);
                m_entropyCoder.codeCoeffNxN(cu, tsCoeffY, absPartIdx, log2TrSize, TEXT_LUMA);
                const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits();

                m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], tsResiY, trSize, tsCoeffY, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);

                nonZeroDistY = primitives.sse_ss[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, tsResiY, trSize);

                if (m_rdCost.m_psyRd)
                {
                    nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, tsResiY, trSize);
                    singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroPsyEnergyY);
                }
                else
                    singleCostY = m_rdCost.calcRdCost(nonZeroDistY, skipSingleBitsY);
            }

            if (!numSigTSkipY || minCost[TEXT_LUMA][0] < singleCostY)
                cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
            else
            {
                singleDist[TEXT_LUMA][0] = nonZeroDistY;
                singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
                cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
                bestTransformMode[TEXT_LUMA][0] = 1;
                uint32_t numCoeffY = 1 << (log2TrSize << 1);
                memcpy(coeffCurY, tsCoeffY, sizeof(coeff_t) * numCoeffY);
                primitives.luma_copy_ss[partSize](curResiY, strideResiY, tsResiY, trSize);
            }

            cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
        }

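        /* Transform-skip evaluation for chroma, per component and per sub-TU, using the
         * same trial-and-compare scheme as the luma transform-skip check above. */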
        if (bCodeChroma && checkTransformSkipC)
        {
            uint32_t nonZeroDistC = 0, nonZeroPsyEnergyC = 0;
            uint64_t singleCostC = MAX_INT64;
            uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);

            m_entropyCoder.load(m_rqt[depth].rqtRoot);

            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
            {
                coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
                TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);

                do
                {
                    uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
                    uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);

                    int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);

                    ALIGN_VAR_32(coeff_t, tsCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
                    ALIGN_VAR_32(int16_t, tsResiC[MAX_TS_SIZE * MAX_TS_SIZE]);

                    cu.setTransformSkipPartRange(1, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);

                    if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
                        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);

                    fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
                    resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
                    uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, tsCoeffC, log2TrSizeC, (TextType)chromaId, absPartIdxC, true);

                    m_entropyCoder.resetBits();
                    singleBits[chromaId][tuIterator.section] = 0;

                    if (numSigTSkipC)
                    {
                        m_entropyCoder.codeQtCbfChroma(!!numSigTSkipC, tuDepth);
                        m_entropyCoder.codeCoeffNxN(cu, tsCoeffC, absPartIdxC, log2TrSizeC, (TextType)chromaId);
                        singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();

                        m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], tsResiC, trSizeC, tsCoeffC,
                                                log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
                        uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC);
                        nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
                        if (m_rdCost.m_psyRd)
                        {
                            nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC);
                            singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
                        }
                        else
                            singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section]);
                    }

                    if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC)
                        cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
                    else
                    {
                        singleDist[chromaId][tuIterator.section] = nonZeroDistC;
                        singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
                        cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC;
                        bestTransformMode[chromaId][tuIterator.section] = 1;
                        uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
                        memcpy(coeffCurC + subTUOffset, tsCoeffC, sizeof(coeff_t) * numCoeffC);
                        primitives.luma_copy_ss[partSizeC](curResiC, strideResiC, tsResiC, trSizeC);
                    }

                    cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
                }
                while (tuIterator.isNextSection());
            }
        }

        // Previously, CBFs and coefficients were encoded here together, after the distortion
        // calculation above. Now only the CBFs are encoded, since the coefficients were
        // already encoded above; their bit cost was collected separately and is added to the
        // CBF bits. Testing showed the coding order makes no difference, though it remains
        // unclear whether the original context should be loaded, as is done below.
        m_entropyCoder.load(m_rqt[depth].rqtRoot);
        m_entropyCoder.resetBits();

        // Encode CBF flags
        if (bCodeChroma)
        {
            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
            {
                if (!splitIntoSubTUs)
                    m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][0], tuDepth);
                else
                {
                    offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx);
                    m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][0], tuDepth);
                    m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][1], tuDepth);
                }
            }
        }

        m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);

        uint32_t cbfBits = m_entropyCoder.getNumberOfWrittenBits();

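        /* coeffBits accumulates the per-component coefficient bits collected during the
         * analysis above; both sub-TU slots are summed to cover the 4:2:2 split case. */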
        uint32_t coeffBits = singleBits[TEXT_LUMA][0];
        for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
        {
            coeffBits += singleBits[TEXT_CHROMA_U][subTUIndex];
            coeffBits += singleBits[TEXT_CHROMA_V][subTUIndex];
        }

        // In split mode, only coeffBits are needed, because chroma CBF coding differs from
        // luma: if any of the four split blocks has a non-zero chroma CBF, a CBF of 1 must
        // be coded at this level, followed by each split block's individual CBF value. That
        // is not known until all four split blocks have been analyzed, so only the
        // individual coefficient bits are collected here.
        fullCost.bits = bSplitPresentFlag ? cbfBits + coeffBits : coeffBits;

        fullCost.distortion += singleDist[TEXT_LUMA][0];
        fullCost.energy += singlePsyEnergy[TEXT_LUMA][0]; // TODO: check whether chroma psy-energy should be added as well
        for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
        {
            fullCost.distortion += singleDist[TEXT_CHROMA_U][subTUIndex];
            fullCost.distortion += singleDist[TEXT_CHROMA_V][subTUIndex];
        }

        if (m_rdCost.m_psyRd)
            fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
        else
            fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
    }

    // code sub-blocks
    if (bCheckSplit)
    {
        if (bCheckFull)
        {
            m_entropyCoder.store(m_rqt[depth].rqtTest);
            m_entropyCoder.load(m_rqt[depth].rqtRoot);
        }

        Cost splitCost;
        if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
        {
            // The subdiv flag can be encoded at the start of analysis of the split blocks.
            m_entropyCoder.resetBits();
            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
            splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
        }

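        /* qNumParts is the number of 4x4 partition units covered by one of the four
         * sub-TUs. A worked example, assuming LOG2_UNIT_SIZE == 2 (4x4 units): for a
         * 16x16 TU (log2TrSize == 4), each 8x8 sub-TU covers
         * 1 << ((4 - 1 - 2) * 2) == 4 partition units. */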
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
        {
            estimateResidualQT(mode, cuGeom, qPartIdx, depth + 1, resiYuv, splitCost, depthRange);
            ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
            ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
            vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
        }
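        /* Propagate the OR of the four sub-TU CBFs into the CBF bit at this TU depth for
         * every partition unit covered by this TU. */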
        for (uint32_t i = 0; i < 4 * qNumParts; ++i)
        {
            cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth;
            cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
            cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
        }

        // Previously, CBFs and coefficients for the split blocks were encoded here. Since
        // the coefficient bits have already been collected per block, only the CBF values
        // are encoded now. As noted above, chroma CBF coding differs from luma. One open
        // question remains: the coefficients may have been encoded in the context at one
        // depth (e.g. 2) while the CBFs are encoded in the context at another (e.g. 0).
        m_entropyCoder.load(m_rqt[depth].rqtRoot);
        m_entropyCoder.resetBits();

        codeInterSubdivCbfQT(cu, absPartIdx, depth, depthRange);
        uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
        splitCost.bits += splitCbfBits;

        if (m_rdCost.m_psyRd)
            splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
        else
            splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);

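        /* Prefer the split if any sub-TU coded coefficients or if the full TU was never
         * evaluated; when the split RD cost wins, accumulate it and return early.
         * Otherwise restore the full-TU transform-skip flags and entropy state. */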
        if (ycbf || ucbf || vcbf || !bCheckFull)
        {
            if (splitCost.rdcost < fullCost.rdcost)
            {
                outCosts.distortion += splitCost.distortion;
                outCosts.rdcost += splitCost.rdcost;
                outCosts.bits += splitCost.bits;
                outCosts.energy += splitCost.energy;
                return;
            }
            else
                outCosts.energy += splitCost.energy;
        }

        cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth);
        if (bCodeChroma)
        {
            if (!splitIntoSubTUs)
            {
                cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx, depth);
                cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx, depth);
            }
            else
            {
                uint32_t tuNumParts = absPartIdxStep >> 1;
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx, tuNumParts);
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][1], TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx, tuNumParts);
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][1], TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
            }
        }
        X265_CHECK(bCheckFull, "check-full must be set\n");
        m_entropyCoder.load(m_rqt[depth].rqtTest);
    }

    cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
    cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);

    if (bCodeChroma)
    {
        if (!splitIntoSubTUs)
        {
            cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx, depth);
            cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx, depth);
        }
        else
        {
            uint32_t tuNumParts = absPartIdxStep >> 1;

            offsetCBFs(cbfFlag[TEXT_CHROMA_U]);
            offsetCBFs(cbfFlag[TEXT_CHROMA_V]);
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx, tuNumParts);
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][1] << tuDepth, TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx, tuNumParts);
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][1] << tuDepth, TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
        }
    }

    outCosts.distortion += fullCost.distortion;
    outCosts.rdcost += fullCost.rdcost;
    outCosts.bits += fullCost.bits;
    outCosts.energy += fullCost.energy;
}

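/* recursively code the chroma and luma CBF flags of an inter CU's residual
 * quad-tree, following the subdivision structure chosen by estimateResidualQT() */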
void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, const uint32_t depthRange[2])
{
    X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
    X265_CHECK(cu.isInter(absPartIdx), "codeInterSubdivCbfQT() with intra block\n");

    const uint32_t tuDepth = depth - cu.m_cuDepth[0];
    const bool bSubdiv = tuDepth != cu.m_tuDepth[absPartIdx];
    const uint32_t log2TrSize = g_maxLog2CUSize - depth;

    if (!(log2TrSize - m_hChromaShift < 2))
    {
        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !bSubdiv);
        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !bSubdiv);
    }
    else
    {
        X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1), "chroma CBF not matching\n");
        X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1), "chroma CBF not matching\n");
    }

    if (!bSubdiv)
    {
        m_entropyCoder.codeQtCbfLuma(cu, absPartIdx, tuDepth);
    }
    else
    {
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
            codeInterSubdivCbfQT(cu, absPartIdx, depth + 1, depthRange);
    }
}

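/* code the coefficients of one texture component (luma, Cb, or Cr) of an inter
 * CU's residual quad-tree, recursing into sub-TUs wherever the tree is subdivided */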
void Search::encodeResidualQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, TextType ttype, const uint32_t depthRange[2])
{
    X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
    X265_CHECK(cu.isInter(absPartIdx), "encodeResidualQT() with intra block\n");

    const uint32_t curTuDepth = depth - cu.m_cuDepth[0];
    const uint32_t tuDepth = cu.m_tuDepth[absPartIdx];
    const bool bSubdiv = curTuDepth != tuDepth;
    const uint32_t log2TrSize = g_maxLog2CUSize - depth;

    if (bSubdiv)
    {
        if (cu.getCbf(absPartIdx, ttype, curTuDepth))
        {
            uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
            for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
                encodeResidualQT(cu, absPartIdx, depth + 1, ttype, depthRange);
        }
        return;
    }
    else
    {
        const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
        uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;

        // Luma
        const uint32_t qtLayer = log2TrSize - 2;
        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
        coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;

        // Chroma
        bool bCodeChroma = true;
        uint32_t tuDepthC = tuDepth;
        if (log2TrSize == 2 && m_csp != X265_CSP_I444)
        {
            X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
            log2TrSizeC++;
            tuDepthC--;
            bCodeChroma = !(absPartIdx & 3);
        }

        if (ttype == TEXT_LUMA && cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
            m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);

        if (bCodeChroma)
        {
            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
            coeff_t* coeffCurU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
            coeff_t* coeffCurV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;

            if (!splitIntoSubTUs)
            {
                if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
                    m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
                if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
                    m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
            }
            else
            {
                uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);
                uint32_t subTUSize = 1 << (log2TrSizeC * 2);
                if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
                {
                    if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
                        m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
                    if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
                        m_entropyCoder.codeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, TEXT_CHROMA_U);
                }
                if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
                {
                    if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
                        m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
                    if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
                        m_entropyCoder.codeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, TEXT_CHROMA_V);
                }
            }
        }
    }
}

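/* copy the best residual and coefficients for this TU from the RQT temp buffers
 * into the CU's persistent buffers, recursing down to the stored TU depth */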
void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t depth)
{
    X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
    const uint32_t curTrMode = depth - cu.m_cuDepth[0];
    const uint32_t tuDepth = cu.m_tuDepth[absPartIdx];
    const uint32_t log2TrSize = g_maxLog2CUSize - depth;

    if (curTrMode < tuDepth)
    {
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
            saveResidualQTData(cu, resiYuv, absPartIdx, depth + 1);
        return;
    }

    const uint32_t qtLayer = log2TrSize - 2;

    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
    bool bCodeChroma = true;
    uint32_t tuDepthC = tuDepth;
    if (log2TrSizeC < 2)
    {
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        log2TrSizeC = 2;
        tuDepthC--;
        bCodeChroma = !(absPartIdx & 3);
    }

    m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize);

    uint32_t numCoeffY = 1 << (log2TrSize * 2);
    uint32_t coeffOffsetY = absPartIdx << LOG2_UNIT_SIZE * 2;
    coeff_t* coeffSrcY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
    coeff_t* coeffDstY = cu.m_trCoeff[0] + coeffOffsetY;
    memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY);

    if (bCodeChroma)
    {
        m_rqt[qtLayer].resiQtYuv.copyPartToPartChroma(resiYuv, absPartIdx, log2TrSizeC + m_hChromaShift);

        uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
        uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);

        coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
        coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
        coeff_t* coeffDstU = cu.m_trCoeff[1] + coeffOffsetC;
        coeff_t* coeffDstV = cu.m_trCoeff[2] + coeffOffsetC;
        memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
        memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);
    }
}

/* Returns the number of bits required to signal a non-most-probable intra mode;
 * on return mpms contains a bitmap of the three most probable modes */
uint32_t Search::getIntraRemModeBits(CUData& cu, uint32_t absPartIdx, uint32_t preds[3], uint64_t& mpms) const
{
    cu.getIntraDirLumaPredictor(absPartIdx, preds);

    mpms = 0;
    for (int i = 0; i < 3; ++i)
        mpms |= ((uint64_t)1 << preds[i]);

    return m_entropyCoder.bitsIntraModeNonMPM();
}

/* Swap the current mode/cost with the mode with the highest cost in the
 * current candidate list, if its cost is better (maintain a top-N list) */
void Search::updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList)
{
    uint32_t maxIndex = 0;
    uint64_t maxValue = 0;

    for (int i = 0; i < maxCandCount; i++)
    {
        if (maxValue < candCostList[i])
        {
            maxValue = candCostList[i];
            maxIndex = i;
        }
    }

    if (cost < maxValue)
    {
        candCostList[maxIndex] = cost;
        candModeList[maxIndex] = mode;
    }
}
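
/* A minimal usage sketch for updateCandList() (hypothetical, not from the encoder):
 * keep the three cheapest of 35 candidate intra modes. getModeCost() is a placeholder
 * for whatever cost measure the caller uses, and the cost list must be seeded with
 * MAX_INT64 so the first candidates are accepted:
 *
 *     uint32_t candModes[3] = { 0, 0, 0 };
 *     uint64_t candCosts[3] = { MAX_INT64, MAX_INT64, MAX_INT64 };
 *     for (uint32_t m = 0; m < 35; m++)
 *         updateCandList(m, getModeCost(m), 3, candModes, candCosts);
 */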