source/encoder/search.cpp

   1 /*****************************************************************************
   2 * Copyright (C) 2013 x265 project
   3 *
   4 * Authors: Steve Borho <steve@borho.org>
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License as published by
   8 * the Free Software Foundation; either version 2 of the License, or
   9 * (at your option) any later version.
  10 *
  11 * This program is distributed in the hope that it will be useful,
  12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 * GNU General Public License for more details.
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * along with this program; if not, write to the Free Software
  18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  19 *
  20 * This program is also available under a commercial proprietary license.
  21 * For more information, contact us at license @ x265.com.
  22 *****************************************************************************/
  23
  24 #include "common.h"
  25 #include "primitives.h"
  26 #include "picyuv.h"
  27 #include "cudata.h"
  28
  29 #include "search.h"
  30 #include "entropy.h"
  31 #include "rdcost.h"
  32
  33 using namespace x265;
  34
  35 #if _MSC_VER
  36 #pragma warning(disable: 4800) // 'uint8_t' : forcing value to bool 'true' or 'false' (performance warning)
  37 #pragma warning(disable: 4244) // '=' : conversion from 'int' to 'uint8_t', possible loss of data)
  38 #endif
  39
  40 ALIGN_VAR_32(const pixel, Search::zeroPixel[MAX_CU_SIZE]) = { 0 };
  41 ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 };
  42
  43 Search::Search() : JobProvider(NULL)
  44 {
  45     memset(m_rqt, 0, sizeof(m_rqt));
  46
  47     for (int i = 0; i < 3; i++)
  48     {
  49         m_qtTempTransformSkipFlag[i] = NULL;
  50         m_qtTempCbf[i] = NULL;
  51     }
  52
  53     m_numLayers = 0;
  54     m_param = NULL;
  55     m_slice = NULL;
  56     m_frame = NULL;
  57     m_bJobsQueued = false;
  58     m_totalNumME = m_numAcquiredME = m_numCompletedME = 0;
  59 }
  60
  61 bool Search::initSearch(const x265_param& param, ScalingList& scalingList)
  62 {
  63     m_param = &param;
  64     m_bEnableRDOQ = param.rdLevel >= 4;
  65     m_bFrameParallel = param.frameNumThreads > 1;
  66     m_numLayers = g_log2Size[param.maxCUSize] - 2;
  67
  68     m_rdCost.setPsyRdScale(param.psyRd);
  69     m_me.setSearchMethod(param.searchMethod);
  70     m_me.setSubpelRefine(param.subpelRefine);
  71
  72     bool ok = m_quant.init(m_bEnableRDOQ, param.psyRdoq, scalingList, m_entropyCoder);
  73     if (m_param->noiseReduction)
  74         ok &= m_quant.allocNoiseReduction(param);
  75
  76     ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */
  77
  78     /* When frame parallelism is active, only 'refLagPixels' of reference frames will be guaranteed
  79      * available for motion reference.  See refLagRows in FrameEncoder::compressCTURows() */
  80     m_refLagPixels = m_bFrameParallel ? param.searchRange : param.sourceHeight;
  81
  82     uint32_t sizeL = 1 << (g_maxLog2CUSize * 2);
  83     uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
  84     uint32_t numPartitions = NUM_CU_PARTITIONS;
  85
  86     /* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32
  87      * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
  88      * which are reconstructed at each depth are valid. At the end, the transform depth table
  89      * is walked and the coeff and recon at the correct depths are collected */
  90     for (uint32_t i = 0; i <= m_numLayers; i++)
  91     {
  92         CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
  93         m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
  94         m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
  95         ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
  96         ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
  97     }
  98
  99     /* the rest of these buffers are indexed per-depth */
 100     for (uint32_t i = 0; i <= g_maxCUDepth; i++)
 101     {
 102         int cuSize = g_maxCUSize >> i;
 103         ok &= m_rqt[i].tmpResiYuv.create(cuSize, param.internalCsp);
 104         ok &= m_rqt[i].tmpPredYuv.create(cuSize, param.internalCsp);
 105         ok &= m_rqt[i].bidirPredYuv[0].create(cuSize, param.internalCsp);
 106         ok &= m_rqt[i].bidirPredYuv[1].create(cuSize, param.internalCsp);
 107     }
 108
 109     CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
 110     m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
 111     m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
 112     CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
 113     m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
 114     m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
 115
 116     return ok;
 117
 118 fail:
 119     return false;
 120 }
 121
 122 Search::~Search()
 123 {
 124     for (uint32_t i = 0; i <= m_numLayers; i++)
 125     {
 126         X265_FREE(m_rqt[i].coeffRQT[0]);
 127         m_rqt[i].reconQtYuv.destroy();
 128         m_rqt[i].resiQtYuv.destroy();
 129     }
 130
 131     for (uint32_t i = 0; i <= g_maxCUDepth; i++)
 132     {
 133         m_rqt[i].tmpResiYuv.destroy();
 134         m_rqt[i].tmpPredYuv.destroy();
 135         m_rqt[i].bidirPredYuv[0].destroy();
 136         m_rqt[i].bidirPredYuv[1].destroy();
 137     }
 138
 139     X265_FREE(m_qtTempCbf[0]);
 140     X265_FREE(m_qtTempTransformSkipFlag[0]);
 141 }
 142
 143 void Search::setQP(const Slice& slice, int qp)
 144 {
 145     x265_emms(); /* TODO: if the lambda tables were ints, this would not be necessary */
 146     m_me.setQP(qp);
 147     m_rdCost.setQP(slice, qp);
 148 }
 149
 150 #if CHECKED_BUILD || _DEBUG
 151 void Search::invalidateContexts(int fromDepth)
 152 {
 153     /* catch reads without previous writes */
 154     for (int d = fromDepth; d < NUM_FULL_DEPTH; d++)
 155     {
 156         m_rqt[d].cur.markInvalid();
 157         m_rqt[d].rqtTemp.markInvalid();
 158         m_rqt[d].rqtRoot.markInvalid();
 159         m_rqt[d].rqtTest.markInvalid();
 160     }
 161 }
 162 #else
 163 void Search::invalidateContexts(int) {}
 164 #endif
 165
 166 void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height)
 167 {
 168     uint32_t fullDepth  = cu.m_cuDepth[0] + trDepth;
 169     uint32_t tuDepthL   = cu.m_tuDepth[absPartIdx];
 170     uint32_t subdiv     = tuDepthL > trDepth;
 171     uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
 172
 173     bool mCodeAll = true;
 174     const uint32_t numPels = 1 << (log2TrSize * 2 - m_hChromaShift - m_vChromaShift);
 175     if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE))
 176         mCodeAll = false;
 177
 178     if (mCodeAll)
 179     {
 180         if (!trDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepth - 1))
 181             m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, (width >> m_hChromaShift), (height >> m_vChromaShift), TEXT_CHROMA_U, trDepth, !subdiv);
 182
 183         if (!trDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepth - 1))
 184             m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, (width >> m_hChromaShift), (height >> m_vChromaShift), TEXT_CHROMA_V, trDepth, !subdiv);
 185     }
 186
 187     if (subdiv)
 188     {
 189         absPartIdxStep >>= 2;
 190         width  >>= 1;
 191         height >>= 1;
 192
 193         uint32_t qtPartNum = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
 194         for (uint32_t part = 0; part < 4; part++)
 195             codeSubdivCbfQTChroma(cu, trDepth + 1, absPartIdx + part * qtPartNum, absPartIdxStep, width, height);
 196     }
 197 }
 198
 199 void Search::codeCoeffQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, TextType ttype)
 200 {
 201     if (!cu.getCbf(absPartIdx, ttype, trDepth))
 202         return;
 203
 204     uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
 205     uint32_t tuDepthL  = cu.m_tuDepth[absPartIdx];
 206
 207     if (tuDepthL > trDepth)
 208     {
 209         uint32_t qtPartNum = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
 210         for (uint32_t part = 0; part < 4; part++)
 211             codeCoeffQTChroma(cu, trDepth + 1, absPartIdx + part * qtPartNum, ttype);
 212
 213         return;
 214     }
 215
 216     uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
 217
 218     uint32_t trDepthC = trDepth;
 219     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
 220
 221     if (log2TrSizeC == 1)
 222     {
 223         X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth, "transform size too small\n");
 224         trDepthC--;
 225         log2TrSizeC++;
 226         uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
 227         bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
 228         if (!bFirstQ)
 229             return;
 230     }
 231
 232     uint32_t qtLayer = log2TrSize - 2;
 233
 234     if (m_csp != X265_CSP_I422)
 235     {
 236         uint32_t shift = (m_csp == X265_CSP_I420) ? 2 : 0;
 237         uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - shift);
 238         coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
 239         m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
 240     }
 241     else
 242     {
 243         uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - 1);
 244         coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
 245         uint32_t subTUSize = 1 << (log2TrSizeC * 2);
 246         uint32_t partIdxesPerSubTU  = NUM_CU_PARTITIONS >> (((cu.m_cuDepth[absPartIdx] + trDepthC) << 1) + 1);
 247         if (cu.getCbf(absPartIdx, ttype, trDepth + 1))
 248             m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
 249         if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, trDepth + 1))
 250             m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, ttype);
 251     }
 252 }
 253
 254 void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, uint32_t depthRange[2])
 255 {
 256     uint32_t fullDepth  = mode.cu.m_cuDepth[0] + trDepth;
 257     uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
 258     uint32_t qtLayer    = log2TrSize - 2;
 259     uint32_t sizeIdx    = log2TrSize - 2;
 260     bool mightNotSplit  = log2TrSize <= depthRange[1];
 261     bool mightSplit     = (log2TrSize > depthRange[0]) && (bAllowSplit || !mightNotSplit);
 262
 263     /* If maximum RD penalty, force spits at TU size 32x32 if SPS allows TUs of 16x16 */
 264     if (m_param->rdPenalty == 2 && m_slice->m_sliceType != I_SLICE && log2TrSize == 5 && depthRange[0] <= 4)
 265     {
 266         mightNotSplit = false;
 267         mightSplit = true;
 268     }
 269
 270     CUData& cu = mode.cu;
 271
 272     Cost fullCost;
 273     uint32_t bCBF = 0;
 274
 275     pixel*   reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
 276     uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;
 277
 278     if (mightNotSplit)
 279     {
 280         if (mightSplit)
 281             m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
 282
 283         pixel*   fenc     = const_cast<pixel*>(mode.fencYuv->getLumaAddr(absPartIdx));
 284         pixel*   pred     = mode.predYuv.getLumaAddr(absPartIdx);
 285         int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
 286         uint32_t stride   = mode.fencYuv->m_size;
 287
 288         // init availability pattern
 289         uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
 290         initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode);
 291
 292         // get prediction signal
 293         predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
 294
 295         cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
 296         cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
 297
 298         uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
 299         coeff_t* coeffY       = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
 300
 301         // store original entropy coding status
 302         if (m_bEnableRDOQ)
 303             m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
 304
 305         primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
 306
 307         uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
 308         if (numSig)
 309         {
 310             m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
 311             primitives.luma_add_ps[sizeIdx](reconQt, reconQtStride, pred, residual, stride, stride);
 312         }
 313         else
 314             // no coded residual, recon = pred
 315             primitives.square_copy_pp[sizeIdx](reconQt, reconQtStride, pred, stride);
 316
 317         bCBF = !!numSig << trDepth;
 318         cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
 319         fullCost.distortion = primitives.sse_pp[sizeIdx](reconQt, reconQtStride, fenc, stride);
 320
 321         m_entropyCoder.resetBits();
 322         if (!absPartIdx)
 323         {
 324             if (!cu.m_slice->isIntra())
 325             {
 326                 if (cu.m_slice->m_pps->bTransquantBypassEnabled)
 327                     m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
 328                 m_entropyCoder.codeSkipFlag(cu, 0);
 329                 m_entropyCoder.codePredMode(cu.m_predMode[0]);
 330             }
 331
 332             m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]);
 333         }
 334         if (cu.m_partSize[0] == SIZE_2Nx2N)
 335         {
 336             if (!absPartIdx)
 337                 m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
 338         }
 339         else
 340         {
 341             uint32_t qtNumParts = cuGeom.numPartitions >> 2;
 342             if (!trDepth)
 343             {
 344                 for (uint32_t part = 0; part < 4; part++)
 345                     m_entropyCoder.codeIntraDirLumaAng(cu, part * qtNumParts, false);
 346             }
 347             else if (!(absPartIdx & (qtNumParts - 1)))
 348                 m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
 349         }
 350         if (log2TrSize != depthRange[0])
 351             m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
 352
 353         m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]);
 354
 355         if (cu.getCbf(absPartIdx, TEXT_LUMA, trDepth))
 356             m_entropyCoder.codeCoeffNxN(cu, coeffY, absPartIdx, log2TrSize, TEXT_LUMA);
 357
 358         fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
 359
 360         if (m_param->rdPenalty && log2TrSize == 5 && m_slice->m_sliceType != I_SLICE)
 361             fullCost.bits *= 4;
 362
 363         if (m_rdCost.m_psyRd)
 364         {
 365             fullCost.energy = m_rdCost.psyCost(sizeIdx, fenc, mode.fencYuv->m_size, reconQt, reconQtStride);
 366             fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
 367         }
 368         else
 369             fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
 370     }
 371     else
 372         fullCost.rdcost = MAX_INT64;
 373
 374     if (mightSplit)
 375     {
 376         if (mightNotSplit)
 377         {
 378             m_entropyCoder.store(m_rqt[fullDepth].rqtTest);  // save state after full TU encode
 379             m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);   // prep state of split encode
 380         }
 381
 382         // code split block
 383         uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
 384         uint32_t absPartIdxSub = absPartIdx;
 385
 386         int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
 387         if (m_param->bEnableTSkipFast)
 388             checkTransformSkip &= cu.m_partSize[absPartIdx] == SIZE_NxN;
 389
 390         Cost splitCost;
 391         uint32_t cbf = 0;
 392         for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv)
 393         {
 394             if (checkTransformSkip)
 395                 codeIntraLumaTSkip(mode, cuGeom, trDepth + 1, absPartIdxSub, splitCost);
 396             else
 397                 codeIntraLumaQT(mode, cuGeom, trDepth + 1, absPartIdxSub, bAllowSplit, splitCost, depthRange);
 398
 399             cbf |= cu.getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1);
 400         }
 401         for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
 402             cu.m_cbf[0][absPartIdx + offs] |= (cbf << trDepth);
 403
 404         if (mightNotSplit && log2TrSize != depthRange[0])
 405         {
 406             /* If we could have coded this TU depth, include cost of subdiv flag */
 407             m_entropyCoder.resetBits();
 408             m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
 409             splitCost.bits += m_entropyCoder.getNumberOfWrittenBits();
 410
 411             if (m_rdCost.m_psyRd)
 412                 splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
 413             else
 414                 splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
 415         }
 416
 417         if (splitCost.rdcost < fullCost.rdcost)
 418         {
 419             outCost.rdcost     += splitCost.rdcost;
 420             outCost.distortion += splitCost.distortion;
 421             outCost.bits       += splitCost.bits;
 422             outCost.energy     += splitCost.energy;
 423             return;
 424         }
 425         else
 426         {
 427             // recover entropy state of full-size TU encode
 428             m_entropyCoder.load(m_rqt[fullDepth].rqtTest);
 429
 430             // recover transform index and Cbf values
 431             cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
 432             cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
 433             cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
 434         }
 435     }
 436
 437     // set reconstruction for next intra prediction blocks if full TU prediction won
 438     pixel*   picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
 439     intptr_t picStride = m_frame->m_reconPicYuv->m_stride;
 440     primitives.square_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);
 441
 442     outCost.rdcost     += fullCost.rdcost;
 443     outCost.distortion += fullCost.distortion;
 444     outCost.bits       += fullCost.bits;
 445     outCost.energy     += fullCost.energy;
 446 }
 447
 448 void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, Cost& outCost)
 449 {
 450     uint32_t fullDepth = mode.cu.m_cuDepth[0] + trDepth;
 451     uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
 452     uint32_t tuSize = 1 << log2TrSize;
 453
 454     X265_CHECK(tuSize == MAX_TS_SIZE, "transform skip is only possible at 4x4 TUs\n");
 455
 456     CUData& cu = mode.cu;
 457     Yuv* predYuv = &mode.predYuv;
 458     const Yuv* fencYuv = mode.fencYuv;
 459
 460     Cost fullCost;
 461     fullCost.rdcost = MAX_INT64;
 462     int      bTSkip = 0;
 463     uint32_t bCBF = 0;
 464
 465     pixel*   fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
 466     pixel*   pred = predYuv->getLumaAddr(absPartIdx);
 467     int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
 468     uint32_t stride = fencYuv->m_size;
 469     int      sizeIdx = log2TrSize - 2;
 470
 471     // init availability pattern
 472     uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
 473     initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode);
 474
 475     // get prediction signal
 476     predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
 477
 478     cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
 479
 480     uint32_t qtLayer = log2TrSize - 2;
 481     uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
 482     coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
 483     pixel*   reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
 484     uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;
 485
 486     // store original entropy coding status
 487     m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
 488
 489     if (m_bEnableRDOQ)
 490         m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
 491
 492     ALIGN_VAR_32(coeff_t, tsCoeffY[MAX_TS_SIZE * MAX_TS_SIZE]);
 493     ALIGN_VAR_32(pixel,   tsReconY[MAX_TS_SIZE * MAX_TS_SIZE]);
 494
 495     int checkTransformSkip = 1;
 496     for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
 497     {
 498         uint64_t tmpCost;
 499         uint32_t tmpEnergy = 0;
 500
 501         coeff_t* coeff = (useTSkip ? tsCoeffY : coeffY);
 502         pixel*   tmpRecon = (useTSkip ? tsReconY : reconQt);
 503         uint32_t tmpReconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
 504
 505         primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
 506
 507         uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip);
 508         if (numSig)
 509         {
 510             m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);
 511             primitives.luma_add_ps[sizeIdx](tmpRecon, tmpReconStride, pred, residual, stride, stride);
 512         }
 513         else if (useTSkip)
 514         {
 515             /* do not allow tskip if CBF=0, pretend we did not try tskip */
 516             checkTransformSkip = 0;
 517             break;
 518         }
 519         else
 520             // no residual coded, recon = pred
 521             primitives.square_copy_pp[sizeIdx](tmpRecon, tmpReconStride, pred, stride);
 522
 523         uint32_t tmpDist = primitives.sse_pp[sizeIdx](tmpRecon, tmpReconStride, fenc, stride);
 524
 525         cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
 526         cu.setCbfSubParts((!!numSig) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
 527
 528         if (useTSkip)
 529             m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
 530
 531         m_entropyCoder.resetBits();
 532         if (!absPartIdx)
 533         {
 534             if (!cu.m_slice->isIntra())
 535             {
 536                 if (cu.m_slice->m_pps->bTransquantBypassEnabled)
 537                     m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
 538                 m_entropyCoder.codeSkipFlag(cu, 0);
 539                 m_entropyCoder.codePredMode(cu.m_predMode[0]);
 540             }
 541
 542             m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]);
 543         }
 544         if (cu.m_partSize[0] == SIZE_2Nx2N)
 545         {
 546             if (!absPartIdx)
 547                 m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
 548         }
 549         else
 550         {
 551             uint32_t qtNumParts = cuGeom.numPartitions >> 2;
 552             if (!trDepth)
 553             {
 554                 for (uint32_t part = 0; part < 4; part++)
 555                     m_entropyCoder.codeIntraDirLumaAng(cu, part * qtNumParts, false);
 556             }
 557             else if (!(absPartIdx & (qtNumParts - 1)))
 558                 m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
 559         }
 560         m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
 561
 562         m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]);
 563
 564         if (cu.getCbf(absPartIdx, TEXT_LUMA, trDepth))
 565             m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize, TEXT_LUMA);
 566
 567         uint32_t tmpBits = m_entropyCoder.getNumberOfWrittenBits();
 568
 569         if (!useTSkip)
 570             m_entropyCoder.store(m_rqt[fullDepth].rqtTemp);
 571
 572         if (m_rdCost.m_psyRd)
 573         {
 574             tmpEnergy = m_rdCost.psyCost(sizeIdx, fenc, fencYuv->m_size, tmpRecon, tmpReconStride);
 575             tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
 576         }
 577         else
 578             tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);
 579
 580         if (tmpCost < fullCost.rdcost)
 581         {
 582             bTSkip = useTSkip;
 583             bCBF = !!numSig;
 584             fullCost.rdcost = tmpCost;
 585             fullCost.distortion = tmpDist;
 586             fullCost.bits = tmpBits;
 587             fullCost.energy = tmpEnergy;
 588         }
 589     }
 590
 591     if (bTSkip)
 592     {
 593         memcpy(coeffY, tsCoeffY, sizeof(coeff_t) << (log2TrSize * 2));
 594         primitives.square_copy_pp[sizeIdx](reconQt, reconQtStride, tsReconY, tuSize);
 595     }
 596     else if (checkTransformSkip)
 597     {
 598         cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
 599         cu.setCbfSubParts(bCBF << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
 600         m_entropyCoder.load(m_rqt[fullDepth].rqtTemp);
 601     }
 602
 603     // set reconstruction for next intra prediction blocks
 604     pixel*   picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
 605     intptr_t picStride = m_frame->m_reconPicYuv->m_stride;
 606     primitives.square_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);
 607
 608     outCost.rdcost += fullCost.rdcost;
 609     outCost.distortion += fullCost.distortion;
 610     outCost.bits += fullCost.bits;
 611     outCost.energy += fullCost.energy;
 612 }
 613
 614 /* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */
 615 void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t depthRange[2])
 616 {
 617     CUData& cu = mode.cu;
 618
 619     uint32_t fullDepth   = cu.m_cuDepth[0] + trDepth;
 620     uint32_t log2TrSize  = g_maxLog2CUSize - fullDepth;
 621     bool     bCheckFull  = log2TrSize <= depthRange[1];
 622
 623     X265_CHECK(m_slice->m_sliceType != I_SLICE, "residualTransformQuantIntra not intended for I slices\n");
 624
 625     /* we still respect rdPenalty == 2, we can forbid 32x32 intra TU. rdPenalty = 1 is impossible
 626      * since we are not measuring RD cost */
 627     if (m_param->rdPenalty == 2 && log2TrSize == 5 && depthRange[0] <= 4)
 628         bCheckFull = false;
 629
 630     if (bCheckFull)
 631     {
 632         pixel*   fenc      = const_cast<pixel*>(mode.fencYuv->getLumaAddr(absPartIdx));
 633         pixel*   pred      = mode.predYuv.getLumaAddr(absPartIdx);
 634         int16_t* residual  = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
 635         pixel*   picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
 636         intptr_t picStride = m_frame->m_reconPicYuv->m_stride;
 637         uint32_t stride    = mode.fencYuv->m_size;
 638         uint32_t sizeIdx   = log2TrSize - 2;
 639         uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
 640         uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
 641         coeff_t* coeff        = cu.m_trCoeff[TEXT_LUMA] + coeffOffsetY;
 642
 643         initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode);
 644         predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
 645
 646         X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n");
 647         cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
 648
 649         primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
 650         uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, false);
 651         if (numSig)
 652         {
 653             m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], residual, stride, coeff, log2TrSize, TEXT_LUMA, true, false, numSig);
 654             primitives.luma_add_ps[sizeIdx](picReconY, picStride, pred, residual, stride, stride);
 655             cu.setCbfSubParts(1 << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
 656         }
 657         else
 658         {
 659             primitives.square_copy_pp[sizeIdx](picReconY, picStride, pred, stride);
 660             cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
 661         }
 662     }
 663     else
 664     {
 665         X265_CHECK(log2TrSize > depthRange[0], "intra luma split state failure\n");
 666
 667         /* code split block */
 668         uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
 669         uint32_t cbf = 0;
 670         for (uint32_t subPartIdx = 0, absPartIdxSub = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv)
 671         {
 672             residualTransformQuantIntra(mode, cuGeom, trDepth + 1, absPartIdxSub, depthRange);
 673             cbf |= cu.getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1);
 674         }
 675         for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
 676             cu.m_cbf[TEXT_LUMA][absPartIdx + offs] |= (cbf << trDepth);
 677     }
 678 }
 679
 680 void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t trDepth, uint32_t absPartIdx)
 681 {
 682     uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
 683     uint32_t tuDepth   = cu.m_tuDepth[absPartIdx];
 684
 685     if (tuDepth == trDepth)
 686     {
 687         uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
 688         uint32_t qtLayer    = log2TrSize - 2;
 689
 690         // copy transform coefficients
 691         uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
 692         coeff_t* coeffSrcY    = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
 693         coeff_t* coeffDestY   = cu.m_trCoeff[0]            + coeffOffsetY;
 694         memcpy(coeffDestY, coeffSrcY, sizeof(coeff_t) << (log2TrSize * 2));
 695
 696         // copy reconstruction
 697         m_rqt[qtLayer].reconQtYuv.copyPartToPartLuma(reconYuv, absPartIdx, log2TrSize);
 698     }
 699     else
 700     {
 701         uint32_t numQPart = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
 702         for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
 703             extractIntraResultQT(cu, reconYuv, trDepth + 1, absPartIdx + subPartIdx * numQPart);
 704     }
 705 }
 706
 707 /* 4:2:2 post-TU split processing */
 708 void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t trDepth, uint32_t absPartIdx)
 709 {
 710     uint32_t depth = cu.m_cuDepth[0];
 711     uint32_t fullDepth = depth + trDepth;
 712     uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
 713
 714     uint32_t trDepthC = trDepth;
 715     if (log2TrSize == 2)
 716     {
 717         X265_CHECK(m_csp != X265_CSP_I444 && trDepthC, "trDepthC invalid\n");
 718         trDepthC--;
 719     }
 720
 721     uint32_t partIdxesPerSubTU = (NUM_CU_PARTITIONS >> ((depth + trDepthC) << 1)) >> 1;
 722
 723     // move the CBFs down a level and set the parent CBF
 724     uint8_t subTUCBF[2];
 725     uint8_t combinedSubTUCBF = 0;
 726
 727     for (uint32_t subTU = 0; subTU < 2; subTU++)
 728     {
 729         const uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU);
 730
 731         subTUCBF[subTU]   = cu.getCbf(subTUAbsPartIdx, ttype, trDepth);
 732         combinedSubTUCBF |= subTUCBF[subTU];
 733     }
 734
 735     for (uint32_t subTU = 0; subTU < 2; subTU++)
 736     {
 737         const uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU);
 738         const uint8_t compositeCBF = (subTUCBF[subTU] << 1) | combinedSubTUCBF;
 739
 740         cu.setCbfPartRange((compositeCBF << trDepth), ttype, subTUAbsPartIdx, partIdxesPerSubTU);
 741     }
 742 }
 743
 744 /* returns distortion */
 745 uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t& psyEnergy)
 746 {
 747     CUData& cu = mode.cu;
 748     uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
 749     uint32_t tuDepthL  = cu.m_tuDepth[absPartIdx];
 750
 751     if (tuDepthL > trDepth)
 752     {
 753         uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
 754         uint32_t outDist = 0, splitCbfU = 0, splitCbfV = 0;
 755         for (uint32_t subPartIdx = 0, absPartIdxSub = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv)
 756         {
 757             outDist += codeIntraChromaQt(mode, cuGeom, trDepth + 1, absPartIdxSub, psyEnergy);
 758             splitCbfU |= cu.getCbf(absPartIdxSub, TEXT_CHROMA_U, trDepth + 1);
 759             splitCbfV |= cu.getCbf(absPartIdxSub, TEXT_CHROMA_V, trDepth + 1);
 760         }
 761         for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
 762         {
 763             cu.m_cbf[TEXT_CHROMA_U][absPartIdx + offs] |= (splitCbfU << trDepth);
 764             cu.m_cbf[TEXT_CHROMA_V][absPartIdx + offs] |= (splitCbfV << trDepth);
 765         }
 766
 767         return outDist;
 768     }
 769
 770     uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
 771     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
 772
 773     uint32_t trDepthC = trDepth;
 774     if (log2TrSizeC == 1)
 775     {
 776         X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth, "invalid trDepth\n");
 777         trDepthC--;
 778         log2TrSizeC++;
 779         uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
 780         bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
 781         if (!bFirstQ)
 782             return 0;
 783     }
 784
 785     if (m_bEnableRDOQ)
 786         m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
 787
 788     bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
 789     checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]);
 790     if (checkTransformSkip)
 791         return codeIntraChromaTSkip(mode, cuGeom, trDepth, trDepthC, absPartIdx, psyEnergy);
 792
 793     uint32_t qtLayer = log2TrSize - 2;
 794     uint32_t tuSize = 1 << log2TrSizeC;
 795     uint32_t outDist = 0;
 796
 797     uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
 798     const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
 799
 800     for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
 801     {
 802         TextType ttype = (TextType)chromaId;
 803
 804         TURecurse tuIterator(splitType, curPartNum, absPartIdx);
 805         do
 806         {
 807             uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
 808
 809             pixel*   fenc     = const_cast<Yuv*>(mode.fencYuv)->getChromaAddr(chromaId, absPartIdxC);
 810             pixel*   pred     = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
 811             int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
 812             uint32_t stride   = mode.fencYuv->m_csize;
 813             uint32_t sizeIdxC = log2TrSizeC - 2;
 814
 815             uint32_t coeffOffsetC  = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
 816             coeff_t* coeffC        = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
 817             pixel*   reconQt       = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
 818             uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
 819
 820             pixel*   picReconC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
 821             intptr_t picStride = m_frame->m_reconPicYuv->m_strideC;
 822
 823             // init availability pattern
 824             initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
 825             pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
 826
 827             uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
 828             if (chromaPredMode == DM_CHROMA_IDX)
 829                 chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
 830             if (m_csp == X265_CSP_I422)
 831                 chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
 832
 833             // get prediction signal
 834             predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
 835
 836             cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
 837
 838             primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
 839             uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
 840             uint32_t tmpDist;
 841             if (numSig)
 842             {
 843                 m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
 844                 primitives.luma_add_ps[sizeIdxC](reconQt, reconQtStride, pred, residual, stride, stride);
 845                 cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
 846             }
 847             else
 848             {
 849                 // no coded residual, recon = pred
 850                 primitives.square_copy_pp[sizeIdxC](reconQt, reconQtStride, pred, stride);
 851                 cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
 852             }
 853
 854             tmpDist = primitives.sse_pp[sizeIdxC](reconQt, reconQtStride, fenc, stride);
 855             outDist += (ttype == TEXT_CHROMA_U) ? m_rdCost.scaleChromaDistCb(tmpDist) : m_rdCost.scaleChromaDistCr(tmpDist);
 856
 857             if (m_rdCost.m_psyRd)
 858                 psyEnergy += m_rdCost.psyCost(sizeIdxC, fenc, stride, picReconC, picStride);
 859
 860             primitives.square_copy_pp[sizeIdxC](picReconC, picStride, reconQt, reconQtStride);
 861         }
 862         while (tuIterator.isNextSection());
 863
 864         if (splitType == VERTICAL_SPLIT)
 865             offsetSubTUCBFs(cu, ttype, trDepth, absPartIdx);
 866     }
 867
 868     return outDist;
 869 }
 870
 871 /* returns distortion */
 872 uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t trDepthC, uint32_t absPartIdx, uint32_t& psyEnergy)
 873 {
 874     CUData& cu = mode.cu;
 875     uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
 876     uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
 877     uint32_t log2TrSizeC = 2;
 878     uint32_t tuSize = 4;
 879     uint32_t qtLayer = log2TrSize - 2;
 880     uint32_t outDist = 0;
 881
 882     /* At the TU layers above this one, no RDO is performed, only distortion is being measured,
 883      * so the entropy coder is not very accurate. The best we can do is return it in the same
 884      * condition as it arrived, and to do all bit estimates from the same state. */
 885     m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
 886
 887     ALIGN_VAR_32(coeff_t, tskipCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
 888     ALIGN_VAR_32(pixel,   tskipReconC[MAX_TS_SIZE * MAX_TS_SIZE]);
 889
 890     uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
 891     const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
 892
 893     for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
 894     {
 895         TextType ttype = (TextType)chromaId;
 896
 897         TURecurse tuIterator(splitType, curPartNum, absPartIdx);
 898         do
 899         {
 900             uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
 901
 902             pixel*   fenc = const_cast<Yuv*>(mode.fencYuv)->getChromaAddr(chromaId, absPartIdxC);
 903             pixel*   pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
 904             int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
 905             uint32_t stride = mode.fencYuv->m_csize;
 906             uint32_t sizeIdxC = log2TrSizeC - 2;
 907
 908             uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
 909             coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
 910             pixel*   reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
 911             uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
 912
 913             // init availability pattern
 914             initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
 915             pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
 916
 917             uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
 918             if (chromaPredMode == DM_CHROMA_IDX)
 919                 chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
 920             if (m_csp == X265_CSP_I422)
 921                 chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
 922
 923             // get prediction signal
 924             predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
 925
 926             uint64_t bCost = MAX_INT64;
 927             uint32_t bDist = 0;
 928             uint32_t bCbf = 0;
 929             uint32_t bEnergy = 0;
 930             int      bTSkip = 0;
 931
 932             int checkTransformSkip = 1;
 933             for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
 934             {
 935                 coeff_t* coeff = (useTSkip ? tskipCoeffC : coeffC);
 936                 pixel*   recon = (useTSkip ? tskipReconC : reconQt);
 937                 uint32_t reconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
 938
 939                 primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
 940
 941                 uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip);
 942                 if (numSig)
 943                 {
 944                     m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
 945                     primitives.luma_add_ps[sizeIdxC](recon, reconStride, pred, residual, stride, stride);
 946                     cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
 947                 }
 948                 else if (useTSkip)
 949                 {
 950                     checkTransformSkip = 0;
 951                     break;
 952                 }
 953                 else
 954                 {
 955                     primitives.square_copy_pp[sizeIdxC](recon, reconStride, pred, stride);
 956                     cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
 957                 }
 958                 uint32_t tmpDist = primitives.sse_pp[sizeIdxC](recon, reconStride, fenc, stride);
 959                 tmpDist = (ttype == TEXT_CHROMA_U) ? m_rdCost.scaleChromaDistCb(tmpDist) : m_rdCost.scaleChromaDistCr(tmpDist);
 960
 961                 cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
 962
 963                 uint32_t tmpBits = 0, tmpEnergy = 0;
 964                 if (numSig)
 965                 {
 966                     m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
 967                     m_entropyCoder.resetBits();
 968                     m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
 969                     tmpBits = m_entropyCoder.getNumberOfWrittenBits();
 970                 }
 971
 972                 uint64_t tmpCost;
 973                 if (m_rdCost.m_psyRd)
 974                 {
 975                     tmpEnergy = m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
 976                     tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
 977                 }
 978                 else
 979                     tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);
 980
 981                 if (tmpCost < bCost)
 982                 {
 983                     bCost = tmpCost;
 984                     bDist = tmpDist;
 985                     bTSkip = useTSkip;
 986                     bCbf = !!numSig;
 987                     bEnergy = tmpEnergy;
 988                 }
 989             }
 990
 991             if (bTSkip)
 992             {
 993                 memcpy(coeffC, tskipCoeffC, sizeof(coeff_t) << (log2TrSizeC * 2));
 994                 primitives.square_copy_pp[sizeIdxC](reconQt, reconQtStride, tskipReconC, MAX_TS_SIZE);
 995             }
 996
 997             cu.setCbfPartRange(bCbf << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
 998             cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
 999
1000             pixel*   reconPicC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
1001             intptr_t picStride = m_frame->m_reconPicYuv->m_strideC;
1002             primitives.square_copy_pp[sizeIdxC](reconPicC, picStride, reconQt, reconQtStride);
1003
1004             outDist += bDist;
1005             psyEnergy += bEnergy;
1006         }
1007         while (tuIterator.isNextSection());
1008
1009         if (splitType == VERTICAL_SPLIT)
1010             offsetSubTUCBFs(cu, ttype, trDepth, absPartIdx);
1011     }
1012
1013     m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
1014     return outDist;
1015 }
1016
1017 void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t trDepth, bool tuQuad)
1018 {
1019     uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
1020     uint32_t tuDepthL  = cu.m_tuDepth[absPartIdx];
1021
1022     if (tuDepthL == trDepth)
1023     {
1024         uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
1025         uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
1026
1027         if (tuQuad)
1028         {
1029             log2TrSizeC++; /* extract one 4x4 instead of 4 2x2 */
1030             trDepth--;     /* also adjust the number of coeff read */
1031         }
1032
1033         // copy transform coefficients
1034         uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
1035         uint32_t coeffOffsetC = absPartIdx << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
1036
1037         uint32_t qtLayer   = log2TrSize - 2;
1038         coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
1039         coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
1040         coeff_t* coeffDstU = cu.m_trCoeff[1]           + coeffOffsetC;
1041         coeff_t* coeffDstV = cu.m_trCoeff[2]           + coeffOffsetC;
1042         memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
1043         memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);
1044
1045         // copy reconstruction
1046         m_rqt[qtLayer].reconQtYuv.copyPartToPartChroma(reconYuv, absPartIdx, log2TrSizeC + m_hChromaShift);
1047     }
1048     else
1049     {
1050         if (g_maxLog2CUSize - fullDepth - 1 == 2 && m_csp != X265_CSP_I444)
1051             /* no such thing as chroma 2x2, so extract one 4x4 instead of 4 2x2 */
1052             extractIntraResultChromaQT(cu, reconYuv, absPartIdx, trDepth + 1, true);
1053         else
1054         {
1055             uint32_t numQPart = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
1056             for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
1057                 extractIntraResultChromaQT(cu, reconYuv, absPartIdx + subPartIdx * numQPart, trDepth + 1, false);
1058         }
1059     }
1060 }
1061
1062 void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx)
1063 {
1064     CUData& cu = mode.cu;
1065     uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
1066     uint32_t tuDepthL  = cu.m_tuDepth[absPartIdx];
1067
1068     if (tuDepthL == trDepth)
1069     {
1070         uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
1071         uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
1072         uint32_t trDepthC = trDepth;
1073         if (log2TrSizeC == 1)
1074         {
1075             X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth > 0, "invalid trDepth\n");
1076             trDepthC--;
1077             log2TrSizeC++;
1078             uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
1079             bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
1080             if (!bFirstQ)
1081                 return;
1082         }
1083
1084         ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
1085         uint32_t tuSize = 1 << log2TrSizeC;
1086         uint32_t stride = mode.fencYuv->m_csize;
1087         const int sizeIdxC = log2TrSizeC - 2;
1088
1089         uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
1090         const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
1091
1092         for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
1093         {
1094             TextType ttype = (TextType)chromaId;
1095
1096             TURecurse tuIterator(splitType, curPartNum, absPartIdx);
1097             do
1098             {
1099                 uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
1100
1101                 pixel*   fenc         = const_cast<pixel*>(mode.fencYuv->getChromaAddr(chromaId, absPartIdxC));
1102                 pixel*   pred         = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
1103                 int16_t* residual     = resiYuv.getChromaAddr(chromaId, absPartIdxC);
1104                 pixel*   recon        = mode.reconYuv.getChromaAddr(chromaId, absPartIdxC); // TODO: needed?
1105                 uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
1106                 coeff_t* coeff        = cu.m_trCoeff[ttype] + coeffOffsetC;
1107                 pixel*   picReconC    = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
1108                 uint32_t picStride    = m_frame->m_reconPicYuv->m_strideC;
1109
1110                 uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
1111                 if (chromaPredMode == DM_CHROMA_IDX)
1112                     chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
1113                 chromaPredMode = (m_csp == X265_CSP_I422) ? g_chroma422IntraAngleMappingTable[chromaPredMode] : chromaPredMode;
1114                 initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
1115                 pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
1116
1117                 predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
1118
1119                 X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");
1120
1121                 primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
1122                 uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, false);
1123                 if (numSig)
1124                 {
1125                     m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], residual, stride, coeff, log2TrSizeC, ttype, true, false, numSig);
1126                     primitives.luma_add_ps[sizeIdxC](recon, stride, pred, residual, stride, stride);
1127                     primitives.square_copy_pp[sizeIdxC](picReconC, picStride, recon, stride);
1128                     cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1129                 }
1130                 else
1131                 {
1132                     primitives.square_copy_pp[sizeIdxC](recon, stride, pred, stride);
1133                     primitives.square_copy_pp[sizeIdxC](picReconC, picStride, pred, stride);
1134                     cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1135                 }
1136             }
1137             while (tuIterator.isNextSection());
1138
1139             if (splitType == VERTICAL_SPLIT)
1140                 offsetSubTUCBFs(cu, (TextType)chromaId, trDepth, absPartIdx);
1141         }
1142     }
1143     else
1144     {
1145         uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
1146         uint32_t splitCbfU = 0, splitCbfV = 0;
1147         for (uint32_t subPartIdx = 0, absPartIdxC = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxC += qPartsDiv)
1148         {
1149             residualQTIntraChroma(mode, cuGeom, trDepth + 1, absPartIdxC);
1150             splitCbfU |= cu.getCbf(absPartIdxC, TEXT_CHROMA_U, trDepth + 1);
1151             splitCbfV |= cu.getCbf(absPartIdxC, TEXT_CHROMA_V, trDepth + 1);
1152         }
1153         for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
1154         {
1155             cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << trDepth);
1156             cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << trDepth);
1157         }
1158     }
1159 }
1160
1161 void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes)
1162 {
1163     uint32_t depth = cuGeom.depth;
1164     CUData& cu = intraMode.cu;
1165
1166     cu.setPartSizeSubParts(partSize);
1167     cu.setPredModeSubParts(MODE_INTRA);
1168
1169     uint32_t tuDepthRange[2];
1170     cu.getIntraTUQtDepthRange(tuDepthRange, 0);
1171
1172     intraMode.initCosts();
1173     intraMode.distortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange, sharedModes);
1174     intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom);
1175
1176     m_entropyCoder.resetBits();
1177     if (m_slice->m_pps->bTransquantBypassEnabled)
1178         m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
1179
1180     if (!m_slice->isIntra())
1181     {
1182         m_entropyCoder.codeSkipFlag(cu, 0);
1183         m_entropyCoder.codePredMode(cu.m_predMode[0]);
1184     }
1185
1186     m_entropyCoder.codePartSize(cu, 0, depth);
1187     m_entropyCoder.codePredInfo(cu, 0);
1188     intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits();
1189
1190     bool bCodeDQP = m_slice->m_pps->bUseDQP;
1191     m_entropyCoder.codeCoeff(cu, 0, depth, bCodeDQP, tuDepthRange);
1192     m_entropyCoder.store(intraMode.contexts);
1193     intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
1194     intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
1195     if (m_rdCost.m_psyRd)
1196         intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, intraMode.fencYuv->m_buf[0], intraMode.fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size);
1197
1198     updateModeCost(intraMode);
1199 }
1200
1201 uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t depthRange[2], uint8_t* sharedModes)
1202 {
1203     CUData& cu = intraMode.cu;
1204     Yuv* reconYuv = &intraMode.reconYuv;
1205     Yuv* predYuv = &intraMode.predYuv;
1206     const Yuv* fencYuv = intraMode.fencYuv;
1207
1208     uint32_t depth        = cu.m_cuDepth[0];
1209     uint32_t initTrDepth  = cu.m_partSize[0] == SIZE_2Nx2N ? 0 : 1;
1210     uint32_t numPU        = 1 << (2 * initTrDepth);
1211     uint32_t log2TrSize   = cu.m_log2CUSize[0] - initTrDepth;
1212     uint32_t tuSize       = 1 << log2TrSize;
1213     uint32_t qNumParts    = cuGeom.numPartitions >> 2;
1214     uint32_t sizeIdx      = log2TrSize - 2;
1215     uint32_t absPartIdx   = 0;
1216     uint32_t totalDistortion = 0;
1217
1218     int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[absPartIdx] == SIZE_NxN;
1219
1220     // loop over partitions
1221     for (uint32_t pu = 0; pu < numPU; pu++, absPartIdx += qNumParts)
1222     {
1223         uint32_t bmode = 0;
1224
1225         if (sharedModes)
1226             bmode = sharedModes[pu];
1227         else
1228         {
1229             // Reference sample smoothing
1230             initAdiPattern(cu, cuGeom, absPartIdx, initTrDepth, ALL_IDX);
1231
1232             // determine set of modes to be tested (using prediction signal only)
1233             pixel*   fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
1234             uint32_t stride = predYuv->m_size;
1235
1236             pixel *above = m_refAbove + tuSize - 1;
1237             pixel *aboveFiltered = m_refAboveFlt + tuSize - 1;
1238             pixel *left = m_refLeft + tuSize - 1;
1239             pixel *leftFiltered = m_refLeftFlt + tuSize - 1;
1240
1241             // 33 Angle modes once
1242             ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
1243             ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]);
1244             ALIGN_VAR_32(pixel, bufScale[32 * 32]);
1245             pixel _above[4 * 32 + 1];
1246             pixel _left[4 * 32 + 1];
1247             int scaleTuSize = tuSize;
1248             int scaleStride = stride;
1249             int costShift = 0;
1250
1251             if (tuSize > 32)
1252             {
1253                 pixel *aboveScale = _above + 2 * 32;
1254                 pixel *leftScale = _left + 2 * 32;
1255
1256                 // origin is 64x64, we scale to 32x32 and setup required parameters
1257                 primitives.scale2D_64to32(bufScale, fenc, stride);
1258                 fenc = bufScale;
1259
1260                 // reserve space in case primitives need to store data in above
1261                 // or left buffers
1262                 aboveScale[0] = leftScale[0] = above[0];
1263                 primitives.scale1D_128to64(aboveScale + 1, above + 1, 0);
1264                 primitives.scale1D_128to64(leftScale + 1, left + 1, 0);
1265
1266                 scaleTuSize = 32;
1267                 scaleStride = 32;
1268                 costShift = 2;
1269                 sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
1270
1271                 // Filtered and Unfiltered refAbove and refLeft pointing to above and left.
1272                 above = aboveScale;
1273                 left = leftScale;
1274                 aboveFiltered = aboveScale;
1275                 leftFiltered = leftScale;
1276             }
1277
1278             m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
1279
1280             /* there are three cost tiers for intra modes:
1281              *  pred[0]          - mode probable, least cost
1282              *  pred[1], pred[2] - less probable, slightly more cost
1283              *  non-mpm modes    - all cost the same (rbits) */
1284             uint64_t mpms;
1285             uint32_t preds[3];
1286             uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms);
1287
1288             pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
1289             uint64_t modeCosts[35];
1290             uint64_t bcost;
1291
1292             // DC
1293             primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
1294             uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(preds, DC_IDX) : rbits;
1295             uint32_t sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
1296             modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
1297
1298             // PLANAR
1299             pixel *abovePlanar = above;
1300             pixel *leftPlanar = left;
1301             if (tuSize >= 8 && tuSize <= 32)
1302             {
1303                 abovePlanar = aboveFiltered;
1304                 leftPlanar = leftFiltered;
1305             }
1306             primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
1307             bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(preds, PLANAR_IDX) : rbits;
1308             sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
1309             modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
1310             COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
1311
1312             // angular predictions
1313             primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
1314
1315             primitives.transpose[sizeIdx](buf_trans, fenc, scaleStride);
1316             for (int mode = 2; mode < 35; mode++)
1317             {
1318                 bool modeHor = (mode < 18);
1319                 pixel *cmp = (modeHor ? buf_trans : fenc);
1320                 intptr_t srcStride = (modeHor ? scaleTuSize : scaleStride);
1321                 bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
1322                 sad = sa8d(cmp, srcStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
1323                 modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
1324                 COPY1_IF_LT(bcost, modeCosts[mode]);
1325             }
1326
1327             /* Find the top maxCandCount candidate modes with cost within 25% of best
1328              * or among the most probable modes. maxCandCount is derived from the
1329              * rdLevel and depth. In general we want to try more modes at slower RD
1330              * levels and at higher depths */
1331             uint64_t candCostList[MAX_RD_INTRA_MODES];
1332             uint32_t rdModeList[MAX_RD_INTRA_MODES];
1333             int maxCandCount = 2 + m_param->rdLevel + ((depth + initTrDepth) >> 1);
1334             for (int i = 0; i < maxCandCount; i++)
1335                 candCostList[i] = MAX_INT64;
1336
1337             uint64_t paddedBcost = bcost + (bcost >> 3); // 1.12%
1338             for (int mode = 0; mode < 35; mode++)
1339                 if (modeCosts[mode] < paddedBcost || (mpms & ((uint64_t)1 << mode)))
1340                     updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList);
1341
1342             /* measure best candidates using simple RDO (no TU splits) */
1343             bcost = MAX_INT64;
1344             for (int i = 0; i < maxCandCount; i++)
1345             {
1346                 if (candCostList[i] == MAX_INT64)
1347                     break;
1348                 m_entropyCoder.load(m_rqt[depth].cur);
1349                 cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTrDepth);
1350
1351                 Cost icosts;
1352                 if (checkTransformSkip)
1353                     codeIntraLumaTSkip(intraMode, cuGeom, initTrDepth, absPartIdx, icosts);
1354                 else
1355                     codeIntraLumaQT(intraMode, cuGeom, initTrDepth, absPartIdx, false, icosts, depthRange);
1356                 COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]);
1357             }
1358         }
1359
1360         /* remeasure best mode, allowing TU splits */
1361         cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTrDepth);
1362         m_entropyCoder.load(m_rqt[depth].cur);
1363
1364         Cost icosts;
1365         if (checkTransformSkip)
1366             codeIntraLumaTSkip(intraMode, cuGeom, initTrDepth, absPartIdx, icosts);
1367         else
1368             codeIntraLumaQT(intraMode, cuGeom, initTrDepth, absPartIdx, true, icosts, depthRange);
1369         totalDistortion += icosts.distortion;
1370
1371         extractIntraResultQT(cu, *reconYuv, initTrDepth, absPartIdx);
1372
1373         // set reconstruction for next intra prediction blocks
1374         if (pu != numPU - 1)
1375         {
1376             /* This has important implications for parallelism and RDO.  It is writing intermediate results into the
1377              * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
1378              * it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think
1379              * that the contexts should be tracked through each PU */
1380             pixel*   dst         = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
1381             uint32_t dststride   = m_frame->m_reconPicYuv->m_stride;
1382             pixel*   src         = reconYuv->getLumaAddr(absPartIdx);
1383             uint32_t srcstride   = reconYuv->m_size;
1384             primitives.square_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride);
1385         }
1386     }
1387
1388     if (numPU > 1)
1389     {
1390         uint32_t combCbfY = 0;
1391         uint32_t partIdx  = 0;
1392         for (uint32_t part = 0; part < 4; part++, partIdx += qNumParts)
1393             combCbfY |= cu.getCbf(partIdx, TEXT_LUMA, 1);
1394
1395         for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
1396             cu.m_cbf[0][offs] |= combCbfY;
1397     }
1398
1399     // TODO: remove this
1400     m_entropyCoder.load(m_rqt[depth].cur);
1401
1402     return totalDistortion;
1403 }
1404
1405 void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom)
1406 {
1407     CUData& cu = intraMode.cu;
1408     const Yuv* fencYuv = intraMode.fencYuv;
1409     Yuv* predYuv = &intraMode.predYuv;
1410
1411     uint32_t bestMode  = 0;
1412     uint64_t bestCost  = MAX_INT64;
1413     uint32_t modeList[NUM_CHROMA_MODE];
1414
1415     uint32_t log2TrSizeC = cu.m_log2CUSize[0] - m_hChromaShift;
1416     uint32_t tuSize = 1 << log2TrSizeC;
1417     int32_t scaleTuSize = tuSize;
1418     int32_t costShift = 0;
1419
1420     if (tuSize > 32)
1421     {
1422         scaleTuSize = 32;
1423         costShift = 2;
1424         log2TrSizeC = 5;
1425     }
1426
1427     Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 1);
1428     Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 2);
1429     cu.getAllowedChromaDir(0, modeList);
1430
1431     // check chroma modes
1432     for (uint32_t mode = 0; mode < NUM_CHROMA_MODE; mode++)
1433     {
1434         uint32_t chromaPredMode = modeList[mode];
1435         if (chromaPredMode == DM_CHROMA_IDX)
1436             chromaPredMode = cu.m_lumaIntraDir[0];
1437         if (m_csp == X265_CSP_I422)
1438             chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
1439
1440         uint64_t cost = 0;
1441         for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
1442         {
1443             pixel* fenc = fencYuv->m_buf[chromaId];
1444             pixel* pred = predYuv->m_buf[chromaId];
1445             pixel* chromaPred = getAdiChromaBuf(chromaId, scaleTuSize);
1446
1447             // get prediction signal
1448             predIntraChromaAng(chromaPred, chromaPredMode, pred, fencYuv->m_csize, log2TrSizeC, m_csp);
1449             cost += primitives.sa8d[log2TrSizeC - 2](fenc, predYuv->m_csize, pred, fencYuv->m_csize) << costShift;
1450         }
1451
1452         if (cost < bestCost)
1453         {
1454             bestCost = cost;
1455             bestMode = modeList[mode];
1456         }
1457     }
1458
1459     cu.setChromIntraDirSubParts(bestMode, 0, cu.m_cuDepth[0]);
1460 }
1461
1462 uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
1463 {
1464     CUData& cu = intraMode.cu;
1465     Yuv& reconYuv = intraMode.reconYuv;
1466
1467     uint32_t depth       = cu.m_cuDepth[0];
1468     uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN && m_csp == X265_CSP_I444;
1469     uint32_t log2TrSize  = cu.m_log2CUSize[0] - initTrDepth;
1470     uint32_t absPartStep = (NUM_CU_PARTITIONS >> (depth << 1));
1471     uint32_t totalDistortion = 0;
1472
1473     int part = partitionFromLog2Size(log2TrSize);
1474
1475     TURecurse tuIterator((initTrDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0);
1476
1477     do
1478     {
1479         uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
1480         int cuSize = 1 << cu.m_log2CUSize[absPartIdxC];
1481
1482         uint32_t bestMode = 0;
1483         uint32_t bestDist = 0;
1484         uint64_t bestCost = MAX_INT64;
1485
1486         // init mode list
1487         uint32_t minMode = 0;
1488         uint32_t maxMode = NUM_CHROMA_MODE;
1489         uint32_t modeList[NUM_CHROMA_MODE];
1490
1491         cu.getAllowedChromaDir(absPartIdxC, modeList);
1492
1493         // check chroma modes
1494         for (uint32_t mode = minMode; mode < maxMode; mode++)
1495         {
1496             // restore context models
1497             m_entropyCoder.load(m_rqt[depth].cur);
1498
1499             cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTrDepth);
1500             uint32_t psyEnergy = 0;
1501             uint32_t dist = codeIntraChromaQt(intraMode, cuGeom, initTrDepth, absPartIdxC, psyEnergy);
1502
1503             if (m_slice->m_pps->bTransformSkipEnabled)
1504                 m_entropyCoder.load(m_rqt[depth].cur);
1505
1506             m_entropyCoder.resetBits();
1507             // chroma prediction mode
1508             if (cu.m_partSize[0] == SIZE_2Nx2N || m_csp != X265_CSP_I444)
1509             {
1510                 if (!absPartIdxC)
1511                     m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);
1512             }
1513             else
1514             {
1515                 uint32_t qtNumParts = cuGeom.numPartitions >> 2;
1516                 if (!(absPartIdxC & (qtNumParts - 1)))
1517                     m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);
1518             }
1519
1520             codeSubdivCbfQTChroma(cu, initTrDepth, absPartIdxC, tuIterator.absPartIdxStep, cuSize, cuSize);
1521             codeCoeffQTChroma(cu, initTrDepth, absPartIdxC, TEXT_CHROMA_U);
1522             codeCoeffQTChroma(cu, initTrDepth, absPartIdxC, TEXT_CHROMA_V);
1523             uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
1524             uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(dist, bits, psyEnergy) : m_rdCost.calcRdCost(dist, bits);
1525
1526             if (cost < bestCost)
1527             {
1528                 bestCost = cost;
1529                 bestDist = dist;
1530                 bestMode = modeList[mode];
1531                 extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTrDepth, false);
1532                 memcpy(m_qtTempCbf[1], cu.m_cbf[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
1533                 memcpy(m_qtTempCbf[2], cu.m_cbf[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
1534                 memcpy(m_qtTempTransformSkipFlag[1], cu.m_transformSkip[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
1535                 memcpy(m_qtTempTransformSkipFlag[2], cu.m_transformSkip[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
1536             }
1537         }
1538
1539         if (!tuIterator.isLastSection())
1540         {
1541             uint32_t zorder    = cuGeom.encodeIdx + absPartIdxC;
1542             uint32_t dststride = m_frame->m_reconPicYuv->m_strideC;
1543             pixel *src, *dst;
1544
1545             dst = m_frame->m_reconPicYuv->getCbAddr(cu.m_cuAddr, zorder);
1546             src = reconYuv.getCbAddr(absPartIdxC);
1547             primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize);
1548
1549             dst = m_frame->m_reconPicYuv->getCrAddr(cu.m_cuAddr, zorder);
1550             src = reconYuv.getCrAddr(absPartIdxC);
1551             primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize);
1552         }
1553
1554         memcpy(cu.m_cbf[1] + absPartIdxC, m_qtTempCbf[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
1555         memcpy(cu.m_cbf[2] + absPartIdxC, m_qtTempCbf[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
1556         memcpy(cu.m_transformSkip[1] + absPartIdxC, m_qtTempTransformSkipFlag[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
1557         memcpy(cu.m_transformSkip[2] + absPartIdxC, m_qtTempTransformSkipFlag[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
1558         cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTrDepth);
1559         totalDistortion += bestDist;
1560     }
1561     while (tuIterator.isNextSection());
1562
1563     if (initTrDepth != 0)
1564     {
1565         uint32_t combCbfU = 0;
1566         uint32_t combCbfV = 0;
1567         uint32_t partIdx  = 0;
1568         for (uint32_t p = 0; p < 4; p++, partIdx += tuIterator.absPartIdxStep)
1569         {
1570             combCbfU |= cu.getCbf(partIdx, TEXT_CHROMA_U, 1);
1571             combCbfV |= cu.getCbf(partIdx, TEXT_CHROMA_V, 1);
1572         }
1573
1574         for (uint32_t offs = 0; offs < 4 * tuIterator.absPartIdxStep; offs++)
1575         {
1576             cu.m_cbf[1][offs] |= combCbfU;
1577             cu.m_cbf[2][offs] |= combCbfV;
1578         }
1579     }
1580
1581     /* TODO: remove this */
1582     m_entropyCoder.load(m_rqt[depth].cur);
1583     return totalDistortion;
1584 }
1585
1586 /* estimation of best merge coding of an inter PU (not a merge CU) */
1587 uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, int puIdx, MergeData& m)
1588 {
1589     X265_CHECK(cu.m_partSize[0] != SIZE_2Nx2N, "merge tested on non-2Nx2N partition\n");
1590
1591     m.maxNumMergeCand = cu.getInterMergeCandidates(m.absPartIdx, puIdx, m.mvFieldNeighbours, m.interDirNeighbours);
1592
1593     if (cu.isBipredRestriction())
1594     {
1595         /* in 8x8 CUs do not allow bidir merge candidates if not 2Nx2N */
1596         for (uint32_t mergeCand = 0; mergeCand < m.maxNumMergeCand; ++mergeCand)
1597         {
1598             if (m.interDirNeighbours[mergeCand] == 3)
1599             {
1600                 m.interDirNeighbours[mergeCand] = 1;
1601                 m.mvFieldNeighbours[mergeCand][1].refIdx = REF_NOT_VALID;
1602             }
1603         }
1604     }
1605
1606     Yuv& tempYuv = m_rqt[cuGeom.depth].tmpPredYuv;
1607
1608     uint32_t outCost = MAX_UINT;
1609     for (uint32_t mergeCand = 0; mergeCand < m.maxNumMergeCand; ++mergeCand)
1610     {
1611         /* Prevent TMVP candidates from using unavailable reference pixels */
1612         if (m_bFrameParallel &&
1613             (m.mvFieldNeighbours[mergeCand][0].mv.y >= (m_param->searchRange + 1) * 4 ||
1614              m.mvFieldNeighbours[mergeCand][1].mv.y >= (m_param->searchRange + 1) * 4))
1615             continue;
1616
1617         cu.m_mv[0][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][0].mv;
1618         cu.m_refIdx[0][m.absPartIdx] = (char)m.mvFieldNeighbours[mergeCand][0].refIdx;
1619         cu.m_mv[1][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][1].mv;
1620         cu.m_refIdx[1][m.absPartIdx] = (char)m.mvFieldNeighbours[mergeCand][1].refIdx;
1621
1622         prepMotionCompensation(cu, cuGeom, puIdx);
1623         motionCompensation(tempYuv, true, false);
1624         uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(m.absPartIdx), tempYuv.m_size);
1625         uint32_t bitsCand = getTUBits(mergeCand, m.maxNumMergeCand);
1626         costCand = costCand + m_rdCost.getCost(bitsCand);
1627         if (costCand < outCost)
1628         {
1629             outCost = costCand;
1630             m.bits = bitsCand;
1631             m.index = mergeCand;
1632         }
1633     }
1634
1635     m.mvField[0] = m.mvFieldNeighbours[m.index][0];
1636     m.mvField[1] = m.mvFieldNeighbours[m.index][1];
1637     m.interDir = m.interDirNeighbours[m.index];
1638
1639     return outCost;
1640 }
1641
1642 /* this function assumes the caller has configured its MotionEstimation engine with the
1643  * correct source plane and source PU, and has called prepMotionCompensation() to set
1644  * m_puAbsPartIdx, m_puWidth, and m_puHeight */
1645 void Search::singleMotionEstimation(Search& master, const CUData& cu, const CUGeom& cuGeom, int part, int list, int ref)
1646 {
1647     uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS;
1648     bits += getTUBits(ref, m_slice->m_numRefIdx[list]);
1649
1650     MV amvpCand[AMVP_NUM_CANDS];
1651     MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
1652     int numMvc = cu.fillMvpCand(part, m_puAbsPartIdx, list, ref, amvpCand, mvc);
1653
1654     uint32_t bestCost = MAX_INT;
1655     int mvpIdx = 0;
1656     int merange = m_param->searchRange;
1657     for (int i = 0; i < AMVP_NUM_CANDS; i++)
1658     {
1659         MV mvCand = amvpCand[i];
1660
1661         // NOTE: skip mvCand if Y is > merange and -FN>1
1662         if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
1663             continue;
1664
1665         cu.clipMv(mvCand);
1666
1667         Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
1668         predInterLumaPixel(tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPicYuv, mvCand);
1669         uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
1670
1671         if (bestCost > cost)
1672         {
1673             bestCost = cost;
1674             mvpIdx = i;
1675         }
1676     }
1677
1678     MV mvmin, mvmax, outmv, mvp = amvpCand[mvpIdx];
1679     setSearchRange(cu, mvp, merange, mvmin, mvmax);
1680
1681     int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
1682
1683     /* Get total cost of partition, but only include MV bit cost once */
1684     bits += m_me.bitcost(outmv);
1685     uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
1686
1687     /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
1688     checkBestMVP(amvpCand, outmv, mvp, mvpIdx, bits, cost);
1689
1690     /* tie goes to the smallest ref ID, just like --no-pme */
1691     ScopedLock _lock(master.m_outputLock);
1692     if (cost < master.m_bestME[list].cost ||
1693        (cost == master.m_bestME[list].cost && ref < master.m_bestME[list].ref))
1694     {
1695         master.m_bestME[list].mv = outmv;
1696         master.m_bestME[list].mvp = mvp;
1697         master.m_bestME[list].mvpIdx = mvpIdx;
1698         master.m_bestME[list].ref = ref;
1699         master.m_bestME[list].cost = cost;
1700         master.m_bestME[list].bits = bits;
1701     }
1702 }
1703
1704 /* search of the best candidate for inter prediction
1705  * returns true if predYuv was filled with a motion compensated prediction */
1706 bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChroma)
1707 {
1708     CUData& cu = interMode.cu;
1709     Yuv* predYuv = &interMode.predYuv;
1710
1711     MV amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS];
1712     MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
1713
1714     const Slice *slice = m_slice;
1715     PicYuv* fencPic = m_frame->m_origPicYuv;
1716     int numPart     = cu.getNumPartInter();
1717     int numPredDir  = slice->isInterP() ? 1 : 2;
1718     const int* numRefIdx = slice->m_numRefIdx;
1719     uint32_t lastMode = 0;
1720     int      totalmebits = 0;
1721     bool     bDistributed = m_param->bDistributeMotionEstimation && (numRefIdx[0] + numRefIdx[1]) > 2;
1722     MV       mvzero(0, 0);
1723     Yuv&     tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
1724
1725     MergeData merge;
1726     memset(&merge, 0, sizeof(merge));
1727
1728     for (int puIdx = 0; puIdx < numPart; puIdx++)
1729     {
1730         /* sets m_puAbsPartIdx, m_puWidth, m_puHeight */
1731         initMotionCompensation(cu, cuGeom, puIdx);
1732
1733         pixel* pu = fencPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
1734         m_me.setSourcePU(pu - fencPic->m_picOrg[0], m_puWidth, m_puHeight);
1735
1736         uint32_t mrgCost = MAX_UINT;
1737
1738         /* find best cost merge candidate */
1739         if (cu.m_partSize[m_puAbsPartIdx] != SIZE_2Nx2N)
1740         {
1741             merge.absPartIdx = m_puAbsPartIdx;
1742             merge.width      = m_puWidth;
1743             merge.height     = m_puHeight;
1744             mrgCost = mergeEstimation(cu, cuGeom, puIdx, merge);
1745
1746             if (bMergeOnly && cu.m_log2CUSize[0] > 3)
1747             {
1748                 if (mrgCost == MAX_UINT)
1749                 {
1750                     /* No valid merge modes were found, there is no possible way to
1751                      * perform a valid motion compensation prediction, so early-exit */
1752                     return false;
1753                 }
1754                 // set merge result
1755                 cu.m_mergeFlag[m_puAbsPartIdx] = true;
1756                 cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx
1757                 cu.setPUInterDir(merge.interDir, m_puAbsPartIdx, puIdx);
1758                 cu.setPUMv(0, merge.mvField[0].mv, m_puAbsPartIdx, puIdx);
1759                 cu.setPURefIdx(0, merge.mvField[0].refIdx, m_puAbsPartIdx, puIdx);
1760                 cu.setPUMv(1, merge.mvField[1].mv, m_puAbsPartIdx, puIdx);
1761                 cu.setPURefIdx(1, merge.mvField[1].refIdx, m_puAbsPartIdx, puIdx);
1762                 totalmebits += merge.bits;
1763
1764                 prepMotionCompensation(cu, cuGeom, puIdx);
1765                 motionCompensation(*predYuv, true, bChroma);
1766                 continue;
1767             }
1768         }
1769
1770         MotionData bidir[2];
1771         uint32_t bidirCost = MAX_UINT;
1772         int bidirBits = 0;
1773
1774         m_bestME[0].cost = MAX_UINT;
1775         m_bestME[1].cost = MAX_UINT;
1776
1777         getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits);
1778
1779         if (bDistributed)
1780         {
1781             m_curMECu = &cu;
1782             m_curGeom = &cuGeom;
1783
1784             /* this worker might already be enqueued for pmode, so other threads
1785              * might be looking at the ME job counts at any time, do these sets
1786              * in a safe order */
1787             m_curPart = puIdx;
1788             m_totalNumME = 0;
1789             m_numAcquiredME = 1;
1790             m_numCompletedME = 0;
1791             m_totalNumME = numRefIdx[0] + numRefIdx[1];
1792
1793             if (!m_bJobsQueued)
1794                 JobProvider::enqueue();
1795
1796             for (int i = 1; i < m_totalNumME; i++)
1797                 m_pool->pokeIdleThread();
1798
1799             while (m_totalNumME > m_numAcquiredME)
1800             {
1801                 int id = ATOMIC_INC(&m_numAcquiredME);
1802                 if (m_totalNumME >= id)
1803                 {
1804                     id -= 1;
1805                     if (id < numRefIdx[0])
1806                         singleMotionEstimation(*this, cu, cuGeom, puIdx, 0, id);
1807                     else
1808                         singleMotionEstimation(*this, cu, cuGeom, puIdx, 1, id - numRefIdx[0]);
1809
1810                     if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME)
1811                         m_meCompletionEvent.trigger();
1812                 }
1813             }
1814             if (!m_bJobsQueued)
1815                 JobProvider::dequeue();
1816
1817             /* we saved L0-0 for ourselves */
1818             singleMotionEstimation(*this, cu, cuGeom, puIdx, 0, 0);
1819             if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME)
1820                 m_meCompletionEvent.trigger();
1821
1822             m_meCompletionEvent.wait();
1823         }
1824         else
1825         {
1826             // Uni-directional prediction
1827             for (int l = 0; l < numPredDir; l++)
1828             {
1829                 for (int ref = 0; ref < numRefIdx[l]; ref++)
1830                 {
1831                     uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS;
1832                     bits += getTUBits(ref, numRefIdx[l]);
1833
1834                     int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, amvpCand[l][ref], mvc);
1835
1836                     // Pick the best possible MVP from AMVP candidates based on least residual
1837                     uint32_t bestCost = MAX_INT;
1838                     int mvpIdx = 0;
1839                     int merange = m_param->searchRange;
1840
1841                     for (int i = 0; i < AMVP_NUM_CANDS; i++)
1842                     {
1843                         MV mvCand = amvpCand[l][ref][i];
1844
1845                         // NOTE: skip mvCand if Y is > merange and -FN>1
1846                         if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
1847                             continue;
1848
1849                         cu.clipMv(mvCand);
1850                         predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPicYuv, mvCand);
1851                         uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
1852
1853                         if (bestCost > cost)
1854                         {
1855                             bestCost = cost;
1856                             mvpIdx  = i;
1857                         }
1858                     }
1859
1860                     MV mvmin, mvmax, outmv, mvp = amvpCand[l][ref][mvpIdx];
1861
1862                     setSearchRange(cu, mvp, merange, mvmin, mvmax);
1863                     int satdCost = m_me.motionEstimate(&slice->m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
1864
1865                     /* Get total cost of partition, but only include MV bit cost once */
1866                     bits += m_me.bitcost(outmv);
1867                     uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
1868
1869                     /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
1870                     checkBestMVP(amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost);
1871
1872                     if (cost < m_bestME[l].cost)
1873                     {
1874                         m_bestME[l].mv = outmv;
1875                         m_bestME[l].mvp = mvp;
1876                         m_bestME[l].mvpIdx = mvpIdx;
1877                         m_bestME[l].ref = ref;
1878                         m_bestME[l].cost = cost;
1879                         m_bestME[l].bits = bits;
1880                     }
1881                 }
1882             }
1883         }
1884
1885         /* Bi-directional prediction */
1886         if (slice->isInterB() && !cu.isBipredRestriction() && m_bestME[0].cost != MAX_UINT && m_bestME[1].cost != MAX_UINT)
1887         {
1888             bidir[0] = m_bestME[0];
1889             bidir[1] = m_bestME[1];
1890
1891             /* Generate reference subpels */
1892             PicYuv* refPic0  = slice->m_refPicList[0][m_bestME[0].ref]->m_reconPicYuv;
1893             PicYuv* refPic1  = slice->m_refPicList[1][m_bestME[1].ref]->m_reconPicYuv;
1894             Yuv*    bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
1895             predInterLumaPixel(bidirYuv[0], *refPic0, m_bestME[0].mv);
1896             predInterLumaPixel(bidirYuv[1], *refPic1, m_bestME[1].mv);
1897
1898             pixel *pred0 = bidirYuv[0].getLumaAddr(m_puAbsPartIdx);
1899             pixel *pred1 = bidirYuv[1].getLumaAddr(m_puAbsPartIdx);
1900
1901             int partEnum = partitionFromSizes(m_puWidth, m_puHeight);
1902             primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, pred0, bidirYuv[0].m_size, pred1, bidirYuv[1].m_size, 32);
1903             int satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
1904
1905             bidirBits = m_bestME[0].bits + m_bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
1906             bidirCost = satdCost + m_rdCost.getCost(bidirBits);
1907
1908             bool bTryZero = m_bestME[0].mv.notZero() || m_bestME[1].mv.notZero();
1909             if (bTryZero)
1910             {
1911                 /* Do not try zero MV if unidir motion predictors are beyond
1912                  * valid search area */
1913                 MV mvmin, mvmax;
1914                 int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
1915                 setSearchRange(cu, mvzero, merange, mvmin, mvmax);
1916                 mvmax.y += 2; // there is some pad for subpel refine
1917                 mvmin <<= 2;
1918                 mvmax <<= 2;
1919
1920                 bTryZero &= m_bestME[0].mvp.checkRange(mvmin, mvmax);
1921                 bTryZero &= m_bestME[1].mvp.checkRange(mvmin, mvmax);
1922             }
1923             if (bTryZero)
1924             {
1925                 // coincident blocks of the two reference pictures
1926                 pixel *ref0 = slice->m_mref[0][m_bestME[0].ref].fpelPlane + (pu - fencPic->m_picOrg[0]);
1927                 pixel *ref1 = slice->m_mref[1][m_bestME[1].ref].fpelPlane + (pu - fencPic->m_picOrg[0]);
1928                 intptr_t refStride = slice->m_mref[0][0].lumaStride;
1929
1930                 primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
1931                 satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
1932
1933                 MV mvp0 = m_bestME[0].mvp;
1934                 int mvpIdx0 = m_bestME[0].mvpIdx;
1935                 uint32_t bits0 = m_bestME[0].bits - m_me.bitcost(m_bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
1936
1937                 MV mvp1 = m_bestME[1].mvp;
1938                 int mvpIdx1 = m_bestME[1].mvpIdx;
1939                 uint32_t bits1 = m_bestME[1].bits - m_me.bitcost(m_bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
1940
1941                 uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
1942
1943                 if (bDistributed)
1944                 {
1945                     cu.fillMvpCand(puIdx, m_puAbsPartIdx, 0, m_bestME[0].ref, amvpCand[0][m_bestME[0].ref], mvc);
1946                     cu.fillMvpCand(puIdx, m_puAbsPartIdx, 1, m_bestME[1].ref, amvpCand[1][m_bestME[1].ref], mvc);
1947                 }
1948
1949                 /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
1950                 checkBestMVP(amvpCand[0][m_bestME[0].ref], mvzero, mvp0, mvpIdx0, bits0, cost);
1951                 checkBestMVP(amvpCand[1][m_bestME[1].ref], mvzero, mvp1, mvpIdx1, bits1, cost);
1952
1953                 if (cost < bidirCost)
1954                 {
1955                     bidir[0].mv = mvzero;
1956                     bidir[1].mv = mvzero;
1957                     bidir[0].mvp = mvp0;
1958                     bidir[1].mvp = mvp1;
1959                     bidir[0].mvpIdx = mvpIdx0;
1960                     bidir[1].mvpIdx = mvpIdx1;
1961                     bidirCost = cost;
1962                     bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
1963                 }
1964             }
1965         }
1966
1967         /* select best option and store into CU */
1968         if (mrgCost < bidirCost && mrgCost < m_bestME[0].cost && mrgCost < m_bestME[1].cost)
1969         {
1970             cu.m_mergeFlag[m_puAbsPartIdx] = true;
1971             cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx
1972             cu.setPUInterDir(merge.interDir, m_puAbsPartIdx, puIdx);
1973             cu.setPUMv(0, merge.mvField[0].mv, m_puAbsPartIdx, puIdx);
1974             cu.setPURefIdx(0, merge.mvField[0].refIdx, m_puAbsPartIdx, puIdx);
1975             cu.setPUMv(1, merge.mvField[1].mv, m_puAbsPartIdx, puIdx);
1976             cu.setPURefIdx(1, merge.mvField[1].refIdx, m_puAbsPartIdx, puIdx);
1977
1978             totalmebits += merge.bits;
1979         }
1980         else if (bidirCost < m_bestME[0].cost && bidirCost < m_bestME[1].cost)
1981         {
1982             lastMode = 2;
1983
1984             cu.m_mergeFlag[m_puAbsPartIdx] = false;
1985             cu.setPUInterDir(3, m_puAbsPartIdx, puIdx);
1986             cu.setPUMv(0, bidir[0].mv, m_puAbsPartIdx, puIdx);
1987             cu.setPURefIdx(0, m_bestME[0].ref, m_puAbsPartIdx, puIdx);
1988             cu.m_mvd[0][m_puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
1989             cu.m_mvpIdx[0][m_puAbsPartIdx] = bidir[0].mvpIdx;
1990
1991             cu.setPUMv(1, bidir[1].mv, m_puAbsPartIdx, puIdx);
1992             cu.setPURefIdx(1, m_bestME[1].ref, m_puAbsPartIdx, puIdx);
1993             cu.m_mvd[1][m_puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
1994             cu.m_mvpIdx[1][m_puAbsPartIdx] = bidir[1].mvpIdx;
1995
1996             totalmebits += bidirBits;
1997         }
1998         else if (m_bestME[0].cost <= m_bestME[1].cost)
1999         {
2000             lastMode = 0;
2001
2002             cu.m_mergeFlag[m_puAbsPartIdx] = false;
2003             cu.setPUInterDir(1, m_puAbsPartIdx, puIdx);
2004             cu.setPUMv(0, m_bestME[0].mv, m_puAbsPartIdx, puIdx);
2005             cu.setPURefIdx(0, m_bestME[0].ref, m_puAbsPartIdx, puIdx);
2006             cu.m_mvd[0][m_puAbsPartIdx] = m_bestME[0].mv - m_bestME[0].mvp;
2007             cu.m_mvpIdx[0][m_puAbsPartIdx] = m_bestME[0].mvpIdx;
2008
2009             cu.setPURefIdx(1, REF_NOT_VALID, m_puAbsPartIdx, puIdx);
2010             cu.setPUMv(1, mvzero, m_puAbsPartIdx, puIdx);
2011
2012             totalmebits += m_bestME[0].bits;
2013         }
2014         else
2015         {
2016             lastMode = 1;
2017
2018             cu.m_mergeFlag[m_puAbsPartIdx] = false;
2019             cu.setPUInterDir(2, m_puAbsPartIdx, puIdx);
2020             cu.setPUMv(1, m_bestME[1].mv, m_puAbsPartIdx, puIdx);
2021             cu.setPURefIdx(1, m_bestME[1].ref, m_puAbsPartIdx, puIdx);
2022             cu.m_mvd[1][m_puAbsPartIdx] = m_bestME[1].mv - m_bestME[1].mvp;
2023             cu.m_mvpIdx[1][m_puAbsPartIdx] = m_bestME[1].mvpIdx;
2024
2025             cu.setPURefIdx(0, REF_NOT_VALID, m_puAbsPartIdx, puIdx);
2026             cu.setPUMv(0, mvzero, m_puAbsPartIdx, puIdx);
2027
2028             totalmebits += m_bestME[1].bits;
2029         }
2030
2031         prepMotionCompensation(cu, cuGeom, puIdx);
2032         motionCompensation(*predYuv, true, bChroma);
2033     }
2034
2035     interMode.sa8dBits += totalmebits;
2036     return true;
2037 }
2038
2039 void Search::getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3])
2040 {
2041     if (cuMode == SIZE_2Nx2N)
2042     {
2043         blockBit[0] = (!bPSlice) ? 3 : 1;
2044         blockBit[1] = 3;
2045         blockBit[2] = 5;
2046     }
2047     else if (cuMode == SIZE_2NxN || cuMode == SIZE_2NxnU || cuMode == SIZE_2NxnD)
2048     {
2049         static const uint32_t listBits[2][3][3] =
2050         {
2051             { { 0, 0, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
2052             { { 5, 7, 7 }, { 7, 5, 7 }, { 9 - 3, 9 - 3, 9 - 3 } }
2053         };
2054         if (bPSlice)
2055         {
2056             blockBit[0] = 3;
2057             blockBit[1] = 0;
2058             blockBit[2] = 0;
2059         }
2060         else
2061             memcpy(blockBit, listBits[partIdx][lastMode], 3 * sizeof(uint32_t));
2062     }
2063     else if (cuMode == SIZE_Nx2N || cuMode == SIZE_nLx2N || cuMode == SIZE_nRx2N)
2064     {
2065         static const uint32_t listBits[2][3][3] =
2066         {
2067             { { 0, 2, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
2068             { { 5, 7, 7 }, { 7 - 2, 7 - 2, 9 - 2 }, { 9 - 3, 9 - 3, 9 - 3 } }
2069         };
2070         if (bPSlice)
2071         {
2072             blockBit[0] = 3;
2073             blockBit[1] = 0;
2074             blockBit[2] = 0;
2075         }
2076         else
2077             memcpy(blockBit, listBits[partIdx][lastMode], 3 * sizeof(uint32_t));
2078     }
2079     else if (cuMode == SIZE_NxN)
2080     {
2081         blockBit[0] = (!bPSlice) ? 3 : 1;
2082         blockBit[1] = 3;
2083         blockBit[2] = 5;
2084     }
2085     else
2086     {
2087         X265_CHECK(0, "getBlkBits: unknown cuMode\n");
2088     }
2089 }
2090
2091 /* Check if using an alternative MVP would result in a smaller MVD + signal bits */
2092 void Search::checkBestMVP(MV* amvpCand, MV mv, MV& mvPred, int& outMvpIdx, uint32_t& outBits, uint32_t& outCost) const
2093 {
2094     X265_CHECK(amvpCand[outMvpIdx] == mvPred, "checkBestMVP: unexpected mvPred\n");
2095
2096     int mvpIdx = !outMvpIdx;
2097     MV mvp = amvpCand[mvpIdx];
2098     int diffBits = m_me.bitcost(mv, mvp) - m_me.bitcost(mv, mvPred);
2099     if (diffBits < 0)
2100     {
2101         outMvpIdx = mvpIdx;
2102         mvPred = mvp;
2103         uint32_t origOutBits = outBits;
2104         outBits = origOutBits + diffBits;
2105         outCost = (outCost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(outBits);
2106     }
2107 }
2108
2109 void Search::setSearchRange(const CUData& cu, MV mvp, int merange, MV& mvmin, MV& mvmax) const
2110 {
2111     cu.clipMv(mvp);
2112
2113     MV dist((int16_t)merange << 2, (int16_t)merange << 2);
2114     mvmin = mvp - dist;
2115     mvmax = mvp + dist;
2116
2117     cu.clipMv(mvmin);
2118     cu.clipMv(mvmax);
2119
2120     /* Clip search range to signaled maximum MV length.
2121      * We do not support this VUI field being changed from the default */
2122     const int maxMvLen = (1 << 15) - 1;
2123     mvmin.x = X265_MAX(mvmin.x, -maxMvLen);
2124     mvmin.y = X265_MAX(mvmin.y, -maxMvLen);
2125     mvmax.x = X265_MIN(mvmax.x, maxMvLen);
2126     mvmax.y = X265_MIN(mvmax.y, maxMvLen);
2127
2128     mvmin >>= 2;
2129     mvmax >>= 2;
2130
2131     /* conditional clipping for frame parallelism */
2132     mvmin.y = X265_MIN(mvmin.y, (int16_t)m_refLagPixels);
2133     mvmax.y = X265_MIN(mvmax.y, (int16_t)m_refLagPixels);
2134 }
2135
2136 /* Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
2137 void Search::encodeResAndCalcRdSkipCU(Mode& interMode)
2138 {
2139     CUData& cu = interMode.cu;
2140     Yuv* reconYuv = &interMode.reconYuv;
2141     const Yuv* fencYuv = interMode.fencYuv;
2142
2143     X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
2144
2145     uint32_t cuSize = 1 << cu.m_log2CUSize[0];
2146     uint32_t depth  = cu.m_cuDepth[0];
2147
2148     // No residual coding : SKIP mode
2149
2150     cu.setSkipFlagSubParts(true);
2151     cu.clearCbf();
2152     cu.setTUDepthSubParts(0, 0, depth);
2153
2154     reconYuv->copyFromYuv(interMode.predYuv);
2155
2156     // Luma
2157     int part = partitionFromLog2Size(cu.m_log2CUSize[0]);
2158     interMode.distortion = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
2159     // Chroma
2160     part = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
2161     interMode.distortion += m_rdCost.scaleChromaDistCb(primitives.sse_pp[part](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
2162     interMode.distortion += m_rdCost.scaleChromaDistCr(primitives.sse_pp[part](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
2163
2164     m_entropyCoder.load(m_rqt[depth].cur);
2165     m_entropyCoder.resetBits();
2166     if (m_slice->m_pps->bTransquantBypassEnabled)
2167         m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
2168     m_entropyCoder.codeSkipFlag(cu, 0);
2169     m_entropyCoder.codeMergeIndex(cu, 0);
2170
2171     interMode.mvBits = m_entropyCoder.getNumberOfWrittenBits();
2172     interMode.coeffBits = 0;
2173     interMode.totalBits = interMode.mvBits;
2174     if (m_rdCost.m_psyRd)
2175         interMode.psyEnergy = m_rdCost.psyCost(cu.m_log2CUSize[0] - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
2176
2177     updateModeCost(interMode);
2178     m_entropyCoder.store(interMode.contexts);
2179 }
2180
2181 /* encode residual and calculate rate-distortion for a CU block.
2182  * Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
2183 void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
2184 {
2185     CUData& cu = interMode.cu;
2186     Yuv* reconYuv = &interMode.reconYuv;
2187     Yuv* predYuv = &interMode.predYuv;
2188     ShortYuv* resiYuv = &m_rqt[cuGeom.depth].tmpResiYuv;
2189     const Yuv* fencYuv = interMode.fencYuv;
2190
2191     X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
2192
2193     uint32_t log2CUSize = cu.m_log2CUSize[0];
2194     uint32_t cuSize = 1 << log2CUSize;
2195     uint32_t depth  = cu.m_cuDepth[0];
2196
2197     int part = partitionFromLog2Size(log2CUSize);
2198     int cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
2199
2200     m_quant.setQPforQuant(interMode.cu);
2201
2202     resiYuv->subtract(*fencYuv, *predYuv, log2CUSize);
2203
2204     uint32_t tuDepthRange[2];
2205     cu.getInterTUQtDepthRange(tuDepthRange, 0);
2206
2207     m_entropyCoder.load(m_rqt[depth].cur);
2208
2209     Cost costs;
2210     estimateResidualQT(interMode, cuGeom, 0, depth, *resiYuv, costs, tuDepthRange);
2211
2212     if (!cu.m_tqBypass[0])
2213     {
2214         uint32_t cbf0Dist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
2215         cbf0Dist += m_rdCost.scaleChromaDistCb(primitives.sse_pp[cpart](fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
2216         cbf0Dist += m_rdCost.scaleChromaDistCr(primitives.sse_pp[cpart](fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
2217
2218         /* Consider the RD cost of not signaling any residual */
2219         m_entropyCoder.load(m_rqt[depth].cur);
2220         m_entropyCoder.resetBits();
2221         m_entropyCoder.codeQtRootCbfZero();
2222         uint32_t cbf0Bits = m_entropyCoder.getNumberOfWrittenBits();
2223
2224         uint64_t cbf0Cost;
2225         uint32_t cbf0Energy;
2226         if (m_rdCost.m_psyRd)
2227         {
2228             cbf0Energy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
2229             cbf0Cost = m_rdCost.calcPsyRdCost(cbf0Dist, cbf0Bits, cbf0Energy);
2230         }
2231         else
2232             cbf0Cost = m_rdCost.calcRdCost(cbf0Dist, cbf0Bits);
2233
2234         if (cbf0Cost < costs.rdcost)
2235         {
2236             cu.clearCbf();
2237             cu.setTUDepthSubParts(0, 0, depth);
2238         }
2239     }
2240
2241     if (cu.getQtRootCbf(0))
2242         saveResidualQTData(cu, *resiYuv, 0, depth);
2243
2244     /* calculate signal bits for inter/merge/skip coded CU */
2245     m_entropyCoder.load(m_rqt[depth].cur);
2246
2247     uint32_t coeffBits, bits;
2248     if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
2249     {
2250         cu.setSkipFlagSubParts(true);
2251
2252         /* Merge/Skip */
2253         m_entropyCoder.resetBits();
2254         if (m_slice->m_pps->bTransquantBypassEnabled)
2255             m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
2256         m_entropyCoder.codeSkipFlag(cu, 0);
2257         m_entropyCoder.codeMergeIndex(cu, 0);
2258         coeffBits = 0;
2259         bits = m_entropyCoder.getNumberOfWrittenBits();
2260     }
2261     else
2262     {
2263         m_entropyCoder.resetBits();
2264         if (m_slice->m_pps->bTransquantBypassEnabled)
2265             m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
2266         m_entropyCoder.codeSkipFlag(cu, 0);
2267         m_entropyCoder.codePredMode(cu.m_predMode[0]);
2268         m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]);
2269         m_entropyCoder.codePredInfo(cu, 0);
2270         uint32_t mvBits = m_entropyCoder.getNumberOfWrittenBits();
2271
2272         bool bCodeDQP = m_slice->m_pps->bUseDQP;
2273         m_entropyCoder.codeCoeff(cu, 0, cu.m_cuDepth[0], bCodeDQP, tuDepthRange);
2274         bits = m_entropyCoder.getNumberOfWrittenBits();
2275
2276         coeffBits = bits - mvBits;
2277     }
2278
2279     m_entropyCoder.store(interMode.contexts);
2280
2281     if (cu.getQtRootCbf(0))
2282         reconYuv->addClip(*predYuv, *resiYuv, log2CUSize);
2283     else
2284         reconYuv->copyFromYuv(*predYuv);
2285
2286     // update with clipped distortion and cost (qp estimation loop uses unclipped values)
2287     uint32_t bestDist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
2288     bestDist += m_rdCost.scaleChromaDistCb(primitives.sse_pp[cpart](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
2289     bestDist += m_rdCost.scaleChromaDistCr(primitives.sse_pp[cpart](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
2290     if (m_rdCost.m_psyRd)
2291         interMode.psyEnergy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
2292
2293     interMode.totalBits = bits;
2294     interMode.distortion = bestDist;
2295     interMode.coeffBits = coeffBits;
2296     interMode.mvBits = bits - coeffBits;
2297     updateModeCost(interMode);
2298 }
2299
2300 void Search::generateCoeffRecon(Mode& mode, const CUGeom& cuGeom)
2301 {
2302     CUData& cu = mode.cu;
2303
2304     m_quant.setQPforQuant(mode.cu);
2305
2306     if (cu.m_predMode[0] == MODE_INTER)
2307     {
2308         uint32_t tuDepthRange[2];
2309         cu.getInterTUQtDepthRange(tuDepthRange, 0);
2310
2311         residualTransformQuantInter(mode, cuGeom, 0, cu.m_cuDepth[0], tuDepthRange);
2312         if (cu.getQtRootCbf(0))
2313             mode.reconYuv.addClip(mode.predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]);
2314         else
2315         {
2316             mode.reconYuv.copyFromYuv(mode.predYuv);
2317             if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
2318                 cu.setSkipFlagSubParts(true);
2319         }
2320     }
2321     else if (cu.m_predMode[0] == MODE_INTRA)
2322     {
2323         uint32_t tuDepthRange[2];
2324         cu.getIntraTUQtDepthRange(tuDepthRange, 0);
2325
2326         uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN;
2327         residualTransformQuantIntra(mode, cuGeom, initTrDepth, 0, tuDepthRange);
2328         getBestIntraModeChroma(mode, cuGeom);
2329         residualQTIntraChroma(mode, cuGeom, 0, 0);
2330         mode.reconYuv.copyFromPicYuv(*m_frame->m_reconPicYuv, cu.m_cuAddr, cuGeom.encodeIdx); // TODO:
2331     }
2332 }
2333
2334 void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, uint32_t depthRange[2])
2335 {
2336     CUData& cu = mode.cu;
2337     X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "invalid depth\n");
2338
2339     uint32_t log2TrSize = g_maxLog2CUSize - depth;
2340     uint32_t tuDepth = depth - cu.m_cuDepth[0];
2341
2342     bool bCheckFull = log2TrSize <= depthRange[1];
2343     if (cu.m_partSize[absPartIdx] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && log2TrSize > depthRange[0])
2344         bCheckFull = false;
2345
2346     if (bCheckFull)
2347     {
2348         // code full block
2349         uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
2350         bool bCodeChroma = true;
2351         uint32_t tuDepthC = tuDepth;
2352         if (log2TrSizeC == 1)
2353         {
2354             X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444, "tuQuad check failed\n");
2355             log2TrSizeC++;
2356             tuDepthC--;
2357             uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
2358             bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
2359         }
2360
2361         uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
2362         uint32_t setCbf = 1 << tuDepth;
2363
2364         uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
2365         coeff_t *coeffCurY = cu.m_trCoeff[0] + coeffOffsetY;
2366
2367         uint32_t sizeIdx  = log2TrSize  - 2;
2368
2369         cu.setTUDepthSubParts(depth - cu.m_cuDepth[0], absPartIdx, depth);
2370         cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
2371
2372         ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
2373         const Yuv* fencYuv = mode.fencYuv;
2374
2375         int16_t *curResiY = resiYuv.getLumaAddr(absPartIdx);
2376         uint32_t strideResiY = resiYuv.m_size;
2377
2378         pixel *fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
2379         uint32_t numSigY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
2380
2381         if (numSigY)
2382         {
2383             m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSigY);
2384             cu.setCbfSubParts(setCbf, TEXT_LUMA, absPartIdx, depth);
2385         }
2386         else
2387         {
2388             primitives.blockfill_s[sizeIdx](curResiY, strideResiY, 0);
2389             cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, depth);
2390         }
2391
2392         if (bCodeChroma)
2393         {
2394             uint32_t sizeIdxC = log2TrSizeC - 2;
2395             uint32_t strideResiC = resiYuv.m_csize;
2396
2397             uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
2398             coeff_t *coeffCurU = cu.m_trCoeff[1] + coeffOffsetC;
2399             coeff_t *coeffCurV = cu.m_trCoeff[2] + coeffOffsetC;
2400             bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
2401
2402             TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
2403             do
2404             {
2405                 uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
2406                 uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
2407
2408                 cu.setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
2409                 cu.setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
2410
2411                 int16_t* curResiU = resiYuv.getCbAddr(absPartIdxC);
2412                 pixel*   fencCb = const_cast<pixel*>(fencYuv->getCbAddr(absPartIdxC));
2413                 uint32_t numSigU = m_quant.transformNxN(cu, fencCb, fencYuv->m_csize, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false);
2414                 if (numSigU)
2415                 {
2416                     m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, false, false, numSigU);
2417                     cu.setCbfPartRange(setCbf, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
2418                 }
2419                 else
2420                 {
2421                     primitives.blockfill_s[sizeIdxC](curResiU, strideResiC, 0);
2422                     cu.setCbfPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
2423                 }
2424
2425                 int16_t* curResiV = resiYuv.getCrAddr(absPartIdxC);
2426                 pixel*   fencCr = const_cast<pixel*>(fencYuv->getCrAddr(absPartIdxC));
2427                 uint32_t numSigV = m_quant.transformNxN(cu, fencCr, fencYuv->m_csize, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false);
2428                 if (numSigV)
2429                 {
2430                     m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, false, false, numSigV);
2431                     cu.setCbfPartRange(setCbf, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
2432                 }
2433                 else
2434                 {
2435                     primitives.blockfill_s[sizeIdxC](curResiV, strideResiC, 0);
2436                     cu.setCbfPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
2437                 }
2438             }
2439             while (tuIterator.isNextSection());
2440
2441             if (splitIntoSubTUs)
2442             {
2443                 offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
2444                 offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
2445             }
2446         }
2447     }
2448     else
2449     {
2450         X265_CHECK(log2TrSize > depthRange[0], "residualTransformQuantInter recursion check failure\n");
2451
2452         const uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
2453         uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
2454         for (uint32_t i = 0; i < 4; i++)
2455         {
2456             residualTransformQuantInter(mode, cuGeom, absPartIdx + i * qPartNumSubdiv, depth + 1, depthRange);
2457             ycbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_LUMA, tuDepth + 1);
2458             ucbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_U, tuDepth + 1);
2459             vcbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_V, tuDepth + 1);
2460         }
2461         for (uint32_t i = 0; i < 4 * qPartNumSubdiv; i++)
2462         {
2463             cu.m_cbf[TEXT_LUMA][absPartIdx + i] |= ycbf << tuDepth;
2464             cu.m_cbf[TEXT_CHROMA_U][absPartIdx + i] |= ucbf << tuDepth;
2465             cu.m_cbf[TEXT_CHROMA_V][absPartIdx + i] |= vcbf << tuDepth;
2466         }
2467     }
2468 }
2469
2470 void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& outCosts, uint32_t depthRange[2])
2471 {
2472     CUData& cu = mode.cu;
2473     uint32_t log2TrSize = g_maxLog2CUSize - depth;
2474
2475     bool bCheckSplit = log2TrSize > depthRange[0];
2476     bool bCheckFull = log2TrSize <= depthRange[1];
2477
2478     if (cu.m_partSize[absPartIdx] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && bCheckSplit)
2479         bCheckFull = false;
2480
2481     X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
2482     X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
2483
2484     uint32_t tuDepth = depth - cu.m_cuDepth[0];
2485     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
2486     bool bCodeChroma = true;
2487     uint32_t tuDepthC = tuDepth;
2488     if ((log2TrSize == 2) && !(m_csp == X265_CSP_I444))
2489     {
2490         log2TrSizeC++;
2491         tuDepthC--;
2492         uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
2493         bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
2494     }
2495
2496     // code full block
2497     Cost fullCost;
2498     fullCost.rdcost = MAX_INT64;
2499
2500     uint8_t  cbfFlag[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
2501     uint32_t numSig[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
2502     uint32_t singleBitsComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2503     uint32_t singleDistComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2504     uint32_t singlePsyEnergyComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2505     uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2506     uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, MAX_INT64}, {MAX_INT64, MAX_INT64} };
2507
2508     m_entropyCoder.store(m_rqt[depth].rqtRoot);
2509
2510     uint32_t trSize = 1 << log2TrSize;
2511     const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
2512     uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] +  tuDepthC) << 1);
2513     const Yuv* fencYuv = mode.fencYuv;
2514
2515     // code full block
2516     if (bCheckFull)
2517     {
2518         uint32_t trSizeC = 1 << log2TrSizeC;
2519         int partSize  = partitionFromLog2Size(log2TrSize);
2520         int partSizeC = partitionFromLog2Size(log2TrSizeC);
2521         const uint32_t qtLayer = log2TrSize - 2;
2522         uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
2523         coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
2524
2525         bool checkTransformSkip   = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0];
2526         bool checkTransformSkipY  = checkTransformSkip && log2TrSize  <= MAX_LOG2_TS_SIZE;
2527         bool checkTransformSkipC = checkTransformSkip && log2TrSizeC <= MAX_LOG2_TS_SIZE;
2528
2529         cu.setTUDepthSubParts(depth - cu.m_cuDepth[0], absPartIdx, depth);
2530         cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
2531
2532         if (m_bEnableRDOQ)
2533             m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
2534
2535         pixel *fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
2536         int16_t *resi = resiYuv.getLumaAddr(absPartIdx);
2537         numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
2538         cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0];
2539
2540         m_entropyCoder.resetBits();
2541         m_entropyCoder.codeQtCbf(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
2542         if (cbfFlag[TEXT_LUMA][0])
2543             m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
2544         singleBitsComp[TEXT_LUMA][0] = m_entropyCoder.getNumberOfWrittenBits();
2545
2546         uint32_t singleBitsPrev = singleBitsComp[TEXT_LUMA][0];
2547
2548         if (bCodeChroma)
2549         {
2550             uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
2551             for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
2552             {
2553                 coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
2554                 TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
2555
2556                 do
2557                 {
2558                     uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
2559                     uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
2560
2561                     cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
2562
2563                     if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
2564                         m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
2565
2566                     fenc = const_cast<pixel*>(fencYuv->getChromaAddr(chromaId, absPartIdxC));
2567                     resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
2568                     numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
2569                     cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
2570
2571                     m_entropyCoder.codeQtCbf(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
2572                     if (cbfFlag[chromaId][tuIterator.section])
2573                         m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
2574
2575                     uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits();
2576                     singleBitsComp[chromaId][tuIterator.section] = newBits - singleBitsPrev;
2577
2578                     singleBitsPrev = newBits;
2579                 }
2580                 while (tuIterator.isNextSection());
2581             }
2582         }
2583
2584         const uint32_t numCoeffY = 1 << (log2TrSize * 2);
2585         const uint32_t numCoeffC = 1 << (log2TrSizeC * 2);
2586
2587         X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n");
2588         uint32_t distY = primitives.ssd_s[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size);
2589         uint32_t psyEnergyY = 0;
2590         if (m_rdCost.m_psyRd)
2591             psyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, (int16_t*)zeroShort, 0);
2592
2593         int16_t *curResiY    = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
2594         uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size;
2595
2596         if (cbfFlag[TEXT_LUMA][0])
2597         {
2598             m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSig[TEXT_LUMA][0]); //this is for inter mode only
2599
2600             const uint32_t nonZeroDistY = primitives.sse_ss[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
2601             uint32_t nonZeroPsyEnergyY = 0;
2602             if (m_rdCost.m_psyRd)
2603                 nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
2604
2605             if (cu.m_tqBypass[0])
2606             {
2607                 distY = nonZeroDistY;
2608                 psyEnergyY = nonZeroPsyEnergyY;
2609             }
2610             else
2611             {
2612                 uint64_t singleCostY = 0;
2613                 if (m_rdCost.m_psyRd)
2614                     singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0], nonZeroPsyEnergyY);
2615                 else
2616                     singleCostY = m_rdCost.calcRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0]);
2617                 m_entropyCoder.resetBits();
2618                 m_entropyCoder.codeQtCbfZero(TEXT_LUMA, tuDepth);
2619                 const uint32_t nullBitsY = m_entropyCoder.getNumberOfWrittenBits();
2620                 uint64_t nullCostY = 0;
2621                 if (m_rdCost.m_psyRd)
2622                     nullCostY = m_rdCost.calcPsyRdCost(distY, nullBitsY, psyEnergyY);
2623                 else
2624                     nullCostY = m_rdCost.calcRdCost(distY, nullBitsY);
2625                 if (nullCostY < singleCostY)
2626                 {
2627                     cbfFlag[TEXT_LUMA][0] = 0;
2628 #if CHECKED_BUILD || _DEBUG
2629                     memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY);
2630 #endif
2631                     if (checkTransformSkipY)
2632                         minCost[TEXT_LUMA][0] = nullCostY;
2633                 }
2634                 else
2635                 {
2636                     distY = nonZeroDistY;
2637                     psyEnergyY = nonZeroPsyEnergyY;
2638                     if (checkTransformSkipY)
2639                         minCost[TEXT_LUMA][0] = singleCostY;
2640                 }
2641             }
2642         }
2643         else if (checkTransformSkipY)
2644         {
2645             m_entropyCoder.resetBits();
2646             m_entropyCoder.codeQtCbfZero(TEXT_LUMA, tuDepth);
2647             const uint32_t nullBitsY = m_entropyCoder.getNumberOfWrittenBits();
2648             if (m_rdCost.m_psyRd)
2649                 minCost[TEXT_LUMA][0] = m_rdCost.calcPsyRdCost(distY, nullBitsY, psyEnergyY);
2650             else
2651                 minCost[TEXT_LUMA][0] = m_rdCost.calcRdCost(distY, nullBitsY);
2652         }
2653
2654         singleDistComp[TEXT_LUMA][0] = distY;
2655         singlePsyEnergyComp[TEXT_LUMA][0] = psyEnergyY;
2656         if (!cbfFlag[TEXT_LUMA][0])
2657             primitives.blockfill_s[partSize](curResiY, strideResiY, 0);
2658         cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
2659
2660         if (bCodeChroma)
2661         {
2662             uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
2663             uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
2664             for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
2665             {
2666                 uint32_t distC = 0, psyEnergyC = 0;
2667                 coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
2668                 TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
2669
2670             do
2671             {
2672                 uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
2673                 uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
2674
2675                 int16_t *curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
2676
2677                 distC = m_rdCost.scaleChromaDistCb(primitives.ssd_s[log2TrSizeC - 2](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize));
2678
2679                 if (cbfFlag[chromaId][tuIterator.section])
2680                 {
2681                     m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiC, strideResiC, coeffCurC + subTUOffset,
2682                                             log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]);
2683                     uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
2684                     const uint32_t nonZeroDistC = m_rdCost.scaleChromaDistCb(dist);
2685                     uint32_t nonZeroPsyEnergyC = 0;
2686                     if (m_rdCost.m_psyRd)
2687                         nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
2688
2689                     if (cu.m_tqBypass[0])
2690                     {
2691                         distC = nonZeroDistC;
2692                         psyEnergyC = nonZeroPsyEnergyC;
2693                     }
2694                     else
2695                     {
2696                         uint64_t singleCostC = 0;
2697                         if (m_rdCost.m_psyRd)
2698                             singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section], nonZeroPsyEnergyC);
2699                         else
2700                             singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section]);
2701                         m_entropyCoder.resetBits();
2702                         m_entropyCoder.codeQtCbfZero((TextType)chromaId, tuDepth);
2703                         const uint32_t nullBitsC = m_entropyCoder.getNumberOfWrittenBits();
2704                         uint64_t nullCostC = 0;
2705                         if (m_rdCost.m_psyRd)
2706                             nullCostC = m_rdCost.calcPsyRdCost(distC, nullBitsC, psyEnergyC);
2707                         else
2708                             nullCostC = m_rdCost.calcRdCost(distC, nullBitsC);
2709                         if (nullCostC < singleCostC)
2710                         {
2711                             cbfFlag[chromaId][tuIterator.section] = 0;
2712 #if CHECKED_BUILD || _DEBUG
2713                                 memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
2714 #endif
2715                                 if (checkTransformSkipC)
2716                                     minCost[chromaId][tuIterator.section] = nullCostC;
2717                             }
2718                             else
2719                             {
2720                                 distC = nonZeroDistC;
2721                                 psyEnergyC = nonZeroPsyEnergyC;
2722                                 if (checkTransformSkipC)
2723                                     minCost[chromaId][tuIterator.section] = singleCostC;
2724                             }
2725                         }
2726                     }
2727                     else if (checkTransformSkipC)
2728                     {
2729                         m_entropyCoder.resetBits();
2730                         m_entropyCoder.codeQtCbfZero((TextType)chromaId, tuDepthC);
2731                         const uint32_t nullBitsC = m_entropyCoder.getNumberOfWrittenBits();
2732                         if (m_rdCost.m_psyRd)
2733                             minCost[chromaId][tuIterator.section] = m_rdCost.calcPsyRdCost(distC, nullBitsC, psyEnergyC);
2734                         else
2735                             minCost[chromaId][tuIterator.section] = m_rdCost.calcRdCost(distC, nullBitsC);
2736                     }
2737
2738                     singleDistComp[chromaId][tuIterator.section] = distC;
2739                     singlePsyEnergyComp[chromaId][tuIterator.section] = psyEnergyC;
2740
2741                     if (!cbfFlag[chromaId][tuIterator.section])
2742                         primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0);
2743
2744                     cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
2745                 }
2746                 while (tuIterator.isNextSection());
2747             }
2748         }
2749
2750         if (checkTransformSkipY)
2751         {
2752             uint32_t nonZeroDistY = 0;
2753             uint32_t nonZeroPsyEnergyY = 0;
2754             uint64_t singleCostY = MAX_INT64;
2755
2756             ALIGN_VAR_32(coeff_t, tsCoeffY[MAX_TS_SIZE * MAX_TS_SIZE]);
2757             ALIGN_VAR_32(int16_t, tsResiY[MAX_TS_SIZE * MAX_TS_SIZE]);
2758
2759             m_entropyCoder.load(m_rqt[depth].rqtRoot);
2760
2761             cu.setTransformSkipSubParts(1, TEXT_LUMA, absPartIdx, depth);
2762
2763             if (m_bEnableRDOQ)
2764                 m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
2765
2766             fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
2767             resi = resiYuv.getLumaAddr(absPartIdx);
2768             uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, tsCoeffY, log2TrSize, TEXT_LUMA, absPartIdx, true);
2769
2770             if (numSigTSkipY)
2771             {
2772                 m_entropyCoder.resetBits();
2773                 m_entropyCoder.codeQtCbf(!!numSigTSkipY, TEXT_LUMA, tuDepth);
2774                 m_entropyCoder.codeCoeffNxN(cu, tsCoeffY, absPartIdx, log2TrSize, TEXT_LUMA);
2775                 const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits();
2776
2777                 m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], tsResiY, trSize, tsCoeffY, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);
2778
2779                 nonZeroDistY = primitives.sse_ss[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, tsResiY, trSize);
2780
2781                 if (m_rdCost.m_psyRd)
2782                 {
2783                     nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, tsResiY, trSize);
2784                     singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroPsyEnergyY);
2785                 }
2786                 else
2787                     singleCostY = m_rdCost.calcRdCost(nonZeroDistY, skipSingleBitsY);
2788             }
2789
2790             if (!numSigTSkipY || minCost[TEXT_LUMA][0] < singleCostY)
2791                 cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
2792             else
2793             {
2794                 singleDistComp[TEXT_LUMA][0] = nonZeroDistY;
2795                 singlePsyEnergyComp[TEXT_LUMA][0] = nonZeroPsyEnergyY;
2796                 cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
2797                 bestTransformMode[TEXT_LUMA][0] = 1;
2798                 memcpy(coeffCurY, tsCoeffY, sizeof(coeff_t) * numCoeffY);
2799                 primitives.square_copy_ss[partSize](curResiY, strideResiY, tsResiY, trSize);
2800             }
2801
2802             cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
2803         }
2804
2805         if (bCodeChroma && checkTransformSkipC)
2806         {
2807             uint32_t nonZeroDistC = 0, nonZeroPsyEnergyC = 0;
2808             uint64_t singleCostC = MAX_INT64;
2809             uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
2810             uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
2811
2812             m_entropyCoder.load(m_rqt[depth].rqtRoot);
2813
2814             for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
2815             {
2816                 coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
2817                 TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
2818
2819                 do
2820                 {
2821                     uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
2822                     uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
2823
2824                     int16_t *curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
2825
2826                     ALIGN_VAR_32(coeff_t, tsCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
2827                     ALIGN_VAR_32(int16_t, tsResiC[MAX_TS_SIZE * MAX_TS_SIZE]);
2828
2829                     cu.setTransformSkipPartRange(1, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
2830
2831                     if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
2832                         m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
2833
2834                     fenc = const_cast<pixel*>(fencYuv->getChromaAddr(chromaId, absPartIdxC));
2835                     resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
2836                     uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, tsCoeffC, log2TrSizeC, (TextType)chromaId, absPartIdxC, true);
2837
2838                     m_entropyCoder.resetBits();
2839                     singleBitsComp[chromaId][tuIterator.section] = 0;
2840
2841                     if (numSigTSkipC)
2842                     {
2843                         m_entropyCoder.codeQtCbf(!!numSigTSkipC, (TextType)chromaId, tuDepth);
2844                         m_entropyCoder.codeCoeffNxN(cu, tsCoeffC, absPartIdxC, log2TrSizeC, (TextType)chromaId);
2845                         singleBitsComp[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
2846
2847                         m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], tsResiC, trSizeC, tsCoeffC,
2848                                                 log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
2849                         uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC);
2850                         nonZeroDistC = m_rdCost.scaleChromaDistCb(dist);
2851                         if (m_rdCost.m_psyRd)
2852                         {
2853                             nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC);
2854                             singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section], nonZeroPsyEnergyC);
2855                         }
2856                         else
2857                             singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section]);
2858                     }
2859
2860                     if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC)
2861                         cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
2862                     else
2863                     {
2864                         singleDistComp[chromaId][tuIterator.section] = nonZeroDistC;
2865                         singlePsyEnergyComp[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
2866                         cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC;
2867                         bestTransformMode[chromaId][tuIterator.section] = 1;
2868                         memcpy(coeffCurC + subTUOffset, tsCoeffC, sizeof(coeff_t) * numCoeffC);
2869                         primitives.square_copy_ss[partSizeC](curResiC, strideResiC, tsResiC, trSizeC);
2870                     }
2871
2872                     cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
2873                 }
2874                 while (tuIterator.isNextSection());
2875             }
2876         }
2877
2878         m_entropyCoder.load(m_rqt[depth].rqtRoot);
2879
2880         m_entropyCoder.resetBits();
2881
2882         if (log2TrSize > depthRange[0])
2883             m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
2884
2885         if (bCodeChroma)
2886         {
2887             for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
2888             {
2889                 if (!splitIntoSubTUs)
2890                     m_entropyCoder.codeQtCbf(cbfFlag[chromaId][0], (TextType)chromaId, tuDepth);
2891                 else
2892                 {
2893                     offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx);
2894                     for (uint32_t subTU = 0; subTU < 2; subTU++)
2895                         m_entropyCoder.codeQtCbf(cbfFlag[chromaId][subTU], (TextType)chromaId, tuDepth);
2896                 }
2897             }
2898         }
2899
2900         m_entropyCoder.codeQtCbf(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
2901         if (cbfFlag[TEXT_LUMA][0])
2902             m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
2903
2904         if (bCodeChroma)
2905         {
2906             uint32_t subTUSize = 1 << (log2TrSizeC * 2);
2907             uint32_t partIdxesPerSubTU = absPartIdxStep >> 1;
2908             uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
2909
2910             for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
2911             {
2912                 coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
2913                 if (!splitIntoSubTUs)
2914                 {
2915                     if (cbfFlag[chromaId][0])
2916                         m_entropyCoder.codeCoeffNxN(cu, coeffCurC, absPartIdx, log2TrSizeC, (TextType)chromaId);
2917                 }
2918                 else
2919                 {
2920                     for (uint32_t subTU = 0; subTU < 2; subTU++)
2921                     {
2922                         if (cbfFlag[chromaId][subTU])
2923                             m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTU * subTUSize, absPartIdx + subTU * partIdxesPerSubTU, log2TrSizeC, (TextType)chromaId);
2924                     }
2925                 }
2926             }
2927         }
2928
2929         fullCost.distortion += singleDistComp[TEXT_LUMA][0];
2930         fullCost.energy += singlePsyEnergyComp[TEXT_LUMA][0];// need to check we need to add chroma also
2931         for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
2932         {
2933             fullCost.distortion += singleDistComp[TEXT_CHROMA_U][subTUIndex];
2934             fullCost.distortion += singleDistComp[TEXT_CHROMA_V][subTUIndex];
2935         }
2936
2937         fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
2938         if (m_rdCost.m_psyRd)
2939             fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
2940         else
2941             fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
2942     }
2943
2944     // code sub-blocks
2945     if (bCheckSplit)
2946     {
2947         if (bCheckFull)
2948         {
2949             m_entropyCoder.store(m_rqt[depth].rqtTest);
2950             m_entropyCoder.load(m_rqt[depth].rqtRoot);
2951         }
2952
2953         Cost splitCost;
2954         const uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
2955         uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
2956         for (uint32_t i = 0; i < 4; ++i)
2957         {
2958             estimateResidualQT(mode, cuGeom, absPartIdx + i * qPartNumSubdiv, depth + 1, resiYuv, splitCost, depthRange);
2959             ycbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_LUMA,     tuDepth + 1);
2960             ucbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_U, tuDepth + 1);
2961             vcbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_V, tuDepth + 1);
2962         }
2963         for (uint32_t i = 0; i < 4 * qPartNumSubdiv; ++i)
2964         {
2965             cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth;
2966             cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
2967             cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
2968         }
2969
2970         m_entropyCoder.load(m_rqt[depth].rqtRoot);
2971         m_entropyCoder.resetBits();
2972
2973         encodeResidualQT(cu, absPartIdx, depth, true,  TEXT_LUMA, depthRange);
2974         encodeResidualQT(cu, absPartIdx, depth, false, TEXT_LUMA, depthRange);
2975         encodeResidualQT(cu, absPartIdx, depth, false, TEXT_CHROMA_U, depthRange);
2976         encodeResidualQT(cu, absPartIdx, depth, false, TEXT_CHROMA_V, depthRange);
2977
2978         splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
2979
2980         if (m_rdCost.m_psyRd)
2981             splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
2982         else
2983             splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
2984
2985         if (ycbf || ucbf || vcbf || !bCheckFull)
2986         {
2987             if (splitCost.rdcost < fullCost.rdcost)
2988             {
2989                 outCosts.distortion += splitCost.distortion;
2990                 outCosts.rdcost     += splitCost.rdcost;
2991                 outCosts.bits       += splitCost.bits;
2992                 outCosts.energy     += splitCost.energy;
2993                 return;
2994             }
2995             else
2996                 outCosts.energy     += splitCost.energy;
2997         }
2998
2999         cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth);
3000         if (bCodeChroma)
3001         {
3002             const uint32_t numberOfSections = splitIntoSubTUs ? 2 : 1;
3003
3004             uint32_t partIdxesPerSubTU = absPartIdxStep >> (splitIntoSubTUs ? 1 : 0);
3005             for (uint32_t subTUIndex = 0; subTUIndex < numberOfSections; subTUIndex++)
3006             {
3007                 const uint32_t  subTUPartIdx = absPartIdx + (subTUIndex * partIdxesPerSubTU);
3008
3009                 cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][subTUIndex], TEXT_CHROMA_U, subTUPartIdx, partIdxesPerSubTU);
3010                 cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][subTUIndex], TEXT_CHROMA_V, subTUPartIdx, partIdxesPerSubTU);
3011             }
3012         }
3013         X265_CHECK(bCheckFull, "check-full must be set\n");
3014         m_entropyCoder.load(m_rqt[depth].rqtTest);
3015     }
3016
3017     cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
3018     cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
3019
3020     if (bCodeChroma)
3021     {
3022         uint32_t numberOfSections = splitIntoSubTUs ? 2 : 1;
3023         uint32_t partIdxesPerSubTU = absPartIdxStep >> (splitIntoSubTUs ? 1 : 0);
3024
3025         for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
3026         {
3027             for (uint32_t subTUIndex = 0; subTUIndex < numberOfSections; subTUIndex++)
3028             {
3029                 const uint32_t  subTUPartIdx = absPartIdx + (subTUIndex * partIdxesPerSubTU);
3030
3031                 if (splitIntoSubTUs)
3032                 {
3033                     uint8_t combinedSubTUCBF = cbfFlag[chromaId][0] | cbfFlag[chromaId][1];
3034                     cu.setCbfPartRange(((cbfFlag[chromaId][subTUIndex] << 1) | combinedSubTUCBF) << tuDepth, (TextType)chromaId, subTUPartIdx, partIdxesPerSubTU);
3035                 }
3036                 else
3037                     cu.setCbfPartRange(cbfFlag[chromaId][subTUIndex] << tuDepth, (TextType)chromaId, subTUPartIdx, partIdxesPerSubTU);
3038             }
3039         }
3040     }
3041
3042     outCosts.distortion += fullCost.distortion;
3043     outCosts.rdcost     += fullCost.rdcost;
3044     outCosts.bits       += fullCost.bits;
3045     outCosts.energy     += fullCost.energy;
3046 }
3047
3048 void Search::encodeResidualQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, bool bSubdivAndCbf, TextType ttype, uint32_t depthRange[2])
3049 {
3050     X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
3051     X265_CHECK(cu.m_predMode[absPartIdx] != MODE_INTRA, "encodeResidualQT() with intra block\n");
3052
3053     const uint32_t curTuDepth  = depth - cu.m_cuDepth[0];
3054     const uint32_t tuDepth     = cu.m_tuDepth[absPartIdx];
3055     const bool     bSubdiv     = curTuDepth != tuDepth;
3056     const uint32_t log2TrSize  = g_maxLog2CUSize - depth;
3057
3058     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
3059
3060     const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
3061
3062     if (bSubdivAndCbf && log2TrSize <= depthRange[1] && log2TrSize > depthRange[0])
3063         m_entropyCoder.codeTransformSubdivFlag(bSubdiv, 5 - log2TrSize);
3064
3065     bool mCodeAll = true;
3066     uint32_t trWidthC  = 1 << log2TrSizeC;
3067     uint32_t trHeightC = splitIntoSubTUs ? (trWidthC << 1) : trWidthC;
3068
3069     const uint32_t numPels = trWidthC * trHeightC;
3070     if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE))
3071         mCodeAll = false;
3072
3073     if (bSubdivAndCbf)
3074     {
3075         const bool bFirstCbfOfCU = curTuDepth == 0;
3076         if (bFirstCbfOfCU || mCodeAll)
3077         {
3078             uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] +  curTuDepth) << 1);
3079             if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth - 1))
3080                 m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_U, curTuDepth, !bSubdiv);
3081             if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth - 1))
3082                 m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_V, curTuDepth, !bSubdiv);
3083         }
3084         else
3085         {
3086             X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth - 1), "chroma CBF not matching\n");
3087             X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth - 1), "chroma CBF not matching\n");
3088         }
3089     }
3090
3091     if (!bSubdiv)
3092     {
3093         // Luma
3094         const uint32_t qtLayer = log2TrSize - 2;
3095         uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
3096         coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
3097
3098         // Chroma
3099         bool bCodeChroma = true;
3100         uint32_t tuDepthC = tuDepth;
3101         if ((log2TrSize == 2) && !(m_csp == X265_CSP_I444))
3102         {
3103             log2TrSizeC++;
3104             tuDepthC--;
3105             uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
3106             bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
3107         }
3108
3109         if (bSubdivAndCbf)
3110             m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, tuDepth);
3111         else
3112         {
3113             if (ttype == TEXT_LUMA && cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
3114                 m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
3115
3116             if (bCodeChroma)
3117             {
3118                 uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
3119                 coeff_t* coeffCurU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
3120                 coeff_t* coeffCurV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
3121
3122                 if (!splitIntoSubTUs)
3123                 {
3124                     if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
3125                         m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
3126                     if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
3127                         m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
3128                 }
3129                 else
3130                 {
3131                     uint32_t partIdxesPerSubTU  = NUM_CU_PARTITIONS >> (((cu.m_cuDepth[absPartIdx] + tuDepthC) << 1) + 1);
3132                     uint32_t subTUSize = 1 << (log2TrSizeC * 2);
3133                     if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
3134                     {
3135                         if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
3136                             m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
3137                         if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, tuDepth + 1))
3138                             m_entropyCoder.codeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_U);
3139                     }
3140                     if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
3141                     {
3142                         if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
3143                             m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
3144                         if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, tuDepth + 1))
3145                             m_entropyCoder.codeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_V);
3146                     }
3147                 }
3148             }
3149         }
3150     }
3151     else
3152     {
3153         if (bSubdivAndCbf || cu.getCbf(absPartIdx, ttype, curTuDepth))
3154         {
3155             const uint32_t qpartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
3156             for (uint32_t i = 0; i < 4; ++i)
3157                 encodeResidualQT(cu, absPartIdx + i * qpartNumSubdiv, depth + 1, bSubdivAndCbf, ttype, depthRange);
3158         }
3159     }
3160 }
3161
3162 void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t depth)
3163 {
3164     X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
3165     const uint32_t curTrMode = depth - cu.m_cuDepth[0];
3166     const uint32_t tuDepth   = cu.m_tuDepth[absPartIdx];
3167
3168     if (curTrMode < tuDepth)
3169     {
3170         uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
3171         for (uint32_t i = 0; i < 4; i++, absPartIdx += qPartNumSubdiv)
3172             saveResidualQTData(cu, resiYuv, absPartIdx, depth + 1);
3173         return;
3174     }
3175
3176     const uint32_t log2TrSize = g_maxLog2CUSize - depth;
3177     const uint32_t qtLayer = log2TrSize - 2;
3178
3179     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
3180     bool bCodeChroma = true;
3181     uint32_t tuDepthC = tuDepth;
3182     if (log2TrSizeC == 1)
3183     {
3184         X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444, "tuQuad check failed\n");
3185         log2TrSizeC++;
3186         tuDepthC--;
3187         uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
3188         bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
3189     }
3190
3191     m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize);
3192
3193     uint32_t numCoeffY = 1 << (log2TrSize * 2);
3194     uint32_t coeffOffsetY = absPartIdx << LOG2_UNIT_SIZE * 2;
3195     coeff_t* coeffSrcY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
3196     coeff_t* coeffDstY = cu.m_trCoeff[0] + coeffOffsetY;
3197     memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY);
3198
3199     if (bCodeChroma)
3200     {
3201         m_rqt[qtLayer].resiQtYuv.copyPartToPartChroma(resiYuv, absPartIdx, log2TrSizeC + m_hChromaShift);
3202
3203         uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
3204         uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
3205
3206         coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
3207         coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
3208         coeff_t* coeffDstU = cu.m_trCoeff[1] + coeffOffsetC;
3209         coeff_t* coeffDstV = cu.m_trCoeff[2] + coeffOffsetC;
3210         memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
3211         memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);
3212     }
3213 }
3214
3215 /* returns the number of bits required to signal a non-most-probable mode.
3216  * on return mpms contains bitmap of most probable modes */
3217 uint32_t Search::getIntraRemModeBits(CUData& cu, uint32_t absPartIdx, uint32_t preds[3], uint64_t& mpms) const
3218 {
3219     cu.getIntraDirLumaPredictor(absPartIdx, preds);
3220
3221     mpms = 0;
3222     for (int i = 0; i < 3; ++i)
3223         mpms |= ((uint64_t)1 << preds[i]);
3224
3225     return m_entropyCoder.bitsIntraModeNonMPM();
3226 }
3227
3228 /* swap the current mode/cost with the mode with the highest cost in the
3229  * current candidate list, if its cost is better (maintain a top N list) */
3230 void Search::updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList)
3231 {
3232     uint32_t maxIndex = 0;
3233     uint64_t maxValue = 0;
3234
3235     for (int i = 0; i < maxCandCount; i++)
3236     {
3237         if (maxValue < candCostList[i])
3238         {
3239             maxValue = candCostList[i];
3240             maxIndex = i;
3241         }
3242     }
3243
3244     if (cost < maxValue)
3245     {
3246         candCostList[maxIndex] = cost;
3247         candModeList[maxIndex] = mode;
3248     }
3249 }