#pragma warning(disable: 4244) // '=' : conversion from 'int' to 'uint8_t', possible loss of data)
#endif
+#define MVP_IDX_BITS 1
+
ALIGN_VAR_32(const pixel, Search::zeroPixel[MAX_CU_SIZE]) = { 0 };
ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 };
m_numLayers = g_log2Size[param.maxCUSize] - 2;
m_rdCost.setPsyRdScale(param.psyRd);
- m_me.setSearchMethod(param.searchMethod);
- m_me.setSubpelRefine(param.subpelRefine);
+ m_me.init(param.searchMethod, param.subpelRefine, param.internalCsp);
bool ok = m_quant.init(m_bEnableRDOQ, param.psyRdoq, scalingList, m_entropyCoder);
- if (m_param->noiseReduction)
+ if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
ok &= m_quant.allocNoiseReduction(param);
ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */
void Search::invalidateContexts(int) {}
#endif
-void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height)
+void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx)
{
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
- uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
- uint32_t subdiv = tuDepthL > trDepth;
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
+ uint32_t subdiv = tuDepth < cu.m_tuDepth[absPartIdx];
uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- bool mCodeAll = true;
- const uint32_t numPels = 1 << (log2TrSize * 2 - m_hChromaShift - m_vChromaShift);
- if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE))
- mCodeAll = false;
-
- if (mCodeAll)
+ if (!(log2TrSize - m_hChromaShift < 2))
{
- if (!trDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepth - 1))
- m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, (width >> m_hChromaShift), (height >> m_vChromaShift), TEXT_CHROMA_U, trDepth, !subdiv);
-
- if (!trDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepth - 1))
- m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, (width >> m_hChromaShift), (height >> m_vChromaShift), TEXT_CHROMA_V, trDepth, !subdiv);
+ if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
+ m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv);
+ if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
+ m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv);
}
if (subdiv)
{
- absPartIdxStep >>= 2;
- width >>= 1;
- height >>= 1;
-
- uint32_t qtPartNum = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
- for (uint32_t part = 0; part < 4; part++)
- codeSubdivCbfQTChroma(cu, trDepth + 1, absPartIdx + part * qtPartNum, absPartIdxStep, width, height);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ codeSubdivCbfQTChroma(cu, tuDepth + 1, absPartIdx);
}
}
-void Search::codeCoeffQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, TextType ttype)
+void Search::codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype)
{
- if (!cu.getCbf(absPartIdx, ttype, trDepth))
+ if (!cu.getCbf(absPartIdx, ttype, tuDepth))
return;
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
- uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
+ uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- if (tuDepthL > trDepth)
+ if (tuDepth < cu.m_tuDepth[absPartIdx])
{
- uint32_t qtPartNum = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
- for (uint32_t part = 0; part < 4; part++)
- codeCoeffQTChroma(cu, trDepth + 1, absPartIdx + part * qtPartNum, ttype);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ codeCoeffQTChroma(cu, tuDepth + 1, absPartIdx, ttype);
return;
}
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
-
- uint32_t trDepthC = trDepth;
+ uint32_t tuDepthC = tuDepth;
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-
- if (log2TrSizeC == 1)
- {
- X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth, "transform size too small\n");
- trDepthC--;
- log2TrSizeC++;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
- bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
- if (!bFirstQ)
+
+ if (log2TrSizeC < 2)
+ {
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ if (absPartIdx & 3)
return;
+ log2TrSizeC = 2;
+ tuDepthC--;
}
uint32_t qtLayer = log2TrSize - 2;
uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - 1);
coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
uint32_t subTUSize = 1 << (log2TrSizeC * 2);
- uint32_t partIdxesPerSubTU = NUM_CU_PARTITIONS >> (((cu.m_cuDepth[absPartIdx] + trDepthC) << 1) + 1);
- if (cu.getCbf(absPartIdx, ttype, trDepth + 1))
+ uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);
+ if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
- if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, trDepth + 1))
- m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, ttype);
+ if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
+ m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, ttype);
}
}
-void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, uint32_t depthRange[2])
+void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2])
{
- uint32_t fullDepth = mode.cu.m_cuDepth[0] + trDepth;
+ uint32_t fullDepth = mode.cu.m_cuDepth[0] + tuDepth;
uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
uint32_t qtLayer = log2TrSize - 2;
uint32_t sizeIdx = log2TrSize - 2;
if (mightSplit)
m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
- pixel* fenc = const_cast<pixel*>(mode.fencYuv->getLumaAddr(absPartIdx));
+ const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
uint32_t stride = mode.fencYuv->m_size;
// init availability pattern
uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
- initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode);
+ initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);
// get prediction signal
predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
- cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
+ cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
}
else
// no coded residual, recon = pred
- primitives.square_copy_pp[sizeIdx](reconQt, reconQtStride, pred, stride);
+ primitives.luma_copy_pp[sizeIdx](reconQt, reconQtStride, pred, stride);
- bCBF = !!numSig << trDepth;
+ bCBF = !!numSig << tuDepth;
cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
fullCost.distortion = primitives.sse_pp[sizeIdx](reconQt, reconQtStride, fenc, stride);
}
else
{
- uint32_t qtNumParts = cuGeom.numPartitions >> 2;
- if (!trDepth)
+ uint32_t qNumParts = cuGeom.numPartitions >> 2;
+ if (!tuDepth)
{
- for (uint32_t part = 0; part < 4; part++)
- m_entropyCoder.codeIntraDirLumaAng(cu, part * qtNumParts, false);
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
+ m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
}
- else if (!(absPartIdx & (qtNumParts - 1)))
+ else if (!(absPartIdx & (qNumParts - 1)))
m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
}
if (log2TrSize != depthRange[0])
m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
- m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]);
+ m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);
- if (cu.getCbf(absPartIdx, TEXT_LUMA, trDepth))
+ if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
m_entropyCoder.codeCoeffNxN(cu, coeffY, absPartIdx, log2TrSize, TEXT_LUMA);
fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
}
// code split block
- uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
- uint32_t absPartIdxSub = absPartIdx;
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
if (m_param->bEnableTSkipFast)
- checkTransformSkip &= cu.m_partSize[absPartIdx] == SIZE_NxN;
+ checkTransformSkip &= cu.m_partSize[0] != SIZE_2Nx2N;
Cost splitCost;
uint32_t cbf = 0;
- for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv)
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
if (checkTransformSkip)
- codeIntraLumaTSkip(mode, cuGeom, trDepth + 1, absPartIdxSub, splitCost);
+ codeIntraLumaTSkip(mode, cuGeom, tuDepth + 1, qPartIdx, splitCost);
else
- codeIntraLumaQT(mode, cuGeom, trDepth + 1, absPartIdxSub, bAllowSplit, splitCost, depthRange);
+ codeIntraLumaQT(mode, cuGeom, tuDepth + 1, qPartIdx, bAllowSplit, splitCost, depthRange);
- cbf |= cu.getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1);
+ cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
}
- for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
- cu.m_cbf[0][absPartIdx + offs] |= (cbf << trDepth);
+ for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
+ cu.m_cbf[0][absPartIdx + offs] |= (cbf << tuDepth);
if (mightNotSplit && log2TrSize != depthRange[0])
{
m_entropyCoder.load(m_rqt[fullDepth].rqtTest);
// recover transform index and Cbf values
- cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
+ cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
}
}
// set reconstruction for next intra prediction blocks if full TU prediction won
- pixel* picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
- intptr_t picStride = m_frame->m_reconPicYuv->m_stride;
- primitives.square_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);
+ pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+ intptr_t picStride = m_frame->m_reconPic->m_stride;
+ primitives.luma_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);
outCost.rdcost += fullCost.rdcost;
outCost.distortion += fullCost.distortion;
outCost.energy += fullCost.energy;
}
-void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, Cost& outCost)
+void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
{
- uint32_t fullDepth = mode.cu.m_cuDepth[0] + trDepth;
+ uint32_t fullDepth = mode.cu.m_cuDepth[0] + tuDepth;
uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
uint32_t tuSize = 1 << log2TrSize;
int bTSkip = 0;
uint32_t bCBF = 0;
- pixel* fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
+ const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
pixel* pred = predYuv->getLumaAddr(absPartIdx);
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
uint32_t stride = fencYuv->m_size;
// init availability pattern
uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
- initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode);
+ initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);
// get prediction signal
predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
- cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
+ cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
uint32_t qtLayer = log2TrSize - 2;
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
}
else
// no residual coded, recon = pred
- primitives.square_copy_pp[sizeIdx](tmpRecon, tmpReconStride, pred, stride);
+ primitives.luma_copy_pp[sizeIdx](tmpRecon, tmpReconStride, pred, stride);
uint32_t tmpDist = primitives.sse_pp[sizeIdx](tmpRecon, tmpReconStride, fenc, stride);
cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
- cu.setCbfSubParts((!!numSig) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
+ cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
if (useTSkip)
m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
}
else
{
- uint32_t qtNumParts = cuGeom.numPartitions >> 2;
- if (!trDepth)
+ uint32_t qNumParts = cuGeom.numPartitions >> 2;
+ if (!tuDepth)
{
- for (uint32_t part = 0; part < 4; part++)
- m_entropyCoder.codeIntraDirLumaAng(cu, part * qtNumParts, false);
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
+ m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
}
- else if (!(absPartIdx & (qtNumParts - 1)))
+ else if (!(absPartIdx & (qNumParts - 1)))
m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
}
m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
- m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]);
+ m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);
- if (cu.getCbf(absPartIdx, TEXT_LUMA, trDepth))
+ if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize, TEXT_LUMA);
uint32_t tmpBits = m_entropyCoder.getNumberOfWrittenBits();
if (bTSkip)
{
memcpy(coeffY, tsCoeffY, sizeof(coeff_t) << (log2TrSize * 2));
- primitives.square_copy_pp[sizeIdx](reconQt, reconQtStride, tsReconY, tuSize);
+ primitives.luma_copy_pp[sizeIdx](reconQt, reconQtStride, tsReconY, tuSize);
}
else if (checkTransformSkip)
{
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
- cu.setCbfSubParts(bCBF << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
+ cu.setCbfSubParts(bCBF << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
m_entropyCoder.load(m_rqt[fullDepth].rqtTemp);
}
// set reconstruction for next intra prediction blocks
- pixel* picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
- intptr_t picStride = m_frame->m_reconPicYuv->m_stride;
- primitives.square_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);
+ pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+ intptr_t picStride = m_frame->m_reconPic->m_stride;
+ primitives.luma_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);
outCost.rdcost += fullCost.rdcost;
outCost.distortion += fullCost.distortion;
}
/* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */
-void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t depthRange[2])
+void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, const uint32_t depthRange[2])
{
CUData& cu = mode.cu;
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
bool bCheckFull = log2TrSize <= depthRange[1];
if (bCheckFull)
{
- pixel* fenc = const_cast<pixel*>(mode.fencYuv->getLumaAddr(absPartIdx));
+ const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
- pixel* picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
- intptr_t picStride = m_frame->m_reconPicYuv->m_stride;
+ pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+ intptr_t picStride = m_frame->m_reconPic->m_stride;
uint32_t stride = mode.fencYuv->m_size;
uint32_t sizeIdx = log2TrSize - 2;
uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
coeff_t* coeff = cu.m_trCoeff[TEXT_LUMA] + coeffOffsetY;
- initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode);
+ initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);
predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n");
- cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
+ cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, false);
{
m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], residual, stride, coeff, log2TrSize, TEXT_LUMA, true, false, numSig);
primitives.luma_add_ps[sizeIdx](picReconY, picStride, pred, residual, stride, stride);
- cu.setCbfSubParts(1 << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
+ cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
}
else
{
- primitives.square_copy_pp[sizeIdx](picReconY, picStride, pred, stride);
+ primitives.luma_copy_pp[sizeIdx](picReconY, picStride, pred, stride);
cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
}
}
X265_CHECK(log2TrSize > depthRange[0], "intra luma split state failure\n");
/* code split block */
- uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t cbf = 0;
- for (uint32_t subPartIdx = 0, absPartIdxSub = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv)
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
- residualTransformQuantIntra(mode, cuGeom, trDepth + 1, absPartIdxSub, depthRange);
- cbf |= cu.getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1);
+ residualTransformQuantIntra(mode, cuGeom, tuDepth + 1, qPartIdx, depthRange);
+ cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
}
- for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
- cu.m_cbf[TEXT_LUMA][absPartIdx + offs] |= (cbf << trDepth);
+ for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
+ cu.m_cbf[TEXT_LUMA][absPartIdx + offs] |= (cbf << tuDepth);
}
}
-void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t trDepth, uint32_t absPartIdx)
+void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx)
{
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
- uint32_t tuDepth = cu.m_tuDepth[absPartIdx];
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
+ uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- if (tuDepth == trDepth)
+ if (tuDepth == cu.m_tuDepth[absPartIdx])
{
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
uint32_t qtLayer = log2TrSize - 2;
// copy transform coefficients
}
else
{
- uint32_t numQPart = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
- for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
- extractIntraResultQT(cu, reconYuv, trDepth + 1, absPartIdx + subPartIdx * numQPart);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ extractIntraResultQT(cu, reconYuv, tuDepth + 1, absPartIdx);
}
}
+inline void offsetCBFs(uint8_t subTUCBF[2])
+{
+ uint8_t combinedCBF = subTUCBF[0] | subTUCBF[1];
+ subTUCBF[0] = subTUCBF[0] << 1 | combinedCBF;
+ subTUCBF[1] = subTUCBF[1] << 1 | combinedCBF;
+}
+
/* 4:2:2 post-TU split processing */
-void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t trDepth, uint32_t absPartIdx)
+void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx)
{
uint32_t depth = cu.m_cuDepth[0];
- uint32_t fullDepth = depth + trDepth;
+ uint32_t fullDepth = depth + tuDepth;
uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- uint32_t trDepthC = trDepth;
if (log2TrSize == 2)
{
- X265_CHECK(m_csp != X265_CSP_I444 && trDepthC, "trDepthC invalid\n");
- trDepthC--;
+ X265_CHECK(m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ ++log2TrSize;
}
- uint32_t partIdxesPerSubTU = (NUM_CU_PARTITIONS >> ((depth + trDepthC) << 1)) >> 1;
+ uint32_t tuNumParts = 1 << ((log2TrSize - LOG2_UNIT_SIZE) * 2 - 1);
// move the CBFs down a level and set the parent CBF
uint8_t subTUCBF[2];
- uint8_t combinedSubTUCBF = 0;
-
- for (uint32_t subTU = 0; subTU < 2; subTU++)
- {
- const uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU);
+ subTUCBF[0] = cu.getCbf(absPartIdx , ttype, tuDepth);
+ subTUCBF[1] = cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth);
+ offsetCBFs(subTUCBF);
- subTUCBF[subTU] = cu.getCbf(subTUAbsPartIdx, ttype, trDepth);
- combinedSubTUCBF |= subTUCBF[subTU];
- }
-
- for (uint32_t subTU = 0; subTU < 2; subTU++)
- {
- const uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU);
- const uint8_t compositeCBF = (subTUCBF[subTU] << 1) | combinedSubTUCBF;
-
- cu.setCbfPartRange((compositeCBF << trDepth), ttype, subTUAbsPartIdx, partIdxesPerSubTU);
- }
+ cu.setCbfPartRange(subTUCBF[0] << tuDepth, ttype, absPartIdx , tuNumParts);
+ cu.setCbfPartRange(subTUCBF[1] << tuDepth, ttype, absPartIdx + tuNumParts, tuNumParts);
}
/* returns distortion */
-uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t& psyEnergy)
+uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy)
{
CUData& cu = mode.cu;
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
- uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
+ uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- if (tuDepthL > trDepth)
+ if (tuDepth < cu.m_tuDepth[absPartIdx])
{
- uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t outDist = 0, splitCbfU = 0, splitCbfV = 0;
- for (uint32_t subPartIdx = 0, absPartIdxSub = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv)
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
- outDist += codeIntraChromaQt(mode, cuGeom, trDepth + 1, absPartIdxSub, psyEnergy);
- splitCbfU |= cu.getCbf(absPartIdxSub, TEXT_CHROMA_U, trDepth + 1);
- splitCbfV |= cu.getCbf(absPartIdxSub, TEXT_CHROMA_V, trDepth + 1);
+ outDist += codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, psyEnergy);
+ splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+ splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
}
- for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
+ for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
{
- cu.m_cbf[TEXT_CHROMA_U][absPartIdx + offs] |= (splitCbfU << trDepth);
- cu.m_cbf[TEXT_CHROMA_V][absPartIdx + offs] |= (splitCbfV << trDepth);
+ cu.m_cbf[TEXT_CHROMA_U][absPartIdx + offs] |= (splitCbfU << tuDepth);
+ cu.m_cbf[TEXT_CHROMA_V][absPartIdx + offs] |= (splitCbfV << tuDepth);
}
return outDist;
}
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
- uint32_t trDepthC = trDepth;
- if (log2TrSizeC == 1)
+ uint32_t tuDepthC = tuDepth;
+ if (log2TrSizeC < 2)
{
- X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth, "invalid trDepth\n");
- trDepthC--;
- log2TrSizeC++;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
- bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
- if (!bFirstQ)
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ if (absPartIdx & 3)
return 0;
+ log2TrSizeC = 2;
+ tuDepthC--;
}
if (m_bEnableRDOQ)
bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]);
if (checkTransformSkip)
- return codeIntraChromaTSkip(mode, cuGeom, trDepth, trDepthC, absPartIdx, psyEnergy);
+ return codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, psyEnergy);
uint32_t qtLayer = log2TrSize - 2;
uint32_t tuSize = 1 << log2TrSizeC;
uint32_t outDist = 0;
- uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
+ uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
- pixel* fenc = const_cast<Yuv*>(mode.fencYuv)->getChromaAddr(chromaId, absPartIdxC);
+ const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t stride = mode.fencYuv->m_csize;
pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
- pixel* picReconC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
- intptr_t picStride = m_frame->m_reconPicYuv->m_strideC;
+ pixel* picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
+ intptr_t picStride = m_frame->m_reconPic->m_strideC;
// init availability pattern
- initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
+ initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
- uint32_t tmpDist;
if (numSig)
{
m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
primitives.luma_add_ps[sizeIdxC](reconQt, reconQtStride, pred, residual, stride, stride);
- cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+ cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
else
{
// no coded residual, recon = pred
- primitives.square_copy_pp[sizeIdxC](reconQt, reconQtStride, pred, stride);
+ primitives.luma_copy_pp[sizeIdxC](reconQt, reconQtStride, pred, stride);
cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
- tmpDist = primitives.sse_pp[sizeIdxC](reconQt, reconQtStride, fenc, stride);
- outDist += (ttype == TEXT_CHROMA_U) ? m_rdCost.scaleChromaDistCb(tmpDist) : m_rdCost.scaleChromaDistCr(tmpDist);
+ outDist += m_rdCost.scaleChromaDist(chromaId, primitives.sse_pp[sizeIdxC](reconQt, reconQtStride, fenc, stride));
if (m_rdCost.m_psyRd)
psyEnergy += m_rdCost.psyCost(sizeIdxC, fenc, stride, picReconC, picStride);
- primitives.square_copy_pp[sizeIdxC](picReconC, picStride, reconQt, reconQtStride);
+ primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, reconQt, reconQtStride);
}
while (tuIterator.isNextSection());
if (splitType == VERTICAL_SPLIT)
- offsetSubTUCBFs(cu, ttype, trDepth, absPartIdx);
+ offsetSubTUCBFs(cu, ttype, tuDepth, absPartIdx);
}
return outDist;
}
/* returns distortion */
-uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t trDepthC, uint32_t absPartIdx, uint32_t& psyEnergy)
+uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy)
{
CUData& cu = mode.cu;
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- uint32_t log2TrSizeC = 2;
+ const uint32_t log2TrSizeC = 2;
uint32_t tuSize = 4;
uint32_t qtLayer = log2TrSize - 2;
uint32_t outDist = 0;
ALIGN_VAR_32(coeff_t, tskipCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
ALIGN_VAR_32(pixel, tskipReconC[MAX_TS_SIZE * MAX_TS_SIZE]);
- uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
+ uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
- pixel* fenc = const_cast<Yuv*>(mode.fencYuv)->getChromaAddr(chromaId, absPartIdxC);
+ const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t stride = mode.fencYuv->m_csize;
- uint32_t sizeIdxC = log2TrSizeC - 2;
+ const uint32_t sizeIdxC = log2TrSizeC - 2;
uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
// init availability pattern
- initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
+ initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
{
m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
primitives.luma_add_ps[sizeIdxC](recon, reconStride, pred, residual, stride, stride);
- cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+ cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
else if (useTSkip)
{
}
else
{
- primitives.square_copy_pp[sizeIdxC](recon, reconStride, pred, stride);
+ primitives.luma_copy_pp[sizeIdxC](recon, reconStride, pred, stride);
cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
uint32_t tmpDist = primitives.sse_pp[sizeIdxC](recon, reconStride, fenc, stride);
- tmpDist = (ttype == TEXT_CHROMA_U) ? m_rdCost.scaleChromaDistCb(tmpDist) : m_rdCost.scaleChromaDistCr(tmpDist);
+ tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist);
cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
if (bTSkip)
{
memcpy(coeffC, tskipCoeffC, sizeof(coeff_t) << (log2TrSizeC * 2));
- primitives.square_copy_pp[sizeIdxC](reconQt, reconQtStride, tskipReconC, MAX_TS_SIZE);
+ primitives.luma_copy_pp[sizeIdxC](reconQt, reconQtStride, tskipReconC, MAX_TS_SIZE);
}
- cu.setCbfPartRange(bCbf << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+ cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
- pixel* reconPicC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
- intptr_t picStride = m_frame->m_reconPicYuv->m_strideC;
- primitives.square_copy_pp[sizeIdxC](reconPicC, picStride, reconQt, reconQtStride);
+ pixel* reconPicC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
+ intptr_t picStride = m_frame->m_reconPic->m_strideC;
+ primitives.luma_copy_pp[sizeIdxC](reconPicC, picStride, reconQt, reconQtStride);
outDist += bDist;
psyEnergy += bEnergy;
while (tuIterator.isNextSection());
if (splitType == VERTICAL_SPLIT)
- offsetSubTUCBFs(cu, ttype, trDepth, absPartIdx);
+ offsetSubTUCBFs(cu, ttype, tuDepth, absPartIdx);
}
m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
return outDist;
}
-void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t trDepth, bool tuQuad)
+void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth)
{
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
+ uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
+ uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
- if (tuDepthL == trDepth)
+ if (tuDepthL == tuDepth || log2TrSizeC == 2)
{
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-
- if (tuQuad)
- {
- log2TrSizeC++; /* extract one 4x4 instead of 4 2x2 */
- trDepth--; /* also adjust the number of coeff read */
- }
-
// copy transform coefficients
uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
uint32_t coeffOffsetC = absPartIdx << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
- uint32_t qtLayer = log2TrSize - 2;
+ uint32_t qtLayer = log2TrSize - 2 - (tuDepthL - tuDepth);
coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
coeff_t* coeffDstU = cu.m_trCoeff[1] + coeffOffsetC;
}
else
{
- if (g_maxLog2CUSize - fullDepth - 1 == 2 && m_csp != X265_CSP_I444)
- /* no such thing as chroma 2x2, so extract one 4x4 instead of 4 2x2 */
- extractIntraResultChromaQT(cu, reconYuv, absPartIdx, trDepth + 1, true);
- else
- {
- uint32_t numQPart = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
- for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
- extractIntraResultChromaQT(cu, reconYuv, absPartIdx + subPartIdx * numQPart, trDepth + 1, false);
- }
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ extractIntraResultChromaQT(cu, reconYuv, absPartIdx, tuDepth + 1);
}
}
-void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx)
+void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx)
{
CUData& cu = mode.cu;
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
- uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
+ uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- if (tuDepthL == trDepth)
+ if (tuDepth == cu.m_tuDepth[absPartIdx])
{
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
- uint32_t trDepthC = trDepth;
- if (log2TrSizeC == 1)
+ uint32_t tuDepthC = tuDepth;
+ if (log2TrSizeC < 2)
{
- X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth > 0, "invalid trDepth\n");
- trDepthC--;
- log2TrSizeC++;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
- bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
- if (!bFirstQ)
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ if (absPartIdx & 3)
return;
+ log2TrSizeC = 2;
+ tuDepthC--;
}
ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
uint32_t stride = mode.fencYuv->m_csize;
const int sizeIdxC = log2TrSizeC - 2;
- uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
+ uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
- pixel* fenc = const_cast<pixel*>(mode.fencYuv->getChromaAddr(chromaId, absPartIdxC));
+ const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
pixel* recon = mode.reconYuv.getChromaAddr(chromaId, absPartIdxC); // TODO: needed?
uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
coeff_t* coeff = cu.m_trCoeff[ttype] + coeffOffsetC;
- pixel* picReconC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
- uint32_t picStride = m_frame->m_reconPicYuv->m_strideC;
+ pixel* picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
+ uint32_t picStride = m_frame->m_reconPic->m_strideC;
uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
if (chromaPredMode == DM_CHROMA_IDX)
chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
chromaPredMode = (m_csp == X265_CSP_I422) ? g_chroma422IntraAngleMappingTable[chromaPredMode] : chromaPredMode;
- initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
+ initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
{
m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], residual, stride, coeff, log2TrSizeC, ttype, true, false, numSig);
primitives.luma_add_ps[sizeIdxC](recon, stride, pred, residual, stride, stride);
- primitives.square_copy_pp[sizeIdxC](picReconC, picStride, recon, stride);
- cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+ primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, recon, stride);
+ cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
else
{
- primitives.square_copy_pp[sizeIdxC](recon, stride, pred, stride);
- primitives.square_copy_pp[sizeIdxC](picReconC, picStride, pred, stride);
+ primitives.luma_copy_pp[sizeIdxC](recon, stride, pred, stride);
+ primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, pred, stride);
cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
}
while (tuIterator.isNextSection());
if (splitType == VERTICAL_SPLIT)
- offsetSubTUCBFs(cu, (TextType)chromaId, trDepth, absPartIdx);
+ offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx);
}
}
else
{
- uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t splitCbfU = 0, splitCbfV = 0;
- for (uint32_t subPartIdx = 0, absPartIdxC = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxC += qPartsDiv)
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
- residualQTIntraChroma(mode, cuGeom, trDepth + 1, absPartIdxC);
- splitCbfU |= cu.getCbf(absPartIdxC, TEXT_CHROMA_U, trDepth + 1);
- splitCbfV |= cu.getCbf(absPartIdxC, TEXT_CHROMA_V, trDepth + 1);
+ residualQTIntraChroma(mode, cuGeom, tuDepth + 1, qPartIdx);
+ splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+ splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
}
- for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
+ for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
{
- cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << trDepth);
- cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << trDepth);
+ cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
+ cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
}
}
}
intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits();
bool bCodeDQP = m_slice->m_pps->bUseDQP;
- m_entropyCoder.codeCoeff(cu, 0, depth, bCodeDQP, tuDepthRange);
+ m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
m_entropyCoder.store(intraMode.contexts);
intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
updateModeCost(intraMode);
}
-uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t depthRange[2], uint8_t* sharedModes)
+/* Note that this function does not save the best intra prediction, it must
+ * be generated later. It records the best mode in the cu */
+void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
+{
+ CUData& cu = intraMode.cu;
+ uint32_t depth = cu.m_cuDepth[0];
+
+ cu.setPartSizeSubParts(SIZE_2Nx2N);
+ cu.setPredModeSubParts(MODE_INTRA);
+
+ const uint32_t initTuDepth = 0;
+ uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth;
+ uint32_t tuSize = 1 << log2TrSize;
+ const uint32_t absPartIdx = 0;
+
+ // Reference sample smoothing
+ initAdiPattern(cu, cuGeom, absPartIdx, initTuDepth, ALL_IDX);
+
+ const pixel* fenc = intraMode.fencYuv->m_buf[0];
+ uint32_t stride = intraMode.fencYuv->m_size;
+
+ pixel* above = m_refAbove + tuSize - 1;
+ pixel* aboveFiltered = m_refAboveFlt + tuSize - 1;
+ pixel* left = m_refLeft + tuSize - 1;
+ pixel* leftFiltered = m_refLeftFlt + tuSize - 1;
+ int sad, bsad;
+ uint32_t bits, bbits, mode, bmode;
+ uint64_t cost, bcost;
+
+ // 33 Angle modes once
+ ALIGN_VAR_32(pixel, bufScale[32 * 32]);
+ ALIGN_VAR_32(pixel, bufTrans[32 * 32]);
+ ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]);
+ int scaleTuSize = tuSize;
+ int scaleStride = stride;
+ int costShift = 0;
+ int sizeIdx = log2TrSize - 2;
+
+ if (tuSize > 32)
+ {
+ // origin is 64x64, we scale to 32x32 and setup required parameters
+ primitives.scale2D_64to32(bufScale, fenc, stride);
+ fenc = bufScale;
+
+ // reserve space in case primitives need to store data in above
+ // or left buffers
+ pixel _above[4 * 32 + 1];
+ pixel _left[4 * 32 + 1];
+ pixel* aboveScale = _above + 2 * 32;
+ pixel* leftScale = _left + 2 * 32;
+ aboveScale[0] = leftScale[0] = above[0];
+ primitives.scale1D_128to64(aboveScale + 1, above + 1, 0);
+ primitives.scale1D_128to64(leftScale + 1, left + 1, 0);
+
+ scaleTuSize = 32;
+ scaleStride = 32;
+ costShift = 2;
+ sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
+
+ // Filtered and Unfiltered refAbove and refLeft pointing to above and left.
+ above = aboveScale;
+ left = leftScale;
+ aboveFiltered = aboveScale;
+ leftFiltered = leftScale;
+ }
+
+ pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
+ int predsize = scaleTuSize * scaleTuSize;
+
+ m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
+
+ /* there are three cost tiers for intra modes:
+ * pred[0] - mode probable, least cost
+ * pred[1], pred[2] - less probable, slightly more cost
+ * non-mpm modes - all cost the same (rbits) */
+ uint64_t mpms;
+ uint32_t preds[3];
+ uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms);
+
+ // DC
+ primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
+ bsad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
+ bmode = mode = DC_IDX;
+ bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
+ bcost = m_rdCost.calcRdSADCost(bsad, bbits);
+
+ pixel* abovePlanar = above;
+ pixel* leftPlanar = left;
+
+ if (tuSize & (8 | 16 | 32))
+ {
+ abovePlanar = aboveFiltered;
+ leftPlanar = leftFiltered;
+ }
+
+ // PLANAR
+ primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
+ sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
+ mode = PLANAR_IDX;
+ bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
+ cost = m_rdCost.calcRdSADCost(sad, bits);
+ COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
+
+ // Transpose NxN
+ primitives.transpose[sizeIdx](bufTrans, fenc, scaleStride);
+
+ primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
+
+ bool modeHor;
+ const pixel* cmp;
+ intptr_t srcStride;
+
+#define TRY_ANGLE(angle) \
+ modeHor = angle < 18; \
+ cmp = modeHor ? bufTrans : fenc; \
+ srcStride = modeHor ? scaleTuSize : scaleStride; \
+ sad = sa8d(cmp, srcStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \
+ bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
+ cost = m_rdCost.calcRdSADCost(sad, bits)
+
+ if (m_param->bEnableFastIntra)
+ {
+ int asad = 0;
+ uint32_t lowmode, highmode, amode = 5, abits = 0;
+ uint64_t acost = MAX_INT64;
+
+ /* pick the best angle, sampling at distance of 5 */
+ for (mode = 5; mode < 35; mode += 5)
+ {
+ TRY_ANGLE(mode);
+ COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
+ }
+
+ /* refine best angle at distance 2, then distance 1 */
+ for (uint32_t dist = 2; dist >= 1; dist--)
+ {
+ lowmode = amode - dist;
+ highmode = amode + dist;
+
+ X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n");
+ TRY_ANGLE(lowmode);
+ COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits);
+
+ X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n");
+ TRY_ANGLE(highmode);
+ COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits);
+ }
+
+ if (amode == 33)
+ {
+ TRY_ANGLE(34);
+ COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits);
+ }
+
+ COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
+ }
+ else // calculate and search all intra prediction angles for lowest cost
+ {
+ for (mode = 2; mode < 35; mode++)
+ {
+ TRY_ANGLE(mode);
+ COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
+ }
+ }
+
+ cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTuDepth);
+ intraMode.initCosts();
+ intraMode.totalBits = bbits;
+ intraMode.distortion = bsad;
+ intraMode.sa8dCost = bcost;
+ intraMode.sa8dBits = bbits;
+}
+
+void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
+{
+ CUData& cu = intraMode.cu;
+ Yuv* reconYuv = &intraMode.reconYuv;
+ const Yuv* fencYuv = intraMode.fencYuv;
+
+ X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
+ X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");
+
+ m_quant.setQPforQuant(cu);
+
+ uint32_t tuDepthRange[2];
+ cu.getIntraTUQtDepthRange(tuDepthRange, 0);
+
+ m_entropyCoder.load(m_rqt[cuGeom.depth].cur);
+
+ Cost icosts;
+ codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange);
+ extractIntraResultQT(cu, *reconYuv, 0, 0);
+
+ intraMode.distortion = icosts.distortion;
+ intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom);
+
+ m_entropyCoder.resetBits();
+ if (m_slice->m_pps->bTransquantBypassEnabled)
+ m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
+ m_entropyCoder.codeSkipFlag(cu, 0);
+ m_entropyCoder.codePredMode(cu.m_predMode[0]);
+ m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
+ m_entropyCoder.codePredInfo(cu, 0);
+ intraMode.mvBits += m_entropyCoder.getNumberOfWrittenBits();
+
+ bool bCodeDQP = m_slice->m_pps->bUseDQP;
+ m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
+
+ intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
+ intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
+ if (m_rdCost.m_psyRd)
+ intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
+
+ m_entropyCoder.store(intraMode.contexts);
+ updateModeCost(intraMode);
+}
+
+uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes)
{
CUData& cu = intraMode.cu;
Yuv* reconYuv = &intraMode.reconYuv;
const Yuv* fencYuv = intraMode.fencYuv;
uint32_t depth = cu.m_cuDepth[0];
- uint32_t initTrDepth = cu.m_partSize[0] == SIZE_2Nx2N ? 0 : 1;
- uint32_t numPU = 1 << (2 * initTrDepth);
- uint32_t log2TrSize = cu.m_log2CUSize[0] - initTrDepth;
+ uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
+ uint32_t numPU = 1 << (2 * initTuDepth);
+ uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth;
uint32_t tuSize = 1 << log2TrSize;
uint32_t qNumParts = cuGeom.numPartitions >> 2;
uint32_t sizeIdx = log2TrSize - 2;
uint32_t absPartIdx = 0;
uint32_t totalDistortion = 0;
- int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[absPartIdx] == SIZE_NxN;
+ int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N;
// loop over partitions
- for (uint32_t pu = 0; pu < numPU; pu++, absPartIdx += qNumParts)
+ for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts)
{
uint32_t bmode = 0;
if (sharedModes)
- bmode = sharedModes[pu];
+ bmode = sharedModes[puIdx];
else
{
// Reference sample smoothing
- initAdiPattern(cu, cuGeom, absPartIdx, initTrDepth, ALL_IDX);
+ initAdiPattern(cu, cuGeom, absPartIdx, initTuDepth, ALL_IDX);
// determine set of modes to be tested (using prediction signal only)
- pixel* fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
+ const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
uint32_t stride = predYuv->m_size;
- pixel *above = m_refAbove + tuSize - 1;
- pixel *aboveFiltered = m_refAboveFlt + tuSize - 1;
- pixel *left = m_refLeft + tuSize - 1;
- pixel *leftFiltered = m_refLeftFlt + tuSize - 1;
+ pixel* above = m_refAbove + tuSize - 1;
+ pixel* aboveFiltered = m_refAboveFlt + tuSize - 1;
+ pixel* left = m_refLeft + tuSize - 1;
+ pixel* leftFiltered = m_refLeftFlt + tuSize - 1;
// 33 Angle modes once
ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
if (tuSize > 32)
{
- pixel *aboveScale = _above + 2 * 32;
- pixel *leftScale = _left + 2 * 32;
+ pixel* aboveScale = _above + 2 * 32;
+ pixel* leftScale = _left + 2 * 32;
// origin is 64x64, we scale to 32x32 and setup required parameters
primitives.scale2D_64to32(bufScale, fenc, stride);
modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
// PLANAR
- pixel *abovePlanar = above;
- pixel *leftPlanar = left;
+ pixel* abovePlanar = above;
+ pixel* leftPlanar = left;
if (tuSize >= 8 && tuSize <= 32)
{
abovePlanar = aboveFiltered;
for (int mode = 2; mode < 35; mode++)
{
bool modeHor = (mode < 18);
- pixel *cmp = (modeHor ? buf_trans : fenc);
+ const pixel* cmp = (modeHor ? buf_trans : fenc);
intptr_t srcStride = (modeHor ? scaleTuSize : scaleStride);
bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
sad = sa8d(cmp, srcStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
* levels and at higher depths */
uint64_t candCostList[MAX_RD_INTRA_MODES];
uint32_t rdModeList[MAX_RD_INTRA_MODES];
- int maxCandCount = 2 + m_param->rdLevel + ((depth + initTrDepth) >> 1);
+ int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1);
for (int i = 0; i < maxCandCount; i++)
candCostList[i] = MAX_INT64;
if (candCostList[i] == MAX_INT64)
break;
m_entropyCoder.load(m_rqt[depth].cur);
- cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTrDepth);
+ cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth);
Cost icosts;
if (checkTransformSkip)
- codeIntraLumaTSkip(intraMode, cuGeom, initTrDepth, absPartIdx, icosts);
+ codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
else
- codeIntraLumaQT(intraMode, cuGeom, initTrDepth, absPartIdx, false, icosts, depthRange);
+ codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, icosts, depthRange);
COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]);
}
}
/* remeasure best mode, allowing TU splits */
- cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTrDepth);
+ cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth);
m_entropyCoder.load(m_rqt[depth].cur);
Cost icosts;
if (checkTransformSkip)
- codeIntraLumaTSkip(intraMode, cuGeom, initTrDepth, absPartIdx, icosts);
+ codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
else
- codeIntraLumaQT(intraMode, cuGeom, initTrDepth, absPartIdx, true, icosts, depthRange);
+ codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange);
totalDistortion += icosts.distortion;
- extractIntraResultQT(cu, *reconYuv, initTrDepth, absPartIdx);
+ extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx);
// set reconstruction for next intra prediction blocks
- if (pu != numPU - 1)
+ if (puIdx != numPU - 1)
{
/* This has important implications for parallelism and RDO. It is writing intermediate results into the
* output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
* it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think
* that the contexts should be tracked through each PU */
- pixel* dst = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
- uint32_t dststride = m_frame->m_reconPicYuv->m_stride;
- pixel* src = reconYuv->getLumaAddr(absPartIdx);
+ pixel* dst = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+ uint32_t dststride = m_frame->m_reconPic->m_stride;
+ const pixel* src = reconYuv->getLumaAddr(absPartIdx);
uint32_t srcstride = reconYuv->m_size;
- primitives.square_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride);
+ primitives.luma_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride);
}
}
if (numPU > 1)
{
uint32_t combCbfY = 0;
- uint32_t partIdx = 0;
- for (uint32_t part = 0; part < 4; part++, partIdx += qNumParts)
- combCbfY |= cu.getCbf(partIdx, TEXT_LUMA, 1);
+ for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
+ combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1);
for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
cu.m_cbf[0][offs] |= combCbfY;
uint32_t log2TrSizeC = cu.m_log2CUSize[0] - m_hChromaShift;
uint32_t tuSize = 1 << log2TrSizeC;
int32_t scaleTuSize = tuSize;
+ uint32_t tuDepth = 0;
int32_t costShift = 0;
if (tuSize > 32)
{
scaleTuSize = 32;
+ tuDepth = 1;
costShift = 2;
log2TrSizeC = 5;
}
- Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 1);
- Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 2);
+ Predict::initAdiPatternChroma(cu, cuGeom, 0, tuDepth, 1);
+ Predict::initAdiPatternChroma(cu, cuGeom, 0, tuDepth, 2);
cu.getAllowedChromaDir(0, modeList);
// check chroma modes
uint64_t cost = 0;
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
- pixel* fenc = fencYuv->m_buf[chromaId];
+ const pixel* fenc = fencYuv->m_buf[chromaId];
pixel* pred = predYuv->m_buf[chromaId];
pixel* chromaPred = getAdiChromaBuf(chromaId, scaleTuSize);
Yuv& reconYuv = intraMode.reconYuv;
uint32_t depth = cu.m_cuDepth[0];
- uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN && m_csp == X265_CSP_I444;
- uint32_t log2TrSize = cu.m_log2CUSize[0] - initTrDepth;
+ uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N && m_csp == X265_CSP_I444;
+ uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth;
uint32_t absPartStep = (NUM_CU_PARTITIONS >> (depth << 1));
uint32_t totalDistortion = 0;
int part = partitionFromLog2Size(log2TrSize);
- TURecurse tuIterator((initTrDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0);
+ TURecurse tuIterator((initTuDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0);
do
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
- int cuSize = 1 << cu.m_log2CUSize[absPartIdxC];
uint32_t bestMode = 0;
uint32_t bestDist = 0;
// restore context models
m_entropyCoder.load(m_rqt[depth].cur);
- cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTrDepth);
+ cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTuDepth);
uint32_t psyEnergy = 0;
- uint32_t dist = codeIntraChromaQt(intraMode, cuGeom, initTrDepth, absPartIdxC, psyEnergy);
+ uint32_t dist = codeIntraChromaQt(intraMode, cuGeom, initTuDepth, absPartIdxC, psyEnergy);
if (m_slice->m_pps->bTransformSkipEnabled)
m_entropyCoder.load(m_rqt[depth].cur);
}
else
{
- uint32_t qtNumParts = cuGeom.numPartitions >> 2;
- if (!(absPartIdxC & (qtNumParts - 1)))
+ uint32_t qNumParts = cuGeom.numPartitions >> 2;
+ if (!(absPartIdxC & (qNumParts - 1)))
m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);
}
- codeSubdivCbfQTChroma(cu, initTrDepth, absPartIdxC, tuIterator.absPartIdxStep, cuSize, cuSize);
- codeCoeffQTChroma(cu, initTrDepth, absPartIdxC, TEXT_CHROMA_U);
- codeCoeffQTChroma(cu, initTrDepth, absPartIdxC, TEXT_CHROMA_V);
+ codeSubdivCbfQTChroma(cu, initTuDepth, absPartIdxC);
+ codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_U);
+ codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_V);
uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(dist, bits, psyEnergy) : m_rdCost.calcRdCost(dist, bits);
bestCost = cost;
bestDist = dist;
bestMode = modeList[mode];
- extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTrDepth, false);
+ extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTuDepth);
memcpy(m_qtTempCbf[1], cu.m_cbf[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(m_qtTempCbf[2], cu.m_cbf[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(m_qtTempTransformSkipFlag[1], cu.m_transformSkip[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
if (!tuIterator.isLastSection())
{
uint32_t zorder = cuGeom.encodeIdx + absPartIdxC;
- uint32_t dststride = m_frame->m_reconPicYuv->m_strideC;
- pixel *src, *dst;
+ uint32_t dststride = m_frame->m_reconPic->m_strideC;
+ const pixel* src;
+ pixel* dst;
- dst = m_frame->m_reconPicYuv->getCbAddr(cu.m_cuAddr, zorder);
+ dst = m_frame->m_reconPic->getCbAddr(cu.m_cuAddr, zorder);
src = reconYuv.getCbAddr(absPartIdxC);
primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize);
- dst = m_frame->m_reconPicYuv->getCrAddr(cu.m_cuAddr, zorder);
+ dst = m_frame->m_reconPic->getCrAddr(cu.m_cuAddr, zorder);
src = reconYuv.getCrAddr(absPartIdxC);
primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize);
}
memcpy(cu.m_cbf[2] + absPartIdxC, m_qtTempCbf[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(cu.m_transformSkip[1] + absPartIdxC, m_qtTempTransformSkipFlag[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(cu.m_transformSkip[2] + absPartIdxC, m_qtTempTransformSkipFlag[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
- cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTrDepth);
+ cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTuDepth);
totalDistortion += bestDist;
}
while (tuIterator.isNextSection());
- if (initTrDepth != 0)
+ if (initTuDepth != 0)
{
uint32_t combCbfU = 0;
uint32_t combCbfV = 0;
- uint32_t partIdx = 0;
- for (uint32_t p = 0; p < 4; p++, partIdx += tuIterator.absPartIdxStep)
+ uint32_t qNumParts = tuIterator.absPartIdxStep;
+ for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
- combCbfU |= cu.getCbf(partIdx, TEXT_CHROMA_U, 1);
- combCbfV |= cu.getCbf(partIdx, TEXT_CHROMA_V, 1);
+ combCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, 1);
+ combCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, 1);
}
- for (uint32_t offs = 0; offs < 4 * tuIterator.absPartIdxStep; offs++)
+ for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
{
cu.m_cbf[1][offs] |= combCbfU;
cu.m_cbf[2][offs] |= combCbfV;
continue;
cu.m_mv[0][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][0].mv;
- cu.m_refIdx[0][m.absPartIdx] = (char)m.mvFieldNeighbours[mergeCand][0].refIdx;
+ cu.m_refIdx[0][m.absPartIdx] = (int8_t)m.mvFieldNeighbours[mergeCand][0].refIdx;
cu.m_mv[1][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][1].mv;
- cu.m_refIdx[1][m.absPartIdx] = (char)m.mvFieldNeighbours[mergeCand][1].refIdx;
+ cu.m_refIdx[1][m.absPartIdx] = (int8_t)m.mvFieldNeighbours[mergeCand][1].refIdx;
prepMotionCompensation(cu, cuGeom, puIdx);
- motionCompensation(tempYuv, true, false);
+ motionCompensation(tempYuv, true, m_me.bChromaSATD);
+
uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(m.absPartIdx), tempYuv.m_size);
+ if (m_me.bChromaSATD)
+ costCand += m_me.bufChromaSATD(tempYuv, m.absPartIdx);
+
uint32_t bitsCand = getTUBits(mergeCand, m.maxNumMergeCand);
costCand = costCand + m_rdCost.getCost(bitsCand);
if (costCand < outCost)
/* this function assumes the caller has configured its MotionEstimation engine with the
* correct source plane and source PU, and has called prepMotionCompensation() to set
* m_puAbsPartIdx, m_puWidth, and m_puHeight */
-void Search::singleMotionEstimation(Search& master, const CUData& cu, const CUGeom& cuGeom, int part, int list, int ref)
+void Search::singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, int part, int list, int ref)
{
uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS;
bits += getTUBits(ref, m_slice->m_numRefIdx[list]);
- MV amvpCand[AMVP_NUM_CANDS];
MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
- int numMvc = cu.fillMvpCand(part, m_puAbsPartIdx, list, ref, amvpCand, mvc);
+ int numMvc = interMode.cu.fillMvpCand(part, m_puAbsPartIdx, list, ref, interMode.amvpCand[list][ref], mvc);
- uint32_t bestCost = MAX_INT;
int mvpIdx = 0;
int merange = m_param->searchRange;
- for (int i = 0; i < AMVP_NUM_CANDS; i++)
+ MotionData* bestME = interMode.bestME[part];
+
+ if (interMode.amvpCand[list][ref][0] != interMode.amvpCand[list][ref][1])
{
- MV mvCand = amvpCand[i];
+ uint32_t bestCost = MAX_INT;
+ for (int i = 0; i < AMVP_NUM_CANDS; i++)
+ {
+ MV mvCand = interMode.amvpCand[list][ref][i];
- // NOTE: skip mvCand if Y is > merange and -FN>1
- if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
- continue;
+ // NOTE: skip mvCand if Y is > merange and -FN>1
+ if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
+ continue;
- cu.clipMv(mvCand);
+ interMode.cu.clipMv(mvCand);
- Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
- predInterLumaPixel(tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPicYuv, mvCand);
- uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
+ Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
+ predInterLumaPixel(tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPic, mvCand);
+ uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
- if (bestCost > cost)
- {
- bestCost = cost;
- mvpIdx = i;
+ if (bestCost > cost)
+ {
+ bestCost = cost;
+ mvpIdx = i;
+ }
}
}
- MV mvmin, mvmax, outmv, mvp = amvpCand[mvpIdx];
- setSearchRange(cu, mvp, merange, mvmin, mvmax);
+ MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[list][ref][mvpIdx];
+ setSearchRange(interMode.cu, mvp, merange, mvmin, mvmax);
int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
/* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
- checkBestMVP(amvpCand, outmv, mvp, mvpIdx, bits, cost);
+ checkBestMVP(interMode.amvpCand[list][ref], outmv, mvp, mvpIdx, bits, cost);
/* tie goes to the smallest ref ID, just like --no-pme */
- ScopedLock _lock(master.m_outputLock);
- if (cost < master.m_bestME[list].cost ||
- (cost == master.m_bestME[list].cost && ref < master.m_bestME[list].ref))
+ ScopedLock _lock(master.m_meLock);
+ if (cost < bestME[list].cost ||
+ (cost == bestME[list].cost && ref < bestME[list].ref))
{
- master.m_bestME[list].mv = outmv;
- master.m_bestME[list].mvp = mvp;
- master.m_bestME[list].mvpIdx = mvpIdx;
- master.m_bestME[list].ref = ref;
- master.m_bestME[list].cost = cost;
- master.m_bestME[list].bits = bits;
+ bestME[list].mv = outmv;
+ bestME[list].mvp = mvp;
+ bestME[list].mvpIdx = mvpIdx;
+ bestME[list].ref = ref;
+ bestME[list].cost = cost;
+ bestME[list].bits = bits;
}
}
/* search of the best candidate for inter prediction
* returns true if predYuv was filled with a motion compensated prediction */
-bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChroma)
+bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChromaSA8D)
{
CUData& cu = interMode.cu;
Yuv* predYuv = &interMode.predYuv;
- MV amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS];
MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
const Slice *slice = m_slice;
- PicYuv* fencPic = m_frame->m_origPicYuv;
int numPart = cu.getNumPartInter();
int numPredDir = slice->isInterP() ? 1 : 2;
const int* numRefIdx = slice->m_numRefIdx;
for (int puIdx = 0; puIdx < numPart; puIdx++)
{
+ MotionData* bestME = interMode.bestME[puIdx];
+
/* sets m_puAbsPartIdx, m_puWidth, m_puHeight */
initMotionCompensation(cu, cuGeom, puIdx);
- pixel* pu = fencPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
- m_me.setSourcePU(pu - fencPic->m_picOrg[0], m_puWidth, m_puHeight);
+ m_me.setSourcePU(*interMode.fencYuv, cu.m_cuAddr, cuGeom.encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
uint32_t mrgCost = MAX_UINT;
- /* find best cost merge candidate */
- if (cu.m_partSize[m_puAbsPartIdx] != SIZE_2Nx2N)
+ /* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
+ if (cu.m_partSize[0] != SIZE_2Nx2N)
{
merge.absPartIdx = m_puAbsPartIdx;
merge.width = m_puWidth;
merge.height = m_puHeight;
mrgCost = mergeEstimation(cu, cuGeom, puIdx, merge);
- if (bMergeOnly && cu.m_log2CUSize[0] > 3)
+ if (bMergeOnly)
{
if (mrgCost == MAX_UINT)
{
totalmebits += merge.bits;
prepMotionCompensation(cu, cuGeom, puIdx);
- motionCompensation(*predYuv, true, bChroma);
+ motionCompensation(*predYuv, true, bChromaSA8D);
continue;
}
}
- MotionData bidir[2];
- uint32_t bidirCost = MAX_UINT;
- int bidirBits = 0;
-
- m_bestME[0].cost = MAX_UINT;
- m_bestME[1].cost = MAX_UINT;
+ bestME[0].cost = MAX_UINT;
+ bestME[1].cost = MAX_UINT;
getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits);
- if (bDistributed)
+ /* Uni-directional prediction */
+ if (m_param->analysisMode == X265_ANALYSIS_LOAD && bestME[0].ref >= 0)
{
- m_curMECu = &cu;
- m_curGeom = &cuGeom;
+ for (int l = 0; l < numPredDir; l++)
+ {
+ int ref = bestME[l].ref;
+ uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS;
+ bits += getTUBits(ref, numRefIdx[l]);
+
+ int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc);
+
+ // Pick the best possible MVP from AMVP candidates based on least residual
+ int mvpIdx = 0;
+ int merange = m_param->searchRange;
+
+ if (interMode.amvpCand[l][ref][0] != interMode.amvpCand[l][ref][1])
+ {
+ uint32_t bestCost = MAX_INT;
+ for (int i = 0; i < AMVP_NUM_CANDS; i++)
+ {
+ MV mvCand = interMode.amvpCand[l][ref][i];
+
+ // NOTE: skip mvCand if Y is > merange and -FN>1
+ if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
+ continue;
+
+ cu.clipMv(mvCand);
+ predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand);
+ uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
- /* this worker might already be enqueued for pmode, so other threads
- * might be looking at the ME job counts at any time, do these sets
- * in a safe order */
+ if (bestCost > cost)
+ {
+ bestCost = cost;
+ mvpIdx = i;
+ }
+ }
+ }
+
+ MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[l][ref][mvpIdx];
+
+ int satdCost;
+ setSearchRange(cu, mvp, merange, mvmin, mvmax);
+ satdCost = m_me.motionEstimate(&slice->m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
+
+ /* Get total cost of partition, but only include MV bit cost once */
+ bits += m_me.bitcost(outmv);
+ uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
+
+ /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
+ checkBestMVP(interMode.amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost);
+
+ if (cost < bestME[l].cost)
+ {
+ bestME[l].mv = outmv;
+ bestME[l].mvp = mvp;
+ bestME[l].mvpIdx = mvpIdx;
+ bestME[l].cost = cost;
+ bestME[l].bits = bits;
+ }
+ }
+ }
+ else if (bDistributed)
+ {
+ m_meLock.acquire();
+ m_curInterMode = &interMode;
+ m_curGeom = &cuGeom;
m_curPart = puIdx;
m_totalNumME = 0;
m_numAcquiredME = 1;
m_numCompletedME = 0;
m_totalNumME = numRefIdx[0] + numRefIdx[1];
+ m_meLock.release();
if (!m_bJobsQueued)
JobProvider::enqueue();
for (int i = 1; i < m_totalNumME; i++)
m_pool->pokeIdleThread();
- while (m_totalNumME > m_numAcquiredME)
+ do
{
- int id = ATOMIC_INC(&m_numAcquiredME);
- if (m_totalNumME >= id)
+ m_meLock.acquire();
+ if (m_totalNumME > m_numAcquiredME)
{
- id -= 1;
+ int id = m_numAcquiredME++;
+ m_meLock.release();
+
if (id < numRefIdx[0])
- singleMotionEstimation(*this, cu, cuGeom, puIdx, 0, id);
+ singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, id);
else
- singleMotionEstimation(*this, cu, cuGeom, puIdx, 1, id - numRefIdx[0]);
+ singleMotionEstimation(*this, interMode, cuGeom, puIdx, 1, id - numRefIdx[0]);
- if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME)
- m_meCompletionEvent.trigger();
+ m_meLock.acquire();
+ m_numCompletedME++;
+ m_meLock.release();
}
+ else
+ m_meLock.release();
}
+ while (m_totalNumME > m_numAcquiredME);
+
if (!m_bJobsQueued)
JobProvider::dequeue();
/* we saved L0-0 for ourselves */
- singleMotionEstimation(*this, cu, cuGeom, puIdx, 0, 0);
- if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME)
+ singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, 0);
+
+ m_meLock.acquire();
+ if (++m_numCompletedME == m_totalNumME)
m_meCompletionEvent.trigger();
+ m_meLock.release();
m_meCompletionEvent.wait();
}
else
{
- // Uni-directional prediction
for (int l = 0; l < numPredDir; l++)
{
for (int ref = 0; ref < numRefIdx[l]; ref++)
uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS;
bits += getTUBits(ref, numRefIdx[l]);
- int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, amvpCand[l][ref], mvc);
+ int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc);
// Pick the best possible MVP from AMVP candidates based on least residual
- uint32_t bestCost = MAX_INT;
int mvpIdx = 0;
int merange = m_param->searchRange;
- for (int i = 0; i < AMVP_NUM_CANDS; i++)
+ if (interMode.amvpCand[l][ref][0] != interMode.amvpCand[l][ref][1])
{
- MV mvCand = amvpCand[l][ref][i];
+ uint32_t bestCost = MAX_INT;
+ for (int i = 0; i < AMVP_NUM_CANDS; i++)
+ {
+ MV mvCand = interMode.amvpCand[l][ref][i];
- // NOTE: skip mvCand if Y is > merange and -FN>1
- if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
- continue;
+ // NOTE: skip mvCand if Y is > merange and -FN>1
+ if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
+ continue;
- cu.clipMv(mvCand);
- predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPicYuv, mvCand);
- uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
+ cu.clipMv(mvCand);
+ predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand);
+ uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
- if (bestCost > cost)
- {
- bestCost = cost;
- mvpIdx = i;
+ if (bestCost > cost)
+ {
+ bestCost = cost;
+ mvpIdx = i;
+ }
}
}
- MV mvmin, mvmax, outmv, mvp = amvpCand[l][ref][mvpIdx];
+ MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[l][ref][mvpIdx];
setSearchRange(cu, mvp, merange, mvmin, mvmax);
int satdCost = m_me.motionEstimate(&slice->m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
/* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
- checkBestMVP(amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost);
+ checkBestMVP(interMode.amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost);
- if (cost < m_bestME[l].cost)
+ if (cost < bestME[l].cost)
{
- m_bestME[l].mv = outmv;
- m_bestME[l].mvp = mvp;
- m_bestME[l].mvpIdx = mvpIdx;
- m_bestME[l].ref = ref;
- m_bestME[l].cost = cost;
- m_bestME[l].bits = bits;
+ bestME[l].mv = outmv;
+ bestME[l].mvp = mvp;
+ bestME[l].mvpIdx = mvpIdx;
+ bestME[l].ref = ref;
+ bestME[l].cost = cost;
+ bestME[l].bits = bits;
}
}
}
}
/* Bi-directional prediction */
- if (slice->isInterB() && !cu.isBipredRestriction() && m_bestME[0].cost != MAX_UINT && m_bestME[1].cost != MAX_UINT)
+ MotionData bidir[2];
+ uint32_t bidirCost = MAX_UINT;
+ int bidirBits = 0;
+
+ if (slice->isInterB() && !cu.isBipredRestriction() && /* biprediction is possible for this PU */
+ cu.m_partSize[m_puAbsPartIdx] != SIZE_2Nx2N && /* 2Nx2N biprediction is handled elsewhere */
+ bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT)
{
- bidir[0] = m_bestME[0];
- bidir[1] = m_bestME[1];
+ bidir[0] = bestME[0];
+ bidir[1] = bestME[1];
+
+ int satdCost;
- /* Generate reference subpels */
- PicYuv* refPic0 = slice->m_refPicList[0][m_bestME[0].ref]->m_reconPicYuv;
- PicYuv* refPic1 = slice->m_refPicList[1][m_bestME[1].ref]->m_reconPicYuv;
- Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
- predInterLumaPixel(bidirYuv[0], *refPic0, m_bestME[0].mv);
- predInterLumaPixel(bidirYuv[1], *refPic1, m_bestME[1].mv);
+ if (m_me.bChromaSATD)
+ {
+ cu.m_mv[0][m_puAbsPartIdx] = bidir[0].mv;
+ cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref;
+ cu.m_mv[1][m_puAbsPartIdx] = bidir[1].mv;
+ cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref;
- pixel *pred0 = bidirYuv[0].getLumaAddr(m_puAbsPartIdx);
- pixel *pred1 = bidirYuv[1].getLumaAddr(m_puAbsPartIdx);
+ prepMotionCompensation(cu, cuGeom, puIdx);
+ motionCompensation(tmpPredYuv, true, true);
- int partEnum = partitionFromSizes(m_puWidth, m_puHeight);
- primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, pred0, bidirYuv[0].m_size, pred1, bidirYuv[1].m_size, 32);
- int satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+ satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) +
+ m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx);
+ }
+ else
+ {
+ PicYuv* refPic0 = slice->m_refPicList[0][bestME[0].ref]->m_reconPic;
+ PicYuv* refPic1 = slice->m_refPicList[1][bestME[1].ref]->m_reconPic;
+ Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
- bidirBits = m_bestME[0].bits + m_bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
+ /* Generate reference subpels */
+ predInterLumaPixel(bidirYuv[0], *refPic0, bestME[0].mv);
+ predInterLumaPixel(bidirYuv[1], *refPic1, bestME[1].mv);
+
+ primitives.pixelavg_pp[m_me.partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(m_puAbsPartIdx), bidirYuv[0].m_size,
+ bidirYuv[1].getLumaAddr(m_puAbsPartIdx), bidirYuv[1].m_size, 32);
+ satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+ }
+
+ bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
bidirCost = satdCost + m_rdCost.getCost(bidirBits);
- bool bTryZero = m_bestME[0].mv.notZero() || m_bestME[1].mv.notZero();
+ bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
if (bTryZero)
{
/* Do not try zero MV if unidir motion predictors are beyond
mvmin <<= 2;
mvmax <<= 2;
- bTryZero &= m_bestME[0].mvp.checkRange(mvmin, mvmax);
- bTryZero &= m_bestME[1].mvp.checkRange(mvmin, mvmax);
+ bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
+ bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
}
if (bTryZero)
{
- // coincident blocks of the two reference pictures
- pixel *ref0 = slice->m_mref[0][m_bestME[0].ref].fpelPlane + (pu - fencPic->m_picOrg[0]);
- pixel *ref1 = slice->m_mref[1][m_bestME[1].ref].fpelPlane + (pu - fencPic->m_picOrg[0]);
- intptr_t refStride = slice->m_mref[0][0].lumaStride;
+ /* coincident blocks of the two reference pictures */
+ if (m_me.bChromaSATD)
+ {
+ cu.m_mv[0][m_puAbsPartIdx] = mvzero;
+ cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref;
+ cu.m_mv[1][m_puAbsPartIdx] = mvzero;
+ cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref;
- primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
- satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+ prepMotionCompensation(cu, cuGeom, puIdx);
+ motionCompensation(tmpPredYuv, true, true);
- MV mvp0 = m_bestME[0].mvp;
- int mvpIdx0 = m_bestME[0].mvpIdx;
- uint32_t bits0 = m_bestME[0].bits - m_me.bitcost(m_bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
+ satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) +
+ m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx);
+ }
+ else
+ {
+ const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
+ const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
+ intptr_t refStride = slice->m_mref[0][0].lumaStride;
- MV mvp1 = m_bestME[1].mvp;
- int mvpIdx1 = m_bestME[1].mvpIdx;
- uint32_t bits1 = m_bestME[1].bits - m_me.bitcost(m_bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
+ primitives.pixelavg_pp[m_me.partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
+ satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+ }
- uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
+ MV mvp0 = bestME[0].mvp;
+ int mvpIdx0 = bestME[0].mvpIdx;
+ uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
- if (bDistributed)
- {
- cu.fillMvpCand(puIdx, m_puAbsPartIdx, 0, m_bestME[0].ref, amvpCand[0][m_bestME[0].ref], mvc);
- cu.fillMvpCand(puIdx, m_puAbsPartIdx, 1, m_bestME[1].ref, amvpCand[1][m_bestME[1].ref], mvc);
- }
+ MV mvp1 = bestME[1].mvp;
+ int mvpIdx1 = bestME[1].mvpIdx;
+ uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
+
+ uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
/* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
- checkBestMVP(amvpCand[0][m_bestME[0].ref], mvzero, mvp0, mvpIdx0, bits0, cost);
- checkBestMVP(amvpCand[1][m_bestME[1].ref], mvzero, mvp1, mvpIdx1, bits1, cost);
+ checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvp0, mvpIdx0, bits0, cost);
+ checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvp1, mvpIdx1, bits1, cost);
if (cost < bidirCost)
{
}
/* select best option and store into CU */
- if (mrgCost < bidirCost && mrgCost < m_bestME[0].cost && mrgCost < m_bestME[1].cost)
+ if (mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost)
{
cu.m_mergeFlag[m_puAbsPartIdx] = true;
cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx
totalmebits += merge.bits;
}
- else if (bidirCost < m_bestME[0].cost && bidirCost < m_bestME[1].cost)
+ else if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost)
{
lastMode = 2;
cu.m_mergeFlag[m_puAbsPartIdx] = false;
cu.setPUInterDir(3, m_puAbsPartIdx, puIdx);
cu.setPUMv(0, bidir[0].mv, m_puAbsPartIdx, puIdx);
- cu.setPURefIdx(0, m_bestME[0].ref, m_puAbsPartIdx, puIdx);
+ cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx);
cu.m_mvd[0][m_puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
cu.m_mvpIdx[0][m_puAbsPartIdx] = bidir[0].mvpIdx;
cu.setPUMv(1, bidir[1].mv, m_puAbsPartIdx, puIdx);
- cu.setPURefIdx(1, m_bestME[1].ref, m_puAbsPartIdx, puIdx);
+ cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx);
cu.m_mvd[1][m_puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
cu.m_mvpIdx[1][m_puAbsPartIdx] = bidir[1].mvpIdx;
totalmebits += bidirBits;
}
- else if (m_bestME[0].cost <= m_bestME[1].cost)
+ else if (bestME[0].cost <= bestME[1].cost)
{
lastMode = 0;
cu.m_mergeFlag[m_puAbsPartIdx] = false;
cu.setPUInterDir(1, m_puAbsPartIdx, puIdx);
- cu.setPUMv(0, m_bestME[0].mv, m_puAbsPartIdx, puIdx);
- cu.setPURefIdx(0, m_bestME[0].ref, m_puAbsPartIdx, puIdx);
- cu.m_mvd[0][m_puAbsPartIdx] = m_bestME[0].mv - m_bestME[0].mvp;
- cu.m_mvpIdx[0][m_puAbsPartIdx] = m_bestME[0].mvpIdx;
+ cu.setPUMv(0, bestME[0].mv, m_puAbsPartIdx, puIdx);
+ cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx);
+ cu.m_mvd[0][m_puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;
+ cu.m_mvpIdx[0][m_puAbsPartIdx] = bestME[0].mvpIdx;
cu.setPURefIdx(1, REF_NOT_VALID, m_puAbsPartIdx, puIdx);
cu.setPUMv(1, mvzero, m_puAbsPartIdx, puIdx);
- totalmebits += m_bestME[0].bits;
+ totalmebits += bestME[0].bits;
}
else
{
cu.m_mergeFlag[m_puAbsPartIdx] = false;
cu.setPUInterDir(2, m_puAbsPartIdx, puIdx);
- cu.setPUMv(1, m_bestME[1].mv, m_puAbsPartIdx, puIdx);
- cu.setPURefIdx(1, m_bestME[1].ref, m_puAbsPartIdx, puIdx);
- cu.m_mvd[1][m_puAbsPartIdx] = m_bestME[1].mv - m_bestME[1].mvp;
- cu.m_mvpIdx[1][m_puAbsPartIdx] = m_bestME[1].mvpIdx;
+ cu.setPUMv(1, bestME[1].mv, m_puAbsPartIdx, puIdx);
+ cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx);
+ cu.m_mvd[1][m_puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;
+ cu.m_mvpIdx[1][m_puAbsPartIdx] = bestME[1].mvpIdx;
cu.setPURefIdx(0, REF_NOT_VALID, m_puAbsPartIdx, puIdx);
cu.setPUMv(0, mvzero, m_puAbsPartIdx, puIdx);
- totalmebits += m_bestME[1].bits;
+ totalmebits += bestME[1].bits;
}
prepMotionCompensation(cu, cuGeom, puIdx);
- motionCompensation(*predYuv, true, bChroma);
+ motionCompensation(*predYuv, true, bChromaSA8D);
}
interMode.sa8dBits += totalmebits;
// No residual coding : SKIP mode
- cu.setSkipFlagSubParts(true);
+ cu.setPredModeSubParts(MODE_SKIP);
cu.clearCbf();
cu.setTUDepthSubParts(0, 0, depth);
interMode.distortion = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
// Chroma
part = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
- interMode.distortion += m_rdCost.scaleChromaDistCb(primitives.sse_pp[part](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
- interMode.distortion += m_rdCost.scaleChromaDistCr(primitives.sse_pp[part](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
+ interMode.distortion += m_rdCost.scaleChromaDist(1, primitives.sse_pp[part](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
+ interMode.distortion += m_rdCost.scaleChromaDist(2, primitives.sse_pp[part](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
m_entropyCoder.load(m_rqt[depth].cur);
m_entropyCoder.resetBits();
if (!cu.m_tqBypass[0])
{
uint32_t cbf0Dist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
- cbf0Dist += m_rdCost.scaleChromaDistCb(primitives.sse_pp[cpart](fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
- cbf0Dist += m_rdCost.scaleChromaDistCr(primitives.sse_pp[cpart](fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
+ cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.sse_pp[cpart](fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
+ cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.sse_pp[cpart](fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
/* Consider the RD cost of not signaling any residual */
m_entropyCoder.load(m_rqt[depth].cur);
uint32_t coeffBits, bits;
if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
{
- cu.setSkipFlagSubParts(true);
+ cu.setPredModeSubParts(MODE_SKIP);
/* Merge/Skip */
m_entropyCoder.resetBits();
uint32_t mvBits = m_entropyCoder.getNumberOfWrittenBits();
bool bCodeDQP = m_slice->m_pps->bUseDQP;
- m_entropyCoder.codeCoeff(cu, 0, cu.m_cuDepth[0], bCodeDQP, tuDepthRange);
+ m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
bits = m_entropyCoder.getNumberOfWrittenBits();
coeffBits = bits - mvBits;
// update with clipped distortion and cost (qp estimation loop uses unclipped values)
uint32_t bestDist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
- bestDist += m_rdCost.scaleChromaDistCb(primitives.sse_pp[cpart](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
- bestDist += m_rdCost.scaleChromaDistCr(primitives.sse_pp[cpart](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
+ bestDist += m_rdCost.scaleChromaDist(1, primitives.sse_pp[cpart](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
+ bestDist += m_rdCost.scaleChromaDist(2, primitives.sse_pp[cpart](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
if (m_rdCost.m_psyRd)
interMode.psyEnergy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
updateModeCost(interMode);
}
-void Search::generateCoeffRecon(Mode& mode, const CUGeom& cuGeom)
-{
- CUData& cu = mode.cu;
-
- m_quant.setQPforQuant(mode.cu);
-
- if (cu.m_predMode[0] == MODE_INTER)
- {
- uint32_t tuDepthRange[2];
- cu.getInterTUQtDepthRange(tuDepthRange, 0);
-
- residualTransformQuantInter(mode, cuGeom, 0, cu.m_cuDepth[0], tuDepthRange);
- if (cu.getQtRootCbf(0))
- mode.reconYuv.addClip(mode.predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]);
- else
- {
- mode.reconYuv.copyFromYuv(mode.predYuv);
- if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
- cu.setSkipFlagSubParts(true);
- }
- }
- else if (cu.m_predMode[0] == MODE_INTRA)
- {
- uint32_t tuDepthRange[2];
- cu.getIntraTUQtDepthRange(tuDepthRange, 0);
-
- uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN;
- residualTransformQuantIntra(mode, cuGeom, initTrDepth, 0, tuDepthRange);
- getBestIntraModeChroma(mode, cuGeom);
- residualQTIntraChroma(mode, cuGeom, 0, 0);
- mode.reconYuv.copyFromPicYuv(*m_frame->m_reconPicYuv, cu.m_cuAddr, cuGeom.encodeIdx); // TODO:
- }
-}
-
-void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, uint32_t depthRange[2])
+void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, const uint32_t depthRange[2])
{
CUData& cu = mode.cu;
X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "invalid depth\n");
uint32_t tuDepth = depth - cu.m_cuDepth[0];
bool bCheckFull = log2TrSize <= depthRange[1];
- if (cu.m_partSize[absPartIdx] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && log2TrSize > depthRange[0])
+ if (cu.m_partSize[0] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && log2TrSize > depthRange[0])
bCheckFull = false;
if (bCheckFull)
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
bool bCodeChroma = true;
uint32_t tuDepthC = tuDepth;
- if (log2TrSizeC == 1)
+ if (log2TrSizeC < 2)
{
- X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444, "tuQuad check failed\n");
- log2TrSizeC++;
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ log2TrSizeC = 2;
tuDepthC--;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
- bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
+ bCodeChroma = !(absPartIdx & 3);
}
uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
const Yuv* fencYuv = mode.fencYuv;
- int16_t *curResiY = resiYuv.getLumaAddr(absPartIdx);
+ int16_t* curResiY = resiYuv.getLumaAddr(absPartIdx);
uint32_t strideResiY = resiYuv.m_size;
- pixel *fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
+ const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
uint32_t numSigY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
if (numSigY)
cu.setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
int16_t* curResiU = resiYuv.getCbAddr(absPartIdxC);
- pixel* fencCb = const_cast<pixel*>(fencYuv->getCbAddr(absPartIdxC));
+ const pixel* fencCb = fencYuv->getCbAddr(absPartIdxC);
uint32_t numSigU = m_quant.transformNxN(cu, fencCb, fencYuv->m_csize, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false);
if (numSigU)
{
}
int16_t* curResiV = resiYuv.getCrAddr(absPartIdxC);
- pixel* fencCr = const_cast<pixel*>(fencYuv->getCrAddr(absPartIdxC));
+ const pixel* fencCr = fencYuv->getCrAddr(absPartIdxC);
uint32_t numSigV = m_quant.transformNxN(cu, fencCr, fencYuv->m_csize, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false);
if (numSigV)
{
{
X265_CHECK(log2TrSize > depthRange[0], "residualTransformQuantInter recursion check failure\n");
- const uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
- for (uint32_t i = 0; i < 4; i++)
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
- residualTransformQuantInter(mode, cuGeom, absPartIdx + i * qPartNumSubdiv, depth + 1, depthRange);
- ycbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_LUMA, tuDepth + 1);
- ucbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_U, tuDepth + 1);
- vcbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_V, tuDepth + 1);
+ residualTransformQuantInter(mode, cuGeom, qPartIdx, depth + 1, depthRange);
+ ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
+ ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+ vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
}
- for (uint32_t i = 0; i < 4 * qPartNumSubdiv; i++)
+ for (uint32_t i = 0; i < 4 * qNumParts; i++)
{
cu.m_cbf[TEXT_LUMA][absPartIdx + i] |= ycbf << tuDepth;
cu.m_cbf[TEXT_CHROMA_U][absPartIdx + i] |= ucbf << tuDepth;
}
}
-void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& outCosts, uint32_t depthRange[2])
+uint64_t Search::estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId)
+{
+ uint32_t nullBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth);
+
+ if (m_rdCost.m_psyRd)
+ return m_rdCost.calcPsyRdCost(dist, nullBits, psyEnergy);
+ else
+ return m_rdCost.calcRdCost(dist, nullBits);
+}
+
+void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2])
{
CUData& cu = mode.cu;
uint32_t log2TrSize = g_maxLog2CUSize - depth;
bool bCheckSplit = log2TrSize > depthRange[0];
bool bCheckFull = log2TrSize <= depthRange[1];
+ bool bSplitPresentFlag = bCheckSplit && bCheckFull;
- if (cu.m_partSize[absPartIdx] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && bCheckSplit)
+ if (cu.m_partSize[0] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && bCheckSplit)
bCheckFull = false;
X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
bool bCodeChroma = true;
uint32_t tuDepthC = tuDepth;
- if ((log2TrSize == 2) && !(m_csp == X265_CSP_I444))
+ if (log2TrSizeC < 2)
{
- log2TrSizeC++;
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ log2TrSizeC = 2;
tuDepthC--;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
- bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
+ bCodeChroma = !(absPartIdx & 3);
}
// code full block
uint8_t cbfFlag[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
uint32_t numSig[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
- uint32_t singleBitsComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
- uint32_t singleDistComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
- uint32_t singlePsyEnergyComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
+ uint32_t singleBits[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
+ uint32_t singleDist[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
+ uint32_t singlePsyEnergy[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, MAX_INT64}, {MAX_INT64, MAX_INT64} };
if (m_bEnableRDOQ)
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
- pixel *fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
- int16_t *resi = resiYuv.getLumaAddr(absPartIdx);
+ const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
+ int16_t* resi = resiYuv.getLumaAddr(absPartIdx);
numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0];
m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbf(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
- if (cbfFlag[TEXT_LUMA][0])
- m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
- singleBitsComp[TEXT_LUMA][0] = m_entropyCoder.getNumberOfWrittenBits();
-
- uint32_t singleBitsPrev = singleBitsComp[TEXT_LUMA][0];
-
- if (bCodeChroma)
- {
- uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
- for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
- {
- coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
- TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
-
- do
- {
- uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
- uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
-
- cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
- if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
- m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
-
- fenc = const_cast<pixel*>(fencYuv->getChromaAddr(chromaId, absPartIdxC));
- resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
- numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
- cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
-
- m_entropyCoder.codeQtCbf(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
- if (cbfFlag[chromaId][tuIterator.section])
- m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
-
- uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits();
- singleBitsComp[chromaId][tuIterator.section] = newBits - singleBitsPrev;
+ if (bSplitPresentFlag && log2TrSize > depthRange[0])
+ m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
+ fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
- singleBitsPrev = newBits;
- }
- while (tuIterator.isNextSection());
- }
- }
+ // Coding luma cbf flag has been removed from here. The context for cbf flag is different for each depth.
+ // So it is valid if we encode coefficients and then cbfs at least for analysis.
+// m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
+ if (cbfFlag[TEXT_LUMA][0])
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
- const uint32_t numCoeffY = 1 << (log2TrSize * 2);
- const uint32_t numCoeffC = 1 << (log2TrSizeC * 2);
+ uint32_t singleBitsPrev = m_entropyCoder.getNumberOfWrittenBits();
+ singleBits[TEXT_LUMA][0] = singleBitsPrev - fullCost.bits;
X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n");
uint32_t distY = primitives.ssd_s[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size);
if (m_rdCost.m_psyRd)
psyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, (int16_t*)zeroShort, 0);
- int16_t *curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
+ int16_t* curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size;
if (cbfFlag[TEXT_LUMA][0])
{
m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSig[TEXT_LUMA][0]); //this is for inter mode only
+ // non-zero cost calculation for luma - This is an approximation
+ // finally we have to encode correct cbf after comparing with null cost
const uint32_t nonZeroDistY = primitives.sse_ss[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
- uint32_t nonZeroPsyEnergyY = 0;
+ uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
+ uint32_t nonZeroPsyEnergyY = 0; uint64_t singleCostY = 0;
if (m_rdCost.m_psyRd)
+ {
nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
+ singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroPsyEnergyY);
+ }
+ else
+ singleCostY = m_rdCost.calcRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0]);
if (cu.m_tqBypass[0])
{
- distY = nonZeroDistY;
- psyEnergyY = nonZeroPsyEnergyY;
+ singleDist[TEXT_LUMA][0] = nonZeroDistY;
+ singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
}
else
{
- uint64_t singleCostY = 0;
- if (m_rdCost.m_psyRd)
- singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0], nonZeroPsyEnergyY);
- else
- singleCostY = m_rdCost.calcRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0]);
- m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbfZero(TEXT_LUMA, tuDepth);
- const uint32_t nullBitsY = m_entropyCoder.getNumberOfWrittenBits();
- uint64_t nullCostY = 0;
- if (m_rdCost.m_psyRd)
- nullCostY = m_rdCost.calcPsyRdCost(distY, nullBitsY, psyEnergyY);
- else
- nullCostY = m_rdCost.calcRdCost(distY, nullBitsY);
+ // zero-cost calculation for luma. This is an approximation
+ // Initial cost calculation was also an approximation. First resetting the bit counter and then encoding zero cbf.
+ // Now encoding the zero cbf without writing into bitstream, keeping m_fracBits unchanged. The same is valid for chroma.
+ uint64_t nullCostY = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
+
if (nullCostY < singleCostY)
{
cbfFlag[TEXT_LUMA][0] = 0;
+ singleBits[TEXT_LUMA][0] = 0;
+ primitives.blockfill_s[partSize](curResiY, strideResiY, 0);
#if CHECKED_BUILD || _DEBUG
+ uint32_t numCoeffY = 1 << (log2TrSize << 1);
memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY);
#endif
if (checkTransformSkipY)
minCost[TEXT_LUMA][0] = nullCostY;
+ singleDist[TEXT_LUMA][0] = distY;
+ singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY;
}
else
{
- distY = nonZeroDistY;
- psyEnergyY = nonZeroPsyEnergyY;
if (checkTransformSkipY)
minCost[TEXT_LUMA][0] = singleCostY;
+ singleDist[TEXT_LUMA][0] = nonZeroDistY;
+ singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
}
}
}
- else if (checkTransformSkipY)
+ else
{
- m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbfZero(TEXT_LUMA, tuDepth);
- const uint32_t nullBitsY = m_entropyCoder.getNumberOfWrittenBits();
- if (m_rdCost.m_psyRd)
- minCost[TEXT_LUMA][0] = m_rdCost.calcPsyRdCost(distY, nullBitsY, psyEnergyY);
- else
- minCost[TEXT_LUMA][0] = m_rdCost.calcRdCost(distY, nullBitsY);
+ if (checkTransformSkipY)
+ minCost[TEXT_LUMA][0] = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
+ primitives.blockfill_s[partSize](curResiY, strideResiY, 0);
+ singleDist[TEXT_LUMA][0] = distY;
+ singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY;
}
- singleDistComp[TEXT_LUMA][0] = distY;
- singlePsyEnergyComp[TEXT_LUMA][0] = psyEnergyY;
- if (!cbfFlag[TEXT_LUMA][0])
- primitives.blockfill_s[partSize](curResiY, strideResiY, 0);
cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
if (bCodeChroma)
{
- uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
+ uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
uint32_t distC = 0, psyEnergyC = 0;
coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
- do
- {
- uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
- uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
+ do
+ {
+ uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
+ uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
- int16_t *curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
+ cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
- distC = m_rdCost.scaleChromaDistCb(primitives.ssd_s[log2TrSizeC - 2](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize));
+ if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
+ m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
- if (cbfFlag[chromaId][tuIterator.section])
- {
- m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiC, strideResiC, coeffCurC + subTUOffset,
- log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]);
- uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
- const uint32_t nonZeroDistC = m_rdCost.scaleChromaDistCb(dist);
- uint32_t nonZeroPsyEnergyC = 0;
- if (m_rdCost.m_psyRd)
- nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
-
- if (cu.m_tqBypass[0])
- {
- distC = nonZeroDistC;
- psyEnergyC = nonZeroPsyEnergyC;
- }
- else
+ fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
+ resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
+ numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
+ cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
+
+ //Coding cbf flags has been removed from here
+// m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][tuIterator.section], tuDepth);
+ if (cbfFlag[chromaId][tuIterator.section])
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
+ uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits();
+ singleBits[chromaId][tuIterator.section] = newBits - singleBitsPrev;
+ singleBitsPrev = newBits;
+
+ int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
+ distC = m_rdCost.scaleChromaDist(chromaId, primitives.ssd_s[log2TrSizeC - 2](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize));
+
+ if (cbfFlag[chromaId][tuIterator.section])
{
- uint64_t singleCostC = 0;
+ m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiC, strideResiC, coeffCurC + subTUOffset,
+ log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]);
+
+ // non-zero cost calculation for chroma, analogous to the luma case above - This is an approximation
+ // finally we have to encode the correct cbf after comparing with the null cost
+ uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
+ uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
+ uint32_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
+ uint32_t nonZeroPsyEnergyC = 0; uint64_t singleCostC = 0;
if (m_rdCost.m_psyRd)
- singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section], nonZeroPsyEnergyC);
+ {
+ nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
+ singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
+ }
else
- singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section]);
- m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbfZero((TextType)chromaId, tuDepth);
- const uint32_t nullBitsC = m_entropyCoder.getNumberOfWrittenBits();
- uint64_t nullCostC = 0;
- if (m_rdCost.m_psyRd)
- nullCostC = m_rdCost.calcPsyRdCost(distC, nullBitsC, psyEnergyC);
+ singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]);
+
+ if (cu.m_tqBypass[0])
+ {
+ singleDist[chromaId][tuIterator.section] = nonZeroDistC;
+ singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
+ }
else
- nullCostC = m_rdCost.calcRdCost(distC, nullBitsC);
- if (nullCostC < singleCostC)
{
- cbfFlag[chromaId][tuIterator.section] = 0;
+ //zero-cost calculation for chroma. This is an approximation
+ uint64_t nullCostC = estimateNullCbfCost(distC, psyEnergyC, tuDepth, (TextType)chromaId);
+
+ if (nullCostC < singleCostC)
+ {
+ cbfFlag[chromaId][tuIterator.section] = 0;
+ singleBits[chromaId][tuIterator.section] = 0;
+ primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0);
#if CHECKED_BUILD || _DEBUG
+ uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
#endif
if (checkTransformSkipC)
minCost[chromaId][tuIterator.section] = nullCostC;
+ singleDist[chromaId][tuIterator.section] = distC;
+ singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC;
}
else
{
- distC = nonZeroDistC;
- psyEnergyC = nonZeroPsyEnergyC;
if (checkTransformSkipC)
minCost[chromaId][tuIterator.section] = singleCostC;
+ singleDist[chromaId][tuIterator.section] = nonZeroDistC;
+ singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
}
}
}
- else if (checkTransformSkipC)
+ else
{
- m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbfZero((TextType)chromaId, tuDepthC);
- const uint32_t nullBitsC = m_entropyCoder.getNumberOfWrittenBits();
- if (m_rdCost.m_psyRd)
- minCost[chromaId][tuIterator.section] = m_rdCost.calcPsyRdCost(distC, nullBitsC, psyEnergyC);
- else
- minCost[chromaId][tuIterator.section] = m_rdCost.calcRdCost(distC, nullBitsC);
- }
-
- singleDistComp[chromaId][tuIterator.section] = distC;
- singlePsyEnergyComp[chromaId][tuIterator.section] = psyEnergyC;
-
- if (!cbfFlag[chromaId][tuIterator.section])
+ if (checkTransformSkipC)
+ minCost[chromaId][tuIterator.section] = estimateNullCbfCost(distC, psyEnergyC, tuDepthC, (TextType)chromaId);
primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0);
+ singleDist[chromaId][tuIterator.section] = distC;
+ singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC;
+ }
cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
}
if (m_bEnableRDOQ)
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
- fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
+ fenc = fencYuv->getLumaAddr(absPartIdx);
resi = resiYuv.getLumaAddr(absPartIdx);
uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, tsCoeffY, log2TrSize, TEXT_LUMA, absPartIdx, true);
if (numSigTSkipY)
{
m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbf(!!numSigTSkipY, TEXT_LUMA, tuDepth);
+ m_entropyCoder.codeQtCbfLuma(!!numSigTSkipY, tuDepth);
m_entropyCoder.codeCoeffNxN(cu, tsCoeffY, absPartIdx, log2TrSize, TEXT_LUMA);
const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits();
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
else
{
- singleDistComp[TEXT_LUMA][0] = nonZeroDistY;
- singlePsyEnergyComp[TEXT_LUMA][0] = nonZeroPsyEnergyY;
+ singleDist[TEXT_LUMA][0] = nonZeroDistY;
+ singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
bestTransformMode[TEXT_LUMA][0] = 1;
+ uint32_t numCoeffY = 1 << (log2TrSize << 1);
memcpy(coeffCurY, tsCoeffY, sizeof(coeff_t) * numCoeffY);
- primitives.square_copy_ss[partSize](curResiY, strideResiY, tsResiY, trSize);
+ primitives.luma_copy_ss[partSize](curResiY, strideResiY, tsResiY, trSize);
}
cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
- int16_t *curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
+ int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
ALIGN_VAR_32(coeff_t, tsCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
ALIGN_VAR_32(int16_t, tsResiC[MAX_TS_SIZE * MAX_TS_SIZE]);
if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
- fenc = const_cast<pixel*>(fencYuv->getChromaAddr(chromaId, absPartIdxC));
+ fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, tsCoeffC, log2TrSizeC, (TextType)chromaId, absPartIdxC, true);
m_entropyCoder.resetBits();
- singleBitsComp[chromaId][tuIterator.section] = 0;
+ singleBits[chromaId][tuIterator.section] = 0;
if (numSigTSkipC)
{
- m_entropyCoder.codeQtCbf(!!numSigTSkipC, (TextType)chromaId, tuDepth);
+ m_entropyCoder.codeQtCbfChroma(!!numSigTSkipC, tuDepth);
m_entropyCoder.codeCoeffNxN(cu, tsCoeffC, absPartIdxC, log2TrSizeC, (TextType)chromaId);
- singleBitsComp[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
+ singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], tsResiC, trSizeC, tsCoeffC,
log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC);
- nonZeroDistC = m_rdCost.scaleChromaDistCb(dist);
+ nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
if (m_rdCost.m_psyRd)
{
nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC);
- singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section], nonZeroPsyEnergyC);
+ singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
}
else
- singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section]);
+ singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section]);
}
if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC)
cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
else
{
- singleDistComp[chromaId][tuIterator.section] = nonZeroDistC;
- singlePsyEnergyComp[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
+ singleDist[chromaId][tuIterator.section] = nonZeroDistC;
+ singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC;
bestTransformMode[chromaId][tuIterator.section] = 1;
+ uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
memcpy(coeffCurC + subTUOffset, tsCoeffC, sizeof(coeff_t) * numCoeffC);
- primitives.square_copy_ss[partSizeC](curResiC, strideResiC, tsResiC, trSizeC);
+ primitives.luma_copy_ss[partSizeC](curResiC, strideResiC, tsResiC, trSizeC);
}
cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
}
}
+ // Previously cbfs and coefficients were both encoded here, after the distortion calculation above.
+ // Now only the cbfs are encoded, since the coefficients were already encoded above; the bits
+ // collected for the coefficients are simply added to the cbf bit count. Testing showed the
+ // encoding order makes no difference. TODO: confirm whether the original context must be loaded, as below.
m_entropyCoder.load(m_rqt[depth].rqtRoot);
-
m_entropyCoder.resetBits();
- if (log2TrSize > depthRange[0])
- m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
-
+ //Encode cbf flags
if (bCodeChroma)
{
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
if (!splitIntoSubTUs)
- m_entropyCoder.codeQtCbf(cbfFlag[chromaId][0], (TextType)chromaId, tuDepth);
+ m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][0], tuDepth);
else
{
offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx);
- for (uint32_t subTU = 0; subTU < 2; subTU++)
- m_entropyCoder.codeQtCbf(cbfFlag[chromaId][subTU], (TextType)chromaId, tuDepth);
+ m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][0], tuDepth);
+ m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][1], tuDepth);
}
}
}
- m_entropyCoder.codeQtCbf(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
- if (cbfFlag[TEXT_LUMA][0])
- m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
+ m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
- if (bCodeChroma)
- {
- uint32_t subTUSize = 1 << (log2TrSizeC * 2);
- uint32_t partIdxesPerSubTU = absPartIdxStep >> 1;
- uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
+ uint32_t cbfBits = m_entropyCoder.getNumberOfWrittenBits();
- for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
- {
- coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
- if (!splitIntoSubTUs)
- {
- if (cbfFlag[chromaId][0])
- m_entropyCoder.codeCoeffNxN(cu, coeffCurC, absPartIdx, log2TrSizeC, (TextType)chromaId);
- }
- else
- {
- for (uint32_t subTU = 0; subTU < 2; subTU++)
- {
- if (cbfFlag[chromaId][subTU])
- m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTU * subTUSize, absPartIdx + subTU * partIdxesPerSubTU, log2TrSizeC, (TextType)chromaId);
- }
- }
- }
+ uint32_t coeffBits = 0;
+ coeffBits = singleBits[TEXT_LUMA][0];
+ for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
+ {
+ coeffBits += singleBits[TEXT_CHROMA_U][subTUIndex];
+ coeffBits += singleBits[TEXT_CHROMA_V][subTUIndex];
}
- fullCost.distortion += singleDistComp[TEXT_LUMA][0];
- fullCost.energy += singlePsyEnergyComp[TEXT_LUMA][0];// need to check we need to add chroma also
+ // In split mode only coeffBits is needed, because chroma cbfs are encoded differently from luma:
+ // if any of the four split blocks has a non-zero cbf, a cbf of 1 is encoded first, followed by each
+ // split block's individual cbf value. This is not known until all four split blocks have been
+ // analysed, so only the per-block coefficient bits are collected here.
+ fullCost.bits = bSplitPresentFlag ? cbfBits + coeffBits : coeffBits;
+
+ fullCost.distortion += singleDist[TEXT_LUMA][0];
+ fullCost.energy += singlePsyEnergy[TEXT_LUMA][0];// need to check we need to add chroma also
for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
{
- fullCost.distortion += singleDistComp[TEXT_CHROMA_U][subTUIndex];
- fullCost.distortion += singleDistComp[TEXT_CHROMA_V][subTUIndex];
+ fullCost.distortion += singleDist[TEXT_CHROMA_U][subTUIndex];
+ fullCost.distortion += singleDist[TEXT_CHROMA_V][subTUIndex];
}
- fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
if (m_rdCost.m_psyRd)
fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
else
}
Cost splitCost;
- const uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
+ if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
+ {
+ // The subdiv flag can be encoded at the start of analysis of the split blocks.
+ m_entropyCoder.resetBits();
+ m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
+ splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
+ }
+
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
- for (uint32_t i = 0; i < 4; ++i)
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
- estimateResidualQT(mode, cuGeom, absPartIdx + i * qPartNumSubdiv, depth + 1, resiYuv, splitCost, depthRange);
- ycbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_LUMA, tuDepth + 1);
- ucbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_U, tuDepth + 1);
- vcbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_V, tuDepth + 1);
+ estimateResidualQT(mode, cuGeom, qPartIdx, depth + 1, resiYuv, splitCost, depthRange);
+ ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
+ ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+ vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
}
- for (uint32_t i = 0; i < 4 * qPartNumSubdiv; ++i)
+ for (uint32_t i = 0; i < 4 * qNumParts; ++i)
{
cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth;
cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
}
+ // Previously cbfs and coefficients were both encoded here for the split blocks. Since the coefficient
+ // bits were already collected per block, only the cbf values are encoded now. As noted above, chroma
+ // cbf encoding differs from luma. Open question: is it valid for coefficients to be encoded with the
+ // context at, e.g., depth 2 while the cbfs are encoded with the context at depth 0?
m_entropyCoder.load(m_rqt[depth].rqtRoot);
m_entropyCoder.resetBits();
- encodeResidualQT(cu, absPartIdx, depth, true, TEXT_LUMA, depthRange);
- encodeResidualQT(cu, absPartIdx, depth, false, TEXT_LUMA, depthRange);
- encodeResidualQT(cu, absPartIdx, depth, false, TEXT_CHROMA_U, depthRange);
- encodeResidualQT(cu, absPartIdx, depth, false, TEXT_CHROMA_V, depthRange);
-
- splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
+ codeInterSubdivCbfQT(cu, absPartIdx, depth, depthRange);
+ uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
+ splitCost.bits += splitCbfBits;
if (m_rdCost.m_psyRd)
splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth);
if (bCodeChroma)
{
- const uint32_t numberOfSections = splitIntoSubTUs ? 2 : 1;
-
- uint32_t partIdxesPerSubTU = absPartIdxStep >> (splitIntoSubTUs ? 1 : 0);
- for (uint32_t subTUIndex = 0; subTUIndex < numberOfSections; subTUIndex++)
+ if (!splitIntoSubTUs)
{
- const uint32_t subTUPartIdx = absPartIdx + (subTUIndex * partIdxesPerSubTU);
-
- cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][subTUIndex], TEXT_CHROMA_U, subTUPartIdx, partIdxesPerSubTU);
- cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][subTUIndex], TEXT_CHROMA_V, subTUPartIdx, partIdxesPerSubTU);
+ cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx, depth);
+ cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx, depth);
+ }
+ else
+ {
+ uint32_t tuNumParts = absPartIdxStep >> 1;
+ cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx , tuNumParts);
+ cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][1], TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
+ cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx , tuNumParts);
+ cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][1], TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
}
}
X265_CHECK(bCheckFull, "check-full must be set\n");
if (bCodeChroma)
{
- uint32_t numberOfSections = splitIntoSubTUs ? 2 : 1;
- uint32_t partIdxesPerSubTU = absPartIdxStep >> (splitIntoSubTUs ? 1 : 0);
-
- for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
+ if (!splitIntoSubTUs)
{
- for (uint32_t subTUIndex = 0; subTUIndex < numberOfSections; subTUIndex++)
- {
- const uint32_t subTUPartIdx = absPartIdx + (subTUIndex * partIdxesPerSubTU);
+ cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx, depth);
+ cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx, depth);
+ }
+ else
+ {
+ uint32_t tuNumParts = absPartIdxStep >> 1;
- if (splitIntoSubTUs)
- {
- uint8_t combinedSubTUCBF = cbfFlag[chromaId][0] | cbfFlag[chromaId][1];
- cu.setCbfPartRange(((cbfFlag[chromaId][subTUIndex] << 1) | combinedSubTUCBF) << tuDepth, (TextType)chromaId, subTUPartIdx, partIdxesPerSubTU);
- }
- else
- cu.setCbfPartRange(cbfFlag[chromaId][subTUIndex] << tuDepth, (TextType)chromaId, subTUPartIdx, partIdxesPerSubTU);
- }
+ offsetCBFs(cbfFlag[TEXT_CHROMA_U]);
+ offsetCBFs(cbfFlag[TEXT_CHROMA_V]);
+ cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx , tuNumParts);
+ cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][1] << tuDepth, TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
+ cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx , tuNumParts);
+ cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][1] << tuDepth, TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
}
}
outCosts.energy += fullCost.energy;
}
-void Search::encodeResidualQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, bool bSubdivAndCbf, TextType ttype, uint32_t depthRange[2])
+void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, const uint32_t depthRange[2])
{
X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
- X265_CHECK(cu.m_predMode[absPartIdx] != MODE_INTRA, "encodeResidualQT() with intra block\n");
+ X265_CHECK(cu.isInter(absPartIdx), "codeInterSubdivCbfQT() with intra block\n");
- const uint32_t curTuDepth = depth - cu.m_cuDepth[0];
- const uint32_t tuDepth = cu.m_tuDepth[absPartIdx];
- const bool bSubdiv = curTuDepth != tuDepth;
+ const uint32_t tuDepth = depth - cu.m_cuDepth[0];
+ const bool bSubdiv = tuDepth != cu.m_tuDepth[absPartIdx];
const uint32_t log2TrSize = g_maxLog2CUSize - depth;
- uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-
- const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
+ if (!(log2TrSize - m_hChromaShift < 2))
+ {
+ if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
+ m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !bSubdiv);
+ if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
+ m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !bSubdiv);
+ }
+ else
+ {
+ X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1), "chroma CBF not matching\n");
+ X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1), "chroma CBF not matching\n");
+ }
- if (bSubdivAndCbf && log2TrSize <= depthRange[1] && log2TrSize > depthRange[0])
- m_entropyCoder.codeTransformSubdivFlag(bSubdiv, 5 - log2TrSize);
+ if (!bSubdiv)
+ {
+ m_entropyCoder.codeQtCbfLuma(cu, absPartIdx, tuDepth);
+ }
+ else
+ {
+ uint32_t qNumParts = 1 << (log2TrSize -1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ codeInterSubdivCbfQT(cu, absPartIdx, depth + 1, depthRange);
+ }
+}
- bool mCodeAll = true;
- uint32_t trWidthC = 1 << log2TrSizeC;
- uint32_t trHeightC = splitIntoSubTUs ? (trWidthC << 1) : trWidthC;
+void Search::encodeResidualQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, TextType ttype, const uint32_t depthRange[2])
+{
+ X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
+ X265_CHECK(cu.isInter(absPartIdx), "encodeResidualQT() with intra block\n");
- const uint32_t numPels = trWidthC * trHeightC;
- if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE))
- mCodeAll = false;
+ const uint32_t curTuDepth = depth - cu.m_cuDepth[0];
+ const uint32_t tuDepth = cu.m_tuDepth[absPartIdx];
+ const bool bSubdiv = curTuDepth != tuDepth;
+ const uint32_t log2TrSize = g_maxLog2CUSize - depth;
- if (bSubdivAndCbf)
+ if (bSubdiv)
{
- const bool bFirstCbfOfCU = curTuDepth == 0;
- if (bFirstCbfOfCU || mCodeAll)
- {
- uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + curTuDepth) << 1);
- if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth - 1))
- m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_U, curTuDepth, !bSubdiv);
- if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth - 1))
- m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_V, curTuDepth, !bSubdiv);
- }
- else
+ if (cu.getCbf(absPartIdx, ttype, curTuDepth))
{
- X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth - 1), "chroma CBF not matching\n");
- X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth - 1), "chroma CBF not matching\n");
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ encodeResidualQT(cu, absPartIdx, depth + 1, ttype, depthRange);
}
+ return;
}
-
- if (!bSubdiv)
+ else
{
+ const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
+ uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
+
// Luma
const uint32_t qtLayer = log2TrSize - 2;
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
// Chroma
bool bCodeChroma = true;
uint32_t tuDepthC = tuDepth;
- if ((log2TrSize == 2) && !(m_csp == X265_CSP_I444))
+ if (log2TrSize == 2 && m_csp != X265_CSP_I444)
{
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
log2TrSizeC++;
tuDepthC--;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
- bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
+ bCodeChroma = !(absPartIdx & 3);
}
- if (bSubdivAndCbf)
- m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, tuDepth);
- else
+ if (ttype == TEXT_LUMA && cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
+
+ if (bCodeChroma)
{
- if (ttype == TEXT_LUMA && cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
+ uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
+ coeff_t* coeffCurU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
+ coeff_t* coeffCurV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
- if (bCodeChroma)
+ if (!splitIntoSubTUs)
{
- uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
- coeff_t* coeffCurU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
- coeff_t* coeffCurV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
-
- if (!splitIntoSubTUs)
+ if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
+ if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
+ }
+ else
+ {
+ uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);
+ uint32_t subTUSize = 1 << (log2TrSizeC * 2);
+ if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
{
- if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
+ if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
- if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
+ if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, TEXT_CHROMA_U);
}
- else
+ if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
{
- uint32_t partIdxesPerSubTU = NUM_CU_PARTITIONS >> (((cu.m_cuDepth[absPartIdx] + tuDepthC) << 1) + 1);
- uint32_t subTUSize = 1 << (log2TrSizeC * 2);
- if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
- {
- if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
- if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, tuDepth + 1))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_U);
- }
- if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
- {
- if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
- if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, tuDepth + 1))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_V);
- }
+ if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
+ if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, TEXT_CHROMA_V);
}
}
}
}
- else
- {
- if (bSubdivAndCbf || cu.getCbf(absPartIdx, ttype, curTuDepth))
- {
- const uint32_t qpartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
- for (uint32_t i = 0; i < 4; ++i)
- encodeResidualQT(cu, absPartIdx + i * qpartNumSubdiv, depth + 1, bSubdivAndCbf, ttype, depthRange);
- }
- }
}
void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t depth)
X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
const uint32_t curTrMode = depth - cu.m_cuDepth[0];
const uint32_t tuDepth = cu.m_tuDepth[absPartIdx];
+ const uint32_t log2TrSize = g_maxLog2CUSize - depth;
if (curTrMode < tuDepth)
{
- uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
- for (uint32_t i = 0; i < 4; i++, absPartIdx += qPartNumSubdiv)
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
saveResidualQTData(cu, resiYuv, absPartIdx, depth + 1);
return;
}
- const uint32_t log2TrSize = g_maxLog2CUSize - depth;
const uint32_t qtLayer = log2TrSize - 2;
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
bool bCodeChroma = true;
uint32_t tuDepthC = tuDepth;
- if (log2TrSizeC == 1)
+ if (log2TrSizeC < 2)
{
- X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444, "tuQuad check failed\n");
- log2TrSizeC++;
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ log2TrSizeC = 2;
tuDepthC--;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
- bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
+ bCodeChroma = !(absPartIdx & 3);
}
m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize);