Imported Upstream version 1.4+222+hg5f9f7194267b

[deb_x265.git] / source / encoder / analysis.cpp
diff --git a/source/encoder/analysis.cpp b/source/encoder/analysis.cpp

index c62f5f041bde26c6bc2137f95d6cdce7d9c5a622..8b8d1031fd04020bf448f0a7514673b7d1f9ace3 100644 (file)
--- a/source/encoder/analysis.cpp
+++ b/source/encoder/analysis.cpp
@@ -33,8 +33,6 @@
  #include "rdcost.h"
  #include "encoder.h"
  
-#include "PPA/ppa.h"
-
  using namespace x265;
  
  /* An explanation of rate distortion levels (--rd-level)
@@ -61,9 +59,12 @@ using namespace x265;
   *
   *   RDO selection between merge and skip
   *   sa8d selection of best inter mode
+ *   sa8d decisions include chroma residual cost
   *   RDO selection between (merge/skip) / best inter mode / intra / split
   *
   * rd-level 4 enables RDOQuant
+ *   chroma residual cost included in satd decisions, including subpel refine
+ *    (as a result of --subme 3 being used by preset slow)
   *
   * rd-level 5,6 does RDO for each inter mode
   */
@@ -71,12 +72,15 @@ using namespace x265;
  Analysis::Analysis()
  {
      m_totalNumJobs = m_numAcquiredJobs = m_numCompletedJobs = 0;
+    m_reuseIntraDataCTU = NULL;
+    m_reuseInterDataCTU = NULL;
  }
  
  bool Analysis::create(ThreadLocalData *tld)
  {
      m_tld = tld;
      m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2;
+    m_bChromaSa8d = m_param->rdLevel >= 3;
  
      int csp = m_param->internalCsp;
      uint32_t cuSize = g_maxCUSize;
@@ -116,7 +120,7 @@ void Analysis::destroy()
      }
  }
  
-Search::Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
+Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
  {
      m_slice = ctu.m_slice;
      m_frame = &frame;
@@ -124,27 +128,26 @@ Search::Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuG
      invalidateContexts(0);
      m_quant.setQPforQuant(ctu);
      m_rqt[0].cur.load(initialContext);
-    m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_origPicYuv, ctu.m_cuAddr, 0);
+    m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);
  
      uint32_t numPartition = ctu.m_numPartitions;
+    if (m_param->analysisMode)
+    {
+        m_reuseIntraDataCTU = (analysis_intra_data *)m_frame->m_analysisData.intraData;
+        int numPredDir = m_slice->isInterP() ? 1 : 2;
+        m_reuseInterDataCTU = (analysis_inter_data *)m_frame->m_analysisData.interData + ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir;
+    }
+
      if (m_slice->m_sliceType == I_SLICE)
      {
          uint32_t zOrder = 0;
-        if (m_param->analysisMode == X265_ANALYSIS_LOAD)
-            compressIntraCU(ctu, cuGeom, m_frame->m_intraData, zOrder);
-        else
+        compressIntraCU(ctu, cuGeom, zOrder);
+        if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.intraData)
          {
-            compressIntraCU(ctu, cuGeom, NULL, zOrder);
-
-            if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_intraData)
-            {
-                CUData *bestCU = &m_modeDepth[0].bestMode->cu;
-                memcpy(&m_frame->m_intraData->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
-                memcpy(&m_frame->m_intraData->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
-                memcpy(&m_frame->m_intraData->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
-                m_frame->m_intraData->cuAddr[ctu.m_cuAddr] = ctu.m_cuAddr;
-                m_frame->m_intraData->poc[ctu.m_cuAddr] = m_frame->m_poc;
-            }
+            CUData *bestCU = &m_modeDepth[0].bestMode->cu;
+            memcpy(&m_reuseIntraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
+            memcpy(&m_reuseIntraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
+            memcpy(&m_reuseIntraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
          }
      }
      else
@@ -152,10 +155,10 @@ Search::Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuG
          if (!m_param->rdLevel)
          {
              /* In RD Level 0/1, copy source pixels into the reconstructed block so
-             * they are available for intra predictions */
-            m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPicYuv, ctu.m_cuAddr, 0);
-            
-            compressInterCU_rd0_4(ctu, cuGeom); // TODO: this really wants to be compressInterCU_rd0_1
+            * they are available for intra predictions */
+            m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);
+
+            compressInterCU_rd0_4(ctu, cuGeom);
  
              /* generate residual for entire CTU at once and copy to reconPic */
              encodeResidue(ctu, cuGeom);
@@ -178,7 +181,7 @@ void Analysis::tryLossless(const CUGeom& cuGeom)
      if (!md.bestMode->distortion)
          /* already lossless */
          return;
-    else if (md.bestMode->cu.m_predMode[0] == MODE_INTRA)
+    else if (md.bestMode->cu.isIntra(0))
      {
          md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
          PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
@@ -195,7 +198,7 @@ void Analysis::tryLossless(const CUGeom& cuGeom)
      }
  }
  
-void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x265_intra_data* shared, uint32_t& zOrder)
+void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t& zOrder)
  {
      uint32_t depth = cuGeom.depth;
      ModeDepth& md = m_modeDepth[depth];
@@ -204,20 +207,20 @@ void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x2
      bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
      bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
  
-    if (shared)
+    if (m_param->analysisMode == X265_ANALYSIS_LOAD)
      {
-        uint8_t* sharedDepth = &shared->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
-        char* sharedPartSizes = &shared->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
-        uint8_t* sharedModes = &shared->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
+        uint8_t* reuseDepth  = &m_reuseIntraDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
+        uint8_t* reuseModes  = &m_reuseIntraDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
+        char* reusePartSizes = &m_reuseIntraDataCTU->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
  
-        if (mightNotSplit && depth == sharedDepth[zOrder] && zOrder == cuGeom.encodeIdx)
+        if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.encodeIdx)
          {
              m_quant.setQPforQuant(parentCTU);
  
-            PartSize size = (PartSize)sharedPartSizes[zOrder];
+            PartSize size = (PartSize)reusePartSizes[zOrder];
              Mode& mode = size == SIZE_2Nx2N ? md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN];
              mode.cu.initSubCU(parentCTU, cuGeom);
-            checkIntra(mode, cuGeom, size, sharedModes);
+            checkIntra(mode, cuGeom, size, &reuseModes[zOrder]);
              checkBestMode(mode, depth);
  
              if (m_bTryLossless)
@@ -227,7 +230,7 @@ void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x2
                  addSplitFlagCost(*md.bestMode, cuGeom.depth);
  
              // increment zOrder offset to point to next best depth in sharedDepth buffer
-            zOrder += g_depthInc[g_maxCUDepth - 1][sharedDepth[zOrder]];
+            zOrder += g_depthInc[g_maxCUDepth - 1][reuseDepth[zOrder]];
              mightSplit = false;
          }
      }
@@ -267,23 +270,23 @@ void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x2
  
          for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
          {
-            const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
-            if (childCuData.flags & CUGeom::PRESENT)
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+            if (childGeom.flags & CUGeom::PRESENT)
              {
-                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
+                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                  m_rqt[nextDepth].cur.load(*nextContext);
-                compressIntraCU(parentCTU, childCuData, shared, zOrder);
+                compressIntraCU(parentCTU, childGeom, zOrder);
  
                  // Save best CU and pred data for this sub CU
-                splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
+                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                  splitPred->addSubCosts(*nd.bestMode);
-                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
+                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                  nextContext = &nd.bestMode->contexts;
              }
              else
              {
                  /* record the depth of this non-present sub-CU */
-                splitCU->setEmptyPart(childCuData, subPartIdx);
+                splitCU->setEmptyPart(childGeom, subPartIdx);
                  zOrder += g_depthInc[g_maxCUDepth - 1][nextDepth];
              }
          }
@@ -300,38 +303,45 @@ void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x2
      /* Copy best data to encData CTU and recon */
      md.bestMode->cu.copyToPic(depth);
      if (md.bestMode != &md.pred[PRED_SPLIT])
-        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, parentCTU.m_cuAddr, cuGeom.encodeIdx);
+        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.encodeIdx);
  }
  
  bool Analysis::findJob(int threadId)
  {
      /* try to acquire a CU mode to analyze */
+    m_pmodeLock.acquire();
      if (m_totalNumJobs > m_numAcquiredJobs)
      {
-        /* ATOMIC_INC returns the incremented value */
-        int id = ATOMIC_INC(&m_numAcquiredJobs);
-        if (m_totalNumJobs >= id)
-        {
-            parallelModeAnalysis(threadId, id - 1);
+        int id = m_numAcquiredJobs++;
+        m_pmodeLock.release();
  
-            if (ATOMIC_INC(&m_numCompletedJobs) == m_totalNumJobs)
-                m_modeCompletionEvent.trigger();
-            return true;
-        }
+        parallelModeAnalysis(threadId, id);
+
+        m_pmodeLock.acquire();
+        if (++m_numCompletedJobs == m_totalNumJobs)
+            m_modeCompletionEvent.trigger();
+        m_pmodeLock.release();
+        return true;
      }
+    else
+        m_pmodeLock.release();
  
+    m_meLock.acquire();
      if (m_totalNumME > m_numAcquiredME)
      {
-        int id = ATOMIC_INC(&m_numAcquiredME);
-        if (m_totalNumME >= id)
-        {
-            parallelME(threadId, id - 1);
+        int id = m_numAcquiredME++;
+        m_meLock.release();
  
-            if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME)
-                m_meCompletionEvent.trigger();
-            return true;
-        }
+        parallelME(threadId, id);
+
+        m_meLock.acquire();
+        if (++m_numCompletedME == m_totalNumME)
+            m_meCompletionEvent.trigger();
+        m_meLock.release();
+        return true;
      }
+    else
+        m_meLock.release();
  
      return false;
  }
@@ -349,18 +359,14 @@ void Analysis::parallelME(int threadId, int meId)
          slave->m_slice = m_slice;
          slave->m_frame = m_frame;
  
-        PicYuv* fencPic = m_frame->m_origPicYuv;
-        pixel* pu = fencPic->getLumaAddr(m_curMECu->m_cuAddr, m_curGeom->encodeIdx + m_puAbsPartIdx);
-        slave->m_me.setSourcePlane(fencPic->m_picOrg[0], fencPic->m_stride);
-        slave->m_me.setSourcePU(pu - fencPic->m_picOrg[0], m_puWidth, m_puHeight);
-
-        slave->prepMotionCompensation(*m_curMECu, *m_curGeom, m_curPart);
+        slave->m_me.setSourcePU(*m_curInterMode->fencYuv, m_curInterMode->cu.m_cuAddr, m_curGeom->encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
+        slave->prepMotionCompensation(m_curInterMode->cu, *m_curGeom, m_curPart);
      }
  
      if (meId < m_slice->m_numRefIdx[0])
-        slave->singleMotionEstimation(*this, *m_curMECu, *m_curGeom, m_curPart, 0, meId);
+        slave->singleMotionEstimation(*this, *m_curInterMode, *m_curGeom, m_curPart, 0, meId);
      else
-        slave->singleMotionEstimation(*this, *m_curMECu, *m_curGeom, m_curPart, 1, meId - m_slice->m_numRefIdx[0]);
+        slave->singleMotionEstimation(*this, *m_curInterMode, *m_curGeom, m_curPart, 1, meId - m_slice->m_numRefIdx[0]);
  }
  
  void Analysis::parallelModeAnalysis(int threadId, int jobId)
@@ -376,8 +382,6 @@ void Analysis::parallelModeAnalysis(int threadId, int jobId)
          slave->m_frame = m_frame;
          slave->setQP(*m_slice, m_rdCost.m_qp);
          slave->invalidateContexts(0);
-        if (jobId)
-            slave->m_me.setSourcePlane(m_frame->m_origPicYuv->m_picOrg[0], m_frame->m_origPicYuv->m_stride);
      }
  
      ModeDepth& md = m_modeDepth[m_curGeom->depth];
@@ -389,13 +393,15 @@ void Analysis::parallelModeAnalysis(int threadId, int jobId)
          case 0:
              if (slave != this)
                  slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur);
-            slave->checkIntraInInter_rd0_4(md.pred[PRED_INTRA], *m_curGeom);
+            slave->checkIntraInInter(md.pred[PRED_INTRA], *m_curGeom);
              if (m_param->rdLevel > 2)
                  slave->encodeIntraInInter(md.pred[PRED_INTRA], *m_curGeom);
              break;
  
          case 1:
              slave->checkInter_rd0_4(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N);
+            if (m_slice->m_sliceType == B_SLICE)
+                slave->checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], *m_curGeom);
              break;
  
          case 2:
@@ -446,6 +452,13 @@ void Analysis::parallelModeAnalysis(int threadId, int jobId)
  
          case 1:
              slave->checkInter_rd5_6(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N, false);
+            md.pred[PRED_BIDIR].rdCost = MAX_INT64;
+            if (m_slice->m_sliceType == B_SLICE)
+            {
+                slave->checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], *m_curGeom);
+                if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
+                    slave->encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], *m_curGeom);
+            }
              break;
  
          case 2:
@@ -499,6 +512,7 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
  
          /* Initialize all prediction CUs based on parentCTU */
          md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
+        md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
          md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
          md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
          if (m_param->bEnableRectInter)
@@ -520,12 +534,14 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
                  md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
          }
  
+        m_pmodeLock.acquire();
          m_totalNumJobs = 2 + m_param->bEnableRectInter * 2 + bTryAmp * 4;
          m_numAcquiredJobs = !bTryIntra;
          m_numCompletedJobs = m_numAcquiredJobs;
          m_curGeom = &cuGeom;
          m_bJobsQueued = true;
          JobProvider::enqueue();
+        m_pmodeLock.release();
  
          for (int i = 0; i < m_totalNumJobs - m_numCompletedJobs; i++)
              m_pool->pokeIdleThread();
@@ -572,17 +588,26 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
  
              if (m_param->rdLevel > 2)
              {
-                /* encode best inter */
-                for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
+                /* RD selection between merge, inter, bidir and intra */
+                if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
                  {
-                    prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
-                    motionCompensation(bestInter->predYuv, false, true);
+                    for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
+                    {
+                        prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
+                        motionCompensation(bestInter->predYuv, false, true);
+                    }
                  }
                  encodeResAndCalcRdInterCU(*bestInter, cuGeom);
-
-                /* RD selection between merge, inter and intra */
                  checkBestMode(*bestInter, depth);
  
+                /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
+                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
+                    md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
+                {
+                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
+                    checkBestMode(md.pred[PRED_BIDIR], depth);
+                }
+
                  if (bTryIntra)
                      checkBestMode(md.pred[PRED_INTRA], depth);
              }
@@ -591,6 +616,9 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
                  if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
                      md.bestMode = bestInter;
  
+                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
+                    md.bestMode = &md.pred[PRED_BIDIR];
+
                  if (bTryIntra && md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
                  {
                      md.bestMode = &md.pred[PRED_INTRA];
@@ -614,6 +642,7 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
              m_modeCompletionEvent.wait();
  
              checkBestMode(md.pred[PRED_2Nx2N], depth);
+            checkBestMode(md.pred[PRED_BIDIR], depth);
  
              if (m_param->bEnableRectInter)
              {
@@ -640,7 +669,7 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
          if (md.bestMode->rdCost == MAX_INT64 && !bTryIntra)
          {
              md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
-            checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom);
+            checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
              encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
              checkBestMode(md.pred[PRED_INTRA], depth);
          }
@@ -655,7 +684,7 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
      bool bNoSplit = false;
      if (md.bestMode)
      {
-        bNoSplit = !!md.bestMode->cu.isSkipped(0);
+        bNoSplit = md.bestMode->cu.isSkipped(0);
          if (mightSplit && depth && depth >= minDepth && !bNoSplit && m_param->rdLevel <= 4)
              bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
      }
@@ -674,22 +703,22 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
  
          for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
          {
-            const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
-            if (childCuData.flags & CUGeom::PRESENT)
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+            if (childGeom.flags & CUGeom::PRESENT)
              {
-                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
+                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                  m_rqt[nextDepth].cur.load(*nextContext);
-                compressInterCU_dist(parentCTU, childCuData);
+                compressInterCU_dist(parentCTU, childGeom);
  
                  // Save best CU and pred data for this sub CU
-                splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
+                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                  splitPred->addSubCosts(*nd.bestMode);
  
-                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
+                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                  nextContext = &nd.bestMode->contexts;
              }
              else
-                splitCU->setEmptyPart(childCuData, subPartIdx);
+                splitCU->setEmptyPart(childGeom, subPartIdx);
          }
          nextContext->store(splitPred->contexts);
  
@@ -701,10 +730,10 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
          checkBestMode(*splitPred, depth);
      }
  
-    if (!depth || md.bestMode->cu.m_predMode[0] != MODE_INTRA)
+    if (mightNotSplit)
      {
          /* early-out statistics */
-        FrameData& curEncData = const_cast<FrameData&>(*m_frame->m_encData);
+        FrameData& curEncData = *m_frame->m_encData;
          FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
          uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
          cuStat.count[depth] += 1;
@@ -716,7 +745,7 @@ void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeo
      /* Copy best data to encData CTU and recon */
      md.bestMode->cu.copyToPic(depth);
      if (md.bestMode != &md.pred[PRED_SPLIT])
-        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cuAddr, cuGeom.encodeIdx);
+        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.encodeIdx);
  }
  
  void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom)
@@ -734,24 +763,9 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
      {
          bool bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;
  
-        /* Initialize all prediction CUs based on parentCTU */
-        md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
+        /* Compute Merge Cost */
          md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
          md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
-        if (m_param->bEnableRectInter)
-        {
-            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
-            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
-        }
-        if (m_slice->m_sps->maxAMPDepth > depth && cuGeom.log2CUSize < 6)
-        {
-            md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
-            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
-            md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
-            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
-        }
-
-        /* Compute Merge Cost */
          checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
  
          bool earlyskip = false;
@@ -760,14 +774,24 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
  
          if (!earlyskip)
          {
+            md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
              checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N);
-            Mode *bestInter = &md.pred[PRED_2Nx2N];
  
+            if (m_slice->m_sliceType == B_SLICE)
+            {
+                md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
+                checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
+            }
+
+            Mode *bestInter = &md.pred[PRED_2Nx2N];
              if (m_param->bEnableRectInter)
              {
+                md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
                  checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N);
                  if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
                      bestInter = &md.pred[PRED_Nx2N];
+
+                md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
                  checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN);
                  if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                      bestInter = &md.pred[PRED_2NxN];
@@ -789,18 +813,24 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
  
                  if (bHor)
                  {
+                    md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
                      checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU);
                      if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
                          bestInter = &md.pred[PRED_2NxnU];
+
+                    md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
                      checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD);
                      if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                          bestInter = &md.pred[PRED_2NxnD];
                  }
                  if (bVer)
                  {
+                    md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
                      checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N);
                      if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
                          bestInter = &md.pred[PRED_nLx2N];
+
+                    md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
                      checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N);
                      if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                          bestInter = &md.pred[PRED_nRx2N];
@@ -810,37 +840,48 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
              if (m_param->rdLevel >= 3)
              {
                  /* Calculate RD cost of best inter option */
-                for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
+                if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
                  {
-                    prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
-                    motionCompensation(bestInter->predYuv, false, true);
+                    for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
+                    {
+                        prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
+                        motionCompensation(bestInter->predYuv, false, true);
+                    }
                  }
-
                  encodeResAndCalcRdInterCU(*bestInter, cuGeom);
+                checkBestMode(*bestInter, depth);
  
-                if (!md.bestMode || bestInter->rdCost < md.bestMode->rdCost)
-                    md.bestMode = bestInter;
+                /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
+                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
+                    md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
+                {
+                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
+                    checkBestMode(md.pred[PRED_BIDIR], depth);
+                }
  
                  if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) ||
                      md.bestMode->sa8dCost == MAX_INT64)
                  {
                      md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
-                    checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom);
+                    checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
                      encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
-                    if (md.pred[PRED_INTRA].rdCost < md.bestMode->rdCost)
-                        md.bestMode = &md.pred[PRED_INTRA];
+                    checkBestMode(md.pred[PRED_INTRA], depth);
                  }
              }
              else
              {
-                /* SA8D choice between merge/skip, inter, and intra */
+                /* SA8D choice between merge/skip, inter, bidir, and intra */
                  if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
                      md.bestMode = bestInter;
  
+                if (m_slice->m_sliceType == B_SLICE &&
+                    md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
+                    md.bestMode = &md.pred[PRED_BIDIR];
+
                  if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64)
                  {
                      md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
-                    checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom);
+                    checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
                      if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
                          md.bestMode = &md.pred[PRED_INTRA];
                  }
@@ -854,7 +895,7 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
                      /* prediction already generated for this CU, and if rd level
                       * is not 0, it is already fully encoded */
                  }
-                else if (md.bestMode->cu.m_predMode[0] == MODE_INTER)
+                else if (md.bestMode->cu.isInter(0))
                  {
                      for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++)
                      {
@@ -865,8 +906,23 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
                          encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
                      else if (m_param->rdLevel == 1)
                      {
-                        m_rqt[cuGeom.depth].tmpResiYuv.subtract(md.fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize);
-                        generateCoeffRecon(*md.bestMode, cuGeom);
+                        /* generate recon pixels with no rate distortion considerations */
+                        CUData& cu = md.bestMode->cu;
+                        m_quant.setQPforQuant(cu);
+
+                        uint32_t tuDepthRange[2];
+                        cu.getInterTUQtDepthRange(tuDepthRange, 0);
+
+                        m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize);
+                        residualTransformQuantInter(*md.bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange);
+                        if (cu.getQtRootCbf(0))
+                            md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]);
+                        else
+                        {
+                            md.bestMode->reconYuv.copyFromYuv(md.bestMode->predYuv);
+                            if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
+                                cu.setPredModeSubParts(MODE_SKIP);
+                        }
                      }
                  }
                  else
@@ -874,7 +930,20 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
                      if (m_param->rdLevel == 2)
                          encodeIntraInInter(*md.bestMode, cuGeom);
                      else if (m_param->rdLevel == 1)
-                        generateCoeffRecon(*md.bestMode, cuGeom);
+                    {
+                        /* generate recon pixels with no rate distortion considerations */
+                        CUData& cu = md.bestMode->cu;
+                        m_quant.setQPforQuant(cu);
+
+                        uint32_t tuDepthRange[2];
+                        cu.getIntraTUQtDepthRange(tuDepthRange, 0);
+
+                        uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
+                        residualTransformQuantIntra(*md.bestMode, cuGeom, initTuDepth, 0, tuDepthRange);
+                        getBestIntraModeChroma(*md.bestMode, cuGeom);
+                        residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
+                        md.bestMode->reconYuv.copyFromPicYuv(*m_frame->m_reconPic, cu.m_cuAddr, cuGeom.encodeIdx); // TODO:
+                    }
                  }
              }
          } // !earlyskip
@@ -889,7 +958,7 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
      bool bNoSplit = false;
      if (md.bestMode)
      {
-        bNoSplit = !!md.bestMode->cu.isSkipped(0);
+        bNoSplit = md.bestMode->cu.isSkipped(0);
          if (mightSplit && depth && depth >= minDepth && !bNoSplit)
              bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
      }
@@ -908,54 +977,48 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
  
          for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
          {
-            const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
-            if (childCuData.flags & CUGeom::PRESENT)
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+            if (childGeom.flags & CUGeom::PRESENT)
              {
-                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
+                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                  m_rqt[nextDepth].cur.load(*nextContext);
-                compressInterCU_rd0_4(parentCTU, childCuData);
+                compressInterCU_rd0_4(parentCTU, childGeom);
  
                  // Save best CU and pred data for this sub CU
-                splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
+                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                  splitPred->addSubCosts(*nd.bestMode);
  
                  if (m_param->rdLevel)
-                    nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
+                    nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                  else
-                    nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childCuData.numPartitions * subPartIdx);
+                    nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childGeom.numPartitions * subPartIdx);
                  if (m_param->rdLevel > 1)
                      nextContext = &nd.bestMode->contexts;
              }
              else
-                splitCU->setEmptyPart(childCuData, subPartIdx);
+                splitCU->setEmptyPart(childGeom, subPartIdx);
          }
          nextContext->store(splitPred->contexts);
  
          if (mightNotSplit)
              addSplitFlagCost(*splitPred, cuGeom.depth);
-        else if (m_param->rdLevel <= 1)
-            splitPred->sa8dCost = m_rdCost.calcRdSADCost(splitPred->distortion, splitPred->sa8dBits);
-        else
+        else if (m_param->rdLevel > 1)
              updateModeCost(*splitPred);
+        else
+            splitPred->sa8dCost = m_rdCost.calcRdSADCost(splitPred->distortion, splitPred->sa8dBits);
  
          if (!md.bestMode)
              md.bestMode = splitPred;
-        else if (m_param->rdLevel >= 1)
-        {
-            if (splitPred->rdCost < md.bestMode->rdCost)
-                md.bestMode = splitPred;
-        }
-        else
-        {
-            if (splitPred->sa8dCost < md.bestMode->sa8dCost)
-                md.bestMode = splitPred;
-        }
+        else if (m_param->rdLevel > 1)
+            checkBestMode(*splitPred, cuGeom.depth);
+        else if (splitPred->sa8dCost < md.bestMode->sa8dCost)
+            md.bestMode = splitPred;
      }
  
-    if (!depth || md.bestMode->cu.m_predMode[0] != MODE_INTRA)
+    if (mightNotSplit)
      {
          /* early-out statistics */
-        FrameData& curEncData = const_cast<FrameData&>(*m_frame->m_encData);
+        FrameData& curEncData = *m_frame->m_encData;
          FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
          uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
          cuStat.count[depth] += 1;
@@ -967,7 +1030,7 @@ void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGe
      /* Copy best data to encData CTU and recon */
      md.bestMode->cu.copyToPic(depth);
      if (md.bestMode != &md.pred[PRED_SPLIT] && m_param->rdLevel)
-        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cuAddr, cuGeom.encodeIdx);
+        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.encodeIdx);
  }
  
  void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom)
@@ -981,27 +1044,39 @@ void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe
  
      if (mightNotSplit)
      {
-        for (int i = 0; i < MAX_PRED_TYPES; i++)
-            md.pred[i].cu.initSubCU(parentCTU, cuGeom);
-
+        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
+        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
          checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
          bool earlySkip = m_param->bEnableEarlySkip && md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
  
          if (!earlySkip)
          {
+            md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
              checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, false);
              checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
  
+            if (m_slice->m_sliceType == B_SLICE)
+            {
+                md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
+                checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
+                if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
+                {
+                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
+                    checkBestMode(md.pred[PRED_BIDIR], cuGeom.depth);
+                }
+            }
+
              if (m_param->bEnableRectInter)
              {
-                // Nx2N rect
                  if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                  {
+                    md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
                      checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, false);
                      checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);
                  }
                  if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                  {
+                    md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
                      checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, false);
                      checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
                  }
@@ -1027,11 +1102,13 @@ void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe
                  {
                      if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                      {
+                        md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
                          checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, bMergeOnly);
                          checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);
                      }
                      if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                      {
+                        md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
                          checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, bMergeOnly);
                          checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
                      }
@@ -1040,11 +1117,13 @@ void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe
                  {
                      if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                      {
+                        md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
                          checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, bMergeOnly);
                          checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);
                      }
                      if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                      {
+                        md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
                          checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, bMergeOnly);
                          checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
                      }
@@ -1054,11 +1133,13 @@ void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe
              if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) &&
                  (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)))
              {
+                md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
                  checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL);
                  checkBestMode(md.pred[PRED_INTRA], depth);
  
                  if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                  {
+                    md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
                      checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL);
                      checkBestMode(md.pred[PRED_INTRA_NxN], depth);
                  }
@@ -1087,21 +1168,21 @@ void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe
  
          for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
          {
-            const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
-            if (childCuData.flags & CUGeom::PRESENT)
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+            if (childGeom.flags & CUGeom::PRESENT)
              {
-                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
+                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                  m_rqt[nextDepth].cur.load(*nextContext);
-                compressInterCU_rd5_6(parentCTU, childCuData);
+                compressInterCU_rd5_6(parentCTU, childGeom);
  
                  // Save best CU and pred data for this sub CU
-                splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
+                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                  splitPred->addSubCosts(*nd.bestMode);
-                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
+                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                  nextContext = &nd.bestMode->contexts;
              }
              else
-                splitCU->setEmptyPart(childCuData, subPartIdx);
+                splitCU->setEmptyPart(childGeom, subPartIdx);
          }
          nextContext->store(splitPred->contexts);
          if (mightNotSplit)
@@ -1117,7 +1198,7 @@ void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGe
      /* Copy best data to encData CTU and recon */
      md.bestMode->cu.copyToPic(depth);
      if (md.bestMode != &md.pred[PRED_SPLIT])
-        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, parentCTU.m_cuAddr, cuGeom.encodeIdx);
+        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.encodeIdx);
  }
  
  /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
@@ -1148,7 +1229,12 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe
  
      bestPred->sa8dCost = MAX_INT64;
      int bestSadCand = -1;
-    int sizeIdx = cuGeom.log2CUSize - 2;
+    int cpart, sizeIdx = cuGeom.log2CUSize - 2;
+    if (m_bChromaSa8d)
+    {
+        int cuSize = 1 << cuGeom.log2CUSize;
+        cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
+    }
      for (uint32_t i = 0; i < maxNumMergeCand; ++i)
      {
          if (m_bFrameParallel &&
@@ -1159,16 +1245,20 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe
          tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx
          tempPred->cu.m_interDir[0] = interDirNeighbours[i];
          tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
-        tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx;
+        tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
          tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
-        tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx;
+        tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
  
-        // do MC only for Luma part
          prepMotionCompensation(tempPred->cu, cuGeom, 0);
-        motionCompensation(tempPred->predYuv, true, false);
+        motionCompensation(tempPred->predYuv, true, m_bChromaSa8d);
  
          tempPred->sa8dBits = getTUBits(i, maxNumMergeCand);
          tempPred->distortion = primitives.sa8d[sizeIdx](fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
+        if (m_bChromaSa8d)
+        {
+            tempPred->distortion += primitives.sa8d_inter[cpart](fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
+            tempPred->distortion += primitives.sa8d_inter[cpart](fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
+        }
          tempPred->sa8dCost = m_rdCost.calcRdSADCost(tempPred->distortion, tempPred->sa8dBits);
  
          if (tempPred->sa8dCost < bestPred->sa8dCost)
@@ -1183,8 +1273,11 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe
          return;
  
      /* calculate the motion compensation for chroma for the best mode selected */
-    prepMotionCompensation(bestPred->cu, cuGeom, 0);
-    motionCompensation(bestPred->predYuv, false, true);
+    if (!m_bChromaSa8d) /* Chroma MC was done above */
+    {
+        prepMotionCompensation(bestPred->cu, cuGeom, 0);
+        motionCompensation(bestPred->predYuv, false, true);
+    }
  
      if (m_param->rdLevel)
      {
@@ -1197,9 +1290,9 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe
          tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand;
          tempPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0);
          tempPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0);
-        tempPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
+        tempPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
          tempPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0);
-        tempPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
+        tempPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
          tempPred->sa8dCost = bestPred->sa8dCost;
          tempPred->predYuv.copyFromYuv(bestPred->predYuv);
  
@@ -1213,9 +1306,9 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGe
      /* broadcast sets of MV field data */
      bestPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0);
      bestPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0);
-    bestPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
+    bestPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
      bestPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0);
-    bestPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
+    bestPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
  }
  
  /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
@@ -1269,10 +1362,10 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGe
          tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;    /* merge candidate ID is stored in L0 MVP idx */
          tempPred->cu.m_interDir[0] = interDirNeighbours[i];
          tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
-        tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx;
+        tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
          tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
-        tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx;
-        tempPred->cu.setSkipFlagSubParts(false);     /* must be cleared between encode iterations */
+        tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
+        tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */
  
          prepMotionCompensation(tempPred->cu, cuGeom, 0);
          motionCompensation(tempPred->predYuv, true, true);
@@ -1302,10 +1395,10 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGe
                  tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;
                  tempPred->cu.m_interDir[0] = interDirNeighbours[i];
                  tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
-                tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx;
+                tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
                  tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
-                tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx;
-                tempPred->cu.setSkipFlagSubParts(false);
+                tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
+                tempPred->cu.setPredModeSubParts(MODE_INTER);
                  tempPred->predYuv.copyFromYuv(bestPred->predYuv);
              }
              
@@ -1324,9 +1417,9 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGe
          uint32_t bestCand = bestPred->cu.m_mvpIdx[0][0];
          bestPred->cu.setPUInterDir(interDirNeighbours[bestCand], 0, 0);
          bestPred->cu.setPUMv(0, mvFieldNeighbours[bestCand][0].mv, 0, 0);
-        bestPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestCand][0].refIdx, 0, 0);
+        bestPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestCand][0].refIdx, 0, 0);
          bestPred->cu.setPUMv(1, mvFieldNeighbours[bestCand][1].mv, 0, 0);
-        bestPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestCand][1].refIdx, 0, 0);
+        bestPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestCand][1].refIdx, 0, 0);
      }
  }
  
@@ -1335,14 +1428,48 @@ void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize
      interMode.initCosts();
      interMode.cu.setPartSizeSubParts(partSize);
      interMode.cu.setPredModeSubParts(MODE_INTER);
+    int numPredDir = m_slice->isInterP() ? 1 : 2;
  
-    if (predInterSearch(interMode, cuGeom, false, false))
+    if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU)
+    {
+        for (uint32_t part = 0; part < interMode.cu.getNumPartInter(); part++)
+        {
+            MotionData* bestME = interMode.bestME[part];
+            for (int32_t i = 0; i < numPredDir; i++)
+            {
+                bestME[i].ref = m_reuseInterDataCTU->ref;
+                m_reuseInterDataCTU++;
+            }
+        }
+    }
+    if (predInterSearch(interMode, cuGeom, false, m_bChromaSa8d))
      {
          /* predInterSearch sets interMode.sa8dBits */
          const Yuv& fencYuv = *interMode.fencYuv;
          Yuv& predYuv = interMode.predYuv;
-        interMode.distortion = primitives.sa8d[cuGeom.log2CUSize - 2](fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
+        int part = partitionFromLog2Size(cuGeom.log2CUSize);
+        interMode.distortion = primitives.sa8d[part](fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
+        if (m_bChromaSa8d)
+        {
+            uint32_t cuSize = 1 << cuGeom.log2CUSize;
+            int cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
+            interMode.distortion += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
+            interMode.distortion += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
+        }
          interMode.sa8dCost = m_rdCost.calcRdSADCost(interMode.distortion, interMode.sa8dBits);
+
+        if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
+        {
+            for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
+            {
+                MotionData* bestME = interMode.bestME[puIdx];
+                for (int32_t i = 0; i < numPredDir; i++)
+                {
+                    m_reuseInterDataCTU->ref = bestME[i].ref;
+                    m_reuseInterDataCTU++;
+                }
+            }
+        }
      }
      else
      {
@@ -1356,11 +1483,37 @@ void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize
      interMode.initCosts();
      interMode.cu.setPartSizeSubParts(partSize);
      interMode.cu.setPredModeSubParts(MODE_INTER);
+    int numPredDir = m_slice->isInterP() ? 1 : 2;
  
+    if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU)
+    {
+        for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
+        {
+            MotionData* bestME = interMode.bestME[puIdx];
+            for (int32_t i = 0; i < numPredDir; i++)
+            {
+                bestME[i].ref = m_reuseInterDataCTU->ref;
+                m_reuseInterDataCTU++;
+            }
+        }
+    }
      if (predInterSearch(interMode, cuGeom, bMergeOnly, true))
      {
          /* predInterSearch sets interMode.sa8dBits, but this is ignored */
          encodeResAndCalcRdInterCU(interMode, cuGeom);
+
+        if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
+        {
+            for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
+            {
+                MotionData* bestME = interMode.bestME[puIdx];
+                for (int32_t i = 0; i < numPredDir; i++)
+                {
+                    m_reuseInterDataCTU->ref = bestME[i].ref;
+                    m_reuseInterDataCTU++;
+                }
+            }
+        }
      }
      else
      {
@@ -1369,221 +1522,151 @@ void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize
      }
  }
  
-/* Note that this function does not save the best intra prediction, it must
- * be generated later. It records the best mode in the cu */
-void Analysis::checkIntraInInter_rd0_4(Mode& intraMode, const CUGeom& cuGeom)
+void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom)
  {
-    CUData& cu = intraMode.cu;
-    uint32_t depth = cu.m_cuDepth[0];
+    CUData& cu = bidir2Nx2N.cu;
  
-    cu.setPartSizeSubParts(SIZE_2Nx2N);
-    cu.setPredModeSubParts(MODE_INTRA);
-
-    uint32_t initTrDepth = 0;
-    uint32_t log2TrSize  = cu.m_log2CUSize[0] - initTrDepth;
-    uint32_t tuSize      = 1 << log2TrSize;
-    const uint32_t absPartIdx  = 0;
-
-    // Reference sample smoothing
-    initAdiPattern(cu, cuGeom, absPartIdx, initTrDepth, ALL_IDX);
-
-    pixel* fenc = m_modeDepth[depth].fencYuv.m_buf[0];
-    uint32_t stride = m_modeDepth[depth].fencYuv.m_size;
-
-    pixel *above         = m_refAbove    + tuSize - 1;
-    pixel *aboveFiltered = m_refAboveFlt + tuSize - 1;
-    pixel *left          = m_refLeft     + tuSize - 1;
-    pixel *leftFiltered  = m_refLeftFlt  + tuSize - 1;
-    int sad, bsad;
-    uint32_t bits, bbits, mode, bmode;
-    uint64_t cost, bcost;
-
-    // 33 Angle modes once
-    ALIGN_VAR_32(pixel, bufScale[32 * 32]);
-    ALIGN_VAR_32(pixel, bufTrans[32 * 32]);
-    ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]);
-    int scaleTuSize = tuSize;
-    int scaleStride = stride;
-    int costShift = 0;
-    int sizeIdx = log2TrSize - 2;
-
-    if (tuSize > 32)
+    if (cu.isBipredRestriction() || inter2Nx2N.bestME[0][0].cost == MAX_UINT || inter2Nx2N.bestME[0][1].cost == MAX_UINT)
      {
-        // origin is 64x64, we scale to 32x32 and setup required parameters
-        primitives.scale2D_64to32(bufScale, fenc, stride);
-        fenc = bufScale;
-
-        // reserve space in case primitives need to store data in above
-        // or left buffers
-        pixel _above[4 * 32 + 1];
-        pixel _left[4 * 32 + 1];
-        pixel *aboveScale  = _above + 2 * 32;
-        pixel *leftScale   = _left + 2 * 32;
-        aboveScale[0] = leftScale[0] = above[0];
-        primitives.scale1D_128to64(aboveScale + 1, above + 1, 0);
-        primitives.scale1D_128to64(leftScale + 1, left + 1, 0);
-
-        scaleTuSize = 32;
-        scaleStride = 32;
-        costShift = 2;
-        sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
-
-        // Filtered and Unfiltered refAbove and refLeft pointing to above and left.
-        above         = aboveScale;
-        left          = leftScale;
-        aboveFiltered = aboveScale;
-        leftFiltered  = leftScale;
+        bidir2Nx2N.sa8dCost = MAX_INT64;
+        bidir2Nx2N.rdCost = MAX_INT64;
+        return;
      }
  
-    pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
-    int predsize = scaleTuSize * scaleTuSize;
-
-    m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
+    const Yuv& fencYuv = *bidir2Nx2N.fencYuv;
+    MV   mvzero(0, 0);
+    int  cpart, partEnum = cuGeom.log2CUSize - 2;
  
-    /* there are three cost tiers for intra modes:
-     *  pred[0]          - mode probable, least cost
-     *  pred[1], pred[2] - less probable, slightly more cost
-     *  non-mpm modes    - all cost the same (rbits) */
-    uint64_t mpms;
-    uint32_t preds[3];
-    uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms);
-
-    // DC
-    primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
-    bsad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
-    bmode = mode = DC_IDX;
-    bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
-    bcost = m_rdCost.calcRdSADCost(bsad, bbits);
-
-    pixel *abovePlanar = above;
-    pixel *leftPlanar  = left;
-
-    if (tuSize & (8 | 16 | 32))
+    if (m_bChromaSa8d)
      {
-        abovePlanar = aboveFiltered;
-        leftPlanar  = leftFiltered;
+        int cuSize = 1 << cuGeom.log2CUSize;
+        cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
      }
  
-    // PLANAR
-    primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
-    sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
-    mode = PLANAR_IDX;
-    bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
-    cost = m_rdCost.calcRdSADCost(sad, bits);
-    COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
-
-    // Transpose NxN
-    primitives.transpose[sizeIdx](bufTrans, fenc, scaleStride);
-
-    primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
-
-    bool modeHor;
-    pixel *cmp;
-    intptr_t srcStride;
-
-#define TRY_ANGLE(angle) \
-    modeHor = angle < 18; \
-    cmp = modeHor ? bufTrans : fenc; \
-    srcStride = modeHor ? scaleTuSize : scaleStride; \
-    sad = sa8d(cmp, srcStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \
-    bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
-    cost = m_rdCost.calcRdSADCost(sad, bits)
+    bidir2Nx2N.bestME[0][0] = inter2Nx2N.bestME[0][0];
+    bidir2Nx2N.bestME[0][1] = inter2Nx2N.bestME[0][1];
+    MotionData* bestME = bidir2Nx2N.bestME[0];
+    int ref0    = bestME[0].ref;
+    MV  mvp0    = bestME[0].mvp;
+    int mvpIdx0 = bestME[0].mvpIdx;
+    int ref1    = bestME[1].ref;
+    MV  mvp1    = bestME[1].mvp;
+    int mvpIdx1 = bestME[1].mvpIdx;
+
+    bidir2Nx2N.initCosts();
+    cu.setPartSizeSubParts(SIZE_2Nx2N);
+    cu.setPredModeSubParts(MODE_INTER);
+    cu.setPUInterDir(3, 0, 0);
+    cu.setPURefIdx(0, (int8_t)ref0, 0, 0);
+    cu.setPURefIdx(1, (int8_t)ref1, 0, 0);
+    cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;
+    cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
+    cu.m_mergeFlag[0] = 0;
+
+    /* Estimate cost of BIDIR using best 2Nx2N L0 and L1 motion vectors */
+    cu.setPUMv(0, bestME[0].mv, 0, 0);
+    cu.m_mvd[0][0] = bestME[0].mv - mvp0;
+
+    cu.setPUMv(1, bestME[1].mv, 0, 0);
+    cu.m_mvd[1][0] = bestME[1].mv - mvp1;
+
+    prepMotionCompensation(cu, cuGeom, 0);
+    motionCompensation(bidir2Nx2N.predYuv, true, m_bChromaSa8d);
+
+    int sa8d = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size);
+    if (m_bChromaSa8d)
+    {
+        /* Add in chroma distortion */
+        sa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[1], bidir2Nx2N.predYuv.m_csize);
+        sa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[2], bidir2Nx2N.predYuv.m_csize);
+    }
+    bidir2Nx2N.sa8dBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
+    bidir2Nx2N.sa8dCost = sa8d + m_rdCost.getCost(bidir2Nx2N.sa8dBits);
  
-    if (m_param->bEnableFastIntra)
+    bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
+    if (bTryZero)
+    {
+        /* Do not try zero MV if unidir motion predictors are beyond
+         * valid search area */
+        MV mvmin, mvmax;
+        int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
+        setSearchRange(cu, mvzero, merange, mvmin, mvmax);
+        mvmax.y += 2; // there is some pad for subpel refine
+        mvmin <<= 2;
+        mvmax <<= 2;
+
+        bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
+        bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
+    }
+    if (bTryZero)
      {
-        int asad = 0;
-        uint32_t lowmode, highmode, amode = 5, abits = 0;
-        uint64_t acost = MAX_INT64;
+        /* Estimate cost of BIDIR using coincident blocks */
+        Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
  
-        /* pick the best angle, sampling at distance of 5 */
-        for (mode = 5; mode < 35; mode += 5)
-        {
-            TRY_ANGLE(mode);
-            COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
-        }
+        int zsa8d;
  
-        /* refine best angle at distance 2, then distance 1 */
-        for (uint32_t dist = 2; dist >= 1; dist--)
+        if (m_bChromaSa8d)
          {
-            lowmode = amode - dist;
-            highmode = amode + dist;
+            cu.m_mv[0][0] = mvzero;
+            cu.m_mv[1][0] = mvzero;
  
-            X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n");
-            TRY_ANGLE(lowmode);
-            COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits);
+            prepMotionCompensation(cu, cuGeom, 0);
+            motionCompensation(tmpPredYuv, true, true);
  
-            X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n");
-            TRY_ANGLE(highmode);
-            COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits);
+            zsa8d  = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+            zsa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize);
+            zsa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, tmpPredYuv.m_buf[2], tmpPredYuv.m_csize);
          }
-
-        if (amode == 33)
+        else
          {
-            TRY_ANGLE(34);
-            COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits);
-        }
+            pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx);
+            pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx);
+            intptr_t refStride = m_slice->m_mref[0][0].lumaStride;
  
-        COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
-    }
-    else // calculate and search all intra prediction angles for lowest cost
-    {
-        for (mode = 2; mode < 35; mode++)
-        {
-            TRY_ANGLE(mode);
-            COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
+            primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
+            zsa8d = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
          }
-    }
  
-    cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTrDepth);
-    intraMode.initCosts();
-    intraMode.totalBits = bbits;
-    intraMode.distortion = bsad;
-    intraMode.sa8dCost = bcost;
-    intraMode.sa8dBits = bbits;
-}
+        uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
+        uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
+        uint32_t zcost = zsa8d + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
  
-void Analysis::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
-{
-    CUData& cu = intraMode.cu;
-    Yuv* reconYuv = &intraMode.reconYuv;
-    Yuv* fencYuv = &m_modeDepth[cuGeom.depth].fencYuv;
+        /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
+        checkBestMVP(inter2Nx2N.amvpCand[0][ref0], mvzero, mvp0, mvpIdx0, bits0, zcost);
+        checkBestMVP(inter2Nx2N.amvpCand[1][ref1], mvzero, mvp1, mvpIdx1, bits1, zcost);
  
-    X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
-    X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");
+        uint32_t zbits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
+        zcost = zsa8d + m_rdCost.getCost(zbits);
  
-    m_quant.setQPforQuant(cu);
-
-    uint32_t tuDepthRange[2];
-    cu.getIntraTUQtDepthRange(tuDepthRange, 0);
-
-    m_entropyCoder.load(m_rqt[cuGeom.depth].cur);
-
-    Cost icosts;
-    codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange);
-    extractIntraResultQT(cu, *reconYuv, 0, 0);
-
-    intraMode.distortion  = icosts.distortion;
-    intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom);
-
-    m_entropyCoder.resetBits();
-    if (m_slice->m_pps->bTransquantBypassEnabled)
-        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
-    m_entropyCoder.codeSkipFlag(cu, 0);
-    m_entropyCoder.codePredMode(cu.m_predMode[0]);
-    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
-    m_entropyCoder.codePredInfo(cu, 0);
-    intraMode.mvBits += m_entropyCoder.getNumberOfWrittenBits();
+        if (zcost < bidir2Nx2N.sa8dCost)
+        {
+            bidir2Nx2N.sa8dBits = zbits;
+            bidir2Nx2N.sa8dCost = zcost;
  
-    bool bCodeDQP = m_slice->m_pps->bUseDQP;
-    m_entropyCoder.codeCoeff(cu, 0, cuGeom.depth, bCodeDQP, tuDepthRange);
+            cu.setPUMv(0, mvzero, 0, 0);
+            cu.m_mvd[0][0] = mvzero - mvp0;
+            cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;
  
-    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
-    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
-    if (m_rdCost.m_psyRd)
-        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
+            cu.setPUMv(1, mvzero, 0, 0);
+            cu.m_mvd[1][0] = mvzero - mvp1;
+            cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
  
-    m_entropyCoder.store(intraMode.contexts);
-    updateModeCost(intraMode);
+            if (m_bChromaSa8d)
+                /* real MC was already performed */
+                bidir2Nx2N.predYuv.copyFromYuv(tmpPredYuv);
+            else
+            {
+                prepMotionCompensation(cu, cuGeom, 0);
+                motionCompensation(bidir2Nx2N.predYuv, true, true);
+            }
+        }
+        else if (m_bChromaSa8d)
+        {
+            /* recover overwritten motion vectors */
+            cu.m_mv[0][0] = bestME[0].mv;
+            cu.m_mv[1][0] = bestME[1].mv;
+        }
+    }
  }
  
  void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
@@ -1592,9 +1675,9 @@ void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
      {
          for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
          {
-            const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
-            if (childCuData.flags & CUGeom::PRESENT)
-                encodeResidue(ctu, childCuData);
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+            if (childGeom.flags & CUGeom::PRESENT)
+                encodeResidue(ctu, childGeom);
          }
          return;
      }
@@ -1602,29 +1685,31 @@ void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
      uint32_t absPartIdx = cuGeom.encodeIdx;
      int sizeIdx = cuGeom.log2CUSize - 2;
  
-    Yuv& fencYuv = m_modeDepth[0].fencYuv;
-
      /* reuse the bestMode data structures at the current depth */
      Mode *bestMode = m_modeDepth[cuGeom.depth].bestMode;
-    Yuv& reconYuv = bestMode->reconYuv;
      CUData& cu = bestMode->cu;
  
      cu.copyFromPic(ctu, cuGeom);
      m_quant.setQPforQuant(cu);
  
-    if (cu.m_predMode[0] == MODE_INTRA)
+    Yuv& fencYuv = m_modeDepth[cuGeom.depth].fencYuv;
+    if (cuGeom.depth)
+        m_modeDepth[0].fencYuv.copyPartToYuv(fencYuv, absPartIdx);
+    X265_CHECK(bestMode->fencYuv == &fencYuv, "invalid fencYuv\n");
+
+    if (cu.isIntra(0))
      {
          uint32_t tuDepthRange[2];
          cu.getIntraTUQtDepthRange(tuDepthRange, 0);
  
-        uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN;
-        residualTransformQuantIntra(*bestMode, cuGeom, initTrDepth, 0, tuDepthRange);
+        uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
+        residualTransformQuantIntra(*bestMode, cuGeom, initTuDepth, 0, tuDepthRange);
          getBestIntraModeChroma(*bestMode, cuGeom);
          residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
      }
-    else if (cu.m_predMode[0] == MODE_INTER)
+    else // if (cu.isInter(0))
      {
-        X265_CHECK(!ctu.m_skipFlag[absPartIdx], "skip not expected prior to transform\n");
+        X265_CHECK(!ctu.isSkipped(absPartIdx), "skip not expected prior to transform\n");
  
          /* Calculate residual for current CU part into depth sized resiYuv */
  
@@ -1637,16 +1722,16 @@ void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
          pixel* predV = predYuv.getCrAddr(absPartIdx);
  
          primitives.luma_sub_ps[sizeIdx](resiYuv.m_buf[0], resiYuv.m_size,
-                                        fencYuv.getLumaAddr(absPartIdx), predY,
+                                        fencYuv.m_buf[0], predY,
                                          fencYuv.m_size, predYuv.m_size);
  
          primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[1], resiYuv.m_csize,
-                                        fencYuv.getCbAddr(absPartIdx), predU,
-                                        fencYuv.m_csize, predYuv.m_csize);
+                                                 fencYuv.m_buf[1], predU,
+                                                 fencYuv.m_csize, predYuv.m_csize);
  
          primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[2], resiYuv.m_csize,
-                                        fencYuv.getCrAddr(absPartIdx), predV,
-                                        fencYuv.m_csize, predYuv.m_csize);
+                                                 fencYuv.m_buf[2], predV,
+                                                 fencYuv.m_csize, predYuv.m_csize);
  
          uint32_t tuDepthRange[2];
          cu.getInterTUQtDepthRange(tuDepthRange, 0);
@@ -1654,57 +1739,38 @@ void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
          residualTransformQuantInter(*bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange);
  
          if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
-            cu.setSkipFlagSubParts(true);
+            cu.setPredModeSubParts(MODE_SKIP);
  
-        PicYuv& reconPicYuv = *m_frame->m_reconPicYuv;
-        if (cu.getQtRootCbf(0)) // TODO: split to each component
-        {
-            /* residualTransformQuantInter() wrote transformed residual back into
-             * resiYuv. Generate the recon pixels by adding it to the prediction */
+        /* residualTransformQuantInter() wrote transformed residual back into
+         * resiYuv. Generate the recon pixels by adding it to the prediction */
  
-            primitives.luma_add_ps[sizeIdx](reconYuv.m_buf[0], reconYuv.m_size,
+        PicYuv& reconPic = *m_frame->m_reconPic;
+        if (cu.m_cbf[0][0])
+            primitives.luma_add_ps[sizeIdx](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
                                              predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
-            primitives.chroma[m_csp].add_ps[sizeIdx](reconYuv.m_buf[1], reconYuv.m_csize,
-                                            predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
-            primitives.chroma[m_csp].add_ps[sizeIdx](reconYuv.m_buf[2], reconYuv.m_csize,
-                                            predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
-
-            /* copy the reconstructed part to the recon pic for later intra
-             * predictions */
-            reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cu.m_cuAddr, absPartIdx);
-        }
          else
-        {
-            /* copy the prediction pixels to the recon pic for later intra
-             * predictions */
-
-            primitives.luma_copy_pp[sizeIdx](reconPicYuv.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_stride,
+            primitives.luma_copy_pp[sizeIdx](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
                                               predY, predYuv.m_size);
-            primitives.chroma[m_csp].copy_pp[sizeIdx](reconPicYuv.getCbAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_strideC,
+
+        if (cu.m_cbf[1][0])
+            primitives.chroma[m_csp].add_ps[sizeIdx](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
+                                                     predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
+        else
+            primitives.chroma[m_csp].copy_pp[sizeIdx](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                        predU, predYuv.m_csize);
-            primitives.chroma[m_csp].copy_pp[sizeIdx](reconPicYuv.getCrAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_strideC,
+
+        if (cu.m_cbf[2][0])
+            primitives.chroma[m_csp].add_ps[sizeIdx](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
+                                                     predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
+        else
+            primitives.chroma[m_csp].copy_pp[sizeIdx](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                        predV, predYuv.m_csize);
-        }
      }
-    /* else if (cu.m_predMode[0] == MODE_NONE) {} */
  
      checkDQP(cu, cuGeom);
      cu.updatePic(cuGeom.depth);
  }
  
-/* check whether current try is the best with identifying the depth of current try */
-void Analysis::checkBestMode(Mode& mode, uint32_t depth)
-{
-    ModeDepth& md = m_modeDepth[depth];
-    if (md.bestMode)
-    {
-        if (mode.rdCost < md.bestMode->rdCost)
-            md.bestMode = &mode;
-    }
-    else
-        md.bestMode = &mode;
-}
-
  void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth)
  {
      if (m_param->rdLevel >= 3)
@@ -1817,7 +1883,7 @@ bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom
       * each quantity */
  
      uint32_t depth = cuGeom.depth;
-    FrameData& curEncData = const_cast<FrameData&>(*m_frame->m_encData);
+    FrameData& curEncData = *m_frame->m_encData;
      FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
      uint64_t cuCost = cuStat.avgCost[depth] * cuStat.count[depth];
      uint64_t cuCount = cuStat.count[depth];
@@ -1855,7 +1921,7 @@ bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom
      }
  
      // give 60% weight to all CU's and 40% weight to neighbour CU's
-    if (neighCost + cuCount)
+    if (neighCount + cuCount)
      {
          uint64_t avgCost = ((3 * cuCost) + (2 * neighCost)) / ((3 * cuCount) + (2 * neighCount));
          uint64_t curCost = m_param->rdLevel > 1 ? bestMode.rdCost : bestMode.sa8dCost;