Fix cut&paste typo in changelog.

[deb_x265.git] / source / encoder / slicetype.cpp
diff --git a/source/encoder/slicetype.cpp b/source/encoder/slicetype.cpp

index cc70c20658d9df0bfe2fa93c52f26f6c868f98e7..a792760a05fc7ee6f453e93038a2a036e1f4e80e 100644 (file)
--- a/source/encoder/slicetype.cpp
+++ b/source/encoder/slicetype.cpp
@@ -111,7 +111,7 @@ void Lookahead::destroy()
  /* Called by API thread */
  void Lookahead::addPicture(Frame *curFrame, int sliceType)
  {
-    PicYuv *orig = curFrame->m_origPicYuv;
+    PicYuv *orig = curFrame->m_fencPic;
  
      curFrame->m_lowres.init(orig, curFrame->m_poc, sliceType);
  
@@ -192,7 +192,7 @@ Frame* Lookahead::getDecidedPicture()
  /* Called by pool worker threads */
  bool Lookahead::findJob(int)
  {
-    if (m_bReady && ATOMIC_CAS32(&m_bReady, 1, 0) == 1)
+    if (m_bReady > 0 && ATOMIC_DEC(&m_bReady) == 0)
      {
          m_inputQueueLock.acquire();
          slicetypeDecide();
@@ -290,6 +290,8 @@ void Lookahead::getEstimatedPictureCost(Frame *curFrame)
  /* called by API thread or worker thread with inputQueueLock acquired */
  void Lookahead::slicetypeDecide()
  {
+    ProfileScopeEvent(slicetypeDecideEV);
+
      ScopedLock lock(m_decideLock);
  
      Lowres *frames[X265_LOOKAHEAD_MAX];
@@ -417,7 +419,6 @@ void Lookahead::slicetypeDecide()
          list[bframes / 2]->m_lowres.sliceType = X265_TYPE_BREF;
          brefs++;
      }
-
      /* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */
      if (m_param->rc.rateControlMode != X265_RC_CQP)
      {
@@ -524,14 +525,12 @@ void Lookahead::slicetypeDecide()
  void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe)
  {
      int prevNonB = 0, curNonB = 1, idx = 0;
-    bool isNextNonB = false;
-
      while (curNonB < numFrames && frames[curNonB]->sliceType == X265_TYPE_B)
          curNonB++;
-
      int nextNonB = keyframe ? prevNonB : curNonB;
-    int nextB = keyframe ? prevNonB + 1 : curNonB + 1;
-
+    int nextB = prevNonB + 1;
+    int nextBRef = 0;
+    int miniGopEnd = keyframe ? prevNonB : curNonB;
      while (curNonB < numFrames + !keyframe)
      {
          /* P/I cost: This shouldn't include the cost of nextNonB */
@@ -540,38 +539,53 @@ void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe)
              int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB;
              frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, p0, curNonB, curNonB);
              frames[nextNonB]->plannedType[idx] = frames[curNonB]->sliceType;
+            /* Save the nextNonB Cost in each B frame of the current miniGop */
+            if (curNonB > miniGopEnd)
+            {
+                for (int j = nextB; j < miniGopEnd; j++)
+                {
+                    frames[j]->plannedSatd[frames[j]->indB] = frames[nextNonB]->plannedSatd[idx];
+                    frames[j]->plannedType[frames[j]->indB++] = frames[nextNonB]->plannedType[idx];
+                
+                }
+            }
              idx++;
          }
          /* Handle the B-frames: coded order */
-        for (int i = prevNonB + 1; i < curNonB; i++, idx++)
-        {
-            frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, prevNonB, curNonB, i);
-            frames[nextNonB]->plannedType[idx] = X265_TYPE_B;
-        }
+        if (m_param->bBPyramid && curNonB - prevNonB > 1)
+            nextBRef = (prevNonB + curNonB + 1) / 2;
  
-        for (int i = nextB; i <= curNonB; i++)
+        for (int i = prevNonB + 1; i < curNonB; i++, idx++)
          {
-            for (int j = frames[i]->indB + i + 1; j <= curNonB; j++, frames[i]->indB++)
+            int64_t satdCost = 0; int type = X265_TYPE_B;
+            if (nextBRef)
              {
-                if (j == curNonB)
+                if (i == nextBRef)
                  {
-                    if (isNextNonB)
-                    {
-                        int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB;
-                        frames[i]->plannedSatd[frames[i]->indB] = vbvFrameCost(frames, p0, curNonB, curNonB);
-                        frames[i]->plannedType[frames[i]->indB] = frames[curNonB]->sliceType;
-                    }
+                    satdCost = vbvFrameCost(frames, prevNonB, curNonB, nextBRef);
+                    type = X265_TYPE_BREF;
                  }
+                else if (i < nextBRef)
+                    satdCost = vbvFrameCost(frames, prevNonB, nextBRef, i);
                  else
-                {
-                    frames[i]->plannedSatd[frames[i]->indB] = vbvFrameCost(frames, prevNonB, curNonB, j);
-                    frames[i]->plannedType[frames[i]->indB] = X265_TYPE_B;
-                }
+                    satdCost = vbvFrameCost(frames, nextBRef, curNonB, i);
              }
-            if (i == curNonB && !isNextNonB)
-                isNextNonB = true;
-        }
+            else
+                satdCost = vbvFrameCost(frames, prevNonB, nextNonB, i);
+            frames[nextNonB]->plannedSatd[idx] = satdCost;
+            frames[nextNonB]->plannedType[idx] = type;
+            /* Save the nextB Cost in each B frame of the current miniGop */
  
+            for (int j = nextB; j < miniGopEnd; j++)
+            {
+                if (nextBRef && i == nextBRef)
+                    break;
+                if (j >= i && j !=nextBRef)
+                    continue;
+                frames[j]->plannedSatd[frames[j]->indB] = satdCost;
+                frames[j]->plannedType[frames[j]->indB++] = X265_TYPE_B;
+            }
+        }
          prevNonB = curNonB;
          curNonB++;
          while (curNonB <= numFrames && frames[curNonB]->sliceType == X265_TYPE_B)
@@ -1238,7 +1252,7 @@ void CostEstimate::init(x265_param *_param, Frame *curFrame)
  
      if (m_param->bEnableWeightedPred)
      {
-        PicYuv *orig = curFrame->m_origPicYuv;
+        PicYuv *orig = curFrame->m_fencPic;
          m_paddedLines = curFrame->m_lowres.lines + 2 * orig->m_lumaMarginY;
          intptr_t padoffset = curFrame->m_lowres.lumaStride * orig->m_lumaMarginY + orig->m_lumaMarginX;
  
@@ -1249,7 +1263,7 @@ void CostEstimate::init(x265_param *_param, Frame *curFrame)
              m_weightedRef.lowresPlane[i] = m_wbuffer[i] + padoffset;
          }
  
-        m_weightedRef.fpelPlane = m_weightedRef.lowresPlane[0];
+        m_weightedRef.fpelPlane[0] = m_weightedRef.lowresPlane[0];
          m_weightedRef.lumaStride = curFrame->m_lowres.lumaStride;
          m_weightedRef.isLowres = true;
          m_weightedRef.isWeighted = false;
@@ -1290,7 +1304,6 @@ int64_t CostEstimate::estimateFrameCost(Lowres **frames, int p0, int p1, int b,
          for (int i = 0; i < m_heightInCU; i++)
          {
              m_rows[i].init();
-            m_rows[i].m_me.setSourcePlane(fenc->lowresPlane[0], fenc->lumaStride);
              if (!fenc->bIntraCalculated)
                  fenc->rowSatds[0][0][i] = 0;
              fenc->rowSatds[b - p0][p1 - b][i] = 0;
@@ -1351,7 +1364,7 @@ uint32_t CostEstimate::weightCostLuma(Lowres **frames, int b, int p0, WeightPara
  {
      Lowres *fenc = frames[b];
      Lowres *ref  = frames[p0];
-    pixel *src = ref->fpelPlane;
+    pixel *src = ref->fpelPlane[0];
      intptr_t stride = fenc->lumaStride;
  
      if (wp)
@@ -1365,7 +1378,7 @@ uint32_t CostEstimate::weightCostLuma(Lowres **frames, int b, int p0, WeightPara
  
          primitives.weight_pp(ref->buffer[0], m_wbuffer[0], stride, widthHeight, m_paddedLines,
                               scale, round << correction, denom + correction, offset);
-        src = m_weightedRef.fpelPlane;
+        src = m_weightedRef.fpelPlane[0];
      }
  
      uint32_t cost = 0;
@@ -1376,7 +1389,7 @@ uint32_t CostEstimate::weightCostLuma(Lowres **frames, int b, int p0, WeightPara
      {
          for (int x = 0; x < fenc->width; x += 8, mb++, pixoff += 8)
          {
-            int satd = primitives.satd[LUMA_8x8](src + pixoff, stride, fenc->fpelPlane + pixoff, stride);
+            int satd = primitives.satd[LUMA_8x8](src + pixoff, stride, fenc->fpelPlane[0] + pixoff, stride);
              cost += X265_MIN(satd, fenc->intraCost[mb]);
          }
      }
@@ -1469,6 +1482,8 @@ void CostEstimate::weightsAnalyse(Lowres **frames, int b, int p0)
  
  void CostEstimate::processRow(int row, int /*threadId*/)
  {
+    ProfileScopeEvent(costEstimateRow);
+
      int realrow = m_heightInCU - 1 - row;
      Lowres **frames = m_curframes;
      ReferencePlanes *wfref0 = m_weightedRef.isWeighted ? &m_weightedRef : frames[m_curp0];
@@ -1531,7 +1546,7 @@ void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int c
      const bool bFrameScoreCU = (cux > 0 && cux < m_widthInCU - 1 &&
                                  cuy > 0 && cuy < m_heightInCU - 1) || m_widthInCU <= 2 || m_heightInCU <= 2;
  
-    m_me.setSourcePU(pelOffset, cuSize, cuSize);
+    m_me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize);
  
      /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
      int lowresPenalty = 4;
@@ -1592,12 +1607,13 @@ void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int c
          }
          if (bBidir)
          {
-            pixel subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE], subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE];
+            ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
+            ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
              intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;
              pixel *src0 = wfref0->lowresMC(pelOffset, *fenc_mvs[0], subpelbuf0, stride0);
              pixel *src1 = fref1->lowresMC(pelOffset, *fenc_mvs[1], subpelbuf1, stride1);
  
-            pixel ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE];
+            ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
              primitives.pixelavg_pp[LUMA_8x8](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
              int bicost = primitives.satd[LUMA_8x8](fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE);
              COPY2_IF_LT(bcost, bicost, listused, 3);
@@ -1626,9 +1642,7 @@ void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int c
  
          // Copy Left
          for (int i = 0; i < cuSize + 1; i++)
-        {
              left0[i] = pix_cur[-1 - fenc->lumaStride + i * fenc->lumaStride];
-        }
  
          for (int i = 0; i < cuSize; i++)
          {
@@ -1652,22 +1666,22 @@ void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int c
  
          // generate 35 intra predictions into m_predictions
          pixelcmp_t satd = primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)];
-        int icost = m_me.COST_MAX, cost;
+        int icost = m_me.COST_MAX;
          primitives.intra_pred[DC_IDX][sizeIdx](m_predictions, cuSize, left0, above0, 0, (cuSize <= 16));
-        cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
+        int cost = m_me.bufSATD(m_predictions, cuSize);
          if (cost < icost)
              icost = cost;
          pixel *above = (cuSize >= 8) ? above1 : above0;
          pixel *left  = (cuSize >= 8) ? left1 : left0;
          primitives.intra_pred[PLANAR_IDX][sizeIdx](m_predictions, cuSize, left, above, 0, 0);
-        cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
+        cost = m_me.bufSATD(m_predictions, cuSize);
          if (cost < icost)
              icost = cost;
          primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 * predsize, above0, left0, above1, left1, (cuSize <= 16));
  
          // calculate satd costs, keep least cost
          ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
-        primitives.transpose[sizeIdx](buf_trans, m_me.fenc, FENC_STRIDE);
+        primitives.transpose[sizeIdx](buf_trans, m_me.fencPUYuv.m_buf[0], FENC_STRIDE);
  
          int acost = m_me.COST_MAX;
          uint32_t mode, lowmode = 4;
@@ -1676,7 +1690,7 @@ void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int c
              if (mode < 18)
                  cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
              else
-                cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize);
+                cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
              COPY2_IF_LT(acost, cost, lowmode, mode);
          }
          for (uint32_t dist = 2; dist >= 1; dist--)
@@ -1685,14 +1699,14 @@ void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int c
              if (mode < 18)
                  cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
              else
-                cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize);
+                cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
              COPY2_IF_LT(acost, cost, lowmode, mode);
  
              mode = lowmode + dist;
              if (mode < 18)
                  cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
              else
-                cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize);
+                cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
              COPY2_IF_LT(acost, cost, lowmode, mode);
          }
          if (acost < icost)
@@ -1701,6 +1715,7 @@ void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int c
          const int intraPenalty = 5 * m_lookAheadLambda;
          icost += intraPenalty + lowresPenalty; /* estimate intra signal cost */
          fenc->intraCost[cuXY] = icost;
+        fenc->intraMode[cuXY] = (uint8_t)lowmode;
          int icostAq = icost;
          if (bFrameScoreCU)
          {