/* Called by API thread */
void Lookahead::addPicture(Frame *curFrame, int sliceType)
{
- PicYuv *orig = curFrame->m_origPicYuv;
+ PicYuv *orig = curFrame->m_fencPic;
curFrame->m_lowres.init(orig, curFrame->m_poc, sliceType);
/* Called by pool worker threads */
bool Lookahead::findJob(int)
{
- if (m_bReady && ATOMIC_CAS32(&m_bReady, 1, 0) == 1)
+ if (m_bReady > 0 && ATOMIC_DEC(&m_bReady) == 0)
{
m_inputQueueLock.acquire();
slicetypeDecide();
/* called by API thread or worker thread with inputQueueLock acquired */
void Lookahead::slicetypeDecide()
{
+ ProfileScopeEvent(slicetypeDecideEV);
+
ScopedLock lock(m_decideLock);
Lowres *frames[X265_LOOKAHEAD_MAX];
list[bframes / 2]->m_lowres.sliceType = X265_TYPE_BREF;
brefs++;
}
-
/* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */
if (m_param->rc.rateControlMode != X265_RC_CQP)
{
void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe)
{
int prevNonB = 0, curNonB = 1, idx = 0;
- bool isNextNonB = false;
-
while (curNonB < numFrames && frames[curNonB]->sliceType == X265_TYPE_B)
curNonB++;
-
int nextNonB = keyframe ? prevNonB : curNonB;
- int nextB = keyframe ? prevNonB + 1 : curNonB + 1;
-
+ int nextB = prevNonB + 1;
+ int nextBRef = 0;
+ int miniGopEnd = keyframe ? prevNonB : curNonB;
while (curNonB < numFrames + !keyframe)
{
/* P/I cost: This shouldn't include the cost of nextNonB */
int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB;
frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, p0, curNonB, curNonB);
frames[nextNonB]->plannedType[idx] = frames[curNonB]->sliceType;
+ /* Copy the P/I cost and type just planned for curNonB into each B frame of the current miniGop */
+ if (curNonB > miniGopEnd)
+ {
+ for (int j = nextB; j < miniGopEnd; j++)
+ {
+ frames[j]->plannedSatd[frames[j]->indB] = frames[nextNonB]->plannedSatd[idx];
+ frames[j]->plannedType[frames[j]->indB++] = frames[nextNonB]->plannedType[idx];
+
+ }
+ }
idx++;
}
/* Handle the B-frames: coded order */
- for (int i = prevNonB + 1; i < curNonB; i++, idx++)
- {
- frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, prevNonB, curNonB, i);
- frames[nextNonB]->plannedType[idx] = X265_TYPE_B;
- }
+ if (m_param->bBPyramid && curNonB - prevNonB > 1)
+ nextBRef = (prevNonB + curNonB + 1) / 2;
- for (int i = nextB; i <= curNonB; i++)
+ for (int i = prevNonB + 1; i < curNonB; i++, idx++)
{
- for (int j = frames[i]->indB + i + 1; j <= curNonB; j++, frames[i]->indB++)
+ int64_t satdCost = 0; int type = X265_TYPE_B;
+ if (nextBRef)
{
- if (j == curNonB)
+ if (i == nextBRef)
{
- if (isNextNonB)
- {
- int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB;
- frames[i]->plannedSatd[frames[i]->indB] = vbvFrameCost(frames, p0, curNonB, curNonB);
- frames[i]->plannedType[frames[i]->indB] = frames[curNonB]->sliceType;
- }
+ satdCost = vbvFrameCost(frames, prevNonB, curNonB, nextBRef);
+ type = X265_TYPE_BREF;
}
+ else if (i < nextBRef)
+ satdCost = vbvFrameCost(frames, prevNonB, nextBRef, i);
else
- {
- frames[i]->plannedSatd[frames[i]->indB] = vbvFrameCost(frames, prevNonB, curNonB, j);
- frames[i]->plannedType[frames[i]->indB] = X265_TYPE_B;
- }
+ satdCost = vbvFrameCost(frames, nextBRef, curNonB, i);
}
- if (i == curNonB && !isNextNonB)
- isNextNonB = true;
- }
+ else
+ satdCost = vbvFrameCost(frames, prevNonB, nextNonB, i);
+ frames[nextNonB]->plannedSatd[idx] = satdCost;
+ frames[nextNonB]->plannedType[idx] = type;
+ /* Save the cost of B frame i in the current miniGop's B frames that precede it (and in the B-ref); skipped when i is the B-ref */
+ for (int j = nextB; j < miniGopEnd; j++)
+ {
+ if (nextBRef && i == nextBRef)
+ break;
+ if (j >= i && j !=nextBRef)
+ continue;
+ frames[j]->plannedSatd[frames[j]->indB] = satdCost;
+ frames[j]->plannedType[frames[j]->indB++] = X265_TYPE_B;
+ }
+ }
prevNonB = curNonB;
curNonB++;
while (curNonB <= numFrames && frames[curNonB]->sliceType == X265_TYPE_B)
if (m_param->bEnableWeightedPred)
{
- PicYuv *orig = curFrame->m_origPicYuv;
+ PicYuv *orig = curFrame->m_fencPic;
m_paddedLines = curFrame->m_lowres.lines + 2 * orig->m_lumaMarginY;
intptr_t padoffset = curFrame->m_lowres.lumaStride * orig->m_lumaMarginY + orig->m_lumaMarginX;
m_weightedRef.lowresPlane[i] = m_wbuffer[i] + padoffset;
}
- m_weightedRef.fpelPlane = m_weightedRef.lowresPlane[0];
+ m_weightedRef.fpelPlane[0] = m_weightedRef.lowresPlane[0];
m_weightedRef.lumaStride = curFrame->m_lowres.lumaStride;
m_weightedRef.isLowres = true;
m_weightedRef.isWeighted = false;
for (int i = 0; i < m_heightInCU; i++)
{
m_rows[i].init();
- m_rows[i].m_me.setSourcePlane(fenc->lowresPlane[0], fenc->lumaStride);
if (!fenc->bIntraCalculated)
fenc->rowSatds[0][0][i] = 0;
fenc->rowSatds[b - p0][p1 - b][i] = 0;
{
Lowres *fenc = frames[b];
Lowres *ref = frames[p0];
- pixel *src = ref->fpelPlane;
+ pixel *src = ref->fpelPlane[0];
intptr_t stride = fenc->lumaStride;
if (wp)
primitives.weight_pp(ref->buffer[0], m_wbuffer[0], stride, widthHeight, m_paddedLines,
scale, round << correction, denom + correction, offset);
- src = m_weightedRef.fpelPlane;
+ src = m_weightedRef.fpelPlane[0];
}
uint32_t cost = 0;
{
for (int x = 0; x < fenc->width; x += 8, mb++, pixoff += 8)
{
- int satd = primitives.satd[LUMA_8x8](src + pixoff, stride, fenc->fpelPlane + pixoff, stride);
+ int satd = primitives.satd[LUMA_8x8](src + pixoff, stride, fenc->fpelPlane[0] + pixoff, stride);
cost += X265_MIN(satd, fenc->intraCost[mb]);
}
}
void CostEstimate::processRow(int row, int /*threadId*/)
{
+ ProfileScopeEvent(costEstimateRow);
+
int realrow = m_heightInCU - 1 - row;
Lowres **frames = m_curframes;
ReferencePlanes *wfref0 = m_weightedRef.isWeighted ? &m_weightedRef : frames[m_curp0];
const bool bFrameScoreCU = (cux > 0 && cux < m_widthInCU - 1 &&
cuy > 0 && cuy < m_heightInCU - 1) || m_widthInCU <= 2 || m_heightInCU <= 2;
- m_me.setSourcePU(pelOffset, cuSize, cuSize);
+ m_me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize);
/* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
int lowresPenalty = 4;
}
if (bBidir)
{
- pixel subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE], subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE];
+ ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
+ ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;
pixel *src0 = wfref0->lowresMC(pelOffset, *fenc_mvs[0], subpelbuf0, stride0);
pixel *src1 = fref1->lowresMC(pelOffset, *fenc_mvs[1], subpelbuf1, stride1);
- pixel ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE];
+ ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
primitives.pixelavg_pp[LUMA_8x8](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
int bicost = primitives.satd[LUMA_8x8](fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE);
COPY2_IF_LT(bcost, bicost, listused, 3);
// Copy Left
for (int i = 0; i < cuSize + 1; i++)
- {
left0[i] = pix_cur[-1 - fenc->lumaStride + i * fenc->lumaStride];
- }
for (int i = 0; i < cuSize; i++)
{
// generate 35 intra predictions into m_predictions
pixelcmp_t satd = primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)];
- int icost = m_me.COST_MAX, cost;
+ int icost = m_me.COST_MAX;
primitives.intra_pred[DC_IDX][sizeIdx](m_predictions, cuSize, left0, above0, 0, (cuSize <= 16));
- cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
+ int cost = m_me.bufSATD(m_predictions, cuSize);
if (cost < icost)
icost = cost;
pixel *above = (cuSize >= 8) ? above1 : above0;
pixel *left = (cuSize >= 8) ? left1 : left0;
primitives.intra_pred[PLANAR_IDX][sizeIdx](m_predictions, cuSize, left, above, 0, 0);
- cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
+ cost = m_me.bufSATD(m_predictions, cuSize);
if (cost < icost)
icost = cost;
primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 * predsize, above0, left0, above1, left1, (cuSize <= 16));
// calculate satd costs, keep least cost
ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
- primitives.transpose[sizeIdx](buf_trans, m_me.fenc, FENC_STRIDE);
+ primitives.transpose[sizeIdx](buf_trans, m_me.fencPUYuv.m_buf[0], FENC_STRIDE);
int acost = m_me.COST_MAX;
uint32_t mode, lowmode = 4;
if (mode < 18)
cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
else
- cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize);
+ cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
COPY2_IF_LT(acost, cost, lowmode, mode);
}
for (uint32_t dist = 2; dist >= 1; dist--)
if (mode < 18)
cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
else
- cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize);
+ cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
COPY2_IF_LT(acost, cost, lowmode, mode);
mode = lowmode + dist;
if (mode < 18)
cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
else
- cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize);
+ cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
COPY2_IF_LT(acost, cost, lowmode, mode);
}
if (acost < icost)
const int intraPenalty = 5 * m_lookAheadLambda;
icost += intraPenalty + lowresPenalty; /* estimate intra signal cost */
fenc->intraCost[cuXY] = icost;
+ fenc->intraMode[cuXY] = (uint8_t)lowmode;
int icostAq = icost;
if (bFrameScoreCU)
{