#pragma warning(disable: 4244) // '=' : conversion from 'int' to 'uint8_t', possible loss of data)
#endif
+#define MVP_IDX_BITS 1
+
ALIGN_VAR_32(const pixel, Search::zeroPixel[MAX_CU_SIZE]) = { 0 };
ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 };
m_numLayers = g_log2Size[param.maxCUSize] - 2;
m_rdCost.setPsyRdScale(param.psyRd);
- m_me.setSearchMethod(param.searchMethod);
- m_me.setSubpelRefine(param.subpelRefine);
+ m_me.init(param.searchMethod, param.subpelRefine, param.internalCsp);
bool ok = m_quant.init(m_bEnableRDOQ, param.psyRdoq, scalingList, m_entropyCoder);
- if (m_param->noiseReduction)
+ if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
ok &= m_quant.allocNoiseReduction(param);
ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */
void Search::invalidateContexts(int) {}
#endif
-void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height)
+void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx)
{
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
- uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
- uint32_t subdiv = tuDepthL > trDepth;
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
+ uint32_t subdiv = tuDepth < cu.m_tuDepth[absPartIdx];
uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- bool mCodeAll = true;
- const uint32_t numPels = 1 << (log2TrSize * 2 - m_hChromaShift - m_vChromaShift);
- if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE))
- mCodeAll = false;
-
- if (mCodeAll)
+ if (!(log2TrSize - m_hChromaShift < 2))
{
- if (!trDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepth - 1))
- m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, (width >> m_hChromaShift), (height >> m_vChromaShift), TEXT_CHROMA_U, trDepth, !subdiv);
-
- if (!trDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepth - 1))
- m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, (width >> m_hChromaShift), (height >> m_vChromaShift), TEXT_CHROMA_V, trDepth, !subdiv);
+ if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
+ m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv);
+ if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
+ m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv);
}
if (subdiv)
{
- absPartIdxStep >>= 2;
- width >>= 1;
- height >>= 1;
-
- uint32_t qtPartNum = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
- for (uint32_t part = 0; part < 4; part++)
- codeSubdivCbfQTChroma(cu, trDepth + 1, absPartIdx + part * qtPartNum, absPartIdxStep, width, height);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ codeSubdivCbfQTChroma(cu, tuDepth + 1, absPartIdx);
}
}
-void Search::codeCoeffQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, TextType ttype)
+void Search::codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype)
{
- if (!cu.getCbf(absPartIdx, ttype, trDepth))
+ if (!cu.getCbf(absPartIdx, ttype, tuDepth))
return;
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
- uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
+ uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- if (tuDepthL > trDepth)
+ if (tuDepth < cu.m_tuDepth[absPartIdx])
{
- uint32_t qtPartNum = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
- for (uint32_t part = 0; part < 4; part++)
- codeCoeffQTChroma(cu, trDepth + 1, absPartIdx + part * qtPartNum, ttype);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ codeCoeffQTChroma(cu, tuDepth + 1, absPartIdx, ttype);
return;
}
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
-
- uint32_t trDepthC = trDepth;
+ uint32_t tuDepthC = tuDepth;
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-
- if (log2TrSizeC == 1)
- {
- X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth, "transform size too small\n");
- trDepthC--;
- log2TrSizeC++;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
- bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
- if (!bFirstQ)
+
+ if (log2TrSizeC < 2)
+ {
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ if (absPartIdx & 3)
return;
+ log2TrSizeC = 2;
+ tuDepthC--;
}
uint32_t qtLayer = log2TrSize - 2;
uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - 1);
coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
uint32_t subTUSize = 1 << (log2TrSizeC * 2);
- uint32_t partIdxesPerSubTU = NUM_CU_PARTITIONS >> (((cu.m_cuDepth[absPartIdx] + trDepthC) << 1) + 1);
- if (cu.getCbf(absPartIdx, ttype, trDepth + 1))
+ uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);
+ if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
- if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, trDepth + 1))
- m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, ttype);
+ if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
+ m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, ttype);
}
}
-void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, uint32_t depthRange[2])
+void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2])
{
- uint32_t fullDepth = mode.cu.m_cuDepth[0] + trDepth;
+ uint32_t fullDepth = mode.cu.m_cuDepth[0] + tuDepth;
uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
uint32_t qtLayer = log2TrSize - 2;
uint32_t sizeIdx = log2TrSize - 2;
if (mightSplit)
m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
- pixel* fenc = const_cast<pixel*>(mode.fencYuv->getLumaAddr(absPartIdx));
+ const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
uint32_t stride = mode.fencYuv->m_size;
// init availability pattern
uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
- initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode);
+ initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);
// get prediction signal
predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
- cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
+ cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
}
else
// no coded residual, recon = pred
- primitives.square_copy_pp[sizeIdx](reconQt, reconQtStride, pred, stride);
+ primitives.luma_copy_pp[sizeIdx](reconQt, reconQtStride, pred, stride);
- bCBF = !!numSig << trDepth;
+ bCBF = !!numSig << tuDepth;
cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
fullCost.distortion = primitives.sse_pp[sizeIdx](reconQt, reconQtStride, fenc, stride);
}
else
{
- uint32_t qtNumParts = cuGeom.numPartitions >> 2;
- if (!trDepth)
+ uint32_t qNumParts = cuGeom.numPartitions >> 2;
+ if (!tuDepth)
{
- for (uint32_t part = 0; part < 4; part++)
- m_entropyCoder.codeIntraDirLumaAng(cu, part * qtNumParts, false);
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
+ m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
}
- else if (!(absPartIdx & (qtNumParts - 1)))
+ else if (!(absPartIdx & (qNumParts - 1)))
m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
}
if (log2TrSize != depthRange[0])
m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
- m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]);
+ m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);
- if (cu.getCbf(absPartIdx, TEXT_LUMA, trDepth))
+ if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
m_entropyCoder.codeCoeffNxN(cu, coeffY, absPartIdx, log2TrSize, TEXT_LUMA);
fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
}
// code split block
- uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
- uint32_t absPartIdxSub = absPartIdx;
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
if (m_param->bEnableTSkipFast)
- checkTransformSkip &= cu.m_partSize[absPartIdx] == SIZE_NxN;
+ checkTransformSkip &= cu.m_partSize[0] != SIZE_2Nx2N;
Cost splitCost;
uint32_t cbf = 0;
- for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv)
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
if (checkTransformSkip)
- codeIntraLumaTSkip(mode, cuGeom, trDepth + 1, absPartIdxSub, splitCost);
+ codeIntraLumaTSkip(mode, cuGeom, tuDepth + 1, qPartIdx, splitCost);
else
- codeIntraLumaQT(mode, cuGeom, trDepth + 1, absPartIdxSub, bAllowSplit, splitCost, depthRange);
+ codeIntraLumaQT(mode, cuGeom, tuDepth + 1, qPartIdx, bAllowSplit, splitCost, depthRange);
- cbf |= cu.getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1);
+ cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
}
- for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
- cu.m_cbf[0][absPartIdx + offs] |= (cbf << trDepth);
+ for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
+ cu.m_cbf[0][absPartIdx + offs] |= (cbf << tuDepth);
if (mightNotSplit && log2TrSize != depthRange[0])
{
m_entropyCoder.load(m_rqt[fullDepth].rqtTest);
// recover transform index and Cbf values
- cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
+ cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
}
}
// set reconstruction for next intra prediction blocks if full TU prediction won
- pixel* picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
- intptr_t picStride = m_frame->m_reconPicYuv->m_stride;
- primitives.square_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);
+ pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+ intptr_t picStride = m_frame->m_reconPic->m_stride;
+ primitives.luma_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);
outCost.rdcost += fullCost.rdcost;
outCost.distortion += fullCost.distortion;
outCost.energy += fullCost.energy;
}
-void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, Cost& outCost)
+void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
{
- uint32_t fullDepth = mode.cu.m_cuDepth[0] + trDepth;
+ uint32_t fullDepth = mode.cu.m_cuDepth[0] + tuDepth;
uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
uint32_t tuSize = 1 << log2TrSize;
int bTSkip = 0;
uint32_t bCBF = 0;
- pixel* fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
+ const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
pixel* pred = predYuv->getLumaAddr(absPartIdx);
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
uint32_t stride = fencYuv->m_size;
// init availability pattern
uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
- initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode);
+ initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);
// get prediction signal
predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
- cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
+ cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
uint32_t qtLayer = log2TrSize - 2;
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
}
else
// no residual coded, recon = pred
- primitives.square_copy_pp[sizeIdx](tmpRecon, tmpReconStride, pred, stride);
+ primitives.luma_copy_pp[sizeIdx](tmpRecon, tmpReconStride, pred, stride);
uint32_t tmpDist = primitives.sse_pp[sizeIdx](tmpRecon, tmpReconStride, fenc, stride);
cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
- cu.setCbfSubParts((!!numSig) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
+ cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
if (useTSkip)
m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
}
else
{
- uint32_t qtNumParts = cuGeom.numPartitions >> 2;
- if (!trDepth)
+ uint32_t qNumParts = cuGeom.numPartitions >> 2;
+ if (!tuDepth)
{
- for (uint32_t part = 0; part < 4; part++)
- m_entropyCoder.codeIntraDirLumaAng(cu, part * qtNumParts, false);
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
+ m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
}
- else if (!(absPartIdx & (qtNumParts - 1)))
+ else if (!(absPartIdx & (qNumParts - 1)))
m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
}
m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
- m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]);
+ m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);
- if (cu.getCbf(absPartIdx, TEXT_LUMA, trDepth))
+ if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize, TEXT_LUMA);
uint32_t tmpBits = m_entropyCoder.getNumberOfWrittenBits();
if (bTSkip)
{
memcpy(coeffY, tsCoeffY, sizeof(coeff_t) << (log2TrSize * 2));
- primitives.square_copy_pp[sizeIdx](reconQt, reconQtStride, tsReconY, tuSize);
+ primitives.luma_copy_pp[sizeIdx](reconQt, reconQtStride, tsReconY, tuSize);
}
else if (checkTransformSkip)
{
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
- cu.setCbfSubParts(bCBF << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
+ cu.setCbfSubParts(bCBF << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
m_entropyCoder.load(m_rqt[fullDepth].rqtTemp);
}
// set reconstruction for next intra prediction blocks
- pixel* picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
- intptr_t picStride = m_frame->m_reconPicYuv->m_stride;
- primitives.square_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);
+ pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+ intptr_t picStride = m_frame->m_reconPic->m_stride;
+ primitives.luma_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);
outCost.rdcost += fullCost.rdcost;
outCost.distortion += fullCost.distortion;
}
/* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */
-void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t depthRange[2])
+void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, const uint32_t depthRange[2])
{
CUData& cu = mode.cu;
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
bool bCheckFull = log2TrSize <= depthRange[1];
if (bCheckFull)
{
- pixel* fenc = const_cast<pixel*>(mode.fencYuv->getLumaAddr(absPartIdx));
+ const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
- pixel* picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
- intptr_t picStride = m_frame->m_reconPicYuv->m_stride;
+ pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+ intptr_t picStride = m_frame->m_reconPic->m_stride;
uint32_t stride = mode.fencYuv->m_size;
uint32_t sizeIdx = log2TrSize - 2;
uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
coeff_t* coeff = cu.m_trCoeff[TEXT_LUMA] + coeffOffsetY;
- initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode);
+ initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);
predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n");
- cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
+ cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, false);
{
m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], residual, stride, coeff, log2TrSize, TEXT_LUMA, true, false, numSig);
primitives.luma_add_ps[sizeIdx](picReconY, picStride, pred, residual, stride, stride);
- cu.setCbfSubParts(1 << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
+ cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
}
else
{
- primitives.square_copy_pp[sizeIdx](picReconY, picStride, pred, stride);
+ primitives.luma_copy_pp[sizeIdx](picReconY, picStride, pred, stride);
cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
}
}
X265_CHECK(log2TrSize > depthRange[0], "intra luma split state failure\n");
/* code split block */
- uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t cbf = 0;
- for (uint32_t subPartIdx = 0, absPartIdxSub = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv)
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
- residualTransformQuantIntra(mode, cuGeom, trDepth + 1, absPartIdxSub, depthRange);
- cbf |= cu.getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1);
+ residualTransformQuantIntra(mode, cuGeom, tuDepth + 1, qPartIdx, depthRange);
+ cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
}
- for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
- cu.m_cbf[TEXT_LUMA][absPartIdx + offs] |= (cbf << trDepth);
+ for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
+ cu.m_cbf[TEXT_LUMA][absPartIdx + offs] |= (cbf << tuDepth);
}
}
-void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t trDepth, uint32_t absPartIdx)
+void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx)
{
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
- uint32_t tuDepth = cu.m_tuDepth[absPartIdx];
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
+ uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- if (tuDepth == trDepth)
+ if (tuDepth == cu.m_tuDepth[absPartIdx])
{
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
uint32_t qtLayer = log2TrSize - 2;
// copy transform coefficients
}
else
{
- uint32_t numQPart = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
- for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
- extractIntraResultQT(cu, reconYuv, trDepth + 1, absPartIdx + subPartIdx * numQPart);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ extractIntraResultQT(cu, reconYuv, tuDepth + 1, absPartIdx);
}
}
+inline void offsetCBFs(uint8_t subTUCBF[2])
+{
+ uint8_t combinedCBF = subTUCBF[0] | subTUCBF[1];
+ subTUCBF[0] = subTUCBF[0] << 1 | combinedCBF;
+ subTUCBF[1] = subTUCBF[1] << 1 | combinedCBF;
+}
+
/* 4:2:2 post-TU split processing */
-void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t trDepth, uint32_t absPartIdx)
+void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx)
{
uint32_t depth = cu.m_cuDepth[0];
- uint32_t fullDepth = depth + trDepth;
+ uint32_t fullDepth = depth + tuDepth;
uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- uint32_t trDepthC = trDepth;
if (log2TrSize == 2)
{
- X265_CHECK(m_csp != X265_CSP_I444 && trDepthC, "trDepthC invalid\n");
- trDepthC--;
+ X265_CHECK(m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ ++log2TrSize;
}
- uint32_t partIdxesPerSubTU = (NUM_CU_PARTITIONS >> ((depth + trDepthC) << 1)) >> 1;
+ uint32_t tuNumParts = 1 << ((log2TrSize - LOG2_UNIT_SIZE) * 2 - 1);
// move the CBFs down a level and set the parent CBF
uint8_t subTUCBF[2];
- uint8_t combinedSubTUCBF = 0;
-
- for (uint32_t subTU = 0; subTU < 2; subTU++)
- {
- const uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU);
+ subTUCBF[0] = cu.getCbf(absPartIdx , ttype, tuDepth);
+ subTUCBF[1] = cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth);
+ offsetCBFs(subTUCBF);
- subTUCBF[subTU] = cu.getCbf(subTUAbsPartIdx, ttype, trDepth);
- combinedSubTUCBF |= subTUCBF[subTU];
- }
-
- for (uint32_t subTU = 0; subTU < 2; subTU++)
- {
- const uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU);
- const uint8_t compositeCBF = (subTUCBF[subTU] << 1) | combinedSubTUCBF;
-
- cu.setCbfPartRange((compositeCBF << trDepth), ttype, subTUAbsPartIdx, partIdxesPerSubTU);
- }
+ cu.setCbfPartRange(subTUCBF[0] << tuDepth, ttype, absPartIdx , tuNumParts);
+ cu.setCbfPartRange(subTUCBF[1] << tuDepth, ttype, absPartIdx + tuNumParts, tuNumParts);
}
/* returns distortion */
-uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t& psyEnergy)
+uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy)
{
CUData& cu = mode.cu;
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
- uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
+ uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- if (tuDepthL > trDepth)
+ if (tuDepth < cu.m_tuDepth[absPartIdx])
{
- uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t outDist = 0, splitCbfU = 0, splitCbfV = 0;
- for (uint32_t subPartIdx = 0, absPartIdxSub = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv)
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
- outDist += codeIntraChromaQt(mode, cuGeom, trDepth + 1, absPartIdxSub, psyEnergy);
- splitCbfU |= cu.getCbf(absPartIdxSub, TEXT_CHROMA_U, trDepth + 1);
- splitCbfV |= cu.getCbf(absPartIdxSub, TEXT_CHROMA_V, trDepth + 1);
+ outDist += codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, psyEnergy);
+ splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+ splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
}
- for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
+ for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
{
- cu.m_cbf[TEXT_CHROMA_U][absPartIdx + offs] |= (splitCbfU << trDepth);
- cu.m_cbf[TEXT_CHROMA_V][absPartIdx + offs] |= (splitCbfV << trDepth);
+ cu.m_cbf[TEXT_CHROMA_U][absPartIdx + offs] |= (splitCbfU << tuDepth);
+ cu.m_cbf[TEXT_CHROMA_V][absPartIdx + offs] |= (splitCbfV << tuDepth);
}
return outDist;
}
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
- uint32_t trDepthC = trDepth;
- if (log2TrSizeC == 1)
+ uint32_t tuDepthC = tuDepth;
+ if (log2TrSizeC < 2)
{
- X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth, "invalid trDepth\n");
- trDepthC--;
- log2TrSizeC++;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
- bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
- if (!bFirstQ)
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ if (absPartIdx & 3)
return 0;
+ log2TrSizeC = 2;
+ tuDepthC--;
}
if (m_bEnableRDOQ)
bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]);
if (checkTransformSkip)
- return codeIntraChromaTSkip(mode, cuGeom, trDepth, trDepthC, absPartIdx, psyEnergy);
+ return codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, psyEnergy);
uint32_t qtLayer = log2TrSize - 2;
uint32_t tuSize = 1 << log2TrSizeC;
uint32_t outDist = 0;
- uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
+ uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
- pixel* fenc = const_cast<Yuv*>(mode.fencYuv)->getChromaAddr(chromaId, absPartIdxC);
+ const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t stride = mode.fencYuv->m_csize;
pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
- pixel* picReconC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
- intptr_t picStride = m_frame->m_reconPicYuv->m_strideC;
+ pixel* picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
+ intptr_t picStride = m_frame->m_reconPic->m_strideC;
// init availability pattern
- initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
+ initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
- uint32_t tmpDist;
if (numSig)
{
m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
primitives.luma_add_ps[sizeIdxC](reconQt, reconQtStride, pred, residual, stride, stride);
- cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+ cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
else
{
// no coded residual, recon = pred
- primitives.square_copy_pp[sizeIdxC](reconQt, reconQtStride, pred, stride);
+ primitives.luma_copy_pp[sizeIdxC](reconQt, reconQtStride, pred, stride);
cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
- tmpDist = primitives.sse_pp[sizeIdxC](reconQt, reconQtStride, fenc, stride);
- outDist += (ttype == TEXT_CHROMA_U) ? m_rdCost.scaleChromaDistCb(tmpDist) : m_rdCost.scaleChromaDistCr(tmpDist);
+ outDist += m_rdCost.scaleChromaDist(chromaId, primitives.sse_pp[sizeIdxC](reconQt, reconQtStride, fenc, stride));
if (m_rdCost.m_psyRd)
psyEnergy += m_rdCost.psyCost(sizeIdxC, fenc, stride, picReconC, picStride);
- primitives.square_copy_pp[sizeIdxC](picReconC, picStride, reconQt, reconQtStride);
+ primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, reconQt, reconQtStride);
}
while (tuIterator.isNextSection());
if (splitType == VERTICAL_SPLIT)
- offsetSubTUCBFs(cu, ttype, trDepth, absPartIdx);
+ offsetSubTUCBFs(cu, ttype, tuDepth, absPartIdx);
}
return outDist;
}
/* returns distortion */
-uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t trDepthC, uint32_t absPartIdx, uint32_t& psyEnergy)
+uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy)
{
CUData& cu = mode.cu;
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- uint32_t log2TrSizeC = 2;
+ const uint32_t log2TrSizeC = 2;
uint32_t tuSize = 4;
uint32_t qtLayer = log2TrSize - 2;
uint32_t outDist = 0;
ALIGN_VAR_32(coeff_t, tskipCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
ALIGN_VAR_32(pixel, tskipReconC[MAX_TS_SIZE * MAX_TS_SIZE]);
- uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
+ uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
- pixel* fenc = const_cast<Yuv*>(mode.fencYuv)->getChromaAddr(chromaId, absPartIdxC);
+ const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t stride = mode.fencYuv->m_csize;
- uint32_t sizeIdxC = log2TrSizeC - 2;
+ const uint32_t sizeIdxC = log2TrSizeC - 2;
uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
// init availability pattern
- initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
+ initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
{
m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
primitives.luma_add_ps[sizeIdxC](recon, reconStride, pred, residual, stride, stride);
- cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+ cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
else if (useTSkip)
{
}
else
{
- primitives.square_copy_pp[sizeIdxC](recon, reconStride, pred, stride);
+ primitives.luma_copy_pp[sizeIdxC](recon, reconStride, pred, stride);
cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
uint32_t tmpDist = primitives.sse_pp[sizeIdxC](recon, reconStride, fenc, stride);
- tmpDist = (ttype == TEXT_CHROMA_U) ? m_rdCost.scaleChromaDistCb(tmpDist) : m_rdCost.scaleChromaDistCr(tmpDist);
+ tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist);
cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
if (bTSkip)
{
memcpy(coeffC, tskipCoeffC, sizeof(coeff_t) << (log2TrSizeC * 2));
- primitives.square_copy_pp[sizeIdxC](reconQt, reconQtStride, tskipReconC, MAX_TS_SIZE);
+ primitives.luma_copy_pp[sizeIdxC](reconQt, reconQtStride, tskipReconC, MAX_TS_SIZE);
}
- cu.setCbfPartRange(bCbf << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+ cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
- pixel* reconPicC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
- intptr_t picStride = m_frame->m_reconPicYuv->m_strideC;
- primitives.square_copy_pp[sizeIdxC](reconPicC, picStride, reconQt, reconQtStride);
+ pixel* reconPicC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
+ intptr_t picStride = m_frame->m_reconPic->m_strideC;
+ primitives.luma_copy_pp[sizeIdxC](reconPicC, picStride, reconQt, reconQtStride);
outDist += bDist;
psyEnergy += bEnergy;
while (tuIterator.isNextSection());
if (splitType == VERTICAL_SPLIT)
- offsetSubTUCBFs(cu, ttype, trDepth, absPartIdx);
+ offsetSubTUCBFs(cu, ttype, tuDepth, absPartIdx);
}
m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
return outDist;
}
-void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t trDepth, bool tuQuad)
+void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth)
{
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
+ uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
+ uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
- if (tuDepthL == trDepth)
+ if (tuDepthL == tuDepth || log2TrSizeC == 2)
{
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-
- if (tuQuad)
- {
- log2TrSizeC++; /* extract one 4x4 instead of 4 2x2 */
- trDepth--; /* also adjust the number of coeff read */
- }
-
// copy transform coefficients
uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
uint32_t coeffOffsetC = absPartIdx << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
- uint32_t qtLayer = log2TrSize - 2;
+ uint32_t qtLayer = log2TrSize - 2 - (tuDepthL - tuDepth);
coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
coeff_t* coeffDstU = cu.m_trCoeff[1] + coeffOffsetC;
}
else
{
- if (g_maxLog2CUSize - fullDepth - 1 == 2 && m_csp != X265_CSP_I444)
- /* no such thing as chroma 2x2, so extract one 4x4 instead of 4 2x2 */
- extractIntraResultChromaQT(cu, reconYuv, absPartIdx, trDepth + 1, true);
- else
- {
- uint32_t numQPart = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
- for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
- extractIntraResultChromaQT(cu, reconYuv, absPartIdx + subPartIdx * numQPart, trDepth + 1, false);
- }
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ extractIntraResultChromaQT(cu, reconYuv, absPartIdx, tuDepth + 1);
}
}
-void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx)
+void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx)
{
CUData& cu = mode.cu;
- uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
- uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
+ uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
+ uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
- if (tuDepthL == trDepth)
+ if (tuDepth == cu.m_tuDepth[absPartIdx])
{
- uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
- uint32_t trDepthC = trDepth;
- if (log2TrSizeC == 1)
+ uint32_t tuDepthC = tuDepth;
+ if (log2TrSizeC < 2)
{
- X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth > 0, "invalid trDepth\n");
- trDepthC--;
- log2TrSizeC++;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
- bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
- if (!bFirstQ)
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ if (absPartIdx & 3)
return;
+ log2TrSizeC = 2;
+ tuDepthC--;
}
ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
uint32_t stride = mode.fencYuv->m_csize;
const int sizeIdxC = log2TrSizeC - 2;
- uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
+ uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
- pixel* fenc = const_cast<pixel*>(mode.fencYuv->getChromaAddr(chromaId, absPartIdxC));
+ const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
pixel* recon = mode.reconYuv.getChromaAddr(chromaId, absPartIdxC); // TODO: needed?
uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
coeff_t* coeff = cu.m_trCoeff[ttype] + coeffOffsetC;
- pixel* picReconC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
- uint32_t picStride = m_frame->m_reconPicYuv->m_strideC;
+ pixel* picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
+ uint32_t picStride = m_frame->m_reconPic->m_strideC;
uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
if (chromaPredMode == DM_CHROMA_IDX)
chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
chromaPredMode = (m_csp == X265_CSP_I422) ? g_chroma422IntraAngleMappingTable[chromaPredMode] : chromaPredMode;
- initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
+ initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
{
m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], residual, stride, coeff, log2TrSizeC, ttype, true, false, numSig);
primitives.luma_add_ps[sizeIdxC](recon, stride, pred, residual, stride, stride);
- primitives.square_copy_pp[sizeIdxC](picReconC, picStride, recon, stride);
- cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
+ primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, recon, stride);
+ cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
else
{
- primitives.square_copy_pp[sizeIdxC](recon, stride, pred, stride);
- primitives.square_copy_pp[sizeIdxC](picReconC, picStride, pred, stride);
+ primitives.luma_copy_pp[sizeIdxC](recon, stride, pred, stride);
+ primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, pred, stride);
cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
}
}
while (tuIterator.isNextSection());
if (splitType == VERTICAL_SPLIT)
- offsetSubTUCBFs(cu, (TextType)chromaId, trDepth, absPartIdx);
+ offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx);
}
}
else
{
- uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t splitCbfU = 0, splitCbfV = 0;
- for (uint32_t subPartIdx = 0, absPartIdxC = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxC += qPartsDiv)
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
- residualQTIntraChroma(mode, cuGeom, trDepth + 1, absPartIdxC);
- splitCbfU |= cu.getCbf(absPartIdxC, TEXT_CHROMA_U, trDepth + 1);
- splitCbfV |= cu.getCbf(absPartIdxC, TEXT_CHROMA_V, trDepth + 1);
+ residualQTIntraChroma(mode, cuGeom, tuDepth + 1, qPartIdx);
+ splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+ splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
}
- for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
+ for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
{
- cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << trDepth);
- cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << trDepth);
+ cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
+ cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
}
}
}
intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits();
bool bCodeDQP = m_slice->m_pps->bUseDQP;
- m_entropyCoder.codeCoeff(cu, 0, depth, bCodeDQP, tuDepthRange);
+ m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
m_entropyCoder.store(intraMode.contexts);
intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
updateModeCost(intraMode);
}
-uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t depthRange[2], uint8_t* sharedModes)
+/* Note that this function does not save the best intra prediction, it must
+ * be generated later. It records the best mode in the cu */
+void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
+{
+ CUData& cu = intraMode.cu;
+ uint32_t depth = cu.m_cuDepth[0];
+
+ cu.setPartSizeSubParts(SIZE_2Nx2N);
+ cu.setPredModeSubParts(MODE_INTRA);
+
+ const uint32_t initTuDepth = 0;
+ uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth;
+ uint32_t tuSize = 1 << log2TrSize;
+ const uint32_t absPartIdx = 0;
+
+ // Reference sample smoothing
+ initAdiPattern(cu, cuGeom, absPartIdx, initTuDepth, ALL_IDX);
+
+ const pixel* fenc = intraMode.fencYuv->m_buf[0];
+ uint32_t stride = intraMode.fencYuv->m_size;
+
+ pixel* above = m_refAbove + tuSize - 1;
+ pixel* aboveFiltered = m_refAboveFlt + tuSize - 1;
+ pixel* left = m_refLeft + tuSize - 1;
+ pixel* leftFiltered = m_refLeftFlt + tuSize - 1;
+ int sad, bsad;
+ uint32_t bits, bbits, mode, bmode;
+ uint64_t cost, bcost;
+
+ // 33 Angle modes once
+ ALIGN_VAR_32(pixel, bufScale[32 * 32]);
+ ALIGN_VAR_32(pixel, bufTrans[32 * 32]);
+ ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]);
+ int scaleTuSize = tuSize;
+ int scaleStride = stride;
+ int costShift = 0;
+ int sizeIdx = log2TrSize - 2;
+
+ if (tuSize > 32)
+ {
+ // origin is 64x64, we scale to 32x32 and setup required parameters
+ primitives.scale2D_64to32(bufScale, fenc, stride);
+ fenc = bufScale;
+
+ // reserve space in case primitives need to store data in above
+ // or left buffers
+ pixel _above[4 * 32 + 1];
+ pixel _left[4 * 32 + 1];
+ pixel* aboveScale = _above + 2 * 32;
+ pixel* leftScale = _left + 2 * 32;
+ aboveScale[0] = leftScale[0] = above[0];
+ primitives.scale1D_128to64(aboveScale + 1, above + 1, 0);
+ primitives.scale1D_128to64(leftScale + 1, left + 1, 0);
+
+ scaleTuSize = 32;
+ scaleStride = 32;
+ costShift = 2;
+ sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
+
+ // Filtered and Unfiltered refAbove and refLeft pointing to above and left.
+ above = aboveScale;
+ left = leftScale;
+ aboveFiltered = aboveScale;
+ leftFiltered = leftScale;
+ }
+
+ pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
+ int predsize = scaleTuSize * scaleTuSize;
+
+ m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
+
+ /* there are three cost tiers for intra modes:
+ * pred[0] - mode probable, least cost
+ * pred[1], pred[2] - less probable, slightly more cost
+ * non-mpm modes - all cost the same (rbits) */
+ uint64_t mpms;
+ uint32_t preds[3];
+ uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms);
+
+ // DC
+ primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
+ bsad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
+ bmode = mode = DC_IDX;
+ bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
+ bcost = m_rdCost.calcRdSADCost(bsad, bbits);
+
+ pixel* abovePlanar = above;
+ pixel* leftPlanar = left;
+
+ if (tuSize & (8 | 16 | 32))
+ {
+ abovePlanar = aboveFiltered;
+ leftPlanar = leftFiltered;
+ }
+
+ // PLANAR
+ primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
+ sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
+ mode = PLANAR_IDX;
+ bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
+ cost = m_rdCost.calcRdSADCost(sad, bits);
+ COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
+
+ // Transpose NxN
+ primitives.transpose[sizeIdx](bufTrans, fenc, scaleStride);
+
+ primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
+
+ bool modeHor;
+ const pixel* cmp;
+ intptr_t srcStride;
+
+#define TRY_ANGLE(angle) \
+ modeHor = angle < 18; \
+ cmp = modeHor ? bufTrans : fenc; \
+ srcStride = modeHor ? scaleTuSize : scaleStride; \
+ sad = sa8d(cmp, srcStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \
+ bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
+ cost = m_rdCost.calcRdSADCost(sad, bits)
+
+ if (m_param->bEnableFastIntra)
+ {
+ int asad = 0;
+ uint32_t lowmode, highmode, amode = 5, abits = 0;
+ uint64_t acost = MAX_INT64;
+
+ /* pick the best angle, sampling at distance of 5 */
+ for (mode = 5; mode < 35; mode += 5)
+ {
+ TRY_ANGLE(mode);
+ COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
+ }
+
+ /* refine best angle at distance 2, then distance 1 */
+ for (uint32_t dist = 2; dist >= 1; dist--)
+ {
+ lowmode = amode - dist;
+ highmode = amode + dist;
+
+ X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n");
+ TRY_ANGLE(lowmode);
+ COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits);
+
+ X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n");
+ TRY_ANGLE(highmode);
+ COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits);
+ }
+
+ if (amode == 33)
+ {
+ TRY_ANGLE(34);
+ COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits);
+ }
+
+ COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
+ }
+ else // calculate and search all intra prediction angles for lowest cost
+ {
+ for (mode = 2; mode < 35; mode++)
+ {
+ TRY_ANGLE(mode);
+ COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
+ }
+ }
+
+ cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTuDepth);
+ intraMode.initCosts();
+ intraMode.totalBits = bbits;
+ intraMode.distortion = bsad;
+ intraMode.sa8dCost = bcost;
+ intraMode.sa8dBits = bbits;
+}
+
+void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
+{
+ CUData& cu = intraMode.cu;
+ Yuv* reconYuv = &intraMode.reconYuv;
+ const Yuv* fencYuv = intraMode.fencYuv;
+
+ X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
+ X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");
+
+ m_quant.setQPforQuant(cu);
+
+ uint32_t tuDepthRange[2];
+ cu.getIntraTUQtDepthRange(tuDepthRange, 0);
+
+ m_entropyCoder.load(m_rqt[cuGeom.depth].cur);
+
+ Cost icosts;
+ codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange);
+ extractIntraResultQT(cu, *reconYuv, 0, 0);
+
+ intraMode.distortion = icosts.distortion;
+ intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom);
+
+ m_entropyCoder.resetBits();
+ if (m_slice->m_pps->bTransquantBypassEnabled)
+ m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
+ m_entropyCoder.codeSkipFlag(cu, 0);
+ m_entropyCoder.codePredMode(cu.m_predMode[0]);
+ m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
+ m_entropyCoder.codePredInfo(cu, 0);
+ intraMode.mvBits += m_entropyCoder.getNumberOfWrittenBits();
+
+ bool bCodeDQP = m_slice->m_pps->bUseDQP;
+ m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
+
+ intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
+ intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
+ if (m_rdCost.m_psyRd)
+ intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
+
+ m_entropyCoder.store(intraMode.contexts);
+ updateModeCost(intraMode);
+}
+
+uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes)
{
CUData& cu = intraMode.cu;
Yuv* reconYuv = &intraMode.reconYuv;
const Yuv* fencYuv = intraMode.fencYuv;
uint32_t depth = cu.m_cuDepth[0];
- uint32_t initTrDepth = cu.m_partSize[0] == SIZE_2Nx2N ? 0 : 1;
- uint32_t numPU = 1 << (2 * initTrDepth);
- uint32_t log2TrSize = cu.m_log2CUSize[0] - initTrDepth;
+ uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
+ uint32_t numPU = 1 << (2 * initTuDepth);
+ uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth;
uint32_t tuSize = 1 << log2TrSize;
uint32_t qNumParts = cuGeom.numPartitions >> 2;
uint32_t sizeIdx = log2TrSize - 2;
uint32_t absPartIdx = 0;
uint32_t totalDistortion = 0;
- int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[absPartIdx] == SIZE_NxN;
+ int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N;
// loop over partitions
- for (uint32_t pu = 0; pu < numPU; pu++, absPartIdx += qNumParts)
+ for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts)
{
uint32_t bmode = 0;
if (sharedModes)
- bmode = sharedModes[pu];
+ bmode = sharedModes[puIdx];
else
{
// Reference sample smoothing
- initAdiPattern(cu, cuGeom, absPartIdx, initTrDepth, ALL_IDX);
+ initAdiPattern(cu, cuGeom, absPartIdx, initTuDepth, ALL_IDX);
// determine set of modes to be tested (using prediction signal only)
- pixel* fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
+ const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
uint32_t stride = predYuv->m_size;
- pixel *above = m_refAbove + tuSize - 1;
- pixel *aboveFiltered = m_refAboveFlt + tuSize - 1;
- pixel *left = m_refLeft + tuSize - 1;
- pixel *leftFiltered = m_refLeftFlt + tuSize - 1;
+ pixel* above = m_refAbove + tuSize - 1;
+ pixel* aboveFiltered = m_refAboveFlt + tuSize - 1;
+ pixel* left = m_refLeft + tuSize - 1;
+ pixel* leftFiltered = m_refLeftFlt + tuSize - 1;
// 33 Angle modes once
ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
if (tuSize > 32)
{
- pixel *aboveScale = _above + 2 * 32;
- pixel *leftScale = _left + 2 * 32;
+ pixel* aboveScale = _above + 2 * 32;
+ pixel* leftScale = _left + 2 * 32;
// origin is 64x64, we scale to 32x32 and setup required parameters
primitives.scale2D_64to32(bufScale, fenc, stride);
modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
// PLANAR
- pixel *abovePlanar = above;
- pixel *leftPlanar = left;
+ pixel* abovePlanar = above;
+ pixel* leftPlanar = left;
if (tuSize >= 8 && tuSize <= 32)
{
abovePlanar = aboveFiltered;
for (int mode = 2; mode < 35; mode++)
{
bool modeHor = (mode < 18);
- pixel *cmp = (modeHor ? buf_trans : fenc);
+ const pixel* cmp = (modeHor ? buf_trans : fenc);
intptr_t srcStride = (modeHor ? scaleTuSize : scaleStride);
bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
sad = sa8d(cmp, srcStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
* levels and at higher depths */
uint64_t candCostList[MAX_RD_INTRA_MODES];
uint32_t rdModeList[MAX_RD_INTRA_MODES];
- int maxCandCount = 2 + m_param->rdLevel + ((depth + initTrDepth) >> 1);
+ int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1);
for (int i = 0; i < maxCandCount; i++)
candCostList[i] = MAX_INT64;
if (candCostList[i] == MAX_INT64)
break;
m_entropyCoder.load(m_rqt[depth].cur);
- cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTrDepth);
+ cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth);
Cost icosts;
if (checkTransformSkip)
- codeIntraLumaTSkip(intraMode, cuGeom, initTrDepth, absPartIdx, icosts);
+ codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
else
- codeIntraLumaQT(intraMode, cuGeom, initTrDepth, absPartIdx, false, icosts, depthRange);
+ codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, icosts, depthRange);
COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]);
}
}
/* remeasure best mode, allowing TU splits */
- cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTrDepth);
+ cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth);
m_entropyCoder.load(m_rqt[depth].cur);
Cost icosts;
if (checkTransformSkip)
- codeIntraLumaTSkip(intraMode, cuGeom, initTrDepth, absPartIdx, icosts);
+ codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
else
- codeIntraLumaQT(intraMode, cuGeom, initTrDepth, absPartIdx, true, icosts, depthRange);
+ codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange);
totalDistortion += icosts.distortion;
- extractIntraResultQT(cu, *reconYuv, initTrDepth, absPartIdx);
+ extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx);
// set reconstruction for next intra prediction blocks
- if (pu != numPU - 1)
+ if (puIdx != numPU - 1)
{
/* This has important implications for parallelism and RDO. It is writing intermediate results into the
* output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
* it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think
* that the contexts should be tracked through each PU */
- pixel* dst = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
- uint32_t dststride = m_frame->m_reconPicYuv->m_stride;
- pixel* src = reconYuv->getLumaAddr(absPartIdx);
+ pixel* dst = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+ uint32_t dststride = m_frame->m_reconPic->m_stride;
+ const pixel* src = reconYuv->getLumaAddr(absPartIdx);
uint32_t srcstride = reconYuv->m_size;
- primitives.square_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride);
+ primitives.luma_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride);
}
}
if (numPU > 1)
{
uint32_t combCbfY = 0;
- uint32_t partIdx = 0;
- for (uint32_t part = 0; part < 4; part++, partIdx += qNumParts)
- combCbfY |= cu.getCbf(partIdx, TEXT_LUMA, 1);
+ for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
+ combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1);
for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
cu.m_cbf[0][offs] |= combCbfY;
uint32_t log2TrSizeC = cu.m_log2CUSize[0] - m_hChromaShift;
uint32_t tuSize = 1 << log2TrSizeC;
int32_t scaleTuSize = tuSize;
+ uint32_t tuDepth = 0;
int32_t costShift = 0;
if (tuSize > 32)
{
scaleTuSize = 32;
+ tuDepth = 1;
costShift = 2;
log2TrSizeC = 5;
}
- Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 1);
- Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 2);
+ Predict::initAdiPatternChroma(cu, cuGeom, 0, tuDepth, 1);
+ Predict::initAdiPatternChroma(cu, cuGeom, 0, tuDepth, 2);
cu.getAllowedChromaDir(0, modeList);
// check chroma modes
uint64_t cost = 0;
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
- pixel* fenc = fencYuv->m_buf[chromaId];
+ const pixel* fenc = fencYuv->m_buf[chromaId];
pixel* pred = predYuv->m_buf[chromaId];
pixel* chromaPred = getAdiChromaBuf(chromaId, scaleTuSize);
Yuv& reconYuv = intraMode.reconYuv;
uint32_t depth = cu.m_cuDepth[0];
- uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN && m_csp == X265_CSP_I444;
- uint32_t log2TrSize = cu.m_log2CUSize[0] - initTrDepth;
+ uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N && m_csp == X265_CSP_I444;
+ uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth;
uint32_t absPartStep = (NUM_CU_PARTITIONS >> (depth << 1));
uint32_t totalDistortion = 0;
int part = partitionFromLog2Size(log2TrSize);
- TURecurse tuIterator((initTrDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0);
+ TURecurse tuIterator((initTuDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0);
do
{
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
- int cuSize = 1 << cu.m_log2CUSize[absPartIdxC];
uint32_t bestMode = 0;
uint32_t bestDist = 0;
// restore context models
m_entropyCoder.load(m_rqt[depth].cur);
- cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTrDepth);
+ cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTuDepth);
uint32_t psyEnergy = 0;
- uint32_t dist = codeIntraChromaQt(intraMode, cuGeom, initTrDepth, absPartIdxC, psyEnergy);
+ uint32_t dist = codeIntraChromaQt(intraMode, cuGeom, initTuDepth, absPartIdxC, psyEnergy);
if (m_slice->m_pps->bTransformSkipEnabled)
m_entropyCoder.load(m_rqt[depth].cur);
}
else
{
- uint32_t qtNumParts = cuGeom.numPartitions >> 2;
- if (!(absPartIdxC & (qtNumParts - 1)))
+ uint32_t qNumParts = cuGeom.numPartitions >> 2;
+ if (!(absPartIdxC & (qNumParts - 1)))
m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);
}
- codeSubdivCbfQTChroma(cu, initTrDepth, absPartIdxC, tuIterator.absPartIdxStep, cuSize, cuSize);
- codeCoeffQTChroma(cu, initTrDepth, absPartIdxC, TEXT_CHROMA_U);
- codeCoeffQTChroma(cu, initTrDepth, absPartIdxC, TEXT_CHROMA_V);
+ codeSubdivCbfQTChroma(cu, initTuDepth, absPartIdxC);
+ codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_U);
+ codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_V);
uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(dist, bits, psyEnergy) : m_rdCost.calcRdCost(dist, bits);
bestCost = cost;
bestDist = dist;
bestMode = modeList[mode];
- extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTrDepth, false);
+ extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTuDepth);
memcpy(m_qtTempCbf[1], cu.m_cbf[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(m_qtTempCbf[2], cu.m_cbf[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(m_qtTempTransformSkipFlag[1], cu.m_transformSkip[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
if (!tuIterator.isLastSection())
{
uint32_t zorder = cuGeom.encodeIdx + absPartIdxC;
- uint32_t dststride = m_frame->m_reconPicYuv->m_strideC;
- pixel *src, *dst;
+ uint32_t dststride = m_frame->m_reconPic->m_strideC;
+ const pixel* src;
+ pixel* dst;
- dst = m_frame->m_reconPicYuv->getCbAddr(cu.m_cuAddr, zorder);
+ dst = m_frame->m_reconPic->getCbAddr(cu.m_cuAddr, zorder);
src = reconYuv.getCbAddr(absPartIdxC);
primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize);
- dst = m_frame->m_reconPicYuv->getCrAddr(cu.m_cuAddr, zorder);
+ dst = m_frame->m_reconPic->getCrAddr(cu.m_cuAddr, zorder);
src = reconYuv.getCrAddr(absPartIdxC);
primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize);
}
memcpy(cu.m_cbf[2] + absPartIdxC, m_qtTempCbf[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(cu.m_transformSkip[1] + absPartIdxC, m_qtTempTransformSkipFlag[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
memcpy(cu.m_transformSkip[2] + absPartIdxC, m_qtTempTransformSkipFlag[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
- cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTrDepth);
+ cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTuDepth);
totalDistortion += bestDist;
}
while (tuIterator.isNextSection());
- if (initTrDepth != 0)
+ if (initTuDepth != 0)
{
uint32_t combCbfU = 0;
uint32_t combCbfV = 0;
- uint32_t partIdx = 0;
- for (uint32_t p = 0; p < 4; p++, partIdx += tuIterator.absPartIdxStep)
+ uint32_t qNumParts = tuIterator.absPartIdxStep;
+ for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
- combCbfU |= cu.getCbf(partIdx, TEXT_CHROMA_U, 1);
- combCbfV |= cu.getCbf(partIdx, TEXT_CHROMA_V, 1);
+ combCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, 1);
+ combCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, 1);
}
- for (uint32_t offs = 0; offs < 4 * tuIterator.absPartIdxStep; offs++)
+ for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
{
cu.m_cbf[1][offs] |= combCbfU;
cu.m_cbf[2][offs] |= combCbfV;
continue;
cu.m_mv[0][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][0].mv;
- cu.m_refIdx[0][m.absPartIdx] = (char)m.mvFieldNeighbours[mergeCand][0].refIdx;
+ cu.m_refIdx[0][m.absPartIdx] = (int8_t)m.mvFieldNeighbours[mergeCand][0].refIdx;
cu.m_mv[1][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][1].mv;
- cu.m_refIdx[1][m.absPartIdx] = (char)m.mvFieldNeighbours[mergeCand][1].refIdx;
+ cu.m_refIdx[1][m.absPartIdx] = (int8_t)m.mvFieldNeighbours[mergeCand][1].refIdx;
prepMotionCompensation(cu, cuGeom, puIdx);
- motionCompensation(tempYuv, true, false);
+ motionCompensation(tempYuv, true, m_me.bChromaSATD);
+
uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(m.absPartIdx), tempYuv.m_size);
+ if (m_me.bChromaSATD)
+ costCand += m_me.bufChromaSATD(tempYuv, m.absPartIdx);
+
uint32_t bitsCand = getTUBits(mergeCand, m.maxNumMergeCand);
costCand = costCand + m_rdCost.getCost(bitsCand);
if (costCand < outCost)
/* this function assumes the caller has configured its MotionEstimation engine with the
* correct source plane and source PU, and has called prepMotionCompensation() to set
* m_puAbsPartIdx, m_puWidth, and m_puHeight */
-void Search::singleMotionEstimation(Search& master, const CUData& cu, const CUGeom& cuGeom, int part, int list, int ref)
+void Search::singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, int part, int list, int ref)
{
uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS;
bits += getTUBits(ref, m_slice->m_numRefIdx[list]);
- MV amvpCand[AMVP_NUM_CANDS];
MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
- int numMvc = cu.fillMvpCand(part, m_puAbsPartIdx, list, ref, amvpCand, mvc);
+ int numMvc = interMode.cu.fillMvpCand(part, m_puAbsPartIdx, list, ref, interMode.amvpCand[list][ref], mvc);
- uint32_t bestCost = MAX_INT;
int mvpIdx = 0;
int merange = m_param->searchRange;
- for (int i = 0; i < AMVP_NUM_CANDS; i++)
+ MotionData* bestME = interMode.bestME[part];
+
+ if (interMode.amvpCand[list][ref][0] != interMode.amvpCand[list][ref][1])
{
- MV mvCand = amvpCand[i];
+ uint32_t bestCost = MAX_INT;
+ for (int i = 0; i < AMVP_NUM_CANDS; i++)
+ {
+ MV mvCand = interMode.amvpCand[list][ref][i];
- // NOTE: skip mvCand if Y is > merange and -FN>1
- if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
- continue;
+ // NOTE: skip mvCand if Y is > merange and -FN>1
+ if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
+ continue;
- cu.clipMv(mvCand);
+ interMode.cu.clipMv(mvCand);
- Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
- predInterLumaPixel(tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPicYuv, mvCand);
- uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
+ Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
+ predInterLumaPixel(tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPic, mvCand);
+ uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
- if (bestCost > cost)
- {
- bestCost = cost;
- mvpIdx = i;
+ if (bestCost > cost)
+ {
+ bestCost = cost;
+ mvpIdx = i;
+ }
}
}
- MV mvmin, mvmax, outmv, mvp = amvpCand[mvpIdx];
- setSearchRange(cu, mvp, merange, mvmin, mvmax);
+ MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[list][ref][mvpIdx];
+ setSearchRange(interMode.cu, mvp, merange, mvmin, mvmax);
int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
/* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
- checkBestMVP(amvpCand, outmv, mvp, mvpIdx, bits, cost);
+ checkBestMVP(interMode.amvpCand[list][ref], outmv, mvp, mvpIdx, bits, cost);
/* tie goes to the smallest ref ID, just like --no-pme */
- ScopedLock _lock(master.m_outputLock);
- if (cost < master.m_bestME[list].cost ||
- (cost == master.m_bestME[list].cost && ref < master.m_bestME[list].ref))
+ ScopedLock _lock(master.m_meLock);
+ if (cost < bestME[list].cost ||
+ (cost == bestME[list].cost && ref < bestME[list].ref))
{
- master.m_bestME[list].mv = outmv;
- master.m_bestME[list].mvp = mvp;
- master.m_bestME[list].mvpIdx = mvpIdx;
- master.m_bestME[list].ref = ref;
- master.m_bestME[list].cost = cost;
- master.m_bestME[list].bits = bits;
+ bestME[list].mv = outmv;
+ bestME[list].mvp = mvp;
+ bestME[list].mvpIdx = mvpIdx;
+ bestME[list].ref = ref;
+ bestME[list].cost = cost;
+ bestME[list].bits = bits;
}
}
/* search of the best candidate for inter prediction
* returns true if predYuv was filled with a motion compensated prediction */
-bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChroma)
+bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChromaSA8D)
{
CUData& cu = interMode.cu;
Yuv* predYuv = &interMode.predYuv;
- MV amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS];
MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
const Slice *slice = m_slice;
- PicYuv* fencPic = m_frame->m_origPicYuv;
int numPart = cu.getNumPartInter();
int numPredDir = slice->isInterP() ? 1 : 2;
const int* numRefIdx = slice->m_numRefIdx;
for (int puIdx = 0; puIdx < numPart; puIdx++)
{
+ MotionData* bestME = interMode.bestME[puIdx];
+
/* sets m_puAbsPartIdx, m_puWidth, m_puHeight */
initMotionCompensation(cu, cuGeom, puIdx);
- pixel* pu = fencPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
- m_me.setSourcePU(pu - fencPic->m_picOrg[0], m_puWidth, m_puHeight);
+ m_me.setSourcePU(*interMode.fencYuv, cu.m_cuAddr, cuGeom.encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
uint32_t mrgCost = MAX_UINT;
- /* find best cost merge candidate */
- if (cu.m_partSize[m_puAbsPartIdx] != SIZE_2Nx2N)
+ /* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
+ if (cu.m_partSize[0] != SIZE_2Nx2N)
{
merge.absPartIdx = m_puAbsPartIdx;
merge.width = m_puWidth;
merge.height = m_puHeight;
mrgCost = mergeEstimation(cu, cuGeom, puIdx, merge);
- if (bMergeOnly && cu.m_log2CUSize[0] > 3)
+ if (bMergeOnly)
{
if (mrgCost == MAX_UINT)
{
totalmebits += merge.bits;
prepMotionCompensation(cu, cuGeom, puIdx);
- motionCompensation(*predYuv, true, bChroma);
+ motionCompensation(*predYuv, true, bChromaSA8D);
continue;
}
}
- MotionData bidir[2];
- uint32_t bidirCost = MAX_UINT;
- int bidirBits = 0;
-
- m_bestME[0].cost = MAX_UINT;
- m_bestME[1].cost = MAX_UINT;
+ bestME[0].cost = MAX_UINT;
+ bestME[1].cost = MAX_UINT;
getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits);
- if (bDistributed)
+ /* Uni-directional prediction */
+ if (m_param->analysisMode == X265_ANALYSIS_LOAD && bestME[0].ref >= 0)
{
- m_curMECu = &cu;
- m_curGeom = &cuGeom;
+ for (int l = 0; l < numPredDir; l++)
+ {
+ int ref = bestME[l].ref;
+ uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS;
+ bits += getTUBits(ref, numRefIdx[l]);
+
+ int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc);
+
+ // Pick the best possible MVP from AMVP candidates based on least residual
+ int mvpIdx = 0;
+ int merange = m_param->searchRange;
+
+ if (interMode.amvpCand[l][ref][0] != interMode.amvpCand[l][ref][1])
+ {
+ uint32_t bestCost = MAX_INT;
+ for (int i = 0; i < AMVP_NUM_CANDS; i++)
+ {
+ MV mvCand = interMode.amvpCand[l][ref][i];
+
+ // NOTE: skip mvCand if Y is > merange and -FN>1
+ if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
+ continue;
+
+ cu.clipMv(mvCand);
+ predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand);
+ uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
- /* this worker might already be enqueued for pmode, so other threads
- * might be looking at the ME job counts at any time, do these sets
- * in a safe order */
+ if (bestCost > cost)
+ {
+ bestCost = cost;
+ mvpIdx = i;
+ }
+ }
+ }
+
+ MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[l][ref][mvpIdx];
+
+ int satdCost;
+ setSearchRange(cu, mvp, merange, mvmin, mvmax);
+ satdCost = m_me.motionEstimate(&slice->m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
+
+ /* Get total cost of partition, but only include MV bit cost once */
+ bits += m_me.bitcost(outmv);
+ uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
+
+ /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
+ checkBestMVP(interMode.amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost);
+
+ if (cost < bestME[l].cost)
+ {
+ bestME[l].mv = outmv;
+ bestME[l].mvp = mvp;
+ bestME[l].mvpIdx = mvpIdx;
+ bestME[l].cost = cost;
+ bestME[l].bits = bits;
+ }
+ }
+ }
+ else if (bDistributed)
+ {
+ m_meLock.acquire();
+ m_curInterMode = &interMode;
+ m_curGeom = &cuGeom;
m_curPart = puIdx;
m_totalNumME = 0;
m_numAcquiredME = 1;
m_numCompletedME = 0;
m_totalNumME = numRefIdx[0] + numRefIdx[1];
+ m_meLock.release();
if (!m_bJobsQueued)
JobProvider::enqueue();
for (int i = 1; i < m_totalNumME; i++)
m_pool->pokeIdleThread();
- while (m_totalNumME > m_numAcquiredME)
+ do
{
- int id = ATOMIC_INC(&m_numAcquiredME);
- if (m_totalNumME >= id)
+ m_meLock.acquire();
+ if (m_totalNumME > m_numAcquiredME)
{
- id -= 1;
+ int id = m_numAcquiredME++;
+ m_meLock.release();
+
if (id < numRefIdx[0])
- singleMotionEstimation(*this, cu, cuGeom, puIdx, 0, id);
+ singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, id);
else
- singleMotionEstimation(*this, cu, cuGeom, puIdx, 1, id - numRefIdx[0]);
+ singleMotionEstimation(*this, interMode, cuGeom, puIdx, 1, id - numRefIdx[0]);
- if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME)
- m_meCompletionEvent.trigger();
+ m_meLock.acquire();
+ m_numCompletedME++;
+ m_meLock.release();
}
+ else
+ m_meLock.release();
}
+ while (m_totalNumME > m_numAcquiredME);
+
if (!m_bJobsQueued)
JobProvider::dequeue();
/* we saved L0-0 for ourselves */
- singleMotionEstimation(*this, cu, cuGeom, puIdx, 0, 0);
- if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME)
+ singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, 0);
+
+ m_meLock.acquire();
+ if (++m_numCompletedME == m_totalNumME)
m_meCompletionEvent.trigger();
+ m_meLock.release();
m_meCompletionEvent.wait();
}
else
{
- // Uni-directional prediction
for (int l = 0; l < numPredDir; l++)
{
for (int ref = 0; ref < numRefIdx[l]; ref++)
uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS;
bits += getTUBits(ref, numRefIdx[l]);
- int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, amvpCand[l][ref], mvc);
+ int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc);
// Pick the best possible MVP from AMVP candidates based on least residual
- uint32_t bestCost = MAX_INT;
int mvpIdx = 0;
int merange = m_param->searchRange;
- for (int i = 0; i < AMVP_NUM_CANDS; i++)
+ if (interMode.amvpCand[l][ref][0] != interMode.amvpCand[l][ref][1])
{
- MV mvCand = amvpCand[l][ref][i];
+ uint32_t bestCost = MAX_INT;
+ for (int i = 0; i < AMVP_NUM_CANDS; i++)
+ {
+ MV mvCand = interMode.amvpCand[l][ref][i];
- // NOTE: skip mvCand if Y is > merange and -FN>1
- if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
- continue;
+ // NOTE: skip mvCand if Y is > merange and -FN>1
+ if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
+ continue;
- cu.clipMv(mvCand);
- predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPicYuv, mvCand);
- uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
+ cu.clipMv(mvCand);
+ predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand);
+ uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
- if (bestCost > cost)
- {
- bestCost = cost;
- mvpIdx = i;
+ if (bestCost > cost)
+ {
+ bestCost = cost;
+ mvpIdx = i;
+ }
}
}
- MV mvmin, mvmax, outmv, mvp = amvpCand[l][ref][mvpIdx];
+ MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[l][ref][mvpIdx];
setSearchRange(cu, mvp, merange, mvmin, mvmax);
int satdCost = m_me.motionEstimate(&slice->m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
/* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
- checkBestMVP(amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost);
+ checkBestMVP(interMode.amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost);
- if (cost < m_bestME[l].cost)
+ if (cost < bestME[l].cost)
{
- m_bestME[l].mv = outmv;
- m_bestME[l].mvp = mvp;
- m_bestME[l].mvpIdx = mvpIdx;
- m_bestME[l].ref = ref;
- m_bestME[l].cost = cost;
- m_bestME[l].bits = bits;
+ bestME[l].mv = outmv;
+ bestME[l].mvp = mvp;
+ bestME[l].mvpIdx = mvpIdx;
+ bestME[l].ref = ref;
+ bestME[l].cost = cost;
+ bestME[l].bits = bits;
}
}
}
}
/* Bi-directional prediction */
- if (slice->isInterB() && !cu.isBipredRestriction() && m_bestME[0].cost != MAX_UINT && m_bestME[1].cost != MAX_UINT)
+ MotionData bidir[2];
+ uint32_t bidirCost = MAX_UINT;
+ int bidirBits = 0;
+
+ if (slice->isInterB() && !cu.isBipredRestriction() && /* biprediction is possible for this PU */
+ cu.m_partSize[m_puAbsPartIdx] != SIZE_2Nx2N && /* 2Nx2N biprediction is handled elsewhere */
+ bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT)
{
- bidir[0] = m_bestME[0];
- bidir[1] = m_bestME[1];
+ bidir[0] = bestME[0];
+ bidir[1] = bestME[1];
+
+ int satdCost;
- /* Generate reference subpels */
- PicYuv* refPic0 = slice->m_refPicList[0][m_bestME[0].ref]->m_reconPicYuv;
- PicYuv* refPic1 = slice->m_refPicList[1][m_bestME[1].ref]->m_reconPicYuv;
- Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
- predInterLumaPixel(bidirYuv[0], *refPic0, m_bestME[0].mv);
- predInterLumaPixel(bidirYuv[1], *refPic1, m_bestME[1].mv);
+ if (m_me.bChromaSATD)
+ {
+ cu.m_mv[0][m_puAbsPartIdx] = bidir[0].mv;
+ cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref;
+ cu.m_mv[1][m_puAbsPartIdx] = bidir[1].mv;
+ cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref;
- pixel *pred0 = bidirYuv[0].getLumaAddr(m_puAbsPartIdx);
- pixel *pred1 = bidirYuv[1].getLumaAddr(m_puAbsPartIdx);
+ prepMotionCompensation(cu, cuGeom, puIdx);
+ motionCompensation(tmpPredYuv, true, true);
- int partEnum = partitionFromSizes(m_puWidth, m_puHeight);
- primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, pred0, bidirYuv[0].m_size, pred1, bidirYuv[1].m_size, 32);
- int satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+ satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) +
+ m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx);
+ }
+ else
+ {
+ PicYuv* refPic0 = slice->m_refPicList[0][bestME[0].ref]->m_reconPic;
+ PicYuv* refPic1 = slice->m_refPicList[1][bestME[1].ref]->m_reconPic;
+ Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
- bidirBits = m_bestME[0].bits + m_bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
+ /* Generate reference subpels */
+ predInterLumaPixel(bidirYuv[0], *refPic0, bestME[0].mv);
+ predInterLumaPixel(bidirYuv[1], *refPic1, bestME[1].mv);
+
+ primitives.pixelavg_pp[m_me.partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(m_puAbsPartIdx), bidirYuv[0].m_size,
+ bidirYuv[1].getLumaAddr(m_puAbsPartIdx), bidirYuv[1].m_size, 32);
+ satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+ }
+
+ bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
bidirCost = satdCost + m_rdCost.getCost(bidirBits);
- bool bTryZero = m_bestME[0].mv.notZero() || m_bestME[1].mv.notZero();
+ bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
if (bTryZero)
{
/* Do not try zero MV if unidir motion predictors are beyond
mvmin <<= 2;
mvmax <<= 2;
- bTryZero &= m_bestME[0].mvp.checkRange(mvmin, mvmax);
- bTryZero &= m_bestME[1].mvp.checkRange(mvmin, mvmax);
+ bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
+ bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
}
if (bTryZero)
{
- // coincident blocks of the two reference pictures
- pixel *ref0 = slice->m_mref[0][m_bestME[0].ref].fpelPlane + (pu - fencPic->m_picOrg[0]);
- pixel *ref1 = slice->m_mref[1][m_bestME[1].ref].fpelPlane + (pu - fencPic->m_picOrg[0]);
- intptr_t refStride = slice->m_mref[0][0].lumaStride;
+ /* coincident blocks of the two reference pictures */
+ if (m_me.bChromaSATD)
+ {
+ cu.m_mv[0][m_puAbsPartIdx] = mvzero;
+ cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref;
+ cu.m_mv[1][m_puAbsPartIdx] = mvzero;
+ cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref;
- primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
- satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+ prepMotionCompensation(cu, cuGeom, puIdx);
+ motionCompensation(tmpPredYuv, true, true);
- MV mvp0 = m_bestME[0].mvp;
- int mvpIdx0 = m_bestME[0].mvpIdx;
- uint32_t bits0 = m_bestME[0].bits - m_me.bitcost(m_bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
+ satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) +
+ m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx);
+ }
+ else
+ {
+ const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
+ const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
+ intptr_t refStride = slice->m_mref[0][0].lumaStride;
- MV mvp1 = m_bestME[1].mvp;
- int mvpIdx1 = m_bestME[1].mvpIdx;
- uint32_t bits1 = m_bestME[1].bits - m_me.bitcost(m_bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
+ primitives.pixelavg_pp[m_me.partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
+ satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
+ }
- uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
+ MV mvp0 = bestME[0].mvp;
+ int mvpIdx0 = bestME[0].mvpIdx;
+ uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
- if (bDistributed)
- {
- cu.fillMvpCand(puIdx, m_puAbsPartIdx, 0, m_bestME[0].ref, amvpCand[0][m_bestME[0].ref], mvc);
- cu.fillMvpCand(puIdx, m_puAbsPartIdx, 1, m_bestME[1].ref, amvpCand[1][m_bestME[1].ref], mvc);
- }
+ MV mvp1 = bestME[1].mvp;
+ int mvpIdx1 = bestME[1].mvpIdx;
+ uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
+
+ uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
/* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
- checkBestMVP(amvpCand[0][m_bestME[0].ref], mvzero, mvp0, mvpIdx0, bits0, cost);
- checkBestMVP(amvpCand[1][m_bestME[1].ref], mvzero, mvp1, mvpIdx1, bits1, cost);
+ checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvp0, mvpIdx0, bits0, cost);
+ checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvp1, mvpIdx1, bits1, cost);
if (cost < bidirCost)
{
}
/* select best option and store into CU */
- if (mrgCost < bidirCost && mrgCost < m_bestME[0].cost && mrgCost < m_bestME[1].cost)
+ if (mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost)
{
cu.m_mergeFlag[m_puAbsPartIdx] = true;
cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx
totalmebits += merge.bits;
}
- else if (bidirCost < m_bestME[0].cost && bidirCost < m_bestME[1].cost)
+ else if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost)
{
lastMode = 2;
cu.m_mergeFlag[m_puAbsPartIdx] = false;
cu.setPUInterDir(3, m_puAbsPartIdx, puIdx);
cu.setPUMv(0, bidir[0].mv, m_puAbsPartIdx, puIdx);
- cu.setPURefIdx(0, m_bestME[0].ref, m_puAbsPartIdx, puIdx);
+ cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx);
cu.m_mvd[0][m_puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
cu.m_mvpIdx[0][m_puAbsPartIdx] = bidir[0].mvpIdx;
cu.setPUMv(1, bidir[1].mv, m_puAbsPartIdx, puIdx);
- cu.setPURefIdx(1, m_bestME[1].ref, m_puAbsPartIdx, puIdx);
+ cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx);
cu.m_mvd[1][m_puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
cu.m_mvpIdx[1][m_puAbsPartIdx] = bidir[1].mvpIdx;
totalmebits += bidirBits;
}
- else if (m_bestME[0].cost <= m_bestME[1].cost)
+ else if (bestME[0].cost <= bestME[1].cost)
{
lastMode = 0;
cu.m_mergeFlag[m_puAbsPartIdx] = false;
cu.setPUInterDir(1, m_puAbsPartIdx, puIdx);
- cu.setPUMv(0, m_bestME[0].mv, m_puAbsPartIdx, puIdx);
- cu.setPURefIdx(0, m_bestME[0].ref, m_puAbsPartIdx, puIdx);
- cu.m_mvd[0][m_puAbsPartIdx] = m_bestME[0].mv - m_bestME[0].mvp;
- cu.m_mvpIdx[0][m_puAbsPartIdx] = m_bestME[0].mvpIdx;
+ cu.setPUMv(0, bestME[0].mv, m_puAbsPartIdx, puIdx);
+ cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx);
+ cu.m_mvd[0][m_puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;
+ cu.m_mvpIdx[0][m_puAbsPartIdx] = bestME[0].mvpIdx;
cu.setPURefIdx(1, REF_NOT_VALID, m_puAbsPartIdx, puIdx);
cu.setPUMv(1, mvzero, m_puAbsPartIdx, puIdx);
- totalmebits += m_bestME[0].bits;
+ totalmebits += bestME[0].bits;
}
else
{
cu.m_mergeFlag[m_puAbsPartIdx] = false;
cu.setPUInterDir(2, m_puAbsPartIdx, puIdx);
- cu.setPUMv(1, m_bestME[1].mv, m_puAbsPartIdx, puIdx);
- cu.setPURefIdx(1, m_bestME[1].ref, m_puAbsPartIdx, puIdx);
- cu.m_mvd[1][m_puAbsPartIdx] = m_bestME[1].mv - m_bestME[1].mvp;
- cu.m_mvpIdx[1][m_puAbsPartIdx] = m_bestME[1].mvpIdx;
+ cu.setPUMv(1, bestME[1].mv, m_puAbsPartIdx, puIdx);
+ cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx);
+ cu.m_mvd[1][m_puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;
+ cu.m_mvpIdx[1][m_puAbsPartIdx] = bestME[1].mvpIdx;
cu.setPURefIdx(0, REF_NOT_VALID, m_puAbsPartIdx, puIdx);
cu.setPUMv(0, mvzero, m_puAbsPartIdx, puIdx);
- totalmebits += m_bestME[1].bits;
+ totalmebits += bestME[1].bits;
}
prepMotionCompensation(cu, cuGeom, puIdx);
- motionCompensation(*predYuv, true, bChroma);
+ motionCompensation(*predYuv, true, bChromaSA8D);
}
interMode.sa8dBits += totalmebits;
// No residual coding : SKIP mode
- cu.setSkipFlagSubParts(true);
+ cu.setPredModeSubParts(MODE_SKIP);
cu.clearCbf();
cu.setTUDepthSubParts(0, 0, depth);
interMode.distortion = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
// Chroma
part = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
- interMode.distortion += m_rdCost.scaleChromaDistCb(primitives.sse_pp[part](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
- interMode.distortion += m_rdCost.scaleChromaDistCr(primitives.sse_pp[part](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
+ interMode.distortion += m_rdCost.scaleChromaDist(1, primitives.sse_pp[part](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
+ interMode.distortion += m_rdCost.scaleChromaDist(2, primitives.sse_pp[part](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
m_entropyCoder.load(m_rqt[depth].cur);
m_entropyCoder.resetBits();
if (!cu.m_tqBypass[0])
{
uint32_t cbf0Dist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
- cbf0Dist += m_rdCost.scaleChromaDistCb(primitives.sse_pp[cpart](fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
- cbf0Dist += m_rdCost.scaleChromaDistCr(primitives.sse_pp[cpart](fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
+ cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.sse_pp[cpart](fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
+ cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.sse_pp[cpart](fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
/* Consider the RD cost of not signaling any residual */
m_entropyCoder.load(m_rqt[depth].cur);
uint32_t coeffBits, bits;
if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
{
- cu.setSkipFlagSubParts(true);
+ cu.setPredModeSubParts(MODE_SKIP);
/* Merge/Skip */
m_entropyCoder.resetBits();
uint32_t mvBits = m_entropyCoder.getNumberOfWrittenBits();
bool bCodeDQP = m_slice->m_pps->bUseDQP;
- m_entropyCoder.codeCoeff(cu, 0, cu.m_cuDepth[0], bCodeDQP, tuDepthRange);
+ m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
bits = m_entropyCoder.getNumberOfWrittenBits();
coeffBits = bits - mvBits;
// update with clipped distortion and cost (qp estimation loop uses unclipped values)
uint32_t bestDist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
- bestDist += m_rdCost.scaleChromaDistCb(primitives.sse_pp[cpart](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
- bestDist += m_rdCost.scaleChromaDistCr(primitives.sse_pp[cpart](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
+ bestDist += m_rdCost.scaleChromaDist(1, primitives.sse_pp[cpart](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
+ bestDist += m_rdCost.scaleChromaDist(2, primitives.sse_pp[cpart](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
if (m_rdCost.m_psyRd)
interMode.psyEnergy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
updateModeCost(interMode);
}
-void Search::generateCoeffRecon(Mode& mode, const CUGeom& cuGeom)
-{
- CUData& cu = mode.cu;
-
- m_quant.setQPforQuant(mode.cu);
-
- if (cu.m_predMode[0] == MODE_INTER)
- {
- uint32_t tuDepthRange[2];
- cu.getInterTUQtDepthRange(tuDepthRange, 0);
-
- residualTransformQuantInter(mode, cuGeom, 0, cu.m_cuDepth[0], tuDepthRange);
- if (cu.getQtRootCbf(0))
- mode.reconYuv.addClip(mode.predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]);
- else
- {
- mode.reconYuv.copyFromYuv(mode.predYuv);
- if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
- cu.setSkipFlagSubParts(true);
- }
- }
- else if (cu.m_predMode[0] == MODE_INTRA)
- {
- uint32_t tuDepthRange[2];
- cu.getIntraTUQtDepthRange(tuDepthRange, 0);
-
- uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN;
- residualTransformQuantIntra(mode, cuGeom, initTrDepth, 0, tuDepthRange);
- getBestIntraModeChroma(mode, cuGeom);
- residualQTIntraChroma(mode, cuGeom, 0, 0);
- mode.reconYuv.copyFromPicYuv(*m_frame->m_reconPicYuv, cu.m_cuAddr, cuGeom.encodeIdx); // TODO:
- }
-}
-
-void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, uint32_t depthRange[2])
+void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, const uint32_t depthRange[2])
{
CUData& cu = mode.cu;
X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "invalid depth\n");
uint32_t tuDepth = depth - cu.m_cuDepth[0];
bool bCheckFull = log2TrSize <= depthRange[1];
- if (cu.m_partSize[absPartIdx] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && log2TrSize > depthRange[0])
+ if (cu.m_partSize[0] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && log2TrSize > depthRange[0])
bCheckFull = false;
if (bCheckFull)
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
bool bCodeChroma = true;
uint32_t tuDepthC = tuDepth;
- if (log2TrSizeC == 1)
+ if (log2TrSizeC < 2)
{
- X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444, "tuQuad check failed\n");
- log2TrSizeC++;
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ log2TrSizeC = 2;
tuDepthC--;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
- bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
+ bCodeChroma = !(absPartIdx & 3);
}
uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
const Yuv* fencYuv = mode.fencYuv;
- int16_t *curResiY = resiYuv.getLumaAddr(absPartIdx);
+ int16_t* curResiY = resiYuv.getLumaAddr(absPartIdx);
uint32_t strideResiY = resiYuv.m_size;
- pixel *fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
+ const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
uint32_t numSigY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
if (numSigY)
cu.setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
int16_t* curResiU = resiYuv.getCbAddr(absPartIdxC);
- pixel* fencCb = const_cast<pixel*>(fencYuv->getCbAddr(absPartIdxC));
+ const pixel* fencCb = fencYuv->getCbAddr(absPartIdxC);
uint32_t numSigU = m_quant.transformNxN(cu, fencCb, fencYuv->m_csize, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false);
if (numSigU)
{
}
int16_t* curResiV = resiYuv.getCrAddr(absPartIdxC);
- pixel* fencCr = const_cast<pixel*>(fencYuv->getCrAddr(absPartIdxC));
+ const pixel* fencCr = fencYuv->getCrAddr(absPartIdxC);
uint32_t numSigV = m_quant.transformNxN(cu, fencCr, fencYuv->m_csize, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false);
if (numSigV)
{
{
X265_CHECK(log2TrSize > depthRange[0], "residualTransformQuantInter recursion check failure\n");
- const uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
- for (uint32_t i = 0; i < 4; i++)
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
- residualTransformQuantInter(mode, cuGeom, absPartIdx + i * qPartNumSubdiv, depth + 1, depthRange);
- ycbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_LUMA, tuDepth + 1);
- ucbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_U, tuDepth + 1);
- vcbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_V, tuDepth + 1);
+ residualTransformQuantInter(mode, cuGeom, qPartIdx, depth + 1, depthRange);
+ ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
+ ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+ vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
}
- for (uint32_t i = 0; i < 4 * qPartNumSubdiv; i++)
+ for (uint32_t i = 0; i < 4 * qNumParts; i++)
{
cu.m_cbf[TEXT_LUMA][absPartIdx + i] |= ycbf << tuDepth;
cu.m_cbf[TEXT_CHROMA_U][absPartIdx + i] |= ucbf << tuDepth;
}
}
-void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& outCosts, uint32_t depthRange[2])
+uint64_t Search::estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId)
+{
+ uint32_t nullBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth);
+
+ if (m_rdCost.m_psyRd)
+ return m_rdCost.calcPsyRdCost(dist, nullBits, psyEnergy);
+ else
+ return m_rdCost.calcRdCost(dist, nullBits);
+}
+
+void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2])
{
CUData& cu = mode.cu;
uint32_t log2TrSize = g_maxLog2CUSize - depth;
bool bCheckSplit = log2TrSize > depthRange[0];
bool bCheckFull = log2TrSize <= depthRange[1];
+ bool bSplitPresentFlag = bCheckSplit && bCheckFull;
- if (cu.m_partSize[absPartIdx] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && bCheckSplit)
+ if (cu.m_partSize[0] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && bCheckSplit)
bCheckFull = false;
X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
bool bCodeChroma = true;
uint32_t tuDepthC = tuDepth;
- if ((log2TrSize == 2) && !(m_csp == X265_CSP_I444))
+ if (log2TrSizeC < 2)
{
- log2TrSizeC++;
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ log2TrSizeC = 2;
tuDepthC--;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
- bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
+ bCodeChroma = !(absPartIdx & 3);
}
// code full block
uint8_t cbfFlag[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
uint32_t numSig[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
- uint32_t singleBitsComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
- uint32_t singleDistComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
- uint32_t singlePsyEnergyComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
+ uint32_t singleBits[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
+ uint32_t singleDist[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
+ uint32_t singlePsyEnergy[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, MAX_INT64}, {MAX_INT64, MAX_INT64} };
if (m_bEnableRDOQ)
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
- pixel *fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
- int16_t *resi = resiYuv.getLumaAddr(absPartIdx);
+ const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
+ int16_t* resi = resiYuv.getLumaAddr(absPartIdx);
numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0];
m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbf(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
- if (cbfFlag[TEXT_LUMA][0])
- m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
- singleBitsComp[TEXT_LUMA][0] = m_entropyCoder.getNumberOfWrittenBits();
-
- uint32_t singleBitsPrev = singleBitsComp[TEXT_LUMA][0];
-
- if (bCodeChroma)
- {
- uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
- for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
- {
- coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
- TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
-
- do
- {
- uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
- uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
-
- cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
- if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
- m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
-
- fenc = const_cast<pixel*>(fencYuv->getChromaAddr(chromaId, absPartIdxC));
- resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
- numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
- cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
-
- m_entropyCoder.codeQtCbf(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
- if (cbfFlag[chromaId][tuIterator.section])
- m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
-
- uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits();
- singleBitsComp[chromaId][tuIterator.section] = newBits - singleBitsPrev;
+ if (bSplitPresentFlag && log2TrSize > depthRange[0])
+ m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
+ fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
- singleBitsPrev = newBits;
- }
- while (tuIterator.isNextSection());
- }
- }
+ // Coding luma cbf flag has been removed from here. The context for cbf flag is different for each depth.
+ // So it is valid if we encode coefficients and then cbfs at least for analysis.
+// m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
+ if (cbfFlag[TEXT_LUMA][0])
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
- const uint32_t numCoeffY = 1 << (log2TrSize * 2);
- const uint32_t numCoeffC = 1 << (log2TrSizeC * 2);
+ uint32_t singleBitsPrev = m_entropyCoder.getNumberOfWrittenBits();
+ singleBits[TEXT_LUMA][0] = singleBitsPrev - fullCost.bits;
X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n");
uint32_t distY = primitives.ssd_s[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size);
if (m_rdCost.m_psyRd)
psyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, (int16_t*)zeroShort, 0);
- int16_t *curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
+ int16_t* curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size;
if (cbfFlag[TEXT_LUMA][0])
{
m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSig[TEXT_LUMA][0]); //this is for inter mode only
+ // non-zero cost calculation for luma - This is an approximation
+ // finally we have to encode correct cbf after comparing with null cost
const uint32_t nonZeroDistY = primitives.sse_ss[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
- uint32_t nonZeroPsyEnergyY = 0;
+ uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
+ uint32_t nonZeroPsyEnergyY = 0; uint64_t singleCostY = 0;
if (m_rdCost.m_psyRd)
+ {
nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
+ singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroPsyEnergyY);
+ }
+ else
+ singleCostY = m_rdCost.calcRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0]);
if (cu.m_tqBypass[0])
{
- distY = nonZeroDistY;
- psyEnergyY = nonZeroPsyEnergyY;
+ singleDist[TEXT_LUMA][0] = nonZeroDistY;
+ singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
}
else
{
- uint64_t singleCostY = 0;
- if (m_rdCost.m_psyRd)
- singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0], nonZeroPsyEnergyY);
- else
- singleCostY = m_rdCost.calcRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0]);
- m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbfZero(TEXT_LUMA, tuDepth);
- const uint32_t nullBitsY = m_entropyCoder.getNumberOfWrittenBits();
- uint64_t nullCostY = 0;
- if (m_rdCost.m_psyRd)
- nullCostY = m_rdCost.calcPsyRdCost(distY, nullBitsY, psyEnergyY);
- else
- nullCostY = m_rdCost.calcRdCost(distY, nullBitsY);
+ // zero-cost calculation for luma. This is an approximation
+ // Initial cost calculation was also an approximation. First resetting the bit counter and then encoding zero cbf.
+ // Now encoding the zero cbf without writing into bitstream, keeping m_fracBits unchanged. The same is valid for chroma.
+ uint64_t nullCostY = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
+
if (nullCostY < singleCostY)
{
cbfFlag[TEXT_LUMA][0] = 0;
+ singleBits[TEXT_LUMA][0] = 0;
+ primitives.blockfill_s[partSize](curResiY, strideResiY, 0);
#if CHECKED_BUILD || _DEBUG
+ uint32_t numCoeffY = 1 << (log2TrSize << 1);
memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY);
#endif
if (checkTransformSkipY)
minCost[TEXT_LUMA][0] = nullCostY;
+ singleDist[TEXT_LUMA][0] = distY;
+ singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY;
}
else
{
- distY = nonZeroDistY;
- psyEnergyY = nonZeroPsyEnergyY;
if (checkTransformSkipY)
minCost[TEXT_LUMA][0] = singleCostY;
+ singleDist[TEXT_LUMA][0] = nonZeroDistY;
+ singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
}
}
}
- else if (checkTransformSkipY)
+ else
{
- m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbfZero(TEXT_LUMA, tuDepth);
- const uint32_t nullBitsY = m_entropyCoder.getNumberOfWrittenBits();
- if (m_rdCost.m_psyRd)
- minCost[TEXT_LUMA][0] = m_rdCost.calcPsyRdCost(distY, nullBitsY, psyEnergyY);
- else
- minCost[TEXT_LUMA][0] = m_rdCost.calcRdCost(distY, nullBitsY);
+ if (checkTransformSkipY)
+ minCost[TEXT_LUMA][0] = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
+ primitives.blockfill_s[partSize](curResiY, strideResiY, 0);
+ singleDist[TEXT_LUMA][0] = distY;
+ singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY;
}
- singleDistComp[TEXT_LUMA][0] = distY;
- singlePsyEnergyComp[TEXT_LUMA][0] = psyEnergyY;
- if (!cbfFlag[TEXT_LUMA][0])
- primitives.blockfill_s[partSize](curResiY, strideResiY, 0);
cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
if (bCodeChroma)
{
- uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
+ uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
uint32_t distC = 0, psyEnergyC = 0;
coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
- do
- {
- uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
- uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
+ do
+ {
+ uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
+ uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
- int16_t *curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
+ cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
- distC = m_rdCost.scaleChromaDistCb(primitives.ssd_s[log2TrSizeC - 2](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize));
+ if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
+ m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
- if (cbfFlag[chromaId][tuIterator.section])
- {
- m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiC, strideResiC, coeffCurC + subTUOffset,
- log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]);
- uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
- const uint32_t nonZeroDistC = m_rdCost.scaleChromaDistCb(dist);
- uint32_t nonZeroPsyEnergyC = 0;
- if (m_rdCost.m_psyRd)
- nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
-
- if (cu.m_tqBypass[0])
- {
- distC = nonZeroDistC;
- psyEnergyC = nonZeroPsyEnergyC;
- }
- else
+ fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
+ resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
+ numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
+ cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
+
+ //Coding cbf flags has been removed from here
+// m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][tuIterator.section], tuDepth);
+ if (cbfFlag[chromaId][tuIterator.section])
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
+ uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits();
+ singleBits[chromaId][tuIterator.section] = newBits - singleBitsPrev;
+ singleBitsPrev = newBits;
+
+ int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
+ distC = m_rdCost.scaleChromaDist(chromaId, primitives.ssd_s[log2TrSizeC - 2](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize));
+
+ if (cbfFlag[chromaId][tuIterator.section])
{
- uint64_t singleCostC = 0;
+ m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiC, strideResiC, coeffCurC + subTUOffset,
+ log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]);
+
+ // non-zero cost calculation for chroma, analogous to the luma case above - This is an approximation
+ // finally we have to encode the correct cbf after comparing with the null cost
+ uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
+ uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
+ uint32_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
+ uint32_t nonZeroPsyEnergyC = 0; uint64_t singleCostC = 0;
if (m_rdCost.m_psyRd)
- singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section], nonZeroPsyEnergyC);
+ {
+ nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
+ singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
+ }
else
- singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section]);
- m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbfZero((TextType)chromaId, tuDepth);
- const uint32_t nullBitsC = m_entropyCoder.getNumberOfWrittenBits();
- uint64_t nullCostC = 0;
- if (m_rdCost.m_psyRd)
- nullCostC = m_rdCost.calcPsyRdCost(distC, nullBitsC, psyEnergyC);
+ singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]);
+
+ if (cu.m_tqBypass[0])
+ {
+ singleDist[chromaId][tuIterator.section] = nonZeroDistC;
+ singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
+ }
else
- nullCostC = m_rdCost.calcRdCost(distC, nullBitsC);
- if (nullCostC < singleCostC)
{
- cbfFlag[chromaId][tuIterator.section] = 0;
+ //zero-cost calculation for chroma. This is an approximation
+ uint64_t nullCostC = estimateNullCbfCost(distC, psyEnergyC, tuDepth, (TextType)chromaId);
+
+ if (nullCostC < singleCostC)
+ {
+ cbfFlag[chromaId][tuIterator.section] = 0;
+ singleBits[chromaId][tuIterator.section] = 0;
+ primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0);
#if CHECKED_BUILD || _DEBUG
+ uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
#endif
if (checkTransformSkipC)
minCost[chromaId][tuIterator.section] = nullCostC;
+ singleDist[chromaId][tuIterator.section] = distC;
+ singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC;
}
else
{
- distC = nonZeroDistC;
- psyEnergyC = nonZeroPsyEnergyC;
if (checkTransformSkipC)
minCost[chromaId][tuIterator.section] = singleCostC;
+ singleDist[chromaId][tuIterator.section] = nonZeroDistC;
+ singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
}
}
}
- else if (checkTransformSkipC)
+ else
{
- m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbfZero((TextType)chromaId, tuDepthC);
- const uint32_t nullBitsC = m_entropyCoder.getNumberOfWrittenBits();
- if (m_rdCost.m_psyRd)
- minCost[chromaId][tuIterator.section] = m_rdCost.calcPsyRdCost(distC, nullBitsC, psyEnergyC);
- else
- minCost[chromaId][tuIterator.section] = m_rdCost.calcRdCost(distC, nullBitsC);
- }
-
- singleDistComp[chromaId][tuIterator.section] = distC;
- singlePsyEnergyComp[chromaId][tuIterator.section] = psyEnergyC;
-
- if (!cbfFlag[chromaId][tuIterator.section])
+ if (checkTransformSkipC)
+ minCost[chromaId][tuIterator.section] = estimateNullCbfCost(distC, psyEnergyC, tuDepthC, (TextType)chromaId);
primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0);
+ singleDist[chromaId][tuIterator.section] = distC;
+ singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC;
+ }
cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
}
if (m_bEnableRDOQ)
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
- fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
+ fenc = fencYuv->getLumaAddr(absPartIdx);
resi = resiYuv.getLumaAddr(absPartIdx);
uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, tsCoeffY, log2TrSize, TEXT_LUMA, absPartIdx, true);
if (numSigTSkipY)
{
m_entropyCoder.resetBits();
- m_entropyCoder.codeQtCbf(!!numSigTSkipY, TEXT_LUMA, tuDepth);
+ m_entropyCoder.codeQtCbfLuma(!!numSigTSkipY, tuDepth);
m_entropyCoder.codeCoeffNxN(cu, tsCoeffY, absPartIdx, log2TrSize, TEXT_LUMA);
const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits();
cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
else
{
- singleDistComp[TEXT_LUMA][0] = nonZeroDistY;
- singlePsyEnergyComp[TEXT_LUMA][0] = nonZeroPsyEnergyY;
+ singleDist[TEXT_LUMA][0] = nonZeroDistY;
+ singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
bestTransformMode[TEXT_LUMA][0] = 1;
+ uint32_t numCoeffY = 1 << (log2TrSize << 1);
memcpy(coeffCurY, tsCoeffY, sizeof(coeff_t) * numCoeffY);
- primitives.square_copy_ss[partSize](curResiY, strideResiY, tsResiY, trSize);
+ primitives.luma_copy_ss[partSize](curResiY, strideResiY, tsResiY, trSize);
}
cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
- int16_t *curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
+ int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
ALIGN_VAR_32(coeff_t, tsCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
ALIGN_VAR_32(int16_t, tsResiC[MAX_TS_SIZE * MAX_TS_SIZE]);
if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
- fenc = const_cast<pixel*>(fencYuv->getChromaAddr(chromaId, absPartIdxC));
+ fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, tsCoeffC, log2TrSizeC, (TextType)chromaId, absPartIdxC, true);
m_entropyCoder.resetBits();
- singleBitsComp[chromaId][tuIterator.section] = 0;
+ singleBits[chromaId][tuIterator.section] = 0;
if (numSigTSkipC)
{
- m_entropyCoder.codeQtCbf(!!numSigTSkipC, (TextType)chromaId, tuDepth);
+ m_entropyCoder.codeQtCbfChroma(!!numSigTSkipC, tuDepth);
m_entropyCoder.codeCoeffNxN(cu, tsCoeffC, absPartIdxC, log2TrSizeC, (TextType)chromaId);
- singleBitsComp[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
+ singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], tsResiC, trSizeC, tsCoeffC,
log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC);
- nonZeroDistC = m_rdCost.scaleChromaDistCb(dist);
+ nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
if (m_rdCost.m_psyRd)
{
nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC);
- singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section], nonZeroPsyEnergyC);
+ singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
}
else
- singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section]);
+ singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section]);
}
if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC)
cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
else
{
- singleDistComp[chromaId][tuIterator.section] = nonZeroDistC;
- singlePsyEnergyComp[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
+ singleDist[chromaId][tuIterator.section] = nonZeroDistC;
+ singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC;
bestTransformMode[chromaId][tuIterator.section] = 1;
+ uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
memcpy(coeffCurC + subTUOffset, tsCoeffC, sizeof(coeff_t) * numCoeffC);
- primitives.square_copy_ss[partSizeC](curResiC, strideResiC, tsResiC, trSizeC);
+ primitives.luma_copy_ss[partSizeC](curResiC, strideResiC, tsResiC, trSizeC);
}
cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
}
}
+ // Previously cbfs and coefficients were both encoded here, after the distortion calculation above.
+ // Now only the cbfs are encoded, since the coefficients were already encoded above; the bits
+ // collected for the coefficients are simply added to the cbf bit count. Testing showed the
+ // encoding order makes no difference. TODO: confirm whether the original context must be loaded, as below.
m_entropyCoder.load(m_rqt[depth].rqtRoot);
-
m_entropyCoder.resetBits();
- if (log2TrSize > depthRange[0])
- m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
-
+ //Encode cbf flags
if (bCodeChroma)
{
for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
{
if (!splitIntoSubTUs)
- m_entropyCoder.codeQtCbf(cbfFlag[chromaId][0], (TextType)chromaId, tuDepth);
+ m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][0], tuDepth);
else
{
offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx);
- for (uint32_t subTU = 0; subTU < 2; subTU++)
- m_entropyCoder.codeQtCbf(cbfFlag[chromaId][subTU], (TextType)chromaId, tuDepth);
+ m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][0], tuDepth);
+ m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][1], tuDepth);
}
}
}
- m_entropyCoder.codeQtCbf(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
- if (cbfFlag[TEXT_LUMA][0])
- m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
+ m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
- if (bCodeChroma)
- {
- uint32_t subTUSize = 1 << (log2TrSizeC * 2);
- uint32_t partIdxesPerSubTU = absPartIdxStep >> 1;
- uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
+ uint32_t cbfBits = m_entropyCoder.getNumberOfWrittenBits();
- for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
- {
- coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
- if (!splitIntoSubTUs)
- {
- if (cbfFlag[chromaId][0])
- m_entropyCoder.codeCoeffNxN(cu, coeffCurC, absPartIdx, log2TrSizeC, (TextType)chromaId);
- }
- else
- {
- for (uint32_t subTU = 0; subTU < 2; subTU++)
- {
- if (cbfFlag[chromaId][subTU])
- m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTU * subTUSize, absPartIdx + subTU * partIdxesPerSubTU, log2TrSizeC, (TextType)chromaId);
- }
- }
- }
+ uint32_t coeffBits = 0;
+ coeffBits = singleBits[TEXT_LUMA][0];
+ for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
+ {
+ coeffBits += singleBits[TEXT_CHROMA_U][subTUIndex];
+ coeffBits += singleBits[TEXT_CHROMA_V][subTUIndex];
}
- fullCost.distortion += singleDistComp[TEXT_LUMA][0];
- fullCost.energy += singlePsyEnergyComp[TEXT_LUMA][0];// need to check we need to add chroma also
+ // In split mode only coeffBits is needed, because chroma cbfs are encoded differently from luma:
+ // if any of the four split blocks has a non-zero cbf, a cbf of 1 is encoded first, followed by each
+ // split block's individual cbf value. This is not known until all four split blocks have been
+ // analysed, so only the per-block coefficient bits are collected here.
+ fullCost.bits = bSplitPresentFlag ? cbfBits + coeffBits : coeffBits;
+
+ fullCost.distortion += singleDist[TEXT_LUMA][0];
+ fullCost.energy += singlePsyEnergy[TEXT_LUMA][0];// need to check we need to add chroma also
for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
{
- fullCost.distortion += singleDistComp[TEXT_CHROMA_U][subTUIndex];
- fullCost.distortion += singleDistComp[TEXT_CHROMA_V][subTUIndex];
+ fullCost.distortion += singleDist[TEXT_CHROMA_U][subTUIndex];
+ fullCost.distortion += singleDist[TEXT_CHROMA_V][subTUIndex];
}
- fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
if (m_rdCost.m_psyRd)
fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
else
}
Cost splitCost;
- const uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
+ if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
+ {
+ // The subdiv flag can be encoded at the start of analysis of the split blocks.
+ m_entropyCoder.resetBits();
+ m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
+ splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
+ }
+
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
- for (uint32_t i = 0; i < 4; ++i)
+ for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
{
- estimateResidualQT(mode, cuGeom, absPartIdx + i * qPartNumSubdiv, depth + 1, resiYuv, splitCost, depthRange);
- ycbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_LUMA, tuDepth + 1);
- ucbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_U, tuDepth + 1);
- vcbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_V, tuDepth + 1);
+ estimateResidualQT(mode, cuGeom, qPartIdx, depth + 1, resiYuv, splitCost, depthRange);
+ ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
+ ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+ vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
}
- for (uint32_t i = 0; i < 4 * qPartNumSubdiv; ++i)
+ for (uint32_t i = 0; i < 4 * qNumParts; ++i)
{
cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth;
cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
}
+ // Previously cbfs and coefficients were both encoded here for the split blocks. Since the coefficient
+ // bits were already collected per block, only the cbf values are encoded now. As noted above, chroma
+ // cbf encoding differs from luma. Open question: is it valid for coefficients to be encoded with the
+ // context at, e.g., depth 2 while the cbfs are encoded with the context at depth 0?
m_entropyCoder.load(m_rqt[depth].rqtRoot);
m_entropyCoder.resetBits();
- encodeResidualQT(cu, absPartIdx, depth, true, TEXT_LUMA, depthRange);
- encodeResidualQT(cu, absPartIdx, depth, false, TEXT_LUMA, depthRange);
- encodeResidualQT(cu, absPartIdx, depth, false, TEXT_CHROMA_U, depthRange);
- encodeResidualQT(cu, absPartIdx, depth, false, TEXT_CHROMA_V, depthRange);
-
- splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
+ codeInterSubdivCbfQT(cu, absPartIdx, depth, depthRange);
+ uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
+ splitCost.bits += splitCbfBits;
if (m_rdCost.m_psyRd)
splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth);
if (bCodeChroma)
{
- const uint32_t numberOfSections = splitIntoSubTUs ? 2 : 1;
-
- uint32_t partIdxesPerSubTU = absPartIdxStep >> (splitIntoSubTUs ? 1 : 0);
- for (uint32_t subTUIndex = 0; subTUIndex < numberOfSections; subTUIndex++)
+ if (!splitIntoSubTUs)
{
- const uint32_t subTUPartIdx = absPartIdx + (subTUIndex * partIdxesPerSubTU);
-
- cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][subTUIndex], TEXT_CHROMA_U, subTUPartIdx, partIdxesPerSubTU);
- cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][subTUIndex], TEXT_CHROMA_V, subTUPartIdx, partIdxesPerSubTU);
+ cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx, depth);
+ cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx, depth);
+ }
+ else
+ {
+ uint32_t tuNumParts = absPartIdxStep >> 1;
+ cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx , tuNumParts);
+ cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][1], TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
+ cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx , tuNumParts);
+ cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][1], TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
}
}
X265_CHECK(bCheckFull, "check-full must be set\n");
if (bCodeChroma)
{
- uint32_t numberOfSections = splitIntoSubTUs ? 2 : 1;
- uint32_t partIdxesPerSubTU = absPartIdxStep >> (splitIntoSubTUs ? 1 : 0);
-
- for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
+ if (!splitIntoSubTUs)
{
- for (uint32_t subTUIndex = 0; subTUIndex < numberOfSections; subTUIndex++)
- {
- const uint32_t subTUPartIdx = absPartIdx + (subTUIndex * partIdxesPerSubTU);
+ cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx, depth);
+ cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx, depth);
+ }
+ else
+ {
+ uint32_t tuNumParts = absPartIdxStep >> 1;
- if (splitIntoSubTUs)
- {
- uint8_t combinedSubTUCBF = cbfFlag[chromaId][0] | cbfFlag[chromaId][1];
- cu.setCbfPartRange(((cbfFlag[chromaId][subTUIndex] << 1) | combinedSubTUCBF) << tuDepth, (TextType)chromaId, subTUPartIdx, partIdxesPerSubTU);
- }
- else
- cu.setCbfPartRange(cbfFlag[chromaId][subTUIndex] << tuDepth, (TextType)chromaId, subTUPartIdx, partIdxesPerSubTU);
- }
+ offsetCBFs(cbfFlag[TEXT_CHROMA_U]);
+ offsetCBFs(cbfFlag[TEXT_CHROMA_V]);
+ cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx , tuNumParts);
+ cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][1] << tuDepth, TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
+ cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx , tuNumParts);
+ cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][1] << tuDepth, TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
}
}
outCosts.energy += fullCost.energy;
}
-void Search::encodeResidualQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, bool bSubdivAndCbf, TextType ttype, uint32_t depthRange[2])
+void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, const uint32_t depthRange[2])
{
X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
- X265_CHECK(cu.m_predMode[absPartIdx] != MODE_INTRA, "encodeResidualQT() with intra block\n");
+ X265_CHECK(cu.isInter(absPartIdx), "codeInterSubdivCbfQT() with intra block\n");
- const uint32_t curTuDepth = depth - cu.m_cuDepth[0];
- const uint32_t tuDepth = cu.m_tuDepth[absPartIdx];
- const bool bSubdiv = curTuDepth != tuDepth;
+ const uint32_t tuDepth = depth - cu.m_cuDepth[0];
+ const bool bSubdiv = tuDepth != cu.m_tuDepth[absPartIdx];
const uint32_t log2TrSize = g_maxLog2CUSize - depth;
- uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-
- const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
+ if (!(log2TrSize - m_hChromaShift < 2))
+ {
+ if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
+ m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !bSubdiv);
+ if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
+ m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !bSubdiv);
+ }
+ else
+ {
+ X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1), "chroma CBF not matching\n");
+ X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1), "chroma CBF not matching\n");
+ }
- if (bSubdivAndCbf && log2TrSize <= depthRange[1] && log2TrSize > depthRange[0])
- m_entropyCoder.codeTransformSubdivFlag(bSubdiv, 5 - log2TrSize);
+ if (!bSubdiv)
+ {
+ m_entropyCoder.codeQtCbfLuma(cu, absPartIdx, tuDepth);
+ }
+ else
+ {
+ uint32_t qNumParts = 1 << (log2TrSize -1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ codeInterSubdivCbfQT(cu, absPartIdx, depth + 1, depthRange);
+ }
+}
- bool mCodeAll = true;
- uint32_t trWidthC = 1 << log2TrSizeC;
- uint32_t trHeightC = splitIntoSubTUs ? (trWidthC << 1) : trWidthC;
+void Search::encodeResidualQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, TextType ttype, const uint32_t depthRange[2])
+{
+ X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
+ X265_CHECK(cu.isInter(absPartIdx), "encodeResidualQT() with intra block\n");
- const uint32_t numPels = trWidthC * trHeightC;
- if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE))
- mCodeAll = false;
+ const uint32_t curTuDepth = depth - cu.m_cuDepth[0];
+ const uint32_t tuDepth = cu.m_tuDepth[absPartIdx];
+ const bool bSubdiv = curTuDepth != tuDepth;
+ const uint32_t log2TrSize = g_maxLog2CUSize - depth;
- if (bSubdivAndCbf)
+ if (bSubdiv)
{
- const bool bFirstCbfOfCU = curTuDepth == 0;
- if (bFirstCbfOfCU || mCodeAll)
- {
- uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + curTuDepth) << 1);
- if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth - 1))
- m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_U, curTuDepth, !bSubdiv);
- if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth - 1))
- m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_V, curTuDepth, !bSubdiv);
- }
- else
+ if (cu.getCbf(absPartIdx, ttype, curTuDepth))
{
- X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth - 1), "chroma CBF not matching\n");
- X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth - 1), "chroma CBF not matching\n");
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
+ encodeResidualQT(cu, absPartIdx, depth + 1, ttype, depthRange);
}
+ return;
}
-
- if (!bSubdiv)
+ else
{
+ const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
+ uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
+
// Luma
const uint32_t qtLayer = log2TrSize - 2;
uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
// Chroma
bool bCodeChroma = true;
uint32_t tuDepthC = tuDepth;
- if ((log2TrSize == 2) && !(m_csp == X265_CSP_I444))
+ if (log2TrSize == 2 && m_csp != X265_CSP_I444)
{
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
log2TrSizeC++;
tuDepthC--;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
- bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
+ bCodeChroma = !(absPartIdx & 3);
}
- if (bSubdivAndCbf)
- m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, tuDepth);
- else
+ if (ttype == TEXT_LUMA && cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
+
+ if (bCodeChroma)
{
- if (ttype == TEXT_LUMA && cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
+ uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
+ coeff_t* coeffCurU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
+ coeff_t* coeffCurV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
- if (bCodeChroma)
+ if (!splitIntoSubTUs)
{
- uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
- coeff_t* coeffCurU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
- coeff_t* coeffCurV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
-
- if (!splitIntoSubTUs)
+ if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
+ if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
+ }
+ else
+ {
+ uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);
+ uint32_t subTUSize = 1 << (log2TrSizeC * 2);
+ if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
{
- if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
+ if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
- if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
+ if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, TEXT_CHROMA_U);
}
- else
+ if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
{
- uint32_t partIdxesPerSubTU = NUM_CU_PARTITIONS >> (((cu.m_cuDepth[absPartIdx] + tuDepthC) << 1) + 1);
- uint32_t subTUSize = 1 << (log2TrSizeC * 2);
- if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
- {
- if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
- if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, tuDepth + 1))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_U);
- }
- if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
- {
- if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
- if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, tuDepth + 1))
- m_entropyCoder.codeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_V);
- }
+ if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
+ if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
+ m_entropyCoder.codeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, TEXT_CHROMA_V);
}
}
}
}
- else
- {
- if (bSubdivAndCbf || cu.getCbf(absPartIdx, ttype, curTuDepth))
- {
- const uint32_t qpartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
- for (uint32_t i = 0; i < 4; ++i)
- encodeResidualQT(cu, absPartIdx + i * qpartNumSubdiv, depth + 1, bSubdivAndCbf, ttype, depthRange);
- }
- }
}
void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t depth)
X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
const uint32_t curTrMode = depth - cu.m_cuDepth[0];
const uint32_t tuDepth = cu.m_tuDepth[absPartIdx];
+ const uint32_t log2TrSize = g_maxLog2CUSize - depth;
if (curTrMode < tuDepth)
{
- uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
- for (uint32_t i = 0; i < 4; i++, absPartIdx += qPartNumSubdiv)
+ uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
+ for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
saveResidualQTData(cu, resiYuv, absPartIdx, depth + 1);
return;
}
- const uint32_t log2TrSize = g_maxLog2CUSize - depth;
const uint32_t qtLayer = log2TrSize - 2;
uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
bool bCodeChroma = true;
uint32_t tuDepthC = tuDepth;
- if (log2TrSizeC == 1)
+ if (log2TrSizeC < 2)
{
- X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444, "tuQuad check failed\n");
- log2TrSizeC++;
+ X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
+ log2TrSizeC = 2;
tuDepthC--;
- uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
- bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
+ bCodeChroma = !(absPartIdx & 3);
}
m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize);