X-Git-Url: https://git.piment-noir.org/?p=deb_x265.git;a=blobdiff_plain;f=source%2Fencoder%2Fsearch.cpp;h=bc0dc94cf27dbf5e097716436e52de276218277a;hp=cd86318984c02dd6532b524b7a62e21e4b9b6087;hb=b53f7c52d8280ab63876efd6eb292c21430ac607;hpb=5c9b45285dd64723ad1dac380b98a7b1f3095674 diff --git a/source/encoder/search.cpp b/source/encoder/search.cpp index cd86318..bc0dc94 100644 --- a/source/encoder/search.cpp +++ b/source/encoder/search.cpp @@ -37,6 +37,8 @@ using namespace x265; #pragma warning(disable: 4244) // '=' : conversion from 'int' to 'uint8_t', possible loss of data) #endif +#define MVP_IDX_BITS 1 + ALIGN_VAR_32(const pixel, Search::zeroPixel[MAX_CU_SIZE]) = { 0 }; ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 }; @@ -66,11 +68,10 @@ bool Search::initSearch(const x265_param& param, ScalingList& scalingList) m_numLayers = g_log2Size[param.maxCUSize] - 2; m_rdCost.setPsyRdScale(param.psyRd); - m_me.setSearchMethod(param.searchMethod); - m_me.setSubpelRefine(param.subpelRefine); + m_me.init(param.searchMethod, param.subpelRefine, param.internalCsp); bool ok = m_quant.init(m_bEnableRDOQ, param.psyRdoq, scalingList, m_entropyCoder); - if (m_param->noiseReduction) + if (m_param->noiseReductionIntra || m_param->noiseReductionInter) ok &= m_quant.allocNoiseReduction(param); ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */ @@ -163,70 +164,55 @@ void Search::invalidateContexts(int fromDepth) void Search::invalidateContexts(int) {} #endif -void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height) +void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx) { - uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; - uint32_t tuDepthL = cu.m_tuDepth[absPartIdx]; - uint32_t subdiv = tuDepthL > trDepth; + uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth; + uint32_t subdiv = tuDepth < 
cu.m_tuDepth[absPartIdx]; uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; - bool mCodeAll = true; - const uint32_t numPels = 1 << (log2TrSize * 2 - m_hChromaShift - m_vChromaShift); - if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE)) - mCodeAll = false; - - if (mCodeAll) + if (!(log2TrSize - m_hChromaShift < 2)) { - if (!trDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepth - 1)) - m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, (width >> m_hChromaShift), (height >> m_vChromaShift), TEXT_CHROMA_U, trDepth, !subdiv); - - if (!trDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepth - 1)) - m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, (width >> m_hChromaShift), (height >> m_vChromaShift), TEXT_CHROMA_V, trDepth, !subdiv); + if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1)) + m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv); + if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1)) + m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv); } if (subdiv) { - absPartIdxStep >>= 2; - width >>= 1; - height >>= 1; - - uint32_t qtPartNum = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); - for (uint32_t part = 0; part < 4; part++) - codeSubdivCbfQTChroma(cu, trDepth + 1, absPartIdx + part * qtPartNum, absPartIdxStep, width, height); + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; + for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) + codeSubdivCbfQTChroma(cu, tuDepth + 1, absPartIdx); } } -void Search::codeCoeffQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, TextType ttype) +void Search::codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype) { - if (!cu.getCbf(absPartIdx, ttype, trDepth)) + if (!cu.getCbf(absPartIdx, ttype, tuDepth)) return; - uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; - uint32_t tuDepthL = cu.m_tuDepth[absPartIdx]; + uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth; + 
uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; - if (tuDepthL > trDepth) + if (tuDepth < cu.m_tuDepth[absPartIdx]) { - uint32_t qtPartNum = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); - for (uint32_t part = 0; part < 4; part++) - codeCoeffQTChroma(cu, trDepth + 1, absPartIdx + part * qtPartNum, ttype); + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; + for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) + codeCoeffQTChroma(cu, tuDepth + 1, absPartIdx, ttype); return; } - uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; - - uint32_t trDepthC = trDepth; + uint32_t tuDepthC = tuDepth; uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; - - if (log2TrSizeC == 1) - { - X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth, "transform size too small\n"); - trDepthC--; - log2TrSizeC++; - uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1); - bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0); - if (!bFirstQ) + + if (log2TrSizeC < 2) + { + X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); + if (absPartIdx & 3) return; + log2TrSizeC = 2; + tuDepthC--; } uint32_t qtLayer = log2TrSize - 2; @@ -243,17 +229,17 @@ void Search::codeCoeffQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absP uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - 1); coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset; uint32_t subTUSize = 1 << (log2TrSizeC * 2); - uint32_t partIdxesPerSubTU = NUM_CU_PARTITIONS >> (((cu.m_cuDepth[absPartIdx] + trDepthC) << 1) + 1); - if (cu.getCbf(absPartIdx, ttype, trDepth + 1)) + uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2); + if (cu.getCbf(absPartIdx, ttype, tuDepth + 1)) m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype); - if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, trDepth + 1)) - m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, ttype); + if 
(cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1)) + m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, ttype); } } -void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, uint32_t depthRange[2]) +void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2]) { - uint32_t fullDepth = mode.cu.m_cuDepth[0] + trDepth; + uint32_t fullDepth = mode.cu.m_cuDepth[0] + tuDepth; uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; uint32_t qtLayer = log2TrSize - 2; uint32_t sizeIdx = log2TrSize - 2; @@ -280,20 +266,20 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, if (mightSplit) m_entropyCoder.store(m_rqt[fullDepth].rqtRoot); - pixel* fenc = const_cast(mode.fencYuv->getLumaAddr(absPartIdx)); + const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx); pixel* pred = mode.predYuv.getLumaAddr(absPartIdx); int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx); uint32_t stride = mode.fencYuv->m_size; // init availability pattern uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx]; - initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode); + initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode); // get prediction signal predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize); cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth); - cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth); + cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth); uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2); coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY; @@ -312,9 +298,9 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, } else // no coded residual, recon = pred - primitives.square_copy_pp[sizeIdx](reconQt, reconQtStride, pred, stride); 
+ primitives.luma_copy_pp[sizeIdx](reconQt, reconQtStride, pred, stride); - bCBF = !!numSig << trDepth; + bCBF = !!numSig << tuDepth; cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth); fullCost.distortion = primitives.sse_pp[sizeIdx](reconQt, reconQtStride, fenc, stride); @@ -338,21 +324,21 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, } else { - uint32_t qtNumParts = cuGeom.numPartitions >> 2; - if (!trDepth) + uint32_t qNumParts = cuGeom.numPartitions >> 2; + if (!tuDepth) { - for (uint32_t part = 0; part < 4; part++) - m_entropyCoder.codeIntraDirLumaAng(cu, part * qtNumParts, false); + for (uint32_t qIdx = 0; qIdx < 4; ++qIdx) + m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false); } - else if (!(absPartIdx & (qtNumParts - 1))) + else if (!(absPartIdx & (qNumParts - 1))) m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false); } if (log2TrSize != depthRange[0]) m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize); - m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]); + m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth); - if (cu.getCbf(absPartIdx, TEXT_LUMA, trDepth)) + if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth)) m_entropyCoder.codeCoeffNxN(cu, coeffY, absPartIdx, log2TrSize, TEXT_LUMA); fullCost.bits = m_entropyCoder.getNumberOfWrittenBits(); @@ -380,26 +366,25 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, } // code split block - uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); - uint32_t absPartIdxSub = absPartIdx; + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0]; if (m_param->bEnableTSkipFast) - checkTransformSkip &= cu.m_partSize[absPartIdx] == SIZE_NxN; + checkTransformSkip &= cu.m_partSize[0] != SIZE_2Nx2N; Cost splitCost; uint32_t cbf = 0; - for (uint32_t subPartIdx = 0; 
subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv) + for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts) { if (checkTransformSkip) - codeIntraLumaTSkip(mode, cuGeom, trDepth + 1, absPartIdxSub, splitCost); + codeIntraLumaTSkip(mode, cuGeom, tuDepth + 1, qPartIdx, splitCost); else - codeIntraLumaQT(mode, cuGeom, trDepth + 1, absPartIdxSub, bAllowSplit, splitCost, depthRange); + codeIntraLumaQT(mode, cuGeom, tuDepth + 1, qPartIdx, bAllowSplit, splitCost, depthRange); - cbf |= cu.getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1); + cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1); } - for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++) - cu.m_cbf[0][absPartIdx + offs] |= (cbf << trDepth); + for (uint32_t offs = 0; offs < 4 * qNumParts; offs++) + cu.m_cbf[0][absPartIdx + offs] |= (cbf << tuDepth); if (mightNotSplit && log2TrSize != depthRange[0]) { @@ -428,16 +413,16 @@ void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, m_entropyCoder.load(m_rqt[fullDepth].rqtTest); // recover transform index and Cbf values - cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth); + cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth); cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth); cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth); } } // set reconstruction for next intra prediction blocks if full TU prediction won - pixel* picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); - intptr_t picStride = m_frame->m_reconPicYuv->m_stride; - primitives.square_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride); + pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); + intptr_t picStride = m_frame->m_reconPic->m_stride; + primitives.luma_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride); outCost.rdcost += fullCost.rdcost; outCost.distortion += fullCost.distortion; @@ -445,9 +430,9 @@ void 
Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, outCost.energy += fullCost.energy; } -void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, Cost& outCost) +void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost) { - uint32_t fullDepth = mode.cu.m_cuDepth[0] + trDepth; + uint32_t fullDepth = mode.cu.m_cuDepth[0] + tuDepth; uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; uint32_t tuSize = 1 << log2TrSize; @@ -462,7 +447,7 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep int bTSkip = 0; uint32_t bCBF = 0; - pixel* fenc = const_cast(fencYuv->getLumaAddr(absPartIdx)); + const pixel* fenc = fencYuv->getLumaAddr(absPartIdx); pixel* pred = predYuv->getLumaAddr(absPartIdx); int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx); uint32_t stride = fencYuv->m_size; @@ -470,12 +455,12 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep // init availability pattern uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx]; - initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode); + initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode); // get prediction signal predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize); - cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth); + cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth); uint32_t qtLayer = log2TrSize - 2; uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2); @@ -518,12 +503,12 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep } else // no residual coded, recon = pred - primitives.square_copy_pp[sizeIdx](tmpRecon, tmpReconStride, pred, stride); + primitives.luma_copy_pp[sizeIdx](tmpRecon, tmpReconStride, pred, stride); uint32_t tmpDist = primitives.sse_pp[sizeIdx](tmpRecon, tmpReconStride, fenc, stride); cu.setTransformSkipSubParts(useTSkip, 
TEXT_LUMA, absPartIdx, fullDepth); - cu.setCbfSubParts((!!numSig) << trDepth, TEXT_LUMA, absPartIdx, fullDepth); + cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth); if (useTSkip) m_entropyCoder.load(m_rqt[fullDepth].rqtRoot); @@ -548,20 +533,20 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep } else { - uint32_t qtNumParts = cuGeom.numPartitions >> 2; - if (!trDepth) + uint32_t qNumParts = cuGeom.numPartitions >> 2; + if (!tuDepth) { - for (uint32_t part = 0; part < 4; part++) - m_entropyCoder.codeIntraDirLumaAng(cu, part * qtNumParts, false); + for (uint32_t qIdx = 0; qIdx < 4; ++qIdx) + m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false); } - else if (!(absPartIdx & (qtNumParts - 1))) + else if (!(absPartIdx & (qNumParts - 1))) m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false); } m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize); - m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]); + m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth); - if (cu.getCbf(absPartIdx, TEXT_LUMA, trDepth)) + if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth)) m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize, TEXT_LUMA); uint32_t tmpBits = m_entropyCoder.getNumberOfWrittenBits(); @@ -591,19 +576,19 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep if (bTSkip) { memcpy(coeffY, tsCoeffY, sizeof(coeff_t) << (log2TrSize * 2)); - primitives.square_copy_pp[sizeIdx](reconQt, reconQtStride, tsReconY, tuSize); + primitives.luma_copy_pp[sizeIdx](reconQt, reconQtStride, tsReconY, tuSize); } else if (checkTransformSkip) { cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth); - cu.setCbfSubParts(bCBF << trDepth, TEXT_LUMA, absPartIdx, fullDepth); + cu.setCbfSubParts(bCBF << tuDepth, TEXT_LUMA, absPartIdx, fullDepth); m_entropyCoder.load(m_rqt[fullDepth].rqtTemp); } // set reconstruction for next intra prediction blocks - pixel* 
picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); - intptr_t picStride = m_frame->m_reconPicYuv->m_stride; - primitives.square_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride); + pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); + intptr_t picStride = m_frame->m_reconPic->m_stride; + primitives.luma_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride); outCost.rdcost += fullCost.rdcost; outCost.distortion += fullCost.distortion; @@ -612,11 +597,11 @@ void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDep } /* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */ -void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t depthRange[2]) +void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, const uint32_t depthRange[2]) { CUData& cu = mode.cu; - uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; + uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth; uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; bool bCheckFull = log2TrSize <= depthRange[1]; @@ -629,22 +614,22 @@ void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint3 if (bCheckFull) { - pixel* fenc = const_cast(mode.fencYuv->getLumaAddr(absPartIdx)); + const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx); pixel* pred = mode.predYuv.getLumaAddr(absPartIdx); int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx); - pixel* picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); - intptr_t picStride = m_frame->m_reconPicYuv->m_stride; + pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); + intptr_t picStride = m_frame->m_reconPic->m_stride; uint32_t stride = mode.fencYuv->m_size; 
uint32_t sizeIdx = log2TrSize - 2; uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx]; uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2); coeff_t* coeff = cu.m_trCoeff[TEXT_LUMA] + coeffOffsetY; - initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode); + initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode); predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize); X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n"); - cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth); + cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth); primitives.calcresidual[sizeIdx](fenc, pred, residual, stride); uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, false); @@ -652,11 +637,11 @@ void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint3 { m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], residual, stride, coeff, log2TrSize, TEXT_LUMA, true, false, numSig); primitives.luma_add_ps[sizeIdx](picReconY, picStride, pred, residual, stride, stride); - cu.setCbfSubParts(1 << trDepth, TEXT_LUMA, absPartIdx, fullDepth); + cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth); } else { - primitives.square_copy_pp[sizeIdx](picReconY, picStride, pred, stride); + primitives.luma_copy_pp[sizeIdx](picReconY, picStride, pred, stride); cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, fullDepth); } } @@ -665,26 +650,25 @@ void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint3 X265_CHECK(log2TrSize > depthRange[0], "intra luma split state failure\n"); /* code split block */ - uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; uint32_t cbf = 0; - for (uint32_t subPartIdx = 0, absPartIdxSub = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv) + for (uint32_t qIdx = 0, qPartIdx = absPartIdx; 
qIdx < 4; ++qIdx, qPartIdx += qNumParts) { - residualTransformQuantIntra(mode, cuGeom, trDepth + 1, absPartIdxSub, depthRange); - cbf |= cu.getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1); + residualTransformQuantIntra(mode, cuGeom, tuDepth + 1, qPartIdx, depthRange); + cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1); } - for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++) - cu.m_cbf[TEXT_LUMA][absPartIdx + offs] |= (cbf << trDepth); + for (uint32_t offs = 0; offs < 4 * qNumParts; offs++) + cu.m_cbf[TEXT_LUMA][absPartIdx + offs] |= (cbf << tuDepth); } } -void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t trDepth, uint32_t absPartIdx) +void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx) { - uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; - uint32_t tuDepth = cu.m_tuDepth[absPartIdx]; + uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth; + uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; - if (tuDepth == trDepth) + if (tuDepth == cu.m_tuDepth[absPartIdx]) { - uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; uint32_t qtLayer = log2TrSize - 2; // copy transform coefficients @@ -698,88 +682,80 @@ void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t trDepth, u } else { - uint32_t numQPart = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); - for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) - extractIntraResultQT(cu, reconYuv, trDepth + 1, absPartIdx + subPartIdx * numQPart); + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; + for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) + extractIntraResultQT(cu, reconYuv, tuDepth + 1, absPartIdx); } } +inline void offsetCBFs(uint8_t subTUCBF[2]) +{ + uint8_t combinedCBF = subTUCBF[0] | subTUCBF[1]; + subTUCBF[0] = subTUCBF[0] << 1 | combinedCBF; + subTUCBF[1] = subTUCBF[1] << 1 | combinedCBF; +} + /* 4:2:2 post-TU split processing */ -void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t 
trDepth, uint32_t absPartIdx) +void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx) { uint32_t depth = cu.m_cuDepth[0]; - uint32_t fullDepth = depth + trDepth; + uint32_t fullDepth = depth + tuDepth; uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; - uint32_t trDepthC = trDepth; if (log2TrSize == 2) { - X265_CHECK(m_csp != X265_CSP_I444 && trDepthC, "trDepthC invalid\n"); - trDepthC--; + X265_CHECK(m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); + ++log2TrSize; } - uint32_t partIdxesPerSubTU = (NUM_CU_PARTITIONS >> ((depth + trDepthC) << 1)) >> 1; + uint32_t tuNumParts = 1 << ((log2TrSize - LOG2_UNIT_SIZE) * 2 - 1); // move the CBFs down a level and set the parent CBF uint8_t subTUCBF[2]; - uint8_t combinedSubTUCBF = 0; - - for (uint32_t subTU = 0; subTU < 2; subTU++) - { - const uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU); + subTUCBF[0] = cu.getCbf(absPartIdx , ttype, tuDepth); + subTUCBF[1] = cu.getCbf(absPartIdx+ tuNumParts, ttype, tuDepth); + offsetCBFs(subTUCBF); - subTUCBF[subTU] = cu.getCbf(subTUAbsPartIdx, ttype, trDepth); - combinedSubTUCBF |= subTUCBF[subTU]; - } - - for (uint32_t subTU = 0; subTU < 2; subTU++) - { - const uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU); - const uint8_t compositeCBF = (subTUCBF[subTU] << 1) | combinedSubTUCBF; - - cu.setCbfPartRange((compositeCBF << trDepth), ttype, subTUAbsPartIdx, partIdxesPerSubTU); - } + cu.setCbfPartRange(subTUCBF[0] << tuDepth, ttype, absPartIdx , tuNumParts); + cu.setCbfPartRange(subTUCBF[1] << tuDepth, ttype, absPartIdx + tuNumParts, tuNumParts); } /* returns distortion */ -uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t& psyEnergy) +uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy) { CUData& cu = mode.cu; - uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; 
- uint32_t tuDepthL = cu.m_tuDepth[absPartIdx]; + uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth; + uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; - if (tuDepthL > trDepth) + if (tuDepth < cu.m_tuDepth[absPartIdx]) { - uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; uint32_t outDist = 0, splitCbfU = 0, splitCbfV = 0; - for (uint32_t subPartIdx = 0, absPartIdxSub = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv) + for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts) { - outDist += codeIntraChromaQt(mode, cuGeom, trDepth + 1, absPartIdxSub, psyEnergy); - splitCbfU |= cu.getCbf(absPartIdxSub, TEXT_CHROMA_U, trDepth + 1); - splitCbfV |= cu.getCbf(absPartIdxSub, TEXT_CHROMA_V, trDepth + 1); + outDist += codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, psyEnergy); + splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1); + splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1); } - for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++) + for (uint32_t offs = 0; offs < 4 * qNumParts; offs++) { - cu.m_cbf[TEXT_CHROMA_U][absPartIdx + offs] |= (splitCbfU << trDepth); - cu.m_cbf[TEXT_CHROMA_V][absPartIdx + offs] |= (splitCbfV << trDepth); + cu.m_cbf[TEXT_CHROMA_U][absPartIdx + offs] |= (splitCbfU << tuDepth); + cu.m_cbf[TEXT_CHROMA_V][absPartIdx + offs] |= (splitCbfV << tuDepth); } return outDist; } - uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; - uint32_t trDepthC = trDepth; - if (log2TrSizeC == 1) + uint32_t tuDepthC = tuDepth; + if (log2TrSizeC < 2) { - X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth, "invalid trDepth\n"); - trDepthC--; - log2TrSizeC++; - uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1); - bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0); - if (!bFirstQ) + X265_CHECK(log2TrSize == 2 && m_csp != 
X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); + if (absPartIdx & 3) return 0; + log2TrSizeC = 2; + tuDepthC--; } if (m_bEnableRDOQ) @@ -788,13 +764,13 @@ uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tr bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0]; checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]); if (checkTransformSkip) - return codeIntraChromaTSkip(mode, cuGeom, trDepth, trDepthC, absPartIdx, psyEnergy); + return codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, psyEnergy); uint32_t qtLayer = log2TrSize - 2; uint32_t tuSize = 1 << log2TrSizeC; uint32_t outDist = 0; - uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1); + uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1); const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT; for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) @@ -806,7 +782,7 @@ uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tr { uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; - pixel* fenc = const_cast(mode.fencYuv)->getChromaAddr(chromaId, absPartIdxC); + const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC); pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC); int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC); uint32_t stride = mode.fencYuv->m_csize; @@ -817,11 +793,11 @@ uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tr pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC); uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize; - pixel* picReconC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC); - intptr_t picStride = 
m_frame->m_reconPicYuv->m_strideC; + pixel* picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC); + intptr_t picStride = m_frame->m_reconPic->m_strideC; // init availability pattern - initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId); + initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId); pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize); uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC]; @@ -837,44 +813,42 @@ uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tr primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride); uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false); - uint32_t tmpDist; if (numSig) { m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig); primitives.luma_add_ps[sizeIdxC](reconQt, reconQtStride, pred, residual, stride, stride); - cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); + cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); } else { // no coded residual, recon = pred - primitives.square_copy_pp[sizeIdxC](reconQt, reconQtStride, pred, stride); + primitives.luma_copy_pp[sizeIdxC](reconQt, reconQtStride, pred, stride); cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep); } - tmpDist = primitives.sse_pp[sizeIdxC](reconQt, reconQtStride, fenc, stride); - outDist += (ttype == TEXT_CHROMA_U) ? 
m_rdCost.scaleChromaDistCb(tmpDist) : m_rdCost.scaleChromaDistCr(tmpDist); + outDist += m_rdCost.scaleChromaDist(chromaId, primitives.sse_pp[sizeIdxC](reconQt, reconQtStride, fenc, stride)); if (m_rdCost.m_psyRd) psyEnergy += m_rdCost.psyCost(sizeIdxC, fenc, stride, picReconC, picStride); - primitives.square_copy_pp[sizeIdxC](picReconC, picStride, reconQt, reconQtStride); + primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, reconQt, reconQtStride); } while (tuIterator.isNextSection()); if (splitType == VERTICAL_SPLIT) - offsetSubTUCBFs(cu, ttype, trDepth, absPartIdx); + offsetSubTUCBFs(cu, ttype, tuDepth, absPartIdx); } return outDist; } /* returns distortion */ -uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t trDepthC, uint32_t absPartIdx, uint32_t& psyEnergy) +uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy) { CUData& cu = mode.cu; - uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; + uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth; uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; - uint32_t log2TrSizeC = 2; + const uint32_t log2TrSizeC = 2; uint32_t tuSize = 4; uint32_t qtLayer = log2TrSize - 2; uint32_t outDist = 0; @@ -887,7 +861,7 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t ALIGN_VAR_32(coeff_t, tskipCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]); ALIGN_VAR_32(pixel, tskipReconC[MAX_TS_SIZE * MAX_TS_SIZE]); - uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1); + uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1); const SplitType splitType = (m_csp == X265_CSP_I422) ? 
VERTICAL_SPLIT : DONT_SPLIT; for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) @@ -899,11 +873,11 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t { uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; - pixel* fenc = const_cast(mode.fencYuv)->getChromaAddr(chromaId, absPartIdxC); + const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC); pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC); int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC); uint32_t stride = mode.fencYuv->m_csize; - uint32_t sizeIdxC = log2TrSizeC - 2; + const uint32_t sizeIdxC = log2TrSizeC - 2; uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift)); coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC; @@ -911,7 +885,7 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize; // init availability pattern - initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId); + initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId); pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize); uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC]; @@ -943,7 +917,7 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t { m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig); primitives.luma_add_ps[sizeIdxC](recon, reconStride, pred, residual, stride, stride); - cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); + cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); } else if (useTSkip) { @@ -952,11 +926,11 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t } else { - primitives.square_copy_pp[sizeIdxC](recon, reconStride, pred, stride); + 
primitives.luma_copy_pp[sizeIdxC](recon, reconStride, pred, stride); cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep); } uint32_t tmpDist = primitives.sse_pp[sizeIdxC](recon, reconStride, fenc, stride); - tmpDist = (ttype == TEXT_CHROMA_U) ? m_rdCost.scaleChromaDistCb(tmpDist) : m_rdCost.scaleChromaDistCr(tmpDist); + tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist); cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep); @@ -991,15 +965,15 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t if (bTSkip) { memcpy(coeffC, tskipCoeffC, sizeof(coeff_t) << (log2TrSizeC * 2)); - primitives.square_copy_pp[sizeIdxC](reconQt, reconQtStride, tskipReconC, MAX_TS_SIZE); + primitives.luma_copy_pp[sizeIdxC](reconQt, reconQtStride, tskipReconC, MAX_TS_SIZE); } - cu.setCbfPartRange(bCbf << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); + cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep); - pixel* reconPicC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC); - intptr_t picStride = m_frame->m_reconPicYuv->m_strideC; - primitives.square_copy_pp[sizeIdxC](reconPicC, picStride, reconQt, reconQtStride); + pixel* reconPicC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC); + intptr_t picStride = m_frame->m_reconPic->m_strideC; + primitives.luma_copy_pp[sizeIdxC](reconPicC, picStride, reconQt, reconQtStride); outDist += bDist; psyEnergy += bEnergy; @@ -1007,34 +981,27 @@ uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t while (tuIterator.isNextSection()); if (splitType == VERTICAL_SPLIT) - offsetSubTUCBFs(cu, ttype, trDepth, absPartIdx); + offsetSubTUCBFs(cu, ttype, tuDepth, absPartIdx); } m_entropyCoder.load(m_rqt[fullDepth].rqtRoot); return outDist; } -void 
Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t trDepth, bool tuQuad) +void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth) { - uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; + uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth; uint32_t tuDepthL = cu.m_tuDepth[absPartIdx]; + uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; + uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; - if (tuDepthL == trDepth) + if (tuDepthL == tuDepth || log2TrSizeC == 2) { - uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; - uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; - - if (tuQuad) - { - log2TrSizeC++; /* extract one 4x4 instead of 4 2x2 */ - trDepth--; /* also adjust the number of coeff read */ - } - // copy transform coefficients uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422)); uint32_t coeffOffsetC = absPartIdx << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift)); - uint32_t qtLayer = log2TrSize - 2; + uint32_t qtLayer = log2TrSize - 2 - (tuDepthL - tuDepth); coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC; coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC; coeff_t* coeffDstU = cu.m_trCoeff[1] + coeffOffsetC; @@ -1047,38 +1014,29 @@ void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absP } else { - if (g_maxLog2CUSize - fullDepth - 1 == 2 && m_csp != X265_CSP_I444) - /* no such thing as chroma 2x2, so extract one 4x4 instead of 4 2x2 */ - extractIntraResultChromaQT(cu, reconYuv, absPartIdx, trDepth + 1, true); - else - { - uint32_t numQPart = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); - for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) - extractIntraResultChromaQT(cu, reconYuv, absPartIdx + subPartIdx * numQPart, trDepth + 1, false); - } + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; + for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) + 
extractIntraResultChromaQT(cu, reconYuv, absPartIdx, tuDepth + 1); } } -void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx) +void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx) { CUData& cu = mode.cu; - uint32_t fullDepth = cu.m_cuDepth[0] + trDepth; - uint32_t tuDepthL = cu.m_tuDepth[absPartIdx]; + uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth; + uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; - if (tuDepthL == trDepth) + if (tuDepth == cu.m_tuDepth[absPartIdx]) { - uint32_t log2TrSize = g_maxLog2CUSize - fullDepth; uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; - uint32_t trDepthC = trDepth; - if (log2TrSizeC == 1) + uint32_t tuDepthC = tuDepth; + if (log2TrSizeC < 2) { - X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth > 0, "invalid trDepth\n"); - trDepthC--; - log2TrSizeC++; - uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1); - bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0); - if (!bFirstQ) + X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); + if (absPartIdx & 3) return; + log2TrSizeC = 2; + tuDepthC--; } ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv; @@ -1086,7 +1044,7 @@ void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t tr uint32_t stride = mode.fencYuv->m_csize; const int sizeIdxC = log2TrSizeC - 2; - uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1); + uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1); const SplitType splitType = (m_csp == X265_CSP_I422) ? 
VERTICAL_SPLIT : DONT_SPLIT; for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) @@ -1098,20 +1056,20 @@ void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t tr { uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; - pixel* fenc = const_cast(mode.fencYuv->getChromaAddr(chromaId, absPartIdxC)); + const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC); pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC); int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC); pixel* recon = mode.reconYuv.getChromaAddr(chromaId, absPartIdxC); // TODO: needed? uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift)); coeff_t* coeff = cu.m_trCoeff[ttype] + coeffOffsetC; - pixel* picReconC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC); - uint32_t picStride = m_frame->m_reconPicYuv->m_strideC; + pixel* picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC); + uint32_t picStride = m_frame->m_reconPic->m_strideC; uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC]; if (chromaPredMode == DM_CHROMA_IDX) chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0]; chromaPredMode = (m_csp == X265_CSP_I422) ? 
g_chroma422IntraAngleMappingTable[chromaPredMode] : chromaPredMode; - initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId); + initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId); pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize); predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp); @@ -1124,36 +1082,36 @@ void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t tr { m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], residual, stride, coeff, log2TrSizeC, ttype, true, false, numSig); primitives.luma_add_ps[sizeIdxC](recon, stride, pred, residual, stride, stride); - primitives.square_copy_pp[sizeIdxC](picReconC, picStride, recon, stride); - cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); + primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, recon, stride); + cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); } else { - primitives.square_copy_pp[sizeIdxC](recon, stride, pred, stride); - primitives.square_copy_pp[sizeIdxC](picReconC, picStride, pred, stride); + primitives.luma_copy_pp[sizeIdxC](recon, stride, pred, stride); + primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, pred, stride); cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep); } } while (tuIterator.isNextSection()); if (splitType == VERTICAL_SPLIT) - offsetSubTUCBFs(cu, (TextType)chromaId, trDepth, absPartIdx); + offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx); } } else { - uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1); + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; uint32_t splitCbfU = 0, splitCbfV = 0; - for (uint32_t subPartIdx = 0, absPartIdxC = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxC += qPartsDiv) + for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts) { - residualQTIntraChroma(mode, cuGeom, trDepth + 1, absPartIdxC); - splitCbfU |= 
cu.getCbf(absPartIdxC, TEXT_CHROMA_U, trDepth + 1); - splitCbfV |= cu.getCbf(absPartIdxC, TEXT_CHROMA_V, trDepth + 1); + residualQTIntraChroma(mode, cuGeom, tuDepth + 1, qPartIdx); + splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1); + splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1); } - for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++) + for (uint32_t offs = 0; offs < 4 * qNumParts; offs++) { - cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << trDepth); - cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << trDepth); + cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth); + cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth); } } } @@ -1188,7 +1146,7 @@ void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits(); bool bCodeDQP = m_slice->m_pps->bUseDQP; - m_entropyCoder.codeCoeff(cu, 0, depth, bCodeDQP, tuDepthRange); + m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange); m_entropyCoder.store(intraMode.contexts); intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits(); intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits; @@ -1198,7 +1156,224 @@ void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize updateModeCost(intraMode); } -uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t depthRange[2], uint8_t* sharedModes) +/* Note that this function does not save the best intra prediction, it must + * be generated later. 
It records the best mode in the cu */ +void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom) +{ + CUData& cu = intraMode.cu; + uint32_t depth = cu.m_cuDepth[0]; + + cu.setPartSizeSubParts(SIZE_2Nx2N); + cu.setPredModeSubParts(MODE_INTRA); + + const uint32_t initTuDepth = 0; + uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth; + uint32_t tuSize = 1 << log2TrSize; + const uint32_t absPartIdx = 0; + + // Reference sample smoothing + initAdiPattern(cu, cuGeom, absPartIdx, initTuDepth, ALL_IDX); + + const pixel* fenc = intraMode.fencYuv->m_buf[0]; + uint32_t stride = intraMode.fencYuv->m_size; + + pixel* above = m_refAbove + tuSize - 1; + pixel* aboveFiltered = m_refAboveFlt + tuSize - 1; + pixel* left = m_refLeft + tuSize - 1; + pixel* leftFiltered = m_refLeftFlt + tuSize - 1; + int sad, bsad; + uint32_t bits, bbits, mode, bmode; + uint64_t cost, bcost; + + // 33 Angle modes once + ALIGN_VAR_32(pixel, bufScale[32 * 32]); + ALIGN_VAR_32(pixel, bufTrans[32 * 32]); + ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]); + int scaleTuSize = tuSize; + int scaleStride = stride; + int costShift = 0; + int sizeIdx = log2TrSize - 2; + + if (tuSize > 32) + { + // origin is 64x64, we scale to 32x32 and setup required parameters + primitives.scale2D_64to32(bufScale, fenc, stride); + fenc = bufScale; + + // reserve space in case primitives need to store data in above + // or left buffers + pixel _above[4 * 32 + 1]; + pixel _left[4 * 32 + 1]; + pixel* aboveScale = _above + 2 * 32; + pixel* leftScale = _left + 2 * 32; + aboveScale[0] = leftScale[0] = above[0]; + primitives.scale1D_128to64(aboveScale + 1, above + 1, 0); + primitives.scale1D_128to64(leftScale + 1, left + 1, 0); + + scaleTuSize = 32; + scaleStride = 32; + costShift = 2; + sizeIdx = 5 - 2; // log2(scaleTuSize) - 2 + + // Filtered and Unfiltered refAbove and refLeft pointing to above and left. 
+ above = aboveScale; + left = leftScale; + aboveFiltered = aboveScale; + leftFiltered = leftScale; + } + + pixelcmp_t sa8d = primitives.sa8d[sizeIdx]; + int predsize = scaleTuSize * scaleTuSize; + + m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur); + + /* there are three cost tiers for intra modes: + * pred[0] - mode probable, least cost + * pred[1], pred[2] - less probable, slightly more cost + * non-mpm modes - all cost the same (rbits) */ + uint64_t mpms; + uint32_t preds[3]; + uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms); + + // DC + primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16)); + bsad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift; + bmode = mode = DC_IDX; + bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits; + bcost = m_rdCost.calcRdSADCost(bsad, bbits); + + pixel* abovePlanar = above; + pixel* leftPlanar = left; + + if (tuSize & (8 | 16 | 32)) + { + abovePlanar = aboveFiltered; + leftPlanar = leftFiltered; + } + + // PLANAR + primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0); + sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift; + mode = PLANAR_IDX; + bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits; + cost = m_rdCost.calcRdSADCost(sad, bits); + COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits); + + // Transpose NxN + primitives.transpose[sizeIdx](bufTrans, fenc, scaleStride); + + primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16)); + + bool modeHor; + const pixel* cmp; + intptr_t srcStride; + +#define TRY_ANGLE(angle) \ + modeHor = angle < 18; \ + cmp = modeHor ? bufTrans : fenc; \ + srcStride = modeHor ? scaleTuSize : scaleStride; \ + sad = sa8d(cmp, srcStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \ + bits = (mpms & ((uint64_t)1 << angle)) ? 
m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \ + cost = m_rdCost.calcRdSADCost(sad, bits) + + if (m_param->bEnableFastIntra) + { + int asad = 0; + uint32_t lowmode, highmode, amode = 5, abits = 0; + uint64_t acost = MAX_INT64; + + /* pick the best angle, sampling at distance of 5 */ + for (mode = 5; mode < 35; mode += 5) + { + TRY_ANGLE(mode); + COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits); + } + + /* refine best angle at distance 2, then distance 1 */ + for (uint32_t dist = 2; dist >= 1; dist--) + { + lowmode = amode - dist; + highmode = amode + dist; + + X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n"); + TRY_ANGLE(lowmode); + COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits); + + X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n"); + TRY_ANGLE(highmode); + COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits); + } + + if (amode == 33) + { + TRY_ANGLE(34); + COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits); + } + + COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits); + } + else // calculate and search all intra prediction angles for lowest cost + { + for (mode = 2; mode < 35; mode++) + { + TRY_ANGLE(mode); + COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits); + } + } + + cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTuDepth); + intraMode.initCosts(); + intraMode.totalBits = bbits; + intraMode.distortion = bsad; + intraMode.sa8dCost = bcost; + intraMode.sa8dBits = bbits; +} + +void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom) +{ + CUData& cu = intraMode.cu; + Yuv* reconYuv = &intraMode.reconYuv; + const Yuv* fencYuv = intraMode.fencYuv; + + X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n"); + X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n"); + + m_quant.setQPforQuant(cu); + + uint32_t 
tuDepthRange[2]; + cu.getIntraTUQtDepthRange(tuDepthRange, 0); + + m_entropyCoder.load(m_rqt[cuGeom.depth].cur); + + Cost icosts; + codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange); + extractIntraResultQT(cu, *reconYuv, 0, 0); + + intraMode.distortion = icosts.distortion; + intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom); + + m_entropyCoder.resetBits(); + if (m_slice->m_pps->bTransquantBypassEnabled) + m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]); + m_entropyCoder.codeSkipFlag(cu, 0); + m_entropyCoder.codePredMode(cu.m_predMode[0]); + m_entropyCoder.codePartSize(cu, 0, cuGeom.depth); + m_entropyCoder.codePredInfo(cu, 0); + intraMode.mvBits += m_entropyCoder.getNumberOfWrittenBits(); + + bool bCodeDQP = m_slice->m_pps->bUseDQP; + m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange); + + intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits(); + intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits; + if (m_rdCost.m_psyRd) + intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); + + m_entropyCoder.store(intraMode.contexts); + updateModeCost(intraMode); +} + +uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes) { CUData& cu = intraMode.cu; Yuv* reconYuv = &intraMode.reconYuv; @@ -1206,37 +1381,37 @@ uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t const Yuv* fencYuv = intraMode.fencYuv; uint32_t depth = cu.m_cuDepth[0]; - uint32_t initTrDepth = cu.m_partSize[0] == SIZE_2Nx2N ? 
0 : 1; - uint32_t numPU = 1 << (2 * initTrDepth); - uint32_t log2TrSize = cu.m_log2CUSize[0] - initTrDepth; + uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N; + uint32_t numPU = 1 << (2 * initTuDepth); + uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth; uint32_t tuSize = 1 << log2TrSize; uint32_t qNumParts = cuGeom.numPartitions >> 2; uint32_t sizeIdx = log2TrSize - 2; uint32_t absPartIdx = 0; uint32_t totalDistortion = 0; - int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[absPartIdx] == SIZE_NxN; + int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N; // loop over partitions - for (uint32_t pu = 0; pu < numPU; pu++, absPartIdx += qNumParts) + for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts) { uint32_t bmode = 0; if (sharedModes) - bmode = sharedModes[pu]; + bmode = sharedModes[puIdx]; else { // Reference sample smoothing - initAdiPattern(cu, cuGeom, absPartIdx, initTrDepth, ALL_IDX); + initAdiPattern(cu, cuGeom, absPartIdx, initTuDepth, ALL_IDX); // determine set of modes to be tested (using prediction signal only) - pixel* fenc = const_cast(fencYuv->getLumaAddr(absPartIdx)); + const pixel* fenc = fencYuv->getLumaAddr(absPartIdx); uint32_t stride = predYuv->m_size; - pixel *above = m_refAbove + tuSize - 1; - pixel *aboveFiltered = m_refAboveFlt + tuSize - 1; - pixel *left = m_refLeft + tuSize - 1; - pixel *leftFiltered = m_refLeftFlt + tuSize - 1; + pixel* above = m_refAbove + tuSize - 1; + pixel* aboveFiltered = m_refAboveFlt + tuSize - 1; + pixel* left = m_refLeft + tuSize - 1; + pixel* leftFiltered = m_refLeftFlt + tuSize - 1; // 33 Angle modes once ALIGN_VAR_32(pixel, buf_trans[32 * 32]); @@ -1250,8 +1425,8 @@ uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t if (tuSize > 32) { - pixel *aboveScale = _above + 2 * 32; - pixel *leftScale = _left + 2 * 32; + pixel* aboveScale = 
_above + 2 * 32; + pixel* leftScale = _left + 2 * 32; // origin is 64x64, we scale to 32x32 and setup required parameters primitives.scale2D_64to32(bufScale, fenc, stride); @@ -1296,8 +1471,8 @@ uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits); // PLANAR - pixel *abovePlanar = above; - pixel *leftPlanar = left; + pixel* abovePlanar = above; + pixel* leftPlanar = left; if (tuSize >= 8 && tuSize <= 32) { abovePlanar = aboveFiltered; @@ -1316,7 +1491,7 @@ uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t for (int mode = 2; mode < 35; mode++) { bool modeHor = (mode < 18); - pixel *cmp = (modeHor ? buf_trans : fenc); + const pixel* cmp = (modeHor ? buf_trans : fenc); intptr_t srcStride = (modeHor ? scaleTuSize : scaleStride); bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits; sad = sa8d(cmp, srcStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift; @@ -1330,7 +1505,7 @@ uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t * levels and at higher depths */ uint64_t candCostList[MAX_RD_INTRA_MODES]; uint32_t rdModeList[MAX_RD_INTRA_MODES]; - int maxCandCount = 2 + m_param->rdLevel + ((depth + initTrDepth) >> 1); + int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1); for (int i = 0; i < maxCandCount; i++) candCostList[i] = MAX_INT64; @@ -1346,51 +1521,50 @@ uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t if (candCostList[i] == MAX_INT64) break; m_entropyCoder.load(m_rqt[depth].cur); - cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTrDepth); + cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth); Cost icosts; if (checkTransformSkip) - codeIntraLumaTSkip(intraMode, cuGeom, initTrDepth, absPartIdx, icosts); + codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, 
absPartIdx, icosts); else - codeIntraLumaQT(intraMode, cuGeom, initTrDepth, absPartIdx, false, icosts, depthRange); + codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, icosts, depthRange); COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]); } } /* remeasure best mode, allowing TU splits */ - cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTrDepth); + cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth); m_entropyCoder.load(m_rqt[depth].cur); Cost icosts; if (checkTransformSkip) - codeIntraLumaTSkip(intraMode, cuGeom, initTrDepth, absPartIdx, icosts); + codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts); else - codeIntraLumaQT(intraMode, cuGeom, initTrDepth, absPartIdx, true, icosts, depthRange); + codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange); totalDistortion += icosts.distortion; - extractIntraResultQT(cu, *reconYuv, initTrDepth, absPartIdx); + extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx); // set reconstruction for next intra prediction blocks - if (pu != numPU - 1) + if (puIdx != numPU - 1) { /* This has important implications for parallelism and RDO. It is writing intermediate results into the * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also * it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. 
I think * that the contexts should be tracked through each PU */ - pixel* dst = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); - uint32_t dststride = m_frame->m_reconPicYuv->m_stride; - pixel* src = reconYuv->getLumaAddr(absPartIdx); + pixel* dst = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); + uint32_t dststride = m_frame->m_reconPic->m_stride; + const pixel* src = reconYuv->getLumaAddr(absPartIdx); uint32_t srcstride = reconYuv->m_size; - primitives.square_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride); + primitives.luma_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride); } } if (numPU > 1) { uint32_t combCbfY = 0; - uint32_t partIdx = 0; - for (uint32_t part = 0; part < 4; part++, partIdx += qNumParts) - combCbfY |= cu.getCbf(partIdx, TEXT_LUMA, 1); + for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts) + combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1); for (uint32_t offs = 0; offs < 4 * qNumParts; offs++) cu.m_cbf[0][offs] |= combCbfY; @@ -1415,17 +1589,19 @@ void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom) uint32_t log2TrSizeC = cu.m_log2CUSize[0] - m_hChromaShift; uint32_t tuSize = 1 << log2TrSizeC; int32_t scaleTuSize = tuSize; + uint32_t tuDepth = 0; int32_t costShift = 0; if (tuSize > 32) { scaleTuSize = 32; + tuDepth = 1; costShift = 2; log2TrSizeC = 5; } - Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 1); - Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 2); + Predict::initAdiPatternChroma(cu, cuGeom, 0, tuDepth, 1); + Predict::initAdiPatternChroma(cu, cuGeom, 0, tuDepth, 2); cu.getAllowedChromaDir(0, modeList); // check chroma modes @@ -1440,7 +1616,7 @@ void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom) uint64_t cost = 0; for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) { - pixel* fenc = fencYuv->m_buf[chromaId]; + const pixel* fenc = fencYuv->m_buf[chromaId]; pixel* 
pred = predYuv->m_buf[chromaId]; pixel* chromaPred = getAdiChromaBuf(chromaId, scaleTuSize); @@ -1465,19 +1641,18 @@ uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom) Yuv& reconYuv = intraMode.reconYuv; uint32_t depth = cu.m_cuDepth[0]; - uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN && m_csp == X265_CSP_I444; - uint32_t log2TrSize = cu.m_log2CUSize[0] - initTrDepth; + uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N && m_csp == X265_CSP_I444; + uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth; uint32_t absPartStep = (NUM_CU_PARTITIONS >> (depth << 1)); uint32_t totalDistortion = 0; int part = partitionFromLog2Size(log2TrSize); - TURecurse tuIterator((initTrDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0); + TURecurse tuIterator((initTuDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0); do { uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; - int cuSize = 1 << cu.m_log2CUSize[absPartIdxC]; uint32_t bestMode = 0; uint32_t bestDist = 0; @@ -1496,9 +1671,9 @@ uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom) // restore context models m_entropyCoder.load(m_rqt[depth].cur); - cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTrDepth); + cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTuDepth); uint32_t psyEnergy = 0; - uint32_t dist = codeIntraChromaQt(intraMode, cuGeom, initTrDepth, absPartIdxC, psyEnergy); + uint32_t dist = codeIntraChromaQt(intraMode, cuGeom, initTuDepth, absPartIdxC, psyEnergy); if (m_slice->m_pps->bTransformSkipEnabled) m_entropyCoder.load(m_rqt[depth].cur); @@ -1512,14 +1687,14 @@ uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom) } else { - uint32_t qtNumParts = cuGeom.numPartitions >> 2; - if (!(absPartIdxC & (qtNumParts - 1))) + uint32_t qNumParts = cuGeom.numPartitions >> 2; + if (!(absPartIdxC & (qNumParts - 1))) m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList); } - 
codeSubdivCbfQTChroma(cu, initTrDepth, absPartIdxC, tuIterator.absPartIdxStep, cuSize, cuSize); - codeCoeffQTChroma(cu, initTrDepth, absPartIdxC, TEXT_CHROMA_U); - codeCoeffQTChroma(cu, initTrDepth, absPartIdxC, TEXT_CHROMA_V); + codeSubdivCbfQTChroma(cu, initTuDepth, absPartIdxC); + codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_U); + codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_V); uint32_t bits = m_entropyCoder.getNumberOfWrittenBits(); uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(dist, bits, psyEnergy) : m_rdCost.calcRdCost(dist, bits); @@ -1528,7 +1703,7 @@ uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom) bestCost = cost; bestDist = dist; bestMode = modeList[mode]; - extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTrDepth, false); + extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTuDepth); memcpy(m_qtTempCbf[1], cu.m_cbf[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t)); memcpy(m_qtTempCbf[2], cu.m_cbf[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t)); memcpy(m_qtTempTransformSkipFlag[1], cu.m_transformSkip[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t)); @@ -1539,14 +1714,15 @@ uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom) if (!tuIterator.isLastSection()) { uint32_t zorder = cuGeom.encodeIdx + absPartIdxC; - uint32_t dststride = m_frame->m_reconPicYuv->m_strideC; - pixel *src, *dst; + uint32_t dststride = m_frame->m_reconPic->m_strideC; + const pixel* src; + pixel* dst; - dst = m_frame->m_reconPicYuv->getCbAddr(cu.m_cuAddr, zorder); + dst = m_frame->m_reconPic->getCbAddr(cu.m_cuAddr, zorder); src = reconYuv.getCbAddr(absPartIdxC); primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize); - dst = m_frame->m_reconPicYuv->getCrAddr(cu.m_cuAddr, zorder); + dst = m_frame->m_reconPic->getCrAddr(cu.m_cuAddr, zorder); src = reconYuv.getCrAddr(absPartIdxC); 
primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize); } @@ -1555,23 +1731,23 @@ uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom) memcpy(cu.m_cbf[2] + absPartIdxC, m_qtTempCbf[2], tuIterator.absPartIdxStep * sizeof(uint8_t)); memcpy(cu.m_transformSkip[1] + absPartIdxC, m_qtTempTransformSkipFlag[1], tuIterator.absPartIdxStep * sizeof(uint8_t)); memcpy(cu.m_transformSkip[2] + absPartIdxC, m_qtTempTransformSkipFlag[2], tuIterator.absPartIdxStep * sizeof(uint8_t)); - cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTrDepth); + cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTuDepth); totalDistortion += bestDist; } while (tuIterator.isNextSection()); - if (initTrDepth != 0) + if (initTuDepth != 0) { uint32_t combCbfU = 0; uint32_t combCbfV = 0; - uint32_t partIdx = 0; - for (uint32_t p = 0; p < 4; p++, partIdx += tuIterator.absPartIdxStep) + uint32_t qNumParts = tuIterator.absPartIdxStep; + for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts) { - combCbfU |= cu.getCbf(partIdx, TEXT_CHROMA_U, 1); - combCbfV |= cu.getCbf(partIdx, TEXT_CHROMA_V, 1); + combCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, 1); + combCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, 1); } - for (uint32_t offs = 0; offs < 4 * tuIterator.absPartIdxStep; offs++) + for (uint32_t offs = 0; offs < 4 * qNumParts; offs++) { cu.m_cbf[1][offs] |= combCbfU; cu.m_cbf[2][offs] |= combCbfV; @@ -1615,13 +1791,17 @@ uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, int puIdx, Me continue; cu.m_mv[0][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][0].mv; - cu.m_refIdx[0][m.absPartIdx] = (char)m.mvFieldNeighbours[mergeCand][0].refIdx; + cu.m_refIdx[0][m.absPartIdx] = (int8_t)m.mvFieldNeighbours[mergeCand][0].refIdx; cu.m_mv[1][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][1].mv; - cu.m_refIdx[1][m.absPartIdx] = (char)m.mvFieldNeighbours[mergeCand][1].refIdx; + cu.m_refIdx[1][m.absPartIdx] = 
(int8_t)m.mvFieldNeighbours[mergeCand][1].refIdx; prepMotionCompensation(cu, cuGeom, puIdx); - motionCompensation(tempYuv, true, false); + motionCompensation(tempYuv, true, m_me.bChromaSATD); + uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(m.absPartIdx), tempYuv.m_size); + if (m_me.bChromaSATD) + costCand += m_me.bufChromaSATD(tempYuv, m.absPartIdx); + uint32_t bitsCand = getTUBits(mergeCand, m.maxNumMergeCand); costCand = costCand + m_rdCost.getCost(bitsCand); if (costCand < outCost) @@ -1642,41 +1822,45 @@ uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, int puIdx, Me /* this function assumes the caller has configured its MotionEstimation engine with the * correct source plane and source PU, and has called prepMotionCompensation() to set * m_puAbsPartIdx, m_puWidth, and m_puHeight */ -void Search::singleMotionEstimation(Search& master, const CUData& cu, const CUGeom& cuGeom, int part, int list, int ref) +void Search::singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, int part, int list, int ref) { uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS; bits += getTUBits(ref, m_slice->m_numRefIdx[list]); - MV amvpCand[AMVP_NUM_CANDS]; MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1]; - int numMvc = cu.fillMvpCand(part, m_puAbsPartIdx, list, ref, amvpCand, mvc); + int numMvc = interMode.cu.fillMvpCand(part, m_puAbsPartIdx, list, ref, interMode.amvpCand[list][ref], mvc); - uint32_t bestCost = MAX_INT; int mvpIdx = 0; int merange = m_param->searchRange; - for (int i = 0; i < AMVP_NUM_CANDS; i++) + MotionData* bestME = interMode.bestME[part]; + + if (interMode.amvpCand[list][ref][0] != interMode.amvpCand[list][ref][1]) { - MV mvCand = amvpCand[i]; + uint32_t bestCost = MAX_INT; + for (int i = 0; i < AMVP_NUM_CANDS; i++) + { + MV mvCand = interMode.amvpCand[list][ref][i]; - // NOTE: skip mvCand if Y is > merange and -FN>1 - if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4)) - continue; + // NOTE: skip mvCand if Y is > 
merange and -FN>1 + if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4)) + continue; - cu.clipMv(mvCand); + interMode.cu.clipMv(mvCand); - Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv; - predInterLumaPixel(tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPicYuv, mvCand); - uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size); + Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv; + predInterLumaPixel(tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPic, mvCand); + uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size); - if (bestCost > cost) - { - bestCost = cost; - mvpIdx = i; + if (bestCost > cost) + { + bestCost = cost; + mvpIdx = i; + } } } - MV mvmin, mvmax, outmv, mvp = amvpCand[mvpIdx]; - setSearchRange(cu, mvp, merange, mvmin, mvmax); + MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[list][ref][mvpIdx]; + setSearchRange(interMode.cu, mvp, merange, mvmin, mvmax); int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv); @@ -1685,34 +1869,32 @@ void Search::singleMotionEstimation(Search& master, const CUData& cu, const CUGe uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits); /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */ - checkBestMVP(amvpCand, outmv, mvp, mvpIdx, bits, cost); + checkBestMVP(interMode.amvpCand[list][ref], outmv, mvp, mvpIdx, bits, cost); /* tie goes to the smallest ref ID, just like --no-pme */ - ScopedLock _lock(master.m_outputLock); - if (cost < master.m_bestME[list].cost || - (cost == master.m_bestME[list].cost && ref < master.m_bestME[list].ref)) + ScopedLock _lock(master.m_meLock); + if (cost < bestME[list].cost || + (cost == bestME[list].cost && ref < bestME[list].ref)) { - master.m_bestME[list].mv = outmv; - master.m_bestME[list].mvp = mvp; - master.m_bestME[list].mvpIdx = mvpIdx; - master.m_bestME[list].ref = ref; - master.m_bestME[list].cost = cost; - 
master.m_bestME[list].bits = bits; + bestME[list].mv = outmv; + bestME[list].mvp = mvp; + bestME[list].mvpIdx = mvpIdx; + bestME[list].ref = ref; + bestME[list].cost = cost; + bestME[list].bits = bits; } } /* search of the best candidate for inter prediction * returns true if predYuv was filled with a motion compensated prediction */ -bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChroma) +bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChromaSA8D) { CUData& cu = interMode.cu; Yuv* predYuv = &interMode.predYuv; - MV amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS]; MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1]; const Slice *slice = m_slice; - PicYuv* fencPic = m_frame->m_origPicYuv; int numPart = cu.getNumPartInter(); int numPredDir = slice->isInterP() ? 1 : 2; const int* numRefIdx = slice->m_numRefIdx; @@ -1727,23 +1909,24 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO for (int puIdx = 0; puIdx < numPart; puIdx++) { + MotionData* bestME = interMode.bestME[puIdx]; + /* sets m_puAbsPartIdx, m_puWidth, m_puHeight */ initMotionCompensation(cu, cuGeom, puIdx); - pixel* pu = fencPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx); - m_me.setSourcePU(pu - fencPic->m_picOrg[0], m_puWidth, m_puHeight); + m_me.setSourcePU(*interMode.fencYuv, cu.m_cuAddr, cuGeom.encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight); uint32_t mrgCost = MAX_UINT; - /* find best cost merge candidate */ - if (cu.m_partSize[m_puAbsPartIdx] != SIZE_2Nx2N) + /* find best cost merge candidate. 
note: 2Nx2N merge and bidir are handled as separate modes */ + if (cu.m_partSize[0] != SIZE_2Nx2N) { merge.absPartIdx = m_puAbsPartIdx; merge.width = m_puWidth; merge.height = m_puHeight; mrgCost = mergeEstimation(cu, cuGeom, puIdx, merge); - if (bMergeOnly && cu.m_log2CUSize[0] > 3) + if (bMergeOnly) { if (mrgCost == MAX_UINT) { @@ -1762,33 +1945,88 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO totalmebits += merge.bits; prepMotionCompensation(cu, cuGeom, puIdx); - motionCompensation(*predYuv, true, bChroma); + motionCompensation(*predYuv, true, bChromaSA8D); continue; } } - MotionData bidir[2]; - uint32_t bidirCost = MAX_UINT; - int bidirBits = 0; - - m_bestME[0].cost = MAX_UINT; - m_bestME[1].cost = MAX_UINT; + bestME[0].cost = MAX_UINT; + bestME[1].cost = MAX_UINT; getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits); - if (bDistributed) + /* Uni-directional prediction */ + if (m_param->analysisMode == X265_ANALYSIS_LOAD && bestME[0].ref >= 0) { - m_curMECu = &cu; - m_curGeom = &cuGeom; + for (int l = 0; l < numPredDir; l++) + { + int ref = bestME[l].ref; + uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS; + bits += getTUBits(ref, numRefIdx[l]); + + int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc); + + // Pick the best possible MVP from AMVP candidates based on least residual + int mvpIdx = 0; + int merange = m_param->searchRange; + + if (interMode.amvpCand[l][ref][0] != interMode.amvpCand[l][ref][1]) + { + uint32_t bestCost = MAX_INT; + for (int i = 0; i < AMVP_NUM_CANDS; i++) + { + MV mvCand = interMode.amvpCand[l][ref][i]; + + // NOTE: skip mvCand if Y is > merange and -FN>1 + if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4)) + continue; + + cu.clipMv(mvCand); + predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand); + uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size); - /* 
this worker might already be enqueued for pmode, so other threads - * might be looking at the ME job counts at any time, do these sets - * in a safe order */ + if (bestCost > cost) + { + bestCost = cost; + mvpIdx = i; + } + } + } + + MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[l][ref][mvpIdx]; + + int satdCost; + setSearchRange(cu, mvp, merange, mvmin, mvmax); + satdCost = m_me.motionEstimate(&slice->m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv); + + /* Get total cost of partition, but only include MV bit cost once */ + bits += m_me.bitcost(outmv); + uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits); + + /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */ + checkBestMVP(interMode.amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost); + + if (cost < bestME[l].cost) + { + bestME[l].mv = outmv; + bestME[l].mvp = mvp; + bestME[l].mvpIdx = mvpIdx; + bestME[l].cost = cost; + bestME[l].bits = bits; + } + } + } + else if (bDistributed) + { + m_meLock.acquire(); + m_curInterMode = &interMode; + m_curGeom = &cuGeom; m_curPart = puIdx; m_totalNumME = 0; m_numAcquiredME = 1; m_numCompletedME = 0; m_totalNumME = numRefIdx[0] + numRefIdx[1]; + m_meLock.release(); if (!m_bJobsQueued) JobProvider::enqueue(); @@ -1796,34 +2034,43 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO for (int i = 1; i < m_totalNumME; i++) m_pool->pokeIdleThread(); - while (m_totalNumME > m_numAcquiredME) + do { - int id = ATOMIC_INC(&m_numAcquiredME); - if (m_totalNumME >= id) + m_meLock.acquire(); + if (m_totalNumME > m_numAcquiredME) { - id -= 1; + int id = m_numAcquiredME++; + m_meLock.release(); + if (id < numRefIdx[0]) - singleMotionEstimation(*this, cu, cuGeom, puIdx, 0, id); + singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, id); else - singleMotionEstimation(*this, cu, cuGeom, puIdx, 1, id - numRefIdx[0]); + singleMotionEstimation(*this, interMode, cuGeom, puIdx, 1, id - numRefIdx[0]); - if 
(ATOMIC_INC(&m_numCompletedME) == m_totalNumME) - m_meCompletionEvent.trigger(); + m_meLock.acquire(); + m_numCompletedME++; + m_meLock.release(); } + else + m_meLock.release(); } + while (m_totalNumME > m_numAcquiredME); + if (!m_bJobsQueued) JobProvider::dequeue(); /* we saved L0-0 for ourselves */ - singleMotionEstimation(*this, cu, cuGeom, puIdx, 0, 0); - if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME) + singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, 0); + + m_meLock.acquire(); + if (++m_numCompletedME == m_totalNumME) m_meCompletionEvent.trigger(); + m_meLock.release(); m_meCompletionEvent.wait(); } else { - // Uni-directional prediction for (int l = 0; l < numPredDir; l++) { for (int ref = 0; ref < numRefIdx[l]; ref++) @@ -1831,33 +2078,36 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS; bits += getTUBits(ref, numRefIdx[l]); - int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, amvpCand[l][ref], mvc); + int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc); // Pick the best possible MVP from AMVP candidates based on least residual - uint32_t bestCost = MAX_INT; int mvpIdx = 0; int merange = m_param->searchRange; - for (int i = 0; i < AMVP_NUM_CANDS; i++) + if (interMode.amvpCand[l][ref][0] != interMode.amvpCand[l][ref][1]) { - MV mvCand = amvpCand[l][ref][i]; + uint32_t bestCost = MAX_INT; + for (int i = 0; i < AMVP_NUM_CANDS; i++) + { + MV mvCand = interMode.amvpCand[l][ref][i]; - // NOTE: skip mvCand if Y is > merange and -FN>1 - if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4)) - continue; + // NOTE: skip mvCand if Y is > merange and -FN>1 + if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4)) + continue; - cu.clipMv(mvCand); - predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPicYuv, mvCand); - uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size); + 
cu.clipMv(mvCand); + predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand); + uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size); - if (bestCost > cost) - { - bestCost = cost; - mvpIdx = i; + if (bestCost > cost) + { + bestCost = cost; + mvpIdx = i; + } } } - MV mvmin, mvmax, outmv, mvp = amvpCand[l][ref][mvpIdx]; + MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[l][ref][mvpIdx]; setSearchRange(cu, mvp, merange, mvmin, mvmax); int satdCost = m_me.motionEstimate(&slice->m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv); @@ -1867,45 +2117,67 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits); /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */ - checkBestMVP(amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost); + checkBestMVP(interMode.amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost); - if (cost < m_bestME[l].cost) + if (cost < bestME[l].cost) { - m_bestME[l].mv = outmv; - m_bestME[l].mvp = mvp; - m_bestME[l].mvpIdx = mvpIdx; - m_bestME[l].ref = ref; - m_bestME[l].cost = cost; - m_bestME[l].bits = bits; + bestME[l].mv = outmv; + bestME[l].mvp = mvp; + bestME[l].mvpIdx = mvpIdx; + bestME[l].ref = ref; + bestME[l].cost = cost; + bestME[l].bits = bits; } } } } /* Bi-directional prediction */ - if (slice->isInterB() && !cu.isBipredRestriction() && m_bestME[0].cost != MAX_UINT && m_bestME[1].cost != MAX_UINT) + MotionData bidir[2]; + uint32_t bidirCost = MAX_UINT; + int bidirBits = 0; + + if (slice->isInterB() && !cu.isBipredRestriction() && /* biprediction is possible for this PU */ + cu.m_partSize[m_puAbsPartIdx] != SIZE_2Nx2N && /* 2Nx2N biprediction is handled elsewhere */ + bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT) { - bidir[0] = m_bestME[0]; - bidir[1] = m_bestME[1]; + bidir[0] = bestME[0]; + bidir[1] = bestME[1]; + + int satdCost; - /* Generate reference subpels 
*/ - PicYuv* refPic0 = slice->m_refPicList[0][m_bestME[0].ref]->m_reconPicYuv; - PicYuv* refPic1 = slice->m_refPicList[1][m_bestME[1].ref]->m_reconPicYuv; - Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv; - predInterLumaPixel(bidirYuv[0], *refPic0, m_bestME[0].mv); - predInterLumaPixel(bidirYuv[1], *refPic1, m_bestME[1].mv); + if (m_me.bChromaSATD) + { + cu.m_mv[0][m_puAbsPartIdx] = bidir[0].mv; + cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref; + cu.m_mv[1][m_puAbsPartIdx] = bidir[1].mv; + cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref; - pixel *pred0 = bidirYuv[0].getLumaAddr(m_puAbsPartIdx); - pixel *pred1 = bidirYuv[1].getLumaAddr(m_puAbsPartIdx); + prepMotionCompensation(cu, cuGeom, puIdx); + motionCompensation(tmpPredYuv, true, true); - int partEnum = partitionFromSizes(m_puWidth, m_puHeight); - primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, pred0, bidirYuv[0].m_size, pred1, bidirYuv[1].m_size, 32); - int satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size); + satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) + + m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx); + } + else + { + PicYuv* refPic0 = slice->m_refPicList[0][bestME[0].ref]->m_reconPic; + PicYuv* refPic1 = slice->m_refPicList[1][bestME[1].ref]->m_reconPic; + Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv; - bidirBits = m_bestME[0].bits + m_bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]); + /* Generate reference subpels */ + predInterLumaPixel(bidirYuv[0], *refPic0, bestME[0].mv); + predInterLumaPixel(bidirYuv[1], *refPic1, bestME[1].mv); + + primitives.pixelavg_pp[m_me.partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(m_puAbsPartIdx), bidirYuv[0].m_size, + bidirYuv[1].getLumaAddr(m_puAbsPartIdx), bidirYuv[1].m_size, 32); + satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size); + } + + bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - 
(m_listSelBits[0] + m_listSelBits[1]); bidirCost = satdCost + m_rdCost.getCost(bidirBits); - bool bTryZero = m_bestME[0].mv.notZero() || m_bestME[1].mv.notZero(); + bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero(); if (bTryZero) { /* Do not try zero MV if unidir motion predictors are beyond @@ -1917,38 +2189,48 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO mvmin <<= 2; mvmax <<= 2; - bTryZero &= m_bestME[0].mvp.checkRange(mvmin, mvmax); - bTryZero &= m_bestME[1].mvp.checkRange(mvmin, mvmax); + bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax); + bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax); } if (bTryZero) { - // coincident blocks of the two reference pictures - pixel *ref0 = slice->m_mref[0][m_bestME[0].ref].fpelPlane + (pu - fencPic->m_picOrg[0]); - pixel *ref1 = slice->m_mref[1][m_bestME[1].ref].fpelPlane + (pu - fencPic->m_picOrg[0]); - intptr_t refStride = slice->m_mref[0][0].lumaStride; + /* coincident blocks of the two reference pictures */ + if (m_me.bChromaSATD) + { + cu.m_mv[0][m_puAbsPartIdx] = mvzero; + cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref; + cu.m_mv[1][m_puAbsPartIdx] = mvzero; + cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref; - primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32); - satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size); + prepMotionCompensation(cu, cuGeom, puIdx); + motionCompensation(tmpPredYuv, true, true); - MV mvp0 = m_bestME[0].mvp; - int mvpIdx0 = m_bestME[0].mvpIdx; - uint32_t bits0 = m_bestME[0].bits - m_me.bitcost(m_bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0); + satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) + + m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx); + } + else + { + const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx); + const pixel* ref1 = 
m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx); + intptr_t refStride = slice->m_mref[0][0].lumaStride; - MV mvp1 = m_bestME[1].mvp; - int mvpIdx1 = m_bestME[1].mvpIdx; - uint32_t bits1 = m_bestME[1].bits - m_me.bitcost(m_bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1); + primitives.pixelavg_pp[m_me.partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32); + satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size); + } - uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1); + MV mvp0 = bestME[0].mvp; + int mvpIdx0 = bestME[0].mvpIdx; + uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0); - if (bDistributed) - { - cu.fillMvpCand(puIdx, m_puAbsPartIdx, 0, m_bestME[0].ref, amvpCand[0][m_bestME[0].ref], mvc); - cu.fillMvpCand(puIdx, m_puAbsPartIdx, 1, m_bestME[1].ref, amvpCand[1][m_bestME[1].ref], mvc); - } + MV mvp1 = bestME[1].mvp; + int mvpIdx1 = bestME[1].mvpIdx; + uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1); + + uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1); /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */ - checkBestMVP(amvpCand[0][m_bestME[0].ref], mvzero, mvp0, mvpIdx0, bits0, cost); - checkBestMVP(amvpCand[1][m_bestME[1].ref], mvzero, mvp1, mvpIdx1, bits1, cost); + checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvp0, mvpIdx0, bits0, cost); + checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvp1, mvpIdx1, bits1, cost); if (cost < bidirCost) { @@ -1965,7 +2247,7 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO } /* select best option and store into CU */ - if (mrgCost < bidirCost && mrgCost < m_bestME[0].cost && mrgCost < m_bestME[1].cost) + if (mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost) { 
cu.m_mergeFlag[m_puAbsPartIdx] = true; cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx @@ -1977,39 +2259,39 @@ bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeO totalmebits += merge.bits; } - else if (bidirCost < m_bestME[0].cost && bidirCost < m_bestME[1].cost) + else if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost) { lastMode = 2; cu.m_mergeFlag[m_puAbsPartIdx] = false; cu.setPUInterDir(3, m_puAbsPartIdx, puIdx); cu.setPUMv(0, bidir[0].mv, m_puAbsPartIdx, puIdx); - cu.setPURefIdx(0, m_bestME[0].ref, m_puAbsPartIdx, puIdx); + cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx); cu.m_mvd[0][m_puAbsPartIdx] = bidir[0].mv - bidir[0].mvp; cu.m_mvpIdx[0][m_puAbsPartIdx] = bidir[0].mvpIdx; cu.setPUMv(1, bidir[1].mv, m_puAbsPartIdx, puIdx); - cu.setPURefIdx(1, m_bestME[1].ref, m_puAbsPartIdx, puIdx); + cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx); cu.m_mvd[1][m_puAbsPartIdx] = bidir[1].mv - bidir[1].mvp; cu.m_mvpIdx[1][m_puAbsPartIdx] = bidir[1].mvpIdx; totalmebits += bidirBits; } - else if (m_bestME[0].cost <= m_bestME[1].cost) + else if (bestME[0].cost <= bestME[1].cost) { lastMode = 0; cu.m_mergeFlag[m_puAbsPartIdx] = false; cu.setPUInterDir(1, m_puAbsPartIdx, puIdx); - cu.setPUMv(0, m_bestME[0].mv, m_puAbsPartIdx, puIdx); - cu.setPURefIdx(0, m_bestME[0].ref, m_puAbsPartIdx, puIdx); - cu.m_mvd[0][m_puAbsPartIdx] = m_bestME[0].mv - m_bestME[0].mvp; - cu.m_mvpIdx[0][m_puAbsPartIdx] = m_bestME[0].mvpIdx; + cu.setPUMv(0, bestME[0].mv, m_puAbsPartIdx, puIdx); + cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx); + cu.m_mvd[0][m_puAbsPartIdx] = bestME[0].mv - bestME[0].mvp; + cu.m_mvpIdx[0][m_puAbsPartIdx] = bestME[0].mvpIdx; cu.setPURefIdx(1, REF_NOT_VALID, m_puAbsPartIdx, puIdx); cu.setPUMv(1, mvzero, m_puAbsPartIdx, puIdx); - totalmebits += m_bestME[0].bits; + totalmebits += bestME[0].bits; } else { @@ -2017,19 +2299,19 @@ bool Search::predInterSearch(Mode& 
interMode, const CUGeom& cuGeom, bool bMergeO cu.m_mergeFlag[m_puAbsPartIdx] = false; cu.setPUInterDir(2, m_puAbsPartIdx, puIdx); - cu.setPUMv(1, m_bestME[1].mv, m_puAbsPartIdx, puIdx); - cu.setPURefIdx(1, m_bestME[1].ref, m_puAbsPartIdx, puIdx); - cu.m_mvd[1][m_puAbsPartIdx] = m_bestME[1].mv - m_bestME[1].mvp; - cu.m_mvpIdx[1][m_puAbsPartIdx] = m_bestME[1].mvpIdx; + cu.setPUMv(1, bestME[1].mv, m_puAbsPartIdx, puIdx); + cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx); + cu.m_mvd[1][m_puAbsPartIdx] = bestME[1].mv - bestME[1].mvp; + cu.m_mvpIdx[1][m_puAbsPartIdx] = bestME[1].mvpIdx; cu.setPURefIdx(0, REF_NOT_VALID, m_puAbsPartIdx, puIdx); cu.setPUMv(0, mvzero, m_puAbsPartIdx, puIdx); - totalmebits += m_bestME[1].bits; + totalmebits += bestME[1].bits; } prepMotionCompensation(cu, cuGeom, puIdx); - motionCompensation(*predYuv, true, bChroma); + motionCompensation(*predYuv, true, bChromaSA8D); } interMode.sa8dBits += totalmebits; @@ -2147,7 +2429,7 @@ void Search::encodeResAndCalcRdSkipCU(Mode& interMode) // No residual coding : SKIP mode - cu.setSkipFlagSubParts(true); + cu.setPredModeSubParts(MODE_SKIP); cu.clearCbf(); cu.setTUDepthSubParts(0, 0, depth); @@ -2158,8 +2440,8 @@ void Search::encodeResAndCalcRdSkipCU(Mode& interMode) interMode.distortion = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); // Chroma part = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift); - interMode.distortion += m_rdCost.scaleChromaDistCb(primitives.sse_pp[part](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize)); - interMode.distortion += m_rdCost.scaleChromaDistCr(primitives.sse_pp[part](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize)); + interMode.distortion += m_rdCost.scaleChromaDist(1, primitives.sse_pp[part](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize)); + interMode.distortion += m_rdCost.scaleChromaDist(2, 
primitives.sse_pp[part](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize)); m_entropyCoder.load(m_rqt[depth].cur); m_entropyCoder.resetBits(); @@ -2212,8 +2494,8 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom) if (!cu.m_tqBypass[0]) { uint32_t cbf0Dist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size); - cbf0Dist += m_rdCost.scaleChromaDistCb(primitives.sse_pp[cpart](fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize)); - cbf0Dist += m_rdCost.scaleChromaDistCr(primitives.sse_pp[cpart](fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize)); + cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.sse_pp[cpart](fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize)); + cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.sse_pp[cpart](fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize)); /* Consider the RD cost of not signaling any residual */ m_entropyCoder.load(m_rqt[depth].cur); @@ -2247,7 +2529,7 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom) uint32_t coeffBits, bits; if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0)) { - cu.setSkipFlagSubParts(true); + cu.setPredModeSubParts(MODE_SKIP); /* Merge/Skip */ m_entropyCoder.resetBits(); @@ -2270,7 +2552,7 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom) uint32_t mvBits = m_entropyCoder.getNumberOfWrittenBits(); bool bCodeDQP = m_slice->m_pps->bUseDQP; - m_entropyCoder.codeCoeff(cu, 0, cu.m_cuDepth[0], bCodeDQP, tuDepthRange); + m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange); bits = m_entropyCoder.getNumberOfWrittenBits(); coeffBits = bits - mvBits; @@ -2285,8 +2567,8 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom) // update with clipped distortion and cost (qp estimation loop uses unclipped values) 
uint32_t bestDist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); - bestDist += m_rdCost.scaleChromaDistCb(primitives.sse_pp[cpart](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize)); - bestDist += m_rdCost.scaleChromaDistCr(primitives.sse_pp[cpart](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize)); + bestDist += m_rdCost.scaleChromaDist(1, primitives.sse_pp[cpart](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize)); + bestDist += m_rdCost.scaleChromaDist(2, primitives.sse_pp[cpart](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize)); if (m_rdCost.m_psyRd) interMode.psyEnergy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); @@ -2297,41 +2579,7 @@ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom) updateModeCost(interMode); } -void Search::generateCoeffRecon(Mode& mode, const CUGeom& cuGeom) -{ - CUData& cu = mode.cu; - - m_quant.setQPforQuant(mode.cu); - - if (cu.m_predMode[0] == MODE_INTER) - { - uint32_t tuDepthRange[2]; - cu.getInterTUQtDepthRange(tuDepthRange, 0); - - residualTransformQuantInter(mode, cuGeom, 0, cu.m_cuDepth[0], tuDepthRange); - if (cu.getQtRootCbf(0)) - mode.reconYuv.addClip(mode.predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]); - else - { - mode.reconYuv.copyFromYuv(mode.predYuv); - if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N) - cu.setSkipFlagSubParts(true); - } - } - else if (cu.m_predMode[0] == MODE_INTRA) - { - uint32_t tuDepthRange[2]; - cu.getIntraTUQtDepthRange(tuDepthRange, 0); - - uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN; - residualTransformQuantIntra(mode, cuGeom, initTrDepth, 0, tuDepthRange); - getBestIntraModeChroma(mode, cuGeom); - residualQTIntraChroma(mode, cuGeom, 0, 0); - mode.reconYuv.copyFromPicYuv(*m_frame->m_reconPicYuv, cu.m_cuAddr, 
cuGeom.encodeIdx); // TODO: - } -} - -void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, uint32_t depthRange[2]) +void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, const uint32_t depthRange[2]) { CUData& cu = mode.cu; X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "invalid depth\n"); @@ -2340,7 +2588,7 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3 uint32_t tuDepth = depth - cu.m_cuDepth[0]; bool bCheckFull = log2TrSize <= depthRange[1]; - if (cu.m_partSize[absPartIdx] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && log2TrSize > depthRange[0]) + if (cu.m_partSize[0] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && log2TrSize > depthRange[0]) bCheckFull = false; if (bCheckFull) @@ -2349,13 +2597,12 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3 uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; bool bCodeChroma = true; uint32_t tuDepthC = tuDepth; - if (log2TrSizeC == 1) + if (log2TrSizeC < 2) { - X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444, "tuQuad check failed\n"); - log2TrSizeC++; + X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); + log2TrSizeC = 2; tuDepthC--; - uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1); - bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0); + bCodeChroma = !(absPartIdx & 3); } uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1); @@ -2372,10 +2619,10 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3 ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv; const Yuv* fencYuv = mode.fencYuv; - int16_t *curResiY = resiYuv.getLumaAddr(absPartIdx); + int16_t* curResiY = resiYuv.getLumaAddr(absPartIdx); uint32_t strideResiY = resiYuv.m_size; - pixel *fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx)); + const pixel* fenc
= fencYuv->getLumaAddr(absPartIdx); uint32_t numSigY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false); if (numSigY) @@ -2409,7 +2656,7 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3 cu.setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep); int16_t* curResiU = resiYuv.getCbAddr(absPartIdxC); - pixel* fencCb = const_cast<pixel*>(fencYuv->getCbAddr(absPartIdxC)); + const pixel* fencCb = fencYuv->getCbAddr(absPartIdxC); uint32_t numSigU = m_quant.transformNxN(cu, fencCb, fencYuv->m_csize, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false); if (numSigU) { @@ -2423,7 +2670,7 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3 } int16_t* curResiV = resiYuv.getCrAddr(absPartIdxC); - pixel* fencCr = const_cast<pixel*>(fencYuv->getCrAddr(absPartIdxC)); + const pixel* fencCr = fencYuv->getCrAddr(absPartIdxC); uint32_t numSigV = m_quant.transformNxN(cu, fencCr, fencYuv->m_csize, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false); if (numSigV) { @@ -2449,16 +2696,16 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3 { X265_CHECK(log2TrSize > depthRange[0], "residualTransformQuantInter recursion check failure\n"); - const uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1); + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; uint32_t ycbf = 0, ucbf = 0, vcbf = 0; - for (uint32_t i = 0; i < 4; i++) + for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts) { - residualTransformQuantInter(mode, cuGeom, absPartIdx + i * qPartNumSubdiv, depth + 1, depthRange); - ycbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_LUMA, tuDepth + 1); - ucbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_U, tuDepth + 1); - vcbf |=
cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_V, tuDepth + 1); + residualTransformQuantInter(mode, cuGeom, qPartIdx, depth + 1, depthRange); + ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1); + ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1); + vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1); } - for (uint32_t i = 0; i < 4 * qPartNumSubdiv; i++) + for (uint32_t i = 0; i < 4 * qNumParts; i++) { cu.m_cbf[TEXT_LUMA][absPartIdx + i] |= ycbf << tuDepth; cu.m_cbf[TEXT_CHROMA_U][absPartIdx + i] |= ucbf << tuDepth; @@ -2467,15 +2714,26 @@ void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint3 } } -void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& outCosts, uint32_t depthRange[2]) +uint64_t Search::estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId) +{ + uint32_t nullBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth); + + if (m_rdCost.m_psyRd) + return m_rdCost.calcPsyRdCost(dist, nullBits, psyEnergy); + else + return m_rdCost.calcRdCost(dist, nullBits); +} + +void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2]) { CUData& cu = mode.cu; uint32_t log2TrSize = g_maxLog2CUSize - depth; bool bCheckSplit = log2TrSize > depthRange[0]; bool bCheckFull = log2TrSize <= depthRange[1]; + bool bSplitPresentFlag = bCheckSplit && bCheckFull; - if (cu.m_partSize[absPartIdx] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && bCheckSplit) + if (cu.m_partSize[0] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && bCheckSplit) bCheckFull = false; X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n"); @@ -2485,12 +2743,12 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; bool bCodeChroma = 
true; uint32_t tuDepthC = tuDepth; - if ((log2TrSize == 2) && !(m_csp == X265_CSP_I444)) + if (log2TrSizeC < 2) { - log2TrSizeC++; + X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); + log2TrSizeC = 2; tuDepthC--; - uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1); - bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0); + bCodeChroma = !(absPartIdx & 3); } // code full block @@ -2499,9 +2757,9 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa uint8_t cbfFlag[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} }; uint32_t numSig[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} }; - uint32_t singleBitsComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; - uint32_t singleDistComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; - uint32_t singlePsyEnergyComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; + uint32_t singleBits[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; + uint32_t singleDist[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; + uint32_t singlePsyEnergy[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } }; uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, 
MAX_INT64}, {MAX_INT64, MAX_INT64} }; @@ -2532,57 +2790,25 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa if (m_bEnableRDOQ) m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true); - pixel *fenc = const_cast(fencYuv->getLumaAddr(absPartIdx)); - int16_t *resi = resiYuv.getLumaAddr(absPartIdx); + const pixel* fenc = fencYuv->getLumaAddr(absPartIdx); + int16_t* resi = resiYuv.getLumaAddr(absPartIdx); numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false); cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0]; m_entropyCoder.resetBits(); - m_entropyCoder.codeQtCbf(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth); - if (cbfFlag[TEXT_LUMA][0]) - m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA); - singleBitsComp[TEXT_LUMA][0] = m_entropyCoder.getNumberOfWrittenBits(); - - uint32_t singleBitsPrev = singleBitsComp[TEXT_LUMA][0]; - - if (bCodeChroma) - { - uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); - for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) - { - coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC; - TURecurse tuIterator(splitIntoSubTUs ? 
VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx); - - do - { - uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; - uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); - - cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); - if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V)) - m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false); - - fenc = const_cast(fencYuv->getChromaAddr(chromaId, absPartIdxC)); - resi = resiYuv.getChromaAddr(chromaId, absPartIdxC); - numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false); - cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section]; - - m_entropyCoder.codeQtCbf(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth); - if (cbfFlag[chromaId][tuIterator.section]) - m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId); - - uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits(); - singleBitsComp[chromaId][tuIterator.section] = newBits - singleBitsPrev; + if (bSplitPresentFlag && log2TrSize > depthRange[0]) + m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize); + fullCost.bits = m_entropyCoder.getNumberOfWrittenBits(); - singleBitsPrev = newBits; - } - while (tuIterator.isNextSection()); - } - } + // Coding luma cbf flag has been removed from here. The context for cbf flag is different for each depth. + // So it is valid if we encode coefficients and then cbfs at least for analysis. 
+// m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth); + if (cbfFlag[TEXT_LUMA][0]) + m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA); - const uint32_t numCoeffY = 1 << (log2TrSize * 2); - const uint32_t numCoeffC = 1 << (log2TrSizeC * 2); + uint32_t singleBitsPrev = m_entropyCoder.getNumberOfWrittenBits(); + singleBits[TEXT_LUMA][0] = singleBitsPrev - fullCost.bits; X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n"); uint32_t distY = primitives.ssd_s[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size); @@ -2590,156 +2816,168 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa if (m_rdCost.m_psyRd) psyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, (int16_t*)zeroShort, 0); - int16_t *curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx); + int16_t* curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx); uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size; if (cbfFlag[TEXT_LUMA][0]) { m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSig[TEXT_LUMA][0]); //this is for inter mode only + // non-zero cost calculation for luma - This is an approximation + // finally we have to encode correct cbf after comparing with null cost const uint32_t nonZeroDistY = primitives.sse_ss[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY); - uint32_t nonZeroPsyEnergyY = 0; + uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth); + uint32_t nonZeroPsyEnergyY = 0; uint64_t singleCostY = 0; if (m_rdCost.m_psyRd) + { nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY); + singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroPsyEnergyY); + } + else + singleCostY = m_rdCost.calcRdCost(nonZeroDistY, 
nzCbfBitsY + singleBits[TEXT_LUMA][0]); if (cu.m_tqBypass[0]) { - distY = nonZeroDistY; - psyEnergyY = nonZeroPsyEnergyY; + singleDist[TEXT_LUMA][0] = nonZeroDistY; + singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY; } else { - uint64_t singleCostY = 0; - if (m_rdCost.m_psyRd) - singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0], nonZeroPsyEnergyY); - else - singleCostY = m_rdCost.calcRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0]); - m_entropyCoder.resetBits(); - m_entropyCoder.codeQtCbfZero(TEXT_LUMA, tuDepth); - const uint32_t nullBitsY = m_entropyCoder.getNumberOfWrittenBits(); - uint64_t nullCostY = 0; - if (m_rdCost.m_psyRd) - nullCostY = m_rdCost.calcPsyRdCost(distY, nullBitsY, psyEnergyY); - else - nullCostY = m_rdCost.calcRdCost(distY, nullBitsY); + // zero-cost calculation for luma. This is an approximation + // Initial cost calculation was also an approximation. First resetting the bit counter and then encoding zero cbf. + // Now encoding the zero cbf without writing into bitstream, keeping m_fracBits unchanged. The same is valid for chroma. 
+ uint64_t nullCostY = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA); + if (nullCostY < singleCostY) { cbfFlag[TEXT_LUMA][0] = 0; + singleBits[TEXT_LUMA][0] = 0; + primitives.blockfill_s[partSize](curResiY, strideResiY, 0); #if CHECKED_BUILD || _DEBUG + uint32_t numCoeffY = 1 << (log2TrSize << 1); memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY); #endif if (checkTransformSkipY) minCost[TEXT_LUMA][0] = nullCostY; + singleDist[TEXT_LUMA][0] = distY; + singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY; } else { - distY = nonZeroDistY; - psyEnergyY = nonZeroPsyEnergyY; if (checkTransformSkipY) minCost[TEXT_LUMA][0] = singleCostY; + singleDist[TEXT_LUMA][0] = nonZeroDistY; + singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY; } } } - else if (checkTransformSkipY) + else { - m_entropyCoder.resetBits(); - m_entropyCoder.codeQtCbfZero(TEXT_LUMA, tuDepth); - const uint32_t nullBitsY = m_entropyCoder.getNumberOfWrittenBits(); - if (m_rdCost.m_psyRd) - minCost[TEXT_LUMA][0] = m_rdCost.calcPsyRdCost(distY, nullBitsY, psyEnergyY); - else - minCost[TEXT_LUMA][0] = m_rdCost.calcRdCost(distY, nullBitsY); + if (checkTransformSkipY) + minCost[TEXT_LUMA][0] = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA); + primitives.blockfill_s[partSize](curResiY, strideResiY, 0); + singleDist[TEXT_LUMA][0] = distY; + singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY; } - singleDistComp[TEXT_LUMA][0] = distY; - singlePsyEnergyComp[TEXT_LUMA][0] = psyEnergyY; - if (!cbfFlag[TEXT_LUMA][0]) - primitives.blockfill_s[partSize](curResiY, strideResiY, 0); cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth); if (bCodeChroma) { - uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize; uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); + uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize; for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) { uint32_t distC = 0, psyEnergyC = 0; coeff_t* coeffCurC = 
m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC; TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx); - do - { - uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; - uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); + do + { + uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; + uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); - int16_t *curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC); + cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); - distC = m_rdCost.scaleChromaDistCb(primitives.ssd_s[log2TrSizeC - 2](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize)); + if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V)) + m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false); - if (cbfFlag[chromaId][tuIterator.section]) - { - m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiC, strideResiC, coeffCurC + subTUOffset, - log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]); - uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC); - const uint32_t nonZeroDistC = m_rdCost.scaleChromaDistCb(dist); - uint32_t nonZeroPsyEnergyC = 0; - if (m_rdCost.m_psyRd) - nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC); - - if (cu.m_tqBypass[0]) - { - distC = nonZeroDistC; - psyEnergyC = nonZeroPsyEnergyC; - } - else + fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC); + resi = resiYuv.getChromaAddr(chromaId, absPartIdxC); + numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false); + cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section]; + + //Coding cbf flags has been 
removed from here +// m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][tuIterator.section], tuDepth); + if (cbfFlag[chromaId][tuIterator.section]) + m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId); + uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits(); + singleBits[chromaId][tuIterator.section] = newBits - singleBitsPrev; + singleBitsPrev = newBits; + + int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC); + distC = m_rdCost.scaleChromaDist(chromaId, primitives.ssd_s[log2TrSizeC - 2](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize)); + + if (cbfFlag[chromaId][tuIterator.section]) { - uint64_t singleCostC = 0; + m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiC, strideResiC, coeffCurC + subTUOffset, + log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]); + + // non-zero cost calculation for luma, same as luma - This is an approximation + // finally we have to encode correct cbf after comparing with null cost + uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC); + uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth); + uint32_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist); + uint32_t nonZeroPsyEnergyC = 0; uint64_t singleCostC = 0; if (m_rdCost.m_psyRd) - singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section], nonZeroPsyEnergyC); + { + nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC); + singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC); + } else - singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section]); - 
m_entropyCoder.resetBits(); - m_entropyCoder.codeQtCbfZero((TextType)chromaId, tuDepth); - const uint32_t nullBitsC = m_entropyCoder.getNumberOfWrittenBits(); - uint64_t nullCostC = 0; - if (m_rdCost.m_psyRd) - nullCostC = m_rdCost.calcPsyRdCost(distC, nullBitsC, psyEnergyC); + singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]); + + if (cu.m_tqBypass[0]) + { + singleDist[chromaId][tuIterator.section] = nonZeroDistC; + singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC; + } else - nullCostC = m_rdCost.calcRdCost(distC, nullBitsC); - if (nullCostC < singleCostC) { - cbfFlag[chromaId][tuIterator.section] = 0; + //zero-cost calculation for chroma. This is an approximation + uint64_t nullCostC = estimateNullCbfCost(distC, psyEnergyC, tuDepth, (TextType)chromaId); + + if (nullCostC < singleCostC) + { + cbfFlag[chromaId][tuIterator.section] = 0; + singleBits[chromaId][tuIterator.section] = 0; + primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0); #if CHECKED_BUILD || _DEBUG + uint32_t numCoeffC = 1 << (log2TrSizeC << 1); memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC); #endif if (checkTransformSkipC) minCost[chromaId][tuIterator.section] = nullCostC; + singleDist[chromaId][tuIterator.section] = distC; + singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC; } else { - distC = nonZeroDistC; - psyEnergyC = nonZeroPsyEnergyC; if (checkTransformSkipC) minCost[chromaId][tuIterator.section] = singleCostC; + singleDist[chromaId][tuIterator.section] = nonZeroDistC; + singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC; } } } - else if (checkTransformSkipC) + else { - m_entropyCoder.resetBits(); - m_entropyCoder.codeQtCbfZero((TextType)chromaId, tuDepthC); - const uint32_t nullBitsC = m_entropyCoder.getNumberOfWrittenBits(); - if (m_rdCost.m_psyRd) - minCost[chromaId][tuIterator.section] = m_rdCost.calcPsyRdCost(distC, nullBitsC, psyEnergyC); - else - 
minCost[chromaId][tuIterator.section] = m_rdCost.calcRdCost(distC, nullBitsC); - } - - singleDistComp[chromaId][tuIterator.section] = distC; - singlePsyEnergyComp[chromaId][tuIterator.section] = psyEnergyC; - - if (!cbfFlag[chromaId][tuIterator.section]) + if (checkTransformSkipC) + minCost[chromaId][tuIterator.section] = estimateNullCbfCost(distC, psyEnergyC, tuDepthC, (TextType)chromaId); primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0); + singleDist[chromaId][tuIterator.section] = distC; + singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC; + } cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); } @@ -2763,14 +3001,14 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa if (m_bEnableRDOQ) m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true); - fenc = const_cast(fencYuv->getLumaAddr(absPartIdx)); + fenc = fencYuv->getLumaAddr(absPartIdx); resi = resiYuv.getLumaAddr(absPartIdx); uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, tsCoeffY, log2TrSize, TEXT_LUMA, absPartIdx, true); if (numSigTSkipY) { m_entropyCoder.resetBits(); - m_entropyCoder.codeQtCbf(!!numSigTSkipY, TEXT_LUMA, tuDepth); + m_entropyCoder.codeQtCbfLuma(!!numSigTSkipY, tuDepth); m_entropyCoder.codeCoeffNxN(cu, tsCoeffY, absPartIdx, log2TrSize, TEXT_LUMA); const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits(); @@ -2791,12 +3029,13 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth); else { - singleDistComp[TEXT_LUMA][0] = nonZeroDistY; - singlePsyEnergyComp[TEXT_LUMA][0] = nonZeroPsyEnergyY; + singleDist[TEXT_LUMA][0] = nonZeroDistY; + singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY; cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY; bestTransformMode[TEXT_LUMA][0] = 1; + uint32_t numCoeffY = 1 << (log2TrSize << 
1); memcpy(coeffCurY, tsCoeffY, sizeof(coeff_t) * numCoeffY); - primitives.square_copy_ss[partSize](curResiY, strideResiY, tsResiY, trSize); + primitives.luma_copy_ss[partSize](curResiY, strideResiY, tsResiY, trSize); } cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth); @@ -2821,7 +3060,7 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU; uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); - int16_t *curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC); + int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC); ALIGN_VAR_32(coeff_t, tsCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]); ALIGN_VAR_32(int16_t, tsResiC[MAX_TS_SIZE * MAX_TS_SIZE]); @@ -2831,42 +3070,43 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V)) m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false); - fenc = const_cast(fencYuv->getChromaAddr(chromaId, absPartIdxC)); + fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC); resi = resiYuv.getChromaAddr(chromaId, absPartIdxC); uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, tsCoeffC, log2TrSizeC, (TextType)chromaId, absPartIdxC, true); m_entropyCoder.resetBits(); - singleBitsComp[chromaId][tuIterator.section] = 0; + singleBits[chromaId][tuIterator.section] = 0; if (numSigTSkipC) { - m_entropyCoder.codeQtCbf(!!numSigTSkipC, (TextType)chromaId, tuDepth); + m_entropyCoder.codeQtCbfChroma(!!numSigTSkipC, tuDepth); m_entropyCoder.codeCoeffNxN(cu, tsCoeffC, absPartIdxC, log2TrSizeC, (TextType)chromaId); - singleBitsComp[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits(); + singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits(); m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], tsResiC, trSizeC, 
tsCoeffC, log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC); uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC); - nonZeroDistC = m_rdCost.scaleChromaDistCb(dist); + nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist); if (m_rdCost.m_psyRd) { nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC); - singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section], nonZeroPsyEnergyC); + singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC); } else - singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section]); + singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section]); } if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC) cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); else { - singleDistComp[chromaId][tuIterator.section] = nonZeroDistC; - singlePsyEnergyComp[chromaId][tuIterator.section] = nonZeroPsyEnergyC; + singleDist[chromaId][tuIterator.section] = nonZeroDistC; + singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC; cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC; bestTransformMode[chromaId][tuIterator.section] = 1; + uint32_t numCoeffC = 1 << (log2TrSizeC << 1); memcpy(coeffCurC + subTUOffset, tsCoeffC, sizeof(coeff_t) * numCoeffC); - primitives.square_copy_ss[partSizeC](curResiC, strideResiC, tsResiC, trSizeC); + primitives.luma_copy_ss[partSizeC](curResiC, strideResiC, tsResiC, trSizeC); } cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); @@ -2875,66 +3115,55 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa } } + // Here we were encoding cbfs and 
coefficients, after calculating distortion above. + // Now I am encoding only cbfs, since I have encoded coefficients above. I have just collected + // bits required for coefficients and added with number of cbf bits. As I tested the order does not + // make any difference. But bit confused whether I should load the original context as below. m_entropyCoder.load(m_rqt[depth].rqtRoot); - m_entropyCoder.resetBits(); - if (log2TrSize > depthRange[0]) - m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize); - + //Encode cbf flags if (bCodeChroma) { for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) { if (!splitIntoSubTUs) - m_entropyCoder.codeQtCbf(cbfFlag[chromaId][0], (TextType)chromaId, tuDepth); + m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][0], tuDepth); else { offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx); - for (uint32_t subTU = 0; subTU < 2; subTU++) - m_entropyCoder.codeQtCbf(cbfFlag[chromaId][subTU], (TextType)chromaId, tuDepth); + m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][0], tuDepth); + m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][1], tuDepth); } } } - m_entropyCoder.codeQtCbf(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth); - if (cbfFlag[TEXT_LUMA][0]) - m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA); + m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth); - if (bCodeChroma) - { - uint32_t subTUSize = 1 << (log2TrSizeC * 2); - uint32_t partIdxesPerSubTU = absPartIdxStep >> 1; - uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); + uint32_t cbfBits = m_entropyCoder.getNumberOfWrittenBits(); - for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) - { - coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC; - if (!splitIntoSubTUs) - { - if (cbfFlag[chromaId][0]) - m_entropyCoder.codeCoeffNxN(cu, coeffCurC, absPartIdx, log2TrSizeC, (TextType)chromaId); - } - else - { - for (uint32_t subTU = 0; subTU < 
2; subTU++) - { - if (cbfFlag[chromaId][subTU]) - m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTU * subTUSize, absPartIdx + subTU * partIdxesPerSubTU, log2TrSizeC, (TextType)chromaId); - } - } - } + uint32_t coeffBits = 0; + coeffBits = singleBits[TEXT_LUMA][0]; + for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++) + { + coeffBits += singleBits[TEXT_CHROMA_U][subTUIndex]; + coeffBits += singleBits[TEXT_CHROMA_V][subTUIndex]; } - fullCost.distortion += singleDistComp[TEXT_LUMA][0]; - fullCost.energy += singlePsyEnergyComp[TEXT_LUMA][0];// need to check we need to add chroma also + // In split mode, we need only coeffBits. The reason is encoding chroma cbfs is different from luma. + // In case of chroma, if any one of the splitted block's cbf is 1, then we need to encode cbf 1, and then for + // four splitted block's individual cbf value. This is not known before analysis of four splitted blocks. + // For that reason, I am collecting individual coefficient bits only. + fullCost.bits = bSplitPresentFlag ? 
cbfBits + coeffBits : coeffBits; + + fullCost.distortion += singleDist[TEXT_LUMA][0]; + fullCost.energy += singlePsyEnergy[TEXT_LUMA][0];// need to check we need to add chroma also for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++) { - fullCost.distortion += singleDistComp[TEXT_CHROMA_U][subTUIndex]; - fullCost.distortion += singleDistComp[TEXT_CHROMA_V][subTUIndex]; + fullCost.distortion += singleDist[TEXT_CHROMA_U][subTUIndex]; + fullCost.distortion += singleDist[TEXT_CHROMA_V][subTUIndex]; } - fullCost.bits = m_entropyCoder.getNumberOfWrittenBits(); if (m_rdCost.m_psyRd) fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy); else @@ -2951,31 +3180,40 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa } Cost splitCost; - const uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1); + if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0])) + { + // Subdiv flag can be encoded at the start of anlysis of splitted blocks. 
+ m_entropyCoder.resetBits(); + m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize); + splitCost.bits = m_entropyCoder.getNumberOfWrittenBits(); + } + + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; uint32_t ycbf = 0, ucbf = 0, vcbf = 0; - for (uint32_t i = 0; i < 4; ++i) + for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts) { - estimateResidualQT(mode, cuGeom, absPartIdx + i * qPartNumSubdiv, depth + 1, resiYuv, splitCost, depthRange); - ycbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_LUMA, tuDepth + 1); - ucbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_U, tuDepth + 1); - vcbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_V, tuDepth + 1); + estimateResidualQT(mode, cuGeom, qPartIdx, depth + 1, resiYuv, splitCost, depthRange); + ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1); + ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1); + vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1); } - for (uint32_t i = 0; i < 4 * qPartNumSubdiv; ++i) + for (uint32_t i = 0; i < 4 * qNumParts; ++i) { cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth; cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth; cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth; } + // Here we were encoding cbfs and coefficients for splitted blocks. Since I have collected coefficient bits + // for each individual blocks, only encoding cbf values. As I mentioned encoding chroma cbfs is different then luma. + // But have one doubt that if coefficients are encoded in context at depth 2 (for example) and cbfs are encoded in context + // at depth 0 (for example). 
m_entropyCoder.load(m_rqt[depth].rqtRoot); m_entropyCoder.resetBits(); - encodeResidualQT(cu, absPartIdx, depth, true, TEXT_LUMA, depthRange); - encodeResidualQT(cu, absPartIdx, depth, false, TEXT_LUMA, depthRange); - encodeResidualQT(cu, absPartIdx, depth, false, TEXT_CHROMA_U, depthRange); - encodeResidualQT(cu, absPartIdx, depth, false, TEXT_CHROMA_V, depthRange); - - splitCost.bits = m_entropyCoder.getNumberOfWrittenBits(); + codeInterSubdivCbfQT(cu, absPartIdx, depth, depthRange); + uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits(); + splitCost.bits += splitCbfBits; if (m_rdCost.m_psyRd) splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy); @@ -2999,15 +3237,18 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth); if (bCodeChroma) { - const uint32_t numberOfSections = splitIntoSubTUs ? 2 : 1; - - uint32_t partIdxesPerSubTU = absPartIdxStep >> (splitIntoSubTUs ? 
1 : 0); - for (uint32_t subTUIndex = 0; subTUIndex < numberOfSections; subTUIndex++) + if (!splitIntoSubTUs) { - const uint32_t subTUPartIdx = absPartIdx + (subTUIndex * partIdxesPerSubTU); - - cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][subTUIndex], TEXT_CHROMA_U, subTUPartIdx, partIdxesPerSubTU); - cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][subTUIndex], TEXT_CHROMA_V, subTUPartIdx, partIdxesPerSubTU); + cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx, depth); + cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx, depth); + } + else + { + uint32_t tuNumParts = absPartIdxStep >> 1; + cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx , tuNumParts); + cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][1], TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts); + cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx , tuNumParts); + cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][1], TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts); } } X265_CHECK(bCheckFull, "check-full must be set\n"); @@ -3019,23 +3260,21 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa if (bCodeChroma) { - uint32_t numberOfSections = splitIntoSubTUs ? 2 : 1; - uint32_t partIdxesPerSubTU = absPartIdxStep >> (splitIntoSubTUs ? 
1 : 0); - - for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) + if (!splitIntoSubTUs) { - for (uint32_t subTUIndex = 0; subTUIndex < numberOfSections; subTUIndex++) - { - const uint32_t subTUPartIdx = absPartIdx + (subTUIndex * partIdxesPerSubTU); + cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx, depth); + cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx, depth); + } + else + { + uint32_t tuNumParts = absPartIdxStep >> 1; - if (splitIntoSubTUs) - { - uint8_t combinedSubTUCBF = cbfFlag[chromaId][0] | cbfFlag[chromaId][1]; - cu.setCbfPartRange(((cbfFlag[chromaId][subTUIndex] << 1) | combinedSubTUCBF) << tuDepth, (TextType)chromaId, subTUPartIdx, partIdxesPerSubTU); - } - else - cu.setCbfPartRange(cbfFlag[chromaId][subTUIndex] << tuDepth, (TextType)chromaId, subTUPartIdx, partIdxesPerSubTU); - } + offsetCBFs(cbfFlag[TEXT_CHROMA_U]); + offsetCBFs(cbfFlag[TEXT_CHROMA_V]); + cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx , tuNumParts); + cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][1] << tuDepth, TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts); + cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx , tuNumParts); + cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][1] << tuDepth, TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts); } } @@ -3045,51 +3284,65 @@ void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPa outCosts.energy += fullCost.energy; } -void Search::encodeResidualQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, bool bSubdivAndCbf, TextType ttype, uint32_t depthRange[2]) +void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, const uint32_t depthRange[2]) { X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n"); - X265_CHECK(cu.m_predMode[absPartIdx] != MODE_INTRA, "encodeResidualQT() with intra block\n"); + 
X265_CHECK(cu.isInter(absPartIdx), "codeInterSubdivCbfQT() with intra block\n"); - const uint32_t curTuDepth = depth - cu.m_cuDepth[0]; - const uint32_t tuDepth = cu.m_tuDepth[absPartIdx]; - const bool bSubdiv = curTuDepth != tuDepth; + const uint32_t tuDepth = depth - cu.m_cuDepth[0]; + const bool bSubdiv = tuDepth != cu.m_tuDepth[absPartIdx]; const uint32_t log2TrSize = g_maxLog2CUSize - depth; - uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; - - const bool splitIntoSubTUs = (m_csp == X265_CSP_I422); + if (!(log2TrSize - m_hChromaShift < 2)) + { + if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1)) + m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !bSubdiv); + if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1)) + m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !bSubdiv); + } + else + { + X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1), "chroma CBF not matching\n"); + X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1), "chroma CBF not matching\n"); + } - if (bSubdivAndCbf && log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]) - m_entropyCoder.codeTransformSubdivFlag(bSubdiv, 5 - log2TrSize); + if (!bSubdiv) + { + m_entropyCoder.codeQtCbfLuma(cu, absPartIdx, tuDepth); + } + else + { + uint32_t qNumParts = 1 << (log2TrSize -1 - LOG2_UNIT_SIZE) * 2; + for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) + codeInterSubdivCbfQT(cu, absPartIdx, depth + 1, depthRange); + } +} - bool mCodeAll = true; - uint32_t trWidthC = 1 << log2TrSizeC; - uint32_t trHeightC = splitIntoSubTUs ? 
(trWidthC << 1) : trWidthC; +void Search::encodeResidualQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, TextType ttype, const uint32_t depthRange[2]) +{ + X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n"); + X265_CHECK(cu.isInter(absPartIdx), "encodeResidualQT() with intra block\n"); - const uint32_t numPels = trWidthC * trHeightC; - if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE)) - mCodeAll = false; + const uint32_t curTuDepth = depth - cu.m_cuDepth[0]; + const uint32_t tuDepth = cu.m_tuDepth[absPartIdx]; + const bool bSubdiv = curTuDepth != tuDepth; + const uint32_t log2TrSize = g_maxLog2CUSize - depth; - if (bSubdivAndCbf) + if (bSubdiv) { - const bool bFirstCbfOfCU = curTuDepth == 0; - if (bFirstCbfOfCU || mCodeAll) - { - uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + curTuDepth) << 1); - if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth - 1)) - m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_U, curTuDepth, !bSubdiv); - if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth - 1)) - m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_V, curTuDepth, !bSubdiv); - } - else + if (cu.getCbf(absPartIdx, ttype, curTuDepth)) { - X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth - 1), "chroma CBF not matching\n"); - X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth - 1), "chroma CBF not matching\n"); + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; + for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) + encodeResidualQT(cu, absPartIdx, depth + 1, ttype, depthRange); } + return; } - - if (!bSubdiv) + else { + const bool splitIntoSubTUs = (m_csp == X265_CSP_I422); + uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; + // Luma const uint32_t qtLayer = 
log2TrSize - 2; uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2); @@ -3098,65 +3351,51 @@ void Search::encodeResidualQT(CUData& cu, uint32_t absPartIdx, const uint32_t de // Chroma bool bCodeChroma = true; uint32_t tuDepthC = tuDepth; - if ((log2TrSize == 2) && !(m_csp == X265_CSP_I444)) + if (log2TrSize == 2 && m_csp != X265_CSP_I444) { + X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); log2TrSizeC++; tuDepthC--; - uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1); - bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0); + bCodeChroma = !(absPartIdx & 3); } - if (bSubdivAndCbf) - m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, tuDepth); - else + if (ttype == TEXT_LUMA && cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth)) + m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA); + + if (bCodeChroma) { - if (ttype == TEXT_LUMA && cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth)) - m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA); + uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); + coeff_t* coeffCurU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC; + coeff_t* coeffCurV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC; - if (bCodeChroma) + if (!splitIntoSubTUs) { - uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift); - coeff_t* coeffCurU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC; - coeff_t* coeffCurV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC; - - if (!splitIntoSubTUs) + if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth)) + m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U); + if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth)) + m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V); + } + else + { + uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2); + uint32_t subTUSize = 1 << (log2TrSizeC * 2); + if (ttype == 
TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth)) { - if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth)) + if (cu.getCbf(absPartIdx, ttype, tuDepth + 1)) m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U); - if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth)) - m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V); + if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1)) + m_entropyCoder.codeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, TEXT_CHROMA_U); } - else + if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth)) { - uint32_t partIdxesPerSubTU = NUM_CU_PARTITIONS >> (((cu.m_cuDepth[absPartIdx] + tuDepthC) << 1) + 1); - uint32_t subTUSize = 1 << (log2TrSizeC * 2); - if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth)) - { - if (cu.getCbf(absPartIdx, ttype, tuDepth + 1)) - m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U); - if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, tuDepth + 1)) - m_entropyCoder.codeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_U); - } - if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth)) - { - if (cu.getCbf(absPartIdx, ttype, tuDepth + 1)) - m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V); - if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, tuDepth + 1)) - m_entropyCoder.codeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_V); - } + if (cu.getCbf(absPartIdx, ttype, tuDepth + 1)) + m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V); + if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1)) + m_entropyCoder.codeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, TEXT_CHROMA_V); } } } } - else - 
{ - if (bSubdivAndCbf || cu.getCbf(absPartIdx, ttype, curTuDepth)) - { - const uint32_t qpartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1); - for (uint32_t i = 0; i < 4; ++i) - encodeResidualQT(cu, absPartIdx + i * qpartNumSubdiv, depth + 1, bSubdivAndCbf, ttype, depthRange); - } - } } void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t depth) @@ -3164,28 +3403,27 @@ void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartI X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n"); const uint32_t curTrMode = depth - cu.m_cuDepth[0]; const uint32_t tuDepth = cu.m_tuDepth[absPartIdx]; + const uint32_t log2TrSize = g_maxLog2CUSize - depth; if (curTrMode < tuDepth) { - uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1); - for (uint32_t i = 0; i < 4; i++, absPartIdx += qPartNumSubdiv) + uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2; + for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) saveResidualQTData(cu, resiYuv, absPartIdx, depth + 1); return; } - const uint32_t log2TrSize = g_maxLog2CUSize - depth; const uint32_t qtLayer = log2TrSize - 2; uint32_t log2TrSizeC = log2TrSize - m_hChromaShift; bool bCodeChroma = true; uint32_t tuDepthC = tuDepth; - if (log2TrSizeC == 1) + if (log2TrSizeC < 2) { - X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444, "tuQuad check failed\n"); - log2TrSizeC++; + X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n"); + log2TrSizeC = 2; tuDepthC--; - uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1); - bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0); + bCodeChroma = !(absPartIdx & 3); } m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize);