return y + ((x - y) & ((x - y) >> (sizeof(int) * CHAR_BIT - 1))); // min(x, y)
}
-inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int *greaterOneBits, const int *levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
+inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
{
X265_CHECK(c1c2Idx <= 3, "c1c2Idx check failure\n");
X265_CHECK(absGoRice <= 4, "absGoRice check failure\n");
// NOTE: mapping to x86 hardware instruction BSR
unsigned long size;
- CLZ32(size, absLevel);
+ CLZ(size, absLevel);
int egs = size * 2 + 1;
rate += egs << 15;
}
/* Calculates the cost for specific absolute transform level */
-inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int *greaterOneBits, const int *levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
+inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
{
X265_CHECK(absLevel, "absLevel should not be zero\n");
if (symbol)
{
unsigned long idx;
- CLZ32(idx, symbol + 1);
+ CLZ(idx, symbol + 1);
length = idx;
}
m_useRDOQ = useRDOQ;
m_psyRdoqScale = (int64_t)(psyScale * 256.0);
m_scalingList = &scalingList;
- m_resiDctCoeff = X265_MALLOC(int32_t, MAX_TR_SIZE * MAX_TR_SIZE * 2);
+ m_resiDctCoeff = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE * 2);
m_fencDctCoeff = m_resiDctCoeff + (MAX_TR_SIZE * MAX_TR_SIZE);
m_fencShortBuf = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE);
m_nr = m_frameNr ? &m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL;
int qpy = ctu.m_qp[0];
m_qpParam[TEXT_LUMA].setQpParam(qpy + QP_BD_OFFSET);
- setChromaQP(qpy + ctu.m_slice->m_pps->chromaCbQpOffset, TEXT_CHROMA_U, ctu.m_chromaFormat);
- setChromaQP(qpy + ctu.m_slice->m_pps->chromaCrQpOffset, TEXT_CHROMA_V, ctu.m_chromaFormat);
+ setChromaQP(qpy + ctu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat);
+ setChromaQP(qpy + ctu.m_slice->m_pps->chromaQpOffset[1], TEXT_CHROMA_V, ctu.m_chromaFormat);
}
void Quant::setChromaQP(int qpin, TextType ttype, int chFmt)
uint32_t Quant::signBitHidingHDQ(int16_t* coeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codeParams)
{
const uint32_t log2TrSizeCG = codeParams.log2TrSizeCG;
- const uint16_t *scan = codeParams.scan;
+ const uint16_t* scan = codeParams.scan;
bool lastCG = true;
for (int cg = (1 << (log2TrSizeCG * 2)) - 1; cg >= 0; cg--)
return numSig;
}
-uint32_t Quant::transformNxN(CUData& cu, pixel* fenc, uint32_t fencStride, int16_t* residual, uint32_t stride,
+uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride,
coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip)
{
+ const uint32_t sizeIdx = log2TrSize - 2;
if (cu.m_tqBypass[absPartIdx])
{
X265_CHECK(log2TrSize >= 2 && log2TrSize <= 5, "Block size mistake!\n");
- return primitives.copy_cnt[log2TrSize - 2](coeff, residual, stride);
+ return primitives.copy_cnt[sizeIdx](coeff, residual, resiStride);
}
bool isLuma = ttype == TEXT_LUMA;
bool usePsy = m_psyRdoqScale && isLuma && !useTransformSkip;
- bool isIntra = cu.m_predMode[absPartIdx] == MODE_INTRA;
int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform
- int trSize = 1 << log2TrSize;
X265_CHECK((cu.m_slice->m_sps->quadtreeTULog2MaxSize >= log2TrSize), "transform size too large\n");
if (useTransformSkip)
{
#if X265_DEPTH <= 10
- primitives.cvt16to32_shl(m_resiDctCoeff, residual, stride, transformShift, trSize);
+ X265_CHECK(transformShift >= 0, "invalid transformShift\n");
+ primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift);
#else
if (transformShift >= 0)
- primitives.cvt16to32_shl(m_resiDctCoeff, residual, stride, transformShift, trSize);
+ primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift);
else
- {
- int shift = -transformShift;
- int offset = (1 << (shift - 1));
- primitives.cvt16to32_shr[log2TrSize - 2](m_resiDctCoeff, residual, stride, shift, offset);
- }
+ primitives.cpy2Dto1D_shr[sizeIdx](m_resiDctCoeff, residual, resiStride, -transformShift);
#endif
}
else
{
- const uint32_t sizeIdx = log2TrSize - 2;
+ bool isIntra = cu.isIntra(absPartIdx);
int useDST = !sizeIdx && isLuma && isIntra;
int index = DCT_4x4 + sizeIdx - useDST;
- primitives.dct[index](residual, m_resiDctCoeff, stride);
+ primitives.dct[index](residual, m_resiDctCoeff, resiStride);
/* NOTE: if RDOQ is disabled globally, psy-rdoq is also disabled, so
* there is no risk of performing this DCT unnecessarily */
if (usePsy)
{
+ int trSize = 1 << log2TrSize;
/* perform DCT on source pixels for psy-rdoq */
- primitives.square_copy_ps[sizeIdx](m_fencShortBuf, trSize, fenc, fencStride);
+ primitives.luma_copy_ps[sizeIdx](m_fencShortBuf, trSize, fenc, fencStride);
primitives.dct[index](m_fencShortBuf, m_fencDctCoeff, trSize);
}
- if (m_nr && !isIntra)
+ if (m_nr)
{
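/* each (size, luma/chroma, intra/inter) combination has its own category; DST is only used for 4x4 intra luma, which is exactly category 0, so it needs no special handling */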
- int cat = sizeIdx + 4 * !isLuma;
+ int cat = sizeIdx + 4 * !isLuma + 8 * !isIntra;
int numCoeff = 1 << (log2TrSize * 2);
primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff);
m_nr->count[cat]++;
int scalingListType = ttype + (isLuma ? 3 : 0);
int rem = m_qpParam[ttype].rem;
int per = m_qpParam[ttype].per;
- int32_t *quantCoeff = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
+ const int32_t* quantCoeff = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
int qbits = QUANT_SHIFT + per + transformShift;
int add = (cu.m_slice->m_sliceType == I_SLICE ? 171 : 85) << (qbits - 9);
}
}
-void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, coeff_t* coeff,
+void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig)
{
+ const uint32_t sizeIdx = log2TrSize - 2;
if (transQuantBypass)
{
- primitives.copy_shl[log2TrSize - 2](residual, coeff, stride, 0);
+ primitives.cpy1Dto2D_shl[sizeIdx](residual, coeff, resiStride, 0);
return;
}
if (m_scalingList->m_bEnabled)
{
int scalingListType = (bIntra ? 0 : 3) + ttype;
- int32_t *dequantCoef = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
+ const int32_t* dequantCoef = m_scalingList->m_dequantCoef[sizeIdx][scalingListType][rem];
primitives.dequant_scaling(coeff, dequantCoef, m_resiDctCoeff, numCoeff, per, shift);
}
else
if (useTransformSkip)
{
- int trSize = 1 << log2TrSize;
-
#if X265_DEPTH <= 10
- primitives.cvt32to16_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
+ X265_CHECK(transformShift > 0, "invalid transformShift\n");
+ primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift);
#else
if (transformShift > 0)
- primitives.cvt32to16_shr(residual, m_resiDctCoeff, stride, transformShift, trSize);
+ primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift);
else
- primitives.cvt32to16_shl[log2TrSize - 2](residual, m_resiDctCoeff, stride, -transformShift);
+ primitives.cpy1Dto2D_shl[sizeIdx](residual, m_resiDctCoeff, resiStride, -transformShift);
#endif
}
else
{
- const uint32_t sizeIdx = log2TrSize - 2;
int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra;
X265_CHECK((int)numSig == primitives.count_nonzero(coeff, 1 << (log2TrSize * 2)), "numSig differ\n");
// DC only
if (numSig == 1 && coeff[0] != 0 && !useDST)
{
- const int shift_1st = 7;
+ const int shift_1st = 7 - 6;
const int add_1st = 1 << (shift_1st - 1);
- const int shift_2nd = 12 - (X265_DEPTH - 8);
+ const int shift_2nd = 12 - (X265_DEPTH - 8) - 3;
const int add_2nd = 1 << (shift_2nd - 1);
- int dc_val = (((m_resiDctCoeff[0] * 64 + add_1st) >> shift_1st) * 64 + add_2nd) >> shift_2nd;
- primitives.blockfill_s[sizeIdx](residual, stride, (int16_t)dc_val);
+ int dc_val = (((m_resiDctCoeff[0] * (64 >> 6) + add_1st) >> shift_1st) * (64 >> 3) + add_2nd) >> shift_2nd;
+ primitives.blockfill_s[sizeIdx](residual, resiStride, (int16_t)dc_val);
return;
}
- primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_resiDctCoeff, residual, stride);
+ primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_resiDctCoeff, residual, resiStride);
}
}
/* Rate distortion optimized quantization for entropy coding engines using
* probability models like CABAC */
-uint32_t Quant::rdoQuant(CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy)
+uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy)
{
int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype;
int per = m_qpParam[ttype].per;
int qbits = QUANT_SHIFT + per + transformShift; /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */
int add = (1 << (qbits - 1));
- int32_t *qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
+ const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
int numCoeff = 1 << (log2TrSize * 2);
/* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4)
* scale applied that must be removed during unquant. Note that in real dequant there is clipping
* at several stages. We skip the clipping for simplicity when measuring RD cost */
- int32_t *unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
+ const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0);
int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0;
int scaleBits = SCALE_BITS - 2 * transformShift;
// coefficient level estimation
const uint32_t oneCtx = 4 * ctxSet + c1;
const uint32_t absCtx = ctxSet + c2;
- const int *greaterOneBits = estBitsSbac.greaterOneBits[oneCtx];
- const int *levelAbsBits = estBitsSbac.levelAbsBits[absCtx];
+ const int* greaterOneBits = estBitsSbac.greaterOneBits[oneCtx];
+ const int* levelAbsBits = estBitsSbac.levelAbsBits[absCtx];
uint16_t level = 0;
uint32_t sigCoefBits = 0;