1 /*****************************************************************************
2 * Copyright (C) 2014 x265 project
4 * Authors: Steve Borho <steve@borho.org>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20 * This program is also available under a commercial proprietary license.
21 * For more information, contact us at license @ x265.com.
22 *****************************************************************************/
25 #include "primitives.h"
27 #include "framedata.h"
/* Applies the sign of y to x: evaluates to x when y >= 0 and to -x when y < 0.
 * (y >> 31) is expected to be 0 or -1 (all ones), i.e. y is a 32-bit signed
 * value and the right shift is arithmetic. Each argument is parenthesized so
 * compound expressions may be passed without changing precedence. Both
 * arguments are evaluated more than once, so avoid side effects. */
#define SIGN(x, y) (((x) ^ ((y) >> 31)) - ((y) >> 31))
/* Per coefficient-group accumulators used while evaluating whether a whole
 * group of coefficients should be coded or zeroed out. All members are plain
 * integers so the struct can be cleared with memset(). */
struct coeffGroupRDStats
{
    int     nnzBeforePos0;     /* indicates coeff other than pos 0 are coded */
    int64_t codedLevelAndDist; /* distortion and level cost of coded coefficients */
    int64_t uncodedDist;       /* uncoded distortion cost of coded coefficients */
    int64_t sigCost;           /* cost of signaling significant coeff bitmap */
    int64_t sigCost0;          /* cost of signaling sig coeff bit of coeff 0 */
};
/* Returns the smaller of x and y (either one when they are equal).
 *
 * The previous bit-trick form computed (x - y) and shifted its sign bit, which
 * overflows - undefined behaviour, and a wrong answer in practice - whenever
 * the true difference does not fit in an int (for example INT_MAX and -1).
 * A plain comparison is correct for every pair of inputs and is still
 * compiled to branch-free code by current compilers. */
inline int fastMin(int x, int y)
{
    return (x < y) ? x : y;
}
/* Estimates the signaling rate of a coefficient level. The visible statements
 * accumulate into rate from the greaterOneBits[] and levelAbsBits[] tables and
 * from a bin count shifted left by 15. Callers in this file pass diffLevel as
 * (level - baseLevel).
 * NOTE(review): this listing appears to have lost lines (braces, the branch
 * conditions selecting each path, the declarations of rate and size, and the
 * return statement) - confirm against the upstream file before editing. */
53 inline int getICRate(uint32_t absLevel
, int32_t diffLevel
, const int *greaterOneBits
, const int *levelAbsBits
, uint32_t absGoRice
, uint32_t c1c2Idx
)
55 X265_CHECK(c1c2Idx
<= 3, "c1c2Idx check failure\n");
56 X265_CHECK(absGoRice
<= 4, "absGoRice check failure\n");
59 X265_CHECK(diffLevel
< 0, "diffLevel check failure\n");
66 X265_CHECK(absLevel
<= 2, "absLevel check failure\n");
/* level 1 or 2: greater-than-one flag, indexed by whether the level is 2 */
67 rate
+= greaterOneBits
[(absLevel
== 2)];
70 rate
+= levelAbsBits
[0];
/* path for a remaining level: symbol is compared with the per-Rice-parameter limit */
74 uint32_t symbol
= diffLevel
;
75 const uint32_t maxVlc
= g_goRiceRange
[absGoRice
];
76 bool expGolomb
= (symbol
> maxVlc
);
80 absLevel
= symbol
- maxVlc
;
82 // NOTE: mapping to x86 hardware instruction BSR
84 CLZ32(size
, absLevel
);
85 int egs
= size
* 2 + 1;
89 // NOTE: in here, expGolomb=true means (symbol >= maxVlc + 1)
90 X265_CHECK(fastMin(symbol
, (maxVlc
+ 1)) == (int)maxVlc
+ 1, "min check failure\n");
/* prefix length plus Rice suffix bits, capped at 8 bins */
94 uint32_t prefLen
= (symbol
>> absGoRice
) + 1;
95 uint32_t numBins
= fastMin(prefLen
+ absGoRice
, 8 /* g_goRicePrefixLen[absGoRice] + absGoRice */);
/* bin counts are scaled by (1 << 15) before being added to the table costs */
97 rate
+= numBins
<< 15;
100 rate
+= greaterOneBits
[1];
103 rate
+= levelAbsBits
[1];
108 /* Calculates the cost for specific absolute transform level */
/* absLevel must be non-zero (checked below). The visible statements either
 * take the cost from greaterOneBits[] / levelAbsBits[] for levels 1 and 2, or
 * derive a bin count from diffLevel and absGoRice, shift it left by 15 and add
 * the [1] entries of both tables.
 * NOTE(review): braces, several branch conditions, the declarations of idx and
 * length in the escape path, and the return statement are not visible in this
 * listing - confirm against the upstream file. */
109 inline uint32_t getICRateCost(uint32_t absLevel
, int32_t diffLevel
, const int *greaterOneBits
, const int *levelAbsBits
, uint32_t absGoRice
, uint32_t c1c2Idx
)
111 X265_CHECK(absLevel
, "absLevel should not be zero\n");
115 X265_CHECK((absLevel
== 1) || (absLevel
== 2), "absLevel range check failure\n");
117 uint32_t rate
= greaterOneBits
[(absLevel
== 2)];
119 rate
+= levelAbsBits
[0];
125 uint32_t symbol
= diffLevel
;
/* short prefix: (symbol >> absGoRice) is below COEF_REMAIN_BIN_REDUCTION */
126 if ((symbol
>> absGoRice
) < COEF_REMAIN_BIN_REDUCTION
)
128 uint32_t length
= symbol
>> absGoRice
;
129 rate
= (length
+ 1 + absGoRice
) << 15;
/* long prefix: the part above COEF_REMAIN_BIN_REDUCTION is measured with CLZ32 */
134 symbol
= (symbol
>> absGoRice
) - COEF_REMAIN_BIN_REDUCTION
;
138 CLZ32(idx
, symbol
+ 1);
142 rate
= (COEF_REMAIN_BIN_REDUCTION
+ length
+ absGoRice
+ 1 + length
) << 15;
145 rate
+= greaterOneBits
[1];
147 rate
+= levelAbsBits
[1];
/* Clears the three coefficient buffer pointers that Quant::init() later
 * assigns, so they hold NULL until allocation.
 * NOTE(review): the header of the enclosing function is not visible in this
 * listing - presumably the Quant constructor; confirm. */
156 m_resiDctCoeff
= NULL
;
157 m_fencDctCoeff
= NULL
;
158 m_fencShortBuf
= NULL
;
/* Stores the entropy coder and scaling list pointers, converts psyScale to an
 * integer scale (psyScale * 256) and allocates the coefficient buffers.
 * Returns true only when both allocations produced a non-null pointer.
 * NOTE(review): useRDOQ is not referenced by any visible statement; the
 * embedded numbering skips a line after the first assignment, so a statement
 * consuming it may be missing from this listing - confirm. */
163 bool Quant::init(bool useRDOQ
, double psyScale
, const ScalingList
& scalingList
, Entropy
& entropy
)
165 m_entropyCoder
= &entropy
;
167 m_psyRdoqScale
= (int64_t)(psyScale
* 256.0);
168 m_scalingList
= &scalingList
;
/* one allocation holds two MAX_TR_SIZE x MAX_TR_SIZE int32_t planes */
169 m_resiDctCoeff
= X265_MALLOC(int32_t, MAX_TR_SIZE
* MAX_TR_SIZE
* 2);
/* m_fencDctCoeff aliases the second plane of m_resiDctCoeff; it is not a separate allocation */
170 m_fencDctCoeff
= m_resiDctCoeff
+ (MAX_TR_SIZE
* MAX_TR_SIZE
);
171 m_fencShortBuf
= X265_MALLOC(int16_t, MAX_TR_SIZE
* MAX_TR_SIZE
);
173 return m_resiDctCoeff
&& m_fencShortBuf
;
/* Allocates m_frameNr as an array of param.frameNumThreads NoiseReduction
 * entries and zero-fills it.
 * NOTE(review): the null check around the memset and the return statements
 * are not visible in this listing - confirm against the upstream file. */
176 bool Quant::allocNoiseReduction(const x265_param
& param
)
178 m_frameNr
= X265_MALLOC(NoiseReduction
, param
.frameNumThreads
);
180 memset(m_frameNr
, 0, sizeof(NoiseReduction
) * param
.frameNumThreads
);
/* Releases the buffers obtained with X265_MALLOC. m_fencDctCoeff is not freed
 * here because init() points it into the m_resiDctCoeff allocation.
 * NOTE(review): the header of the enclosing function is not visible in this
 * listing - presumably the Quant destructor; confirm. */
188 X265_FREE(m_frameNr
);
189 X265_FREE(m_resiDctCoeff
);
190 X265_FREE(m_fencShortBuf
);
193 void Quant::setQPforQuant(const CUData
& ctu
)
195 m_nr
= m_frameNr
? &m_frameNr
[ctu
.m_encData
->m_frameEncoderID
] : NULL
;
196 int qpy
= ctu
.m_qp
[0];
197 m_qpParam
[TEXT_LUMA
].setQpParam(qpy
+ QP_BD_OFFSET
);
198 setChromaQP(qpy
+ ctu
.m_slice
->m_pps
->chromaCbQpOffset
, TEXT_CHROMA_U
, ctu
.m_chromaFormat
);
199 setChromaQP(qpy
+ ctu
.m_slice
->m_pps
->chromaCrQpOffset
, TEXT_CHROMA_V
, ctu
.m_chromaFormat
);
/* Derives the chroma quantizer parameter for plane ttype from qpin. The input
 * is clipped to [-QP_BD_OFFSET, 57]; the visible statements then either map it
 * through g_chromaScale[] (4:2:0) or cap it at 51, and store the result plus
 * QP_BD_OFFSET in m_qpParam[ttype].
 * NOTE(review): the conditions that decide which of the two qp assignments
 * runs (and whether either runs for small qp) are not fully visible in this
 * listing - confirm against the upstream file. */
202 void Quant::setChromaQP(int qpin
, TextType ttype
, int chFmt
)
204 int qp
= Clip3(-QP_BD_OFFSET
, 57, qpin
);
207 if (chFmt
== X265_CSP_I420
)
208 qp
= g_chromaScale
[qp
];
210 qp
= X265_MIN(qp
, 51);
212 m_qpParam
[ttype
].setQpParam(qp
+ QP_BD_OFFSET
);
215 /* To minimize the distortion only. No rate is considered */
/* Sign bit hiding for the non-RDOQ path. Coefficient groups are visited from
 * last to first; in each group the first and last non-zero scan positions are
 * located, and when they are at least SBH_THRESHOLD apart the parity of the
 * summed levels is compared with the sign of the first non-zero coefficient.
 * On a mismatch one coefficient of the group is moved by one step, chosen by
 * the smallest cost derived from deltaU[].
 * NOTE(review): braces, loop exits, the declarations of n, absSum and lastCG,
 * several curChange assignments, the numSig updates and the return statement
 * are not visible in this listing - confirm against the upstream file. */
216 uint32_t Quant::signBitHidingHDQ(int16_t* coeff
, int32_t* deltaU
, uint32_t numSig
, const TUEntropyCodingParameters
&codeParams
)
218 const uint32_t log2TrSizeCG
= codeParams
.log2TrSizeCG
;
219 const uint16_t *scan
= codeParams
.scan
;
/* walk the coefficient groups in reverse scan order */
222 for (int cg
= (1 << (log2TrSizeCG
* 2)) - 1; cg
>= 0; cg
--)
224 int cgStartPos
= cg
<< LOG2_SCAN_SET_SIZE
;
227 for (n
= SCAN_SET_SIZE
- 1; n
>= 0; --n
)
228 if (coeff
[scan
[n
+ cgStartPos
]])
233 int lastNZPosInCG
= n
;
236 if (coeff
[scan
[n
+ cgStartPos
]])
239 int firstNZPosInCG
= n
;
241 if (lastNZPosInCG
- firstNZPosInCG
>= SBH_THRESHOLD
)
/* signbit is 1 when the first non-zero coefficient of the group is negative */
243 uint32_t signbit
= coeff
[scan
[cgStartPos
+ firstNZPosInCG
]] > 0 ? 0 : 1;
246 for (n
= firstNZPosInCG
; n
<= lastNZPosInCG
; n
++)
247 absSum
+= coeff
[scan
[n
+ cgStartPos
]];
249 if (signbit
!= (absSum
& 0x1)) // compare signbit with sum_parity
251 int minCostInc
= MAX_INT
, minPos
= -1, curCost
= MAX_INT
;
252 int16_t finalChange
= 0, curChange
= 0;
/* search the group for the cheapest coefficient to adjust */
254 for (n
= (lastCG
? lastNZPosInCG
: SCAN_SET_SIZE
- 1); n
>= 0; --n
)
256 uint32_t blkPos
= scan
[n
+ cgStartPos
];
259 if (deltaU
[blkPos
] > 0)
261 curCost
= -deltaU
[blkPos
];
266 if (n
== firstNZPosInCG
&& abs(coeff
[blkPos
]) == 1)
270 curCost
= deltaU
[blkPos
];
277 if (n
< firstNZPosInCG
)
279 uint32_t thisSignBit
= m_resiDctCoeff
[blkPos
] >= 0 ? 0 : 1;
280 if (thisSignBit
!= signbit
)
284 curCost
= -deltaU
[blkPos
];
290 curCost
= -deltaU
[blkPos
];
295 if (curCost
< minCostInc
)
297 minCostInc
= curCost
;
298 finalChange
= curChange
;
303 /* do not allow change to violate coeff clamp */
304 if (coeff
[minPos
] == 32767 || coeff
[minPos
] == -32768)
309 else if (finalChange
== -1 && abs(coeff
[minPos
]) == 1)
/* apply the change in the direction given by the sign of the unquantized coefficient */
312 if (m_resiDctCoeff
[minPos
] >= 0)
313 coeff
[minPos
] += finalChange
;
315 coeff
[minPos
] -= finalChange
;
/* Forward transform and quantization of one transform block.
 * Visible steps: (1) when cu.m_tqBypass is set the residual is copied to coeff
 * through primitives.copy_cnt and its result returned; (2) otherwise the
 * residual is either scaled into m_resiDctCoeff (transform skip) or
 * transformed with primitives.dct; (3) for psy-rdoq the source pixels are
 * also transformed into m_fencDctCoeff; (4) inter blocks may be denoised;
 * (5) the coefficients are quantized either by rdoQuant() or by
 * primitives.quant followed, when enabled and numSig >= 2, by
 * signBitHidingHDQ(). The return value is produced by those callees.
 * NOTE(review): braces, else branches, the conditions choosing psy DCT and
 * rdoQuant, the deltaU declaration and the final return are not visible in
 * this listing - confirm against the upstream file. */
325 uint32_t Quant::transformNxN(CUData
& cu
, pixel
* fenc
, uint32_t fencStride
, int16_t* residual
, uint32_t stride
,
326 coeff_t
* coeff
, uint32_t log2TrSize
, TextType ttype
, uint32_t absPartIdx
, bool useTransformSkip
)
328 if (cu
.m_tqBypass
[absPartIdx
])
330 X265_CHECK(log2TrSize
>= 2 && log2TrSize
<= 5, "Block size mistake!\n");
331 return primitives
.copy_cnt
[log2TrSize
- 2](coeff
, residual
, stride
);
334 bool isLuma
= ttype
== TEXT_LUMA
;
335 bool usePsy
= m_psyRdoqScale
&& isLuma
&& !useTransformSkip
;
336 bool isIntra
= cu
.m_predMode
[absPartIdx
] == MODE_INTRA
;
337 int transformShift
= MAX_TR_DYNAMIC_RANGE
- X265_DEPTH
- log2TrSize
; // Represents scaling through forward transform
338 int trSize
= 1 << log2TrSize
;
340 X265_CHECK((cu
.m_slice
->m_sps
->quadtreeTULog2MaxSize
>= log2TrSize
), "transform size too large\n");
341 if (useTransformSkip
)
344 primitives
.cvt16to32_shl(m_resiDctCoeff
, residual
, stride
, transformShift
, trSize
);
346 if (transformShift
>= 0)
347 primitives
.cvt16to32_shl(m_resiDctCoeff
, residual
, stride
, transformShift
, trSize
);
/* negative transformShift: shift right by its magnitude with a rounding offset */
350 int shift
= -transformShift
;
351 int offset
= (1 << (shift
- 1));
352 primitives
.cvt16to32_shr
[log2TrSize
- 2](m_resiDctCoeff
, residual
, stride
, shift
, offset
);
358 const uint32_t sizeIdx
= log2TrSize
- 2;
/* useDST is 1 only for 4x4 intra luma; it moves index one entry below DCT_4x4 */
359 int useDST
= !sizeIdx
&& isLuma
&& isIntra
;
360 int index
= DCT_4x4
+ sizeIdx
- useDST
;
362 primitives
.dct
[index
](residual
, m_resiDctCoeff
, stride
);
364 /* NOTE: if RDOQ is disabled globally, psy-rdoq is also disabled, so
365 * there is no risk of performing this DCT unnecessarily */
368 /* perform DCT on source pixels for psy-rdoq */
369 primitives
.square_copy_ps
[sizeIdx
](m_fencShortBuf
, trSize
, fenc
, fencStride
);
370 primitives
.dct
[index
](m_fencShortBuf
, m_fencDctCoeff
, trSize
);
373 if (m_nr
&& !isIntra
)
375 /* denoise is not applied to intra residual, so DST can be ignored */
376 int cat
= sizeIdx
+ 4 * !isLuma
;
377 int numCoeff
= 1 << (log2TrSize
* 2);
378 primitives
.denoiseDct(m_resiDctCoeff
, m_nr
->residualSum
[cat
], m_nr
->offsetDenoise
[cat
], numCoeff
);
384 return rdoQuant(cu
, coeff
, log2TrSize
, ttype
, absPartIdx
, usePsy
);
/* plain quantization path */
389 int scalingListType
= ttype
+ (isLuma
? 3 : 0);
390 int rem
= m_qpParam
[ttype
].rem
;
391 int per
= m_qpParam
[ttype
].per
;
392 int32_t *quantCoeff
= m_scalingList
->m_quantCoef
[log2TrSize
- 2][scalingListType
][rem
];
394 int qbits
= QUANT_SHIFT
+ per
+ transformShift
;
/* rounding offset: 171/512 for I slices, 85/512 otherwise, scaled to qbits */
395 int add
= (cu
.m_slice
->m_sliceType
== I_SLICE
? 171 : 85) << (qbits
- 9);
396 int numCoeff
= 1 << (log2TrSize
* 2);
398 uint32_t numSig
= primitives
.quant(m_resiDctCoeff
, quantCoeff
, deltaU
, coeff
, qbits
, add
, numCoeff
);
400 if (numSig
>= 2 && cu
.m_slice
->m_pps
->bSignHideEnabled
)
402 TUEntropyCodingParameters codeParams
;
403 cu
.getTUEntropyCodingParameters(codeParams
, absPartIdx
, log2TrSize
, isLuma
);
404 return signBitHidingHDQ(coeff
, deltaU
, numSig
, codeParams
);
/* Inverse quantization and inverse transform of one transform block into
 * residual. Visible steps: (1) with transQuantBypass the coefficients are
 * copied to residual through primitives.copy_shl; (2) otherwise they are
 * dequantized into m_resiDctCoeff with dequant_scaling (scaling list enabled)
 * or dequant_normal; (3) transform-skip blocks are converted straight to
 * 16-bit residual; (4) a block whose only non-zero coefficient is the DC (and
 * which does not use DST) is filled with a constant; (5) everything else goes
 * through primitives.idct.
 * NOTE(review): braces, else keywords and early returns separating these
 * steps are not visible in this listing - confirm against the upstream file. */
411 void Quant::invtransformNxN(bool transQuantBypass
, int16_t* residual
, uint32_t stride
, coeff_t
* coeff
,
412 uint32_t log2TrSize
, TextType ttype
, bool bIntra
, bool useTransformSkip
, uint32_t numSig
)
414 if (transQuantBypass
)
416 primitives
.copy_shl
[log2TrSize
- 2](residual
, coeff
, stride
, 0);
420 // Values need to pass as input parameter in dequant
421 int rem
= m_qpParam
[ttype
].rem
;
422 int per
= m_qpParam
[ttype
].per
;
423 int transformShift
= MAX_TR_DYNAMIC_RANGE
- X265_DEPTH
- log2TrSize
;
424 int shift
= QUANT_IQUANT_SHIFT
- QUANT_SHIFT
- transformShift
;
425 int numCoeff
= 1 << (log2TrSize
* 2);
427 if (m_scalingList
->m_bEnabled
)
429 int scalingListType
= (bIntra
? 0 : 3) + ttype
;
430 int32_t *dequantCoef
= m_scalingList
->m_dequantCoef
[log2TrSize
- 2][scalingListType
][rem
];
431 primitives
.dequant_scaling(coeff
, dequantCoef
, m_resiDctCoeff
, numCoeff
, per
, shift
);
/* flat dequantization: one scale for every coefficient */
435 int scale
= m_scalingList
->s_invQuantScales
[rem
] << per
;
436 primitives
.dequant_normal(coeff
, m_resiDctCoeff
, numCoeff
, scale
, shift
);
439 if (useTransformSkip
)
441 int trSize
= 1 << log2TrSize
;
444 primitives
.cvt32to16_shr(residual
, m_resiDctCoeff
, stride
, transformShift
, trSize
);
446 if (transformShift
> 0)
447 primitives
.cvt32to16_shr(residual
, m_resiDctCoeff
, stride
, transformShift
, trSize
);
449 primitives
.cvt32to16_shl
[log2TrSize
- 2](residual
, m_resiDctCoeff
, stride
, -transformShift
);
454 const uint32_t sizeIdx
= log2TrSize
- 2;
455 int useDST
= !sizeIdx
&& ttype
== TEXT_LUMA
&& bIntra
;
457 X265_CHECK((int)numSig
== primitives
.count_nonzero(coeff
, 1 << (log2TrSize
* 2)), "numSig differ\n");
/* DC-only shortcut: apply both scaling stages to the DC value and fill the block with it */
460 if (numSig
== 1 && coeff
[0] != 0 && !useDST
)
462 const int shift_1st
= 7;
463 const int add_1st
= 1 << (shift_1st
- 1);
464 const int shift_2nd
= 12 - (X265_DEPTH
- 8);
465 const int add_2nd
= 1 << (shift_2nd
- 1);
467 int dc_val
= (((m_resiDctCoeff
[0] * 64 + add_1st
) >> shift_1st
) * 64 + add_2nd
) >> shift_2nd
;
468 primitives
.blockfill_s
[sizeIdx
](residual
, stride
, (int16_t)dc_val
);
472 primitives
.idct
[IDCT_4x4
+ sizeIdx
- useDST
](m_resiDctCoeff
, residual
, stride
);
476 /* Rate distortion optimized quantization for entropy coding engines using
477 * probability models like CABAC */
/* Overview of the visible stages:
 *   1. primitives.nquant produces initial levels in dstCoeff from m_resiDctCoeff.
 *   2. Coefficient groups and coefficients are visited in reverse scan order;
 *      for each candidate level a cost (distortion plus lambda-weighted bits)
 *      is computed and the cheapest level is kept. Whole groups may be zeroed
 *      when that is cheaper.
 *   3. The position of the last coded coefficient is re-evaluated against the
 *      cost of coding no coefficients at all (bestCost / bestLastIdx).
 *   4. Signs are re-applied from m_resiDctCoeff, coefficients at or after
 *      bestLastIdx are cleared, and optional rate-distortion based sign
 *      hiding adjusts one level per group.
 * NOTE(review): this listing has lost many lines (braces, else branches,
 * several declarations such as scanPos, c1, c2, c1Idx, c2Idx, ctxSet, level,
 * bestCost, bestLastIdx, and the return statement) - confirm against the
 * upstream file before changing any logic. */
478 uint32_t Quant::rdoQuant(CUData
& cu
, int16_t* dstCoeff
, uint32_t log2TrSize
, TextType ttype
, uint32_t absPartIdx
, bool usePsy
)
480 int transformShift
= MAX_TR_DYNAMIC_RANGE
- X265_DEPTH
- log2TrSize
; /* Represents scaling through forward transform */
481 int scalingListType
= (cu
.isIntra(absPartIdx
) ? 0 : 3) + ttype
;
483 X265_CHECK(scalingListType
< 6, "scaling list type out of range\n");
485 int rem
= m_qpParam
[ttype
].rem
;
486 int per
= m_qpParam
[ttype
].per
;
487 int qbits
= QUANT_SHIFT
+ per
+ transformShift
; /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */
488 int add
= (1 << (qbits
- 1));
489 int32_t *qCoef
= m_scalingList
->m_quantCoef
[log2TrSize
- 2][scalingListType
][rem
];
491 int numCoeff
= 1 << (log2TrSize
* 2);
493 uint32_t numSig
= primitives
.nquant(m_resiDctCoeff
, qCoef
, dstCoeff
, qbits
, add
, numCoeff
);
495 X265_CHECK((int)numSig
== primitives
.count_nonzero(dstCoeff
, 1 << (log2TrSize
* 2)), "numSig differ\n");
499 uint32_t trSize
= 1 << log2TrSize
;
500 int64_t lambda2
= m_qpParam
[ttype
].lambda2
;
501 int64_t psyScale
= (m_psyRdoqScale
* m_qpParam
[ttype
].lambda
);
503 /* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4)
504 * scale applied that must be removed during unquant. Note that in real dequant there is clipping
505 * at several stages. We skip the clipping for simplicity when measuring RD cost */
506 int32_t *unquantScale
= m_scalingList
->m_dequantCoef
[log2TrSize
- 2][scalingListType
][rem
];
507 int unquantShift
= QUANT_IQUANT_SHIFT
- QUANT_SHIFT
- transformShift
+ (m_scalingList
->m_bEnabled
? 4 : 0);
508 int unquantRound
= (unquantShift
> per
) ? 1 << (unquantShift
- per
- 1) : 0;
509 int scaleBits
= SCALE_BITS
- 2 * transformShift
;
/* Cost helpers: UNQUANT reconstructs a level at blkPos (it reads the local blkPos
 * in scope at the point of use), SIGCOST weights a bit count by lambda2, RDCOST
 * adds squared distortion scaled by scaleBits, PSYVALUE is the psy-rdoq term */
511 #define UNQUANT(lvl) (((lvl) * (unquantScale[blkPos] << per) + unquantRound) >> unquantShift)
512 #define SIGCOST(bits) ((lambda2 * (bits)) >> 8)
513 #define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits))
514 #define PSYVALUE(rec) ((psyScale * (rec)) >> (16 - scaleBits))
516 int64_t costCoeff
[32 * 32]; /* d*d + lambda * bits */
517 int64_t costUncoded
[32 * 32]; /* d*d + lambda * 0 */
518 int64_t costSig
[32 * 32]; /* lambda * bits */
520 int rateIncUp
[32 * 32]; /* signal overhead of increasing level */
521 int rateIncDown
[32 * 32]; /* signal overhead of decreasing level */
522 int sigRateDelta
[32 * 32]; /* signal difference between zero and non-zero */
524 int64_t costCoeffGroupSig
[MLS_GRP_NUM
]; /* lambda * bits of group coding cost */
525 uint64_t sigCoeffGroupFlag64
= 0;
530 uint32_t goRiceParam
= 0;
533 int cgLastScanPos
= -1;
534 int lastScanPos
= -1;
535 const uint32_t cgSize
= (1 << MLS_CG_SIZE
); /* 4x4 num coef = 16 */
536 bool bIsLuma
= ttype
== TEXT_LUMA
;
538 /* total rate distortion cost of transform block, as CBF=0 */
539 int64_t totalUncodedCost
= 0;
541 /* Total rate distortion cost of this transform block, counting the distortion of uncoded blocks,
542 * the distortion and signal cost of coded blocks, and the coding cost of significant
543 * coefficient and coefficient group bitmaps */
544 int64_t totalRdCost
= 0;
546 TUEntropyCodingParameters codeParams
;
547 cu
.getTUEntropyCodingParameters(codeParams
, absPartIdx
, log2TrSize
, bIsLuma
);
548 const uint32_t cgNum
= 1 << (codeParams
.log2TrSizeCG
* 2);
550 /* TODO: update bit estimates if dirty */
551 EstBitsSbac
& estBitsSbac
= m_entropyCoder
->m_estBitsSbac
;
554 coeffGroupRDStats cgRdStats
;
556 /* iterate over coding groups in reverse scan order */
557 for (int cgScanPos
= cgNum
- 1; cgScanPos
>= 0; cgScanPos
--)
559 const uint32_t cgBlkPos
= codeParams
.scanCG
[cgScanPos
];
560 const uint32_t cgPosY
= cgBlkPos
>> codeParams
.log2TrSizeCG
;
561 const uint32_t cgPosX
= cgBlkPos
- (cgPosY
<< codeParams
.log2TrSizeCG
);
562 const uint64_t cgBlkPosMask
= ((uint64_t)1 << cgBlkPos
);
563 memset(&cgRdStats
, 0, sizeof(coeffGroupRDStats
));
565 const int patternSigCtx
= calcPatternSigCtx(sigCoeffGroupFlag64
, cgPosX
, cgPosY
, codeParams
.log2TrSizeCG
);
567 /* iterate over coefficients in each group in reverse scan order */
568 for (int scanPosinCG
= cgSize
- 1; scanPosinCG
>= 0; scanPosinCG
--)
570 scanPos
= (cgScanPos
<< MLS_CG_SIZE
) + scanPosinCG
;
571 uint32_t blkPos
= codeParams
.scan
[scanPos
];
572 uint16_t maxAbsLevel
= (int16_t)abs(dstCoeff
[blkPos
]); /* abs(quantized coeff) */
573 int signCoef
= m_resiDctCoeff
[blkPos
]; /* pre-quantization DCT coeff */
574 int predictedCoef
= m_fencDctCoeff
[blkPos
] - signCoef
; /* predicted DCT = source DCT - residual DCT*/
576 /* RDOQ measures distortion as the squared difference between the unquantized coded level
577 * and the original DCT coefficient. The result is shifted scaleBits to account for the
578 * FIX15 nature of the CABAC cost tables minus the forward transform scale */
580 /* cost of not coding this coefficient (all distortion, no signal bits) */
581 costUncoded
[scanPos
] = (int64_t)(signCoef
* signCoef
) << scaleBits
;
582 if (usePsy
&& blkPos
)
583 /* when no residual coefficient is coded, predicted coef == recon coef */
584 costUncoded
[scanPos
] -= PSYVALUE(predictedCoef
);
586 totalUncodedCost
+= costUncoded
[scanPos
];
588 if (maxAbsLevel
&& lastScanPos
< 0)
590 /* remember the first non-zero coef found in this reverse scan as the last pos */
591 lastScanPos
= scanPos
;
592 ctxSet
= (scanPos
< SCAN_SET_SIZE
|| !bIsLuma
) ? 0 : 2;
593 cgLastScanPos
= cgScanPos
;
598 /* coefficients after lastNZ have no distortion signal cost */
599 costCoeff
[scanPos
] = 0;
600 costSig
[scanPos
] = 0;
602 /* No non-zero coefficient yet found, but this does not mean
603 * there is no uncoded-cost for this coefficient. Pre-
604 * quantization the coefficient may have been non-zero */
605 totalRdCost
+= costUncoded
[scanPos
];
609 const uint32_t c1c2Idx
= ((c1Idx
- 8) >> (sizeof(int) * CHAR_BIT
- 1)) + (((-(int)c2Idx
) >> (sizeof(int) * CHAR_BIT
- 1)) + 1) * 2;
610 const uint32_t baseLevel
= ((uint32_t)0xD9 >> (c1c2Idx
* 2)) & 3; // {1, 2, 1, 3}
612 X265_CHECK(!!((int)c1Idx
< C1FLAG_NUMBER
) == (int)((c1Idx
- 8) >> (sizeof(int) * CHAR_BIT
- 1)), "scan validation 1\n");
613 X265_CHECK(!!(c2Idx
== 0) == ((-(int)c2Idx
) >> (sizeof(int) * CHAR_BIT
- 1)) + 1, "scan validation 2\n");
614 X265_CHECK((int)baseLevel
== ((c1Idx
< C1FLAG_NUMBER
) ? (2 + (c2Idx
== 0)) : 1), "scan validation 3\n");
616 // coefficient level estimation
617 const uint32_t oneCtx
= 4 * ctxSet
+ c1
;
618 const uint32_t absCtx
= ctxSet
+ c2
;
619 const int *greaterOneBits
= estBitsSbac
.greaterOneBits
[oneCtx
];
620 const int *levelAbsBits
= estBitsSbac
.levelAbsBits
[absCtx
];
623 uint32_t sigCoefBits
= 0;
624 costCoeff
[scanPos
] = MAX_INT64
;
626 if ((int)scanPos
== lastScanPos
)
627 sigRateDelta
[blkPos
] = 0;
630 const uint32_t ctxSig
= getSigCtxInc(patternSigCtx
, log2TrSize
, trSize
, blkPos
, bIsLuma
, codeParams
.firstSignificanceMapContext
);
633 /* set default costs to uncoded costs */
634 costSig
[scanPos
] = SIGCOST(estBitsSbac
.significantBits
[ctxSig
][0]);
635 costCoeff
[scanPos
] = costUncoded
[scanPos
] + costSig
[scanPos
];
637 sigRateDelta
[blkPos
] = estBitsSbac
.significantBits
[ctxSig
][1] - estBitsSbac
.significantBits
[ctxSig
][0];
638 sigCoefBits
= estBitsSbac
.significantBits
[ctxSig
][1];
/* candidate levels are maxAbsLevel and, when maxAbsLevel is above 1, maxAbsLevel - 1 */
642 uint16_t minAbsLevel
= X265_MAX(maxAbsLevel
- 1, 1);
643 for (uint16_t lvl
= maxAbsLevel
; lvl
>= minAbsLevel
; lvl
--)
645 uint32_t levelBits
= getICRateCost(lvl
, lvl
- baseLevel
, greaterOneBits
, levelAbsBits
, goRiceParam
, c1c2Idx
) + IEP_RATE
;
647 int unquantAbsLevel
= UNQUANT(lvl
);
648 int d
= abs(signCoef
) - unquantAbsLevel
;
649 int64_t curCost
= RDCOST(d
, sigCoefBits
+ levelBits
);
651 /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */
652 if (usePsy
&& blkPos
)
654 int reconCoef
= abs(unquantAbsLevel
+ SIGN(predictedCoef
, signCoef
));
655 curCost
-= PSYVALUE(reconCoef
);
658 if (curCost
< costCoeff
[scanPos
])
661 costCoeff
[scanPos
] = curCost
;
662 costSig
[scanPos
] = SIGCOST(sigCoefBits
);
667 dstCoeff
[blkPos
] = level
;
668 totalRdCost
+= costCoeff
[scanPos
];
670 /* record costs for sign-hiding performed at the end */
673 int rateNow
= getICRate(level
, level
- baseLevel
, greaterOneBits
, levelAbsBits
, goRiceParam
, c1c2Idx
);
674 rateIncUp
[blkPos
] = getICRate(level
+ 1, level
+ 1 - baseLevel
, greaterOneBits
, levelAbsBits
, goRiceParam
, c1c2Idx
) - rateNow
;
675 rateIncDown
[blkPos
] = getICRate(level
- 1, level
- 1 - baseLevel
, greaterOneBits
, levelAbsBits
, goRiceParam
, c1c2Idx
) - rateNow
;
679 rateIncUp
[blkPos
] = greaterOneBits
[0];
680 rateIncDown
[blkPos
] = 0;
683 /* Update CABAC estimation state */
684 if (level
>= baseLevel
&& goRiceParam
< 4 && level
> (3U << goRiceParam
))
687 c1Idx
-= (-(int32_t)level
) >> 31;
689 /* update bin model */
693 c2
+= (uint32_t)(c2
- 2) >> 31;
696 else if ((c1
< 3) && (c1
> 0) && level
)
699 /* context set update */
700 if (!(scanPos
% SCAN_SET_SIZE
) && scanPos
)
707 ctxSet
= (scanPos
== SCAN_SET_SIZE
|| !bIsLuma
) ? 0 : 2;
708 X265_CHECK(c1
>= 0, "c1 is negative\n");
709 ctxSet
-= ((int32_t)(c1
- 1) >> 31);
714 cgRdStats
.sigCost
+= costSig
[scanPos
];
716 cgRdStats
.sigCost0
= costSig
[scanPos
];
718 if (dstCoeff
[blkPos
])
720 sigCoeffGroupFlag64
|= cgBlkPosMask
;
721 cgRdStats
.codedLevelAndDist
+= costCoeff
[scanPos
] - costSig
[scanPos
];
722 cgRdStats
.uncodedDist
+= costUncoded
[scanPos
];
723 cgRdStats
.nnzBeforePos0
+= scanPosinCG
;
725 } /* end for (scanPosinCG) */
727 costCoeffGroupSig
[cgScanPos
] = 0;
729 if (cgLastScanPos
< 0)
731 /* nothing to do at this point */
733 else if (!cgScanPos
|| cgScanPos
== cgLastScanPos
)
735 /* coeff group 0 is implied to be present, no signal cost */
736 /* coeff group with last NZ is implied to be present, handled below */
738 else if (sigCoeffGroupFlag64
& cgBlkPosMask
)
740 if (!cgRdStats
.nnzBeforePos0
)
742 /* if only coeff 0 in this CG is coded, its significant coeff bit is implied */
743 totalRdCost
-= cgRdStats
.sigCost0
;
744 cgRdStats
.sigCost
-= cgRdStats
.sigCost0
;
747 /* there are coded coefficients in this group, but now we include the signaling cost
748 * of the significant coefficient group flag and evaluate whether the RD cost of the
749 * coded group is more than the RD cost of the uncoded group */
751 uint32_t sigCtx
= getSigCoeffGroupCtxInc(sigCoeffGroupFlag64
, cgPosX
, cgPosY
, codeParams
.log2TrSizeCG
);
753 int64_t costZeroCG
= totalRdCost
+ SIGCOST(estBitsSbac
.significantCoeffGroupBits
[sigCtx
][0]);
754 costZeroCG
+= cgRdStats
.uncodedDist
; /* add distortion for resetting non-zero levels to zero levels */
755 costZeroCG
-= cgRdStats
.codedLevelAndDist
; /* remove distortion and level cost of coded coefficients */
756 costZeroCG
-= cgRdStats
.sigCost
; /* remove signaling cost of significant coeff bitmap */
758 costCoeffGroupSig
[cgScanPos
] = SIGCOST(estBitsSbac
.significantCoeffGroupBits
[sigCtx
][1]);
759 totalRdCost
+= costCoeffGroupSig
[cgScanPos
]; /* add the cost of 1 bit in significant CG bitmap */
761 if (costZeroCG
< totalRdCost
)
763 sigCoeffGroupFlag64
&= ~cgBlkPosMask
;
764 totalRdCost
= costZeroCG
;
765 costCoeffGroupSig
[cgScanPos
] = SIGCOST(estBitsSbac
.significantCoeffGroupBits
[sigCtx
][0]);
767 /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */
768 for (int scanPosinCG
= cgSize
- 1; scanPosinCG
>= 0; scanPosinCG
--)
770 scanPos
= cgScanPos
* cgSize
+ scanPosinCG
;
771 uint32_t blkPos
= codeParams
.scan
[scanPos
];
772 if (dstCoeff
[blkPos
])
774 costCoeff
[scanPos
] = costUncoded
[scanPos
];
775 costSig
[scanPos
] = 0;
777 dstCoeff
[blkPos
] = 0;
783 /* there were no coded coefficients in this coefficient group */
784 uint32_t ctxSig
= getSigCoeffGroupCtxInc(sigCoeffGroupFlag64
, cgPosX
, cgPosY
, codeParams
.log2TrSizeCG
);
785 costCoeffGroupSig
[cgScanPos
] = SIGCOST(estBitsSbac
.significantCoeffGroupBits
[ctxSig
][0]);
786 totalRdCost
+= costCoeffGroupSig
[cgScanPos
]; /* add cost of 0 bit in significant CG bitmap */
787 totalRdCost
-= cgRdStats
.sigCost
; /* remove cost of significant coefficient bitmap */
789 } /* end for (cgScanPos) */
791 X265_CHECK(lastScanPos
>= 0, "numSig non zero, but no coded CG\n");
793 /* calculate RD cost of uncoded block CBF=0, and add cost of CBF=1 to total */
795 if (!cu
.isIntra(absPartIdx
) && bIsLuma
&& !cu
.m_tuDepth
[absPartIdx
])
797 bestCost
= totalUncodedCost
+ SIGCOST(estBitsSbac
.blockRootCbpBits
[0]);
798 totalRdCost
+= SIGCOST(estBitsSbac
.blockRootCbpBits
[1]);
802 int ctx
= ctxCbf
[ttype
][cu
.m_tuDepth
[absPartIdx
]];
803 bestCost
= totalUncodedCost
+ SIGCOST(estBitsSbac
.blockCbpBits
[ctx
][0]);
804 totalRdCost
+= SIGCOST(estBitsSbac
.blockCbpBits
[ctx
][1]);
807 /* This loop starts with the last non-zero found in the first loop and then refines this last
808 * non-zero by measuring the true RD cost of the last NZ at this position, and then the RD costs
809 * at all previous coefficients until a coefficient greater than 1 is encountered or we run out
810 * of coefficients to evaluate. This will factor in the cost of coding empty groups and empty
811 * coeff prior to the last NZ. The base best cost is the RD cost of CBF=0 */
813 bool foundLast
= false;
814 for (int cgScanPos
= cgLastScanPos
; cgScanPos
>= 0 && !foundLast
; cgScanPos
--)
816 if (!cgScanPos
|| cgScanPos
== cgLastScanPos
)
818 /* the presence of these coefficient groups are inferred, they have no bit in
819 * sigCoeffGroupFlag64 and no saved costCoeffGroupSig[] cost */
821 else if (sigCoeffGroupFlag64
& (1ULL << codeParams
.scanCG
[cgScanPos
]))
823 /* remove cost of significant coeff group flag, the group's presence would be inferred
824 * from lastNZ if it were present in this group */
825 totalRdCost
-= costCoeffGroupSig
[cgScanPos
];
829 /* remove cost of signaling this empty group as not present */
830 totalRdCost
-= costCoeffGroupSig
[cgScanPos
];
834 for (int scanPosinCG
= cgSize
- 1; scanPosinCG
>= 0; scanPosinCG
--)
836 scanPos
= cgScanPos
* cgSize
+ scanPosinCG
;
837 if ((int)scanPos
> lastScanPos
)
840 /* if the coefficient was coded, measure the RD cost of it as the last non-zero and then
841 * continue as if it were uncoded. If the coefficient was already uncoded, remove the
842 * cost of signaling it as not-significant */
843 uint32_t blkPos
= codeParams
.scan
[scanPos
];
844 if (dstCoeff
[blkPos
])
846 /* Swap the cost of signaling its significant coeff bit with the cost of
847 * signaling its lastNZ pos */
848 uint32_t posY
= blkPos
>> log2TrSize
;
849 uint32_t posX
= blkPos
- (posY
<< log2TrSize
);
850 uint32_t bitsLastNZ
= codeParams
.scanType
== SCAN_VER
? getRateLast(posY
, posX
) : getRateLast(posX
, posY
);
851 int64_t costAsLast
= totalRdCost
- costSig
[scanPos
] + SIGCOST(bitsLastNZ
);
853 if (costAsLast
< bestCost
)
855 bestLastIdx
= scanPos
+ 1;
856 bestCost
= costAsLast
;
858 if (dstCoeff
[blkPos
] > 1)
864 totalRdCost
-= costCoeff
[scanPos
];
865 totalRdCost
+= costUncoded
[scanPos
];
868 totalRdCost
-= costSig
[scanPos
];
872 /* recount non-zero coefficients and re-apply sign of DCT coef */
874 for (int pos
= 0; pos
< bestLastIdx
; pos
++)
876 int blkPos
= codeParams
.scan
[pos
];
877 int level
= dstCoeff
[blkPos
];
878 numSig
+= (level
!= 0);
880 uint32_t mask
= (int32_t)m_resiDctCoeff
[blkPos
] >> 31;
881 dstCoeff
[blkPos
] = (int16_t)((level
^ mask
) - mask
);
884 /* clean uncoded coefficients */
885 for (int pos
= bestLastIdx
; pos
<= lastScanPos
; pos
++)
886 dstCoeff
[codeParams
.scan
[pos
]] = 0;
888 /* rate-distortion based sign-hiding */
889 if (cu
.m_slice
->m_pps
->bSignHideEnabled
&& numSig
>= 2)
892 for (int subSet
= cgLastScanPos
; subSet
>= 0; subSet
--)
894 int subPos
= subSet
<< LOG2_SCAN_SET_SIZE
;
897 /* measure distance between first and last non-zero coef in this
899 for (n
= SCAN_SET_SIZE
- 1; n
>= 0; --n
)
900 if (dstCoeff
[codeParams
.scan
[n
+ subPos
]])
905 int lastNZPosInCG
= n
;
908 if (dstCoeff
[codeParams
.scan
[n
+ subPos
]])
911 int firstNZPosInCG
= n
;
913 if (lastNZPosInCG
- firstNZPosInCG
>= SBH_THRESHOLD
)
915 uint32_t signbit
= (dstCoeff
[codeParams
.scan
[subPos
+ firstNZPosInCG
]] > 0 ? 0 : 1);
918 for (n
= firstNZPosInCG
; n
<= lastNZPosInCG
; n
++)
919 absSum
+= dstCoeff
[codeParams
.scan
[n
+ subPos
]];
921 if (signbit
!= (absSum
& 1U))
923 /* We must find a coeff to toggle up or down so the sign bit of the first non-zero coeff
924 * is properly implied. Note dstCoeff[] are signed by this point but curChange and
925 * finalChange imply absolute levels (+1 is away from zero, -1 is towards zero) */
927 int64_t minCostInc
= MAX_INT64
, curCost
= MAX_INT64
;
929 int16_t finalChange
= 0, curChange
= 0;
931 for (n
= (lastCG
? lastNZPosInCG
: SCAN_SET_SIZE
- 1); n
>= 0; --n
)
933 uint32_t blkPos
= codeParams
.scan
[n
+ subPos
];
934 int signCoef
= m_resiDctCoeff
[blkPos
]; /* pre-quantization DCT coeff */
935 int absLevel
= abs(dstCoeff
[blkPos
]);
937 int d
= abs(signCoef
) - UNQUANT(absLevel
);
938 int64_t origDist
= (((int64_t)d
* d
)) << scaleBits
;
940 #define DELTARDCOST(d, deltabits) ((((int64_t)d * d) << scaleBits) - origDist + ((lambda2 * (int64_t)(deltabits)) >> 8))
942 if (dstCoeff
[blkPos
])
944 d
= abs(signCoef
) - UNQUANT(absLevel
+ 1);
945 int64_t costUp
= DELTARDCOST(d
, rateIncUp
[blkPos
]);
947 /* if decrementing would make the coeff 0, we can include the
948 * significant coeff flag cost savings */
949 d
= abs(signCoef
) - UNQUANT(absLevel
- 1);
950 bool isOne
= abs(dstCoeff
[blkPos
]) == 1;
951 int downBits
= rateIncDown
[blkPos
] - (isOne
? (IEP_RATE
+ sigRateDelta
[blkPos
]) : 0);
952 int64_t costDown
= DELTARDCOST(d
, downBits
);
954 if (lastCG
&& lastNZPosInCG
== n
&& isOne
)
955 costDown
-= 4 * IEP_RATE
;
957 if (costUp
< costDown
)
965 if (n
== firstNZPosInCG
&& isOne
)
971 else if (n
< firstNZPosInCG
&& signbit
!= (signCoef
>= 0 ? 0 : 1U))
973 /* don't try to make a new coded coeff before the first coeff if its
974 * sign would be different than the first coeff, the inferred sign would
975 * still be wrong and we'd have to do this again. */
980 /* evaluate changing an uncoded coeff 0 to a coded coeff +/-1 */
981 d
= abs(signCoef
) - UNQUANT(1);
982 curCost
= DELTARDCOST(d
, rateIncUp
[blkPos
] + IEP_RATE
+ sigRateDelta
[blkPos
]);
986 if (curCost
< minCostInc
)
988 minCostInc
= curCost
;
989 finalChange
= curChange
;
994 if (dstCoeff
[minPos
] == 32767 || dstCoeff
[minPos
] == -32768)
995 /* don't allow sign hiding to violate the SPEC range */
998 if (dstCoeff
[minPos
] == 0)
1000 else if (finalChange
== -1 && abs(dstCoeff
[minPos
]) == 1)
1003 if (m_resiDctCoeff
[minPos
] >= 0)
1004 dstCoeff
[minPos
] += finalChange
;
1006 dstCoeff
[minPos
] -= finalChange
;
1017 /* Pattern decision for context derivation process of significant_coeff_flag */
/* Returns a 2-bit pattern: bit 0 is set when the coefficient group to the
 * right of (cgPosX, cgPosY) is flagged in sigCoeffGroupFlag64, bit 1 when the
 * group below is flagged. Each neighbour test is masked to zero when the group
 * lies on the last column / last row of the group grid.
 * NOTE(review): the embedded line numbers skip from 1018 to 1023, so lines
 * between the signature and the first statement may be missing from this
 * listing. As shown, trSizeCG == 1 would make (trSizeCG - 2) wrap as a shift
 * count, and the shift count 1 + (cgPosY << log2TrSizeCG) + cgPosX reaches 64
 * for the last group of an 8x8 grid - confirm both against the upstream file. */
1018 uint32_t Quant::calcPatternSigCtx(uint64_t sigCoeffGroupFlag64
, uint32_t cgPosX
, uint32_t cgPosY
, uint32_t log2TrSizeCG
)
1023 const uint32_t trSizeCG
= 1 << log2TrSizeCG
;
1024 X265_CHECK(trSizeCG
<= 8, "transform CG is too large\n");
/* after the shift, bit 0 of sigPos is the flag of the next group in raster order */
1025 const uint32_t sigPos
= (uint32_t)(sigCoeffGroupFlag64
>> (1 + (cgPosY
<< log2TrSizeCG
) + cgPosX
));
/* (pos - (trSizeCG - 1)) >> 31 is all ones when pos is before the last column or row, zero otherwise */
1026 const uint32_t sigRight
= ((int32_t)(cgPosX
- (trSizeCG
- 1)) >> 31) & (sigPos
& 1);
1027 const uint32_t sigLower
= ((int32_t)(cgPosY
- (trSizeCG
- 1)) >> 31) & (sigPos
>> (trSizeCG
- 2)) & 2;
1029 return sigRight
+ sigLower
;
1032 /* Context derivation process of coeff_abs_significant_flag */
/* Chooses the context increment for the significance flag of the coefficient
 * at blkPos. Visible cases: blkPos 0 (DC) is special-cased, 4x4 blocks use
 * the ctxIndMap[] lookup, and larger blocks combine a table_cnt[] lookup
 * (indexed by patternSigCtx and the position inside the 4x4 subset) with
 * firstSignificanceMapContext, adding 3 for luma positions whose posX or posY
 * is 4 or more.
 * NOTE(review): the initializers of ctxIndMap[] and table_cnt[], the return
 * for the DC case and the statement combining cnt with offset are not visible
 * in this listing - confirm against the upstream file. */
1033 uint32_t Quant::getSigCtxInc(uint32_t patternSigCtx
, uint32_t log2TrSize
, uint32_t trSize
, uint32_t blkPos
, bool bIsLuma
,
1034 uint32_t firstSignificanceMapContext
)
1036 static const uint8_t ctxIndMap
[16] =
1044 if (!blkPos
) // special case for the DC context variable
1047 if (log2TrSize
== 2) // 4x4
1048 return ctxIndMap
[blkPos
];
/* split the raster position into row and column; trSize is expected to equal 1 << log2TrSize (checked below) */
1050 const uint32_t posY
= blkPos
>> log2TrSize
;
1051 const uint32_t posX
= blkPos
& (trSize
- 1);
1052 X265_CHECK((blkPos
- (posY
<< log2TrSize
)) == posX
, "block pos check failed\n");
1054 int posXinSubset
= blkPos
& 3;
1055 X265_CHECK((posX
& 3) == (blkPos
& 3), "pos alignment fail\n");
1056 int posYinSubset
= posY
& 3;
1058 // NOTE: [patternSigCtx][posXinSubset][posYinSubset]
1059 static const uint8_t table_cnt
[4][4][4] =
1061 // patternSigCtx = 0
1068 // patternSigCtx = 1
1075 // patternSigCtx = 2
1082 // patternSigCtx = 3
1091 int cnt
= table_cnt
[patternSigCtx
][posXinSubset
][posYinSubset
];
1092 int offset
= firstSignificanceMapContext
;
1096 return (bIsLuma
&& (posX
| posY
) >= 4) ? 3 + offset
: offset
;
1099 /* Calculates the cost of signaling the last significant coefficient in the block */
/* posx / posy are the coordinates of that coefficient. The cost is the sum of
 * the lastXBits[] and lastYBits[] table entries for the group index of each
 * coordinate, plus a bypass-bin term (IEP_RATE per bin) for coordinates of 3
 * or more.
 * NOTE(review): the return statement is not visible in this listing -
 * presumably the function returns cost; confirm against the upstream file. */
1100 inline uint32_t Quant::getRateLast(uint32_t posx
, uint32_t posy
) const
1102 uint32_t ctxX
= getGroupIdx(posx
);
1103 uint32_t ctxY
= getGroupIdx(posy
);
1104 uint32_t cost
= m_entropyCoder
->m_estBitsSbac
.lastXBits
[ctxX
] + m_entropyCoder
->m_estBitsSbac
.lastYBits
[ctxY
];
/* mask is all ones when the position is greater than 2, zero otherwise */
1106 int32_t maskX
= (int32_t)(2 - posx
) >> 31;
1107 int32_t maskY
= (int32_t)(2 - posy
) >> 31;
/* the masked term adds (ctx - 2) >> 1 bins at IEP_RATE each only for positions above 2 */
1109 cost
+= maskX
& (IEP_RATE
* ((ctxX
- 2) >> 1));
1110 cost
+= maskY
& (IEP_RATE
* ((ctxY
- 2) >> 1));
1114 /* Context derivation process of coeff_abs_significant_flag */
1115 uint32_t Quant::getSigCoeffGroupCtxInc(uint64_t cgGroupMask
, uint32_t cgPosX
, uint32_t cgPosY
, uint32_t log2TrSizeCG
)
1117 const uint32_t trSizeCG
= 1 << log2TrSizeCG
;
1119 const uint32_t sigPos
= (uint32_t)(cgGroupMask
>> (1 + (cgPosY
<< log2TrSizeCG
) + cgPosX
));
1120 const uint32_t sigRight
= ((int32_t)(cgPosX
- (trSizeCG
- 1)) >> 31) & sigPos
;
1121 const uint32_t sigLower
= ((int32_t)(cgPosY
- (trSizeCG
- 1)) >> 31) & (sigPos
>> (trSizeCG
- 1));
1123 return (sigRight
| sigLower
) & 1;