1 /*****************************************************************************
2 * Copyright (C) 2014 x265 project
4 * Authors: Steve Borho <steve@borho.org>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20 * This program is also available under a commercial proprietary license.
21 * For more information, contact us at license @ x265.com.
22 *****************************************************************************/
25 #include "primitives.h"
27 #include "framedata.h"
/* SIGN(x, y): yields -x when y is negative and x otherwise.  (y >> 31) is used
 * as an all-ones / all-zeros mask, so this assumes y is a 32-bit signed value
 * and that right-shifting a negative value sign-extends -- TODO confirm for
 * every target compiler.  Arguments are not parenthesised inside the macro, so
 * pass simple expressions only. */
35 #define SIGN(x,y) ((x^(y >> 31))-(y >> 31))
/* Per coefficient-group accumulators used by Quant::rdoQuant().  rdoQuant()
 * zeroes the whole structure with memset() at the start of every coefficient
 * group and then adds to the fields while walking the group's coefficients. */
39 struct coeffGroupRDStats
41 int nnzBeforePos0
; /* indicates coeff other than pos 0 are coded */
42 int64_t codedLevelAndDist
; /* distortion and level cost of coded coefficients */
43 int64_t uncodedDist
; /* uncoded distortion cost of coded coefficients */
44 int64_t sigCost
; /* cost of signaling significant coeff bitmap */
45 int64_t sigCost0
; /* cost of signaling sig coeff bit of coeff 0 */
/* Returns the smaller of two signed integers.
 *
 * The previous branch-free form, y + ((x - y) & ((x - y) >> 31)), computed
 * x - y in int, which overflows (undefined behaviour) when the operands are
 * far apart, e.g. fastMin(INT_MAX, -1); it also relied on an arithmetic right
 * shift of a negative value.  A plain comparison is defined for every pair of
 * inputs and compilers emit a conditional move for it, so it stays
 * branch-free in practice. */
inline int fastMin(int x, int y)
{
    return (x < y) ? x : y; // min(x, y)
}
/* Accumulates an estimated rate for coding a coefficient whose absolute level
 * is absLevel.  The rate is built from entries of greaterOneBits[] and
 * levelAbsBits[] plus, for the remainder part, a bin count scaled by (1 << 15).
 *
 *   absLevel       absolute quantized level
 *   diffLevel      rdoQuant() passes (level - baseLevel); checked to be
 *                  negative on one path and copied into an unsigned symbol on
 *                  another
 *   greaterOneBits table indexed here with 0/1
 *   levelAbsBits   table indexed here with 0/1
 *   absGoRice      index into g_goRiceRange[], checked to be <= 4
 *   c1c2Idx        checked to be <= 3
 *
 * NOTE(review): the braces, the if/else lines and the return statement of
 * this function are not present in this listing, so the exact conditions that
 * guard each accumulation below cannot be confirmed from here. */
53 inline int getICRate(uint32_t absLevel
, int32_t diffLevel
, const int* greaterOneBits
, const int* levelAbsBits
, uint32_t absGoRice
, uint32_t c1c2Idx
)
55 X265_CHECK(c1c2Idx
<= 3, "c1c2Idx check failure\n");
56 X265_CHECK(absGoRice
<= 4, "absGoRice check failure\n");
59 X265_CHECK(diffLevel
< 0, "diffLevel check failure\n");
66 X265_CHECK(absLevel
<= 2, "absLevel check failure\n");
67 rate
+= greaterOneBits
[(absLevel
== 2)];
70 rate
+= levelAbsBits
[0];
// remainder coding: symbols larger than g_goRiceRange[absGoRice] are flagged as expGolomb
74 uint32_t symbol
= diffLevel
;
75 const uint32_t maxVlc
= g_goRiceRange
[absGoRice
];
76 bool expGolomb
= (symbol
> maxVlc
);
80 absLevel
= symbol
- maxVlc
;
82 // NOTE: mapping to x86 hardware instruction BSR
85 int egs
= size
* 2 + 1;
89 // NOTE: in here, expGolomb=true means (symbol >= maxVlc + 1)
90 X265_CHECK(fastMin(symbol
, (maxVlc
+ 1)) == (int)maxVlc
+ 1, "min check failure\n");
// prefix length plus absGoRice suffix bits, capped at 8 bins
94 uint32_t prefLen
= (symbol
>> absGoRice
) + 1;
95 uint32_t numBins
= fastMin(prefLen
+ absGoRice
, 8 /* g_goRicePrefixLen[absGoRice] + absGoRice */);
97 rate
+= numBins
<< 15;
100 rate
+= greaterOneBits
[1];
103 rate
+= levelAbsBits
[1];
108 /* Calculates the cost for specific absolute transform level */
/* Parameters mirror getICRate(); absLevel is checked to be non-zero.  Two
 * forms of rate are visible: a table-based one built from greaterOneBits[] /
 * levelAbsBits[] for levels 1 and 2, and a remainder-based one where the bin
 * count is scaled by (1 << 15), with a longer escape form once
 * (symbol >> absGoRice) reaches COEF_REMAIN_BIN_REDUCTION.
 *
 * NOTE(review): braces, if/else lines and the return statement are missing
 * from this listing; the guards of the individual statements cannot be
 * confirmed from here. */
109 inline uint32_t getICRateCost(uint32_t absLevel
, int32_t diffLevel
, const int* greaterOneBits
, const int* levelAbsBits
, uint32_t absGoRice
, uint32_t c1c2Idx
)
111 X265_CHECK(absLevel
, "absLevel should not be zero\n");
115 X265_CHECK((absLevel
== 1) || (absLevel
== 2), "absLevel range check failure\n");
117 uint32_t rate
= greaterOneBits
[(absLevel
== 2)];
119 rate
+= levelAbsBits
[0];
125 uint32_t symbol
= diffLevel
;
// short form: unary prefix of (length + 1) bins followed by absGoRice suffix bits
126 if ((symbol
>> absGoRice
) < COEF_REMAIN_BIN_REDUCTION
)
128 uint32_t length
= symbol
>> absGoRice
;
129 rate
= (length
+ 1 + absGoRice
) << 15;
// escape form: the part of the prefix beyond COEF_REMAIN_BIN_REDUCTION is measured with CLZ
134 symbol
= (symbol
>> absGoRice
) - COEF_REMAIN_BIN_REDUCTION
;
138 CLZ(idx
, symbol
+ 1);
142 rate
= (COEF_REMAIN_BIN_REDUCTION
+ length
+ absGoRice
+ 1 + length
) << 15;
145 rate
+= greaterOneBits
[1];
147 rate
+= levelAbsBits
[1];
156 m_resiDctCoeff
= NULL
;
157 m_fencDctCoeff
= NULL
;
158 m_fencShortBuf
= NULL
;
/* Binds the quantizer to its entropy coder and scaling list and allocates the
 * coefficient work buffers.
 *
 *   useRDOQ     not referenced by any line visible in this listing
 *   psyScale    stored in m_psyRdoqScale as an integer scaled by 256
 *   scalingList address is kept in m_scalingList
 *   entropy     address is kept in m_entropyCoder
 *
 * Returns true only when both X265_MALLOC calls produced a non-null pointer. */
163 bool Quant::init(bool useRDOQ
, double psyScale
, const ScalingList
& scalingList
, Entropy
& entropy
)
165 m_entropyCoder
= &entropy
;
167 m_psyRdoqScale
= (int64_t)(psyScale
* 256.0);
168 m_scalingList
= &scalingList
;
// one allocation holds two MAX_TR_SIZE x MAX_TR_SIZE buffers; m_fencDctCoeff points at the second half
169 m_resiDctCoeff
= X265_MALLOC(int16_t, MAX_TR_SIZE
* MAX_TR_SIZE
* 2);
170 m_fencDctCoeff
= m_resiDctCoeff
+ (MAX_TR_SIZE
* MAX_TR_SIZE
);
171 m_fencShortBuf
= X265_MALLOC(int16_t, MAX_TR_SIZE
* MAX_TR_SIZE
);
173 return m_resiDctCoeff
&& m_fencShortBuf
;
/* Allocates m_frameNr with one NoiseReduction entry per frame thread
 * (param.frameNumThreads) and clears the array to zero.
 * NOTE(review): the allocation-failure check and the return statements are
 * not present in this listing. */
176 bool Quant::allocNoiseReduction(const x265_param
& param
)
178 m_frameNr
= X265_MALLOC(NoiseReduction
, param
.frameNumThreads
);
180 memset(m_frameNr
, 0, sizeof(NoiseReduction
) * param
.frameNumThreads
);
188 X265_FREE(m_frameNr
);
189 X265_FREE(m_resiDctCoeff
);
190 X265_FREE(m_fencShortBuf
);
/* Prepares the per-plane QP parameters for the given CTU.  The luma QP is read
 * from ctu.m_qp[0]; each chroma plane uses that QP plus the matching PPS
 * chromaQpOffset[] entry, routed through setChromaQP(). */
193 void Quant::setQPforQuant(const CUData
& ctu
)
// select this frame encoder's noise-reduction state, or NULL when none was allocated
195 m_nr
= m_frameNr
? &m_frameNr
[ctu
.m_encData
->m_frameEncoderID
] : NULL
;
196 int qpy
= ctu
.m_qp
[0];
197 m_qpParam
[TEXT_LUMA
].setQpParam(qpy
+ QP_BD_OFFSET
);
198 setChromaQP(qpy
+ ctu
.m_slice
->m_pps
->chromaQpOffset
[0], TEXT_CHROMA_U
, ctu
.m_chromaFormat
);
199 setChromaQP(qpy
+ ctu
.m_slice
->m_pps
->chromaQpOffset
[1], TEXT_CHROMA_V
, ctu
.m_chromaFormat
);
/* Sets the QP parameters of one chroma plane.
 *
 *   qpin   requested chroma QP (luma QP plus PPS offset at the call sites)
 *   ttype  plane whose m_qpParam[] entry is updated
 *   chFmt  chroma format; compared against X265_CSP_I420
 *
 * The input is first clipped to [-QP_BD_OFFSET, 57].  For X265_CSP_I420 the
 * value is remapped through g_chromaScale[]; a clamp to 51 is also present.
 * NOTE(review): the conditions that select between the remap and the clamp are
 * only partly visible in this listing. */
202 void Quant::setChromaQP(int qpin
, TextType ttype
, int chFmt
)
204 int qp
= Clip3(-QP_BD_OFFSET
, 57, qpin
);
207 if (chFmt
== X265_CSP_I420
)
208 qp
= g_chromaScale
[qp
];
210 qp
= X265_MIN(qp
, 51);
212 m_qpParam
[ttype
].setQpParam(qp
+ QP_BD_OFFSET
);
/* Sign-bit hiding for the non-RDOQ quantizer.
 *
 *   coeff      quantized coefficients, adjusted in place
 *   deltaU     per-coefficient values produced by primitives.quant() in
 *              transformNxN(); used here as the cost of changing a level
 *   numSig     number of non-zero coefficients on entry
 *   codeParams supplies the scan order and log2TrSizeCG
 *
 * Coefficient groups are visited from last to first.  Within a group whose
 * first and last non-zero positions are at least SBH_THRESHOLD apart, the
 * parity of the summed levels is compared with the sign of the first non-zero
 * coefficient; on mismatch the cheapest +/-1 level change is applied.
 *
 * NOTE(review): braces, several loop headers and the return statement are
 * missing from this listing. */
215 /* To minimize the distortion only. No rate is considered */
216 uint32_t Quant::signBitHidingHDQ(int16_t* coeff
, int32_t* deltaU
, uint32_t numSig
, const TUEntropyCodingParameters
&codeParams
)
218 const uint32_t log2TrSizeCG
= codeParams
.log2TrSizeCG
;
219 const uint16_t* scan
= codeParams
.scan
;
222 for (int cg
= (1 << (log2TrSizeCG
* 2)) - 1; cg
>= 0; cg
--)
224 int cgStartPos
= cg
<< LOG2_SCAN_SET_SIZE
;
// scan backwards through the group looking for a non-zero coefficient
227 for (n
= SCAN_SET_SIZE
- 1; n
>= 0; --n
)
228 if (coeff
[scan
[n
+ cgStartPos
]])
233 int lastNZPosInCG
= n
;
236 if (coeff
[scan
[n
+ cgStartPos
]])
239 int firstNZPosInCG
= n
;
// hiding is attempted only when first and last non-zero are far enough apart
241 if (lastNZPosInCG
- firstNZPosInCG
>= SBH_THRESHOLD
)
243 uint32_t signbit
= coeff
[scan
[cgStartPos
+ firstNZPosInCG
]] > 0 ? 0 : 1;
246 for (n
= firstNZPosInCG
; n
<= lastNZPosInCG
; n
++)
247 absSum
+= coeff
[scan
[n
+ cgStartPos
]];
249 if (signbit
!= (absSum
& 0x1)) // compare signbit with sum_parity
251 int minCostInc
= MAX_INT
, minPos
= -1, curCost
= MAX_INT
;
252 int16_t finalChange
= 0, curChange
= 0;
// search the group for the position whose level change costs the least
254 for (n
= (lastCG
? lastNZPosInCG
: SCAN_SET_SIZE
- 1); n
>= 0; --n
)
256 uint32_t blkPos
= scan
[n
+ cgStartPos
];
259 if (deltaU
[blkPos
] > 0)
261 curCost
= -deltaU
[blkPos
];
266 if (n
== firstNZPosInCG
&& abs(coeff
[blkPos
]) == 1)
270 curCost
= deltaU
[blkPos
];
277 if (n
< firstNZPosInCG
)
279 uint32_t thisSignBit
= m_resiDctCoeff
[blkPos
] >= 0 ? 0 : 1;
280 if (thisSignBit
!= signbit
)
284 curCost
= -deltaU
[blkPos
];
290 curCost
= -deltaU
[blkPos
];
295 if (curCost
< minCostInc
)
297 minCostInc
= curCost
;
298 finalChange
= curChange
;
303 /* do not allow change to violate coeff clamp */
304 if (coeff
[minPos
] == 32767 || coeff
[minPos
] == -32768)
309 else if (finalChange
== -1 && abs(coeff
[minPos
]) == 1)
// the change is expressed on the absolute level; apply it with the sign of the unquantized coefficient
312 if (m_resiDctCoeff
[minPos
] >= 0)
313 coeff
[minPos
] += finalChange
;
315 coeff
[minPos
] -= finalChange
;
/* Forward transform and quantization of one transform block.
 *
 *   cu, absPartIdx    coding unit and partition index of the block
 *   fenc, fencStride  source pixels, used only for the psy-rdoq DCT
 *   residual, resiStride  prediction residual to transform
 *   coeff             receives the quantized coefficients
 *   log2TrSize        log2 of the block width (checked to be 2..5 on the
 *                     bypass path)
 *   ttype             plane (TEXT_LUMA / chroma)
 *   useTransformSkip  copy the shifted residual instead of running a DCT
 *
 * With cu.m_tqBypass[absPartIdx] set, the residual is copied to coeff with
 * primitives.copy_cnt and that call's result is returned.  Otherwise the
 * residual is placed in m_resiDctCoeff (shifted copy or DCT/DST), optionally
 * passed through primitives.denoiseDct, and quantized either by rdoQuant() or
 * by primitives.quant followed by signBitHidingHDQ() when sign hiding is
 * enabled and at least two coefficients are non-zero.
 *
 * NOTE(review): braces, else lines and some conditions (for example the one
 * selecting rdoQuant()) are missing from this listing. */
325 uint32_t Quant::transformNxN(const CUData
& cu
, const pixel
* fenc
, uint32_t fencStride
, const int16_t* residual
, uint32_t resiStride
,
326 coeff_t
* coeff
, uint32_t log2TrSize
, TextType ttype
, uint32_t absPartIdx
, bool useTransformSkip
)
328 const uint32_t sizeIdx
= log2TrSize
- 2;
// transquant bypass: no transform, no quantization
329 if (cu
.m_tqBypass
[absPartIdx
])
331 X265_CHECK(log2TrSize
>= 2 && log2TrSize
<= 5, "Block size mistake!\n");
332 return primitives
.copy_cnt
[sizeIdx
](coeff
, residual
, resiStride
);
335 bool isLuma
= ttype
== TEXT_LUMA
;
336 bool usePsy
= m_psyRdoqScale
&& isLuma
&& !useTransformSkip
;
337 int transformShift
= MAX_TR_DYNAMIC_RANGE
- X265_DEPTH
- log2TrSize
; // Represents scaling through forward transform
339 X265_CHECK((cu
.m_slice
->m_sps
->quadtreeTULog2MaxSize
>= log2TrSize
), "transform size too large\n");
340 if (useTransformSkip
)
343 X265_CHECK(transformShift
>= 0, "invalid transformShift\n");
344 primitives
.cpy2Dto1D_shl
[sizeIdx
](m_resiDctCoeff
, residual
, resiStride
, transformShift
);
346 if (transformShift
>= 0)
347 primitives
.cpy2Dto1D_shl
[sizeIdx
](m_resiDctCoeff
, residual
, resiStride
, transformShift
);
349 primitives
.cpy2Dto1D_shr
[sizeIdx
](m_resiDctCoeff
, residual
, resiStride
, -transformShift
);
// useDST is 1 only for 4x4 intra luma; subtracting it from the DCT index selects a different dct[] entry
354 bool isIntra
= cu
.isIntra(absPartIdx
);
355 int useDST
= !sizeIdx
&& isLuma
&& isIntra
;
356 int index
= DCT_4x4
+ sizeIdx
- useDST
;
358 primitives
.dct
[index
](residual
, m_resiDctCoeff
, resiStride
);
360 /* NOTE: if RDOQ is disabled globally, psy-rdoq is also disabled, so
361 * there is no risk of performing this DCT unnecessarily */
364 int trSize
= 1 << log2TrSize
;
365 /* perform DCT on source pixels for psy-rdoq */
366 primitives
.luma_copy_ps
[sizeIdx
](m_fencShortBuf
, trSize
, fenc
, fencStride
);
367 primitives
.dct
[index
](m_fencShortBuf
, m_fencDctCoeff
, trSize
);
372 /* denoise is not applied to intra residual, so DST can be ignored */
373 int cat
= sizeIdx
+ 4 * !isLuma
+ 8 * !isIntra
;
374 int numCoeff
= 1 << (log2TrSize
* 2);
375 primitives
.denoiseDct(m_resiDctCoeff
, m_nr
->residualSum
[cat
], m_nr
->offsetDenoise
[cat
], numCoeff
);
381 return rdoQuant(cu
, coeff
, log2TrSize
, ttype
, absPartIdx
, usePsy
);
// NOTE(review): rdoQuant() and invtransformNxN() compute scalingListType as
// (intra ? 0 : 3) + ttype, whereas this path keys the +3 offset on isLuma.
// Confirm that the difference is intended.
386 int scalingListType
= ttype
+ (isLuma
? 3 : 0);
387 int rem
= m_qpParam
[ttype
].rem
;
388 int per
= m_qpParam
[ttype
].per
;
389 const int32_t* quantCoeff
= m_scalingList
->m_quantCoef
[log2TrSize
- 2][scalingListType
][rem
];
// rounding offset: 171/512 for I slices, 85/512 otherwise, scaled to qbits
391 int qbits
= QUANT_SHIFT
+ per
+ transformShift
;
392 int add
= (cu
.m_slice
->m_sliceType
== I_SLICE
? 171 : 85) << (qbits
- 9);
393 int numCoeff
= 1 << (log2TrSize
* 2);
395 uint32_t numSig
= primitives
.quant(m_resiDctCoeff
, quantCoeff
, deltaU
, coeff
, qbits
, add
, numCoeff
);
397 if (numSig
>= 2 && cu
.m_slice
->m_pps
->bSignHideEnabled
)
399 TUEntropyCodingParameters codeParams
;
400 cu
.getTUEntropyCodingParameters(codeParams
, absPartIdx
, log2TrSize
, isLuma
);
401 return signBitHidingHDQ(coeff
, deltaU
, numSig
, codeParams
);
/* Dequantization and inverse transform of one transform block.
 *
 *   transQuantBypass  copy coeff straight into residual (shift 0)
 *   residual, resiStride  destination residual block
 *   coeff             quantized coefficients
 *   log2TrSize        log2 of the block width
 *   ttype, bIntra     select the scaling list and the 4x4 intra luma DST
 *   useTransformSkip  shifted copy instead of an inverse DCT
 *   numSig            number of non-zero coefficients in coeff; checked
 *                     against primitives.count_nonzero
 *
 * Coefficients are dequantized into m_resiDctCoeff with either
 * primitives.dequant_scaling (scaling list enabled) or
 * primitives.dequant_normal, then converted back to residual.
 *
 * NOTE(review): braces, else lines and return statements are missing from this
 * listing. */
408 void Quant::invtransformNxN(bool transQuantBypass
, int16_t* residual
, uint32_t resiStride
, const coeff_t
* coeff
,
409 uint32_t log2TrSize
, TextType ttype
, bool bIntra
, bool useTransformSkip
, uint32_t numSig
)
411 const uint32_t sizeIdx
= log2TrSize
- 2;
412 if (transQuantBypass
)
414 primitives
.cpy1Dto2D_shl
[sizeIdx
](residual
, coeff
, resiStride
, 0);
418 // Values need to pass as input parameter in dequant
419 int rem
= m_qpParam
[ttype
].rem
;
420 int per
= m_qpParam
[ttype
].per
;
421 int transformShift
= MAX_TR_DYNAMIC_RANGE
- X265_DEPTH
- log2TrSize
;
422 int shift
= QUANT_IQUANT_SHIFT
- QUANT_SHIFT
- transformShift
;
423 int numCoeff
= 1 << (log2TrSize
* 2);
425 if (m_scalingList
->m_bEnabled
)
427 int scalingListType
= (bIntra
? 0 : 3) + ttype
;
428 const int32_t* dequantCoef
= m_scalingList
->m_dequantCoef
[sizeIdx
][scalingListType
][rem
];
429 primitives
.dequant_scaling(coeff
, dequantCoef
, m_resiDctCoeff
, numCoeff
, per
, shift
);
433 int scale
= m_scalingList
->s_invQuantScales
[rem
] << per
;
434 primitives
.dequant_normal(coeff
, m_resiDctCoeff
, numCoeff
, scale
, shift
);
437 if (useTransformSkip
)
440 X265_CHECK(transformShift
> 0, "invalid transformShift\n");
441 primitives
.cpy1Dto2D_shr
[sizeIdx
](residual
, m_resiDctCoeff
, resiStride
, transformShift
);
443 if (transformShift
> 0)
444 primitives
.cpy1Dto2D_shr
[sizeIdx
](residual
, m_resiDctCoeff
, resiStride
, transformShift
);
446 primitives
.cpy1Dto2D_shl
[sizeIdx
](residual
, m_resiDctCoeff
, resiStride
, -transformShift
);
451 int useDST
= !sizeIdx
&& ttype
== TEXT_LUMA
&& bIntra
;
453 X265_CHECK((int)numSig
== primitives
.count_nonzero(coeff
, 1 << (log2TrSize
* 2)), "numSig differ\n");
// DC-only shortcut: a single non-zero coefficient at position 0 (and no DST)
// is turned into one value that fills the whole residual block
456 if (numSig
== 1 && coeff
[0] != 0 && !useDST
)
458 const int shift_1st
= 7 - 6;
459 const int add_1st
= 1 << (shift_1st
- 1);
460 const int shift_2nd
= 12 - (X265_DEPTH
- 8) - 3;
461 const int add_2nd
= 1 << (shift_2nd
- 1);
463 int dc_val
= (((m_resiDctCoeff
[0] * (64 >> 6) + add_1st
) >> shift_1st
) * (64 >> 3) + add_2nd
) >> shift_2nd
;
464 primitives
.blockfill_s
[sizeIdx
](residual
, resiStride
, (int16_t)dc_val
);
468 primitives
.idct
[IDCT_4x4
+ sizeIdx
- useDST
](m_resiDctCoeff
, residual
, resiStride
);
/* Overview of Quant::rdoQuant(), as far as this listing shows:
 *   1. primitives.nquant() quantizes m_resiDctCoeff into dstCoeff.
 *   2. Coefficient groups and their coefficients are walked in reverse scan
 *      order; for each coded position the candidate levels maxAbsLevel and
 *      maxAbsLevel - 1 (never below 1) are costed and the per-position costs
 *      are recorded in costCoeff[], costUncoded[] and costSig[].
 *   3. Each coefficient group is then compared against the cost of zeroing
 *      the whole group.
 *   4. The position of the last non-zero coefficient is refined against the
 *      cost of CBF=0.
 *   5. Signs are re-applied from m_resiDctCoeff and, when the PPS enables it
 *      and at least two coefficients remain, rate-distortion based sign
 *      hiding is performed.
 *
 *   cu, absPartIdx  coding unit and partition of the block
 *   dstCoeff        receives the quantized levels
 *   log2TrSize      log2 of the block width
 *   ttype           plane; selects m_qpParam[] and the scaling list
 *   usePsy          enables the PSYVALUE() bias on non-DC positions
 *
 * NOTE(review): braces, many if/else/for lines, several declarations and the
 * return statements are missing from this listing. */
472 /* Rate distortion optimized quantization for entropy coding engines using
473 * probability models like CABAC */
474 uint32_t Quant::rdoQuant(const CUData
& cu
, int16_t* dstCoeff
, uint32_t log2TrSize
, TextType ttype
, uint32_t absPartIdx
, bool usePsy
)
476 int transformShift
= MAX_TR_DYNAMIC_RANGE
- X265_DEPTH
- log2TrSize
; /* Represents scaling through forward transform */
477 int scalingListType
= (cu
.isIntra(absPartIdx
) ? 0 : 3) + ttype
;
479 X265_CHECK(scalingListType
< 6, "scaling list type out of range\n");
481 int rem
= m_qpParam
[ttype
].rem
;
482 int per
= m_qpParam
[ttype
].per
;
483 int qbits
= QUANT_SHIFT
+ per
+ transformShift
; /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */
484 int add
= (1 << (qbits
- 1));
485 const int32_t* qCoef
= m_scalingList
->m_quantCoef
[log2TrSize
- 2][scalingListType
][rem
];
487 int numCoeff
= 1 << (log2TrSize
* 2);
489 uint32_t numSig
= primitives
.nquant(m_resiDctCoeff
, qCoef
, dstCoeff
, qbits
, add
, numCoeff
);
491 X265_CHECK((int)numSig
== primitives
.count_nonzero(dstCoeff
, 1 << (log2TrSize
* 2)), "numSig differ\n");
495 uint32_t trSize
= 1 << log2TrSize
;
496 int64_t lambda2
= m_qpParam
[ttype
].lambda2
;
497 int64_t psyScale
= (m_psyRdoqScale
* m_qpParam
[ttype
].lambda
);
499 /* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4)
500 * scale applied that must be removed during unquant. Note that in real dequant there is clipping
501 * at several stages. We skip the clipping for simplicity when measuring RD cost */
502 const int32_t* unquantScale
= m_scalingList
->m_dequantCoef
[log2TrSize
- 2][scalingListType
][rem
];
503 int unquantShift
= QUANT_IQUANT_SHIFT
- QUANT_SHIFT
- transformShift
+ (m_scalingList
->m_bEnabled
? 4 : 0);
504 int unquantRound
= (unquantShift
> per
) ? 1 << (unquantShift
- per
- 1) : 0;
505 int scaleBits
= SCALE_BITS
- 2 * transformShift
;
// The helper macros below read the locals blkPos, per, unquantScale, unquantRound,
// unquantShift, lambda2, psyScale and scaleBits from the enclosing scope.
507 #define UNQUANT(lvl) (((lvl) * (unquantScale[blkPos] << per) + unquantRound) >> unquantShift)
508 #define SIGCOST(bits) ((lambda2 * (bits)) >> 8)
509 #define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits))
510 #define PSYVALUE(rec) ((psyScale * (rec)) >> (16 - scaleBits))
512 int64_t costCoeff
[32 * 32]; /* d*d + lambda * bits */
513 int64_t costUncoded
[32 * 32]; /* d*d + lambda * 0 */
514 int64_t costSig
[32 * 32]; /* lambda * bits */
516 int rateIncUp
[32 * 32]; /* signal overhead of increasing level */
517 int rateIncDown
[32 * 32]; /* signal overhead of decreasing level */
518 int sigRateDelta
[32 * 32]; /* signal difference between zero and non-zero */
520 int64_t costCoeffGroupSig
[MLS_GRP_NUM
]; /* lambda * bits of group coding cost */
521 uint64_t sigCoeffGroupFlag64
= 0;
526 uint32_t goRiceParam
= 0;
529 int cgLastScanPos
= -1;
530 int lastScanPos
= -1;
531 const uint32_t cgSize
= (1 << MLS_CG_SIZE
); /* 4x4 num coef = 16 */
532 bool bIsLuma
= ttype
== TEXT_LUMA
;
534 /* total rate distortion cost of transform block, as CBF=0 */
535 int64_t totalUncodedCost
= 0;
537 /* Total rate distortion cost of this transform block, counting the distortion of uncoded blocks,
538 * the distortion and signal cost of coded blocks, and the coding cost of significant
539 * coefficient and coefficient group bitmaps */
540 int64_t totalRdCost
= 0;
542 TUEntropyCodingParameters codeParams
;
543 cu
.getTUEntropyCodingParameters(codeParams
, absPartIdx
, log2TrSize
, bIsLuma
);
544 const uint32_t cgNum
= 1 << (codeParams
.log2TrSizeCG
* 2);
546 /* TODO: update bit estimates if dirty */
547 EstBitsSbac
& estBitsSbac
= m_entropyCoder
->m_estBitsSbac
;
550 coeffGroupRDStats cgRdStats
;
552 /* iterate over coding groups in reverse scan order */
553 for (int cgScanPos
= cgNum
- 1; cgScanPos
>= 0; cgScanPos
--)
555 const uint32_t cgBlkPos
= codeParams
.scanCG
[cgScanPos
];
556 const uint32_t cgPosY
= cgBlkPos
>> codeParams
.log2TrSizeCG
;
557 const uint32_t cgPosX
= cgBlkPos
- (cgPosY
<< codeParams
.log2TrSizeCG
);
558 const uint64_t cgBlkPosMask
= ((uint64_t)1 << cgBlkPos
);
559 memset(&cgRdStats
, 0, sizeof(coeffGroupRDStats
));
561 const int patternSigCtx
= calcPatternSigCtx(sigCoeffGroupFlag64
, cgPosX
, cgPosY
, codeParams
.log2TrSizeCG
);
563 /* iterate over coefficients in each group in reverse scan order */
564 for (int scanPosinCG
= cgSize
- 1; scanPosinCG
>= 0; scanPosinCG
--)
566 scanPos
= (cgScanPos
<< MLS_CG_SIZE
) + scanPosinCG
;
567 uint32_t blkPos
= codeParams
.scan
[scanPos
];
568 uint16_t maxAbsLevel
= (int16_t)abs(dstCoeff
[blkPos
]); /* abs(quantized coeff) */
569 int signCoef
= m_resiDctCoeff
[blkPos
]; /* pre-quantization DCT coeff */
570 int predictedCoef
= m_fencDctCoeff
[blkPos
] - signCoef
; /* predicted DCT = source DCT - residual DCT*/
572 /* RDOQ measures distortion as the squared difference between the unquantized coded level
573 * and the original DCT coefficient. The result is shifted scaleBits to account for the
574 * FIX15 nature of the CABAC cost tables minus the forward transform scale */
576 /* cost of not coding this coefficient (all distortion, no signal bits) */
577 costUncoded
[scanPos
] = (int64_t)(signCoef
* signCoef
) << scaleBits
;
578 if (usePsy
&& blkPos
)
579 /* when no residual coefficient is coded, predicted coef == recon coef */
580 costUncoded
[scanPos
] -= PSYVALUE(predictedCoef
);
582 totalUncodedCost
+= costUncoded
[scanPos
];
584 if (maxAbsLevel
&& lastScanPos
< 0)
586 /* remember the first non-zero coef found in this reverse scan as the last pos */
587 lastScanPos
= scanPos
;
588 ctxSet
= (scanPos
< SCAN_SET_SIZE
|| !bIsLuma
) ? 0 : 2;
589 cgLastScanPos
= cgScanPos
;
594 /* coefficients after lastNZ have no distortion signal cost */
595 costCoeff
[scanPos
] = 0;
596 costSig
[scanPos
] = 0;
598 /* No non-zero coefficient yet found, but this does not mean
599 * there is no uncoded-cost for this coefficient. Pre-
600 * quantization the coefficient may have been non-zero */
601 totalRdCost
+= costUncoded
[scanPos
];
// branch-free derivation of c1c2Idx and baseLevel; the X265_CHECKs that follow
// spell out the equivalent comparisons
605 const uint32_t c1c2Idx
= ((c1Idx
- 8) >> (sizeof(int) * CHAR_BIT
- 1)) + (((-(int)c2Idx
) >> (sizeof(int) * CHAR_BIT
- 1)) + 1) * 2;
606 const uint32_t baseLevel
= ((uint32_t)0xD9 >> (c1c2Idx
* 2)) & 3; // {1, 2, 1, 3}
608 X265_CHECK(!!((int)c1Idx
< C1FLAG_NUMBER
) == (int)((c1Idx
- 8) >> (sizeof(int) * CHAR_BIT
- 1)), "scan validation 1\n");
609 X265_CHECK(!!(c2Idx
== 0) == ((-(int)c2Idx
) >> (sizeof(int) * CHAR_BIT
- 1)) + 1, "scan validation 2\n");
610 X265_CHECK((int)baseLevel
== ((c1Idx
< C1FLAG_NUMBER
) ? (2 + (c2Idx
== 0)) : 1), "scan validation 3\n");
612 // coefficient level estimation
613 const uint32_t oneCtx
= 4 * ctxSet
+ c1
;
614 const uint32_t absCtx
= ctxSet
+ c2
;
615 const int* greaterOneBits
= estBitsSbac
.greaterOneBits
[oneCtx
];
616 const int* levelAbsBits
= estBitsSbac
.levelAbsBits
[absCtx
];
619 uint32_t sigCoefBits
= 0;
620 costCoeff
[scanPos
] = MAX_INT64
;
622 if ((int)scanPos
== lastScanPos
)
623 sigRateDelta
[blkPos
] = 0;
626 const uint32_t ctxSig
= getSigCtxInc(patternSigCtx
, log2TrSize
, trSize
, blkPos
, bIsLuma
, codeParams
.firstSignificanceMapContext
);
629 /* set default costs to uncoded costs */
630 costSig
[scanPos
] = SIGCOST(estBitsSbac
.significantBits
[ctxSig
][0]);
631 costCoeff
[scanPos
] = costUncoded
[scanPos
] + costSig
[scanPos
];
633 sigRateDelta
[blkPos
] = estBitsSbac
.significantBits
[ctxSig
][1] - estBitsSbac
.significantBits
[ctxSig
][0];
634 sigCoefBits
= estBitsSbac
.significantBits
[ctxSig
][1];
// candidate levels: the quantized level and the one below it, never below 1
638 uint16_t minAbsLevel
= X265_MAX(maxAbsLevel
- 1, 1);
639 for (uint16_t lvl
= maxAbsLevel
; lvl
>= minAbsLevel
; lvl
--)
641 uint32_t levelBits
= getICRateCost(lvl
, lvl
- baseLevel
, greaterOneBits
, levelAbsBits
, goRiceParam
, c1c2Idx
) + IEP_RATE
;
643 int unquantAbsLevel
= UNQUANT(lvl
);
644 int d
= abs(signCoef
) - unquantAbsLevel
;
645 int64_t curCost
= RDCOST(d
, sigCoefBits
+ levelBits
);
647 /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */
648 if (usePsy
&& blkPos
)
650 int reconCoef
= abs(unquantAbsLevel
+ SIGN(predictedCoef
, signCoef
));
651 curCost
-= PSYVALUE(reconCoef
);
654 if (curCost
< costCoeff
[scanPos
])
657 costCoeff
[scanPos
] = curCost
;
658 costSig
[scanPos
] = SIGCOST(sigCoefBits
);
663 dstCoeff
[blkPos
] = level
;
664 totalRdCost
+= costCoeff
[scanPos
];
666 /* record costs for sign-hiding performed at the end */
669 int rateNow
= getICRate(level
, level
- baseLevel
, greaterOneBits
, levelAbsBits
, goRiceParam
, c1c2Idx
);
670 rateIncUp
[blkPos
] = getICRate(level
+ 1, level
+ 1 - baseLevel
, greaterOneBits
, levelAbsBits
, goRiceParam
, c1c2Idx
) - rateNow
;
671 rateIncDown
[blkPos
] = getICRate(level
- 1, level
- 1 - baseLevel
, greaterOneBits
, levelAbsBits
, goRiceParam
, c1c2Idx
) - rateNow
;
675 rateIncUp
[blkPos
] = greaterOneBits
[0];
676 rateIncDown
[blkPos
] = 0;
679 /* Update CABAC estimation state */
680 if (level
>= baseLevel
&& goRiceParam
< 4 && level
> (3U << goRiceParam
))
683 c1Idx
-= (-(int32_t)level
) >> 31;
685 /* update bin model */
689 c2
+= (uint32_t)(c2
- 2) >> 31;
692 else if ((c1
< 3) && (c1
> 0) && level
)
695 /* context set update */
696 if (!(scanPos
% SCAN_SET_SIZE
) && scanPos
)
703 ctxSet
= (scanPos
== SCAN_SET_SIZE
|| !bIsLuma
) ? 0 : 2;
704 X265_CHECK(c1
>= 0, "c1 is negative\n");
705 ctxSet
-= ((int32_t)(c1
- 1) >> 31);
710 cgRdStats
.sigCost
+= costSig
[scanPos
];
712 cgRdStats
.sigCost0
= costSig
[scanPos
];
714 if (dstCoeff
[blkPos
])
716 sigCoeffGroupFlag64
|= cgBlkPosMask
;
717 cgRdStats
.codedLevelAndDist
+= costCoeff
[scanPos
] - costSig
[scanPos
];
718 cgRdStats
.uncodedDist
+= costUncoded
[scanPos
];
719 cgRdStats
.nnzBeforePos0
+= scanPosinCG
;
721 } /* end for (scanPosinCG) */
723 costCoeffGroupSig
[cgScanPos
] = 0;
725 if (cgLastScanPos
< 0)
727 /* nothing to do at this point */
729 else if (!cgScanPos
|| cgScanPos
== cgLastScanPos
)
731 /* coeff group 0 is implied to be present, no signal cost */
732 /* coeff group with last NZ is implied to be present, handled below */
734 else if (sigCoeffGroupFlag64
& cgBlkPosMask
)
736 if (!cgRdStats
.nnzBeforePos0
)
738 /* if only coeff 0 in this CG is coded, its significant coeff bit is implied */
739 totalRdCost
-= cgRdStats
.sigCost0
;
740 cgRdStats
.sigCost
-= cgRdStats
.sigCost0
;
743 /* there are coded coefficients in this group, but now we include the signaling cost
744 * of the significant coefficient group flag and evaluate whether the RD cost of the
745 * coded group is more than the RD cost of the uncoded group */
747 uint32_t sigCtx
= getSigCoeffGroupCtxInc(sigCoeffGroupFlag64
, cgPosX
, cgPosY
, codeParams
.log2TrSizeCG
);
749 int64_t costZeroCG
= totalRdCost
+ SIGCOST(estBitsSbac
.significantCoeffGroupBits
[sigCtx
][0]);
750 costZeroCG
+= cgRdStats
.uncodedDist
; /* add distortion for resetting non-zero levels to zero levels */
751 costZeroCG
-= cgRdStats
.codedLevelAndDist
; /* remove distortion and level cost of coded coefficients */
752 costZeroCG
-= cgRdStats
.sigCost
; /* remove signaling cost of significant coeff bitmap */
754 costCoeffGroupSig
[cgScanPos
] = SIGCOST(estBitsSbac
.significantCoeffGroupBits
[sigCtx
][1]);
755 totalRdCost
+= costCoeffGroupSig
[cgScanPos
]; /* add the cost of 1 bit in significant CG bitmap */
757 if (costZeroCG
< totalRdCost
)
759 sigCoeffGroupFlag64
&= ~cgBlkPosMask
;
760 totalRdCost
= costZeroCG
;
761 costCoeffGroupSig
[cgScanPos
] = SIGCOST(estBitsSbac
.significantCoeffGroupBits
[sigCtx
][0]);
763 /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */
764 for (int scanPosinCG
= cgSize
- 1; scanPosinCG
>= 0; scanPosinCG
--)
766 scanPos
= cgScanPos
* cgSize
+ scanPosinCG
;
767 uint32_t blkPos
= codeParams
.scan
[scanPos
];
768 if (dstCoeff
[blkPos
])
770 costCoeff
[scanPos
] = costUncoded
[scanPos
];
771 costSig
[scanPos
] = 0;
773 dstCoeff
[blkPos
] = 0;
779 /* there were no coded coefficients in this coefficient group */
780 uint32_t ctxSig
= getSigCoeffGroupCtxInc(sigCoeffGroupFlag64
, cgPosX
, cgPosY
, codeParams
.log2TrSizeCG
);
781 costCoeffGroupSig
[cgScanPos
] = SIGCOST(estBitsSbac
.significantCoeffGroupBits
[ctxSig
][0]);
782 totalRdCost
+= costCoeffGroupSig
[cgScanPos
]; /* add cost of 0 bit in significant CG bitmap */
783 totalRdCost
-= cgRdStats
.sigCost
; /* remove cost of significant coefficient bitmap */
785 } /* end for (cgScanPos) */
787 X265_CHECK(lastScanPos
>= 0, "numSig non zero, but no coded CG\n");
789 /* calculate RD cost of uncoded block CBF=0, and add cost of CBF=1 to total */
791 if (!cu
.isIntra(absPartIdx
) && bIsLuma
&& !cu
.m_tuDepth
[absPartIdx
])
793 bestCost
= totalUncodedCost
+ SIGCOST(estBitsSbac
.blockRootCbpBits
[0]);
794 totalRdCost
+= SIGCOST(estBitsSbac
.blockRootCbpBits
[1]);
798 int ctx
= ctxCbf
[ttype
][cu
.m_tuDepth
[absPartIdx
]];
799 bestCost
= totalUncodedCost
+ SIGCOST(estBitsSbac
.blockCbpBits
[ctx
][0]);
800 totalRdCost
+= SIGCOST(estBitsSbac
.blockCbpBits
[ctx
][1]);
803 /* This loop starts with the last non-zero found in the first loop and then refines this last
804 * non-zero by measuring the true RD cost of the last NZ at this position, and then the RD costs
805 * at all previous coefficients until a coefficient greater than 1 is encountered or we run out
806 * of coefficients to evaluate. This will factor in the cost of coding empty groups and empty
807 * coeff prior to the last NZ. The base best cost is the RD cost of CBF=0 */
809 bool foundLast
= false;
810 for (int cgScanPos
= cgLastScanPos
; cgScanPos
>= 0 && !foundLast
; cgScanPos
--)
812 if (!cgScanPos
|| cgScanPos
== cgLastScanPos
)
814 /* the presence of these coefficient groups are inferred, they have no bit in
815 * sigCoeffGroupFlag64 and no saved costCoeffGroupSig[] cost */
817 else if (sigCoeffGroupFlag64
& (1ULL << codeParams
.scanCG
[cgScanPos
]))
819 /* remove cost of significant coeff group flag, the group's presence would be inferred
820 * from lastNZ if it were present in this group */
821 totalRdCost
-= costCoeffGroupSig
[cgScanPos
];
825 /* remove cost of signaling this empty group as not present */
826 totalRdCost
-= costCoeffGroupSig
[cgScanPos
];
830 for (int scanPosinCG
= cgSize
- 1; scanPosinCG
>= 0; scanPosinCG
--)
832 scanPos
= cgScanPos
* cgSize
+ scanPosinCG
;
833 if ((int)scanPos
> lastScanPos
)
836 /* if the coefficient was coded, measure the RD cost of it as the last non-zero and then
837 * continue as if it were uncoded. If the coefficient was already uncoded, remove the
838 * cost of signaling it as not-significant */
839 uint32_t blkPos
= codeParams
.scan
[scanPos
];
840 if (dstCoeff
[blkPos
])
842 /* Swap the cost of signaling its significant coeff bit with the cost of
843 * signaling its lastNZ pos */
844 uint32_t posY
= blkPos
>> log2TrSize
;
845 uint32_t posX
= blkPos
- (posY
<< log2TrSize
);
846 uint32_t bitsLastNZ
= codeParams
.scanType
== SCAN_VER
? getRateLast(posY
, posX
) : getRateLast(posX
, posY
);
847 int64_t costAsLast
= totalRdCost
- costSig
[scanPos
] + SIGCOST(bitsLastNZ
);
849 if (costAsLast
< bestCost
)
851 bestLastIdx
= scanPos
+ 1;
852 bestCost
= costAsLast
;
854 if (dstCoeff
[blkPos
] > 1)
860 totalRdCost
-= costCoeff
[scanPos
];
861 totalRdCost
+= costUncoded
[scanPos
];
864 totalRdCost
-= costSig
[scanPos
];
868 /* recount non-zero coefficients and re-apply sign of DCT coef */
870 for (int pos
= 0; pos
< bestLastIdx
; pos
++)
872 int blkPos
= codeParams
.scan
[pos
];
873 int level
= dstCoeff
[blkPos
];
874 numSig
+= (level
!= 0);
// mask is all ones for a negative source coefficient, so (level ^ mask) - mask negates level
876 uint32_t mask
= (int32_t)m_resiDctCoeff
[blkPos
] >> 31;
877 dstCoeff
[blkPos
] = (int16_t)((level
^ mask
) - mask
);
880 /* clean uncoded coefficients */
881 for (int pos
= bestLastIdx
; pos
<= lastScanPos
; pos
++)
882 dstCoeff
[codeParams
.scan
[pos
]] = 0;
884 /* rate-distortion based sign-hiding */
885 if (cu
.m_slice
->m_pps
->bSignHideEnabled
&& numSig
>= 2)
888 for (int subSet
= cgLastScanPos
; subSet
>= 0; subSet
--)
890 int subPos
= subSet
<< LOG2_SCAN_SET_SIZE
;
893 /* measure distance between first and last non-zero coef in this
895 for (n
= SCAN_SET_SIZE
- 1; n
>= 0; --n
)
896 if (dstCoeff
[codeParams
.scan
[n
+ subPos
]])
901 int lastNZPosInCG
= n
;
904 if (dstCoeff
[codeParams
.scan
[n
+ subPos
]])
907 int firstNZPosInCG
= n
;
909 if (lastNZPosInCG
- firstNZPosInCG
>= SBH_THRESHOLD
)
911 uint32_t signbit
= (dstCoeff
[codeParams
.scan
[subPos
+ firstNZPosInCG
]] > 0 ? 0 : 1);
914 for (n
= firstNZPosInCG
; n
<= lastNZPosInCG
; n
++)
915 absSum
+= dstCoeff
[codeParams
.scan
[n
+ subPos
]];
917 if (signbit
!= (absSum
& 1U))
919 /* We must find a coeff to toggle up or down so the sign bit of the first non-zero coeff
920 * is properly implied. Note dstCoeff[] are signed by this point but curChange and
921 * finalChange imply absolute levels (+1 is away from zero, -1 is towards zero) */
923 int64_t minCostInc
= MAX_INT64
, curCost
= MAX_INT64
;
925 int16_t finalChange
= 0, curChange
= 0;
927 for (n
= (lastCG
? lastNZPosInCG
: SCAN_SET_SIZE
- 1); n
>= 0; --n
)
929 uint32_t blkPos
= codeParams
.scan
[n
+ subPos
];
930 int signCoef
= m_resiDctCoeff
[blkPos
]; /* pre-quantization DCT coeff */
931 int absLevel
= abs(dstCoeff
[blkPos
]);
933 int d
= abs(signCoef
) - UNQUANT(absLevel
);
934 int64_t origDist
= (((int64_t)d
* d
)) << scaleBits
;
// DELTARDCOST: change in RD cost relative to origDist for a new distortion d and a rate delta
936 #define DELTARDCOST(d, deltabits) ((((int64_t)d * d) << scaleBits) - origDist + ((lambda2 * (int64_t)(deltabits)) >> 8))
938 if (dstCoeff
[blkPos
])
940 d
= abs(signCoef
) - UNQUANT(absLevel
+ 1);
941 int64_t costUp
= DELTARDCOST(d
, rateIncUp
[blkPos
]);
943 /* if decrementing would make the coeff 0, we can include the
944 * significant coeff flag cost savings */
945 d
= abs(signCoef
) - UNQUANT(absLevel
- 1);
946 bool isOne
= abs(dstCoeff
[blkPos
]) == 1;
947 int downBits
= rateIncDown
[blkPos
] - (isOne
? (IEP_RATE
+ sigRateDelta
[blkPos
]) : 0);
948 int64_t costDown
= DELTARDCOST(d
, downBits
);
950 if (lastCG
&& lastNZPosInCG
== n
&& isOne
)
951 costDown
-= 4 * IEP_RATE
;
953 if (costUp
< costDown
)
961 if (n
== firstNZPosInCG
&& isOne
)
967 else if (n
< firstNZPosInCG
&& signbit
!= (signCoef
>= 0 ? 0 : 1U))
969 /* don't try to make a new coded coeff before the first coeff if its
970 * sign would be different than the first coeff, the inferred sign would
971 * still be wrong and we'd have to do this again. */
976 /* evaluate changing an uncoded coeff 0 to a coded coeff +/-1 */
977 d
= abs(signCoef
) - UNQUANT(1);
978 curCost
= DELTARDCOST(d
, rateIncUp
[blkPos
] + IEP_RATE
+ sigRateDelta
[blkPos
]);
982 if (curCost
< minCostInc
)
984 minCostInc
= curCost
;
985 finalChange
= curChange
;
990 if (dstCoeff
[minPos
] == 32767 || dstCoeff
[minPos
] == -32768)
991 /* don't allow sign hiding to violate the SPEC range */
994 if (dstCoeff
[minPos
] == 0)
996 else if (finalChange
== -1 && abs(dstCoeff
[minPos
]) == 1)
999 if (m_resiDctCoeff
[minPos
] >= 0)
1000 dstCoeff
[minPos
] += finalChange
;
1002 dstCoeff
[minPos
] -= finalChange
;
1013 /* Pattern decision for context derivation process of significant_coeff_flag */
/* Builds a two-bit pattern from the coefficient-group flags in
 * sigCoeffGroupFlag64 for the group at (cgPosX, cgPosY):
 *   bit 0 - flag of the group to the right (index + 1), forced to 0 in the
 *           last column
 *   bit 1 - flag of the group below (index + trSizeCG), forced to 0 in the
 *           last row
 * The (int32_t)(pos - (trSizeCG - 1)) >> 31 terms are all-ones masks that are
 * non-zero only while pos is less than trSizeCG - 1.
 * NOTE(review): original lines between the signature and the first statement
 * are missing from this listing. */
1014 uint32_t Quant::calcPatternSigCtx(uint64_t sigCoeffGroupFlag64
, uint32_t cgPosX
, uint32_t cgPosY
, uint32_t log2TrSizeCG
)
1019 const uint32_t trSizeCG
= 1 << log2TrSizeCG
;
1020 X265_CHECK(trSizeCG
<= 8, "transform CG is too large\n");
1021 const uint32_t sigPos
= (uint32_t)(sigCoeffGroupFlag64
>> (1 + (cgPosY
<< log2TrSizeCG
) + cgPosX
));
1022 const uint32_t sigRight
= ((int32_t)(cgPosX
- (trSizeCG
- 1)) >> 31) & (sigPos
& 1);
1023 const uint32_t sigLower
= ((int32_t)(cgPosY
- (trSizeCG
- 1)) >> 31) & (sigPos
>> (trSizeCG
- 2)) & 2;
1025 return sigRight
+ sigLower
;
1028 /* Context derivation process of coeff_abs_significant_flag */
/*   patternSigCtx  pattern from calcPatternSigCtx(); first index of table_cnt
 *   log2TrSize     log2 of the block width; 2 selects the 4x4 lookup table
 *   trSize         block width, used as a mask to extract posX
 *   blkPos         raster position of the coefficient inside the block
 *   bIsLuma        luma blocks get an extra offset of 3 outside the top-left
 *                  4x4 region
 *   firstSignificanceMapContext  base offset for the returned context
 *
 * rdoQuant() uses the result to index estBitsSbac.significantBits[].
 * NOTE(review): the table initialisers, the DC return value and the lines that
 * combine cnt with offset are missing from this listing. */
1029 uint32_t Quant::getSigCtxInc(uint32_t patternSigCtx
, uint32_t log2TrSize
, uint32_t trSize
, uint32_t blkPos
, bool bIsLuma
,
1030 uint32_t firstSignificanceMapContext
)
1032 static const uint8_t ctxIndMap
[16] =
1040 if (!blkPos
) // special case for the DC context variable
1043 if (log2TrSize
== 2) // 4x4
1044 return ctxIndMap
[blkPos
];
1046 const uint32_t posY
= blkPos
>> log2TrSize
;
1047 const uint32_t posX
= blkPos
& (trSize
- 1);
1048 X265_CHECK((blkPos
- (posY
<< log2TrSize
)) == posX
, "block pos check failed\n");
// position of the coefficient inside its 4x4 sub-block
1050 int posXinSubset
= blkPos
& 3;
1051 X265_CHECK((posX
& 3) == (blkPos
& 3), "pos alignment fail\n");
1052 int posYinSubset
= posY
& 3;
1054 // NOTE: [patternSigCtx][posXinSubset][posYinSubset]
1055 static const uint8_t table_cnt
[4][4][4] =
1057 // patternSigCtx = 0
1064 // patternSigCtx = 1
1071 // patternSigCtx = 2
1078 // patternSigCtx = 3
1087 int cnt
= table_cnt
[patternSigCtx
][posXinSubset
][posYinSubset
];
1088 int offset
= firstSignificanceMapContext
;
1092 return (bIsLuma
&& (posX
| posY
) >= 4) ? 3 + offset
: offset
;
1095 /* Calculates the cost of signaling the last significant coefficient in the block */
/* The cost starts as lastXBits[getGroupIdx(posx)] + lastYBits[getGroupIdx(posy)]
 * from the entropy coder's bit estimates.  For a coordinate of 3 or more an
 * extra IEP_RATE * ((ctx - 2) >> 1) is added; maskX / maskY are all-ones in
 * exactly that case because (2 - pos) is then negative.
 * NOTE(review): the return statement is missing from this listing. */
1096 inline uint32_t Quant::getRateLast(uint32_t posx
, uint32_t posy
) const
1098 uint32_t ctxX
= getGroupIdx(posx
);
1099 uint32_t ctxY
= getGroupIdx(posy
);
1100 uint32_t cost
= m_entropyCoder
->m_estBitsSbac
.lastXBits
[ctxX
] + m_entropyCoder
->m_estBitsSbac
.lastYBits
[ctxY
];
1102 int32_t maskX
= (int32_t)(2 - posx
) >> 31;
1103 int32_t maskY
= (int32_t)(2 - posy
) >> 31;
1105 cost
+= maskX
& (IEP_RATE
* ((ctxX
- 2) >> 1));
1106 cost
+= maskY
& (IEP_RATE
* ((ctxY
- 2) >> 1));
1110 /* Context derivation process of the significant coefficient group flag */
/* Returns 1 when the coefficient group to the right of, or below,
 * (cgPosX, cgPosY) is flagged in cgGroupMask, and 0 otherwise.  Neighbours
 * outside the block are ignored: the (int32_t)(pos - (trSizeCG - 1)) >> 31
 * masks are zero in the last column / last row.  rdoQuant() uses the result to
 * index estBitsSbac.significantCoeffGroupBits[]. */
1111 uint32_t Quant::getSigCoeffGroupCtxInc(uint64_t cgGroupMask
, uint32_t cgPosX
, uint32_t cgPosY
, uint32_t log2TrSizeCG
)
1113 const uint32_t trSizeCG
= 1 << log2TrSizeCG
;
1115 const uint32_t sigPos
= (uint32_t)(cgGroupMask
>> (1 + (cgPosY
<< log2TrSizeCG
) + cgPosX
));
1116 const uint32_t sigRight
= ((int32_t)(cgPosX
- (trSizeCG
- 1)) >> 31) & sigPos
;
1117 const uint32_t sigLower
= ((int32_t)(cgPosY
- (trSizeCG
- 1)) >> 31) & (sigPos
>> (trSizeCG
- 1));
1119 return (sigRight
| sigLower
) & 1;