source/common/quant.cpp

   1 /*****************************************************************************
   2  * Copyright (C) 2014 x265 project
   3  *
   4  * Authors: Steve Borho <steve@borho.org>
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  19  *
  20  * This program is also available under a commercial proprietary license.
  21  * For more information, contact us at license @ x265.com.
  22  *****************************************************************************/
  23
  24 #include "common.h"
  25 #include "primitives.h"
  26 #include "quant.h"
  27 #include "framedata.h"
  28 #include "entropy.h"
  29 #include "yuv.h"
  30 #include "cudata.h"
  31 #include "contexts.h"
  32
  33 using namespace x265;
  34
  35 #define SIGN(x,y) ((x^(y >> 31))-(y >> 31))
  36
  37 namespace {
  38
  39 struct coeffGroupRDStats
  40 {
  41     int     nnzBeforePos0;     /* indicates coeff other than pos 0 are coded */
  42     int64_t codedLevelAndDist; /* distortion and level cost of coded coefficients */
  43     int64_t uncodedDist;       /* uncoded distortion cost of coded coefficients */
  44     int64_t sigCost;           /* cost of signaling significant coeff bitmap */
  45     int64_t sigCost0;          /* cost of signaling sig coeff bit of coeff 0 */
  46 };
  47
  48 inline int fastMin(int x, int y)
  49 {
  50     return y + ((x - y) & ((x - y) >> (sizeof(int) * CHAR_BIT - 1))); // min(x, y)
  51 }
  52
  53 inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
  54 {
  55     X265_CHECK(c1c2Idx <= 3, "c1c2Idx check failure\n");
  56     X265_CHECK(absGoRice <= 4, "absGoRice check failure\n");
  57     if (!absLevel)
  58     {
  59         X265_CHECK(diffLevel < 0, "diffLevel check failure\n");
  60         return 0;
  61     }
  62     int rate = 0;
  63
  64     if (diffLevel < 0)
  65     {
  66         X265_CHECK(absLevel <= 2, "absLevel check failure\n");
  67         rate += greaterOneBits[(absLevel == 2)];
  68
  69         if (absLevel == 2)
  70             rate += levelAbsBits[0];
  71     }
  72     else
  73     {
  74         uint32_t symbol = diffLevel;
  75         const uint32_t maxVlc = g_goRiceRange[absGoRice];
  76         bool expGolomb = (symbol > maxVlc);
  77
  78         if (expGolomb)
  79         {
  80             absLevel = symbol - maxVlc;
  81
  82             // NOTE: mapping to x86 hardware instruction BSR
  83             unsigned long size;
  84             CLZ(size, absLevel);
  85             int egs = size * 2 + 1;
  86
  87             rate += egs << 15;
  88
  89             // NOTE: in here, expGolomb=true means (symbol >= maxVlc + 1)
  90             X265_CHECK(fastMin(symbol, (maxVlc + 1)) == (int)maxVlc + 1, "min check failure\n");
  91             symbol = maxVlc + 1;
  92         }
  93
  94         uint32_t prefLen = (symbol >> absGoRice) + 1;
  95         uint32_t numBins = fastMin(prefLen + absGoRice, 8 /* g_goRicePrefixLen[absGoRice] + absGoRice */);
  96
  97         rate += numBins << 15;
  98
  99         if (c1c2Idx & 1)
 100             rate += greaterOneBits[1];
 101
 102         if (c1c2Idx == 3)
 103             rate += levelAbsBits[1];
 104     }
 105     return rate;
 106 }
 107
 108 /* Calculates the cost for specific absolute transform level */
 109 inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
 110 {
 111     X265_CHECK(absLevel, "absLevel should not be zero\n");
 112
 113     if (diffLevel < 0)
 114     {
 115         X265_CHECK((absLevel == 1) || (absLevel == 2), "absLevel range check failure\n");
 116
 117         uint32_t rate = greaterOneBits[(absLevel == 2)];
 118         if (absLevel == 2)
 119             rate += levelAbsBits[0];
 120         return rate;
 121     }
 122     else
 123     {
 124         uint32_t rate;
 125         uint32_t symbol = diffLevel;
 126         if ((symbol >> absGoRice) < COEF_REMAIN_BIN_REDUCTION)
 127         {
 128             uint32_t length = symbol >> absGoRice;
 129             rate = (length + 1 + absGoRice) << 15;
 130         }
 131         else
 132         {
 133             uint32_t length = 0;
 134             symbol = (symbol >> absGoRice) - COEF_REMAIN_BIN_REDUCTION;
 135             if (symbol)
 136             {
 137                 unsigned long idx;
 138                 CLZ(idx, symbol + 1);
 139                 length = idx;
 140             }
 141
 142             rate = (COEF_REMAIN_BIN_REDUCTION + length + absGoRice + 1 + length) << 15;
 143         }
 144         if (c1c2Idx & 1)
 145             rate += greaterOneBits[1];
 146         if (c1c2Idx == 3)
 147             rate += levelAbsBits[1];
 148         return rate;
 149     }
 150 }
 151
 152 }
 153
 154 Quant::Quant()
 155 {
 156     m_resiDctCoeff = NULL;
 157     m_fencDctCoeff = NULL;
 158     m_fencShortBuf = NULL;
 159     m_frameNr      = NULL;
 160     m_nr           = NULL;
 161 }
 162
 163 bool Quant::init(bool useRDOQ, double psyScale, const ScalingList& scalingList, Entropy& entropy)
 164 {
 165     m_entropyCoder = &entropy;
 166     m_useRDOQ = useRDOQ;
 167     m_psyRdoqScale = (int64_t)(psyScale * 256.0);
 168     m_scalingList = &scalingList;
 169     m_resiDctCoeff = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE * 2);
 170     m_fencDctCoeff = m_resiDctCoeff + (MAX_TR_SIZE * MAX_TR_SIZE);
 171     m_fencShortBuf = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE);
 172
 173     return m_resiDctCoeff && m_fencShortBuf;
 174 }
 175
 176 bool Quant::allocNoiseReduction(const x265_param& param)
 177 {
 178     m_frameNr = X265_MALLOC(NoiseReduction, param.frameNumThreads);
 179     if (m_frameNr)
 180         memset(m_frameNr, 0, sizeof(NoiseReduction) * param.frameNumThreads);
 181     else
 182         return false;
 183     return true;
 184 }
 185
 186 Quant::~Quant()
 187 {
 188     X265_FREE(m_frameNr);
 189     X265_FREE(m_resiDctCoeff);
 190     X265_FREE(m_fencShortBuf);
 191 }
 192
 193 void Quant::setQPforQuant(const CUData& ctu)
 194 {
 195     m_nr = m_frameNr ? &m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL;
 196     int qpy = ctu.m_qp[0];
 197     m_qpParam[TEXT_LUMA].setQpParam(qpy + QP_BD_OFFSET);
 198     setChromaQP(qpy + ctu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat);
 199     setChromaQP(qpy + ctu.m_slice->m_pps->chromaQpOffset[1], TEXT_CHROMA_V, ctu.m_chromaFormat);
 200 }
 201
 202 void Quant::setChromaQP(int qpin, TextType ttype, int chFmt)
 203 {
 204     int qp = Clip3(-QP_BD_OFFSET, 57, qpin);
 205     if (qp >= 30)
 206     {
 207         if (chFmt == X265_CSP_I420)
 208             qp = g_chromaScale[qp];
 209         else
 210             qp = X265_MIN(qp, 51);
 211     }
 212     m_qpParam[ttype].setQpParam(qp + QP_BD_OFFSET);
 213 }
 214
 215 /* To minimize the distortion only. No rate is considered */
 216 uint32_t Quant::signBitHidingHDQ(int16_t* coeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codeParams)
 217 {
 218     const uint32_t log2TrSizeCG = codeParams.log2TrSizeCG;
 219     const uint16_t* scan = codeParams.scan;
 220     bool lastCG = true;
 221
 222     for (int cg = (1 << (log2TrSizeCG * 2)) - 1; cg >= 0; cg--)
 223     {
 224         int cgStartPos = cg << LOG2_SCAN_SET_SIZE;
 225         int n;
 226
 227         for (n = SCAN_SET_SIZE - 1; n >= 0; --n)
 228             if (coeff[scan[n + cgStartPos]])
 229                 break;
 230         if (n < 0)
 231             continue;
 232
 233         int lastNZPosInCG = n;
 234
 235         for (n = 0;; n++)
 236             if (coeff[scan[n + cgStartPos]])
 237                 break;
 238
 239         int firstNZPosInCG = n;
 240
 241         if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD)
 242         {
 243             uint32_t signbit = coeff[scan[cgStartPos + firstNZPosInCG]] > 0 ? 0 : 1;
 244             uint32_t absSum = 0;
 245
 246             for (n = firstNZPosInCG; n <= lastNZPosInCG; n++)
 247                 absSum += coeff[scan[n + cgStartPos]];
 248
 249             if (signbit != (absSum & 0x1)) // compare signbit with sum_parity
 250             {
 251                 int minCostInc = MAX_INT,  minPos = -1, curCost = MAX_INT;
 252                 int16_t finalChange = 0, curChange = 0;
 253
 254                 for (n = (lastCG ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n)
 255                 {
 256                     uint32_t blkPos = scan[n + cgStartPos];
 257                     if (coeff[blkPos])
 258                     {
 259                         if (deltaU[blkPos] > 0)
 260                         {
 261                             curCost = -deltaU[blkPos];
 262                             curChange = 1;
 263                         }
 264                         else
 265                         {
 266                             if (n == firstNZPosInCG && abs(coeff[blkPos]) == 1)
 267                                 curCost = MAX_INT;
 268                             else
 269                             {
 270                                 curCost = deltaU[blkPos];
 271                                 curChange = -1;
 272                             }
 273                         }
 274                     }
 275                     else
 276                     {
 277                         if (n < firstNZPosInCG)
 278                         {
 279                             uint32_t thisSignBit = m_resiDctCoeff[blkPos] >= 0 ? 0 : 1;
 280                             if (thisSignBit != signbit)
 281                                 curCost = MAX_INT;
 282                             else
 283                             {
 284                                 curCost = -deltaU[blkPos];
 285                                 curChange = 1;
 286                             }
 287                         }
 288                         else
 289                         {
 290                             curCost = -deltaU[blkPos];
 291                             curChange = 1;
 292                         }
 293                     }
 294
 295                     if (curCost < minCostInc)
 296                     {
 297                         minCostInc = curCost;
 298                         finalChange = curChange;
 299                         minPos = blkPos;
 300                     }
 301                 }
 302
 303                 /* do not allow change to violate coeff clamp */
 304                 if (coeff[minPos] == 32767 || coeff[minPos] == -32768)
 305                     finalChange = -1;
 306
 307                 if (!coeff[minPos])
 308                     numSig++;
 309                 else if (finalChange == -1 && abs(coeff[minPos]) == 1)
 310                     numSig--;
 311
 312                 if (m_resiDctCoeff[minPos] >= 0)
 313                     coeff[minPos] += finalChange;
 314                 else
 315                     coeff[minPos] -= finalChange;
 316             }
 317         }
 318
 319         lastCG = false;
 320     }
 321
 322     return numSig;
 323 }
 324
 325 uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride,
 326                              coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip)
 327 {
 328     const uint32_t sizeIdx = log2TrSize - 2;
 329     if (cu.m_tqBypass[absPartIdx])
 330     {
 331         X265_CHECK(log2TrSize >= 2 && log2TrSize <= 5, "Block size mistake!\n");
 332         return primitives.copy_cnt[sizeIdx](coeff, residual, resiStride);
 333     }
 334
 335     bool isLuma  = ttype == TEXT_LUMA;
 336     bool usePsy  = m_psyRdoqScale && isLuma && !useTransformSkip;
 337     int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform
 338
 339     X265_CHECK((cu.m_slice->m_sps->quadtreeTULog2MaxSize >= log2TrSize), "transform size too large\n");
 340     if (useTransformSkip)
 341     {
 342 #if X265_DEPTH <= 10
 343         X265_CHECK(transformShift >= 0, "invalid transformShift\n");
 344         primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift);
 345 #else
 346         if (transformShift >= 0)
 347             primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift);
 348         else
 349             primitives.cpy2Dto1D_shr[sizeIdx](m_resiDctCoeff, residual, resiStride, -transformShift);
 350 #endif
 351     }
 352     else
 353     {
 354         bool isIntra = cu.isIntra(absPartIdx);
 355         int useDST = !sizeIdx && isLuma && isIntra;
 356         int index = DCT_4x4 + sizeIdx - useDST;
 357
 358         primitives.dct[index](residual, m_resiDctCoeff, resiStride);
 359
 360         /* NOTE: if RDOQ is disabled globally, psy-rdoq is also disabled, so
 361          * there is no risk of performing this DCT unnecessarily */
 362         if (usePsy)
 363         {
 364             int trSize = 1 << log2TrSize;
 365             /* perform DCT on source pixels for psy-rdoq */
 366             primitives.luma_copy_ps[sizeIdx](m_fencShortBuf, trSize, fenc, fencStride);
 367             primitives.dct[index](m_fencShortBuf, m_fencDctCoeff, trSize);
 368         }
 369
 370         if (m_nr)
 371         {
 372             /* denoise is not applied to intra residual, so DST can be ignored */
 373             int cat = sizeIdx + 4 * !isLuma + 8 * !isIntra;
 374             int numCoeff = 1 << (log2TrSize * 2);
 375             primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff);
 376             m_nr->count[cat]++;
 377         }
 378     }
 379
 380     if (m_useRDOQ)
 381         return rdoQuant(cu, coeff, log2TrSize, ttype, absPartIdx, usePsy);
 382     else
 383     {
 384         int deltaU[32 * 32];
 385
 386         int scalingListType = ttype + (isLuma ? 3 : 0);
 387         int rem = m_qpParam[ttype].rem;
 388         int per = m_qpParam[ttype].per;
 389         const int32_t* quantCoeff = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
 390
 391         int qbits = QUANT_SHIFT + per + transformShift;
 392         int add = (cu.m_slice->m_sliceType == I_SLICE ? 171 : 85) << (qbits - 9);
 393         int numCoeff = 1 << (log2TrSize * 2);
 394
 395         uint32_t numSig = primitives.quant(m_resiDctCoeff, quantCoeff, deltaU, coeff, qbits, add, numCoeff);
 396
 397         if (numSig >= 2 && cu.m_slice->m_pps->bSignHideEnabled)
 398         {
 399             TUEntropyCodingParameters codeParams;
 400             cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, isLuma);
 401             return signBitHidingHDQ(coeff, deltaU, numSig, codeParams);
 402         }
 403         else
 404             return numSig;
 405     }
 406 }
 407
 408 void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t resiStride, const coeff_t* coeff,
 409                             uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig)
 410 {
 411     const uint32_t sizeIdx = log2TrSize - 2;
 412     if (transQuantBypass)
 413     {
 414         primitives.cpy1Dto2D_shl[sizeIdx](residual, coeff, resiStride, 0);
 415         return;
 416     }
 417
 418     // Values need to pass as input parameter in dequant
 419     int rem = m_qpParam[ttype].rem;
 420     int per = m_qpParam[ttype].per;
 421     int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize;
 422     int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift;
 423     int numCoeff = 1 << (log2TrSize * 2);
 424
 425     if (m_scalingList->m_bEnabled)
 426     {
 427         int scalingListType = (bIntra ? 0 : 3) + ttype;
 428         const int32_t* dequantCoef = m_scalingList->m_dequantCoef[sizeIdx][scalingListType][rem];
 429         primitives.dequant_scaling(coeff, dequantCoef, m_resiDctCoeff, numCoeff, per, shift);
 430     }
 431     else
 432     {
 433         int scale = m_scalingList->s_invQuantScales[rem] << per;
 434         primitives.dequant_normal(coeff, m_resiDctCoeff, numCoeff, scale, shift);
 435     }
 436
 437     if (useTransformSkip)
 438     {
 439 #if X265_DEPTH <= 10
 440         X265_CHECK(transformShift > 0, "invalid transformShift\n");
 441         primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift);
 442 #else
 443         if (transformShift > 0)
 444             primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift);
 445         else
 446             primitives.cpy1Dto2D_shl[sizeIdx](residual, m_resiDctCoeff, resiStride, -transformShift);
 447 #endif
 448     }
 449     else
 450     {
 451         int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra;
 452
 453         X265_CHECK((int)numSig == primitives.count_nonzero(coeff, 1 << (log2TrSize * 2)), "numSig differ\n");
 454
 455         // DC only
 456         if (numSig == 1 && coeff[0] != 0 && !useDST)
 457         {
 458             const int shift_1st = 7 - 6;
 459             const int add_1st = 1 << (shift_1st - 1);
 460             const int shift_2nd = 12 - (X265_DEPTH - 8) - 3;
 461             const int add_2nd = 1 << (shift_2nd - 1);
 462
 463             int dc_val = (((m_resiDctCoeff[0] * (64 >> 6) + add_1st) >> shift_1st) * (64 >> 3) + add_2nd) >> shift_2nd;
 464             primitives.blockfill_s[sizeIdx](residual, resiStride, (int16_t)dc_val);
 465             return;
 466         }
 467
 468         primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_resiDctCoeff, residual, resiStride);
 469     }
 470 }
 471
 472 /* Rate distortion optimized quantization for entropy coding engines using
 473  * probability models like CABAC */
 474 uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy)
 475 {
 476     int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
 477     int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype;
 478
 479     X265_CHECK(scalingListType < 6, "scaling list type out of range\n");
 480
 481     int rem = m_qpParam[ttype].rem;
 482     int per = m_qpParam[ttype].per;
 483     int qbits = QUANT_SHIFT + per + transformShift; /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */
 484     int add = (1 << (qbits - 1));
 485     const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
 486
 487     int numCoeff = 1 << (log2TrSize * 2);
 488
 489     uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff);
 490
 491     X265_CHECK((int)numSig == primitives.count_nonzero(dstCoeff, 1 << (log2TrSize * 2)), "numSig differ\n");
 492     if (!numSig)
 493         return 0;
 494
 495     uint32_t trSize = 1 << log2TrSize;
 496     int64_t lambda2 = m_qpParam[ttype].lambda2;
 497     int64_t psyScale = (m_psyRdoqScale * m_qpParam[ttype].lambda);
 498
 499     /* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4)
 500      * scale applied that must be removed during unquant. Note that in real dequant there is clipping
 501      * at several stages. We skip the clipping for simplicity when measuring RD cost */
 502     const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
 503     int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0);
 504     int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0;
 505     int scaleBits = SCALE_BITS - 2 * transformShift;
 506
 507 #define UNQUANT(lvl)    (((lvl) * (unquantScale[blkPos] << per) + unquantRound) >> unquantShift)
 508 #define SIGCOST(bits)   ((lambda2 * (bits)) >> 8)
 509 #define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits))
 510 #define PSYVALUE(rec)   ((psyScale * (rec)) >> (16 - scaleBits))
 511
 512     int64_t costCoeff[32 * 32];   /* d*d + lambda * bits */
 513     int64_t costUncoded[32 * 32]; /* d*d + lambda * 0    */
 514     int64_t costSig[32 * 32];     /* lambda * bits       */
 515
 516     int rateIncUp[32 * 32];      /* signal overhead of increasing level */
 517     int rateIncDown[32 * 32];    /* signal overhead of decreasing level */
 518     int sigRateDelta[32 * 32];   /* signal difference between zero and non-zero */
 519
 520     int64_t costCoeffGroupSig[MLS_GRP_NUM]; /* lambda * bits of group coding cost */
 521     uint64_t sigCoeffGroupFlag64 = 0;
 522
 523     uint32_t ctxSet      = 0;
 524     int    c1            = 1;
 525     int    c2            = 0;
 526     uint32_t goRiceParam = 0;
 527     uint32_t c1Idx       = 0;
 528     uint32_t c2Idx       = 0;
 529     int cgLastScanPos    = -1;
 530     int lastScanPos      = -1;
 531     const uint32_t cgSize = (1 << MLS_CG_SIZE); /* 4x4 num coef = 16 */
 532     bool bIsLuma = ttype == TEXT_LUMA;
 533
 534     /* total rate distortion cost of transform block, as CBF=0 */
 535     int64_t totalUncodedCost = 0;
 536
 537     /* Total rate distortion cost of this transform block, counting te distortion of uncoded blocks,
 538      * the distortion and signal cost of coded blocks, and the coding cost of significant
 539      * coefficient and coefficient group bitmaps */
 540     int64_t totalRdCost = 0;
 541
 542     TUEntropyCodingParameters codeParams;
 543     cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma);
 544     const uint32_t cgNum = 1 << (codeParams.log2TrSizeCG * 2);
 545
 546     /* TODO: update bit estimates if dirty */
 547     EstBitsSbac& estBitsSbac = m_entropyCoder->m_estBitsSbac;
 548
 549     uint32_t scanPos;
 550     coeffGroupRDStats cgRdStats;
 551
 552     /* iterate over coding groups in reverse scan order */
 553     for (int cgScanPos = cgNum - 1; cgScanPos >= 0; cgScanPos--)
 554     {
 555         const uint32_t cgBlkPos = codeParams.scanCG[cgScanPos];
 556         const uint32_t cgPosY   = cgBlkPos >> codeParams.log2TrSizeCG;
 557         const uint32_t cgPosX   = cgBlkPos - (cgPosY << codeParams.log2TrSizeCG);
 558         const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos);
 559         memset(&cgRdStats, 0, sizeof(coeffGroupRDStats));
 560
 561         const int patternSigCtx = calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, codeParams.log2TrSizeCG);
 562
 563         /* iterate over coefficients in each group in reverse scan order */
 564         for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--)
 565         {
 566             scanPos              = (cgScanPos << MLS_CG_SIZE) + scanPosinCG;
 567             uint32_t blkPos      = codeParams.scan[scanPos];
 568             uint16_t maxAbsLevel = (int16_t)abs(dstCoeff[blkPos]);             /* abs(quantized coeff) */
 569             int signCoef         = m_resiDctCoeff[blkPos];            /* pre-quantization DCT coeff */
 570             int predictedCoef    = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/
 571
 572             /* RDOQ measures distortion as the squared difference between the unquantized coded level
 573              * and the original DCT coefficient. The result is shifted scaleBits to account for the
 574              * FIX15 nature of the CABAC cost tables minus the forward transform scale */
 575
 576             /* cost of not coding this coefficient (all distortion, no signal bits) */
 577             costUncoded[scanPos] = (int64_t)(signCoef * signCoef) << scaleBits;
 578             if (usePsy && blkPos)
 579                 /* when no residual coefficient is coded, predicted coef == recon coef */
 580                 costUncoded[scanPos] -= PSYVALUE(predictedCoef);
 581
 582             totalUncodedCost += costUncoded[scanPos];
 583
 584             if (maxAbsLevel && lastScanPos < 0)
 585             {
 586                 /* remember the first non-zero coef found in this reverse scan as the last pos */
 587                 lastScanPos   = scanPos;
 588                 ctxSet        = (scanPos < SCAN_SET_SIZE || !bIsLuma) ? 0 : 2;
 589                 cgLastScanPos = cgScanPos;
 590             }
 591
 592             if (lastScanPos < 0)
 593             {
 594                 /* coefficients after lastNZ have no distortion signal cost */
 595                 costCoeff[scanPos] = 0;
 596                 costSig[scanPos] = 0;
 597
 598                 /* No non-zero coefficient yet found, but this does not mean
 599                  * there is no uncoded-cost for this coefficient. Pre-
 600                  * quantization the coefficient may have been non-zero */
 601                 totalRdCost += costUncoded[scanPos];
 602             }
 603             else
 604             {
 605                 const uint32_t c1c2Idx = ((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)) + (((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1) * 2;
 606                 const uint32_t baseLevel = ((uint32_t)0xD9 >> (c1c2Idx * 2)) & 3;  // {1, 2, 1, 3}
 607
 608                 X265_CHECK(!!((int)c1Idx < C1FLAG_NUMBER) == (int)((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)), "scan validation 1\n");
 609                 X265_CHECK(!!(c2Idx == 0) == ((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1, "scan validation 2\n");
 610                 X265_CHECK((int)baseLevel == ((c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx == 0)) : 1), "scan validation 3\n");
 611
 612                 // coefficient level estimation
 613                 const uint32_t oneCtx = 4 * ctxSet + c1;
 614                 const uint32_t absCtx = ctxSet + c2;
 615                 const int* greaterOneBits = estBitsSbac.greaterOneBits[oneCtx];
 616                 const int* levelAbsBits = estBitsSbac.levelAbsBits[absCtx];
 617
 618                 uint16_t level = 0;
 619                 uint32_t sigCoefBits = 0;
 620                 costCoeff[scanPos] = MAX_INT64;
 621
 622                 if ((int)scanPos == lastScanPos)
 623                     sigRateDelta[blkPos] = 0;
 624                 else
 625                 {
 626                     const uint32_t ctxSig = getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext);
 627                     if (maxAbsLevel < 3)
 628                     {
 629                         /* set default costs to uncoded costs */
 630                         costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[ctxSig][0]);
 631                         costCoeff[scanPos] = costUncoded[scanPos] + costSig[scanPos];
 632                     }
 633                     sigRateDelta[blkPos] = estBitsSbac.significantBits[ctxSig][1] - estBitsSbac.significantBits[ctxSig][0];
 634                     sigCoefBits = estBitsSbac.significantBits[ctxSig][1];
 635                 }
 636                 if (maxAbsLevel)
 637                 {
 638                     uint16_t minAbsLevel = X265_MAX(maxAbsLevel - 1, 1);
 639                     for (uint16_t lvl = maxAbsLevel; lvl >= minAbsLevel; lvl--)
 640                     {
 641                         uint32_t levelBits = getICRateCost(lvl, lvl - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx) + IEP_RATE;
 642
 643                         int unquantAbsLevel = UNQUANT(lvl);
 644                         int d = abs(signCoef) - unquantAbsLevel;
 645                         int64_t curCost = RDCOST(d, sigCoefBits + levelBits);
 646
 647                         /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */
 648                         if (usePsy && blkPos)
 649                         {
 650                             int reconCoef = abs(unquantAbsLevel + SIGN(predictedCoef, signCoef));
 651                             curCost -= PSYVALUE(reconCoef);
 652                         }
 653
 654                         if (curCost < costCoeff[scanPos])
 655                         {
 656                             level = lvl;
 657                             costCoeff[scanPos] = curCost;
 658                             costSig[scanPos] = SIGCOST(sigCoefBits);
 659                         }
 660                     }
 661                 }
 662
 663                 dstCoeff[blkPos] = level;
 664                 totalRdCost += costCoeff[scanPos];
 665
 666                 /* record costs for sign-hiding performed at the end */
 667                 if (level)
 668                 {
 669                     int rateNow = getICRate(level, level - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx);
 670                     rateIncUp[blkPos] = getICRate(level + 1, level + 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx) - rateNow;
 671                     rateIncDown[blkPos] = getICRate(level - 1, level - 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx) - rateNow;
 672                 }
 673                 else
 674                 {
 675                     rateIncUp[blkPos] = greaterOneBits[0];
 676                     rateIncDown[blkPos] = 0;
 677                 }
 678
 679                 /* Update CABAC estimation state */
 680                 if (level >= baseLevel && goRiceParam < 4 && level > (3U << goRiceParam))
 681                     goRiceParam++;
 682
 683                 c1Idx -= (-(int32_t)level) >> 31;
 684
 685                 /* update bin model */
 686                 if (level > 1)
 687                 {
 688                     c1 = 0;
 689                     c2 += (uint32_t)(c2 - 2) >> 31;
 690                     c2Idx++;
 691                 }
 692                 else if ((c1 < 3) && (c1 > 0) && level)
 693                     c1++;
 694
 695                 /* context set update */
 696                 if (!(scanPos % SCAN_SET_SIZE) && scanPos)
 697                 {
 698                     c2 = 0;
 699                     goRiceParam = 0;
 700
 701                     c1Idx = 0;
 702                     c2Idx = 0;
 703                     ctxSet = (scanPos == SCAN_SET_SIZE || !bIsLuma) ? 0 : 2;
 704                     X265_CHECK(c1 >= 0, "c1 is negative\n");
 705                     ctxSet -= ((int32_t)(c1 - 1) >> 31);
 706                     c1 = 1;
 707                 }
 708             }
 709
 710             cgRdStats.sigCost += costSig[scanPos];
 711             if (!scanPosinCG)
 712                 cgRdStats.sigCost0 = costSig[scanPos];
 713
 714             if (dstCoeff[blkPos])
 715             {
 716                 sigCoeffGroupFlag64 |= cgBlkPosMask;
 717                 cgRdStats.codedLevelAndDist += costCoeff[scanPos] - costSig[scanPos];
 718                 cgRdStats.uncodedDist += costUncoded[scanPos];
 719                 cgRdStats.nnzBeforePos0 += scanPosinCG;
 720             }
 721         } /* end for (scanPosinCG) */
 722
 723         costCoeffGroupSig[cgScanPos] = 0;
 724
 725         if (cgLastScanPos < 0)
 726         {
 727             /* nothing to do at this point */
 728         }
 729         else if (!cgScanPos || cgScanPos == cgLastScanPos)
 730         {
 731             /* coeff group 0 is implied to be present, no signal cost */
 732             /* coeff group with last NZ is implied to be present, handled below */
 733         }
 734         else if (sigCoeffGroupFlag64 & cgBlkPosMask)
 735         {
 736             if (!cgRdStats.nnzBeforePos0)
 737             {
 738                 /* if only coeff 0 in this CG is coded, its significant coeff bit is implied */
 739                 totalRdCost -= cgRdStats.sigCost0;
 740                 cgRdStats.sigCost -= cgRdStats.sigCost0;
 741             }
 742
 743             /* there are coded coefficients in this group, but now we include the signaling cost
 744              * of the significant coefficient group flag and evaluate whether the RD cost of the
 745              * coded group is more than the RD cost of the uncoded group */
 746
 747             uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, codeParams.log2TrSizeCG);
 748
 749             int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]);
 750             costZeroCG += cgRdStats.uncodedDist;       /* add distortion for resetting non-zero levels to zero levels */
 751             costZeroCG -= cgRdStats.codedLevelAndDist; /* remove distortion and level cost of coded coefficients */
 752             costZeroCG -= cgRdStats.sigCost;           /* remove signaling cost of significant coeff bitmap */
 753
 754             costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][1]);
 755             totalRdCost += costCoeffGroupSig[cgScanPos];  /* add the cost of 1 bit in significant CG bitmap */
 756
 757             if (costZeroCG < totalRdCost)
 758             {
 759                 sigCoeffGroupFlag64 &= ~cgBlkPosMask;
 760                 totalRdCost = costZeroCG;
 761                 costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]);
 762
 763                 /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */
 764                 for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--)
 765                 {
 766                     scanPos = cgScanPos * cgSize + scanPosinCG;
 767                     uint32_t blkPos = codeParams.scan[scanPos];
 768                     if (dstCoeff[blkPos])
 769                     {
 770                         costCoeff[scanPos] = costUncoded[scanPos];
 771                         costSig[scanPos] = 0;
 772                     }
 773                     dstCoeff[blkPos] = 0;
 774                 }
 775             }
 776         }
 777         else
 778         {
 779             /* there were no coded coefficients in this coefficient group */
 780             uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, codeParams.log2TrSizeCG);
 781             costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]);
 782             totalRdCost += costCoeffGroupSig[cgScanPos];  /* add cost of 0 bit in significant CG bitmap */
 783             totalRdCost -= cgRdStats.sigCost;             /* remove cost of significant coefficient bitmap */
 784         }
 785     } /* end for (cgScanPos) */
 786
 787     X265_CHECK(lastScanPos >= 0, "numSig non zero, but no coded CG\n");
 788
 789     /* calculate RD cost of uncoded block CBF=0, and add cost of CBF=1 to total */
 790     int64_t bestCost;
 791     if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx])
 792     {
 793         bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]);
 794         totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]);
 795     }
 796     else
 797     {
 798         int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]];
 799         bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]);
 800         totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]);
 801     }
 802
 803     /* This loop starts with the last non-zero found in the first loop and then refines this last
 804      * non-zero by measuring the true RD cost of the last NZ at this position, and then the RD costs
 805      * at all previous coefficients until a coefficient greater than 1 is encountered or we run out
 806      * of coefficients to evaluate.  This will factor in the cost of coding empty groups and empty
 807      * coeff prior to the last NZ. The base best cost is the RD cost of CBF=0 */
 808     int  bestLastIdx = 0;
 809     bool foundLast = false;
 810     for (int cgScanPos = cgLastScanPos; cgScanPos >= 0 && !foundLast; cgScanPos--)
 811     {
 812         if (!cgScanPos || cgScanPos == cgLastScanPos)
 813         {
 814             /* the presence of these coefficient groups are inferred, they have no bit in
 815              * sigCoeffGroupFlag64 and no saved costCoeffGroupSig[] cost */
 816         }
 817         else if (sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[cgScanPos]))
 818         {
 819             /* remove cost of significant coeff group flag, the group's presence would be inferred
 820              * from lastNZ if it were present in this group */
 821             totalRdCost -= costCoeffGroupSig[cgScanPos];
 822         }
 823         else
 824         {
 825             /* remove cost of signaling this empty group as not present */
 826             totalRdCost -= costCoeffGroupSig[cgScanPos];
 827             continue;
 828         }
 829
 830         for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--)
 831         {
 832             scanPos = cgScanPos * cgSize + scanPosinCG;
 833             if ((int)scanPos > lastScanPos)
 834                 continue;
 835
 836             /* if the coefficient was coded, measure the RD cost of it as the last non-zero and then
 837              * continue as if it were uncoded. If the coefficient was already uncoded, remove the
 838              * cost of signaling it as not-significant */
 839             uint32_t blkPos = codeParams.scan[scanPos];
 840             if (dstCoeff[blkPos])
 841             {
 842                 /* Swap the cost of signaling its significant coeff bit with the cost of
 843                  * signaling its lastNZ pos */
 844                 uint32_t posY = blkPos >> log2TrSize;
 845                 uint32_t posX = blkPos - (posY << log2TrSize);
 846                 uint32_t bitsLastNZ = codeParams.scanType == SCAN_VER ? getRateLast(posY, posX) : getRateLast(posX, posY);
 847                 int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ);
 848
 849                 if (costAsLast < bestCost)
 850                 {
 851                     bestLastIdx = scanPos + 1;
 852                     bestCost = costAsLast;
 853                 }
 854                 if (dstCoeff[blkPos] > 1)
 855                 {
 856                     foundLast = true;
 857                     break;
 858                 }
 859
 860                 totalRdCost -= costCoeff[scanPos];
 861                 totalRdCost += costUncoded[scanPos];
 862             }
 863             else
 864                 totalRdCost -= costSig[scanPos];
 865         }
 866     }
 867
 868     /* recount non-zero coefficients and re-apply sign of DCT coef */
 869     numSig = 0;
 870     for (int pos = 0; pos < bestLastIdx; pos++)
 871     {
 872         int blkPos = codeParams.scan[pos];
 873         int level  = dstCoeff[blkPos];
 874         numSig += (level != 0);
 875
 876         uint32_t mask = (int32_t)m_resiDctCoeff[blkPos] >> 31;
 877         dstCoeff[blkPos] = (int16_t)((level ^ mask) - mask);
 878     }
 879
 880     /* clean uncoded coefficients */
 881     for (int pos = bestLastIdx; pos <= lastScanPos; pos++)
 882         dstCoeff[codeParams.scan[pos]] = 0;
 883
 884     /* rate-distortion based sign-hiding */
 885     if (cu.m_slice->m_pps->bSignHideEnabled && numSig >= 2)
 886     {
 887         int lastCG = true;
 888         for (int subSet = cgLastScanPos; subSet >= 0; subSet--)
 889         {
 890             int subPos = subSet << LOG2_SCAN_SET_SIZE;
 891             int n;
 892
 893             /* measure distance between first and last non-zero coef in this
 894              * coding group */
 895             for (n = SCAN_SET_SIZE - 1; n >= 0; --n)
 896                 if (dstCoeff[codeParams.scan[n + subPos]])
 897                     break;
 898             if (n < 0)
 899                 continue;
 900
 901             int lastNZPosInCG = n;
 902
 903             for (n = 0;; n++)
 904                 if (dstCoeff[codeParams.scan[n + subPos]])
 905                     break;
 906
 907             int firstNZPosInCG = n;
 908
 909             if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD)
 910             {
 911                 uint32_t signbit = (dstCoeff[codeParams.scan[subPos + firstNZPosInCG]] > 0 ? 0 : 1);
 912                 int absSum = 0;
 913
 914                 for (n = firstNZPosInCG; n <= lastNZPosInCG; n++)
 915                     absSum += dstCoeff[codeParams.scan[n + subPos]];
 916
 917                 if (signbit != (absSum & 1U))
 918                 {
 919                     /* We must find a coeff to toggle up or down so the sign bit of the first non-zero coeff
 920                      * is properly implied. Note dstCoeff[] are signed by this point but curChange and
 921                      * finalChange imply absolute levels (+1 is away from zero, -1 is towards zero) */
 922
 923                     int64_t minCostInc = MAX_INT64, curCost = MAX_INT64;
 924                     int minPos = -1;
 925                     int16_t finalChange = 0, curChange = 0;
 926
 927                     for (n = (lastCG ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n)
 928                     {
 929                         uint32_t blkPos = codeParams.scan[n + subPos];
 930                         int signCoef    = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */
 931                         int absLevel    = abs(dstCoeff[blkPos]);
 932
 933                         int d = abs(signCoef) - UNQUANT(absLevel);
 934                         int64_t origDist = (((int64_t)d * d)) << scaleBits;
 935
 936 #define DELTARDCOST(d, deltabits) ((((int64_t)d * d) << scaleBits) - origDist + ((lambda2 * (int64_t)(deltabits)) >> 8))
 937
 938                         if (dstCoeff[blkPos])
 939                         {
 940                             d = abs(signCoef) - UNQUANT(absLevel + 1);
 941                             int64_t costUp = DELTARDCOST(d, rateIncUp[blkPos]);
 942
 943                             /* if decrementing would make the coeff 0, we can include the
 944                              * significant coeff flag cost savings */
 945                             d = abs(signCoef) - UNQUANT(absLevel - 1);
 946                             bool isOne = abs(dstCoeff[blkPos]) == 1;
 947                             int downBits = rateIncDown[blkPos] - (isOne ? (IEP_RATE + sigRateDelta[blkPos]) : 0);
 948                             int64_t costDown = DELTARDCOST(d, downBits);
 949
 950                             if (lastCG && lastNZPosInCG == n && isOne)
 951                                 costDown -= 4 * IEP_RATE;
 952
 953                             if (costUp < costDown)
 954                             {
 955                                 curCost = costUp;
 956                                 curChange =  1;
 957                             }
 958                             else
 959                             {
 960                                 curChange = -1;
 961                                 if (n == firstNZPosInCG && isOne)
 962                                     curCost = MAX_INT64;
 963                                 else
 964                                     curCost = costDown;
 965                             }
 966                         }
 967                         else if (n < firstNZPosInCG && signbit != (signCoef >= 0 ? 0 : 1U))
 968                         {
 969                             /* don't try to make a new coded coeff before the first coeff if its
 970                              * sign would be different than the first coeff, the inferred sign would
 971                              * still be wrong and we'd have to do this again. */
 972                             curCost = MAX_INT64;
 973                         }
 974                         else
 975                         {
 976                             /* evaluate changing an uncoded coeff 0 to a coded coeff +/-1 */
 977                             d = abs(signCoef) - UNQUANT(1);
 978                             curCost = DELTARDCOST(d, rateIncUp[blkPos] + IEP_RATE + sigRateDelta[blkPos]);
 979                             curChange = 1;
 980                         }
 981
 982                         if (curCost < minCostInc)
 983                         {
 984                             minCostInc = curCost;
 985                             finalChange = curChange;
 986                             minPos = blkPos;
 987                         }
 988                     }
 989
 990                     if (dstCoeff[minPos] == 32767 || dstCoeff[minPos] == -32768)
 991                         /* don't allow sign hiding to violate the SPEC range */
 992                         finalChange = -1;
 993
 994                     if (dstCoeff[minPos] == 0)
 995                         numSig++;
 996                     else if (finalChange == -1 && abs(dstCoeff[minPos]) == 1)
 997                         numSig--;
 998
 999                     if (m_resiDctCoeff[minPos] >= 0)
1000                         dstCoeff[minPos] += finalChange;
1001                     else
1002                         dstCoeff[minPos] -= finalChange;
1003                 }
1004             }
1005
1006             lastCG = false;
1007         }
1008     }
1009
1010     return numSig;
1011 }
1012
1013 /* Pattern decision for context derivation process of significant_coeff_flag */
1014 uint32_t Quant::calcPatternSigCtx(uint64_t sigCoeffGroupFlag64, uint32_t cgPosX, uint32_t cgPosY, uint32_t log2TrSizeCG)
1015 {
1016     if (!log2TrSizeCG)
1017         return 0;
1018
1019     const uint32_t trSizeCG = 1 << log2TrSizeCG;
1020     X265_CHECK(trSizeCG <= 8, "transform CG is too large\n");
1021     const uint32_t sigPos = (uint32_t)(sigCoeffGroupFlag64 >> (1 + (cgPosY << log2TrSizeCG) + cgPosX));
1022     const uint32_t sigRight = ((int32_t)(cgPosX - (trSizeCG - 1)) >> 31) & (sigPos & 1);
1023     const uint32_t sigLower = ((int32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 2)) & 2;
1024
1025     return sigRight + sigLower;
1026 }
1027
1028 /* Context derivation process of coeff_abs_significant_flag */
1029 uint32_t Quant::getSigCtxInc(uint32_t patternSigCtx, uint32_t log2TrSize, uint32_t trSize, uint32_t blkPos, bool bIsLuma,
1030                              uint32_t firstSignificanceMapContext)
1031 {
1032     static const uint8_t ctxIndMap[16] =
1033     {
1034         0, 1, 4, 5,
1035         2, 3, 4, 5,
1036         6, 6, 8, 8,
1037         7, 7, 8, 8
1038     };
1039
1040     if (!blkPos) // special case for the DC context variable
1041         return 0;
1042
1043     if (log2TrSize == 2) // 4x4
1044         return ctxIndMap[blkPos];
1045
1046     const uint32_t posY = blkPos >> log2TrSize;
1047     const uint32_t posX = blkPos & (trSize - 1);
1048     X265_CHECK((blkPos - (posY << log2TrSize)) == posX, "block pos check failed\n");
1049
1050     int posXinSubset = blkPos & 3;
1051     X265_CHECK((posX & 3) == (blkPos & 3), "pos alignment fail\n");
1052     int posYinSubset = posY & 3;
1053
1054     // NOTE: [patternSigCtx][posXinSubset][posYinSubset]
1055     static const uint8_t table_cnt[4][4][4] =
1056     {
1057         // patternSigCtx = 0
1058         {
1059             { 2, 1, 1, 0 },
1060             { 1, 1, 0, 0 },
1061             { 1, 0, 0, 0 },
1062             { 0, 0, 0, 0 },
1063         },
1064         // patternSigCtx = 1
1065         {
1066             { 2, 1, 0, 0 },
1067             { 2, 1, 0, 0 },
1068             { 2, 1, 0, 0 },
1069             { 2, 1, 0, 0 },
1070         },
1071         // patternSigCtx = 2
1072         {
1073             { 2, 2, 2, 2 },
1074             { 1, 1, 1, 1 },
1075             { 0, 0, 0, 0 },
1076             { 0, 0, 0, 0 },
1077         },
1078         // patternSigCtx = 3
1079         {
1080             { 2, 2, 2, 2 },
1081             { 2, 2, 2, 2 },
1082             { 2, 2, 2, 2 },
1083             { 2, 2, 2, 2 },
1084         }
1085     };
1086
1087     int cnt = table_cnt[patternSigCtx][posXinSubset][posYinSubset];
1088     int offset = firstSignificanceMapContext;
1089
1090     offset += cnt;
1091
1092     return (bIsLuma && (posX | posY) >= 4) ? 3 + offset : offset;
1093 }
1094
1095 /* Calculates the cost of signaling the last significant coefficient in the block */
1096 inline uint32_t Quant::getRateLast(uint32_t posx, uint32_t posy) const
1097 {
1098     uint32_t ctxX = getGroupIdx(posx);
1099     uint32_t ctxY = getGroupIdx(posy);
1100     uint32_t cost = m_entropyCoder->m_estBitsSbac.lastXBits[ctxX] + m_entropyCoder->m_estBitsSbac.lastYBits[ctxY];
1101
1102     int32_t maskX = (int32_t)(2 - posx) >> 31;
1103     int32_t maskY = (int32_t)(2 - posy) >> 31;
1104
1105     cost += maskX & (IEP_RATE * ((ctxX - 2) >> 1));
1106     cost += maskY & (IEP_RATE * ((ctxY - 2) >> 1));
1107     return cost;
1108 }
1109
1110 /* Context derivation process of coeff_abs_significant_flag */
1111 uint32_t Quant::getSigCoeffGroupCtxInc(uint64_t cgGroupMask, uint32_t cgPosX, uint32_t cgPosY, uint32_t log2TrSizeCG)
1112 {
1113     const uint32_t trSizeCG = 1 << log2TrSizeCG;
1114
1115     const uint32_t sigPos = (uint32_t)(cgGroupMask >> (1 + (cgPosY << log2TrSizeCG) + cgPosX));
1116     const uint32_t sigRight = ((int32_t)(cgPosX - (trSizeCG - 1)) >> 31) & sigPos;
1117     const uint32_t sigLower = ((int32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 1));
1118
1119     return (sigRight | sigLower) & 1;
1120 }