Commit | Line | Data |
---|---|---|
72b9787e JB |
1 | /***************************************************************************** |
2 | * Copyright (C) 2014 x265 project | |
3 | * | |
4 | * Authors: Steve Borho <steve@borho.org> | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License as published by | |
8 | * the Free Software Foundation; either version 2 of the License, or | |
9 | * (at your option) any later version. | |
10 | * | |
11 | * This program is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | * GNU General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * along with this program; if not, write to the Free Software | |
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
19 | * | |
20 | * This program is also available under a commercial proprietary license. | |
21 | * For more information, contact us at license @ x265.com. | |
22 | *****************************************************************************/ | |
23 | ||
24 | #include "common.h" | |
25 | #include "primitives.h" | |
26 | #include "quant.h" | |
27 | #include "framedata.h" | |
28 | #include "entropy.h" | |
29 | #include "yuv.h" | |
30 | #include "cudata.h" | |
31 | #include "contexts.h" | |
32 | ||
33 | using namespace x265; | |
34 | ||
/* Gives x the sign of y: expands to x when y >= 0 and to -x when y < 0, without branching.
 * Relies on (y >> 31) being an arithmetic shift of a 32-bit signed value (0 or -1).
 * Both arguments are parenthesized so compound expressions expand correctly; note that y is
 * still evaluated twice, so it must not have side effects. */
#define SIGN(x, y) (((x) ^ ((y) >> 31)) - ((y) >> 31))
36 | ||
37 | namespace { | |
38 | ||
/* Rate-distortion totals gathered for one coefficient group while rdoQuant() walks its
 * coefficients; they are used afterwards to decide whether the whole group is dropped */
struct coeffGroupRDStats
{
    /* sum of the in-group scan positions of the coded coefficients; stays zero when nothing
     * other than position 0 is coded */
    int     nnzBeforePos0;

    /* distortion plus level cost of the coded coefficients */
    int64_t codedLevelAndDist;

    /* distortion the coded coefficients would have if they were left uncoded */
    int64_t uncodedDist;

    /* cost of signaling the significant coefficient bitmap of the group */
    int64_t sigCost;

    /* part of sigCost that belongs to the coefficient at in-group position 0 */
    int64_t sigCost0;
};
47 | ||
/* Returns the smaller of x and y.
 * A plain comparison is used instead of the (x - y) sign-mask trick: x - y can overflow the
 * int range, which is undefined behaviour and yields the wrong operand when it wraps.
 * Compilers turn this form into a branch-free select on their own. */
inline int fastMin(int x, int y)
{
    return (x < y) ? x : y;
}
52 | ||
53 | inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int *greaterOneBits, const int *levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx) | |
54 | { | |
55 | X265_CHECK(c1c2Idx <= 3, "c1c2Idx check failure\n"); | |
56 | X265_CHECK(absGoRice <= 4, "absGoRice check failure\n"); | |
57 | if (!absLevel) | |
58 | { | |
59 | X265_CHECK(diffLevel < 0, "diffLevel check failure\n"); | |
60 | return 0; | |
61 | } | |
62 | int rate = 0; | |
63 | ||
64 | if (diffLevel < 0) | |
65 | { | |
66 | X265_CHECK(absLevel <= 2, "absLevel check failure\n"); | |
67 | rate += greaterOneBits[(absLevel == 2)]; | |
68 | ||
69 | if (absLevel == 2) | |
70 | rate += levelAbsBits[0]; | |
71 | } | |
72 | else | |
73 | { | |
74 | uint32_t symbol = diffLevel; | |
75 | const uint32_t maxVlc = g_goRiceRange[absGoRice]; | |
76 | bool expGolomb = (symbol > maxVlc); | |
77 | ||
78 | if (expGolomb) | |
79 | { | |
80 | absLevel = symbol - maxVlc; | |
81 | ||
82 | // NOTE: mapping to x86 hardware instruction BSR | |
83 | unsigned long size; | |
84 | CLZ32(size, absLevel); | |
85 | int egs = size * 2 + 1; | |
86 | ||
87 | rate += egs << 15; | |
88 | ||
89 | // NOTE: in here, expGolomb=true means (symbol >= maxVlc + 1) | |
90 | X265_CHECK(fastMin(symbol, (maxVlc + 1)) == (int)maxVlc + 1, "min check failure\n"); | |
91 | symbol = maxVlc + 1; | |
92 | } | |
93 | ||
94 | uint32_t prefLen = (symbol >> absGoRice) + 1; | |
95 | uint32_t numBins = fastMin(prefLen + absGoRice, 8 /* g_goRicePrefixLen[absGoRice] + absGoRice */); | |
96 | ||
97 | rate += numBins << 15; | |
98 | ||
99 | if (c1c2Idx & 1) | |
100 | rate += greaterOneBits[1]; | |
101 | ||
102 | if (c1c2Idx == 3) | |
103 | rate += levelAbsBits[1]; | |
104 | } | |
105 | return rate; | |
106 | } | |
107 | ||
108 | /* Calculates the cost for specific absolute transform level */ | |
109 | inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int *greaterOneBits, const int *levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx) | |
110 | { | |
111 | X265_CHECK(absLevel, "absLevel should not be zero\n"); | |
112 | ||
113 | if (diffLevel < 0) | |
114 | { | |
115 | X265_CHECK((absLevel == 1) || (absLevel == 2), "absLevel range check failure\n"); | |
116 | ||
117 | uint32_t rate = greaterOneBits[(absLevel == 2)]; | |
118 | if (absLevel == 2) | |
119 | rate += levelAbsBits[0]; | |
120 | return rate; | |
121 | } | |
122 | else | |
123 | { | |
124 | uint32_t rate; | |
125 | uint32_t symbol = diffLevel; | |
126 | if ((symbol >> absGoRice) < COEF_REMAIN_BIN_REDUCTION) | |
127 | { | |
128 | uint32_t length = symbol >> absGoRice; | |
129 | rate = (length + 1 + absGoRice) << 15; | |
130 | } | |
131 | else | |
132 | { | |
133 | uint32_t length = 0; | |
134 | symbol = (symbol >> absGoRice) - COEF_REMAIN_BIN_REDUCTION; | |
135 | if (symbol) | |
136 | { | |
137 | unsigned long idx; | |
138 | CLZ32(idx, symbol + 1); | |
139 | length = idx; | |
140 | } | |
141 | ||
142 | rate = (COEF_REMAIN_BIN_REDUCTION + length + absGoRice + 1 + length) << 15; | |
143 | } | |
144 | if (c1c2Idx & 1) | |
145 | rate += greaterOneBits[1]; | |
146 | if (c1c2Idx == 3) | |
147 | rate += levelAbsBits[1]; | |
148 | return rate; | |
149 | } | |
150 | } | |
151 | ||
152 | } | |
153 | ||
154 | Quant::Quant() | |
155 | { | |
156 | m_resiDctCoeff = NULL; | |
157 | m_fencDctCoeff = NULL; | |
158 | m_fencShortBuf = NULL; | |
159 | m_frameNr = NULL; | |
160 | m_nr = NULL; | |
161 | } | |
162 | ||
163 | bool Quant::init(bool useRDOQ, double psyScale, const ScalingList& scalingList, Entropy& entropy) | |
164 | { | |
165 | m_entropyCoder = &entropy; | |
166 | m_useRDOQ = useRDOQ; | |
167 | m_psyRdoqScale = (int64_t)(psyScale * 256.0); | |
168 | m_scalingList = &scalingList; | |
169 | m_resiDctCoeff = X265_MALLOC(int32_t, MAX_TR_SIZE * MAX_TR_SIZE * 2); | |
170 | m_fencDctCoeff = m_resiDctCoeff + (MAX_TR_SIZE * MAX_TR_SIZE); | |
171 | m_fencShortBuf = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE); | |
172 | ||
173 | return m_resiDctCoeff && m_fencShortBuf; | |
174 | } | |
175 | ||
176 | bool Quant::allocNoiseReduction(const x265_param& param) | |
177 | { | |
178 | m_frameNr = X265_MALLOC(NoiseReduction, param.frameNumThreads); | |
179 | if (m_frameNr) | |
180 | memset(m_frameNr, 0, sizeof(NoiseReduction) * param.frameNumThreads); | |
181 | else | |
182 | return false; | |
183 | return true; | |
184 | } | |
185 | ||
186 | Quant::~Quant() | |
187 | { | |
188 | X265_FREE(m_frameNr); | |
189 | X265_FREE(m_resiDctCoeff); | |
190 | X265_FREE(m_fencShortBuf); | |
191 | } | |
192 | ||
193 | void Quant::setQPforQuant(const CUData& ctu) | |
194 | { | |
195 | m_nr = m_frameNr ? &m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL; | |
196 | int qpy = ctu.m_qp[0]; | |
197 | m_qpParam[TEXT_LUMA].setQpParam(qpy + QP_BD_OFFSET); | |
198 | setChromaQP(qpy + ctu.m_slice->m_pps->chromaCbQpOffset, TEXT_CHROMA_U, ctu.m_chromaFormat); | |
199 | setChromaQP(qpy + ctu.m_slice->m_pps->chromaCrQpOffset, TEXT_CHROMA_V, ctu.m_chromaFormat); | |
200 | } | |
201 | ||
202 | void Quant::setChromaQP(int qpin, TextType ttype, int chFmt) | |
203 | { | |
204 | int qp = Clip3(-QP_BD_OFFSET, 57, qpin); | |
205 | if (qp >= 30) | |
206 | { | |
207 | if (chFmt == X265_CSP_I420) | |
208 | qp = g_chromaScale[qp]; | |
209 | else | |
210 | qp = X265_MIN(qp, 51); | |
211 | } | |
212 | m_qpParam[ttype].setQpParam(qp + QP_BD_OFFSET); | |
213 | } | |
214 | ||
215 | /* To minimize the distortion only. No rate is considered */ | |
216 | uint32_t Quant::signBitHidingHDQ(int16_t* coeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codeParams) | |
217 | { | |
218 | const uint32_t log2TrSizeCG = codeParams.log2TrSizeCG; | |
219 | const uint16_t *scan = codeParams.scan; | |
220 | bool lastCG = true; | |
221 | ||
222 | for (int cg = (1 << (log2TrSizeCG * 2)) - 1; cg >= 0; cg--) | |
223 | { | |
224 | int cgStartPos = cg << LOG2_SCAN_SET_SIZE; | |
225 | int n; | |
226 | ||
227 | for (n = SCAN_SET_SIZE - 1; n >= 0; --n) | |
228 | if (coeff[scan[n + cgStartPos]]) | |
229 | break; | |
230 | if (n < 0) | |
231 | continue; | |
232 | ||
233 | int lastNZPosInCG = n; | |
234 | ||
235 | for (n = 0;; n++) | |
236 | if (coeff[scan[n + cgStartPos]]) | |
237 | break; | |
238 | ||
239 | int firstNZPosInCG = n; | |
240 | ||
241 | if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD) | |
242 | { | |
243 | uint32_t signbit = coeff[scan[cgStartPos + firstNZPosInCG]] > 0 ? 0 : 1; | |
244 | uint32_t absSum = 0; | |
245 | ||
246 | for (n = firstNZPosInCG; n <= lastNZPosInCG; n++) | |
247 | absSum += coeff[scan[n + cgStartPos]]; | |
248 | ||
249 | if (signbit != (absSum & 0x1)) // compare signbit with sum_parity | |
250 | { | |
251 | int minCostInc = MAX_INT, minPos = -1, curCost = MAX_INT; | |
252 | int16_t finalChange = 0, curChange = 0; | |
253 | ||
254 | for (n = (lastCG ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n) | |
255 | { | |
256 | uint32_t blkPos = scan[n + cgStartPos]; | |
257 | if (coeff[blkPos]) | |
258 | { | |
259 | if (deltaU[blkPos] > 0) | |
260 | { | |
261 | curCost = -deltaU[blkPos]; | |
262 | curChange = 1; | |
263 | } | |
264 | else | |
265 | { | |
266 | if (n == firstNZPosInCG && abs(coeff[blkPos]) == 1) | |
267 | curCost = MAX_INT; | |
268 | else | |
269 | { | |
270 | curCost = deltaU[blkPos]; | |
271 | curChange = -1; | |
272 | } | |
273 | } | |
274 | } | |
275 | else | |
276 | { | |
277 | if (n < firstNZPosInCG) | |
278 | { | |
279 | uint32_t thisSignBit = m_resiDctCoeff[blkPos] >= 0 ? 0 : 1; | |
280 | if (thisSignBit != signbit) | |
281 | curCost = MAX_INT; | |
282 | else | |
283 | { | |
284 | curCost = -deltaU[blkPos]; | |
285 | curChange = 1; | |
286 | } | |
287 | } | |
288 | else | |
289 | { | |
290 | curCost = -deltaU[blkPos]; | |
291 | curChange = 1; | |
292 | } | |
293 | } | |
294 | ||
295 | if (curCost < minCostInc) | |
296 | { | |
297 | minCostInc = curCost; | |
298 | finalChange = curChange; | |
299 | minPos = blkPos; | |
300 | } | |
301 | } | |
302 | ||
303 | /* do not allow change to violate coeff clamp */ | |
304 | if (coeff[minPos] == 32767 || coeff[minPos] == -32768) | |
305 | finalChange = -1; | |
306 | ||
307 | if (!coeff[minPos]) | |
308 | numSig++; | |
309 | else if (finalChange == -1 && abs(coeff[minPos]) == 1) | |
310 | numSig--; | |
311 | ||
312 | if (m_resiDctCoeff[minPos] >= 0) | |
313 | coeff[minPos] += finalChange; | |
314 | else | |
315 | coeff[minPos] -= finalChange; | |
316 | } | |
317 | } | |
318 | ||
319 | lastCG = false; | |
320 | } | |
321 | ||
322 | return numSig; | |
323 | } | |
324 | ||
325 | uint32_t Quant::transformNxN(CUData& cu, pixel* fenc, uint32_t fencStride, int16_t* residual, uint32_t stride, | |
326 | coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip) | |
327 | { | |
328 | if (cu.m_tqBypass[absPartIdx]) | |
329 | { | |
330 | X265_CHECK(log2TrSize >= 2 && log2TrSize <= 5, "Block size mistake!\n"); | |
331 | return primitives.copy_cnt[log2TrSize - 2](coeff, residual, stride); | |
332 | } | |
333 | ||
334 | bool isLuma = ttype == TEXT_LUMA; | |
335 | bool usePsy = m_psyRdoqScale && isLuma && !useTransformSkip; | |
336 | bool isIntra = cu.m_predMode[absPartIdx] == MODE_INTRA; | |
337 | int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform | |
338 | int trSize = 1 << log2TrSize; | |
339 | ||
340 | X265_CHECK((cu.m_slice->m_sps->quadtreeTULog2MaxSize >= log2TrSize), "transform size too large\n"); | |
341 | if (useTransformSkip) | |
342 | { | |
343 | #if X265_DEPTH <= 10 | |
344 | primitives.cvt16to32_shl(m_resiDctCoeff, residual, stride, transformShift, trSize); | |
345 | #else | |
346 | if (transformShift >= 0) | |
347 | primitives.cvt16to32_shl(m_resiDctCoeff, residual, stride, transformShift, trSize); | |
348 | else | |
349 | { | |
350 | int shift = -transformShift; | |
351 | int offset = (1 << (shift - 1)); | |
352 | primitives.cvt16to32_shr[log2TrSize - 2](m_resiDctCoeff, residual, stride, shift, offset); | |
353 | } | |
354 | #endif | |
355 | } | |
356 | else | |
357 | { | |
358 | const uint32_t sizeIdx = log2TrSize - 2; | |
359 | int useDST = !sizeIdx && isLuma && isIntra; | |
360 | int index = DCT_4x4 + sizeIdx - useDST; | |
361 | ||
362 | primitives.dct[index](residual, m_resiDctCoeff, stride); | |
363 | ||
364 | /* NOTE: if RDOQ is disabled globally, psy-rdoq is also disabled, so | |
365 | * there is no risk of performing this DCT unnecessarily */ | |
366 | if (usePsy) | |
367 | { | |
368 | /* perform DCT on source pixels for psy-rdoq */ | |
369 | primitives.square_copy_ps[sizeIdx](m_fencShortBuf, trSize, fenc, fencStride); | |
370 | primitives.dct[index](m_fencShortBuf, m_fencDctCoeff, trSize); | |
371 | } | |
372 | ||
373 | if (m_nr && !isIntra) | |
374 | { | |
375 | /* denoise is not applied to intra residual, so DST can be ignored */ | |
376 | int cat = sizeIdx + 4 * !isLuma; | |
377 | int numCoeff = 1 << (log2TrSize * 2); | |
378 | primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff); | |
379 | m_nr->count[cat]++; | |
380 | } | |
381 | } | |
382 | ||
383 | if (m_useRDOQ) | |
384 | return rdoQuant(cu, coeff, log2TrSize, ttype, absPartIdx, usePsy); | |
385 | else | |
386 | { | |
387 | int deltaU[32 * 32]; | |
388 | ||
389 | int scalingListType = ttype + (isLuma ? 3 : 0); | |
390 | int rem = m_qpParam[ttype].rem; | |
391 | int per = m_qpParam[ttype].per; | |
392 | int32_t *quantCoeff = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem]; | |
393 | ||
394 | int qbits = QUANT_SHIFT + per + transformShift; | |
395 | int add = (cu.m_slice->m_sliceType == I_SLICE ? 171 : 85) << (qbits - 9); | |
396 | int numCoeff = 1 << (log2TrSize * 2); | |
397 | ||
398 | uint32_t numSig = primitives.quant(m_resiDctCoeff, quantCoeff, deltaU, coeff, qbits, add, numCoeff); | |
399 | ||
400 | if (numSig >= 2 && cu.m_slice->m_pps->bSignHideEnabled) | |
401 | { | |
402 | TUEntropyCodingParameters codeParams; | |
403 | cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, isLuma); | |
404 | return signBitHidingHDQ(coeff, deltaU, numSig, codeParams); | |
405 | } | |
406 | else | |
407 | return numSig; | |
408 | } | |
409 | } | |
410 | ||
411 | void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t stride, coeff_t* coeff, | |
412 | uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig) | |
413 | { | |
414 | if (transQuantBypass) | |
415 | { | |
416 | primitives.copy_shl[log2TrSize - 2](residual, coeff, stride, 0); | |
417 | return; | |
418 | } | |
419 | ||
420 | // Values need to pass as input parameter in dequant | |
421 | int rem = m_qpParam[ttype].rem; | |
422 | int per = m_qpParam[ttype].per; | |
423 | int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; | |
424 | int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift; | |
425 | int numCoeff = 1 << (log2TrSize * 2); | |
426 | ||
427 | if (m_scalingList->m_bEnabled) | |
428 | { | |
429 | int scalingListType = (bIntra ? 0 : 3) + ttype; | |
430 | int32_t *dequantCoef = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem]; | |
431 | primitives.dequant_scaling(coeff, dequantCoef, m_resiDctCoeff, numCoeff, per, shift); | |
432 | } | |
433 | else | |
434 | { | |
435 | int scale = m_scalingList->s_invQuantScales[rem] << per; | |
436 | primitives.dequant_normal(coeff, m_resiDctCoeff, numCoeff, scale, shift); | |
437 | } | |
438 | ||
439 | if (useTransformSkip) | |
440 | { | |
441 | int trSize = 1 << log2TrSize; | |
442 | ||
443 | #if X265_DEPTH <= 10 | |
444 | primitives.cvt32to16_shr(residual, m_resiDctCoeff, stride, transformShift, trSize); | |
445 | #else | |
446 | if (transformShift > 0) | |
447 | primitives.cvt32to16_shr(residual, m_resiDctCoeff, stride, transformShift, trSize); | |
448 | else | |
449 | primitives.cvt32to16_shl[log2TrSize - 2](residual, m_resiDctCoeff, stride, -transformShift); | |
450 | #endif | |
451 | } | |
452 | else | |
453 | { | |
454 | const uint32_t sizeIdx = log2TrSize - 2; | |
455 | int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra; | |
456 | ||
457 | X265_CHECK((int)numSig == primitives.count_nonzero(coeff, 1 << (log2TrSize * 2)), "numSig differ\n"); | |
458 | ||
459 | // DC only | |
460 | if (numSig == 1 && coeff[0] != 0 && !useDST) | |
461 | { | |
462 | const int shift_1st = 7; | |
463 | const int add_1st = 1 << (shift_1st - 1); | |
464 | const int shift_2nd = 12 - (X265_DEPTH - 8); | |
465 | const int add_2nd = 1 << (shift_2nd - 1); | |
466 | ||
467 | int dc_val = (((m_resiDctCoeff[0] * 64 + add_1st) >> shift_1st) * 64 + add_2nd) >> shift_2nd; | |
468 | primitives.blockfill_s[sizeIdx](residual, stride, (int16_t)dc_val); | |
469 | return; | |
470 | } | |
471 | ||
472 | primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_resiDctCoeff, residual, stride); | |
473 | } | |
474 | } | |
475 | ||
476 | /* Rate distortion optimized quantization for entropy coding engines using | |
477 | * probability models like CABAC */ | |
478 | uint32_t Quant::rdoQuant(CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy) | |
479 | { | |
480 | int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ | |
481 | int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype; | |
482 | ||
483 | X265_CHECK(scalingListType < 6, "scaling list type out of range\n"); | |
484 | ||
485 | int rem = m_qpParam[ttype].rem; | |
486 | int per = m_qpParam[ttype].per; | |
487 | int qbits = QUANT_SHIFT + per + transformShift; /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */ | |
488 | int add = (1 << (qbits - 1)); | |
489 | int32_t *qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem]; | |
490 | ||
491 | int numCoeff = 1 << (log2TrSize * 2); | |
492 | ||
493 | uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff); | |
494 | ||
495 | X265_CHECK((int)numSig == primitives.count_nonzero(dstCoeff, 1 << (log2TrSize * 2)), "numSig differ\n"); | |
496 | if (!numSig) | |
497 | return 0; | |
498 | ||
499 | uint32_t trSize = 1 << log2TrSize; | |
500 | int64_t lambda2 = m_qpParam[ttype].lambda2; | |
501 | int64_t psyScale = (m_psyRdoqScale * m_qpParam[ttype].lambda); | |
502 | ||
503 | /* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4) | |
504 | * scale applied that must be removed during unquant. Note that in real dequant there is clipping | |
505 | * at several stages. We skip the clipping for simplicity when measuring RD cost */ | |
506 | int32_t *unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem]; | |
507 | int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0); | |
508 | int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0; | |
509 | int scaleBits = SCALE_BITS - 2 * transformShift; | |
510 | ||
511 | #define UNQUANT(lvl) (((lvl) * (unquantScale[blkPos] << per) + unquantRound) >> unquantShift) | |
512 | #define SIGCOST(bits) ((lambda2 * (bits)) >> 8) | |
513 | #define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits)) | |
514 | #define PSYVALUE(rec) ((psyScale * (rec)) >> (16 - scaleBits)) | |
515 | ||
516 | int64_t costCoeff[32 * 32]; /* d*d + lambda * bits */ | |
517 | int64_t costUncoded[32 * 32]; /* d*d + lambda * 0 */ | |
518 | int64_t costSig[32 * 32]; /* lambda * bits */ | |
519 | ||
520 | int rateIncUp[32 * 32]; /* signal overhead of increasing level */ | |
521 | int rateIncDown[32 * 32]; /* signal overhead of decreasing level */ | |
522 | int sigRateDelta[32 * 32]; /* signal difference between zero and non-zero */ | |
523 | ||
524 | int64_t costCoeffGroupSig[MLS_GRP_NUM]; /* lambda * bits of group coding cost */ | |
525 | uint64_t sigCoeffGroupFlag64 = 0; | |
526 | ||
527 | uint32_t ctxSet = 0; | |
528 | int c1 = 1; | |
529 | int c2 = 0; | |
530 | uint32_t goRiceParam = 0; | |
531 | uint32_t c1Idx = 0; | |
532 | uint32_t c2Idx = 0; | |
533 | int cgLastScanPos = -1; | |
534 | int lastScanPos = -1; | |
535 | const uint32_t cgSize = (1 << MLS_CG_SIZE); /* 4x4 num coef = 16 */ | |
536 | bool bIsLuma = ttype == TEXT_LUMA; | |
537 | ||
538 | /* total rate distortion cost of transform block, as CBF=0 */ | |
539 | int64_t totalUncodedCost = 0; | |
540 | ||
541 | /* Total rate distortion cost of this transform block, counting the distortion of uncoded blocks, | |
542 | * the distortion and signal cost of coded blocks, and the coding cost of significant | |
543 | * coefficient and coefficient group bitmaps */ | |
544 | int64_t totalRdCost = 0; | |
545 | ||
546 | TUEntropyCodingParameters codeParams; | |
547 | cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma); | |
548 | const uint32_t cgNum = 1 << (codeParams.log2TrSizeCG * 2); | |
549 | ||
550 | /* TODO: update bit estimates if dirty */ | |
551 | EstBitsSbac& estBitsSbac = m_entropyCoder->m_estBitsSbac; | |
552 | ||
553 | uint32_t scanPos; | |
554 | coeffGroupRDStats cgRdStats; | |
555 | ||
556 | /* iterate over coding groups in reverse scan order */ | |
557 | for (int cgScanPos = cgNum - 1; cgScanPos >= 0; cgScanPos--) | |
558 | { | |
559 | const uint32_t cgBlkPos = codeParams.scanCG[cgScanPos]; | |
560 | const uint32_t cgPosY = cgBlkPos >> codeParams.log2TrSizeCG; | |
561 | const uint32_t cgPosX = cgBlkPos - (cgPosY << codeParams.log2TrSizeCG); | |
562 | const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos); | |
563 | memset(&cgRdStats, 0, sizeof(coeffGroupRDStats)); | |
564 | ||
565 | const int patternSigCtx = calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, codeParams.log2TrSizeCG); | |
566 | ||
567 | /* iterate over coefficients in each group in reverse scan order */ | |
568 | for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--) | |
569 | { | |
570 | scanPos = (cgScanPos << MLS_CG_SIZE) + scanPosinCG; | |
571 | uint32_t blkPos = codeParams.scan[scanPos]; | |
572 | uint16_t maxAbsLevel = (int16_t)abs(dstCoeff[blkPos]); /* abs(quantized coeff) */ | |
573 | int signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */ | |
574 | int predictedCoef = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/ | |
575 | ||
576 | /* RDOQ measures distortion as the squared difference between the unquantized coded level | |
577 | * and the original DCT coefficient. The result is shifted scaleBits to account for the | |
578 | * FIX15 nature of the CABAC cost tables minus the forward transform scale */ | |
579 | ||
580 | /* cost of not coding this coefficient (all distortion, no signal bits) */ | |
581 | costUncoded[scanPos] = (int64_t)(signCoef * signCoef) << scaleBits; | |
582 | if (usePsy && blkPos) | |
583 | /* when no residual coefficient is coded, predicted coef == recon coef */ | |
584 | costUncoded[scanPos] -= PSYVALUE(predictedCoef); | |
585 | ||
586 | totalUncodedCost += costUncoded[scanPos]; | |
587 | ||
588 | if (maxAbsLevel && lastScanPos < 0) | |
589 | { | |
590 | /* remember the first non-zero coef found in this reverse scan as the last pos */ | |
591 | lastScanPos = scanPos; | |
592 | ctxSet = (scanPos < SCAN_SET_SIZE || !bIsLuma) ? 0 : 2; | |
593 | cgLastScanPos = cgScanPos; | |
594 | } | |
595 | ||
596 | if (lastScanPos < 0) | |
597 | { | |
598 | /* coefficients after lastNZ have no distortion signal cost */ | |
599 | costCoeff[scanPos] = 0; | |
600 | costSig[scanPos] = 0; | |
601 | ||
602 | /* No non-zero coefficient yet found, but this does not mean | |
603 | * there is no uncoded-cost for this coefficient. Pre- | |
604 | * quantization the coefficient may have been non-zero */ | |
605 | totalRdCost += costUncoded[scanPos]; | |
606 | } | |
607 | else | |
608 | { | |
609 | const uint32_t c1c2Idx = ((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)) + (((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1) * 2; | |
610 | const uint32_t baseLevel = ((uint32_t)0xD9 >> (c1c2Idx * 2)) & 3; // {1, 2, 1, 3} | |
611 | ||
612 | X265_CHECK(!!((int)c1Idx < C1FLAG_NUMBER) == (int)((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)), "scan validation 1\n"); | |
613 | X265_CHECK(!!(c2Idx == 0) == ((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1, "scan validation 2\n"); | |
614 | X265_CHECK((int)baseLevel == ((c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx == 0)) : 1), "scan validation 3\n"); | |
615 | ||
616 | // coefficient level estimation | |
617 | const uint32_t oneCtx = 4 * ctxSet + c1; | |
618 | const uint32_t absCtx = ctxSet + c2; | |
619 | const int *greaterOneBits = estBitsSbac.greaterOneBits[oneCtx]; | |
620 | const int *levelAbsBits = estBitsSbac.levelAbsBits[absCtx]; | |
621 | ||
622 | uint16_t level = 0; | |
623 | uint32_t sigCoefBits = 0; | |
624 | costCoeff[scanPos] = MAX_INT64; | |
625 | ||
626 | if ((int)scanPos == lastScanPos) | |
627 | sigRateDelta[blkPos] = 0; | |
628 | else | |
629 | { | |
630 | const uint32_t ctxSig = getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext); | |
631 | if (maxAbsLevel < 3) | |
632 | { | |
633 | /* set default costs to uncoded costs */ | |
634 | costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[ctxSig][0]); | |
635 | costCoeff[scanPos] = costUncoded[scanPos] + costSig[scanPos]; | |
636 | } | |
637 | sigRateDelta[blkPos] = estBitsSbac.significantBits[ctxSig][1] - estBitsSbac.significantBits[ctxSig][0]; | |
638 | sigCoefBits = estBitsSbac.significantBits[ctxSig][1]; | |
639 | } | |
640 | if (maxAbsLevel) | |
641 | { | |
642 | uint16_t minAbsLevel = X265_MAX(maxAbsLevel - 1, 1); | |
643 | for (uint16_t lvl = maxAbsLevel; lvl >= minAbsLevel; lvl--) | |
644 | { | |
645 | uint32_t levelBits = getICRateCost(lvl, lvl - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx) + IEP_RATE; | |
646 | ||
647 | int unquantAbsLevel = UNQUANT(lvl); | |
648 | int d = abs(signCoef) - unquantAbsLevel; | |
649 | int64_t curCost = RDCOST(d, sigCoefBits + levelBits); | |
650 | ||
651 | /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */ | |
652 | if (usePsy && blkPos) | |
653 | { | |
654 | int reconCoef = abs(unquantAbsLevel + SIGN(predictedCoef, signCoef)); | |
655 | curCost -= PSYVALUE(reconCoef); | |
656 | } | |
657 | ||
658 | if (curCost < costCoeff[scanPos]) | |
659 | { | |
660 | level = lvl; | |
661 | costCoeff[scanPos] = curCost; | |
662 | costSig[scanPos] = SIGCOST(sigCoefBits); | |
663 | } | |
664 | } | |
665 | } | |
666 | ||
667 | dstCoeff[blkPos] = level; | |
668 | totalRdCost += costCoeff[scanPos]; | |
669 | ||
670 | /* record costs for sign-hiding performed at the end */ | |
671 | if (level) | |
672 | { | |
673 | int rateNow = getICRate(level, level - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx); | |
674 | rateIncUp[blkPos] = getICRate(level + 1, level + 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx) - rateNow; | |
675 | rateIncDown[blkPos] = getICRate(level - 1, level - 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx) - rateNow; | |
676 | } | |
677 | else | |
678 | { | |
679 | rateIncUp[blkPos] = greaterOneBits[0]; | |
680 | rateIncDown[blkPos] = 0; | |
681 | } | |
682 | ||
683 | /* Update CABAC estimation state */ | |
684 | if (level >= baseLevel && goRiceParam < 4 && level > (3U << goRiceParam)) | |
685 | goRiceParam++; | |
686 | ||
687 | c1Idx -= (-(int32_t)level) >> 31; | |
688 | ||
689 | /* update bin model */ | |
690 | if (level > 1) | |
691 | { | |
692 | c1 = 0; | |
693 | c2 += (uint32_t)(c2 - 2) >> 31; | |
694 | c2Idx++; | |
695 | } | |
696 | else if ((c1 < 3) && (c1 > 0) && level) | |
697 | c1++; | |
698 | ||
699 | /* context set update */ | |
700 | if (!(scanPos % SCAN_SET_SIZE) && scanPos) | |
701 | { | |
702 | c2 = 0; | |
703 | goRiceParam = 0; | |
704 | ||
705 | c1Idx = 0; | |
706 | c2Idx = 0; | |
707 | ctxSet = (scanPos == SCAN_SET_SIZE || !bIsLuma) ? 0 : 2; | |
708 | X265_CHECK(c1 >= 0, "c1 is negative\n"); | |
709 | ctxSet -= ((int32_t)(c1 - 1) >> 31); | |
710 | c1 = 1; | |
711 | } | |
712 | } | |
713 | ||
714 | cgRdStats.sigCost += costSig[scanPos]; | |
715 | if (!scanPosinCG) | |
716 | cgRdStats.sigCost0 = costSig[scanPos]; | |
717 | ||
718 | if (dstCoeff[blkPos]) | |
719 | { | |
720 | sigCoeffGroupFlag64 |= cgBlkPosMask; | |
721 | cgRdStats.codedLevelAndDist += costCoeff[scanPos] - costSig[scanPos]; | |
722 | cgRdStats.uncodedDist += costUncoded[scanPos]; | |
723 | cgRdStats.nnzBeforePos0 += scanPosinCG; | |
724 | } | |
725 | } /* end for (scanPosinCG) */ | |
726 | ||
727 | costCoeffGroupSig[cgScanPos] = 0; | |
728 | ||
729 | if (cgLastScanPos < 0) | |
730 | { | |
731 | /* nothing to do at this point */ | |
732 | } | |
733 | else if (!cgScanPos || cgScanPos == cgLastScanPos) | |
734 | { | |
735 | /* coeff group 0 is implied to be present, no signal cost */ | |
736 | /* coeff group with last NZ is implied to be present, handled below */ | |
737 | } | |
738 | else if (sigCoeffGroupFlag64 & cgBlkPosMask) | |
739 | { | |
740 | if (!cgRdStats.nnzBeforePos0) | |
741 | { | |
742 | /* if only coeff 0 in this CG is coded, its significant coeff bit is implied */ | |
743 | totalRdCost -= cgRdStats.sigCost0; | |
744 | cgRdStats.sigCost -= cgRdStats.sigCost0; | |
745 | } | |
746 | ||
747 | /* there are coded coefficients in this group, but now we include the signaling cost | |
748 | * of the significant coefficient group flag and evaluate whether the RD cost of the | |
749 | * coded group is more than the RD cost of the uncoded group */ | |
750 | ||
751 | uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, codeParams.log2TrSizeCG); | |
752 | ||
753 | int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]); | |
754 | costZeroCG += cgRdStats.uncodedDist; /* add distortion for resetting non-zero levels to zero levels */ | |
755 | costZeroCG -= cgRdStats.codedLevelAndDist; /* remove distortion and level cost of coded coefficients */ | |
756 | costZeroCG -= cgRdStats.sigCost; /* remove signaling cost of significant coeff bitmap */ | |
757 | ||
758 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][1]); | |
759 | totalRdCost += costCoeffGroupSig[cgScanPos]; /* add the cost of 1 bit in significant CG bitmap */ | |
760 | ||
761 | if (costZeroCG < totalRdCost) | |
762 | { | |
763 | sigCoeffGroupFlag64 &= ~cgBlkPosMask; | |
764 | totalRdCost = costZeroCG; | |
765 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]); | |
766 | ||
767 | /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */ | |
768 | for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--) | |
769 | { | |
770 | scanPos = cgScanPos * cgSize + scanPosinCG; | |
771 | uint32_t blkPos = codeParams.scan[scanPos]; | |
772 | if (dstCoeff[blkPos]) | |
773 | { | |
774 | costCoeff[scanPos] = costUncoded[scanPos]; | |
775 | costSig[scanPos] = 0; | |
776 | } | |
777 | dstCoeff[blkPos] = 0; | |
778 | } | |
779 | } | |
780 | } | |
781 | else | |
782 | { | |
783 | /* there were no coded coefficients in this coefficient group */ | |
784 | uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, codeParams.log2TrSizeCG); | |
785 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]); | |
786 | totalRdCost += costCoeffGroupSig[cgScanPos]; /* add cost of 0 bit in significant CG bitmap */ | |
787 | totalRdCost -= cgRdStats.sigCost; /* remove cost of significant coefficient bitmap */ | |
788 | } | |
789 | } /* end for (cgScanPos) */ | |
790 | ||
791 | X265_CHECK(lastScanPos >= 0, "numSig non zero, but no coded CG\n"); | |
792 | ||
793 | /* calculate RD cost of uncoded block CBF=0, and add cost of CBF=1 to total */ | |
794 | int64_t bestCost; | |
795 | if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx]) | |
796 | { | |
797 | bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]); | |
798 | totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]); | |
799 | } | |
800 | else | |
801 | { | |
802 | int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]]; | |
803 | bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]); | |
804 | totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]); | |
805 | } | |
806 | ||
807 | /* This loop starts with the last non-zero found in the first loop and then refines this last | |
808 | * non-zero by measuring the true RD cost of the last NZ at this position, and then the RD costs | |
809 | * at all previous coefficients until a coefficient greater than 1 is encountered or we run out | |
810 | * of coefficients to evaluate. This will factor in the cost of coding empty groups and empty | |
811 | * coeff prior to the last NZ. The base best cost is the RD cost of CBF=0 */ | |
812 | int bestLastIdx = 0; | |
813 | bool foundLast = false; | |
814 | for (int cgScanPos = cgLastScanPos; cgScanPos >= 0 && !foundLast; cgScanPos--) | |
815 | { | |
816 | if (!cgScanPos || cgScanPos == cgLastScanPos) | |
817 | { | |
818 | /* the presence of these coefficient groups are inferred, they have no bit in | |
819 | * sigCoeffGroupFlag64 and no saved costCoeffGroupSig[] cost */ | |
820 | } | |
821 | else if (sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[cgScanPos])) | |
822 | { | |
823 | /* remove cost of significant coeff group flag, the group's presence would be inferred | |
824 | * from lastNZ if it were present in this group */ | |
825 | totalRdCost -= costCoeffGroupSig[cgScanPos]; | |
826 | } | |
827 | else | |
828 | { | |
829 | /* remove cost of signaling this empty group as not present */ | |
830 | totalRdCost -= costCoeffGroupSig[cgScanPos]; | |
831 | continue; | |
832 | } | |
833 | ||
834 | for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--) | |
835 | { | |
836 | scanPos = cgScanPos * cgSize + scanPosinCG; | |
837 | if ((int)scanPos > lastScanPos) | |
838 | continue; | |
839 | ||
840 | /* if the coefficient was coded, measure the RD cost of it as the last non-zero and then | |
841 | * continue as if it were uncoded. If the coefficient was already uncoded, remove the | |
842 | * cost of signaling it as not-significant */ | |
843 | uint32_t blkPos = codeParams.scan[scanPos]; | |
844 | if (dstCoeff[blkPos]) | |
845 | { | |
846 | /* Swap the cost of signaling its significant coeff bit with the cost of | |
847 | * signaling its lastNZ pos */ | |
848 | uint32_t posY = blkPos >> log2TrSize; | |
849 | uint32_t posX = blkPos - (posY << log2TrSize); | |
850 | uint32_t bitsLastNZ = codeParams.scanType == SCAN_VER ? getRateLast(posY, posX) : getRateLast(posX, posY); | |
851 | int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ); | |
852 | ||
853 | if (costAsLast < bestCost) | |
854 | { | |
855 | bestLastIdx = scanPos + 1; | |
856 | bestCost = costAsLast; | |
857 | } | |
858 | if (dstCoeff[blkPos] > 1) | |
859 | { | |
860 | foundLast = true; | |
861 | break; | |
862 | } | |
863 | ||
864 | totalRdCost -= costCoeff[scanPos]; | |
865 | totalRdCost += costUncoded[scanPos]; | |
866 | } | |
867 | else | |
868 | totalRdCost -= costSig[scanPos]; | |
869 | } | |
870 | } | |
871 | ||
872 | /* recount non-zero coefficients and re-apply sign of DCT coef */ | |
873 | numSig = 0; | |
874 | for (int pos = 0; pos < bestLastIdx; pos++) | |
875 | { | |
876 | int blkPos = codeParams.scan[pos]; | |
877 | int level = dstCoeff[blkPos]; | |
878 | numSig += (level != 0); | |
879 | ||
880 | uint32_t mask = (int32_t)m_resiDctCoeff[blkPos] >> 31; | |
881 | dstCoeff[blkPos] = (int16_t)((level ^ mask) - mask); | |
882 | } | |
883 | ||
884 | /* clean uncoded coefficients */ | |
885 | for (int pos = bestLastIdx; pos <= lastScanPos; pos++) | |
886 | dstCoeff[codeParams.scan[pos]] = 0; | |
887 | ||
888 | /* rate-distortion based sign-hiding */ | |
889 | if (cu.m_slice->m_pps->bSignHideEnabled && numSig >= 2) | |
890 | { | |
891 | int lastCG = true; | |
892 | for (int subSet = cgLastScanPos; subSet >= 0; subSet--) | |
893 | { | |
894 | int subPos = subSet << LOG2_SCAN_SET_SIZE; | |
895 | int n; | |
896 | ||
897 | /* measure distance between first and last non-zero coef in this | |
898 | * coding group */ | |
899 | for (n = SCAN_SET_SIZE - 1; n >= 0; --n) | |
900 | if (dstCoeff[codeParams.scan[n + subPos]]) | |
901 | break; | |
902 | if (n < 0) | |
903 | continue; | |
904 | ||
905 | int lastNZPosInCG = n; | |
906 | ||
907 | for (n = 0;; n++) | |
908 | if (dstCoeff[codeParams.scan[n + subPos]]) | |
909 | break; | |
910 | ||
911 | int firstNZPosInCG = n; | |
912 | ||
913 | if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD) | |
914 | { | |
915 | uint32_t signbit = (dstCoeff[codeParams.scan[subPos + firstNZPosInCG]] > 0 ? 0 : 1); | |
916 | int absSum = 0; | |
917 | ||
918 | for (n = firstNZPosInCG; n <= lastNZPosInCG; n++) | |
919 | absSum += dstCoeff[codeParams.scan[n + subPos]]; | |
920 | ||
921 | if (signbit != (absSum & 1U)) | |
922 | { | |
923 | /* We must find a coeff to toggle up or down so the sign bit of the first non-zero coeff | |
924 | * is properly implied. Note dstCoeff[] are signed by this point but curChange and | |
925 | * finalChange imply absolute levels (+1 is away from zero, -1 is towards zero) */ | |
926 | ||
927 | int64_t minCostInc = MAX_INT64, curCost = MAX_INT64; | |
928 | int minPos = -1; | |
929 | int16_t finalChange = 0, curChange = 0; | |
930 | ||
931 | for (n = (lastCG ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n) | |
932 | { | |
933 | uint32_t blkPos = codeParams.scan[n + subPos]; | |
934 | int signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */ | |
935 | int absLevel = abs(dstCoeff[blkPos]); | |
936 | ||
937 | int d = abs(signCoef) - UNQUANT(absLevel); | |
938 | int64_t origDist = (((int64_t)d * d)) << scaleBits; | |
939 | ||
940 | #define DELTARDCOST(d, deltabits) ((((int64_t)d * d) << scaleBits) - origDist + ((lambda2 * (int64_t)(deltabits)) >> 8)) | |
941 | ||
942 | if (dstCoeff[blkPos]) | |
943 | { | |
944 | d = abs(signCoef) - UNQUANT(absLevel + 1); | |
945 | int64_t costUp = DELTARDCOST(d, rateIncUp[blkPos]); | |
946 | ||
947 | /* if decrementing would make the coeff 0, we can include the | |
948 | * significant coeff flag cost savings */ | |
949 | d = abs(signCoef) - UNQUANT(absLevel - 1); | |
950 | bool isOne = abs(dstCoeff[blkPos]) == 1; | |
951 | int downBits = rateIncDown[blkPos] - (isOne ? (IEP_RATE + sigRateDelta[blkPos]) : 0); | |
952 | int64_t costDown = DELTARDCOST(d, downBits); | |
953 | ||
954 | if (lastCG && lastNZPosInCG == n && isOne) | |
955 | costDown -= 4 * IEP_RATE; | |
956 | ||
957 | if (costUp < costDown) | |
958 | { | |
959 | curCost = costUp; | |
960 | curChange = 1; | |
961 | } | |
962 | else | |
963 | { | |
964 | curChange = -1; | |
965 | if (n == firstNZPosInCG && isOne) | |
966 | curCost = MAX_INT64; | |
967 | else | |
968 | curCost = costDown; | |
969 | } | |
970 | } | |
971 | else if (n < firstNZPosInCG && signbit != (signCoef >= 0 ? 0 : 1U)) | |
972 | { | |
973 | /* don't try to make a new coded coeff before the first coeff if its | |
974 | * sign would be different than the first coeff, the inferred sign would | |
975 | * still be wrong and we'd have to do this again. */ | |
976 | curCost = MAX_INT64; | |
977 | } | |
978 | else | |
979 | { | |
980 | /* evaluate changing an uncoded coeff 0 to a coded coeff +/-1 */ | |
981 | d = abs(signCoef) - UNQUANT(1); | |
982 | curCost = DELTARDCOST(d, rateIncUp[blkPos] + IEP_RATE + sigRateDelta[blkPos]); | |
983 | curChange = 1; | |
984 | } | |
985 | ||
986 | if (curCost < minCostInc) | |
987 | { | |
988 | minCostInc = curCost; | |
989 | finalChange = curChange; | |
990 | minPos = blkPos; | |
991 | } | |
992 | } | |
993 | ||
994 | if (dstCoeff[minPos] == 32767 || dstCoeff[minPos] == -32768) | |
995 | /* don't allow sign hiding to violate the SPEC range */ | |
996 | finalChange = -1; | |
997 | ||
998 | if (dstCoeff[minPos] == 0) | |
999 | numSig++; | |
1000 | else if (finalChange == -1 && abs(dstCoeff[minPos]) == 1) | |
1001 | numSig--; | |
1002 | ||
1003 | if (m_resiDctCoeff[minPos] >= 0) | |
1004 | dstCoeff[minPos] += finalChange; | |
1005 | else | |
1006 | dstCoeff[minPos] -= finalChange; | |
1007 | } | |
1008 | } | |
1009 | ||
1010 | lastCG = false; | |
1011 | } | |
1012 | } | |
1013 | ||
1014 | return numSig; | |
1015 | } | |
1016 | ||
1017 | /* Pattern decision for context derivation process of significant_coeff_flag */ | |
1018 | uint32_t Quant::calcPatternSigCtx(uint64_t sigCoeffGroupFlag64, uint32_t cgPosX, uint32_t cgPosY, uint32_t log2TrSizeCG) | |
1019 | { | |
1020 | if (!log2TrSizeCG) | |
1021 | return 0; | |
1022 | ||
1023 | const uint32_t trSizeCG = 1 << log2TrSizeCG; | |
1024 | X265_CHECK(trSizeCG <= 8, "transform CG is too large\n"); | |
1025 | const uint32_t sigPos = (uint32_t)(sigCoeffGroupFlag64 >> (1 + (cgPosY << log2TrSizeCG) + cgPosX)); | |
1026 | const uint32_t sigRight = ((int32_t)(cgPosX - (trSizeCG - 1)) >> 31) & (sigPos & 1); | |
1027 | const uint32_t sigLower = ((int32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 2)) & 2; | |
1028 | ||
1029 | return sigRight + sigLower; | |
1030 | } | |
1031 | ||
1032 | /* Context derivation process of coeff_abs_significant_flag */ | |
1033 | uint32_t Quant::getSigCtxInc(uint32_t patternSigCtx, uint32_t log2TrSize, uint32_t trSize, uint32_t blkPos, bool bIsLuma, | |
1034 | uint32_t firstSignificanceMapContext) | |
1035 | { | |
1036 | static const uint8_t ctxIndMap[16] = | |
1037 | { | |
1038 | 0, 1, 4, 5, | |
1039 | 2, 3, 4, 5, | |
1040 | 6, 6, 8, 8, | |
1041 | 7, 7, 8, 8 | |
1042 | }; | |
1043 | ||
1044 | if (!blkPos) // special case for the DC context variable | |
1045 | return 0; | |
1046 | ||
1047 | if (log2TrSize == 2) // 4x4 | |
1048 | return ctxIndMap[blkPos]; | |
1049 | ||
1050 | const uint32_t posY = blkPos >> log2TrSize; | |
1051 | const uint32_t posX = blkPos & (trSize - 1); | |
1052 | X265_CHECK((blkPos - (posY << log2TrSize)) == posX, "block pos check failed\n"); | |
1053 | ||
1054 | int posXinSubset = blkPos & 3; | |
1055 | X265_CHECK((posX & 3) == (blkPos & 3), "pos alignment fail\n"); | |
1056 | int posYinSubset = posY & 3; | |
1057 | ||
1058 | // NOTE: [patternSigCtx][posXinSubset][posYinSubset] | |
1059 | static const uint8_t table_cnt[4][4][4] = | |
1060 | { | |
1061 | // patternSigCtx = 0 | |
1062 | { | |
1063 | { 2, 1, 1, 0 }, | |
1064 | { 1, 1, 0, 0 }, | |
1065 | { 1, 0, 0, 0 }, | |
1066 | { 0, 0, 0, 0 }, | |
1067 | }, | |
1068 | // patternSigCtx = 1 | |
1069 | { | |
1070 | { 2, 1, 0, 0 }, | |
1071 | { 2, 1, 0, 0 }, | |
1072 | { 2, 1, 0, 0 }, | |
1073 | { 2, 1, 0, 0 }, | |
1074 | }, | |
1075 | // patternSigCtx = 2 | |
1076 | { | |
1077 | { 2, 2, 2, 2 }, | |
1078 | { 1, 1, 1, 1 }, | |
1079 | { 0, 0, 0, 0 }, | |
1080 | { 0, 0, 0, 0 }, | |
1081 | }, | |
1082 | // patternSigCtx = 3 | |
1083 | { | |
1084 | { 2, 2, 2, 2 }, | |
1085 | { 2, 2, 2, 2 }, | |
1086 | { 2, 2, 2, 2 }, | |
1087 | { 2, 2, 2, 2 }, | |
1088 | } | |
1089 | }; | |
1090 | ||
1091 | int cnt = table_cnt[patternSigCtx][posXinSubset][posYinSubset]; | |
1092 | int offset = firstSignificanceMapContext; | |
1093 | ||
1094 | offset += cnt; | |
1095 | ||
1096 | return (bIsLuma && (posX | posY) >= 4) ? 3 + offset : offset; | |
1097 | } | |
1098 | ||
1099 | /* Calculates the cost of signaling the last significant coefficient in the block */ | |
1100 | inline uint32_t Quant::getRateLast(uint32_t posx, uint32_t posy) const | |
1101 | { | |
1102 | uint32_t ctxX = getGroupIdx(posx); | |
1103 | uint32_t ctxY = getGroupIdx(posy); | |
1104 | uint32_t cost = m_entropyCoder->m_estBitsSbac.lastXBits[ctxX] + m_entropyCoder->m_estBitsSbac.lastYBits[ctxY]; | |
1105 | ||
1106 | int32_t maskX = (int32_t)(2 - posx) >> 31; | |
1107 | int32_t maskY = (int32_t)(2 - posy) >> 31; | |
1108 | ||
1109 | cost += maskX & (IEP_RATE * ((ctxX - 2) >> 1)); | |
1110 | cost += maskY & (IEP_RATE * ((ctxY - 2) >> 1)); | |
1111 | return cost; | |
1112 | } | |
1113 | ||
1114 | /* Context derivation process of coeff_abs_significant_flag */ | |
1115 | uint32_t Quant::getSigCoeffGroupCtxInc(uint64_t cgGroupMask, uint32_t cgPosX, uint32_t cgPosY, uint32_t log2TrSizeCG) | |
1116 | { | |
1117 | const uint32_t trSizeCG = 1 << log2TrSizeCG; | |
1118 | ||
1119 | const uint32_t sigPos = (uint32_t)(cgGroupMask >> (1 + (cgPosY << log2TrSizeCG) + cgPosX)); | |
1120 | const uint32_t sigRight = ((int32_t)(cgPosX - (trSizeCG - 1)) >> 31) & sigPos; | |
1121 | const uint32_t sigLower = ((int32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 1)); | |
1122 | ||
1123 | return (sigRight | sigLower) & 1; | |
1124 | } |