Commit | Line | Data |
---|---|---|
72b9787e JB |
1 | /***************************************************************************** |
2 | * Copyright (C) 2014 x265 project | |
3 | * | |
4 | * Authors: Steve Borho <steve@borho.org> | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License as published by | |
8 | * the Free Software Foundation; either version 2 of the License, or | |
9 | * (at your option) any later version. | |
10 | * | |
11 | * This program is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | * GNU General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * along with this program; if not, write to the Free Software | |
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
19 | * | |
20 | * This program is also available under a commercial proprietary license. | |
21 | * For more information, contact us at license @ x265.com. | |
22 | *****************************************************************************/ | |
23 | ||
24 | #include "common.h" | |
25 | #include "primitives.h" | |
26 | #include "quant.h" | |
27 | #include "framedata.h" | |
28 | #include "entropy.h" | |
29 | #include "yuv.h" | |
30 | #include "cudata.h" | |
31 | #include "contexts.h" | |
32 | ||
33 | using namespace x265; | |
34 | ||
/* Give x the sign of y: returns x when y >= 0 and -x when y < 0.
 * (y >> 31) is 0 for non-negative y and all-ones for negative y (arithmetic
 * shift of a 32-bit int), so the xor/subtract pair is a two's complement
 * negate.  Both arguments are fully parenthesized so that expression
 * arguments (e.g. "a | b") are evaluated as a unit. */
#define SIGN(x, y) (((x) ^ ((y) >> 31)) - ((y) >> 31))
36 | ||
37 | namespace { | |
38 | ||
/* Running rate-distortion totals gathered for one coefficient group while
 * its coefficients are being evaluated */
struct coeffGroupRDStats
{
    /* non-zero when a coefficient at a position other than 0 is coded */
    int     nnzBeforePos0;

    /* distortion plus level cost of the coded coefficients */
    int64_t codedLevelAndDist;

    /* distortion the coded coefficients would have if left uncoded */
    int64_t uncodedDist;

    /* cost of signaling the significant coefficient bitmap */
    int64_t sigCost;

    /* cost of signaling the significant bit of coefficient 0 */
    int64_t sigCost0;
};
47 | ||
/* Returns the smaller of x and y.
 *
 * Written as a plain comparison rather than the subtract-and-mask trick:
 * "x - y" overflows (undefined behaviour) when the operands are far apart,
 * and the trick also depends on the implementation-defined right shift of a
 * negative value.  Compilers turn this form into a branch-free select. */
inline int fastMin(int x, int y)
{
    return (x < y) ? x : y;
}
52 | ||
b53f7c52 | 53 | inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx) |
72b9787e JB |
54 | { |
55 | X265_CHECK(c1c2Idx <= 3, "c1c2Idx check failure\n"); | |
56 | X265_CHECK(absGoRice <= 4, "absGoRice check failure\n"); | |
57 | if (!absLevel) | |
58 | { | |
59 | X265_CHECK(diffLevel < 0, "diffLevel check failure\n"); | |
60 | return 0; | |
61 | } | |
62 | int rate = 0; | |
63 | ||
64 | if (diffLevel < 0) | |
65 | { | |
66 | X265_CHECK(absLevel <= 2, "absLevel check failure\n"); | |
67 | rate += greaterOneBits[(absLevel == 2)]; | |
68 | ||
69 | if (absLevel == 2) | |
70 | rate += levelAbsBits[0]; | |
71 | } | |
72 | else | |
73 | { | |
74 | uint32_t symbol = diffLevel; | |
75 | const uint32_t maxVlc = g_goRiceRange[absGoRice]; | |
76 | bool expGolomb = (symbol > maxVlc); | |
77 | ||
78 | if (expGolomb) | |
79 | { | |
80 | absLevel = symbol - maxVlc; | |
81 | ||
82 | // NOTE: mapping to x86 hardware instruction BSR | |
83 | unsigned long size; | |
b53f7c52 | 84 | CLZ(size, absLevel); |
72b9787e JB |
85 | int egs = size * 2 + 1; |
86 | ||
87 | rate += egs << 15; | |
88 | ||
89 | // NOTE: in here, expGolomb=true means (symbol >= maxVlc + 1) | |
90 | X265_CHECK(fastMin(symbol, (maxVlc + 1)) == (int)maxVlc + 1, "min check failure\n"); | |
91 | symbol = maxVlc + 1; | |
92 | } | |
93 | ||
94 | uint32_t prefLen = (symbol >> absGoRice) + 1; | |
95 | uint32_t numBins = fastMin(prefLen + absGoRice, 8 /* g_goRicePrefixLen[absGoRice] + absGoRice */); | |
96 | ||
97 | rate += numBins << 15; | |
98 | ||
99 | if (c1c2Idx & 1) | |
100 | rate += greaterOneBits[1]; | |
101 | ||
102 | if (c1c2Idx == 3) | |
103 | rate += levelAbsBits[1]; | |
104 | } | |
105 | return rate; | |
106 | } | |
107 | ||
108 | /* Calculates the cost for specific absolute transform level */ | |
b53f7c52 | 109 | inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx) |
72b9787e JB |
110 | { |
111 | X265_CHECK(absLevel, "absLevel should not be zero\n"); | |
112 | ||
113 | if (diffLevel < 0) | |
114 | { | |
115 | X265_CHECK((absLevel == 1) || (absLevel == 2), "absLevel range check failure\n"); | |
116 | ||
117 | uint32_t rate = greaterOneBits[(absLevel == 2)]; | |
118 | if (absLevel == 2) | |
119 | rate += levelAbsBits[0]; | |
120 | return rate; | |
121 | } | |
122 | else | |
123 | { | |
124 | uint32_t rate; | |
125 | uint32_t symbol = diffLevel; | |
126 | if ((symbol >> absGoRice) < COEF_REMAIN_BIN_REDUCTION) | |
127 | { | |
128 | uint32_t length = symbol >> absGoRice; | |
129 | rate = (length + 1 + absGoRice) << 15; | |
130 | } | |
131 | else | |
132 | { | |
133 | uint32_t length = 0; | |
134 | symbol = (symbol >> absGoRice) - COEF_REMAIN_BIN_REDUCTION; | |
135 | if (symbol) | |
136 | { | |
137 | unsigned long idx; | |
b53f7c52 | 138 | CLZ(idx, symbol + 1); |
72b9787e JB |
139 | length = idx; |
140 | } | |
141 | ||
142 | rate = (COEF_REMAIN_BIN_REDUCTION + length + absGoRice + 1 + length) << 15; | |
143 | } | |
144 | if (c1c2Idx & 1) | |
145 | rate += greaterOneBits[1]; | |
146 | if (c1c2Idx == 3) | |
147 | rate += levelAbsBits[1]; | |
148 | return rate; | |
149 | } | |
150 | } | |
151 | ||
152 | } | |
153 | ||
154 | Quant::Quant() | |
155 | { | |
156 | m_resiDctCoeff = NULL; | |
157 | m_fencDctCoeff = NULL; | |
158 | m_fencShortBuf = NULL; | |
159 | m_frameNr = NULL; | |
160 | m_nr = NULL; | |
161 | } | |
162 | ||
163 | bool Quant::init(bool useRDOQ, double psyScale, const ScalingList& scalingList, Entropy& entropy) | |
164 | { | |
165 | m_entropyCoder = &entropy; | |
166 | m_useRDOQ = useRDOQ; | |
167 | m_psyRdoqScale = (int64_t)(psyScale * 256.0); | |
168 | m_scalingList = &scalingList; | |
b53f7c52 | 169 | m_resiDctCoeff = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE * 2); |
72b9787e JB |
170 | m_fencDctCoeff = m_resiDctCoeff + (MAX_TR_SIZE * MAX_TR_SIZE); |
171 | m_fencShortBuf = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE); | |
172 | ||
173 | return m_resiDctCoeff && m_fencShortBuf; | |
174 | } | |
175 | ||
176 | bool Quant::allocNoiseReduction(const x265_param& param) | |
177 | { | |
178 | m_frameNr = X265_MALLOC(NoiseReduction, param.frameNumThreads); | |
179 | if (m_frameNr) | |
180 | memset(m_frameNr, 0, sizeof(NoiseReduction) * param.frameNumThreads); | |
181 | else | |
182 | return false; | |
183 | return true; | |
184 | } | |
185 | ||
186 | Quant::~Quant() | |
187 | { | |
188 | X265_FREE(m_frameNr); | |
189 | X265_FREE(m_resiDctCoeff); | |
190 | X265_FREE(m_fencShortBuf); | |
191 | } | |
192 | ||
193 | void Quant::setQPforQuant(const CUData& ctu) | |
194 | { | |
195 | m_nr = m_frameNr ? &m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL; | |
196 | int qpy = ctu.m_qp[0]; | |
197 | m_qpParam[TEXT_LUMA].setQpParam(qpy + QP_BD_OFFSET); | |
b53f7c52 JB |
198 | setChromaQP(qpy + ctu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat); |
199 | setChromaQP(qpy + ctu.m_slice->m_pps->chromaQpOffset[1], TEXT_CHROMA_V, ctu.m_chromaFormat); | |
72b9787e JB |
200 | } |
201 | ||
202 | void Quant::setChromaQP(int qpin, TextType ttype, int chFmt) | |
203 | { | |
204 | int qp = Clip3(-QP_BD_OFFSET, 57, qpin); | |
205 | if (qp >= 30) | |
206 | { | |
207 | if (chFmt == X265_CSP_I420) | |
208 | qp = g_chromaScale[qp]; | |
209 | else | |
210 | qp = X265_MIN(qp, 51); | |
211 | } | |
212 | m_qpParam[ttype].setQpParam(qp + QP_BD_OFFSET); | |
213 | } | |
214 | ||
215 | /* To minimize the distortion only. No rate is considered */ | |
216 | uint32_t Quant::signBitHidingHDQ(int16_t* coeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codeParams) | |
217 | { | |
218 | const uint32_t log2TrSizeCG = codeParams.log2TrSizeCG; | |
b53f7c52 | 219 | const uint16_t* scan = codeParams.scan; |
72b9787e JB |
220 | bool lastCG = true; |
221 | ||
222 | for (int cg = (1 << (log2TrSizeCG * 2)) - 1; cg >= 0; cg--) | |
223 | { | |
224 | int cgStartPos = cg << LOG2_SCAN_SET_SIZE; | |
225 | int n; | |
226 | ||
227 | for (n = SCAN_SET_SIZE - 1; n >= 0; --n) | |
228 | if (coeff[scan[n + cgStartPos]]) | |
229 | break; | |
230 | if (n < 0) | |
231 | continue; | |
232 | ||
233 | int lastNZPosInCG = n; | |
234 | ||
235 | for (n = 0;; n++) | |
236 | if (coeff[scan[n + cgStartPos]]) | |
237 | break; | |
238 | ||
239 | int firstNZPosInCG = n; | |
240 | ||
241 | if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD) | |
242 | { | |
243 | uint32_t signbit = coeff[scan[cgStartPos + firstNZPosInCG]] > 0 ? 0 : 1; | |
244 | uint32_t absSum = 0; | |
245 | ||
246 | for (n = firstNZPosInCG; n <= lastNZPosInCG; n++) | |
247 | absSum += coeff[scan[n + cgStartPos]]; | |
248 | ||
249 | if (signbit != (absSum & 0x1)) // compare signbit with sum_parity | |
250 | { | |
251 | int minCostInc = MAX_INT, minPos = -1, curCost = MAX_INT; | |
252 | int16_t finalChange = 0, curChange = 0; | |
253 | ||
254 | for (n = (lastCG ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n) | |
255 | { | |
256 | uint32_t blkPos = scan[n + cgStartPos]; | |
257 | if (coeff[blkPos]) | |
258 | { | |
259 | if (deltaU[blkPos] > 0) | |
260 | { | |
261 | curCost = -deltaU[blkPos]; | |
262 | curChange = 1; | |
263 | } | |
264 | else | |
265 | { | |
266 | if (n == firstNZPosInCG && abs(coeff[blkPos]) == 1) | |
267 | curCost = MAX_INT; | |
268 | else | |
269 | { | |
270 | curCost = deltaU[blkPos]; | |
271 | curChange = -1; | |
272 | } | |
273 | } | |
274 | } | |
275 | else | |
276 | { | |
277 | if (n < firstNZPosInCG) | |
278 | { | |
279 | uint32_t thisSignBit = m_resiDctCoeff[blkPos] >= 0 ? 0 : 1; | |
280 | if (thisSignBit != signbit) | |
281 | curCost = MAX_INT; | |
282 | else | |
283 | { | |
284 | curCost = -deltaU[blkPos]; | |
285 | curChange = 1; | |
286 | } | |
287 | } | |
288 | else | |
289 | { | |
290 | curCost = -deltaU[blkPos]; | |
291 | curChange = 1; | |
292 | } | |
293 | } | |
294 | ||
295 | if (curCost < minCostInc) | |
296 | { | |
297 | minCostInc = curCost; | |
298 | finalChange = curChange; | |
299 | minPos = blkPos; | |
300 | } | |
301 | } | |
302 | ||
303 | /* do not allow change to violate coeff clamp */ | |
304 | if (coeff[minPos] == 32767 || coeff[minPos] == -32768) | |
305 | finalChange = -1; | |
306 | ||
307 | if (!coeff[minPos]) | |
308 | numSig++; | |
309 | else if (finalChange == -1 && abs(coeff[minPos]) == 1) | |
310 | numSig--; | |
311 | ||
312 | if (m_resiDctCoeff[minPos] >= 0) | |
313 | coeff[minPos] += finalChange; | |
314 | else | |
315 | coeff[minPos] -= finalChange; | |
316 | } | |
317 | } | |
318 | ||
319 | lastCG = false; | |
320 | } | |
321 | ||
322 | return numSig; | |
323 | } | |
324 | ||
b53f7c52 | 325 | uint32_t Quant::transformNxN(const CUData& cu, const pixel* fenc, uint32_t fencStride, const int16_t* residual, uint32_t resiStride, |
72b9787e JB |
326 | coeff_t* coeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool useTransformSkip) |
327 | { | |
b53f7c52 | 328 | const uint32_t sizeIdx = log2TrSize - 2; |
72b9787e JB |
329 | if (cu.m_tqBypass[absPartIdx]) |
330 | { | |
331 | X265_CHECK(log2TrSize >= 2 && log2TrSize <= 5, "Block size mistake!\n"); | |
b53f7c52 | 332 | return primitives.copy_cnt[sizeIdx](coeff, residual, resiStride); |
72b9787e JB |
333 | } |
334 | ||
335 | bool isLuma = ttype == TEXT_LUMA; | |
336 | bool usePsy = m_psyRdoqScale && isLuma && !useTransformSkip; | |
72b9787e | 337 | int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; // Represents scaling through forward transform |
72b9787e JB |
338 | |
339 | X265_CHECK((cu.m_slice->m_sps->quadtreeTULog2MaxSize >= log2TrSize), "transform size too large\n"); | |
340 | if (useTransformSkip) | |
341 | { | |
342 | #if X265_DEPTH <= 10 | |
b53f7c52 JB |
343 | X265_CHECK(transformShift >= 0, "invalid transformShift\n"); |
344 | primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift); | |
72b9787e JB |
345 | #else |
346 | if (transformShift >= 0) | |
b53f7c52 | 347 | primitives.cpy2Dto1D_shl[sizeIdx](m_resiDctCoeff, residual, resiStride, transformShift); |
72b9787e | 348 | else |
b53f7c52 | 349 | primitives.cpy2Dto1D_shr[sizeIdx](m_resiDctCoeff, residual, resiStride, -transformShift); |
72b9787e JB |
350 | #endif |
351 | } | |
352 | else | |
353 | { | |
b53f7c52 | 354 | bool isIntra = cu.isIntra(absPartIdx); |
72b9787e JB |
355 | int useDST = !sizeIdx && isLuma && isIntra; |
356 | int index = DCT_4x4 + sizeIdx - useDST; | |
357 | ||
b53f7c52 | 358 | primitives.dct[index](residual, m_resiDctCoeff, resiStride); |
72b9787e JB |
359 | |
360 | /* NOTE: if RDOQ is disabled globally, psy-rdoq is also disabled, so | |
361 | * there is no risk of performing this DCT unnecessarily */ | |
362 | if (usePsy) | |
363 | { | |
b53f7c52 | 364 | int trSize = 1 << log2TrSize; |
72b9787e | 365 | /* perform DCT on source pixels for psy-rdoq */ |
b53f7c52 | 366 | primitives.luma_copy_ps[sizeIdx](m_fencShortBuf, trSize, fenc, fencStride); |
72b9787e JB |
367 | primitives.dct[index](m_fencShortBuf, m_fencDctCoeff, trSize); |
368 | } | |
369 | ||
b53f7c52 | 370 | if (m_nr) |
72b9787e JB |
371 | { |
372 | /* denoise is not applied to intra residual, so DST can be ignored */ | |
b53f7c52 | 373 | int cat = sizeIdx + 4 * !isLuma + 8 * !isIntra; |
72b9787e JB |
374 | int numCoeff = 1 << (log2TrSize * 2); |
375 | primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff); | |
376 | m_nr->count[cat]++; | |
377 | } | |
378 | } | |
379 | ||
380 | if (m_useRDOQ) | |
381 | return rdoQuant(cu, coeff, log2TrSize, ttype, absPartIdx, usePsy); | |
382 | else | |
383 | { | |
384 | int deltaU[32 * 32]; | |
385 | ||
386 | int scalingListType = ttype + (isLuma ? 3 : 0); | |
387 | int rem = m_qpParam[ttype].rem; | |
388 | int per = m_qpParam[ttype].per; | |
b53f7c52 | 389 | const int32_t* quantCoeff = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem]; |
72b9787e JB |
390 | |
391 | int qbits = QUANT_SHIFT + per + transformShift; | |
392 | int add = (cu.m_slice->m_sliceType == I_SLICE ? 171 : 85) << (qbits - 9); | |
393 | int numCoeff = 1 << (log2TrSize * 2); | |
394 | ||
395 | uint32_t numSig = primitives.quant(m_resiDctCoeff, quantCoeff, deltaU, coeff, qbits, add, numCoeff); | |
396 | ||
397 | if (numSig >= 2 && cu.m_slice->m_pps->bSignHideEnabled) | |
398 | { | |
399 | TUEntropyCodingParameters codeParams; | |
400 | cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, isLuma); | |
401 | return signBitHidingHDQ(coeff, deltaU, numSig, codeParams); | |
402 | } | |
403 | else | |
404 | return numSig; | |
405 | } | |
406 | } | |
407 | ||
b53f7c52 | 408 | void Quant::invtransformNxN(bool transQuantBypass, int16_t* residual, uint32_t resiStride, const coeff_t* coeff, |
72b9787e JB |
409 | uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig) |
410 | { | |
b53f7c52 | 411 | const uint32_t sizeIdx = log2TrSize - 2; |
72b9787e JB |
412 | if (transQuantBypass) |
413 | { | |
b53f7c52 | 414 | primitives.cpy1Dto2D_shl[sizeIdx](residual, coeff, resiStride, 0); |
72b9787e JB |
415 | return; |
416 | } | |
417 | ||
418 | // Values need to pass as input parameter in dequant | |
419 | int rem = m_qpParam[ttype].rem; | |
420 | int per = m_qpParam[ttype].per; | |
421 | int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; | |
422 | int shift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift; | |
423 | int numCoeff = 1 << (log2TrSize * 2); | |
424 | ||
425 | if (m_scalingList->m_bEnabled) | |
426 | { | |
427 | int scalingListType = (bIntra ? 0 : 3) + ttype; | |
b53f7c52 | 428 | const int32_t* dequantCoef = m_scalingList->m_dequantCoef[sizeIdx][scalingListType][rem]; |
72b9787e JB |
429 | primitives.dequant_scaling(coeff, dequantCoef, m_resiDctCoeff, numCoeff, per, shift); |
430 | } | |
431 | else | |
432 | { | |
433 | int scale = m_scalingList->s_invQuantScales[rem] << per; | |
434 | primitives.dequant_normal(coeff, m_resiDctCoeff, numCoeff, scale, shift); | |
435 | } | |
436 | ||
437 | if (useTransformSkip) | |
438 | { | |
72b9787e | 439 | #if X265_DEPTH <= 10 |
b53f7c52 JB |
440 | X265_CHECK(transformShift > 0, "invalid transformShift\n"); |
441 | primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift); | |
72b9787e JB |
442 | #else |
443 | if (transformShift > 0) | |
b53f7c52 | 444 | primitives.cpy1Dto2D_shr[sizeIdx](residual, m_resiDctCoeff, resiStride, transformShift); |
72b9787e | 445 | else |
b53f7c52 | 446 | primitives.cpy1Dto2D_shl[sizeIdx](residual, m_resiDctCoeff, resiStride, -transformShift); |
72b9787e JB |
447 | #endif |
448 | } | |
449 | else | |
450 | { | |
72b9787e JB |
451 | int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra; |
452 | ||
453 | X265_CHECK((int)numSig == primitives.count_nonzero(coeff, 1 << (log2TrSize * 2)), "numSig differ\n"); | |
454 | ||
455 | // DC only | |
456 | if (numSig == 1 && coeff[0] != 0 && !useDST) | |
457 | { | |
b53f7c52 | 458 | const int shift_1st = 7 - 6; |
72b9787e | 459 | const int add_1st = 1 << (shift_1st - 1); |
b53f7c52 | 460 | const int shift_2nd = 12 - (X265_DEPTH - 8) - 3; |
72b9787e JB |
461 | const int add_2nd = 1 << (shift_2nd - 1); |
462 | ||
b53f7c52 JB |
463 | int dc_val = (((m_resiDctCoeff[0] * (64 >> 6) + add_1st) >> shift_1st) * (64 >> 3) + add_2nd) >> shift_2nd; |
464 | primitives.blockfill_s[sizeIdx](residual, resiStride, (int16_t)dc_val); | |
72b9787e JB |
465 | return; |
466 | } | |
467 | ||
b53f7c52 | 468 | primitives.idct[IDCT_4x4 + sizeIdx - useDST](m_resiDctCoeff, residual, resiStride); |
72b9787e JB |
469 | } |
470 | } | |
471 | ||
472 | /* Rate distortion optimized quantization for entropy coding engines using | |
473 | * probability models like CABAC */ | |
b53f7c52 | 474 | uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy) |
72b9787e JB |
475 | { |
476 | int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */ | |
477 | int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype; | |
478 | ||
479 | X265_CHECK(scalingListType < 6, "scaling list type out of range\n"); | |
480 | ||
481 | int rem = m_qpParam[ttype].rem; | |
482 | int per = m_qpParam[ttype].per; | |
483 | int qbits = QUANT_SHIFT + per + transformShift; /* Right shift of non-RDOQ quantizer level = (coeff*Q + offset)>>q_bits */ | |
484 | int add = (1 << (qbits - 1)); | |
b53f7c52 | 485 | const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem]; |
72b9787e JB |
486 | |
487 | int numCoeff = 1 << (log2TrSize * 2); | |
488 | ||
489 | uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff); | |
490 | ||
491 | X265_CHECK((int)numSig == primitives.count_nonzero(dstCoeff, 1 << (log2TrSize * 2)), "numSig differ\n"); | |
492 | if (!numSig) | |
493 | return 0; | |
494 | ||
495 | uint32_t trSize = 1 << log2TrSize; | |
496 | int64_t lambda2 = m_qpParam[ttype].lambda2; | |
497 | int64_t psyScale = (m_psyRdoqScale * m_qpParam[ttype].lambda); | |
498 | ||
499 | /* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4) | |
500 | * scale applied that must be removed during unquant. Note that in real dequant there is clipping | |
501 | * at several stages. We skip the clipping for simplicity when measuring RD cost */ | |
b53f7c52 | 502 | const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem]; |
72b9787e JB |
503 | int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0); |
504 | int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0; | |
505 | int scaleBits = SCALE_BITS - 2 * transformShift; | |
506 | ||
507 | #define UNQUANT(lvl) (((lvl) * (unquantScale[blkPos] << per) + unquantRound) >> unquantShift) | |
508 | #define SIGCOST(bits) ((lambda2 * (bits)) >> 8) | |
509 | #define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits)) | |
510 | #define PSYVALUE(rec) ((psyScale * (rec)) >> (16 - scaleBits)) | |
511 | ||
512 | int64_t costCoeff[32 * 32]; /* d*d + lambda * bits */ | |
513 | int64_t costUncoded[32 * 32]; /* d*d + lambda * 0 */ | |
514 | int64_t costSig[32 * 32]; /* lambda * bits */ | |
515 | ||
516 | int rateIncUp[32 * 32]; /* signal overhead of increasing level */ | |
517 | int rateIncDown[32 * 32]; /* signal overhead of decreasing level */ | |
518 | int sigRateDelta[32 * 32]; /* signal difference between zero and non-zero */ | |
519 | ||
520 | int64_t costCoeffGroupSig[MLS_GRP_NUM]; /* lambda * bits of group coding cost */ | |
521 | uint64_t sigCoeffGroupFlag64 = 0; | |
522 | ||
523 | uint32_t ctxSet = 0; | |
524 | int c1 = 1; | |
525 | int c2 = 0; | |
526 | uint32_t goRiceParam = 0; | |
527 | uint32_t c1Idx = 0; | |
528 | uint32_t c2Idx = 0; | |
529 | int cgLastScanPos = -1; | |
530 | int lastScanPos = -1; | |
531 | const uint32_t cgSize = (1 << MLS_CG_SIZE); /* 4x4 num coef = 16 */ | |
532 | bool bIsLuma = ttype == TEXT_LUMA; | |
533 | ||
534 | /* total rate distortion cost of transform block, as CBF=0 */ | |
535 | int64_t totalUncodedCost = 0; | |
536 | ||
537 | /* Total rate distortion cost of this transform block, counting te distortion of uncoded blocks, | |
538 | * the distortion and signal cost of coded blocks, and the coding cost of significant | |
539 | * coefficient and coefficient group bitmaps */ | |
540 | int64_t totalRdCost = 0; | |
541 | ||
542 | TUEntropyCodingParameters codeParams; | |
543 | cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma); | |
544 | const uint32_t cgNum = 1 << (codeParams.log2TrSizeCG * 2); | |
545 | ||
546 | /* TODO: update bit estimates if dirty */ | |
547 | EstBitsSbac& estBitsSbac = m_entropyCoder->m_estBitsSbac; | |
548 | ||
549 | uint32_t scanPos; | |
550 | coeffGroupRDStats cgRdStats; | |
551 | ||
552 | /* iterate over coding groups in reverse scan order */ | |
553 | for (int cgScanPos = cgNum - 1; cgScanPos >= 0; cgScanPos--) | |
554 | { | |
555 | const uint32_t cgBlkPos = codeParams.scanCG[cgScanPos]; | |
556 | const uint32_t cgPosY = cgBlkPos >> codeParams.log2TrSizeCG; | |
557 | const uint32_t cgPosX = cgBlkPos - (cgPosY << codeParams.log2TrSizeCG); | |
558 | const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos); | |
559 | memset(&cgRdStats, 0, sizeof(coeffGroupRDStats)); | |
560 | ||
561 | const int patternSigCtx = calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, codeParams.log2TrSizeCG); | |
562 | ||
563 | /* iterate over coefficients in each group in reverse scan order */ | |
564 | for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--) | |
565 | { | |
566 | scanPos = (cgScanPos << MLS_CG_SIZE) + scanPosinCG; | |
567 | uint32_t blkPos = codeParams.scan[scanPos]; | |
568 | uint16_t maxAbsLevel = (int16_t)abs(dstCoeff[blkPos]); /* abs(quantized coeff) */ | |
569 | int signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */ | |
570 | int predictedCoef = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/ | |
571 | ||
572 | /* RDOQ measures distortion as the squared difference between the unquantized coded level | |
573 | * and the original DCT coefficient. The result is shifted scaleBits to account for the | |
574 | * FIX15 nature of the CABAC cost tables minus the forward transform scale */ | |
575 | ||
576 | /* cost of not coding this coefficient (all distortion, no signal bits) */ | |
577 | costUncoded[scanPos] = (int64_t)(signCoef * signCoef) << scaleBits; | |
578 | if (usePsy && blkPos) | |
579 | /* when no residual coefficient is coded, predicted coef == recon coef */ | |
580 | costUncoded[scanPos] -= PSYVALUE(predictedCoef); | |
581 | ||
582 | totalUncodedCost += costUncoded[scanPos]; | |
583 | ||
584 | if (maxAbsLevel && lastScanPos < 0) | |
585 | { | |
586 | /* remember the first non-zero coef found in this reverse scan as the last pos */ | |
587 | lastScanPos = scanPos; | |
588 | ctxSet = (scanPos < SCAN_SET_SIZE || !bIsLuma) ? 0 : 2; | |
589 | cgLastScanPos = cgScanPos; | |
590 | } | |
591 | ||
592 | if (lastScanPos < 0) | |
593 | { | |
594 | /* coefficients after lastNZ have no distortion signal cost */ | |
595 | costCoeff[scanPos] = 0; | |
596 | costSig[scanPos] = 0; | |
597 | ||
598 | /* No non-zero coefficient yet found, but this does not mean | |
599 | * there is no uncoded-cost for this coefficient. Pre- | |
600 | * quantization the coefficient may have been non-zero */ | |
601 | totalRdCost += costUncoded[scanPos]; | |
602 | } | |
603 | else | |
604 | { | |
605 | const uint32_t c1c2Idx = ((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)) + (((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1) * 2; | |
606 | const uint32_t baseLevel = ((uint32_t)0xD9 >> (c1c2Idx * 2)) & 3; // {1, 2, 1, 3} | |
607 | ||
608 | X265_CHECK(!!((int)c1Idx < C1FLAG_NUMBER) == (int)((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)), "scan validation 1\n"); | |
609 | X265_CHECK(!!(c2Idx == 0) == ((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1, "scan validation 2\n"); | |
610 | X265_CHECK((int)baseLevel == ((c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx == 0)) : 1), "scan validation 3\n"); | |
611 | ||
612 | // coefficient level estimation | |
613 | const uint32_t oneCtx = 4 * ctxSet + c1; | |
614 | const uint32_t absCtx = ctxSet + c2; | |
b53f7c52 JB |
615 | const int* greaterOneBits = estBitsSbac.greaterOneBits[oneCtx]; |
616 | const int* levelAbsBits = estBitsSbac.levelAbsBits[absCtx]; | |
72b9787e JB |
617 | |
618 | uint16_t level = 0; | |
619 | uint32_t sigCoefBits = 0; | |
620 | costCoeff[scanPos] = MAX_INT64; | |
621 | ||
622 | if ((int)scanPos == lastScanPos) | |
623 | sigRateDelta[blkPos] = 0; | |
624 | else | |
625 | { | |
626 | const uint32_t ctxSig = getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext); | |
627 | if (maxAbsLevel < 3) | |
628 | { | |
629 | /* set default costs to uncoded costs */ | |
630 | costSig[scanPos] = SIGCOST(estBitsSbac.significantBits[ctxSig][0]); | |
631 | costCoeff[scanPos] = costUncoded[scanPos] + costSig[scanPos]; | |
632 | } | |
633 | sigRateDelta[blkPos] = estBitsSbac.significantBits[ctxSig][1] - estBitsSbac.significantBits[ctxSig][0]; | |
634 | sigCoefBits = estBitsSbac.significantBits[ctxSig][1]; | |
635 | } | |
636 | if (maxAbsLevel) | |
637 | { | |
638 | uint16_t minAbsLevel = X265_MAX(maxAbsLevel - 1, 1); | |
639 | for (uint16_t lvl = maxAbsLevel; lvl >= minAbsLevel; lvl--) | |
640 | { | |
641 | uint32_t levelBits = getICRateCost(lvl, lvl - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx) + IEP_RATE; | |
642 | ||
643 | int unquantAbsLevel = UNQUANT(lvl); | |
644 | int d = abs(signCoef) - unquantAbsLevel; | |
645 | int64_t curCost = RDCOST(d, sigCoefBits + levelBits); | |
646 | ||
647 | /* Psy RDOQ: bias in favor of higher AC coefficients in the reconstructed frame */ | |
648 | if (usePsy && blkPos) | |
649 | { | |
650 | int reconCoef = abs(unquantAbsLevel + SIGN(predictedCoef, signCoef)); | |
651 | curCost -= PSYVALUE(reconCoef); | |
652 | } | |
653 | ||
654 | if (curCost < costCoeff[scanPos]) | |
655 | { | |
656 | level = lvl; | |
657 | costCoeff[scanPos] = curCost; | |
658 | costSig[scanPos] = SIGCOST(sigCoefBits); | |
659 | } | |
660 | } | |
661 | } | |
662 | ||
663 | dstCoeff[blkPos] = level; | |
664 | totalRdCost += costCoeff[scanPos]; | |
665 | ||
666 | /* record costs for sign-hiding performed at the end */ | |
667 | if (level) | |
668 | { | |
669 | int rateNow = getICRate(level, level - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx); | |
670 | rateIncUp[blkPos] = getICRate(level + 1, level + 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx) - rateNow; | |
671 | rateIncDown[blkPos] = getICRate(level - 1, level - 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx) - rateNow; | |
672 | } | |
673 | else | |
674 | { | |
675 | rateIncUp[blkPos] = greaterOneBits[0]; | |
676 | rateIncDown[blkPos] = 0; | |
677 | } | |
678 | ||
679 | /* Update CABAC estimation state */ | |
680 | if (level >= baseLevel && goRiceParam < 4 && level > (3U << goRiceParam)) | |
681 | goRiceParam++; | |
682 | ||
683 | c1Idx -= (-(int32_t)level) >> 31; | |
684 | ||
685 | /* update bin model */ | |
686 | if (level > 1) | |
687 | { | |
688 | c1 = 0; | |
689 | c2 += (uint32_t)(c2 - 2) >> 31; | |
690 | c2Idx++; | |
691 | } | |
692 | else if ((c1 < 3) && (c1 > 0) && level) | |
693 | c1++; | |
694 | ||
695 | /* context set update */ | |
696 | if (!(scanPos % SCAN_SET_SIZE) && scanPos) | |
697 | { | |
698 | c2 = 0; | |
699 | goRiceParam = 0; | |
700 | ||
701 | c1Idx = 0; | |
702 | c2Idx = 0; | |
703 | ctxSet = (scanPos == SCAN_SET_SIZE || !bIsLuma) ? 0 : 2; | |
704 | X265_CHECK(c1 >= 0, "c1 is negative\n"); | |
705 | ctxSet -= ((int32_t)(c1 - 1) >> 31); | |
706 | c1 = 1; | |
707 | } | |
708 | } | |
709 | ||
710 | cgRdStats.sigCost += costSig[scanPos]; | |
711 | if (!scanPosinCG) | |
712 | cgRdStats.sigCost0 = costSig[scanPos]; | |
713 | ||
714 | if (dstCoeff[blkPos]) | |
715 | { | |
716 | sigCoeffGroupFlag64 |= cgBlkPosMask; | |
717 | cgRdStats.codedLevelAndDist += costCoeff[scanPos] - costSig[scanPos]; | |
718 | cgRdStats.uncodedDist += costUncoded[scanPos]; | |
719 | cgRdStats.nnzBeforePos0 += scanPosinCG; | |
720 | } | |
721 | } /* end for (scanPosinCG) */ | |
722 | ||
723 | costCoeffGroupSig[cgScanPos] = 0; | |
724 | ||
725 | if (cgLastScanPos < 0) | |
726 | { | |
727 | /* nothing to do at this point */ | |
728 | } | |
729 | else if (!cgScanPos || cgScanPos == cgLastScanPos) | |
730 | { | |
731 | /* coeff group 0 is implied to be present, no signal cost */ | |
732 | /* coeff group with last NZ is implied to be present, handled below */ | |
733 | } | |
734 | else if (sigCoeffGroupFlag64 & cgBlkPosMask) | |
735 | { | |
736 | if (!cgRdStats.nnzBeforePos0) | |
737 | { | |
738 | /* if only coeff 0 in this CG is coded, its significant coeff bit is implied */ | |
739 | totalRdCost -= cgRdStats.sigCost0; | |
740 | cgRdStats.sigCost -= cgRdStats.sigCost0; | |
741 | } | |
742 | ||
743 | /* there are coded coefficients in this group, but now we include the signaling cost | |
744 | * of the significant coefficient group flag and evaluate whether the RD cost of the | |
745 | * coded group is more than the RD cost of the uncoded group */ | |
746 | ||
747 | uint32_t sigCtx = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, codeParams.log2TrSizeCG); | |
748 | ||
749 | int64_t costZeroCG = totalRdCost + SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]); | |
750 | costZeroCG += cgRdStats.uncodedDist; /* add distortion for resetting non-zero levels to zero levels */ | |
751 | costZeroCG -= cgRdStats.codedLevelAndDist; /* remove distortion and level cost of coded coefficients */ | |
752 | costZeroCG -= cgRdStats.sigCost; /* remove signaling cost of significant coeff bitmap */ | |
753 | ||
754 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][1]); | |
755 | totalRdCost += costCoeffGroupSig[cgScanPos]; /* add the cost of 1 bit in significant CG bitmap */ | |
756 | ||
757 | if (costZeroCG < totalRdCost) | |
758 | { | |
759 | sigCoeffGroupFlag64 &= ~cgBlkPosMask; | |
760 | totalRdCost = costZeroCG; | |
761 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][0]); | |
762 | ||
763 | /* reset all coeffs to 0. UNCODE THIS COEFF GROUP! */ | |
764 | for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--) | |
765 | { | |
766 | scanPos = cgScanPos * cgSize + scanPosinCG; | |
767 | uint32_t blkPos = codeParams.scan[scanPos]; | |
768 | if (dstCoeff[blkPos]) | |
769 | { | |
770 | costCoeff[scanPos] = costUncoded[scanPos]; | |
771 | costSig[scanPos] = 0; | |
772 | } | |
773 | dstCoeff[blkPos] = 0; | |
774 | } | |
775 | } | |
776 | } | |
777 | else | |
778 | { | |
779 | /* there were no coded coefficients in this coefficient group */ | |
780 | uint32_t ctxSig = getSigCoeffGroupCtxInc(sigCoeffGroupFlag64, cgPosX, cgPosY, codeParams.log2TrSizeCG); | |
781 | costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[ctxSig][0]); | |
782 | totalRdCost += costCoeffGroupSig[cgScanPos]; /* add cost of 0 bit in significant CG bitmap */ | |
783 | totalRdCost -= cgRdStats.sigCost; /* remove cost of significant coefficient bitmap */ | |
784 | } | |
785 | } /* end for (cgScanPos) */ | |
786 | ||
787 | X265_CHECK(lastScanPos >= 0, "numSig non zero, but no coded CG\n"); | |
788 | ||
789 | /* calculate RD cost of uncoded block CBF=0, and add cost of CBF=1 to total */ | |
790 | int64_t bestCost; | |
791 | if (!cu.isIntra(absPartIdx) && bIsLuma && !cu.m_tuDepth[absPartIdx]) | |
792 | { | |
793 | bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockRootCbpBits[0]); | |
794 | totalRdCost += SIGCOST(estBitsSbac.blockRootCbpBits[1]); | |
795 | } | |
796 | else | |
797 | { | |
798 | int ctx = ctxCbf[ttype][cu.m_tuDepth[absPartIdx]]; | |
799 | bestCost = totalUncodedCost + SIGCOST(estBitsSbac.blockCbpBits[ctx][0]); | |
800 | totalRdCost += SIGCOST(estBitsSbac.blockCbpBits[ctx][1]); | |
801 | } | |
802 | ||
803 | /* This loop starts with the last non-zero found in the first loop and then refines this last | |
804 | * non-zero by measuring the true RD cost of the last NZ at this position, and then the RD costs | |
805 | * at all previous coefficients until a coefficient greater than 1 is encountered or we run out | |
806 | * of coefficients to evaluate. This will factor in the cost of coding empty groups and empty | |
807 | * coeff prior to the last NZ. The base best cost is the RD cost of CBF=0 */ | |
808 | int bestLastIdx = 0; | |
809 | bool foundLast = false; | |
810 | for (int cgScanPos = cgLastScanPos; cgScanPos >= 0 && !foundLast; cgScanPos--) | |
811 | { | |
812 | if (!cgScanPos || cgScanPos == cgLastScanPos) | |
813 | { | |
814 | /* the presence of these coefficient groups are inferred, they have no bit in | |
815 | * sigCoeffGroupFlag64 and no saved costCoeffGroupSig[] cost */ | |
816 | } | |
817 | else if (sigCoeffGroupFlag64 & (1ULL << codeParams.scanCG[cgScanPos])) | |
818 | { | |
819 | /* remove cost of significant coeff group flag, the group's presence would be inferred | |
820 | * from lastNZ if it were present in this group */ | |
821 | totalRdCost -= costCoeffGroupSig[cgScanPos]; | |
822 | } | |
823 | else | |
824 | { | |
825 | /* remove cost of signaling this empty group as not present */ | |
826 | totalRdCost -= costCoeffGroupSig[cgScanPos]; | |
827 | continue; | |
828 | } | |
829 | ||
830 | for (int scanPosinCG = cgSize - 1; scanPosinCG >= 0; scanPosinCG--) | |
831 | { | |
832 | scanPos = cgScanPos * cgSize + scanPosinCG; | |
833 | if ((int)scanPos > lastScanPos) | |
834 | continue; | |
835 | ||
836 | /* if the coefficient was coded, measure the RD cost of it as the last non-zero and then | |
837 | * continue as if it were uncoded. If the coefficient was already uncoded, remove the | |
838 | * cost of signaling it as not-significant */ | |
839 | uint32_t blkPos = codeParams.scan[scanPos]; | |
840 | if (dstCoeff[blkPos]) | |
841 | { | |
842 | /* Swap the cost of signaling its significant coeff bit with the cost of | |
843 | * signaling its lastNZ pos */ | |
844 | uint32_t posY = blkPos >> log2TrSize; | |
845 | uint32_t posX = blkPos - (posY << log2TrSize); | |
846 | uint32_t bitsLastNZ = codeParams.scanType == SCAN_VER ? getRateLast(posY, posX) : getRateLast(posX, posY); | |
847 | int64_t costAsLast = totalRdCost - costSig[scanPos] + SIGCOST(bitsLastNZ); | |
848 | ||
849 | if (costAsLast < bestCost) | |
850 | { | |
851 | bestLastIdx = scanPos + 1; | |
852 | bestCost = costAsLast; | |
853 | } | |
854 | if (dstCoeff[blkPos] > 1) | |
855 | { | |
856 | foundLast = true; | |
857 | break; | |
858 | } | |
859 | ||
860 | totalRdCost -= costCoeff[scanPos]; | |
861 | totalRdCost += costUncoded[scanPos]; | |
862 | } | |
863 | else | |
864 | totalRdCost -= costSig[scanPos]; | |
865 | } | |
866 | } | |
867 | ||
868 | /* recount non-zero coefficients and re-apply sign of DCT coef */ | |
869 | numSig = 0; | |
870 | for (int pos = 0; pos < bestLastIdx; pos++) | |
871 | { | |
872 | int blkPos = codeParams.scan[pos]; | |
873 | int level = dstCoeff[blkPos]; | |
874 | numSig += (level != 0); | |
875 | ||
876 | uint32_t mask = (int32_t)m_resiDctCoeff[blkPos] >> 31; | |
877 | dstCoeff[blkPos] = (int16_t)((level ^ mask) - mask); | |
878 | } | |
879 | ||
880 | /* clean uncoded coefficients */ | |
881 | for (int pos = bestLastIdx; pos <= lastScanPos; pos++) | |
882 | dstCoeff[codeParams.scan[pos]] = 0; | |
883 | ||
884 | /* rate-distortion based sign-hiding */ | |
885 | if (cu.m_slice->m_pps->bSignHideEnabled && numSig >= 2) | |
886 | { | |
887 | int lastCG = true; | |
888 | for (int subSet = cgLastScanPos; subSet >= 0; subSet--) | |
889 | { | |
890 | int subPos = subSet << LOG2_SCAN_SET_SIZE; | |
891 | int n; | |
892 | ||
893 | /* measure distance between first and last non-zero coef in this | |
894 | * coding group */ | |
895 | for (n = SCAN_SET_SIZE - 1; n >= 0; --n) | |
896 | if (dstCoeff[codeParams.scan[n + subPos]]) | |
897 | break; | |
898 | if (n < 0) | |
899 | continue; | |
900 | ||
901 | int lastNZPosInCG = n; | |
902 | ||
903 | for (n = 0;; n++) | |
904 | if (dstCoeff[codeParams.scan[n + subPos]]) | |
905 | break; | |
906 | ||
907 | int firstNZPosInCG = n; | |
908 | ||
909 | if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD) | |
910 | { | |
911 | uint32_t signbit = (dstCoeff[codeParams.scan[subPos + firstNZPosInCG]] > 0 ? 0 : 1); | |
912 | int absSum = 0; | |
913 | ||
914 | for (n = firstNZPosInCG; n <= lastNZPosInCG; n++) | |
915 | absSum += dstCoeff[codeParams.scan[n + subPos]]; | |
916 | ||
917 | if (signbit != (absSum & 1U)) | |
918 | { | |
919 | /* We must find a coeff to toggle up or down so the sign bit of the first non-zero coeff | |
920 | * is properly implied. Note dstCoeff[] are signed by this point but curChange and | |
921 | * finalChange imply absolute levels (+1 is away from zero, -1 is towards zero) */ | |
922 | ||
923 | int64_t minCostInc = MAX_INT64, curCost = MAX_INT64; | |
924 | int minPos = -1; | |
925 | int16_t finalChange = 0, curChange = 0; | |
926 | ||
927 | for (n = (lastCG ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n) | |
928 | { | |
929 | uint32_t blkPos = codeParams.scan[n + subPos]; | |
930 | int signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */ | |
931 | int absLevel = abs(dstCoeff[blkPos]); | |
932 | ||
933 | int d = abs(signCoef) - UNQUANT(absLevel); | |
934 | int64_t origDist = (((int64_t)d * d)) << scaleBits; | |
935 | ||
936 | #define DELTARDCOST(d, deltabits) ((((int64_t)d * d) << scaleBits) - origDist + ((lambda2 * (int64_t)(deltabits)) >> 8)) | |
937 | ||
938 | if (dstCoeff[blkPos]) | |
939 | { | |
940 | d = abs(signCoef) - UNQUANT(absLevel + 1); | |
941 | int64_t costUp = DELTARDCOST(d, rateIncUp[blkPos]); | |
942 | ||
943 | /* if decrementing would make the coeff 0, we can include the | |
944 | * significant coeff flag cost savings */ | |
945 | d = abs(signCoef) - UNQUANT(absLevel - 1); | |
946 | bool isOne = abs(dstCoeff[blkPos]) == 1; | |
947 | int downBits = rateIncDown[blkPos] - (isOne ? (IEP_RATE + sigRateDelta[blkPos]) : 0); | |
948 | int64_t costDown = DELTARDCOST(d, downBits); | |
949 | ||
950 | if (lastCG && lastNZPosInCG == n && isOne) | |
951 | costDown -= 4 * IEP_RATE; | |
952 | ||
953 | if (costUp < costDown) | |
954 | { | |
955 | curCost = costUp; | |
956 | curChange = 1; | |
957 | } | |
958 | else | |
959 | { | |
960 | curChange = -1; | |
961 | if (n == firstNZPosInCG && isOne) | |
962 | curCost = MAX_INT64; | |
963 | else | |
964 | curCost = costDown; | |
965 | } | |
966 | } | |
967 | else if (n < firstNZPosInCG && signbit != (signCoef >= 0 ? 0 : 1U)) | |
968 | { | |
969 | /* don't try to make a new coded coeff before the first coeff if its | |
970 | * sign would be different than the first coeff, the inferred sign would | |
971 | * still be wrong and we'd have to do this again. */ | |
972 | curCost = MAX_INT64; | |
973 | } | |
974 | else | |
975 | { | |
976 | /* evaluate changing an uncoded coeff 0 to a coded coeff +/-1 */ | |
977 | d = abs(signCoef) - UNQUANT(1); | |
978 | curCost = DELTARDCOST(d, rateIncUp[blkPos] + IEP_RATE + sigRateDelta[blkPos]); | |
979 | curChange = 1; | |
980 | } | |
981 | ||
982 | if (curCost < minCostInc) | |
983 | { | |
984 | minCostInc = curCost; | |
985 | finalChange = curChange; | |
986 | minPos = blkPos; | |
987 | } | |
988 | } | |
989 | ||
990 | if (dstCoeff[minPos] == 32767 || dstCoeff[minPos] == -32768) | |
991 | /* don't allow sign hiding to violate the SPEC range */ | |
992 | finalChange = -1; | |
993 | ||
994 | if (dstCoeff[minPos] == 0) | |
995 | numSig++; | |
996 | else if (finalChange == -1 && abs(dstCoeff[minPos]) == 1) | |
997 | numSig--; | |
998 | ||
999 | if (m_resiDctCoeff[minPos] >= 0) | |
1000 | dstCoeff[minPos] += finalChange; | |
1001 | else | |
1002 | dstCoeff[minPos] -= finalChange; | |
1003 | } | |
1004 | } | |
1005 | ||
1006 | lastCG = false; | |
1007 | } | |
1008 | } | |
1009 | ||
1010 | return numSig; | |
1011 | } | |
1012 | ||
1013 | /* Pattern decision for context derivation process of significant_coeff_flag */ | |
1014 | uint32_t Quant::calcPatternSigCtx(uint64_t sigCoeffGroupFlag64, uint32_t cgPosX, uint32_t cgPosY, uint32_t log2TrSizeCG) | |
1015 | { | |
1016 | if (!log2TrSizeCG) | |
1017 | return 0; | |
1018 | ||
1019 | const uint32_t trSizeCG = 1 << log2TrSizeCG; | |
1020 | X265_CHECK(trSizeCG <= 8, "transform CG is too large\n"); | |
1021 | const uint32_t sigPos = (uint32_t)(sigCoeffGroupFlag64 >> (1 + (cgPosY << log2TrSizeCG) + cgPosX)); | |
1022 | const uint32_t sigRight = ((int32_t)(cgPosX - (trSizeCG - 1)) >> 31) & (sigPos & 1); | |
1023 | const uint32_t sigLower = ((int32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 2)) & 2; | |
1024 | ||
1025 | return sigRight + sigLower; | |
1026 | } | |
1027 | ||
1028 | /* Context derivation process of coeff_abs_significant_flag */ | |
1029 | uint32_t Quant::getSigCtxInc(uint32_t patternSigCtx, uint32_t log2TrSize, uint32_t trSize, uint32_t blkPos, bool bIsLuma, | |
1030 | uint32_t firstSignificanceMapContext) | |
1031 | { | |
1032 | static const uint8_t ctxIndMap[16] = | |
1033 | { | |
1034 | 0, 1, 4, 5, | |
1035 | 2, 3, 4, 5, | |
1036 | 6, 6, 8, 8, | |
1037 | 7, 7, 8, 8 | |
1038 | }; | |
1039 | ||
1040 | if (!blkPos) // special case for the DC context variable | |
1041 | return 0; | |
1042 | ||
1043 | if (log2TrSize == 2) // 4x4 | |
1044 | return ctxIndMap[blkPos]; | |
1045 | ||
1046 | const uint32_t posY = blkPos >> log2TrSize; | |
1047 | const uint32_t posX = blkPos & (trSize - 1); | |
1048 | X265_CHECK((blkPos - (posY << log2TrSize)) == posX, "block pos check failed\n"); | |
1049 | ||
1050 | int posXinSubset = blkPos & 3; | |
1051 | X265_CHECK((posX & 3) == (blkPos & 3), "pos alignment fail\n"); | |
1052 | int posYinSubset = posY & 3; | |
1053 | ||
1054 | // NOTE: [patternSigCtx][posXinSubset][posYinSubset] | |
1055 | static const uint8_t table_cnt[4][4][4] = | |
1056 | { | |
1057 | // patternSigCtx = 0 | |
1058 | { | |
1059 | { 2, 1, 1, 0 }, | |
1060 | { 1, 1, 0, 0 }, | |
1061 | { 1, 0, 0, 0 }, | |
1062 | { 0, 0, 0, 0 }, | |
1063 | }, | |
1064 | // patternSigCtx = 1 | |
1065 | { | |
1066 | { 2, 1, 0, 0 }, | |
1067 | { 2, 1, 0, 0 }, | |
1068 | { 2, 1, 0, 0 }, | |
1069 | { 2, 1, 0, 0 }, | |
1070 | }, | |
1071 | // patternSigCtx = 2 | |
1072 | { | |
1073 | { 2, 2, 2, 2 }, | |
1074 | { 1, 1, 1, 1 }, | |
1075 | { 0, 0, 0, 0 }, | |
1076 | { 0, 0, 0, 0 }, | |
1077 | }, | |
1078 | // patternSigCtx = 3 | |
1079 | { | |
1080 | { 2, 2, 2, 2 }, | |
1081 | { 2, 2, 2, 2 }, | |
1082 | { 2, 2, 2, 2 }, | |
1083 | { 2, 2, 2, 2 }, | |
1084 | } | |
1085 | }; | |
1086 | ||
1087 | int cnt = table_cnt[patternSigCtx][posXinSubset][posYinSubset]; | |
1088 | int offset = firstSignificanceMapContext; | |
1089 | ||
1090 | offset += cnt; | |
1091 | ||
1092 | return (bIsLuma && (posX | posY) >= 4) ? 3 + offset : offset; | |
1093 | } | |
1094 | ||
1095 | /* Calculates the cost of signaling the last significant coefficient in the block */ | |
1096 | inline uint32_t Quant::getRateLast(uint32_t posx, uint32_t posy) const | |
1097 | { | |
1098 | uint32_t ctxX = getGroupIdx(posx); | |
1099 | uint32_t ctxY = getGroupIdx(posy); | |
1100 | uint32_t cost = m_entropyCoder->m_estBitsSbac.lastXBits[ctxX] + m_entropyCoder->m_estBitsSbac.lastYBits[ctxY]; | |
1101 | ||
1102 | int32_t maskX = (int32_t)(2 - posx) >> 31; | |
1103 | int32_t maskY = (int32_t)(2 - posy) >> 31; | |
1104 | ||
1105 | cost += maskX & (IEP_RATE * ((ctxX - 2) >> 1)); | |
1106 | cost += maskY & (IEP_RATE * ((ctxY - 2) >> 1)); | |
1107 | return cost; | |
1108 | } | |
1109 | ||
1110 | /* Context derivation process of coeff_abs_significant_flag */ | |
1111 | uint32_t Quant::getSigCoeffGroupCtxInc(uint64_t cgGroupMask, uint32_t cgPosX, uint32_t cgPosY, uint32_t log2TrSizeCG) | |
1112 | { | |
1113 | const uint32_t trSizeCG = 1 << log2TrSizeCG; | |
1114 | ||
1115 | const uint32_t sigPos = (uint32_t)(cgGroupMask >> (1 + (cgPosY << log2TrSizeCG) + cgPosX)); | |
1116 | const uint32_t sigRight = ((int32_t)(cgPosX - (trSizeCG - 1)) >> 31) & sigPos; | |
1117 | const uint32_t sigLower = ((int32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 1)); | |
1118 | ||
1119 | return (sigRight | sigLower) & 1; | |
1120 | } |