/*****************************************************************************
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

#include "common.h"
#include "primitives.h"
#include "picyuv.h"
#include "cudata.h"

#include "search.h"
#include "entropy.h"
#include "rdcost.h"

using namespace x265;

#if _MSC_VER
#pragma warning(disable: 4800) // 'uint8_t' : forcing value to bool 'true' or 'false' (performance warning)
#pragma warning(disable: 4244) // '=' : conversion from 'int' to 'uint8_t', possible loss of data
#endif

#define MVP_IDX_BITS 1

ALIGN_VAR_32(const pixel, Search::zeroPixel[MAX_CU_SIZE]) = { 0 };
ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 };

Search::Search() : JobProvider(NULL)
{
    memset(m_rqt, 0, sizeof(m_rqt));

    for (int i = 0; i < 3; i++)
    {
        m_qtTempTransformSkipFlag[i] = NULL;
        m_qtTempCbf[i] = NULL;
    }

    m_numLayers = 0;
    m_param = NULL;
    m_slice = NULL;
    m_frame = NULL;
    m_bJobsQueued = false;
    m_totalNumME = m_numAcquiredME = m_numCompletedME = 0;
}

bool Search::initSearch(const x265_param& param, ScalingList& scalingList)
{
    m_param = &param;
    m_bEnableRDOQ = param.rdLevel >= 4;
    m_bFrameParallel = param.frameNumThreads > 1;
    m_numLayers = g_log2Size[param.maxCUSize] - 2;

    m_rdCost.setPsyRdScale(param.psyRd);
    m_me.init(param.searchMethod, param.subpelRefine, param.internalCsp);

    bool ok = m_quant.init(m_bEnableRDOQ, param.psyRdoq, scalingList, m_entropyCoder);
    if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
        ok &= m_quant.allocNoiseReduction(param);

    ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */

    /* When frame parallelism is active, only 'refLagPixels' of reference frames will be guaranteed
     * available for motion reference. See refLagRows in FrameEncoder::compressCTURows() */
    m_refLagPixels = m_bFrameParallel ? param.searchRange : param.sourceHeight;

    uint32_t sizeL = 1 << (g_maxLog2CUSize * 2);
    uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
    uint32_t numPartitions = NUM_CU_PARTITIONS;

    /* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32
     * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
     * which are reconstructed at each depth are valid. At the end, the transform depth table
     * is walked and the coeff and recon at the correct depths are collected */
    for (uint32_t i = 0; i <= m_numLayers; i++)
    {
        CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
        m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
        m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
        ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
        ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
    }

    /* the rest of these buffers are indexed per-depth */
    for (uint32_t i = 0; i <= g_maxCUDepth; i++)
    {
        int cuSize = g_maxCUSize >> i;
        ok &= m_rqt[i].tmpResiYuv.create(cuSize, param.internalCsp);
        ok &= m_rqt[i].tmpPredYuv.create(cuSize, param.internalCsp);
        ok &= m_rqt[i].bidirPredYuv[0].create(cuSize, param.internalCsp);
        ok &= m_rqt[i].bidirPredYuv[1].create(cuSize, param.internalCsp);
    }

    CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
    m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
    m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
    CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
    m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
    m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;

    return ok;

fail:
    return false;
}
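
/* Allocation sketch (added note, not in the upstream source): CHECKED_MALLOC
 * is used above in the usual x265 pattern of jumping to the fail: label when
 * an allocation fails. As a worked example of the buffer sizing, for a 64x64
 * CTU (g_maxLog2CUSize == 6) with 4:2:0 chroma (both chroma shifts == 1):
 *   sizeL = 1 << (6 * 2)     = 4096 coefficients
 *   sizeC = 4096 >> (1 + 1)  = 1024 coefficients per chroma plane
 * so each RQT layer allocates sizeL + 2 * sizeC = 6144 coeff_t entries. */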

Search::~Search()
{
    for (uint32_t i = 0; i <= m_numLayers; i++)
    {
        X265_FREE(m_rqt[i].coeffRQT[0]);
        m_rqt[i].reconQtYuv.destroy();
        m_rqt[i].resiQtYuv.destroy();
    }

    for (uint32_t i = 0; i <= g_maxCUDepth; i++)
    {
        m_rqt[i].tmpResiYuv.destroy();
        m_rqt[i].tmpPredYuv.destroy();
        m_rqt[i].bidirPredYuv[0].destroy();
        m_rqt[i].bidirPredYuv[1].destroy();
    }

    X265_FREE(m_qtTempCbf[0]);
    X265_FREE(m_qtTempTransformSkipFlag[0]);
}

void Search::setQP(const Slice& slice, int qp)
{
    x265_emms(); /* TODO: if the lambda tables were ints, this would not be necessary */
    m_me.setQP(qp);
    m_rdCost.setQP(slice, qp);
}

#if CHECKED_BUILD || _DEBUG
void Search::invalidateContexts(int fromDepth)
{
    /* catch reads without previous writes */
    for (int d = fromDepth; d < NUM_FULL_DEPTH; d++)
    {
        m_rqt[d].cur.markInvalid();
        m_rqt[d].rqtTemp.markInvalid();
        m_rqt[d].rqtRoot.markInvalid();
        m_rqt[d].rqtTest.markInvalid();
    }
}
#else
void Search::invalidateContexts(int) {}
#endif

void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx)
{
    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
    uint32_t subdiv = tuDepth < cu.m_tuDepth[absPartIdx];
    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;

    if (!(log2TrSize - m_hChromaShift < 2))
    {
        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv);
        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv);
    }

    if (subdiv)
    {
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
            codeSubdivCbfQTChroma(cu, tuDepth + 1, absPartIdx);
    }
}
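
/* Illustrative note (added): with LOG2_UNIT_SIZE == 2 (4x4 partition units),
 * a 16x16 TU (log2TrSize == 4) that subdivides has
 * qNumParts = 1 << ((4 - 1 - 2) * 2) = 4, i.e. each 8x8 quadrant spans four
 * 4x4 partitions, so absPartIdx advances by four per quadrant in the
 * recursion above. */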

void Search::codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype)
{
    if (!cu.getCbf(absPartIdx, ttype, tuDepth))
        return;

    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;

    if (tuDepth < cu.m_tuDepth[absPartIdx])
    {
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
            codeCoeffQTChroma(cu, tuDepth + 1, absPartIdx, ttype);

        return;
    }

    uint32_t tuDepthC = tuDepth;
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;

    if (log2TrSizeC < 2)
    {
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        if (absPartIdx & 3)
            return;
        log2TrSizeC = 2;
        tuDepthC--;
    }

    uint32_t qtLayer = log2TrSize - 2;

    if (m_csp != X265_CSP_I422)
    {
        uint32_t shift = (m_csp == X265_CSP_I420) ? 2 : 0;
        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - shift);
        coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
        m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
    }
    else
    {
        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - 1);
        coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
        uint32_t subTUSize = 1 << (log2TrSizeC * 2);
        uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);
        if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
            m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
        if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
            m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, ttype);
    }
}
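
/* Note on the 4:2:2 path above (added): a chroma TU is twice as tall as it
 * is wide, so it is coded as two square sub-TUs stacked vertically.
 * tuNumParts is the number of 4x4 partitions one square half spans; e.g.
 * for log2TrSizeC == 3 it is 2 << ((3 - 2) * 2) = 8 partitions per half. */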

void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2])
{
    uint32_t fullDepth = mode.cu.m_cuDepth[0] + tuDepth;
    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
    uint32_t qtLayer = log2TrSize - 2;
    uint32_t sizeIdx = log2TrSize - 2;
    bool mightNotSplit = log2TrSize <= depthRange[1];
    bool mightSplit = (log2TrSize > depthRange[0]) && (bAllowSplit || !mightNotSplit);
    /* If maximum RD penalty, force splits at TU size 32x32 if the SPS allows TUs of 16x16 */
    if (m_param->rdPenalty == 2 && m_slice->m_sliceType != I_SLICE && log2TrSize == 5 && depthRange[0] <= 4)
    {
        mightNotSplit = false;
        mightSplit = true;
    }

    CUData& cu = mode.cu;

    Cost fullCost;
    uint32_t bCBF = 0;

    pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
    uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;

    if (mightNotSplit)
    {
        if (mightSplit)
            m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);

        const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
        pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
        int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
        uint32_t stride = mode.fencYuv->m_size;

        // init availability pattern
        uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
        initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);

        // get prediction signal
        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);

        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);

        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
        coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;

        // store original entropy coding status
        if (m_bEnableRDOQ)
            m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);

        primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);

        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
        if (numSig)
        {
            m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
            primitives.luma_add_ps[sizeIdx](reconQt, reconQtStride, pred, residual, stride, stride);
        }
        else
            // no coded residual, recon = pred
            primitives.luma_copy_pp[sizeIdx](reconQt, reconQtStride, pred, stride);

        bCBF = !!numSig << tuDepth;
        cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
        fullCost.distortion = primitives.sse_pp[sizeIdx](reconQt, reconQtStride, fenc, stride);

        m_entropyCoder.resetBits();
        if (!absPartIdx)
        {
            if (!cu.m_slice->isIntra())
            {
                if (cu.m_slice->m_pps->bTransquantBypassEnabled)
                    m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
                m_entropyCoder.codeSkipFlag(cu, 0);
                m_entropyCoder.codePredMode(cu.m_predMode[0]);
            }

            m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]);
        }
        if (cu.m_partSize[0] == SIZE_2Nx2N)
        {
            if (!absPartIdx)
                m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
        }
        else
        {
            uint32_t qNumParts = cuGeom.numPartitions >> 2;
            if (!tuDepth)
            {
                for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
                    m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
            }
            else if (!(absPartIdx & (qNumParts - 1)))
                m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
        }
        if (log2TrSize != depthRange[0])
            m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);

        m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);

        if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
            m_entropyCoder.codeCoeffNxN(cu, coeffY, absPartIdx, log2TrSize, TEXT_LUMA);

        fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();

        if (m_param->rdPenalty && log2TrSize == 5 && m_slice->m_sliceType != I_SLICE)
            fullCost.bits *= 4;

        if (m_rdCost.m_psyRd)
        {
            fullCost.energy = m_rdCost.psyCost(sizeIdx, fenc, mode.fencYuv->m_size, reconQt, reconQtStride);
            fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
        }
        else
            fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
    }
    else
        fullCost.rdcost = MAX_INT64;

    if (mightSplit)
    {
        if (mightNotSplit)
        {
            m_entropyCoder.store(m_rqt[fullDepth].rqtTest); // save state after full TU encode
            m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);  // prep state of split encode
        }

        // code split block
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;

        int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
        if (m_param->bEnableTSkipFast)
            checkTransformSkip &= cu.m_partSize[0] != SIZE_2Nx2N;

        Cost splitCost;
        uint32_t cbf = 0;
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
        {
            if (checkTransformSkip)
                codeIntraLumaTSkip(mode, cuGeom, tuDepth + 1, qPartIdx, splitCost);
            else
                codeIntraLumaQT(mode, cuGeom, tuDepth + 1, qPartIdx, bAllowSplit, splitCost, depthRange);

            cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
        }
        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
            cu.m_cbf[0][absPartIdx + offs] |= (cbf << tuDepth);

        if (mightNotSplit && log2TrSize != depthRange[0])
        {
            /* If we could have coded this TU depth, include cost of subdiv flag */
            m_entropyCoder.resetBits();
            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
            splitCost.bits += m_entropyCoder.getNumberOfWrittenBits();

            if (m_rdCost.m_psyRd)
                splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
            else
                splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
        }

        if (splitCost.rdcost < fullCost.rdcost)
        {
            outCost.rdcost += splitCost.rdcost;
            outCost.distortion += splitCost.distortion;
            outCost.bits += splitCost.bits;
            outCost.energy += splitCost.energy;
            return;
        }
        else
        {
            // recover entropy state of full-size TU encode
            m_entropyCoder.load(m_rqt[fullDepth].rqtTest);

            // recover transform index and Cbf values
            cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
            cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
            cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
        }
    }

    // set reconstruction for next intra prediction blocks if full TU prediction won
    pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
    intptr_t picStride = m_frame->m_reconPic->m_stride;
    primitives.luma_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);

    outCost.rdcost += fullCost.rdcost;
    outCost.distortion += fullCost.distortion;
    outCost.bits += fullCost.bits;
    outCost.energy += fullCost.energy;
}
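
/* Summary note (added): the full-TU encode and the four-way split above are
 * both measured from the same starting entropy state (saved in rqtRoot).
 * Whichever alternative wins contributes its bits, distortion and optional
 * psy-energy to outCost; when the full TU wins, the entropy state captured
 * in rqtTest right after the full encode is restored. */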

void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
{
    uint32_t fullDepth = mode.cu.m_cuDepth[0] + tuDepth;
    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
    uint32_t tuSize = 1 << log2TrSize;

    X265_CHECK(tuSize == MAX_TS_SIZE, "transform skip is only possible at 4x4 TUs\n");

    CUData& cu = mode.cu;
    Yuv* predYuv = &mode.predYuv;
    const Yuv* fencYuv = mode.fencYuv;

    Cost fullCost;
    fullCost.rdcost = MAX_INT64;
    int bTSkip = 0;
    uint32_t bCBF = 0;

    const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
    pixel* pred = predYuv->getLumaAddr(absPartIdx);
    int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
    uint32_t stride = fencYuv->m_size;
    int sizeIdx = log2TrSize - 2;

    // init availability pattern
    uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
    initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);

    // get prediction signal
    predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);

    cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);

    uint32_t qtLayer = log2TrSize - 2;
    uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
    coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
    pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
    uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;

    // store original entropy coding status
    m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);

    if (m_bEnableRDOQ)
        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);

    ALIGN_VAR_32(coeff_t, tsCoeffY[MAX_TS_SIZE * MAX_TS_SIZE]);
    ALIGN_VAR_32(pixel, tsReconY[MAX_TS_SIZE * MAX_TS_SIZE]);

    int checkTransformSkip = 1;
    for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
    {
        uint64_t tmpCost;
        uint32_t tmpEnergy = 0;

        coeff_t* coeff = (useTSkip ? tsCoeffY : coeffY);
        pixel* tmpRecon = (useTSkip ? tsReconY : reconQt);
        uint32_t tmpReconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);

        primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);

        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip);
        if (numSig)
        {
            m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);
            primitives.luma_add_ps[sizeIdx](tmpRecon, tmpReconStride, pred, residual, stride, stride);
        }
        else if (useTSkip)
        {
            /* do not allow tskip if CBF=0, pretend we did not try tskip */
            checkTransformSkip = 0;
            break;
        }
        else
            // no residual coded, recon = pred
            primitives.luma_copy_pp[sizeIdx](tmpRecon, tmpReconStride, pred, stride);

        uint32_t tmpDist = primitives.sse_pp[sizeIdx](tmpRecon, tmpReconStride, fenc, stride);

        cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);

        if (useTSkip)
            m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);

        m_entropyCoder.resetBits();
        if (!absPartIdx)
        {
            if (!cu.m_slice->isIntra())
            {
                if (cu.m_slice->m_pps->bTransquantBypassEnabled)
                    m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
                m_entropyCoder.codeSkipFlag(cu, 0);
                m_entropyCoder.codePredMode(cu.m_predMode[0]);
            }

            m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]);
        }
        if (cu.m_partSize[0] == SIZE_2Nx2N)
        {
            if (!absPartIdx)
                m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
        }
        else
        {
            uint32_t qNumParts = cuGeom.numPartitions >> 2;
            if (!tuDepth)
            {
                for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
                    m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
            }
            else if (!(absPartIdx & (qNumParts - 1)))
                m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
        }
        m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);

        m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);

        if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
            m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize, TEXT_LUMA);

        uint32_t tmpBits = m_entropyCoder.getNumberOfWrittenBits();

        if (!useTSkip)
            m_entropyCoder.store(m_rqt[fullDepth].rqtTemp);

        if (m_rdCost.m_psyRd)
        {
            tmpEnergy = m_rdCost.psyCost(sizeIdx, fenc, fencYuv->m_size, tmpRecon, tmpReconStride);
            tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
        }
        else
            tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);

        if (tmpCost < fullCost.rdcost)
        {
            bTSkip = useTSkip;
            bCBF = !!numSig;
            fullCost.rdcost = tmpCost;
            fullCost.distortion = tmpDist;
            fullCost.bits = tmpBits;
            fullCost.energy = tmpEnergy;
        }
    }

    if (bTSkip)
    {
        memcpy(coeffY, tsCoeffY, sizeof(coeff_t) << (log2TrSize * 2));
        primitives.luma_copy_pp[sizeIdx](reconQt, reconQtStride, tsReconY, tuSize);
    }
    else if (checkTransformSkip)
    {
        cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
        cu.setCbfSubParts(bCBF << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
        m_entropyCoder.load(m_rqt[fullDepth].rqtTemp);
    }

    // set reconstruction for next intra prediction blocks
    pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
    intptr_t picStride = m_frame->m_reconPic->m_stride;
    primitives.luma_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);

    outCost.rdcost += fullCost.rdcost;
    outCost.distortion += fullCost.distortion;
    outCost.bits += fullCost.bits;
    outCost.energy += fullCost.energy;
}
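
/* Note (added): the useTSkip loop above RD-measures the same 4x4 residual
 * twice, once through the DCT and once with transform skip, writing the
 * tskip trial into the local tsCoeffY/tsReconY buffers. Only if the tskip
 * pass wins are those buffers committed over the DCT results. */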

/* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */
void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, const uint32_t depthRange[2])
{
    CUData& cu = mode.cu;

    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
    bool bCheckFull = log2TrSize <= depthRange[1];

    X265_CHECK(m_slice->m_sliceType != I_SLICE, "residualTransformQuantIntra not intended for I slices\n");

    /* we still respect rdPenalty == 2, which forbids 32x32 intra TUs; rdPenalty == 1
     * does not apply here since we are not measuring RD cost */
    if (m_param->rdPenalty == 2 && log2TrSize == 5 && depthRange[0] <= 4)
        bCheckFull = false;

    if (bCheckFull)
    {
        const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
        pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
        int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
        pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
        intptr_t picStride = m_frame->m_reconPic->m_stride;
        uint32_t stride = mode.fencYuv->m_size;
        uint32_t sizeIdx = log2TrSize - 2;
        uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
        coeff_t* coeff = cu.m_trCoeff[TEXT_LUMA] + coeffOffsetY;

        initAdiPattern(cu, cuGeom, absPartIdx, tuDepth, lumaPredMode);
        predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);

        X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n");
        cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);

        primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
        uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, false);
        if (numSig)
        {
            m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], residual, stride, coeff, log2TrSize, TEXT_LUMA, true, false, numSig);
            primitives.luma_add_ps[sizeIdx](picReconY, picStride, pred, residual, stride, stride);
            cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
        }
        else
        {
            primitives.luma_copy_pp[sizeIdx](picReconY, picStride, pred, stride);
            cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
        }
    }
    else
    {
        X265_CHECK(log2TrSize > depthRange[0], "intra luma split state failure\n");

        /* code split block */
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        uint32_t cbf = 0;
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
        {
            residualTransformQuantIntra(mode, cuGeom, tuDepth + 1, qPartIdx, depthRange);
            cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
        }
        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
            cu.m_cbf[TEXT_LUMA][absPartIdx + offs] |= (cbf << tuDepth);
    }
}

void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx)
{
    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;

    if (tuDepth == cu.m_tuDepth[absPartIdx])
    {
        uint32_t qtLayer = log2TrSize - 2;

        // copy transform coefficients
        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
        coeff_t* coeffSrcY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
        coeff_t* coeffDestY = cu.m_trCoeff[0] + coeffOffsetY;
        memcpy(coeffDestY, coeffSrcY, sizeof(coeff_t) << (log2TrSize * 2));

        // copy reconstruction
        m_rqt[qtLayer].reconQtYuv.copyPartToPartLuma(reconYuv, absPartIdx, log2TrSize);
    }
    else
    {
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
            extractIntraResultQT(cu, reconYuv, tuDepth + 1, absPartIdx);
    }
}

inline void offsetCBFs(uint8_t subTUCBF[2])
{
    uint8_t combinedCBF = subTUCBF[0] | subTUCBF[1];
    subTUCBF[0] = subTUCBF[0] << 1 | combinedCBF;
    subTUCBF[1] = subTUCBF[1] << 1 | combinedCBF;
}
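
/* Worked example (added): if the two 4:2:2 sub-TUs arrive with CBFs {1, 0},
 * combinedCBF is 1 and the outputs become {1 << 1 | 1, 0 << 1 | 1} = {3, 1}:
 * bit 0 now carries the shared parent-level CBF while bit 1 preserves each
 * sub-TU's own flag at the next depth. */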

/* 4:2:2 post-TU split processing */
void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx)
{
    uint32_t depth = cu.m_cuDepth[0];
    uint32_t fullDepth = depth + tuDepth;
    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;

    if (log2TrSize == 2)
    {
        X265_CHECK(m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        ++log2TrSize;
    }

    uint32_t tuNumParts = 1 << ((log2TrSize - LOG2_UNIT_SIZE) * 2 - 1);

    // move the CBFs down a level and set the parent CBF
    uint8_t subTUCBF[2];
    subTUCBF[0] = cu.getCbf(absPartIdx, ttype, tuDepth);
    subTUCBF[1] = cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth);
    offsetCBFs(subTUCBF);

    cu.setCbfPartRange(subTUCBF[0] << tuDepth, ttype, absPartIdx, tuNumParts);
    cu.setCbfPartRange(subTUCBF[1] << tuDepth, ttype, absPartIdx + tuNumParts, tuNumParts);
}

/* returns distortion */
uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy)
{
    CUData& cu = mode.cu;
    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;

    if (tuDepth < cu.m_tuDepth[absPartIdx])
    {
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        uint32_t outDist = 0, splitCbfU = 0, splitCbfV = 0;
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
        {
            outDist += codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, psyEnergy);
            splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
            splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
        }
        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
        {
            cu.m_cbf[TEXT_CHROMA_U][absPartIdx + offs] |= (splitCbfU << tuDepth);
            cu.m_cbf[TEXT_CHROMA_V][absPartIdx + offs] |= (splitCbfV << tuDepth);
        }

        return outDist;
    }

    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;

    uint32_t tuDepthC = tuDepth;
    if (log2TrSizeC < 2)
    {
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        if (absPartIdx & 3)
            return 0;
        log2TrSizeC = 2;
        tuDepthC--;
    }

    if (m_bEnableRDOQ)
        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);

    bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
    checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]);
    if (checkTransformSkip)
        return codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, psyEnergy);

    uint32_t qtLayer = log2TrSize - 2;
    uint32_t tuSize = 1 << log2TrSizeC;
    uint32_t outDist = 0;

    uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;

    for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
    {
        TextType ttype = (TextType)chromaId;

        TURecurse tuIterator(splitType, curPartNum, absPartIdx);
        do
        {
            uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;

            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
            pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
            int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
            uint32_t stride = mode.fencYuv->m_csize;
            uint32_t sizeIdxC = log2TrSizeC - 2;

            uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
            coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
            pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
            uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;

            pixel* picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
            intptr_t picStride = m_frame->m_reconPic->m_strideC;

            // init availability pattern
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
            pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);

            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
            if (chromaPredMode == DM_CHROMA_IDX)
                chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
            if (m_csp == X265_CSP_I422)
                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];

            // get prediction signal
            predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);

            cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);

            primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
            uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
            if (numSig)
            {
                m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
                primitives.luma_add_ps[sizeIdxC](reconQt, reconQtStride, pred, residual, stride, stride);
                cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
            }
            else
            {
                // no coded residual, recon = pred
                primitives.luma_copy_pp[sizeIdxC](reconQt, reconQtStride, pred, stride);
                cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
            }

            outDist += m_rdCost.scaleChromaDist(chromaId, primitives.sse_pp[sizeIdxC](reconQt, reconQtStride, fenc, stride));

            if (m_rdCost.m_psyRd)
                psyEnergy += m_rdCost.psyCost(sizeIdxC, fenc, stride, picReconC, picStride);

            primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, reconQt, reconQtStride);
        }
        while (tuIterator.isNextSection());

        if (splitType == VERTICAL_SPLIT)
            offsetSubTUCBFs(cu, ttype, tuDepth, absPartIdx);
    }

    return outDist;
}
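
/* Note on coeffOffsetC above (added, illustrative): for 4:2:0 both chroma
 * shifts are 1, so the shift amount is LOG2_UNIT_SIZE * 2 - 2 == 2 and each
 * 4x4 luma partition maps to four chroma coefficients (a 2x2 block); for
 * 4:4:4 the shift is 4 and the chroma mapping is one-to-one with luma. */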

/* returns distortion */
uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy)
{
    CUData& cu = mode.cu;
    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
    const uint32_t log2TrSizeC = 2;
    uint32_t tuSize = 4;
    uint32_t qtLayer = log2TrSize - 2;
    uint32_t outDist = 0;

    /* At the TU layers above this one, no RDO is performed, only distortion is being measured,
     * so the entropy coder is not very accurate. The best we can do is return it in the same
     * condition as it arrived, and to do all bit estimates from the same state. */
    m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);

    ALIGN_VAR_32(coeff_t, tskipCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
    ALIGN_VAR_32(pixel, tskipReconC[MAX_TS_SIZE * MAX_TS_SIZE]);

    uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
    const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;

    for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
    {
        TextType ttype = (TextType)chromaId;

        TURecurse tuIterator(splitType, curPartNum, absPartIdx);
        do
        {
            uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;

            const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
            pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
            int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
            uint32_t stride = mode.fencYuv->m_csize;
            const uint32_t sizeIdxC = log2TrSizeC - 2;

            uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
            coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
            pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
            uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;

            // init availability pattern
            initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
            pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);

            uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
            if (chromaPredMode == DM_CHROMA_IDX)
                chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
            if (m_csp == X265_CSP_I422)
                chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];

            // get prediction signal
            predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);

            uint64_t bCost = MAX_INT64;
            uint32_t bDist = 0;
            uint32_t bCbf = 0;
            uint32_t bEnergy = 0;
            int bTSkip = 0;

            int checkTransformSkip = 1;
            for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
            {
                coeff_t* coeff = (useTSkip ? tskipCoeffC : coeffC);
                pixel* recon = (useTSkip ? tskipReconC : reconQt);
                uint32_t reconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);

                primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);

                uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip);
                if (numSig)
                {
                    m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
                    primitives.luma_add_ps[sizeIdxC](recon, reconStride, pred, residual, stride, stride);
                    cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
                }
                else if (useTSkip)
                {
                    checkTransformSkip = 0;
                    break;
                }
                else
                {
                    primitives.luma_copy_pp[sizeIdxC](recon, reconStride, pred, stride);
                    cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
                }
                uint32_t tmpDist = primitives.sse_pp[sizeIdxC](recon, reconStride, fenc, stride);
                tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist);

                cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);

                uint32_t tmpBits = 0, tmpEnergy = 0;
                if (numSig)
                {
                    m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
                    m_entropyCoder.resetBits();
                    m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
                    tmpBits = m_entropyCoder.getNumberOfWrittenBits();
                }

                uint64_t tmpCost;
                if (m_rdCost.m_psyRd)
                {
                    tmpEnergy = m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
                    tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
                }
                else
                    tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);

                if (tmpCost < bCost)
                {
                    bCost = tmpCost;
                    bDist = tmpDist;
                    bTSkip = useTSkip;
                    bCbf = !!numSig;
                    bEnergy = tmpEnergy;
                }
            }

            if (bTSkip)
            {
                memcpy(coeffC, tskipCoeffC, sizeof(coeff_t) << (log2TrSizeC * 2));
                primitives.luma_copy_pp[sizeIdxC](reconQt, reconQtStride, tskipReconC, MAX_TS_SIZE);
            }

            cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
            cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);

            pixel* reconPicC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
            intptr_t picStride = m_frame->m_reconPic->m_strideC;
            primitives.luma_copy_pp[sizeIdxC](reconPicC, picStride, reconQt, reconQtStride);

            outDist += bDist;
            psyEnergy += bEnergy;
        }
        while (tuIterator.isNextSection());

        if (splitType == VERTICAL_SPLIT)
            offsetSubTUCBFs(cu, ttype, tuDepth, absPartIdx);
    }

    m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
    return outDist;
}

void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth)
{
    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
    uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;

    if (tuDepthL == tuDepth || log2TrSizeC == 2)
    {
        // copy transform coefficients
        uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
        uint32_t coeffOffsetC = absPartIdx << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));

        uint32_t qtLayer = log2TrSize - 2 - (tuDepthL - tuDepth);
        coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
        coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
        coeff_t* coeffDstU = cu.m_trCoeff[1] + coeffOffsetC;
        coeff_t* coeffDstV = cu.m_trCoeff[2] + coeffOffsetC;
        memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
        memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);

        // copy reconstruction
        m_rqt[qtLayer].reconQtYuv.copyPartToPartChroma(reconYuv, absPartIdx, log2TrSizeC + m_hChromaShift);
    }
    else
    {
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
            extractIntraResultChromaQT(cu, reconYuv, absPartIdx, tuDepth + 1);
    }
}

void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx)
{
    CUData& cu = mode.cu;
    uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
    uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;

    if (tuDepth == cu.m_tuDepth[absPartIdx])
    {
        uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
        uint32_t tuDepthC = tuDepth;
        if (log2TrSizeC < 2)
        {
            X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
            if (absPartIdx & 3)
                return;
            log2TrSizeC = 2;
            tuDepthC--;
        }

        ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
        uint32_t tuSize = 1 << log2TrSizeC;
        uint32_t stride = mode.fencYuv->m_csize;
        const int sizeIdxC = log2TrSizeC - 2;

        uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
        const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;

        for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
        {
            TextType ttype = (TextType)chromaId;

            TURecurse tuIterator(splitType, curPartNum, absPartIdx);
            do
            {
                uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;

                const pixel* fenc = mode.fencYuv->getChromaAddr(chromaId, absPartIdxC);
                pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
                int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
                pixel* recon = mode.reconYuv.getChromaAddr(chromaId, absPartIdxC); // TODO: needed?
                uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
                coeff_t* coeff = cu.m_trCoeff[ttype] + coeffOffsetC;
                pixel* picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
                uint32_t picStride = m_frame->m_reconPic->m_strideC;

                uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
                if (chromaPredMode == DM_CHROMA_IDX)
                    chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
                chromaPredMode = (m_csp == X265_CSP_I422) ? g_chroma422IntraAngleMappingTable[chromaPredMode] : chromaPredMode;
                initAdiPatternChroma(cu, cuGeom, absPartIdxC, tuDepthC, chromaId);
                pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);

                predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);

                X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");

                primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
                uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, false);
                if (numSig)
                {
                    m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], residual, stride, coeff, log2TrSizeC, ttype, true, false, numSig);
                    primitives.luma_add_ps[sizeIdxC](recon, stride, pred, residual, stride, stride);
                    primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, recon, stride);
                    cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
                }
                else
                {
                    primitives.luma_copy_pp[sizeIdxC](recon, stride, pred, stride);
                    primitives.luma_copy_pp[sizeIdxC](picReconC, picStride, pred, stride);
                    cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
                }
            }
            while (tuIterator.isNextSection());

            if (splitType == VERTICAL_SPLIT)
                offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx);
        }
    }
    else
    {
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        uint32_t splitCbfU = 0, splitCbfV = 0;
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
        {
            residualQTIntraChroma(mode, cuGeom, tuDepth + 1, qPartIdx);
            splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
            splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
        }
        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
        {
            cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
            cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
        }
    }
}

void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes)
{
    uint32_t depth = cuGeom.depth;
    CUData& cu = intraMode.cu;

    cu.setPartSizeSubParts(partSize);
    cu.setPredModeSubParts(MODE_INTRA);

    uint32_t tuDepthRange[2];
    cu.getIntraTUQtDepthRange(tuDepthRange, 0);

    intraMode.initCosts();
    intraMode.distortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange, sharedModes);
    intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom);

    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);

    if (!m_slice->isIntra())
    {
        m_entropyCoder.codeSkipFlag(cu, 0);
        m_entropyCoder.codePredMode(cu.m_predMode[0]);
    }

    m_entropyCoder.codePartSize(cu, 0, depth);
    m_entropyCoder.codePredInfo(cu, 0);
    intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits();

    bool bCodeDQP = m_slice->m_pps->bUseDQP;
    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
    m_entropyCoder.store(intraMode.contexts);
    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
    if (m_rdCost.m_psyRd)
        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, intraMode.fencYuv->m_buf[0], intraMode.fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size);

    updateModeCost(intraMode);
}

/* Note that this function does not save the best intra prediction, it must
 * be generated later. It records the best mode in the cu */
void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
{
    CUData& cu = intraMode.cu;
    uint32_t depth = cu.m_cuDepth[0];

    cu.setPartSizeSubParts(SIZE_2Nx2N);
    cu.setPredModeSubParts(MODE_INTRA);

    const uint32_t initTuDepth = 0;
    uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth;
    uint32_t tuSize = 1 << log2TrSize;
    const uint32_t absPartIdx = 0;

    // Reference sample smoothing
    initAdiPattern(cu, cuGeom, absPartIdx, initTuDepth, ALL_IDX);

    const pixel* fenc = intraMode.fencYuv->m_buf[0];
    uint32_t stride = intraMode.fencYuv->m_size;

    pixel* above = m_refAbove + tuSize - 1;
    pixel* aboveFiltered = m_refAboveFlt + tuSize - 1;
    pixel* left = m_refLeft + tuSize - 1;
    pixel* leftFiltered = m_refLeftFlt + tuSize - 1;
    int sad, bsad;
    uint32_t bits, bbits, mode, bmode;
    uint64_t cost, bcost;

    // buffers for all 33 angular-mode predictions, generated in one pass
    ALIGN_VAR_32(pixel, bufScale[32 * 32]);
    ALIGN_VAR_32(pixel, bufTrans[32 * 32]);
    ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]);
    int scaleTuSize = tuSize;
    int scaleStride = stride;
    int costShift = 0;
    int sizeIdx = log2TrSize - 2;

    if (tuSize > 32)
    {
        // origin is 64x64, we scale to 32x32 and setup required parameters
        primitives.scale2D_64to32(bufScale, fenc, stride);
        fenc = bufScale;

        // reserve space in case primitives need to store data in above
        // or left buffers
        pixel _above[4 * 32 + 1];
        pixel _left[4 * 32 + 1];
        pixel* aboveScale = _above + 2 * 32;
        pixel* leftScale = _left + 2 * 32;
        aboveScale[0] = leftScale[0] = above[0];
        primitives.scale1D_128to64(aboveScale + 1, above + 1, 0);
        primitives.scale1D_128to64(leftScale + 1, left + 1, 0);

        scaleTuSize = 32;
        scaleStride = 32;
        costShift = 2;
        sizeIdx = 5 - 2; // log2(scaleTuSize) - 2

        // point both the filtered and unfiltered reference rows at the scaled buffers
        above = aboveScale;
        left = leftScale;
        aboveFiltered = aboveScale;
        leftFiltered = leftScale;
    }

    pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
    int predsize = scaleTuSize * scaleTuSize;

    m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);

    /* there are three cost tiers for intra modes:
     * pred[0] - most probable, least cost
     * pred[1], pred[2] - less probable, slightly more cost
     * non-mpm modes - all cost the same (rbits) */
    uint64_t mpms;
    uint32_t preds[3];
    uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms);

    // DC
    primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
    bsad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
    bmode = mode = DC_IDX;
    bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
    bcost = m_rdCost.calcRdSADCost(bsad, bbits);

    pixel* abovePlanar = above;
    pixel* leftPlanar = left;

    if (tuSize & (8 | 16 | 32))
    {
        abovePlanar = aboveFiltered;
        leftPlanar = leftFiltered;
    }

    // PLANAR
    primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
    sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
    mode = PLANAR_IDX;
    bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
    cost = m_rdCost.calcRdSADCost(sad, bits);
    COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);

    // Transpose NxN
    primitives.transpose[sizeIdx](bufTrans, fenc, scaleStride);

    primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));

    bool modeHor;
    const pixel* cmp;
    intptr_t srcStride;

#define TRY_ANGLE(angle) \
    modeHor = angle < 18; \
    cmp = modeHor ? bufTrans : fenc; \
    srcStride = modeHor ? scaleTuSize : scaleStride; \
    sad = sa8d(cmp, srcStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \
    bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
    cost = m_rdCost.calcRdSADCost(sad, bits)
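
/* Note on TRY_ANGLE (added): intra_pred_allangs stores the predictions for
 * the horizontal modes (2..17) transposed, so the macro compares them
 * against bufTrans, a transposed copy of the source block; this lets one
 * sa8d call with stride scaleTuSize serve both orientations. */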

    if (m_param->bEnableFastIntra)
    {
        int asad = 0;
        uint32_t lowmode, highmode, amode = 5, abits = 0;
        uint64_t acost = MAX_INT64;

        /* pick the best angle, sampling at distance of 5 */
        for (mode = 5; mode < 35; mode += 5)
        {
            TRY_ANGLE(mode);
            COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
        }

        /* refine best angle at distance 2, then distance 1 */
        for (uint32_t dist = 2; dist >= 1; dist--)
        {
            lowmode = amode - dist;
            highmode = amode + dist;

            X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n");
            TRY_ANGLE(lowmode);
            COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits);

            X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n");
            TRY_ANGLE(highmode);
            COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits);
        }
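
        /* Worked example (added): if the coarse pass picks amode == 20, the
         * distance-2 pass tests modes 18 and 22, then distance-1 tests the
         * refined winner's immediate neighbours. Mode 34 is only reachable
         * when the refined winner is 33, hence the special case below. */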

        if (amode == 33)
        {
            TRY_ANGLE(34);
            COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits);
        }

        COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
    }
    else // calculate and search all intra prediction angles for lowest cost
    {
        for (mode = 2; mode < 35; mode++)
        {
            TRY_ANGLE(mode);
            COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
        }
    }

    cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTuDepth);
    intraMode.initCosts();
    intraMode.totalBits = bbits;
    intraMode.distortion = bsad;
    intraMode.sa8dCost = bcost;
    intraMode.sa8dBits = bbits;
}

void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
{
    CUData& cu = intraMode.cu;
    Yuv* reconYuv = &intraMode.reconYuv;
    const Yuv* fencYuv = intraMode.fencYuv;

    X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
    X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");

    m_quant.setQPforQuant(cu);

    uint32_t tuDepthRange[2];
    cu.getIntraTUQtDepthRange(tuDepthRange, 0);

    m_entropyCoder.load(m_rqt[cuGeom.depth].cur);

    Cost icosts;
    codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange);
    extractIntraResultQT(cu, *reconYuv, 0, 0);

    intraMode.distortion = icosts.distortion;
    intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom);

    m_entropyCoder.resetBits();
    if (m_slice->m_pps->bTransquantBypassEnabled)
        m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
    m_entropyCoder.codeSkipFlag(cu, 0);
    m_entropyCoder.codePredMode(cu.m_predMode[0]);
    m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
    m_entropyCoder.codePredInfo(cu, 0);
    intraMode.mvBits += m_entropyCoder.getNumberOfWrittenBits();

    bool bCodeDQP = m_slice->m_pps->bUseDQP;
    m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);

    intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
    if (m_rdCost.m_psyRd)
        intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);

    m_entropyCoder.store(intraMode.contexts);
    updateModeCost(intraMode);
}

uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes)
{
    CUData& cu = intraMode.cu;
    Yuv* reconYuv = &intraMode.reconYuv;
    Yuv* predYuv = &intraMode.predYuv;
    const Yuv* fencYuv = intraMode.fencYuv;

    uint32_t depth = cu.m_cuDepth[0];
    uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
    uint32_t numPU = 1 << (2 * initTuDepth);
    uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth;
    uint32_t tuSize = 1 << log2TrSize;
    uint32_t qNumParts = cuGeom.numPartitions >> 2;
    uint32_t sizeIdx = log2TrSize - 2;
    uint32_t absPartIdx = 0;
    uint32_t totalDistortion = 0;

    int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N;

    // loop over partitions
    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts)
    {
        uint32_t bmode = 0;

        if (sharedModes)
            bmode = sharedModes[puIdx];
        else
        {
            // Reference sample smoothing
            initAdiPattern(cu, cuGeom, absPartIdx, initTuDepth, ALL_IDX);

            // determine set of modes to be tested (using prediction signal only)
            const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
            uint32_t stride = predYuv->m_size;

            pixel* above = m_refAbove + tuSize - 1;
            pixel* aboveFiltered = m_refAboveFlt + tuSize - 1;
            pixel* left = m_refLeft + tuSize - 1;
            pixel* leftFiltered = m_refLeftFlt + tuSize - 1;

            // buffers for all 33 angular-mode predictions, generated in one pass
            ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
            ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]);
            ALIGN_VAR_32(pixel, bufScale[32 * 32]);
            pixel _above[4 * 32 + 1];
            pixel _left[4 * 32 + 1];
            int scaleTuSize = tuSize;
            int scaleStride = stride;
            int costShift = 0;

            if (tuSize > 32)
            {
                pixel* aboveScale = _above + 2 * 32;
                pixel* leftScale = _left + 2 * 32;

                // origin is 64x64, we scale to 32x32 and setup required parameters
                primitives.scale2D_64to32(bufScale, fenc, stride);
                fenc = bufScale;

                // reserve space in case primitives need to store data in above
                // or left buffers
                aboveScale[0] = leftScale[0] = above[0];
                primitives.scale1D_128to64(aboveScale + 1, above + 1, 0);
                primitives.scale1D_128to64(leftScale + 1, left + 1, 0);

                scaleTuSize = 32;
                scaleStride = 32;
                costShift = 2;
                sizeIdx = 5 - 2; // log2(scaleTuSize) - 2

                // point both the filtered and unfiltered reference rows at the scaled buffers
                above = aboveScale;
                left = leftScale;
                aboveFiltered = aboveScale;
                leftFiltered = leftScale;
            }

            m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);

            /* there are three cost tiers for intra modes:
             * pred[0] - most probable, least cost
             * pred[1], pred[2] - less probable, slightly more cost
             * non-mpm modes - all cost the same (rbits) */
1459 uint64_t mpms;
1460 uint32_t preds[3];
1461 uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms);
1462
1463 pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
1464 uint64_t modeCosts[35];
1465 uint64_t bcost;
1466
1467 // DC
1468 primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
1469 uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(preds, DC_IDX) : rbits;
1470 uint32_t sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
1471 modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
1472
1473 // PLANAR
1474 pixel* abovePlanar = above;
1475 pixel* leftPlanar = left;
1476 if (tuSize >= 8 && tuSize <= 32)
1477 {
1478 abovePlanar = aboveFiltered;
1479 leftPlanar = leftFiltered;
1480 }
1481 primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
1482 bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(preds, PLANAR_IDX) : rbits;
1483 sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
1484 modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
1485 COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
1486
1487 // angular predictions
1488 primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
1489
1490 primitives.transpose[sizeIdx](buf_trans, fenc, scaleStride);
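/* intra_pred_allangs stores the horizontal modes (2..17) in transposed
 * form, so the source block is transposed once here and those modes are
 * compared against buf_trans rather than fenc */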
1491 for (int mode = 2; mode < 35; mode++)
1492 {
1493 bool modeHor = (mode < 18);
1494 const pixel* cmp = (modeHor ? buf_trans : fenc);
1495 intptr_t srcStride = (modeHor ? scaleTuSize : scaleStride);
1496 bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
1497 sad = sa8d(cmp, srcStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
1498 modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
1499 COPY1_IF_LT(bcost, modeCosts[mode]);
1500 }
1501
1502 /* Find the top maxCandCount candidate modes with cost within 12.5% of the
1503 * best mode, or among the most probable modes. maxCandCount is derived from
1504 * the rdLevel and depth. In general we want to try more modes at slower RD
1505 * levels and at higher depths */
1506 uint64_t candCostList[MAX_RD_INTRA_MODES];
1507 uint32_t rdModeList[MAX_RD_INTRA_MODES];
1508 int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1);
1509 for (int i = 0; i < maxCandCount; i++)
1510 candCostList[i] = MAX_INT64;
1511
1512 uint64_t paddedBcost = bcost + (bcost >> 3); // bcost * 1.125, i.e. a 12.5% margin
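/* updateCandList() inserts a mode into rdModeList[]/candCostList[] keeping
 * the lists sorted by cost, so they end up holding the maxCandCount
 * cheapest modes */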
1513 for (int mode = 0; mode < 35; mode++)
1514 if (modeCosts[mode] < paddedBcost || (mpms & ((uint64_t)1 << mode)))
1515 updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList);
1516
1517 /* measure best candidates using simple RDO (no TU splits) */
1518 bcost = MAX_INT64;
1519 for (int i = 0; i < maxCandCount; i++)
1520 {
1521 if (candCostList[i] == MAX_INT64)
1522 break;
1523 m_entropyCoder.load(m_rqt[depth].cur);
1524 cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth);
1525
1526 Cost icosts;
1527 if (checkTransformSkip)
1528 codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
1529 else
1530 codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, icosts, depthRange);
1531 COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]);
1532 }
1533 }
1534
1535 /* remeasure best mode, allowing TU splits */
1536 cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth);
1537 m_entropyCoder.load(m_rqt[depth].cur);
1538
1539 Cost icosts;
1540 if (checkTransformSkip)
1541 codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
1542 else
1543 codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange);
1544 totalDistortion += icosts.distortion;
1545
1546 extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx);
1547
1548 // set reconstruction for next intra prediction blocks
1549 if (puIdx != numPU - 1)
1550 {
1551 /* This has important implications for parallelism and RDO. It is writing intermediate results into the
1552 * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
1553 * it is not updating m_rqt[depth].cur for the later PUs, which I suspect is slightly wrong. I think
1554 * that the contexts should be tracked through each PU */
1555 pixel* dst = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
1556 uint32_t dststride = m_frame->m_reconPic->m_stride;
1557 const pixel* src = reconYuv->getLumaAddr(absPartIdx);
1558 uint32_t srcstride = reconYuv->m_size;
1559 primitives.luma_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride);
1560 }
1561 }
1562
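/* for INTRA_NxN the depth-0 (CU level) luma cbf must be the OR of the four
 * PUs' depth-1 cbfs; propagate the combined flag into every partition */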
1563 if (numPU > 1)
1564 {
1565 uint32_t combCbfY = 0;
1566 for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
1567 combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1);
1568
1569 for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
1570 cu.m_cbf[0][offs] |= combCbfY;
1571 }
1572
1573 // TODO: remove this
1574 m_entropyCoder.load(m_rqt[depth].cur);
1575
1576 return totalDistortion;
1577 }
1578
1579 void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom)
1580 {
1581 CUData& cu = intraMode.cu;
1582 const Yuv* fencYuv = intraMode.fencYuv;
1583 Yuv* predYuv = &intraMode.predYuv;
1584
1585 uint32_t bestMode = 0;
1586 uint64_t bestCost = MAX_INT64;
1587 uint32_t modeList[NUM_CHROMA_MODE];
1588
1589 uint32_t log2TrSizeC = cu.m_log2CUSize[0] - m_hChromaShift;
1590 uint32_t tuSize = 1 << log2TrSizeC;
1591 int32_t scaleTuSize = tuSize;
1592 uint32_t tuDepth = 0;
1593 int32_t costShift = 0;
1594
1595 if (tuSize > 32)
1596 {
1597 scaleTuSize = 32;
1598 tuDepth = 1;
1599 costShift = 2;
1600 log2TrSizeC = 5;
1601 }
1602
1603 Predict::initAdiPatternChroma(cu, cuGeom, 0, tuDepth, 1);
1604 Predict::initAdiPatternChroma(cu, cuGeom, 0, tuDepth, 2);
1605 cu.getAllowedChromaDir(0, modeList);
1606
1607 // check chroma modes
1608 for (uint32_t mode = 0; mode < NUM_CHROMA_MODE; mode++)
1609 {
1610 uint32_t chromaPredMode = modeList[mode];
1611 if (chromaPredMode == DM_CHROMA_IDX)
1612 chromaPredMode = cu.m_lumaIntraDir[0];
1613 if (m_csp == X265_CSP_I422)
1614 chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
1615
1616 uint64_t cost = 0;
1617 for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
1618 {
1619 const pixel* fenc = fencYuv->m_buf[chromaId];
1620 pixel* pred = predYuv->m_buf[chromaId];
1621 pixel* chromaPred = getAdiChromaBuf(chromaId, scaleTuSize);
1622
1623 // get prediction signal
1624 predIntraChromaAng(chromaPred, chromaPredMode, pred, predYuv->m_csize, log2TrSizeC, m_csp);
1625 cost += primitives.sa8d[log2TrSizeC - 2](fenc, fencYuv->m_csize, pred, predYuv->m_csize) << costShift; // each buffer paired with its own (equal) stride
1626 }
1627
1628 if (cost < bestCost)
1629 {
1630 bestCost = cost;
1631 bestMode = modeList[mode];
1632 }
1633 }
1634
1635 cu.setChromIntraDirSubParts(bestMode, 0, cu.m_cuDepth[0]);
1636 }
1637
1638 uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
1639 {
1640 CUData& cu = intraMode.cu;
1641 Yuv& reconYuv = intraMode.reconYuv;
1642
1643 uint32_t depth = cu.m_cuDepth[0];
1644 uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N && m_csp == X265_CSP_I444;
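/* chroma is split along with an NxN luma partition only in 4:4:4; for
 * 4:2:0 and 4:2:2 a single chroma TU covers the whole CU at this level */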
1645 uint32_t log2TrSize = cu.m_log2CUSize[0] - initTuDepth;
1646 uint32_t absPartStep = (NUM_CU_PARTITIONS >> (depth << 1));
1647 uint32_t totalDistortion = 0;
1648
1649 int part = partitionFromLog2Size(log2TrSize);
1650
1651 TURecurse tuIterator((initTuDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0);
1652
1653 do
1654 {
1655 uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
1656
1657 uint32_t bestMode = 0;
1658 uint32_t bestDist = 0;
1659 uint64_t bestCost = MAX_INT64;
1660
1661 // init mode list
1662 uint32_t minMode = 0;
1663 uint32_t maxMode = NUM_CHROMA_MODE;
1664 uint32_t modeList[NUM_CHROMA_MODE];
1665
1666 cu.getAllowedChromaDir(absPartIdxC, modeList);
1667
1668 // check chroma modes
1669 for (uint32_t mode = minMode; mode < maxMode; mode++)
1670 {
1671 // restore context models
1672 m_entropyCoder.load(m_rqt[depth].cur);
1673
1674 cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTuDepth);
1675 uint32_t psyEnergy = 0;
1676 uint32_t dist = codeIntraChromaQt(intraMode, cuGeom, initTuDepth, absPartIdxC, psyEnergy);
1677
1678 if (m_slice->m_pps->bTransformSkipEnabled)
1679 m_entropyCoder.load(m_rqt[depth].cur);
1680
1681 m_entropyCoder.resetBits();
1682 // chroma prediction mode
1683 if (cu.m_partSize[0] == SIZE_2Nx2N || m_csp != X265_CSP_I444)
1684 {
1685 if (!absPartIdxC)
1686 m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);
1687 }
1688 else
1689 {
1690 uint32_t qNumParts = cuGeom.numPartitions >> 2;
1691 if (!(absPartIdxC & (qNumParts - 1)))
1692 m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);
1693 }
1694
1695 codeSubdivCbfQTChroma(cu, initTuDepth, absPartIdxC);
1696 codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_U);
1697 codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_V);
1698 uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
1699 uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(dist, bits, psyEnergy) : m_rdCost.calcRdCost(dist, bits);
1700
1701 if (cost < bestCost)
1702 {
1703 bestCost = cost;
1704 bestDist = dist;
1705 bestMode = modeList[mode];
1706 extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTuDepth);
1707 memcpy(m_qtTempCbf[1], cu.m_cbf[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
1708 memcpy(m_qtTempCbf[2], cu.m_cbf[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
1709 memcpy(m_qtTempTransformSkipFlag[1], cu.m_transformSkip[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
1710 memcpy(m_qtTempTransformSkipFlag[2], cu.m_transformSkip[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
1711 }
1712 }
1713
1714 if (!tuIterator.isLastSection())
1715 {
1716 uint32_t zorder = cuGeom.encodeIdx + absPartIdxC;
1717 uint32_t dststride = m_frame->m_reconPic->m_strideC;
1718 const pixel* src;
1719 pixel* dst;
1720
1721 dst = m_frame->m_reconPic->getCbAddr(cu.m_cuAddr, zorder);
1722 src = reconYuv.getCbAddr(absPartIdxC);
1723 primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize);
1724
1725 dst = m_frame->m_reconPic->getCrAddr(cu.m_cuAddr, zorder);
1726 src = reconYuv.getCrAddr(absPartIdxC);
1727 primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize);
1728 }
1729
1730 memcpy(cu.m_cbf[1] + absPartIdxC, m_qtTempCbf[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
1731 memcpy(cu.m_cbf[2] + absPartIdxC, m_qtTempCbf[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
1732 memcpy(cu.m_transformSkip[1] + absPartIdxC, m_qtTempTransformSkipFlag[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
1733 memcpy(cu.m_transformSkip[2] + absPartIdxC, m_qtTempTransformSkipFlag[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
1734 cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTuDepth);
1735 totalDistortion += bestDist;
1736 }
1737 while (tuIterator.isNextSection());
1738
1739 if (initTuDepth != 0)
1740 {
1741 uint32_t combCbfU = 0;
1742 uint32_t combCbfV = 0;
1743 uint32_t qNumParts = tuIterator.absPartIdxStep;
1744 for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
1745 {
1746 combCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, 1);
1747 combCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, 1);
1748 }
1749
1750 for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
1751 {
1752 cu.m_cbf[1][offs] |= combCbfU;
1753 cu.m_cbf[2][offs] |= combCbfV;
1754 }
1755 }
1756
1757 /* TODO: remove this */
1758 m_entropyCoder.load(m_rqt[depth].cur);
1759 return totalDistortion;
1760 }
1761
1762 /* estimation of best merge coding of an inter PU (not a merge CU) */
1763 uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, int puIdx, MergeData& m)
1764 {
1765 X265_CHECK(cu.m_partSize[0] != SIZE_2Nx2N, "merge tested on 2Nx2N partition\n");
1766
1767 m.maxNumMergeCand = cu.getInterMergeCandidates(m.absPartIdx, puIdx, m.mvFieldNeighbours, m.interDirNeighbours);
1768
1769 if (cu.isBipredRestriction())
1770 {
1771 /* in 8x8 CUs do not allow bidir merge candidates if not 2Nx2N */
1772 for (uint32_t mergeCand = 0; mergeCand < m.maxNumMergeCand; ++mergeCand)
1773 {
1774 if (m.interDirNeighbours[mergeCand] == 3)
1775 {
1776 m.interDirNeighbours[mergeCand] = 1;
1777 m.mvFieldNeighbours[mergeCand][1].refIdx = REF_NOT_VALID;
1778 }
1779 }
1780 }
1781
1782 Yuv& tempYuv = m_rqt[cuGeom.depth].tmpPredYuv;
1783
1784 uint32_t outCost = MAX_UINT;
1785 for (uint32_t mergeCand = 0; mergeCand < m.maxNumMergeCand; ++mergeCand)
1786 {
1787 /* Prevent TMVP candidates from using unavailable reference pixels */
1788 if (m_bFrameParallel &&
1789 (m.mvFieldNeighbours[mergeCand][0].mv.y >= (m_param->searchRange + 1) * 4 ||
1790 m.mvFieldNeighbours[mergeCand][1].mv.y >= (m_param->searchRange + 1) * 4))
1791 continue;
1792
1793 cu.m_mv[0][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][0].mv;
1794 cu.m_refIdx[0][m.absPartIdx] = (int8_t)m.mvFieldNeighbours[mergeCand][0].refIdx;
1795 cu.m_mv[1][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][1].mv;
1796 cu.m_refIdx[1][m.absPartIdx] = (int8_t)m.mvFieldNeighbours[mergeCand][1].refIdx;
1797
1798 prepMotionCompensation(cu, cuGeom, puIdx);
1799 motionCompensation(tempYuv, true, m_me.bChromaSATD);
1800
1801 uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(m.absPartIdx), tempYuv.m_size);
1802 if (m_me.bChromaSATD)
1803 costCand += m_me.bufChromaSATD(tempYuv, m.absPartIdx);
1804
1805 uint32_t bitsCand = getTUBits(mergeCand, m.maxNumMergeCand);
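/* the merge candidate index is coded truncated-unary; getTUBits() returns
 * its code length given maxNumMergeCand */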
1806 costCand += m_rdCost.getCost(bitsCand);
1807 if (costCand < outCost)
1808 {
1809 outCost = costCand;
1810 m.bits = bitsCand;
1811 m.index = mergeCand;
1812 }
1813 }
1814
1815 m.mvField[0] = m.mvFieldNeighbours[m.index][0];
1816 m.mvField[1] = m.mvFieldNeighbours[m.index][1];
1817 m.interDir = m.interDirNeighbours[m.index];
1818
1819 return outCost;
1820 }
1821
1822 /* this function assumes the caller has configured its MotionEstimation engine with the
1823 * correct source plane and source PU, and has called prepMotionCompensation() to set
1824 * m_puAbsPartIdx, m_puWidth, and m_puHeight */
1825 void Search::singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, int part, int list, int ref)
1826 {
1827 uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS;
1828 bits += getTUBits(ref, m_slice->m_numRefIdx[list]);
1829
1830 MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
1831 int numMvc = interMode.cu.fillMvpCand(part, m_puAbsPartIdx, list, ref, interMode.amvpCand[list][ref], mvc);
1832
1833 int mvpIdx = 0;
1834 int merange = m_param->searchRange;
1835 MotionData* bestME = interMode.bestME[part];
1836
1837 if (interMode.amvpCand[list][ref][0] != interMode.amvpCand[list][ref][1])
1838 {
1839 uint32_t bestCost = MAX_INT;
1840 for (int i = 0; i < AMVP_NUM_CANDS; i++)
1841 {
1842 MV mvCand = interMode.amvpCand[list][ref][i];
1843
1844 // NOTE: skip mvCand if its Y component exceeds the search range while frame parallelism (--frame-threads > 1) is active
1845 if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
1846 continue;
1847
1848 interMode.cu.clipMv(mvCand);
1849
1850 Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
1851 predInterLumaPixel(tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPic, mvCand);
1852 uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
1853
1854 if (bestCost > cost)
1855 {
1856 bestCost = cost;
1857 mvpIdx = i;
1858 }
1859 }
1860 }
1861
1862 MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[list][ref][mvpIdx];
1863 setSearchRange(interMode.cu, mvp, merange, mvmin, mvmax);
1864
1865 int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
1866
1867 /* Get total cost of partition, but only include MV bit cost once */
1868 bits += m_me.bitcost(outmv);
1869 uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
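/* the cost returned by motionEstimate() already contains a lambda-weighted
 * MV cost; it is removed here and the MV rate is instead folded into
 * 'bits', so the rate term is charged exactly once via getCost(bits) */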
1870
1871 /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
1872 checkBestMVP(interMode.amvpCand[list][ref], outmv, mvp, mvpIdx, bits, cost);
1873
1874 /* tie goes to the smallest ref ID, just like --no-pme */
1875 ScopedLock _lock(master.m_meLock);
1876 if (cost < bestME[list].cost ||
1877 (cost == bestME[list].cost && ref < bestME[list].ref))
1878 {
1879 bestME[list].mv = outmv;
1880 bestME[list].mvp = mvp;
1881 bestME[list].mvpIdx = mvpIdx;
1882 bestME[list].ref = ref;
1883 bestME[list].cost = cost;
1884 bestME[list].bits = bits;
1885 }
1886 }
1887
1888 /* search of the best candidate for inter prediction
1889 * returns true if predYuv was filled with a motion compensated prediction */
1890 bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChromaSA8D)
1891 {
1892 CUData& cu = interMode.cu;
1893 Yuv* predYuv = &interMode.predYuv;
1894
1895 MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
1896
1897 const Slice *slice = m_slice;
1898 int numPart = cu.getNumPartInter();
1899 int numPredDir = slice->isInterP() ? 1 : 2;
1900 const int* numRefIdx = slice->m_numRefIdx;
1901 uint32_t lastMode = 0;
1902 int totalmebits = 0;
1903 bool bDistributed = m_param->bDistributeMotionEstimation && (numRefIdx[0] + numRefIdx[1]) > 2;
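/* distribute ME to worker threads only when more than two (list, ref)
 * pairs must be searched for each PU */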
1904 MV mvzero(0, 0);
1905 Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
1906
1907 MergeData merge;
1908 memset(&merge, 0, sizeof(merge));
1909
1910 for (int puIdx = 0; puIdx < numPart; puIdx++)
1911 {
1912 MotionData* bestME = interMode.bestME[puIdx];
1913
1914 /* sets m_puAbsPartIdx, m_puWidth, m_puHeight */
1915 initMotionCompensation(cu, cuGeom, puIdx);
1916
1917 m_me.setSourcePU(*interMode.fencYuv, cu.m_cuAddr, cuGeom.encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
1918
1919 uint32_t mrgCost = MAX_UINT;
1920
1921 /* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
1922 if (cu.m_partSize[0] != SIZE_2Nx2N)
1923 {
1924 merge.absPartIdx = m_puAbsPartIdx;
1925 merge.width = m_puWidth;
1926 merge.height = m_puHeight;
1927 mrgCost = mergeEstimation(cu, cuGeom, puIdx, merge);
1928
1929 if (bMergeOnly)
1930 {
1931 if (mrgCost == MAX_UINT)
1932 {
1933 /* No valid merge modes were found; there is no way to produce a
1934 * valid motion compensated prediction, so early-exit */
1935 return false;
1936 }
1937 // set merge result
1938 cu.m_mergeFlag[m_puAbsPartIdx] = true;
1939 cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx
1940 cu.setPUInterDir(merge.interDir, m_puAbsPartIdx, puIdx);
1941 cu.setPUMv(0, merge.mvField[0].mv, m_puAbsPartIdx, puIdx);
1942 cu.setPURefIdx(0, merge.mvField[0].refIdx, m_puAbsPartIdx, puIdx);
1943 cu.setPUMv(1, merge.mvField[1].mv, m_puAbsPartIdx, puIdx);
1944 cu.setPURefIdx(1, merge.mvField[1].refIdx, m_puAbsPartIdx, puIdx);
1945 totalmebits += merge.bits;
1946
1947 prepMotionCompensation(cu, cuGeom, puIdx);
1948 motionCompensation(*predYuv, true, bChromaSA8D);
1949 continue;
1950 }
1951 }
1952
1953 bestME[0].cost = MAX_UINT;
1954 bestME[1].cost = MAX_UINT;
1955
1956 getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits);
1957
1958 /* Uni-directional prediction */
1959 if (m_param->analysisMode == X265_ANALYSIS_LOAD && bestME[0].ref >= 0)
1960 {
1961 for (int l = 0; l < numPredDir; l++)
1962 {
1963 int ref = bestME[l].ref;
1964 uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS;
1965 bits += getTUBits(ref, numRefIdx[l]);
1966
1967 int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc);
1968
1969 // Pick the best possible MVP from AMVP candidates based on least residual
1970 int mvpIdx = 0;
1971 int merange = m_param->searchRange;
1972
1973 if (interMode.amvpCand[l][ref][0] != interMode.amvpCand[l][ref][1])
1974 {
1975 uint32_t bestCost = MAX_INT;
1976 for (int i = 0; i < AMVP_NUM_CANDS; i++)
1977 {
1978 MV mvCand = interMode.amvpCand[l][ref][i];
1979
1980 // NOTE: skip mvCand if its Y component exceeds the search range while frame parallelism (--frame-threads > 1) is active
1981 if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
1982 continue;
1983
1984 cu.clipMv(mvCand);
1985 predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand);
1986 uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
1987
1988 if (bestCost > cost)
1989 {
1990 bestCost = cost;
1991 mvpIdx = i;
1992 }
1993 }
1994 }
1995
1996 MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[l][ref][mvpIdx];
1997
1998 int satdCost;
1999 setSearchRange(cu, mvp, merange, mvmin, mvmax);
2000 satdCost = m_me.motionEstimate(&slice->m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
2001
2002 /* Get total cost of partition, but only include MV bit cost once */
2003 bits += m_me.bitcost(outmv);
2004 uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
2005
2006 /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
2007 checkBestMVP(interMode.amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost);
2008
2009 if (cost < bestME[l].cost)
2010 {
2011 bestME[l].mv = outmv;
2012 bestME[l].mvp = mvp;
2013 bestME[l].mvpIdx = mvpIdx;
2014 bestME[l].cost = cost;
2015 bestME[l].bits = bits;
2016 }
2017 }
2018 }
2019 else if (bDistributed)
2020 {
2021 m_meLock.acquire();
2022 m_curInterMode = &interMode;
2023 m_curGeom = &cuGeom;
2024 m_curPart = puIdx;
2025 m_totalNumME = 0;
2026 m_numAcquiredME = 1;
2027 m_numCompletedME = 0;
2028 m_totalNumME = numRefIdx[0] + numRefIdx[1];
2029 m_meLock.release();
2030
2031 if (!m_bJobsQueued)
2032 JobProvider::enqueue();
2033
2034 for (int i = 1; i < m_totalNumME; i++)
2035 m_pool->pokeIdleThread();
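/* workers (and this thread, below) atomically claim jobs from a flattened
 * (list, ref) index space: ids 0..numRefIdx[0]-1 map to L0, the remainder
 * to L1; m_numAcquiredME starts at 1 because L0-0 is reserved for this
 * thread */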
2036
2037 do
2038 {
2039 m_meLock.acquire();
2040 if (m_totalNumME > m_numAcquiredME)
2041 {
2042 int id = m_numAcquiredME++;
2043 m_meLock.release();
2044
2045 if (id < numRefIdx[0])
2046 singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, id);
2047 else
2048 singleMotionEstimation(*this, interMode, cuGeom, puIdx, 1, id - numRefIdx[0]);
2049
2050 m_meLock.acquire();
2051 m_numCompletedME++;
2052 m_meLock.release();
2053 }
2054 else
2055 m_meLock.release();
2056 }
2057 while (m_totalNumME > m_numAcquiredME);
2058
2059 if (!m_bJobsQueued)
2060 JobProvider::dequeue();
2061
2062 /* we saved L0-0 for ourselves */
2063 singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, 0);
2064
2065 m_meLock.acquire();
2066 if (++m_numCompletedME == m_totalNumME)
2067 m_meCompletionEvent.trigger();
2068 m_meLock.release();
2069
2070 m_meCompletionEvent.wait();
2071 }
2072 else
2073 {
2074 for (int l = 0; l < numPredDir; l++)
2075 {
2076 for (int ref = 0; ref < numRefIdx[l]; ref++)
2077 {
2078 uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS;
2079 bits += getTUBits(ref, numRefIdx[l]);
2080
2081 int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc);
2082
2083 // Pick the best possible MVP from AMVP candidates based on least residual
2084 int mvpIdx = 0;
2085 int merange = m_param->searchRange;
2086
2087 if (interMode.amvpCand[l][ref][0] != interMode.amvpCand[l][ref][1])
2088 {
2089 uint32_t bestCost = MAX_INT;
2090 for (int i = 0; i < AMVP_NUM_CANDS; i++)
2091 {
2092 MV mvCand = interMode.amvpCand[l][ref][i];
2093
2094 // NOTE: skip mvCand if its Y component exceeds the search range while frame parallelism (--frame-threads > 1) is active
2095 if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
2096 continue;
2097
2098 cu.clipMv(mvCand);
2099 predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand);
2100 uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
2101
2102 if (bestCost > cost)
2103 {
2104 bestCost = cost;
2105 mvpIdx = i;
2106 }
2107 }
2108 }
2109
2110 MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[l][ref][mvpIdx];
2111
2112 setSearchRange(cu, mvp, merange, mvmin, mvmax);
2113 int satdCost = m_me.motionEstimate(&slice->m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
2114
2115 /* Get total cost of partition, but only include MV bit cost once */
2116 bits += m_me.bitcost(outmv);
2117 uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
2118
2119 /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
2120 checkBestMVP(interMode.amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost);
2121
2122 if (cost < bestME[l].cost)
2123 {
2124 bestME[l].mv = outmv;
2125 bestME[l].mvp = mvp;
2126 bestME[l].mvpIdx = mvpIdx;
2127 bestME[l].ref = ref;
2128 bestME[l].cost = cost;
2129 bestME[l].bits = bits;
2130 }
2131 }
2132 }
2133 }
2134
2135 /* Bi-directional prediction */
2136 MotionData bidir[2];
2137 uint32_t bidirCost = MAX_UINT;
2138 int bidirBits = 0;
2139
2140 if (slice->isInterB() && !cu.isBipredRestriction() && /* biprediction is possible for this PU */
2141 cu.m_partSize[m_puAbsPartIdx] != SIZE_2Nx2N && /* 2Nx2N biprediction is handled elsewhere */
2142 bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT)
2143 {
2144 bidir[0] = bestME[0];
2145 bidir[1] = bestME[1];
2146
2147 int satdCost;
2148
2149 if (m_me.bChromaSATD)
2150 {
2151 cu.m_mv[0][m_puAbsPartIdx] = bidir[0].mv;
2152 cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref;
2153 cu.m_mv[1][m_puAbsPartIdx] = bidir[1].mv;
2154 cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref;
2155
2156 prepMotionCompensation(cu, cuGeom, puIdx);
2157 motionCompensation(tmpPredYuv, true, true);
2158
2159 satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) +
2160 m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx);
2161 }
2162 else
2163 {
2164 PicYuv* refPic0 = slice->m_refPicList[0][bestME[0].ref]->m_reconPic;
2165 PicYuv* refPic1 = slice->m_refPicList[1][bestME[1].ref]->m_reconPic;
2166 Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
2167
2168 /* Generate reference subpels */
2169 predInterLumaPixel(bidirYuv[0], *refPic0, bestME[0].mv);
2170 predInterLumaPixel(bidirYuv[1], *refPic1, bestME[1].mv);
2171
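/* average the two unidirectional luma predictions into a single bidir
 * estimate; weight 32 (of 64) selects the equal-weight path */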
2172 primitives.pixelavg_pp[m_me.partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(m_puAbsPartIdx), bidirYuv[0].m_size,
2173 bidirYuv[1].getLumaAddr(m_puAbsPartIdx), bidirYuv[1].m_size, 32);
2174 satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
2175 }
2176
2177 bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
2178 bidirCost = satdCost + m_rdCost.getCost(bidirBits);
2179
2180 bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
2181 if (bTryZero)
2182 {
2183 /* Do not try zero MV if unidir motion predictors are beyond
2184 * valid search area */
2185 MV mvmin, mvmax;
2186 int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
2187 setSearchRange(cu, mvzero, merange, mvmin, mvmax);
2188 mvmax.y += 2; // there is some pad for subpel refine
2189 mvmin <<= 2;
2190 mvmax <<= 2;
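/* setSearchRange() returns full-pel limits; scale back to quarter-pel so
 * the (quarter-pel) MVPs can be range-checked below */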
2191
2192 bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
2193 bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
2194 }
2195 if (bTryZero)
2196 {
2197 /* coincident blocks of the two reference pictures */
2198 if (m_me.bChromaSATD)
2199 {
2200 cu.m_mv[0][m_puAbsPartIdx] = mvzero;
2201 cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref;
2202 cu.m_mv[1][m_puAbsPartIdx] = mvzero;
2203 cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref;
2204
2205 prepMotionCompensation(cu, cuGeom, puIdx);
2206 motionCompensation(tmpPredYuv, true, true);
2207
2208 satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) +
2209 m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx);
2210 }
2211 else
2212 {
2213 const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
2214 const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
2215 intptr_t refStride = slice->m_mref[0][0].lumaStride;
2216
2217 primitives.pixelavg_pp[m_me.partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
2218 satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
2219 }
2220
2221 MV mvp0 = bestME[0].mvp;
2222 int mvpIdx0 = bestME[0].mvpIdx;
2223 uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
2224
2225 MV mvp1 = bestME[1].mvp;
2226 int mvpIdx1 = bestME[1].mvpIdx;
2227 uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
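/* re-derive each direction's rate with a zero MV: remove the MVD cost of
 * the searched MV and add the cost of signalling (0,0) relative to the
 * predictor */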
2228
2229 uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
2230
2231 /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
2232 checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvp0, mvpIdx0, bits0, cost);
2233 checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvp1, mvpIdx1, bits1, cost);
2234
2235 if (cost < bidirCost)
2236 {
2237 bidir[0].mv = mvzero;
2238 bidir[1].mv = mvzero;
2239 bidir[0].mvp = mvp0;
2240 bidir[1].mvp = mvp1;
2241 bidir[0].mvpIdx = mvpIdx0;
2242 bidir[1].mvpIdx = mvpIdx1;
2243 bidirCost = cost;
2244 bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
2245 }
2246 }
2247 }
2248
2249 /* select best option and store into CU */
2250 if (mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost)
2251 {
2252 cu.m_mergeFlag[m_puAbsPartIdx] = true;
2253 cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx
2254 cu.setPUInterDir(merge.interDir, m_puAbsPartIdx, puIdx);
2255 cu.setPUMv(0, merge.mvField[0].mv, m_puAbsPartIdx, puIdx);
2256 cu.setPURefIdx(0, merge.mvField[0].refIdx, m_puAbsPartIdx, puIdx);
2257 cu.setPUMv(1, merge.mvField[1].mv, m_puAbsPartIdx, puIdx);
2258 cu.setPURefIdx(1, merge.mvField[1].refIdx, m_puAbsPartIdx, puIdx);
2259
2260 totalmebits += merge.bits;
2261 }
2262 else if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost)
2263 {
2264 lastMode = 2;
2265
2266 cu.m_mergeFlag[m_puAbsPartIdx] = false;
2267 cu.setPUInterDir(3, m_puAbsPartIdx, puIdx);
2268 cu.setPUMv(0, bidir[0].mv, m_puAbsPartIdx, puIdx);
2269 cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx);
2270 cu.m_mvd[0][m_puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
2271 cu.m_mvpIdx[0][m_puAbsPartIdx] = bidir[0].mvpIdx;
2272
2273 cu.setPUMv(1, bidir[1].mv, m_puAbsPartIdx, puIdx);
2274 cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx);
2275 cu.m_mvd[1][m_puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
2276 cu.m_mvpIdx[1][m_puAbsPartIdx] = bidir[1].mvpIdx;
2277
2278 totalmebits += bidirBits;
2279 }
2280 else if (bestME[0].cost <= bestME[1].cost)
2281 {
2282 lastMode = 0;
2283
2284 cu.m_mergeFlag[m_puAbsPartIdx] = false;
2285 cu.setPUInterDir(1, m_puAbsPartIdx, puIdx);
2286 cu.setPUMv(0, bestME[0].mv, m_puAbsPartIdx, puIdx);
2287 cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx);
2288 cu.m_mvd[0][m_puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;
2289 cu.m_mvpIdx[0][m_puAbsPartIdx] = bestME[0].mvpIdx;
2290
2291 cu.setPURefIdx(1, REF_NOT_VALID, m_puAbsPartIdx, puIdx);
2292 cu.setPUMv(1, mvzero, m_puAbsPartIdx, puIdx);
2293
2294 totalmebits += bestME[0].bits;
2295 }
2296 else
2297 {
2298 lastMode = 1;
2299
2300 cu.m_mergeFlag[m_puAbsPartIdx] = false;
2301 cu.setPUInterDir(2, m_puAbsPartIdx, puIdx);
2302 cu.setPUMv(1, bestME[1].mv, m_puAbsPartIdx, puIdx);
2303 cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx);
2304 cu.m_mvd[1][m_puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;
2305 cu.m_mvpIdx[1][m_puAbsPartIdx] = bestME[1].mvpIdx;
2306
2307 cu.setPURefIdx(0, REF_NOT_VALID, m_puAbsPartIdx, puIdx);
2308 cu.setPUMv(0, mvzero, m_puAbsPartIdx, puIdx);
2309
2310 totalmebits += bestME[1].bits;
2311 }
2312
2313 prepMotionCompensation(cu, cuGeom, puIdx);
2314 motionCompensation(*predYuv, true, bChromaSA8D);
2315 }
2316
2317 interMode.sa8dBits += totalmebits;
2318 return true;
2319 }
2320
2321 void Search::getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3])
2322 {
2323 if (cuMode == SIZE_2Nx2N)
2324 {
2325 blockBit[0] = (!bPSlice) ? 3 : 1;
2326 blockBit[1] = 3;
2327 blockBit[2] = 5;
2328 }
2329 else if (cuMode == SIZE_2NxN || cuMode == SIZE_2NxnU || cuMode == SIZE_2NxnD)
2330 {
2331 static const uint32_t listBits[2][3][3] =
2332 {
2333 { { 0, 0, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
2334 { { 5, 7, 7 }, { 7, 5, 7 }, { 9 - 3, 9 - 3, 9 - 3 } }
2335 };
2336 if (bPSlice)
2337 {
2338 blockBit[0] = 3;
2339 blockBit[1] = 0;
2340 blockBit[2] = 0;
2341 }
2342 else
2343 memcpy(blockBit, listBits[partIdx][lastMode], 3 * sizeof(uint32_t));
2344 }
2345 else if (cuMode == SIZE_Nx2N || cuMode == SIZE_nLx2N || cuMode == SIZE_nRx2N)
2346 {
2347 static const uint32_t listBits[2][3][3] =
2348 {
2349 { { 0, 2, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
2350 { { 5, 7, 7 }, { 7 - 2, 7 - 2, 9 - 2 }, { 9 - 3, 9 - 3, 9 - 3 } }
2351 };
2352 if (bPSlice)
2353 {
2354 blockBit[0] = 3;
2355 blockBit[1] = 0;
2356 blockBit[2] = 0;
2357 }
2358 else
2359 memcpy(blockBit, listBits[partIdx][lastMode], 3 * sizeof(uint32_t));
2360 }
2361 else if (cuMode == SIZE_NxN)
2362 {
2363 blockBit[0] = (!bPSlice) ? 3 : 1;
2364 blockBit[1] = 3;
2365 blockBit[2] = 5;
2366 }
2367 else
2368 {
2369 X265_CHECK(0, "getBlkBits: unknown cuMode\n");
2370 }
2371 }
2372
2373 /* Check if using an alternative MVP would result in a smaller MVD + signal bits */
2374 void Search::checkBestMVP(MV* amvpCand, MV mv, MV& mvPred, int& outMvpIdx, uint32_t& outBits, uint32_t& outCost) const
2375 {
2376 X265_CHECK(amvpCand[outMvpIdx] == mvPred, "checkBestMVP: unexpected mvPred\n");
2377
2378 int mvpIdx = !outMvpIdx;
2379 MV mvp = amvpCand[mvpIdx];
2380 int diffBits = m_me.bitcost(mv, mvp) - m_me.bitcost(mv, mvPred);
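/* AMVP always holds exactly two candidates, so !outMvpIdx selects the
 * alternative predictor; diffBits is the resulting change in MVD rate */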
2381 if (diffBits < 0)
2382 {
2383 outMvpIdx = mvpIdx;
2384 mvPred = mvp;
2385 uint32_t origOutBits = outBits;
2386 outBits = origOutBits + diffBits;
2387 outCost = (outCost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(outBits);
2388 }
2389 }
2390
2391 void Search::setSearchRange(const CUData& cu, MV mvp, int merange, MV& mvmin, MV& mvmax) const
2392 {
2393 cu.clipMv(mvp);
2394
2395 MV dist((int16_t)merange << 2, (int16_t)merange << 2);
2396 mvmin = mvp - dist;
2397 mvmax = mvp + dist;
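/* MVs are in quarter-pel units, so the full-pel merange is scaled by four
 * when forming the search window around the clipped predictor */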
2398
2399 cu.clipMv(mvmin);
2400 cu.clipMv(mvmax);
2401
2402 /* Clip search range to signaled maximum MV length.
2403 * We do not support this VUI field being changed from the default */
2404 const int maxMvLen = (1 << 15) - 1;
2405 mvmin.x = X265_MAX(mvmin.x, -maxMvLen);
2406 mvmin.y = X265_MAX(mvmin.y, -maxMvLen);
2407 mvmax.x = X265_MIN(mvmax.x, maxMvLen);
2408 mvmax.y = X265_MIN(mvmax.y, maxMvLen);
2409
2410 mvmin >>= 2;
2411 mvmax >>= 2;
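/* the limits are returned in full-pel units, as expected by the full-pel
 * motion search; callers rescale to quarter-pel where needed */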
2412
2413 /* conditional clipping for frame parallelism */
2414 mvmin.y = X265_MIN(mvmin.y, (int16_t)m_refLagPixels);
2415 mvmax.y = X265_MIN(mvmax.y, (int16_t)m_refLagPixels);
2416 }
2417
2418 /* Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
2419 void Search::encodeResAndCalcRdSkipCU(Mode& interMode)
2420 {
2421 CUData& cu = interMode.cu;
2422 Yuv* reconYuv = &interMode.reconYuv;
2423 const Yuv* fencYuv = interMode.fencYuv;
2424
2425 X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
2426
2427 uint32_t cuSize = 1 << cu.m_log2CUSize[0];
2428 uint32_t depth = cu.m_cuDepth[0];
2429
2430 // No residual coding : SKIP mode
2431
2432 cu.setPredModeSubParts(MODE_SKIP);
2433 cu.clearCbf();
2434 cu.setTUDepthSubParts(0, 0, depth);
2435
2436 reconYuv->copyFromYuv(interMode.predYuv);
2437
2438 // Luma
2439 int part = partitionFromLog2Size(cu.m_log2CUSize[0]);
2440 interMode.distortion = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
2441 // Chroma
2442 part = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
2443 interMode.distortion += m_rdCost.scaleChromaDist(1, primitives.sse_pp[part](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
2444 interMode.distortion += m_rdCost.scaleChromaDist(2, primitives.sse_pp[part](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
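/* scaleChromaDist() weights chroma SSE by the chroma lambda ratio so it
 * can be accumulated with luma distortion in a single metric */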
2445
2446 m_entropyCoder.load(m_rqt[depth].cur);
2447 m_entropyCoder.resetBits();
2448 if (m_slice->m_pps->bTransquantBypassEnabled)
2449 m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
2450 m_entropyCoder.codeSkipFlag(cu, 0);
2451 m_entropyCoder.codeMergeIndex(cu, 0);
2452
2453 interMode.mvBits = m_entropyCoder.getNumberOfWrittenBits();
2454 interMode.coeffBits = 0;
2455 interMode.totalBits = interMode.mvBits;
2456 if (m_rdCost.m_psyRd)
2457 interMode.psyEnergy = m_rdCost.psyCost(cu.m_log2CUSize[0] - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
2458
2459 updateModeCost(interMode);
2460 m_entropyCoder.store(interMode.contexts);
2461 }
2462
2463 /* encode residual and calculate rate-distortion for a CU block.
2464 * Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
2465 void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
2466 {
2467 CUData& cu = interMode.cu;
2468 Yuv* reconYuv = &interMode.reconYuv;
2469 Yuv* predYuv = &interMode.predYuv;
2470 ShortYuv* resiYuv = &m_rqt[cuGeom.depth].tmpResiYuv;
2471 const Yuv* fencYuv = interMode.fencYuv;
2472
2473 X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
2474
2475 uint32_t log2CUSize = cu.m_log2CUSize[0];
2476 uint32_t cuSize = 1 << log2CUSize;
2477 uint32_t depth = cu.m_cuDepth[0];
2478
2479 int part = partitionFromLog2Size(log2CUSize);
2480 int cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
2481
2482 m_quant.setQPforQuant(interMode.cu);
2483
2484 resiYuv->subtract(*fencYuv, *predYuv, log2CUSize);
2485
2486 uint32_t tuDepthRange[2];
2487 cu.getInterTUQtDepthRange(tuDepthRange, 0);
2488
2489 m_entropyCoder.load(m_rqt[depth].cur);
2490
2491 Cost costs;
2492 estimateResidualQT(interMode, cuGeom, 0, depth, *resiYuv, costs, tuDepthRange);
2493
2494 if (!cu.m_tqBypass[0])
2495 {
2496 uint32_t cbf0Dist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
2497 cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.sse_pp[cpart](fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
2498 cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.sse_pp[cpart](fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
2499
2500 /* Consider the RD cost of not signaling any residual */
2501 m_entropyCoder.load(m_rqt[depth].cur);
2502 m_entropyCoder.resetBits();
2503 m_entropyCoder.codeQtRootCbfZero();
2504 uint32_t cbf0Bits = m_entropyCoder.getNumberOfWrittenBits();
2505
2506 uint64_t cbf0Cost;
2507 uint32_t cbf0Energy;
2508 if (m_rdCost.m_psyRd)
2509 {
2510 cbf0Energy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
2511 cbf0Cost = m_rdCost.calcPsyRdCost(cbf0Dist, cbf0Bits, cbf0Energy);
2512 }
2513 else
2514 cbf0Cost = m_rdCost.calcRdCost(cbf0Dist, cbf0Bits);
2515
2516 if (cbf0Cost < costs.rdcost)
2517 {
2518 cu.clearCbf();
2519 cu.setTUDepthSubParts(0, 0, depth);
2520 }
2521 }
2522
2523 if (cu.getQtRootCbf(0))
2524 saveResidualQTData(cu, *resiYuv, 0, depth);
2525
2526 /* calculate signal bits for inter/merge/skip coded CU */
2527 m_entropyCoder.load(m_rqt[depth].cur);
2528
2529 uint32_t coeffBits, bits;
2530 if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
2531 {
2532 cu.setPredModeSubParts(MODE_SKIP);
2533
2534 /* Merge/Skip */
2535 m_entropyCoder.resetBits();
2536 if (m_slice->m_pps->bTransquantBypassEnabled)
2537 m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
2538 m_entropyCoder.codeSkipFlag(cu, 0);
2539 m_entropyCoder.codeMergeIndex(cu, 0);
2540 coeffBits = 0;
2541 bits = m_entropyCoder.getNumberOfWrittenBits();
2542 }
2543 else
2544 {
2545 m_entropyCoder.resetBits();
2546 if (m_slice->m_pps->bTransquantBypassEnabled)
2547 m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
2548 m_entropyCoder.codeSkipFlag(cu, 0);
2549 m_entropyCoder.codePredMode(cu.m_predMode[0]);
2550 m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]);
2551 m_entropyCoder.codePredInfo(cu, 0);
2552 uint32_t mvBits = m_entropyCoder.getNumberOfWrittenBits();
2553
2554 bool bCodeDQP = m_slice->m_pps->bUseDQP;
2555 m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
2556 bits = m_entropyCoder.getNumberOfWrittenBits();
2557
2558 coeffBits = bits - mvBits;
2559 }
2560
2561 m_entropyCoder.store(interMode.contexts);
2562
2563 if (cu.getQtRootCbf(0))
2564 reconYuv->addClip(*predYuv, *resiYuv, log2CUSize);
2565 else
2566 reconYuv->copyFromYuv(*predYuv);
2567
2568 // update with clipped distortion and cost (qp estimation loop uses unclipped values)
2569 uint32_t bestDist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
2570 bestDist += m_rdCost.scaleChromaDist(1, primitives.sse_pp[cpart](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
2571 bestDist += m_rdCost.scaleChromaDist(2, primitives.sse_pp[cpart](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
2572 if (m_rdCost.m_psyRd)
2573 interMode.psyEnergy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
2574
2575 interMode.totalBits = bits;
2576 interMode.distortion = bestDist;
2577 interMode.coeffBits = coeffBits;
2578 interMode.mvBits = bits - coeffBits;
2579 updateModeCost(interMode);
2580 }
2581
2582 void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, const uint32_t depthRange[2])
2583 {
2584 CUData& cu = mode.cu;
2585 X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "invalid depth\n");
2586
2587 uint32_t log2TrSize = g_maxLog2CUSize - depth;
2588 uint32_t tuDepth = depth - cu.m_cuDepth[0];
2589
2590 bool bCheckFull = log2TrSize <= depthRange[1];
2591 if (cu.m_partSize[0] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && log2TrSize > depthRange[0])
2592 bCheckFull = false;
2593
2594 if (bCheckFull)
2595 {
2596 // code full block
2597 uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
2598 bool bCodeChroma = true;
2599 uint32_t tuDepthC = tuDepth;
2600 if (log2TrSizeC < 2)
2601 {
2602 X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
2603 log2TrSizeC = 2;
2604 tuDepthC--;
2605 bCodeChroma = !(absPartIdx & 3);
2606 }
2607
2608 uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
2609 uint32_t setCbf = 1 << tuDepth;
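/* cbf flags are stored as one bit per TU depth; a coded block at this
 * depth sets bit 'tuDepth' of the partition's cbf byte */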
2610
2611 uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
2612 coeff_t *coeffCurY = cu.m_trCoeff[0] + coeffOffsetY;
2613
2614 uint32_t sizeIdx = log2TrSize - 2;
2615
2616 cu.setTUDepthSubParts(depth - cu.m_cuDepth[0], absPartIdx, depth);
2617 cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
2618
2619 ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
2620 const Yuv* fencYuv = mode.fencYuv;
2621
2622 int16_t* curResiY = resiYuv.getLumaAddr(absPartIdx);
2623 uint32_t strideResiY = resiYuv.m_size;
2624
2625 const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
2626 uint32_t numSigY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
2627
2628 if (numSigY)
2629 {
2630 m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSigY);
2631 cu.setCbfSubParts(setCbf, TEXT_LUMA, absPartIdx, depth);
2632 }
2633 else
2634 {
2635 primitives.blockfill_s[sizeIdx](curResiY, strideResiY, 0);
2636 cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, depth);
2637 }
2638
2639 if (bCodeChroma)
2640 {
2641 uint32_t sizeIdxC = log2TrSizeC - 2;
2642 uint32_t strideResiC = resiYuv.m_csize;
2643
2644 uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
2645 coeff_t *coeffCurU = cu.m_trCoeff[1] + coeffOffsetC;
2646 coeff_t *coeffCurV = cu.m_trCoeff[2] + coeffOffsetC;
2647 bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
2648
2649 TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
2650 do
2651 {
2652 uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
2653 uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
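/* in 4:2:2 a chroma TU is split vertically into two square sub-TUs whose
 * coefficients are stored back to back, hence the offset of
 * section * trSizeC * trSizeC */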
2654
2655 cu.setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
2656 cu.setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
2657
2658 int16_t* curResiU = resiYuv.getCbAddr(absPartIdxC);
2659 const pixel* fencCb = fencYuv->getCbAddr(absPartIdxC);
2660 uint32_t numSigU = m_quant.transformNxN(cu, fencCb, fencYuv->m_csize, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false);
2661 if (numSigU)
2662 {
2663 m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, false, false, numSigU);
2664 cu.setCbfPartRange(setCbf, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
2665 }
2666 else
2667 {
2668 primitives.blockfill_s[sizeIdxC](curResiU, strideResiC, 0);
2669 cu.setCbfPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
2670 }
2671
2672 int16_t* curResiV = resiYuv.getCrAddr(absPartIdxC);
2673 const pixel* fencCr = fencYuv->getCrAddr(absPartIdxC);
2674 uint32_t numSigV = m_quant.transformNxN(cu, fencCr, fencYuv->m_csize, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false);
2675 if (numSigV)
2676 {
2677 m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, false, false, numSigV);
2678 cu.setCbfPartRange(setCbf, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
2679 }
2680 else
2681 {
2682 primitives.blockfill_s[sizeIdxC](curResiV, strideResiC, 0);
2683 cu.setCbfPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
2684 }
2685 }
2686 while (tuIterator.isNextSection());
2687
2688 if (splitIntoSubTUs)
2689 {
2690 offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
2691 offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
2692 }
2693 }
2694 }
2695 else
2696 {
2697 X265_CHECK(log2TrSize > depthRange[0], "residualTransformQuantInter recursion check failure\n");
2698
2699 uint32_t qNumParts = 1 << ((log2TrSize - 1 - LOG2_UNIT_SIZE) * 2);
2700 uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
2701 for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
2702 {
2703 residualTransformQuantInter(mode, cuGeom, qPartIdx, depth + 1, depthRange);
2704 ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
2705 ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
2706 vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
2707 }
2708 for (uint32_t i = 0; i < 4 * qNumParts; i++)
2709 {
2710 cu.m_cbf[TEXT_LUMA][absPartIdx + i] |= ycbf << tuDepth;
2711 cu.m_cbf[TEXT_CHROMA_U][absPartIdx + i] |= ucbf << tuDepth;
2712 cu.m_cbf[TEXT_CHROMA_V][absPartIdx + i] |= vcbf << tuDepth;
2713 }
2714 }
2715 }
2716
2717 uint64_t Search::estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId)
2718 {
2719 uint32_t nullBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth);
2720
2721 if (m_rdCost.m_psyRd)
2722 return m_rdCost.calcPsyRdCost(dist, nullBits, psyEnergy);
2723 else
2724 return m_rdCost.calcRdCost(dist, nullBits);
2725 }
2726
2727 void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2])
2728 {
2729 CUData& cu = mode.cu;
2730 uint32_t log2TrSize = g_maxLog2CUSize - depth;
2731
2732 bool bCheckSplit = log2TrSize > depthRange[0];
2733 bool bCheckFull = log2TrSize <= depthRange[1];
2734 bool bSplitPresentFlag = bCheckSplit && bCheckFull;
2735
2736 if (cu.m_partSize[0] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && bCheckSplit)
2737 bCheckFull = false;
2738
2739 X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
2740 X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
2741
2742 uint32_t tuDepth = depth - cu.m_cuDepth[0];
2743 uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
2744 bool bCodeChroma = true;
2745 uint32_t tuDepthC = tuDepth;
2746 if (log2TrSizeC < 2)
2747 {
2748 X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
2749 log2TrSizeC = 2;
2750 tuDepthC--;
2751 bCodeChroma = !(absPartIdx & 3);
2752 }
2753
2754 // code full block
2755 Cost fullCost;
2756 fullCost.rdcost = MAX_INT64;
2757
2758 uint8_t cbfFlag[MAX_NUM_COMPONENT][2 /* 0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU */] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2759 uint32_t numSig[MAX_NUM_COMPONENT][2] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2760 uint32_t singleBits[MAX_NUM_COMPONENT][2] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2761 uint32_t singleDist[MAX_NUM_COMPONENT][2] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2762 uint32_t singlePsyEnergy[MAX_NUM_COMPONENT][2] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2763 uint32_t bestTransformMode[MAX_NUM_COMPONENT][2] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2764 uint64_t minCost[MAX_NUM_COMPONENT][2] = { { MAX_INT64, MAX_INT64 }, { MAX_INT64, MAX_INT64 }, { MAX_INT64, MAX_INT64 } };
2765
2766 m_entropyCoder.store(m_rqt[depth].rqtRoot);
2767
2768 uint32_t trSize = 1 << log2TrSize;
2769 const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
2770 uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
2771 const Yuv* fencYuv = mode.fencYuv;
2772
2773 // code full block
2774 if (bCheckFull)
2775 {
2776 uint32_t trSizeC = 1 << log2TrSizeC;
2777 int partSize = partitionFromLog2Size(log2TrSize);
2778 int partSizeC = partitionFromLog2Size(log2TrSizeC);
2779 const uint32_t qtLayer = log2TrSize - 2;
2780 uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
2781 coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
2782
2783 bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0];
2784 bool checkTransformSkipY = checkTransformSkip && log2TrSize <= MAX_LOG2_TS_SIZE;
2785 bool checkTransformSkipC = checkTransformSkip && log2TrSizeC <= MAX_LOG2_TS_SIZE;
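/* transform skip is allowed only up to MAX_LOG2_TS_SIZE (4x4 in the
 * standard profiles); luma and chroma are gated separately since their
 * transform sizes differ */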
2786
2787 cu.setTUDepthSubParts(depth - cu.m_cuDepth[0], absPartIdx, depth);
2788 cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
2789
2790 if (m_bEnableRDOQ)
2791 m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
2792
2793 const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
2794 int16_t* resi = resiYuv.getLumaAddr(absPartIdx);
2795 numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
2796 cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0];
2797
2798 m_entropyCoder.resetBits();
2799
2800 if (bSplitPresentFlag && log2TrSize > depthRange[0])
2801 m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
2802 fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
2803
2804 // Coding of the luma cbf flag has been removed from here: the cbf context differs at each depth,
2805 // so (at least for analysis) it is valid to encode the coefficients first and the cbf afterwards.
2806 // m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
2807 if (cbfFlag[TEXT_LUMA][0])
2808 m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
2809
2810 uint32_t singleBitsPrev = m_entropyCoder.getNumberOfWrittenBits();
2811 singleBits[TEXT_LUMA][0] = singleBitsPrev - fullCost.bits;
2812
2813 X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n");
2814 uint32_t distY = primitives.ssd_s[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size);
2815 uint32_t psyEnergyY = 0;
2816 if (m_rdCost.m_psyRd)
2817 psyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, (int16_t*)zeroShort, 0);
2818
2819 int16_t* curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
2820 uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size;
2821
2822 if (cbfFlag[TEXT_LUMA][0])
2823 {
2824 m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSig[TEXT_LUMA][0]); //this is for inter mode only
2825
2826 // non-zero cost calculation for luma - this is an approximation;
2827 // the correct cbf is encoded only after comparing against the null cost
2828 const uint32_t nonZeroDistY = primitives.sse_ss[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
2829 uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
2830 uint32_t nonZeroPsyEnergyY = 0; uint64_t singleCostY = 0;
2831 if (m_rdCost.m_psyRd)
2832 {
2833 nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
2834 singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroPsyEnergyY);
2835 }
2836 else
2837 singleCostY = m_rdCost.calcRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0]);
2838
2839 if (cu.m_tqBypass[0])
2840 {
2841 singleDist[TEXT_LUMA][0] = nonZeroDistY;
2842 singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
2843 }
2844 else
2845 {
2846 // zero-cost calculation for luma; this is an approximation, as was the initial cost
2847 // calculation. The zero cbf is encoded via the bit estimator without writing to the
2848 // bitstream, leaving m_fracBits unchanged. The same holds for chroma below.
2849 uint64_t nullCostY = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
2850
2851 if (nullCostY < singleCostY)
2852 {
2853 cbfFlag[TEXT_LUMA][0] = 0;
2854 singleBits[TEXT_LUMA][0] = 0;
2855 primitives.blockfill_s[partSize](curResiY, strideResiY, 0);
2856 #if CHECKED_BUILD || _DEBUG
2857 uint32_t numCoeffY = 1 << (log2TrSize << 1);
2858 memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY);
2859 #endif
2860 if (checkTransformSkipY)
2861 minCost[TEXT_LUMA][0] = nullCostY;
2862 singleDist[TEXT_LUMA][0] = distY;
2863 singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY;
2864 }
2865 else
2866 {
2867 if (checkTransformSkipY)
2868 minCost[TEXT_LUMA][0] = singleCostY;
2869 singleDist[TEXT_LUMA][0] = nonZeroDistY;
2870 singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
2871 }
2872 }
2873 }
2874 else
2875 {
2876 if (checkTransformSkipY)
2877 minCost[TEXT_LUMA][0] = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
2878 primitives.blockfill_s[partSize](curResiY, strideResiY, 0);
2879 singleDist[TEXT_LUMA][0] = distY;
2880 singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY;
2881 }
2882
2883 cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
2884
2885 if (bCodeChroma)
2886 {
2887 uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
2888 uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
2889 for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
2890 {
2891 uint32_t distC = 0, psyEnergyC = 0;
2892 coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
2893 TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
2894
2895 do
2896 {
2897 uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
2898 uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
2899
2900 cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
2901
2902 if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
2903 m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
2904
2905 fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
2906 resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
2907 numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
2908 cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
2909
2910 // Coding of the chroma cbf flags has been removed from here, as for luma above
2911 // m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][tuIterator.section], tuDepth);
2912 if (cbfFlag[chromaId][tuIterator.section])
2913 m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
2914 uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits();
2915 singleBits[chromaId][tuIterator.section] = newBits - singleBitsPrev;
2916 singleBitsPrev = newBits;
2917
2918 int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
2919 distC = m_rdCost.scaleChromaDist(chromaId, primitives.ssd_s[log2TrSizeC - 2](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize));
2920
2921 if (cbfFlag[chromaId][tuIterator.section])
2922 {
2923 m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiC, strideResiC, coeffCurC + subTUOffset,
2924 log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]);
2925
2926 // non-zero cost calculation for chroma, analogous to luma - this is an approximation;
2927 // the correct cbf is encoded only after comparing against the null cost
2928 uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
2929 uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
2930 uint32_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
2931 uint32_t nonZeroPsyEnergyC = 0; uint64_t singleCostC = 0;
2932 if (m_rdCost.m_psyRd)
2933 {
2934 nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
2935 singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
2936 }
2937 else
2938 singleCostC = m_rdCost.calcRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section]);
2939
2940 if (cu.m_tqBypass[0])
2941 {
2942 singleDist[chromaId][tuIterator.section] = nonZeroDistC;
2943 singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
2944 }
2945 else
2946 {
                            // Zero-cost calculation for chroma - this is an approximation
                            uint64_t nullCostC = estimateNullCbfCost(distC, psyEnergyC, tuDepth, (TextType)chromaId);

                            if (nullCostC < singleCostC)
                            {
                                cbfFlag[chromaId][tuIterator.section] = 0;
                                singleBits[chromaId][tuIterator.section] = 0;
                                primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0);
#if CHECKED_BUILD || _DEBUG
                                uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
                                memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
#endif
                                if (checkTransformSkipC)
                                    minCost[chromaId][tuIterator.section] = nullCostC;
                                singleDist[chromaId][tuIterator.section] = distC;
                                singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC;
                            }
                            else
                            {
                                if (checkTransformSkipC)
                                    minCost[chromaId][tuIterator.section] = singleCostC;
                                singleDist[chromaId][tuIterator.section] = nonZeroDistC;
                                singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
                            }
                        }
                    }
                    else
                    {
                        if (checkTransformSkipC)
                            minCost[chromaId][tuIterator.section] = estimateNullCbfCost(distC, psyEnergyC, tuDepthC, (TextType)chromaId);
                        primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0);
                        singleDist[chromaId][tuIterator.section] = distC;
                        singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC;
                    }

                    cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
                }
                while (tuIterator.isNextSection());
            }
        }

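        /* Transform-skip evaluation for luma: re-run the forward transform with the skip
         * flag set, code the coefficients into temporary buffers, and keep skip mode only
         * if its RD cost beats the best transform cost found above (minCost). */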
        if (checkTransformSkipY)
        {
            uint32_t nonZeroDistY = 0;
            uint32_t nonZeroPsyEnergyY = 0;
            uint64_t singleCostY = MAX_INT64;

            ALIGN_VAR_32(coeff_t, tsCoeffY[MAX_TS_SIZE * MAX_TS_SIZE]);
            ALIGN_VAR_32(int16_t, tsResiY[MAX_TS_SIZE * MAX_TS_SIZE]);

            m_entropyCoder.load(m_rqt[depth].rqtRoot);

            cu.setTransformSkipSubParts(1, TEXT_LUMA, absPartIdx, depth);

            if (m_bEnableRDOQ)
                m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);

            fenc = fencYuv->getLumaAddr(absPartIdx);
            resi = resiYuv.getLumaAddr(absPartIdx);
            uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, tsCoeffY, log2TrSize, TEXT_LUMA, absPartIdx, true);

            if (numSigTSkipY)
            {
                m_entropyCoder.resetBits();
                m_entropyCoder.codeQtCbfLuma(!!numSigTSkipY, tuDepth);
                m_entropyCoder.codeCoeffNxN(cu, tsCoeffY, absPartIdx, log2TrSize, TEXT_LUMA);
                const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits();

                m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], tsResiY, trSize, tsCoeffY, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);

                nonZeroDistY = primitives.sse_ss[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, tsResiY, trSize);

                if (m_rdCost.m_psyRd)
                {
                    nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, tsResiY, trSize);
                    singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroPsyEnergyY);
                }
                else
                    singleCostY = m_rdCost.calcRdCost(nonZeroDistY, skipSingleBitsY);
            }

            if (!numSigTSkipY || minCost[TEXT_LUMA][0] < singleCostY)
                cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
            else
            {
                singleDist[TEXT_LUMA][0] = nonZeroDistY;
                singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
                cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
                bestTransformMode[TEXT_LUMA][0] = 1;
                uint32_t numCoeffY = 1 << (log2TrSize << 1);
                memcpy(coeffCurY, tsCoeffY, sizeof(coeff_t) * numCoeffY);
                primitives.luma_copy_ss[partSize](curResiY, strideResiY, tsResiY, trSize);
            }

            cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
        }

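        /* Transform-skip evaluation for chroma, per component and per sub-TU, using the
         * same trial-and-compare scheme as the luma transform-skip check above. */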
        if (bCodeChroma && checkTransformSkipC)
        {
            uint32_t nonZeroDistC = 0, nonZeroPsyEnergyC = 0;
            uint64_t singleCostC = MAX_INT64;
            uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);

            m_entropyCoder.load(m_rqt[depth].rqtRoot);

            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
            {
                coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
                TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);

                do
                {
                    uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
                    uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);

                    int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);

                    ALIGN_VAR_32(coeff_t, tsCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
                    ALIGN_VAR_32(int16_t, tsResiC[MAX_TS_SIZE * MAX_TS_SIZE]);

                    cu.setTransformSkipPartRange(1, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);

                    if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
                        m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);

                    fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
                    resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
                    uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, tsCoeffC, log2TrSizeC, (TextType)chromaId, absPartIdxC, true);

                    m_entropyCoder.resetBits();
                    singleBits[chromaId][tuIterator.section] = 0;

                    if (numSigTSkipC)
                    {
                        m_entropyCoder.codeQtCbfChroma(!!numSigTSkipC, tuDepth);
                        m_entropyCoder.codeCoeffNxN(cu, tsCoeffC, absPartIdxC, log2TrSizeC, (TextType)chromaId);
                        singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();

                        m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], tsResiC, trSizeC, tsCoeffC,
                                                log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
                        uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC);
                        nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
                        if (m_rdCost.m_psyRd)
                        {
                            nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC);
                            singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
                        }
                        else
                            singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section]);
                    }

                    if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC)
                        cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
                    else
                    {
                        singleDist[chromaId][tuIterator.section] = nonZeroDistC;
                        singlePsyEnergy[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
                        cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC;
                        bestTransformMode[chromaId][tuIterator.section] = 1;
                        uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
                        memcpy(coeffCurC + subTUOffset, tsCoeffC, sizeof(coeff_t) * numCoeffC);
                        primitives.luma_copy_ss[partSizeC](curResiC, strideResiC, tsResiC, trSizeC);
                    }

                    cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
                }
                while (tuIterator.isNextSection());
            }
        }

        // Previously, CBFs and coefficients were encoded here together, after the distortion
        // calculation above. Now only the CBFs are encoded, since the coefficients were
        // already encoded above; their bit cost was collected separately and is added to the
        // CBF bits. Testing showed the coding order makes no difference, though it remains
        // unclear whether the original context should be loaded, as is done below.
        m_entropyCoder.load(m_rqt[depth].rqtRoot);
        m_entropyCoder.resetBits();

        // Encode CBF flags
        if (bCodeChroma)
        {
            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
            {
                if (!splitIntoSubTUs)
                    m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][0], tuDepth);
                else
                {
                    offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx);
                    m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][0], tuDepth);
                    m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][1], tuDepth);
                }
            }
        }

        m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);

        uint32_t cbfBits = m_entropyCoder.getNumberOfWrittenBits();

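        /* coeffBits accumulates the per-component coefficient bits collected during the
         * analysis above; both sub-TU slots are summed to cover the 4:2:2 split case. */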
        uint32_t coeffBits = singleBits[TEXT_LUMA][0];
        for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
        {
            coeffBits += singleBits[TEXT_CHROMA_U][subTUIndex];
            coeffBits += singleBits[TEXT_CHROMA_V][subTUIndex];
        }

        // In split mode, only coeffBits are needed, because chroma CBF coding differs from
        // luma: if any of the four split blocks has a non-zero chroma CBF, a CBF of 1 must
        // be coded at this level, followed by each split block's individual CBF value. That
        // is not known until all four split blocks have been analyzed, so only the
        // individual coefficient bits are collected here.
        fullCost.bits = bSplitPresentFlag ? cbfBits + coeffBits : coeffBits;

        fullCost.distortion += singleDist[TEXT_LUMA][0];
        fullCost.energy += singlePsyEnergy[TEXT_LUMA][0]; // TODO: check whether chroma psy-energy should be added as well
        for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
        {
            fullCost.distortion += singleDist[TEXT_CHROMA_U][subTUIndex];
            fullCost.distortion += singleDist[TEXT_CHROMA_V][subTUIndex];
        }

        if (m_rdCost.m_psyRd)
            fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
        else
            fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
    }

    // code sub-blocks
    if (bCheckSplit)
    {
        if (bCheckFull)
        {
            m_entropyCoder.store(m_rqt[depth].rqtTest);
            m_entropyCoder.load(m_rqt[depth].rqtRoot);
        }

        Cost splitCost;
        if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0]))
        {
            // The subdiv flag can be encoded at the start of analysis of the split blocks.
            m_entropyCoder.resetBits();
            m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
            splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
        }

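        /* qNumParts is the number of 4x4 partition units covered by one of the four
         * sub-TUs. A worked example, assuming LOG2_UNIT_SIZE == 2 (4x4 units): for a
         * 16x16 TU (log2TrSize == 4), each 8x8 sub-TU covers
         * 1 << ((4 - 1 - 2) * 2) == 4 partition units. */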
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
        for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
        {
            estimateResidualQT(mode, cuGeom, qPartIdx, depth + 1, resiYuv, splitCost, depthRange);
            ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
            ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
            vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
        }
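        /* Propagate the OR of the four sub-TU CBFs into the CBF bit at this TU depth for
         * every partition unit covered by this TU. */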
        for (uint32_t i = 0; i < 4 * qNumParts; ++i)
        {
            cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth;
            cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
            cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
        }

        // Previously, CBFs and coefficients for the split blocks were encoded here. Since
        // the coefficient bits have already been collected per block, only the CBF values
        // are encoded now. As noted above, chroma CBF coding differs from luma. One open
        // question remains: the coefficients may have been encoded in the context at one
        // depth (e.g. 2) while the CBFs are encoded in the context at another (e.g. 0).
        m_entropyCoder.load(m_rqt[depth].rqtRoot);
        m_entropyCoder.resetBits();

        codeInterSubdivCbfQT(cu, absPartIdx, depth, depthRange);
        uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
        splitCost.bits += splitCbfBits;

        if (m_rdCost.m_psyRd)
            splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
        else
            splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);

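        /* Prefer the split if any sub-TU coded coefficients or if the full TU was never
         * evaluated; when the split RD cost wins, accumulate it and return early.
         * Otherwise restore the full-TU transform-skip flags and entropy state. */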
        if (ycbf || ucbf || vcbf || !bCheckFull)
        {
            if (splitCost.rdcost < fullCost.rdcost)
            {
                outCosts.distortion += splitCost.distortion;
                outCosts.rdcost += splitCost.rdcost;
                outCosts.bits += splitCost.bits;
                outCosts.energy += splitCost.energy;
                return;
            }
            else
                outCosts.energy += splitCost.energy;
        }

        cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth);
        if (bCodeChroma)
        {
            if (!splitIntoSubTUs)
            {
                cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx, depth);
                cu.setTransformSkipSubParts(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx, depth);
            }
            else
            {
                uint32_t tuNumParts = absPartIdxStep >> 1;
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][0], TEXT_CHROMA_U, absPartIdx, tuNumParts);
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][1], TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][0], TEXT_CHROMA_V, absPartIdx, tuNumParts);
                cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][1], TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
            }
        }
        X265_CHECK(bCheckFull, "check-full must be set\n");
        m_entropyCoder.load(m_rqt[depth].rqtTest);
    }

    cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
    cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);

    if (bCodeChroma)
    {
        if (!splitIntoSubTUs)
        {
            cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx, depth);
            cu.setCbfSubParts(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx, depth);
        }
        else
        {
            uint32_t tuNumParts = absPartIdxStep >> 1;

            offsetCBFs(cbfFlag[TEXT_CHROMA_U]);
            offsetCBFs(cbfFlag[TEXT_CHROMA_V]);
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][0] << tuDepth, TEXT_CHROMA_U, absPartIdx, tuNumParts);
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_U][1] << tuDepth, TEXT_CHROMA_U, absPartIdx + tuNumParts, tuNumParts);
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][0] << tuDepth, TEXT_CHROMA_V, absPartIdx, tuNumParts);
            cu.setCbfPartRange(cbfFlag[TEXT_CHROMA_V][1] << tuDepth, TEXT_CHROMA_V, absPartIdx + tuNumParts, tuNumParts);
        }
    }

    outCosts.distortion += fullCost.distortion;
    outCosts.rdcost += fullCost.rdcost;
    outCosts.bits += fullCost.bits;
    outCosts.energy += fullCost.energy;
}

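/* recursively code the chroma and luma CBF flags of an inter CU's residual
 * quad-tree, following the subdivision structure chosen by estimateResidualQT() */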
void Search::codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, const uint32_t depthRange[2])
{
    X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
    X265_CHECK(cu.isInter(absPartIdx), "codeInterSubdivCbfQT() with intra block\n");

    const uint32_t tuDepth = depth - cu.m_cuDepth[0];
    const bool bSubdiv = tuDepth != cu.m_tuDepth[absPartIdx];
    const uint32_t log2TrSize = g_maxLog2CUSize - depth;

    if (!(log2TrSize - m_hChromaShift < 2))
    {
        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !bSubdiv);
        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !bSubdiv);
    }
    else
    {
        X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1), "chroma CBF not matching\n");
        X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1), "chroma CBF not matching\n");
    }

    if (!bSubdiv)
    {
        m_entropyCoder.codeQtCbfLuma(cu, absPartIdx, tuDepth);
    }
    else
    {
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
            codeInterSubdivCbfQT(cu, absPartIdx, depth + 1, depthRange);
    }
}

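/* code the coefficients of one texture component (luma, Cb, or Cr) of an inter
 * CU's residual quad-tree, recursing into sub-TUs wherever the tree is subdivided */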
void Search::encodeResidualQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, TextType ttype, const uint32_t depthRange[2])
{
    X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
    X265_CHECK(cu.isInter(absPartIdx), "encodeResidualQT() with intra block\n");

    const uint32_t curTuDepth = depth - cu.m_cuDepth[0];
    const uint32_t tuDepth = cu.m_tuDepth[absPartIdx];
    const bool bSubdiv = curTuDepth != tuDepth;
    const uint32_t log2TrSize = g_maxLog2CUSize - depth;

    if (bSubdiv)
    {
        if (cu.getCbf(absPartIdx, ttype, curTuDepth))
        {
            uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
            for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
                encodeResidualQT(cu, absPartIdx, depth + 1, ttype, depthRange);
        }
        return;
    }
    else
    {
        const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
        uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;

        // Luma
        const uint32_t qtLayer = log2TrSize - 2;
        uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
        coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;

        // Chroma
        bool bCodeChroma = true;
        uint32_t tuDepthC = tuDepth;
        if (log2TrSize == 2 && m_csp != X265_CSP_I444)
        {
            X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
            log2TrSizeC++;
            tuDepthC--;
            bCodeChroma = !(absPartIdx & 3);
        }

        if (ttype == TEXT_LUMA && cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
            m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);

        if (bCodeChroma)
        {
            uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
            coeff_t* coeffCurU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
            coeff_t* coeffCurV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;

            if (!splitIntoSubTUs)
            {
                if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
                    m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
                if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
                    m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
            }
            else
            {
                uint32_t tuNumParts = 2 << ((log2TrSizeC - LOG2_UNIT_SIZE) * 2);
                uint32_t subTUSize = 1 << (log2TrSizeC * 2);
                if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
                {
                    if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
                        m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
                    if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
                        m_entropyCoder.codeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, TEXT_CHROMA_U);
                }
                if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
                {
                    if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
                        m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
                    if (cu.getCbf(absPartIdx + tuNumParts, ttype, tuDepth + 1))
                        m_entropyCoder.codeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + tuNumParts, log2TrSizeC, TEXT_CHROMA_V);
                }
            }
        }
    }
}

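/* copy the best residual and coefficients for this TU from the RQT temp buffers
 * into the CU's persistent buffers, recursing down to the stored TU depth */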
void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t depth)
{
    X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
    const uint32_t curTrMode = depth - cu.m_cuDepth[0];
    const uint32_t tuDepth = cu.m_tuDepth[absPartIdx];
    const uint32_t log2TrSize = g_maxLog2CUSize - depth;

    if (curTrMode < tuDepth)
    {
        uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
        for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
            saveResidualQTData(cu, resiYuv, absPartIdx, depth + 1);
        return;
    }

    const uint32_t qtLayer = log2TrSize - 2;

    uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
    bool bCodeChroma = true;
    uint32_t tuDepthC = tuDepth;
    if (log2TrSizeC < 2)
    {
        X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
        log2TrSizeC = 2;
        tuDepthC--;
        bCodeChroma = !(absPartIdx & 3);
    }

    m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize);

    uint32_t numCoeffY = 1 << (log2TrSize * 2);
    uint32_t coeffOffsetY = absPartIdx << LOG2_UNIT_SIZE * 2;
    coeff_t* coeffSrcY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
    coeff_t* coeffDstY = cu.m_trCoeff[0] + coeffOffsetY;
    memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY);

    if (bCodeChroma)
    {
        m_rqt[qtLayer].resiQtYuv.copyPartToPartChroma(resiYuv, absPartIdx, log2TrSizeC + m_hChromaShift);

        uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
        uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);

        coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
        coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
        coeff_t* coeffDstU = cu.m_trCoeff[1] + coeffOffsetC;
        coeff_t* coeffDstV = cu.m_trCoeff[2] + coeffOffsetC;
        memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
        memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);
    }
}

/* Returns the number of bits required to signal a non-most-probable intra mode;
 * on return mpms contains a bitmap of the three most probable modes */
uint32_t Search::getIntraRemModeBits(CUData& cu, uint32_t absPartIdx, uint32_t preds[3], uint64_t& mpms) const
{
    cu.getIntraDirLumaPredictor(absPartIdx, preds);

    mpms = 0;
    for (int i = 0; i < 3; ++i)
        mpms |= ((uint64_t)1 << preds[i]);

    return m_entropyCoder.bitsIntraModeNonMPM();
}

/* Swap the current mode/cost with the mode with the highest cost in the
 * current candidate list, if its cost is better (maintain a top-N list) */
void Search::updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList)
{
    uint32_t maxIndex = 0;
    uint64_t maxValue = 0;

    for (int i = 0; i < maxCandCount; i++)
    {
        if (maxValue < candCostList[i])
        {
            maxValue = candCostList[i];
            maxIndex = i;
        }
    }

    if (cost < maxValue)
    {
        candCostList[maxIndex] = cost;
        candModeList[maxIndex] = mode;
    }
}
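
/* A minimal usage sketch for updateCandList() (hypothetical, not from the encoder):
 * keep the three cheapest of 35 candidate intra modes. getModeCost() is a placeholder
 * for whatever cost measure the caller uses, and the cost list must be seeded with
 * MAX_INT64 so the first candidates are accepted:
 *
 *     uint32_t candModes[3] = { 0, 0, 0 };
 *     uint64_t candCosts[3] = { MAX_INT64, MAX_INT64, MAX_INT64 };
 *     for (uint32_t m = 0; m < 35; m++)
 *         updateCandList(m, getModeCost(m), 3, candModes, candCosts);
 */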