Update changelog.
[deb_x265.git] / source / encoder / search.cpp
... / ...
CommitLineData
1/*****************************************************************************
2* Copyright (C) 2013 x265 project
3*
4* Authors: Steve Borho <steve@borho.org>
5*
6* This program is free software; you can redistribute it and/or modify
7* it under the terms of the GNU General Public License as published by
8* the Free Software Foundation; either version 2 of the License, or
9* (at your option) any later version.
10*
11* This program is distributed in the hope that it will be useful,
12* but WITHOUT ANY WARRANTY; without even the implied warranty of
13* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14* GNU General Public License for more details.
15*
16* You should have received a copy of the GNU General Public License
17* along with this program; if not, write to the Free Software
18* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
19*
20* This program is also available under a commercial proprietary license.
21* For more information, contact us at license @ x265.com.
22*****************************************************************************/
23
24#include "common.h"
25#include "primitives.h"
26#include "picyuv.h"
27#include "cudata.h"
28
29#include "search.h"
30#include "entropy.h"
31#include "rdcost.h"
32
33using namespace x265;
34
35#if _MSC_VER
36#pragma warning(disable: 4800) // 'uint8_t' : forcing value to bool 'true' or 'false' (performance warning)
37#pragma warning(disable: 4244) // '=' : conversion from 'int' to 'uint8_t', possible loss of data)
38#endif
39
/* Definitions of the static Search members zeroPixel and zeroShort: two 32-byte
 * aligned, zero-initialized constant arrays of MAX_CU_SIZE entries each (one of
 * pixel, one of int16_t) */
40ALIGN_VAR_32(const pixel, Search::zeroPixel[MAX_CU_SIZE]) = { 0 };
41ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 };
42
43Search::Search() : JobProvider(NULL)
44{
45 memset(m_rqt, 0, sizeof(m_rqt));
46
47 for (int i = 0; i < 3; i++)
48 {
49 m_qtTempTransformSkipFlag[i] = NULL;
50 m_qtTempCbf[i] = NULL;
51 }
52
53 m_numLayers = 0;
54 m_param = NULL;
55 m_slice = NULL;
56 m_frame = NULL;
57 m_bJobsQueued = false;
58 m_totalNumME = m_numAcquiredME = m_numCompletedME = 0;
59}
60
61bool Search::initSearch(const x265_param& param, ScalingList& scalingList)
62{
63 m_param = &param;
64 m_bEnableRDOQ = param.rdLevel >= 4;
65 m_bFrameParallel = param.frameNumThreads > 1;
66 m_numLayers = g_log2Size[param.maxCUSize] - 2;
67
68 m_rdCost.setPsyRdScale(param.psyRd);
69 m_me.setSearchMethod(param.searchMethod);
70 m_me.setSubpelRefine(param.subpelRefine);
71
72 bool ok = m_quant.init(m_bEnableRDOQ, param.psyRdoq, scalingList, m_entropyCoder);
73 if (m_param->noiseReduction)
74 ok &= m_quant.allocNoiseReduction(param);
75
76 ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */
77
78 /* When frame parallelism is active, only 'refLagPixels' of reference frames will be guaranteed
79 * available for motion reference. See refLagRows in FrameEncoder::compressCTURows() */
80 m_refLagPixels = m_bFrameParallel ? param.searchRange : param.sourceHeight;
81
82 uint32_t sizeL = 1 << (g_maxLog2CUSize * 2);
83 uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
84 uint32_t numPartitions = NUM_CU_PARTITIONS;
85
86 /* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32
87 * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
88 * which are reconstructed at each depth are valid. At the end, the transform depth table
89 * is walked and the coeff and recon at the correct depths are collected */
90 for (uint32_t i = 0; i <= m_numLayers; i++)
91 {
92 CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
93 m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
94 m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
95 ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
96 ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
97 }
98
99 /* the rest of these buffers are indexed per-depth */
100 for (uint32_t i = 0; i <= g_maxCUDepth; i++)
101 {
102 int cuSize = g_maxCUSize >> i;
103 ok &= m_rqt[i].tmpResiYuv.create(cuSize, param.internalCsp);
104 ok &= m_rqt[i].tmpPredYuv.create(cuSize, param.internalCsp);
105 ok &= m_rqt[i].bidirPredYuv[0].create(cuSize, param.internalCsp);
106 ok &= m_rqt[i].bidirPredYuv[1].create(cuSize, param.internalCsp);
107 }
108
109 CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
110 m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
111 m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
112 CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
113 m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
114 m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
115
116 return ok;
117
118fail:
119 return false;
120}
121
122Search::~Search()
123{
124 for (uint32_t i = 0; i <= m_numLayers; i++)
125 {
126 X265_FREE(m_rqt[i].coeffRQT[0]);
127 m_rqt[i].reconQtYuv.destroy();
128 m_rqt[i].resiQtYuv.destroy();
129 }
130
131 for (uint32_t i = 0; i <= g_maxCUDepth; i++)
132 {
133 m_rqt[i].tmpResiYuv.destroy();
134 m_rqt[i].tmpPredYuv.destroy();
135 m_rqt[i].bidirPredYuv[0].destroy();
136 m_rqt[i].bidirPredYuv[1].destroy();
137 }
138
139 X265_FREE(m_qtTempCbf[0]);
140 X265_FREE(m_qtTempTransformSkipFlag[0]);
141}
142
/* Propagate a new quantization parameter to the motion estimator (m_me) and to
 * the RD cost calculator (m_rdCost).
 *
 * slice - slice the QP applies to; passed through to m_rdCost.setQP()
 * qp    - the quantization parameter to apply */
143void Search::setQP(const Slice& slice, int qp)
144{
145 x265_emms(); /* TODO: if the lambda tables were ints, this would not be necessary */
146 m_me.setQP(qp);
147 m_rdCost.setQP(slice, qp);
148}
149
150#if CHECKED_BUILD || _DEBUG
151void Search::invalidateContexts(int fromDepth)
152{
153 /* catch reads without previous writes */
154 for (int d = fromDepth; d < NUM_FULL_DEPTH; d++)
155 {
156 m_rqt[d].cur.markInvalid();
157 m_rqt[d].rqtTemp.markInvalid();
158 m_rqt[d].rqtRoot.markInvalid();
159 m_rqt[d].rqtTest.markInvalid();
160 }
161}
162#else
163void Search::invalidateContexts(int) {}
164#endif
165
166void Search::codeSubdivCbfQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t absPartIdxStep, uint32_t width, uint32_t height)
167{
168 uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
169 uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
170 uint32_t subdiv = tuDepthL > trDepth;
171 uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
172
173 bool mCodeAll = true;
174 const uint32_t numPels = 1 << (log2TrSize * 2 - m_hChromaShift - m_vChromaShift);
175 if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE))
176 mCodeAll = false;
177
178 if (mCodeAll)
179 {
180 if (!trDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, trDepth - 1))
181 m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, (width >> m_hChromaShift), (height >> m_vChromaShift), TEXT_CHROMA_U, trDepth, !subdiv);
182
183 if (!trDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, trDepth - 1))
184 m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, (width >> m_hChromaShift), (height >> m_vChromaShift), TEXT_CHROMA_V, trDepth, !subdiv);
185 }
186
187 if (subdiv)
188 {
189 absPartIdxStep >>= 2;
190 width >>= 1;
191 height >>= 1;
192
193 uint32_t qtPartNum = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
194 for (uint32_t part = 0; part < 4; part++)
195 codeSubdivCbfQTChroma(cu, trDepth + 1, absPartIdx + part * qtPartNum, absPartIdxStep, width, height);
196 }
197}
198
199void Search::codeCoeffQTChroma(const CUData& cu, uint32_t trDepth, uint32_t absPartIdx, TextType ttype)
200{
201 if (!cu.getCbf(absPartIdx, ttype, trDepth))
202 return;
203
204 uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
205 uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
206
207 if (tuDepthL > trDepth)
208 {
209 uint32_t qtPartNum = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
210 for (uint32_t part = 0; part < 4; part++)
211 codeCoeffQTChroma(cu, trDepth + 1, absPartIdx + part * qtPartNum, ttype);
212
213 return;
214 }
215
216 uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
217
218 uint32_t trDepthC = trDepth;
219 uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
220
221 if (log2TrSizeC == 1)
222 {
223 X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth, "transform size too small\n");
224 trDepthC--;
225 log2TrSizeC++;
226 uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
227 bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
228 if (!bFirstQ)
229 return;
230 }
231
232 uint32_t qtLayer = log2TrSize - 2;
233
234 if (m_csp != X265_CSP_I422)
235 {
236 uint32_t shift = (m_csp == X265_CSP_I420) ? 2 : 0;
237 uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - shift);
238 coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
239 m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
240 }
241 else
242 {
243 uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2 - 1);
244 coeff_t* coeff = m_rqt[qtLayer].coeffRQT[ttype] + coeffOffset;
245 uint32_t subTUSize = 1 << (log2TrSizeC * 2);
246 uint32_t partIdxesPerSubTU = NUM_CU_PARTITIONS >> (((cu.m_cuDepth[absPartIdx] + trDepthC) << 1) + 1);
247 if (cu.getCbf(absPartIdx, ttype, trDepth + 1))
248 m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSizeC, ttype);
249 if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, trDepth + 1))
250 m_entropyCoder.codeCoeffNxN(cu, coeff + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, ttype);
251 }
252}
253
/* Recursive RD search of the intra luma transform quad-tree.
 *
 * At each node the TU may be coded whole ("full", when log2TrSize <= depthRange[1])
 * and/or as four sub-TUs ("split", when log2TrSize > depthRange[0] and either
 * bAllowSplit is set or the full TU is not permitted). When both are tried, the one
 * with the lower rdcost wins and its cost is added to outCost.
 *
 * mode        - mode under evaluation; mode.cu is updated with TU depth, CBF and
 *               transform-skip flags, mode.predYuv receives the prediction
 * cuGeom      - geometry of the CU (depth, partition count, encode index)
 * trDepth     - transform depth of this node, relative to the CU
 * absPartIdx  - partition index of this node within the CU
 * bAllowSplit - whether a split may be tried where the full TU is also possible
 * outCost     - accumulator; rdcost, distortion, bits and energy are added to it
 * depthRange  - [0] = smallest, [1] = largest log2 TU size that may be used
 *
 * Side effects: coefficients and reconstruction of this node are left in
 * m_rqt[qtLayer]; when the full TU wins, its reconstruction is also copied into
 * the frame's recon picture so following blocks can predict from it. */
254void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, uint32_t depthRange[2])
255{
256 uint32_t fullDepth = mode.cu.m_cuDepth[0] + trDepth;
257 uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
258 uint32_t qtLayer = log2TrSize - 2;
259 uint32_t sizeIdx = log2TrSize - 2;
260 bool mightNotSplit = log2TrSize <= depthRange[1];
261 bool mightSplit = (log2TrSize > depthRange[0]) && (bAllowSplit || !mightNotSplit);
262
263 /* If maximum RD penalty, force splits at TU size 32x32 if SPS allows TUs of 16x16 */
264 if (m_param->rdPenalty == 2 && m_slice->m_sliceType != I_SLICE && log2TrSize == 5 && depthRange[0] <= 4)
265 {
266 mightNotSplit = false;
267 mightSplit = true;
268 }
269
270 CUData& cu = mode.cu;
271
272 Cost fullCost;
273 uint32_t bCBF = 0;
274
275 pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
276 uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;
277
278 if (mightNotSplit)
279 {
 /* remember the entry state so the split attempt can start from the same point */
280 if (mightSplit)
281 m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
282
283 pixel* fenc = const_cast<pixel*>(mode.fencYuv->getLumaAddr(absPartIdx));
284 pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
285 int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
286 uint32_t stride = mode.fencYuv->m_size;
287
288 // init availability pattern
289 uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
290 initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode);
291
292 // get prediction signal
293 predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
294
295 cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
296 cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
297
298 uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
299 coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
300
301 // store original entropy coding status
302 if (m_bEnableRDOQ)
303 m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
304
305 primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
306
307 uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
308 if (numSig)
309 {
310 m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
311 primitives.luma_add_ps[sizeIdx](reconQt, reconQtStride, pred, residual, stride, stride);
312 }
313 else
314 // no coded residual, recon = pred
315 primitives.square_copy_pp[sizeIdx](reconQt, reconQtStride, pred, stride);
316
 /* the CBF for this TU lives in bit 'trDepth' of the per-partition flag byte */
317 bCBF = !!numSig << trDepth;
318 cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
319 fullCost.distortion = primitives.sse_pp[sizeIdx](reconQt, reconQtStride, fenc, stride);
320
 /* count the bits of everything signalled for this TU; CU-level syntax is only
  * charged to the first partition */
321 m_entropyCoder.resetBits();
322 if (!absPartIdx)
323 {
324 if (!cu.m_slice->isIntra())
325 {
326 if (cu.m_slice->m_pps->bTransquantBypassEnabled)
327 m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
328 m_entropyCoder.codeSkipFlag(cu, 0);
329 m_entropyCoder.codePredMode(cu.m_predMode[0]);
330 }
331
332 m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]);
333 }
334 if (cu.m_partSize[0] == SIZE_2Nx2N)
335 {
336 if (!absPartIdx)
337 m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
338 }
339 else
340 {
341 uint32_t qtNumParts = cuGeom.numPartitions >> 2;
342 if (!trDepth)
343 {
344 for (uint32_t part = 0; part < 4; part++)
345 m_entropyCoder.codeIntraDirLumaAng(cu, part * qtNumParts, false);
346 }
347 else if (!(absPartIdx & (qtNumParts - 1)))
348 m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
349 }
 /* at the smallest allowed TU size no subdiv flag is coded */
350 if (log2TrSize != depthRange[0])
351 m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
352
353 m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]);
354
355 if (cu.getCbf(absPartIdx, TEXT_LUMA, trDepth))
356 m_entropyCoder.codeCoeffNxN(cu, coeffY, absPartIdx, log2TrSize, TEXT_LUMA);
357
358 fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
359
 /* any non-zero rdPenalty quadruples the bit cost of a 32x32 intra TU outside I slices */
360 if (m_param->rdPenalty && log2TrSize == 5 && m_slice->m_sliceType != I_SLICE)
361 fullCost.bits *= 4;
362
363 if (m_rdCost.m_psyRd)
364 {
365 fullCost.energy = m_rdCost.psyCost(sizeIdx, fenc, mode.fencYuv->m_size, reconQt, reconQtStride);
366 fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
367 }
368 else
369 fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
370 }
371 else
 /* full TU not allowed here: make sure the split result always wins */
372 fullCost.rdcost = MAX_INT64;
373
374 if (mightSplit)
375 {
376 if (mightNotSplit)
377 {
378 m_entropyCoder.store(m_rqt[fullDepth].rqtTest); // save state after full TU encode
379 m_entropyCoder.load(m_rqt[fullDepth].rqtRoot); // prep state of split encode
380 }
381
382 // code split block
383 uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
384 uint32_t absPartIdxSub = absPartIdx;
385
 /* transform skip is evaluated for the sub-TUs when the PPS enables it, their size
  * (log2TrSize - 1) does not exceed MAX_LOG2_TS_SIZE and the CU is not bypass coded */
386 int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
387 if (m_param->bEnableTSkipFast)
388 checkTransformSkip &= cu.m_partSize[absPartIdx] == SIZE_NxN;
389
390 Cost splitCost;
391 uint32_t cbf = 0;
392 for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv)
393 {
394 if (checkTransformSkip)
395 codeIntraLumaTSkip(mode, cuGeom, trDepth + 1, absPartIdxSub, splitCost);
396 else
397 codeIntraLumaQT(mode, cuGeom, trDepth + 1, absPartIdxSub, bAllowSplit, splitCost, depthRange);
398
399 cbf |= cu.getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1);
400 }
 /* propagate "any child coded" up into this depth's CBF bit for all covered partitions */
401 for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
402 cu.m_cbf[0][absPartIdx + offs] |= (cbf << trDepth);
403
404 if (mightNotSplit && log2TrSize != depthRange[0])
405 {
406 /* If we could have coded this TU depth, include cost of subdiv flag */
407 m_entropyCoder.resetBits();
408 m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
409 splitCost.bits += m_entropyCoder.getNumberOfWrittenBits();
410
411 if (m_rdCost.m_psyRd)
412 splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
413 else
414 splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
415 }
416
417 if (splitCost.rdcost < fullCost.rdcost)
418 {
 /* split wins: CU flags and entropy state are already those of the split encode */
419 outCost.rdcost += splitCost.rdcost;
420 outCost.distortion += splitCost.distortion;
421 outCost.bits += splitCost.bits;
422 outCost.energy += splitCost.energy;
423 return;
424 }
425 else
426 {
427 // recover entropy state of full-size TU encode
428 m_entropyCoder.load(m_rqt[fullDepth].rqtTest);
429
430 // recover transform index and Cbf values
431 cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
432 cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
433 cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
434 }
435 }
436
437 // set reconstruction for next intra prediction blocks if full TU prediction won
438 pixel* picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
439 intptr_t picStride = m_frame->m_reconPicYuv->m_stride;
440 primitives.square_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);
441
442 outCost.rdcost += fullCost.rdcost;
443 outCost.distortion += fullCost.distortion;
444 outCost.bits += fullCost.bits;
445 outCost.energy += fullCost.energy;
446}
447
/* RD-code one intra luma TU twice, once with the regular transform and once with
 * transform skip, and keep whichever has the lower cost. The X265_CHECK below
 * asserts that the TU size equals MAX_TS_SIZE.
 *
 * mode       - mode under evaluation; mode.cu receives TU depth, CBF and tskip flags
 * cuGeom     - geometry of the CU (depth, partition count, encode index)
 * trDepth    - transform depth of this TU, relative to the CU
 * absPartIdx - partition index of this TU within the CU
 * outCost    - accumulator; the winner's rdcost, distortion, bits and energy are added
 *
 * Side effects: the winner's coefficients and reconstruction end up in m_rqt[qtLayer],
 * the reconstruction is copied into the frame's recon picture, and the entropy coder
 * is left in the state that follows coding the winner. */
448void Search::codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, Cost& outCost)
449{
450 uint32_t fullDepth = mode.cu.m_cuDepth[0] + trDepth;
451 uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
452 uint32_t tuSize = 1 << log2TrSize;
453
454 X265_CHECK(tuSize == MAX_TS_SIZE, "transform skip is only possible at 4x4 TUs\n");
455
456 CUData& cu = mode.cu;
457 Yuv* predYuv = &mode.predYuv;
458 const Yuv* fencYuv = mode.fencYuv;
459
460 Cost fullCost;
461 fullCost.rdcost = MAX_INT64;
462 int bTSkip = 0;
463 uint32_t bCBF = 0;
464
465 pixel* fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
466 pixel* pred = predYuv->getLumaAddr(absPartIdx);
467 int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
468 uint32_t stride = fencYuv->m_size;
469 int sizeIdx = log2TrSize - 2;
470
471 // init availability pattern
472 uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
473 initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode);
474
475 // get prediction signal
476 predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
477
478 cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
479
480 uint32_t qtLayer = log2TrSize - 2;
481 uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
482 coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
483 pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
484 uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;
485
486 // store original entropy coding status
487 m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
488
489 if (m_bEnableRDOQ)
490 m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
491
 /* scratch output of the transform-skip pass; the regular pass writes straight
  * into the m_rqt layer buffers */
492 ALIGN_VAR_32(coeff_t, tsCoeffY[MAX_TS_SIZE * MAX_TS_SIZE]);
493 ALIGN_VAR_32(pixel, tsReconY[MAX_TS_SIZE * MAX_TS_SIZE]);
494
 /* pass 0 = regular transform, pass 1 = transform skip */
495 int checkTransformSkip = 1;
496 for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
497 {
498 uint64_t tmpCost;
499 uint32_t tmpEnergy = 0;
500
501 coeff_t* coeff = (useTSkip ? tsCoeffY : coeffY);
502 pixel* tmpRecon = (useTSkip ? tsReconY : reconQt);
503 uint32_t tmpReconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
504
 /* the residual buffer was overwritten by the previous pass' inverse transform,
  * so it is recomputed each time */
505 primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
506
507 uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip);
508 if (numSig)
509 {
510 m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);
511 primitives.luma_add_ps[sizeIdx](tmpRecon, tmpReconStride, pred, residual, stride, stride);
512 }
513 else if (useTSkip)
514 {
515 /* do not allow tskip if CBF=0, pretend we did not try tskip */
 /* CU flags and entropy state have not been touched since the regular pass,
  * so nothing needs restoring after the loop */
516 checkTransformSkip = 0;
517 break;
518 }
519 else
520 // no residual coded, recon = pred
521 primitives.square_copy_pp[sizeIdx](tmpRecon, tmpReconStride, pred, stride);
522
523 uint32_t tmpDist = primitives.sse_pp[sizeIdx](tmpRecon, tmpReconStride, fenc, stride);
524
525 cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
526 cu.setCbfSubParts((!!numSig) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
527
 /* both passes are measured starting from the same entropy state */
528 if (useTSkip)
529 m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
530
531 m_entropyCoder.resetBits();
532 if (!absPartIdx)
533 {
534 if (!cu.m_slice->isIntra())
535 {
536 if (cu.m_slice->m_pps->bTransquantBypassEnabled)
537 m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
538 m_entropyCoder.codeSkipFlag(cu, 0);
539 m_entropyCoder.codePredMode(cu.m_predMode[0]);
540 }
541
542 m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]);
543 }
544 if (cu.m_partSize[0] == SIZE_2Nx2N)
545 {
546 if (!absPartIdx)
547 m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
548 }
549 else
550 {
551 uint32_t qtNumParts = cuGeom.numPartitions >> 2;
552 if (!trDepth)
553 {
554 for (uint32_t part = 0; part < 4; part++)
555 m_entropyCoder.codeIntraDirLumaAng(cu, part * qtNumParts, false);
556 }
557 else if (!(absPartIdx & (qtNumParts - 1)))
558 m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
559 }
560 m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
561
562 m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, cu.m_tuDepth[absPartIdx]);
563
564 if (cu.getCbf(absPartIdx, TEXT_LUMA, trDepth))
565 m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdx, log2TrSize, TEXT_LUMA);
566
567 uint32_t tmpBits = m_entropyCoder.getNumberOfWrittenBits();
568
 /* keep the post-encode state of the regular pass in case it ends up winning */
569 if (!useTSkip)
570 m_entropyCoder.store(m_rqt[fullDepth].rqtTemp);
571
572 if (m_rdCost.m_psyRd)
573 {
574 tmpEnergy = m_rdCost.psyCost(sizeIdx, fenc, fencYuv->m_size, tmpRecon, tmpReconStride);
575 tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
576 }
577 else
578 tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);
579
580 if (tmpCost < fullCost.rdcost)
581 {
582 bTSkip = useTSkip;
583 bCBF = !!numSig;
584 fullCost.rdcost = tmpCost;
585 fullCost.distortion = tmpDist;
586 fullCost.bits = tmpBits;
587 fullCost.energy = tmpEnergy;
588 }
589 }
590
591 if (bTSkip)
592 {
 /* tskip won: move its scratch results into the layer buffers (flags and entropy
  * state are already those of the tskip pass) */
593 memcpy(coeffY, tsCoeffY, sizeof(coeff_t) << (log2TrSize * 2));
594 primitives.square_copy_pp[sizeIdx](reconQt, reconQtStride, tsReconY, tuSize);
595 }
596 else if (checkTransformSkip)
597 {
 /* tskip was fully evaluated but lost: undo its flags and entropy state */
598 cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
599 cu.setCbfSubParts(bCBF << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
600 m_entropyCoder.load(m_rqt[fullDepth].rqtTemp);
601 }
602
603 // set reconstruction for next intra prediction blocks
604 pixel* picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
605 intptr_t picStride = m_frame->m_reconPicYuv->m_stride;
606 primitives.square_copy_pp[sizeIdx](picReconY, picStride, reconQt, reconQtStride);
607
608 outCost.rdcost += fullCost.rdcost;
609 outCost.distortion += fullCost.distortion;
610 outCost.bits += fullCost.bits;
611 outCost.energy += fullCost.energy;
612}
613
614/* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */
615void Search::residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t depthRange[2])
616{
617 CUData& cu = mode.cu;
618
619 uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
620 uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
621 bool bCheckFull = log2TrSize <= depthRange[1];
622
623 X265_CHECK(m_slice->m_sliceType != I_SLICE, "residualTransformQuantIntra not intended for I slices\n");
624
625 /* we still respect rdPenalty == 2, we can forbid 32x32 intra TU. rdPenalty = 1 is impossible
626 * since we are not measuring RD cost */
627 if (m_param->rdPenalty == 2 && log2TrSize == 5 && depthRange[0] <= 4)
628 bCheckFull = false;
629
630 if (bCheckFull)
631 {
632 pixel* fenc = const_cast<pixel*>(mode.fencYuv->getLumaAddr(absPartIdx));
633 pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
634 int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
635 pixel* picReconY = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
636 intptr_t picStride = m_frame->m_reconPicYuv->m_stride;
637 uint32_t stride = mode.fencYuv->m_size;
638 uint32_t sizeIdx = log2TrSize - 2;
639 uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
640 uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
641 coeff_t* coeff = cu.m_trCoeff[TEXT_LUMA] + coeffOffsetY;
642
643 initAdiPattern(cu, cuGeom, absPartIdx, trDepth, lumaPredMode);
644 predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
645
646 X265_CHECK(!cu.m_transformSkip[TEXT_LUMA][absPartIdx], "unexpected tskip flag in residualTransformQuantIntra\n");
647 cu.setTUDepthSubParts(trDepth, absPartIdx, fullDepth);
648
649 primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);
650 uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, false);
651 if (numSig)
652 {
653 m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], residual, stride, coeff, log2TrSize, TEXT_LUMA, true, false, numSig);
654 primitives.luma_add_ps[sizeIdx](picReconY, picStride, pred, residual, stride, stride);
655 cu.setCbfSubParts(1 << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
656 }
657 else
658 {
659 primitives.square_copy_pp[sizeIdx](picReconY, picStride, pred, stride);
660 cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
661 }
662 }
663 else
664 {
665 X265_CHECK(log2TrSize > depthRange[0], "intra luma split state failure\n");
666
667 /* code split block */
668 uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
669 uint32_t cbf = 0;
670 for (uint32_t subPartIdx = 0, absPartIdxSub = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv)
671 {
672 residualTransformQuantIntra(mode, cuGeom, trDepth + 1, absPartIdxSub, depthRange);
673 cbf |= cu.getCbf(absPartIdxSub, TEXT_LUMA, trDepth + 1);
674 }
675 for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
676 cu.m_cbf[TEXT_LUMA][absPartIdx + offs] |= (cbf << trDepth);
677 }
678}
679
680void Search::extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t trDepth, uint32_t absPartIdx)
681{
682 uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
683 uint32_t tuDepth = cu.m_tuDepth[absPartIdx];
684
685 if (tuDepth == trDepth)
686 {
687 uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
688 uint32_t qtLayer = log2TrSize - 2;
689
690 // copy transform coefficients
691 uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
692 coeff_t* coeffSrcY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
693 coeff_t* coeffDestY = cu.m_trCoeff[0] + coeffOffsetY;
694 memcpy(coeffDestY, coeffSrcY, sizeof(coeff_t) << (log2TrSize * 2));
695
696 // copy reconstruction
697 m_rqt[qtLayer].reconQtYuv.copyPartToPartLuma(reconYuv, absPartIdx, log2TrSize);
698 }
699 else
700 {
701 uint32_t numQPart = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
702 for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
703 extractIntraResultQT(cu, reconYuv, trDepth + 1, absPartIdx + subPartIdx * numQPart);
704 }
705}
706
707/* 4:2:2 post-TU split processing */
708void Search::offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t trDepth, uint32_t absPartIdx)
709{
710 uint32_t depth = cu.m_cuDepth[0];
711 uint32_t fullDepth = depth + trDepth;
712 uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
713
714 uint32_t trDepthC = trDepth;
715 if (log2TrSize == 2)
716 {
717 X265_CHECK(m_csp != X265_CSP_I444 && trDepthC, "trDepthC invalid\n");
718 trDepthC--;
719 }
720
721 uint32_t partIdxesPerSubTU = (NUM_CU_PARTITIONS >> ((depth + trDepthC) << 1)) >> 1;
722
723 // move the CBFs down a level and set the parent CBF
724 uint8_t subTUCBF[2];
725 uint8_t combinedSubTUCBF = 0;
726
727 for (uint32_t subTU = 0; subTU < 2; subTU++)
728 {
729 const uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU);
730
731 subTUCBF[subTU] = cu.getCbf(subTUAbsPartIdx, ttype, trDepth);
732 combinedSubTUCBF |= subTUCBF[subTU];
733 }
734
735 for (uint32_t subTU = 0; subTU < 2; subTU++)
736 {
737 const uint32_t subTUAbsPartIdx = absPartIdx + (subTU * partIdxesPerSubTU);
738 const uint8_t compositeCBF = (subTUCBF[subTU] << 1) | combinedSubTUCBF;
739
740 cu.setCbfPartRange((compositeCBF << trDepth), ttype, subTUAbsPartIdx, partIdxesPerSubTU);
741 }
742}
743
744/* returns distortion */
745uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx, uint32_t& psyEnergy)
746{
747 CUData& cu = mode.cu;
748 uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
749 uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
750
751 if (tuDepthL > trDepth)
752 {
753 uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
754 uint32_t outDist = 0, splitCbfU = 0, splitCbfV = 0;
755 for (uint32_t subPartIdx = 0, absPartIdxSub = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxSub += qPartsDiv)
756 {
757 outDist += codeIntraChromaQt(mode, cuGeom, trDepth + 1, absPartIdxSub, psyEnergy);
758 splitCbfU |= cu.getCbf(absPartIdxSub, TEXT_CHROMA_U, trDepth + 1);
759 splitCbfV |= cu.getCbf(absPartIdxSub, TEXT_CHROMA_V, trDepth + 1);
760 }
761 for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
762 {
763 cu.m_cbf[TEXT_CHROMA_U][absPartIdx + offs] |= (splitCbfU << trDepth);
764 cu.m_cbf[TEXT_CHROMA_V][absPartIdx + offs] |= (splitCbfV << trDepth);
765 }
766
767 return outDist;
768 }
769
770 uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
771 uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
772
773 uint32_t trDepthC = trDepth;
774 if (log2TrSizeC == 1)
775 {
776 X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth, "invalid trDepth\n");
777 trDepthC--;
778 log2TrSizeC++;
779 uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
780 bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
781 if (!bFirstQ)
782 return 0;
783 }
784
785 if (m_bEnableRDOQ)
786 m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
787
788 bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
789 checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]);
790 if (checkTransformSkip)
791 return codeIntraChromaTSkip(mode, cuGeom, trDepth, trDepthC, absPartIdx, psyEnergy);
792
793 uint32_t qtLayer = log2TrSize - 2;
794 uint32_t tuSize = 1 << log2TrSizeC;
795 uint32_t outDist = 0;
796
797 uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
798 const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
799
800 for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
801 {
802 TextType ttype = (TextType)chromaId;
803
804 TURecurse tuIterator(splitType, curPartNum, absPartIdx);
805 do
806 {
807 uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
808
809 pixel* fenc = const_cast<Yuv*>(mode.fencYuv)->getChromaAddr(chromaId, absPartIdxC);
810 pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
811 int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
812 uint32_t stride = mode.fencYuv->m_csize;
813 uint32_t sizeIdxC = log2TrSizeC - 2;
814
815 uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
816 coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
817 pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
818 uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
819
820 pixel* picReconC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
821 intptr_t picStride = m_frame->m_reconPicYuv->m_strideC;
822
823 // init availability pattern
824 initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
825 pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
826
827 uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
828 if (chromaPredMode == DM_CHROMA_IDX)
829 chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
830 if (m_csp == X265_CSP_I422)
831 chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
832
833 // get prediction signal
834 predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
835
836 cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
837
838 primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
839 uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
840 uint32_t tmpDist;
841 if (numSig)
842 {
843 m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
844 primitives.luma_add_ps[sizeIdxC](reconQt, reconQtStride, pred, residual, stride, stride);
845 cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
846 }
847 else
848 {
849 // no coded residual, recon = pred
850 primitives.square_copy_pp[sizeIdxC](reconQt, reconQtStride, pred, stride);
851 cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
852 }
853
854 tmpDist = primitives.sse_pp[sizeIdxC](reconQt, reconQtStride, fenc, stride);
855 outDist += (ttype == TEXT_CHROMA_U) ? m_rdCost.scaleChromaDistCb(tmpDist) : m_rdCost.scaleChromaDistCr(tmpDist);
856
857 if (m_rdCost.m_psyRd)
858 psyEnergy += m_rdCost.psyCost(sizeIdxC, fenc, stride, picReconC, picStride);
859
860 primitives.square_copy_pp[sizeIdxC](picReconC, picStride, reconQt, reconQtStride);
861 }
862 while (tuIterator.isNextSection());
863
864 if (splitType == VERTICAL_SPLIT)
865 offsetSubTUCBFs(cu, ttype, trDepth, absPartIdx);
866 }
867
868 return outDist;
869}
870
871/* returns distortion */
872uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t trDepthC, uint32_t absPartIdx, uint32_t& psyEnergy)
873{
874 CUData& cu = mode.cu;
875 uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
876 uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
877 uint32_t log2TrSizeC = 2;
878 uint32_t tuSize = 4;
879 uint32_t qtLayer = log2TrSize - 2;
880 uint32_t outDist = 0;
881
882 /* At the TU layers above this one, no RDO is performed, only distortion is being measured,
883 * so the entropy coder is not very accurate. The best we can do is return it in the same
884 * condition as it arrived, and to do all bit estimates from the same state. */
885 m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
886
887 ALIGN_VAR_32(coeff_t, tskipCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
888 ALIGN_VAR_32(pixel, tskipReconC[MAX_TS_SIZE * MAX_TS_SIZE]);
889
890 uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
891 const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
892
893 for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
894 {
895 TextType ttype = (TextType)chromaId;
896
897 TURecurse tuIterator(splitType, curPartNum, absPartIdx);
898 do
899 {
900 uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
901
902 pixel* fenc = const_cast<Yuv*>(mode.fencYuv)->getChromaAddr(chromaId, absPartIdxC);
903 pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
904 int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddr(chromaId, absPartIdxC);
905 uint32_t stride = mode.fencYuv->m_csize;
906 uint32_t sizeIdxC = log2TrSizeC - 2;
907
908 uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
909 coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
910 pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
911 uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
912
913 // init availability pattern
914 initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
915 pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
916
917 uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
918 if (chromaPredMode == DM_CHROMA_IDX)
919 chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
920 if (m_csp == X265_CSP_I422)
921 chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
922
923 // get prediction signal
924 predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
925
926 uint64_t bCost = MAX_INT64;
927 uint32_t bDist = 0;
928 uint32_t bCbf = 0;
929 uint32_t bEnergy = 0;
930 int bTSkip = 0;
931
932 int checkTransformSkip = 1;
933 for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
934 {
935 coeff_t* coeff = (useTSkip ? tskipCoeffC : coeffC);
936 pixel* recon = (useTSkip ? tskipReconC : reconQt);
937 uint32_t reconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
938
939 primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
940
941 uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip);
942 if (numSig)
943 {
944 m_quant.invtransformNxN(cu.m_tqBypass[0], residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
945 primitives.luma_add_ps[sizeIdxC](recon, reconStride, pred, residual, stride, stride);
946 cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
947 }
948 else if (useTSkip)
949 {
950 checkTransformSkip = 0;
951 break;
952 }
953 else
954 {
955 primitives.square_copy_pp[sizeIdxC](recon, reconStride, pred, stride);
956 cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
957 }
958 uint32_t tmpDist = primitives.sse_pp[sizeIdxC](recon, reconStride, fenc, stride);
959 tmpDist = (ttype == TEXT_CHROMA_U) ? m_rdCost.scaleChromaDistCb(tmpDist) : m_rdCost.scaleChromaDistCr(tmpDist);
960
961 cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
962
963 uint32_t tmpBits = 0, tmpEnergy = 0;
964 if (numSig)
965 {
966 m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
967 m_entropyCoder.resetBits();
968 m_entropyCoder.codeCoeffNxN(cu, coeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
969 tmpBits = m_entropyCoder.getNumberOfWrittenBits();
970 }
971
972 uint64_t tmpCost;
973 if (m_rdCost.m_psyRd)
974 {
975 tmpEnergy = m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
976 tmpCost = m_rdCost.calcPsyRdCost(tmpDist, tmpBits, tmpEnergy);
977 }
978 else
979 tmpCost = m_rdCost.calcRdCost(tmpDist, tmpBits);
980
981 if (tmpCost < bCost)
982 {
983 bCost = tmpCost;
984 bDist = tmpDist;
985 bTSkip = useTSkip;
986 bCbf = !!numSig;
987 bEnergy = tmpEnergy;
988 }
989 }
990
991 if (bTSkip)
992 {
993 memcpy(coeffC, tskipCoeffC, sizeof(coeff_t) << (log2TrSizeC * 2));
994 primitives.square_copy_pp[sizeIdxC](reconQt, reconQtStride, tskipReconC, MAX_TS_SIZE);
995 }
996
997 cu.setCbfPartRange(bCbf << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
998 cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
999
1000 pixel* reconPicC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
1001 intptr_t picStride = m_frame->m_reconPicYuv->m_strideC;
1002 primitives.square_copy_pp[sizeIdxC](reconPicC, picStride, reconQt, reconQtStride);
1003
1004 outDist += bDist;
1005 psyEnergy += bEnergy;
1006 }
1007 while (tuIterator.isNextSection());
1008
1009 if (splitType == VERTICAL_SPLIT)
1010 offsetSubTUCBFs(cu, ttype, trDepth, absPartIdx);
1011 }
1012
1013 m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
1014 return outDist;
1015}
1016
1017void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t trDepth, bool tuQuad)
1018{
1019 uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
1020 uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
1021
1022 if (tuDepthL == trDepth)
1023 {
1024 uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
1025 uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
1026
1027 if (tuQuad)
1028 {
1029 log2TrSizeC++; /* extract one 4x4 instead of 4 2x2 */
1030 trDepth--; /* also adjust the number of coeff read */
1031 }
1032
1033 // copy transform coefficients
1034 uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
1035 uint32_t coeffOffsetC = absPartIdx << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
1036
1037 uint32_t qtLayer = log2TrSize - 2;
1038 coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
1039 coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
1040 coeff_t* coeffDstU = cu.m_trCoeff[1] + coeffOffsetC;
1041 coeff_t* coeffDstV = cu.m_trCoeff[2] + coeffOffsetC;
1042 memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
1043 memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);
1044
1045 // copy reconstruction
1046 m_rqt[qtLayer].reconQtYuv.copyPartToPartChroma(reconYuv, absPartIdx, log2TrSizeC + m_hChromaShift);
1047 }
1048 else
1049 {
1050 if (g_maxLog2CUSize - fullDepth - 1 == 2 && m_csp != X265_CSP_I444)
1051 /* no such thing as chroma 2x2, so extract one 4x4 instead of 4 2x2 */
1052 extractIntraResultChromaQT(cu, reconYuv, absPartIdx, trDepth + 1, true);
1053 else
1054 {
1055 uint32_t numQPart = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
1056 for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
1057 extractIntraResultChromaQT(cu, reconYuv, absPartIdx + subPartIdx * numQPart, trDepth + 1, false);
1058 }
1059 }
1060}
1061
1062void Search::residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t trDepth, uint32_t absPartIdx)
1063{
1064 CUData& cu = mode.cu;
1065 uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
1066 uint32_t tuDepthL = cu.m_tuDepth[absPartIdx];
1067
1068 if (tuDepthL == trDepth)
1069 {
1070 uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
1071 uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
1072 uint32_t trDepthC = trDepth;
1073 if (log2TrSizeC == 1)
1074 {
1075 X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && trDepth > 0, "invalid trDepth\n");
1076 trDepthC--;
1077 log2TrSizeC++;
1078 uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
1079 bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
1080 if (!bFirstQ)
1081 return;
1082 }
1083
1084 ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
1085 uint32_t tuSize = 1 << log2TrSizeC;
1086 uint32_t stride = mode.fencYuv->m_csize;
1087 const int sizeIdxC = log2TrSizeC - 2;
1088
1089 uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
1090 const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
1091
1092 for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
1093 {
1094 TextType ttype = (TextType)chromaId;
1095
1096 TURecurse tuIterator(splitType, curPartNum, absPartIdx);
1097 do
1098 {
1099 uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
1100
1101 pixel* fenc = const_cast<pixel*>(mode.fencYuv->getChromaAddr(chromaId, absPartIdxC));
1102 pixel* pred = mode.predYuv.getChromaAddr(chromaId, absPartIdxC);
1103 int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
1104 pixel* recon = mode.reconYuv.getChromaAddr(chromaId, absPartIdxC); // TODO: needed?
1105 uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
1106 coeff_t* coeff = cu.m_trCoeff[ttype] + coeffOffsetC;
1107 pixel* picReconC = m_frame->m_reconPicYuv->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
1108 uint32_t picStride = m_frame->m_reconPicYuv->m_strideC;
1109
1110 uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
1111 if (chromaPredMode == DM_CHROMA_IDX)
1112 chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
1113 chromaPredMode = (m_csp == X265_CSP_I422) ? g_chroma422IntraAngleMappingTable[chromaPredMode] : chromaPredMode;
1114 initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
1115 pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
1116
1117 predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
1118
1119 X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");
1120
1121 primitives.calcresidual[sizeIdxC](fenc, pred, residual, stride);
1122 uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, false);
1123 if (numSig)
1124 {
1125 m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], residual, stride, coeff, log2TrSizeC, ttype, true, false, numSig);
1126 primitives.luma_add_ps[sizeIdxC](recon, stride, pred, residual, stride, stride);
1127 primitives.square_copy_pp[sizeIdxC](picReconC, picStride, recon, stride);
1128 cu.setCbfPartRange(1 << trDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1129 }
1130 else
1131 {
1132 primitives.square_copy_pp[sizeIdxC](recon, stride, pred, stride);
1133 primitives.square_copy_pp[sizeIdxC](picReconC, picStride, pred, stride);
1134 cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
1135 }
1136 }
1137 while (tuIterator.isNextSection());
1138
1139 if (splitType == VERTICAL_SPLIT)
1140 offsetSubTUCBFs(cu, (TextType)chromaId, trDepth, absPartIdx);
1141 }
1142 }
1143 else
1144 {
1145 uint32_t qPartsDiv = NUM_CU_PARTITIONS >> ((fullDepth + 1) << 1);
1146 uint32_t splitCbfU = 0, splitCbfV = 0;
1147 for (uint32_t subPartIdx = 0, absPartIdxC = absPartIdx; subPartIdx < 4; subPartIdx++, absPartIdxC += qPartsDiv)
1148 {
1149 residualQTIntraChroma(mode, cuGeom, trDepth + 1, absPartIdxC);
1150 splitCbfU |= cu.getCbf(absPartIdxC, TEXT_CHROMA_U, trDepth + 1);
1151 splitCbfV |= cu.getCbf(absPartIdxC, TEXT_CHROMA_V, trDepth + 1);
1152 }
1153 for (uint32_t offs = 0; offs < 4 * qPartsDiv; offs++)
1154 {
1155 cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << trDepth);
1156 cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << trDepth);
1157 }
1158 }
1159}
1160
1161void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes)
1162{
1163 uint32_t depth = cuGeom.depth;
1164 CUData& cu = intraMode.cu;
1165
1166 cu.setPartSizeSubParts(partSize);
1167 cu.setPredModeSubParts(MODE_INTRA);
1168
1169 uint32_t tuDepthRange[2];
1170 cu.getIntraTUQtDepthRange(tuDepthRange, 0);
1171
1172 intraMode.initCosts();
1173 intraMode.distortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange, sharedModes);
1174 intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom);
1175
1176 m_entropyCoder.resetBits();
1177 if (m_slice->m_pps->bTransquantBypassEnabled)
1178 m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
1179
1180 if (!m_slice->isIntra())
1181 {
1182 m_entropyCoder.codeSkipFlag(cu, 0);
1183 m_entropyCoder.codePredMode(cu.m_predMode[0]);
1184 }
1185
1186 m_entropyCoder.codePartSize(cu, 0, depth);
1187 m_entropyCoder.codePredInfo(cu, 0);
1188 intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits();
1189
1190 bool bCodeDQP = m_slice->m_pps->bUseDQP;
1191 m_entropyCoder.codeCoeff(cu, 0, depth, bCodeDQP, tuDepthRange);
1192 m_entropyCoder.store(intraMode.contexts);
1193 intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
1194 intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
1195 if (m_rdCost.m_psyRd)
1196 intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, intraMode.fencYuv->m_buf[0], intraMode.fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size);
1197
1198 updateModeCost(intraMode);
1199}
1200
1201uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, uint32_t depthRange[2], uint8_t* sharedModes)
1202{
1203 CUData& cu = intraMode.cu;
1204 Yuv* reconYuv = &intraMode.reconYuv;
1205 Yuv* predYuv = &intraMode.predYuv;
1206 const Yuv* fencYuv = intraMode.fencYuv;
1207
1208 uint32_t depth = cu.m_cuDepth[0];
1209 uint32_t initTrDepth = cu.m_partSize[0] == SIZE_2Nx2N ? 0 : 1;
1210 uint32_t numPU = 1 << (2 * initTrDepth);
1211 uint32_t log2TrSize = cu.m_log2CUSize[0] - initTrDepth;
1212 uint32_t tuSize = 1 << log2TrSize;
1213 uint32_t qNumParts = cuGeom.numPartitions >> 2;
1214 uint32_t sizeIdx = log2TrSize - 2;
1215 uint32_t absPartIdx = 0;
1216 uint32_t totalDistortion = 0;
1217
1218 int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[absPartIdx] == SIZE_NxN;
1219
1220 // loop over partitions
1221 for (uint32_t pu = 0; pu < numPU; pu++, absPartIdx += qNumParts)
1222 {
1223 uint32_t bmode = 0;
1224
1225 if (sharedModes)
1226 bmode = sharedModes[pu];
1227 else
1228 {
1229 // Reference sample smoothing
1230 initAdiPattern(cu, cuGeom, absPartIdx, initTrDepth, ALL_IDX);
1231
1232 // determine set of modes to be tested (using prediction signal only)
1233 pixel* fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
1234 uint32_t stride = predYuv->m_size;
1235
1236 pixel *above = m_refAbove + tuSize - 1;
1237 pixel *aboveFiltered = m_refAboveFlt + tuSize - 1;
1238 pixel *left = m_refLeft + tuSize - 1;
1239 pixel *leftFiltered = m_refLeftFlt + tuSize - 1;
1240
1241 // 33 Angle modes once
1242 ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
1243 ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]);
1244 ALIGN_VAR_32(pixel, bufScale[32 * 32]);
1245 pixel _above[4 * 32 + 1];
1246 pixel _left[4 * 32 + 1];
1247 int scaleTuSize = tuSize;
1248 int scaleStride = stride;
1249 int costShift = 0;
1250
1251 if (tuSize > 32)
1252 {
1253 pixel *aboveScale = _above + 2 * 32;
1254 pixel *leftScale = _left + 2 * 32;
1255
1256 // origin is 64x64, we scale to 32x32 and setup required parameters
1257 primitives.scale2D_64to32(bufScale, fenc, stride);
1258 fenc = bufScale;
1259
1260 // reserve space in case primitives need to store data in above
1261 // or left buffers
1262 aboveScale[0] = leftScale[0] = above[0];
1263 primitives.scale1D_128to64(aboveScale + 1, above + 1, 0);
1264 primitives.scale1D_128to64(leftScale + 1, left + 1, 0);
1265
1266 scaleTuSize = 32;
1267 scaleStride = 32;
1268 costShift = 2;
1269 sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
1270
1271 // Filtered and Unfiltered refAbove and refLeft pointing to above and left.
1272 above = aboveScale;
1273 left = leftScale;
1274 aboveFiltered = aboveScale;
1275 leftFiltered = leftScale;
1276 }
1277
1278 m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
1279
1280 /* there are three cost tiers for intra modes:
1281 * pred[0] - mode probable, least cost
1282 * pred[1], pred[2] - less probable, slightly more cost
1283 * non-mpm modes - all cost the same (rbits) */
1284 uint64_t mpms;
1285 uint32_t preds[3];
1286 uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms);
1287
1288 pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
1289 uint64_t modeCosts[35];
1290 uint64_t bcost;
1291
1292 // DC
1293 primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
1294 uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(preds, DC_IDX) : rbits;
1295 uint32_t sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
1296 modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
1297
1298 // PLANAR
1299 pixel *abovePlanar = above;
1300 pixel *leftPlanar = left;
1301 if (tuSize >= 8 && tuSize <= 32)
1302 {
1303 abovePlanar = aboveFiltered;
1304 leftPlanar = leftFiltered;
1305 }
1306 primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
1307 bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(preds, PLANAR_IDX) : rbits;
1308 sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
1309 modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
1310 COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
1311
1312 // angular predictions
1313 primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
1314
1315 primitives.transpose[sizeIdx](buf_trans, fenc, scaleStride);
1316 for (int mode = 2; mode < 35; mode++)
1317 {
1318 bool modeHor = (mode < 18);
1319 pixel *cmp = (modeHor ? buf_trans : fenc);
1320 intptr_t srcStride = (modeHor ? scaleTuSize : scaleStride);
1321 bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
1322 sad = sa8d(cmp, srcStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
1323 modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
1324 COPY1_IF_LT(bcost, modeCosts[mode]);
1325 }
1326
1327 /* Find the top maxCandCount candidate modes with cost within 25% of best
1328 * or among the most probable modes. maxCandCount is derived from the
1329 * rdLevel and depth. In general we want to try more modes at slower RD
1330 * levels and at higher depths */
1331 uint64_t candCostList[MAX_RD_INTRA_MODES];
1332 uint32_t rdModeList[MAX_RD_INTRA_MODES];
1333 int maxCandCount = 2 + m_param->rdLevel + ((depth + initTrDepth) >> 1);
1334 for (int i = 0; i < maxCandCount; i++)
1335 candCostList[i] = MAX_INT64;
1336
1337 uint64_t paddedBcost = bcost + (bcost >> 3); // 1.12%
1338 for (int mode = 0; mode < 35; mode++)
1339 if (modeCosts[mode] < paddedBcost || (mpms & ((uint64_t)1 << mode)))
1340 updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList);
1341
1342 /* measure best candidates using simple RDO (no TU splits) */
1343 bcost = MAX_INT64;
1344 for (int i = 0; i < maxCandCount; i++)
1345 {
1346 if (candCostList[i] == MAX_INT64)
1347 break;
1348 m_entropyCoder.load(m_rqt[depth].cur);
1349 cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTrDepth);
1350
1351 Cost icosts;
1352 if (checkTransformSkip)
1353 codeIntraLumaTSkip(intraMode, cuGeom, initTrDepth, absPartIdx, icosts);
1354 else
1355 codeIntraLumaQT(intraMode, cuGeom, initTrDepth, absPartIdx, false, icosts, depthRange);
1356 COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]);
1357 }
1358 }
1359
1360 /* remeasure best mode, allowing TU splits */
1361 cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTrDepth);
1362 m_entropyCoder.load(m_rqt[depth].cur);
1363
1364 Cost icosts;
1365 if (checkTransformSkip)
1366 codeIntraLumaTSkip(intraMode, cuGeom, initTrDepth, absPartIdx, icosts);
1367 else
1368 codeIntraLumaQT(intraMode, cuGeom, initTrDepth, absPartIdx, true, icosts, depthRange);
1369 totalDistortion += icosts.distortion;
1370
1371 extractIntraResultQT(cu, *reconYuv, initTrDepth, absPartIdx);
1372
1373 // set reconstruction for next intra prediction blocks
1374 if (pu != numPU - 1)
1375 {
1376 /* This has important implications for parallelism and RDO. It is writing intermediate results into the
1377 * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
1378 * it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think
1379 * that the contexts should be tracked through each PU */
1380 pixel* dst = m_frame->m_reconPicYuv->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
1381 uint32_t dststride = m_frame->m_reconPicYuv->m_stride;
1382 pixel* src = reconYuv->getLumaAddr(absPartIdx);
1383 uint32_t srcstride = reconYuv->m_size;
1384 primitives.square_copy_pp[log2TrSize - 2](dst, dststride, src, srcstride);
1385 }
1386 }
1387
1388 if (numPU > 1)
1389 {
1390 uint32_t combCbfY = 0;
1391 uint32_t partIdx = 0;
1392 for (uint32_t part = 0; part < 4; part++, partIdx += qNumParts)
1393 combCbfY |= cu.getCbf(partIdx, TEXT_LUMA, 1);
1394
1395 for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
1396 cu.m_cbf[0][offs] |= combCbfY;
1397 }
1398
1399 // TODO: remove this
1400 m_entropyCoder.load(m_rqt[depth].cur);
1401
1402 return totalDistortion;
1403}
1404
1405void Search::getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom)
1406{
1407 CUData& cu = intraMode.cu;
1408 const Yuv* fencYuv = intraMode.fencYuv;
1409 Yuv* predYuv = &intraMode.predYuv;
1410
1411 uint32_t bestMode = 0;
1412 uint64_t bestCost = MAX_INT64;
1413 uint32_t modeList[NUM_CHROMA_MODE];
1414
1415 uint32_t log2TrSizeC = cu.m_log2CUSize[0] - m_hChromaShift;
1416 uint32_t tuSize = 1 << log2TrSizeC;
1417 int32_t scaleTuSize = tuSize;
1418 int32_t costShift = 0;
1419
1420 if (tuSize > 32)
1421 {
1422 scaleTuSize = 32;
1423 costShift = 2;
1424 log2TrSizeC = 5;
1425 }
1426
1427 Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 1);
1428 Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 2);
1429 cu.getAllowedChromaDir(0, modeList);
1430
1431 // check chroma modes
1432 for (uint32_t mode = 0; mode < NUM_CHROMA_MODE; mode++)
1433 {
1434 uint32_t chromaPredMode = modeList[mode];
1435 if (chromaPredMode == DM_CHROMA_IDX)
1436 chromaPredMode = cu.m_lumaIntraDir[0];
1437 if (m_csp == X265_CSP_I422)
1438 chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
1439
1440 uint64_t cost = 0;
1441 for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
1442 {
1443 pixel* fenc = fencYuv->m_buf[chromaId];
1444 pixel* pred = predYuv->m_buf[chromaId];
1445 pixel* chromaPred = getAdiChromaBuf(chromaId, scaleTuSize);
1446
1447 // get prediction signal
1448 predIntraChromaAng(chromaPred, chromaPredMode, pred, fencYuv->m_csize, log2TrSizeC, m_csp);
1449 cost += primitives.sa8d[log2TrSizeC - 2](fenc, predYuv->m_csize, pred, fencYuv->m_csize) << costShift;
1450 }
1451
1452 if (cost < bestCost)
1453 {
1454 bestCost = cost;
1455 bestMode = modeList[mode];
1456 }
1457 }
1458
1459 cu.setChromIntraDirSubParts(bestMode, 0, cu.m_cuDepth[0]);
1460}
1461
1462uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
1463{
1464 CUData& cu = intraMode.cu;
1465 Yuv& reconYuv = intraMode.reconYuv;
1466
1467 uint32_t depth = cu.m_cuDepth[0];
1468 uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN && m_csp == X265_CSP_I444;
1469 uint32_t log2TrSize = cu.m_log2CUSize[0] - initTrDepth;
1470 uint32_t absPartStep = (NUM_CU_PARTITIONS >> (depth << 1));
1471 uint32_t totalDistortion = 0;
1472
1473 int part = partitionFromLog2Size(log2TrSize);
1474
1475 TURecurse tuIterator((initTrDepth == 0) ? DONT_SPLIT : QUAD_SPLIT, absPartStep, 0);
1476
1477 do
1478 {
1479 uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
1480 int cuSize = 1 << cu.m_log2CUSize[absPartIdxC];
1481
1482 uint32_t bestMode = 0;
1483 uint32_t bestDist = 0;
1484 uint64_t bestCost = MAX_INT64;
1485
1486 // init mode list
1487 uint32_t minMode = 0;
1488 uint32_t maxMode = NUM_CHROMA_MODE;
1489 uint32_t modeList[NUM_CHROMA_MODE];
1490
1491 cu.getAllowedChromaDir(absPartIdxC, modeList);
1492
1493 // check chroma modes
1494 for (uint32_t mode = minMode; mode < maxMode; mode++)
1495 {
1496 // restore context models
1497 m_entropyCoder.load(m_rqt[depth].cur);
1498
1499 cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTrDepth);
1500 uint32_t psyEnergy = 0;
1501 uint32_t dist = codeIntraChromaQt(intraMode, cuGeom, initTrDepth, absPartIdxC, psyEnergy);
1502
1503 if (m_slice->m_pps->bTransformSkipEnabled)
1504 m_entropyCoder.load(m_rqt[depth].cur);
1505
1506 m_entropyCoder.resetBits();
1507 // chroma prediction mode
1508 if (cu.m_partSize[0] == SIZE_2Nx2N || m_csp != X265_CSP_I444)
1509 {
1510 if (!absPartIdxC)
1511 m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);
1512 }
1513 else
1514 {
1515 uint32_t qtNumParts = cuGeom.numPartitions >> 2;
1516 if (!(absPartIdxC & (qtNumParts - 1)))
1517 m_entropyCoder.codeIntraDirChroma(cu, absPartIdxC, modeList);
1518 }
1519
1520 codeSubdivCbfQTChroma(cu, initTrDepth, absPartIdxC, tuIterator.absPartIdxStep, cuSize, cuSize);
1521 codeCoeffQTChroma(cu, initTrDepth, absPartIdxC, TEXT_CHROMA_U);
1522 codeCoeffQTChroma(cu, initTrDepth, absPartIdxC, TEXT_CHROMA_V);
1523 uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
1524 uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(dist, bits, psyEnergy) : m_rdCost.calcRdCost(dist, bits);
1525
1526 if (cost < bestCost)
1527 {
1528 bestCost = cost;
1529 bestDist = dist;
1530 bestMode = modeList[mode];
1531 extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTrDepth, false);
1532 memcpy(m_qtTempCbf[1], cu.m_cbf[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
1533 memcpy(m_qtTempCbf[2], cu.m_cbf[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
1534 memcpy(m_qtTempTransformSkipFlag[1], cu.m_transformSkip[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
1535 memcpy(m_qtTempTransformSkipFlag[2], cu.m_transformSkip[2] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
1536 }
1537 }
1538
1539 if (!tuIterator.isLastSection())
1540 {
1541 uint32_t zorder = cuGeom.encodeIdx + absPartIdxC;
1542 uint32_t dststride = m_frame->m_reconPicYuv->m_strideC;
1543 pixel *src, *dst;
1544
1545 dst = m_frame->m_reconPicYuv->getCbAddr(cu.m_cuAddr, zorder);
1546 src = reconYuv.getCbAddr(absPartIdxC);
1547 primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize);
1548
1549 dst = m_frame->m_reconPicYuv->getCrAddr(cu.m_cuAddr, zorder);
1550 src = reconYuv.getCrAddr(absPartIdxC);
1551 primitives.chroma[m_csp].copy_pp[part](dst, dststride, src, reconYuv.m_csize);
1552 }
1553
1554 memcpy(cu.m_cbf[1] + absPartIdxC, m_qtTempCbf[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
1555 memcpy(cu.m_cbf[2] + absPartIdxC, m_qtTempCbf[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
1556 memcpy(cu.m_transformSkip[1] + absPartIdxC, m_qtTempTransformSkipFlag[1], tuIterator.absPartIdxStep * sizeof(uint8_t));
1557 memcpy(cu.m_transformSkip[2] + absPartIdxC, m_qtTempTransformSkipFlag[2], tuIterator.absPartIdxStep * sizeof(uint8_t));
1558 cu.setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTrDepth);
1559 totalDistortion += bestDist;
1560 }
1561 while (tuIterator.isNextSection());
1562
1563 if (initTrDepth != 0)
1564 {
1565 uint32_t combCbfU = 0;
1566 uint32_t combCbfV = 0;
1567 uint32_t partIdx = 0;
1568 for (uint32_t p = 0; p < 4; p++, partIdx += tuIterator.absPartIdxStep)
1569 {
1570 combCbfU |= cu.getCbf(partIdx, TEXT_CHROMA_U, 1);
1571 combCbfV |= cu.getCbf(partIdx, TEXT_CHROMA_V, 1);
1572 }
1573
1574 for (uint32_t offs = 0; offs < 4 * tuIterator.absPartIdxStep; offs++)
1575 {
1576 cu.m_cbf[1][offs] |= combCbfU;
1577 cu.m_cbf[2][offs] |= combCbfV;
1578 }
1579 }
1580
1581 /* TODO: remove this */
1582 m_entropyCoder.load(m_rqt[depth].cur);
1583 return totalDistortion;
1584}
1585
1586/* estimation of best merge coding of an inter PU (not a merge CU) */
1587uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, int puIdx, MergeData& m)
1588{
1589 X265_CHECK(cu.m_partSize[0] != SIZE_2Nx2N, "merge tested on non-2Nx2N partition\n");
1590
1591 m.maxNumMergeCand = cu.getInterMergeCandidates(m.absPartIdx, puIdx, m.mvFieldNeighbours, m.interDirNeighbours);
1592
1593 if (cu.isBipredRestriction())
1594 {
1595 /* in 8x8 CUs do not allow bidir merge candidates if not 2Nx2N */
1596 for (uint32_t mergeCand = 0; mergeCand < m.maxNumMergeCand; ++mergeCand)
1597 {
1598 if (m.interDirNeighbours[mergeCand] == 3)
1599 {
1600 m.interDirNeighbours[mergeCand] = 1;
1601 m.mvFieldNeighbours[mergeCand][1].refIdx = REF_NOT_VALID;
1602 }
1603 }
1604 }
1605
1606 Yuv& tempYuv = m_rqt[cuGeom.depth].tmpPredYuv;
1607
1608 uint32_t outCost = MAX_UINT;
1609 for (uint32_t mergeCand = 0; mergeCand < m.maxNumMergeCand; ++mergeCand)
1610 {
1611 /* Prevent TMVP candidates from using unavailable reference pixels */
1612 if (m_bFrameParallel &&
1613 (m.mvFieldNeighbours[mergeCand][0].mv.y >= (m_param->searchRange + 1) * 4 ||
1614 m.mvFieldNeighbours[mergeCand][1].mv.y >= (m_param->searchRange + 1) * 4))
1615 continue;
1616
1617 cu.m_mv[0][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][0].mv;
1618 cu.m_refIdx[0][m.absPartIdx] = (char)m.mvFieldNeighbours[mergeCand][0].refIdx;
1619 cu.m_mv[1][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][1].mv;
1620 cu.m_refIdx[1][m.absPartIdx] = (char)m.mvFieldNeighbours[mergeCand][1].refIdx;
1621
1622 prepMotionCompensation(cu, cuGeom, puIdx);
1623 motionCompensation(tempYuv, true, false);
1624 uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(m.absPartIdx), tempYuv.m_size);
1625 uint32_t bitsCand = getTUBits(mergeCand, m.maxNumMergeCand);
1626 costCand = costCand + m_rdCost.getCost(bitsCand);
1627 if (costCand < outCost)
1628 {
1629 outCost = costCand;
1630 m.bits = bitsCand;
1631 m.index = mergeCand;
1632 }
1633 }
1634
1635 m.mvField[0] = m.mvFieldNeighbours[m.index][0];
1636 m.mvField[1] = m.mvFieldNeighbours[m.index][1];
1637 m.interDir = m.interDirNeighbours[m.index];
1638
1639 return outCost;
1640}
1641
1642/* this function assumes the caller has configured its MotionEstimation engine with the
1643 * correct source plane and source PU, and has called prepMotionCompensation() to set
1644 * m_puAbsPartIdx, m_puWidth, and m_puHeight */
1645void Search::singleMotionEstimation(Search& master, const CUData& cu, const CUGeom& cuGeom, int part, int list, int ref)
1646{
1647 uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS;
1648 bits += getTUBits(ref, m_slice->m_numRefIdx[list]);
1649
1650 MV amvpCand[AMVP_NUM_CANDS];
1651 MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
1652 int numMvc = cu.fillMvpCand(part, m_puAbsPartIdx, list, ref, amvpCand, mvc);
1653
1654 uint32_t bestCost = MAX_INT;
1655 int mvpIdx = 0;
1656 int merange = m_param->searchRange;
1657 for (int i = 0; i < AMVP_NUM_CANDS; i++)
1658 {
1659 MV mvCand = amvpCand[i];
1660
1661 // NOTE: skip mvCand if Y is > merange and -FN>1
1662 if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
1663 continue;
1664
1665 cu.clipMv(mvCand);
1666
1667 Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
1668 predInterLumaPixel(tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPicYuv, mvCand);
1669 uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
1670
1671 if (bestCost > cost)
1672 {
1673 bestCost = cost;
1674 mvpIdx = i;
1675 }
1676 }
1677
1678 MV mvmin, mvmax, outmv, mvp = amvpCand[mvpIdx];
1679 setSearchRange(cu, mvp, merange, mvmin, mvmax);
1680
1681 int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
1682
1683 /* Get total cost of partition, but only include MV bit cost once */
1684 bits += m_me.bitcost(outmv);
1685 uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
1686
1687 /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
1688 checkBestMVP(amvpCand, outmv, mvp, mvpIdx, bits, cost);
1689
1690 /* tie goes to the smallest ref ID, just like --no-pme */
1691 ScopedLock _lock(master.m_outputLock);
1692 if (cost < master.m_bestME[list].cost ||
1693 (cost == master.m_bestME[list].cost && ref < master.m_bestME[list].ref))
1694 {
1695 master.m_bestME[list].mv = outmv;
1696 master.m_bestME[list].mvp = mvp;
1697 master.m_bestME[list].mvpIdx = mvpIdx;
1698 master.m_bestME[list].ref = ref;
1699 master.m_bestME[list].cost = cost;
1700 master.m_bestME[list].bits = bits;
1701 }
1702}
1703
1704/* search of the best candidate for inter prediction
1705 * returns true if predYuv was filled with a motion compensated prediction */
/* Overview. For every PU of interMode.cu this routine:
 *  - runs mergeEstimation() for partitions other than 2Nx2N;
 *  - when bMergeOnly is set and the CU is larger than 8x8 (m_log2CUSize[0] > 3), stores the
 *    best merge candidate directly and skips the motion search; it returns false if
 *    mergeEstimation() reported MAX_UINT (no usable merge candidate);
 *  - otherwise runs unidirectional motion estimation over every reference of each list,
 *    either serially or as one job per (list, ref) when bDistributed is true;
 *  - when the slice is a B slice, bi-prediction is not restricted and both lists produced a
 *    result, evaluates a bi-directional candidate built from the two unidirectional
 *    winners and, conditionally, a zero-MV bi-directional candidate;
 *  - stores the cheapest of merge / bidir / L0 / L1 in the CU and motion compensates the PU
 *    into interMode.predYuv (bChroma is forwarded to motionCompensation(); presumably it
 *    enables chroma prediction -- confirm against motionCompensation()).
 * The motion bits accumulated over all PUs are added to interMode.sa8dBits. */
1706bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChroma)
1707{
1708 CUData& cu = interMode.cu;
1709 Yuv* predYuv = &interMode.predYuv;
1710
     /* AMVP candidates are kept per list and per reference so the bidir zero-MV check
      * below can re-run checkBestMVP() on the winning references */
1711 MV amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS];
1712 MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
1713
1714 const Slice *slice = m_slice;
1715 PicYuv* fencPic = m_frame->m_origPicYuv;
1716 int numPart = cu.getNumPartInter();
1717 int numPredDir = slice->isInterP() ? 1 : 2;
1718 const int* numRefIdx = slice->m_numRefIdx;
     /* lastMode records the direction chosen for the previous PU (0 = L0, 1 = L1, 2 = bidir);
      * it is passed to getBlkBits() for the next PU */
1719 uint32_t lastMode = 0;
1720 int totalmebits = 0;
     /* distribute ME only when there are more than two references in total */
1721 bool bDistributed = m_param->bDistributeMotionEstimation && (numRefIdx[0] + numRefIdx[1]) > 2;
1722 MV mvzero(0, 0);
1723 Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
1724
1725 MergeData merge;
1726 memset(&merge, 0, sizeof(merge));
1727
1728 for (int puIdx = 0; puIdx < numPart; puIdx++)
1729 {
1730 /* sets m_puAbsPartIdx, m_puWidth, m_puHeight */
1731 initMotionCompensation(cu, cuGeom, puIdx);
1732
     /* point the motion estimation engine at this PU's source pixels; the PU is passed as
      * an offset relative to the luma plane origin */
1733 pixel* pu = fencPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
1734 m_me.setSourcePU(pu - fencPic->m_picOrg[0], m_puWidth, m_puHeight);
1735
     /* stays MAX_UINT when merge is not evaluated (2Nx2N), so merge can never win below */
1736 uint32_t mrgCost = MAX_UINT;
1737
1738 /* find best cost merge candidate */
1739 if (cu.m_partSize[m_puAbsPartIdx] != SIZE_2Nx2N)
1740 {
1741 merge.absPartIdx = m_puAbsPartIdx;
1742 merge.width = m_puWidth;
1743 merge.height = m_puHeight;
1744 mrgCost = mergeEstimation(cu, cuGeom, puIdx, merge);
1745
     /* merge-only shortcut: take the merge result as-is and move on to the next PU */
1746 if (bMergeOnly && cu.m_log2CUSize[0] > 3)
1747 {
1748 if (mrgCost == MAX_UINT)
1749 {
1750 /* No valid merge modes were found, there is no possible way to
1751 * perform a valid motion compensation prediction, so early-exit */
1752 return false;
1753 }
1754 // set merge result
1755 cu.m_mergeFlag[m_puAbsPartIdx] = true;
1756 cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx
1757 cu.setPUInterDir(merge.interDir, m_puAbsPartIdx, puIdx);
1758 cu.setPUMv(0, merge.mvField[0].mv, m_puAbsPartIdx, puIdx);
1759 cu.setPURefIdx(0, merge.mvField[0].refIdx, m_puAbsPartIdx, puIdx);
1760 cu.setPUMv(1, merge.mvField[1].mv, m_puAbsPartIdx, puIdx);
1761 cu.setPURefIdx(1, merge.mvField[1].refIdx, m_puAbsPartIdx, puIdx);
1762 totalmebits += merge.bits;
1763
1764 prepMotionCompensation(cu, cuGeom, puIdx);
1765 motionCompensation(*predYuv, true, bChroma);
1766 continue;
1767 }
1768 }
1769
1770 MotionData bidir[2];
1771 uint32_t bidirCost = MAX_UINT;
1772 int bidirBits = 0;
1773
     /* reset the per-list winners; MAX_UINT marks "no result for this list" */
1774 m_bestME[0].cost = MAX_UINT;
1775 m_bestME[1].cost = MAX_UINT;
1776
     /* fills m_listSelBits[0..2]; indices 0 and 1 are used for L0/L1 and index 2 for bidir below */
1777 getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits);
1778
1779 if (bDistributed)
1780 {
     /* Distributed ME: one job per (list, ref). Job ids 0 .. numRefIdx[0]-1 map to L0
      * references and the remaining ids to L1 references (see the id mapping below). */
1781 m_curMECu = &cu;
1782 m_curGeom = &cuGeom;
1783
1784 /* this worker might already be enqueued for pmode, so other threads
1785 * might be looking at the ME job counts at any time, do these sets
1786 * in a safe order */
1787 m_curPart = puIdx;
1788 m_totalNumME = 0;
     /* start at 1: job 0 (L0, ref 0) is reserved for this thread and run after the loop */
1789 m_numAcquiredME = 1;
1790 m_numCompletedME = 0;
1791 m_totalNumME = numRefIdx[0] + numRefIdx[1];
1792
1793 if (!m_bJobsQueued)
1794 JobProvider::enqueue();
1795
     /* NOTE(review): presumably wakes one idle pool thread per remaining job so they can
      * pick up ME work from this provider -- confirm against the thread pool */
1796 for (int i = 1; i < m_totalNumME; i++)
1797 m_pool->pokeIdleThread();
1798
     /* this thread also claims jobs until all have been acquired */
1799 while (m_totalNumME > m_numAcquiredME)
1800 {
     /* NOTE(review): the "id -= 1" below assumes ATOMIC_INC returns the incremented value -- confirm */
1801 int id = ATOMIC_INC(&m_numAcquiredME);
1802 if (m_totalNumME >= id)
1803 {
1804 id -= 1;
1805 if (id < numRefIdx[0])
1806 singleMotionEstimation(*this, cu, cuGeom, puIdx, 0, id);
1807 else
1808 singleMotionEstimation(*this, cu, cuGeom, puIdx, 1, id - numRefIdx[0]);
1809
     /* whichever caller completes the last job triggers the completion event */
1810 if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME)
1811 m_meCompletionEvent.trigger();
1812 }
1813 }
1814 if (!m_bJobsQueued)
1815 JobProvider::dequeue();
1816
1817 /* we saved L0-0 for ourselves */
1818 singleMotionEstimation(*this, cu, cuGeom, puIdx, 0, 0);
1819 if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME)
1820 m_meCompletionEvent.trigger();
1821
     /* block until every job has reported completion */
1822 m_meCompletionEvent.wait();
1823 }
1824 else
1825 {
1826 // Uni-directional prediction
1827 for (int l = 0; l < numPredDir; l++)
1828 {
1829 for (int ref = 0; ref < numRefIdx[l]; ref++)
1830 {
     /* signalling overhead: list selection + MVP index + reference index */
1831 uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS;
1832 bits += getTUBits(ref, numRefIdx[l]);
1833
1834 int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, amvpCand[l][ref], mvc);
1835
1836 // Pick the best possible MVP from AMVP candidates based on least residual
1837 uint32_t bestCost = MAX_INT;
1838 int mvpIdx = 0;
1839 int merange = m_param->searchRange;
1840
1841 for (int i = 0; i < AMVP_NUM_CANDS; i++)
1842 {
1843 MV mvCand = amvpCand[l][ref][i];
1844
1845 // NOTE: skip mvCand if Y is > merange and -FN>1
1846 if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4))
1847 continue;
1848
1849 cu.clipMv(mvCand);
1850 predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPicYuv, mvCand);
1851 uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
1852
1853 if (bestCost > cost)
1854 {
1855 bestCost = cost;
1856 mvpIdx = i;
1857 }
1858 }
1859
1860 MV mvmin, mvmax, outmv, mvp = amvpCand[l][ref][mvpIdx];
1861
1862 setSearchRange(cu, mvp, merange, mvmin, mvmax);
1863 int satdCost = m_me.motionEstimate(&slice->m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv);
1864
1865 /* Get total cost of partition, but only include MV bit cost once */
1866 bits += m_me.bitcost(outmv);
1867 uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
1868
1869 /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
1870 checkBestMVP(amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost);
1871
     /* strict '<' with refs visited in ascending order: a tie keeps the smaller ref index */
1872 if (cost < m_bestME[l].cost)
1873 {
1874 m_bestME[l].mv = outmv;
1875 m_bestME[l].mvp = mvp;
1876 m_bestME[l].mvpIdx = mvpIdx;
1877 m_bestME[l].ref = ref;
1878 m_bestME[l].cost = cost;
1879 m_bestME[l].bits = bits;
1880 }
1881 }
1882 }
1883 }
1884
1885 /* Bi-directional prediction */
1886 if (slice->isInterB() && !cu.isBipredRestriction() && m_bestME[0].cost != MAX_UINT && m_bestME[1].cost != MAX_UINT)
1887 {
     /* start from the two unidirectional winners */
1888 bidir[0] = m_bestME[0];
1889 bidir[1] = m_bestME[1];
1890
1891 /* Generate reference subpels */
1892 PicYuv* refPic0 = slice->m_refPicList[0][m_bestME[0].ref]->m_reconPicYuv;
1893 PicYuv* refPic1 = slice->m_refPicList[1][m_bestME[1].ref]->m_reconPicYuv;
1894 Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
1895 predInterLumaPixel(bidirYuv[0], *refPic0, m_bestME[0].mv);
1896 predInterLumaPixel(bidirYuv[1], *refPic1, m_bestME[1].mv);
1897
1898 pixel *pred0 = bidirYuv[0].getLumaAddr(m_puAbsPartIdx);
1899 pixel *pred1 = bidirYuv[1].getLumaAddr(m_puAbsPartIdx);
1900
     /* combine both predictions into the start of tmpPredYuv's luma buffer and measure SATD there */
1901 int partEnum = partitionFromSizes(m_puWidth, m_puHeight);
1902 primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, pred0, bidirYuv[0].m_size, pred1, bidirYuv[1].m_size, 32);
1903 int satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
1904
     /* both lists' bits, with the two single-list selection costs replaced by the bidir one */
1905 bidirBits = m_bestME[0].bits + m_bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
1906 bidirCost = satdCost + m_rdCost.getCost(bidirBits);
1907
     /* the zero-MV variant is only worth trying when at least one winning MV is non-zero */
1908 bool bTryZero = m_bestME[0].mv.notZero() || m_bestME[1].mv.notZero();
1909 if (bTryZero)
1910 {
1911 /* Do not try zero MV if unidir motion predictors are beyond
1912 * valid search area */
1913 MV mvmin, mvmax;
1914 int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
1915 setSearchRange(cu, mvzero, merange, mvmin, mvmax);
1916 mvmax.y += 2; // there is some pad for subpel refine
     /* setSearchRange() shifts its outputs right by 2; shift back before comparing with the MVPs */
1917 mvmin <<= 2;
1918 mvmax <<= 2;
1919
1920 bTryZero &= m_bestME[0].mvp.checkRange(mvmin, mvmax);
1921 bTryZero &= m_bestME[1].mvp.checkRange(mvmin, mvmax);
1922 }
1923 if (bTryZero)
1924 {
1925 // coincident blocks of the two reference pictures
1926 pixel *ref0 = slice->m_mref[0][m_bestME[0].ref].fpelPlane + (pu - fencPic->m_picOrg[0]);
1927 pixel *ref1 = slice->m_mref[1][m_bestME[1].ref].fpelPlane + (pu - fencPic->m_picOrg[0]);
1928 intptr_t refStride = slice->m_mref[0][0].lumaStride;
1929
1930 primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
1931 satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
1932
     /* re-price each list: swap the bit cost of the winning MV for that of the zero MV */
1933 MV mvp0 = m_bestME[0].mvp;
1934 int mvpIdx0 = m_bestME[0].mvpIdx;
1935 uint32_t bits0 = m_bestME[0].bits - m_me.bitcost(m_bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
1936
1937 MV mvp1 = m_bestME[1].mvp;
1938 int mvpIdx1 = m_bestME[1].mvpIdx;
1939 uint32_t bits1 = m_bestME[1].bits - m_me.bitcost(m_bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
1940
1941 uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
1942
     /* in distributed mode the serial loop above did not run, so the local amvpCand
      * arrays have not been filled yet; fill them for the two winning references */
1943 if (bDistributed)
1944 {
1945 cu.fillMvpCand(puIdx, m_puAbsPartIdx, 0, m_bestME[0].ref, amvpCand[0][m_bestME[0].ref], mvc);
1946 cu.fillMvpCand(puIdx, m_puAbsPartIdx, 1, m_bestME[1].ref, amvpCand[1][m_bestME[1].ref], mvc);
1947 }
1948
1949 /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
1950 checkBestMVP(amvpCand[0][m_bestME[0].ref], mvzero, mvp0, mvpIdx0, bits0, cost);
1951 checkBestMVP(amvpCand[1][m_bestME[1].ref], mvzero, mvp1, mvpIdx1, bits1, cost);
1952
1953 if (cost < bidirCost)
1954 {
1955 bidir[0].mv = mvzero;
1956 bidir[1].mv = mvzero;
1957 bidir[0].mvp = mvp0;
1958 bidir[1].mvp = mvp1;
1959 bidir[0].mvpIdx = mvpIdx0;
1960 bidir[1].mvpIdx = mvpIdx1;
1961 bidirCost = cost;
1962 bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
1963 }
1964 }
1965 }
1966
1967 /* select best option and store into CU */
     /* merge and bidir must be strictly cheaper than both unidirectional results to win;
      * between the two lists a tie goes to L0 */
1968 if (mrgCost < bidirCost && mrgCost < m_bestME[0].cost && mrgCost < m_bestME[1].cost)
1969 {
1970 cu.m_mergeFlag[m_puAbsPartIdx] = true;
1971 cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx
1972 cu.setPUInterDir(merge.interDir, m_puAbsPartIdx, puIdx);
1973 cu.setPUMv(0, merge.mvField[0].mv, m_puAbsPartIdx, puIdx);
1974 cu.setPURefIdx(0, merge.mvField[0].refIdx, m_puAbsPartIdx, puIdx);
1975 cu.setPUMv(1, merge.mvField[1].mv, m_puAbsPartIdx, puIdx);
1976 cu.setPURefIdx(1, merge.mvField[1].refIdx, m_puAbsPartIdx, puIdx);
1977
1978 totalmebits += merge.bits;
1979 }
1980 else if (bidirCost < m_bestME[0].cost && bidirCost < m_bestME[1].cost)
1981 {
     /* bi-directional: inter dir 3, MVs/MVPs from bidir[], reference indices from m_bestME[] */
1982 lastMode = 2;
1983
1984 cu.m_mergeFlag[m_puAbsPartIdx] = false;
1985 cu.setPUInterDir(3, m_puAbsPartIdx, puIdx);
1986 cu.setPUMv(0, bidir[0].mv, m_puAbsPartIdx, puIdx);
1987 cu.setPURefIdx(0, m_bestME[0].ref, m_puAbsPartIdx, puIdx);
1988 cu.m_mvd[0][m_puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
1989 cu.m_mvpIdx[0][m_puAbsPartIdx] = bidir[0].mvpIdx;
1990
1991 cu.setPUMv(1, bidir[1].mv, m_puAbsPartIdx, puIdx);
1992 cu.setPURefIdx(1, m_bestME[1].ref, m_puAbsPartIdx, puIdx);
1993 cu.m_mvd[1][m_puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
1994 cu.m_mvpIdx[1][m_puAbsPartIdx] = bidir[1].mvpIdx;
1995
1996 totalmebits += bidirBits;
1997 }
1998 else if (m_bestME[0].cost <= m_bestME[1].cost)
1999 {
     /* L0 only: inter dir 1; the L1 fields are cleared */
2000 lastMode = 0;
2001
2002 cu.m_mergeFlag[m_puAbsPartIdx] = false;
2003 cu.setPUInterDir(1, m_puAbsPartIdx, puIdx);
2004 cu.setPUMv(0, m_bestME[0].mv, m_puAbsPartIdx, puIdx);
2005 cu.setPURefIdx(0, m_bestME[0].ref, m_puAbsPartIdx, puIdx);
2006 cu.m_mvd[0][m_puAbsPartIdx] = m_bestME[0].mv - m_bestME[0].mvp;
2007 cu.m_mvpIdx[0][m_puAbsPartIdx] = m_bestME[0].mvpIdx;
2008
2009 cu.setPURefIdx(1, REF_NOT_VALID, m_puAbsPartIdx, puIdx);
2010 cu.setPUMv(1, mvzero, m_puAbsPartIdx, puIdx);
2011
2012 totalmebits += m_bestME[0].bits;
2013 }
2014 else
2015 {
     /* L1 only: inter dir 2; the L0 fields are cleared */
2016 lastMode = 1;
2017
2018 cu.m_mergeFlag[m_puAbsPartIdx] = false;
2019 cu.setPUInterDir(2, m_puAbsPartIdx, puIdx);
2020 cu.setPUMv(1, m_bestME[1].mv, m_puAbsPartIdx, puIdx);
2021 cu.setPURefIdx(1, m_bestME[1].ref, m_puAbsPartIdx, puIdx);
2022 cu.m_mvd[1][m_puAbsPartIdx] = m_bestME[1].mv - m_bestME[1].mvp;
2023 cu.m_mvpIdx[1][m_puAbsPartIdx] = m_bestME[1].mvpIdx;
2024
2025 cu.setPURefIdx(0, REF_NOT_VALID, m_puAbsPartIdx, puIdx);
2026 cu.setPUMv(0, mvzero, m_puAbsPartIdx, puIdx);
2027
2028 totalmebits += m_bestME[1].bits;
2029 }
2030
     /* build the final prediction for this PU from the motion data just stored in the CU */
2031 prepMotionCompensation(cu, cuGeom, puIdx);
2032 motionCompensation(*predYuv, true, bChroma);
2033 }
2034
2035 interMode.sa8dBits += totalmebits;
2036 return true;
2037}
2038
2039void Search::getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3])
2040{
2041 if (cuMode == SIZE_2Nx2N)
2042 {
2043 blockBit[0] = (!bPSlice) ? 3 : 1;
2044 blockBit[1] = 3;
2045 blockBit[2] = 5;
2046 }
2047 else if (cuMode == SIZE_2NxN || cuMode == SIZE_2NxnU || cuMode == SIZE_2NxnD)
2048 {
2049 static const uint32_t listBits[2][3][3] =
2050 {
2051 { { 0, 0, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
2052 { { 5, 7, 7 }, { 7, 5, 7 }, { 9 - 3, 9 - 3, 9 - 3 } }
2053 };
2054 if (bPSlice)
2055 {
2056 blockBit[0] = 3;
2057 blockBit[1] = 0;
2058 blockBit[2] = 0;
2059 }
2060 else
2061 memcpy(blockBit, listBits[partIdx][lastMode], 3 * sizeof(uint32_t));
2062 }
2063 else if (cuMode == SIZE_Nx2N || cuMode == SIZE_nLx2N || cuMode == SIZE_nRx2N)
2064 {
2065 static const uint32_t listBits[2][3][3] =
2066 {
2067 { { 0, 2, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
2068 { { 5, 7, 7 }, { 7 - 2, 7 - 2, 9 - 2 }, { 9 - 3, 9 - 3, 9 - 3 } }
2069 };
2070 if (bPSlice)
2071 {
2072 blockBit[0] = 3;
2073 blockBit[1] = 0;
2074 blockBit[2] = 0;
2075 }
2076 else
2077 memcpy(blockBit, listBits[partIdx][lastMode], 3 * sizeof(uint32_t));
2078 }
2079 else if (cuMode == SIZE_NxN)
2080 {
2081 blockBit[0] = (!bPSlice) ? 3 : 1;
2082 blockBit[1] = 3;
2083 blockBit[2] = 5;
2084 }
2085 else
2086 {
2087 X265_CHECK(0, "getBlkBits: unknown cuMode\n");
2088 }
2089}
2090
2091/* Check if using an alternative MVP would result in a smaller MVD + signal bits */
2092void Search::checkBestMVP(MV* amvpCand, MV mv, MV& mvPred, int& outMvpIdx, uint32_t& outBits, uint32_t& outCost) const
2093{
2094 X265_CHECK(amvpCand[outMvpIdx] == mvPred, "checkBestMVP: unexpected mvPred\n");
2095
2096 int mvpIdx = !outMvpIdx;
2097 MV mvp = amvpCand[mvpIdx];
2098 int diffBits = m_me.bitcost(mv, mvp) - m_me.bitcost(mv, mvPred);
2099 if (diffBits < 0)
2100 {
2101 outMvpIdx = mvpIdx;
2102 mvPred = mvp;
2103 uint32_t origOutBits = outBits;
2104 outBits = origOutBits + diffBits;
2105 outCost = (outCost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(outBits);
2106 }
2107}
2108
2109void Search::setSearchRange(const CUData& cu, MV mvp, int merange, MV& mvmin, MV& mvmax) const
2110{
2111 cu.clipMv(mvp);
2112
2113 MV dist((int16_t)merange << 2, (int16_t)merange << 2);
2114 mvmin = mvp - dist;
2115 mvmax = mvp + dist;
2116
2117 cu.clipMv(mvmin);
2118 cu.clipMv(mvmax);
2119
2120 /* Clip search range to signaled maximum MV length.
2121 * We do not support this VUI field being changed from the default */
2122 const int maxMvLen = (1 << 15) - 1;
2123 mvmin.x = X265_MAX(mvmin.x, -maxMvLen);
2124 mvmin.y = X265_MAX(mvmin.y, -maxMvLen);
2125 mvmax.x = X265_MIN(mvmax.x, maxMvLen);
2126 mvmax.y = X265_MIN(mvmax.y, maxMvLen);
2127
2128 mvmin >>= 2;
2129 mvmax >>= 2;
2130
2131 /* conditional clipping for frame parallelism */
2132 mvmin.y = X265_MIN(mvmin.y, (int16_t)m_refLagPixels);
2133 mvmax.y = X265_MIN(mvmax.y, (int16_t)m_refLagPixels);
2134}
2135
2136/* Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
2137void Search::encodeResAndCalcRdSkipCU(Mode& interMode)
2138{
2139 CUData& cu = interMode.cu;
2140 Yuv* reconYuv = &interMode.reconYuv;
2141 const Yuv* fencYuv = interMode.fencYuv;
2142
2143 X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
2144
2145 uint32_t cuSize = 1 << cu.m_log2CUSize[0];
2146 uint32_t depth = cu.m_cuDepth[0];
2147
2148 // No residual coding : SKIP mode
2149
2150 cu.setSkipFlagSubParts(true);
2151 cu.clearCbf();
2152 cu.setTUDepthSubParts(0, 0, depth);
2153
2154 reconYuv->copyFromYuv(interMode.predYuv);
2155
2156 // Luma
2157 int part = partitionFromLog2Size(cu.m_log2CUSize[0]);
2158 interMode.distortion = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
2159 // Chroma
2160 part = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
2161 interMode.distortion += m_rdCost.scaleChromaDistCb(primitives.sse_pp[part](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
2162 interMode.distortion += m_rdCost.scaleChromaDistCr(primitives.sse_pp[part](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
2163
2164 m_entropyCoder.load(m_rqt[depth].cur);
2165 m_entropyCoder.resetBits();
2166 if (m_slice->m_pps->bTransquantBypassEnabled)
2167 m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
2168 m_entropyCoder.codeSkipFlag(cu, 0);
2169 m_entropyCoder.codeMergeIndex(cu, 0);
2170
2171 interMode.mvBits = m_entropyCoder.getNumberOfWrittenBits();
2172 interMode.coeffBits = 0;
2173 interMode.totalBits = interMode.mvBits;
2174 if (m_rdCost.m_psyRd)
2175 interMode.psyEnergy = m_rdCost.psyCost(cu.m_log2CUSize[0] - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
2176
2177 updateModeCost(interMode);
2178 m_entropyCoder.store(interMode.contexts);
2179}
2180
2181/* encode residual and calculate rate-distortion for a CU block.
2182 * Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
2183void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
2184{
2185 CUData& cu = interMode.cu;
2186 Yuv* reconYuv = &interMode.reconYuv;
2187 Yuv* predYuv = &interMode.predYuv;
2188 ShortYuv* resiYuv = &m_rqt[cuGeom.depth].tmpResiYuv;
2189 const Yuv* fencYuv = interMode.fencYuv;
2190
2191 X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
2192
2193 uint32_t log2CUSize = cu.m_log2CUSize[0];
2194 uint32_t cuSize = 1 << log2CUSize;
2195 uint32_t depth = cu.m_cuDepth[0];
2196
2197 int part = partitionFromLog2Size(log2CUSize);
2198 int cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
2199
2200 m_quant.setQPforQuant(interMode.cu);
2201
2202 resiYuv->subtract(*fencYuv, *predYuv, log2CUSize);
2203
2204 uint32_t tuDepthRange[2];
2205 cu.getInterTUQtDepthRange(tuDepthRange, 0);
2206
2207 m_entropyCoder.load(m_rqt[depth].cur);
2208
2209 Cost costs;
2210 estimateResidualQT(interMode, cuGeom, 0, depth, *resiYuv, costs, tuDepthRange);
2211
2212 if (!cu.m_tqBypass[0])
2213 {
2214 uint32_t cbf0Dist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
2215 cbf0Dist += m_rdCost.scaleChromaDistCb(primitives.sse_pp[cpart](fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
2216 cbf0Dist += m_rdCost.scaleChromaDistCr(primitives.sse_pp[cpart](fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
2217
2218 /* Consider the RD cost of not signaling any residual */
2219 m_entropyCoder.load(m_rqt[depth].cur);
2220 m_entropyCoder.resetBits();
2221 m_entropyCoder.codeQtRootCbfZero();
2222 uint32_t cbf0Bits = m_entropyCoder.getNumberOfWrittenBits();
2223
2224 uint64_t cbf0Cost;
2225 uint32_t cbf0Energy;
2226 if (m_rdCost.m_psyRd)
2227 {
2228 cbf0Energy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
2229 cbf0Cost = m_rdCost.calcPsyRdCost(cbf0Dist, cbf0Bits, cbf0Energy);
2230 }
2231 else
2232 cbf0Cost = m_rdCost.calcRdCost(cbf0Dist, cbf0Bits);
2233
2234 if (cbf0Cost < costs.rdcost)
2235 {
2236 cu.clearCbf();
2237 cu.setTUDepthSubParts(0, 0, depth);
2238 }
2239 }
2240
2241 if (cu.getQtRootCbf(0))
2242 saveResidualQTData(cu, *resiYuv, 0, depth);
2243
2244 /* calculate signal bits for inter/merge/skip coded CU */
2245 m_entropyCoder.load(m_rqt[depth].cur);
2246
2247 uint32_t coeffBits, bits;
2248 if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
2249 {
2250 cu.setSkipFlagSubParts(true);
2251
2252 /* Merge/Skip */
2253 m_entropyCoder.resetBits();
2254 if (m_slice->m_pps->bTransquantBypassEnabled)
2255 m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
2256 m_entropyCoder.codeSkipFlag(cu, 0);
2257 m_entropyCoder.codeMergeIndex(cu, 0);
2258 coeffBits = 0;
2259 bits = m_entropyCoder.getNumberOfWrittenBits();
2260 }
2261 else
2262 {
2263 m_entropyCoder.resetBits();
2264 if (m_slice->m_pps->bTransquantBypassEnabled)
2265 m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
2266 m_entropyCoder.codeSkipFlag(cu, 0);
2267 m_entropyCoder.codePredMode(cu.m_predMode[0]);
2268 m_entropyCoder.codePartSize(cu, 0, cu.m_cuDepth[0]);
2269 m_entropyCoder.codePredInfo(cu, 0);
2270 uint32_t mvBits = m_entropyCoder.getNumberOfWrittenBits();
2271
2272 bool bCodeDQP = m_slice->m_pps->bUseDQP;
2273 m_entropyCoder.codeCoeff(cu, 0, cu.m_cuDepth[0], bCodeDQP, tuDepthRange);
2274 bits = m_entropyCoder.getNumberOfWrittenBits();
2275
2276 coeffBits = bits - mvBits;
2277 }
2278
2279 m_entropyCoder.store(interMode.contexts);
2280
2281 if (cu.getQtRootCbf(0))
2282 reconYuv->addClip(*predYuv, *resiYuv, log2CUSize);
2283 else
2284 reconYuv->copyFromYuv(*predYuv);
2285
2286 // update with clipped distortion and cost (qp estimation loop uses unclipped values)
2287 uint32_t bestDist = primitives.sse_pp[part](fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
2288 bestDist += m_rdCost.scaleChromaDistCb(primitives.sse_pp[cpart](fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
2289 bestDist += m_rdCost.scaleChromaDistCr(primitives.sse_pp[cpart](fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
2290 if (m_rdCost.m_psyRd)
2291 interMode.psyEnergy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
2292
2293 interMode.totalBits = bits;
2294 interMode.distortion = bestDist;
2295 interMode.coeffBits = coeffBits;
2296 interMode.mvBits = bits - coeffBits;
2297 updateModeCost(interMode);
2298}
2299
2300void Search::generateCoeffRecon(Mode& mode, const CUGeom& cuGeom)
2301{
2302 CUData& cu = mode.cu;
2303
2304 m_quant.setQPforQuant(mode.cu);
2305
2306 if (cu.m_predMode[0] == MODE_INTER)
2307 {
2308 uint32_t tuDepthRange[2];
2309 cu.getInterTUQtDepthRange(tuDepthRange, 0);
2310
2311 residualTransformQuantInter(mode, cuGeom, 0, cu.m_cuDepth[0], tuDepthRange);
2312 if (cu.getQtRootCbf(0))
2313 mode.reconYuv.addClip(mode.predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]);
2314 else
2315 {
2316 mode.reconYuv.copyFromYuv(mode.predYuv);
2317 if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
2318 cu.setSkipFlagSubParts(true);
2319 }
2320 }
2321 else if (cu.m_predMode[0] == MODE_INTRA)
2322 {
2323 uint32_t tuDepthRange[2];
2324 cu.getIntraTUQtDepthRange(tuDepthRange, 0);
2325
2326 uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN;
2327 residualTransformQuantIntra(mode, cuGeom, initTrDepth, 0, tuDepthRange);
2328 getBestIntraModeChroma(mode, cuGeom);
2329 residualQTIntraChroma(mode, cuGeom, 0, 0);
2330 mode.reconYuv.copyFromPicYuv(*m_frame->m_reconPicYuv, cu.m_cuAddr, cuGeom.encodeIdx); // TODO:
2331 }
2332}
2333
2334void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, uint32_t depthRange[2])
2335{
2336 CUData& cu = mode.cu;
2337 X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "invalid depth\n");
2338
2339 uint32_t log2TrSize = g_maxLog2CUSize - depth;
2340 uint32_t tuDepth = depth - cu.m_cuDepth[0];
2341
2342 bool bCheckFull = log2TrSize <= depthRange[1];
2343 if (cu.m_partSize[absPartIdx] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && log2TrSize > depthRange[0])
2344 bCheckFull = false;
2345
2346 if (bCheckFull)
2347 {
2348 // code full block
2349 uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
2350 bool bCodeChroma = true;
2351 uint32_t tuDepthC = tuDepth;
2352 if (log2TrSizeC == 1)
2353 {
2354 X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444, "tuQuad check failed\n");
2355 log2TrSizeC++;
2356 tuDepthC--;
2357 uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
2358 bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
2359 }
2360
2361 uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
2362 uint32_t setCbf = 1 << tuDepth;
2363
2364 uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
2365 coeff_t *coeffCurY = cu.m_trCoeff[0] + coeffOffsetY;
2366
2367 uint32_t sizeIdx = log2TrSize - 2;
2368
2369 cu.setTUDepthSubParts(depth - cu.m_cuDepth[0], absPartIdx, depth);
2370 cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
2371
2372 ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
2373 const Yuv* fencYuv = mode.fencYuv;
2374
2375 int16_t *curResiY = resiYuv.getLumaAddr(absPartIdx);
2376 uint32_t strideResiY = resiYuv.m_size;
2377
2378 pixel *fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
2379 uint32_t numSigY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
2380
2381 if (numSigY)
2382 {
2383 m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSigY);
2384 cu.setCbfSubParts(setCbf, TEXT_LUMA, absPartIdx, depth);
2385 }
2386 else
2387 {
2388 primitives.blockfill_s[sizeIdx](curResiY, strideResiY, 0);
2389 cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, depth);
2390 }
2391
2392 if (bCodeChroma)
2393 {
2394 uint32_t sizeIdxC = log2TrSizeC - 2;
2395 uint32_t strideResiC = resiYuv.m_csize;
2396
2397 uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
2398 coeff_t *coeffCurU = cu.m_trCoeff[1] + coeffOffsetC;
2399 coeff_t *coeffCurV = cu.m_trCoeff[2] + coeffOffsetC;
2400 bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
2401
2402 TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
2403 do
2404 {
2405 uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
2406 uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
2407
2408 cu.setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
2409 cu.setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
2410
2411 int16_t* curResiU = resiYuv.getCbAddr(absPartIdxC);
2412 pixel* fencCb = const_cast<pixel*>(fencYuv->getCbAddr(absPartIdxC));
2413 uint32_t numSigU = m_quant.transformNxN(cu, fencCb, fencYuv->m_csize, curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, absPartIdxC, false);
2414 if (numSigU)
2415 {
2416 m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiU, strideResiC, coeffCurU + subTUOffset, log2TrSizeC, TEXT_CHROMA_U, false, false, numSigU);
2417 cu.setCbfPartRange(setCbf, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
2418 }
2419 else
2420 {
2421 primitives.blockfill_s[sizeIdxC](curResiU, strideResiC, 0);
2422 cu.setCbfPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
2423 }
2424
2425 int16_t* curResiV = resiYuv.getCrAddr(absPartIdxC);
2426 pixel* fencCr = const_cast<pixel*>(fencYuv->getCrAddr(absPartIdxC));
2427 uint32_t numSigV = m_quant.transformNxN(cu, fencCr, fencYuv->m_csize, curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, absPartIdxC, false);
2428 if (numSigV)
2429 {
2430 m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiV, strideResiC, coeffCurV + subTUOffset, log2TrSizeC, TEXT_CHROMA_V, false, false, numSigV);
2431 cu.setCbfPartRange(setCbf, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
2432 }
2433 else
2434 {
2435 primitives.blockfill_s[sizeIdxC](curResiV, strideResiC, 0);
2436 cu.setCbfPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
2437 }
2438 }
2439 while (tuIterator.isNextSection());
2440
2441 if (splitIntoSubTUs)
2442 {
2443 offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
2444 offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
2445 }
2446 }
2447 }
2448 else
2449 {
2450 X265_CHECK(log2TrSize > depthRange[0], "residualTransformQuantInter recursion check failure\n");
2451
2452 const uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
2453 uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
2454 for (uint32_t i = 0; i < 4; i++)
2455 {
2456 residualTransformQuantInter(mode, cuGeom, absPartIdx + i * qPartNumSubdiv, depth + 1, depthRange);
2457 ycbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_LUMA, tuDepth + 1);
2458 ucbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_U, tuDepth + 1);
2459 vcbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_V, tuDepth + 1);
2460 }
2461 for (uint32_t i = 0; i < 4 * qPartNumSubdiv; i++)
2462 {
2463 cu.m_cbf[TEXT_LUMA][absPartIdx + i] |= ycbf << tuDepth;
2464 cu.m_cbf[TEXT_CHROMA_U][absPartIdx + i] |= ucbf << tuDepth;
2465 cu.m_cbf[TEXT_CHROMA_V][absPartIdx + i] |= vcbf << tuDepth;
2466 }
2467 }
2468}
2469
/* Rate-distortion estimate for the inter residual quad-tree node located at
 * (absPartIdx, depth).
 *
 * If the transform size is allowed by depthRange[1], the node is coded as one
 * ("full") TU: luma and, where applicable, chroma are transformed/quantized,
 * every component's coded cost is compared against the cost of zeroing it
 * (this comparison is bypassed when cu.m_tqBypass[0] is set), and
 * transform-skip is optionally tried.  If the size is still above
 * depthRange[0], the four sub-TUs are evaluated recursively.  The costs of the
 * selected alternative are accumulated (+=) into outCosts and the CU's
 * tuDepth / cbf / transform-skip fields are left describing that alternative.
 *
 * mode       - supplies the CU (mode.cu) and the source pixels (mode.fencYuv)
 * cuGeom     - only forwarded to the recursive calls
 * absPartIdx - partition index of this TU inside the CU
 * depth      - absolute depth; the transform log2 size is g_maxLog2CUSize - depth
 * resiYuv    - residual that is transformed and measured against
 * outCosts   - accumulator for distortion, rdcost, bits and energy
 * depthRange - [0]: a split is tried only while log2TrSize is above this value
 *              [1]: full-TU coding is tried only while log2TrSize is <= this value */
2470void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& outCosts, uint32_t depthRange[2])
2471{
2472 CUData& cu = mode.cu;
2473 uint32_t log2TrSize = g_maxLog2CUSize - depth;
2474
2475 bool bCheckSplit = log2TrSize > depthRange[0];
2476 bool bCheckFull = log2TrSize <= depthRange[1];
2477
     // a CU that is not 2Nx2N is not coded as a single TU at the CU's own depth
     // whenever a split is possible
2478 if (cu.m_partSize[absPartIdx] != SIZE_2Nx2N && depth == cu.m_cuDepth[absPartIdx] && bCheckSplit)
2479 bCheckFull = false;
2480
2481 X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
2482 X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
2483
2484 uint32_t tuDepth = depth - cu.m_cuDepth[0];
2485 uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
2486 bool bCodeChroma = true;
2487 uint32_t tuDepthC = tuDepth;
     // 4x4 luma TU in a format other than 4:4:4: chroma uses the next larger size
     // and the parent's TU depth, and is only processed by the partition that is
     // aligned to the parent TU (bCodeChroma)
2488 if ((log2TrSize == 2) && !(m_csp == X265_CSP_I444))
2489 {
2490 log2TrSizeC++;
2491 tuDepthC--;
2492 uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
2493 bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
2494 }
2495
2496 // code full block
2497 Cost fullCost;
2498 fullCost.rdcost = MAX_INT64;
2499
     // per-component, per-sub-TU working state for the full-TU trial; index [..][1]
     // is only written when the chroma TU is split into two sub-TUs (4:2:2)
2500 uint8_t cbfFlag[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
2501 uint32_t numSig[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
2502 uint32_t singleBitsComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2503 uint32_t singleDistComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2504 uint32_t singlePsyEnergyComp[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2505 uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2506 uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, MAX_INT64}, {MAX_INT64, MAX_INT64} };
2507
     // snapshot of the entropy coder state; every trial below reloads it
2508 m_entropyCoder.store(m_rqt[depth].rqtRoot);
2509
2510 uint32_t trSize = 1 << log2TrSize;
2511 const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
2512 uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
2513 const Yuv* fencYuv = mode.fencYuv;
2514
2515 // code full block
2516 if (bCheckFull)
2517 {
2518 uint32_t trSizeC = 1 << log2TrSizeC;
2519 int partSize = partitionFromLog2Size(log2TrSize);
2520 int partSizeC = partitionFromLog2Size(log2TrSizeC);
2521 const uint32_t qtLayer = log2TrSize - 2;
2522 uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
2523 coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
2524
     // transform-skip is tried only when the PPS enables it, the CU is not
     // transquant-bypass and the transform size does not exceed MAX_LOG2_TS_SIZE
2525 bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0];
2526 bool checkTransformSkipY = checkTransformSkip && log2TrSize <= MAX_LOG2_TS_SIZE;
2527 bool checkTransformSkipC = checkTransformSkip && log2TrSizeC <= MAX_LOG2_TS_SIZE;
2528
2529 cu.setTUDepthSubParts(depth - cu.m_cuDepth[0], absPartIdx, depth);
2530 cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
2531
2532 if (m_bEnableRDOQ)
2533 m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
2534
     // luma: transform + quantize; cbf is set when transformNxN returns non-zero
2535 pixel *fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
2536 int16_t *resi = resiYuv.getLumaAddr(absPartIdx);
2537 numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
2538 cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0];
2539
     // bits needed for the luma cbf plus its coefficients
2540 m_entropyCoder.resetBits();
2541 m_entropyCoder.codeQtCbf(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
2542 if (cbfFlag[TEXT_LUMA][0])
2543 m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
2544 singleBitsComp[TEXT_LUMA][0] = m_entropyCoder.getNumberOfWrittenBits();
2545
     // the bit counter is not reset for chroma; each chroma (sub-)TU's bits are
     // obtained as the difference to the previous running total
2546 uint32_t singleBitsPrev = singleBitsComp[TEXT_LUMA][0];
2547
2548 if (bCodeChroma)
2549 {
2550 uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
2551 for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
2552 {
2553 coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
     // for 4:2:2 the iterator visits two sub-TUs (VERTICAL_SPLIT), otherwise one
2554 TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
2555
2556 do
2557 {
2558 uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
2559 uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
2560
2561 cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
2562
2563 if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
2564 m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
2565
2566 fenc = const_cast<pixel*>(fencYuv->getChromaAddr(chromaId, absPartIdxC));
2567 resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
2568 numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
2569 cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
2570
2571 m_entropyCoder.codeQtCbf(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
2572 if (cbfFlag[chromaId][tuIterator.section])
2573 m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
2574
2575 uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits();
2576 singleBitsComp[chromaId][tuIterator.section] = newBits - singleBitsPrev;
2577
2578 singleBitsPrev = newBits;
2579 }
2580 while (tuIterator.isNextSection());
2581 }
2582 }
2583
2584 const uint32_t numCoeffY = 1 << (log2TrSize * 2);
2585 const uint32_t numCoeffC = 1 << (log2TrSizeC * 2);
2586
2587 X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n");
     // distortion / psy energy of coding luma as all-zero: the input residual
     // measured on its own, respectively against a zero block
2588 uint32_t distY = primitives.ssd_s[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size);
2589 uint32_t psyEnergyY = 0;
2590 if (m_rdCost.m_psyRd)
2591 psyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, (int16_t*)zeroShort, 0);
2592
2593 int16_t *curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
2594 uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size;
2595
     // luma has coefficients: reconstruct the residual into the RQT layer buffer
     // and decide between keeping the coefficients and zeroing the block
2596 if (cbfFlag[TEXT_LUMA][0])
2597 {
2598 m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], curResiY, strideResiY, coeffCurY, log2TrSize, TEXT_LUMA, false, false, numSig[TEXT_LUMA][0]); //this is for inter mode only
2599
2600 const uint32_t nonZeroDistY = primitives.sse_ss[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
2601 uint32_t nonZeroPsyEnergyY = 0;
2602 if (m_rdCost.m_psyRd)
2603 nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
2604
     // with transquant bypass the coded result is always kept
2605 if (cu.m_tqBypass[0])
2606 {
2607 distY = nonZeroDistY;
2608 psyEnergyY = nonZeroPsyEnergyY;
2609 }
2610 else
2611 {
     // cost of the coded block versus cost of signalling a zero cbf
2612 uint64_t singleCostY = 0;
2613 if (m_rdCost.m_psyRd)
2614 singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0], nonZeroPsyEnergyY);
2615 else
2616 singleCostY = m_rdCost.calcRdCost(nonZeroDistY, singleBitsComp[TEXT_LUMA][0]);
2617 m_entropyCoder.resetBits();
2618 m_entropyCoder.codeQtCbfZero(TEXT_LUMA, tuDepth);
2619 const uint32_t nullBitsY = m_entropyCoder.getNumberOfWrittenBits();
2620 uint64_t nullCostY = 0;
2621 if (m_rdCost.m_psyRd)
2622 nullCostY = m_rdCost.calcPsyRdCost(distY, nullBitsY, psyEnergyY);
2623 else
2624 nullCostY = m_rdCost.calcRdCost(distY, nullBitsY);
2625 if (nullCostY < singleCostY)
2626 {
     // zeroing is cheaper: drop the cbf (coefficients are only cleared in
     // checked/debug builds, the cbf alone marks them unused)
2627 cbfFlag[TEXT_LUMA][0] = 0;
2628#if CHECKED_BUILD || _DEBUG
2629 memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY);
2630#endif
2631 if (checkTransformSkipY)
2632 minCost[TEXT_LUMA][0] = nullCostY;
2633 }
2634 else
2635 {
2636 distY = nonZeroDistY;
2637 psyEnergyY = nonZeroPsyEnergyY;
2638 if (checkTransformSkipY)
2639 minCost[TEXT_LUMA][0] = singleCostY;
2640 }
2641 }
2642 }
     // cbf already zero: the all-zero cost becomes the reference for the
     // transform-skip comparison further down
2643 else if (checkTransformSkipY)
2644 {
2645 m_entropyCoder.resetBits();
2646 m_entropyCoder.codeQtCbfZero(TEXT_LUMA, tuDepth);
2647 const uint32_t nullBitsY = m_entropyCoder.getNumberOfWrittenBits();
2648 if (m_rdCost.m_psyRd)
2649 minCost[TEXT_LUMA][0] = m_rdCost.calcPsyRdCost(distY, nullBitsY, psyEnergyY);
2650 else
2651 minCost[TEXT_LUMA][0] = m_rdCost.calcRdCost(distY, nullBitsY);
2652 }
2653
2654 singleDistComp[TEXT_LUMA][0] = distY;
2655 singlePsyEnergyComp[TEXT_LUMA][0] = psyEnergyY;
     // a zero cbf means the stored reconstructed residual must be zero as well
2656 if (!cbfFlag[TEXT_LUMA][0])
2657 primitives.blockfill_s[partSize](curResiY, strideResiY, 0);
2658 cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
2659
     // same keep-or-zero decision for every chroma (sub-)TU
2660 if (bCodeChroma)
2661 {
2662 uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
2663 uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
2664 for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
2665 {
     // NOTE(review): psyEnergyC is only ever assigned from a non-zero
     // reconstruction; for an all-zero chroma block it stays at its current
     // value (0, or the value left by the previous section since it is declared
     // outside the do-loop), unlike psyEnergyY above - confirm intended
2666 uint32_t distC = 0, psyEnergyC = 0;
2667 coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
2668 TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
2669
2670 do
2671 {
2672 uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
2673 uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
2674
2675 int16_t *curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
2676
     // NOTE(review): scaleChromaDistCb is applied to both TEXT_CHROMA_U and
     // TEXT_CHROMA_V in this function - confirm this is intended
2677 distC = m_rdCost.scaleChromaDistCb(primitives.ssd_s[log2TrSizeC - 2](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize));
2678
2679 if (cbfFlag[chromaId][tuIterator.section])
2680 {
2681 m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], curResiC, strideResiC, coeffCurC + subTUOffset,
2682 log2TrSizeC, (TextType)chromaId, false, false, numSig[chromaId][tuIterator.section]);
2683 uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
2684 const uint32_t nonZeroDistC = m_rdCost.scaleChromaDistCb(dist);
2685 uint32_t nonZeroPsyEnergyC = 0;
2686 if (m_rdCost.m_psyRd)
2687 nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
2688
2689 if (cu.m_tqBypass[0])
2690 {
2691 distC = nonZeroDistC;
2692 psyEnergyC = nonZeroPsyEnergyC;
2693 }
2694 else
2695 {
2696 uint64_t singleCostC = 0;
2697 if (m_rdCost.m_psyRd)
2698 singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section], nonZeroPsyEnergyC);
2699 else
2700 singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section]);
2701 m_entropyCoder.resetBits();
     // NOTE(review): this path passes tuDepth while the zero-cbf path below
     // passes tuDepthC; the two differ only for 4x4 luma with subsampled
     // chroma - confirm which one is intended
2702 m_entropyCoder.codeQtCbfZero((TextType)chromaId, tuDepth);
2703 const uint32_t nullBitsC = m_entropyCoder.getNumberOfWrittenBits();
2704 uint64_t nullCostC = 0;
2705 if (m_rdCost.m_psyRd)
2706 nullCostC = m_rdCost.calcPsyRdCost(distC, nullBitsC, psyEnergyC);
2707 else
2708 nullCostC = m_rdCost.calcRdCost(distC, nullBitsC);
2709 if (nullCostC < singleCostC)
2710 {
2711 cbfFlag[chromaId][tuIterator.section] = 0;
2712#if CHECKED_BUILD || _DEBUG
2713 memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
2714#endif
2715 if (checkTransformSkipC)
2716 minCost[chromaId][tuIterator.section] = nullCostC;
2717 }
2718 else
2719 {
2720 distC = nonZeroDistC;
2721 psyEnergyC = nonZeroPsyEnergyC;
2722 if (checkTransformSkipC)
2723 minCost[chromaId][tuIterator.section] = singleCostC;
2724 }
2725 }
2726 }
2727 else if (checkTransformSkipC)
2728 {
2729 m_entropyCoder.resetBits();
2730 m_entropyCoder.codeQtCbfZero((TextType)chromaId, tuDepthC);
2731 const uint32_t nullBitsC = m_entropyCoder.getNumberOfWrittenBits();
2732 if (m_rdCost.m_psyRd)
2733 minCost[chromaId][tuIterator.section] = m_rdCost.calcPsyRdCost(distC, nullBitsC, psyEnergyC);
2734 else
2735 minCost[chromaId][tuIterator.section] = m_rdCost.calcRdCost(distC, nullBitsC);
2736 }
2737
2738 singleDistComp[chromaId][tuIterator.section] = distC;
2739 singlePsyEnergyComp[chromaId][tuIterator.section] = psyEnergyC;
2740
2741 if (!cbfFlag[chromaId][tuIterator.section])
2742 primitives.blockfill_s[partSizeC](curResiC, strideResiC, 0);
2743
2744 cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
2745 }
2746 while (tuIterator.isNextSection());
2747 }
2748 }
2749
     // luma transform-skip trial: quantize again with the TU flagged as
     // transform-skip into scratch buffers, and adopt the result only when it has
     // coefficients and its cost does not exceed minCost[TEXT_LUMA][0]
2750 if (checkTransformSkipY)
2751 {
2752 uint32_t nonZeroDistY = 0;
2753 uint32_t nonZeroPsyEnergyY = 0;
2754 uint64_t singleCostY = MAX_INT64;
2755
2756 ALIGN_VAR_32(coeff_t, tsCoeffY[MAX_TS_SIZE * MAX_TS_SIZE]);
2757 ALIGN_VAR_32(int16_t, tsResiY[MAX_TS_SIZE * MAX_TS_SIZE]);
2758
2759 m_entropyCoder.load(m_rqt[depth].rqtRoot);
2760
2761 cu.setTransformSkipSubParts(1, TEXT_LUMA, absPartIdx, depth);
2762
2763 if (m_bEnableRDOQ)
2764 m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
2765
2766 fenc = const_cast<pixel*>(fencYuv->getLumaAddr(absPartIdx));
2767 resi = resiYuv.getLumaAddr(absPartIdx);
2768 uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, tsCoeffY, log2TrSize, TEXT_LUMA, absPartIdx, true);
2769
2770 if (numSigTSkipY)
2771 {
2772 m_entropyCoder.resetBits();
2773 m_entropyCoder.codeQtCbf(!!numSigTSkipY, TEXT_LUMA, tuDepth);
2774 m_entropyCoder.codeCoeffNxN(cu, tsCoeffY, absPartIdx, log2TrSize, TEXT_LUMA);
2775 const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits();
2776
2777 m_quant.invtransformNxN(cu.m_tqBypass[absPartIdx], tsResiY, trSize, tsCoeffY, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);
2778
2779 nonZeroDistY = primitives.sse_ss[partSize](resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, tsResiY, trSize);
2780
2781 if (m_rdCost.m_psyRd)
2782 {
2783 nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, tsResiY, trSize);
2784 singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroPsyEnergyY);
2785 }
2786 else
2787 singleCostY = m_rdCost.calcRdCost(nonZeroDistY, skipSingleBitsY);
2788 }
2789
     // rejected: clear the flag again; accepted: take over cost terms,
     // coefficients and reconstructed residual from the scratch buffers
2790 if (!numSigTSkipY || minCost[TEXT_LUMA][0] < singleCostY)
2791 cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
2792 else
2793 {
2794 singleDistComp[TEXT_LUMA][0] = nonZeroDistY;
2795 singlePsyEnergyComp[TEXT_LUMA][0] = nonZeroPsyEnergyY;
2796 cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
2797 bestTransformMode[TEXT_LUMA][0] = 1;
2798 memcpy(coeffCurY, tsCoeffY, sizeof(coeff_t) * numCoeffY);
2799 primitives.square_copy_ss[partSize](curResiY, strideResiY, tsResiY, trSize);
2800 }
2801
2802 cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
2803 }
2804
     // chroma transform-skip trial, same accept/reject rule per (sub-)TU
2805 if (bCodeChroma && checkTransformSkipC)
2806 {
2807 uint32_t nonZeroDistC = 0, nonZeroPsyEnergyC = 0;
2808 uint64_t singleCostC = MAX_INT64;
2809 uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
2810 uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
2811
2812 m_entropyCoder.load(m_rqt[depth].rqtRoot);
2813
2814 for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
2815 {
2816 coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
2817 TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
2818
2819 do
2820 {
2821 uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
2822 uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2);
2823
2824 int16_t *curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
2825
2826 ALIGN_VAR_32(coeff_t, tsCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
2827 ALIGN_VAR_32(int16_t, tsResiC[MAX_TS_SIZE * MAX_TS_SIZE]);
2828
2829 cu.setTransformSkipPartRange(1, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
2830
2831 if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
2832 m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
2833
2834 fenc = const_cast<pixel*>(fencYuv->getChromaAddr(chromaId, absPartIdxC));
2835 resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
2836 uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, tsCoeffC, log2TrSizeC, (TextType)chromaId, absPartIdxC, true);
2837
2838 m_entropyCoder.resetBits();
2839 singleBitsComp[chromaId][tuIterator.section] = 0;
2840
2841 if (numSigTSkipC)
2842 {
2843 m_entropyCoder.codeQtCbf(!!numSigTSkipC, (TextType)chromaId, tuDepth);
2844 m_entropyCoder.codeCoeffNxN(cu, tsCoeffC, absPartIdxC, log2TrSizeC, (TextType)chromaId);
2845 singleBitsComp[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
2846
2847 m_quant.invtransformNxN(cu.m_tqBypass[absPartIdxC], tsResiC, trSizeC, tsCoeffC,
2848 log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
2849 uint32_t dist = primitives.sse_ss[partSizeC](resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC);
2850 nonZeroDistC = m_rdCost.scaleChromaDistCb(dist);
2851 if (m_rdCost.m_psyRd)
2852 {
2853 nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC);
2854 singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section], nonZeroPsyEnergyC);
2855 }
2856 else
2857 singleCostC = m_rdCost.calcRdCost(nonZeroDistC, singleBitsComp[chromaId][tuIterator.section]);
2858 }
2859
2860 if (!numSigTSkipC || minCost[chromaId][tuIterator.section] < singleCostC)
2861 cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
2862 else
2863 {
2864 singleDistComp[chromaId][tuIterator.section] = nonZeroDistC;
2865 singlePsyEnergyComp[chromaId][tuIterator.section] = nonZeroPsyEnergyC;
2866 cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC;
2867 bestTransformMode[chromaId][tuIterator.section] = 1;
2868 memcpy(coeffCurC + subTUOffset, tsCoeffC, sizeof(coeff_t) * numCoeffC);
2869 primitives.square_copy_ss[partSizeC](curResiC, strideResiC, tsResiC, trSizeC);
2870 }
2871
2872 cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
2873 }
2874 while (tuIterator.isNextSection());
2875 }
2876 }
2877
     // all per-component decisions are made; restart from the saved coder state
     // and code the whole TU once, in this order: subdiv flag, chroma cbfs,
     // luma cbf, luma coefficients, chroma coefficients - to get the total bits
2878 m_entropyCoder.load(m_rqt[depth].rqtRoot);
2879
2880 m_entropyCoder.resetBits();
2881
2882 if (log2TrSize > depthRange[0])
2883 m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
2884
2885 if (bCodeChroma)
2886 {
2887 for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
2888 {
2889 if (!splitIntoSubTUs)
2890 m_entropyCoder.codeQtCbf(cbfFlag[chromaId][0], (TextType)chromaId, tuDepth);
2891 else
2892 {
2893 offsetSubTUCBFs(cu, (TextType)chromaId, tuDepth, absPartIdx);
2894 for (uint32_t subTU = 0; subTU < 2; subTU++)
2895 m_entropyCoder.codeQtCbf(cbfFlag[chromaId][subTU], (TextType)chromaId, tuDepth);
2896 }
2897 }
2898 }
2899
2900 m_entropyCoder.codeQtCbf(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
2901 if (cbfFlag[TEXT_LUMA][0])
2902 m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
2903
2904 if (bCodeChroma)
2905 {
2906 uint32_t subTUSize = 1 << (log2TrSizeC * 2);
2907 uint32_t partIdxesPerSubTU = absPartIdxStep >> 1;
2908 uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
2909
2910 for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
2911 {
2912 coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
2913 if (!splitIntoSubTUs)
2914 {
2915 if (cbfFlag[chromaId][0])
2916 m_entropyCoder.codeCoeffNxN(cu, coeffCurC, absPartIdx, log2TrSizeC, (TextType)chromaId);
2917 }
2918 else
2919 {
2920 for (uint32_t subTU = 0; subTU < 2; subTU++)
2921 {
2922 if (cbfFlag[chromaId][subTU])
2923 m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTU * subTUSize, absPartIdx + subTU * partIdxesPerSubTU, log2TrSizeC, (TextType)chromaId);
2924 }
2925 }
2926 }
2927 }
2928
     // total the full-TU cost; chroma entries that were never written are still
     // zero from their initialisation, so summing both sub-TU slots is harmless
2929 fullCost.distortion += singleDistComp[TEXT_LUMA][0];
2930 fullCost.energy += singlePsyEnergyComp[TEXT_LUMA][0];// need to check we need to add chroma also
2931 for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)
2932 {
2933 fullCost.distortion += singleDistComp[TEXT_CHROMA_U][subTUIndex];
2934 fullCost.distortion += singleDistComp[TEXT_CHROMA_V][subTUIndex];
2935 }
2936
2937 fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
2938 if (m_rdCost.m_psyRd)
2939 fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
2940 else
2941 fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
2942 }
2943
2944 // code sub-blocks
2945 if (bCheckSplit)
2946 {
     // keep the coder state reached after full-TU coding in rqtTest and rewind
     // to the root state for the split trial
2947 if (bCheckFull)
2948 {
2949 m_entropyCoder.store(m_rqt[depth].rqtTest);
2950 m_entropyCoder.load(m_rqt[depth].rqtRoot);
2951 }
2952
2953 Cost splitCost;
2954 const uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
2955 uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
     // recurse into the four quadrants, collecting their costs in splitCost and
     // OR-ing together their cbfs
2956 for (uint32_t i = 0; i < 4; ++i)
2957 {
2958 estimateResidualQT(mode, cuGeom, absPartIdx + i * qPartNumSubdiv, depth + 1, resiYuv, splitCost, depthRange);
2959 ycbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_LUMA, tuDepth + 1);
2960 ucbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_U, tuDepth + 1);
2961 vcbf |= cu.getCbf(absPartIdx + i * qPartNumSubdiv, TEXT_CHROMA_V, tuDepth + 1);
2962 }
     // publish the combined child cbfs in this depth's bit of every partition
     // covered by this node
2963 for (uint32_t i = 0; i < 4 * qPartNumSubdiv; ++i)
2964 {
2965 cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth;
2966 cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
2967 cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
2968 }
2969
2970 m_entropyCoder.load(m_rqt[depth].rqtRoot);
2971 m_entropyCoder.resetBits();
2972
     // count the bits of the split alternative: one pass for subdivision flags
     // and cbfs, then one coefficient pass per component
2973 encodeResidualQT(cu, absPartIdx, depth, true, TEXT_LUMA, depthRange);
2974 encodeResidualQT(cu, absPartIdx, depth, false, TEXT_LUMA, depthRange);
2975 encodeResidualQT(cu, absPartIdx, depth, false, TEXT_CHROMA_U, depthRange);
2976 encodeResidualQT(cu, absPartIdx, depth, false, TEXT_CHROMA_V, depthRange);
2977
2978 splitCost.bits = m_entropyCoder.getNumberOfWrittenBits();
2979
2980 if (m_rdCost.m_psyRd)
2981 splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
2982 else
2983 splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
2984
     // the split is only eligible when some child has a non-zero cbf, or when
     // the full TU was not evaluated at all (fullCost.rdcost is then MAX_INT64)
2985 if (ycbf || ucbf || vcbf || !bCheckFull)
2986 {
2987 if (splitCost.rdcost < fullCost.rdcost)
2988 {
2989 outCosts.distortion += splitCost.distortion;
2990 outCosts.rdcost += splitCost.rdcost;
2991 outCosts.bits += splitCost.bits;
2992 outCosts.energy += splitCost.energy;
2993 return;
2994 }
2995 else
     // NOTE(review): the full TU wins here, yet splitCost.energy is added to
     // outCosts and fullCost.energy is added as well at the end of the
     // function - confirm this double accumulation is intended
2996 outCosts.energy += splitCost.energy;
2997 }
2998
     // split rejected: put back the transform-skip decisions made for the full
     // TU (the recursion may have overwritten them) and the coder state saved
     // after full-TU coding
2999 cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth);
3000 if (bCodeChroma)
3001 {
3002 const uint32_t numberOfSections = splitIntoSubTUs ? 2 : 1;
3003
3004 uint32_t partIdxesPerSubTU = absPartIdxStep >> (splitIntoSubTUs ? 1 : 0);
3005 for (uint32_t subTUIndex = 0; subTUIndex < numberOfSections; subTUIndex++)
3006 {
3007 const uint32_t subTUPartIdx = absPartIdx + (subTUIndex * partIdxesPerSubTU);
3008
3009 cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_U][subTUIndex], TEXT_CHROMA_U, subTUPartIdx, partIdxesPerSubTU);
3010 cu.setTransformSkipPartRange(bestTransformMode[TEXT_CHROMA_V][subTUIndex], TEXT_CHROMA_V, subTUPartIdx, partIdxesPerSubTU);
3011 }
3012 }
3013 X265_CHECK(bCheckFull, "check-full must be set\n");
3014 m_entropyCoder.load(m_rqt[depth].rqtTest);
3015 }
3016
     // commit the full-TU decision to the CU: TU depth and luma cbf ...
3017 cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
3018 cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
3019
     // ... and chroma cbfs; for 4:2:2 each sub-TU stores its own cbf one bit
     // above the combined cbf of both sub-TUs
3020 if (bCodeChroma)
3021 {
3022 uint32_t numberOfSections = splitIntoSubTUs ? 2 : 1;
3023 uint32_t partIdxesPerSubTU = absPartIdxStep >> (splitIntoSubTUs ? 1 : 0);
3024
3025 for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
3026 {
3027 for (uint32_t subTUIndex = 0; subTUIndex < numberOfSections; subTUIndex++)
3028 {
3029 const uint32_t subTUPartIdx = absPartIdx + (subTUIndex * partIdxesPerSubTU);
3030
3031 if (splitIntoSubTUs)
3032 {
3033 uint8_t combinedSubTUCBF = cbfFlag[chromaId][0] | cbfFlag[chromaId][1];
3034 cu.setCbfPartRange(((cbfFlag[chromaId][subTUIndex] << 1) | combinedSubTUCBF) << tuDepth, (TextType)chromaId, subTUPartIdx, partIdxesPerSubTU);
3035 }
3036 else
3037 cu.setCbfPartRange(cbfFlag[chromaId][subTUIndex] << tuDepth, (TextType)chromaId, subTUPartIdx, partIdxesPerSubTU);
3038 }
3039 }
3040 }
3041
3042 outCosts.distortion += fullCost.distortion;
3043 outCosts.rdcost += fullCost.rdcost;
3044 outCosts.bits += fullCost.bits;
3045 outCosts.energy += fullCost.energy;
3046}
3047
3048void Search::encodeResidualQT(CUData& cu, uint32_t absPartIdx, const uint32_t depth, bool bSubdivAndCbf, TextType ttype, uint32_t depthRange[2])
3049{
3050 X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
3051 X265_CHECK(cu.m_predMode[absPartIdx] != MODE_INTRA, "encodeResidualQT() with intra block\n");
3052
3053 const uint32_t curTuDepth = depth - cu.m_cuDepth[0];
3054 const uint32_t tuDepth = cu.m_tuDepth[absPartIdx];
3055 const bool bSubdiv = curTuDepth != tuDepth;
3056 const uint32_t log2TrSize = g_maxLog2CUSize - depth;
3057
3058 uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
3059
3060 const bool splitIntoSubTUs = (m_csp == X265_CSP_I422);
3061
3062 if (bSubdivAndCbf && log2TrSize <= depthRange[1] && log2TrSize > depthRange[0])
3063 m_entropyCoder.codeTransformSubdivFlag(bSubdiv, 5 - log2TrSize);
3064
3065 bool mCodeAll = true;
3066 uint32_t trWidthC = 1 << log2TrSizeC;
3067 uint32_t trHeightC = splitIntoSubTUs ? (trWidthC << 1) : trWidthC;
3068
3069 const uint32_t numPels = trWidthC * trHeightC;
3070 if (numPels < (MIN_TU_SIZE * MIN_TU_SIZE))
3071 mCodeAll = false;
3072
3073 if (bSubdivAndCbf)
3074 {
3075 const bool bFirstCbfOfCU = curTuDepth == 0;
3076 if (bFirstCbfOfCU || mCodeAll)
3077 {
3078 uint32_t absPartIdxStep = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + curTuDepth) << 1);
3079 if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth - 1))
3080 m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_U, curTuDepth, !bSubdiv);
3081 if (bFirstCbfOfCU || cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth - 1))
3082 m_entropyCoder.codeQtCbf(cu, absPartIdx, absPartIdxStep, trWidthC, trHeightC, TEXT_CHROMA_V, curTuDepth, !bSubdiv);
3083 }
3084 else
3085 {
3086 X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, curTuDepth - 1), "chroma CBF not matching\n");
3087 X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, curTuDepth - 1), "chroma CBF not matching\n");
3088 }
3089 }
3090
3091 if (!bSubdiv)
3092 {
3093 // Luma
3094 const uint32_t qtLayer = log2TrSize - 2;
3095 uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
3096 coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
3097
3098 // Chroma
3099 bool bCodeChroma = true;
3100 uint32_t tuDepthC = tuDepth;
3101 if ((log2TrSize == 2) && !(m_csp == X265_CSP_I444))
3102 {
3103 log2TrSizeC++;
3104 tuDepthC--;
3105 uint32_t qpdiv = NUM_CU_PARTITIONS >> ((depth - 1) << 1);
3106 bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
3107 }
3108
3109 if (bSubdivAndCbf)
3110 m_entropyCoder.codeQtCbf(cu, absPartIdx, TEXT_LUMA, tuDepth);
3111 else
3112 {
3113 if (ttype == TEXT_LUMA && cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
3114 m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
3115
3116 if (bCodeChroma)
3117 {
3118 uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
3119 coeff_t* coeffCurU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
3120 coeff_t* coeffCurV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
3121
3122 if (!splitIntoSubTUs)
3123 {
3124 if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
3125 m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
3126 if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
3127 m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
3128 }
3129 else
3130 {
3131 uint32_t partIdxesPerSubTU = NUM_CU_PARTITIONS >> (((cu.m_cuDepth[absPartIdx] + tuDepthC) << 1) + 1);
3132 uint32_t subTUSize = 1 << (log2TrSizeC * 2);
3133 if (ttype == TEXT_CHROMA_U && cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth))
3134 {
3135 if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
3136 m_entropyCoder.codeCoeffNxN(cu, coeffCurU, absPartIdx, log2TrSizeC, TEXT_CHROMA_U);
3137 if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, tuDepth + 1))
3138 m_entropyCoder.codeCoeffNxN(cu, coeffCurU + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_U);
3139 }
3140 if (ttype == TEXT_CHROMA_V && cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth))
3141 {
3142 if (cu.getCbf(absPartIdx, ttype, tuDepth + 1))
3143 m_entropyCoder.codeCoeffNxN(cu, coeffCurV, absPartIdx, log2TrSizeC, TEXT_CHROMA_V);
3144 if (cu.getCbf(absPartIdx + partIdxesPerSubTU, ttype, tuDepth + 1))
3145 m_entropyCoder.codeCoeffNxN(cu, coeffCurV + subTUSize, absPartIdx + partIdxesPerSubTU, log2TrSizeC, TEXT_CHROMA_V);
3146 }
3147 }
3148 }
3149 }
3150 }
3151 else
3152 {
3153 if (bSubdivAndCbf || cu.getCbf(absPartIdx, ttype, curTuDepth))
3154 {
3155 const uint32_t qpartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
3156 for (uint32_t i = 0; i < 4; ++i)
3157 encodeResidualQT(cu, absPartIdx + i * qpartNumSubdiv, depth + 1, bSubdivAndCbf, ttype, depthRange);
3158 }
3159 }
3160}
3161
3162void Search::saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t depth)
3163{
3164 X265_CHECK(cu.m_cuDepth[0] == cu.m_cuDepth[absPartIdx], "depth not matching\n");
3165 const uint32_t curTrMode = depth - cu.m_cuDepth[0];
3166 const uint32_t tuDepth = cu.m_tuDepth[absPartIdx];
3167
3168 if (curTrMode < tuDepth)
3169 {
3170 uint32_t qPartNumSubdiv = NUM_CU_PARTITIONS >> ((depth + 1) << 1);
3171 for (uint32_t i = 0; i < 4; i++, absPartIdx += qPartNumSubdiv)
3172 saveResidualQTData(cu, resiYuv, absPartIdx, depth + 1);
3173 return;
3174 }
3175
3176 const uint32_t log2TrSize = g_maxLog2CUSize - depth;
3177 const uint32_t qtLayer = log2TrSize - 2;
3178
3179 uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
3180 bool bCodeChroma = true;
3181 uint32_t tuDepthC = tuDepth;
3182 if (log2TrSizeC == 1)
3183 {
3184 X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444, "tuQuad check failed\n");
3185 log2TrSizeC++;
3186 tuDepthC--;
3187 uint32_t qpdiv = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + tuDepthC) << 1);
3188 bCodeChroma = ((absPartIdx & (qpdiv - 1)) == 0);
3189 }
3190
3191 m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize);
3192
3193 uint32_t numCoeffY = 1 << (log2TrSize * 2);
3194 uint32_t coeffOffsetY = absPartIdx << LOG2_UNIT_SIZE * 2;
3195 coeff_t* coeffSrcY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
3196 coeff_t* coeffDstY = cu.m_trCoeff[0] + coeffOffsetY;
3197 memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY);
3198
3199 if (bCodeChroma)
3200 {
3201 m_rqt[qtLayer].resiQtYuv.copyPartToPartChroma(resiYuv, absPartIdx, log2TrSizeC + m_hChromaShift);
3202
3203 uint32_t numCoeffC = 1 << (log2TrSizeC * 2 + (m_csp == X265_CSP_I422));
3204 uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
3205
3206 coeff_t* coeffSrcU = m_rqt[qtLayer].coeffRQT[1] + coeffOffsetC;
3207 coeff_t* coeffSrcV = m_rqt[qtLayer].coeffRQT[2] + coeffOffsetC;
3208 coeff_t* coeffDstU = cu.m_trCoeff[1] + coeffOffsetC;
3209 coeff_t* coeffDstV = cu.m_trCoeff[2] + coeffOffsetC;
3210 memcpy(coeffDstU, coeffSrcU, sizeof(coeff_t) * numCoeffC);
3211 memcpy(coeffDstV, coeffSrcV, sizeof(coeff_t) * numCoeffC);
3212 }
3213}
3214
3215/* returns the number of bits required to signal a non-most-probable mode.
3216 * on return mpms contains bitmap of most probable modes */
3217uint32_t Search::getIntraRemModeBits(CUData& cu, uint32_t absPartIdx, uint32_t preds[3], uint64_t& mpms) const
3218{
3219 cu.getIntraDirLumaPredictor(absPartIdx, preds);
3220
3221 mpms = 0;
3222 for (int i = 0; i < 3; ++i)
3223 mpms |= ((uint64_t)1 << preds[i]);
3224
3225 return m_entropyCoder.bitsIntraModeNonMPM();
3226}
3227
3228/* swap the current mode/cost with the mode with the highest cost in the
3229 * current candidate list, if its cost is better (maintain a top N list) */
3230void Search::updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList)
3231{
3232 uint32_t maxIndex = 0;
3233 uint64_t maxValue = 0;
3234
3235 for (int i = 0; i < maxCandCount; i++)
3236 {
3237 if (maxValue < candCostList[i])
3238 {
3239 maxValue = candCostList[i];
3240 maxIndex = i;
3241 }
3242 }
3243
3244 if (cost < maxValue)
3245 {
3246 candCostList[maxIndex] = cost;
3247 candModeList[maxIndex] = mode;
3248 }
3249}