1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
4 * Authors: Steve Borho <steve@borho.org>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20 * This program is also available under a commercial proprietary license.
21 * For more information, contact us at license @ x265.com.
22 *****************************************************************************/
25 #include "primitives.h"
36 #pragma warning(disable: 4800) // 'uint8_t' : forcing value to bool 'true' or 'false' (performance warning)
37 #pragma warning(disable: 4244) // '=' : conversion from 'int' to 'uint8_t', possible loss of data)
/* Zero-filled, 32-byte-aligned pixel buffer (one MAX_CU_SIZE row); static member of Search.
 * Const and initialized to all zeros -- presumably used as a dummy/neutral reference row; verify at call sites. */
40 ALIGN_VAR_32(const pixel
, Search::zeroPixel
[MAX_CU_SIZE
]) = { 0 };
/* Zero-filled, 32-byte-aligned int16_t buffer (one MAX_CU_SIZE row); static member of Search.
 * Companion to zeroPixel for 16-bit (residual/coefficient-width) data. */
41 ALIGN_VAR_32(const int16_t, Search::zeroShort
[MAX_CU_SIZE
]) = { 0 };
/* Default constructor: zero the RQT working-buffer table and null the per-plane
 * scratch pointers so a later teardown is safe even if initSearch() never ran.
 * NOTE(review): several original lines (braces and, presumably, additional member
 * initialization around orig lines 51-56) are missing from this chunk. */
43 Search::Search() : JobProvider(NULL
)
// zero all per-depth RQT scratch state in one shot
45 memset(m_rqt
, 0, sizeof(m_rqt
));
// three texture planes: Y (0), Cb (1), Cr (2)
47 for (int i
= 0; i
< 3; i
++)
49 m_qtTempTransformSkipFlag
[i
] = NULL
;
50 m_qtTempCbf
[i
] = NULL
;
// no worker-thread motion-estimation jobs outstanding yet
57 m_bJobsQueued
= false;
58 m_totalNumME
= m_numAcquiredME
= m_numCompletedME
= 0;
/* One-time allocation/initialization of the search context from the encoder
 * parameter set: configures RDOQ, ME, psy-RD, then allocates the per-qtLayer
 * coefficient/recon buffers and per-depth prediction/residual scratch Yuvs.
 * Returns true on success ('ok' accumulates every sub-allocation result;
 * CHECKED_MALLOC presumably jumps to a 'fail:' label that is missing from
 * this chunk along with the closing 'return ok;'). */
61 bool Search::initSearch(const x265_param
& param
, ScalingList
& scalingList
)
// per-encode toggles derived from the active parameter set
64 m_bEnableRDOQ
= param
.rdLevel
>= 4;
65 m_bFrameParallel
= param
.frameNumThreads
> 1;
66 m_numLayers
= g_log2Size
[param
.maxCUSize
] - 2;
68 m_rdCost
.setPsyRdScale(param
.psyRd
);
69 m_me
.setSearchMethod(param
.searchMethod
);
70 m_me
.setSubpelRefine(param
.subpelRefine
);
// quantizer owns RDOQ / psy-RDOQ state; failures accumulate into 'ok'
72 bool ok
= m_quant
.init(m_bEnableRDOQ
, param
.psyRdoq
, scalingList
, m_entropyCoder
);
// NOTE(review): reads m_param here while the rest of this function uses the
// 'param' argument; m_param is presumably assigned on a line missing from this
// chunk (near the function's opening brace) -- verify before relying on it
73 if (m_param
->noiseReduction
)
74 ok
&= m_quant
.allocNoiseReduction(param
);
76 ok
&= Predict::allocBuffers(param
.internalCsp
); /* sets m_hChromaShift & m_vChromaShift */
78 /* When frame parallelism is active, only 'refLagPixels' of reference frames will be guaranteed
79 * available for motion reference. See refLagRows in FrameEncoder::compressCTURows() */
80 m_refLagPixels
= m_bFrameParallel
? param
.searchRange
: param
.sourceHeight
;
// coefficient buffer sizing: full-size luma block plus two chroma planes
// scaled down by the configured horizontal/vertical subsampling
82 uint32_t sizeL
= 1 << (g_maxLog2CUSize
* 2);
83 uint32_t sizeC
= sizeL
>> (m_hChromaShift
+ m_vChromaShift
);
84 uint32_t numPartitions
= NUM_CU_PARTITIONS
;
86 /* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32
87 * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
88 * which are reconstructed at each depth are valid. At the end, the transform depth table
89 * is walked and the coeff and recon at the correct depths are collected */
90 for (uint32_t i
= 0; i
<= m_numLayers
; i
++)
// one allocation per layer holds all three planes; [1]/[2] alias into [0]
92 CHECKED_MALLOC(m_rqt
[i
].coeffRQT
[0], coeff_t
, sizeL
+ sizeC
* 2);
93 m_rqt
[i
].coeffRQT
[1] = m_rqt
[i
].coeffRQT
[0] + sizeL
;
94 m_rqt
[i
].coeffRQT
[2] = m_rqt
[i
].coeffRQT
[0] + sizeL
+ sizeC
;
95 ok
&= m_rqt
[i
].reconQtYuv
.create(g_maxCUSize
, param
.internalCsp
);
96 ok
&= m_rqt
[i
].resiQtYuv
.create(g_maxCUSize
, param
.internalCsp
);
99 /* the rest of these buffers are indexed per-depth */
100 for (uint32_t i
= 0; i
<= g_maxCUDepth
; i
++)
102 int cuSize
= g_maxCUSize
>> i
;
103 ok
&= m_rqt
[i
].tmpResiYuv
.create(cuSize
, param
.internalCsp
);
104 ok
&= m_rqt
[i
].tmpPredYuv
.create(cuSize
, param
.internalCsp
);
105 ok
&= m_rqt
[i
].bidirPredYuv
[0].create(cuSize
, param
.internalCsp
);
106 ok
&= m_rqt
[i
].bidirPredYuv
[1].create(cuSize
, param
.internalCsp
);
// per-part CBF and transform-skip scratch: three planes packed into one
// allocation each; [1] and [2] alias into the [0] block
109 CHECKED_MALLOC(m_qtTempCbf
[0], uint8_t, numPartitions
* 3);
110 m_qtTempCbf
[1] = m_qtTempCbf
[0] + numPartitions
;
111 m_qtTempCbf
[2] = m_qtTempCbf
[0] + numPartitions
* 2;
112 CHECKED_MALLOC(m_qtTempTransformSkipFlag
[0], uint8_t, numPartitions
* 3);
113 m_qtTempTransformSkipFlag
[1] = m_qtTempTransformSkipFlag
[0] + numPartitions
;
114 m_qtTempTransformSkipFlag
[2] = m_qtTempTransformSkipFlag
[0] + numPartitions
* 2;
/* NOTE(review): the function header is missing from this chunk; this is the
 * teardown body (presumably Search::~Search or a destroy method) that releases
 * exactly what initSearch() allocated: per-qtLayer coefficient blocks and recon
 * Yuvs, per-depth scratch Yuvs, and the packed CBF/transform-skip arrays. */
124 for (uint32_t i
= 0; i
<= m_numLayers
; i
++)
// coeffRQT[1]/[2] alias into the [0] allocation, so only [0] is freed
126 X265_FREE(m_rqt
[i
].coeffRQT
[0]);
127 m_rqt
[i
].reconQtYuv
.destroy();
128 m_rqt
[i
].resiQtYuv
.destroy();
131 for (uint32_t i
= 0; i
<= g_maxCUDepth
; i
++)
133 m_rqt
[i
].tmpResiYuv
.destroy();
134 m_rqt
[i
].tmpPredYuv
.destroy();
135 m_rqt
[i
].bidirPredYuv
[0].destroy();
136 m_rqt
[i
].bidirPredYuv
[1].destroy();
// [1] and [2] of each array point into the [0] block; free [0] only
139 X265_FREE(m_qtTempCbf
[0]);
140 X265_FREE(m_qtTempTransformSkipFlag
[0]);
/* Propagate the slice QP into the RD-cost lambda tables.
 * x265_emms() clears the x87 FPU state first because the lambda tables are
 * floating point (see the inline TODO). */
143 void Search::setQP(const Slice
& slice
, int qp
)
145 x265_emms(); /* TODO: if the lambda tables were ints, this would not be necessary */
147 m_rdCost
.setQP(slice
, qp
);
150 #if CHECKED_BUILD || _DEBUG
/* Debug/checked builds only: mark every saved entropy-coder context from
 * 'fromDepth' down as invalid, so a later load() of a context that was never
 * store()d trips an assertion instead of silently reusing stale state. */
151 void Search::invalidateContexts(int fromDepth
)
153 /* catch reads without previous writes */
154 for (int d
= fromDepth
; d
< NUM_FULL_DEPTH
; d
++)
// all four per-depth context snapshots are invalidated
156 m_rqt
[d
].cur
.markInvalid();
157 m_rqt
[d
].rqtTemp
.markInvalid();
158 m_rqt
[d
].rqtRoot
.markInvalid();
159 m_rqt
[d
].rqtTest
.markInvalid();
/* Release builds: no-op counterpart of the CHECKED_BUILD/_DEBUG variant above. */
163 void Search::invalidateContexts(int) {}
/* Recursively code the chroma CBF (coded-block-flag) hierarchy for the
 * transform quadtree rooted at absPartIdx. At each level, U and V CBFs are
 * coded only when the parent level did not already signal them zero; when the
 * luma TU is subdivided, recursion descends into the four sub-TUs.
 * NOTE(review): several original lines are missing from this chunk (e.g. the
 * consumers of mCodeAll and the subdivision guard around the recursion). */
166 void Search::codeSubdivCbfQTChroma(const CUData
& cu
, uint32_t trDepth
, uint32_t absPartIdx
, uint32_t absPartIdxStep
, uint32_t width
, uint32_t height
)
168 uint32_t fullDepth
= cu
.m_cuDepth
[0] + trDepth
;
169 uint32_t tuDepthL
= cu
.m_tuDepth
[absPartIdx
];
// subdiv != 0 when the luma transform tree splits below this depth
170 uint32_t subdiv
= tuDepthL
> trDepth
;
171 uint32_t log2TrSize
= g_maxLog2CUSize
- fullDepth
;
173 bool mCodeAll
= true;
// chroma sample count for this TU after horizontal/vertical subsampling
174 const uint32_t numPels
= 1 << (log2TrSize
* 2 - m_hChromaShift
- m_vChromaShift
);
// chroma block would fall below the minimum TU area at this depth
// (the statement(s) taken in that case are missing from this chunk)
175 if (numPels
< (MIN_TU_SIZE
* MIN_TU_SIZE
))
// U CBF: code only if not already implied zero by the parent level
180 if (!trDepth
|| cu
.getCbf(absPartIdx
, TEXT_CHROMA_U
, trDepth
- 1))
181 m_entropyCoder
.codeQtCbf(cu
, absPartIdx
, absPartIdxStep
, (width
>> m_hChromaShift
), (height
>> m_vChromaShift
), TEXT_CHROMA_U
, trDepth
, !subdiv
);
// V CBF: same rule as U
183 if (!trDepth
|| cu
.getCbf(absPartIdx
, TEXT_CHROMA_V
, trDepth
- 1))
184 m_entropyCoder
.codeQtCbf(cu
, absPartIdx
, absPartIdxStep
, (width
>> m_hChromaShift
), (height
>> m_vChromaShift
), TEXT_CHROMA_V
, trDepth
, !subdiv
);
// descend one quadtree level: the part-index step shrinks by 4
189 absPartIdxStep
>>= 2;
193 uint32_t qtPartNum
= NUM_CU_PARTITIONS
>> ((fullDepth
+ 1) << 1);
// recurse into the four split sub-TUs
194 for (uint32_t part
= 0; part
< 4; part
++)
195 codeSubdivCbfQTChroma(cu
, trDepth
+ 1, absPartIdx
+ part
* qtPartNum
, absPartIdxStep
, width
, height
);
/* Recursively emit the chroma transform coefficients for one plane (ttype is
 * TEXT_CHROMA_U or TEXT_CHROMA_V) of the transform quadtree rooted at
 * absPartIdx, reading from the per-qtLayer coeffRQT scratch buffers.
 * NOTE(review): lines are missing from this chunk, including the early return
 * after the CBF check and the 4:2:0 size adjustments (log2TrSizeC/trDepthC)
 * that normally follow the 'log2TrSizeC == 1' check -- verify against the
 * original before editing. */
199 void Search::codeCoeffQTChroma(const CUData
& cu
, uint32_t trDepth
, uint32_t absPartIdx
, TextType ttype
)
// nothing to code when this plane's CBF is clear at this depth
201 if (!cu
.getCbf(absPartIdx
, ttype
, trDepth
))
204 uint32_t fullDepth
= cu
.m_cuDepth
[0] + trDepth
;
205 uint32_t tuDepthL
= cu
.m_tuDepth
[absPartIdx
];
// TU splits below this depth: recurse into the four quadrants
207 if (tuDepthL
> trDepth
)
209 uint32_t qtPartNum
= NUM_CU_PARTITIONS
>> ((fullDepth
+ 1) << 1);
210 for (uint32_t part
= 0; part
< 4; part
++)
211 codeCoeffQTChroma(cu
, trDepth
+ 1, absPartIdx
+ part
* qtPartNum
, ttype
);
216 uint32_t log2TrSize
= g_maxLog2CUSize
- fullDepth
;
218 uint32_t trDepthC
= trDepth
;
219 uint32_t log2TrSizeC
= log2TrSize
- m_hChromaShift
;
// 2x2 chroma TUs do not exist; coding happens at the parent level instead
221 if (log2TrSizeC
== 1)
223 X265_CHECK(log2TrSize
== 2 && m_csp
!= X265_CSP_I444
&& trDepth
, "transform size too small\n");
226 uint32_t qpdiv
= NUM_CU_PARTITIONS
>> ((cu
.m_cuDepth
[0] + trDepthC
) << 1);
// only the first part of the shared parent codes the merged chroma block
227 bool bFirstQ
= ((absPartIdx
& (qpdiv
- 1)) == 0);
232 uint32_t qtLayer
= log2TrSize
- 2;
// non-4:2:2 layout: a single chroma TU per plane at this node
234 if (m_csp
!= X265_CSP_I422
)
236 uint32_t shift
= (m_csp
== X265_CSP_I420
) ? 2 : 0;
237 uint32_t coeffOffset
= absPartIdx
<< (LOG2_UNIT_SIZE
* 2 - shift
);
238 coeff_t
* coeff
= m_rqt
[qtLayer
].coeffRQT
[ttype
] + coeffOffset
;
239 m_entropyCoder
.codeCoeffNxN(cu
, coeff
, absPartIdx
, log2TrSizeC
, ttype
);
// 4:2:2 layout: two vertically stacked sub-TUs per plane, each with its own CBF
243 uint32_t coeffOffset
= absPartIdx
<< (LOG2_UNIT_SIZE
* 2 - 1);
244 coeff_t
* coeff
= m_rqt
[qtLayer
].coeffRQT
[ttype
] + coeffOffset
;
245 uint32_t subTUSize
= 1 << (log2TrSizeC
* 2);
246 uint32_t partIdxesPerSubTU
= NUM_CU_PARTITIONS
>> (((cu
.m_cuDepth
[absPartIdx
] + trDepthC
) << 1) + 1);
247 if (cu
.getCbf(absPartIdx
, ttype
, trDepth
+ 1))
248 m_entropyCoder
.codeCoeffNxN(cu
, coeff
, absPartIdx
, log2TrSizeC
, ttype
);
249 if (cu
.getCbf(absPartIdx
+ partIdxesPerSubTU
, ttype
, trDepth
+ 1))
250 m_entropyCoder
.codeCoeffNxN(cu
, coeff
+ subTUSize
, absPartIdx
+ partIdxesPerSubTU
, log2TrSizeC
, ttype
);
254 void Search::codeIntraLumaQT(Mode
& mode
, const CUGeom
& cuGeom
, uint32_t trDepth
, uint32_t absPartIdx
, bool bAllowSplit
, Cost
& outCost
, uint32_t depthRange
[2])
256 uint32_t fullDepth
= mode
.cu
.m_cuDepth
[0] + trDepth
;
257 uint32_t log2TrSize
= g_maxLog2CUSize
- fullDepth
;
258 uint32_t qtLayer
= log2TrSize
- 2;
259 uint32_t sizeIdx
= log2TrSize
- 2;
260 bool mightNotSplit
= log2TrSize
<= depthRange
[1];
261 bool mightSplit
= (log2TrSize
> depthRange
[0]) && (bAllowSplit
|| !mightNotSplit
);
263 /* If maximum RD penalty, force spits at TU size 32x32 if SPS allows TUs of 16x16 */
264 if (m_param
->rdPenalty
== 2 && m_slice
->m_sliceType
!= I_SLICE
&& log2TrSize
== 5 && depthRange
[0] <= 4)
266 mightNotSplit
= false;
270 CUData
& cu
= mode
.cu
;
275 pixel
* reconQt
= m_rqt
[qtLayer
].reconQtYuv
.getLumaAddr(absPartIdx
);
276 uint32_t reconQtStride
= m_rqt
[qtLayer
].reconQtYuv
.m_size
;
281 m_entropyCoder
.store(m_rqt
[fullDepth
].rqtRoot
);
283 pixel
* fenc
= const_cast<pixel
*>(mode
.fencYuv
->getLumaAddr(absPartIdx
));
284 pixel
* pred
= mode
.predYuv
.getLumaAddr(absPartIdx
);
285 int16_t* residual
= m_rqt
[cuGeom
.depth
].tmpResiYuv
.getLumaAddr(absPartIdx
);
286 uint32_t stride
= mode
.fencYuv
->m_size
;
288 // init availability pattern
289 uint32_t lumaPredMode
= cu
.m_lumaIntraDir
[absPartIdx
];
290 initAdiPattern(cu
, cuGeom
, absPartIdx
, trDepth
, lumaPredMode
);
292 // get prediction signal
293 predIntraLumaAng(lumaPredMode
, pred
, stride
, log2TrSize
);
295 cu
.setTransformSkipSubParts(0, TEXT_LUMA
, absPartIdx
, fullDepth
);
296 cu
.setTUDepthSubParts(trDepth
, absPartIdx
, fullDepth
);
298 uint32_t coeffOffsetY
= absPartIdx
<< (LOG2_UNIT_SIZE
* 2);
299 coeff_t
* coeffY
= m_rqt
[qtLayer
].coeffRQT
[0] + coeffOffsetY
;
301 // store original entropy coding status
303 m_entropyCoder
.estBit(m_entropyCoder
.m_estBitsSbac
, log2TrSize
, true);
305 primitives
.calcresidual
[sizeIdx
](fenc
, pred
, residual
, stride
);
307 uint32_t numSig
= m_quant
.transformNxN(cu
, fenc
, stride
, residual
, stride
, coeffY
, log2TrSize
, TEXT_LUMA
, absPartIdx
, false);
310 m_quant
.invtransformNxN(cu
.m_tqBypass
[0], residual
, stride
, coeffY
, log2TrSize
, TEXT_LUMA
, true, false, numSig
);
311 primitives
.luma_add_ps
[sizeIdx
](reconQt
, reconQtStride
, pred
, residual
, stride
, stride
);
314 // no coded residual, recon = pred
315 primitives
.square_copy_pp
[sizeIdx
](reconQt
, reconQtStride
, pred
, stride
);
317 bCBF
= !!numSig
<< trDepth
;
318 cu
.setCbfSubParts(bCBF
, TEXT_LUMA
, absPartIdx
, fullDepth
);
319 fullCost
.distortion
= primitives
.sse_pp
[sizeIdx
](reconQt
, reconQtStride
, fenc
, stride
);
321 m_entropyCoder
.resetBits();
324 if (!cu
.m_slice
->isIntra())
326 if (cu
.m_slice
->m_pps
->bTransquantBypassEnabled
)
327 m_entropyCoder
.codeCUTransquantBypassFlag(cu
.m_tqBypass
[0]);
328 m_entropyCoder
.codeSkipFlag(cu
, 0);
329 m_entropyCoder
.codePredMode(cu
.m_predMode
[0]);
332 m_entropyCoder
.codePartSize(cu
, 0, cu
.m_cuDepth
[0]);
334 if (cu
.m_partSize
[0] == SIZE_2Nx2N
)
337 m_entropyCoder
.codeIntraDirLumaAng(cu
, 0, false);
341 uint32_t qtNumParts
= cuGeom
.numPartitions
>> 2;
344 for (uint32_t part
= 0; part
< 4; part
++)
345 m_entropyCoder
.codeIntraDirLumaAng(cu
, part
* qtNumParts
, false);
347 else if (!(absPartIdx
& (qtNumParts
- 1)))
348 m_entropyCoder
.codeIntraDirLumaAng(cu
, absPartIdx
, false);
350 if (log2TrSize
!= depthRange
[0])
351 m_entropyCoder
.codeTransformSubdivFlag(0, 5 - log2TrSize
);
353 m_entropyCoder
.codeQtCbf(cu
, absPartIdx
, TEXT_LUMA
, cu
.m_tuDepth
[absPartIdx
]);
355 if (cu
.getCbf(absPartIdx
, TEXT_LUMA
, trDepth
))
356 m_entropyCoder
.codeCoeffNxN(cu
, coeffY
, absPartIdx
, log2TrSize
, TEXT_LUMA
);
358 fullCost
.bits
= m_entropyCoder
.getNumberOfWrittenBits();
360 if (m_param
->rdPenalty
&& log2TrSize
== 5 && m_slice
->m_sliceType
!= I_SLICE
)
363 if (m_rdCost
.m_psyRd
)
365 fullCost
.energy
= m_rdCost
.psyCost(sizeIdx
, fenc
, mode
.fencYuv
->m_size
, reconQt
, reconQtStride
);
366 fullCost
.rdcost
= m_rdCost
.calcPsyRdCost(fullCost
.distortion
, fullCost
.bits
, fullCost
.energy
);
369 fullCost
.rdcost
= m_rdCost
.calcRdCost(fullCost
.distortion
, fullCost
.bits
);
372 fullCost
.rdcost
= MAX_INT64
;
378 m_entropyCoder
.store(m_rqt
[fullDepth
].rqtTest
); // save state after full TU encode
379 m_entropyCoder
.load(m_rqt
[fullDepth
].rqtRoot
); // prep state of split encode
383 uint32_t qPartsDiv
= NUM_CU_PARTITIONS
>> ((fullDepth
+ 1) << 1);
384 uint32_t absPartIdxSub
= absPartIdx
;
386 int checkTransformSkip
= m_slice
->m_pps
->bTransformSkipEnabled
&& (log2TrSize
- 1) <= MAX_LOG2_TS_SIZE
&& !cu
.m_tqBypass
[0];
387 if (m_param
->bEnableTSkipFast
)
388 checkTransformSkip
&= cu
.m_partSize
[absPartIdx
] == SIZE_NxN
;
392 for (uint32_t subPartIdx
= 0; subPartIdx
< 4; subPartIdx
++, absPartIdxSub
+= qPartsDiv
)
394 if (checkTransformSkip
)
395 codeIntraLumaTSkip(mode
, cuGeom
, trDepth
+ 1, absPartIdxSub
, splitCost
);
397 codeIntraLumaQT(mode
, cuGeom
, trDepth
+ 1, absPartIdxSub
, bAllowSplit
, splitCost
, depthRange
);
399 cbf
|= cu
.getCbf(absPartIdxSub
, TEXT_LUMA
, trDepth
+ 1);
401 for (uint32_t offs
= 0; offs
< 4 * qPartsDiv
; offs
++)
402 cu
.m_cbf
[0][absPartIdx
+ offs
] |= (cbf
<< trDepth
);
404 if (mightNotSplit
&& log2TrSize
!= depthRange
[0])
406 /* If we could have coded this TU depth, include cost of subdiv flag */
407 m_entropyCoder
.resetBits();
408 m_entropyCoder
.codeTransformSubdivFlag(1, 5 - log2TrSize
);
409 splitCost
.bits
+= m_entropyCoder
.getNumberOfWrittenBits();
411 if (m_rdCost
.m_psyRd
)
412 splitCost
.rdcost
= m_rdCost
.calcPsyRdCost(splitCost
.distortion
, splitCost
.bits
, splitCost
.energy
);
414 splitCost
.rdcost
= m_rdCost
.calcRdCost(splitCost
.distortion
, splitCost
.bits
);
417 if (splitCost
.rdcost
< fullCost
.rdcost
)
419 outCost
.rdcost
+= splitCost
.rdcost
;
420 outCost
.distortion
+= splitCost
.distortion
;
421 outCost
.bits
+= splitCost
.bits
;
422 outCost
.energy
+= splitCost
.energy
;
427 // recover entropy state of full-size TU encode
428 m_entropyCoder
.load(m_rqt
[fullDepth
].rqtTest
);
430 // recover transform index and Cbf values
431 cu
.setTUDepthSubParts(trDepth
, absPartIdx
, fullDepth
);
432 cu
.setCbfSubParts(bCBF
, TEXT_LUMA
, absPartIdx
, fullDepth
);
433 cu
.setTransformSkipSubParts(0, TEXT_LUMA
, absPartIdx
, fullDepth
);
437 // set reconstruction for next intra prediction blocks if full TU prediction won
438 pixel
* picReconY
= m_frame
->m_reconPicYuv
->getLumaAddr(cu
.m_cuAddr
, cuGeom
.encodeIdx
+ absPartIdx
);
439 intptr_t picStride
= m_frame
->m_reconPicYuv
->m_stride
;
440 primitives
.square_copy_pp
[sizeIdx
](picReconY
, picStride
, reconQt
, reconQtStride
);
442 outCost
.rdcost
+= fullCost
.rdcost
;
443 outCost
.distortion
+= fullCost
.distortion
;
444 outCost
.bits
+= fullCost
.bits
;
445 outCost
.energy
+= fullCost
.energy
;
448 void Search::codeIntraLumaTSkip(Mode
& mode
, const CUGeom
& cuGeom
, uint32_t trDepth
, uint32_t absPartIdx
, Cost
& outCost
)
450 uint32_t fullDepth
= mode
.cu
.m_cuDepth
[0] + trDepth
;
451 uint32_t log2TrSize
= g_maxLog2CUSize
- fullDepth
;
452 uint32_t tuSize
= 1 << log2TrSize
;
454 X265_CHECK(tuSize
== MAX_TS_SIZE
, "transform skip is only possible at 4x4 TUs\n");
456 CUData
& cu
= mode
.cu
;
457 Yuv
* predYuv
= &mode
.predYuv
;
458 const Yuv
* fencYuv
= mode
.fencYuv
;
461 fullCost
.rdcost
= MAX_INT64
;
465 pixel
* fenc
= const_cast<pixel
*>(fencYuv
->getLumaAddr(absPartIdx
));
466 pixel
* pred
= predYuv
->getLumaAddr(absPartIdx
);
467 int16_t* residual
= m_rqt
[cuGeom
.depth
].tmpResiYuv
.getLumaAddr(absPartIdx
);
468 uint32_t stride
= fencYuv
->m_size
;
469 int sizeIdx
= log2TrSize
- 2;
471 // init availability pattern
472 uint32_t lumaPredMode
= cu
.m_lumaIntraDir
[absPartIdx
];
473 initAdiPattern(cu
, cuGeom
, absPartIdx
, trDepth
, lumaPredMode
);
475 // get prediction signal
476 predIntraLumaAng(lumaPredMode
, pred
, stride
, log2TrSize
);
478 cu
.setTUDepthSubParts(trDepth
, absPartIdx
, fullDepth
);
480 uint32_t qtLayer
= log2TrSize
- 2;
481 uint32_t coeffOffsetY
= absPartIdx
<< (LOG2_UNIT_SIZE
* 2);
482 coeff_t
* coeffY
= m_rqt
[qtLayer
].coeffRQT
[0] + coeffOffsetY
;
483 pixel
* reconQt
= m_rqt
[qtLayer
].reconQtYuv
.getLumaAddr(absPartIdx
);
484 uint32_t reconQtStride
= m_rqt
[qtLayer
].reconQtYuv
.m_size
;
486 // store original entropy coding status
487 m_entropyCoder
.store(m_rqt
[fullDepth
].rqtRoot
);
490 m_entropyCoder
.estBit(m_entropyCoder
.m_estBitsSbac
, log2TrSize
, true);
492 ALIGN_VAR_32(coeff_t
, tsCoeffY
[MAX_TS_SIZE
* MAX_TS_SIZE
]);
493 ALIGN_VAR_32(pixel
, tsReconY
[MAX_TS_SIZE
* MAX_TS_SIZE
]);
495 int checkTransformSkip
= 1;
496 for (int useTSkip
= 0; useTSkip
<= checkTransformSkip
; useTSkip
++)
499 uint32_t tmpEnergy
= 0;
501 coeff_t
* coeff
= (useTSkip
? tsCoeffY
: coeffY
);
502 pixel
* tmpRecon
= (useTSkip
? tsReconY
: reconQt
);
503 uint32_t tmpReconStride
= (useTSkip
? MAX_TS_SIZE
: reconQtStride
);
505 primitives
.calcresidual
[sizeIdx
](fenc
, pred
, residual
, stride
);
507 uint32_t numSig
= m_quant
.transformNxN(cu
, fenc
, stride
, residual
, stride
, coeff
, log2TrSize
, TEXT_LUMA
, absPartIdx
, useTSkip
);
510 m_quant
.invtransformNxN(cu
.m_tqBypass
[0], residual
, stride
, coeff
, log2TrSize
, TEXT_LUMA
, true, useTSkip
, numSig
);
511 primitives
.luma_add_ps
[sizeIdx
](tmpRecon
, tmpReconStride
, pred
, residual
, stride
, stride
);
515 /* do not allow tskip if CBF=0, pretend we did not try tskip */
516 checkTransformSkip
= 0;
520 // no residual coded, recon = pred
521 primitives
.square_copy_pp
[sizeIdx
](tmpRecon
, tmpReconStride
, pred
, stride
);
523 uint32_t tmpDist
= primitives
.sse_pp
[sizeIdx
](tmpRecon
, tmpReconStride
, fenc
, stride
);
525 cu
.setTransformSkipSubParts(useTSkip
, TEXT_LUMA
, absPartIdx
, fullDepth
);
526 cu
.setCbfSubParts((!!numSig
) << trDepth
, TEXT_LUMA
, absPartIdx
, fullDepth
);
529 m_entropyCoder
.load(m_rqt
[fullDepth
].rqtRoot
);
531 m_entropyCoder
.resetBits();
534 if (!cu
.m_slice
->isIntra())
536 if (cu
.m_slice
->m_pps
->bTransquantBypassEnabled
)
537 m_entropyCoder
.codeCUTransquantBypassFlag(cu
.m_tqBypass
[0]);
538 m_entropyCoder
.codeSkipFlag(cu
, 0);
539 m_entropyCoder
.codePredMode(cu
.m_predMode
[0]);
542 m_entropyCoder
.codePartSize(cu
, 0, cu
.m_cuDepth
[0]);
544 if (cu
.m_partSize
[0] == SIZE_2Nx2N
)
547 m_entropyCoder
.codeIntraDirLumaAng(cu
, 0, false);
551 uint32_t qtNumParts
= cuGeom
.numPartitions
>> 2;
554 for (uint32_t part
= 0; part
< 4; part
++)
555 m_entropyCoder
.codeIntraDirLumaAng(cu
, part
* qtNumParts
, false);
557 else if (!(absPartIdx
& (qtNumParts
- 1)))
558 m_entropyCoder
.codeIntraDirLumaAng(cu
, absPartIdx
, false);
560 m_entropyCoder
.codeTransformSubdivFlag(0, 5 - log2TrSize
);
562 m_entropyCoder
.codeQtCbf(cu
, absPartIdx
, TEXT_LUMA
, cu
.m_tuDepth
[absPartIdx
]);
564 if (cu
.getCbf(absPartIdx
, TEXT_LUMA
, trDepth
))
565 m_entropyCoder
.codeCoeffNxN(cu
, coeff
, absPartIdx
, log2TrSize
, TEXT_LUMA
);
567 uint32_t tmpBits
= m_entropyCoder
.getNumberOfWrittenBits();
570 m_entropyCoder
.store(m_rqt
[fullDepth
].rqtTemp
);
572 if (m_rdCost
.m_psyRd
)
574 tmpEnergy
= m_rdCost
.psyCost(sizeIdx
, fenc
, fencYuv
->m_size
, tmpRecon
, tmpReconStride
);
575 tmpCost
= m_rdCost
.calcPsyRdCost(tmpDist
, tmpBits
, tmpEnergy
);
578 tmpCost
= m_rdCost
.calcRdCost(tmpDist
, tmpBits
);
580 if (tmpCost
< fullCost
.rdcost
)
584 fullCost
.rdcost
= tmpCost
;
585 fullCost
.distortion
= tmpDist
;
586 fullCost
.bits
= tmpBits
;
587 fullCost
.energy
= tmpEnergy
;
593 memcpy(coeffY
, tsCoeffY
, sizeof(coeff_t
) << (log2TrSize
* 2));
594 primitives
.square_copy_pp
[sizeIdx
](reconQt
, reconQtStride
, tsReconY
, tuSize
);
596 else if (checkTransformSkip
)
598 cu
.setTransformSkipSubParts(0, TEXT_LUMA
, absPartIdx
, fullDepth
);
599 cu
.setCbfSubParts(bCBF
<< trDepth
, TEXT_LUMA
, absPartIdx
, fullDepth
);
600 m_entropyCoder
.load(m_rqt
[fullDepth
].rqtTemp
);
603 // set reconstruction for next intra prediction blocks
604 pixel
* picReconY
= m_frame
->m_reconPicYuv
->getLumaAddr(cu
.m_cuAddr
, cuGeom
.encodeIdx
+ absPartIdx
);
605 intptr_t picStride
= m_frame
->m_reconPicYuv
->m_stride
;
606 primitives
.square_copy_pp
[sizeIdx
](picReconY
, picStride
, reconQt
, reconQtStride
);
608 outCost
.rdcost
+= fullCost
.rdcost
;
609 outCost
.distortion
+= fullCost
.distortion
;
610 outCost
.bits
+= fullCost
.bits
;
611 outCost
.energy
+= fullCost
.energy
;
614 /* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */
615 void Search::residualTransformQuantIntra(Mode
& mode
, const CUGeom
& cuGeom
, uint32_t trDepth
, uint32_t absPartIdx
, uint32_t depthRange
[2])
617 CUData
& cu
= mode
.cu
;
619 uint32_t fullDepth
= cu
.m_cuDepth
[0] + trDepth
;
620 uint32_t log2TrSize
= g_maxLog2CUSize
- fullDepth
;
621 bool bCheckFull
= log2TrSize
<= depthRange
[1];
623 X265_CHECK(m_slice
->m_sliceType
!= I_SLICE
, "residualTransformQuantIntra not intended for I slices\n");
625 /* we still respect rdPenalty == 2, we can forbid 32x32 intra TU. rdPenalty = 1 is impossible
626 * since we are not measuring RD cost */
627 if (m_param
->rdPenalty
== 2 && log2TrSize
== 5 && depthRange
[0] <= 4)
632 pixel
* fenc
= const_cast<pixel
*>(mode
.fencYuv
->getLumaAddr(absPartIdx
));
633 pixel
* pred
= mode
.predYuv
.getLumaAddr(absPartIdx
);
634 int16_t* residual
= m_rqt
[cuGeom
.depth
].tmpResiYuv
.getLumaAddr(absPartIdx
);
635 pixel
* picReconY
= m_frame
->m_reconPicYuv
->getLumaAddr(cu
.m_cuAddr
, cuGeom
.encodeIdx
+ absPartIdx
);
636 intptr_t picStride
= m_frame
->m_reconPicYuv
->m_stride
;
637 uint32_t stride
= mode
.fencYuv
->m_size
;
638 uint32_t sizeIdx
= log2TrSize
- 2;
639 uint32_t lumaPredMode
= cu
.m_lumaIntraDir
[absPartIdx
];
640 uint32_t coeffOffsetY
= absPartIdx
<< (LOG2_UNIT_SIZE
* 2);
641 coeff_t
* coeff
= cu
.m_trCoeff
[TEXT_LUMA
] + coeffOffsetY
;
643 initAdiPattern(cu
, cuGeom
, absPartIdx
, trDepth
, lumaPredMode
);
644 predIntraLumaAng(lumaPredMode
, pred
, stride
, log2TrSize
);
646 X265_CHECK(!cu
.m_transformSkip
[TEXT_LUMA
][absPartIdx
], "unexpected tskip flag in residualTransformQuantIntra\n");
647 cu
.setTUDepthSubParts(trDepth
, absPartIdx
, fullDepth
);
649 primitives
.calcresidual
[sizeIdx
](fenc
, pred
, residual
, stride
);
650 uint32_t numSig
= m_quant
.transformNxN(cu
, fenc
, stride
, residual
, stride
, coeff
, log2TrSize
, TEXT_LUMA
, absPartIdx
, false);
653 m_quant
.invtransformNxN(cu
.m_tqBypass
[absPartIdx
], residual
, stride
, coeff
, log2TrSize
, TEXT_LUMA
, true, false, numSig
);
654 primitives
.luma_add_ps
[sizeIdx
](picReconY
, picStride
, pred
, residual
, stride
, stride
);
655 cu
.setCbfSubParts(1 << trDepth
, TEXT_LUMA
, absPartIdx
, fullDepth
);
659 primitives
.square_copy_pp
[sizeIdx
](picReconY
, picStride
, pred
, stride
);
660 cu
.setCbfSubParts(0, TEXT_LUMA
, absPartIdx
, fullDepth
);
665 X265_CHECK(log2TrSize
> depthRange
[0], "intra luma split state failure\n");
667 /* code split block */
668 uint32_t qPartsDiv
= NUM_CU_PARTITIONS
>> ((fullDepth
+ 1) << 1);
670 for (uint32_t subPartIdx
= 0, absPartIdxSub
= absPartIdx
; subPartIdx
< 4; subPartIdx
++, absPartIdxSub
+= qPartsDiv
)
672 residualTransformQuantIntra(mode
, cuGeom
, trDepth
+ 1, absPartIdxSub
, depthRange
);
673 cbf
|= cu
.getCbf(absPartIdxSub
, TEXT_LUMA
, trDepth
+ 1);
675 for (uint32_t offs
= 0; offs
< 4 * qPartsDiv
; offs
++)
676 cu
.m_cbf
[TEXT_LUMA
][absPartIdx
+ offs
] |= (cbf
<< trDepth
);
/* Walk the decided transform quadtree and, at each leaf TU (where the stored
 * m_tuDepth equals the walk depth), copy the winning luma coefficients out of
 * the per-qtLayer RQT scratch into the CU's coefficient array and the luma
 * reconstruction into reconYuv; otherwise recurse into the four quadrants. */
680 void Search::extractIntraResultQT(CUData
& cu
, Yuv
& reconYuv
, uint32_t trDepth
, uint32_t absPartIdx
)
682 uint32_t fullDepth
= cu
.m_cuDepth
[0] + trDepth
;
683 uint32_t tuDepth
= cu
.m_tuDepth
[absPartIdx
];
// leaf TU at this depth: harvest coefficients and reconstruction
685 if (tuDepth
== trDepth
)
687 uint32_t log2TrSize
= g_maxLog2CUSize
- fullDepth
;
688 uint32_t qtLayer
= log2TrSize
- 2;
690 // copy transform coefficients
691 uint32_t coeffOffsetY
= absPartIdx
<< (LOG2_UNIT_SIZE
* 2);
692 coeff_t
* coeffSrcY
= m_rqt
[qtLayer
].coeffRQT
[0] + coeffOffsetY
;
693 coeff_t
* coeffDestY
= cu
.m_trCoeff
[0] + coeffOffsetY
;
694 memcpy(coeffDestY
, coeffSrcY
, sizeof(coeff_t
) << (log2TrSize
* 2));
696 // copy reconstruction
697 m_rqt
[qtLayer
].reconQtYuv
.copyPartToPartLuma(reconYuv
, absPartIdx
, log2TrSize
);
// split TU: recurse into each of the four quadrants
701 uint32_t numQPart
= NUM_CU_PARTITIONS
>> ((fullDepth
+ 1) << 1);
702 for (uint32_t subPartIdx
= 0; subPartIdx
< 4; subPartIdx
++)
703 extractIntraResultQT(cu
, reconYuv
, trDepth
+ 1, absPartIdx
+ subPartIdx
* numQPart
);
707 /* 4:2:2 post-TU split processing */
708 void Search::offsetSubTUCBFs(CUData
& cu
, TextType ttype
, uint32_t trDepth
, uint32_t absPartIdx
)
710 uint32_t depth
= cu
.m_cuDepth
[0];
711 uint32_t fullDepth
= depth
+ trDepth
;
712 uint32_t log2TrSize
= g_maxLog2CUSize
- fullDepth
;
714 uint32_t trDepthC
= trDepth
;
717 X265_CHECK(m_csp
!= X265_CSP_I444
&& trDepthC
, "trDepthC invalid\n");
721 uint32_t partIdxesPerSubTU
= (NUM_CU_PARTITIONS
>> ((depth
+ trDepthC
) << 1)) >> 1;
723 // move the CBFs down a level and set the parent CBF
725 uint8_t combinedSubTUCBF
= 0;
727 for (uint32_t subTU
= 0; subTU
< 2; subTU
++)
729 const uint32_t subTUAbsPartIdx
= absPartIdx
+ (subTU
* partIdxesPerSubTU
);
731 subTUCBF
[subTU
] = cu
.getCbf(subTUAbsPartIdx
, ttype
, trDepth
);
732 combinedSubTUCBF
|= subTUCBF
[subTU
];
735 for (uint32_t subTU
= 0; subTU
< 2; subTU
++)
737 const uint32_t subTUAbsPartIdx
= absPartIdx
+ (subTU
* partIdxesPerSubTU
);
738 const uint8_t compositeCBF
= (subTUCBF
[subTU
] << 1) | combinedSubTUCBF
;
740 cu
.setCbfPartRange((compositeCBF
<< trDepth
), ttype
, subTUAbsPartIdx
, partIdxesPerSubTU
);
744 /* returns distortion */
745 uint32_t Search::codeIntraChromaQt(Mode
& mode
, const CUGeom
& cuGeom
, uint32_t trDepth
, uint32_t absPartIdx
, uint32_t& psyEnergy
)
747 CUData
& cu
= mode
.cu
;
748 uint32_t fullDepth
= cu
.m_cuDepth
[0] + trDepth
;
749 uint32_t tuDepthL
= cu
.m_tuDepth
[absPartIdx
];
751 if (tuDepthL
> trDepth
)
753 uint32_t qPartsDiv
= NUM_CU_PARTITIONS
>> ((fullDepth
+ 1) << 1);
754 uint32_t outDist
= 0, splitCbfU
= 0, splitCbfV
= 0;
755 for (uint32_t subPartIdx
= 0, absPartIdxSub
= absPartIdx
; subPartIdx
< 4; subPartIdx
++, absPartIdxSub
+= qPartsDiv
)
757 outDist
+= codeIntraChromaQt(mode
, cuGeom
, trDepth
+ 1, absPartIdxSub
, psyEnergy
);
758 splitCbfU
|= cu
.getCbf(absPartIdxSub
, TEXT_CHROMA_U
, trDepth
+ 1);
759 splitCbfV
|= cu
.getCbf(absPartIdxSub
, TEXT_CHROMA_V
, trDepth
+ 1);
761 for (uint32_t offs
= 0; offs
< 4 * qPartsDiv
; offs
++)
763 cu
.m_cbf
[TEXT_CHROMA_U
][absPartIdx
+ offs
] |= (splitCbfU
<< trDepth
);
764 cu
.m_cbf
[TEXT_CHROMA_V
][absPartIdx
+ offs
] |= (splitCbfV
<< trDepth
);
770 uint32_t log2TrSize
= g_maxLog2CUSize
- fullDepth
;
771 uint32_t log2TrSizeC
= log2TrSize
- m_hChromaShift
;
773 uint32_t trDepthC
= trDepth
;
774 if (log2TrSizeC
== 1)
776 X265_CHECK(log2TrSize
== 2 && m_csp
!= X265_CSP_I444
&& trDepth
, "invalid trDepth\n");
779 uint32_t qpdiv
= NUM_CU_PARTITIONS
>> ((cu
.m_cuDepth
[0] + trDepthC
) << 1);
780 bool bFirstQ
= ((absPartIdx
& (qpdiv
- 1)) == 0);
786 m_entropyCoder
.estBit(m_entropyCoder
.m_estBitsSbac
, log2TrSizeC
, false);
788 bool checkTransformSkip
= m_slice
->m_pps
->bTransformSkipEnabled
&& log2TrSizeC
<= MAX_LOG2_TS_SIZE
&& !cu
.m_tqBypass
[0];
789 checkTransformSkip
&= !m_param
->bEnableTSkipFast
|| (log2TrSize
<= MAX_LOG2_TS_SIZE
&& cu
.m_transformSkip
[TEXT_LUMA
][absPartIdx
]);
790 if (checkTransformSkip
)
791 return codeIntraChromaTSkip(mode
, cuGeom
, trDepth
, trDepthC
, absPartIdx
, psyEnergy
);
793 uint32_t qtLayer
= log2TrSize
- 2;
794 uint32_t tuSize
= 1 << log2TrSizeC
;
795 uint32_t outDist
= 0;
797 uint32_t curPartNum
= NUM_CU_PARTITIONS
>> ((cu
.m_cuDepth
[0] + trDepthC
) << 1);
798 const SplitType splitType
= (m_csp
== X265_CSP_I422
) ? VERTICAL_SPLIT
: DONT_SPLIT
;
800 for (uint32_t chromaId
= TEXT_CHROMA_U
; chromaId
<= TEXT_CHROMA_V
; chromaId
++)
802 TextType ttype
= (TextType
)chromaId
;
804 TURecurse
tuIterator(splitType
, curPartNum
, absPartIdx
);
807 uint32_t absPartIdxC
= tuIterator
.absPartIdxTURelCU
;
809 pixel
* fenc
= const_cast<Yuv
*>(mode
.fencYuv
)->getChromaAddr(chromaId
, absPartIdxC
);
810 pixel
* pred
= mode
.predYuv
.getChromaAddr(chromaId
, absPartIdxC
);
811 int16_t* residual
= m_rqt
[cuGeom
.depth
].tmpResiYuv
.getChromaAddr(chromaId
, absPartIdxC
);
812 uint32_t stride
= mode
.fencYuv
->m_csize
;
813 uint32_t sizeIdxC
= log2TrSizeC
- 2;
815 uint32_t coeffOffsetC
= absPartIdxC
<< (LOG2_UNIT_SIZE
* 2 - (m_hChromaShift
+ m_vChromaShift
));
816 coeff_t
* coeffC
= m_rqt
[qtLayer
].coeffRQT
[chromaId
] + coeffOffsetC
;
817 pixel
* reconQt
= m_rqt
[qtLayer
].reconQtYuv
.getChromaAddr(chromaId
, absPartIdxC
);
818 uint32_t reconQtStride
= m_rqt
[qtLayer
].reconQtYuv
.m_csize
;
820 pixel
* picReconC
= m_frame
->m_reconPicYuv
->getChromaAddr(chromaId
, cu
.m_cuAddr
, cuGeom
.encodeIdx
+ absPartIdxC
);
821 intptr_t picStride
= m_frame
->m_reconPicYuv
->m_strideC
;
823 // init availability pattern
824 initAdiPatternChroma(cu
, cuGeom
, absPartIdxC
, trDepthC
, chromaId
);
825 pixel
* chromaPred
= getAdiChromaBuf(chromaId
, tuSize
);
827 uint32_t chromaPredMode
= cu
.m_chromaIntraDir
[absPartIdxC
];
828 if (chromaPredMode
== DM_CHROMA_IDX
)
829 chromaPredMode
= cu
.m_lumaIntraDir
[(m_csp
== X265_CSP_I444
) ? absPartIdxC
: 0];
830 if (m_csp
== X265_CSP_I422
)
831 chromaPredMode
= g_chroma422IntraAngleMappingTable
[chromaPredMode
];
833 // get prediction signal
834 predIntraChromaAng(chromaPred
, chromaPredMode
, pred
, stride
, log2TrSizeC
, m_csp
);
836 cu
.setTransformSkipPartRange(0, ttype
, absPartIdxC
, tuIterator
.absPartIdxStep
);
838 primitives
.calcresidual
[sizeIdxC
](fenc
, pred
, residual
, stride
);
839 uint32_t numSig
= m_quant
.transformNxN(cu
, fenc
, stride
, residual
, stride
, coeffC
, log2TrSizeC
, ttype
, absPartIdxC
, false);
843 m_quant
.invtransformNxN(cu
.m_tqBypass
[0], residual
, stride
, coeffC
, log2TrSizeC
, ttype
, true, false, numSig
);
844 primitives
.luma_add_ps
[sizeIdxC
](reconQt
, reconQtStride
, pred
, residual
, stride
, stride
);
845 cu
.setCbfPartRange(1 << trDepth
, ttype
, absPartIdxC
, tuIterator
.absPartIdxStep
);
849 // no coded residual, recon = pred
850 primitives
.square_copy_pp
[sizeIdxC
](reconQt
, reconQtStride
, pred
, stride
);
851 cu
.setCbfPartRange(0, ttype
, absPartIdxC
, tuIterator
.absPartIdxStep
);
854 tmpDist
= primitives
.sse_pp
[sizeIdxC
](reconQt
, reconQtStride
, fenc
, stride
);
855 outDist
+= (ttype
== TEXT_CHROMA_U
) ? m_rdCost
.scaleChromaDistCb(tmpDist
) : m_rdCost
.scaleChromaDistCr(tmpDist
);
857 if (m_rdCost
.m_psyRd
)
858 psyEnergy
+= m_rdCost
.psyCost(sizeIdxC
, fenc
, stride
, picReconC
, picStride
);
860 primitives
.square_copy_pp
[sizeIdxC
](picReconC
, picStride
, reconQt
, reconQtStride
);
862 while (tuIterator
.isNextSection());
864 if (splitType
== VERTICAL_SPLIT
)
865 offsetSubTUCBFs(cu
, ttype
, trDepth
, absPartIdx
);
871 /* returns distortion */
872 uint32_t Search::codeIntraChromaTSkip(Mode
& mode
, const CUGeom
& cuGeom
, uint32_t trDepth
, uint32_t trDepthC
, uint32_t absPartIdx
, uint32_t& psyEnergy
)
874 CUData
& cu
= mode
.cu
;
875 uint32_t fullDepth
= cu
.m_cuDepth
[0] + trDepth
;
876 uint32_t log2TrSize
= g_maxLog2CUSize
- fullDepth
;
877 uint32_t log2TrSizeC
= 2;
879 uint32_t qtLayer
= log2TrSize
- 2;
880 uint32_t outDist
= 0;
882 /* At the TU layers above this one, no RDO is performed, only distortion is being measured,
883 * so the entropy coder is not very accurate. The best we can do is return it in the same
884 * condition as it arrived, and to do all bit estimates from the same state. */
885 m_entropyCoder
.store(m_rqt
[fullDepth
].rqtRoot
);
887 ALIGN_VAR_32(coeff_t
, tskipCoeffC
[MAX_TS_SIZE
* MAX_TS_SIZE
]);
888 ALIGN_VAR_32(pixel
, tskipReconC
[MAX_TS_SIZE
* MAX_TS_SIZE
]);
890 uint32_t curPartNum
= NUM_CU_PARTITIONS
>> ((cu
.m_cuDepth
[0] + trDepthC
) << 1);
891 const SplitType splitType
= (m_csp
== X265_CSP_I422
) ? VERTICAL_SPLIT
: DONT_SPLIT
;
893 for (uint32_t chromaId
= TEXT_CHROMA_U
; chromaId
<= TEXT_CHROMA_V
; chromaId
++)
895 TextType ttype
= (TextType
)chromaId
;
897 TURecurse
tuIterator(splitType
, curPartNum
, absPartIdx
);
900 uint32_t absPartIdxC
= tuIterator
.absPartIdxTURelCU
;
902 pixel
* fenc
= const_cast<Yuv
*>(mode
.fencYuv
)->getChromaAddr(chromaId
, absPartIdxC
);
903 pixel
* pred
= mode
.predYuv
.getChromaAddr(chromaId
, absPartIdxC
);
904 int16_t* residual
= m_rqt
[cuGeom
.depth
].tmpResiYuv
.getChromaAddr(chromaId
, absPartIdxC
);
905 uint32_t stride
= mode
.fencYuv
->m_csize
;
906 uint32_t sizeIdxC
= log2TrSizeC
- 2;
908 uint32_t coeffOffsetC
= absPartIdxC
<< (LOG2_UNIT_SIZE
* 2 - (m_hChromaShift
+ m_vChromaShift
));
909 coeff_t
* coeffC
= m_rqt
[qtLayer
].coeffRQT
[chromaId
] + coeffOffsetC
;
910 pixel
* reconQt
= m_rqt
[qtLayer
].reconQtYuv
.getChromaAddr(chromaId
, absPartIdxC
);
911 uint32_t reconQtStride
= m_rqt
[qtLayer
].reconQtYuv
.m_csize
;
913 // init availability pattern
914 initAdiPatternChroma(cu
, cuGeom
, absPartIdxC
, trDepthC
, chromaId
);
915 pixel
* chromaPred
= getAdiChromaBuf(chromaId
, tuSize
);
917 uint32_t chromaPredMode
= cu
.m_chromaIntraDir
[absPartIdxC
];
918 if (chromaPredMode
== DM_CHROMA_IDX
)
919 chromaPredMode
= cu
.m_lumaIntraDir
[(m_csp
== X265_CSP_I444
) ? absPartIdxC
: 0];
920 if (m_csp
== X265_CSP_I422
)
921 chromaPredMode
= g_chroma422IntraAngleMappingTable
[chromaPredMode
];
923 // get prediction signal
924 predIntraChromaAng(chromaPred
, chromaPredMode
, pred
, stride
, log2TrSizeC
, m_csp
);
926 uint64_t bCost
= MAX_INT64
;
929 uint32_t bEnergy
= 0;
932 int checkTransformSkip
= 1;
933 for (int useTSkip
= 0; useTSkip
<= checkTransformSkip
; useTSkip
++)
935 coeff_t
* coeff
= (useTSkip
? tskipCoeffC
: coeffC
);
936 pixel
* recon
= (useTSkip
? tskipReconC
: reconQt
);
937 uint32_t reconStride
= (useTSkip
? MAX_TS_SIZE
: reconQtStride
);
939 primitives
.calcresidual
[sizeIdxC
](fenc
, pred
, residual
, stride
);
941 uint32_t numSig
= m_quant
.transformNxN(cu
, fenc
, stride
, residual
, stride
, coeff
, log2TrSizeC
, ttype
, absPartIdxC
, useTSkip
);
944 m_quant
.invtransformNxN(cu
.m_tqBypass
[0], residual
, stride
, coeff
, log2TrSizeC
, ttype
, true, useTSkip
, numSig
);
945 primitives
.luma_add_ps
[sizeIdxC
](recon
, reconStride
, pred
, residual
, stride
, stride
);
946 cu
.setCbfPartRange(1 << trDepth
, ttype
, absPartIdxC
, tuIterator
.absPartIdxStep
);
950 checkTransformSkip
= 0;
955 primitives
.square_copy_pp
[sizeIdxC
](recon
, reconStride
, pred
, stride
);
956 cu
.setCbfPartRange(0, ttype
, absPartIdxC
, tuIterator
.absPartIdxStep
);
958 uint32_t tmpDist
= primitives
.sse_pp
[sizeIdxC
](recon
, reconStride
, fenc
, stride
);
959 tmpDist
= (ttype
== TEXT_CHROMA_U
) ? m_rdCost
.scaleChromaDistCb(tmpDist
) : m_rdCost
.scaleChromaDistCr(tmpDist
);
961 cu
.setTransformSkipPartRange(useTSkip
, ttype
, absPartIdxC
, tuIterator
.absPartIdxStep
);
963 uint32_t tmpBits
= 0, tmpEnergy
= 0;
966 m_entropyCoder
.load(m_rqt
[fullDepth
].rqtRoot
);
967 m_entropyCoder
.resetBits();
968 m_entropyCoder
.codeCoeffNxN(cu
, coeff
, absPartIdxC
, log2TrSizeC
, (TextType
)chromaId
);
969 tmpBits
= m_entropyCoder
.getNumberOfWrittenBits();
973 if (m_rdCost
.m_psyRd
)
975 tmpEnergy
= m_rdCost
.psyCost(sizeIdxC
, fenc
, stride
, reconQt
, reconQtStride
);
976 tmpCost
= m_rdCost
.calcPsyRdCost(tmpDist
, tmpBits
, tmpEnergy
);
979 tmpCost
= m_rdCost
.calcRdCost(tmpDist
, tmpBits
);
993 memcpy(coeffC
, tskipCoeffC
, sizeof(coeff_t
) << (log2TrSizeC
* 2));
994 primitives
.square_copy_pp
[sizeIdxC
](reconQt
, reconQtStride
, tskipReconC
, MAX_TS_SIZE
);
997 cu
.setCbfPartRange(bCbf
<< trDepth
, ttype
, absPartIdxC
, tuIterator
.absPartIdxStep
);
998 cu
.setTransformSkipPartRange(bTSkip
, ttype
, absPartIdxC
, tuIterator
.absPartIdxStep
);
1000 pixel
* reconPicC
= m_frame
->m_reconPicYuv
->getChromaAddr(chromaId
, cu
.m_cuAddr
, cuGeom
.encodeIdx
+ absPartIdxC
);
1001 intptr_t picStride
= m_frame
->m_reconPicYuv
->m_strideC
;
1002 primitives
.square_copy_pp
[sizeIdxC
](reconPicC
, picStride
, reconQt
, reconQtStride
);
1005 psyEnergy
+= bEnergy
;
1007 while (tuIterator
.isNextSection());
1009 if (splitType
== VERTICAL_SPLIT
)
1010 offsetSubTUCBFs(cu
, ttype
, trDepth
, absPartIdx
);
1013 m_entropyCoder
.load(m_rqt
[fullDepth
].rqtRoot
);
// NOTE(review): corrupted extraction — brace/'else' lines are missing; comments annotate
// the visible token stream only.
//
// Recursively copies the winning chroma RQT results (transform coefficients and the
// quadtree reconstruction) out of the m_rqt scratch buffers into the CU's coefficient
// arrays and the caller's reconYuv, descending until the stored TU depth is reached.
1017 void Search::extractIntraResultChromaQT(CUData
& cu
, Yuv
& reconYuv
, uint32_t absPartIdx
, uint32_t trDepth
, bool tuQuad
)
1019 uint32_t fullDepth
= cu
.m_cuDepth
[0] + trDepth
;
1020 uint32_t tuDepthL
= cu
.m_tuDepth
[absPartIdx
];
// Leaf TU at this depth: copy coefficients and reconstruction for both chroma planes.
1022 if (tuDepthL
== trDepth
)
1024 uint32_t log2TrSize
= g_maxLog2CUSize
- fullDepth
;
1025 uint32_t log2TrSizeC
= log2TrSize
- m_hChromaShift
;
// NOTE(review): the 'if (tuQuad)' guard (original lines ~1027-1028) around this
// adjustment is missing from the extraction.
1029 log2TrSizeC
++; /* extract one 4x4 instead of 4 2x2 */
1030 trDepth
--; /* also adjust the number of coeff read */
1033 // copy transform coefficients
// 4:2:2 carries twice the chroma coefficients per TU (two vertical sub-TUs).
1034 uint32_t numCoeffC
= 1 << (log2TrSizeC
* 2 + (m_csp
== X265_CSP_I422
));
1035 uint32_t coeffOffsetC
= absPartIdx
<< (LOG2_UNIT_SIZE
* 2 - (m_hChromaShift
+ m_vChromaShift
));
1037 uint32_t qtLayer
= log2TrSize
- 2;
1038 coeff_t
* coeffSrcU
= m_rqt
[qtLayer
].coeffRQT
[1] + coeffOffsetC
;
1039 coeff_t
* coeffSrcV
= m_rqt
[qtLayer
].coeffRQT
[2] + coeffOffsetC
;
1040 coeff_t
* coeffDstU
= cu
.m_trCoeff
[1] + coeffOffsetC
;
1041 coeff_t
* coeffDstV
= cu
.m_trCoeff
[2] + coeffOffsetC
;
1042 memcpy(coeffDstU
, coeffSrcU
, sizeof(coeff_t
) * numCoeffC
);
1043 memcpy(coeffDstV
, coeffSrcV
, sizeof(coeff_t
) * numCoeffC
);
1045 // copy reconstruction
1046 m_rqt
[qtLayer
].reconQtYuv
.copyPartToPartChroma(reconYuv
, absPartIdx
, log2TrSizeC
+ m_hChromaShift
);
// Split TU: recurse. When the children would be chroma 2x2 (impossible in HEVC),
// a single 4x4 extraction with tuQuad=true replaces the four-way recursion.
1050 if (g_maxLog2CUSize
- fullDepth
- 1 == 2 && m_csp
!= X265_CSP_I444
)
1051 /* no such thing as chroma 2x2, so extract one 4x4 instead of 4 2x2 */
1052 extractIntraResultChromaQT(cu
, reconYuv
, absPartIdx
, trDepth
+ 1, true);
// NOTE(review): the 'else' introducing the normal four-way recursion is missing.
1055 uint32_t numQPart
= NUM_CU_PARTITIONS
>> ((fullDepth
+ 1) << 1);
1056 for (uint32_t subPartIdx
= 0; subPartIdx
< 4; subPartIdx
++)
1057 extractIntraResultChromaQT(cu
, reconYuv
, absPartIdx
+ subPartIdx
* numQPart
, trDepth
+ 1, false);
// NOTE(review): corrupted extraction — brace/'do'/'else' lines are missing; comments
// annotate the visible token stream only.
//
// Low-RD-level chroma intra coding: for each leaf chroma TU, predicts, transforms,
// quantizes and reconstructs in place (writing directly into reconYuv and the frame
// recon picture), then propagates child CBFs upward for split TUs. No RD trials.
1062 void Search::residualQTIntraChroma(Mode
& mode
, const CUGeom
& cuGeom
, uint32_t trDepth
, uint32_t absPartIdx
)
1064 CUData
& cu
= mode
.cu
;
1065 uint32_t fullDepth
= cu
.m_cuDepth
[0] + trDepth
;
1066 uint32_t tuDepthL
= cu
.m_tuDepth
[absPartIdx
];
// Leaf TU at this depth: code the chroma residual here.
1068 if (tuDepthL
== trDepth
)
1070 uint32_t log2TrSize
= g_maxLog2CUSize
- fullDepth
;
1071 uint32_t log2TrSizeC
= log2TrSize
- m_hChromaShift
;
1072 uint32_t trDepthC
= trDepth
;
// Chroma 2x2 does not exist: step back up to a 4x4 chroma TU shared by four
// luma 4x4s; only the first of those four quadrants codes the chroma.
1073 if (log2TrSizeC
== 1)
1075 X265_CHECK(log2TrSize
== 2 && m_csp
!= X265_CSP_I444
&& trDepth
> 0, "invalid trDepth\n");
// NOTE(review): the adjustments to trDepthC/log2TrSizeC expected in this branch
// (original lines ~1076-1077) are missing from the extraction.
1078 uint32_t qpdiv
= NUM_CU_PARTITIONS
>> ((cu
.m_cuDepth
[0] + trDepthC
) << 1);
1079 bool bFirstQ
= ((absPartIdx
& (qpdiv
- 1)) == 0);
// NOTE(review): the early-return for non-first quadrants (original ~1080-1082,
// presumably 'if (!bFirstQ) return;') is missing from the extraction.
1084 ShortYuv
& resiYuv
= m_rqt
[cuGeom
.depth
].tmpResiYuv
;
1085 uint32_t tuSize
= 1 << log2TrSizeC
;
1086 uint32_t stride
= mode
.fencYuv
->m_csize
;
1087 const int sizeIdxC
= log2TrSizeC
- 2;
1089 uint32_t curPartNum
= NUM_CU_PARTITIONS
>> ((cu
.m_cuDepth
[0] + trDepthC
) << 1);
// 4:2:2 splits each chroma TU vertically into two sub-TUs.
1090 const SplitType splitType
= (m_csp
== X265_CSP_I422
) ? VERTICAL_SPLIT
: DONT_SPLIT
;
// Process Cb then Cr.
1092 for (uint32_t chromaId
= TEXT_CHROMA_U
; chromaId
<= TEXT_CHROMA_V
; chromaId
++)
1094 TextType ttype
= (TextType
)chromaId
;
1096 TURecurse
tuIterator(splitType
, curPartNum
, absPartIdx
);
// NOTE(review): the do { ... } loop keywords/braces were dropped; iteration is
// implied by the trailing 'while' at original line 1137.
1099 uint32_t absPartIdxC
= tuIterator
.absPartIdxTURelCU
;
1101 pixel
* fenc
= const_cast<pixel
*>(mode
.fencYuv
->getChromaAddr(chromaId
, absPartIdxC
));
1102 pixel
* pred
= mode
.predYuv
.getChromaAddr(chromaId
, absPartIdxC
);
1103 int16_t* residual
= resiYuv
.getChromaAddr(chromaId
, absPartIdxC
);
1104 pixel
* recon
= mode
.reconYuv
.getChromaAddr(chromaId
, absPartIdxC
); // TODO: needed?
1105 uint32_t coeffOffsetC
= absPartIdxC
<< (LOG2_UNIT_SIZE
* 2 - (m_hChromaShift
+ m_vChromaShift
));
// At low RD levels coefficients go straight into the CU's final buffer.
1106 coeff_t
* coeff
= cu
.m_trCoeff
[ttype
] + coeffOffsetC
;
1107 pixel
* picReconC
= m_frame
->m_reconPicYuv
->getChromaAddr(chromaId
, cu
.m_cuAddr
, cuGeom
.encodeIdx
+ absPartIdxC
);
1108 uint32_t picStride
= m_frame
->m_reconPicYuv
->m_strideC
;
1110 uint32_t chromaPredMode
= cu
.m_chromaIntraDir
[absPartIdxC
];
// DM_CHROMA: inherit the co-located luma direction.
1111 if (chromaPredMode
== DM_CHROMA_IDX
)
1112 chromaPredMode
= cu
.m_lumaIntraDir
[(m_csp
== X265_CSP_I444
) ? absPartIdxC
: 0];
// 4:2:2 angular-mode remap for the non-square chroma grid.
1113 chromaPredMode
= (m_csp
== X265_CSP_I422
) ? g_chroma422IntraAngleMappingTable
[chromaPredMode
] : chromaPredMode
;
1114 initAdiPatternChroma(cu
, cuGeom
, absPartIdxC
, trDepthC
, chromaId
);
1115 pixel
* chromaPred
= getAdiChromaBuf(chromaId
, tuSize
);
1117 predIntraChromaAng(chromaPred
, chromaPredMode
, pred
, stride
, log2TrSizeC
, m_csp
);
1119 X265_CHECK(!cu
.m_transformSkip
[ttype
][0], "transform skip not supported at low RD levels\n");
1121 primitives
.calcresidual
[sizeIdxC
](fenc
, pred
, residual
, stride
);
1122 uint32_t numSig
= m_quant
.transformNxN(cu
, fenc
, stride
, residual
, stride
, coeff
, log2TrSizeC
, ttype
, absPartIdxC
, false);
// NOTE(review): the 'if (numSig)' guard around the coded-residual path
// (original ~1123-1124) is missing from the extraction.
1125 m_quant
.invtransformNxN(cu
.m_tqBypass
[absPartIdxC
], residual
, stride
, coeff
, log2TrSizeC
, ttype
, true, false, numSig
);
1126 primitives
.luma_add_ps
[sizeIdxC
](recon
, stride
, pred
, residual
, stride
, stride
);
1127 primitives
.square_copy_pp
[sizeIdxC
](picReconC
, picStride
, recon
, stride
);
1128 cu
.setCbfPartRange(1 << trDepth
, ttype
, absPartIdxC
, tuIterator
.absPartIdxStep
);
// No significant coefficients: recon = pred, CBF cleared (presumably the
// missing else-branch).
1132 primitives
.square_copy_pp
[sizeIdxC
](recon
, stride
, pred
, stride
);
1133 primitives
.square_copy_pp
[sizeIdxC
](picReconC
, picStride
, pred
, stride
);
1134 cu
.setCbfPartRange(0, ttype
, absPartIdxC
, tuIterator
.absPartIdxStep
);
1137 while (tuIterator
.isNextSection());
// 4:2:2: propagate sub-TU CBFs up to the parent TU.
1139 if (splitType
== VERTICAL_SPLIT
)
1140 offsetSubTUCBFs(cu
, (TextType
)chromaId
, trDepth
, absPartIdx
);
// Split TU: recurse into the four quadrants, then OR the children's CBFs into
// every part of this TU at the current depth.
1145 uint32_t qPartsDiv
= NUM_CU_PARTITIONS
>> ((fullDepth
+ 1) << 1);
1146 uint32_t splitCbfU
= 0, splitCbfV
= 0;
1147 for (uint32_t subPartIdx
= 0, absPartIdxC
= absPartIdx
; subPartIdx
< 4; subPartIdx
++, absPartIdxC
+= qPartsDiv
)
1149 residualQTIntraChroma(mode
, cuGeom
, trDepth
+ 1, absPartIdxC
);
1150 splitCbfU
|= cu
.getCbf(absPartIdxC
, TEXT_CHROMA_U
, trDepth
+ 1);
1151 splitCbfV
|= cu
.getCbf(absPartIdxC
, TEXT_CHROMA_V
, trDepth
+ 1);
1153 for (uint32_t offs
= 0; offs
< 4 * qPartsDiv
; offs
++)
1155 cu
.m_cbf
[1][absPartIdx
+ offs
] |= (splitCbfU
<< trDepth
);
1156 cu
.m_cbf
[2][absPartIdx
+ offs
] |= (splitCbfV
<< trDepth
);
// NOTE(review): corrupted extraction — brace lines are missing; comments annotate the
// visible token stream only.
//
// Full RD evaluation of an intra mode for this CU: runs luma and chroma intra
// prediction/coding, then measures the exact bit cost of the whole CU syntax with the
// entropy coder, stores the resulting contexts, and fills in the Mode's cost fields.
1161 void Search::checkIntra(Mode
& intraMode
, const CUGeom
& cuGeom
, PartSize partSize
, uint8_t* sharedModes
)
1163 uint32_t depth
= cuGeom
.depth
;
1164 CUData
& cu
= intraMode
.cu
;
1166 cu
.setPartSizeSubParts(partSize
);
1167 cu
.setPredModeSubParts(MODE_INTRA
);
1169 uint32_t tuDepthRange
[2];
1170 cu
.getIntraTUQtDepthRange(tuDepthRange
, 0);
// Distortion accumulates luma then chroma contributions.
1172 intraMode
.initCosts();
1173 intraMode
.distortion
+= estIntraPredQT(intraMode
, cuGeom
, tuDepthRange
, sharedModes
);
1174 intraMode
.distortion
+= estIntraPredChromaQT(intraMode
, cuGeom
);
// Measure actual CU header + prediction-info bits.
1176 m_entropyCoder
.resetBits();
1177 if (m_slice
->m_pps
->bTransquantBypassEnabled
)
1178 m_entropyCoder
.codeCUTransquantBypassFlag(cu
.m_tqBypass
[0]);
// Inter slices additionally signal skip flag and prediction mode.
1180 if (!m_slice
->isIntra())
1182 m_entropyCoder
.codeSkipFlag(cu
, 0);
1183 m_entropyCoder
.codePredMode(cu
.m_predMode
[0]);
1186 m_entropyCoder
.codePartSize(cu
, 0, depth
);
1187 m_entropyCoder
.codePredInfo(cu
, 0);
// mvBits here holds the non-coefficient (mode/header) bits for an intra CU.
1188 intraMode
.mvBits
= m_entropyCoder
.getNumberOfWrittenBits();
1190 bool bCodeDQP
= m_slice
->m_pps
->bUseDQP
;
1191 m_entropyCoder
.codeCoeff(cu
, 0, depth
, bCodeDQP
, tuDepthRange
);
// Save contexts so the caller can resume entropy coding from this CU's end state.
1192 m_entropyCoder
.store(intraMode
.contexts
);
1193 intraMode
.totalBits
= m_entropyCoder
.getNumberOfWrittenBits();
1194 intraMode
.coeffBits
= intraMode
.totalBits
- intraMode
.mvBits
;
// Optional psycho-visual energy term over the full CU luma plane.
1195 if (m_rdCost
.m_psyRd
)
1196 intraMode
.psyEnergy
= m_rdCost
.psyCost(cuGeom
.log2CUSize
- 2, intraMode
.fencYuv
->m_buf
[0], intraMode
.fencYuv
->m_size
, intraMode
.reconYuv
.m_buf
[0], intraMode
.reconYuv
.m_size
);
1198 updateModeCost(intraMode
);
// NOTE(review): corrupted extraction — brace/'else'/'if' lines and several declarations
// (e.g. bmode, bcost, costShift, preds, mpms, icosts) were dropped; comments annotate the
// visible token stream only and structural claims are hedged.
//
// Luma intra mode decision for each PU of the CU: screens all 35 modes with SATD (sa8d)
// plus an estimated mode-bits cost, short-lists the best candidates, measures the
// short-list with simple RDO (no TU splits), then remeasures the winner allowing TU
// splits. Returns accumulated luma distortion.
1201 uint32_t Search::estIntraPredQT(Mode
&intraMode
, const CUGeom
& cuGeom
, uint32_t depthRange
[2], uint8_t* sharedModes
)
1203 CUData
& cu
= intraMode
.cu
;
1204 Yuv
* reconYuv
= &intraMode
.reconYuv
;
1205 Yuv
* predYuv
= &intraMode
.predYuv
;
1206 const Yuv
* fencYuv
= intraMode
.fencYuv
;
1208 uint32_t depth
= cu
.m_cuDepth
[0];
// NxN partitioning means four PUs, each one TU-depth below the CU.
1209 uint32_t initTrDepth
= cu
.m_partSize
[0] == SIZE_2Nx2N
? 0 : 1;
1210 uint32_t numPU
= 1 << (2 * initTrDepth
);
1211 uint32_t log2TrSize
= cu
.m_log2CUSize
[0] - initTrDepth
;
1212 uint32_t tuSize
= 1 << log2TrSize
;
1213 uint32_t qNumParts
= cuGeom
.numPartitions
>> 2;
1214 uint32_t sizeIdx
= log2TrSize
- 2;
1215 uint32_t absPartIdx
= 0;
1216 uint32_t totalDistortion
= 0;
// Transform skip is only considered for 4x4 luma (NxN partitions), never with
// transquant bypass.
1218 int checkTransformSkip
= m_slice
->m_pps
->bTransformSkipEnabled
&& !cu
.m_tqBypass
[0] && cu
.m_partSize
[absPartIdx
] == SIZE_NxN
;
1220 // loop over partitions
1221 for (uint32_t pu
= 0; pu
< numPU
; pu
++, absPartIdx
+= qNumParts
)
// NOTE(review): declarations of bmode/bcost and the 'if (sharedModes)' branch
// structure (original ~1222-1225) are missing; when shared modes are supplied
// the per-PU mode is taken directly instead of being searched:
1226 bmode
= sharedModes
[pu
];
1229 // Reference sample smoothing
1230 initAdiPattern(cu
, cuGeom
, absPartIdx
, initTrDepth
, ALL_IDX
);
1232 // determine set of modes to be tested (using prediction signal only)
1233 pixel
* fenc
= const_cast<pixel
*>(fencYuv
->getLumaAddr(absPartIdx
));
1234 uint32_t stride
= predYuv
->m_size
;
// Reference rows/columns prepared by initAdiPattern (filtered and unfiltered).
1236 pixel
*above
= m_refAbove
+ tuSize
- 1;
1237 pixel
*aboveFiltered
= m_refAboveFlt
+ tuSize
- 1;
1238 pixel
*left
= m_refLeft
+ tuSize
- 1;
1239 pixel
*leftFiltered
= m_refLeftFlt
+ tuSize
- 1;
1241 // 33 Angle modes once
1242 ALIGN_VAR_32(pixel
, buf_trans
[32 * 32]);
1243 ALIGN_VAR_32(pixel
, tmp
[33 * 32 * 32]);
1244 ALIGN_VAR_32(pixel
, bufScale
[32 * 32]);
1245 pixel _above
[4 * 32 + 1];
1246 pixel _left
[4 * 32 + 1];
1247 int scaleTuSize
= tuSize
;
1248 int scaleStride
= stride
;
// NOTE(review): declaration of costShift and the 'if (tuSize > 32)' guard around
// the 64x64 downscale path (original ~1249-1252) are missing from the extraction.
1253 pixel
*aboveScale
= _above
+ 2 * 32;
1254 pixel
*leftScale
= _left
+ 2 * 32;
1256 // origin is 64x64, we scale to 32x32 and setup required parameters
1257 primitives
.scale2D_64to32(bufScale
, fenc
, stride
);
1260 // reserve space in case primitives need to store data in above
1262 aboveScale
[0] = leftScale
[0] = above
[0];
1263 primitives
.scale1D_128to64(aboveScale
+ 1, above
+ 1, 0);
1264 primitives
.scale1D_128to64(leftScale
+ 1, left
+ 1, 0);
// After scaling, the SATD block size is fixed at 32x32 (sizeIdx 3).
1269 sizeIdx
= 5 - 2; // log2(scaleTuSize) - 2
1271 // Filtered and Unfiltered refAbove and refLeft pointing to above and left.
1274 aboveFiltered
= aboveScale
;
1275 leftFiltered
= leftScale
;
1278 m_entropyCoder
.loadIntraDirModeLuma(m_rqt
[depth
].cur
);
1280 /* there are three cost tiers for intra modes:
1281 * pred[0] - mode probable, least cost
1282 * pred[1], pred[2] - less probable, slightly more cost
1283 * non-mpm modes - all cost the same (rbits) */
// NOTE(review): the declarations of preds[]/mpms (original ~1284-1285) are
// missing from the extraction.
1286 uint32_t rbits
= getIntraRemModeBits(cu
, absPartIdx
, preds
, mpms
);
1288 pixelcmp_t sa8d
= primitives
.sa8d
[sizeIdx
];
1289 uint64_t modeCosts
[35];
// DC mode: establishes the initial best cost.
1293 primitives
.intra_pred
[DC_IDX
][sizeIdx
](tmp
, scaleStride
, left
, above
, 0, (scaleTuSize
<= 16));
1294 uint32_t bits
= (mpms
& ((uint64_t)1 << DC_IDX
)) ? m_entropyCoder
.bitsIntraModeMPM(preds
, DC_IDX
) : rbits
;
1295 uint32_t sad
= sa8d(fenc
, scaleStride
, tmp
, scaleStride
) << costShift
;
1296 modeCosts
[DC_IDX
] = bcost
= m_rdCost
.calcRdSADCost(sad
, bits
);
// PLANAR mode uses filtered references for mid-range TU sizes.
1299 pixel
*abovePlanar
= above
;
1300 pixel
*leftPlanar
= left
;
1301 if (tuSize
>= 8 && tuSize
<= 32)
1303 abovePlanar
= aboveFiltered
;
1304 leftPlanar
= leftFiltered
;
1306 primitives
.intra_pred
[PLANAR_IDX
][sizeIdx
](tmp
, scaleStride
, leftPlanar
, abovePlanar
, 0, 0);
1307 bits
= (mpms
& ((uint64_t)1 << PLANAR_IDX
)) ? m_entropyCoder
.bitsIntraModeMPM(preds
, PLANAR_IDX
) : rbits
;
1308 sad
= sa8d(fenc
, scaleStride
, tmp
, scaleStride
) << costShift
;
1309 modeCosts
[PLANAR_IDX
] = m_rdCost
.calcRdSADCost(sad
, bits
);
1310 COPY1_IF_LT(bcost
, modeCosts
[PLANAR_IDX
]);
1312 // angular predictions
// All 33 angular predictions generated in one primitive call into tmp[].
1313 primitives
.intra_pred_allangs
[sizeIdx
](tmp
, above
, left
, aboveFiltered
, leftFiltered
, (scaleTuSize
<= 16));
// Horizontal-ish modes are compared against the transposed source block.
1315 primitives
.transpose
[sizeIdx
](buf_trans
, fenc
, scaleStride
);
1316 for (int mode
= 2; mode
< 35; mode
++)
1318 bool modeHor
= (mode
< 18);
1319 pixel
*cmp
= (modeHor
? buf_trans
: fenc
);
1320 intptr_t srcStride
= (modeHor
? scaleTuSize
: scaleStride
);
1321 bits
= (mpms
& ((uint64_t)1 << mode
)) ? m_entropyCoder
.bitsIntraModeMPM(preds
, mode
) : rbits
;
1322 sad
= sa8d(cmp
, srcStride
, &tmp
[(mode
- 2) * (scaleTuSize
* scaleTuSize
)], scaleTuSize
) << costShift
;
1323 modeCosts
[mode
] = m_rdCost
.calcRdSADCost(sad
, bits
);
1324 COPY1_IF_LT(bcost
, modeCosts
[mode
]);
1327 /* Find the top maxCandCount candidate modes with cost within 25% of best
1328 * or among the most probable modes. maxCandCount is derived from the
1329 * rdLevel and depth. In general we want to try more modes at slower RD
1330 * levels and at higher depths */
1331 uint64_t candCostList
[MAX_RD_INTRA_MODES
];
1332 uint32_t rdModeList
[MAX_RD_INTRA_MODES
];
1333 int maxCandCount
= 2 + m_param
->rdLevel
+ ((depth
+ initTrDepth
) >> 1);
1334 for (int i
= 0; i
< maxCandCount
; i
++)
1335 candCostList
[i
] = MAX_INT64
;
// NOTE(review): the in-code comment says "within 25% of best" but the padding
// computed here is bcost + bcost/8 = 12.5% (the "// 1.12%" note is also off);
// the code, not the comments, is authoritative.
1337 uint64_t paddedBcost
= bcost
+ (bcost
>> 3); // 1.12%
1338 for (int mode
= 0; mode
< 35; mode
++)
1339 if (modeCosts
[mode
] < paddedBcost
|| (mpms
& ((uint64_t)1 << mode
)))
1340 updateCandList(mode
, modeCosts
[mode
], maxCandCount
, rdModeList
, candCostList
);
1342 /* measure best candidates using simple RDO (no TU splits) */
// NOTE(review): declarations of bcost/icosts for this phase (original ~1343)
// are missing from the extraction.
1344 for (int i
= 0; i
< maxCandCount
; i
++)
// Unfilled candidate slots terminate the list (presumably a break).
1346 if (candCostList
[i
] == MAX_INT64
)
1348 m_entropyCoder
.load(m_rqt
[depth
].cur
);
1349 cu
.setLumaIntraDirSubParts(rdModeList
[i
], absPartIdx
, depth
+ initTrDepth
);
// NOTE(review): the 'else' joining these two calls is missing from the
// extraction; only one of the two paths runs per candidate.
1352 if (checkTransformSkip
)
1353 codeIntraLumaTSkip(intraMode
, cuGeom
, initTrDepth
, absPartIdx
, icosts
);
1355 codeIntraLumaQT(intraMode
, cuGeom
, initTrDepth
, absPartIdx
, false, icosts
, depthRange
);
1356 COPY2_IF_LT(bcost
, icosts
.rdcost
, bmode
, rdModeList
[i
]);
1360 /* remeasure best mode, allowing TU splits */
1361 cu
.setLumaIntraDirSubParts(bmode
, absPartIdx
, depth
+ initTrDepth
);
1362 m_entropyCoder
.load(m_rqt
[depth
].cur
);
// NOTE(review): same missing 'else' pattern as above.
1365 if (checkTransformSkip
)
1366 codeIntraLumaTSkip(intraMode
, cuGeom
, initTrDepth
, absPartIdx
, icosts
);
1368 codeIntraLumaQT(intraMode
, cuGeom
, initTrDepth
, absPartIdx
, true, icosts
, depthRange
);
1369 totalDistortion
+= icosts
.distortion
;
1371 extractIntraResultQT(cu
, *reconYuv
, initTrDepth
, absPartIdx
);
1373 // set reconstruction for next intra prediction blocks
1374 if (pu
!= numPU
- 1)
1376 /* This has important implications for parallelism and RDO. It is writing intermediate results into the
1377 * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
1378 * it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think
1379 * that the contexts should be tracked through each PU */
1380 pixel
* dst
= m_frame
->m_reconPicYuv
->getLumaAddr(cu
.m_cuAddr
, cuGeom
.encodeIdx
+ absPartIdx
);
1381 uint32_t dststride
= m_frame
->m_reconPicYuv
->m_stride
;
1382 pixel
* src
= reconYuv
->getLumaAddr(absPartIdx
);
1383 uint32_t srcstride
= reconYuv
->m_size
;
1384 primitives
.square_copy_pp
[log2TrSize
- 2](dst
, dststride
, src
, srcstride
);
// For NxN, OR each PU's depth-1 CBF into every part so the CU-level luma CBF is
// correct (guard 'if (numPU > 1)' presumably among the missing lines ~1388).
1390 uint32_t combCbfY
= 0;
1391 uint32_t partIdx
= 0;
1392 for (uint32_t part
= 0; part
< 4; part
++, partIdx
+= qNumParts
)
1393 combCbfY
|= cu
.getCbf(partIdx
, TEXT_LUMA
, 1);
1395 for (uint32_t offs
= 0; offs
< 4 * qNumParts
; offs
++)
1396 cu
.m_cbf
[0][offs
] |= combCbfY
;
1399 // TODO: remove this
1400 m_entropyCoder
.load(m_rqt
[depth
].cur
);
1402 return totalDistortion
;
// NOTE(review): corrupted extraction — brace lines and some statements (e.g. the per-mode
// 'cost' declaration, the 64x64 scaling branch around original ~1419-1426, and the
// bestCost update inside the comparison) are missing; comments annotate the visible
// token stream only.
//
// Fast chroma mode pre-selection: for each allowed chroma direction, generates the Cb
// and Cr predictions and sums their sa8d cost; the cheapest direction is committed to
// the CU. Prediction-signal cost only — no transform/RDO.
1405 void Search::getBestIntraModeChroma(Mode
& intraMode
, const CUGeom
& cuGeom
)
1407 CUData
& cu
= intraMode
.cu
;
1408 const Yuv
* fencYuv
= intraMode
.fencYuv
;
1409 Yuv
* predYuv
= &intraMode
.predYuv
;
1411 uint32_t bestMode
= 0;
1412 uint64_t bestCost
= MAX_INT64
;
1413 uint32_t modeList
[NUM_CHROMA_MODE
];
1415 uint32_t log2TrSizeC
= cu
.m_log2CUSize
[0] - m_hChromaShift
;
1416 uint32_t tuSize
= 1 << log2TrSizeC
;
1417 int32_t scaleTuSize
= tuSize
;
1418 int32_t costShift
= 0;
// Prepare reference samples for both chroma planes (chromaId 1 = Cb, 2 = Cr).
1427 Predict::initAdiPatternChroma(cu
, cuGeom
, 0, 0, 1);
1428 Predict::initAdiPatternChroma(cu
, cuGeom
, 0, 0, 2);
1429 cu
.getAllowedChromaDir(0, modeList
);
1431 // check chroma modes
1432 for (uint32_t mode
= 0; mode
< NUM_CHROMA_MODE
; mode
++)
1434 uint32_t chromaPredMode
= modeList
[mode
];
// DM_CHROMA: inherit the CU's luma direction.
1435 if (chromaPredMode
== DM_CHROMA_IDX
)
1436 chromaPredMode
= cu
.m_lumaIntraDir
[0];
// 4:2:2 angular-mode remap.
1437 if (m_csp
== X265_CSP_I422
)
1438 chromaPredMode
= g_chroma422IntraAngleMappingTable
[chromaPredMode
];
// Sum sa8d over Cb and Cr for this candidate direction.
// NOTE(review): 'cost' is used uninitialized-looking here because its declaration
// (original ~1440) was dropped by the extraction.
1441 for (uint32_t chromaId
= TEXT_CHROMA_U
; chromaId
<= TEXT_CHROMA_V
; chromaId
++)
1443 pixel
* fenc
= fencYuv
->m_buf
[chromaId
];
1444 pixel
* pred
= predYuv
->m_buf
[chromaId
];
1445 pixel
* chromaPred
= getAdiChromaBuf(chromaId
, scaleTuSize
);
1447 // get prediction signal
1448 predIntraChromaAng(chromaPred
, chromaPredMode
, pred
, fencYuv
->m_csize
, log2TrSizeC
, m_csp
);
1449 cost
+= primitives
.sa8d
[log2TrSizeC
- 2](fenc
, predYuv
->m_csize
, pred
, fencYuv
->m_csize
) << costShift
;
// Keep the cheapest direction (bestCost update presumably in the missing lines).
1452 if (cost
< bestCost
)
1455 bestMode
= modeList
[mode
];
1459 cu
.setChromIntraDirSubParts(bestMode
, 0, cu
.m_cuDepth
[0]);
// NOTE(review): corrupted extraction — brace/'do'/'else' lines and some statements
// (e.g. the bestDist/bestCost updates and the dst/src declarations) are missing;
// comments annotate the visible token stream only.
//
// Full RD chroma mode decision: for each TU section (one for most formats, four for
// 4:4:4 NxN), tries every allowed chroma direction with codeIntraChromaQt, measures the
// chroma syntax bits, keeps the best mode's coefficients/CBFs/recon, and restores them
// after the trial loop. Returns accumulated chroma distortion.
1462 uint32_t Search::estIntraPredChromaQT(Mode
&intraMode
, const CUGeom
& cuGeom
)
1464 CUData
& cu
= intraMode
.cu
;
1465 Yuv
& reconYuv
= intraMode
.reconYuv
;
1467 uint32_t depth
= cu
.m_cuDepth
[0];
// Only 4:4:4 NxN has per-PU chroma TUs (initTrDepth 1 -> QUAD_SPLIT below).
1468 uint32_t initTrDepth
= cu
.m_partSize
[0] == SIZE_NxN
&& m_csp
== X265_CSP_I444
;
1469 uint32_t log2TrSize
= cu
.m_log2CUSize
[0] - initTrDepth
;
1470 uint32_t absPartStep
= (NUM_CU_PARTITIONS
>> (depth
<< 1));
1471 uint32_t totalDistortion
= 0;
1473 int part
= partitionFromLog2Size(log2TrSize
);
1475 TURecurse
tuIterator((initTrDepth
== 0) ? DONT_SPLIT
: QUAD_SPLIT
, absPartStep
, 0);
// NOTE(review): the do { ... } loop keywords/braces were dropped; iteration is implied
// by the trailing 'while' at original line 1561.
1479 uint32_t absPartIdxC
= tuIterator
.absPartIdxTURelCU
;
1480 int cuSize
= 1 << cu
.m_log2CUSize
[absPartIdxC
];
1482 uint32_t bestMode
= 0;
1483 uint32_t bestDist
= 0;
1484 uint64_t bestCost
= MAX_INT64
;
1487 uint32_t minMode
= 0;
1488 uint32_t maxMode
= NUM_CHROMA_MODE
;
1489 uint32_t modeList
[NUM_CHROMA_MODE
];
1491 cu
.getAllowedChromaDir(absPartIdxC
, modeList
);
1493 // check chroma modes
1494 for (uint32_t mode
= minMode
; mode
< maxMode
; mode
++)
1496 // restore context models
1497 m_entropyCoder
.load(m_rqt
[depth
].cur
);
1499 cu
.setChromIntraDirSubParts(modeList
[mode
], absPartIdxC
, depth
+ initTrDepth
);
1500 uint32_t psyEnergy
= 0;
1501 uint32_t dist
= codeIntraChromaQt(intraMode
, cuGeom
, initTrDepth
, absPartIdxC
, psyEnergy
);
// codeIntraChromaQt may have consumed entropy state for tskip trials; reload.
1503 if (m_slice
->m_pps
->bTransformSkipEnabled
)
1504 m_entropyCoder
.load(m_rqt
[depth
].cur
);
1506 m_entropyCoder
.resetBits();
1507 // chroma prediction mode
// Non-4:4:4 (or 2Nx2N): a single chroma direction is signaled for the CU.
1508 if (cu
.m_partSize
[0] == SIZE_2Nx2N
|| m_csp
!= X265_CSP_I444
)
1511 m_entropyCoder
.codeIntraDirChroma(cu
, absPartIdxC
, modeList
);
// 4:4:4 NxN: direction is signaled once per quadrant (presumably the missing
// else-branch).
1515 uint32_t qtNumParts
= cuGeom
.numPartitions
>> 2;
1516 if (!(absPartIdxC
& (qtNumParts
- 1)))
1517 m_entropyCoder
.codeIntraDirChroma(cu
, absPartIdxC
, modeList
);
// Measure the chroma CBF-subdivision and coefficient bits for this trial.
1520 codeSubdivCbfQTChroma(cu
, initTrDepth
, absPartIdxC
, tuIterator
.absPartIdxStep
, cuSize
, cuSize
);
1521 codeCoeffQTChroma(cu
, initTrDepth
, absPartIdxC
, TEXT_CHROMA_U
);
1522 codeCoeffQTChroma(cu
, initTrDepth
, absPartIdxC
, TEXT_CHROMA_V
);
1523 uint32_t bits
= m_entropyCoder
.getNumberOfWrittenBits();
1524 uint64_t cost
= m_rdCost
.m_psyRd
? m_rdCost
.calcPsyRdCost(dist
, bits
, psyEnergy
) : m_rdCost
.calcRdCost(dist
, bits
);
// New best: snapshot coefficients/recon plus CBF and tskip flags so a worse
// later trial can be undone. NOTE(review): bestCost/bestDist updates
// (original ~1528-1529) are missing from the extraction.
1526 if (cost
< bestCost
)
1530 bestMode
= modeList
[mode
];
1531 extractIntraResultChromaQT(cu
, reconYuv
, absPartIdxC
, initTrDepth
, false);
1532 memcpy(m_qtTempCbf
[1], cu
.m_cbf
[1] + absPartIdxC
, tuIterator
.absPartIdxStep
* sizeof(uint8_t));
1533 memcpy(m_qtTempCbf
[2], cu
.m_cbf
[2] + absPartIdxC
, tuIterator
.absPartIdxStep
* sizeof(uint8_t));
1534 memcpy(m_qtTempTransformSkipFlag
[1], cu
.m_transformSkip
[1] + absPartIdxC
, tuIterator
.absPartIdxStep
* sizeof(uint8_t));
1535 memcpy(m_qtTempTransformSkipFlag
[2], cu
.m_transformSkip
[2] + absPartIdxC
, tuIterator
.absPartIdxStep
* sizeof(uint8_t));
// Between sections the winning reconstruction must be visible in the frame recon
// picture so later sections predict from it.
1539 if (!tuIterator
.isLastSection())
1541 uint32_t zorder
= cuGeom
.encodeIdx
+ absPartIdxC
;
1542 uint32_t dststride
= m_frame
->m_reconPicYuv
->m_strideC
;
// NOTE(review): declarations of dst/src (original ~1543-1544) are missing.
1545 dst
= m_frame
->m_reconPicYuv
->getCbAddr(cu
.m_cuAddr
, zorder
);
1546 src
= reconYuv
.getCbAddr(absPartIdxC
);
1547 primitives
.chroma
[m_csp
].copy_pp
[part
](dst
, dststride
, src
, reconYuv
.m_csize
);
1549 dst
= m_frame
->m_reconPicYuv
->getCrAddr(cu
.m_cuAddr
, zorder
);
1550 src
= reconYuv
.getCrAddr(absPartIdxC
);
1551 primitives
.chroma
[m_csp
].copy_pp
[part
](dst
, dststride
, src
, reconYuv
.m_csize
);
// Restore the winning mode's CBF/tskip flags and direction after the trial loop.
1554 memcpy(cu
.m_cbf
[1] + absPartIdxC
, m_qtTempCbf
[1], tuIterator
.absPartIdxStep
* sizeof(uint8_t));
1555 memcpy(cu
.m_cbf
[2] + absPartIdxC
, m_qtTempCbf
[2], tuIterator
.absPartIdxStep
* sizeof(uint8_t));
1556 memcpy(cu
.m_transformSkip
[1] + absPartIdxC
, m_qtTempTransformSkipFlag
[1], tuIterator
.absPartIdxStep
* sizeof(uint8_t));
1557 memcpy(cu
.m_transformSkip
[2] + absPartIdxC
, m_qtTempTransformSkipFlag
[2], tuIterator
.absPartIdxStep
* sizeof(uint8_t));
1558 cu
.setChromIntraDirSubParts(bestMode
, absPartIdxC
, depth
+ initTrDepth
);
1559 totalDistortion
+= bestDist
;
1561 while (tuIterator
.isNextSection());
// 4:4:4 NxN: OR each quadrant's depth-1 chroma CBFs into every part of the CU.
1563 if (initTrDepth
!= 0)
1565 uint32_t combCbfU
= 0;
1566 uint32_t combCbfV
= 0;
1567 uint32_t partIdx
= 0;
1568 for (uint32_t p
= 0; p
< 4; p
++, partIdx
+= tuIterator
.absPartIdxStep
)
1570 combCbfU
|= cu
.getCbf(partIdx
, TEXT_CHROMA_U
, 1);
1571 combCbfV
|= cu
.getCbf(partIdx
, TEXT_CHROMA_V
, 1);
1574 for (uint32_t offs
= 0; offs
< 4 * tuIterator
.absPartIdxStep
; offs
++)
1576 cu
.m_cbf
[1][offs
] |= combCbfU
;
1577 cu
.m_cbf
[2][offs
] |= combCbfV
;
1581 /* TODO: remove this */
1582 m_entropyCoder
.load(m_rqt
[depth
].cur
);
1583 return totalDistortion
;
1586 /* estimation of best merge coding of an inter PU (not a merge CU) */
// NOTE(review): corrupted extraction — brace lines and some statements (e.g. the
// outCost/bits updates inside the best-candidate branch, and the function's return)
// are missing; comments annotate the visible token stream only.
//
// Estimates the best merge candidate for one inter PU (not a merge CU): builds the
// candidate list, performs motion compensation per candidate, and scores luma SATD
// plus candidate-index bits; the winner's motion fields are copied into MergeData m.
1587 uint32_t Search::mergeEstimation(CUData
& cu
, const CUGeom
& cuGeom
, int puIdx
, MergeData
& m
)
1589 X265_CHECK(cu
.m_partSize
[0] != SIZE_2Nx2N
, "merge tested on non-2Nx2N partition\n");
1591 m
.maxNumMergeCand
= cu
.getInterMergeCandidates(m
.absPartIdx
, puIdx
, m
.mvFieldNeighbours
, m
.interDirNeighbours
);
// 8x8 bipred restriction: demote bidir candidates to L0-only.
1593 if (cu
.isBipredRestriction())
1595 /* in 8x8 CUs do not allow bidir merge candidates if not 2Nx2N */
1596 for (uint32_t mergeCand
= 0; mergeCand
< m
.maxNumMergeCand
; ++mergeCand
)
1598 if (m
.interDirNeighbours
[mergeCand
] == 3)
1600 m
.interDirNeighbours
[mergeCand
] = 1;
1601 m
.mvFieldNeighbours
[mergeCand
][1].refIdx
= REF_NOT_VALID
;
1606 Yuv
& tempYuv
= m_rqt
[cuGeom
.depth
].tmpPredYuv
;
1608 uint32_t outCost
= MAX_UINT
;
1609 for (uint32_t mergeCand
= 0; mergeCand
< m
.maxNumMergeCand
; ++mergeCand
)
1611 /* Prevent TMVP candidates from using unavailable reference pixels */
// Frame-parallel: skip candidates whose MV points below the rows the reference
// frame has reconstructed so far (continue presumably among the missing lines).
1612 if (m_bFrameParallel
&&
1613 (m
.mvFieldNeighbours
[mergeCand
][0].mv
.y
>= (m_param
->searchRange
+ 1) * 4 ||
1614 m
.mvFieldNeighbours
[mergeCand
][1].mv
.y
>= (m_param
->searchRange
+ 1) * 4))
// Install candidate motion into the CU so motionCompensation() can use it.
1617 cu
.m_mv
[0][m
.absPartIdx
] = m
.mvFieldNeighbours
[mergeCand
][0].mv
;
1618 cu
.m_refIdx
[0][m
.absPartIdx
] = (char)m
.mvFieldNeighbours
[mergeCand
][0].refIdx
;
1619 cu
.m_mv
[1][m
.absPartIdx
] = m
.mvFieldNeighbours
[mergeCand
][1].mv
;
1620 cu
.m_refIdx
[1][m
.absPartIdx
] = (char)m
.mvFieldNeighbours
[mergeCand
][1].refIdx
;
// Luma-only MC into the temp buffer, then SATD against the source PU.
1622 prepMotionCompensation(cu
, cuGeom
, puIdx
);
1623 motionCompensation(tempYuv
, true, false);
1624 uint32_t costCand
= m_me
.bufSATD(tempYuv
.getLumaAddr(m
.absPartIdx
), tempYuv
.m_size
);
1625 uint32_t bitsCand
= getTUBits(mergeCand
, m
.maxNumMergeCand
);
1626 costCand
= costCand
+ m_rdCost
.getCost(bitsCand
);
// Track the cheapest candidate (outCost/m.bits updates presumably among the
// missing lines ~1629-1630).
1627 if (costCand
< outCost
)
1631 m
.index
= mergeCand
;
// Publish the winner's motion fields to the caller.
1635 m
.mvField
[0] = m
.mvFieldNeighbours
[m
.index
][0];
1636 m
.mvField
[1] = m
.mvFieldNeighbours
[m
.index
][1];
1637 m
.interDir
= m
.interDirNeighbours
[m
.index
];
1642 /* this function assumes the caller has configured its MotionEstimation engine with the
1643 * correct source plane and source PU, and has called prepMotionCompensation() to set
1644 * m_puAbsPartIdx, m_puWidth, and m_puHeight */
// NOTE(review): corrupted extraction — brace lines and some statements (e.g. the
// mvpIdx/bestCost declarations and updates in the MVP pre-selection loop) are missing;
// comments annotate the visible token stream only.
//
// One (list, ref) motion search executed by a worker thread for distributed ME
// (--pme). Assumes the caller configured m_me with the source PU and called
// prepMotionCompensation(). Pre-selects the cheaper AMVP candidate by SAD, runs the
// full motion search, refines the MVP choice, and publishes the result into
// master.m_bestME[list] under master.m_outputLock.
1645 void Search::singleMotionEstimation(Search
& master
, const CUData
& cu
, const CUGeom
& cuGeom
, int part
, int list
, int ref
)
1647 uint32_t bits
= master
.m_listSelBits
[list
] + MVP_IDX_BITS
;
1648 bits
+= getTUBits(ref
, m_slice
->m_numRefIdx
[list
]);
1650 MV amvpCand
[AMVP_NUM_CANDS
];
1651 MV mvc
[(MD_ABOVE_LEFT
+ 1) * 2 + 1];
1652 int numMvc
= cu
.fillMvpCand(part
, m_puAbsPartIdx
, list
, ref
, amvpCand
, mvc
);
// NOTE(review): 'mvpIdx' declaration (presumably near original ~1653/1655) is
// missing from the extraction.
1654 uint32_t bestCost
= MAX_INT
;
1656 int merange
= m_param
->searchRange
;
// MVP pre-selection: SAD-compare the AMVP candidates' predictions.
1657 for (int i
= 0; i
< AMVP_NUM_CANDS
; i
++)
1659 MV mvCand
= amvpCand
[i
];
1661 // NOTE: skip mvCand if Y is > merange and -FN>1
// Frame-parallel: candidate rows beyond the reconstructed reference are skipped
// (continue presumably among the missing lines).
1662 if (m_bFrameParallel
&& (mvCand
.y
>= (merange
+ 1) * 4))
// NOTE(review): the mvCand clipping calls expected here (original ~1664-1666)
// are missing from the extraction.
1667 Yuv
& tmpPredYuv
= m_rqt
[cuGeom
.depth
].tmpPredYuv
;
1668 predInterLumaPixel(tmpPredYuv
, *m_slice
->m_refPicList
[list
][ref
]->m_reconPicYuv
, mvCand
);
1669 uint32_t cost
= m_me
.bufSAD(tmpPredYuv
.getLumaAddr(m_puAbsPartIdx
), tmpPredYuv
.m_size
);
// Keep the cheaper candidate (bestCost/mvpIdx updates presumably among the
// missing lines ~1673-1675).
1671 if (bestCost
> cost
)
1678 MV mvmin
, mvmax
, outmv
, mvp
= amvpCand
[mvpIdx
];
1679 setSearchRange(cu
, mvp
, merange
, mvmin
, mvmax
);
1681 int satdCost
= m_me
.motionEstimate(&m_slice
->m_mref
[list
][ref
], mvmin
, mvmax
, mvp
, numMvc
, mvc
, merange
, outmv
);
1683 /* Get total cost of partition, but only include MV bit cost once */
1684 bits
+= m_me
.bitcost(outmv
);
1685 uint32_t cost
= (satdCost
- m_me
.mvcost(outmv
)) + m_rdCost
.getCost(bits
);
1687 /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
1688 checkBestMVP(amvpCand
, outmv
, mvp
, mvpIdx
, bits
, cost
);
1690 /* tie goes to the smallest ref ID, just like --no-pme */
// All workers race to publish; the lock serializes the compare-and-update.
1691 ScopedLock
_lock(master
.m_outputLock
);
1692 if (cost
< master
.m_bestME
[list
].cost
||
1693 (cost
== master
.m_bestME
[list
].cost
&& ref
< master
.m_bestME
[list
].ref
))
1695 master
.m_bestME
[list
].mv
= outmv
;
1696 master
.m_bestME
[list
].mvp
= mvp
;
1697 master
.m_bestME
[list
].mvpIdx
= mvpIdx
;
1698 master
.m_bestME
[list
].ref
= ref
;
1699 master
.m_bestME
[list
].cost
= cost
;
1700 master
.m_bestME
[list
].bits
= bits
;
1704 /* search of the best candidate for inter prediction
1705 * returns true if predYuv was filled with a motion compensated prediction */
1706 bool Search::predInterSearch(Mode
& interMode
, const CUGeom
& cuGeom
, bool bMergeOnly
, bool bChroma
)
1708 CUData
& cu
= interMode
.cu
;
1709 Yuv
* predYuv
= &interMode
.predYuv
;
1711 MV amvpCand
[2][MAX_NUM_REF
][AMVP_NUM_CANDS
];
1712 MV mvc
[(MD_ABOVE_LEFT
+ 1) * 2 + 1];
1714 const Slice
*slice
= m_slice
;
1715 PicYuv
* fencPic
= m_frame
->m_origPicYuv
;
1716 int numPart
= cu
.getNumPartInter();
1717 int numPredDir
= slice
->isInterP() ? 1 : 2;
1718 const int* numRefIdx
= slice
->m_numRefIdx
;
1719 uint32_t lastMode
= 0;
1720 int totalmebits
= 0;
1721 bool bDistributed
= m_param
->bDistributeMotionEstimation
&& (numRefIdx
[0] + numRefIdx
[1]) > 2;
1723 Yuv
& tmpPredYuv
= m_rqt
[cuGeom
.depth
].tmpPredYuv
;
1726 memset(&merge
, 0, sizeof(merge
));
1728 for (int puIdx
= 0; puIdx
< numPart
; puIdx
++)
1730 /* sets m_puAbsPartIdx, m_puWidth, m_puHeight */
1731 initMotionCompensation(cu
, cuGeom
, puIdx
);
1733 pixel
* pu
= fencPic
->getLumaAddr(cu
.m_cuAddr
, cuGeom
.encodeIdx
+ m_puAbsPartIdx
);
1734 m_me
.setSourcePU(pu
- fencPic
->m_picOrg
[0], m_puWidth
, m_puHeight
);
1736 uint32_t mrgCost
= MAX_UINT
;
1738 /* find best cost merge candidate */
1739 if (cu
.m_partSize
[m_puAbsPartIdx
] != SIZE_2Nx2N
)
1741 merge
.absPartIdx
= m_puAbsPartIdx
;
1742 merge
.width
= m_puWidth
;
1743 merge
.height
= m_puHeight
;
1744 mrgCost
= mergeEstimation(cu
, cuGeom
, puIdx
, merge
);
1746 if (bMergeOnly
&& cu
.m_log2CUSize
[0] > 3)
1748 if (mrgCost
== MAX_UINT
)
1750 /* No valid merge modes were found, there is no possible way to
1751 * perform a valid motion compensation prediction, so early-exit */
1755 cu
.m_mergeFlag
[m_puAbsPartIdx
] = true;
1756 cu
.m_mvpIdx
[0][m_puAbsPartIdx
] = merge
.index
; // merge candidate ID is stored in L0 MVP idx
1757 cu
.setPUInterDir(merge
.interDir
, m_puAbsPartIdx
, puIdx
);
1758 cu
.setPUMv(0, merge
.mvField
[0].mv
, m_puAbsPartIdx
, puIdx
);
1759 cu
.setPURefIdx(0, merge
.mvField
[0].refIdx
, m_puAbsPartIdx
, puIdx
);
1760 cu
.setPUMv(1, merge
.mvField
[1].mv
, m_puAbsPartIdx
, puIdx
);
1761 cu
.setPURefIdx(1, merge
.mvField
[1].refIdx
, m_puAbsPartIdx
, puIdx
);
1762 totalmebits
+= merge
.bits
;
1764 prepMotionCompensation(cu
, cuGeom
, puIdx
);
1765 motionCompensation(*predYuv
, true, bChroma
);
1770 MotionData bidir
[2];
1771 uint32_t bidirCost
= MAX_UINT
;
1774 m_bestME
[0].cost
= MAX_UINT
;
1775 m_bestME
[1].cost
= MAX_UINT
;
1777 getBlkBits((PartSize
)cu
.m_partSize
[0], slice
->isInterP(), puIdx
, lastMode
, m_listSelBits
);
1782 m_curGeom
= &cuGeom
;
1784 /* this worker might already be enqueued for pmode, so other threads
1785 * might be looking at the ME job counts at any time, do these sets
1786 * in a safe order */
1789 m_numAcquiredME
= 1;
1790 m_numCompletedME
= 0;
1791 m_totalNumME
= numRefIdx
[0] + numRefIdx
[1];
1794 JobProvider::enqueue();
1796 for (int i
= 1; i
< m_totalNumME
; i
++)
1797 m_pool
->pokeIdleThread();
1799 while (m_totalNumME
> m_numAcquiredME
)
1801 int id
= ATOMIC_INC(&m_numAcquiredME
);
1802 if (m_totalNumME
>= id
)
1805 if (id
< numRefIdx
[0])
1806 singleMotionEstimation(*this, cu
, cuGeom
, puIdx
, 0, id
);
1808 singleMotionEstimation(*this, cu
, cuGeom
, puIdx
, 1, id
- numRefIdx
[0]);
1810 if (ATOMIC_INC(&m_numCompletedME
) == m_totalNumME
)
1811 m_meCompletionEvent
.trigger();
1815 JobProvider::dequeue();
1817 /* we saved L0-0 for ourselves */
1818 singleMotionEstimation(*this, cu
, cuGeom
, puIdx
, 0, 0);
1819 if (ATOMIC_INC(&m_numCompletedME
) == m_totalNumME
)
1820 m_meCompletionEvent
.trigger();
1822 m_meCompletionEvent
.wait();
1826 // Uni-directional prediction
1827 for (int l
= 0; l
< numPredDir
; l
++)
1829 for (int ref
= 0; ref
< numRefIdx
[l
]; ref
++)
1831 uint32_t bits
= m_listSelBits
[l
] + MVP_IDX_BITS
;
1832 bits
+= getTUBits(ref
, numRefIdx
[l
]);
1834 int numMvc
= cu
.fillMvpCand(puIdx
, m_puAbsPartIdx
, l
, ref
, amvpCand
[l
][ref
], mvc
);
1836 // Pick the best possible MVP from AMVP candidates based on least residual
1837 uint32_t bestCost
= MAX_INT
;
1839 int merange
= m_param
->searchRange
;
1841 for (int i
= 0; i
< AMVP_NUM_CANDS
; i
++)
1843 MV mvCand
= amvpCand
[l
][ref
][i
];
1845 // NOTE: skip mvCand if Y is > merange and -FN>1
1846 if (m_bFrameParallel
&& (mvCand
.y
>= (merange
+ 1) * 4))
1850 predInterLumaPixel(tmpPredYuv
, *slice
->m_refPicList
[l
][ref
]->m_reconPicYuv
, mvCand
);
1851 uint32_t cost
= m_me
.bufSAD(tmpPredYuv
.getLumaAddr(m_puAbsPartIdx
), tmpPredYuv
.m_size
);
1853 if (bestCost
> cost
)
1860 MV mvmin
, mvmax
, outmv
, mvp
= amvpCand
[l
][ref
][mvpIdx
];
1862 setSearchRange(cu
, mvp
, merange
, mvmin
, mvmax
);
1863 int satdCost
= m_me
.motionEstimate(&slice
->m_mref
[l
][ref
], mvmin
, mvmax
, mvp
, numMvc
, mvc
, merange
, outmv
);
1865 /* Get total cost of partition, but only include MV bit cost once */
1866 bits
+= m_me
.bitcost(outmv
);
1867 uint32_t cost
= (satdCost
- m_me
.mvcost(outmv
)) + m_rdCost
.getCost(bits
);
1869 /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
1870 checkBestMVP(amvpCand
[l
][ref
], outmv
, mvp
, mvpIdx
, bits
, cost
);
1872 if (cost
< m_bestME
[l
].cost
)
1874 m_bestME
[l
].mv
= outmv
;
1875 m_bestME
[l
].mvp
= mvp
;
1876 m_bestME
[l
].mvpIdx
= mvpIdx
;
1877 m_bestME
[l
].ref
= ref
;
1878 m_bestME
[l
].cost
= cost
;
1879 m_bestME
[l
].bits
= bits
;
1885 /* Bi-directional prediction */
1886 if (slice
->isInterB() && !cu
.isBipredRestriction() && m_bestME
[0].cost
!= MAX_UINT
&& m_bestME
[1].cost
!= MAX_UINT
)
1888 bidir
[0] = m_bestME
[0];
1889 bidir
[1] = m_bestME
[1];
1891 /* Generate reference subpels */
1892 PicYuv
* refPic0
= slice
->m_refPicList
[0][m_bestME
[0].ref
]->m_reconPicYuv
;
1893 PicYuv
* refPic1
= slice
->m_refPicList
[1][m_bestME
[1].ref
]->m_reconPicYuv
;
1894 Yuv
* bidirYuv
= m_rqt
[cuGeom
.depth
].bidirPredYuv
;
1895 predInterLumaPixel(bidirYuv
[0], *refPic0
, m_bestME
[0].mv
);
1896 predInterLumaPixel(bidirYuv
[1], *refPic1
, m_bestME
[1].mv
);
1898 pixel
*pred0
= bidirYuv
[0].getLumaAddr(m_puAbsPartIdx
);
1899 pixel
*pred1
= bidirYuv
[1].getLumaAddr(m_puAbsPartIdx
);
1901 int partEnum
= partitionFromSizes(m_puWidth
, m_puHeight
);
1902 primitives
.pixelavg_pp
[partEnum
](tmpPredYuv
.m_buf
[0], tmpPredYuv
.m_size
, pred0
, bidirYuv
[0].m_size
, pred1
, bidirYuv
[1].m_size
, 32);
1903 int satdCost
= m_me
.bufSATD(tmpPredYuv
.m_buf
[0], tmpPredYuv
.m_size
);
1905 bidirBits
= m_bestME
[0].bits
+ m_bestME
[1].bits
+ m_listSelBits
[2] - (m_listSelBits
[0] + m_listSelBits
[1]);
1906 bidirCost
= satdCost
+ m_rdCost
.getCost(bidirBits
);
1908 bool bTryZero
= m_bestME
[0].mv
.notZero() || m_bestME
[1].mv
.notZero();
1911 /* Do not try zero MV if unidir motion predictors are beyond
1912 * valid search area */
1914 int merange
= X265_MAX(m_param
->sourceWidth
, m_param
->sourceHeight
);
1915 setSearchRange(cu
, mvzero
, merange
, mvmin
, mvmax
);
1916 mvmax
.y
+= 2; // there is some pad for subpel refine
1920 bTryZero
&= m_bestME
[0].mvp
.checkRange(mvmin
, mvmax
);
1921 bTryZero
&= m_bestME
[1].mvp
.checkRange(mvmin
, mvmax
);
1925 // coincident blocks of the two reference pictures
1926 pixel
*ref0
= slice
->m_mref
[0][m_bestME
[0].ref
].fpelPlane
+ (pu
- fencPic
->m_picOrg
[0]);
1927 pixel
*ref1
= slice
->m_mref
[1][m_bestME
[1].ref
].fpelPlane
+ (pu
- fencPic
->m_picOrg
[0]);
1928 intptr_t refStride
= slice
->m_mref
[0][0].lumaStride
;
1930 primitives
.pixelavg_pp
[partEnum
](tmpPredYuv
.m_buf
[0], tmpPredYuv
.m_size
, ref0
, refStride
, ref1
, refStride
, 32);
1931 satdCost
= m_me
.bufSATD(tmpPredYuv
.m_buf
[0], tmpPredYuv
.m_size
);
1933 MV mvp0
= m_bestME
[0].mvp
;
1934 int mvpIdx0
= m_bestME
[0].mvpIdx
;
1935 uint32_t bits0
= m_bestME
[0].bits
- m_me
.bitcost(m_bestME
[0].mv
, mvp0
) + m_me
.bitcost(mvzero
, mvp0
);
1937 MV mvp1
= m_bestME
[1].mvp
;
1938 int mvpIdx1
= m_bestME
[1].mvpIdx
;
1939 uint32_t bits1
= m_bestME
[1].bits
- m_me
.bitcost(m_bestME
[1].mv
, mvp1
) + m_me
.bitcost(mvzero
, mvp1
);
1941 uint32_t cost
= satdCost
+ m_rdCost
.getCost(bits0
) + m_rdCost
.getCost(bits1
);
1945 cu
.fillMvpCand(puIdx
, m_puAbsPartIdx
, 0, m_bestME
[0].ref
, amvpCand
[0][m_bestME
[0].ref
], mvc
);
1946 cu
.fillMvpCand(puIdx
, m_puAbsPartIdx
, 1, m_bestME
[1].ref
, amvpCand
[1][m_bestME
[1].ref
], mvc
);
1949 /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
1950 checkBestMVP(amvpCand
[0][m_bestME
[0].ref
], mvzero
, mvp0
, mvpIdx0
, bits0
, cost
);
1951 checkBestMVP(amvpCand
[1][m_bestME
[1].ref
], mvzero
, mvp1
, mvpIdx1
, bits1
, cost
);
1953 if (cost
< bidirCost
)
1955 bidir
[0].mv
= mvzero
;
1956 bidir
[1].mv
= mvzero
;
1957 bidir
[0].mvp
= mvp0
;
1958 bidir
[1].mvp
= mvp1
;
1959 bidir
[0].mvpIdx
= mvpIdx0
;
1960 bidir
[1].mvpIdx
= mvpIdx1
;
1962 bidirBits
= bits0
+ bits1
+ m_listSelBits
[2] - (m_listSelBits
[0] + m_listSelBits
[1]);
1967 /* select best option and store into CU */
1968 if (mrgCost
< bidirCost
&& mrgCost
< m_bestME
[0].cost
&& mrgCost
< m_bestME
[1].cost
)
1970 cu
.m_mergeFlag
[m_puAbsPartIdx
] = true;
1971 cu
.m_mvpIdx
[0][m_puAbsPartIdx
] = merge
.index
; // merge candidate ID is stored in L0 MVP idx
1972 cu
.setPUInterDir(merge
.interDir
, m_puAbsPartIdx
, puIdx
);
1973 cu
.setPUMv(0, merge
.mvField
[0].mv
, m_puAbsPartIdx
, puIdx
);
1974 cu
.setPURefIdx(0, merge
.mvField
[0].refIdx
, m_puAbsPartIdx
, puIdx
);
1975 cu
.setPUMv(1, merge
.mvField
[1].mv
, m_puAbsPartIdx
, puIdx
);
1976 cu
.setPURefIdx(1, merge
.mvField
[1].refIdx
, m_puAbsPartIdx
, puIdx
);
1978 totalmebits
+= merge
.bits
;
1980 else if (bidirCost
< m_bestME
[0].cost
&& bidirCost
< m_bestME
[1].cost
)
1984 cu
.m_mergeFlag
[m_puAbsPartIdx
] = false;
1985 cu
.setPUInterDir(3, m_puAbsPartIdx
, puIdx
);
1986 cu
.setPUMv(0, bidir
[0].mv
, m_puAbsPartIdx
, puIdx
);
1987 cu
.setPURefIdx(0, m_bestME
[0].ref
, m_puAbsPartIdx
, puIdx
);
1988 cu
.m_mvd
[0][m_puAbsPartIdx
] = bidir
[0].mv
- bidir
[0].mvp
;
1989 cu
.m_mvpIdx
[0][m_puAbsPartIdx
] = bidir
[0].mvpIdx
;
1991 cu
.setPUMv(1, bidir
[1].mv
, m_puAbsPartIdx
, puIdx
);
1992 cu
.setPURefIdx(1, m_bestME
[1].ref
, m_puAbsPartIdx
, puIdx
);
1993 cu
.m_mvd
[1][m_puAbsPartIdx
] = bidir
[1].mv
- bidir
[1].mvp
;
1994 cu
.m_mvpIdx
[1][m_puAbsPartIdx
] = bidir
[1].mvpIdx
;
1996 totalmebits
+= bidirBits
;
1998 else if (m_bestME
[0].cost
<= m_bestME
[1].cost
)
2002 cu
.m_mergeFlag
[m_puAbsPartIdx
] = false;
2003 cu
.setPUInterDir(1, m_puAbsPartIdx
, puIdx
);
2004 cu
.setPUMv(0, m_bestME
[0].mv
, m_puAbsPartIdx
, puIdx
);
2005 cu
.setPURefIdx(0, m_bestME
[0].ref
, m_puAbsPartIdx
, puIdx
);
2006 cu
.m_mvd
[0][m_puAbsPartIdx
] = m_bestME
[0].mv
- m_bestME
[0].mvp
;
2007 cu
.m_mvpIdx
[0][m_puAbsPartIdx
] = m_bestME
[0].mvpIdx
;
2009 cu
.setPURefIdx(1, REF_NOT_VALID
, m_puAbsPartIdx
, puIdx
);
2010 cu
.setPUMv(1, mvzero
, m_puAbsPartIdx
, puIdx
);
2012 totalmebits
+= m_bestME
[0].bits
;
2018 cu
.m_mergeFlag
[m_puAbsPartIdx
] = false;
2019 cu
.setPUInterDir(2, m_puAbsPartIdx
, puIdx
);
2020 cu
.setPUMv(1, m_bestME
[1].mv
, m_puAbsPartIdx
, puIdx
);
2021 cu
.setPURefIdx(1, m_bestME
[1].ref
, m_puAbsPartIdx
, puIdx
);
2022 cu
.m_mvd
[1][m_puAbsPartIdx
] = m_bestME
[1].mv
- m_bestME
[1].mvp
;
2023 cu
.m_mvpIdx
[1][m_puAbsPartIdx
] = m_bestME
[1].mvpIdx
;
2025 cu
.setPURefIdx(0, REF_NOT_VALID
, m_puAbsPartIdx
, puIdx
);
2026 cu
.setPUMv(0, mvzero
, m_puAbsPartIdx
, puIdx
);
2028 totalmebits
+= m_bestME
[1].bits
;
2031 prepMotionCompensation(cu
, cuGeom
, puIdx
);
2032 motionCompensation(*predYuv
, true, bChroma
);
2035 interMode
.sa8dBits
+= totalmebits
;
// Fill blockBit[3] with the estimated signalling bits for choosing {list0, list1, bidir}
// for this PU, given the CU partition mode, slice type, PU index, and the previously coded
// PU's prediction direction (lastMode).
// NOTE(review): garbled extraction — braces and some interior lines are missing (fused-number
// gaps 2044-2046, 2053-2060, 2069-2076, 2082-2086); restore from the original file.
2039 void Search::getBlkBits(PartSize cuMode
, bool bPSlice
, int partIdx
, uint32_t lastMode
, uint32_t blockBit
[3])
2041 if (cuMode
== SIZE_2Nx2N
)
// 2Nx2N: list0 costs 1 bit in P slices, 3 in B slices.
2043 blockBit
[0] = (!bPSlice
) ? 3 : 1;
// NOTE(review): gap 2044-2046 — the blockBit[1]/blockBit[2] assignments are missing here.
2047 else if (cuMode
== SIZE_2NxN
|| cuMode
== SIZE_2NxnU
|| cuMode
== SIZE_2NxnD
)
// Horizontally-split PU shapes: bits indexed by [PU index][previous PU's direction].
2049 static const uint32_t listBits
[2][3][3] =
2051 { { 0, 0, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
2052 { { 5, 7, 7 }, { 7, 5, 7 }, { 9 - 3, 9 - 3, 9 - 3 } }
// NOTE(review): gap 2053-2060 — table close and the P-slice branch are missing here.
2061 memcpy(blockBit
, listBits
[partIdx
][lastMode
], 3 * sizeof(uint32_t));
2063 else if (cuMode
== SIZE_Nx2N
|| cuMode
== SIZE_nLx2N
|| cuMode
== SIZE_nRx2N
)
// Vertically-split PU shapes: same indexing scheme, different table.
2065 static const uint32_t listBits
[2][3][3] =
2067 { { 0, 2, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
2068 { { 5, 7, 7 }, { 7 - 2, 7 - 2, 9 - 2 }, { 9 - 3, 9 - 3, 9 - 3 } }
2077 memcpy(blockBit
, listBits
[partIdx
][lastMode
], 3 * sizeof(uint32_t));
2079 else if (cuMode
== SIZE_NxN
)
// NxN behaves like 2Nx2N for the list0 cost.
2081 blockBit
[0] = (!bPSlice
) ? 3 : 1;
// Unknown partition mode is a programming error.
2087 X265_CHECK(0, "getBlkBits: unknown cuMode\n");
2091 /* Check if using an alternative MVP would result in a smaller MVD + signal bits */
// In/out: mvPred, outMvpIdx, outBits, outCost are all updated in place when the
// alternative AMVP candidate is cheaper for coding 'mv'.
// NOTE(review): garbled extraction — fused-number gap 2099-2102 (the 'if (diffBits < 0)'
// guard and the mvpIdx/mvPred assignments) is missing; restore from the original file.
2092 void Search::checkBestMVP(MV
* amvpCand
, MV mv
, MV
& mvPred
, int& outMvpIdx
, uint32_t& outBits
, uint32_t& outCost
) const
2094 X265_CHECK(amvpCand
[outMvpIdx
] == mvPred
, "checkBestMVP: unexpected mvPred\n");
// The other of the two AMVP candidates (AMVP_NUM_CANDS is 2, so !idx flips 0<->1).
2096 int mvpIdx
= !outMvpIdx
;
2097 MV mvp
= amvpCand
[mvpIdx
];
// Bit delta of coding mv against the alternative predictor vs the current one.
2098 int diffBits
= m_me
.bitcost(mv
, mvp
) - m_me
.bitcost(mv
, mvPred
);
// Recompute outBits/outCost with the bit delta applied; the rate portion of the old
// cost is removed and replaced with the cost of the new bit count.
2103 uint32_t origOutBits
= outBits
;
2104 outBits
= origOutBits
+ diffBits
;
2105 outCost
= (outCost
- m_rdCost
.getCost(origOutBits
)) + m_rdCost
.getCost(outBits
);
// Compute the clipped ME search window (mvmin/mvmax, in quarter-pel units) centered on
// predictor 'mvp' with radius 'merange' pixels.
// NOTE(review): garbled extraction — fused-number gaps 2110-2119 and 2127-2130 (window
// setup from mvp +/- dist and related clipping) are missing; restore from the original file.
2109 void Search::setSearchRange(const CUData
& cu
, MV mvp
, int merange
, MV
& mvmin
, MV
& mvmax
) const
// merange is in full pixels; << 2 converts to quarter-pel units.
2113 MV
dist((int16_t)merange
<< 2, (int16_t)merange
<< 2);
2120 /* Clip search range to signaled maximum MV length.
2121 * We do not support this VUI field being changed from the default */
2122 const int maxMvLen
= (1 << 15) - 1;
2123 mvmin
.x
= X265_MAX(mvmin
.x
, -maxMvLen
);
2124 mvmin
.y
= X265_MAX(mvmin
.y
, -maxMvLen
);
2125 mvmax
.x
= X265_MIN(mvmax
.x
, maxMvLen
);
2126 mvmax
.y
= X265_MIN(mvmax
.y
, maxMvLen
);
2131 /* conditional clipping for frame parallelism */
// Keep vertical motion within the rows already reconstructed by the reference frame's thread.
2132 mvmin
.y
= X265_MIN(mvmin
.y
, (int16_t)m_refLagPixels
);
2133 mvmax
.y
= X265_MIN(mvmax
.y
, (int16_t)m_refLagPixels
);
2136 /* Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
// Evaluate the CU as SKIP: no residual is coded, recon = prediction; distortion is SSE of
// luma + scaled chroma against the source, and the bit cost is just skip/merge signalling.
// NOTE(review): garbled extraction — statements are split across lines and braces stripped;
// restore formatting from the original file before compiling.
2137 void Search::encodeResAndCalcRdSkipCU(Mode
& interMode
)
2139 CUData
& cu
= interMode
.cu
;
2140 Yuv
* reconYuv
= &interMode
.reconYuv
;
2141 const Yuv
* fencYuv
= interMode
.fencYuv
;
2143 X265_CHECK(!cu
.isIntra(0), "intra CU not expected\n");
2145 uint32_t cuSize
= 1 << cu
.m_log2CUSize
[0];
2146 uint32_t depth
= cu
.m_cuDepth
[0];
2148 // No residual coding : SKIP mode
2150 cu
.setSkipFlagSubParts(true);
2152 cu
.setTUDepthSubParts(0, 0, depth
);
// Reconstruction is simply the motion compensated prediction.
2154 reconYuv
->copyFromYuv(interMode
.predYuv
);
// Distortion: luma SSE plus chroma SSE scaled by the chroma lambda weights.
2157 int part
= partitionFromLog2Size(cu
.m_log2CUSize
[0]);
2158 interMode
.distortion
= primitives
.sse_pp
[part
](fencYuv
->m_buf
[0], fencYuv
->m_size
, reconYuv
->m_buf
[0], reconYuv
->m_size
);
2160 part
= partitionFromSizes(cuSize
>> m_hChromaShift
, cuSize
>> m_vChromaShift
);
2161 interMode
.distortion
+= m_rdCost
.scaleChromaDistCb(primitives
.sse_pp
[part
](fencYuv
->m_buf
[1], fencYuv
->m_csize
, reconYuv
->m_buf
[1], reconYuv
->m_csize
));
2162 interMode
.distortion
+= m_rdCost
.scaleChromaDistCr(primitives
.sse_pp
[part
](fencYuv
->m_buf
[2], fencYuv
->m_csize
, reconYuv
->m_buf
[2], reconYuv
->m_csize
));
// Count the bits a SKIP CU would signal: tq-bypass flag (if enabled), skip flag, merge index.
2164 m_entropyCoder
.load(m_rqt
[depth
].cur
);
2165 m_entropyCoder
.resetBits();
2166 if (m_slice
->m_pps
->bTransquantBypassEnabled
)
2167 m_entropyCoder
.codeCUTransquantBypassFlag(cu
.m_tqBypass
[0]);
2168 m_entropyCoder
.codeSkipFlag(cu
, 0);
2169 m_entropyCoder
.codeMergeIndex(cu
, 0);
2171 interMode
.mvBits
= m_entropyCoder
.getNumberOfWrittenBits();
// SKIP codes no coefficients, so all bits are motion/signalling bits.
2172 interMode
.coeffBits
= 0;
2173 interMode
.totalBits
= interMode
.mvBits
;
2174 if (m_rdCost
.m_psyRd
)
2175 interMode
.psyEnergy
= m_rdCost
.psyCost(cu
.m_log2CUSize
[0] - 2, fencYuv
->m_buf
[0], fencYuv
->m_size
, reconYuv
->m_buf
[0], reconYuv
->m_size
);
2177 updateModeCost(interMode
);
// Save the entropy context so the caller can resume coding from this mode's state.
2178 m_entropyCoder
.store(interMode
.contexts
);
2181 /* encode residual and calculate rate-distortion for a CU block.
2182 * Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
// NOTE(review): garbled extraction — statements are split across lines, braces stripped, and
// some interior lines are missing (fused-number gaps, e.g. 2208-2209 Cost declaration,
// 2235-2240 zero-residual commit); restore from the original file before compiling.
2183 void Search::encodeResAndCalcRdInterCU(Mode
& interMode
, const CUGeom
& cuGeom
)
2185 CUData
& cu
= interMode
.cu
;
2186 Yuv
* reconYuv
= &interMode
.reconYuv
;
2187 Yuv
* predYuv
= &interMode
.predYuv
;
2188 ShortYuv
* resiYuv
= &m_rqt
[cuGeom
.depth
].tmpResiYuv
;
2189 const Yuv
* fencYuv
= interMode
.fencYuv
;
2191 X265_CHECK(!cu
.isIntra(0), "intra CU not expected\n");
2193 uint32_t log2CUSize
= cu
.m_log2CUSize
[0];
2194 uint32_t cuSize
= 1 << log2CUSize
;
2195 uint32_t depth
= cu
.m_cuDepth
[0];
// Partition enums for luma (full size) and chroma (shifted by the CSP subsampling).
2197 int part
= partitionFromLog2Size(log2CUSize
);
2198 int cpart
= partitionFromSizes(cuSize
>> m_hChromaShift
, cuSize
>> m_vChromaShift
);
2200 m_quant
.setQPforQuant(interMode
.cu
);
// Residual = source - prediction, for the whole CU.
2202 resiYuv
->subtract(*fencYuv
, *predYuv
, log2CUSize
);
2204 uint32_t tuDepthRange
[2];
2205 cu
.getInterTUQtDepthRange(tuDepthRange
, 0);
2207 m_entropyCoder
.load(m_rqt
[depth
].cur
);
// Recursively choose the best residual quad-tree for this CU.
2210 estimateResidualQT(interMode
, cuGeom
, 0, depth
, *resiYuv
, costs
, tuDepthRange
);
// Unless lossless, also evaluate coding no residual at all (root cbf = 0).
2212 if (!cu
.m_tqBypass
[0])
2214 uint32_t cbf0Dist
= primitives
.sse_pp
[part
](fencYuv
->m_buf
[0], fencYuv
->m_size
, predYuv
->m_buf
[0], predYuv
->m_size
);
2215 cbf0Dist
+= m_rdCost
.scaleChromaDistCb(primitives
.sse_pp
[cpart
](fencYuv
->m_buf
[1], predYuv
->m_csize
, predYuv
->m_buf
[1], predYuv
->m_csize
));
2216 cbf0Dist
+= m_rdCost
.scaleChromaDistCr(primitives
.sse_pp
[cpart
](fencYuv
->m_buf
[2], predYuv
->m_csize
, predYuv
->m_buf
[2], predYuv
->m_csize
));
2218 /* Consider the RD cost of not signaling any residual */
2219 m_entropyCoder
.load(m_rqt
[depth
].cur
);
2220 m_entropyCoder
.resetBits();
2221 m_entropyCoder
.codeQtRootCbfZero();
2222 uint32_t cbf0Bits
= m_entropyCoder
.getNumberOfWrittenBits();
2225 uint32_t cbf0Energy
;
2226 if (m_rdCost
.m_psyRd
)
2228 cbf0Energy
= m_rdCost
.psyCost(log2CUSize
- 2, fencYuv
->m_buf
[0], fencYuv
->m_size
, predYuv
->m_buf
[0], predYuv
->m_size
);
2229 cbf0Cost
= m_rdCost
.calcPsyRdCost(cbf0Dist
, cbf0Bits
, cbf0Energy
);
2232 cbf0Cost
= m_rdCost
.calcRdCost(cbf0Dist
, cbf0Bits
);
// If skipping the residual is cheaper, collapse the TU tree to depth 0.
2234 if (cbf0Cost
< costs
.rdcost
)
// NOTE(review): gap 2235-2240 — clearing of cbf flags before this call is missing here.
2237 cu
.setTUDepthSubParts(0, 0, depth
);
// Persist the chosen RQT coefficients/residual if any cbf survived.
2241 if (cu
.getQtRootCbf(0))
2242 saveResidualQTData(cu
, *resiYuv
, 0, depth
);
2244 /* calculate signal bits for inter/merge/skip coded CU */
2245 m_entropyCoder
.load(m_rqt
[depth
].cur
);
2247 uint32_t coeffBits
, bits
;
// Merge 2Nx2N with no residual is coded as SKIP: only skip flag + merge index.
2248 if (cu
.m_mergeFlag
[0] && cu
.m_partSize
[0] == SIZE_2Nx2N
&& !cu
.getQtRootCbf(0))
2250 cu
.setSkipFlagSubParts(true);
2253 m_entropyCoder
.resetBits();
2254 if (m_slice
->m_pps
->bTransquantBypassEnabled
)
2255 m_entropyCoder
.codeCUTransquantBypassFlag(cu
.m_tqBypass
[0]);
2256 m_entropyCoder
.codeSkipFlag(cu
, 0);
2257 m_entropyCoder
.codeMergeIndex(cu
, 0);
2259 bits
= m_entropyCoder
.getNumberOfWrittenBits();
// Otherwise: full inter CU signalling (pred mode, part size, motion info, coefficients).
2263 m_entropyCoder
.resetBits();
2264 if (m_slice
->m_pps
->bTransquantBypassEnabled
)
2265 m_entropyCoder
.codeCUTransquantBypassFlag(cu
.m_tqBypass
[0]);
2266 m_entropyCoder
.codeSkipFlag(cu
, 0);
2267 m_entropyCoder
.codePredMode(cu
.m_predMode
[0]);
2268 m_entropyCoder
.codePartSize(cu
, 0, cu
.m_cuDepth
[0]);
2269 m_entropyCoder
.codePredInfo(cu
, 0);
2270 uint32_t mvBits
= m_entropyCoder
.getNumberOfWrittenBits();
2272 bool bCodeDQP
= m_slice
->m_pps
->bUseDQP
;
2273 m_entropyCoder
.codeCoeff(cu
, 0, cu
.m_cuDepth
[0], bCodeDQP
, tuDepthRange
);
2274 bits
= m_entropyCoder
.getNumberOfWrittenBits();
2276 coeffBits
= bits
- mvBits
;
2279 m_entropyCoder
.store(interMode
.contexts
);
// Reconstruction: prediction + residual (clipped) when residual exists, else prediction.
2281 if (cu
.getQtRootCbf(0))
2282 reconYuv
->addClip(*predYuv
, *resiYuv
, log2CUSize
);
2284 reconYuv
->copyFromYuv(*predYuv
);
2286 // update with clipped distortion and cost (qp estimation loop uses unclipped values)
2287 uint32_t bestDist
= primitives
.sse_pp
[part
](fencYuv
->m_buf
[0], fencYuv
->m_size
, reconYuv
->m_buf
[0], reconYuv
->m_size
);
2288 bestDist
+= m_rdCost
.scaleChromaDistCb(primitives
.sse_pp
[cpart
](fencYuv
->m_buf
[1], fencYuv
->m_csize
, reconYuv
->m_buf
[1], reconYuv
->m_csize
));
2289 bestDist
+= m_rdCost
.scaleChromaDistCr(primitives
.sse_pp
[cpart
](fencYuv
->m_buf
[2], fencYuv
->m_csize
, reconYuv
->m_buf
[2], reconYuv
->m_csize
));
2290 if (m_rdCost
.m_psyRd
)
2291 interMode
.psyEnergy
= m_rdCost
.psyCost(log2CUSize
- 2, fencYuv
->m_buf
[0], fencYuv
->m_size
, reconYuv
->m_buf
[0], reconYuv
->m_size
);
// Publish the final rate/distortion split back into the mode record.
2293 interMode
.totalBits
= bits
;
2294 interMode
.distortion
= bestDist
;
2295 interMode
.coeffBits
= coeffBits
;
2296 interMode
.mvBits
= bits
- coeffBits
;
2297 updateModeCost(interMode
);
// Regenerate quantized coefficients and the reconstructed CU for an already-decided mode
// (inter or intra), writing the result into mode.reconYuv.
// NOTE(review): garbled extraction — statements are split across lines and braces stripped;
// restore formatting from the original file before compiling.
2300 void Search::generateCoeffRecon(Mode
& mode
, const CUGeom
& cuGeom
)
2302 CUData
& cu
= mode
.cu
;
2304 m_quant
.setQPforQuant(mode
.cu
);
2306 if (cu
.m_predMode
[0] == MODE_INTER
)
// Inter: transform/quantize the residual tree, then recon = pred (+ residual if any cbf).
2308 uint32_t tuDepthRange
[2];
2309 cu
.getInterTUQtDepthRange(tuDepthRange
, 0);
2311 residualTransformQuantInter(mode
, cuGeom
, 0, cu
.m_cuDepth
[0], tuDepthRange
);
2312 if (cu
.getQtRootCbf(0))
2313 mode
.reconYuv
.addClip(mode
.predYuv
, m_rqt
[cuGeom
.depth
].tmpResiYuv
, cu
.m_log2CUSize
[0]);
2316 mode
.reconYuv
.copyFromYuv(mode
.predYuv
);
// Merge 2Nx2N CUs are promoted to SKIP here (residual state decided above).
2317 if (cu
.m_mergeFlag
[0] && cu
.m_partSize
[0] == SIZE_2Nx2N
)
2318 cu
.setSkipFlagSubParts(true);
2321 else if (cu
.m_predMode
[0] == MODE_INTRA
)
// Intra: redo luma then chroma residual coding, then pull recon from the frame's recon picture.
2323 uint32_t tuDepthRange
[2];
2324 cu
.getIntraTUQtDepthRange(tuDepthRange
, 0);
// NxN partitioning starts the luma TU recursion one level deeper.
2326 uint32_t initTrDepth
= cu
.m_partSize
[0] == SIZE_NxN
;
2327 residualTransformQuantIntra(mode
, cuGeom
, initTrDepth
, 0, tuDepthRange
);
2328 getBestIntraModeChroma(mode
, cuGeom
);
2329 residualQTIntraChroma(mode
, cuGeom
, 0, 0);
2330 mode
.reconYuv
.copyFromPicYuv(*m_frame
->m_reconPicYuv
, cu
.m_cuAddr
, cuGeom
.encodeIdx
); // TODO:
// Transform + quantize the inter residual for one TU node, writing coefficients into the CU
// and the dequantized residual back into the RQT scratch buffer; recurses when the TU must
// (or was chosen to) split. Sets/clears cbf flags as it goes.
// NOTE(review): garbled extraction — statements are split across lines, braces stripped, and
// some interior lines are missing (fused-number gaps, e.g. 2344-2348 split dispatch,
// 2380-2382 / 2385-2387 cbf branch bodies); restore from the original file before compiling.
2334 void Search::residualTransformQuantInter(Mode
& mode
, const CUGeom
& cuGeom
, uint32_t absPartIdx
, uint32_t depth
, uint32_t depthRange
[2])
2336 CUData
& cu
= mode
.cu
;
2337 X265_CHECK(cu
.m_cuDepth
[0] == cu
.m_cuDepth
[absPartIdx
], "invalid depth\n");
2339 uint32_t log2TrSize
= g_maxLog2CUSize
- depth
;
2340 uint32_t tuDepth
= depth
- cu
.m_cuDepth
[0];
// Full (non-split) coding is only legal while the TU fits within the allowed size range.
2342 bool bCheckFull
= log2TrSize
<= depthRange
[1];
// Non-2Nx2N PUs at the CU's own depth must split at least once.
2343 if (cu
.m_partSize
[absPartIdx
] != SIZE_2Nx2N
&& depth
== cu
.m_cuDepth
[absPartIdx
] && log2TrSize
> depthRange
[0])
// NOTE(review): gap 2344-2348 — the forced-split dispatch/bCheckFull clearing is missing here.
2349 uint32_t log2TrSizeC
= log2TrSize
- m_hChromaShift
;
2350 bool bCodeChroma
= true;
2351 uint32_t tuDepthC
= tuDepth
;
// 4x4 luma with subsampled chroma: chroma is coded once for the four luma TUs.
2352 if (log2TrSizeC
== 1)
2354 X265_CHECK(log2TrSize
== 2 && m_csp
!= X265_CSP_I444
, "tuQuad check failed\n");
2357 uint32_t qpdiv
= NUM_CU_PARTITIONS
>> ((depth
- 1) << 1);
2358 bCodeChroma
= ((absPartIdx
& (qpdiv
- 1)) == 0);
2361 uint32_t absPartIdxStep
= NUM_CU_PARTITIONS
>> ((cu
.m_cuDepth
[0] + tuDepthC
) << 1);
// Bit position of this TU depth within the per-part cbf bytes.
2362 uint32_t setCbf
= 1 << tuDepth
;
2364 uint32_t coeffOffsetY
= absPartIdx
<< (LOG2_UNIT_SIZE
* 2);
2365 coeff_t
*coeffCurY
= cu
.m_trCoeff
[0] + coeffOffsetY
;
2367 uint32_t sizeIdx
= log2TrSize
- 2;
2369 cu
.setTUDepthSubParts(depth
- cu
.m_cuDepth
[0], absPartIdx
, depth
);
2370 cu
.setTransformSkipSubParts(0, TEXT_LUMA
, absPartIdx
, depth
);
2372 ShortYuv
& resiYuv
= m_rqt
[cuGeom
.depth
].tmpResiYuv
;
2373 const Yuv
* fencYuv
= mode
.fencYuv
;
2375 int16_t *curResiY
= resiYuv
.getLumaAddr(absPartIdx
);
2376 uint32_t strideResiY
= resiYuv
.m_size
;
// Luma: forward transform+quant, then inverse to leave the coded residual in place.
2378 pixel
*fenc
= const_cast<pixel
*>(fencYuv
->getLumaAddr(absPartIdx
));
2379 uint32_t numSigY
= m_quant
.transformNxN(cu
, fenc
, fencYuv
->m_size
, curResiY
, strideResiY
, coeffCurY
, log2TrSize
, TEXT_LUMA
, absPartIdx
, false);
// NOTE(review): gap 2380-2382 — the 'if (numSigY)' guard around the inverse transform is missing.
2383 m_quant
.invtransformNxN(cu
.m_tqBypass
[absPartIdx
], curResiY
, strideResiY
, coeffCurY
, log2TrSize
, TEXT_LUMA
, false, false, numSigY
);
2384 cu
.setCbfSubParts(setCbf
, TEXT_LUMA
, absPartIdx
, depth
);
// No significant luma coefficients: zero the residual and clear the luma cbf.
2388 primitives
.blockfill_s
[sizeIdx
](curResiY
, strideResiY
, 0);
2389 cu
.setCbfSubParts(0, TEXT_LUMA
, absPartIdx
, depth
);
// Chroma: same procedure per plane, iterating sub-TUs for 4:2:2.
2394 uint32_t sizeIdxC
= log2TrSizeC
- 2;
2395 uint32_t strideResiC
= resiYuv
.m_csize
;
2397 uint32_t coeffOffsetC
= coeffOffsetY
>> (m_hChromaShift
+ m_vChromaShift
);
2398 coeff_t
*coeffCurU
= cu
.m_trCoeff
[1] + coeffOffsetC
;
2399 coeff_t
*coeffCurV
= cu
.m_trCoeff
[2] + coeffOffsetC
;
2400 bool splitIntoSubTUs
= (m_csp
== X265_CSP_I422
);
2402 TURecurse
tuIterator(splitIntoSubTUs
? VERTICAL_SPLIT
: DONT_SPLIT
, absPartIdxStep
, absPartIdx
);
2405 uint32_t absPartIdxC
= tuIterator
.absPartIdxTURelCU
;
2406 uint32_t subTUOffset
= tuIterator
.section
<< (log2TrSizeC
* 2);
2408 cu
.setTransformSkipPartRange(0, TEXT_CHROMA_U
, absPartIdxC
, tuIterator
.absPartIdxStep
);
2409 cu
.setTransformSkipPartRange(0, TEXT_CHROMA_V
, absPartIdxC
, tuIterator
.absPartIdxStep
);
// Cb plane.
2411 int16_t* curResiU
= resiYuv
.getCbAddr(absPartIdxC
);
2412 pixel
* fencCb
= const_cast<pixel
*>(fencYuv
->getCbAddr(absPartIdxC
));
2413 uint32_t numSigU
= m_quant
.transformNxN(cu
, fencCb
, fencYuv
->m_csize
, curResiU
, strideResiC
, coeffCurU
+ subTUOffset
, log2TrSizeC
, TEXT_CHROMA_U
, absPartIdxC
, false);
2416 m_quant
.invtransformNxN(cu
.m_tqBypass
[absPartIdxC
], curResiU
, strideResiC
, coeffCurU
+ subTUOffset
, log2TrSizeC
, TEXT_CHROMA_U
, false, false, numSigU
);
2417 cu
.setCbfPartRange(setCbf
, TEXT_CHROMA_U
, absPartIdxC
, tuIterator
.absPartIdxStep
);
2421 primitives
.blockfill_s
[sizeIdxC
](curResiU
, strideResiC
, 0);
2422 cu
.setCbfPartRange(0, TEXT_CHROMA_U
, absPartIdxC
, tuIterator
.absPartIdxStep
);
// Cr plane.
2425 int16_t* curResiV
= resiYuv
.getCrAddr(absPartIdxC
);
2426 pixel
* fencCr
= const_cast<pixel
*>(fencYuv
->getCrAddr(absPartIdxC
));
2427 uint32_t numSigV
= m_quant
.transformNxN(cu
, fencCr
, fencYuv
->m_csize
, curResiV
, strideResiC
, coeffCurV
+ subTUOffset
, log2TrSizeC
, TEXT_CHROMA_V
, absPartIdxC
, false);
2430 m_quant
.invtransformNxN(cu
.m_tqBypass
[absPartIdxC
], curResiV
, strideResiC
, coeffCurV
+ subTUOffset
, log2TrSizeC
, TEXT_CHROMA_V
, false, false, numSigV
);
2431 cu
.setCbfPartRange(setCbf
, TEXT_CHROMA_V
, absPartIdxC
, tuIterator
.absPartIdxStep
);
2435 primitives
.blockfill_s
[sizeIdxC
](curResiV
, strideResiC
, 0);
2436 cu
.setCbfPartRange(0, TEXT_CHROMA_V
, absPartIdxC
, tuIterator
.absPartIdxStep
);
2439 while (tuIterator
.isNextSection());
// 4:2:2 sub-TU cbfs must be merged up to the parent TU level.
2441 if (splitIntoSubTUs
)
2443 offsetSubTUCBFs(cu
, TEXT_CHROMA_U
, tuDepth
, absPartIdx
);
2444 offsetSubTUCBFs(cu
, TEXT_CHROMA_V
, tuDepth
, absPartIdx
);
// Split path: recurse into the four child TUs and OR their cbfs into this level.
2450 X265_CHECK(log2TrSize
> depthRange
[0], "residualTransformQuantInter recursion check failure\n");
2452 const uint32_t qPartNumSubdiv
= NUM_CU_PARTITIONS
>> ((depth
+ 1) << 1);
2453 uint32_t ycbf
= 0, ucbf
= 0, vcbf
= 0;
2454 for (uint32_t i
= 0; i
< 4; i
++)
2456 residualTransformQuantInter(mode
, cuGeom
, absPartIdx
+ i
* qPartNumSubdiv
, depth
+ 1, depthRange
);
2457 ycbf
|= cu
.getCbf(absPartIdx
+ i
* qPartNumSubdiv
, TEXT_LUMA
, tuDepth
+ 1);
2458 ucbf
|= cu
.getCbf(absPartIdx
+ i
* qPartNumSubdiv
, TEXT_CHROMA_U
, tuDepth
+ 1);
2459 vcbf
|= cu
.getCbf(absPartIdx
+ i
* qPartNumSubdiv
, TEXT_CHROMA_V
, tuDepth
+ 1);
// Propagate the children's combined cbfs into every part covered by this TU.
2461 for (uint32_t i
= 0; i
< 4 * qPartNumSubdiv
; i
++)
2463 cu
.m_cbf
[TEXT_LUMA
][absPartIdx
+ i
] |= ycbf
<< tuDepth
;
2464 cu
.m_cbf
[TEXT_CHROMA_U
][absPartIdx
+ i
] |= ucbf
<< tuDepth
;
2465 cu
.m_cbf
[TEXT_CHROMA_V
][absPartIdx
+ i
] |= vcbf
<< tuDepth
;
2470 void Search::estimateResidualQT(Mode
& mode
, const CUGeom
& cuGeom
, uint32_t absPartIdx
, uint32_t depth
, ShortYuv
& resiYuv
, Cost
& outCosts
, uint32_t depthRange
[2])
2472 CUData
& cu
= mode
.cu
;
2473 uint32_t log2TrSize
= g_maxLog2CUSize
- depth
;
2475 bool bCheckSplit
= log2TrSize
> depthRange
[0];
2476 bool bCheckFull
= log2TrSize
<= depthRange
[1];
2478 if (cu
.m_partSize
[absPartIdx
] != SIZE_2Nx2N
&& depth
== cu
.m_cuDepth
[absPartIdx
] && bCheckSplit
)
2481 X265_CHECK(bCheckFull
|| bCheckSplit
, "check-full or check-split must be set\n");
2482 X265_CHECK(cu
.m_cuDepth
[0] == cu
.m_cuDepth
[absPartIdx
], "depth not matching\n");
2484 uint32_t tuDepth
= depth
- cu
.m_cuDepth
[0];
2485 uint32_t log2TrSizeC
= log2TrSize
- m_hChromaShift
;
2486 bool bCodeChroma
= true;
2487 uint32_t tuDepthC
= tuDepth
;
2488 if ((log2TrSize
== 2) && !(m_csp
== X265_CSP_I444
))
2492 uint32_t qpdiv
= NUM_CU_PARTITIONS
>> ((depth
- 1) << 1);
2493 bCodeChroma
= ((absPartIdx
& (qpdiv
- 1)) == 0);
2498 fullCost
.rdcost
= MAX_INT64
;
2500 uint8_t cbfFlag
[MAX_NUM_COMPONENT
][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
2501 uint32_t numSig
[MAX_NUM_COMPONENT
][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
2502 uint32_t singleBitsComp
[MAX_NUM_COMPONENT
][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2503 uint32_t singleDistComp
[MAX_NUM_COMPONENT
][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2504 uint32_t singlePsyEnergyComp
[MAX_NUM_COMPONENT
][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2505 uint32_t bestTransformMode
[MAX_NUM_COMPONENT
][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2506 uint64_t minCost
[MAX_NUM_COMPONENT
][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64
, MAX_INT64
}, {MAX_INT64
, MAX_INT64
}, {MAX_INT64
, MAX_INT64
} };
2508 m_entropyCoder
.store(m_rqt
[depth
].rqtRoot
);
2510 uint32_t trSize
= 1 << log2TrSize
;
2511 const bool splitIntoSubTUs
= (m_csp
== X265_CSP_I422
);
2512 uint32_t absPartIdxStep
= NUM_CU_PARTITIONS
>> ((cu
.m_cuDepth
[0] + tuDepthC
) << 1);
2513 const Yuv
* fencYuv
= mode
.fencYuv
;
2518 uint32_t trSizeC
= 1 << log2TrSizeC
;
2519 int partSize
= partitionFromLog2Size(log2TrSize
);
2520 int partSizeC
= partitionFromLog2Size(log2TrSizeC
);
2521 const uint32_t qtLayer
= log2TrSize
- 2;
2522 uint32_t coeffOffsetY
= absPartIdx
<< (LOG2_UNIT_SIZE
* 2);
2523 coeff_t
* coeffCurY
= m_rqt
[qtLayer
].coeffRQT
[0] + coeffOffsetY
;
2525 bool checkTransformSkip
= m_slice
->m_pps
->bTransformSkipEnabled
&& !cu
.m_tqBypass
[0];
2526 bool checkTransformSkipY
= checkTransformSkip
&& log2TrSize
<= MAX_LOG2_TS_SIZE
;
2527 bool checkTransformSkipC
= checkTransformSkip
&& log2TrSizeC
<= MAX_LOG2_TS_SIZE
;
2529 cu
.setTUDepthSubParts(depth
- cu
.m_cuDepth
[0], absPartIdx
, depth
);
2530 cu
.setTransformSkipSubParts(0, TEXT_LUMA
, absPartIdx
, depth
);
2533 m_entropyCoder
.estBit(m_entropyCoder
.m_estBitsSbac
, log2TrSize
, true);
2535 pixel
*fenc
= const_cast<pixel
*>(fencYuv
->getLumaAddr(absPartIdx
));
2536 int16_t *resi
= resiYuv
.getLumaAddr(absPartIdx
);
2537 numSig
[TEXT_LUMA
][0] = m_quant
.transformNxN(cu
, fenc
, fencYuv
->m_size
, resi
, resiYuv
.m_size
, coeffCurY
, log2TrSize
, TEXT_LUMA
, absPartIdx
, false);
2538 cbfFlag
[TEXT_LUMA
][0] = !!numSig
[TEXT_LUMA
][0];
2540 m_entropyCoder
.resetBits();
2541 m_entropyCoder
.codeQtCbf(cbfFlag
[TEXT_LUMA
][0], TEXT_LUMA
, tuDepth
);
2542 if (cbfFlag
[TEXT_LUMA
][0])
2543 m_entropyCoder
.codeCoeffNxN(cu
, coeffCurY
, absPartIdx
, log2TrSize
, TEXT_LUMA
);
2544 singleBitsComp
[TEXT_LUMA
][0] = m_entropyCoder
.getNumberOfWrittenBits();
2546 uint32_t singleBitsPrev
= singleBitsComp
[TEXT_LUMA
][0];
2550 uint32_t coeffOffsetC
= coeffOffsetY
>> (m_hChromaShift
+ m_vChromaShift
);
2551 for (uint32_t chromaId
= TEXT_CHROMA_U
; chromaId
<= TEXT_CHROMA_V
; chromaId
++)
2553 coeff_t
* coeffCurC
= m_rqt
[qtLayer
].coeffRQT
[chromaId
] + coeffOffsetC
;
2554 TURecurse
tuIterator(splitIntoSubTUs
? VERTICAL_SPLIT
: DONT_SPLIT
, absPartIdxStep
, absPartIdx
);
2558 uint32_t absPartIdxC
= tuIterator
.absPartIdxTURelCU
;
2559 uint32_t subTUOffset
= tuIterator
.section
<< (log2TrSizeC
* 2);
2561 cu
.setTransformSkipPartRange(0, (TextType
)chromaId
, absPartIdxC
, tuIterator
.absPartIdxStep
);
2563 if (m_bEnableRDOQ
&& (chromaId
!= TEXT_CHROMA_V
))
2564 m_entropyCoder
.estBit(m_entropyCoder
.m_estBitsSbac
, log2TrSizeC
, false);
2566 fenc
= const_cast<pixel
*>(fencYuv
->getChromaAddr(chromaId
, absPartIdxC
));
2567 resi
= resiYuv
.getChromaAddr(chromaId
, absPartIdxC
);
2568 numSig
[chromaId
][tuIterator
.section
] = m_quant
.transformNxN(cu
, fenc
, fencYuv
->m_csize
, resi
, resiYuv
.m_csize
, coeffCurC
+ subTUOffset
, log2TrSizeC
, (TextType
)chromaId
, absPartIdxC
, false);
2569 cbfFlag
[chromaId
][tuIterator
.section
] = !!numSig
[chromaId
][tuIterator
.section
];
2571 m_entropyCoder
.codeQtCbf(cbfFlag
[chromaId
][tuIterator
.section
], (TextType
)chromaId
, tuDepth
);
2572 if (cbfFlag
[chromaId
][tuIterator
.section
])
2573 m_entropyCoder
.codeCoeffNxN(cu
, coeffCurC
+ subTUOffset
, absPartIdxC
, log2TrSizeC
, (TextType
)chromaId
);
2575 uint32_t newBits
= m_entropyCoder
.getNumberOfWrittenBits();
2576 singleBitsComp
[chromaId
][tuIterator
.section
] = newBits
- singleBitsPrev
;
2578 singleBitsPrev
= newBits
;
2580 while (tuIterator
.isNextSection());
2584 const uint32_t numCoeffY
= 1 << (log2TrSize
* 2);
2585 const uint32_t numCoeffC
= 1 << (log2TrSizeC
* 2);
2587 X265_CHECK(log2TrSize
<= 5, "log2TrSize is too large\n");
2588 uint32_t distY
= primitives
.ssd_s
[partSize
](resiYuv
.getLumaAddr(absPartIdx
), resiYuv
.m_size
);
2589 uint32_t psyEnergyY
= 0;
2590 if (m_rdCost
.m_psyRd
)
2591 psyEnergyY
= m_rdCost
.psyCost(partSize
, resiYuv
.getLumaAddr(absPartIdx
), resiYuv
.m_size
, (int16_t*)zeroShort
, 0);
2593 int16_t *curResiY
= m_rqt
[qtLayer
].resiQtYuv
.getLumaAddr(absPartIdx
);
2594 uint32_t strideResiY
= m_rqt
[qtLayer
].resiQtYuv
.m_size
;
2596 if (cbfFlag
[TEXT_LUMA
][0])
2598 m_quant
.invtransformNxN(cu
.m_tqBypass
[absPartIdx
], curResiY
, strideResiY
, coeffCurY
, log2TrSize
, TEXT_LUMA
, false, false, numSig
[TEXT_LUMA
][0]); //this is for inter mode only
2600 const uint32_t nonZeroDistY
= primitives
.sse_ss
[partSize
](resiYuv
.getLumaAddr(absPartIdx
), resiYuv
.m_size
, curResiY
, strideResiY
);
2601 uint32_t nonZeroPsyEnergyY
= 0;
2602 if (m_rdCost
.m_psyRd
)
2603 nonZeroPsyEnergyY
= m_rdCost
.psyCost(partSize
, resiYuv
.getLumaAddr(absPartIdx
), resiYuv
.m_size
, curResiY
, strideResiY
);
2605 if (cu
.m_tqBypass
[0])
2607 distY
= nonZeroDistY
;
2608 psyEnergyY
= nonZeroPsyEnergyY
;
2612 uint64_t singleCostY
= 0;
2613 if (m_rdCost
.m_psyRd
)
2614 singleCostY
= m_rdCost
.calcPsyRdCost(nonZeroDistY
, singleBitsComp
[TEXT_LUMA
][0], nonZeroPsyEnergyY
);
2616 singleCostY
= m_rdCost
.calcRdCost(nonZeroDistY
, singleBitsComp
[TEXT_LUMA
][0]);
2617 m_entropyCoder
.resetBits();
2618 m_entropyCoder
.codeQtCbfZero(TEXT_LUMA
, tuDepth
);
2619 const uint32_t nullBitsY
= m_entropyCoder
.getNumberOfWrittenBits();
2620 uint64_t nullCostY
= 0;
2621 if (m_rdCost
.m_psyRd
)
2622 nullCostY
= m_rdCost
.calcPsyRdCost(distY
, nullBitsY
, psyEnergyY
);
2624 nullCostY
= m_rdCost
.calcRdCost(distY
, nullBitsY
);
2625 if (nullCostY
< singleCostY
)
2627 cbfFlag
[TEXT_LUMA
][0] = 0;
2628 #if CHECKED_BUILD || _DEBUG
2629 memset(coeffCurY
, 0, sizeof(coeff_t
) * numCoeffY
);
2631 if (checkTransformSkipY
)
2632 minCost
[TEXT_LUMA
][0] = nullCostY
;
2636 distY
= nonZeroDistY
;
2637 psyEnergyY
= nonZeroPsyEnergyY
;
2638 if (checkTransformSkipY
)
2639 minCost
[TEXT_LUMA
][0] = singleCostY
;
2643 else if (checkTransformSkipY
)
2645 m_entropyCoder
.resetBits();
2646 m_entropyCoder
.codeQtCbfZero(TEXT_LUMA
, tuDepth
);
2647 const uint32_t nullBitsY
= m_entropyCoder
.getNumberOfWrittenBits();
2648 if (m_rdCost
.m_psyRd
)
2649 minCost
[TEXT_LUMA
][0] = m_rdCost
.calcPsyRdCost(distY
, nullBitsY
, psyEnergyY
);
2651 minCost
[TEXT_LUMA
][0] = m_rdCost
.calcRdCost(distY
, nullBitsY
);
2654 singleDistComp
[TEXT_LUMA
][0] = distY
;
2655 singlePsyEnergyComp
[TEXT_LUMA
][0] = psyEnergyY
;
2656 if (!cbfFlag
[TEXT_LUMA
][0])
2657 primitives
.blockfill_s
[partSize
](curResiY
, strideResiY
, 0);
2658 cu
.setCbfSubParts(cbfFlag
[TEXT_LUMA
][0] << tuDepth
, TEXT_LUMA
, absPartIdx
, depth
);
2662 uint32_t strideResiC
= m_rqt
[qtLayer
].resiQtYuv
.m_csize
;
2663 uint32_t coeffOffsetC
= coeffOffsetY
>> (m_hChromaShift
+ m_vChromaShift
);
2664 for (uint32_t chromaId
= TEXT_CHROMA_U
; chromaId
<= TEXT_CHROMA_V
; chromaId
++)
2666 uint32_t distC
= 0, psyEnergyC
= 0;
2667 coeff_t
* coeffCurC
= m_rqt
[qtLayer
].coeffRQT
[chromaId
] + coeffOffsetC
;
2668 TURecurse
tuIterator(splitIntoSubTUs
? VERTICAL_SPLIT
: DONT_SPLIT
, absPartIdxStep
, absPartIdx
);
2672 uint32_t absPartIdxC
= tuIterator
.absPartIdxTURelCU
;
2673 uint32_t subTUOffset
= tuIterator
.section
<< (log2TrSizeC
* 2);
2675 int16_t *curResiC
= m_rqt
[qtLayer
].resiQtYuv
.getChromaAddr(chromaId
, absPartIdxC
);
2677 distC
= m_rdCost
.scaleChromaDistCb(primitives
.ssd_s
[log2TrSizeC
- 2](resiYuv
.getChromaAddr(chromaId
, absPartIdxC
), resiYuv
.m_csize
));
2679 if (cbfFlag
[chromaId
][tuIterator
.section
])
2681 m_quant
.invtransformNxN(cu
.m_tqBypass
[absPartIdxC
], curResiC
, strideResiC
, coeffCurC
+ subTUOffset
,
2682 log2TrSizeC
, (TextType
)chromaId
, false, false, numSig
[chromaId
][tuIterator
.section
]);
2683 uint32_t dist
= primitives
.sse_ss
[partSizeC
](resiYuv
.getChromaAddr(chromaId
, absPartIdxC
), resiYuv
.m_csize
, curResiC
, strideResiC
);
2684 const uint32_t nonZeroDistC
= m_rdCost
.scaleChromaDistCb(dist
);
2685 uint32_t nonZeroPsyEnergyC
= 0;
2686 if (m_rdCost
.m_psyRd
)
2687 nonZeroPsyEnergyC
= m_rdCost
.psyCost(partSizeC
, resiYuv
.getChromaAddr(chromaId
, absPartIdxC
), resiYuv
.m_csize
, curResiC
, strideResiC
);
2689 if (cu
.m_tqBypass
[0])
2691 distC
= nonZeroDistC
;
2692 psyEnergyC
= nonZeroPsyEnergyC
;
2696 uint64_t singleCostC
= 0;
2697 if (m_rdCost
.m_psyRd
)
2698 singleCostC
= m_rdCost
.calcPsyRdCost(nonZeroDistC
, singleBitsComp
[chromaId
][tuIterator
.section
], nonZeroPsyEnergyC
);
2700 singleCostC
= m_rdCost
.calcRdCost(nonZeroDistC
, singleBitsComp
[chromaId
][tuIterator
.section
]);
2701 m_entropyCoder
.resetBits();
2702 m_entropyCoder
.codeQtCbfZero((TextType
)chromaId
, tuDepth
);
2703 const uint32_t nullBitsC
= m_entropyCoder
.getNumberOfWrittenBits();
2704 uint64_t nullCostC
= 0;
2705 if (m_rdCost
.m_psyRd
)
2706 nullCostC
= m_rdCost
.calcPsyRdCost(distC
, nullBitsC
, psyEnergyC
);
2708 nullCostC
= m_rdCost
.calcRdCost(distC
, nullBitsC
);
2709 if (nullCostC
< singleCostC
)
2711 cbfFlag
[chromaId
][tuIterator
.section
] = 0;
2712 #if CHECKED_BUILD || _DEBUG
2713 memset(coeffCurC
+ subTUOffset
, 0, sizeof(coeff_t
) * numCoeffC
);
2715 if (checkTransformSkipC
)
2716 minCost
[chromaId
][tuIterator
.section
] = nullCostC
;
2720 distC
= nonZeroDistC
;
2721 psyEnergyC
= nonZeroPsyEnergyC
;
2722 if (checkTransformSkipC
)
2723 minCost
[chromaId
][tuIterator
.section
] = singleCostC
;
2727 else if (checkTransformSkipC
)
2729 m_entropyCoder
.resetBits();
2730 m_entropyCoder
.codeQtCbfZero((TextType
)chromaId
, tuDepthC
);
2731 const uint32_t nullBitsC
= m_entropyCoder
.getNumberOfWrittenBits();
2732 if (m_rdCost
.m_psyRd
)
2733 minCost
[chromaId
][tuIterator
.section
] = m_rdCost
.calcPsyRdCost(distC
, nullBitsC
, psyEnergyC
);
2735 minCost
[chromaId
][tuIterator
.section
] = m_rdCost
.calcRdCost(distC
, nullBitsC
);
2738 singleDistComp
[chromaId
][tuIterator
.section
] = distC
;
2739 singlePsyEnergyComp
[chromaId
][tuIterator
.section
] = psyEnergyC
;
2741 if (!cbfFlag
[chromaId
][tuIterator
.section
])
2742 primitives
.blockfill_s
[partSizeC
](curResiC
, strideResiC
, 0);
2744 cu
.setCbfPartRange(cbfFlag
[chromaId
][tuIterator
.section
] << tuDepth
, (TextType
)chromaId
, absPartIdxC
, tuIterator
.absPartIdxStep
);
2746 while (tuIterator
.isNextSection());
2750 if (checkTransformSkipY
)
2752 uint32_t nonZeroDistY
= 0;
2753 uint32_t nonZeroPsyEnergyY
= 0;
2754 uint64_t singleCostY
= MAX_INT64
;
2756 ALIGN_VAR_32(coeff_t
, tsCoeffY
[MAX_TS_SIZE
* MAX_TS_SIZE
]);
2757 ALIGN_VAR_32(int16_t, tsResiY
[MAX_TS_SIZE
* MAX_TS_SIZE
]);
2759 m_entropyCoder
.load(m_rqt
[depth
].rqtRoot
);
2761 cu
.setTransformSkipSubParts(1, TEXT_LUMA
, absPartIdx
, depth
);
2764 m_entropyCoder
.estBit(m_entropyCoder
.m_estBitsSbac
, log2TrSize
, true);
2766 fenc
= const_cast<pixel
*>(fencYuv
->getLumaAddr(absPartIdx
));
2767 resi
= resiYuv
.getLumaAddr(absPartIdx
);
2768 uint32_t numSigTSkipY
= m_quant
.transformNxN(cu
, fenc
, fencYuv
->m_size
, resi
, resiYuv
.m_size
, tsCoeffY
, log2TrSize
, TEXT_LUMA
, absPartIdx
, true);
2772 m_entropyCoder
.resetBits();
2773 m_entropyCoder
.codeQtCbf(!!numSigTSkipY
, TEXT_LUMA
, tuDepth
);
2774 m_entropyCoder
.codeCoeffNxN(cu
, tsCoeffY
, absPartIdx
, log2TrSize
, TEXT_LUMA
);
2775 const uint32_t skipSingleBitsY
= m_entropyCoder
.getNumberOfWrittenBits();
2777 m_quant
.invtransformNxN(cu
.m_tqBypass
[absPartIdx
], tsResiY
, trSize
, tsCoeffY
, log2TrSize
, TEXT_LUMA
, false, true, numSigTSkipY
);
2779 nonZeroDistY
= primitives
.sse_ss
[partSize
](resiYuv
.getLumaAddr(absPartIdx
), resiYuv
.m_size
, tsResiY
, trSize
);
2781 if (m_rdCost
.m_psyRd
)
2783 nonZeroPsyEnergyY
= m_rdCost
.psyCost(partSize
, resiYuv
.getLumaAddr(absPartIdx
), resiYuv
.m_size
, tsResiY
, trSize
);
2784 singleCostY
= m_rdCost
.calcPsyRdCost(nonZeroDistY
, skipSingleBitsY
, nonZeroPsyEnergyY
);
2787 singleCostY
= m_rdCost
.calcRdCost(nonZeroDistY
, skipSingleBitsY
);
2790 if (!numSigTSkipY
|| minCost
[TEXT_LUMA
][0] < singleCostY
)
2791 cu
.setTransformSkipSubParts(0, TEXT_LUMA
, absPartIdx
, depth
);
2794 singleDistComp
[TEXT_LUMA
][0] = nonZeroDistY
;
2795 singlePsyEnergyComp
[TEXT_LUMA
][0] = nonZeroPsyEnergyY
;
2796 cbfFlag
[TEXT_LUMA
][0] = !!numSigTSkipY
;
2797 bestTransformMode
[TEXT_LUMA
][0] = 1;
2798 memcpy(coeffCurY
, tsCoeffY
, sizeof(coeff_t
) * numCoeffY
);
2799 primitives
.square_copy_ss
[partSize
](curResiY
, strideResiY
, tsResiY
, trSize
);
2802 cu
.setCbfSubParts(cbfFlag
[TEXT_LUMA
][0] << tuDepth
, TEXT_LUMA
, absPartIdx
, depth
);
2805 if (bCodeChroma
&& checkTransformSkipC
)
2807 uint32_t nonZeroDistC
= 0, nonZeroPsyEnergyC
= 0;
2808 uint64_t singleCostC
= MAX_INT64
;
2809 uint32_t strideResiC
= m_rqt
[qtLayer
].resiQtYuv
.m_csize
;
2810 uint32_t coeffOffsetC
= coeffOffsetY
>> (m_hChromaShift
+ m_vChromaShift
);
2812 m_entropyCoder
.load(m_rqt
[depth
].rqtRoot
);
2814 for (uint32_t chromaId
= TEXT_CHROMA_U
; chromaId
<= TEXT_CHROMA_V
; chromaId
++)
2816 coeff_t
* coeffCurC
= m_rqt
[qtLayer
].coeffRQT
[chromaId
] + coeffOffsetC
;
2817 TURecurse
tuIterator(splitIntoSubTUs
? VERTICAL_SPLIT
: DONT_SPLIT
, absPartIdxStep
, absPartIdx
);
2821 uint32_t absPartIdxC
= tuIterator
.absPartIdxTURelCU
;
2822 uint32_t subTUOffset
= tuIterator
.section
<< (log2TrSizeC
* 2);
2824 int16_t *curResiC
= m_rqt
[qtLayer
].resiQtYuv
.getChromaAddr(chromaId
, absPartIdxC
);
2826 ALIGN_VAR_32(coeff_t
, tsCoeffC
[MAX_TS_SIZE
* MAX_TS_SIZE
]);
2827 ALIGN_VAR_32(int16_t, tsResiC
[MAX_TS_SIZE
* MAX_TS_SIZE
]);
2829 cu
.setTransformSkipPartRange(1, (TextType
)chromaId
, absPartIdxC
, tuIterator
.absPartIdxStep
);
2831 if (m_bEnableRDOQ
&& (chromaId
!= TEXT_CHROMA_V
))
2832 m_entropyCoder
.estBit(m_entropyCoder
.m_estBitsSbac
, log2TrSizeC
, false);
2834 fenc
= const_cast<pixel
*>(fencYuv
->getChromaAddr(chromaId
, absPartIdxC
));
2835 resi
= resiYuv
.getChromaAddr(chromaId
, absPartIdxC
);
2836 uint32_t numSigTSkipC
= m_quant
.transformNxN(cu
, fenc
, fencYuv
->m_csize
, resi
, resiYuv
.m_csize
, tsCoeffC
, log2TrSizeC
, (TextType
)chromaId
, absPartIdxC
, true);
2838 m_entropyCoder
.resetBits();
2839 singleBitsComp
[chromaId
][tuIterator
.section
] = 0;
2843 m_entropyCoder
.codeQtCbf(!!numSigTSkipC
, (TextType
)chromaId
, tuDepth
);
2844 m_entropyCoder
.codeCoeffNxN(cu
, tsCoeffC
, absPartIdxC
, log2TrSizeC
, (TextType
)chromaId
);
2845 singleBitsComp
[chromaId
][tuIterator
.section
] = m_entropyCoder
.getNumberOfWrittenBits();
2847 m_quant
.invtransformNxN(cu
.m_tqBypass
[absPartIdxC
], tsResiC
, trSizeC
, tsCoeffC
,
2848 log2TrSizeC
, (TextType
)chromaId
, false, true, numSigTSkipC
);
2849 uint32_t dist
= primitives
.sse_ss
[partSizeC
](resiYuv
.getChromaAddr(chromaId
, absPartIdxC
), resiYuv
.m_csize
, tsResiC
, trSizeC
);
2850 nonZeroDistC
= m_rdCost
.scaleChromaDistCb(dist
);
2851 if (m_rdCost
.m_psyRd
)
2853 nonZeroPsyEnergyC
= m_rdCost
.psyCost(partSizeC
, resiYuv
.getChromaAddr(chromaId
, absPartIdxC
), resiYuv
.m_csize
, tsResiC
, trSizeC
);
2854 singleCostC
= m_rdCost
.calcPsyRdCost(nonZeroDistC
, singleBitsComp
[chromaId
][tuIterator
.section
], nonZeroPsyEnergyC
);
2857 singleCostC
= m_rdCost
.calcRdCost(nonZeroDistC
, singleBitsComp
[chromaId
][tuIterator
.section
]);
2860 if (!numSigTSkipC
|| minCost
[chromaId
][tuIterator
.section
] < singleCostC
)
2861 cu
.setTransformSkipPartRange(0, (TextType
)chromaId
, absPartIdxC
, tuIterator
.absPartIdxStep
);
2864 singleDistComp
[chromaId
][tuIterator
.section
] = nonZeroDistC
;
2865 singlePsyEnergyComp
[chromaId
][tuIterator
.section
] = nonZeroPsyEnergyC
;
2866 cbfFlag
[chromaId
][tuIterator
.section
] = !!numSigTSkipC
;
2867 bestTransformMode
[chromaId
][tuIterator
.section
] = 1;
2868 memcpy(coeffCurC
+ subTUOffset
, tsCoeffC
, sizeof(coeff_t
) * numCoeffC
);
2869 primitives
.square_copy_ss
[partSizeC
](curResiC
, strideResiC
, tsResiC
, trSizeC
);
2872 cu
.setCbfPartRange(cbfFlag
[chromaId
][tuIterator
.section
] << tuDepth
, (TextType
)chromaId
, absPartIdxC
, tuIterator
.absPartIdxStep
);
2874 while (tuIterator
.isNextSection());
2878 m_entropyCoder
.load(m_rqt
[depth
].rqtRoot
);
2880 m_entropyCoder
.resetBits();
2882 if (log2TrSize
> depthRange
[0])
2883 m_entropyCoder
.codeTransformSubdivFlag(0, 5 - log2TrSize
);
2887 for (uint32_t chromaId
= TEXT_CHROMA_U
; chromaId
<= TEXT_CHROMA_V
; chromaId
++)
2889 if (!splitIntoSubTUs
)
2890 m_entropyCoder
.codeQtCbf(cbfFlag
[chromaId
][0], (TextType
)chromaId
, tuDepth
);
2893 offsetSubTUCBFs(cu
, (TextType
)chromaId
, tuDepth
, absPartIdx
);
2894 for (uint32_t subTU
= 0; subTU
< 2; subTU
++)
2895 m_entropyCoder
.codeQtCbf(cbfFlag
[chromaId
][subTU
], (TextType
)chromaId
, tuDepth
);
2900 m_entropyCoder
.codeQtCbf(cbfFlag
[TEXT_LUMA
][0], TEXT_LUMA
, tuDepth
);
2901 if (cbfFlag
[TEXT_LUMA
][0])
2902 m_entropyCoder
.codeCoeffNxN(cu
, coeffCurY
, absPartIdx
, log2TrSize
, TEXT_LUMA
);
2906 uint32_t subTUSize
= 1 << (log2TrSizeC
* 2);
2907 uint32_t partIdxesPerSubTU
= absPartIdxStep
>> 1;
2908 uint32_t coeffOffsetC
= coeffOffsetY
>> (m_hChromaShift
+ m_vChromaShift
);
2910 for (uint32_t chromaId
= TEXT_CHROMA_U
; chromaId
<= TEXT_CHROMA_V
; chromaId
++)
2912 coeff_t
* coeffCurC
= m_rqt
[qtLayer
].coeffRQT
[chromaId
] + coeffOffsetC
;
2913 if (!splitIntoSubTUs
)
2915 if (cbfFlag
[chromaId
][0])
2916 m_entropyCoder
.codeCoeffNxN(cu
, coeffCurC
, absPartIdx
, log2TrSizeC
, (TextType
)chromaId
);
2920 for (uint32_t subTU
= 0; subTU
< 2; subTU
++)
2922 if (cbfFlag
[chromaId
][subTU
])
2923 m_entropyCoder
.codeCoeffNxN(cu
, coeffCurC
+ subTU
* subTUSize
, absPartIdx
+ subTU
* partIdxesPerSubTU
, log2TrSizeC
, (TextType
)chromaId
);
2929 fullCost
.distortion
+= singleDistComp
[TEXT_LUMA
][0];
2930 fullCost
.energy
+= singlePsyEnergyComp
[TEXT_LUMA
][0];// need to check we need to add chroma also
2931 for (uint32_t subTUIndex
= 0; subTUIndex
< 2; subTUIndex
++)
2933 fullCost
.distortion
+= singleDistComp
[TEXT_CHROMA_U
][subTUIndex
];
2934 fullCost
.distortion
+= singleDistComp
[TEXT_CHROMA_V
][subTUIndex
];
2937 fullCost
.bits
= m_entropyCoder
.getNumberOfWrittenBits();
2938 if (m_rdCost
.m_psyRd
)
2939 fullCost
.rdcost
= m_rdCost
.calcPsyRdCost(fullCost
.distortion
, fullCost
.bits
, fullCost
.energy
);
2941 fullCost
.rdcost
= m_rdCost
.calcRdCost(fullCost
.distortion
, fullCost
.bits
);
2949 m_entropyCoder
.store(m_rqt
[depth
].rqtTest
);
2950 m_entropyCoder
.load(m_rqt
[depth
].rqtRoot
);
2954 const uint32_t qPartNumSubdiv
= NUM_CU_PARTITIONS
>> ((depth
+ 1) << 1);
2955 uint32_t ycbf
= 0, ucbf
= 0, vcbf
= 0;
2956 for (uint32_t i
= 0; i
< 4; ++i
)
2958 estimateResidualQT(mode
, cuGeom
, absPartIdx
+ i
* qPartNumSubdiv
, depth
+ 1, resiYuv
, splitCost
, depthRange
);
2959 ycbf
|= cu
.getCbf(absPartIdx
+ i
* qPartNumSubdiv
, TEXT_LUMA
, tuDepth
+ 1);
2960 ucbf
|= cu
.getCbf(absPartIdx
+ i
* qPartNumSubdiv
, TEXT_CHROMA_U
, tuDepth
+ 1);
2961 vcbf
|= cu
.getCbf(absPartIdx
+ i
* qPartNumSubdiv
, TEXT_CHROMA_V
, tuDepth
+ 1);
2963 for (uint32_t i
= 0; i
< 4 * qPartNumSubdiv
; ++i
)
2965 cu
.m_cbf
[0][absPartIdx
+ i
] |= ycbf
<< tuDepth
;
2966 cu
.m_cbf
[1][absPartIdx
+ i
] |= ucbf
<< tuDepth
;
2967 cu
.m_cbf
[2][absPartIdx
+ i
] |= vcbf
<< tuDepth
;
2970 m_entropyCoder
.load(m_rqt
[depth
].rqtRoot
);
2971 m_entropyCoder
.resetBits();
2973 encodeResidualQT(cu
, absPartIdx
, depth
, true, TEXT_LUMA
, depthRange
);
2974 encodeResidualQT(cu
, absPartIdx
, depth
, false, TEXT_LUMA
, depthRange
);
2975 encodeResidualQT(cu
, absPartIdx
, depth
, false, TEXT_CHROMA_U
, depthRange
);
2976 encodeResidualQT(cu
, absPartIdx
, depth
, false, TEXT_CHROMA_V
, depthRange
);
2978 splitCost
.bits
= m_entropyCoder
.getNumberOfWrittenBits();
2980 if (m_rdCost
.m_psyRd
)
2981 splitCost
.rdcost
= m_rdCost
.calcPsyRdCost(splitCost
.distortion
, splitCost
.bits
, splitCost
.energy
);
2983 splitCost
.rdcost
= m_rdCost
.calcRdCost(splitCost
.distortion
, splitCost
.bits
);
2985 if (ycbf
|| ucbf
|| vcbf
|| !bCheckFull
)
2987 if (splitCost
.rdcost
< fullCost
.rdcost
)
2989 outCosts
.distortion
+= splitCost
.distortion
;
2990 outCosts
.rdcost
+= splitCost
.rdcost
;
2991 outCosts
.bits
+= splitCost
.bits
;
2992 outCosts
.energy
+= splitCost
.energy
;
2996 outCosts
.energy
+= splitCost
.energy
;
2999 cu
.setTransformSkipSubParts(bestTransformMode
[TEXT_LUMA
][0], TEXT_LUMA
, absPartIdx
, depth
);
3002 const uint32_t numberOfSections
= splitIntoSubTUs
? 2 : 1;
3004 uint32_t partIdxesPerSubTU
= absPartIdxStep
>> (splitIntoSubTUs
? 1 : 0);
3005 for (uint32_t subTUIndex
= 0; subTUIndex
< numberOfSections
; subTUIndex
++)
3007 const uint32_t subTUPartIdx
= absPartIdx
+ (subTUIndex
* partIdxesPerSubTU
);
3009 cu
.setTransformSkipPartRange(bestTransformMode
[TEXT_CHROMA_U
][subTUIndex
], TEXT_CHROMA_U
, subTUPartIdx
, partIdxesPerSubTU
);
3010 cu
.setTransformSkipPartRange(bestTransformMode
[TEXT_CHROMA_V
][subTUIndex
], TEXT_CHROMA_V
, subTUPartIdx
, partIdxesPerSubTU
);
3013 X265_CHECK(bCheckFull
, "check-full must be set\n");
3014 m_entropyCoder
.load(m_rqt
[depth
].rqtTest
);
3017 cu
.setTUDepthSubParts(tuDepth
, absPartIdx
, depth
);
3018 cu
.setCbfSubParts(cbfFlag
[TEXT_LUMA
][0] << tuDepth
, TEXT_LUMA
, absPartIdx
, depth
);
3022 uint32_t numberOfSections
= splitIntoSubTUs
? 2 : 1;
3023 uint32_t partIdxesPerSubTU
= absPartIdxStep
>> (splitIntoSubTUs
? 1 : 0);
3025 for (uint32_t chromaId
= TEXT_CHROMA_U
; chromaId
<= TEXT_CHROMA_V
; chromaId
++)
3027 for (uint32_t subTUIndex
= 0; subTUIndex
< numberOfSections
; subTUIndex
++)
3029 const uint32_t subTUPartIdx
= absPartIdx
+ (subTUIndex
* partIdxesPerSubTU
);
3031 if (splitIntoSubTUs
)
3033 uint8_t combinedSubTUCBF
= cbfFlag
[chromaId
][0] | cbfFlag
[chromaId
][1];
3034 cu
.setCbfPartRange(((cbfFlag
[chromaId
][subTUIndex
] << 1) | combinedSubTUCBF
) << tuDepth
, (TextType
)chromaId
, subTUPartIdx
, partIdxesPerSubTU
);
3037 cu
.setCbfPartRange(cbfFlag
[chromaId
][subTUIndex
] << tuDepth
, (TextType
)chromaId
, subTUPartIdx
, partIdxesPerSubTU
);
3042 outCosts
.distortion
+= fullCost
.distortion
;
3043 outCosts
.rdcost
+= fullCost
.rdcost
;
3044 outCosts
.bits
+= fullCost
.bits
;
3045 outCosts
.energy
+= fullCost
.energy
;
3048 void Search::encodeResidualQT(CUData
& cu
, uint32_t absPartIdx
, const uint32_t depth
, bool bSubdivAndCbf
, TextType ttype
, uint32_t depthRange
[2])
3050 X265_CHECK(cu
.m_cuDepth
[0] == cu
.m_cuDepth
[absPartIdx
], "depth not matching\n");
3051 X265_CHECK(cu
.m_predMode
[absPartIdx
] != MODE_INTRA
, "encodeResidualQT() with intra block\n");
3053 const uint32_t curTuDepth
= depth
- cu
.m_cuDepth
[0];
3054 const uint32_t tuDepth
= cu
.m_tuDepth
[absPartIdx
];
3055 const bool bSubdiv
= curTuDepth
!= tuDepth
;
3056 const uint32_t log2TrSize
= g_maxLog2CUSize
- depth
;
3058 uint32_t log2TrSizeC
= log2TrSize
- m_hChromaShift
;
3060 const bool splitIntoSubTUs
= (m_csp
== X265_CSP_I422
);
3062 if (bSubdivAndCbf
&& log2TrSize
<= depthRange
[1] && log2TrSize
> depthRange
[0])
3063 m_entropyCoder
.codeTransformSubdivFlag(bSubdiv
, 5 - log2TrSize
);
3065 bool mCodeAll
= true;
3066 uint32_t trWidthC
= 1 << log2TrSizeC
;
3067 uint32_t trHeightC
= splitIntoSubTUs
? (trWidthC
<< 1) : trWidthC
;
3069 const uint32_t numPels
= trWidthC
* trHeightC
;
3070 if (numPels
< (MIN_TU_SIZE
* MIN_TU_SIZE
))
3075 const bool bFirstCbfOfCU
= curTuDepth
== 0;
3076 if (bFirstCbfOfCU
|| mCodeAll
)
3078 uint32_t absPartIdxStep
= NUM_CU_PARTITIONS
>> ((cu
.m_cuDepth
[0] + curTuDepth
) << 1);
3079 if (bFirstCbfOfCU
|| cu
.getCbf(absPartIdx
, TEXT_CHROMA_U
, curTuDepth
- 1))
3080 m_entropyCoder
.codeQtCbf(cu
, absPartIdx
, absPartIdxStep
, trWidthC
, trHeightC
, TEXT_CHROMA_U
, curTuDepth
, !bSubdiv
);
3081 if (bFirstCbfOfCU
|| cu
.getCbf(absPartIdx
, TEXT_CHROMA_V
, curTuDepth
- 1))
3082 m_entropyCoder
.codeQtCbf(cu
, absPartIdx
, absPartIdxStep
, trWidthC
, trHeightC
, TEXT_CHROMA_V
, curTuDepth
, !bSubdiv
);
3086 X265_CHECK(cu
.getCbf(absPartIdx
, TEXT_CHROMA_U
, curTuDepth
) == cu
.getCbf(absPartIdx
, TEXT_CHROMA_U
, curTuDepth
- 1), "chroma CBF not matching\n");
3087 X265_CHECK(cu
.getCbf(absPartIdx
, TEXT_CHROMA_V
, curTuDepth
) == cu
.getCbf(absPartIdx
, TEXT_CHROMA_V
, curTuDepth
- 1), "chroma CBF not matching\n");
3094 const uint32_t qtLayer
= log2TrSize
- 2;
3095 uint32_t coeffOffsetY
= absPartIdx
<< (LOG2_UNIT_SIZE
* 2);
3096 coeff_t
* coeffCurY
= m_rqt
[qtLayer
].coeffRQT
[0] + coeffOffsetY
;
3099 bool bCodeChroma
= true;
3100 uint32_t tuDepthC
= tuDepth
;
3101 if ((log2TrSize
== 2) && !(m_csp
== X265_CSP_I444
))
3105 uint32_t qpdiv
= NUM_CU_PARTITIONS
>> ((depth
- 1) << 1);
3106 bCodeChroma
= ((absPartIdx
& (qpdiv
- 1)) == 0);
3110 m_entropyCoder
.codeQtCbf(cu
, absPartIdx
, TEXT_LUMA
, tuDepth
);
3113 if (ttype
== TEXT_LUMA
&& cu
.getCbf(absPartIdx
, TEXT_LUMA
, tuDepth
))
3114 m_entropyCoder
.codeCoeffNxN(cu
, coeffCurY
, absPartIdx
, log2TrSize
, TEXT_LUMA
);
3118 uint32_t coeffOffsetC
= coeffOffsetY
>> (m_hChromaShift
+ m_vChromaShift
);
3119 coeff_t
* coeffCurU
= m_rqt
[qtLayer
].coeffRQT
[1] + coeffOffsetC
;
3120 coeff_t
* coeffCurV
= m_rqt
[qtLayer
].coeffRQT
[2] + coeffOffsetC
;
3122 if (!splitIntoSubTUs
)
3124 if (ttype
== TEXT_CHROMA_U
&& cu
.getCbf(absPartIdx
, TEXT_CHROMA_U
, tuDepth
))
3125 m_entropyCoder
.codeCoeffNxN(cu
, coeffCurU
, absPartIdx
, log2TrSizeC
, TEXT_CHROMA_U
);
3126 if (ttype
== TEXT_CHROMA_V
&& cu
.getCbf(absPartIdx
, TEXT_CHROMA_V
, tuDepth
))
3127 m_entropyCoder
.codeCoeffNxN(cu
, coeffCurV
, absPartIdx
, log2TrSizeC
, TEXT_CHROMA_V
);
3131 uint32_t partIdxesPerSubTU
= NUM_CU_PARTITIONS
>> (((cu
.m_cuDepth
[absPartIdx
] + tuDepthC
) << 1) + 1);
3132 uint32_t subTUSize
= 1 << (log2TrSizeC
* 2);
3133 if (ttype
== TEXT_CHROMA_U
&& cu
.getCbf(absPartIdx
, TEXT_CHROMA_U
, tuDepth
))
3135 if (cu
.getCbf(absPartIdx
, ttype
, tuDepth
+ 1))
3136 m_entropyCoder
.codeCoeffNxN(cu
, coeffCurU
, absPartIdx
, log2TrSizeC
, TEXT_CHROMA_U
);
3137 if (cu
.getCbf(absPartIdx
+ partIdxesPerSubTU
, ttype
, tuDepth
+ 1))
3138 m_entropyCoder
.codeCoeffNxN(cu
, coeffCurU
+ subTUSize
, absPartIdx
+ partIdxesPerSubTU
, log2TrSizeC
, TEXT_CHROMA_U
);
3140 if (ttype
== TEXT_CHROMA_V
&& cu
.getCbf(absPartIdx
, TEXT_CHROMA_V
, tuDepth
))
3142 if (cu
.getCbf(absPartIdx
, ttype
, tuDepth
+ 1))
3143 m_entropyCoder
.codeCoeffNxN(cu
, coeffCurV
, absPartIdx
, log2TrSizeC
, TEXT_CHROMA_V
);
3144 if (cu
.getCbf(absPartIdx
+ partIdxesPerSubTU
, ttype
, tuDepth
+ 1))
3145 m_entropyCoder
.codeCoeffNxN(cu
, coeffCurV
+ subTUSize
, absPartIdx
+ partIdxesPerSubTU
, log2TrSizeC
, TEXT_CHROMA_V
);
3153 if (bSubdivAndCbf
|| cu
.getCbf(absPartIdx
, ttype
, curTuDepth
))
3155 const uint32_t qpartNumSubdiv
= NUM_CU_PARTITIONS
>> ((depth
+ 1) << 1);
3156 for (uint32_t i
= 0; i
< 4; ++i
)
3157 encodeResidualQT(cu
, absPartIdx
+ i
* qpartNumSubdiv
, depth
+ 1, bSubdivAndCbf
, ttype
, depthRange
);
3162 void Search::saveResidualQTData(CUData
& cu
, ShortYuv
& resiYuv
, uint32_t absPartIdx
, uint32_t depth
)
3164 X265_CHECK(cu
.m_cuDepth
[0] == cu
.m_cuDepth
[absPartIdx
], "depth not matching\n");
3165 const uint32_t curTrMode
= depth
- cu
.m_cuDepth
[0];
3166 const uint32_t tuDepth
= cu
.m_tuDepth
[absPartIdx
];
3168 if (curTrMode
< tuDepth
)
3170 uint32_t qPartNumSubdiv
= NUM_CU_PARTITIONS
>> ((depth
+ 1) << 1);
3171 for (uint32_t i
= 0; i
< 4; i
++, absPartIdx
+= qPartNumSubdiv
)
3172 saveResidualQTData(cu
, resiYuv
, absPartIdx
, depth
+ 1);
3176 const uint32_t log2TrSize
= g_maxLog2CUSize
- depth
;
3177 const uint32_t qtLayer
= log2TrSize
- 2;
3179 uint32_t log2TrSizeC
= log2TrSize
- m_hChromaShift
;
3180 bool bCodeChroma
= true;
3181 uint32_t tuDepthC
= tuDepth
;
3182 if (log2TrSizeC
== 1)
3184 X265_CHECK(log2TrSize
== 2 && m_csp
!= X265_CSP_I444
, "tuQuad check failed\n");
3187 uint32_t qpdiv
= NUM_CU_PARTITIONS
>> ((cu
.m_cuDepth
[0] + tuDepthC
) << 1);
3188 bCodeChroma
= ((absPartIdx
& (qpdiv
- 1)) == 0);
3191 m_rqt
[qtLayer
].resiQtYuv
.copyPartToPartLuma(resiYuv
, absPartIdx
, log2TrSize
);
3193 uint32_t numCoeffY
= 1 << (log2TrSize
* 2);
3194 uint32_t coeffOffsetY
= absPartIdx
<< LOG2_UNIT_SIZE
* 2;
3195 coeff_t
* coeffSrcY
= m_rqt
[qtLayer
].coeffRQT
[0] + coeffOffsetY
;
3196 coeff_t
* coeffDstY
= cu
.m_trCoeff
[0] + coeffOffsetY
;
3197 memcpy(coeffDstY
, coeffSrcY
, sizeof(coeff_t
) * numCoeffY
);
3201 m_rqt
[qtLayer
].resiQtYuv
.copyPartToPartChroma(resiYuv
, absPartIdx
, log2TrSizeC
+ m_hChromaShift
);
3203 uint32_t numCoeffC
= 1 << (log2TrSizeC
* 2 + (m_csp
== X265_CSP_I422
));
3204 uint32_t coeffOffsetC
= coeffOffsetY
>> (m_hChromaShift
+ m_vChromaShift
);
3206 coeff_t
* coeffSrcU
= m_rqt
[qtLayer
].coeffRQT
[1] + coeffOffsetC
;
3207 coeff_t
* coeffSrcV
= m_rqt
[qtLayer
].coeffRQT
[2] + coeffOffsetC
;
3208 coeff_t
* coeffDstU
= cu
.m_trCoeff
[1] + coeffOffsetC
;
3209 coeff_t
* coeffDstV
= cu
.m_trCoeff
[2] + coeffOffsetC
;
3210 memcpy(coeffDstU
, coeffSrcU
, sizeof(coeff_t
) * numCoeffC
);
3211 memcpy(coeffDstV
, coeffSrcV
, sizeof(coeff_t
) * numCoeffC
);
3215 /* returns the number of bits required to signal a non-most-probable mode.
3216 * on return mpms contains bitmap of most probable modes */
3217 uint32_t Search::getIntraRemModeBits(CUData
& cu
, uint32_t absPartIdx
, uint32_t preds
[3], uint64_t& mpms
) const
3219 cu
.getIntraDirLumaPredictor(absPartIdx
, preds
);
3222 for (int i
= 0; i
< 3; ++i
)
3223 mpms
|= ((uint64_t)1 << preds
[i
]);
3225 return m_entropyCoder
.bitsIntraModeNonMPM();
3228 /* swap the current mode/cost with the mode with the highest cost in the
3229 * current candidate list, if its cost is better (maintain a top N list) */
3230 void Search::updateCandList(uint32_t mode
, uint64_t cost
, int maxCandCount
, uint32_t* candModeList
, uint64_t* candCostList
)
3232 uint32_t maxIndex
= 0;
3233 uint64_t maxValue
= 0;
3235 for (int i
= 0; i
< maxCandCount
; i
++)
3237 if (maxValue
< candCostList
[i
])
3239 maxValue
= candCostList
[i
];
3244 if (cost
< maxValue
)
3246 candCostList
[maxIndex
] = cost
;
3247 candModeList
[maxIndex
] = mode
;