1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
4 * Authors: Steve Borho <steve@borho.org>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20 * This program is also available under a commercial proprietary license.
21 * For more information, contact us at license @ x265.com.
22 *****************************************************************************/
25 #include "primitives.h"
36 #pragma warning(disable: 4800) // 'uint8_t' : forcing value to bool 'true' or 'false' (performance warning)
37 #pragma warning(disable: 4244) // '=' : conversion from 'int' to 'uint8_t', possible loss of data)
40 #define MVP_IDX_BITS 1
// Shared all-zero pixel buffer (one MAX_CU_SIZE row), 32-byte aligned.
42 ALIGN_VAR_32(const pixel
, Search::zeroPixel
[MAX_CU_SIZE
]) = { 0 };
// Shared all-zero int16_t buffer (one MAX_CU_SIZE row), 32-byte aligned.
43 ALIGN_VAR_32(const int16_t, Search::zeroShort
[MAX_CU_SIZE
]) = { 0 };
// Default constructor: registers as a JobProvider with no thread pool yet,
// and nulls/zeros all buffer pointers so a later destroy/teardown is safe
// even if initialization never runs or fails part-way.
// NOTE(review): some member initializations (original lines 55-58) appear
// elided from this view — confirm against the full file.
45 Search::Search() : JobProvider(NULL
)
// zero the whole per-layer RQT table before any allocation
47 memset(m_rqt
, 0, sizeof(m_rqt
));
// null the three per-plane (Y/Cb/Cr) temp-flag arrays
49 for (int i
= 0; i
< 3; i
++)
51 m_qtTempTransformSkipFlag
[i
] = NULL
;
52 m_qtTempCbf
[i
] = NULL
;
// distributed-ME bookkeeping starts idle
59 m_bJobsQueued
= false;
60 m_totalNumME
= m_numAcquiredME
= m_numCompletedME
= 0;
// One-time allocation/configuration of the search engine for this encoder
// instance: configures RDOQ/ME/quant, then allocates the per-qtLayer RQT
// coefficient and recon buffers and the per-depth temp YUV buffers.
// Returns true on success (accumulated into 'ok'); the failure path
// (original 'fail:' label / return statements) is elided from this view.
63 bool Search::initSearch(const x265_param
& param
, ScalingList
& scalingList
)
// RDOQ is only enabled at rd levels 4 and above
66 m_bEnableRDOQ
= param
.rdLevel
>= 4;
67 m_bFrameParallel
= param
.frameNumThreads
> 1;
// number of qtLayers: log2(maxCUSize) - 2 (layer 0 == 4x4)
68 m_numLayers
= g_log2Size
[param
.maxCUSize
] - 2;
70 m_rdCost
.setPsyRdScale(param
.psyRd
);
71 m_me
.init(param
.searchMethod
, param
.subpelRefine
, param
.internalCsp
);
// quant init result seeds 'ok'; all later allocations AND into it
73 bool ok
= m_quant
.init(m_bEnableRDOQ
, param
.psyRdoq
, scalingList
, m_entropyCoder
);
// NOTE(review): m_param is dereferenced here but its assignment
// (presumably m_param = &param earlier in this function) is elided
// from this view — confirm it precedes this use.
74 if (m_param
->noiseReductionIntra
|| m_param
->noiseReductionInter
)
75 ok
&= m_quant
.allocNoiseReduction(param
);
77 ok
&= Predict::allocBuffers(param
.internalCsp
); /* sets m_hChromaShift & m_vChromaShift */
79 /* When frame parallelism is active, only 'refLagPixels' of reference frames will be guaranteed
80 * available for motion reference. See refLagRows in FrameEncoder::compressCTURows() */
81 m_refLagPixels
= m_bFrameParallel
? param
.searchRange
: param
.sourceHeight
;
// sizeL: luma samples in a max-size CU; sizeC: per-chroma-plane samples
83 uint32_t sizeL
= 1 << (g_maxLog2CUSize
* 2);
84 uint32_t sizeC
= sizeL
>> (m_hChromaShift
+ m_vChromaShift
);
85 uint32_t numPartitions
= NUM_CU_PARTITIONS
;
87 /* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32
88 * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
89 * which are reconstructed at each depth are valid. At the end, the transform depth table
90 * is walked and the coeff and recon at the correct depths are collected */
91 for (uint32_t i
= 0; i
<= m_numLayers
; i
++)
// single allocation holds Y then U then V coefficient planes;
// CHECKED_MALLOC presumably branches to the elided fail path on OOM
93 CHECKED_MALLOC(m_rqt
[i
].coeffRQT
[0], coeff_t
, sizeL
+ sizeC
* 2);
94 m_rqt
[i
].coeffRQT
[1] = m_rqt
[i
].coeffRQT
[0] + sizeL
;
95 m_rqt
[i
].coeffRQT
[2] = m_rqt
[i
].coeffRQT
[0] + sizeL
+ sizeC
;
96 ok
&= m_rqt
[i
].reconQtYuv
.create(g_maxCUSize
, param
.internalCsp
);
97 ok
&= m_rqt
[i
].resiQtYuv
.create(g_maxCUSize
, param
.internalCsp
);
100 /* the rest of these buffers are indexed per-depth */
101 for (uint32_t i
= 0; i
<= g_maxCUDepth
; i
++)
103 int cuSize
= g_maxCUSize
>> i
;
104 ok
&= m_rqt
[i
].tmpResiYuv
.create(cuSize
, param
.internalCsp
);
105 ok
&= m_rqt
[i
].tmpPredYuv
.create(cuSize
, param
.internalCsp
);
106 ok
&= m_rqt
[i
].bidirPredYuv
[0].create(cuSize
, param
.internalCsp
);
107 ok
&= m_rqt
[i
].bidirPredYuv
[1].create(cuSize
, param
.internalCsp
);
// one allocation each for the 3 planes' CBF / transform-skip scratch flags
110 CHECKED_MALLOC(m_qtTempCbf
[0], uint8_t, numPartitions
* 3);
111 m_qtTempCbf
[1] = m_qtTempCbf
[0] + numPartitions
;
112 m_qtTempCbf
[2] = m_qtTempCbf
[0] + numPartitions
* 2;
113 CHECKED_MALLOC(m_qtTempTransformSkipFlag
[0], uint8_t, numPartitions
* 3);
114 m_qtTempTransformSkipFlag
[1] = m_qtTempTransformSkipFlag
[0] + numPartitions
;
115 m_qtTempTransformSkipFlag
[2] = m_qtTempTransformSkipFlag
[0] + numPartitions
* 2;
125 for (uint32_t i
= 0; i
<= m_numLayers
; i
++)
127 X265_FREE(m_rqt
[i
].coeffRQT
[0]);
128 m_rqt
[i
].reconQtYuv
.destroy();
129 m_rqt
[i
].resiQtYuv
.destroy();
132 for (uint32_t i
= 0; i
<= g_maxCUDepth
; i
++)
134 m_rqt
[i
].tmpResiYuv
.destroy();
135 m_rqt
[i
].tmpPredYuv
.destroy();
136 m_rqt
[i
].bidirPredYuv
[0].destroy();
137 m_rqt
[i
].bidirPredYuv
[1].destroy();
140 X265_FREE(m_qtTempCbf
[0]);
141 X265_FREE(m_qtTempTransformSkipFlag
[0]);
144 void Search::setQP(const Slice
& slice
, int qp
)
146 x265_emms(); /* TODO: if the lambda tables were ints, this would not be necessary */
148 m_rdCost
.setQP(slice
, qp
);
151 #if CHECKED_BUILD || _DEBUG
152 void Search::invalidateContexts(int fromDepth
)
154 /* catch reads without previous writes */
155 for (int d
= fromDepth
; d
< NUM_FULL_DEPTH
; d
++)
157 m_rqt
[d
].cur
.markInvalid();
158 m_rqt
[d
].rqtTemp
.markInvalid();
159 m_rqt
[d
].rqtRoot
.markInvalid();
160 m_rqt
[d
].rqtTest
.markInvalid();
164 void Search::invalidateContexts(int) {}
// Recursively signal the chroma CBF / subdivision information of the
// transform quad-tree below (cu, absPartIdx) into m_entropyCoder.
167 void Search::codeSubdivCbfQTChroma(const CUData
& cu
, uint32_t tuDepth
, uint32_t absPartIdx
)
169 uint32_t fullDepth
= cu
.m_cuDepth
[0] + tuDepth
;
// non-zero when this TU is further split at this part index
170 uint32_t subdiv
= tuDepth
< cu
.m_tuDepth
[absPartIdx
];
171 uint32_t log2TrSize
= g_maxLog2CUSize
- fullDepth
;
// chroma CBFs are only coded when the chroma TU is at least 4x4
173 if (!(log2TrSize
- m_hChromaShift
< 2))
// code cbf_cb/cbf_cr only if the parent level had the CBF set
175 if (!tuDepth
|| cu
.getCbf(absPartIdx
, TEXT_CHROMA_U
, tuDepth
- 1))
176 m_entropyCoder
.codeQtCbfChroma(cu
, absPartIdx
, TEXT_CHROMA_U
, tuDepth
, !subdiv
);
177 if (!tuDepth
|| cu
.getCbf(absPartIdx
, TEXT_CHROMA_V
, tuDepth
- 1))
178 m_entropyCoder
.codeQtCbfChroma(cu
, absPartIdx
, TEXT_CHROMA_V
, tuDepth
, !subdiv
);
// NOTE(review): the recursion below is presumably guarded by
// 'if (subdiv)' — that guard (original lines 179-182) is elided
// from this view; confirm against the full file.
183 uint32_t qNumParts
= 1 << (log2TrSize
- 1 - LOG2_UNIT_SIZE
) * 2;
// descend into the four quadrant sub-TUs
184 for (uint32_t qIdx
= 0; qIdx
< 4; ++qIdx
, absPartIdx
+= qNumParts
)
185 codeSubdivCbfQTChroma(cu
, tuDepth
+ 1, absPartIdx
);
189 void Search::codeCoeffQTChroma(const CUData
& cu
, uint32_t tuDepth
, uint32_t absPartIdx
, TextType ttype
)
191 if (!cu
.getCbf(absPartIdx
, ttype
, tuDepth
))
194 uint32_t fullDepth
= cu
.m_cuDepth
[0] + tuDepth
;
195 uint32_t log2TrSize
= g_maxLog2CUSize
- fullDepth
;
197 if (tuDepth
< cu
.m_tuDepth
[absPartIdx
])
199 uint32_t qNumParts
= 1 << (log2TrSize
- 1 - LOG2_UNIT_SIZE
) * 2;
200 for (uint32_t qIdx
= 0; qIdx
< 4; ++qIdx
, absPartIdx
+= qNumParts
)
201 codeCoeffQTChroma(cu
, tuDepth
+ 1, absPartIdx
, ttype
);
206 uint32_t tuDepthC
= tuDepth
;
207 uint32_t log2TrSizeC
= log2TrSize
- m_hChromaShift
;
211 X265_CHECK(log2TrSize
== 2 && m_csp
!= X265_CSP_I444
&& tuDepth
, "invalid tuDepth\n");
218 uint32_t qtLayer
= log2TrSize
- 2;
220 if (m_csp
!= X265_CSP_I422
)
222 uint32_t shift
= (m_csp
== X265_CSP_I420
) ? 2 : 0;
223 uint32_t coeffOffset
= absPartIdx
<< (LOG2_UNIT_SIZE
* 2 - shift
);
224 coeff_t
* coeff
= m_rqt
[qtLayer
].coeffRQT
[ttype
] + coeffOffset
;
225 m_entropyCoder
.codeCoeffNxN(cu
, coeff
, absPartIdx
, log2TrSizeC
, ttype
);
229 uint32_t coeffOffset
= absPartIdx
<< (LOG2_UNIT_SIZE
* 2 - 1);
230 coeff_t
* coeff
= m_rqt
[qtLayer
].coeffRQT
[ttype
] + coeffOffset
;
231 uint32_t subTUSize
= 1 << (log2TrSizeC
* 2);
232 uint32_t tuNumParts
= 2 << ((log2TrSizeC
- LOG2_UNIT_SIZE
) * 2);
233 if (cu
.getCbf(absPartIdx
, ttype
, tuDepth
+ 1))
234 m_entropyCoder
.codeCoeffNxN(cu
, coeff
, absPartIdx
, log2TrSizeC
, ttype
);
235 if (cu
.getCbf(absPartIdx
+ tuNumParts
, ttype
, tuDepth
+ 1))
236 m_entropyCoder
.codeCoeffNxN(cu
, coeff
+ subTUSize
, absPartIdx
+ tuNumParts
, log2TrSizeC
, ttype
);
240 void Search::codeIntraLumaQT(Mode
& mode
, const CUGeom
& cuGeom
, uint32_t tuDepth
, uint32_t absPartIdx
, bool bAllowSplit
, Cost
& outCost
, const uint32_t depthRange
[2])
242 uint32_t fullDepth
= mode
.cu
.m_cuDepth
[0] + tuDepth
;
243 uint32_t log2TrSize
= g_maxLog2CUSize
- fullDepth
;
244 uint32_t qtLayer
= log2TrSize
- 2;
245 uint32_t sizeIdx
= log2TrSize
- 2;
246 bool mightNotSplit
= log2TrSize
<= depthRange
[1];
247 bool mightSplit
= (log2TrSize
> depthRange
[0]) && (bAllowSplit
|| !mightNotSplit
);
249 /* If maximum RD penalty, force spits at TU size 32x32 if SPS allows TUs of 16x16 */
250 if (m_param
->rdPenalty
== 2 && m_slice
->m_sliceType
!= I_SLICE
&& log2TrSize
== 5 && depthRange
[0] <= 4)
252 mightNotSplit
= false;
256 CUData
& cu
= mode
.cu
;
261 pixel
* reconQt
= m_rqt
[qtLayer
].reconQtYuv
.getLumaAddr(absPartIdx
);
262 uint32_t reconQtStride
= m_rqt
[qtLayer
].reconQtYuv
.m_size
;
267 m_entropyCoder
.store(m_rqt
[fullDepth
].rqtRoot
);
269 const pixel
* fenc
= mode
.fencYuv
->getLumaAddr(absPartIdx
);
270 pixel
* pred
= mode
.predYuv
.getLumaAddr(absPartIdx
);
271 int16_t* residual
= m_rqt
[cuGeom
.depth
].tmpResiYuv
.getLumaAddr(absPartIdx
);
272 uint32_t stride
= mode
.fencYuv
->m_size
;
274 // init availability pattern
275 uint32_t lumaPredMode
= cu
.m_lumaIntraDir
[absPartIdx
];
276 initAdiPattern(cu
, cuGeom
, absPartIdx
, tuDepth
, lumaPredMode
);
278 // get prediction signal
279 predIntraLumaAng(lumaPredMode
, pred
, stride
, log2TrSize
);
281 cu
.setTransformSkipSubParts(0, TEXT_LUMA
, absPartIdx
, fullDepth
);
282 cu
.setTUDepthSubParts(tuDepth
, absPartIdx
, fullDepth
);
284 uint32_t coeffOffsetY
= absPartIdx
<< (LOG2_UNIT_SIZE
* 2);
285 coeff_t
* coeffY
= m_rqt
[qtLayer
].coeffRQT
[0] + coeffOffsetY
;
287 // store original entropy coding status
289 m_entropyCoder
.estBit(m_entropyCoder
.m_estBitsSbac
, log2TrSize
, true);
291 primitives
.calcresidual
[sizeIdx
](fenc
, pred
, residual
, stride
);
293 uint32_t numSig
= m_quant
.transformNxN(cu
, fenc
, stride
, residual
, stride
, coeffY
, log2TrSize
, TEXT_LUMA
, absPartIdx
, false);
296 m_quant
.invtransformNxN(cu
.m_tqBypass
[0], residual
, stride
, coeffY
, log2TrSize
, TEXT_LUMA
, true, false, numSig
);
297 primitives
.luma_add_ps
[sizeIdx
](reconQt
, reconQtStride
, pred
, residual
, stride
, stride
);
300 // no coded residual, recon = pred
301 primitives
.luma_copy_pp
[sizeIdx
](reconQt
, reconQtStride
, pred
, stride
);
303 bCBF
= !!numSig
<< tuDepth
;
304 cu
.setCbfSubParts(bCBF
, TEXT_LUMA
, absPartIdx
, fullDepth
);
305 fullCost
.distortion
= primitives
.sse_pp
[sizeIdx
](reconQt
, reconQtStride
, fenc
, stride
);
307 m_entropyCoder
.resetBits();
310 if (!cu
.m_slice
->isIntra())
312 if (cu
.m_slice
->m_pps
->bTransquantBypassEnabled
)
313 m_entropyCoder
.codeCUTransquantBypassFlag(cu
.m_tqBypass
[0]);
314 m_entropyCoder
.codeSkipFlag(cu
, 0);
315 m_entropyCoder
.codePredMode(cu
.m_predMode
[0]);
318 m_entropyCoder
.codePartSize(cu
, 0, cu
.m_cuDepth
[0]);
320 if (cu
.m_partSize
[0] == SIZE_2Nx2N
)
323 m_entropyCoder
.codeIntraDirLumaAng(cu
, 0, false);
327 uint32_t qNumParts
= cuGeom
.numPartitions
>> 2;
330 for (uint32_t qIdx
= 0; qIdx
< 4; ++qIdx
)
331 m_entropyCoder
.codeIntraDirLumaAng(cu
, qIdx
* qNumParts
, false);
333 else if (!(absPartIdx
& (qNumParts
- 1)))
334 m_entropyCoder
.codeIntraDirLumaAng(cu
, absPartIdx
, false);
336 if (log2TrSize
!= depthRange
[0])
337 m_entropyCoder
.codeTransformSubdivFlag(0, 5 - log2TrSize
);
339 m_entropyCoder
.codeQtCbfLuma(!!numSig
, tuDepth
);
341 if (cu
.getCbf(absPartIdx
, TEXT_LUMA
, tuDepth
))
342 m_entropyCoder
.codeCoeffNxN(cu
, coeffY
, absPartIdx
, log2TrSize
, TEXT_LUMA
);
344 fullCost
.bits
= m_entropyCoder
.getNumberOfWrittenBits();
346 if (m_param
->rdPenalty
&& log2TrSize
== 5 && m_slice
->m_sliceType
!= I_SLICE
)
349 if (m_rdCost
.m_psyRd
)
351 fullCost
.energy
= m_rdCost
.psyCost(sizeIdx
, fenc
, mode
.fencYuv
->m_size
, reconQt
, reconQtStride
);
352 fullCost
.rdcost
= m_rdCost
.calcPsyRdCost(fullCost
.distortion
, fullCost
.bits
, fullCost
.energy
);
355 fullCost
.rdcost
= m_rdCost
.calcRdCost(fullCost
.distortion
, fullCost
.bits
);
358 fullCost
.rdcost
= MAX_INT64
;
364 m_entropyCoder
.store(m_rqt
[fullDepth
].rqtTest
); // save state after full TU encode
365 m_entropyCoder
.load(m_rqt
[fullDepth
].rqtRoot
); // prep state of split encode
369 uint32_t qNumParts
= 1 << (log2TrSize
- 1 - LOG2_UNIT_SIZE
) * 2;
371 int checkTransformSkip
= m_slice
->m_pps
->bTransformSkipEnabled
&& (log2TrSize
- 1) <= MAX_LOG2_TS_SIZE
&& !cu
.m_tqBypass
[0];
372 if (m_param
->bEnableTSkipFast
)
373 checkTransformSkip
&= cu
.m_partSize
[0] != SIZE_2Nx2N
;
377 for (uint32_t qIdx
= 0, qPartIdx
= absPartIdx
; qIdx
< 4; ++qIdx
, qPartIdx
+= qNumParts
)
379 if (checkTransformSkip
)
380 codeIntraLumaTSkip(mode
, cuGeom
, tuDepth
+ 1, qPartIdx
, splitCost
);
382 codeIntraLumaQT(mode
, cuGeom
, tuDepth
+ 1, qPartIdx
, bAllowSplit
, splitCost
, depthRange
);
384 cbf
|= cu
.getCbf(qPartIdx
, TEXT_LUMA
, tuDepth
+ 1);
386 for (uint32_t offs
= 0; offs
< 4 * qNumParts
; offs
++)
387 cu
.m_cbf
[0][absPartIdx
+ offs
] |= (cbf
<< tuDepth
);
389 if (mightNotSplit
&& log2TrSize
!= depthRange
[0])
391 /* If we could have coded this TU depth, include cost of subdiv flag */
392 m_entropyCoder
.resetBits();
393 m_entropyCoder
.codeTransformSubdivFlag(1, 5 - log2TrSize
);
394 splitCost
.bits
+= m_entropyCoder
.getNumberOfWrittenBits();
396 if (m_rdCost
.m_psyRd
)
397 splitCost
.rdcost
= m_rdCost
.calcPsyRdCost(splitCost
.distortion
, splitCost
.bits
, splitCost
.energy
);
399 splitCost
.rdcost
= m_rdCost
.calcRdCost(splitCost
.distortion
, splitCost
.bits
);
402 if (splitCost
.rdcost
< fullCost
.rdcost
)
404 outCost
.rdcost
+= splitCost
.rdcost
;
405 outCost
.distortion
+= splitCost
.distortion
;
406 outCost
.bits
+= splitCost
.bits
;
407 outCost
.energy
+= splitCost
.energy
;
412 // recover entropy state of full-size TU encode
413 m_entropyCoder
.load(m_rqt
[fullDepth
].rqtTest
);
415 // recover transform index and Cbf values
416 cu
.setTUDepthSubParts(tuDepth
, absPartIdx
, fullDepth
);
417 cu
.setCbfSubParts(bCBF
, TEXT_LUMA
, absPartIdx
, fullDepth
);
418 cu
.setTransformSkipSubParts(0, TEXT_LUMA
, absPartIdx
, fullDepth
);
422 // set reconstruction for next intra prediction blocks if full TU prediction won
423 pixel
* picReconY
= m_frame
->m_reconPic
->getLumaAddr(cu
.m_cuAddr
, cuGeom
.encodeIdx
+ absPartIdx
);
424 intptr_t picStride
= m_frame
->m_reconPic
->m_stride
;
425 primitives
.luma_copy_pp
[sizeIdx
](picReconY
, picStride
, reconQt
, reconQtStride
);
427 outCost
.rdcost
+= fullCost
.rdcost
;
428 outCost
.distortion
+= fullCost
.distortion
;
429 outCost
.bits
+= fullCost
.bits
;
430 outCost
.energy
+= fullCost
.energy
;
433 void Search::codeIntraLumaTSkip(Mode
& mode
, const CUGeom
& cuGeom
, uint32_t tuDepth
, uint32_t absPartIdx
, Cost
& outCost
)
435 uint32_t fullDepth
= mode
.cu
.m_cuDepth
[0] + tuDepth
;
436 uint32_t log2TrSize
= g_maxLog2CUSize
- fullDepth
;
437 uint32_t tuSize
= 1 << log2TrSize
;
439 X265_CHECK(tuSize
== MAX_TS_SIZE
, "transform skip is only possible at 4x4 TUs\n");
441 CUData
& cu
= mode
.cu
;
442 Yuv
* predYuv
= &mode
.predYuv
;
443 const Yuv
* fencYuv
= mode
.fencYuv
;
446 fullCost
.rdcost
= MAX_INT64
;
450 const pixel
* fenc
= fencYuv
->getLumaAddr(absPartIdx
);
451 pixel
* pred
= predYuv
->getLumaAddr(absPartIdx
);
452 int16_t* residual
= m_rqt
[cuGeom
.depth
].tmpResiYuv
.getLumaAddr(absPartIdx
);
453 uint32_t stride
= fencYuv
->m_size
;
454 int sizeIdx
= log2TrSize
- 2;
456 // init availability pattern
457 uint32_t lumaPredMode
= cu
.m_lumaIntraDir
[absPartIdx
];
458 initAdiPattern(cu
, cuGeom
, absPartIdx
, tuDepth
, lumaPredMode
);
460 // get prediction signal
461 predIntraLumaAng(lumaPredMode
, pred
, stride
, log2TrSize
);
463 cu
.setTUDepthSubParts(tuDepth
, absPartIdx
, fullDepth
);
465 uint32_t qtLayer
= log2TrSize
- 2;
466 uint32_t coeffOffsetY
= absPartIdx
<< (LOG2_UNIT_SIZE
* 2);
467 coeff_t
* coeffY
= m_rqt
[qtLayer
].coeffRQT
[0] + coeffOffsetY
;
468 pixel
* reconQt
= m_rqt
[qtLayer
].reconQtYuv
.getLumaAddr(absPartIdx
);
469 uint32_t reconQtStride
= m_rqt
[qtLayer
].reconQtYuv
.m_size
;
471 // store original entropy coding status
472 m_entropyCoder
.store(m_rqt
[fullDepth
].rqtRoot
);
475 m_entropyCoder
.estBit(m_entropyCoder
.m_estBitsSbac
, log2TrSize
, true);
477 ALIGN_VAR_32(coeff_t
, tsCoeffY
[MAX_TS_SIZE
* MAX_TS_SIZE
]);
478 ALIGN_VAR_32(pixel
, tsReconY
[MAX_TS_SIZE
* MAX_TS_SIZE
]);
480 int checkTransformSkip
= 1;
481 for (int useTSkip
= 0; useTSkip
<= checkTransformSkip
; useTSkip
++)
484 uint32_t tmpEnergy
= 0;
486 coeff_t
* coeff
= (useTSkip
? tsCoeffY
: coeffY
);
487 pixel
* tmpRecon
= (useTSkip
? tsReconY
: reconQt
);
488 uint32_t tmpReconStride
= (useTSkip
? MAX_TS_SIZE
: reconQtStride
);
490 primitives
.calcresidual
[sizeIdx
](fenc
, pred
, residual
, stride
);
492 uint32_t numSig
= m_quant
.transformNxN(cu
, fenc
, stride
, residual
, stride
, coeff
, log2TrSize
, TEXT_LUMA
, absPartIdx
, useTSkip
);
495 m_quant
.invtransformNxN(cu
.m_tqBypass
[0], residual
, stride
, coeff
, log2TrSize
, TEXT_LUMA
, true, useTSkip
, numSig
);
496 primitives
.luma_add_ps
[sizeIdx
](tmpRecon
, tmpReconStride
, pred
, residual
, stride
, stride
);
500 /* do not allow tskip if CBF=0, pretend we did not try tskip */
501 checkTransformSkip
= 0;
505 // no residual coded, recon = pred
506 primitives
.luma_copy_pp
[sizeIdx
](tmpRecon
, tmpReconStride
, pred
, stride
);
508 uint32_t tmpDist
= primitives
.sse_pp
[sizeIdx
](tmpRecon
, tmpReconStride
, fenc
, stride
);
510 cu
.setTransformSkipSubParts(useTSkip
, TEXT_LUMA
, absPartIdx
, fullDepth
);
511 cu
.setCbfSubParts((!!numSig
) << tuDepth
, TEXT_LUMA
, absPartIdx
, fullDepth
);
514 m_entropyCoder
.load(m_rqt
[fullDepth
].rqtRoot
);
516 m_entropyCoder
.resetBits();
519 if (!cu
.m_slice
->isIntra())
521 if (cu
.m_slice
->m_pps
->bTransquantBypassEnabled
)
522 m_entropyCoder
.codeCUTransquantBypassFlag(cu
.m_tqBypass
[0]);
523 m_entropyCoder
.codeSkipFlag(cu
, 0);
524 m_entropyCoder
.codePredMode(cu
.m_predMode
[0]);
527 m_entropyCoder
.codePartSize(cu
, 0, cu
.m_cuDepth
[0]);
529 if (cu
.m_partSize
[0] == SIZE_2Nx2N
)
532 m_entropyCoder
.codeIntraDirLumaAng(cu
, 0, false);
536 uint32_t qNumParts
= cuGeom
.numPartitions
>> 2;
539 for (uint32_t qIdx
= 0; qIdx
< 4; ++qIdx
)
540 m_entropyCoder
.codeIntraDirLumaAng(cu
, qIdx
* qNumParts
, false);
542 else if (!(absPartIdx
& (qNumParts
- 1)))
543 m_entropyCoder
.codeIntraDirLumaAng(cu
, absPartIdx
, false);
545 m_entropyCoder
.codeTransformSubdivFlag(0, 5 - log2TrSize
);
547 m_entropyCoder
.codeQtCbfLuma(!!numSig
, tuDepth
);
549 if (cu
.getCbf(absPartIdx
, TEXT_LUMA
, tuDepth
))
550 m_entropyCoder
.codeCoeffNxN(cu
, coeff
, absPartIdx
, log2TrSize
, TEXT_LUMA
);
552 uint32_t tmpBits
= m_entropyCoder
.getNumberOfWrittenBits();
555 m_entropyCoder
.store(m_rqt
[fullDepth
].rqtTemp
);
557 if (m_rdCost
.m_psyRd
)
559 tmpEnergy
= m_rdCost
.psyCost(sizeIdx
, fenc
, fencYuv
->m_size
, tmpRecon
, tmpReconStride
);
560 tmpCost
= m_rdCost
.calcPsyRdCost(tmpDist
, tmpBits
, tmpEnergy
);
563 tmpCost
= m_rdCost
.calcRdCost(tmpDist
, tmpBits
);
565 if (tmpCost
< fullCost
.rdcost
)
569 fullCost
.rdcost
= tmpCost
;
570 fullCost
.distortion
= tmpDist
;
571 fullCost
.bits
= tmpBits
;
572 fullCost
.energy
= tmpEnergy
;
578 memcpy(coeffY
, tsCoeffY
, sizeof(coeff_t
) << (log2TrSize
* 2));
579 primitives
.luma_copy_pp
[sizeIdx
](reconQt
, reconQtStride
, tsReconY
, tuSize
);
581 else if (checkTransformSkip
)
583 cu
.setTransformSkipSubParts(0, TEXT_LUMA
, absPartIdx
, fullDepth
);
584 cu
.setCbfSubParts(bCBF
<< tuDepth
, TEXT_LUMA
, absPartIdx
, fullDepth
);
585 m_entropyCoder
.load(m_rqt
[fullDepth
].rqtTemp
);
588 // set reconstruction for next intra prediction blocks
589 pixel
* picReconY
= m_frame
->m_reconPic
->getLumaAddr(cu
.m_cuAddr
, cuGeom
.encodeIdx
+ absPartIdx
);
590 intptr_t picStride
= m_frame
->m_reconPic
->m_stride
;
591 primitives
.luma_copy_pp
[sizeIdx
](picReconY
, picStride
, reconQt
, reconQtStride
);
593 outCost
.rdcost
+= fullCost
.rdcost
;
594 outCost
.distortion
+= fullCost
.distortion
;
595 outCost
.bits
+= fullCost
.bits
;
596 outCost
.energy
+= fullCost
.energy
;
599 /* fast luma intra residual generation. Only perform the minimum number of TU splits required by the CU size */
600 void Search::residualTransformQuantIntra(Mode
& mode
, const CUGeom
& cuGeom
, uint32_t tuDepth
, uint32_t absPartIdx
, const uint32_t depthRange
[2])
602 CUData
& cu
= mode
.cu
;
604 uint32_t fullDepth
= cu
.m_cuDepth
[0] + tuDepth
;
605 uint32_t log2TrSize
= g_maxLog2CUSize
- fullDepth
;
606 bool bCheckFull
= log2TrSize
<= depthRange
[1];
608 X265_CHECK(m_slice
->m_sliceType
!= I_SLICE
, "residualTransformQuantIntra not intended for I slices\n");
610 /* we still respect rdPenalty == 2, we can forbid 32x32 intra TU. rdPenalty = 1 is impossible
611 * since we are not measuring RD cost */
612 if (m_param
->rdPenalty
== 2 && log2TrSize
== 5 && depthRange
[0] <= 4)
617 const pixel
* fenc
= mode
.fencYuv
->getLumaAddr(absPartIdx
);
618 pixel
* pred
= mode
.predYuv
.getLumaAddr(absPartIdx
);
619 int16_t* residual
= m_rqt
[cuGeom
.depth
].tmpResiYuv
.getLumaAddr(absPartIdx
);
620 pixel
* picReconY
= m_frame
->m_reconPic
->getLumaAddr(cu
.m_cuAddr
, cuGeom
.encodeIdx
+ absPartIdx
);
621 intptr_t picStride
= m_frame
->m_reconPic
->m_stride
;
622 uint32_t stride
= mode
.fencYuv
->m_size
;
623 uint32_t sizeIdx
= log2TrSize
- 2;
624 uint32_t lumaPredMode
= cu
.m_lumaIntraDir
[absPartIdx
];
625 uint32_t coeffOffsetY
= absPartIdx
<< (LOG2_UNIT_SIZE
* 2);
626 coeff_t
* coeff
= cu
.m_trCoeff
[TEXT_LUMA
] + coeffOffsetY
;
628 initAdiPattern(cu
, cuGeom
, absPartIdx
, tuDepth
, lumaPredMode
);
629 predIntraLumaAng(lumaPredMode
, pred
, stride
, log2TrSize
);
631 X265_CHECK(!cu
.m_transformSkip
[TEXT_LUMA
][absPartIdx
], "unexpected tskip flag in residualTransformQuantIntra\n");
632 cu
.setTUDepthSubParts(tuDepth
, absPartIdx
, fullDepth
);
634 primitives
.calcresidual
[sizeIdx
](fenc
, pred
, residual
, stride
);
635 uint32_t numSig
= m_quant
.transformNxN(cu
, fenc
, stride
, residual
, stride
, coeff
, log2TrSize
, TEXT_LUMA
, absPartIdx
, false);
638 m_quant
.invtransformNxN(cu
.m_tqBypass
[absPartIdx
], residual
, stride
, coeff
, log2TrSize
, TEXT_LUMA
, true, false, numSig
);
639 primitives
.luma_add_ps
[sizeIdx
](picReconY
, picStride
, pred
, residual
, stride
, stride
);
640 cu
.setCbfSubParts(1 << tuDepth
, TEXT_LUMA
, absPartIdx
, fullDepth
);
644 primitives
.luma_copy_pp
[sizeIdx
](picReconY
, picStride
, pred
, stride
);
645 cu
.setCbfSubParts(0, TEXT_LUMA
, absPartIdx
, fullDepth
);
650 X265_CHECK(log2TrSize
> depthRange
[0], "intra luma split state failure\n");
652 /* code split block */
653 uint32_t qNumParts
= 1 << (log2TrSize
- 1 - LOG2_UNIT_SIZE
) * 2;
655 for (uint32_t qIdx
= 0, qPartIdx
= absPartIdx
; qIdx
< 4; ++qIdx
, qPartIdx
+= qNumParts
)
657 residualTransformQuantIntra(mode
, cuGeom
, tuDepth
+ 1, qPartIdx
, depthRange
);
658 cbf
|= cu
.getCbf(qPartIdx
, TEXT_LUMA
, tuDepth
+ 1);
660 for (uint32_t offs
= 0; offs
< 4 * qNumParts
; offs
++)
661 cu
.m_cbf
[TEXT_LUMA
][absPartIdx
+ offs
] |= (cbf
<< tuDepth
);
// Walk the final transform quad-tree and collect the winning luma
// coefficients (into cu.m_trCoeff[0]) and reconstruction (into reconYuv)
// from the per-qtLayer RQT scratch buffers.
665 void Search::extractIntraResultQT(CUData
& cu
, Yuv
& reconYuv
, uint32_t tuDepth
, uint32_t absPartIdx
)
667 uint32_t fullDepth
= cu
.m_cuDepth
[0] + tuDepth
;
668 uint32_t log2TrSize
= g_maxLog2CUSize
- fullDepth
;
// leaf TU: this depth was the coded depth for this part
670 if (tuDepth
== cu
.m_tuDepth
[absPartIdx
])
672 uint32_t qtLayer
= log2TrSize
- 2;
674 // copy transform coefficients
675 uint32_t coeffOffsetY
= absPartIdx
<< (LOG2_UNIT_SIZE
* 2);
676 coeff_t
* coeffSrcY
= m_rqt
[qtLayer
].coeffRQT
[0] + coeffOffsetY
;
677 coeff_t
* coeffDestY
= cu
.m_trCoeff
[0] + coeffOffsetY
;
// (1 << log2TrSize)^2 coefficients
678 memcpy(coeffDestY
, coeffSrcY
, sizeof(coeff_t
) << (log2TrSize
* 2));
680 // copy reconstruction
681 m_rqt
[qtLayer
].reconQtYuv
.copyPartToPartLuma(reconYuv
, absPartIdx
, log2TrSize
);
// NOTE(review): the recursion below presumably sits in the 'else'
// branch of the leaf test above — the 'else' (original lines
// 683-684) is elided from this view; confirm against the full file.
685 uint32_t qNumParts
= 1 << (log2TrSize
- 1 - LOG2_UNIT_SIZE
) * 2;
// recurse into the four quadrant sub-TUs
686 for (uint32_t qIdx
= 0; qIdx
< 4; ++qIdx
, absPartIdx
+= qNumParts
)
687 extractIntraResultQT(cu
, reconYuv
, tuDepth
+ 1, absPartIdx
);
/* Shift each sub-TU CBF flag up one bit and OR the combined
 * "either half coded" flag into the low bit of both entries:
 *   new[i] = (old[i] << 1) | (old[0] | old[1]) */
inline void offsetCBFs(uint8_t subTUCBF[2])
{
    const uint8_t anyCbf = subTUCBF[0] | subTUCBF[1];

    for (int i = 0; i < 2; i++)
        subTUCBF[i] = (uint8_t)((subTUCBF[i] << 1) | anyCbf);
}
698 /* 4:2:2 post-TU split processing */
699 void Search::offsetSubTUCBFs(CUData
& cu
, TextType ttype
, uint32_t tuDepth
, uint32_t absPartIdx
)
701 uint32_t depth
= cu
.m_cuDepth
[0];
702 uint32_t fullDepth
= depth
+ tuDepth
;
703 uint32_t log2TrSize
= g_maxLog2CUSize
- fullDepth
;
707 X265_CHECK(m_csp
!= X265_CSP_I444
&& tuDepth
, "invalid tuDepth\n");
711 uint32_t tuNumParts
= 1 << ((log2TrSize
- LOG2_UNIT_SIZE
) * 2 - 1);
713 // move the CBFs down a level and set the parent CBF
715 subTUCBF
[0] = cu
.getCbf(absPartIdx
, ttype
, tuDepth
);
716 subTUCBF
[1] = cu
.getCbf(absPartIdx
+ tuNumParts
, ttype
, tuDepth
);
717 offsetCBFs(subTUCBF
);
719 cu
.setCbfPartRange(subTUCBF
[0] << tuDepth
, ttype
, absPartIdx
, tuNumParts
);
720 cu
.setCbfPartRange(subTUCBF
[1] << tuDepth
, ttype
, absPartIdx
+ tuNumParts
, tuNumParts
);
723 /* returns distortion */
724 uint32_t Search::codeIntraChromaQt(Mode
& mode
, const CUGeom
& cuGeom
, uint32_t tuDepth
, uint32_t absPartIdx
, uint32_t& psyEnergy
)
726 CUData
& cu
= mode
.cu
;
727 uint32_t fullDepth
= cu
.m_cuDepth
[0] + tuDepth
;
728 uint32_t log2TrSize
= g_maxLog2CUSize
- fullDepth
;
730 if (tuDepth
< cu
.m_tuDepth
[absPartIdx
])
732 uint32_t qNumParts
= 1 << (log2TrSize
- 1 - LOG2_UNIT_SIZE
) * 2;
733 uint32_t outDist
= 0, splitCbfU
= 0, splitCbfV
= 0;
734 for (uint32_t qIdx
= 0, qPartIdx
= absPartIdx
; qIdx
< 4; ++qIdx
, qPartIdx
+= qNumParts
)
736 outDist
+= codeIntraChromaQt(mode
, cuGeom
, tuDepth
+ 1, qPartIdx
, psyEnergy
);
737 splitCbfU
|= cu
.getCbf(qPartIdx
, TEXT_CHROMA_U
, tuDepth
+ 1);
738 splitCbfV
|= cu
.getCbf(qPartIdx
, TEXT_CHROMA_V
, tuDepth
+ 1);
740 for (uint32_t offs
= 0; offs
< 4 * qNumParts
; offs
++)
742 cu
.m_cbf
[TEXT_CHROMA_U
][absPartIdx
+ offs
] |= (splitCbfU
<< tuDepth
);
743 cu
.m_cbf
[TEXT_CHROMA_V
][absPartIdx
+ offs
] |= (splitCbfV
<< tuDepth
);
749 uint32_t log2TrSizeC
= log2TrSize
- m_hChromaShift
;
751 uint32_t tuDepthC
= tuDepth
;
754 X265_CHECK(log2TrSize
== 2 && m_csp
!= X265_CSP_I444
&& tuDepth
, "invalid tuDepth\n");
762 m_entropyCoder
.estBit(m_entropyCoder
.m_estBitsSbac
, log2TrSizeC
, false);
764 bool checkTransformSkip
= m_slice
->m_pps
->bTransformSkipEnabled
&& log2TrSizeC
<= MAX_LOG2_TS_SIZE
&& !cu
.m_tqBypass
[0];
765 checkTransformSkip
&= !m_param
->bEnableTSkipFast
|| (log2TrSize
<= MAX_LOG2_TS_SIZE
&& cu
.m_transformSkip
[TEXT_LUMA
][absPartIdx
]);
766 if (checkTransformSkip
)
767 return codeIntraChromaTSkip(mode
, cuGeom
, tuDepth
, tuDepthC
, absPartIdx
, psyEnergy
);
769 uint32_t qtLayer
= log2TrSize
- 2;
770 uint32_t tuSize
= 1 << log2TrSizeC
;
771 uint32_t outDist
= 0;
773 uint32_t curPartNum
= NUM_CU_PARTITIONS
>> ((cu
.m_cuDepth
[0] + tuDepthC
) << 1);
774 const SplitType splitType
= (m_csp
== X265_CSP_I422
) ? VERTICAL_SPLIT
: DONT_SPLIT
;
776 for (uint32_t chromaId
= TEXT_CHROMA_U
; chromaId
<= TEXT_CHROMA_V
; chromaId
++)
778 TextType ttype
= (TextType
)chromaId
;
780 TURecurse
tuIterator(splitType
, curPartNum
, absPartIdx
);
783 uint32_t absPartIdxC
= tuIterator
.absPartIdxTURelCU
;
785 const pixel
* fenc
= mode
.fencYuv
->getChromaAddr(chromaId
, absPartIdxC
);
786 pixel
* pred
= mode
.predYuv
.getChromaAddr(chromaId
, absPartIdxC
);
787 int16_t* residual
= m_rqt
[cuGeom
.depth
].tmpResiYuv
.getChromaAddr(chromaId
, absPartIdxC
);
788 uint32_t stride
= mode
.fencYuv
->m_csize
;
789 uint32_t sizeIdxC
= log2TrSizeC
- 2;
791 uint32_t coeffOffsetC
= absPartIdxC
<< (LOG2_UNIT_SIZE
* 2 - (m_hChromaShift
+ m_vChromaShift
));
792 coeff_t
* coeffC
= m_rqt
[qtLayer
].coeffRQT
[chromaId
] + coeffOffsetC
;
793 pixel
* reconQt
= m_rqt
[qtLayer
].reconQtYuv
.getChromaAddr(chromaId
, absPartIdxC
);
794 uint32_t reconQtStride
= m_rqt
[qtLayer
].reconQtYuv
.m_csize
;
796 pixel
* picReconC
= m_frame
->m_reconPic
->getChromaAddr(chromaId
, cu
.m_cuAddr
, cuGeom
.encodeIdx
+ absPartIdxC
);
797 intptr_t picStride
= m_frame
->m_reconPic
->m_strideC
;
799 // init availability pattern
800 initAdiPatternChroma(cu
, cuGeom
, absPartIdxC
, tuDepthC
, chromaId
);
801 pixel
* chromaPred
= getAdiChromaBuf(chromaId
, tuSize
);
803 uint32_t chromaPredMode
= cu
.m_chromaIntraDir
[absPartIdxC
];
804 if (chromaPredMode
== DM_CHROMA_IDX
)
805 chromaPredMode
= cu
.m_lumaIntraDir
[(m_csp
== X265_CSP_I444
) ? absPartIdxC
: 0];
806 if (m_csp
== X265_CSP_I422
)
807 chromaPredMode
= g_chroma422IntraAngleMappingTable
[chromaPredMode
];
809 // get prediction signal
810 predIntraChromaAng(chromaPred
, chromaPredMode
, pred
, stride
, log2TrSizeC
, m_csp
);
812 cu
.setTransformSkipPartRange(0, ttype
, absPartIdxC
, tuIterator
.absPartIdxStep
);
814 primitives
.calcresidual
[sizeIdxC
](fenc
, pred
, residual
, stride
);
815 uint32_t numSig
= m_quant
.transformNxN(cu
, fenc
, stride
, residual
, stride
, coeffC
, log2TrSizeC
, ttype
, absPartIdxC
, false);
818 m_quant
.invtransformNxN(cu
.m_tqBypass
[0], residual
, stride
, coeffC
, log2TrSizeC
, ttype
, true, false, numSig
);
819 primitives
.luma_add_ps
[sizeIdxC
](reconQt
, reconQtStride
, pred
, residual
, stride
, stride
);
820 cu
.setCbfPartRange(1 << tuDepth
, ttype
, absPartIdxC
, tuIterator
.absPartIdxStep
);
824 // no coded residual, recon = pred
825 primitives
.luma_copy_pp
[sizeIdxC
](reconQt
, reconQtStride
, pred
, stride
);
826 cu
.setCbfPartRange(0, ttype
, absPartIdxC
, tuIterator
.absPartIdxStep
);
829 outDist
+= m_rdCost
.scaleChromaDist(chromaId
, primitives
.sse_pp
[sizeIdxC
](reconQt
, reconQtStride
, fenc
, stride
));
831 if (m_rdCost
.m_psyRd
)
832 psyEnergy
+= m_rdCost
.psyCost(sizeIdxC
, fenc
, stride
, picReconC
, picStride
);
834 primitives
.luma_copy_pp
[sizeIdxC
](picReconC
, picStride
, reconQt
, reconQtStride
);
836 while (tuIterator
.isNextSection());
838 if (splitType
== VERTICAL_SPLIT
)
839 offsetSubTUCBFs(cu
, ttype
, tuDepth
, absPartIdx
);
845 /* returns distortion */
846 uint32_t Search::codeIntraChromaTSkip(Mode
& mode
, const CUGeom
& cuGeom
, uint32_t tuDepth
, uint32_t tuDepthC
, uint32_t absPartIdx
, uint32_t& psyEnergy
)
848 CUData
& cu
= mode
.cu
;
849 uint32_t fullDepth
= cu
.m_cuDepth
[0] + tuDepth
;
850 uint32_t log2TrSize
= g_maxLog2CUSize
- fullDepth
;
851 const uint32_t log2TrSizeC
= 2;
853 uint32_t qtLayer
= log2TrSize
- 2;
854 uint32_t outDist
= 0;
856 /* At the TU layers above this one, no RDO is performed, only distortion is being measured,
857 * so the entropy coder is not very accurate. The best we can do is return it in the same
858 * condition as it arrived, and to do all bit estimates from the same state. */
859 m_entropyCoder
.store(m_rqt
[fullDepth
].rqtRoot
);
861 ALIGN_VAR_32(coeff_t
, tskipCoeffC
[MAX_TS_SIZE
* MAX_TS_SIZE
]);
862 ALIGN_VAR_32(pixel
, tskipReconC
[MAX_TS_SIZE
* MAX_TS_SIZE
]);
864 uint32_t curPartNum
= NUM_CU_PARTITIONS
>> ((cu
.m_cuDepth
[0] + tuDepthC
) << 1);
865 const SplitType splitType
= (m_csp
== X265_CSP_I422
) ? VERTICAL_SPLIT
: DONT_SPLIT
;
867 for (uint32_t chromaId
= TEXT_CHROMA_U
; chromaId
<= TEXT_CHROMA_V
; chromaId
++)
869 TextType ttype
= (TextType
)chromaId
;
871 TURecurse
tuIterator(splitType
, curPartNum
, absPartIdx
);
874 uint32_t absPartIdxC
= tuIterator
.absPartIdxTURelCU
;
876 const pixel
* fenc
= mode
.fencYuv
->getChromaAddr(chromaId
, absPartIdxC
);
877 pixel
* pred
= mode
.predYuv
.getChromaAddr(chromaId
, absPartIdxC
);
878 int16_t* residual
= m_rqt
[cuGeom
.depth
].tmpResiYuv
.getChromaAddr(chromaId
, absPartIdxC
);
879 uint32_t stride
= mode
.fencYuv
->m_csize
;
880 const uint32_t sizeIdxC
= log2TrSizeC
- 2;
882 uint32_t coeffOffsetC
= absPartIdxC
<< (LOG2_UNIT_SIZE
* 2 - (m_hChromaShift
+ m_vChromaShift
));
883 coeff_t
* coeffC
= m_rqt
[qtLayer
].coeffRQT
[chromaId
] + coeffOffsetC
;
884 pixel
* reconQt
= m_rqt
[qtLayer
].reconQtYuv
.getChromaAddr(chromaId
, absPartIdxC
);
885 uint32_t reconQtStride
= m_rqt
[qtLayer
].reconQtYuv
.m_csize
;
887 // init availability pattern
888 initAdiPatternChroma(cu
, cuGeom
, absPartIdxC
, tuDepthC
, chromaId
);
889 pixel
* chromaPred
= getAdiChromaBuf(chromaId
, tuSize
);
891 uint32_t chromaPredMode
= cu
.m_chromaIntraDir
[absPartIdxC
];
892 if (chromaPredMode
== DM_CHROMA_IDX
)
893 chromaPredMode
= cu
.m_lumaIntraDir
[(m_csp
== X265_CSP_I444
) ? absPartIdxC
: 0];
894 if (m_csp
== X265_CSP_I422
)
895 chromaPredMode
= g_chroma422IntraAngleMappingTable
[chromaPredMode
];
897 // get prediction signal
898 predIntraChromaAng(chromaPred
, chromaPredMode
, pred
, stride
, log2TrSizeC
, m_csp
);
900 uint64_t bCost
= MAX_INT64
;
903 uint32_t bEnergy
= 0;
906 int checkTransformSkip
= 1;
907 for (int useTSkip
= 0; useTSkip
<= checkTransformSkip
; useTSkip
++)
909 coeff_t
* coeff
= (useTSkip
? tskipCoeffC
: coeffC
);
910 pixel
* recon
= (useTSkip
? tskipReconC
: reconQt
);
911 uint32_t reconStride
= (useTSkip
? MAX_TS_SIZE
: reconQtStride
);
913 primitives
.calcresidual
[sizeIdxC
](fenc
, pred
, residual
, stride
);
915 uint32_t numSig
= m_quant
.transformNxN(cu
, fenc
, stride
, residual
, stride
, coeff
, log2TrSizeC
, ttype
, absPartIdxC
, useTSkip
);
918 m_quant
.invtransformNxN(cu
.m_tqBypass
[0], residual
, stride
, coeff
, log2TrSizeC
, ttype
, true, useTSkip
, numSig
);
919 primitives
.luma_add_ps
[sizeIdxC
](recon
, reconStride
, pred
, residual
, stride
, stride
);
920 cu
.setCbfPartRange(1 << tuDepth
, ttype
, absPartIdxC
, tuIterator
.absPartIdxStep
);
924 checkTransformSkip
= 0;
929 primitives
.luma_copy_pp
[sizeIdxC
](recon
, reconStride
, pred
, stride
);
930 cu
.setCbfPartRange(0, ttype
, absPartIdxC
, tuIterator
.absPartIdxStep
);
932 uint32_t tmpDist
= primitives
.sse_pp
[sizeIdxC
](recon
, reconStride
, fenc
, stride
);
933 tmpDist
= m_rdCost
.scaleChromaDist(chromaId
, tmpDist
);
935 cu
.setTransformSkipPartRange(useTSkip
, ttype
, absPartIdxC
, tuIterator
.absPartIdxStep
);
937 uint32_t tmpBits
= 0, tmpEnergy
= 0;
940 m_entropyCoder
.load(m_rqt
[fullDepth
].rqtRoot
);
941 m_entropyCoder
.resetBits();
942 m_entropyCoder
.codeCoeffNxN(cu
, coeff
, absPartIdxC
, log2TrSizeC
, (TextType
)chromaId
);
943 tmpBits
= m_entropyCoder
.getNumberOfWrittenBits();
947 if (m_rdCost
.m_psyRd
)
949 tmpEnergy
= m_rdCost
.psyCost(sizeIdxC
, fenc
, stride
, reconQt
, reconQtStride
);
950 tmpCost
= m_rdCost
.calcPsyRdCost(tmpDist
, tmpBits
, tmpEnergy
);
953 tmpCost
= m_rdCost
.calcRdCost(tmpDist
, tmpBits
);
967 memcpy(coeffC
, tskipCoeffC
, sizeof(coeff_t
) << (log2TrSizeC
* 2));
968 primitives
.luma_copy_pp
[sizeIdxC
](reconQt
, reconQtStride
, tskipReconC
, MAX_TS_SIZE
);
971 cu
.setCbfPartRange(bCbf
<< tuDepth
, ttype
, absPartIdxC
, tuIterator
.absPartIdxStep
);
972 cu
.setTransformSkipPartRange(bTSkip
, ttype
, absPartIdxC
, tuIterator
.absPartIdxStep
);
974 pixel
* reconPicC
= m_frame
->m_reconPic
->getChromaAddr(chromaId
, cu
.m_cuAddr
, cuGeom
.encodeIdx
+ absPartIdxC
);
975 intptr_t picStride
= m_frame
->m_reconPic
->m_strideC
;
976 primitives
.luma_copy_pp
[sizeIdxC
](reconPicC
, picStride
, reconQt
, reconQtStride
);
979 psyEnergy
+= bEnergy
;
981 while (tuIterator
.isNextSection());
983 if (splitType
== VERTICAL_SPLIT
)
984 offsetSubTUCBFs(cu
, ttype
, tuDepth
, absPartIdx
);
987 m_entropyCoder
.load(m_rqt
[fullDepth
].rqtRoot
);
991 void Search::extractIntraResultChromaQT(CUData
& cu
, Yuv
& reconYuv
, uint32_t absPartIdx
, uint32_t tuDepth
)
993 uint32_t fullDepth
= cu
.m_cuDepth
[0] + tuDepth
;
994 uint32_t tuDepthL
= cu
.m_tuDepth
[absPartIdx
];
995 uint32_t log2TrSize
= g_maxLog2CUSize
- fullDepth
;
996 uint32_t log2TrSizeC
= log2TrSize
- m_hChromaShift
;
998 if (tuDepthL
== tuDepth
|| log2TrSizeC
== 2)
1000 // copy transform coefficients
1001 uint32_t numCoeffC
= 1 << (log2TrSizeC
* 2 + (m_csp
== X265_CSP_I422
));
1002 uint32_t coeffOffsetC
= absPartIdx
<< (LOG2_UNIT_SIZE
* 2 - (m_hChromaShift
+ m_vChromaShift
));
1004 uint32_t qtLayer
= log2TrSize
- 2 - (tuDepthL
- tuDepth
);
1005 coeff_t
* coeffSrcU
= m_rqt
[qtLayer
].coeffRQT
[1] + coeffOffsetC
;
1006 coeff_t
* coeffSrcV
= m_rqt
[qtLayer
].coeffRQT
[2] + coeffOffsetC
;
1007 coeff_t
* coeffDstU
= cu
.m_trCoeff
[1] + coeffOffsetC
;
1008 coeff_t
* coeffDstV
= cu
.m_trCoeff
[2] + coeffOffsetC
;
1009 memcpy(coeffDstU
, coeffSrcU
, sizeof(coeff_t
) * numCoeffC
);
1010 memcpy(coeffDstV
, coeffSrcV
, sizeof(coeff_t
) * numCoeffC
);
1012 // copy reconstruction
1013 m_rqt
[qtLayer
].reconQtYuv
.copyPartToPartChroma(reconYuv
, absPartIdx
, log2TrSizeC
+ m_hChromaShift
);
1017 uint32_t qNumParts
= 1 << (log2TrSize
- 1 - LOG2_UNIT_SIZE
) * 2;
1018 for (uint32_t qIdx
= 0; qIdx
< 4; ++qIdx
, absPartIdx
+= qNumParts
)
1019 extractIntraResultChromaQT(cu
, reconYuv
, absPartIdx
, tuDepth
+ 1);
1023 void Search::residualQTIntraChroma(Mode
& mode
, const CUGeom
& cuGeom
, uint32_t tuDepth
, uint32_t absPartIdx
)
1025 CUData
& cu
= mode
.cu
;
1026 uint32_t fullDepth
= cu
.m_cuDepth
[0] + tuDepth
;
1027 uint32_t log2TrSize
= g_maxLog2CUSize
- fullDepth
;
1029 if (tuDepth
== cu
.m_tuDepth
[absPartIdx
])
1031 uint32_t log2TrSizeC
= log2TrSize
- m_hChromaShift
;
1032 uint32_t tuDepthC
= tuDepth
;
1033 if (log2TrSizeC
< 2)
1035 X265_CHECK(log2TrSize
== 2 && m_csp
!= X265_CSP_I444
&& tuDepth
, "invalid tuDepth\n");
1042 ShortYuv
& resiYuv
= m_rqt
[cuGeom
.depth
].tmpResiYuv
;
1043 uint32_t tuSize
= 1 << log2TrSizeC
;
1044 uint32_t stride
= mode
.fencYuv
->m_csize
;
1045 const int sizeIdxC
= log2TrSizeC
- 2;
1047 uint32_t curPartNum
= NUM_CU_PARTITIONS
>> ((cu
.m_cuDepth
[0] + tuDepthC
) << 1);
1048 const SplitType splitType
= (m_csp
== X265_CSP_I422
) ? VERTICAL_SPLIT
: DONT_SPLIT
;
1050 for (uint32_t chromaId
= TEXT_CHROMA_U
; chromaId
<= TEXT_CHROMA_V
; chromaId
++)
1052 TextType ttype
= (TextType
)chromaId
;
1054 TURecurse
tuIterator(splitType
, curPartNum
, absPartIdx
);
1057 uint32_t absPartIdxC
= tuIterator
.absPartIdxTURelCU
;
1059 const pixel
* fenc
= mode
.fencYuv
->getChromaAddr(chromaId
, absPartIdxC
);
1060 pixel
* pred
= mode
.predYuv
.getChromaAddr(chromaId
, absPartIdxC
);
1061 int16_t* residual
= resiYuv
.getChromaAddr(chromaId
, absPartIdxC
);
1062 pixel
* recon
= mode
.reconYuv
.getChromaAddr(chromaId
, absPartIdxC
); // TODO: needed?
1063 uint32_t coeffOffsetC
= absPartIdxC
<< (LOG2_UNIT_SIZE
* 2 - (m_hChromaShift
+ m_vChromaShift
));
1064 coeff_t
* coeff
= cu
.m_trCoeff
[ttype
] + coeffOffsetC
;
1065 pixel
* picReconC
= m_frame
->m_reconPic
->getChromaAddr(chromaId
, cu
.m_cuAddr
, cuGeom
.encodeIdx
+ absPartIdxC
);
1066 uint32_t picStride
= m_frame
->m_reconPic
->m_strideC
;
1068 uint32_t chromaPredMode
= cu
.m_chromaIntraDir
[absPartIdxC
];
1069 if (chromaPredMode
== DM_CHROMA_IDX
)
1070 chromaPredMode
= cu
.m_lumaIntraDir
[(m_csp
== X265_CSP_I444
) ? absPartIdxC
: 0];
1071 chromaPredMode
= (m_csp
== X265_CSP_I422
) ? g_chroma422IntraAngleMappingTable
[chromaPredMode
] : chromaPredMode
;
1072 initAdiPatternChroma(cu
, cuGeom
, absPartIdxC
, tuDepthC
, chromaId
);
1073 pixel
* chromaPred
= getAdiChromaBuf(chromaId
, tuSize
);
1075 predIntraChromaAng(chromaPred
, chromaPredMode
, pred
, stride
, log2TrSizeC
, m_csp
);
1077 X265_CHECK(!cu
.m_transformSkip
[ttype
][0], "transform skip not supported at low RD levels\n");
1079 primitives
.calcresidual
[sizeIdxC
](fenc
, pred
, residual
, stride
);
1080 uint32_t numSig
= m_quant
.transformNxN(cu
, fenc
, stride
, residual
, stride
, coeff
, log2TrSizeC
, ttype
, absPartIdxC
, false);
1083 m_quant
.invtransformNxN(cu
.m_tqBypass
[absPartIdxC
], residual
, stride
, coeff
, log2TrSizeC
, ttype
, true, false, numSig
);
1084 primitives
.luma_add_ps
[sizeIdxC
](recon
, stride
, pred
, residual
, stride
, stride
);
1085 primitives
.luma_copy_pp
[sizeIdxC
](picReconC
, picStride
, recon
, stride
);
1086 cu
.setCbfPartRange(1 << tuDepth
, ttype
, absPartIdxC
, tuIterator
.absPartIdxStep
);
1090 primitives
.luma_copy_pp
[sizeIdxC
](recon
, stride
, pred
, stride
);
1091 primitives
.luma_copy_pp
[sizeIdxC
](picReconC
, picStride
, pred
, stride
);
1092 cu
.setCbfPartRange(0, ttype
, absPartIdxC
, tuIterator
.absPartIdxStep
);
1095 while (tuIterator
.isNextSection());
1097 if (splitType
== VERTICAL_SPLIT
)
1098 offsetSubTUCBFs(cu
, (TextType
)chromaId
, tuDepth
, absPartIdx
);
1103 uint32_t qNumParts
= 1 << (log2TrSize
- 1 - LOG2_UNIT_SIZE
) * 2;
1104 uint32_t splitCbfU
= 0, splitCbfV
= 0;
1105 for (uint32_t qIdx
= 0, qPartIdx
= absPartIdx
; qIdx
< 4; ++qIdx
, qPartIdx
+= qNumParts
)
1107 residualQTIntraChroma(mode
, cuGeom
, tuDepth
+ 1, qPartIdx
);
1108 splitCbfU
|= cu
.getCbf(qPartIdx
, TEXT_CHROMA_U
, tuDepth
+ 1);
1109 splitCbfV
|= cu
.getCbf(qPartIdx
, TEXT_CHROMA_V
, tuDepth
+ 1);
1111 for (uint32_t offs
= 0; offs
< 4 * qNumParts
; offs
++)
1113 cu
.m_cbf
[1][absPartIdx
+ offs
] |= (splitCbfU
<< tuDepth
);
1114 cu
.m_cbf
[2][absPartIdx
+ offs
] |= (splitCbfV
<< tuDepth
);
1119 void Search::checkIntra(Mode
& intraMode
, const CUGeom
& cuGeom
, PartSize partSize
, uint8_t* sharedModes
)
1121 uint32_t depth
= cuGeom
.depth
;
1122 CUData
& cu
= intraMode
.cu
;
1124 cu
.setPartSizeSubParts(partSize
);
1125 cu
.setPredModeSubParts(MODE_INTRA
);
1127 uint32_t tuDepthRange
[2];
1128 cu
.getIntraTUQtDepthRange(tuDepthRange
, 0);
1130 intraMode
.initCosts();
1131 intraMode
.distortion
+= estIntraPredQT(intraMode
, cuGeom
, tuDepthRange
, sharedModes
);
1132 intraMode
.distortion
+= estIntraPredChromaQT(intraMode
, cuGeom
);
1134 m_entropyCoder
.resetBits();
1135 if (m_slice
->m_pps
->bTransquantBypassEnabled
)
1136 m_entropyCoder
.codeCUTransquantBypassFlag(cu
.m_tqBypass
[0]);
1138 if (!m_slice
->isIntra())
1140 m_entropyCoder
.codeSkipFlag(cu
, 0);
1141 m_entropyCoder
.codePredMode(cu
.m_predMode
[0]);
1144 m_entropyCoder
.codePartSize(cu
, 0, depth
);
1145 m_entropyCoder
.codePredInfo(cu
, 0);
1146 intraMode
.mvBits
= m_entropyCoder
.getNumberOfWrittenBits();
1148 bool bCodeDQP
= m_slice
->m_pps
->bUseDQP
;
1149 m_entropyCoder
.codeCoeff(cu
, 0, bCodeDQP
, tuDepthRange
);
1150 m_entropyCoder
.store(intraMode
.contexts
);
1151 intraMode
.totalBits
= m_entropyCoder
.getNumberOfWrittenBits();
1152 intraMode
.coeffBits
= intraMode
.totalBits
- intraMode
.mvBits
;
1153 if (m_rdCost
.m_psyRd
)
1154 intraMode
.psyEnergy
= m_rdCost
.psyCost(cuGeom
.log2CUSize
- 2, intraMode
.fencYuv
->m_buf
[0], intraMode
.fencYuv
->m_size
, intraMode
.reconYuv
.m_buf
[0], intraMode
.reconYuv
.m_size
);
1156 updateModeCost(intraMode
);
1159 /* Note that this function does not save the best intra prediction, it must
1160 * be generated later. It records the best mode in the cu */
1161 void Search::checkIntraInInter(Mode
& intraMode
, const CUGeom
& cuGeom
)
1163 CUData
& cu
= intraMode
.cu
;
1164 uint32_t depth
= cu
.m_cuDepth
[0];
1166 cu
.setPartSizeSubParts(SIZE_2Nx2N
);
1167 cu
.setPredModeSubParts(MODE_INTRA
);
1169 const uint32_t initTuDepth
= 0;
1170 uint32_t log2TrSize
= cu
.m_log2CUSize
[0] - initTuDepth
;
1171 uint32_t tuSize
= 1 << log2TrSize
;
1172 const uint32_t absPartIdx
= 0;
1174 // Reference sample smoothing
1175 initAdiPattern(cu
, cuGeom
, absPartIdx
, initTuDepth
, ALL_IDX
);
1177 const pixel
* fenc
= intraMode
.fencYuv
->m_buf
[0];
1178 uint32_t stride
= intraMode
.fencYuv
->m_size
;
1180 pixel
* above
= m_refAbove
+ tuSize
- 1;
1181 pixel
* aboveFiltered
= m_refAboveFlt
+ tuSize
- 1;
1182 pixel
* left
= m_refLeft
+ tuSize
- 1;
1183 pixel
* leftFiltered
= m_refLeftFlt
+ tuSize
- 1;
1185 uint32_t bits
, bbits
, mode
, bmode
;
1186 uint64_t cost
, bcost
;
1188 // 33 Angle modes once
1189 ALIGN_VAR_32(pixel
, bufScale
[32 * 32]);
1190 ALIGN_VAR_32(pixel
, bufTrans
[32 * 32]);
1191 ALIGN_VAR_32(pixel
, tmp
[33 * 32 * 32]);
1192 int scaleTuSize
= tuSize
;
1193 int scaleStride
= stride
;
1195 int sizeIdx
= log2TrSize
- 2;
1199 // origin is 64x64, we scale to 32x32 and setup required parameters
1200 primitives
.scale2D_64to32(bufScale
, fenc
, stride
);
1203 // reserve space in case primitives need to store data in above
1205 pixel _above
[4 * 32 + 1];
1206 pixel _left
[4 * 32 + 1];
1207 pixel
* aboveScale
= _above
+ 2 * 32;
1208 pixel
* leftScale
= _left
+ 2 * 32;
1209 aboveScale
[0] = leftScale
[0] = above
[0];
1210 primitives
.scale1D_128to64(aboveScale
+ 1, above
+ 1, 0);
1211 primitives
.scale1D_128to64(leftScale
+ 1, left
+ 1, 0);
1216 sizeIdx
= 5 - 2; // log2(scaleTuSize) - 2
1218 // Filtered and Unfiltered refAbove and refLeft pointing to above and left.
1221 aboveFiltered
= aboveScale
;
1222 leftFiltered
= leftScale
;
1225 pixelcmp_t sa8d
= primitives
.sa8d
[sizeIdx
];
1226 int predsize
= scaleTuSize
* scaleTuSize
;
1228 m_entropyCoder
.loadIntraDirModeLuma(m_rqt
[depth
].cur
);
1230 /* there are three cost tiers for intra modes:
1231 * pred[0] - mode probable, least cost
1232 * pred[1], pred[2] - less probable, slightly more cost
1233 * non-mpm modes - all cost the same (rbits) */
1236 uint32_t rbits
= getIntraRemModeBits(cu
, absPartIdx
, preds
, mpms
);
1239 primitives
.intra_pred
[DC_IDX
][sizeIdx
](tmp
, scaleStride
, left
, above
, 0, (scaleTuSize
<= 16));
1240 bsad
= sa8d(fenc
, scaleStride
, tmp
, scaleStride
) << costShift
;
1241 bmode
= mode
= DC_IDX
;
1242 bbits
= (mpms
& ((uint64_t)1 << mode
)) ? m_entropyCoder
.bitsIntraModeMPM(preds
, mode
) : rbits
;
1243 bcost
= m_rdCost
.calcRdSADCost(bsad
, bbits
);
1245 pixel
* abovePlanar
= above
;
1246 pixel
* leftPlanar
= left
;
1248 if (tuSize
& (8 | 16 | 32))
1250 abovePlanar
= aboveFiltered
;
1251 leftPlanar
= leftFiltered
;
1255 primitives
.intra_pred
[PLANAR_IDX
][sizeIdx
](tmp
, scaleStride
, leftPlanar
, abovePlanar
, 0, 0);
1256 sad
= sa8d(fenc
, scaleStride
, tmp
, scaleStride
) << costShift
;
1258 bits
= (mpms
& ((uint64_t)1 << mode
)) ? m_entropyCoder
.bitsIntraModeMPM(preds
, mode
) : rbits
;
1259 cost
= m_rdCost
.calcRdSADCost(sad
, bits
);
1260 COPY4_IF_LT(bcost
, cost
, bmode
, mode
, bsad
, sad
, bbits
, bits
);
1263 primitives
.transpose
[sizeIdx
](bufTrans
, fenc
, scaleStride
);
1265 primitives
.intra_pred_allangs
[sizeIdx
](tmp
, above
, left
, aboveFiltered
, leftFiltered
, (scaleTuSize
<= 16));
1271 #define TRY_ANGLE(angle) \
1272 modeHor = angle < 18; \
1273 cmp = modeHor ? bufTrans : fenc; \
1274 srcStride = modeHor ? scaleTuSize : scaleStride; \
1275 sad = sa8d(cmp, srcStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \
1276 bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
1277 cost = m_rdCost.calcRdSADCost(sad, bits)
1279 if (m_param
->bEnableFastIntra
)
1282 uint32_t lowmode
, highmode
, amode
= 5, abits
= 0;
1283 uint64_t acost
= MAX_INT64
;
1285 /* pick the best angle, sampling at distance of 5 */
1286 for (mode
= 5; mode
< 35; mode
+= 5)
1289 COPY4_IF_LT(acost
, cost
, amode
, mode
, asad
, sad
, abits
, bits
);
1292 /* refine best angle at distance 2, then distance 1 */
1293 for (uint32_t dist
= 2; dist
>= 1; dist
--)
1295 lowmode
= amode
- dist
;
1296 highmode
= amode
+ dist
;
1298 X265_CHECK(lowmode
>= 2 && lowmode
<= 34, "low intra mode out of range\n");
1300 COPY4_IF_LT(acost
, cost
, amode
, lowmode
, asad
, sad
, abits
, bits
);
1302 X265_CHECK(highmode
>= 2 && highmode
<= 34, "high intra mode out of range\n");
1303 TRY_ANGLE(highmode
);
1304 COPY4_IF_LT(acost
, cost
, amode
, highmode
, asad
, sad
, abits
, bits
);
1310 COPY4_IF_LT(acost
, cost
, amode
, 34, asad
, sad
, abits
, bits
);
1313 COPY4_IF_LT(bcost
, acost
, bmode
, amode
, bsad
, asad
, bbits
, abits
);
1315 else // calculate and search all intra prediction angles for lowest cost
1317 for (mode
= 2; mode
< 35; mode
++)
1320 COPY4_IF_LT(bcost
, cost
, bmode
, mode
, bsad
, sad
, bbits
, bits
);
1324 cu
.setLumaIntraDirSubParts((uint8_t)bmode
, absPartIdx
, depth
+ initTuDepth
);
1325 intraMode
.initCosts();
1326 intraMode
.totalBits
= bbits
;
1327 intraMode
.distortion
= bsad
;
1328 intraMode
.sa8dCost
= bcost
;
1329 intraMode
.sa8dBits
= bbits
;
1332 void Search::encodeIntraInInter(Mode
& intraMode
, const CUGeom
& cuGeom
)
1334 CUData
& cu
= intraMode
.cu
;
1335 Yuv
* reconYuv
= &intraMode
.reconYuv
;
1336 const Yuv
* fencYuv
= intraMode
.fencYuv
;
1338 X265_CHECK(cu
.m_partSize
[0] == SIZE_2Nx2N
, "encodeIntraInInter does not expect NxN intra\n");
1339 X265_CHECK(!m_slice
->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");
1341 m_quant
.setQPforQuant(cu
);
1343 uint32_t tuDepthRange
[2];
1344 cu
.getIntraTUQtDepthRange(tuDepthRange
, 0);
1346 m_entropyCoder
.load(m_rqt
[cuGeom
.depth
].cur
);
1349 codeIntraLumaQT(intraMode
, cuGeom
, 0, 0, false, icosts
, tuDepthRange
);
1350 extractIntraResultQT(cu
, *reconYuv
, 0, 0);
1352 intraMode
.distortion
= icosts
.distortion
;
1353 intraMode
.distortion
+= estIntraPredChromaQT(intraMode
, cuGeom
);
1355 m_entropyCoder
.resetBits();
1356 if (m_slice
->m_pps
->bTransquantBypassEnabled
)
1357 m_entropyCoder
.codeCUTransquantBypassFlag(cu
.m_tqBypass
[0]);
1358 m_entropyCoder
.codeSkipFlag(cu
, 0);
1359 m_entropyCoder
.codePredMode(cu
.m_predMode
[0]);
1360 m_entropyCoder
.codePartSize(cu
, 0, cuGeom
.depth
);
1361 m_entropyCoder
.codePredInfo(cu
, 0);
1362 intraMode
.mvBits
+= m_entropyCoder
.getNumberOfWrittenBits();
1364 bool bCodeDQP
= m_slice
->m_pps
->bUseDQP
;
1365 m_entropyCoder
.codeCoeff(cu
, 0, bCodeDQP
, tuDepthRange
);
1367 intraMode
.totalBits
= m_entropyCoder
.getNumberOfWrittenBits();
1368 intraMode
.coeffBits
= intraMode
.totalBits
- intraMode
.mvBits
;
1369 if (m_rdCost
.m_psyRd
)
1370 intraMode
.psyEnergy
= m_rdCost
.psyCost(cuGeom
.log2CUSize
- 2, fencYuv
->m_buf
[0], fencYuv
->m_size
, reconYuv
->m_buf
[0], reconYuv
->m_size
);
1372 m_entropyCoder
.store(intraMode
.contexts
);
1373 updateModeCost(intraMode
);
1376 uint32_t Search::estIntraPredQT(Mode
&intraMode
, const CUGeom
& cuGeom
, const uint32_t depthRange
[2], uint8_t* sharedModes
)
1378 CUData
& cu
= intraMode
.cu
;
1379 Yuv
* reconYuv
= &intraMode
.reconYuv
;
1380 Yuv
* predYuv
= &intraMode
.predYuv
;
1381 const Yuv
* fencYuv
= intraMode
.fencYuv
;
1383 uint32_t depth
= cu
.m_cuDepth
[0];
1384 uint32_t initTuDepth
= cu
.m_partSize
[0] != SIZE_2Nx2N
;
1385 uint32_t numPU
= 1 << (2 * initTuDepth
);
1386 uint32_t log2TrSize
= cu
.m_log2CUSize
[0] - initTuDepth
;
1387 uint32_t tuSize
= 1 << log2TrSize
;
1388 uint32_t qNumParts
= cuGeom
.numPartitions
>> 2;
1389 uint32_t sizeIdx
= log2TrSize
- 2;
1390 uint32_t absPartIdx
= 0;
1391 uint32_t totalDistortion
= 0;
1393 int checkTransformSkip
= m_slice
->m_pps
->bTransformSkipEnabled
&& !cu
.m_tqBypass
[0] && cu
.m_partSize
[0] != SIZE_2Nx2N
;
1395 // loop over partitions
1396 for (uint32_t puIdx
= 0; puIdx
< numPU
; puIdx
++, absPartIdx
+= qNumParts
)
1401 bmode
= sharedModes
[puIdx
];
1404 // Reference sample smoothing
1405 initAdiPattern(cu
, cuGeom
, absPartIdx
, initTuDepth
, ALL_IDX
);
1407 // determine set of modes to be tested (using prediction signal only)
1408 const pixel
* fenc
= fencYuv
->getLumaAddr(absPartIdx
);
1409 uint32_t stride
= predYuv
->m_size
;
1411 pixel
* above
= m_refAbove
+ tuSize
- 1;
1412 pixel
* aboveFiltered
= m_refAboveFlt
+ tuSize
- 1;
1413 pixel
* left
= m_refLeft
+ tuSize
- 1;
1414 pixel
* leftFiltered
= m_refLeftFlt
+ tuSize
- 1;
1416 // 33 Angle modes once
1417 ALIGN_VAR_32(pixel
, buf_trans
[32 * 32]);
1418 ALIGN_VAR_32(pixel
, tmp
[33 * 32 * 32]);
1419 ALIGN_VAR_32(pixel
, bufScale
[32 * 32]);
1420 pixel _above
[4 * 32 + 1];
1421 pixel _left
[4 * 32 + 1];
1422 int scaleTuSize
= tuSize
;
1423 int scaleStride
= stride
;
1428 pixel
* aboveScale
= _above
+ 2 * 32;
1429 pixel
* leftScale
= _left
+ 2 * 32;
1431 // origin is 64x64, we scale to 32x32 and setup required parameters
1432 primitives
.scale2D_64to32(bufScale
, fenc
, stride
);
1435 // reserve space in case primitives need to store data in above
1437 aboveScale
[0] = leftScale
[0] = above
[0];
1438 primitives
.scale1D_128to64(aboveScale
+ 1, above
+ 1, 0);
1439 primitives
.scale1D_128to64(leftScale
+ 1, left
+ 1, 0);
1444 sizeIdx
= 5 - 2; // log2(scaleTuSize) - 2
1446 // Filtered and Unfiltered refAbove and refLeft pointing to above and left.
1449 aboveFiltered
= aboveScale
;
1450 leftFiltered
= leftScale
;
1453 m_entropyCoder
.loadIntraDirModeLuma(m_rqt
[depth
].cur
);
1455 /* there are three cost tiers for intra modes:
1456 * pred[0] - mode probable, least cost
1457 * pred[1], pred[2] - less probable, slightly more cost
1458 * non-mpm modes - all cost the same (rbits) */
1461 uint32_t rbits
= getIntraRemModeBits(cu
, absPartIdx
, preds
, mpms
);
1463 pixelcmp_t sa8d
= primitives
.sa8d
[sizeIdx
];
1464 uint64_t modeCosts
[35];
1468 primitives
.intra_pred
[DC_IDX
][sizeIdx
](tmp
, scaleStride
, left
, above
, 0, (scaleTuSize
<= 16));
1469 uint32_t bits
= (mpms
& ((uint64_t)1 << DC_IDX
)) ? m_entropyCoder
.bitsIntraModeMPM(preds
, DC_IDX
) : rbits
;
1470 uint32_t sad
= sa8d(fenc
, scaleStride
, tmp
, scaleStride
) << costShift
;
1471 modeCosts
[DC_IDX
] = bcost
= m_rdCost
.calcRdSADCost(sad
, bits
);
1474 pixel
* abovePlanar
= above
;
1475 pixel
* leftPlanar
= left
;
1476 if (tuSize
>= 8 && tuSize
<= 32)
1478 abovePlanar
= aboveFiltered
;
1479 leftPlanar
= leftFiltered
;
1481 primitives
.intra_pred
[PLANAR_IDX
][sizeIdx
](tmp
, scaleStride
, leftPlanar
, abovePlanar
, 0, 0);
1482 bits
= (mpms
& ((uint64_t)1 << PLANAR_IDX
)) ? m_entropyCoder
.bitsIntraModeMPM(preds
, PLANAR_IDX
) : rbits
;
1483 sad
= sa8d(fenc
, scaleStride
, tmp
, scaleStride
) << costShift
;
1484 modeCosts
[PLANAR_IDX
] = m_rdCost
.calcRdSADCost(sad
, bits
);
1485 COPY1_IF_LT(bcost
, modeCosts
[PLANAR_IDX
]);
1487 // angular predictions
1488 primitives
.intra_pred_allangs
[sizeIdx
](tmp
, above
, left
, aboveFiltered
, leftFiltered
, (scaleTuSize
<= 16));
1490 primitives
.transpose
[sizeIdx
](buf_trans
, fenc
, scaleStride
);
1491 for (int mode
= 2; mode
< 35; mode
++)
1493 bool modeHor
= (mode
< 18);
1494 const pixel
* cmp
= (modeHor
? buf_trans
: fenc
);
1495 intptr_t srcStride
= (modeHor
? scaleTuSize
: scaleStride
);
1496 bits
= (mpms
& ((uint64_t)1 << mode
)) ? m_entropyCoder
.bitsIntraModeMPM(preds
, mode
) : rbits
;
1497 sad
= sa8d(cmp
, srcStride
, &tmp
[(mode
- 2) * (scaleTuSize
* scaleTuSize
)], scaleTuSize
) << costShift
;
1498 modeCosts
[mode
] = m_rdCost
.calcRdSADCost(sad
, bits
);
1499 COPY1_IF_LT(bcost
, modeCosts
[mode
]);
1502 /* Find the top maxCandCount candidate modes with cost within 25% of best
1503 * or among the most probable modes. maxCandCount is derived from the
1504 * rdLevel and depth. In general we want to try more modes at slower RD
1505 * levels and at higher depths */
1506 uint64_t candCostList
[MAX_RD_INTRA_MODES
];
1507 uint32_t rdModeList
[MAX_RD_INTRA_MODES
];
1508 int maxCandCount
= 2 + m_param
->rdLevel
+ ((depth
+ initTuDepth
) >> 1);
1509 for (int i
= 0; i
< maxCandCount
; i
++)
1510 candCostList
[i
] = MAX_INT64
;
1512 uint64_t paddedBcost
= bcost
+ (bcost
>> 3); // 1.12%
1513 for (int mode
= 0; mode
< 35; mode
++)
1514 if (modeCosts
[mode
] < paddedBcost
|| (mpms
& ((uint64_t)1 << mode
)))
1515 updateCandList(mode
, modeCosts
[mode
], maxCandCount
, rdModeList
, candCostList
);
1517 /* measure best candidates using simple RDO (no TU splits) */
1519 for (int i
= 0; i
< maxCandCount
; i
++)
1521 if (candCostList
[i
] == MAX_INT64
)
1523 m_entropyCoder
.load(m_rqt
[depth
].cur
);
1524 cu
.setLumaIntraDirSubParts(rdModeList
[i
], absPartIdx
, depth
+ initTuDepth
);
1527 if (checkTransformSkip
)
1528 codeIntraLumaTSkip(intraMode
, cuGeom
, initTuDepth
, absPartIdx
, icosts
);
1530 codeIntraLumaQT(intraMode
, cuGeom
, initTuDepth
, absPartIdx
, false, icosts
, depthRange
);
1531 COPY2_IF_LT(bcost
, icosts
.rdcost
, bmode
, rdModeList
[i
]);
1535 /* remeasure best mode, allowing TU splits */
1536 cu
.setLumaIntraDirSubParts(bmode
, absPartIdx
, depth
+ initTuDepth
);
1537 m_entropyCoder
.load(m_rqt
[depth
].cur
);
1540 if (checkTransformSkip
)
1541 codeIntraLumaTSkip(intraMode
, cuGeom
, initTuDepth
, absPartIdx
, icosts
);
1543 codeIntraLumaQT(intraMode
, cuGeom
, initTuDepth
, absPartIdx
, true, icosts
, depthRange
);
1544 totalDistortion
+= icosts
.distortion
;
1546 extractIntraResultQT(cu
, *reconYuv
, initTuDepth
, absPartIdx
);
1548 // set reconstruction for next intra prediction blocks
1549 if (puIdx
!= numPU
- 1)
1551 /* This has important implications for parallelism and RDO. It is writing intermediate results into the
1552 * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
1553 * it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think
1554 * that the contexts should be tracked through each PU */
1555 pixel
* dst
= m_frame
->m_reconPic
->getLumaAddr(cu
.m_cuAddr
, cuGeom
.encodeIdx
+ absPartIdx
);
1556 uint32_t dststride
= m_frame
->m_reconPic
->m_stride
;
1557 const pixel
* src
= reconYuv
->getLumaAddr(absPartIdx
);
1558 uint32_t srcstride
= reconYuv
->m_size
;
1559 primitives
.luma_copy_pp
[log2TrSize
- 2](dst
, dststride
, src
, srcstride
);
1565 uint32_t combCbfY
= 0;
1566 for (uint32_t qIdx
= 0, qPartIdx
= 0; qIdx
< 4; ++qIdx
, qPartIdx
+= qNumParts
)
1567 combCbfY
|= cu
.getCbf(qPartIdx
, TEXT_LUMA
, 1);
1569 for (uint32_t offs
= 0; offs
< 4 * qNumParts
; offs
++)
1570 cu
.m_cbf
[0][offs
] |= combCbfY
;
1573 // TODO: remove this
1574 m_entropyCoder
.load(m_rqt
[depth
].cur
);
1576 return totalDistortion
;
1579 void Search::getBestIntraModeChroma(Mode
& intraMode
, const CUGeom
& cuGeom
)
1581 CUData
& cu
= intraMode
.cu
;
1582 const Yuv
* fencYuv
= intraMode
.fencYuv
;
1583 Yuv
* predYuv
= &intraMode
.predYuv
;
1585 uint32_t bestMode
= 0;
1586 uint64_t bestCost
= MAX_INT64
;
1587 uint32_t modeList
[NUM_CHROMA_MODE
];
1589 uint32_t log2TrSizeC
= cu
.m_log2CUSize
[0] - m_hChromaShift
;
1590 uint32_t tuSize
= 1 << log2TrSizeC
;
1591 int32_t scaleTuSize
= tuSize
;
1592 uint32_t tuDepth
= 0;
1593 int32_t costShift
= 0;
1603 Predict::initAdiPatternChroma(cu
, cuGeom
, 0, tuDepth
, 1);
1604 Predict::initAdiPatternChroma(cu
, cuGeom
, 0, tuDepth
, 2);
1605 cu
.getAllowedChromaDir(0, modeList
);
1607 // check chroma modes
1608 for (uint32_t mode
= 0; mode
< NUM_CHROMA_MODE
; mode
++)
1610 uint32_t chromaPredMode
= modeList
[mode
];
1611 if (chromaPredMode
== DM_CHROMA_IDX
)
1612 chromaPredMode
= cu
.m_lumaIntraDir
[0];
1613 if (m_csp
== X265_CSP_I422
)
1614 chromaPredMode
= g_chroma422IntraAngleMappingTable
[chromaPredMode
];
1617 for (uint32_t chromaId
= TEXT_CHROMA_U
; chromaId
<= TEXT_CHROMA_V
; chromaId
++)
1619 const pixel
* fenc
= fencYuv
->m_buf
[chromaId
];
1620 pixel
* pred
= predYuv
->m_buf
[chromaId
];
1621 pixel
* chromaPred
= getAdiChromaBuf(chromaId
, scaleTuSize
);
1623 // get prediction signal
1624 predIntraChromaAng(chromaPred
, chromaPredMode
, pred
, fencYuv
->m_csize
, log2TrSizeC
, m_csp
);
1625 cost
+= primitives
.sa8d
[log2TrSizeC
- 2](fenc
, predYuv
->m_csize
, pred
, fencYuv
->m_csize
) << costShift
;
1628 if (cost
< bestCost
)
1631 bestMode
= modeList
[mode
];
1635 cu
.setChromIntraDirSubParts(bestMode
, 0, cu
.m_cuDepth
[0]);
1638 uint32_t Search::estIntraPredChromaQT(Mode
&intraMode
, const CUGeom
& cuGeom
)
1640 CUData
& cu
= intraMode
.cu
;
1641 Yuv
& reconYuv
= intraMode
.reconYuv
;
1643 uint32_t depth
= cu
.m_cuDepth
[0];
1644 uint32_t initTuDepth
= cu
.m_partSize
[0] != SIZE_2Nx2N
&& m_csp
== X265_CSP_I444
;
1645 uint32_t log2TrSize
= cu
.m_log2CUSize
[0] - initTuDepth
;
1646 uint32_t absPartStep
= (NUM_CU_PARTITIONS
>> (depth
<< 1));
1647 uint32_t totalDistortion
= 0;
1649 int part
= partitionFromLog2Size(log2TrSize
);
1651 TURecurse
tuIterator((initTuDepth
== 0) ? DONT_SPLIT
: QUAD_SPLIT
, absPartStep
, 0);
1655 uint32_t absPartIdxC
= tuIterator
.absPartIdxTURelCU
;
1657 uint32_t bestMode
= 0;
1658 uint32_t bestDist
= 0;
1659 uint64_t bestCost
= MAX_INT64
;
1662 uint32_t minMode
= 0;
1663 uint32_t maxMode
= NUM_CHROMA_MODE
;
1664 uint32_t modeList
[NUM_CHROMA_MODE
];
1666 cu
.getAllowedChromaDir(absPartIdxC
, modeList
);
1668 // check chroma modes
1669 for (uint32_t mode
= minMode
; mode
< maxMode
; mode
++)
1671 // restore context models
1672 m_entropyCoder
.load(m_rqt
[depth
].cur
);
1674 cu
.setChromIntraDirSubParts(modeList
[mode
], absPartIdxC
, depth
+ initTuDepth
);
1675 uint32_t psyEnergy
= 0;
1676 uint32_t dist
= codeIntraChromaQt(intraMode
, cuGeom
, initTuDepth
, absPartIdxC
, psyEnergy
);
1678 if (m_slice
->m_pps
->bTransformSkipEnabled
)
1679 m_entropyCoder
.load(m_rqt
[depth
].cur
);
1681 m_entropyCoder
.resetBits();
1682 // chroma prediction mode
1683 if (cu
.m_partSize
[0] == SIZE_2Nx2N
|| m_csp
!= X265_CSP_I444
)
1686 m_entropyCoder
.codeIntraDirChroma(cu
, absPartIdxC
, modeList
);
1690 uint32_t qNumParts
= cuGeom
.numPartitions
>> 2;
1691 if (!(absPartIdxC
& (qNumParts
- 1)))
1692 m_entropyCoder
.codeIntraDirChroma(cu
, absPartIdxC
, modeList
);
1695 codeSubdivCbfQTChroma(cu
, initTuDepth
, absPartIdxC
);
1696 codeCoeffQTChroma(cu
, initTuDepth
, absPartIdxC
, TEXT_CHROMA_U
);
1697 codeCoeffQTChroma(cu
, initTuDepth
, absPartIdxC
, TEXT_CHROMA_V
);
1698 uint32_t bits
= m_entropyCoder
.getNumberOfWrittenBits();
1699 uint64_t cost
= m_rdCost
.m_psyRd
? m_rdCost
.calcPsyRdCost(dist
, bits
, psyEnergy
) : m_rdCost
.calcRdCost(dist
, bits
);
1701 if (cost
< bestCost
)
1705 bestMode
= modeList
[mode
];
1706 extractIntraResultChromaQT(cu
, reconYuv
, absPartIdxC
, initTuDepth
);
1707 memcpy(m_qtTempCbf
[1], cu
.m_cbf
[1] + absPartIdxC
, tuIterator
.absPartIdxStep
* sizeof(uint8_t));
1708 memcpy(m_qtTempCbf
[2], cu
.m_cbf
[2] + absPartIdxC
, tuIterator
.absPartIdxStep
* sizeof(uint8_t));
1709 memcpy(m_qtTempTransformSkipFlag
[1], cu
.m_transformSkip
[1] + absPartIdxC
, tuIterator
.absPartIdxStep
* sizeof(uint8_t));
1710 memcpy(m_qtTempTransformSkipFlag
[2], cu
.m_transformSkip
[2] + absPartIdxC
, tuIterator
.absPartIdxStep
* sizeof(uint8_t));
1714 if (!tuIterator
.isLastSection())
1716 uint32_t zorder
= cuGeom
.encodeIdx
+ absPartIdxC
;
1717 uint32_t dststride
= m_frame
->m_reconPic
->m_strideC
;
1721 dst
= m_frame
->m_reconPic
->getCbAddr(cu
.m_cuAddr
, zorder
);
1722 src
= reconYuv
.getCbAddr(absPartIdxC
);
1723 primitives
.chroma
[m_csp
].copy_pp
[part
](dst
, dststride
, src
, reconYuv
.m_csize
);
1725 dst
= m_frame
->m_reconPic
->getCrAddr(cu
.m_cuAddr
, zorder
);
1726 src
= reconYuv
.getCrAddr(absPartIdxC
);
1727 primitives
.chroma
[m_csp
].copy_pp
[part
](dst
, dststride
, src
, reconYuv
.m_csize
);
1730 memcpy(cu
.m_cbf
[1] + absPartIdxC
, m_qtTempCbf
[1], tuIterator
.absPartIdxStep
* sizeof(uint8_t));
1731 memcpy(cu
.m_cbf
[2] + absPartIdxC
, m_qtTempCbf
[2], tuIterator
.absPartIdxStep
* sizeof(uint8_t));
1732 memcpy(cu
.m_transformSkip
[1] + absPartIdxC
, m_qtTempTransformSkipFlag
[1], tuIterator
.absPartIdxStep
* sizeof(uint8_t));
1733 memcpy(cu
.m_transformSkip
[2] + absPartIdxC
, m_qtTempTransformSkipFlag
[2], tuIterator
.absPartIdxStep
* sizeof(uint8_t));
1734 cu
.setChromIntraDirSubParts(bestMode
, absPartIdxC
, depth
+ initTuDepth
);
1735 totalDistortion
+= bestDist
;
1737 while (tuIterator
.isNextSection());
1739 if (initTuDepth
!= 0)
1741 uint32_t combCbfU
= 0;
1742 uint32_t combCbfV
= 0;
1743 uint32_t qNumParts
= tuIterator
.absPartIdxStep
;
1744 for (uint32_t qIdx
= 0, qPartIdx
= 0; qIdx
< 4; ++qIdx
, qPartIdx
+= qNumParts
)
1746 combCbfU
|= cu
.getCbf(qPartIdx
, TEXT_CHROMA_U
, 1);
1747 combCbfV
|= cu
.getCbf(qPartIdx
, TEXT_CHROMA_V
, 1);
1750 for (uint32_t offs
= 0; offs
< 4 * qNumParts
; offs
++)
1752 cu
.m_cbf
[1][offs
] |= combCbfU
;
1753 cu
.m_cbf
[2][offs
] |= combCbfV
;
1757 /* TODO: remove this */
1758 m_entropyCoder
.load(m_rqt
[depth
].cur
);
1759 return totalDistortion
;
1762 /* estimation of best merge coding of an inter PU (not a merge CU) */
/* NOTE(review): this chunk is a lossy extraction — brace lines and some
 * statements are missing (embedded original line numbers are discontinuous,
 * e.g. 1807 -> 1811, so the body of the best-cost update is partly lost).
 * Code tokens below are left byte-identical; only comments were added.
 * Compare against upstream x265 search.cpp before trusting control flow. */
/* Evaluates every inter merge candidate for this PU by motion-compensated
 * SATD + candidate-index signal bits, and records the winner into 'm'
 * (m.index, m.mvField[], m.interDir).  Presumably returns the best cost
 * (the return statement is among the missing lines) — TODO confirm. */
1763 uint32_t Search::mergeEstimation(CUData
& cu
, const CUGeom
& cuGeom
, int puIdx
, MergeData
& m
)
1765 X265_CHECK(cu
.m_partSize
[0] != SIZE_2Nx2N
, "merge tested on non-2Nx2N partition\n");
/* build the merge candidate list for this PU */
1767 m
.maxNumMergeCand
= cu
.getInterMergeCandidates(m
.absPartIdx
, puIdx
, m
.mvFieldNeighbours
, m
.interDirNeighbours
);
1769 if (cu
.isBipredRestriction())
1771 /* in 8x8 CUs do not allow bidir merge candidates if not 2Nx2N */
1772 for (uint32_t mergeCand
= 0; mergeCand
< m
.maxNumMergeCand
; ++mergeCand
)
1774 if (m
.interDirNeighbours
[mergeCand
] == 3)
1776 m
.interDirNeighbours
[mergeCand
] = 1;
1777 m
.mvFieldNeighbours
[mergeCand
][1].refIdx
= REF_NOT_VALID
;
/* temp prediction buffer shared per depth with other search paths */
1782 Yuv
& tempYuv
= m_rqt
[cuGeom
.depth
].tmpPredYuv
;
1784 uint32_t outCost
= MAX_UINT
;
1785 for (uint32_t mergeCand
= 0; mergeCand
< m
.maxNumMergeCand
; ++mergeCand
)
1787 /* Prevent TMVP candidates from using unavailable reference pixels */
1788 if (m_bFrameParallel
&&
1789 (m
.mvFieldNeighbours
[mergeCand
][0].mv
.y
>= (m_param
->searchRange
+ 1) * 4 ||
1790 m
.mvFieldNeighbours
[mergeCand
][1].mv
.y
>= (m_param
->searchRange
+ 1) * 4))
/* temporarily install the candidate's motion into the CU so motion
 * compensation can run; the chosen winner is re-stored by the caller */
1793 cu
.m_mv
[0][m
.absPartIdx
] = m
.mvFieldNeighbours
[mergeCand
][0].mv
;
1794 cu
.m_refIdx
[0][m
.absPartIdx
] = (int8_t)m
.mvFieldNeighbours
[mergeCand
][0].refIdx
;
1795 cu
.m_mv
[1][m
.absPartIdx
] = m
.mvFieldNeighbours
[mergeCand
][1].mv
;
1796 cu
.m_refIdx
[1][m
.absPartIdx
] = (int8_t)m
.mvFieldNeighbours
[mergeCand
][1].refIdx
;
1798 prepMotionCompensation(cu
, cuGeom
, puIdx
);
1799 motionCompensation(tempYuv
, true, m_me
.bChromaSATD
);
/* candidate cost = luma SATD (+ chroma SATD if enabled) + index bits */
1801 uint32_t costCand
= m_me
.bufSATD(tempYuv
.getLumaAddr(m
.absPartIdx
), tempYuv
.m_size
);
1802 if (m_me
.bChromaSATD
)
1803 costCand
+= m_me
.bufChromaSATD(tempYuv
, m
.absPartIdx
);
1805 uint32_t bitsCand
= getTUBits(mergeCand
, m
.maxNumMergeCand
);
1806 costCand
= costCand
+ m_rdCost
.getCost(bitsCand
);
/* NOTE(review): lines 1808-1810 (updating outCost / m.bits inside this
 * if) are missing from the extraction */
1807 if (costCand
< outCost
)
1811 m
.index
= mergeCand
;
/* copy winning candidate's motion field and inter direction out to 'm' */
1815 m
.mvField
[0] = m
.mvFieldNeighbours
[m
.index
][0];
1816 m
.mvField
[1] = m
.mvFieldNeighbours
[m
.index
][1];
1817 m
.interDir
= m
.interDirNeighbours
[m
.index
];
1822 /* this function assumes the caller has configured its MotionEstimation engine with the
1823 * correct source plane and source PU, and has called prepMotionCompensation() to set
1824 * m_puAbsPartIdx, m_puWidth, and m_puHeight */
/* NOTE(review): lossy extraction — braces and some statements are missing
 * (embedded numbering jumps 1854 -> 1862, losing the bestCost/mvpIdx update
 * inside the AMVP-selection if).  Code tokens left byte-identical; comments
 * only.  This is the per-(list,ref) worker used by distributed (PME) motion
 * estimation; results are merged into master's bestME under m_meLock. */
1825 void Search::singleMotionEstimation(Search
& master
, Mode
& interMode
, const CUGeom
& cuGeom
, int part
, int list
, int ref
)
/* signal bits: list-selection bits + MVP index + ref index */
1827 uint32_t bits
= master
.m_listSelBits
[list
] + MVP_IDX_BITS
;
1828 bits
+= getTUBits(ref
, m_slice
->m_numRefIdx
[list
]);
1830 MV mvc
[(MD_ABOVE_LEFT
+ 1) * 2 + 1];
1831 int numMvc
= interMode
.cu
.fillMvpCand(part
, m_puAbsPartIdx
, list
, ref
, interMode
.amvpCand
[list
][ref
], mvc
);
1834 int merange
= m_param
->searchRange
;
1835 MotionData
* bestME
= interMode
.bestME
[part
];
/* if the two AMVP candidates differ, pick the one with the lower SAD
 * of its predicted block (mvpIdx selection; update lines are missing) */
1837 if (interMode
.amvpCand
[list
][ref
][0] != interMode
.amvpCand
[list
][ref
][1])
1839 uint32_t bestCost
= MAX_INT
;
1840 for (int i
= 0; i
< AMVP_NUM_CANDS
; i
++)
1842 MV mvCand
= interMode
.amvpCand
[list
][ref
][i
];
1844 // NOTE: skip mvCand if Y is > merange and -FN>1
1845 if (m_bFrameParallel
&& (mvCand
.y
>= (merange
+ 1) * 4))
1848 interMode
.cu
.clipMv(mvCand
);
1850 Yuv
& tmpPredYuv
= m_rqt
[cuGeom
.depth
].tmpPredYuv
;
1851 predInterLumaPixel(tmpPredYuv
, *m_slice
->m_refPicList
[list
][ref
]->m_reconPic
, mvCand
);
1852 uint32_t cost
= m_me
.bufSAD(tmpPredYuv
.getLumaAddr(m_puAbsPartIdx
), tmpPredYuv
.m_size
);
/* NOTE(review): lines 1855-1861 (bestCost/mvpIdx bookkeeping and the
 * mvpIdx declaration) are missing from the extraction */
1854 if (bestCost
> cost
)
/* full-pel + subpel motion search around the chosen predictor */
1862 MV mvmin
, mvmax
, outmv
, mvp
= interMode
.amvpCand
[list
][ref
][mvpIdx
];
1863 setSearchRange(interMode
.cu
, mvp
, merange
, mvmin
, mvmax
);
1865 int satdCost
= m_me
.motionEstimate(&m_slice
->m_mref
[list
][ref
], mvmin
, mvmax
, mvp
, numMvc
, mvc
, merange
, outmv
);
1867 /* Get total cost of partition, but only include MV bit cost once */
1868 bits
+= m_me
.bitcost(outmv
);
1869 uint32_t cost
= (satdCost
- m_me
.mvcost(outmv
)) + m_rdCost
.getCost(bits
);
1871 /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
1872 checkBestMVP(interMode
.amvpCand
[list
][ref
], outmv
, mvp
, mvpIdx
, bits
, cost
);
1874 /* tie goes to the smallest ref ID, just like --no-pme */
/* serialize the shared bestME[] update across worker threads */
1875 ScopedLock
_lock(master
.m_meLock
);
1876 if (cost
< bestME
[list
].cost
||
1877 (cost
== bestME
[list
].cost
&& ref
< bestME
[list
].ref
))
1879 bestME
[list
].mv
= outmv
;
1880 bestME
[list
].mvp
= mvp
;
1881 bestME
[list
].mvpIdx
= mvpIdx
;
1882 bestME
[list
].ref
= ref
;
1883 bestME
[list
].cost
= cost
;
1884 bestME
[list
].bits
= bits
;
1888 /* search of the best candidate for inter prediction
1889 * returns true if predYuv was filled with a motion compensated prediction */
/* NOTE(review): lossy extraction — brace lines and multiple statements are
 * missing throughout this function (embedded original line numbers jump,
 * e.g. 1906-1907 lose the 'MergeData merge;' declaration, 1935-1937 lose
 * the early-exit, 2036-2039 lose the worker loop scaffolding, 2181-2195
 * lose mvzero/mvmin/mvmax declarations).  Code tokens are byte-identical;
 * only comments were added.  Per visible code, for each PU this runs merge
 * estimation, uni-directional ME per list/ref (optionally distributed over
 * worker threads), optional bidir and zero-MV bidir trials, then stores the
 * cheapest mode's motion into the CU and motion-compensates predYuv. */
1890 bool Search::predInterSearch(Mode
& interMode
, const CUGeom
& cuGeom
, bool bMergeOnly
, bool bChromaSA8D
)
1892 CUData
& cu
= interMode
.cu
;
1893 Yuv
* predYuv
= &interMode
.predYuv
;
1895 MV mvc
[(MD_ABOVE_LEFT
+ 1) * 2 + 1];
1897 const Slice
*slice
= m_slice
;
1898 int numPart
= cu
.getNumPartInter();
1899 int numPredDir
= slice
->isInterP() ? 1 : 2;
1900 const int* numRefIdx
= slice
->m_numRefIdx
;
1901 uint32_t lastMode
= 0;
1902 int totalmebits
= 0;
/* distributed (PME) path only pays off with more than two total refs */
1903 bool bDistributed
= m_param
->bDistributeMotionEstimation
&& (numRefIdx
[0] + numRefIdx
[1]) > 2;
1905 Yuv
& tmpPredYuv
= m_rqt
[cuGeom
.depth
].tmpPredYuv
;
/* NOTE(review): the 'merge' declaration (original line 1907) is missing */
1908 memset(&merge
, 0, sizeof(merge
));
1910 for (int puIdx
= 0; puIdx
= 0; puIdx
< numPart
; puIdx
++)
1912 MotionData
* bestME
= interMode
.bestME
[puIdx
];
1914 /* sets m_puAbsPartIdx, m_puWidth, m_puHeight */
1915 initMotionCompensation(cu
, cuGeom
, puIdx
);
1917 m_me
.setSourcePU(*interMode
.fencYuv
, cu
.m_cuAddr
, cuGeom
.encodeIdx
, m_puAbsPartIdx
, m_puWidth
, m_puHeight
);
1919 uint32_t mrgCost
= MAX_UINT
;
1921 /* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
1922 if (cu
.m_partSize
[0] != SIZE_2Nx2N
)
1924 merge
.absPartIdx
= m_puAbsPartIdx
;
1925 merge
.width
= m_puWidth
;
1926 merge
.height
= m_puHeight
;
1927 mrgCost
= mergeEstimation(cu
, cuGeom
, puIdx
, merge
);
/* NOTE(review): the visible block after this if appears to be the
 * bMergeOnly path (store merge motion and return) — guard lines missing */
1931 if (mrgCost
== MAX_UINT
)
1933 /* No valid merge modes were found, there is no possible way to
1934 * perform a valid motion compensation prediction, so early-exit */
1938 cu
.m_mergeFlag
[m_puAbsPartIdx
] = true;
1939 cu
.m_mvpIdx
[0][m_puAbsPartIdx
] = merge
.index
; // merge candidate ID is stored in L0 MVP idx
1940 cu
.setPUInterDir(merge
.interDir
, m_puAbsPartIdx
, puIdx
);
1941 cu
.setPUMv(0, merge
.mvField
[0].mv
, m_puAbsPartIdx
, puIdx
);
1942 cu
.setPURefIdx(0, merge
.mvField
[0].refIdx
, m_puAbsPartIdx
, puIdx
);
1943 cu
.setPUMv(1, merge
.mvField
[1].mv
, m_puAbsPartIdx
, puIdx
);
1944 cu
.setPURefIdx(1, merge
.mvField
[1].refIdx
, m_puAbsPartIdx
, puIdx
);
1945 totalmebits
+= merge
.bits
;
1947 prepMotionCompensation(cu
, cuGeom
, puIdx
);
1948 motionCompensation(*predYuv
, true, bChromaSA8D
);
1953 bestME
[0].cost
= MAX_UINT
;
1954 bestME
[1].cost
= MAX_UINT
;
1956 getBlkBits((PartSize
)cu
.m_partSize
[0], slice
->isInterP(), puIdx
, lastMode
, m_listSelBits
);
1958 /* Uni-directional prediction */
/* analysis-load path: refs were decided by a prior pass; only refine */
1959 if (m_param
->analysisMode
== X265_ANALYSIS_LOAD
&& bestME
[0].ref
>= 0)
1961 for (int l
= 0; l
< numPredDir
; l
++)
1963 int ref
= bestME
[l
].ref
;
1964 uint32_t bits
= m_listSelBits
[l
] + MVP_IDX_BITS
;
1965 bits
+= getTUBits(ref
, numRefIdx
[l
]);
1967 int numMvc
= cu
.fillMvpCand(puIdx
, m_puAbsPartIdx
, l
, ref
, interMode
.amvpCand
[l
][ref
], mvc
);
1969 // Pick the best possible MVP from AMVP candidates based on least residual
1971 int merange
= m_param
->searchRange
;
1973 if (interMode
.amvpCand
[l
][ref
][0] != interMode
.amvpCand
[l
][ref
][1])
1975 uint32_t bestCost
= MAX_INT
;
1976 for (int i
= 0; i
< AMVP_NUM_CANDS
; i
++)
1978 MV mvCand
= interMode
.amvpCand
[l
][ref
][i
];
1980 // NOTE: skip mvCand if Y is > merange and -FN>1
1981 if (m_bFrameParallel
&& (mvCand
.y
>= (merange
+ 1) * 4))
1985 predInterLumaPixel(tmpPredYuv
, *slice
->m_refPicList
[l
][ref
]->m_reconPic
, mvCand
);
1986 uint32_t cost
= m_me
.bufSAD(tmpPredYuv
.getLumaAddr(m_puAbsPartIdx
), tmpPredYuv
.m_size
);
/* NOTE(review): mvpIdx declaration/update lines (1989-1995) missing */
1988 if (bestCost
> cost
)
1996 MV mvmin
, mvmax
, outmv
, mvp
= interMode
.amvpCand
[l
][ref
][mvpIdx
];
1999 setSearchRange(cu
, mvp
, merange
, mvmin
, mvmax
);
2000 satdCost
= m_me
.motionEstimate(&slice
->m_mref
[l
][ref
], mvmin
, mvmax
, mvp
, numMvc
, mvc
, merange
, outmv
);
2002 /* Get total cost of partition, but only include MV bit cost once */
2003 bits
+= m_me
.bitcost(outmv
);
2004 uint32_t cost
= (satdCost
- m_me
.mvcost(outmv
)) + m_rdCost
.getCost(bits
);
2006 /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
2007 checkBestMVP(interMode
.amvpCand
[l
][ref
], outmv
, mvp
, mvpIdx
, bits
, cost
);
2009 if (cost
< bestME
[l
].cost
)
2011 bestME
[l
].mv
= outmv
;
2012 bestME
[l
].mvp
= mvp
;
2013 bestME
[l
].mvpIdx
= mvpIdx
;
2014 bestME
[l
].cost
= cost
;
2015 bestME
[l
].bits
= bits
;
/* distributed ME: enqueue this PU's (list,ref) searches to the pool,
 * keep L0-ref0 for this thread, then wait for workers to finish */
2019 else if (bDistributed
)
2022 m_curInterMode
= &interMode
;
2023 m_curGeom
= &cuGeom
;
2026 m_numAcquiredME
= 1;
2027 m_numCompletedME
= 0;
2028 m_totalNumME
= numRefIdx
[0] + numRefIdx
[1];
2032 JobProvider::enqueue();
2034 for (int i
= 1; i
< m_totalNumME
; i
++)
2035 m_pool
->pokeIdleThread();
2040 if (m_totalNumME
> m_numAcquiredME
)
2042 int id
= m_numAcquiredME
++;
2045 if (id
< numRefIdx
[0])
2046 singleMotionEstimation(*this, interMode
, cuGeom
, puIdx
, 0, id
);
2048 singleMotionEstimation(*this, interMode
, cuGeom
, puIdx
, 1, id
- numRefIdx
[0]);
2057 while (m_totalNumME
> m_numAcquiredME
);
2060 JobProvider::dequeue();
2062 /* we saved L0-0 for ourselves */
2063 singleMotionEstimation(*this, interMode
, cuGeom
, puIdx
, 0, 0);
2066 if (++m_numCompletedME
== m_totalNumME
)
2067 m_meCompletionEvent
.trigger();
2070 m_meCompletionEvent
.wait();
/* conventional path: exhaustive uni-directional ME over all lists/refs */
2074 for (int l
= 0; l
< numPredDir
; l
++)
2076 for (int ref
= 0; ref
< numRefIdx
[l
]; ref
++)
2078 uint32_t bits
= m_listSelBits
[l
] + MVP_IDX_BITS
;
2079 bits
+= getTUBits(ref
, numRefIdx
[l
]);
2081 int numMvc
= cu
.fillMvpCand(puIdx
, m_puAbsPartIdx
, l
, ref
, interMode
.amvpCand
[l
][ref
], mvc
);
2083 // Pick the best possible MVP from AMVP candidates based on least residual
2085 int merange
= m_param
->searchRange
;
2087 if (interMode
.amvpCand
[l
][ref
][0] != interMode
.amvpCand
[l
][ref
][1])
2089 uint32_t bestCost
= MAX_INT
;
2090 for (int i
= 0; i
< AMVP_NUM_CANDS
; i
++)
2092 MV mvCand
= interMode
.amvpCand
[l
][ref
][i
];
2094 // NOTE: skip mvCand if Y is > merange and -FN>1
2095 if (m_bFrameParallel
&& (mvCand
.y
>= (merange
+ 1) * 4))
2099 predInterLumaPixel(tmpPredYuv
, *slice
->m_refPicList
[l
][ref
]->m_reconPic
, mvCand
);
2100 uint32_t cost
= m_me
.bufSAD(tmpPredYuv
.getLumaAddr(m_puAbsPartIdx
), tmpPredYuv
.m_size
);
/* NOTE(review): mvpIdx declaration/update lines (2103-2109) missing */
2102 if (bestCost
> cost
)
2110 MV mvmin
, mvmax
, outmv
, mvp
= interMode
.amvpCand
[l
][ref
][mvpIdx
];
2112 setSearchRange(cu
, mvp
, merange
, mvmin
, mvmax
);
2113 int satdCost
= m_me
.motionEstimate(&slice
->m_mref
[l
][ref
], mvmin
, mvmax
, mvp
, numMvc
, mvc
, merange
, outmv
);
2115 /* Get total cost of partition, but only include MV bit cost once */
2116 bits
+= m_me
.bitcost(outmv
);
2117 uint32_t cost
= (satdCost
- m_me
.mvcost(outmv
)) + m_rdCost
.getCost(bits
);
2119 /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */
2120 checkBestMVP(interMode
.amvpCand
[l
][ref
], outmv
, mvp
, mvpIdx
, bits
, cost
);
2122 if (cost
< bestME
[l
].cost
)
2124 bestME
[l
].mv
= outmv
;
2125 bestME
[l
].mvp
= mvp
;
2126 bestME
[l
].mvpIdx
= mvpIdx
;
2127 bestME
[l
].ref
= ref
;
2128 bestME
[l
].cost
= cost
;
2129 bestME
[l
].bits
= bits
;
2135 /* Bi-directional prediction */
2136 MotionData bidir
[2];
2137 uint32_t bidirCost
= MAX_UINT
;
/* NOTE(review): the 'bidirBits' declaration (near original 2138) missing */
2140 if (slice
->isInterB() && !cu
.isBipredRestriction() && /* biprediction is possible for this PU */
2141 cu
.m_partSize
[m_puAbsPartIdx
] != SIZE_2Nx2N
&& /* 2Nx2N biprediction is handled elsewhere */
2142 bestME
[0].cost
!= MAX_UINT
&& bestME
[1].cost
!= MAX_UINT
)
2144 bidir
[0] = bestME
[0];
2145 bidir
[1] = bestME
[1];
/* bidir cost: average the two best uni predictions and SATD vs source */
2149 if (m_me
.bChromaSATD
)
2151 cu
.m_mv
[0][m_puAbsPartIdx
] = bidir
[0].mv
;
2152 cu
.m_refIdx
[0][m_puAbsPartIdx
] = (int8_t)bidir
[0].ref
;
2153 cu
.m_mv
[1][m_puAbsPartIdx
] = bidir
[1].mv
;
2154 cu
.m_refIdx
[1][m_puAbsPartIdx
] = (int8_t)bidir
[1].ref
;
2156 prepMotionCompensation(cu
, cuGeom
, puIdx
);
2157 motionCompensation(tmpPredYuv
, true, true);
2159 satdCost
= m_me
.bufSATD(tmpPredYuv
.getLumaAddr(m_puAbsPartIdx
), tmpPredYuv
.m_size
) +
2160 m_me
.bufChromaSATD(tmpPredYuv
, m_puAbsPartIdx
);
2164 PicYuv
* refPic0
= slice
->m_refPicList
[0][bestME
[0].ref
]->m_reconPic
;
2165 PicYuv
* refPic1
= slice
->m_refPicList
[1][bestME
[1].ref
]->m_reconPic
;
2166 Yuv
* bidirYuv
= m_rqt
[cuGeom
.depth
].bidirPredYuv
;
2168 /* Generate reference subpels */
2169 predInterLumaPixel(bidirYuv
[0], *refPic0
, bestME
[0].mv
);
2170 predInterLumaPixel(bidirYuv
[1], *refPic1
, bestME
[1].mv
);
2172 primitives
.pixelavg_pp
[m_me
.partEnum
](tmpPredYuv
.m_buf
[0], tmpPredYuv
.m_size
, bidirYuv
[0].getLumaAddr(m_puAbsPartIdx
), bidirYuv
[0].m_size
,
2173 bidirYuv
[1].getLumaAddr(m_puAbsPartIdx
), bidirYuv
[1].m_size
, 32);
2174 satdCost
= m_me
.bufSATD(tmpPredYuv
.m_buf
[0], tmpPredYuv
.m_size
);
2177 bidirBits
= bestME
[0].bits
+ bestME
[1].bits
+ m_listSelBits
[2] - (m_listSelBits
[0] + m_listSelBits
[1]);
2178 bidirCost
= satdCost
+ m_rdCost
.getCost(bidirBits
);
/* also try zero-MV bidir (coincident reference blocks) when useful */
2180 bool bTryZero
= bestME
[0].mv
.notZero() || bestME
[1].mv
.notZero();
2183 /* Do not try zero MV if unidir motion predictors are beyond
2184 * valid search area */
/* NOTE(review): mvzero/mvmin/mvmax declarations missing from extraction */
2186 int merange
= X265_MAX(m_param
->sourceWidth
, m_param
->sourceHeight
);
2187 setSearchRange(cu
, mvzero
, merange
, mvmin
, mvmax
);
2188 mvmax
.y
+= 2; // there is some pad for subpel refine
2192 bTryZero
&= bestME
[0].mvp
.checkRange(mvmin
, mvmax
);
2193 bTryZero
&= bestME
[1].mvp
.checkRange(mvmin
, mvmax
);
2197 /* coincident blocks of the two reference pictures */
2198 if (m_me
.bChromaSATD
)
2200 cu
.m_mv
[0][m_puAbsPartIdx
] = mvzero
;
2201 cu
.m_refIdx
[0][m_puAbsPartIdx
] = (int8_t)bidir
[0].ref
;
2202 cu
.m_mv
[1][m_puAbsPartIdx
] = mvzero
;
2203 cu
.m_refIdx
[1][m_puAbsPartIdx
] = (int8_t)bidir
[1].ref
;
2205 prepMotionCompensation(cu
, cuGeom
, puIdx
);
2206 motionCompensation(tmpPredYuv
, true, true);
2208 satdCost
= m_me
.bufSATD(tmpPredYuv
.getLumaAddr(m_puAbsPartIdx
), tmpPredYuv
.m_size
) +
2209 m_me
.bufChromaSATD(tmpPredYuv
, m_puAbsPartIdx
);
2213 const pixel
* ref0
= m_slice
->m_mref
[0][bestME
[0].ref
].getLumaAddr(cu
.m_cuAddr
, cuGeom
.encodeIdx
+ m_puAbsPartIdx
);
2214 const pixel
* ref1
= m_slice
->m_mref
[1][bestME
[1].ref
].getLumaAddr(cu
.m_cuAddr
, cuGeom
.encodeIdx
+ m_puAbsPartIdx
);
2215 intptr_t refStride
= slice
->m_mref
[0][0].lumaStride
;
2217 primitives
.pixelavg_pp
[m_me
.partEnum
](tmpPredYuv
.m_buf
[0], tmpPredYuv
.m_size
, ref0
, refStride
, ref1
, refStride
, 32);
2218 satdCost
= m_me
.bufSATD(tmpPredYuv
.m_buf
[0], tmpPredYuv
.m_size
);
/* re-derive bit costs for zero MVs against each list's MVP */
2221 MV mvp0
= bestME
[0].mvp
;
2222 int mvpIdx0
= bestME
[0].mvpIdx
;
2223 uint32_t bits0
= bestME
[0].bits
- m_me
.bitcost(bestME
[0].mv
, mvp0
) + m_me
.bitcost(mvzero
, mvp0
);
2225 MV mvp1
= bestME
[1].mvp
;
2226 int mvpIdx1
= bestME
[1].mvpIdx
;
2227 uint32_t bits1
= bestME
[1].bits
- m_me
.bitcost(bestME
[1].mv
, mvp1
) + m_me
.bitcost(mvzero
, mvp1
);
2229 uint32_t cost
= satdCost
+ m_rdCost
.getCost(bits0
) + m_rdCost
.getCost(bits1
);
2231 /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
2232 checkBestMVP(interMode
.amvpCand
[0][bestME
[0].ref
], mvzero
, mvp0
, mvpIdx0
, bits0
, cost
);
2233 checkBestMVP(interMode
.amvpCand
[1][bestME
[1].ref
], mvzero
, mvp1
, mvpIdx1
, bits1
, cost
);
2235 if (cost
< bidirCost
)
2237 bidir
[0].mv
= mvzero
;
2238 bidir
[1].mv
= mvzero
;
2239 bidir
[0].mvp
= mvp0
;
2240 bidir
[1].mvp
= mvp1
;
2241 bidir
[0].mvpIdx
= mvpIdx0
;
2242 bidir
[1].mvpIdx
= mvpIdx1
;
2244 bidirBits
= bits0
+ bits1
+ m_listSelBits
[2] - (m_listSelBits
[0] + m_listSelBits
[1]);
2249 /* select best option and store into CU */
2250 if (mrgCost
< bidirCost
&& mrgCost
< bestME
[0].cost
&& mrgCost
< bestME
[1].cost
)
2252 cu
.m_mergeFlag
[m_puAbsPartIdx
] = true;
2253 cu
.m_mvpIdx
[0][m_puAbsPartIdx
] = merge
.index
; // merge candidate ID is stored in L0 MVP idx
2254 cu
.setPUInterDir(merge
.interDir
, m_puAbsPartIdx
, puIdx
);
2255 cu
.setPUMv(0, merge
.mvField
[0].mv
, m_puAbsPartIdx
, puIdx
);
2256 cu
.setPURefIdx(0, merge
.mvField
[0].refIdx
, m_puAbsPartIdx
, puIdx
);
2257 cu
.setPUMv(1, merge
.mvField
[1].mv
, m_puAbsPartIdx
, puIdx
);
2258 cu
.setPURefIdx(1, merge
.mvField
[1].refIdx
, m_puAbsPartIdx
, puIdx
);
2260 totalmebits
+= merge
.bits
;
2262 else if (bidirCost
< bestME
[0].cost
&& bidirCost
< bestME
[1].cost
)
2266 cu
.m_mergeFlag
[m_puAbsPartIdx
] = false;
2267 cu
.setPUInterDir(3, m_puAbsPartIdx
, puIdx
);
2268 cu
.setPUMv(0, bidir
[0].mv
, m_puAbsPartIdx
, puIdx
);
2269 cu
.setPURefIdx(0, bestME
[0].ref
, m_puAbsPartIdx
, puIdx
);
2270 cu
.m_mvd
[0][m_puAbsPartIdx
] = bidir
[0].mv
- bidir
[0].mvp
;
2271 cu
.m_mvpIdx
[0][m_puAbsPartIdx
] = bidir
[0].mvpIdx
;
2273 cu
.setPUMv(1, bidir
[1].mv
, m_puAbsPartIdx
, puIdx
);
2274 cu
.setPURefIdx(1, bestME
[1].ref
, m_puAbsPartIdx
, puIdx
);
2275 cu
.m_mvd
[1][m_puAbsPartIdx
] = bidir
[1].mv
- bidir
[1].mvp
;
2276 cu
.m_mvpIdx
[1][m_puAbsPartIdx
] = bidir
[1].mvpIdx
;
2278 totalmebits
+= bidirBits
;
2280 else if (bestME
[0].cost
<= bestME
[1].cost
)
2284 cu
.m_mergeFlag
[m_puAbsPartIdx
] = false;
2285 cu
.setPUInterDir(1, m_puAbsPartIdx
, puIdx
);
2286 cu
.setPUMv(0, bestME
[0].mv
, m_puAbsPartIdx
, puIdx
);
2287 cu
.setPURefIdx(0, bestME
[0].ref
, m_puAbsPartIdx
, puIdx
);
2288 cu
.m_mvd
[0][m_puAbsPartIdx
] = bestME
[0].mv
- bestME
[0].mvp
;
2289 cu
.m_mvpIdx
[0][m_puAbsPartIdx
] = bestME
[0].mvpIdx
;
2291 cu
.setPURefIdx(1, REF_NOT_VALID
, m_puAbsPartIdx
, puIdx
);
2292 cu
.setPUMv(1, mvzero
, m_puAbsPartIdx
, puIdx
);
2294 totalmebits
+= bestME
[0].bits
;
/* final else: uni-directional list-1 prediction wins */
2300 cu
.m_mergeFlag
[m_puAbsPartIdx
] = false;
2301 cu
.setPUInterDir(2, m_puAbsPartIdx
, puIdx
);
2302 cu
.setPUMv(1, bestME
[1].mv
, m_puAbsPartIdx
, puIdx
);
2303 cu
.setPURefIdx(1, bestME
[1].ref
, m_puAbsPartIdx
, puIdx
);
2304 cu
.m_mvd
[1][m_puAbsPartIdx
] = bestME
[1].mv
- bestME
[1].mvp
;
2305 cu
.m_mvpIdx
[1][m_puAbsPartIdx
] = bestME
[1].mvpIdx
;
2307 cu
.setPURefIdx(0, REF_NOT_VALID
, m_puAbsPartIdx
, puIdx
);
2308 cu
.setPUMv(0, mvzero
, m_puAbsPartIdx
, puIdx
);
2310 totalmebits
+= bestME
[1].bits
;
2313 prepMotionCompensation(cu
, cuGeom
, puIdx
);
2314 motionCompensation(*predYuv
, true, bChromaSA8D
);
2317 interMode
.sa8dBits
+= totalmebits
;
/* NOTE(review): lossy extraction — brace lines and some table rows are
 * missing (embedded numbering jumps 2334 -> 2343, 2350 -> 2359, 2363 ->
 * 2369).  Code tokens left byte-identical; comments only. */
/* Fills blockBit[3] with the bit cost of signaling L0 / L1 / BI prediction
 * for the given partition mode, slice type, PU index and previous PU's
 * mode (the lookup tables encode the CABAC-context-dependent estimates). */
2321 void Search::getBlkBits(PartSize cuMode
, bool bPSlice
, int partIdx
, uint32_t lastMode
, uint32_t blockBit
[3])
2323 if (cuMode
== SIZE_2Nx2N
)
2325 blockBit
[0] = (!bPSlice
) ? 3 : 1;
2329 else if (cuMode
== SIZE_2NxN
|| cuMode
== SIZE_2NxnU
|| cuMode
== SIZE_2NxnD
)
/* [partIdx][lastMode][list]: horizontal partition table (rows truncated
 * by extraction — presumably a third partIdx plane existed; TODO confirm) */
2331 static const uint32_t listBits
[2][3][3] =
2333 { { 0, 0, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
2334 { { 5, 7, 7 }, { 7, 5, 7 }, { 9 - 3, 9 - 3, 9 - 3 } }
2343 memcpy(blockBit
, listBits
[partIdx
][lastMode
], 3 * sizeof(uint32_t));
2345 else if (cuMode
== SIZE_Nx2N
|| cuMode
== SIZE_nLx2N
|| cuMode
== SIZE_nRx2N
)
/* vertical partition table, same indexing */
2347 static const uint32_t listBits
[2][3][3] =
2349 { { 0, 2, 3 }, { 0, 0, 0 }, { 0, 0, 0 } },
2350 { { 5, 7, 7 }, { 7 - 2, 7 - 2, 9 - 2 }, { 9 - 3, 9 - 3, 9 - 3 } }
2359 memcpy(blockBit
, listBits
[partIdx
][lastMode
], 3 * sizeof(uint32_t));
2361 else if (cuMode
== SIZE_NxN
)
2363 blockBit
[0] = (!bPSlice
) ? 3 : 1;
/* unreachable for any valid PartSize */
2369 X265_CHECK(0, "getBlkBits: unknown cuMode\n");
2373 /* Check if using an alternative MVP would result in a smaller MVD + signal bits */
/* NOTE(review): lossy extraction — the lines between 2380 and 2385
 * (presumably the early-return when the alternative MVP is not cheaper,
 * plus the outMvpIdx/mvPred update) are missing.  Code tokens left
 * byte-identical; comments only. */
/* In/out params: mvPred, outMvpIdx, outBits, outCost are updated in place
 * when switching to the other AMVP candidate reduces total signal bits. */
2374 void Search::checkBestMVP(MV
* amvpCand
, MV mv
, MV
& mvPred
, int& outMvpIdx
, uint32_t& outBits
, uint32_t& outCost
) const
2376 X265_CHECK(amvpCand
[outMvpIdx
] == mvPred
, "checkBestMVP: unexpected mvPred\n");
/* AMVP has exactly two candidates, so the alternative index is !current */
2378 int mvpIdx
= !outMvpIdx
;
2379 MV mvp
= amvpCand
[mvpIdx
];
2380 int diffBits
= m_me
.bitcost(mv
, mvp
) - m_me
.bitcost(mv
, mvPred
);
/* re-derive cost with the updated bit count (subtract old bit cost,
 * add new) so lambda weighting stays consistent */
2385 uint32_t origOutBits
= outBits
;
2386 outBits
= origOutBits
+ diffBits
;
2387 outCost
= (outCost
- m_rdCost
.getCost(origOutBits
)) + m_rdCost
.getCost(outBits
);
/* NOTE(review): lossy extraction — lines 2396-2401 (computing mvmin/mvmax
 * from mvp +/- dist and clipping to the picture area) are missing, so the
 * outputs' initialization is not visible here.  Code tokens left
 * byte-identical; comments only. */
/* Derives the motion search window [mvmin, mvmax] (quarter-pel units)
 * around predictor 'mvp' for range 'merange', clipped to the max MV
 * length and, under frame parallelism, to the decoded-reference lag. */
2391 void Search::setSearchRange(const CUData
& cu
, MV mvp
, int merange
, MV
& mvmin
, MV
& mvmax
) const
/* merange is in full pels; << 2 converts to quarter-pel units */
2395 MV
dist((int16_t)merange
<< 2, (int16_t)merange
<< 2);
2402 /* Clip search range to signaled maximum MV length.
2403 * We do not support this VUI field being changed from the default */
2404 const int maxMvLen
= (1 << 15) - 1;
2405 mvmin
.x
= X265_MAX(mvmin
.x
, -maxMvLen
);
2406 mvmin
.y
= X265_MAX(mvmin
.y
, -maxMvLen
);
2407 mvmax
.x
= X265_MIN(mvmax
.x
, maxMvLen
);
2408 mvmax
.y
= X265_MIN(mvmax
.y
, maxMvLen
);
2413 /* conditional clipping for frame parallelism */
/* keep vertical search inside rows the reference frame has reconstructed */
2414 mvmin
.y
= X265_MIN(mvmin
.y
, (int16_t)m_refLagPixels
);
2415 mvmax
.y
= X265_MIN(mvmax
.y
, (int16_t)m_refLagPixels
);
2418 /* Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
/* NOTE(review): lossy extraction — brace lines and a few statements are
 * missing (per the discontinuous embedded numbering, e.g. 2431, 2433,
 * 2435).  Code tokens left byte-identical; comments only. */
/* Evaluates SKIP coding for this CU: reconstruction == prediction (no
 * residual), SSE distortion measured on all three planes, and only the
 * skip-flag + merge-index bits are counted. */
2419 void Search::encodeResAndCalcRdSkipCU(Mode
& interMode
)
2421 CUData
& cu
= interMode
.cu
;
2422 Yuv
* reconYuv
= &interMode
.reconYuv
;
2423 const Yuv
* fencYuv
= interMode
.fencYuv
;
2425 X265_CHECK(!cu
.isIntra(0), "intra CU not expected\n");
2427 uint32_t cuSize
= 1 << cu
.m_log2CUSize
[0];
2428 uint32_t depth
= cu
.m_cuDepth
[0];
2430 // No residual coding : SKIP mode
2432 cu
.setPredModeSubParts(MODE_SKIP
);
2434 cu
.setTUDepthSubParts(0, 0, depth
);
/* SKIP: recon is simply the motion-compensated prediction */
2436 reconYuv
->copyFromYuv(interMode
.predYuv
);
/* luma SSE distortion, then chroma scaled by the chroma lambda weight */
2439 int part
= partitionFromLog2Size(cu
.m_log2CUSize
[0]);
2440 interMode
.distortion
= primitives
.sse_pp
[part
](fencYuv
->m_buf
[0], fencYuv
->m_size
, reconYuv
->m_buf
[0], reconYuv
->m_size
);
2442 part
= partitionFromSizes(cuSize
>> m_hChromaShift
, cuSize
>> m_vChromaShift
);
2443 interMode
.distortion
+= m_rdCost
.scaleChromaDist(1, primitives
.sse_pp
[part
](fencYuv
->m_buf
[1], fencYuv
->m_csize
, reconYuv
->m_buf
[1], reconYuv
->m_csize
));
2444 interMode
.distortion
+= m_rdCost
.scaleChromaDist(2, primitives
.sse_pp
[part
](fencYuv
->m_buf
[2], fencYuv
->m_csize
, reconYuv
->m_buf
[2], reconYuv
->m_csize
));
/* count the bits a SKIP CU actually signals */
2446 m_entropyCoder
.load(m_rqt
[depth
].cur
);
2447 m_entropyCoder
.resetBits();
2448 if (m_slice
->m_pps
->bTransquantBypassEnabled
)
2449 m_entropyCoder
.codeCUTransquantBypassFlag(cu
.m_tqBypass
[0]);
2450 m_entropyCoder
.codeSkipFlag(cu
, 0);
2451 m_entropyCoder
.codeMergeIndex(cu
, 0);
2453 interMode
.mvBits
= m_entropyCoder
.getNumberOfWrittenBits();
2454 interMode
.coeffBits
= 0;
2455 interMode
.totalBits
= interMode
.mvBits
;
2456 if (m_rdCost
.m_psyRd
)
2457 interMode
.psyEnergy
= m_rdCost
.psyCost(cu
.m_log2CUSize
[0] - 2, fencYuv
->m_buf
[0], fencYuv
->m_size
, reconYuv
->m_buf
[0], reconYuv
->m_size
);
2459 updateModeCost(interMode
);
/* save CABAC state for later mode comparison */
2460 m_entropyCoder
.store(interMode
.contexts
);
2463 /* encode residual and calculate rate-distortion for a CU block.
2464 * Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
/* NOTE(review): lossy extraction — brace lines and several statements are
 * missing (e.g. the 'Cost costs;' declaration near original line 2491 and
 * the cbf-clearing inside the cbf0 branch around 2517-2522).  Code tokens
 * left byte-identical; comments only. */
2465 void Search::encodeResAndCalcRdInterCU(Mode
& interMode
, const CUGeom
& cuGeom
)
2467 CUData
& cu
= interMode
.cu
;
2468 Yuv
* reconYuv
= &interMode
.reconYuv
;
2469 Yuv
* predYuv
= &interMode
.predYuv
;
2470 ShortYuv
* resiYuv
= &m_rqt
[cuGeom
.depth
].tmpResiYuv
;
2471 const Yuv
* fencYuv
= interMode
.fencYuv
;
2473 X265_CHECK(!cu
.isIntra(0), "intra CU not expected\n");
2475 uint32_t log2CUSize
= cu
.m_log2CUSize
[0];
2476 uint32_t cuSize
= 1 << log2CUSize
;
2477 uint32_t depth
= cu
.m_cuDepth
[0];
2479 int part
= partitionFromLog2Size(log2CUSize
);
2480 int cpart
= partitionFromSizes(cuSize
>> m_hChromaShift
, cuSize
>> m_vChromaShift
);
2482 m_quant
.setQPforQuant(interMode
.cu
);
/* residual = source - prediction, for the whole CU */
2484 resiYuv
->subtract(*fencYuv
, *predYuv
, log2CUSize
);
2486 uint32_t tuDepthRange
[2];
2487 cu
.getInterTUQtDepthRange(tuDepthRange
, 0);
2489 m_entropyCoder
.load(m_rqt
[depth
].cur
);
/* RDO over the residual quad-tree ('costs' declaration lost to extraction) */
2492 estimateResidualQT(interMode
, cuGeom
, 0, depth
, *resiYuv
, costs
, tuDepthRange
);
2494 if (!cu
.m_tqBypass
[0])
/* cbf==0 alternative: distortion of prediction alone, all planes */
2496 uint32_t cbf0Dist
= primitives
.sse_pp
[part
](fencYuv
->m_buf
[0], fencYuv
->m_size
, predYuv
->m_buf
[0], predYuv
->m_size
);
2497 cbf0Dist
+= m_rdCost
.scaleChromaDist(1, primitives
.sse_pp
[cpart
](fencYuv
->m_buf
[1], predYuv
->m_csize
, predYuv
->m_buf
[1], predYuv
->m_csize
));
2498 cbf0Dist
+= m_rdCost
.scaleChromaDist(2, primitives
.sse_pp
[cpart
](fencYuv
->m_buf
[2], predYuv
->m_csize
, predYuv
->m_buf
[2], predYuv
->m_csize
));
2500 /* Consider the RD cost of not signaling any residual */
2501 m_entropyCoder
.load(m_rqt
[depth
].cur
);
2502 m_entropyCoder
.resetBits();
2503 m_entropyCoder
.codeQtRootCbfZero();
2504 uint32_t cbf0Bits
= m_entropyCoder
.getNumberOfWrittenBits();
/* cbf0Cost declaration is among the missing lines (near original 2506) */
2507 uint32_t cbf0Energy
;
2508 if (m_rdCost
.m_psyRd
)
2510 cbf0Energy
= m_rdCost
.psyCost(log2CUSize
- 2, fencYuv
->m_buf
[0], fencYuv
->m_size
, predYuv
->m_buf
[0], predYuv
->m_size
);
2511 cbf0Cost
= m_rdCost
.calcPsyRdCost(cbf0Dist
, cbf0Bits
, cbf0Energy
);
2514 cbf0Cost
= m_rdCost
.calcRdCost(cbf0Dist
, cbf0Bits
);
/* if skipping the residual is cheaper, drop it (cbf-clear lines missing) */
2516 if (cbf0Cost
< costs
.rdcost
)
2519 cu
.setTUDepthSubParts(0, 0, depth
);
2523 if (cu
.getQtRootCbf(0))
2524 saveResidualQTData(cu
, *resiYuv
, 0, depth
);
2526 /* calculate signal bits for inter/merge/skip coded CU */
2527 m_entropyCoder
.load(m_rqt
[depth
].cur
);
2529 uint32_t coeffBits
, bits
;
2530 if (cu
.m_mergeFlag
[0] && cu
.m_partSize
[0] == SIZE_2Nx2N
&& !cu
.getQtRootCbf(0))
/* 2Nx2N merge with no residual degrades to SKIP: minimal signaling */
2532 cu
.setPredModeSubParts(MODE_SKIP
);
2535 m_entropyCoder
.resetBits();
2536 if (m_slice
->m_pps
->bTransquantBypassEnabled
)
2537 m_entropyCoder
.codeCUTransquantBypassFlag(cu
.m_tqBypass
[0]);
2538 m_entropyCoder
.codeSkipFlag(cu
, 0);
2539 m_entropyCoder
.codeMergeIndex(cu
, 0);
2541 bits
= m_entropyCoder
.getNumberOfWrittenBits();
/* else-branch: full inter CU signaling (mode, partition, motion, coeffs) */
2545 m_entropyCoder
.resetBits();
2546 if (m_slice
->m_pps
->bTransquantBypassEnabled
)
2547 m_entropyCoder
.codeCUTransquantBypassFlag(cu
.m_tqBypass
[0]);
2548 m_entropyCoder
.codeSkipFlag(cu
, 0);
2549 m_entropyCoder
.codePredMode(cu
.m_predMode
[0]);
2550 m_entropyCoder
.codePartSize(cu
, 0, cu
.m_cuDepth
[0]);
2551 m_entropyCoder
.codePredInfo(cu
, 0);
2552 uint32_t mvBits
= m_entropyCoder
.getNumberOfWrittenBits();
2554 bool bCodeDQP
= m_slice
->m_pps
->bUseDQP
;
2555 m_entropyCoder
.codeCoeff(cu
, 0, bCodeDQP
, tuDepthRange
);
2556 bits
= m_entropyCoder
.getNumberOfWrittenBits();
2558 coeffBits
= bits
- mvBits
;
2561 m_entropyCoder
.store(interMode
.contexts
);
/* recon = clip(pred + residual) when any coeff survived, else just pred */
2563 if (cu
.getQtRootCbf(0))
2564 reconYuv
->addClip(*predYuv
, *resiYuv
, log2CUSize
);
2566 reconYuv
->copyFromYuv(*predYuv
);
2568 // update with clipped distortion and cost (qp estimation loop uses unclipped values)
2569 uint32_t bestDist
= primitives
.sse_pp
[part
](fencYuv
->m_buf
[0], fencYuv
->m_size
, reconYuv
->m_buf
[0], reconYuv
->m_size
);
2570 bestDist
+= m_rdCost
.scaleChromaDist(1, primitives
.sse_pp
[cpart
](fencYuv
->m_buf
[1], fencYuv
->m_csize
, reconYuv
->m_buf
[1], reconYuv
->m_csize
));
2571 bestDist
+= m_rdCost
.scaleChromaDist(2, primitives
.sse_pp
[cpart
](fencYuv
->m_buf
[2], fencYuv
->m_csize
, reconYuv
->m_buf
[2], reconYuv
->m_csize
));
2572 if (m_rdCost
.m_psyRd
)
2573 interMode
.psyEnergy
= m_rdCost
.psyCost(log2CUSize
- 2, fencYuv
->m_buf
[0], fencYuv
->m_size
, reconYuv
->m_buf
[0], reconYuv
->m_size
);
2575 interMode
.totalBits
= bits
;
2576 interMode
.distortion
= bestDist
;
2577 interMode
.coeffBits
= coeffBits
;
2578 interMode
.mvBits
= bits
- coeffBits
;
2579 updateModeCost(interMode
);
/* NOTE(review): lossy extraction — brace lines and several guard
 * statements are missing (e.g. the 'if (numSigY)' / 'if (numSigU)' /
 * 'if (numSigV)' conditions implied between 2626/2630, 2660/2663,
 * 2674/2677, and the recursion split around 2591-2597).  Code tokens left
 * byte-identical; comments only. */
/* Forward + inverse transform/quant of the inter residual for one TU
 * (recursing into four sub-TUs when the quad-tree requires it), writing
 * reconstructed residual back into tmpResiYuv and setting CBF flags.
 * Used by the no-RDO residual coding path. */
2582 void Search::residualTransformQuantInter(Mode
& mode
, const CUGeom
& cuGeom
, uint32_t absPartIdx
, uint32_t depth
, const uint32_t depthRange
[2])
2584 CUData
& cu
= mode
.cu
;
2585 X265_CHECK(cu
.m_cuDepth
[0] == cu
.m_cuDepth
[absPartIdx
], "invalid depth\n");
2587 uint32_t log2TrSize
= g_maxLog2CUSize
- depth
;
2588 uint32_t tuDepth
= depth
- cu
.m_cuDepth
[0];
/* non-2Nx2N partitions at CU depth force a TU split when still allowed */
2590 bool bCheckFull
= log2TrSize
<= depthRange
[1];
2591 if (cu
.m_partSize
[0] != SIZE_2Nx2N
&& depth
== cu
.m_cuDepth
[absPartIdx
] && log2TrSize
> depthRange
[0])
2597 uint32_t log2TrSizeC
= log2TrSize
- m_hChromaShift
;
2598 bool bCodeChroma
= true;
2599 uint32_t tuDepthC
= tuDepth
;
/* 4x4 luma in subsampled chroma: code chroma once per four luma TUs */
2600 if (log2TrSizeC
< 2)
2602 X265_CHECK(log2TrSize
== 2 && m_csp
!= X265_CSP_I444
&& tuDepth
, "invalid tuDepth\n");
2605 bCodeChroma
= !(absPartIdx
& 3);
2608 uint32_t absPartIdxStep
= NUM_CU_PARTITIONS
>> ((cu
.m_cuDepth
[0] + tuDepthC
) << 1);
2609 uint32_t setCbf
= 1 << tuDepth
;
2611 uint32_t coeffOffsetY
= absPartIdx
<< (LOG2_UNIT_SIZE
* 2);
2612 coeff_t
*coeffCurY
= cu
.m_trCoeff
[0] + coeffOffsetY
;
2614 uint32_t sizeIdx
= log2TrSize
- 2;
2616 cu
.setTUDepthSubParts(depth
- cu
.m_cuDepth
[0], absPartIdx
, depth
);
2617 cu
.setTransformSkipSubParts(0, TEXT_LUMA
, absPartIdx
, depth
);
2619 ShortYuv
& resiYuv
= m_rqt
[cuGeom
.depth
].tmpResiYuv
;
2620 const Yuv
* fencYuv
= mode
.fencYuv
;
/* luma: forward T+Q, then (presumably if any coeffs survive — guard line
 * missing) inverse T+Q back into the residual buffer, else zero-fill */
2622 int16_t* curResiY
= resiYuv
.getLumaAddr(absPartIdx
);
2623 uint32_t strideResiY
= resiYuv
.m_size
;
2625 const pixel
* fenc
= fencYuv
->getLumaAddr(absPartIdx
);
2626 uint32_t numSigY
= m_quant
.transformNxN(cu
, fenc
, fencYuv
->m_size
, curResiY
, strideResiY
, coeffCurY
, log2TrSize
, TEXT_LUMA
, absPartIdx
, false);
2630 m_quant
.invtransformNxN(cu
.m_tqBypass
[absPartIdx
], curResiY
, strideResiY
, coeffCurY
, log2TrSize
, TEXT_LUMA
, false, false, numSigY
);
2631 cu
.setCbfSubParts(setCbf
, TEXT_LUMA
, absPartIdx
, depth
);
2635 primitives
.blockfill_s
[sizeIdx
](curResiY
, strideResiY
, 0);
2636 cu
.setCbfSubParts(0, TEXT_LUMA
, absPartIdx
, depth
);
/* chroma (guarded by bCodeChroma — guard line missing from extraction) */
2641 uint32_t sizeIdxC
= log2TrSizeC
- 2;
2642 uint32_t strideResiC
= resiYuv
.m_csize
;
2644 uint32_t coeffOffsetC
= coeffOffsetY
>> (m_hChromaShift
+ m_vChromaShift
);
2645 coeff_t
*coeffCurU
= cu
.m_trCoeff
[1] + coeffOffsetC
;
2646 coeff_t
*coeffCurV
= cu
.m_trCoeff
[2] + coeffOffsetC
;
/* 4:2:2 splits each chroma TU vertically into two sub-TUs */
2647 bool splitIntoSubTUs
= (m_csp
== X265_CSP_I422
);
2649 TURecurse
tuIterator(splitIntoSubTUs
? VERTICAL_SPLIT
: DONT_SPLIT
, absPartIdxStep
, absPartIdx
);
2652 uint32_t absPartIdxC
= tuIterator
.absPartIdxTURelCU
;
2653 uint32_t subTUOffset
= tuIterator
.section
<< (log2TrSizeC
* 2);
2655 cu
.setTransformSkipPartRange(0, TEXT_CHROMA_U
, absPartIdxC
, tuIterator
.absPartIdxStep
);
2656 cu
.setTransformSkipPartRange(0, TEXT_CHROMA_V
, absPartIdxC
, tuIterator
.absPartIdxStep
);
/* Cb: same T+Q / inverse / zero-fill pattern as luma */
2658 int16_t* curResiU
= resiYuv
.getCbAddr(absPartIdxC
);
2659 const pixel
* fencCb
= fencYuv
->getCbAddr(absPartIdxC
);
2660 uint32_t numSigU
= m_quant
.transformNxN(cu
, fencCb
, fencYuv
->m_csize
, curResiU
, strideResiC
, coeffCurU
+ subTUOffset
, log2TrSizeC
, TEXT_CHROMA_U
, absPartIdxC
, false);
2663 m_quant
.invtransformNxN(cu
.m_tqBypass
[absPartIdxC
], curResiU
, strideResiC
, coeffCurU
+ subTUOffset
, log2TrSizeC
, TEXT_CHROMA_U
, false, false, numSigU
);
2664 cu
.setCbfPartRange(setCbf
, TEXT_CHROMA_U
, absPartIdxC
, tuIterator
.absPartIdxStep
);
2668 primitives
.blockfill_s
[sizeIdxC
](curResiU
, strideResiC
, 0);
2669 cu
.setCbfPartRange(0, TEXT_CHROMA_U
, absPartIdxC
, tuIterator
.absPartIdxStep
);
/* Cr: same pattern */
2672 int16_t* curResiV
= resiYuv
.getCrAddr(absPartIdxC
);
2673 const pixel
* fencCr
= fencYuv
->getCrAddr(absPartIdxC
);
2674 uint32_t numSigV
= m_quant
.transformNxN(cu
, fencCr
, fencYuv
->m_csize
, curResiV
, strideResiC
, coeffCurV
+ subTUOffset
, log2TrSizeC
, TEXT_CHROMA_V
, absPartIdxC
, false);
2677 m_quant
.invtransformNxN(cu
.m_tqBypass
[absPartIdxC
], curResiV
, strideResiC
, coeffCurV
+ subTUOffset
, log2TrSizeC
, TEXT_CHROMA_V
, false, false, numSigV
);
2678 cu
.setCbfPartRange(setCbf
, TEXT_CHROMA_V
, absPartIdxC
, tuIterator
.absPartIdxStep
);
2682 primitives
.blockfill_s
[sizeIdxC
](curResiV
, strideResiC
, 0);
2683 cu
.setCbfPartRange(0, TEXT_CHROMA_V
, absPartIdxC
, tuIterator
.absPartIdxStep
);
2686 while (tuIterator
.isNextSection());
2688 if (splitIntoSubTUs
)
2690 offsetSubTUCBFs(cu
, TEXT_CHROMA_U
, tuDepth
, absPartIdx
);
2691 offsetSubTUCBFs(cu
, TEXT_CHROMA_V
, tuDepth
, absPartIdx
);
/* split path: recurse into four quadrant TUs and merge their CBFs */
2697 X265_CHECK(log2TrSize
> depthRange
[0], "residualTransformQuantInter recursion check failure\n");
2699 uint32_t qNumParts
= 1 << (log2TrSize
- 1 - LOG2_UNIT_SIZE
) * 2;
2700 uint32_t ycbf
= 0, ucbf
= 0, vcbf
= 0;
2701 for (uint32_t qIdx
= 0, qPartIdx
= absPartIdx
; qIdx
< 4; ++qIdx
, qPartIdx
+= qNumParts
)
2703 residualTransformQuantInter(mode
, cuGeom
, qPartIdx
, depth
+ 1, depthRange
);
2704 ycbf
|= cu
.getCbf(qPartIdx
, TEXT_LUMA
, tuDepth
+ 1);
2705 ucbf
|= cu
.getCbf(qPartIdx
, TEXT_CHROMA_U
, tuDepth
+ 1);
2706 vcbf
|= cu
.getCbf(qPartIdx
, TEXT_CHROMA_V
, tuDepth
+ 1);
/* propagate the combined child CBFs up to this TU depth */
2708 for (uint32_t i
= 0; i
< 4 * qNumParts
; i
++)
2710 cu
.m_cbf
[TEXT_LUMA
][absPartIdx
+ i
] |= ycbf
<< tuDepth
;
2711 cu
.m_cbf
[TEXT_CHROMA_U
][absPartIdx
+ i
] |= ucbf
<< tuDepth
;
2712 cu
.m_cbf
[TEXT_CHROMA_V
][absPartIdx
+ i
] |= vcbf
<< tuDepth
;
2717 uint64_t Search::estimateNullCbfCost(uint32_t &dist
, uint32_t &psyEnergy
, uint32_t tuDepth
, TextType compId
)
2719 uint32_t nullBits
= m_entropyCoder
.estimateCbfBits(0, compId
, tuDepth
);
2721 if (m_rdCost
.m_psyRd
)
2722 return m_rdCost
.calcPsyRdCost(dist
, nullBits
, psyEnergy
);
2724 return m_rdCost
.calcRdCost(dist
, nullBits
);
2727 void Search::estimateResidualQT(Mode
& mode
, const CUGeom
& cuGeom
, uint32_t absPartIdx
, uint32_t depth
, ShortYuv
& resiYuv
, Cost
& outCosts
, const uint32_t depthRange
[2])
2729 CUData
& cu
= mode
.cu
;
2730 uint32_t log2TrSize
= g_maxLog2CUSize
- depth
;
2732 bool bCheckSplit
= log2TrSize
> depthRange
[0];
2733 bool bCheckFull
= log2TrSize
<= depthRange
[1];
2734 bool bSplitPresentFlag
= bCheckSplit
&& bCheckFull
;
2736 if (cu
.m_partSize
[0] != SIZE_2Nx2N
&& depth
== cu
.m_cuDepth
[absPartIdx
] && bCheckSplit
)
2739 X265_CHECK(bCheckFull
|| bCheckSplit
, "check-full or check-split must be set\n");
2740 X265_CHECK(cu
.m_cuDepth
[0] == cu
.m_cuDepth
[absPartIdx
], "depth not matching\n");
2742 uint32_t tuDepth
= depth
- cu
.m_cuDepth
[0];
2743 uint32_t log2TrSizeC
= log2TrSize
- m_hChromaShift
;
2744 bool bCodeChroma
= true;
2745 uint32_t tuDepthC
= tuDepth
;
2746 if (log2TrSizeC
< 2)
2748 X265_CHECK(log2TrSize
== 2 && m_csp
!= X265_CSP_I444
&& tuDepth
, "invalid tuDepth\n");
2751 bCodeChroma
= !(absPartIdx
& 3);
2756 fullCost
.rdcost
= MAX_INT64
;
2758 uint8_t cbfFlag
[MAX_NUM_COMPONENT
][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
2759 uint32_t numSig
[MAX_NUM_COMPONENT
][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
2760 uint32_t singleBits
[MAX_NUM_COMPONENT
][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2761 uint32_t singleDist
[MAX_NUM_COMPONENT
][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2762 uint32_t singlePsyEnergy
[MAX_NUM_COMPONENT
][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2763 uint32_t bestTransformMode
[MAX_NUM_COMPONENT
][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
2764 uint64_t minCost
[MAX_NUM_COMPONENT
][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64
, MAX_INT64
}, {MAX_INT64
, MAX_INT64
}, {MAX_INT64
, MAX_INT64
} };
2766 m_entropyCoder
.store(m_rqt
[depth
].rqtRoot
);
2768 uint32_t trSize
= 1 << log2TrSize
;
2769 const bool splitIntoSubTUs
= (m_csp
== X265_CSP_I422
);
2770 uint32_t absPartIdxStep
= NUM_CU_PARTITIONS
>> ((cu
.m_cuDepth
[0] + tuDepthC
) << 1);
2771 const Yuv
* fencYuv
= mode
.fencYuv
;
2776 uint32_t trSizeC
= 1 << log2TrSizeC
;
2777 int partSize
= partitionFromLog2Size(log2TrSize
);
2778 int partSizeC
= partitionFromLog2Size(log2TrSizeC
);
2779 const uint32_t qtLayer
= log2TrSize
- 2;
2780 uint32_t coeffOffsetY
= absPartIdx
<< (LOG2_UNIT_SIZE
* 2);
2781 coeff_t
* coeffCurY
= m_rqt
[qtLayer
].coeffRQT
[0] + coeffOffsetY
;
2783 bool checkTransformSkip
= m_slice
->m_pps
->bTransformSkipEnabled
&& !cu
.m_tqBypass
[0];
2784 bool checkTransformSkipY
= checkTransformSkip
&& log2TrSize
<= MAX_LOG2_TS_SIZE
;
2785 bool checkTransformSkipC
= checkTransformSkip
&& log2TrSizeC
<= MAX_LOG2_TS_SIZE
;
2787 cu
.setTUDepthSubParts(depth
- cu
.m_cuDepth
[0], absPartIdx
, depth
);
2788 cu
.setTransformSkipSubParts(0, TEXT_LUMA
, absPartIdx
, depth
);
2791 m_entropyCoder
.estBit(m_entropyCoder
.m_estBitsSbac
, log2TrSize
, true);
2793 const pixel
* fenc
= fencYuv
->getLumaAddr(absPartIdx
);
2794 int16_t* resi
= resiYuv
.getLumaAddr(absPartIdx
);
2795 numSig
[TEXT_LUMA
][0] = m_quant
.transformNxN(cu
, fenc
, fencYuv
->m_size
, resi
, resiYuv
.m_size
, coeffCurY
, log2TrSize
, TEXT_LUMA
, absPartIdx
, false);
2796 cbfFlag
[TEXT_LUMA
][0] = !!numSig
[TEXT_LUMA
][0];
2798 m_entropyCoder
.resetBits();
2800 if (bSplitPresentFlag
&& log2TrSize
> depthRange
[0])
2801 m_entropyCoder
.codeTransformSubdivFlag(0, 5 - log2TrSize
);
2802 fullCost
.bits
= m_entropyCoder
.getNumberOfWrittenBits();
2804 // Coding luma cbf flag has been removed from here. The context for cbf flag is different for each depth.
2805 // So it is valid if we encode coefficients and then cbfs at least for analysis.
2806 // m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
2807 if (cbfFlag
[TEXT_LUMA
][0])
2808 m_entropyCoder
.codeCoeffNxN(cu
, coeffCurY
, absPartIdx
, log2TrSize
, TEXT_LUMA
);
2810 uint32_t singleBitsPrev
= m_entropyCoder
.getNumberOfWrittenBits();
2811 singleBits
[TEXT_LUMA
][0] = singleBitsPrev
- fullCost
.bits
;
2813 X265_CHECK(log2TrSize
<= 5, "log2TrSize is too large\n");
2814 uint32_t distY
= primitives
.ssd_s
[partSize
](resiYuv
.getLumaAddr(absPartIdx
), resiYuv
.m_size
);
2815 uint32_t psyEnergyY
= 0;
2816 if (m_rdCost
.m_psyRd
)
2817 psyEnergyY
= m_rdCost
.psyCost(partSize
, resiYuv
.getLumaAddr(absPartIdx
), resiYuv
.m_size
, (int16_t*)zeroShort
, 0);
2819 int16_t* curResiY
= m_rqt
[qtLayer
].resiQtYuv
.getLumaAddr(absPartIdx
);
2820 uint32_t strideResiY
= m_rqt
[qtLayer
].resiQtYuv
.m_size
;
2822 if (cbfFlag
[TEXT_LUMA
][0])
2824 m_quant
.invtransformNxN(cu
.m_tqBypass
[absPartIdx
], curResiY
, strideResiY
, coeffCurY
, log2TrSize
, TEXT_LUMA
, false, false, numSig
[TEXT_LUMA
][0]); //this is for inter mode only
2826 // non-zero cost calculation for luma - This is an approximation
2827 // finally we have to encode correct cbf after comparing with null cost
2828 const uint32_t nonZeroDistY
= primitives
.sse_ss
[partSize
](resiYuv
.getLumaAddr(absPartIdx
), resiYuv
.m_size
, curResiY
, strideResiY
);
2829 uint32_t nzCbfBitsY
= m_entropyCoder
.estimateCbfBits(cbfFlag
[TEXT_LUMA
][0], TEXT_LUMA
, tuDepth
);
2830 uint32_t nonZeroPsyEnergyY
= 0; uint64_t singleCostY
= 0;
2831 if (m_rdCost
.m_psyRd
)
2833 nonZeroPsyEnergyY
= m_rdCost
.psyCost(partSize
, resiYuv
.getLumaAddr(absPartIdx
), resiYuv
.m_size
, curResiY
, strideResiY
);
2834 singleCostY
= m_rdCost
.calcPsyRdCost(nonZeroDistY
, nzCbfBitsY
+ singleBits
[TEXT_LUMA
][0], nonZeroPsyEnergyY
);
2837 singleCostY
= m_rdCost
.calcRdCost(nonZeroDistY
, nzCbfBitsY
+ singleBits
[TEXT_LUMA
][0]);
2839 if (cu
.m_tqBypass
[0])
2841 singleDist
[TEXT_LUMA
][0] = nonZeroDistY
;
2842 singlePsyEnergy
[TEXT_LUMA
][0] = nonZeroPsyEnergyY
;
2846 // zero-cost calculation for luma. This is an approximation
2847 // Initial cost calculation was also an approximation. First resetting the bit counter and then encoding zero cbf.
2848 // Now encoding the zero cbf without writing into bitstream, keeping m_fracBits unchanged. The same is valid for chroma.
2849 uint64_t nullCostY
= estimateNullCbfCost(distY
, psyEnergyY
, tuDepth
, TEXT_LUMA
);
2851 if (nullCostY
< singleCostY
)
2853 cbfFlag
[TEXT_LUMA
][0] = 0;
2854 singleBits
[TEXT_LUMA
][0] = 0;
2855 primitives
.blockfill_s
[partSize
](curResiY
, strideResiY
, 0);
2856 #if CHECKED_BUILD || _DEBUG
2857 uint32_t numCoeffY
= 1 << (log2TrSize
<< 1);
2858 memset(coeffCurY
, 0, sizeof(coeff_t
) * numCoeffY
);
2860 if (checkTransformSkipY
)
2861 minCost
[TEXT_LUMA
][0] = nullCostY
;
2862 singleDist
[TEXT_LUMA
][0] = distY
;
2863 singlePsyEnergy
[TEXT_LUMA
][0] = psyEnergyY
;
2867 if (checkTransformSkipY
)
2868 minCost
[TEXT_LUMA
][0] = singleCostY
;
2869 singleDist
[TEXT_LUMA
][0] = nonZeroDistY
;
2870 singlePsyEnergy
[TEXT_LUMA
][0] = nonZeroPsyEnergyY
;
2876 if (checkTransformSkipY
)
2877 minCost
[TEXT_LUMA
][0] = estimateNullCbfCost(distY
, psyEnergyY
, tuDepth
, TEXT_LUMA
);
2878 primitives
.blockfill_s
[partSize
](curResiY
, strideResiY
, 0);
2879 singleDist
[TEXT_LUMA
][0] = distY
;
2880 singlePsyEnergy
[TEXT_LUMA
][0] = psyEnergyY
;
2883 cu
.setCbfSubParts(cbfFlag
[TEXT_LUMA
][0] << tuDepth
, TEXT_LUMA
, absPartIdx
, depth
);
2887 uint32_t coeffOffsetC
= coeffOffsetY
>> (m_hChromaShift
+ m_vChromaShift
);
2888 uint32_t strideResiC
= m_rqt
[qtLayer
].resiQtYuv
.m_csize
;
2889 for (uint32_t chromaId
= TEXT_CHROMA_U
; chromaId
<= TEXT_CHROMA_V
; chromaId
++)
2891 uint32_t distC
= 0, psyEnergyC
= 0;
2892 coeff_t
* coeffCurC
= m_rqt
[qtLayer
].coeffRQT
[chromaId
] + coeffOffsetC
;
2893 TURecurse
tuIterator(splitIntoSubTUs
? VERTICAL_SPLIT
: DONT_SPLIT
, absPartIdxStep
, absPartIdx
);
2897 uint32_t absPartIdxC
= tuIterator
.absPartIdxTURelCU
;
2898 uint32_t subTUOffset
= tuIterator
.section
<< (log2TrSizeC
* 2);
2900 cu
.setTransformSkipPartRange(0, (TextType
)chromaId
, absPartIdxC
, tuIterator
.absPartIdxStep
);
2902 if (m_bEnableRDOQ
&& (chromaId
!= TEXT_CHROMA_V
))
2903 m_entropyCoder
.estBit(m_entropyCoder
.m_estBitsSbac
, log2TrSizeC
, false);
2905 fenc
= fencYuv
->getChromaAddr(chromaId
, absPartIdxC
);
2906 resi
= resiYuv
.getChromaAddr(chromaId
, absPartIdxC
);
2907 numSig
[chromaId
][tuIterator
.section
] = m_quant
.transformNxN(cu
, fenc
, fencYuv
->m_csize
, resi
, resiYuv
.m_csize
, coeffCurC
+ subTUOffset
, log2TrSizeC
, (TextType
)chromaId
, absPartIdxC
, false);
2908 cbfFlag
[chromaId
][tuIterator
.section
] = !!numSig
[chromaId
][tuIterator
.section
];
2910 //Coding cbf flags has been removed from here
2911 // m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][tuIterator.section], tuDepth);
2912 if (cbfFlag
[chromaId
][tuIterator
.section
])
2913 m_entropyCoder
.codeCoeffNxN(cu
, coeffCurC
+ subTUOffset
, absPartIdxC
, log2TrSizeC
, (TextType
)chromaId
);
2914 uint32_t newBits
= m_entropyCoder
.getNumberOfWrittenBits();
2915 singleBits
[chromaId
][tuIterator
.section
] = newBits
- singleBitsPrev
;
2916 singleBitsPrev
= newBits
;
2918 int16_t* curResiC
= m_rqt
[qtLayer
].resiQtYuv
.getChromaAddr(chromaId
, absPartIdxC
);
2919 distC
= m_rdCost
.scaleChromaDist(chromaId
, primitives
.ssd_s
[log2TrSizeC
- 2](resiYuv
.getChromaAddr(chromaId
, absPartIdxC
), resiYuv
.m_csize
));
2921 if (cbfFlag
[chromaId
][tuIterator
.section
])
2923 m_quant
.invtransformNxN(cu
.m_tqBypass
[absPartIdxC
], curResiC
, strideResiC
, coeffCurC
+ subTUOffset
,
2924 log2TrSizeC
, (TextType
)chromaId
, false, false, numSig
[chromaId
][tuIterator
.section
]);
2926 // non-zero cost calculation for luma, same as luma - This is an approximation
2927 // finally we have to encode correct cbf after comparing with null cost
2928 uint32_t dist
= primitives
.sse_ss
[partSizeC
](resiYuv
.getChromaAddr(chromaId
, absPartIdxC
), resiYuv
.m_csize
, curResiC
, strideResiC
);
2929 uint32_t nzCbfBitsC
= m_entropyCoder
.estimateCbfBits(cbfFlag
[chromaId
][tuIterator
.section
], (TextType
)chromaId
, tuDepth
);
2930 uint32_t nonZeroDistC
= m_rdCost
.scaleChromaDist(chromaId
, dist
);
2931 uint32_t nonZeroPsyEnergyC
= 0; uint64_t singleCostC
= 0;
2932 if (m_rdCost
.m_psyRd
)
2934 nonZeroPsyEnergyC
= m_rdCost
.psyCost(partSizeC
, resiYuv
.getChromaAddr(chromaId
, absPartIdxC
), resiYuv
.m_csize
, curResiC
, strideResiC
);
2935 singleCostC
= m_rdCost
.calcPsyRdCost(nonZeroDistC
, nzCbfBitsC
+ singleBits
[chromaId
][tuIterator
.section
], nonZeroPsyEnergyC
);
2938 singleCostC
= m_rdCost
.calcRdCost(nonZeroDistC
, nzCbfBitsC
+ singleBits
[chromaId
][tuIterator
.section
]);
2940 if (cu
.m_tqBypass
[0])
2942 singleDist
[chromaId
][tuIterator
.section
] = nonZeroDistC
;
2943 singlePsyEnergy
[chromaId
][tuIterator
.section
] = nonZeroPsyEnergyC
;
2947 //zero-cost calculation for chroma. This is an approximation
2948 uint64_t nullCostC
= estimateNullCbfCost(distC
, psyEnergyC
, tuDepth
, (TextType
)chromaId
);
2950 if (nullCostC
< singleCostC
)
2952 cbfFlag
[chromaId
][tuIterator
.section
] = 0;
2953 singleBits
[chromaId
][tuIterator
.section
] = 0;
2954 primitives
.blockfill_s
[partSizeC
](curResiC
, strideResiC
, 0);
2955 #if CHECKED_BUILD || _DEBUG
2956 uint32_t numCoeffC
= 1 << (log2TrSizeC
<< 1);
2957 memset(coeffCurC
+ subTUOffset
, 0, sizeof(coeff_t
) * numCoeffC
);
2959 if (checkTransformSkipC
)
2960 minCost
[chromaId
][tuIterator
.section
] = nullCostC
;
2961 singleDist
[chromaId
][tuIterator
.section
] = distC
;
2962 singlePsyEnergy
[chromaId
][tuIterator
.section
] = psyEnergyC
;
2966 if (checkTransformSkipC
)
2967 minCost
[chromaId
][tuIterator
.section
] = singleCostC
;
2968 singleDist
[chromaId
][tuIterator
.section
] = nonZeroDistC
;
2969 singlePsyEnergy
[chromaId
][tuIterator
.section
] = nonZeroPsyEnergyC
;
2975 if (checkTransformSkipC
)
2976 minCost
[chromaId
][tuIterator
.section
] = estimateNullCbfCost(distC
, psyEnergyC
, tuDepthC
, (TextType
)chromaId
);
2977 primitives
.blockfill_s
[partSizeC
](curResiC
, strideResiC
, 0);
2978 singleDist
[chromaId
][tuIterator
.section
] = distC
;
2979 singlePsyEnergy
[chromaId
][tuIterator
.section
] = psyEnergyC
;
2982 cu
.setCbfPartRange(cbfFlag
[chromaId
][tuIterator
.section
] << tuDepth
, (TextType
)chromaId
, absPartIdxC
, tuIterator
.absPartIdxStep
);
2984 while (tuIterator
.isNextSection());
2988 if (checkTransformSkipY
)
2990 uint32_t nonZeroDistY
= 0;
2991 uint32_t nonZeroPsyEnergyY
= 0;
2992 uint64_t singleCostY
= MAX_INT64
;
2994 ALIGN_VAR_32(coeff_t
, tsCoeffY
[MAX_TS_SIZE
* MAX_TS_SIZE
]);
2995 ALIGN_VAR_32(int16_t, tsResiY
[MAX_TS_SIZE
* MAX_TS_SIZE
]);
2997 m_entropyCoder
.load(m_rqt
[depth
].rqtRoot
);
2999 cu
.setTransformSkipSubParts(1, TEXT_LUMA
, absPartIdx
, depth
);
3002 m_entropyCoder
.estBit(m_entropyCoder
.m_estBitsSbac
, log2TrSize
, true);
3004 fenc
= fencYuv
->getLumaAddr(absPartIdx
);
3005 resi
= resiYuv
.getLumaAddr(absPartIdx
);
3006 uint32_t numSigTSkipY
= m_quant
.transformNxN(cu
, fenc
, fencYuv
->m_size
, resi
, resiYuv
.m_size
, tsCoeffY
, log2TrSize
, TEXT_LUMA
, absPartIdx
, true);
3010 m_entropyCoder
.resetBits();
3011 m_entropyCoder
.codeQtCbfLuma(!!numSigTSkipY
, tuDepth
);
3012 m_entropyCoder
.codeCoeffNxN(cu
, tsCoeffY
, absPartIdx
, log2TrSize
, TEXT_LUMA
);
3013 const uint32_t skipSingleBitsY
= m_entropyCoder
.getNumberOfWrittenBits();
3015 m_quant
.invtransformNxN(cu
.m_tqBypass
[absPartIdx
], tsResiY
, trSize
, tsCoeffY
, log2TrSize
, TEXT_LUMA
, false, true, numSigTSkipY
);
3017 nonZeroDistY
= primitives
.sse_ss
[partSize
](resiYuv
.getLumaAddr(absPartIdx
), resiYuv
.m_size
, tsResiY
, trSize
);
3019 if (m_rdCost
.m_psyRd
)
3021 nonZeroPsyEnergyY
= m_rdCost
.psyCost(partSize
, resiYuv
.getLumaAddr(absPartIdx
), resiYuv
.m_size
, tsResiY
, trSize
);
3022 singleCostY
= m_rdCost
.calcPsyRdCost(nonZeroDistY
, skipSingleBitsY
, nonZeroPsyEnergyY
);
3025 singleCostY
= m_rdCost
.calcRdCost(nonZeroDistY
, skipSingleBitsY
);
3028 if (!numSigTSkipY
|| minCost
[TEXT_LUMA
][0] < singleCostY
)
3029 cu
.setTransformSkipSubParts(0, TEXT_LUMA
, absPartIdx
, depth
);
3032 singleDist
[TEXT_LUMA
][0] = nonZeroDistY
;
3033 singlePsyEnergy
[TEXT_LUMA
][0] = nonZeroPsyEnergyY
;
3034 cbfFlag
[TEXT_LUMA
][0] = !!numSigTSkipY
;
3035 bestTransformMode
[TEXT_LUMA
][0] = 1;
3036 uint32_t numCoeffY
= 1 << (log2TrSize
<< 1);
3037 memcpy(coeffCurY
, tsCoeffY
, sizeof(coeff_t
) * numCoeffY
);
3038 primitives
.luma_copy_ss
[partSize
](curResiY
, strideResiY
, tsResiY
, trSize
);
3041 cu
.setCbfSubParts(cbfFlag
[TEXT_LUMA
][0] << tuDepth
, TEXT_LUMA
, absPartIdx
, depth
);
3044 if (bCodeChroma
&& checkTransformSkipC
)
3046 uint32_t nonZeroDistC
= 0, nonZeroPsyEnergyC
= 0;
3047 uint64_t singleCostC
= MAX_INT64
;
3048 uint32_t strideResiC
= m_rqt
[qtLayer
].resiQtYuv
.m_csize
;
3049 uint32_t coeffOffsetC
= coeffOffsetY
>> (m_hChromaShift
+ m_vChromaShift
);
3051 m_entropyCoder
.load(m_rqt
[depth
].rqtRoot
);
3053 for (uint32_t chromaId
= TEXT_CHROMA_U
; chromaId
<= TEXT_CHROMA_V
; chromaId
++)
3055 coeff_t
* coeffCurC
= m_rqt
[qtLayer
].coeffRQT
[chromaId
] + coeffOffsetC
;
3056 TURecurse
tuIterator(splitIntoSubTUs
? VERTICAL_SPLIT
: DONT_SPLIT
, absPartIdxStep
, absPartIdx
);
3060 uint32_t absPartIdxC
= tuIterator
.absPartIdxTURelCU
;
3061 uint32_t subTUOffset
= tuIterator
.section
<< (log2TrSizeC
* 2);
3063 int16_t* curResiC
= m_rqt
[qtLayer
].resiQtYuv
.getChromaAddr(chromaId
, absPartIdxC
);
3065 ALIGN_VAR_32(coeff_t
, tsCoeffC
[MAX_TS_SIZE
* MAX_TS_SIZE
]);
3066 ALIGN_VAR_32(int16_t, tsResiC
[MAX_TS_SIZE
* MAX_TS_SIZE
]);
3068 cu
.setTransformSkipPartRange(1, (TextType
)chromaId
, absPartIdxC
, tuIterator
.absPartIdxStep
);
3070 if (m_bEnableRDOQ
&& (chromaId
!= TEXT_CHROMA_V
))
3071 m_entropyCoder
.estBit(m_entropyCoder
.m_estBitsSbac
, log2TrSizeC
, false);
3073 fenc
= fencYuv
->getChromaAddr(chromaId
, absPartIdxC
);
3074 resi
= resiYuv
.getChromaAddr(chromaId
, absPartIdxC
);
3075 uint32_t numSigTSkipC
= m_quant
.transformNxN(cu
, fenc
, fencYuv
->m_csize
, resi
, resiYuv
.m_csize
, tsCoeffC
, log2TrSizeC
, (TextType
)chromaId
, absPartIdxC
, true);
3077 m_entropyCoder
.resetBits();
3078 singleBits
[chromaId
][tuIterator
.section
] = 0;
3082 m_entropyCoder
.codeQtCbfChroma(!!numSigTSkipC
, tuDepth
);
3083 m_entropyCoder
.codeCoeffNxN(cu
, tsCoeffC
, absPartIdxC
, log2TrSizeC
, (TextType
)chromaId
);
3084 singleBits
[chromaId
][tuIterator
.section
] = m_entropyCoder
.getNumberOfWrittenBits();
3086 m_quant
.invtransformNxN(cu
.m_tqBypass
[absPartIdxC
], tsResiC
, trSizeC
, tsCoeffC
,
3087 log2TrSizeC
, (TextType
)chromaId
, false, true, numSigTSkipC
);
3088 uint32_t dist
= primitives
.sse_ss
[partSizeC
](resiYuv
.getChromaAddr(chromaId
, absPartIdxC
), resiYuv
.m_csize
, tsResiC
, trSizeC
);
3089 nonZeroDistC
= m_rdCost
.scaleChromaDist(chromaId
, dist
);
3090 if (m_rdCost
.m_psyRd
)
3092 nonZeroPsyEnergyC
= m_rdCost
.psyCost(partSizeC
, resiYuv
.getChromaAddr(chromaId
, absPartIdxC
), resiYuv
.m_csize
, tsResiC
, trSizeC
);
3093 singleCostC
= m_rdCost
.calcPsyRdCost(nonZeroDistC
, singleBits
[chromaId
][tuIterator
.section
], nonZeroPsyEnergyC
);
3096 singleCostC
= m_rdCost
.calcRdCost(nonZeroDistC
, singleBits
[chromaId
][tuIterator
.section
]);
3099 if (!numSigTSkipC
|| minCost
[chromaId
][tuIterator
.section
] < singleCostC
)
3100 cu
.setTransformSkipPartRange(0, (TextType
)chromaId
, absPartIdxC
, tuIterator
.absPartIdxStep
);
3103 singleDist
[chromaId
][tuIterator
.section
] = nonZeroDistC
;
3104 singlePsyEnergy
[chromaId
][tuIterator
.section
] = nonZeroPsyEnergyC
;
3105 cbfFlag
[chromaId
][tuIterator
.section
] = !!numSigTSkipC
;
3106 bestTransformMode
[chromaId
][tuIterator
.section
] = 1;
3107 uint32_t numCoeffC
= 1 << (log2TrSizeC
<< 1);
3108 memcpy(coeffCurC
+ subTUOffset
, tsCoeffC
, sizeof(coeff_t
) * numCoeffC
);
3109 primitives
.luma_copy_ss
[partSizeC
](curResiC
, strideResiC
, tsResiC
, trSizeC
);
3112 cu
.setCbfPartRange(cbfFlag
[chromaId
][tuIterator
.section
] << tuDepth
, (TextType
)chromaId
, absPartIdxC
, tuIterator
.absPartIdxStep
);
3114 while (tuIterator
.isNextSection());
3118 // Here we were encoding cbfs and coefficients, after calculating distortion above.
3119 // Now I am encoding only cbfs, since I have encoded coefficients above. I have just collected
3120 // bits required for coefficients and added with number of cbf bits. As I tested the order does not
3121 // make any difference. But bit confused whether I should load the original context as below.
3122 m_entropyCoder
.load(m_rqt
[depth
].rqtRoot
);
3123 m_entropyCoder
.resetBits();
3128 for (uint32_t chromaId
= TEXT_CHROMA_U
; chromaId
<= TEXT_CHROMA_V
; chromaId
++)
3130 if (!splitIntoSubTUs
)
3131 m_entropyCoder
.codeQtCbfChroma(cbfFlag
[chromaId
][0], tuDepth
);
3134 offsetSubTUCBFs(cu
, (TextType
)chromaId
, tuDepth
, absPartIdx
);
3135 m_entropyCoder
.codeQtCbfChroma(cbfFlag
[chromaId
][0], tuDepth
);
3136 m_entropyCoder
.codeQtCbfChroma(cbfFlag
[chromaId
][1], tuDepth
);
3141 m_entropyCoder
.codeQtCbfLuma(cbfFlag
[TEXT_LUMA
][0], tuDepth
);
3143 uint32_t cbfBits
= m_entropyCoder
.getNumberOfWrittenBits();
3145 uint32_t coeffBits
= 0;
3146 coeffBits
= singleBits
[TEXT_LUMA
][0];
3147 for (uint32_t subTUIndex
= 0; subTUIndex
< 2; subTUIndex
++)
3149 coeffBits
+= singleBits
[TEXT_CHROMA_U
][subTUIndex
];
3150 coeffBits
+= singleBits
[TEXT_CHROMA_V
][subTUIndex
];
3153 // In split mode, we need only coeffBits. The reason is encoding chroma cbfs is different from luma.
3154 // In case of chroma, if any one of the splitted block's cbf is 1, then we need to encode cbf 1, and then for
3155 // four splitted block's individual cbf value. This is not known before analysis of four splitted blocks.
3156 // For that reason, I am collecting individual coefficient bits only.
3157 fullCost
.bits
= bSplitPresentFlag
? cbfBits
+ coeffBits
: coeffBits
;
3159 fullCost
.distortion
+= singleDist
[TEXT_LUMA
][0];
3160 fullCost
.energy
+= singlePsyEnergy
[TEXT_LUMA
][0];// need to check we need to add chroma also
3161 for (uint32_t subTUIndex
= 0; subTUIndex
< 2; subTUIndex
++)
3163 fullCost
.distortion
+= singleDist
[TEXT_CHROMA_U
][subTUIndex
];
3164 fullCost
.distortion
+= singleDist
[TEXT_CHROMA_V
][subTUIndex
];
3167 if (m_rdCost
.m_psyRd
)
3168 fullCost
.rdcost
= m_rdCost
.calcPsyRdCost(fullCost
.distortion
, fullCost
.bits
, fullCost
.energy
);
3170 fullCost
.rdcost
= m_rdCost
.calcRdCost(fullCost
.distortion
, fullCost
.bits
);
3178 m_entropyCoder
.store(m_rqt
[depth
].rqtTest
);
3179 m_entropyCoder
.load(m_rqt
[depth
].rqtRoot
);
3183 if (bSplitPresentFlag
&& (log2TrSize
<= depthRange
[1] && log2TrSize
> depthRange
[0]))
3185 // Subdiv flag can be encoded at the start of anlysis of splitted blocks.
3186 m_entropyCoder
.resetBits();
3187 m_entropyCoder
.codeTransformSubdivFlag(1, 5 - log2TrSize
);
3188 splitCost
.bits
= m_entropyCoder
.getNumberOfWrittenBits();
3191 uint32_t qNumParts
= 1 << (log2TrSize
- 1 - LOG2_UNIT_SIZE
) * 2;
3192 uint32_t ycbf
= 0, ucbf
= 0, vcbf
= 0;
3193 for (uint32_t qIdx
= 0, qPartIdx
= absPartIdx
; qIdx
< 4; ++qIdx
, qPartIdx
+= qNumParts
)
3195 estimateResidualQT(mode
, cuGeom
, qPartIdx
, depth
+ 1, resiYuv
, splitCost
, depthRange
);
3196 ycbf
|= cu
.getCbf(qPartIdx
, TEXT_LUMA
, tuDepth
+ 1);
3197 ucbf
|= cu
.getCbf(qPartIdx
, TEXT_CHROMA_U
, tuDepth
+ 1);
3198 vcbf
|= cu
.getCbf(qPartIdx
, TEXT_CHROMA_V
, tuDepth
+ 1);
3200 for (uint32_t i
= 0; i
< 4 * qNumParts
; ++i
)
3202 cu
.m_cbf
[0][absPartIdx
+ i
] |= ycbf
<< tuDepth
;
3203 cu
.m_cbf
[1][absPartIdx
+ i
] |= ucbf
<< tuDepth
;
3204 cu
.m_cbf
[2][absPartIdx
+ i
] |= vcbf
<< tuDepth
;
3207 // Here we were encoding cbfs and coefficients for splitted blocks. Since I have collected coefficient bits
3208 // for each individual blocks, only encoding cbf values. As I mentioned encoding chroma cbfs is different then luma.
3209 // But have one doubt that if coefficients are encoded in context at depth 2 (for example) and cbfs are encoded in context
3210 // at depth 0 (for example).
3211 m_entropyCoder
.load(m_rqt
[depth
].rqtRoot
);
3212 m_entropyCoder
.resetBits();
3214 codeInterSubdivCbfQT(cu
, absPartIdx
, depth
, depthRange
);
3215 uint32_t splitCbfBits
= m_entropyCoder
.getNumberOfWrittenBits();
3216 splitCost
.bits
+= splitCbfBits
;
3218 if (m_rdCost
.m_psyRd
)
3219 splitCost
.rdcost
= m_rdCost
.calcPsyRdCost(splitCost
.distortion
, splitCost
.bits
, splitCost
.energy
);
3221 splitCost
.rdcost
= m_rdCost
.calcRdCost(splitCost
.distortion
, splitCost
.bits
);
3223 if (ycbf
|| ucbf
|| vcbf
|| !bCheckFull
)
3225 if (splitCost
.rdcost
< fullCost
.rdcost
)
3227 outCosts
.distortion
+= splitCost
.distortion
;
3228 outCosts
.rdcost
+= splitCost
.rdcost
;
3229 outCosts
.bits
+= splitCost
.bits
;
3230 outCosts
.energy
+= splitCost
.energy
;
3234 outCosts
.energy
+= splitCost
.energy
;
3237 cu
.setTransformSkipSubParts(bestTransformMode
[TEXT_LUMA
][0], TEXT_LUMA
, absPartIdx
, depth
);
3240 if (!splitIntoSubTUs
)
3242 cu
.setTransformSkipSubParts(bestTransformMode
[TEXT_CHROMA_U
][0], TEXT_CHROMA_U
, absPartIdx
, depth
);
3243 cu
.setTransformSkipSubParts(bestTransformMode
[TEXT_CHROMA_V
][0], TEXT_CHROMA_V
, absPartIdx
, depth
);
3247 uint32_t tuNumParts
= absPartIdxStep
>> 1;
3248 cu
.setTransformSkipPartRange(bestTransformMode
[TEXT_CHROMA_U
][0], TEXT_CHROMA_U
, absPartIdx
, tuNumParts
);
3249 cu
.setTransformSkipPartRange(bestTransformMode
[TEXT_CHROMA_U
][1], TEXT_CHROMA_U
, absPartIdx
+ tuNumParts
, tuNumParts
);
3250 cu
.setTransformSkipPartRange(bestTransformMode
[TEXT_CHROMA_V
][0], TEXT_CHROMA_V
, absPartIdx
, tuNumParts
);
3251 cu
.setTransformSkipPartRange(bestTransformMode
[TEXT_CHROMA_V
][1], TEXT_CHROMA_V
, absPartIdx
+ tuNumParts
, tuNumParts
);
3254 X265_CHECK(bCheckFull
, "check-full must be set\n");
3255 m_entropyCoder
.load(m_rqt
[depth
].rqtTest
);
3258 cu
.setTUDepthSubParts(tuDepth
, absPartIdx
, depth
);
3259 cu
.setCbfSubParts(cbfFlag
[TEXT_LUMA
][0] << tuDepth
, TEXT_LUMA
, absPartIdx
, depth
);
3263 if (!splitIntoSubTUs
)
3265 cu
.setCbfSubParts(cbfFlag
[TEXT_CHROMA_U
][0] << tuDepth
, TEXT_CHROMA_U
, absPartIdx
, depth
);
3266 cu
.setCbfSubParts(cbfFlag
[TEXT_CHROMA_V
][0] << tuDepth
, TEXT_CHROMA_V
, absPartIdx
, depth
);
3270 uint32_t tuNumParts
= absPartIdxStep
>> 1;
3272 offsetCBFs(cbfFlag
[TEXT_CHROMA_U
]);
3273 offsetCBFs(cbfFlag
[TEXT_CHROMA_V
]);
3274 cu
.setCbfPartRange(cbfFlag
[TEXT_CHROMA_U
][0] << tuDepth
, TEXT_CHROMA_U
, absPartIdx
, tuNumParts
);
3275 cu
.setCbfPartRange(cbfFlag
[TEXT_CHROMA_U
][1] << tuDepth
, TEXT_CHROMA_U
, absPartIdx
+ tuNumParts
, tuNumParts
);
3276 cu
.setCbfPartRange(cbfFlag
[TEXT_CHROMA_V
][0] << tuDepth
, TEXT_CHROMA_V
, absPartIdx
, tuNumParts
);
3277 cu
.setCbfPartRange(cbfFlag
[TEXT_CHROMA_V
][1] << tuDepth
, TEXT_CHROMA_V
, absPartIdx
+ tuNumParts
, tuNumParts
);
3281 outCosts
.distortion
+= fullCost
.distortion
;
3282 outCosts
.rdcost
+= fullCost
.rdcost
;
3283 outCosts
.bits
+= fullCost
.bits
;
3284 outCosts
.energy
+= fullCost
.energy
;
3287 void Search::codeInterSubdivCbfQT(CUData
& cu
, uint32_t absPartIdx
, const uint32_t depth
, const uint32_t depthRange
[2])
3289 X265_CHECK(cu
.m_cuDepth
[0] == cu
.m_cuDepth
[absPartIdx
], "depth not matching\n");
3290 X265_CHECK(cu
.isInter(absPartIdx
), "codeInterSubdivCbfQT() with intra block\n");
3292 const uint32_t tuDepth
= depth
- cu
.m_cuDepth
[0];
3293 const bool bSubdiv
= tuDepth
!= cu
.m_tuDepth
[absPartIdx
];
3294 const uint32_t log2TrSize
= g_maxLog2CUSize
- depth
;
3296 if (!(log2TrSize
- m_hChromaShift
< 2))
3298 if (!tuDepth
|| cu
.getCbf(absPartIdx
, TEXT_CHROMA_U
, tuDepth
- 1))
3299 m_entropyCoder
.codeQtCbfChroma(cu
, absPartIdx
, TEXT_CHROMA_U
, tuDepth
, !bSubdiv
);
3300 if (!tuDepth
|| cu
.getCbf(absPartIdx
, TEXT_CHROMA_V
, tuDepth
- 1))
3301 m_entropyCoder
.codeQtCbfChroma(cu
, absPartIdx
, TEXT_CHROMA_V
, tuDepth
, !bSubdiv
);
3305 X265_CHECK(cu
.getCbf(absPartIdx
, TEXT_CHROMA_U
, tuDepth
) == cu
.getCbf(absPartIdx
, TEXT_CHROMA_U
, tuDepth
- 1), "chroma CBF not matching\n");
3306 X265_CHECK(cu
.getCbf(absPartIdx
, TEXT_CHROMA_V
, tuDepth
) == cu
.getCbf(absPartIdx
, TEXT_CHROMA_V
, tuDepth
- 1), "chroma CBF not matching\n");
3311 m_entropyCoder
.codeQtCbfLuma(cu
, absPartIdx
, tuDepth
);
3315 uint32_t qNumParts
= 1 << (log2TrSize
-1 - LOG2_UNIT_SIZE
) * 2;
3316 for (uint32_t qIdx
= 0; qIdx
< 4; ++qIdx
, absPartIdx
+= qNumParts
)
3317 codeInterSubdivCbfQT(cu
, absPartIdx
, depth
+ 1, depthRange
);
3321 void Search::encodeResidualQT(CUData
& cu
, uint32_t absPartIdx
, const uint32_t depth
, TextType ttype
, const uint32_t depthRange
[2])
3323 X265_CHECK(cu
.m_cuDepth
[0] == cu
.m_cuDepth
[absPartIdx
], "depth not matching\n");
3324 X265_CHECK(cu
.isInter(absPartIdx
), "encodeResidualQT() with intra block\n");
3326 const uint32_t curTuDepth
= depth
- cu
.m_cuDepth
[0];
3327 const uint32_t tuDepth
= cu
.m_tuDepth
[absPartIdx
];
3328 const bool bSubdiv
= curTuDepth
!= tuDepth
;
3329 const uint32_t log2TrSize
= g_maxLog2CUSize
- depth
;
3333 if (cu
.getCbf(absPartIdx
, ttype
, curTuDepth
))
3335 uint32_t qNumParts
= 1 << (log2TrSize
- 1 - LOG2_UNIT_SIZE
) * 2;
3336 for (uint32_t qIdx
= 0; qIdx
< 4; ++qIdx
, absPartIdx
+= qNumParts
)
3337 encodeResidualQT(cu
, absPartIdx
, depth
+ 1, ttype
, depthRange
);
3343 const bool splitIntoSubTUs
= (m_csp
== X265_CSP_I422
);
3344 uint32_t log2TrSizeC
= log2TrSize
- m_hChromaShift
;
3347 const uint32_t qtLayer
= log2TrSize
- 2;
3348 uint32_t coeffOffsetY
= absPartIdx
<< (LOG2_UNIT_SIZE
* 2);
3349 coeff_t
* coeffCurY
= m_rqt
[qtLayer
].coeffRQT
[0] + coeffOffsetY
;
3352 bool bCodeChroma
= true;
3353 uint32_t tuDepthC
= tuDepth
;
3354 if (log2TrSize
== 2 && m_csp
!= X265_CSP_I444
)
3356 X265_CHECK(log2TrSize
== 2 && m_csp
!= X265_CSP_I444
&& tuDepth
, "invalid tuDepth\n");
3359 bCodeChroma
= !(absPartIdx
& 3);
3362 if (ttype
== TEXT_LUMA
&& cu
.getCbf(absPartIdx
, TEXT_LUMA
, tuDepth
))
3363 m_entropyCoder
.codeCoeffNxN(cu
, coeffCurY
, absPartIdx
, log2TrSize
, TEXT_LUMA
);
3367 uint32_t coeffOffsetC
= coeffOffsetY
>> (m_hChromaShift
+ m_vChromaShift
);
3368 coeff_t
* coeffCurU
= m_rqt
[qtLayer
].coeffRQT
[1] + coeffOffsetC
;
3369 coeff_t
* coeffCurV
= m_rqt
[qtLayer
].coeffRQT
[2] + coeffOffsetC
;
3371 if (!splitIntoSubTUs
)
3373 if (ttype
== TEXT_CHROMA_U
&& cu
.getCbf(absPartIdx
, TEXT_CHROMA_U
, tuDepth
))
3374 m_entropyCoder
.codeCoeffNxN(cu
, coeffCurU
, absPartIdx
, log2TrSizeC
, TEXT_CHROMA_U
);
3375 if (ttype
== TEXT_CHROMA_V
&& cu
.getCbf(absPartIdx
, TEXT_CHROMA_V
, tuDepth
))
3376 m_entropyCoder
.codeCoeffNxN(cu
, coeffCurV
, absPartIdx
, log2TrSizeC
, TEXT_CHROMA_V
);
3380 uint32_t tuNumParts
= 2 << ((log2TrSizeC
- LOG2_UNIT_SIZE
) * 2);
3381 uint32_t subTUSize
= 1 << (log2TrSizeC
* 2);
3382 if (ttype
== TEXT_CHROMA_U
&& cu
.getCbf(absPartIdx
, TEXT_CHROMA_U
, tuDepth
))
3384 if (cu
.getCbf(absPartIdx
, ttype
, tuDepth
+ 1))
3385 m_entropyCoder
.codeCoeffNxN(cu
, coeffCurU
, absPartIdx
, log2TrSizeC
, TEXT_CHROMA_U
);
3386 if (cu
.getCbf(absPartIdx
+ tuNumParts
, ttype
, tuDepth
+ 1))
3387 m_entropyCoder
.codeCoeffNxN(cu
, coeffCurU
+ subTUSize
, absPartIdx
+ tuNumParts
, log2TrSizeC
, TEXT_CHROMA_U
);
3389 if (ttype
== TEXT_CHROMA_V
&& cu
.getCbf(absPartIdx
, TEXT_CHROMA_V
, tuDepth
))
3391 if (cu
.getCbf(absPartIdx
, ttype
, tuDepth
+ 1))
3392 m_entropyCoder
.codeCoeffNxN(cu
, coeffCurV
, absPartIdx
, log2TrSizeC
, TEXT_CHROMA_V
);
3393 if (cu
.getCbf(absPartIdx
+ tuNumParts
, ttype
, tuDepth
+ 1))
3394 m_entropyCoder
.codeCoeffNxN(cu
, coeffCurV
+ subTUSize
, absPartIdx
+ tuNumParts
, log2TrSizeC
, TEXT_CHROMA_V
);
3401 void Search::saveResidualQTData(CUData
& cu
, ShortYuv
& resiYuv
, uint32_t absPartIdx
, uint32_t depth
)
3403 X265_CHECK(cu
.m_cuDepth
[0] == cu
.m_cuDepth
[absPartIdx
], "depth not matching\n");
3404 const uint32_t curTrMode
= depth
- cu
.m_cuDepth
[0];
3405 const uint32_t tuDepth
= cu
.m_tuDepth
[absPartIdx
];
3406 const uint32_t log2TrSize
= g_maxLog2CUSize
- depth
;
3408 if (curTrMode
< tuDepth
)
3410 uint32_t qNumParts
= 1 << (log2TrSize
- 1 - LOG2_UNIT_SIZE
) * 2;
3411 for (uint32_t qIdx
= 0; qIdx
< 4; ++qIdx
, absPartIdx
+= qNumParts
)
3412 saveResidualQTData(cu
, resiYuv
, absPartIdx
, depth
+ 1);
3416 const uint32_t qtLayer
= log2TrSize
- 2;
3418 uint32_t log2TrSizeC
= log2TrSize
- m_hChromaShift
;
3419 bool bCodeChroma
= true;
3420 uint32_t tuDepthC
= tuDepth
;
3421 if (log2TrSizeC
< 2)
3423 X265_CHECK(log2TrSize
== 2 && m_csp
!= X265_CSP_I444
&& tuDepth
, "invalid tuDepth\n");
3426 bCodeChroma
= !(absPartIdx
& 3);
3429 m_rqt
[qtLayer
].resiQtYuv
.copyPartToPartLuma(resiYuv
, absPartIdx
, log2TrSize
);
3431 uint32_t numCoeffY
= 1 << (log2TrSize
* 2);
3432 uint32_t coeffOffsetY
= absPartIdx
<< LOG2_UNIT_SIZE
* 2;
3433 coeff_t
* coeffSrcY
= m_rqt
[qtLayer
].coeffRQT
[0] + coeffOffsetY
;
3434 coeff_t
* coeffDstY
= cu
.m_trCoeff
[0] + coeffOffsetY
;
3435 memcpy(coeffDstY
, coeffSrcY
, sizeof(coeff_t
) * numCoeffY
);
3439 m_rqt
[qtLayer
].resiQtYuv
.copyPartToPartChroma(resiYuv
, absPartIdx
, log2TrSizeC
+ m_hChromaShift
);
3441 uint32_t numCoeffC
= 1 << (log2TrSizeC
* 2 + (m_csp
== X265_CSP_I422
));
3442 uint32_t coeffOffsetC
= coeffOffsetY
>> (m_hChromaShift
+ m_vChromaShift
);
3444 coeff_t
* coeffSrcU
= m_rqt
[qtLayer
].coeffRQT
[1] + coeffOffsetC
;
3445 coeff_t
* coeffSrcV
= m_rqt
[qtLayer
].coeffRQT
[2] + coeffOffsetC
;
3446 coeff_t
* coeffDstU
= cu
.m_trCoeff
[1] + coeffOffsetC
;
3447 coeff_t
* coeffDstV
= cu
.m_trCoeff
[2] + coeffOffsetC
;
3448 memcpy(coeffDstU
, coeffSrcU
, sizeof(coeff_t
) * numCoeffC
);
3449 memcpy(coeffDstV
, coeffSrcV
, sizeof(coeff_t
) * numCoeffC
);
3453 /* returns the number of bits required to signal a non-most-probable mode.
3454 * on return mpms contains bitmap of most probable modes */
3455 uint32_t Search::getIntraRemModeBits(CUData
& cu
, uint32_t absPartIdx
, uint32_t preds
[3], uint64_t& mpms
) const
3457 cu
.getIntraDirLumaPredictor(absPartIdx
, preds
);
3460 for (int i
= 0; i
< 3; ++i
)
3461 mpms
|= ((uint64_t)1 << preds
[i
]);
3463 return m_entropyCoder
.bitsIntraModeNonMPM();
3466 /* swap the current mode/cost with the mode with the highest cost in the
3467 * current candidate list, if its cost is better (maintain a top N list) */
3468 void Search::updateCandList(uint32_t mode
, uint64_t cost
, int maxCandCount
, uint32_t* candModeList
, uint64_t* candCostList
)
3470 uint32_t maxIndex
= 0;
3471 uint64_t maxValue
= 0;
3473 for (int i
= 0; i
< maxCandCount
; i
++)
3475 if (maxValue
< candCostList
[i
])
3477 maxValue
= candCostList
[i
];
3482 if (cost
< maxValue
)
3484 candCostList
[maxIndex
] = cost
;
3485 candModeList
[maxIndex
] = mode
;