/*****************************************************************************
 * Copyright (C) 2013 x265 project
 *
 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
 *          Steve Borho <steve@borho.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/
#include "framedata.h"
#include "primitives.h"
#include "threading.h"
/* An explanation of rate distortion levels (--rd-level)
 *
 * rd-level 0 generates no recon per CU (NO RDO or Quant)
 *
 *   sa8d selection between merge / skip / inter / intra and split
 *   no recon pixels generated until CTU analysis is complete, requiring
 *   intra predictions to use source pixels
 *
 * rd-level 1 uses RDO for merge and skip, sa8d for all else
 *
 *   RDO selection between merge and skip
 *   sa8d selection between (merge/skip) / inter modes / intra and split
 *   intra prediction uses reconstructed pixels
 *
 * rd-level 2 uses RDO for merge/skip and split
 *
 *   RDO selection between merge and skip
 *   sa8d selection between (merge/skip) / inter modes / intra
 *
 * rd-level 3 uses RDO for merge/skip/best inter/intra
 *
 *   RDO selection between merge and skip
 *   sa8d selection of best inter mode
 *   sa8d decisions include chroma residual cost
 *   RDO selection between (merge/skip) / best inter mode / intra / split
 *
 * rd-level 4 enables RDOQuant
 *   chroma residual cost included in satd decisions, including subpel refine
 *   (as a result of --subme 3 being used by preset slow)
 *
 * rd-level 5,6 does RDO for each inter mode
 */
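/* Note: rd-levels 0 through 4 are handled by compressInterCU_rd0_4() and levels
 * 5 and 6 by compressInterCU_rd5_6(); compressInterCU_dist() distributes the
 * per-mode analysis jobs across worker threads when bDistributeModeAnalysis is
 * enabled (rd-level 2 or higher). See compressCTU() below for the dispatch. */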
Analysis::Analysis()
{
    m_totalNumJobs = m_numAcquiredJobs = m_numCompletedJobs = 0;
    m_reuseIntraDataCTU = NULL;
    m_reuseInterDataCTU = NULL;
}
bool Analysis::create(ThreadLocalData* tld)
{
    m_tld = tld;
    m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2;
    m_bChromaSa8d = m_param->rdLevel >= 3;

    int csp = m_param->internalCsp;
    uint32_t cuSize = g_maxCUSize;

    bool ok = true;
    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++, cuSize >>= 1)
    {
        ModeDepth& md = m_modeDepth[depth];

        md.cuMemPool.create(depth, csp, MAX_PRED_TYPES);
        ok &= md.fencYuv.create(cuSize, csp);

        for (int j = 0; j < MAX_PRED_TYPES; j++)
        {
            md.pred[j].cu.initialize(md.cuMemPool, depth, csp, j);
            ok &= md.pred[j].predYuv.create(cuSize, csp);
            ok &= md.pred[j].reconYuv.create(cuSize, csp);
            md.pred[j].fencYuv = &md.fencYuv;
        }
    }

    return ok;
}
void Analysis::destroy()
{
    for (uint32_t i = 0; i <= g_maxCUDepth; i++)
    {
        m_modeDepth[i].cuMemPool.destroy();
        m_modeDepth[i].fencYuv.destroy();

        for (int j = 0; j < MAX_PRED_TYPES; j++)
        {
            m_modeDepth[i].pred[j].predYuv.destroy();
            m_modeDepth[i].pred[j].reconYuv.destroy();
        }
    }
}
Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
{
    m_slice = ctu.m_slice;
    m_frame = &frame;

    invalidateContexts(0);
    m_quant.setQPforQuant(ctu);
    m_rqt[0].cur.load(initialContext);
    m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);

    uint32_t numPartition = ctu.m_numPartitions;
    if (m_param->analysisMode)
    {
        m_reuseIntraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
        int numPredDir = m_slice->isInterP() ? 1 : 2;
        m_reuseInterDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData + ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir;
    }

    if (m_slice->m_sliceType == I_SLICE)
    {
        uint32_t zOrder = 0;
        compressIntraCU(ctu, cuGeom, zOrder);
        if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.intraData)
        {
            CUData* bestCU = &m_modeDepth[0].bestMode->cu;
            memcpy(&m_reuseIntraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
            memcpy(&m_reuseIntraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
            memcpy(&m_reuseIntraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
        }
    }
    else
    {
        if (!m_param->rdLevel)
        {
            /* In RD Level 0/1, copy source pixels into the reconstructed block so
             * they are available for intra predictions */
            m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);

            compressInterCU_rd0_4(ctu, cuGeom);

            /* generate residual for entire CTU at once and copy to reconPic */
            encodeResidue(ctu, cuGeom);
        }
        else if (m_param->bDistributeModeAnalysis && m_param->rdLevel >= 2)
            compressInterCU_dist(ctu, cuGeom);
        else if (m_param->rdLevel <= 4)
            compressInterCU_rd0_4(ctu, cuGeom);
        else
            compressInterCU_rd5_6(ctu, cuGeom);
    }

    return *m_modeDepth[0].bestMode;
}
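/* tryLossless() re-encodes the current best mode with lossless CU coding and
 * keeps it if it beats the lossy result; it is intended for use when
 * m_bTryLossless (bCULossless) is set up in create(). */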
void Analysis::tryLossless(const CUGeom& cuGeom)
{
    ModeDepth& md = m_modeDepth[cuGeom.depth];

    if (!md.bestMode->distortion)
        /* already lossless */
        return;
    else if (md.bestMode->cu.isIntra(0))
    {
        md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
        PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
        uint8_t* modes = md.pred[PRED_LOSSLESS].cu.m_lumaIntraDir;
        checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size, modes);
        checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
    }
    else
    {
        md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
        md.pred[PRED_LOSSLESS].predYuv.copyFromYuv(md.bestMode->predYuv);
        encodeResAndCalcRdInterCU(md.pred[PRED_LOSSLESS], cuGeom);
        checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
    }
}
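/* zOrder is the z-scan partition index within the CTU; it indexes the shared
 * analysis-data buffers when intra analysis is loaded or saved, and is advanced
 * by g_depthInc[] as coded or absent sub-CUs are passed over. */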
void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t& zOrder)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);

    if (m_param->analysisMode == X265_ANALYSIS_LOAD)
    {
        uint8_t* reuseDepth  = &m_reuseIntraDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
        uint8_t* reuseModes  = &m_reuseIntraDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
        char* reusePartSizes = &m_reuseIntraDataCTU->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];

        if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.encodeIdx)
        {
            m_quant.setQPforQuant(parentCTU);

            PartSize size = (PartSize)reusePartSizes[zOrder];
            Mode& mode = size == SIZE_2Nx2N ? md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN];
            mode.cu.initSubCU(parentCTU, cuGeom);
            checkIntra(mode, cuGeom, size, &reuseModes[zOrder]);
            checkBestMode(mode, depth);

            if (mightSplit)
                addSplitFlagCost(*md.bestMode, cuGeom.depth);

            // increment zOrder offset to point to next best depth in sharedDepth buffer
            zOrder += g_depthInc[g_maxCUDepth - 1][reuseDepth[zOrder]];

            mightSplit = false;
            mightNotSplit = false;
        }
    }
    else if (mightNotSplit)
    {
        m_quant.setQPforQuant(parentCTU);

        md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL);
        checkBestMode(md.pred[PRED_INTRA], depth);

        if (depth == g_maxCUDepth)
        {
            md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL);
            checkBestMode(md.pred[PRED_INTRA_NxN], depth);
        }

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    if (mightSplit)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                m_rqt[nextDepth].cur.load(*nextContext);
                compressIntraCU(parentCTU, childGeom, zOrder);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);
                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
            {
                /* record the depth of this non-present sub-CU */
                splitCU->setEmptyPart(childGeom, subPartIdx);
                zOrder += g_depthInc[g_maxCUDepth - 1][nextDepth];
            }
        }
        nextContext->store(splitPred->contexts);

        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            updateModeCost(*splitPred);

        checkBestMode(*splitPred, depth);
    }

    checkDQP(md.bestMode->cu, cuGeom);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT])
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.encodeIdx);
}
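/* Worker threads enter here via the JobProvider interface: each call hands out
 * either one parallel mode-analysis job or one motion-estimation task, and the
 * completion events are triggered once the last job of each kind finishes. */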
bool Analysis::findJob(int threadId)
{
    /* try to acquire a CU mode to analyze */
    m_pmodeLock.acquire();
    if (m_totalNumJobs > m_numAcquiredJobs)
    {
        int id = m_numAcquiredJobs++;
        m_pmodeLock.release();

        parallelModeAnalysis(threadId, id);

        m_pmodeLock.acquire();
        if (++m_numCompletedJobs == m_totalNumJobs)
            m_modeCompletionEvent.trigger();
        m_pmodeLock.release();
        return true;
    }
    else
        m_pmodeLock.release();

    if (m_totalNumME > m_numAcquiredME)
    {
        int id = m_numAcquiredME++;

        parallelME(threadId, id);

        if (++m_numCompletedME == m_totalNumME)
            m_meCompletionEvent.trigger();
        return true;
    }

    return false;
}
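/* meId selects the reference to search: IDs below m_numRefIdx[0] are list 0
 * references, the remainder map to list 1 (meId - m_numRefIdx[0]). */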
void Analysis::parallelME(int threadId, int meId)
{
    Analysis* slave = &m_tld[threadId].analysis;
    slave->setQP(*m_slice, m_rdCost.m_qp);
    slave->m_slice = m_slice;
    slave->m_frame = m_frame;

    slave->m_me.setSourcePU(*m_curInterMode->fencYuv, m_curInterMode->cu.m_cuAddr, m_curGeom->encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
    slave->prepMotionCompensation(m_curInterMode->cu, *m_curGeom, m_curPart);

    if (meId < m_slice->m_numRefIdx[0])
        slave->singleMotionEstimation(*this, *m_curInterMode, *m_curGeom, m_curPart, 0, meId);
    else
        slave->singleMotionEstimation(*this, *m_curInterMode, *m_curGeom, m_curPart, 1, meId - m_slice->m_numRefIdx[0]);
}
void Analysis::parallelModeAnalysis(int threadId, int jobId)
{
    Analysis* slave = &m_tld[threadId].analysis;
    slave->m_slice = m_slice;
    slave->m_frame = m_frame;
    slave->setQP(*m_slice, m_rdCost.m_qp);
    slave->invalidateContexts(0);

    ModeDepth& md = m_modeDepth[m_curGeom->depth];

    if (m_param->rdLevel <= 4)
    {
        switch (jobId)
        {
        case 0:
            slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur);
            slave->checkIntraInInter(md.pred[PRED_INTRA], *m_curGeom);
            if (m_param->rdLevel > 2)
                slave->encodeIntraInInter(md.pred[PRED_INTRA], *m_curGeom);
            break;

        case 1:
            slave->checkInter_rd0_4(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N);
            if (m_slice->m_sliceType == B_SLICE)
                slave->checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], *m_curGeom);
            break;

        case 2:
            slave->checkInter_rd0_4(md.pred[PRED_Nx2N], *m_curGeom, SIZE_Nx2N);
            break;

        case 3:
            slave->checkInter_rd0_4(md.pred[PRED_2NxN], *m_curGeom, SIZE_2NxN);
            break;

        case 4:
            slave->checkInter_rd0_4(md.pred[PRED_2NxnU], *m_curGeom, SIZE_2NxnU);
            break;

        case 5:
            slave->checkInter_rd0_4(md.pred[PRED_2NxnD], *m_curGeom, SIZE_2NxnD);
            break;

        case 6:
            slave->checkInter_rd0_4(md.pred[PRED_nLx2N], *m_curGeom, SIZE_nLx2N);
            break;

        case 7:
            slave->checkInter_rd0_4(md.pred[PRED_nRx2N], *m_curGeom, SIZE_nRx2N);
            break;

        default:
            X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
            break;
        }
    }
    else
    {
        bool bMergeOnly = m_curGeom->log2CUSize == 6;

        slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur);
        slave->m_quant.setQPforQuant(md.pred[PRED_2Nx2N].cu);

        switch (jobId)
        {
        case 0:
            slave->checkIntra(md.pred[PRED_INTRA], *m_curGeom, SIZE_2Nx2N, NULL);
            if (m_curGeom->depth == g_maxCUDepth && m_curGeom->log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                slave->checkIntra(md.pred[PRED_INTRA_NxN], *m_curGeom, SIZE_NxN, NULL);
            break;

        case 1:
            slave->checkInter_rd5_6(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N, false);
            md.pred[PRED_BIDIR].rdCost = MAX_INT64;
            if (m_slice->m_sliceType == B_SLICE)
            {
                slave->checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], *m_curGeom);
                if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
                    slave->encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], *m_curGeom);
            }
            break;

        case 2:
            slave->checkInter_rd5_6(md.pred[PRED_Nx2N], *m_curGeom, SIZE_Nx2N, false);
            break;

        case 3:
            slave->checkInter_rd5_6(md.pred[PRED_2NxN], *m_curGeom, SIZE_2NxN, false);
            break;

        case 4:
            slave->checkInter_rd5_6(md.pred[PRED_2NxnU], *m_curGeom, SIZE_2NxnU, bMergeOnly);
            break;

        case 5:
            slave->checkInter_rd5_6(md.pred[PRED_2NxnD], *m_curGeom, SIZE_2NxnD, bMergeOnly);
            break;

        case 6:
            slave->checkInter_rd5_6(md.pred[PRED_nLx2N], *m_curGeom, SIZE_nLx2N, bMergeOnly);
            break;

        case 7:
            slave->checkInter_rd5_6(md.pred[PRED_nRx2N], *m_curGeom, SIZE_nRx2N, bMergeOnly);
            break;

        default:
            X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
            break;
        }
    }
}
void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    uint32_t cuAddr = parentCTU.m_cuAddr;
    ModeDepth& md = m_modeDepth[depth];

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
    uint32_t minDepth = m_param->rdLevel <= 4 ? topSkipMinDepth(parentCTU, cuGeom) : 0;

    X265_CHECK(m_param->rdLevel >= 2, "compressInterCU_dist does not support RD 0 or 1\n");

    if (mightNotSplit && depth >= minDepth)
    {
        int bTryAmp = m_slice->m_sps->maxAMPDepth > depth && (cuGeom.log2CUSize < 6 || m_param->rdLevel > 4);
        int bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;

        /* Initialize all prediction CUs based on parentCTU */
        md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
        if (m_param->bEnableRectInter)
        {
            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
        }
        if (bTryAmp)
        {
            md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
        }
        if (bTryIntra)
        {
            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
            if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
        }

        m_pmodeLock.acquire();
        m_totalNumJobs = 2 + m_param->bEnableRectInter * 2 + bTryAmp * 4;
        m_numAcquiredJobs = !bTryIntra;
        m_numCompletedJobs = m_numAcquiredJobs;
        m_bJobsQueued = true;
        JobProvider::enqueue();
        m_pmodeLock.release();

        for (int i = 0; i < m_totalNumJobs - m_numCompletedJobs; i++)
            m_pool->pokeIdleThread();

        /* participate in processing jobs, until all are distributed */

        JobProvider::dequeue();
        m_bJobsQueued = false;

        /* the master worker thread (this one) does merge analysis. By doing
         * merge after all the other jobs are at least started, we usually avoid
         * blocking on another thread */

        if (m_param->rdLevel <= 4)
        {
            checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);

            m_modeCompletionEvent.wait();

            /* select best inter mode based on sa8d cost */
            Mode *bestInter = &md.pred[PRED_2Nx2N];

            if (m_param->bEnableRectInter)
            {
                if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_Nx2N];
                if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxN];
            }

            if (bTryAmp)
            {
                if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxnU];
                if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxnD];
                if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_nLx2N];
                if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_nRx2N];
            }

            if (m_param->rdLevel > 2)
            {
                /* RD selection between merge, inter, bidir and intra */
                if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
                {
                    for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
                    {
                        prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
                        motionCompensation(bestInter->predYuv, false, true);
                    }
                }
                encodeResAndCalcRdInterCU(*bestInter, cuGeom);
                checkBestMode(*bestInter, depth);

                /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
                    md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
                {
                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
                    checkBestMode(md.pred[PRED_BIDIR], depth);
                }

                if (bTryIntra)
                    checkBestMode(md.pred[PRED_INTRA], depth);
            }
            else /* m_param->rdLevel == 2 */
            {
                if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = bestInter;

                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = &md.pred[PRED_BIDIR];

                if (bTryIntra && md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
                {
                    md.bestMode = &md.pred[PRED_INTRA];
                    encodeIntraInInter(*md.bestMode, cuGeom);
                }
                else if (!md.bestMode->cu.m_mergeFlag[0])
                {
                    /* finally code the best mode selected from SA8D costs */
                    for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++)
                    {
                        prepMotionCompensation(md.bestMode->cu, cuGeom, puIdx);
                        motionCompensation(md.bestMode->predYuv, false, true);
                    }
                    encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
                }
            }
        }
        else
        {
            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
            m_modeCompletionEvent.wait();

            checkBestMode(md.pred[PRED_2Nx2N], depth);
            checkBestMode(md.pred[PRED_BIDIR], depth);

            if (m_param->bEnableRectInter)
            {
                checkBestMode(md.pred[PRED_Nx2N], depth);
                checkBestMode(md.pred[PRED_2NxN], depth);
            }

            if (bTryAmp)
            {
                checkBestMode(md.pred[PRED_2NxnU], depth);
                checkBestMode(md.pred[PRED_2NxnD], depth);
                checkBestMode(md.pred[PRED_nLx2N], depth);
                checkBestMode(md.pred[PRED_nRx2N], depth);
            }

            if (bTryIntra)
            {
                checkBestMode(md.pred[PRED_INTRA], depth);
                if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                    checkBestMode(md.pred[PRED_INTRA_NxN], depth);
            }
        }

        if (md.bestMode->rdCost == MAX_INT64 && !bTryIntra)
        {
            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
            checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
            encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
            checkBestMode(md.pred[PRED_INTRA], depth);
        }

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    bool bNoSplit = false;
    if (md.bestMode)
    {
        bNoSplit = md.bestMode->cu.isSkipped(0);
        if (mightSplit && depth && depth >= minDepth && !bNoSplit && m_param->rdLevel <= 4)
            bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
    }

    if (mightSplit && !bNoSplit)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                m_rqt[nextDepth].cur.load(*nextContext);
                compressInterCU_dist(parentCTU, childGeom);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);

                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
                splitCU->setEmptyPart(childGeom, subPartIdx);
        }
        nextContext->store(splitPred->contexts);

        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            updateModeCost(*splitPred);

        checkBestMode(*splitPred, depth);
    }

    if (mightNotSplit)
    {
        /* early-out statistics */
        FrameData& curEncData = *m_frame->m_encData;
        FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
        uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
        cuStat.count[depth] += 1;
        cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
    }

    checkDQP(md.bestMode->cu, cuGeom);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT])
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.encodeIdx);
}
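/* Non-distributed analysis for rd-levels 0 through 4. At rd-level 0 no recon is
 * generated here; compressCTU() later calls encodeResidue() to code the
 * residual for the whole CTU at once. */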
void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    uint32_t cuAddr = parentCTU.m_cuAddr;
    ModeDepth& md = m_modeDepth[depth];

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
    uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);

    if (mightNotSplit && depth >= minDepth)
    {
        bool bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;

        /* Compute Merge Cost */
        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
        checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);

        bool earlyskip = false;
        if (m_param->rdLevel)
            earlyskip = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth

        if (!earlyskip)
        {
            md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
            checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N);

            if (m_slice->m_sliceType == B_SLICE)
            {
                md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
                checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
            }

            Mode *bestInter = &md.pred[PRED_2Nx2N];
            if (m_param->bEnableRectInter)
            {
                md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
                checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N);
                if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_Nx2N];

                md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
                checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN);
                if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxN];
            }

            if (m_slice->m_sps->maxAMPDepth > depth && cuGeom.log2CUSize < 6)
            {
                bool bHor = false, bVer = false;
                if (bestInter->cu.m_partSize[0] == SIZE_2NxN)
                    bHor = true;
                else if (bestInter->cu.m_partSize[0] == SIZE_Nx2N)
                    bVer = true;
                else if (bestInter->cu.m_partSize[0] == SIZE_2Nx2N &&
                         md.bestMode && md.bestMode->cu.getQtRootCbf(0))
                {
                    bHor = true;
                    bVer = true;
                }

                if (bHor)
                {
                    md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU);
                    if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_2NxnU];

                    md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD);
                    if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_2NxnD];
                }
                if (bVer)
                {
                    md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N);
                    if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_nLx2N];

                    md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N);
                    if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_nRx2N];
                }
            }

            if (m_param->rdLevel >= 3)
            {
                /* Calculate RD cost of best inter option */
                if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
                {
                    for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
                    {
                        prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
                        motionCompensation(bestInter->predYuv, false, true);
                    }
                }
                encodeResAndCalcRdInterCU(*bestInter, cuGeom);
                checkBestMode(*bestInter, depth);

                /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
                    md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
                {
                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
                    checkBestMode(md.pred[PRED_BIDIR], depth);
                }

                if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) ||
                    md.bestMode->sa8dCost == MAX_INT64)
                {
                    md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
                    checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
                    encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
                    checkBestMode(md.pred[PRED_INTRA], depth);
                }
            }
            else
            {
                /* SA8D choice between merge/skip, inter, bidir, and intra */
                if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = bestInter;

                if (m_slice->m_sliceType == B_SLICE &&
                    md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = &md.pred[PRED_BIDIR];

                if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64)
                {
                    md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
                    checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
                    if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
                        md.bestMode = &md.pred[PRED_INTRA];
                }

                /* finally code the best mode selected by SA8D costs:
                 * RD level 2 - fully encode the best mode
                 * RD level 1 - generate recon pixels
                 * RD level 0 - generate chroma prediction */
                if (md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)
                {
                    /* prediction already generated for this CU, and if rd level
                     * is not 0, it is already fully encoded */
                }
                else if (md.bestMode->cu.isInter(0))
                {
                    for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++)
                    {
                        prepMotionCompensation(md.bestMode->cu, cuGeom, puIdx);
                        motionCompensation(md.bestMode->predYuv, false, true);
                    }
                    if (m_param->rdLevel == 2)
                        encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
                    else if (m_param->rdLevel == 1)
                    {
                        /* generate recon pixels with no rate distortion considerations */
                        CUData& cu = md.bestMode->cu;
                        m_quant.setQPforQuant(cu);

                        uint32_t tuDepthRange[2];
                        cu.getInterTUQtDepthRange(tuDepthRange, 0);

                        m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize);
                        residualTransformQuantInter(*md.bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange);
                        if (cu.getQtRootCbf(0))
                            md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]);
                        else
                        {
                            md.bestMode->reconYuv.copyFromYuv(md.bestMode->predYuv);
                            if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
                                cu.setPredModeSubParts(MODE_SKIP);
                        }
                    }
                }
                else
                {
                    if (m_param->rdLevel == 2)
                        encodeIntraInInter(*md.bestMode, cuGeom);
                    else if (m_param->rdLevel == 1)
                    {
                        /* generate recon pixels with no rate distortion considerations */
                        CUData& cu = md.bestMode->cu;
                        m_quant.setQPforQuant(cu);

                        uint32_t tuDepthRange[2];
                        cu.getIntraTUQtDepthRange(tuDepthRange, 0);

                        uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
                        residualTransformQuantIntra(*md.bestMode, cuGeom, initTuDepth, 0, tuDepthRange);
                        getBestIntraModeChroma(*md.bestMode, cuGeom);
                        residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
                        md.bestMode->reconYuv.copyFromPicYuv(*m_frame->m_reconPic, cu.m_cuAddr, cuGeom.encodeIdx); // TODO:
                    }
                }
            }
        }

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    bool bNoSplit = false;
    if (md.bestMode)
    {
        bNoSplit = md.bestMode->cu.isSkipped(0);
        if (mightSplit && depth && depth >= minDepth && !bNoSplit)
            bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
    }

    if (mightSplit && !bNoSplit)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                m_rqt[nextDepth].cur.load(*nextContext);
                compressInterCU_rd0_4(parentCTU, childGeom);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);

                if (m_param->rdLevel)
                    nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                else
                    nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childGeom.numPartitions * subPartIdx);
                if (m_param->rdLevel > 1)
                    nextContext = &nd.bestMode->contexts;
            }
            else
                splitCU->setEmptyPart(childGeom, subPartIdx);
        }
        nextContext->store(splitPred->contexts);

        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else if (m_param->rdLevel > 1)
            updateModeCost(*splitPred);
        else
            splitPred->sa8dCost = m_rdCost.calcRdSADCost(splitPred->distortion, splitPred->sa8dBits);

        if (!md.bestMode)
            md.bestMode = splitPred;
        else if (m_param->rdLevel > 1)
            checkBestMode(*splitPred, cuGeom.depth);
        else if (splitPred->sa8dCost < md.bestMode->sa8dCost)
            md.bestMode = splitPred;
    }

    if (mightNotSplit)
    {
        /* early-out statistics */
        FrameData& curEncData = *m_frame->m_encData;
        FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
        uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
        cuStat.count[depth] += 1;
        cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
    }

    checkDQP(md.bestMode->cu, cuGeom);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT] && m_param->rdLevel)
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.encodeIdx);
}
void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);

    if (mightNotSplit)
    {
        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
        checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
        bool earlySkip = m_param->bEnableEarlySkip && md.bestMode && !md.bestMode->cu.getQtRootCbf(0);

        if (!earlySkip)
        {
            md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
            checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, false);
            checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);

            if (m_slice->m_sliceType == B_SLICE)
            {
                md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
                checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
                if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
                {
                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
                    checkBestMode(md.pred[PRED_BIDIR], cuGeom.depth);
                }
            }

            if (m_param->bEnableRectInter)
            {
                if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                {
                    md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, false);
                    checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);
                }
                if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                {
                    md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, false);
                    checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
                }
            }

            // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N)
            if (m_slice->m_sps->maxAMPDepth > depth)
            {
                bool bMergeOnly = cuGeom.log2CUSize == 6;

                bool bHor = false, bVer = false;
                if (md.bestMode->cu.m_partSize[0] == SIZE_2NxN)
                    bHor = true;
                else if (md.bestMode->cu.m_partSize[0] == SIZE_Nx2N)
                    bVer = true;
                else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0] && !md.bestMode->cu.isSkipped(0))
                {
                    bHor = true;
                    bVer = true;
                }

                if (bHor)
                {
                    if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                    {
                        md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
                        checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, bMergeOnly);
                        checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);
                    }
                    if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                    {
                        md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
                        checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, bMergeOnly);
                        checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
                    }
                }
                if (bVer)
                {
                    if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                    {
                        md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
                        checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, bMergeOnly);
                        checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);
                    }
                    if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                    {
                        md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
                        checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, bMergeOnly);
                        checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
                    }
                }
            }

            if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) &&
                (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)))
            {
                md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
                checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL);
                checkBestMode(md.pred[PRED_INTRA], depth);

                if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                {
                    md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
                    checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL);
                    checkBestMode(md.pred[PRED_INTRA_NxN], depth);
                }
            }
        }

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    // estimate split cost
    if (mightSplit && (!md.bestMode || !md.bestMode->cu.isSkipped(0)))
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                m_rqt[nextDepth].cur.load(*nextContext);
                compressInterCU_rd5_6(parentCTU, childGeom);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);
                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
                splitCU->setEmptyPart(childGeom, subPartIdx);
        }
        nextContext->store(splitPred->contexts);

        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            updateModeCost(*splitPred);

        checkBestMode(*splitPred, depth);
    }

    checkDQP(md.bestMode->cu, cuGeom);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT])
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.encodeIdx);
}
/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    Yuv *fencYuv = &md.fencYuv;

    /* Note that these two Mode instances are named MERGE and SKIP but they may
     * hold the reverse when the function returns. We toggle between the two modes */
    Mode* tempPred = &merge;
    Mode* bestPred = &skip;

    X265_CHECK(m_slice->m_sliceType != I_SLICE, "Evaluating merge in I slice\n");

    tempPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
    tempPred->cu.setPredModeSubParts(MODE_INTER);
    tempPred->cu.m_mergeFlag[0] = true;

    bestPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
    bestPred->cu.setPredModeSubParts(MODE_INTER);
    bestPred->cu.m_mergeFlag[0] = true;

    MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
    uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS];
    uint32_t maxNumMergeCand = tempPred->cu.getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours);

    bestPred->sa8dCost = MAX_INT64;
    int bestSadCand = -1;
    int cpart, sizeIdx = cuGeom.log2CUSize - 2;
    if (m_bChromaSa8d)
    {
        int cuSize = 1 << cuGeom.log2CUSize;
        cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
    }
    for (uint32_t i = 0; i < maxNumMergeCand; ++i)
    {
        if (m_bFrameParallel &&
            (mvFieldNeighbours[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
             mvFieldNeighbours[i][1].mv.y >= (m_param->searchRange + 1) * 4))
            continue;

        tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx
        tempPred->cu.m_interDir[0] = interDirNeighbours[i];
        tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
        tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
        tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
        tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;

        prepMotionCompensation(tempPred->cu, cuGeom, 0);
        motionCompensation(tempPred->predYuv, true, m_bChromaSa8d);

        tempPred->sa8dBits = getTUBits(i, maxNumMergeCand);
        tempPred->distortion = primitives.sa8d[sizeIdx](fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
        if (m_bChromaSa8d)
        {
            tempPred->distortion += primitives.sa8d_inter[cpart](fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
            tempPred->distortion += primitives.sa8d_inter[cpart](fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
        }
        tempPred->sa8dCost = m_rdCost.calcRdSADCost(tempPred->distortion, tempPred->sa8dBits);

        if (tempPred->sa8dCost < bestPred->sa8dCost)
        {
            bestSadCand = i;
            std::swap(tempPred, bestPred);
        }
    }

    /* force mode decision to take inter or intra */
    if (bestSadCand < 0)
        return;

    /* calculate the motion compensation for chroma for the best mode selected */
    if (!m_bChromaSa8d) /* Chroma MC was done above */
    {
        prepMotionCompensation(bestPred->cu, cuGeom, 0);
        motionCompensation(bestPred->predYuv, false, true);
    }

    if (m_param->rdLevel)
    {
        if (m_param->bLossless)
            bestPred->rdCost = MAX_INT64;
        else
            encodeResAndCalcRdSkipCU(*bestPred);

        /* Encode with residual */
        tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand;
        tempPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0);
        tempPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0);
        tempPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
        tempPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0);
        tempPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
        tempPred->sa8dCost = bestPred->sa8dCost;
        tempPred->predYuv.copyFromYuv(bestPred->predYuv);

        encodeResAndCalcRdInterCU(*tempPred, cuGeom);

        md.bestMode = tempPred->rdCost < bestPred->rdCost ? tempPred : bestPred;
    }
    else
        md.bestMode = bestPred;

    /* broadcast sets of MV field data */
    bestPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0);
    bestPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0);
    bestPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
    bestPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0);
    bestPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
}
/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;

    /* Note that these two Mode instances are named MERGE and SKIP but they may
     * hold the reverse when the function returns. We toggle between the two modes */
    Mode* tempPred = &merge;
    Mode* bestPred = &skip;

    merge.cu.setPredModeSubParts(MODE_INTER);
    merge.cu.setPartSizeSubParts(SIZE_2Nx2N);
    merge.cu.m_mergeFlag[0] = true;

    skip.cu.setPredModeSubParts(MODE_INTER);
    skip.cu.setPartSizeSubParts(SIZE_2Nx2N);
    skip.cu.m_mergeFlag[0] = true;

    MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
    uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS];
    uint32_t maxNumMergeCand = merge.cu.getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours);

    bool foundCbf0Merge = false;
    bool triedPZero = false, triedBZero = false;
    bestPred->rdCost = MAX_INT64;
    for (uint32_t i = 0; i < maxNumMergeCand; i++)
    {
        if (m_bFrameParallel &&
            (mvFieldNeighbours[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
             mvFieldNeighbours[i][1].mv.y >= (m_param->searchRange + 1) * 4))
            continue;

        /* the merge candidate list is packed with MV(0,0) ref 0 when it is not full */
        if (interDirNeighbours[i] == 1 && !mvFieldNeighbours[i][0].mv.word && !mvFieldNeighbours[i][0].refIdx)
        {
            if (triedPZero)
                continue;
            triedPZero = true;
        }
        else if (interDirNeighbours[i] == 3 &&
                 !mvFieldNeighbours[i][0].mv.word && !mvFieldNeighbours[i][0].refIdx &&
                 !mvFieldNeighbours[i][1].mv.word && !mvFieldNeighbours[i][1].refIdx)
        {
            if (triedBZero)
                continue;
            triedBZero = true;
        }

        tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; /* merge candidate ID is stored in L0 MVP idx */
        tempPred->cu.m_interDir[0] = interDirNeighbours[i];
        tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
        tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
        tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
        tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
        tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */

        prepMotionCompensation(tempPred->cu, cuGeom, 0);
        motionCompensation(tempPred->predYuv, true, true);

        uint8_t hasCbf = true;
        bool swapped = false;
        if (!foundCbf0Merge)
        {
            /* if the best prediction has CBF (not a skip) then try merge with residual */
            encodeResAndCalcRdInterCU(*tempPred, cuGeom);
            hasCbf = tempPred->cu.getQtRootCbf(0);
            foundCbf0Merge = !hasCbf;

            if (tempPred->rdCost < bestPred->rdCost)
            {
                std::swap(tempPred, bestPred);
                swapped = true;
            }
        }
        if (!m_param->bLossless && hasCbf)
        {
            /* try merge without residual (skip), if not lossless coding */
            if (swapped)
            {
                tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;
                tempPred->cu.m_interDir[0] = interDirNeighbours[i];
                tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
                tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
                tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
                tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
                tempPred->cu.setPredModeSubParts(MODE_INTER);
                tempPred->predYuv.copyFromYuv(bestPred->predYuv);
            }

            encodeResAndCalcRdSkipCU(*tempPred);

            if (tempPred->rdCost < bestPred->rdCost)
                std::swap(tempPred, bestPred);
        }
    }

    if (bestPred->rdCost < MAX_INT64)
    {
        m_modeDepth[depth].bestMode = bestPred;

        /* broadcast sets of MV field data */
        uint32_t bestCand = bestPred->cu.m_mvpIdx[0][0];
        bestPred->cu.setPUInterDir(interDirNeighbours[bestCand], 0, 0);
        bestPred->cu.setPUMv(0, mvFieldNeighbours[bestCand][0].mv, 0, 0);
        bestPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestCand][0].refIdx, 0, 0);
        bestPred->cu.setPUMv(1, mvFieldNeighbours[bestCand][1].mv, 0, 0);
        bestPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestCand][1].refIdx, 0, 0);
    }
}
void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize)
{
    interMode.initCosts();
    interMode.cu.setPartSizeSubParts(partSize);
    interMode.cu.setPredModeSubParts(MODE_INTER);
    int numPredDir = m_slice->isInterP() ? 1 : 2;

    if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU)
    {
        for (uint32_t part = 0; part < interMode.cu.getNumPartInter(); part++)
        {
            MotionData* bestME = interMode.bestME[part];
            for (int32_t i = 0; i < numPredDir; i++)
            {
                bestME[i].ref = m_reuseInterDataCTU->ref;
                m_reuseInterDataCTU++;
            }
        }
    }

    if (predInterSearch(interMode, cuGeom, false, m_bChromaSa8d))
    {
        /* predInterSearch sets interMode.sa8dBits */
        const Yuv& fencYuv = *interMode.fencYuv;
        Yuv& predYuv = interMode.predYuv;
        int part = partitionFromLog2Size(cuGeom.log2CUSize);
        interMode.distortion = primitives.sa8d[part](fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
        if (m_bChromaSa8d)
        {
            uint32_t cuSize = 1 << cuGeom.log2CUSize;
            int cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
            interMode.distortion += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
            interMode.distortion += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
        }
        interMode.sa8dCost = m_rdCost.calcRdSADCost(interMode.distortion, interMode.sa8dBits);

        if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
        {
            for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
            {
                MotionData* bestME = interMode.bestME[puIdx];
                for (int32_t i = 0; i < numPredDir; i++)
                {
                    m_reuseInterDataCTU->ref = bestME[i].ref;
                    m_reuseInterDataCTU++;
                }
            }
        }
    }
    else
    {
        interMode.distortion = MAX_UINT;
        interMode.sa8dCost = MAX_INT64;
    }
}
void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, bool bMergeOnly)
{
    interMode.initCosts();
    interMode.cu.setPartSizeSubParts(partSize);
    interMode.cu.setPredModeSubParts(MODE_INTER);
    int numPredDir = m_slice->isInterP() ? 1 : 2;

    if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU)
    {
        for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
        {
            MotionData* bestME = interMode.bestME[puIdx];
            for (int32_t i = 0; i < numPredDir; i++)
            {
                bestME[i].ref = m_reuseInterDataCTU->ref;
                m_reuseInterDataCTU++;
            }
        }
    }

    if (predInterSearch(interMode, cuGeom, bMergeOnly, true))
    {
        /* predInterSearch sets interMode.sa8dBits, but this is ignored */
        encodeResAndCalcRdInterCU(interMode, cuGeom);

        if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
        {
            for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
            {
                MotionData* bestME = interMode.bestME[puIdx];
                for (int32_t i = 0; i < numPredDir; i++)
                {
                    m_reuseInterDataCTU->ref = bestME[i].ref;
                    m_reuseInterDataCTU++;
                }
            }
        }
    }
    else
    {
        interMode.distortion = MAX_UINT;
        interMode.rdCost = MAX_INT64;
    }
}
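/* Estimate a bidirectional 2Nx2N candidate from the best unidirectional L0/L1
 * motion vectors; it also tries the coincident (zero-MV) case, using an average
 * of the two reference blocks when chroma sa8d is not being measured. */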
void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom)
{
    CUData& cu = bidir2Nx2N.cu;

    if (cu.isBipredRestriction() || inter2Nx2N.bestME[0][0].cost == MAX_UINT || inter2Nx2N.bestME[0][1].cost == MAX_UINT)
    {
        bidir2Nx2N.sa8dCost = MAX_INT64;
        bidir2Nx2N.rdCost = MAX_INT64;
        return;
    }

    const Yuv& fencYuv = *bidir2Nx2N.fencYuv;
    MV mvzero(0, 0);
    int cpart, partEnum = cuGeom.log2CUSize - 2;

    if (m_bChromaSa8d)
    {
        int cuSize = 1 << cuGeom.log2CUSize;
        cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
    }

    bidir2Nx2N.bestME[0][0] = inter2Nx2N.bestME[0][0];
    bidir2Nx2N.bestME[0][1] = inter2Nx2N.bestME[0][1];
    MotionData* bestME = bidir2Nx2N.bestME[0];
    int ref0    = bestME[0].ref;
    MV  mvp0    = bestME[0].mvp;
    int mvpIdx0 = bestME[0].mvpIdx;
    int ref1    = bestME[1].ref;
    MV  mvp1    = bestME[1].mvp;
    int mvpIdx1 = bestME[1].mvpIdx;

    bidir2Nx2N.initCosts();
    cu.setPartSizeSubParts(SIZE_2Nx2N);
    cu.setPredModeSubParts(MODE_INTER);
    cu.setPUInterDir(3, 0, 0);
    cu.setPURefIdx(0, (int8_t)ref0, 0, 0);
    cu.setPURefIdx(1, (int8_t)ref1, 0, 0);
    cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;
    cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
    cu.m_mergeFlag[0] = 0;

    /* Estimate cost of BIDIR using best 2Nx2N L0 and L1 motion vectors */
    cu.setPUMv(0, bestME[0].mv, 0, 0);
    cu.m_mvd[0][0] = bestME[0].mv - mvp0;

    cu.setPUMv(1, bestME[1].mv, 0, 0);
    cu.m_mvd[1][0] = bestME[1].mv - mvp1;

    prepMotionCompensation(cu, cuGeom, 0);
    motionCompensation(bidir2Nx2N.predYuv, true, m_bChromaSa8d);

    int sa8d = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size);
    if (m_bChromaSa8d)
    {
        /* Add in chroma distortion */
        sa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[1], bidir2Nx2N.predYuv.m_csize);
        sa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[2], bidir2Nx2N.predYuv.m_csize);
    }
    bidir2Nx2N.sa8dBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
    bidir2Nx2N.sa8dCost = sa8d + m_rdCost.getCost(bidir2Nx2N.sa8dBits);

    bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
    if (bTryZero)
    {
        /* Do not try zero MV if unidir motion predictors are beyond
         * valid search area */
        MV mvmin, mvmax;
        int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
        setSearchRange(cu, mvzero, merange, mvmin, mvmax);
        mvmax.y += 2; // there is some pad for subpel refine

        bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
        bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
    }
    if (bTryZero)
    {
        /* Estimate cost of BIDIR using coincident blocks */
        Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;

        int zsa8d;

        if (m_bChromaSa8d)
        {
            cu.m_mv[0][0] = mvzero;
            cu.m_mv[1][0] = mvzero;

            prepMotionCompensation(cu, cuGeom, 0);
            motionCompensation(tmpPredYuv, true, true);

            zsa8d  = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
            zsa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize);
            zsa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, tmpPredYuv.m_buf[2], tmpPredYuv.m_csize);
        }
        else
        {
            pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx);
            pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx);
            intptr_t refStride = m_slice->m_mref[0][0].lumaStride;

            primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
            zsa8d = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
        }

        uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
        uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
        uint32_t zcost = zsa8d + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);

        /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
        checkBestMVP(inter2Nx2N.amvpCand[0][ref0], mvzero, mvp0, mvpIdx0, bits0, zcost);
        checkBestMVP(inter2Nx2N.amvpCand[1][ref1], mvzero, mvp1, mvpIdx1, bits1, zcost);

        uint32_t zbits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
        zcost = zsa8d + m_rdCost.getCost(zbits);

        if (zcost < bidir2Nx2N.sa8dCost)
        {
            bidir2Nx2N.sa8dBits = zbits;
            bidir2Nx2N.sa8dCost = zcost;

            cu.setPUMv(0, mvzero, 0, 0);
            cu.m_mvd[0][0] = mvzero - mvp0;
            cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;

            cu.setPUMv(1, mvzero, 0, 0);
            cu.m_mvd[1][0] = mvzero - mvp1;
            cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;

            if (m_bChromaSa8d)
                /* real MC was already performed */
                bidir2Nx2N.predYuv.copyFromYuv(tmpPredYuv);
            else
            {
                prepMotionCompensation(cu, cuGeom, 0);
                motionCompensation(bidir2Nx2N.predYuv, true, true);
            }
        }
        else if (m_bChromaSa8d)
        {
            /* recover overwritten motion vectors */
            cu.m_mv[0][0] = bestME[0].mv;
            cu.m_mv[1][0] = bestME[1].mv;
        }
    }
}
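/* rd-level 0 path: walk the coded CU structure of the CTU, regenerate the
 * residual with the final QP, and write the reconstruction directly into
 * m_reconPic (called from compressCTU() after compressInterCU_rd0_4()). */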
1672 void Analysis::encodeResidue(const CUData
& ctu
, const CUGeom
& cuGeom
)
1674 if (cuGeom
.depth
< ctu
.m_cuDepth
[cuGeom
.encodeIdx
] && cuGeom
.depth
< g_maxCUDepth
)
1676 for (uint32_t subPartIdx
= 0; subPartIdx
< 4; subPartIdx
++)
1678 const CUGeom
& childGeom
= *(&cuGeom
+ cuGeom
.childOffset
+ subPartIdx
);
1679 if (childGeom
.flags
& CUGeom::PRESENT
)
1680 encodeResidue(ctu
, childGeom
);
1685 uint32_t absPartIdx
= cuGeom
.encodeIdx
;
1686 int sizeIdx
= cuGeom
.log2CUSize
- 2;
1688 /* reuse the bestMode data structures at the current depth */
1689 Mode
*bestMode
= m_modeDepth
[cuGeom
.depth
].bestMode
;
1690 CUData
& cu
= bestMode
->cu
;
1692 cu
.copyFromPic(ctu
, cuGeom
);
1693 m_quant
.setQPforQuant(cu
);
1695 Yuv
& fencYuv
= m_modeDepth
[cuGeom
.depth
].fencYuv
;
1697 m_modeDepth
[0].fencYuv
.copyPartToYuv(fencYuv
, absPartIdx
);
1698 X265_CHECK(bestMode
->fencYuv
== &fencYuv
, "invalid fencYuv\n");
1702 uint32_t tuDepthRange
[2];
1703 cu
.getIntraTUQtDepthRange(tuDepthRange
, 0);
1705 uint32_t initTuDepth
= cu
.m_partSize
[0] != SIZE_2Nx2N
;
1706 residualTransformQuantIntra(*bestMode
, cuGeom
, initTuDepth
, 0, tuDepthRange
);
1707 getBestIntraModeChroma(*bestMode
, cuGeom
);
1708 residualQTIntraChroma(*bestMode
, cuGeom
, 0, 0);
    else // if (cu.isInter(0))
    {
        X265_CHECK(!ctu.isSkipped(absPartIdx), "skip not expected prior to transform\n");

        /* Calculate residual for current CU part into depth sized resiYuv */

        ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;

        /* at RD 0, the prediction pixels are accumulated into the top depth predYuv */
        Yuv& predYuv = m_modeDepth[0].bestMode->predYuv;
        pixel* predY = predYuv.getLumaAddr(absPartIdx);
        pixel* predU = predYuv.getCbAddr(absPartIdx);
        pixel* predV = predYuv.getCrAddr(absPartIdx);

        primitives.luma_sub_ps[sizeIdx](resiYuv.m_buf[0], resiYuv.m_size,
                                        fencYuv.m_buf[0], predY,
                                        fencYuv.m_size, predYuv.m_size);

        primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[1], resiYuv.m_csize,
                                                 fencYuv.m_buf[1], predU,
                                                 fencYuv.m_csize, predYuv.m_csize);

        primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[2], resiYuv.m_csize,
                                                 fencYuv.m_buf[2], predV,
                                                 fencYuv.m_csize, predYuv.m_csize);
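
        /* the three sub_ps calls above form the residual (source minus prediction) per
         * plane: luma with the full-size strides (m_size), chroma with the chroma
         * strides (m_csize) implied by the active color space */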

        uint32_t tuDepthRange[2];
        cu.getInterTUQtDepthRange(tuDepthRange, 0);

        residualTransformQuantInter(*bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange);

        if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
            cu.setPredModeSubParts(MODE_SKIP);
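
        /* a 2Nx2N merge PU with a zero root CBF is by definition a skip: no motion
         * delta and no residual are signalled, so promoting it to MODE_SKIP lets it
         * use the cheaper skip syntax */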

        /* residualTransformQuantInter() wrote transformed residual back into
         * resiYuv. Generate the recon pixels by adding it to the prediction */

        PicYuv& reconPic = *m_frame->m_reconPic;
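
        /* for each plane below: if the plane has coded coefficients (CBF set), recon is
         * prediction + residual via add_ps; otherwise the residual is zero and the
         * prediction is copied straight into the recon picture via copy_pp */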
        if (cu.m_cbf[0][0])
            primitives.luma_add_ps[sizeIdx](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
                                            predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
        else
            primitives.luma_copy_pp[sizeIdx](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
                                             predY, predYuv.m_size);

        if (cu.m_cbf[1][0])
            primitives.chroma[m_csp].add_ps[sizeIdx](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                     predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
        else
            primitives.chroma[m_csp].copy_pp[sizeIdx](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                      predU, predYuv.m_csize);

        if (cu.m_cbf[2][0])
            primitives.chroma[m_csp].add_ps[sizeIdx](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                     predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
        else
            primitives.chroma[m_csp].copy_pp[sizeIdx](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                      predV, predYuv.m_csize);
    }

    checkDQP(cu, cuGeom);
    cu.updatePic(cuGeom.depth);
}

void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth)
{
    if (m_param->rdLevel >= 3)
    {
        /* code the split flag (0 or 1) and update bit costs */
        mode.contexts.resetBits();
        mode.contexts.codeSplitFlag(mode.cu, 0, depth);
        uint32_t bits = mode.contexts.getNumberOfWrittenBits();
        mode.mvBits += bits;
        mode.totalBits += bits;
        updateModeCost(mode);
    }
    else if (m_param->rdLevel <= 1)
    {
        mode.sa8dBits++;
        mode.sa8dCost = m_rdCost.calcRdSADCost(mode.distortion, mode.sa8dBits);
    }
    else
    {
        mode.mvBits++;
        mode.totalBits++;
        updateModeCost(mode);
    }
}

void Analysis::checkDQP(CUData& cu, const CUGeom& cuGeom)
{
    if (m_slice->m_pps->bUseDQP && cuGeom.depth <= m_slice->m_pps->maxCuDQPDepth)
    {
        if (cu.m_cuDepth[0] > cuGeom.depth) // detect splits
        {
            bool hasResidual = false;
            for (uint32_t absPartIdx = 0; absPartIdx < cu.m_numPartitions; absPartIdx++)
            {
                if (cu.getQtRootCbf(absPartIdx))
                {
                    hasResidual = true;
                    break;
                }
            }
            if (hasResidual)
                cu.setQPSubCUs(cu.getRefQP(0), 0, cuGeom.depth);
            else
                cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
        }
        else
        {
            if (!cu.getCbf(0, TEXT_LUMA, 0) && !cu.getCbf(0, TEXT_CHROMA_U, 0) && !cu.getCbf(0, TEXT_CHROMA_V, 0))
                cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
        }
    }
}

uint32_t Analysis::topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom)
{
    /* Do not attempt to code a block larger than the largest block in the
     * co-located CTUs in L0 and L1 */
    int currentQP = parentCTU.m_qp[0];
    int previousQP = currentQP;
    uint32_t minDepth0 = 4, minDepth1 = 4;
    uint32_t sum = 0;
    int numRefs = 0;
    if (m_slice->m_numRefIdx[0])
    {
        numRefs++;
        const CUData& cu = *m_slice->m_refPicList[0][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
        previousQP = cu.m_qp[0];
        if (!cu.m_cuDepth[cuGeom.encodeIdx])
            return 0;
        for (uint32_t i = 0; i < cuGeom.numPartitions && minDepth0; i += 4)
        {
            uint32_t d = cu.m_cuDepth[cuGeom.encodeIdx + i];
            minDepth0 = X265_MIN(d, minDepth0);
            sum += d;
        }
    }
    if (m_slice->m_numRefIdx[1])
    {
        numRefs++;
        const CUData& cu = *m_slice->m_refPicList[1][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
        if (!cu.m_cuDepth[cuGeom.encodeIdx])
            return 0;
        for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
        {
            uint32_t d = cu.m_cuDepth[cuGeom.encodeIdx + i];
            minDepth1 = X265_MIN(d, minDepth1);
            sum += d;
        }
    }
    if (!numRefs)
        return 0;

    uint32_t minDepth = X265_MIN(minDepth0, minDepth1);
    uint32_t thresh = minDepth * numRefs * (cuGeom.numPartitions >> 2);
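
    /* Here 'sum' totals the co-located depths sampled at every 4th partition, so each
     * referenced CTU contributes (numPartitions >> 2) samples; 'thresh' is what that
     * total would be if every sample equaled minDepth. The test below accepts up to
     * thresh + thresh/2, i.e. an average co-located depth of at most 1.5 * minDepth. */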

    /* allow block size growth if QP is rising or avg depth is
     * less than 1.5 of min depth */
    if (minDepth && currentQP >= previousQP && (sum <= thresh + (thresh >> 1)))
        minDepth -= 1;

    return minDepth;
}

/* returns true if recursion should be stopped */
bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode)
{
    /* early exit when the RD cost of best mode at depth n is less than the sum
     * of average of RD cost of the neighbor CU's (above, aboveleft, aboveright,
     * left, colocated) and avg cost of that CU at depth "n" with weightage for
     * each quantity */

    uint32_t depth = cuGeom.depth;
    FrameData& curEncData = *m_frame->m_encData;
    FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
    uint64_t cuCost = cuStat.avgCost[depth] * cuStat.count[depth];
    uint64_t cuCount = cuStat.count[depth];

    uint64_t neighCost = 0, neighCount = 0;
    const CUData* above = parentCTU.m_cuAbove;
    if (above)
    {
        FrameData::RCStatCU& astat = curEncData.m_cuStat[above->m_cuAddr];
        neighCost += astat.avgCost[depth] * astat.count[depth];
        neighCount += astat.count[depth];
    }
    const CUData* aboveLeft = parentCTU.m_cuAboveLeft;
    if (aboveLeft)
    {
        FrameData::RCStatCU& lstat = curEncData.m_cuStat[aboveLeft->m_cuAddr];
        neighCost += lstat.avgCost[depth] * lstat.count[depth];
        neighCount += lstat.count[depth];
    }

    const CUData* aboveRight = parentCTU.m_cuAboveRight;
    if (aboveRight)
    {
        FrameData::RCStatCU& rstat = curEncData.m_cuStat[aboveRight->m_cuAddr];
        neighCost += rstat.avgCost[depth] * rstat.count[depth];
        neighCount += rstat.count[depth];
    }

    const CUData* left = parentCTU.m_cuLeft;
    if (left)
    {
        FrameData::RCStatCU& nstat = curEncData.m_cuStat[left->m_cuAddr];
        neighCost += nstat.avgCost[depth] * nstat.count[depth];
        neighCount += nstat.count[depth];
    }

    // give 60% weight to all CU's and 40% weight to neighbour CU's
    if (neighCount + cuCount)
    {
        uint64_t avgCost = ((3 * cuCost) + (2 * neighCost)) / ((3 * cuCount) + (2 * neighCount));
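        /* the 3:2 weights realize the 60%/40% split noted above: 3/(3+2) = 60% for the
         * current CTU's accumulated cost, 2/(3+2) = 40% for the neighbours, applied to
         * both numerator and denominator so avgCost remains a per-CU average */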
        uint64_t curCost = m_param->rdLevel > 1 ? bestMode.rdCost : bestMode.sa8dCost;
        if (curCost < avgCost && avgCost)