source/encoder/analysis.cpp
1/*****************************************************************************
2* Copyright (C) 2013 x265 project
3*
4* Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
5* Steve Borho <steve@borho.org>
6*
7* This program is free software; you can redistribute it and/or modify
8* it under the terms of the GNU General Public License as published by
9* the Free Software Foundation; either version 2 of the License, or
10* (at your option) any later version.
11*
12* This program is distributed in the hope that it will be useful,
13* but WITHOUT ANY WARRANTY; without even the implied warranty of
14* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15* GNU General Public License for more details.
16*
17* You should have received a copy of the GNU General Public License
18* along with this program; if not, write to the Free Software
19* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20*
21* This program is also available under a commercial proprietary license.
22* For more information, contact us at license @ x265.com.
23*****************************************************************************/
24
25#include "common.h"
26#include "frame.h"
27#include "framedata.h"
28#include "picyuv.h"
29#include "primitives.h"
30#include "threading.h"
31
32#include "analysis.h"
33#include "rdcost.h"
34#include "encoder.h"
35
36#include "PPA/ppa.h"
37
38using namespace x265;
39
40/* An explanation of rate distortion levels (--rd-level)
41 *
42 * rd-level 0 generates no recon per CU (NO RDO or Quant)
43 *
44 * sa8d selection between merge / skip / inter / intra and split
45 * no recon pixels generated until CTU analysis is complete, requiring
46 * intra predictions to use source pixels
47 *
48 * rd-level 1 uses RDO for merge and skip, sa8d for all else
49 *
50 * RDO selection between merge and skip
51 * sa8d selection between (merge/skip) / inter modes / intra and split
52 * intra prediction uses reconstructed pixels
53 *
54 * rd-level 2 uses RDO for merge/skip and split
55 *
56 * RDO selection between merge and skip
57 * sa8d selection between (merge/skip) / inter modes / intra
58 * RDO split decisions
59 *
60 * rd-level 3 uses RDO for merge/skip/best inter/intra
61 *
62 * RDO selection between merge and skip
63 * sa8d selection of best inter mode
64 * RDO selection between (merge/skip) / best inter mode / intra / split
65 *
66 * rd-level 4 enables RDOQuant
67 *
 68 * rd-levels 5 and 6 do RDO for each inter mode
69 */
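/* In this file: I slices use compressIntraCU(); for P/B slices, rd levels 0-4 use
 * compressInterCU_rd0_4() (or compressInterCU_dist() when distributed mode analysis
 * is enabled and rd >= 2) and rd levels 5-6 use compressInterCU_rd5_6(); see
 * compressCTU() below */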
70
71Analysis::Analysis()
72{
73 m_totalNumJobs = m_numAcquiredJobs = m_numCompletedJobs = 0;
74}
75
76bool Analysis::create(ThreadLocalData *tld)
77{
78 m_tld = tld;
79 m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2;
80
81 int csp = m_param->internalCsp;
82 uint32_t cuSize = g_maxCUSize;
83
84 bool ok = true;
85 for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++, cuSize >>= 1)
86 {
87 ModeDepth &md = m_modeDepth[depth];
88
89 md.cuMemPool.create(depth, csp, MAX_PRED_TYPES);
90 ok &= md.fencYuv.create(cuSize, csp);
91
92 for (int j = 0; j < MAX_PRED_TYPES; j++)
93 {
94 md.pred[j].cu.initialize(md.cuMemPool, depth, csp, j);
95 ok &= md.pred[j].predYuv.create(cuSize, csp);
96 ok &= md.pred[j].reconYuv.create(cuSize, csp);
97 md.pred[j].fencYuv = &md.fencYuv;
98 }
99 }
100
101 return ok;
102}
103
104void Analysis::destroy()
105{
106 for (uint32_t i = 0; i <= g_maxCUDepth; i++)
107 {
108 m_modeDepth[i].cuMemPool.destroy();
109 m_modeDepth[i].fencYuv.destroy();
110
111 for (int j = 0; j < MAX_PRED_TYPES; j++)
112 {
113 m_modeDepth[i].pred[j].predYuv.destroy();
114 m_modeDepth[i].pred[j].reconYuv.destroy();
115 }
116 }
117}
118
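/* Analyze one CTU: load the initial entropy context and source pixels, run intra or
 * inter analysis as dictated by the slice type and RD level, and return the best
 * mode tree rooted at depth 0 */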
119Search::Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
120{
121 m_slice = ctu.m_slice;
122 m_frame = &frame;
123
124 invalidateContexts(0);
125 m_quant.setQPforQuant(ctu);
126 m_rqt[0].cur.load(initialContext);
127 m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_origPicYuv, ctu.m_cuAddr, 0);
128
129 uint32_t numPartition = ctu.m_numPartitions;
130 if (m_slice->m_sliceType == I_SLICE)
131 {
132 uint32_t zOrder = 0;
133 if (m_param->analysisMode == X265_ANALYSIS_LOAD)
134 compressIntraCU(ctu, cuGeom, m_frame->m_intraData, zOrder);
135 else
136 {
137 compressIntraCU(ctu, cuGeom, NULL, zOrder);
138
139 if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_intraData)
140 {
141 CUData *bestCU = &m_modeDepth[0].bestMode->cu;
142 memcpy(&m_frame->m_intraData->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
143 memcpy(&m_frame->m_intraData->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
144 memcpy(&m_frame->m_intraData->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
145 m_frame->m_intraData->cuAddr[ctu.m_cuAddr] = ctu.m_cuAddr;
146 m_frame->m_intraData->poc[ctu.m_cuAddr] = m_frame->m_poc;
147 }
148 }
149 }
150 else
151 {
152 if (!m_param->rdLevel)
153 {
154 /* In RD level 0, copy source pixels into the reconstructed block so
155 * they are available for intra predictions */
156 m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPicYuv, ctu.m_cuAddr, 0);
157
158 compressInterCU_rd0_4(ctu, cuGeom); // TODO: this really wants to be compressInterCU_rd0_1
159
160 /* generate residual for entire CTU at once and copy to reconPic */
161 encodeResidue(ctu, cuGeom);
162 }
163 else if (m_param->bDistributeModeAnalysis && m_param->rdLevel >= 2)
164 compressInterCU_dist(ctu, cuGeom);
165 else if (m_param->rdLevel <= 4)
166 compressInterCU_rd0_4(ctu, cuGeom);
167 else
168 compressInterCU_rd5_6(ctu, cuGeom);
169 }
170
171 return *m_modeDepth[0].bestMode;
172}
173
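/* Re-encode the current best mode with transquant bypass (CU lossless) and keep it
 * if it lowers the RD cost */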
174void Analysis::tryLossless(const CUGeom& cuGeom)
175{
176 ModeDepth& md = m_modeDepth[cuGeom.depth];
177
178 if (!md.bestMode->distortion)
179 /* already lossless */
180 return;
181 else if (md.bestMode->cu.m_predMode[0] == MODE_INTRA)
182 {
183 md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
184 PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
185 uint8_t* modes = md.pred[PRED_LOSSLESS].cu.m_lumaIntraDir;
186 checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size, modes);
187 checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
188 }
189 else
190 {
191 md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
192 md.pred[PRED_LOSSLESS].predYuv.copyFromYuv(md.bestMode->predYuv);
193 encodeResAndCalcRdInterCU(md.pred[PRED_LOSSLESS], cuGeom);
194 checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
195 }
196}
197
198void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, x265_intra_data* shared, uint32_t& zOrder)
199{
200 uint32_t depth = cuGeom.depth;
201 ModeDepth& md = m_modeDepth[depth];
202 md.bestMode = NULL;
203
204 bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
205 bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
206
207 if (shared)
208 {
209 uint8_t* sharedDepth = &shared->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
210 char* sharedPartSizes = &shared->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
211 uint8_t* sharedModes = &shared->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
212
213 if (mightNotSplit && depth == sharedDepth[zOrder] && zOrder == cuGeom.encodeIdx)
214 {
215 m_quant.setQPforQuant(parentCTU);
216
217 PartSize size = (PartSize)sharedPartSizes[zOrder];
218 Mode& mode = size == SIZE_2Nx2N ? md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN];
219 mode.cu.initSubCU(parentCTU, cuGeom);
220 checkIntra(mode, cuGeom, size, sharedModes);
221 checkBestMode(mode, depth);
222
223 if (m_bTryLossless)
224 tryLossless(cuGeom);
225
226 if (mightSplit)
227 addSplitFlagCost(*md.bestMode, cuGeom.depth);
228
229 // increment zOrder offset to point to next best depth in sharedDepth buffer
230 zOrder += g_depthInc[g_maxCUDepth - 1][sharedDepth[zOrder]];
231 mightSplit = false;
232 }
233 }
234 else if (mightNotSplit)
235 {
236 m_quant.setQPforQuant(parentCTU);
237
238 md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
239 checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL);
240 checkBestMode(md.pred[PRED_INTRA], depth);
241
242 if (depth == g_maxCUDepth)
243 {
244 md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
245 checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL);
246 checkBestMode(md.pred[PRED_INTRA_NxN], depth);
247 }
248
249 if (m_bTryLossless)
250 tryLossless(cuGeom);
251
252 if (mightSplit)
253 addSplitFlagCost(*md.bestMode, cuGeom.depth);
254 }
255
256 if (mightSplit)
257 {
258 Mode* splitPred = &md.pred[PRED_SPLIT];
259 splitPred->initCosts();
260 CUData* splitCU = &splitPred->cu;
261 splitCU->initSubCU(parentCTU, cuGeom);
262
263 uint32_t nextDepth = depth + 1;
264 ModeDepth& nd = m_modeDepth[nextDepth];
265 invalidateContexts(nextDepth);
266 Entropy* nextContext = &m_rqt[depth].cur;
267
268 for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
269 {
270 const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
271 if (childCuData.flags & CUGeom::PRESENT)
272 {
273 m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
274 m_rqt[nextDepth].cur.load(*nextContext);
275 compressIntraCU(parentCTU, childCuData, shared, zOrder);
276
277 // Save best CU and pred data for this sub CU
278 splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
279 splitPred->addSubCosts(*nd.bestMode);
280 nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
281 nextContext = &nd.bestMode->contexts;
282 }
283 else
284 {
285 /* record the depth of this non-present sub-CU */
286 splitCU->setEmptyPart(childCuData, subPartIdx);
287 zOrder += g_depthInc[g_maxCUDepth - 1][nextDepth];
288 }
289 }
290 nextContext->store(splitPred->contexts);
291 if (mightNotSplit)
292 addSplitFlagCost(*splitPred, cuGeom.depth);
293 else
294 updateModeCost(*splitPred);
295 checkBestMode(*splitPred, depth);
296 }
297
298 checkDQP(md.bestMode->cu, cuGeom);
299
300 /* Copy best data to encData CTU and recon */
301 md.bestMode->cu.copyToPic(depth);
302 if (md.bestMode != &md.pred[PRED_SPLIT])
303 md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, parentCTU.m_cuAddr, cuGeom.encodeIdx);
304}
305
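/* Claim an outstanding mode-analysis or motion-estimation job, if any remain.
 * Called by pool worker threads and by the master thread (threadId == -1);
 * returns true if a job was acquired and run, false when no work remains to claim */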
306bool Analysis::findJob(int threadId)
307{
308 /* try to acquire a CU mode to analyze */
309 if (m_totalNumJobs > m_numAcquiredJobs)
310 {
311 /* ATOMIC_INC returns the incremented value */
312 int id = ATOMIC_INC(&m_numAcquiredJobs);
313 if (m_totalNumJobs >= id)
314 {
315 parallelModeAnalysis(threadId, id - 1);
316
317 if (ATOMIC_INC(&m_numCompletedJobs) == m_totalNumJobs)
318 m_modeCompletionEvent.trigger();
319 return true;
320 }
321 }
322
323 if (m_totalNumME > m_numAcquiredME)
324 {
325 int id = ATOMIC_INC(&m_numAcquiredME);
326 if (m_totalNumME >= id)
327 {
328 parallelME(threadId, id - 1);
329
330 if (ATOMIC_INC(&m_numCompletedME) == m_totalNumME)
331 m_meCompletionEvent.trigger();
332 return true;
333 }
334 }
335
336 return false;
337}
338
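/* Perform motion estimation for a single reference of the current PU (list 0
 * references first, then list 1), either directly on the master (threadId == -1)
 * or on a thread-local copy of the analysis state */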
339void Analysis::parallelME(int threadId, int meId)
340{
341 Analysis* slave;
342
343 if (threadId == -1)
344 slave = this;
345 else
346 {
347 slave = &m_tld[threadId].analysis;
348 slave->setQP(*m_slice, m_rdCost.m_qp);
349 slave->m_slice = m_slice;
350 slave->m_frame = m_frame;
351
352 PicYuv* fencPic = m_frame->m_origPicYuv;
353 pixel* pu = fencPic->getLumaAddr(m_curMECu->m_cuAddr, m_curGeom->encodeIdx + m_puAbsPartIdx);
354 slave->m_me.setSourcePlane(fencPic->m_picOrg[0], fencPic->m_stride);
355 slave->m_me.setSourcePU(pu - fencPic->m_picOrg[0], m_puWidth, m_puHeight);
356
357 slave->prepMotionCompensation(*m_curMECu, *m_curGeom, m_curPart);
358 }
359
360 if (meId < m_slice->m_numRefIdx[0])
361 slave->singleMotionEstimation(*this, *m_curMECu, *m_curGeom, m_curPart, 0, meId);
362 else
363 slave->singleMotionEstimation(*this, *m_curMECu, *m_curGeom, m_curPart, 1, meId - m_slice->m_numRefIdx[0]);
364}
365
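/* Evaluate the candidate prediction selected by jobId (intra, 2Nx2N, rectangular
 * or AMP partitions). At rd 0-4 candidates are costed with sa8d; at rd 5-6 each
 * candidate is fully RD encoded */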
366void Analysis::parallelModeAnalysis(int threadId, int jobId)
367{
368 Analysis* slave;
369
370 if (threadId == -1)
371 slave = this;
372 else
373 {
374 slave = &m_tld[threadId].analysis;
375 slave->m_slice = m_slice;
376 slave->m_frame = m_frame;
377 slave->setQP(*m_slice, m_rdCost.m_qp);
378 slave->invalidateContexts(0);
379 if (jobId)
380 slave->m_me.setSourcePlane(m_frame->m_origPicYuv->m_picOrg[0], m_frame->m_origPicYuv->m_stride);
381 }
382
383 ModeDepth& md = m_modeDepth[m_curGeom->depth];
384
385 if (m_param->rdLevel <= 4)
386 {
387 switch (jobId)
388 {
389 case 0:
390 if (slave != this)
391 slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur);
392 slave->checkIntraInInter_rd0_4(md.pred[PRED_INTRA], *m_curGeom);
393 if (m_param->rdLevel > 2)
394 slave->encodeIntraInInter(md.pred[PRED_INTRA], *m_curGeom);
395 break;
396
397 case 1:
398 slave->checkInter_rd0_4(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N);
399 break;
400
401 case 2:
402 slave->checkInter_rd0_4(md.pred[PRED_Nx2N], *m_curGeom, SIZE_Nx2N);
403 break;
404
405 case 3:
406 slave->checkInter_rd0_4(md.pred[PRED_2NxN], *m_curGeom, SIZE_2NxN);
407 break;
408
409 case 4:
410 slave->checkInter_rd0_4(md.pred[PRED_2NxnU], *m_curGeom, SIZE_2NxnU);
411 break;
412
413 case 5:
414 slave->checkInter_rd0_4(md.pred[PRED_2NxnD], *m_curGeom, SIZE_2NxnD);
415 break;
416
417 case 6:
418 slave->checkInter_rd0_4(md.pred[PRED_nLx2N], *m_curGeom, SIZE_nLx2N);
419 break;
420
421 case 7:
422 slave->checkInter_rd0_4(md.pred[PRED_nRx2N], *m_curGeom, SIZE_nRx2N);
423 break;
424
425 default:
426 X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
427 break;
428 }
429 }
430 else
431 {
432 bool bMergeOnly = m_curGeom->log2CUSize == 6;
433 if (slave != this)
434 {
435 slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur);
436 slave->m_quant.setQPforQuant(md.pred[PRED_2Nx2N].cu);
437 }
438
439 switch (jobId)
440 {
441 case 0:
442 slave->checkIntra(md.pred[PRED_INTRA], *m_curGeom, SIZE_2Nx2N, NULL);
443 if (m_curGeom->depth == g_maxCUDepth && m_curGeom->log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
444 slave->checkIntra(md.pred[PRED_INTRA_NxN], *m_curGeom, SIZE_NxN, NULL);
445 break;
446
447 case 1:
448 slave->checkInter_rd5_6(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N, false);
449 break;
450
451 case 2:
452 slave->checkInter_rd5_6(md.pred[PRED_Nx2N], *m_curGeom, SIZE_Nx2N, false);
453 break;
454
455 case 3:
456 slave->checkInter_rd5_6(md.pred[PRED_2NxN], *m_curGeom, SIZE_2NxN, false);
457 break;
458
459 case 4:
460 slave->checkInter_rd5_6(md.pred[PRED_2NxnU], *m_curGeom, SIZE_2NxnU, bMergeOnly);
461 break;
462
463 case 5:
464 slave->checkInter_rd5_6(md.pred[PRED_2NxnD], *m_curGeom, SIZE_2NxnD, bMergeOnly);
465 break;
466
467 case 6:
468 slave->checkInter_rd5_6(md.pred[PRED_nLx2N], *m_curGeom, SIZE_nLx2N, bMergeOnly);
469 break;
470
471 case 7:
472 slave->checkInter_rd5_6(md.pred[PRED_nRx2N], *m_curGeom, SIZE_nRx2N, bMergeOnly);
473 break;
474
475 default:
476 X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
477 break;
478 }
479 }
480}
481
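/* Distributed mode analysis: queue one job per candidate prediction to the worker
 * pool while this thread performs merge/skip analysis, then combine the results
 * (sa8d selection at rd 2-4, RD comparison at rd 5-6) and recurse into sub-CUs */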
482void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom)
483{
484 uint32_t depth = cuGeom.depth;
485 uint32_t cuAddr = parentCTU.m_cuAddr;
486 ModeDepth& md = m_modeDepth[depth];
487 md.bestMode = NULL;
488
489 bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
490 bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
491 uint32_t minDepth = m_param->rdLevel <= 4 ? topSkipMinDepth(parentCTU, cuGeom) : 0;
492
493 X265_CHECK(m_param->rdLevel >= 2, "compressInterCU_dist does not support RD 0 or 1\n");
494
495 if (mightNotSplit && depth >= minDepth)
496 {
497 int bTryAmp = m_slice->m_sps->maxAMPDepth > depth && (cuGeom.log2CUSize < 6 || m_param->rdLevel > 4);
498 int bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;
499
500 /* Initialize all prediction CUs based on parentCTU */
501 md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
502 md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
503 md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
504 if (m_param->bEnableRectInter)
505 {
506 md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
507 md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
508 }
509 if (bTryAmp)
510 {
511 md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
512 md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
513 md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
514 md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
515 }
516 if (bTryIntra)
517 {
518 md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
519 if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
520 md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
521 }
522
523 m_totalNumJobs = 2 + m_param->bEnableRectInter * 2 + bTryAmp * 4;
524 m_numAcquiredJobs = !bTryIntra;
525 m_numCompletedJobs = m_numAcquiredJobs;
526 m_curGeom = &cuGeom;
527 m_bJobsQueued = true;
528 JobProvider::enqueue();
529
530 for (int i = 0; i < m_totalNumJobs - m_numCompletedJobs; i++)
531 m_pool->pokeIdleThread();
532
533 /* participate in processing jobs, until all are distributed */
534 while (findJob(-1))
535 ;
536
537 JobProvider::dequeue();
538 m_bJobsQueued = false;
539
540 /* the master worker thread (this one) does merge analysis. By doing
541 * merge after all the other jobs are at least started, we usually avoid
542 * blocking on another thread */
543
544 if (m_param->rdLevel <= 4)
545 {
546 checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
547
548 m_modeCompletionEvent.wait();
549
550 /* select best inter mode based on sa8d cost */
551 Mode *bestInter = &md.pred[PRED_2Nx2N];
552
553 if (m_param->bEnableRectInter)
554 {
555 if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
556 bestInter = &md.pred[PRED_Nx2N];
557 if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
558 bestInter = &md.pred[PRED_2NxN];
559 }
560
561 if (bTryAmp)
562 {
563 if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
564 bestInter = &md.pred[PRED_2NxnU];
565 if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
566 bestInter = &md.pred[PRED_2NxnD];
567 if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
568 bestInter = &md.pred[PRED_nLx2N];
569 if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
570 bestInter = &md.pred[PRED_nRx2N];
571 }
572
573 if (m_param->rdLevel > 2)
574 {
575 /* encode best inter */
576 for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
577 {
578 prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
579 motionCompensation(bestInter->predYuv, false, true);
580 }
581 encodeResAndCalcRdInterCU(*bestInter, cuGeom);
582
583 /* RD selection between merge, inter and intra */
584 checkBestMode(*bestInter, depth);
585
586 if (bTryIntra)
587 checkBestMode(md.pred[PRED_INTRA], depth);
588 }
589 else /* m_param->rdLevel == 2 */
590 {
591 if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
592 md.bestMode = bestInter;
593
594 if (bTryIntra && md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
595 {
596 md.bestMode = &md.pred[PRED_INTRA];
597 encodeIntraInInter(*md.bestMode, cuGeom);
598 }
599 else if (!md.bestMode->cu.m_mergeFlag[0])
600 {
601 /* finally code the best mode selected from SA8D costs */
602 for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++)
603 {
604 prepMotionCompensation(md.bestMode->cu, cuGeom, puIdx);
605 motionCompensation(md.bestMode->predYuv, false, true);
606 }
607 encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
608 }
609 }
610 }
611 else
612 {
613 checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
614 m_modeCompletionEvent.wait();
615
616 checkBestMode(md.pred[PRED_2Nx2N], depth);
617
618 if (m_param->bEnableRectInter)
619 {
620 checkBestMode(md.pred[PRED_Nx2N], depth);
621 checkBestMode(md.pred[PRED_2NxN], depth);
622 }
623
624 if (bTryAmp)
625 {
626 checkBestMode(md.pred[PRED_2NxnU], depth);
627 checkBestMode(md.pred[PRED_2NxnD], depth);
628 checkBestMode(md.pred[PRED_nLx2N], depth);
629 checkBestMode(md.pred[PRED_nRx2N], depth);
630 }
631
632 if (bTryIntra)
633 {
634 checkBestMode(md.pred[PRED_INTRA], depth);
635 if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
636 checkBestMode(md.pred[PRED_INTRA_NxN], depth);
637 }
638 }
639
640 if (md.bestMode->rdCost == MAX_INT64 && !bTryIntra)
641 {
642 md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
643 checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom);
644 encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
645 checkBestMode(md.pred[PRED_INTRA], depth);
646 }
647
648 if (m_bTryLossless)
649 tryLossless(cuGeom);
650
651 if (mightSplit)
652 addSplitFlagCost(*md.bestMode, cuGeom.depth);
653 }
654
655 bool bNoSplit = false;
656 if (md.bestMode)
657 {
658 bNoSplit = !!md.bestMode->cu.isSkipped(0);
659 if (mightSplit && depth && depth >= minDepth && !bNoSplit && m_param->rdLevel <= 4)
660 bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
661 }
662
663 if (mightSplit && !bNoSplit)
664 {
665 Mode* splitPred = &md.pred[PRED_SPLIT];
666 splitPred->initCosts();
667 CUData* splitCU = &splitPred->cu;
668 splitCU->initSubCU(parentCTU, cuGeom);
669
670 uint32_t nextDepth = depth + 1;
671 ModeDepth& nd = m_modeDepth[nextDepth];
672 invalidateContexts(nextDepth);
673 Entropy* nextContext = &m_rqt[depth].cur;
674
675 for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
676 {
677 const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
678 if (childCuData.flags & CUGeom::PRESENT)
679 {
680 m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
681 m_rqt[nextDepth].cur.load(*nextContext);
682 compressInterCU_dist(parentCTU, childCuData);
683
684 // Save best CU and pred data for this sub CU
685 splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
686 splitPred->addSubCosts(*nd.bestMode);
687
688 nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
689 nextContext = &nd.bestMode->contexts;
690 }
691 else
692 splitCU->setEmptyPart(childCuData, subPartIdx);
693 }
694 nextContext->store(splitPred->contexts);
695
696 if (mightNotSplit)
697 addSplitFlagCost(*splitPred, cuGeom.depth);
698 else
699 updateModeCost(*splitPred);
700
701 checkBestMode(*splitPred, depth);
702 }
703
704 if (!depth || md.bestMode->cu.m_predMode[0] != MODE_INTRA)
705 {
706 /* early-out statistics */
707 FrameData& curEncData = const_cast<FrameData&>(*m_frame->m_encData);
708 FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
709 uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
710 cuStat.count[depth] += 1;
711 cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
712 }
713
714 checkDQP(md.bestMode->cu, cuGeom);
715
716 /* Copy best data to encData CTU and recon */
717 md.bestMode->cu.copyToPic(depth);
718 if (md.bestMode != &md.pred[PRED_SPLIT])
719 md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cuAddr, cuGeom.encodeIdx);
720}
721
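/* Inter analysis for rd levels 0-4: merge/skip first, then inter partitions chosen
 * by sa8d cost and an optional intra candidate; the winner is encoded only to the
 * extent the RD level requires before recursing into sub-CUs */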
722void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom)
723{
724 uint32_t depth = cuGeom.depth;
725 uint32_t cuAddr = parentCTU.m_cuAddr;
726 ModeDepth& md = m_modeDepth[depth];
727 md.bestMode = NULL;
728
729 bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
730 bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
731 uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);
732
733 if (mightNotSplit && depth >= minDepth)
734 {
735 bool bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;
736
737 /* Initialize all prediction CUs based on parentCTU */
738 md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
739 md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
740 md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
741 if (m_param->bEnableRectInter)
742 {
743 md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
744 md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
745 }
746 if (m_slice->m_sps->maxAMPDepth > depth && cuGeom.log2CUSize < 6)
747 {
748 md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
749 md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
750 md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
751 md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
752 }
753
754 /* Compute Merge Cost */
755 checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
756
757 bool earlyskip = false;
758 if (m_param->rdLevel)
759 earlyskip = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth
760
761 if (!earlyskip)
762 {
763 checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N);
764 Mode *bestInter = &md.pred[PRED_2Nx2N];
765
766 if (m_param->bEnableRectInter)
767 {
768 checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N);
769 if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
770 bestInter = &md.pred[PRED_Nx2N];
771 checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN);
772 if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
773 bestInter = &md.pred[PRED_2NxN];
774 }
775
776 if (m_slice->m_sps->maxAMPDepth > depth && cuGeom.log2CUSize < 6)
777 {
778 bool bHor = false, bVer = false;
779 if (bestInter->cu.m_partSize[0] == SIZE_2NxN)
780 bHor = true;
781 else if (bestInter->cu.m_partSize[0] == SIZE_Nx2N)
782 bVer = true;
783 else if (bestInter->cu.m_partSize[0] == SIZE_2Nx2N &&
784 md.bestMode && md.bestMode->cu.getQtRootCbf(0))
785 {
786 bHor = true;
787 bVer = true;
788 }
789
790 if (bHor)
791 {
792 checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU);
793 if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
794 bestInter = &md.pred[PRED_2NxnU];
795 checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD);
796 if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
797 bestInter = &md.pred[PRED_2NxnD];
798 }
799 if (bVer)
800 {
801 checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N);
802 if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
803 bestInter = &md.pred[PRED_nLx2N];
804 checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N);
805 if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
806 bestInter = &md.pred[PRED_nRx2N];
807 }
808 }
809
810 if (m_param->rdLevel >= 3)
811 {
812 /* Calculate RD cost of best inter option */
813 for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
814 {
815 prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
816 motionCompensation(bestInter->predYuv, false, true);
817 }
818
819 encodeResAndCalcRdInterCU(*bestInter, cuGeom);
820
821 if (!md.bestMode || bestInter->rdCost < md.bestMode->rdCost)
822 md.bestMode = bestInter;
823
824 if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) ||
825 md.bestMode->sa8dCost == MAX_INT64)
826 {
827 md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
828 checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom);
829 encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
830 if (md.pred[PRED_INTRA].rdCost < md.bestMode->rdCost)
831 md.bestMode = &md.pred[PRED_INTRA];
832 }
833 }
834 else
835 {
836 /* SA8D choice between merge/skip, inter, and intra */
837 if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
838 md.bestMode = bestInter;
839
840 if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64)
841 {
842 md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
843 checkIntraInInter_rd0_4(md.pred[PRED_INTRA], cuGeom);
844 if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
845 md.bestMode = &md.pred[PRED_INTRA];
846 }
847
848 /* finally code the best mode selected by SA8D costs:
849 * RD level 2 - fully encode the best mode
850 * RD level 1 - generate recon pixels
851 * RD level 0 - generate chroma prediction */
852 if (md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)
853 {
854 /* prediction already generated for this CU, and if rd level
855 * is not 0, it is already fully encoded */
856 }
857 else if (md.bestMode->cu.m_predMode[0] == MODE_INTER)
858 {
859 for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++)
860 {
861 prepMotionCompensation(md.bestMode->cu, cuGeom, puIdx);
862 motionCompensation(md.bestMode->predYuv, false, true);
863 }
864 if (m_param->rdLevel == 2)
865 encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
866 else if (m_param->rdLevel == 1)
867 {
868 m_rqt[cuGeom.depth].tmpResiYuv.subtract(md.fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize);
869 generateCoeffRecon(*md.bestMode, cuGeom);
870 }
871 }
872 else
873 {
874 if (m_param->rdLevel == 2)
875 encodeIntraInInter(*md.bestMode, cuGeom);
876 else if (m_param->rdLevel == 1)
877 generateCoeffRecon(*md.bestMode, cuGeom);
878 }
879 }
880 } // !earlyskip
881
882 if (m_bTryLossless)
883 tryLossless(cuGeom);
884
885 if (mightSplit)
886 addSplitFlagCost(*md.bestMode, cuGeom.depth);
887 }
888
889 bool bNoSplit = false;
890 if (md.bestMode)
891 {
892 bNoSplit = !!md.bestMode->cu.isSkipped(0);
893 if (mightSplit && depth && depth >= minDepth && !bNoSplit)
894 bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
895 }
896
897 if (mightSplit && !bNoSplit)
898 {
899 Mode* splitPred = &md.pred[PRED_SPLIT];
900 splitPred->initCosts();
901 CUData* splitCU = &splitPred->cu;
902 splitCU->initSubCU(parentCTU, cuGeom);
903
904 uint32_t nextDepth = depth + 1;
905 ModeDepth& nd = m_modeDepth[nextDepth];
906 invalidateContexts(nextDepth);
907 Entropy* nextContext = &m_rqt[depth].cur;
908
909 for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
910 {
911 const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
912 if (childCuData.flags & CUGeom::PRESENT)
913 {
914 m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
915 m_rqt[nextDepth].cur.load(*nextContext);
916 compressInterCU_rd0_4(parentCTU, childCuData);
917
918 // Save best CU and pred data for this sub CU
919 splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
920 splitPred->addSubCosts(*nd.bestMode);
921
922 if (m_param->rdLevel)
923 nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
924 else
925 nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childCuData.numPartitions * subPartIdx);
926 if (m_param->rdLevel > 1)
927 nextContext = &nd.bestMode->contexts;
928 }
929 else
930 splitCU->setEmptyPart(childCuData, subPartIdx);
931 }
932 nextContext->store(splitPred->contexts);
933
934 if (mightNotSplit)
935 addSplitFlagCost(*splitPred, cuGeom.depth);
936 else if (m_param->rdLevel <= 1)
937 splitPred->sa8dCost = m_rdCost.calcRdSADCost(splitPred->distortion, splitPred->sa8dBits);
938 else
939 updateModeCost(*splitPred);
940
941 if (!md.bestMode)
942 md.bestMode = splitPred;
943 else if (m_param->rdLevel >= 1)
944 {
945 if (splitPred->rdCost < md.bestMode->rdCost)
946 md.bestMode = splitPred;
947 }
948 else
949 {
950 if (splitPred->sa8dCost < md.bestMode->sa8dCost)
951 md.bestMode = splitPred;
952 }
953 }
954
955 if (!depth || md.bestMode->cu.m_predMode[0] != MODE_INTRA)
956 {
957 /* early-out statistics */
958 FrameData& curEncData = const_cast<FrameData&>(*m_frame->m_encData);
959 FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
960 uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
961 cuStat.count[depth] += 1;
962 cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
963 }
964
965 checkDQP(md.bestMode->cu, cuGeom);
966
967 /* Copy best data to encData CTU and recon */
968 md.bestMode->cu.copyToPic(depth);
969 if (md.bestMode != &md.pred[PRED_SPLIT] && m_param->rdLevel)
970 md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cuAddr, cuGeom.encodeIdx);
971}
972
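/* Inter analysis for rd levels 5-6: every candidate mode is fully RD encoded and
 * compared (with optional CBF-based early outs) before the split cost is estimated */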
973void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom)
974{
975 uint32_t depth = cuGeom.depth;
976 ModeDepth& md = m_modeDepth[depth];
977 md.bestMode = NULL;
978
979 bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
980 bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
981
982 if (mightNotSplit)
983 {
984 for (int i = 0; i < MAX_PRED_TYPES; i++)
985 md.pred[i].cu.initSubCU(parentCTU, cuGeom);
986
987 checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
988 bool earlySkip = m_param->bEnableEarlySkip && md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
989
990 if (!earlySkip)
991 {
992 checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, false);
993 checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
994
995 if (m_param->bEnableRectInter)
996 {
997 // Nx2N rect
998 if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
999 {
1000 checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, false);
1001 checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);
1002 }
1003 if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
1004 {
1005 checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, false);
1006 checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
1007 }
1008 }
1009
1010 // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N)
1011 if (m_slice->m_sps->maxAMPDepth > depth)
1012 {
1013 bool bMergeOnly = cuGeom.log2CUSize == 6;
1014
1015 bool bHor = false, bVer = false;
1016 if (md.bestMode->cu.m_partSize[0] == SIZE_2NxN)
1017 bHor = true;
1018 else if (md.bestMode->cu.m_partSize[0] == SIZE_Nx2N)
1019 bVer = true;
1020 else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0] && !md.bestMode->cu.isSkipped(0))
1021 {
1022 bHor = true;
1023 bVer = true;
1024 }
1025
1026 if (bHor)
1027 {
1028 if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
1029 {
1030 checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, bMergeOnly);
1031 checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);
1032 }
1033 if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
1034 {
1035 checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, bMergeOnly);
1036 checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
1037 }
1038 }
1039 if (bVer)
1040 {
1041 if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
1042 {
1043 checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, bMergeOnly);
1044 checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);
1045 }
1046 if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
1047 {
1048 checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, bMergeOnly);
1049 checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
1050 }
1051 }
1052 }
1053
1054 if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) &&
1055 (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)))
1056 {
1057 checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL);
1058 checkBestMode(md.pred[PRED_INTRA], depth);
1059
1060 if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
1061 {
1062 checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL);
1063 checkBestMode(md.pred[PRED_INTRA_NxN], depth);
1064 }
1065 }
1066 }
1067
1068 if (m_bTryLossless)
1069 tryLossless(cuGeom);
1070
1071 if (mightSplit)
1072 addSplitFlagCost(*md.bestMode, cuGeom.depth);
1073 }
1074
1075 // estimate split cost
1076 if (mightSplit && (!md.bestMode || !md.bestMode->cu.isSkipped(0)))
1077 {
1078 Mode* splitPred = &md.pred[PRED_SPLIT];
1079 splitPred->initCosts();
1080 CUData* splitCU = &splitPred->cu;
1081 splitCU->initSubCU(parentCTU, cuGeom);
1082
1083 uint32_t nextDepth = depth + 1;
1084 ModeDepth& nd = m_modeDepth[nextDepth];
1085 invalidateContexts(nextDepth);
1086 Entropy* nextContext = &m_rqt[depth].cur;
1087
1088 for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
1089 {
1090 const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
1091 if (childCuData.flags & CUGeom::PRESENT)
1092 {
1093 m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childCuData.encodeIdx);
1094 m_rqt[nextDepth].cur.load(*nextContext);
1095 compressInterCU_rd5_6(parentCTU, childCuData);
1096
1097 // Save best CU and pred data for this sub CU
1098 splitCU->copyPartFrom(nd.bestMode->cu, childCuData, subPartIdx);
1099 splitPred->addSubCosts(*nd.bestMode);
1100 nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childCuData.numPartitions * subPartIdx);
1101 nextContext = &nd.bestMode->contexts;
1102 }
1103 else
1104 splitCU->setEmptyPart(childCuData, subPartIdx);
1105 }
1106 nextContext->store(splitPred->contexts);
1107 if (mightNotSplit)
1108 addSplitFlagCost(*splitPred, cuGeom.depth);
1109 else
1110 updateModeCost(*splitPred);
1111
1112 checkBestMode(*splitPred, depth);
1113 }
1114
1115 checkDQP(md.bestMode->cu, cuGeom);
1116
1117 /* Copy best data to encData CTU and recon */
1118 md.bestMode->cu.copyToPic(depth);
1119 if (md.bestMode != &md.pred[PRED_SPLIT])
1120 md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, parentCTU.m_cuAddr, cuGeom.encodeIdx);
1121}
1122
1123/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
1124void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom)
1125{
1126 uint32_t depth = cuGeom.depth;
1127 ModeDepth& md = m_modeDepth[depth];
1128 Yuv *fencYuv = &md.fencYuv;
1129
1130 /* Note that these two Mode instances are named MERGE and SKIP but they may
1131 * hold the reverse when the function returns. We toggle between the two modes */
1132 Mode* tempPred = &merge;
1133 Mode* bestPred = &skip;
1134
1135 X265_CHECK(m_slice->m_sliceType != I_SLICE, "Evaluating merge in I slice\n");
1136
1137 tempPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
1138 tempPred->cu.setPredModeSubParts(MODE_INTER);
1139 tempPred->cu.m_mergeFlag[0] = true;
1140
1141 bestPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
1142 bestPred->cu.setPredModeSubParts(MODE_INTER);
1143 bestPred->cu.m_mergeFlag[0] = true;
1144
1145 MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
1146 uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS];
1147 uint32_t maxNumMergeCand = tempPred->cu.getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours);
1148
1149 bestPred->sa8dCost = MAX_INT64;
1150 int bestSadCand = -1;
1151 int sizeIdx = cuGeom.log2CUSize - 2;
1152 for (uint32_t i = 0; i < maxNumMergeCand; ++i)
1153 {
1154 if (m_bFrameParallel &&
1155 (mvFieldNeighbours[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
1156 mvFieldNeighbours[i][1].mv.y >= (m_param->searchRange + 1) * 4))
1157 continue;
1158
1159 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx
1160 tempPred->cu.m_interDir[0] = interDirNeighbours[i];
1161 tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
1162 tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx;
1163 tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
1164 tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx;
1165
1166 // do MC only for Luma part
1167 prepMotionCompensation(tempPred->cu, cuGeom, 0);
1168 motionCompensation(tempPred->predYuv, true, false);
1169
1170 tempPred->sa8dBits = getTUBits(i, maxNumMergeCand);
1171 tempPred->distortion = primitives.sa8d[sizeIdx](fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
1172 tempPred->sa8dCost = m_rdCost.calcRdSADCost(tempPred->distortion, tempPred->sa8dBits);
1173
1174 if (tempPred->sa8dCost < bestPred->sa8dCost)
1175 {
1176 bestSadCand = i;
1177 std::swap(tempPred, bestPred);
1178 }
1179 }
1180
1181 /* no valid merge candidate; leave md.bestMode NULL so mode decision must take inter or intra */
1182 if (bestSadCand < 0)
1183 return;
1184
1185 /* calculate the motion compensation for chroma for the best mode selected */
1186 prepMotionCompensation(bestPred->cu, cuGeom, 0);
1187 motionCompensation(bestPred->predYuv, false, true);
1188
1189 if (m_param->rdLevel)
1190 {
1191 if (m_param->bLossless)
1192 bestPred->rdCost = MAX_INT64;
1193 else
1194 encodeResAndCalcRdSkipCU(*bestPred);
1195
1196 /* Encode with residual */
1197 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand;
1198 tempPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0);
1199 tempPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0);
1200 tempPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
1201 tempPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0);
1202 tempPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
1203 tempPred->sa8dCost = bestPred->sa8dCost;
1204 tempPred->predYuv.copyFromYuv(bestPred->predYuv);
1205
1206 encodeResAndCalcRdInterCU(*tempPred, cuGeom);
1207
1208 md.bestMode = tempPred->rdCost < bestPred->rdCost ? tempPred : bestPred;
1209 }
1210 else
1211 md.bestMode = bestPred;
1212
1213 /* broadcast sets of MV field data */
1214 bestPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0);
1215 bestPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0);
1216 bestPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
1217 bestPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0);
1218 bestPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
1219}
1220
1221/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
1222void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom)
1223{
1224 uint32_t depth = cuGeom.depth;
1225
1226 /* Note that these two Mode instances are named MERGE and SKIP but they may
1227 * hold the reverse when the function returns. We toggle between the two modes */
1228 Mode* tempPred = &merge;
1229 Mode* bestPred = &skip;
1230
1231 merge.cu.setPredModeSubParts(MODE_INTER);
1232 merge.cu.setPartSizeSubParts(SIZE_2Nx2N);
1233 merge.cu.m_mergeFlag[0] = true;
1234
1235 skip.cu.setPredModeSubParts(MODE_INTER);
1236 skip.cu.setPartSizeSubParts(SIZE_2Nx2N);
1237 skip.cu.m_mergeFlag[0] = true;
1238
1239 MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
1240 uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS];
1241 uint32_t maxNumMergeCand = merge.cu.getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours);
1242
1243 bool foundCbf0Merge = false;
1244 bool triedPZero = false, triedBZero = false;
1245 bestPred->rdCost = MAX_INT64;
1246 for (uint32_t i = 0; i < maxNumMergeCand; i++)
1247 {
1248 if (m_bFrameParallel &&
1249 (mvFieldNeighbours[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
1250 mvFieldNeighbours[i][1].mv.y >= (m_param->searchRange + 1) * 4))
1251 continue;
1252
1253 /* the merge candidate list is packed with MV(0,0) ref 0 when it is not full */
1254 if (interDirNeighbours[i] == 1 && !mvFieldNeighbours[i][0].mv.word && !mvFieldNeighbours[i][0].refIdx)
1255 {
1256 if (triedPZero)
1257 continue;
1258 triedPZero = true;
1259 }
1260 else if (interDirNeighbours[i] == 3 &&
1261 !mvFieldNeighbours[i][0].mv.word && !mvFieldNeighbours[i][0].refIdx &&
1262 !mvFieldNeighbours[i][1].mv.word && !mvFieldNeighbours[i][1].refIdx)
1263 {
1264 if (triedBZero)
1265 continue;
1266 triedBZero = true;
1267 }
1268
1269 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; /* merge candidate ID is stored in L0 MVP idx */
1270 tempPred->cu.m_interDir[0] = interDirNeighbours[i];
1271 tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
1272 tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx;
1273 tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
1274 tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx;
1275 tempPred->cu.setSkipFlagSubParts(false); /* must be cleared between encode iterations */
1276
1277 prepMotionCompensation(tempPred->cu, cuGeom, 0);
1278 motionCompensation(tempPred->predYuv, true, true);
1279
1280 uint8_t hasCbf = true;
1281 bool swapped = false;
1282 if (!foundCbf0Merge)
1283 {
1284 /* if the best prediction has CBF (not a skip) then try merge with residual */
1285
1286 encodeResAndCalcRdInterCU(*tempPred, cuGeom);
1287 hasCbf = tempPred->cu.getQtRootCbf(0);
1288 foundCbf0Merge = !hasCbf;
1289
1290 if (tempPred->rdCost < bestPred->rdCost)
1291 {
1292 std::swap(tempPred, bestPred);
1293 swapped = true;
1294 }
1295 }
1296 if (!m_param->bLossless && hasCbf)
1297 {
1298 /* try merge without residual (skip), if not lossless coding */
1299
1300 if (swapped)
1301 {
1302 tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;
1303 tempPred->cu.m_interDir[0] = interDirNeighbours[i];
1304 tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
1305 tempPred->cu.m_refIdx[0][0] = (char)mvFieldNeighbours[i][0].refIdx;
1306 tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
1307 tempPred->cu.m_refIdx[1][0] = (char)mvFieldNeighbours[i][1].refIdx;
1308 tempPred->cu.setSkipFlagSubParts(false);
1309 tempPred->predYuv.copyFromYuv(bestPred->predYuv);
1310 }
1311
1312 encodeResAndCalcRdSkipCU(*tempPred);
1313
1314 if (tempPred->rdCost < bestPred->rdCost)
1315 std::swap(tempPred, bestPred);
1316 }
1317 }
1318
1319 if (bestPred->rdCost < MAX_INT64)
1320 {
1321 m_modeDepth[depth].bestMode = bestPred;
1322
1323 /* broadcast sets of MV field data */
1324 uint32_t bestCand = bestPred->cu.m_mvpIdx[0][0];
1325 bestPred->cu.setPUInterDir(interDirNeighbours[bestCand], 0, 0);
1326 bestPred->cu.setPUMv(0, mvFieldNeighbours[bestCand][0].mv, 0, 0);
1327 bestPred->cu.setPURefIdx(0, (char)mvFieldNeighbours[bestCand][0].refIdx, 0, 0);
1328 bestPred->cu.setPUMv(1, mvFieldNeighbours[bestCand][1].mv, 0, 0);
1329 bestPred->cu.setPURefIdx(1, (char)mvFieldNeighbours[bestCand][1].refIdx, 0, 0);
1330 }
1331}
1332
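/* Motion search for the given partition size; on success the mode is costed with
 * sa8d only (no residual coding at this stage) */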
1333void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize)
1334{
1335 interMode.initCosts();
1336 interMode.cu.setPartSizeSubParts(partSize);
1337 interMode.cu.setPredModeSubParts(MODE_INTER);
1338
1339 if (predInterSearch(interMode, cuGeom, false, false))
1340 {
1341 /* predInterSearch sets interMode.sa8dBits */
1342 const Yuv& fencYuv = *interMode.fencYuv;
1343 Yuv& predYuv = interMode.predYuv;
1344 interMode.distortion = primitives.sa8d[cuGeom.log2CUSize - 2](fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
1345 interMode.sa8dCost = m_rdCost.calcRdSADCost(interMode.distortion, interMode.sa8dBits);
1346 }
1347 else
1348 {
1349 interMode.distortion = MAX_UINT;
1350 interMode.sa8dCost = MAX_INT64;
1351 }
1352}
1353
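/* Motion search for the given partition size; on success the mode is fully encoded
 * to obtain its RD cost */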
1354void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, bool bMergeOnly)
1355{
1356 interMode.initCosts();
1357 interMode.cu.setPartSizeSubParts(partSize);
1358 interMode.cu.setPredModeSubParts(MODE_INTER);
1359
1360 if (predInterSearch(interMode, cuGeom, bMergeOnly, true))
1361 {
1362 /* predInterSearch sets interMode.sa8dBits, but this is ignored */
1363 encodeResAndCalcRdInterCU(interMode, cuGeom);
1364 }
1365 else
1366 {
1367 interMode.distortion = MAX_UINT;
1368 interMode.rdCost = MAX_INT64;
1369 }
1370}
1371
1372/* Note that this function does not save the best intra prediction, it must
1373 * be generated later. It records the best mode in the cu */
1374void Analysis::checkIntraInInter_rd0_4(Mode& intraMode, const CUGeom& cuGeom)
1375{
1376 CUData& cu = intraMode.cu;
1377 uint32_t depth = cu.m_cuDepth[0];
1378
1379 cu.setPartSizeSubParts(SIZE_2Nx2N);
1380 cu.setPredModeSubParts(MODE_INTRA);
1381
1382 uint32_t initTrDepth = 0;
1383 uint32_t log2TrSize = cu.m_log2CUSize[0] - initTrDepth;
1384 uint32_t tuSize = 1 << log2TrSize;
1385 const uint32_t absPartIdx = 0;
1386
1387 // Reference sample smoothing
1388 initAdiPattern(cu, cuGeom, absPartIdx, initTrDepth, ALL_IDX);
1389
1390 pixel* fenc = m_modeDepth[depth].fencYuv.m_buf[0];
1391 uint32_t stride = m_modeDepth[depth].fencYuv.m_size;
1392
1393 pixel *above = m_refAbove + tuSize - 1;
1394 pixel *aboveFiltered = m_refAboveFlt + tuSize - 1;
1395 pixel *left = m_refLeft + tuSize - 1;
1396 pixel *leftFiltered = m_refLeftFlt + tuSize - 1;
1397 int sad, bsad;
1398 uint32_t bits, bbits, mode, bmode;
1399 uint64_t cost, bcost;
1400
1401 // buffers for source scaling, transposition, and all 33 angular predictions generated in one batch
1402 ALIGN_VAR_32(pixel, bufScale[32 * 32]);
1403 ALIGN_VAR_32(pixel, bufTrans[32 * 32]);
1404 ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]);
1405 int scaleTuSize = tuSize;
1406 int scaleStride = stride;
1407 int costShift = 0;
1408 int sizeIdx = log2TrSize - 2;
1409
1410 if (tuSize > 32)
1411 {
1412 // original block is 64x64; scale to 32x32 and set up the required parameters
1413 primitives.scale2D_64to32(bufScale, fenc, stride);
1414 fenc = bufScale;
1415
1416 // reserve space in case primitives need to store data in above
1417 // or left buffers
1418 pixel _above[4 * 32 + 1];
1419 pixel _left[4 * 32 + 1];
1420 pixel *aboveScale = _above + 2 * 32;
1421 pixel *leftScale = _left + 2 * 32;
1422 aboveScale[0] = leftScale[0] = above[0];
1423 primitives.scale1D_128to64(aboveScale + 1, above + 1, 0);
1424 primitives.scale1D_128to64(leftScale + 1, left + 1, 0);
1425
1426 scaleTuSize = 32;
1427 scaleStride = 32;
1428 costShift = 2;
1429 sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
1430
1431 // point both the filtered and unfiltered above/left reference pointers at the scaled buffers
1432 above = aboveScale;
1433 left = leftScale;
1434 aboveFiltered = aboveScale;
1435 leftFiltered = leftScale;
1436 }
1437
1438 pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
1439 int predsize = scaleTuSize * scaleTuSize;
1440
1441 m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
1442
1443 /* there are three cost tiers for intra modes:
1444 * pred[0] - most probable mode, least cost
1445 * pred[1], pred[2] - less probable, slightly more cost
1446 * non-mpm modes - all cost the same (rbits) */
1447 uint64_t mpms;
1448 uint32_t preds[3];
1449 uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms);
1450
1451 // DC
1452 primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
1453 bsad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
1454 bmode = mode = DC_IDX;
1455 bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
1456 bcost = m_rdCost.calcRdSADCost(bsad, bbits);
1457
1458 pixel *abovePlanar = above;
1459 pixel *leftPlanar = left;
1460
1461 if (tuSize & (8 | 16 | 32))
1462 {
1463 abovePlanar = aboveFiltered;
1464 leftPlanar = leftFiltered;
1465 }
1466
1467 // PLANAR
1468 primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
1469 sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
1470 mode = PLANAR_IDX;
1471 bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
1472 cost = m_rdCost.calcRdSADCost(sad, bits);
1473 COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
1474
1475 // Transpose NxN
1476 primitives.transpose[sizeIdx](bufTrans, fenc, scaleStride);
1477
1478 primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
1479
1480 bool modeHor;
1481 pixel *cmp;
1482 intptr_t srcStride;
1483
1484#define TRY_ANGLE(angle) \
1485 modeHor = angle < 18; \
1486 cmp = modeHor ? bufTrans : fenc; \
1487 srcStride = modeHor ? scaleTuSize : scaleStride; \
1488 sad = sa8d(cmp, srcStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \
1489 bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
1490 cost = m_rdCost.calcRdSADCost(sad, bits)
1491
1492 if (m_param->bEnableFastIntra)
1493 {
1494 int asad = 0;
1495 uint32_t lowmode, highmode, amode = 5, abits = 0;
1496 uint64_t acost = MAX_INT64;
1497
1498 /* pick the best angle, sampling at distance of 5 */
1499 for (mode = 5; mode < 35; mode += 5)
1500 {
1501 TRY_ANGLE(mode);
1502 COPY4_IF_LT(acost, cost, amode, mode, asad, sad, abits, bits);
1503 }
1504
1505 /* refine best angle at distance 2, then distance 1 */
1506 for (uint32_t dist = 2; dist >= 1; dist--)
1507 {
1508 lowmode = amode - dist;
1509 highmode = amode + dist;
1510
1511 X265_CHECK(lowmode >= 2 && lowmode <= 34, "low intra mode out of range\n");
1512 TRY_ANGLE(lowmode);
1513 COPY4_IF_LT(acost, cost, amode, lowmode, asad, sad, abits, bits);
1514
1515 X265_CHECK(highmode >= 2 && highmode <= 34, "high intra mode out of range\n");
1516 TRY_ANGLE(highmode);
1517 COPY4_IF_LT(acost, cost, amode, highmode, asad, sad, abits, bits);
1518 }
1519
1520 if (amode == 33)
1521 {
1522 TRY_ANGLE(34);
1523 COPY4_IF_LT(acost, cost, amode, 34, asad, sad, abits, bits);
1524 }
1525
1526 COPY4_IF_LT(bcost, acost, bmode, amode, bsad, asad, bbits, abits);
1527 }
1528 else // calculate and search all intra prediction angles for lowest cost
1529 {
1530 for (mode = 2; mode < 35; mode++)
1531 {
1532 TRY_ANGLE(mode);
1533 COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
1534 }
1535 }
1536
1537 cu.setLumaIntraDirSubParts((uint8_t)bmode, absPartIdx, depth + initTrDepth);
1538 intraMode.initCosts();
1539 intraMode.totalBits = bbits;
1540 intraMode.distortion = bsad;
1541 intraMode.sa8dCost = bcost;
1542 intraMode.sa8dBits = bbits;
1543}
1544
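/* Fully encode the chosen 2Nx2N intra candidate within an inter slice: code the
 * luma RQT and chroma, then count bits and distortion to complete its RD cost */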
1545void Analysis::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
1546{
1547 CUData& cu = intraMode.cu;
1548 Yuv* reconYuv = &intraMode.reconYuv;
1549 Yuv* fencYuv = &m_modeDepth[cuGeom.depth].fencYuv;
1550
1551 X265_CHECK(cu.m_partSize[0] == SIZE_2Nx2N, "encodeIntraInInter does not expect NxN intra\n");
1552 X265_CHECK(!m_slice->isIntra(), "encodeIntraInInter does not expect to be used in I slices\n");
1553
1554 m_quant.setQPforQuant(cu);
1555
1556 uint32_t tuDepthRange[2];
1557 cu.getIntraTUQtDepthRange(tuDepthRange, 0);
1558
1559 m_entropyCoder.load(m_rqt[cuGeom.depth].cur);
1560
1561 Cost icosts;
1562 codeIntraLumaQT(intraMode, cuGeom, 0, 0, false, icosts, tuDepthRange);
1563 extractIntraResultQT(cu, *reconYuv, 0, 0);
1564
1565 intraMode.distortion = icosts.distortion;
1566 intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom);
1567
1568 m_entropyCoder.resetBits();
1569 if (m_slice->m_pps->bTransquantBypassEnabled)
1570 m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
1571 m_entropyCoder.codeSkipFlag(cu, 0);
1572 m_entropyCoder.codePredMode(cu.m_predMode[0]);
1573 m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
1574 m_entropyCoder.codePredInfo(cu, 0);
1575 intraMode.mvBits += m_entropyCoder.getNumberOfWrittenBits();
1576
1577 bool bCodeDQP = m_slice->m_pps->bUseDQP;
1578 m_entropyCoder.codeCoeff(cu, 0, cuGeom.depth, bCodeDQP, tuDepthRange);
1579
1580 intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
1581 intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
1582 if (m_rdCost.m_psyRd)
1583 intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
1584
1585 m_entropyCoder.store(intraMode.contexts);
1586 updateModeCost(intraMode);
1587}
1588
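/* RD level 0 path: after CTU analysis, revisit each CU at its chosen depth and
 * generate the final residual and reconstruction (or copy the prediction when no
 * residual is coded) into the recon picture */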
1589void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
1590{
1591 if (cuGeom.depth < ctu.m_cuDepth[cuGeom.encodeIdx] && cuGeom.depth < g_maxCUDepth)
1592 {
1593 for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
1594 {
1595 const CUGeom& childCuData = *(&cuGeom + cuGeom.childOffset + subPartIdx);
1596 if (childCuData.flags & CUGeom::PRESENT)
1597 encodeResidue(ctu, childCuData);
1598 }
1599 return;
1600 }
1601
1602 uint32_t absPartIdx = cuGeom.encodeIdx;
1603 int sizeIdx = cuGeom.log2CUSize - 2;
1604
1605 Yuv& fencYuv = m_modeDepth[0].fencYuv;
1606
1607 /* reuse the bestMode data structures at the current depth */
1608 Mode *bestMode = m_modeDepth[cuGeom.depth].bestMode;
1609 Yuv& reconYuv = bestMode->reconYuv;
1610 CUData& cu = bestMode->cu;
1611
1612 cu.copyFromPic(ctu, cuGeom);
1613 m_quant.setQPforQuant(cu);
1614
1615 if (cu.m_predMode[0] == MODE_INTRA)
1616 {
1617 uint32_t tuDepthRange[2];
1618 cu.getIntraTUQtDepthRange(tuDepthRange, 0);
1619
1620 uint32_t initTrDepth = cu.m_partSize[0] == SIZE_NxN;
1621 residualTransformQuantIntra(*bestMode, cuGeom, initTrDepth, 0, tuDepthRange);
1622 getBestIntraModeChroma(*bestMode, cuGeom);
1623 residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
1624 }
1625 else if (cu.m_predMode[0] == MODE_INTER)
1626 {
1627 X265_CHECK(!ctu.m_skipFlag[absPartIdx], "skip not expected prior to transform\n");
1628
1629 /* Calculate residual for current CU part into depth sized resiYuv */
1630
1631 ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
1632
1633 /* at RD 0, the prediction pixels are accumulated into the top depth predYuv */
1634 Yuv& predYuv = m_modeDepth[0].bestMode->predYuv;
1635 pixel* predY = predYuv.getLumaAddr(absPartIdx);
1636 pixel* predU = predYuv.getCbAddr(absPartIdx);
1637 pixel* predV = predYuv.getCrAddr(absPartIdx);
1638
1639 primitives.luma_sub_ps[sizeIdx](resiYuv.m_buf[0], resiYuv.m_size,
1640 fencYuv.getLumaAddr(absPartIdx), predY,
1641 fencYuv.m_size, predYuv.m_size);
1642
1643 primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[1], resiYuv.m_csize,
1644 fencYuv.getCbAddr(absPartIdx), predU,
1645 fencYuv.m_csize, predYuv.m_csize);
1646
1647 primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[2], resiYuv.m_csize,
1648 fencYuv.getCrAddr(absPartIdx), predV,
1649 fencYuv.m_csize, predYuv.m_csize);
1650
1651 uint32_t tuDepthRange[2];
1652 cu.getInterTUQtDepthRange(tuDepthRange, 0);
1653
1654 residualTransformQuantInter(*bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange);
1655
1656 if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
1657 cu.setSkipFlagSubParts(true);
1658
1659 PicYuv& reconPicYuv = *m_frame->m_reconPicYuv;
1660 if (cu.getQtRootCbf(0)) // TODO: split to each component
1661 {
1662 /* residualTransformQuantInter() wrote transformed residual back into
1663 * resiYuv. Generate the recon pixels by adding it to the prediction */
1664
1665 primitives.luma_add_ps[sizeIdx](reconYuv.m_buf[0], reconYuv.m_size,
1666 predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
1667 primitives.chroma[m_csp].add_ps[sizeIdx](reconYuv.m_buf[1], reconYuv.m_csize,
1668 predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
1669 primitives.chroma[m_csp].add_ps[sizeIdx](reconYuv.m_buf[2], reconYuv.m_csize,
1670 predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
1671
1672 /* copy the reconstructed part to the recon pic for later intra
1673 * predictions */
1674 reconYuv.copyToPicYuv(*m_frame->m_reconPicYuv, cu.m_cuAddr, absPartIdx);
1675 }
1676 else
1677 {
1678 /* copy the prediction pixels to the recon pic for later intra
1679 * predictions */
1680
1681 primitives.luma_copy_pp[sizeIdx](reconPicYuv.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_stride,
1682 predY, predYuv.m_size);
1683 primitives.chroma[m_csp].copy_pp[sizeIdx](reconPicYuv.getCbAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_strideC,
1684 predU, predYuv.m_csize);
1685 primitives.chroma[m_csp].copy_pp[sizeIdx](reconPicYuv.getCrAddr(cu.m_cuAddr, absPartIdx), reconPicYuv.m_strideC,
1686 predV, predYuv.m_csize);
1687 }
1688 }
1689 /* else if (cu.m_predMode[0] == MODE_NONE) {} */
1690
1691 checkDQP(cu, cuGeom);
1692 cu.updatePic(cuGeom.depth);
1693}
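
/* Reference sketch (illustrative only, not part of the encoder): the add_ps
 * primitives used above combine a pixel prediction plane with a signed 16-bit
 * residual plane and clip the sum back to pixel range. A plain-C equivalent for
 * one square block would look roughly like:
 *
 *     static void addResidualRef(pixel* dst, intptr_t dstStride,
 *                                const pixel* pred, intptr_t predStride,
 *                                const int16_t* resi, intptr_t resiStride, int size)
 *     {
 *         for (int y = 0; y < size; y++, dst += dstStride, pred += predStride, resi += resiStride)
 *             for (int x = 0; x < size; x++)
 *                 dst[x] = (pixel)clipPixel(pred[x] + resi[x]); // clipPixel() is a hypothetical clamp to [0, max pixel value]
 *     }
 *
 * The optimized primitives.luma_add_ps / chroma add_ps kernels are the per-size,
 * per-csp versions of this loop; addResidualRef and clipPixel are illustrative names. */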
1694
1695/* record the given mode as the best at its depth if its RD cost beats the current best */
1696void Analysis::checkBestMode(Mode& mode, uint32_t depth)
1697{
1698 ModeDepth& md = m_modeDepth[depth];
1699 if (md.bestMode)
1700 {
1701 if (mode.rdCost < md.bestMode->rdCost)
1702 md.bestMode = &mode;
1703 }
1704 else
1705 md.bestMode = &mode;
1706}
1707
1708void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth)
1709{
1710 if (m_param->rdLevel >= 3)
1711 {
1712 /* code the split flag (0 or 1) and update bit costs */
1713 mode.contexts.resetBits();
1714 mode.contexts.codeSplitFlag(mode.cu, 0, depth);
1715 uint32_t bits = mode.contexts.getNumberOfWrittenBits();
1716 mode.mvBits += bits;
1717 mode.totalBits += bits;
1718 updateModeCost(mode);
1719 }
1720 else if (m_param->rdLevel <= 1)
1721 {
1722 mode.sa8dBits++;
1723 mode.sa8dCost = m_rdCost.calcRdSADCost(mode.distortion, mode.sa8dBits);
1724 }
1725 else
1726 {
1727 mode.mvBits++;
1728 mode.totalBits++;
1729 updateModeCost(mode);
1730 }
1731}
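
/* Worked example (hypothetical numbers): at rd-level 2 the split flag is charged as
 * a flat extra bit before updateModeCost() recomputes rdCost, so with a hypothetical
 * lambda2 of 16 the mode cost moves by roughly
 *
 *     uint64_t delta = 16 * 1;   // ~lambda2 * extra bits
 *
 * At rd-level >= 3 the flag is CABAC-estimated via mode.contexts.codeSplitFlag() and
 * may cost more or less than one bit depending on context state; at rd-level <= 1 the
 * charge lands on sa8dCost through calcRdSADCost() instead. */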
1732
1733void Analysis::checkDQP(CUData& cu, const CUGeom& cuGeom)
1734{
1735 if (m_slice->m_pps->bUseDQP && cuGeom.depth <= m_slice->m_pps->maxCuDQPDepth)
1736 {
1737 if (cu.m_cuDepth[0] > cuGeom.depth) // detect splits
1738 {
1739 bool hasResidual = false;
1740 for (uint32_t absPartIdx = 0; absPartIdx < cu.m_numPartitions; absPartIdx++)
1741 {
1742 if (cu.getQtRootCbf(absPartIdx))
1743 {
1744 hasResidual = true;
1745 break;
1746 }
1747 }
1748 if (hasResidual)
1749 cu.setQPSubCUs(cu.getRefQP(0), 0, cuGeom.depth);
1750 else
1751 cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
1752 }
1753 else
1754 {
1755 if (!cu.getCbf(0, TEXT_LUMA, 0) && !cu.getCbf(0, TEXT_CHROMA_U, 0) && !cu.getCbf(0, TEXT_CHROMA_V, 0))
1756 cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth);
1757 }
1758 }
1759}
1760
1761uint32_t Analysis::topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom)
1762{
1763 /* Do not attempt to code a block larger than the largest block in the
1764 * co-located CTUs in L0 and L1 */
1765 int currentQP = parentCTU.m_qp[0];
1766 int previousQP = currentQP;
1767 uint32_t minDepth0 = 4, minDepth1 = 4;
1768 uint32_t sum = 0;
1769 int numRefs = 0;
1770 if (m_slice->m_numRefIdx[0])
1771 {
1772 numRefs++;
1773 const CUData& cu = *m_slice->m_refPicList[0][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
1774 previousQP = cu.m_qp[0];
1775 if (!cu.m_cuDepth[cuGeom.encodeIdx])
1776 return 0;
1777 for (uint32_t i = 0; i < cuGeom.numPartitions && minDepth0; i += 4)
1778 {
1779 uint32_t d = cu.m_cuDepth[cuGeom.encodeIdx + i];
1780 minDepth0 = X265_MIN(d, minDepth0);
1781 sum += d;
1782 }
1783 }
1784 if (m_slice->m_numRefIdx[1])
1785 {
1786 numRefs++;
1787 const CUData& cu = *m_slice->m_refPicList[1][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
1788 if (!cu.m_cuDepth[cuGeom.encodeIdx])
1789 return 0;
1790 for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
1791 {
1792 uint32_t d = cu.m_cuDepth[cuGeom.encodeIdx + i];
1793 minDepth1 = X265_MIN(d, minDepth1);
1794 sum += d;
1795 }
1796 }
1797 if (!numRefs)
1798 return 0;
1799
1800 uint32_t minDepth = X265_MIN(minDepth0, minDepth1);
1801 uint32_t thresh = minDepth * numRefs * (cuGeom.numPartitions >> 2);
1802
1803    /* allow one level of block size growth when QP is not dropping and the
1804     * average co-located depth is no more than 1.5x the minimum depth */
1805 if (minDepth && currentQP >= previousQP && (sum <= thresh + (thresh >> 1)))
1806 minDepth -= 1;
1807
1808 return minDepth;
1809}
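
/* Worked example (hypothetical numbers): for a 64x64 CTU (cuGeom.numPartitions == 256)
 * with both reference lists available (numRefs == 2), the loops sample every 4th
 * partition, i.e. 64 depths per list. If every sampled co-located depth is 2 except
 * one partition at depth 1, then minDepth == 1 and sum ~= 255, while
 *
 *     thresh = 1 * 2 * (256 >> 2) = 128;   // threshold with slack = 128 + 64 = 192
 *
 * Since 255 > 192 the floor stays at depth 1 and 64x64 CUs are not attempted. If the
 * sampled depths averaged close to minDepth (sum <= 192) and QP were not dropping,
 * minDepth would be relaxed to 0 and the full CTU size considered. */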
1810
1811/* returns true if recursion should be stopped */
1812bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode)
1813{
1814    /* early exit when the RD cost of the best mode at depth n is already below
1815     * a weighted average of the per-depth CU costs recorded for this CTU and
1816     * its available neighbour CTUs (above, above-left, above-right and left),
1817     * with a fixed 60/40 weighting between the two */
1818
1819 uint32_t depth = cuGeom.depth;
1820 FrameData& curEncData = const_cast<FrameData&>(*m_frame->m_encData);
1821 FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
1822 uint64_t cuCost = cuStat.avgCost[depth] * cuStat.count[depth];
1823 uint64_t cuCount = cuStat.count[depth];
1824
1825 uint64_t neighCost = 0, neighCount = 0;
1826 const CUData* above = parentCTU.m_cuAbove;
1827 if (above)
1828 {
1829 FrameData::RCStatCU& astat = curEncData.m_cuStat[above->m_cuAddr];
1830 neighCost += astat.avgCost[depth] * astat.count[depth];
1831 neighCount += astat.count[depth];
1832
1833 const CUData* aboveLeft = parentCTU.m_cuAboveLeft;
1834 if (aboveLeft)
1835 {
1836 FrameData::RCStatCU& lstat = curEncData.m_cuStat[aboveLeft->m_cuAddr];
1837 neighCost += lstat.avgCost[depth] * lstat.count[depth];
1838 neighCount += lstat.count[depth];
1839 }
1840
1841 const CUData* aboveRight = parentCTU.m_cuAboveRight;
1842 if (aboveRight)
1843 {
1844 FrameData::RCStatCU& rstat = curEncData.m_cuStat[aboveRight->m_cuAddr];
1845 neighCost += rstat.avgCost[depth] * rstat.count[depth];
1846 neighCount += rstat.count[depth];
1847 }
1848 }
1849 const CUData* left = parentCTU.m_cuLeft;
1850 if (left)
1851 {
1852 FrameData::RCStatCU& nstat = curEncData.m_cuStat[left->m_cuAddr];
1853 neighCost += nstat.avgCost[depth] * nstat.count[depth];
1854 neighCount += nstat.count[depth];
1855 }
1856
1857    // weight this CTU's own cost history at 60% and the neighbour CTUs' at 40%
1858 if (neighCost + cuCount)
1859 {
1860 uint64_t avgCost = ((3 * cuCost) + (2 * neighCost)) / ((3 * cuCount) + (2 * neighCount));
1861 uint64_t curCost = m_param->rdLevel > 1 ? bestMode.rdCost : bestMode.sa8dCost;
1862 if (curCost < avgCost && avgCost)
1863 return true;
1864 }
1865
1866 return false;
1867}
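
/* Worked example (hypothetical numbers): suppose this CTU has already coded 4 CUs at
 * this depth for a total cost of 4000 (cuCount == 4) and the available neighbour CTUs
 * contribute neighCost == 6000 over neighCount == 8. Then
 *
 *     avgCost = (3 * 4000 + 2 * 6000) / (3 * 4 + 2 * 8) = 24000 / 28 ~= 857
 *
 * and recursion into smaller CUs stops only when the best mode's cost (rdCost, or
 * sa8dCost at rd-level <= 1) is already below that weighted average. */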