Commit | Line | Data |
---|---|---|
72b9787e JB |
1 | /***************************************************************************** |
2 | * Copyright (C) 2013 x265 project | |
3 | * | |
4 | * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com> | |
5 | * Steve Borho <steve@borho.org> | |
6 | * | |
7 | * This program is free software; you can redistribute it and/or modify | |
8 | * it under the terms of the GNU General Public License as published by | |
9 | * the Free Software Foundation; either version 2 of the License, or | |
10 | * (at your option) any later version. | |
11 | * | |
12 | * This program is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | * GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with this program; if not, write to the Free Software | |
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
20 | * | |
21 | * This program is also available under a commercial proprietary license. | |
22 | * For more information, contact us at license @ x265.com. | |
23 | *****************************************************************************/ | |
24 | ||
25 | #include "common.h" | |
26 | #include "frame.h" | |
27 | #include "framedata.h" | |
28 | #include "picyuv.h" | |
29 | #include "primitives.h" | |
30 | #include "threading.h" | |
31 | ||
32 | #include "analysis.h" | |
33 | #include "rdcost.h" | |
34 | #include "encoder.h" | |
35 | ||
72b9787e JB |
36 | using namespace x265; |
37 | ||
38 | /* An explanation of rate distortion levels (--rd-level) | |
39 | * | |
40 | * rd-level 0 generates no recon per CU (NO RDO or Quant) | |
41 | * | |
42 | * sa8d selection between merge / skip / inter / intra and split | |
43 | * no recon pixels generated until CTU analysis is complete, requiring | |
44 | * intra predictions to use source pixels | |
45 | * | |
46 | * rd-level 1 uses RDO for merge and skip, sa8d for all else | |
47 | * | |
48 | * RDO selection between merge and skip | |
49 | * sa8d selection between (merge/skip) / inter modes / intra and split | |
50 | * intra prediction uses reconstructed pixels | |
51 | * | |
52 | * rd-level 2 uses RDO for merge/skip and split | |
53 | * | |
54 | * RDO selection between merge and skip | |
55 | * sa8d selection between (merge/skip) / inter modes / intra | |
56 | * RDO split decisions | |
57 | * | |
58 | * rd-level 3 uses RDO for merge/skip/best inter/intra | |
59 | * | |
60 | * RDO selection between merge and skip | |
61 | * sa8d selection of best inter mode | |
b53f7c52 | 62 | * sa8d decisions include chroma residual cost |
72b9787e JB |
63 | * RDO selection between (merge/skip) / best inter mode / intra / split |
64 | * | |
65 | * rd-level 4 enables RDOQuant | |
b53f7c52 JB |
66 | * chroma residual cost included in satd decisions, including subpel refine |
67 | * (as a result of --subme 3 being used by preset slow) | |
72b9787e JB |
68 | * |
69 | * rd-levels 5 and 6 do RDO for each inter mode | |
70 | */ | |
71 | ||
72 | Analysis::Analysis() | |
73 | { | |
74 | m_totalNumJobs = m_numAcquiredJobs = m_numCompletedJobs = 0; | |
b53f7c52 JB |
75 | m_reuseIntraDataCTU = NULL; |
76 | m_reuseInterDataCTU = NULL; | |
72b9787e JB |
77 | } |
78 | ||
79 | bool Analysis::create(ThreadLocalData *tld) | |
80 | { | |
81 | m_tld = tld; | |
82 | m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2; | |
b53f7c52 | 83 | m_bChromaSa8d = m_param->rdLevel >= 3; |
72b9787e JB |
84 | |
85 | int csp = m_param->internalCsp; | |
86 | uint32_t cuSize = g_maxCUSize; | |
87 | ||
88 | bool ok = true; | |
89 | for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++, cuSize >>= 1) | |
90 | { | |
91 | ModeDepth &md = m_modeDepth[depth]; | |
92 | ||
93 | md.cuMemPool.create(depth, csp, MAX_PRED_TYPES); | |
94 | ok &= md.fencYuv.create(cuSize, csp); | |
95 | ||
96 | for (int j = 0; j < MAX_PRED_TYPES; j++) | |
97 | { | |
98 | md.pred[j].cu.initialize(md.cuMemPool, depth, csp, j); | |
99 | ok &= md.pred[j].predYuv.create(cuSize, csp); | |
100 | ok &= md.pred[j].reconYuv.create(cuSize, csp); | |
101 | md.pred[j].fencYuv = &md.fencYuv; | |
102 | } | |
103 | } | |
104 | ||
105 | return ok; | |
106 | } | |
107 | ||
108 | void Analysis::destroy() | |
109 | { | |
110 | for (uint32_t i = 0; i <= g_maxCUDepth; i++) | |
111 | { | |
112 | m_modeDepth[i].cuMemPool.destroy(); | |
113 | m_modeDepth[i].fencYuv.destroy(); | |
114 | ||
115 | for (int j = 0; j < MAX_PRED_TYPES; j++) | |
116 | { | |
117 | m_modeDepth[i].pred[j].predYuv.destroy(); | |
118 | m_modeDepth[i].pred[j].reconYuv.destroy(); | |
119 | } | |
120 | } | |
121 | } | |
122 | ||
/* Entry point for analysis of one CTU.  Loads the CTU's source pixels,
 * primes the quantizer and entropy context, then dispatches to the
 * intra or inter compression routine appropriate for the slice type and
 * configured RD level.  Returns the best mode selected at depth 0. */
Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
{
    m_slice = ctu.m_slice;
    m_frame = &frame;

    invalidateContexts(0);
    m_quant.setQPforQuant(ctu);
    m_rqt[0].cur.load(initialContext);
    m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);

    uint32_t numPartition = ctu.m_numPartitions;
    if (m_param->analysisMode)
    {
        /* point the reuse pointers at this frame's analysis buffers; the inter
         * pointer is offset to this CTU's slice of the data */
        m_reuseIntraDataCTU = (analysis_intra_data *)m_frame->m_analysisData.intraData;
        int numPredDir = m_slice->isInterP() ? 1 : 2;
        m_reuseInterDataCTU = (analysis_inter_data *)m_frame->m_analysisData.interData + ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir;
    }

    if (m_slice->m_sliceType == I_SLICE)
    {
        uint32_t zOrder = 0;
        compressIntraCU(ctu, cuGeom, zOrder);
        if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.intraData)
        {
            /* save the chosen depths, luma intra directions and partition sizes
             * so a later X265_ANALYSIS_LOAD encode can reuse them */
            CUData *bestCU = &m_modeDepth[0].bestMode->cu;
            memcpy(&m_reuseIntraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
            memcpy(&m_reuseIntraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
            memcpy(&m_reuseIntraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
        }
    }
    else
    {
        if (!m_param->rdLevel)
        {
            /* In RD Level 0/1, copy source pixels into the reconstructed block so
             * they are available for intra predictions */
            m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);

            compressInterCU_rd0_4(ctu, cuGeom);

            /* generate residual for entire CTU at once and copy to reconPic */
            encodeResidue(ctu, cuGeom);
        }
        else if (m_param->bDistributeModeAnalysis && m_param->rdLevel >= 2)
            compressInterCU_dist(ctu, cuGeom);
        else if (m_param->rdLevel <= 4)
            compressInterCU_rd0_4(ctu, cuGeom);
        else
            compressInterCU_rd5_6(ctu, cuGeom);
    }

    return *m_modeDepth[0].bestMode;
}
176 | ||
177 | void Analysis::tryLossless(const CUGeom& cuGeom) | |
178 | { | |
179 | ModeDepth& md = m_modeDepth[cuGeom.depth]; | |
180 | ||
181 | if (!md.bestMode->distortion) | |
182 | /* already lossless */ | |
183 | return; | |
b53f7c52 | 184 | else if (md.bestMode->cu.isIntra(0)) |
72b9787e JB |
185 | { |
186 | md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom); | |
187 | PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0]; | |
188 | uint8_t* modes = md.pred[PRED_LOSSLESS].cu.m_lumaIntraDir; | |
189 | checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size, modes); | |
190 | checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth); | |
191 | } | |
192 | else | |
193 | { | |
194 | md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom); | |
195 | md.pred[PRED_LOSSLESS].predYuv.copyFromYuv(md.bestMode->predYuv); | |
196 | encodeResAndCalcRdInterCU(md.pred[PRED_LOSSLESS], cuGeom); | |
197 | checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth); | |
198 | } | |
199 | } | |
200 | ||
/* Recursively analyze intra modes for one CU and, when allowed, its four
 * split sub-CUs, keeping the winner per depth in m_modeDepth[].  zOrder
 * tracks the current position in the shared analysis-reuse buffers and is
 * advanced as CUs are coded (analysis-load path) or found non-present. */
void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t& zOrder)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    /* a LEAF CU cannot split; a SPLIT_MANDATORY CU must split */
    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);

    if (m_param->analysisMode == X265_ANALYSIS_LOAD)
    {
        uint8_t* reuseDepth = &m_reuseIntraDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
        uint8_t* reuseModes = &m_reuseIntraDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
        char* reusePartSizes = &m_reuseIntraDataCTU->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];

        /* only code at this depth if the saved analysis selected this depth here */
        if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.encodeIdx)
        {
            m_quant.setQPforQuant(parentCTU);

            PartSize size = (PartSize)reusePartSizes[zOrder];
            Mode& mode = size == SIZE_2Nx2N ? md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN];
            mode.cu.initSubCU(parentCTU, cuGeom);
            checkIntra(mode, cuGeom, size, &reuseModes[zOrder]);
            checkBestMode(mode, depth);

            if (m_bTryLossless)
                tryLossless(cuGeom);

            if (mightSplit)
                addSplitFlagCost(*md.bestMode, cuGeom.depth);

            // increment zOrder offset to point to next best depth in sharedDepth buffer
            zOrder += g_depthInc[g_maxCUDepth - 1][reuseDepth[zOrder]];
            /* the reused decision is final; do not evaluate the split */
            mightSplit = false;
        }
    }
    else if (mightNotSplit)
    {
        m_quant.setQPforQuant(parentCTU);

        /* full 2Nx2N intra search at this depth */
        md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL);
        checkBestMode(md.pred[PRED_INTRA], depth);

        /* NxN partitions are only evaluated at the maximum CU depth */
        if (depth == g_maxCUDepth)
        {
            md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL);
            checkBestMode(md.pred[PRED_INTRA_NxN], depth);
        }

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    if (mightSplit)
    {
        /* recurse into the four sub-CUs and compare the aggregate split cost
         * against the best whole-CU mode found above */
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                m_rqt[nextDepth].cur.load(*nextContext);
                compressIntraCU(parentCTU, childGeom, zOrder);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);
                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                /* each coded sub-CU's output entropy state feeds the next sub-CU */
                nextContext = &nd.bestMode->contexts;
            }
            else
            {
                /* record the depth of this non-present sub-CU */
                splitCU->setEmptyPart(childGeom, subPartIdx);
                zOrder += g_depthInc[g_maxCUDepth - 1][nextDepth];
            }
        }
        nextContext->store(splitPred->contexts);
        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            updateModeCost(*splitPred);
        checkBestMode(*splitPred, depth);
    }

    checkDQP(md.bestMode->cu, cuGeom);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT])
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.encodeIdx);
}
308 | ||
/* Worker entry point: try to claim one outstanding job — first a CU
 * mode-analysis job, then a motion-estimation task.  Returns true if a job
 * was executed, false when no work remains.  The job counters are guarded
 * by m_pmodeLock / m_meLock; the lock is dropped while the job itself runs
 * and reacquired to publish completion. */
bool Analysis::findJob(int threadId)
{
    /* try to acquire a CU mode to analyze */
    m_pmodeLock.acquire();
    if (m_totalNumJobs > m_numAcquiredJobs)
    {
        int id = m_numAcquiredJobs++;
        m_pmodeLock.release();

        parallelModeAnalysis(threadId, id);

        m_pmodeLock.acquire();
        /* the thread finishing the last job wakes the waiting master */
        if (++m_numCompletedJobs == m_totalNumJobs)
            m_modeCompletionEvent.trigger();
        m_pmodeLock.release();
        return true;
    }
    else
        m_pmodeLock.release();

    /* try to acquire a motion estimation task */
    m_meLock.acquire();
    if (m_totalNumME > m_numAcquiredME)
    {
        int id = m_numAcquiredME++;
        m_meLock.release();

        parallelME(threadId, id);

        m_meLock.acquire();
        if (++m_numCompletedME == m_totalNumME)
            m_meCompletionEvent.trigger();
        m_meLock.release();
        return true;
    }
    else
        m_meLock.release();

    return false;
}
348 | ||
/* Run one reference's motion estimation for the current inter mode.
 * meId indexes list 0 references first, then list 1.  When threadId is -1
 * the search runs on this (the master's) Analysis state; otherwise a
 * thread-local slave Analysis is primed with the master's slice, frame, QP
 * and current PU before searching. */
void Analysis::parallelME(int threadId, int meId)
{
    Analysis* slave;

    if (threadId == -1)
        slave = this;
    else
    {
        slave = &m_tld[threadId].analysis;
        slave->setQP(*m_slice, m_rdCost.m_qp);
        slave->m_slice = m_slice;
        slave->m_frame = m_frame;

        /* point the slave's ME at the same source PU the master is working on */
        slave->m_me.setSourcePU(*m_curInterMode->fencYuv, m_curInterMode->cu.m_cuAddr, m_curGeom->encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
        slave->prepMotionCompensation(m_curInterMode->cu, *m_curGeom, m_curPart);
    }

    /* meId below numRefIdx[0] selects a list 0 reference, else list 1 */
    if (meId < m_slice->m_numRefIdx[0])
        slave->singleMotionEstimation(*this, *m_curInterMode, *m_curGeom, m_curPart, 0, meId);
    else
        slave->singleMotionEstimation(*this, *m_curInterMode, *m_curGeom, m_curPart, 1, meId - m_slice->m_numRefIdx[0]);
}
371 | ||
/* Execute one distributed mode-analysis job (jobId) for the CU geometry in
 * m_curGeom.  Job 0 evaluates intra; jobs 1-7 evaluate the inter partition
 * shapes (2Nx2N/bidir, Nx2N, 2NxN and the four AMP shapes).  When threadId
 * is -1 the job runs on this Analysis; otherwise a thread-local slave is
 * primed with this thread's slice, frame, QP and entropy state first. */
void Analysis::parallelModeAnalysis(int threadId, int jobId)
{
    Analysis* slave;

    if (threadId == -1)
        slave = this;
    else
    {
        slave = &m_tld[threadId].analysis;
        slave->m_slice = m_slice;
        slave->m_frame = m_frame;
        slave->setQP(*m_slice, m_rdCost.m_qp);
        slave->invalidateContexts(0);
    }

    ModeDepth& md = m_modeDepth[m_curGeom->depth];

    if (m_param->rdLevel <= 4)
    {
        /* sa8d-based analysis paths (rd-levels 0-4) */
        switch (jobId)
        {
        case 0:
            if (slave != this)
                slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur);
            slave->checkIntraInInter(md.pred[PRED_INTRA], *m_curGeom);
            if (m_param->rdLevel > 2)
                slave->encodeIntraInInter(md.pred[PRED_INTRA], *m_curGeom);
            break;

        case 1:
            slave->checkInter_rd0_4(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N);
            if (m_slice->m_sliceType == B_SLICE)
                slave->checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], *m_curGeom);
            break;

        case 2:
            slave->checkInter_rd0_4(md.pred[PRED_Nx2N], *m_curGeom, SIZE_Nx2N);
            break;

        case 3:
            slave->checkInter_rd0_4(md.pred[PRED_2NxN], *m_curGeom, SIZE_2NxN);
            break;

        case 4:
            slave->checkInter_rd0_4(md.pred[PRED_2NxnU], *m_curGeom, SIZE_2NxnU);
            break;

        case 5:
            slave->checkInter_rd0_4(md.pred[PRED_2NxnD], *m_curGeom, SIZE_2NxnD);
            break;

        case 6:
            slave->checkInter_rd0_4(md.pred[PRED_nLx2N], *m_curGeom, SIZE_nLx2N);
            break;

        case 7:
            slave->checkInter_rd0_4(md.pred[PRED_nRx2N], *m_curGeom, SIZE_nRx2N);
            break;

        default:
            X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
            break;
        }
    }
    else
    {
        /* full RDO analysis paths (rd-levels 5-6); 64x64 CUs restrict AMP
         * shapes to merge candidates only */
        bool bMergeOnly = m_curGeom->log2CUSize == 6;
        if (slave != this)
        {
            slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur);
            slave->m_quant.setQPforQuant(md.pred[PRED_2Nx2N].cu);
        }

        switch (jobId)
        {
        case 0:
            slave->checkIntra(md.pred[PRED_INTRA], *m_curGeom, SIZE_2Nx2N, NULL);
            if (m_curGeom->depth == g_maxCUDepth && m_curGeom->log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                slave->checkIntra(md.pred[PRED_INTRA_NxN], *m_curGeom, SIZE_NxN, NULL);
            break;

        case 1:
            slave->checkInter_rd5_6(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N, false);
            /* initialize BIDIR cost to worst-case so an unevaluated BIDIR
             * never wins the later checkBestMode comparison */
            md.pred[PRED_BIDIR].rdCost = MAX_INT64;
            if (m_slice->m_sliceType == B_SLICE)
            {
                slave->checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], *m_curGeom);
                if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
                    slave->encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], *m_curGeom);
            }
            break;

        case 2:
            slave->checkInter_rd5_6(md.pred[PRED_Nx2N], *m_curGeom, SIZE_Nx2N, false);
            break;

        case 3:
            slave->checkInter_rd5_6(md.pred[PRED_2NxN], *m_curGeom, SIZE_2NxN, false);
            break;

        case 4:
            slave->checkInter_rd5_6(md.pred[PRED_2NxnU], *m_curGeom, SIZE_2NxnU, bMergeOnly);
            break;

        case 5:
            slave->checkInter_rd5_6(md.pred[PRED_2NxnD], *m_curGeom, SIZE_2NxnD, bMergeOnly);
            break;

        case 6:
            slave->checkInter_rd5_6(md.pred[PRED_nLx2N], *m_curGeom, SIZE_nLx2N, bMergeOnly);
            break;

        case 7:
            slave->checkInter_rd5_6(md.pred[PRED_nRx2N], *m_curGeom, SIZE_nRx2N, bMergeOnly);
            break;

        default:
            X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
            break;
        }
    }
}
494 | ||
/* Inter CU analysis with mode decisions distributed across worker threads.
 * Jobs for intra and each inter partition shape are enqueued; this (master)
 * thread performs merge analysis itself, then waits for job completion and
 * compares results — by sa8d cost at rd-level <= 4, by full RD cost at
 * rd-levels 5-6 — before recursing into the four sub-CUs.  Requires
 * rdLevel >= 2. */
void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    uint32_t cuAddr = parentCTU.m_cuAddr;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
    uint32_t minDepth = m_param->rdLevel <= 4 ? topSkipMinDepth(parentCTU, cuGeom) : 0;

    X265_CHECK(m_param->rdLevel >= 2, "compressInterCU_dist does not support RD 0 or 1\n");

    if (mightNotSplit && depth >= minDepth)
    {
        int bTryAmp = m_slice->m_sps->maxAMPDepth > depth && (cuGeom.log2CUSize < 6 || m_param->rdLevel > 4);
        int bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;

        /* Initialize all prediction CUs based on parentCTU */
        md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
        if (m_param->bEnableRectInter)
        {
            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
        }
        if (bTryAmp)
        {
            md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
        }
        if (bTryIntra)
        {
            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
            if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
        }

        /* publish the job batch under the lock; when intra is not tried,
         * job 0 is pre-consumed by starting the counters at 1 */
        m_pmodeLock.acquire();
        m_totalNumJobs = 2 + m_param->bEnableRectInter * 2 + bTryAmp * 4;
        m_numAcquiredJobs = !bTryIntra;
        m_numCompletedJobs = m_numAcquiredJobs;
        m_curGeom = &cuGeom;
        m_bJobsQueued = true;
        JobProvider::enqueue();
        m_pmodeLock.release();

        for (int i = 0; i < m_totalNumJobs - m_numCompletedJobs; i++)
            m_pool->pokeIdleThread();

        /* participate in processing jobs, until all are distributed */
        while (findJob(-1))
            ;

        JobProvider::dequeue();
        m_bJobsQueued = false;

        /* the master worker thread (this one) does merge analysis. By doing
         * merge after all the other jobs are at least started, we usually avoid
         * blocking on another thread */

        if (m_param->rdLevel <= 4)
        {
            checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);

            m_modeCompletionEvent.wait();

            /* select best inter mode based on sa8d cost */
            Mode *bestInter = &md.pred[PRED_2Nx2N];

            if (m_param->bEnableRectInter)
            {
                if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_Nx2N];
                if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxN];
            }

            if (bTryAmp)
            {
                if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxnU];
                if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxnD];
                if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_nLx2N];
                if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_nRx2N];
            }

            if (m_param->rdLevel > 2)
            {
                /* RD selection between merge, inter, bidir and intra */
                if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
                {
                    for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
                    {
                        prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
                        motionCompensation(bestInter->predYuv, false, true);
                    }
                }
                encodeResAndCalcRdInterCU(*bestInter, cuGeom);
                checkBestMode(*bestInter, depth);

                /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
                    md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
                {
                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
                    checkBestMode(md.pred[PRED_BIDIR], depth);
                }

                if (bTryIntra)
                    checkBestMode(md.pred[PRED_INTRA], depth);
            }
            else /* m_param->rdLevel == 2 */
            {
                if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = bestInter;

                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = &md.pred[PRED_BIDIR];

                if (bTryIntra && md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
                {
                    md.bestMode = &md.pred[PRED_INTRA];
                    encodeIntraInInter(*md.bestMode, cuGeom);
                }
                else if (!md.bestMode->cu.m_mergeFlag[0])
                {
                    /* finally code the best mode selected from SA8D costs */
                    for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++)
                    {
                        prepMotionCompensation(md.bestMode->cu, cuGeom, puIdx);
                        motionCompensation(md.bestMode->predYuv, false, true);
                    }
                    encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
                }
            }
        }
        else
        {
            /* rd-levels 5-6: every candidate carries a full RD cost;
             * simply pick the minimum with checkBestMode */
            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
            m_modeCompletionEvent.wait();

            checkBestMode(md.pred[PRED_2Nx2N], depth);
            checkBestMode(md.pred[PRED_BIDIR], depth);

            if (m_param->bEnableRectInter)
            {
                checkBestMode(md.pred[PRED_Nx2N], depth);
                checkBestMode(md.pred[PRED_2NxN], depth);
            }

            if (bTryAmp)
            {
                checkBestMode(md.pred[PRED_2NxnU], depth);
                checkBestMode(md.pred[PRED_2NxnD], depth);
                checkBestMode(md.pred[PRED_nLx2N], depth);
                checkBestMode(md.pred[PRED_nRx2N], depth);
            }

            if (bTryIntra)
            {
                checkBestMode(md.pred[PRED_INTRA], depth);
                if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                    checkBestMode(md.pred[PRED_INTRA_NxN], depth);
            }
        }

        /* fallback: if no mode produced a valid RD cost and intra was not
         * tried above, evaluate intra now so bestMode is always usable */
        if (md.bestMode->rdCost == MAX_INT64 && !bTryIntra)
        {
            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
            checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
            encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
            checkBestMode(md.pred[PRED_INTRA], depth);
        }

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    /* skip the recursive split when the best mode is a skip, or when the
     * recursion-depth heuristic says deeper analysis is unlikely to win */
    bool bNoSplit = false;
    if (md.bestMode)
    {
        bNoSplit = md.bestMode->cu.isSkipped(0);
        if (mightSplit && depth && depth >= minDepth && !bNoSplit && m_param->rdLevel <= 4)
            bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
    }

    if (mightSplit && !bNoSplit)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                m_rqt[nextDepth].cur.load(*nextContext);
                compressInterCU_dist(parentCTU, childGeom);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);

                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
                splitCU->setEmptyPart(childGeom, subPartIdx);
        }
        nextContext->store(splitPred->contexts);

        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            updateModeCost(*splitPred);

        checkBestMode(*splitPred, depth);
    }

    if (mightNotSplit)
    {
        /* early-out statistics */
        FrameData& curEncData = *m_frame->m_encData;
        FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
        uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
        cuStat.count[depth] += 1;
        cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
    }

    checkDQP(md.bestMode->cu, cuGeom);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT])
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.encodeIdx);
}
750 | ||
/* Recursive CU analysis for inter slices at rd-levels 0-4.
 *
 * Evaluates this CU at the current depth (merge/skip, 2Nx2N inter, bidir,
 * rectangular and AMP partitions, and optionally intra), then recurses into
 * the four sub-CUs and compares the split cost against the unsplit cost.
 * Mode comparisons at these rd-levels are primarily sa8d-based; full RDO is
 * applied only selectively (rdLevel >= 3) to the sa8d winner(s).
 *
 * On return md.bestMode for this depth is valid and its CU/recon data have
 * been copied into the frame's encData and recon picture (for the CTU root
 * call). parentCTU is the containing CTU; cuGeom describes this CU's
 * position/size and split constraints. */
void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    uint32_t cuAddr = parentCTU.m_cuAddr;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    /* LEAF => cannot split further; SPLIT_MANDATORY => must split (e.g. CU
     * extends past the picture edge), so no unsplit mode may be coded */
    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
    uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);

    if (mightNotSplit && depth >= minDepth)
    {
        bool bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;

        /* Compute Merge Cost */
        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
        checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);

        /* early-skip: if the merge test already produced a skip CU, bypass
         * all other unsplit mode checks (only meaningful when rdLevel != 0) */
        bool earlyskip = false;
        if (m_param->rdLevel)
            earlyskip = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth

        if (!earlyskip)
        {
            md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
            checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N);

            if (m_slice->m_sliceType == B_SLICE)
            {
                /* bidir candidate is derived from the uni-directional 2Nx2N result */
                md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
                checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
            }

            /* track the cheapest (by sa8d) uni-directional inter partition */
            Mode *bestInter = &md.pred[PRED_2Nx2N];
            if (m_param->bEnableRectInter)
            {
                md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
                checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N);
                if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_Nx2N];

                md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
                checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN);
                if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                    bestInter = &md.pred[PRED_2NxN];
            }

            /* AMP (asymmetric) partitions, only below 64x64 here; the winning
             * rectangular direction steers which AMP shapes are tried */
            if (m_slice->m_sps->maxAMPDepth > depth && cuGeom.log2CUSize < 6)
            {
                bool bHor = false, bVer = false;
                if (bestInter->cu.m_partSize[0] == SIZE_2NxN)
                    bHor = true;
                else if (bestInter->cu.m_partSize[0] == SIZE_Nx2N)
                    bVer = true;
                else if (bestInter->cu.m_partSize[0] == SIZE_2Nx2N &&
                         md.bestMode && md.bestMode->cu.getQtRootCbf(0))
                {
                    bHor = true;
                    bVer = true;
                }

                if (bHor)
                {
                    md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU);
                    if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_2NxnU];

                    md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD);
                    if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_2NxnD];
                }
                if (bVer)
                {
                    md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N);
                    if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_nLx2N];

                    md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N);
                    if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                        bestInter = &md.pred[PRED_nRx2N];
                }
            }

            if (m_param->rdLevel >= 3)
            {
                /* Calculate RD cost of best inter option */
                if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
                {
                    for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
                    {
                        prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
                        motionCompensation(bestInter->predYuv, false, true);
                    }
                }
                encodeResAndCalcRdInterCU(*bestInter, cuGeom);
                checkBestMode(*bestInter, depth);

                /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
                    md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
                {
                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
                    checkBestMode(md.pred[PRED_BIDIR], depth);
                }

                /* intra is tried when the best inter result still has residual
                 * (non-zero root CBF) or when no inter mode was valid at all */
                if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) ||
                    md.bestMode->sa8dCost == MAX_INT64)
                {
                    md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
                    checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
                    encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
                    checkBestMode(md.pred[PRED_INTRA], depth);
                }
            }
            else
            {
                /* SA8D choice between merge/skip, inter, bidir, and intra */
                if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = bestInter;

                if (m_slice->m_sliceType == B_SLICE &&
                    md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
                    md.bestMode = &md.pred[PRED_BIDIR];

                if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64)
                {
                    md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
                    checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
                    if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
                        md.bestMode = &md.pred[PRED_INTRA];
                }

                /* finally code the best mode selected by SA8D costs:
                 * RD level 2 - fully encode the best mode
                 * RD level 1 - generate recon pixels
                 * RD level 0 - generate chroma prediction */
                if (md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)
                {
                    /* prediction already generated for this CU, and if rd level
                     * is not 0, it is already fully encoded */
                }
                else if (md.bestMode->cu.isInter(0))
                {
                    /* luma MC was done during the search; redo full (chroma) MC here */
                    for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++)
                    {
                        prepMotionCompensation(md.bestMode->cu, cuGeom, puIdx);
                        motionCompensation(md.bestMode->predYuv, false, true);
                    }
                    if (m_param->rdLevel == 2)
                        encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
                    else if (m_param->rdLevel == 1)
                    {
                        /* generate recon pixels with no rate distortion considerations */
                        CUData& cu = md.bestMode->cu;
                        m_quant.setQPforQuant(cu);

                        uint32_t tuDepthRange[2];
                        cu.getInterTUQtDepthRange(tuDepthRange, 0);

                        m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize);
                        residualTransformQuantInter(*md.bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange);
                        if (cu.getQtRootCbf(0))
                            md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]);
                        else
                        {
                            /* no residual survived quantization: recon == prediction,
                             * and a zero-CBF merge is promoted to a skip CU */
                            md.bestMode->reconYuv.copyFromYuv(md.bestMode->predYuv);
                            if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
                                cu.setPredModeSubParts(MODE_SKIP);
                        }
                    }
                }
                else
                {
                    if (m_param->rdLevel == 2)
                        encodeIntraInInter(*md.bestMode, cuGeom);
                    else if (m_param->rdLevel == 1)
                    {
                        /* generate recon pixels with no rate distortion considerations */
                        CUData& cu = md.bestMode->cu;
                        m_quant.setQPforQuant(cu);

                        uint32_t tuDepthRange[2];
                        cu.getIntraTUQtDepthRange(tuDepthRange, 0);

                        /* NxN intra partitions start the TU quad-tree one level down */
                        uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
                        residualTransformQuantIntra(*md.bestMode, cuGeom, initTuDepth, 0, tuDepthRange);
                        getBestIntraModeChroma(*md.bestMode, cuGeom);
                        residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
                        md.bestMode->reconYuv.copyFromPicYuv(*m_frame->m_reconPic, cu.m_cuAddr, cuGeom.encodeIdx); // TODO:
                    }
                }
            }
        } // !earlyskip

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    /* decide whether recursing into sub-CUs can be avoided */
    bool bNoSplit = false;
    if (md.bestMode)
    {
        bNoSplit = md.bestMode->cu.isSkipped(0);
        if (mightSplit && depth && depth >= minDepth && !bNoSplit)
            bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
    }

    if (mightSplit && !bNoSplit)
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        Entropy* nextContext = &m_rqt[depth].cur;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                m_rqt[nextDepth].cur.load(*nextContext);
                compressInterCU_rd0_4(parentCTU, childGeom);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);

                /* rdLevel 0 produces no recon, so propagate prediction instead */
                if (m_param->rdLevel)
                    nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                else
                    nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childGeom.numPartitions * subPartIdx);
                if (m_param->rdLevel > 1)
                    nextContext = &nd.bestMode->contexts;
            }
            else
                splitCU->setEmptyPart(childGeom, subPartIdx);
        }
        nextContext->store(splitPred->contexts);

        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else if (m_param->rdLevel > 1)
            updateModeCost(*splitPred);
        else
            splitPred->sa8dCost = m_rdCost.calcRdSADCost(splitPred->distortion, splitPred->sa8dBits);

        /* compare split vs. unsplit: RD cost at rdLevel > 1, sa8d cost otherwise */
        if (!md.bestMode)
            md.bestMode = splitPred;
        else if (m_param->rdLevel > 1)
            checkBestMode(*splitPred, cuGeom.depth);
        else if (splitPred->sa8dCost < md.bestMode->sa8dCost)
            md.bestMode = splitPred;
    }

    if (mightNotSplit)
    {
        /* early-out statistics */
        FrameData& curEncData = *m_frame->m_encData;
        FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
        uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
        cuStat.count[depth] += 1;
        cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
    }

    checkDQP(md.bestMode->cu, cuGeom);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT] && m_param->rdLevel)
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.encodeIdx);
}
1035 | ||
/* Recursive CU analysis for inter slices at rd-levels 5-6 (full RDO).
 *
 * Unlike the rd0-4 path, every candidate mode (merge/skip, 2Nx2N, bidir,
 * rectangular, AMP, intra) is fully encoded and compared by RD cost via
 * checkBestMode(). The function then recurses into the four sub-CUs and
 * compares the accumulated split cost against the unsplit best mode.
 * On return md.bestMode is valid and the winner's CU/recon data have been
 * copied to the frame's encData and recon picture. */
void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    md.bestMode = NULL;

    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);

    if (mightNotSplit)
    {
        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
        checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
        /* a residual-free merge winner short-circuits all other unsplit modes */
        bool earlySkip = m_param->bEnableEarlySkip && md.bestMode && !md.bestMode->cu.getQtRootCbf(0);

        if (!earlySkip)
        {
            md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
            checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, false);
            checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);

            if (m_slice->m_sliceType == B_SLICE)
            {
                /* bidir candidate is derived from the 2Nx2N uni-prediction results */
                md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
                checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
                if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
                {
                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
                    checkBestMode(md.pred[PRED_BIDIR], cuGeom.depth);
                }
            }

            /* cbf-fast-mode: skip further partition tests once the current best
             * has no residual (root CBF == 0) */
            if (m_param->bEnableRectInter)
            {
                if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                {
                    md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, false);
                    checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);
                }
                if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                {
                    md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
                    checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, false);
                    checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
                }
            }

            // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N)
            if (m_slice->m_sps->maxAMPDepth > depth)
            {
                /* at 64x64, AMP partitions are restricted to merge candidates */
                bool bMergeOnly = cuGeom.log2CUSize == 6;

                bool bHor = false, bVer = false;
                if (md.bestMode->cu.m_partSize[0] == SIZE_2NxN)
                    bHor = true;
                else if (md.bestMode->cu.m_partSize[0] == SIZE_Nx2N)
                    bVer = true;
                else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0] && !md.bestMode->cu.isSkipped(0))
                {
                    bHor = true;
                    bVer = true;
                }

                if (bHor)
                {
                    if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                    {
                        md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
                        checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, bMergeOnly);
                        checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);
                    }
                    if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                    {
                        md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
                        checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, bMergeOnly);
                        checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
                    }
                }
                if (bVer)
                {
                    if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                    {
                        md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
                        checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, bMergeOnly);
                        checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);
                    }
                    if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))
                    {
                        md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
                        checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, bMergeOnly);
                        checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
                    }
                }
            }

            if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) &&
                (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)))
            {
                md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
                checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL);
                checkBestMode(md.pred[PRED_INTRA], depth);

                /* NxN intra only at max depth, where the CU is still larger
                 * than the minimum TU size */
                if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
                {
                    md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
                    checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL);
                    checkBestMode(md.pred[PRED_INTRA_NxN], depth);
                }
            }
        }

        if (m_bTryLossless)
            tryLossless(cuGeom);

        if (mightSplit)
            addSplitFlagCost(*md.bestMode, cuGeom.depth);
    }

    // estimate split cost
    if (mightSplit && (!md.bestMode || !md.bestMode->cu.isSkipped(0)))
    {
        Mode* splitPred = &md.pred[PRED_SPLIT];
        splitPred->initCosts();
        CUData* splitCU = &splitPred->cu;
        splitCU->initSubCU(parentCTU, cuGeom);

        uint32_t nextDepth = depth + 1;
        ModeDepth& nd = m_modeDepth[nextDepth];
        invalidateContexts(nextDepth);
        /* entropy context is threaded through the sub-CUs in coding order */
        Entropy* nextContext = &m_rqt[depth].cur;

        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
            {
                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
                m_rqt[nextDepth].cur.load(*nextContext);
                compressInterCU_rd5_6(parentCTU, childGeom);

                // Save best CU and pred data for this sub CU
                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
                splitPred->addSubCosts(*nd.bestMode);
                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                nextContext = &nd.bestMode->contexts;
            }
            else
                splitCU->setEmptyPart(childGeom, subPartIdx);
        }
        nextContext->store(splitPred->contexts);
        if (mightNotSplit)
            addSplitFlagCost(*splitPred, cuGeom.depth);
        else
            updateModeCost(*splitPred);

        checkBestMode(*splitPred, depth);
    }

    checkDQP(md.bestMode->cu, cuGeom);

    /* Copy best data to encData CTU and recon */
    md.bestMode->cu.copyToPic(depth);
    if (md.bestMode != &md.pred[PRED_SPLIT])
        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.encodeIdx);
}
1203 | ||
/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL
 *
 * rd0-4 merge/skip evaluation: each merge candidate is motion-compensated and
 * scored by sa8d cost; the single sa8d winner is then (when rdLevel != 0)
 * fully encoded both as skip (no residual) and as merge-with-residual, and
 * the cheaper of the two by RD cost becomes md.bestMode. */
void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;
    ModeDepth& md = m_modeDepth[depth];
    Yuv *fencYuv = &md.fencYuv;

    /* Note that these two Mode instances are named MERGE and SKIP but they may
     * hold the reverse when the function returns. We toggle between the two modes */
    Mode* tempPred = &merge;
    Mode* bestPred = &skip;

    X265_CHECK(m_slice->m_sliceType != I_SLICE, "Evaluating merge in I slice\n");

    tempPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
    tempPred->cu.setPredModeSubParts(MODE_INTER);
    tempPred->cu.m_mergeFlag[0] = true;

    bestPred->cu.setPartSizeSubParts(SIZE_2Nx2N);
    bestPred->cu.setPredModeSubParts(MODE_INTER);
    bestPred->cu.m_mergeFlag[0] = true;

    MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
    uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS];
    uint32_t maxNumMergeCand = tempPred->cu.getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours);

    bestPred->sa8dCost = MAX_INT64;
    int bestSadCand = -1;
    int cpart, sizeIdx = cuGeom.log2CUSize - 2;
    if (m_bChromaSa8d)
    {
        /* chroma partition index for the sa8d_inter primitive */
        int cuSize = 1 << cuGeom.log2CUSize;
        cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
    }
    for (uint32_t i = 0; i < maxNumMergeCand; ++i)
    {
        /* in frame-parallel mode, reject candidates whose vertical MV reaches
         * beyond the rows the reference frame is guaranteed to have encoded */
        if (m_bFrameParallel &&
            (mvFieldNeighbours[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
             mvFieldNeighbours[i][1].mv.y >= (m_param->searchRange + 1) * 4))
            continue;

        tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx
        tempPred->cu.m_interDir[0] = interDirNeighbours[i];
        tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
        tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
        tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
        tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;

        /* chroma MC only when chroma participates in the sa8d decision */
        prepMotionCompensation(tempPred->cu, cuGeom, 0);
        motionCompensation(tempPred->predYuv, true, m_bChromaSa8d);

        tempPred->sa8dBits = getTUBits(i, maxNumMergeCand);
        tempPred->distortion = primitives.sa8d[sizeIdx](fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
        if (m_bChromaSa8d)
        {
            tempPred->distortion += primitives.sa8d_inter[cpart](fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
            tempPred->distortion += primitives.sa8d_inter[cpart](fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
        }
        tempPred->sa8dCost = m_rdCost.calcRdSADCost(tempPred->distortion, tempPred->sa8dBits);

        if (tempPred->sa8dCost < bestPred->sa8dCost)
        {
            bestSadCand = i;
            std::swap(tempPred, bestPred);
        }
    }

    /* force mode decision to take inter or intra */
    if (bestSadCand < 0)
        return;

    /* calculate the motion compensation for chroma for the best mode selected */
    if (!m_bChromaSa8d) /* Chroma MC was done above */
    {
        prepMotionCompensation(bestPred->cu, cuGeom, 0);
        motionCompensation(bestPred->predYuv, false, true);
    }

    if (m_param->rdLevel)
    {
        if (m_param->bLossless)
            bestPred->rdCost = MAX_INT64;
        else
            encodeResAndCalcRdSkipCU(*bestPred);

        /* Encode with residual */
        tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand;
        tempPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0);
        tempPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0);
        tempPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
        tempPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0);
        tempPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
        tempPred->sa8dCost = bestPred->sa8dCost;
        tempPred->predYuv.copyFromYuv(bestPred->predYuv);

        encodeResAndCalcRdInterCU(*tempPred, cuGeom);

        md.bestMode = tempPred->rdCost < bestPred->rdCost ? tempPred : bestPred;
    }
    else
        md.bestMode = bestPred;

    /* broadcast sets of MV field data */
    bestPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0);
    bestPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0);
    bestPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
    bestPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0);
    bestPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
}
1313 | ||
/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL
 *
 * rd5-6 merge/skip evaluation: every merge candidate is fully RD-encoded,
 * both with residual (merge) and without (skip), rather than pre-screened
 * by sa8d as in the rd0-4 variant. Duplicate zero-MV filler candidates are
 * evaluated only once. */
void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom)
{
    uint32_t depth = cuGeom.depth;

    /* Note that these two Mode instances are named MERGE and SKIP but they may
     * hold the reverse when the function returns. We toggle between the two modes */
    Mode* tempPred = &merge;
    Mode* bestPred = &skip;

    merge.cu.setPredModeSubParts(MODE_INTER);
    merge.cu.setPartSizeSubParts(SIZE_2Nx2N);
    merge.cu.m_mergeFlag[0] = true;

    skip.cu.setPredModeSubParts(MODE_INTER);
    skip.cu.setPartSizeSubParts(SIZE_2Nx2N);
    skip.cu.m_mergeFlag[0] = true;

    MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
    uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS];
    uint32_t maxNumMergeCand = merge.cu.getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours);

    bool foundCbf0Merge = false;
    bool triedPZero = false, triedBZero = false;
    bestPred->rdCost = MAX_INT64;
    for (uint32_t i = 0; i < maxNumMergeCand; i++)
    {
        /* in frame-parallel mode, reject candidates whose vertical MV reaches
         * beyond the rows the reference frame is guaranteed to have encoded */
        if (m_bFrameParallel &&
            (mvFieldNeighbours[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
             mvFieldNeighbours[i][1].mv.y >= (m_param->searchRange + 1) * 4))
            continue;

        /* the merge candidate list is packed with MV(0,0) ref 0 when it is not full */
        if (interDirNeighbours[i] == 1 && !mvFieldNeighbours[i][0].mv.word && !mvFieldNeighbours[i][0].refIdx)
        {
            if (triedPZero)
                continue;
            triedPZero = true;
        }
        else if (interDirNeighbours[i] == 3 &&
                 !mvFieldNeighbours[i][0].mv.word && !mvFieldNeighbours[i][0].refIdx &&
                 !mvFieldNeighbours[i][1].mv.word && !mvFieldNeighbours[i][1].refIdx)
        {
            if (triedBZero)
                continue;
            triedBZero = true;
        }

        tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; /* merge candidate ID is stored in L0 MVP idx */
        tempPred->cu.m_interDir[0] = interDirNeighbours[i];
        tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
        tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
        tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
        tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
        tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */

        prepMotionCompensation(tempPred->cu, cuGeom, 0);
        motionCompensation(tempPred->predYuv, true, true);

        uint8_t hasCbf = true;
        bool swapped = false;
        if (!foundCbf0Merge)
        {
            /* if the best prediction has CBF (not a skip) then try merge with residual */

            encodeResAndCalcRdInterCU(*tempPred, cuGeom);
            hasCbf = tempPred->cu.getQtRootCbf(0);
            foundCbf0Merge = !hasCbf;

            if (tempPred->rdCost < bestPred->rdCost)
            {
                std::swap(tempPred, bestPred);
                swapped = true;
            }
        }
        if (!m_param->bLossless && hasCbf)
        {
            /* try merge without residual (skip), if not lossless coding */

            if (swapped)
            {
                /* tempPred now points at the previous best; reload this
                 * candidate's motion data and reuse the winner's prediction */
                tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;
                tempPred->cu.m_interDir[0] = interDirNeighbours[i];
                tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
                tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
                tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
                tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
                tempPred->cu.setPredModeSubParts(MODE_INTER);
                tempPred->predYuv.copyFromYuv(bestPred->predYuv);
            }

            encodeResAndCalcRdSkipCU(*tempPred);

            if (tempPred->rdCost < bestPred->rdCost)
                std::swap(tempPred, bestPred);
        }
    }

    if (bestPred->rdCost < MAX_INT64)
    {
        m_modeDepth[depth].bestMode = bestPred;

        /* broadcast sets of MV field data */
        uint32_t bestCand = bestPred->cu.m_mvpIdx[0][0];
        bestPred->cu.setPUInterDir(interDirNeighbours[bestCand], 0, 0);
        bestPred->cu.setPUMv(0, mvFieldNeighbours[bestCand][0].mv, 0, 0);
        bestPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestCand][0].refIdx, 0, 0);
        bestPred->cu.setPUMv(1, mvFieldNeighbours[bestCand][1].mv, 0, 0);
        bestPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestCand][1].refIdx, 0, 0);
    }
}
1425 | ||
/* Evaluate one inter partition shape at rd-levels 0-4.
 *
 * Runs motion search (predInterSearch) for the given partition size and
 * scores the result by sa8d cost (luma always; chroma only when
 * m_bChromaSa8d). With analysis-reuse enabled, reference indices are loaded
 * from / saved to the m_reuseInterDataCTU stream, which is advanced one
 * entry per PU per prediction direction — the read and write orders must
 * stay in lock-step. On search failure the mode is marked invalid with
 * MAX_UINT distortion / MAX_INT64 sa8dCost. */
void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize)
{
    interMode.initCosts();
    interMode.cu.setPartSizeSubParts(partSize);
    interMode.cu.setPredModeSubParts(MODE_INTER);
    int numPredDir = m_slice->isInterP() ? 1 : 2;

    /* load previously-saved reference indices to guide this search */
    if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU)
    {
        for (uint32_t part = 0; part < interMode.cu.getNumPartInter(); part++)
        {
            MotionData* bestME = interMode.bestME[part];
            for (int32_t i = 0; i < numPredDir; i++)
            {
                bestME[i].ref = m_reuseInterDataCTU->ref;
                m_reuseInterDataCTU++;
            }
        }
    }
    if (predInterSearch(interMode, cuGeom, false, m_bChromaSa8d))
    {
        /* predInterSearch sets interMode.sa8dBits */
        const Yuv& fencYuv = *interMode.fencYuv;
        Yuv& predYuv = interMode.predYuv;
        int part = partitionFromLog2Size(cuGeom.log2CUSize);
        interMode.distortion = primitives.sa8d[part](fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
        if (m_bChromaSa8d)
        {
            uint32_t cuSize = 1 << cuGeom.log2CUSize;
            int cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
            interMode.distortion += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
            interMode.distortion += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
        }
        interMode.sa8dCost = m_rdCost.calcRdSADCost(interMode.distortion, interMode.sa8dBits);

        /* record the chosen reference indices for a later reuse pass */
        if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
        {
            for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
            {
                MotionData* bestME = interMode.bestME[puIdx];
                for (int32_t i = 0; i < numPredDir; i++)
                {
                    m_reuseInterDataCTU->ref = bestME[i].ref;
                    m_reuseInterDataCTU++;
                }
            }
        }
    }
    else
    {
        /* no usable motion found; make this mode lose every comparison */
        interMode.distortion = MAX_UINT;
        interMode.sa8dCost = MAX_INT64;
    }
}
1480 | ||
/* Evaluate one inter partition shape at rd-levels 5-6.
 *
 * Runs motion search for the given partition size and, on success, fully
 * encodes the residual to obtain a true RD cost (no sa8d screening at these
 * levels). bMergeOnly restricts the search to merge candidates (used for
 * 64x64 AMP). Analysis-reuse load/save advances the m_reuseInterDataCTU
 * cursor exactly as in checkInter_rd0_4. On search failure the mode is
 * invalidated with MAX_UINT distortion / MAX_INT64 rdCost. */
void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, bool bMergeOnly)
{
    interMode.initCosts();
    interMode.cu.setPartSizeSubParts(partSize);
    interMode.cu.setPredModeSubParts(MODE_INTER);
    int numPredDir = m_slice->isInterP() ? 1 : 2;

    /* load previously-saved reference indices to guide this search */
    if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU)
    {
        for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
        {
            MotionData* bestME = interMode.bestME[puIdx];
            for (int32_t i = 0; i < numPredDir; i++)
            {
                bestME[i].ref = m_reuseInterDataCTU->ref;
                m_reuseInterDataCTU++;
            }
        }
    }
    if (predInterSearch(interMode, cuGeom, bMergeOnly, true))
    {
        /* predInterSearch sets interMode.sa8dBits, but this is ignored */
        encodeResAndCalcRdInterCU(interMode, cuGeom);

        /* record the chosen reference indices for a later reuse pass */
        if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
        {
            for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
            {
                MotionData* bestME = interMode.bestME[puIdx];
                for (int32_t i = 0; i < numPredDir; i++)
                {
                    m_reuseInterDataCTU->ref = bestME[i].ref;
                    m_reuseInterDataCTU++;
                }
            }
        }
    }
    else
    {
        /* no usable motion found; make this mode lose every comparison */
        interMode.distortion = MAX_UINT;
        interMode.rdCost = MAX_INT64;
    }
}
1524 | ||
/* Evaluate a bi-directional 2Nx2N candidate built from the best uni-directional
 * L0 and L1 2Nx2N motion results in inter2Nx2N. Programs the motion fields of
 * bidir2Nx2N.cu, performs motion compensation, and sets sa8dBits/sa8dCost.
 * A second cheap candidate using coincident (zero) MVs in both lists is also
 * tried and kept if its estimated cost is lower. */
void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom)
{
    CUData& cu = bidir2Nx2N.cu;

    /* bail out if bi-prediction is restricted for this CU, or either list
     * failed to produce a usable 2Nx2N motion candidate */
    if (cu.isBipredRestriction() || inter2Nx2N.bestME[0][0].cost == MAX_UINT || inter2Nx2N.bestME[0][1].cost == MAX_UINT)
    {
        bidir2Nx2N.sa8dCost = MAX_INT64;
        bidir2Nx2N.rdCost = MAX_INT64;
        return;
    }

    const Yuv& fencYuv = *bidir2Nx2N.fencYuv;
    MV mvzero(0, 0);
    /* cpart is only initialized (and only read) when m_bChromaSa8d is set */
    int cpart, partEnum = cuGeom.log2CUSize - 2;

    if (m_bChromaSa8d)
    {
        int cuSize = 1 << cuGeom.log2CUSize;
        cpart = partitionFromSizes(cuSize >> m_hChromaShift, cuSize >> m_vChromaShift);
    }

    /* seed the bidir mode with the uni-directional winners from each list */
    bidir2Nx2N.bestME[0][0] = inter2Nx2N.bestME[0][0];
    bidir2Nx2N.bestME[0][1] = inter2Nx2N.bestME[0][1];
    MotionData* bestME = bidir2Nx2N.bestME[0];
    int ref0 = bestME[0].ref;
    MV mvp0 = bestME[0].mvp;
    int mvpIdx0 = bestME[0].mvpIdx;
    int ref1 = bestME[1].ref;
    MV mvp1 = bestME[1].mvp;
    int mvpIdx1 = bestME[1].mvpIdx;

    /* program the CU as a non-merge bi-predicted 2Nx2N block */
    bidir2Nx2N.initCosts();
    cu.setPartSizeSubParts(SIZE_2Nx2N);
    cu.setPredModeSubParts(MODE_INTER);
    cu.setPUInterDir(3, 0, 0);  /* 3 == both lists */
    cu.setPURefIdx(0, (int8_t)ref0, 0, 0);
    cu.setPURefIdx(1, (int8_t)ref1, 0, 0);
    cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;
    cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
    cu.m_mergeFlag[0] = 0;

    /* Estimate cost of BIDIR using best 2Nx2N L0 and L1 motion vectors */
    cu.setPUMv(0, bestME[0].mv, 0, 0);
    cu.m_mvd[0][0] = bestME[0].mv - mvp0;

    cu.setPUMv(1, bestME[1].mv, 0, 0);
    cu.m_mvd[1][0] = bestME[1].mv - mvp1;

    prepMotionCompensation(cu, cuGeom, 0);
    motionCompensation(bidir2Nx2N.predYuv, true, m_bChromaSa8d);

    int sa8d = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size);
    if (m_bChromaSa8d)
    {
        /* Add in chroma distortion */
        sa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[1], bidir2Nx2N.predYuv.m_csize);
        sa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[2], bidir2Nx2N.predYuv.m_csize);
    }
    /* bidir signals list 2 (both) in place of the two single-list selections */
    bidir2Nx2N.sa8dBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
    bidir2Nx2N.sa8dCost = sa8d + m_rdCost.getCost(bidir2Nx2N.sa8dBits);

    /* zero-MV candidate is only interesting if at least one MV is non-zero */
    bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();
    if (bTryZero)
    {
        /* Do not try zero MV if unidir motion predictors are beyond
         * valid search area */
        MV mvmin, mvmax;
        int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);
        setSearchRange(cu, mvzero, merange, mvmin, mvmax);
        mvmax.y += 2; // there is some pad for subpel refine
        mvmin <<= 2;  // full-pel -> quarter-pel units
        mvmax <<= 2;

        bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);
        bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);
    }
    if (bTryZero)
    {
        /* Estimate cost of BIDIR using coincident blocks */
        Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;

        int zsa8d;

        if (m_bChromaSa8d)
        {
            /* full MC (luma + chroma) with both MVs forced to zero; the CU's
             * MVs are temporarily overwritten and restored below if rejected */
            cu.m_mv[0][0] = mvzero;
            cu.m_mv[1][0] = mvzero;

            prepMotionCompensation(cu, cuGeom, 0);
            motionCompensation(tmpPredYuv, true, true);

            zsa8d  = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
            zsa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize);
            zsa8d += primitives.sa8d_inter[cpart](fencYuv.m_buf[2], fencYuv.m_csize, tmpPredYuv.m_buf[2], tmpPredYuv.m_csize);
        }
        else
        {
            /* luma-only shortcut: average the two co-located reference blocks
             * directly instead of running full motion compensation */
            pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx);
            pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx);
            intptr_t refStride = m_slice->m_mref[0][0].lumaStride;

            primitives.pixelavg_pp[partEnum](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
            zsa8d = primitives.sa8d[partEnum](fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
        }

        /* re-price each list's bits with a zero MVD instead of the searched MVD */
        uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
        uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
        uint32_t zcost = zsa8d + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);

        /* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */
        checkBestMVP(inter2Nx2N.amvpCand[0][ref0], mvzero, mvp0, mvpIdx0, bits0, zcost);
        checkBestMVP(inter2Nx2N.amvpCand[1][ref1], mvzero, mvp1, mvpIdx1, bits1, zcost);

        uint32_t zbits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);
        zcost = zsa8d + m_rdCost.getCost(zbits);

        if (zcost < bidir2Nx2N.sa8dCost)
        {
            /* zero-MV candidate wins; commit it to the CU */
            bidir2Nx2N.sa8dBits = zbits;
            bidir2Nx2N.sa8dCost = zcost;

            cu.setPUMv(0, mvzero, 0, 0);
            cu.m_mvd[0][0] = mvzero - mvp0;
            cu.m_mvpIdx[0][0] = (uint8_t)mvpIdx0;

            cu.setPUMv(1, mvzero, 0, 0);
            cu.m_mvd[1][0] = mvzero - mvp1;
            cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;

            if (m_bChromaSa8d)
                /* real MC was already performed */
                bidir2Nx2N.predYuv.copyFromYuv(tmpPredYuv);
            else
            {
                prepMotionCompensation(cu, cuGeom, 0);
                motionCompensation(bidir2Nx2N.predYuv, true, true);
            }
        }
        else if (m_bChromaSa8d)
        {
            /* recover overwritten motion vectors */
            cu.m_mv[0][0] = bestME[0].mv;
            cu.m_mv[1][0] = bestME[1].mv;
        }
    }
}
1671 | ||
/* Recursively descend the coded CU quadtree and, for each leaf, perform the
 * residual transform/quant and write reconstructed pixels directly into the
 * frame's recon picture. Used by low rd-levels where recon was deferred
 * during mode decision. */
void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
{
    /* recurse into sub-CUs until we reach the depth the CTU was coded at */
    if (cuGeom.depth < ctu.m_cuDepth[cuGeom.encodeIdx] && cuGeom.depth < g_maxCUDepth)
    {
        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
        {
            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
            if (childGeom.flags & CUGeom::PRESENT)
                encodeResidue(ctu, childGeom);
        }
        return;
    }

    uint32_t absPartIdx = cuGeom.encodeIdx;
    int sizeIdx = cuGeom.log2CUSize - 2;

    /* reuse the bestMode data structures at the current depth */
    Mode *bestMode = m_modeDepth[cuGeom.depth].bestMode;
    CUData& cu = bestMode->cu;

    cu.copyFromPic(ctu, cuGeom);
    m_quant.setQPforQuant(cu);

    /* sub-CU depths need the source pixels copied down from the CTU-level
     * fenc buffer; depth 0 already is that buffer */
    Yuv& fencYuv = m_modeDepth[cuGeom.depth].fencYuv;
    if (cuGeom.depth)
        m_modeDepth[0].fencYuv.copyPartToYuv(fencYuv, absPartIdx);
    X265_CHECK(bestMode->fencYuv == &fencYuv, "invalid fencYuv\n");

    if (cu.isIntra(0))
    {
        uint32_t tuDepthRange[2];
        cu.getIntraTUQtDepthRange(tuDepthRange, 0);

        /* NxN intra partitions start one TU level deeper than 2Nx2N */
        uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N;
        residualTransformQuantIntra(*bestMode, cuGeom, initTuDepth, 0, tuDepthRange);
        getBestIntraModeChroma(*bestMode, cuGeom);
        residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
    }
    else // if (cu.isInter(0))
    {
        X265_CHECK(!ctu.isSkipped(absPartIdx), "skip not expected prior to transform\n");

        /* Calculate residual for current CU part into depth sized resiYuv */

        ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;

        /* at RD 0, the prediction pixels are accumulated into the top depth predYuv */
        Yuv& predYuv = m_modeDepth[0].bestMode->predYuv;
        pixel* predY = predYuv.getLumaAddr(absPartIdx);
        pixel* predU = predYuv.getCbAddr(absPartIdx);
        pixel* predV = predYuv.getCrAddr(absPartIdx);

        /* residual = source - prediction, per plane */
        primitives.luma_sub_ps[sizeIdx](resiYuv.m_buf[0], resiYuv.m_size,
                                        fencYuv.m_buf[0], predY,
                                        fencYuv.m_size, predYuv.m_size);

        primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[1], resiYuv.m_csize,
                                                 fencYuv.m_buf[1], predU,
                                                 fencYuv.m_csize, predYuv.m_csize);

        primitives.chroma[m_csp].sub_ps[sizeIdx](resiYuv.m_buf[2], resiYuv.m_csize,
                                                 fencYuv.m_buf[2], predV,
                                                 fencYuv.m_csize, predYuv.m_csize);

        uint32_t tuDepthRange[2];
        cu.getInterTUQtDepthRange(tuDepthRange, 0);

        residualTransformQuantInter(*bestMode, cuGeom, 0, cuGeom.depth, tuDepthRange);

        /* a 2Nx2N merge with no coded residual is promoted to a skip CU */
        if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
            cu.setPredModeSubParts(MODE_SKIP);

        /* residualTransformQuantInter() wrote transformed residual back into
         * resiYuv. Generate the recon pixels by adding it to the prediction */

        PicYuv& reconPic = *m_frame->m_reconPic;
        /* per plane: add residual when that plane has coded coefficients,
         * otherwise recon is just the prediction */
        if (cu.m_cbf[0][0])
            primitives.luma_add_ps[sizeIdx](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
                                            predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
        else
            primitives.luma_copy_pp[sizeIdx](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
                                             predY, predYuv.m_size);

        if (cu.m_cbf[1][0])
            primitives.chroma[m_csp].add_ps[sizeIdx](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                     predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
        else
            primitives.chroma[m_csp].copy_pp[sizeIdx](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                      predU, predYuv.m_csize);

        if (cu.m_cbf[2][0])
            primitives.chroma[m_csp].add_ps[sizeIdx](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                     predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
        else
            primitives.chroma[m_csp].copy_pp[sizeIdx](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                      predV, predYuv.m_csize);
    }

    checkDQP(cu, cuGeom);
    cu.updatePic(cuGeom.depth);
}
1773 | ||
72b9787e JB |
1774 | void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth) |
1775 | { | |
1776 | if (m_param->rdLevel >= 3) | |
1777 | { | |
1778 | /* code the split flag (0 or 1) and update bit costs */ | |
1779 | mode.contexts.resetBits(); | |
1780 | mode.contexts.codeSplitFlag(mode.cu, 0, depth); | |
1781 | uint32_t bits = mode.contexts.getNumberOfWrittenBits(); | |
1782 | mode.mvBits += bits; | |
1783 | mode.totalBits += bits; | |
1784 | updateModeCost(mode); | |
1785 | } | |
1786 | else if (m_param->rdLevel <= 1) | |
1787 | { | |
1788 | mode.sa8dBits++; | |
1789 | mode.sa8dCost = m_rdCost.calcRdSADCost(mode.distortion, mode.sa8dBits); | |
1790 | } | |
1791 | else | |
1792 | { | |
1793 | mode.mvBits++; | |
1794 | mode.totalBits++; | |
1795 | updateModeCost(mode); | |
1796 | } | |
1797 | } | |
1798 | ||
1799 | void Analysis::checkDQP(CUData& cu, const CUGeom& cuGeom) | |
1800 | { | |
1801 | if (m_slice->m_pps->bUseDQP && cuGeom.depth <= m_slice->m_pps->maxCuDQPDepth) | |
1802 | { | |
1803 | if (cu.m_cuDepth[0] > cuGeom.depth) // detect splits | |
1804 | { | |
1805 | bool hasResidual = false; | |
1806 | for (uint32_t absPartIdx = 0; absPartIdx < cu.m_numPartitions; absPartIdx++) | |
1807 | { | |
1808 | if (cu.getQtRootCbf(absPartIdx)) | |
1809 | { | |
1810 | hasResidual = true; | |
1811 | break; | |
1812 | } | |
1813 | } | |
1814 | if (hasResidual) | |
1815 | cu.setQPSubCUs(cu.getRefQP(0), 0, cuGeom.depth); | |
1816 | else | |
1817 | cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth); | |
1818 | } | |
1819 | else | |
1820 | { | |
1821 | if (!cu.getCbf(0, TEXT_LUMA, 0) && !cu.getCbf(0, TEXT_CHROMA_U, 0) && !cu.getCbf(0, TEXT_CHROMA_V, 0)) | |
1822 | cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth); | |
1823 | } | |
1824 | } | |
1825 | } | |
1826 | ||
/* Derive a minimum analysis depth from the co-located CTUs of the first L0/L1
 * reference pictures: do not attempt to code a block larger than the largest
 * block in those CTUs. Returns 0 (no constraint) when there are no references
 * or a co-located CTU is coded entirely at depth 0. */
uint32_t Analysis::topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom)
{
    /* Do not attempt to code a block larger than the largest block in the
     * co-located CTUs in L0 and L1 */
    int currentQP = parentCTU.m_qp[0];
    int previousQP = currentQP;
    uint32_t minDepth0 = 4, minDepth1 = 4;
    uint32_t sum = 0;   /* accumulated depth over sampled partitions, both lists */
    int numRefs = 0;
    if (m_slice->m_numRefIdx[0])
    {
        numRefs++;
        const CUData& cu = *m_slice->m_refPicList[0][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
        previousQP = cu.m_qp[0];
        if (!cu.m_cuDepth[cuGeom.encodeIdx])
            return 0;
        /* sample every 4th partition; stop early once depth 0 is seen */
        for (uint32_t i = 0; i < cuGeom.numPartitions && minDepth0; i += 4)
        {
            uint32_t d = cu.m_cuDepth[cuGeom.encodeIdx + i];
            minDepth0 = X265_MIN(d, minDepth0);
            sum += d;
        }
    }
    if (m_slice->m_numRefIdx[1])
    {
        numRefs++;
        const CUData& cu = *m_slice->m_refPicList[1][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
        if (!cu.m_cuDepth[cuGeom.encodeIdx])
            return 0;
        /* NOTE(review): unlike the L0 loop above, this loop has no
         * `&& minDepth1` early-exit, so `sum` gets the full L1 contribution
         * but possibly a truncated L0 one — confirm the asymmetry is intended */
        for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
        {
            uint32_t d = cu.m_cuDepth[cuGeom.encodeIdx + i];
            minDepth1 = X265_MIN(d, minDepth1);
            sum += d;
        }
    }
    if (!numRefs)
        return 0;

    uint32_t minDepth = X265_MIN(minDepth0, minDepth1);
    /* average-depth threshold: minDepth per sampled partition per reference */
    uint32_t thresh = minDepth * numRefs * (cuGeom.numPartitions >> 2);

    /* allow block size growth if QP is raising or avg depth is
     * less than 1.5 of min depth */
    if (minDepth && currentQP >= previousQP && (sum <= thresh + (thresh >> 1)))
        minDepth -= 1;

    return minDepth;
}
1876 | ||
1877 | /* returns true if recursion should be stopped */ | |
1878 | bool Analysis::recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode) | |
1879 | { | |
1880 | /* early exit when the RD cost of best mode at depth n is less than the sum | |
1881 | * of average of RD cost of the neighbor CU's(above, aboveleft, aboveright, | |
1882 | * left, colocated) and avg cost of that CU at depth "n" with weightage for | |
1883 | * each quantity */ | |
1884 | ||
1885 | uint32_t depth = cuGeom.depth; | |
b53f7c52 | 1886 | FrameData& curEncData = *m_frame->m_encData; |
72b9787e JB |
1887 | FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr]; |
1888 | uint64_t cuCost = cuStat.avgCost[depth] * cuStat.count[depth]; | |
1889 | uint64_t cuCount = cuStat.count[depth]; | |
1890 | ||
1891 | uint64_t neighCost = 0, neighCount = 0; | |
1892 | const CUData* above = parentCTU.m_cuAbove; | |
1893 | if (above) | |
1894 | { | |
1895 | FrameData::RCStatCU& astat = curEncData.m_cuStat[above->m_cuAddr]; | |
1896 | neighCost += astat.avgCost[depth] * astat.count[depth]; | |
1897 | neighCount += astat.count[depth]; | |
1898 | ||
1899 | const CUData* aboveLeft = parentCTU.m_cuAboveLeft; | |
1900 | if (aboveLeft) | |
1901 | { | |
1902 | FrameData::RCStatCU& lstat = curEncData.m_cuStat[aboveLeft->m_cuAddr]; | |
1903 | neighCost += lstat.avgCost[depth] * lstat.count[depth]; | |
1904 | neighCount += lstat.count[depth]; | |
1905 | } | |
1906 | ||
1907 | const CUData* aboveRight = parentCTU.m_cuAboveRight; | |
1908 | if (aboveRight) | |
1909 | { | |
1910 | FrameData::RCStatCU& rstat = curEncData.m_cuStat[aboveRight->m_cuAddr]; | |
1911 | neighCost += rstat.avgCost[depth] * rstat.count[depth]; | |
1912 | neighCount += rstat.count[depth]; | |
1913 | } | |
1914 | } | |
1915 | const CUData* left = parentCTU.m_cuLeft; | |
1916 | if (left) | |
1917 | { | |
1918 | FrameData::RCStatCU& nstat = curEncData.m_cuStat[left->m_cuAddr]; | |
1919 | neighCost += nstat.avgCost[depth] * nstat.count[depth]; | |
1920 | neighCount += nstat.count[depth]; | |
1921 | } | |
1922 | ||
1923 | // give 60% weight to all CU's and 40% weight to neighbour CU's | |
b53f7c52 | 1924 | if (neighCount + cuCount) |
72b9787e JB |
1925 | { |
1926 | uint64_t avgCost = ((3 * cuCost) + (2 * neighCost)) / ((3 * cuCount) + (2 * neighCount)); | |
1927 | uint64_t curCost = m_param->rdLevel > 1 ? bestMode.rdCost : bestMode.sa8dCost; | |
1928 | if (curCost < avgCost && avgCost) | |
1929 | return true; | |
1930 | } | |
1931 | ||
1932 | return false; | |
1933 | } |