Commit | Line | Data |
---|---|---|
72b9787e JB |
1 | /***************************************************************************** |
2 | * Copyright (C) 2013 x265 project | |
3 | * | |
4 | * Authors: Gopu Govindaswamy <gopu@multicorewareinc.com> | |
5 | * Steve Borho <steve@borho.org> | |
6 | * | |
7 | * This program is free software; you can redistribute it and/or modify | |
8 | * it under the terms of the GNU General Public License as published by | |
9 | * the Free Software Foundation; either version 2 of the License, or | |
10 | * (at your option) any later version. | |
11 | * | |
12 | * This program is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | * GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with this program; if not, write to the Free Software | |
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
20 | * | |
21 | * This program is also available under a commercial proprietary license. | |
22 | * For more information, contact us at license @ x265.com. | |
23 | *****************************************************************************/ | |
24 | ||
25 | #include "common.h" | |
26 | #include "frame.h" | |
27 | #include "framedata.h" | |
28 | #include "picyuv.h" | |
29 | #include "primitives.h" | |
30 | #include "lowres.h" | |
31 | #include "mv.h" | |
32 | ||
33 | #include "slicetype.h" | |
34 | #include "motion.h" | |
35 | #include "ratecontrol.h" | |
36 | ||
/* Count of lowres CUs: (width - 2) * (height - 2) when the lowres frame is
 * more than two CUs in both dimensions (presumably dropping a one-CU border),
 * otherwise width * height. Expands in place, so m_widthInCU and m_heightInCU
 * must be in scope at the point of use. */
#define NUM_CUS (m_widthInCU > 2 && m_heightInCU > 2 ? (m_widthInCU - 2) * (m_heightInCU - 2) : m_widthInCU * m_heightInCU)
38 | ||
39 | using namespace x265; | |
40 | ||
/* Returns the middle value of the three arguments. */
static inline int16_t median(int16_t a, int16_t b, int16_t c)
{
    /* order the first two operands */
    const int16_t lo = a < b ? a : b;
    const int16_t hi = a < b ? b : a;

    /* clamp the third operand into [lo, hi] */
    if (c < lo)
        return lo;
    if (c > hi)
        return hi;
    return c;
}
51 | ||
52 | static inline void median_mv(MV &dst, MV a, MV b, MV c) | |
53 | { | |
54 | dst.x = median(a.x, b.x, c.x); | |
55 | dst.y = median(a.y, b.y, c.y); | |
56 | } | |
57 | ||
58 | Lookahead::Lookahead(x265_param *param, ThreadPool* pool) | |
59 | : JobProvider(pool) | |
60 | , m_est(pool) | |
61 | { | |
62 | m_bReady = 0; | |
63 | m_param = param; | |
64 | m_lastKeyframe = -m_param->keyframeMax; | |
65 | m_lastNonB = NULL; | |
66 | m_bFilling = true; | |
67 | m_bFlushed = false; | |
68 | m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; | |
69 | m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; | |
70 | m_scratch = (int*)x265_malloc(m_widthInCU * sizeof(int)); | |
71 | memset(m_histogram, 0, sizeof(m_histogram)); | |
72 | } | |
73 | ||
74 | Lookahead::~Lookahead() { } | |
75 | ||
76 | void Lookahead::init() | |
77 | { | |
78 | if (m_pool && m_pool->getThreadCount() >= 4 && | |
79 | ((m_param->bFrameAdaptive && m_param->bframes) || | |
80 | m_param->rc.cuTree || m_param->scenecutThreshold || | |
81 | (m_param->lookaheadDepth && m_param->rc.vbvBufferSize))) | |
82 | m_pool = m_pool; /* allow use of worker thread */ | |
83 | else | |
84 | m_pool = NULL; /* disable use of worker thread */ | |
85 | } | |
86 | ||
87 | void Lookahead::destroy() | |
88 | { | |
89 | if (m_pool) | |
90 | // flush will dequeue, if it is necessary | |
91 | JobProvider::flush(); | |
92 | ||
93 | // these two queues will be empty unless the encode was aborted | |
94 | while (!m_inputQueue.empty()) | |
95 | { | |
96 | Frame* curFrame = m_inputQueue.popFront(); | |
97 | curFrame->destroy(); | |
98 | delete curFrame; | |
99 | } | |
100 | ||
101 | while (!m_outputQueue.empty()) | |
102 | { | |
103 | Frame* curFrame = m_outputQueue.popFront(); | |
104 | curFrame->destroy(); | |
105 | delete curFrame; | |
106 | } | |
107 | ||
108 | x265_free(m_scratch); | |
109 | } | |
110 | ||
111 | /* Called by API thread */ | |
112 | void Lookahead::addPicture(Frame *curFrame, int sliceType) | |
113 | { | |
114 | PicYuv *orig = curFrame->m_origPicYuv; | |
115 | ||
116 | curFrame->m_lowres.init(orig, curFrame->m_poc, sliceType); | |
117 | ||
118 | m_inputQueueLock.acquire(); | |
119 | m_inputQueue.pushBack(*curFrame); | |
120 | ||
121 | if (m_inputQueue.size() >= m_param->lookaheadDepth) | |
122 | { | |
123 | /* when queue fills the first time, run slicetypeDecide synchronously, | |
124 | * since the encoder will always be blocked here */ | |
125 | if (m_pool && !m_bFilling) | |
126 | { | |
127 | m_inputQueueLock.release(); | |
128 | m_bReady = 1; | |
129 | m_pool->pokeIdleThread(); | |
130 | } | |
131 | else | |
132 | slicetypeDecide(); | |
133 | ||
134 | if (m_bFilling && m_pool) | |
135 | JobProvider::enqueue(); | |
136 | m_bFilling = false; | |
137 | } | |
138 | else | |
139 | m_inputQueueLock.release(); | |
140 | } | |
141 | ||
142 | /* Called by API thread */ | |
143 | void Lookahead::flush() | |
144 | { | |
145 | /* just in case the input queue is never allowed to fill */ | |
146 | m_bFilling = false; | |
147 | ||
148 | /* flush synchronously */ | |
149 | m_inputQueueLock.acquire(); | |
150 | if (!m_inputQueue.empty()) | |
151 | { | |
152 | slicetypeDecide(); | |
153 | } | |
154 | else | |
155 | m_inputQueueLock.release(); | |
156 | ||
157 | m_inputQueueLock.acquire(); | |
158 | ||
159 | /* bFlushed indicates that an empty output queue actually means all frames | |
160 | * have been decided (no more inputs for the encoder) */ | |
161 | if (m_inputQueue.empty()) | |
162 | m_bFlushed = true; | |
163 | m_inputQueueLock.release(); | |
164 | } | |
165 | ||
166 | /* Called by API thread. If the lookahead queue has not yet been filled the | |
167 | * first time, it immediately returns NULL. Else the function blocks until | |
168 | * outputs are available and then pops the first frame from the output queue. If | |
169 | * flush() has been called and the output queue is empty, NULL is returned. */ | |
170 | Frame* Lookahead::getDecidedPicture() | |
171 | { | |
172 | m_outputQueueLock.acquire(); | |
173 | ||
174 | if (m_bFilling) | |
175 | { | |
176 | m_outputQueueLock.release(); | |
177 | return NULL; | |
178 | } | |
179 | ||
180 | while (m_outputQueue.empty() && !m_bFlushed) | |
181 | { | |
182 | m_outputQueueLock.release(); | |
183 | m_outputAvailable.wait(); | |
184 | m_outputQueueLock.acquire(); | |
185 | } | |
186 | ||
187 | Frame *fenc = m_outputQueue.popFront(); | |
188 | m_outputQueueLock.release(); | |
189 | return fenc; | |
190 | } | |
191 | ||
192 | /* Called by pool worker threads */ | |
193 | bool Lookahead::findJob(int) | |
194 | { | |
195 | if (m_bReady && ATOMIC_CAS32(&m_bReady, 1, 0) == 1) | |
196 | { | |
197 | m_inputQueueLock.acquire(); | |
198 | slicetypeDecide(); | |
199 | return true; | |
200 | } | |
201 | else | |
202 | return false; | |
203 | } | |
204 | ||
205 | /* Called by rate-control to calculate the estimated SATD cost for a given | |
206 | * picture. It assumes dpb->prepareEncode() has already been called for the | |
207 | * picture and all the references are established */ | |
208 | void Lookahead::getEstimatedPictureCost(Frame *curFrame) | |
209 | { | |
210 | Lowres *frames[X265_LOOKAHEAD_MAX]; | |
211 | ||
212 | // POC distances to each reference | |
213 | Slice *slice = curFrame->m_encData->m_slice; | |
214 | int p0 = 0, p1, b; | |
215 | int poc = slice->m_poc; | |
216 | int l0poc = slice->m_refPOCList[0][0]; | |
217 | int l1poc = slice->m_refPOCList[1][0]; | |
218 | ||
219 | switch (slice->m_sliceType) | |
220 | { | |
221 | case I_SLICE: | |
222 | frames[p0] = &curFrame->m_lowres; | |
223 | b = p1 = 0; | |
224 | break; | |
225 | ||
226 | case P_SLICE: | |
227 | b = p1 = poc - l0poc; | |
228 | frames[p0] = &slice->m_refPicList[0][0]->m_lowres; | |
229 | frames[b] = &curFrame->m_lowres; | |
230 | break; | |
231 | ||
232 | case B_SLICE: | |
233 | b = poc - l0poc; | |
234 | p1 = b + l1poc - poc; | |
235 | frames[p0] = &slice->m_refPicList[0][0]->m_lowres; | |
236 | frames[b] = &curFrame->m_lowres; | |
237 | frames[p1] = &slice->m_refPicList[1][0]->m_lowres; | |
238 | break; | |
239 | ||
240 | default: | |
241 | return; | |
242 | } | |
243 | ||
244 | if (m_param->rc.cuTree && !m_param->rc.bStatRead) | |
245 | /* update row satds based on cutree offsets */ | |
246 | curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b); | |
247 | else if (m_param->rc.aqMode) | |
248 | curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b]; | |
249 | else | |
250 | curFrame->m_lowres.satdCost = curFrame->m_lowres.costEst[b - p0][p1 - b]; | |
251 | ||
252 | if (m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate) | |
253 | { | |
254 | /* aggregate lowres row satds to CTU resolution */ | |
255 | curFrame->m_lowres.lowresCostForRc = curFrame->m_lowres.lowresCosts[b - p0][p1 - b]; | |
256 | uint32_t lowresRow = 0, lowresCol = 0, lowresCuIdx = 0, sum = 0; | |
257 | uint32_t scale = m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE); | |
258 | uint32_t numCuInHeight = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize; | |
259 | uint32_t widthInLowresCu = (uint32_t)m_widthInCU, heightInLowresCu = (uint32_t)m_heightInCU; | |
260 | double *qp_offset = 0; | |
261 | /* Factor in qpoffsets based on Aq/Cutree in CU costs */ | |
262 | if (m_param->rc.aqMode) | |
263 | qp_offset = (frames[b]->sliceType == X265_TYPE_B || !m_param->rc.cuTree) ? frames[b]->qpAqOffset : frames[b]->qpCuTreeOffset; | |
264 | ||
265 | for (uint32_t row = 0; row < numCuInHeight; row++) | |
266 | { | |
267 | lowresRow = row * scale; | |
268 | for (uint32_t cnt = 0; cnt < scale && lowresRow < heightInLowresCu; lowresRow++, cnt++) | |
269 | { | |
270 | sum = 0; | |
271 | lowresCuIdx = lowresRow * widthInLowresCu; | |
272 | for (lowresCol = 0; lowresCol < widthInLowresCu; lowresCol++, lowresCuIdx++) | |
273 | { | |
274 | uint16_t lowresCuCost = curFrame->m_lowres.lowresCostForRc[lowresCuIdx] & LOWRES_COST_MASK; | |
275 | if (qp_offset) | |
276 | { | |
277 | lowresCuCost = (uint16_t)((lowresCuCost * x265_exp2fix8(qp_offset[lowresCuIdx]) + 128) >> 8); | |
278 | int32_t intraCuCost = curFrame->m_lowres.intraCost[lowresCuIdx]; | |
279 | curFrame->m_lowres.intraCost[lowresCuIdx] = (intraCuCost * x265_exp2fix8(qp_offset[lowresCuIdx]) + 128) >> 8; | |
280 | } | |
281 | curFrame->m_lowres.lowresCostForRc[lowresCuIdx] = lowresCuCost; | |
282 | sum += lowresCuCost; | |
283 | } | |
284 | curFrame->m_encData->m_rowStat[row].satdForVbv += sum; | |
285 | } | |
286 | } | |
287 | } | |
288 | } | |
289 | ||
290 | /* called by API thread or worker thread with inputQueueLock acquired */ | |
291 | void Lookahead::slicetypeDecide() | |
292 | { | |
293 | ScopedLock lock(m_decideLock); | |
294 | ||
295 | Lowres *frames[X265_LOOKAHEAD_MAX]; | |
296 | Frame *list[X265_LOOKAHEAD_MAX]; | |
297 | int maxSearch = X265_MIN(m_param->lookaheadDepth, X265_LOOKAHEAD_MAX); | |
298 | ||
299 | memset(frames, 0, sizeof(frames)); | |
300 | memset(list, 0, sizeof(list)); | |
301 | { | |
302 | Frame *curFrame = m_inputQueue.first(); | |
303 | int j; | |
304 | for (j = 0; j < m_param->bframes + 2; j++) | |
305 | { | |
306 | if (!curFrame) break; | |
307 | list[j] = curFrame; | |
308 | curFrame = curFrame->m_next; | |
309 | } | |
310 | ||
311 | curFrame = m_inputQueue.first(); | |
312 | frames[0] = m_lastNonB; | |
313 | for (j = 0; j < maxSearch; j++) | |
314 | { | |
315 | if (!curFrame) break; | |
316 | frames[j + 1] = &curFrame->m_lowres; | |
317 | curFrame = curFrame->m_next; | |
318 | } | |
319 | ||
320 | maxSearch = j; | |
321 | } | |
322 | ||
323 | m_inputQueueLock.release(); | |
324 | ||
325 | if (!m_est.m_rows && list[0]) | |
326 | m_est.init(m_param, list[0]); | |
327 | ||
328 | if (m_lastNonB && !m_param->rc.bStatRead && | |
329 | ((m_param->bFrameAdaptive && m_param->bframes) || | |
330 | m_param->rc.cuTree || m_param->scenecutThreshold || | |
331 | (m_param->lookaheadDepth && m_param->rc.vbvBufferSize))) | |
332 | { | |
333 | slicetypeAnalyse(frames, false); | |
334 | } | |
335 | ||
336 | int bframes, brefs; | |
337 | for (bframes = 0, brefs = 0;; bframes++) | |
338 | { | |
339 | Lowres& frm = list[bframes]->m_lowres; | |
340 | ||
341 | if (frm.sliceType == X265_TYPE_BREF && !m_param->bBPyramid && brefs == m_param->bBPyramid) | |
342 | { | |
343 | frm.sliceType = X265_TYPE_B; | |
344 | x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid\n", | |
345 | frm.frameNum); | |
346 | } | |
347 | ||
348 | /* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available. | |
349 | smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it.*/ | |
350 | else if (frm.sliceType == X265_TYPE_BREF && m_param->bBPyramid && brefs && | |
351 | m_param->maxNumReferences <= (brefs + 3)) | |
352 | { | |
353 | frm.sliceType = X265_TYPE_B; | |
354 | x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid and %d reference frames\n", | |
355 | frm.sliceType, m_param->maxNumReferences); | |
356 | } | |
357 | ||
358 | if ( /*(!param->intraRefresh || frm.frameNum == 0) && */ frm.frameNum - m_lastKeyframe >= m_param->keyframeMax) | |
359 | { | |
360 | if (frm.sliceType == X265_TYPE_AUTO || frm.sliceType == X265_TYPE_I) | |
361 | frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR; | |
362 | bool warn = frm.sliceType != X265_TYPE_IDR; | |
363 | if (warn && m_param->bOpenGOP) | |
364 | warn &= frm.sliceType != X265_TYPE_I; | |
365 | if (warn) | |
366 | { | |
367 | x265_log(m_param, X265_LOG_WARNING, "specified frame type (%d) at %d is not compatible with keyframe interval\n", | |
368 | frm.sliceType, frm.frameNum); | |
369 | frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR; | |
370 | } | |
371 | } | |
372 | if (frm.sliceType == X265_TYPE_I && frm.frameNum - m_lastKeyframe >= m_param->keyframeMin) | |
373 | { | |
374 | if (m_param->bOpenGOP) | |
375 | { | |
376 | m_lastKeyframe = frm.frameNum; | |
377 | frm.bKeyframe = true; | |
378 | } | |
379 | else | |
380 | frm.sliceType = X265_TYPE_IDR; | |
381 | } | |
382 | if (frm.sliceType == X265_TYPE_IDR) | |
383 | { | |
384 | /* Closed GOP */ | |
385 | m_lastKeyframe = frm.frameNum; | |
386 | frm.bKeyframe = true; | |
387 | if (bframes > 0) | |
388 | { | |
389 | list[bframes - 1]->m_lowres.sliceType = X265_TYPE_P; | |
390 | bframes--; | |
391 | } | |
392 | } | |
393 | if (bframes == m_param->bframes || !list[bframes + 1]) | |
394 | { | |
395 | if (IS_X265_TYPE_B(frm.sliceType)) | |
396 | x265_log(m_param, X265_LOG_WARNING, "specified frame type is not compatible with max B-frames\n"); | |
397 | if (frm.sliceType == X265_TYPE_AUTO || IS_X265_TYPE_B(frm.sliceType)) | |
398 | frm.sliceType = X265_TYPE_P; | |
399 | } | |
400 | if (frm.sliceType == X265_TYPE_BREF) | |
401 | brefs++; | |
402 | if (frm.sliceType == X265_TYPE_AUTO) | |
403 | frm.sliceType = X265_TYPE_B; | |
404 | else if (!IS_X265_TYPE_B(frm.sliceType)) | |
405 | break; | |
406 | } | |
407 | ||
408 | if (bframes) | |
409 | list[bframes - 1]->m_lowres.bLastMiniGopBFrame = true; | |
410 | list[bframes]->m_lowres.leadingBframes = bframes; | |
411 | m_lastNonB = &list[bframes]->m_lowres; | |
412 | m_histogram[bframes]++; | |
413 | ||
414 | /* insert a bref into the sequence */ | |
415 | if (m_param->bBPyramid && bframes > 1 && !brefs) | |
416 | { | |
417 | list[bframes / 2]->m_lowres.sliceType = X265_TYPE_BREF; | |
418 | brefs++; | |
419 | } | |
420 | ||
421 | /* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */ | |
422 | if (m_param->rc.rateControlMode != X265_RC_CQP) | |
423 | { | |
424 | int p0, p1, b; | |
425 | /* For zero latency tuning, calculate frame cost to be used later in RC */ | |
426 | if (!maxSearch) | |
427 | { | |
428 | for (int i = 0; i <= bframes; i++) | |
429 | frames[i + 1] = &list[i]->m_lowres; | |
430 | } | |
431 | ||
432 | /* estimate new non-B cost */ | |
433 | p1 = b = bframes + 1; | |
434 | p0 = (IS_X265_TYPE_I(frames[bframes + 1]->sliceType)) ? b : 0; | |
435 | m_est.estimateFrameCost(frames, p0, p1, b, 0); | |
436 | ||
437 | if (bframes) | |
438 | { | |
439 | p0 = 0; // last nonb | |
440 | for (b = 1; b <= bframes; b++) | |
441 | { | |
442 | if (frames[b]->sliceType == X265_TYPE_B) | |
443 | for (p1 = b; frames[p1]->sliceType == X265_TYPE_B; p1++) | |
444 | ; // find new nonb or bref | |
445 | else | |
446 | p1 = bframes + 1; | |
447 | ||
448 | m_est.estimateFrameCost(frames, p0, p1, b, 0); | |
449 | ||
450 | if (frames[b]->sliceType == X265_TYPE_BREF) | |
451 | p0 = b; | |
452 | } | |
453 | } | |
454 | } | |
455 | ||
456 | m_inputQueueLock.acquire(); | |
457 | ||
458 | /* dequeue all frames from inputQueue that are about to be enqueued | |
459 | * in the output queue. The order is important because Frame can | |
460 | * only be in one list at a time */ | |
461 | int64_t pts[X265_BFRAME_MAX + 1]; | |
462 | for (int i = 0; i <= bframes; i++) | |
463 | { | |
464 | Frame *curFrame; | |
465 | curFrame = m_inputQueue.popFront(); | |
466 | pts[i] = curFrame->m_pts; | |
467 | maxSearch--; | |
468 | } | |
469 | ||
470 | m_inputQueueLock.release(); | |
471 | ||
472 | m_outputQueueLock.acquire(); | |
473 | /* add non-B to output queue */ | |
474 | int idx = 0; | |
475 | list[bframes]->m_reorderedPts = pts[idx++]; | |
476 | m_outputQueue.pushBack(*list[bframes]); | |
477 | ||
478 | /* Add B-ref frame next to P frame in output queue, the B-ref encode before non B-ref frame */ | |
479 | if (bframes > 1 && m_param->bBPyramid) | |
480 | { | |
481 | for (int i = 0; i < bframes; i++) | |
482 | { | |
483 | if (list[i]->m_lowres.sliceType == X265_TYPE_BREF) | |
484 | { | |
485 | list[i]->m_reorderedPts = pts[idx++]; | |
486 | m_outputQueue.pushBack(*list[i]); | |
487 | } | |
488 | } | |
489 | } | |
490 | ||
491 | /* add B frames to output queue */ | |
492 | for (int i = 0; i < bframes; i++) | |
493 | { | |
494 | /* push all the B frames into output queue except B-ref, which already pushed into output queue*/ | |
495 | if (list[i]->m_lowres.sliceType != X265_TYPE_BREF) | |
496 | { | |
497 | list[i]->m_reorderedPts = pts[idx++]; | |
498 | m_outputQueue.pushBack(*list[i]); | |
499 | } | |
500 | } | |
501 | ||
502 | bool isKeyFrameAnalyse = (m_param->rc.cuTree || (m_param->rc.vbvBufferSize && m_param->lookaheadDepth)) && !m_param->rc.bStatRead; | |
503 | if (isKeyFrameAnalyse && IS_X265_TYPE_I(m_lastNonB->sliceType)) | |
504 | { | |
505 | m_inputQueueLock.acquire(); | |
506 | Frame *curFrame = m_inputQueue.first(); | |
507 | frames[0] = m_lastNonB; | |
508 | int j; | |
509 | for (j = 0; j < maxSearch; j++) | |
510 | { | |
511 | frames[j + 1] = &curFrame->m_lowres; | |
512 | curFrame = curFrame->m_next; | |
513 | } | |
514 | ||
515 | frames[j + 1] = NULL; | |
516 | m_inputQueueLock.release(); | |
517 | slicetypeAnalyse(frames, true); | |
518 | } | |
519 | ||
520 | m_outputQueueLock.release(); | |
521 | m_outputAvailable.trigger(); | |
522 | } | |
523 | ||
524 | void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe) | |
525 | { | |
526 | int prevNonB = 0, curNonB = 1, idx = 0; | |
527 | bool isNextNonB = false; | |
528 | ||
529 | while (curNonB < numFrames && frames[curNonB]->sliceType == X265_TYPE_B) | |
530 | curNonB++; | |
531 | ||
532 | int nextNonB = keyframe ? prevNonB : curNonB; | |
533 | int nextB = keyframe ? prevNonB + 1 : curNonB + 1; | |
534 | ||
535 | while (curNonB < numFrames + !keyframe) | |
536 | { | |
537 | /* P/I cost: This shouldn't include the cost of nextNonB */ | |
538 | if (nextNonB != curNonB) | |
539 | { | |
540 | int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB; | |
541 | frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, p0, curNonB, curNonB); | |
542 | frames[nextNonB]->plannedType[idx] = frames[curNonB]->sliceType; | |
543 | idx++; | |
544 | } | |
545 | /* Handle the B-frames: coded order */ | |
546 | for (int i = prevNonB + 1; i < curNonB; i++, idx++) | |
547 | { | |
548 | frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, prevNonB, curNonB, i); | |
549 | frames[nextNonB]->plannedType[idx] = X265_TYPE_B; | |
550 | } | |
551 | ||
552 | for (int i = nextB; i <= curNonB; i++) | |
553 | { | |
554 | for (int j = frames[i]->indB + i + 1; j <= curNonB; j++, frames[i]->indB++) | |
555 | { | |
556 | if (j == curNonB) | |
557 | { | |
558 | if (isNextNonB) | |
559 | { | |
560 | int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB; | |
561 | frames[i]->plannedSatd[frames[i]->indB] = vbvFrameCost(frames, p0, curNonB, curNonB); | |
562 | frames[i]->plannedType[frames[i]->indB] = frames[curNonB]->sliceType; | |
563 | } | |
564 | } | |
565 | else | |
566 | { | |
567 | frames[i]->plannedSatd[frames[i]->indB] = vbvFrameCost(frames, prevNonB, curNonB, j); | |
568 | frames[i]->plannedType[frames[i]->indB] = X265_TYPE_B; | |
569 | } | |
570 | } | |
571 | if (i == curNonB && !isNextNonB) | |
572 | isNextNonB = true; | |
573 | } | |
574 | ||
575 | prevNonB = curNonB; | |
576 | curNonB++; | |
577 | while (curNonB <= numFrames && frames[curNonB]->sliceType == X265_TYPE_B) | |
578 | curNonB++; | |
579 | } | |
580 | ||
581 | frames[nextNonB]->plannedType[idx] = X265_TYPE_AUTO; | |
582 | } | |
583 | ||
584 | int64_t Lookahead::vbvFrameCost(Lowres **frames, int p0, int p1, int b) | |
585 | { | |
586 | int64_t cost = m_est.estimateFrameCost(frames, p0, p1, b, 0); | |
587 | ||
588 | if (m_param->rc.aqMode) | |
589 | { | |
590 | if (m_param->rc.cuTree) | |
591 | return frameCostRecalculate(frames, p0, p1, b); | |
592 | else | |
593 | return frames[b]->costEstAq[b - p0][p1 - b]; | |
594 | } | |
595 | return cost; | |
596 | } | |
597 | ||
598 | void Lookahead::slicetypeAnalyse(Lowres **frames, bool bKeyframe) | |
599 | { | |
600 | int numFrames, origNumFrames, keyintLimit, framecnt; | |
601 | int maxSearch = X265_MIN(m_param->lookaheadDepth, X265_LOOKAHEAD_MAX); | |
602 | int cuCount = NUM_CUS; | |
603 | int resetStart; | |
604 | bool bIsVbvLookahead = m_param->rc.vbvBufferSize && m_param->lookaheadDepth; | |
605 | ||
606 | /* count undecided frames */ | |
607 | for (framecnt = 0; framecnt < maxSearch; framecnt++) | |
608 | { | |
609 | Lowres *fenc = frames[framecnt + 1]; | |
610 | if (!fenc || fenc->sliceType != X265_TYPE_AUTO) | |
611 | break; | |
612 | } | |
613 | ||
614 | if (!framecnt) | |
615 | { | |
616 | if (m_param->rc.cuTree) | |
617 | cuTree(frames, 0, bKeyframe); | |
618 | return; | |
619 | } | |
620 | ||
621 | frames[framecnt + 1] = NULL; | |
622 | ||
623 | keyintLimit = m_param->keyframeMax - frames[0]->frameNum + m_lastKeyframe - 1; | |
624 | origNumFrames = numFrames = X265_MIN(framecnt, keyintLimit); | |
625 | ||
626 | if (bIsVbvLookahead) | |
627 | numFrames = framecnt; | |
628 | else if (m_param->bOpenGOP && numFrames < framecnt) | |
629 | numFrames++; | |
630 | else if (numFrames == 0) | |
631 | { | |
632 | frames[1]->sliceType = X265_TYPE_I; | |
633 | return; | |
634 | } | |
635 | ||
636 | int numBFrames = 0; | |
637 | int numAnalyzed = numFrames; | |
638 | if (m_param->scenecutThreshold && scenecut(frames, 0, 1, true, origNumFrames, maxSearch)) | |
639 | { | |
640 | frames[1]->sliceType = X265_TYPE_I; | |
641 | return; | |
642 | } | |
643 | ||
644 | if (m_param->bframes) | |
645 | { | |
646 | if (m_param->bFrameAdaptive == X265_B_ADAPT_TRELLIS) | |
647 | { | |
648 | if (numFrames > 1) | |
649 | { | |
650 | char best_paths[X265_BFRAME_MAX + 1][X265_LOOKAHEAD_MAX + 1] = { "", "P" }; | |
651 | int best_path_index = numFrames % (X265_BFRAME_MAX + 1); | |
652 | ||
653 | /* Perform the frametype analysis. */ | |
654 | for (int j = 2; j <= numFrames; j++) | |
655 | { | |
656 | slicetypePath(frames, j, best_paths); | |
657 | } | |
658 | ||
659 | numBFrames = (int)strspn(best_paths[best_path_index], "B"); | |
660 | ||
661 | /* Load the results of the analysis into the frame types. */ | |
662 | for (int j = 1; j < numFrames; j++) | |
663 | { | |
664 | frames[j]->sliceType = best_paths[best_path_index][j - 1] == 'B' ? X265_TYPE_B : X265_TYPE_P; | |
665 | } | |
666 | } | |
667 | frames[numFrames]->sliceType = X265_TYPE_P; | |
668 | } | |
669 | else if (m_param->bFrameAdaptive == X265_B_ADAPT_FAST) | |
670 | { | |
671 | int64_t cost1p0, cost2p0, cost1b1, cost2p1; | |
672 | ||
673 | for (int i = 0; i <= numFrames - 2; ) | |
674 | { | |
675 | cost2p1 = m_est.estimateFrameCost(frames, i + 0, i + 2, i + 2, 1); | |
676 | if (frames[i + 2]->intraMbs[2] > cuCount / 2) | |
677 | { | |
678 | frames[i + 1]->sliceType = X265_TYPE_P; | |
679 | frames[i + 2]->sliceType = X265_TYPE_P; | |
680 | i += 2; | |
681 | continue; | |
682 | } | |
683 | ||
684 | cost1b1 = m_est.estimateFrameCost(frames, i + 0, i + 2, i + 1, 0); | |
685 | cost1p0 = m_est.estimateFrameCost(frames, i + 0, i + 1, i + 1, 0); | |
686 | cost2p0 = m_est.estimateFrameCost(frames, i + 1, i + 2, i + 2, 0); | |
687 | ||
688 | if (cost1p0 + cost2p0 < cost1b1 + cost2p1) | |
689 | { | |
690 | frames[i + 1]->sliceType = X265_TYPE_P; | |
691 | i += 1; | |
692 | continue; | |
693 | } | |
694 | ||
695 | // arbitrary and untuned | |
696 | #define INTER_THRESH 300 | |
697 | #define P_SENS_BIAS (50 - m_param->bFrameBias) | |
698 | frames[i + 1]->sliceType = X265_TYPE_B; | |
699 | ||
700 | int j; | |
701 | for (j = i + 2; j <= X265_MIN(i + m_param->bframes, numFrames - 1); j++) | |
702 | { | |
703 | int64_t pthresh = X265_MAX(INTER_THRESH - P_SENS_BIAS * (j - i - 1), INTER_THRESH / 10); | |
704 | int64_t pcost = m_est.estimateFrameCost(frames, i + 0, j + 1, j + 1, 1); | |
705 | if (pcost > pthresh * cuCount || frames[j + 1]->intraMbs[j - i + 1] > cuCount / 3) | |
706 | break; | |
707 | frames[j]->sliceType = X265_TYPE_B; | |
708 | } | |
709 | ||
710 | frames[j]->sliceType = X265_TYPE_P; | |
711 | i = j; | |
712 | } | |
713 | frames[numFrames]->sliceType = X265_TYPE_P; | |
714 | numBFrames = 0; | |
715 | while (numBFrames < numFrames && frames[numBFrames + 1]->sliceType == X265_TYPE_B) | |
716 | { | |
717 | numBFrames++; | |
718 | } | |
719 | } | |
720 | else | |
721 | { | |
722 | numBFrames = X265_MIN(numFrames - 1, m_param->bframes); | |
723 | for (int j = 1; j < numFrames; j++) | |
724 | { | |
725 | frames[j]->sliceType = (j % (numBFrames + 1)) ? X265_TYPE_B : X265_TYPE_P; | |
726 | } | |
727 | ||
728 | frames[numFrames]->sliceType = X265_TYPE_P; | |
729 | } | |
730 | /* Check scenecut on the first minigop. */ | |
731 | for (int j = 1; j < numBFrames + 1; j++) | |
732 | { | |
733 | if (m_param->scenecutThreshold && scenecut(frames, j, j + 1, false, origNumFrames, maxSearch)) | |
734 | { | |
735 | frames[j]->sliceType = X265_TYPE_P; | |
736 | numAnalyzed = j; | |
737 | break; | |
738 | } | |
739 | } | |
740 | ||
741 | resetStart = bKeyframe ? 1 : X265_MIN(numBFrames + 2, numAnalyzed + 1); | |
742 | } | |
743 | else | |
744 | { | |
745 | for (int j = 1; j <= numFrames; j++) | |
746 | { | |
747 | frames[j]->sliceType = X265_TYPE_P; | |
748 | } | |
749 | ||
750 | resetStart = bKeyframe ? 1 : 2; | |
751 | } | |
752 | ||
753 | if (m_param->rc.cuTree) | |
754 | cuTree(frames, X265_MIN(numFrames, m_param->keyframeMax), bKeyframe); | |
755 | ||
756 | // if (!param->bIntraRefresh) | |
757 | for (int j = keyintLimit + 1; j <= numFrames; j += m_param->keyframeMax) | |
758 | { | |
759 | frames[j]->sliceType = X265_TYPE_I; | |
760 | resetStart = X265_MIN(resetStart, j + 1); | |
761 | } | |
762 | ||
763 | if (bIsVbvLookahead) | |
764 | vbvLookahead(frames, numFrames, bKeyframe); | |
765 | ||
766 | /* Restore frametypes for all frames that haven't actually been decided yet. */ | |
767 | for (int j = resetStart; j <= numFrames; j++) | |
768 | { | |
769 | frames[j]->sliceType = X265_TYPE_AUTO; | |
770 | } | |
771 | } | |
772 | ||
773 | bool Lookahead::scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames, int maxSearch) | |
774 | { | |
775 | /* Only do analysis during a normal scenecut check. */ | |
776 | if (bRealScenecut && m_param->bframes) | |
777 | { | |
778 | int origmaxp1 = p0 + 1; | |
779 | /* Look ahead to avoid coding short flashes as scenecuts. */ | |
780 | if (m_param->bFrameAdaptive == X265_B_ADAPT_TRELLIS) | |
781 | /* Don't analyse any more frames than the trellis would have covered. */ | |
782 | origmaxp1 += m_param->bframes; | |
783 | else | |
784 | origmaxp1++; | |
785 | int maxp1 = X265_MIN(origmaxp1, numFrames); | |
786 | ||
787 | /* Where A and B are scenes: AAAAAABBBAAAAAA | |
788 | * If BBB is shorter than (maxp1-p0), it is detected as a flash | |
789 | * and not considered a scenecut. */ | |
790 | for (int cp1 = p1; cp1 <= maxp1; cp1++) | |
791 | { | |
792 | if (!scenecutInternal(frames, p0, cp1, false)) | |
793 | /* Any frame in between p0 and cur_p1 cannot be a real scenecut. */ | |
794 | for (int i = cp1; i > p0; i--) | |
795 | { | |
796 | frames[i]->bScenecut = false; | |
797 | } | |
798 | } | |
799 | ||
800 | /* Where A-F are scenes: AAAAABBCCDDEEFFFFFF | |
801 | * If each of BB ... EE are shorter than (maxp1-p0), they are | |
802 | * detected as flashes and not considered scenecuts. | |
803 | * Instead, the first F frame becomes a scenecut. | |
804 | * If the video ends before F, no frame becomes a scenecut. */ | |
805 | for (int cp0 = p0; cp0 <= maxp1; cp0++) | |
806 | { | |
807 | if (origmaxp1 > maxSearch || (cp0 < maxp1 && scenecutInternal(frames, cp0, maxp1, false))) | |
808 | /* If cur_p0 is the p0 of a scenecut, it cannot be the p1 of a scenecut. */ | |
809 | frames[cp0]->bScenecut = false; | |
810 | } | |
811 | } | |
812 | ||
813 | /* Ignore frames that are part of a flash, i.e. cannot be real scenecuts. */ | |
814 | if (!frames[p1]->bScenecut) | |
815 | return false; | |
816 | return scenecutInternal(frames, p0, p1, bRealScenecut); | |
817 | } | |
818 | ||
819 | bool Lookahead::scenecutInternal(Lowres **frames, int p0, int p1, bool bRealScenecut) | |
820 | { | |
821 | Lowres *frame = frames[p1]; | |
822 | ||
823 | m_est.estimateFrameCost(frames, p0, p1, p1, 0); | |
824 | ||
825 | int64_t icost = frame->costEst[0][0]; | |
826 | int64_t pcost = frame->costEst[p1 - p0][0]; | |
827 | int gopSize = frame->frameNum - m_lastKeyframe; | |
828 | float threshMax = (float)(m_param->scenecutThreshold / 100.0); | |
829 | ||
830 | /* magic numbers pulled out of thin air */ | |
831 | float threshMin = (float)(threshMax * 0.25); | |
832 | float bias; | |
833 | ||
834 | if (m_param->keyframeMin == m_param->keyframeMax) | |
835 | threshMin = threshMax; | |
836 | if (gopSize <= m_param->keyframeMin / 4) | |
837 | bias = threshMin / 4; | |
838 | else if (gopSize <= m_param->keyframeMin) | |
839 | bias = threshMin * gopSize / m_param->keyframeMin; | |
840 | else | |
841 | { | |
842 | bias = threshMin | |
843 | + (threshMax - threshMin) | |
844 | * (gopSize - m_param->keyframeMin) | |
845 | / (m_param->keyframeMax - m_param->keyframeMin); | |
846 | } | |
847 | ||
848 | bool res = pcost >= (1.0 - bias) * icost; | |
849 | if (res && bRealScenecut) | |
850 | { | |
851 | int imb = frame->intraMbs[p1 - p0]; | |
852 | int pmb = NUM_CUS - imb; | |
853 | x265_log(m_param, X265_LOG_DEBUG, "scene cut at %d Icost:%d Pcost:%d ratio:%.4f bias:%.4f gop:%d (imb:%d pmb:%d)\n", | |
854 | frame->frameNum, icost, pcost, 1. - (double)pcost / icost, bias, gopSize, imb, pmb); | |
855 | } | |
856 | return res; | |
857 | } | |
858 | ||
859 | void Lookahead::slicetypePath(Lowres **frames, int length, char(*best_paths)[X265_LOOKAHEAD_MAX + 1]) | |
860 | { | |
861 | char paths[2][X265_LOOKAHEAD_MAX + 1]; | |
862 | int num_paths = X265_MIN(m_param->bframes + 1, length); | |
863 | int64_t best_cost = 1LL << 62; | |
864 | int idx = 0; | |
865 | ||
866 | /* Iterate over all currently possible paths */ | |
867 | for (int path = 0; path < num_paths; path++) | |
868 | { | |
869 | /* Add suffixes to the current path */ | |
870 | int len = length - (path + 1); | |
871 | memcpy(paths[idx], best_paths[len % (X265_BFRAME_MAX + 1)], len); | |
872 | memset(paths[idx] + len, 'B', path); | |
873 | strcpy(paths[idx] + len + path, "P"); | |
874 | ||
875 | /* Calculate the actual cost of the current path */ | |
876 | int64_t cost = slicetypePathCost(frames, paths[idx], best_cost); | |
877 | if (cost < best_cost) | |
878 | { | |
879 | best_cost = cost; | |
880 | idx ^= 1; | |
881 | } | |
882 | } | |
883 | ||
884 | /* Store the best path. */ | |
885 | memcpy(best_paths[length % (X265_BFRAME_MAX + 1)], paths[idx ^ 1], length); | |
886 | } | |
887 | ||
888 | int64_t Lookahead::slicetypePathCost(Lowres **frames, char *path, int64_t threshold) | |
889 | { | |
890 | int64_t cost = 0; | |
891 | int loc = 1; | |
892 | int cur_p = 0; | |
893 | ||
894 | path--; /* Since the 1st path element is really the second frame */ | |
895 | while (path[loc]) | |
896 | { | |
897 | int next_p = loc; | |
898 | /* Find the location of the next P-frame. */ | |
899 | while (path[next_p] != 'P') | |
900 | { | |
901 | next_p++; | |
902 | } | |
903 | ||
904 | /* Add the cost of the P-frame found above */ | |
905 | cost += m_est.estimateFrameCost(frames, cur_p, next_p, next_p, 0); | |
906 | /* Early terminate if the cost we have found is larger than the best path cost so far */ | |
907 | if (cost > threshold) | |
908 | break; | |
909 | ||
910 | if (m_param->bBPyramid && next_p - cur_p > 2) | |
911 | { | |
912 | int middle = cur_p + (next_p - cur_p) / 2; | |
913 | cost += m_est.estimateFrameCost(frames, cur_p, next_p, middle, 0); | |
914 | for (int next_b = loc; next_b < middle && cost < threshold; next_b++) | |
915 | { | |
916 | cost += m_est.estimateFrameCost(frames, cur_p, middle, next_b, 0); | |
917 | } | |
918 | ||
919 | for (int next_b = middle + 1; next_b < next_p && cost < threshold; next_b++) | |
920 | { | |
921 | cost += m_est.estimateFrameCost(frames, middle, next_p, next_b, 0); | |
922 | } | |
923 | } | |
924 | else | |
925 | { | |
926 | for (int next_b = loc; next_b < next_p && cost < threshold; next_b++) | |
927 | { | |
928 | cost += m_est.estimateFrameCost(frames, cur_p, next_p, next_b, 0); | |
929 | } | |
930 | } | |
931 | ||
932 | loc = next_p + 1; | |
933 | cur_p = next_p; | |
934 | } | |
935 | ||
936 | return cost; | |
937 | } | |
938 | ||
939 | void Lookahead::cuTree(Lowres **frames, int numframes, bool bIntra) | |
940 | { | |
941 | int idx = !bIntra; | |
942 | int lastnonb, curnonb = 1; | |
943 | int bframes = 0; | |
944 | ||
945 | x265_emms(); | |
946 | double totalDuration = 0.0; | |
947 | for (int j = 0; j <= numframes; j++) | |
948 | totalDuration += (double)m_param->fpsDenom / m_param->fpsNum; | |
949 | ||
950 | double averageDuration = totalDuration / (numframes + 1); | |
951 | ||
952 | int i = numframes; | |
953 | int cuCount = m_widthInCU * m_heightInCU; | |
954 | ||
955 | if (bIntra) | |
956 | m_est.estimateFrameCost(frames, 0, 0, 0, 0); | |
957 | ||
958 | while (i > 0 && frames[i]->sliceType == X265_TYPE_B) | |
959 | i--; | |
960 | ||
961 | lastnonb = i; | |
962 | ||
963 | /* Lookaheadless MB-tree is not a theoretically distinct case; the same extrapolation could | |
964 | * be applied to the end of a lookahead buffer of any size. However, it's most needed when | |
965 | * lookahead=0, so that's what's currently implemented. */ | |
966 | if (!m_param->lookaheadDepth) | |
967 | { | |
968 | if (bIntra) | |
969 | { | |
970 | memset(frames[0]->propagateCost, 0, cuCount * sizeof(uint16_t)); | |
971 | memcpy(frames[0]->qpCuTreeOffset, frames[0]->qpAqOffset, cuCount * sizeof(double)); | |
972 | return; | |
973 | } | |
974 | std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost); | |
975 | memset(frames[0]->propagateCost, 0, cuCount * sizeof(uint16_t)); | |
976 | } | |
977 | else | |
978 | { | |
979 | if (lastnonb < idx) | |
980 | return; | |
981 | memset(frames[lastnonb]->propagateCost, 0, cuCount * sizeof(uint16_t)); | |
982 | } | |
983 | ||
984 | while (i-- > idx) | |
985 | { | |
986 | curnonb = i; | |
987 | while (frames[curnonb]->sliceType == X265_TYPE_B && curnonb > 0) | |
988 | curnonb--; | |
989 | ||
990 | if (curnonb < idx) | |
991 | break; | |
992 | ||
993 | m_est.estimateFrameCost(frames, curnonb, lastnonb, lastnonb, 0); | |
994 | memset(frames[curnonb]->propagateCost, 0, cuCount * sizeof(uint16_t)); | |
995 | bframes = lastnonb - curnonb - 1; | |
996 | if (m_param->bBPyramid && bframes > 1) | |
997 | { | |
998 | int middle = (bframes + 1) / 2 + curnonb; | |
999 | m_est.estimateFrameCost(frames, curnonb, lastnonb, middle, 0); | |
1000 | memset(frames[middle]->propagateCost, 0, cuCount * sizeof(uint16_t)); | |
1001 | while (i > curnonb) | |
1002 | { | |
1003 | int p0 = i > middle ? middle : curnonb; | |
1004 | int p1 = i < middle ? middle : lastnonb; | |
1005 | if (i != middle) | |
1006 | { | |
1007 | m_est.estimateFrameCost(frames, p0, p1, i, 0); | |
1008 | estimateCUPropagate(frames, averageDuration, p0, p1, i, 0); | |
1009 | } | |
1010 | i--; | |
1011 | } | |
1012 | ||
1013 | estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, middle, 1); | |
1014 | } | |
1015 | else | |
1016 | { | |
1017 | while (i > curnonb) | |
1018 | { | |
1019 | m_est.estimateFrameCost(frames, curnonb, lastnonb, i, 0); | |
1020 | estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, i, 0); | |
1021 | i--; | |
1022 | } | |
1023 | } | |
1024 | estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, lastnonb, 1); | |
1025 | lastnonb = curnonb; | |
1026 | } | |
1027 | ||
1028 | if (!m_param->lookaheadDepth) | |
1029 | { | |
1030 | m_est.estimateFrameCost(frames, 0, lastnonb, lastnonb, 0); | |
1031 | estimateCUPropagate(frames, averageDuration, 0, lastnonb, lastnonb, 1); | |
1032 | std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost); | |
1033 | } | |
1034 | ||
1035 | cuTreeFinish(frames[lastnonb], averageDuration, lastnonb); | |
1036 | if (m_param->bBPyramid && bframes > 1 && !m_param->rc.vbvBufferSize) | |
1037 | cuTreeFinish(frames[lastnonb + (bframes + 1) / 2], averageDuration, 0); | |
1038 | } | |
1039 | ||
1040 | void Lookahead::estimateCUPropagate(Lowres **frames, double averageDuration, int p0, int p1, int b, int referenced) | |
1041 | { | |
1042 | uint16_t *refCosts[2] = { frames[p0]->propagateCost, frames[p1]->propagateCost }; | |
1043 | int32_t distScaleFactor = (((b - p0) << 8) + ((p1 - p0) >> 1)) / (p1 - p0); | |
1044 | int32_t bipredWeight = m_param->bEnableWeightedBiPred ? 64 - (distScaleFactor >> 2) : 32; | |
1045 | MV *mvs[2] = { frames[b]->lowresMvs[0][b - p0 - 1], frames[b]->lowresMvs[1][p1 - b - 1] }; | |
1046 | int32_t bipredWeights[2] = { bipredWeight, 64 - bipredWeight }; | |
1047 | ||
1048 | memset(m_scratch, 0, m_widthInCU * sizeof(int)); | |
1049 | ||
1050 | uint16_t *propagateCost = frames[b]->propagateCost; | |
1051 | ||
1052 | x265_emms(); | |
1053 | double fpsFactor = CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) / CLIP_DURATION(averageDuration); | |
1054 | ||
1055 | /* For non-refferd frames the source costs are always zero, so just memset one row and re-use it. */ | |
1056 | if (!referenced) | |
1057 | memset(frames[b]->propagateCost, 0, m_widthInCU * sizeof(uint16_t)); | |
1058 | ||
1059 | int32_t StrideInCU = m_widthInCU; | |
1060 | for (uint16_t blocky = 0; blocky < m_heightInCU; blocky++) | |
1061 | { | |
1062 | int cuIndex = blocky * StrideInCU; | |
1063 | primitives.propagateCost(m_scratch, propagateCost, | |
1064 | frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex, | |
1065 | frames[b]->invQscaleFactor + cuIndex, &fpsFactor, m_widthInCU); | |
1066 | ||
1067 | if (referenced) | |
1068 | propagateCost += m_widthInCU; | |
1069 | for (uint16_t blockx = 0; blockx < m_widthInCU; blockx++, cuIndex++) | |
1070 | { | |
1071 | int32_t propagate_amount = m_scratch[blockx]; | |
1072 | /* Don't propagate for an intra block. */ | |
1073 | if (propagate_amount > 0) | |
1074 | { | |
1075 | /* Access width-2 bitfield. */ | |
1076 | int32_t lists_used = frames[b]->lowresCosts[b - p0][p1 - b][cuIndex] >> LOWRES_COST_SHIFT; | |
1077 | /* Follow the MVs to the previous frame(s). */ | |
1078 | for (uint16_t list = 0; list < 2; list++) | |
1079 | { | |
1080 | if ((lists_used >> list) & 1) | |
1081 | { | |
1082 | #define CLIP_ADD(s, x) (s) = (uint16_t)X265_MIN((s) + (x), (1 << 16) - 1) | |
1083 | int32_t listamount = propagate_amount; | |
1084 | /* Apply bipred weighting. */ | |
1085 | if (lists_used == 3) | |
1086 | listamount = (listamount * bipredWeights[list] + 32) >> 6; | |
1087 | ||
1088 | /* Early termination for simple case of mv0. */ | |
1089 | if (!mvs[list][cuIndex].word) | |
1090 | { | |
1091 | CLIP_ADD(refCosts[list][cuIndex], listamount); | |
1092 | continue; | |
1093 | } | |
1094 | ||
1095 | int32_t x = mvs[list][cuIndex].x; | |
1096 | int32_t y = mvs[list][cuIndex].y; | |
1097 | int32_t cux = (x >> 5) + blockx; | |
1098 | int32_t cuy = (y >> 5) + blocky; | |
1099 | int32_t idx0 = cux + cuy * StrideInCU; | |
1100 | int32_t idx1 = idx0 + 1; | |
1101 | int32_t idx2 = idx0 + StrideInCU; | |
1102 | int32_t idx3 = idx0 + StrideInCU + 1; | |
1103 | x &= 31; | |
1104 | y &= 31; | |
1105 | int32_t idx0weight = (32 - y) * (32 - x); | |
1106 | int32_t idx1weight = (32 - y) * x; | |
1107 | int32_t idx2weight = y * (32 - x); | |
1108 | int32_t idx3weight = y * x; | |
1109 | ||
1110 | /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't | |
1111 | * be counted. */ | |
1112 | if (cux < m_widthInCU - 1 && cuy < m_heightInCU - 1 && cux >= 0 && cuy >= 0) | |
1113 | { | |
1114 | CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10); | |
1115 | CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10); | |
1116 | CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10); | |
1117 | CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10); | |
1118 | } | |
1119 | else /* Check offsets individually */ | |
1120 | { | |
1121 | if (cux < m_widthInCU && cuy < m_heightInCU && cux >= 0 && cuy >= 0) | |
1122 | CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10); | |
1123 | if (cux + 1 < m_widthInCU && cuy < m_heightInCU && cux + 1 >= 0 && cuy >= 0) | |
1124 | CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10); | |
1125 | if (cux < m_widthInCU && cuy + 1 < m_heightInCU && cux >= 0 && cuy + 1 >= 0) | |
1126 | CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10); | |
1127 | if (cux + 1 < m_widthInCU && cuy + 1 < m_heightInCU && cux + 1 >= 0 && cuy + 1 >= 0) | |
1128 | CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10); | |
1129 | } | |
1130 | } | |
1131 | } | |
1132 | } | |
1133 | } | |
1134 | } | |
1135 | ||
1136 | if (m_param->rc.vbvBufferSize && m_param->lookaheadDepth && referenced) | |
1137 | cuTreeFinish(frames[b], averageDuration, b == p1 ? b - p0 : 0); | |
1138 | } | |
1139 | ||
1140 | void Lookahead::cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance) | |
1141 | { | |
1142 | int fpsFactor = (int)(CLIP_DURATION(averageDuration) / CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) * 256); | |
1143 | double weightdelta = 0.0; | |
1144 | ||
1145 | if (ref0Distance && frame->weightedCostDelta[ref0Distance - 1] > 0) | |
1146 | weightdelta = (1.0 - frame->weightedCostDelta[ref0Distance - 1]); | |
1147 | ||
1148 | /* Allow the strength to be adjusted via qcompress, since the two | |
1149 | * concepts are very similar. */ | |
1150 | ||
1151 | int cuCount = m_widthInCU * m_heightInCU; | |
1152 | double strength = 5.0 * (1.0 - m_param->rc.qCompress); | |
1153 | ||
1154 | for (int cuIndex = 0; cuIndex < cuCount; cuIndex++) | |
1155 | { | |
1156 | int intracost = (frame->intraCost[cuIndex] * frame->invQscaleFactor[cuIndex] + 128) >> 8; | |
1157 | if (intracost) | |
1158 | { | |
1159 | int propagateCost = (frame->propagateCost[cuIndex] * fpsFactor + 128) >> 8; | |
1160 | double log2_ratio = X265_LOG2(intracost + propagateCost) - X265_LOG2(intracost) + weightdelta; | |
1161 | frame->qpCuTreeOffset[cuIndex] = frame->qpAqOffset[cuIndex] - strength * log2_ratio; | |
1162 | } | |
1163 | } | |
1164 | } | |
1165 | ||
1166 | /* If MB-tree changes the quantizers, we need to recalculate the frame cost without | |
1167 | * re-running lookahead. */ | |
1168 | int64_t Lookahead::frameCostRecalculate(Lowres** frames, int p0, int p1, int b) | |
1169 | { | |
1170 | int64_t score = 0; | |
1171 | int *rowSatd = frames[b]->rowSatds[b - p0][p1 - b]; | |
1172 | double *qp_offset = (frames[b]->sliceType == X265_TYPE_B) ? frames[b]->qpAqOffset : frames[b]->qpCuTreeOffset; | |
1173 | ||
1174 | x265_emms(); | |
1175 | for (int cuy = m_heightInCU - 1; cuy >= 0; cuy--) | |
1176 | { | |
1177 | rowSatd[cuy] = 0; | |
1178 | for (int cux = m_widthInCU - 1; cux >= 0; cux--) | |
1179 | { | |
1180 | int cuxy = cux + cuy * m_widthInCU; | |
1181 | int cuCost = frames[b]->lowresCosts[b - p0][p1 - b][cuxy] & LOWRES_COST_MASK; | |
1182 | double qp_adj = qp_offset[cuxy]; | |
1183 | cuCost = (cuCost * x265_exp2fix8(qp_adj) + 128) >> 8; | |
1184 | rowSatd[cuy] += cuCost; | |
1185 | if ((cuy > 0 && cuy < m_heightInCU - 1 && | |
1186 | cux > 0 && cux < m_widthInCU - 1) || | |
1187 | m_widthInCU <= 2 || m_heightInCU <= 2) | |
1188 | { | |
1189 | score += cuCost; | |
1190 | } | |
1191 | } | |
1192 | } | |
1193 | ||
1194 | return score; | |
1195 | } | |
1196 | ||
1197 | CostEstimate::CostEstimate(ThreadPool *p) | |
1198 | : WaveFront(p) | |
1199 | { | |
1200 | m_param = NULL; | |
1201 | m_curframes = NULL; | |
1202 | m_wbuffer[0] = m_wbuffer[1] = m_wbuffer[2] = m_wbuffer[3] = 0; | |
1203 | m_rows = NULL; | |
1204 | m_paddedLines = m_widthInCU = m_heightInCU = 0; | |
1205 | m_bDoSearch[0] = m_bDoSearch[1] = false; | |
1206 | m_curb = m_curp0 = m_curp1 = 0; | |
1207 | m_bFrameCompleted = false; | |
1208 | } | |
1209 | ||
1210 | CostEstimate::~CostEstimate() | |
1211 | { | |
1212 | for (int i = 0; i < 4; i++) | |
1213 | { | |
1214 | x265_free(m_wbuffer[i]); | |
1215 | } | |
1216 | ||
1217 | delete[] m_rows; | |
1218 | } | |
1219 | ||
1220 | void CostEstimate::init(x265_param *_param, Frame *curFrame) | |
1221 | { | |
1222 | m_param = _param; | |
1223 | m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; | |
1224 | m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; | |
1225 | ||
1226 | m_rows = new EstimateRow[m_heightInCU]; | |
1227 | for (int i = 0; i < m_heightInCU; i++) | |
1228 | { | |
1229 | m_rows[i].m_widthInCU = m_widthInCU; | |
1230 | m_rows[i].m_heightInCU = m_heightInCU; | |
1231 | m_rows[i].m_param = m_param; | |
1232 | } | |
1233 | ||
1234 | if (WaveFront::init(m_heightInCU)) | |
1235 | WaveFront::enableAllRows(); | |
1236 | else | |
1237 | m_pool = NULL; | |
1238 | ||
1239 | if (m_param->bEnableWeightedPred) | |
1240 | { | |
1241 | PicYuv *orig = curFrame->m_origPicYuv; | |
1242 | m_paddedLines = curFrame->m_lowres.lines + 2 * orig->m_lumaMarginY; | |
1243 | intptr_t padoffset = curFrame->m_lowres.lumaStride * orig->m_lumaMarginY + orig->m_lumaMarginX; | |
1244 | ||
1245 | /* allocate weighted lowres buffers */ | |
1246 | for (int i = 0; i < 4; i++) | |
1247 | { | |
1248 | m_wbuffer[i] = (pixel*)x265_malloc(sizeof(pixel) * (curFrame->m_lowres.lumaStride * m_paddedLines)); | |
1249 | m_weightedRef.lowresPlane[i] = m_wbuffer[i] + padoffset; | |
1250 | } | |
1251 | ||
1252 | m_weightedRef.fpelPlane = m_weightedRef.lowresPlane[0]; | |
1253 | m_weightedRef.lumaStride = curFrame->m_lowres.lumaStride; | |
1254 | m_weightedRef.isLowres = true; | |
1255 | m_weightedRef.isWeighted = false; | |
1256 | } | |
1257 | } | |
1258 | ||
1259 | int64_t CostEstimate::estimateFrameCost(Lowres **frames, int p0, int p1, int b, bool bIntraPenalty) | |
1260 | { | |
1261 | int64_t score = 0; | |
1262 | Lowres *fenc = frames[b]; | |
1263 | ||
1264 | if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1 - b][0] != -1) | |
1265 | score = fenc->costEst[b - p0][p1 - b]; | |
1266 | else | |
1267 | { | |
1268 | m_weightedRef.isWeighted = false; | |
1269 | if (m_param->bEnableWeightedPred && b == p1 && b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF) | |
1270 | { | |
1271 | if (!fenc->bIntraCalculated) | |
1272 | estimateFrameCost(frames, b, b, b, 0); | |
1273 | weightsAnalyse(frames, b, p0); | |
1274 | } | |
1275 | ||
1276 | /* For each list, check to see whether we have lowres motion-searched this reference */ | |
1277 | m_bDoSearch[0] = b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF; | |
1278 | m_bDoSearch[1] = b != p1 && fenc->lowresMvs[1][p1 - b - 1][0].x == 0x7FFF; | |
1279 | ||
1280 | if (m_bDoSearch[0]) fenc->lowresMvs[0][b - p0 - 1][0].x = 0; | |
1281 | if (m_bDoSearch[1]) fenc->lowresMvs[1][p1 - b - 1][0].x = 0; | |
1282 | ||
1283 | m_curb = b; | |
1284 | m_curp0 = p0; | |
1285 | m_curp1 = p1; | |
1286 | m_curframes = frames; | |
1287 | fenc->costEst[b - p0][p1 - b] = 0; | |
1288 | fenc->costEstAq[b - p0][p1 - b] = 0; | |
1289 | ||
1290 | for (int i = 0; i < m_heightInCU; i++) | |
1291 | { | |
1292 | m_rows[i].init(); | |
1293 | m_rows[i].m_me.setSourcePlane(fenc->lowresPlane[0], fenc->lumaStride); | |
1294 | if (!fenc->bIntraCalculated) | |
1295 | fenc->rowSatds[0][0][i] = 0; | |
1296 | fenc->rowSatds[b - p0][p1 - b][i] = 0; | |
1297 | } | |
1298 | ||
1299 | m_bFrameCompleted = false; | |
1300 | ||
1301 | if (m_pool) | |
1302 | { | |
1303 | WaveFront::enqueue(); | |
1304 | ||
1305 | // enableAllRows must be already called | |
1306 | enqueueRow(0); | |
1307 | while (!m_bFrameCompleted) | |
1308 | WaveFront::findJob(-1); | |
1309 | ||
1310 | WaveFront::dequeue(); | |
1311 | } | |
1312 | else | |
1313 | { | |
1314 | for (int row = 0; row < m_heightInCU; row++) | |
1315 | processRow(row, -1); | |
1316 | ||
1317 | x265_emms(); | |
1318 | } | |
1319 | ||
1320 | // Accumulate cost from each row | |
1321 | for (int row = 0; row < m_heightInCU; row++) | |
1322 | { | |
1323 | score += m_rows[row].m_costEst; | |
1324 | fenc->costEst[0][0] += m_rows[row].m_costIntra; | |
1325 | if (m_param->rc.aqMode) | |
1326 | { | |
1327 | fenc->costEstAq[0][0] += m_rows[row].m_costIntraAq; | |
1328 | fenc->costEstAq[b - p0][p1 - b] += m_rows[row].m_costEstAq; | |
1329 | } | |
1330 | fenc->intraMbs[b - p0] += m_rows[row].m_intraMbs; | |
1331 | } | |
1332 | ||
1333 | fenc->bIntraCalculated = true; | |
1334 | ||
1335 | if (b != p1) | |
1336 | score = (uint64_t)score * 100 / (130 + m_param->bFrameBias); | |
1337 | if (b != p0 || b != p1) //Not Intra cost | |
1338 | fenc->costEst[b - p0][p1 - b] = score; | |
1339 | } | |
1340 | ||
1341 | if (bIntraPenalty) | |
1342 | { | |
1343 | // arbitrary penalty for I-blocks after B-frames | |
1344 | int ncu = NUM_CUS; | |
1345 | score += (uint64_t)score * fenc->intraMbs[b - p0] / (ncu * 8); | |
1346 | } | |
1347 | return score; | |
1348 | } | |
1349 | ||
1350 | uint32_t CostEstimate::weightCostLuma(Lowres **frames, int b, int p0, WeightParam *wp) | |
1351 | { | |
1352 | Lowres *fenc = frames[b]; | |
1353 | Lowres *ref = frames[p0]; | |
1354 | pixel *src = ref->fpelPlane; | |
1355 | intptr_t stride = fenc->lumaStride; | |
1356 | ||
1357 | if (wp) | |
1358 | { | |
1359 | int offset = wp->inputOffset << (X265_DEPTH - 8); | |
1360 | int scale = wp->inputWeight; | |
1361 | int denom = wp->log2WeightDenom; | |
1362 | int round = denom ? 1 << (denom - 1) : 0; | |
1363 | int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth | |
1364 | int widthHeight = (int)stride; | |
1365 | ||
1366 | primitives.weight_pp(ref->buffer[0], m_wbuffer[0], stride, widthHeight, m_paddedLines, | |
1367 | scale, round << correction, denom + correction, offset); | |
1368 | src = m_weightedRef.fpelPlane; | |
1369 | } | |
1370 | ||
1371 | uint32_t cost = 0; | |
1372 | intptr_t pixoff = 0; | |
1373 | int mb = 0; | |
1374 | ||
1375 | for (int y = 0; y < fenc->lines; y += 8, pixoff = y * stride) | |
1376 | { | |
1377 | for (int x = 0; x < fenc->width; x += 8, mb++, pixoff += 8) | |
1378 | { | |
1379 | int satd = primitives.satd[LUMA_8x8](src + pixoff, stride, fenc->fpelPlane + pixoff, stride); | |
1380 | cost += X265_MIN(satd, fenc->intraCost[mb]); | |
1381 | } | |
1382 | } | |
1383 | ||
1384 | return cost; | |
1385 | } | |
1386 | ||
1387 | void CostEstimate::weightsAnalyse(Lowres **frames, int b, int p0) | |
1388 | { | |
1389 | static const float epsilon = 1.f / 128.f; | |
1390 | Lowres *fenc, *ref; | |
1391 | ||
1392 | fenc = frames[b]; | |
1393 | ref = frames[p0]; | |
1394 | int deltaIndex = fenc->frameNum - ref->frameNum; | |
1395 | ||
1396 | /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */ | |
1397 | float guessScale, fencMean, refMean; | |
1398 | x265_emms(); | |
1399 | if (fenc->wp_ssd[0] && ref->wp_ssd[0]) | |
1400 | guessScale = sqrtf((float)fenc->wp_ssd[0] / ref->wp_ssd[0]); | |
1401 | else | |
1402 | guessScale = 1.0f; | |
1403 | fencMean = (float)fenc->wp_sum[0] / (fenc->lines * fenc->width) / (1 << (X265_DEPTH - 8)); | |
1404 | refMean = (float)ref->wp_sum[0] / (fenc->lines * fenc->width) / (1 << (X265_DEPTH - 8)); | |
1405 | ||
1406 | /* Early termination */ | |
1407 | if (fabsf(refMean - fencMean) < 0.5f && fabsf(1.f - guessScale) < epsilon) | |
1408 | return; | |
1409 | ||
1410 | int minoff = 0, minscale, mindenom; | |
1411 | unsigned int minscore = 0, origscore = 1; | |
1412 | int found = 0; | |
1413 | ||
1414 | m_w.setFromWeightAndOffset((int)(guessScale * 128 + 0.5f), 0, 7, true); | |
1415 | mindenom = m_w.log2WeightDenom; | |
1416 | minscale = m_w.inputWeight; | |
1417 | ||
1418 | origscore = minscore = weightCostLuma(frames, b, p0, NULL); | |
1419 | ||
1420 | if (!minscore) | |
1421 | return; | |
1422 | ||
1423 | unsigned int s = 0; | |
1424 | int curScale = minscale; | |
1425 | int curOffset = (int)(fencMean - refMean * curScale / (1 << mindenom) + 0.5f); | |
1426 | if (curOffset < -128 || curOffset > 127) | |
1427 | { | |
1428 | /* Rescale considering the constraints on curOffset. We do it in this order | |
1429 | * because scale has a much wider range than offset (because of denom), so | |
1430 | * it should almost never need to be clamped. */ | |
1431 | curOffset = Clip3(-128, 127, curOffset); | |
1432 | curScale = (int)((1 << mindenom) * (fencMean - curOffset) / refMean + 0.5f); | |
1433 | curScale = Clip3(0, 127, curScale); | |
1434 | } | |
1435 | SET_WEIGHT(m_w, 1, curScale, mindenom, curOffset); | |
1436 | s = weightCostLuma(frames, b, p0, &m_w); | |
1437 | COPY4_IF_LT(minscore, s, minscale, curScale, minoff, curOffset, found, 1); | |
1438 | ||
1439 | /* Use a smaller denominator if possible */ | |
1440 | while (mindenom > 0 && !(minscale & 1)) | |
1441 | { | |
1442 | mindenom--; | |
1443 | minscale >>= 1; | |
1444 | } | |
1445 | ||
1446 | if (!found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f) | |
1447 | return; | |
1448 | else | |
1449 | { | |
1450 | SET_WEIGHT(m_w, 1, minscale, mindenom, minoff); | |
1451 | // set weighted delta cost | |
1452 | fenc->weightedCostDelta[deltaIndex] = minscore / origscore; | |
1453 | ||
1454 | int offset = m_w.inputOffset << (X265_DEPTH - 8); | |
1455 | int scale = m_w.inputWeight; | |
1456 | int denom = m_w.log2WeightDenom; | |
1457 | int round = denom ? 1 << (denom - 1) : 0; | |
1458 | int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth | |
1459 | intptr_t stride = ref->lumaStride; | |
1460 | int widthHeight = (int)stride; | |
1461 | ||
1462 | for (int i = 0; i < 4; i++) | |
1463 | primitives.weight_pp(ref->buffer[i], m_wbuffer[i], stride, widthHeight, m_paddedLines, | |
1464 | scale, round << correction, denom + correction, offset); | |
1465 | ||
1466 | m_weightedRef.isWeighted = true; | |
1467 | } | |
1468 | } | |
1469 | ||
1470 | void CostEstimate::processRow(int row, int /*threadId*/) | |
1471 | { | |
1472 | int realrow = m_heightInCU - 1 - row; | |
1473 | Lowres **frames = m_curframes; | |
1474 | ReferencePlanes *wfref0 = m_weightedRef.isWeighted ? &m_weightedRef : frames[m_curp0]; | |
1475 | ||
1476 | /* Lowres lookahead goes backwards because the MVs are used as | |
1477 | * predictors in the main encode. This considerably improves MV | |
1478 | * prediction overall. */ | |
1479 | for (int i = m_widthInCU - 1 - m_rows[row].m_completed; i >= 0; i--) | |
1480 | { | |
1481 | // TODO: use lowres MVs as motion candidates in full-res search | |
1482 | m_rows[row].estimateCUCost(frames, wfref0, i, realrow, m_curp0, m_curp1, m_curb, m_bDoSearch); | |
1483 | m_rows[row].m_completed++; | |
1484 | ||
1485 | if (m_rows[row].m_completed >= 2 && row < m_heightInCU - 1) | |
1486 | { | |
1487 | ScopedLock below(m_rows[row + 1].m_lock); | |
1488 | if (m_rows[row + 1].m_active == false && | |
1489 | m_rows[row + 1].m_completed + 2 <= m_rows[row].m_completed) | |
1490 | { | |
1491 | m_rows[row + 1].m_active = true; | |
1492 | enqueueRow(row + 1); | |
1493 | } | |
1494 | } | |
1495 | ||
1496 | ScopedLock self(m_rows[row].m_lock); | |
1497 | if (row > 0 && (int32_t)m_rows[row].m_completed < m_widthInCU - 1 && | |
1498 | m_rows[row - 1].m_completed < m_rows[row].m_completed + 2) | |
1499 | { | |
1500 | m_rows[row].m_active = false; | |
1501 | return; | |
1502 | } | |
1503 | } | |
1504 | ||
1505 | if (row == m_heightInCU - 1) | |
1506 | m_bFrameCompleted = true; | |
1507 | } | |
1508 | ||
1509 | void EstimateRow::init() | |
1510 | { | |
1511 | m_costEst = 0; | |
1512 | m_costEstAq = 0; | |
1513 | m_costIntra = 0; | |
1514 | m_costIntraAq = 0; | |
1515 | m_intraMbs = 0; | |
1516 | m_active = false; | |
1517 | m_completed = 0; | |
1518 | } | |
1519 | ||
1520 | void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2]) | |
1521 | { | |
1522 | Lowres *fref1 = frames[p1]; | |
1523 | Lowres *fenc = frames[b]; | |
1524 | ||
1525 | const int bBidir = (b < p1); | |
1526 | const int cuXY = cux + cuy * m_widthInCU; | |
1527 | const int cuSize = X265_LOWRES_CU_SIZE; | |
1528 | const intptr_t pelOffset = cuSize * cux + cuSize * cuy * fenc->lumaStride; | |
1529 | ||
1530 | // should this CU's cost contribute to the frame cost? | |
1531 | const bool bFrameScoreCU = (cux > 0 && cux < m_widthInCU - 1 && | |
1532 | cuy > 0 && cuy < m_heightInCU - 1) || m_widthInCU <= 2 || m_heightInCU <= 2; | |
1533 | ||
1534 | m_me.setSourcePU(pelOffset, cuSize, cuSize); | |
1535 | ||
1536 | /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */ | |
1537 | int lowresPenalty = 4; | |
1538 | ||
1539 | MV(*fenc_mvs[2]) = { &fenc->lowresMvs[0][b - p0 - 1][cuXY], | |
1540 | &fenc->lowresMvs[1][p1 - b - 1][cuXY] }; | |
1541 | int(*fenc_costs[2]) = { &fenc->lowresMvCosts[0][b - p0 - 1][cuXY], | |
1542 | &fenc->lowresMvCosts[1][p1 - b - 1][cuXY] }; | |
1543 | ||
1544 | MV mvmin, mvmax; | |
1545 | int bcost = m_me.COST_MAX; | |
1546 | int listused = 0; | |
1547 | ||
1548 | // establish search bounds that don't cross extended frame boundaries | |
1549 | mvmin.x = (int16_t)(-cux * cuSize - 8); | |
1550 | mvmin.y = (int16_t)(-cuy * cuSize - 8); | |
1551 | mvmax.x = (int16_t)((m_widthInCU - cux - 1) * cuSize + 8); | |
1552 | mvmax.y = (int16_t)((m_heightInCU - cuy - 1) * cuSize + 8); | |
1553 | ||
1554 | if (p0 != p1) | |
1555 | { | |
1556 | for (int i = 0; i < 1 + bBidir; i++) | |
1557 | { | |
1558 | if (!bDoSearch[i]) | |
1559 | { | |
1560 | /* Use previously calculated cost */ | |
1561 | COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1); | |
1562 | continue; | |
1563 | } | |
1564 | int numc = 0; | |
1565 | MV mvc[4], mvp; | |
1566 | MV *fenc_mv = fenc_mvs[i]; | |
1567 | ||
1568 | /* Reverse-order MV prediction. */ | |
1569 | mvc[0] = 0; | |
1570 | mvc[2] = 0; | |
1571 | #define MVC(mv) mvc[numc++] = mv; | |
1572 | if (cux < m_widthInCU - 1) | |
1573 | MVC(fenc_mv[1]); | |
1574 | if (cuy < m_heightInCU - 1) | |
1575 | { | |
1576 | MVC(fenc_mv[m_widthInCU]); | |
1577 | if (cux > 0) | |
1578 | MVC(fenc_mv[m_widthInCU - 1]); | |
1579 | if (cux < m_widthInCU - 1) | |
1580 | MVC(fenc_mv[m_widthInCU + 1]); | |
1581 | } | |
1582 | #undef MVC | |
1583 | if (numc <= 1) | |
1584 | mvp = mvc[0]; | |
1585 | else | |
1586 | { | |
1587 | median_mv(mvp, mvc[0], mvc[1], mvc[2]); | |
1588 | } | |
1589 | ||
1590 | *fenc_costs[i] = m_me.motionEstimate(i ? fref1 : wfref0, mvmin, mvmax, mvp, numc, mvc, m_merange, *fenc_mvs[i]); | |
1591 | COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1); | |
1592 | } | |
1593 | if (bBidir) | |
1594 | { | |
1595 | pixel subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE], subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]; | |
1596 | intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE; | |
1597 | pixel *src0 = wfref0->lowresMC(pelOffset, *fenc_mvs[0], subpelbuf0, stride0); | |
1598 | pixel *src1 = fref1->lowresMC(pelOffset, *fenc_mvs[1], subpelbuf1, stride1); | |
1599 | ||
1600 | pixel ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]; | |
1601 | primitives.pixelavg_pp[LUMA_8x8](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32); | |
1602 | int bicost = primitives.satd[LUMA_8x8](fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE); | |
1603 | COPY2_IF_LT(bcost, bicost, listused, 3); | |
1604 | ||
1605 | // Try 0,0 candidates | |
1606 | src0 = wfref0->lowresPlane[0] + pelOffset; | |
1607 | src1 = fref1->lowresPlane[0] + pelOffset; | |
1608 | primitives.pixelavg_pp[LUMA_8x8](ref, X265_LOWRES_CU_SIZE, src0, wfref0->lumaStride, src1, fref1->lumaStride, 32); | |
1609 | bicost = primitives.satd[LUMA_8x8](fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE); | |
1610 | COPY2_IF_LT(bcost, bicost, listused, 3); | |
1611 | } | |
1612 | } | |
1613 | if (!fenc->bIntraCalculated) | |
1614 | { | |
1615 | const int sizeIdx = X265_LOWRES_CU_BITS - 2; // partition size | |
1616 | ||
1617 | pixel _above0[X265_LOWRES_CU_SIZE * 4 + 1], *const above0 = _above0 + 2 * X265_LOWRES_CU_SIZE; | |
1618 | pixel _above1[X265_LOWRES_CU_SIZE * 4 + 1], *const above1 = _above1 + 2 * X265_LOWRES_CU_SIZE; | |
1619 | pixel _left0[X265_LOWRES_CU_SIZE * 4 + 1], *const left0 = _left0 + 2 * X265_LOWRES_CU_SIZE; | |
1620 | pixel _left1[X265_LOWRES_CU_SIZE * 4 + 1], *const left1 = _left1 + 2 * X265_LOWRES_CU_SIZE; | |
1621 | ||
1622 | pixel *pix_cur = fenc->lowresPlane[0] + pelOffset; | |
1623 | ||
1624 | // Copy Above | |
1625 | memcpy(above0, pix_cur - 1 - fenc->lumaStride, (cuSize + 1) * sizeof(pixel)); | |
1626 | ||
1627 | // Copy Left | |
1628 | for (int i = 0; i < cuSize + 1; i++) | |
1629 | { | |
1630 | left0[i] = pix_cur[-1 - fenc->lumaStride + i * fenc->lumaStride]; | |
1631 | } | |
1632 | ||
1633 | for (int i = 0; i < cuSize; i++) | |
1634 | { | |
1635 | above0[cuSize + i + 1] = above0[cuSize]; | |
1636 | left0[cuSize + i + 1] = left0[cuSize]; | |
1637 | } | |
1638 | ||
1639 | // filtering with [1 2 1] | |
1640 | // assume getUseStrongIntraSmoothing() is disabled | |
1641 | above1[0] = above0[0]; | |
1642 | above1[2 * cuSize] = above0[2 * cuSize]; | |
1643 | left1[0] = left0[0]; | |
1644 | left1[2 * cuSize] = left0[2 * cuSize]; | |
1645 | for (int i = 1; i < 2 * cuSize; i++) | |
1646 | { | |
1647 | above1[i] = (above0[i - 1] + 2 * above0[i] + above0[i + 1] + 2) >> 2; | |
1648 | left1[i] = (left0[i - 1] + 2 * left0[i] + left0[i + 1] + 2) >> 2; | |
1649 | } | |
1650 | ||
1651 | int predsize = cuSize * cuSize; | |
1652 | ||
1653 | // generate 35 intra predictions into m_predictions | |
1654 | pixelcmp_t satd = primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)]; | |
1655 | int icost = m_me.COST_MAX, cost; | |
1656 | primitives.intra_pred[DC_IDX][sizeIdx](m_predictions, cuSize, left0, above0, 0, (cuSize <= 16)); | |
1657 | cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize); | |
1658 | if (cost < icost) | |
1659 | icost = cost; | |
1660 | pixel *above = (cuSize >= 8) ? above1 : above0; | |
1661 | pixel *left = (cuSize >= 8) ? left1 : left0; | |
1662 | primitives.intra_pred[PLANAR_IDX][sizeIdx](m_predictions, cuSize, left, above, 0, 0); | |
1663 | cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize); | |
1664 | if (cost < icost) | |
1665 | icost = cost; | |
1666 | primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 * predsize, above0, left0, above1, left1, (cuSize <= 16)); | |
1667 | ||
1668 | // calculate satd costs, keep least cost | |
1669 | ALIGN_VAR_32(pixel, buf_trans[32 * 32]); | |
1670 | primitives.transpose[sizeIdx](buf_trans, m_me.fenc, FENC_STRIDE); | |
1671 | ||
1672 | int acost = m_me.COST_MAX; | |
1673 | uint32_t mode, lowmode = 4; | |
1674 | for (mode = 5; mode < 35; mode += 5) | |
1675 | { | |
1676 | if (mode < 18) | |
1677 | cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize); | |
1678 | else | |
1679 | cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize); | |
1680 | COPY2_IF_LT(acost, cost, lowmode, mode); | |
1681 | } | |
1682 | for (uint32_t dist = 2; dist >= 1; dist--) | |
1683 | { | |
1684 | mode = lowmode - dist; | |
1685 | if (mode < 18) | |
1686 | cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize); | |
1687 | else | |
1688 | cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize); | |
1689 | COPY2_IF_LT(acost, cost, lowmode, mode); | |
1690 | ||
1691 | mode = lowmode + dist; | |
1692 | if (mode < 18) | |
1693 | cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize); | |
1694 | else | |
1695 | cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize); | |
1696 | COPY2_IF_LT(acost, cost, lowmode, mode); | |
1697 | } | |
1698 | if (acost < icost) | |
1699 | icost = acost; | |
1700 | ||
1701 | const int intraPenalty = 5 * m_lookAheadLambda; | |
1702 | icost += intraPenalty + lowresPenalty; /* estimate intra signal cost */ | |
1703 | fenc->intraCost[cuXY] = icost; | |
1704 | int icostAq = icost; | |
1705 | if (bFrameScoreCU) | |
1706 | { | |
1707 | m_costIntra += icost; | |
1708 | if (fenc->invQscaleFactor) | |
1709 | { | |
1710 | icostAq = (icost * fenc->invQscaleFactor[cuXY] + 128) >> 8; | |
1711 | m_costIntraAq += icostAq; | |
1712 | } | |
1713 | } | |
1714 | fenc->rowSatds[0][0][cuy] += icostAq; | |
1715 | } | |
1716 | bcost += lowresPenalty; | |
1717 | if (!bBidir) | |
1718 | { | |
1719 | if (fenc->intraCost[cuXY] < bcost) | |
1720 | { | |
1721 | if (bFrameScoreCU) m_intraMbs++; | |
1722 | bcost = fenc->intraCost[cuXY]; | |
1723 | listused = 0; | |
1724 | } | |
1725 | } | |
1726 | ||
1727 | /* For I frames these costs were accumulated earlier */ | |
1728 | if (p0 != p1) | |
1729 | { | |
1730 | int bcostAq = bcost; | |
1731 | if (bFrameScoreCU) | |
1732 | { | |
1733 | m_costEst += bcost; | |
1734 | if (fenc->invQscaleFactor) | |
1735 | { | |
1736 | bcostAq = (bcost * fenc->invQscaleFactor[cuXY] + 128) >> 8; | |
1737 | m_costEstAq += bcostAq; | |
1738 | } | |
1739 | } | |
1740 | fenc->rowSatds[b - p0][p1 - b][cuy] += bcostAq; | |
1741 | } | |
1742 | fenc->lowresCosts[b - p0][p1 - b][cuXY] = (uint16_t)(X265_MIN(bcost, LOWRES_COST_MASK) | (listused << LOWRES_COST_SHIFT)); | |
1743 | } |