Commit | Line | Data |
---|---|---|
72b9787e JB |
1 | /***************************************************************************** |
2 | * Copyright (C) 2013 x265 project | |
3 | * | |
4 | * Authors: Gopu Govindaswamy <gopu@multicorewareinc.com> | |
5 | * Steve Borho <steve@borho.org> | |
6 | * | |
7 | * This program is free software; you can redistribute it and/or modify | |
8 | * it under the terms of the GNU General Public License as published by | |
9 | * the Free Software Foundation; either version 2 of the License, or | |
10 | * (at your option) any later version. | |
11 | * | |
12 | * This program is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | * GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with this program; if not, write to the Free Software | |
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
20 | * | |
21 | * This program is also available under a commercial proprietary license. | |
22 | * For more information, contact us at license @ x265.com. | |
23 | *****************************************************************************/ | |
24 | ||
25 | #include "common.h" | |
26 | #include "frame.h" | |
27 | #include "framedata.h" | |
28 | #include "picyuv.h" | |
29 | #include "primitives.h" | |
30 | #include "lowres.h" | |
31 | #include "mv.h" | |
32 | ||
33 | #include "slicetype.h" | |
34 | #include "motion.h" | |
35 | #include "ratecontrol.h" | |
36 | ||
37 | #define NUM_CUS (m_widthInCU > 2 && m_heightInCU > 2 ? (m_widthInCU - 2) * (m_heightInCU - 2) : m_widthInCU * m_heightInCU) | |
38 | ||
39 | using namespace x265; | |
40 | ||
/* Returns the median (middle value) of three 16-bit integers.
 *
 * Implemented with plain comparisons so the result does not depend on
 * arithmetic right shifts of negative values or on wrapping narrowing
 * conversions, both of which are implementation-defined before C++20. */
static inline int16_t median(int16_t a, int16_t b, int16_t c)
{
    const int16_t lo = a < b ? a : b;          /* min(a, b) */
    const int16_t hi = a < b ? b : a;          /* max(a, b) */
    const int16_t capped = hi < c ? hi : c;    /* min(max(a, b), c) */

    /* median == max(min(a, b), min(max(a, b), c)) */
    return lo > capped ? lo : capped;
}
51 | ||
52 | static inline void median_mv(MV &dst, MV a, MV b, MV c) | |
53 | { | |
54 | dst.x = median(a.x, b.x, c.x); | |
55 | dst.y = median(a.y, b.y, c.y); | |
56 | } | |
57 | ||
58 | Lookahead::Lookahead(x265_param *param, ThreadPool* pool) | |
59 | : JobProvider(pool) | |
60 | , m_est(pool) | |
61 | { | |
62 | m_bReady = 0; | |
63 | m_param = param; | |
64 | m_lastKeyframe = -m_param->keyframeMax; | |
65 | m_lastNonB = NULL; | |
66 | m_bFilling = true; | |
67 | m_bFlushed = false; | |
68 | m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; | |
69 | m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; | |
70 | m_scratch = (int*)x265_malloc(m_widthInCU * sizeof(int)); | |
71 | memset(m_histogram, 0, sizeof(m_histogram)); | |
72 | } | |
73 | ||
74 | Lookahead::~Lookahead() { } | |
75 | ||
76 | void Lookahead::init() | |
77 | { | |
78 | if (m_pool && m_pool->getThreadCount() >= 4 && | |
79 | ((m_param->bFrameAdaptive && m_param->bframes) || | |
80 | m_param->rc.cuTree || m_param->scenecutThreshold || | |
81 | (m_param->lookaheadDepth && m_param->rc.vbvBufferSize))) | |
82 | m_pool = m_pool; /* allow use of worker thread */ | |
83 | else | |
84 | m_pool = NULL; /* disable use of worker thread */ | |
85 | } | |
86 | ||
87 | void Lookahead::destroy() | |
88 | { | |
89 | if (m_pool) | |
90 | // flush will dequeue, if it is necessary | |
91 | JobProvider::flush(); | |
92 | ||
93 | // these two queues will be empty unless the encode was aborted | |
94 | while (!m_inputQueue.empty()) | |
95 | { | |
96 | Frame* curFrame = m_inputQueue.popFront(); | |
97 | curFrame->destroy(); | |
98 | delete curFrame; | |
99 | } | |
100 | ||
101 | while (!m_outputQueue.empty()) | |
102 | { | |
103 | Frame* curFrame = m_outputQueue.popFront(); | |
104 | curFrame->destroy(); | |
105 | delete curFrame; | |
106 | } | |
107 | ||
108 | x265_free(m_scratch); | |
109 | } | |
110 | ||
111 | /* Called by API thread */ | |
112 | void Lookahead::addPicture(Frame *curFrame, int sliceType) | |
113 | { | |
b53f7c52 | 114 | PicYuv *orig = curFrame->m_fencPic; |
72b9787e JB |
115 | |
116 | curFrame->m_lowres.init(orig, curFrame->m_poc, sliceType); | |
117 | ||
118 | m_inputQueueLock.acquire(); | |
119 | m_inputQueue.pushBack(*curFrame); | |
120 | ||
121 | if (m_inputQueue.size() >= m_param->lookaheadDepth) | |
122 | { | |
123 | /* when queue fills the first time, run slicetypeDecide synchronously, | |
124 | * since the encoder will always be blocked here */ | |
125 | if (m_pool && !m_bFilling) | |
126 | { | |
127 | m_inputQueueLock.release(); | |
128 | m_bReady = 1; | |
129 | m_pool->pokeIdleThread(); | |
130 | } | |
131 | else | |
132 | slicetypeDecide(); | |
133 | ||
134 | if (m_bFilling && m_pool) | |
135 | JobProvider::enqueue(); | |
136 | m_bFilling = false; | |
137 | } | |
138 | else | |
139 | m_inputQueueLock.release(); | |
140 | } | |
141 | ||
142 | /* Called by API thread */ | |
143 | void Lookahead::flush() | |
144 | { | |
145 | /* just in case the input queue is never allowed to fill */ | |
146 | m_bFilling = false; | |
147 | ||
148 | /* flush synchronously */ | |
149 | m_inputQueueLock.acquire(); | |
150 | if (!m_inputQueue.empty()) | |
151 | { | |
152 | slicetypeDecide(); | |
153 | } | |
154 | else | |
155 | m_inputQueueLock.release(); | |
156 | ||
157 | m_inputQueueLock.acquire(); | |
158 | ||
159 | /* bFlushed indicates that an empty output queue actually means all frames | |
160 | * have been decided (no more inputs for the encoder) */ | |
161 | if (m_inputQueue.empty()) | |
162 | m_bFlushed = true; | |
163 | m_inputQueueLock.release(); | |
164 | } | |
165 | ||
166 | /* Called by API thread. If the lookahead queue has not yet been filled the | |
167 | * first time, it immediately returns NULL. Else the function blocks until | |
168 | * outputs are available and then pops the first frame from the output queue. If | |
169 | * flush() has been called and the output queue is empty, NULL is returned. */ | |
170 | Frame* Lookahead::getDecidedPicture() | |
171 | { | |
172 | m_outputQueueLock.acquire(); | |
173 | ||
174 | if (m_bFilling) | |
175 | { | |
176 | m_outputQueueLock.release(); | |
177 | return NULL; | |
178 | } | |
179 | ||
180 | while (m_outputQueue.empty() && !m_bFlushed) | |
181 | { | |
182 | m_outputQueueLock.release(); | |
183 | m_outputAvailable.wait(); | |
184 | m_outputQueueLock.acquire(); | |
185 | } | |
186 | ||
187 | Frame *fenc = m_outputQueue.popFront(); | |
188 | m_outputQueueLock.release(); | |
189 | return fenc; | |
190 | } | |
191 | ||
192 | /* Called by pool worker threads */ | |
193 | bool Lookahead::findJob(int) | |
194 | { | |
b53f7c52 | 195 | if (m_bReady > 0 && ATOMIC_DEC(&m_bReady) == 0) |
72b9787e JB |
196 | { |
197 | m_inputQueueLock.acquire(); | |
198 | slicetypeDecide(); | |
199 | return true; | |
200 | } | |
201 | else | |
202 | return false; | |
203 | } | |
204 | ||
205 | /* Called by rate-control to calculate the estimated SATD cost for a given | |
206 | * picture. It assumes dpb->prepareEncode() has already been called for the | |
207 | * picture and all the references are established */ | |
208 | void Lookahead::getEstimatedPictureCost(Frame *curFrame) | |
209 | { | |
210 | Lowres *frames[X265_LOOKAHEAD_MAX]; | |
211 | ||
212 | // POC distances to each reference | |
213 | Slice *slice = curFrame->m_encData->m_slice; | |
214 | int p0 = 0, p1, b; | |
215 | int poc = slice->m_poc; | |
216 | int l0poc = slice->m_refPOCList[0][0]; | |
217 | int l1poc = slice->m_refPOCList[1][0]; | |
218 | ||
219 | switch (slice->m_sliceType) | |
220 | { | |
221 | case I_SLICE: | |
222 | frames[p0] = &curFrame->m_lowres; | |
223 | b = p1 = 0; | |
224 | break; | |
225 | ||
226 | case P_SLICE: | |
227 | b = p1 = poc - l0poc; | |
228 | frames[p0] = &slice->m_refPicList[0][0]->m_lowres; | |
229 | frames[b] = &curFrame->m_lowres; | |
230 | break; | |
231 | ||
232 | case B_SLICE: | |
233 | b = poc - l0poc; | |
234 | p1 = b + l1poc - poc; | |
235 | frames[p0] = &slice->m_refPicList[0][0]->m_lowres; | |
236 | frames[b] = &curFrame->m_lowres; | |
237 | frames[p1] = &slice->m_refPicList[1][0]->m_lowres; | |
238 | break; | |
239 | ||
240 | default: | |
241 | return; | |
242 | } | |
243 | ||
244 | if (m_param->rc.cuTree && !m_param->rc.bStatRead) | |
245 | /* update row satds based on cutree offsets */ | |
246 | curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b); | |
247 | else if (m_param->rc.aqMode) | |
248 | curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b]; | |
249 | else | |
250 | curFrame->m_lowres.satdCost = curFrame->m_lowres.costEst[b - p0][p1 - b]; | |
251 | ||
252 | if (m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate) | |
253 | { | |
254 | /* aggregate lowres row satds to CTU resolution */ | |
255 | curFrame->m_lowres.lowresCostForRc = curFrame->m_lowres.lowresCosts[b - p0][p1 - b]; | |
256 | uint32_t lowresRow = 0, lowresCol = 0, lowresCuIdx = 0, sum = 0; | |
257 | uint32_t scale = m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE); | |
258 | uint32_t numCuInHeight = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize; | |
259 | uint32_t widthInLowresCu = (uint32_t)m_widthInCU, heightInLowresCu = (uint32_t)m_heightInCU; | |
260 | double *qp_offset = 0; | |
261 | /* Factor in qpoffsets based on Aq/Cutree in CU costs */ | |
262 | if (m_param->rc.aqMode) | |
263 | qp_offset = (frames[b]->sliceType == X265_TYPE_B || !m_param->rc.cuTree) ? frames[b]->qpAqOffset : frames[b]->qpCuTreeOffset; | |
264 | ||
265 | for (uint32_t row = 0; row < numCuInHeight; row++) | |
266 | { | |
267 | lowresRow = row * scale; | |
268 | for (uint32_t cnt = 0; cnt < scale && lowresRow < heightInLowresCu; lowresRow++, cnt++) | |
269 | { | |
270 | sum = 0; | |
271 | lowresCuIdx = lowresRow * widthInLowresCu; | |
272 | for (lowresCol = 0; lowresCol < widthInLowresCu; lowresCol++, lowresCuIdx++) | |
273 | { | |
274 | uint16_t lowresCuCost = curFrame->m_lowres.lowresCostForRc[lowresCuIdx] & LOWRES_COST_MASK; | |
275 | if (qp_offset) | |
276 | { | |
277 | lowresCuCost = (uint16_t)((lowresCuCost * x265_exp2fix8(qp_offset[lowresCuIdx]) + 128) >> 8); | |
278 | int32_t intraCuCost = curFrame->m_lowres.intraCost[lowresCuIdx]; | |
279 | curFrame->m_lowres.intraCost[lowresCuIdx] = (intraCuCost * x265_exp2fix8(qp_offset[lowresCuIdx]) + 128) >> 8; | |
280 | } | |
281 | curFrame->m_lowres.lowresCostForRc[lowresCuIdx] = lowresCuCost; | |
282 | sum += lowresCuCost; | |
283 | } | |
284 | curFrame->m_encData->m_rowStat[row].satdForVbv += sum; | |
285 | } | |
286 | } | |
287 | } | |
288 | } | |
289 | ||
290 | /* called by API thread or worker thread with inputQueueLock acquired */ | |
291 | void Lookahead::slicetypeDecide() | |
292 | { | |
b53f7c52 JB |
293 | ProfileScopeEvent(slicetypeDecideEV); |
294 | ||
72b9787e JB |
295 | ScopedLock lock(m_decideLock); |
296 | ||
297 | Lowres *frames[X265_LOOKAHEAD_MAX]; | |
298 | Frame *list[X265_LOOKAHEAD_MAX]; | |
299 | int maxSearch = X265_MIN(m_param->lookaheadDepth, X265_LOOKAHEAD_MAX); | |
300 | ||
301 | memset(frames, 0, sizeof(frames)); | |
302 | memset(list, 0, sizeof(list)); | |
303 | { | |
304 | Frame *curFrame = m_inputQueue.first(); | |
305 | int j; | |
306 | for (j = 0; j < m_param->bframes + 2; j++) | |
307 | { | |
308 | if (!curFrame) break; | |
309 | list[j] = curFrame; | |
310 | curFrame = curFrame->m_next; | |
311 | } | |
312 | ||
313 | curFrame = m_inputQueue.first(); | |
314 | frames[0] = m_lastNonB; | |
315 | for (j = 0; j < maxSearch; j++) | |
316 | { | |
317 | if (!curFrame) break; | |
318 | frames[j + 1] = &curFrame->m_lowres; | |
319 | curFrame = curFrame->m_next; | |
320 | } | |
321 | ||
322 | maxSearch = j; | |
323 | } | |
324 | ||
325 | m_inputQueueLock.release(); | |
326 | ||
327 | if (!m_est.m_rows && list[0]) | |
328 | m_est.init(m_param, list[0]); | |
329 | ||
330 | if (m_lastNonB && !m_param->rc.bStatRead && | |
331 | ((m_param->bFrameAdaptive && m_param->bframes) || | |
332 | m_param->rc.cuTree || m_param->scenecutThreshold || | |
333 | (m_param->lookaheadDepth && m_param->rc.vbvBufferSize))) | |
334 | { | |
335 | slicetypeAnalyse(frames, false); | |
336 | } | |
337 | ||
338 | int bframes, brefs; | |
339 | for (bframes = 0, brefs = 0;; bframes++) | |
340 | { | |
341 | Lowres& frm = list[bframes]->m_lowres; | |
342 | ||
343 | if (frm.sliceType == X265_TYPE_BREF && !m_param->bBPyramid && brefs == m_param->bBPyramid) | |
344 | { | |
345 | frm.sliceType = X265_TYPE_B; | |
346 | x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid\n", | |
347 | frm.frameNum); | |
348 | } | |
349 | ||
350 | /* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available. | |
351 | smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it.*/ | |
352 | else if (frm.sliceType == X265_TYPE_BREF && m_param->bBPyramid && brefs && | |
353 | m_param->maxNumReferences <= (brefs + 3)) | |
354 | { | |
355 | frm.sliceType = X265_TYPE_B; | |
356 | x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid and %d reference frames\n", | |
357 | frm.sliceType, m_param->maxNumReferences); | |
358 | } | |
359 | ||
360 | if ( /*(!param->intraRefresh || frm.frameNum == 0) && */ frm.frameNum - m_lastKeyframe >= m_param->keyframeMax) | |
361 | { | |
362 | if (frm.sliceType == X265_TYPE_AUTO || frm.sliceType == X265_TYPE_I) | |
363 | frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR; | |
364 | bool warn = frm.sliceType != X265_TYPE_IDR; | |
365 | if (warn && m_param->bOpenGOP) | |
366 | warn &= frm.sliceType != X265_TYPE_I; | |
367 | if (warn) | |
368 | { | |
369 | x265_log(m_param, X265_LOG_WARNING, "specified frame type (%d) at %d is not compatible with keyframe interval\n", | |
370 | frm.sliceType, frm.frameNum); | |
371 | frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR; | |
372 | } | |
373 | } | |
374 | if (frm.sliceType == X265_TYPE_I && frm.frameNum - m_lastKeyframe >= m_param->keyframeMin) | |
375 | { | |
376 | if (m_param->bOpenGOP) | |
377 | { | |
378 | m_lastKeyframe = frm.frameNum; | |
379 | frm.bKeyframe = true; | |
380 | } | |
381 | else | |
382 | frm.sliceType = X265_TYPE_IDR; | |
383 | } | |
384 | if (frm.sliceType == X265_TYPE_IDR) | |
385 | { | |
386 | /* Closed GOP */ | |
387 | m_lastKeyframe = frm.frameNum; | |
388 | frm.bKeyframe = true; | |
389 | if (bframes > 0) | |
390 | { | |
391 | list[bframes - 1]->m_lowres.sliceType = X265_TYPE_P; | |
392 | bframes--; | |
393 | } | |
394 | } | |
395 | if (bframes == m_param->bframes || !list[bframes + 1]) | |
396 | { | |
397 | if (IS_X265_TYPE_B(frm.sliceType)) | |
398 | x265_log(m_param, X265_LOG_WARNING, "specified frame type is not compatible with max B-frames\n"); | |
399 | if (frm.sliceType == X265_TYPE_AUTO || IS_X265_TYPE_B(frm.sliceType)) | |
400 | frm.sliceType = X265_TYPE_P; | |
401 | } | |
402 | if (frm.sliceType == X265_TYPE_BREF) | |
403 | brefs++; | |
404 | if (frm.sliceType == X265_TYPE_AUTO) | |
405 | frm.sliceType = X265_TYPE_B; | |
406 | else if (!IS_X265_TYPE_B(frm.sliceType)) | |
407 | break; | |
408 | } | |
409 | ||
410 | if (bframes) | |
411 | list[bframes - 1]->m_lowres.bLastMiniGopBFrame = true; | |
412 | list[bframes]->m_lowres.leadingBframes = bframes; | |
413 | m_lastNonB = &list[bframes]->m_lowres; | |
414 | m_histogram[bframes]++; | |
415 | ||
416 | /* insert a bref into the sequence */ | |
417 | if (m_param->bBPyramid && bframes > 1 && !brefs) | |
418 | { | |
419 | list[bframes / 2]->m_lowres.sliceType = X265_TYPE_BREF; | |
420 | brefs++; | |
421 | } | |
72b9787e JB |
422 | /* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */ |
423 | if (m_param->rc.rateControlMode != X265_RC_CQP) | |
424 | { | |
425 | int p0, p1, b; | |
426 | /* For zero latency tuning, calculate frame cost to be used later in RC */ | |
427 | if (!maxSearch) | |
428 | { | |
429 | for (int i = 0; i <= bframes; i++) | |
430 | frames[i + 1] = &list[i]->m_lowres; | |
431 | } | |
432 | ||
433 | /* estimate new non-B cost */ | |
434 | p1 = b = bframes + 1; | |
435 | p0 = (IS_X265_TYPE_I(frames[bframes + 1]->sliceType)) ? b : 0; | |
436 | m_est.estimateFrameCost(frames, p0, p1, b, 0); | |
437 | ||
438 | if (bframes) | |
439 | { | |
440 | p0 = 0; // last nonb | |
441 | for (b = 1; b <= bframes; b++) | |
442 | { | |
443 | if (frames[b]->sliceType == X265_TYPE_B) | |
444 | for (p1 = b; frames[p1]->sliceType == X265_TYPE_B; p1++) | |
445 | ; // find new nonb or bref | |
446 | else | |
447 | p1 = bframes + 1; | |
448 | ||
449 | m_est.estimateFrameCost(frames, p0, p1, b, 0); | |
450 | ||
451 | if (frames[b]->sliceType == X265_TYPE_BREF) | |
452 | p0 = b; | |
453 | } | |
454 | } | |
455 | } | |
456 | ||
457 | m_inputQueueLock.acquire(); | |
458 | ||
459 | /* dequeue all frames from inputQueue that are about to be enqueued | |
460 | * in the output queue. The order is important because Frame can | |
461 | * only be in one list at a time */ | |
462 | int64_t pts[X265_BFRAME_MAX + 1]; | |
463 | for (int i = 0; i <= bframes; i++) | |
464 | { | |
465 | Frame *curFrame; | |
466 | curFrame = m_inputQueue.popFront(); | |
467 | pts[i] = curFrame->m_pts; | |
468 | maxSearch--; | |
469 | } | |
470 | ||
471 | m_inputQueueLock.release(); | |
472 | ||
473 | m_outputQueueLock.acquire(); | |
474 | /* add non-B to output queue */ | |
475 | int idx = 0; | |
476 | list[bframes]->m_reorderedPts = pts[idx++]; | |
477 | m_outputQueue.pushBack(*list[bframes]); | |
478 | ||
479 | /* Add B-ref frame next to P frame in output queue, the B-ref encode before non B-ref frame */ | |
480 | if (bframes > 1 && m_param->bBPyramid) | |
481 | { | |
482 | for (int i = 0; i < bframes; i++) | |
483 | { | |
484 | if (list[i]->m_lowres.sliceType == X265_TYPE_BREF) | |
485 | { | |
486 | list[i]->m_reorderedPts = pts[idx++]; | |
487 | m_outputQueue.pushBack(*list[i]); | |
488 | } | |
489 | } | |
490 | } | |
491 | ||
492 | /* add B frames to output queue */ | |
493 | for (int i = 0; i < bframes; i++) | |
494 | { | |
495 | /* push all the B frames into output queue except B-ref, which already pushed into output queue*/ | |
496 | if (list[i]->m_lowres.sliceType != X265_TYPE_BREF) | |
497 | { | |
498 | list[i]->m_reorderedPts = pts[idx++]; | |
499 | m_outputQueue.pushBack(*list[i]); | |
500 | } | |
501 | } | |
502 | ||
503 | bool isKeyFrameAnalyse = (m_param->rc.cuTree || (m_param->rc.vbvBufferSize && m_param->lookaheadDepth)) && !m_param->rc.bStatRead; | |
504 | if (isKeyFrameAnalyse && IS_X265_TYPE_I(m_lastNonB->sliceType)) | |
505 | { | |
506 | m_inputQueueLock.acquire(); | |
507 | Frame *curFrame = m_inputQueue.first(); | |
508 | frames[0] = m_lastNonB; | |
509 | int j; | |
510 | for (j = 0; j < maxSearch; j++) | |
511 | { | |
512 | frames[j + 1] = &curFrame->m_lowres; | |
513 | curFrame = curFrame->m_next; | |
514 | } | |
515 | ||
516 | frames[j + 1] = NULL; | |
517 | m_inputQueueLock.release(); | |
518 | slicetypeAnalyse(frames, true); | |
519 | } | |
520 | ||
521 | m_outputQueueLock.release(); | |
522 | m_outputAvailable.trigger(); | |
523 | } | |
524 | ||
525 | void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe) | |
526 | { | |
527 | int prevNonB = 0, curNonB = 1, idx = 0; | |
72b9787e JB |
528 | while (curNonB < numFrames && frames[curNonB]->sliceType == X265_TYPE_B) |
529 | curNonB++; | |
72b9787e | 530 | int nextNonB = keyframe ? prevNonB : curNonB; |
b53f7c52 JB |
531 | int nextB = prevNonB + 1; |
532 | int nextBRef = 0; | |
533 | int miniGopEnd = keyframe ? prevNonB : curNonB; | |
72b9787e JB |
534 | while (curNonB < numFrames + !keyframe) |
535 | { | |
536 | /* P/I cost: This shouldn't include the cost of nextNonB */ | |
537 | if (nextNonB != curNonB) | |
538 | { | |
539 | int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB; | |
540 | frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, p0, curNonB, curNonB); | |
541 | frames[nextNonB]->plannedType[idx] = frames[curNonB]->sliceType; | |
b53f7c52 JB |
542 | /* Save the nextNonB Cost in each B frame of the current miniGop */ |
543 | if (curNonB > miniGopEnd) | |
544 | { | |
545 | for (int j = nextB; j < miniGopEnd; j++) | |
546 | { | |
547 | frames[j]->plannedSatd[frames[j]->indB] = frames[nextNonB]->plannedSatd[idx]; | |
548 | frames[j]->plannedType[frames[j]->indB++] = frames[nextNonB]->plannedType[idx]; | |
549 | ||
550 | } | |
551 | } | |
72b9787e JB |
552 | idx++; |
553 | } | |
554 | /* Handle the B-frames: coded order */ | |
b53f7c52 JB |
555 | if (m_param->bBPyramid && curNonB - prevNonB > 1) |
556 | nextBRef = (prevNonB + curNonB + 1) / 2; | |
72b9787e | 557 | |
b53f7c52 | 558 | for (int i = prevNonB + 1; i < curNonB; i++, idx++) |
72b9787e | 559 | { |
b53f7c52 JB |
560 | int64_t satdCost = 0; int type = X265_TYPE_B; |
561 | if (nextBRef) | |
72b9787e | 562 | { |
b53f7c52 | 563 | if (i == nextBRef) |
72b9787e | 564 | { |
b53f7c52 JB |
565 | satdCost = vbvFrameCost(frames, prevNonB, curNonB, nextBRef); |
566 | type = X265_TYPE_BREF; | |
72b9787e | 567 | } |
b53f7c52 JB |
568 | else if (i < nextBRef) |
569 | satdCost = vbvFrameCost(frames, prevNonB, nextBRef, i); | |
72b9787e | 570 | else |
b53f7c52 | 571 | satdCost = vbvFrameCost(frames, nextBRef, curNonB, i); |
72b9787e | 572 | } |
b53f7c52 JB |
573 | else |
574 | satdCost = vbvFrameCost(frames, prevNonB, nextNonB, i); | |
575 | frames[nextNonB]->plannedSatd[idx] = satdCost; | |
576 | frames[nextNonB]->plannedType[idx] = type; | |
577 | /* Save the nextB Cost in each B frame of the current miniGop */ | |
72b9787e | 578 | |
b53f7c52 JB |
579 | for (int j = nextB; j < miniGopEnd; j++) |
580 | { | |
581 | if (nextBRef && i == nextBRef) | |
582 | break; | |
583 | if (j >= i && j !=nextBRef) | |
584 | continue; | |
585 | frames[j]->plannedSatd[frames[j]->indB] = satdCost; | |
586 | frames[j]->plannedType[frames[j]->indB++] = X265_TYPE_B; | |
587 | } | |
588 | } | |
72b9787e JB |
589 | prevNonB = curNonB; |
590 | curNonB++; | |
591 | while (curNonB <= numFrames && frames[curNonB]->sliceType == X265_TYPE_B) | |
592 | curNonB++; | |
593 | } | |
594 | ||
595 | frames[nextNonB]->plannedType[idx] = X265_TYPE_AUTO; | |
596 | } | |
597 | ||
598 | int64_t Lookahead::vbvFrameCost(Lowres **frames, int p0, int p1, int b) | |
599 | { | |
600 | int64_t cost = m_est.estimateFrameCost(frames, p0, p1, b, 0); | |
601 | ||
602 | if (m_param->rc.aqMode) | |
603 | { | |
604 | if (m_param->rc.cuTree) | |
605 | return frameCostRecalculate(frames, p0, p1, b); | |
606 | else | |
607 | return frames[b]->costEstAq[b - p0][p1 - b]; | |
608 | } | |
609 | return cost; | |
610 | } | |
611 | ||
612 | void Lookahead::slicetypeAnalyse(Lowres **frames, bool bKeyframe) | |
613 | { | |
614 | int numFrames, origNumFrames, keyintLimit, framecnt; | |
615 | int maxSearch = X265_MIN(m_param->lookaheadDepth, X265_LOOKAHEAD_MAX); | |
616 | int cuCount = NUM_CUS; | |
617 | int resetStart; | |
618 | bool bIsVbvLookahead = m_param->rc.vbvBufferSize && m_param->lookaheadDepth; | |
619 | ||
620 | /* count undecided frames */ | |
621 | for (framecnt = 0; framecnt < maxSearch; framecnt++) | |
622 | { | |
623 | Lowres *fenc = frames[framecnt + 1]; | |
624 | if (!fenc || fenc->sliceType != X265_TYPE_AUTO) | |
625 | break; | |
626 | } | |
627 | ||
628 | if (!framecnt) | |
629 | { | |
630 | if (m_param->rc.cuTree) | |
631 | cuTree(frames, 0, bKeyframe); | |
632 | return; | |
633 | } | |
634 | ||
635 | frames[framecnt + 1] = NULL; | |
636 | ||
637 | keyintLimit = m_param->keyframeMax - frames[0]->frameNum + m_lastKeyframe - 1; | |
638 | origNumFrames = numFrames = X265_MIN(framecnt, keyintLimit); | |
639 | ||
640 | if (bIsVbvLookahead) | |
641 | numFrames = framecnt; | |
642 | else if (m_param->bOpenGOP && numFrames < framecnt) | |
643 | numFrames++; | |
644 | else if (numFrames == 0) | |
645 | { | |
646 | frames[1]->sliceType = X265_TYPE_I; | |
647 | return; | |
648 | } | |
649 | ||
650 | int numBFrames = 0; | |
651 | int numAnalyzed = numFrames; | |
652 | if (m_param->scenecutThreshold && scenecut(frames, 0, 1, true, origNumFrames, maxSearch)) | |
653 | { | |
654 | frames[1]->sliceType = X265_TYPE_I; | |
655 | return; | |
656 | } | |
657 | ||
658 | if (m_param->bframes) | |
659 | { | |
660 | if (m_param->bFrameAdaptive == X265_B_ADAPT_TRELLIS) | |
661 | { | |
662 | if (numFrames > 1) | |
663 | { | |
664 | char best_paths[X265_BFRAME_MAX + 1][X265_LOOKAHEAD_MAX + 1] = { "", "P" }; | |
665 | int best_path_index = numFrames % (X265_BFRAME_MAX + 1); | |
666 | ||
667 | /* Perform the frametype analysis. */ | |
668 | for (int j = 2; j <= numFrames; j++) | |
669 | { | |
670 | slicetypePath(frames, j, best_paths); | |
671 | } | |
672 | ||
673 | numBFrames = (int)strspn(best_paths[best_path_index], "B"); | |
674 | ||
675 | /* Load the results of the analysis into the frame types. */ | |
676 | for (int j = 1; j < numFrames; j++) | |
677 | { | |
678 | frames[j]->sliceType = best_paths[best_path_index][j - 1] == 'B' ? X265_TYPE_B : X265_TYPE_P; | |
679 | } | |
680 | } | |
681 | frames[numFrames]->sliceType = X265_TYPE_P; | |
682 | } | |
683 | else if (m_param->bFrameAdaptive == X265_B_ADAPT_FAST) | |
684 | { | |
685 | int64_t cost1p0, cost2p0, cost1b1, cost2p1; | |
686 | ||
687 | for (int i = 0; i <= numFrames - 2; ) | |
688 | { | |
689 | cost2p1 = m_est.estimateFrameCost(frames, i + 0, i + 2, i + 2, 1); | |
690 | if (frames[i + 2]->intraMbs[2] > cuCount / 2) | |
691 | { | |
692 | frames[i + 1]->sliceType = X265_TYPE_P; | |
693 | frames[i + 2]->sliceType = X265_TYPE_P; | |
694 | i += 2; | |
695 | continue; | |
696 | } | |
697 | ||
698 | cost1b1 = m_est.estimateFrameCost(frames, i + 0, i + 2, i + 1, 0); | |
699 | cost1p0 = m_est.estimateFrameCost(frames, i + 0, i + 1, i + 1, 0); | |
700 | cost2p0 = m_est.estimateFrameCost(frames, i + 1, i + 2, i + 2, 0); | |
701 | ||
702 | if (cost1p0 + cost2p0 < cost1b1 + cost2p1) | |
703 | { | |
704 | frames[i + 1]->sliceType = X265_TYPE_P; | |
705 | i += 1; | |
706 | continue; | |
707 | } | |
708 | ||
709 | // arbitrary and untuned | |
710 | #define INTER_THRESH 300 | |
711 | #define P_SENS_BIAS (50 - m_param->bFrameBias) | |
712 | frames[i + 1]->sliceType = X265_TYPE_B; | |
713 | ||
714 | int j; | |
715 | for (j = i + 2; j <= X265_MIN(i + m_param->bframes, numFrames - 1); j++) | |
716 | { | |
717 | int64_t pthresh = X265_MAX(INTER_THRESH - P_SENS_BIAS * (j - i - 1), INTER_THRESH / 10); | |
718 | int64_t pcost = m_est.estimateFrameCost(frames, i + 0, j + 1, j + 1, 1); | |
719 | if (pcost > pthresh * cuCount || frames[j + 1]->intraMbs[j - i + 1] > cuCount / 3) | |
720 | break; | |
721 | frames[j]->sliceType = X265_TYPE_B; | |
722 | } | |
723 | ||
724 | frames[j]->sliceType = X265_TYPE_P; | |
725 | i = j; | |
726 | } | |
727 | frames[numFrames]->sliceType = X265_TYPE_P; | |
728 | numBFrames = 0; | |
729 | while (numBFrames < numFrames && frames[numBFrames + 1]->sliceType == X265_TYPE_B) | |
730 | { | |
731 | numBFrames++; | |
732 | } | |
733 | } | |
734 | else | |
735 | { | |
736 | numBFrames = X265_MIN(numFrames - 1, m_param->bframes); | |
737 | for (int j = 1; j < numFrames; j++) | |
738 | { | |
739 | frames[j]->sliceType = (j % (numBFrames + 1)) ? X265_TYPE_B : X265_TYPE_P; | |
740 | } | |
741 | ||
742 | frames[numFrames]->sliceType = X265_TYPE_P; | |
743 | } | |
744 | /* Check scenecut on the first minigop. */ | |
745 | for (int j = 1; j < numBFrames + 1; j++) | |
746 | { | |
747 | if (m_param->scenecutThreshold && scenecut(frames, j, j + 1, false, origNumFrames, maxSearch)) | |
748 | { | |
749 | frames[j]->sliceType = X265_TYPE_P; | |
750 | numAnalyzed = j; | |
751 | break; | |
752 | } | |
753 | } | |
754 | ||
755 | resetStart = bKeyframe ? 1 : X265_MIN(numBFrames + 2, numAnalyzed + 1); | |
756 | } | |
757 | else | |
758 | { | |
759 | for (int j = 1; j <= numFrames; j++) | |
760 | { | |
761 | frames[j]->sliceType = X265_TYPE_P; | |
762 | } | |
763 | ||
764 | resetStart = bKeyframe ? 1 : 2; | |
765 | } | |
766 | ||
767 | if (m_param->rc.cuTree) | |
768 | cuTree(frames, X265_MIN(numFrames, m_param->keyframeMax), bKeyframe); | |
769 | ||
770 | // if (!param->bIntraRefresh) | |
771 | for (int j = keyintLimit + 1; j <= numFrames; j += m_param->keyframeMax) | |
772 | { | |
773 | frames[j]->sliceType = X265_TYPE_I; | |
774 | resetStart = X265_MIN(resetStart, j + 1); | |
775 | } | |
776 | ||
777 | if (bIsVbvLookahead) | |
778 | vbvLookahead(frames, numFrames, bKeyframe); | |
779 | ||
780 | /* Restore frametypes for all frames that haven't actually been decided yet. */ | |
781 | for (int j = resetStart; j <= numFrames; j++) | |
782 | { | |
783 | frames[j]->sliceType = X265_TYPE_AUTO; | |
784 | } | |
785 | } | |
786 | ||
787 | bool Lookahead::scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames, int maxSearch) | |
788 | { | |
789 | /* Only do analysis during a normal scenecut check. */ | |
790 | if (bRealScenecut && m_param->bframes) | |
791 | { | |
792 | int origmaxp1 = p0 + 1; | |
793 | /* Look ahead to avoid coding short flashes as scenecuts. */ | |
794 | if (m_param->bFrameAdaptive == X265_B_ADAPT_TRELLIS) | |
795 | /* Don't analyse any more frames than the trellis would have covered. */ | |
796 | origmaxp1 += m_param->bframes; | |
797 | else | |
798 | origmaxp1++; | |
799 | int maxp1 = X265_MIN(origmaxp1, numFrames); | |
800 | ||
801 | /* Where A and B are scenes: AAAAAABBBAAAAAA | |
802 | * If BBB is shorter than (maxp1-p0), it is detected as a flash | |
803 | * and not considered a scenecut. */ | |
804 | for (int cp1 = p1; cp1 <= maxp1; cp1++) | |
805 | { | |
806 | if (!scenecutInternal(frames, p0, cp1, false)) | |
807 | /* Any frame in between p0 and cur_p1 cannot be a real scenecut. */ | |
808 | for (int i = cp1; i > p0; i--) | |
809 | { | |
810 | frames[i]->bScenecut = false; | |
811 | } | |
812 | } | |
813 | ||
814 | /* Where A-F are scenes: AAAAABBCCDDEEFFFFFF | |
815 | * If each of BB ... EE are shorter than (maxp1-p0), they are | |
816 | * detected as flashes and not considered scenecuts. | |
817 | * Instead, the first F frame becomes a scenecut. | |
818 | * If the video ends before F, no frame becomes a scenecut. */ | |
819 | for (int cp0 = p0; cp0 <= maxp1; cp0++) | |
820 | { | |
821 | if (origmaxp1 > maxSearch || (cp0 < maxp1 && scenecutInternal(frames, cp0, maxp1, false))) | |
822 | /* If cur_p0 is the p0 of a scenecut, it cannot be the p1 of a scenecut. */ | |
823 | frames[cp0]->bScenecut = false; | |
824 | } | |
825 | } | |
826 | ||
827 | /* Ignore frames that are part of a flash, i.e. cannot be real scenecuts. */ | |
828 | if (!frames[p1]->bScenecut) | |
829 | return false; | |
830 | return scenecutInternal(frames, p0, p1, bRealScenecut); | |
831 | } | |
832 | ||
833 | bool Lookahead::scenecutInternal(Lowres **frames, int p0, int p1, bool bRealScenecut) | |
834 | { | |
835 | Lowres *frame = frames[p1]; | |
836 | ||
837 | m_est.estimateFrameCost(frames, p0, p1, p1, 0); | |
838 | ||
839 | int64_t icost = frame->costEst[0][0]; | |
840 | int64_t pcost = frame->costEst[p1 - p0][0]; | |
841 | int gopSize = frame->frameNum - m_lastKeyframe; | |
842 | float threshMax = (float)(m_param->scenecutThreshold / 100.0); | |
843 | ||
844 | /* magic numbers pulled out of thin air */ | |
845 | float threshMin = (float)(threshMax * 0.25); | |
846 | float bias; | |
847 | ||
848 | if (m_param->keyframeMin == m_param->keyframeMax) | |
849 | threshMin = threshMax; | |
850 | if (gopSize <= m_param->keyframeMin / 4) | |
851 | bias = threshMin / 4; | |
852 | else if (gopSize <= m_param->keyframeMin) | |
853 | bias = threshMin * gopSize / m_param->keyframeMin; | |
854 | else | |
855 | { | |
856 | bias = threshMin | |
857 | + (threshMax - threshMin) | |
858 | * (gopSize - m_param->keyframeMin) | |
859 | / (m_param->keyframeMax - m_param->keyframeMin); | |
860 | } | |
861 | ||
862 | bool res = pcost >= (1.0 - bias) * icost; | |
863 | if (res && bRealScenecut) | |
864 | { | |
865 | int imb = frame->intraMbs[p1 - p0]; | |
866 | int pmb = NUM_CUS - imb; | |
867 | x265_log(m_param, X265_LOG_DEBUG, "scene cut at %d Icost:%d Pcost:%d ratio:%.4f bias:%.4f gop:%d (imb:%d pmb:%d)\n", | |
868 | frame->frameNum, icost, pcost, 1. - (double)pcost / icost, bias, gopSize, imb, pmb); | |
869 | } | |
870 | return res; | |
871 | } | |
872 | ||
873 | void Lookahead::slicetypePath(Lowres **frames, int length, char(*best_paths)[X265_LOOKAHEAD_MAX + 1]) | |
874 | { | |
875 | char paths[2][X265_LOOKAHEAD_MAX + 1]; | |
876 | int num_paths = X265_MIN(m_param->bframes + 1, length); | |
877 | int64_t best_cost = 1LL << 62; | |
878 | int idx = 0; | |
879 | ||
880 | /* Iterate over all currently possible paths */ | |
881 | for (int path = 0; path < num_paths; path++) | |
882 | { | |
883 | /* Add suffixes to the current path */ | |
884 | int len = length - (path + 1); | |
885 | memcpy(paths[idx], best_paths[len % (X265_BFRAME_MAX + 1)], len); | |
886 | memset(paths[idx] + len, 'B', path); | |
887 | strcpy(paths[idx] + len + path, "P"); | |
888 | ||
889 | /* Calculate the actual cost of the current path */ | |
890 | int64_t cost = slicetypePathCost(frames, paths[idx], best_cost); | |
891 | if (cost < best_cost) | |
892 | { | |
893 | best_cost = cost; | |
894 | idx ^= 1; | |
895 | } | |
896 | } | |
897 | ||
898 | /* Store the best path. */ | |
899 | memcpy(best_paths[length % (X265_BFRAME_MAX + 1)], paths[idx ^ 1], length); | |
900 | } | |
901 | ||
902 | int64_t Lookahead::slicetypePathCost(Lowres **frames, char *path, int64_t threshold) | |
903 | { | |
904 | int64_t cost = 0; | |
905 | int loc = 1; | |
906 | int cur_p = 0; | |
907 | ||
908 | path--; /* Since the 1st path element is really the second frame */ | |
909 | while (path[loc]) | |
910 | { | |
911 | int next_p = loc; | |
912 | /* Find the location of the next P-frame. */ | |
913 | while (path[next_p] != 'P') | |
914 | { | |
915 | next_p++; | |
916 | } | |
917 | ||
918 | /* Add the cost of the P-frame found above */ | |
919 | cost += m_est.estimateFrameCost(frames, cur_p, next_p, next_p, 0); | |
920 | /* Early terminate if the cost we have found is larger than the best path cost so far */ | |
921 | if (cost > threshold) | |
922 | break; | |
923 | ||
924 | if (m_param->bBPyramid && next_p - cur_p > 2) | |
925 | { | |
926 | int middle = cur_p + (next_p - cur_p) / 2; | |
927 | cost += m_est.estimateFrameCost(frames, cur_p, next_p, middle, 0); | |
928 | for (int next_b = loc; next_b < middle && cost < threshold; next_b++) | |
929 | { | |
930 | cost += m_est.estimateFrameCost(frames, cur_p, middle, next_b, 0); | |
931 | } | |
932 | ||
933 | for (int next_b = middle + 1; next_b < next_p && cost < threshold; next_b++) | |
934 | { | |
935 | cost += m_est.estimateFrameCost(frames, middle, next_p, next_b, 0); | |
936 | } | |
937 | } | |
938 | else | |
939 | { | |
940 | for (int next_b = loc; next_b < next_p && cost < threshold; next_b++) | |
941 | { | |
942 | cost += m_est.estimateFrameCost(frames, cur_p, next_p, next_b, 0); | |
943 | } | |
944 | } | |
945 | ||
946 | loc = next_p + 1; | |
947 | cur_p = next_p; | |
948 | } | |
949 | ||
950 | return cost; | |
951 | } | |
952 | ||
953 | void Lookahead::cuTree(Lowres **frames, int numframes, bool bIntra) | |
954 | { | |
955 | int idx = !bIntra; | |
956 | int lastnonb, curnonb = 1; | |
957 | int bframes = 0; | |
958 | ||
959 | x265_emms(); | |
960 | double totalDuration = 0.0; | |
961 | for (int j = 0; j <= numframes; j++) | |
962 | totalDuration += (double)m_param->fpsDenom / m_param->fpsNum; | |
963 | ||
964 | double averageDuration = totalDuration / (numframes + 1); | |
965 | ||
966 | int i = numframes; | |
967 | int cuCount = m_widthInCU * m_heightInCU; | |
968 | ||
969 | if (bIntra) | |
970 | m_est.estimateFrameCost(frames, 0, 0, 0, 0); | |
971 | ||
972 | while (i > 0 && frames[i]->sliceType == X265_TYPE_B) | |
973 | i--; | |
974 | ||
975 | lastnonb = i; | |
976 | ||
977 | /* Lookaheadless MB-tree is not a theoretically distinct case; the same extrapolation could | |
978 | * be applied to the end of a lookahead buffer of any size. However, it's most needed when | |
979 | * lookahead=0, so that's what's currently implemented. */ | |
980 | if (!m_param->lookaheadDepth) | |
981 | { | |
982 | if (bIntra) | |
983 | { | |
984 | memset(frames[0]->propagateCost, 0, cuCount * sizeof(uint16_t)); | |
985 | memcpy(frames[0]->qpCuTreeOffset, frames[0]->qpAqOffset, cuCount * sizeof(double)); | |
986 | return; | |
987 | } | |
988 | std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost); | |
989 | memset(frames[0]->propagateCost, 0, cuCount * sizeof(uint16_t)); | |
990 | } | |
991 | else | |
992 | { | |
993 | if (lastnonb < idx) | |
994 | return; | |
995 | memset(frames[lastnonb]->propagateCost, 0, cuCount * sizeof(uint16_t)); | |
996 | } | |
997 | ||
998 | while (i-- > idx) | |
999 | { | |
1000 | curnonb = i; | |
1001 | while (frames[curnonb]->sliceType == X265_TYPE_B && curnonb > 0) | |
1002 | curnonb--; | |
1003 | ||
1004 | if (curnonb < idx) | |
1005 | break; | |
1006 | ||
1007 | m_est.estimateFrameCost(frames, curnonb, lastnonb, lastnonb, 0); | |
1008 | memset(frames[curnonb]->propagateCost, 0, cuCount * sizeof(uint16_t)); | |
1009 | bframes = lastnonb - curnonb - 1; | |
1010 | if (m_param->bBPyramid && bframes > 1) | |
1011 | { | |
1012 | int middle = (bframes + 1) / 2 + curnonb; | |
1013 | m_est.estimateFrameCost(frames, curnonb, lastnonb, middle, 0); | |
1014 | memset(frames[middle]->propagateCost, 0, cuCount * sizeof(uint16_t)); | |
1015 | while (i > curnonb) | |
1016 | { | |
1017 | int p0 = i > middle ? middle : curnonb; | |
1018 | int p1 = i < middle ? middle : lastnonb; | |
1019 | if (i != middle) | |
1020 | { | |
1021 | m_est.estimateFrameCost(frames, p0, p1, i, 0); | |
1022 | estimateCUPropagate(frames, averageDuration, p0, p1, i, 0); | |
1023 | } | |
1024 | i--; | |
1025 | } | |
1026 | ||
1027 | estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, middle, 1); | |
1028 | } | |
1029 | else | |
1030 | { | |
1031 | while (i > curnonb) | |
1032 | { | |
1033 | m_est.estimateFrameCost(frames, curnonb, lastnonb, i, 0); | |
1034 | estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, i, 0); | |
1035 | i--; | |
1036 | } | |
1037 | } | |
1038 | estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, lastnonb, 1); | |
1039 | lastnonb = curnonb; | |
1040 | } | |
1041 | ||
1042 | if (!m_param->lookaheadDepth) | |
1043 | { | |
1044 | m_est.estimateFrameCost(frames, 0, lastnonb, lastnonb, 0); | |
1045 | estimateCUPropagate(frames, averageDuration, 0, lastnonb, lastnonb, 1); | |
1046 | std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost); | |
1047 | } | |
1048 | ||
1049 | cuTreeFinish(frames[lastnonb], averageDuration, lastnonb); | |
1050 | if (m_param->bBPyramid && bframes > 1 && !m_param->rc.vbvBufferSize) | |
1051 | cuTreeFinish(frames[lastnonb + (bframes + 1) / 2], averageDuration, 0); | |
1052 | } | |
1053 | ||
1054 | void Lookahead::estimateCUPropagate(Lowres **frames, double averageDuration, int p0, int p1, int b, int referenced) | |
1055 | { | |
1056 | uint16_t *refCosts[2] = { frames[p0]->propagateCost, frames[p1]->propagateCost }; | |
1057 | int32_t distScaleFactor = (((b - p0) << 8) + ((p1 - p0) >> 1)) / (p1 - p0); | |
1058 | int32_t bipredWeight = m_param->bEnableWeightedBiPred ? 64 - (distScaleFactor >> 2) : 32; | |
1059 | MV *mvs[2] = { frames[b]->lowresMvs[0][b - p0 - 1], frames[b]->lowresMvs[1][p1 - b - 1] }; | |
1060 | int32_t bipredWeights[2] = { bipredWeight, 64 - bipredWeight }; | |
1061 | ||
1062 | memset(m_scratch, 0, m_widthInCU * sizeof(int)); | |
1063 | ||
1064 | uint16_t *propagateCost = frames[b]->propagateCost; | |
1065 | ||
1066 | x265_emms(); | |
1067 | double fpsFactor = CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) / CLIP_DURATION(averageDuration); | |
1068 | ||
1069 | /* For non-refferd frames the source costs are always zero, so just memset one row and re-use it. */ | |
1070 | if (!referenced) | |
1071 | memset(frames[b]->propagateCost, 0, m_widthInCU * sizeof(uint16_t)); | |
1072 | ||
1073 | int32_t StrideInCU = m_widthInCU; | |
1074 | for (uint16_t blocky = 0; blocky < m_heightInCU; blocky++) | |
1075 | { | |
1076 | int cuIndex = blocky * StrideInCU; | |
1077 | primitives.propagateCost(m_scratch, propagateCost, | |
1078 | frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex, | |
1079 | frames[b]->invQscaleFactor + cuIndex, &fpsFactor, m_widthInCU); | |
1080 | ||
1081 | if (referenced) | |
1082 | propagateCost += m_widthInCU; | |
1083 | for (uint16_t blockx = 0; blockx < m_widthInCU; blockx++, cuIndex++) | |
1084 | { | |
1085 | int32_t propagate_amount = m_scratch[blockx]; | |
1086 | /* Don't propagate for an intra block. */ | |
1087 | if (propagate_amount > 0) | |
1088 | { | |
1089 | /* Access width-2 bitfield. */ | |
1090 | int32_t lists_used = frames[b]->lowresCosts[b - p0][p1 - b][cuIndex] >> LOWRES_COST_SHIFT; | |
1091 | /* Follow the MVs to the previous frame(s). */ | |
1092 | for (uint16_t list = 0; list < 2; list++) | |
1093 | { | |
1094 | if ((lists_used >> list) & 1) | |
1095 | { | |
1096 | #define CLIP_ADD(s, x) (s) = (uint16_t)X265_MIN((s) + (x), (1 << 16) - 1) | |
1097 | int32_t listamount = propagate_amount; | |
1098 | /* Apply bipred weighting. */ | |
1099 | if (lists_used == 3) | |
1100 | listamount = (listamount * bipredWeights[list] + 32) >> 6; | |
1101 | ||
1102 | /* Early termination for simple case of mv0. */ | |
1103 | if (!mvs[list][cuIndex].word) | |
1104 | { | |
1105 | CLIP_ADD(refCosts[list][cuIndex], listamount); | |
1106 | continue; | |
1107 | } | |
1108 | ||
1109 | int32_t x = mvs[list][cuIndex].x; | |
1110 | int32_t y = mvs[list][cuIndex].y; | |
1111 | int32_t cux = (x >> 5) + blockx; | |
1112 | int32_t cuy = (y >> 5) + blocky; | |
1113 | int32_t idx0 = cux + cuy * StrideInCU; | |
1114 | int32_t idx1 = idx0 + 1; | |
1115 | int32_t idx2 = idx0 + StrideInCU; | |
1116 | int32_t idx3 = idx0 + StrideInCU + 1; | |
1117 | x &= 31; | |
1118 | y &= 31; | |
1119 | int32_t idx0weight = (32 - y) * (32 - x); | |
1120 | int32_t idx1weight = (32 - y) * x; | |
1121 | int32_t idx2weight = y * (32 - x); | |
1122 | int32_t idx3weight = y * x; | |
1123 | ||
1124 | /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't | |
1125 | * be counted. */ | |
1126 | if (cux < m_widthInCU - 1 && cuy < m_heightInCU - 1 && cux >= 0 && cuy >= 0) | |
1127 | { | |
1128 | CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10); | |
1129 | CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10); | |
1130 | CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10); | |
1131 | CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10); | |
1132 | } | |
1133 | else /* Check offsets individually */ | |
1134 | { | |
1135 | if (cux < m_widthInCU && cuy < m_heightInCU && cux >= 0 && cuy >= 0) | |
1136 | CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10); | |
1137 | if (cux + 1 < m_widthInCU && cuy < m_heightInCU && cux + 1 >= 0 && cuy >= 0) | |
1138 | CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10); | |
1139 | if (cux < m_widthInCU && cuy + 1 < m_heightInCU && cux >= 0 && cuy + 1 >= 0) | |
1140 | CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10); | |
1141 | if (cux + 1 < m_widthInCU && cuy + 1 < m_heightInCU && cux + 1 >= 0 && cuy + 1 >= 0) | |
1142 | CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10); | |
1143 | } | |
1144 | } | |
1145 | } | |
1146 | } | |
1147 | } | |
1148 | } | |
1149 | ||
1150 | if (m_param->rc.vbvBufferSize && m_param->lookaheadDepth && referenced) | |
1151 | cuTreeFinish(frames[b], averageDuration, b == p1 ? b - p0 : 0); | |
1152 | } | |
1153 | ||
1154 | void Lookahead::cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance) | |
1155 | { | |
1156 | int fpsFactor = (int)(CLIP_DURATION(averageDuration) / CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) * 256); | |
1157 | double weightdelta = 0.0; | |
1158 | ||
1159 | if (ref0Distance && frame->weightedCostDelta[ref0Distance - 1] > 0) | |
1160 | weightdelta = (1.0 - frame->weightedCostDelta[ref0Distance - 1]); | |
1161 | ||
1162 | /* Allow the strength to be adjusted via qcompress, since the two | |
1163 | * concepts are very similar. */ | |
1164 | ||
1165 | int cuCount = m_widthInCU * m_heightInCU; | |
1166 | double strength = 5.0 * (1.0 - m_param->rc.qCompress); | |
1167 | ||
1168 | for (int cuIndex = 0; cuIndex < cuCount; cuIndex++) | |
1169 | { | |
1170 | int intracost = (frame->intraCost[cuIndex] * frame->invQscaleFactor[cuIndex] + 128) >> 8; | |
1171 | if (intracost) | |
1172 | { | |
1173 | int propagateCost = (frame->propagateCost[cuIndex] * fpsFactor + 128) >> 8; | |
1174 | double log2_ratio = X265_LOG2(intracost + propagateCost) - X265_LOG2(intracost) + weightdelta; | |
1175 | frame->qpCuTreeOffset[cuIndex] = frame->qpAqOffset[cuIndex] - strength * log2_ratio; | |
1176 | } | |
1177 | } | |
1178 | } | |
1179 | ||
1180 | /* If MB-tree changes the quantizers, we need to recalculate the frame cost without | |
1181 | * re-running lookahead. */ | |
1182 | int64_t Lookahead::frameCostRecalculate(Lowres** frames, int p0, int p1, int b) | |
1183 | { | |
1184 | int64_t score = 0; | |
1185 | int *rowSatd = frames[b]->rowSatds[b - p0][p1 - b]; | |
1186 | double *qp_offset = (frames[b]->sliceType == X265_TYPE_B) ? frames[b]->qpAqOffset : frames[b]->qpCuTreeOffset; | |
1187 | ||
1188 | x265_emms(); | |
1189 | for (int cuy = m_heightInCU - 1; cuy >= 0; cuy--) | |
1190 | { | |
1191 | rowSatd[cuy] = 0; | |
1192 | for (int cux = m_widthInCU - 1; cux >= 0; cux--) | |
1193 | { | |
1194 | int cuxy = cux + cuy * m_widthInCU; | |
1195 | int cuCost = frames[b]->lowresCosts[b - p0][p1 - b][cuxy] & LOWRES_COST_MASK; | |
1196 | double qp_adj = qp_offset[cuxy]; | |
1197 | cuCost = (cuCost * x265_exp2fix8(qp_adj) + 128) >> 8; | |
1198 | rowSatd[cuy] += cuCost; | |
1199 | if ((cuy > 0 && cuy < m_heightInCU - 1 && | |
1200 | cux > 0 && cux < m_widthInCU - 1) || | |
1201 | m_widthInCU <= 2 || m_heightInCU <= 2) | |
1202 | { | |
1203 | score += cuCost; | |
1204 | } | |
1205 | } | |
1206 | } | |
1207 | ||
1208 | return score; | |
1209 | } | |
1210 | ||
1211 | CostEstimate::CostEstimate(ThreadPool *p) | |
1212 | : WaveFront(p) | |
1213 | { | |
1214 | m_param = NULL; | |
1215 | m_curframes = NULL; | |
1216 | m_wbuffer[0] = m_wbuffer[1] = m_wbuffer[2] = m_wbuffer[3] = 0; | |
1217 | m_rows = NULL; | |
1218 | m_paddedLines = m_widthInCU = m_heightInCU = 0; | |
1219 | m_bDoSearch[0] = m_bDoSearch[1] = false; | |
1220 | m_curb = m_curp0 = m_curp1 = 0; | |
1221 | m_bFrameCompleted = false; | |
1222 | } | |
1223 | ||
1224 | CostEstimate::~CostEstimate() | |
1225 | { | |
1226 | for (int i = 0; i < 4; i++) | |
1227 | { | |
1228 | x265_free(m_wbuffer[i]); | |
1229 | } | |
1230 | ||
1231 | delete[] m_rows; | |
1232 | } | |
1233 | ||
1234 | void CostEstimate::init(x265_param *_param, Frame *curFrame) | |
1235 | { | |
1236 | m_param = _param; | |
1237 | m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; | |
1238 | m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; | |
1239 | ||
1240 | m_rows = new EstimateRow[m_heightInCU]; | |
1241 | for (int i = 0; i < m_heightInCU; i++) | |
1242 | { | |
1243 | m_rows[i].m_widthInCU = m_widthInCU; | |
1244 | m_rows[i].m_heightInCU = m_heightInCU; | |
1245 | m_rows[i].m_param = m_param; | |
1246 | } | |
1247 | ||
1248 | if (WaveFront::init(m_heightInCU)) | |
1249 | WaveFront::enableAllRows(); | |
1250 | else | |
1251 | m_pool = NULL; | |
1252 | ||
1253 | if (m_param->bEnableWeightedPred) | |
1254 | { | |
b53f7c52 | 1255 | PicYuv *orig = curFrame->m_fencPic; |
72b9787e JB |
1256 | m_paddedLines = curFrame->m_lowres.lines + 2 * orig->m_lumaMarginY; |
1257 | intptr_t padoffset = curFrame->m_lowres.lumaStride * orig->m_lumaMarginY + orig->m_lumaMarginX; | |
1258 | ||
1259 | /* allocate weighted lowres buffers */ | |
1260 | for (int i = 0; i < 4; i++) | |
1261 | { | |
1262 | m_wbuffer[i] = (pixel*)x265_malloc(sizeof(pixel) * (curFrame->m_lowres.lumaStride * m_paddedLines)); | |
1263 | m_weightedRef.lowresPlane[i] = m_wbuffer[i] + padoffset; | |
1264 | } | |
1265 | ||
b53f7c52 | 1266 | m_weightedRef.fpelPlane[0] = m_weightedRef.lowresPlane[0]; |
72b9787e JB |
1267 | m_weightedRef.lumaStride = curFrame->m_lowres.lumaStride; |
1268 | m_weightedRef.isLowres = true; | |
1269 | m_weightedRef.isWeighted = false; | |
1270 | } | |
1271 | } | |
1272 | ||
1273 | int64_t CostEstimate::estimateFrameCost(Lowres **frames, int p0, int p1, int b, bool bIntraPenalty) | |
1274 | { | |
1275 | int64_t score = 0; | |
1276 | Lowres *fenc = frames[b]; | |
1277 | ||
1278 | if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1 - b][0] != -1) | |
1279 | score = fenc->costEst[b - p0][p1 - b]; | |
1280 | else | |
1281 | { | |
1282 | m_weightedRef.isWeighted = false; | |
1283 | if (m_param->bEnableWeightedPred && b == p1 && b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF) | |
1284 | { | |
1285 | if (!fenc->bIntraCalculated) | |
1286 | estimateFrameCost(frames, b, b, b, 0); | |
1287 | weightsAnalyse(frames, b, p0); | |
1288 | } | |
1289 | ||
1290 | /* For each list, check to see whether we have lowres motion-searched this reference */ | |
1291 | m_bDoSearch[0] = b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF; | |
1292 | m_bDoSearch[1] = b != p1 && fenc->lowresMvs[1][p1 - b - 1][0].x == 0x7FFF; | |
1293 | ||
1294 | if (m_bDoSearch[0]) fenc->lowresMvs[0][b - p0 - 1][0].x = 0; | |
1295 | if (m_bDoSearch[1]) fenc->lowresMvs[1][p1 - b - 1][0].x = 0; | |
1296 | ||
1297 | m_curb = b; | |
1298 | m_curp0 = p0; | |
1299 | m_curp1 = p1; | |
1300 | m_curframes = frames; | |
1301 | fenc->costEst[b - p0][p1 - b] = 0; | |
1302 | fenc->costEstAq[b - p0][p1 - b] = 0; | |
1303 | ||
1304 | for (int i = 0; i < m_heightInCU; i++) | |
1305 | { | |
1306 | m_rows[i].init(); | |
72b9787e JB |
1307 | if (!fenc->bIntraCalculated) |
1308 | fenc->rowSatds[0][0][i] = 0; | |
1309 | fenc->rowSatds[b - p0][p1 - b][i] = 0; | |
1310 | } | |
1311 | ||
1312 | m_bFrameCompleted = false; | |
1313 | ||
1314 | if (m_pool) | |
1315 | { | |
1316 | WaveFront::enqueue(); | |
1317 | ||
1318 | // enableAllRows must be already called | |
1319 | enqueueRow(0); | |
1320 | while (!m_bFrameCompleted) | |
1321 | WaveFront::findJob(-1); | |
1322 | ||
1323 | WaveFront::dequeue(); | |
1324 | } | |
1325 | else | |
1326 | { | |
1327 | for (int row = 0; row < m_heightInCU; row++) | |
1328 | processRow(row, -1); | |
1329 | ||
1330 | x265_emms(); | |
1331 | } | |
1332 | ||
1333 | // Accumulate cost from each row | |
1334 | for (int row = 0; row < m_heightInCU; row++) | |
1335 | { | |
1336 | score += m_rows[row].m_costEst; | |
1337 | fenc->costEst[0][0] += m_rows[row].m_costIntra; | |
1338 | if (m_param->rc.aqMode) | |
1339 | { | |
1340 | fenc->costEstAq[0][0] += m_rows[row].m_costIntraAq; | |
1341 | fenc->costEstAq[b - p0][p1 - b] += m_rows[row].m_costEstAq; | |
1342 | } | |
1343 | fenc->intraMbs[b - p0] += m_rows[row].m_intraMbs; | |
1344 | } | |
1345 | ||
1346 | fenc->bIntraCalculated = true; | |
1347 | ||
1348 | if (b != p1) | |
1349 | score = (uint64_t)score * 100 / (130 + m_param->bFrameBias); | |
1350 | if (b != p0 || b != p1) //Not Intra cost | |
1351 | fenc->costEst[b - p0][p1 - b] = score; | |
1352 | } | |
1353 | ||
1354 | if (bIntraPenalty) | |
1355 | { | |
1356 | // arbitrary penalty for I-blocks after B-frames | |
1357 | int ncu = NUM_CUS; | |
1358 | score += (uint64_t)score * fenc->intraMbs[b - p0] / (ncu * 8); | |
1359 | } | |
1360 | return score; | |
1361 | } | |
1362 | ||
1363 | uint32_t CostEstimate::weightCostLuma(Lowres **frames, int b, int p0, WeightParam *wp) | |
1364 | { | |
1365 | Lowres *fenc = frames[b]; | |
1366 | Lowres *ref = frames[p0]; | |
b53f7c52 | 1367 | pixel *src = ref->fpelPlane[0]; |
72b9787e JB |
1368 | intptr_t stride = fenc->lumaStride; |
1369 | ||
1370 | if (wp) | |
1371 | { | |
1372 | int offset = wp->inputOffset << (X265_DEPTH - 8); | |
1373 | int scale = wp->inputWeight; | |
1374 | int denom = wp->log2WeightDenom; | |
1375 | int round = denom ? 1 << (denom - 1) : 0; | |
1376 | int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth | |
1377 | int widthHeight = (int)stride; | |
1378 | ||
1379 | primitives.weight_pp(ref->buffer[0], m_wbuffer[0], stride, widthHeight, m_paddedLines, | |
1380 | scale, round << correction, denom + correction, offset); | |
b53f7c52 | 1381 | src = m_weightedRef.fpelPlane[0]; |
72b9787e JB |
1382 | } |
1383 | ||
1384 | uint32_t cost = 0; | |
1385 | intptr_t pixoff = 0; | |
1386 | int mb = 0; | |
1387 | ||
1388 | for (int y = 0; y < fenc->lines; y += 8, pixoff = y * stride) | |
1389 | { | |
1390 | for (int x = 0; x < fenc->width; x += 8, mb++, pixoff += 8) | |
1391 | { | |
b53f7c52 | 1392 | int satd = primitives.satd[LUMA_8x8](src + pixoff, stride, fenc->fpelPlane[0] + pixoff, stride); |
72b9787e JB |
1393 | cost += X265_MIN(satd, fenc->intraCost[mb]); |
1394 | } | |
1395 | } | |
1396 | ||
1397 | return cost; | |
1398 | } | |
1399 | ||
1400 | void CostEstimate::weightsAnalyse(Lowres **frames, int b, int p0) | |
1401 | { | |
1402 | static const float epsilon = 1.f / 128.f; | |
1403 | Lowres *fenc, *ref; | |
1404 | ||
1405 | fenc = frames[b]; | |
1406 | ref = frames[p0]; | |
1407 | int deltaIndex = fenc->frameNum - ref->frameNum; | |
1408 | ||
1409 | /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */ | |
1410 | float guessScale, fencMean, refMean; | |
1411 | x265_emms(); | |
1412 | if (fenc->wp_ssd[0] && ref->wp_ssd[0]) | |
1413 | guessScale = sqrtf((float)fenc->wp_ssd[0] / ref->wp_ssd[0]); | |
1414 | else | |
1415 | guessScale = 1.0f; | |
1416 | fencMean = (float)fenc->wp_sum[0] / (fenc->lines * fenc->width) / (1 << (X265_DEPTH - 8)); | |
1417 | refMean = (float)ref->wp_sum[0] / (fenc->lines * fenc->width) / (1 << (X265_DEPTH - 8)); | |
1418 | ||
1419 | /* Early termination */ | |
1420 | if (fabsf(refMean - fencMean) < 0.5f && fabsf(1.f - guessScale) < epsilon) | |
1421 | return; | |
1422 | ||
1423 | int minoff = 0, minscale, mindenom; | |
1424 | unsigned int minscore = 0, origscore = 1; | |
1425 | int found = 0; | |
1426 | ||
1427 | m_w.setFromWeightAndOffset((int)(guessScale * 128 + 0.5f), 0, 7, true); | |
1428 | mindenom = m_w.log2WeightDenom; | |
1429 | minscale = m_w.inputWeight; | |
1430 | ||
1431 | origscore = minscore = weightCostLuma(frames, b, p0, NULL); | |
1432 | ||
1433 | if (!minscore) | |
1434 | return; | |
1435 | ||
1436 | unsigned int s = 0; | |
1437 | int curScale = minscale; | |
1438 | int curOffset = (int)(fencMean - refMean * curScale / (1 << mindenom) + 0.5f); | |
1439 | if (curOffset < -128 || curOffset > 127) | |
1440 | { | |
1441 | /* Rescale considering the constraints on curOffset. We do it in this order | |
1442 | * because scale has a much wider range than offset (because of denom), so | |
1443 | * it should almost never need to be clamped. */ | |
1444 | curOffset = Clip3(-128, 127, curOffset); | |
1445 | curScale = (int)((1 << mindenom) * (fencMean - curOffset) / refMean + 0.5f); | |
1446 | curScale = Clip3(0, 127, curScale); | |
1447 | } | |
1448 | SET_WEIGHT(m_w, 1, curScale, mindenom, curOffset); | |
1449 | s = weightCostLuma(frames, b, p0, &m_w); | |
1450 | COPY4_IF_LT(minscore, s, minscale, curScale, minoff, curOffset, found, 1); | |
1451 | ||
1452 | /* Use a smaller denominator if possible */ | |
1453 | while (mindenom > 0 && !(minscale & 1)) | |
1454 | { | |
1455 | mindenom--; | |
1456 | minscale >>= 1; | |
1457 | } | |
1458 | ||
1459 | if (!found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f) | |
1460 | return; | |
1461 | else | |
1462 | { | |
1463 | SET_WEIGHT(m_w, 1, minscale, mindenom, minoff); | |
1464 | // set weighted delta cost | |
1465 | fenc->weightedCostDelta[deltaIndex] = minscore / origscore; | |
1466 | ||
1467 | int offset = m_w.inputOffset << (X265_DEPTH - 8); | |
1468 | int scale = m_w.inputWeight; | |
1469 | int denom = m_w.log2WeightDenom; | |
1470 | int round = denom ? 1 << (denom - 1) : 0; | |
1471 | int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth | |
1472 | intptr_t stride = ref->lumaStride; | |
1473 | int widthHeight = (int)stride; | |
1474 | ||
1475 | for (int i = 0; i < 4; i++) | |
1476 | primitives.weight_pp(ref->buffer[i], m_wbuffer[i], stride, widthHeight, m_paddedLines, | |
1477 | scale, round << correction, denom + correction, offset); | |
1478 | ||
1479 | m_weightedRef.isWeighted = true; | |
1480 | } | |
1481 | } | |
1482 | ||
1483 | void CostEstimate::processRow(int row, int /*threadId*/) | |
1484 | { | |
b53f7c52 JB |
1485 | ProfileScopeEvent(costEstimateRow); |
1486 | ||
72b9787e JB |
1487 | int realrow = m_heightInCU - 1 - row; |
1488 | Lowres **frames = m_curframes; | |
1489 | ReferencePlanes *wfref0 = m_weightedRef.isWeighted ? &m_weightedRef : frames[m_curp0]; | |
1490 | ||
1491 | /* Lowres lookahead goes backwards because the MVs are used as | |
1492 | * predictors in the main encode. This considerably improves MV | |
1493 | * prediction overall. */ | |
1494 | for (int i = m_widthInCU - 1 - m_rows[row].m_completed; i >= 0; i--) | |
1495 | { | |
1496 | // TODO: use lowres MVs as motion candidates in full-res search | |
1497 | m_rows[row].estimateCUCost(frames, wfref0, i, realrow, m_curp0, m_curp1, m_curb, m_bDoSearch); | |
1498 | m_rows[row].m_completed++; | |
1499 | ||
1500 | if (m_rows[row].m_completed >= 2 && row < m_heightInCU - 1) | |
1501 | { | |
1502 | ScopedLock below(m_rows[row + 1].m_lock); | |
1503 | if (m_rows[row + 1].m_active == false && | |
1504 | m_rows[row + 1].m_completed + 2 <= m_rows[row].m_completed) | |
1505 | { | |
1506 | m_rows[row + 1].m_active = true; | |
1507 | enqueueRow(row + 1); | |
1508 | } | |
1509 | } | |
1510 | ||
1511 | ScopedLock self(m_rows[row].m_lock); | |
1512 | if (row > 0 && (int32_t)m_rows[row].m_completed < m_widthInCU - 1 && | |
1513 | m_rows[row - 1].m_completed < m_rows[row].m_completed + 2) | |
1514 | { | |
1515 | m_rows[row].m_active = false; | |
1516 | return; | |
1517 | } | |
1518 | } | |
1519 | ||
1520 | if (row == m_heightInCU - 1) | |
1521 | m_bFrameCompleted = true; | |
1522 | } | |
1523 | ||
1524 | void EstimateRow::init() | |
1525 | { | |
1526 | m_costEst = 0; | |
1527 | m_costEstAq = 0; | |
1528 | m_costIntra = 0; | |
1529 | m_costIntraAq = 0; | |
1530 | m_intraMbs = 0; | |
1531 | m_active = false; | |
1532 | m_completed = 0; | |
1533 | } | |
1534 | ||
1535 | void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2]) | |
1536 | { | |
1537 | Lowres *fref1 = frames[p1]; | |
1538 | Lowres *fenc = frames[b]; | |
1539 | ||
1540 | const int bBidir = (b < p1); | |
1541 | const int cuXY = cux + cuy * m_widthInCU; | |
1542 | const int cuSize = X265_LOWRES_CU_SIZE; | |
1543 | const intptr_t pelOffset = cuSize * cux + cuSize * cuy * fenc->lumaStride; | |
1544 | ||
1545 | // should this CU's cost contribute to the frame cost? | |
1546 | const bool bFrameScoreCU = (cux > 0 && cux < m_widthInCU - 1 && | |
1547 | cuy > 0 && cuy < m_heightInCU - 1) || m_widthInCU <= 2 || m_heightInCU <= 2; | |
1548 | ||
b53f7c52 | 1549 | m_me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize); |
72b9787e JB |
1550 | |
1551 | /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */ | |
1552 | int lowresPenalty = 4; | |
1553 | ||
1554 | MV(*fenc_mvs[2]) = { &fenc->lowresMvs[0][b - p0 - 1][cuXY], | |
1555 | &fenc->lowresMvs[1][p1 - b - 1][cuXY] }; | |
1556 | int(*fenc_costs[2]) = { &fenc->lowresMvCosts[0][b - p0 - 1][cuXY], | |
1557 | &fenc->lowresMvCosts[1][p1 - b - 1][cuXY] }; | |
1558 | ||
1559 | MV mvmin, mvmax; | |
1560 | int bcost = m_me.COST_MAX; | |
1561 | int listused = 0; | |
1562 | ||
1563 | // establish search bounds that don't cross extended frame boundaries | |
1564 | mvmin.x = (int16_t)(-cux * cuSize - 8); | |
1565 | mvmin.y = (int16_t)(-cuy * cuSize - 8); | |
1566 | mvmax.x = (int16_t)((m_widthInCU - cux - 1) * cuSize + 8); | |
1567 | mvmax.y = (int16_t)((m_heightInCU - cuy - 1) * cuSize + 8); | |
1568 | ||
1569 | if (p0 != p1) | |
1570 | { | |
1571 | for (int i = 0; i < 1 + bBidir; i++) | |
1572 | { | |
1573 | if (!bDoSearch[i]) | |
1574 | { | |
1575 | /* Use previously calculated cost */ | |
1576 | COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1); | |
1577 | continue; | |
1578 | } | |
1579 | int numc = 0; | |
1580 | MV mvc[4], mvp; | |
1581 | MV *fenc_mv = fenc_mvs[i]; | |
1582 | ||
1583 | /* Reverse-order MV prediction. */ | |
1584 | mvc[0] = 0; | |
1585 | mvc[2] = 0; | |
1586 | #define MVC(mv) mvc[numc++] = mv; | |
1587 | if (cux < m_widthInCU - 1) | |
1588 | MVC(fenc_mv[1]); | |
1589 | if (cuy < m_heightInCU - 1) | |
1590 | { | |
1591 | MVC(fenc_mv[m_widthInCU]); | |
1592 | if (cux > 0) | |
1593 | MVC(fenc_mv[m_widthInCU - 1]); | |
1594 | if (cux < m_widthInCU - 1) | |
1595 | MVC(fenc_mv[m_widthInCU + 1]); | |
1596 | } | |
1597 | #undef MVC | |
1598 | if (numc <= 1) | |
1599 | mvp = mvc[0]; | |
1600 | else | |
1601 | { | |
1602 | median_mv(mvp, mvc[0], mvc[1], mvc[2]); | |
1603 | } | |
1604 | ||
1605 | *fenc_costs[i] = m_me.motionEstimate(i ? fref1 : wfref0, mvmin, mvmax, mvp, numc, mvc, m_merange, *fenc_mvs[i]); | |
1606 | COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1); | |
1607 | } | |
1608 | if (bBidir) | |
1609 | { | |
b53f7c52 JB |
1610 | ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]); |
1611 | ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]); | |
72b9787e JB |
1612 | intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE; |
1613 | pixel *src0 = wfref0->lowresMC(pelOffset, *fenc_mvs[0], subpelbuf0, stride0); | |
1614 | pixel *src1 = fref1->lowresMC(pelOffset, *fenc_mvs[1], subpelbuf1, stride1); | |
1615 | ||
b53f7c52 | 1616 | ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]); |
72b9787e JB |
1617 | primitives.pixelavg_pp[LUMA_8x8](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32); |
1618 | int bicost = primitives.satd[LUMA_8x8](fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE); | |
1619 | COPY2_IF_LT(bcost, bicost, listused, 3); | |
1620 | ||
1621 | // Try 0,0 candidates | |
1622 | src0 = wfref0->lowresPlane[0] + pelOffset; | |
1623 | src1 = fref1->lowresPlane[0] + pelOffset; | |
1624 | primitives.pixelavg_pp[LUMA_8x8](ref, X265_LOWRES_CU_SIZE, src0, wfref0->lumaStride, src1, fref1->lumaStride, 32); | |
1625 | bicost = primitives.satd[LUMA_8x8](fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE); | |
1626 | COPY2_IF_LT(bcost, bicost, listused, 3); | |
1627 | } | |
1628 | } | |
1629 | if (!fenc->bIntraCalculated) | |
1630 | { | |
1631 | const int sizeIdx = X265_LOWRES_CU_BITS - 2; // partition size | |
1632 | ||
1633 | pixel _above0[X265_LOWRES_CU_SIZE * 4 + 1], *const above0 = _above0 + 2 * X265_LOWRES_CU_SIZE; | |
1634 | pixel _above1[X265_LOWRES_CU_SIZE * 4 + 1], *const above1 = _above1 + 2 * X265_LOWRES_CU_SIZE; | |
1635 | pixel _left0[X265_LOWRES_CU_SIZE * 4 + 1], *const left0 = _left0 + 2 * X265_LOWRES_CU_SIZE; | |
1636 | pixel _left1[X265_LOWRES_CU_SIZE * 4 + 1], *const left1 = _left1 + 2 * X265_LOWRES_CU_SIZE; | |
1637 | ||
1638 | pixel *pix_cur = fenc->lowresPlane[0] + pelOffset; | |
1639 | ||
1640 | // Copy Above | |
1641 | memcpy(above0, pix_cur - 1 - fenc->lumaStride, (cuSize + 1) * sizeof(pixel)); | |
1642 | ||
1643 | // Copy Left | |
1644 | for (int i = 0; i < cuSize + 1; i++) | |
72b9787e | 1645 | left0[i] = pix_cur[-1 - fenc->lumaStride + i * fenc->lumaStride]; |
72b9787e JB |
1646 | |
1647 | for (int i = 0; i < cuSize; i++) | |
1648 | { | |
1649 | above0[cuSize + i + 1] = above0[cuSize]; | |
1650 | left0[cuSize + i + 1] = left0[cuSize]; | |
1651 | } | |
1652 | ||
1653 | // filtering with [1 2 1] | |
1654 | // assume getUseStrongIntraSmoothing() is disabled | |
1655 | above1[0] = above0[0]; | |
1656 | above1[2 * cuSize] = above0[2 * cuSize]; | |
1657 | left1[0] = left0[0]; | |
1658 | left1[2 * cuSize] = left0[2 * cuSize]; | |
1659 | for (int i = 1; i < 2 * cuSize; i++) | |
1660 | { | |
1661 | above1[i] = (above0[i - 1] + 2 * above0[i] + above0[i + 1] + 2) >> 2; | |
1662 | left1[i] = (left0[i - 1] + 2 * left0[i] + left0[i + 1] + 2) >> 2; | |
1663 | } | |
1664 | ||
1665 | int predsize = cuSize * cuSize; | |
1666 | ||
1667 | // generate 35 intra predictions into m_predictions | |
1668 | pixelcmp_t satd = primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)]; | |
b53f7c52 | 1669 | int icost = m_me.COST_MAX; |
72b9787e | 1670 | primitives.intra_pred[DC_IDX][sizeIdx](m_predictions, cuSize, left0, above0, 0, (cuSize <= 16)); |
b53f7c52 | 1671 | int cost = m_me.bufSATD(m_predictions, cuSize); |
72b9787e JB |
1672 | if (cost < icost) |
1673 | icost = cost; | |
1674 | pixel *above = (cuSize >= 8) ? above1 : above0; | |
1675 | pixel *left = (cuSize >= 8) ? left1 : left0; | |
1676 | primitives.intra_pred[PLANAR_IDX][sizeIdx](m_predictions, cuSize, left, above, 0, 0); | |
b53f7c52 | 1677 | cost = m_me.bufSATD(m_predictions, cuSize); |
72b9787e JB |
1678 | if (cost < icost) |
1679 | icost = cost; | |
1680 | primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 * predsize, above0, left0, above1, left1, (cuSize <= 16)); | |
1681 | ||
1682 | // calculate satd costs, keep least cost | |
1683 | ALIGN_VAR_32(pixel, buf_trans[32 * 32]); | |
b53f7c52 | 1684 | primitives.transpose[sizeIdx](buf_trans, m_me.fencPUYuv.m_buf[0], FENC_STRIDE); |
72b9787e JB |
1685 | |
1686 | int acost = m_me.COST_MAX; | |
1687 | uint32_t mode, lowmode = 4; | |
1688 | for (mode = 5; mode < 35; mode += 5) | |
1689 | { | |
1690 | if (mode < 18) | |
1691 | cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize); | |
1692 | else | |
b53f7c52 | 1693 | cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize); |
72b9787e JB |
1694 | COPY2_IF_LT(acost, cost, lowmode, mode); |
1695 | } | |
1696 | for (uint32_t dist = 2; dist >= 1; dist--) | |
1697 | { | |
1698 | mode = lowmode - dist; | |
1699 | if (mode < 18) | |
1700 | cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize); | |
1701 | else | |
b53f7c52 | 1702 | cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize); |
72b9787e JB |
1703 | COPY2_IF_LT(acost, cost, lowmode, mode); |
1704 | ||
1705 | mode = lowmode + dist; | |
1706 | if (mode < 18) | |
1707 | cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize); | |
1708 | else | |
b53f7c52 | 1709 | cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize); |
72b9787e JB |
1710 | COPY2_IF_LT(acost, cost, lowmode, mode); |
1711 | } | |
1712 | if (acost < icost) | |
1713 | icost = acost; | |
1714 | ||
1715 | const int intraPenalty = 5 * m_lookAheadLambda; | |
1716 | icost += intraPenalty + lowresPenalty; /* estimate intra signal cost */ | |
1717 | fenc->intraCost[cuXY] = icost; | |
b53f7c52 | 1718 | fenc->intraMode[cuXY] = (uint8_t)lowmode; |
72b9787e JB |
1719 | int icostAq = icost; |
1720 | if (bFrameScoreCU) | |
1721 | { | |
1722 | m_costIntra += icost; | |
1723 | if (fenc->invQscaleFactor) | |
1724 | { | |
1725 | icostAq = (icost * fenc->invQscaleFactor[cuXY] + 128) >> 8; | |
1726 | m_costIntraAq += icostAq; | |
1727 | } | |
1728 | } | |
1729 | fenc->rowSatds[0][0][cuy] += icostAq; | |
1730 | } | |
1731 | bcost += lowresPenalty; | |
1732 | if (!bBidir) | |
1733 | { | |
1734 | if (fenc->intraCost[cuXY] < bcost) | |
1735 | { | |
1736 | if (bFrameScoreCU) m_intraMbs++; | |
1737 | bcost = fenc->intraCost[cuXY]; | |
1738 | listused = 0; | |
1739 | } | |
1740 | } | |
1741 | ||
1742 | /* For I frames these costs were accumulated earlier */ | |
1743 | if (p0 != p1) | |
1744 | { | |
1745 | int bcostAq = bcost; | |
1746 | if (bFrameScoreCU) | |
1747 | { | |
1748 | m_costEst += bcost; | |
1749 | if (fenc->invQscaleFactor) | |
1750 | { | |
1751 | bcostAq = (bcost * fenc->invQscaleFactor[cuXY] + 128) >> 8; | |
1752 | m_costEstAq += bcostAq; | |
1753 | } | |
1754 | } | |
1755 | fenc->rowSatds[b - p0][p1 - b][cuy] += bcostAq; | |
1756 | } | |
1757 | fenc->lowresCosts[b - p0][p1 - b][cuXY] = (uint16_t)(X265_MIN(bcost, LOWRES_COST_MASK) | (listused << LOWRES_COST_SHIFT)); | |
1758 | } |