Imported Upstream version 1.4+222+hg5f9f7194267b
[deb_x265.git] / source / encoder / slicetype.cpp
CommitLineData
72b9787e
JB
1/*****************************************************************************
2 * Copyright (C) 2013 x265 project
3 *
4 * Authors: Gopu Govindaswamy <gopu@multicorewareinc.com>
5 * Steve Borho <steve@borho.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20 *
21 * This program is also available under a commercial proprietary license.
22 * For more information, contact us at license @ x265.com.
23 *****************************************************************************/
24
25#include "common.h"
26#include "frame.h"
27#include "framedata.h"
28#include "picyuv.h"
29#include "primitives.h"
30#include "lowres.h"
31#include "mv.h"
32
33#include "slicetype.h"
34#include "motion.h"
35#include "ratecontrol.h"
36
/* Number of lowres CUs taken into account by the lookahead heuristics.
 * When the lowres frame is more than two CUs wide and tall this is the
 * (width - 2) x (height - 2) grid, otherwise it is the full grid.
 * Expands against the m_widthInCU / m_heightInCU members in scope. */
#define NUM_CUS \
    ((m_widthInCU > 2 && m_heightInCU > 2) ? (m_widthInCU - 2) * (m_heightInCU - 2) \
                                           : m_widthInCU * m_heightInCU)
38
39using namespace x265;
40
/* Returns the median (middle value) of three 16-bit integers. */
static inline int16_t median(int16_t a, int16_t b, int16_t c)
{
    /* order the first two values */
    const int16_t lo = a < b ? a : b;
    const int16_t hi = a < b ? b : a;

    /* clamp c into [lo, hi]: max(lo, c) first, then min(hi, ...) */
    const int16_t atLeastLo = lo < c ? c : lo;

    return atLeastLo < hi ? atLeastLo : hi;
}
51
52static inline void median_mv(MV &dst, MV a, MV b, MV c)
53{
54 dst.x = median(a.x, b.x, c.x);
55 dst.y = median(a.y, b.y, c.y);
56}
57
58Lookahead::Lookahead(x265_param *param, ThreadPool* pool)
59 : JobProvider(pool)
60 , m_est(pool)
61{
62 m_bReady = 0;
63 m_param = param;
64 m_lastKeyframe = -m_param->keyframeMax;
65 m_lastNonB = NULL;
66 m_bFilling = true;
67 m_bFlushed = false;
68 m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
69 m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
70 m_scratch = (int*)x265_malloc(m_widthInCU * sizeof(int));
71 memset(m_histogram, 0, sizeof(m_histogram));
72}
73
74Lookahead::~Lookahead() { }
75
76void Lookahead::init()
77{
78 if (m_pool && m_pool->getThreadCount() >= 4 &&
79 ((m_param->bFrameAdaptive && m_param->bframes) ||
80 m_param->rc.cuTree || m_param->scenecutThreshold ||
81 (m_param->lookaheadDepth && m_param->rc.vbvBufferSize)))
82 m_pool = m_pool; /* allow use of worker thread */
83 else
84 m_pool = NULL; /* disable use of worker thread */
85}
86
87void Lookahead::destroy()
88{
89 if (m_pool)
90 // flush will dequeue, if it is necessary
91 JobProvider::flush();
92
93 // these two queues will be empty unless the encode was aborted
94 while (!m_inputQueue.empty())
95 {
96 Frame* curFrame = m_inputQueue.popFront();
97 curFrame->destroy();
98 delete curFrame;
99 }
100
101 while (!m_outputQueue.empty())
102 {
103 Frame* curFrame = m_outputQueue.popFront();
104 curFrame->destroy();
105 delete curFrame;
106 }
107
108 x265_free(m_scratch);
109}
110
111/* Called by API thread */
112void Lookahead::addPicture(Frame *curFrame, int sliceType)
113{
b53f7c52 114 PicYuv *orig = curFrame->m_fencPic;
72b9787e
JB
115
116 curFrame->m_lowres.init(orig, curFrame->m_poc, sliceType);
117
118 m_inputQueueLock.acquire();
119 m_inputQueue.pushBack(*curFrame);
120
121 if (m_inputQueue.size() >= m_param->lookaheadDepth)
122 {
123 /* when queue fills the first time, run slicetypeDecide synchronously,
124 * since the encoder will always be blocked here */
125 if (m_pool && !m_bFilling)
126 {
127 m_inputQueueLock.release();
128 m_bReady = 1;
129 m_pool->pokeIdleThread();
130 }
131 else
132 slicetypeDecide();
133
134 if (m_bFilling && m_pool)
135 JobProvider::enqueue();
136 m_bFilling = false;
137 }
138 else
139 m_inputQueueLock.release();
140}
141
142/* Called by API thread */
143void Lookahead::flush()
144{
145 /* just in case the input queue is never allowed to fill */
146 m_bFilling = false;
147
148 /* flush synchronously */
149 m_inputQueueLock.acquire();
150 if (!m_inputQueue.empty())
151 {
152 slicetypeDecide();
153 }
154 else
155 m_inputQueueLock.release();
156
157 m_inputQueueLock.acquire();
158
159 /* bFlushed indicates that an empty output queue actually means all frames
160 * have been decided (no more inputs for the encoder) */
161 if (m_inputQueue.empty())
162 m_bFlushed = true;
163 m_inputQueueLock.release();
164}
165
166/* Called by API thread. If the lookahead queue has not yet been filled the
167 * first time, it immediately returns NULL. Else the function blocks until
168 * outputs are available and then pops the first frame from the output queue. If
169 * flush() has been called and the output queue is empty, NULL is returned. */
170Frame* Lookahead::getDecidedPicture()
171{
172 m_outputQueueLock.acquire();
173
174 if (m_bFilling)
175 {
176 m_outputQueueLock.release();
177 return NULL;
178 }
179
180 while (m_outputQueue.empty() && !m_bFlushed)
181 {
182 m_outputQueueLock.release();
183 m_outputAvailable.wait();
184 m_outputQueueLock.acquire();
185 }
186
187 Frame *fenc = m_outputQueue.popFront();
188 m_outputQueueLock.release();
189 return fenc;
190}
191
192/* Called by pool worker threads */
193bool Lookahead::findJob(int)
194{
b53f7c52 195 if (m_bReady > 0 && ATOMIC_DEC(&m_bReady) == 0)
72b9787e
JB
196 {
197 m_inputQueueLock.acquire();
198 slicetypeDecide();
199 return true;
200 }
201 else
202 return false;
203}
204
205/* Called by rate-control to calculate the estimated SATD cost for a given
206 * picture. It assumes dpb->prepareEncode() has already been called for the
207 * picture and all the references are established */
208void Lookahead::getEstimatedPictureCost(Frame *curFrame)
209{
210 Lowres *frames[X265_LOOKAHEAD_MAX];
211
212 // POC distances to each reference
213 Slice *slice = curFrame->m_encData->m_slice;
214 int p0 = 0, p1, b;
215 int poc = slice->m_poc;
216 int l0poc = slice->m_refPOCList[0][0];
217 int l1poc = slice->m_refPOCList[1][0];
218
219 switch (slice->m_sliceType)
220 {
221 case I_SLICE:
222 frames[p0] = &curFrame->m_lowres;
223 b = p1 = 0;
224 break;
225
226 case P_SLICE:
227 b = p1 = poc - l0poc;
228 frames[p0] = &slice->m_refPicList[0][0]->m_lowres;
229 frames[b] = &curFrame->m_lowres;
230 break;
231
232 case B_SLICE:
233 b = poc - l0poc;
234 p1 = b + l1poc - poc;
235 frames[p0] = &slice->m_refPicList[0][0]->m_lowres;
236 frames[b] = &curFrame->m_lowres;
237 frames[p1] = &slice->m_refPicList[1][0]->m_lowres;
238 break;
239
240 default:
241 return;
242 }
243
244 if (m_param->rc.cuTree && !m_param->rc.bStatRead)
245 /* update row satds based on cutree offsets */
246 curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b);
247 else if (m_param->rc.aqMode)
248 curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b];
249 else
250 curFrame->m_lowres.satdCost = curFrame->m_lowres.costEst[b - p0][p1 - b];
251
252 if (m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate)
253 {
254 /* aggregate lowres row satds to CTU resolution */
255 curFrame->m_lowres.lowresCostForRc = curFrame->m_lowres.lowresCosts[b - p0][p1 - b];
256 uint32_t lowresRow = 0, lowresCol = 0, lowresCuIdx = 0, sum = 0;
257 uint32_t scale = m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE);
258 uint32_t numCuInHeight = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
259 uint32_t widthInLowresCu = (uint32_t)m_widthInCU, heightInLowresCu = (uint32_t)m_heightInCU;
260 double *qp_offset = 0;
261 /* Factor in qpoffsets based on Aq/Cutree in CU costs */
262 if (m_param->rc.aqMode)
263 qp_offset = (frames[b]->sliceType == X265_TYPE_B || !m_param->rc.cuTree) ? frames[b]->qpAqOffset : frames[b]->qpCuTreeOffset;
264
265 for (uint32_t row = 0; row < numCuInHeight; row++)
266 {
267 lowresRow = row * scale;
268 for (uint32_t cnt = 0; cnt < scale && lowresRow < heightInLowresCu; lowresRow++, cnt++)
269 {
270 sum = 0;
271 lowresCuIdx = lowresRow * widthInLowresCu;
272 for (lowresCol = 0; lowresCol < widthInLowresCu; lowresCol++, lowresCuIdx++)
273 {
274 uint16_t lowresCuCost = curFrame->m_lowres.lowresCostForRc[lowresCuIdx] & LOWRES_COST_MASK;
275 if (qp_offset)
276 {
277 lowresCuCost = (uint16_t)((lowresCuCost * x265_exp2fix8(qp_offset[lowresCuIdx]) + 128) >> 8);
278 int32_t intraCuCost = curFrame->m_lowres.intraCost[lowresCuIdx];
279 curFrame->m_lowres.intraCost[lowresCuIdx] = (intraCuCost * x265_exp2fix8(qp_offset[lowresCuIdx]) + 128) >> 8;
280 }
281 curFrame->m_lowres.lowresCostForRc[lowresCuIdx] = lowresCuCost;
282 sum += lowresCuCost;
283 }
284 curFrame->m_encData->m_rowStat[row].satdForVbv += sum;
285 }
286 }
287 }
288}
289
290/* called by API thread or worker thread with inputQueueLock acquired */
291void Lookahead::slicetypeDecide()
292{
b53f7c52
JB
293 ProfileScopeEvent(slicetypeDecideEV);
294
72b9787e
JB
295 ScopedLock lock(m_decideLock);
296
297 Lowres *frames[X265_LOOKAHEAD_MAX];
298 Frame *list[X265_LOOKAHEAD_MAX];
299 int maxSearch = X265_MIN(m_param->lookaheadDepth, X265_LOOKAHEAD_MAX);
300
301 memset(frames, 0, sizeof(frames));
302 memset(list, 0, sizeof(list));
303 {
304 Frame *curFrame = m_inputQueue.first();
305 int j;
306 for (j = 0; j < m_param->bframes + 2; j++)
307 {
308 if (!curFrame) break;
309 list[j] = curFrame;
310 curFrame = curFrame->m_next;
311 }
312
313 curFrame = m_inputQueue.first();
314 frames[0] = m_lastNonB;
315 for (j = 0; j < maxSearch; j++)
316 {
317 if (!curFrame) break;
318 frames[j + 1] = &curFrame->m_lowres;
319 curFrame = curFrame->m_next;
320 }
321
322 maxSearch = j;
323 }
324
325 m_inputQueueLock.release();
326
327 if (!m_est.m_rows && list[0])
328 m_est.init(m_param, list[0]);
329
330 if (m_lastNonB && !m_param->rc.bStatRead &&
331 ((m_param->bFrameAdaptive && m_param->bframes) ||
332 m_param->rc.cuTree || m_param->scenecutThreshold ||
333 (m_param->lookaheadDepth && m_param->rc.vbvBufferSize)))
334 {
335 slicetypeAnalyse(frames, false);
336 }
337
338 int bframes, brefs;
339 for (bframes = 0, brefs = 0;; bframes++)
340 {
341 Lowres& frm = list[bframes]->m_lowres;
342
343 if (frm.sliceType == X265_TYPE_BREF && !m_param->bBPyramid && brefs == m_param->bBPyramid)
344 {
345 frm.sliceType = X265_TYPE_B;
346 x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid\n",
347 frm.frameNum);
348 }
349
350 /* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available.
351 smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it.*/
352 else if (frm.sliceType == X265_TYPE_BREF && m_param->bBPyramid && brefs &&
353 m_param->maxNumReferences <= (brefs + 3))
354 {
355 frm.sliceType = X265_TYPE_B;
356 x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid and %d reference frames\n",
357 frm.sliceType, m_param->maxNumReferences);
358 }
359
360 if ( /*(!param->intraRefresh || frm.frameNum == 0) && */ frm.frameNum - m_lastKeyframe >= m_param->keyframeMax)
361 {
362 if (frm.sliceType == X265_TYPE_AUTO || frm.sliceType == X265_TYPE_I)
363 frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR;
364 bool warn = frm.sliceType != X265_TYPE_IDR;
365 if (warn && m_param->bOpenGOP)
366 warn &= frm.sliceType != X265_TYPE_I;
367 if (warn)
368 {
369 x265_log(m_param, X265_LOG_WARNING, "specified frame type (%d) at %d is not compatible with keyframe interval\n",
370 frm.sliceType, frm.frameNum);
371 frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR;
372 }
373 }
374 if (frm.sliceType == X265_TYPE_I && frm.frameNum - m_lastKeyframe >= m_param->keyframeMin)
375 {
376 if (m_param->bOpenGOP)
377 {
378 m_lastKeyframe = frm.frameNum;
379 frm.bKeyframe = true;
380 }
381 else
382 frm.sliceType = X265_TYPE_IDR;
383 }
384 if (frm.sliceType == X265_TYPE_IDR)
385 {
386 /* Closed GOP */
387 m_lastKeyframe = frm.frameNum;
388 frm.bKeyframe = true;
389 if (bframes > 0)
390 {
391 list[bframes - 1]->m_lowres.sliceType = X265_TYPE_P;
392 bframes--;
393 }
394 }
395 if (bframes == m_param->bframes || !list[bframes + 1])
396 {
397 if (IS_X265_TYPE_B(frm.sliceType))
398 x265_log(m_param, X265_LOG_WARNING, "specified frame type is not compatible with max B-frames\n");
399 if (frm.sliceType == X265_TYPE_AUTO || IS_X265_TYPE_B(frm.sliceType))
400 frm.sliceType = X265_TYPE_P;
401 }
402 if (frm.sliceType == X265_TYPE_BREF)
403 brefs++;
404 if (frm.sliceType == X265_TYPE_AUTO)
405 frm.sliceType = X265_TYPE_B;
406 else if (!IS_X265_TYPE_B(frm.sliceType))
407 break;
408 }
409
410 if (bframes)
411 list[bframes - 1]->m_lowres.bLastMiniGopBFrame = true;
412 list[bframes]->m_lowres.leadingBframes = bframes;
413 m_lastNonB = &list[bframes]->m_lowres;
414 m_histogram[bframes]++;
415
416 /* insert a bref into the sequence */
417 if (m_param->bBPyramid && bframes > 1 && !brefs)
418 {
419 list[bframes / 2]->m_lowres.sliceType = X265_TYPE_BREF;
420 brefs++;
421 }
72b9787e
JB
422 /* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */
423 if (m_param->rc.rateControlMode != X265_RC_CQP)
424 {
425 int p0, p1, b;
426 /* For zero latency tuning, calculate frame cost to be used later in RC */
427 if (!maxSearch)
428 {
429 for (int i = 0; i <= bframes; i++)
430 frames[i + 1] = &list[i]->m_lowres;
431 }
432
433 /* estimate new non-B cost */
434 p1 = b = bframes + 1;
435 p0 = (IS_X265_TYPE_I(frames[bframes + 1]->sliceType)) ? b : 0;
436 m_est.estimateFrameCost(frames, p0, p1, b, 0);
437
438 if (bframes)
439 {
440 p0 = 0; // last nonb
441 for (b = 1; b <= bframes; b++)
442 {
443 if (frames[b]->sliceType == X265_TYPE_B)
444 for (p1 = b; frames[p1]->sliceType == X265_TYPE_B; p1++)
445 ; // find new nonb or bref
446 else
447 p1 = bframes + 1;
448
449 m_est.estimateFrameCost(frames, p0, p1, b, 0);
450
451 if (frames[b]->sliceType == X265_TYPE_BREF)
452 p0 = b;
453 }
454 }
455 }
456
457 m_inputQueueLock.acquire();
458
459 /* dequeue all frames from inputQueue that are about to be enqueued
460 * in the output queue. The order is important because Frame can
461 * only be in one list at a time */
462 int64_t pts[X265_BFRAME_MAX + 1];
463 for (int i = 0; i <= bframes; i++)
464 {
465 Frame *curFrame;
466 curFrame = m_inputQueue.popFront();
467 pts[i] = curFrame->m_pts;
468 maxSearch--;
469 }
470
471 m_inputQueueLock.release();
472
473 m_outputQueueLock.acquire();
474 /* add non-B to output queue */
475 int idx = 0;
476 list[bframes]->m_reorderedPts = pts[idx++];
477 m_outputQueue.pushBack(*list[bframes]);
478
479 /* Add B-ref frame next to P frame in output queue, the B-ref encode before non B-ref frame */
480 if (bframes > 1 && m_param->bBPyramid)
481 {
482 for (int i = 0; i < bframes; i++)
483 {
484 if (list[i]->m_lowres.sliceType == X265_TYPE_BREF)
485 {
486 list[i]->m_reorderedPts = pts[idx++];
487 m_outputQueue.pushBack(*list[i]);
488 }
489 }
490 }
491
492 /* add B frames to output queue */
493 for (int i = 0; i < bframes; i++)
494 {
495 /* push all the B frames into output queue except B-ref, which already pushed into output queue*/
496 if (list[i]->m_lowres.sliceType != X265_TYPE_BREF)
497 {
498 list[i]->m_reorderedPts = pts[idx++];
499 m_outputQueue.pushBack(*list[i]);
500 }
501 }
502
503 bool isKeyFrameAnalyse = (m_param->rc.cuTree || (m_param->rc.vbvBufferSize && m_param->lookaheadDepth)) && !m_param->rc.bStatRead;
504 if (isKeyFrameAnalyse && IS_X265_TYPE_I(m_lastNonB->sliceType))
505 {
506 m_inputQueueLock.acquire();
507 Frame *curFrame = m_inputQueue.first();
508 frames[0] = m_lastNonB;
509 int j;
510 for (j = 0; j < maxSearch; j++)
511 {
512 frames[j + 1] = &curFrame->m_lowres;
513 curFrame = curFrame->m_next;
514 }
515
516 frames[j + 1] = NULL;
517 m_inputQueueLock.release();
518 slicetypeAnalyse(frames, true);
519 }
520
521 m_outputQueueLock.release();
522 m_outputAvailable.trigger();
523}
524
525void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe)
526{
527 int prevNonB = 0, curNonB = 1, idx = 0;
72b9787e
JB
528 while (curNonB < numFrames && frames[curNonB]->sliceType == X265_TYPE_B)
529 curNonB++;
72b9787e 530 int nextNonB = keyframe ? prevNonB : curNonB;
b53f7c52
JB
531 int nextB = prevNonB + 1;
532 int nextBRef = 0;
533 int miniGopEnd = keyframe ? prevNonB : curNonB;
72b9787e
JB
534 while (curNonB < numFrames + !keyframe)
535 {
536 /* P/I cost: This shouldn't include the cost of nextNonB */
537 if (nextNonB != curNonB)
538 {
539 int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB;
540 frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, p0, curNonB, curNonB);
541 frames[nextNonB]->plannedType[idx] = frames[curNonB]->sliceType;
b53f7c52
JB
542 /* Save the nextNonB Cost in each B frame of the current miniGop */
543 if (curNonB > miniGopEnd)
544 {
545 for (int j = nextB; j < miniGopEnd; j++)
546 {
547 frames[j]->plannedSatd[frames[j]->indB] = frames[nextNonB]->plannedSatd[idx];
548 frames[j]->plannedType[frames[j]->indB++] = frames[nextNonB]->plannedType[idx];
549
550 }
551 }
72b9787e
JB
552 idx++;
553 }
554 /* Handle the B-frames: coded order */
b53f7c52
JB
555 if (m_param->bBPyramid && curNonB - prevNonB > 1)
556 nextBRef = (prevNonB + curNonB + 1) / 2;
72b9787e 557
b53f7c52 558 for (int i = prevNonB + 1; i < curNonB; i++, idx++)
72b9787e 559 {
b53f7c52
JB
560 int64_t satdCost = 0; int type = X265_TYPE_B;
561 if (nextBRef)
72b9787e 562 {
b53f7c52 563 if (i == nextBRef)
72b9787e 564 {
b53f7c52
JB
565 satdCost = vbvFrameCost(frames, prevNonB, curNonB, nextBRef);
566 type = X265_TYPE_BREF;
72b9787e 567 }
b53f7c52
JB
568 else if (i < nextBRef)
569 satdCost = vbvFrameCost(frames, prevNonB, nextBRef, i);
72b9787e 570 else
b53f7c52 571 satdCost = vbvFrameCost(frames, nextBRef, curNonB, i);
72b9787e 572 }
b53f7c52
JB
573 else
574 satdCost = vbvFrameCost(frames, prevNonB, nextNonB, i);
575 frames[nextNonB]->plannedSatd[idx] = satdCost;
576 frames[nextNonB]->plannedType[idx] = type;
577 /* Save the nextB Cost in each B frame of the current miniGop */
72b9787e 578
b53f7c52
JB
579 for (int j = nextB; j < miniGopEnd; j++)
580 {
581 if (nextBRef && i == nextBRef)
582 break;
583 if (j >= i && j !=nextBRef)
584 continue;
585 frames[j]->plannedSatd[frames[j]->indB] = satdCost;
586 frames[j]->plannedType[frames[j]->indB++] = X265_TYPE_B;
587 }
588 }
72b9787e
JB
589 prevNonB = curNonB;
590 curNonB++;
591 while (curNonB <= numFrames && frames[curNonB]->sliceType == X265_TYPE_B)
592 curNonB++;
593 }
594
595 frames[nextNonB]->plannedType[idx] = X265_TYPE_AUTO;
596}
597
598int64_t Lookahead::vbvFrameCost(Lowres **frames, int p0, int p1, int b)
599{
600 int64_t cost = m_est.estimateFrameCost(frames, p0, p1, b, 0);
601
602 if (m_param->rc.aqMode)
603 {
604 if (m_param->rc.cuTree)
605 return frameCostRecalculate(frames, p0, p1, b);
606 else
607 return frames[b]->costEstAq[b - p0][p1 - b];
608 }
609 return cost;
610}
611
612void Lookahead::slicetypeAnalyse(Lowres **frames, bool bKeyframe)
613{
614 int numFrames, origNumFrames, keyintLimit, framecnt;
615 int maxSearch = X265_MIN(m_param->lookaheadDepth, X265_LOOKAHEAD_MAX);
616 int cuCount = NUM_CUS;
617 int resetStart;
618 bool bIsVbvLookahead = m_param->rc.vbvBufferSize && m_param->lookaheadDepth;
619
620 /* count undecided frames */
621 for (framecnt = 0; framecnt < maxSearch; framecnt++)
622 {
623 Lowres *fenc = frames[framecnt + 1];
624 if (!fenc || fenc->sliceType != X265_TYPE_AUTO)
625 break;
626 }
627
628 if (!framecnt)
629 {
630 if (m_param->rc.cuTree)
631 cuTree(frames, 0, bKeyframe);
632 return;
633 }
634
635 frames[framecnt + 1] = NULL;
636
637 keyintLimit = m_param->keyframeMax - frames[0]->frameNum + m_lastKeyframe - 1;
638 origNumFrames = numFrames = X265_MIN(framecnt, keyintLimit);
639
640 if (bIsVbvLookahead)
641 numFrames = framecnt;
642 else if (m_param->bOpenGOP && numFrames < framecnt)
643 numFrames++;
644 else if (numFrames == 0)
645 {
646 frames[1]->sliceType = X265_TYPE_I;
647 return;
648 }
649
650 int numBFrames = 0;
651 int numAnalyzed = numFrames;
652 if (m_param->scenecutThreshold && scenecut(frames, 0, 1, true, origNumFrames, maxSearch))
653 {
654 frames[1]->sliceType = X265_TYPE_I;
655 return;
656 }
657
658 if (m_param->bframes)
659 {
660 if (m_param->bFrameAdaptive == X265_B_ADAPT_TRELLIS)
661 {
662 if (numFrames > 1)
663 {
664 char best_paths[X265_BFRAME_MAX + 1][X265_LOOKAHEAD_MAX + 1] = { "", "P" };
665 int best_path_index = numFrames % (X265_BFRAME_MAX + 1);
666
667 /* Perform the frametype analysis. */
668 for (int j = 2; j <= numFrames; j++)
669 {
670 slicetypePath(frames, j, best_paths);
671 }
672
673 numBFrames = (int)strspn(best_paths[best_path_index], "B");
674
675 /* Load the results of the analysis into the frame types. */
676 for (int j = 1; j < numFrames; j++)
677 {
678 frames[j]->sliceType = best_paths[best_path_index][j - 1] == 'B' ? X265_TYPE_B : X265_TYPE_P;
679 }
680 }
681 frames[numFrames]->sliceType = X265_TYPE_P;
682 }
683 else if (m_param->bFrameAdaptive == X265_B_ADAPT_FAST)
684 {
685 int64_t cost1p0, cost2p0, cost1b1, cost2p1;
686
687 for (int i = 0; i <= numFrames - 2; )
688 {
689 cost2p1 = m_est.estimateFrameCost(frames, i + 0, i + 2, i + 2, 1);
690 if (frames[i + 2]->intraMbs[2] > cuCount / 2)
691 {
692 frames[i + 1]->sliceType = X265_TYPE_P;
693 frames[i + 2]->sliceType = X265_TYPE_P;
694 i += 2;
695 continue;
696 }
697
698 cost1b1 = m_est.estimateFrameCost(frames, i + 0, i + 2, i + 1, 0);
699 cost1p0 = m_est.estimateFrameCost(frames, i + 0, i + 1, i + 1, 0);
700 cost2p0 = m_est.estimateFrameCost(frames, i + 1, i + 2, i + 2, 0);
701
702 if (cost1p0 + cost2p0 < cost1b1 + cost2p1)
703 {
704 frames[i + 1]->sliceType = X265_TYPE_P;
705 i += 1;
706 continue;
707 }
708
709// arbitrary and untuned
710#define INTER_THRESH 300
711#define P_SENS_BIAS (50 - m_param->bFrameBias)
712 frames[i + 1]->sliceType = X265_TYPE_B;
713
714 int j;
715 for (j = i + 2; j <= X265_MIN(i + m_param->bframes, numFrames - 1); j++)
716 {
717 int64_t pthresh = X265_MAX(INTER_THRESH - P_SENS_BIAS * (j - i - 1), INTER_THRESH / 10);
718 int64_t pcost = m_est.estimateFrameCost(frames, i + 0, j + 1, j + 1, 1);
719 if (pcost > pthresh * cuCount || frames[j + 1]->intraMbs[j - i + 1] > cuCount / 3)
720 break;
721 frames[j]->sliceType = X265_TYPE_B;
722 }
723
724 frames[j]->sliceType = X265_TYPE_P;
725 i = j;
726 }
727 frames[numFrames]->sliceType = X265_TYPE_P;
728 numBFrames = 0;
729 while (numBFrames < numFrames && frames[numBFrames + 1]->sliceType == X265_TYPE_B)
730 {
731 numBFrames++;
732 }
733 }
734 else
735 {
736 numBFrames = X265_MIN(numFrames - 1, m_param->bframes);
737 for (int j = 1; j < numFrames; j++)
738 {
739 frames[j]->sliceType = (j % (numBFrames + 1)) ? X265_TYPE_B : X265_TYPE_P;
740 }
741
742 frames[numFrames]->sliceType = X265_TYPE_P;
743 }
744 /* Check scenecut on the first minigop. */
745 for (int j = 1; j < numBFrames + 1; j++)
746 {
747 if (m_param->scenecutThreshold && scenecut(frames, j, j + 1, false, origNumFrames, maxSearch))
748 {
749 frames[j]->sliceType = X265_TYPE_P;
750 numAnalyzed = j;
751 break;
752 }
753 }
754
755 resetStart = bKeyframe ? 1 : X265_MIN(numBFrames + 2, numAnalyzed + 1);
756 }
757 else
758 {
759 for (int j = 1; j <= numFrames; j++)
760 {
761 frames[j]->sliceType = X265_TYPE_P;
762 }
763
764 resetStart = bKeyframe ? 1 : 2;
765 }
766
767 if (m_param->rc.cuTree)
768 cuTree(frames, X265_MIN(numFrames, m_param->keyframeMax), bKeyframe);
769
770 // if (!param->bIntraRefresh)
771 for (int j = keyintLimit + 1; j <= numFrames; j += m_param->keyframeMax)
772 {
773 frames[j]->sliceType = X265_TYPE_I;
774 resetStart = X265_MIN(resetStart, j + 1);
775 }
776
777 if (bIsVbvLookahead)
778 vbvLookahead(frames, numFrames, bKeyframe);
779
780 /* Restore frametypes for all frames that haven't actually been decided yet. */
781 for (int j = resetStart; j <= numFrames; j++)
782 {
783 frames[j]->sliceType = X265_TYPE_AUTO;
784 }
785}
786
787bool Lookahead::scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames, int maxSearch)
788{
789 /* Only do analysis during a normal scenecut check. */
790 if (bRealScenecut && m_param->bframes)
791 {
792 int origmaxp1 = p0 + 1;
793 /* Look ahead to avoid coding short flashes as scenecuts. */
794 if (m_param->bFrameAdaptive == X265_B_ADAPT_TRELLIS)
795 /* Don't analyse any more frames than the trellis would have covered. */
796 origmaxp1 += m_param->bframes;
797 else
798 origmaxp1++;
799 int maxp1 = X265_MIN(origmaxp1, numFrames);
800
801 /* Where A and B are scenes: AAAAAABBBAAAAAA
802 * If BBB is shorter than (maxp1-p0), it is detected as a flash
803 * and not considered a scenecut. */
804 for (int cp1 = p1; cp1 <= maxp1; cp1++)
805 {
806 if (!scenecutInternal(frames, p0, cp1, false))
807 /* Any frame in between p0 and cur_p1 cannot be a real scenecut. */
808 for (int i = cp1; i > p0; i--)
809 {
810 frames[i]->bScenecut = false;
811 }
812 }
813
814 /* Where A-F are scenes: AAAAABBCCDDEEFFFFFF
815 * If each of BB ... EE are shorter than (maxp1-p0), they are
816 * detected as flashes and not considered scenecuts.
817 * Instead, the first F frame becomes a scenecut.
818 * If the video ends before F, no frame becomes a scenecut. */
819 for (int cp0 = p0; cp0 <= maxp1; cp0++)
820 {
821 if (origmaxp1 > maxSearch || (cp0 < maxp1 && scenecutInternal(frames, cp0, maxp1, false)))
822 /* If cur_p0 is the p0 of a scenecut, it cannot be the p1 of a scenecut. */
823 frames[cp0]->bScenecut = false;
824 }
825 }
826
827 /* Ignore frames that are part of a flash, i.e. cannot be real scenecuts. */
828 if (!frames[p1]->bScenecut)
829 return false;
830 return scenecutInternal(frames, p0, p1, bRealScenecut);
831}
832
833bool Lookahead::scenecutInternal(Lowres **frames, int p0, int p1, bool bRealScenecut)
834{
835 Lowres *frame = frames[p1];
836
837 m_est.estimateFrameCost(frames, p0, p1, p1, 0);
838
839 int64_t icost = frame->costEst[0][0];
840 int64_t pcost = frame->costEst[p1 - p0][0];
841 int gopSize = frame->frameNum - m_lastKeyframe;
842 float threshMax = (float)(m_param->scenecutThreshold / 100.0);
843
844 /* magic numbers pulled out of thin air */
845 float threshMin = (float)(threshMax * 0.25);
846 float bias;
847
848 if (m_param->keyframeMin == m_param->keyframeMax)
849 threshMin = threshMax;
850 if (gopSize <= m_param->keyframeMin / 4)
851 bias = threshMin / 4;
852 else if (gopSize <= m_param->keyframeMin)
853 bias = threshMin * gopSize / m_param->keyframeMin;
854 else
855 {
856 bias = threshMin
857 + (threshMax - threshMin)
858 * (gopSize - m_param->keyframeMin)
859 / (m_param->keyframeMax - m_param->keyframeMin);
860 }
861
862 bool res = pcost >= (1.0 - bias) * icost;
863 if (res && bRealScenecut)
864 {
865 int imb = frame->intraMbs[p1 - p0];
866 int pmb = NUM_CUS - imb;
867 x265_log(m_param, X265_LOG_DEBUG, "scene cut at %d Icost:%d Pcost:%d ratio:%.4f bias:%.4f gop:%d (imb:%d pmb:%d)\n",
868 frame->frameNum, icost, pcost, 1. - (double)pcost / icost, bias, gopSize, imb, pmb);
869 }
870 return res;
871}
872
873void Lookahead::slicetypePath(Lowres **frames, int length, char(*best_paths)[X265_LOOKAHEAD_MAX + 1])
874{
875 char paths[2][X265_LOOKAHEAD_MAX + 1];
876 int num_paths = X265_MIN(m_param->bframes + 1, length);
877 int64_t best_cost = 1LL << 62;
878 int idx = 0;
879
880 /* Iterate over all currently possible paths */
881 for (int path = 0; path < num_paths; path++)
882 {
883 /* Add suffixes to the current path */
884 int len = length - (path + 1);
885 memcpy(paths[idx], best_paths[len % (X265_BFRAME_MAX + 1)], len);
886 memset(paths[idx] + len, 'B', path);
887 strcpy(paths[idx] + len + path, "P");
888
889 /* Calculate the actual cost of the current path */
890 int64_t cost = slicetypePathCost(frames, paths[idx], best_cost);
891 if (cost < best_cost)
892 {
893 best_cost = cost;
894 idx ^= 1;
895 }
896 }
897
898 /* Store the best path. */
899 memcpy(best_paths[length % (X265_BFRAME_MAX + 1)], paths[idx ^ 1], length);
900}
901
902int64_t Lookahead::slicetypePathCost(Lowres **frames, char *path, int64_t threshold)
903{
904 int64_t cost = 0;
905 int loc = 1;
906 int cur_p = 0;
907
908 path--; /* Since the 1st path element is really the second frame */
909 while (path[loc])
910 {
911 int next_p = loc;
912 /* Find the location of the next P-frame. */
913 while (path[next_p] != 'P')
914 {
915 next_p++;
916 }
917
918 /* Add the cost of the P-frame found above */
919 cost += m_est.estimateFrameCost(frames, cur_p, next_p, next_p, 0);
920 /* Early terminate if the cost we have found is larger than the best path cost so far */
921 if (cost > threshold)
922 break;
923
924 if (m_param->bBPyramid && next_p - cur_p > 2)
925 {
926 int middle = cur_p + (next_p - cur_p) / 2;
927 cost += m_est.estimateFrameCost(frames, cur_p, next_p, middle, 0);
928 for (int next_b = loc; next_b < middle && cost < threshold; next_b++)
929 {
930 cost += m_est.estimateFrameCost(frames, cur_p, middle, next_b, 0);
931 }
932
933 for (int next_b = middle + 1; next_b < next_p && cost < threshold; next_b++)
934 {
935 cost += m_est.estimateFrameCost(frames, middle, next_p, next_b, 0);
936 }
937 }
938 else
939 {
940 for (int next_b = loc; next_b < next_p && cost < threshold; next_b++)
941 {
942 cost += m_est.estimateFrameCost(frames, cur_p, next_p, next_b, 0);
943 }
944 }
945
946 loc = next_p + 1;
947 cur_p = next_p;
948 }
949
950 return cost;
951}
952
953void Lookahead::cuTree(Lowres **frames, int numframes, bool bIntra)
954{
955 int idx = !bIntra;
956 int lastnonb, curnonb = 1;
957 int bframes = 0;
958
959 x265_emms();
960 double totalDuration = 0.0;
961 for (int j = 0; j <= numframes; j++)
962 totalDuration += (double)m_param->fpsDenom / m_param->fpsNum;
963
964 double averageDuration = totalDuration / (numframes + 1);
965
966 int i = numframes;
967 int cuCount = m_widthInCU * m_heightInCU;
968
969 if (bIntra)
970 m_est.estimateFrameCost(frames, 0, 0, 0, 0);
971
972 while (i > 0 && frames[i]->sliceType == X265_TYPE_B)
973 i--;
974
975 lastnonb = i;
976
977 /* Lookaheadless MB-tree is not a theoretically distinct case; the same extrapolation could
978 * be applied to the end of a lookahead buffer of any size. However, it's most needed when
979 * lookahead=0, so that's what's currently implemented. */
980 if (!m_param->lookaheadDepth)
981 {
982 if (bIntra)
983 {
984 memset(frames[0]->propagateCost, 0, cuCount * sizeof(uint16_t));
985 memcpy(frames[0]->qpCuTreeOffset, frames[0]->qpAqOffset, cuCount * sizeof(double));
986 return;
987 }
988 std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost);
989 memset(frames[0]->propagateCost, 0, cuCount * sizeof(uint16_t));
990 }
991 else
992 {
993 if (lastnonb < idx)
994 return;
995 memset(frames[lastnonb]->propagateCost, 0, cuCount * sizeof(uint16_t));
996 }
997
998 while (i-- > idx)
999 {
1000 curnonb = i;
1001 while (frames[curnonb]->sliceType == X265_TYPE_B && curnonb > 0)
1002 curnonb--;
1003
1004 if (curnonb < idx)
1005 break;
1006
1007 m_est.estimateFrameCost(frames, curnonb, lastnonb, lastnonb, 0);
1008 memset(frames[curnonb]->propagateCost, 0, cuCount * sizeof(uint16_t));
1009 bframes = lastnonb - curnonb - 1;
1010 if (m_param->bBPyramid && bframes > 1)
1011 {
1012 int middle = (bframes + 1) / 2 + curnonb;
1013 m_est.estimateFrameCost(frames, curnonb, lastnonb, middle, 0);
1014 memset(frames[middle]->propagateCost, 0, cuCount * sizeof(uint16_t));
1015 while (i > curnonb)
1016 {
1017 int p0 = i > middle ? middle : curnonb;
1018 int p1 = i < middle ? middle : lastnonb;
1019 if (i != middle)
1020 {
1021 m_est.estimateFrameCost(frames, p0, p1, i, 0);
1022 estimateCUPropagate(frames, averageDuration, p0, p1, i, 0);
1023 }
1024 i--;
1025 }
1026
1027 estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, middle, 1);
1028 }
1029 else
1030 {
1031 while (i > curnonb)
1032 {
1033 m_est.estimateFrameCost(frames, curnonb, lastnonb, i, 0);
1034 estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, i, 0);
1035 i--;
1036 }
1037 }
1038 estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, lastnonb, 1);
1039 lastnonb = curnonb;
1040 }
1041
1042 if (!m_param->lookaheadDepth)
1043 {
1044 m_est.estimateFrameCost(frames, 0, lastnonb, lastnonb, 0);
1045 estimateCUPropagate(frames, averageDuration, 0, lastnonb, lastnonb, 1);
1046 std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost);
1047 }
1048
1049 cuTreeFinish(frames[lastnonb], averageDuration, lastnonb);
1050 if (m_param->bBPyramid && bframes > 1 && !m_param->rc.vbvBufferSize)
1051 cuTreeFinish(frames[lastnonb + (bframes + 1) / 2], averageDuration, 0);
1052}
1053
1054void Lookahead::estimateCUPropagate(Lowres **frames, double averageDuration, int p0, int p1, int b, int referenced)
1055{
1056 uint16_t *refCosts[2] = { frames[p0]->propagateCost, frames[p1]->propagateCost };
1057 int32_t distScaleFactor = (((b - p0) << 8) + ((p1 - p0) >> 1)) / (p1 - p0);
1058 int32_t bipredWeight = m_param->bEnableWeightedBiPred ? 64 - (distScaleFactor >> 2) : 32;
1059 MV *mvs[2] = { frames[b]->lowresMvs[0][b - p0 - 1], frames[b]->lowresMvs[1][p1 - b - 1] };
1060 int32_t bipredWeights[2] = { bipredWeight, 64 - bipredWeight };
1061
1062 memset(m_scratch, 0, m_widthInCU * sizeof(int));
1063
1064 uint16_t *propagateCost = frames[b]->propagateCost;
1065
1066 x265_emms();
1067 double fpsFactor = CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) / CLIP_DURATION(averageDuration);
1068
1069 /* For non-refferd frames the source costs are always zero, so just memset one row and re-use it. */
1070 if (!referenced)
1071 memset(frames[b]->propagateCost, 0, m_widthInCU * sizeof(uint16_t));
1072
1073 int32_t StrideInCU = m_widthInCU;
1074 for (uint16_t blocky = 0; blocky < m_heightInCU; blocky++)
1075 {
1076 int cuIndex = blocky * StrideInCU;
1077 primitives.propagateCost(m_scratch, propagateCost,
1078 frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
1079 frames[b]->invQscaleFactor + cuIndex, &fpsFactor, m_widthInCU);
1080
1081 if (referenced)
1082 propagateCost += m_widthInCU;
1083 for (uint16_t blockx = 0; blockx < m_widthInCU; blockx++, cuIndex++)
1084 {
1085 int32_t propagate_amount = m_scratch[blockx];
1086 /* Don't propagate for an intra block. */
1087 if (propagate_amount > 0)
1088 {
1089 /* Access width-2 bitfield. */
1090 int32_t lists_used = frames[b]->lowresCosts[b - p0][p1 - b][cuIndex] >> LOWRES_COST_SHIFT;
1091 /* Follow the MVs to the previous frame(s). */
1092 for (uint16_t list = 0; list < 2; list++)
1093 {
1094 if ((lists_used >> list) & 1)
1095 {
1096#define CLIP_ADD(s, x) (s) = (uint16_t)X265_MIN((s) + (x), (1 << 16) - 1)
1097 int32_t listamount = propagate_amount;
1098 /* Apply bipred weighting. */
1099 if (lists_used == 3)
1100 listamount = (listamount * bipredWeights[list] + 32) >> 6;
1101
1102 /* Early termination for simple case of mv0. */
1103 if (!mvs[list][cuIndex].word)
1104 {
1105 CLIP_ADD(refCosts[list][cuIndex], listamount);
1106 continue;
1107 }
1108
1109 int32_t x = mvs[list][cuIndex].x;
1110 int32_t y = mvs[list][cuIndex].y;
1111 int32_t cux = (x >> 5) + blockx;
1112 int32_t cuy = (y >> 5) + blocky;
1113 int32_t idx0 = cux + cuy * StrideInCU;
1114 int32_t idx1 = idx0 + 1;
1115 int32_t idx2 = idx0 + StrideInCU;
1116 int32_t idx3 = idx0 + StrideInCU + 1;
1117 x &= 31;
1118 y &= 31;
1119 int32_t idx0weight = (32 - y) * (32 - x);
1120 int32_t idx1weight = (32 - y) * x;
1121 int32_t idx2weight = y * (32 - x);
1122 int32_t idx3weight = y * x;
1123
1124 /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't
1125 * be counted. */
1126 if (cux < m_widthInCU - 1 && cuy < m_heightInCU - 1 && cux >= 0 && cuy >= 0)
1127 {
1128 CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10);
1129 CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10);
1130 CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10);
1131 CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10);
1132 }
1133 else /* Check offsets individually */
1134 {
1135 if (cux < m_widthInCU && cuy < m_heightInCU && cux >= 0 && cuy >= 0)
1136 CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10);
1137 if (cux + 1 < m_widthInCU && cuy < m_heightInCU && cux + 1 >= 0 && cuy >= 0)
1138 CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10);
1139 if (cux < m_widthInCU && cuy + 1 < m_heightInCU && cux >= 0 && cuy + 1 >= 0)
1140 CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10);
1141 if (cux + 1 < m_widthInCU && cuy + 1 < m_heightInCU && cux + 1 >= 0 && cuy + 1 >= 0)
1142 CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10);
1143 }
1144 }
1145 }
1146 }
1147 }
1148 }
1149
1150 if (m_param->rc.vbvBufferSize && m_param->lookaheadDepth && referenced)
1151 cuTreeFinish(frames[b], averageDuration, b == p1 ? b - p0 : 0);
1152}
1153
1154void Lookahead::cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance)
1155{
1156 int fpsFactor = (int)(CLIP_DURATION(averageDuration) / CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) * 256);
1157 double weightdelta = 0.0;
1158
1159 if (ref0Distance && frame->weightedCostDelta[ref0Distance - 1] > 0)
1160 weightdelta = (1.0 - frame->weightedCostDelta[ref0Distance - 1]);
1161
1162 /* Allow the strength to be adjusted via qcompress, since the two
1163 * concepts are very similar. */
1164
1165 int cuCount = m_widthInCU * m_heightInCU;
1166 double strength = 5.0 * (1.0 - m_param->rc.qCompress);
1167
1168 for (int cuIndex = 0; cuIndex < cuCount; cuIndex++)
1169 {
1170 int intracost = (frame->intraCost[cuIndex] * frame->invQscaleFactor[cuIndex] + 128) >> 8;
1171 if (intracost)
1172 {
1173 int propagateCost = (frame->propagateCost[cuIndex] * fpsFactor + 128) >> 8;
1174 double log2_ratio = X265_LOG2(intracost + propagateCost) - X265_LOG2(intracost) + weightdelta;
1175 frame->qpCuTreeOffset[cuIndex] = frame->qpAqOffset[cuIndex] - strength * log2_ratio;
1176 }
1177 }
1178}
1179
1180/* If MB-tree changes the quantizers, we need to recalculate the frame cost without
1181 * re-running lookahead. */
1182int64_t Lookahead::frameCostRecalculate(Lowres** frames, int p0, int p1, int b)
1183{
1184 int64_t score = 0;
1185 int *rowSatd = frames[b]->rowSatds[b - p0][p1 - b];
1186 double *qp_offset = (frames[b]->sliceType == X265_TYPE_B) ? frames[b]->qpAqOffset : frames[b]->qpCuTreeOffset;
1187
1188 x265_emms();
1189 for (int cuy = m_heightInCU - 1; cuy >= 0; cuy--)
1190 {
1191 rowSatd[cuy] = 0;
1192 for (int cux = m_widthInCU - 1; cux >= 0; cux--)
1193 {
1194 int cuxy = cux + cuy * m_widthInCU;
1195 int cuCost = frames[b]->lowresCosts[b - p0][p1 - b][cuxy] & LOWRES_COST_MASK;
1196 double qp_adj = qp_offset[cuxy];
1197 cuCost = (cuCost * x265_exp2fix8(qp_adj) + 128) >> 8;
1198 rowSatd[cuy] += cuCost;
1199 if ((cuy > 0 && cuy < m_heightInCU - 1 &&
1200 cux > 0 && cux < m_widthInCU - 1) ||
1201 m_widthInCU <= 2 || m_heightInCU <= 2)
1202 {
1203 score += cuCost;
1204 }
1205 }
1206 }
1207
1208 return score;
1209}
1210
1211CostEstimate::CostEstimate(ThreadPool *p)
1212 : WaveFront(p)
1213{
1214 m_param = NULL;
1215 m_curframes = NULL;
1216 m_wbuffer[0] = m_wbuffer[1] = m_wbuffer[2] = m_wbuffer[3] = 0;
1217 m_rows = NULL;
1218 m_paddedLines = m_widthInCU = m_heightInCU = 0;
1219 m_bDoSearch[0] = m_bDoSearch[1] = false;
1220 m_curb = m_curp0 = m_curp1 = 0;
1221 m_bFrameCompleted = false;
1222}
1223
1224CostEstimate::~CostEstimate()
1225{
1226 for (int i = 0; i < 4; i++)
1227 {
1228 x265_free(m_wbuffer[i]);
1229 }
1230
1231 delete[] m_rows;
1232}
1233
1234void CostEstimate::init(x265_param *_param, Frame *curFrame)
1235{
1236 m_param = _param;
1237 m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
1238 m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
1239
1240 m_rows = new EstimateRow[m_heightInCU];
1241 for (int i = 0; i < m_heightInCU; i++)
1242 {
1243 m_rows[i].m_widthInCU = m_widthInCU;
1244 m_rows[i].m_heightInCU = m_heightInCU;
1245 m_rows[i].m_param = m_param;
1246 }
1247
1248 if (WaveFront::init(m_heightInCU))
1249 WaveFront::enableAllRows();
1250 else
1251 m_pool = NULL;
1252
1253 if (m_param->bEnableWeightedPred)
1254 {
b53f7c52 1255 PicYuv *orig = curFrame->m_fencPic;
72b9787e
JB
1256 m_paddedLines = curFrame->m_lowres.lines + 2 * orig->m_lumaMarginY;
1257 intptr_t padoffset = curFrame->m_lowres.lumaStride * orig->m_lumaMarginY + orig->m_lumaMarginX;
1258
1259 /* allocate weighted lowres buffers */
1260 for (int i = 0; i < 4; i++)
1261 {
1262 m_wbuffer[i] = (pixel*)x265_malloc(sizeof(pixel) * (curFrame->m_lowres.lumaStride * m_paddedLines));
1263 m_weightedRef.lowresPlane[i] = m_wbuffer[i] + padoffset;
1264 }
1265
b53f7c52 1266 m_weightedRef.fpelPlane[0] = m_weightedRef.lowresPlane[0];
72b9787e
JB
1267 m_weightedRef.lumaStride = curFrame->m_lowres.lumaStride;
1268 m_weightedRef.isLowres = true;
1269 m_weightedRef.isWeighted = false;
1270 }
1271}
1272
1273int64_t CostEstimate::estimateFrameCost(Lowres **frames, int p0, int p1, int b, bool bIntraPenalty)
1274{
1275 int64_t score = 0;
1276 Lowres *fenc = frames[b];
1277
1278 if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1 - b][0] != -1)
1279 score = fenc->costEst[b - p0][p1 - b];
1280 else
1281 {
1282 m_weightedRef.isWeighted = false;
1283 if (m_param->bEnableWeightedPred && b == p1 && b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF)
1284 {
1285 if (!fenc->bIntraCalculated)
1286 estimateFrameCost(frames, b, b, b, 0);
1287 weightsAnalyse(frames, b, p0);
1288 }
1289
1290 /* For each list, check to see whether we have lowres motion-searched this reference */
1291 m_bDoSearch[0] = b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF;
1292 m_bDoSearch[1] = b != p1 && fenc->lowresMvs[1][p1 - b - 1][0].x == 0x7FFF;
1293
1294 if (m_bDoSearch[0]) fenc->lowresMvs[0][b - p0 - 1][0].x = 0;
1295 if (m_bDoSearch[1]) fenc->lowresMvs[1][p1 - b - 1][0].x = 0;
1296
1297 m_curb = b;
1298 m_curp0 = p0;
1299 m_curp1 = p1;
1300 m_curframes = frames;
1301 fenc->costEst[b - p0][p1 - b] = 0;
1302 fenc->costEstAq[b - p0][p1 - b] = 0;
1303
1304 for (int i = 0; i < m_heightInCU; i++)
1305 {
1306 m_rows[i].init();
72b9787e
JB
1307 if (!fenc->bIntraCalculated)
1308 fenc->rowSatds[0][0][i] = 0;
1309 fenc->rowSatds[b - p0][p1 - b][i] = 0;
1310 }
1311
1312 m_bFrameCompleted = false;
1313
1314 if (m_pool)
1315 {
1316 WaveFront::enqueue();
1317
1318 // enableAllRows must be already called
1319 enqueueRow(0);
1320 while (!m_bFrameCompleted)
1321 WaveFront::findJob(-1);
1322
1323 WaveFront::dequeue();
1324 }
1325 else
1326 {
1327 for (int row = 0; row < m_heightInCU; row++)
1328 processRow(row, -1);
1329
1330 x265_emms();
1331 }
1332
1333 // Accumulate cost from each row
1334 for (int row = 0; row < m_heightInCU; row++)
1335 {
1336 score += m_rows[row].m_costEst;
1337 fenc->costEst[0][0] += m_rows[row].m_costIntra;
1338 if (m_param->rc.aqMode)
1339 {
1340 fenc->costEstAq[0][0] += m_rows[row].m_costIntraAq;
1341 fenc->costEstAq[b - p0][p1 - b] += m_rows[row].m_costEstAq;
1342 }
1343 fenc->intraMbs[b - p0] += m_rows[row].m_intraMbs;
1344 }
1345
1346 fenc->bIntraCalculated = true;
1347
1348 if (b != p1)
1349 score = (uint64_t)score * 100 / (130 + m_param->bFrameBias);
1350 if (b != p0 || b != p1) //Not Intra cost
1351 fenc->costEst[b - p0][p1 - b] = score;
1352 }
1353
1354 if (bIntraPenalty)
1355 {
1356 // arbitrary penalty for I-blocks after B-frames
1357 int ncu = NUM_CUS;
1358 score += (uint64_t)score * fenc->intraMbs[b - p0] / (ncu * 8);
1359 }
1360 return score;
1361}
1362
1363uint32_t CostEstimate::weightCostLuma(Lowres **frames, int b, int p0, WeightParam *wp)
1364{
1365 Lowres *fenc = frames[b];
1366 Lowres *ref = frames[p0];
b53f7c52 1367 pixel *src = ref->fpelPlane[0];
72b9787e
JB
1368 intptr_t stride = fenc->lumaStride;
1369
1370 if (wp)
1371 {
1372 int offset = wp->inputOffset << (X265_DEPTH - 8);
1373 int scale = wp->inputWeight;
1374 int denom = wp->log2WeightDenom;
1375 int round = denom ? 1 << (denom - 1) : 0;
1376 int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
1377 int widthHeight = (int)stride;
1378
1379 primitives.weight_pp(ref->buffer[0], m_wbuffer[0], stride, widthHeight, m_paddedLines,
1380 scale, round << correction, denom + correction, offset);
b53f7c52 1381 src = m_weightedRef.fpelPlane[0];
72b9787e
JB
1382 }
1383
1384 uint32_t cost = 0;
1385 intptr_t pixoff = 0;
1386 int mb = 0;
1387
1388 for (int y = 0; y < fenc->lines; y += 8, pixoff = y * stride)
1389 {
1390 for (int x = 0; x < fenc->width; x += 8, mb++, pixoff += 8)
1391 {
b53f7c52 1392 int satd = primitives.satd[LUMA_8x8](src + pixoff, stride, fenc->fpelPlane[0] + pixoff, stride);
72b9787e
JB
1393 cost += X265_MIN(satd, fenc->intraCost[mb]);
1394 }
1395 }
1396
1397 return cost;
1398}
1399
1400void CostEstimate::weightsAnalyse(Lowres **frames, int b, int p0)
1401{
1402 static const float epsilon = 1.f / 128.f;
1403 Lowres *fenc, *ref;
1404
1405 fenc = frames[b];
1406 ref = frames[p0];
1407 int deltaIndex = fenc->frameNum - ref->frameNum;
1408
1409 /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
1410 float guessScale, fencMean, refMean;
1411 x265_emms();
1412 if (fenc->wp_ssd[0] && ref->wp_ssd[0])
1413 guessScale = sqrtf((float)fenc->wp_ssd[0] / ref->wp_ssd[0]);
1414 else
1415 guessScale = 1.0f;
1416 fencMean = (float)fenc->wp_sum[0] / (fenc->lines * fenc->width) / (1 << (X265_DEPTH - 8));
1417 refMean = (float)ref->wp_sum[0] / (fenc->lines * fenc->width) / (1 << (X265_DEPTH - 8));
1418
1419 /* Early termination */
1420 if (fabsf(refMean - fencMean) < 0.5f && fabsf(1.f - guessScale) < epsilon)
1421 return;
1422
1423 int minoff = 0, minscale, mindenom;
1424 unsigned int minscore = 0, origscore = 1;
1425 int found = 0;
1426
1427 m_w.setFromWeightAndOffset((int)(guessScale * 128 + 0.5f), 0, 7, true);
1428 mindenom = m_w.log2WeightDenom;
1429 minscale = m_w.inputWeight;
1430
1431 origscore = minscore = weightCostLuma(frames, b, p0, NULL);
1432
1433 if (!minscore)
1434 return;
1435
1436 unsigned int s = 0;
1437 int curScale = minscale;
1438 int curOffset = (int)(fencMean - refMean * curScale / (1 << mindenom) + 0.5f);
1439 if (curOffset < -128 || curOffset > 127)
1440 {
1441 /* Rescale considering the constraints on curOffset. We do it in this order
1442 * because scale has a much wider range than offset (because of denom), so
1443 * it should almost never need to be clamped. */
1444 curOffset = Clip3(-128, 127, curOffset);
1445 curScale = (int)((1 << mindenom) * (fencMean - curOffset) / refMean + 0.5f);
1446 curScale = Clip3(0, 127, curScale);
1447 }
1448 SET_WEIGHT(m_w, 1, curScale, mindenom, curOffset);
1449 s = weightCostLuma(frames, b, p0, &m_w);
1450 COPY4_IF_LT(minscore, s, minscale, curScale, minoff, curOffset, found, 1);
1451
1452 /* Use a smaller denominator if possible */
1453 while (mindenom > 0 && !(minscale & 1))
1454 {
1455 mindenom--;
1456 minscale >>= 1;
1457 }
1458
1459 if (!found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f)
1460 return;
1461 else
1462 {
1463 SET_WEIGHT(m_w, 1, minscale, mindenom, minoff);
1464 // set weighted delta cost
1465 fenc->weightedCostDelta[deltaIndex] = minscore / origscore;
1466
1467 int offset = m_w.inputOffset << (X265_DEPTH - 8);
1468 int scale = m_w.inputWeight;
1469 int denom = m_w.log2WeightDenom;
1470 int round = denom ? 1 << (denom - 1) : 0;
1471 int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
1472 intptr_t stride = ref->lumaStride;
1473 int widthHeight = (int)stride;
1474
1475 for (int i = 0; i < 4; i++)
1476 primitives.weight_pp(ref->buffer[i], m_wbuffer[i], stride, widthHeight, m_paddedLines,
1477 scale, round << correction, denom + correction, offset);
1478
1479 m_weightedRef.isWeighted = true;
1480 }
1481}
1482
1483void CostEstimate::processRow(int row, int /*threadId*/)
1484{
b53f7c52
JB
1485 ProfileScopeEvent(costEstimateRow);
1486
72b9787e
JB
1487 int realrow = m_heightInCU - 1 - row;
1488 Lowres **frames = m_curframes;
1489 ReferencePlanes *wfref0 = m_weightedRef.isWeighted ? &m_weightedRef : frames[m_curp0];
1490
1491 /* Lowres lookahead goes backwards because the MVs are used as
1492 * predictors in the main encode. This considerably improves MV
1493 * prediction overall. */
1494 for (int i = m_widthInCU - 1 - m_rows[row].m_completed; i >= 0; i--)
1495 {
1496 // TODO: use lowres MVs as motion candidates in full-res search
1497 m_rows[row].estimateCUCost(frames, wfref0, i, realrow, m_curp0, m_curp1, m_curb, m_bDoSearch);
1498 m_rows[row].m_completed++;
1499
1500 if (m_rows[row].m_completed >= 2 && row < m_heightInCU - 1)
1501 {
1502 ScopedLock below(m_rows[row + 1].m_lock);
1503 if (m_rows[row + 1].m_active == false &&
1504 m_rows[row + 1].m_completed + 2 <= m_rows[row].m_completed)
1505 {
1506 m_rows[row + 1].m_active = true;
1507 enqueueRow(row + 1);
1508 }
1509 }
1510
1511 ScopedLock self(m_rows[row].m_lock);
1512 if (row > 0 && (int32_t)m_rows[row].m_completed < m_widthInCU - 1 &&
1513 m_rows[row - 1].m_completed < m_rows[row].m_completed + 2)
1514 {
1515 m_rows[row].m_active = false;
1516 return;
1517 }
1518 }
1519
1520 if (row == m_heightInCU - 1)
1521 m_bFrameCompleted = true;
1522}
1523
1524void EstimateRow::init()
1525{
1526 m_costEst = 0;
1527 m_costEstAq = 0;
1528 m_costIntra = 0;
1529 m_costIntraAq = 0;
1530 m_intraMbs = 0;
1531 m_active = false;
1532 m_completed = 0;
1533}
1534
1535void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2])
1536{
1537 Lowres *fref1 = frames[p1];
1538 Lowres *fenc = frames[b];
1539
1540 const int bBidir = (b < p1);
1541 const int cuXY = cux + cuy * m_widthInCU;
1542 const int cuSize = X265_LOWRES_CU_SIZE;
1543 const intptr_t pelOffset = cuSize * cux + cuSize * cuy * fenc->lumaStride;
1544
1545 // should this CU's cost contribute to the frame cost?
1546 const bool bFrameScoreCU = (cux > 0 && cux < m_widthInCU - 1 &&
1547 cuy > 0 && cuy < m_heightInCU - 1) || m_widthInCU <= 2 || m_heightInCU <= 2;
1548
b53f7c52 1549 m_me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize);
72b9787e
JB
1550
1551 /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
1552 int lowresPenalty = 4;
1553
1554 MV(*fenc_mvs[2]) = { &fenc->lowresMvs[0][b - p0 - 1][cuXY],
1555 &fenc->lowresMvs[1][p1 - b - 1][cuXY] };
1556 int(*fenc_costs[2]) = { &fenc->lowresMvCosts[0][b - p0 - 1][cuXY],
1557 &fenc->lowresMvCosts[1][p1 - b - 1][cuXY] };
1558
1559 MV mvmin, mvmax;
1560 int bcost = m_me.COST_MAX;
1561 int listused = 0;
1562
1563 // establish search bounds that don't cross extended frame boundaries
1564 mvmin.x = (int16_t)(-cux * cuSize - 8);
1565 mvmin.y = (int16_t)(-cuy * cuSize - 8);
1566 mvmax.x = (int16_t)((m_widthInCU - cux - 1) * cuSize + 8);
1567 mvmax.y = (int16_t)((m_heightInCU - cuy - 1) * cuSize + 8);
1568
1569 if (p0 != p1)
1570 {
1571 for (int i = 0; i < 1 + bBidir; i++)
1572 {
1573 if (!bDoSearch[i])
1574 {
1575 /* Use previously calculated cost */
1576 COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1);
1577 continue;
1578 }
1579 int numc = 0;
1580 MV mvc[4], mvp;
1581 MV *fenc_mv = fenc_mvs[i];
1582
1583 /* Reverse-order MV prediction. */
1584 mvc[0] = 0;
1585 mvc[2] = 0;
1586#define MVC(mv) mvc[numc++] = mv;
1587 if (cux < m_widthInCU - 1)
1588 MVC(fenc_mv[1]);
1589 if (cuy < m_heightInCU - 1)
1590 {
1591 MVC(fenc_mv[m_widthInCU]);
1592 if (cux > 0)
1593 MVC(fenc_mv[m_widthInCU - 1]);
1594 if (cux < m_widthInCU - 1)
1595 MVC(fenc_mv[m_widthInCU + 1]);
1596 }
1597#undef MVC
1598 if (numc <= 1)
1599 mvp = mvc[0];
1600 else
1601 {
1602 median_mv(mvp, mvc[0], mvc[1], mvc[2]);
1603 }
1604
1605 *fenc_costs[i] = m_me.motionEstimate(i ? fref1 : wfref0, mvmin, mvmax, mvp, numc, mvc, m_merange, *fenc_mvs[i]);
1606 COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1);
1607 }
1608 if (bBidir)
1609 {
b53f7c52
JB
1610 ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
1611 ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
72b9787e
JB
1612 intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;
1613 pixel *src0 = wfref0->lowresMC(pelOffset, *fenc_mvs[0], subpelbuf0, stride0);
1614 pixel *src1 = fref1->lowresMC(pelOffset, *fenc_mvs[1], subpelbuf1, stride1);
1615
b53f7c52 1616 ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
72b9787e
JB
1617 primitives.pixelavg_pp[LUMA_8x8](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
1618 int bicost = primitives.satd[LUMA_8x8](fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE);
1619 COPY2_IF_LT(bcost, bicost, listused, 3);
1620
1621 // Try 0,0 candidates
1622 src0 = wfref0->lowresPlane[0] + pelOffset;
1623 src1 = fref1->lowresPlane[0] + pelOffset;
1624 primitives.pixelavg_pp[LUMA_8x8](ref, X265_LOWRES_CU_SIZE, src0, wfref0->lumaStride, src1, fref1->lumaStride, 32);
1625 bicost = primitives.satd[LUMA_8x8](fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE);
1626 COPY2_IF_LT(bcost, bicost, listused, 3);
1627 }
1628 }
1629 if (!fenc->bIntraCalculated)
1630 {
1631 const int sizeIdx = X265_LOWRES_CU_BITS - 2; // partition size
1632
1633 pixel _above0[X265_LOWRES_CU_SIZE * 4 + 1], *const above0 = _above0 + 2 * X265_LOWRES_CU_SIZE;
1634 pixel _above1[X265_LOWRES_CU_SIZE * 4 + 1], *const above1 = _above1 + 2 * X265_LOWRES_CU_SIZE;
1635 pixel _left0[X265_LOWRES_CU_SIZE * 4 + 1], *const left0 = _left0 + 2 * X265_LOWRES_CU_SIZE;
1636 pixel _left1[X265_LOWRES_CU_SIZE * 4 + 1], *const left1 = _left1 + 2 * X265_LOWRES_CU_SIZE;
1637
1638 pixel *pix_cur = fenc->lowresPlane[0] + pelOffset;
1639
1640 // Copy Above
1641 memcpy(above0, pix_cur - 1 - fenc->lumaStride, (cuSize + 1) * sizeof(pixel));
1642
1643 // Copy Left
1644 for (int i = 0; i < cuSize + 1; i++)
72b9787e 1645 left0[i] = pix_cur[-1 - fenc->lumaStride + i * fenc->lumaStride];
72b9787e
JB
1646
1647 for (int i = 0; i < cuSize; i++)
1648 {
1649 above0[cuSize + i + 1] = above0[cuSize];
1650 left0[cuSize + i + 1] = left0[cuSize];
1651 }
1652
1653 // filtering with [1 2 1]
1654 // assume getUseStrongIntraSmoothing() is disabled
1655 above1[0] = above0[0];
1656 above1[2 * cuSize] = above0[2 * cuSize];
1657 left1[0] = left0[0];
1658 left1[2 * cuSize] = left0[2 * cuSize];
1659 for (int i = 1; i < 2 * cuSize; i++)
1660 {
1661 above1[i] = (above0[i - 1] + 2 * above0[i] + above0[i + 1] + 2) >> 2;
1662 left1[i] = (left0[i - 1] + 2 * left0[i] + left0[i + 1] + 2) >> 2;
1663 }
1664
1665 int predsize = cuSize * cuSize;
1666
1667 // generate 35 intra predictions into m_predictions
1668 pixelcmp_t satd = primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)];
b53f7c52 1669 int icost = m_me.COST_MAX;
72b9787e 1670 primitives.intra_pred[DC_IDX][sizeIdx](m_predictions, cuSize, left0, above0, 0, (cuSize <= 16));
b53f7c52 1671 int cost = m_me.bufSATD(m_predictions, cuSize);
72b9787e
JB
1672 if (cost < icost)
1673 icost = cost;
1674 pixel *above = (cuSize >= 8) ? above1 : above0;
1675 pixel *left = (cuSize >= 8) ? left1 : left0;
1676 primitives.intra_pred[PLANAR_IDX][sizeIdx](m_predictions, cuSize, left, above, 0, 0);
b53f7c52 1677 cost = m_me.bufSATD(m_predictions, cuSize);
72b9787e
JB
1678 if (cost < icost)
1679 icost = cost;
1680 primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 * predsize, above0, left0, above1, left1, (cuSize <= 16));
1681
1682 // calculate satd costs, keep least cost
1683 ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
b53f7c52 1684 primitives.transpose[sizeIdx](buf_trans, m_me.fencPUYuv.m_buf[0], FENC_STRIDE);
72b9787e
JB
1685
1686 int acost = m_me.COST_MAX;
1687 uint32_t mode, lowmode = 4;
1688 for (mode = 5; mode < 35; mode += 5)
1689 {
1690 if (mode < 18)
1691 cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
1692 else
b53f7c52 1693 cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
72b9787e
JB
1694 COPY2_IF_LT(acost, cost, lowmode, mode);
1695 }
1696 for (uint32_t dist = 2; dist >= 1; dist--)
1697 {
1698 mode = lowmode - dist;
1699 if (mode < 18)
1700 cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
1701 else
b53f7c52 1702 cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
72b9787e
JB
1703 COPY2_IF_LT(acost, cost, lowmode, mode);
1704
1705 mode = lowmode + dist;
1706 if (mode < 18)
1707 cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
1708 else
b53f7c52 1709 cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
72b9787e
JB
1710 COPY2_IF_LT(acost, cost, lowmode, mode);
1711 }
1712 if (acost < icost)
1713 icost = acost;
1714
1715 const int intraPenalty = 5 * m_lookAheadLambda;
1716 icost += intraPenalty + lowresPenalty; /* estimate intra signal cost */
1717 fenc->intraCost[cuXY] = icost;
b53f7c52 1718 fenc->intraMode[cuXY] = (uint8_t)lowmode;
72b9787e
JB
1719 int icostAq = icost;
1720 if (bFrameScoreCU)
1721 {
1722 m_costIntra += icost;
1723 if (fenc->invQscaleFactor)
1724 {
1725 icostAq = (icost * fenc->invQscaleFactor[cuXY] + 128) >> 8;
1726 m_costIntraAq += icostAq;
1727 }
1728 }
1729 fenc->rowSatds[0][0][cuy] += icostAq;
1730 }
1731 bcost += lowresPenalty;
1732 if (!bBidir)
1733 {
1734 if (fenc->intraCost[cuXY] < bcost)
1735 {
1736 if (bFrameScoreCU) m_intraMbs++;
1737 bcost = fenc->intraCost[cuXY];
1738 listused = 0;
1739 }
1740 }
1741
1742 /* For I frames these costs were accumulated earlier */
1743 if (p0 != p1)
1744 {
1745 int bcostAq = bcost;
1746 if (bFrameScoreCU)
1747 {
1748 m_costEst += bcost;
1749 if (fenc->invQscaleFactor)
1750 {
1751 bcostAq = (bcost * fenc->invQscaleFactor[cuXY] + 128) >> 8;
1752 m_costEstAq += bcostAq;
1753 }
1754 }
1755 fenc->rowSatds[b - p0][p1 - b][cuy] += bcostAq;
1756 }
1757 fenc->lowresCosts[b - p0][p1 - b][cuXY] = (uint16_t)(X265_MIN(bcost, LOWRES_COST_MASK) | (listused << LOWRES_COST_SHIFT));
1758}