Imported Upstream version 1.4
[deb_x265.git] / source / encoder / slicetype.cpp
CommitLineData
72b9787e
JB
1/*****************************************************************************
2 * Copyright (C) 2013 x265 project
3 *
4 * Authors: Gopu Govindaswamy <gopu@multicorewareinc.com>
5 * Steve Borho <steve@borho.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20 *
21 * This program is also available under a commercial proprietary license.
22 * For more information, contact us at license @ x265.com.
23 *****************************************************************************/
24
25#include "common.h"
26#include "frame.h"
27#include "framedata.h"
28#include "picyuv.h"
29#include "primitives.h"
30#include "lowres.h"
31#include "mv.h"
32
33#include "slicetype.h"
34#include "motion.h"
35#include "ratecontrol.h"
36
/* Number of lowres CUs used as the denominator for per-frame statistics.
 * Expands to (m_widthInCU - 2) * (m_heightInCU - 2) when both dimensions
 * exceed 2, and to the full m_widthInCU * m_heightInCU otherwise. It reads
 * m_widthInCU / m_heightInCU from the scope in which it is expanded. */
37#define NUM_CUS (m_widthInCU > 2 && m_heightInCU > 2 ? (m_widthInCU - 2) * (m_heightInCU - 2) : m_widthInCU * m_heightInCU)
38
39using namespace x265;
40
/* Returns the median (middle value) of three 16-bit integers. */
static inline int16_t median(int16_t a, int16_t b, int16_t c)
{
    /* order the first two operands ... */
    const int16_t lo = a < b ? a : b;
    const int16_t hi = a < b ? b : a;

    /* ... then clamp the third one into [lo, hi] */
    if (c < lo)
        return lo;
    if (c > hi)
        return hi;
    return c;
}
51
52static inline void median_mv(MV &dst, MV a, MV b, MV c)
53{
54 dst.x = median(a.x, b.x, c.x);
55 dst.y = median(a.y, b.y, c.y);
56}
57
58Lookahead::Lookahead(x265_param *param, ThreadPool* pool)
59 : JobProvider(pool)
60 , m_est(pool)
61{
62 m_bReady = 0;
63 m_param = param;
64 m_lastKeyframe = -m_param->keyframeMax;
65 m_lastNonB = NULL;
66 m_bFilling = true;
67 m_bFlushed = false;
68 m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
69 m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
70 m_scratch = (int*)x265_malloc(m_widthInCU * sizeof(int));
71 memset(m_histogram, 0, sizeof(m_histogram));
72}
73
74Lookahead::~Lookahead() { }
75
76void Lookahead::init()
77{
78 if (m_pool && m_pool->getThreadCount() >= 4 &&
79 ((m_param->bFrameAdaptive && m_param->bframes) ||
80 m_param->rc.cuTree || m_param->scenecutThreshold ||
81 (m_param->lookaheadDepth && m_param->rc.vbvBufferSize)))
82 m_pool = m_pool; /* allow use of worker thread */
83 else
84 m_pool = NULL; /* disable use of worker thread */
85}
86
87void Lookahead::destroy()
88{
89 if (m_pool)
90 // flush will dequeue, if it is necessary
91 JobProvider::flush();
92
93 // these two queues will be empty unless the encode was aborted
94 while (!m_inputQueue.empty())
95 {
96 Frame* curFrame = m_inputQueue.popFront();
97 curFrame->destroy();
98 delete curFrame;
99 }
100
101 while (!m_outputQueue.empty())
102 {
103 Frame* curFrame = m_outputQueue.popFront();
104 curFrame->destroy();
105 delete curFrame;
106 }
107
108 x265_free(m_scratch);
109}
110
111/* Called by API thread */
112void Lookahead::addPicture(Frame *curFrame, int sliceType)
113{
114 PicYuv *orig = curFrame->m_origPicYuv;
115
116 curFrame->m_lowres.init(orig, curFrame->m_poc, sliceType);
117
118 m_inputQueueLock.acquire();
119 m_inputQueue.pushBack(*curFrame);
120
121 if (m_inputQueue.size() >= m_param->lookaheadDepth)
122 {
123 /* when queue fills the first time, run slicetypeDecide synchronously,
124 * since the encoder will always be blocked here */
125 if (m_pool && !m_bFilling)
126 {
127 m_inputQueueLock.release();
128 m_bReady = 1;
129 m_pool->pokeIdleThread();
130 }
131 else
132 slicetypeDecide();
133
134 if (m_bFilling && m_pool)
135 JobProvider::enqueue();
136 m_bFilling = false;
137 }
138 else
139 m_inputQueueLock.release();
140}
141
142/* Called by API thread */
143void Lookahead::flush()
144{
145 /* just in case the input queue is never allowed to fill */
146 m_bFilling = false;
147
148 /* flush synchronously */
149 m_inputQueueLock.acquire();
150 if (!m_inputQueue.empty())
151 {
152 slicetypeDecide();
153 }
154 else
155 m_inputQueueLock.release();
156
157 m_inputQueueLock.acquire();
158
159 /* bFlushed indicates that an empty output queue actually means all frames
160 * have been decided (no more inputs for the encoder) */
161 if (m_inputQueue.empty())
162 m_bFlushed = true;
163 m_inputQueueLock.release();
164}
165
166/* Called by API thread. If the lookahead queue has not yet been filled the
167 * first time, it immediately returns NULL. Else the function blocks until
168 * outputs are available and then pops the first frame from the output queue. If
169 * flush() has been called and the output queue is empty, NULL is returned. */
170Frame* Lookahead::getDecidedPicture()
171{
172 m_outputQueueLock.acquire();
173
174 if (m_bFilling)
175 {
176 m_outputQueueLock.release();
177 return NULL;
178 }
179
180 while (m_outputQueue.empty() && !m_bFlushed)
181 {
182 m_outputQueueLock.release();
183 m_outputAvailable.wait();
184 m_outputQueueLock.acquire();
185 }
186
187 Frame *fenc = m_outputQueue.popFront();
188 m_outputQueueLock.release();
189 return fenc;
190}
191
192/* Called by pool worker threads */
193bool Lookahead::findJob(int)
194{
195 if (m_bReady && ATOMIC_CAS32(&m_bReady, 1, 0) == 1)
196 {
197 m_inputQueueLock.acquire();
198 slicetypeDecide();
199 return true;
200 }
201 else
202 return false;
203}
204
205/* Called by rate-control to calculate the estimated SATD cost for a given
206 * picture. It assumes dpb->prepareEncode() has already been called for the
207 * picture and all the references are established */
/* Stores the estimated cost of curFrame in curFrame->m_lowres.satdCost and,
 * when both rc.vbvBufferSize and rc.vbvMaxBitrate are set, adds per-row sums
 * of the lowres CU costs to curFrame->m_encData->m_rowStat[row].satdForVbv.
 * Returns without doing anything for slice types other than I, P and B. */
208void Lookahead::getEstimatedPictureCost(Frame *curFrame)
209{
/* Sparse table indexed by POC distance from the L0 reference: only the
 * entries p0, b and p1 are filled in below. */
210 Lowres *frames[X265_LOOKAHEAD_MAX];
211
212 // POC distances to each reference
213 Slice *slice = curFrame->m_encData->m_slice;
214 int p0 = 0, p1, b;
215 int poc = slice->m_poc;
216 int l0poc = slice->m_refPOCList[0][0];
217 int l1poc = slice->m_refPOCList[1][0];
218
219 switch (slice->m_sliceType)
220 {
221 case I_SLICE:
/* intra: the frame is its own (only) entry, all indices are 0 */
222 frames[p0] = &curFrame->m_lowres;
223 b = p1 = 0;
224 break;
225
226 case P_SLICE:
/* forward prediction only: the current frame is both b and p1 */
227 b = p1 = poc - l0poc;
228 frames[p0] = &slice->m_refPicList[0][0]->m_lowres;
229 frames[b] = &curFrame->m_lowres;
230 break;
231
232 case B_SLICE:
/* p1 is the distance from the L0 reference to the L1 reference */
233 b = poc - l0poc;
234 p1 = b + l1poc - poc;
235 frames[p0] = &slice->m_refPicList[0][0]->m_lowres;
236 frames[b] = &curFrame->m_lowres;
237 frames[p1] = &slice->m_refPicList[1][0]->m_lowres;
238 break;
239
240 default:
241 return;
242 }
243
/* Pick the frame cost: recalculated with cutree offsets, the AQ-weighted
 * table, or the plain table. */
244 if (m_param->rc.cuTree && !m_param->rc.bStatRead)
245 /* update row satds based on cutree offsets */
246 curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b);
247 else if (m_param->rc.aqMode)
248 curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b];
249 else
250 curFrame->m_lowres.satdCost = curFrame->m_lowres.costEst[b - p0][p1 - b];
251
252 if (m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate)
253 {
254 /* aggregate lowres row satds to CTU resolution */
255 curFrame->m_lowres.lowresCostForRc = curFrame->m_lowres.lowresCosts[b - p0][p1 - b];
256 uint32_t lowresRow = 0, lowresCol = 0, lowresCuIdx = 0, sum = 0;
/* scale = lowres CU rows per CTU row; the factor 2 presumably reflects the
 * lowres picture being half the source size -- confirm against Lowres */
257 uint32_t scale = m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE);
258 uint32_t numCuInHeight = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
259 uint32_t widthInLowresCu = (uint32_t)m_widthInCU, heightInLowresCu = (uint32_t)m_heightInCU;
/* stays NULL (no scaling of the costs) unless AQ is enabled */
260 double *qp_offset = 0;
261 /* Factor in qpoffsets based on Aq/Cutree in CU costs */
262 if (m_param->rc.aqMode)
263 qp_offset = (frames[b]->sliceType == X265_TYPE_B || !m_param->rc.cuTree) ? frames[b]->qpAqOffset : frames[b]->qpCuTreeOffset;
264
265 for (uint32_t row = 0; row < numCuInHeight; row++)
266 {
267 lowresRow = row * scale;
/* visit the (up to) 'scale' lowres rows covered by this CTU row, stopping at
 * the bottom of the lowres picture */
268 for (uint32_t cnt = 0; cnt < scale && lowresRow < heightInLowresCu; lowresRow++, cnt++)
269 {
270 sum = 0;
271 lowresCuIdx = lowresRow * widthInLowresCu;
272 for (lowresCol = 0; lowresCol < widthInLowresCu; lowresCol++, lowresCuIdx++)
273 {
/* keep only the bits selected by LOWRES_COST_MASK */
274 uint16_t lowresCuCost = curFrame->m_lowres.lowresCostForRc[lowresCuIdx] & LOWRES_COST_MASK;
275 if (qp_offset)
276 {
/* scale by x265_exp2fix8(offset): +128 then >> 8 is a rounded divide by 256 */
277 lowresCuCost = (uint16_t)((lowresCuCost * x265_exp2fix8(qp_offset[lowresCuIdx]) + 128) >> 8);
278 int32_t intraCuCost = curFrame->m_lowres.intraCost[lowresCuIdx];
/* NOTE(review): intraCost is rewritten in place, so a second call for the
 * same frame would apply the offset again -- confirm callers invoke this once */
279 curFrame->m_lowres.intraCost[lowresCuIdx] = (intraCuCost * x265_exp2fix8(qp_offset[lowresCuIdx]) + 128) >> 8;
280 }
/* the masked (and possibly scaled) cost is written back to the lowres table */
281 curFrame->m_lowres.lowresCostForRc[lowresCuIdx] = lowresCuCost;
282 sum += lowresCuCost;
283 }
284 curFrame->m_encData->m_rowStat[row].satdForVbv += sum;
285 }
286 }
287 }
288}
289
290/* called by API thread or worker thread with inputQueueLock acquired */
291void Lookahead::slicetypeDecide()
292{
293 ScopedLock lock(m_decideLock);
294
295 Lowres *frames[X265_LOOKAHEAD_MAX];
296 Frame *list[X265_LOOKAHEAD_MAX];
297 int maxSearch = X265_MIN(m_param->lookaheadDepth, X265_LOOKAHEAD_MAX);
298
299 memset(frames, 0, sizeof(frames));
300 memset(list, 0, sizeof(list));
301 {
302 Frame *curFrame = m_inputQueue.first();
303 int j;
304 for (j = 0; j < m_param->bframes + 2; j++)
305 {
306 if (!curFrame) break;
307 list[j] = curFrame;
308 curFrame = curFrame->m_next;
309 }
310
311 curFrame = m_inputQueue.first();
312 frames[0] = m_lastNonB;
313 for (j = 0; j < maxSearch; j++)
314 {
315 if (!curFrame) break;
316 frames[j + 1] = &curFrame->m_lowres;
317 curFrame = curFrame->m_next;
318 }
319
320 maxSearch = j;
321 }
322
323 m_inputQueueLock.release();
324
325 if (!m_est.m_rows && list[0])
326 m_est.init(m_param, list[0]);
327
328 if (m_lastNonB && !m_param->rc.bStatRead &&
329 ((m_param->bFrameAdaptive && m_param->bframes) ||
330 m_param->rc.cuTree || m_param->scenecutThreshold ||
331 (m_param->lookaheadDepth && m_param->rc.vbvBufferSize)))
332 {
333 slicetypeAnalyse(frames, false);
334 }
335
336 int bframes, brefs;
337 for (bframes = 0, brefs = 0;; bframes++)
338 {
339 Lowres& frm = list[bframes]->m_lowres;
340
341 if (frm.sliceType == X265_TYPE_BREF && !m_param->bBPyramid && brefs == m_param->bBPyramid)
342 {
343 frm.sliceType = X265_TYPE_B;
344 x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid\n",
345 frm.frameNum);
346 }
347
348 /* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available.
349 smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it.*/
350 else if (frm.sliceType == X265_TYPE_BREF && m_param->bBPyramid && brefs &&
351 m_param->maxNumReferences <= (brefs + 3))
352 {
353 frm.sliceType = X265_TYPE_B;
354 x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid and %d reference frames\n",
355 frm.sliceType, m_param->maxNumReferences);
356 }
357
358 if ( /*(!param->intraRefresh || frm.frameNum == 0) && */ frm.frameNum - m_lastKeyframe >= m_param->keyframeMax)
359 {
360 if (frm.sliceType == X265_TYPE_AUTO || frm.sliceType == X265_TYPE_I)
361 frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR;
362 bool warn = frm.sliceType != X265_TYPE_IDR;
363 if (warn && m_param->bOpenGOP)
364 warn &= frm.sliceType != X265_TYPE_I;
365 if (warn)
366 {
367 x265_log(m_param, X265_LOG_WARNING, "specified frame type (%d) at %d is not compatible with keyframe interval\n",
368 frm.sliceType, frm.frameNum);
369 frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR;
370 }
371 }
372 if (frm.sliceType == X265_TYPE_I && frm.frameNum - m_lastKeyframe >= m_param->keyframeMin)
373 {
374 if (m_param->bOpenGOP)
375 {
376 m_lastKeyframe = frm.frameNum;
377 frm.bKeyframe = true;
378 }
379 else
380 frm.sliceType = X265_TYPE_IDR;
381 }
382 if (frm.sliceType == X265_TYPE_IDR)
383 {
384 /* Closed GOP */
385 m_lastKeyframe = frm.frameNum;
386 frm.bKeyframe = true;
387 if (bframes > 0)
388 {
389 list[bframes - 1]->m_lowres.sliceType = X265_TYPE_P;
390 bframes--;
391 }
392 }
393 if (bframes == m_param->bframes || !list[bframes + 1])
394 {
395 if (IS_X265_TYPE_B(frm.sliceType))
396 x265_log(m_param, X265_LOG_WARNING, "specified frame type is not compatible with max B-frames\n");
397 if (frm.sliceType == X265_TYPE_AUTO || IS_X265_TYPE_B(frm.sliceType))
398 frm.sliceType = X265_TYPE_P;
399 }
400 if (frm.sliceType == X265_TYPE_BREF)
401 brefs++;
402 if (frm.sliceType == X265_TYPE_AUTO)
403 frm.sliceType = X265_TYPE_B;
404 else if (!IS_X265_TYPE_B(frm.sliceType))
405 break;
406 }
407
408 if (bframes)
409 list[bframes - 1]->m_lowres.bLastMiniGopBFrame = true;
410 list[bframes]->m_lowres.leadingBframes = bframes;
411 m_lastNonB = &list[bframes]->m_lowres;
412 m_histogram[bframes]++;
413
414 /* insert a bref into the sequence */
415 if (m_param->bBPyramid && bframes > 1 && !brefs)
416 {
417 list[bframes / 2]->m_lowres.sliceType = X265_TYPE_BREF;
418 brefs++;
419 }
420
421 /* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */
422 if (m_param->rc.rateControlMode != X265_RC_CQP)
423 {
424 int p0, p1, b;
425 /* For zero latency tuning, calculate frame cost to be used later in RC */
426 if (!maxSearch)
427 {
428 for (int i = 0; i <= bframes; i++)
429 frames[i + 1] = &list[i]->m_lowres;
430 }
431
432 /* estimate new non-B cost */
433 p1 = b = bframes + 1;
434 p0 = (IS_X265_TYPE_I(frames[bframes + 1]->sliceType)) ? b : 0;
435 m_est.estimateFrameCost(frames, p0, p1, b, 0);
436
437 if (bframes)
438 {
439 p0 = 0; // last nonb
440 for (b = 1; b <= bframes; b++)
441 {
442 if (frames[b]->sliceType == X265_TYPE_B)
443 for (p1 = b; frames[p1]->sliceType == X265_TYPE_B; p1++)
444 ; // find new nonb or bref
445 else
446 p1 = bframes + 1;
447
448 m_est.estimateFrameCost(frames, p0, p1, b, 0);
449
450 if (frames[b]->sliceType == X265_TYPE_BREF)
451 p0 = b;
452 }
453 }
454 }
455
456 m_inputQueueLock.acquire();
457
458 /* dequeue all frames from inputQueue that are about to be enqueued
459 * in the output queue. The order is important because Frame can
460 * only be in one list at a time */
461 int64_t pts[X265_BFRAME_MAX + 1];
462 for (int i = 0; i <= bframes; i++)
463 {
464 Frame *curFrame;
465 curFrame = m_inputQueue.popFront();
466 pts[i] = curFrame->m_pts;
467 maxSearch--;
468 }
469
470 m_inputQueueLock.release();
471
472 m_outputQueueLock.acquire();
473 /* add non-B to output queue */
474 int idx = 0;
475 list[bframes]->m_reorderedPts = pts[idx++];
476 m_outputQueue.pushBack(*list[bframes]);
477
478 /* Add B-ref frame next to P frame in output queue, the B-ref encode before non B-ref frame */
479 if (bframes > 1 && m_param->bBPyramid)
480 {
481 for (int i = 0; i < bframes; i++)
482 {
483 if (list[i]->m_lowres.sliceType == X265_TYPE_BREF)
484 {
485 list[i]->m_reorderedPts = pts[idx++];
486 m_outputQueue.pushBack(*list[i]);
487 }
488 }
489 }
490
491 /* add B frames to output queue */
492 for (int i = 0; i < bframes; i++)
493 {
494 /* push all the B frames into output queue except B-ref, which already pushed into output queue*/
495 if (list[i]->m_lowres.sliceType != X265_TYPE_BREF)
496 {
497 list[i]->m_reorderedPts = pts[idx++];
498 m_outputQueue.pushBack(*list[i]);
499 }
500 }
501
502 bool isKeyFrameAnalyse = (m_param->rc.cuTree || (m_param->rc.vbvBufferSize && m_param->lookaheadDepth)) && !m_param->rc.bStatRead;
503 if (isKeyFrameAnalyse && IS_X265_TYPE_I(m_lastNonB->sliceType))
504 {
505 m_inputQueueLock.acquire();
506 Frame *curFrame = m_inputQueue.first();
507 frames[0] = m_lastNonB;
508 int j;
509 for (j = 0; j < maxSearch; j++)
510 {
511 frames[j + 1] = &curFrame->m_lowres;
512 curFrame = curFrame->m_next;
513 }
514
515 frames[j + 1] = NULL;
516 m_inputQueueLock.release();
517 slicetypeAnalyse(frames, true);
518 }
519
520 m_outputQueueLock.release();
521 m_outputAvailable.trigger();
522}
523
/* Fills the plannedSatd[] / plannedType[] arrays of the lookahead frames with
 * the vbvFrameCost() and slice type of the frames that follow them, walking
 * the frames one mini-GOP (prevNonB .. curNonB) at a time.
 *
 * frames     frames[0] is the previous non-B frame, frames[1..numFrames] the
 *            frames being analysed
 * numFrames  number of analysed frames
 * keyframe   non-zero when frames[0] itself receives the plan (see nextNonB)
 *
 * The plannedType list of frames[nextNonB] is terminated with X265_TYPE_AUTO. */
524void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe)
525{
526 int prevNonB = 0, curNonB = 1, idx = 0;
527 bool isNextNonB = false;
528
/* advance curNonB to the first frame after frames[0] that is not X265_TYPE_B */
529 while (curNonB < numFrames && frames[curNonB]->sliceType == X265_TYPE_B)
530 curNonB++;
531
/* nextNonB is the frame whose plan is written through 'idx'; frames from
 * nextB onwards get their plans appended through their own indB counters */
532 int nextNonB = keyframe ? prevNonB : curNonB;
533 int nextB = keyframe ? prevNonB + 1 : curNonB + 1;
534
535 while (curNonB < numFrames + !keyframe)
536 {
537 /* P/I cost: This shouldn't include the cost of nextNonB */
538 if (nextNonB != curNonB)
539 {
/* an I frame is costed against itself, anything else against prevNonB */
540 int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB;
541 frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, p0, curNonB, curNonB);
542 frames[nextNonB]->plannedType[idx] = frames[curNonB]->sliceType;
543 idx++;
544 }
545 /* Handle the B-frames: coded order */
546 for (int i = prevNonB + 1; i < curNonB; i++, idx++)
547 {
548 frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, prevNonB, curNonB, i);
549 frames[nextNonB]->plannedType[idx] = X265_TYPE_B;
550 }
551
/* extend the plans of the frames nextB..curNonB; indB is each frame's own
 * write position and persists across iterations of the outer loop */
552 for (int i = nextB; i <= curNonB; i++)
553 {
554 for (int j = frames[i]->indB + i + 1; j <= curNonB; j++, frames[i]->indB++)
555 {
556 if (j == curNonB)
557 {
/* the non-B entry is only recorded once isNextNonB has been set, i.e. not
 * during the pass in which curNonB was first reached */
558 if (isNextNonB)
559 {
560 int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB;
561 frames[i]->plannedSatd[frames[i]->indB] = vbvFrameCost(frames, p0, curNonB, curNonB);
562 frames[i]->plannedType[frames[i]->indB] = frames[curNonB]->sliceType;
563 }
564 }
565 else
566 {
567 frames[i]->plannedSatd[frames[i]->indB] = vbvFrameCost(frames, prevNonB, curNonB, j);
568 frames[i]->plannedType[frames[i]->indB] = X265_TYPE_B;
569 }
570 }
571 if (i == curNonB && !isNextNonB)
572 isNextNonB = true;
573 }
574
/* step to the next mini-GOP */
575 prevNonB = curNonB;
576 curNonB++;
577 while (curNonB <= numFrames && frames[curNonB]->sliceType == X265_TYPE_B)
578 curNonB++;
579 }
580
/* terminate the plan of nextNonB */
581 frames[nextNonB]->plannedType[idx] = X265_TYPE_AUTO;
582}
583
584int64_t Lookahead::vbvFrameCost(Lowres **frames, int p0, int p1, int b)
585{
586 int64_t cost = m_est.estimateFrameCost(frames, p0, p1, b, 0);
587
588 if (m_param->rc.aqMode)
589 {
590 if (m_param->rc.cuTree)
591 return frameCostRecalculate(frames, p0, p1, b);
592 else
593 return frames[b]->costEstAq[b - p0][p1 - b];
594 }
595 return cost;
596}
597
/* Tentatively assigns slice types to the undecided (X265_TYPE_AUTO) frames in
 * frames[1..], runs cuTree() and vbvLookahead() on that tentative sequence
 * when they are enabled, and finally resets every frame from resetStart
 * onwards back to X265_TYPE_AUTO so that only the leading decisions persist.
 *
 * frames     frames[0] is the last non-B frame; frames[1..] are the queued
 *            frames. The function writes a NULL terminator into this array.
 * bKeyframe  forwarded to cuTree()/vbvLookahead(); when true no tentative
 *            type is kept (resetStart is 1). */
598void Lookahead::slicetypeAnalyse(Lowres **frames, bool bKeyframe)
599{
600 int numFrames, origNumFrames, keyintLimit, framecnt;
601 int maxSearch = X265_MIN(m_param->lookaheadDepth, X265_LOOKAHEAD_MAX);
602 int cuCount = NUM_CUS;
603 int resetStart;
604 bool bIsVbvLookahead = m_param->rc.vbvBufferSize && m_param->lookaheadDepth;
605
606 /* count undecided frames */
607 for (framecnt = 0; framecnt < maxSearch; framecnt++)
608 {
609 Lowres *fenc = frames[framecnt + 1];
610 if (!fenc || fenc->sliceType != X265_TYPE_AUTO)
611 break;
612 }
613
/* nothing undecided: only the cutree pass (if enabled) is run */
614 if (!framecnt)
615 {
616 if (m_param->rc.cuTree)
617 cuTree(frames, 0, bKeyframe);
618 return;
619 }
620
/* terminate the list after the last undecided frame */
621 frames[framecnt + 1] = NULL;
622
/* keyintLimit = keyframeMax - (frames[0]->frameNum - m_lastKeyframe) - 1 */
623 keyintLimit = m_param->keyframeMax - frames[0]->frameNum + m_lastKeyframe - 1;
624 origNumFrames = numFrames = X265_MIN(framecnt, keyintLimit);
625
626 if (bIsVbvLookahead)
627 numFrames = framecnt;
628 else if (m_param->bOpenGOP && numFrames < framecnt)
629 numFrames++;
630 else if (numFrames == 0)
631 {
/* no room left before the keyframe limit: the next frame is intra */
632 frames[1]->sliceType = X265_TYPE_I;
633 return;
634 }
635
636 int numBFrames = 0;
637 int numAnalyzed = numFrames;
/* a scene cut on the very next frame makes it an I frame straight away */
638 if (m_param->scenecutThreshold && scenecut(frames, 0, 1, true, origNumFrames, maxSearch))
639 {
640 frames[1]->sliceType = X265_TYPE_I;
641 return;
642 }
643
644 if (m_param->bframes)
645 {
646 if (m_param->bFrameAdaptive == X265_B_ADAPT_TRELLIS)
647 {
648 if (numFrames > 1)
649 {
/* best_paths[len % (X265_BFRAME_MAX + 1)] holds the best 'B'/'P' string found
 * for a sequence of 'len' frames; lengths 0 and 1 are seeded here */
650 char best_paths[X265_BFRAME_MAX + 1][X265_LOOKAHEAD_MAX + 1] = { "", "P" };
651 int best_path_index = numFrames % (X265_BFRAME_MAX + 1);
652
653 /* Perform the frametype analysis. */
654 for (int j = 2; j <= numFrames; j++)
655 {
656 slicetypePath(frames, j, best_paths);
657 }
658
/* number of leading 'B' characters = B frames in the first mini-GOP */
659 numBFrames = (int)strspn(best_paths[best_path_index], "B");
660
661 /* Load the results of the analysis into the frame types. */
662 for (int j = 1; j < numFrames; j++)
663 {
664 frames[j]->sliceType = best_paths[best_path_index][j - 1] == 'B' ? X265_TYPE_B : X265_TYPE_P;
665 }
666 }
667 frames[numFrames]->sliceType = X265_TYPE_P;
668 }
669 else if (m_param->bFrameAdaptive == X265_B_ADAPT_FAST)
670 {
671 int64_t cost1p0, cost2p0, cost1b1, cost2p1;
672
/* i is the last frame whose type is settled; each pass decides i + 1 and
 * possibly further frames */
673 for (int i = 0; i <= numFrames - 2; )
674 {
675 cost2p1 = m_est.estimateFrameCost(frames, i + 0, i + 2, i + 2, 1);
/* more than half of the CUs of frame i + 2 are intra: use P for both */
676 if (frames[i + 2]->intraMbs[2] > cuCount / 2)
677 {
678 frames[i + 1]->sliceType = X265_TYPE_P;
679 frames[i + 2]->sliceType = X265_TYPE_P;
680 i += 2;
681 continue;
682 }
683
684 cost1b1 = m_est.estimateFrameCost(frames, i + 0, i + 2, i + 1, 0);
685 cost1p0 = m_est.estimateFrameCost(frames, i + 0, i + 1, i + 1, 0);
686 cost2p0 = m_est.estimateFrameCost(frames, i + 1, i + 2, i + 2, 0);
687
/* compare coding the pair as P,P against B,P */
688 if (cost1p0 + cost2p0 < cost1b1 + cost2p1)
689 {
690 frames[i + 1]->sliceType = X265_TYPE_P;
691 i += 1;
692 continue;
693 }
694
695// arbitrary and untuned
696#define INTER_THRESH 300
697#define P_SENS_BIAS (50 - m_param->bFrameBias)
698 frames[i + 1]->sliceType = X265_TYPE_B;
699
/* keep extending the run of B frames while the following P frame stays
 * cheap enough and not too intra-heavy */
700 int j;
701 for (j = i + 2; j <= X265_MIN(i + m_param->bframes, numFrames - 1); j++)
702 {
703 int64_t pthresh = X265_MAX(INTER_THRESH - P_SENS_BIAS * (j - i - 1), INTER_THRESH / 10);
704 int64_t pcost = m_est.estimateFrameCost(frames, i + 0, j + 1, j + 1, 1);
705 if (pcost > pthresh * cuCount || frames[j + 1]->intraMbs[j - i + 1] > cuCount / 3)
706 break;
707 frames[j]->sliceType = X265_TYPE_B;
708 }
709
710 frames[j]->sliceType = X265_TYPE_P;
711 i = j;
712 }
713 frames[numFrames]->sliceType = X265_TYPE_P;
/* count the B frames at the head of the sequence */
714 numBFrames = 0;
715 while (numBFrames < numFrames && frames[numBFrames + 1]->sliceType == X265_TYPE_B)
716 {
717 numBFrames++;
718 }
719 }
720 else
721 {
/* no adaptive decision: fixed pattern of numBFrames B frames per P frame */
722 numBFrames = X265_MIN(numFrames - 1, m_param->bframes);
723 for (int j = 1; j < numFrames; j++)
724 {
725 frames[j]->sliceType = (j % (numBFrames + 1)) ? X265_TYPE_B : X265_TYPE_P;
726 }
727
728 frames[numFrames]->sliceType = X265_TYPE_P;
729 }
730 /* Check scenecut on the first minigop. */
731 for (int j = 1; j < numBFrames + 1; j++)
732 {
733 if (m_param->scenecutThreshold && scenecut(frames, j, j + 1, false, origNumFrames, maxSearch))
734 {
735 frames[j]->sliceType = X265_TYPE_P;
736 numAnalyzed = j;
737 break;
738 }
739 }
740
/* first frame index whose tentative type is discarded again at the end */
741 resetStart = bKeyframe ? 1 : X265_MIN(numBFrames + 2, numAnalyzed + 1);
742 }
743 else
744 {
/* B frames disabled: everything is tentatively P */
745 for (int j = 1; j <= numFrames; j++)
746 {
747 frames[j]->sliceType = X265_TYPE_P;
748 }
749
750 resetStart = bKeyframe ? 1 : 2;
751 }
752
753 if (m_param->rc.cuTree)
754 cuTree(frames, X265_MIN(numFrames, m_param->keyframeMax), bKeyframe);
755
/* mark the frames that fall on a keyframe boundary as I, one every
 * keyframeMax frames starting just past keyintLimit */
756 // if (!param->bIntraRefresh)
757 for (int j = keyintLimit + 1; j <= numFrames; j += m_param->keyframeMax)
758 {
759 frames[j]->sliceType = X265_TYPE_I;
760 resetStart = X265_MIN(resetStart, j + 1);
761 }
762
763 if (bIsVbvLookahead)
764 vbvLookahead(frames, numFrames, bKeyframe);
765
766 /* Restore frametypes for all frames that haven't actually been decided yet. */
767 for (int j = resetStart; j <= numFrames; j++)
768 {
769 frames[j]->sliceType = X265_TYPE_AUTO;
770 }
771}
772
773bool Lookahead::scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames, int maxSearch)
774{
775 /* Only do analysis during a normal scenecut check. */
776 if (bRealScenecut && m_param->bframes)
777 {
778 int origmaxp1 = p0 + 1;
779 /* Look ahead to avoid coding short flashes as scenecuts. */
780 if (m_param->bFrameAdaptive == X265_B_ADAPT_TRELLIS)
781 /* Don't analyse any more frames than the trellis would have covered. */
782 origmaxp1 += m_param->bframes;
783 else
784 origmaxp1++;
785 int maxp1 = X265_MIN(origmaxp1, numFrames);
786
787 /* Where A and B are scenes: AAAAAABBBAAAAAA
788 * If BBB is shorter than (maxp1-p0), it is detected as a flash
789 * and not considered a scenecut. */
790 for (int cp1 = p1; cp1 <= maxp1; cp1++)
791 {
792 if (!scenecutInternal(frames, p0, cp1, false))
793 /* Any frame in between p0 and cur_p1 cannot be a real scenecut. */
794 for (int i = cp1; i > p0; i--)
795 {
796 frames[i]->bScenecut = false;
797 }
798 }
799
800 /* Where A-F are scenes: AAAAABBCCDDEEFFFFFF
801 * If each of BB ... EE are shorter than (maxp1-p0), they are
802 * detected as flashes and not considered scenecuts.
803 * Instead, the first F frame becomes a scenecut.
804 * If the video ends before F, no frame becomes a scenecut. */
805 for (int cp0 = p0; cp0 <= maxp1; cp0++)
806 {
807 if (origmaxp1 > maxSearch || (cp0 < maxp1 && scenecutInternal(frames, cp0, maxp1, false)))
808 /* If cur_p0 is the p0 of a scenecut, it cannot be the p1 of a scenecut. */
809 frames[cp0]->bScenecut = false;
810 }
811 }
812
813 /* Ignore frames that are part of a flash, i.e. cannot be real scenecuts. */
814 if (!frames[p1]->bScenecut)
815 return false;
816 return scenecutInternal(frames, p0, p1, bRealScenecut);
817}
818
819bool Lookahead::scenecutInternal(Lowres **frames, int p0, int p1, bool bRealScenecut)
820{
821 Lowres *frame = frames[p1];
822
823 m_est.estimateFrameCost(frames, p0, p1, p1, 0);
824
825 int64_t icost = frame->costEst[0][0];
826 int64_t pcost = frame->costEst[p1 - p0][0];
827 int gopSize = frame->frameNum - m_lastKeyframe;
828 float threshMax = (float)(m_param->scenecutThreshold / 100.0);
829
830 /* magic numbers pulled out of thin air */
831 float threshMin = (float)(threshMax * 0.25);
832 float bias;
833
834 if (m_param->keyframeMin == m_param->keyframeMax)
835 threshMin = threshMax;
836 if (gopSize <= m_param->keyframeMin / 4)
837 bias = threshMin / 4;
838 else if (gopSize <= m_param->keyframeMin)
839 bias = threshMin * gopSize / m_param->keyframeMin;
840 else
841 {
842 bias = threshMin
843 + (threshMax - threshMin)
844 * (gopSize - m_param->keyframeMin)
845 / (m_param->keyframeMax - m_param->keyframeMin);
846 }
847
848 bool res = pcost >= (1.0 - bias) * icost;
849 if (res && bRealScenecut)
850 {
851 int imb = frame->intraMbs[p1 - p0];
852 int pmb = NUM_CUS - imb;
853 x265_log(m_param, X265_LOG_DEBUG, "scene cut at %d Icost:%d Pcost:%d ratio:%.4f bias:%.4f gop:%d (imb:%d pmb:%d)\n",
854 frame->frameNum, icost, pcost, 1. - (double)pcost / icost, bias, gopSize, imb, pmb);
855 }
856 return res;
857}
858
859void Lookahead::slicetypePath(Lowres **frames, int length, char(*best_paths)[X265_LOOKAHEAD_MAX + 1])
860{
861 char paths[2][X265_LOOKAHEAD_MAX + 1];
862 int num_paths = X265_MIN(m_param->bframes + 1, length);
863 int64_t best_cost = 1LL << 62;
864 int idx = 0;
865
866 /* Iterate over all currently possible paths */
867 for (int path = 0; path < num_paths; path++)
868 {
869 /* Add suffixes to the current path */
870 int len = length - (path + 1);
871 memcpy(paths[idx], best_paths[len % (X265_BFRAME_MAX + 1)], len);
872 memset(paths[idx] + len, 'B', path);
873 strcpy(paths[idx] + len + path, "P");
874
875 /* Calculate the actual cost of the current path */
876 int64_t cost = slicetypePathCost(frames, paths[idx], best_cost);
877 if (cost < best_cost)
878 {
879 best_cost = cost;
880 idx ^= 1;
881 }
882 }
883
884 /* Store the best path. */
885 memcpy(best_paths[length % (X265_BFRAME_MAX + 1)], paths[idx ^ 1], length);
886}
887
888int64_t Lookahead::slicetypePathCost(Lowres **frames, char *path, int64_t threshold)
889{
890 int64_t cost = 0;
891 int loc = 1;
892 int cur_p = 0;
893
894 path--; /* Since the 1st path element is really the second frame */
895 while (path[loc])
896 {
897 int next_p = loc;
898 /* Find the location of the next P-frame. */
899 while (path[next_p] != 'P')
900 {
901 next_p++;
902 }
903
904 /* Add the cost of the P-frame found above */
905 cost += m_est.estimateFrameCost(frames, cur_p, next_p, next_p, 0);
906 /* Early terminate if the cost we have found is larger than the best path cost so far */
907 if (cost > threshold)
908 break;
909
910 if (m_param->bBPyramid && next_p - cur_p > 2)
911 {
912 int middle = cur_p + (next_p - cur_p) / 2;
913 cost += m_est.estimateFrameCost(frames, cur_p, next_p, middle, 0);
914 for (int next_b = loc; next_b < middle && cost < threshold; next_b++)
915 {
916 cost += m_est.estimateFrameCost(frames, cur_p, middle, next_b, 0);
917 }
918
919 for (int next_b = middle + 1; next_b < next_p && cost < threshold; next_b++)
920 {
921 cost += m_est.estimateFrameCost(frames, middle, next_p, next_b, 0);
922 }
923 }
924 else
925 {
926 for (int next_b = loc; next_b < next_p && cost < threshold; next_b++)
927 {
928 cost += m_est.estimateFrameCost(frames, cur_p, next_p, next_b, 0);
929 }
930 }
931
932 loc = next_p + 1;
933 cur_p = next_p;
934 }
935
936 return cost;
937}
938
/* Runs the CU-tree pass over frames[0..numframes]: walking backwards from the
 * last non-B frame, it estimates frame costs and calls estimateCUPropagate()
 * for each frame, then calls cuTreeFinish() on the earliest non-B frame
 * reached.
 *
 * frames     frames[0] is the last decided non-B frame, followed by the
 *            frames with (tentative) slice types
 * numframes  index of the last frame to include
 * bIntra     when true the walk goes all the way down to frames[0] and the
 *            intra cost of frames[0] is estimated first; otherwise the walk
 *            stops at index 1 */
939void Lookahead::cuTree(Lowres **frames, int numframes, bool bIntra)
940{
/* lowest frame index processed by the backward walk */
941 int idx = !bIntra;
942 int lastnonb, curnonb = 1;
943 int bframes = 0;
944
945 x265_emms();
/* every iteration adds the same fpsDenom / fpsNum, so averageDuration below
 * is the constant frame duration (up to floating-point rounding) */
946 double totalDuration = 0.0;
947 for (int j = 0; j <= numframes; j++)
948 totalDuration += (double)m_param->fpsDenom / m_param->fpsNum;
949
950 double averageDuration = totalDuration / (numframes + 1);
951
952 int i = numframes;
953 int cuCount = m_widthInCU * m_heightInCU;
954
955 if (bIntra)
956 m_est.estimateFrameCost(frames, 0, 0, 0, 0);
957
/* skip trailing B frames: start from the last non-B frame */
958 while (i > 0 && frames[i]->sliceType == X265_TYPE_B)
959 i--;
960
961 lastnonb = i;
962
963 /* Lookaheadless MB-tree is not a theoretically distinct case; the same extrapolation could
964 * be applied to the end of a lookahead buffer of any size. However, it's most needed when
965 * lookahead=0, so that's what's currently implemented. */
966 if (!m_param->lookaheadDepth)
967 {
968 if (bIntra)
969 {
/* no lookahead and intra: nothing propagates; the cutree offsets are simply
 * the AQ offsets */
970 memset(frames[0]->propagateCost, 0, cuCount * sizeof(uint16_t));
971 memcpy(frames[0]->qpCuTreeOffset, frames[0]->qpAqOffset, cuCount * sizeof(double));
972 return;
973 }
974 std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost);
975 memset(frames[0]->propagateCost, 0, cuCount * sizeof(uint16_t));
976 }
977 else
978 {
979 if (lastnonb < idx)
980 return;
981 memset(frames[lastnonb]->propagateCost, 0, cuCount * sizeof(uint16_t));
982 }
983
/* backward walk, one mini-GOP (curnonb .. lastnonb) per iteration */
984 while (i-- > idx)
985 {
986 curnonb = i;
987 while (frames[curnonb]->sliceType == X265_TYPE_B && curnonb > 0)
988 curnonb--;
989
990 if (curnonb < idx)
991 break;
992
993 m_est.estimateFrameCost(frames, curnonb, lastnonb, lastnonb, 0);
994 memset(frames[curnonb]->propagateCost, 0, cuCount * sizeof(uint16_t));
/* number of frames strictly between the two non-B frames */
995 bframes = lastnonb - curnonb - 1;
996 if (m_param->bBPyramid && bframes > 1)
997 {
/* pyramid: the middle frame is treated as referenced; the other frames use
 * it as one of their two anchors */
998 int middle = (bframes + 1) / 2 + curnonb;
999 m_est.estimateFrameCost(frames, curnonb, lastnonb, middle, 0);
1000 memset(frames[middle]->propagateCost, 0, cuCount * sizeof(uint16_t));
1001 while (i > curnonb)
1002 {
1003 int p0 = i > middle ? middle : curnonb;
1004 int p1 = i < middle ? middle : lastnonb;
1005 if (i != middle)
1006 {
1007 m_est.estimateFrameCost(frames, p0, p1, i, 0);
1008 estimateCUPropagate(frames, averageDuration, p0, p1, i, 0);
1009 }
1010 i--;
1011 }
1012
1013 estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, middle, 1);
1014 }
1015 else
1016 {
/* the in-between frames are propagated as unreferenced (last argument 0) */
1017 while (i > curnonb)
1018 {
1019 m_est.estimateFrameCost(frames, curnonb, lastnonb, i, 0);
1020 estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, i, 0);
1021 i--;
1022 }
1023 }
/* finally the non-B frame itself, as a referenced frame */
1024 estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, lastnonb, 1);
1025 lastnonb = curnonb;
1026 }
1027
1028 if (!m_param->lookaheadDepth)
1029 {
/* swaps the propagateCost buffers back after the no-lookahead pass (mirrors
 * the swap performed before the walk) */
1030 m_est.estimateFrameCost(frames, 0, lastnonb, lastnonb, 0);
1031 estimateCUPropagate(frames, averageDuration, 0, lastnonb, lastnonb, 1);
1032 std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost);
1033 }
1034
1035 cuTreeFinish(frames[lastnonb], averageDuration, lastnonb);
/* with pyramid (and no VBV) also finish the middle frame of the last
 * mini-GOP processed; 'bframes' still holds that mini-GOP's count */
1036 if (m_param->bBPyramid && bframes > 1 && !m_param->rc.vbvBufferSize)
1037 cuTreeFinish(frames[lastnonb + (bframes + 1) / 2], averageDuration, 0);
1038}
1039
1040void Lookahead::estimateCUPropagate(Lowres **frames, double averageDuration, int p0, int p1, int b, int referenced)
1041{
1042 uint16_t *refCosts[2] = { frames[p0]->propagateCost, frames[p1]->propagateCost };
1043 int32_t distScaleFactor = (((b - p0) << 8) + ((p1 - p0) >> 1)) / (p1 - p0);
1044 int32_t bipredWeight = m_param->bEnableWeightedBiPred ? 64 - (distScaleFactor >> 2) : 32;
1045 MV *mvs[2] = { frames[b]->lowresMvs[0][b - p0 - 1], frames[b]->lowresMvs[1][p1 - b - 1] };
1046 int32_t bipredWeights[2] = { bipredWeight, 64 - bipredWeight };
1047
1048 memset(m_scratch, 0, m_widthInCU * sizeof(int));
1049
1050 uint16_t *propagateCost = frames[b]->propagateCost;
1051
1052 x265_emms();
1053 double fpsFactor = CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) / CLIP_DURATION(averageDuration);
1054
1055 /* For non-refferd frames the source costs are always zero, so just memset one row and re-use it. */
1056 if (!referenced)
1057 memset(frames[b]->propagateCost, 0, m_widthInCU * sizeof(uint16_t));
1058
1059 int32_t StrideInCU = m_widthInCU;
1060 for (uint16_t blocky = 0; blocky < m_heightInCU; blocky++)
1061 {
1062 int cuIndex = blocky * StrideInCU;
1063 primitives.propagateCost(m_scratch, propagateCost,
1064 frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
1065 frames[b]->invQscaleFactor + cuIndex, &fpsFactor, m_widthInCU);
1066
1067 if (referenced)
1068 propagateCost += m_widthInCU;
1069 for (uint16_t blockx = 0; blockx < m_widthInCU; blockx++, cuIndex++)
1070 {
1071 int32_t propagate_amount = m_scratch[blockx];
1072 /* Don't propagate for an intra block. */
1073 if (propagate_amount > 0)
1074 {
1075 /* Access width-2 bitfield. */
1076 int32_t lists_used = frames[b]->lowresCosts[b - p0][p1 - b][cuIndex] >> LOWRES_COST_SHIFT;
1077 /* Follow the MVs to the previous frame(s). */
1078 for (uint16_t list = 0; list < 2; list++)
1079 {
1080 if ((lists_used >> list) & 1)
1081 {
1082#define CLIP_ADD(s, x) (s) = (uint16_t)X265_MIN((s) + (x), (1 << 16) - 1)
1083 int32_t listamount = propagate_amount;
1084 /* Apply bipred weighting. */
1085 if (lists_used == 3)
1086 listamount = (listamount * bipredWeights[list] + 32) >> 6;
1087
1088 /* Early termination for simple case of mv0. */
1089 if (!mvs[list][cuIndex].word)
1090 {
1091 CLIP_ADD(refCosts[list][cuIndex], listamount);
1092 continue;
1093 }
1094
1095 int32_t x = mvs[list][cuIndex].x;
1096 int32_t y = mvs[list][cuIndex].y;
1097 int32_t cux = (x >> 5) + blockx;
1098 int32_t cuy = (y >> 5) + blocky;
1099 int32_t idx0 = cux + cuy * StrideInCU;
1100 int32_t idx1 = idx0 + 1;
1101 int32_t idx2 = idx0 + StrideInCU;
1102 int32_t idx3 = idx0 + StrideInCU + 1;
1103 x &= 31;
1104 y &= 31;
1105 int32_t idx0weight = (32 - y) * (32 - x);
1106 int32_t idx1weight = (32 - y) * x;
1107 int32_t idx2weight = y * (32 - x);
1108 int32_t idx3weight = y * x;
1109
1110 /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't
1111 * be counted. */
1112 if (cux < m_widthInCU - 1 && cuy < m_heightInCU - 1 && cux >= 0 && cuy >= 0)
1113 {
1114 CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10);
1115 CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10);
1116 CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10);
1117 CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10);
1118 }
1119 else /* Check offsets individually */
1120 {
1121 if (cux < m_widthInCU && cuy < m_heightInCU && cux >= 0 && cuy >= 0)
1122 CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10);
1123 if (cux + 1 < m_widthInCU && cuy < m_heightInCU && cux + 1 >= 0 && cuy >= 0)
1124 CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10);
1125 if (cux < m_widthInCU && cuy + 1 < m_heightInCU && cux >= 0 && cuy + 1 >= 0)
1126 CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10);
1127 if (cux + 1 < m_widthInCU && cuy + 1 < m_heightInCU && cux + 1 >= 0 && cuy + 1 >= 0)
1128 CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10);
1129 }
1130 }
1131 }
1132 }
1133 }
1134 }
1135
1136 if (m_param->rc.vbvBufferSize && m_param->lookaheadDepth && referenced)
1137 cuTreeFinish(frames[b], averageDuration, b == p1 ? b - p0 : 0);
1138}
1139
1140void Lookahead::cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance)
1141{
1142 int fpsFactor = (int)(CLIP_DURATION(averageDuration) / CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) * 256);
1143 double weightdelta = 0.0;
1144
1145 if (ref0Distance && frame->weightedCostDelta[ref0Distance - 1] > 0)
1146 weightdelta = (1.0 - frame->weightedCostDelta[ref0Distance - 1]);
1147
1148 /* Allow the strength to be adjusted via qcompress, since the two
1149 * concepts are very similar. */
1150
1151 int cuCount = m_widthInCU * m_heightInCU;
1152 double strength = 5.0 * (1.0 - m_param->rc.qCompress);
1153
1154 for (int cuIndex = 0; cuIndex < cuCount; cuIndex++)
1155 {
1156 int intracost = (frame->intraCost[cuIndex] * frame->invQscaleFactor[cuIndex] + 128) >> 8;
1157 if (intracost)
1158 {
1159 int propagateCost = (frame->propagateCost[cuIndex] * fpsFactor + 128) >> 8;
1160 double log2_ratio = X265_LOG2(intracost + propagateCost) - X265_LOG2(intracost) + weightdelta;
1161 frame->qpCuTreeOffset[cuIndex] = frame->qpAqOffset[cuIndex] - strength * log2_ratio;
1162 }
1163 }
1164}
1165
1166/* If MB-tree changes the quantizers, we need to recalculate the frame cost without
1167 * re-running lookahead. */
1168int64_t Lookahead::frameCostRecalculate(Lowres** frames, int p0, int p1, int b)
1169{
1170 int64_t score = 0;
1171 int *rowSatd = frames[b]->rowSatds[b - p0][p1 - b];
1172 double *qp_offset = (frames[b]->sliceType == X265_TYPE_B) ? frames[b]->qpAqOffset : frames[b]->qpCuTreeOffset;
1173
1174 x265_emms();
1175 for (int cuy = m_heightInCU - 1; cuy >= 0; cuy--)
1176 {
1177 rowSatd[cuy] = 0;
1178 for (int cux = m_widthInCU - 1; cux >= 0; cux--)
1179 {
1180 int cuxy = cux + cuy * m_widthInCU;
1181 int cuCost = frames[b]->lowresCosts[b - p0][p1 - b][cuxy] & LOWRES_COST_MASK;
1182 double qp_adj = qp_offset[cuxy];
1183 cuCost = (cuCost * x265_exp2fix8(qp_adj) + 128) >> 8;
1184 rowSatd[cuy] += cuCost;
1185 if ((cuy > 0 && cuy < m_heightInCU - 1 &&
1186 cux > 0 && cux < m_widthInCU - 1) ||
1187 m_widthInCU <= 2 || m_heightInCU <= 2)
1188 {
1189 score += cuCost;
1190 }
1191 }
1192 }
1193
1194 return score;
1195}
1196
1197CostEstimate::CostEstimate(ThreadPool *p)
1198 : WaveFront(p)
1199{
1200 m_param = NULL;
1201 m_curframes = NULL;
1202 m_wbuffer[0] = m_wbuffer[1] = m_wbuffer[2] = m_wbuffer[3] = 0;
1203 m_rows = NULL;
1204 m_paddedLines = m_widthInCU = m_heightInCU = 0;
1205 m_bDoSearch[0] = m_bDoSearch[1] = false;
1206 m_curb = m_curp0 = m_curp1 = 0;
1207 m_bFrameCompleted = false;
1208}
1209
1210CostEstimate::~CostEstimate()
1211{
1212 for (int i = 0; i < 4; i++)
1213 {
1214 x265_free(m_wbuffer[i]);
1215 }
1216
1217 delete[] m_rows;
1218}
1219
1220void CostEstimate::init(x265_param *_param, Frame *curFrame)
1221{
1222 m_param = _param;
1223 m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
1224 m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
1225
1226 m_rows = new EstimateRow[m_heightInCU];
1227 for (int i = 0; i < m_heightInCU; i++)
1228 {
1229 m_rows[i].m_widthInCU = m_widthInCU;
1230 m_rows[i].m_heightInCU = m_heightInCU;
1231 m_rows[i].m_param = m_param;
1232 }
1233
1234 if (WaveFront::init(m_heightInCU))
1235 WaveFront::enableAllRows();
1236 else
1237 m_pool = NULL;
1238
1239 if (m_param->bEnableWeightedPred)
1240 {
1241 PicYuv *orig = curFrame->m_origPicYuv;
1242 m_paddedLines = curFrame->m_lowres.lines + 2 * orig->m_lumaMarginY;
1243 intptr_t padoffset = curFrame->m_lowres.lumaStride * orig->m_lumaMarginY + orig->m_lumaMarginX;
1244
1245 /* allocate weighted lowres buffers */
1246 for (int i = 0; i < 4; i++)
1247 {
1248 m_wbuffer[i] = (pixel*)x265_malloc(sizeof(pixel) * (curFrame->m_lowres.lumaStride * m_paddedLines));
1249 m_weightedRef.lowresPlane[i] = m_wbuffer[i] + padoffset;
1250 }
1251
1252 m_weightedRef.fpelPlane = m_weightedRef.lowresPlane[0];
1253 m_weightedRef.lumaStride = curFrame->m_lowres.lumaStride;
1254 m_weightedRef.isLowres = true;
1255 m_weightedRef.isWeighted = false;
1256 }
1257}
1258
1259int64_t CostEstimate::estimateFrameCost(Lowres **frames, int p0, int p1, int b, bool bIntraPenalty)
1260{
1261 int64_t score = 0;
1262 Lowres *fenc = frames[b];
1263
1264 if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1 - b][0] != -1)
1265 score = fenc->costEst[b - p0][p1 - b];
1266 else
1267 {
1268 m_weightedRef.isWeighted = false;
1269 if (m_param->bEnableWeightedPred && b == p1 && b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF)
1270 {
1271 if (!fenc->bIntraCalculated)
1272 estimateFrameCost(frames, b, b, b, 0);
1273 weightsAnalyse(frames, b, p0);
1274 }
1275
1276 /* For each list, check to see whether we have lowres motion-searched this reference */
1277 m_bDoSearch[0] = b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF;
1278 m_bDoSearch[1] = b != p1 && fenc->lowresMvs[1][p1 - b - 1][0].x == 0x7FFF;
1279
1280 if (m_bDoSearch[0]) fenc->lowresMvs[0][b - p0 - 1][0].x = 0;
1281 if (m_bDoSearch[1]) fenc->lowresMvs[1][p1 - b - 1][0].x = 0;
1282
1283 m_curb = b;
1284 m_curp0 = p0;
1285 m_curp1 = p1;
1286 m_curframes = frames;
1287 fenc->costEst[b - p0][p1 - b] = 0;
1288 fenc->costEstAq[b - p0][p1 - b] = 0;
1289
1290 for (int i = 0; i < m_heightInCU; i++)
1291 {
1292 m_rows[i].init();
1293 m_rows[i].m_me.setSourcePlane(fenc->lowresPlane[0], fenc->lumaStride);
1294 if (!fenc->bIntraCalculated)
1295 fenc->rowSatds[0][0][i] = 0;
1296 fenc->rowSatds[b - p0][p1 - b][i] = 0;
1297 }
1298
1299 m_bFrameCompleted = false;
1300
1301 if (m_pool)
1302 {
1303 WaveFront::enqueue();
1304
1305 // enableAllRows must be already called
1306 enqueueRow(0);
1307 while (!m_bFrameCompleted)
1308 WaveFront::findJob(-1);
1309
1310 WaveFront::dequeue();
1311 }
1312 else
1313 {
1314 for (int row = 0; row < m_heightInCU; row++)
1315 processRow(row, -1);
1316
1317 x265_emms();
1318 }
1319
1320 // Accumulate cost from each row
1321 for (int row = 0; row < m_heightInCU; row++)
1322 {
1323 score += m_rows[row].m_costEst;
1324 fenc->costEst[0][0] += m_rows[row].m_costIntra;
1325 if (m_param->rc.aqMode)
1326 {
1327 fenc->costEstAq[0][0] += m_rows[row].m_costIntraAq;
1328 fenc->costEstAq[b - p0][p1 - b] += m_rows[row].m_costEstAq;
1329 }
1330 fenc->intraMbs[b - p0] += m_rows[row].m_intraMbs;
1331 }
1332
1333 fenc->bIntraCalculated = true;
1334
1335 if (b != p1)
1336 score = (uint64_t)score * 100 / (130 + m_param->bFrameBias);
1337 if (b != p0 || b != p1) //Not Intra cost
1338 fenc->costEst[b - p0][p1 - b] = score;
1339 }
1340
1341 if (bIntraPenalty)
1342 {
1343 // arbitrary penalty for I-blocks after B-frames
1344 int ncu = NUM_CUS;
1345 score += (uint64_t)score * fenc->intraMbs[b - p0] / (ncu * 8);
1346 }
1347 return score;
1348}
1349
1350uint32_t CostEstimate::weightCostLuma(Lowres **frames, int b, int p0, WeightParam *wp)
1351{
1352 Lowres *fenc = frames[b];
1353 Lowres *ref = frames[p0];
1354 pixel *src = ref->fpelPlane;
1355 intptr_t stride = fenc->lumaStride;
1356
1357 if (wp)
1358 {
1359 int offset = wp->inputOffset << (X265_DEPTH - 8);
1360 int scale = wp->inputWeight;
1361 int denom = wp->log2WeightDenom;
1362 int round = denom ? 1 << (denom - 1) : 0;
1363 int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
1364 int widthHeight = (int)stride;
1365
1366 primitives.weight_pp(ref->buffer[0], m_wbuffer[0], stride, widthHeight, m_paddedLines,
1367 scale, round << correction, denom + correction, offset);
1368 src = m_weightedRef.fpelPlane;
1369 }
1370
1371 uint32_t cost = 0;
1372 intptr_t pixoff = 0;
1373 int mb = 0;
1374
1375 for (int y = 0; y < fenc->lines; y += 8, pixoff = y * stride)
1376 {
1377 for (int x = 0; x < fenc->width; x += 8, mb++, pixoff += 8)
1378 {
1379 int satd = primitives.satd[LUMA_8x8](src + pixoff, stride, fenc->fpelPlane + pixoff, stride);
1380 cost += X265_MIN(satd, fenc->intraCost[mb]);
1381 }
1382 }
1383
1384 return cost;
1385}
1386
1387void CostEstimate::weightsAnalyse(Lowres **frames, int b, int p0)
1388{
1389 static const float epsilon = 1.f / 128.f;
1390 Lowres *fenc, *ref;
1391
1392 fenc = frames[b];
1393 ref = frames[p0];
1394 int deltaIndex = fenc->frameNum - ref->frameNum;
1395
1396 /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
1397 float guessScale, fencMean, refMean;
1398 x265_emms();
1399 if (fenc->wp_ssd[0] && ref->wp_ssd[0])
1400 guessScale = sqrtf((float)fenc->wp_ssd[0] / ref->wp_ssd[0]);
1401 else
1402 guessScale = 1.0f;
1403 fencMean = (float)fenc->wp_sum[0] / (fenc->lines * fenc->width) / (1 << (X265_DEPTH - 8));
1404 refMean = (float)ref->wp_sum[0] / (fenc->lines * fenc->width) / (1 << (X265_DEPTH - 8));
1405
1406 /* Early termination */
1407 if (fabsf(refMean - fencMean) < 0.5f && fabsf(1.f - guessScale) < epsilon)
1408 return;
1409
1410 int minoff = 0, minscale, mindenom;
1411 unsigned int minscore = 0, origscore = 1;
1412 int found = 0;
1413
1414 m_w.setFromWeightAndOffset((int)(guessScale * 128 + 0.5f), 0, 7, true);
1415 mindenom = m_w.log2WeightDenom;
1416 minscale = m_w.inputWeight;
1417
1418 origscore = minscore = weightCostLuma(frames, b, p0, NULL);
1419
1420 if (!minscore)
1421 return;
1422
1423 unsigned int s = 0;
1424 int curScale = minscale;
1425 int curOffset = (int)(fencMean - refMean * curScale / (1 << mindenom) + 0.5f);
1426 if (curOffset < -128 || curOffset > 127)
1427 {
1428 /* Rescale considering the constraints on curOffset. We do it in this order
1429 * because scale has a much wider range than offset (because of denom), so
1430 * it should almost never need to be clamped. */
1431 curOffset = Clip3(-128, 127, curOffset);
1432 curScale = (int)((1 << mindenom) * (fencMean - curOffset) / refMean + 0.5f);
1433 curScale = Clip3(0, 127, curScale);
1434 }
1435 SET_WEIGHT(m_w, 1, curScale, mindenom, curOffset);
1436 s = weightCostLuma(frames, b, p0, &m_w);
1437 COPY4_IF_LT(minscore, s, minscale, curScale, minoff, curOffset, found, 1);
1438
1439 /* Use a smaller denominator if possible */
1440 while (mindenom > 0 && !(minscale & 1))
1441 {
1442 mindenom--;
1443 minscale >>= 1;
1444 }
1445
1446 if (!found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f)
1447 return;
1448 else
1449 {
1450 SET_WEIGHT(m_w, 1, minscale, mindenom, minoff);
1451 // set weighted delta cost
1452 fenc->weightedCostDelta[deltaIndex] = minscore / origscore;
1453
1454 int offset = m_w.inputOffset << (X265_DEPTH - 8);
1455 int scale = m_w.inputWeight;
1456 int denom = m_w.log2WeightDenom;
1457 int round = denom ? 1 << (denom - 1) : 0;
1458 int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
1459 intptr_t stride = ref->lumaStride;
1460 int widthHeight = (int)stride;
1461
1462 for (int i = 0; i < 4; i++)
1463 primitives.weight_pp(ref->buffer[i], m_wbuffer[i], stride, widthHeight, m_paddedLines,
1464 scale, round << correction, denom + correction, offset);
1465
1466 m_weightedRef.isWeighted = true;
1467 }
1468}
1469
1470void CostEstimate::processRow(int row, int /*threadId*/)
1471{
1472 int realrow = m_heightInCU - 1 - row;
1473 Lowres **frames = m_curframes;
1474 ReferencePlanes *wfref0 = m_weightedRef.isWeighted ? &m_weightedRef : frames[m_curp0];
1475
1476 /* Lowres lookahead goes backwards because the MVs are used as
1477 * predictors in the main encode. This considerably improves MV
1478 * prediction overall. */
1479 for (int i = m_widthInCU - 1 - m_rows[row].m_completed; i >= 0; i--)
1480 {
1481 // TODO: use lowres MVs as motion candidates in full-res search
1482 m_rows[row].estimateCUCost(frames, wfref0, i, realrow, m_curp0, m_curp1, m_curb, m_bDoSearch);
1483 m_rows[row].m_completed++;
1484
1485 if (m_rows[row].m_completed >= 2 && row < m_heightInCU - 1)
1486 {
1487 ScopedLock below(m_rows[row + 1].m_lock);
1488 if (m_rows[row + 1].m_active == false &&
1489 m_rows[row + 1].m_completed + 2 <= m_rows[row].m_completed)
1490 {
1491 m_rows[row + 1].m_active = true;
1492 enqueueRow(row + 1);
1493 }
1494 }
1495
1496 ScopedLock self(m_rows[row].m_lock);
1497 if (row > 0 && (int32_t)m_rows[row].m_completed < m_widthInCU - 1 &&
1498 m_rows[row - 1].m_completed < m_rows[row].m_completed + 2)
1499 {
1500 m_rows[row].m_active = false;
1501 return;
1502 }
1503 }
1504
1505 if (row == m_heightInCU - 1)
1506 m_bFrameCompleted = true;
1507}
1508
1509void EstimateRow::init()
1510{
1511 m_costEst = 0;
1512 m_costEstAq = 0;
1513 m_costIntra = 0;
1514 m_costIntraAq = 0;
1515 m_intraMbs = 0;
1516 m_active = false;
1517 m_completed = 0;
1518}
1519
1520void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2])
1521{
1522 Lowres *fref1 = frames[p1];
1523 Lowres *fenc = frames[b];
1524
1525 const int bBidir = (b < p1);
1526 const int cuXY = cux + cuy * m_widthInCU;
1527 const int cuSize = X265_LOWRES_CU_SIZE;
1528 const intptr_t pelOffset = cuSize * cux + cuSize * cuy * fenc->lumaStride;
1529
1530 // should this CU's cost contribute to the frame cost?
1531 const bool bFrameScoreCU = (cux > 0 && cux < m_widthInCU - 1 &&
1532 cuy > 0 && cuy < m_heightInCU - 1) || m_widthInCU <= 2 || m_heightInCU <= 2;
1533
1534 m_me.setSourcePU(pelOffset, cuSize, cuSize);
1535
1536 /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
1537 int lowresPenalty = 4;
1538
1539 MV(*fenc_mvs[2]) = { &fenc->lowresMvs[0][b - p0 - 1][cuXY],
1540 &fenc->lowresMvs[1][p1 - b - 1][cuXY] };
1541 int(*fenc_costs[2]) = { &fenc->lowresMvCosts[0][b - p0 - 1][cuXY],
1542 &fenc->lowresMvCosts[1][p1 - b - 1][cuXY] };
1543
1544 MV mvmin, mvmax;
1545 int bcost = m_me.COST_MAX;
1546 int listused = 0;
1547
1548 // establish search bounds that don't cross extended frame boundaries
1549 mvmin.x = (int16_t)(-cux * cuSize - 8);
1550 mvmin.y = (int16_t)(-cuy * cuSize - 8);
1551 mvmax.x = (int16_t)((m_widthInCU - cux - 1) * cuSize + 8);
1552 mvmax.y = (int16_t)((m_heightInCU - cuy - 1) * cuSize + 8);
1553
1554 if (p0 != p1)
1555 {
1556 for (int i = 0; i < 1 + bBidir; i++)
1557 {
1558 if (!bDoSearch[i])
1559 {
1560 /* Use previously calculated cost */
1561 COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1);
1562 continue;
1563 }
1564 int numc = 0;
1565 MV mvc[4], mvp;
1566 MV *fenc_mv = fenc_mvs[i];
1567
1568 /* Reverse-order MV prediction. */
1569 mvc[0] = 0;
1570 mvc[2] = 0;
1571#define MVC(mv) mvc[numc++] = mv;
1572 if (cux < m_widthInCU - 1)
1573 MVC(fenc_mv[1]);
1574 if (cuy < m_heightInCU - 1)
1575 {
1576 MVC(fenc_mv[m_widthInCU]);
1577 if (cux > 0)
1578 MVC(fenc_mv[m_widthInCU - 1]);
1579 if (cux < m_widthInCU - 1)
1580 MVC(fenc_mv[m_widthInCU + 1]);
1581 }
1582#undef MVC
1583 if (numc <= 1)
1584 mvp = mvc[0];
1585 else
1586 {
1587 median_mv(mvp, mvc[0], mvc[1], mvc[2]);
1588 }
1589
1590 *fenc_costs[i] = m_me.motionEstimate(i ? fref1 : wfref0, mvmin, mvmax, mvp, numc, mvc, m_merange, *fenc_mvs[i]);
1591 COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1);
1592 }
1593 if (bBidir)
1594 {
1595 pixel subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE], subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE];
1596 intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;
1597 pixel *src0 = wfref0->lowresMC(pelOffset, *fenc_mvs[0], subpelbuf0, stride0);
1598 pixel *src1 = fref1->lowresMC(pelOffset, *fenc_mvs[1], subpelbuf1, stride1);
1599
1600 pixel ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE];
1601 primitives.pixelavg_pp[LUMA_8x8](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
1602 int bicost = primitives.satd[LUMA_8x8](fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE);
1603 COPY2_IF_LT(bcost, bicost, listused, 3);
1604
1605 // Try 0,0 candidates
1606 src0 = wfref0->lowresPlane[0] + pelOffset;
1607 src1 = fref1->lowresPlane[0] + pelOffset;
1608 primitives.pixelavg_pp[LUMA_8x8](ref, X265_LOWRES_CU_SIZE, src0, wfref0->lumaStride, src1, fref1->lumaStride, 32);
1609 bicost = primitives.satd[LUMA_8x8](fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE);
1610 COPY2_IF_LT(bcost, bicost, listused, 3);
1611 }
1612 }
1613 if (!fenc->bIntraCalculated)
1614 {
1615 const int sizeIdx = X265_LOWRES_CU_BITS - 2; // partition size
1616
1617 pixel _above0[X265_LOWRES_CU_SIZE * 4 + 1], *const above0 = _above0 + 2 * X265_LOWRES_CU_SIZE;
1618 pixel _above1[X265_LOWRES_CU_SIZE * 4 + 1], *const above1 = _above1 + 2 * X265_LOWRES_CU_SIZE;
1619 pixel _left0[X265_LOWRES_CU_SIZE * 4 + 1], *const left0 = _left0 + 2 * X265_LOWRES_CU_SIZE;
1620 pixel _left1[X265_LOWRES_CU_SIZE * 4 + 1], *const left1 = _left1 + 2 * X265_LOWRES_CU_SIZE;
1621
1622 pixel *pix_cur = fenc->lowresPlane[0] + pelOffset;
1623
1624 // Copy Above
1625 memcpy(above0, pix_cur - 1 - fenc->lumaStride, (cuSize + 1) * sizeof(pixel));
1626
1627 // Copy Left
1628 for (int i = 0; i < cuSize + 1; i++)
1629 {
1630 left0[i] = pix_cur[-1 - fenc->lumaStride + i * fenc->lumaStride];
1631 }
1632
1633 for (int i = 0; i < cuSize; i++)
1634 {
1635 above0[cuSize + i + 1] = above0[cuSize];
1636 left0[cuSize + i + 1] = left0[cuSize];
1637 }
1638
1639 // filtering with [1 2 1]
1640 // assume getUseStrongIntraSmoothing() is disabled
1641 above1[0] = above0[0];
1642 above1[2 * cuSize] = above0[2 * cuSize];
1643 left1[0] = left0[0];
1644 left1[2 * cuSize] = left0[2 * cuSize];
1645 for (int i = 1; i < 2 * cuSize; i++)
1646 {
1647 above1[i] = (above0[i - 1] + 2 * above0[i] + above0[i + 1] + 2) >> 2;
1648 left1[i] = (left0[i - 1] + 2 * left0[i] + left0[i + 1] + 2) >> 2;
1649 }
1650
1651 int predsize = cuSize * cuSize;
1652
1653 // generate 35 intra predictions into m_predictions
1654 pixelcmp_t satd = primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)];
1655 int icost = m_me.COST_MAX, cost;
1656 primitives.intra_pred[DC_IDX][sizeIdx](m_predictions, cuSize, left0, above0, 0, (cuSize <= 16));
1657 cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
1658 if (cost < icost)
1659 icost = cost;
1660 pixel *above = (cuSize >= 8) ? above1 : above0;
1661 pixel *left = (cuSize >= 8) ? left1 : left0;
1662 primitives.intra_pred[PLANAR_IDX][sizeIdx](m_predictions, cuSize, left, above, 0, 0);
1663 cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
1664 if (cost < icost)
1665 icost = cost;
1666 primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 * predsize, above0, left0, above1, left1, (cuSize <= 16));
1667
1668 // calculate satd costs, keep least cost
1669 ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
1670 primitives.transpose[sizeIdx](buf_trans, m_me.fenc, FENC_STRIDE);
1671
1672 int acost = m_me.COST_MAX;
1673 uint32_t mode, lowmode = 4;
1674 for (mode = 5; mode < 35; mode += 5)
1675 {
1676 if (mode < 18)
1677 cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
1678 else
1679 cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize);
1680 COPY2_IF_LT(acost, cost, lowmode, mode);
1681 }
1682 for (uint32_t dist = 2; dist >= 1; dist--)
1683 {
1684 mode = lowmode - dist;
1685 if (mode < 18)
1686 cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
1687 else
1688 cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize);
1689 COPY2_IF_LT(acost, cost, lowmode, mode);
1690
1691 mode = lowmode + dist;
1692 if (mode < 18)
1693 cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
1694 else
1695 cost = satd(m_me.fenc, FENC_STRIDE, &m_predictions[mode * predsize], cuSize);
1696 COPY2_IF_LT(acost, cost, lowmode, mode);
1697 }
1698 if (acost < icost)
1699 icost = acost;
1700
1701 const int intraPenalty = 5 * m_lookAheadLambda;
1702 icost += intraPenalty + lowresPenalty; /* estimate intra signal cost */
1703 fenc->intraCost[cuXY] = icost;
1704 int icostAq = icost;
1705 if (bFrameScoreCU)
1706 {
1707 m_costIntra += icost;
1708 if (fenc->invQscaleFactor)
1709 {
1710 icostAq = (icost * fenc->invQscaleFactor[cuXY] + 128) >> 8;
1711 m_costIntraAq += icostAq;
1712 }
1713 }
1714 fenc->rowSatds[0][0][cuy] += icostAq;
1715 }
1716 bcost += lowresPenalty;
1717 if (!bBidir)
1718 {
1719 if (fenc->intraCost[cuXY] < bcost)
1720 {
1721 if (bFrameScoreCU) m_intraMbs++;
1722 bcost = fenc->intraCost[cuXY];
1723 listused = 0;
1724 }
1725 }
1726
1727 /* For I frames these costs were accumulated earlier */
1728 if (p0 != p1)
1729 {
1730 int bcostAq = bcost;
1731 if (bFrameScoreCU)
1732 {
1733 m_costEst += bcost;
1734 if (fenc->invQscaleFactor)
1735 {
1736 bcostAq = (bcost * fenc->invQscaleFactor[cuXY] + 128) >> 8;
1737 m_costEstAq += bcostAq;
1738 }
1739 }
1740 fenc->rowSatds[b - p0][p1 - b][cuy] += bcostAq;
1741 }
1742 fenc->lowresCosts[b - p0][p1 - b][cuXY] = (uint16_t)(X265_MIN(bcost, LOWRES_COST_MASK) | (listused << LOWRES_COST_SHIFT));
1743}