1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
4 * Authors: Gopu Govindaswamy <gopu@multicorewareinc.com>
5 * Steve Borho <steve@borho.org>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21 * This program is also available under a commercial proprietary license.
22 * For more information, contact us at license @ x265.com.
23 *****************************************************************************/
27 #include "framedata.h"
29 #include "primitives.h"
33 #include "slicetype.h"
35 #include "ratecontrol.h"
37 #define NUM_CUS (m_widthInCU > 2 && m_heightInCU > 2 ? (m_widthInCU - 2) * (m_heightInCU - 2) : m_widthInCU * m_heightInCU)
/* Branchless median of three 16-bit values.
 *
 * The operands are promoted to int for the arithmetic, so (x - y) >> 31 is
 * all ones when x < y and zero otherwise (this relies on a 32-bit int and an
 * arithmetic right shift of negative values).  Masking the difference with
 * that value yields min(x - y, 0), which is used to swap or clamp without
 * branching:
 *   1. order the first pair so that a >= b,
 *   2. raise b to max(b, c),
 *   3. lower b to min(a, b).
 * The value left in b is the median of the three inputs. */
static inline int16_t median(int16_t a, int16_t b, int16_t c)
{
    /* t = min(a - b, 0); applying it to both operands swaps them when a < b */
    int16_t t = (a - b) & ((a - b) >> 31);

    a -= t;
    b += t;
    /* b = max(b, c) */
    b -= (b - c) & ((b - c) >> 31);
    /* b = min(a, b) */
    b += (a - b) & ((a - b) >> 31);
    return b;
}
52 static inline void median_mv(MV
&dst
, MV a
, MV b
, MV c
)
54 dst
.x
= median(a
.x
, b
.x
, c
.x
);
55 dst
.y
= median(a
.y
, b
.y
, c
.y
);
58 Lookahead::Lookahead(x265_param
*param
, ThreadPool
* pool
)
64 m_lastKeyframe
= -m_param
->keyframeMax
;
68 m_widthInCU
= ((m_param
->sourceWidth
/ 2) + X265_LOWRES_CU_SIZE
- 1) >> X265_LOWRES_CU_BITS
;
69 m_heightInCU
= ((m_param
->sourceHeight
/ 2) + X265_LOWRES_CU_SIZE
- 1) >> X265_LOWRES_CU_BITS
;
70 m_scratch
= (int*)x265_malloc(m_widthInCU
* sizeof(int));
71 memset(m_histogram
, 0, sizeof(m_histogram
));
/* Empty destructor body; destroy() is the routine that drains the input and
 * output queues and frees m_scratch. */
74 Lookahead::~Lookahead() { }
76 void Lookahead::init()
78 if (m_pool
&& m_pool
->getThreadCount() >= 4 &&
79 ((m_param
->bFrameAdaptive
&& m_param
->bframes
) ||
80 m_param
->rc
.cuTree
|| m_param
->scenecutThreshold
||
81 (m_param
->lookaheadDepth
&& m_param
->rc
.vbvBufferSize
)))
82 m_pool
= m_pool
; /* allow use of worker thread */
84 m_pool
= NULL
; /* disable use of worker thread */
87 void Lookahead::destroy()
90 // flush will dequeue, if it is necessary
93 // these two queues will be empty unless the encode was aborted
94 while (!m_inputQueue
.empty())
96 Frame
* curFrame
= m_inputQueue
.popFront();
101 while (!m_outputQueue
.empty())
103 Frame
* curFrame
= m_outputQueue
.popFront();
108 x265_free(m_scratch
);
111 /* Called by API thread */
112 void Lookahead::addPicture(Frame
*curFrame
, int sliceType
)
114 PicYuv
*orig
= curFrame
->m_origPicYuv
;
116 curFrame
->m_lowres
.init(orig
, curFrame
->m_poc
, sliceType
);
118 m_inputQueueLock
.acquire();
119 m_inputQueue
.pushBack(*curFrame
);
121 if (m_inputQueue
.size() >= m_param
->lookaheadDepth
)
123 /* when queue fills the first time, run slicetypeDecide synchronously,
124 * since the encoder will always be blocked here */
125 if (m_pool
&& !m_bFilling
)
127 m_inputQueueLock
.release();
129 m_pool
->pokeIdleThread();
134 if (m_bFilling
&& m_pool
)
135 JobProvider::enqueue();
139 m_inputQueueLock
.release();
142 /* Called by API thread */
143 void Lookahead::flush()
145 /* just in case the input queue is never allowed to fill */
148 /* flush synchronously */
149 m_inputQueueLock
.acquire();
150 if (!m_inputQueue
.empty())
155 m_inputQueueLock
.release();
157 m_inputQueueLock
.acquire();
159 /* bFlushed indicates that an empty output queue actually means all frames
160 * have been decided (no more inputs for the encoder) */
161 if (m_inputQueue
.empty())
163 m_inputQueueLock
.release();
166 /* Called by API thread. If the lookahead queue has not yet been filled the
167 * first time, it immediately returns NULL. Else the function blocks until
168 * outputs are available and then pops the first frame from the output queue. If
169 * flush() has been called and the output queue is empty, NULL is returned. */
170 Frame
* Lookahead::getDecidedPicture()
172 m_outputQueueLock
.acquire();
176 m_outputQueueLock
.release();
180 while (m_outputQueue
.empty() && !m_bFlushed
)
182 m_outputQueueLock
.release();
183 m_outputAvailable
.wait();
184 m_outputQueueLock
.acquire();
187 Frame
*fenc
= m_outputQueue
.popFront();
188 m_outputQueueLock
.release();
192 /* Called by pool worker threads */
193 bool Lookahead::findJob(int)
195 if (m_bReady
&& ATOMIC_CAS32(&m_bReady
, 1, 0) == 1)
197 m_inputQueueLock
.acquire();
205 /* Called by rate-control to calculate the estimated SATD cost for a given
206 * picture. It assumes dpb->prepareEncode() has already been called for the
207 * picture and all the references are established */
208 void Lookahead::getEstimatedPictureCost(Frame
*curFrame
)
210 Lowres
*frames
[X265_LOOKAHEAD_MAX
];
212 // POC distances to each reference
213 Slice
*slice
= curFrame
->m_encData
->m_slice
;
215 int poc
= slice
->m_poc
;
216 int l0poc
= slice
->m_refPOCList
[0][0];
217 int l1poc
= slice
->m_refPOCList
[1][0];
219 switch (slice
->m_sliceType
)
222 frames
[p0
] = &curFrame
->m_lowres
;
227 b
= p1
= poc
- l0poc
;
228 frames
[p0
] = &slice
->m_refPicList
[0][0]->m_lowres
;
229 frames
[b
] = &curFrame
->m_lowres
;
234 p1
= b
+ l1poc
- poc
;
235 frames
[p0
] = &slice
->m_refPicList
[0][0]->m_lowres
;
236 frames
[b
] = &curFrame
->m_lowres
;
237 frames
[p1
] = &slice
->m_refPicList
[1][0]->m_lowres
;
244 if (m_param
->rc
.cuTree
&& !m_param
->rc
.bStatRead
)
245 /* update row satds based on cutree offsets */
246 curFrame
->m_lowres
.satdCost
= frameCostRecalculate(frames
, p0
, p1
, b
);
247 else if (m_param
->rc
.aqMode
)
248 curFrame
->m_lowres
.satdCost
= curFrame
->m_lowres
.costEstAq
[b
- p0
][p1
- b
];
250 curFrame
->m_lowres
.satdCost
= curFrame
->m_lowres
.costEst
[b
- p0
][p1
- b
];
252 if (m_param
->rc
.vbvBufferSize
&& m_param
->rc
.vbvMaxBitrate
)
254 /* aggregate lowres row satds to CTU resolution */
255 curFrame
->m_lowres
.lowresCostForRc
= curFrame
->m_lowres
.lowresCosts
[b
- p0
][p1
- b
];
256 uint32_t lowresRow
= 0, lowresCol
= 0, lowresCuIdx
= 0, sum
= 0;
257 uint32_t scale
= m_param
->maxCUSize
/ (2 * X265_LOWRES_CU_SIZE
);
258 uint32_t numCuInHeight
= (m_param
->sourceHeight
+ g_maxCUSize
- 1) / g_maxCUSize
;
259 uint32_t widthInLowresCu
= (uint32_t)m_widthInCU
, heightInLowresCu
= (uint32_t)m_heightInCU
;
260 double *qp_offset
= 0;
261 /* Factor in qpoffsets based on Aq/Cutree in CU costs */
262 if (m_param
->rc
.aqMode
)
263 qp_offset
= (frames
[b
]->sliceType
== X265_TYPE_B
|| !m_param
->rc
.cuTree
) ? frames
[b
]->qpAqOffset
: frames
[b
]->qpCuTreeOffset
;
265 for (uint32_t row
= 0; row
< numCuInHeight
; row
++)
267 lowresRow
= row
* scale
;
268 for (uint32_t cnt
= 0; cnt
< scale
&& lowresRow
< heightInLowresCu
; lowresRow
++, cnt
++)
271 lowresCuIdx
= lowresRow
* widthInLowresCu
;
272 for (lowresCol
= 0; lowresCol
< widthInLowresCu
; lowresCol
++, lowresCuIdx
++)
274 uint16_t lowresCuCost
= curFrame
->m_lowres
.lowresCostForRc
[lowresCuIdx
] & LOWRES_COST_MASK
;
277 lowresCuCost
= (uint16_t)((lowresCuCost
* x265_exp2fix8(qp_offset
[lowresCuIdx
]) + 128) >> 8);
278 int32_t intraCuCost
= curFrame
->m_lowres
.intraCost
[lowresCuIdx
];
279 curFrame
->m_lowres
.intraCost
[lowresCuIdx
] = (intraCuCost
* x265_exp2fix8(qp_offset
[lowresCuIdx
]) + 128) >> 8;
281 curFrame
->m_lowres
.lowresCostForRc
[lowresCuIdx
] = lowresCuCost
;
284 curFrame
->m_encData
->m_rowStat
[row
].satdForVbv
+= sum
;
290 /* called by API thread or worker thread with inputQueueLock acquired */
291 void Lookahead::slicetypeDecide()
293 ScopedLock
lock(m_decideLock
);
295 Lowres
*frames
[X265_LOOKAHEAD_MAX
];
296 Frame
*list
[X265_LOOKAHEAD_MAX
];
297 int maxSearch
= X265_MIN(m_param
->lookaheadDepth
, X265_LOOKAHEAD_MAX
);
299 memset(frames
, 0, sizeof(frames
));
300 memset(list
, 0, sizeof(list
));
302 Frame
*curFrame
= m_inputQueue
.first();
304 for (j
= 0; j
< m_param
->bframes
+ 2; j
++)
306 if (!curFrame
) break;
308 curFrame
= curFrame
->m_next
;
311 curFrame
= m_inputQueue
.first();
312 frames
[0] = m_lastNonB
;
313 for (j
= 0; j
< maxSearch
; j
++)
315 if (!curFrame
) break;
316 frames
[j
+ 1] = &curFrame
->m_lowres
;
317 curFrame
= curFrame
->m_next
;
323 m_inputQueueLock
.release();
325 if (!m_est
.m_rows
&& list
[0])
326 m_est
.init(m_param
, list
[0]);
328 if (m_lastNonB
&& !m_param
->rc
.bStatRead
&&
329 ((m_param
->bFrameAdaptive
&& m_param
->bframes
) ||
330 m_param
->rc
.cuTree
|| m_param
->scenecutThreshold
||
331 (m_param
->lookaheadDepth
&& m_param
->rc
.vbvBufferSize
)))
333 slicetypeAnalyse(frames
, false);
337 for (bframes
= 0, brefs
= 0;; bframes
++)
339 Lowres
& frm
= list
[bframes
]->m_lowres
;
341 if (frm
.sliceType
== X265_TYPE_BREF
&& !m_param
->bBPyramid
&& brefs
== m_param
->bBPyramid
)
343 frm
.sliceType
= X265_TYPE_B
;
344 x265_log(m_param
, X265_LOG_WARNING
, "B-ref at frame %d incompatible with B-pyramid\n",
348 /* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available.
349 smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it.*/
350 else if (frm
.sliceType
== X265_TYPE_BREF
&& m_param
->bBPyramid
&& brefs
&&
351 m_param
->maxNumReferences
<= (brefs
+ 3))
353 frm
.sliceType
= X265_TYPE_B
;
354 x265_log(m_param
, X265_LOG_WARNING
, "B-ref at frame %d incompatible with B-pyramid and %d reference frames\n",
355 frm
.sliceType
, m_param
->maxNumReferences
);
358 if ( /*(!param->intraRefresh || frm.frameNum == 0) && */ frm
.frameNum
- m_lastKeyframe
>= m_param
->keyframeMax
)
360 if (frm
.sliceType
== X265_TYPE_AUTO
|| frm
.sliceType
== X265_TYPE_I
)
361 frm
.sliceType
= m_param
->bOpenGOP
&& m_lastKeyframe
>= 0 ? X265_TYPE_I
: X265_TYPE_IDR
;
362 bool warn
= frm
.sliceType
!= X265_TYPE_IDR
;
363 if (warn
&& m_param
->bOpenGOP
)
364 warn
&= frm
.sliceType
!= X265_TYPE_I
;
367 x265_log(m_param
, X265_LOG_WARNING
, "specified frame type (%d) at %d is not compatible with keyframe interval\n",
368 frm
.sliceType
, frm
.frameNum
);
369 frm
.sliceType
= m_param
->bOpenGOP
&& m_lastKeyframe
>= 0 ? X265_TYPE_I
: X265_TYPE_IDR
;
372 if (frm
.sliceType
== X265_TYPE_I
&& frm
.frameNum
- m_lastKeyframe
>= m_param
->keyframeMin
)
374 if (m_param
->bOpenGOP
)
376 m_lastKeyframe
= frm
.frameNum
;
377 frm
.bKeyframe
= true;
380 frm
.sliceType
= X265_TYPE_IDR
;
382 if (frm
.sliceType
== X265_TYPE_IDR
)
385 m_lastKeyframe
= frm
.frameNum
;
386 frm
.bKeyframe
= true;
389 list
[bframes
- 1]->m_lowres
.sliceType
= X265_TYPE_P
;
393 if (bframes
== m_param
->bframes
|| !list
[bframes
+ 1])
395 if (IS_X265_TYPE_B(frm
.sliceType
))
396 x265_log(m_param
, X265_LOG_WARNING
, "specified frame type is not compatible with max B-frames\n");
397 if (frm
.sliceType
== X265_TYPE_AUTO
|| IS_X265_TYPE_B(frm
.sliceType
))
398 frm
.sliceType
= X265_TYPE_P
;
400 if (frm
.sliceType
== X265_TYPE_BREF
)
402 if (frm
.sliceType
== X265_TYPE_AUTO
)
403 frm
.sliceType
= X265_TYPE_B
;
404 else if (!IS_X265_TYPE_B(frm
.sliceType
))
409 list
[bframes
- 1]->m_lowres
.bLastMiniGopBFrame
= true;
410 list
[bframes
]->m_lowres
.leadingBframes
= bframes
;
411 m_lastNonB
= &list
[bframes
]->m_lowres
;
412 m_histogram
[bframes
]++;
414 /* insert a bref into the sequence */
415 if (m_param
->bBPyramid
&& bframes
> 1 && !brefs
)
417 list
[bframes
/ 2]->m_lowres
.sliceType
= X265_TYPE_BREF
;
421 /* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */
422 if (m_param
->rc
.rateControlMode
!= X265_RC_CQP
)
425 /* For zero latency tuning, calculate frame cost to be used later in RC */
428 for (int i
= 0; i
<= bframes
; i
++)
429 frames
[i
+ 1] = &list
[i
]->m_lowres
;
432 /* estimate new non-B cost */
433 p1
= b
= bframes
+ 1;
434 p0
= (IS_X265_TYPE_I(frames
[bframes
+ 1]->sliceType
)) ? b
: 0;
435 m_est
.estimateFrameCost(frames
, p0
, p1
, b
, 0);
440 for (b
= 1; b
<= bframes
; b
++)
442 if (frames
[b
]->sliceType
== X265_TYPE_B
)
443 for (p1
= b
; frames
[p1
]->sliceType
== X265_TYPE_B
; p1
++)
444 ; // find new nonb or bref
448 m_est
.estimateFrameCost(frames
, p0
, p1
, b
, 0);
450 if (frames
[b
]->sliceType
== X265_TYPE_BREF
)
456 m_inputQueueLock
.acquire();
458 /* dequeue all frames from inputQueue that are about to be enqueued
459 * in the output queue. The order is important because Frame can
460 * only be in one list at a time */
461 int64_t pts
[X265_BFRAME_MAX
+ 1];
462 for (int i
= 0; i
<= bframes
; i
++)
465 curFrame
= m_inputQueue
.popFront();
466 pts
[i
] = curFrame
->m_pts
;
470 m_inputQueueLock
.release();
472 m_outputQueueLock
.acquire();
473 /* add non-B to output queue */
475 list
[bframes
]->m_reorderedPts
= pts
[idx
++];
476 m_outputQueue
.pushBack(*list
[bframes
]);
478 /* Add the B-ref frame right after the P frame in the output queue; the B-ref is encoded before the non-reference B frames */
479 if (bframes
> 1 && m_param
->bBPyramid
)
481 for (int i
= 0; i
< bframes
; i
++)
483 if (list
[i
]->m_lowres
.sliceType
== X265_TYPE_BREF
)
485 list
[i
]->m_reorderedPts
= pts
[idx
++];
486 m_outputQueue
.pushBack(*list
[i
]);
491 /* add B frames to output queue */
492 for (int i
= 0; i
< bframes
; i
++)
494 /* push all the B frames into the output queue except the B-ref, which has already been pushed */
495 if (list
[i
]->m_lowres
.sliceType
!= X265_TYPE_BREF
)
497 list
[i
]->m_reorderedPts
= pts
[idx
++];
498 m_outputQueue
.pushBack(*list
[i
]);
502 bool isKeyFrameAnalyse
= (m_param
->rc
.cuTree
|| (m_param
->rc
.vbvBufferSize
&& m_param
->lookaheadDepth
)) && !m_param
->rc
.bStatRead
;
503 if (isKeyFrameAnalyse
&& IS_X265_TYPE_I(m_lastNonB
->sliceType
))
505 m_inputQueueLock
.acquire();
506 Frame
*curFrame
= m_inputQueue
.first();
507 frames
[0] = m_lastNonB
;
509 for (j
= 0; j
< maxSearch
; j
++)
511 frames
[j
+ 1] = &curFrame
->m_lowres
;
512 curFrame
= curFrame
->m_next
;
515 frames
[j
+ 1] = NULL
;
516 m_inputQueueLock
.release();
517 slicetypeAnalyse(frames
, true);
520 m_outputQueueLock
.release();
521 m_outputAvailable
.trigger();
524 void Lookahead::vbvLookahead(Lowres
**frames
, int numFrames
, int keyframe
)
526 int prevNonB
= 0, curNonB
= 1, idx
= 0;
527 bool isNextNonB
= false;
529 while (curNonB
< numFrames
&& frames
[curNonB
]->sliceType
== X265_TYPE_B
)
532 int nextNonB
= keyframe
? prevNonB
: curNonB
;
533 int nextB
= keyframe
? prevNonB
+ 1 : curNonB
+ 1;
535 while (curNonB
< numFrames
+ !keyframe
)
537 /* P/I cost: This shouldn't include the cost of nextNonB */
538 if (nextNonB
!= curNonB
)
540 int p0
= IS_X265_TYPE_I(frames
[curNonB
]->sliceType
) ? curNonB
: prevNonB
;
541 frames
[nextNonB
]->plannedSatd
[idx
] = vbvFrameCost(frames
, p0
, curNonB
, curNonB
);
542 frames
[nextNonB
]->plannedType
[idx
] = frames
[curNonB
]->sliceType
;
545 /* Handle the B-frames: coded order */
546 for (int i
= prevNonB
+ 1; i
< curNonB
; i
++, idx
++)
548 frames
[nextNonB
]->plannedSatd
[idx
] = vbvFrameCost(frames
, prevNonB
, curNonB
, i
);
549 frames
[nextNonB
]->plannedType
[idx
] = X265_TYPE_B
;
552 for (int i
= nextB
; i
<= curNonB
; i
++)
554 for (int j
= frames
[i
]->indB
+ i
+ 1; j
<= curNonB
; j
++, frames
[i
]->indB
++)
560 int p0
= IS_X265_TYPE_I(frames
[curNonB
]->sliceType
) ? curNonB
: prevNonB
;
561 frames
[i
]->plannedSatd
[frames
[i
]->indB
] = vbvFrameCost(frames
, p0
, curNonB
, curNonB
);
562 frames
[i
]->plannedType
[frames
[i
]->indB
] = frames
[curNonB
]->sliceType
;
567 frames
[i
]->plannedSatd
[frames
[i
]->indB
] = vbvFrameCost(frames
, prevNonB
, curNonB
, j
);
568 frames
[i
]->plannedType
[frames
[i
]->indB
] = X265_TYPE_B
;
571 if (i
== curNonB
&& !isNextNonB
)
577 while (curNonB
<= numFrames
&& frames
[curNonB
]->sliceType
== X265_TYPE_B
)
581 frames
[nextNonB
]->plannedType
[idx
] = X265_TYPE_AUTO
;
584 int64_t Lookahead::vbvFrameCost(Lowres
**frames
, int p0
, int p1
, int b
)
586 int64_t cost
= m_est
.estimateFrameCost(frames
, p0
, p1
, b
, 0);
588 if (m_param
->rc
.aqMode
)
590 if (m_param
->rc
.cuTree
)
591 return frameCostRecalculate(frames
, p0
, p1
, b
);
593 return frames
[b
]->costEstAq
[b
- p0
][p1
- b
];
598 void Lookahead::slicetypeAnalyse(Lowres
**frames
, bool bKeyframe
)
600 int numFrames
, origNumFrames
, keyintLimit
, framecnt
;
601 int maxSearch
= X265_MIN(m_param
->lookaheadDepth
, X265_LOOKAHEAD_MAX
);
602 int cuCount
= NUM_CUS
;
604 bool bIsVbvLookahead
= m_param
->rc
.vbvBufferSize
&& m_param
->lookaheadDepth
;
606 /* count undecided frames */
607 for (framecnt
= 0; framecnt
< maxSearch
; framecnt
++)
609 Lowres
*fenc
= frames
[framecnt
+ 1];
610 if (!fenc
|| fenc
->sliceType
!= X265_TYPE_AUTO
)
616 if (m_param
->rc
.cuTree
)
617 cuTree(frames
, 0, bKeyframe
);
621 frames
[framecnt
+ 1] = NULL
;
623 keyintLimit
= m_param
->keyframeMax
- frames
[0]->frameNum
+ m_lastKeyframe
- 1;
624 origNumFrames
= numFrames
= X265_MIN(framecnt
, keyintLimit
);
627 numFrames
= framecnt
;
628 else if (m_param
->bOpenGOP
&& numFrames
< framecnt
)
630 else if (numFrames
== 0)
632 frames
[1]->sliceType
= X265_TYPE_I
;
637 int numAnalyzed
= numFrames
;
638 if (m_param
->scenecutThreshold
&& scenecut(frames
, 0, 1, true, origNumFrames
, maxSearch
))
640 frames
[1]->sliceType
= X265_TYPE_I
;
644 if (m_param
->bframes
)
646 if (m_param
->bFrameAdaptive
== X265_B_ADAPT_TRELLIS
)
650 char best_paths
[X265_BFRAME_MAX
+ 1][X265_LOOKAHEAD_MAX
+ 1] = { "", "P" };
651 int best_path_index
= numFrames
% (X265_BFRAME_MAX
+ 1);
653 /* Perform the frametype analysis. */
654 for (int j
= 2; j
<= numFrames
; j
++)
656 slicetypePath(frames
, j
, best_paths
);
659 numBFrames
= (int)strspn(best_paths
[best_path_index
], "B");
661 /* Load the results of the analysis into the frame types. */
662 for (int j
= 1; j
< numFrames
; j
++)
664 frames
[j
]->sliceType
= best_paths
[best_path_index
][j
- 1] == 'B' ? X265_TYPE_B
: X265_TYPE_P
;
667 frames
[numFrames
]->sliceType
= X265_TYPE_P
;
669 else if (m_param
->bFrameAdaptive
== X265_B_ADAPT_FAST
)
671 int64_t cost1p0
, cost2p0
, cost1b1
, cost2p1
;
673 for (int i
= 0; i
<= numFrames
- 2; )
675 cost2p1
= m_est
.estimateFrameCost(frames
, i
+ 0, i
+ 2, i
+ 2, 1);
676 if (frames
[i
+ 2]->intraMbs
[2] > cuCount
/ 2)
678 frames
[i
+ 1]->sliceType
= X265_TYPE_P
;
679 frames
[i
+ 2]->sliceType
= X265_TYPE_P
;
684 cost1b1
= m_est
.estimateFrameCost(frames
, i
+ 0, i
+ 2, i
+ 1, 0);
685 cost1p0
= m_est
.estimateFrameCost(frames
, i
+ 0, i
+ 1, i
+ 1, 0);
686 cost2p0
= m_est
.estimateFrameCost(frames
, i
+ 1, i
+ 2, i
+ 2, 0);
688 if (cost1p0
+ cost2p0
< cost1b1
+ cost2p1
)
690 frames
[i
+ 1]->sliceType
= X265_TYPE_P
;
695 // arbitrary and untuned
696 #define INTER_THRESH 300
697 #define P_SENS_BIAS (50 - m_param->bFrameBias)
698 frames
[i
+ 1]->sliceType
= X265_TYPE_B
;
701 for (j
= i
+ 2; j
<= X265_MIN(i
+ m_param
->bframes
, numFrames
- 1); j
++)
703 int64_t pthresh
= X265_MAX(INTER_THRESH
- P_SENS_BIAS
* (j
- i
- 1), INTER_THRESH
/ 10);
704 int64_t pcost
= m_est
.estimateFrameCost(frames
, i
+ 0, j
+ 1, j
+ 1, 1);
705 if (pcost
> pthresh
* cuCount
|| frames
[j
+ 1]->intraMbs
[j
- i
+ 1] > cuCount
/ 3)
707 frames
[j
]->sliceType
= X265_TYPE_B
;
710 frames
[j
]->sliceType
= X265_TYPE_P
;
713 frames
[numFrames
]->sliceType
= X265_TYPE_P
;
715 while (numBFrames
< numFrames
&& frames
[numBFrames
+ 1]->sliceType
== X265_TYPE_B
)
722 numBFrames
= X265_MIN(numFrames
- 1, m_param
->bframes
);
723 for (int j
= 1; j
< numFrames
; j
++)
725 frames
[j
]->sliceType
= (j
% (numBFrames
+ 1)) ? X265_TYPE_B
: X265_TYPE_P
;
728 frames
[numFrames
]->sliceType
= X265_TYPE_P
;
730 /* Check scenecut on the first minigop. */
731 for (int j
= 1; j
< numBFrames
+ 1; j
++)
733 if (m_param
->scenecutThreshold
&& scenecut(frames
, j
, j
+ 1, false, origNumFrames
, maxSearch
))
735 frames
[j
]->sliceType
= X265_TYPE_P
;
741 resetStart
= bKeyframe
? 1 : X265_MIN(numBFrames
+ 2, numAnalyzed
+ 1);
745 for (int j
= 1; j
<= numFrames
; j
++)
747 frames
[j
]->sliceType
= X265_TYPE_P
;
750 resetStart
= bKeyframe
? 1 : 2;
753 if (m_param
->rc
.cuTree
)
754 cuTree(frames
, X265_MIN(numFrames
, m_param
->keyframeMax
), bKeyframe
);
756 // if (!param->bIntraRefresh)
757 for (int j
= keyintLimit
+ 1; j
<= numFrames
; j
+= m_param
->keyframeMax
)
759 frames
[j
]->sliceType
= X265_TYPE_I
;
760 resetStart
= X265_MIN(resetStart
, j
+ 1);
764 vbvLookahead(frames
, numFrames
, bKeyframe
);
766 /* Restore frametypes for all frames that haven't actually been decided yet. */
767 for (int j
= resetStart
; j
<= numFrames
; j
++)
769 frames
[j
]->sliceType
= X265_TYPE_AUTO
;
773 bool Lookahead::scenecut(Lowres
**frames
, int p0
, int p1
, bool bRealScenecut
, int numFrames
, int maxSearch
)
775 /* Only do analysis during a normal scenecut check. */
776 if (bRealScenecut
&& m_param
->bframes
)
778 int origmaxp1
= p0
+ 1;
779 /* Look ahead to avoid coding short flashes as scenecuts. */
780 if (m_param
->bFrameAdaptive
== X265_B_ADAPT_TRELLIS
)
781 /* Don't analyse any more frames than the trellis would have covered. */
782 origmaxp1
+= m_param
->bframes
;
785 int maxp1
= X265_MIN(origmaxp1
, numFrames
);
787 /* Where A and B are scenes: AAAAAABBBAAAAAA
788 * If BBB is shorter than (maxp1-p0), it is detected as a flash
789 * and not considered a scenecut. */
790 for (int cp1
= p1
; cp1
<= maxp1
; cp1
++)
792 if (!scenecutInternal(frames
, p0
, cp1
, false))
793 /* Any frame in between p0 and cur_p1 cannot be a real scenecut. */
794 for (int i
= cp1
; i
> p0
; i
--)
796 frames
[i
]->bScenecut
= false;
800 /* Where A-F are scenes: AAAAABBCCDDEEFFFFFF
801 * If each of BB ... EE are shorter than (maxp1-p0), they are
802 * detected as flashes and not considered scenecuts.
803 * Instead, the first F frame becomes a scenecut.
804 * If the video ends before F, no frame becomes a scenecut. */
805 for (int cp0
= p0
; cp0
<= maxp1
; cp0
++)
807 if (origmaxp1
> maxSearch
|| (cp0
< maxp1
&& scenecutInternal(frames
, cp0
, maxp1
, false)))
808 /* If cur_p0 is the p0 of a scenecut, it cannot be the p1 of a scenecut. */
809 frames
[cp0
]->bScenecut
= false;
813 /* Ignore frames that are part of a flash, i.e. cannot be real scenecuts. */
814 if (!frames
[p1
]->bScenecut
)
816 return scenecutInternal(frames
, p0
, p1
, bRealScenecut
);
819 bool Lookahead::scenecutInternal(Lowres
**frames
, int p0
, int p1
, bool bRealScenecut
)
821 Lowres
*frame
= frames
[p1
];
823 m_est
.estimateFrameCost(frames
, p0
, p1
, p1
, 0);
825 int64_t icost
= frame
->costEst
[0][0];
826 int64_t pcost
= frame
->costEst
[p1
- p0
][0];
827 int gopSize
= frame
->frameNum
- m_lastKeyframe
;
828 float threshMax
= (float)(m_param
->scenecutThreshold
/ 100.0);
830 /* magic numbers pulled out of thin air */
831 float threshMin
= (float)(threshMax
* 0.25);
834 if (m_param
->keyframeMin
== m_param
->keyframeMax
)
835 threshMin
= threshMax
;
836 if (gopSize
<= m_param
->keyframeMin
/ 4)
837 bias
= threshMin
/ 4;
838 else if (gopSize
<= m_param
->keyframeMin
)
839 bias
= threshMin
* gopSize
/ m_param
->keyframeMin
;
843 + (threshMax
- threshMin
)
844 * (gopSize
- m_param
->keyframeMin
)
845 / (m_param
->keyframeMax
- m_param
->keyframeMin
);
848 bool res
= pcost
>= (1.0 - bias
) * icost
;
849 if (res
&& bRealScenecut
)
851 int imb
= frame
->intraMbs
[p1
- p0
];
852 int pmb
= NUM_CUS
- imb
;
853 x265_log(m_param
, X265_LOG_DEBUG
, "scene cut at %d Icost:%d Pcost:%d ratio:%.4f bias:%.4f gop:%d (imb:%d pmb:%d)\n",
854 frame
->frameNum
, icost
, pcost
, 1. - (double)pcost
/ icost
, bias
, gopSize
, imb
, pmb
);
859 void Lookahead::slicetypePath(Lowres
**frames
, int length
, char(*best_paths
)[X265_LOOKAHEAD_MAX
+ 1])
861 char paths
[2][X265_LOOKAHEAD_MAX
+ 1];
862 int num_paths
= X265_MIN(m_param
->bframes
+ 1, length
);
863 int64_t best_cost
= 1LL << 62;
866 /* Iterate over all currently possible paths */
867 for (int path
= 0; path
< num_paths
; path
++)
869 /* Add suffixes to the current path */
870 int len
= length
- (path
+ 1);
871 memcpy(paths
[idx
], best_paths
[len
% (X265_BFRAME_MAX
+ 1)], len
);
872 memset(paths
[idx
] + len
, 'B', path
);
873 strcpy(paths
[idx
] + len
+ path
, "P");
875 /* Calculate the actual cost of the current path */
876 int64_t cost
= slicetypePathCost(frames
, paths
[idx
], best_cost
);
877 if (cost
< best_cost
)
884 /* Store the best path. */
885 memcpy(best_paths
[length
% (X265_BFRAME_MAX
+ 1)], paths
[idx
^ 1], length
);
888 int64_t Lookahead::slicetypePathCost(Lowres
**frames
, char *path
, int64_t threshold
)
894 path
--; /* Since the 1st path element is really the second frame */
898 /* Find the location of the next P-frame. */
899 while (path
[next_p
] != 'P')
904 /* Add the cost of the P-frame found above */
905 cost
+= m_est
.estimateFrameCost(frames
, cur_p
, next_p
, next_p
, 0);
906 /* Early terminate if the cost we have found is larger than the best path cost so far */
907 if (cost
> threshold
)
910 if (m_param
->bBPyramid
&& next_p
- cur_p
> 2)
912 int middle
= cur_p
+ (next_p
- cur_p
) / 2;
913 cost
+= m_est
.estimateFrameCost(frames
, cur_p
, next_p
, middle
, 0);
914 for (int next_b
= loc
; next_b
< middle
&& cost
< threshold
; next_b
++)
916 cost
+= m_est
.estimateFrameCost(frames
, cur_p
, middle
, next_b
, 0);
919 for (int next_b
= middle
+ 1; next_b
< next_p
&& cost
< threshold
; next_b
++)
921 cost
+= m_est
.estimateFrameCost(frames
, middle
, next_p
, next_b
, 0);
926 for (int next_b
= loc
; next_b
< next_p
&& cost
< threshold
; next_b
++)
928 cost
+= m_est
.estimateFrameCost(frames
, cur_p
, next_p
, next_b
, 0);
939 void Lookahead::cuTree(Lowres
**frames
, int numframes
, bool bIntra
)
942 int lastnonb
, curnonb
= 1;
946 double totalDuration
= 0.0;
947 for (int j
= 0; j
<= numframes
; j
++)
948 totalDuration
+= (double)m_param
->fpsDenom
/ m_param
->fpsNum
;
950 double averageDuration
= totalDuration
/ (numframes
+ 1);
953 int cuCount
= m_widthInCU
* m_heightInCU
;
956 m_est
.estimateFrameCost(frames
, 0, 0, 0, 0);
958 while (i
> 0 && frames
[i
]->sliceType
== X265_TYPE_B
)
963 /* Lookaheadless MB-tree is not a theoretically distinct case; the same extrapolation could
964 * be applied to the end of a lookahead buffer of any size. However, it's most needed when
965 * lookahead=0, so that's what's currently implemented. */
966 if (!m_param
->lookaheadDepth
)
970 memset(frames
[0]->propagateCost
, 0, cuCount
* sizeof(uint16_t));
971 memcpy(frames
[0]->qpCuTreeOffset
, frames
[0]->qpAqOffset
, cuCount
* sizeof(double));
974 std::swap(frames
[lastnonb
]->propagateCost
, frames
[0]->propagateCost
);
975 memset(frames
[0]->propagateCost
, 0, cuCount
* sizeof(uint16_t));
981 memset(frames
[lastnonb
]->propagateCost
, 0, cuCount
* sizeof(uint16_t));
987 while (frames
[curnonb
]->sliceType
== X265_TYPE_B
&& curnonb
> 0)
993 m_est
.estimateFrameCost(frames
, curnonb
, lastnonb
, lastnonb
, 0);
994 memset(frames
[curnonb
]->propagateCost
, 0, cuCount
* sizeof(uint16_t));
995 bframes
= lastnonb
- curnonb
- 1;
996 if (m_param
->bBPyramid
&& bframes
> 1)
998 int middle
= (bframes
+ 1) / 2 + curnonb
;
999 m_est
.estimateFrameCost(frames
, curnonb
, lastnonb
, middle
, 0);
1000 memset(frames
[middle
]->propagateCost
, 0, cuCount
* sizeof(uint16_t));
1003 int p0
= i
> middle
? middle
: curnonb
;
1004 int p1
= i
< middle
? middle
: lastnonb
;
1007 m_est
.estimateFrameCost(frames
, p0
, p1
, i
, 0);
1008 estimateCUPropagate(frames
, averageDuration
, p0
, p1
, i
, 0);
1013 estimateCUPropagate(frames
, averageDuration
, curnonb
, lastnonb
, middle
, 1);
1019 m_est
.estimateFrameCost(frames
, curnonb
, lastnonb
, i
, 0);
1020 estimateCUPropagate(frames
, averageDuration
, curnonb
, lastnonb
, i
, 0);
1024 estimateCUPropagate(frames
, averageDuration
, curnonb
, lastnonb
, lastnonb
, 1);
1028 if (!m_param
->lookaheadDepth
)
1030 m_est
.estimateFrameCost(frames
, 0, lastnonb
, lastnonb
, 0);
1031 estimateCUPropagate(frames
, averageDuration
, 0, lastnonb
, lastnonb
, 1);
1032 std::swap(frames
[lastnonb
]->propagateCost
, frames
[0]->propagateCost
);
1035 cuTreeFinish(frames
[lastnonb
], averageDuration
, lastnonb
);
1036 if (m_param
->bBPyramid
&& bframes
> 1 && !m_param
->rc
.vbvBufferSize
)
1037 cuTreeFinish(frames
[lastnonb
+ (bframes
+ 1) / 2], averageDuration
, 0);
1040 void Lookahead::estimateCUPropagate(Lowres
**frames
, double averageDuration
, int p0
, int p1
, int b
, int referenced
)
1042 uint16_t *refCosts
[2] = { frames
[p0
]->propagateCost
, frames
[p1
]->propagateCost
};
1043 int32_t distScaleFactor
= (((b
- p0
) << 8) + ((p1
- p0
) >> 1)) / (p1
- p0
);
1044 int32_t bipredWeight
= m_param
->bEnableWeightedBiPred
? 64 - (distScaleFactor
>> 2) : 32;
1045 MV
*mvs
[2] = { frames
[b
]->lowresMvs
[0][b
- p0
- 1], frames
[b
]->lowresMvs
[1][p1
- b
- 1] };
1046 int32_t bipredWeights
[2] = { bipredWeight
, 64 - bipredWeight
};
1048 memset(m_scratch
, 0, m_widthInCU
* sizeof(int));
1050 uint16_t *propagateCost
= frames
[b
]->propagateCost
;
1053 double fpsFactor
= CLIP_DURATION((double)m_param
->fpsDenom
/ m_param
->fpsNum
) / CLIP_DURATION(averageDuration
);
1055 /* For non-referenced frames the source costs are always zero, so just memset one row and re-use it. */
1057 memset(frames
[b
]->propagateCost
, 0, m_widthInCU
* sizeof(uint16_t));
1059 int32_t StrideInCU
= m_widthInCU
;
1060 for (uint16_t blocky
= 0; blocky
< m_heightInCU
; blocky
++)
1062 int cuIndex
= blocky
* StrideInCU
;
1063 primitives
.propagateCost(m_scratch
, propagateCost
,
1064 frames
[b
]->intraCost
+ cuIndex
, frames
[b
]->lowresCosts
[b
- p0
][p1
- b
] + cuIndex
,
1065 frames
[b
]->invQscaleFactor
+ cuIndex
, &fpsFactor
, m_widthInCU
);
1068 propagateCost
+= m_widthInCU
;
1069 for (uint16_t blockx
= 0; blockx
< m_widthInCU
; blockx
++, cuIndex
++)
1071 int32_t propagate_amount
= m_scratch
[blockx
];
1072 /* Don't propagate for an intra block. */
1073 if (propagate_amount
> 0)
1075 /* Access width-2 bitfield. */
1076 int32_t lists_used
= frames
[b
]->lowresCosts
[b
- p0
][p1
- b
][cuIndex
] >> LOWRES_COST_SHIFT
;
1077 /* Follow the MVs to the previous frame(s). */
1078 for (uint16_t list
= 0; list
< 2; list
++)
1080 if ((lists_used
>> list
) & 1)
1082 #define CLIP_ADD(s, x) (s) = (uint16_t)X265_MIN((s) + (x), (1 << 16) - 1)
1083 int32_t listamount
= propagate_amount
;
1084 /* Apply bipred weighting. */
1085 if (lists_used
== 3)
1086 listamount
= (listamount
* bipredWeights
[list
] + 32) >> 6;
1088 /* Early termination for simple case of mv0. */
1089 if (!mvs
[list
][cuIndex
].word
)
1091 CLIP_ADD(refCosts
[list
][cuIndex
], listamount
);
1095 int32_t x
= mvs
[list
][cuIndex
].x
;
1096 int32_t y
= mvs
[list
][cuIndex
].y
;
1097 int32_t cux
= (x
>> 5) + blockx
;
1098 int32_t cuy
= (y
>> 5) + blocky
;
1099 int32_t idx0
= cux
+ cuy
* StrideInCU
;
1100 int32_t idx1
= idx0
+ 1;
1101 int32_t idx2
= idx0
+ StrideInCU
;
1102 int32_t idx3
= idx0
+ StrideInCU
+ 1;
1105 int32_t idx0weight
= (32 - y
) * (32 - x
);
1106 int32_t idx1weight
= (32 - y
) * x
;
1107 int32_t idx2weight
= y
* (32 - x
);
1108 int32_t idx3weight
= y
* x
;
1110 /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't be counted. */
1112 if (cux
< m_widthInCU
- 1 && cuy
< m_heightInCU
- 1 && cux
>= 0 && cuy
>= 0)
1114 CLIP_ADD(refCosts
[list
][idx0
], (listamount
* idx0weight
+ 512) >> 10);
1115 CLIP_ADD(refCosts
[list
][idx1
], (listamount
* idx1weight
+ 512) >> 10);
1116 CLIP_ADD(refCosts
[list
][idx2
], (listamount
* idx2weight
+ 512) >> 10);
1117 CLIP_ADD(refCosts
[list
][idx3
], (listamount
* idx3weight
+ 512) >> 10);
1119 else /* Check offsets individually */
1121 if (cux
< m_widthInCU
&& cuy
< m_heightInCU
&& cux
>= 0 && cuy
>= 0)
1122 CLIP_ADD(refCosts
[list
][idx0
], (listamount
* idx0weight
+ 512) >> 10);
1123 if (cux
+ 1 < m_widthInCU
&& cuy
< m_heightInCU
&& cux
+ 1 >= 0 && cuy
>= 0)
1124 CLIP_ADD(refCosts
[list
][idx1
], (listamount
* idx1weight
+ 512) >> 10);
1125 if (cux
< m_widthInCU
&& cuy
+ 1 < m_heightInCU
&& cux
>= 0 && cuy
+ 1 >= 0)
1126 CLIP_ADD(refCosts
[list
][idx2
], (listamount
* idx2weight
+ 512) >> 10);
1127 if (cux
+ 1 < m_widthInCU
&& cuy
+ 1 < m_heightInCU
&& cux
+ 1 >= 0 && cuy
+ 1 >= 0)
1128 CLIP_ADD(refCosts
[list
][idx3
], (listamount
* idx3weight
+ 512) >> 10);
1136 if (m_param
->rc
.vbvBufferSize
&& m_param
->lookaheadDepth
&& referenced
)
1137 cuTreeFinish(frames
[b
], averageDuration
, b
== p1
? b
- p0
: 0);
/* Lookahead::cuTreeFinish
 * Writes frame->qpCuTreeOffset[] for every lowres CU of `frame`: the AQ offset
 * (qpAqOffset) minus strength * log2_ratio, where log2_ratio compares
 * (intracost + propagateCost) against intracost and adds weightdelta.
 *   frame           - lowres frame whose qpCuTreeOffset array is written
 *   averageDuration - frame duration; clamped with CLIP_DURATION and divided by the
 *                     clamped nominal duration (fpsDenom / fpsNum)
 *   ref0Distance    - when non-zero, selects weightedCostDelta[ref0Distance - 1]
 * NOTE(review): this listing has gaps in its embedded line numbering (1141, 1155,
 * 1157-1158, ... are absent), so braces and any guarding statements on those lines
 * are not visible here - confirm against the complete file. */
1140 void Lookahead::cuTreeFinish(Lowres
*frame
, double averageDuration
, int ref0Distance
)
/* Duration ratio scaled by 256, i.e. 8 fractional bits (removed again by the >> 8 below). */
1142 int fpsFactor
= (int)(CLIP_DURATION(averageDuration
) / CLIP_DURATION((double)m_param
->fpsDenom
/ m_param
->fpsNum
) * 256);
1143 double weightdelta
= 0.0;
/* weightdelta stays 0 unless a positive weighted cost delta is stored for this distance. */
1145 if (ref0Distance
&& frame
->weightedCostDelta
[ref0Distance
- 1] > 0)
1146 weightdelta
= (1.0 - frame
->weightedCostDelta
[ref0Distance
- 1]);
1148 /* Allow the strength to be adjusted via qcompress, since the two
1149 * concepts are very similar. */
1151 int cuCount
= m_widthInCU
* m_heightInCU
;
1152 double strength
= 5.0 * (1.0 - m_param
->rc
.qCompress
);
/* One pass over every lowres CU of the frame. */
1154 for (int cuIndex
= 0; cuIndex
< cuCount
; cuIndex
++)
/* Intra cost scaled by the per-CU invQscaleFactor, rounded, 8 fractional bits dropped. */
1156 int intracost
= (frame
->intraCost
[cuIndex
] * frame
->invQscaleFactor
[cuIndex
] + 128) >> 8;
/* Propagated cost scaled by fpsFactor, rounded, 8 fractional bits dropped. */
1159 int propagateCost
= (frame
->propagateCost
[cuIndex
] * fpsFactor
+ 128) >> 8;
1160 double log2_ratio
= X265_LOG2(intracost
+ propagateCost
) - X265_LOG2(intracost
) + weightdelta
;
/* The CU-tree offset is the AQ offset reduced by strength * log2_ratio. */
1161 frame
->qpCuTreeOffset
[cuIndex
] = frame
->qpAqOffset
[cuIndex
] - strength
* log2_ratio
;
/* Lookahead::frameCostRecalculate
 * Walks the lowres CUs of frames[b] from the last row/column to the first, rescales
 * each stored cost for the (b - p0, p1 - b) reference pair by the per-CU QP offset
 * and adds it to the matching rowSatds entry.
 *   frames - lowres frame array, indexed by b
 *   p0, p1 - reference indices; used here only as the distances b - p0 and p1 - b
 *   b      - index of the frame being re-costed
 * Declared to return int64_t; the score accumulation and the return statement are
 * not visible in this listing (embedded numbers 1169-1170, 1176-1177 and everything
 * after 1187 are absent).
 * NOTE(review): rowSatd[cuy] is only seen being incremented here - presumably it is
 * reset per row on a line missing from this listing; confirm against the full file. */
1166 /* If MB-tree changes the quantizers, we need to recalculate the frame cost without
1167 * re-running lookahead. */
1168 int64_t Lookahead::frameCostRecalculate(Lowres
** frames
, int p0
, int p1
, int b
)
1171 int *rowSatd
= frames
[b
]->rowSatds
[b
- p0
][p1
- b
];
/* B frames use the plain AQ offsets; every other slice type uses the CU-tree offsets. */
1172 double *qp_offset
= (frames
[b
]->sliceType
== X265_TYPE_B
) ? frames
[b
]->qpAqOffset
: frames
[b
]->qpCuTreeOffset
;
1175 for (int cuy
= m_heightInCU
- 1; cuy
>= 0; cuy
--)
1178 for (int cux
= m_widthInCU
- 1; cux
>= 0; cux
--)
1180 int cuxy
= cux
+ cuy
* m_widthInCU
;
/* Keep only the bits selected by LOWRES_COST_MASK (the cost field of the packed entry). */
1181 int cuCost
= frames
[b
]->lowresCosts
[b
- p0
][p1
- b
][cuxy
] & LOWRES_COST_MASK
;
1182 double qp_adj
= qp_offset
[cuxy
];
/* Scale by x265_exp2fix8(qp_adj), then round and drop 8 fractional bits. */
1183 cuCost
= (cuCost
* x265_exp2fix8(qp_adj
) + 128) >> 8;
1184 rowSatd
[cuy
] += cuCost
;
/* True for interior CUs, or for every CU when the frame is at most 2 CUs wide or
 * tall (the same rule the NUM_CUS macro uses). The guarded statement is not visible
 * in this listing. */
1185 if ((cuy
> 0 && cuy
< m_heightInCU
- 1 &&
1186 cux
> 0 && cux
< m_widthInCU
- 1) ||
1187 m_widthInCU
<= 2 || m_heightInCU
<= 2)
/* CostEstimate constructor.
 *   p - thread pool pointer; how it is used is not visible in this listing
 *       (embedded numbers 1198-1201 are absent - presumably the initializer list).
 * Clears the four weighted-buffer pointers, the padded line count and CU dimensions,
 * both per-list search flags, the current frame indices and the completion flag. */
1197 CostEstimate::CostEstimate(ThreadPool
*p
)
/* Null pointers here make the x265_free calls in the destructor safe before init(). */
1202 m_wbuffer
[0] = m_wbuffer
[1] = m_wbuffer
[2] = m_wbuffer
[3] = 0;
1204 m_paddedLines
= m_widthInCU
= m_heightInCU
= 0;
1205 m_bDoSearch
[0] = m_bDoSearch
[1] = false;
1206 m_curb
= m_curp0
= m_curp1
= 0;
1207 m_bFrameCompleted
= false;
/* CostEstimate destructor: releases the four weighted lowres buffers with x265_free.
 * NOTE(review): init() allocates m_rows with new[]; its release is not visible in
 * this listing (embedded numbers 1215-1218 are absent) - confirm it is freed. */
1210 CostEstimate::~CostEstimate()
1212 for (int i
= 0; i
< 4; i
++)
1214 x265_free(m_wbuffer
[i
]);
/* CostEstimate::init
 * Sizes the estimator for the configured source resolution, allocates one EstimateRow
 * per lowres CU row, initialises the wavefront, and - when weighted prediction is
 * enabled - allocates the four weighted lowres planes.
 *   _param   - encoder parameters. NOTE(review): only m_param is read in the visible
 *              lines; presumably m_param is assigned from _param on a line missing
 *              from this listing (embedded numbers 1221-1222 are absent) - confirm.
 *   curFrame - frame whose lowres stride/line count and luma margins size the buffers */
1220 void CostEstimate::init(x265_param
*_param
, Frame
*curFrame
)
/* Lowres dimensions are half the source size, rounded up to whole lowres CUs. */
1223 m_widthInCU
= ((m_param
->sourceWidth
/ 2) + X265_LOWRES_CU_SIZE
- 1) >> X265_LOWRES_CU_BITS
;
1224 m_heightInCU
= ((m_param
->sourceHeight
/ 2) + X265_LOWRES_CU_SIZE
- 1) >> X265_LOWRES_CU_BITS
;
1226 m_rows
= new EstimateRow
[m_heightInCU
];
/* Each row gets its own copy of the frame dimensions and the parameter pointer. */
1227 for (int i
= 0; i
< m_heightInCU
; i
++)
1229 m_rows
[i
].m_widthInCU
= m_widthInCU
;
1230 m_rows
[i
].m_heightInCU
= m_heightInCU
;
1231 m_rows
[i
].m_param
= m_param
;
/* Rows are enabled only if the wavefront initialised successfully. */
1234 if (WaveFront::init(m_heightInCU
))
1235 WaveFront::enableAllRows();
1239 if (m_param
->bEnableWeightedPred
)
1241 PicYuv
*orig
= curFrame
->m_origPicYuv
;
/* Buffer height = lowres lines plus a luma margin above and below. */
1242 m_paddedLines
= curFrame
->m_lowres
.lines
+ 2 * orig
->m_lumaMarginY
;
/* Offset from the start of a padded buffer to the first pixel inside the margins. */
1243 intptr_t padoffset
= curFrame
->m_lowres
.lumaStride
* orig
->m_lumaMarginY
+ orig
->m_lumaMarginX
;
1245 /* allocate weighted lowres buffers */
1246 for (int i
= 0; i
< 4; i
++)
/* NOTE(review): the x265_malloc result is offset on the next statement with no null
 * check visible in this listing - confirm allocation failure is handled. */
1248 m_wbuffer
[i
] = (pixel
*)x265_malloc(sizeof(pixel
) * (curFrame
->m_lowres
.lumaStride
* m_paddedLines
));
1249 m_weightedRef
.lowresPlane
[i
] = m_wbuffer
[i
] + padoffset
;
/* The weighted reference starts out flagged as lowres and not weighted. */
1252 m_weightedRef
.fpelPlane
= m_weightedRef
.lowresPlane
[0];
1253 m_weightedRef
.lumaStride
= curFrame
->m_lowres
.lumaStride
;
1254 m_weightedRef
.isLowres
= true;
1255 m_weightedRef
.isWeighted
= false;
/* CostEstimate::estimateFrameCost
 * Estimates the cost of coding frames[b] with references p0 (list 0) and p1 (list 1),
 * reusing a cached result when one is stored in the frame.
 *   frames        - lowres frame array
 *   p0, p1, b     - indices into frames; p0 == p1 == b requests the intra-only estimate
 *                   (this is how the function calls itself below)
 *   bIntraPenalty - NOTE(review): its use is not visible in this listing; presumably it
 *                   guards the intra-block penalty at the end - confirm
 * Declared to return int64_t; the declaration of `score` and the return statement are
 * on lines absent from this listing (gaps in the embedded numbering, e.g. 1260-1261,
 * 1266-1267, 1282-1285, 1300-1302, 1311-1313, 1339-1342, 1346+), as are all braces and
 * else keywords, so the nesting of the statements below cannot be read off directly. */
1259 int64_t CostEstimate::estimateFrameCost(Lowres
**frames
, int p0
, int p1
, int b
, bool bIntraPenalty
)
1262 Lowres
*fenc
= frames
[b
];
/* Cached path: a non-negative costEst with a valid first row SATD (!= -1) is reused. */
1264 if (fenc
->costEst
[b
- p0
][p1
- b
] >= 0 && fenc
->rowSatds
[b
- p0
][p1
- b
][0] != -1)
1265 score
= fenc
->costEst
[b
- p0
][p1
- b
];
1268 m_weightedRef
.isWeighted
= false;
/* Weights are analysed only when b == p1 and b != p0 and the list-0 MVs still carry
 * the 0x7FFF marker in their first entry (the marker is cleared further down once a
 * search is scheduled). */
1269 if (m_param
->bEnableWeightedPred
&& b
== p1
&& b
!= p0
&& fenc
->lowresMvs
[0][b
- p0
- 1][0].x
== 0x7FFF)
/* Make sure the intra estimate exists first by recursing with p0 == p1 == b. */
1271 if (!fenc
->bIntraCalculated
)
1272 estimateFrameCost(frames
, b
, b
, b
, 0);
1273 weightsAnalyse(frames
, b
, p0
);
1276 /* For each list, check to see whether we have lowres motion-searched this reference */
1277 m_bDoSearch
[0] = b
!= p0
&& fenc
->lowresMvs
[0][b
- p0
- 1][0].x
== 0x7FFF;
1278 m_bDoSearch
[1] = b
!= p1
&& fenc
->lowresMvs
[1][p1
- b
- 1][0].x
== 0x7FFF;
/* Clear the 0x7FFF marker for each list that is about to be searched. */
1280 if (m_bDoSearch
[0]) fenc
->lowresMvs
[0][b
- p0
- 1][0].x
= 0;
1281 if (m_bDoSearch
[1]) fenc
->lowresMvs
[1][p1
- b
- 1][0].x
= 0;
/* Publish the frame array for processRow() and zero the accumulators for this pair. */
1286 m_curframes
= frames
;
1287 fenc
->costEst
[b
- p0
][p1
- b
] = 0;
1288 fenc
->costEstAq
[b
- p0
][p1
- b
] = 0;
/* Per-row setup: point each row's motion estimator at the frame and zero the row SATDs. */
1290 for (int i
= 0; i
< m_heightInCU
; i
++)
1293 m_rows
[i
].m_me
.setSourcePlane(fenc
->lowresPlane
[0], fenc
->lumaStride
);
1294 if (!fenc
->bIntraCalculated
)
1295 fenc
->rowSatds
[0][0][i
] = 0;
1296 fenc
->rowSatds
[b
- p0
][p1
- b
][i
] = 0;
1299 m_bFrameCompleted
= false;
/* Wavefront path: keep taking jobs until processRow() sets m_bFrameCompleted. */
1303 WaveFront::enqueue();
1305 // enableAllRows must be already called
1307 while (!m_bFrameCompleted
)
1308 WaveFront::findJob(-1);
1310 WaveFront::dequeue();
/* Serial path: process every row in order on the calling thread.
 * NOTE(review): the condition choosing between the two paths is not visible here. */
1314 for (int row
= 0; row
< m_heightInCU
; row
++)
1315 processRow(row
, -1);
1320 // Accumulate cost from each row
1321 for (int row
= 0; row
< m_heightInCU
; row
++)
1323 score
+= m_rows
[row
].m_costEst
;
1324 fenc
->costEst
[0][0] += m_rows
[row
].m_costIntra
;
1325 if (m_param
->rc
.aqMode
)
1327 fenc
->costEstAq
[0][0] += m_rows
[row
].m_costIntraAq
;
1328 fenc
->costEstAq
[b
- p0
][p1
- b
] += m_rows
[row
].m_costEstAq
;
1330 fenc
->intraMbs
[b
- p0
] += m_rows
[row
].m_intraMbs
;
1333 fenc
->bIntraCalculated
= true;
/* Scale the score by 100 / (130 + bFrameBias).
 * NOTE(review): embedded numbers 1334-1335 are absent, so any condition guarding this
 * scaling is not visible in this listing. */
1336 score
= (uint64_t)score
* 100 / (130 + m_param
->bFrameBias
);
/* The intra-only slot [0][0] was accumulated row by row above and is not overwritten. */
1337 if (b
!= p0
|| b
!= p1
) //Not Intra cost
1338 fenc
->costEst
[b
- p0
][p1
- b
] = score
;
1343 // arbitrary penalty for I-blocks after B-frames
/* Adds score * intraMbs / (ncu * 8); the definition of ncu is not visible here. */
1345 score
+= (uint64_t)score
* fenc
->intraMbs
[b
- p0
] / (ncu
* 8);
/* CostEstimate::weightCostLuma
 * Sums, over 8x8 blocks of frames[b], the smaller of the 8x8 SATD against the
 * reference plane and the block's stored intra cost.
 *   frames - lowres frame array
 *   b      - index of the frame being coded
 *   p0     - index of the reference frame
 *   wp     - weight parameters; when used, the reference is first weighted into
 *            m_wbuffer[0] and the SATD is taken against m_weightedRef.fpelPlane
 * Declared to return uint32_t; the declarations of `cost` and `mb` and the return
 * statement are on lines absent from this listing.
 * NOTE(review): weightsAnalyse() calls this with wp == NULL, while wp is dereferenced
 * below - presumably a null check sits on the missing lines (embedded numbers
 * 1356-1358 are absent); confirm against the full file. */
1350 uint32_t CostEstimate::weightCostLuma(Lowres
**frames
, int b
, int p0
, WeightParam
*wp
)
1352 Lowres
*fenc
= frames
[b
];
1353 Lowres
*ref
= frames
[p0
];
/* Default comparison source: the unweighted reference plane. */
1354 pixel
*src
= ref
->fpelPlane
;
1355 intptr_t stride
= fenc
->lumaStride
;
/* Offset is shifted up by (X265_DEPTH - 8) bits. */
1359 int offset
= wp
->inputOffset
<< (X265_DEPTH
- 8);
1360 int scale
= wp
->inputWeight
;
1361 int denom
= wp
->log2WeightDenom
;
/* Rounding term is half of 2^denom, or 0 when denom is 0. */
1362 int round
= denom
? 1 << (denom
- 1) : 0;
1363 int correction
= IF_INTERNAL_PREC
- X265_DEPTH
; // intermediate interpolation depth
/* The full stride is passed as the width, so the padded buffer is weighted row by row. */
1364 int widthHeight
= (int)stride
;
1366 primitives
.weight_pp(ref
->buffer
[0], m_wbuffer
[0], stride
, widthHeight
, m_paddedLines
,
1367 scale
, round
<< correction
, denom
+ correction
, offset
);
/* Compare against the freshly weighted plane instead of the original reference. */
1368 src
= m_weightedRef
.fpelPlane
;
1372 intptr_t pixoff
= 0;
/* Step through the frame in 8x8 blocks; pixoff is re-based at the start of each row. */
1375 for (int y
= 0; y
< fenc
->lines
; y
+= 8, pixoff
= y
* stride
)
1377 for (int x
= 0; x
< fenc
->width
; x
+= 8, mb
++, pixoff
+= 8)
1379 int satd
= primitives
.satd
[LUMA_8x8
](src
+ pixoff
, stride
, fenc
->fpelPlane
+ pixoff
, stride
);
/* A block never costs more than its stored intra cost. */
1380 cost
+= X265_MIN(satd
, fenc
->intraCost
[mb
]);
/* CostEstimate::weightsAnalyse
 * Chooses a luma weight/offset for predicting frames[b] from frames[p0], records the
 * relative cost of the weighted prediction, and fills the four weighted lowres planes.
 *   frames - lowres frame array
 *   b      - index of the frame being coded
 *   p0     - index of the list-0 reference frame
 * On success m_w holds the chosen weight, m_wbuffer[0..3] hold the weighted reference
 * planes and m_weightedRef.isWeighted is set.
 *
 * Fix in this revision: the ratio stored in weightedCostDelta was computed as
 * minscore / origscore with both operands unsigned int. That statement is only reached
 * when the float ratio is <= 0.998 (see the test on 1446), so the integer quotient was
 * always 0 and the "weightedCostDelta[...] > 0" test in Lookahead::cuTreeFinish could
 * never succeed. The division is now done in floating point, matching the cast already
 * used in the 1446 test.
 *
 * NOTE(review): this listing has gaps in its embedded line numbering (1390-1393,
 * 1401-1402, 1408-1409, 1412-1413, 1419-1423, 1441-1445, 1447-1449, ... are absent),
 * so the declarations of fenc, ref, s and found, the early returns and all braces are
 * not visible here. Only the one expression described above was changed. */
1387 void CostEstimate::weightsAnalyse(Lowres
**frames
, int b
, int p0
)
1389 static const float epsilon
= 1.f
/ 128.f
;
/* NOTE(review): Lookahead::cuTreeFinish reads weightedCostDelta[ref0Distance - 1] while
 * this index has no "- 1"; if frameNum differences equal the reference distance the two
 * do not line up - confirm against the callers before relying on the stored value. */
1394 int deltaIndex
= fenc
->frameNum
- ref
->frameNum
;
1396 /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
1397 float guessScale
, fencMean
, refMean
;
/* Initial scale guess: square root of the ratio of the two frames' wp_ssd[0] values. */
1399 if (fenc
->wp_ssd
[0] && ref
->wp_ssd
[0])
1400 guessScale
= sqrtf((float)fenc
->wp_ssd
[0] / ref
->wp_ssd
[0]);
/* Per-pixel means, brought back to an 8-bit scale. */
1403 fencMean
= (float)fenc
->wp_sum
[0] / (fenc
->lines
* fenc
->width
) / (1 << (X265_DEPTH
- 8));
1404 refMean
= (float)ref
->wp_sum
[0] / (fenc
->lines
* fenc
->width
) / (1 << (X265_DEPTH
- 8));
1406 /* Early termination */
1407 if (fabsf(refMean
- fencMean
) < 0.5f
&& fabsf(1.f
- guessScale
) < epsilon
)
1410 int minoff
= 0, minscale
, mindenom
;
1411 unsigned int minscore
= 0, origscore
= 1;
/* Convert the guessed scale to a weight with a 7-bit denominator (128 == 1.0). */
1414 m_w
.setFromWeightAndOffset((int)(guessScale
* 128 + 0.5f
), 0, 7, true);
1415 mindenom
= m_w
.log2WeightDenom
;
1416 minscale
= m_w
.inputWeight
;
/* Baseline: cost of the unweighted prediction. */
1418 origscore
= minscore
= weightCostLuma(frames
, b
, p0
, NULL
);
1424 int curScale
= minscale
;
/* Offset that maps the scaled reference mean onto the frame mean. */
1425 int curOffset
= (int)(fencMean
- refMean
* curScale
/ (1 << mindenom
) + 0.5f
);
1426 if (curOffset
< -128 || curOffset
> 127)
1428 /* Rescale considering the constraints on curOffset. We do it in this order
1429 * because scale has a much wider range than offset (because of denom), so
1430 * it should almost never need to be clamped. */
1431 curOffset
= Clip3(-128, 127, curOffset
);
1432 curScale
= (int)((1 << mindenom
) * (fencMean
- curOffset
) / refMean
+ 0.5f
);
1433 curScale
= Clip3(0, 127, curScale
);
/* Score the candidate and keep it if it beats the current minimum. */
1435 SET_WEIGHT(m_w
, 1, curScale
, mindenom
, curOffset
);
1436 s
= weightCostLuma(frames
, b
, p0
, &m_w
);
1437 COPY4_IF_LT(minscore
, s
, minscale
, curScale
, minoff
, curOffset
, found
, 1);
1439 /* Use a smaller denominator if possible */
1440 while (mindenom
> 0 && !(minscale
& 1))
/* Reject the weight when nothing better was found, when it is the identity weight, or
 * when it saves less than 0.2% of the unweighted cost. */
1446 if (!found
|| (minscale
== 1 << mindenom
&& minoff
== 0) || (float)minscore
/ origscore
> 0.998f
)
1450 SET_WEIGHT(m_w
, 1, minscale
, mindenom
, minoff
);
1451 // set weighted delta cost
/* Floating-point division: the integer quotient of these two unsigned values is
 * always 0 at this point. */
1452 fenc
->weightedCostDelta
[deltaIndex
] = (float)minscore
/ origscore
;
/* Same weight_pp argument preparation as in weightCostLuma. */
1454 int offset
= m_w
.inputOffset
<< (X265_DEPTH
- 8);
1455 int scale
= m_w
.inputWeight
;
1456 int denom
= m_w
.log2WeightDenom
;
1457 int round
= denom
? 1 << (denom
- 1) : 0;
1458 int correction
= IF_INTERNAL_PREC
- X265_DEPTH
; // intermediate interpolation depth
1459 intptr_t stride
= ref
->lumaStride
;
1460 int widthHeight
= (int)stride
;
/* Weight all four reference planes into the buffers allocated by init(). */
1462 for (int i
= 0; i
< 4; i
++)
1463 primitives
.weight_pp(ref
->buffer
[i
], m_wbuffer
[i
], stride
, widthHeight
, m_paddedLines
,
1464 scale
, round
<< correction
, denom
+ correction
, offset
);
1466 m_weightedRef
.isWeighted
= true;
/* CostEstimate::processRow
 * Estimates the CUs of one wavefront row, resuming at the row's m_completed count, and
 * activates the next row as this one gets far enough ahead.
 *   row      - wavefront row index; mapped to frame row m_heightInCU - 1 - row, so
 *              wavefront row 0 is the bottom row of the frame
 *   threadId - unused (second parameter is unnamed)
 * Sets m_bFrameCompleted when the last wavefront row reaches the end of this function.
 * NOTE(review): this listing has gaps in its embedded line numbering (1484, 1490,
 * 1493-1495, 1499, 1501-1504 are absent); braces and any return statement that follows
 * the "m_active = false" assignment are not visible here. */
1470 void CostEstimate::processRow(int row
, int /*threadId*/)
1472 int realrow
= m_heightInCU
- 1 - row
;
1473 Lowres
**frames
= m_curframes
;
/* List-0 reference: the weighted planes when weightsAnalyse() enabled them, else the
 * plain lowres frame. */
1474 ReferencePlanes
*wfref0
= m_weightedRef
.isWeighted
? &m_weightedRef
: frames
[m_curp0
];
1476 /* Lowres lookahead goes backwards because the MVs are used as
1477 * predictors in the main encode. This considerably improves MV
1478 * prediction overall. */
1479 for (int i
= m_widthInCU
- 1 - m_rows
[row
].m_completed
; i
>= 0; i
--)
1481 // TODO: use lowres MVs as motion candidates in full-res search
1482 m_rows
[row
].estimateCUCost(frames
, wfref0
, i
, realrow
, m_curp0
, m_curp1
, m_curb
, m_bDoSearch
);
1483 m_rows
[row
].m_completed
++;
/* Once this row has at least two CUs done, the next row may be started, provided it is
 * idle and at least two CUs behind. Its lock is held while it is inspected. */
1485 if (m_rows
[row
].m_completed
>= 2 && row
< m_heightInCU
- 1)
1487 ScopedLock
below(m_rows
[row
+ 1].m_lock
);
1488 if (m_rows
[row
+ 1].m_active
== false &&
1489 m_rows
[row
+ 1].m_completed
+ 2 <= m_rows
[row
].m_completed
)
1491 m_rows
[row
+ 1].m_active
= true;
1492 enqueueRow(row
+ 1);
/* Mark this row inactive when it is not on its last CU and the previous wavefront row
 * is fewer than two CUs ahead of it. */
1496 ScopedLock
self(m_rows
[row
].m_lock
);
1497 if (row
> 0 && (int32_t)m_rows
[row
].m_completed
< m_widthInCU
- 1 &&
1498 m_rows
[row
- 1].m_completed
< m_rows
[row
].m_completed
+ 2)
1500 m_rows
[row
].m_active
= false;
/* The last wavefront row finishing means the whole frame has been estimated. */
1505 if (row
== m_heightInCU
- 1)
1506 m_bFrameCompleted
= true;
1509 void EstimateRow::init()
1520 void EstimateRow::estimateCUCost(Lowres
**frames
, ReferencePlanes
*wfref0
, int cux
, int cuy
, int p0
, int p1
, int b
, bool bDoSearch
[2])
1522 Lowres
*fref1
= frames
[p1
];
1523 Lowres
*fenc
= frames
[b
];
1525 const int bBidir
= (b
< p1
);
1526 const int cuXY
= cux
+ cuy
* m_widthInCU
;
1527 const int cuSize
= X265_LOWRES_CU_SIZE
;
1528 const intptr_t pelOffset
= cuSize
* cux
+ cuSize
* cuy
* fenc
->lumaStride
;
1530 // should this CU's cost contribute to the frame cost?
1531 const bool bFrameScoreCU
= (cux
> 0 && cux
< m_widthInCU
- 1 &&
1532 cuy
> 0 && cuy
< m_heightInCU
- 1) || m_widthInCU
<= 2 || m_heightInCU
<= 2;
1534 m_me
.setSourcePU(pelOffset
, cuSize
, cuSize
);
1536 /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
1537 int lowresPenalty
= 4;
1539 MV(*fenc_mvs
[2]) = { &fenc
->lowresMvs
[0][b
- p0
- 1][cuXY
],
1540 &fenc
->lowresMvs
[1][p1
- b
- 1][cuXY
] };
1541 int(*fenc_costs
[2]) = { &fenc
->lowresMvCosts
[0][b
- p0
- 1][cuXY
],
1542 &fenc
->lowresMvCosts
[1][p1
- b
- 1][cuXY
] };
1545 int bcost
= m_me
.COST_MAX
;
1548 // establish search bounds that don't cross extended frame boundaries
1549 mvmin
.x
= (int16_t)(-cux
* cuSize
- 8);
1550 mvmin
.y
= (int16_t)(-cuy
* cuSize
- 8);
1551 mvmax
.x
= (int16_t)((m_widthInCU
- cux
- 1) * cuSize
+ 8);
1552 mvmax
.y
= (int16_t)((m_heightInCU
- cuy
- 1) * cuSize
+ 8);
1556 for (int i
= 0; i
< 1 + bBidir
; i
++)
1560 /* Use previously calculated cost */
1561 COPY2_IF_LT(bcost
, *fenc_costs
[i
], listused
, i
+ 1);
1566 MV
*fenc_mv
= fenc_mvs
[i
];
1568 /* Reverse-order MV prediction. */
1571 #define MVC(mv) mvc[numc++] = mv;
1572 if (cux
< m_widthInCU
- 1)
1574 if (cuy
< m_heightInCU
- 1)
1576 MVC(fenc_mv
[m_widthInCU
]);
1578 MVC(fenc_mv
[m_widthInCU
- 1]);
1579 if (cux
< m_widthInCU
- 1)
1580 MVC(fenc_mv
[m_widthInCU
+ 1]);
1587 median_mv(mvp
, mvc
[0], mvc
[1], mvc
[2]);
1590 *fenc_costs
[i
] = m_me
.motionEstimate(i
? fref1
: wfref0
, mvmin
, mvmax
, mvp
, numc
, mvc
, m_merange
, *fenc_mvs
[i
]);
1591 COPY2_IF_LT(bcost
, *fenc_costs
[i
], listused
, i
+ 1);
1595 pixel subpelbuf0
[X265_LOWRES_CU_SIZE
* X265_LOWRES_CU_SIZE
], subpelbuf1
[X265_LOWRES_CU_SIZE
* X265_LOWRES_CU_SIZE
];
1596 intptr_t stride0
= X265_LOWRES_CU_SIZE
, stride1
= X265_LOWRES_CU_SIZE
;
1597 pixel
*src0
= wfref0
->lowresMC(pelOffset
, *fenc_mvs
[0], subpelbuf0
, stride0
);
1598 pixel
*src1
= fref1
->lowresMC(pelOffset
, *fenc_mvs
[1], subpelbuf1
, stride1
);
1600 pixel ref
[X265_LOWRES_CU_SIZE
* X265_LOWRES_CU_SIZE
];
1601 primitives
.pixelavg_pp
[LUMA_8x8
](ref
, X265_LOWRES_CU_SIZE
, src0
, stride0
, src1
, stride1
, 32);
1602 int bicost
= primitives
.satd
[LUMA_8x8
](fenc
->lowresPlane
[0] + pelOffset
, fenc
->lumaStride
, ref
, X265_LOWRES_CU_SIZE
);
1603 COPY2_IF_LT(bcost
, bicost
, listused
, 3);
1605 // Try 0,0 candidates
1606 src0
= wfref0
->lowresPlane
[0] + pelOffset
;
1607 src1
= fref1
->lowresPlane
[0] + pelOffset
;
1608 primitives
.pixelavg_pp
[LUMA_8x8
](ref
, X265_LOWRES_CU_SIZE
, src0
, wfref0
->lumaStride
, src1
, fref1
->lumaStride
, 32);
1609 bicost
= primitives
.satd
[LUMA_8x8
](fenc
->lowresPlane
[0] + pelOffset
, fenc
->lumaStride
, ref
, X265_LOWRES_CU_SIZE
);
1610 COPY2_IF_LT(bcost
, bicost
, listused
, 3);
1613 if (!fenc
->bIntraCalculated
)
1615 const int sizeIdx
= X265_LOWRES_CU_BITS
- 2; // partition size
1617 pixel _above0
[X265_LOWRES_CU_SIZE
* 4 + 1], *const above0
= _above0
+ 2 * X265_LOWRES_CU_SIZE
;
1618 pixel _above1
[X265_LOWRES_CU_SIZE
* 4 + 1], *const above1
= _above1
+ 2 * X265_LOWRES_CU_SIZE
;
1619 pixel _left0
[X265_LOWRES_CU_SIZE
* 4 + 1], *const left0
= _left0
+ 2 * X265_LOWRES_CU_SIZE
;
1620 pixel _left1
[X265_LOWRES_CU_SIZE
* 4 + 1], *const left1
= _left1
+ 2 * X265_LOWRES_CU_SIZE
;
1622 pixel
*pix_cur
= fenc
->lowresPlane
[0] + pelOffset
;
1625 memcpy(above0
, pix_cur
- 1 - fenc
->lumaStride
, (cuSize
+ 1) * sizeof(pixel
));
1628 for (int i
= 0; i
< cuSize
+ 1; i
++)
1630 left0
[i
] = pix_cur
[-1 - fenc
->lumaStride
+ i
* fenc
->lumaStride
];
1633 for (int i
= 0; i
< cuSize
; i
++)
1635 above0
[cuSize
+ i
+ 1] = above0
[cuSize
];
1636 left0
[cuSize
+ i
+ 1] = left0
[cuSize
];
1639 // filtering with [1 2 1]
1640 // assume getUseStrongIntraSmoothing() is disabled
1641 above1
[0] = above0
[0];
1642 above1
[2 * cuSize
] = above0
[2 * cuSize
];
1643 left1
[0] = left0
[0];
1644 left1
[2 * cuSize
] = left0
[2 * cuSize
];
1645 for (int i
= 1; i
< 2 * cuSize
; i
++)
1647 above1
[i
] = (above0
[i
- 1] + 2 * above0
[i
] + above0
[i
+ 1] + 2) >> 2;
1648 left1
[i
] = (left0
[i
- 1] + 2 * left0
[i
] + left0
[i
+ 1] + 2) >> 2;
1651 int predsize
= cuSize
* cuSize
;
1653 // generate 35 intra predictions into m_predictions
1654 pixelcmp_t satd
= primitives
.satd
[partitionFromLog2Size(X265_LOWRES_CU_BITS
)];
1655 int icost
= m_me
.COST_MAX
, cost
;
1656 primitives
.intra_pred
[DC_IDX
][sizeIdx
](m_predictions
, cuSize
, left0
, above0
, 0, (cuSize
<= 16));
1657 cost
= satd(m_me
.fenc
, FENC_STRIDE
, m_predictions
, cuSize
);
1660 pixel
*above
= (cuSize
>= 8) ? above1
: above0
;
1661 pixel
*left
= (cuSize
>= 8) ? left1
: left0
;
1662 primitives
.intra_pred
[PLANAR_IDX
][sizeIdx
](m_predictions
, cuSize
, left
, above
, 0, 0);
1663 cost
= satd(m_me
.fenc
, FENC_STRIDE
, m_predictions
, cuSize
);
1666 primitives
.intra_pred_allangs
[sizeIdx
](m_predictions
+ 2 * predsize
, above0
, left0
, above1
, left1
, (cuSize
<= 16));
1668 // calculate satd costs, keep least cost
1669 ALIGN_VAR_32(pixel
, buf_trans
[32 * 32]);
1670 primitives
.transpose
[sizeIdx
](buf_trans
, m_me
.fenc
, FENC_STRIDE
);
1672 int acost
= m_me
.COST_MAX
;
1673 uint32_t mode
, lowmode
= 4;
1674 for (mode
= 5; mode
< 35; mode
+= 5)
1677 cost
= satd(buf_trans
, cuSize
, &m_predictions
[mode
* predsize
], cuSize
);
1679 cost
= satd(m_me
.fenc
, FENC_STRIDE
, &m_predictions
[mode
* predsize
], cuSize
);
1680 COPY2_IF_LT(acost
, cost
, lowmode
, mode
);
1682 for (uint32_t dist
= 2; dist
>= 1; dist
--)
1684 mode
= lowmode
- dist
;
1686 cost
= satd(buf_trans
, cuSize
, &m_predictions
[mode
* predsize
], cuSize
);
1688 cost
= satd(m_me
.fenc
, FENC_STRIDE
, &m_predictions
[mode
* predsize
], cuSize
);
1689 COPY2_IF_LT(acost
, cost
, lowmode
, mode
);
1691 mode
= lowmode
+ dist
;
1693 cost
= satd(buf_trans
, cuSize
, &m_predictions
[mode
* predsize
], cuSize
);
1695 cost
= satd(m_me
.fenc
, FENC_STRIDE
, &m_predictions
[mode
* predsize
], cuSize
);
1696 COPY2_IF_LT(acost
, cost
, lowmode
, mode
);
1701 const int intraPenalty
= 5 * m_lookAheadLambda
;
1702 icost
+= intraPenalty
+ lowresPenalty
; /* estimate intra signal cost */
1703 fenc
->intraCost
[cuXY
] = icost
;
1704 int icostAq
= icost
;
1707 m_costIntra
+= icost
;
1708 if (fenc
->invQscaleFactor
)
1710 icostAq
= (icost
* fenc
->invQscaleFactor
[cuXY
] + 128) >> 8;
1711 m_costIntraAq
+= icostAq
;
1714 fenc
->rowSatds
[0][0][cuy
] += icostAq
;
1716 bcost
+= lowresPenalty
;
1719 if (fenc
->intraCost
[cuXY
] < bcost
)
1721 if (bFrameScoreCU
) m_intraMbs
++;
1722 bcost
= fenc
->intraCost
[cuXY
];
1727 /* For I frames these costs were accumulated earlier */
1730 int bcostAq
= bcost
;
1734 if (fenc
->invQscaleFactor
)
1736 bcostAq
= (bcost
* fenc
->invQscaleFactor
[cuXY
] + 128) >> 8;
1737 m_costEstAq
+= bcostAq
;
1740 fenc
->rowSatds
[b
- p0
][p1
- b
][cuy
] += bcostAq
;
1742 fenc
->lowresCosts
[b
- p0
][p1
- b
][cuXY
] = (uint16_t)(X265_MIN(bcost
, LOWRES_COST_MASK
) | (listused
<< LOWRES_COST_SHIFT
));