1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
4 * Authors: Gopu Govindaswamy <gopu@multicorewareinc.com>
5 * Steve Borho <steve@borho.org>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21 * This program is also available under a commercial proprietary license.
22 * For more information, contact us at license @ x265.com.
23 *****************************************************************************/
27 #include "framedata.h"
29 #include "primitives.h"
33 #include "slicetype.h"
35 #include "ratecontrol.h"
/* CU count used by the slice-type heuristics below. When the lowres frame is
 * more than two CUs wide and more than two CUs tall the count is
 * (m_widthInCU - 2) * (m_heightInCU - 2); otherwise it is the full
 * m_widthInCU * m_heightInCU. The macro expands in terms of m_widthInCU and
 * m_heightInCU, so it must be used where those names are in scope. */
37 #define NUM_CUS (m_widthInCU > 2 && m_heightInCU > 2 ? (m_widthInCU - 2) * (m_heightInCU - 2) : m_widthInCU * m_heightInCU)
/* Branch-free median of three 16-bit values.
 *
 * The int16_t operands are promoted to int before each subtraction, so
 * (x - y) >> 31 is all ones when x < y and zero otherwise (this relies on a
 * 32-bit int and an arithmetic right shift, exactly as the expression always
 * did). Masking the difference with that value gives min(x - y, 0), which is
 * used to order the operands without branching:
 *
 *   t = min(a - b, 0)
 *   a = max(a, b), b = min(a, b)      (swap so that a >= b)
 *   b = max(b, c)                     (raise the lower value to at least c)
 *   b = min(a, b)                     (cap it at the upper value)
 *
 * Returns the middle value of a, b and c. */
static inline int16_t median(int16_t a, int16_t b, int16_t c)
{
    int16_t t = (a - b) & ((a - b) >> 31);

    /* order the first two operands: a becomes the larger, b the smaller */
    a -= t;
    b += t;
    /* b = max(b, c) */
    b -= (b - c) & ((b - c) >> 31);
    /* b = min(a, b), which is the median */
    b += (a - b) & ((a - b) >> 31);
    return b;
}
/* Component-wise median of three motion vectors: dst.x receives the median of
 * the x components of a, b and c, and dst.y the median of their y components. */
52 static inline void median_mv(MV
&dst
, MV a
, MV b
, MV c
)
54 dst
.x
= median(a
.x
, b
.x
, c
.x
);
55 dst
.y
= median(a
.y
, b
.y
, c
.y
);
/* Constructs the lookahead from the encoder parameters and thread pool.
 *
 * Visible initialisation:
 *  - m_lastKeyframe starts at -keyframeMax;
 *  - m_widthInCU / m_heightInCU are the half-resolution (source / 2) frame
 *    dimensions expressed in lowres CUs, rounded up;
 *  - m_scratch is allocated as one row of m_widthInCU ints;
 *  - m_histogram is zeroed.
 *
 * NOTE(review): the x265_malloc() result is not checked in the visible code;
 * confirm that a NULL m_scratch is handled before it is used. */
58 Lookahead::Lookahead(x265_param
*param
, ThreadPool
* pool
)
64 m_lastKeyframe
= -m_param
->keyframeMax
;
/* lowres dimensions in CUs: ceil((source / 2) / X265_LOWRES_CU_SIZE) */
68 m_widthInCU
= ((m_param
->sourceWidth
/ 2) + X265_LOWRES_CU_SIZE
- 1) >> X265_LOWRES_CU_BITS
;
69 m_heightInCU
= ((m_param
->sourceHeight
/ 2) + X265_LOWRES_CU_SIZE
- 1) >> X265_LOWRES_CU_BITS
;
70 m_scratch
= (int*)x265_malloc(m_widthInCU
* sizeof(int));
71 memset(m_histogram
, 0, sizeof(m_histogram
));
/* Empty destructor: m_scratch is released by destroy(), not here. */
74 Lookahead::~Lookahead() { }
/* Decides whether the lookahead may use the worker thread pool.
 *
 * The pool is kept only when it exists, has at least four threads, and at
 * least one lookahead analysis is enabled: adaptive B-frame placement (with
 * bframes > 0), cuTree, scenecut detection, or a lookahead depth combined with
 * a VBV buffer size. The first assignment (m_pool = m_pool) is a deliberate
 * no-op that documents the "keep" case; the second clears m_pool so the
 * lookahead does not use a worker thread (per the accompanying comment it is
 * the alternative branch). */
76 void Lookahead::init()
78 if (m_pool
&& m_pool
->getThreadCount() >= 4 &&
79 ((m_param
->bFrameAdaptive
&& m_param
->bframes
) ||
80 m_param
->rc
.cuTree
|| m_param
->scenecutThreshold
||
81 (m_param
->lookaheadDepth
&& m_param
->rc
.vbvBufferSize
)))
82 m_pool
= m_pool
; /* allow use of worker thread */
84 m_pool
= NULL
; /* disable use of worker thread */
/* Tears down the lookahead: pops any frames still held in the input and
 * output queues (only expected when the encode was aborted, per the comment
 * below) and frees the m_scratch row buffer allocated by the constructor. */
87 void Lookahead::destroy()
90 // flush will dequeue, if it is necessary
93 // these two queues will be empty unless the encode was aborted
94 while (!m_inputQueue
.empty())
96 Frame
* curFrame
= m_inputQueue
.popFront();
101 while (!m_outputQueue
.empty())
103 Frame
* curFrame
= m_outputQueue
.popFront();
108 x265_free(m_scratch
);
111 /* Called by API thread */
/* Queues a new input frame for slice-type decision.
 *
 * The frame's lowres representation is initialised from its source picture
 * (m_fencPic), its POC and the requested sliceType, then the frame is appended
 * to m_inputQueue under m_inputQueueLock. Once the queue holds at least
 * lookaheadDepth frames, a decision pass is started: with a pool and not in
 * the initial fill, the lock is released and an idle pool thread is poked;
 * while still filling with a pool present the lookahead enqueues itself as a
 * job provider. The input lock is released on the remaining path. */
112 void Lookahead::addPicture(Frame
*curFrame
, int sliceType
)
114 PicYuv
*orig
= curFrame
->m_fencPic
;
116 curFrame
->m_lowres
.init(orig
, curFrame
->m_poc
, sliceType
);
118 m_inputQueueLock
.acquire();
119 m_inputQueue
.pushBack(*curFrame
);
121 if (m_inputQueue
.size() >= m_param
->lookaheadDepth
)
123 /* when queue fills the first time, run slicetypeDecide synchronously,
124 * since the encoder will always be blocked here */
125 if (m_pool
&& !m_bFilling
)
127 m_inputQueueLock
.release();
129 m_pool
->pokeIdleThread();
134 if (m_bFilling
&& m_pool
)
135 JobProvider::enqueue();
139 m_inputQueueLock
.release();
142 /* Called by API thread */
/* Drains the lookahead at end of input. Under m_inputQueueLock it checks
 * whether frames remain in the input queue (so they can be decided
 * synchronously), then re-takes the lock and tests for an empty input queue,
 * which is the condition tied to the flushed state described below. */
143 void Lookahead::flush()
145 /* just in case the input queue is never allowed to fill */
148 /* flush synchronously */
149 m_inputQueueLock
.acquire();
150 if (!m_inputQueue
.empty())
155 m_inputQueueLock
.release();
157 m_inputQueueLock
.acquire();
159 /* bFlushed indicates that an empty output queue actually means all frames
160 * have been decided (no more inputs for the encoder) */
161 if (m_inputQueue
.empty())
163 m_inputQueueLock
.release();
166 /* Called by API thread. If the lookahead queue has not yet been filled the
167 * first time, it immediately returns NULL. Else the function blocks until
168 * outputs are available and then pops the first frame from the output queue. If
169 * flush() has been called and the output queue is empty, NULL is returned. */
170 Frame
* Lookahead::getDecidedPicture()
172 m_outputQueueLock
.acquire();
176 m_outputQueueLock
.release();
/* Wait for a decided frame: the output lock is dropped around the wait on
 * m_outputAvailable and re-acquired before the queue is tested again, so the
 * producer can push frames while this thread sleeps. The loop also ends once
 * m_bFlushed is set. */
180 while (m_outputQueue
.empty() && !m_bFlushed
)
182 m_outputQueueLock
.release();
183 m_outputAvailable
.wait();
184 m_outputQueueLock
.acquire();
187 Frame
*fenc
= m_outputQueue
.popFront();
188 m_outputQueueLock
.release();
192 /* Called by pool worker threads */
/* Job-provider hook: m_bReady is decremented via ATOMIC_DEC, and only the
 * caller that brings it from a positive value to zero goes on to take
 * m_inputQueueLock for the decision pass. */
193 bool Lookahead::findJob(int)
195 if (m_bReady
> 0 && ATOMIC_DEC(&m_bReady
) == 0)
197 m_inputQueueLock
.acquire();
205 /* Called by rate-control to calculate the estimated SATD cost for a given
206 * picture. It assumes dpb->prepareEncode() has already been called for the
207 * picture and all the references are established */
/* Outline of the visible logic:
 *  - frames[] is populated with the lowres of curFrame and of its first L0/L1
 *    reference, at indices derived from the POC distances (p0, b, p1);
 *  - m_lowres.satdCost is taken from frameCostRecalculate() when cuTree is on
 *    and stats are not being read, else from costEstAq when AQ is on, else
 *    from costEst;
 *  - when both vbvBufferSize and vbvMaxBitrate are set, the per-CU lowres
 *    costs are scaled by the AQ / cuTree QP offsets and accumulated into the
 *    per-CTU-row satdForVbv totals. */
208 void Lookahead::getEstimatedPictureCost(Frame
*curFrame
)
210 Lowres
*frames
[X265_LOOKAHEAD_MAX
];
212 // POC distances to each reference
213 Slice
*slice
= curFrame
->m_encData
->m_slice
;
215 int poc
= slice
->m_poc
;
216 int l0poc
= slice
->m_refPOCList
[0][0];
217 int l1poc
= slice
->m_refPOCList
[1][0];
219 switch (slice
->m_sliceType
)
222 frames
[p0
] = &curFrame
->m_lowres
;
227 b
= p1
= poc
- l0poc
;
228 frames
[p0
] = &slice
->m_refPicList
[0][0]->m_lowres
;
229 frames
[b
] = &curFrame
->m_lowres
;
234 p1
= b
+ l1poc
- poc
;
235 frames
[p0
] = &slice
->m_refPicList
[0][0]->m_lowres
;
236 frames
[b
] = &curFrame
->m_lowres
;
237 frames
[p1
] = &slice
->m_refPicList
[1][0]->m_lowres
;
244 if (m_param
->rc
.cuTree
&& !m_param
->rc
.bStatRead
)
245 /* update row satds based on cutree offsets */
246 curFrame
->m_lowres
.satdCost
= frameCostRecalculate(frames
, p0
, p1
, b
);
247 else if (m_param
->rc
.aqMode
)
248 curFrame
->m_lowres
.satdCost
= curFrame
->m_lowres
.costEstAq
[b
- p0
][p1
- b
];
250 curFrame
->m_lowres
.satdCost
= curFrame
->m_lowres
.costEst
[b
- p0
][p1
- b
];
252 if (m_param
->rc
.vbvBufferSize
&& m_param
->rc
.vbvMaxBitrate
)
254 /* aggregate lowres row satds to CTU resolution */
255 curFrame
->m_lowres
.lowresCostForRc
= curFrame
->m_lowres
.lowresCosts
[b
- p0
][p1
- b
];
256 uint32_t lowresRow
= 0, lowresCol
= 0, lowresCuIdx
= 0, sum
= 0;
/* number of lowres CU rows covered by one CTU row (the lowres picture is
 * half resolution, hence the factor of two) */
257 uint32_t scale
= m_param
->maxCUSize
/ (2 * X265_LOWRES_CU_SIZE
);
258 uint32_t numCuInHeight
= (m_param
->sourceHeight
+ g_maxCUSize
- 1) / g_maxCUSize
;
259 uint32_t widthInLowresCu
= (uint32_t)m_widthInCU
, heightInLowresCu
= (uint32_t)m_heightInCU
;
260 double *qp_offset
= 0;
261 /* Factor in qpoffsets based on Aq/Cutree in CU costs */
262 if (m_param
->rc
.aqMode
)
263 qp_offset
= (frames
[b
]->sliceType
== X265_TYPE_B
|| !m_param
->rc
.cuTree
) ? frames
[b
]->qpAqOffset
: frames
[b
]->qpCuTreeOffset
;
265 for (uint32_t row
= 0; row
< numCuInHeight
; row
++)
267 lowresRow
= row
* scale
;
268 for (uint32_t cnt
= 0; cnt
< scale
&& lowresRow
< heightInLowresCu
; lowresRow
++, cnt
++)
271 lowresCuIdx
= lowresRow
* widthInLowresCu
;
272 for (lowresCol
= 0; lowresCol
< widthInLowresCu
; lowresCol
++, lowresCuIdx
++)
274 uint16_t lowresCuCost
= curFrame
->m_lowres
.lowresCostForRc
[lowresCuIdx
] & LOWRES_COST_MASK
;
/* scale by the QP offset factor; "+ 128) >> 8" rounds the 8-bit
 * fixed-point product back to an integer */
277 lowresCuCost
= (uint16_t)((lowresCuCost
* x265_exp2fix8(qp_offset
[lowresCuIdx
]) + 128) >> 8);
278 int32_t intraCuCost
= curFrame
->m_lowres
.intraCost
[lowresCuIdx
];
279 curFrame
->m_lowres
.intraCost
[lowresCuIdx
] = (intraCuCost
* x265_exp2fix8(qp_offset
[lowresCuIdx
]) + 128) >> 8;
281 curFrame
->m_lowres
.lowresCostForRc
[lowresCuIdx
] = lowresCuCost
;
284 curFrame
->m_encData
->m_rowStat
[row
].satdForVbv
+= sum
;
290 /* called by API thread or worker thread with inputQueueLock acquired */
291 void Lookahead::slicetypeDecide()
293 ProfileScopeEvent(slicetypeDecideEV
);
295 ScopedLock
lock(m_decideLock
);
297 Lowres
*frames
[X265_LOOKAHEAD_MAX
];
298 Frame
*list
[X265_LOOKAHEAD_MAX
];
299 int maxSearch
= X265_MIN(m_param
->lookaheadDepth
, X265_LOOKAHEAD_MAX
);
301 memset(frames
, 0, sizeof(frames
));
302 memset(list
, 0, sizeof(list
));
304 Frame
*curFrame
= m_inputQueue
.first();
306 for (j
= 0; j
< m_param
->bframes
+ 2; j
++)
308 if (!curFrame
) break;
310 curFrame
= curFrame
->m_next
;
313 curFrame
= m_inputQueue
.first();
314 frames
[0] = m_lastNonB
;
315 for (j
= 0; j
< maxSearch
; j
++)
317 if (!curFrame
) break;
318 frames
[j
+ 1] = &curFrame
->m_lowres
;
319 curFrame
= curFrame
->m_next
;
325 m_inputQueueLock
.release();
327 if (!m_est
.m_rows
&& list
[0])
328 m_est
.init(m_param
, list
[0]);
330 if (m_lastNonB
&& !m_param
->rc
.bStatRead
&&
331 ((m_param
->bFrameAdaptive
&& m_param
->bframes
) ||
332 m_param
->rc
.cuTree
|| m_param
->scenecutThreshold
||
333 (m_param
->lookaheadDepth
&& m_param
->rc
.vbvBufferSize
)))
335 slicetypeAnalyse(frames
, false);
339 for (bframes
= 0, brefs
= 0;; bframes
++)
341 Lowres
& frm
= list
[bframes
]->m_lowres
;
343 if (frm
.sliceType
== X265_TYPE_BREF
&& !m_param
->bBPyramid
&& brefs
== m_param
->bBPyramid
)
345 frm
.sliceType
= X265_TYPE_B
;
346 x265_log(m_param
, X265_LOG_WARNING
, "B-ref at frame %d incompatible with B-pyramid\n",
350 /* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available.
351 smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it.*/
352 else if (frm
.sliceType
== X265_TYPE_BREF
&& m_param
->bBPyramid
&& brefs
&&
353 m_param
->maxNumReferences
<= (brefs
+ 3))
355 frm
.sliceType
= X265_TYPE_B
;
356 x265_log(m_param
, X265_LOG_WARNING
, "B-ref at frame %d incompatible with B-pyramid and %d reference frames\n",
357 frm
.sliceType
, m_param
->maxNumReferences
);
360 if ( /*(!param->intraRefresh || frm.frameNum == 0) && */ frm
.frameNum
- m_lastKeyframe
>= m_param
->keyframeMax
)
362 if (frm
.sliceType
== X265_TYPE_AUTO
|| frm
.sliceType
== X265_TYPE_I
)
363 frm
.sliceType
= m_param
->bOpenGOP
&& m_lastKeyframe
>= 0 ? X265_TYPE_I
: X265_TYPE_IDR
;
364 bool warn
= frm
.sliceType
!= X265_TYPE_IDR
;
365 if (warn
&& m_param
->bOpenGOP
)
366 warn
&= frm
.sliceType
!= X265_TYPE_I
;
369 x265_log(m_param
, X265_LOG_WARNING
, "specified frame type (%d) at %d is not compatible with keyframe interval\n",
370 frm
.sliceType
, frm
.frameNum
);
371 frm
.sliceType
= m_param
->bOpenGOP
&& m_lastKeyframe
>= 0 ? X265_TYPE_I
: X265_TYPE_IDR
;
374 if (frm
.sliceType
== X265_TYPE_I
&& frm
.frameNum
- m_lastKeyframe
>= m_param
->keyframeMin
)
376 if (m_param
->bOpenGOP
)
378 m_lastKeyframe
= frm
.frameNum
;
379 frm
.bKeyframe
= true;
382 frm
.sliceType
= X265_TYPE_IDR
;
384 if (frm
.sliceType
== X265_TYPE_IDR
)
387 m_lastKeyframe
= frm
.frameNum
;
388 frm
.bKeyframe
= true;
391 list
[bframes
- 1]->m_lowres
.sliceType
= X265_TYPE_P
;
395 if (bframes
== m_param
->bframes
|| !list
[bframes
+ 1])
397 if (IS_X265_TYPE_B(frm
.sliceType
))
398 x265_log(m_param
, X265_LOG_WARNING
, "specified frame type is not compatible with max B-frames\n");
399 if (frm
.sliceType
== X265_TYPE_AUTO
|| IS_X265_TYPE_B(frm
.sliceType
))
400 frm
.sliceType
= X265_TYPE_P
;
402 if (frm
.sliceType
== X265_TYPE_BREF
)
404 if (frm
.sliceType
== X265_TYPE_AUTO
)
405 frm
.sliceType
= X265_TYPE_B
;
406 else if (!IS_X265_TYPE_B(frm
.sliceType
))
411 list
[bframes
- 1]->m_lowres
.bLastMiniGopBFrame
= true;
412 list
[bframes
]->m_lowres
.leadingBframes
= bframes
;
413 m_lastNonB
= &list
[bframes
]->m_lowres
;
414 m_histogram
[bframes
]++;
416 /* insert a bref into the sequence */
417 if (m_param
->bBPyramid
&& bframes
> 1 && !brefs
)
419 list
[bframes
/ 2]->m_lowres
.sliceType
= X265_TYPE_BREF
;
422 /* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */
423 if (m_param
->rc
.rateControlMode
!= X265_RC_CQP
)
426 /* For zero latency tuning, calculate frame cost to be used later in RC */
429 for (int i
= 0; i
<= bframes
; i
++)
430 frames
[i
+ 1] = &list
[i
]->m_lowres
;
433 /* estimate new non-B cost */
434 p1
= b
= bframes
+ 1;
435 p0
= (IS_X265_TYPE_I(frames
[bframes
+ 1]->sliceType
)) ? b
: 0;
436 m_est
.estimateFrameCost(frames
, p0
, p1
, b
, 0);
441 for (b
= 1; b
<= bframes
; b
++)
443 if (frames
[b
]->sliceType
== X265_TYPE_B
)
444 for (p1
= b
; frames
[p1
]->sliceType
== X265_TYPE_B
; p1
++)
445 ; // find new nonb or bref
449 m_est
.estimateFrameCost(frames
, p0
, p1
, b
, 0);
451 if (frames
[b
]->sliceType
== X265_TYPE_BREF
)
457 m_inputQueueLock
.acquire();
459 /* dequeue all frames from inputQueue that are about to be enqueued
460 * in the output queue. The order is important because Frame can
461 * only be in one list at a time */
462 int64_t pts
[X265_BFRAME_MAX
+ 1];
463 for (int i
= 0; i
<= bframes
; i
++)
466 curFrame
= m_inputQueue
.popFront();
467 pts
[i
] = curFrame
->m_pts
;
471 m_inputQueueLock
.release();
473 m_outputQueueLock
.acquire();
474 /* add non-B to output queue */
476 list
[bframes
]->m_reorderedPts
= pts
[idx
++];
477 m_outputQueue
.pushBack(*list
[bframes
]);
479 /* Add B-ref frame next to P frame in output queue, the B-ref encode before non B-ref frame */
480 if (bframes
> 1 && m_param
->bBPyramid
)
482 for (int i
= 0; i
< bframes
; i
++)
484 if (list
[i
]->m_lowres
.sliceType
== X265_TYPE_BREF
)
486 list
[i
]->m_reorderedPts
= pts
[idx
++];
487 m_outputQueue
.pushBack(*list
[i
]);
492 /* add B frames to output queue */
493 for (int i
= 0; i
< bframes
; i
++)
495 /* push all the B frames into output queue except B-ref, which already pushed into output queue*/
496 if (list
[i
]->m_lowres
.sliceType
!= X265_TYPE_BREF
)
498 list
[i
]->m_reorderedPts
= pts
[idx
++];
499 m_outputQueue
.pushBack(*list
[i
]);
503 bool isKeyFrameAnalyse
= (m_param
->rc
.cuTree
|| (m_param
->rc
.vbvBufferSize
&& m_param
->lookaheadDepth
)) && !m_param
->rc
.bStatRead
;
504 if (isKeyFrameAnalyse
&& IS_X265_TYPE_I(m_lastNonB
->sliceType
))
506 m_inputQueueLock
.acquire();
507 Frame
*curFrame
= m_inputQueue
.first();
508 frames
[0] = m_lastNonB
;
510 for (j
= 0; j
< maxSearch
; j
++)
512 frames
[j
+ 1] = &curFrame
->m_lowres
;
513 curFrame
= curFrame
->m_next
;
516 frames
[j
+ 1] = NULL
;
517 m_inputQueueLock
.release();
518 slicetypeAnalyse(frames
, true);
521 m_outputQueueLock
.release();
522 m_outputAvailable
.trigger();
/* Records, for VBV rate control, the estimated cost and type of the frames
 * that follow the next non-B frame.
 *
 * frames[] is walked one mini-GOP at a time (prevNonB .. curNonB). For each
 * mini-GOP the cost of its closing non-B frame and then of its B-frames (in
 * coded order, with an optional B-ref in the middle when B-pyramid is on) is
 * written into plannedSatd[] / plannedType[] of frames[nextNonB]. The same
 * values are mirrored into the B-frames of the current mini-GOP through their
 * own indB cursor. The planned list is terminated with X265_TYPE_AUTO.
 *
 * frames    - lowres frames, index 0 being the last decided non-B frame
 * numFrames - number of analysed frames
 * keyframe  - non-zero when called for a keyframe analysis pass; in that case
 *             the planning starts from prevNonB rather than curNonB */
525 void Lookahead::vbvLookahead(Lowres
**frames
, int numFrames
, int keyframe
)
527 int prevNonB
= 0, curNonB
= 1, idx
= 0;
528 while (curNonB
< numFrames
&& frames
[curNonB
]->sliceType
== X265_TYPE_B
)
530 int nextNonB
= keyframe
? prevNonB
: curNonB
;
531 int nextB
= prevNonB
+ 1;
533 int miniGopEnd
= keyframe
? prevNonB
: curNonB
;
534 while (curNonB
< numFrames
+ !keyframe
)
536 /* P/I cost: This shouldn't include the cost of nextNonB */
537 if (nextNonB
!= curNonB
)
539 int p0
= IS_X265_TYPE_I(frames
[curNonB
]->sliceType
) ? curNonB
: prevNonB
;
540 frames
[nextNonB
]->plannedSatd
[idx
] = vbvFrameCost(frames
, p0
, curNonB
, curNonB
);
541 frames
[nextNonB
]->plannedType
[idx
] = frames
[curNonB
]->sliceType
;
542 /* Save the nextNonB Cost in each B frame of the current miniGop */
543 if (curNonB
> miniGopEnd
)
545 for (int j
= nextB
; j
< miniGopEnd
; j
++)
547 frames
[j
]->plannedSatd
[frames
[j
]->indB
] = frames
[nextNonB
]->plannedSatd
[idx
];
548 frames
[j
]->plannedType
[frames
[j
]->indB
++] = frames
[nextNonB
]->plannedType
[idx
];
554 /* Handle the B-frames: coded order */
555 if (m_param
->bBPyramid
&& curNonB
- prevNonB
> 1)
556 nextBRef
= (prevNonB
+ curNonB
+ 1) / 2;
558 for (int i
= prevNonB
+ 1; i
< curNonB
; i
++, idx
++)
560 int64_t satdCost
= 0; int type
= X265_TYPE_B
;
565 satdCost
= vbvFrameCost(frames
, prevNonB
, curNonB
, nextBRef
);
566 type
= X265_TYPE_BREF
;
568 else if (i
< nextBRef
)
569 satdCost
= vbvFrameCost(frames
, prevNonB
, nextBRef
, i
);
571 satdCost
= vbvFrameCost(frames
, nextBRef
, curNonB
, i
);
574 satdCost
= vbvFrameCost(frames
, prevNonB
, nextNonB
, i
);
575 frames
[nextNonB
]->plannedSatd
[idx
] = satdCost
;
576 frames
[nextNonB
]->plannedType
[idx
] = type
;
577 /* Save the nextB Cost in each B frame of the current miniGop */
579 for (int j
= nextB
; j
< miniGopEnd
; j
++)
581 if (nextBRef
&& i
== nextBRef
)
583 if (j
>= i
&& j
!=nextBRef
)
585 frames
[j
]->plannedSatd
[frames
[j
]->indB
] = satdCost
;
586 frames
[j
]->plannedType
[frames
[j
]->indB
++] = X265_TYPE_B
;
591 while (curNonB
<= numFrames
&& frames
[curNonB
]->sliceType
== X265_TYPE_B
)
/* terminate the planned-type list */
595 frames
[nextNonB
]->plannedType
[idx
] = X265_TYPE_AUTO
;
/* Returns the cost estimate used for VBV planning of frame b predicted from
 * references p0 and p1. estimateFrameCost() is always run first. With AQ
 * enabled the result is either the cuTree-adjusted frameCostRecalculate()
 * value (cuTree on) or the cached AQ cost costEstAq[b - p0][p1 - b]; the
 * non-AQ path presumably returns the plain estimate held in `cost`. */
598 int64_t Lookahead::vbvFrameCost(Lowres
**frames
, int p0
, int p1
, int b
)
600 int64_t cost
= m_est
.estimateFrameCost(frames
, p0
, p1
, b
, 0);
602 if (m_param
->rc
.aqMode
)
604 if (m_param
->rc
.cuTree
)
605 return frameCostRecalculate(frames
, p0
, p1
, b
);
607 return frames
[b
]->costEstAq
[b
- p0
][p1
- b
];
/* Assigns tentative slice types to the undecided frames in frames[1..].
 *
 * Visible steps:
 *  - count the leading frames whose type is still X265_TYPE_AUTO (at most
 *    maxSearch) and NULL-terminate frames[] after them;
 *  - limit the analysed range to the remaining keyframe interval
 *    (keyintLimit), with special handling for open GOP and for a zero-length
 *    range, where frames[1] is forced to I;
 *  - run the scenecut check on frames[1], marking it I on a cut;
 *  - place B/P frames according to bFrameAdaptive: trellis search via
 *    slicetypePath(), the fast pairwise heuristic based on estimated costs
 *    and intra block counts, or a fixed pattern of numBFrames B-frames
 *    between P-frames; without B-frames every frame becomes P;
 *  - re-check scenecuts inside the first mini-GOP, turning a cut frame into P;
 *  - run cuTree() when enabled, force I-frames at keyframe-interval
 *    boundaries, run vbvLookahead(), and finally reset frames from resetStart
 *    onwards back to X265_TYPE_AUTO because they are not decided yet.
 *
 * frames    - frames[0] is the last non-B frame, followed by queued frames
 * bKeyframe - true for the keyframe analysis pass; passed through to
 *             cuTree() / vbvLookahead() and makes resetStart 1 */
612 void Lookahead::slicetypeAnalyse(Lowres
**frames
, bool bKeyframe
)
614 int numFrames
, origNumFrames
, keyintLimit
, framecnt
;
615 int maxSearch
= X265_MIN(m_param
->lookaheadDepth
, X265_LOOKAHEAD_MAX
);
616 int cuCount
= NUM_CUS
;
618 bool bIsVbvLookahead
= m_param
->rc
.vbvBufferSize
&& m_param
->lookaheadDepth
;
620 /* count undecided frames */
621 for (framecnt
= 0; framecnt
< maxSearch
; framecnt
++)
623 Lowres
*fenc
= frames
[framecnt
+ 1];
624 if (!fenc
|| fenc
->sliceType
!= X265_TYPE_AUTO
)
630 if (m_param
->rc
.cuTree
)
631 cuTree(frames
, 0, bKeyframe
);
635 frames
[framecnt
+ 1] = NULL
;
637 keyintLimit
= m_param
->keyframeMax
- frames
[0]->frameNum
+ m_lastKeyframe
- 1;
638 origNumFrames
= numFrames
= X265_MIN(framecnt
, keyintLimit
);
641 numFrames
= framecnt
;
642 else if (m_param
->bOpenGOP
&& numFrames
< framecnt
)
644 else if (numFrames
== 0)
646 frames
[1]->sliceType
= X265_TYPE_I
;
651 int numAnalyzed
= numFrames
;
652 if (m_param
->scenecutThreshold
&& scenecut(frames
, 0, 1, true, origNumFrames
, maxSearch
))
654 frames
[1]->sliceType
= X265_TYPE_I
;
658 if (m_param
->bframes
)
660 if (m_param
->bFrameAdaptive
== X265_B_ADAPT_TRELLIS
)
664 char best_paths
[X265_BFRAME_MAX
+ 1][X265_LOOKAHEAD_MAX
+ 1] = { "", "P" };
665 int best_path_index
= numFrames
% (X265_BFRAME_MAX
+ 1);
667 /* Perform the frametype analysis. */
668 for (int j
= 2; j
<= numFrames
; j
++)
670 slicetypePath(frames
, j
, best_paths
);
/* number of leading 'B' characters in the winning path */
673 numBFrames
= (int)strspn(best_paths
[best_path_index
], "B");
675 /* Load the results of the analysis into the frame types. */
676 for (int j
= 1; j
< numFrames
; j
++)
678 frames
[j
]->sliceType
= best_paths
[best_path_index
][j
- 1] == 'B' ? X265_TYPE_B
: X265_TYPE_P
;
681 frames
[numFrames
]->sliceType
= X265_TYPE_P
;
683 else if (m_param
->bFrameAdaptive
== X265_B_ADAPT_FAST
)
685 int64_t cost1p0
, cost2p0
, cost1b1
, cost2p1
;
687 for (int i
= 0; i
<= numFrames
- 2; )
689 cost2p1
= m_est
.estimateFrameCost(frames
, i
+ 0, i
+ 2, i
+ 2, 1);
690 if (frames
[i
+ 2]->intraMbs
[2] > cuCount
/ 2)
692 frames
[i
+ 1]->sliceType
= X265_TYPE_P
;
693 frames
[i
+ 2]->sliceType
= X265_TYPE_P
;
698 cost1b1
= m_est
.estimateFrameCost(frames
, i
+ 0, i
+ 2, i
+ 1, 0);
699 cost1p0
= m_est
.estimateFrameCost(frames
, i
+ 0, i
+ 1, i
+ 1, 0);
700 cost2p0
= m_est
.estimateFrameCost(frames
, i
+ 1, i
+ 2, i
+ 2, 0);
/* PP is cheaper than BP for this pair of frames */
702 if (cost1p0
+ cost2p0
< cost1b1
+ cost2p1
)
704 frames
[i
+ 1]->sliceType
= X265_TYPE_P
;
709 // arbitrary and untuned
710 #define INTER_THRESH 300
711 #define P_SENS_BIAS (50 - m_param->bFrameBias)
712 frames
[i
+ 1]->sliceType
= X265_TYPE_B
;
715 for (j
= i
+ 2; j
<= X265_MIN(i
+ m_param
->bframes
, numFrames
- 1); j
++)
717 int64_t pthresh
= X265_MAX(INTER_THRESH
- P_SENS_BIAS
* (j
- i
- 1), INTER_THRESH
/ 10);
718 int64_t pcost
= m_est
.estimateFrameCost(frames
, i
+ 0, j
+ 1, j
+ 1, 1);
719 if (pcost
> pthresh
* cuCount
|| frames
[j
+ 1]->intraMbs
[j
- i
+ 1] > cuCount
/ 3)
721 frames
[j
]->sliceType
= X265_TYPE_B
;
724 frames
[j
]->sliceType
= X265_TYPE_P
;
727 frames
[numFrames
]->sliceType
= X265_TYPE_P
;
729 while (numBFrames
< numFrames
&& frames
[numBFrames
+ 1]->sliceType
== X265_TYPE_B
)
736 numBFrames
= X265_MIN(numFrames
- 1, m_param
->bframes
);
737 for (int j
= 1; j
< numFrames
; j
++)
739 frames
[j
]->sliceType
= (j
% (numBFrames
+ 1)) ? X265_TYPE_B
: X265_TYPE_P
;
742 frames
[numFrames
]->sliceType
= X265_TYPE_P
;
744 /* Check scenecut on the first minigop. */
745 for (int j
= 1; j
< numBFrames
+ 1; j
++)
747 if (m_param
->scenecutThreshold
&& scenecut(frames
, j
, j
+ 1, false, origNumFrames
, maxSearch
))
749 frames
[j
]->sliceType
= X265_TYPE_P
;
755 resetStart
= bKeyframe
? 1 : X265_MIN(numBFrames
+ 2, numAnalyzed
+ 1);
759 for (int j
= 1; j
<= numFrames
; j
++)
761 frames
[j
]->sliceType
= X265_TYPE_P
;
764 resetStart
= bKeyframe
? 1 : 2;
767 if (m_param
->rc
.cuTree
)
768 cuTree(frames
, X265_MIN(numFrames
, m_param
->keyframeMax
), bKeyframe
);
770 // if (!param->bIntraRefresh)
771 for (int j
= keyintLimit
+ 1; j
<= numFrames
; j
+= m_param
->keyframeMax
)
773 frames
[j
]->sliceType
= X265_TYPE_I
;
774 resetStart
= X265_MIN(resetStart
, j
+ 1);
778 vbvLookahead(frames
, numFrames
, bKeyframe
);
780 /* Restore frametypes for all frames that haven't actually been decided yet. */
781 for (int j
= resetStart
; j
<= numFrames
; j
++)
783 frames
[j
]->sliceType
= X265_TYPE_AUTO
;
/* Scene-cut test between frames[p0] and frames[p1] with flash suppression.
 *
 * When bRealScenecut is set and B-frames are enabled, the frames up to maxp1
 * are examined first so that short flashes are not coded as scene cuts:
 * frames that turn out to belong to a flash have bScenecut cleared, and a
 * frame whose bScenecut is cleared is ignored by the check that follows.
 * The final answer comes from scenecutInternal(frames, p0, p1, bRealScenecut).
 *
 * numFrames - upper bound for the flash look-ahead window
 * maxSearch - lookahead search limit; when the untruncated window
 *             (origmaxp1) exceeds it, every examined frame has bScenecut
 *             cleared */
787 bool Lookahead::scenecut(Lowres
**frames
, int p0
, int p1
, bool bRealScenecut
, int numFrames
, int maxSearch
)
789 /* Only do analysis during a normal scenecut check. */
790 if (bRealScenecut
&& m_param
->bframes
)
792 int origmaxp1
= p0
+ 1;
793 /* Look ahead to avoid coding short flashes as scenecuts. */
794 if (m_param
->bFrameAdaptive
== X265_B_ADAPT_TRELLIS
)
795 /* Don't analyse any more frames than the trellis would have covered. */
796 origmaxp1
+= m_param
->bframes
;
799 int maxp1
= X265_MIN(origmaxp1
, numFrames
);
801 /* Where A and B are scenes: AAAAAABBBAAAAAA
802 * If BBB is shorter than (maxp1-p0), it is detected as a flash
803 * and not considered a scenecut. */
804 for (int cp1
= p1
; cp1
<= maxp1
; cp1
++)
806 if (!scenecutInternal(frames
, p0
, cp1
, false))
807 /* Any frame in between p0 and cur_p1 cannot be a real scenecut. */
808 for (int i
= cp1
; i
> p0
; i
--)
810 frames
[i
]->bScenecut
= false;
814 /* Where A-F are scenes: AAAAABBCCDDEEFFFFFF
815 * If each of BB ... EE are shorter than (maxp1-p0), they are
816 * detected as flashes and not considered scenecuts.
817 * Instead, the first F frame becomes a scenecut.
818 * If the video ends before F, no frame becomes a scenecut. */
819 for (int cp0
= p0
; cp0
<= maxp1
; cp0
++)
821 if (origmaxp1
> maxSearch
|| (cp0
< maxp1
&& scenecutInternal(frames
, cp0
, maxp1
, false)))
822 /* If cur_p0 is the p0 of a scenecut, it cannot be the p1 of a scenecut. */
823 frames
[cp0
]->bScenecut
= false;
827 /* Ignore frames that are part of a flash, i.e. cannot be real scenecuts. */
828 if (!frames
[p1
]->bScenecut
)
830 return scenecutInternal(frames
, p0
, p1
, bRealScenecut
);
/* Core scene-cut decision for frames[p1] predicted from frames[p0].
 *
 * The inter cost (costEst[p1 - p0][0]) is compared against the intra cost
 * (costEst[0][0]) of the same frame: a cut is reported when
 * pcost >= (1 - bias) * icost. The bias grows with the distance from the last
 * keyframe (gopSize): a quarter of threshMin for very short GOPs, a linear
 * ramp up to threshMin at keyframeMin, and an interpolation towards threshMax
 * (scenecutThreshold / 100) between keyframeMin and keyframeMax.
 *
 * NOTE(review): when keyframeMin == keyframeMax the interpolation divides by
 * (keyframeMax - keyframeMin) == 0 - confirm the resulting bias is acceptable
 * for gopSize > keyframeMin.
 *
 * When a cut is detected and bRealScenecut is set, the decision is logged at
 * debug level. icost and pcost are int64_t, so they are printed with %lld and
 * passed as long long to match the variadic argument types. */
833 bool Lookahead::scenecutInternal(Lowres
**frames
, int p0
, int p1
, bool bRealScenecut
)
835 Lowres
*frame
= frames
[p1
];
837 m_est
.estimateFrameCost(frames
, p0
, p1
, p1
, 0);
839 int64_t icost
= frame
->costEst
[0][0];
840 int64_t pcost
= frame
->costEst
[p1
- p0
][0];
841 int gopSize
= frame
->frameNum
- m_lastKeyframe
;
842 float threshMax
= (float)(m_param
->scenecutThreshold
/ 100.0);
844 /* magic numbers pulled out of thin air */
845 float threshMin
= (float)(threshMax
* 0.25);
848 if (m_param
->keyframeMin
== m_param
->keyframeMax
)
849 threshMin
= threshMax
;
850 if (gopSize
<= m_param
->keyframeMin
/ 4)
851 bias
= threshMin
/ 4;
852 else if (gopSize
<= m_param
->keyframeMin
)
853 bias
= threshMin
* gopSize
/ m_param
->keyframeMin
;
857 + (threshMax
- threshMin
)
858 * (gopSize
- m_param
->keyframeMin
)
859 / (m_param
->keyframeMax
- m_param
->keyframeMin
);
862 bool res
= pcost
>= (1.0 - bias
) * icost
;
863 if (res
&& bRealScenecut
)
865 int imb
= frame
->intraMbs
[p1
- p0
];
866 int pmb
= NUM_CUS
- imb
;
867 x265_log(m_param
, X265_LOG_DEBUG
, "scene cut at %d Icost:%lld Pcost:%lld ratio:%.4f bias:%.4f gop:%d (imb:%d pmb:%d)\n",
868 frame
->frameNum
, (long long)icost
, (long long)pcost
, 1. - (double)pcost
/ icost
, bias
, gopSize
, imb
, pmb
);
/* One step of the trellis B-frame search: finds the cheapest frame-type
 * string of the given length and stores it in
 * best_paths[length % (X265_BFRAME_MAX + 1)].
 *
 * Each candidate is built from the best path already known for a shorter
 * length (len = length - path - 1), followed by `path` 'B' characters and a
 * terminating "P". Candidates are costed with slicetypePathCost(), which is
 * given the best cost so far as its early-out threshold. Two scratch buffers
 * are used alternately; the final memcpy reads paths[idx ^ 1], i.e. the
 * buffer that is not the current scratch slot.
 *
 * frames     - lowres frames being analysed
 * length     - number of frames the path must cover
 * best_paths - table of best paths indexed by length modulo
 *              (X265_BFRAME_MAX + 1); read for shorter lengths, written for
 *              this one */
873 void Lookahead::slicetypePath(Lowres
**frames
, int length
, char(*best_paths
)[X265_LOOKAHEAD_MAX
+ 1])
875 char paths
[2][X265_LOOKAHEAD_MAX
+ 1];
876 int num_paths
= X265_MIN(m_param
->bframes
+ 1, length
);
877 int64_t best_cost
= 1LL << 62;
880 /* Iterate over all currently possible paths */
881 for (int path
= 0; path
< num_paths
; path
++)
883 /* Add suffixes to the current path */
884 int len
= length
- (path
+ 1);
885 memcpy(paths
[idx
], best_paths
[len
% (X265_BFRAME_MAX
+ 1)], len
);
886 memset(paths
[idx
] + len
, 'B', path
);
887 strcpy(paths
[idx
] + len
+ path
, "P");
889 /* Calculate the actual cost of the current path */
890 int64_t cost
= slicetypePathCost(frames
, paths
[idx
], best_cost
);
891 if (cost
< best_cost
)
898 /* Store the best path. */
899 memcpy(best_paths
[length
% (X265_BFRAME_MAX
+ 1)], paths
[idx
^ 1], length
);
/* Sums the estimated cost of coding frames[] with the frame types given by
 * `path` (a string of 'B' / 'P' characters).
 *
 * The path is walked one mini-GOP at a time: the next 'P' is located, its
 * cost is added, and then the costs of the B-frames between cur_p and next_p
 * are added. With B-pyramid and more than one B-frame in the gap, the middle
 * frame is costed against (cur_p, next_p) and the B-frames on either side
 * against the middle frame. Accumulation stops early once the running cost
 * exceeds `threshold` (the best complete path cost seen so far).
 *
 * path is decremented on entry because its first character describes
 * frames[1], not frames[0]. */
902 int64_t Lookahead::slicetypePathCost(Lowres
**frames
, char *path
, int64_t threshold
)
908 path
--; /* Since the 1st path element is really the second frame */
912 /* Find the location of the next P-frame. */
913 while (path
[next_p
] != 'P')
918 /* Add the cost of the P-frame found above */
919 cost
+= m_est
.estimateFrameCost(frames
, cur_p
, next_p
, next_p
, 0);
920 /* Early terminate if the cost we have found is larger than the best path cost so far */
921 if (cost
> threshold
)
924 if (m_param
->bBPyramid
&& next_p
- cur_p
> 2)
926 int middle
= cur_p
+ (next_p
- cur_p
) / 2;
927 cost
+= m_est
.estimateFrameCost(frames
, cur_p
, next_p
, middle
, 0);
928 for (int next_b
= loc
; next_b
< middle
&& cost
< threshold
; next_b
++)
930 cost
+= m_est
.estimateFrameCost(frames
, cur_p
, middle
, next_b
, 0);
933 for (int next_b
= middle
+ 1; next_b
< next_p
&& cost
< threshold
; next_b
++)
935 cost
+= m_est
.estimateFrameCost(frames
, middle
, next_p
, next_b
, 0);
940 for (int next_b
= loc
; next_b
< next_p
&& cost
< threshold
; next_b
++)
942 cost
+= m_est
.estimateFrameCost(frames
, cur_p
, next_p
, next_b
, 0);
/* Runs the cuTree propagation over frames[0..numframes].
 *
 * Working backwards from the last non-B frame, each mini-GOP
 * (curnonb .. lastnonb) has its frame costs estimated and its per-CU
 * propagate costs pushed to its references via estimateCUPropagate():
 * B-frames first (split around a middle B-ref when B-pyramid is on and the
 * gap holds more than one B-frame), then the closing non-B frame.
 * propagateCost buffers are zeroed before they start accumulating. When
 * lookaheadDepth is 0 the propagate buffers of frames[0] and
 * frames[lastnonb] are swapped, so the cost carries over between calls as
 * described in the comment below. cuTreeFinish() is called last for the
 * final non-B frame and, with B-pyramid but no VBV, for the middle B-ref.
 *
 * The average frame duration is fpsDenom / fpsNum, since every frame adds the
 * same duration to totalDuration.
 *
 * frames    - frames[0] is the last decided non-B frame
 * numframes - index of the last frame to include
 * bIntra    - set for the keyframe analysis pass */
953 void Lookahead::cuTree(Lowres
**frames
, int numframes
, bool bIntra
)
956 int lastnonb
, curnonb
= 1;
960 double totalDuration
= 0.0;
961 for (int j
= 0; j
<= numframes
; j
++)
962 totalDuration
+= (double)m_param
->fpsDenom
/ m_param
->fpsNum
;
964 double averageDuration
= totalDuration
/ (numframes
+ 1);
967 int cuCount
= m_widthInCU
* m_heightInCU
;
970 m_est
.estimateFrameCost(frames
, 0, 0, 0, 0);
972 while (i
> 0 && frames
[i
]->sliceType
== X265_TYPE_B
)
977 /* Lookaheadless MB-tree is not a theoretically distinct case; the same extrapolation could
978 * be applied to the end of a lookahead buffer of any size. However, it's most needed when
979 * lookahead=0, so that's what's currently implemented. */
980 if (!m_param
->lookaheadDepth
)
984 memset(frames
[0]->propagateCost
, 0, cuCount
* sizeof(uint16_t));
985 memcpy(frames
[0]->qpCuTreeOffset
, frames
[0]->qpAqOffset
, cuCount
* sizeof(double));
988 std::swap(frames
[lastnonb
]->propagateCost
, frames
[0]->propagateCost
);
989 memset(frames
[0]->propagateCost
, 0, cuCount
* sizeof(uint16_t));
995 memset(frames
[lastnonb
]->propagateCost
, 0, cuCount
* sizeof(uint16_t));
/* step back to the previous non-B frame */
1001 while (frames
[curnonb
]->sliceType
== X265_TYPE_B
&& curnonb
> 0)
1007 m_est
.estimateFrameCost(frames
, curnonb
, lastnonb
, lastnonb
, 0);
1008 memset(frames
[curnonb
]->propagateCost
, 0, cuCount
* sizeof(uint16_t));
1009 bframes
= lastnonb
- curnonb
- 1;
1010 if (m_param
->bBPyramid
&& bframes
> 1)
1012 int middle
= (bframes
+ 1) / 2 + curnonb
;
1013 m_est
.estimateFrameCost(frames
, curnonb
, lastnonb
, middle
, 0);
1014 memset(frames
[middle
]->propagateCost
, 0, cuCount
* sizeof(uint16_t));
/* B-frames before the middle B-ref reference (curnonb, middle); those after
 * it reference (middle, lastnonb) */
1017 int p0
= i
> middle
? middle
: curnonb
;
1018 int p1
= i
< middle
? middle
: lastnonb
;
1021 m_est
.estimateFrameCost(frames
, p0
, p1
, i
, 0);
1022 estimateCUPropagate(frames
, averageDuration
, p0
, p1
, i
, 0);
1027 estimateCUPropagate(frames
, averageDuration
, curnonb
, lastnonb
, middle
, 1);
1033 m_est
.estimateFrameCost(frames
, curnonb
, lastnonb
, i
, 0);
1034 estimateCUPropagate(frames
, averageDuration
, curnonb
, lastnonb
, i
, 0);
1038 estimateCUPropagate(frames
, averageDuration
, curnonb
, lastnonb
, lastnonb
, 1);
1042 if (!m_param
->lookaheadDepth
)
1044 m_est
.estimateFrameCost(frames
, 0, lastnonb
, lastnonb
, 0);
1045 estimateCUPropagate(frames
, averageDuration
, 0, lastnonb
, lastnonb
, 1);
1046 std::swap(frames
[lastnonb
]->propagateCost
, frames
[0]->propagateCost
);
1049 cuTreeFinish(frames
[lastnonb
], averageDuration
, lastnonb
);
1050 if (m_param
->bBPyramid
&& bframes
> 1 && !m_param
->rc
.vbvBufferSize
)
1051 cuTreeFinish(frames
[lastnonb
+ (bframes
+ 1) / 2], averageDuration
, 0);
/* Propagates the per-CU cost of frame b back onto its references p0 and p1.
 *
 * For every CU row, primitives.propagateCost() fills m_scratch with the
 * amount each CU of frame b passes on. Each positive amount is then
 * distributed along the CU's motion vectors:
 *  - lists_used (the bits above LOWRES_COST_SHIFT in lowresCosts) selects
 *    list 0, list 1 or both; with both, the amount is split by the bi-pred
 *    weights (out of 64);
 *  - a zero MV adds the whole amount to the co-located CU of the reference;
 *  - otherwise the MV is split as x >> 5 / y >> 5 for the CU offset, and the
 *    amount is shared between the four CUs the displaced block overlaps,
 *    using bilinear weights out of 1024. Target CUs outside the frame are
 *    skipped.
 * Every addition saturates at 65535 (CLIP_ADD) because propagateCost is
 * stored as uint16_t.
 *
 * When VBV lookahead is active and `referenced` is set, cuTreeFinish() is run
 * on frame b straight away.
 *
 * frames          - lowres frame array
 * averageDuration - average frame duration, used for the fps scaling factor
 * p0, p1, b       - indices of the two references and the current frame
 * referenced      - non-zero when frame b is itself used as a reference */
1054 void Lookahead::estimateCUPropagate(Lowres
**frames
, double averageDuration
, int p0
, int p1
, int b
, int referenced
)
1056 uint16_t *refCosts
[2] = { frames
[p0
]->propagateCost
, frames
[p1
]->propagateCost
};
/* temporal position of b between p0 and p1 in 1/256 units, rounded */
1057 int32_t distScaleFactor
= (((b
- p0
) << 8) + ((p1
- p0
) >> 1)) / (p1
- p0
);
1058 int32_t bipredWeight
= m_param
->bEnableWeightedBiPred
? 64 - (distScaleFactor
>> 2) : 32;
1059 MV
*mvs
[2] = { frames
[b
]->lowresMvs
[0][b
- p0
- 1], frames
[b
]->lowresMvs
[1][p1
- b
- 1] };
1060 int32_t bipredWeights
[2] = { bipredWeight
, 64 - bipredWeight
};
1062 memset(m_scratch
, 0, m_widthInCU
* sizeof(int));
1064 uint16_t *propagateCost
= frames
[b
]->propagateCost
;
1067 double fpsFactor
= CLIP_DURATION((double)m_param
->fpsDenom
/ m_param
->fpsNum
) / CLIP_DURATION(averageDuration
);
1069 /* For non-referred frames the source costs are always zero, so just memset one row and re-use it. */
1071 memset(frames
[b
]->propagateCost
, 0, m_widthInCU
* sizeof(uint16_t));
1073 int32_t StrideInCU
= m_widthInCU
;
1074 for (uint16_t blocky
= 0; blocky
< m_heightInCU
; blocky
++)
1076 int cuIndex
= blocky
* StrideInCU
;
1077 primitives
.propagateCost(m_scratch
, propagateCost
,
1078 frames
[b
]->intraCost
+ cuIndex
, frames
[b
]->lowresCosts
[b
- p0
][p1
- b
] + cuIndex
,
1079 frames
[b
]->invQscaleFactor
+ cuIndex
, &fpsFactor
, m_widthInCU
);
1082 propagateCost
+= m_widthInCU
;
1083 for (uint16_t blockx
= 0; blockx
< m_widthInCU
; blockx
++, cuIndex
++)
1085 int32_t propagate_amount
= m_scratch
[blockx
];
1086 /* Don't propagate for an intra block. */
1087 if (propagate_amount
> 0)
1089 /* Access width-2 bitfield. */
1090 int32_t lists_used
= frames
[b
]->lowresCosts
[b
- p0
][p1
- b
][cuIndex
] >> LOWRES_COST_SHIFT
;
1091 /* Follow the MVs to the previous frame(s). */
1092 for (uint16_t list
= 0; list
< 2; list
++)
1094 if ((lists_used
>> list
) & 1)
1096 #define CLIP_ADD(s, x) (s) = (uint16_t)X265_MIN((s) + (x), (1 << 16) - 1)
1097 int32_t listamount
= propagate_amount
;
1098 /* Apply bipred weighting. */
1099 if (lists_used
== 3)
1100 listamount
= (listamount
* bipredWeights
[list
] + 32) >> 6;
1102 /* Early termination for simple case of mv0. */
1103 if (!mvs
[list
][cuIndex
].word
)
1105 CLIP_ADD(refCosts
[list
][cuIndex
], listamount
);
1109 int32_t x
= mvs
[list
][cuIndex
].x
;
1110 int32_t y
= mvs
[list
][cuIndex
].y
;
1111 int32_t cux
= (x
>> 5) + blockx
;
1112 int32_t cuy
= (y
>> 5) + blocky
;
1113 int32_t idx0
= cux
+ cuy
* StrideInCU
;
1114 int32_t idx1
= idx0
+ 1;
1115 int32_t idx2
= idx0
+ StrideInCU
;
1116 int32_t idx3
= idx0
+ StrideInCU
+ 1;
1119 int32_t idx0weight
= (32 - y
) * (32 - x
);
1120 int32_t idx1weight
= (32 - y
) * x
;
1121 int32_t idx2weight
= y
* (32 - x
);
1122 int32_t idx3weight
= y
* x
;
1124 /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't
1126 if (cux
< m_widthInCU
- 1 && cuy
< m_heightInCU
- 1 && cux
>= 0 && cuy
>= 0)
1128 CLIP_ADD(refCosts
[list
][idx0
], (listamount
* idx0weight
+ 512) >> 10);
1129 CLIP_ADD(refCosts
[list
][idx1
], (listamount
* idx1weight
+ 512) >> 10);
1130 CLIP_ADD(refCosts
[list
][idx2
], (listamount
* idx2weight
+ 512) >> 10);
1131 CLIP_ADD(refCosts
[list
][idx3
], (listamount
* idx3weight
+ 512) >> 10);
1133 else /* Check offsets individually */
1135 if (cux
< m_widthInCU
&& cuy
< m_heightInCU
&& cux
>= 0 && cuy
>= 0)
1136 CLIP_ADD(refCosts
[list
][idx0
], (listamount
* idx0weight
+ 512) >> 10);
1137 if (cux
+ 1 < m_widthInCU
&& cuy
< m_heightInCU
&& cux
+ 1 >= 0 && cuy
>= 0)
1138 CLIP_ADD(refCosts
[list
][idx1
], (listamount
* idx1weight
+ 512) >> 10);
1139 if (cux
< m_widthInCU
&& cuy
+ 1 < m_heightInCU
&& cux
>= 0 && cuy
+ 1 >= 0)
1140 CLIP_ADD(refCosts
[list
][idx2
], (listamount
* idx2weight
+ 512) >> 10);
1141 if (cux
+ 1 < m_widthInCU
&& cuy
+ 1 < m_heightInCU
&& cux
+ 1 >= 0 && cuy
+ 1 >= 0)
1142 CLIP_ADD(refCosts
[list
][idx3
], (listamount
* idx3weight
+ 512) >> 10);
1150 if (m_param
->rc
.vbvBufferSize
&& m_param
->lookaheadDepth
&& referenced
)
1151 cuTreeFinish(frames
[b
], averageDuration
, b
== p1
? b
- p0
: 0);
/* Lookahead::cuTreeFinish
 * Converts the propagateCost accumulated for `frame` into per-CU QP offsets
 * (frame->qpCuTreeOffset).
 *
 * frame           - lowres frame whose qpCuTreeOffset[] entries are written
 * averageDuration - clipped with CLIP_DURATION and divided by the clipped
 *                   nominal frame duration (fpsDenom / fpsNum); the ratio is
 *                   multiplied by 256 and truncated to an int scale factor
 * ref0Distance    - when non-zero, selects weightedCostDelta[ref0Distance - 1];
 *                   a positive entry contributes (1.0 - entry) to every CU
 *
 * For each of the m_widthInCU * m_heightInCU CUs the visible statements compute
 *   qpCuTreeOffset = qpAqOffset
 *                    - strength * (log2(intra + propagate) - log2(intra) + weightdelta)
 * with strength = 5.0 * (1.0 - rc.qCompress).
 *
 * NOTE(review): this listing has gaps in its line numbering between the
 * intracost computation and its use; any guard against intracost == 0 before
 * X265_LOG2(intracost) would sit there - confirm against the full file. */
1154 void Lookahead::cuTreeFinish(Lowres
*frame
, double averageDuration
, int ref0Distance
)
// Duration ratio as an integer scaled by 256; undone below with (+128) >> 8.
1156 int fpsFactor
= (int)(CLIP_DURATION(averageDuration
) / CLIP_DURATION((double)m_param
->fpsDenom
/ m_param
->fpsNum
) * 256);
1157 double weightdelta
= 0.0;
// Only a strictly positive stored delta changes the result.
1159 if (ref0Distance
&& frame
->weightedCostDelta
[ref0Distance
- 1] > 0)
1160 weightdelta
= (1.0 - frame
->weightedCostDelta
[ref0Distance
- 1]);
1162 /* Allow the strength to be adjusted via qcompress, since the two
1163 * concepts are very similar. */
1165 int cuCount
= m_widthInCU
* m_heightInCU
;
1166 double strength
= 5.0 * (1.0 - m_param
->rc
.qCompress
);
1168 for (int cuIndex
= 0; cuIndex
< cuCount
; cuIndex
++)
// Intra cost multiplied by invQscaleFactor, then rounded division by 256.
1170 int intracost
= (frame
->intraCost
[cuIndex
] * frame
->invQscaleFactor
[cuIndex
] + 128) >> 8;
// Propagated cost multiplied by fpsFactor, then rounded division by 256.
1173 int propagateCost
= (frame
->propagateCost
[cuIndex
] * fpsFactor
+ 128) >> 8;
1174 double log2_ratio
= X265_LOG2(intracost
+ propagateCost
) - X265_LOG2(intracost
) + weightdelta
;
// A larger ratio lowers the offset relative to the AQ offset.
1175 frame
->qpCuTreeOffset
[cuIndex
] = frame
->qpAqOffset
[cuIndex
] - strength
* log2_ratio
;
/* Lookahead::frameCostRecalculate
 * Rebuilds the cost of frame `b` (predicted from p0 / p1) from the per-CU
 * costs already stored in lowresCosts[b - p0][p1 - b], applying a per-CU QP
 * offset instead of repeating the motion search.
 *
 * frames - lowres frame array indexed by p0, p1 and b
 * p0, p1 - reference indices selecting the cost tables
 * b      - index of the frame being re-costed
 *
 * Visible behaviour:
 *  - the offset table is qpAqOffset for X265_TYPE_B slices and
 *    qpCuTreeOffset for every other slice type;
 *  - CUs are walked bottom-to-top and right-to-left;
 *  - each cost is masked with LOWRES_COST_MASK, multiplied by
 *    x265_exp2fix8(offset), divided by 256 with rounding, and added to
 *    rowSatd[cuy];
 *  - the final condition selects interior CUs, or every CU when the frame is
 *    at most 2 CUs wide or tall (the same rule as the NUM_CUS macro).
 *
 * NOTE(review): the statements guarded by that final condition and the return
 * value are not present in this listing - confirm against the full file. */
1180 /* If MB-tree changes the quantizers, we need to recalculate the frame cost without
1181 * re-running lookahead. */
1182 int64_t Lookahead::frameCostRecalculate(Lowres
** frames
, int p0
, int p1
, int b
)
1185 int *rowSatd
= frames
[b
]->rowSatds
[b
- p0
][p1
- b
];
// B slices use the AQ offsets; all other slice types use the cuTree offsets.
1186 double *qp_offset
= (frames
[b
]->sliceType
== X265_TYPE_B
) ? frames
[b
]->qpAqOffset
: frames
[b
]->qpCuTreeOffset
;
1189 for (int cuy
= m_heightInCU
- 1; cuy
>= 0; cuy
--)
1192 for (int cux
= m_widthInCU
- 1; cux
>= 0; cux
--)
1194 int cuxy
= cux
+ cuy
* m_widthInCU
;
// The bits above LOWRES_COST_MASK are dropped here; only the cost is kept.
1195 int cuCost
= frames
[b
]->lowresCosts
[b
- p0
][p1
- b
][cuxy
] & LOWRES_COST_MASK
;
1196 double qp_adj
= qp_offset
[cuxy
];
// Scale by x265_exp2fix8(qp_adj), then rounded division by 256.
1197 cuCost
= (cuCost
* x265_exp2fix8(qp_adj
) + 128) >> 8;
1198 rowSatd
[cuy
] += cuCost
;
// Interior CU, or a frame too small to have an interior.
1199 if ((cuy
> 0 && cuy
< m_heightInCU
- 1 &&
1200 cux
> 0 && cux
< m_widthInCU
- 1) ||
1201 m_widthInCU
<= 2 || m_heightInCU
<= 2)
/* CostEstimate constructor
 * Takes a ThreadPool pointer `p` and puts the estimator into an empty state:
 *  - all four m_wbuffer pointers are set to 0 (nothing allocated yet),
 *  - m_paddedLines, m_widthInCU and m_heightInCU are zeroed,
 *  - both m_bDoSearch flags are cleared,
 *  - the current b / p0 / p1 indices are zeroed,
 *  - m_bFrameCompleted is false.
 * NOTE(review): how `p` is consumed (presumably a base-class initializer) is
 * not visible in this listing - confirm against the full file. */
1211 CostEstimate::CostEstimate(ThreadPool
*p
)
1216 m_wbuffer
[0] = m_wbuffer
[1] = m_wbuffer
[2] = m_wbuffer
[3] = 0;
1218 m_paddedLines
= m_widthInCU
= m_heightInCU
= 0;
1219 m_bDoSearch
[0] = m_bDoSearch
[1] = false;
1220 m_curb
= m_curp0
= m_curp1
= 0;
1221 m_bFrameCompleted
= false;
/* CostEstimate destructor
 * Passes each of the four m_wbuffer pointers to x265_free. The constructor
 * sets them to 0, so entries never allocated by init() are null here.
 * NOTE(review): m_rows is allocated with new[] in init(); its release is not
 * visible in this listing - confirm it is deleted in the full file. */
1224 CostEstimate::~CostEstimate()
1226 for (int i
= 0; i
< 4; i
++)
1228 x265_free(m_wbuffer
[i
]);
/* CostEstimate::init
 * Sizes the estimator for the stream and allocates its working storage.
 *
 * _param   - encoder parameters (not referenced by any visible statement;
 *            NOTE(review): presumably stored into m_param on a line missing
 *            from this listing - confirm)
 * curFrame - frame whose lowres plane geometry (lines, lumaStride) and
 *            original-picture luma margins size the weighted buffers
 *
 * Visible steps:
 *  1. m_widthInCU / m_heightInCU are half the source dimensions, rounded up
 *     to a whole number of X265_LOWRES_CU_SIZE units.
 *  2. One EstimateRow per CU row is allocated with new[] and given the CU
 *     dimensions and m_param.
 *  3. WaveFront::init(m_heightInCU) is called; on success all rows are enabled.
 *  4. With bEnableWeightedPred, four buffers of lumaStride * m_paddedLines
 *     pixels are allocated, and m_weightedRef.lowresPlane[i] points
 *     `padoffset` pixels into each (lumaStride * m_lumaMarginY + m_lumaMarginX).
 *     m_weightedRef is marked lowres and initially not weighted.
 *
 * NOTE(review): no null check on the x265_malloc results is visible before
 * padoffset is added - confirm allocation failure is handled. */
1234 void CostEstimate::init(x265_param
*_param
, Frame
*curFrame
)
// Half-resolution dimensions, rounded up to whole lowres CUs.
1237 m_widthInCU
= ((m_param
->sourceWidth
/ 2) + X265_LOWRES_CU_SIZE
- 1) >> X265_LOWRES_CU_BITS
;
1238 m_heightInCU
= ((m_param
->sourceHeight
/ 2) + X265_LOWRES_CU_SIZE
- 1) >> X265_LOWRES_CU_BITS
;
1240 m_rows
= new EstimateRow
[m_heightInCU
];
1241 for (int i
= 0; i
< m_heightInCU
; i
++)
1243 m_rows
[i
].m_widthInCU
= m_widthInCU
;
1244 m_rows
[i
].m_heightInCU
= m_heightInCU
;
1245 m_rows
[i
].m_param
= m_param
;
1248 if (WaveFront::init(m_heightInCU
))
1249 WaveFront::enableAllRows();
1253 if (m_param
->bEnableWeightedPred
)
1255 PicYuv
*orig
= curFrame
->m_fencPic
;
// Lowres plane height plus the luma margin above and below.
1256 m_paddedLines
= curFrame
->m_lowres
.lines
+ 2 * orig
->m_lumaMarginY
;
// Offset from a buffer's start to the first pixel inside the margins.
1257 intptr_t padoffset
= curFrame
->m_lowres
.lumaStride
* orig
->m_lumaMarginY
+ orig
->m_lumaMarginX
;
1259 /* allocate weighted lowres buffers */
1260 for (int i
= 0; i
< 4; i
++)
1262 m_wbuffer
[i
] = (pixel
*)x265_malloc(sizeof(pixel
) * (curFrame
->m_lowres
.lumaStride
* m_paddedLines
));
1263 m_weightedRef
.lowresPlane
[i
] = m_wbuffer
[i
] + padoffset
;
1266 m_weightedRef
.fpelPlane
[0] = m_weightedRef
.lowresPlane
[0];
1267 m_weightedRef
.lumaStride
= curFrame
->m_lowres
.lumaStride
;
1268 m_weightedRef
.isLowres
= true;
1269 m_weightedRef
.isWeighted
= false;
/* CostEstimate::estimateFrameCost
 * Estimates the cost of coding frames[b] with references frames[p0]
 * (list 0) and frames[p1] (list 1). b == p0 == p1 requests the intra cost.
 *
 * frames        - lowres frame array
 * p0, p1, b     - indices of the two references and of the current frame
 * bIntraPenalty - requests the extra intra-block penalty applied at the end
 *                 (the condition that tests it is not visible in this listing)
 *
 * Visible flow:
 *  1. If costEst[b - p0][p1 - b] is already >= 0 and the first rowSatds entry
 *     is not -1, the cached cost is reused.
 *  2. Otherwise, with weighted prediction enabled, b == p1, b != p0 and the
 *     list-0 MVs still carrying the 0x7FFF marker, the intra cost is computed
 *     first (recursive call with p0 = p1 = b) when missing, then
 *     weightsAnalyse() runs.
 *  3. m_bDoSearch[list] is set for each list whose MVs still carry the 0x7FFF
 *     marker, and the marker is then cleared.
 *  4. Accumulators are zeroed and every CU row is processed, either through
 *     the WaveFront job queue or through a plain loop over processRow().
 *  5. Row results are summed into score, costEst / costEstAq and intraMbs.
 *  6. score is scaled by 100 / (130 + bFrameBias), stored for non-intra
 *     estimates, and increased by score * intraMbs / (ncu * 8).
 *
 * NOTE(review): the conditions choosing between the branches in steps 1, 4
 * and 6, the declarations of `score` and `ncu`, and the return statement are
 * not present in this listing - confirm against the full file. */
1273 int64_t CostEstimate::estimateFrameCost(Lowres
**frames
, int p0
, int p1
, int b
, bool bIntraPenalty
)
1276 Lowres
*fenc
= frames
[b
];
// Cached result available for this (p0, p1, b) combination.
1278 if (fenc
->costEst
[b
- p0
][p1
- b
] >= 0 && fenc
->rowSatds
[b
- p0
][p1
- b
][0] != -1)
1279 score
= fenc
->costEst
[b
- p0
][p1
- b
];
1282 m_weightedRef
.isWeighted
= false;
// 0x7FFF in the first MV's x component marks this reference as not yet searched.
1283 if (m_param
->bEnableWeightedPred
&& b
== p1
&& b
!= p0
&& fenc
->lowresMvs
[0][b
- p0
- 1][0].x
== 0x7FFF)
// weightCostLuma() reads fenc->intraCost, so intra costs are computed first.
1285 if (!fenc
->bIntraCalculated
)
1286 estimateFrameCost(frames
, b
, b
, b
, 0);
1287 weightsAnalyse(frames
, b
, p0
);
1290 /* For each list, check to see whether we have lowres motion-searched this reference */
1291 m_bDoSearch
[0] = b
!= p0
&& fenc
->lowresMvs
[0][b
- p0
- 1][0].x
== 0x7FFF;
1292 m_bDoSearch
[1] = b
!= p1
&& fenc
->lowresMvs
[1][p1
- b
- 1][0].x
== 0x7FFF;
// Clear the marker so the search is not repeated on a later call.
1294 if (m_bDoSearch
[0]) fenc
->lowresMvs
[0][b
- p0
- 1][0].x
= 0;
1295 if (m_bDoSearch
[1]) fenc
->lowresMvs
[1][p1
- b
- 1][0].x
= 0;
1300 m_curframes
= frames
;
1301 fenc
->costEst
[b
- p0
][p1
- b
] = 0;
1302 fenc
->costEstAq
[b
- p0
][p1
- b
] = 0;
1304 for (int i
= 0; i
< m_heightInCU
; i
++)
1307 if (!fenc
->bIntraCalculated
)
1308 fenc
->rowSatds
[0][0][i
] = 0;
1309 fenc
->rowSatds
[b
- p0
][p1
- b
][i
] = 0;
1312 m_bFrameCompleted
= false;
// Wavefront path: queue this object, help with jobs until the last row finishes.
1316 WaveFront::enqueue();
1318 // enableAllRows must be already called
1320 while (!m_bFrameCompleted
)
1321 WaveFront::findJob(-1);
1323 WaveFront::dequeue();
// Serial path: process every row in order on the calling thread.
1327 for (int row
= 0; row
< m_heightInCU
; row
++)
1328 processRow(row
, -1);
1333 // Accumulate cost from each row
1334 for (int row
= 0; row
< m_heightInCU
; row
++)
1336 score
+= m_rows
[row
].m_costEst
;
1337 fenc
->costEst
[0][0] += m_rows
[row
].m_costIntra
;
1338 if (m_param
->rc
.aqMode
)
1340 fenc
->costEstAq
[0][0] += m_rows
[row
].m_costIntraAq
;
1341 fenc
->costEstAq
[b
- p0
][p1
- b
] += m_rows
[row
].m_costEstAq
;
1343 fenc
->intraMbs
[b
- p0
] += m_rows
[row
].m_intraMbs
;
1346 fenc
->bIntraCalculated
= true;
// Scale the score down by 100 / (130 + bFrameBias).
1349 score
= (uint64_t)score
* 100 / (130 + m_param
->bFrameBias
);
1350 if (b
!= p0
|| b
!= p1
) //Not Intra cost
1351 fenc
->costEst
[b
- p0
][p1
- b
] = score
;
1356 // arbitrary penalty for I-blocks after B-frames
1358 score
+= (uint64_t)score
* fenc
->intraMbs
[b
- p0
] / (ncu
* 8);
/* CostEstimate::weightCostLuma
 * Measures how well reference frames[p0] predicts frames[b] on the lowres
 * luma plane, optionally after applying the weight in `wp`.
 *
 * frames - lowres frame array
 * b      - index of the frame being predicted
 * p0     - index of the reference frame
 * wp     - weight to evaluate; weightsAnalyse() also calls this with NULL to
 *          get the unweighted baseline
 *          (NOTE(review): the null test guarding the wp-> accesses is not
 *          visible in this listing - confirm against the full file)
 *
 * The weighted path writes ref->buffer[0] through primitives.weight_pp into
 * m_wbuffer[0] and reads the result back through m_weightedRef.fpelPlane[0].
 * The plane is then walked in 8x8 blocks; each block adds
 * min(SATD(src, fenc), fenc->intraCost[mb]) to the running cost.
 *
 * NOTE(review): the declarations of `cost` and `mb` and the return statement
 * are not present in this listing. */
1363 uint32_t CostEstimate::weightCostLuma(Lowres
**frames
, int b
, int p0
, WeightParam
*wp
)
1365 Lowres
*fenc
= frames
[b
];
1366 Lowres
*ref
= frames
[p0
];
1367 pixel
*src
= ref
->fpelPlane
[0];
1368 intptr_t stride
= fenc
->lumaStride
;
// The offset is shifted left by (X265_DEPTH - 8) before being applied.
1372 int offset
= wp
->inputOffset
<< (X265_DEPTH
- 8);
1373 int scale
= wp
->inputWeight
;
1374 int denom
= wp
->log2WeightDenom
;
// Half of 2^denom, or 0 when denom is 0.
1375 int round
= denom
? 1 << (denom
- 1) : 0;
1376 int correction
= IF_INTERNAL_PREC
- X265_DEPTH
; // intermediate interpolation depth
// The full stride is passed as the width argument.
1377 int widthHeight
= (int)stride
;
1379 primitives
.weight_pp(ref
->buffer
[0], m_wbuffer
[0], stride
, widthHeight
, m_paddedLines
,
1380 scale
, round
<< correction
, denom
+ correction
, offset
);
1381 src
= m_weightedRef
.fpelPlane
[0];
1385 intptr_t pixoff
= 0;
// pixoff is reset to the start of each 8-line band and advanced 8 pixels per block.
1388 for (int y
= 0; y
< fenc
->lines
; y
+= 8, pixoff
= y
* stride
)
1390 for (int x
= 0; x
< fenc
->width
; x
+= 8, mb
++, pixoff
+= 8)
1392 int satd
= primitives
.satd
[LUMA_8x8
](src
+ pixoff
, stride
, fenc
->fpelPlane
[0] + pixoff
, stride
);
// A block never costs more than its intra cost.
1393 cost
+= X265_MIN(satd
, fenc
->intraCost
[mb
]);
/* CostEstimate::weightsAnalyse
 * Decides whether a luma weight (scale and offset) on reference frames[p0]
 * improves prediction of frames[b]; if so it fills the four weighted lowres
 * buffers and marks m_weightedRef as weighted.
 *
 * frames - lowres frame array
 * b      - index of the frame being predicted
 * p0     - index of the list-0 reference
 *
 * Visible flow:
 *  1. guessScale = sqrt(fenc ssd / ref ssd) when both are non-zero; the frame
 *     means are wp_sum / (lines * width), divided by 2^(X265_DEPTH - 8).
 *  2. Early termination when the means differ by less than 0.5 and the scale
 *     is within 1/128 of 1.0.
 *  3. The guessed scale is stored in m_w with denominator 7 and the
 *     unweighted cost is measured (weightCostLuma with NULL).
 *  4. The offset is derived from the means; if it falls outside [-128, 127]
 *     it is clamped and the scale is recomputed and clamped to [0, 127].
 *  5. The candidate is costed and kept if cheaper (COPY4_IF_LT).
 *  6. The denominator is reduced while the scale is even.
 *  7. No weight is used when nothing was found, the weight is the identity,
 *     or the cost ratio exceeds 0.998; otherwise the weight is stored, the
 *     ratio is recorded in weightedCostDelta and all four reference planes
 *     are weighted into m_wbuffer.
 *
 * Fix in this revision: the ratio stored in weightedCostDelta is computed in
 * floating point. Both operands are unsigned int, so the previous plain
 * division truncated to 0 whenever minscore < origscore.
 *
 * NOTE(review): cuTreeFinish() reads weightedCostDelta[ref0Distance - 1]
 * while this function writes index frameNum(fenc) - frameNum(ref) with no
 * "- 1"; if the frame-number difference equals b - p0 the two indices do not
 * match - confirm the intended index.
 * NOTE(review): the rescale in step 4 divides by refMean with no visible
 * guard against refMean == 0.
 * NOTE(review): several declarations (fenc, ref, s, found), the else branch
 * for guessScale, the early returns and the body of the while loop are not
 * present in this listing. */
1400 void CostEstimate::weightsAnalyse(Lowres
**frames
, int b
, int p0
)
1402 static const float epsilon
= 1.f
/ 128.f
;
1407 int deltaIndex
= fenc
->frameNum
- ref
->frameNum
;
1409 /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
1410 float guessScale
, fencMean
, refMean
;
1412 if (fenc
->wp_ssd
[0] && ref
->wp_ssd
[0])
1413 guessScale
= sqrtf((float)fenc
->wp_ssd
[0] / ref
->wp_ssd
[0]);
// Means per pixel, divided by 2^(X265_DEPTH - 8).
1416 fencMean
= (float)fenc
->wp_sum
[0] / (fenc
->lines
* fenc
->width
) / (1 << (X265_DEPTH
- 8));
1417 refMean
= (float)ref
->wp_sum
[0] / (fenc
->lines
* fenc
->width
) / (1 << (X265_DEPTH
- 8));
1419 /* Early termination */
1420 if (fabsf(refMean
- fencMean
) < 0.5f
&& fabsf(1.f
- guessScale
) < epsilon
)
1423 int minoff
= 0, minscale
, mindenom
;
1424 unsigned int minscore
= 0, origscore
= 1;
// Scale expressed over a denominator of 128 (log2 denominator 7), offset 0.
1427 m_w
.setFromWeightAndOffset((int)(guessScale
* 128 + 0.5f
), 0, 7, true);
1428 mindenom
= m_w
.log2WeightDenom
;
1429 minscale
= m_w
.inputWeight
;
// Unweighted baseline cost.
1431 origscore
= minscore
= weightCostLuma(frames
, b
, p0
, NULL
);
1437 int curScale
= minscale
;
1438 int curOffset
= (int)(fencMean
- refMean
* curScale
/ (1 << mindenom
) + 0.5f
);
1439 if (curOffset
< -128 || curOffset
> 127)
1441 /* Rescale considering the constraints on curOffset. We do it in this order
1442 * because scale has a much wider range than offset (because of denom), so
1443 * it should almost never need to be clamped. */
1444 curOffset
= Clip3(-128, 127, curOffset
);
1445 curScale
= (int)((1 << mindenom
) * (fencMean
- curOffset
) / refMean
+ 0.5f
);
1446 curScale
= Clip3(0, 127, curScale
);
1448 SET_WEIGHT(m_w
, 1, curScale
, mindenom
, curOffset
);
1449 s
= weightCostLuma(frames
, b
, p0
, &m_w
);
1450 COPY4_IF_LT(minscore
, s
, minscale
, curScale
, minoff
, curOffset
, found
, 1);
1452 /* Use a smaller denominator if possible */
1453 while (mindenom
> 0 && !(minscale
& 1))
// Reject when nothing was found, the weight is the identity, or the ratio exceeds 0.998.
1459 if (!found
|| (minscale
== 1 << mindenom
&& minoff
== 0) || (float)minscore
/ origscore
> 0.998f
)
1463 SET_WEIGHT(m_w
, 1, minscale
, mindenom
, minoff
);
1464 // set weighted delta cost
// Cast before dividing: both operands are unsigned int, so the plain quotient
// truncated to 0 whenever minscore < origscore.
1465 fenc
->weightedCostDelta
[deltaIndex
] = (float)minscore
/ origscore
;
1467 int offset
= m_w
.inputOffset
<< (X265_DEPTH
- 8);
1468 int scale
= m_w
.inputWeight
;
1469 int denom
= m_w
.log2WeightDenom
;
1470 int round
= denom
? 1 << (denom
- 1) : 0;
1471 int correction
= IF_INTERNAL_PREC
- X265_DEPTH
; // intermediate interpolation depth
1472 intptr_t stride
= ref
->lumaStride
;
1473 int widthHeight
= (int)stride
;
// Weight all four reference planes so the weighted reference can replace the original.
1475 for (int i
= 0; i
< 4; i
++)
1476 primitives
.weight_pp(ref
->buffer
[i
], m_wbuffer
[i
], stride
, widthHeight
, m_paddedLines
,
1477 scale
, round
<< correction
, denom
+ correction
, offset
);
1479 m_weightedRef
.isWeighted
= true;
/* CostEstimate::processRow
 * Job body for one row of lowres CUs; the second (thread id) parameter is
 * unused.
 *
 * row - row index in processing order. The picture row actually estimated is
 *       m_heightInCU - 1 - row, so processing row 0 is the bottom picture row.
 *
 * Visible behaviour:
 *  - list 0 uses m_weightedRef when it is marked weighted, otherwise
 *    frames[m_curp0];
 *  - CUs are estimated right-to-left, resuming after the m_completed CUs
 *    already done for this row;
 *  - once this row has finished at least 2 CUs, the next row (row + 1) is
 *    activated and enqueued, under that row's lock, if it is inactive and at
 *    least 2 CUs behind;
 *  - under this row's own lock, the row is marked inactive when it is not the
 *    first row, has not yet completed m_widthInCU - 1 CUs, and the previous
 *    row is fewer than 2 CUs ahead;
 *  - reaching the last processing row sets m_bFrameCompleted.
 *
 * NOTE(review): the statement following m_active = false (presumably an early
 * return) and the exact nesting of the final completion test are not present
 * in this listing - confirm against the full file. */
1483 void CostEstimate::processRow(int row
, int /*threadId*/)
1485 ProfileScopeEvent(costEstimateRow
);
// Processing order is the reverse of picture order.
1487 int realrow
= m_heightInCU
- 1 - row
;
1488 Lowres
**frames
= m_curframes
;
1489 ReferencePlanes
*wfref0
= m_weightedRef
.isWeighted
? &m_weightedRef
: frames
[m_curp0
];
1491 /* Lowres lookahead goes backwards because the MVs are used as
1492 * predictors in the main encode. This considerably improves MV
1493 * prediction overall. */
1494 for (int i
= m_widthInCU
- 1 - m_rows
[row
].m_completed
; i
>= 0; i
--)
1496 // TODO: use lowres MVs as motion candidates in full-res search
1497 m_rows
[row
].estimateCUCost(frames
, wfref0
, i
, realrow
, m_curp0
, m_curp1
, m_curb
, m_bDoSearch
);
1498 m_rows
[row
].m_completed
++;
// Wake the next row once this row leads it by at least 2 CUs.
1500 if (m_rows
[row
].m_completed
>= 2 && row
< m_heightInCU
- 1)
1502 ScopedLock
below(m_rows
[row
+ 1].m_lock
);
1503 if (m_rows
[row
+ 1].m_active
== false &&
1504 m_rows
[row
+ 1].m_completed
+ 2 <= m_rows
[row
].m_completed
)
1506 m_rows
[row
+ 1].m_active
= true;
1507 enqueueRow(row
+ 1);
// Deactivate this row when the previous row is fewer than 2 CUs ahead.
1511 ScopedLock
self(m_rows
[row
].m_lock
);
1512 if (row
> 0 && (int32_t)m_rows
[row
].m_completed
< m_widthInCU
- 1 &&
1513 m_rows
[row
- 1].m_completed
< m_rows
[row
].m_completed
+ 2)
1515 m_rows
[row
].m_active
= false;
// The last processing row signals completion of the whole frame.
1520 if (row
== m_heightInCU
- 1)
1521 m_bFrameCompleted
= true;
1524 void EstimateRow::init()
1535 void EstimateRow::estimateCUCost(Lowres
**frames
, ReferencePlanes
*wfref0
, int cux
, int cuy
, int p0
, int p1
, int b
, bool bDoSearch
[2])
1537 Lowres
*fref1
= frames
[p1
];
1538 Lowres
*fenc
= frames
[b
];
1540 const int bBidir
= (b
< p1
);
1541 const int cuXY
= cux
+ cuy
* m_widthInCU
;
1542 const int cuSize
= X265_LOWRES_CU_SIZE
;
1543 const intptr_t pelOffset
= cuSize
* cux
+ cuSize
* cuy
* fenc
->lumaStride
;
1545 // should this CU's cost contribute to the frame cost?
1546 const bool bFrameScoreCU
= (cux
> 0 && cux
< m_widthInCU
- 1 &&
1547 cuy
> 0 && cuy
< m_heightInCU
- 1) || m_widthInCU
<= 2 || m_heightInCU
<= 2;
1549 m_me
.setSourcePU(fenc
->lowresPlane
[0], fenc
->lumaStride
, pelOffset
, cuSize
, cuSize
);
1551 /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
1552 int lowresPenalty
= 4;
1554 MV(*fenc_mvs
[2]) = { &fenc
->lowresMvs
[0][b
- p0
- 1][cuXY
],
1555 &fenc
->lowresMvs
[1][p1
- b
- 1][cuXY
] };
1556 int(*fenc_costs
[2]) = { &fenc
->lowresMvCosts
[0][b
- p0
- 1][cuXY
],
1557 &fenc
->lowresMvCosts
[1][p1
- b
- 1][cuXY
] };
1560 int bcost
= m_me
.COST_MAX
;
1563 // establish search bounds that don't cross extended frame boundaries
1564 mvmin
.x
= (int16_t)(-cux
* cuSize
- 8);
1565 mvmin
.y
= (int16_t)(-cuy
* cuSize
- 8);
1566 mvmax
.x
= (int16_t)((m_widthInCU
- cux
- 1) * cuSize
+ 8);
1567 mvmax
.y
= (int16_t)((m_heightInCU
- cuy
- 1) * cuSize
+ 8);
1571 for (int i
= 0; i
< 1 + bBidir
; i
++)
1575 /* Use previously calculated cost */
1576 COPY2_IF_LT(bcost
, *fenc_costs
[i
], listused
, i
+ 1);
1581 MV
*fenc_mv
= fenc_mvs
[i
];
1583 /* Reverse-order MV prediction. */
1586 #define MVC(mv) mvc[numc++] = mv;
1587 if (cux
< m_widthInCU
- 1)
1589 if (cuy
< m_heightInCU
- 1)
1591 MVC(fenc_mv
[m_widthInCU
]);
1593 MVC(fenc_mv
[m_widthInCU
- 1]);
1594 if (cux
< m_widthInCU
- 1)
1595 MVC(fenc_mv
[m_widthInCU
+ 1]);
1602 median_mv(mvp
, mvc
[0], mvc
[1], mvc
[2]);
1605 *fenc_costs
[i
] = m_me
.motionEstimate(i
? fref1
: wfref0
, mvmin
, mvmax
, mvp
, numc
, mvc
, m_merange
, *fenc_mvs
[i
]);
1606 COPY2_IF_LT(bcost
, *fenc_costs
[i
], listused
, i
+ 1);
1610 ALIGN_VAR_32(pixel
, subpelbuf0
[X265_LOWRES_CU_SIZE
* X265_LOWRES_CU_SIZE
]);
1611 ALIGN_VAR_32(pixel
, subpelbuf1
[X265_LOWRES_CU_SIZE
* X265_LOWRES_CU_SIZE
]);
1612 intptr_t stride0
= X265_LOWRES_CU_SIZE
, stride1
= X265_LOWRES_CU_SIZE
;
1613 pixel
*src0
= wfref0
->lowresMC(pelOffset
, *fenc_mvs
[0], subpelbuf0
, stride0
);
1614 pixel
*src1
= fref1
->lowresMC(pelOffset
, *fenc_mvs
[1], subpelbuf1
, stride1
);
1616 ALIGN_VAR_32(pixel
, ref
[X265_LOWRES_CU_SIZE
* X265_LOWRES_CU_SIZE
]);
1617 primitives
.pixelavg_pp
[LUMA_8x8
](ref
, X265_LOWRES_CU_SIZE
, src0
, stride0
, src1
, stride1
, 32);
1618 int bicost
= primitives
.satd
[LUMA_8x8
](fenc
->lowresPlane
[0] + pelOffset
, fenc
->lumaStride
, ref
, X265_LOWRES_CU_SIZE
);
1619 COPY2_IF_LT(bcost
, bicost
, listused
, 3);
1621 // Try 0,0 candidates
1622 src0
= wfref0
->lowresPlane
[0] + pelOffset
;
1623 src1
= fref1
->lowresPlane
[0] + pelOffset
;
1624 primitives
.pixelavg_pp
[LUMA_8x8
](ref
, X265_LOWRES_CU_SIZE
, src0
, wfref0
->lumaStride
, src1
, fref1
->lumaStride
, 32);
1625 bicost
= primitives
.satd
[LUMA_8x8
](fenc
->lowresPlane
[0] + pelOffset
, fenc
->lumaStride
, ref
, X265_LOWRES_CU_SIZE
);
1626 COPY2_IF_LT(bcost
, bicost
, listused
, 3);
1629 if (!fenc
->bIntraCalculated
)
1631 const int sizeIdx
= X265_LOWRES_CU_BITS
- 2; // partition size
1633 pixel _above0
[X265_LOWRES_CU_SIZE
* 4 + 1], *const above0
= _above0
+ 2 * X265_LOWRES_CU_SIZE
;
1634 pixel _above1
[X265_LOWRES_CU_SIZE
* 4 + 1], *const above1
= _above1
+ 2 * X265_LOWRES_CU_SIZE
;
1635 pixel _left0
[X265_LOWRES_CU_SIZE
* 4 + 1], *const left0
= _left0
+ 2 * X265_LOWRES_CU_SIZE
;
1636 pixel _left1
[X265_LOWRES_CU_SIZE
* 4 + 1], *const left1
= _left1
+ 2 * X265_LOWRES_CU_SIZE
;
1638 pixel
*pix_cur
= fenc
->lowresPlane
[0] + pelOffset
;
1641 memcpy(above0
, pix_cur
- 1 - fenc
->lumaStride
, (cuSize
+ 1) * sizeof(pixel
));
1644 for (int i
= 0; i
< cuSize
+ 1; i
++)
1645 left0
[i
] = pix_cur
[-1 - fenc
->lumaStride
+ i
* fenc
->lumaStride
];
1647 for (int i
= 0; i
< cuSize
; i
++)
1649 above0
[cuSize
+ i
+ 1] = above0
[cuSize
];
1650 left0
[cuSize
+ i
+ 1] = left0
[cuSize
];
1653 // filtering with [1 2 1]
1654 // assume getUseStrongIntraSmoothing() is disabled
1655 above1
[0] = above0
[0];
1656 above1
[2 * cuSize
] = above0
[2 * cuSize
];
1657 left1
[0] = left0
[0];
1658 left1
[2 * cuSize
] = left0
[2 * cuSize
];
1659 for (int i
= 1; i
< 2 * cuSize
; i
++)
1661 above1
[i
] = (above0
[i
- 1] + 2 * above0
[i
] + above0
[i
+ 1] + 2) >> 2;
1662 left1
[i
] = (left0
[i
- 1] + 2 * left0
[i
] + left0
[i
+ 1] + 2) >> 2;
1665 int predsize
= cuSize
* cuSize
;
1667 // generate 35 intra predictions into m_predictions
1668 pixelcmp_t satd
= primitives
.satd
[partitionFromLog2Size(X265_LOWRES_CU_BITS
)];
1669 int icost
= m_me
.COST_MAX
;
1670 primitives
.intra_pred
[DC_IDX
][sizeIdx
](m_predictions
, cuSize
, left0
, above0
, 0, (cuSize
<= 16));
1671 int cost
= m_me
.bufSATD(m_predictions
, cuSize
);
1674 pixel
*above
= (cuSize
>= 8) ? above1
: above0
;
1675 pixel
*left
= (cuSize
>= 8) ? left1
: left0
;
1676 primitives
.intra_pred
[PLANAR_IDX
][sizeIdx
](m_predictions
, cuSize
, left
, above
, 0, 0);
1677 cost
= m_me
.bufSATD(m_predictions
, cuSize
);
1680 primitives
.intra_pred_allangs
[sizeIdx
](m_predictions
+ 2 * predsize
, above0
, left0
, above1
, left1
, (cuSize
<= 16));
1682 // calculate satd costs, keep least cost
1683 ALIGN_VAR_32(pixel
, buf_trans
[32 * 32]);
1684 primitives
.transpose
[sizeIdx
](buf_trans
, m_me
.fencPUYuv
.m_buf
[0], FENC_STRIDE
);
1686 int acost
= m_me
.COST_MAX
;
1687 uint32_t mode
, lowmode
= 4;
1688 for (mode
= 5; mode
< 35; mode
+= 5)
1691 cost
= satd(buf_trans
, cuSize
, &m_predictions
[mode
* predsize
], cuSize
);
1693 cost
= m_me
.bufSATD(&m_predictions
[mode
* predsize
], cuSize
);
1694 COPY2_IF_LT(acost
, cost
, lowmode
, mode
);
1696 for (uint32_t dist
= 2; dist
>= 1; dist
--)
1698 mode
= lowmode
- dist
;
1700 cost
= satd(buf_trans
, cuSize
, &m_predictions
[mode
* predsize
], cuSize
);
1702 cost
= m_me
.bufSATD(&m_predictions
[mode
* predsize
], cuSize
);
1703 COPY2_IF_LT(acost
, cost
, lowmode
, mode
);
1705 mode
= lowmode
+ dist
;
1707 cost
= satd(buf_trans
, cuSize
, &m_predictions
[mode
* predsize
], cuSize
);
1709 cost
= m_me
.bufSATD(&m_predictions
[mode
* predsize
], cuSize
);
1710 COPY2_IF_LT(acost
, cost
, lowmode
, mode
);
1715 const int intraPenalty
= 5 * m_lookAheadLambda
;
1716 icost
+= intraPenalty
+ lowresPenalty
; /* estimate intra signal cost */
1717 fenc
->intraCost
[cuXY
] = icost
;
1718 fenc
->intraMode
[cuXY
] = (uint8_t)lowmode
;
1719 int icostAq
= icost
;
1722 m_costIntra
+= icost
;
1723 if (fenc
->invQscaleFactor
)
1725 icostAq
= (icost
* fenc
->invQscaleFactor
[cuXY
] + 128) >> 8;
1726 m_costIntraAq
+= icostAq
;
1729 fenc
->rowSatds
[0][0][cuy
] += icostAq
;
1731 bcost
+= lowresPenalty
;
1734 if (fenc
->intraCost
[cuXY
] < bcost
)
1736 if (bFrameScoreCU
) m_intraMbs
++;
1737 bcost
= fenc
->intraCost
[cuXY
];
1742 /* For I frames these costs were accumulated earlier */
1745 int bcostAq
= bcost
;
1749 if (fenc
->invQscaleFactor
)
1751 bcostAq
= (bcost
* fenc
->invQscaleFactor
[cuXY
] + 128) >> 8;
1752 m_costEstAq
+= bcostAq
;
1755 fenc
->rowSatds
[b
- p0
][p1
- b
][cuy
] += bcostAq
;
1757 fenc
->lowresCosts
[b
- p0
][p1
- b
][cuXY
] = (uint16_t)(X265_MIN(bcost
, LOWRES_COST_MASK
) | (listused
<< LOWRES_COST_SHIFT
));