| 1 | /***************************************************************************** |
| 2 | * Copyright (C) 2013 x265 project |
| 3 | * |
| 4 | * Authors: Gopu Govindaswamy <gopu@multicorewareinc.com> |
| 5 | * Steve Borho <steve@borho.org> |
| 6 | * |
| 7 | * This program is free software; you can redistribute it and/or modify |
| 8 | * it under the terms of the GNU General Public License as published by |
| 9 | * the Free Software Foundation; either version 2 of the License, or |
| 10 | * (at your option) any later version. |
| 11 | * |
| 12 | * This program is distributed in the hope that it will be useful, |
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 15 | * GNU General Public License for more details. |
| 16 | * |
| 17 | * You should have received a copy of the GNU General Public License |
| 18 | * along with this program; if not, write to the Free Software |
| 19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
| 20 | * |
| 21 | * This program is also available under a commercial proprietary license. |
| 22 | * For more information, contact us at license @ x265.com. |
| 23 | *****************************************************************************/ |
| 24 | |
| 25 | #include "common.h" |
| 26 | #include "frame.h" |
| 27 | #include "framedata.h" |
| 28 | #include "picyuv.h" |
| 29 | #include "primitives.h" |
| 30 | #include "lowres.h" |
| 31 | #include "mv.h" |
| 32 | |
| 33 | #include "slicetype.h" |
| 34 | #include "motion.h" |
| 35 | #include "ratecontrol.h" |
| 36 | |
/* CU count derived from the enclosing object's m_widthInCU / m_heightInCU:
 * (width - 2) * (height - 2) when both dimensions exceed two CUs, otherwise
 * the full width * height product. */
#define NUM_CUS ((m_widthInCU <= 2 || m_heightInCU <= 2) ? m_widthInCU * m_heightInCU : (m_widthInCU - 2) * (m_heightInCU - 2))
| 38 | |
| 39 | using namespace x265; |
| 40 | |
/* Return the median (middle value) of three 16-bit integers.
 *
 * Implemented with plain comparisons rather than sign-mask arithmetic so the
 * result does not depend on the width of int, on how negative values are
 * right-shifted, or on wrap-around when narrowing back to int16_t. */
static inline int16_t median(int16_t a, int16_t b, int16_t c)
{
    const int16_t lo = a < b ? a : b;
    const int16_t hi = a < b ? b : a;

    /* median == min(hi, max(lo, c)) */
    const int16_t loOrC = lo > c ? lo : c;
    return hi < loOrC ? hi : loOrC;
}
| 51 | |
| 52 | static inline void median_mv(MV &dst, MV a, MV b, MV c) |
| 53 | { |
| 54 | dst.x = median(a.x, b.x, c.x); |
| 55 | dst.y = median(a.y, b.y, c.y); |
| 56 | } |
| 57 | |
| 58 | Lookahead::Lookahead(x265_param *param, ThreadPool* pool) |
| 59 | : JobProvider(pool) |
| 60 | , m_est(pool) |
| 61 | { |
| 62 | m_bReady = 0; |
| 63 | m_param = param; |
| 64 | m_lastKeyframe = -m_param->keyframeMax; |
| 65 | m_lastNonB = NULL; |
| 66 | m_bFilling = true; |
| 67 | m_bFlushed = false; |
| 68 | m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; |
| 69 | m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; |
| 70 | m_scratch = (int*)x265_malloc(m_widthInCU * sizeof(int)); |
| 71 | memset(m_histogram, 0, sizeof(m_histogram)); |
| 72 | } |
| 73 | |
| 74 | Lookahead::~Lookahead() { } |
| 75 | |
| 76 | void Lookahead::init() |
| 77 | { |
| 78 | if (m_pool && m_pool->getThreadCount() >= 4 && |
| 79 | ((m_param->bFrameAdaptive && m_param->bframes) || |
| 80 | m_param->rc.cuTree || m_param->scenecutThreshold || |
| 81 | (m_param->lookaheadDepth && m_param->rc.vbvBufferSize))) |
| 82 | m_pool = m_pool; /* allow use of worker thread */ |
| 83 | else |
| 84 | m_pool = NULL; /* disable use of worker thread */ |
| 85 | } |
| 86 | |
| 87 | void Lookahead::destroy() |
| 88 | { |
| 89 | if (m_pool) |
| 90 | // flush will dequeue, if it is necessary |
| 91 | JobProvider::flush(); |
| 92 | |
| 93 | // these two queues will be empty unless the encode was aborted |
| 94 | while (!m_inputQueue.empty()) |
| 95 | { |
| 96 | Frame* curFrame = m_inputQueue.popFront(); |
| 97 | curFrame->destroy(); |
| 98 | delete curFrame; |
| 99 | } |
| 100 | |
| 101 | while (!m_outputQueue.empty()) |
| 102 | { |
| 103 | Frame* curFrame = m_outputQueue.popFront(); |
| 104 | curFrame->destroy(); |
| 105 | delete curFrame; |
| 106 | } |
| 107 | |
| 108 | x265_free(m_scratch); |
| 109 | } |
| 110 | |
| 111 | /* Called by API thread */ |
| 112 | void Lookahead::addPicture(Frame *curFrame, int sliceType) |
| 113 | { |
| 114 | PicYuv *orig = curFrame->m_fencPic; |
| 115 | |
| 116 | curFrame->m_lowres.init(orig, curFrame->m_poc, sliceType); |
| 117 | |
| 118 | m_inputQueueLock.acquire(); |
| 119 | m_inputQueue.pushBack(*curFrame); |
| 120 | |
| 121 | if (m_inputQueue.size() >= m_param->lookaheadDepth) |
| 122 | { |
| 123 | /* when queue fills the first time, run slicetypeDecide synchronously, |
| 124 | * since the encoder will always be blocked here */ |
| 125 | if (m_pool && !m_bFilling) |
| 126 | { |
| 127 | m_inputQueueLock.release(); |
| 128 | m_bReady = 1; |
| 129 | m_pool->pokeIdleThread(); |
| 130 | } |
| 131 | else |
| 132 | slicetypeDecide(); |
| 133 | |
| 134 | if (m_bFilling && m_pool) |
| 135 | JobProvider::enqueue(); |
| 136 | m_bFilling = false; |
| 137 | } |
| 138 | else |
| 139 | m_inputQueueLock.release(); |
| 140 | } |
| 141 | |
| 142 | /* Called by API thread */ |
| 143 | void Lookahead::flush() |
| 144 | { |
| 145 | /* just in case the input queue is never allowed to fill */ |
| 146 | m_bFilling = false; |
| 147 | |
| 148 | /* flush synchronously */ |
| 149 | m_inputQueueLock.acquire(); |
| 150 | if (!m_inputQueue.empty()) |
| 151 | { |
| 152 | slicetypeDecide(); |
| 153 | } |
| 154 | else |
| 155 | m_inputQueueLock.release(); |
| 156 | |
| 157 | m_inputQueueLock.acquire(); |
| 158 | |
| 159 | /* bFlushed indicates that an empty output queue actually means all frames |
| 160 | * have been decided (no more inputs for the encoder) */ |
| 161 | if (m_inputQueue.empty()) |
| 162 | m_bFlushed = true; |
| 163 | m_inputQueueLock.release(); |
| 164 | } |
| 165 | |
| 166 | /* Called by API thread. If the lookahead queue has not yet been filled the |
| 167 | * first time, it immediately returns NULL. Else the function blocks until |
| 168 | * outputs are available and then pops the first frame from the output queue. If |
| 169 | * flush() has been called and the output queue is empty, NULL is returned. */ |
| 170 | Frame* Lookahead::getDecidedPicture() |
| 171 | { |
| 172 | m_outputQueueLock.acquire(); |
| 173 | |
| 174 | if (m_bFilling) |
| 175 | { |
| 176 | m_outputQueueLock.release(); |
| 177 | return NULL; |
| 178 | } |
| 179 | |
| 180 | while (m_outputQueue.empty() && !m_bFlushed) |
| 181 | { |
| 182 | m_outputQueueLock.release(); |
| 183 | m_outputAvailable.wait(); |
| 184 | m_outputQueueLock.acquire(); |
| 185 | } |
| 186 | |
| 187 | Frame *fenc = m_outputQueue.popFront(); |
| 188 | m_outputQueueLock.release(); |
| 189 | return fenc; |
| 190 | } |
| 191 | |
| 192 | /* Called by pool worker threads */ |
| 193 | bool Lookahead::findJob(int) |
| 194 | { |
| 195 | if (m_bReady > 0 && ATOMIC_DEC(&m_bReady) == 0) |
| 196 | { |
| 197 | m_inputQueueLock.acquire(); |
| 198 | slicetypeDecide(); |
| 199 | return true; |
| 200 | } |
| 201 | else |
| 202 | return false; |
| 203 | } |
| 204 | |
| 205 | /* Called by rate-control to calculate the estimated SATD cost for a given |
| 206 | * picture. It assumes dpb->prepareEncode() has already been called for the |
| 207 | * picture and all the references are established */ |
| 208 | void Lookahead::getEstimatedPictureCost(Frame *curFrame) |
| 209 | { |
| 210 | Lowres *frames[X265_LOOKAHEAD_MAX]; |
| 211 | |
| 212 | // POC distances to each reference |
| 213 | Slice *slice = curFrame->m_encData->m_slice; |
| 214 | int p0 = 0, p1, b; |
| 215 | int poc = slice->m_poc; |
| 216 | int l0poc = slice->m_refPOCList[0][0]; |
| 217 | int l1poc = slice->m_refPOCList[1][0]; |
| 218 | |
| 219 | switch (slice->m_sliceType) |
| 220 | { |
| 221 | case I_SLICE: |
| 222 | frames[p0] = &curFrame->m_lowres; |
| 223 | b = p1 = 0; |
| 224 | break; |
| 225 | |
| 226 | case P_SLICE: |
| 227 | b = p1 = poc - l0poc; |
| 228 | frames[p0] = &slice->m_refPicList[0][0]->m_lowres; |
| 229 | frames[b] = &curFrame->m_lowres; |
| 230 | break; |
| 231 | |
| 232 | case B_SLICE: |
| 233 | b = poc - l0poc; |
| 234 | p1 = b + l1poc - poc; |
| 235 | frames[p0] = &slice->m_refPicList[0][0]->m_lowres; |
| 236 | frames[b] = &curFrame->m_lowres; |
| 237 | frames[p1] = &slice->m_refPicList[1][0]->m_lowres; |
| 238 | break; |
| 239 | |
| 240 | default: |
| 241 | return; |
| 242 | } |
| 243 | |
| 244 | if (m_param->rc.cuTree && !m_param->rc.bStatRead) |
| 245 | /* update row satds based on cutree offsets */ |
| 246 | curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b); |
| 247 | else if (m_param->rc.aqMode) |
| 248 | curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b]; |
| 249 | else |
| 250 | curFrame->m_lowres.satdCost = curFrame->m_lowres.costEst[b - p0][p1 - b]; |
| 251 | |
| 252 | if (m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate) |
| 253 | { |
| 254 | /* aggregate lowres row satds to CTU resolution */ |
| 255 | curFrame->m_lowres.lowresCostForRc = curFrame->m_lowres.lowresCosts[b - p0][p1 - b]; |
| 256 | uint32_t lowresRow = 0, lowresCol = 0, lowresCuIdx = 0, sum = 0; |
| 257 | uint32_t scale = m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE); |
| 258 | uint32_t numCuInHeight = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize; |
| 259 | uint32_t widthInLowresCu = (uint32_t)m_widthInCU, heightInLowresCu = (uint32_t)m_heightInCU; |
| 260 | double *qp_offset = 0; |
| 261 | /* Factor in qpoffsets based on Aq/Cutree in CU costs */ |
| 262 | if (m_param->rc.aqMode) |
| 263 | qp_offset = (frames[b]->sliceType == X265_TYPE_B || !m_param->rc.cuTree) ? frames[b]->qpAqOffset : frames[b]->qpCuTreeOffset; |
| 264 | |
| 265 | for (uint32_t row = 0; row < numCuInHeight; row++) |
| 266 | { |
| 267 | lowresRow = row * scale; |
| 268 | for (uint32_t cnt = 0; cnt < scale && lowresRow < heightInLowresCu; lowresRow++, cnt++) |
| 269 | { |
| 270 | sum = 0; |
| 271 | lowresCuIdx = lowresRow * widthInLowresCu; |
| 272 | for (lowresCol = 0; lowresCol < widthInLowresCu; lowresCol++, lowresCuIdx++) |
| 273 | { |
| 274 | uint16_t lowresCuCost = curFrame->m_lowres.lowresCostForRc[lowresCuIdx] & LOWRES_COST_MASK; |
| 275 | if (qp_offset) |
| 276 | { |
| 277 | lowresCuCost = (uint16_t)((lowresCuCost * x265_exp2fix8(qp_offset[lowresCuIdx]) + 128) >> 8); |
| 278 | int32_t intraCuCost = curFrame->m_lowres.intraCost[lowresCuIdx]; |
| 279 | curFrame->m_lowres.intraCost[lowresCuIdx] = (intraCuCost * x265_exp2fix8(qp_offset[lowresCuIdx]) + 128) >> 8; |
| 280 | } |
| 281 | curFrame->m_lowres.lowresCostForRc[lowresCuIdx] = lowresCuCost; |
| 282 | sum += lowresCuCost; |
| 283 | } |
| 284 | curFrame->m_encData->m_rowStat[row].satdForVbv += sum; |
| 285 | } |
| 286 | } |
| 287 | } |
| 288 | } |
| 289 | |
| 290 | /* called by API thread or worker thread with inputQueueLock acquired */ |
| 291 | void Lookahead::slicetypeDecide() |
| 292 | { |
| 293 | ProfileScopeEvent(slicetypeDecideEV); |
| 294 | |
| 295 | ScopedLock lock(m_decideLock); |
| 296 | |
| 297 | Lowres *frames[X265_LOOKAHEAD_MAX]; |
| 298 | Frame *list[X265_LOOKAHEAD_MAX]; |
| 299 | int maxSearch = X265_MIN(m_param->lookaheadDepth, X265_LOOKAHEAD_MAX); |
| 300 | |
| 301 | memset(frames, 0, sizeof(frames)); |
| 302 | memset(list, 0, sizeof(list)); |
| 303 | { |
| 304 | Frame *curFrame = m_inputQueue.first(); |
| 305 | int j; |
| 306 | for (j = 0; j < m_param->bframes + 2; j++) |
| 307 | { |
| 308 | if (!curFrame) break; |
| 309 | list[j] = curFrame; |
| 310 | curFrame = curFrame->m_next; |
| 311 | } |
| 312 | |
| 313 | curFrame = m_inputQueue.first(); |
| 314 | frames[0] = m_lastNonB; |
| 315 | for (j = 0; j < maxSearch; j++) |
| 316 | { |
| 317 | if (!curFrame) break; |
| 318 | frames[j + 1] = &curFrame->m_lowres; |
| 319 | curFrame = curFrame->m_next; |
| 320 | } |
| 321 | |
| 322 | maxSearch = j; |
| 323 | } |
| 324 | |
| 325 | m_inputQueueLock.release(); |
| 326 | |
| 327 | if (!m_est.m_rows && list[0]) |
| 328 | m_est.init(m_param, list[0]); |
| 329 | |
| 330 | if (m_lastNonB && !m_param->rc.bStatRead && |
| 331 | ((m_param->bFrameAdaptive && m_param->bframes) || |
| 332 | m_param->rc.cuTree || m_param->scenecutThreshold || |
| 333 | (m_param->lookaheadDepth && m_param->rc.vbvBufferSize))) |
| 334 | { |
| 335 | slicetypeAnalyse(frames, false); |
| 336 | } |
| 337 | |
| 338 | int bframes, brefs; |
| 339 | for (bframes = 0, brefs = 0;; bframes++) |
| 340 | { |
| 341 | Lowres& frm = list[bframes]->m_lowres; |
| 342 | |
| 343 | if (frm.sliceType == X265_TYPE_BREF && !m_param->bBPyramid && brefs == m_param->bBPyramid) |
| 344 | { |
| 345 | frm.sliceType = X265_TYPE_B; |
| 346 | x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid\n", |
| 347 | frm.frameNum); |
| 348 | } |
| 349 | |
| 350 | /* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available. |
| 351 | smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it.*/ |
| 352 | else if (frm.sliceType == X265_TYPE_BREF && m_param->bBPyramid && brefs && |
| 353 | m_param->maxNumReferences <= (brefs + 3)) |
| 354 | { |
| 355 | frm.sliceType = X265_TYPE_B; |
| 356 | x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid and %d reference frames\n", |
| 357 | frm.sliceType, m_param->maxNumReferences); |
| 358 | } |
| 359 | |
| 360 | if ( /*(!param->intraRefresh || frm.frameNum == 0) && */ frm.frameNum - m_lastKeyframe >= m_param->keyframeMax) |
| 361 | { |
| 362 | if (frm.sliceType == X265_TYPE_AUTO || frm.sliceType == X265_TYPE_I) |
| 363 | frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR; |
| 364 | bool warn = frm.sliceType != X265_TYPE_IDR; |
| 365 | if (warn && m_param->bOpenGOP) |
| 366 | warn &= frm.sliceType != X265_TYPE_I; |
| 367 | if (warn) |
| 368 | { |
| 369 | x265_log(m_param, X265_LOG_WARNING, "specified frame type (%d) at %d is not compatible with keyframe interval\n", |
| 370 | frm.sliceType, frm.frameNum); |
| 371 | frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR; |
| 372 | } |
| 373 | } |
| 374 | if (frm.sliceType == X265_TYPE_I && frm.frameNum - m_lastKeyframe >= m_param->keyframeMin) |
| 375 | { |
| 376 | if (m_param->bOpenGOP) |
| 377 | { |
| 378 | m_lastKeyframe = frm.frameNum; |
| 379 | frm.bKeyframe = true; |
| 380 | } |
| 381 | else |
| 382 | frm.sliceType = X265_TYPE_IDR; |
| 383 | } |
| 384 | if (frm.sliceType == X265_TYPE_IDR) |
| 385 | { |
| 386 | /* Closed GOP */ |
| 387 | m_lastKeyframe = frm.frameNum; |
| 388 | frm.bKeyframe = true; |
| 389 | if (bframes > 0) |
| 390 | { |
| 391 | list[bframes - 1]->m_lowres.sliceType = X265_TYPE_P; |
| 392 | bframes--; |
| 393 | } |
| 394 | } |
| 395 | if (bframes == m_param->bframes || !list[bframes + 1]) |
| 396 | { |
| 397 | if (IS_X265_TYPE_B(frm.sliceType)) |
| 398 | x265_log(m_param, X265_LOG_WARNING, "specified frame type is not compatible with max B-frames\n"); |
| 399 | if (frm.sliceType == X265_TYPE_AUTO || IS_X265_TYPE_B(frm.sliceType)) |
| 400 | frm.sliceType = X265_TYPE_P; |
| 401 | } |
| 402 | if (frm.sliceType == X265_TYPE_BREF) |
| 403 | brefs++; |
| 404 | if (frm.sliceType == X265_TYPE_AUTO) |
| 405 | frm.sliceType = X265_TYPE_B; |
| 406 | else if (!IS_X265_TYPE_B(frm.sliceType)) |
| 407 | break; |
| 408 | } |
| 409 | |
| 410 | if (bframes) |
| 411 | list[bframes - 1]->m_lowres.bLastMiniGopBFrame = true; |
| 412 | list[bframes]->m_lowres.leadingBframes = bframes; |
| 413 | m_lastNonB = &list[bframes]->m_lowres; |
| 414 | m_histogram[bframes]++; |
| 415 | |
| 416 | /* insert a bref into the sequence */ |
| 417 | if (m_param->bBPyramid && bframes > 1 && !brefs) |
| 418 | { |
| 419 | list[bframes / 2]->m_lowres.sliceType = X265_TYPE_BREF; |
| 420 | brefs++; |
| 421 | } |
| 422 | /* calculate the frame costs ahead of time for estimateFrameCost while we still have lowres */ |
| 423 | if (m_param->rc.rateControlMode != X265_RC_CQP) |
| 424 | { |
| 425 | int p0, p1, b; |
| 426 | /* For zero latency tuning, calculate frame cost to be used later in RC */ |
| 427 | if (!maxSearch) |
| 428 | { |
| 429 | for (int i = 0; i <= bframes; i++) |
| 430 | frames[i + 1] = &list[i]->m_lowres; |
| 431 | } |
| 432 | |
| 433 | /* estimate new non-B cost */ |
| 434 | p1 = b = bframes + 1; |
| 435 | p0 = (IS_X265_TYPE_I(frames[bframes + 1]->sliceType)) ? b : 0; |
| 436 | m_est.estimateFrameCost(frames, p0, p1, b, 0); |
| 437 | |
| 438 | if (bframes) |
| 439 | { |
| 440 | p0 = 0; // last nonb |
| 441 | for (b = 1; b <= bframes; b++) |
| 442 | { |
| 443 | if (frames[b]->sliceType == X265_TYPE_B) |
| 444 | for (p1 = b; frames[p1]->sliceType == X265_TYPE_B; p1++) |
| 445 | ; // find new nonb or bref |
| 446 | else |
| 447 | p1 = bframes + 1; |
| 448 | |
| 449 | m_est.estimateFrameCost(frames, p0, p1, b, 0); |
| 450 | |
| 451 | if (frames[b]->sliceType == X265_TYPE_BREF) |
| 452 | p0 = b; |
| 453 | } |
| 454 | } |
| 455 | } |
| 456 | |
| 457 | m_inputQueueLock.acquire(); |
| 458 | |
| 459 | /* dequeue all frames from inputQueue that are about to be enqueued |
| 460 | * in the output queue. The order is important because Frame can |
| 461 | * only be in one list at a time */ |
| 462 | int64_t pts[X265_BFRAME_MAX + 1]; |
| 463 | for (int i = 0; i <= bframes; i++) |
| 464 | { |
| 465 | Frame *curFrame; |
| 466 | curFrame = m_inputQueue.popFront(); |
| 467 | pts[i] = curFrame->m_pts; |
| 468 | maxSearch--; |
| 469 | } |
| 470 | |
| 471 | m_inputQueueLock.release(); |
| 472 | |
| 473 | m_outputQueueLock.acquire(); |
| 474 | /* add non-B to output queue */ |
| 475 | int idx = 0; |
| 476 | list[bframes]->m_reorderedPts = pts[idx++]; |
| 477 | m_outputQueue.pushBack(*list[bframes]); |
| 478 | |
| 479 | /* Add B-ref frame next to P frame in output queue, the B-ref encode before non B-ref frame */ |
| 480 | if (bframes > 1 && m_param->bBPyramid) |
| 481 | { |
| 482 | for (int i = 0; i < bframes; i++) |
| 483 | { |
| 484 | if (list[i]->m_lowres.sliceType == X265_TYPE_BREF) |
| 485 | { |
| 486 | list[i]->m_reorderedPts = pts[idx++]; |
| 487 | m_outputQueue.pushBack(*list[i]); |
| 488 | } |
| 489 | } |
| 490 | } |
| 491 | |
| 492 | /* add B frames to output queue */ |
| 493 | for (int i = 0; i < bframes; i++) |
| 494 | { |
| 495 | /* push all the B frames into output queue except B-ref, which already pushed into output queue*/ |
| 496 | if (list[i]->m_lowres.sliceType != X265_TYPE_BREF) |
| 497 | { |
| 498 | list[i]->m_reorderedPts = pts[idx++]; |
| 499 | m_outputQueue.pushBack(*list[i]); |
| 500 | } |
| 501 | } |
| 502 | |
| 503 | bool isKeyFrameAnalyse = (m_param->rc.cuTree || (m_param->rc.vbvBufferSize && m_param->lookaheadDepth)) && !m_param->rc.bStatRead; |
| 504 | if (isKeyFrameAnalyse && IS_X265_TYPE_I(m_lastNonB->sliceType)) |
| 505 | { |
| 506 | m_inputQueueLock.acquire(); |
| 507 | Frame *curFrame = m_inputQueue.first(); |
| 508 | frames[0] = m_lastNonB; |
| 509 | int j; |
| 510 | for (j = 0; j < maxSearch; j++) |
| 511 | { |
| 512 | frames[j + 1] = &curFrame->m_lowres; |
| 513 | curFrame = curFrame->m_next; |
| 514 | } |
| 515 | |
| 516 | frames[j + 1] = NULL; |
| 517 | m_inputQueueLock.release(); |
| 518 | slicetypeAnalyse(frames, true); |
| 519 | } |
| 520 | |
| 521 | m_outputQueueLock.release(); |
| 522 | m_outputAvailable.trigger(); |
| 523 | } |
| 524 | |
| 525 | void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe) |
| 526 | { |
| 527 | int prevNonB = 0, curNonB = 1, idx = 0; |
| 528 | while (curNonB < numFrames && frames[curNonB]->sliceType == X265_TYPE_B) |
| 529 | curNonB++; |
| 530 | int nextNonB = keyframe ? prevNonB : curNonB; |
| 531 | int nextB = prevNonB + 1; |
| 532 | int nextBRef = 0; |
| 533 | int miniGopEnd = keyframe ? prevNonB : curNonB; |
| 534 | while (curNonB < numFrames + !keyframe) |
| 535 | { |
| 536 | /* P/I cost: This shouldn't include the cost of nextNonB */ |
| 537 | if (nextNonB != curNonB) |
| 538 | { |
| 539 | int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB; |
| 540 | frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, p0, curNonB, curNonB); |
| 541 | frames[nextNonB]->plannedType[idx] = frames[curNonB]->sliceType; |
| 542 | /* Save the nextNonB Cost in each B frame of the current miniGop */ |
| 543 | if (curNonB > miniGopEnd) |
| 544 | { |
| 545 | for (int j = nextB; j < miniGopEnd; j++) |
| 546 | { |
| 547 | frames[j]->plannedSatd[frames[j]->indB] = frames[nextNonB]->plannedSatd[idx]; |
| 548 | frames[j]->plannedType[frames[j]->indB++] = frames[nextNonB]->plannedType[idx]; |
| 549 | |
| 550 | } |
| 551 | } |
| 552 | idx++; |
| 553 | } |
| 554 | /* Handle the B-frames: coded order */ |
| 555 | if (m_param->bBPyramid && curNonB - prevNonB > 1) |
| 556 | nextBRef = (prevNonB + curNonB + 1) / 2; |
| 557 | |
| 558 | for (int i = prevNonB + 1; i < curNonB; i++, idx++) |
| 559 | { |
| 560 | int64_t satdCost = 0; int type = X265_TYPE_B; |
| 561 | if (nextBRef) |
| 562 | { |
| 563 | if (i == nextBRef) |
| 564 | { |
| 565 | satdCost = vbvFrameCost(frames, prevNonB, curNonB, nextBRef); |
| 566 | type = X265_TYPE_BREF; |
| 567 | } |
| 568 | else if (i < nextBRef) |
| 569 | satdCost = vbvFrameCost(frames, prevNonB, nextBRef, i); |
| 570 | else |
| 571 | satdCost = vbvFrameCost(frames, nextBRef, curNonB, i); |
| 572 | } |
| 573 | else |
| 574 | satdCost = vbvFrameCost(frames, prevNonB, nextNonB, i); |
| 575 | frames[nextNonB]->plannedSatd[idx] = satdCost; |
| 576 | frames[nextNonB]->plannedType[idx] = type; |
| 577 | /* Save the nextB Cost in each B frame of the current miniGop */ |
| 578 | |
| 579 | for (int j = nextB; j < miniGopEnd; j++) |
| 580 | { |
| 581 | if (nextBRef && i == nextBRef) |
| 582 | break; |
| 583 | if (j >= i && j !=nextBRef) |
| 584 | continue; |
| 585 | frames[j]->plannedSatd[frames[j]->indB] = satdCost; |
| 586 | frames[j]->plannedType[frames[j]->indB++] = X265_TYPE_B; |
| 587 | } |
| 588 | } |
| 589 | prevNonB = curNonB; |
| 590 | curNonB++; |
| 591 | while (curNonB <= numFrames && frames[curNonB]->sliceType == X265_TYPE_B) |
| 592 | curNonB++; |
| 593 | } |
| 594 | |
| 595 | frames[nextNonB]->plannedType[idx] = X265_TYPE_AUTO; |
| 596 | } |
| 597 | |
| 598 | int64_t Lookahead::vbvFrameCost(Lowres **frames, int p0, int p1, int b) |
| 599 | { |
| 600 | int64_t cost = m_est.estimateFrameCost(frames, p0, p1, b, 0); |
| 601 | |
| 602 | if (m_param->rc.aqMode) |
| 603 | { |
| 604 | if (m_param->rc.cuTree) |
| 605 | return frameCostRecalculate(frames, p0, p1, b); |
| 606 | else |
| 607 | return frames[b]->costEstAq[b - p0][p1 - b]; |
| 608 | } |
| 609 | return cost; |
| 610 | } |
| 611 | |
| 612 | void Lookahead::slicetypeAnalyse(Lowres **frames, bool bKeyframe) |
| 613 | { |
| 614 | int numFrames, origNumFrames, keyintLimit, framecnt; |
| 615 | int maxSearch = X265_MIN(m_param->lookaheadDepth, X265_LOOKAHEAD_MAX); |
| 616 | int cuCount = NUM_CUS; |
| 617 | int resetStart; |
| 618 | bool bIsVbvLookahead = m_param->rc.vbvBufferSize && m_param->lookaheadDepth; |
| 619 | |
| 620 | /* count undecided frames */ |
| 621 | for (framecnt = 0; framecnt < maxSearch; framecnt++) |
| 622 | { |
| 623 | Lowres *fenc = frames[framecnt + 1]; |
| 624 | if (!fenc || fenc->sliceType != X265_TYPE_AUTO) |
| 625 | break; |
| 626 | } |
| 627 | |
| 628 | if (!framecnt) |
| 629 | { |
| 630 | if (m_param->rc.cuTree) |
| 631 | cuTree(frames, 0, bKeyframe); |
| 632 | return; |
| 633 | } |
| 634 | |
| 635 | frames[framecnt + 1] = NULL; |
| 636 | |
| 637 | keyintLimit = m_param->keyframeMax - frames[0]->frameNum + m_lastKeyframe - 1; |
| 638 | origNumFrames = numFrames = X265_MIN(framecnt, keyintLimit); |
| 639 | |
| 640 | if (bIsVbvLookahead) |
| 641 | numFrames = framecnt; |
| 642 | else if (m_param->bOpenGOP && numFrames < framecnt) |
| 643 | numFrames++; |
| 644 | else if (numFrames == 0) |
| 645 | { |
| 646 | frames[1]->sliceType = X265_TYPE_I; |
| 647 | return; |
| 648 | } |
| 649 | |
| 650 | int numBFrames = 0; |
| 651 | int numAnalyzed = numFrames; |
| 652 | if (m_param->scenecutThreshold && scenecut(frames, 0, 1, true, origNumFrames, maxSearch)) |
| 653 | { |
| 654 | frames[1]->sliceType = X265_TYPE_I; |
| 655 | return; |
| 656 | } |
| 657 | |
| 658 | if (m_param->bframes) |
| 659 | { |
| 660 | if (m_param->bFrameAdaptive == X265_B_ADAPT_TRELLIS) |
| 661 | { |
| 662 | if (numFrames > 1) |
| 663 | { |
| 664 | char best_paths[X265_BFRAME_MAX + 1][X265_LOOKAHEAD_MAX + 1] = { "", "P" }; |
| 665 | int best_path_index = numFrames % (X265_BFRAME_MAX + 1); |
| 666 | |
| 667 | /* Perform the frametype analysis. */ |
| 668 | for (int j = 2; j <= numFrames; j++) |
| 669 | { |
| 670 | slicetypePath(frames, j, best_paths); |
| 671 | } |
| 672 | |
| 673 | numBFrames = (int)strspn(best_paths[best_path_index], "B"); |
| 674 | |
| 675 | /* Load the results of the analysis into the frame types. */ |
| 676 | for (int j = 1; j < numFrames; j++) |
| 677 | { |
| 678 | frames[j]->sliceType = best_paths[best_path_index][j - 1] == 'B' ? X265_TYPE_B : X265_TYPE_P; |
| 679 | } |
| 680 | } |
| 681 | frames[numFrames]->sliceType = X265_TYPE_P; |
| 682 | } |
| 683 | else if (m_param->bFrameAdaptive == X265_B_ADAPT_FAST) |
| 684 | { |
| 685 | int64_t cost1p0, cost2p0, cost1b1, cost2p1; |
| 686 | |
| 687 | for (int i = 0; i <= numFrames - 2; ) |
| 688 | { |
| 689 | cost2p1 = m_est.estimateFrameCost(frames, i + 0, i + 2, i + 2, 1); |
| 690 | if (frames[i + 2]->intraMbs[2] > cuCount / 2) |
| 691 | { |
| 692 | frames[i + 1]->sliceType = X265_TYPE_P; |
| 693 | frames[i + 2]->sliceType = X265_TYPE_P; |
| 694 | i += 2; |
| 695 | continue; |
| 696 | } |
| 697 | |
| 698 | cost1b1 = m_est.estimateFrameCost(frames, i + 0, i + 2, i + 1, 0); |
| 699 | cost1p0 = m_est.estimateFrameCost(frames, i + 0, i + 1, i + 1, 0); |
| 700 | cost2p0 = m_est.estimateFrameCost(frames, i + 1, i + 2, i + 2, 0); |
| 701 | |
| 702 | if (cost1p0 + cost2p0 < cost1b1 + cost2p1) |
| 703 | { |
| 704 | frames[i + 1]->sliceType = X265_TYPE_P; |
| 705 | i += 1; |
| 706 | continue; |
| 707 | } |
| 708 | |
| 709 | // arbitrary and untuned |
| 710 | #define INTER_THRESH 300 |
| 711 | #define P_SENS_BIAS (50 - m_param->bFrameBias) |
| 712 | frames[i + 1]->sliceType = X265_TYPE_B; |
| 713 | |
| 714 | int j; |
| 715 | for (j = i + 2; j <= X265_MIN(i + m_param->bframes, numFrames - 1); j++) |
| 716 | { |
| 717 | int64_t pthresh = X265_MAX(INTER_THRESH - P_SENS_BIAS * (j - i - 1), INTER_THRESH / 10); |
| 718 | int64_t pcost = m_est.estimateFrameCost(frames, i + 0, j + 1, j + 1, 1); |
| 719 | if (pcost > pthresh * cuCount || frames[j + 1]->intraMbs[j - i + 1] > cuCount / 3) |
| 720 | break; |
| 721 | frames[j]->sliceType = X265_TYPE_B; |
| 722 | } |
| 723 | |
| 724 | frames[j]->sliceType = X265_TYPE_P; |
| 725 | i = j; |
| 726 | } |
| 727 | frames[numFrames]->sliceType = X265_TYPE_P; |
| 728 | numBFrames = 0; |
| 729 | while (numBFrames < numFrames && frames[numBFrames + 1]->sliceType == X265_TYPE_B) |
| 730 | { |
| 731 | numBFrames++; |
| 732 | } |
| 733 | } |
| 734 | else |
| 735 | { |
| 736 | numBFrames = X265_MIN(numFrames - 1, m_param->bframes); |
| 737 | for (int j = 1; j < numFrames; j++) |
| 738 | { |
| 739 | frames[j]->sliceType = (j % (numBFrames + 1)) ? X265_TYPE_B : X265_TYPE_P; |
| 740 | } |
| 741 | |
| 742 | frames[numFrames]->sliceType = X265_TYPE_P; |
| 743 | } |
| 744 | /* Check scenecut on the first minigop. */ |
| 745 | for (int j = 1; j < numBFrames + 1; j++) |
| 746 | { |
| 747 | if (m_param->scenecutThreshold && scenecut(frames, j, j + 1, false, origNumFrames, maxSearch)) |
| 748 | { |
| 749 | frames[j]->sliceType = X265_TYPE_P; |
| 750 | numAnalyzed = j; |
| 751 | break; |
| 752 | } |
| 753 | } |
| 754 | |
| 755 | resetStart = bKeyframe ? 1 : X265_MIN(numBFrames + 2, numAnalyzed + 1); |
| 756 | } |
| 757 | else |
| 758 | { |
| 759 | for (int j = 1; j <= numFrames; j++) |
| 760 | { |
| 761 | frames[j]->sliceType = X265_TYPE_P; |
| 762 | } |
| 763 | |
| 764 | resetStart = bKeyframe ? 1 : 2; |
| 765 | } |
| 766 | |
| 767 | if (m_param->rc.cuTree) |
| 768 | cuTree(frames, X265_MIN(numFrames, m_param->keyframeMax), bKeyframe); |
| 769 | |
| 770 | // if (!param->bIntraRefresh) |
| 771 | for (int j = keyintLimit + 1; j <= numFrames; j += m_param->keyframeMax) |
| 772 | { |
| 773 | frames[j]->sliceType = X265_TYPE_I; |
| 774 | resetStart = X265_MIN(resetStart, j + 1); |
| 775 | } |
| 776 | |
| 777 | if (bIsVbvLookahead) |
| 778 | vbvLookahead(frames, numFrames, bKeyframe); |
| 779 | |
| 780 | /* Restore frametypes for all frames that haven't actually been decided yet. */ |
| 781 | for (int j = resetStart; j <= numFrames; j++) |
| 782 | { |
| 783 | frames[j]->sliceType = X265_TYPE_AUTO; |
| 784 | } |
| 785 | } |
| 786 | |
| 787 | bool Lookahead::scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames, int maxSearch) |
| 788 | { |
| 789 | /* Only do analysis during a normal scenecut check. */ |
| 790 | if (bRealScenecut && m_param->bframes) |
| 791 | { |
| 792 | int origmaxp1 = p0 + 1; |
| 793 | /* Look ahead to avoid coding short flashes as scenecuts. */ |
| 794 | if (m_param->bFrameAdaptive == X265_B_ADAPT_TRELLIS) |
| 795 | /* Don't analyse any more frames than the trellis would have covered. */ |
| 796 | origmaxp1 += m_param->bframes; |
| 797 | else |
| 798 | origmaxp1++; |
| 799 | int maxp1 = X265_MIN(origmaxp1, numFrames); |
| 800 | |
| 801 | /* Where A and B are scenes: AAAAAABBBAAAAAA |
| 802 | * If BBB is shorter than (maxp1-p0), it is detected as a flash |
| 803 | * and not considered a scenecut. */ |
| 804 | for (int cp1 = p1; cp1 <= maxp1; cp1++) |
| 805 | { |
| 806 | if (!scenecutInternal(frames, p0, cp1, false)) |
| 807 | /* Any frame in between p0 and cur_p1 cannot be a real scenecut. */ |
| 808 | for (int i = cp1; i > p0; i--) |
| 809 | { |
| 810 | frames[i]->bScenecut = false; |
| 811 | } |
| 812 | } |
| 813 | |
| 814 | /* Where A-F are scenes: AAAAABBCCDDEEFFFFFF |
| 815 | * If each of BB ... EE are shorter than (maxp1-p0), they are |
| 816 | * detected as flashes and not considered scenecuts. |
| 817 | * Instead, the first F frame becomes a scenecut. |
| 818 | * If the video ends before F, no frame becomes a scenecut. */ |
| 819 | for (int cp0 = p0; cp0 <= maxp1; cp0++) |
| 820 | { |
| 821 | if (origmaxp1 > maxSearch || (cp0 < maxp1 && scenecutInternal(frames, cp0, maxp1, false))) |
| 822 | /* If cur_p0 is the p0 of a scenecut, it cannot be the p1 of a scenecut. */ |
| 823 | frames[cp0]->bScenecut = false; |
| 824 | } |
| 825 | } |
| 826 | |
| 827 | /* Ignore frames that are part of a flash, i.e. cannot be real scenecuts. */ |
| 828 | if (!frames[p1]->bScenecut) |
| 829 | return false; |
| 830 | return scenecutInternal(frames, p0, p1, bRealScenecut); |
| 831 | } |
| 832 | |
| 833 | bool Lookahead::scenecutInternal(Lowres **frames, int p0, int p1, bool bRealScenecut) |
| 834 | { |
| 835 | Lowres *frame = frames[p1]; |
| 836 | |
| 837 | m_est.estimateFrameCost(frames, p0, p1, p1, 0); |
| 838 | |
| 839 | int64_t icost = frame->costEst[0][0]; |
| 840 | int64_t pcost = frame->costEst[p1 - p0][0]; |
| 841 | int gopSize = frame->frameNum - m_lastKeyframe; |
| 842 | float threshMax = (float)(m_param->scenecutThreshold / 100.0); |
| 843 | |
| 844 | /* magic numbers pulled out of thin air */ |
| 845 | float threshMin = (float)(threshMax * 0.25); |
| 846 | float bias; |
| 847 | |
| 848 | if (m_param->keyframeMin == m_param->keyframeMax) |
| 849 | threshMin = threshMax; |
| 850 | if (gopSize <= m_param->keyframeMin / 4) |
| 851 | bias = threshMin / 4; |
| 852 | else if (gopSize <= m_param->keyframeMin) |
| 853 | bias = threshMin * gopSize / m_param->keyframeMin; |
| 854 | else |
| 855 | { |
| 856 | bias = threshMin |
| 857 | + (threshMax - threshMin) |
| 858 | * (gopSize - m_param->keyframeMin) |
| 859 | / (m_param->keyframeMax - m_param->keyframeMin); |
| 860 | } |
| 861 | |
| 862 | bool res = pcost >= (1.0 - bias) * icost; |
| 863 | if (res && bRealScenecut) |
| 864 | { |
| 865 | int imb = frame->intraMbs[p1 - p0]; |
| 866 | int pmb = NUM_CUS - imb; |
| 867 | x265_log(m_param, X265_LOG_DEBUG, "scene cut at %d Icost:%d Pcost:%d ratio:%.4f bias:%.4f gop:%d (imb:%d pmb:%d)\n", |
| 868 | frame->frameNum, icost, pcost, 1. - (double)pcost / icost, bias, gopSize, imb, pmb); |
| 869 | } |
| 870 | return res; |
| 871 | } |
| 872 | |
| 873 | void Lookahead::slicetypePath(Lowres **frames, int length, char(*best_paths)[X265_LOOKAHEAD_MAX + 1]) |
| 874 | { |
| 875 | char paths[2][X265_LOOKAHEAD_MAX + 1]; |
| 876 | int num_paths = X265_MIN(m_param->bframes + 1, length); |
| 877 | int64_t best_cost = 1LL << 62; |
| 878 | int idx = 0; |
| 879 | |
| 880 | /* Iterate over all currently possible paths */ |
| 881 | for (int path = 0; path < num_paths; path++) |
| 882 | { |
| 883 | /* Add suffixes to the current path */ |
| 884 | int len = length - (path + 1); |
| 885 | memcpy(paths[idx], best_paths[len % (X265_BFRAME_MAX + 1)], len); |
| 886 | memset(paths[idx] + len, 'B', path); |
| 887 | strcpy(paths[idx] + len + path, "P"); |
| 888 | |
| 889 | /* Calculate the actual cost of the current path */ |
| 890 | int64_t cost = slicetypePathCost(frames, paths[idx], best_cost); |
| 891 | if (cost < best_cost) |
| 892 | { |
| 893 | best_cost = cost; |
| 894 | idx ^= 1; |
| 895 | } |
| 896 | } |
| 897 | |
| 898 | /* Store the best path. */ |
| 899 | memcpy(best_paths[length % (X265_BFRAME_MAX + 1)], paths[idx ^ 1], length); |
| 900 | } |
| 901 | |
| 902 | int64_t Lookahead::slicetypePathCost(Lowres **frames, char *path, int64_t threshold) |
| 903 | { |
| 904 | int64_t cost = 0; |
| 905 | int loc = 1; |
| 906 | int cur_p = 0; |
| 907 | |
| 908 | path--; /* Since the 1st path element is really the second frame */ |
| 909 | while (path[loc]) |
| 910 | { |
| 911 | int next_p = loc; |
| 912 | /* Find the location of the next P-frame. */ |
| 913 | while (path[next_p] != 'P') |
| 914 | { |
| 915 | next_p++; |
| 916 | } |
| 917 | |
| 918 | /* Add the cost of the P-frame found above */ |
| 919 | cost += m_est.estimateFrameCost(frames, cur_p, next_p, next_p, 0); |
| 920 | /* Early terminate if the cost we have found is larger than the best path cost so far */ |
| 921 | if (cost > threshold) |
| 922 | break; |
| 923 | |
| 924 | if (m_param->bBPyramid && next_p - cur_p > 2) |
| 925 | { |
| 926 | int middle = cur_p + (next_p - cur_p) / 2; |
| 927 | cost += m_est.estimateFrameCost(frames, cur_p, next_p, middle, 0); |
| 928 | for (int next_b = loc; next_b < middle && cost < threshold; next_b++) |
| 929 | { |
| 930 | cost += m_est.estimateFrameCost(frames, cur_p, middle, next_b, 0); |
| 931 | } |
| 932 | |
| 933 | for (int next_b = middle + 1; next_b < next_p && cost < threshold; next_b++) |
| 934 | { |
| 935 | cost += m_est.estimateFrameCost(frames, middle, next_p, next_b, 0); |
| 936 | } |
| 937 | } |
| 938 | else |
| 939 | { |
| 940 | for (int next_b = loc; next_b < next_p && cost < threshold; next_b++) |
| 941 | { |
| 942 | cost += m_est.estimateFrameCost(frames, cur_p, next_p, next_b, 0); |
| 943 | } |
| 944 | } |
| 945 | |
| 946 | loc = next_p + 1; |
| 947 | cur_p = next_p; |
| 948 | } |
| 949 | |
| 950 | return cost; |
| 951 | } |
| 952 | |
/* Run the CU-tree pass over frames[0..numframes].
 *
 * Starting at the last non-B frame, the frame list is walked backwards one mini-GOP at a
 * time.  For each frame the cost is (re)estimated and estimateCUPropagate() adds its
 * propagated cost to the propagateCost arrays of its references.  When the walk is done,
 * cuTreeFinish() is called on the earliest non-B frame reached.
 *
 * frames    - lookahead frame list
 * numframes - index of the last frame to consider
 * bIntra    - when true, the intra cost of frames[0] is estimated and the walk is allowed
 *             to reach index 0; otherwise the walk stops at index 1 */
void Lookahead::cuTree(Lowres **frames, int numframes, bool bIntra)
{
    /* Lowest frame index the backward walk may visit */
    int idx = !bIntra;
    int lastnonb, curnonb = 1;
    int bframes = 0;

    x265_emms();
    /* Every frame contributes the same duration (fpsDenom / fpsNum) to the total */
    double totalDuration = 0.0;
    for (int j = 0; j <= numframes; j++)
        totalDuration += (double)m_param->fpsDenom / m_param->fpsNum;

    double averageDuration = totalDuration / (numframes + 1);

    int i = numframes;
    int cuCount = m_widthInCU * m_heightInCU;

    if (bIntra)
        m_est.estimateFrameCost(frames, 0, 0, 0, 0);

    /* Skip trailing B frames to find the last non-B frame */
    while (i > 0 && frames[i]->sliceType == X265_TYPE_B)
        i--;

    lastnonb = i;

    /* Lookaheadless MB-tree is not a theoretically distinct case; the same extrapolation could
     * be applied to the end of a lookahead buffer of any size. However, it's most needed when
     * lookahead=0, so that's what's currently implemented. */
    if (!m_param->lookaheadDepth)
    {
        if (bIntra)
        {
            /* Nothing to propagate from: the CU-tree offsets are just the AQ offsets */
            memset(frames[0]->propagateCost, 0, cuCount * sizeof(uint16_t));
            memcpy(frames[0]->qpCuTreeOffset, frames[0]->qpAqOffset, cuCount * sizeof(double));
            return;
        }
        /* Hand the propagateCost buffer of frames[0] to frames[lastnonb] and start
         * frames[0] from zero; the buffers are swapped back after the walk below. */
        std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost);
        memset(frames[0]->propagateCost, 0, cuCount * sizeof(uint16_t));
    }
    else
    {
        if (lastnonb < idx)
            return;
        memset(frames[lastnonb]->propagateCost, 0, cuCount * sizeof(uint16_t));
    }

    /* Walk backwards; i is shared with the inner loops, which consume the B frames */
    while (i-- > idx)
    {
        /* Find the previous non-B frame */
        curnonb = i;
        while (frames[curnonb]->sliceType == X265_TYPE_B && curnonb > 0)
            curnonb--;

        if (curnonb < idx)
            break;

        m_est.estimateFrameCost(frames, curnonb, lastnonb, lastnonb, 0);
        memset(frames[curnonb]->propagateCost, 0, cuCount * sizeof(uint16_t));
        /* Number of B frames between the two non-B frames */
        bframes = lastnonb - curnonb - 1;
        if (m_param->bBPyramid && bframes > 1)
        {
            /* B-pyramid: the middle B frame is itself referenced by the other B frames */
            int middle = (bframes + 1) / 2 + curnonb;
            m_est.estimateFrameCost(frames, curnonb, lastnonb, middle, 0);
            memset(frames[middle]->propagateCost, 0, cuCount * sizeof(uint16_t));
            while (i > curnonb)
            {
                /* B frames before middle use (curnonb, middle); those after use (middle, lastnonb) */
                int p0 = i > middle ? middle : curnonb;
                int p1 = i < middle ? middle : lastnonb;
                if (i != middle)
                {
                    m_est.estimateFrameCost(frames, p0, p1, i, 0);
                    estimateCUPropagate(frames, averageDuration, p0, p1, i, 0);
                }
                i--;
            }

            /* The middle frame is propagated last, as a referenced frame */
            estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, middle, 1);
        }
        else
        {
            /* Unreferenced B frames between curnonb and lastnonb */
            while (i > curnonb)
            {
                m_est.estimateFrameCost(frames, curnonb, lastnonb, i, 0);
                estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, i, 0);
                i--;
            }
        }
        /* Finally propagate the closing non-B frame into curnonb */
        estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, lastnonb, 1);
        lastnonb = curnonb;
    }

    if (!m_param->lookaheadDepth)
    {
        m_est.estimateFrameCost(frames, 0, lastnonb, lastnonb, 0);
        estimateCUPropagate(frames, averageDuration, 0, lastnonb, lastnonb, 1);
        /* Undo the buffer swap performed before the walk */
        std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost);
    }

    cuTreeFinish(frames[lastnonb], averageDuration, lastnonb);
    /* Without VBV, the middle frame of the last processed B-pyramid is finished here;
     * with VBV and lookahead, estimateCUPropagate() finishes referenced frames itself. */
    if (m_param->bBPyramid && bframes > 1 && !m_param->rc.vbvBufferSize)
        cuTreeFinish(frames[lastnonb + (bframes + 1) / 2], averageDuration, 0);
}
| 1053 | |
| 1054 | void Lookahead::estimateCUPropagate(Lowres **frames, double averageDuration, int p0, int p1, int b, int referenced) |
| 1055 | { |
| 1056 | uint16_t *refCosts[2] = { frames[p0]->propagateCost, frames[p1]->propagateCost }; |
| 1057 | int32_t distScaleFactor = (((b - p0) << 8) + ((p1 - p0) >> 1)) / (p1 - p0); |
| 1058 | int32_t bipredWeight = m_param->bEnableWeightedBiPred ? 64 - (distScaleFactor >> 2) : 32; |
| 1059 | MV *mvs[2] = { frames[b]->lowresMvs[0][b - p0 - 1], frames[b]->lowresMvs[1][p1 - b - 1] }; |
| 1060 | int32_t bipredWeights[2] = { bipredWeight, 64 - bipredWeight }; |
| 1061 | |
| 1062 | memset(m_scratch, 0, m_widthInCU * sizeof(int)); |
| 1063 | |
| 1064 | uint16_t *propagateCost = frames[b]->propagateCost; |
| 1065 | |
| 1066 | x265_emms(); |
| 1067 | double fpsFactor = CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) / CLIP_DURATION(averageDuration); |
| 1068 | |
| 1069 | /* For non-refferd frames the source costs are always zero, so just memset one row and re-use it. */ |
| 1070 | if (!referenced) |
| 1071 | memset(frames[b]->propagateCost, 0, m_widthInCU * sizeof(uint16_t)); |
| 1072 | |
| 1073 | int32_t StrideInCU = m_widthInCU; |
| 1074 | for (uint16_t blocky = 0; blocky < m_heightInCU; blocky++) |
| 1075 | { |
| 1076 | int cuIndex = blocky * StrideInCU; |
| 1077 | primitives.propagateCost(m_scratch, propagateCost, |
| 1078 | frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex, |
| 1079 | frames[b]->invQscaleFactor + cuIndex, &fpsFactor, m_widthInCU); |
| 1080 | |
| 1081 | if (referenced) |
| 1082 | propagateCost += m_widthInCU; |
| 1083 | for (uint16_t blockx = 0; blockx < m_widthInCU; blockx++, cuIndex++) |
| 1084 | { |
| 1085 | int32_t propagate_amount = m_scratch[blockx]; |
| 1086 | /* Don't propagate for an intra block. */ |
| 1087 | if (propagate_amount > 0) |
| 1088 | { |
| 1089 | /* Access width-2 bitfield. */ |
| 1090 | int32_t lists_used = frames[b]->lowresCosts[b - p0][p1 - b][cuIndex] >> LOWRES_COST_SHIFT; |
| 1091 | /* Follow the MVs to the previous frame(s). */ |
| 1092 | for (uint16_t list = 0; list < 2; list++) |
| 1093 | { |
| 1094 | if ((lists_used >> list) & 1) |
| 1095 | { |
| 1096 | #define CLIP_ADD(s, x) (s) = (uint16_t)X265_MIN((s) + (x), (1 << 16) - 1) |
| 1097 | int32_t listamount = propagate_amount; |
| 1098 | /* Apply bipred weighting. */ |
| 1099 | if (lists_used == 3) |
| 1100 | listamount = (listamount * bipredWeights[list] + 32) >> 6; |
| 1101 | |
| 1102 | /* Early termination for simple case of mv0. */ |
| 1103 | if (!mvs[list][cuIndex].word) |
| 1104 | { |
| 1105 | CLIP_ADD(refCosts[list][cuIndex], listamount); |
| 1106 | continue; |
| 1107 | } |
| 1108 | |
| 1109 | int32_t x = mvs[list][cuIndex].x; |
| 1110 | int32_t y = mvs[list][cuIndex].y; |
| 1111 | int32_t cux = (x >> 5) + blockx; |
| 1112 | int32_t cuy = (y >> 5) + blocky; |
| 1113 | int32_t idx0 = cux + cuy * StrideInCU; |
| 1114 | int32_t idx1 = idx0 + 1; |
| 1115 | int32_t idx2 = idx0 + StrideInCU; |
| 1116 | int32_t idx3 = idx0 + StrideInCU + 1; |
| 1117 | x &= 31; |
| 1118 | y &= 31; |
| 1119 | int32_t idx0weight = (32 - y) * (32 - x); |
| 1120 | int32_t idx1weight = (32 - y) * x; |
| 1121 | int32_t idx2weight = y * (32 - x); |
| 1122 | int32_t idx3weight = y * x; |
| 1123 | |
| 1124 | /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't |
| 1125 | * be counted. */ |
| 1126 | if (cux < m_widthInCU - 1 && cuy < m_heightInCU - 1 && cux >= 0 && cuy >= 0) |
| 1127 | { |
| 1128 | CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10); |
| 1129 | CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10); |
| 1130 | CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10); |
| 1131 | CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10); |
| 1132 | } |
| 1133 | else /* Check offsets individually */ |
| 1134 | { |
| 1135 | if (cux < m_widthInCU && cuy < m_heightInCU && cux >= 0 && cuy >= 0) |
| 1136 | CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10); |
| 1137 | if (cux + 1 < m_widthInCU && cuy < m_heightInCU && cux + 1 >= 0 && cuy >= 0) |
| 1138 | CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10); |
| 1139 | if (cux < m_widthInCU && cuy + 1 < m_heightInCU && cux >= 0 && cuy + 1 >= 0) |
| 1140 | CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10); |
| 1141 | if (cux + 1 < m_widthInCU && cuy + 1 < m_heightInCU && cux + 1 >= 0 && cuy + 1 >= 0) |
| 1142 | CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10); |
| 1143 | } |
| 1144 | } |
| 1145 | } |
| 1146 | } |
| 1147 | } |
| 1148 | } |
| 1149 | |
| 1150 | if (m_param->rc.vbvBufferSize && m_param->lookaheadDepth && referenced) |
| 1151 | cuTreeFinish(frames[b], averageDuration, b == p1 ? b - p0 : 0); |
| 1152 | } |
| 1153 | |
| 1154 | void Lookahead::cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance) |
| 1155 | { |
| 1156 | int fpsFactor = (int)(CLIP_DURATION(averageDuration) / CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) * 256); |
| 1157 | double weightdelta = 0.0; |
| 1158 | |
| 1159 | if (ref0Distance && frame->weightedCostDelta[ref0Distance - 1] > 0) |
| 1160 | weightdelta = (1.0 - frame->weightedCostDelta[ref0Distance - 1]); |
| 1161 | |
| 1162 | /* Allow the strength to be adjusted via qcompress, since the two |
| 1163 | * concepts are very similar. */ |
| 1164 | |
| 1165 | int cuCount = m_widthInCU * m_heightInCU; |
| 1166 | double strength = 5.0 * (1.0 - m_param->rc.qCompress); |
| 1167 | |
| 1168 | for (int cuIndex = 0; cuIndex < cuCount; cuIndex++) |
| 1169 | { |
| 1170 | int intracost = (frame->intraCost[cuIndex] * frame->invQscaleFactor[cuIndex] + 128) >> 8; |
| 1171 | if (intracost) |
| 1172 | { |
| 1173 | int propagateCost = (frame->propagateCost[cuIndex] * fpsFactor + 128) >> 8; |
| 1174 | double log2_ratio = X265_LOG2(intracost + propagateCost) - X265_LOG2(intracost) + weightdelta; |
| 1175 | frame->qpCuTreeOffset[cuIndex] = frame->qpAqOffset[cuIndex] - strength * log2_ratio; |
| 1176 | } |
| 1177 | } |
| 1178 | } |
| 1179 | |
| 1180 | /* If MB-tree changes the quantizers, we need to recalculate the frame cost without |
| 1181 | * re-running lookahead. */ |
| 1182 | int64_t Lookahead::frameCostRecalculate(Lowres** frames, int p0, int p1, int b) |
| 1183 | { |
| 1184 | int64_t score = 0; |
| 1185 | int *rowSatd = frames[b]->rowSatds[b - p0][p1 - b]; |
| 1186 | double *qp_offset = (frames[b]->sliceType == X265_TYPE_B) ? frames[b]->qpAqOffset : frames[b]->qpCuTreeOffset; |
| 1187 | |
| 1188 | x265_emms(); |
| 1189 | for (int cuy = m_heightInCU - 1; cuy >= 0; cuy--) |
| 1190 | { |
| 1191 | rowSatd[cuy] = 0; |
| 1192 | for (int cux = m_widthInCU - 1; cux >= 0; cux--) |
| 1193 | { |
| 1194 | int cuxy = cux + cuy * m_widthInCU; |
| 1195 | int cuCost = frames[b]->lowresCosts[b - p0][p1 - b][cuxy] & LOWRES_COST_MASK; |
| 1196 | double qp_adj = qp_offset[cuxy]; |
| 1197 | cuCost = (cuCost * x265_exp2fix8(qp_adj) + 128) >> 8; |
| 1198 | rowSatd[cuy] += cuCost; |
| 1199 | if ((cuy > 0 && cuy < m_heightInCU - 1 && |
| 1200 | cux > 0 && cux < m_widthInCU - 1) || |
| 1201 | m_widthInCU <= 2 || m_heightInCU <= 2) |
| 1202 | { |
| 1203 | score += cuCost; |
| 1204 | } |
| 1205 | } |
| 1206 | } |
| 1207 | |
| 1208 | return score; |
| 1209 | } |
| 1210 | |
| 1211 | CostEstimate::CostEstimate(ThreadPool *p) |
| 1212 | : WaveFront(p) |
| 1213 | { |
| 1214 | m_param = NULL; |
| 1215 | m_curframes = NULL; |
| 1216 | m_wbuffer[0] = m_wbuffer[1] = m_wbuffer[2] = m_wbuffer[3] = 0; |
| 1217 | m_rows = NULL; |
| 1218 | m_paddedLines = m_widthInCU = m_heightInCU = 0; |
| 1219 | m_bDoSearch[0] = m_bDoSearch[1] = false; |
| 1220 | m_curb = m_curp0 = m_curp1 = 0; |
| 1221 | m_bFrameCompleted = false; |
| 1222 | } |
| 1223 | |
| 1224 | CostEstimate::~CostEstimate() |
| 1225 | { |
| 1226 | for (int i = 0; i < 4; i++) |
| 1227 | { |
| 1228 | x265_free(m_wbuffer[i]); |
| 1229 | } |
| 1230 | |
| 1231 | delete[] m_rows; |
| 1232 | } |
| 1233 | |
| 1234 | void CostEstimate::init(x265_param *_param, Frame *curFrame) |
| 1235 | { |
| 1236 | m_param = _param; |
| 1237 | m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; |
| 1238 | m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; |
| 1239 | |
| 1240 | m_rows = new EstimateRow[m_heightInCU]; |
| 1241 | for (int i = 0; i < m_heightInCU; i++) |
| 1242 | { |
| 1243 | m_rows[i].m_widthInCU = m_widthInCU; |
| 1244 | m_rows[i].m_heightInCU = m_heightInCU; |
| 1245 | m_rows[i].m_param = m_param; |
| 1246 | } |
| 1247 | |
| 1248 | if (WaveFront::init(m_heightInCU)) |
| 1249 | WaveFront::enableAllRows(); |
| 1250 | else |
| 1251 | m_pool = NULL; |
| 1252 | |
| 1253 | if (m_param->bEnableWeightedPred) |
| 1254 | { |
| 1255 | PicYuv *orig = curFrame->m_fencPic; |
| 1256 | m_paddedLines = curFrame->m_lowres.lines + 2 * orig->m_lumaMarginY; |
| 1257 | intptr_t padoffset = curFrame->m_lowres.lumaStride * orig->m_lumaMarginY + orig->m_lumaMarginX; |
| 1258 | |
| 1259 | /* allocate weighted lowres buffers */ |
| 1260 | for (int i = 0; i < 4; i++) |
| 1261 | { |
| 1262 | m_wbuffer[i] = (pixel*)x265_malloc(sizeof(pixel) * (curFrame->m_lowres.lumaStride * m_paddedLines)); |
| 1263 | m_weightedRef.lowresPlane[i] = m_wbuffer[i] + padoffset; |
| 1264 | } |
| 1265 | |
| 1266 | m_weightedRef.fpelPlane[0] = m_weightedRef.lowresPlane[0]; |
| 1267 | m_weightedRef.lumaStride = curFrame->m_lowres.lumaStride; |
| 1268 | m_weightedRef.isLowres = true; |
| 1269 | m_weightedRef.isWeighted = false; |
| 1270 | } |
| 1271 | } |
| 1272 | |
/* Estimate (or fetch the cached) lowres cost of coding frames[b] with frames[p0] as its
 * list-0 reference and frames[p1] as its list-1 reference.
 *
 * p0 == p1 == b requests the intra cost, b == p1 a P frame, p0 < b < p1 a B frame.
 * Results are cached in fenc->costEst[b - p0][p1 - b]; the per-row work is done by
 * processRow(), either through the wavefront thread pool or inline when no pool is set.
 *
 * bIntraPenalty - when set, the returned score is increased in proportion to the number
 *                 of intra CUs recorded in fenc->intraMbs[b - p0]
 *
 * Returns the frame cost score. */
int64_t CostEstimate::estimateFrameCost(Lowres **frames, int p0, int p1, int b, bool bIntraPenalty)
{
    int64_t score = 0;
    Lowres *fenc = frames[b];

    /* Re-use the cached score when both the frame cost and the row costs are present */
    if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1 - b][0] != -1)
        score = fenc->costEst[b - p0][p1 - b];
    else
    {
        m_weightedRef.isWeighted = false;
        /* Weight analysis only runs for P frames whose list-0 MVs still carry the 0x7FFF
         * marker; the intra estimate is made first if it has not been calculated yet. */
        if (m_param->bEnableWeightedPred && b == p1 && b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF)
        {
            if (!fenc->bIntraCalculated)
                estimateFrameCost(frames, b, b, b, 0);
            weightsAnalyse(frames, b, p0);
        }

        /* For each list, check to see whether we have lowres motion-searched this reference */
        m_bDoSearch[0] = b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF;
        m_bDoSearch[1] = b != p1 && fenc->lowresMvs[1][p1 - b - 1][0].x == 0x7FFF;

        /* Clear the marker so the search is not requested again for this reference */
        if (m_bDoSearch[0]) fenc->lowresMvs[0][b - p0 - 1][0].x = 0;
        if (m_bDoSearch[1]) fenc->lowresMvs[1][p1 - b - 1][0].x = 0;

        /* Publish the job description read by processRow() */
        m_curb = b;
        m_curp0 = p0;
        m_curp1 = p1;
        m_curframes = frames;
        fenc->costEst[b - p0][p1 - b] = 0;
        fenc->costEstAq[b - p0][p1 - b] = 0;

        /* Reset per-row accumulators; intra row costs are only reset while pending */
        for (int i = 0; i < m_heightInCU; i++)
        {
            m_rows[i].init();
            if (!fenc->bIntraCalculated)
                fenc->rowSatds[0][0][i] = 0;
            fenc->rowSatds[b - p0][p1 - b][i] = 0;
        }

        m_bFrameCompleted = false;

        if (m_pool)
        {
            WaveFront::enqueue();

            // enableAllRows must be already called
            enqueueRow(0);
            /* Keep taking jobs on this thread until processRow() flags the frame complete */
            while (!m_bFrameCompleted)
                WaveFront::findJob(-1);

            WaveFront::dequeue();
        }
        else
        {
            /* No thread pool: process every row in order on the calling thread */
            for (int row = 0; row < m_heightInCU; row++)
                processRow(row, -1);

            x265_emms();
        }

        // Accumulate cost from each row
        for (int row = 0; row < m_heightInCU; row++)
        {
            score += m_rows[row].m_costEst;
            fenc->costEst[0][0] += m_rows[row].m_costIntra;
            if (m_param->rc.aqMode)
            {
                fenc->costEstAq[0][0] += m_rows[row].m_costIntraAq;
                fenc->costEstAq[b - p0][p1 - b] += m_rows[row].m_costEstAq;
            }
            fenc->intraMbs[b - p0] += m_rows[row].m_intraMbs;
        }

        fenc->bIntraCalculated = true;

        /* B frame scores are scaled by 100 / (130 + bFrameBias) */
        if (b != p1)
            score = (uint64_t)score * 100 / (130 + m_param->bFrameBias);
        if (b != p0 || b != p1) //Not Intra cost
            fenc->costEst[b - p0][p1 - b] = score;
    }

    if (bIntraPenalty)
    {
        // arbitrary penalty for I-blocks after B-frames
        int ncu = NUM_CUS;
        score += (uint64_t)score * fenc->intraMbs[b - p0] / (ncu * 8);
    }
    return score;
}
| 1362 | |
| 1363 | uint32_t CostEstimate::weightCostLuma(Lowres **frames, int b, int p0, WeightParam *wp) |
| 1364 | { |
| 1365 | Lowres *fenc = frames[b]; |
| 1366 | Lowres *ref = frames[p0]; |
| 1367 | pixel *src = ref->fpelPlane[0]; |
| 1368 | intptr_t stride = fenc->lumaStride; |
| 1369 | |
| 1370 | if (wp) |
| 1371 | { |
| 1372 | int offset = wp->inputOffset << (X265_DEPTH - 8); |
| 1373 | int scale = wp->inputWeight; |
| 1374 | int denom = wp->log2WeightDenom; |
| 1375 | int round = denom ? 1 << (denom - 1) : 0; |
| 1376 | int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth |
| 1377 | int widthHeight = (int)stride; |
| 1378 | |
| 1379 | primitives.weight_pp(ref->buffer[0], m_wbuffer[0], stride, widthHeight, m_paddedLines, |
| 1380 | scale, round << correction, denom + correction, offset); |
| 1381 | src = m_weightedRef.fpelPlane[0]; |
| 1382 | } |
| 1383 | |
| 1384 | uint32_t cost = 0; |
| 1385 | intptr_t pixoff = 0; |
| 1386 | int mb = 0; |
| 1387 | |
| 1388 | for (int y = 0; y < fenc->lines; y += 8, pixoff = y * stride) |
| 1389 | { |
| 1390 | for (int x = 0; x < fenc->width; x += 8, mb++, pixoff += 8) |
| 1391 | { |
| 1392 | int satd = primitives.satd[LUMA_8x8](src + pixoff, stride, fenc->fpelPlane[0] + pixoff, stride); |
| 1393 | cost += X265_MIN(satd, fenc->intraCost[mb]); |
| 1394 | } |
| 1395 | } |
| 1396 | |
| 1397 | return cost; |
| 1398 | } |
| 1399 | |
| 1400 | void CostEstimate::weightsAnalyse(Lowres **frames, int b, int p0) |
| 1401 | { |
| 1402 | static const float epsilon = 1.f / 128.f; |
| 1403 | Lowres *fenc, *ref; |
| 1404 | |
| 1405 | fenc = frames[b]; |
| 1406 | ref = frames[p0]; |
| 1407 | int deltaIndex = fenc->frameNum - ref->frameNum; |
| 1408 | |
| 1409 | /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */ |
| 1410 | float guessScale, fencMean, refMean; |
| 1411 | x265_emms(); |
| 1412 | if (fenc->wp_ssd[0] && ref->wp_ssd[0]) |
| 1413 | guessScale = sqrtf((float)fenc->wp_ssd[0] / ref->wp_ssd[0]); |
| 1414 | else |
| 1415 | guessScale = 1.0f; |
| 1416 | fencMean = (float)fenc->wp_sum[0] / (fenc->lines * fenc->width) / (1 << (X265_DEPTH - 8)); |
| 1417 | refMean = (float)ref->wp_sum[0] / (fenc->lines * fenc->width) / (1 << (X265_DEPTH - 8)); |
| 1418 | |
| 1419 | /* Early termination */ |
| 1420 | if (fabsf(refMean - fencMean) < 0.5f && fabsf(1.f - guessScale) < epsilon) |
| 1421 | return; |
| 1422 | |
| 1423 | int minoff = 0, minscale, mindenom; |
| 1424 | unsigned int minscore = 0, origscore = 1; |
| 1425 | int found = 0; |
| 1426 | |
| 1427 | m_w.setFromWeightAndOffset((int)(guessScale * 128 + 0.5f), 0, 7, true); |
| 1428 | mindenom = m_w.log2WeightDenom; |
| 1429 | minscale = m_w.inputWeight; |
| 1430 | |
| 1431 | origscore = minscore = weightCostLuma(frames, b, p0, NULL); |
| 1432 | |
| 1433 | if (!minscore) |
| 1434 | return; |
| 1435 | |
| 1436 | unsigned int s = 0; |
| 1437 | int curScale = minscale; |
| 1438 | int curOffset = (int)(fencMean - refMean * curScale / (1 << mindenom) + 0.5f); |
| 1439 | if (curOffset < -128 || curOffset > 127) |
| 1440 | { |
| 1441 | /* Rescale considering the constraints on curOffset. We do it in this order |
| 1442 | * because scale has a much wider range than offset (because of denom), so |
| 1443 | * it should almost never need to be clamped. */ |
| 1444 | curOffset = Clip3(-128, 127, curOffset); |
| 1445 | curScale = (int)((1 << mindenom) * (fencMean - curOffset) / refMean + 0.5f); |
| 1446 | curScale = Clip3(0, 127, curScale); |
| 1447 | } |
| 1448 | SET_WEIGHT(m_w, 1, curScale, mindenom, curOffset); |
| 1449 | s = weightCostLuma(frames, b, p0, &m_w); |
| 1450 | COPY4_IF_LT(minscore, s, minscale, curScale, minoff, curOffset, found, 1); |
| 1451 | |
| 1452 | /* Use a smaller denominator if possible */ |
| 1453 | while (mindenom > 0 && !(minscale & 1)) |
| 1454 | { |
| 1455 | mindenom--; |
| 1456 | minscale >>= 1; |
| 1457 | } |
| 1458 | |
| 1459 | if (!found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f) |
| 1460 | return; |
| 1461 | else |
| 1462 | { |
| 1463 | SET_WEIGHT(m_w, 1, minscale, mindenom, minoff); |
| 1464 | // set weighted delta cost |
| 1465 | fenc->weightedCostDelta[deltaIndex] = minscore / origscore; |
| 1466 | |
| 1467 | int offset = m_w.inputOffset << (X265_DEPTH - 8); |
| 1468 | int scale = m_w.inputWeight; |
| 1469 | int denom = m_w.log2WeightDenom; |
| 1470 | int round = denom ? 1 << (denom - 1) : 0; |
| 1471 | int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth |
| 1472 | intptr_t stride = ref->lumaStride; |
| 1473 | int widthHeight = (int)stride; |
| 1474 | |
| 1475 | for (int i = 0; i < 4; i++) |
| 1476 | primitives.weight_pp(ref->buffer[i], m_wbuffer[i], stride, widthHeight, m_paddedLines, |
| 1477 | scale, round << correction, denom + correction, offset); |
| 1478 | |
| 1479 | m_weightedRef.isWeighted = true; |
| 1480 | } |
| 1481 | } |
| 1482 | |
/* Wavefront job: estimate the cost of the CUs of one row for the frame set up by
 * estimateFrameCost() (m_curframes, m_curp0, m_curp1, m_curb).
 *
 * Job row `row` maps to CU row m_heightInCU - 1 - row, so job 0 is the bottom CU row, and
 * CUs within a row are visited from right to left.  A row may resume where it stopped:
 * the starting column is derived from m_completed.  The thread id argument is unused. */
void CostEstimate::processRow(int row, int /*threadId*/)
{
    ProfileScopeEvent(costEstimateRow);

    int realrow = m_heightInCU - 1 - row;
    Lowres **frames = m_curframes;
    /* Use the weighted reference copy when weightsAnalyse() selected one */
    ReferencePlanes *wfref0 = m_weightedRef.isWeighted ? &m_weightedRef : frames[m_curp0];

    /* Lowres lookahead goes backwards because the MVs are used as
     * predictors in the main encode. This considerably improves MV
     * prediction overall. */
    for (int i = m_widthInCU - 1 - m_rows[row].m_completed; i >= 0; i--)
    {
        // TODO: use lowres MVs as motion candidates in full-res search
        m_rows[row].estimateCUCost(frames, wfref0, i, realrow, m_curp0, m_curp1, m_curb, m_bDoSearch);
        m_rows[row].m_completed++;

        /* Once this row is at least two CUs in, start the next job row if it is idle
         * and at least two CUs behind this one. */
        if (m_rows[row].m_completed >= 2 && row < m_heightInCU - 1)
        {
            ScopedLock below(m_rows[row + 1].m_lock);
            if (m_rows[row + 1].m_active == false &&
                m_rows[row + 1].m_completed + 2 <= m_rows[row].m_completed)
            {
                m_rows[row + 1].m_active = true;
                enqueueRow(row + 1);
            }
        }

        /* Yield when the previous job row is less than two CUs ahead and this row is not
         * about to finish; the row is marked inactive so that the check above, run by the
         * previous row, can enqueue it again. */
        ScopedLock self(m_rows[row].m_lock);
        if (row > 0 && (int32_t)m_rows[row].m_completed < m_widthInCU - 1 &&
            m_rows[row - 1].m_completed < m_rows[row].m_completed + 2)
        {
            m_rows[row].m_active = false;
            return;
        }
    }

    /* The last job row finishing means the whole frame is done */
    if (row == m_heightInCU - 1)
        m_bFrameCompleted = true;
}
| 1523 | |
| 1524 | void EstimateRow::init() |
| 1525 | { |
| 1526 | m_costEst = 0; |
| 1527 | m_costEstAq = 0; |
| 1528 | m_costIntra = 0; |
| 1529 | m_costIntraAq = 0; |
| 1530 | m_intraMbs = 0; |
| 1531 | m_active = false; |
| 1532 | m_completed = 0; |
| 1533 | } |
| 1534 | |
| 1535 | void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2]) |
| 1536 | { |
| 1537 | Lowres *fref1 = frames[p1]; |
| 1538 | Lowres *fenc = frames[b]; |
| 1539 | |
| 1540 | const int bBidir = (b < p1); |
| 1541 | const int cuXY = cux + cuy * m_widthInCU; |
| 1542 | const int cuSize = X265_LOWRES_CU_SIZE; |
| 1543 | const intptr_t pelOffset = cuSize * cux + cuSize * cuy * fenc->lumaStride; |
| 1544 | |
| 1545 | // should this CU's cost contribute to the frame cost? |
| 1546 | const bool bFrameScoreCU = (cux > 0 && cux < m_widthInCU - 1 && |
| 1547 | cuy > 0 && cuy < m_heightInCU - 1) || m_widthInCU <= 2 || m_heightInCU <= 2; |
| 1548 | |
| 1549 | m_me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize); |
| 1550 | |
| 1551 | /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */ |
| 1552 | int lowresPenalty = 4; |
| 1553 | |
| 1554 | MV(*fenc_mvs[2]) = { &fenc->lowresMvs[0][b - p0 - 1][cuXY], |
| 1555 | &fenc->lowresMvs[1][p1 - b - 1][cuXY] }; |
| 1556 | int(*fenc_costs[2]) = { &fenc->lowresMvCosts[0][b - p0 - 1][cuXY], |
| 1557 | &fenc->lowresMvCosts[1][p1 - b - 1][cuXY] }; |
| 1558 | |
| 1559 | MV mvmin, mvmax; |
| 1560 | int bcost = m_me.COST_MAX; |
| 1561 | int listused = 0; |
| 1562 | |
| 1563 | // establish search bounds that don't cross extended frame boundaries |
| 1564 | mvmin.x = (int16_t)(-cux * cuSize - 8); |
| 1565 | mvmin.y = (int16_t)(-cuy * cuSize - 8); |
| 1566 | mvmax.x = (int16_t)((m_widthInCU - cux - 1) * cuSize + 8); |
| 1567 | mvmax.y = (int16_t)((m_heightInCU - cuy - 1) * cuSize + 8); |
| 1568 | |
| 1569 | if (p0 != p1) |
| 1570 | { |
| 1571 | for (int i = 0; i < 1 + bBidir; i++) |
| 1572 | { |
| 1573 | if (!bDoSearch[i]) |
| 1574 | { |
| 1575 | /* Use previously calculated cost */ |
| 1576 | COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1); |
| 1577 | continue; |
| 1578 | } |
| 1579 | int numc = 0; |
| 1580 | MV mvc[4], mvp; |
| 1581 | MV *fenc_mv = fenc_mvs[i]; |
| 1582 | |
| 1583 | /* Reverse-order MV prediction. */ |
| 1584 | mvc[0] = 0; |
| 1585 | mvc[2] = 0; |
| 1586 | #define MVC(mv) mvc[numc++] = mv; |
| 1587 | if (cux < m_widthInCU - 1) |
| 1588 | MVC(fenc_mv[1]); |
| 1589 | if (cuy < m_heightInCU - 1) |
| 1590 | { |
| 1591 | MVC(fenc_mv[m_widthInCU]); |
| 1592 | if (cux > 0) |
| 1593 | MVC(fenc_mv[m_widthInCU - 1]); |
| 1594 | if (cux < m_widthInCU - 1) |
| 1595 | MVC(fenc_mv[m_widthInCU + 1]); |
| 1596 | } |
| 1597 | #undef MVC |
| 1598 | if (numc <= 1) |
| 1599 | mvp = mvc[0]; |
| 1600 | else |
| 1601 | { |
| 1602 | median_mv(mvp, mvc[0], mvc[1], mvc[2]); |
| 1603 | } |
| 1604 | |
| 1605 | *fenc_costs[i] = m_me.motionEstimate(i ? fref1 : wfref0, mvmin, mvmax, mvp, numc, mvc, m_merange, *fenc_mvs[i]); |
| 1606 | COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1); |
| 1607 | } |
| 1608 | if (bBidir) |
| 1609 | { |
| 1610 | ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]); |
| 1611 | ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]); |
| 1612 | intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE; |
| 1613 | pixel *src0 = wfref0->lowresMC(pelOffset, *fenc_mvs[0], subpelbuf0, stride0); |
| 1614 | pixel *src1 = fref1->lowresMC(pelOffset, *fenc_mvs[1], subpelbuf1, stride1); |
| 1615 | |
| 1616 | ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]); |
| 1617 | primitives.pixelavg_pp[LUMA_8x8](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32); |
| 1618 | int bicost = primitives.satd[LUMA_8x8](fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE); |
| 1619 | COPY2_IF_LT(bcost, bicost, listused, 3); |
| 1620 | |
| 1621 | // Try 0,0 candidates |
| 1622 | src0 = wfref0->lowresPlane[0] + pelOffset; |
| 1623 | src1 = fref1->lowresPlane[0] + pelOffset; |
| 1624 | primitives.pixelavg_pp[LUMA_8x8](ref, X265_LOWRES_CU_SIZE, src0, wfref0->lumaStride, src1, fref1->lumaStride, 32); |
| 1625 | bicost = primitives.satd[LUMA_8x8](fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE); |
| 1626 | COPY2_IF_LT(bcost, bicost, listused, 3); |
| 1627 | } |
| 1628 | } |
| 1629 | if (!fenc->bIntraCalculated) |
| 1630 | { |
| 1631 | const int sizeIdx = X265_LOWRES_CU_BITS - 2; // partition size |
| 1632 | |
| 1633 | pixel _above0[X265_LOWRES_CU_SIZE * 4 + 1], *const above0 = _above0 + 2 * X265_LOWRES_CU_SIZE; |
| 1634 | pixel _above1[X265_LOWRES_CU_SIZE * 4 + 1], *const above1 = _above1 + 2 * X265_LOWRES_CU_SIZE; |
| 1635 | pixel _left0[X265_LOWRES_CU_SIZE * 4 + 1], *const left0 = _left0 + 2 * X265_LOWRES_CU_SIZE; |
| 1636 | pixel _left1[X265_LOWRES_CU_SIZE * 4 + 1], *const left1 = _left1 + 2 * X265_LOWRES_CU_SIZE; |
| 1637 | |
| 1638 | pixel *pix_cur = fenc->lowresPlane[0] + pelOffset; |
| 1639 | |
| 1640 | // Copy Above |
| 1641 | memcpy(above0, pix_cur - 1 - fenc->lumaStride, (cuSize + 1) * sizeof(pixel)); |
| 1642 | |
| 1643 | // Copy Left |
| 1644 | for (int i = 0; i < cuSize + 1; i++) |
| 1645 | left0[i] = pix_cur[-1 - fenc->lumaStride + i * fenc->lumaStride]; |
| 1646 | |
| 1647 | for (int i = 0; i < cuSize; i++) |
| 1648 | { |
| 1649 | above0[cuSize + i + 1] = above0[cuSize]; |
| 1650 | left0[cuSize + i + 1] = left0[cuSize]; |
| 1651 | } |
| 1652 | |
| 1653 | // filtering with [1 2 1] |
| 1654 | // assume getUseStrongIntraSmoothing() is disabled |
| 1655 | above1[0] = above0[0]; |
| 1656 | above1[2 * cuSize] = above0[2 * cuSize]; |
| 1657 | left1[0] = left0[0]; |
| 1658 | left1[2 * cuSize] = left0[2 * cuSize]; |
| 1659 | for (int i = 1; i < 2 * cuSize; i++) |
| 1660 | { |
| 1661 | above1[i] = (above0[i - 1] + 2 * above0[i] + above0[i + 1] + 2) >> 2; |
| 1662 | left1[i] = (left0[i - 1] + 2 * left0[i] + left0[i + 1] + 2) >> 2; |
| 1663 | } |
| 1664 | |
| 1665 | int predsize = cuSize * cuSize; |
| 1666 | |
| 1667 | // generate 35 intra predictions into m_predictions |
| 1668 | pixelcmp_t satd = primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)]; |
| 1669 | int icost = m_me.COST_MAX; |
| 1670 | primitives.intra_pred[DC_IDX][sizeIdx](m_predictions, cuSize, left0, above0, 0, (cuSize <= 16)); |
| 1671 | int cost = m_me.bufSATD(m_predictions, cuSize); |
| 1672 | if (cost < icost) |
| 1673 | icost = cost; |
| 1674 | pixel *above = (cuSize >= 8) ? above1 : above0; |
| 1675 | pixel *left = (cuSize >= 8) ? left1 : left0; |
| 1676 | primitives.intra_pred[PLANAR_IDX][sizeIdx](m_predictions, cuSize, left, above, 0, 0); |
| 1677 | cost = m_me.bufSATD(m_predictions, cuSize); |
| 1678 | if (cost < icost) |
| 1679 | icost = cost; |
| 1680 | primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 * predsize, above0, left0, above1, left1, (cuSize <= 16)); |
| 1681 | |
| 1682 | // calculate satd costs, keep least cost |
| 1683 | ALIGN_VAR_32(pixel, buf_trans[32 * 32]); |
| 1684 | primitives.transpose[sizeIdx](buf_trans, m_me.fencPUYuv.m_buf[0], FENC_STRIDE); |
| 1685 | |
| 1686 | int acost = m_me.COST_MAX; |
| 1687 | uint32_t mode, lowmode = 4; |
| 1688 | for (mode = 5; mode < 35; mode += 5) |
| 1689 | { |
| 1690 | if (mode < 18) |
| 1691 | cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize); |
| 1692 | else |
| 1693 | cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize); |
| 1694 | COPY2_IF_LT(acost, cost, lowmode, mode); |
| 1695 | } |
| 1696 | for (uint32_t dist = 2; dist >= 1; dist--) |
| 1697 | { |
| 1698 | mode = lowmode - dist; |
| 1699 | if (mode < 18) |
| 1700 | cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize); |
| 1701 | else |
| 1702 | cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize); |
| 1703 | COPY2_IF_LT(acost, cost, lowmode, mode); |
| 1704 | |
| 1705 | mode = lowmode + dist; |
| 1706 | if (mode < 18) |
| 1707 | cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize); |
| 1708 | else |
| 1709 | cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize); |
| 1710 | COPY2_IF_LT(acost, cost, lowmode, mode); |
| 1711 | } |
| 1712 | if (acost < icost) |
| 1713 | icost = acost; |
| 1714 | |
| 1715 | const int intraPenalty = 5 * m_lookAheadLambda; |
| 1716 | icost += intraPenalty + lowresPenalty; /* estimate intra signal cost */ |
| 1717 | fenc->intraCost[cuXY] = icost; |
| 1718 | fenc->intraMode[cuXY] = (uint8_t)lowmode; |
| 1719 | int icostAq = icost; |
| 1720 | if (bFrameScoreCU) |
| 1721 | { |
| 1722 | m_costIntra += icost; |
| 1723 | if (fenc->invQscaleFactor) |
| 1724 | { |
| 1725 | icostAq = (icost * fenc->invQscaleFactor[cuXY] + 128) >> 8; |
| 1726 | m_costIntraAq += icostAq; |
| 1727 | } |
| 1728 | } |
| 1729 | fenc->rowSatds[0][0][cuy] += icostAq; |
| 1730 | } |
| 1731 | bcost += lowresPenalty; |
| 1732 | if (!bBidir) |
| 1733 | { |
| 1734 | if (fenc->intraCost[cuXY] < bcost) |
| 1735 | { |
| 1736 | if (bFrameScoreCU) m_intraMbs++; |
| 1737 | bcost = fenc->intraCost[cuXY]; |
| 1738 | listused = 0; |
| 1739 | } |
| 1740 | } |
| 1741 | |
| 1742 | /* For I frames these costs were accumulated earlier */ |
| 1743 | if (p0 != p1) |
| 1744 | { |
| 1745 | int bcostAq = bcost; |
| 1746 | if (bFrameScoreCU) |
| 1747 | { |
| 1748 | m_costEst += bcost; |
| 1749 | if (fenc->invQscaleFactor) |
| 1750 | { |
| 1751 | bcostAq = (bcost * fenc->invQscaleFactor[cuXY] + 128) >> 8; |
| 1752 | m_costEstAq += bcostAq; |
| 1753 | } |
| 1754 | } |
| 1755 | fenc->rowSatds[b - p0][p1 - b][cuy] += bcostAq; |
| 1756 | } |
| 1757 | fenc->lowresCosts[b - p0][p1 - b][cuXY] = (uint16_t)(X265_MIN(bcost, LOWRES_COST_MASK) | (listused << LOWRES_COST_SHIFT)); |
| 1758 | } |