source/encoder/weightPrediction.cpp

   1 /*****************************************************************************
   2  * Copyright (C) 2013 x265 project
   3  *
   4  * Author: Shazeb Nawaz Khan <shazeb@multicorewareinc.com>
   5  *         Steve Borho <steve@borho.org>
   6  *         Kavitha Sampas <kavitha@multicorewareinc.com>
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * along with this program; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  21  *
  22  * This program is also available under a commercial proprietary license.
  23  * For more information, contact us at license @ x265.com.
  24  *****************************************************************************/
  25
  26 #include "common.h"
  27 #include "frame.h"
  28 #include "picyuv.h"
  29 #include "lowres.h"
  30 #include "mv.h"
  31 #include "slicetype.h"
  32 #include "bitstream.h"
  33
  34 using namespace x265;
  35 namespace {
  36 struct Cache
  37 {
  38     const int * intraCost;
  39     int         numPredDir;
  40     int         csp;
  41     int         hshift;
  42     int         vshift;
  43     int         lowresWidthInCU;
  44     int         lowresHeightInCU;
  45 };
  46
  47 int sliceHeaderCost(WeightParam *w, int lambda, int bChroma)
  48 {
  49     /* 4 times higher, because chroma is analyzed at full resolution. */
  50     if (bChroma)
  51         lambda *= 4;
  52     int denomCost = bs_size_ue(w[0].log2WeightDenom) * (2 - bChroma);
  53     return lambda * (10 + denomCost + 2 * (bs_size_se(w[0].inputWeight) + bs_size_se(w[0].inputOffset)));
  54 }
  55
  56 /* make a motion compensated copy of lowres ref into mcout with the same stride.
  57  * The borders of mcout are not extended */
  58 void mcLuma(pixel* mcout, Lowres& ref, const MV * mvs)
  59 {
  60     intptr_t stride = ref.lumaStride;
  61     const int cuSize = 8;
  62     MV mvmin, mvmax;
  63
  64     int cu = 0;
  65
  66     for (int y = 0; y < ref.lines; y += cuSize)
  67     {
  68         intptr_t pixoff = y * stride;
  69         mvmin.y = (int16_t)((-y - 8) << 2);
  70         mvmax.y = (int16_t)((ref.lines - y - 1 + 8) << 2);
  71
  72         for (int x = 0; x < ref.width; x += cuSize, pixoff += cuSize, cu++)
  73         {
  74             ALIGN_VAR_16(pixel, buf8x8[8 * 8]);
  75             intptr_t bstride = 8;
  76             mvmin.x = (int16_t)((-x - 8) << 2);
  77             mvmax.x = (int16_t)((ref.width - x - 1 + 8) << 2);
  78
  79             /* clip MV to available pixels */
  80             MV mv = mvs[cu];
  81             mv = mv.clipped(mvmin, mvmax);
  82             pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride);
  83             primitives.luma_copy_pp[LUMA_8x8](mcout + pixoff, stride, tmp, bstride);
  84         }
  85     }
  86 }
  87
  88 /* use lowres MVs from lookahead to generate a motion compensated chroma plane.
  89  * if a block had cheaper lowres cost as intra, we treat it as MV 0 */
  90 void mcChroma(pixel *      mcout,
  91               pixel *      src,
  92               intptr_t     stride,
  93               const MV *   mvs,
  94               const Cache& cache,
  95               int          height,
  96               int          width)
  97 {
  98     /* the motion vectors correspond to 8x8 lowres luma blocks, or 16x16 fullres
  99      * luma blocks. We have to adapt block size to chroma csp */
 100     int csp = cache.csp;
 101     int bw = 16 >> cache.hshift;
 102     int bh = 16 >> cache.vshift;
 103     MV mvmin, mvmax;
 104
 105     for (int y = 0; y < height; y += bh)
 106     {
 107         /* note: lowres block count per row might be different from chroma block
 108          * count per row because of rounding issues, so be very careful with indexing
 109          * into the lowres structures */
 110         int cu = y * cache.lowresWidthInCU;
 111         intptr_t pixoff = y * stride;
 112         mvmin.y = (int16_t)((-y - 8) << 2);
 113         mvmax.y = (int16_t)((height - y - 1 + 8) << 2);
 114
 115         for (int x = 0; x < width; x += bw, cu++, pixoff += bw)
 116         {
 117             if (x < cache.lowresWidthInCU && y < cache.lowresHeightInCU)
 118             {
 119                 MV mv = mvs[cu]; // lowres MV
 120                 mv <<= 1;        // fullres MV
 121                 mv.x >>= cache.hshift;
 122                 mv.y >>= cache.vshift;
 123
 124                 /* clip MV to available pixels */
 125                 mvmin.x = (int16_t)((-x - 8) << 2);
 126                 mvmax.x = (int16_t)((width - x - 1 + 8) << 2);
 127                 mv = mv.clipped(mvmin, mvmax);
 128
 129                 intptr_t fpeloffset = (mv.y >> 2) * stride + (mv.x >> 2);
 130                 pixel *temp = src + pixoff + fpeloffset;
 131
 132                 int xFrac = mv.x & 0x7;
 133                 int yFrac = mv.y & 0x7;
 134                 if ((yFrac | xFrac) == 0)
 135                 {
 136                     primitives.chroma[csp].copy_pp[LUMA_16x16](mcout + pixoff, stride, temp, stride);
 137                 }
 138                 else if (yFrac == 0)
 139                 {
 140                     primitives.chroma[csp].filter_hpp[LUMA_16x16](temp, stride, mcout + pixoff, stride, xFrac);
 141                 }
 142                 else if (xFrac == 0)
 143                 {
 144                     primitives.chroma[csp].filter_vpp[LUMA_16x16](temp, stride, mcout + pixoff, stride, yFrac);
 145                 }
 146                 else
 147                 {
 148                     ALIGN_VAR_16(int16_t, imm[16 * (16 + NTAPS_CHROMA)]);
 149                     primitives.chroma[csp].filter_hps[LUMA_16x16](temp, stride, imm, bw, xFrac, 1);
 150                     primitives.chroma[csp].filter_vsp[LUMA_16x16](imm + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
 151                 }
 152             }
 153             else
 154             {
 155                 primitives.chroma[csp].copy_pp[LUMA_16x16](mcout + pixoff, stride, src + pixoff, stride);
 156             }
 157         }
 158     }
 159 }
 160
 161 /* Measure sum of 8x8 satd costs between source frame and reference
 162  * frame (potentially weighted, potentially motion compensated). We
 163  * always use source images for this analysis since reference recon
 164  * pixels have unreliable availability */
 165 uint32_t weightCost(pixel *         fenc,
 166                     pixel *         ref,
 167                     pixel *         weightTemp,
 168                     intptr_t        stride,
 169                     const Cache &   cache,
 170                     int             width,
 171                     int             height,
 172                     WeightParam *   w,
 173                     bool            bLuma)
 174 {
 175     if (w)
 176     {
 177         /* make a weighted copy of the reference plane */
 178         int offset = w->inputOffset << (X265_DEPTH - 8);
 179         int weight = w->inputWeight;
 180         int denom = w->log2WeightDenom;
 181         int round = denom ? 1 << (denom - 1) : 0;
 182         int correction = IF_INTERNAL_PREC - X265_DEPTH; /* intermediate interpolation depth */
 183         int pwidth = ((width + 15) >> 4) << 4;
 184
 185         primitives.weight_pp(ref, weightTemp, stride, pwidth, height,
 186                              weight, round << correction, denom + correction, offset);
 187         ref = weightTemp;
 188     }
 189
 190     uint32_t cost = 0;
 191     pixel *f = fenc, *r = ref;
 192
 193     if (bLuma)
 194     {
 195         int cu = 0;
 196         for (int y = 8; y < height; y += 8, r += 8 * stride, f += 8 * stride)
 197         {
 198             for (int x = 8; x < width; x += 8, cu++)
 199             {
 200                 int cmp = primitives.satd[LUMA_8x8](r + x, stride, f + x, stride);
 201                 cost += X265_MIN(cmp, cache.intraCost[cu]);
 202             }
 203         }
 204     }
 205     else if (cache.csp == X265_CSP_I444)
 206         for (int y = 16; y < height; y += 16, r += 16 * stride, f += 16 * stride)
 207             for (int x = 16; x < width; x += 16)
 208                 cost += primitives.satd[LUMA_16x16](r + x, stride, f + x, stride);
 209     else
 210         for (int y = 8; y < height; y += 8, r += 8 * stride, f += 8 * stride)
 211             for (int x = 8; x < width; x += 8)
 212                 cost += primitives.satd[LUMA_8x8](r + x, stride, f + x, stride);
 213
 214     return cost;
 215 }
 216 }
 217
 218 namespace x265 {
 219 void weightAnalyse(Slice& slice, Frame& frame, x265_param& param)
 220 {
 221     WeightParam wp[2][MAX_NUM_REF][3];
 222     PicYuv *fencPic = frame.m_fencPic;
 223     Lowres& fenc    = frame.m_lowres;
 224
 225     Cache cache;
 226
 227     memset(&cache, 0, sizeof(cache));
 228     cache.intraCost = fenc.intraCost;
 229     cache.numPredDir = slice.isInterP() ? 1 : 2;
 230     cache.lowresWidthInCU = fenc.width >> 3;
 231     cache.lowresHeightInCU = fenc.lines >> 3;
 232     cache.csp = fencPic->m_picCsp;
 233     cache.hshift = CHROMA_H_SHIFT(cache.csp);
 234     cache.vshift = CHROMA_V_SHIFT(cache.csp);
 235
 236     /* Use single allocation for motion compensated ref and weight buffers */
 237     pixel *mcbuf = X265_MALLOC(pixel, 2 * fencPic->m_stride * fencPic->m_picHeight);
 238     if (!mcbuf)
 239     {
 240         slice.disableWeights();
 241         return;
 242     }
 243     pixel *weightTemp = mcbuf + fencPic->m_stride * fencPic->m_picHeight;
 244
 245     int lambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
 246     int curPoc = slice.m_poc;
 247     const float epsilon = 1.f / 128.f;
 248
 249     int chromaDenom, lumaDenom, denom;
 250     chromaDenom = lumaDenom = 7;
 251     int numpixels[3];
 252     int w16 = ((fencPic->m_picWidth  + 15) >> 4) << 4;
 253     int h16 = ((fencPic->m_picHeight + 15) >> 4) << 4;
 254     numpixels[0] = w16 * h16;
 255     numpixels[1] = numpixels[2] = numpixels[0] >> (cache.hshift + cache.vshift);
 256
 257     for (int list = 0; list < cache.numPredDir; list++)
 258     {
 259         WeightParam *weights = wp[list][0];
 260         Frame *refFrame = slice.m_refPicList[list][0];
 261         Lowres& refLowres = refFrame->m_lowres;
 262         int diffPoc = abs(curPoc - refFrame->m_poc);
 263
 264         /* prepare estimates */
 265         float guessScale[3], fencMean[3], refMean[3];
 266         for (int plane = 0; plane < 3; plane++)
 267         {
 268             SET_WEIGHT(weights[plane], false, 1, 0, 0);
 269             uint64_t fencVar = fenc.wp_ssd[plane] + !refLowres.wp_ssd[plane];
 270             uint64_t refVar  = refLowres.wp_ssd[plane] + !refLowres.wp_ssd[plane];
 271             guessScale[plane] = sqrt((float)fencVar / refVar);
 272             fencMean[plane] = (float)fenc.wp_sum[plane] / (numpixels[plane]) / (1 << (X265_DEPTH - 8));
 273             refMean[plane]  = (float)refLowres.wp_sum[plane] / (numpixels[plane]) / (1 << (X265_DEPTH - 8));
 274         }
 275
 276         /* make sure both our scale factors fit */
 277         while (!list && chromaDenom > 0)
 278         {
 279             float thresh = 127.f / (1 << chromaDenom);
 280             if (guessScale[1] < thresh && guessScale[2] < thresh)
 281                 break;
 282             chromaDenom--;
 283         }
 284
 285         SET_WEIGHT(weights[1], false, 1 << chromaDenom, chromaDenom, 0);
 286         SET_WEIGHT(weights[2], false, 1 << chromaDenom, chromaDenom, 0);
 287
 288         MV *mvs = NULL;
 289
 290         for (int plane = 0; plane < 3; plane++)
 291         {
 292             denom = plane ? chromaDenom : lumaDenom;
 293             if (plane && !weights[0].bPresentFlag)
 294                 break;
 295
 296             /* Early termination */
 297             x265_emms();
 298             if (fabsf(refMean[plane] - fencMean[plane]) < 0.5f && fabsf(1.f - guessScale[plane]) < epsilon)
 299             {
 300                 SET_WEIGHT(weights[plane], 0, 1 << denom, denom, 0);
 301                 continue;
 302             }
 303
 304             if (plane)
 305             {
 306                 int scale = Clip3(0, 255, (int)(guessScale[plane] * (1 << denom) + 0.5f));
 307                 if (scale > 127)
 308                     continue;
 309                 weights[plane].inputWeight = scale;
 310             }
 311             else
 312             {
 313                 weights[plane].setFromWeightAndOffset((int)(guessScale[plane] * (1 << denom) + 0.5f), 0, denom, !list);
 314             }
 315
 316             int mindenom = weights[plane].log2WeightDenom;
 317             int minscale = weights[plane].inputWeight;
 318             int minoff = 0;
 319
 320             if (!plane && diffPoc <= param.bframes + 1)
 321             {
 322                 mvs = fenc.lowresMvs[list][diffPoc - 1];
 323
 324                 /* test whether this motion search was performed by lookahead */
 325                 if (mvs[0].x != 0x7FFF)
 326                 {
 327                     /* reference chroma planes must be extended prior to being
 328                      * used as motion compensation sources */
 329                     if (!refFrame->m_bChromaExtended)
 330                     {
 331                         refFrame->m_bChromaExtended = true;
 332                         PicYuv *refPic = refFrame->m_fencPic;
 333                         int width = refPic->m_picWidth >> cache.hshift;
 334                         int height = refPic->m_picHeight >> cache.vshift;
 335                         extendPicBorder(refPic->m_picOrg[1], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY);
 336                         extendPicBorder(refPic->m_picOrg[2], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY);
 337                     }
 338                 }
 339                 else
 340                     mvs = 0;
 341             }
 342
 343             /* prepare inputs to weight analysis */
 344             pixel *orig;
 345             pixel *fref;
 346             intptr_t stride;
 347             int    width, height;
 348             switch (plane)
 349             {
 350             case 0:
 351                 orig = fenc.lowresPlane[0];
 352                 stride = fenc.lumaStride;
 353                 width = fenc.width;
 354                 height = fenc.lines;
 355                 fref = refLowres.lowresPlane[0];
 356                 if (mvs)
 357                 {
 358                     mcLuma(mcbuf, refLowres, mvs);
 359                     fref = mcbuf;
 360                 }
 361                 break;
 362
 363             case 1:
 364                 orig = fencPic->m_picOrg[1];
 365                 stride = fencPic->m_strideC;
 366                 fref = refFrame->m_fencPic->m_picOrg[1];
 367
 368                 /* Clamp the chroma dimensions to the nearest multiple of
 369                  * 8x8 blocks (or 16x16 for 4:4:4) since mcChroma uses lowres
 370                  * blocks and weightCost measures 8x8 blocks. This
 371                  * potentially ignores some edge pixels, but simplifies the
 372                  * logic and prevents reading uninitialized pixels. Lowres
 373                  * planes are border extended and require no clamping. */
 374                 width =  ((fencPic->m_picWidth  >> 4) << 4) >> cache.hshift;
 375                 height = ((fencPic->m_picHeight >> 4) << 4) >> cache.vshift;
 376                 if (mvs)
 377                 {
 378                     mcChroma(mcbuf, fref, stride, mvs, cache, height, width);
 379                     fref = mcbuf;
 380                 }
 381                 break;
 382
 383             case 2:
 384                 fref = refFrame->m_fencPic->m_picOrg[2];
 385                 orig = fencPic->m_picOrg[2];
 386                 stride = fencPic->m_strideC;
 387                 width =  ((fencPic->m_picWidth  >> 4) << 4) >> cache.hshift;
 388                 height = ((fencPic->m_picHeight >> 4) << 4) >> cache.vshift;
 389                 if (mvs)
 390                 {
 391                     mcChroma(mcbuf, fref, stride, mvs, cache, height, width);
 392                     fref = mcbuf;
 393                 }
 394                 break;
 395
 396             default:
 397                 slice.disableWeights();
 398                 X265_FREE(mcbuf);
 399                 return;
 400             }
 401
 402             uint32_t origscore = weightCost(orig, fref, weightTemp, stride, cache, width, height, NULL, !plane);
 403             if (!origscore)
 404             {
 405                 SET_WEIGHT(weights[plane], 0, 1 << denom, denom, 0);
 406                 continue;
 407             }
 408
 409             uint32_t minscore = origscore;
 410             bool bFound = false;
 411
 412             /* x264 uses a table lookup here, selecting search range based on preset */
 413             static const int scaleDist = 4;
 414             static const int offsetDist = 2;
 415
 416             int startScale = Clip3(0, 127, minscale - scaleDist);
 417             int endScale   = Clip3(0, 127, minscale + scaleDist);
 418             for (int scale = startScale; scale <= endScale; scale++)
 419             {
 420                 int deltaWeight = scale - (1 << mindenom);
 421                 if (deltaWeight > 127 || deltaWeight <= -128)
 422                     continue;
 423
 424                 x265_emms();
 425                 int curScale = scale;
 426                 int curOffset = (int)(fencMean[plane] - refMean[plane] * curScale / (1 << mindenom) + 0.5f);
 427                 if (curOffset < -128 || curOffset > 127)
 428                 {
 429                     /* Rescale considering the constraints on curOffset. We do it in this order
 430                      * because scale has a much wider range than offset (because of denom), so
 431                      * it should almost never need to be clamped. */
 432                     curOffset = Clip3(-128, 127, curOffset);
 433                     curScale = (int)((1 << mindenom) * (fencMean[plane] - curOffset) / refMean[plane] + 0.5f);
 434                     curScale = Clip3(0, 127, curScale);
 435                 }
 436
 437                 int startOffset = Clip3(-128, 127, curOffset - offsetDist);
 438                 int endOffset   = Clip3(-128, 127, curOffset + offsetDist);
 439                 for (int off = startOffset; off <= endOffset; off++)
 440                 {
 441                     WeightParam wsp;
 442                     SET_WEIGHT(wsp, true, curScale, mindenom, off);
 443                     uint32_t s = weightCost(orig, fref, weightTemp, stride, cache, width, height, &wsp, !plane) +
 444                                  sliceHeaderCost(&wsp, lambda, !!plane);
 445                     COPY4_IF_LT(minscore, s, minscale, curScale, minoff, off, bFound, true);
 446
 447                     /* Don't check any more offsets if the previous one had a lower cost than the current one */
 448                     if (minoff == startOffset && off != startOffset)
 449                         break;
 450                 }
 451             }
 452
 453             /* Use a smaller luma denominator if possible */
 454             if (!(plane || list))
 455             {
 456                 while (mindenom > 0 && !(minscale & 1))
 457                 {
 458                     mindenom--;
 459                     minscale >>= 1;
 460                 }
 461             }
 462
 463             if (!bFound || (minscale == (1 << mindenom) && minoff == 0) || (float)minscore / origscore > 0.998f)
 464             {
 465                 SET_WEIGHT(weights[plane], false, 1 << denom, denom, 0);
 466             }
 467             else
 468             {
 469                 SET_WEIGHT(weights[plane], true, minscale, mindenom, minoff);
 470             }
 471         }
 472
 473         if (weights[0].bPresentFlag)
 474         {
 475             // Make sure both chroma channels match
 476             if (weights[1].bPresentFlag != weights[2].bPresentFlag)
 477             {
 478                 if (weights[1].bPresentFlag)
 479                     weights[2] = weights[1];
 480                 else
 481                     weights[1] = weights[2];
 482             }
 483         }
 484
 485         lumaDenom = weights[0].log2WeightDenom;
 486         chromaDenom = weights[1].log2WeightDenom;
 487
 488         /* reset weight states */
 489         for (int ref = 1; ref < slice.m_numRefIdx[list]; ref++)
 490         {
 491             SET_WEIGHT(wp[list][ref][0], false, 1 << lumaDenom, lumaDenom, 0);
 492             SET_WEIGHT(wp[list][ref][1], false, 1 << chromaDenom, chromaDenom, 0);
 493             SET_WEIGHT(wp[list][ref][2], false, 1 << chromaDenom, chromaDenom, 0);
 494         }
 495     }
 496
 497     X265_FREE(mcbuf);
 498
 499     memcpy(slice.m_weightPredTable, wp, sizeof(WeightParam) * 2 * MAX_NUM_REF * 3);
 500
 501     if (param.logLevel >= X265_LOG_FULL)
 502     {
 503         char buf[1024];
 504         int p = 0;
 505         bool bWeighted = false;
 506
 507         p = sprintf(buf, "poc: %d weights:", slice.m_poc);
 508         int numPredDir = slice.isInterP() ? 1 : 2;
 509         for (int list = 0; list < numPredDir; list++)
 510         {
 511             WeightParam* w = &wp[list][0][0];
 512             if (w[0].bPresentFlag || w[1].bPresentFlag || w[2].bPresentFlag)
 513             {
 514                 bWeighted = true;
 515                 p += sprintf(buf + p, " [L%d:R0 ", list);
 516                 if (w[0].bPresentFlag)
 517                     p += sprintf(buf + p, "Y{%d/%d%+d}", w[0].inputWeight, 1 << w[0].log2WeightDenom, w[0].inputOffset);
 518                 if (w[1].bPresentFlag)
 519                     p += sprintf(buf + p, "U{%d/%d%+d}", w[1].inputWeight, 1 << w[1].log2WeightDenom, w[1].inputOffset);
 520                 if (w[2].bPresentFlag)
 521                     p += sprintf(buf + p, "V{%d/%d%+d}", w[2].inputWeight, 1 << w[2].log2WeightDenom, w[2].inputOffset);
 522                 p += sprintf(buf + p, "]");
 523             }
 524         }
 525
 526         if (bWeighted)
 527         {
 528             if (p < 80) // pad with spaces to ensure progress line overwritten
 529                 sprintf(buf + p, "%*s", 80 - p, " ");
 530             x265_log(&param, X265_LOG_FULL, "%s\n", buf);
 531         }
 532     }
 533 }
 534 }