source/encoder/framefilter.cpp

   1 /*****************************************************************************
   2  * Copyright (C) 2013 x265 project
   3  *
   4  * Authors: Chung Shin Yee <shinyee@multicorewareinc.com>
   5  *          Min Chen <chenm003@163.com>
   6  *
   7  * This program is free software; you can redistribute it and/or modify
   8  * it under the terms of the GNU General Public License as published by
   9  * the Free Software Foundation; either version 2 of the License, or
  10  * (at your option) any later version.
  11  *
  12  * This program is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with this program; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  20  *
  21  * This program is also available under a commercial proprietary license.
  22  * For more information, contact us at license @ x265.com.
  23  *****************************************************************************/
  24
  25 #include "common.h"
  26 #include "frame.h"
  27 #include "framedata.h"
  28 #include "encoder.h"
  29 #include "framefilter.h"
  30 #include "frameencoder.h"
  31 #include "wavefront.h"
  32 #include "PPA/ppa.h"
  33
  34 using namespace x265;
  35
  36 static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height);
  37 static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt);
  38
  39 FrameFilter::FrameFilter()
  40     : m_param(NULL)
  41     , m_frame(NULL)
  42     , m_frameEncoder(NULL)
  43     , m_ssimBuf(NULL)
  44 {
  45 }
  46
  47 void FrameFilter::destroy()
  48 {
  49     if (m_param->bEnableSAO)
  50         m_sao.destroy();
  51
  52     X265_FREE(m_ssimBuf);
  53 }
  54
  55 void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows)
  56 {
  57     m_param = top->m_param;
  58     m_frameEncoder = frame;
  59     m_numRows = numRows;
  60     m_hChromaShift = CHROMA_H_SHIFT(m_param->internalCsp);
  61     m_vChromaShift = CHROMA_V_SHIFT(m_param->internalCsp);
  62     m_pad[0] = top->m_sps.conformanceWindow.rightOffset;
  63     m_pad[1] = top->m_sps.conformanceWindow.bottomOffset;
  64     m_saoRowDelay = m_param->bEnableLoopFilter ? 1 : 0;
  65     m_lastHeight = m_param->sourceHeight % g_maxCUSize ? m_param->sourceHeight % g_maxCUSize : g_maxCUSize;
  66
  67     m_deblock.init();
  68
  69     if (m_param->bEnableSAO)
  70         if (!m_sao.create(m_param))
  71             m_param->bEnableSAO = 0;
  72
  73     if (m_param->bEnableSsim)
  74         m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3));
  75 }
  76
  77 void FrameFilter::start(Frame *frame, Entropy& initState, int qp)
  78 {
  79     m_frame = frame;
  80
  81     if (m_param->bEnableSAO)
  82         m_sao.startSlice(frame, initState, qp);
  83 }
  84
  85 void FrameFilter::processRow(int row)
  86 {
  87     PPAScopeEvent(Thread_filterCU);
  88
  89     if (!m_param->bEnableLoopFilter && !m_param->bEnableSAO)
  90     {
  91         processRowPost(row);
  92         return;
  93     }
  94     FrameData& encData = *m_frame->m_encData;
  95     const uint32_t numCols = encData.m_slice->m_sps->numCuInWidth;
  96     const uint32_t lineStartCUAddr = row * numCols;
  97
  98     if (m_param->bEnableLoopFilter)
  99     {
 100         for (uint32_t col = 0; col < numCols; col++)
 101         {
 102             uint32_t cuAddr = lineStartCUAddr + col;
 103             CUData* cu = encData.getPicCTU(cuAddr);
 104
 105             m_deblock.deblockCTU(cu, Deblock::EDGE_VER);
 106
 107             if (col > 0)
 108             {
 109                 CUData* cuPrev = encData.getPicCTU(cuAddr - 1);
 110                 m_deblock.deblockCTU(cuPrev, Deblock::EDGE_HOR);
 111             }
 112         }
 113
 114         CUData* cuPrev = encData.getPicCTU(lineStartCUAddr + numCols - 1);
 115         m_deblock.deblockCTU(cuPrev, Deblock::EDGE_HOR);
 116     }
 117
 118     // SAO
 119     SAOParam* saoParam = encData.m_saoParam;
 120     if (m_param->bEnableSAO)
 121     {
 122         m_sao.m_entropyCoder.load(m_frameEncoder->m_initSliceContext);
 123         m_sao.m_rdContexts.next.load(m_frameEncoder->m_initSliceContext);
 124         m_sao.m_rdContexts.cur.load(m_frameEncoder->m_initSliceContext);
 125
 126         m_sao.rdoSaoUnitRow(saoParam, row);
 127
 128         // NOTE: Delay a row because SAO decide need top row pixels at next row, is it HM's bug?
 129         if (row >= m_saoRowDelay)
 130             processSao(row - m_saoRowDelay);
 131     }
 132
 133     // this row of CTUs has been encoded
 134
 135     if (row > 0)
 136         processRowPost(row - 1);
 137
 138     if (row == m_numRows - 1)
 139     {
 140         if (m_param->bEnableSAO)
 141         {
 142             m_sao.rdoSaoUnitRowEnd(saoParam, encData.m_slice->m_sps->numCUsInFrame);
 143
 144             for (int i = m_numRows - m_saoRowDelay; i < m_numRows; i++)
 145                 processSao(i);
 146         }
 147
 148         processRowPost(row);
 149     }
 150 }
 151
 152 uint32_t FrameFilter::getCUHeight(int rowNum) const
 153 {
 154     return rowNum == m_numRows - 1 ? m_lastHeight : g_maxCUSize;
 155 }
 156
 157 void FrameFilter::processRowPost(int row)
 158 {
 159     PicYuv *reconPic = m_frame->m_reconPicYuv;
 160     const uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth;
 161     const uint32_t lineStartCUAddr = row * numCols;
 162     const int realH = getCUHeight(row);
 163
 164     // Border extend Left and Right
 165     primitives.extendRowBorder(reconPic->getLumaAddr(lineStartCUAddr), reconPic->m_stride, reconPic->m_picWidth, realH, reconPic->m_lumaMarginX);
 166     primitives.extendRowBorder(reconPic->getCbAddr(lineStartCUAddr), reconPic->m_strideC, reconPic->m_picWidth >> m_hChromaShift, realH >> m_vChromaShift, reconPic->m_chromaMarginX);
 167     primitives.extendRowBorder(reconPic->getCrAddr(lineStartCUAddr), reconPic->m_strideC, reconPic->m_picWidth >> m_hChromaShift, realH >> m_vChromaShift, reconPic->m_chromaMarginX);
 168
 169     // Border extend Top
 170     if (!row)
 171     {
 172         const intptr_t stride = reconPic->m_stride;
 173         const intptr_t strideC = reconPic->m_strideC;
 174         pixel *pixY = reconPic->getLumaAddr(lineStartCUAddr) - reconPic->m_lumaMarginX;
 175         pixel *pixU = reconPic->getCbAddr(lineStartCUAddr) - reconPic->m_chromaMarginX;
 176         pixel *pixV = reconPic->getCrAddr(lineStartCUAddr) - reconPic->m_chromaMarginX;
 177
 178         for (uint32_t y = 0; y < reconPic->m_lumaMarginY; y++)
 179             memcpy(pixY - (y + 1) * stride, pixY, stride * sizeof(pixel));
 180
 181         for (uint32_t y = 0; y < reconPic->m_chromaMarginY; y++)
 182         {
 183             memcpy(pixU - (y + 1) * strideC, pixU, strideC * sizeof(pixel));
 184             memcpy(pixV - (y + 1) * strideC, pixV, strideC * sizeof(pixel));
 185         }
 186     }
 187
 188     // Border extend Bottom
 189     if (row == m_numRows - 1)
 190     {
 191         const intptr_t stride = reconPic->m_stride;
 192         const intptr_t strideC = reconPic->m_strideC;
 193         pixel *pixY = reconPic->getLumaAddr(lineStartCUAddr) - reconPic->m_lumaMarginX + (realH - 1) * stride;
 194         pixel *pixU = reconPic->getCbAddr(lineStartCUAddr) - reconPic->m_chromaMarginX + ((realH >> m_vChromaShift) - 1) * strideC;
 195         pixel *pixV = reconPic->getCrAddr(lineStartCUAddr) - reconPic->m_chromaMarginX + ((realH >> m_vChromaShift) - 1) * strideC;
 196         for (uint32_t y = 0; y < reconPic->m_lumaMarginY; y++)
 197             memcpy(pixY + (y + 1) * stride, pixY, stride * sizeof(pixel));
 198
 199         for (uint32_t y = 0; y < reconPic->m_chromaMarginY; y++)
 200         {
 201             memcpy(pixU + (y + 1) * strideC, pixU, strideC * sizeof(pixel));
 202             memcpy(pixV + (y + 1) * strideC, pixV, strideC * sizeof(pixel));
 203         }
 204     }
 205
 206     // Notify other FrameEncoders that this row of reconstructed pixels is available
 207     m_frame->m_reconRowCount.incr();
 208
 209     uint32_t cuAddr = lineStartCUAddr;
 210     if (m_param->bEnablePsnr)
 211     {
 212         PicYuv* origPic = m_frame->m_origPicYuv;
 213
 214         intptr_t stride = reconPic->m_stride;
 215         uint32_t width  = reconPic->m_picWidth - m_pad[0];
 216         uint32_t height = getCUHeight(row);
 217
 218         uint64_t ssdY = computeSSD(origPic->getLumaAddr(cuAddr), reconPic->getLumaAddr(cuAddr), stride, width, height);
 219         height >>= m_vChromaShift;
 220         width  >>= m_hChromaShift;
 221         stride = reconPic->m_strideC;
 222
 223         uint64_t ssdU = computeSSD(origPic->getCbAddr(cuAddr), reconPic->getCbAddr(cuAddr), stride, width, height);
 224         uint64_t ssdV = computeSSD(origPic->getCrAddr(cuAddr), reconPic->getCrAddr(cuAddr), stride, width, height);
 225
 226         m_frameEncoder->m_SSDY += ssdY;
 227         m_frameEncoder->m_SSDU += ssdU;
 228         m_frameEncoder->m_SSDV += ssdV;
 229     }
 230     if (m_param->bEnableSsim && m_ssimBuf)
 231     {
 232         pixel *rec = m_frame->m_reconPicYuv->m_picOrg[0];
 233         pixel *org = m_frame->m_origPicYuv->m_picOrg[0];
 234         intptr_t stride1 = m_frame->m_origPicYuv->m_stride;
 235         intptr_t stride2 = m_frame->m_reconPicYuv->m_stride;
 236         uint32_t bEnd = ((row + 1) == (this->m_numRows - 1));
 237         uint32_t bStart = (row == 0);
 238         uint32_t minPixY = row * g_maxCUSize - 4 * !bStart;
 239         uint32_t maxPixY = (row + 1) * g_maxCUSize - 4 * !bEnd;
 240         uint32_t ssim_cnt;
 241         x265_emms();
 242
 243         /* SSIM is done for each row in blocks of 4x4 . The First blocks are offset by 2 pixels to the right
 244         * to avoid alignment of ssim blocks with DCT blocks. */
 245         minPixY += bStart ? 2 : -6;
 246         m_frameEncoder->m_ssim += calculateSSIM(rec + 2 + minPixY * stride1, stride1, org + 2 + minPixY * stride2, stride2,
 247                                                 m_param->sourceWidth - 2, maxPixY - minPixY, m_ssimBuf, ssim_cnt);
 248         m_frameEncoder->m_ssimCnt += ssim_cnt;
 249     }
 250     if (m_param->decodedPictureHashSEI == 1)
 251     {
 252         uint32_t height = getCUHeight(row);
 253         uint32_t width = reconPic->m_picWidth;
 254         intptr_t stride = reconPic->m_stride;
 255
 256         if (!row)
 257         {
 258             for (int i = 0; i < 3; i++)
 259                 MD5Init(&m_frameEncoder->m_state[i]);
 260         }
 261
 262         updateMD5Plane(m_frameEncoder->m_state[0], reconPic->getLumaAddr(cuAddr), width, height, stride);
 263         width  >>= m_hChromaShift;
 264         height >>= m_vChromaShift;
 265         stride = reconPic->m_strideC;
 266
 267         updateMD5Plane(m_frameEncoder->m_state[1], reconPic->getCbAddr(cuAddr), width, height, stride);
 268         updateMD5Plane(m_frameEncoder->m_state[2], reconPic->getCrAddr(cuAddr), width, height, stride);
 269     }
 270     else if (m_param->decodedPictureHashSEI == 2)
 271     {
 272         uint32_t height = getCUHeight(row);
 273         uint32_t width = reconPic->m_picWidth;
 274         intptr_t stride = reconPic->m_stride;
 275         if (!row)
 276             m_frameEncoder->m_crc[0] = m_frameEncoder->m_crc[1] = m_frameEncoder->m_crc[2] = 0xffff;
 277         updateCRC(reconPic->getLumaAddr(cuAddr), m_frameEncoder->m_crc[0], height, width, stride);
 278         width  >>= m_hChromaShift;
 279         height >>= m_vChromaShift;
 280         stride = reconPic->m_strideC;
 281
 282         updateCRC(reconPic->getCbAddr(cuAddr), m_frameEncoder->m_crc[1], height, width, stride);
 283         updateCRC(reconPic->getCrAddr(cuAddr), m_frameEncoder->m_crc[2], height, width, stride);
 284     }
 285     else if (m_param->decodedPictureHashSEI == 3)
 286     {
 287         uint32_t width = reconPic->m_picWidth;
 288         uint32_t height = getCUHeight(row);
 289         intptr_t stride = reconPic->m_stride;
 290         uint32_t cuHeight = g_maxCUSize;
 291         if (!row)
 292             m_frameEncoder->m_checksum[0] = m_frameEncoder->m_checksum[1] = m_frameEncoder->m_checksum[2] = 0;
 293         updateChecksum(reconPic->m_picOrg[0], m_frameEncoder->m_checksum[0], height, width, stride, row, cuHeight);
 294         width  >>= m_hChromaShift;
 295         height >>= m_vChromaShift;
 296         stride = reconPic->m_strideC;
 297         cuHeight >>= m_vChromaShift;
 298
 299         updateChecksum(reconPic->m_picOrg[1], m_frameEncoder->m_checksum[1], height, width, stride, row, cuHeight);
 300         updateChecksum(reconPic->m_picOrg[2], m_frameEncoder->m_checksum[2], height, width, stride, row, cuHeight);
 301     }
 302 }
 303
 304 static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height)
 305 {
 306     uint64_t ssd = 0;
 307
 308     if ((width | height) & 3)
 309     {
 310         /* Slow Path */
 311         for (uint32_t y = 0; y < height; y++)
 312         {
 313             for (uint32_t x = 0; x < width; x++)
 314             {
 315                 int diff = (int)(fenc[x] - rec[x]);
 316                 ssd += diff * diff;
 317             }
 318
 319             fenc += stride;
 320             rec += stride;
 321         }
 322
 323         return ssd;
 324     }
 325
 326     uint32_t y = 0;
 327     /* Consume Y in chunks of 64 */
 328     for (; y + 64 <= height; y += 64)
 329     {
 330         uint32_t x = 0;
 331
 332         if (!(stride & 31))
 333             for (; x + 64 <= width; x += 64)
 334                 ssd += primitives.sse_pp[LUMA_64x64](fenc + x, stride, rec + x, stride);
 335
 336         if (!(stride & 15))
 337             for (; x + 16 <= width; x += 16)
 338                 ssd += primitives.sse_pp[LUMA_16x64](fenc + x, stride, rec + x, stride);
 339
 340         for (; x + 4 <= width; x += 4)
 341         {
 342             ssd += primitives.sse_pp[LUMA_4x16](fenc + x, stride, rec + x, stride);
 343             ssd += primitives.sse_pp[LUMA_4x16](fenc + x + 16 * stride, stride, rec + x + 16 * stride, stride);
 344             ssd += primitives.sse_pp[LUMA_4x16](fenc + x + 32 * stride, stride, rec + x + 32 * stride, stride);
 345             ssd += primitives.sse_pp[LUMA_4x16](fenc + x + 48 * stride, stride, rec + x + 48 * stride, stride);
 346         }
 347
 348         fenc += stride * 64;
 349         rec += stride * 64;
 350     }
 351
 352     /* Consume Y in chunks of 16 */
 353     for (; y + 16 <= height; y += 16)
 354     {
 355         uint32_t x = 0;
 356
 357         if (!(stride & 31))
 358             for (; x + 64 <= width; x += 64)
 359                 ssd += primitives.sse_pp[LUMA_64x16](fenc + x, stride, rec + x, stride);
 360
 361         if (!(stride & 15))
 362             for (; x + 16 <= width; x += 16)
 363                 ssd += primitives.sse_pp[LUMA_16x16](fenc + x, stride, rec + x, stride);
 364
 365         for (; x + 4 <= width; x += 4)
 366             ssd += primitives.sse_pp[LUMA_4x16](fenc + x, stride, rec + x, stride);
 367
 368         fenc += stride * 16;
 369         rec += stride * 16;
 370     }
 371
 372     /* Consume Y in chunks of 4 */
 373     for (; y + 4 <= height; y += 4)
 374     {
 375         uint32_t x = 0;
 376
 377         if (!(stride & 15))
 378             for (; x + 16 <= width; x += 16)
 379                 ssd += primitives.sse_pp[LUMA_16x4](fenc + x, stride, rec + x, stride);
 380
 381         for (; x + 4 <= width; x += 4)
 382             ssd += primitives.sse_pp[LUMA_4x4](fenc + x, stride, rec + x, stride);
 383
 384         fenc += stride * 4;
 385         rec += stride * 4;
 386     }
 387
 388     return ssd;
 389 }
 390
 391 /* Function to calculate SSIM for each row */
 392 static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt)
 393 {
 394     uint32_t z = 0;
 395     float ssim = 0.0;
 396
 397     int(*sum0)[4] = (int(*)[4])buf;
 398     int(*sum1)[4] = sum0 + (width >> 2) + 3;
 399     width >>= 2;
 400     height >>= 2;
 401
 402     for (uint32_t y = 1; y < height; y++)
 403     {
 404         for (; z <= y; z++)
 405         {
 406             std::swap(sum0, sum1);
 407             for (uint32_t x = 0; x < width; x += 2)
 408                 primitives.ssim_4x4x2_core(&pix1[(4 * x + (z * stride1))], stride1, &pix2[(4 * x + (z * stride2))], stride2, &sum0[x]);
 409         }
 410
 411         for (uint32_t x = 0; x < width - 1; x += 4)
 412             ssim += primitives.ssim_end_4(sum0 + x, sum1 + x, X265_MIN(4, width - x - 1));
 413     }
 414
 415     cnt = (height - 1) * (width - 1);
 416     return ssim;
 417 }
 418
 419 /* restore original YUV samples to recon after SAO (if lossless) */
 420 static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absPartIdx, uint32_t depth)
 421 {
 422     uint32_t size = g_maxCUSize >> depth;
 423     int part = partitionFromSizes(size, size);
 424
 425     PicYuv* reconPic = frame.m_reconPicYuv;
 426     PicYuv* fencPic  = frame.m_origPicYuv;
 427
 428     pixel* dst = reconPic->getLumaAddr(cu->m_cuAddr, absPartIdx);
 429     pixel* src = fencPic->getLumaAddr(cu->m_cuAddr, absPartIdx);
 430
 431     primitives.luma_copy_pp[part](dst, reconPic->m_stride, src, fencPic->m_stride);
 432
 433     pixel* dstCb = reconPic->getCbAddr(cu->m_cuAddr, absPartIdx);
 434     pixel* srcCb = fencPic->getCbAddr(cu->m_cuAddr, absPartIdx);
 435
 436     pixel* dstCr = reconPic->getCrAddr(cu->m_cuAddr, absPartIdx);
 437     pixel* srcCr = fencPic->getCrAddr(cu->m_cuAddr, absPartIdx);
 438
 439     int csp = fencPic->m_picCsp;
 440     primitives.chroma[csp].copy_pp[part](dstCb, reconPic->m_strideC, srcCb, fencPic->m_strideC);
 441     primitives.chroma[csp].copy_pp[part](dstCr, reconPic->m_strideC, srcCr, fencPic->m_strideC);
 442 }
 443
 444 /* Original YUV restoration for CU in lossless coding */
 445 static void origCUSampleRestoration(const CUData* cu, Frame& frame, uint32_t absPartIdx, uint32_t depth)
 446 {
 447     if (cu->m_cuDepth[absPartIdx] > depth)
 448     {
 449         /* TODO: this could use cuGeom.numPartition and flags */
 450         uint32_t curNumParts = NUM_CU_PARTITIONS >> (depth << 1);
 451         uint32_t qNumParts   = curNumParts >> 2;
 452         uint32_t xmax = cu->m_slice->m_sps->picWidthInLumaSamples  - cu->m_cuPelX;
 453         uint32_t ymax = cu->m_slice->m_sps->picHeightInLumaSamples - cu->m_cuPelY;
 454
 455         /* process four split sub-cu at next depth */
 456         for (int subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdx += qNumParts)
 457         {
 458             if (g_zscanToPelX[absPartIdx] < xmax && g_zscanToPelY[absPartIdx] < ymax)
 459                 origCUSampleRestoration(cu, frame, absPartIdx, depth + 1);
 460         }
 461
 462         return;
 463     }
 464
 465     // restore original YUV samples
 466     if (cu->m_tqBypass[absPartIdx])
 467         restoreOrigLosslessYuv(cu, frame, absPartIdx, depth);
 468 }
 469
 470 void FrameFilter::processSao(int row)
 471 {
 472     SAOParam* saoParam = m_frame->m_encData->m_saoParam;
 473
 474     if (saoParam->bSaoFlag[0])
 475         m_sao.processSaoUnitRow(saoParam->ctuParam[0], row, 0);
 476
 477     if (saoParam->bSaoFlag[1])
 478     {
 479         m_sao.processSaoUnitRow(saoParam->ctuParam[1], row, 1);
 480         m_sao.processSaoUnitRow(saoParam->ctuParam[2], row, 2);
 481     }
 482
 483     if (m_frame->m_encData->m_slice->m_pps->bTransquantBypassEnabled)
 484     {
 485         uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth;
 486         uint32_t lineStartCUAddr = row * numCols;
 487
 488         for (uint32_t col = 0; col < numCols; col++)
 489             origCUSampleRestoration(m_frame->m_encData->getPicCTU(lineStartCUAddr + col), *m_frame, 0, 0);
 490     }
 491 }