[deb_x265.git] / source / encoder / framefilter.cpp

/*****************************************************************************
 * Copyright (C) 2013 x265 project
 *
 * Authors: Chung Shin Yee <shinyee@multicorewareinc.com>
 *          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

#include "common.h"
#include "frame.h"
#include "framedata.h"
#include "encoder.h"
#include "framefilter.h"
#include "frameencoder.h"
#include "wavefront.h"

using namespace x265;

/* File-local helpers used by FrameFilter::processRowPost(); both are defined
 * further down in this file.
 *
 * computeSSD    - sum of squared differences between two sample planes that
 *                 are walked with the same stride.
 * calculateSSIM - accumulated SSIM over a region of two planes; the number of
 *                 contributing terms is returned through cnt. */
static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height);
static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt);

/* Construct an unbound filter. The pointer members listed here start out NULL:
 * init() later sets m_param and m_frameEncoder (and m_ssimBuf when SSIM is
 * enabled), and start() sets m_frame. */
FrameFilter::FrameFilter()
    : m_param(NULL)
    , m_frame(NULL)
    , m_frameEncoder(NULL)
    , m_ssimBuf(NULL)
{
}

/* Release what init() acquired: the SAO state (when SAO is enabled) and the
 * SSIM scratch buffer.
 *
 * m_param is NULL until init() has run, so it is checked before being
 * dereferenced; this makes destroy() safe on an instance that was never
 * initialised. */
void FrameFilter::destroy()
{
    if (m_param && m_param->bEnableSAO)
        m_sao.destroy();

    X265_FREE(m_ssimBuf);
    m_ssimBuf = NULL; // do not leave a dangling pointer behind for a later free or use
}

/* Bind this filter to its encoder and frame encoder and derive the per-stream
 * constants used by the row functions.
 *
 *   top     - encoder; supplies the parameter set and the SPS conformance window
 *   frame   - frame encoder that owns this filter
 *   numRows - number of CTU rows per picture
 *
 * If the SAO object cannot be created, SAO is switched off in the parameter
 * set. The SSIM scratch buffer is only allocated when SSIM is enabled. */
void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows)
{
    m_param        = top->m_param;
    m_frameEncoder = frame;
    m_numRows      = numRows;

    m_hChromaShift = CHROMA_H_SHIFT(m_param->internalCsp);
    m_vChromaShift = CHROMA_V_SHIFT(m_param->internalCsp);

    m_pad[0] = top->m_sps.conformanceWindow.rightOffset;
    m_pad[1] = top->m_sps.conformanceWindow.bottomOffset;

    /* SAO runs one row behind when the deblocking filter is enabled */
    if (m_param->bEnableLoopFilter)
        m_saoRowDelay = 1;
    else
        m_saoRowDelay = 0;

    /* height of the last CTU row: the remainder, or a full CTU when the
     * picture height is an exact multiple of the CTU size */
    m_lastHeight = m_param->sourceHeight % g_maxCUSize;
    if (!m_lastHeight)
        m_lastHeight = g_maxCUSize;

    m_deblock.init();

    if (m_param->bEnableSAO && !m_sao.create(m_param))
        m_param->bEnableSAO = 0;

    if (m_param->bEnableSsim)
    {
        m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3));
    }
}

/* Begin filtering a new picture: remember the frame that the row functions
 * will operate on and, when SAO is enabled, forward the frame, the initial
 * entropy state and the QP to the SAO object. */
void FrameFilter::start(Frame *frame, Entropy& initState, int qp)
{
    m_frame = frame;

    if (m_param->bEnableSAO)
        m_sao.startSlice(frame, initState, qp);
}

void FrameFilter::processRow(int row)
{
    ProfileScopeEvent(filterCTURow);

    if (!m_param->bEnableLoopFilter && !m_param->bEnableSAO)
    {
        processRowPost(row);
        return;
    }
    FrameData& encData = *m_frame->m_encData;
    const uint32_t numCols = encData.m_slice->m_sps->numCuInWidth;
    const uint32_t lineStartCUAddr = row * numCols;

    if (m_param->bEnableLoopFilter)
    {
        for (uint32_t col = 0; col < numCols; col++)
        {
            uint32_t cuAddr = lineStartCUAddr + col;
            const CUData* ctu = encData.getPicCTU(cuAddr);

            m_deblock.deblockCTU(ctu, Deblock::EDGE_VER);

            if (col > 0)
            {
                const CUData* ctuPrev = encData.getPicCTU(cuAddr - 1);
                m_deblock.deblockCTU(ctuPrev, Deblock::EDGE_HOR);
            }
        }

        const CUData* ctuPrev = encData.getPicCTU(lineStartCUAddr + numCols - 1);
        m_deblock.deblockCTU(ctuPrev, Deblock::EDGE_HOR);
    }

    // SAO
    SAOParam* saoParam = encData.m_saoParam;
    if (m_param->bEnableSAO)
    {
        m_sao.m_entropyCoder.load(m_frameEncoder->m_initSliceContext);
        m_sao.m_rdContexts.next.load(m_frameEncoder->m_initSliceContext);
        m_sao.m_rdContexts.cur.load(m_frameEncoder->m_initSliceContext);

        m_sao.rdoSaoUnitRow(saoParam, row);

        // NOTE: Delay a row because SAO decide need top row pixels at next row, is it HM's bug?
        if (row >= m_saoRowDelay)
            processSao(row - m_saoRowDelay);
    }

    // this row of CTUs has been encoded

    if (row > 0)
        processRowPost(row - 1);

    if (row == m_numRows - 1)
    {
        if (m_param->bEnableSAO)
        {
            m_sao.rdoSaoUnitRowEnd(saoParam, encData.m_slice->m_sps->numCUsInFrame);

            for (int i = m_numRows - m_saoRowDelay; i < m_numRows; i++)
                processSao(i);
        }

        processRowPost(row);
    }
}

/* Height in luma rows of CTU row rowNum: m_lastHeight for the final row,
 * the full CTU size for every other row. */
uint32_t FrameFilter::getCUHeight(int rowNum) const
{
    if (rowNum != m_numRows - 1)
        return g_maxCUSize;

    return m_lastHeight;
}

void FrameFilter::processRowPost(int row)
{
    PicYuv *reconPic = m_frame->m_reconPic;
    const uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth;
    const uint32_t lineStartCUAddr = row * numCols;
    const int realH = getCUHeight(row);

    // Border extend Left and Right
    primitives.extendRowBorder(reconPic->getLumaAddr(lineStartCUAddr), reconPic->m_stride, reconPic->m_picWidth, realH, reconPic->m_lumaMarginX);
    primitives.extendRowBorder(reconPic->getCbAddr(lineStartCUAddr), reconPic->m_strideC, reconPic->m_picWidth >> m_hChromaShift, realH >> m_vChromaShift, reconPic->m_chromaMarginX);
    primitives.extendRowBorder(reconPic->getCrAddr(lineStartCUAddr), reconPic->m_strideC, reconPic->m_picWidth >> m_hChromaShift, realH >> m_vChromaShift, reconPic->m_chromaMarginX);

    // Border extend Top
    if (!row)
    {
        const intptr_t stride = reconPic->m_stride;
        const intptr_t strideC = reconPic->m_strideC;
        pixel *pixY = reconPic->getLumaAddr(lineStartCUAddr) - reconPic->m_lumaMarginX;
        pixel *pixU = reconPic->getCbAddr(lineStartCUAddr) - reconPic->m_chromaMarginX;
        pixel *pixV = reconPic->getCrAddr(lineStartCUAddr) - reconPic->m_chromaMarginX;

        for (uint32_t y = 0; y < reconPic->m_lumaMarginY; y++)
            memcpy(pixY - (y + 1) * stride, pixY, stride * sizeof(pixel));

        for (uint32_t y = 0; y < reconPic->m_chromaMarginY; y++)
        {
            memcpy(pixU - (y + 1) * strideC, pixU, strideC * sizeof(pixel));
            memcpy(pixV - (y + 1) * strideC, pixV, strideC * sizeof(pixel));
        }
    }

    // Border extend Bottom
    if (row == m_numRows - 1)
    {
        const intptr_t stride = reconPic->m_stride;
        const intptr_t strideC = reconPic->m_strideC;
        pixel *pixY = reconPic->getLumaAddr(lineStartCUAddr) - reconPic->m_lumaMarginX + (realH - 1) * stride;
        pixel *pixU = reconPic->getCbAddr(lineStartCUAddr) - reconPic->m_chromaMarginX + ((realH >> m_vChromaShift) - 1) * strideC;
        pixel *pixV = reconPic->getCrAddr(lineStartCUAddr) - reconPic->m_chromaMarginX + ((realH >> m_vChromaShift) - 1) * strideC;
        for (uint32_t y = 0; y < reconPic->m_lumaMarginY; y++)
            memcpy(pixY + (y + 1) * stride, pixY, stride * sizeof(pixel));

        for (uint32_t y = 0; y < reconPic->m_chromaMarginY; y++)
        {
            memcpy(pixU + (y + 1) * strideC, pixU, strideC * sizeof(pixel));
            memcpy(pixV + (y + 1) * strideC, pixV, strideC * sizeof(pixel));
        }
    }

    // Notify other FrameEncoders that this row of reconstructed pixels is available
    m_frame->m_reconRowCount.incr();

    uint32_t cuAddr = lineStartCUAddr;
    if (m_param->bEnablePsnr)
    {
        PicYuv* fencPic = m_frame->m_fencPic;

        intptr_t stride = reconPic->m_stride;
        uint32_t width  = reconPic->m_picWidth - m_pad[0];
        uint32_t height = getCUHeight(row);

        uint64_t ssdY = computeSSD(fencPic->getLumaAddr(cuAddr), reconPic->getLumaAddr(cuAddr), stride, width, height);
        height >>= m_vChromaShift;
        width  >>= m_hChromaShift;
        stride = reconPic->m_strideC;

        uint64_t ssdU = computeSSD(fencPic->getCbAddr(cuAddr), reconPic->getCbAddr(cuAddr), stride, width, height);
        uint64_t ssdV = computeSSD(fencPic->getCrAddr(cuAddr), reconPic->getCrAddr(cuAddr), stride, width, height);

        m_frameEncoder->m_SSDY += ssdY;
        m_frameEncoder->m_SSDU += ssdU;
        m_frameEncoder->m_SSDV += ssdV;
    }
    if (m_param->bEnableSsim && m_ssimBuf)
    {
        pixel *rec = m_frame->m_reconPic->m_picOrg[0];
        pixel *fenc = m_frame->m_fencPic->m_picOrg[0];
        intptr_t stride1 = m_frame->m_fencPic->m_stride;
        intptr_t stride2 = m_frame->m_reconPic->m_stride;
        uint32_t bEnd = ((row + 1) == (this->m_numRows - 1));
        uint32_t bStart = (row == 0);
        uint32_t minPixY = row * g_maxCUSize - 4 * !bStart;
        uint32_t maxPixY = (row + 1) * g_maxCUSize - 4 * !bEnd;
        uint32_t ssim_cnt;
        x265_emms();

        /* SSIM is done for each row in blocks of 4x4 . The First blocks are offset by 2 pixels to the right
        * to avoid alignment of ssim blocks with DCT blocks. */
        minPixY += bStart ? 2 : -6;
        m_frameEncoder->m_ssim += calculateSSIM(rec + 2 + minPixY * stride1, stride1, fenc + 2 + minPixY * stride2, stride2,
                                                m_param->sourceWidth - 2, maxPixY - minPixY, m_ssimBuf, ssim_cnt);
        m_frameEncoder->m_ssimCnt += ssim_cnt;
    }
    if (m_param->decodedPictureHashSEI == 1)
    {
        uint32_t height = getCUHeight(row);
        uint32_t width = reconPic->m_picWidth;
        intptr_t stride = reconPic->m_stride;

        if (!row)
        {
            for (int i = 0; i < 3; i++)
                MD5Init(&m_frameEncoder->m_state[i]);
        }

        updateMD5Plane(m_frameEncoder->m_state[0], reconPic->getLumaAddr(cuAddr), width, height, stride);
        width  >>= m_hChromaShift;
        height >>= m_vChromaShift;
        stride = reconPic->m_strideC;

        updateMD5Plane(m_frameEncoder->m_state[1], reconPic->getCbAddr(cuAddr), width, height, stride);
        updateMD5Plane(m_frameEncoder->m_state[2], reconPic->getCrAddr(cuAddr), width, height, stride);
    }
    else if (m_param->decodedPictureHashSEI == 2)
    {
        uint32_t height = getCUHeight(row);
        uint32_t width = reconPic->m_picWidth;
        intptr_t stride = reconPic->m_stride;
        if (!row)
            m_frameEncoder->m_crc[0] = m_frameEncoder->m_crc[1] = m_frameEncoder->m_crc[2] = 0xffff;
        updateCRC(reconPic->getLumaAddr(cuAddr), m_frameEncoder->m_crc[0], height, width, stride);
        width  >>= m_hChromaShift;
        height >>= m_vChromaShift;
        stride = reconPic->m_strideC;

        updateCRC(reconPic->getCbAddr(cuAddr), m_frameEncoder->m_crc[1], height, width, stride);
        updateCRC(reconPic->getCrAddr(cuAddr), m_frameEncoder->m_crc[2], height, width, stride);
    }
    else if (m_param->decodedPictureHashSEI == 3)
    {
        uint32_t width = reconPic->m_picWidth;
        uint32_t height = getCUHeight(row);
        intptr_t stride = reconPic->m_stride;
        uint32_t cuHeight = g_maxCUSize;
        if (!row)
            m_frameEncoder->m_checksum[0] = m_frameEncoder->m_checksum[1] = m_frameEncoder->m_checksum[2] = 0;
        updateChecksum(reconPic->m_picOrg[0], m_frameEncoder->m_checksum[0], height, width, stride, row, cuHeight);
        width  >>= m_hChromaShift;
        height >>= m_vChromaShift;
        stride = reconPic->m_strideC;
        cuHeight >>= m_vChromaShift;

        updateChecksum(reconPic->m_picOrg[1], m_frameEncoder->m_checksum[1], height, width, stride, row, cuHeight);
        updateChecksum(reconPic->m_picOrg[2], m_frameEncoder->m_checksum[2], height, width, stride, row, cuHeight);
    }
}

/* Sum of squared differences between two sample planes.
 *
 *   fenc, rec - top-left samples of the two regions; both are walked with the
 *               same stride
 *   stride    - distance in samples between consecutive lines
 *   width     - region width in samples
 *   height    - region height in lines
 *
 * Returns the accumulated SSD. When width and height are both multiples of 4
 * the region is tiled with the sse_pp primitives, largest tiles first;
 * otherwise a plain per-sample loop is used. */
static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height)
{
    uint64_t ssd = 0;

    if ((width | height) & 3)
    {
        /* Slow Path */
        for (uint32_t y = 0; y < height; y++)
        {
            for (uint32_t x = 0; x < width; x++)
            {
                /* widen before squaring so the product cannot overflow a
                 * 32-bit int, whatever the sample range */
                const int64_t diff = (int64_t)fenc[x] - (int64_t)rec[x];
                ssd += (uint64_t)(diff * diff);
            }

            fenc += stride;
            rec += stride;
        }

        return ssd;
    }

    uint32_t y = 0;
    /* Consume Y in chunks of 64 */
    for (; y + 64 <= height; y += 64)
    {
        uint32_t x = 0;

        /* 64-wide tiles are only used when the stride is a multiple of 32 */
        if (!(stride & 31))
            for (; x + 64 <= width; x += 64)
                ssd += primitives.sse_pp[LUMA_64x64](fenc + x, stride, rec + x, stride);

        /* 16-wide tiles are only used when the stride is a multiple of 16 */
        if (!(stride & 15))
            for (; x + 16 <= width; x += 16)
                ssd += primitives.sse_pp[LUMA_16x64](fenc + x, stride, rec + x, stride);

        /* remaining columns: four stacked 4x16 tiles cover the 64 lines */
        for (; x + 4 <= width; x += 4)
        {
            ssd += primitives.sse_pp[LUMA_4x16](fenc + x, stride, rec + x, stride);
            ssd += primitives.sse_pp[LUMA_4x16](fenc + x + 16 * stride, stride, rec + x + 16 * stride, stride);
            ssd += primitives.sse_pp[LUMA_4x16](fenc + x + 32 * stride, stride, rec + x + 32 * stride, stride);
            ssd += primitives.sse_pp[LUMA_4x16](fenc + x + 48 * stride, stride, rec + x + 48 * stride, stride);
        }

        fenc += stride * 64;
        rec += stride * 64;
    }

    /* Consume Y in chunks of 16 */
    for (; y + 16 <= height; y += 16)
    {
        uint32_t x = 0;

        if (!(stride & 31))
            for (; x + 64 <= width; x += 64)
                ssd += primitives.sse_pp[LUMA_64x16](fenc + x, stride, rec + x, stride);

        if (!(stride & 15))
            for (; x + 16 <= width; x += 16)
                ssd += primitives.sse_pp[LUMA_16x16](fenc + x, stride, rec + x, stride);

        for (; x + 4 <= width; x += 4)
            ssd += primitives.sse_pp[LUMA_4x16](fenc + x, stride, rec + x, stride);

        fenc += stride * 16;
        rec += stride * 16;
    }

    /* Consume Y in chunks of 4 */
    for (; y + 4 <= height; y += 4)
    {
        uint32_t x = 0;

        if (!(stride & 15))
            for (; x + 16 <= width; x += 16)
                ssd += primitives.sse_pp[LUMA_16x4](fenc + x, stride, rec + x, stride);

        for (; x + 4 <= width; x += 4)
            ssd += primitives.sse_pp[LUMA_4x4](fenc + x, stride, rec + x, stride);

        fenc += stride * 4;
        rec += stride * 4;
    }

    return ssd;
}

/* Function to calculate SSIM for each row
 *
 *   pix1, stride1 - first plane: top-left sample of the region and its stride
 *   pix2, stride2 - second plane: top-left sample of the region and its stride
 *   width, height - region size in samples; it is processed as a grid of
 *                   4x4 blocks (width / 4 by height / 4)
 *   buf           - scratch space for two rows of per-block sums; the second
 *                   row starts (width >> 2) + 3 entries after the first
 *   cnt           - out: number of SSIM terms accumulated into the result
 *
 * Returns the sum of the SSIM terms (not their average). A region with fewer
 * than two block columns or two block rows contributes nothing: 0 is
 * returned and cnt is set to 0. */
static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt)
{
    uint32_t z = 0;
    float ssim = 0.0f;

    int(*sum0)[4] = (int(*)[4])buf;
    int(*sum1)[4] = sum0 + (width >> 2) + 3;

    /* from here on width and height count 4x4 blocks, not samples */
    width >>= 2;
    height >>= 2;

    /* width - 1 and height - 1 below are unsigned; bail out before they can
     * wrap around when the region is too small to hold a pair of blocks */
    if (width < 2 || height < 2)
    {
        cnt = 0;
        return 0.0f;
    }

    for (uint32_t y = 1; y < height; y++)
    {
        /* z is kept across iterations of y, so every block row is summed
         * exactly once; sum0 holds the newest row, sum1 the one before it */
        for (; z <= y; z++)
        {
            std::swap(sum0, sum1);

            /* block row z starts 4 * z lines below the region's first line and
             * block column x starts 4 * x samples to the right */
            for (uint32_t x = 0; x < width; x += 2)
                primitives.ssim_4x4x2_core(&pix1[4 * (x + (z * stride1))], stride1, &pix2[4 * (x + (z * stride2))], stride2, &sum0[x]);
        }

        /* combine the two newest block rows, up to four columns per call */
        for (uint32_t x = 0; x < width - 1; x += 4)
            ssim += primitives.ssim_end_4(sum0 + x, sum1 + x, X265_MIN(4, width - x - 1));
    }

    cnt = (height - 1) * (width - 1);
    return ssim;
}

/* restore original YUV samples to recon after SAO (if lossless) */
static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absPartIdx, uint32_t depth)
{
    uint32_t size = g_maxCUSize >> depth;
    int part = partitionFromSizes(size, size);

    PicYuv* reconPic = frame.m_reconPic;
    PicYuv* fencPic  = frame.m_fencPic;

    pixel* dst = reconPic->getLumaAddr(cu->m_cuAddr, absPartIdx);
    pixel* src = fencPic->getLumaAddr(cu->m_cuAddr, absPartIdx);

    primitives.luma_copy_pp[part](dst, reconPic->m_stride, src, fencPic->m_stride);
   
    pixel* dstCb = reconPic->getCbAddr(cu->m_cuAddr, absPartIdx);
    pixel* srcCb = fencPic->getCbAddr(cu->m_cuAddr, absPartIdx);

    pixel* dstCr = reconPic->getCrAddr(cu->m_cuAddr, absPartIdx);
    pixel* srcCr = fencPic->getCrAddr(cu->m_cuAddr, absPartIdx);

    int csp = fencPic->m_picCsp;
    primitives.chroma[csp].copy_pp[part](dstCb, reconPic->m_strideC, srcCb, fencPic->m_strideC);
    primitives.chroma[csp].copy_pp[part](dstCr, reconPic->m_strideC, srcCr, fencPic->m_strideC);
}

/* Original YUV restoration for CU in lossless coding
 *
 * Walks the CU quad-tree of a CTU. A CU that is not split any further at
 * this depth has its source samples restored when its transquant-bypass flag
 * is set; a split CU recurses into those of its four children whose origin
 * lies inside the picture. */
static void origCUSampleRestoration(const CUData* cu, Frame& frame, uint32_t absPartIdx, uint32_t depth)
{
    if (cu->m_cuDepth[absPartIdx] <= depth)
    {
        // leaf CU: restore original YUV samples
        if (cu->m_tqBypass[absPartIdx])
            restoreOrigLosslessYuv(cu, frame, absPartIdx, depth);

        return;
    }

    /* TODO: this could use cuGeom.numPartition and flags */
    const uint32_t partsAtDepth = NUM_CU_PARTITIONS >> (depth << 1);
    const uint32_t partsPerChild = partsAtDepth >> 2;

    /* distance from the CTU origin to the picture's right and bottom edges */
    const uint32_t widthLeft  = cu->m_slice->m_sps->picWidthInLumaSamples  - cu->m_cuPelX;
    const uint32_t heightLeft = cu->m_slice->m_sps->picHeightInLumaSamples - cu->m_cuPelY;

    /* process four split sub-cu at next depth */
    for (int child = 0; child < 4; child++, absPartIdx += partsPerChild)
    {
        const bool bInsidePicture = g_zscanToPelX[absPartIdx] < widthLeft && g_zscanToPelY[absPartIdx] < heightLeft;

        if (bInsidePicture)
            origCUSampleRestoration(cu, frame, absPartIdx, depth + 1);
    }
}

void FrameFilter::processSao(int row)
{
    SAOParam* saoParam = m_frame->m_encData->m_saoParam;

    if (saoParam->bSaoFlag[0])
        m_sao.processSaoUnitRow(saoParam->ctuParam[0], row, 0);

    if (saoParam->bSaoFlag[1])
    {
        m_sao.processSaoUnitRow(saoParam->ctuParam[1], row, 1);
        m_sao.processSaoUnitRow(saoParam->ctuParam[2], row, 2);
    }

    if (m_frame->m_encData->m_slice->m_pps->bTransquantBypassEnabled)
    {
        uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth;
        uint32_t lineStartCUAddr = row * numCols;

        for (uint32_t col = 0; col < numCols; col++)
            origCUSampleRestoration(m_frame->m_encData->getPicCTU(lineStartCUAddr + col), *m_frame, 0, 0);
    }
}
Commit	Line	Data
	1	/*****************************************************************************
	2	* Copyright (C) 2013 x265 project
	3	*
	4	* Authors: Chung Shin Yee <shinyee@multicorewareinc.com>
	5	* Min Chen <chenm003@163.com>
	6	*
	7	* This program is free software; you can redistribute it and/or modify
	8	* it under the terms of the GNU General Public License as published by
	9	* the Free Software Foundation; either version 2 of the License, or
	10	* (at your option) any later version.
	11	*
	12	* This program is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	* GNU General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU General Public License
	18	* along with this program; if not, write to the Free Software
	19	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
	20	*
	21	* This program is also available under a commercial proprietary license.
	22	* For more information, contact us at license @ x265.com.
	23	*****************************************************************************/
	24
	25	#include "common.h"
	26	#include "frame.h"
	27	#include "framedata.h"
	28	#include "encoder.h"
	29	#include "framefilter.h"
	30	#include "frameencoder.h"
	31	#include "wavefront.h"
	32
	33	using namespace x265;
	34
	35	static uint64_t computeSSD(pixel fenc, pixel rec, intptr_t stride, uint32_t width, uint32_t height);
	36	static float calculateSSIM(pixel pix1, intptr_t stride1, pixel pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt);
	37
	38	FrameFilter::FrameFilter()
	39	: m_param(NULL)
	40	, m_frame(NULL)
	41	, m_frameEncoder(NULL)
	42	, m_ssimBuf(NULL)
	43	{
	44	}
	45
	46	void FrameFilter::destroy()
	47	{
	48	if (m_param->bEnableSAO)
	49	m_sao.destroy();
	50
	51	X265_FREE(m_ssimBuf);
	52	}
	53
	54	void FrameFilter::init(Encoder top, FrameEncoder frame, int numRows)
	55	{
	56	m_param = top->m_param;
	57	m_frameEncoder = frame;
	58	m_numRows = numRows;
	59	m_hChromaShift = CHROMA_H_SHIFT(m_param->internalCsp);
	60	m_vChromaShift = CHROMA_V_SHIFT(m_param->internalCsp);
	61	m_pad[0] = top->m_sps.conformanceWindow.rightOffset;
	62	m_pad[1] = top->m_sps.conformanceWindow.bottomOffset;
	63	m_saoRowDelay = m_param->bEnableLoopFilter ? 1 : 0;
	64	m_lastHeight = m_param->sourceHeight % g_maxCUSize ? m_param->sourceHeight % g_maxCUSize : g_maxCUSize;
	65
	66	m_deblock.init();
	67
	68	if (m_param->bEnableSAO)
	69	if (!m_sao.create(m_param))
	70	m_param->bEnableSAO = 0;
	71
	72	if (m_param->bEnableSsim)
	73	m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3));
	74	}
	75
	76	void FrameFilter::start(Frame *frame, Entropy& initState, int qp)
	77	{
	78	m_frame = frame;
	79
	80	if (m_param->bEnableSAO)
	81	m_sao.startSlice(frame, initState, qp);
	82	}
	83
	84	void FrameFilter::processRow(int row)
	85	{
	86	ProfileScopeEvent(filterCTURow);
	87
	88	if (!m_param->bEnableLoopFilter && !m_param->bEnableSAO)
	89	{
	90	processRowPost(row);
	91	return;
	92	}
	93	FrameData& encData = *m_frame->m_encData;
	94	const uint32_t numCols = encData.m_slice->m_sps->numCuInWidth;
	95	const uint32_t lineStartCUAddr = row * numCols;
	96
	97	if (m_param->bEnableLoopFilter)
	98	{
	99	for (uint32_t col = 0; col < numCols; col++)
	100	{
	101	uint32_t cuAddr = lineStartCUAddr + col;
	102	const CUData* ctu = encData.getPicCTU(cuAddr);
	103
	104	m_deblock.deblockCTU(ctu, Deblock::EDGE_VER);
	105
	106	if (col > 0)
	107	{
	108	const CUData* ctuPrev = encData.getPicCTU(cuAddr - 1);
	109	m_deblock.deblockCTU(ctuPrev, Deblock::EDGE_HOR);
	110	}
	111	}
	112
	113	const CUData* ctuPrev = encData.getPicCTU(lineStartCUAddr + numCols - 1);
	114	m_deblock.deblockCTU(ctuPrev, Deblock::EDGE_HOR);
	115	}
	116
	117	// SAO
	118	SAOParam* saoParam = encData.m_saoParam;
	119	if (m_param->bEnableSAO)
	120	{
	121	m_sao.m_entropyCoder.load(m_frameEncoder->m_initSliceContext);
	122	m_sao.m_rdContexts.next.load(m_frameEncoder->m_initSliceContext);
	123	m_sao.m_rdContexts.cur.load(m_frameEncoder->m_initSliceContext);
	124
	125	m_sao.rdoSaoUnitRow(saoParam, row);
	126
	127	// NOTE: Delay a row because SAO decide need top row pixels at next row, is it HM's bug?
	128	if (row >= m_saoRowDelay)
	129	processSao(row - m_saoRowDelay);
	130	}
	131
	132	// this row of CTUs has been encoded
	133
	134	if (row > 0)
	135	processRowPost(row - 1);
	136
	137	if (row == m_numRows - 1)
	138	{
	139	if (m_param->bEnableSAO)
	140	{
	141	m_sao.rdoSaoUnitRowEnd(saoParam, encData.m_slice->m_sps->numCUsInFrame);
	142
	143	for (int i = m_numRows - m_saoRowDelay; i < m_numRows; i++)
	144	processSao(i);
	145	}
	146
	147	processRowPost(row);
	148	}
	149	}
	150
	151	uint32_t FrameFilter::getCUHeight(int rowNum) const
	152	{
	153	return rowNum == m_numRows - 1 ? m_lastHeight : g_maxCUSize;
	154	}
	155
	156	void FrameFilter::processRowPost(int row)
	157	{
	158	PicYuv *reconPic = m_frame->m_reconPic;
	159	const uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth;
	160	const uint32_t lineStartCUAddr = row * numCols;
	161	const int realH = getCUHeight(row);
	162
	163	// Border extend Left and Right
	164	primitives.extendRowBorder(reconPic->getLumaAddr(lineStartCUAddr), reconPic->m_stride, reconPic->m_picWidth, realH, reconPic->m_lumaMarginX);
	165	primitives.extendRowBorder(reconPic->getCbAddr(lineStartCUAddr), reconPic->m_strideC, reconPic->m_picWidth >> m_hChromaShift, realH >> m_vChromaShift, reconPic->m_chromaMarginX);
	166	primitives.extendRowBorder(reconPic->getCrAddr(lineStartCUAddr), reconPic->m_strideC, reconPic->m_picWidth >> m_hChromaShift, realH >> m_vChromaShift, reconPic->m_chromaMarginX);
	167
	168	// Border extend Top
	169	if (!row)
	170	{
	171	const intptr_t stride = reconPic->m_stride;
	172	const intptr_t strideC = reconPic->m_strideC;
	173	pixel *pixY = reconPic->getLumaAddr(lineStartCUAddr) - reconPic->m_lumaMarginX;
	174	pixel *pixU = reconPic->getCbAddr(lineStartCUAddr) - reconPic->m_chromaMarginX;
	175	pixel *pixV = reconPic->getCrAddr(lineStartCUAddr) - reconPic->m_chromaMarginX;
	176
	177	for (uint32_t y = 0; y < reconPic->m_lumaMarginY; y++)
	178	memcpy(pixY - (y + 1) * stride, pixY, stride * sizeof(pixel));
	179
	180	for (uint32_t y = 0; y < reconPic->m_chromaMarginY; y++)
	181	{
	182	memcpy(pixU - (y + 1) * strideC, pixU, strideC * sizeof(pixel));
	183	memcpy(pixV - (y + 1) * strideC, pixV, strideC * sizeof(pixel));
	184	}
	185	}
	186
	187	// Border extend Bottom
	188	if (row == m_numRows - 1)
	189	{
	190	const intptr_t stride = reconPic->m_stride;
	191	const intptr_t strideC = reconPic->m_strideC;
	192	pixel pixY = reconPic->getLumaAddr(lineStartCUAddr) - reconPic->m_lumaMarginX + (realH - 1) stride;
	193	pixel pixU = reconPic->getCbAddr(lineStartCUAddr) - reconPic->m_chromaMarginX + ((realH >> m_vChromaShift) - 1) strideC;
	194	pixel pixV = reconPic->getCrAddr(lineStartCUAddr) - reconPic->m_chromaMarginX + ((realH >> m_vChromaShift) - 1) strideC;
	195	for (uint32_t y = 0; y < reconPic->m_lumaMarginY; y++)
	196	memcpy(pixY + (y + 1) * stride, pixY, stride * sizeof(pixel));
	197
	198	for (uint32_t y = 0; y < reconPic->m_chromaMarginY; y++)
	199	{
	200	memcpy(pixU + (y + 1) * strideC, pixU, strideC * sizeof(pixel));
	201	memcpy(pixV + (y + 1) * strideC, pixV, strideC * sizeof(pixel));
	202	}
	203	}
	204
	205	// Notify other FrameEncoders that this row of reconstructed pixels is available
	206	m_frame->m_reconRowCount.incr();
	207
	208	uint32_t cuAddr = lineStartCUAddr;
	209	if (m_param->bEnablePsnr)
	210	{
	211	PicYuv* fencPic = m_frame->m_fencPic;
	212
	213	intptr_t stride = reconPic->m_stride;
	214	uint32_t width = reconPic->m_picWidth - m_pad[0];
	215	uint32_t height = getCUHeight(row);
	216
	217	uint64_t ssdY = computeSSD(fencPic->getLumaAddr(cuAddr), reconPic->getLumaAddr(cuAddr), stride, width, height);
	218	height >>= m_vChromaShift;
	219	width >>= m_hChromaShift;
	220	stride = reconPic->m_strideC;
	221
	222	uint64_t ssdU = computeSSD(fencPic->getCbAddr(cuAddr), reconPic->getCbAddr(cuAddr), stride, width, height);
	223	uint64_t ssdV = computeSSD(fencPic->getCrAddr(cuAddr), reconPic->getCrAddr(cuAddr), stride, width, height);
	224
	225	m_frameEncoder->m_SSDY += ssdY;
	226	m_frameEncoder->m_SSDU += ssdU;
	227	m_frameEncoder->m_SSDV += ssdV;
	228	}
	229	if (m_param->bEnableSsim && m_ssimBuf)
	230	{
	231	pixel *rec = m_frame->m_reconPic->m_picOrg[0];
	232	pixel *fenc = m_frame->m_fencPic->m_picOrg[0];
	233	intptr_t stride1 = m_frame->m_fencPic->m_stride;
	234	intptr_t stride2 = m_frame->m_reconPic->m_stride;
	235	uint32_t bEnd = ((row + 1) == (this->m_numRows - 1));
	236	uint32_t bStart = (row == 0);
	237	uint32_t minPixY = row * g_maxCUSize - 4 * !bStart;
	238	uint32_t maxPixY = (row + 1) * g_maxCUSize - 4 * !bEnd;
	239	uint32_t ssim_cnt;
	240	x265_emms();
	241
	242	/* SSIM is done for each row in blocks of 4x4 . The First blocks are offset by 2 pixels to the right
	243	* to avoid alignment of ssim blocks with DCT blocks. */
	244	minPixY += bStart ? 2 : -6;
	245	m_frameEncoder->m_ssim += calculateSSIM(rec + 2 + minPixY * stride1, stride1, fenc + 2 + minPixY * stride2, stride2,
	246	m_param->sourceWidth - 2, maxPixY - minPixY, m_ssimBuf, ssim_cnt);
	247	m_frameEncoder->m_ssimCnt += ssim_cnt;
	248	}
	249	if (m_param->decodedPictureHashSEI == 1)
	250	{
	251	uint32_t height = getCUHeight(row);
	252	uint32_t width = reconPic->m_picWidth;
	253	intptr_t stride = reconPic->m_stride;
	254
	255	if (!row)
	256	{
	257	for (int i = 0; i < 3; i++)
	258	MD5Init(&m_frameEncoder->m_state[i]);
	259	}
	260
	261	updateMD5Plane(m_frameEncoder->m_state[0], reconPic->getLumaAddr(cuAddr), width, height, stride);
	262	width >>= m_hChromaShift;
	263	height >>= m_vChromaShift;
	264	stride = reconPic->m_strideC;
	265
	266	updateMD5Plane(m_frameEncoder->m_state[1], reconPic->getCbAddr(cuAddr), width, height, stride);
	267	updateMD5Plane(m_frameEncoder->m_state[2], reconPic->getCrAddr(cuAddr), width, height, stride);
	268	}
	269	else if (m_param->decodedPictureHashSEI == 2)
	270	{
	271	uint32_t height = getCUHeight(row);
	272	uint32_t width = reconPic->m_picWidth;
	273	intptr_t stride = reconPic->m_stride;
	274	if (!row)
	275	m_frameEncoder->m_crc[0] = m_frameEncoder->m_crc[1] = m_frameEncoder->m_crc[2] = 0xffff;
	276	updateCRC(reconPic->getLumaAddr(cuAddr), m_frameEncoder->m_crc[0], height, width, stride);
	277	width >>= m_hChromaShift;
	278	height >>= m_vChromaShift;
	279	stride = reconPic->m_strideC;
	280
	281	updateCRC(reconPic->getCbAddr(cuAddr), m_frameEncoder->m_crc[1], height, width, stride);
	282	updateCRC(reconPic->getCrAddr(cuAddr), m_frameEncoder->m_crc[2], height, width, stride);
	283	}
	284	else if (m_param->decodedPictureHashSEI == 3)
	285	{
	286	uint32_t width = reconPic->m_picWidth;
	287	uint32_t height = getCUHeight(row);
	288	intptr_t stride = reconPic->m_stride;
	289	uint32_t cuHeight = g_maxCUSize;
	290	if (!row)
	291	m_frameEncoder->m_checksum[0] = m_frameEncoder->m_checksum[1] = m_frameEncoder->m_checksum[2] = 0;
	292	updateChecksum(reconPic->m_picOrg[0], m_frameEncoder->m_checksum[0], height, width, stride, row, cuHeight);
	293	width >>= m_hChromaShift;
	294	height >>= m_vChromaShift;
	295	stride = reconPic->m_strideC;
	296	cuHeight >>= m_vChromaShift;
	297
	298	updateChecksum(reconPic->m_picOrg[1], m_frameEncoder->m_checksum[1], height, width, stride, row, cuHeight);
	299	updateChecksum(reconPic->m_picOrg[2], m_frameEncoder->m_checksum[2], height, width, stride, row, cuHeight);
	300	}
	301	}
	302
	303	static uint64_t computeSSD(pixel fenc, pixel rec, intptr_t stride, uint32_t width, uint32_t height)
	304	{
	305	uint64_t ssd = 0;
	306
	307	if ((width \| height) & 3)
	308	{
	309	/* Slow Path */
	310	for (uint32_t y = 0; y < height; y++)
	311	{
	312	for (uint32_t x = 0; x < width; x++)
	313	{
	314	int diff = (int)(fenc[x] - rec[x]);
	315	ssd += diff * diff;
	316	}
	317
	318	fenc += stride;
	319	rec += stride;
	320	}
	321
	322	return ssd;
	323	}
	324
	325	uint32_t y = 0;
	326	/* Consume Y in chunks of 64 */
	327	for (; y + 64 <= height; y += 64)
	328	{
	329	uint32_t x = 0;
	330
	331	if (!(stride & 31))
	332	for (; x + 64 <= width; x += 64)
	333	ssd += primitives.sse_pp[LUMA_64x64](fenc + x, stride, rec + x, stride);
	334
	335	if (!(stride & 15))
	336	for (; x + 16 <= width; x += 16)
	337	ssd += primitives.sse_pp[LUMA_16x64](fenc + x, stride, rec + x, stride);
	338
	339	for (; x + 4 <= width; x += 4)
	340	{
	341	ssd += primitives.sse_pp[LUMA_4x16](fenc + x, stride, rec + x, stride);
	342	ssd += primitives.sse_pp[LUMA_4x16](fenc + x + 16 * stride, stride, rec + x + 16 * stride, stride);
	343	ssd += primitives.sse_pp[LUMA_4x16](fenc + x + 32 * stride, stride, rec + x + 32 * stride, stride);
	344	ssd += primitives.sse_pp[LUMA_4x16](fenc + x + 48 * stride, stride, rec + x + 48 * stride, stride);
	345	}
	346
	347	fenc += stride * 64;
	348	rec += stride * 64;
	349	}
	350
	351	/* Consume Y in chunks of 16 */
	352	for (; y + 16 <= height; y += 16)
	353	{
	354	uint32_t x = 0;
	355
	356	if (!(stride & 31))
	357	for (; x + 64 <= width; x += 64)
	358	ssd += primitives.sse_pp[LUMA_64x16](fenc + x, stride, rec + x, stride);
	359
	360	if (!(stride & 15))
	361	for (; x + 16 <= width; x += 16)
	362	ssd += primitives.sse_pp[LUMA_16x16](fenc + x, stride, rec + x, stride);
	363
	364	for (; x + 4 <= width; x += 4)
	365	ssd += primitives.sse_pp[LUMA_4x16](fenc + x, stride, rec + x, stride);
	366
	367	fenc += stride * 16;
	368	rec += stride * 16;
	369	}
	370
	371	/* Consume Y in chunks of 4 */
	372	for (; y + 4 <= height; y += 4)
	373	{
	374	uint32_t x = 0;
	375
	376	if (!(stride & 15))
	377	for (; x + 16 <= width; x += 16)
	378	ssd += primitives.sse_pp[LUMA_16x4](fenc + x, stride, rec + x, stride);
	379
	380	for (; x + 4 <= width; x += 4)
	381	ssd += primitives.sse_pp[LUMA_4x4](fenc + x, stride, rec + x, stride);
	382
	383	fenc += stride * 4;
	384	rec += stride * 4;
	385	}
	386
	387	return ssd;
	388	}
	389
	390	/* Function to calculate SSIM for each row */
	391	static float calculateSSIM(pixel pix1, intptr_t stride1, pixel pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt)
	392	{
	393	uint32_t z = 0;
	394	float ssim = 0.0;
	395
	396	int(sum0)[4] = (int()[4])buf;
	397	int(*sum1)[4] = sum0 + (width >> 2) + 3;
	398	width >>= 2;
	399	height >>= 2;
	400
	401	for (uint32_t y = 1; y < height; y++)
	402	{
	403	for (; z <= y; z++)
	404	{
	405	std::swap(sum0, sum1);
	406	for (uint32_t x = 0; x < width; x += 2)
	407	primitives.ssim_4x4x2_core(&pix1[(4 * x + (z * stride1))], stride1, &pix2[(4 * x + (z * stride2))], stride2, &sum0[x]);
	408	}
	409
	410	for (uint32_t x = 0; x < width - 1; x += 4)
	411	ssim += primitives.ssim_end_4(sum0 + x, sum1 + x, X265_MIN(4, width - x - 1));
	412	}
	413
	414	cnt = (height - 1) * (width - 1);
	415	return ssim;
	416	}
	417
	418	/* restore original YUV samples to recon after SAO (if lossless) */
	419	static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absPartIdx, uint32_t depth)
	420	{
	421	uint32_t size = g_maxCUSize >> depth;
	422	int part = partitionFromSizes(size, size);
	423
	424	PicYuv* reconPic = frame.m_reconPic;
	425	PicYuv* fencPic = frame.m_fencPic;
	426
	427	pixel* dst = reconPic->getLumaAddr(cu->m_cuAddr, absPartIdx);
	428	pixel* src = fencPic->getLumaAddr(cu->m_cuAddr, absPartIdx);
	429
	430	primitives.luma_copy_pp[part](dst, reconPic->m_stride, src, fencPic->m_stride);
	431
	432	pixel* dstCb = reconPic->getCbAddr(cu->m_cuAddr, absPartIdx);
	433	pixel* srcCb = fencPic->getCbAddr(cu->m_cuAddr, absPartIdx);
	434
	435	pixel* dstCr = reconPic->getCrAddr(cu->m_cuAddr, absPartIdx);
	436	pixel* srcCr = fencPic->getCrAddr(cu->m_cuAddr, absPartIdx);
	437
	438	int csp = fencPic->m_picCsp;
	439	primitives.chroma[csp].copy_pp[part](dstCb, reconPic->m_strideC, srcCb, fencPic->m_strideC);
	440	primitives.chroma[csp].copy_pp[part](dstCr, reconPic->m_strideC, srcCr, fencPic->m_strideC);
	441	}
	442
	443	/* Original YUV restoration for CU in lossless coding */
	444	static void origCUSampleRestoration(const CUData* cu, Frame& frame, uint32_t absPartIdx, uint32_t depth)
	445	{
	446	if (cu->m_cuDepth[absPartIdx] > depth)
	447	{
	448	/* TODO: this could use cuGeom.numPartition and flags */
	449	uint32_t curNumParts = NUM_CU_PARTITIONS >> (depth << 1);
	450	uint32_t qNumParts = curNumParts >> 2;
	451	uint32_t xmax = cu->m_slice->m_sps->picWidthInLumaSamples - cu->m_cuPelX;
	452	uint32_t ymax = cu->m_slice->m_sps->picHeightInLumaSamples - cu->m_cuPelY;
	453
	454	/* process four split sub-cu at next depth */
	455	for (int subPartIdx = 0; subPartIdx < 4; subPartIdx++, absPartIdx += qNumParts)
	456	{
	457	if (g_zscanToPelX[absPartIdx] < xmax && g_zscanToPelY[absPartIdx] < ymax)
	458	origCUSampleRestoration(cu, frame, absPartIdx, depth + 1);
	459	}
	460
	461	return;
	462	}
	463
	464	// restore original YUV samples
	465	if (cu->m_tqBypass[absPartIdx])
	466	restoreOrigLosslessYuv(cu, frame, absPartIdx, depth);
	467	}
	468
	469	void FrameFilter::processSao(int row)
	470	{
	471	SAOParam* saoParam = m_frame->m_encData->m_saoParam;
	472
	473	if (saoParam->bSaoFlag[0])
	474	m_sao.processSaoUnitRow(saoParam->ctuParam[0], row, 0);
	475
	476	if (saoParam->bSaoFlag[1])
	477	{
	478	m_sao.processSaoUnitRow(saoParam->ctuParam[1], row, 1);
	479	m_sao.processSaoUnitRow(saoParam->ctuParam[2], row, 2);
	480	}
	481
	482	if (m_frame->m_encData->m_slice->m_pps->bTransquantBypassEnabled)
	483	{
	484	uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth;
	485	uint32_t lineStartCUAddr = row * numCols;
	486
	487	for (uint32_t col = 0; col < numCols; col++)
	488	origCUSampleRestoration(m_frame->m_encData->getPicCTU(lineStartCUAddr + col), *m_frame, 0, 0);
	489	}
	490	}