[deb_x265.git] / source / encoder / weightPrediction.cpp

/*****************************************************************************
 * Copyright (C) 2013 x265 project
 *
 * Author: Shazeb Nawaz Khan <shazeb@multicorewareinc.com>
 *         Steve Borho <steve@borho.org>
 *         Kavitha Sampas <kavitha@multicorewareinc.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

#include "common.h"
#include "frame.h"
#include "picyuv.h"
#include "lowres.h"
#include "mv.h"
#include "slicetype.h"
#include "bitstream.h"

using namespace x265;
namespace {
/* Per-call state shared by the helpers in this file. weightAnalyse() zeroes
 * the structure with memset() and then fills in every field. */
struct Cache
{
    const int * intraCost;        /* lowres intraCost array of the frame being encoded; weightCost() caps each luma block cost with it */
    int         numPredDir;       /* number of reference lists analysed: 1 when slice.isInterP(), otherwise 2 */
    int         csp;              /* m_picCsp of the source picture; selects the chroma primitive set */
    int         hshift;           /* CHROMA_H_SHIFT(csp) */
    int         vshift;           /* CHROMA_V_SHIFT(csp) */
    int         lowresWidthInCU;  /* lowres width >> 3 */
    int         lowresHeightInCU; /* lowres lines >> 3 */
};

/* Estimate the cost of signalling one weight entry, scaled by lambda.
 *
 * w        weight parameters; only w[0] is examined
 * lambda   cost multiplier
 * bChroma  non-zero when the entry belongs to a chroma plane (callers pass 0 or 1)
 *
 * Returns lambda * (10 + denominator bits + 2 * (weight bits + offset bits)). */
int sliceHeaderCost(WeightParam *w, int lambda, int bChroma)
{
    const WeightParam& entry = w[0];

    /* 4 times higher, because chroma is analyzed at full resolution. */
    const int scaledLambda = bChroma ? lambda * 4 : lambda;

    /* the denominator is counted twice for luma and once for chroma */
    const int denomBits  = bs_size_ue(entry.log2WeightDenom) * (2 - bChroma);
    const int weightBits = bs_size_se(entry.inputWeight);
    const int offsetBits = bs_size_se(entry.inputOffset);

    return scaledLambda * (10 + denomBits + 2 * (weightBits + offsetBits));
}

/* Make a motion compensated copy of the lowres reference luma plane.
 *
 * mcout  destination plane, addressed with the same stride as the reference
 *        (ref.lumaStride). The borders of mcout are not extended.
 * ref    lowres reference; supplies width, lines, lumaStride and lowresMC()
 * mvs    one motion vector per 8x8 block, read in raster order */
void mcLuma(pixel* mcout, Lowres& ref, const MV * mvs)
{
    intptr_t stride = ref.lumaStride;
    const int cuSize = 8;
    MV mvmin, mvmax;

    int cu = 0;

    for (int y = 0; y < ref.lines; y += cuSize)
    {
        intptr_t pixoff = y * stride;

        /* Vertical MV limits for this block row. The bounds are scaled with a
         * multiply instead of '<< 2': the lower bound is always negative and
         * left-shifting a negative value is undefined behaviour. The factor 4
         * presumably converts pixels to quarter-pel MV units. */
        mvmin.y = (int16_t)((-y - 8) * 4);
        mvmax.y = (int16_t)((ref.lines - y - 1 + 8) * 4);

        for (int x = 0; x < ref.width; x += cuSize, pixoff += cuSize, cu++)
        {
            ALIGN_VAR_16(pixel, buf8x8[8 * 8]);
            intptr_t bstride = 8;
            mvmin.x = (int16_t)((-x - 8) * 4);
            mvmax.x = (int16_t)((ref.width - x - 1 + 8) * 4);

            /* clip MV to available pixels */
            MV mv = mvs[cu];
            mv = mv.clipped(mvmin, mvmax);

            /* fetch the predicted 8x8 block and store it at the block's own
             * position in the output plane */
            pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride);
            primitives.luma_copy_pp[LUMA_8x8](mcout + pixoff, stride, tmp, bstride);
        }
    }
}

/* Use lowres MVs from lookahead to generate a motion compensated chroma plane.
 *
 * mcout   destination plane, written with the same stride as src
 * src     reference chroma plane
 * stride  stride shared by src and mcout
 * mvs     lowres motion vectors
 * cache   supplies csp, chroma shifts and the lowres CU grid dimensions
 * height  number of chroma rows to process
 * width   number of chroma columns to process
 *
 * NOTE(review): the original comment states that blocks which were cheaper as
 * intra in lowres are treated as MV 0; no such check is made here, so this is
 * presumably already reflected in mvs - confirm. */
void mcChroma(pixel *      mcout,
              pixel *      src,
              intptr_t     stride,
              const MV *   mvs,
              const Cache& cache,
              int          height,
              int          width)
{
    /* the motion vectors correspond to 8x8 lowres luma blocks, or 16x16 fullres
     * luma blocks. We have to adapt block size to chroma csp */
    int csp = cache.csp;
    int bw = 16 >> cache.hshift;
    int bh = 16 >> cache.vshift;
    MV mvmin, mvmax;

    for (int y = 0; y < height; y += bh)
    {
        /* note: lowres block count per row might be different from chroma block
         * count per row because of rounding issues, so be very careful with indexing
         * into the lowres structures */
        /* NOTE(review): y is a chroma row index here, not a block row index -
         * confirm this is the intended row offset into mvs */
        int cu = y * cache.lowresWidthInCU;
        intptr_t pixoff = y * stride;

        /* Scale the bounds with a multiply instead of '<< 2': the lower bound
         * is always negative and left-shifting a negative value is undefined
         * behaviour. */
        mvmin.y = (int16_t)((-y - 8) * 4);
        mvmax.y = (int16_t)((height - y - 1 + 8) * 4);

        for (int x = 0; x < width; x += bw, cu++, pixoff += bw)
        {
            /* NOTE(review): x and y are chroma pixel coordinates while the
             * limits are lowres CU counts - confirm the comparison is intended */
            if (x < cache.lowresWidthInCU && y < cache.lowresHeightInCU)
            {
                MV mv = mvs[cu]; // lowres MV
                mv <<= 1;        // fullres MV
                mv.x >>= cache.hshift;
                mv.y >>= cache.vshift;

                /* clip MV to available pixels */
                mvmin.x = (int16_t)((-x - 8) * 4);
                mvmax.x = (int16_t)((width - x - 1 + 8) * 4);
                mv = mv.clipped(mvmin, mvmax);

                /* integer part of the MV selects the source position */
                intptr_t fpeloffset = (mv.y >> 2) * stride + (mv.x >> 2);
                pixel *temp = src + pixoff + fpeloffset;

                /* fractional part selects the interpolation filter */
                int xFrac = mv.x & 0x7;
                int yFrac = mv.y & 0x7;
                if ((yFrac | xFrac) == 0)
                {
                    /* no fractional component: plain copy */
                    primitives.chroma[csp].copy_pp[LUMA_16x16](mcout + pixoff, stride, temp, stride);
                }
                else if (yFrac == 0)
                {
                    /* horizontal-only interpolation */
                    primitives.chroma[csp].filter_hpp[LUMA_16x16](temp, stride, mcout + pixoff, stride, xFrac);
                }
                else if (xFrac == 0)
                {
                    /* vertical-only interpolation */
                    primitives.chroma[csp].filter_vpp[LUMA_16x16](temp, stride, mcout + pixoff, stride, yFrac);
                }
                else
                {
                    /* both directions: horizontal pass into a 16-bit
                     * intermediate buffer, then vertical pass into mcout */
                    ALIGN_VAR_16(int16_t, imm[16 * (16 + NTAPS_CHROMA)]);
                    primitives.chroma[csp].filter_hps[LUMA_16x16](temp, stride, imm, bw, xFrac, 1);
                    primitives.chroma[csp].filter_vsp[LUMA_16x16](imm + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
                }
            }
            else
            {
                /* no MV is looked up for this block: copy the co-located pixels */
                primitives.chroma[csp].copy_pp[LUMA_16x16](mcout + pixoff, stride, src + pixoff, stride);
            }
        }
    }
}

/* Measure sum of 8x8 satd costs between source frame and reference
 * frame (potentially weighted, potentially motion compensated). We
 * always use source images for this analysis since reference recon
 * pixels have unreliable availability
 *
 * fenc        source plane
 * ref         reference plane (same stride as fenc)
 * weightTemp  scratch plane that receives the weighted reference when w is set
 * stride      stride shared by all three planes
 * cache       supplies csp and, for luma, the per-block intraCost cap
 * width       plane width in pixels
 * height      plane height in pixels
 * w           weight to apply to ref before measuring, or NULL for none
 * bLuma       true for the luma plane, false for a chroma plane
 *
 * Returns the accumulated cost. */
uint32_t weightCost(pixel *         fenc,
                    pixel *         ref,
                    pixel *         weightTemp,
                    intptr_t        stride,
                    const Cache &   cache,
                    int             width,
                    int             height,
                    WeightParam *   w,
                    bool            bLuma)
{
    if (w)
    {
        /* make a weighted copy of the reference plane */
        /* inputOffset may be negative; scale it with a multiply because
         * left-shifting a negative value is undefined behaviour */
        int offset = w->inputOffset * (1 << (X265_DEPTH - 8));
        int weight = w->inputWeight;
        int denom = w->log2WeightDenom;
        int round = denom ? 1 << (denom - 1) : 0;
        int correction = IF_INTERNAL_PREC - X265_DEPTH; /* intermediate interpolation depth */
        int pwidth = ((width + 15) >> 4) << 4;          /* width rounded up to a multiple of 16 */

        primitives.weight_pp(ref, weightTemp, stride, pwidth, height,
                             weight, round << correction, denom + correction, offset);
        ref = weightTemp;
    }

    uint32_t cost = 0;
    pixel *f = fenc, *r = ref;

    /* NOTE(review): every loop below starts its counters at one block size
     * rather than 0 while r/f start at the plane origin and cu starts at 0,
     * so the last block row and the first block column are not measured.
     * Confirm this is intentional before changing it. */
    if (bLuma)
    {
        int cu = 0;
        for (int y = 8; y < height; y += 8, r += 8 * stride, f += 8 * stride)
        {
            for (int x = 8; x < width; x += 8, cu++)
            {
                /* a block never costs more than its intra estimate */
                int cmp = primitives.satd[LUMA_8x8](r + x, stride, f + x, stride);
                cost += X265_MIN(cmp, cache.intraCost[cu]);
            }
        }
    }
    else if (cache.csp == X265_CSP_I444)
        for (int y = 16; y < height; y += 16, r += 16 * stride, f += 16 * stride)
            for (int x = 16; x < width; x += 16)
                cost += primitives.satd[LUMA_16x16](r + x, stride, f + x, stride);
    else
        for (int y = 8; y < height; y += 8, r += 8 * stride, f += 8 * stride)
            for (int x = 8; x < width; x += 8)
                cost += primitives.satd[LUMA_8x8](r + x, stride, f + x, stride);

    return cost;
}
}

namespace x265 {
/* Search for weighted prediction parameters for the first reference of each
 * reference list and store the result in slice.m_weightPredTable.
 *
 * slice  slice being analysed; supplies POC, reference lists and receives the
 *        weight table (or disableWeights() if the work buffer cannot be allocated)
 * frame  frame being encoded; supplies the source picture and lowres data
 * param  encoder parameters; bframes and logLevel are read
 *
 * Only reference index 0 of each list is searched; the remaining references
 * of the list are reset to unweighted entries that share its denominators. */
void weightAnalyse(Slice& slice, Frame& frame, x265_param& param)
{
    WeightParam wp[2][MAX_NUM_REF][3];
    PicYuv *fencPic = frame.m_fencPic;
    Lowres& fenc    = frame.m_lowres;

    Cache cache;

    memset(&cache, 0, sizeof(cache));
    cache.intraCost = fenc.intraCost;
    cache.numPredDir = slice.isInterP() ? 1 : 2;
    cache.lowresWidthInCU = fenc.width >> 3;
    cache.lowresHeightInCU = fenc.lines >> 3;
    cache.csp = fencPic->m_picCsp;
    cache.hshift = CHROMA_H_SHIFT(cache.csp);
    cache.vshift = CHROMA_V_SHIFT(cache.csp);

    /* Use single allocation for motion compensated ref and weight buffers */
    pixel *mcbuf = X265_MALLOC(pixel, 2 * fencPic->m_stride * fencPic->m_picHeight);
    if (!mcbuf)
    {
        slice.disableWeights();
        return;
    }
    /* second half of the allocation holds the weighted copy of the reference */
    pixel *weightTemp = mcbuf + fencPic->m_stride * fencPic->m_picHeight;

    int lambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
    int curPoc = slice.m_poc;
    const float epsilon = 1.f / 128.f;

    int chromaDenom, lumaDenom, denom;
    chromaDenom = lumaDenom = 7;
    /* pixel counts used to turn the wp_sum totals into means; dimensions are
     * rounded up to a multiple of 16 */
    int numpixels[3];
    int w16 = ((fencPic->m_picWidth  + 15) >> 4) << 4;
    int h16 = ((fencPic->m_picHeight + 15) >> 4) << 4;
    numpixels[0] = w16 * h16;
    numpixels[1] = numpixels[2] = numpixels[0] >> (cache.hshift + cache.vshift);

    for (int list = 0; list < cache.numPredDir; list++)
    {
        /* only the first reference of the list is analysed */
        WeightParam *weights = wp[list][0];
        Frame *refFrame = slice.m_refPicList[list][0];
        Lowres& refLowres = refFrame->m_lowres;
        int diffPoc = abs(curPoc - refFrame->m_poc);

        /* prepare estimates */
        float guessScale[3], fencMean[3], refMean[3];
        for (int plane = 0; plane < 3; plane++)
        {
            SET_WEIGHT(weights[plane], false, 1, 0, 0);
            /* adding !refLowres.wp_ssd keeps refVar non-zero for the division below */
            uint64_t fencVar = fenc.wp_ssd[plane] + !refLowres.wp_ssd[plane];
            uint64_t refVar  = refLowres.wp_ssd[plane] + !refLowres.wp_ssd[plane];
            guessScale[plane] = sqrt((float)fencVar / refVar);
            fencMean[plane] = (float)fenc.wp_sum[plane] / (numpixels[plane]) / (1 << (X265_DEPTH - 8));
            refMean[plane]  = (float)refLowres.wp_sum[plane] / (numpixels[plane]) / (1 << (X265_DEPTH - 8));
        }

        /* make sure both our scale factors fit */
        /* (performed for list 0 only; list 1 reuses the resulting chromaDenom) */
        while (!list && chromaDenom > 0)
        {
            float thresh = 127.f / (1 << chromaDenom);
            if (guessScale[1] < thresh && guessScale[2] < thresh)
                break;
            chromaDenom--;
        }

        SET_WEIGHT(weights[1], false, 1 << chromaDenom, chromaDenom, 0);
        SET_WEIGHT(weights[2], false, 1 << chromaDenom, chromaDenom, 0);

        /* set while processing plane 0 and reused for the chroma planes */
        MV *mvs = NULL;

        for (int plane = 0; plane < 3; plane++)
        {
            denom = plane ? chromaDenom : lumaDenom;
            /* chroma is only searched when a luma weight was selected */
            if (plane && !weights[0].bPresentFlag)
                break;

            /* Early termination */
            x265_emms();
            if (fabsf(refMean[plane] - fencMean[plane]) < 0.5f && fabsf(1.f - guessScale[plane]) < epsilon)
            {
                SET_WEIGHT(weights[plane], 0, 1 << denom, denom, 0);
                continue;
            }

            if (plane)
            {
                int scale = Clip3(0, 255, (int)(guessScale[plane] * (1 << denom) + 0.5f));
                /* chroma scale estimates above 127 are not searched */
                if (scale > 127)
                    continue;
                weights[plane].inputWeight = scale;
            }
            else
            {
                weights[plane].setFromWeightAndOffset((int)(guessScale[plane] * (1 << denom) + 0.5f), 0, denom, !list);
            }

            /* starting point of the search */
            int mindenom = weights[plane].log2WeightDenom;
            int minscale = weights[plane].inputWeight;
            int minoff = 0;

            if (!plane && diffPoc <= param.bframes + 1)
            {
                mvs = fenc.lowresMvs[list][diffPoc - 1];

                /* test whether this motion search was performed by lookahead */
                if (mvs[0].x != 0x7FFF)
                {
                    /* reference chroma planes must be extended prior to being
                     * used as motion compensation sources */
                    if (!refFrame->m_bChromaExtended)
                    {
                        refFrame->m_bChromaExtended = true;
                        PicYuv *refPic = refFrame->m_fencPic;
                        int width = refPic->m_picWidth >> cache.hshift;
                        int height = refPic->m_picHeight >> cache.vshift;
                        extendPicBorder(refPic->m_picOrg[1], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY);
                        extendPicBorder(refPic->m_picOrg[2], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY);
                    }
                }
                else
                    mvs = 0;
            }

            /* prepare inputs to weight analysis */
            pixel *orig;
            pixel *fref;
            intptr_t stride;
            int    width, height;
            switch (plane)
            {
            case 0:
                /* luma is analysed on the lowres planes */
                orig = fenc.lowresPlane[0];
                stride = fenc.lumaStride;
                width = fenc.width;
                height = fenc.lines;
                fref = refLowres.lowresPlane[0];
                if (mvs)
                {
                    mcLuma(mcbuf, refLowres, mvs);
                    fref = mcbuf;
                }
                break;

            case 1:
                /* chroma is analysed on the source picture planes */
                orig = fencPic->m_picOrg[1];
                stride = fencPic->m_strideC;
                fref = refFrame->m_fencPic->m_picOrg[1];

                /* Clamp the chroma dimensions to the nearest multiple of
                 * 8x8 blocks (or 16x16 for 4:4:4) since mcChroma uses lowres
                 * blocks and weightCost measures 8x8 blocks. This
                 * potentially ignores some edge pixels, but simplifies the
                 * logic and prevents reading uninitialized pixels. Lowres
                 * planes are border extended and require no clamping. */
                width =  ((fencPic->m_picWidth  >> 4) << 4) >> cache.hshift;
                height = ((fencPic->m_picHeight >> 4) << 4) >> cache.vshift;
                if (mvs)
                {
                    mcChroma(mcbuf, fref, stride, mvs, cache, height, width);
                    fref = mcbuf;
                }
                break;

            case 2:
                /* same setup as plane 1, using the second chroma plane */
                fref = refFrame->m_fencPic->m_picOrg[2];
                orig = fencPic->m_picOrg[2];
                stride = fencPic->m_strideC;
                width =  ((fencPic->m_picWidth  >> 4) << 4) >> cache.hshift;
                height = ((fencPic->m_picHeight >> 4) << 4) >> cache.vshift;
                if (mvs)
                {
                    mcChroma(mcbuf, fref, stride, mvs, cache, height, width);
                    fref = mcbuf;
                }
                break;

            default:
                /* not reachable with plane in 0..2; releases the buffer before bailing out */
                slice.disableWeights();
                X265_FREE(mcbuf);
                return;
            }

            /* baseline: cost without any weighting */
            uint32_t origscore = weightCost(orig, fref, weightTemp, stride, cache, width, height, NULL, !plane);
            if (!origscore)
            {
                SET_WEIGHT(weights[plane], 0, 1 << denom, denom, 0);
                continue;
            }

            uint32_t minscore = origscore;
            bool bFound = false;

            /* x264 uses a table lookup here, selecting search range based on preset */
            static const int scaleDist = 4;
            static const int offsetDist = 2;

            /* try every scale within scaleDist of the estimate, and for each
             * scale every offset within offsetDist of the offset implied by
             * the plane means */
            int startScale = Clip3(0, 127, minscale - scaleDist);
            int endScale   = Clip3(0, 127, minscale + scaleDist);
            for (int scale = startScale; scale <= endScale; scale++)
            {
                int deltaWeight = scale - (1 << mindenom);
                if (deltaWeight > 127 || deltaWeight <= -128)
                    continue;

                x265_emms();
                int curScale = scale;
                int curOffset = (int)(fencMean[plane] - refMean[plane] * curScale / (1 << mindenom) + 0.5f);
                if (curOffset < -128 || curOffset > 127)
                {
                    /* Rescale considering the constraints on curOffset. We do it in this order
                     * because scale has a much wider range than offset (because of denom), so
                     * it should almost never need to be clamped. */
                    curOffset = Clip3(-128, 127, curOffset);
                    curScale = (int)((1 << mindenom) * (fencMean[plane] - curOffset) / refMean[plane] + 0.5f);
                    curScale = Clip3(0, 127, curScale);
                }

                int startOffset = Clip3(-128, 127, curOffset - offsetDist);
                int endOffset   = Clip3(-128, 127, curOffset + offsetDist);
                for (int off = startOffset; off <= endOffset; off++)
                {
                    WeightParam wsp;
                    SET_WEIGHT(wsp, true, curScale, mindenom, off);
                    /* candidate score = weighted cost + cost of signalling the weight */
                    uint32_t s = weightCost(orig, fref, weightTemp, stride, cache, width, height, &wsp, !plane) +
                                 sliceHeaderCost(&wsp, lambda, !!plane);
                    COPY4_IF_LT(minscore, s, minscale, curScale, minoff, off, bFound, true);

                    /* Don't check any more offsets if the previous one had a lower cost than the current one */
                    if (minoff == startOffset && off != startOffset)
                        break;
                }
            }

            /* Use a smaller luma denominator if possible */
            if (!(plane || list))
            {
                while (mindenom > 0 && !(minscale & 1))
                {
                    mindenom--;
                    minscale >>= 1;
                }
            }

            /* keep the weight only if one was found, it is not the identity
             * weight, and it lowers the score by more than 0.2% */
            if (!bFound || (minscale == (1 << mindenom) && minoff == 0) || (float)minscore / origscore > 0.998f)
            {
                SET_WEIGHT(weights[plane], false, 1 << denom, denom, 0);
            }
            else
            {
                SET_WEIGHT(weights[plane], true, minscale, mindenom, minoff);
            }
        }

        if (weights[0].bPresentFlag)
        {
            // Make sure both chroma channels match
            if (weights[1].bPresentFlag != weights[2].bPresentFlag)
            {
                if (weights[1].bPresentFlag)
                    weights[2] = weights[1];
                else
                    weights[1] = weights[2];
            }
        }

        /* carry the chosen denominators into the remaining references and the next list */
        lumaDenom = weights[0].log2WeightDenom;
        chromaDenom = weights[1].log2WeightDenom;

        /* reset weight states */
        for (int ref = 1; ref < slice.m_numRefIdx[list]; ref++)
        {
            SET_WEIGHT(wp[list][ref][0], false, 1 << lumaDenom, lumaDenom, 0);
            SET_WEIGHT(wp[list][ref][1], false, 1 << chromaDenom, chromaDenom, 0);
            SET_WEIGHT(wp[list][ref][2], false, 1 << chromaDenom, chromaDenom, 0);
        }
    }

    X265_FREE(mcbuf);

    /* publish the whole local table to the slice */
    memcpy(slice.m_weightPredTable, wp, sizeof(WeightParam) * 2 * MAX_NUM_REF * 3);

    /* at full log level, report the selected weights for reference 0 of each list */
    if (param.logLevel >= X265_LOG_FULL)
    {
        char buf[1024];
        int p = 0;
        bool bWeighted = false;

        p = sprintf(buf, "poc: %d weights:", slice.m_poc);
        int numPredDir = slice.isInterP() ? 1 : 2;
        for (int list = 0; list < numPredDir; list++)
        {
            WeightParam* w = &wp[list][0][0];
            if (w[0].bPresentFlag || w[1].bPresentFlag || w[2].bPresentFlag)
            {
                bWeighted = true;
                p += sprintf(buf + p, " [L%d:R0 ", list);
                if (w[0].bPresentFlag)
                    p += sprintf(buf + p, "Y{%d/%d%+d}", w[0].inputWeight, 1 << w[0].log2WeightDenom, w[0].inputOffset);
                if (w[1].bPresentFlag)
                    p += sprintf(buf + p, "U{%d/%d%+d}", w[1].inputWeight, 1 << w[1].log2WeightDenom, w[1].inputOffset);
                if (w[2].bPresentFlag)
                    p += sprintf(buf + p, "V{%d/%d%+d}", w[2].inputWeight, 1 << w[2].log2WeightDenom, w[2].inputOffset);
                p += sprintf(buf + p, "]");
            }
        }

        /* nothing is logged when no plane is weighted */
        if (bWeighted)
        {
            if (p < 80) // pad with spaces to ensure progress line overwritten
                sprintf(buf + p, "%*s", 80 - p, " ");
            x265_log(&param, X265_LOG_FULL, "%s\n", buf);
        }
    }
}
}
Commit	Line	Data
	1	/*****************************************************************************
	2	* Copyright (C) 2013 x265 project
	3	*
	4	* Author: Shazeb Nawaz Khan <shazeb@multicorewareinc.com>
	5	* Steve Borho <steve@borho.org>
	6	* Kavitha Sampas <kavitha@multicorewareinc.com>
	7	*
	8	* This program is free software; you can redistribute it and/or modify
	9	* it under the terms of the GNU General Public License as published by
	10	* the Free Software Foundation; either version 2 of the License, or
	11	* (at your option) any later version.
	12	*
	13	* This program is distributed in the hope that it will be useful,
	14	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	* GNU General Public License for more details.
	17	*
	18	* You should have received a copy of the GNU General Public License
	19	* along with this program; if not, write to the Free Software
	20	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
	21	*
	22	* This program is also available under a commercial proprietary license.
	23	* For more information, contact us at license @ x265.com.
	24	*****************************************************************************/
	25
	26	#include "common.h"
	27	#include "frame.h"
	28	#include "picyuv.h"
	29	#include "lowres.h"
	30	#include "mv.h"
	31	#include "slicetype.h"
	32	#include "bitstream.h"
	33
	34	using namespace x265;
	35	namespace {
	36	struct Cache
	37	{
	38	const int * intraCost;
	39	int numPredDir;
	40	int csp;
	41	int hshift;
	42	int vshift;
	43	int lowresWidthInCU;
	44	int lowresHeightInCU;
	45	};
	46
	47	int sliceHeaderCost(WeightParam *w, int lambda, int bChroma)
	48	{
	49	/* 4 times higher, because chroma is analyzed at full resolution. */
	50	if (bChroma)
	51	lambda *= 4;
	52	int denomCost = bs_size_ue(w[0].log2WeightDenom) * (2 - bChroma);
	53	return lambda * (10 + denomCost + 2 * (bs_size_se(w[0].inputWeight) + bs_size_se(w[0].inputOffset)));
	54	}
	55
	56	/* make a motion compensated copy of lowres ref into mcout with the same stride.
	57	* The borders of mcout are not extended */
	58	void mcLuma(pixel* mcout, Lowres& ref, const MV * mvs)
	59	{
	60	intptr_t stride = ref.lumaStride;
	61	const int cuSize = 8;
	62	MV mvmin, mvmax;
	63
	64	int cu = 0;
	65
	66	for (int y = 0; y < ref.lines; y += cuSize)
	67	{
	68	intptr_t pixoff = y * stride;
	69	mvmin.y = (int16_t)((-y - 8) << 2);
	70	mvmax.y = (int16_t)((ref.lines - y - 1 + 8) << 2);
	71
	72	for (int x = 0; x < ref.width; x += cuSize, pixoff += cuSize, cu++)
	73	{
	74	ALIGN_VAR_16(pixel, buf8x8[8 * 8]);
	75	intptr_t bstride = 8;
	76	mvmin.x = (int16_t)((-x - 8) << 2);
	77	mvmax.x = (int16_t)((ref.width - x - 1 + 8) << 2);
	78
	79	/* clip MV to available pixels */
	80	MV mv = mvs[cu];
	81	mv = mv.clipped(mvmin, mvmax);
	82	pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride);
	83	primitives.luma_copy_pp[LUMA_8x8](mcout + pixoff, stride, tmp, bstride);
	84	}
	85	}
	86	}
	87
	88	/* use lowres MVs from lookahead to generate a motion compensated chroma plane.
	89	* if a block had cheaper lowres cost as intra, we treat it as MV 0 */
	90	void mcChroma(pixel * mcout,
	91	pixel * src,
	92	intptr_t stride,
	93	const MV * mvs,
	94	const Cache& cache,
	95	int height,
	96	int width)
	97	{
	98	/* the motion vectors correspond to 8x8 lowres luma blocks, or 16x16 fullres
	99	* luma blocks. We have to adapt block size to chroma csp */
	100	int csp = cache.csp;
	101	int bw = 16 >> cache.hshift;
	102	int bh = 16 >> cache.vshift;
	103	MV mvmin, mvmax;
	104
	105	for (int y = 0; y < height; y += bh)
	106	{
	107	/* note: lowres block count per row might be different from chroma block
	108	* count per row because of rounding issues, so be very careful with indexing
	109	* into the lowres structures */
	110	int cu = y * cache.lowresWidthInCU;
	111	intptr_t pixoff = y * stride;
	112	mvmin.y = (int16_t)((-y - 8) << 2);
	113	mvmax.y = (int16_t)((height - y - 1 + 8) << 2);
	114
	115	for (int x = 0; x < width; x += bw, cu++, pixoff += bw)
	116	{
	117	if (x < cache.lowresWidthInCU && y < cache.lowresHeightInCU)
	118	{
	119	MV mv = mvs[cu]; // lowres MV
	120	mv <<= 1; // fullres MV
	121	mv.x >>= cache.hshift;
	122	mv.y >>= cache.vshift;
	123
	124	/* clip MV to available pixels */
	125	mvmin.x = (int16_t)((-x - 8) << 2);
	126	mvmax.x = (int16_t)((width - x - 1 + 8) << 2);
	127	mv = mv.clipped(mvmin, mvmax);
	128
	129	intptr_t fpeloffset = (mv.y >> 2) * stride + (mv.x >> 2);
	130	pixel *temp = src + pixoff + fpeloffset;
	131
	132	int xFrac = mv.x & 0x7;
	133	int yFrac = mv.y & 0x7;
	134	if ((yFrac | xFrac) == 0)
	135	{
	136	primitives.chroma[csp].copy_pp[LUMA_16x16](mcout + pixoff, stride, temp, stride);
	137	}
	138	else if (yFrac == 0)
	139	{
	140	primitives.chroma[csp].filter_hpp[LUMA_16x16](temp, stride, mcout + pixoff, stride, xFrac);
	141	}
	142	else if (xFrac == 0)
	143	{
	144	primitives.chroma[csp].filter_vpp[LUMA_16x16](temp, stride, mcout + pixoff, stride, yFrac);
	145	}
	146	else
	147	{
	148	ALIGN_VAR_16(int16_t, imm[16 * (16 + NTAPS_CHROMA)]);
	149	primitives.chroma[csp].filter_hps[LUMA_16x16](temp, stride, imm, bw, xFrac, 1);
	150	primitives.chroma[csp].filter_vsp[LUMA_16x16](imm + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
	151	}
	152	}
	153	else
	154	{
	155	primitives.chroma[csp].copy_pp[LUMA_16x16](mcout + pixoff, stride, src + pixoff, stride);
	156	}
	157	}
	158	}
	159	}
	160
	161	/* Measure sum of 8x8 satd costs between source frame and reference
	162	* frame (potentially weighted, potentially motion compensated). We
	163	* always use source images for this analysis since reference recon
	164	* pixels have unreliable availability */
	165	uint32_t weightCost(pixel * fenc,
	166	pixel * ref,
	167	pixel * weightTemp,
	168	intptr_t stride,
	169	const Cache & cache,
	170	int width,
	171	int height,
	172	WeightParam * w,
	173	bool bLuma)
	174	{
	175	if (w)
	176	{
	177	/* make a weighted copy of the reference plane */
	178	int offset = w->inputOffset << (X265_DEPTH - 8);
	179	int weight = w->inputWeight;
	180	int denom = w->log2WeightDenom;
	181	int round = denom ? 1 << (denom - 1) : 0;
	182	int correction = IF_INTERNAL_PREC - X265_DEPTH; /* intermediate interpolation depth */
	183	int pwidth = ((width + 15) >> 4) << 4;
	184
	185	primitives.weight_pp(ref, weightTemp, stride, pwidth, height,
	186	weight, round << correction, denom + correction, offset);
	187	ref = weightTemp;
	188	}
	189
	190	uint32_t cost = 0;
	191	pixel *f = fenc, *r = ref;
	192
	193	if (bLuma)
	194	{
	195	int cu = 0;
	196	for (int y = 8; y < height; y += 8, r += 8 * stride, f += 8 * stride)
	197	{
	198	for (int x = 8; x < width; x += 8, cu++)
	199	{
	200	int cmp = primitives.satd[LUMA_8x8](r + x, stride, f + x, stride);
	201	cost += X265_MIN(cmp, cache.intraCost[cu]);
	202	}
	203	}
	204	}
	205	else if (cache.csp == X265_CSP_I444)
	206	for (int y = 16; y < height; y += 16, r += 16 * stride, f += 16 * stride)
	207	for (int x = 16; x < width; x += 16)
	208	cost += primitives.satd[LUMA_16x16](r + x, stride, f + x, stride);
	209	else
	210	for (int y = 8; y < height; y += 8, r += 8 * stride, f += 8 * stride)
	211	for (int x = 8; x < width; x += 8)
	212	cost += primitives.satd[LUMA_8x8](r + x, stride, f + x, stride);
	213
	214	return cost;
	215	}
	216	}
	217
	218	namespace x265 {
	219	void weightAnalyse(Slice& slice, Frame& frame, x265_param& param)
	220	{
	221	WeightParam wp[2][MAX_NUM_REF][3];
	222	PicYuv *fencPic = frame.m_fencPic;
	223	Lowres& fenc = frame.m_lowres;
	224
	225	Cache cache;
	226
	227	memset(&cache, 0, sizeof(cache));
	228	cache.intraCost = fenc.intraCost;
	229	cache.numPredDir = slice.isInterP() ? 1 : 2;
	230	cache.lowresWidthInCU = fenc.width >> 3;
	231	cache.lowresHeightInCU = fenc.lines >> 3;
	232	cache.csp = fencPic->m_picCsp;
	233	cache.hshift = CHROMA_H_SHIFT(cache.csp);
	234	cache.vshift = CHROMA_V_SHIFT(cache.csp);
	235
	236	/* Use single allocation for motion compensated ref and weight buffers */
	237	pixel *mcbuf = X265_MALLOC(pixel, 2 * fencPic->m_stride * fencPic->m_picHeight);
	238	if (!mcbuf)
	239	{
	240	slice.disableWeights();
	241	return;
	242	}
	243	pixel *weightTemp = mcbuf + fencPic->m_stride * fencPic->m_picHeight;
	244
	245	int lambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
	246	int curPoc = slice.m_poc;
	247	const float epsilon = 1.f / 128.f;
	248
	249	int chromaDenom, lumaDenom, denom;
	250	chromaDenom = lumaDenom = 7;
	251	int numpixels[3];
	252	int w16 = ((fencPic->m_picWidth + 15) >> 4) << 4;
	253	int h16 = ((fencPic->m_picHeight + 15) >> 4) << 4;
	254	numpixels[0] = w16 * h16;
	255	numpixels[1] = numpixels[2] = numpixels[0] >> (cache.hshift + cache.vshift);
	256
	257	for (int list = 0; list < cache.numPredDir; list++)
	258	{
	259	WeightParam *weights = wp[list][0];
	260	Frame *refFrame = slice.m_refPicList[list][0];
	261	Lowres& refLowres = refFrame->m_lowres;
	262	int diffPoc = abs(curPoc - refFrame->m_poc);
	263
	264	/* prepare estimates */
	265	float guessScale[3], fencMean[3], refMean[3];
	266	for (int plane = 0; plane < 3; plane++)
	267	{
	268	SET_WEIGHT(weights[plane], false, 1, 0, 0);
	269	uint64_t fencVar = fenc.wp_ssd[plane] + !refLowres.wp_ssd[plane];
	270	uint64_t refVar = refLowres.wp_ssd[plane] + !refLowres.wp_ssd[plane];
	271	guessScale[plane] = sqrt((float)fencVar / refVar);
	272	fencMean[plane] = (float)fenc.wp_sum[plane] / (numpixels[plane]) / (1 << (X265_DEPTH - 8));
	273	refMean[plane] = (float)refLowres.wp_sum[plane] / (numpixels[plane]) / (1 << (X265_DEPTH - 8));
	274	}
	275
	276	/* make sure both our scale factors fit */
	277	while (!list && chromaDenom > 0)
	278	{
	279	float thresh = 127.f / (1 << chromaDenom);
	280	if (guessScale[1] < thresh && guessScale[2] < thresh)
	281	break;
	282	chromaDenom--;
	283	}
	284
	285	SET_WEIGHT(weights[1], false, 1 << chromaDenom, chromaDenom, 0);
	286	SET_WEIGHT(weights[2], false, 1 << chromaDenom, chromaDenom, 0);
	287
	288	MV *mvs = NULL;
	289
	290	for (int plane = 0; plane < 3; plane++)
	291	{
	292	denom = plane ? chromaDenom : lumaDenom;
	293	if (plane && !weights[0].bPresentFlag)
	294	break;
	295
	296	/* Early termination */
	297	x265_emms();
	298	if (fabsf(refMean[plane] - fencMean[plane]) < 0.5f && fabsf(1.f - guessScale[plane]) < epsilon)
	299	{
	300	SET_WEIGHT(weights[plane], 0, 1 << denom, denom, 0);
	301	continue;
	302	}
	303
	304	if (plane)
	305	{
	306	int scale = Clip3(0, 255, (int)(guessScale[plane] * (1 << denom) + 0.5f));
	307	if (scale > 127)
	308	continue;
	309	weights[plane].inputWeight = scale;
	310	}
	311	else
	312	{
	313	weights[plane].setFromWeightAndOffset((int)(guessScale[plane] * (1 << denom) + 0.5f), 0, denom, !list);
	314	}
	315
	316	int mindenom = weights[plane].log2WeightDenom;
	317	int minscale = weights[plane].inputWeight;
	318	int minoff = 0;
	319
	320	if (!plane && diffPoc <= param.bframes + 1)
	321	{
	322	mvs = fenc.lowresMvs[list][diffPoc - 1];
	323
	324	/* test whether this motion search was performed by lookahead */
	325	if (mvs[0].x != 0x7FFF)
	326	{
	327	/* reference chroma planes must be extended prior to being
	328	* used as motion compensation sources */
	329	if (!refFrame->m_bChromaExtended)
	330	{
	331	refFrame->m_bChromaExtended = true;
	332	PicYuv *refPic = refFrame->m_fencPic;
	333	int width = refPic->m_picWidth >> cache.hshift;
	334	int height = refPic->m_picHeight >> cache.vshift;
	335	extendPicBorder(refPic->m_picOrg[1], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY);
	336	extendPicBorder(refPic->m_picOrg[2], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY);
	337	}
	338	}
	339	else
	340	mvs = 0;
	341	}
	342
	343	/* prepare inputs to weight analysis */
	344	pixel *orig;
	345	pixel *fref;
	346	intptr_t stride;
	347	int width, height;
	348	switch (plane)
	349	{
	350	case 0:
	351	orig = fenc.lowresPlane[0];
	352	stride = fenc.lumaStride;
	353	width = fenc.width;
	354	height = fenc.lines;
	355	fref = refLowres.lowresPlane[0];
	356	if (mvs)
	357	{
	358	mcLuma(mcbuf, refLowres, mvs);
	359	fref = mcbuf;
	360	}
	361	break;
	362
	363	case 1:
	364	orig = fencPic->m_picOrg[1];
	365	stride = fencPic->m_strideC;
	366	fref = refFrame->m_fencPic->m_picOrg[1];
	367
	368	/* Clamp the chroma dimensions to the nearest multiple of
	369	* 8x8 blocks (or 16x16 for 4:4:4) since mcChroma uses lowres
	370	* blocks and weightCost measures 8x8 blocks. This
	371	* potentially ignores some edge pixels, but simplifies the
	372	* logic and prevents reading uninitialized pixels. Lowres
	373	* planes are border extended and require no clamping. */
	374	width = ((fencPic->m_picWidth >> 4) << 4) >> cache.hshift;
	375	height = ((fencPic->m_picHeight >> 4) << 4) >> cache.vshift;
	376	if (mvs)
	377	{
	378	mcChroma(mcbuf, fref, stride, mvs, cache, height, width);
	379	fref = mcbuf;
	380	}
	381	break;
	382
	383	case 2:
	384	fref = refFrame->m_fencPic->m_picOrg[2];
	385	orig = fencPic->m_picOrg[2];
	386	stride = fencPic->m_strideC;
	387	width = ((fencPic->m_picWidth >> 4) << 4) >> cache.hshift;
	388	height = ((fencPic->m_picHeight >> 4) << 4) >> cache.vshift;
	389	if (mvs)
	390	{
	391	mcChroma(mcbuf, fref, stride, mvs, cache, height, width);
	392	fref = mcbuf;
	393	}
	394	break;
	395
	396	default:
	397	slice.disableWeights();
	398	X265_FREE(mcbuf);
	399	return;
	400	}
	401
	402	uint32_t origscore = weightCost(orig, fref, weightTemp, stride, cache, width, height, NULL, !plane);
	403	if (!origscore)
	404	{
	405	SET_WEIGHT(weights[plane], 0, 1 << denom, denom, 0);
	406	continue;
	407	}
	408
	409	uint32_t minscore = origscore;
	410	bool bFound = false;
	411
	412	/* x264 uses a table lookup here, selecting search range based on preset */
	413	static const int scaleDist = 4;
	414	static const int offsetDist = 2;
	415
	416	int startScale = Clip3(0, 127, minscale - scaleDist);
	417	int endScale = Clip3(0, 127, minscale + scaleDist);
	418	for (int scale = startScale; scale <= endScale; scale++)
	419	{
	420	int deltaWeight = scale - (1 << mindenom);
	421	if (deltaWeight > 127 \|\| deltaWeight <= -128)
	422	continue;
	423
	424	x265_emms();
	425	int curScale = scale;
	426	int curOffset = (int)(fencMean[plane] - refMean[plane] * curScale / (1 << mindenom) + 0.5f);
	427	if (curOffset < -128 \|\| curOffset > 127)
	428	{
	429	/* Rescale considering the constraints on curOffset. We do it in this order
	430	* because scale has a much wider range than offset (because of denom), so
	431	* it should almost never need to be clamped. */
	432	curOffset = Clip3(-128, 127, curOffset);
	433	curScale = (int)((1 << mindenom) * (fencMean[plane] - curOffset) / refMean[plane] + 0.5f);
	434	curScale = Clip3(0, 127, curScale);
	435	}
	436
	437	int startOffset = Clip3(-128, 127, curOffset - offsetDist);
	438	int endOffset = Clip3(-128, 127, curOffset + offsetDist);
	439	for (int off = startOffset; off <= endOffset; off++)
	440	{
	441	WeightParam wsp;
	442	SET_WEIGHT(wsp, true, curScale, mindenom, off);
	443	uint32_t s = weightCost(orig, fref, weightTemp, stride, cache, width, height, &wsp, !plane) +
	444	sliceHeaderCost(&wsp, lambda, !!plane);
	445	COPY4_IF_LT(minscore, s, minscale, curScale, minoff, off, bFound, true);
	446
	447	/* Don't check any more offsets if the previous one had a lower cost than the current one */
	448	if (minoff == startOffset && off != startOffset)
	449	break;
	450	}
	451	}
	452
	453	/* Use a smaller luma denominator if possible */
	454	if (!(plane \|\| list))
	455	{
	456	while (mindenom > 0 && !(minscale & 1))
	457	{
	458	mindenom--;
	459	minscale >>= 1;
	460	}
	461	}
	462
	463	if (!bFound \|\| (minscale == (1 << mindenom) && minoff == 0) \|\| (float)minscore / origscore > 0.998f)
	464	{
	465	SET_WEIGHT(weights[plane], false, 1 << denom, denom, 0);
	466	}
	467	else
	468	{
	469	SET_WEIGHT(weights[plane], true, minscale, mindenom, minoff);
	470	}
	471	}
	472
	473	if (weights[0].bPresentFlag)
	474	{
	475	// Make sure both chroma channels match
	476	if (weights[1].bPresentFlag != weights[2].bPresentFlag)
	477	{
	478	if (weights[1].bPresentFlag)
	479	weights[2] = weights[1];
	480	else
	481	weights[1] = weights[2];
	482	}
	483	}
	484
	485	lumaDenom = weights[0].log2WeightDenom;
	486	chromaDenom = weights[1].log2WeightDenom;
	487
	488	/* reset weight states */
	489	for (int ref = 1; ref < slice.m_numRefIdx[list]; ref++)
	490	{
	491	SET_WEIGHT(wp[list][ref][0], false, 1 << lumaDenom, lumaDenom, 0);
	492	SET_WEIGHT(wp[list][ref][1], false, 1 << chromaDenom, chromaDenom, 0);
	493	SET_WEIGHT(wp[list][ref][2], false, 1 << chromaDenom, chromaDenom, 0);
	494	}
	495	}
	496
	497	X265_FREE(mcbuf);
	498
	499	memcpy(slice.m_weightPredTable, wp, sizeof(WeightParam) * 2 * MAX_NUM_REF * 3);
	500
	501	if (param.logLevel >= X265_LOG_FULL)
	502	{
	503	char buf[1024];
	504	int p = 0;
	505	bool bWeighted = false;
	506
	507	p = sprintf(buf, "poc: %d weights:", slice.m_poc);
	508	int numPredDir = slice.isInterP() ? 1 : 2;
	509	for (int list = 0; list < numPredDir; list++)
	510	{
	511	WeightParam* w = &wp[list][0][0];
	512	if (w[0].bPresentFlag \|\| w[1].bPresentFlag \|\| w[2].bPresentFlag)
	513	{
	514	bWeighted = true;
	515	p += sprintf(buf + p, " [L%d:R0 ", list);
	516	if (w[0].bPresentFlag)
	517	p += sprintf(buf + p, "Y{%d/%d%+d}", w[0].inputWeight, 1 << w[0].log2WeightDenom, w[0].inputOffset);
	518	if (w[1].bPresentFlag)
	519	p += sprintf(buf + p, "U{%d/%d%+d}", w[1].inputWeight, 1 << w[1].log2WeightDenom, w[1].inputOffset);
	520	if (w[2].bPresentFlag)
	521	p += sprintf(buf + p, "V{%d/%d%+d}", w[2].inputWeight, 1 << w[2].log2WeightDenom, w[2].inputOffset);
	522	p += sprintf(buf + p, "]");
	523	}
	524	}
	525
	526	if (bWeighted)
	527	{
	528	if (p < 80) // pad with spaces to ensure progress line overwritten
	529	sprintf(buf + p, "%*s", 80 - p, " ");
	530	x265_log(&param, X265_LOG_FULL, "%s\n", buf);
	531	}
	532	}
	533	}
	534	}