Imported Upstream version 1.4+222+hg5f9f7194267b
[deb_x265.git] / source / encoder / weightPrediction.cpp
CommitLineData
72b9787e
JB
1/*****************************************************************************
2 * Copyright (C) 2013 x265 project
3 *
4 * Author: Shazeb Nawaz Khan <shazeb@multicorewareinc.com>
5 * Steve Borho <steve@borho.org>
6 * Kavitha Sampas <kavitha@multicorewareinc.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21 *
22 * This program is also available under a commercial proprietary license.
23 * For more information, contact us at license @ x265.com.
24 *****************************************************************************/
25
26#include "common.h"
27#include "frame.h"
28#include "picyuv.h"
29#include "lowres.h"
30#include "mv.h"
31#include "slicetype.h"
32#include "bitstream.h"
33
34using namespace x265;
35namespace {
/* Per-frame values shared by the helper routines in this file. An instance is
 * zero-filled and then populated by weightAnalyse() before any helper runs. */
struct Cache
{
    /* intra cost table of the frame being analysed (fenc.intraCost); read by
     * weightCost() as an upper bound on each measured luma block cost */
    const int* intraCost;

    /* number of reference lists to analyse: 1 for P slices, otherwise 2 */
    int numPredDir;

    /* chroma sampling format of the source picture (m_picCsp) */
    int csp;

    /* CHROMA_H_SHIFT(csp) and CHROMA_V_SHIFT(csp) respectively */
    int hshift;
    int vshift;

    /* lowres luma width and height, each shifted right by 3 */
    int lowresWidthInCU;
    int lowresHeightInCU;
};
46
47int sliceHeaderCost(WeightParam *w, int lambda, int bChroma)
48{
49 /* 4 times higher, because chroma is analyzed at full resolution. */
50 if (bChroma)
51 lambda *= 4;
52 int denomCost = bs_size_ue(w[0].log2WeightDenom) * (2 - bChroma);
53 return lambda * (10 + denomCost + 2 * (bs_size_se(w[0].inputWeight) + bs_size_se(w[0].inputOffset)));
54}
55
56/* make a motion compensated copy of lowres ref into mcout with the same stride.
57 * The borders of mcout are not extended */
58void mcLuma(pixel* mcout, Lowres& ref, const MV * mvs)
59{
60 intptr_t stride = ref.lumaStride;
61 const int cuSize = 8;
62 MV mvmin, mvmax;
63
64 int cu = 0;
65
66 for (int y = 0; y < ref.lines; y += cuSize)
67 {
68 intptr_t pixoff = y * stride;
69 mvmin.y = (int16_t)((-y - 8) << 2);
70 mvmax.y = (int16_t)((ref.lines - y - 1 + 8) << 2);
71
72 for (int x = 0; x < ref.width; x += cuSize, pixoff += cuSize, cu++)
73 {
74 ALIGN_VAR_16(pixel, buf8x8[8 * 8]);
75 intptr_t bstride = 8;
76 mvmin.x = (int16_t)((-x - 8) << 2);
77 mvmax.x = (int16_t)((ref.width - x - 1 + 8) << 2);
78
79 /* clip MV to available pixels */
80 MV mv = mvs[cu];
81 mv = mv.clipped(mvmin, mvmax);
82 pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride);
83 primitives.luma_copy_pp[LUMA_8x8](mcout + pixoff, stride, tmp, bstride);
84 }
85 }
86}
87
88/* use lowres MVs from lookahead to generate a motion compensated chroma plane.
89 * if a block had cheaper lowres cost as intra, we treat it as MV 0 */
90void mcChroma(pixel * mcout,
91 pixel * src,
92 intptr_t stride,
93 const MV * mvs,
94 const Cache& cache,
95 int height,
96 int width)
97{
98 /* the motion vectors correspond to 8x8 lowres luma blocks, or 16x16 fullres
99 * luma blocks. We have to adapt block size to chroma csp */
100 int csp = cache.csp;
101 int bw = 16 >> cache.hshift;
102 int bh = 16 >> cache.vshift;
103 MV mvmin, mvmax;
104
105 for (int y = 0; y < height; y += bh)
106 {
107 /* note: lowres block count per row might be different from chroma block
108 * count per row because of rounding issues, so be very careful with indexing
109 * into the lowres structures */
110 int cu = y * cache.lowresWidthInCU;
111 intptr_t pixoff = y * stride;
112 mvmin.y = (int16_t)((-y - 8) << 2);
113 mvmax.y = (int16_t)((height - y - 1 + 8) << 2);
114
115 for (int x = 0; x < width; x += bw, cu++, pixoff += bw)
116 {
117 if (x < cache.lowresWidthInCU && y < cache.lowresHeightInCU)
118 {
119 MV mv = mvs[cu]; // lowres MV
120 mv <<= 1; // fullres MV
121 mv.x >>= cache.hshift;
122 mv.y >>= cache.vshift;
123
124 /* clip MV to available pixels */
125 mvmin.x = (int16_t)((-x - 8) << 2);
126 mvmax.x = (int16_t)((width - x - 1 + 8) << 2);
127 mv = mv.clipped(mvmin, mvmax);
128
129 intptr_t fpeloffset = (mv.y >> 2) * stride + (mv.x >> 2);
130 pixel *temp = src + pixoff + fpeloffset;
131
132 int xFrac = mv.x & 0x7;
133 int yFrac = mv.y & 0x7;
134 if ((yFrac | xFrac) == 0)
135 {
136 primitives.chroma[csp].copy_pp[LUMA_16x16](mcout + pixoff, stride, temp, stride);
137 }
138 else if (yFrac == 0)
139 {
140 primitives.chroma[csp].filter_hpp[LUMA_16x16](temp, stride, mcout + pixoff, stride, xFrac);
141 }
142 else if (xFrac == 0)
143 {
144 primitives.chroma[csp].filter_vpp[LUMA_16x16](temp, stride, mcout + pixoff, stride, yFrac);
145 }
146 else
147 {
148 ALIGN_VAR_16(int16_t, imm[16 * (16 + NTAPS_CHROMA)]);
149 primitives.chroma[csp].filter_hps[LUMA_16x16](temp, stride, imm, bw, xFrac, 1);
150 primitives.chroma[csp].filter_vsp[LUMA_16x16](imm + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
151 }
152 }
153 else
154 {
155 primitives.chroma[csp].copy_pp[LUMA_16x16](mcout + pixoff, stride, src + pixoff, stride);
156 }
157 }
158 }
159}
160
161/* Measure sum of 8x8 satd costs between source frame and reference
162 * frame (potentially weighted, potentially motion compensated). We
163 * always use source images for this analysis since reference recon
164 * pixels have unreliable availability */
165uint32_t weightCost(pixel * fenc,
166 pixel * ref,
167 pixel * weightTemp,
168 intptr_t stride,
169 const Cache & cache,
170 int width,
171 int height,
172 WeightParam * w,
173 bool bLuma)
174{
175 if (w)
176 {
177 /* make a weighted copy of the reference plane */
178 int offset = w->inputOffset << (X265_DEPTH - 8);
179 int weight = w->inputWeight;
180 int denom = w->log2WeightDenom;
181 int round = denom ? 1 << (denom - 1) : 0;
182 int correction = IF_INTERNAL_PREC - X265_DEPTH; /* intermediate interpolation depth */
183 int pwidth = ((width + 15) >> 4) << 4;
184
185 primitives.weight_pp(ref, weightTemp, stride, pwidth, height,
186 weight, round << correction, denom + correction, offset);
187 ref = weightTemp;
188 }
189
190 uint32_t cost = 0;
191 pixel *f = fenc, *r = ref;
192
193 if (bLuma)
194 {
195 int cu = 0;
196 for (int y = 8; y < height; y += 8, r += 8 * stride, f += 8 * stride)
197 {
198 for (int x = 8; x < width; x += 8, cu++)
199 {
200 int cmp = primitives.satd[LUMA_8x8](r + x, stride, f + x, stride);
201 cost += X265_MIN(cmp, cache.intraCost[cu]);
202 }
203 }
204 }
205 else if (cache.csp == X265_CSP_I444)
206 for (int y = 16; y < height; y += 16, r += 16 * stride, f += 16 * stride)
207 for (int x = 16; x < width; x += 16)
208 cost += primitives.satd[LUMA_16x16](r + x, stride, f + x, stride);
209 else
210 for (int y = 8; y < height; y += 8, r += 8 * stride, f += 8 * stride)
211 for (int x = 8; x < width; x += 8)
212 cost += primitives.satd[LUMA_8x8](r + x, stride, f + x, stride);
213
214 return cost;
215}
216}
217
218namespace x265 {
219void weightAnalyse(Slice& slice, Frame& frame, x265_param& param)
220{
221 WeightParam wp[2][MAX_NUM_REF][3];
b53f7c52 222 PicYuv *fencPic = frame.m_fencPic;
72b9787e
JB
223 Lowres& fenc = frame.m_lowres;
224
225 Cache cache;
226
227 memset(&cache, 0, sizeof(cache));
228 cache.intraCost = fenc.intraCost;
229 cache.numPredDir = slice.isInterP() ? 1 : 2;
230 cache.lowresWidthInCU = fenc.width >> 3;
231 cache.lowresHeightInCU = fenc.lines >> 3;
232 cache.csp = fencPic->m_picCsp;
233 cache.hshift = CHROMA_H_SHIFT(cache.csp);
234 cache.vshift = CHROMA_V_SHIFT(cache.csp);
235
236 /* Use single allocation for motion compensated ref and weight buffers */
237 pixel *mcbuf = X265_MALLOC(pixel, 2 * fencPic->m_stride * fencPic->m_picHeight);
238 if (!mcbuf)
239 {
240 slice.disableWeights();
241 return;
242 }
243 pixel *weightTemp = mcbuf + fencPic->m_stride * fencPic->m_picHeight;
244
245 int lambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
246 int curPoc = slice.m_poc;
247 const float epsilon = 1.f / 128.f;
248
249 int chromaDenom, lumaDenom, denom;
250 chromaDenom = lumaDenom = 7;
251 int numpixels[3];
252 int w16 = ((fencPic->m_picWidth + 15) >> 4) << 4;
253 int h16 = ((fencPic->m_picHeight + 15) >> 4) << 4;
254 numpixels[0] = w16 * h16;
255 numpixels[1] = numpixels[2] = numpixels[0] >> (cache.hshift + cache.vshift);
256
257 for (int list = 0; list < cache.numPredDir; list++)
258 {
259 WeightParam *weights = wp[list][0];
260 Frame *refFrame = slice.m_refPicList[list][0];
261 Lowres& refLowres = refFrame->m_lowres;
262 int diffPoc = abs(curPoc - refFrame->m_poc);
263
264 /* prepare estimates */
265 float guessScale[3], fencMean[3], refMean[3];
266 for (int plane = 0; plane < 3; plane++)
267 {
268 SET_WEIGHT(weights[plane], false, 1, 0, 0);
269 uint64_t fencVar = fenc.wp_ssd[plane] + !refLowres.wp_ssd[plane];
270 uint64_t refVar = refLowres.wp_ssd[plane] + !refLowres.wp_ssd[plane];
271 guessScale[plane] = sqrt((float)fencVar / refVar);
272 fencMean[plane] = (float)fenc.wp_sum[plane] / (numpixels[plane]) / (1 << (X265_DEPTH - 8));
273 refMean[plane] = (float)refLowres.wp_sum[plane] / (numpixels[plane]) / (1 << (X265_DEPTH - 8));
274 }
275
276 /* make sure both our scale factors fit */
277 while (!list && chromaDenom > 0)
278 {
279 float thresh = 127.f / (1 << chromaDenom);
280 if (guessScale[1] < thresh && guessScale[2] < thresh)
281 break;
282 chromaDenom--;
283 }
284
285 SET_WEIGHT(weights[1], false, 1 << chromaDenom, chromaDenom, 0);
286 SET_WEIGHT(weights[2], false, 1 << chromaDenom, chromaDenom, 0);
287
288 MV *mvs = NULL;
289
290 for (int plane = 0; plane < 3; plane++)
291 {
292 denom = plane ? chromaDenom : lumaDenom;
293 if (plane && !weights[0].bPresentFlag)
294 break;
295
296 /* Early termination */
297 x265_emms();
298 if (fabsf(refMean[plane] - fencMean[plane]) < 0.5f && fabsf(1.f - guessScale[plane]) < epsilon)
299 {
300 SET_WEIGHT(weights[plane], 0, 1 << denom, denom, 0);
301 continue;
302 }
303
304 if (plane)
305 {
306 int scale = Clip3(0, 255, (int)(guessScale[plane] * (1 << denom) + 0.5f));
307 if (scale > 127)
308 continue;
309 weights[plane].inputWeight = scale;
310 }
311 else
312 {
313 weights[plane].setFromWeightAndOffset((int)(guessScale[plane] * (1 << denom) + 0.5f), 0, denom, !list);
314 }
315
316 int mindenom = weights[plane].log2WeightDenom;
317 int minscale = weights[plane].inputWeight;
318 int minoff = 0;
319
320 if (!plane && diffPoc <= param.bframes + 1)
321 {
322 mvs = fenc.lowresMvs[list][diffPoc - 1];
323
324 /* test whether this motion search was performed by lookahead */
325 if (mvs[0].x != 0x7FFF)
326 {
327 /* reference chroma planes must be extended prior to being
328 * used as motion compensation sources */
329 if (!refFrame->m_bChromaExtended)
330 {
331 refFrame->m_bChromaExtended = true;
b53f7c52 332 PicYuv *refPic = refFrame->m_fencPic;
72b9787e
JB
333 int width = refPic->m_picWidth >> cache.hshift;
334 int height = refPic->m_picHeight >> cache.vshift;
335 extendPicBorder(refPic->m_picOrg[1], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY);
336 extendPicBorder(refPic->m_picOrg[2], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY);
337 }
338 }
339 else
340 mvs = 0;
341 }
342
343 /* prepare inputs to weight analysis */
344 pixel *orig;
345 pixel *fref;
346 intptr_t stride;
347 int width, height;
348 switch (plane)
349 {
350 case 0:
351 orig = fenc.lowresPlane[0];
352 stride = fenc.lumaStride;
353 width = fenc.width;
354 height = fenc.lines;
355 fref = refLowres.lowresPlane[0];
356 if (mvs)
357 {
358 mcLuma(mcbuf, refLowres, mvs);
359 fref = mcbuf;
360 }
361 break;
362
363 case 1:
364 orig = fencPic->m_picOrg[1];
365 stride = fencPic->m_strideC;
b53f7c52 366 fref = refFrame->m_fencPic->m_picOrg[1];
72b9787e
JB
367
368 /* Clamp the chroma dimensions to the nearest multiple of
369 * 8x8 blocks (or 16x16 for 4:4:4) since mcChroma uses lowres
370 * blocks and weightCost measures 8x8 blocks. This
371 * potentially ignores some edge pixels, but simplifies the
372 * logic and prevents reading uninitialized pixels. Lowres
373 * planes are border extended and require no clamping. */
374 width = ((fencPic->m_picWidth >> 4) << 4) >> cache.hshift;
375 height = ((fencPic->m_picHeight >> 4) << 4) >> cache.vshift;
376 if (mvs)
377 {
378 mcChroma(mcbuf, fref, stride, mvs, cache, height, width);
379 fref = mcbuf;
380 }
381 break;
382
383 case 2:
b53f7c52 384 fref = refFrame->m_fencPic->m_picOrg[2];
72b9787e
JB
385 orig = fencPic->m_picOrg[2];
386 stride = fencPic->m_strideC;
387 width = ((fencPic->m_picWidth >> 4) << 4) >> cache.hshift;
388 height = ((fencPic->m_picHeight >> 4) << 4) >> cache.vshift;
389 if (mvs)
390 {
391 mcChroma(mcbuf, fref, stride, mvs, cache, height, width);
392 fref = mcbuf;
393 }
394 break;
395
396 default:
397 slice.disableWeights();
398 X265_FREE(mcbuf);
399 return;
400 }
401
402 uint32_t origscore = weightCost(orig, fref, weightTemp, stride, cache, width, height, NULL, !plane);
403 if (!origscore)
404 {
405 SET_WEIGHT(weights[plane], 0, 1 << denom, denom, 0);
406 continue;
407 }
408
409 uint32_t minscore = origscore;
410 bool bFound = false;
411
412 /* x264 uses a table lookup here, selecting search range based on preset */
413 static const int scaleDist = 4;
414 static const int offsetDist = 2;
415
416 int startScale = Clip3(0, 127, minscale - scaleDist);
417 int endScale = Clip3(0, 127, minscale + scaleDist);
418 for (int scale = startScale; scale <= endScale; scale++)
419 {
420 int deltaWeight = scale - (1 << mindenom);
421 if (deltaWeight > 127 || deltaWeight <= -128)
422 continue;
423
424 x265_emms();
425 int curScale = scale;
426 int curOffset = (int)(fencMean[plane] - refMean[plane] * curScale / (1 << mindenom) + 0.5f);
427 if (curOffset < -128 || curOffset > 127)
428 {
429 /* Rescale considering the constraints on curOffset. We do it in this order
430 * because scale has a much wider range than offset (because of denom), so
431 * it should almost never need to be clamped. */
432 curOffset = Clip3(-128, 127, curOffset);
433 curScale = (int)((1 << mindenom) * (fencMean[plane] - curOffset) / refMean[plane] + 0.5f);
434 curScale = Clip3(0, 127, curScale);
435 }
436
437 int startOffset = Clip3(-128, 127, curOffset - offsetDist);
438 int endOffset = Clip3(-128, 127, curOffset + offsetDist);
439 for (int off = startOffset; off <= endOffset; off++)
440 {
441 WeightParam wsp;
442 SET_WEIGHT(wsp, true, curScale, mindenom, off);
443 uint32_t s = weightCost(orig, fref, weightTemp, stride, cache, width, height, &wsp, !plane) +
444 sliceHeaderCost(&wsp, lambda, !!plane);
445 COPY4_IF_LT(minscore, s, minscale, curScale, minoff, off, bFound, true);
446
447 /* Don't check any more offsets if the previous one had a lower cost than the current one */
448 if (minoff == startOffset && off != startOffset)
449 break;
450 }
451 }
452
453 /* Use a smaller luma denominator if possible */
454 if (!(plane || list))
455 {
456 while (mindenom > 0 && !(minscale & 1))
457 {
458 mindenom--;
459 minscale >>= 1;
460 }
461 }
462
463 if (!bFound || (minscale == (1 << mindenom) && minoff == 0) || (float)minscore / origscore > 0.998f)
464 {
465 SET_WEIGHT(weights[plane], false, 1 << denom, denom, 0);
466 }
467 else
468 {
469 SET_WEIGHT(weights[plane], true, minscale, mindenom, minoff);
470 }
471 }
472
473 if (weights[0].bPresentFlag)
474 {
475 // Make sure both chroma channels match
476 if (weights[1].bPresentFlag != weights[2].bPresentFlag)
477 {
478 if (weights[1].bPresentFlag)
479 weights[2] = weights[1];
480 else
481 weights[1] = weights[2];
482 }
483 }
484
485 lumaDenom = weights[0].log2WeightDenom;
486 chromaDenom = weights[1].log2WeightDenom;
487
488 /* reset weight states */
489 for (int ref = 1; ref < slice.m_numRefIdx[list]; ref++)
490 {
491 SET_WEIGHT(wp[list][ref][0], false, 1 << lumaDenom, lumaDenom, 0);
492 SET_WEIGHT(wp[list][ref][1], false, 1 << chromaDenom, chromaDenom, 0);
493 SET_WEIGHT(wp[list][ref][2], false, 1 << chromaDenom, chromaDenom, 0);
494 }
495 }
496
497 X265_FREE(mcbuf);
498
499 memcpy(slice.m_weightPredTable, wp, sizeof(WeightParam) * 2 * MAX_NUM_REF * 3);
500
501 if (param.logLevel >= X265_LOG_FULL)
502 {
503 char buf[1024];
504 int p = 0;
505 bool bWeighted = false;
506
507 p = sprintf(buf, "poc: %d weights:", slice.m_poc);
508 int numPredDir = slice.isInterP() ? 1 : 2;
509 for (int list = 0; list < numPredDir; list++)
510 {
511 WeightParam* w = &wp[list][0][0];
512 if (w[0].bPresentFlag || w[1].bPresentFlag || w[2].bPresentFlag)
513 {
514 bWeighted = true;
515 p += sprintf(buf + p, " [L%d:R0 ", list);
516 if (w[0].bPresentFlag)
517 p += sprintf(buf + p, "Y{%d/%d%+d}", w[0].inputWeight, 1 << w[0].log2WeightDenom, w[0].inputOffset);
518 if (w[1].bPresentFlag)
519 p += sprintf(buf + p, "U{%d/%d%+d}", w[1].inputWeight, 1 << w[1].log2WeightDenom, w[1].inputOffset);
520 if (w[2].bPresentFlag)
521 p += sprintf(buf + p, "V{%d/%d%+d}", w[2].inputWeight, 1 << w[2].log2WeightDenom, w[2].inputOffset);
522 p += sprintf(buf + p, "]");
523 }
524 }
525
526 if (bWeighted)
527 {
528 if (p < 80) // pad with spaces to ensure progress line overwritten
529 sprintf(buf + p, "%*s", 80 - p, " ");
530 x265_log(&param, X265_LOG_FULL, "%s\n", buf);
531 }
532 }
533}
534}