1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
4 * Author: Shazeb Nawaz Khan <shazeb@multicorewareinc.com>
5 * Steve Borho <steve@borho.org>
6 * Kavitha Sampas <kavitha@multicorewareinc.com>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 * This program is also available under a commercial proprietary license.
23 * For more information, contact us at license @ x265.com.
24 *****************************************************************************/
31 #include "slicetype.h"
32 #include "bitstream.h"
38 const int * intraCost
;
47 int sliceHeaderCost(WeightParam
*w
, int lambda
, int bChroma
)
49 /* 4 times higher, because chroma is analyzed at full resolution. */
52 int denomCost
= bs_size_ue(w
[0].log2WeightDenom
) * (2 - bChroma
);
53 return lambda
* (10 + denomCost
+ 2 * (bs_size_se(w
[0].inputWeight
) + bs_size_se(w
[0].inputOffset
)));
56 /* make a motion compensated copy of lowres ref into mcout with the same stride.
57 * The borders of mcout are not extended */
58 void mcLuma(pixel
* mcout
, Lowres
& ref
, const MV
* mvs
)
60 intptr_t stride
= ref
.lumaStride
;
66 for (int y
= 0; y
< ref
.lines
; y
+= cuSize
)
68 intptr_t pixoff
= y
* stride
;
69 mvmin
.y
= (int16_t)((-y
- 8) << 2);
70 mvmax
.y
= (int16_t)((ref
.lines
- y
- 1 + 8) << 2);
72 for (int x
= 0; x
< ref
.width
; x
+= cuSize
, pixoff
+= cuSize
, cu
++)
74 ALIGN_VAR_16(pixel
, buf8x8
[8 * 8]);
76 mvmin
.x
= (int16_t)((-x
- 8) << 2);
77 mvmax
.x
= (int16_t)((ref
.width
- x
- 1 + 8) << 2);
79 /* clip MV to available pixels */
81 mv
= mv
.clipped(mvmin
, mvmax
);
82 pixel
*tmp
= ref
.lowresMC(pixoff
, mv
, buf8x8
, bstride
);
83 primitives
.luma_copy_pp
[LUMA_8x8
](mcout
+ pixoff
, stride
, tmp
, bstride
);
88 /* use lowres MVs from lookahead to generate a motion compensated chroma plane.
89 * if a block had cheaper lowres cost as intra, we treat it as MV 0 */
90 void mcChroma(pixel
* mcout
,
98 /* the motion vectors correspond to 8x8 lowres luma blocks, or 16x16 fullres
99 * luma blocks. We have to adapt block size to chroma csp */
101 int bw
= 16 >> cache
.hshift
;
102 int bh
= 16 >> cache
.vshift
;
105 for (int y
= 0; y
< height
; y
+= bh
)
107 /* note: lowres block count per row might be different from chroma block
108 * count per row because of rounding issues, so be very careful with indexing
109 * into the lowres structures */
110 int cu
= y
* cache
.lowresWidthInCU
;
111 intptr_t pixoff
= y
* stride
;
112 mvmin
.y
= (int16_t)((-y
- 8) << 2);
113 mvmax
.y
= (int16_t)((height
- y
- 1 + 8) << 2);
115 for (int x
= 0; x
< width
; x
+= bw
, cu
++, pixoff
+= bw
)
117 if (x
< cache
.lowresWidthInCU
&& y
< cache
.lowresHeightInCU
)
119 MV mv
= mvs
[cu
]; // lowres MV
120 mv
<<= 1; // fullres MV
121 mv
.x
>>= cache
.hshift
;
122 mv
.y
>>= cache
.vshift
;
124 /* clip MV to available pixels */
125 mvmin
.x
= (int16_t)((-x
- 8) << 2);
126 mvmax
.x
= (int16_t)((width
- x
- 1 + 8) << 2);
127 mv
= mv
.clipped(mvmin
, mvmax
);
129 intptr_t fpeloffset
= (mv
.y
>> 2) * stride
+ (mv
.x
>> 2);
130 pixel
*temp
= src
+ pixoff
+ fpeloffset
;
132 int xFrac
= mv
.x
& 0x7;
133 int yFrac
= mv
.y
& 0x7;
134 if ((yFrac
| xFrac
) == 0)
136 primitives
.chroma
[csp
].copy_pp
[LUMA_16x16
](mcout
+ pixoff
, stride
, temp
, stride
);
140 primitives
.chroma
[csp
].filter_hpp
[LUMA_16x16
](temp
, stride
, mcout
+ pixoff
, stride
, xFrac
);
144 primitives
.chroma
[csp
].filter_vpp
[LUMA_16x16
](temp
, stride
, mcout
+ pixoff
, stride
, yFrac
);
148 ALIGN_VAR_16(int16_t, imm
[16 * (16 + NTAPS_CHROMA
)]);
149 primitives
.chroma
[csp
].filter_hps
[LUMA_16x16
](temp
, stride
, imm
, bw
, xFrac
, 1);
150 primitives
.chroma
[csp
].filter_vsp
[LUMA_16x16
](imm
+ ((NTAPS_CHROMA
>> 1) - 1) * bw
, bw
, mcout
+ pixoff
, stride
, yFrac
);
155 primitives
.chroma
[csp
].copy_pp
[LUMA_16x16
](mcout
+ pixoff
, stride
, src
+ pixoff
, stride
);
161 /* Measure sum of 8x8 satd costs between source frame and reference
162 * frame (potentially weighted, potentially motion compensated). We
163 * always use source images for this analysis since reference recon
164 * pixels have unreliable availability */
165 uint32_t weightCost(pixel
* fenc
,
177 /* make a weighted copy of the reference plane */
178 int offset
= w
->inputOffset
<< (X265_DEPTH
- 8);
179 int weight
= w
->inputWeight
;
180 int denom
= w
->log2WeightDenom
;
181 int round
= denom
? 1 << (denom
- 1) : 0;
182 int correction
= IF_INTERNAL_PREC
- X265_DEPTH
; /* intermediate interpolation depth */
183 int pwidth
= ((width
+ 15) >> 4) << 4;
185 primitives
.weight_pp(ref
, weightTemp
, stride
, pwidth
, height
,
186 weight
, round
<< correction
, denom
+ correction
, offset
);
191 pixel
*f
= fenc
, *r
= ref
;
196 for (int y
= 8; y
< height
; y
+= 8, r
+= 8 * stride
, f
+= 8 * stride
)
198 for (int x
= 8; x
< width
; x
+= 8, cu
++)
200 int cmp
= primitives
.satd
[LUMA_8x8
](r
+ x
, stride
, f
+ x
, stride
);
201 cost
+= X265_MIN(cmp
, cache
.intraCost
[cu
]);
205 else if (cache
.csp
== X265_CSP_I444
)
206 for (int y
= 16; y
< height
; y
+= 16, r
+= 16 * stride
, f
+= 16 * stride
)
207 for (int x
= 16; x
< width
; x
+= 16)
208 cost
+= primitives
.satd
[LUMA_16x16
](r
+ x
, stride
, f
+ x
, stride
);
210 for (int y
= 8; y
< height
; y
+= 8, r
+= 8 * stride
, f
+= 8 * stride
)
211 for (int x
= 8; x
< width
; x
+= 8)
212 cost
+= primitives
.satd
[LUMA_8x8
](r
+ x
, stride
, f
+ x
, stride
);
219 void weightAnalyse(Slice
& slice
, Frame
& frame
, x265_param
& param
)
221 WeightParam wp
[2][MAX_NUM_REF
][3];
222 PicYuv
*fencPic
= frame
.m_fencPic
;
223 Lowres
& fenc
= frame
.m_lowres
;
227 memset(&cache
, 0, sizeof(cache
));
228 cache
.intraCost
= fenc
.intraCost
;
229 cache
.numPredDir
= slice
.isInterP() ? 1 : 2;
230 cache
.lowresWidthInCU
= fenc
.width
>> 3;
231 cache
.lowresHeightInCU
= fenc
.lines
>> 3;
232 cache
.csp
= fencPic
->m_picCsp
;
233 cache
.hshift
= CHROMA_H_SHIFT(cache
.csp
);
234 cache
.vshift
= CHROMA_V_SHIFT(cache
.csp
);
236 /* Use single allocation for motion compensated ref and weight buffers */
237 pixel
*mcbuf
= X265_MALLOC(pixel
, 2 * fencPic
->m_stride
* fencPic
->m_picHeight
);
240 slice
.disableWeights();
243 pixel
*weightTemp
= mcbuf
+ fencPic
->m_stride
* fencPic
->m_picHeight
;
245 int lambda
= (int)x265_lambda_tab
[X265_LOOKAHEAD_QP
];
246 int curPoc
= slice
.m_poc
;
247 const float epsilon
= 1.f
/ 128.f
;
249 int chromaDenom
, lumaDenom
, denom
;
250 chromaDenom
= lumaDenom
= 7;
252 int w16
= ((fencPic
->m_picWidth
+ 15) >> 4) << 4;
253 int h16
= ((fencPic
->m_picHeight
+ 15) >> 4) << 4;
254 numpixels
[0] = w16
* h16
;
255 numpixels
[1] = numpixels
[2] = numpixels
[0] >> (cache
.hshift
+ cache
.vshift
);
257 for (int list
= 0; list
< cache
.numPredDir
; list
++)
259 WeightParam
*weights
= wp
[list
][0];
260 Frame
*refFrame
= slice
.m_refPicList
[list
][0];
261 Lowres
& refLowres
= refFrame
->m_lowres
;
262 int diffPoc
= abs(curPoc
- refFrame
->m_poc
);
264 /* prepare estimates */
265 float guessScale
[3], fencMean
[3], refMean
[3];
266 for (int plane
= 0; plane
< 3; plane
++)
268 SET_WEIGHT(weights
[plane
], false, 1, 0, 0);
269 uint64_t fencVar
= fenc
.wp_ssd
[plane
] + !refLowres
.wp_ssd
[plane
];
270 uint64_t refVar
= refLowres
.wp_ssd
[plane
] + !refLowres
.wp_ssd
[plane
];
271 guessScale
[plane
] = sqrt((float)fencVar
/ refVar
);
272 fencMean
[plane
] = (float)fenc
.wp_sum
[plane
] / (numpixels
[plane
]) / (1 << (X265_DEPTH
- 8));
273 refMean
[plane
] = (float)refLowres
.wp_sum
[plane
] / (numpixels
[plane
]) / (1 << (X265_DEPTH
- 8));
276 /* make sure both our scale factors fit */
277 while (!list
&& chromaDenom
> 0)
279 float thresh
= 127.f
/ (1 << chromaDenom
);
280 if (guessScale
[1] < thresh
&& guessScale
[2] < thresh
)
285 SET_WEIGHT(weights
[1], false, 1 << chromaDenom
, chromaDenom
, 0);
286 SET_WEIGHT(weights
[2], false, 1 << chromaDenom
, chromaDenom
, 0);
290 for (int plane
= 0; plane
< 3; plane
++)
292 denom
= plane
? chromaDenom
: lumaDenom
;
293 if (plane
&& !weights
[0].bPresentFlag
)
296 /* Early termination */
298 if (fabsf(refMean
[plane
] - fencMean
[plane
]) < 0.5f
&& fabsf(1.f
- guessScale
[plane
]) < epsilon
)
300 SET_WEIGHT(weights
[plane
], 0, 1 << denom
, denom
, 0);
306 int scale
= Clip3(0, 255, (int)(guessScale
[plane
] * (1 << denom
) + 0.5f
));
309 weights
[plane
].inputWeight
= scale
;
313 weights
[plane
].setFromWeightAndOffset((int)(guessScale
[plane
] * (1 << denom
) + 0.5f
), 0, denom
, !list
);
316 int mindenom
= weights
[plane
].log2WeightDenom
;
317 int minscale
= weights
[plane
].inputWeight
;
320 if (!plane
&& diffPoc
<= param
.bframes
+ 1)
322 mvs
= fenc
.lowresMvs
[list
][diffPoc
- 1];
324 /* test whether this motion search was performed by lookahead */
325 if (mvs
[0].x
!= 0x7FFF)
327 /* reference chroma planes must be extended prior to being
328 * used as motion compensation sources */
329 if (!refFrame
->m_bChromaExtended
)
331 refFrame
->m_bChromaExtended
= true;
332 PicYuv
*refPic
= refFrame
->m_fencPic
;
333 int width
= refPic
->m_picWidth
>> cache
.hshift
;
334 int height
= refPic
->m_picHeight
>> cache
.vshift
;
335 extendPicBorder(refPic
->m_picOrg
[1], refPic
->m_strideC
, width
, height
, refPic
->m_chromaMarginX
, refPic
->m_chromaMarginY
);
336 extendPicBorder(refPic
->m_picOrg
[2], refPic
->m_strideC
, width
, height
, refPic
->m_chromaMarginX
, refPic
->m_chromaMarginY
);
343 /* prepare inputs to weight analysis */
351 orig
= fenc
.lowresPlane
[0];
352 stride
= fenc
.lumaStride
;
355 fref
= refLowres
.lowresPlane
[0];
358 mcLuma(mcbuf
, refLowres
, mvs
);
364 orig
= fencPic
->m_picOrg
[1];
365 stride
= fencPic
->m_strideC
;
366 fref
= refFrame
->m_fencPic
->m_picOrg
[1];
368 /* Clamp the chroma dimensions to the nearest multiple of
369 * 8x8 blocks (or 16x16 for 4:4:4) since mcChroma uses lowres
370 * blocks and weightCost measures 8x8 blocks. This
371 * potentially ignores some edge pixels, but simplifies the
372 * logic and prevents reading uninitialized pixels. Lowres
373 * planes are border extended and require no clamping. */
374 width
= ((fencPic
->m_picWidth
>> 4) << 4) >> cache
.hshift
;
375 height
= ((fencPic
->m_picHeight
>> 4) << 4) >> cache
.vshift
;
378 mcChroma(mcbuf
, fref
, stride
, mvs
, cache
, height
, width
);
384 fref
= refFrame
->m_fencPic
->m_picOrg
[2];
385 orig
= fencPic
->m_picOrg
[2];
386 stride
= fencPic
->m_strideC
;
387 width
= ((fencPic
->m_picWidth
>> 4) << 4) >> cache
.hshift
;
388 height
= ((fencPic
->m_picHeight
>> 4) << 4) >> cache
.vshift
;
391 mcChroma(mcbuf
, fref
, stride
, mvs
, cache
, height
, width
);
397 slice
.disableWeights();
402 uint32_t origscore
= weightCost(orig
, fref
, weightTemp
, stride
, cache
, width
, height
, NULL
, !plane
);
405 SET_WEIGHT(weights
[plane
], 0, 1 << denom
, denom
, 0);
409 uint32_t minscore
= origscore
;
412 /* x264 uses a table lookup here, selecting search range based on preset */
413 static const int scaleDist
= 4;
414 static const int offsetDist
= 2;
416 int startScale
= Clip3(0, 127, minscale
- scaleDist
);
417 int endScale
= Clip3(0, 127, minscale
+ scaleDist
);
418 for (int scale
= startScale
; scale
<= endScale
; scale
++)
420 int deltaWeight
= scale
- (1 << mindenom
);
421 if (deltaWeight
> 127 || deltaWeight
<= -128)
425 int curScale
= scale
;
426 int curOffset
= (int)(fencMean
[plane
] - refMean
[plane
] * curScale
/ (1 << mindenom
) + 0.5f
);
427 if (curOffset
< -128 || curOffset
> 127)
429 /* Rescale considering the constraints on curOffset. We do it in this order
430 * because scale has a much wider range than offset (because of denom), so
431 * it should almost never need to be clamped. */
432 curOffset
= Clip3(-128, 127, curOffset
);
433 curScale
= (int)((1 << mindenom
) * (fencMean
[plane
] - curOffset
) / refMean
[plane
] + 0.5f
);
434 curScale
= Clip3(0, 127, curScale
);
437 int startOffset
= Clip3(-128, 127, curOffset
- offsetDist
);
438 int endOffset
= Clip3(-128, 127, curOffset
+ offsetDist
);
439 for (int off
= startOffset
; off
<= endOffset
; off
++)
442 SET_WEIGHT(wsp
, true, curScale
, mindenom
, off
);
443 uint32_t s
= weightCost(orig
, fref
, weightTemp
, stride
, cache
, width
, height
, &wsp
, !plane
) +
444 sliceHeaderCost(&wsp
, lambda
, !!plane
);
445 COPY4_IF_LT(minscore
, s
, minscale
, curScale
, minoff
, off
, bFound
, true);
447 /* Don't check any more offsets if the previous one had a lower cost than the current one */
448 if (minoff
== startOffset
&& off
!= startOffset
)
453 /* Use a smaller luma denominator if possible */
454 if (!(plane
|| list
))
456 while (mindenom
> 0 && !(minscale
& 1))
463 if (!bFound
|| (minscale
== (1 << mindenom
) && minoff
== 0) || (float)minscore
/ origscore
> 0.998f
)
465 SET_WEIGHT(weights
[plane
], false, 1 << denom
, denom
, 0);
469 SET_WEIGHT(weights
[plane
], true, minscale
, mindenom
, minoff
);
473 if (weights
[0].bPresentFlag
)
475 // Make sure both chroma channels match
476 if (weights
[1].bPresentFlag
!= weights
[2].bPresentFlag
)
478 if (weights
[1].bPresentFlag
)
479 weights
[2] = weights
[1];
481 weights
[1] = weights
[2];
485 lumaDenom
= weights
[0].log2WeightDenom
;
486 chromaDenom
= weights
[1].log2WeightDenom
;
488 /* reset weight states */
489 for (int ref
= 1; ref
< slice
.m_numRefIdx
[list
]; ref
++)
491 SET_WEIGHT(wp
[list
][ref
][0], false, 1 << lumaDenom
, lumaDenom
, 0);
492 SET_WEIGHT(wp
[list
][ref
][1], false, 1 << chromaDenom
, chromaDenom
, 0);
493 SET_WEIGHT(wp
[list
][ref
][2], false, 1 << chromaDenom
, chromaDenom
, 0);
499 memcpy(slice
.m_weightPredTable
, wp
, sizeof(WeightParam
) * 2 * MAX_NUM_REF
* 3);
501 if (param
.logLevel
>= X265_LOG_FULL
)
505 bool bWeighted
= false;
507 p
= sprintf(buf
, "poc: %d weights:", slice
.m_poc
);
508 int numPredDir
= slice
.isInterP() ? 1 : 2;
509 for (int list
= 0; list
< numPredDir
; list
++)
511 WeightParam
* w
= &wp
[list
][0][0];
512 if (w
[0].bPresentFlag
|| w
[1].bPresentFlag
|| w
[2].bPresentFlag
)
515 p
+= sprintf(buf
+ p
, " [L%d:R0 ", list
);
516 if (w
[0].bPresentFlag
)
517 p
+= sprintf(buf
+ p
, "Y{%d/%d%+d}", w
[0].inputWeight
, 1 << w
[0].log2WeightDenom
, w
[0].inputOffset
);
518 if (w
[1].bPresentFlag
)
519 p
+= sprintf(buf
+ p
, "U{%d/%d%+d}", w
[1].inputWeight
, 1 << w
[1].log2WeightDenom
, w
[1].inputOffset
);
520 if (w
[2].bPresentFlag
)
521 p
+= sprintf(buf
+ p
, "V{%d/%d%+d}", w
[2].inputWeight
, 1 << w
[2].log2WeightDenom
, w
[2].inputOffset
);
522 p
+= sprintf(buf
+ p
, "]");
528 if (p
< 80) // pad with spaces to ensure progress line overwritten
529 sprintf(buf
+ p
, "%*s", 80 - p
, " ");
530 x265_log(&param
, X265_LOG_FULL
, "%s\n", buf
);