/*****************************************************************************
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/
#include "primitives.h"

#if _MSC_VER
#pragma warning(disable: 4127) // conditional expression is constant (macros use this construct)
#endif
const SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] =
{
    { 1, 4, 0, 4, false }, // 4 SAD HPEL only
    { 1, 4, 1, 4, false }, // 4 SAD HPEL + 4 SATD QPEL
    { 1, 4, 1, 4, true },  // 4 SATD HPEL + 4 SATD QPEL
    { 2, 4, 1, 4, true },  // 2x4 SATD HPEL + 4 SATD QPEL
    { 2, 4, 2, 4, true },  // 2x4 SATD HPEL + 2x4 SATD QPEL
    { 1, 8, 1, 8, true },  // 8 SATD HPEL + 8 SATD QPEL (default)
    { 2, 8, 1, 8, true },  // 2x8 SATD HPEL + 8 SATD QPEL
    { 2, 8, 2, 8, true },  // 2x8 SATD HPEL + 2x8 SATD QPEL
};
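/* Each row is { hpel_iters, hpel_dirs, qpel_iters, qpel_dirs, hpel_satd }:
 * iteration and direction counts for the half-pel and quarter-pel refinement
 * loops, plus whether half-pel uses SATD rather than SAD (field names as
 * consumed by motionEstimate() below). */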
int sizeScale[NUM_LUMA_PARTITIONS];
#define SAD_THRESH(v) (bcost < (((v >> 4) * sizeScale[partEnum])))
#define SETUP_SCALE(W, H) \
    sizeScale[LUMA_ ## W ## x ## H] = (W * H) >> 4; // partition area in units of 4x4 blocks
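/* e.g. SETUP_SCALE(16, 16) gives sizeScale[LUMA_16x16] = 16, so SAD_THRESH(2000)
 * expands to bcost < (2000 >> 4) * 16 == 2000; smaller partitions scale the
 * threshold down in proportion to their area */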
/* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) };
const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 }; /* (x - 1) % 6 */
const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) };
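/* square1[0] is the null vector; entries 1..4 are the diamond neighbors and
 * 5..8 the diagonals, so a best-direction index of 0 means "no improvement" */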
const MV hex4[16] =
{
    MV(0, -4), MV(0, 4), MV(-2, -3), MV(2, -3),
    MV(-4, -2), MV(4, -2), MV(-4, -1), MV(4, -1),
    MV(-4, 0), MV(4, 0), MV(-4, 1), MV(4, 1),
    MV(-4, 2), MV(4, 2), MV(-2, 3), MV(2, 3),
};
const MV offsets[] =
{
    MV(-1, 0), MV(0, -1),
    MV(-1, -1), MV(1, -1),
    MV(-1, 0), MV(1, 0),
    MV(-1, 1), MV(-1, -1),
    MV(1, -1), MV(1, 1),
    MV(-1, 0), MV(0, 1),
    MV(-1, 1), MV(1, 1),
    MV(1, 0), MV(0, 1),
}; // offsets for Two Point Search
/* sum of absolute differences between MV candidates, used for adaptive ME range */
inline int predictorDifference(const MV *mvc, intptr_t numCandidates)
{
    int sum = 0;

    for (int i = 0; i < numCandidates - 1; i++)
    {
        sum += abs(mvc[i].x - mvc[i + 1].x)
             + abs(mvc[i].y - mvc[i + 1].y);
    }

    return sum;
}
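/* e.g. candidates { (0,0), (4,0), (4,4) } give (|0-4| + |0-0|) + (|4-4| + |0-4|) = 8 */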
MotionEstimate::MotionEstimate()
{
    searchMethod = X265_HEX_SEARCH;
}
void MotionEstimate::init(int method, int refine, int csp)
{
    searchMethod = method;
    subpelRefine = refine;
    fencPUYuv.create(FENC_STRIDE, csp);
}
MotionEstimate::~MotionEstimate()
{
    fencPUYuv.destroy();
}
/* Called by lookahead, luma only, no use of PicYuv */
void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight)
{
    partEnum = partitionFromSizes(pwidth, pheight);
    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
    sad = primitives.sad[partEnum];
    satd = primitives.satd[partEnum];
    sad_x3 = primitives.sad_x3[partEnum];
    sad_x4 = primitives.sad_x4[partEnum];

    blockwidth = pwidth;
    blockOffset = offset;
    absPartIdx = ctuAddr = -1;

    /* copy PU block into cache */
    primitives.luma_copy_pp[partEnum](fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
    X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
}
/* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */
void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight)
{
    partEnum = partitionFromSizes(pwidth, pheight);
    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
    sad = primitives.sad[partEnum];
    satd = primitives.satd[partEnum];
    sad_x3 = primitives.sad_x3[partEnum];
    sad_x4 = primitives.sad_x4[partEnum];
    chromaSatd = primitives.chroma[fencPUYuv.m_csp].satd[partEnum];

    /* Enable chroma residual cost if subpelRefine level is greater than 2 and chroma block size
     * is an even multiple of 4x4 pixels (indicated by non-null chromaSatd pointer) */
    bChromaSATD = subpelRefine > 2 && chromaSatd;
    X265_CHECK(!(bChromaSATD && !workload[subpelRefine].hpel_satd), "Chroma SATD cannot be used with SAD hpel\n");

    ctuAddr = _ctuAddr;
    absPartIdx = cuPartIdx + puPartIdx;
    blockwidth = pwidth;

    /* copy PU from CU Yuv */
    fencPUYuv.copyPUFromYuv(srcFencYuv, puPartIdx, partEnum, bChromaSATD);
}
#define COST_MV_PT_DIST(mx, my, point, dist) \
    { \
        MV tmv(mx, my); \
        int cost = sad(fenc, FENC_STRIDE, fref + mx + my * stride, stride); \
        cost += mvcost(tmv << 2); \
        if (cost < bcost) \
        { \
            bcost = cost; \
            bmv = tmv; \
            bPointNr = point; \
            bDistance = dist; \
        } \
    }
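/* search MVs are kept in full-pel units; << 2 converts to quarter-pel before
 * mvcost(), which prices the MV difference in qpel (bmv itself is promoted
 * with toQPel() only after the integer search completes) */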
#define COST_MV(mx, my) \
    { \
        int cost = sad(fenc, FENC_STRIDE, fref + (mx) + (my) * stride, stride); \
        cost += mvcost(MV(mx, my) << 2); \
        COPY2_IF_LT(bcost, cost, bmv, MV(mx, my)); \
    }
#define COST_MV_X3_DIR(m0x, m0y, m1x, m1y, m2x, m2y, costs) \
    { \
        pixel *pix_base = fref + bmv.x + bmv.y * stride; \
        sad_x3(fenc, \
               pix_base + (m0x) + (m0y) * stride, \
               pix_base + (m1x) + (m1y) * stride, \
               pix_base + (m2x) + (m2y) * stride, \
               stride, costs); \
        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
    }
#define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \
    { \
        sad_x4(fenc, \
               fref + (m0x) + (m0y) * stride, \
               fref + (m1x) + (m1y) * stride, \
               fref + (m2x) + (m2y) * stride, \
               fref + (m3x) + (m3y) * stride, \
               stride, costs); \
        costs[0] += mvcost(MV(m0x, m0y) << 2); \
        costs[1] += mvcost(MV(m1x, m1y) << 2); \
        costs[2] += mvcost(MV(m2x, m2y) << 2); \
        costs[3] += mvcost(MV(m3x, m3y) << 2); \
        COPY4_IF_LT(bcost, costs[0], bmv, MV(m0x, m0y), bPointNr, p0, bDistance, d0); \
        COPY4_IF_LT(bcost, costs[1], bmv, MV(m1x, m1y), bPointNr, p1, bDistance, d1); \
        COPY4_IF_LT(bcost, costs[2], bmv, MV(m2x, m2y), bPointNr, p2, bDistance, d2); \
        COPY4_IF_LT(bcost, costs[3], bmv, MV(m3x, m3y), bPointNr, p3, bDistance, d3); \
    }
#define COST_MV_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y) \
    { \
        pixel *pix_base = fref + omv.x + omv.y * stride; \
        sad_x4(fenc, \
               pix_base + (m0x) + (m0y) * stride, \
               pix_base + (m1x) + (m1y) * stride, \
               pix_base + (m2x) + (m2y) * stride, \
               pix_base + (m3x) + (m3y) * stride, \
               stride, costs); \
        costs[0] += mvcost((omv + MV(m0x, m0y)) << 2); \
        costs[1] += mvcost((omv + MV(m1x, m1y)) << 2); \
        costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \
        costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \
        COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
        COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
        COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
        COPY2_IF_LT(bcost, costs[3], bmv, omv + MV(m3x, m3y)); \
    }
#define COST_MV_X4_DIR(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs) \
    { \
        pixel *pix_base = fref + bmv.x + bmv.y * stride; \
        sad_x4(fenc, \
               pix_base + (m0x) + (m0y) * stride, \
               pix_base + (m1x) + (m1y) * stride, \
               pix_base + (m2x) + (m2y) * stride, \
               pix_base + (m3x) + (m3y) * stride, \
               stride, costs); \
        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
        (costs)[3] += mvcost((bmv + MV(m3x, m3y)) << 2); \
    }
#define DIA1_ITER(mx, my) \
    { \
        omv.x = mx; omv.y = my; \
        COST_MV_X4(0, -1, 0, 1, -1, 0, 1, 0); \
    }
#define CROSS(start, x_max, y_max) \
    { \
        int16_t i = start; \
        if ((x_max) <= X265_MIN(mvmax.x - omv.x, omv.x - mvmin.x)) \
            for (; i < (x_max) - 2; i += 4) { \
                COST_MV_X4(i, 0, -i, 0, i + 2, 0, -i - 2, 0); } \
        for (; i < (x_max); i += 2) \
        { \
            if (omv.x + i <= mvmax.x) \
                COST_MV(omv.x + i, omv.y); \
            if (omv.x - i >= mvmin.x) \
                COST_MV(omv.x - i, omv.y); \
        } \
        i = start; \
        if ((y_max) <= X265_MIN(mvmax.y - omv.y, omv.y - mvmin.y)) \
            for (; i < (y_max) - 2; i += 4) { \
                COST_MV_X4(0, i, 0, -i, 0, i + 2, 0, -i - 2); } \
        for (; i < (y_max); i += 2) \
        { \
            if (omv.y + i <= mvmax.y) \
                COST_MV(omv.x, omv.y + i); \
            if (omv.y - i >= mvmin.y) \
                COST_MV(omv.x, omv.y - i); \
        } \
    }
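/* CROSS expands a horizontal arm and then a vertical arm outward from omv in
 * steps of 2, batching four SADs per COST_MV_X4 when the whole arm fits inside
 * the MV range and falling back to per-point, clipped COST_MV otherwise */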
void MotionEstimate::StarPatternSearch(ReferencePlanes *ref,
                                       const MV &       mvmin,
                                       const MV &       mvmax,
                                       MV &             bmv,
                                       int &            bcost,
                                       int &            bPointNr,
                                       int &            bDistance,
                                       int              earlyExitIters,
                                       int              merange)
{
    ALIGN_VAR_16(int, costs[16]);
    pixel* fenc = fencPUYuv.m_buf[0];
    pixel* fref = ref->fpelPlane[0] + blockOffset;
    intptr_t stride = ref->lumaStride;

    MV omv = bmv;
    int rounds = 0;

    {
        /* step 1: radius-1 diamond around the starting point */
        const int16_t dist = 1;
        const int saved = bcost;
        const int16_t top    = omv.y - dist;
        const int16_t bottom = omv.y + dist;
        const int16_t left   = omv.x - dist;
        const int16_t right  = omv.x + dist;
        if (top >= mvmin.y && left >= mvmin.x && right <= mvmax.x && bottom <= mvmax.y)
        {
            COST_MV_PT_DIST_X4(omv.x, top,    2, dist,
                               left,  omv.y,  4, dist,
                               right, omv.y,  5, dist,
                               omv.x, bottom, 7, dist);
        }
        else
        {
            if (top >= mvmin.y) // check top
            {
                COST_MV_PT_DIST(omv.x, top, 2, dist);
            }
            if (left >= mvmin.x) // check middle left
            {
                COST_MV_PT_DIST(left, omv.y, 4, dist);
            }
            if (right <= mvmax.x) // check middle right
            {
                COST_MV_PT_DIST(right, omv.y, 5, dist);
            }
            if (bottom <= mvmax.y) // check bottom
            {
                COST_MV_PT_DIST(omv.x, bottom, 7, dist);
            }
        }
        if (bcost < saved)
            rounds = 0;
        else if (++rounds >= earlyExitIters)
            return;
    }
    for (int16_t dist = 2; dist <= 8; dist <<= 1)
    {
        const int saved = bcost;
        /* star pattern, 8 points:
         *        2
         *      1   3
         *    4   *   5
         *      6   8
         *        7
         * Points 2, 4, 5, 7 are at distance dist;
         * points 1, 3, 6, 8 are at distance dist >> 1 */
        const int16_t top     = omv.y - dist;
        const int16_t bottom  = omv.y + dist;
        const int16_t left    = omv.x - dist;
        const int16_t right   = omv.x + dist;
        const int16_t top2    = omv.y - (dist >> 1);
        const int16_t bottom2 = omv.y + (dist >> 1);
        const int16_t left2   = omv.x - (dist >> 1);
        const int16_t right2  = omv.x + (dist >> 1);
        if (top >= mvmin.y && left >= mvmin.x &&
            right <= mvmax.x && bottom <= mvmax.y) // check border
        {
            COST_MV_PT_DIST_X4(omv.x,  top,     2, dist,
                               left2,  top2,    1, dist >> 1,
                               right2, top2,    3, dist >> 1,
                               left,   omv.y,   4, dist);
            COST_MV_PT_DIST_X4(right,  omv.y,   5, dist,
                               left2,  bottom2, 6, dist >> 1,
                               right2, bottom2, 8, dist >> 1,
                               omv.x,  bottom,  7, dist);
        }
        else // check border for each mv
        {
            if (top >= mvmin.y) // check top
            {
                COST_MV_PT_DIST(omv.x, top, 2, dist);
            }
            if (top2 >= mvmin.y) // check half top
            {
                if (left2 >= mvmin.x) // check half left
                {
                    COST_MV_PT_DIST(left2, top2, 1, (dist >> 1));
                }
                if (right2 <= mvmax.x) // check half right
                {
                    COST_MV_PT_DIST(right2, top2, 3, (dist >> 1));
                }
            }
            if (left >= mvmin.x) // check left
            {
                COST_MV_PT_DIST(left, omv.y, 4, dist);
            }
            if (right <= mvmax.x) // check right
            {
                COST_MV_PT_DIST(right, omv.y, 5, dist);
            }
            if (bottom2 <= mvmax.y) // check half bottom
            {
                if (left2 >= mvmin.x) // check half left
                {
                    COST_MV_PT_DIST(left2, bottom2, 6, (dist >> 1));
                }
                if (right2 <= mvmax.x) // check half right
                {
                    COST_MV_PT_DIST(right2, bottom2, 8, (dist >> 1));
                }
            }
            if (bottom <= mvmax.y) // check bottom
            {
                COST_MV_PT_DIST(omv.x, bottom, 7, dist);
            }
        }
        if (bcost < saved)
            rounds = 0;
        else if (++rounds >= earlyExitIters)
            return;
    }
    for (int16_t dist = 16; dist <= (int16_t)merange; dist <<= 1)
    {
        const int saved = bcost;
        const int16_t top    = omv.y - dist;
        const int16_t bottom = omv.y + dist;
        const int16_t left   = omv.x - dist;
        const int16_t right  = omv.x + dist;
        if (top >= mvmin.y && left >= mvmin.x &&
            right <= mvmax.x && bottom <= mvmax.y) // check border
        {
            COST_MV_PT_DIST_X4(omv.x, top,    0, dist,
                               left,  omv.y,  0, dist,
                               right, omv.y,  0, dist,
                               omv.x, bottom, 0, dist);
            for (int16_t index = 1; index < 4; index++)
            {
                int16_t posYT = top    + ((dist >> 2) * index);
                int16_t posYB = bottom - ((dist >> 2) * index);
                int16_t posXL = omv.x  - ((dist >> 2) * index);
                int16_t posXR = omv.x  + ((dist >> 2) * index);

                COST_MV_PT_DIST_X4(posXL, posYT, 0, dist,
                                   posXR, posYT, 0, dist,
                                   posXL, posYB, 0, dist,
                                   posXR, posYB, 0, dist);
            }
        }
        else // check border for each mv
        {
            if (top >= mvmin.y) // check top
            {
                COST_MV_PT_DIST(omv.x, top, 0, dist);
            }
            if (left >= mvmin.x) // check left
            {
                COST_MV_PT_DIST(left, omv.y, 0, dist);
            }
            if (right <= mvmax.x) // check right
            {
                COST_MV_PT_DIST(right, omv.y, 0, dist);
            }
            if (bottom <= mvmax.y) // check bottom
            {
                COST_MV_PT_DIST(omv.x, bottom, 0, dist);
            }
            for (int16_t index = 1; index < 4; index++)
            {
                int16_t posYT = top    + ((dist >> 2) * index);
                int16_t posYB = bottom - ((dist >> 2) * index);
                int16_t posXL = omv.x  - ((dist >> 2) * index);
                int16_t posXR = omv.x  + ((dist >> 2) * index);

                if (posYT >= mvmin.y) // check top
                {
                    if (posXL >= mvmin.x) // check left
                    {
                        COST_MV_PT_DIST(posXL, posYT, 0, dist);
                    }
                    if (posXR <= mvmax.x) // check right
                    {
                        COST_MV_PT_DIST(posXR, posYT, 0, dist);
                    }
                }
                if (posYB <= mvmax.y) // check bottom
                {
                    if (posXL >= mvmin.x) // check left
                    {
                        COST_MV_PT_DIST(posXL, posYB, 0, dist);
                    }
                    if (posXR <= mvmax.x) // check right
                    {
                        COST_MV_PT_DIST(posXR, posYB, 0, dist);
                    }
                }
            }
        }
        if (bcost < saved)
            rounds = 0;
        else if (++rounds >= earlyExitIters)
            return;
    }
}
int MotionEstimate::motionEstimate(ReferencePlanes *ref,
                                   const MV &       mvmin,
                                   const MV &       mvmax,
                                   const MV &       qmvp,
                                   int              numCandidates,
                                   const MV *       mvc,
                                   int              merange,
                                   MV &             outQMv)
{
    ALIGN_VAR_16(int, costs[16]);
    if (ctuAddr >= 0)
        blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);
    intptr_t stride = ref->lumaStride;
    pixel* fenc = fencPUYuv.m_buf[0];
    pixel* fref = ref->fpelPlane[0] + blockOffset;
    MV qmvmin = mvmin.toQPel();
    MV qmvmax = mvmax.toQPel();

    /* The term "cost" here means the SATD/SAD measure for that particular search.
     * The costs used in the ME integer search include only the SAD of the motion
     * residual plus sqrtLambda times the MVD bits. The subpel refine steps use the
     * SATD of the residual plus sqrtLambda * MVD bits. Mode decision is then based
     * on the video distortion cost (SSE/PSNR) plus lambda times all signaling bits
     * (mode + MVD bits). */
    // measure SAD cost at clipped QPEL MVP
    MV pmv = qmvp.clipped(qmvmin, qmvmax);
    MV bestpre = pmv;
    int bprecost;

    if (ref->isLowres)
        bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad);
    else
        bprecost = subpelCompare(ref, pmv, sad);
    /* re-measure full pel rounded MVP with SAD as search start point */
    MV bmv = pmv.roundToFPel();
    int bcost = bprecost;
    if (pmv.isSubpel())
        bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2);
    // measure SAD cost at MV(0) if MVP is not zero
    if (bmv.notZero())
    {
        int cost = sad(fenc, FENC_STRIDE, fref, stride) + mvcost(MV(0, 0));
        if (cost < bcost)
        {
            bcost = cost;
            bmv = MV(0, 0);
        }
    }
    // measure SAD cost at each QPEL motion vector candidate
    if (ref->isLowres)
    {
        for (int i = 0; i < numCandidates; i++)
        {
            MV m = mvc[i].clipped(qmvmin, qmvmax);
            if (m.notZero() && m != pmv && m != bestpre) // check already measured
            {
                int cost = ref->lowresQPelCost(fenc, blockOffset, m, sad) + mvcost(m);
                if (cost < bprecost)
                {
                    bprecost = cost;
                    bestpre = m;
                }
            }
        }
    }
    else
    {
        for (int i = 0; i < numCandidates; i++)
        {
            MV m = mvc[i].clipped(qmvmin, qmvmax);
            if (m.notZero() && m != pmv && m != bestpre) // check already measured
            {
                int cost = subpelCompare(ref, m, sad) + mvcost(m);
                if (cost < bprecost)
                {
                    bprecost = cost;
                    bestpre = m;
                }
            }
        }
    }
    pmv = pmv.roundToFPel();
    MV omv = bmv; // current search origin or starting point

    switch (searchMethod)
    {
    case X265_DIA_SEARCH:
    {
        /* diamond search, radius 1 */
        bcost <<= 4;
        int i = merange;
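        /* bcost is temporarily packed as (cost << 4) | direction so a single
         * COPY1_IF_LT tracks both the best cost and the winning neighbor; the
         * two 2-bit fields in the low nibble are sign-extended below to step bmv */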
        do
        {
            COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
            COPY1_IF_LT(bcost, (costs[0] << 4) + 1);
            COPY1_IF_LT(bcost, (costs[1] << 4) + 3);
            COPY1_IF_LT(bcost, (costs[2] << 4) + 4);
            COPY1_IF_LT(bcost, (costs[3] << 4) + 12);
            if (!(bcost & 15))
                break;
            bmv.x -= (bcost << 28) >> 30;
            bmv.y -= (bcost << 30) >> 30;
            bcost &= ~15;
        }
        while (--i && bmv.checkRange(mvmin, mvmax));
        bcost >>= 4;
        break;
    }
    case X265_HEX_SEARCH:
    {
me_hex2:
        /* hexagon search, radius 2 */
#if 0
        for (int i = 0; i < merange / 2; i++)
        {
            omv = bmv;
            COST_MV(omv.x - 2, omv.y);
            COST_MV(omv.x - 1, omv.y + 2);
            COST_MV(omv.x + 1, omv.y + 2);
            COST_MV(omv.x + 2, omv.y);
            COST_MV(omv.x + 1, omv.y - 2);
            COST_MV(omv.x - 1, omv.y - 2);
            if (omv == bmv)
                break;
            if (!bmv.checkRange(mvmin, mvmax))
                break;
        }
#else // if 0
        /* equivalent to the above, but eliminates duplicate candidates */
        COST_MV_X3_DIR(-2, 0, -1, 2, 1, 2, costs);
        bcost <<= 3;
        COPY1_IF_LT(bcost, (costs[0] << 3) + 2);
        COPY1_IF_LT(bcost, (costs[1] << 3) + 3);
        COPY1_IF_LT(bcost, (costs[2] << 3) + 4);
        COST_MV_X3_DIR(2, 0, 1, -2, -1, -2, costs);
        COPY1_IF_LT(bcost, (costs[0] << 3) + 5);
        COPY1_IF_LT(bcost, (costs[1] << 3) + 6);
        COPY1_IF_LT(bcost, (costs[2] << 3) + 7);
        if (bcost & 7)
        {
            int dir = (bcost & 7) - 2;
            bmv += hex2[dir + 1];
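            /* the low 3 bits of bcost carry the winning direction code (+2);
             * mod6m1[] wraps dir back into [0,5] without a modulo, and hex2[]
             * is padded at both ends so hex2[dir + 0..2] never indexes out of bounds */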
            /* half hexagon, not overlapping the previous iteration */
            for (int i = (merange >> 1) - 1; i > 0 && bmv.checkRange(mvmin, mvmax); i--)
            {
                COST_MV_X3_DIR(hex2[dir + 0].x, hex2[dir + 0].y,
                               hex2[dir + 1].x, hex2[dir + 1].y,
                               hex2[dir + 2].x, hex2[dir + 2].y,
                               costs);
                bcost &= ~7;
                COPY1_IF_LT(bcost, (costs[0] << 3) + 1);
                COPY1_IF_LT(bcost, (costs[1] << 3) + 2);
                COPY1_IF_LT(bcost, (costs[2] << 3) + 3);
                if (!(bcost & 7))
                    break;
                dir += (bcost & 7) - 2;
                dir = mod6m1[dir + 1];
                bmv += hex2[dir + 1];
            }
        }
        bcost >>= 3;
#endif // if 0
        /* square refine */
        int dir = 0;
        COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
        COPY2_IF_LT(bcost, costs[0], dir, 1);
        COPY2_IF_LT(bcost, costs[1], dir, 2);
        COPY2_IF_LT(bcost, costs[2], dir, 3);
        COPY2_IF_LT(bcost, costs[3], dir, 4);
        COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs);
        COPY2_IF_LT(bcost, costs[0], dir, 5);
        COPY2_IF_LT(bcost, costs[1], dir, 6);
        COPY2_IF_LT(bcost, costs[2], dir, 7);
        COPY2_IF_LT(bcost, costs[3], dir, 8);
        /* dir indexes square1[]: 0 keeps bmv, 1..4 are the diamond points,
         * 5..8 the diagonals, matching the two X4_DIR calls above */
        bmv += square1[dir];
        break;
    }
:
758 int16_t cross_start
= 1;
        /* refine predictors */
        ucost1 = bcost;
        DIA1_ITER(pmv.x, pmv.y);
        if (pmv.notZero())
            DIA1_ITER(0, 0);

        ucost2 = bcost;
        if (bmv.notZero() && bmv != pmv)
            DIA1_ITER(bmv.x, bmv.y);
        if (bcost == ucost2)
            cross_start = 3;
        /* Early Termination */
        omv = bmv;
        if (bcost == ucost2 && SAD_THRESH(2000))
        {
            COST_MV_X4(0, -2, -1, -1, 1, -1, -2, 0);
            COST_MV_X4(2, 0, -1, 1, 1, 1, 0, 2);
            if (bcost == ucost1 && SAD_THRESH(500))
                break;
            if (bcost == ucost2)
            {
                int16_t range = (int16_t)(merange >> 1) | 1;
                CROSS(3, range, range);
                COST_MV_X4(-1, -2, 1, -2, -2, -1, 2, -1);
                COST_MV_X4(-2, 1, 2, 1, -1, 2, 1, 2);
                if (bcost == ucost2)
                    break;
                cross_start = range + 2;
            }
        }
        // TODO: Need to study x264's logic for building the mvc list to understand why it
        //       has special cases here for 16x16, and whether they apply to the HEVC CTU

        // adaptive search range based on mvc variability
        if (numCandidates)
        {
            /* range multipliers based on casual inspection of some statistics of
             * average distance between current predictor and final mv found by ESA.
             * these have not been tuned much by actual encoding. */
            static const uint8_t range_mul[4][4] =
            {
                { 3, 3, 4, 4 },
                { 3, 4, 4, 4 },
                { 4, 4, 4, 5 },
                { 4, 4, 5, 6 },
            };
            int mvd;
            int sad_ctx, mvd_ctx;
            int denom = 1;
            if (numCandidates == 1)
            {
                if (LUMA_64x64 == partEnum)
                    /* mvc is probably the same as mvp, so the difference isn't meaningful.
                     * but prediction usually isn't too bad, so just use medium range */
                    mvd = 25;
                else
                    mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
            }
            else
            {
                /* calculate the degree of agreement between predictors. */

                /* in 64x64, mvc includes all the neighbors used to make mvp,
                 * so don't count mvp separately. */
                denom = numCandidates - 1;
                mvd = 0;
                if (partEnum != LUMA_64x64)
                {
                    mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
                    denom++;
                }
                mvd += predictorDifference(mvc, numCandidates);
            }
            sad_ctx = SAD_THRESH(1000) ? 0
                : SAD_THRESH(2000) ? 1
                : SAD_THRESH(4000) ? 2 : 3;
            mvd_ctx = mvd < 10 * denom ? 0
                : mvd < 20 * denom ? 1
                : mvd < 40 * denom ? 2 : 3;
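            /* higher contexts (worse SAD, more disagreement among predictors)
             * select a larger multiplier below, widening the search range */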
            merange = (merange * range_mul[mvd_ctx][sad_ctx]) >> 2;
        }
        /* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy.
         * we are still centered on the same place as the DIA2. is this desirable? */
        CROSS(cross_start, merange, merange >> 1);
        COST_MV_X4(-2, -2, -2, 2, 2, -2, 2, 2);
        /* hexagon grid */
        omv = bmv;
        const uint16_t *p_cost_omvx = m_cost_mvx + omv.x * 4;
        const uint16_t *p_cost_omvy = m_cost_mvy + omv.y * 4;
        uint16_t i = 1;
        do
        {
            if (4 * i > X265_MIN4(mvmax.x - omv.x, omv.x - mvmin.x,
                                  mvmax.y - omv.y, omv.y - mvmin.y))
            {
                /* the ring may extend outside the valid MV range; test each point */
                for (int j = 0; j < 16; j++)
                {
                    MV mv = omv + (hex4[j] * i);
                    if (mv.checkRange(mvmin, mvmax))
                        COST_MV(mv.x, mv.y);
                }
            }
*fref_base
= fref
+ omv
.x
+ (omv
.y
- 4 * i
) * stride
;
876 size_t dy
= (size_t)i
* stride
;
#define SADS(k, x0, y0, x1, y1, x2, y2, x3, y3) \
    sad_x4(fenc, \
           fref_base x0 * i + (y0 - 2 * k + 4) * dy, \
           fref_base x1 * i + (y1 - 2 * k + 4) * dy, \
           fref_base x2 * i + (y2 - 2 * k + 4) * dy, \
           fref_base x3 * i + (y3 - 2 * k + 4) * dy, \
           stride, costs + 4 * k); \
    fref_base += 2 * dy;
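/* the x0..x3 arguments carry their own sign ("+0", "-4", ...) and paste directly
 * after fref_base, forming offsets such as "fref_base - 4 * i + ..."; each SADS
 * call then advances fref_base by 2 * dy, which the (y - 2 * k + 4) term offsets */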
#define ADD_MVCOST(k, x, y) costs[k] += p_cost_omvx[x * 4 * i] + p_cost_omvy[y * 4 * i]
#define MIN_MV(k, x, y)     COPY2_IF_LT(bcost, costs[k], dir, x * 16 + (y & 15))
                SADS(0, +0, -4, +0, +4, -2, -3, +2, -3);
                SADS(1, -4, -2, +4, -2, -4, -1, +4, -1);
                SADS(2, -4, +0, +4, +0, -4, +1, +4, +1);
                SADS(3, -4, +2, +4, +2, -2, +3, +2, +3);

                ADD_MVCOST(0, 0, -4);
                ADD_MVCOST(1, 0, 4);
                ADD_MVCOST(2, -2, -3);
                ADD_MVCOST(3, 2, -3);
                ADD_MVCOST(4, -4, -2);
                ADD_MVCOST(5, 4, -2);
                ADD_MVCOST(6, -4, -1);
                ADD_MVCOST(7, 4, -1);
                ADD_MVCOST(8, -4, 0);
                ADD_MVCOST(9, 4, 0);
                ADD_MVCOST(10, -4, 1);
                ADD_MVCOST(11, 4, 1);
                ADD_MVCOST(12, -4, 2);
                ADD_MVCOST(13, 4, 2);
                ADD_MVCOST(14, -2, 3);
                ADD_MVCOST(15, 2, 3);

                MIN_MV(0, 0, -4);
                MIN_MV(1, 0, 4);
                MIN_MV(2, -2, -3);
                MIN_MV(3, 2, -3);
                MIN_MV(4, -4, -2);
                MIN_MV(5, 4, -2);
                MIN_MV(6, -4, -1);
                MIN_MV(7, 4, -1);
                MIN_MV(8, -4, 0);
                MIN_MV(9, 4, 0);
                MIN_MV(10, -4, 1);
                MIN_MV(11, 4, 1);
                MIN_MV(12, -4, 2);
                MIN_MV(13, 4, 2);
                MIN_MV(14, -2, 3);
                MIN_MV(15, 2, 3);
#undef SADS
#undef ADD_MVCOST
#undef MIN_MV
                if (dir)
                {
                    bmv.x = omv.x + i * (dir >> 4);
                    bmv.y = omv.y + i * ((dir << 28) >> 28);
                }
            }
        }
        while (++i <= merange >> 2);
        if (bmv.checkRange(mvmin, mvmax))
            goto me_hex2;
        break;
    }
    case X265_STAR_SEARCH: // Adapted from HM ME
    {
        int bPointNr = 0;
        int bDistance = 0;

        const int EarlyExitIters = 3;
        StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, EarlyExitIters, merange);
        if (bDistance == 1)
        {
            // if best distance was only 1, check two missing points. If no new point is found, stop
            if (bPointNr)
            {
                /* For a given direction 1 to 8, check nearest two outer X pixels */
                int saved = bcost;
                const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];
                const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];
                if (mv1.checkRange(mvmin, mvmax))
                {
                    COST_MV(mv1.x, mv1.y);
                }
                if (mv2.checkRange(mvmin, mvmax))
                {
                    COST_MV(mv2.x, mv2.y);
                }
                if (bcost == saved)
                    break;
            }
            else
                break;
        }
        const int RasterDistance = 5;
        if (bDistance > RasterDistance)
        {
982 for (tmv
.y
= mvmin
.y
; tmv
.y
<= mvmax
.y
; tmv
.y
+= RasterDistance
)
                for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x += RasterDistance)
                {
                    if (tmv.x + (RasterDistance * 3) <= mvmax.x)
                    {
                        pixel *pix_base = fref + tmv.y * stride + tmv.x;
                        sad_x4(fenc,
                               pix_base,
                               pix_base + RasterDistance,
                               pix_base + RasterDistance * 2,
                               pix_base + RasterDistance * 3,
                               stride, costs);
[0] += mvcost(tmv
<< 2);
996 COPY2_IF_LT(bcost
, costs
[0], bmv
, tmv
);
997 tmv
.x
+= RasterDistance
;
998 costs
[1] += mvcost(tmv
<< 2);
999 COPY2_IF_LT(bcost
, costs
[1], bmv
, tmv
);
1000 tmv
.x
+= RasterDistance
;
1001 costs
[2] += mvcost(tmv
<< 2);
1002 COPY2_IF_LT(bcost
, costs
[2], bmv
, tmv
);
1003 tmv
.x
+= RasterDistance
;
1004 costs
[3] += mvcost(tmv
<< 3);
1005 COPY2_IF_LT(bcost
, costs
[3], bmv
, tmv
);
1008 COST_MV(tmv
.x
, tmv
.y
);
        while (bDistance > 0)
        {
            // center a new search around current best
            bDistance = 0;
            bPointNr = 0;

            const int MaxIters = 32;
            StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, MaxIters, merange);
            if (bDistance == 1)
            {
                if (!bPointNr)
                    break;

                /* For a given direction 1 to 8, check nearest 2 outer X pixels */
                const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];
                const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];
                if (mv1.checkRange(mvmin, mvmax))
                {
                    COST_MV(mv1.x, mv1.y);
                }
                if (mv2.checkRange(mvmin, mvmax))
                {
                    COST_MV(mv2.x, mv2.y);
                }
                break;
            }
        }
        break;
    }
    case X265_FULL_SEARCH:
    {
        // dead slow exhaustive search, but at least it uses sad_x4()
        MV tmv;
        for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y++)
        {
            for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x++)
            {
                if (tmv.x + 3 <= mvmax.x)
                {
                    pixel *pix_base = fref + tmv.y * stride + tmv.x;
                    sad_x4(fenc,
                           pix_base,
                           pix_base + 1,
                           pix_base + 2,
                           pix_base + 3,
                           stride, costs);
                    costs[0] += mvcost(tmv << 2);
                    COPY2_IF_LT(bcost, costs[0], bmv, tmv);
                    tmv.x++;
                    costs[1] += mvcost(tmv << 2);
                    COPY2_IF_LT(bcost, costs[1], bmv, tmv);
                    tmv.x++;
                    costs[2] += mvcost(tmv << 2);
                    COPY2_IF_LT(bcost, costs[2], bmv, tmv);
                    tmv.x++;
                    costs[3] += mvcost(tmv << 2);
                    COPY2_IF_LT(bcost, costs[3], bmv, tmv);
                }
                else
                    COST_MV(tmv.x, tmv.y);
            }
        }
        break;
    }
    default:
        X265_CHECK(0, "invalid motion estimate mode\n");
        break;
    }
    if (bprecost < bcost)
    {
        bmv = bestpre;
        bcost = bprecost;
    }
    else
        bmv = bmv.toQPel(); // promote search bmv to qpel
    const SubpelWorkload& wl = workload[this->subpelRefine];
    if (!bcost)
    {
        /* if there was zero residual at the clipped MVP, we can skip subpel
         * refine, but we do need to include the mvcost in the returned cost */
        bcost = mvcost(bmv);
    }
    else if (ref->isLowres)
    {
        int bdir = 0;
        for (int i = 1; i <= wl.hpel_dirs; i++)
        {
            MV qmv = bmv + square1[i] * 2;
            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv);
            COPY2_IF_LT(bcost, cost, bdir, i);
        }

        bmv += square1[bdir] * 2;
        bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd) + mvcost(bmv);
        bdir = 0;
        for (int i = 1; i <= wl.qpel_dirs; i++)
        {
            MV qmv = bmv + square1[i];
            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv);
            COPY2_IF_LT(bcost, cost, bdir, i);
        }

        bmv += square1[bdir];
    }
    else
    {
        pixelcmp_t hpelcomp;

        if (wl.hpel_satd)
        {
            bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
            hpelcomp = satd;
        }
        else
            hpelcomp = sad;
        for (int iter = 0; iter < wl.hpel_iters; iter++)
        {
            int bdir = 0;
            for (int i = 1; i <= wl.hpel_dirs; i++)
            {
                MV qmv = bmv + square1[i] * 2;
                int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
                COPY2_IF_LT(bcost, cost, bdir, i);
            }

            if (bdir)
                bmv += square1[bdir] * 2;
            else
                break;
        }
        /* if HPEL search used SAD, remeasure with SATD before QPEL */
        if (!wl.hpel_satd)
            bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
        for (int iter = 0; iter < wl.qpel_iters; iter++)
        {
            int bdir = 0;
            for (int i = 1; i <= wl.qpel_dirs; i++)
            {
                MV qmv = bmv + square1[i];
                int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
                COPY2_IF_LT(bcost, cost, bdir, i);
            }

            if (bdir)
                bmv += square1[bdir];
            else
                break;
        }
    }

    x265_emms();
    outQMv = bmv;
    return bcost;
}
int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv, pixelcmp_t cmp)
{
    intptr_t refStride = ref->lumaStride;
    pixel *fref = ref->fpelPlane[0] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * refStride;
    int xFrac = qmv.x & 0x3;
    int yFrac = qmv.y & 0x3;
    int cost;
    intptr_t lclStride = fencPUYuv.m_size;
    X265_CHECK(lclStride == FENC_STRIDE, "fenc buffer is assumed to have FENC_STRIDE by sad_x3 and sad_x4\n");
    if (!(yFrac | xFrac))
        cost = cmp(fencPUYuv.m_buf[0], lclStride, fref, refStride);
    else
    {
        /* we are taking a short-cut here if the reference is weighted. To be
         * accurate we should be interpolating unweighted pixels and weighting
         * the final 16-bit values prior to rounding and down-shifting. Instead we
         * are simply interpolating the weighted full-pel pixels. Not 100%
         * accurate but good enough for fast qpel ME */
        ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
        if (!yFrac)
            primitives.luma_hpp[partEnum](fref, refStride, subpelbuf, lclStride, xFrac);
        else if (!xFrac)
            primitives.luma_vpp[partEnum](fref, refStride, subpelbuf, lclStride, yFrac);
        else
        {
            ALIGN_VAR_32(int16_t, immed[64 * (64 + NTAPS_LUMA)]);

            int filterSize = NTAPS_LUMA;
            int halfFilterSize = filterSize >> 1;
            primitives.luma_hps[partEnum](fref, refStride, immed, blockwidth, xFrac, 1);
            primitives.luma_vsp[partEnum](immed + (halfFilterSize - 1) * blockwidth, blockwidth, subpelbuf, lclStride, yFrac);
        }
        cost = cmp(fencPUYuv.m_buf[0], lclStride, subpelbuf, lclStride);
    }
    if (bChromaSATD)
    {
        int csp    = fencPUYuv.m_csp;
        int hshift = fencPUYuv.m_hChromaShift;
        int vshift = fencPUYuv.m_vChromaShift;
        int shiftHor = (2 + hshift);
        int shiftVer = (2 + vshift);
        lclStride = fencPUYuv.m_csize;
        intptr_t refStrideC = ref->reconPic->m_strideC;
        intptr_t refOffset = (qmv.x >> shiftHor) + (qmv.y >> shiftVer) * refStrideC;

        const pixel* refCb = ref->getCbAddr(ctuAddr, absPartIdx) + refOffset;
        const pixel* refCr = ref->getCrAddr(ctuAddr, absPartIdx) + refOffset;
        xFrac = qmv.x & ((1 << shiftHor) - 1);
        yFrac = qmv.y & ((1 << shiftVer) - 1);
        if (!(yFrac | xFrac))
        {
            cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, refCb, refStrideC);
            cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, refCr, refStrideC);
        }
        else
        {
            ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
            if (!yFrac)
            {
                primitives.chroma[csp].filter_hpp[partEnum](refCb, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift));
                cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);

                primitives.chroma[csp].filter_hpp[partEnum](refCr, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift));
                cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
            }
            else if (!xFrac)
            {
                primitives.chroma[csp].filter_vpp[partEnum](refCb, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift));
                cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);

                primitives.chroma[csp].filter_vpp[partEnum](refCr, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift));
                cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
            }
            else
            {
                ALIGN_VAR_32(int16_t, immed[64 * (64 + NTAPS_CHROMA)]);

                int extStride = blockwidth >> hshift;
                int filterSize = NTAPS_CHROMA;
                int halfFilterSize = (filterSize >> 1);

                primitives.chroma[csp].filter_hps[partEnum](refCb, refStrideC, immed, extStride, xFrac << (1 - hshift), 1);
                primitives.chroma[csp].filter_vsp[partEnum](immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift));
                cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);

                primitives.chroma[csp].filter_hps[partEnum](refCr, refStrideC, immed, extStride, xFrac << (1 - hshift), 1);
                primitives.chroma[csp].filter_vsp[partEnum](immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift));
                cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
            }
        }
    }

    return cost;
}