f6129ff60f92bb9862df92a5060fda0e27b853dc
1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
4 * Authors: Steve Borho <steve@borho.org>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20 * This program is also available under a commercial proprietary license.
21 * For more information, contact us at license @ x265.com.
22 *****************************************************************************/
25 #include "primitives.h"
31 #pragma warning(disable: 4127) // conditional expression is constant (macros use this construct)
46 SubpelWorkload workload
[X265_MAX_SUBPEL_LEVEL
+ 1] =
48 { 1, 4, 0, 4, false }, // 4 SAD HPEL only
49 { 1, 4, 1, 4, false }, // 4 SAD HPEL + 4 SATD QPEL
50 { 1, 4, 1, 4, true }, // 4 SATD HPEL + 4 SATD QPEL
51 { 2, 4, 1, 4, true }, // 2x4 SATD HPEL + 4 SATD QPEL
52 { 2, 4, 2, 4, true }, // 2x4 SATD HPEL + 2x4 SATD QPEL
53 { 1, 8, 1, 8, true }, // 8 SATD HPEL + 8 SATD QPEL (default)
54 { 2, 8, 1, 8, true }, // 2x8 SATD HPEL + 8 SATD QPEL
55 { 2, 8, 2, 8, true }, // 2x8 SATD HPEL + 2x8 SATD QPEL
59 static int size_scale
[NUM_LUMA_PARTITIONS
];
60 #define SAD_THRESH(v) (bcost < (((v >> 4) * size_scale[partEnum])))
62 static void init_scales(void)
64 #define SETUP_SCALE(W, H) \
65 size_scale[LUMA_ ## W ## x ## H] = (H * H) >> 4;
94 MotionEstimate::MotionEstimate()
98 if (size_scale
[0] == 0)
101 fenc
= X265_MALLOC(pixel
, MAX_CU_SIZE
* MAX_CU_SIZE
);
104 MotionEstimate::~MotionEstimate()
109 void MotionEstimate::setSourcePU(intptr_t offset
, int width
, int height
)
111 partEnum
= partitionFromSizes(width
, height
);
112 X265_CHECK(LUMA_4x4
!= partEnum
, "4x4 inter partition detected!\n");
113 sad
= primitives
.sad
[partEnum
];
114 satd
= primitives
.satd
[partEnum
];
115 sa8d
= primitives
.sa8d_inter
[partEnum
];
116 sad_x3
= primitives
.sad_x3
[partEnum
];
117 sad_x4
= primitives
.sad_x4
[partEnum
];
120 blockheight
= height
;
121 blockOffset
= offset
;
123 /* copy PU block into cache */
124 primitives
.luma_copy_pp
[partEnum
](fenc
, FENC_STRIDE
, fencplane
+ offset
, fencLumaStride
);
127 /* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
128 static const MV hex2
[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) };
129 static const uint8_t mod6m1
[8] = { 5, 0, 1, 2, 3, 4, 5, 0 }; /* (x-1)%6 */
130 static const MV square1
[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) };
131 static const MV hex4
[16] =
133 MV(0, -4), MV(0, 4), MV(-2, -3), MV(2, -3),
134 MV(-4, -2), MV(4, -2), MV(-4, -1), MV(4, -1),
135 MV(-4, 0), MV(4, 0), MV(-4, 1), MV(4, 1),
136 MV(-4, 2), MV(4, 2), MV(-2, 3), MV(2, 3),
138 static const MV offsets
[] =
140 MV(-1, 0), MV(0, -1),
141 MV(-1, -1), MV(1, -1),
143 MV(-1, 1), MV(-1, -1),
148 }; // offsets for Two Point Search
150 /* sum of absolute differences between MV candidates */
151 static inline int x265_predictor_difference(const MV
*mvc
, intptr_t numCandidates
)
155 for (int i
= 0; i
< numCandidates
- 1; i
++)
157 sum
+= abs(mvc
[i
].x
- mvc
[i
+ 1].x
)
158 + abs(mvc
[i
].y
- mvc
[i
+ 1].y
);
/* Measure SAD + mvcost at (mx, my); on improvement record the best cost/mv
 * plus the star-search point number and distance (consumed by the two-point
 * refinement after StarPatternSearch).
 * NOTE(review): do/while wrapper, the 'MV tmv(mx, my);' line (tmv is read by
 * the visible mvcost call), and the if-body were dropped by the extraction
 * and restored; macro args parenthesized in the address arithmetic. */
#define COST_MV_PT_DIST(mx, my, point, dist) \
    do \
    { \
        MV tmv(mx, my); \
        int cost = sad(fenc, FENC_STRIDE, fref + (mx) + (my) * stride, stride); \
        cost += mvcost(tmv << 2); \
        if (cost < bcost) \
        { \
            bcost = cost; \
            bmv = tmv; \
            bPointNr = point; \
            bDistance = dist; \
        } \
    } while (0)
/* Measure SAD + mvcost at full-pel (mx, my) and keep it in bcost/bmv if it
 * improves on the current best.
 * NOTE(review): the do/while wrapper lines were dropped by the extraction
 * and restored; all cost statements are unchanged. */
#define COST_MV(mx, my) \
    do \
    { \
        int cost = sad(fenc, FENC_STRIDE, fref + (mx) + (my) * stride, stride); \
        cost += mvcost(MV(mx, my) << 2); \
        COPY2_IF_LT(bcost, cost, bmv, MV(mx, my)); \
    } while (0)
/* SAD three offsets around bmv with one sad_x3 call, adding mvcost to each;
 * results land in (costs)[0..2] and the caller picks the winning direction.
 * NOTE(review): the 'sad_x3(fenc, ... stride, costs);' call framing and the
 * braces were dropped by the extraction; restored to match the sad_x3
 * primitive hooked up in setSourcePU. */
#define COST_MV_X3_DIR(m0x, m0y, m1x, m1y, m2x, m2y, costs) \
    { \
        pixel *pix_base = fref + bmv.x + bmv.y * stride; \
        sad_x3(fenc, \
               pix_base + (m0x) + (m0y) * stride, \
               pix_base + (m1x) + (m1y) * stride, \
               pix_base + (m2x) + (m2y) * stride, \
               stride, costs); \
        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
    }
/* Four-point variant of COST_MV_PT_DIST: one sad_x4 call for four absolute
 * positions, each carrying its star-search point number and distance.
 * NOTE(review): the 'sad_x4(fenc, ... stride, costs);' call framing and the
 * braces were dropped by the extraction; restored to match the sad_x4
 * primitive hooked up in setSourcePU. */
#define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \
    { \
        sad_x4(fenc, \
               fref + (m0x) + (m0y) * stride, \
               fref + (m1x) + (m1y) * stride, \
               fref + (m2x) + (m2y) * stride, \
               fref + (m3x) + (m3y) * stride, \
               stride, costs); \
        costs[0] += mvcost(MV(m0x, m0y) << 2); \
        costs[1] += mvcost(MV(m1x, m1y) << 2); \
        costs[2] += mvcost(MV(m2x, m2y) << 2); \
        costs[3] += mvcost(MV(m3x, m3y) << 2); \
        COPY4_IF_LT(bcost, costs[0], bmv, MV(m0x, m0y), bPointNr, p0, bDistance, d0); \
        COPY4_IF_LT(bcost, costs[1], bmv, MV(m1x, m1y), bPointNr, p1, bDistance, d1); \
        COPY4_IF_LT(bcost, costs[2], bmv, MV(m2x, m2y), bPointNr, p2, bDistance, d2); \
        COPY4_IF_LT(bcost, costs[3], bmv, MV(m3x, m3y), bPointNr, p3, bDistance, d3); \
    }
/* SAD four offsets around the search origin omv with one sad_x4 call, add
 * mvcost to each, and fold any improvement into bcost/bmv.
 * NOTE(review): the 'sad_x4(fenc, ... stride, costs);' call framing and the
 * braces were dropped by the extraction; restored to match the sad_x4
 * primitive hooked up in setSourcePU. */
#define COST_MV_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y) \
    { \
        pixel *pix_base = fref + omv.x + omv.y * stride; \
        sad_x4(fenc, \
               pix_base + (m0x) + (m0y) * stride, \
               pix_base + (m1x) + (m1y) * stride, \
               pix_base + (m2x) + (m2y) * stride, \
               pix_base + (m3x) + (m3y) * stride, \
               stride, costs); \
        costs[0] += mvcost((omv + MV(m0x, m0y)) << 2); \
        costs[1] += mvcost((omv + MV(m1x, m1y)) << 2); \
        costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \
        costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \
        COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
        COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
        COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
        COPY2_IF_LT(bcost, costs[3], bmv, omv + MV(m3x, m3y)); \
    }
/* SAD four offsets around bmv with one sad_x4 call, adding mvcost to each;
 * results land in (costs)[0..3] and the caller folds them in with direction
 * bookkeeping (unlike COST_MV_X4, no COPY here).
 * NOTE(review): the 'sad_x4(fenc, ... stride, costs);' call framing and the
 * braces were dropped by the extraction; restored to match the sad_x4
 * primitive hooked up in setSourcePU. */
#define COST_MV_X4_DIR(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs) \
    { \
        pixel *pix_base = fref + bmv.x + bmv.y * stride; \
        sad_x4(fenc, \
               pix_base + (m0x) + (m0y) * stride, \
               pix_base + (m1x) + (m1y) * stride, \
               pix_base + (m2x) + (m2y) * stride, \
               pix_base + (m3x) + (m3y) * stride, \
               stride, costs); \
        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
        (costs)[3] += mvcost((bmv + MV(m3x, m3y)) << 2); \
    }
/* One small-diamond (radius 1) iteration centered at (mx, my): moves the
 * search origin omv then probes the four edge neighbors via COST_MV_X4.
 * NOTE(review): surrounding braces were dropped by the extraction; restored. */
#define DIA1_ITER(mx, my) \
    { \
        omv.x = mx; omv.y = my; \
        COST_MV_X4(0, -1, 0, 1, -1, 0, 1, 0); \
    }
/* Cross-pattern search: scan horizontally out to x_max then vertically out to
 * y_max from the origin omv, starting at radius 'start' and stepping by 2.
 * When the whole span fits inside the MV range, a vectorized 4-at-a-time
 * COST_MV_X4 pass covers most of it; the tail (or the range-clipped case)
 * falls back to per-point COST_MV with explicit bounds checks.
 * NOTE(review): the opening braces, 'int16_t i = start;' and the 'i = start;'
 * reset between the horizontal and vertical halves were dropped by the
 * extraction; restored (the reset is required or the vertical scan would
 * begin where the horizontal one finished). */
#define CROSS(start, x_max, y_max) \
    { \
        int16_t i = start; \
        if ((x_max) <= X265_MIN(mvmax.x - omv.x, omv.x - mvmin.x)) \
            for (; i < (x_max) - 2; i += 4) \
            { \
                COST_MV_X4(i, 0, -i, 0, i + 2, 0, -i - 2, 0); \
            } \
        for (; i < (x_max); i += 2) \
        { \
            if (omv.x + i <= mvmax.x) \
                COST_MV(omv.x + i, omv.y); \
            if (omv.x - i >= mvmin.x) \
                COST_MV(omv.x - i, omv.y); \
        } \
        i = start; \
        if ((y_max) <= X265_MIN(mvmax.y - omv.y, omv.y - mvmin.y)) \
            for (; i < (y_max) - 2; i += 4) \
            { \
                COST_MV_X4(0, i, 0, -i, 0, i + 2, 0, -i - 2); \
            } \
        for (; i < (y_max); i += 2) \
        { \
            if (omv.y + i <= mvmax.y) \
                COST_MV(omv.x, omv.y + i); \
            if (omv.y - i >= mvmin.y) \
                COST_MV(omv.x, omv.y - i); \
        } \
    }
283 void MotionEstimate::StarPatternSearch(ReferencePlanes
*ref
,
293 ALIGN_VAR_16(int, costs
[16]);
294 pixel
*fref
= ref
->fpelPlane
+ blockOffset
;
295 size_t stride
= ref
->lumaStride
;
309 const int16_t top
= omv
.y
- dist
;
310 const int16_t bottom
= omv
.y
+ dist
;
311 const int16_t left
= omv
.x
- dist
;
312 const int16_t right
= omv
.x
+ dist
;
314 if (top
>= mvmin
.y
&& left
>= mvmin
.x
&& right
<= mvmax
.x
&& bottom
<= mvmax
.y
)
316 COST_MV_PT_DIST_X4(omv
.x
, top
, 2, dist
,
317 left
, omv
.y
, 4, dist
,
318 right
, omv
.y
, 5, dist
,
319 omv
.x
, bottom
, 7, dist
);
323 if (top
>= mvmin
.y
) // check top
325 COST_MV_PT_DIST(omv
.x
, top
, 2, dist
);
327 if (left
>= mvmin
.x
) // check middle left
329 COST_MV_PT_DIST(left
, omv
.y
, 4, dist
);
331 if (right
<= mvmax
.x
) // check middle right
333 COST_MV_PT_DIST(right
, omv
.y
, 5, dist
);
335 if (bottom
<= mvmax
.y
) // check bottom
337 COST_MV_PT_DIST(omv
.x
, bottom
, 7, dist
);
342 else if (++rounds
>= earlyExitIters
)
346 for (int16_t dist
= 2; dist
<= 8; dist
<<= 1)
354 Points 2, 4, 5, 7 are dist
355 Points 1, 3, 6, 8 are dist>>1
357 const int16_t top
= omv
.y
- dist
;
358 const int16_t bottom
= omv
.y
+ dist
;
359 const int16_t left
= omv
.x
- dist
;
360 const int16_t right
= omv
.x
+ dist
;
361 const int16_t top2
= omv
.y
- (dist
>> 1);
362 const int16_t bottom2
= omv
.y
+ (dist
>> 1);
363 const int16_t left2
= omv
.x
- (dist
>> 1);
364 const int16_t right2
= omv
.x
+ (dist
>> 1);
367 if (top
>= mvmin
.y
&& left
>= mvmin
.x
&&
368 right
<= mvmax
.x
&& bottom
<= mvmax
.y
) // check border
370 COST_MV_PT_DIST_X4(omv
.x
, top
, 2, dist
,
371 left2
, top2
, 1, dist
>> 1,
372 right2
, top2
, 3, dist
>> 1,
373 left
, omv
.y
, 4, dist
);
374 COST_MV_PT_DIST_X4(right
, omv
.y
, 5, dist
,
375 left2
, bottom2
, 6, dist
>> 1,
376 right2
, bottom2
, 8, dist
>> 1,
377 omv
.x
, bottom
, 7, dist
);
379 else // check border for each mv
381 if (top
>= mvmin
.y
) // check top
383 COST_MV_PT_DIST(omv
.x
, top
, 2, dist
);
385 if (top2
>= mvmin
.y
) // check half top
387 if (left2
>= mvmin
.x
) // check half left
389 COST_MV_PT_DIST(left2
, top2
, 1, (dist
>> 1));
391 if (right2
<= mvmax
.x
) // check half right
393 COST_MV_PT_DIST(right2
, top2
, 3, (dist
>> 1));
396 if (left
>= mvmin
.x
) // check left
398 COST_MV_PT_DIST(left
, omv
.y
, 4, dist
);
400 if (right
<= mvmax
.x
) // check right
402 COST_MV_PT_DIST(right
, omv
.y
, 5, dist
);
404 if (bottom2
<= mvmax
.y
) // check half bottom
406 if (left2
>= mvmin
.x
) // check half left
408 COST_MV_PT_DIST(left2
, bottom2
, 6, (dist
>> 1));
410 if (right2
<= mvmax
.x
) // check half right
412 COST_MV_PT_DIST(right2
, bottom2
, 8, (dist
>> 1));
415 if (bottom
<= mvmax
.y
) // check bottom
417 COST_MV_PT_DIST(omv
.x
, bottom
, 7, dist
);
423 else if (++rounds
>= earlyExitIters
)
427 for (int16_t dist
= 16; dist
<= (int16_t)merange
; dist
<<= 1)
429 const int16_t top
= omv
.y
- dist
;
430 const int16_t bottom
= omv
.y
+ dist
;
431 const int16_t left
= omv
.x
- dist
;
432 const int16_t right
= omv
.x
+ dist
;
435 if (top
>= mvmin
.y
&& left
>= mvmin
.x
&&
436 right
<= mvmax
.x
&& bottom
<= mvmax
.y
) // check border
450 COST_MV_PT_DIST_X4(omv
.x
, top
, 0, dist
,
451 left
, omv
.y
, 0, dist
,
452 right
, omv
.y
, 0, dist
,
453 omv
.x
, bottom
, 0, dist
);
455 for (int16_t index
= 1; index
< 4; index
++)
457 int16_t posYT
= top
+ ((dist
>> 2) * index
);
458 int16_t posYB
= bottom
- ((dist
>> 2) * index
);
459 int16_t posXL
= omv
.x
- ((dist
>> 2) * index
);
460 int16_t posXR
= omv
.x
+ ((dist
>> 2) * index
);
462 COST_MV_PT_DIST_X4(posXL
, posYT
, 0, dist
,
463 posXR
, posYT
, 0, dist
,
464 posXL
, posYB
, 0, dist
,
465 posXR
, posYB
, 0, dist
);
468 else // check border for each mv
470 if (top
>= mvmin
.y
) // check top
472 COST_MV_PT_DIST(omv
.x
, top
, 0, dist
);
474 if (left
>= mvmin
.x
) // check left
476 COST_MV_PT_DIST(left
, omv
.y
, 0, dist
);
478 if (right
<= mvmax
.x
) // check right
480 COST_MV_PT_DIST(right
, omv
.y
, 0, dist
);
482 if (bottom
<= mvmax
.y
) // check bottom
484 COST_MV_PT_DIST(omv
.x
, bottom
, 0, dist
);
486 for (int16_t index
= 1; index
< 4; index
++)
488 int16_t posYT
= top
+ ((dist
>> 2) * index
);
489 int16_t posYB
= bottom
- ((dist
>> 2) * index
);
490 int16_t posXL
= omv
.x
- ((dist
>> 2) * index
);
491 int16_t posXR
= omv
.x
+ ((dist
>> 2) * index
);
493 if (posYT
>= mvmin
.y
) // check top
495 if (posXL
>= mvmin
.x
) // check left
497 COST_MV_PT_DIST(posXL
, posYT
, 0, dist
);
499 if (posXR
<= mvmax
.x
) // check right
501 COST_MV_PT_DIST(posXR
, posYT
, 0, dist
);
504 if (posYB
<= mvmax
.y
) // check bottom
506 if (posXL
>= mvmin
.x
) // check left
508 COST_MV_PT_DIST(posXL
, posYB
, 0, dist
);
510 if (posXR
<= mvmax
.x
) // check right
512 COST_MV_PT_DIST(posXR
, posYB
, 0, dist
);
520 else if (++rounds
>= earlyExitIters
)
525 int MotionEstimate::motionEstimate(ReferencePlanes
*ref
,
534 ALIGN_VAR_16(int, costs
[16]);
535 size_t stride
= ref
->lumaStride
;
536 pixel
*fref
= ref
->fpelPlane
+ blockOffset
;
540 MV qmvmin
= mvmin
.toQPel();
541 MV qmvmax
= mvmax
.toQPel();
543 /* The term cost used here means satd/sad values for that particular search.
544 * The costs used in ME integer search only includes the SAD cost of motion
545 * residual and sqrtLambda times MVD bits. The subpel refine steps use SATD
546 * cost of residual and sqrtLambda * MVD bits. Mode decision will be based
547 * on video distortion cost (SSE/PSNR) plus lambda times all signaling bits
548 * (mode + MVD bits). */
550 // measure SAD cost at clipped QPEL MVP
551 MV pmv
= qmvp
.clipped(qmvmin
, qmvmax
);
556 bprecost
= ref
->lowresQPelCost(fenc
, blockOffset
, pmv
, sad
);
558 bprecost
= subpelCompare(ref
, pmv
, sad
);
560 /* re-measure full pel rounded MVP with SAD as search start point */
561 MV bmv
= pmv
.roundToFPel();
562 int bcost
= bprecost
;
565 bcost
= sad(fenc
, FENC_STRIDE
, fref
+ bmv
.x
+ bmv
.y
* stride
, stride
) + mvcost(bmv
<< 2);
568 // measure SAD cost at MV(0) if MVP is not zero
571 int cost
= sad(fenc
, FENC_STRIDE
, fref
, stride
) + mvcost(MV(0, 0));
579 // measure SAD cost at each QPEL motion vector candidate
580 for (int i
= 0; i
< numCandidates
; i
++)
582 MV m
= mvc
[i
].clipped(qmvmin
, qmvmax
);
583 if (m
.notZero() && m
!= pmv
&& m
!= bestpre
) // check already measured
587 cost
= ref
->lowresQPelCost(fenc
, blockOffset
, m
, sad
) + mvcost(m
);
589 cost
= subpelCompare(ref
, m
, sad
) + mvcost(m
);
599 pmv
= pmv
.roundToFPel();
600 MV omv
= bmv
; // current search origin or starting point
602 switch (searchMethod
)
604 case X265_DIA_SEARCH
:
606 /* diamond search, radius 1 */
611 COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs
);
612 COPY1_IF_LT(bcost
, (costs
[0] << 4) + 1);
613 COPY1_IF_LT(bcost
, (costs
[1] << 4) + 3);
614 COPY1_IF_LT(bcost
, (costs
[2] << 4) + 4);
615 COPY1_IF_LT(bcost
, (costs
[3] << 4) + 12);
618 bmv
.x
-= (bcost
<< 28) >> 30;
619 bmv
.y
-= (bcost
<< 30) >> 30;
622 while (--i
&& bmv
.checkRange(mvmin
, mvmax
));
627 case X265_HEX_SEARCH
:
630 /* hexagon search, radius 2 */
632 for (int i
= 0; i
< merange
/ 2; i
++)
635 COST_MV(omv
.x
- 2, omv
.y
);
636 COST_MV(omv
.x
- 1, omv
.y
+ 2);
637 COST_MV(omv
.x
+ 1, omv
.y
+ 2);
638 COST_MV(omv
.x
+ 2, omv
.y
);
639 COST_MV(omv
.x
+ 1, omv
.y
- 2);
640 COST_MV(omv
.x
- 1, omv
.y
- 2);
643 if (!bmv
.checkRange(mvmin
, mvmax
))
648 /* equivalent to the above, but eliminates duplicate candidates */
649 COST_MV_X3_DIR(-2, 0, -1, 2, 1, 2, costs
);
651 COPY1_IF_LT(bcost
, (costs
[0] << 3) + 2);
652 COPY1_IF_LT(bcost
, (costs
[1] << 3) + 3);
653 COPY1_IF_LT(bcost
, (costs
[2] << 3) + 4);
654 COST_MV_X3_DIR(2, 0, 1, -2, -1, -2, costs
);
655 COPY1_IF_LT(bcost
, (costs
[0] << 3) + 5);
656 COPY1_IF_LT(bcost
, (costs
[1] << 3) + 6);
657 COPY1_IF_LT(bcost
, (costs
[2] << 3) + 7);
661 int dir
= (bcost
& 7) - 2;
662 bmv
+= hex2
[dir
+ 1];
664 /* half hexagon, not overlapping the previous iteration */
665 for (int i
= (merange
>> 1) - 1; i
> 0 && bmv
.checkRange(mvmin
, mvmax
); i
--)
667 COST_MV_X3_DIR(hex2
[dir
+ 0].x
, hex2
[dir
+ 0].y
,
668 hex2
[dir
+ 1].x
, hex2
[dir
+ 1].y
,
669 hex2
[dir
+ 2].x
, hex2
[dir
+ 2].y
,
672 COPY1_IF_LT(bcost
, (costs
[0] << 3) + 1);
673 COPY1_IF_LT(bcost
, (costs
[1] << 3) + 2);
674 COPY1_IF_LT(bcost
, (costs
[2] << 3) + 3);
677 dir
+= (bcost
& 7) - 2;
678 dir
= mod6m1
[dir
+ 1];
679 bmv
+= hex2
[dir
+ 1];
687 COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs
);
688 COPY2_IF_LT(bcost
, costs
[0], dir
, 1);
689 COPY2_IF_LT(bcost
, costs
[1], dir
, 2);
690 COPY2_IF_LT(bcost
, costs
[2], dir
, 3);
691 COPY2_IF_LT(bcost
, costs
[3], dir
, 4);
692 COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs
);
693 COPY2_IF_LT(bcost
, costs
[0], dir
, 5);
694 COPY2_IF_LT(bcost
, costs
[1], dir
, 6);
695 COPY2_IF_LT(bcost
, costs
[2], dir
, 7);
696 COPY2_IF_LT(bcost
, costs
[3], dir
, 8);
701 case X265_UMH_SEARCH
:
704 int16_t cross_start
= 1;
706 /* refine predictors */
709 DIA1_ITER(pmv
.x
, pmv
.y
);
714 if (bmv
.notZero() && bmv
!= pmv
)
715 DIA1_ITER(bmv
.x
, bmv
.y
);
719 /* Early Termination */
721 if (bcost
== ucost2
&& SAD_THRESH(2000))
723 COST_MV_X4(0, -2, -1, -1, 1, -1, -2, 0);
724 COST_MV_X4(2, 0, -1, 1, 1, 1, 0, 2);
725 if (bcost
== ucost1
&& SAD_THRESH(500))
729 int16_t range
= (int16_t)(merange
>> 1) | 1;
730 CROSS(3, range
, range
);
731 COST_MV_X4(-1, -2, 1, -2, -2, -1, 2, -1);
732 COST_MV_X4(-2, 1, 2, 1, -1, 2, 1, 2);
735 cross_start
= range
+ 2;
739 // TODO: Need to study x264's logic for building mvc list to understand why they
740 // have special cases here for 16x16, and whether they apply to HEVC CTU
742 // adaptive search range based on mvc variability
745 /* range multipliers based on casual inspection of some statistics of
746 * average distance between current predictor and final mv found by ESA.
747 * these have not been tuned much by actual encoding. */
748 static const uint8_t range_mul
[4][4] =
757 int sad_ctx
, mvd_ctx
;
760 if (numCandidates
== 1)
762 if (LUMA_64x64
== partEnum
)
763 /* mvc is probably the same as mvp, so the difference isn't meaningful.
764 * but prediction usually isn't too bad, so just use medium range */
767 mvd
= abs(qmvp
.x
- mvc
[0].x
) + abs(qmvp
.y
- mvc
[0].y
);
771 /* calculate the degree of agreement between predictors. */
773 /* in 64x64, mvc includes all the neighbors used to make mvp,
774 * so don't count mvp separately. */
776 denom
= numCandidates
- 1;
778 if (partEnum
!= LUMA_64x64
)
780 mvd
= abs(qmvp
.x
- mvc
[0].x
) + abs(qmvp
.y
- mvc
[0].y
);
783 mvd
+= x265_predictor_difference(mvc
, numCandidates
);
786 sad_ctx
= SAD_THRESH(1000) ? 0
787 : SAD_THRESH(2000) ? 1
788 : SAD_THRESH(4000) ? 2 : 3;
789 mvd_ctx
= mvd
< 10 * denom
? 0
790 : mvd
< 20 * denom
? 1
791 : mvd
< 40 * denom
? 2 : 3;
793 merange
= (merange
* range_mul
[mvd_ctx
][sad_ctx
]) >> 2;
796 /* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy.
797 * we are still centered on the same place as the DIA2. is this desirable? */
798 CROSS(cross_start
, merange
, merange
>> 1);
799 COST_MV_X4(-2, -2, -2, 2, 2, -2, 2, 2);
803 const uint16_t *p_cost_omvx
= m_cost_mvx
+ omv
.x
* 4;
804 const uint16_t *p_cost_omvy
= m_cost_mvy
+ omv
.y
* 4;
808 if (4 * i
> X265_MIN4(mvmax
.x
- omv
.x
, omv
.x
- mvmin
.x
,
809 mvmax
.y
- omv
.y
, omv
.y
- mvmin
.y
))
811 for (int j
= 0; j
< 16; j
++)
813 MV mv
= omv
+ (hex4
[j
] * i
);
814 if (mv
.checkRange(mvmin
, mvmax
))
821 pixel
*fref_base
= fref
+ omv
.x
+ (omv
.y
- 4 * i
) * stride
;
822 size_t dy
= (size_t)i
* stride
;
823 #define SADS(k, x0, y0, x1, y1, x2, y2, x3, y3) \
825 fref_base x0 * i + (y0 - 2 * k + 4) * dy, \
826 fref_base x1 * i + (y1 - 2 * k + 4) * dy, \
827 fref_base x2 * i + (y2 - 2 * k + 4) * dy, \
828 fref_base x3 * i + (y3 - 2 * k + 4) * dy, \
829 stride, costs + 4 * k); \
831 #define ADD_MVCOST(k, x, y) costs[k] += p_cost_omvx[x * 4 * i] + p_cost_omvy[y * 4 * i]
832 #define MIN_MV(k, x, y) COPY2_IF_LT(bcost, costs[k], dir, x * 16 + (y & 15))
834 SADS(0, +0, -4, +0, +4, -2, -3, +2, -3);
835 SADS(1, -4, -2, +4, -2, -4, -1, +4, -1);
836 SADS(2, -4, +0, +4, +0, -4, +1, +4, +1);
837 SADS(3, -4, +2, +4, +2, -2, +3, +2, +3);
838 ADD_MVCOST(0, 0, -4);
840 ADD_MVCOST(2, -2, -3);
841 ADD_MVCOST(3, 2, -3);
842 ADD_MVCOST(4, -4, -2);
843 ADD_MVCOST(5, 4, -2);
844 ADD_MVCOST(6, -4, -1);
845 ADD_MVCOST(7, 4, -1);
846 ADD_MVCOST(8, -4, 0);
848 ADD_MVCOST(10, -4, 1);
849 ADD_MVCOST(11, 4, 1);
850 ADD_MVCOST(12, -4, 2);
851 ADD_MVCOST(13, 4, 2);
852 ADD_MVCOST(14, -2, 3);
853 ADD_MVCOST(15, 2, 3);
875 bmv
.x
= omv
.x
+ i
* (dir
>> 4);
876 bmv
.y
= omv
.y
+ i
* ((dir
<< 28) >> 28);
880 while (++i
<= merange
>> 2);
881 if (bmv
.checkRange(mvmin
, mvmax
))
886 case X265_STAR_SEARCH
: // Adapted from HM ME
891 const int EarlyExitIters
= 3;
892 StarPatternSearch(ref
, mvmin
, mvmax
, bmv
, bcost
, bPointNr
, bDistance
, EarlyExitIters
, merange
);
895 // if best distance was only 1, check two missing points. If no new point is found, stop
898 /* For a given direction 1 to 8, check nearest two outer X pixels
906 const MV mv1
= bmv
+ offsets
[(bPointNr
- 1) * 2];
907 const MV mv2
= bmv
+ offsets
[(bPointNr
- 1) * 2 + 1];
908 if (mv1
.checkRange(mvmin
, mvmax
))
910 COST_MV(mv1
.x
, mv1
.y
);
912 if (mv2
.checkRange(mvmin
, mvmax
))
914 COST_MV(mv2
.x
, mv2
.y
);
923 const int RasterDistance
= 5;
924 if (bDistance
> RasterDistance
)
926 // raster search refinement if original search distance was too big
928 for (tmv
.y
= mvmin
.y
; tmv
.y
<= mvmax
.y
; tmv
.y
+= RasterDistance
)
930 for (tmv
.x
= mvmin
.x
; tmv
.x
<= mvmax
.x
; tmv
.x
+= RasterDistance
)
932 if (tmv
.x
+ (RasterDistance
* 3) <= mvmax
.x
)
934 pixel
*pix_base
= fref
+ tmv
.y
* stride
+ tmv
.x
;
937 pix_base
+ RasterDistance
,
938 pix_base
+ RasterDistance
* 2,
939 pix_base
+ RasterDistance
* 3,
941 costs
[0] += mvcost(tmv
<< 2);
942 COPY2_IF_LT(bcost
, costs
[0], bmv
, tmv
);
943 tmv
.x
+= RasterDistance
;
944 costs
[1] += mvcost(tmv
<< 2);
945 COPY2_IF_LT(bcost
, costs
[1], bmv
, tmv
);
946 tmv
.x
+= RasterDistance
;
947 costs
[2] += mvcost(tmv
<< 2);
948 COPY2_IF_LT(bcost
, costs
[2], bmv
, tmv
);
949 tmv
.x
+= RasterDistance
;
950 costs
[3] += mvcost(tmv
<< 3);
951 COPY2_IF_LT(bcost
, costs
[3], bmv
, tmv
);
954 COST_MV(tmv
.x
, tmv
.y
);
959 while (bDistance
> 0)
961 // center a new search around current best
964 const int MaxIters
= 32;
965 StarPatternSearch(ref
, mvmin
, mvmax
, bmv
, bcost
, bPointNr
, bDistance
, MaxIters
, merange
);
972 /* For a given direction 1 to 8, check nearest 2 outer X pixels
979 const MV mv1
= bmv
+ offsets
[(bPointNr
- 1) * 2];
980 const MV mv2
= bmv
+ offsets
[(bPointNr
- 1) * 2 + 1];
981 if (mv1
.checkRange(mvmin
, mvmax
))
983 COST_MV(mv1
.x
, mv1
.y
);
985 if (mv2
.checkRange(mvmin
, mvmax
))
987 COST_MV(mv2
.x
, mv2
.y
);
996 case X265_FULL_SEARCH
:
998 // dead slow exhaustive search, but at least it uses sad_x4()
1000 for (tmv
.y
= mvmin
.y
; tmv
.y
<= mvmax
.y
; tmv
.y
++)
1002 for (tmv
.x
= mvmin
.x
; tmv
.x
<= mvmax
.x
; tmv
.x
++)
1004 if (tmv
.x
+ 3 <= mvmax
.x
)
1006 pixel
*pix_base
= fref
+ tmv
.y
* stride
+ tmv
.x
;
1013 costs
[0] += mvcost(tmv
<< 2);
1014 COPY2_IF_LT(bcost
, costs
[0], bmv
, tmv
);
1016 costs
[1] += mvcost(tmv
<< 2);
1017 COPY2_IF_LT(bcost
, costs
[1], bmv
, tmv
);
1019 costs
[2] += mvcost(tmv
<< 2);
1020 COPY2_IF_LT(bcost
, costs
[2], bmv
, tmv
);
1022 costs
[3] += mvcost(tmv
<< 2);
1023 COPY2_IF_LT(bcost
, costs
[3], bmv
, tmv
);
1026 COST_MV(tmv
.x
, tmv
.y
);
1034 X265_CHECK(0, "invalid motion estimate mode\n");
1038 if (bprecost
< bcost
)
1044 bmv
= bmv
.toQPel(); // promote search bmv to qpel
1046 SubpelWorkload
& wl
= workload
[this->subpelRefine
];
1050 /* if there was zero residual at the clipped MVP, we can skip subpel
1051 * refine, but we do need to include the mvcost in the returned cost */
1052 bcost
= mvcost(bmv
);
1054 else if (ref
->isLowres
)
1057 for (int i
= 1; i
<= wl
.hpel_dirs
; i
++)
1059 MV qmv
= bmv
+ square1
[i
] * 2;
1060 cost
= ref
->lowresQPelCost(fenc
, blockOffset
, qmv
, sad
) + mvcost(qmv
);
1061 COPY2_IF_LT(bcost
, cost
, bdir
, i
);
1064 bmv
+= square1
[bdir
] * 2;
1065 bcost
= ref
->lowresQPelCost(fenc
, blockOffset
, bmv
, satd
) + mvcost(bmv
);
1068 for (int i
= 1; i
<= wl
.qpel_dirs
; i
++)
1070 MV qmv
= bmv
+ square1
[i
];
1071 cost
= ref
->lowresQPelCost(fenc
, blockOffset
, qmv
, satd
) + mvcost(qmv
);
1072 COPY2_IF_LT(bcost
, cost
, bdir
, i
);
1075 bmv
+= square1
[bdir
];
1079 pixelcmp_t hpelcomp
;
1083 bcost
= subpelCompare(ref
, bmv
, satd
) + mvcost(bmv
);
1089 for (int iter
= 0; iter
< wl
.hpel_iters
; iter
++)
1092 for (int i
= 1; i
<= wl
.hpel_dirs
; i
++)
1094 MV qmv
= bmv
+ square1
[i
] * 2;
1095 cost
= subpelCompare(ref
, qmv
, hpelcomp
) + mvcost(qmv
);
1096 COPY2_IF_LT(bcost
, cost
, bdir
, i
);
1100 bmv
+= square1
[bdir
] * 2;
1105 /* if HPEL search used SAD, remeasure with SATD before QPEL */
1107 bcost
= subpelCompare(ref
, bmv
, satd
) + mvcost(bmv
);
1109 for (int iter
= 0; iter
< wl
.qpel_iters
; iter
++)
1112 for (int i
= 1; i
<= wl
.qpel_dirs
; i
++)
1114 MV qmv
= bmv
+ square1
[i
];
1115 cost
= subpelCompare(ref
, qmv
, satd
) + mvcost(qmv
);
1116 COPY2_IF_LT(bcost
, cost
, bdir
, i
);
1120 bmv
+= square1
[bdir
];
1131 int MotionEstimate::subpelCompare(ReferencePlanes
*ref
, const MV
& qmv
, pixelcmp_t cmp
)
1133 int xFrac
= qmv
.x
& 0x3;
1134 int yFrac
= qmv
.y
& 0x3;
1136 if ((yFrac
| xFrac
) == 0)
1138 pixel
*fref
= ref
->fpelPlane
+ blockOffset
+ (qmv
.x
>> 2) + (qmv
.y
>> 2) * ref
->lumaStride
;
1139 return cmp(fenc
, FENC_STRIDE
, fref
, ref
->lumaStride
);
1143 /* We are taking a short-cut here if the reference is weighted. To be
1144 * accurate we should be interpolating unweighted pixels and weighting
1145 * the final 16bit values prior to rounding and downshifting. Instead we
1146 * are simply interpolating the weighted full-pel pixels. Not 100%
1147 * accurate but good enough for fast qpel ME */
1148 ALIGN_VAR_32(pixel
, subpelbuf
[64 * 64]);
1149 pixel
*fref
= ref
->fpelPlane
+ blockOffset
+ (qmv
.x
>> 2) + (qmv
.y
>> 2) * ref
->lumaStride
;
1152 primitives
.luma_hpp
[partEnum
](fref
, ref
->lumaStride
, subpelbuf
, FENC_STRIDE
, xFrac
);
1154 else if (xFrac
== 0)
1156 primitives
.luma_vpp
[partEnum
](fref
, ref
->lumaStride
, subpelbuf
, FENC_STRIDE
, yFrac
);
1160 ALIGN_VAR_32(int16_t, immed
[64 * (64 + 8)]);
1162 int filterSize
= NTAPS_LUMA
;
1163 int halfFilterSize
= filterSize
>> 1;
1164 primitives
.luma_hps
[partEnum
](fref
, ref
->lumaStride
, immed
, blockwidth
, xFrac
, 1);
1165 primitives
.luma_vsp
[partEnum
](immed
+ (halfFilterSize
- 1) * blockwidth
, blockwidth
, subpelbuf
, FENC_STRIDE
, yFrac
);
1167 return cmp(fenc
, FENC_STRIDE
, subpelbuf
, FENC_STRIDE
);