Imported Upstream version 1.4
[deb_x265.git] / source / encoder / motion.cpp
/*****************************************************************************
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

#include "common.h"
#include "primitives.h"
#include "lowres.h"
#include "motion.h"
#include "x265.h"

#if _MSC_VER
#pragma warning(disable: 4127) // conditional expression is constant (macros use this construct)
#endif

using namespace x265;

namespace {
struct SubpelWorkload
{
    int hpel_iters;
    int hpel_dirs;
    int qpel_iters;
    int qpel_dirs;
    bool hpel_satd;
};

SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] =
{
    { 1, 4, 0, 4, false }, // 4 SAD HPEL only
    { 1, 4, 1, 4, false }, // 4 SAD HPEL + 4 SATD QPEL
    { 1, 4, 1, 4, true },  // 4 SATD HPEL + 4 SATD QPEL
    { 2, 4, 1, 4, true },  // 2x4 SATD HPEL + 4 SATD QPEL
    { 2, 4, 2, 4, true },  // 2x4 SATD HPEL + 2x4 SATD QPEL
    { 1, 8, 1, 8, true },  // 8 SATD HPEL + 8 SATD QPEL (default)
    { 2, 8, 1, 8, true },  // 2x8 SATD HPEL + 8 SATD QPEL
    { 2, 8, 2, 8, true },  // 2x8 SATD HPEL + 2x8 SATD QPEL
};
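
/* workload[] is indexed by the subpelRefine level (5, the row marked default
 * above, is also the constructor default). Each row gives how many half-pel
 * and quarter-pel refinement iterations to run, how many square1[] directions
 * each iteration probes, and whether the half-pel stage measures SATD or
 * plain SAD. */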
}

static int size_scale[NUM_LUMA_PARTITIONS];
#define SAD_THRESH(v) (bcost < (((v >> 4) * size_scale[partEnum])))
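/* Rough illustration of the threshold scaling: size_scale is (H * H) >> 4, so
 * a 16x16 partition gets scale 16 and SAD_THRESH(2000) reduces to
 * bcost < (2000 >> 4) * 16 == 2000; larger partitions scale the nominal
 * threshold up, smaller ones scale it down. */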

static void init_scales(void)
{
#define SETUP_SCALE(W, H) \
    size_scale[LUMA_ ## W ## x ## H] = (H * H) >> 4;
    SETUP_SCALE(4, 4);
    SETUP_SCALE(8, 8);
    SETUP_SCALE(8, 4);
    SETUP_SCALE(4, 8);
    SETUP_SCALE(16, 16);
    SETUP_SCALE(16, 8);
    SETUP_SCALE(8, 16);
    SETUP_SCALE(16, 12);
    SETUP_SCALE(12, 16);
    SETUP_SCALE(4, 16);
    SETUP_SCALE(16, 4);
    SETUP_SCALE(32, 32);
    SETUP_SCALE(32, 16);
    SETUP_SCALE(16, 32);
    SETUP_SCALE(32, 24);
    SETUP_SCALE(24, 32);
    SETUP_SCALE(32, 8);
    SETUP_SCALE(8, 32);
    SETUP_SCALE(64, 64);
    SETUP_SCALE(64, 32);
    SETUP_SCALE(32, 64);
    SETUP_SCALE(64, 48);
    SETUP_SCALE(48, 64);
    SETUP_SCALE(64, 16);
    SETUP_SCALE(16, 64);
#undef SETUP_SCALE
}

MotionEstimate::MotionEstimate()
    : searchMethod(3)
    , subpelRefine(5)
{
    if (size_scale[0] == 0)
        init_scales();

    fenc = X265_MALLOC(pixel, MAX_CU_SIZE * MAX_CU_SIZE);
}

MotionEstimate::~MotionEstimate()
{
    X265_FREE(fenc);
}

void MotionEstimate::setSourcePU(intptr_t offset, int width, int height)
{
    partEnum = partitionFromSizes(width, height);
    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
    sad = primitives.sad[partEnum];
    satd = primitives.satd[partEnum];
    sa8d = primitives.sa8d_inter[partEnum];
    sad_x3 = primitives.sad_x3[partEnum];
    sad_x4 = primitives.sad_x4[partEnum];

    blockwidth = width;
    blockheight = height;
    blockOffset = offset;

    /* copy PU block into cache */
    primitives.luma_copy_pp[partEnum](fenc, FENC_STRIDE, fencplane + offset, fencLumaStride);
}
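
/* Rough usage sketch (hypothetical caller, not taken from the encoder): the PU
 * pixels are cached once with setSourcePU(), after which motionEstimate() can
 * be run against each reference. The source plane members used above
 * (fencplane, fencLumaStride) are assumed to have been set beforehand via the
 * setup methods declared in motion.h.
 *
 *     MotionEstimate me;          // searchMethod = 3, subpelRefine = 5 by default
 *     me.setSourcePU(puOffset, puWidth, puHeight);
 *     MV outQMv;
 *     int cost = me.motionEstimate(ref, mvmin, mvmax, mvp,
 *                                  numCandidates, mvcList, merange, outQMv);
 */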

/* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
static const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) };
static const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 }; /* (x-1)%6 */
static const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) };
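/* square1[0] is the null vector; the half-pel and quarter-pel refinement loops
 * at the bottom of motionEstimate() iterate i = 1..hpel_dirs/qpel_dirs, so a
 * 4-direction workload probes only the diamond points square1[1..4] while an
 * 8-direction workload probes the full square. */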
static const MV hex4[16] =
{
    MV(0, -4), MV(0, 4), MV(-2, -3), MV(2, -3),
    MV(-4, -2), MV(4, -2), MV(-4, -1), MV(4, -1),
    MV(-4, 0), MV(4, 0), MV(-4, 1), MV(4, 1),
    MV(-4, 2), MV(4, 2), MV(-2, 3), MV(2, 3),
};
static const MV offsets[] =
{
    MV(-1, 0), MV(0, -1),
    MV(-1, -1), MV(1, -1),
    MV(-1, 0), MV(1, 0),
    MV(-1, 1), MV(-1, -1),
    MV(1, -1), MV(1, 1),
    MV(-1, 0), MV(0, 1),
    MV(-1, 1), MV(1, 1),
    MV(1, 0), MV(0, 1),
}; // offsets for Two Point Search

/* sum of absolute differences between MV candidates */
static inline int x265_predictor_difference(const MV *mvc, intptr_t numCandidates)
{
    int sum = 0;

    for (int i = 0; i < numCandidates - 1; i++)
    {
        sum += abs(mvc[i].x - mvc[i + 1].x)
            + abs(mvc[i].y - mvc[i + 1].y);
    }

    return sum;
}
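
/* Used by the UMH search in motionEstimate() to gauge how much the MV
 * candidates agree with one another; the sum feeds the mvd_ctx index of
 * range_mul[] when the search range is adapted. */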

#define COST_MV_PT_DIST(mx, my, point, dist) \
    do \
    { \
        MV tmv(mx, my); \
        int cost = sad(fenc, FENC_STRIDE, fref + mx + my * stride, stride); \
        cost += mvcost(tmv << 2); \
        if (cost < bcost) { \
            bcost = cost; \
            bmv = tmv; \
            bPointNr = point; \
            bDistance = dist; \
        } \
    } while (0)

#define COST_MV(mx, my) \
    do \
    { \
        int cost = sad(fenc, FENC_STRIDE, fref + (mx) + (my) * stride, stride); \
        cost += mvcost(MV(mx, my) << 2); \
        COPY2_IF_LT(bcost, cost, bmv, MV(mx, my)); \
    } while (0)
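
/* In both macros the candidate (mx, my) is a full-pel position; the '<< 2'
 * converts it to quarter-pel units so mvcost() is evaluated in the same units
 * as the quarter-pel predictor installed via setMVP(). */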

#define COST_MV_X3_DIR(m0x, m0y, m1x, m1y, m2x, m2y, costs) \
    { \
        pixel *pix_base = fref + bmv.x + bmv.y * stride; \
        sad_x3(fenc, \
               pix_base + (m0x) + (m0y) * stride, \
               pix_base + (m1x) + (m1y) * stride, \
               pix_base + (m2x) + (m2y) * stride, \
               stride, costs); \
        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
    }

#define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \
    { \
        sad_x4(fenc, \
               fref + (m0x) + (m0y) * stride, \
               fref + (m1x) + (m1y) * stride, \
               fref + (m2x) + (m2y) * stride, \
               fref + (m3x) + (m3y) * stride, \
               stride, costs); \
        costs[0] += mvcost(MV(m0x, m0y) << 2); \
        costs[1] += mvcost(MV(m1x, m1y) << 2); \
        costs[2] += mvcost(MV(m2x, m2y) << 2); \
        costs[3] += mvcost(MV(m3x, m3y) << 2); \
        COPY4_IF_LT(bcost, costs[0], bmv, MV(m0x, m0y), bPointNr, p0, bDistance, d0); \
        COPY4_IF_LT(bcost, costs[1], bmv, MV(m1x, m1y), bPointNr, p1, bDistance, d1); \
        COPY4_IF_LT(bcost, costs[2], bmv, MV(m2x, m2y), bPointNr, p2, bDistance, d2); \
        COPY4_IF_LT(bcost, costs[3], bmv, MV(m3x, m3y), bPointNr, p3, bDistance, d3); \
    }

#define COST_MV_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y) \
    { \
        pixel *pix_base = fref + omv.x + omv.y * stride; \
        sad_x4(fenc, \
               pix_base + (m0x) + (m0y) * stride, \
               pix_base + (m1x) + (m1y) * stride, \
               pix_base + (m2x) + (m2y) * stride, \
               pix_base + (m3x) + (m3y) * stride, \
               stride, costs); \
        costs[0] += mvcost((omv + MV(m0x, m0y)) << 2); \
        costs[1] += mvcost((omv + MV(m1x, m1y)) << 2); \
        costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \
        costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \
        COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
        COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
        COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
        COPY2_IF_LT(bcost, costs[3], bmv, omv + MV(m3x, m3y)); \
    }

#define COST_MV_X4_DIR(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs) \
    { \
        pixel *pix_base = fref + bmv.x + bmv.y * stride; \
        sad_x4(fenc, \
               pix_base + (m0x) + (m0y) * stride, \
               pix_base + (m1x) + (m1y) * stride, \
               pix_base + (m2x) + (m2y) * stride, \
               pix_base + (m3x) + (m3y) * stride, \
               stride, costs); \
        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
        (costs)[3] += mvcost((bmv + MV(m3x, m3y)) << 2); \
    }

#define DIA1_ITER(mx, my) \
    { \
        omv.x = mx; omv.y = my; \
        COST_MV_X4(0, -1, 0, 1, -1, 0, 1, 0); \
    }

#define CROSS(start, x_max, y_max) \
    { \
        int16_t i = start; \
        if ((x_max) <= X265_MIN(mvmax.x - omv.x, omv.x - mvmin.x)) \
            for (; i < (x_max) - 2; i += 4) { \
                COST_MV_X4(i, 0, -i, 0, i + 2, 0, -i - 2, 0); } \
        for (; i < (x_max); i += 2) \
        { \
            if (omv.x + i <= mvmax.x) \
                COST_MV(omv.x + i, omv.y); \
            if (omv.x - i >= mvmin.x) \
                COST_MV(omv.x - i, omv.y); \
        } \
        i = start; \
        if ((y_max) <= X265_MIN(mvmax.y - omv.y, omv.y - mvmin.y)) \
            for (; i < (y_max) - 2; i += 4) { \
                COST_MV_X4(0, i, 0, -i, 0, i + 2, 0, -i - 2); } \
        for (; i < (y_max); i += 2) \
        { \
            if (omv.y + i <= mvmax.y) \
                COST_MV(omv.x, omv.y + i); \
            if (omv.y - i >= mvmin.y) \
                COST_MV(omv.x, omv.y - i); \
        } \
    }
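
/* CROSS walks first the horizontal and then the vertical axis through omv in
 * steps of two pels, starting at +/-start. When the whole extent fits inside
 * the MV range it batches four positions per sad_x4() call via COST_MV_X4;
 * otherwise it falls back to per-position COST_MV() with explicit clipping. */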

void MotionEstimate::StarPatternSearch(ReferencePlanes *ref,
                                       const MV &       mvmin,
                                       const MV &       mvmax,
                                       MV &             bmv,
                                       int &            bcost,
                                       int &            bPointNr,
                                       int &            bDistance,
                                       int              earlyExitIters,
                                       int              merange)
{
    ALIGN_VAR_16(int, costs[16]);
    pixel *fref = ref->fpelPlane + blockOffset;
    size_t stride = ref->lumaStride;

    MV omv = bmv;
    int saved = bcost;
    int rounds = 0;
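    /* 'rounds' counts consecutive distance rings that fail to improve bcost;
     * the search gives up once earlyExitIters rings in a row find no new best MV. */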

    {
        int16_t dist = 1;

        /* bPointNr
              2
            4 * 5
              7
         */
        const int16_t top = omv.y - dist;
        const int16_t bottom = omv.y + dist;
        const int16_t left = omv.x - dist;
        const int16_t right = omv.x + dist;

        if (top >= mvmin.y && left >= mvmin.x && right <= mvmax.x && bottom <= mvmax.y)
        {
            COST_MV_PT_DIST_X4(omv.x, top,    2, dist,
                               left,  omv.y,  4, dist,
                               right, omv.y,  5, dist,
                               omv.x, bottom, 7, dist);
        }
        else
        {
            if (top >= mvmin.y) // check top
            {
                COST_MV_PT_DIST(omv.x, top, 2, dist);
            }
            if (left >= mvmin.x) // check middle left
            {
                COST_MV_PT_DIST(left, omv.y, 4, dist);
            }
            if (right <= mvmax.x) // check middle right
            {
                COST_MV_PT_DIST(right, omv.y, 5, dist);
            }
            if (bottom <= mvmax.y) // check bottom
            {
                COST_MV_PT_DIST(omv.x, bottom, 7, dist);
            }
        }
        if (bcost < saved)
            rounds = 0;
        else if (++rounds >= earlyExitIters)
            return;
    }

    for (int16_t dist = 2; dist <= 8; dist <<= 1)
    {
        /* bPointNr
              2
             1 3
            4 * 5
             6 8
              7
         Points 2, 4, 5, 7 are dist
         Points 1, 3, 6, 8 are dist>>1
         */
        const int16_t top = omv.y - dist;
        const int16_t bottom = omv.y + dist;
        const int16_t left = omv.x - dist;
        const int16_t right = omv.x + dist;
        const int16_t top2 = omv.y - (dist >> 1);
        const int16_t bottom2 = omv.y + (dist >> 1);
        const int16_t left2 = omv.x - (dist >> 1);
        const int16_t right2 = omv.x + (dist >> 1);
        saved = bcost;

        if (top >= mvmin.y && left >= mvmin.x &&
            right <= mvmax.x && bottom <= mvmax.y) // check border
        {
            COST_MV_PT_DIST_X4(omv.x,  top,     2, dist,
                               left2,  top2,    1, dist >> 1,
                               right2, top2,    3, dist >> 1,
                               left,   omv.y,   4, dist);
            COST_MV_PT_DIST_X4(right,  omv.y,   5, dist,
                               left2,  bottom2, 6, dist >> 1,
                               right2, bottom2, 8, dist >> 1,
                               omv.x,  bottom,  7, dist);
        }
        else // check border for each mv
        {
            if (top >= mvmin.y) // check top
            {
                COST_MV_PT_DIST(omv.x, top, 2, dist);
            }
            if (top2 >= mvmin.y) // check half top
            {
                if (left2 >= mvmin.x) // check half left
                {
                    COST_MV_PT_DIST(left2, top2, 1, (dist >> 1));
                }
                if (right2 <= mvmax.x) // check half right
                {
                    COST_MV_PT_DIST(right2, top2, 3, (dist >> 1));
                }
            }
            if (left >= mvmin.x) // check left
            {
                COST_MV_PT_DIST(left, omv.y, 4, dist);
            }
            if (right <= mvmax.x) // check right
            {
                COST_MV_PT_DIST(right, omv.y, 5, dist);
            }
            if (bottom2 <= mvmax.y) // check half bottom
            {
                if (left2 >= mvmin.x) // check half left
                {
                    COST_MV_PT_DIST(left2, bottom2, 6, (dist >> 1));
                }
                if (right2 <= mvmax.x) // check half right
                {
                    COST_MV_PT_DIST(right2, bottom2, 8, (dist >> 1));
                }
            }
            if (bottom <= mvmax.y) // check bottom
            {
                COST_MV_PT_DIST(omv.x, bottom, 7, dist);
            }
        }

        if (bcost < saved)
            rounds = 0;
        else if (++rounds >= earlyExitIters)
            return;
    }

    for (int16_t dist = 16; dist <= (int16_t)merange; dist <<= 1)
    {
        const int16_t top = omv.y - dist;
        const int16_t bottom = omv.y + dist;
        const int16_t left = omv.x - dist;
        const int16_t right = omv.x + dist;

        saved = bcost;
        if (top >= mvmin.y && left >= mvmin.x &&
            right <= mvmax.x && bottom <= mvmax.y) // check border
        {
            /* index
                  0
                  3
                  2
                  1
              0 3 2 1 * 1 2 3 0
                  1
                  2
                  3
                  0
             */

            COST_MV_PT_DIST_X4(omv.x, top,    0, dist,
                               left,  omv.y,  0, dist,
                               right, omv.y,  0, dist,
                               omv.x, bottom, 0, dist);

            for (int16_t index = 1; index < 4; index++)
            {
                int16_t posYT = top + ((dist >> 2) * index);
                int16_t posYB = bottom - ((dist >> 2) * index);
                int16_t posXL = omv.x - ((dist >> 2) * index);
                int16_t posXR = omv.x + ((dist >> 2) * index);

                COST_MV_PT_DIST_X4(posXL, posYT, 0, dist,
                                   posXR, posYT, 0, dist,
                                   posXL, posYB, 0, dist,
                                   posXR, posYB, 0, dist);
            }
        }
        else // check border for each mv
        {
            if (top >= mvmin.y) // check top
            {
                COST_MV_PT_DIST(omv.x, top, 0, dist);
            }
            if (left >= mvmin.x) // check left
            {
                COST_MV_PT_DIST(left, omv.y, 0, dist);
            }
            if (right <= mvmax.x) // check right
            {
                COST_MV_PT_DIST(right, omv.y, 0, dist);
            }
            if (bottom <= mvmax.y) // check bottom
            {
                COST_MV_PT_DIST(omv.x, bottom, 0, dist);
            }
            for (int16_t index = 1; index < 4; index++)
            {
                int16_t posYT = top + ((dist >> 2) * index);
                int16_t posYB = bottom - ((dist >> 2) * index);
                int16_t posXL = omv.x - ((dist >> 2) * index);
                int16_t posXR = omv.x + ((dist >> 2) * index);

                if (posYT >= mvmin.y) // check top
                {
                    if (posXL >= mvmin.x) // check left
                    {
                        COST_MV_PT_DIST(posXL, posYT, 0, dist);
                    }
                    if (posXR <= mvmax.x) // check right
                    {
                        COST_MV_PT_DIST(posXR, posYT, 0, dist);
                    }
                }
                if (posYB <= mvmax.y) // check bottom
                {
                    if (posXL >= mvmin.x) // check left
                    {
                        COST_MV_PT_DIST(posXL, posYB, 0, dist);
                    }
                    if (posXR <= mvmax.x) // check right
                    {
                        COST_MV_PT_DIST(posXR, posYB, 0, dist);
                    }
                }
            }
        }

        if (bcost < saved)
            rounds = 0;
        else if (++rounds >= earlyExitIters)
            return;
    }
}

int MotionEstimate::motionEstimate(ReferencePlanes *ref,
                                   const MV &       mvmin,
                                   const MV &       mvmax,
                                   const MV &       qmvp,
                                   int              numCandidates,
                                   const MV *       mvc,
                                   int              merange,
                                   MV &             outQMv)
{
    ALIGN_VAR_16(int, costs[16]);
    size_t stride = ref->lumaStride;
    pixel *fref = ref->fpelPlane + blockOffset;

    setMVP(qmvp);

    MV qmvmin = mvmin.toQPel();
    MV qmvmax = mvmax.toQPel();

    /* The term cost used here means satd/sad values for that particular search.
     * The costs used in the ME integer search include only the SAD cost of the
     * motion residual and sqrtLambda times MVD bits. The subpel refine steps use
     * the SATD cost of the residual and sqrtLambda * MVD bits. Mode decision will
     * be based on video distortion cost (SSE/PSNR) plus lambda times all signaling
     * bits (mode + MVD bits). */

    // measure SAD cost at clipped QPEL MVP
    MV pmv = qmvp.clipped(qmvmin, qmvmax);
    MV bestpre = pmv;
    int bprecost;

    if (ref->isLowres)
        bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad);
    else
        bprecost = subpelCompare(ref, pmv, sad);

    /* re-measure full pel rounded MVP with SAD as search start point */
    MV bmv = pmv.roundToFPel();
    int bcost = bprecost;
    if (pmv.isSubpel())
    {
        bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2);
    }

    // measure SAD cost at MV(0) if MVP is not zero
    if (pmv.notZero())
    {
        int cost = sad(fenc, FENC_STRIDE, fref, stride) + mvcost(MV(0, 0));
        if (cost < bcost)
        {
            bcost = cost;
            bmv = 0;
        }
    }

    // measure SAD cost at each QPEL motion vector candidate
    for (int i = 0; i < numCandidates; i++)
    {
        MV m = mvc[i].clipped(qmvmin, qmvmax);
        if (m.notZero() && m != pmv && m != bestpre) // check already measured
        {
            int cost;
            if (ref->isLowres)
                cost = ref->lowresQPelCost(fenc, blockOffset, m, sad) + mvcost(m);
            else
                cost = subpelCompare(ref, m, sad) + mvcost(m);

            if (cost < bprecost)
            {
                bprecost = cost;
                bestpre = m;
            }
        }
    }

    pmv = pmv.roundToFPel();
    MV omv = bmv; // current search origin or starting point

    switch (searchMethod)
    {
    case X265_DIA_SEARCH:
    {
        /* diamond search, radius 1 */
        bcost <<= 4;
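        /* The low four bits of bcost carry a packed copy of the winning diamond
         * offset (codes 1, 3, 4 and 12 below stand for (0,-1), (0,1), (-1,0) and
         * (1,0)). 'bcost & 15' therefore tests whether any neighbour beat the
         * current centre, and the two arithmetic shifts further down sign-extend
         * the 2-bit x and y fields to step bmv without a branch. */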
        int i = merange;
        do
        {
            COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
            COPY1_IF_LT(bcost, (costs[0] << 4) + 1);
            COPY1_IF_LT(bcost, (costs[1] << 4) + 3);
            COPY1_IF_LT(bcost, (costs[2] << 4) + 4);
            COPY1_IF_LT(bcost, (costs[3] << 4) + 12);
            if (!(bcost & 15))
                break;
            bmv.x -= (bcost << 28) >> 30;
            bmv.y -= (bcost << 30) >> 30;
            bcost &= ~15;
        }
        while (--i && bmv.checkRange(mvmin, mvmax));
        bcost >>= 4;
        break;
    }

    case X265_HEX_SEARCH:
    {
me_hex2:
        /* hexagon search, radius 2 */
#if 0
        for (int i = 0; i < merange / 2; i++)
        {
            omv = bmv;
            COST_MV(omv.x - 2, omv.y);
            COST_MV(omv.x - 1, omv.y + 2);
            COST_MV(omv.x + 1, omv.y + 2);
            COST_MV(omv.x + 2, omv.y);
            COST_MV(omv.x + 1, omv.y - 2);
            COST_MV(omv.x - 1, omv.y - 2);
            if (omv == bmv)
                break;
            if (!bmv.checkRange(mvmin, mvmax))
                break;
        }

#else // if 0
        /* equivalent to the above, but eliminates duplicate candidates */
        COST_MV_X3_DIR(-2, 0, -1, 2, 1, 2, costs);
        bcost <<= 3;
        COPY1_IF_LT(bcost, (costs[0] << 3) + 2);
        COPY1_IF_LT(bcost, (costs[1] << 3) + 3);
        COPY1_IF_LT(bcost, (costs[2] << 3) + 4);
        COST_MV_X3_DIR(2, 0, 1, -2, -1, -2, costs);
        COPY1_IF_LT(bcost, (costs[0] << 3) + 5);
        COPY1_IF_LT(bcost, (costs[1] << 3) + 6);
        COPY1_IF_LT(bcost, (costs[2] << 3) + 7);

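        /* Same low-bit trick as the diamond search: the bottom three bits of
         * bcost hold 2..7 naming which of the six hexagon points won (0 means
         * the centre is still best). dir then indexes hex2[], whose duplicated
         * first/last entries together with mod6m1[] avoid a %6 when the search
         * direction wraps during the half-hexagon refinement below. */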
        if (bcost & 7)
        {
            int dir = (bcost & 7) - 2;
            bmv += hex2[dir + 1];

            /* half hexagon, not overlapping the previous iteration */
            for (int i = (merange >> 1) - 1; i > 0 && bmv.checkRange(mvmin, mvmax); i--)
            {
                COST_MV_X3_DIR(hex2[dir + 0].x, hex2[dir + 0].y,
                               hex2[dir + 1].x, hex2[dir + 1].y,
                               hex2[dir + 2].x, hex2[dir + 2].y,
                               costs);
                bcost &= ~7;
                COPY1_IF_LT(bcost, (costs[0] << 3) + 1);
                COPY1_IF_LT(bcost, (costs[1] << 3) + 2);
                COPY1_IF_LT(bcost, (costs[2] << 3) + 3);
                if (!(bcost & 7))
                    break;
                dir += (bcost & 7) - 2;
                dir = mod6m1[dir + 1];
                bmv += hex2[dir + 1];
            }
        }
        bcost >>= 3;
#endif // if 0

        /* square refine */
        int dir = 0;
        COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
        COPY2_IF_LT(bcost, costs[0], dir, 1);
        COPY2_IF_LT(bcost, costs[1], dir, 2);
        COPY2_IF_LT(bcost, costs[2], dir, 3);
        COPY2_IF_LT(bcost, costs[3], dir, 4);
        COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs);
        COPY2_IF_LT(bcost, costs[0], dir, 5);
        COPY2_IF_LT(bcost, costs[1], dir, 6);
        COPY2_IF_LT(bcost, costs[2], dir, 7);
        COPY2_IF_LT(bcost, costs[3], dir, 8);
        bmv += square1[dir];
        break;
    }

    case X265_UMH_SEARCH:
    {
        int ucost1, ucost2;
        int16_t cross_start = 1;

        /* refine predictors */
        omv = bmv;
        ucost1 = bcost;
        DIA1_ITER(pmv.x, pmv.y);
        if (pmv.notZero())
            DIA1_ITER(0, 0);

        ucost2 = bcost;
        if (bmv.notZero() && bmv != pmv)
            DIA1_ITER(bmv.x, bmv.y);
        if (bcost == ucost2)
            cross_start = 3;

        /* Early Termination */
        omv = bmv;
        if (bcost == ucost2 && SAD_THRESH(2000))
        {
            COST_MV_X4(0, -2, -1, -1, 1, -1, -2, 0);
            COST_MV_X4(2, 0, -1, 1, 1, 1, 0, 2);
            if (bcost == ucost1 && SAD_THRESH(500))
                break;
            if (bcost == ucost2)
            {
                int16_t range = (int16_t)(merange >> 1) | 1;
                CROSS(3, range, range);
                COST_MV_X4(-1, -2, 1, -2, -2, -1, 2, -1);
                COST_MV_X4(-2, 1, 2, 1, -1, 2, 1, 2);
                if (bcost == ucost2)
                    break;
                cross_start = range + 2;
            }
        }

        // TODO: Need to study x264's logic for building mvc list to understand why they
        // have special cases here for 16x16, and whether they apply to HEVC CTU

        // adaptive search range based on mvc variability
        if (numCandidates)
        {
            /* range multipliers based on casual inspection of some statistics of
             * average distance between current predictor and final mv found by ESA.
             * these have not been tuned much by actual encoding. */
            static const uint8_t range_mul[4][4] =
            {
                { 3, 3, 4, 4 },
                { 3, 4, 4, 4 },
                { 4, 4, 4, 5 },
                { 4, 4, 5, 6 },
            };

            int mvd;
            int sad_ctx, mvd_ctx;
            int denom = 1;

            if (numCandidates == 1)
            {
                if (LUMA_64x64 == partEnum)
                    /* mvc is probably the same as mvp, so the difference isn't meaningful.
                     * but prediction usually isn't too bad, so just use medium range */
                    mvd = 25;
                else
                    mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
            }
            else
            {
                /* calculate the degree of agreement between predictors. */

                /* in 64x64, mvc includes all the neighbors used to make mvp,
                 * so don't count mvp separately. */

                denom = numCandidates - 1;
                mvd = 0;
                if (partEnum != LUMA_64x64)
                {
                    mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
                    denom++;
                }
                mvd += x265_predictor_difference(mvc, numCandidates);
            }

            sad_ctx = SAD_THRESH(1000) ? 0
                : SAD_THRESH(2000) ? 1
                : SAD_THRESH(4000) ? 2 : 3;
            mvd_ctx = mvd < 10 * denom ? 0
                : mvd < 20 * denom ? 1
                : mvd < 40 * denom ? 2 : 3;

            merange = (merange * range_mul[mvd_ctx][sad_ctx]) >> 2;
        }
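        /* Worked example of the scaling: tightly clustered predictors and a
         * small SAD (mvd_ctx == sad_ctx == 0) give range_mul 3, i.e. merange is
         * cut to 3/4; widely scattered predictors with a large SAD (both
         * contexts 3) give 6, i.e. merange grows to 3/2. */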

        /* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy.
         * we are still centered on the same place as the DIA2. is this desirable? */
        CROSS(cross_start, merange, merange >> 1);
        COST_MV_X4(-2, -2, -2, 2, 2, -2, 2, 2);

        /* hexagon grid */
        omv = bmv;
        const uint16_t *p_cost_omvx = m_cost_mvx + omv.x * 4;
        const uint16_t *p_cost_omvy = m_cost_mvy + omv.y * 4;
        uint16_t i = 1;
        do
        {
            if (4 * i > X265_MIN4(mvmax.x - omv.x, omv.x - mvmin.x,
                                  mvmax.y - omv.y, omv.y - mvmin.y))
            {
                for (int j = 0; j < 16; j++)
                {
                    MV mv = omv + (hex4[j] * i);
                    if (mv.checkRange(mvmin, mvmax))
                        COST_MV(mv.x, mv.y);
                }
            }
            else
            {
                int16_t dir = 0;
                pixel *fref_base = fref + omv.x + (omv.y - 4 * i) * stride;
                size_t dy = (size_t)i * stride;
#define SADS(k, x0, y0, x1, y1, x2, y2, x3, y3) \
    sad_x4(fenc, \
           fref_base x0 * i + (y0 - 2 * k + 4) * dy, \
           fref_base x1 * i + (y1 - 2 * k + 4) * dy, \
           fref_base x2 * i + (y2 - 2 * k + 4) * dy, \
           fref_base x3 * i + (y3 - 2 * k + 4) * dy, \
           stride, costs + 4 * k); \
    fref_base += 2 * dy;
#define ADD_MVCOST(k, x, y) costs[k] += p_cost_omvx[x * 4 * i] + p_cost_omvy[y * 4 * i]
#define MIN_MV(k, x, y) COPY2_IF_LT(bcost, costs[k], dir, x * 16 + (y & 15))
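/* Notes on the three helpers above: the x0..y3 arguments of SADS() are written
 * with explicit signs (+0, -4, ...) so the sign pastes directly into the
 * pointer arithmetic against fref_base, and each SADS() row measures four of
 * the sixteen hex4[] points at scale i. MIN_MV() packs the winning offset as
 * x * 16 + (y & 15); afterwards 'dir >> 4' recovers x and the sign-extended
 * low nibble ((dir << 28) >> 28) recovers y. */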

                SADS(0, +0, -4, +0, +4, -2, -3, +2, -3);
                SADS(1, -4, -2, +4, -2, -4, -1, +4, -1);
                SADS(2, -4, +0, +4, +0, -4, +1, +4, +1);
                SADS(3, -4, +2, +4, +2, -2, +3, +2, +3);
                ADD_MVCOST(0, 0, -4);
                ADD_MVCOST(1, 0, 4);
                ADD_MVCOST(2, -2, -3);
                ADD_MVCOST(3, 2, -3);
                ADD_MVCOST(4, -4, -2);
                ADD_MVCOST(5, 4, -2);
                ADD_MVCOST(6, -4, -1);
                ADD_MVCOST(7, 4, -1);
                ADD_MVCOST(8, -4, 0);
                ADD_MVCOST(9, 4, 0);
                ADD_MVCOST(10, -4, 1);
                ADD_MVCOST(11, 4, 1);
                ADD_MVCOST(12, -4, 2);
                ADD_MVCOST(13, 4, 2);
                ADD_MVCOST(14, -2, 3);
                ADD_MVCOST(15, 2, 3);
                MIN_MV(0, 0, -4);
                MIN_MV(1, 0, 4);
                MIN_MV(2, -2, -3);
                MIN_MV(3, 2, -3);
                MIN_MV(4, -4, -2);
                MIN_MV(5, 4, -2);
                MIN_MV(6, -4, -1);
                MIN_MV(7, 4, -1);
                MIN_MV(8, -4, 0);
                MIN_MV(9, 4, 0);
                MIN_MV(10, -4, 1);
                MIN_MV(11, 4, 1);
                MIN_MV(12, -4, 2);
                MIN_MV(13, 4, 2);
                MIN_MV(14, -2, 3);
                MIN_MV(15, 2, 3);
#undef SADS
#undef ADD_MVCOST
#undef MIN_MV
                if (dir)
                {
                    bmv.x = omv.x + i * (dir >> 4);
                    bmv.y = omv.y + i * ((dir << 28) >> 28);
                }
            }
        }
        while (++i <= merange >> 2);
        if (bmv.checkRange(mvmin, mvmax))
            goto me_hex2;
        break;
    }

    case X265_STAR_SEARCH: // Adapted from HM ME
    {
        int bPointNr = 0;
        int bDistance = 0;

        const int EarlyExitIters = 3;
        StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, EarlyExitIters, merange);
        if (bDistance == 1)
        {
            // if best distance was only 1, check two missing points. If no new point is found, stop
            if (bPointNr)
            {
                /* For a given direction 1 to 8, check nearest two outer X pixels
                     X   X
                   X 1 2 3 X
                     4 * 5
                   X 6 7 8 X
                     X   X
                 */
                int saved = bcost;
                const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];
                const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];
                if (mv1.checkRange(mvmin, mvmax))
                {
                    COST_MV(mv1.x, mv1.y);
                }
                if (mv2.checkRange(mvmin, mvmax))
                {
                    COST_MV(mv2.x, mv2.y);
                }
                if (bcost == saved)
                    break;
            }
            else
                break;
        }

        const int RasterDistance = 5;
        if (bDistance > RasterDistance)
        {
            // raster search refinement if original search distance was too big
            MV tmv;
            for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y += RasterDistance)
            {
                for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x += RasterDistance)
                {
                    if (tmv.x + (RasterDistance * 3) <= mvmax.x)
                    {
                        pixel *pix_base = fref + tmv.y * stride + tmv.x;
                        sad_x4(fenc,
                               pix_base,
                               pix_base + RasterDistance,
                               pix_base + RasterDistance * 2,
                               pix_base + RasterDistance * 3,
                               stride, costs);
                        costs[0] += mvcost(tmv << 2);
                        COPY2_IF_LT(bcost, costs[0], bmv, tmv);
                        tmv.x += RasterDistance;
                        costs[1] += mvcost(tmv << 2);
                        COPY2_IF_LT(bcost, costs[1], bmv, tmv);
                        tmv.x += RasterDistance;
                        costs[2] += mvcost(tmv << 2);
                        COPY2_IF_LT(bcost, costs[2], bmv, tmv);
                        tmv.x += RasterDistance;
                        costs[3] += mvcost(tmv << 2);
                        COPY2_IF_LT(bcost, costs[3], bmv, tmv);
                    }
                    else
                        COST_MV(tmv.x, tmv.y);
                }
            }
        }

        while (bDistance > 0)
        {
            // center a new search around current best
            bDistance = 0;
            bPointNr = 0;
            const int MaxIters = 32;
            StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, MaxIters, merange);

            if (bDistance == 1)
            {
                if (!bPointNr)
                    break;

                /* For a given direction 1 to 8, check nearest 2 outer X pixels
                     X   X
                   X 1 2 3 X
                     4 * 5
                   X 6 7 8 X
                     X   X
                 */
                const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];
                const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];
                if (mv1.checkRange(mvmin, mvmax))
                {
                    COST_MV(mv1.x, mv1.y);
                }
                if (mv2.checkRange(mvmin, mvmax))
                {
                    COST_MV(mv2.x, mv2.y);
                }
                break;
            }
        }

        break;
    }

    case X265_FULL_SEARCH:
    {
        // dead slow exhaustive search, but at least it uses sad_x4()
        MV tmv;
        for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y++)
        {
            for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x++)
            {
                if (tmv.x + 3 <= mvmax.x)
                {
                    pixel *pix_base = fref + tmv.y * stride + tmv.x;
                    sad_x4(fenc,
                           pix_base,
                           pix_base + 1,
                           pix_base + 2,
                           pix_base + 3,
                           stride, costs);
                    costs[0] += mvcost(tmv << 2);
                    COPY2_IF_LT(bcost, costs[0], bmv, tmv);
                    tmv.x++;
                    costs[1] += mvcost(tmv << 2);
                    COPY2_IF_LT(bcost, costs[1], bmv, tmv);
                    tmv.x++;
                    costs[2] += mvcost(tmv << 2);
                    COPY2_IF_LT(bcost, costs[2], bmv, tmv);
                    tmv.x++;
                    costs[3] += mvcost(tmv << 2);
                    COPY2_IF_LT(bcost, costs[3], bmv, tmv);
                }
                else
                    COST_MV(tmv.x, tmv.y);
            }
        }

        break;
    }

    default:
        X265_CHECK(0, "invalid motion estimate mode\n");
        break;
    }

    if (bprecost < bcost)
    {
        bmv = bestpre;
        bcost = bprecost;
    }
    else
        bmv = bmv.toQPel(); // promote search bmv to qpel

    SubpelWorkload& wl = workload[this->subpelRefine];

    if (!bcost)
    {
        /* if there was zero residual at the clipped MVP, we can skip subpel
         * refine, but we do need to include the mvcost in the returned cost */
        bcost = mvcost(bmv);
    }
    else if (ref->isLowres)
    {
        int bdir = 0, cost;
        for (int i = 1; i <= wl.hpel_dirs; i++)
        {
            MV qmv = bmv + square1[i] * 2;
            cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv);
            COPY2_IF_LT(bcost, cost, bdir, i);
        }

        bmv += square1[bdir] * 2;
        bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd) + mvcost(bmv);

        bdir = 0;
        for (int i = 1; i <= wl.qpel_dirs; i++)
        {
            MV qmv = bmv + square1[i];
            cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv);
            COPY2_IF_LT(bcost, cost, bdir, i);
        }

        bmv += square1[bdir];
    }
    else
    {
        pixelcmp_t hpelcomp;

        if (wl.hpel_satd)
        {
            bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
            hpelcomp = satd;
        }
        else
            hpelcomp = sad;

        for (int iter = 0; iter < wl.hpel_iters; iter++)
        {
            int bdir = 0, cost;
            for (int i = 1; i <= wl.hpel_dirs; i++)
            {
                MV qmv = bmv + square1[i] * 2;
                cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
                COPY2_IF_LT(bcost, cost, bdir, i);
            }

            if (bdir)
                bmv += square1[bdir] * 2;
            else
                break;
        }

        /* if HPEL search used SAD, remeasure with SATD before QPEL */
        if (!wl.hpel_satd)
            bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);

        for (int iter = 0; iter < wl.qpel_iters; iter++)
        {
            int bdir = 0, cost;
            for (int i = 1; i <= wl.qpel_dirs; i++)
            {
                MV qmv = bmv + square1[i];
                cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
                COPY2_IF_LT(bcost, cost, bdir, i);
            }

            if (bdir)
                bmv += square1[bdir];
            else
                break;
        }
    }

    x265_emms();
    outQMv = bmv;
    return bcost;
}

int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv, pixelcmp_t cmp)
{
    int xFrac = qmv.x & 0x3;
    int yFrac = qmv.y & 0x3;

    if ((yFrac | xFrac) == 0)
    {
        pixel *fref = ref->fpelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * ref->lumaStride;
        return cmp(fenc, FENC_STRIDE, fref, ref->lumaStride);
    }
    else
    {
        /* We are taking a short-cut here if the reference is weighted. To be
         * accurate we should be interpolating unweighted pixels and weighting
         * the final 16bit values prior to rounding and downshifting. Instead we
         * are simply interpolating the weighted full-pel pixels. Not 100%
         * accurate but good enough for fast qpel ME */
        ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
        pixel *fref = ref->fpelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * ref->lumaStride;
        if (yFrac == 0)
        {
            primitives.luma_hpp[partEnum](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, xFrac);
        }
        else if (xFrac == 0)
        {
            primitives.luma_vpp[partEnum](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, yFrac);
        }
        else
        {
            ALIGN_VAR_32(int16_t, immed[64 * (64 + 8)]);
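            /* 'immed' holds the 16-bit output of the horizontal filter pass,
             * over-allocated by 8 extra rows so the vertical pass has the rows
             * of filter support it needs; the luma_vsp call below starts
             * (halfFilterSize - 1) rows into the buffer. */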

            int filterSize = NTAPS_LUMA;
            int halfFilterSize = filterSize >> 1;
            primitives.luma_hps[partEnum](fref, ref->lumaStride, immed, blockwidth, xFrac, 1);
            primitives.luma_vsp[partEnum](immed + (halfFilterSize - 1) * blockwidth, blockwidth, subpelbuf, FENC_STRIDE, yFrac);
        }
        return cmp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE);
    }
}