Imported Upstream version 1.4+222+hg5f9f7194267b
[deb_x265.git] / source / encoder / motion.cpp
1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
3 *
4 * Authors: Steve Borho <steve@borho.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
19 *
20 * This program is also available under a commercial proprietary license.
21 * For more information, contact us at license @ x265.com.
22 *****************************************************************************/
23
24 #include "common.h"
25 #include "primitives.h"
26 #include "lowres.h"
27 #include "motion.h"
28 #include "x265.h"
29
30 #if _MSC_VER
31 #pragma warning(disable: 4127) // conditional expression is constant (macros use this construct)
32 #endif
33
34 using namespace x265;
35
36 namespace {
37
/* Describes how much sub-pixel refinement is done for one subpel refine level. */
struct SubpelWorkload
{
    int  hpel_iters; // upper bound on half-pel refinement passes
    int  hpel_dirs;  // neighbours tried per half-pel pass (square1[1..hpel_dirs])
    int  qpel_iters; // upper bound on quarter-pel refinement passes
    int  qpel_dirs;  // neighbours tried per quarter-pel pass (square1[1..qpel_dirs])
    bool hpel_satd;  // half-pel passes compare with SATD when true, SAD when false
};
46
47 const SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] =
48 {
49 { 1, 4, 0, 4, false }, // 4 SAD HPEL only
50 { 1, 4, 1, 4, false }, // 4 SAD HPEL + 4 SATD QPEL
51 { 1, 4, 1, 4, true }, // 4 SATD HPEL + 4 SATD QPEL
52 { 2, 4, 1, 4, true }, // 2x4 SATD HPEL + 4 SATD QPEL
53 { 2, 4, 2, 4, true }, // 2x4 SATD HPEL + 2x4 SATD QPEL
54 { 1, 8, 1, 8, true }, // 8 SATD HPEL + 8 SATD QPEL (default)
55 { 2, 8, 1, 8, true }, // 2x8 SATD HPEL + 8 SATD QPEL
56 { 2, 8, 2, 8, true }, // 2x8 SATD HPEL + 2x8 SATD QPEL
57 };
58
59 int sizeScale[NUM_LUMA_PARTITIONS];
60 #define SAD_THRESH(v) (bcost < (((v >> 4) * sizeScale[partEnum])))
61
62 void initScales(void)
63 {
64 #define SETUP_SCALE(W, H) \
65 sizeScale[LUMA_ ## W ## x ## H] = (H * H) >> 4;
66 SETUP_SCALE(4, 4);
67 SETUP_SCALE(8, 8);
68 SETUP_SCALE(8, 4);
69 SETUP_SCALE(4, 8);
70 SETUP_SCALE(16, 16);
71 SETUP_SCALE(16, 8);
72 SETUP_SCALE(8, 16);
73 SETUP_SCALE(16, 12);
74 SETUP_SCALE(12, 16);
75 SETUP_SCALE(4, 16);
76 SETUP_SCALE(16, 4);
77 SETUP_SCALE(32, 32);
78 SETUP_SCALE(32, 16);
79 SETUP_SCALE(16, 32);
80 SETUP_SCALE(32, 24);
81 SETUP_SCALE(24, 32);
82 SETUP_SCALE(32, 8);
83 SETUP_SCALE(8, 32);
84 SETUP_SCALE(64, 64);
85 SETUP_SCALE(64, 32);
86 SETUP_SCALE(32, 64);
87 SETUP_SCALE(64, 48);
88 SETUP_SCALE(48, 64);
89 SETUP_SCALE(64, 16);
90 SETUP_SCALE(16, 64);
91 #undef SETUP_SCALE
92 }
93
94 /* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
95 const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) };
96 const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 }; /* (x-1)%6 */
97 const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) };
98 const MV hex4[16] =
99 {
100 MV(0, -4), MV(0, 4), MV(-2, -3), MV(2, -3),
101 MV(-4, -2), MV(4, -2), MV(-4, -1), MV(4, -1),
102 MV(-4, 0), MV(4, 0), MV(-4, 1), MV(4, 1),
103 MV(-4, 2), MV(4, 2), MV(-2, 3), MV(2, 3),
104 };
105 const MV offsets[] =
106 {
107 MV(-1, 0), MV(0, -1),
108 MV(-1, -1), MV(1, -1),
109 MV(-1, 0), MV(1, 0),
110 MV(-1, 1), MV(-1, -1),
111 MV(1, -1), MV(1, 1),
112 MV(-1, 0), MV(0, 1),
113 MV(-1, 1), MV(1, 1),
114 MV(1, 0), MV(0, 1),
115 }; // offsets for Two Point Search
116
117 /* sum of absolute differences between MV candidates, used for adaptive ME range */
118 inline int predictorDifference(const MV *mvc, intptr_t numCandidates)
119 {
120 int sum = 0;
121
122 for (int i = 0; i < numCandidates - 1; i++)
123 {
124 sum += abs(mvc[i].x - mvc[i + 1].x)
125 + abs(mvc[i].y - mvc[i + 1].y);
126 }
127
128 return sum;
129 }
130
131 }
132
133 MotionEstimate::MotionEstimate()
134 {
135 ctuAddr = -1;
136 absPartIdx = -1;
137 searchMethod = X265_HEX_SEARCH;
138 subpelRefine = 2;
139 bChromaSATD = false;
140 chromaSatd = NULL;
141 }
142
143 void MotionEstimate::init(int method, int refine, int csp)
144 {
145 if (!sizeScale[0])
146 initScales();
147
148 searchMethod = method;
149 subpelRefine = refine;
150 fencPUYuv.create(FENC_STRIDE, csp);
151 }
152
153 MotionEstimate::~MotionEstimate()
154 {
155 fencPUYuv.destroy();
156 }
157
158 /* Called by lookahead, luma only, no use of PicYuv */
159 void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight)
160 {
161 partEnum = partitionFromSizes(pwidth, pheight);
162 X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
163 sad = primitives.sad[partEnum];
164 satd = primitives.satd[partEnum];
165 sad_x3 = primitives.sad_x3[partEnum];
166 sad_x4 = primitives.sad_x4[partEnum];
167
168 blockwidth = pwidth;
169 blockOffset = offset;
170 absPartIdx = ctuAddr = -1;
171
172 /* copy PU block into cache */
173 primitives.luma_copy_pp[partEnum](fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
174 X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
175 }
176
177 /* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */
178 void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight)
179 {
180 partEnum = partitionFromSizes(pwidth, pheight);
181 X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
182 sad = primitives.sad[partEnum];
183 satd = primitives.satd[partEnum];
184 sad_x3 = primitives.sad_x3[partEnum];
185 sad_x4 = primitives.sad_x4[partEnum];
186 chromaSatd = primitives.chroma[fencPUYuv.m_csp].satd[partEnum];
187
188 /* Enable chroma residual cost if subpelRefine level is greater than 2 and chroma block size
189 * is an even multiple of 4x4 pixels (indicated by non-null chromaSatd pointer) */
190 bChromaSATD = subpelRefine > 2 && chromaSatd;
191 X265_CHECK(!(bChromaSATD && !workload[subpelRefine].hpel_satd), "Chroma SATD cannot be used with SAD hpel\n");
192
193 ctuAddr = _ctuAddr;
194 absPartIdx = cuPartIdx + puPartIdx;
195 blockwidth = pwidth;
196 blockOffset = 0;
197
198 /* copy PU from CU Yuv */
199 fencPUYuv.copyPUFromYuv(srcFencYuv, puPartIdx, partEnum, bChromaSATD);
200 }
201
/* The search macros below expand inside MotionEstimate member functions and
 * rely on these names being in scope at the point of use: fenc, fref, stride,
 * costs, bcost, bmv (and omv, mvmin, mvmax, bPointNr, bDistance where used),
 * plus the sad / sad_x3 / sad_x4 primitives and mvcost(). All (mx, my)
 * arguments are full-pel coordinates; `<< 2` converts them to quarter-pel for
 * mvcost(). Arguments may be evaluated more than once, so pass expressions
 * without side effects. */

/* Measure SAD + MV cost at full-pel (mx, my); on improvement update bcost/bmv
 * and remember which star point and distance produced it. */
#define COST_MV_PT_DIST(mx, my, point, dist) \
    do \
    { \
        MV tmv(mx, my); \
        int cost = sad(fenc, FENC_STRIDE, fref + (mx) + (my) * stride, stride); \
        cost += mvcost(tmv << 2); \
        if (cost < bcost) { \
            bcost = cost; \
            bmv = tmv; \
            bPointNr = (point); \
            bDistance = (dist); \
        } \
    } while (0)

/* Measure SAD + MV cost at full-pel (mx, my); on improvement update bcost/bmv. */
#define COST_MV(mx, my) \
    do \
    { \
        int cost = sad(fenc, FENC_STRIDE, fref + (mx) + (my) * stride, stride); \
        cost += mvcost(MV(mx, my) << 2); \
        COPY2_IF_LT(bcost, cost, bmv, MV(mx, my)); \
    } while (0)

/* Measure three candidates given as offsets from bmv. Results (SAD + MV cost)
 * are left in costs[0..2]; bcost/bmv are NOT updated. */
#define COST_MV_X3_DIR(m0x, m0y, m1x, m1y, m2x, m2y, costs) \
    { \
        pixel *pix_base = fref + bmv.x + bmv.y * stride; \
        sad_x3(fenc, \
               pix_base + (m0x) + (m0y) * stride, \
               pix_base + (m1x) + (m1y) * stride, \
               pix_base + (m2x) + (m2y) * stride, \
               stride, costs); \
        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
    }

/* Measure four absolute full-pel candidates at once; each one that improves
 * bcost also updates bmv, bPointNr and bDistance (checked in argument order). */
#define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \
    { \
        sad_x4(fenc, \
               fref + (m0x) + (m0y) * stride, \
               fref + (m1x) + (m1y) * stride, \
               fref + (m2x) + (m2y) * stride, \
               fref + (m3x) + (m3y) * stride, \
               stride, costs); \
        costs[0] += mvcost(MV(m0x, m0y) << 2); \
        costs[1] += mvcost(MV(m1x, m1y) << 2); \
        costs[2] += mvcost(MV(m2x, m2y) << 2); \
        costs[3] += mvcost(MV(m3x, m3y) << 2); \
        COPY4_IF_LT(bcost, costs[0], bmv, MV(m0x, m0y), bPointNr, p0, bDistance, d0); \
        COPY4_IF_LT(bcost, costs[1], bmv, MV(m1x, m1y), bPointNr, p1, bDistance, d1); \
        COPY4_IF_LT(bcost, costs[2], bmv, MV(m2x, m2y), bPointNr, p2, bDistance, d2); \
        COPY4_IF_LT(bcost, costs[3], bmv, MV(m3x, m3y), bPointNr, p3, bDistance, d3); \
    }

/* Measure four candidates given as offsets from omv; each one that improves
 * bcost also updates bmv (checked in argument order). */
#define COST_MV_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y) \
    { \
        pixel *pix_base = fref + omv.x + omv.y * stride; \
        sad_x4(fenc, \
               pix_base + (m0x) + (m0y) * stride, \
               pix_base + (m1x) + (m1y) * stride, \
               pix_base + (m2x) + (m2y) * stride, \
               pix_base + (m3x) + (m3y) * stride, \
               stride, costs); \
        costs[0] += mvcost((omv + MV(m0x, m0y)) << 2); \
        costs[1] += mvcost((omv + MV(m1x, m1y)) << 2); \
        costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \
        costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \
        COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
        COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
        COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
        COPY2_IF_LT(bcost, costs[3], bmv, omv + MV(m3x, m3y)); \
    }

/* Measure four candidates given as offsets from bmv. Results (SAD + MV cost)
 * are left in costs[0..3]; bcost/bmv are NOT updated. */
#define COST_MV_X4_DIR(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs) \
    { \
        pixel *pix_base = fref + bmv.x + bmv.y * stride; \
        sad_x4(fenc, \
               pix_base + (m0x) + (m0y) * stride, \
               pix_base + (m1x) + (m1y) * stride, \
               pix_base + (m2x) + (m2y) * stride, \
               pix_base + (m3x) + (m3y) * stride, \
               stride, costs); \
        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
        (costs)[3] += mvcost((bmv + MV(m3x, m3y)) << 2); \
    }

/* Re-centre omv on (mx, my) and test its four radius-1 diamond neighbours. */
#define DIA1_ITER(mx, my) \
    { \
        omv.x = (mx); omv.y = (my); \
        COST_MV_X4(0, -1, 0, 1, -1, 0, 1, 0); \
    }

/* Cross search around omv: step outwards from `start` in increments of 2,
 * horizontally up to x_max and vertically up to y_max. When the whole arm fits
 * inside [mvmin, mvmax] the candidates are measured four at a time; whatever
 * remains is measured one by one with a range check per candidate. */
#define CROSS(start, x_max, y_max) \
    { \
        int16_t i = (start); \
        if ((x_max) <= X265_MIN(mvmax.x - omv.x, omv.x - mvmin.x)) \
            for (; i < (x_max) - 2; i += 4) { \
                COST_MV_X4(i, 0, -i, 0, i + 2, 0, -i - 2, 0); } \
        for (; i < (x_max); i += 2) \
        { \
            if (omv.x + i <= mvmax.x) \
                COST_MV(omv.x + i, omv.y); \
            if (omv.x - i >= mvmin.x) \
                COST_MV(omv.x - i, omv.y); \
        } \
        i = (start); \
        if ((y_max) <= X265_MIN(mvmax.y - omv.y, omv.y - mvmin.y)) \
            for (; i < (y_max) - 2; i += 4) { \
                COST_MV_X4(0, i, 0, -i, 0, i + 2, 0, -i - 2); } \
        for (; i < (y_max); i += 2) \
        { \
            if (omv.y + i <= mvmax.y) \
                COST_MV(omv.x, omv.y + i); \
            if (omv.y - i >= mvmin.y) \
                COST_MV(omv.x, omv.y - i); \
        } \
    }
320
321 void MotionEstimate::StarPatternSearch(ReferencePlanes *ref,
322 const MV & mvmin,
323 const MV & mvmax,
324 MV & bmv,
325 int & bcost,
326 int & bPointNr,
327 int & bDistance,
328 int earlyExitIters,
329 int merange)
330 {
331 ALIGN_VAR_16(int, costs[16]);
332 pixel* fenc = fencPUYuv.m_buf[0];
333 pixel* fref = ref->fpelPlane[0] + blockOffset;
334 intptr_t stride = ref->lumaStride;
335
336 MV omv = bmv;
337 int saved = bcost;
338 int rounds = 0;
339
340 {
341 int16_t dist = 1;
342
343 /* bPointNr
344 2
345 4 * 5
346 7
347 */
348 const int16_t top = omv.y - dist;
349 const int16_t bottom = omv.y + dist;
350 const int16_t left = omv.x - dist;
351 const int16_t right = omv.x + dist;
352
353 if (top >= mvmin.y && left >= mvmin.x && right <= mvmax.x && bottom <= mvmax.y)
354 {
355 COST_MV_PT_DIST_X4(omv.x, top, 2, dist,
356 left, omv.y, 4, dist,
357 right, omv.y, 5, dist,
358 omv.x, bottom, 7, dist);
359 }
360 else
361 {
362 if (top >= mvmin.y) // check top
363 {
364 COST_MV_PT_DIST(omv.x, top, 2, dist);
365 }
366 if (left >= mvmin.x) // check middle left
367 {
368 COST_MV_PT_DIST(left, omv.y, 4, dist);
369 }
370 if (right <= mvmax.x) // check middle right
371 {
372 COST_MV_PT_DIST(right, omv.y, 5, dist);
373 }
374 if (bottom <= mvmax.y) // check bottom
375 {
376 COST_MV_PT_DIST(omv.x, bottom, 7, dist);
377 }
378 }
379 if (bcost < saved)
380 rounds = 0;
381 else if (++rounds >= earlyExitIters)
382 return;
383 }
384
385 for (int16_t dist = 2; dist <= 8; dist <<= 1)
386 {
387 /* bPointNr
388 2
389 1 3
390 4 * 5
391 6 8
392 7
393 Points 2, 4, 5, 7 are dist
394 Points 1, 3, 6, 8 are dist>>1
395 */
396 const int16_t top = omv.y - dist;
397 const int16_t bottom = omv.y + dist;
398 const int16_t left = omv.x - dist;
399 const int16_t right = omv.x + dist;
400 const int16_t top2 = omv.y - (dist >> 1);
401 const int16_t bottom2 = omv.y + (dist >> 1);
402 const int16_t left2 = omv.x - (dist >> 1);
403 const int16_t right2 = omv.x + (dist >> 1);
404 saved = bcost;
405
406 if (top >= mvmin.y && left >= mvmin.x &&
407 right <= mvmax.x && bottom <= mvmax.y) // check border
408 {
409 COST_MV_PT_DIST_X4(omv.x, top, 2, dist,
410 left2, top2, 1, dist >> 1,
411 right2, top2, 3, dist >> 1,
412 left, omv.y, 4, dist);
413 COST_MV_PT_DIST_X4(right, omv.y, 5, dist,
414 left2, bottom2, 6, dist >> 1,
415 right2, bottom2, 8, dist >> 1,
416 omv.x, bottom, 7, dist);
417 }
418 else // check border for each mv
419 {
420 if (top >= mvmin.y) // check top
421 {
422 COST_MV_PT_DIST(omv.x, top, 2, dist);
423 }
424 if (top2 >= mvmin.y) // check half top
425 {
426 if (left2 >= mvmin.x) // check half left
427 {
428 COST_MV_PT_DIST(left2, top2, 1, (dist >> 1));
429 }
430 if (right2 <= mvmax.x) // check half right
431 {
432 COST_MV_PT_DIST(right2, top2, 3, (dist >> 1));
433 }
434 }
435 if (left >= mvmin.x) // check left
436 {
437 COST_MV_PT_DIST(left, omv.y, 4, dist);
438 }
439 if (right <= mvmax.x) // check right
440 {
441 COST_MV_PT_DIST(right, omv.y, 5, dist);
442 }
443 if (bottom2 <= mvmax.y) // check half bottom
444 {
445 if (left2 >= mvmin.x) // check half left
446 {
447 COST_MV_PT_DIST(left2, bottom2, 6, (dist >> 1));
448 }
449 if (right2 <= mvmax.x) // check half right
450 {
451 COST_MV_PT_DIST(right2, bottom2, 8, (dist >> 1));
452 }
453 }
454 if (bottom <= mvmax.y) // check bottom
455 {
456 COST_MV_PT_DIST(omv.x, bottom, 7, dist);
457 }
458 }
459
460 if (bcost < saved)
461 rounds = 0;
462 else if (++rounds >= earlyExitIters)
463 return;
464 }
465
466 for (int16_t dist = 16; dist <= (int16_t)merange; dist <<= 1)
467 {
468 const int16_t top = omv.y - dist;
469 const int16_t bottom = omv.y + dist;
470 const int16_t left = omv.x - dist;
471 const int16_t right = omv.x + dist;
472
473 saved = bcost;
474 if (top >= mvmin.y && left >= mvmin.x &&
475 right <= mvmax.x && bottom <= mvmax.y) // check border
476 {
477 /* index
478 0
479 3
480 2
481 1
482 0 3 2 1 * 1 2 3 0
483 1
484 2
485 3
486 0
487 */
488
489 COST_MV_PT_DIST_X4(omv.x, top, 0, dist,
490 left, omv.y, 0, dist,
491 right, omv.y, 0, dist,
492 omv.x, bottom, 0, dist);
493
494 for (int16_t index = 1; index < 4; index++)
495 {
496 int16_t posYT = top + ((dist >> 2) * index);
497 int16_t posYB = bottom - ((dist >> 2) * index);
498 int16_t posXL = omv.x - ((dist >> 2) * index);
499 int16_t posXR = omv.x + ((dist >> 2) * index);
500
501 COST_MV_PT_DIST_X4(posXL, posYT, 0, dist,
502 posXR, posYT, 0, dist,
503 posXL, posYB, 0, dist,
504 posXR, posYB, 0, dist);
505 }
506 }
507 else // check border for each mv
508 {
509 if (top >= mvmin.y) // check top
510 {
511 COST_MV_PT_DIST(omv.x, top, 0, dist);
512 }
513 if (left >= mvmin.x) // check left
514 {
515 COST_MV_PT_DIST(left, omv.y, 0, dist);
516 }
517 if (right <= mvmax.x) // check right
518 {
519 COST_MV_PT_DIST(right, omv.y, 0, dist);
520 }
521 if (bottom <= mvmax.y) // check bottom
522 {
523 COST_MV_PT_DIST(omv.x, bottom, 0, dist);
524 }
525 for (int16_t index = 1; index < 4; index++)
526 {
527 int16_t posYT = top + ((dist >> 2) * index);
528 int16_t posYB = bottom - ((dist >> 2) * index);
529 int16_t posXL = omv.x - ((dist >> 2) * index);
530 int16_t posXR = omv.x + ((dist >> 2) * index);
531
532 if (posYT >= mvmin.y) // check top
533 {
534 if (posXL >= mvmin.x) // check left
535 {
536 COST_MV_PT_DIST(posXL, posYT, 0, dist);
537 }
538 if (posXR <= mvmax.x) // check right
539 {
540 COST_MV_PT_DIST(posXR, posYT, 0, dist);
541 }
542 }
543 if (posYB <= mvmax.y) // check bottom
544 {
545 if (posXL >= mvmin.x) // check left
546 {
547 COST_MV_PT_DIST(posXL, posYB, 0, dist);
548 }
549 if (posXR <= mvmax.x) // check right
550 {
551 COST_MV_PT_DIST(posXR, posYB, 0, dist);
552 }
553 }
554 }
555 }
556
557 if (bcost < saved)
558 rounds = 0;
559 else if (++rounds >= earlyExitIters)
560 return;
561 }
562 }
563
564 int MotionEstimate::motionEstimate(ReferencePlanes *ref,
565 const MV & mvmin,
566 const MV & mvmax,
567 const MV & qmvp,
568 int numCandidates,
569 const MV * mvc,
570 int merange,
571 MV & outQMv)
572 {
573 ALIGN_VAR_16(int, costs[16]);
574 if (ctuAddr >= 0)
575 blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);
576 intptr_t stride = ref->lumaStride;
577 pixel* fenc = fencPUYuv.m_buf[0];
578 pixel* fref = ref->fpelPlane[0] + blockOffset;
579
580 setMVP(qmvp);
581
582 MV qmvmin = mvmin.toQPel();
583 MV qmvmax = mvmax.toQPel();
584
585 /* The term cost used here means satd/sad values for that particular search.
586 * The costs used in ME integer search only includes the SAD cost of motion
587 * residual and sqrtLambda times MVD bits. The subpel refine steps use SATD
588 * cost of residual and sqrtLambda * MVD bits. Mode decision will be based
589 * on video distortion cost (SSE/PSNR) plus lambda times all signaling bits
590 * (mode + MVD bits). */
591
592 // measure SAD cost at clipped QPEL MVP
593 MV pmv = qmvp.clipped(qmvmin, qmvmax);
594 MV bestpre = pmv;
595 int bprecost;
596
597 if (ref->isLowres)
598 bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad);
599 else
600 bprecost = subpelCompare(ref, pmv, sad);
601
602 /* re-measure full pel rounded MVP with SAD as search start point */
603 MV bmv = pmv.roundToFPel();
604 int bcost = bprecost;
605 if (pmv.isSubpel())
606 bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2);
607
608 // measure SAD cost at MV(0) if MVP is not zero
609 if (pmv.notZero())
610 {
611 int cost = sad(fenc, FENC_STRIDE, fref, stride) + mvcost(MV(0, 0));
612 if (cost < bcost)
613 {
614 bcost = cost;
615 bmv = 0;
616 }
617 }
618
619 // measure SAD cost at each QPEL motion vector candidate
620 if (ref->isLowres)
621 {
622 for (int i = 0; i < numCandidates; i++)
623 {
624 MV m = mvc[i].clipped(qmvmin, qmvmax);
625 if (m.notZero() && m != pmv && m != bestpre) // check already measured
626 {
627 int cost = ref->lowresQPelCost(fenc, blockOffset, m, sad) + mvcost(m);
628 if (cost < bprecost)
629 {
630 bprecost = cost;
631 bestpre = m;
632 }
633 }
634 }
635 }
636 else
637 {
638 for (int i = 0; i < numCandidates; i++)
639 {
640 MV m = mvc[i].clipped(qmvmin, qmvmax);
641 if (m.notZero() && m != pmv && m != bestpre) // check already measured
642 {
643 int cost = subpelCompare(ref, m, sad) + mvcost(m);
644 if (cost < bprecost)
645 {
646 bprecost = cost;
647 bestpre = m;
648 }
649 }
650 }
651 }
652
653 pmv = pmv.roundToFPel();
654 MV omv = bmv; // current search origin or starting point
655
656 switch (searchMethod)
657 {
658 case X265_DIA_SEARCH:
659 {
660 /* diamond search, radius 1 */
661 bcost <<= 4;
662 int i = merange;
663 do
664 {
665 COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
666 COPY1_IF_LT(bcost, (costs[0] << 4) + 1);
667 COPY1_IF_LT(bcost, (costs[1] << 4) + 3);
668 COPY1_IF_LT(bcost, (costs[2] << 4) + 4);
669 COPY1_IF_LT(bcost, (costs[3] << 4) + 12);
670 if (!(bcost & 15))
671 break;
672 bmv.x -= (bcost << 28) >> 30;
673 bmv.y -= (bcost << 30) >> 30;
674 bcost &= ~15;
675 }
676 while (--i && bmv.checkRange(mvmin, mvmax));
677 bcost >>= 4;
678 break;
679 }
680
681 case X265_HEX_SEARCH:
682 {
683 me_hex2:
684 /* hexagon search, radius 2 */
685 #if 0
686 for (int i = 0; i < merange / 2; i++)
687 {
688 omv = bmv;
689 COST_MV(omv.x - 2, omv.y);
690 COST_MV(omv.x - 1, omv.y + 2);
691 COST_MV(omv.x + 1, omv.y + 2);
692 COST_MV(omv.x + 2, omv.y);
693 COST_MV(omv.x + 1, omv.y - 2);
694 COST_MV(omv.x - 1, omv.y - 2);
695 if (omv == bmv)
696 break;
697 if (!bmv.checkRange(mvmin, mvmax))
698 break;
699 }
700
701 #else // if 0
702 /* equivalent to the above, but eliminates duplicate candidates */
703 COST_MV_X3_DIR(-2, 0, -1, 2, 1, 2, costs);
704 bcost <<= 3;
705 COPY1_IF_LT(bcost, (costs[0] << 3) + 2);
706 COPY1_IF_LT(bcost, (costs[1] << 3) + 3);
707 COPY1_IF_LT(bcost, (costs[2] << 3) + 4);
708 COST_MV_X3_DIR(2, 0, 1, -2, -1, -2, costs);
709 COPY1_IF_LT(bcost, (costs[0] << 3) + 5);
710 COPY1_IF_LT(bcost, (costs[1] << 3) + 6);
711 COPY1_IF_LT(bcost, (costs[2] << 3) + 7);
712
713 if (bcost & 7)
714 {
715 int dir = (bcost & 7) - 2;
716 bmv += hex2[dir + 1];
717
718 /* half hexagon, not overlapping the previous iteration */
719 for (int i = (merange >> 1) - 1; i > 0 && bmv.checkRange(mvmin, mvmax); i--)
720 {
721 COST_MV_X3_DIR(hex2[dir + 0].x, hex2[dir + 0].y,
722 hex2[dir + 1].x, hex2[dir + 1].y,
723 hex2[dir + 2].x, hex2[dir + 2].y,
724 costs);
725 bcost &= ~7;
726 COPY1_IF_LT(bcost, (costs[0] << 3) + 1);
727 COPY1_IF_LT(bcost, (costs[1] << 3) + 2);
728 COPY1_IF_LT(bcost, (costs[2] << 3) + 3);
729 if (!(bcost & 7))
730 break;
731 dir += (bcost & 7) - 2;
732 dir = mod6m1[dir + 1];
733 bmv += hex2[dir + 1];
734 }
735 }
736 bcost >>= 3;
737 #endif // if 0
738
739 /* square refine */
740 int dir = 0;
741 COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
742 COPY2_IF_LT(bcost, costs[0], dir, 1);
743 COPY2_IF_LT(bcost, costs[1], dir, 2);
744 COPY2_IF_LT(bcost, costs[2], dir, 3);
745 COPY2_IF_LT(bcost, costs[3], dir, 4);
746 COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs);
747 COPY2_IF_LT(bcost, costs[0], dir, 5);
748 COPY2_IF_LT(bcost, costs[1], dir, 6);
749 COPY2_IF_LT(bcost, costs[2], dir, 7);
750 COPY2_IF_LT(bcost, costs[3], dir, 8);
751 bmv += square1[dir];
752 break;
753 }
754
755 case X265_UMH_SEARCH:
756 {
757 int ucost1, ucost2;
758 int16_t cross_start = 1;
759
760 /* refine predictors */
761 omv = bmv;
762 ucost1 = bcost;
763 DIA1_ITER(pmv.x, pmv.y);
764 if (pmv.notZero())
765 DIA1_ITER(0, 0);
766
767 ucost2 = bcost;
768 if (bmv.notZero() && bmv != pmv)
769 DIA1_ITER(bmv.x, bmv.y);
770 if (bcost == ucost2)
771 cross_start = 3;
772
773 /* Early Termination */
774 omv = bmv;
775 if (bcost == ucost2 && SAD_THRESH(2000))
776 {
777 COST_MV_X4(0, -2, -1, -1, 1, -1, -2, 0);
778 COST_MV_X4(2, 0, -1, 1, 1, 1, 0, 2);
779 if (bcost == ucost1 && SAD_THRESH(500))
780 break;
781 if (bcost == ucost2)
782 {
783 int16_t range = (int16_t)(merange >> 1) | 1;
784 CROSS(3, range, range);
785 COST_MV_X4(-1, -2, 1, -2, -2, -1, 2, -1);
786 COST_MV_X4(-2, 1, 2, 1, -1, 2, 1, 2);
787 if (bcost == ucost2)
788 break;
789 cross_start = range + 2;
790 }
791 }
792
793 // TODO: Need to study x264's logic for building mvc list to understand why they
794 // have special cases here for 16x16, and whether they apply to HEVC CTU
795
796 // adaptive search range based on mvc variability
797 if (numCandidates)
798 {
799 /* range multipliers based on casual inspection of some statistics of
800 * average distance between current predictor and final mv found by ESA.
801 * these have not been tuned much by actual encoding. */
802 static const uint8_t range_mul[4][4] =
803 {
804 { 3, 3, 4, 4 },
805 { 3, 4, 4, 4 },
806 { 4, 4, 4, 5 },
807 { 4, 4, 5, 6 },
808 };
809
810 int mvd;
811 int sad_ctx, mvd_ctx;
812 int denom = 1;
813
814 if (numCandidates == 1)
815 {
816 if (LUMA_64x64 == partEnum)
817 /* mvc is probably the same as mvp, so the difference isn't meaningful.
818 * but prediction usually isn't too bad, so just use medium range */
819 mvd = 25;
820 else
821 mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
822 }
823 else
824 {
825 /* calculate the degree of agreement between predictors. */
826
827 /* in 64x64, mvc includes all the neighbors used to make mvp,
828 * so don't count mvp separately. */
829
830 denom = numCandidates - 1;
831 mvd = 0;
832 if (partEnum != LUMA_64x64)
833 {
834 mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
835 denom++;
836 }
837 mvd += predictorDifference(mvc, numCandidates);
838 }
839
840 sad_ctx = SAD_THRESH(1000) ? 0
841 : SAD_THRESH(2000) ? 1
842 : SAD_THRESH(4000) ? 2 : 3;
843 mvd_ctx = mvd < 10 * denom ? 0
844 : mvd < 20 * denom ? 1
845 : mvd < 40 * denom ? 2 : 3;
846
847 merange = (merange * range_mul[mvd_ctx][sad_ctx]) >> 2;
848 }
849
850 /* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy.
851 * we are still centered on the same place as the DIA2. is this desirable? */
852 CROSS(cross_start, merange, merange >> 1);
853 COST_MV_X4(-2, -2, -2, 2, 2, -2, 2, 2);
854
855 /* hexagon grid */
856 omv = bmv;
857 const uint16_t *p_cost_omvx = m_cost_mvx + omv.x * 4;
858 const uint16_t *p_cost_omvy = m_cost_mvy + omv.y * 4;
859 uint16_t i = 1;
860 do
861 {
862 if (4 * i > X265_MIN4(mvmax.x - omv.x, omv.x - mvmin.x,
863 mvmax.y - omv.y, omv.y - mvmin.y))
864 {
865 for (int j = 0; j < 16; j++)
866 {
867 MV mv = omv + (hex4[j] * i);
868 if (mv.checkRange(mvmin, mvmax))
869 COST_MV(mv.x, mv.y);
870 }
871 }
872 else
873 {
874 int16_t dir = 0;
875 pixel *fref_base = fref + omv.x + (omv.y - 4 * i) * stride;
876 size_t dy = (size_t)i * stride;
877 #define SADS(k, x0, y0, x1, y1, x2, y2, x3, y3) \
878 sad_x4(fenc, \
879 fref_base x0 * i + (y0 - 2 * k + 4) * dy, \
880 fref_base x1 * i + (y1 - 2 * k + 4) * dy, \
881 fref_base x2 * i + (y2 - 2 * k + 4) * dy, \
882 fref_base x3 * i + (y3 - 2 * k + 4) * dy, \
883 stride, costs + 4 * k); \
884 fref_base += 2 * dy;
885 #define ADD_MVCOST(k, x, y) costs[k] += p_cost_omvx[x * 4 * i] + p_cost_omvy[y * 4 * i]
886 #define MIN_MV(k, x, y) COPY2_IF_LT(bcost, costs[k], dir, x * 16 + (y & 15))
887
888 SADS(0, +0, -4, +0, +4, -2, -3, +2, -3);
889 SADS(1, -4, -2, +4, -2, -4, -1, +4, -1);
890 SADS(2, -4, +0, +4, +0, -4, +1, +4, +1);
891 SADS(3, -4, +2, +4, +2, -2, +3, +2, +3);
892 ADD_MVCOST(0, 0, -4);
893 ADD_MVCOST(1, 0, 4);
894 ADD_MVCOST(2, -2, -3);
895 ADD_MVCOST(3, 2, -3);
896 ADD_MVCOST(4, -4, -2);
897 ADD_MVCOST(5, 4, -2);
898 ADD_MVCOST(6, -4, -1);
899 ADD_MVCOST(7, 4, -1);
900 ADD_MVCOST(8, -4, 0);
901 ADD_MVCOST(9, 4, 0);
902 ADD_MVCOST(10, -4, 1);
903 ADD_MVCOST(11, 4, 1);
904 ADD_MVCOST(12, -4, 2);
905 ADD_MVCOST(13, 4, 2);
906 ADD_MVCOST(14, -2, 3);
907 ADD_MVCOST(15, 2, 3);
908 MIN_MV(0, 0, -4);
909 MIN_MV(1, 0, 4);
910 MIN_MV(2, -2, -3);
911 MIN_MV(3, 2, -3);
912 MIN_MV(4, -4, -2);
913 MIN_MV(5, 4, -2);
914 MIN_MV(6, -4, -1);
915 MIN_MV(7, 4, -1);
916 MIN_MV(8, -4, 0);
917 MIN_MV(9, 4, 0);
918 MIN_MV(10, -4, 1);
919 MIN_MV(11, 4, 1);
920 MIN_MV(12, -4, 2);
921 MIN_MV(13, 4, 2);
922 MIN_MV(14, -2, 3);
923 MIN_MV(15, 2, 3);
924 #undef SADS
925 #undef ADD_MVCOST
926 #undef MIN_MV
927 if (dir)
928 {
929 bmv.x = omv.x + i * (dir >> 4);
930 bmv.y = omv.y + i * ((dir << 28) >> 28);
931 }
932 }
933 }
934 while (++i <= merange >> 2);
935 if (bmv.checkRange(mvmin, mvmax))
936 goto me_hex2;
937 break;
938 }
939
940 case X265_STAR_SEARCH: // Adapted from HM ME
941 {
942 int bPointNr = 0;
943 int bDistance = 0;
944
945 const int EarlyExitIters = 3;
946 StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, EarlyExitIters, merange);
947 if (bDistance == 1)
948 {
949 // if best distance was only 1, check two missing points. If no new point is found, stop
950 if (bPointNr)
951 {
952 /* For a given direction 1 to 8, check nearest two outer X pixels
953 X X
954 X 1 2 3 X
955 4 * 5
956 X 6 7 8 X
957 X X
958 */
959 int saved = bcost;
960 const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];
961 const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];
962 if (mv1.checkRange(mvmin, mvmax))
963 {
964 COST_MV(mv1.x, mv1.y);
965 }
966 if (mv2.checkRange(mvmin, mvmax))
967 {
968 COST_MV(mv2.x, mv2.y);
969 }
970 if (bcost == saved)
971 break;
972 }
973 else
974 break;
975 }
976
977 const int RasterDistance = 5;
978 if (bDistance > RasterDistance)
979 {
980 // raster search refinement if original search distance was too big
981 MV tmv;
982 for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y += RasterDistance)
983 {
984 for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x += RasterDistance)
985 {
986 if (tmv.x + (RasterDistance * 3) <= mvmax.x)
987 {
988 pixel *pix_base = fref + tmv.y * stride + tmv.x;
989 sad_x4(fenc,
990 pix_base,
991 pix_base + RasterDistance,
992 pix_base + RasterDistance * 2,
993 pix_base + RasterDistance * 3,
994 stride, costs);
995 costs[0] += mvcost(tmv << 2);
996 COPY2_IF_LT(bcost, costs[0], bmv, tmv);
997 tmv.x += RasterDistance;
998 costs[1] += mvcost(tmv << 2);
999 COPY2_IF_LT(bcost, costs[1], bmv, tmv);
1000 tmv.x += RasterDistance;
1001 costs[2] += mvcost(tmv << 2);
1002 COPY2_IF_LT(bcost, costs[2], bmv, tmv);
1003 tmv.x += RasterDistance;
1004 costs[3] += mvcost(tmv << 3);
1005 COPY2_IF_LT(bcost, costs[3], bmv, tmv);
1006 }
1007 else
1008 COST_MV(tmv.x, tmv.y);
1009 }
1010 }
1011 }
1012
1013 while (bDistance > 0)
1014 {
1015 // center a new search around current best
1016 bDistance = 0;
1017 bPointNr = 0;
1018 const int MaxIters = 32;
1019 StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, MaxIters, merange);
1020
1021 if (bDistance == 1)
1022 {
1023 if (!bPointNr)
1024 break;
1025
1026 /* For a given direction 1 to 8, check nearest 2 outer X pixels
1027 X X
1028 X 1 2 3 X
1029 4 * 5
1030 X 6 7 8 X
1031 X X
1032 */
1033 const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];
1034 const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];
1035 if (mv1.checkRange(mvmin, mvmax))
1036 {
1037 COST_MV(mv1.x, mv1.y);
1038 }
1039 if (mv2.checkRange(mvmin, mvmax))
1040 {
1041 COST_MV(mv2.x, mv2.y);
1042 }
1043 break;
1044 }
1045 }
1046
1047 break;
1048 }
1049
1050 case X265_FULL_SEARCH:
1051 {
1052 // dead slow exhaustive search, but at least it uses sad_x4()
1053 MV tmv;
1054 for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y++)
1055 {
1056 for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x++)
1057 {
1058 if (tmv.x + 3 <= mvmax.x)
1059 {
1060 pixel *pix_base = fref + tmv.y * stride + tmv.x;
1061 sad_x4(fenc,
1062 pix_base,
1063 pix_base + 1,
1064 pix_base + 2,
1065 pix_base + 3,
1066 stride, costs);
1067 costs[0] += mvcost(tmv << 2);
1068 COPY2_IF_LT(bcost, costs[0], bmv, tmv);
1069 tmv.x++;
1070 costs[1] += mvcost(tmv << 2);
1071 COPY2_IF_LT(bcost, costs[1], bmv, tmv);
1072 tmv.x++;
1073 costs[2] += mvcost(tmv << 2);
1074 COPY2_IF_LT(bcost, costs[2], bmv, tmv);
1075 tmv.x++;
1076 costs[3] += mvcost(tmv << 2);
1077 COPY2_IF_LT(bcost, costs[3], bmv, tmv);
1078 }
1079 else
1080 COST_MV(tmv.x, tmv.y);
1081 }
1082 }
1083
1084 break;
1085 }
1086
1087 default:
1088 X265_CHECK(0, "invalid motion estimate mode\n");
1089 break;
1090 }
1091
1092 if (bprecost < bcost)
1093 {
1094 bmv = bestpre;
1095 bcost = bprecost;
1096 }
1097 else
1098 bmv = bmv.toQPel(); // promote search bmv to qpel
1099
1100 const SubpelWorkload& wl = workload[this->subpelRefine];
1101
1102 if (!bcost)
1103 {
1104 /* if there was zero residual at the clipped MVP, we can skip subpel
1105 * refine, but we do need to include the mvcost in the returned cost */
1106 bcost = mvcost(bmv);
1107 }
1108 else if (ref->isLowres)
1109 {
1110 int bdir = 0;
1111 for (int i = 1; i <= wl.hpel_dirs; i++)
1112 {
1113 MV qmv = bmv + square1[i] * 2;
1114 int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv);
1115 COPY2_IF_LT(bcost, cost, bdir, i);
1116 }
1117
1118 bmv += square1[bdir] * 2;
1119 bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd) + mvcost(bmv);
1120
1121 bdir = 0;
1122 for (int i = 1; i <= wl.qpel_dirs; i++)
1123 {
1124 MV qmv = bmv + square1[i];
1125 int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv);
1126 COPY2_IF_LT(bcost, cost, bdir, i);
1127 }
1128
1129 bmv += square1[bdir];
1130 }
1131 else
1132 {
1133 pixelcmp_t hpelcomp;
1134
1135 if (wl.hpel_satd)
1136 {
1137 bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
1138 hpelcomp = satd;
1139 }
1140 else
1141 hpelcomp = sad;
1142
1143 for (int iter = 0; iter < wl.hpel_iters; iter++)
1144 {
1145 int bdir = 0;
1146 for (int i = 1; i <= wl.hpel_dirs; i++)
1147 {
1148 MV qmv = bmv + square1[i] * 2;
1149 int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
1150 COPY2_IF_LT(bcost, cost, bdir, i);
1151 }
1152
1153 if (bdir)
1154 bmv += square1[bdir] * 2;
1155 else
1156 break;
1157 }
1158
1159 /* if HPEL search used SAD, remeasure with SATD before QPEL */
1160 if (!wl.hpel_satd)
1161 bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
1162
1163 for (int iter = 0; iter < wl.qpel_iters; iter++)
1164 {
1165 int bdir = 0;
1166 for (int i = 1; i <= wl.qpel_dirs; i++)
1167 {
1168 MV qmv = bmv + square1[i];
1169 int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
1170 COPY2_IF_LT(bcost, cost, bdir, i);
1171 }
1172
1173 if (bdir)
1174 bmv += square1[bdir];
1175 else
1176 break;
1177 }
1178 }
1179
1180 x265_emms();
1181 outQMv = bmv;
1182 return bcost;
1183 }
1184
1185 int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv, pixelcmp_t cmp)
1186 {
1187 intptr_t refStride = ref->lumaStride;
1188 pixel *fref = ref->fpelPlane[0] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * refStride;
1189 int xFrac = qmv.x & 0x3;
1190 int yFrac = qmv.y & 0x3;
1191 int cost;
1192 intptr_t lclStride = fencPUYuv.m_size;
1193 X265_CHECK(lclStride == FENC_STRIDE, "fenc buffer is assumed to have FENC_STRIDE by sad_x3 and sad_x4\n");
1194
1195 if (!(yFrac | xFrac))
1196 cost = cmp(fencPUYuv.m_buf[0], lclStride, fref, refStride);
1197 else
1198 {
1199 /* we are taking a short-cut here if the reference is weighted. To be
1200 * accurate we should be interpolating unweighted pixels and weighting
1201 * the final 16bit values prior to rounding and down shifting. Instead we
1202 * are simply interpolating the weighted full-pel pixels. Not 100%
1203 * accurate but good enough for fast qpel ME */
1204 ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
1205 if (!yFrac)
1206 primitives.luma_hpp[partEnum](fref, refStride, subpelbuf, lclStride, xFrac);
1207 else if (!xFrac)
1208 primitives.luma_vpp[partEnum](fref, refStride, subpelbuf, lclStride, yFrac);
1209 else
1210 {
1211 ALIGN_VAR_32(int16_t, immed[64 * (64 + NTAPS_LUMA)]);
1212
1213 int filterSize = NTAPS_LUMA;
1214 int halfFilterSize = filterSize >> 1;
1215 primitives.luma_hps[partEnum](fref, refStride, immed, blockwidth, xFrac, 1);
1216 primitives.luma_vsp[partEnum](immed + (halfFilterSize - 1) * blockwidth, blockwidth, subpelbuf, lclStride, yFrac);
1217 }
1218 cost = cmp(fencPUYuv.m_buf[0], lclStride, subpelbuf, lclStride);
1219 }
1220
1221 if (bChromaSATD)
1222 {
1223 int csp = fencPUYuv.m_csp;
1224 int hshift = fencPUYuv.m_hChromaShift;
1225 int vshift = fencPUYuv.m_vChromaShift;
1226 int shiftHor = (2 + hshift);
1227 int shiftVer = (2 + vshift);
1228 lclStride = fencPUYuv.m_csize;
1229
1230 intptr_t refStrideC = ref->reconPic->m_strideC;
1231 intptr_t refOffset = (qmv.x >> shiftHor) + (qmv.y >> shiftVer) * refStrideC;
1232
1233 const pixel* refCb = ref->getCbAddr(ctuAddr, absPartIdx) + refOffset;
1234 const pixel* refCr = ref->getCrAddr(ctuAddr, absPartIdx) + refOffset;
1235
1236 xFrac = qmv.x & ((1 << shiftHor) - 1);
1237 yFrac = qmv.y & ((1 << shiftVer) - 1);
1238
1239 if (!(yFrac | xFrac))
1240 {
1241 cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, refCb, refStrideC);
1242 cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, refCr, refStrideC);
1243 }
1244 else
1245 {
1246 ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
1247 if (!yFrac)
1248 {
1249 primitives.chroma[csp].filter_hpp[partEnum](refCb, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift));
1250 cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
1251
1252 primitives.chroma[csp].filter_hpp[partEnum](refCr, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift));
1253 cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
1254 }
1255 else if (!xFrac)
1256 {
1257 primitives.chroma[csp].filter_vpp[partEnum](refCb, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift));
1258 cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
1259
1260 primitives.chroma[csp].filter_vpp[partEnum](refCr, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift));
1261 cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
1262 }
1263 else
1264 {
1265 ALIGN_VAR_32(int16_t, immed[64 * (64 + NTAPS_CHROMA)]);
1266
1267 int extStride = blockwidth >> hshift;
1268 int filterSize = NTAPS_CHROMA;
1269 int halfFilterSize = (filterSize >> 1);
1270
1271 primitives.chroma[csp].filter_hps[partEnum](refCb, refStrideC, immed, extStride, xFrac << (1 - hshift), 1);
1272 primitives.chroma[csp].filter_vsp[partEnum](immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift));
1273 cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
1274
1275 primitives.chroma[csp].filter_hps[partEnum](refCr, refStrideC, immed, extStride, xFrac << (1 - hshift), 1);
1276 primitives.chroma[csp].filter_vsp[partEnum](immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift));
1277 cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
1278 }
1279 }
1280 }
1281
1282 return cost;
1283 }