Commit | Line | Data |
---|---|---|
72b9787e JB |
1 | /***************************************************************************** |
2 | * Copyright (C) 2013 x265 project | |
3 | * | |
4 | * Authors: Steve Borho <steve@borho.org> | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License as published by | |
8 | * the Free Software Foundation; either version 2 of the License, or | |
9 | * (at your option) any later version. | |
10 | * | |
11 | * This program is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | * GNU General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * along with this program; if not, write to the Free Software | |
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
19 | * | |
20 | * This program is also available under a commercial proprietary license. | |
21 | * For more information, contact us at license @ x265.com. | |
22 | *****************************************************************************/ | |
23 | ||
24 | #include "common.h" | |
25 | #include "primitives.h" | |
26 | #include "lowres.h" | |
27 | #include "motion.h" | |
28 | #include "x265.h" | |
29 | ||
30 | #if _MSC_VER | |
31 | #pragma warning(disable: 4127) // conditional expression is constant (macros use this construct) | |
32 | #endif | |
33 | ||
34 | using namespace x265; | |
35 | ||
36 | namespace { | |
37 | struct SubpelWorkload | |
38 | { | |
39 | int hpel_iters; | |
40 | int hpel_dirs; | |
41 | int qpel_iters; | |
42 | int qpel_dirs; | |
43 | bool hpel_satd; | |
44 | }; | |
45 | ||
46 | SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] = | |
47 | { | |
48 | { 1, 4, 0, 4, false }, // 4 SAD HPEL only | |
49 | { 1, 4, 1, 4, false }, // 4 SAD HPEL + 4 SATD QPEL | |
50 | { 1, 4, 1, 4, true }, // 4 SATD HPEL + 4 SATD QPEL | |
51 | { 2, 4, 1, 4, true }, // 2x4 SATD HPEL + 4 SATD QPEL | |
52 | { 2, 4, 2, 4, true }, // 2x4 SATD HPEL + 2x4 SATD QPEL | |
53 | { 1, 8, 1, 8, true }, // 8 SATD HPEL + 8 SATD QPEL (default) | |
54 | { 2, 8, 1, 8, true }, // 2x8 SATD HPEL + 8 SATD QPEL | |
55 | { 2, 8, 2, 8, true }, // 2x8 SATD HPEL + 2x8 SATD QPEL | |
56 | }; | |
57 | } | |
58 | ||
59 | static int size_scale[NUM_LUMA_PARTITIONS]; | |
60 | #define SAD_THRESH(v) (bcost < (((v >> 4) * size_scale[partEnum]))) | |
61 | ||
62 | static void init_scales(void) | |
63 | { | |
64 | #define SETUP_SCALE(W, H) \ | |
65 | size_scale[LUMA_ ## W ## x ## H] = (H * H) >> 4; | |
66 | SETUP_SCALE(4, 4); | |
67 | SETUP_SCALE(8, 8); | |
68 | SETUP_SCALE(8, 4); | |
69 | SETUP_SCALE(4, 8); | |
70 | SETUP_SCALE(16, 16); | |
71 | SETUP_SCALE(16, 8); | |
72 | SETUP_SCALE(8, 16); | |
73 | SETUP_SCALE(16, 12); | |
74 | SETUP_SCALE(12, 16); | |
75 | SETUP_SCALE(4, 16); | |
76 | SETUP_SCALE(16, 4); | |
77 | SETUP_SCALE(32, 32); | |
78 | SETUP_SCALE(32, 16); | |
79 | SETUP_SCALE(16, 32); | |
80 | SETUP_SCALE(32, 24); | |
81 | SETUP_SCALE(24, 32); | |
82 | SETUP_SCALE(32, 8); | |
83 | SETUP_SCALE(8, 32); | |
84 | SETUP_SCALE(64, 64); | |
85 | SETUP_SCALE(64, 32); | |
86 | SETUP_SCALE(32, 64); | |
87 | SETUP_SCALE(64, 48); | |
88 | SETUP_SCALE(48, 64); | |
89 | SETUP_SCALE(64, 16); | |
90 | SETUP_SCALE(16, 64); | |
91 | #undef SETUP_SCALE | |
92 | } | |
93 | ||
94 | MotionEstimate::MotionEstimate() | |
95 | : searchMethod(3) | |
96 | , subpelRefine(5) | |
97 | { | |
98 | if (size_scale[0] == 0) | |
99 | init_scales(); | |
100 | ||
101 | fenc = X265_MALLOC(pixel, MAX_CU_SIZE * MAX_CU_SIZE); | |
102 | } | |
103 | ||
104 | MotionEstimate::~MotionEstimate() | |
105 | { | |
106 | X265_FREE(fenc); | |
107 | } | |
108 | ||
109 | void MotionEstimate::setSourcePU(intptr_t offset, int width, int height) | |
110 | { | |
111 | partEnum = partitionFromSizes(width, height); | |
112 | X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n"); | |
113 | sad = primitives.sad[partEnum]; | |
114 | satd = primitives.satd[partEnum]; | |
115 | sa8d = primitives.sa8d_inter[partEnum]; | |
116 | sad_x3 = primitives.sad_x3[partEnum]; | |
117 | sad_x4 = primitives.sad_x4[partEnum]; | |
118 | ||
119 | blockwidth = width; | |
120 | blockheight = height; | |
121 | blockOffset = offset; | |
122 | ||
123 | /* copy PU block into cache */ | |
124 | primitives.luma_copy_pp[partEnum](fenc, FENC_STRIDE, fencplane + offset, fencLumaStride); | |
125 | } | |
126 | ||
127 | /* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */ | |
128 | static const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) }; | |
129 | static const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 }; /* (x-1)%6 */ | |
130 | static const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) }; | |
131 | static const MV hex4[16] = | |
132 | { | |
133 | MV(0, -4), MV(0, 4), MV(-2, -3), MV(2, -3), | |
134 | MV(-4, -2), MV(4, -2), MV(-4, -1), MV(4, -1), | |
135 | MV(-4, 0), MV(4, 0), MV(-4, 1), MV(4, 1), | |
136 | MV(-4, 2), MV(4, 2), MV(-2, 3), MV(2, 3), | |
137 | }; | |
138 | static const MV offsets[] = | |
139 | { | |
140 | MV(-1, 0), MV(0, -1), | |
141 | MV(-1, -1), MV(1, -1), | |
142 | MV(-1, 0), MV(1, 0), | |
143 | MV(-1, 1), MV(-1, -1), | |
144 | MV(1, -1), MV(1, 1), | |
145 | MV(-1, 0), MV(0, 1), | |
146 | MV(-1, 1), MV(1, 1), | |
147 | MV(1, 0), MV(0, 1), | |
148 | }; // offsets for Two Point Search | |
149 | ||
150 | /* sum of absolute differences between MV candidates */ | |
151 | static inline int x265_predictor_difference(const MV *mvc, intptr_t numCandidates) | |
152 | { | |
153 | int sum = 0; | |
154 | ||
155 | for (int i = 0; i < numCandidates - 1; i++) | |
156 | { | |
157 | sum += abs(mvc[i].x - mvc[i + 1].x) | |
158 | + abs(mvc[i].y - mvc[i + 1].y); | |
159 | } | |
160 | ||
161 | return sum; | |
162 | } | |
163 | ||
164 | #define COST_MV_PT_DIST(mx, my, point, dist) \ | |
165 | do \ | |
166 | { \ | |
167 | MV tmv(mx, my); \ | |
168 | int cost = sad(fenc, FENC_STRIDE, fref + mx + my * stride, stride); \ | |
169 | cost += mvcost(tmv << 2); \ | |
170 | if (cost < bcost) { \ | |
171 | bcost = cost; \ | |
172 | bmv = tmv; \ | |
173 | bPointNr = point; \ | |
174 | bDistance = dist; \ | |
175 | } \ | |
176 | } while (0) | |
177 | ||
178 | #define COST_MV(mx, my) \ | |
179 | do \ | |
180 | { \ | |
181 | int cost = sad(fenc, FENC_STRIDE, fref + (mx) + (my) * stride, stride); \ | |
182 | cost += mvcost(MV(mx, my) << 2); \ | |
183 | COPY2_IF_LT(bcost, cost, bmv, MV(mx, my)); \ | |
184 | } while (0) | |
185 | ||
186 | #define COST_MV_X3_DIR(m0x, m0y, m1x, m1y, m2x, m2y, costs) \ | |
187 | { \ | |
188 | pixel *pix_base = fref + bmv.x + bmv.y * stride; \ | |
189 | sad_x3(fenc, \ | |
190 | pix_base + (m0x) + (m0y) * stride, \ | |
191 | pix_base + (m1x) + (m1y) * stride, \ | |
192 | pix_base + (m2x) + (m2y) * stride, \ | |
193 | stride, costs); \ | |
194 | (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \ | |
195 | (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \ | |
196 | (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \ | |
197 | } | |
198 | ||
199 | #define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \ | |
200 | { \ | |
201 | sad_x4(fenc, \ | |
202 | fref + (m0x) + (m0y) * stride, \ | |
203 | fref + (m1x) + (m1y) * stride, \ | |
204 | fref + (m2x) + (m2y) * stride, \ | |
205 | fref + (m3x) + (m3y) * stride, \ | |
206 | stride, costs); \ | |
207 | costs[0] += mvcost(MV(m0x, m0y) << 2); \ | |
208 | costs[1] += mvcost(MV(m1x, m1y) << 2); \ | |
209 | costs[2] += mvcost(MV(m2x, m2y) << 2); \ | |
210 | costs[3] += mvcost(MV(m3x, m3y) << 2); \ | |
211 | COPY4_IF_LT(bcost, costs[0], bmv, MV(m0x, m0y), bPointNr, p0, bDistance, d0); \ | |
212 | COPY4_IF_LT(bcost, costs[1], bmv, MV(m1x, m1y), bPointNr, p1, bDistance, d1); \ | |
213 | COPY4_IF_LT(bcost, costs[2], bmv, MV(m2x, m2y), bPointNr, p2, bDistance, d2); \ | |
214 | COPY4_IF_LT(bcost, costs[3], bmv, MV(m3x, m3y), bPointNr, p3, bDistance, d3); \ | |
215 | } | |
216 | ||
217 | #define COST_MV_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y) \ | |
218 | { \ | |
219 | pixel *pix_base = fref + omv.x + omv.y * stride; \ | |
220 | sad_x4(fenc, \ | |
221 | pix_base + (m0x) + (m0y) * stride, \ | |
222 | pix_base + (m1x) + (m1y) * stride, \ | |
223 | pix_base + (m2x) + (m2y) * stride, \ | |
224 | pix_base + (m3x) + (m3y) * stride, \ | |
225 | stride, costs); \ | |
226 | costs[0] += mvcost((omv + MV(m0x, m0y)) << 2); \ | |
227 | costs[1] += mvcost((omv + MV(m1x, m1y)) << 2); \ | |
228 | costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \ | |
229 | costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \ | |
230 | COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \ | |
231 | COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \ | |
232 | COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \ | |
233 | COPY2_IF_LT(bcost, costs[3], bmv, omv + MV(m3x, m3y)); \ | |
234 | } | |
235 | ||
236 | #define COST_MV_X4_DIR(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs) \ | |
237 | { \ | |
238 | pixel *pix_base = fref + bmv.x + bmv.y * stride; \ | |
239 | sad_x4(fenc, \ | |
240 | pix_base + (m0x) + (m0y) * stride, \ | |
241 | pix_base + (m1x) + (m1y) * stride, \ | |
242 | pix_base + (m2x) + (m2y) * stride, \ | |
243 | pix_base + (m3x) + (m3y) * stride, \ | |
244 | stride, costs); \ | |
245 | (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \ | |
246 | (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \ | |
247 | (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \ | |
248 | (costs)[3] += mvcost((bmv + MV(m3x, m3y)) << 2); \ | |
249 | } | |
250 | ||
251 | #define DIA1_ITER(mx, my) \ | |
252 | { \ | |
253 | omv.x = mx; omv.y = my; \ | |
254 | COST_MV_X4(0, -1, 0, 1, -1, 0, 1, 0); \ | |
255 | } | |
256 | ||
257 | #define CROSS(start, x_max, y_max) \ | |
258 | { \ | |
259 | int16_t i = start; \ | |
260 | if ((x_max) <= X265_MIN(mvmax.x - omv.x, omv.x - mvmin.x)) \ | |
261 | for (; i < (x_max) - 2; i += 4) { \ | |
262 | COST_MV_X4(i, 0, -i, 0, i + 2, 0, -i - 2, 0); } \ | |
263 | for (; i < (x_max); i += 2) \ | |
264 | { \ | |
265 | if (omv.x + i <= mvmax.x) \ | |
266 | COST_MV(omv.x + i, omv.y); \ | |
267 | if (omv.x - i >= mvmin.x) \ | |
268 | COST_MV(omv.x - i, omv.y); \ | |
269 | } \ | |
270 | i = start; \ | |
271 | if ((y_max) <= X265_MIN(mvmax.y - omv.y, omv.y - mvmin.y)) \ | |
272 | for (; i < (y_max) - 2; i += 4) { \ | |
273 | COST_MV_X4(0, i, 0, -i, 0, i + 2, 0, -i - 2); } \ | |
274 | for (; i < (y_max); i += 2) \ | |
275 | { \ | |
276 | if (omv.y + i <= mvmax.y) \ | |
277 | COST_MV(omv.x, omv.y + i); \ | |
278 | if (omv.y - i >= mvmin.y) \ | |
279 | COST_MV(omv.x, omv.y - i); \ | |
280 | } \ | |
281 | } | |
282 | ||
283 | void MotionEstimate::StarPatternSearch(ReferencePlanes *ref, | |
284 | const MV & mvmin, | |
285 | const MV & mvmax, | |
286 | MV & bmv, | |
287 | int & bcost, | |
288 | int & bPointNr, | |
289 | int & bDistance, | |
290 | int earlyExitIters, | |
291 | int merange) | |
292 | { | |
293 | ALIGN_VAR_16(int, costs[16]); | |
294 | pixel *fref = ref->fpelPlane + blockOffset; | |
295 | size_t stride = ref->lumaStride; | |
296 | ||
297 | MV omv = bmv; | |
298 | int saved = bcost; | |
299 | int rounds = 0; | |
300 | ||
301 | { | |
302 | int16_t dist = 1; | |
303 | ||
304 | /* bPointNr | |
305 | 2 | |
306 | 4 * 5 | |
307 | 7 | |
308 | */ | |
309 | const int16_t top = omv.y - dist; | |
310 | const int16_t bottom = omv.y + dist; | |
311 | const int16_t left = omv.x - dist; | |
312 | const int16_t right = omv.x + dist; | |
313 | ||
314 | if (top >= mvmin.y && left >= mvmin.x && right <= mvmax.x && bottom <= mvmax.y) | |
315 | { | |
316 | COST_MV_PT_DIST_X4(omv.x, top, 2, dist, | |
317 | left, omv.y, 4, dist, | |
318 | right, omv.y, 5, dist, | |
319 | omv.x, bottom, 7, dist); | |
320 | } | |
321 | else | |
322 | { | |
323 | if (top >= mvmin.y) // check top | |
324 | { | |
325 | COST_MV_PT_DIST(omv.x, top, 2, dist); | |
326 | } | |
327 | if (left >= mvmin.x) // check middle left | |
328 | { | |
329 | COST_MV_PT_DIST(left, omv.y, 4, dist); | |
330 | } | |
331 | if (right <= mvmax.x) // check middle right | |
332 | { | |
333 | COST_MV_PT_DIST(right, omv.y, 5, dist); | |
334 | } | |
335 | if (bottom <= mvmax.y) // check bottom | |
336 | { | |
337 | COST_MV_PT_DIST(omv.x, bottom, 7, dist); | |
338 | } | |
339 | } | |
340 | if (bcost < saved) | |
341 | rounds = 0; | |
342 | else if (++rounds >= earlyExitIters) | |
343 | return; | |
344 | } | |
345 | ||
346 | for (int16_t dist = 2; dist <= 8; dist <<= 1) | |
347 | { | |
348 | /* bPointNr | |
349 | 2 | |
350 | 1 3 | |
351 | 4 * 5 | |
352 | 6 8 | |
353 | 7 | |
354 | Points 2, 4, 5, 7 are dist | |
355 | Points 1, 3, 6, 8 are dist>>1 | |
356 | */ | |
357 | const int16_t top = omv.y - dist; | |
358 | const int16_t bottom = omv.y + dist; | |
359 | const int16_t left = omv.x - dist; | |
360 | const int16_t right = omv.x + dist; | |
361 | const int16_t top2 = omv.y - (dist >> 1); | |
362 | const int16_t bottom2 = omv.y + (dist >> 1); | |
363 | const int16_t left2 = omv.x - (dist >> 1); | |
364 | const int16_t right2 = omv.x + (dist >> 1); | |
365 | saved = bcost; | |
366 | ||
367 | if (top >= mvmin.y && left >= mvmin.x && | |
368 | right <= mvmax.x && bottom <= mvmax.y) // check border | |
369 | { | |
370 | COST_MV_PT_DIST_X4(omv.x, top, 2, dist, | |
371 | left2, top2, 1, dist >> 1, | |
372 | right2, top2, 3, dist >> 1, | |
373 | left, omv.y, 4, dist); | |
374 | COST_MV_PT_DIST_X4(right, omv.y, 5, dist, | |
375 | left2, bottom2, 6, dist >> 1, | |
376 | right2, bottom2, 8, dist >> 1, | |
377 | omv.x, bottom, 7, dist); | |
378 | } | |
379 | else // check border for each mv | |
380 | { | |
381 | if (top >= mvmin.y) // check top | |
382 | { | |
383 | COST_MV_PT_DIST(omv.x, top, 2, dist); | |
384 | } | |
385 | if (top2 >= mvmin.y) // check half top | |
386 | { | |
387 | if (left2 >= mvmin.x) // check half left | |
388 | { | |
389 | COST_MV_PT_DIST(left2, top2, 1, (dist >> 1)); | |
390 | } | |
391 | if (right2 <= mvmax.x) // check half right | |
392 | { | |
393 | COST_MV_PT_DIST(right2, top2, 3, (dist >> 1)); | |
394 | } | |
395 | } | |
396 | if (left >= mvmin.x) // check left | |
397 | { | |
398 | COST_MV_PT_DIST(left, omv.y, 4, dist); | |
399 | } | |
400 | if (right <= mvmax.x) // check right | |
401 | { | |
402 | COST_MV_PT_DIST(right, omv.y, 5, dist); | |
403 | } | |
404 | if (bottom2 <= mvmax.y) // check half bottom | |
405 | { | |
406 | if (left2 >= mvmin.x) // check half left | |
407 | { | |
408 | COST_MV_PT_DIST(left2, bottom2, 6, (dist >> 1)); | |
409 | } | |
410 | if (right2 <= mvmax.x) // check half right | |
411 | { | |
412 | COST_MV_PT_DIST(right2, bottom2, 8, (dist >> 1)); | |
413 | } | |
414 | } | |
415 | if (bottom <= mvmax.y) // check bottom | |
416 | { | |
417 | COST_MV_PT_DIST(omv.x, bottom, 7, dist); | |
418 | } | |
419 | } | |
420 | ||
421 | if (bcost < saved) | |
422 | rounds = 0; | |
423 | else if (++rounds >= earlyExitIters) | |
424 | return; | |
425 | } | |
426 | ||
427 | for (int16_t dist = 16; dist <= (int16_t)merange; dist <<= 1) | |
428 | { | |
429 | const int16_t top = omv.y - dist; | |
430 | const int16_t bottom = omv.y + dist; | |
431 | const int16_t left = omv.x - dist; | |
432 | const int16_t right = omv.x + dist; | |
433 | ||
434 | saved = bcost; | |
435 | if (top >= mvmin.y && left >= mvmin.x && | |
436 | right <= mvmax.x && bottom <= mvmax.y) // check border | |
437 | { | |
438 | /* index | |
439 | 0 | |
440 | 3 | |
441 | 2 | |
442 | 1 | |
443 | 0 3 2 1 * 1 2 3 0 | |
444 | 1 | |
445 | 2 | |
446 | 3 | |
447 | 0 | |
448 | */ | |
449 | ||
450 | COST_MV_PT_DIST_X4(omv.x, top, 0, dist, | |
451 | left, omv.y, 0, dist, | |
452 | right, omv.y, 0, dist, | |
453 | omv.x, bottom, 0, dist); | |
454 | ||
455 | for (int16_t index = 1; index < 4; index++) | |
456 | { | |
457 | int16_t posYT = top + ((dist >> 2) * index); | |
458 | int16_t posYB = bottom - ((dist >> 2) * index); | |
459 | int16_t posXL = omv.x - ((dist >> 2) * index); | |
460 | int16_t posXR = omv.x + ((dist >> 2) * index); | |
461 | ||
462 | COST_MV_PT_DIST_X4(posXL, posYT, 0, dist, | |
463 | posXR, posYT, 0, dist, | |
464 | posXL, posYB, 0, dist, | |
465 | posXR, posYB, 0, dist); | |
466 | } | |
467 | } | |
468 | else // check border for each mv | |
469 | { | |
470 | if (top >= mvmin.y) // check top | |
471 | { | |
472 | COST_MV_PT_DIST(omv.x, top, 0, dist); | |
473 | } | |
474 | if (left >= mvmin.x) // check left | |
475 | { | |
476 | COST_MV_PT_DIST(left, omv.y, 0, dist); | |
477 | } | |
478 | if (right <= mvmax.x) // check right | |
479 | { | |
480 | COST_MV_PT_DIST(right, omv.y, 0, dist); | |
481 | } | |
482 | if (bottom <= mvmax.y) // check bottom | |
483 | { | |
484 | COST_MV_PT_DIST(omv.x, bottom, 0, dist); | |
485 | } | |
486 | for (int16_t index = 1; index < 4; index++) | |
487 | { | |
488 | int16_t posYT = top + ((dist >> 2) * index); | |
489 | int16_t posYB = bottom - ((dist >> 2) * index); | |
490 | int16_t posXL = omv.x - ((dist >> 2) * index); | |
491 | int16_t posXR = omv.x + ((dist >> 2) * index); | |
492 | ||
493 | if (posYT >= mvmin.y) // check top | |
494 | { | |
495 | if (posXL >= mvmin.x) // check left | |
496 | { | |
497 | COST_MV_PT_DIST(posXL, posYT, 0, dist); | |
498 | } | |
499 | if (posXR <= mvmax.x) // check right | |
500 | { | |
501 | COST_MV_PT_DIST(posXR, posYT, 0, dist); | |
502 | } | |
503 | } | |
504 | if (posYB <= mvmax.y) // check bottom | |
505 | { | |
506 | if (posXL >= mvmin.x) // check left | |
507 | { | |
508 | COST_MV_PT_DIST(posXL, posYB, 0, dist); | |
509 | } | |
510 | if (posXR <= mvmax.x) // check right | |
511 | { | |
512 | COST_MV_PT_DIST(posXR, posYB, 0, dist); | |
513 | } | |
514 | } | |
515 | } | |
516 | } | |
517 | ||
518 | if (bcost < saved) | |
519 | rounds = 0; | |
520 | else if (++rounds >= earlyExitIters) | |
521 | return; | |
522 | } | |
523 | } | |
524 | ||
525 | int MotionEstimate::motionEstimate(ReferencePlanes *ref, | |
526 | const MV & mvmin, | |
527 | const MV & mvmax, | |
528 | const MV & qmvp, | |
529 | int numCandidates, | |
530 | const MV * mvc, | |
531 | int merange, | |
532 | MV & outQMv) | |
533 | { | |
534 | ALIGN_VAR_16(int, costs[16]); | |
535 | size_t stride = ref->lumaStride; | |
536 | pixel *fref = ref->fpelPlane + blockOffset; | |
537 | ||
538 | setMVP(qmvp); | |
539 | ||
540 | MV qmvmin = mvmin.toQPel(); | |
541 | MV qmvmax = mvmax.toQPel(); | |
542 | ||
543 | /* The term cost used here means satd/sad values for that particular search. | |
544 | * The costs used in ME integer search only includes the SAD cost of motion | |
545 | * residual and sqrtLambda times MVD bits. The subpel refine steps use SATD | |
546 | * cost of residual and sqrtLambda * MVD bits. Mode decision will be based | |
547 | * on video distortion cost (SSE/PSNR) plus lambda times all signaling bits | |
548 | * (mode + MVD bits). */ | |
549 | ||
550 | // measure SAD cost at clipped QPEL MVP | |
551 | MV pmv = qmvp.clipped(qmvmin, qmvmax); | |
552 | MV bestpre = pmv; | |
553 | int bprecost; | |
554 | ||
555 | if (ref->isLowres) | |
556 | bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad); | |
557 | else | |
558 | bprecost = subpelCompare(ref, pmv, sad); | |
559 | ||
560 | /* re-measure full pel rounded MVP with SAD as search start point */ | |
561 | MV bmv = pmv.roundToFPel(); | |
562 | int bcost = bprecost; | |
563 | if (pmv.isSubpel()) | |
564 | { | |
565 | bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2); | |
566 | } | |
567 | ||
568 | // measure SAD cost at MV(0) if MVP is not zero | |
569 | if (pmv.notZero()) | |
570 | { | |
571 | int cost = sad(fenc, FENC_STRIDE, fref, stride) + mvcost(MV(0, 0)); | |
572 | if (cost < bcost) | |
573 | { | |
574 | bcost = cost; | |
575 | bmv = 0; | |
576 | } | |
577 | } | |
578 | ||
579 | // measure SAD cost at each QPEL motion vector candidate | |
580 | for (int i = 0; i < numCandidates; i++) | |
581 | { | |
582 | MV m = mvc[i].clipped(qmvmin, qmvmax); | |
583 | if (m.notZero() && m != pmv && m != bestpre) // check already measured | |
584 | { | |
585 | int cost; | |
586 | if (ref->isLowres) | |
587 | cost = ref->lowresQPelCost(fenc, blockOffset, m, sad) + mvcost(m); | |
588 | else | |
589 | cost = subpelCompare(ref, m, sad) + mvcost(m); | |
590 | ||
591 | if (cost < bprecost) | |
592 | { | |
593 | bprecost = cost; | |
594 | bestpre = m; | |
595 | } | |
596 | } | |
597 | } | |
598 | ||
599 | pmv = pmv.roundToFPel(); | |
600 | MV omv = bmv; // current search origin or starting point | |
601 | ||
602 | switch (searchMethod) | |
603 | { | |
604 | case X265_DIA_SEARCH: | |
605 | { | |
606 | /* diamond search, radius 1 */ | |
607 | bcost <<= 4; | |
608 | int i = merange; | |
609 | do | |
610 | { | |
611 | COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs); | |
612 | COPY1_IF_LT(bcost, (costs[0] << 4) + 1); | |
613 | COPY1_IF_LT(bcost, (costs[1] << 4) + 3); | |
614 | COPY1_IF_LT(bcost, (costs[2] << 4) + 4); | |
615 | COPY1_IF_LT(bcost, (costs[3] << 4) + 12); | |
616 | if (!(bcost & 15)) | |
617 | break; | |
618 | bmv.x -= (bcost << 28) >> 30; | |
619 | bmv.y -= (bcost << 30) >> 30; | |
620 | bcost &= ~15; | |
621 | } | |
622 | while (--i && bmv.checkRange(mvmin, mvmax)); | |
623 | bcost >>= 4; | |
624 | break; | |
625 | } | |
626 | ||
627 | case X265_HEX_SEARCH: | |
628 | { | |
629 | me_hex2: | |
630 | /* hexagon search, radius 2 */ | |
631 | #if 0 | |
632 | for (int i = 0; i < merange / 2; i++) | |
633 | { | |
634 | omv = bmv; | |
635 | COST_MV(omv.x - 2, omv.y); | |
636 | COST_MV(omv.x - 1, omv.y + 2); | |
637 | COST_MV(omv.x + 1, omv.y + 2); | |
638 | COST_MV(omv.x + 2, omv.y); | |
639 | COST_MV(omv.x + 1, omv.y - 2); | |
640 | COST_MV(omv.x - 1, omv.y - 2); | |
641 | if (omv == bmv) | |
642 | break; | |
643 | if (!bmv.checkRange(mvmin, mvmax)) | |
644 | break; | |
645 | } | |
646 | ||
647 | #else // if 0 | |
648 | /* equivalent to the above, but eliminates duplicate candidates */ | |
649 | COST_MV_X3_DIR(-2, 0, -1, 2, 1, 2, costs); | |
650 | bcost <<= 3; | |
651 | COPY1_IF_LT(bcost, (costs[0] << 3) + 2); | |
652 | COPY1_IF_LT(bcost, (costs[1] << 3) + 3); | |
653 | COPY1_IF_LT(bcost, (costs[2] << 3) + 4); | |
654 | COST_MV_X3_DIR(2, 0, 1, -2, -1, -2, costs); | |
655 | COPY1_IF_LT(bcost, (costs[0] << 3) + 5); | |
656 | COPY1_IF_LT(bcost, (costs[1] << 3) + 6); | |
657 | COPY1_IF_LT(bcost, (costs[2] << 3) + 7); | |
658 | ||
659 | if (bcost & 7) | |
660 | { | |
661 | int dir = (bcost & 7) - 2; | |
662 | bmv += hex2[dir + 1]; | |
663 | ||
664 | /* half hexagon, not overlapping the previous iteration */ | |
665 | for (int i = (merange >> 1) - 1; i > 0 && bmv.checkRange(mvmin, mvmax); i--) | |
666 | { | |
667 | COST_MV_X3_DIR(hex2[dir + 0].x, hex2[dir + 0].y, | |
668 | hex2[dir + 1].x, hex2[dir + 1].y, | |
669 | hex2[dir + 2].x, hex2[dir + 2].y, | |
670 | costs); | |
671 | bcost &= ~7; | |
672 | COPY1_IF_LT(bcost, (costs[0] << 3) + 1); | |
673 | COPY1_IF_LT(bcost, (costs[1] << 3) + 2); | |
674 | COPY1_IF_LT(bcost, (costs[2] << 3) + 3); | |
675 | if (!(bcost & 7)) | |
676 | break; | |
677 | dir += (bcost & 7) - 2; | |
678 | dir = mod6m1[dir + 1]; | |
679 | bmv += hex2[dir + 1]; | |
680 | } | |
681 | } | |
682 | bcost >>= 3; | |
683 | #endif // if 0 | |
684 | ||
685 | /* square refine */ | |
686 | int dir = 0; | |
687 | COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs); | |
688 | COPY2_IF_LT(bcost, costs[0], dir, 1); | |
689 | COPY2_IF_LT(bcost, costs[1], dir, 2); | |
690 | COPY2_IF_LT(bcost, costs[2], dir, 3); | |
691 | COPY2_IF_LT(bcost, costs[3], dir, 4); | |
692 | COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs); | |
693 | COPY2_IF_LT(bcost, costs[0], dir, 5); | |
694 | COPY2_IF_LT(bcost, costs[1], dir, 6); | |
695 | COPY2_IF_LT(bcost, costs[2], dir, 7); | |
696 | COPY2_IF_LT(bcost, costs[3], dir, 8); | |
697 | bmv += square1[dir]; | |
698 | break; | |
699 | } | |
700 | ||
701 | case X265_UMH_SEARCH: | |
702 | { | |
703 | int ucost1, ucost2; | |
704 | int16_t cross_start = 1; | |
705 | ||
706 | /* refine predictors */ | |
707 | omv = bmv; | |
708 | ucost1 = bcost; | |
709 | DIA1_ITER(pmv.x, pmv.y); | |
710 | if (pmv.notZero()) | |
711 | DIA1_ITER(0, 0); | |
712 | ||
713 | ucost2 = bcost; | |
714 | if (bmv.notZero() && bmv != pmv) | |
715 | DIA1_ITER(bmv.x, bmv.y); | |
716 | if (bcost == ucost2) | |
717 | cross_start = 3; | |
718 | ||
719 | /* Early Termination */ | |
720 | omv = bmv; | |
721 | if (bcost == ucost2 && SAD_THRESH(2000)) | |
722 | { | |
723 | COST_MV_X4(0, -2, -1, -1, 1, -1, -2, 0); | |
724 | COST_MV_X4(2, 0, -1, 1, 1, 1, 0, 2); | |
725 | if (bcost == ucost1 && SAD_THRESH(500)) | |
726 | break; | |
727 | if (bcost == ucost2) | |
728 | { | |
729 | int16_t range = (int16_t)(merange >> 1) | 1; | |
730 | CROSS(3, range, range); | |
731 | COST_MV_X4(-1, -2, 1, -2, -2, -1, 2, -1); | |
732 | COST_MV_X4(-2, 1, 2, 1, -1, 2, 1, 2); | |
733 | if (bcost == ucost2) | |
734 | break; | |
735 | cross_start = range + 2; | |
736 | } | |
737 | } | |
738 | ||
739 | // TODO: Need to study x264's logic for building mvc list to understand why they | |
740 | // have special cases here for 16x16, and whether they apply to HEVC CTU | |
741 | ||
742 | // adaptive search range based on mvc variability | |
743 | if (numCandidates) | |
744 | { | |
745 | /* range multipliers based on casual inspection of some statistics of | |
746 | * average distance between current predictor and final mv found by ESA. | |
747 | * these have not been tuned much by actual encoding. */ | |
748 | static const uint8_t range_mul[4][4] = | |
749 | { | |
750 | { 3, 3, 4, 4 }, | |
751 | { 3, 4, 4, 4 }, | |
752 | { 4, 4, 4, 5 }, | |
753 | { 4, 4, 5, 6 }, | |
754 | }; | |
755 | ||
756 | int mvd; | |
757 | int sad_ctx, mvd_ctx; | |
758 | int denom = 1; | |
759 | ||
760 | if (numCandidates == 1) | |
761 | { | |
762 | if (LUMA_64x64 == partEnum) | |
763 | /* mvc is probably the same as mvp, so the difference isn't meaningful. | |
764 | * but prediction usually isn't too bad, so just use medium range */ | |
765 | mvd = 25; | |
766 | else | |
767 | mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y); | |
768 | } | |
769 | else | |
770 | { | |
771 | /* calculate the degree of agreement between predictors. */ | |
772 | ||
773 | /* in 64x64, mvc includes all the neighbors used to make mvp, | |
774 | * so don't count mvp separately. */ | |
775 | ||
776 | denom = numCandidates - 1; | |
777 | mvd = 0; | |
778 | if (partEnum != LUMA_64x64) | |
779 | { | |
780 | mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y); | |
781 | denom++; | |
782 | } | |
783 | mvd += x265_predictor_difference(mvc, numCandidates); | |
784 | } | |
785 | ||
786 | sad_ctx = SAD_THRESH(1000) ? 0 | |
787 | : SAD_THRESH(2000) ? 1 | |
788 | : SAD_THRESH(4000) ? 2 : 3; | |
789 | mvd_ctx = mvd < 10 * denom ? 0 | |
790 | : mvd < 20 * denom ? 1 | |
791 | : mvd < 40 * denom ? 2 : 3; | |
792 | ||
793 | merange = (merange * range_mul[mvd_ctx][sad_ctx]) >> 2; | |
794 | } | |
795 | ||
796 | /* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy. | |
797 | * we are still centered on the same place as the DIA2. is this desirable? */ | |
798 | CROSS(cross_start, merange, merange >> 1); | |
799 | COST_MV_X4(-2, -2, -2, 2, 2, -2, 2, 2); | |
800 | ||
801 | /* hexagon grid */ | |
802 | omv = bmv; | |
803 | const uint16_t *p_cost_omvx = m_cost_mvx + omv.x * 4; | |
804 | const uint16_t *p_cost_omvy = m_cost_mvy + omv.y * 4; | |
805 | uint16_t i = 1; | |
806 | do | |
807 | { | |
808 | if (4 * i > X265_MIN4(mvmax.x - omv.x, omv.x - mvmin.x, | |
809 | mvmax.y - omv.y, omv.y - mvmin.y)) | |
810 | { | |
811 | for (int j = 0; j < 16; j++) | |
812 | { | |
813 | MV mv = omv + (hex4[j] * i); | |
814 | if (mv.checkRange(mvmin, mvmax)) | |
815 | COST_MV(mv.x, mv.y); | |
816 | } | |
817 | } | |
818 | else | |
819 | { | |
820 | int16_t dir = 0; | |
821 | pixel *fref_base = fref + omv.x + (omv.y - 4 * i) * stride; | |
822 | size_t dy = (size_t)i * stride; | |
823 | #define SADS(k, x0, y0, x1, y1, x2, y2, x3, y3) \ | |
824 | sad_x4(fenc, \ | |
825 | fref_base x0 * i + (y0 - 2 * k + 4) * dy, \ | |
826 | fref_base x1 * i + (y1 - 2 * k + 4) * dy, \ | |
827 | fref_base x2 * i + (y2 - 2 * k + 4) * dy, \ | |
828 | fref_base x3 * i + (y3 - 2 * k + 4) * dy, \ | |
829 | stride, costs + 4 * k); \ | |
830 | fref_base += 2 * dy; | |
831 | #define ADD_MVCOST(k, x, y) costs[k] += p_cost_omvx[x * 4 * i] + p_cost_omvy[y * 4 * i] | |
832 | #define MIN_MV(k, x, y) COPY2_IF_LT(bcost, costs[k], dir, x * 16 + (y & 15)) | |
833 | ||
834 | SADS(0, +0, -4, +0, +4, -2, -3, +2, -3); | |
835 | SADS(1, -4, -2, +4, -2, -4, -1, +4, -1); | |
836 | SADS(2, -4, +0, +4, +0, -4, +1, +4, +1); | |
837 | SADS(3, -4, +2, +4, +2, -2, +3, +2, +3); | |
838 | ADD_MVCOST(0, 0, -4); | |
839 | ADD_MVCOST(1, 0, 4); | |
840 | ADD_MVCOST(2, -2, -3); | |
841 | ADD_MVCOST(3, 2, -3); | |
842 | ADD_MVCOST(4, -4, -2); | |
843 | ADD_MVCOST(5, 4, -2); | |
844 | ADD_MVCOST(6, -4, -1); | |
845 | ADD_MVCOST(7, 4, -1); | |
846 | ADD_MVCOST(8, -4, 0); | |
847 | ADD_MVCOST(9, 4, 0); | |
848 | ADD_MVCOST(10, -4, 1); | |
849 | ADD_MVCOST(11, 4, 1); | |
850 | ADD_MVCOST(12, -4, 2); | |
851 | ADD_MVCOST(13, 4, 2); | |
852 | ADD_MVCOST(14, -2, 3); | |
853 | ADD_MVCOST(15, 2, 3); | |
854 | MIN_MV(0, 0, -4); | |
855 | MIN_MV(1, 0, 4); | |
856 | MIN_MV(2, -2, -3); | |
857 | MIN_MV(3, 2, -3); | |
858 | MIN_MV(4, -4, -2); | |
859 | MIN_MV(5, 4, -2); | |
860 | MIN_MV(6, -4, -1); | |
861 | MIN_MV(7, 4, -1); | |
862 | MIN_MV(8, -4, 0); | |
863 | MIN_MV(9, 4, 0); | |
864 | MIN_MV(10, -4, 1); | |
865 | MIN_MV(11, 4, 1); | |
866 | MIN_MV(12, -4, 2); | |
867 | MIN_MV(13, 4, 2); | |
868 | MIN_MV(14, -2, 3); | |
869 | MIN_MV(15, 2, 3); | |
870 | #undef SADS | |
871 | #undef ADD_MVCOST | |
872 | #undef MIN_MV | |
873 | if (dir) | |
874 | { | |
875 | bmv.x = omv.x + i * (dir >> 4); | |
876 | bmv.y = omv.y + i * ((dir << 28) >> 28); | |
877 | } | |
878 | } | |
879 | } | |
880 | while (++i <= merange >> 2); | |
881 | if (bmv.checkRange(mvmin, mvmax)) | |
882 | goto me_hex2; | |
883 | break; | |
884 | } | |
885 | ||
886 | case X265_STAR_SEARCH: // Adapted from HM ME | |
887 | { | |
888 | int bPointNr = 0; | |
889 | int bDistance = 0; | |
890 | ||
891 | const int EarlyExitIters = 3; | |
892 | StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, EarlyExitIters, merange); | |
893 | if (bDistance == 1) | |
894 | { | |
895 | // if best distance was only 1, check two missing points. If no new point is found, stop | |
896 | if (bPointNr) | |
897 | { | |
898 | /* For a given direction 1 to 8, check nearest two outer X pixels | |
899 | X X | |
900 | X 1 2 3 X | |
901 | 4 * 5 | |
902 | X 6 7 8 X | |
903 | X X | |
904 | */ | |
905 | int saved = bcost; | |
906 | const MV mv1 = bmv + offsets[(bPointNr - 1) * 2]; | |
907 | const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1]; | |
908 | if (mv1.checkRange(mvmin, mvmax)) | |
909 | { | |
910 | COST_MV(mv1.x, mv1.y); | |
911 | } | |
912 | if (mv2.checkRange(mvmin, mvmax)) | |
913 | { | |
914 | COST_MV(mv2.x, mv2.y); | |
915 | } | |
916 | if (bcost == saved) | |
917 | break; | |
918 | } | |
919 | else | |
920 | break; | |
921 | } | |
922 | ||
923 | const int RasterDistance = 5; | |
924 | if (bDistance > RasterDistance) | |
925 | { | |
926 | // raster search refinement if original search distance was too big | |
927 | MV tmv; | |
928 | for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y += RasterDistance) | |
929 | { | |
930 | for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x += RasterDistance) | |
931 | { | |
932 | if (tmv.x + (RasterDistance * 3) <= mvmax.x) | |
933 | { | |
934 | pixel *pix_base = fref + tmv.y * stride + tmv.x; | |
935 | sad_x4(fenc, | |
936 | pix_base, | |
937 | pix_base + RasterDistance, | |
938 | pix_base + RasterDistance * 2, | |
939 | pix_base + RasterDistance * 3, | |
940 | stride, costs); | |
941 | costs[0] += mvcost(tmv << 2); | |
942 | COPY2_IF_LT(bcost, costs[0], bmv, tmv); | |
943 | tmv.x += RasterDistance; | |
944 | costs[1] += mvcost(tmv << 2); | |
945 | COPY2_IF_LT(bcost, costs[1], bmv, tmv); | |
946 | tmv.x += RasterDistance; | |
947 | costs[2] += mvcost(tmv << 2); | |
948 | COPY2_IF_LT(bcost, costs[2], bmv, tmv); | |
949 | tmv.x += RasterDistance; | |
950 | costs[3] += mvcost(tmv << 3); | |
951 | COPY2_IF_LT(bcost, costs[3], bmv, tmv); | |
952 | } | |
953 | else | |
954 | COST_MV(tmv.x, tmv.y); | |
955 | } | |
956 | } | |
957 | } | |
958 | ||
959 | while (bDistance > 0) | |
960 | { | |
961 | // center a new search around current best | |
962 | bDistance = 0; | |
963 | bPointNr = 0; | |
964 | const int MaxIters = 32; | |
965 | StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, MaxIters, merange); | |
966 | ||
967 | if (bDistance == 1) | |
968 | { | |
969 | if (!bPointNr) | |
970 | break; | |
971 | ||
972 | /* For a given direction 1 to 8, check nearest 2 outer X pixels | |
973 | X X | |
974 | X 1 2 3 X | |
975 | 4 * 5 | |
976 | X 6 7 8 X | |
977 | X X | |
978 | */ | |
979 | const MV mv1 = bmv + offsets[(bPointNr - 1) * 2]; | |
980 | const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1]; | |
981 | if (mv1.checkRange(mvmin, mvmax)) | |
982 | { | |
983 | COST_MV(mv1.x, mv1.y); | |
984 | } | |
985 | if (mv2.checkRange(mvmin, mvmax)) | |
986 | { | |
987 | COST_MV(mv2.x, mv2.y); | |
988 | } | |
989 | break; | |
990 | } | |
991 | } | |
992 | ||
993 | break; | |
994 | } | |
995 | ||
996 | case X265_FULL_SEARCH: | |
997 | { | |
998 | // dead slow exhaustive search, but at least it uses sad_x4() | |
999 | MV tmv; | |
1000 | for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y++) | |
1001 | { | |
1002 | for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x++) | |
1003 | { | |
1004 | if (tmv.x + 3 <= mvmax.x) | |
1005 | { | |
1006 | pixel *pix_base = fref + tmv.y * stride + tmv.x; | |
1007 | sad_x4(fenc, | |
1008 | pix_base, | |
1009 | pix_base + 1, | |
1010 | pix_base + 2, | |
1011 | pix_base + 3, | |
1012 | stride, costs); | |
1013 | costs[0] += mvcost(tmv << 2); | |
1014 | COPY2_IF_LT(bcost, costs[0], bmv, tmv); | |
1015 | tmv.x++; | |
1016 | costs[1] += mvcost(tmv << 2); | |
1017 | COPY2_IF_LT(bcost, costs[1], bmv, tmv); | |
1018 | tmv.x++; | |
1019 | costs[2] += mvcost(tmv << 2); | |
1020 | COPY2_IF_LT(bcost, costs[2], bmv, tmv); | |
1021 | tmv.x++; | |
1022 | costs[3] += mvcost(tmv << 2); | |
1023 | COPY2_IF_LT(bcost, costs[3], bmv, tmv); | |
1024 | } | |
1025 | else | |
1026 | COST_MV(tmv.x, tmv.y); | |
1027 | } | |
1028 | } | |
1029 | ||
1030 | break; | |
1031 | } | |
1032 | ||
1033 | default: | |
1034 | X265_CHECK(0, "invalid motion estimate mode\n"); | |
1035 | break; | |
1036 | } | |
1037 | ||
1038 | if (bprecost < bcost) | |
1039 | { | |
1040 | bmv = bestpre; | |
1041 | bcost = bprecost; | |
1042 | } | |
1043 | else | |
1044 | bmv = bmv.toQPel(); // promote search bmv to qpel | |
1045 | ||
1046 | SubpelWorkload& wl = workload[this->subpelRefine]; | |
1047 | ||
1048 | if (!bcost) | |
1049 | { | |
1050 | /* if there was zero residual at the clipped MVP, we can skip subpel | |
1051 | * refine, but we do need to include the mvcost in the returned cost */ | |
1052 | bcost = mvcost(bmv); | |
1053 | } | |
1054 | else if (ref->isLowres) | |
1055 | { | |
1056 | int bdir = 0, cost; | |
1057 | for (int i = 1; i <= wl.hpel_dirs; i++) | |
1058 | { | |
1059 | MV qmv = bmv + square1[i] * 2; | |
1060 | cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv); | |
1061 | COPY2_IF_LT(bcost, cost, bdir, i); | |
1062 | } | |
1063 | ||
1064 | bmv += square1[bdir] * 2; | |
1065 | bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd) + mvcost(bmv); | |
1066 | ||
1067 | bdir = 0; | |
1068 | for (int i = 1; i <= wl.qpel_dirs; i++) | |
1069 | { | |
1070 | MV qmv = bmv + square1[i]; | |
1071 | cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv); | |
1072 | COPY2_IF_LT(bcost, cost, bdir, i); | |
1073 | } | |
1074 | ||
1075 | bmv += square1[bdir]; | |
1076 | } | |
1077 | else | |
1078 | { | |
1079 | pixelcmp_t hpelcomp; | |
1080 | ||
1081 | if (wl.hpel_satd) | |
1082 | { | |
1083 | bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv); | |
1084 | hpelcomp = satd; | |
1085 | } | |
1086 | else | |
1087 | hpelcomp = sad; | |
1088 | ||
1089 | for (int iter = 0; iter < wl.hpel_iters; iter++) | |
1090 | { | |
1091 | int bdir = 0, cost; | |
1092 | for (int i = 1; i <= wl.hpel_dirs; i++) | |
1093 | { | |
1094 | MV qmv = bmv + square1[i] * 2; | |
1095 | cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv); | |
1096 | COPY2_IF_LT(bcost, cost, bdir, i); | |
1097 | } | |
1098 | ||
1099 | if (bdir) | |
1100 | bmv += square1[bdir] * 2; | |
1101 | else | |
1102 | break; | |
1103 | } | |
1104 | ||
1105 | /* if HPEL search used SAD, remeasure with SATD before QPEL */ | |
1106 | if (!wl.hpel_satd) | |
1107 | bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv); | |
1108 | ||
1109 | for (int iter = 0; iter < wl.qpel_iters; iter++) | |
1110 | { | |
1111 | int bdir = 0, cost; | |
1112 | for (int i = 1; i <= wl.qpel_dirs; i++) | |
1113 | { | |
1114 | MV qmv = bmv + square1[i]; | |
1115 | cost = subpelCompare(ref, qmv, satd) + mvcost(qmv); | |
1116 | COPY2_IF_LT(bcost, cost, bdir, i); | |
1117 | } | |
1118 | ||
1119 | if (bdir) | |
1120 | bmv += square1[bdir]; | |
1121 | else | |
1122 | break; | |
1123 | } | |
1124 | } | |
1125 | ||
1126 | x265_emms(); | |
1127 | outQMv = bmv; | |
1128 | return bcost; | |
1129 | } | |
1130 | ||
1131 | int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv, pixelcmp_t cmp) | |
1132 | { | |
1133 | int xFrac = qmv.x & 0x3; | |
1134 | int yFrac = qmv.y & 0x3; | |
1135 | ||
1136 | if ((yFrac | xFrac) == 0) | |
1137 | { | |
1138 | pixel *fref = ref->fpelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * ref->lumaStride; | |
1139 | return cmp(fenc, FENC_STRIDE, fref, ref->lumaStride); | |
1140 | } | |
1141 | else | |
1142 | { | |
1143 | /* We are taking a short-cut here if the reference is weighted. To be | |
1144 | * accurate we should be interpolating unweighted pixels and weighting | |
1145 | * the final 16bit values prior to rounding and downshifting. Instead we | |
1146 | * are simply interpolating the weighted full-pel pixels. Not 100% | |
1147 | * accurate but good enough for fast qpel ME */ | |
1148 | ALIGN_VAR_32(pixel, subpelbuf[64 * 64]); | |
1149 | pixel *fref = ref->fpelPlane + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * ref->lumaStride; | |
1150 | if (yFrac == 0) | |
1151 | { | |
1152 | primitives.luma_hpp[partEnum](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, xFrac); | |
1153 | } | |
1154 | else if (xFrac == 0) | |
1155 | { | |
1156 | primitives.luma_vpp[partEnum](fref, ref->lumaStride, subpelbuf, FENC_STRIDE, yFrac); | |
1157 | } | |
1158 | else | |
1159 | { | |
1160 | ALIGN_VAR_32(int16_t, immed[64 * (64 + 8)]); | |
1161 | ||
1162 | int filterSize = NTAPS_LUMA; | |
1163 | int halfFilterSize = filterSize >> 1; | |
1164 | primitives.luma_hps[partEnum](fref, ref->lumaStride, immed, blockwidth, xFrac, 1); | |
1165 | primitives.luma_vsp[partEnum](immed + (halfFilterSize - 1) * blockwidth, blockwidth, subpelbuf, FENC_STRIDE, yFrac); | |
1166 | } | |
1167 | return cmp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE); | |
1168 | } | |
1169 | } |