Commit | Line | Data |
---|---|---|
72b9787e JB |
1 | /***************************************************************************** |
2 | * Copyright (C) 2013 x265 project | |
3 | * | |
4 | * Authors: Steve Borho <steve@borho.org> | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License as published by | |
8 | * the Free Software Foundation; either version 2 of the License, or | |
9 | * (at your option) any later version. | |
10 | * | |
11 | * This program is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | * GNU General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * along with this program; if not, write to the Free Software | |
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
19 | * | |
20 | * This program is also available under a commercial proprietary license. | |
21 | * For more information, contact us at license @ x265.com. | |
22 | *****************************************************************************/ | |
23 | ||
24 | #include "common.h" | |
25 | #include "primitives.h" | |
26 | #include "lowres.h" | |
27 | #include "motion.h" | |
28 | #include "x265.h" | |
29 | ||
30 | #if _MSC_VER | |
31 | #pragma warning(disable: 4127) // conditional expression is constant (macros use this construct) | |
32 | #endif | |
33 | ||
34 | using namespace x265; | |
35 | ||
36 | namespace { | |
b53f7c52 | 37 | |
72b9787e JB |
/* One row of the workload[] table: how much sub-pixel refinement is done
 * at a given subpel level. */
struct SubpelWorkload
{
    int  hpel_iters;  // number of half-pel refinement iterations
    int  hpel_dirs;   // candidate directions tested per half-pel iteration
    int  qpel_iters;  // number of quarter-pel refinement iterations
    int  qpel_dirs;   // candidate directions tested per quarter-pel iteration
    bool hpel_satd;   // true: half-pel candidates use SATD, false: SAD
};
46 | ||
b53f7c52 | 47 | const SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] = |
72b9787e JB |
48 | { |
49 | { 1, 4, 0, 4, false }, // 4 SAD HPEL only | |
50 | { 1, 4, 1, 4, false }, // 4 SAD HPEL + 4 SATD QPEL | |
51 | { 1, 4, 1, 4, true }, // 4 SATD HPEL + 4 SATD QPEL | |
52 | { 2, 4, 1, 4, true }, // 2x4 SATD HPEL + 4 SATD QPEL | |
53 | { 2, 4, 2, 4, true }, // 2x4 SATD HPEL + 2x4 SATD QPEL | |
54 | { 1, 8, 1, 8, true }, // 8 SATD HPEL + 8 SATD QPEL (default) | |
55 | { 2, 8, 1, 8, true }, // 2x8 SATD HPEL + 8 SATD QPEL | |
56 | { 2, 8, 2, 8, true }, // 2x8 SATD HPEL + 2x8 SATD QPEL | |
57 | }; | |
72b9787e | 58 | |
b53f7c52 JB |
59 | int sizeScale[NUM_LUMA_PARTITIONS]; |
/* True when the current best cost is below threshold v scaled for the current
 * partition: bcost < (v / 16) * sizeScale[partEnum].
 * Expects `bcost` and `partEnum` to be visible at the expansion site.
 * The argument is parenthesized so that expression arguments (e.g. a
 * conditional) are shifted as a whole rather than mis-binding with `>>`. */
#define SAD_THRESH(v) (bcost < ((((v)) >> 4) * sizeScale[partEnum]))
72b9787e | 61 | |
b53f7c52 | 62 | void initScales(void) |
72b9787e JB |
63 | { |
64 | #define SETUP_SCALE(W, H) \ | |
b53f7c52 | 65 | sizeScale[LUMA_ ## W ## x ## H] = (H * H) >> 4; |
72b9787e JB |
66 | SETUP_SCALE(4, 4); |
67 | SETUP_SCALE(8, 8); | |
68 | SETUP_SCALE(8, 4); | |
69 | SETUP_SCALE(4, 8); | |
70 | SETUP_SCALE(16, 16); | |
71 | SETUP_SCALE(16, 8); | |
72 | SETUP_SCALE(8, 16); | |
73 | SETUP_SCALE(16, 12); | |
74 | SETUP_SCALE(12, 16); | |
75 | SETUP_SCALE(4, 16); | |
76 | SETUP_SCALE(16, 4); | |
77 | SETUP_SCALE(32, 32); | |
78 | SETUP_SCALE(32, 16); | |
79 | SETUP_SCALE(16, 32); | |
80 | SETUP_SCALE(32, 24); | |
81 | SETUP_SCALE(24, 32); | |
82 | SETUP_SCALE(32, 8); | |
83 | SETUP_SCALE(8, 32); | |
84 | SETUP_SCALE(64, 64); | |
85 | SETUP_SCALE(64, 32); | |
86 | SETUP_SCALE(32, 64); | |
87 | SETUP_SCALE(64, 48); | |
88 | SETUP_SCALE(48, 64); | |
89 | SETUP_SCALE(64, 16); | |
90 | SETUP_SCALE(16, 64); | |
91 | #undef SETUP_SCALE | |
92 | } | |
93 | ||
72b9787e | 94 | /* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */ |
b53f7c52 JB |
95 | const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) }; |
96 | const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 }; /* (x-1)%6 */ | |
97 | const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) }; | |
98 | const MV hex4[16] = | |
72b9787e | 99 | { |
b53f7c52 | 100 | MV(0, -4), MV(0, 4), MV(-2, -3), MV(2, -3), |
72b9787e | 101 | MV(-4, -2), MV(4, -2), MV(-4, -1), MV(4, -1), |
b53f7c52 | 102 | MV(-4, 0), MV(4, 0), MV(-4, 1), MV(4, 1), |
72b9787e JB |
103 | MV(-4, 2), MV(4, 2), MV(-2, 3), MV(2, 3), |
104 | }; | |
b53f7c52 | 105 | const MV offsets[] = |
72b9787e JB |
106 | { |
107 | MV(-1, 0), MV(0, -1), | |
108 | MV(-1, -1), MV(1, -1), | |
109 | MV(-1, 0), MV(1, 0), | |
110 | MV(-1, 1), MV(-1, -1), | |
111 | MV(1, -1), MV(1, 1), | |
112 | MV(-1, 0), MV(0, 1), | |
113 | MV(-1, 1), MV(1, 1), | |
114 | MV(1, 0), MV(0, 1), | |
115 | }; // offsets for Two Point Search | |
116 | ||
b53f7c52 JB |
117 | /* sum of absolute differences between MV candidates, used for adaptive ME range */ |
118 | inline int predictorDifference(const MV *mvc, intptr_t numCandidates) | |
72b9787e JB |
119 | { |
120 | int sum = 0; | |
121 | ||
122 | for (int i = 0; i < numCandidates - 1; i++) | |
123 | { | |
124 | sum += abs(mvc[i].x - mvc[i + 1].x) | |
125 | + abs(mvc[i].y - mvc[i + 1].y); | |
126 | } | |
127 | ||
128 | return sum; | |
129 | } | |
130 | ||
b53f7c52 JB |
131 | } |
132 | ||
133 | MotionEstimate::MotionEstimate() | |
134 | { | |
135 | ctuAddr = -1; | |
136 | absPartIdx = -1; | |
137 | searchMethod = X265_HEX_SEARCH; | |
138 | subpelRefine = 2; | |
139 | bChromaSATD = false; | |
140 | chromaSatd = NULL; | |
141 | } | |
142 | ||
143 | void MotionEstimate::init(int method, int refine, int csp) | |
144 | { | |
145 | if (!sizeScale[0]) | |
146 | initScales(); | |
147 | ||
148 | searchMethod = method; | |
149 | subpelRefine = refine; | |
150 | fencPUYuv.create(FENC_STRIDE, csp); | |
151 | } | |
152 | ||
153 | MotionEstimate::~MotionEstimate() | |
154 | { | |
155 | fencPUYuv.destroy(); | |
156 | } | |
157 | ||
158 | /* Called by lookahead, luma only, no use of PicYuv */ | |
159 | void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight) | |
160 | { | |
161 | partEnum = partitionFromSizes(pwidth, pheight); | |
162 | X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n"); | |
163 | sad = primitives.sad[partEnum]; | |
164 | satd = primitives.satd[partEnum]; | |
165 | sad_x3 = primitives.sad_x3[partEnum]; | |
166 | sad_x4 = primitives.sad_x4[partEnum]; | |
167 | ||
168 | blockwidth = pwidth; | |
169 | blockOffset = offset; | |
170 | absPartIdx = ctuAddr = -1; | |
171 | ||
172 | /* copy PU block into cache */ | |
173 | primitives.luma_copy_pp[partEnum](fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride); | |
174 | X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n"); | |
175 | } | |
176 | ||
177 | /* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */ | |
178 | void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight) | |
179 | { | |
180 | partEnum = partitionFromSizes(pwidth, pheight); | |
181 | X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n"); | |
182 | sad = primitives.sad[partEnum]; | |
183 | satd = primitives.satd[partEnum]; | |
184 | sad_x3 = primitives.sad_x3[partEnum]; | |
185 | sad_x4 = primitives.sad_x4[partEnum]; | |
186 | chromaSatd = primitives.chroma[fencPUYuv.m_csp].satd[partEnum]; | |
187 | ||
188 | /* Enable chroma residual cost if subpelRefine level is greater than 2 and chroma block size | |
189 | * is an even multiple of 4x4 pixels (indicated by non-null chromaSatd pointer) */ | |
190 | bChromaSATD = subpelRefine > 2 && chromaSatd; | |
191 | X265_CHECK(!(bChromaSATD && !workload[subpelRefine].hpel_satd), "Chroma SATD cannot be used with SAD hpel\n"); | |
192 | ||
193 | ctuAddr = _ctuAddr; | |
194 | absPartIdx = cuPartIdx + puPartIdx; | |
195 | blockwidth = pwidth; | |
196 | blockOffset = 0; | |
197 | ||
198 | /* copy PU from CU Yuv */ | |
199 | fencPUYuv.copyPUFromYuv(srcFencYuv, puPartIdx, partEnum, bChromaSATD); | |
200 | } | |
201 | ||
72b9787e JB |
/* Evaluate the full-pel candidate (mx, my): SAD against the reference at that
 * position plus the cost of the MV scaled by << 2. When cheaper than bcost,
 * record it in bcost / bmv together with its point number and distance.
 * Expects sad, mvcost, fenc, fref, stride, bcost, bmv, bPointNr and bDistance
 * to be visible at the expansion site.
 * All parameters are parenthesized (as in COST_MV) so expression arguments
 * address the intended reference pixel. */
#define COST_MV_PT_DIST(mx, my, point, dist) \
    do \
    { \
        MV tmv(mx, my); \
        int cost = sad(fenc, FENC_STRIDE, fref + (mx) + (my) * stride, stride); \
        cost += mvcost(tmv << 2); \
        if (cost < bcost) { \
            bcost = cost; \
            bmv = tmv; \
            bPointNr = (point); \
            bDistance = (dist); \
        } \
    } while (0)
215 | ||
/* The search macros below capture names from the expansion scope (fenc, fref,
 * stride, sad/sad_x3/sad_x4, mvcost, bcost, bmv, omv, costs, mvmin, mvmax).
 * The COPYn_IF_LT helpers are defined elsewhere; presumably they assign the
 * listed values when the cost is lower than the current best - confirm. */

/* COST_MV: SAD at absolute full-pel position (mx, my) plus mvcost of that MV
 * scaled by << 2; the result is offered to bcost / bmv through COPY2_IF_LT. */
216 | #define COST_MV(mx, my) \ | |
217 | do \ | |
218 | { \ | |
219 | int cost = sad(fenc, FENC_STRIDE, fref + (mx) + (my) * stride, stride); \ | |
220 | cost += mvcost(MV(mx, my) << 2); \ | |
221 | COPY2_IF_LT(bcost, cost, bmv, MV(mx, my)); \ | |
222 | } while (0) | |
223 | ||
/* COST_MV_X3_DIR: cost of three offsets relative to bmv, written to costs[0..2]
 * (sad_x3 result plus mvcost). Does not touch bcost or bmv itself. */
224 | #define COST_MV_X3_DIR(m0x, m0y, m1x, m1y, m2x, m2y, costs) \ | |
225 | { \ | |
226 | pixel *pix_base = fref + bmv.x + bmv.y * stride; \ | |
227 | sad_x3(fenc, \ | |
228 | pix_base + (m0x) + (m0y) * stride, \ | |
229 | pix_base + (m1x) + (m1y) * stride, \ | |
230 | pix_base + (m2x) + (m2y) * stride, \ | |
231 | stride, costs); \ | |
232 | (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \ | |
233 | (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \ | |
234 | (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \ | |
235 | } | |
236 | ||
/* COST_MV_PT_DIST_X4: four absolute positions measured with one sad_x4 call.
 * Uses the `costs` array from the enclosing scope (not a parameter) and offers
 * each result, with its point number pN and distance dN, to
 * bcost / bmv / bPointNr / bDistance in argument order. */
237 | #define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \ | |
238 | { \ | |
239 | sad_x4(fenc, \ | |
240 | fref + (m0x) + (m0y) * stride, \ | |
241 | fref + (m1x) + (m1y) * stride, \ | |
242 | fref + (m2x) + (m2y) * stride, \ | |
243 | fref + (m3x) + (m3y) * stride, \ | |
244 | stride, costs); \ | |
245 | costs[0] += mvcost(MV(m0x, m0y) << 2); \ | |
246 | costs[1] += mvcost(MV(m1x, m1y) << 2); \ | |
247 | costs[2] += mvcost(MV(m2x, m2y) << 2); \ | |
248 | costs[3] += mvcost(MV(m3x, m3y) << 2); \ | |
249 | COPY4_IF_LT(bcost, costs[0], bmv, MV(m0x, m0y), bPointNr, p0, bDistance, d0); \ | |
250 | COPY4_IF_LT(bcost, costs[1], bmv, MV(m1x, m1y), bPointNr, p1, bDistance, d1); \ | |
251 | COPY4_IF_LT(bcost, costs[2], bmv, MV(m2x, m2y), bPointNr, p2, bDistance, d2); \ | |
252 | COPY4_IF_LT(bcost, costs[3], bmv, MV(m3x, m3y), bPointNr, p3, bDistance, d3); \ | |
253 | } | |
254 | ||
/* COST_MV_X4: four offsets relative to omv (the search origin), measured with
 * sad_x4 into the scope's `costs`, then offered to bcost / bmv. */
255 | #define COST_MV_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y) \ | |
256 | { \ | |
257 | pixel *pix_base = fref + omv.x + omv.y * stride; \ | |
258 | sad_x4(fenc, \ | |
259 | pix_base + (m0x) + (m0y) * stride, \ | |
260 | pix_base + (m1x) + (m1y) * stride, \ | |
261 | pix_base + (m2x) + (m2y) * stride, \ | |
262 | pix_base + (m3x) + (m3y) * stride, \ | |
263 | stride, costs); \ | |
264 | costs[0] += mvcost((omv + MV(m0x, m0y)) << 2); \ | |
265 | costs[1] += mvcost((omv + MV(m1x, m1y)) << 2); \ | |
266 | costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \ | |
267 | costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \ | |
268 | COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \ | |
269 | COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \ | |
270 | COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \ | |
271 | COPY2_IF_LT(bcost, costs[3], bmv, omv + MV(m3x, m3y)); \ | |
272 | } | |
273 | ||
/* COST_MV_X4_DIR: cost of four offsets relative to bmv, written to
 * costs[0..3]; the caller decides what to do with them (bcost / bmv untouched). */
274 | #define COST_MV_X4_DIR(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs) \ | |
275 | { \ | |
276 | pixel *pix_base = fref + bmv.x + bmv.y * stride; \ | |
277 | sad_x4(fenc, \ | |
278 | pix_base + (m0x) + (m0y) * stride, \ | |
279 | pix_base + (m1x) + (m1y) * stride, \ | |
280 | pix_base + (m2x) + (m2y) * stride, \ | |
281 | pix_base + (m3x) + (m3y) * stride, \ | |
282 | stride, costs); \ | |
283 | (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \ | |
284 | (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \ | |
285 | (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \ | |
286 | (costs)[3] += mvcost((bmv + MV(m3x, m3y)) << 2); \ | |
287 | } | |
288 | ||
/* DIA1_ITER: move the search origin omv to (mx, my) and test its four
 * distance-1 neighbours (up, down, left, right). */
289 | #define DIA1_ITER(mx, my) \ | |
290 | { \ | |
291 | omv.x = mx; omv.y = my; \ | |
292 | COST_MV_X4(0, -1, 0, 1, -1, 0, 1, 0); \ | |
293 | } | |
294 | ||
/* CROSS: scan a horizontal then a vertical line through omv, testing offsets
 * start, start + 2, ... below x_max / y_max on both sides. When the whole line
 * fits inside [mvmin, mvmax] the points are tested four at a time with
 * COST_MV_X4; the remaining points are tested one by one with a bounds check
 * on each. */
295 | #define CROSS(start, x_max, y_max) \ | |
296 | { \ | |
297 | int16_t i = start; \ | |
298 | if ((x_max) <= X265_MIN(mvmax.x - omv.x, omv.x - mvmin.x)) \ | |
299 | for (; i < (x_max) - 2; i += 4) { \ | |
300 | COST_MV_X4(i, 0, -i, 0, i + 2, 0, -i - 2, 0); } \ | |
301 | for (; i < (x_max); i += 2) \ | |
302 | { \ | |
303 | if (omv.x + i <= mvmax.x) \ | |
304 | COST_MV(omv.x + i, omv.y); \ | |
305 | if (omv.x - i >= mvmin.x) \ | |
306 | COST_MV(omv.x - i, omv.y); \ | |
307 | } \ | |
308 | i = start; \ | |
309 | if ((y_max) <= X265_MIN(mvmax.y - omv.y, omv.y - mvmin.y)) \ | |
310 | for (; i < (y_max) - 2; i += 4) { \ | |
311 | COST_MV_X4(0, i, 0, -i, 0, i + 2, 0, -i - 2); } \ | |
312 | for (; i < (y_max); i += 2) \ | |
313 | { \ | |
314 | if (omv.y + i <= mvmax.y) \ | |
315 | COST_MV(omv.x, omv.y + i); \ | |
316 | if (omv.y - i >= mvmin.y) \ | |
317 | COST_MV(omv.x, omv.y - i); \ | |
318 | } \ | |
319 | } | |
320 | ||
/* Star-pattern integer search around the incoming bmv.
 *
 * Candidates are tested in rings of growing distance around omv, which is a
 * copy of bmv taken on entry and never moved: distance 1 (4 points), distances
 * 2, 4, 8 (8 points each), then 16, 32, ... up to merange (16 points each).
 * Each candidate costs SAD plus MV cost; a cheaper one overwrites bcost, bmv,
 * bPointNr and bDistance.
 *
 * ref            - reference planes; fpelPlane[0] and lumaStride are read
 * mvmin, mvmax   - inclusive full-pel bounds for candidates
 * bmv, bcost     - in: start point and its cost; out: best point and cost
 * bPointNr       - out: point number of the best candidate (always 0 for
 *                  candidates at distance 16 and beyond)
 * bDistance      - out: distance value recorded with the best candidate
 * earlyExitIters - return once this many consecutive rings bring no improvement
 * merange        - largest ring distance to try
 *
 * When a whole ring lies inside [mvmin, mvmax] it is tested with the batched
 * COST_MV_PT_DIST_X4; otherwise each point is bounds-checked and tested alone. */
321 | void MotionEstimate::StarPatternSearch(ReferencePlanes *ref, | |
322 | const MV & mvmin, | |
323 | const MV & mvmax, | |
324 | MV & bmv, | |
325 | int & bcost, | |
326 | int & bPointNr, | |
327 | int & bDistance, | |
328 | int earlyExitIters, | |
329 | int merange) | |
330 | { | |
/* scratch buffer written by COST_MV_PT_DIST_X4 */
331 | ALIGN_VAR_16(int, costs[16]); | |
b53f7c52 JB |
332 | pixel* fenc = fencPUYuv.m_buf[0]; |
333 | pixel* fref = ref->fpelPlane[0] + blockOffset; | |
334 | intptr_t stride = ref->lumaStride; | |
72b9787e JB |
335 | |
/* omv: fixed ring centre; saved: best cost before the current ring;
 * rounds: consecutive rings without improvement */
336 | MV omv = bmv; | |
337 | int saved = bcost; | |
338 | int rounds = 0; | |
339 | ||
/* ring at distance 1: four axis-aligned neighbours */
340 | { | |
341 | int16_t dist = 1; | |
342 | ||
343 | /* bPointNr | |
344 | 2 | |
345 | 4 * 5 | |
346 | 7 | |
347 | */ | |
348 | const int16_t top = omv.y - dist; | |
349 | const int16_t bottom = omv.y + dist; | |
350 | const int16_t left = omv.x - dist; | |
351 | const int16_t right = omv.x + dist; | |
352 | ||
353 | if (top >= mvmin.y && left >= mvmin.x && right <= mvmax.x && bottom <= mvmax.y) | |
354 | { | |
355 | COST_MV_PT_DIST_X4(omv.x, top, 2, dist, | |
356 | left, omv.y, 4, dist, | |
357 | right, omv.y, 5, dist, | |
358 | omv.x, bottom, 7, dist); | |
359 | } | |
360 | else | |
361 | { | |
362 | if (top >= mvmin.y) // check top | |
363 | { | |
364 | COST_MV_PT_DIST(omv.x, top, 2, dist); | |
365 | } | |
366 | if (left >= mvmin.x) // check middle left | |
367 | { | |
368 | COST_MV_PT_DIST(left, omv.y, 4, dist); | |
369 | } | |
370 | if (right <= mvmax.x) // check middle right | |
371 | { | |
372 | COST_MV_PT_DIST(right, omv.y, 5, dist); | |
373 | } | |
374 | if (bottom <= mvmax.y) // check bottom | |
375 | { | |
376 | COST_MV_PT_DIST(omv.x, bottom, 7, dist); | |
377 | } | |
378 | } | |
379 | if (bcost < saved) | |
380 | rounds = 0; | |
381 | else if (++rounds >= earlyExitIters) | |
382 | return; | |
383 | } | |
384 | ||
/* rings at distance 2, 4 and 8: axis points at dist, diagonal points at dist / 2 */
385 | for (int16_t dist = 2; dist <= 8; dist <<= 1) | |
386 | { | |
387 | /* bPointNr | |
388 | 2 | |
389 | 1 3 | |
390 | 4 * 5 | |
391 | 6 8 | |
392 | 7 | |
393 | Points 2, 4, 5, 7 are dist | |
394 | Points 1, 3, 6, 8 are dist>>1 | |
395 | */ | |
396 | const int16_t top = omv.y - dist; | |
397 | const int16_t bottom = omv.y + dist; | |
398 | const int16_t left = omv.x - dist; | |
399 | const int16_t right = omv.x + dist; | |
400 | const int16_t top2 = omv.y - (dist >> 1); | |
401 | const int16_t bottom2 = omv.y + (dist >> 1); | |
402 | const int16_t left2 = omv.x - (dist >> 1); | |
403 | const int16_t right2 = omv.x + (dist >> 1); | |
404 | saved = bcost; | |
405 | ||
406 | if (top >= mvmin.y && left >= mvmin.x && | |
407 | right <= mvmax.x && bottom <= mvmax.y) // check border | |
408 | { | |
409 | COST_MV_PT_DIST_X4(omv.x, top, 2, dist, | |
410 | left2, top2, 1, dist >> 1, | |
411 | right2, top2, 3, dist >> 1, | |
412 | left, omv.y, 4, dist); | |
413 | COST_MV_PT_DIST_X4(right, omv.y, 5, dist, | |
414 | left2, bottom2, 6, dist >> 1, | |
415 | right2, bottom2, 8, dist >> 1, | |
416 | omv.x, bottom, 7, dist); | |
417 | } | |
418 | else // check border for each mv | |
419 | { | |
420 | if (top >= mvmin.y) // check top | |
421 | { | |
422 | COST_MV_PT_DIST(omv.x, top, 2, dist); | |
423 | } | |
424 | if (top2 >= mvmin.y) // check half top | |
425 | { | |
426 | if (left2 >= mvmin.x) // check half left | |
427 | { | |
428 | COST_MV_PT_DIST(left2, top2, 1, (dist >> 1)); | |
429 | } | |
430 | if (right2 <= mvmax.x) // check half right | |
431 | { | |
432 | COST_MV_PT_DIST(right2, top2, 3, (dist >> 1)); | |
433 | } | |
434 | } | |
435 | if (left >= mvmin.x) // check left | |
436 | { | |
437 | COST_MV_PT_DIST(left, omv.y, 4, dist); | |
438 | } | |
439 | if (right <= mvmax.x) // check right | |
440 | { | |
441 | COST_MV_PT_DIST(right, omv.y, 5, dist); | |
442 | } | |
443 | if (bottom2 <= mvmax.y) // check half bottom | |
444 | { | |
445 | if (left2 >= mvmin.x) // check half left | |
446 | { | |
447 | COST_MV_PT_DIST(left2, bottom2, 6, (dist >> 1)); | |
448 | } | |
449 | if (right2 <= mvmax.x) // check half right | |
450 | { | |
451 | COST_MV_PT_DIST(right2, bottom2, 8, (dist >> 1)); | |
452 | } | |
453 | } | |
454 | if (bottom <= mvmax.y) // check bottom | |
455 | { | |
456 | COST_MV_PT_DIST(omv.x, bottom, 7, dist); | |
457 | } | |
458 | } | |
459 | ||
460 | if (bcost < saved) | |
461 | rounds = 0; | |
462 | else if (++rounds >= earlyExitIters) | |
463 | return; | |
464 | } | |
465 | ||
/* rings at distance 16, 32, ... up to merange: four axis points plus three
 * points along each diagonal edge; every candidate is reported as point 0 */
466 | for (int16_t dist = 16; dist <= (int16_t)merange; dist <<= 1) | |
467 | { | |
468 | const int16_t top = omv.y - dist; | |
469 | const int16_t bottom = omv.y + dist; | |
470 | const int16_t left = omv.x - dist; | |
471 | const int16_t right = omv.x + dist; | |
472 | ||
473 | saved = bcost; | |
474 | if (top >= mvmin.y && left >= mvmin.x && | |
475 | right <= mvmax.x && bottom <= mvmax.y) // check border | |
476 | { | |
477 | /* index | |
478 | 0 | |
479 | 3 | |
480 | 2 | |
481 | 1 | |
482 | 0 3 2 1 * 1 2 3 0 | |
483 | 1 | |
484 | 2 | |
485 | 3 | |
486 | 0 | |
487 | */ | |
488 | ||
489 | COST_MV_PT_DIST_X4(omv.x, top, 0, dist, | |
490 | left, omv.y, 0, dist, | |
491 | right, omv.y, 0, dist, | |
492 | omv.x, bottom, 0, dist); | |
493 | ||
494 | for (int16_t index = 1; index < 4; index++) | |
495 | { | |
496 | int16_t posYT = top + ((dist >> 2) * index); | |
497 | int16_t posYB = bottom - ((dist >> 2) * index); | |
498 | int16_t posXL = omv.x - ((dist >> 2) * index); | |
499 | int16_t posXR = omv.x + ((dist >> 2) * index); | |
500 | ||
501 | COST_MV_PT_DIST_X4(posXL, posYT, 0, dist, | |
502 | posXR, posYT, 0, dist, | |
503 | posXL, posYB, 0, dist, | |
504 | posXR, posYB, 0, dist); | |
505 | } | |
506 | } | |
507 | else // check border for each mv | |
508 | { | |
509 | if (top >= mvmin.y) // check top | |
510 | { | |
511 | COST_MV_PT_DIST(omv.x, top, 0, dist); | |
512 | } | |
513 | if (left >= mvmin.x) // check left | |
514 | { | |
515 | COST_MV_PT_DIST(left, omv.y, 0, dist); | |
516 | } | |
517 | if (right <= mvmax.x) // check right | |
518 | { | |
519 | COST_MV_PT_DIST(right, omv.y, 0, dist); | |
520 | } | |
521 | if (bottom <= mvmax.y) // check bottom | |
522 | { | |
523 | COST_MV_PT_DIST(omv.x, bottom, 0, dist); | |
524 | } | |
525 | for (int16_t index = 1; index < 4; index++) | |
526 | { | |
527 | int16_t posYT = top + ((dist >> 2) * index); | |
528 | int16_t posYB = bottom - ((dist >> 2) * index); | |
529 | int16_t posXL = omv.x - ((dist >> 2) * index); | |
530 | int16_t posXR = omv.x + ((dist >> 2) * index); | |
531 | ||
532 | if (posYT >= mvmin.y) // check top | |
533 | { | |
534 | if (posXL >= mvmin.x) // check left | |
535 | { | |
536 | COST_MV_PT_DIST(posXL, posYT, 0, dist); | |
537 | } | |
538 | if (posXR <= mvmax.x) // check right | |
539 | { | |
540 | COST_MV_PT_DIST(posXR, posYT, 0, dist); | |
541 | } | |
542 | } | |
543 | if (posYB <= mvmax.y) // check bottom | |
544 | { | |
545 | if (posXL >= mvmin.x) // check left | |
546 | { | |
547 | COST_MV_PT_DIST(posXL, posYB, 0, dist); | |
548 | } | |
549 | if (posXR <= mvmax.x) // check right | |
550 | { | |
551 | COST_MV_PT_DIST(posXR, posYB, 0, dist); | |
552 | } | |
553 | } | |
554 | } | |
555 | } | |
556 | ||
557 | if (bcost < saved) | |
558 | rounds = 0; | |
559 | else if (++rounds >= earlyExitIters) | |
560 | return; | |
561 | } | |
562 | } | |
563 | ||
564 | int MotionEstimate::motionEstimate(ReferencePlanes *ref, | |
565 | const MV & mvmin, | |
566 | const MV & mvmax, | |
567 | const MV & qmvp, | |
568 | int numCandidates, | |
569 | const MV * mvc, | |
570 | int merange, | |
571 | MV & outQMv) | |
572 | { | |
573 | ALIGN_VAR_16(int, costs[16]); | |
b53f7c52 JB |
574 | if (ctuAddr >= 0) |
575 | blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0); | |
576 | intptr_t stride = ref->lumaStride; | |
577 | pixel* fenc = fencPUYuv.m_buf[0]; | |
578 | pixel* fref = ref->fpelPlane[0] + blockOffset; | |
72b9787e JB |
579 | |
580 | setMVP(qmvp); | |
581 | ||
582 | MV qmvmin = mvmin.toQPel(); | |
583 | MV qmvmax = mvmax.toQPel(); | |
584 | ||
585 | /* The term cost used here means satd/sad values for that particular search. | |
586 | * The costs used in ME integer search only includes the SAD cost of motion | |
587 | * residual and sqrtLambda times MVD bits. The subpel refine steps use SATD | |
588 | * cost of residual and sqrtLambda * MVD bits. Mode decision will be based | |
589 | * on video distortion cost (SSE/PSNR) plus lambda times all signaling bits | |
590 | * (mode + MVD bits). */ | |
591 | ||
592 | // measure SAD cost at clipped QPEL MVP | |
593 | MV pmv = qmvp.clipped(qmvmin, qmvmax); | |
594 | MV bestpre = pmv; | |
595 | int bprecost; | |
596 | ||
597 | if (ref->isLowres) | |
598 | bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad); | |
599 | else | |
600 | bprecost = subpelCompare(ref, pmv, sad); | |
601 | ||
602 | /* re-measure full pel rounded MVP with SAD as search start point */ | |
603 | MV bmv = pmv.roundToFPel(); | |
604 | int bcost = bprecost; | |
605 | if (pmv.isSubpel()) | |
72b9787e | 606 | bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2); |
72b9787e JB |
607 | |
608 | // measure SAD cost at MV(0) if MVP is not zero | |
609 | if (pmv.notZero()) | |
610 | { | |
611 | int cost = sad(fenc, FENC_STRIDE, fref, stride) + mvcost(MV(0, 0)); | |
612 | if (cost < bcost) | |
613 | { | |
614 | bcost = cost; | |
615 | bmv = 0; | |
616 | } | |
617 | } | |
618 | ||
619 | // measure SAD cost at each QPEL motion vector candidate | |
b53f7c52 | 620 | if (ref->isLowres) |
72b9787e | 621 | { |
b53f7c52 | 622 | for (int i = 0; i < numCandidates; i++) |
72b9787e | 623 | { |
b53f7c52 JB |
624 | MV m = mvc[i].clipped(qmvmin, qmvmax); |
625 | if (m.notZero() && m != pmv && m != bestpre) // check already measured | |
72b9787e | 626 | { |
b53f7c52 JB |
627 | int cost = ref->lowresQPelCost(fenc, blockOffset, m, sad) + mvcost(m); |
628 | if (cost < bprecost) | |
629 | { | |
630 | bprecost = cost; | |
631 | bestpre = m; | |
632 | } | |
633 | } | |
634 | } | |
635 | } | |
636 | else | |
637 | { | |
638 | for (int i = 0; i < numCandidates; i++) | |
639 | { | |
640 | MV m = mvc[i].clipped(qmvmin, qmvmax); | |
641 | if (m.notZero() && m != pmv && m != bestpre) // check already measured | |
642 | { | |
643 | int cost = subpelCompare(ref, m, sad) + mvcost(m); | |
644 | if (cost < bprecost) | |
645 | { | |
646 | bprecost = cost; | |
647 | bestpre = m; | |
648 | } | |
72b9787e JB |
649 | } |
650 | } | |
651 | } | |
652 | ||
653 | pmv = pmv.roundToFPel(); | |
654 | MV omv = bmv; // current search origin or starting point | |
655 | ||
656 | switch (searchMethod) | |
657 | { | |
658 | case X265_DIA_SEARCH: | |
659 | { | |
660 | /* diamond search, radius 1 */ | |
661 | bcost <<= 4; | |
662 | int i = merange; | |
663 | do | |
664 | { | |
665 | COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs); | |
666 | COPY1_IF_LT(bcost, (costs[0] << 4) + 1); | |
667 | COPY1_IF_LT(bcost, (costs[1] << 4) + 3); | |
668 | COPY1_IF_LT(bcost, (costs[2] << 4) + 4); | |
669 | COPY1_IF_LT(bcost, (costs[3] << 4) + 12); | |
670 | if (!(bcost & 15)) | |
671 | break; | |
672 | bmv.x -= (bcost << 28) >> 30; | |
673 | bmv.y -= (bcost << 30) >> 30; | |
674 | bcost &= ~15; | |
675 | } | |
676 | while (--i && bmv.checkRange(mvmin, mvmax)); | |
677 | bcost >>= 4; | |
678 | break; | |
679 | } | |
680 | ||
681 | case X265_HEX_SEARCH: | |
682 | { | |
683 | me_hex2: | |
684 | /* hexagon search, radius 2 */ | |
685 | #if 0 | |
686 | for (int i = 0; i < merange / 2; i++) | |
687 | { | |
688 | omv = bmv; | |
689 | COST_MV(omv.x - 2, omv.y); | |
690 | COST_MV(omv.x - 1, omv.y + 2); | |
691 | COST_MV(omv.x + 1, omv.y + 2); | |
692 | COST_MV(omv.x + 2, omv.y); | |
693 | COST_MV(omv.x + 1, omv.y - 2); | |
694 | COST_MV(omv.x - 1, omv.y - 2); | |
695 | if (omv == bmv) | |
696 | break; | |
697 | if (!bmv.checkRange(mvmin, mvmax)) | |
698 | break; | |
699 | } | |
700 | ||
701 | #else // if 0 | |
702 | /* equivalent to the above, but eliminates duplicate candidates */ | |
703 | COST_MV_X3_DIR(-2, 0, -1, 2, 1, 2, costs); | |
704 | bcost <<= 3; | |
705 | COPY1_IF_LT(bcost, (costs[0] << 3) + 2); | |
706 | COPY1_IF_LT(bcost, (costs[1] << 3) + 3); | |
707 | COPY1_IF_LT(bcost, (costs[2] << 3) + 4); | |
708 | COST_MV_X3_DIR(2, 0, 1, -2, -1, -2, costs); | |
709 | COPY1_IF_LT(bcost, (costs[0] << 3) + 5); | |
710 | COPY1_IF_LT(bcost, (costs[1] << 3) + 6); | |
711 | COPY1_IF_LT(bcost, (costs[2] << 3) + 7); | |
712 | ||
713 | if (bcost & 7) | |
714 | { | |
715 | int dir = (bcost & 7) - 2; | |
716 | bmv += hex2[dir + 1]; | |
717 | ||
718 | /* half hexagon, not overlapping the previous iteration */ | |
719 | for (int i = (merange >> 1) - 1; i > 0 && bmv.checkRange(mvmin, mvmax); i--) | |
720 | { | |
721 | COST_MV_X3_DIR(hex2[dir + 0].x, hex2[dir + 0].y, | |
722 | hex2[dir + 1].x, hex2[dir + 1].y, | |
723 | hex2[dir + 2].x, hex2[dir + 2].y, | |
724 | costs); | |
725 | bcost &= ~7; | |
726 | COPY1_IF_LT(bcost, (costs[0] << 3) + 1); | |
727 | COPY1_IF_LT(bcost, (costs[1] << 3) + 2); | |
728 | COPY1_IF_LT(bcost, (costs[2] << 3) + 3); | |
729 | if (!(bcost & 7)) | |
730 | break; | |
731 | dir += (bcost & 7) - 2; | |
732 | dir = mod6m1[dir + 1]; | |
733 | bmv += hex2[dir + 1]; | |
734 | } | |
735 | } | |
736 | bcost >>= 3; | |
737 | #endif // if 0 | |
738 | ||
739 | /* square refine */ | |
740 | int dir = 0; | |
741 | COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs); | |
742 | COPY2_IF_LT(bcost, costs[0], dir, 1); | |
743 | COPY2_IF_LT(bcost, costs[1], dir, 2); | |
744 | COPY2_IF_LT(bcost, costs[2], dir, 3); | |
745 | COPY2_IF_LT(bcost, costs[3], dir, 4); | |
746 | COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs); | |
747 | COPY2_IF_LT(bcost, costs[0], dir, 5); | |
748 | COPY2_IF_LT(bcost, costs[1], dir, 6); | |
749 | COPY2_IF_LT(bcost, costs[2], dir, 7); | |
750 | COPY2_IF_LT(bcost, costs[3], dir, 8); | |
751 | bmv += square1[dir]; | |
752 | break; | |
753 | } | |
754 | ||
755 | case X265_UMH_SEARCH: | |
756 | { | |
757 | int ucost1, ucost2; | |
758 | int16_t cross_start = 1; | |
759 | ||
760 | /* refine predictors */ | |
761 | omv = bmv; | |
762 | ucost1 = bcost; | |
763 | DIA1_ITER(pmv.x, pmv.y); | |
764 | if (pmv.notZero()) | |
765 | DIA1_ITER(0, 0); | |
766 | ||
767 | ucost2 = bcost; | |
768 | if (bmv.notZero() && bmv != pmv) | |
769 | DIA1_ITER(bmv.x, bmv.y); | |
770 | if (bcost == ucost2) | |
771 | cross_start = 3; | |
772 | ||
773 | /* Early Termination */ | |
774 | omv = bmv; | |
775 | if (bcost == ucost2 && SAD_THRESH(2000)) | |
776 | { | |
777 | COST_MV_X4(0, -2, -1, -1, 1, -1, -2, 0); | |
778 | COST_MV_X4(2, 0, -1, 1, 1, 1, 0, 2); | |
779 | if (bcost == ucost1 && SAD_THRESH(500)) | |
780 | break; | |
781 | if (bcost == ucost2) | |
782 | { | |
783 | int16_t range = (int16_t)(merange >> 1) | 1; | |
784 | CROSS(3, range, range); | |
785 | COST_MV_X4(-1, -2, 1, -2, -2, -1, 2, -1); | |
786 | COST_MV_X4(-2, 1, 2, 1, -1, 2, 1, 2); | |
787 | if (bcost == ucost2) | |
788 | break; | |
789 | cross_start = range + 2; | |
790 | } | |
791 | } | |
792 | ||
793 | // TODO: Need to study x264's logic for building mvc list to understand why they | |
794 | // have special cases here for 16x16, and whether they apply to HEVC CTU | |
795 | ||
796 | // adaptive search range based on mvc variability | |
797 | if (numCandidates) | |
798 | { | |
799 | /* range multipliers based on casual inspection of some statistics of | |
800 | * average distance between current predictor and final mv found by ESA. | |
801 | * these have not been tuned much by actual encoding. */ | |
802 | static const uint8_t range_mul[4][4] = | |
803 | { | |
804 | { 3, 3, 4, 4 }, | |
805 | { 3, 4, 4, 4 }, | |
806 | { 4, 4, 4, 5 }, | |
807 | { 4, 4, 5, 6 }, | |
808 | }; | |
809 | ||
810 | int mvd; | |
811 | int sad_ctx, mvd_ctx; | |
812 | int denom = 1; | |
813 | ||
814 | if (numCandidates == 1) | |
815 | { | |
816 | if (LUMA_64x64 == partEnum) | |
817 | /* mvc is probably the same as mvp, so the difference isn't meaningful. | |
818 | * but prediction usually isn't too bad, so just use medium range */ | |
819 | mvd = 25; | |
820 | else | |
821 | mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y); | |
822 | } | |
823 | else | |
824 | { | |
825 | /* calculate the degree of agreement between predictors. */ | |
826 | ||
827 | /* in 64x64, mvc includes all the neighbors used to make mvp, | |
828 | * so don't count mvp separately. */ | |
829 | ||
830 | denom = numCandidates - 1; | |
831 | mvd = 0; | |
832 | if (partEnum != LUMA_64x64) | |
833 | { | |
834 | mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y); | |
835 | denom++; | |
836 | } | |
b53f7c52 | 837 | mvd += predictorDifference(mvc, numCandidates); |
72b9787e JB |
838 | } |
839 | ||
840 | sad_ctx = SAD_THRESH(1000) ? 0 | |
841 | : SAD_THRESH(2000) ? 1 | |
842 | : SAD_THRESH(4000) ? 2 : 3; | |
843 | mvd_ctx = mvd < 10 * denom ? 0 | |
844 | : mvd < 20 * denom ? 1 | |
845 | : mvd < 40 * denom ? 2 : 3; | |
846 | ||
847 | merange = (merange * range_mul[mvd_ctx][sad_ctx]) >> 2; | |
848 | } | |
849 | ||
850 | /* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy. | |
851 | * we are still centered on the same place as the DIA2. is this desirable? */ | |
852 | CROSS(cross_start, merange, merange >> 1); | |
853 | COST_MV_X4(-2, -2, -2, 2, 2, -2, 2, 2); | |
854 | ||
855 | /* hexagon grid */ | |
856 | omv = bmv; | |
857 | const uint16_t *p_cost_omvx = m_cost_mvx + omv.x * 4; | |
858 | const uint16_t *p_cost_omvy = m_cost_mvy + omv.y * 4; | |
859 | uint16_t i = 1; | |
860 | do | |
861 | { | |
862 | if (4 * i > X265_MIN4(mvmax.x - omv.x, omv.x - mvmin.x, | |
863 | mvmax.y - omv.y, omv.y - mvmin.y)) | |
864 | { | |
865 | for (int j = 0; j < 16; j++) | |
866 | { | |
867 | MV mv = omv + (hex4[j] * i); | |
868 | if (mv.checkRange(mvmin, mvmax)) | |
869 | COST_MV(mv.x, mv.y); | |
870 | } | |
871 | } | |
872 | else | |
873 | { | |
874 | int16_t dir = 0; | |
875 | pixel *fref_base = fref + omv.x + (omv.y - 4 * i) * stride; | |
876 | size_t dy = (size_t)i * stride; | |
877 | #define SADS(k, x0, y0, x1, y1, x2, y2, x3, y3) \ | |
878 | sad_x4(fenc, \ | |
879 | fref_base x0 * i + (y0 - 2 * k + 4) * dy, \ | |
880 | fref_base x1 * i + (y1 - 2 * k + 4) * dy, \ | |
881 | fref_base x2 * i + (y2 - 2 * k + 4) * dy, \ | |
882 | fref_base x3 * i + (y3 - 2 * k + 4) * dy, \ | |
883 | stride, costs + 4 * k); \ | |
884 | fref_base += 2 * dy; | |
885 | #define ADD_MVCOST(k, x, y) costs[k] += p_cost_omvx[x * 4 * i] + p_cost_omvy[y * 4 * i] | |
886 | #define MIN_MV(k, x, y) COPY2_IF_LT(bcost, costs[k], dir, x * 16 + (y & 15)) | |
887 | ||
888 | SADS(0, +0, -4, +0, +4, -2, -3, +2, -3); | |
889 | SADS(1, -4, -2, +4, -2, -4, -1, +4, -1); | |
890 | SADS(2, -4, +0, +4, +0, -4, +1, +4, +1); | |
891 | SADS(3, -4, +2, +4, +2, -2, +3, +2, +3); | |
892 | ADD_MVCOST(0, 0, -4); | |
893 | ADD_MVCOST(1, 0, 4); | |
894 | ADD_MVCOST(2, -2, -3); | |
895 | ADD_MVCOST(3, 2, -3); | |
896 | ADD_MVCOST(4, -4, -2); | |
897 | ADD_MVCOST(5, 4, -2); | |
898 | ADD_MVCOST(6, -4, -1); | |
899 | ADD_MVCOST(7, 4, -1); | |
900 | ADD_MVCOST(8, -4, 0); | |
901 | ADD_MVCOST(9, 4, 0); | |
902 | ADD_MVCOST(10, -4, 1); | |
903 | ADD_MVCOST(11, 4, 1); | |
904 | ADD_MVCOST(12, -4, 2); | |
905 | ADD_MVCOST(13, 4, 2); | |
906 | ADD_MVCOST(14, -2, 3); | |
907 | ADD_MVCOST(15, 2, 3); | |
908 | MIN_MV(0, 0, -4); | |
909 | MIN_MV(1, 0, 4); | |
910 | MIN_MV(2, -2, -3); | |
911 | MIN_MV(3, 2, -3); | |
912 | MIN_MV(4, -4, -2); | |
913 | MIN_MV(5, 4, -2); | |
914 | MIN_MV(6, -4, -1); | |
915 | MIN_MV(7, 4, -1); | |
916 | MIN_MV(8, -4, 0); | |
917 | MIN_MV(9, 4, 0); | |
918 | MIN_MV(10, -4, 1); | |
919 | MIN_MV(11, 4, 1); | |
920 | MIN_MV(12, -4, 2); | |
921 | MIN_MV(13, 4, 2); | |
922 | MIN_MV(14, -2, 3); | |
923 | MIN_MV(15, 2, 3); | |
924 | #undef SADS | |
925 | #undef ADD_MVCOST | |
926 | #undef MIN_MV | |
927 | if (dir) | |
928 | { | |
929 | bmv.x = omv.x + i * (dir >> 4); | |
930 | bmv.y = omv.y + i * ((dir << 28) >> 28); | |
931 | } | |
932 | } | |
933 | } | |
934 | while (++i <= merange >> 2); | |
935 | if (bmv.checkRange(mvmin, mvmax)) | |
936 | goto me_hex2; | |
937 | break; | |
938 | } | |
939 | ||
940 | case X265_STAR_SEARCH: // Adapted from HM ME | |
941 | { | |
942 | int bPointNr = 0; | |
943 | int bDistance = 0; | |
944 | ||
945 | const int EarlyExitIters = 3; | |
946 | StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, EarlyExitIters, merange); | |
947 | if (bDistance == 1) | |
948 | { | |
949 | // if best distance was only 1, check two missing points. If no new point is found, stop | |
950 | if (bPointNr) | |
951 | { | |
952 | /* For a given direction 1 to 8, check nearest two outer X pixels | |
953 | X X | |
954 | X 1 2 3 X | |
955 | 4 * 5 | |
956 | X 6 7 8 X | |
957 | X X | |
958 | */ | |
959 | int saved = bcost; | |
960 | const MV mv1 = bmv + offsets[(bPointNr - 1) * 2]; | |
961 | const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1]; | |
962 | if (mv1.checkRange(mvmin, mvmax)) | |
963 | { | |
964 | COST_MV(mv1.x, mv1.y); | |
965 | } | |
966 | if (mv2.checkRange(mvmin, mvmax)) | |
967 | { | |
968 | COST_MV(mv2.x, mv2.y); | |
969 | } | |
970 | if (bcost == saved) | |
971 | break; | |
972 | } | |
973 | else | |
974 | break; | |
975 | } | |
976 | ||
977 | const int RasterDistance = 5; | |
978 | if (bDistance > RasterDistance) | |
979 | { | |
980 | // raster search refinement if original search distance was too big | |
981 | MV tmv; | |
982 | for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y += RasterDistance) | |
983 | { | |
984 | for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x += RasterDistance) | |
985 | { | |
986 | if (tmv.x + (RasterDistance * 3) <= mvmax.x) | |
987 | { | |
988 | pixel *pix_base = fref + tmv.y * stride + tmv.x; | |
989 | sad_x4(fenc, | |
990 | pix_base, | |
991 | pix_base + RasterDistance, | |
992 | pix_base + RasterDistance * 2, | |
993 | pix_base + RasterDistance * 3, | |
994 | stride, costs); | |
995 | costs[0] += mvcost(tmv << 2); | |
996 | COPY2_IF_LT(bcost, costs[0], bmv, tmv); | |
997 | tmv.x += RasterDistance; | |
998 | costs[1] += mvcost(tmv << 2); | |
999 | COPY2_IF_LT(bcost, costs[1], bmv, tmv); | |
1000 | tmv.x += RasterDistance; | |
1001 | costs[2] += mvcost(tmv << 2); | |
1002 | COPY2_IF_LT(bcost, costs[2], bmv, tmv); | |
1003 | tmv.x += RasterDistance; | |
1004 | costs[3] += mvcost(tmv << 3); | |
1005 | COPY2_IF_LT(bcost, costs[3], bmv, tmv); | |
1006 | } | |
1007 | else | |
1008 | COST_MV(tmv.x, tmv.y); | |
1009 | } | |
1010 | } | |
1011 | } | |
1012 | ||
1013 | while (bDistance > 0) | |
1014 | { | |
1015 | // center a new search around current best | |
1016 | bDistance = 0; | |
1017 | bPointNr = 0; | |
1018 | const int MaxIters = 32; | |
1019 | StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, MaxIters, merange); | |
1020 | ||
1021 | if (bDistance == 1) | |
1022 | { | |
1023 | if (!bPointNr) | |
1024 | break; | |
1025 | ||
1026 | /* For a given direction 1 to 8, check nearest 2 outer X pixels | |
1027 | X X | |
1028 | X 1 2 3 X | |
1029 | 4 * 5 | |
1030 | X 6 7 8 X | |
1031 | X X | |
1032 | */ | |
1033 | const MV mv1 = bmv + offsets[(bPointNr - 1) * 2]; | |
1034 | const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1]; | |
1035 | if (mv1.checkRange(mvmin, mvmax)) | |
1036 | { | |
1037 | COST_MV(mv1.x, mv1.y); | |
1038 | } | |
1039 | if (mv2.checkRange(mvmin, mvmax)) | |
1040 | { | |
1041 | COST_MV(mv2.x, mv2.y); | |
1042 | } | |
1043 | break; | |
1044 | } | |
1045 | } | |
1046 | ||
1047 | break; | |
1048 | } | |
1049 | ||
1050 | case X265_FULL_SEARCH: | |
1051 | { | |
1052 | // dead slow exhaustive search, but at least it uses sad_x4() | |
1053 | MV tmv; | |
1054 | for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y++) | |
1055 | { | |
1056 | for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x++) | |
1057 | { | |
1058 | if (tmv.x + 3 <= mvmax.x) | |
1059 | { | |
1060 | pixel *pix_base = fref + tmv.y * stride + tmv.x; | |
1061 | sad_x4(fenc, | |
1062 | pix_base, | |
1063 | pix_base + 1, | |
1064 | pix_base + 2, | |
1065 | pix_base + 3, | |
1066 | stride, costs); | |
1067 | costs[0] += mvcost(tmv << 2); | |
1068 | COPY2_IF_LT(bcost, costs[0], bmv, tmv); | |
1069 | tmv.x++; | |
1070 | costs[1] += mvcost(tmv << 2); | |
1071 | COPY2_IF_LT(bcost, costs[1], bmv, tmv); | |
1072 | tmv.x++; | |
1073 | costs[2] += mvcost(tmv << 2); | |
1074 | COPY2_IF_LT(bcost, costs[2], bmv, tmv); | |
1075 | tmv.x++; | |
1076 | costs[3] += mvcost(tmv << 2); | |
1077 | COPY2_IF_LT(bcost, costs[3], bmv, tmv); | |
1078 | } | |
1079 | else | |
1080 | COST_MV(tmv.x, tmv.y); | |
1081 | } | |
1082 | } | |
1083 | ||
1084 | break; | |
1085 | } | |
1086 | ||
1087 | default: | |
1088 | X265_CHECK(0, "invalid motion estimate mode\n"); | |
1089 | break; | |
1090 | } | |
1091 | ||
1092 | if (bprecost < bcost) | |
1093 | { | |
1094 | bmv = bestpre; | |
1095 | bcost = bprecost; | |
1096 | } | |
1097 | else | |
1098 | bmv = bmv.toQPel(); // promote search bmv to qpel | |
1099 | ||
b53f7c52 | 1100 | const SubpelWorkload& wl = workload[this->subpelRefine]; |
72b9787e JB |
1101 | |
1102 | if (!bcost) | |
1103 | { | |
1104 | /* if there was zero residual at the clipped MVP, we can skip subpel | |
1105 | * refine, but we do need to include the mvcost in the returned cost */ | |
1106 | bcost = mvcost(bmv); | |
1107 | } | |
1108 | else if (ref->isLowres) | |
1109 | { | |
b53f7c52 | 1110 | int bdir = 0; |
72b9787e JB |
1111 | for (int i = 1; i <= wl.hpel_dirs; i++) |
1112 | { | |
1113 | MV qmv = bmv + square1[i] * 2; | |
b53f7c52 | 1114 | int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv); |
72b9787e JB |
1115 | COPY2_IF_LT(bcost, cost, bdir, i); |
1116 | } | |
1117 | ||
1118 | bmv += square1[bdir] * 2; | |
1119 | bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd) + mvcost(bmv); | |
1120 | ||
1121 | bdir = 0; | |
1122 | for (int i = 1; i <= wl.qpel_dirs; i++) | |
1123 | { | |
1124 | MV qmv = bmv + square1[i]; | |
b53f7c52 | 1125 | int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv); |
72b9787e JB |
1126 | COPY2_IF_LT(bcost, cost, bdir, i); |
1127 | } | |
1128 | ||
1129 | bmv += square1[bdir]; | |
1130 | } | |
1131 | else | |
1132 | { | |
1133 | pixelcmp_t hpelcomp; | |
1134 | ||
1135 | if (wl.hpel_satd) | |
1136 | { | |
1137 | bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv); | |
1138 | hpelcomp = satd; | |
1139 | } | |
1140 | else | |
1141 | hpelcomp = sad; | |
1142 | ||
1143 | for (int iter = 0; iter < wl.hpel_iters; iter++) | |
1144 | { | |
b53f7c52 | 1145 | int bdir = 0; |
72b9787e JB |
1146 | for (int i = 1; i <= wl.hpel_dirs; i++) |
1147 | { | |
1148 | MV qmv = bmv + square1[i] * 2; | |
b53f7c52 | 1149 | int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv); |
72b9787e JB |
1150 | COPY2_IF_LT(bcost, cost, bdir, i); |
1151 | } | |
1152 | ||
1153 | if (bdir) | |
1154 | bmv += square1[bdir] * 2; | |
1155 | else | |
1156 | break; | |
1157 | } | |
1158 | ||
1159 | /* if HPEL search used SAD, remeasure with SATD before QPEL */ | |
1160 | if (!wl.hpel_satd) | |
1161 | bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv); | |
1162 | ||
1163 | for (int iter = 0; iter < wl.qpel_iters; iter++) | |
1164 | { | |
b53f7c52 | 1165 | int bdir = 0; |
72b9787e JB |
1166 | for (int i = 1; i <= wl.qpel_dirs; i++) |
1167 | { | |
1168 | MV qmv = bmv + square1[i]; | |
b53f7c52 | 1169 | int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv); |
72b9787e JB |
1170 | COPY2_IF_LT(bcost, cost, bdir, i); |
1171 | } | |
1172 | ||
1173 | if (bdir) | |
1174 | bmv += square1[bdir]; | |
1175 | else | |
1176 | break; | |
1177 | } | |
1178 | } | |
1179 | ||
1180 | x265_emms(); | |
1181 | outQMv = bmv; | |
1182 | return bcost; | |
1183 | } | |
1184 | ||
1185 | int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv, pixelcmp_t cmp) | |
1186 | { | |
b53f7c52 JB |
1187 | intptr_t refStride = ref->lumaStride; |
1188 | pixel *fref = ref->fpelPlane[0] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * refStride; | |
72b9787e JB |
1189 | int xFrac = qmv.x & 0x3; |
1190 | int yFrac = qmv.y & 0x3; | |
b53f7c52 JB |
1191 | int cost; |
1192 | intptr_t lclStride = fencPUYuv.m_size; | |
1193 | X265_CHECK(lclStride == FENC_STRIDE, "fenc buffer is assumed to have FENC_STRIDE by sad_x3 and sad_x4\n"); | |
72b9787e | 1194 | |
b53f7c52 JB |
1195 | if (!(yFrac | xFrac)) |
1196 | cost = cmp(fencPUYuv.m_buf[0], lclStride, fref, refStride); | |
72b9787e JB |
1197 | else |
1198 | { | |
b53f7c52 | 1199 | /* we are taking a short-cut here if the reference is weighted. To be |
72b9787e | 1200 | * accurate we should be interpolating unweighted pixels and weighting |
b53f7c52 | 1201 | * the final 16bit values prior to rounding and down shifting. Instead we |
72b9787e JB |
1202 | * are simply interpolating the weighted full-pel pixels. Not 100% |
1203 | * accurate but good enough for fast qpel ME */ | |
1204 | ALIGN_VAR_32(pixel, subpelbuf[64 * 64]); | |
b53f7c52 JB |
1205 | if (!yFrac) |
1206 | primitives.luma_hpp[partEnum](fref, refStride, subpelbuf, lclStride, xFrac); | |
1207 | else if (!xFrac) | |
1208 | primitives.luma_vpp[partEnum](fref, refStride, subpelbuf, lclStride, yFrac); | |
1209 | else | |
72b9787e | 1210 | { |
b53f7c52 JB |
1211 | ALIGN_VAR_32(int16_t, immed[64 * (64 + NTAPS_LUMA)]); |
1212 | ||
1213 | int filterSize = NTAPS_LUMA; | |
1214 | int halfFilterSize = filterSize >> 1; | |
1215 | primitives.luma_hps[partEnum](fref, refStride, immed, blockwidth, xFrac, 1); | |
1216 | primitives.luma_vsp[partEnum](immed + (halfFilterSize - 1) * blockwidth, blockwidth, subpelbuf, lclStride, yFrac); | |
72b9787e | 1217 | } |
b53f7c52 JB |
1218 | cost = cmp(fencPUYuv.m_buf[0], lclStride, subpelbuf, lclStride); |
1219 | } | |
1220 | ||
1221 | if (bChromaSATD) | |
1222 | { | |
1223 | int csp = fencPUYuv.m_csp; | |
1224 | int hshift = fencPUYuv.m_hChromaShift; | |
1225 | int vshift = fencPUYuv.m_vChromaShift; | |
1226 | int shiftHor = (2 + hshift); | |
1227 | int shiftVer = (2 + vshift); | |
1228 | lclStride = fencPUYuv.m_csize; | |
1229 | ||
1230 | intptr_t refStrideC = ref->reconPic->m_strideC; | |
1231 | intptr_t refOffset = (qmv.x >> shiftHor) + (qmv.y >> shiftVer) * refStrideC; | |
1232 | ||
1233 | const pixel* refCb = ref->getCbAddr(ctuAddr, absPartIdx) + refOffset; | |
1234 | const pixel* refCr = ref->getCrAddr(ctuAddr, absPartIdx) + refOffset; | |
1235 | ||
1236 | xFrac = qmv.x & ((1 << shiftHor) - 1); | |
1237 | yFrac = qmv.y & ((1 << shiftVer) - 1); | |
1238 | ||
1239 | if (!(yFrac | xFrac)) | |
72b9787e | 1240 | { |
b53f7c52 JB |
1241 | cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, refCb, refStrideC); |
1242 | cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, refCr, refStrideC); | |
72b9787e JB |
1243 | } |
1244 | else | |
1245 | { | |
b53f7c52 JB |
1246 | ALIGN_VAR_32(pixel, subpelbuf[64 * 64]); |
1247 | if (!yFrac) | |
1248 | { | |
1249 | primitives.chroma[csp].filter_hpp[partEnum](refCb, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift)); | |
1250 | cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride); | |
72b9787e | 1251 | |
b53f7c52 JB |
1252 | primitives.chroma[csp].filter_hpp[partEnum](refCr, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift)); |
1253 | cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride); | |
1254 | } | |
1255 | else if (!xFrac) | |
1256 | { | |
1257 | primitives.chroma[csp].filter_vpp[partEnum](refCb, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift)); | |
1258 | cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride); | |
1259 | ||
1260 | primitives.chroma[csp].filter_vpp[partEnum](refCr, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift)); | |
1261 | cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride); | |
1262 | } | |
1263 | else | |
1264 | { | |
1265 | ALIGN_VAR_32(int16_t, immed[64 * (64 + NTAPS_CHROMA)]); | |
1266 | ||
1267 | int extStride = blockwidth >> hshift; | |
1268 | int filterSize = NTAPS_CHROMA; | |
1269 | int halfFilterSize = (filterSize >> 1); | |
1270 | ||
1271 | primitives.chroma[csp].filter_hps[partEnum](refCb, refStrideC, immed, extStride, xFrac << (1 - hshift), 1); | |
1272 | primitives.chroma[csp].filter_vsp[partEnum](immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift)); | |
1273 | cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride); | |
1274 | ||
1275 | primitives.chroma[csp].filter_hps[partEnum](refCr, refStrideC, immed, extStride, xFrac << (1 - hshift), 1); | |
1276 | primitives.chroma[csp].filter_vsp[partEnum](immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift)); | |
1277 | cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride); | |
1278 | } | |
72b9787e | 1279 | } |
72b9787e | 1280 | } |
b53f7c52 JB |
1281 | |
1282 | return cost; | |
72b9787e | 1283 | } |