/*
 * SIMD-optimized motion estimation
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/me_cmp.h"
#include "libavcodec/mpegvideo.h"

int ff_sum_abs_dctelem_mmx(int16_t *block);
int ff_sum_abs_dctelem_mmxext(int16_t *block);
int ff_sum_abs_dctelem_sse2(int16_t *block);
int ff_sum_abs_dctelem_ssse3(int16_t *block);
int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                int line_size, int h);
int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                 int line_size, int h);
int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  int line_size, int h);
int ff_hf_noise8_mmx(uint8_t *pix1, int lsize, int h);
int ff_hf_noise16_mmx(uint8_t *pix1, int lsize, int h);

#define hadamard_func(cpu)                                              \
    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,     \
                                  uint8_t *src2, int stride, int h);    \
    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,   \
                                    uint8_t *src2, int stride, int h);

hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)

#if HAVE_YASM
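/* Noise-preserving SSE: the plain sum of squared errors plus a penalty
 * proportional to the difference in high-frequency "noise" between the two
 * blocks, weighted by avctx->nsse_weight (8 when no context is available). */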
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h)
{
    int score1, score2;

    if (c)
        score1 = c->mecc.sse[0](c, pix1, pix2, line_size, h);
    else
        score1 = ff_sse16_mmx(c, pix1, pix2, line_size, h);
    score2 = ff_hf_noise16_mmx(pix1, line_size, h) + ff_hf_noise8_mmx(pix1+8, line_size, h)
           - ff_hf_noise16_mmx(pix2, line_size, h) - ff_hf_noise8_mmx(pix2+8, line_size, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                     int line_size, int h)
{
    int score1 = ff_sse8_mmx(c, pix1, pix2, line_size, h);
    int score2 = ff_hf_noise8_mmx(pix1, line_size, h) -
                 ff_hf_noise8_mmx(pix2, line_size, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

#endif /* HAVE_YASM */

#if HAVE_INLINE_ASM

static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                            int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1)               \
    "movq (%0), %%mm2\n"                        \
    "movq 8(%0), %%mm3\n"                       \
    "add %2,%0\n"                               \
    "movq %%mm2, " #out0 "\n"                   \
    "movq %%mm3, " #out1 "\n"                   \
    "psubusb " #in0 ", %%mm2\n"                 \
    "psubusb " #in1 ", %%mm3\n"                 \
    "psubusb " #out0 ", " #in0 "\n"             \
    "psubusb " #out1 ", " #in1 "\n"             \
    "por %%mm2, " #in0 "\n"                     \
    "por %%mm3, " #in1 "\n"                     \
    "movq " #in0 ", %%mm2\n"                    \
    "movq " #in1 ", %%mm3\n"                    \
    "punpcklbw %%mm7, " #in0 "\n"               \
    "punpcklbw %%mm7, " #in1 "\n"               \
    "punpckhbw %%mm7, %%mm2\n"                  \
    "punpckhbw %%mm7, %%mm3\n"                  \
    "paddw " #in1 ", " #in0 "\n"                \
    "paddw %%mm3, %%mm2\n"                      \
    "paddw %%mm2, " #in0 "\n"                   \
    "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp & 0xFFFF;
}
#undef SUM

static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                               int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1)               \
    "movq (%0), " #out0 "\n"                    \
    "movq 8(%0), " #out1 "\n"                   \
    "add %2, %0\n"                              \
    "psadbw " #out0 ", " #in0 "\n"              \
    "psadbw " #out1 ", " #in1 "\n"              \
    "paddw " #in1 ", " #in0 "\n"                \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
#undef SUM

static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix1) & 7) == 0);
    av_assert2((((int) pix2) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1)               \
    "movq (%0), %%mm2\n"                        \
    "movq (%1), " #out0 "\n"                    \
    "movq 8(%0), %%mm3\n"                       \
    "movq 8(%1), " #out1 "\n"                   \
    "add %3, %0\n"                              \
    "add %3, %1\n"                              \
    "psubb " #out0 ", %%mm2\n"                  \
    "psubb " #out1 ", %%mm3\n"                  \
    "pxor %%mm7, %%mm2\n"                       \
    "pxor %%mm7, %%mm3\n"                       \
    "movq %%mm2, " #out0 "\n"                   \
    "movq %%mm3, " #out1 "\n"                   \
    "psubusb " #in0 ", %%mm2\n"                 \
    "psubusb " #in1 ", %%mm3\n"                 \
    "psubusb " #out0 ", " #in0 "\n"             \
    "psubusb " #out1 ", " #in1 "\n"             \
    "por %%mm2, " #in0 "\n"                     \
    "por %%mm3, " #in1 "\n"                     \
    "movq " #in0 ", %%mm2\n"                    \
    "movq " #in1 ", %%mm3\n"                    \
    "punpcklbw %%mm7, " #in0 "\n"               \
    "punpcklbw %%mm7, " #in1 "\n"               \
    "punpckhbw %%mm7, %%mm2\n"                  \
    "punpckhbw %%mm7, %%mm3\n"                  \
    "paddw " #in1 ", " #in0 "\n"                \
    "paddw %%mm3, %%mm2\n"                      \
    "paddw %%mm2, " #in0 "\n"                   \
    "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp & 0x7FFF;
}
#undef SUM

static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix1) & 7) == 0);
    av_assert2((((int) pix2) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1)               \
    "movq (%0), " #out0 "\n"                    \
    "movq (%1), %%mm2\n"                        \
    "movq 8(%0), " #out1 "\n"                   \
    "movq 8(%1), %%mm3\n"                       \
    "add %3, %0\n"                              \
    "add %3, %1\n"                              \
    "psubb %%mm2, " #out0 "\n"                  \
    "psubb %%mm3, " #out1 "\n"                  \
    "pxor %%mm7, " #out0 "\n"                   \
    "pxor %%mm7, " #out1 "\n"                   \
    "psadbw " #out0 ", " #in0 "\n"              \
    "psadbw " #out1 ", " #in1 "\n"              \
    "paddw " #in1 ", " #in0 "\n"                \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
#undef SUM

DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};

DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;

static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    x86_reg len = -(x86_reg)stride * h;
    __asm__ volatile (
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        "movq (%2, %%"REG_a"), %%mm2 \n\t"
        "movq (%2, %%"REG_a"), %%mm4 \n\t"
        "add %3, %%"REG_a" \n\t"
        "psubusb %%mm0, %%mm2 \n\t"
        "psubusb %%mm4, %%mm0 \n\t"
        "movq (%1, %%"REG_a"), %%mm1 \n\t"
        "movq (%2, %%"REG_a"), %%mm3 \n\t"
        "movq (%2, %%"REG_a"), %%mm5 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm5, %%mm1 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm1, %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "punpckhbw %%mm7, %%mm2 \n\t"
        "paddw %%mm1, %%mm0 \n\t"
        "paddw %%mm3, %%mm2 \n\t"
        "paddw %%mm2, %%mm0 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "add %3, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg) stride));
}

static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
                                 int stride, int h)
{
    __asm__ volatile (
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "psadbw (%2), %%mm0 \n\t"
        "psadbw (%2, %3), %%mm1 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "paddw %%mm1, %%mm6 \n\t"
        "lea (%1,%3,2), %1 \n\t"
        "lea (%2,%3,2), %2 \n\t"
        "sub $2, %0 \n\t"
        " jg 1b \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride));
}

static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
                      int stride, int h)
{
    int ret;
    __asm__ volatile (
        "pxor %%xmm2, %%xmm2 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movdqu (%1), %%xmm0 \n\t"
        "movdqu (%1, %4), %%xmm1 \n\t"
        "psadbw (%2), %%xmm0 \n\t"
        "psadbw (%2, %4), %%xmm1 \n\t"
        "paddw %%xmm0, %%xmm2 \n\t"
        "paddw %%xmm1, %%xmm2 \n\t"
        "lea (%1,%4,2), %1 \n\t"
        "lea (%2,%4,2), %2 \n\t"
        "sub $2, %0 \n\t"
        " jg 1b \n\t"
        "movhlps %%xmm2, %%xmm0 \n\t"
        "paddw %%xmm0, %%xmm2 \n\t"
        "movd %%xmm2, %3 \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
        : "r" ((x86_reg) stride));
    return ret;
}

static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
                                   int stride, int h)
{
    __asm__ volatile (
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "pavgb 1(%1), %%mm0 \n\t"
        "pavgb 1(%1, %3), %%mm1 \n\t"
        "psadbw (%2), %%mm0 \n\t"
        "psadbw (%2, %3), %%mm1 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "paddw %%mm1, %%mm6 \n\t"
        "lea (%1,%3,2), %1 \n\t"
        "lea (%2,%3,2), %2 \n\t"
        "sub $2, %0 \n\t"
        " jg 1b \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride));
}

static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
                                   int stride, int h)
{
    __asm__ volatile (
        "movq (%1), %%mm0 \n\t"
        "add %3, %1 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "pavgb %%mm1, %%mm0 \n\t"
        "pavgb %%mm2, %%mm1 \n\t"
        "psadbw (%2), %%mm0 \n\t"
        "psadbw (%2, %3), %%mm1 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "paddw %%mm1, %%mm6 \n\t"
        "movq %%mm2, %%mm0 \n\t"
        "lea (%1,%3,2), %1 \n\t"
        "lea (%2,%3,2), %2 \n\t"
        "sub $2, %0 \n\t"
        " jg 1b \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride));
}

static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
                                 int stride, int h)
{
    __asm__ volatile (
        "movq "MANGLE(bone)", %%mm5 \n\t"
        "movq (%1), %%mm0 \n\t"
        "pavgb 1(%1), %%mm0 \n\t"
        "add %3, %1 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%1), %%mm1 \n\t"
        "movq (%1,%3), %%mm2 \n\t"
        "pavgb 1(%1), %%mm1 \n\t"
        "pavgb 1(%1,%3), %%mm2 \n\t"
        "psubusb %%mm5, %%mm1 \n\t"
        "pavgb %%mm1, %%mm0 \n\t"
        "pavgb %%mm2, %%mm1 \n\t"
        "psadbw (%2), %%mm0 \n\t"
        "psadbw (%2,%3), %%mm1 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "paddw %%mm1, %%mm6 \n\t"
        "movq %%mm2, %%mm0 \n\t"
        "lea (%1,%3,2), %1 \n\t"
        "lea (%2,%3,2), %2 \n\t"
        "sub $2, %0 \n\t"
        " jg 1b \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride)
          NAMED_CONSTRAINTS_ADD(bone));
}

static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
                              int stride, int h)
{
    x86_reg len = -(x86_reg)stride * h;
    __asm__ volatile (
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        "movq (%2, %%"REG_a"), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm2 \n\t"
        "movq (%2, %%"REG_a"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpckhbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddw %%mm0, %%mm1 \n\t"
        "paddw %%mm2, %%mm3 \n\t"
        "movq (%3, %%"REG_a"), %%mm4 \n\t"
        "movq (%3, %%"REG_a"), %%mm2 \n\t"
        "paddw %%mm5, %%mm1 \n\t"
        "paddw %%mm5, %%mm3 \n\t"
        "psrlw $1, %%mm1 \n\t"
        "psrlw $1, %%mm3 \n\t"
        "packuswb %%mm3, %%mm1 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm2, %%mm1 \n\t"
        "por %%mm4, %%mm1 \n\t"
        "movq %%mm1, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "paddw %%mm1, %%mm0 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "add %4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
          "r" ((x86_reg) stride));
}

static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    x86_reg len = -(x86_reg)stride * h;
    __asm__ volatile (
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddw %%mm2, %%mm0 \n\t"
        "paddw %%mm3, %%mm1 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%2, %%"REG_a"), %%mm2 \n\t"
        "movq 1(%2, %%"REG_a"), %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddw %%mm4, %%mm2 \n\t"
        "paddw %%mm5, %%mm3 \n\t"
        "movq %5, %%mm5 \n\t"
        "paddw %%mm2, %%mm0 \n\t"
        "paddw %%mm3, %%mm1 \n\t"
        "paddw %%mm5, %%mm0 \n\t"
        "paddw %%mm5, %%mm1 \n\t"
        "movq (%3, %%"REG_a"), %%mm4 \n\t"
        "movq (%3, %%"REG_a"), %%mm5 \n\t"
        "psrlw $2, %%mm0 \n\t"
        "psrlw $2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "psubusb %%mm0, %%mm4 \n\t"
        "psubusb %%mm5, %%mm0 \n\t"
        "por %%mm4, %%mm0 \n\t"
        "movq %%mm0, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm4 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "paddw %%mm4, %%mm6 \n\t"
        "movq %%mm2, %%mm0 \n\t"
        "movq %%mm3, %%mm1 \n\t"
        "add %4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
          "r" ((x86_reg) stride), "m" (round_tab[2]));
}

static inline int sum_mmx(void)
{
    int ret;
    __asm__ volatile (
        "movq %%mm6, %%mm0 \n\t"
        "psrlq $32, %%mm6 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "movq %%mm6, %%mm0 \n\t"
        "psrlq $16, %%mm6 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "movd %%mm6, %0 \n\t"
        : "=r" (ret));
    return ret & 0xFFFF;
}

static inline int sum_mmxext(void)
{
    int ret;
    __asm__ volatile (
        "movd %%mm6, %0 \n\t"
        : "=r" (ret));
    return ret;
}

static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}

static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
}

#define PIX_SAD(suf)                                                    \
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
                        uint8_t *blk1, int stride, int h)               \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        :);                                                             \
                                                                        \
    sad8_1_ ## suf(blk1, blk2, stride, 8);                              \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                           uint8_t *blk1, int stride, int h)            \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        "movq %0, %%mm5 \n\t"                                           \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_x2a_ ## suf(blk1, blk2, stride, 8);                            \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                           uint8_t *blk1, int stride, int h)            \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        "movq %0, %%mm5 \n\t"                                           \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_y2a_ ## suf(blk1, blk2, stride, 8);                            \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, int stride, int h)           \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        ::);                                                            \
                                                                        \
    sad8_4_ ## suf(blk1, blk2, stride, 8);                              \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2,              \
                         uint8_t *blk1, int stride, int h)              \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        :);                                                             \
                                                                        \
    sad8_1_ ## suf(blk1, blk2, stride, h);                              \
    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, int stride, int h)           \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        "movq %0, %%mm5 \n\t"                                           \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_x2a_ ## suf(blk1, blk2, stride, h);                            \
    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, int stride, int h)           \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        "movq %0, %%mm5 \n\t"                                           \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_y2a_ ## suf(blk1, blk2, stride, h);                            \
    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
                             uint8_t *blk1, int stride, int h)          \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        ::);                                                            \
                                                                        \
    sad8_4_ ## suf(blk1, blk2, stride, h);                              \
    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
                                                                        \
    return sum_ ## suf();                                               \
}

PIX_SAD(mmx)
PIX_SAD(mmxext)

#endif /* HAVE_INLINE_ASM */

av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
{
    int cpu_flags = av_get_cpu_flags();

#if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0] = sad16_mmx;
        c->sad[1] = sad8_mmx;

        c->vsad[4] = vsad_intra16_mmx;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->vsad[0] = vsad16_mmx;
        }
    }

    if (INLINE_MMXEXT(cpu_flags)) {
        c->vsad[4] = vsad_intra16_mmxext;

        c->pix_abs[0][0] = sad16_mmxext;
        c->pix_abs[1][0] = sad8_mmxext;

        c->sad[0] = sad16_mmxext;
        c->sad[1] = sad8_mmxext;

        c->pix_abs[0][1] = sad16_x2_mmxext;
        c->pix_abs[0][2] = sad16_y2_mmxext;
        c->pix_abs[1][1] = sad8_x2_mmxext;
        c->pix_abs[1][2] = sad8_y2_mmxext;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->pix_abs[0][3] = sad16_xy2_mmxext;
            c->pix_abs[1][3] = sad8_xy2_mmxext;

            c->vsad[0] = vsad16_mmxext;
        }
    }

    if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SSE2SLOW) &&
        avctx->codec_id != AV_CODEC_ID_SNOW) {
        c->sad[0] = sad16_sse2;
    }

#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_MMX(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmx;
        c->sse[0]            = ff_sse16_mmx;
        c->sse[1]            = ff_sse8_mmx;
#if HAVE_YASM
        c->nsse[0]           = nsse16_mmx;
        c->nsse[1]           = nsse8_mmx;
#endif
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmxext;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->sse[0]          = ff_sse16_sse2;
        c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;

#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;
#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
#endif
    }
}