2 * SIMD-optimized motion estimation
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/x86/asm.h"
28 #include "libavutil/x86/cpu.h"
29 #include "libavcodec/me_cmp.h"
30 #include "libavcodec/mpegvideo.h"
/*
 * Prototypes for ff_-prefixed routines presumably implemented in external
 * x86 assembly (sum of absolute DCT coefficients, sum-of-squared-errors,
 * and high-frequency noise metrics) -- TODO confirm against the
 * corresponding .asm sources.
 * NOTE(review): this extraction has split each declaration across several
 * physical lines; the tokens themselves are unchanged.
 */
32 int ff_sum_abs_dctelem_mmx(int16_t *block
);
33 int ff_sum_abs_dctelem_mmxext(int16_t *block
);
34 int ff_sum_abs_dctelem_sse2(int16_t *block
);
35 int ff_sum_abs_dctelem_ssse3(int16_t *block
);
36 int ff_sse8_mmx(MpegEncContext
*v
, uint8_t *pix1
, uint8_t *pix2
,
37 int line_size
, int h
);
38 int ff_sse16_mmx(MpegEncContext
*v
, uint8_t *pix1
, uint8_t *pix2
,
39 int line_size
, int h
);
40 int ff_sse16_sse2(MpegEncContext
*v
, uint8_t *pix1
, uint8_t *pix2
,
41 int line_size
, int h
);
42 int ff_hf_noise8_mmx(uint8_t *pix1
, int lsize
, int h
);
43 int ff_hf_noise16_mmx(uint8_t *pix1
, int lsize
, int h
);
/* Declare the 8x8 and 16x16 Hadamard-difference prototypes for one CPU
 * flavor (the definitions are presumably in external assembly). */
45 #define hadamard_func(cpu) \
46 int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
47 uint8_t *src2, int stride, int h); \
48 int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
49 uint8_t *src2, int stride, int h);
/*
 * Noise-preserving SSE for a 16-wide block: a plain SSE score plus the
 * difference in high-frequency "noise" between the two blocks.
 * Visible logic:
 *   score1 = SSE(pix1, pix2)  -- via c->mecc.sse[0] or ff_sse16_mmx;
 *            the branch selecting between the two is not visible in this
 *            extraction -- TODO confirm against upstream
 *   score2 = hf_noise16(pix1) + hf_noise8(pix1+8)
 *          - hf_noise16(pix2) - hf_noise8(pix2+8)
 *   return score1 + |score2| * c->avctx->nsse_weight (or * 8; the guard
 *            choosing between the two returns is missing here -- confirm)
 */
57 static int nsse16_mmx(MpegEncContext
*c
, uint8_t *pix1
, uint8_t *pix2
,
63 score1
= c
->mecc
.sse
[0](c
, pix1
, pix2
, line_size
, h
);
65 score1
= ff_sse16_mmx(c
, pix1
, pix2
, line_size
, h
);
66 score2
= ff_hf_noise16_mmx(pix1
, line_size
, h
) + ff_hf_noise8_mmx(pix1
+8, line_size
, h
)
67 - ff_hf_noise16_mmx(pix2
, line_size
, h
) - ff_hf_noise8_mmx(pix2
+8, line_size
, h
);
70 return score1
+ FFABS(score2
) * c
->avctx
->nsse_weight
;
72 return score1
+ FFABS(score2
) * 8;
/*
 * 8-wide variant of nsse16_mmx(): SSE plus weighted absolute difference
 * of the two blocks' high-frequency noise.  The condition selecting
 * between the nsse_weight return and the "* 8" return is not visible in
 * this extraction -- TODO confirm against upstream.
 */
75 static int nsse8_mmx(MpegEncContext
*c
, uint8_t *pix1
, uint8_t *pix2
,
78 int score1
= ff_sse8_mmx(c
, pix1
, pix2
, line_size
, h
);
79 int score2
= ff_hf_noise8_mmx(pix1
, line_size
, h
) -
80 ff_hf_noise8_mmx(pix2
, line_size
, h
);
83 return score1
+ FFABS(score2
) * c
->avctx
->nsse_weight
;
85 return score1
+ FFABS(score2
) * 8;
88 #endif /* HAVE_YASM */
/*
 * Intra vertical SAD, 16 bytes wide (MMX): sums |row[i] - row[i-1]| over
 * the block.  Requires 8-byte-aligned pix and line_size (asserted).
 * The SUM macro computes the unsigned byte absolute difference of the
 * current 16-byte row against the previous one (psubusb both directions,
 * then por), widens to 16-bit lanes via punpck with the zero register
 * %mm7, and accumulates into %mm6; the trailing paddw/psrlq-style folds
 * reduce %mm6 toward a scalar.
 * NOTE(review): the __asm__ volatile wrapper, loop label/counter and the
 * final return are not visible in this extraction -- compare against the
 * upstream file before relying on this text.
 */
92 static int vsad_intra16_mmx(MpegEncContext
*v
, uint8_t *pix
, uint8_t *dummy
,
97 av_assert2((((int) pix
) & 7) == 0);
98 av_assert2((line_size
& 7) == 0);
100 #define SUM(in0, in1, out0, out1) \
101 "movq (%0), %%mm2\n" \
102 "movq 8(%0), %%mm3\n" \
104 "movq %%mm2, " #out0 "\n" \
105 "movq %%mm3, " #out1 "\n" \
106 "psubusb " #in0 ", %%mm2\n" \
107 "psubusb " #in1 ", %%mm3\n" \
108 "psubusb " #out0 ", " #in0 "\n" \
109 "psubusb " #out1 ", " #in1 "\n" \
110 "por %%mm2, " #in0 "\n" \
111 "por %%mm3, " #in1 "\n" \
112 "movq " #in0 ", %%mm2\n" \
113 "movq " #in1 ", %%mm3\n" \
114 "punpcklbw %%mm7, " #in0 "\n" \
115 "punpcklbw %%mm7, " #in1 "\n" \
116 "punpckhbw %%mm7, %%mm2\n" \
117 "punpckhbw %%mm7, %%mm3\n" \
118 "paddw " #in1 ", " #in0 "\n" \
119 "paddw %%mm3, %%mm2\n" \
120 "paddw %%mm2, " #in0 "\n" \
121 "paddw " #in0 ", %%mm6\n"
126 "pxor %%mm6, %%mm6\n"
127 "pxor %%mm7, %%mm7\n"
129 "movq 8(%0), %%mm1\n"
134 SUM(%%mm4
, %%mm5
, %%mm0
, %%mm1
)
136 SUM(%%mm0
, %%mm1
, %%mm4
, %%mm5
)
141 "movq %%mm6, %%mm0\n"
143 "paddw %%mm6, %%mm0\n"
144 "movq %%mm0, %%mm6\n"
146 "paddw %%mm6, %%mm0\n"
148 : "+r" (pix
), "=r" (tmp
)
149 : "r" ((x86_reg
) line_size
), "m" (h
)
/*
 * Intra vertical SAD, 16 bytes wide (MMXEXT): same metric as
 * vsad_intra16_mmx() but the per-row absolute difference and horizontal
 * sum are done in one step with psadbw, accumulating into %mm6.
 * Alignment of pix and line_size to 8 bytes is asserted.
 * NOTE(review): the __asm__ wrapper, loop control and return are not
 * visible in this extraction -- confirm against upstream.
 */
156 static int vsad_intra16_mmxext(MpegEncContext
*v
, uint8_t *pix
, uint8_t *dummy
,
157 int line_size
, int h
)
161 av_assert2((((int) pix
) & 7) == 0);
162 av_assert2((line_size
& 7) == 0);
164 #define SUM(in0, in1, out0, out1) \
165 "movq (%0), " #out0 "\n" \
166 "movq 8(%0), " #out1 "\n" \
168 "psadbw " #out0 ", " #in0 "\n" \
169 "psadbw " #out1 ", " #in1 "\n" \
170 "paddw " #in1 ", " #in0 "\n" \
171 "paddw " #in0 ", %%mm6\n"
175 "pxor %%mm6, %%mm6\n"
176 "pxor %%mm7, %%mm7\n"
178 "movq 8(%0), %%mm1\n"
183 SUM(%%mm4
, %%mm5
, %%mm0
, %%mm1
)
185 SUM(%%mm0
, %%mm1
, %%mm4
, %%mm5
)
191 : "+r" (pix
), "=r" (tmp
)
192 : "r" ((x86_reg
) line_size
), "m" (h
)
/*
 * Inter vertical SAD, 16 bytes wide (MMX): operates on the per-row
 * difference (pix1 - pix2), measuring how much that difference changes
 * from one row to the next.  Each row difference is computed with psubb
 * and then biased by xor with %mm7; %mm7 is prepared from pcmpeqw +
 * packsswb (an intermediate shift instruction appears to be missing from
 * this extraction -- presumably building a 0x80 per-byte sign-bias
 * constant; TODO confirm).  The SUM macro then takes the unsigned
 * absolute difference of successive biased rows, widens to words and
 * accumulates into %mm6.  Alignment of pix1, pix2 and line_size to
 * 8 bytes is asserted.
 * NOTE(review): the __asm__ wrapper, loop label and return statement are
 * not visible here.
 */
199 static int vsad16_mmx(MpegEncContext
*v
, uint8_t *pix1
, uint8_t *pix2
,
200 int line_size
, int h
)
204 av_assert2((((int) pix1
) & 7) == 0);
205 av_assert2((((int) pix2
) & 7) == 0);
206 av_assert2((line_size
& 7) == 0);
208 #define SUM(in0, in1, out0, out1) \
209 "movq (%0), %%mm2\n" \
210 "movq (%1), " #out0 "\n" \
211 "movq 8(%0), %%mm3\n" \
212 "movq 8(%1), " #out1 "\n" \
215 "psubb " #out0 ", %%mm2\n" \
216 "psubb " #out1 ", %%mm3\n" \
217 "pxor %%mm7, %%mm2\n" \
218 "pxor %%mm7, %%mm3\n" \
219 "movq %%mm2, " #out0 "\n" \
220 "movq %%mm3, " #out1 "\n" \
221 "psubusb " #in0 ", %%mm2\n" \
222 "psubusb " #in1 ", %%mm3\n" \
223 "psubusb " #out0 ", " #in0 "\n" \
224 "psubusb " #out1 ", " #in1 "\n" \
225 "por %%mm2, " #in0 "\n" \
226 "por %%mm3, " #in1 "\n" \
227 "movq " #in0 ", %%mm2\n" \
228 "movq " #in1 ", %%mm3\n" \
229 "punpcklbw %%mm7, " #in0 "\n" \
230 "punpcklbw %%mm7, " #in1 "\n" \
231 "punpckhbw %%mm7, %%mm2\n" \
232 "punpckhbw %%mm7, %%mm3\n" \
233 "paddw " #in1 ", " #in0 "\n" \
234 "paddw %%mm3, %%mm2\n" \
235 "paddw %%mm2, " #in0 "\n" \
236 "paddw " #in0 ", %%mm6\n"
241 "pxor %%mm6, %%mm6\n"
242 "pcmpeqw %%mm7, %%mm7\n"
244 "packsswb %%mm7, %%mm7\n"
247 "movq 8(%0), %%mm1\n"
248 "movq 8(%1), %%mm3\n"
251 "psubb %%mm2, %%mm0\n"
252 "psubb %%mm3, %%mm1\n"
253 "pxor %%mm7, %%mm0\n"
254 "pxor %%mm7, %%mm1\n"
258 SUM(%%mm4
, %%mm5
, %%mm0
, %%mm1
)
260 SUM(%%mm0
, %%mm1
, %%mm4
, %%mm5
)
265 "movq %%mm6, %%mm0\n"
267 "paddw %%mm6, %%mm0\n"
268 "movq %%mm0, %%mm6\n"
270 "paddw %%mm6, %%mm0\n"
272 : "+r" (pix1
), "+r" (pix2
), "=r" (tmp
)
273 : "r" ((x86_reg
) line_size
), "m" (h
)
/*
 * Inter vertical SAD, 16 bytes wide (MMXEXT): same metric as
 * vsad16_mmx() -- change of the biased (pix1 - pix2) row difference
 * between successive rows -- but reduced with psadbw instead of the
 * manual widen-and-add sequence.  %mm7 is prepared with pcmpeqw +
 * packsswb (an intermediate shift appears to be missing from this
 * extraction -- presumably forming a per-byte sign-bias constant; TODO
 * confirm).  Alignment of pix1, pix2 and line_size is asserted.
 * NOTE(review): the __asm__ wrapper, loop control and return are not
 * visible here.
 */
280 static int vsad16_mmxext(MpegEncContext
*v
, uint8_t *pix1
, uint8_t *pix2
,
281 int line_size
, int h
)
285 av_assert2((((int) pix1
) & 7) == 0);
286 av_assert2((((int) pix2
) & 7) == 0);
287 av_assert2((line_size
& 7) == 0);
289 #define SUM(in0, in1, out0, out1) \
290 "movq (%0), " #out0 "\n" \
291 "movq (%1), %%mm2\n" \
292 "movq 8(%0), " #out1 "\n" \
293 "movq 8(%1), %%mm3\n" \
296 "psubb %%mm2, " #out0 "\n" \
297 "psubb %%mm3, " #out1 "\n" \
298 "pxor %%mm7, " #out0 "\n" \
299 "pxor %%mm7, " #out1 "\n" \
300 "psadbw " #out0 ", " #in0 "\n" \
301 "psadbw " #out1 ", " #in1 "\n" \
302 "paddw " #in1 ", " #in0 "\n" \
303 "paddw " #in0 ", %%mm6\n "
307 "pxor %%mm6, %%mm6\n"
308 "pcmpeqw %%mm7, %%mm7\n"
310 "packsswb %%mm7, %%mm7\n"
313 "movq 8(%0), %%mm1\n"
314 "movq 8(%1), %%mm3\n"
317 "psubb %%mm2, %%mm0\n"
318 "psubb %%mm3, %%mm1\n"
319 "pxor %%mm7, %%mm0\n"
320 "pxor %%mm7, %%mm1\n"
324 SUM(%%mm4
, %%mm5
, %%mm0
, %%mm1
)
326 SUM(%%mm0
, %%mm1
, %%mm4
, %%mm5
)
332 : "+r" (pix1
), "+r" (pix2
), "=r" (tmp
)
333 : "r" ((x86_reg
) line_size
), "m" (h
)
/*
 * 8-byte-aligned constants used by the averaging SAD kernels:
 *   round_tab[i] replicates i in each 16-bit lane -- a rounding bias
 *   added before the right shift in the half-pel averaging paths
 *   (round_tab[1] before >>1, round_tab[2] before >>2);
 *   bone is 0x01 in every byte, used as a rounding correction (psubusb)
 *   in the pavgb-based xy half-pel path.
 */
342 DECLARE_ASM_CONST(8, uint64_t, round_tab
)[3] = {
343 0x0000000000000000ULL
,
344 0x0001000100010001ULL
,
345 0x0002000200020002ULL
,
348 DECLARE_ASM_CONST(8, uint64_t, bone
) = 0x0101010101010101LL
;
/*
 * 8-wide full-pel SAD core (MMX), two rows per iteration, accumulating
 * word partial sums into %mm6 (callers pre-zero %mm6/%mm7 and read the
 * result with sum_mmx()).  Addresses are formed as base - len + offset,
 * with len = -stride * h, so the loop counter in REG_a runs from len up
 * toward zero.  The reference row is loaded twice (%mm2 and %mm4) so the
 * unsigned absolute difference can be built with psubusb in both
 * directions followed by por; the result is widened to words with
 * punpck against the zero register %mm7 and accumulated.
 * NOTE(review): the __asm__ wrapper, loop label and branch are not
 * visible in this extraction.
 */
350 static inline void sad8_1_mmx(uint8_t *blk1
, uint8_t *blk2
, int stride
, int h
)
352 x86_reg len
= -(x86_reg
)stride
* h
;
356 "movq (%1, %%"REG_a
"), %%mm0 \n\t"
357 "movq (%2, %%"REG_a
"), %%mm2 \n\t"
358 "movq (%2, %%"REG_a
"), %%mm4 \n\t"
359 "add %3, %%"REG_a
" \n\t"
360 "psubusb %%mm0, %%mm2 \n\t"
361 "psubusb %%mm4, %%mm0 \n\t"
362 "movq (%1, %%"REG_a
"), %%mm1 \n\t"
363 "movq (%2, %%"REG_a
"), %%mm3 \n\t"
364 "movq (%2, %%"REG_a
"), %%mm5 \n\t"
365 "psubusb %%mm1, %%mm3 \n\t"
366 "psubusb %%mm5, %%mm1 \n\t"
367 "por %%mm2, %%mm0 \n\t"
368 "por %%mm1, %%mm3 \n\t"
369 "movq %%mm0, %%mm1 \n\t"
370 "movq %%mm3, %%mm2 \n\t"
371 "punpcklbw %%mm7, %%mm0 \n\t"
372 "punpckhbw %%mm7, %%mm1 \n\t"
373 "punpcklbw %%mm7, %%mm3 \n\t"
374 "punpckhbw %%mm7, %%mm2 \n\t"
375 "paddw %%mm1, %%mm0 \n\t"
376 "paddw %%mm3, %%mm2 \n\t"
377 "paddw %%mm2, %%mm0 \n\t"
378 "paddw %%mm0, %%mm6 \n\t"
379 "add %3, %%"REG_a
" \n\t"
382 : "r" (blk1
- len
), "r" (blk2
- len
), "r" ((x86_reg
) stride
));
/*
 * 8-wide full-pel SAD core (MMXEXT): two rows per iteration with psadbw
 * directly against memory, partial sums accumulated into %mm6; pointers
 * advance by 2*stride via lea.  Callers read the result with
 * sum_mmxext().
 * NOTE(review): the __asm__ wrapper, loop counter update and branch are
 * not visible in this extraction.
 */
385 static inline void sad8_1_mmxext(uint8_t *blk1
, uint8_t *blk2
,
391 "movq (%1), %%mm0 \n\t"
392 "movq (%1, %3), %%mm1 \n\t"
393 "psadbw (%2), %%mm0 \n\t"
394 "psadbw (%2, %3), %%mm1 \n\t"
395 "paddw %%mm0, %%mm6 \n\t"
396 "paddw %%mm1, %%mm6 \n\t"
397 "lea (%1,%3,2), %1 \n\t"
398 "lea (%2,%3,2), %2 \n\t"
401 : "+r" (h
), "+r" (blk1
), "+r" (blk2
)
402 : "r" ((x86_reg
) stride
));
/*
 * 16-wide full-pel SAD (SSE2): unaligned 16-byte loads (movdqu) with
 * psadbw against the reference, two rows per iteration, accumulated in
 * %xmm2.  The high and low 64-bit halves are combined with movhlps +
 * paddw and the scalar extracted with movd into ret.
 * NOTE(review): the __asm__ wrapper, loop control and the final return
 * are not visible in this extraction.
 */
405 static int sad16_sse2(MpegEncContext
*v
, uint8_t *blk2
, uint8_t *blk1
,
410 "pxor %%xmm2, %%xmm2 \n\t"
413 "movdqu (%1), %%xmm0 \n\t"
414 "movdqu (%1, %4), %%xmm1 \n\t"
415 "psadbw (%2), %%xmm0 \n\t"
416 "psadbw (%2, %4), %%xmm1 \n\t"
417 "paddw %%xmm0, %%xmm2 \n\t"
418 "paddw %%xmm1, %%xmm2 \n\t"
419 "lea (%1,%4,2), %1 \n\t"
420 "lea (%2,%4,2), %2 \n\t"
423 "movhlps %%xmm2, %%xmm0 \n\t"
424 "paddw %%xmm0, %%xmm2 \n\t"
425 "movd %%xmm2, %3 \n\t"
426 : "+r" (h
), "+r" (blk1
), "+r" (blk2
), "=r" (ret
)
427 : "r" ((x86_reg
) stride
));
/*
 * 8-wide horizontal half-pel SAD core (MMXEXT): each source row is
 * averaged with its one-byte-right neighbor via pavgb before psadbw
 * against the reference; two rows per iteration, partial sums in %mm6.
 * NOTE(review): the __asm__ wrapper and loop control are not visible in
 * this extraction.
 */
431 static inline void sad8_x2a_mmxext(uint8_t *blk1
, uint8_t *blk2
,
437 "movq (%1), %%mm0 \n\t"
438 "movq (%1, %3), %%mm1 \n\t"
439 "pavgb 1(%1), %%mm0 \n\t"
440 "pavgb 1(%1, %3), %%mm1 \n\t"
441 "psadbw (%2), %%mm0 \n\t"
442 "psadbw (%2, %3), %%mm1 \n\t"
443 "paddw %%mm0, %%mm6 \n\t"
444 "paddw %%mm1, %%mm6 \n\t"
445 "lea (%1,%3,2), %1 \n\t"
446 "lea (%2,%3,2), %2 \n\t"
449 : "+r" (h
), "+r" (blk1
), "+r" (blk2
)
450 : "r" ((x86_reg
) stride
));
/*
 * 8-wide vertical half-pel SAD core (MMXEXT): each row is averaged with
 * the row below via pavgb before psadbw against the reference; the last
 * loaded row is carried over in %mm0 for the next iteration, two rows
 * per pass, partial sums in %mm6.
 * NOTE(review): the __asm__ wrapper and loop control are not visible in
 * this extraction.
 */
453 static inline void sad8_y2a_mmxext(uint8_t *blk1
, uint8_t *blk2
,
457 "movq (%1), %%mm0 \n\t"
461 "movq (%1), %%mm1 \n\t"
462 "movq (%1, %3), %%mm2 \n\t"
463 "pavgb %%mm1, %%mm0 \n\t"
464 "pavgb %%mm2, %%mm1 \n\t"
465 "psadbw (%2), %%mm0 \n\t"
466 "psadbw (%2, %3), %%mm1 \n\t"
467 "paddw %%mm0, %%mm6 \n\t"
468 "paddw %%mm1, %%mm6 \n\t"
469 "movq %%mm2, %%mm0 \n\t"
470 "lea (%1,%3,2), %1 \n\t"
471 "lea (%2,%3,2), %2 \n\t"
474 : "+r" (h
), "+r" (blk1
), "+r" (blk2
)
475 : "r" ((x86_reg
) stride
));
/*
 * 8-wide xy (diagonal) half-pel SAD core (MMXEXT): rows are first
 * averaged horizontally (pavgb with the +1 byte offset), then averaged
 * vertically with the previous row.  Because chaining two pavgb ops
 * over-rounds, one operand is corrected by subtracting the all-ones-
 * per-byte constant "bone" (psubusb %%mm5) before the vertical average.
 * Partial sums accumulate in %mm6 via psadbw.
 * NOTE(review): the __asm__ wrapper and loop control are not visible in
 * this extraction.
 */
478 static inline void sad8_4_mmxext(uint8_t *blk1
, uint8_t *blk2
,
482 "movq "MANGLE(bone
)", %%mm5 \n\t"
483 "movq (%1), %%mm0 \n\t"
484 "pavgb 1(%1), %%mm0 \n\t"
488 "movq (%1), %%mm1 \n\t"
489 "movq (%1,%3), %%mm2 \n\t"
490 "pavgb 1(%1), %%mm1 \n\t"
491 "pavgb 1(%1,%3), %%mm2 \n\t"
492 "psubusb %%mm5, %%mm1 \n\t"
493 "pavgb %%mm1, %%mm0 \n\t"
494 "pavgb %%mm2, %%mm1 \n\t"
495 "psadbw (%2), %%mm0 \n\t"
496 "psadbw (%2,%3), %%mm1 \n\t"
497 "paddw %%mm0, %%mm6 \n\t"
498 "paddw %%mm1, %%mm6 \n\t"
499 "movq %%mm2, %%mm0 \n\t"
500 "lea (%1,%3,2), %1 \n\t"
501 "lea (%2,%3,2), %2 \n\t"
504 : "+r" (h
), "+r" (blk1
), "+r" (blk2
)
505 : "r" ((x86_reg
) stride
)
506 NAMED_CONSTRAINTS_ADD(bone
));
/*
 * 8-wide SAD core against the rounded average of two source blocks
 * (MMX): avg(blk1a, blk1b) vs blk2, one row per iteration.  Rows are
 * widened to words, summed, biased by the rounding constant the caller
 * preloaded into %mm5 (round_tab[1]), shifted right by 1 and re-packed;
 * the absolute difference against blk2 is then built with psubusb/por
 * and accumulated into %mm6.  Used for both x2 and y2 half-pel SAD via
 * the sad8_x2a_mmx()/sad8_y2a_mmx() wrappers.  Addressing is
 * base - len + counter with len = -stride * h.
 * NOTE(review): the __asm__ wrapper, loop label and branch are not
 * visible in this extraction.
 */
509 static inline void sad8_2_mmx(uint8_t *blk1a
, uint8_t *blk1b
, uint8_t *blk2
,
512 x86_reg len
= -(x86_reg
)stride
* h
;
516 "movq (%1, %%"REG_a
"), %%mm0 \n\t"
517 "movq (%2, %%"REG_a
"), %%mm1 \n\t"
518 "movq (%1, %%"REG_a
"), %%mm2 \n\t"
519 "movq (%2, %%"REG_a
"), %%mm3 \n\t"
520 "punpcklbw %%mm7, %%mm0 \n\t"
521 "punpcklbw %%mm7, %%mm1 \n\t"
522 "punpckhbw %%mm7, %%mm2 \n\t"
523 "punpckhbw %%mm7, %%mm3 \n\t"
524 "paddw %%mm0, %%mm1 \n\t"
525 "paddw %%mm2, %%mm3 \n\t"
526 "movq (%3, %%"REG_a
"), %%mm4 \n\t"
527 "movq (%3, %%"REG_a
"), %%mm2 \n\t"
528 "paddw %%mm5, %%mm1 \n\t"
529 "paddw %%mm5, %%mm3 \n\t"
530 "psrlw $1, %%mm1 \n\t"
531 "psrlw $1, %%mm3 \n\t"
532 "packuswb %%mm3, %%mm1 \n\t"
533 "psubusb %%mm1, %%mm4 \n\t"
534 "psubusb %%mm2, %%mm1 \n\t"
535 "por %%mm4, %%mm1 \n\t"
536 "movq %%mm1, %%mm0 \n\t"
537 "punpcklbw %%mm7, %%mm0 \n\t"
538 "punpckhbw %%mm7, %%mm1 \n\t"
539 "paddw %%mm1, %%mm0 \n\t"
540 "paddw %%mm0, %%mm6 \n\t"
541 "add %4, %%"REG_a
" \n\t"
544 : "r" (blk1a
- len
), "r" (blk1b
- len
), "r" (blk2
- len
),
545 "r" ((x86_reg
) stride
));
/*
 * 8-wide xy (diagonal) half-pel SAD core (MMX): averages four source
 * pixels -- (x, x+1) of the current row (operand %1) and of the row
 * below (operand %2 = blk1 + stride) -- by widening to words, summing,
 * adding the rounding bias round_tab[2] (operand %5) and shifting right
 * by 2.  The packed average is compared against blk2 (operand %3) with
 * the psubusb/por unsigned-abs trick and accumulated into %mm6.  The
 * lower row's sums are carried over in %mm0/%mm1 for the next
 * iteration.  Addressing is base - len + counter with len = -stride*h.
 * NOTE(review): the __asm__ wrapper, loop label and branch are not
 * visible in this extraction.
 */
548 static inline void sad8_4_mmx(uint8_t *blk1
, uint8_t *blk2
, int stride
, int h
)
550 x86_reg len
= -(x86_reg
)stride
* h
;
552 "movq (%1, %%"REG_a
"), %%mm0 \n\t"
553 "movq 1(%1, %%"REG_a
"), %%mm2 \n\t"
554 "movq %%mm0, %%mm1 \n\t"
555 "movq %%mm2, %%mm3 \n\t"
556 "punpcklbw %%mm7, %%mm0 \n\t"
557 "punpckhbw %%mm7, %%mm1 \n\t"
558 "punpcklbw %%mm7, %%mm2 \n\t"
559 "punpckhbw %%mm7, %%mm3 \n\t"
560 "paddw %%mm2, %%mm0 \n\t"
561 "paddw %%mm3, %%mm1 \n\t"
564 "movq (%2, %%"REG_a
"), %%mm2 \n\t"
565 "movq 1(%2, %%"REG_a
"), %%mm4 \n\t"
566 "movq %%mm2, %%mm3 \n\t"
567 "movq %%mm4, %%mm5 \n\t"
568 "punpcklbw %%mm7, %%mm2 \n\t"
569 "punpckhbw %%mm7, %%mm3 \n\t"
570 "punpcklbw %%mm7, %%mm4 \n\t"
571 "punpckhbw %%mm7, %%mm5 \n\t"
572 "paddw %%mm4, %%mm2 \n\t"
573 "paddw %%mm5, %%mm3 \n\t"
574 "movq %5, %%mm5 \n\t"
575 "paddw %%mm2, %%mm0 \n\t"
576 "paddw %%mm3, %%mm1 \n\t"
577 "paddw %%mm5, %%mm0 \n\t"
578 "paddw %%mm5, %%mm1 \n\t"
579 "movq (%3, %%"REG_a
"), %%mm4 \n\t"
580 "movq (%3, %%"REG_a
"), %%mm5 \n\t"
581 "psrlw $2, %%mm0 \n\t"
582 "psrlw $2, %%mm1 \n\t"
583 "packuswb %%mm1, %%mm0 \n\t"
584 "psubusb %%mm0, %%mm4 \n\t"
585 "psubusb %%mm5, %%mm0 \n\t"
586 "por %%mm4, %%mm0 \n\t"
587 "movq %%mm0, %%mm4 \n\t"
588 "punpcklbw %%mm7, %%mm0 \n\t"
589 "punpckhbw %%mm7, %%mm4 \n\t"
590 "paddw %%mm0, %%mm6 \n\t"
591 "paddw %%mm4, %%mm6 \n\t"
592 "movq %%mm2, %%mm0 \n\t"
593 "movq %%mm3, %%mm1 \n\t"
594 "add %4, %%"REG_a
" \n\t"
597 : "r" (blk1
- len
), "r" (blk1
- len
+ stride
), "r" (blk2
- len
),
598 "r" ((x86_reg
) stride
), "m" (round_tab
[2]));
/*
 * Fold the four 16-bit partial sums accumulated in %mm6 by the MMX SAD
 * cores into one integer (shift-right by 32 and 16 with paddw between,
 * then movd out).
 * NOTE(review): the __asm__ wrapper, output operand and return statement
 * are not visible in this extraction -- confirm against upstream.
 */
601 static inline int sum_mmx(void)
605 "movq %%mm6, %%mm0 \n\t"
606 "psrlq $32, %%mm6 \n\t"
607 "paddw %%mm0, %%mm6 \n\t"
608 "movq %%mm6, %%mm0 \n\t"
609 "psrlq $16, %%mm6 \n\t"
610 "paddw %%mm0, %%mm6 \n\t"
611 "movd %%mm6, %0 \n\t"
/*
 * Read back the SAD accumulated in %mm6 by the MMXEXT cores (psadbw
 * already produced a single scalar, so a plain movd suffices).
 * NOTE(review): only the signature and the movd line are visible in this
 * extraction; the __asm__ wrapper and return are missing -- confirm
 * against upstream.
 */
616 static inline int sum_mmxext(void)
620 "movd %%mm6, %0 \n\t"
/* Horizontal half-pel SAD for MMX: average of blk1 and blk1 + 1 (the
 * one-pixel-right neighbor) compared against blk2 via sad8_2_mmx(). */
625 static inline void sad8_x2a_mmx(uint8_t *blk1
, uint8_t *blk2
, int stride
, int h
)
627 sad8_2_mmx(blk1
, blk1
+ 1, blk2
, stride
, h
);
/* Vertical half-pel SAD for MMX: average of blk1 and blk1 + stride (the
 * row below) compared against blk2 via sad8_2_mmx(). */
630 static inline void sad8_y2a_mmx(uint8_t *blk1
, uint8_t *blk2
, int stride
, int h
)
632 sad8_2_mmx(blk1
, blk1
+ stride
, blk2
, stride
, h
);
/*
 * Instantiate the full set of SAD entry points for one CPU flavor
 * ("suf" = mmx or mmxext): sad8 / sad16 in full-pel, horizontal (x2),
 * vertical (y2) and diagonal (xy2) half-pel variants.  Each entry zeroes
 * the %mm6 accumulator and the %mm7 helper register, preloads the
 * rounding bias round_tab[1] into %mm5 for the x2/y2 averaging paths,
 * runs the matching sad8_*_<suf> core (once for 8-wide, twice with a +8
 * column offset for 16-wide) and returns sum_<suf>().  The 8-wide
 * variants assert h == 8.
 * NOTE(review): some continuation lines of this macro (function braces,
 * __asm__ wrappers) are not visible in this extraction -- compare
 * against the upstream file.
 */
635 #define PIX_SAD(suf) \
636 static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \
637 uint8_t *blk1, int stride, int h) \
639 av_assert2(h == 8); \
641 "pxor %%mm7, %%mm7 \n\t" \
642 "pxor %%mm6, %%mm6 \n\t" \
645 sad8_1_ ## suf(blk1, blk2, stride, 8); \
647 return sum_ ## suf(); \
650 static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
651 uint8_t *blk1, int stride, int h) \
653 av_assert2(h == 8); \
655 "pxor %%mm7, %%mm7 \n\t" \
656 "pxor %%mm6, %%mm6 \n\t" \
657 "movq %0, %%mm5 \n\t" \
658 :: "m" (round_tab[1])); \
660 sad8_x2a_ ## suf(blk1, blk2, stride, 8); \
662 return sum_ ## suf(); \
665 static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
666 uint8_t *blk1, int stride, int h) \
668 av_assert2(h == 8); \
670 "pxor %%mm7, %%mm7 \n\t" \
671 "pxor %%mm6, %%mm6 \n\t" \
672 "movq %0, %%mm5 \n\t" \
673 :: "m" (round_tab[1])); \
675 sad8_y2a_ ## suf(blk1, blk2, stride, 8); \
677 return sum_ ## suf(); \
680 static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
681 uint8_t *blk1, int stride, int h) \
683 av_assert2(h == 8); \
685 "pxor %%mm7, %%mm7 \n\t" \
686 "pxor %%mm6, %%mm6 \n\t" \
689 sad8_4_ ## suf(blk1, blk2, stride, 8); \
691 return sum_ ## suf(); \
694 static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \
695 uint8_t *blk1, int stride, int h) \
698 "pxor %%mm7, %%mm7 \n\t" \
699 "pxor %%mm6, %%mm6 \n\t" \
702 sad8_1_ ## suf(blk1, blk2, stride, h); \
703 sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
705 return sum_ ## suf(); \
708 static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
709 uint8_t *blk1, int stride, int h) \
712 "pxor %%mm7, %%mm7 \n\t" \
713 "pxor %%mm6, %%mm6 \n\t" \
714 "movq %0, %%mm5 \n\t" \
715 :: "m" (round_tab[1])); \
717 sad8_x2a_ ## suf(blk1, blk2, stride, h); \
718 sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
720 return sum_ ## suf(); \
723 static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
724 uint8_t *blk1, int stride, int h) \
727 "pxor %%mm7, %%mm7 \n\t" \
728 "pxor %%mm6, %%mm6 \n\t" \
729 "movq %0, %%mm5 \n\t" \
730 :: "m" (round_tab[1])); \
732 sad8_y2a_ ## suf(blk1, blk2, stride, h); \
733 sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
735 return sum_ ## suf(); \
738 static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
739 uint8_t *blk1, int stride, int h) \
742 "pxor %%mm7, %%mm7 \n\t" \
743 "pxor %%mm6, %%mm6 \n\t" \
746 sad8_4_ ## suf(blk1, blk2, stride, h); \
747 sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
749 return sum_ ## suf(); \
755 #endif /* HAVE_INLINE_ASM */
757 av_cold
void ff_me_cmp_init_x86(MECmpContext
*c
, AVCodecContext
*avctx
)
759 int cpu_flags
= av_get_cpu_flags();
762 if (INLINE_MMX(cpu_flags
)) {
763 c
->pix_abs
[0][0] = sad16_mmx
;
764 c
->pix_abs
[0][1] = sad16_x2_mmx
;
765 c
->pix_abs
[0][2] = sad16_y2_mmx
;
766 c
->pix_abs
[0][3] = sad16_xy2_mmx
;
767 c
->pix_abs
[1][0] = sad8_mmx
;
768 c
->pix_abs
[1][1] = sad8_x2_mmx
;
769 c
->pix_abs
[1][2] = sad8_y2_mmx
;
770 c
->pix_abs
[1][3] = sad8_xy2_mmx
;
772 c
->sad
[0] = sad16_mmx
;
773 c
->sad
[1] = sad8_mmx
;
775 c
->vsad
[4] = vsad_intra16_mmx
;
777 if (!(avctx
->flags
& CODEC_FLAG_BITEXACT
)) {
778 c
->vsad
[0] = vsad16_mmx
;
782 if (INLINE_MMXEXT(cpu_flags
)) {
783 c
->vsad
[4] = vsad_intra16_mmxext
;
785 c
->pix_abs
[0][0] = sad16_mmxext
;
786 c
->pix_abs
[1][0] = sad8_mmxext
;
788 c
->sad
[0] = sad16_mmxext
;
789 c
->sad
[1] = sad8_mmxext
;
791 c
->pix_abs
[0][1] = sad16_x2_mmxext
;
792 c
->pix_abs
[0][2] = sad16_y2_mmxext
;
793 c
->pix_abs
[1][1] = sad8_x2_mmxext
;
794 c
->pix_abs
[1][2] = sad8_y2_mmxext
;
796 if (!(avctx
->flags
& CODEC_FLAG_BITEXACT
)) {
797 c
->pix_abs
[0][3] = sad16_xy2_mmxext
;
798 c
->pix_abs
[1][3] = sad8_xy2_mmxext
;
800 c
->vsad
[0] = vsad16_mmxext
;
804 if (INLINE_SSE2(cpu_flags
) && !(cpu_flags
& AV_CPU_FLAG_SSE2SLOW
) && avctx
->codec_id
!= AV_CODEC_ID_SNOW
) {
805 c
->sad
[0] = sad16_sse2
;
808 #endif /* HAVE_INLINE_ASM */
810 if (EXTERNAL_MMX(cpu_flags
)) {
811 c
->hadamard8_diff
[0] = ff_hadamard8_diff16_mmx
;
812 c
->hadamard8_diff
[1] = ff_hadamard8_diff_mmx
;
813 c
->sum_abs_dctelem
= ff_sum_abs_dctelem_mmx
;
814 c
->sse
[0] = ff_sse16_mmx
;
815 c
->sse
[1] = ff_sse8_mmx
;
817 c
->nsse
[0] = nsse16_mmx
;
818 c
->nsse
[1] = nsse8_mmx
;
822 if (EXTERNAL_MMXEXT(cpu_flags
)) {
823 c
->hadamard8_diff
[0] = ff_hadamard8_diff16_mmxext
;
824 c
->hadamard8_diff
[1] = ff_hadamard8_diff_mmxext
;
825 c
->sum_abs_dctelem
= ff_sum_abs_dctelem_mmxext
;
828 if (EXTERNAL_SSE2(cpu_flags
)) {
829 c
->sse
[0] = ff_sse16_sse2
;
830 c
->sum_abs_dctelem
= ff_sum_abs_dctelem_sse2
;
832 #if HAVE_ALIGNED_STACK
833 c
->hadamard8_diff
[0] = ff_hadamard8_diff16_sse2
;
834 c
->hadamard8_diff
[1] = ff_hadamard8_diff_sse2
;
838 if (EXTERNAL_SSSE3(cpu_flags
)) {
839 c
->sum_abs_dctelem
= ff_sum_abs_dctelem_ssse3
;
840 #if HAVE_ALIGNED_STACK
841 c
->hadamard8_diff
[0] = ff_hadamard8_diff16_ssse3
;
842 c
->hadamard8_diff
[1] = ff_hadamard8_diff_ssse3
;