2 * SIMD-optimized motion estimation
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/x86/asm.h"
28 #include "libavutil/x86/cpu.h"
29 #include "libavcodec/me_cmp.h"
30 #include "libavcodec/mpegvideo.h"
/*
 * Prototypes for ff_-prefixed routines presumably implemented in external
 * x86 assembly (sum of absolute DCT coefficients, sum-of-squared-errors,
 * and high-frequency noise metrics) -- TODO confirm against the
 * corresponding .asm sources.
 * NOTE(review): this extraction has split each declaration across several
 * physical lines; the tokens themselves are unchanged.
 */
32 int ff_sum_abs_dctelem_mmx(int16_t *block
);
33 int ff_sum_abs_dctelem_mmxext(int16_t *block
);
34 int ff_sum_abs_dctelem_sse2(int16_t *block
);
35 int ff_sum_abs_dctelem_ssse3(int16_t *block
);
36 int ff_sse8_mmx(MpegEncContext
*v
, uint8_t *pix1
, uint8_t *pix2
,
37 int line_size
, int h
);
38 int ff_sse16_mmx(MpegEncContext
*v
, uint8_t *pix1
, uint8_t *pix2
,
39 int line_size
, int h
);
40 int ff_sse16_sse2(MpegEncContext
*v
, uint8_t *pix1
, uint8_t *pix2
,
41 int line_size
, int h
);
42 int ff_hf_noise8_mmx(uint8_t *pix1
, int lsize
, int h
);
43 int ff_hf_noise16_mmx(uint8_t *pix1
, int lsize
, int h
);
/* Declare the 8x8 and 16x16 Hadamard-difference prototypes for one CPU
 * flavor (the definitions are presumably in external assembly). */
45 #define hadamard_func(cpu) \
46 int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
47 uint8_t *src2, int stride, int h); \
48 int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
49 uint8_t *src2, int stride, int h);
/*
 * Noise-preserving SSE for a 16-wide block: a plain SSE score plus the
 * difference in high-frequency "noise" between the two blocks.
 * Visible logic:
 *   score1 = SSE(pix1, pix2)  -- via c->mecc.sse[0] or ff_sse16_mmx;
 *            the branch selecting between the two is not visible in this
 *            extraction -- TODO confirm against upstream
 *   score2 = hf_noise16(pix1) + hf_noise8(pix1+8)
 *          - hf_noise16(pix2) - hf_noise8(pix2+8)
 *   return score1 + |score2| * c->avctx->nsse_weight (or * 8; the guard
 *            choosing between the two returns is missing here -- confirm)
 */
57 static int nsse16_mmx(MpegEncContext
*c
, uint8_t *pix1
, uint8_t *pix2
,
63 score1
= c
->mecc
.sse
[0](c
, pix1
, pix2
, line_size
, h
);
65 score1
= ff_sse16_mmx(c
, pix1
, pix2
, line_size
, h
);
66 score2
= ff_hf_noise16_mmx(pix1
, line_size
, h
) + ff_hf_noise8_mmx(pix1
+8, line_size
, h
)
67 - ff_hf_noise16_mmx(pix2
, line_size
, h
) - ff_hf_noise8_mmx(pix2
+8, line_size
, h
);
70 return score1
+ FFABS(score2
) * c
->avctx
->nsse_weight
;
72 return score1
+ FFABS(score2
) * 8;
/*
 * 8-wide variant of nsse16_mmx(): SSE plus weighted absolute difference
 * of the two blocks' high-frequency noise.  The condition selecting
 * between the nsse_weight return and the "* 8" return is not visible in
 * this extraction -- TODO confirm against upstream.
 */
75 static int nsse8_mmx(MpegEncContext
*c
, uint8_t *pix1
, uint8_t *pix2
,
78 int score1
= ff_sse8_mmx(c
, pix1
, pix2
, line_size
, h
);
79 int score2
= ff_hf_noise8_mmx(pix1
, line_size
, h
) -
80 ff_hf_noise8_mmx(pix2
, line_size
, h
);
83 return score1
+ FFABS(score2
) * c
->avctx
->nsse_weight
;
85 return score1
+ FFABS(score2
) * 8;
88 #endif /* HAVE_YASM */
/*
 * Intra vertical SAD, 16 bytes wide (MMX): sums |row[i] - row[i-1]| over
 * the block.  Requires 8-byte-aligned pix and line_size (asserted).
 * The SUM macro computes the unsigned byte absolute difference of the
 * current 16-byte row against the previous one (psubusb both directions,
 * then por), widens to 16-bit lanes via punpck with the zero register
 * %mm7, and accumulates into %mm6; the trailing paddw/psrlq-style folds
 * reduce %mm6 toward a scalar.
 * NOTE(review): the __asm__ volatile wrapper, loop label/counter and the
 * final return are not visible in this extraction -- compare against the
 * upstream file before relying on this text.
 */
92 static int vsad_intra16_mmx(MpegEncContext
*v
, uint8_t *pix
, uint8_t *dummy
,
97 av_assert2((((int) pix
) & 7) == 0);
98 av_assert2((line_size
& 7) == 0);
100 #define SUM(in0, in1, out0, out1) \
101 "movq (%0), %%mm2\n" \
102 "movq 8(%0), %%mm3\n" \
104 "movq %%mm2, " #out0 "\n" \
105 "movq %%mm3, " #out1 "\n" \
106 "psubusb " #in0 ", %%mm2\n" \
107 "psubusb " #in1 ", %%mm3\n" \
108 "psubusb " #out0 ", " #in0 "\n" \
109 "psubusb " #out1 ", " #in1 "\n" \
110 "por %%mm2, " #in0 "\n" \
111 "por %%mm3, " #in1 "\n" \
112 "movq " #in0 ", %%mm2\n" \
113 "movq " #in1 ", %%mm3\n" \
114 "punpcklbw %%mm7, " #in0 "\n" \
115 "punpcklbw %%mm7, " #in1 "\n" \
116 "punpckhbw %%mm7, %%mm2\n" \
117 "punpckhbw %%mm7, %%mm3\n" \
118 "paddw " #in1 ", " #in0 "\n" \
119 "paddw %%mm3, %%mm2\n" \
120 "paddw %%mm2, " #in0 "\n" \
121 "paddw " #in0 ", %%mm6\n"
126 "pxor %%mm6, %%mm6\n"
127 "pxor %%mm7, %%mm7\n"
129 "movq 8(%0), %%mm1\n"
134 SUM(%%mm4
, %%mm5
, %%mm0
, %%mm1
)
136 SUM(%%mm0
, %%mm1
, %%mm4
, %%mm5
)
141 "movq %%mm6, %%mm0\n"
143 "paddw %%mm6, %%mm0\n"
144 "movq %%mm0, %%mm6\n"
146 "paddw %%mm6, %%mm0\n"
148 : "+r" (pix
), "=r" (tmp
)
149 : "r" ((x86_reg
) line_size
), "m" (h
)
/*
 * Intra vertical SAD, 16 bytes wide (MMXEXT): same metric as
 * vsad_intra16_mmx() but the per-row absolute difference and horizontal
 * sum are done in one step with psadbw, accumulating into %mm6.
 * Alignment of pix and line_size to 8 bytes is asserted.
 * NOTE(review): the __asm__ wrapper, loop control and return are not
 * visible in this extraction -- confirm against upstream.
 */
156 static int vsad_intra16_mmxext(MpegEncContext
*v
, uint8_t *pix
, uint8_t *dummy
,
157 int line_size
, int h
)
161 av_assert2((((int) pix
) & 7) == 0);
162 av_assert2((line_size
& 7) == 0);
164 #define SUM(in0, in1, out0, out1) \
165 "movq (%0), " #out0 "\n" \
166 "movq 8(%0), " #out1 "\n" \
168 "psadbw " #out0 ", " #in0 "\n" \
169 "psadbw " #out1 ", " #in1 "\n" \
170 "paddw " #in1 ", " #in0 "\n" \
171 "paddw " #in0 ", %%mm6\n"
175 "pxor %%mm6, %%mm6\n"
176 "pxor %%mm7, %%mm7\n"
178 "movq 8(%0), %%mm1\n"
183 SUM(%%mm4
, %%mm5
, %%mm0
, %%mm1
)
185 SUM(%%mm0
, %%mm1
, %%mm4
, %%mm5
)
191 : "+r" (pix
), "=r" (tmp
)
192 : "r" ((x86_reg
) line_size
), "m" (h
)
/*
 * Inter vertical SAD, 16 bytes wide (MMX): operates on the per-row
 * difference (pix1 - pix2), measuring how much that difference changes
 * from one row to the next.  Each row difference is computed with psubb
 * and then biased by xor with %mm7; %mm7 is prepared from pcmpeqw +
 * packsswb (an intermediate shift instruction appears to be missing from
 * this extraction -- presumably building a 0x80 per-byte sign-bias
 * constant; TODO confirm).  The SUM macro then takes the unsigned
 * absolute difference of successive biased rows, widens to words and
 * accumulates into %mm6.  Alignment of pix1, pix2 and line_size to
 * 8 bytes is asserted.
 * NOTE(review): the __asm__ wrapper, loop label and return statement are
 * not visible here.
 */
199 static int vsad16_mmx(MpegEncContext
*v
, uint8_t *pix1
, uint8_t *pix2
,
200 int line_size
, int h
)
204 av_assert2((((int) pix1
) & 7) == 0);
205 av_assert2((((int) pix2
) & 7) == 0);
206 av_assert2((line_size
& 7) == 0);
208 #define SUM(in0, in1, out0, out1) \
209 "movq (%0), %%mm2\n" \
210 "movq (%1), " #out0 "\n" \
211 "movq 8(%0), %%mm3\n" \
212 "movq 8(%1), " #out1 "\n" \
215 "psubb " #out0 ", %%mm2\n" \
216 "psubb " #out1 ", %%mm3\n" \
217 "pxor %%mm7, %%mm2\n" \
218 "pxor %%mm7, %%mm3\n" \
219 "movq %%mm2, " #out0 "\n" \
220 "movq %%mm3, " #out1 "\n" \
221 "psubusb " #in0 ", %%mm2\n" \
222 "psubusb " #in1 ", %%mm3\n" \
223 "psubusb " #out0 ", " #in0 "\n" \
224 "psubusb " #out1 ", " #in1 "\n" \
225 "por %%mm2, " #in0 "\n" \
226 "por %%mm3, " #in1 "\n" \
227 "movq " #in0 ", %%mm2\n" \
228 "movq " #in1 ", %%mm3\n" \
229 "punpcklbw %%mm7, " #in0 "\n" \
230 "punpcklbw %%mm7, " #in1 "\n" \
231 "punpckhbw %%mm7, %%mm2\n" \
232 "punpckhbw %%mm7, %%mm3\n" \
233 "paddw " #in1 ", " #in0 "\n" \
234 "paddw %%mm3, %%mm2\n" \
235 "paddw %%mm2, " #in0 "\n" \
236 "paddw " #in0 ", %%mm6\n"
241 "pxor %%mm6, %%mm6\n"
242 "pcmpeqw %%mm7, %%mm7\n"
244 "packsswb %%mm7, %%mm7\n"
247 "movq 8(%0), %%mm1\n"
248 "movq 8(%1), %%mm3\n"
251 "psubb %%mm2, %%mm0\n"
252 "psubb %%mm3, %%mm1\n"
253 "pxor %%mm7, %%mm0\n"
254 "pxor %%mm7, %%mm1\n"
258 SUM(%%mm4
, %%mm5
, %%mm0
, %%mm1
)
260 SUM(%%mm0
, %%mm1
, %%mm4
, %%mm5
)
265 "movq %%mm6, %%mm0\n"
267 "paddw %%mm6, %%mm0\n"
268 "movq %%mm0, %%mm6\n"
270 "paddw %%mm6, %%mm0\n"
272 : "+r" (pix1
), "+r" (pix2
), "=r" (tmp
)
273 : "r" ((x86_reg
) line_size
), "m" (h
)
/*
 * Inter vertical SAD, 16 bytes wide (MMXEXT): same metric as
 * vsad16_mmx() -- change of the biased (pix1 - pix2) row difference
 * between successive rows -- but reduced with psadbw instead of the
 * manual widen-and-add sequence.  %mm7 is prepared with pcmpeqw +
 * packsswb (an intermediate shift appears to be missing from this
 * extraction -- presumably forming a per-byte sign-bias constant; TODO
 * confirm).  Alignment of pix1, pix2 and line_size is asserted.
 * NOTE(review): the __asm__ wrapper, loop control and return are not
 * visible here.
 */
280 static int vsad16_mmxext(MpegEncContext
*v
, uint8_t *pix1
, uint8_t *pix2
,
281 int line_size
, int h
)
285 av_assert2((((int) pix1
) & 7) == 0);
286 av_assert2((((int) pix2
) & 7) == 0);
287 av_assert2((line_size
& 7) == 0);
289 #define SUM(in0, in1, out0, out1) \
290 "movq (%0), " #out0 "\n" \
291 "movq (%1), %%mm2\n" \
292 "movq 8(%0), " #out1 "\n" \
293 "movq 8(%1), %%mm3\n" \
296 "psubb %%mm2, " #out0 "\n" \
297 "psubb %%mm3, " #out1 "\n" \
298 "pxor %%mm7, " #out0 "\n" \
299 "pxor %%mm7, " #out1 "\n" \
300 "psadbw " #out0 ", " #in0 "\n" \
301 "psadbw " #out1 ", " #in1 "\n" \
302 "paddw " #in1 ", " #in0 "\n" \
303 "paddw " #in0 ", %%mm6\n "
307 "pxor %%mm6, %%mm6\n"
308 "pcmpeqw %%mm7, %%mm7\n"
310 "packsswb %%mm7, %%mm7\n"
313 "movq 8(%0), %%mm1\n"
314 "movq 8(%1), %%mm3\n"
317 "psubb %%mm2, %%mm0\n"
318 "psubb %%mm3, %%mm1\n"
319 "pxor %%mm7, %%mm0\n"
320 "pxor %%mm7, %%mm1\n"
324 SUM(%%mm4
, %%mm5
, %%mm0
, %%mm1
)
326 SUM(%%mm0
, %%mm1
, %%mm4
, %%mm5
)
332 : "+r" (pix1
), "+r" (pix2
), "=r" (tmp
)
333 : "r" ((x86_reg
) line_size
), "m" (h
)
/*
 * 8-byte-aligned constants used by the averaging SAD kernels:
 *   round_tab[i] replicates i in each 16-bit lane -- a rounding bias
 *   added before the right shift in the half-pel averaging paths
 *   (round_tab[1] before >>1, round_tab[2] before >>2);
 *   bone is 0x01 in every byte, used as a rounding correction (psubusb)
 *   in the pavgb-based xy half-pel path.
 */
342 DECLARE_ASM_CONST(8, uint64_t, round_tab
)[3] = {
343 0x0000000000000000ULL
,
344 0x0001000100010001ULL
,
345 0x0002000200020002ULL
,
348 DECLARE_ASM_CONST(8, uint64_t, bone
) = 0x0101010101010101LL
;
/*
 * 8-wide full-pel SAD core (MMX), two rows per iteration, accumulating
 * word partial sums into %mm6 (callers pre-zero %mm6/%mm7 and read the
 * result with sum_mmx()).  Addresses are formed as base - len + offset,
 * with len = -stride * h, so the loop counter in REG_a runs from len up
 * toward zero.  The reference row is loaded twice (%mm2 and %mm4) so the
 * unsigned absolute difference can be built with psubusb in both
 * directions followed by por; the result is widened to words with
 * punpck against the zero register %mm7 and accumulated.
 * NOTE(review): the __asm__ wrapper, loop label and branch are not
 * visible in this extraction.
 */
350 static inline void sad8_1_mmx(uint8_t *blk1
, uint8_t *blk2
, int stride
, int h
)
352 x86_reg len
= -(x86_reg
)stride
* h
;
356 "movq (%1, %%"REG_a
"), %%mm0 \n\t"
357 "movq (%2, %%"REG_a
"), %%mm2 \n\t"
358 "movq (%2, %%"REG_a
"), %%mm4 \n\t"
359 "add %3, %%"REG_a
" \n\t"
360 "psubusb %%mm0, %%mm2 \n\t"
361 "psubusb %%mm4, %%mm0 \n\t"
362 "movq (%1, %%"REG_a
"), %%mm1 \n\t"
363 "movq (%2, %%"REG_a
"), %%mm3 \n\t"
364 "movq (%2, %%"REG_a
"), %%mm5 \n\t"
365 "psubusb %%mm1, %%mm3 \n\t"
366 "psubusb %%mm5, %%mm1 \n\t"
367 "por %%mm2, %%mm0 \n\t"
368 "por %%mm1, %%mm3 \n\t"
369 "movq %%mm0, %%mm1 \n\t"
370 "movq %%mm3, %%mm2 \n\t"
371 "punpcklbw %%mm7, %%mm0 \n\t"
372 "punpckhbw %%mm7, %%mm1 \n\t"
373 "punpcklbw %%mm7, %%mm3 \n\t"
374 "punpckhbw %%mm7, %%mm2 \n\t"
375 "paddw %%mm1, %%mm0 \n\t"
376 "paddw %%mm3, %%mm2 \n\t"
377 "paddw %%mm2, %%mm0 \n\t"
378 "paddw %%mm0, %%mm6 \n\t"
379 "add %3, %%"REG_a
" \n\t"
382 : "r" (blk1
- len
), "r" (blk2
- len
), "r" ((x86_reg
) stride
));
/*
 * 8-wide full-pel SAD core (MMXEXT): two rows per iteration with psadbw
 * directly against memory, partial sums accumulated into %mm6; pointers
 * advance by 2*stride via lea.  Callers read the result with
 * sum_mmxext().
 * NOTE(review): the __asm__ wrapper, loop counter update and branch are
 * not visible in this extraction.
 */
385 static inline void sad8_1_mmxext(uint8_t *blk1
, uint8_t *blk2
,
391 "movq (%1), %%mm0 \n\t"
392 "movq (%1, %3), %%mm1 \n\t"
393 "psadbw (%2), %%mm0 \n\t"
394 "psadbw (%2, %3), %%mm1 \n\t"
395 "paddw %%mm0, %%mm6 \n\t"
396 "paddw %%mm1, %%mm6 \n\t"
397 "lea (%1,%3,2), %1 \n\t"
398 "lea (%2,%3,2), %2 \n\t"
401 : "+r" (h
), "+r" (blk1
), "+r" (blk2
)
402 : "r" ((x86_reg
) stride
));
/*
 * 16-wide full-pel SAD (SSE2): unaligned 16-byte loads (movdqu) with
 * psadbw against the reference, two rows per iteration, accumulated in
 * %xmm2.  The high and low 64-bit halves are combined with movhlps +
 * paddw and the scalar extracted with movd into ret.
 * NOTE(review): the __asm__ wrapper, loop control and the final return
 * are not visible in this extraction.
 */
405 static int sad16_sse2(MpegEncContext
*v
, uint8_t *blk2
, uint8_t *blk1
,
410 "pxor %%xmm2, %%xmm2 \n\t"
413 "movdqu (%1), %%xmm0 \n\t"
414 "movdqu (%1, %4), %%xmm1 \n\t"
415 "psadbw (%2), %%xmm0 \n\t"
416 "psadbw (%2, %4), %%xmm1 \n\t"
417 "paddw %%xmm0, %%xmm2 \n\t"
418 "paddw %%xmm1, %%xmm2 \n\t"
419 "lea (%1,%4,2), %1 \n\t"
420 "lea (%2,%4,2), %2 \n\t"
423 "movhlps %%xmm2, %%xmm0 \n\t"
424 "paddw %%xmm0, %%xmm2 \n\t"
425 "movd %%xmm2, %3 \n\t"
426 : "+r" (h
), "+r" (blk1
), "+r" (blk2
), "=r" (ret
)
427 : "r" ((x86_reg
) stride
));
/*
 * 8-wide horizontal half-pel SAD core (MMXEXT): each source row is
 * averaged with its one-byte-right neighbor via pavgb before psadbw
 * against the reference; two rows per iteration, partial sums in %mm6.
 * NOTE(review): the __asm__ wrapper and loop control are not visible in
 * this extraction.
 */
431 static inline void sad8_x2a_mmxext(uint8_t *blk1
, uint8_t *blk2
,
437 "movq (%1), %%mm0 \n\t"
438 "movq (%1, %3), %%mm1 \n\t"
439 "pavgb 1(%1), %%mm0 \n\t"
440 "pavgb 1(%1, %3), %%mm1 \n\t"
441 "psadbw (%2), %%mm0 \n\t"
442 "psadbw (%2, %3), %%mm1 \n\t"
443 "paddw %%mm0, %%mm6 \n\t"
444 "paddw %%mm1, %%mm6 \n\t"
445 "lea (%1,%3,2), %1 \n\t"
446 "lea (%2,%3,2), %2 \n\t"
449 : "+r" (h
), "+r" (blk1
), "+r" (blk2
)
450 : "r" ((x86_reg
) stride
));
/*
 * 8-wide vertical half-pel SAD core (MMXEXT): each row is averaged with
 * the row below via pavgb before psadbw against the reference; the last
 * loaded row is carried over in %mm0 for the next iteration, two rows
 * per pass, partial sums in %mm6.
 * NOTE(review): the __asm__ wrapper and loop control are not visible in
 * this extraction.
 */
453 static inline void sad8_y2a_mmxext(uint8_t *blk1
, uint8_t *blk2
,
457 "movq (%1), %%mm0 \n\t"
461 "movq (%1), %%mm1 \n\t"
462 "movq (%1, %3), %%mm2 \n\t"
463 "pavgb %%mm1, %%mm0 \n\t"
464 "pavgb %%mm2, %%mm1 \n\t"
465 "psadbw (%2), %%mm0 \n\t"
466 "psadbw (%2, %3), %%mm1 \n\t"
467 "paddw %%mm0, %%mm6 \n\t"
468 "paddw %%mm1, %%mm6 \n\t"
469 "movq %%mm2, %%mm0 \n\t"
470 "lea (%1,%3,2), %1 \n\t"
471 "lea (%2,%3,2), %2 \n\t"
474 : "+r" (h
), "+r" (blk1
), "+r" (blk2
)
475 : "r" ((x86_reg
) stride
));
/*
 * 8-wide xy (diagonal) half-pel SAD core (MMXEXT): rows are first
 * averaged horizontally (pavgb with the +1 byte offset), then averaged
 * vertically with the previous row.  Because chaining two pavgb ops
 * over-rounds, one operand is corrected by subtracting the all-ones-
 * per-byte constant "bone" (psubusb %%mm5) before the vertical average.
 * Partial sums accumulate in %mm6 via psadbw.
 * NOTE(review): the __asm__ wrapper and loop control are not visible in
 * this extraction.
 */
478 static inline void sad8_4_mmxext(uint8_t *blk1
, uint8_t *blk2
,
482 "movq "MANGLE(bone
)", %%mm5 \n\t"
483 "movq (%1), %%mm0 \n\t"
484 "pavgb 1(%1), %%mm0 \n\t"
488 "movq (%1), %%mm1 \n\t"
489 "movq (%1,%3), %%mm2 \n\t"
490 "pavgb 1(%1), %%mm1 \n\t"
491 "pavgb 1(%1,%3), %%mm2 \n\t"
492 "psubusb %%mm5, %%mm1 \n\t"
493 "pavgb %%mm1, %%mm0 \n\t"
494 "pavgb %%mm2, %%mm1 \n\t"
495 "psadbw (%2), %%mm0 \n\t"
496 "psadbw (%2,%3), %%mm1 \n\t"
497 "paddw %%mm0, %%mm6 \n\t"
498 "paddw %%mm1, %%mm6 \n\t"
499 "movq %%mm2, %%mm0 \n\t"
500 "lea (%1,%3,2), %1 \n\t"
501 "lea (%2,%3,2), %2 \n\t"
504 : "+r" (h
), "+r" (blk1
), "+r" (blk2
)
505 : "r" ((x86_reg
) stride
)
506 NAMED_CONSTRAINTS_ADD(bone
));
/*
 * 8-wide SAD core against the rounded average of two source blocks
 * (MMX): avg(blk1a, blk1b) vs blk2, one row per iteration.  Rows are
 * widened to words, summed, biased by the rounding constant the caller
 * preloaded into %mm5 (round_tab[1]), shifted right by 1 and re-packed;
 * the absolute difference against blk2 is then built with psubusb/por
 * and accumulated into %mm6.  Used for both x2 and y2 half-pel SAD via
 * the sad8_x2a_mmx()/sad8_y2a_mmx() wrappers.  Addressing is
 * base - len + counter with len = -stride * h.
 * NOTE(review): the __asm__ wrapper, loop label and branch are not
 * visible in this extraction.
 */
509 static inline void sad8_2_mmx(uint8_t *blk1a
, uint8_t *blk1b
, uint8_t *blk2
,
512 x86_reg len
= -(x86_reg
)stride
* h
;
516 "movq (%1, %%"REG_a
"), %%mm0 \n\t"
517 "movq (%2, %%"REG_a
"), %%mm1 \n\t"
518 "movq (%1, %%"REG_a
"), %%mm2 \n\t"
519 "movq (%2, %%"REG_a
"), %%mm3 \n\t"
520 "punpcklbw %%mm7, %%mm0 \n\t"
521 "punpcklbw %%mm7, %%mm1 \n\t"
522 "punpckhbw %%mm7, %%mm2 \n\t"
523 "punpckhbw %%mm7, %%mm3 \n\t"
524 "paddw %%mm0, %%mm1 \n\t"
525 "paddw %%mm2, %%mm3 \n\t"
526 "movq (%3, %%"REG_a
"), %%mm4 \n\t"
527 "movq (%3, %%"REG_a
"), %%mm2 \n\t"
528 "paddw %%mm5, %%mm1 \n\t"
529 "paddw %%mm5, %%mm3 \n\t"
530 "psrlw $1, %%mm1 \n\t"
531 "psrlw $1, %%mm3 \n\t"
532 "packuswb %%mm3, %%mm1 \n\t"
533 "psubusb %%mm1, %%mm4 \n\t"
534 "psubusb %%mm2, %%mm1 \n\t"
535 "por %%mm4, %%mm1 \n\t"
536 "movq %%mm1, %%mm0 \n\t"
537 "punpcklbw %%mm7, %%mm0 \n\t"
538 "punpckhbw %%mm7, %%mm1 \n\t"
539 "paddw %%mm1, %%mm0 \n\t"
540 "paddw %%mm0, %%mm6 \n\t"
541 "add %4, %%"REG_a
" \n\t"
544 : "r" (blk1a
- len
), "r" (blk1b
- len
), "r" (blk2
- len
),
545 "r" ((x86_reg
) stride
));
/*
 * 8-wide xy (diagonal) half-pel SAD core (MMX): averages four source
 * pixels -- (x, x+1) of the current row (operand %1) and of the row
 * below (operand %2 = blk1 + stride) -- by widening to words, summing,
 * adding the rounding bias round_tab[2] (operand %5) and shifting right
 * by 2.  The packed average is compared against blk2 (operand %3) with
 * the psubusb/por unsigned-abs trick and accumulated into %mm6.  The
 * lower row's sums are carried over in %mm0/%mm1 for the next
 * iteration.  Addressing is base - len + counter with len = -stride*h.
 * NOTE(review): the __asm__ wrapper, loop label and branch are not
 * visible in this extraction.
 */
548 static inline void sad8_4_mmx(uint8_t *blk1
, uint8_t *blk2
, int stride
, int h
)
550 x86_reg len
= -(x86_reg
)stride
* h
;
552 "movq (%1, %%"REG_a
"), %%mm0 \n\t"
553 "movq 1(%1, %%"REG_a
"), %%mm2 \n\t"
554 "movq %%mm0, %%mm1 \n\t"
555 "movq %%mm2, %%mm3 \n\t"
556 "punpcklbw %%mm7, %%mm0 \n\t"
557 "punpckhbw %%mm7, %%mm1 \n\t"
558 "punpcklbw %%mm7, %%mm2 \n\t"
559 "punpckhbw %%mm7, %%mm3 \n\t"
560 "paddw %%mm2, %%mm0 \n\t"
561 "paddw %%mm3, %%mm1 \n\t"
564 "movq (%2, %%"REG_a
"), %%mm2 \n\t"
565 "movq 1(%2, %%"REG_a
"), %%mm4 \n\t"
566 "movq %%mm2, %%mm3 \n\t"
567 "movq %%mm4, %%mm5 \n\t"
568 "punpcklbw %%mm7, %%mm2 \n\t"
569 "punpckhbw %%mm7, %%mm3 \n\t"
570 "punpcklbw %%mm7, %%mm4 \n\t"
571 "punpckhbw %%mm7, %%mm5 \n\t"
572 "paddw %%mm4, %%mm2 \n\t"
573 "paddw %%mm5, %%mm3 \n\t"
574 "movq %5, %%mm5 \n\t"
575 "paddw %%mm2, %%mm0 \n\t"
576 "paddw %%mm3, %%mm1 \n\t"
577 "paddw %%mm5, %%mm0 \n\t"
578 "paddw %%mm5, %%mm1 \n\t"
579 "movq (%3, %%"REG_a
"), %%mm4 \n\t"
580 "movq (%3, %%"REG_a
"), %%mm5 \n\t"
581 "psrlw $2, %%mm0 \n\t"
582 "psrlw $2, %%mm1 \n\t"
583 "packuswb %%mm1, %%mm0 \n\t"
584 "psubusb %%mm0, %%mm4 \n\t"
585 "psubusb %%mm5, %%mm0 \n\t"
586 "por %%mm4, %%mm0 \n\t"
587 "movq %%mm0, %%mm4 \n\t"
588 "punpcklbw %%mm7, %%mm0 \n\t"
589 "punpckhbw %%mm7, %%mm4 \n\t"
590 "paddw %%mm0, %%mm6 \n\t"
591 "paddw %%mm4, %%mm6 \n\t"
592 "movq %%mm2, %%mm0 \n\t"
593 "movq %%mm3, %%mm1 \n\t"
594 "add %4, %%"REG_a
" \n\t"
597 : "r" (blk1
- len
), "r" (blk1
- len
+ stride
), "r" (blk2
- len
),
598 "r" ((x86_reg
) stride
), "m" (round_tab
[2]));
/*
 * Fold the four 16-bit partial sums accumulated in %mm6 by the MMX SAD
 * cores into one integer (shift-right by 32 and 16 with paddw between,
 * then movd out).
 * NOTE(review): the __asm__ wrapper, output operand and return statement
 * are not visible in this extraction -- confirm against upstream.
 */
601 static inline int sum_mmx(void)
605 "movq %%mm6, %%mm0 \n\t"
606 "psrlq $32, %%mm6 \n\t"
607 "paddw %%mm0, %%mm6 \n\t"
608 "movq %%mm6, %%mm0 \n\t"
609 "psrlq $16, %%mm6 \n\t"
610 "paddw %%mm0, %%mm6 \n\t"
611 "movd %%mm6, %0 \n\t"
/*
 * Read back the SAD accumulated in %mm6 by the MMXEXT cores (psadbw
 * already produced a single scalar, so a plain movd suffices).
 * NOTE(review): only the signature and the movd line are visible in this
 * extraction; the __asm__ wrapper and return are missing -- confirm
 * against upstream.
 */
616 static inline int sum_mmxext(void)
620 "movd %%mm6, %0 \n\t"
/* Horizontal half-pel SAD for MMX: average of blk1 and blk1 + 1 (the
 * one-pixel-right neighbor) compared against blk2 via sad8_2_mmx(). */
625 static inline void sad8_x2a_mmx(uint8_t *blk1
, uint8_t *blk2
, int stride
, int h
)
627 sad8_2_mmx(blk1
, blk1
+ 1, blk2
, stride
, h
);
/* Vertical half-pel SAD for MMX: average of blk1 and blk1 + stride (the
 * row below) compared against blk2 via sad8_2_mmx(). */
630 static inline void sad8_y2a_mmx(uint8_t *blk1
, uint8_t *blk2
, int stride
, int h
)
632 sad8_2_mmx(blk1
, blk1
+ stride
, blk2
, stride
, h
);
/*
 * Instantiate the full set of SAD entry points for one CPU flavor
 * ("suf" = mmx or mmxext): sad8 / sad16 in full-pel, horizontal (x2),
 * vertical (y2) and diagonal (xy2) half-pel variants.  Each entry zeroes
 * the %mm6 accumulator and the %mm7 helper register, preloads the
 * rounding bias round_tab[1] into %mm5 for the x2/y2 averaging paths,
 * runs the matching sad8_*_<suf> core (once for 8-wide, twice with a +8
 * column offset for 16-wide) and returns sum_<suf>().  The 8-wide
 * variants assert h == 8.
 * NOTE(review): some continuation lines of this macro (function braces,
 * __asm__ wrappers) are not visible in this extraction -- compare
 * against the upstream file.
 */
635 #define PIX_SAD(suf) \
636 static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \
637 uint8_t *blk1, int stride, int h) \
639 av_assert2(h == 8); \
641 "pxor %%mm7, %%mm7 \n\t" \
642 "pxor %%mm6, %%mm6 \n\t" \
645 sad8_1_ ## suf(blk1, blk2, stride, 8); \
647 return sum_ ## suf(); \
650 static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
651 uint8_t *blk1, int stride, int h) \
653 av_assert2(h == 8); \
655 "pxor %%mm7, %%mm7 \n\t" \
656 "pxor %%mm6, %%mm6 \n\t" \
657 "movq %0, %%mm5 \n\t" \
658 :: "m" (round_tab[1])); \
660 sad8_x2a_ ## suf(blk1, blk2, stride, 8); \
662 return sum_ ## suf(); \
665 static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
666 uint8_t *blk1, int stride, int h) \
668 av_assert2(h == 8); \
670 "pxor %%mm7, %%mm7 \n\t" \
671 "pxor %%mm6, %%mm6 \n\t" \
672 "movq %0, %%mm5 \n\t" \
673 :: "m" (round_tab[1])); \
675 sad8_y2a_ ## suf(blk1, blk2, stride, 8); \
677 return sum_ ## suf(); \
680 static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
681 uint8_t *blk1, int stride, int h) \
683 av_assert2(h == 8); \
685 "pxor %%mm7, %%mm7 \n\t" \
686 "pxor %%mm6, %%mm6 \n\t" \
689 sad8_4_ ## suf(blk1, blk2, stride, 8); \
691 return sum_ ## suf(); \
694 static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \
695 uint8_t *blk1, int stride, int h) \
698 "pxor %%mm7, %%mm7 \n\t" \
699 "pxor %%mm6, %%mm6 \n\t" \
702 sad8_1_ ## suf(blk1, blk2, stride, h); \
703 sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
705 return sum_ ## suf(); \
708 static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
709 uint8_t *blk1, int stride, int h) \
712 "pxor %%mm7, %%mm7 \n\t" \
713 "pxor %%mm6, %%mm6 \n\t" \
714 "movq %0, %%mm5 \n\t" \
715 :: "m" (round_tab[1])); \
717 sad8_x2a_ ## suf(blk1, blk2, stride, h); \
718 sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
720 return sum_ ## suf(); \
723 static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
724 uint8_t *blk1, int stride, int h) \
727 "pxor %%mm7, %%mm7 \n\t" \
728 "pxor %%mm6, %%mm6 \n\t" \
729 "movq %0, %%mm5 \n\t" \
730 :: "m" (round_tab[1])); \
732 sad8_y2a_ ## suf(blk1, blk2, stride, h); \
733 sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
735 return sum_ ## suf(); \
738 static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
739 uint8_t *blk1, int stride, int h) \
742 "pxor %%mm7, %%mm7 \n\t" \
743 "pxor %%mm6, %%mm6 \n\t" \
746 sad8_4_ ## suf(blk1, blk2, stride, h); \
747 sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
749 return sum_ ## suf(); \
755 #endif /* HAVE_INLINE_ASM */
757 av_cold
void ff_me_cmp_init_x86(MECmpContext
*c
, AVCodecContext
*avctx
)
759 int cpu_flags
= av_get_cpu_flags();
762 if (INLINE_MMX(cpu_flags
)) {
763 c
->pix_abs
[0][0] = sad16_mmx
;
764 c
->pix_abs
[0][1] = sad16_x2_mmx
;
765 c
->pix_abs
[0][2] = sad16_y2_mmx
;
766 c
->pix_abs
[0][3] = sad16_xy2_mmx
;
767 c
->pix_abs
[1][0] = sad8_mmx
;
768 c
->pix_abs
[1][1] = sad8_x2_mmx
;
769 c
->pix_abs
[1][2] = sad8_y2_mmx
;
770 c
->pix_abs
[1][3] = sad8_xy2_mmx
;
772 c
->sad
[0] = sad16_mmx
;
773 c
->sad
[1] = sad8_mmx
;
775 c
->vsad
[4] = vsad_intra16_mmx
;
777 if (!(avctx
->flags
& CODEC_FLAG_BITEXACT
)) {
778 c
->vsad
[0] = vsad16_mmx
;
782 if (INLINE_MMXEXT(cpu_flags
)) {
783 c
->vsad
[4] = vsad_intra16_mmxext
;
785 c
->pix_abs
[0][0] = sad16_mmxext
;
786 c
->pix_abs
[1][0] = sad8_mmxext
;
788 c
->sad
[0] = sad16_mmxext
;
789 c
->sad
[1] = sad8_mmxext
;
791 c
->pix_abs
[0][1] = sad16_x2_mmxext
;
792 c
->pix_abs
[0][2] = sad16_y2_mmxext
;
793 c
->pix_abs
[1][1] = sad8_x2_mmxext
;
794 c
->pix_abs
[1][2] = sad8_y2_mmxext
;
796 if (!(avctx
->flags
& CODEC_FLAG_BITEXACT
)) {
797 c
->pix_abs
[0][3] = sad16_xy2_mmxext
;
798 c
->pix_abs
[1][3] = sad8_xy2_mmxext
;
800 c
->vsad
[0] = vsad16_mmxext
;
804 if (INLINE_SSE2(cpu_flags
) && !(cpu_flags
& AV_CPU_FLAG_SSE2SLOW
) && avctx
->codec_id
!= AV_CODEC_ID_SNOW
) {
805 c
->sad
[0] = sad16_sse2
;
808 #endif /* HAVE_INLINE_ASM */
810 if (EXTERNAL_MMX(cpu_flags
)) {
811 c
->hadamard8_diff
[0] = ff_hadamard8_diff16_mmx
;
812 c
->hadamard8_diff
[1] = ff_hadamard8_diff_mmx
;
813 c
->sum_abs_dctelem
= ff_sum_abs_dctelem_mmx
;
814 c
->sse
[0] = ff_sse16_mmx
;
815 c
->sse
[1] = ff_sse8_mmx
;
817 c
->nsse
[0] = nsse16_mmx
;
818 c
->nsse
[1] = nsse8_mmx
;
822 if (EXTERNAL_MMXEXT(cpu_flags
)) {
823 c
->hadamard8_diff
[0] = ff_hadamard8_diff16_mmxext
;
824 c
->hadamard8_diff
[1] = ff_hadamard8_diff_mmxext
;
825 c
->sum_abs_dctelem
= ff_sum_abs_dctelem_mmxext
;
828 if (EXTERNAL_SSE2(cpu_flags
)) {
829 c
->sse
[0] = ff_sse16_sse2
;
830 c
->sum_abs_dctelem
= ff_sum_abs_dctelem_sse2
;
832 #if HAVE_ALIGNED_STACK
833 c
->hadamard8_diff
[0] = ff_hadamard8_diff16_sse2
;
834 c
->hadamard8_diff
[1] = ff_hadamard8_diff_sse2
;
838 if (EXTERNAL_SSSE3(cpu_flags
)) {
839 c
->sum_abs_dctelem
= ff_sum_abs_dctelem_ssse3
;
840 #if HAVE_ALIGNED_STACK
841 c
->hadamard8_diff
[0] = ff_hadamard8_diff16_ssse3
;
842 c
->hadamard8_diff
[1] = ff_hadamard8_diff_ssse3
;