Imported Debian version 2.5.0~trusty1.1
ffmpeg/libavcodec/x86/me_cmp_init.c
/*
 * SIMD-optimized motion estimation
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/me_cmp.h"
#include "libavcodec/mpegvideo.h"

int ff_sum_abs_dctelem_mmx(int16_t *block);
int ff_sum_abs_dctelem_mmxext(int16_t *block);
int ff_sum_abs_dctelem_sse2(int16_t *block);
int ff_sum_abs_dctelem_ssse3(int16_t *block);
int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                ptrdiff_t stride, int h);
int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                 ptrdiff_t stride, int h);
int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  ptrdiff_t stride, int h);
int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   ptrdiff_t stride, int h);
int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                    ptrdiff_t stride, int h);
int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  ptrdiff_t stride, int h);
int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h);
int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h);
int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h);
int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h);
int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                              ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                               ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                             ptrdiff_t stride, int h);
int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h);
int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                           ptrdiff_t stride, int h);
int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h);
int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                           ptrdiff_t stride, int h);
int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                            ptrdiff_t stride, int h);
int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h);

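/* hadamard_func() declares, for one CPU flavour, the external prototypes of
 * the 8x8 and 16x8 Hadamard-transformed difference metrics implemented in
 * assembly. */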
#define hadamard_func(cpu)                                                    \
    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,           \
                                  uint8_t *src2, ptrdiff_t stride, int h);    \
    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,         \
                                    uint8_t *src2, ptrdiff_t stride, int h);

hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)

#if HAVE_YASM
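/* nsse ("noise preserving" SSE): plain SSE plus the absolute difference of a
 * high-frequency noise estimate of the two blocks, weighted by
 * avctx->nsse_weight (a weight of 8 is used when no encoder context is
 * available). */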
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
{
    int score1, score2;

    if (c)
        score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
    else
        score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
    score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h)
           - ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h)
{
    int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h);
    int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
                 ff_hf_noise8_mmx(pix2, stride, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

#endif /* HAVE_YASM */

#if HAVE_INLINE_ASM

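/* Intra vertical SAD: sums |pix[x][y] - pix[x][y-1]| over a 16-pixel-wide
 * block. The SUM() macro processes one 16-byte row, keeping the previous row
 * in MMX registers and accumulating 16-bit partial sums in mm6; two rows are
 * handled per loop iteration. */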
static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                            ptrdiff_t stride, int h)
{
    int tmp;

    av_assert2((((int) pix) & 7) == 0);
    av_assert2((stride & 7) == 0);

#define SUM(in0, in1, out0, out1)               \
    "movq (%0), %%mm2\n"                        \
    "movq 8(%0), %%mm3\n"                       \
    "add %2,%0\n"                               \
    "movq %%mm2, " #out0 "\n"                   \
    "movq %%mm3, " #out1 "\n"                   \
    "psubusb " #in0 ", %%mm2\n"                 \
    "psubusb " #in1 ", %%mm3\n"                 \
    "psubusb " #out0 ", " #in0 "\n"             \
    "psubusb " #out1 ", " #in1 "\n"             \
    "por %%mm2, " #in0 "\n"                     \
    "por %%mm3, " #in1 "\n"                     \
    "movq " #in0 ", %%mm2\n"                    \
    "movq " #in1 ", %%mm3\n"                    \
    "punpcklbw %%mm7, " #in0 "\n"               \
    "punpcklbw %%mm7, " #in1 "\n"               \
    "punpckhbw %%mm7, %%mm2\n"                  \
    "punpckhbw %%mm7, %%mm3\n"                  \
    "paddw " #in1 ", " #in0 "\n"                \
    "paddw %%mm3, %%mm2\n"                      \
    "paddw %%mm2, " #in0 "\n"                   \
    "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");

    return tmp & 0xFFFF;
}
#undef SUM

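/* Inter vertical SAD: the same measure applied to the difference signal
 * pix1 - pix2. mm7 is loaded with 0x80 in every byte so the signed byte
 * differences can be compared with unsigned saturation; the result is
 * masked to 15 bits. */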
static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
{
    int tmp;

    av_assert2((((int) pix1) & 7) == 0);
    av_assert2((((int) pix2) & 7) == 0);
    av_assert2((stride & 7) == 0);

#define SUM(in0, in1, out0, out1)               \
    "movq (%0), %%mm2\n"                        \
    "movq (%1), " #out0 "\n"                    \
    "movq 8(%0), %%mm3\n"                       \
    "movq 8(%1), " #out1 "\n"                   \
    "add %3, %0\n"                              \
    "add %3, %1\n"                              \
    "psubb " #out0 ", %%mm2\n"                  \
    "psubb " #out1 ", %%mm3\n"                  \
    "pxor %%mm7, %%mm2\n"                       \
    "pxor %%mm7, %%mm3\n"                       \
    "movq %%mm2, " #out0 "\n"                   \
    "movq %%mm3, " #out1 "\n"                   \
    "psubusb " #in0 ", %%mm2\n"                 \
    "psubusb " #in1 ", %%mm3\n"                 \
    "psubusb " #out0 ", " #in0 "\n"             \
    "psubusb " #out1 ", " #in1 "\n"             \
    "por %%mm2, " #in0 "\n"                     \
    "por %%mm3, " #in1 "\n"                     \
    "movq " #in0 ", %%mm2\n"                    \
    "movq " #in1 ", %%mm3\n"                    \
    "punpcklbw %%mm7, " #in0 "\n"               \
    "punpcklbw %%mm7, " #in1 "\n"               \
    "punpckhbw %%mm7, %%mm2\n"                  \
    "punpckhbw %%mm7, %%mm3\n"                  \
    "paddw " #in1 ", " #in0 "\n"                \
    "paddw %%mm3, %%mm2\n"                      \
    "paddw %%mm2, " #in0 "\n"                   \
    "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");

    return tmp & 0x7FFF;
}
#undef SUM

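/* Rounding constants for the averaging SAD variants: round_tab[1] (+1 before
 * the >>1) is used by the x2/y2 half-pel cases, round_tab[2] (+2 before the
 * >>2) by the approximate xy2 case. */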
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};

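/* 8-pixel-wide SAD over h rows. The callers (see PIX_SAD below) must zero
 * mm6 (accumulator) and mm7 (unpacking helper) beforehand; the row offset in
 * REG_a runs from -stride * h up to 0, so the loop ends on the sign flag. */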
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm4    \n\t"
        "add %3, %%"REG_a"              \n\t"
        "psubusb %%mm0, %%mm2           \n\t"
        "psubusb %%mm4, %%mm0           \n\t"
        "movq (%1, %%"REG_a"), %%mm1    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "movq (%2, %%"REG_a"), %%mm5    \n\t"
        "psubusb %%mm1, %%mm3           \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "por %%mm2, %%mm0               \n\t"
        "por %%mm1, %%mm3               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm3, %%mm2              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm3, %%mm2             \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %3, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
}

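/* SAD against the rounded average of blk1a and blk1b (mm5 must hold
 * round_tab[1]); used for the horizontal (blk1b = blk1 + 1) and vertical
 * (blk1b = blk1 + stride) half-pel cases. */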
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm1    \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm2    \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "psrlw $1, %%mm1                \n\t"
        "psrlw $1, %%mm3                \n\t"
        "packuswb %%mm3, %%mm1          \n\t"
        "psubusb %%mm1, %%mm4           \n\t"
        "psubusb %%mm2, %%mm1           \n\t"
        "por %%mm4, %%mm1               \n\t"
        "movq %%mm1, %%mm0              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
          "r" (stride));
}

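/* Approximate SAD against the average of four neighbouring pixels (diagonal
 * half-pel). Each row's horizontal pair sums are kept in mm0/mm1 and reused
 * for the next row, so only one new row is unpacked per iteration;
 * round_tab[2] is added before the >>2. */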
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq 1(%2, %%"REG_a"), %%mm4   \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "movq %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddw %%mm4, %%mm2             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "movq %5, %%mm5                 \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        "paddw %%mm5, %%mm0             \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm5    \n\t"
        "psrlw $2, %%mm0                \n\t"
        "psrlw $2, %%mm1                \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "psubusb %%mm0, %%mm4           \n\t"
        "psubusb %%mm5, %%mm0           \n\t"
        "por %%mm4, %%mm0               \n\t"
        "movq %%mm0, %%mm4              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm4         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm4, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "movq %%mm3, %%mm1              \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
          "r" (stride), "m" (round_tab[2]));
}

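/* Fold the four 16-bit partial sums accumulated in mm6 into one scalar. */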
static inline int sum_mmx(void)
{
    int ret;
    __asm__ volatile (
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret));
    return ret & 0xFFFF;
}

static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}

static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
}

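/* PIX_SAD() instantiates the public pix_abs/sad entry points (full-pel, x2,
 * y2 and xy2) for a given suffix on top of the sad8_* helpers above: each
 * wrapper clears mm6/mm7, loads the rounding constant where needed, runs the
 * inner loop(s) and returns the folded sum. */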
#define PIX_SAD(suf)                                                     \
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,                \
                        uint8_t *blk1, ptrdiff_t stride, int h)          \
{                                                                        \
    av_assert2(h == 8);                                                  \
    __asm__ volatile (                                                   \
        "pxor %%mm7, %%mm7     \n\t"                                     \
        "pxor %%mm6, %%mm6     \n\t"                                     \
        :);                                                              \
                                                                         \
    sad8_1_ ## suf(blk1, blk2, stride, 8);                               \
                                                                         \
    return sum_ ## suf();                                                \
}                                                                        \
                                                                         \
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,             \
                           uint8_t *blk1, ptrdiff_t stride, int h)       \
{                                                                        \
    av_assert2(h == 8);                                                  \
    __asm__ volatile (                                                   \
        "pxor %%mm7, %%mm7     \n\t"                                     \
        "pxor %%mm6, %%mm6     \n\t"                                     \
        "movq %0, %%mm5        \n\t"                                     \
        :: "m" (round_tab[1]));                                          \
                                                                         \
    sad8_x2a_ ## suf(blk1, blk2, stride, 8);                             \
                                                                         \
    return sum_ ## suf();                                                \
}                                                                        \
                                                                         \
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,             \
                           uint8_t *blk1, ptrdiff_t stride, int h)       \
{                                                                        \
    av_assert2(h == 8);                                                  \
    __asm__ volatile (                                                   \
        "pxor %%mm7, %%mm7     \n\t"                                     \
        "pxor %%mm6, %%mm6     \n\t"                                     \
        "movq %0, %%mm5        \n\t"                                     \
        :: "m" (round_tab[1]));                                          \
                                                                         \
    sad8_y2a_ ## suf(blk1, blk2, stride, 8);                             \
                                                                         \
    return sum_ ## suf();                                                \
}                                                                        \
                                                                         \
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                            uint8_t *blk1, ptrdiff_t stride, int h)      \
{                                                                        \
    av_assert2(h == 8);                                                  \
    __asm__ volatile (                                                   \
        "pxor %%mm7, %%mm7     \n\t"                                     \
        "pxor %%mm6, %%mm6     \n\t"                                     \
        ::);                                                             \
                                                                         \
    sad8_4_ ## suf(blk1, blk2, stride, 8);                               \
                                                                         \
    return sum_ ## suf();                                                \
}                                                                        \
                                                                         \
static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
                         uint8_t *blk1, ptrdiff_t stride, int h)         \
{                                                                        \
    __asm__ volatile (                                                   \
        "pxor %%mm7, %%mm7     \n\t"                                     \
        "pxor %%mm6, %%mm6     \n\t"                                     \
        :);                                                              \
                                                                         \
    sad8_1_ ## suf(blk1, blk2, stride, h);                               \
    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h);                       \
                                                                         \
    return sum_ ## suf();                                                \
}                                                                        \
                                                                         \
static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                            uint8_t *blk1, ptrdiff_t stride, int h)      \
{                                                                        \
    __asm__ volatile (                                                   \
        "pxor %%mm7, %%mm7     \n\t"                                     \
        "pxor %%mm6, %%mm6     \n\t"                                     \
        "movq %0, %%mm5        \n\t"                                     \
        :: "m" (round_tab[1]));                                          \
                                                                         \
    sad8_x2a_ ## suf(blk1, blk2, stride, h);                             \
    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                     \
                                                                         \
    return sum_ ## suf();                                                \
}                                                                        \
                                                                         \
static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                            uint8_t *blk1, ptrdiff_t stride, int h)      \
{                                                                        \
    __asm__ volatile (                                                   \
        "pxor %%mm7, %%mm7     \n\t"                                     \
        "pxor %%mm6, %%mm6     \n\t"                                     \
        "movq %0, %%mm5        \n\t"                                     \
        :: "m" (round_tab[1]));                                          \
                                                                         \
    sad8_y2a_ ## suf(blk1, blk2, stride, h);                             \
    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                     \
                                                                         \
    return sum_ ## suf();                                                \
}                                                                        \
                                                                         \
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                             uint8_t *blk1, ptrdiff_t stride, int h)     \
{                                                                        \
    __asm__ volatile (                                                   \
        "pxor %%mm7, %%mm7     \n\t"                                     \
        "pxor %%mm6, %%mm6     \n\t"                                     \
        ::);                                                             \
                                                                         \
    sad8_4_ ## suf(blk1, blk2, stride, h);                               \
    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h);                       \
                                                                         \
    return sum_ ## suf();                                                \
}                                                                        \

PIX_SAD(mmx)

#endif /* HAVE_INLINE_ASM */

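/* Fill the MECmpContext function pointers in increasing ISA order so that
 * faster versions overwrite the baseline ones; the approximate half-pel and
 * inter-vsad variants are skipped when bit-exact output is requested.
 * This per-arch init is typically reached from ff_me_cmp_init() on x86. */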
av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
{
    int cpu_flags = av_get_cpu_flags();

#if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0] = sad16_mmx;
        c->sad[1] = sad8_mmx;

        c->vsad[4] = vsad_intra16_mmx;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->vsad[0] = vsad16_mmx;
        }
    }

#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_MMX(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
        c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx;
        c->sse[0] = ff_sse16_mmx;
        c->sse[1] = ff_sse8_mmx;
#if HAVE_YASM
        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
#endif
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
        c->sum_abs_dctelem = ff_sum_abs_dctelem_mmxext;

        c->sad[0] = ff_sad16_mmxext;
        c->sad[1] = ff_sad8_mmxext;

        c->pix_abs[0][0] = ff_sad16_mmxext;
        c->pix_abs[0][1] = ff_sad16_x2_mmxext;
        c->pix_abs[0][2] = ff_sad16_y2_mmxext;
        c->pix_abs[1][0] = ff_sad8_mmxext;
        c->pix_abs[1][1] = ff_sad8_x2_mmxext;
        c->pix_abs[1][2] = ff_sad8_y2_mmxext;

        c->vsad[4] = ff_vsad_intra16_mmxext;
        c->vsad[5] = ff_vsad_intra8_mmxext;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
            c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;

            c->vsad[0] = ff_vsad16_approx_mmxext;
            c->vsad[1] = ff_vsad8_approx_mmxext;
        }
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->sse[0] = ff_sse16_sse2;
        c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;

#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
        if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
            c->sad[0] = ff_sad16_sse2;
            c->pix_abs[0][0] = ff_sad16_sse2;
            c->pix_abs[0][1] = ff_sad16_x2_sse2;
            c->pix_abs[0][2] = ff_sad16_y2_sse2;

            c->vsad[4] = ff_vsad_intra16_sse2;
            if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
                c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
                c->vsad[0] = ff_vsad16_approx_sse2;
            }
        }
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;
#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
#endif
    }
}
651}