Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * SIMD-optimized motion estimation | |
3 | * Copyright (c) 2000, 2001 Fabrice Bellard | |
4 | * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> | |
5 | * | |
6 | * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
7 | * | |
8 | * This file is part of FFmpeg. | |
9 | * | |
10 | * FFmpeg is free software; you can redistribute it and/or | |
11 | * modify it under the terms of the GNU Lesser General Public | |
12 | * License as published by the Free Software Foundation; either | |
13 | * version 2.1 of the License, or (at your option) any later version. | |
14 | * | |
15 | * FFmpeg is distributed in the hope that it will be useful, | |
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
18 | * Lesser General Public License for more details. | |
19 | * | |
20 | * You should have received a copy of the GNU Lesser General Public | |
21 | * License along with FFmpeg; if not, write to the Free Software | |
22 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
23 | */ | |
24 | ||
25 | #include "libavutil/attributes.h" | |
26 | #include "libavutil/cpu.h" | |
27 | #include "libavutil/x86/asm.h" | |
28 | #include "libavutil/x86/cpu.h" | |
29 | #include "libavcodec/me_cmp.h" | |
30 | #include "libavcodec/mpegvideo.h" | |
31 | ||
32 | int ff_sum_abs_dctelem_mmx(int16_t *block); | |
33 | int ff_sum_abs_dctelem_mmxext(int16_t *block); | |
34 | int ff_sum_abs_dctelem_sse2(int16_t *block); | |
35 | int ff_sum_abs_dctelem_ssse3(int16_t *block); | |
36 | int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
f6fa7814 | 37 | ptrdiff_t stride, int h); |
2ba45a60 | 38 | int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
f6fa7814 | 39 | ptrdiff_t stride, int h); |
2ba45a60 | 40 | int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
f6fa7814 DM |
41 | ptrdiff_t stride, int h); |
42 | int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h); | |
43 | int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h); | |
44 | int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
45 | ptrdiff_t stride, int h); | |
46 | int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
47 | ptrdiff_t stride, int h); | |
48 | int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
49 | ptrdiff_t stride, int h); | |
50 | int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
51 | ptrdiff_t stride, int h); | |
52 | int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
53 | ptrdiff_t stride, int h); | |
54 | int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
55 | ptrdiff_t stride, int h); | |
56 | int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
57 | ptrdiff_t stride, int h); | |
58 | int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
59 | ptrdiff_t stride, int h); | |
60 | int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
61 | ptrdiff_t stride, int h); | |
62 | int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
63 | ptrdiff_t stride, int h); | |
64 | int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
65 | ptrdiff_t stride, int h); | |
66 | int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
67 | ptrdiff_t stride, int h); | |
68 | int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
69 | ptrdiff_t stride, int h); | |
70 | int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
71 | ptrdiff_t stride, int h); | |
72 | int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
73 | ptrdiff_t stride, int h); | |
74 | int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
75 | ptrdiff_t stride, int h); | |
76 | int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
77 | ptrdiff_t stride, int h); | |
78 | int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |
79 | ptrdiff_t stride, int h); | |
80 | ||
/* Declare the external (assembly) 8x8 Hadamard-difference functions for one
 * cpu flavour: ff_hadamard8_diff_<cpu> (8 wide) and the 16-wide variant
 * ff_hadamard8_diff16_<cpu>. */
#define hadamard_func(cpu)                                              \
    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,     \
                                  uint8_t *src2, ptrdiff_t stride, int h); \
    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,   \
                                    uint8_t *src2, ptrdiff_t stride, int h);

hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)
91 | ||
92 | #if HAVE_YASM | |
93 | static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, | |
f6fa7814 | 94 | ptrdiff_t stride, int h) |
2ba45a60 DM |
95 | { |
96 | int score1, score2; | |
97 | ||
98 | if (c) | |
f6fa7814 | 99 | score1 = c->mecc.sse[0](c, pix1, pix2, stride, h); |
2ba45a60 | 100 | else |
f6fa7814 DM |
101 | score1 = ff_sse16_mmx(c, pix1, pix2, stride, h); |
102 | score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h) | |
103 | - ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h); | |
2ba45a60 DM |
104 | |
105 | if (c) | |
106 | return score1 + FFABS(score2) * c->avctx->nsse_weight; | |
107 | else | |
108 | return score1 + FFABS(score2) * 8; | |
109 | } | |
110 | ||
111 | static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, | |
f6fa7814 | 112 | ptrdiff_t stride, int h) |
2ba45a60 | 113 | { |
f6fa7814 DM |
114 | int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h); |
115 | int score2 = ff_hf_noise8_mmx(pix1, stride, h) - | |
116 | ff_hf_noise8_mmx(pix2, stride, h); | |
2ba45a60 DM |
117 | |
118 | if (c) | |
119 | return score1 + FFABS(score2) * c->avctx->nsse_weight; | |
120 | else | |
121 | return score1 + FFABS(score2) * 8; | |
122 | } | |
123 | ||
124 | #endif /* HAVE_YASM */ | |
125 | ||
126 | #if HAVE_INLINE_ASM | |
127 | ||
128 | static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy, | |
f6fa7814 | 129 | ptrdiff_t stride, int h) |
2ba45a60 DM |
130 | { |
131 | int tmp; | |
132 | ||
133 | av_assert2((((int) pix) & 7) == 0); | |
f6fa7814 | 134 | av_assert2((stride & 7) == 0); |
2ba45a60 DM |
135 | |
136 | #define SUM(in0, in1, out0, out1) \ | |
137 | "movq (%0), %%mm2\n" \ | |
138 | "movq 8(%0), %%mm3\n" \ | |
139 | "add %2,%0\n" \ | |
140 | "movq %%mm2, " #out0 "\n" \ | |
141 | "movq %%mm3, " #out1 "\n" \ | |
142 | "psubusb " #in0 ", %%mm2\n" \ | |
143 | "psubusb " #in1 ", %%mm3\n" \ | |
144 | "psubusb " #out0 ", " #in0 "\n" \ | |
145 | "psubusb " #out1 ", " #in1 "\n" \ | |
146 | "por %%mm2, " #in0 "\n" \ | |
147 | "por %%mm3, " #in1 "\n" \ | |
148 | "movq " #in0 ", %%mm2\n" \ | |
149 | "movq " #in1 ", %%mm3\n" \ | |
150 | "punpcklbw %%mm7, " #in0 "\n" \ | |
151 | "punpcklbw %%mm7, " #in1 "\n" \ | |
152 | "punpckhbw %%mm7, %%mm2\n" \ | |
153 | "punpckhbw %%mm7, %%mm3\n" \ | |
154 | "paddw " #in1 ", " #in0 "\n" \ | |
155 | "paddw %%mm3, %%mm2\n" \ | |
156 | "paddw %%mm2, " #in0 "\n" \ | |
157 | "paddw " #in0 ", %%mm6\n" | |
158 | ||
159 | ||
160 | __asm__ volatile ( | |
161 | "movl %3, %%ecx\n" | |
162 | "pxor %%mm6, %%mm6\n" | |
163 | "pxor %%mm7, %%mm7\n" | |
164 | "movq (%0), %%mm0\n" | |
165 | "movq 8(%0), %%mm1\n" | |
166 | "add %2, %0\n" | |
167 | "jmp 2f\n" | |
168 | "1:\n" | |
169 | ||
170 | SUM(%%mm4, %%mm5, %%mm0, %%mm1) | |
171 | "2:\n" | |
172 | SUM(%%mm0, %%mm1, %%mm4, %%mm5) | |
173 | ||
174 | "subl $2, %%ecx\n" | |
175 | "jnz 1b\n" | |
176 | ||
177 | "movq %%mm6, %%mm0\n" | |
178 | "psrlq $32, %%mm6\n" | |
179 | "paddw %%mm6, %%mm0\n" | |
180 | "movq %%mm0, %%mm6\n" | |
181 | "psrlq $16, %%mm0\n" | |
182 | "paddw %%mm6, %%mm0\n" | |
183 | "movd %%mm0, %1\n" | |
184 | : "+r" (pix), "=r" (tmp) | |
f6fa7814 | 185 | : "r" (stride), "m" (h) |
2ba45a60 DM |
186 | : "%ecx"); |
187 | ||
188 | return tmp & 0xFFFF; | |
189 | } | |
190 | #undef SUM | |
191 | ||
2ba45a60 | 192 | static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
f6fa7814 | 193 | ptrdiff_t stride, int h) |
2ba45a60 DM |
194 | { |
195 | int tmp; | |
196 | ||
197 | av_assert2((((int) pix1) & 7) == 0); | |
198 | av_assert2((((int) pix2) & 7) == 0); | |
f6fa7814 | 199 | av_assert2((stride & 7) == 0); |
2ba45a60 DM |
200 | |
201 | #define SUM(in0, in1, out0, out1) \ | |
202 | "movq (%0), %%mm2\n" \ | |
203 | "movq (%1), " #out0 "\n" \ | |
204 | "movq 8(%0), %%mm3\n" \ | |
205 | "movq 8(%1), " #out1 "\n" \ | |
206 | "add %3, %0\n" \ | |
207 | "add %3, %1\n" \ | |
208 | "psubb " #out0 ", %%mm2\n" \ | |
209 | "psubb " #out1 ", %%mm3\n" \ | |
210 | "pxor %%mm7, %%mm2\n" \ | |
211 | "pxor %%mm7, %%mm3\n" \ | |
212 | "movq %%mm2, " #out0 "\n" \ | |
213 | "movq %%mm3, " #out1 "\n" \ | |
214 | "psubusb " #in0 ", %%mm2\n" \ | |
215 | "psubusb " #in1 ", %%mm3\n" \ | |
216 | "psubusb " #out0 ", " #in0 "\n" \ | |
217 | "psubusb " #out1 ", " #in1 "\n" \ | |
218 | "por %%mm2, " #in0 "\n" \ | |
219 | "por %%mm3, " #in1 "\n" \ | |
220 | "movq " #in0 ", %%mm2\n" \ | |
221 | "movq " #in1 ", %%mm3\n" \ | |
222 | "punpcklbw %%mm7, " #in0 "\n" \ | |
223 | "punpcklbw %%mm7, " #in1 "\n" \ | |
224 | "punpckhbw %%mm7, %%mm2\n" \ | |
225 | "punpckhbw %%mm7, %%mm3\n" \ | |
226 | "paddw " #in1 ", " #in0 "\n" \ | |
227 | "paddw %%mm3, %%mm2\n" \ | |
228 | "paddw %%mm2, " #in0 "\n" \ | |
229 | "paddw " #in0 ", %%mm6\n" | |
230 | ||
231 | ||
232 | __asm__ volatile ( | |
233 | "movl %4, %%ecx\n" | |
234 | "pxor %%mm6, %%mm6\n" | |
235 | "pcmpeqw %%mm7, %%mm7\n" | |
236 | "psllw $15, %%mm7\n" | |
237 | "packsswb %%mm7, %%mm7\n" | |
238 | "movq (%0), %%mm0\n" | |
239 | "movq (%1), %%mm2\n" | |
240 | "movq 8(%0), %%mm1\n" | |
241 | "movq 8(%1), %%mm3\n" | |
242 | "add %3, %0\n" | |
243 | "add %3, %1\n" | |
244 | "psubb %%mm2, %%mm0\n" | |
245 | "psubb %%mm3, %%mm1\n" | |
246 | "pxor %%mm7, %%mm0\n" | |
247 | "pxor %%mm7, %%mm1\n" | |
248 | "jmp 2f\n" | |
249 | "1:\n" | |
250 | ||
251 | SUM(%%mm4, %%mm5, %%mm0, %%mm1) | |
252 | "2:\n" | |
253 | SUM(%%mm0, %%mm1, %%mm4, %%mm5) | |
254 | ||
255 | "subl $2, %%ecx\n" | |
256 | "jnz 1b\n" | |
257 | ||
258 | "movq %%mm6, %%mm0\n" | |
259 | "psrlq $32, %%mm6\n" | |
260 | "paddw %%mm6, %%mm0\n" | |
261 | "movq %%mm0, %%mm6\n" | |
262 | "psrlq $16, %%mm0\n" | |
263 | "paddw %%mm6, %%mm0\n" | |
264 | "movd %%mm0, %2\n" | |
265 | : "+r" (pix1), "+r" (pix2), "=r" (tmp) | |
f6fa7814 | 266 | : "r" (stride), "m" (h) |
2ba45a60 DM |
267 | : "%ecx"); |
268 | ||
269 | return tmp & 0x7FFF; | |
270 | } | |
271 | #undef SUM | |
272 | ||
2ba45a60 DM |
/* Per-word rounding constants used by the averaging SAD helpers:
 * [0] no rounding, [1] +1 before >>1 (x2/y2 half-pel averages),
 * [2] +2 before >>2 (xy2 quarter average in sad8_4_mmx). */
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};
278 | ||
f6fa7814 DM |
/* SAD of an 8-pixel-wide block over h rows, two rows per loop iteration
 * (h is assumed even).  REG_a starts at -stride*h and counts up to 0, so
 * the block bases are biased by -len.  |a - b| per byte is built from two
 * saturated subtractions OR'd together, widened to words and accumulated
 * in %mm6.  Caller contract (see PIX_SAD): %mm7 = 0 and %mm6 = 0 on
 * entry; the total is read back afterwards with sum_mmx(). */
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        "movq (%2, %%"REG_a"), %%mm2 \n\t"
        "movq (%2, %%"REG_a"), %%mm4 \n\t"
        "add %3, %%"REG_a" \n\t"
        "psubusb %%mm0, %%mm2 \n\t"
        "psubusb %%mm4, %%mm0 \n\t"
        "movq (%1, %%"REG_a"), %%mm1 \n\t"
        "movq (%2, %%"REG_a"), %%mm3 \n\t"
        "movq (%2, %%"REG_a"), %%mm5 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm5, %%mm1 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm1, %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "punpckhbw %%mm7, %%mm2 \n\t"
        "paddw %%mm1, %%mm0 \n\t"
        "paddw %%mm3, %%mm2 \n\t"
        "paddw %%mm2, %%mm0 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "add %3, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
}
314 | ||
/* SAD of the rounded average of two 8-wide references (blk1a, blk1b)
 * against blk2, one row per loop iteration.  Caller contract (see
 * PIX_SAD): %mm7 = 0, %mm6 = 0 and %mm5 = round_tab[1] (per-word +1
 * rounding before the >>1) on entry; total read back with sum_mmx().
 * Used for the x2/y2 half-pel cases via sad8_x2a_mmx / sad8_y2a_mmx. */
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        "movq (%2, %%"REG_a"), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm2 \n\t"
        "movq (%2, %%"REG_a"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpckhbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddw %%mm0, %%mm1 \n\t"
        "paddw %%mm2, %%mm3 \n\t"
        "movq (%3, %%"REG_a"), %%mm4 \n\t"
        "movq (%3, %%"REG_a"), %%mm2 \n\t"
        "paddw %%mm5, %%mm1 \n\t"
        "paddw %%mm5, %%mm3 \n\t"
        "psrlw $1, %%mm1 \n\t"
        "psrlw $1, %%mm3 \n\t"
        "packuswb %%mm3, %%mm1 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm2, %%mm1 \n\t"
        "por %%mm4, %%mm1 \n\t"
        "movq %%mm1, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "paddw %%mm1, %%mm0 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "add %4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
          "r" (stride));
}
353 | ||
f6fa7814 DM |
/* Approximate xy2 (half-pel in both directions) SAD: averages four
 * neighbours — blk1[x], blk1[x+1] and the same pair one row down — with
 * round_tab[2] (+2) before >>2, then SADs against blk2.  The widened
 * sums of the "next" row pair are carried across iterations in
 * %%mm0/%%mm1 so each row pair is only loaded once.  Caller contract
 * (see PIX_SAD): %mm7 = 0 and %mm6 = 0 on entry; total read back with
 * sum_mmx(). */
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddw %%mm2, %%mm0 \n\t"
        "paddw %%mm3, %%mm1 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%2, %%"REG_a"), %%mm2 \n\t"
        "movq 1(%2, %%"REG_a"), %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddw %%mm4, %%mm2 \n\t"
        "paddw %%mm5, %%mm3 \n\t"
        "movq %5, %%mm5 \n\t"
        "paddw %%mm2, %%mm0 \n\t"
        "paddw %%mm3, %%mm1 \n\t"
        "paddw %%mm5, %%mm0 \n\t"
        "paddw %%mm5, %%mm1 \n\t"
        "movq (%3, %%"REG_a"), %%mm4 \n\t"
        "movq (%3, %%"REG_a"), %%mm5 \n\t"
        "psrlw $2, %%mm0 \n\t"
        "psrlw $2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "psubusb %%mm0, %%mm4 \n\t"
        "psubusb %%mm5, %%mm0 \n\t"
        "por %%mm4, %%mm0 \n\t"
        "movq %%mm0, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm4 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "paddw %%mm4, %%mm6 \n\t"
        "movq %%mm2, %%mm0 \n\t"
        "movq %%mm3, %%mm1 \n\t"
        "add %4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
          "r" (stride), "m" (round_tab[2]));
}
407 | ||
/* Fold the four 16-bit partial sums left in %mm6 by the sad8_* helpers
 * into a single scalar and return its low 16 bits.  Relies on %mm6
 * surviving between separate asm blocks (no intervening MMX/FPU use). */
static inline int sum_mmx(void)
{
    int ret;
    __asm__ volatile (
        "movq %%mm6, %%mm0 \n\t"
        "psrlq $32, %%mm6 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "movq %%mm6, %%mm0 \n\t"
        "psrlq $16, %%mm6 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "movd %%mm6, %0 \n\t"
        : "=r" (ret));
    return ret & 0xFFFF;
}
422 | ||
f6fa7814 DM |
/* Half-pel-x SAD: average each pixel with its right neighbour (blk1 + 1)
 * before comparing against blk2. */
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}
428 | ||
f6fa7814 DM |
/* Half-pel-y SAD: average each pixel with the one below (blk1 + stride)
 * before comparing against blk2. */
static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
}
434 | ||
/* Instantiate the public SAD entry points for one cpu suffix.  Each
 * function primes the MMX register contract expected by the sad8_*
 * helpers (%mm6 = accumulator = 0, %mm7 = 0, and %mm5 = round_tab[1]
 * for the averaging x2/y2 variants), runs the matching helper — twice,
 * offset by 8 pixels, for the 16-wide versions — and collects the total
 * with sum_ ## suf().  The 8-wide versions assert h == 8.
 * NOTE(review): MMX state is carried between separate asm statements
 * here; assumes the compiler emits no intervening FPU/MMX code, and no
 * emms is issued — presumably handled elsewhere (verify). */
#define PIX_SAD(suf) \
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                        uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        :); \
 \
    sad8_1_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                           uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
 \
    sad8_x2a_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                           uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
 \
    sad8_y2a_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        ::); \
 \
    sad8_4_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                         uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        :); \
 \
    sad8_1_ ## suf(blk1, blk2, stride, h); \
    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
 \
    sad8_x2a_ ## suf(blk1, blk2, stride, h); \
    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
 \
    sad8_y2a_ ## suf(blk1, blk2, stride, h); \
    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                             uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        ::); \
 \
    sad8_4_ ## suf(blk1, blk2, stride, h); \
    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
} \

PIX_SAD(mmx)
2ba45a60 DM |
553 | |
554 | #endif /* HAVE_INLINE_ASM */ | |
555 | ||
/* Install the x86-optimized motion-estimation compare functions.  The
 * inline-asm MMX versions go in first; external (yasm) MMX/MMXEXT/SSE2/
 * SSSE3 versions then override them when the cpu supports them, so the
 * most capable implementation wins. */
av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
{
    int cpu_flags = av_get_cpu_flags();

#if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0] = sad16_mmx;
        c->sad[1] = sad8_mmx;

        c->vsad[4] = vsad_intra16_mmx;

        /* vsad16_mmx is not bit-exact (approximate accumulation), so it
         * is skipped when bit-exact output is requested. */
        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->vsad[0] = vsad16_mmx;
        }
    }

#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_MMX(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmx;
        c->sse[0] = ff_sse16_mmx;
        c->sse[1] = ff_sse8_mmx;
#if HAVE_YASM
        /* nsse wrappers are C glue around the yasm sse/hf_noise kernels */
        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
#endif
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmxext;

        c->sad[0] = ff_sad16_mmxext;
        c->sad[1] = ff_sad8_mmxext;

        c->pix_abs[0][0] = ff_sad16_mmxext;
        c->pix_abs[0][1] = ff_sad16_x2_mmxext;
        c->pix_abs[0][2] = ff_sad16_y2_mmxext;
        c->pix_abs[1][0] = ff_sad8_mmxext;
        c->pix_abs[1][1] = ff_sad8_x2_mmxext;
        c->pix_abs[1][2] = ff_sad8_y2_mmxext;

        c->vsad[4] = ff_vsad_intra16_mmxext;
        c->vsad[5] = ff_vsad_intra8_mmxext;

        /* the _approx_ variants are only valid without bit-exact mode */
        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
            c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;

            c->vsad[0] = ff_vsad16_approx_mmxext;
            c->vsad[1] = ff_vsad8_approx_mmxext;
        }
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->sse[0] = ff_sse16_sse2;
        c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;

#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
        /* skip SSE2 SAD on cpus where it is slow; Snow is also excluded —
         * NOTE(review): presumably an alignment/usage constraint, verify */
        if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
            c->sad[0]        = ff_sad16_sse2;
            c->pix_abs[0][0] = ff_sad16_sse2;
            c->pix_abs[0][1] = ff_sad16_x2_sse2;
            c->pix_abs[0][2] = ff_sad16_y2_sse2;

            c->vsad[4] = ff_vsad_intra16_sse2;
            if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
                c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
                c->vsad[0]       = ff_vsad16_approx_sse2;
            }
        }
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;
#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
#endif
    }
}
651 | } |