Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * SIMD-optimized halfpel functions | |
3 | * Copyright (c) 2000, 2001 Fabrice Bellard | |
4 | * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> | |
5 | * | |
6 | * This file is part of FFmpeg. | |
7 | * | |
8 | * FFmpeg is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License as published by the Free Software Foundation; either | |
11 | * version 2.1 of the License, or (at your option) any later version. | |
12 | * | |
13 | * FFmpeg is distributed in the hope that it will be useful, | |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 | * Lesser General Public License for more details. | |
17 | * | |
18 | * You should have received a copy of the GNU Lesser General Public | |
19 | * License along with FFmpeg; if not, write to the Free Software | |
20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 | * | |
22 | * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
23 | */ | |
24 | ||
25 | #include "libavutil/attributes.h" | |
26 | #include "libavutil/cpu.h" | |
27 | #include "libavutil/x86/asm.h" | |
28 | #include "libavutil/x86/cpu.h" | |
29 | #include "libavcodec/avcodec.h" | |
30 | #include "libavcodec/hpeldsp.h" | |
31 | #include "libavcodec/pixels.h" | |
32 | #include "fpel.h" | |
33 | #include "hpeldsp.h" | |
34 | ||
/*
 * Prototypes for half-pel routines that are referenced in this file but not
 * defined in it (presumably provided by the external assembly objects --
 * confirm against the build).  Every routine has the same signature:
 * (block, pixels, line_size, h).
 */

/* put, 8 pixels wide */
void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);

/* put, 16 pixels wide */
void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);

/* put without rounding, 8 pixels wide */
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);

/* put without rounding, "exact" variants (selected in bit-exact mode) */
void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
                                           const uint8_t *pixels,
                                           ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
                                          const uint8_t *pixels,
                                          ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
                                           const uint8_t *pixels,
                                           ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
                                          const uint8_t *pixels,
                                          ptrdiff_t line_size, int h);

/* avg, 8 pixels wide */
void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);

/* avg, "approx" xy2 variants (selected when bit-exactness is not requested) */
void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
                                      ptrdiff_t line_size, int h);
void ff_avg_approx_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);

/* avg, 16 pixels wide */
void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
91 | ||
/*
 * Map the unprefixed *_mmx names onto ff_-prefixed symbols, so that names
 * built by token pasting (see SET_HPEL_FUNCS) resolve to those symbols.
 */

/* put */
#define put_pixels8_mmx          ff_put_pixels8_mmx
#define put_pixels16_mmx         ff_put_pixels16_mmx
#define put_pixels8_xy2_mmx      ff_put_pixels8_xy2_mmx
#define put_pixels16_xy2_mmx     ff_put_pixels16_xy2_mmx

/* avg */
#define avg_pixels8_mmx          ff_avg_pixels8_mmx
#define avg_pixels16_mmx         ff_avg_pixels16_mmx
#define avg_pixels8_x2_mmx       ff_avg_pixels8_x2_mmx
#define avg_pixels8_xy2_mmx      ff_avg_pixels8_xy2_mmx
#define avg_pixels16_xy2_mmx     ff_avg_pixels16_xy2_mmx

/* the full-pel no_rnd names reuse the same symbols as the plain names */
#define put_no_rnd_pixels8_mmx   ff_put_pixels8_mmx
#define put_no_rnd_pixels16_mmx  ff_put_pixels16_mmx
#define avg_no_rnd_pixels16_mmx  ff_avg_pixels16_mmx
104 | ||
#if HAVE_INLINE_ASM

/*
 * The template sources below are included more than once, each time with a
 * different set of parameter macros:
 *   DEF(x, y)     - builds the name of each generated function
 *   SET_RND       - MOVQ_WONE (no rounding) or MOVQ_WTWO (rounding)
 *   PAVGBP/PAVGB  - the byte-average helper macros to use
 *   STATIC        - text placed where a storage class would go
 *                   (presumably consumed by rnd_template.c -- confirm there)
 */

/* ---- MMX, no rounding: names are x_no_rnd_y_mmx ---- */
#define STATIC                          static
#define DEF(x, y)                       x ## _no_rnd_ ## y ## _mmx
#define SET_RND                         MOVQ_WONE
#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)

#include "rnd_template.c"
#include "hpeldsp_rnd_template.c"

#undef STATIC
#undef PAVGB
#undef PAVGBP
#undef SET_RND
#undef DEF

/* 16-wide no-rounding wrappers built from the 8-wide functions above */
CALL_2X_PIXELS(put_no_rnd_pixels16_y2_mmx, put_no_rnd_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(avg_no_rnd_pixels16_y2_mmx, avg_no_rnd_pixels8_y2_mmx, 8)

CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8)

/* ---- MMX, rounding: names are x_y_mmx ---- */
#define SET_RND                         MOVQ_WTWO
#define DEF(x, y)                       x ## _ ## y ## _mmx
#define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)

#include "hpeldsp_rnd_template.c"

/*
 * rnd_template.c is instantiated with ff_-prefixed names and an empty
 * STATIC for the rounding case.
 */
#undef DEF
#define DEF(x, y)                       ff_ ## x ## _ ## y ## _mmx
#define STATIC

#include "rnd_template.c"

#undef PAVGB
#undef PAVGBP
#undef SET_RND
#undef DEF

/* 16-wide rounding wrappers; the xy2 pair uses the _EXPORT macro variant */
CALL_2X_PIXELS(put_pixels16_y2_mmx, put_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(avg_pixels16_y2_mmx, avg_pixels8_y2_mmx, 8)

CALL_2X_PIXELS_EXPORT(ff_put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS_EXPORT(ff_avg_pixels16_xy2_mmx, ff_avg_pixels8_xy2_mmx, 8)

#endif /* HAVE_INLINE_ASM */
158 | ||
159 | ||
#if HAVE_YASM

/*
 * Emit the 16-pixel-wide wrappers for one CPU extension suffix (for example
 * _mmxext).  Each line hands CALL_2X_PIXELS the name of the 16-wide function
 * to create, the 8-wide ff_ routine it is built from, and the constant 8
 * (presumably the byte offset of the second 8-wide call -- see the
 * CALL_2X_PIXELS definition).
 */
#define HPELDSP_AVG_PIXELS16(CPUEXT)                                                          \
    CALL_2X_PIXELS(put_pixels16_y2         ## CPUEXT, ff_put_pixels8_y2         ## CPUEXT, 8) \
    CALL_2X_PIXELS(put_no_rnd_pixels16_x2  ## CPUEXT, ff_put_no_rnd_pixels8_x2  ## CPUEXT, 8) \
    CALL_2X_PIXELS(put_no_rnd_pixels16_y2  ## CPUEXT, ff_put_no_rnd_pixels8_y2  ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_pixels16            ## CPUEXT, ff_avg_pixels8            ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_pixels16_x2         ## CPUEXT, ff_avg_pixels8_x2         ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_pixels16_y2         ## CPUEXT, ff_avg_pixels8_y2         ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_pixels16_xy2        ## CPUEXT, ff_avg_pixels8_xy2        ## CPUEXT, 8) \
    CALL_2X_PIXELS(avg_approx_pixels16_xy2 ## CPUEXT, ff_avg_approx_pixels8_xy2 ## CPUEXT, 8)

HPELDSP_AVG_PIXELS16(_mmxext)
HPELDSP_AVG_PIXELS16(_3dnow)

#endif /* HAVE_YASM */
176 | ||
/*
 * SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)
 *
 * Store PFX_pixels<SIZE>_<CPU> into slot [0] of c->PFX_pixels_tab IDX, but
 * only when HAVE_MMX_EXTERNAL is non-zero.  IDX is either a bracketed index
 * such as [0] or empty (for a one-dimensional table).  Expects a variable
 * named c to be in scope at the point of use.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement: it needs a trailing semicolon at the call site and can
 * safely form the body of an if/else.
 */
#define SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                                 \
    do {                                                                        \
        if (HAVE_MMX_EXTERNAL)                                                  \
            c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
    } while (0)

/*
 * SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)
 *
 * Fill one row of a function table.  Slot [0] is handled by
 * SET_HPEL_FUNCS_EXT.  When HAVE_MMX_INLINE is set, slots [1], [2] and [3]
 * additionally receive the _x2_, _y2_ and _xy2_ variants; otherwise only
 * slot [0] is touched.
 */
#if HAVE_MMX_INLINE
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
    do {                                                                        \
        SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU);                                \
        c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
        c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
        c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
    } while (0)
#else
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
    do {                                                                        \
        SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU);                                \
    } while (0)
#endif
195 | ||
196 | static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int cpu_flags) | |
197 | { | |
198 | SET_HPEL_FUNCS(put, [0], 16, mmx); | |
199 | SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx); | |
200 | SET_HPEL_FUNCS(avg, [0], 16, mmx); | |
201 | SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx); | |
202 | SET_HPEL_FUNCS(put, [1], 8, mmx); | |
203 | SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx); | |
204 | if (HAVE_MMX_EXTERNAL) { | |
205 | c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmx; | |
206 | c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmx; | |
207 | } | |
208 | #if HAVE_MMX_INLINE | |
209 | c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx; | |
210 | c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmx; | |
211 | #endif | |
212 | } | |
213 | ||
214 | static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags) | |
215 | { | |
216 | #if HAVE_MMXEXT_EXTERNAL | |
217 | c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext; | |
218 | c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext; | |
219 | ||
220 | c->avg_pixels_tab[0][0] = avg_pixels16_mmxext; | |
221 | c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext; | |
222 | c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext; | |
223 | c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext; | |
224 | ||
225 | c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext; | |
226 | c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext; | |
227 | ||
228 | c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext; | |
229 | c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext; | |
230 | c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext; | |
231 | c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext; | |
232 | ||
233 | if (!(flags & CODEC_FLAG_BITEXACT)) { | |
234 | c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext; | |
235 | c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext; | |
236 | c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext; | |
237 | c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext; | |
238 | ||
239 | c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_mmxext; | |
240 | c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_mmxext; | |
241 | } | |
242 | ||
243 | if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) { | |
244 | c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext; | |
245 | c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext; | |
246 | } | |
247 | #endif /* HAVE_MMXEXT_EXTERNAL */ | |
248 | } | |
249 | ||
250 | static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags) | |
251 | { | |
252 | #if HAVE_AMD3DNOW_EXTERNAL | |
253 | c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow; | |
254 | c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow; | |
255 | ||
256 | c->avg_pixels_tab[0][0] = avg_pixels16_3dnow; | |
257 | c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow; | |
258 | c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow; | |
259 | c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; | |
260 | ||
261 | c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow; | |
262 | c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow; | |
263 | ||
264 | c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow; | |
265 | c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow; | |
266 | c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow; | |
267 | c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow; | |
268 | ||
269 | if (!(flags & CODEC_FLAG_BITEXACT)){ | |
270 | c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow; | |
271 | c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow; | |
272 | c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow; | |
273 | c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow; | |
274 | ||
275 | c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_3dnow; | |
276 | c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_3dnow; | |
277 | } | |
278 | ||
279 | if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) { | |
280 | c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow; | |
281 | c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow; | |
282 | } | |
283 | #endif /* HAVE_AMD3DNOW_EXTERNAL */ | |
284 | } | |
285 | ||
286 | static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int cpu_flags) | |
287 | { | |
288 | #if HAVE_SSE2_EXTERNAL | |
289 | if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) { | |
290 | // these functions are slower than mmx on AMD, but faster on Intel | |
291 | c->put_pixels_tab[0][0] = ff_put_pixels16_sse2; | |
292 | c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2; | |
293 | c->put_pixels_tab[0][1] = ff_put_pixels16_x2_sse2; | |
294 | c->put_pixels_tab[0][2] = ff_put_pixels16_y2_sse2; | |
295 | c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_sse2; | |
296 | c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2; | |
297 | c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2; | |
298 | c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2; | |
299 | c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_sse2; | |
300 | } | |
301 | #endif /* HAVE_SSE2_EXTERNAL */ | |
302 | } | |
303 | ||
304 | static void hpeldsp_init_ssse3(HpelDSPContext *c, int flags, int cpu_flags) | |
305 | { | |
306 | #if HAVE_SSSE3_EXTERNAL | |
307 | c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_ssse3; | |
308 | c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_ssse3; | |
309 | c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_ssse3; | |
310 | c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_ssse3; | |
311 | #endif | |
312 | } | |
313 | ||
314 | av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags) | |
315 | { | |
316 | int cpu_flags = av_get_cpu_flags(); | |
317 | ||
318 | if (INLINE_MMX(cpu_flags)) | |
319 | hpeldsp_init_mmx(c, flags, cpu_flags); | |
320 | ||
321 | if (EXTERNAL_AMD3DNOW(cpu_flags)) | |
322 | hpeldsp_init_3dnow(c, flags, cpu_flags); | |
323 | ||
324 | if (EXTERNAL_MMXEXT(cpu_flags)) | |
325 | hpeldsp_init_mmxext(c, flags, cpu_flags); | |
326 | ||
327 | if (EXTERNAL_SSE2(cpu_flags)) | |
328 | hpeldsp_init_sse2(c, flags, cpu_flags); | |
329 | ||
330 | if (EXTERNAL_SSSE3(cpu_flags)) | |
331 | hpeldsp_init_ssse3(c, flags, cpu_flags); | |
332 | } |