Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * Copyright (C) 2002-2012 Michael Niedermayer | |
3 | * Copyright (C) 2012 Ronald S. Bultje | |
4 | * | |
5 | * This file is part of FFmpeg. | |
6 | * | |
7 | * FFmpeg is free software; you can redistribute it and/or | |
8 | * modify it under the terms of the GNU Lesser General Public | |
9 | * License as published by the Free Software Foundation; either | |
10 | * version 2.1 of the License, or (at your option) any later version. | |
11 | * | |
12 | * FFmpeg is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | * Lesser General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU Lesser General Public | |
18 | * License along with FFmpeg; if not, write to the Free Software | |
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | */ | |
21 | ||
22 | #include "config.h" | |
23 | #include "libavutil/attributes.h" | |
24 | #include "libavutil/avassert.h" | |
25 | #include "libavutil/common.h" | |
26 | #include "libavutil/cpu.h" | |
27 | #include "libavutil/mem.h" | |
28 | #include "libavutil/x86/asm.h" | |
29 | #include "libavutil/x86/cpu.h" | |
30 | #include "libavcodec/videodsp.h" | |
31 | ||
32 | #if HAVE_YASM | |
/*
 * Signatures of the external vertical edge-extension routines.
 *
 * emu_edge_vfix_func: one routine per fixed copy width. emulated_edge_mc()
 * picks it as vfix_tbl[width - 1] for widths up to 22 and passes the
 * destination pointer/stride, the source pointer/stride, the row range
 * [start_y, end_y) it computed for the block, and the block height (bh).
 *
 * emu_edge_vvar_func: same arguments plus the copy width (w); called by
 * emulated_edge_mc() when the width is larger than 22.
 *
 * NOTE(review): the routines are defined outside this file; what exactly
 * they write for rows outside [start_y, end_y) is not visible here.
 */
33 | typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride, | |
34 | const uint8_t *src, x86_reg src_stride, | |
35 | x86_reg start_y, x86_reg end_y, x86_reg bh); | |
36 | typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride, | |
37 | const uint8_t *src, x86_reg src_stride, | |
38 | x86_reg start_y, x86_reg end_y, x86_reg bh, | |
39 | x86_reg w); | |
40 | ||
41 | extern emu_edge_vfix_func ff_emu_edge_vfix1_mmx; | |
42 | extern emu_edge_vfix_func ff_emu_edge_vfix2_mmx; | |
43 | extern emu_edge_vfix_func ff_emu_edge_vfix3_mmx; | |
44 | extern emu_edge_vfix_func ff_emu_edge_vfix4_mmx; | |
45 | extern emu_edge_vfix_func ff_emu_edge_vfix5_mmx; | |
46 | extern emu_edge_vfix_func ff_emu_edge_vfix6_mmx; | |
47 | extern emu_edge_vfix_func ff_emu_edge_vfix7_mmx; | |
48 | extern emu_edge_vfix_func ff_emu_edge_vfix8_mmx; | |
49 | extern emu_edge_vfix_func ff_emu_edge_vfix9_mmx; | |
50 | extern emu_edge_vfix_func ff_emu_edge_vfix10_mmx; | |
51 | extern emu_edge_vfix_func ff_emu_edge_vfix11_mmx; | |
52 | extern emu_edge_vfix_func ff_emu_edge_vfix12_mmx; | |
53 | extern emu_edge_vfix_func ff_emu_edge_vfix13_mmx; | |
54 | extern emu_edge_vfix_func ff_emu_edge_vfix14_mmx; | |
55 | extern emu_edge_vfix_func ff_emu_edge_vfix15_mmx; | |
56 | extern emu_edge_vfix_func ff_emu_edge_vfix16_mmx; | |
57 | extern emu_edge_vfix_func ff_emu_edge_vfix17_mmx; | |
58 | extern emu_edge_vfix_func ff_emu_edge_vfix18_mmx; | |
59 | extern emu_edge_vfix_func ff_emu_edge_vfix19_mmx; | |
60 | extern emu_edge_vfix_func ff_emu_edge_vfix20_mmx; | |
61 | extern emu_edge_vfix_func ff_emu_edge_vfix21_mmx; | |
62 | extern emu_edge_vfix_func ff_emu_edge_vfix22_mmx; | |
#if ARCH_X86_32
/*
 * MMX fixed-width vertical extension routines (32-bit x86 builds only).
 * emulated_edge_mc() indexes this table with (width - 1) for copy widths
 * 1..22, so entry i is the routine for a width of i + 1.
 *
 * Entries are written as plain function designators (no '&'), like the
 * other dispatch tables in this file. A function designator converts to a
 * pointer to the function, so the stored values are the same either way.
 */
static emu_edge_vfix_func * const vfixtbl_mmx[22] = {
    ff_emu_edge_vfix1_mmx,  ff_emu_edge_vfix2_mmx,  ff_emu_edge_vfix3_mmx,
    ff_emu_edge_vfix4_mmx,  ff_emu_edge_vfix5_mmx,  ff_emu_edge_vfix6_mmx,
    ff_emu_edge_vfix7_mmx,  ff_emu_edge_vfix8_mmx,  ff_emu_edge_vfix9_mmx,
    ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx,
    ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx,
    ff_emu_edge_vfix16_mmx, ff_emu_edge_vfix17_mmx, ff_emu_edge_vfix18_mmx,
    ff_emu_edge_vfix19_mmx, ff_emu_edge_vfix20_mmx, ff_emu_edge_vfix21_mmx,
    ff_emu_edge_vfix22_mmx
};
#endif
75 | extern emu_edge_vvar_func ff_emu_edge_vvar_mmx; | |
76 | extern emu_edge_vfix_func ff_emu_edge_vfix16_sse; | |
77 | extern emu_edge_vfix_func ff_emu_edge_vfix17_sse; | |
78 | extern emu_edge_vfix_func ff_emu_edge_vfix18_sse; | |
79 | extern emu_edge_vfix_func ff_emu_edge_vfix19_sse; | |
80 | extern emu_edge_vfix_func ff_emu_edge_vfix20_sse; | |
81 | extern emu_edge_vfix_func ff_emu_edge_vfix21_sse; | |
82 | extern emu_edge_vfix_func ff_emu_edge_vfix22_sse; | |
83 | static emu_edge_vfix_func * const vfixtbl_sse[22] = { | |
84 | ff_emu_edge_vfix1_mmx, ff_emu_edge_vfix2_mmx, ff_emu_edge_vfix3_mmx, | |
85 | ff_emu_edge_vfix4_mmx, ff_emu_edge_vfix5_mmx, ff_emu_edge_vfix6_mmx, | |
86 | ff_emu_edge_vfix7_mmx, ff_emu_edge_vfix8_mmx, ff_emu_edge_vfix9_mmx, | |
87 | ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx, | |
88 | ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx, | |
89 | ff_emu_edge_vfix16_sse, ff_emu_edge_vfix17_sse, ff_emu_edge_vfix18_sse, | |
90 | ff_emu_edge_vfix19_sse, ff_emu_edge_vfix20_sse, ff_emu_edge_vfix21_sse, | |
91 | ff_emu_edge_vfix22_sse | |
92 | }; | |
93 | extern emu_edge_vvar_func ff_emu_edge_vvar_sse; | |
94 | ||
/*
 * Signatures of the external horizontal edge-extension routines.
 *
 * emu_edge_hfix_func: one routine per pair of fill widths. For a fill of
 * n columns (n <= 22) emulated_edge_mc() calls hfix_tbl[(n - 1) >> 1] with
 * the destination pointer/stride, a start_x value and the block height.
 * For the left edge start_x is the number of columns to fill; for the
 * right edge the caller passes 0 or -1 depending on the parity of n.
 *
 * emu_edge_hvar_func: used for fills wider than 22 columns; takes the same
 * arguments plus n_words, which the caller computes as (n + 1) >> 1.
 *
 * NOTE(review): the routines are defined outside this file; the meaning
 * they give to start_x and n_words is inferred from the call sites only.
 */
95 | typedef void emu_edge_hfix_func(uint8_t *dst, x86_reg dst_stride, | |
96 | x86_reg start_x, x86_reg bh); | |
97 | typedef void emu_edge_hvar_func(uint8_t *dst, x86_reg dst_stride, | |
98 | x86_reg start_x, x86_reg n_words, x86_reg bh); | |
99 | ||
100 | extern emu_edge_hfix_func ff_emu_edge_hfix2_mmx; | |
101 | extern emu_edge_hfix_func ff_emu_edge_hfix4_mmx; | |
102 | extern emu_edge_hfix_func ff_emu_edge_hfix6_mmx; | |
103 | extern emu_edge_hfix_func ff_emu_edge_hfix8_mmx; | |
104 | extern emu_edge_hfix_func ff_emu_edge_hfix10_mmx; | |
105 | extern emu_edge_hfix_func ff_emu_edge_hfix12_mmx; | |
106 | extern emu_edge_hfix_func ff_emu_edge_hfix14_mmx; | |
107 | extern emu_edge_hfix_func ff_emu_edge_hfix16_mmx; | |
108 | extern emu_edge_hfix_func ff_emu_edge_hfix18_mmx; | |
109 | extern emu_edge_hfix_func ff_emu_edge_hfix20_mmx; | |
110 | extern emu_edge_hfix_func ff_emu_edge_hfix22_mmx; | |
#if ARCH_X86_32
/*
 * MMX fixed-width horizontal extension routines (32-bit x86 builds only).
 * emulated_edge_mc() looks up hfix_tbl[(n - 1) >> 1] for a fill of n
 * columns, so slot i serves n = 2 * i + 1 and n = 2 * i + 2.
 */
static emu_edge_hfix_func * const hfixtbl_mmx[11] = {
    [ 0] = ff_emu_edge_hfix2_mmx,
    [ 1] = ff_emu_edge_hfix4_mmx,
    [ 2] = ff_emu_edge_hfix6_mmx,
    [ 3] = ff_emu_edge_hfix8_mmx,
    [ 4] = ff_emu_edge_hfix10_mmx,
    [ 5] = ff_emu_edge_hfix12_mmx,
    [ 6] = ff_emu_edge_hfix14_mmx,
    [ 7] = ff_emu_edge_hfix16_mmx,
    [ 8] = ff_emu_edge_hfix18_mmx,
    [ 9] = ff_emu_edge_hfix20_mmx,
    [10] = ff_emu_edge_hfix22_mmx,
};
#endif
119 | extern emu_edge_hvar_func ff_emu_edge_hvar_mmx; | |
120 | extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2; | |
121 | extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2; | |
122 | extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2; | |
123 | extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2; | |
124 | static emu_edge_hfix_func * const hfixtbl_sse2[11] = { | |
125 | ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx, | |
126 | ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx, | |
127 | ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2, | |
128 | ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2 | |
129 | }; | |
130 | extern emu_edge_hvar_func ff_emu_edge_hvar_sse2; | |
f6fa7814 DM |
131 | #if HAVE_AVX2_EXTERNAL |
132 | extern emu_edge_hfix_func ff_emu_edge_hfix8_avx2; | |
133 | extern emu_edge_hfix_func ff_emu_edge_hfix10_avx2; | |
134 | extern emu_edge_hfix_func ff_emu_edge_hfix12_avx2; | |
135 | extern emu_edge_hfix_func ff_emu_edge_hfix14_avx2; | |
136 | extern emu_edge_hfix_func ff_emu_edge_hfix16_avx2; | |
137 | extern emu_edge_hfix_func ff_emu_edge_hfix18_avx2; | |
138 | extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2; | |
139 | extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2; | |
140 | static emu_edge_hfix_func * const hfixtbl_avx2[11] = { | |
141 | ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx, | |
142 | ff_emu_edge_hfix8_avx2, ff_emu_edge_hfix10_avx2, ff_emu_edge_hfix12_avx2, | |
143 | ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2, ff_emu_edge_hfix18_avx2, | |
144 | ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2 | |
145 | }; | |
146 | extern emu_edge_hvar_func ff_emu_edge_hvar_avx2; | |
147 | #endif | |
2ba45a60 DM |
148 | |
149 | static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src, | |
150 | ptrdiff_t dst_stride, | |
151 | ptrdiff_t src_stride, | |
152 | x86_reg block_w, x86_reg block_h, | |
153 | x86_reg src_x, x86_reg src_y, | |
154 | x86_reg w, x86_reg h, | |
f6fa7814 | 155 | emu_edge_vfix_func * const *vfix_tbl, |
2ba45a60 | 156 | emu_edge_vvar_func *v_extend_var, |
f6fa7814 | 157 | emu_edge_hfix_func * const *hfix_tbl, |
2ba45a60 DM |
158 | emu_edge_hvar_func *h_extend_var) |
159 | { | |
160 | x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p; | |
161 | ||
162 | if (!w || !h) | |
163 | return; | |
164 | ||
165 | if (src_y >= h) { | |
166 | src -= src_y*src_stride; | |
167 | src_y_add = h - 1; | |
168 | src_y = h - 1; | |
169 | } else if (src_y <= -block_h) { | |
170 | src -= src_y*src_stride; | |
171 | src_y_add = 1 - block_h; | |
172 | src_y = 1 - block_h; | |
173 | } | |
174 | if (src_x >= w) { | |
175 | src += w - 1 - src_x; | |
176 | src_x = w - 1; | |
177 | } else if (src_x <= -block_w) { | |
178 | src += 1 - block_w - src_x; | |
179 | src_x = 1 - block_w; | |
180 | } | |
181 | ||
182 | start_y = FFMAX(0, -src_y); | |
183 | start_x = FFMAX(0, -src_x); | |
184 | end_y = FFMIN(block_h, h-src_y); | |
185 | end_x = FFMIN(block_w, w-src_x); | |
186 | av_assert2(start_x < end_x && block_w > 0); | |
187 | av_assert2(start_y < end_y && block_h > 0); | |
188 | ||
189 | // fill in the to-be-copied part plus all above/below | |
190 | src += (src_y_add + start_y) * src_stride + start_x; | |
191 | w = end_x - start_x; | |
192 | if (w <= 22) { | |
193 | vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride, | |
194 | start_y, end_y, block_h); | |
195 | } else { | |
196 | v_extend_var(dst + start_x, dst_stride, src, src_stride, | |
197 | start_y, end_y, block_h, w); | |
198 | } | |
199 | ||
200 | // fill left | |
201 | if (start_x) { | |
202 | if (start_x <= 22) { | |
203 | hfix_tbl[(start_x - 1) >> 1](dst, dst_stride, start_x, block_h); | |
204 | } else { | |
205 | h_extend_var(dst, dst_stride, | |
206 | start_x, (start_x + 1) >> 1, block_h); | |
207 | } | |
208 | } | |
209 | ||
210 | // fill right | |
211 | p = block_w - end_x; | |
212 | if (p) { | |
213 | if (p <= 22) { | |
214 | hfix_tbl[(p - 1) >> 1](dst + end_x - (p & 1), dst_stride, | |
215 | -!(p & 1), block_h); | |
216 | } else { | |
217 | h_extend_var(dst + end_x - (p & 1), dst_stride, | |
218 | -!(p & 1), (p + 1) >> 1, block_h); | |
219 | } | |
220 | } | |
221 | } | |
222 | ||
223 | #if ARCH_X86_32 | |
224 | static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, | |
225 | ptrdiff_t buf_stride, | |
226 | ptrdiff_t src_stride, | |
227 | int block_w, int block_h, | |
228 | int src_x, int src_y, int w, int h) | |
229 | { | |
230 | emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, | |
231 | src_x, src_y, w, h, vfixtbl_mmx, &ff_emu_edge_vvar_mmx, | |
232 | hfixtbl_mmx, &ff_emu_edge_hvar_mmx); | |
233 | } | |
234 | ||
235 | static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, | |
236 | ptrdiff_t buf_stride, | |
237 | ptrdiff_t src_stride, | |
238 | int block_w, int block_h, | |
239 | int src_x, int src_y, int w, int h) | |
240 | { | |
241 | emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, | |
242 | src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse, | |
243 | hfixtbl_mmx, &ff_emu_edge_hvar_mmx); | |
244 | } | |
245 | #endif | |
246 | ||
247 | static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src, | |
248 | ptrdiff_t buf_stride, | |
249 | ptrdiff_t src_stride, | |
250 | int block_w, int block_h, | |
251 | int src_x, int src_y, int w, | |
252 | int h) | |
253 | { | |
254 | emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, | |
255 | src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse, | |
256 | hfixtbl_sse2, &ff_emu_edge_hvar_sse2); | |
257 | } | |
f6fa7814 DM |
258 | |
259 | #if HAVE_AVX2_EXTERNAL | |
260 | static av_noinline void emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src, | |
261 | ptrdiff_t buf_stride, | |
262 | ptrdiff_t src_stride, | |
263 | int block_w, int block_h, | |
264 | int src_x, int src_y, int w, | |
265 | int h) | |
266 | { | |
267 | emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, | |
268 | src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse, | |
269 | hfixtbl_avx2, &ff_emu_edge_hvar_avx2); | |
270 | } | |
271 | #endif /* HAVE_AVX2_EXTERNAL */ | |
2ba45a60 DM |
272 | #endif /* HAVE_YASM */ |
273 | ||
274 | void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h); | |
275 | void ff_prefetch_3dnow(uint8_t *buf, ptrdiff_t stride, int h); | |
276 | ||
277 | av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc) | |
278 | { | |
279 | #if HAVE_YASM | |
280 | int cpu_flags = av_get_cpu_flags(); | |
281 | ||
282 | #if ARCH_X86_32 | |
283 | if (EXTERNAL_MMX(cpu_flags) && bpc <= 8) { | |
284 | ctx->emulated_edge_mc = emulated_edge_mc_mmx; | |
285 | } | |
286 | if (EXTERNAL_AMD3DNOW(cpu_flags)) { | |
287 | ctx->prefetch = ff_prefetch_3dnow; | |
288 | } | |
289 | #endif /* ARCH_X86_32 */ | |
290 | if (EXTERNAL_MMXEXT(cpu_flags)) { | |
291 | ctx->prefetch = ff_prefetch_mmxext; | |
292 | } | |
293 | #if ARCH_X86_32 | |
294 | if (EXTERNAL_SSE(cpu_flags) && bpc <= 8) { | |
295 | ctx->emulated_edge_mc = emulated_edge_mc_sse; | |
296 | } | |
297 | #endif /* ARCH_X86_32 */ | |
298 | if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) { | |
299 | ctx->emulated_edge_mc = emulated_edge_mc_sse2; | |
300 | } | |
f6fa7814 DM |
301 | #if HAVE_AVX2_EXTERNAL |
302 | if (EXTERNAL_AVX2(cpu_flags) && bpc <= 8) { | |
303 | ctx->emulated_edge_mc = emulated_edge_mc_avx2; | |
304 | } | |
305 | #endif | |
2ba45a60 DM |
306 | #endif /* HAVE_YASM */ |
307 | } |