/* Reconstructed from a blame-table export (commit 2ba45a60); table markup removed. */
1 | /* |
2 | * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt | |
3 | * | |
4 | * This file is part of FFmpeg. | |
5 | * | |
6 | * FFmpeg is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU Lesser General Public | |
8 | * License as published by the Free Software Foundation; either | |
9 | * version 2.1 of the License, or (at your option) any later version. | |
10 | * | |
11 | * FFmpeg is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | * Lesser General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU Lesser General Public | |
17 | * License along with FFmpeg; if not, write to the Free Software | |
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
19 | */ | |
20 | ||
21 | #include "libavutil/attributes.h" | |
22 | #include "libavutil/cpu.h" | |
23 | #include "libavutil/x86/asm.h" | |
24 | #include "libavutil/x86/cpu.h" | |
25 | #include "libavcodec/h264dsp.h" | |
26 | ||
/***********************************/
/* IDCT */

/* Declare an asm IDCT+add routine: NUM selects the variant ("" = 4x4,
 * "_dc" = 4x4 DC-only, "8" = 8x8, "8_dc" = 8x8 DC-only), DEPTH the luma
 * bit depth (8 or 10) and OPT the instruction-set suffix. */
#define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \
                                                       int16_t *block, \
                                                       int stride);

IDCT_ADD_FUNC(, 8, mmx)
IDCT_ADD_FUNC(, 10, sse2)
IDCT_ADD_FUNC(_dc, 8, mmxext)
IDCT_ADD_FUNC(_dc, 10, mmxext)
IDCT_ADD_FUNC(8_dc, 8, mmxext)
IDCT_ADD_FUNC(8_dc, 10, sse2)
IDCT_ADD_FUNC(8, 8, mmx)
IDCT_ADD_FUNC(8, 8, sse2)
IDCT_ADD_FUNC(8, 10, sse2)
IDCT_ADD_FUNC(, 10, avx)
IDCT_ADD_FUNC(8_dc, 10, avx)
IDCT_ADD_FUNC(8, 10, avx)
46 | ||
47 | ||
/* Declare an asm routine that IDCT+adds REP sub-blocks of one plane,
 * selected by block_offset and skipped per the nnzc (non-zero-count)
 * table. dst is a single plane pointer. */
#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
    (uint8_t *dst, const int *block_offset, \
     int16_t *block, int stride, const uint8_t nnzc[6 * 8]);

IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
IDCT_ADD_REP_FUNC(8, 4, 8, mmxext)
IDCT_ADD_REP_FUNC(8, 4, 8, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, avx)
IDCT_ADD_REP_FUNC(, 16, 8, mmx)
IDCT_ADD_REP_FUNC(, 16, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16, 8, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16intra, 8, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, avx)
IDCT_ADD_REP_FUNC(, 16intra, 10, avx)


/* Same as IDCT_ADD_REP_FUNC but dst is an array of plane pointers
 * (used for the add8 chroma entry points). */
#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
    (uint8_t **dst, const int *block_offset, \
     int16_t *block, int stride, const uint8_t nnzc[6 * 8]);

IDCT_ADD_REP_FUNC2(, 8, 8, mmx)
IDCT_ADD_REP_FUNC2(, 8, 8, mmxext)
IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, avx)

/* Luma DC dequant + IDCT (asm); qmul is the dequant multiplier. */
void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul);
void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul);
83 | ||
/***********************************/
/* deblocking */

void ff_h264_loop_filter_strength_mmxext(int16_t bS[2][4][4], uint8_t nnz[40],
                                         int8_t ref[2][40],
                                         int16_t mv[2][40][2],
                                         int bidir, int edges, int step,
                                         int mask_mv0, int mask_mv1, int field);

/* Declare an asm deblock filter taking per-edge tc0 thresholds. */
#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
                                                               int stride, \
                                                               int alpha, \
                                                               int beta, \
                                                               int8_t *tc0);
/* Intra variant: no tc0 argument. */
#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
                                                               int stride, \
                                                               int alpha, \
                                                               int beta);

/* Declare the full set of deblock filters for one bit depth.
 * NOTE(review): the `type` parameter is unused by the expansion and
 * appears to be purely informational. */
#define LF_FUNCS(type, depth) \
LF_FUNC(h, chroma, depth, mmxext)        \
LF_IFUNC(h, chroma_intra, depth, mmxext) \
LF_FUNC(v, chroma, depth, mmxext)        \
LF_IFUNC(v, chroma_intra, depth, mmxext) \
LF_FUNC(h, luma, depth, mmxext)          \
LF_IFUNC(h, luma_intra, depth, mmxext)   \
LF_FUNC(h, luma, depth, sse2)            \
LF_IFUNC(h, luma_intra, depth, sse2)     \
LF_FUNC(v, luma, depth, sse2)            \
LF_IFUNC(v, luma_intra, depth, sse2)     \
LF_FUNC(h, chroma, depth, sse2)          \
LF_IFUNC(h, chroma_intra, depth, sse2)   \
LF_FUNC(v, chroma, depth, sse2)          \
LF_IFUNC(v, chroma_intra, depth, sse2)   \
LF_FUNC(h, luma, depth, avx)             \
LF_IFUNC(h, luma_intra, depth, avx)      \
LF_FUNC(v, luma, depth, avx)             \
LF_IFUNC(v, luma_intra, depth, avx)      \
LF_FUNC(h, chroma, depth, avx)           \
LF_IFUNC(h, chroma_intra, depth, avx)    \
LF_FUNC(v, chroma, depth, avx)           \
LF_IFUNC(v, chroma_intra, depth, avx)

LF_FUNCS(uint8_t, 8)
LF_FUNCS(uint16_t, 10)
131 | ||
132 | #if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL | |
133 | LF_FUNC(v8, luma, 8, mmxext) | |
134 | static void deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha, | |
135 | int beta, int8_t *tc0) | |
136 | { | |
137 | if ((tc0[0] & tc0[1]) >= 0) | |
138 | ff_deblock_v8_luma_8_mmxext(pix + 0, stride, alpha, beta, tc0); | |
139 | if ((tc0[2] & tc0[3]) >= 0) | |
140 | ff_deblock_v8_luma_8_mmxext(pix + 8, stride, alpha, beta, tc0 + 2); | |
141 | } | |
142 | LF_IFUNC(v8, luma_intra, 8, mmxext) | |
143 | static void deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride, | |
144 | int alpha, int beta) | |
145 | { | |
146 | ff_deblock_v8_luma_intra_8_mmxext(pix + 0, stride, alpha, beta); | |
147 | ff_deblock_v8_luma_intra_8_mmxext(pix + 8, stride, alpha, beta); | |
148 | } | |
149 | #endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */ | |
150 | ||
/* 10-bit vertical luma filters also exist for plain mmxext (32-bit path). */
LF_FUNC(v, luma, 10, mmxext)
LF_IFUNC(v, luma_intra, 10, mmxext)

/***********************************/
/* weighted prediction */

/* Declare an 8-bit unidirectional weighted-prediction asm routine for a
 * W-pixel-wide block. */
#define H264_WEIGHT(W, OPT) \
void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, int stride, \
                                      int height, int log2_denom, \
                                      int weight, int offset);

/* Bidirectional (two-source) variant. */
#define H264_BIWEIGHT(W, OPT) \
void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src, \
                                        int stride, int height, \
                                        int log2_denom, int weightd, \
                                        int weights, int offset);

#define H264_BIWEIGHT_MMX(W) \
H264_WEIGHT(W, mmxext) \
H264_BIWEIGHT(W, mmxext)

#define H264_BIWEIGHT_MMX_SSE(W) \
H264_BIWEIGHT_MMX(W) \
H264_WEIGHT(W, sse2) \
H264_BIWEIGHT(W, sse2) \
H264_BIWEIGHT(W, ssse3)

H264_BIWEIGHT_MMX_SSE(16)
H264_BIWEIGHT_MMX_SSE(8)
H264_BIWEIGHT_MMX(4)

/* High-bit-depth (10-bit) weighted-prediction declarations. */
#define H264_WEIGHT_10(W, DEPTH, OPT) \
void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
                                                    int stride, \
                                                    int height, \
                                                    int log2_denom, \
                                                    int weight, \
                                                    int offset);

#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
                                                      uint8_t *src, \
                                                      int stride, \
                                                      int height, \
                                                      int log2_denom, \
                                                      int weightd, \
                                                      int weights, \
                                                      int offset);

#define H264_BIWEIGHT_10_SSE(W, DEPTH) \
H264_WEIGHT_10(W, DEPTH, sse2) \
H264_WEIGHT_10(W, DEPTH, sse4) \
H264_BIWEIGHT_10(W, DEPTH, sse2) \
H264_BIWEIGHT_10(W, DEPTH, sse4)

H264_BIWEIGHT_10_SSE(16, 10)
H264_BIWEIGHT_10_SSE(8, 10)
H264_BIWEIGHT_10_SSE(4, 10)
209 | ||
/**
 * Install x86 SIMD implementations into the H264DSP function-pointer table.
 *
 * @param c                 context whose function pointers are overwritten
 * @param bit_depth         luma bit depth; only 8 and 10 have x86 versions here
 * @param chroma_format_idc chroma sampling index; the <= 1 checks gate
 *                          routines that only handle 4:2:0-style layouts
 *                          (presumably 0 = gray, 1 = 4:2:0 — confirm against
 *                          h264dsp.h)
 *
 * ISA checks run from oldest to newest extension, so a later (faster)
 * match deliberately overwrites pointers installed by an earlier one.
 * Assignment order within this function is therefore significant.
 */
av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
                                 const int chroma_format_idc)
{
#if HAVE_YASM
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMXEXT(cpu_flags) && chroma_format_idc <= 1)
        c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmxext;

    if (bit_depth == 8) {
        if (EXTERNAL_MMX(cpu_flags)) {
            /* the mmx 4x4/8x8 adders also serve as the dc_add entries */
            c->h264_idct_dc_add   =
            c->h264_idct_add      = ff_h264_idct_add_8_mmx;
            c->h264_idct8_dc_add  =
            c->h264_idct8_add     = ff_h264_idct8_add_8_mmx;

            c->h264_idct_add16      = ff_h264_idct_add16_8_mmx;
            c->h264_idct8_add4      = ff_h264_idct8_add4_8_mmx;
            if (chroma_format_idc <= 1)
                c->h264_idct_add8   = ff_h264_idct_add8_8_mmx;
            c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
            /* the mmx dequant routine additionally requires cmov */
            if (cpu_flags & AV_CPU_FLAG_CMOV)
                c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
        }
        if (EXTERNAL_MMXEXT(cpu_flags)) {
            /* dedicated dc_add versions replace the shared mmx fallbacks */
            c->h264_idct_dc_add     = ff_h264_idct_dc_add_8_mmxext;
            c->h264_idct8_dc_add    = ff_h264_idct8_dc_add_8_mmxext;
            c->h264_idct_add16      = ff_h264_idct_add16_8_mmxext;
            c->h264_idct8_add4      = ff_h264_idct8_add4_8_mmxext;
            if (chroma_format_idc <= 1)
                c->h264_idct_add8   = ff_h264_idct_add8_8_mmxext;
            c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext;

            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_mmxext;
            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext;
            if (chroma_format_idc <= 1) {
                c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_8_mmxext;
                c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext;
            }
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
            /* 32-bit only: C wrappers that split 16-wide edges into two
             * 8-wide asm calls (see deblock_v_luma_8_mmxext above) */
            c->h264_v_loop_filter_luma         = deblock_v_luma_8_mmxext;
            c->h264_h_loop_filter_luma         = ff_deblock_h_luma_8_mmxext;
            c->h264_v_loop_filter_luma_intra   = deblock_v_luma_intra_8_mmxext;
            c->h264_h_loop_filter_luma_intra   = ff_deblock_h_luma_intra_8_mmxext;
#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
            c->weight_h264_pixels_tab[0]   = ff_h264_weight_16_mmxext;
            c->weight_h264_pixels_tab[1]   = ff_h264_weight_8_mmxext;
            c->weight_h264_pixels_tab[2]   = ff_h264_weight_4_mmxext;

            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext;
            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext;
            c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext;
        }
        if (EXTERNAL_SSE2(cpu_flags)) {
            c->h264_idct8_add  = ff_h264_idct8_add_8_sse2;

            c->h264_idct_add16 = ff_h264_idct_add16_8_sse2;
            c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2;
            if (chroma_format_idc <= 1)
                c->h264_idct_add8 = ff_h264_idct_add8_8_sse2;
            c->h264_idct_add16intra      = ff_h264_idct_add16intra_8_sse2;
            c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2;

            /* no sse2 4-wide weight routines exist; tab[2] keeps mmxext */
            c->weight_h264_pixels_tab[0]   = ff_h264_weight_16_sse2;
            c->weight_h264_pixels_tab[1]   = ff_h264_weight_8_sse2;

            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2;
            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2;

            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_sse2;
            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_sse2;
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
        }
        if (EXTERNAL_SSSE3(cpu_flags)) {
            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3;
        }
        if (EXTERNAL_AVX(cpu_flags)) {
            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_avx;
            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_avx;
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
        }
    } else if (bit_depth == 10) {
        if (EXTERNAL_MMXEXT(cpu_flags)) {
#if ARCH_X86_32
            /* on x86-64 these are superseded by the sse2 versions below,
             * so the mmxext 10-bit filters are built for 32-bit only */
            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_mmxext;
            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
            c->h264_v_loop_filter_luma         = ff_deblock_v_luma_10_mmxext;
            c->h264_h_loop_filter_luma         = ff_deblock_h_luma_10_mmxext;
            c->h264_v_loop_filter_luma_intra   = ff_deblock_v_luma_intra_10_mmxext;
            c->h264_h_loop_filter_luma_intra   = ff_deblock_h_luma_intra_10_mmxext;
#endif /* ARCH_X86_32 */
            c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmxext;
        }
        if (EXTERNAL_SSE2(cpu_flags)) {
            c->h264_idct_add     = ff_h264_idct_add_10_sse2;
            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;

            c->h264_idct_add16   = ff_h264_idct_add16_10_sse2;
            if (chroma_format_idc <= 1)
                c->h264_idct_add8 = ff_h264_idct_add8_10_sse2;
            c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2;
#if HAVE_ALIGNED_STACK
            /* NOTE(review): guarded routines presumably require an aligned
             * stack in the asm — confirm against the .asm sources */
            c->h264_idct8_add  = ff_h264_idct8_add_10_sse2;
            c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
#endif /* HAVE_ALIGNED_STACK */

            c->weight_h264_pixels_tab[0]   = ff_h264_weight_16_10_sse2;
            c->weight_h264_pixels_tab[1]   = ff_h264_weight_8_10_sse2;
            c->weight_h264_pixels_tab[2]   = ff_h264_weight_4_10_sse2;

            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
            c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;

            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_sse2;
            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
#if HAVE_ALIGNED_STACK
            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_sse2;
            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_10_sse2;
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
#endif /* HAVE_ALIGNED_STACK */
        }
        if (EXTERNAL_SSE4(cpu_flags)) {
            c->weight_h264_pixels_tab[0]   = ff_h264_weight_16_10_sse4;
            c->weight_h264_pixels_tab[1]   = ff_h264_weight_8_10_sse4;
            c->weight_h264_pixels_tab[2]   = ff_h264_weight_4_10_sse4;

            c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
            c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
            c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
        }
        if (EXTERNAL_AVX(cpu_flags)) {
            c->h264_idct_dc_add  =
            c->h264_idct_add     = ff_h264_idct_add_10_avx;
            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx;

            c->h264_idct_add16   = ff_h264_idct_add16_10_avx;
            if (chroma_format_idc <= 1)
                c->h264_idct_add8 = ff_h264_idct_add8_10_avx;
            c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx;
#if HAVE_ALIGNED_STACK
            c->h264_idct8_add  = ff_h264_idct8_add_10_avx;
            c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx;
#endif /* HAVE_ALIGNED_STACK */

            c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_avx;
            c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
#if HAVE_ALIGNED_STACK
            c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_avx;
            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_10_avx;
            c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
            c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
#endif /* HAVE_ALIGNED_STACK */
        }
    }
#endif
}