Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * Copyright (c) 2013 Seppo Tomperi | |
3 | * Copyright (c) 2013 - 2014 Pierre-Edouard Lepere | |
4 | * | |
5 | * | |
6 | * This file is part of FFmpeg. | |
7 | * | |
8 | * FFmpeg is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License as published by the Free Software Foundation; either | |
11 | * version 2.1 of the License, or (at your option) any later version. | |
12 | * | |
13 | * FFmpeg is distributed in the hope that it will be useful, | |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 | * Lesser General Public License for more details. | |
17 | * | |
18 | * You should have received a copy of the GNU Lesser General Public | |
19 | * License along with FFmpeg; if not, write to the Free Software | |
20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 | */ | |
22 | ||
23 | #include "config.h" | |
24 | #include "libavutil/cpu.h" | |
25 | #include "libavutil/x86/asm.h" | |
26 | #include "libavutil/x86/cpu.h" | |
27 | #include "libavcodec/get_bits.h" /* required for hevcdsp.h GetBitContext */ | |
28 | #include "libavcodec/hevcdsp.h" | |
29 | #include "libavcodec/x86/hevcdsp.h" | |
30 | ||
/*
 * Prototypes of the HEVC loop-filter routines that ff_hevc_dsp_init_x86()
 * installs.  The functions are only declared in this file.
 *
 * LFC_FUNC / LFL_FUNC declare one chroma / luma routine for a direction
 * (h or v), a bit depth and an instruction-set suffix.  The luma variant
 * takes an additional `beta` argument.
 */
#define LFC_FUNC(DIR, DEPTH, OPT)                                          \
    void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(     \
        uint8_t *pix, ptrdiff_t stride, int *tc,                           \
        uint8_t *no_p, uint8_t *no_q);

#define LFL_FUNC(DIR, DEPTH, OPT)                                          \
    void ff_hevc_ ## DIR ## _loop_filter_luma_ ## DEPTH ## _ ## OPT(       \
        uint8_t *pix, ptrdiff_t stride, int beta, int *tc,                 \
        uint8_t *no_p, uint8_t *no_q);

/*
 * Declare both directions at once.  The `type` argument is accepted for
 * the benefit of existing call sites but does not appear in the expansion.
 */
#define LFC_FUNCS(type, depth, opt)                                        \
    LFC_FUNC(h, depth, opt)                                                \
    LFC_FUNC(v, depth, opt)

#define LFL_FUNCS(type, depth, opt)                                        \
    LFL_FUNC(h, depth, opt)                                                \
    LFL_FUNC(v, depth, opt)

/* Apply one of the *_FUNCS macros above to the 8, 10 and 12 bit depths. */
#define LF_FUNCS_ALL_DEPTHS(FUNCS, opt)                                    \
    FUNCS(uint8_t,  8, opt)                                                \
    FUNCS(uint8_t, 10, opt)                                                \
    FUNCS(uint8_t, 12, opt)

/* Chroma filters: SSE2 and AVX. */
LF_FUNCS_ALL_DEPTHS(LFC_FUNCS, sse2)
LF_FUNCS_ALL_DEPTHS(LFC_FUNCS, avx)

/* Luma filters: SSE2, SSSE3 and AVX. */
LF_FUNCS_ALL_DEPTHS(LFL_FUNCS, sse2)
LF_FUNCS_ALL_DEPTHS(LFL_FUNCS, ssse3)
LF_FUNCS_ALL_DEPTHS(LFL_FUNCS, avx)
60 | ||
/* Prototype of a single DC-only inverse transform for one size/depth/ISA. */
#define IDCT_DC_PROTO(W, depth, opt)                                       \
    void ff_hevc_idct##W##_dc_##depth##_##opt(int16_t *coeffs)

/*
 * Declare the 8, 10 and 12 bit DC-only inverse transforms for block size W
 * (written as e.g. 4x4) and instruction-set suffix opt.  The final
 * prototype is left without a semicolon so that the macro is used like a
 * statement.
 */
#define IDCT_FUNCS(W, opt)                                                 \
    IDCT_DC_PROTO(W,  8, opt);                                             \
    IDCT_DC_PROTO(W, 10, opt);                                             \
    IDCT_DC_PROTO(W, 12, opt)

/* MMXEXT covers the two smallest sizes. */
IDCT_FUNCS(4x4,   mmxext);
IDCT_FUNCS(8x8,   mmxext);

/* SSE2 covers 8x8 and up. */
IDCT_FUNCS(8x8,   sse2);
IDCT_FUNCS(16x16, sse2);
IDCT_FUNCS(32x32, sse2);

/* AVX2 covers the two largest sizes. */
IDCT_FUNCS(16x16, avx2);
IDCT_FUNCS(32x32, avx2);
73 | ||
/*
 * Generators for motion-compensation functions of width W that are built
 * from narrower kernels of the same family.
 *
 * In every generated function the expression (bitd + 7) / 8 evaluates to 1
 * for 8-bit and to 2 for 10/12-bit content; it converts a column index into
 * a byte offset for the uint8_t pixel pointers.  The int16_t pointers are
 * advanced by the column index directly.
 *
 * mc_rep_func / mc_rep_uni_func / mc_rep_bi_func call the `step`-wide
 * kernel once per `step` columns until W columns have been covered.
 * height, mx, my and width are forwarded unchanged.
 */
#define mc_rep_func(name, bitd, step, W, opt)                                           \
void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst,                         \
                                                uint8_t *_src, ptrdiff_t _srcstride,   \
                                                int height,                            \
                                                intptr_t mx, intptr_t my, int width)   \
{                                                                                       \
    int x;                                                                              \
    for (x = 0; x < W; x += step)                                                       \
        ff_hevc_put_hevc_##name##step##_##bitd##_##opt(_dst + x,                        \
                                                       _src + (x * ((bitd + 7) / 8)),  \
                                                       _srcstride, height,             \
                                                       mx, my, width);                 \
}

#define mc_rep_uni_func(name, bitd, step, W, opt)                                       \
void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride,\
                                                    uint8_t *_src, ptrdiff_t _srcstride,\
                                                    int height,                        \
                                                    intptr_t mx, intptr_t my, int width)\
{                                                                                       \
    int x;                                                                              \
    for (x = 0; x < W; x += step) {                                                     \
        const int byte_off = x * ((bitd + 7) / 8);                                      \
        ff_hevc_put_hevc_uni_##name##step##_##bitd##_##opt(_dst + byte_off, dststride,  \
                                                           _src + byte_off, _srcstride, \
                                                           height, mx, my, width);     \
    }                                                                                   \
}

#define mc_rep_bi_func(name, bitd, step, W, opt)                                        \
void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, \
                                                   uint8_t *_src, ptrdiff_t _srcstride, \
                                                   int16_t* _src2,                     \
                                                   int height,                         \
                                                   intptr_t mx, intptr_t my, int width)\
{                                                                                       \
    int x;                                                                              \
    for (x = 0; x < W; x += step) {                                                     \
        const int byte_off = x * ((bitd + 7) / 8);                                      \
        ff_hevc_put_hevc_bi_##name##step##_##bitd##_##opt(_dst + byte_off, dststride,   \
                                                          _src + byte_off, _srcstride,  \
                                                          _src2 + x,                   \
                                                          height, mx, my, width);      \
    }                                                                                   \
}

/*
 * Two-kernel variants: a step1-wide kernel handles the first step1 columns
 * and a step2-wide kernel handles the columns that follow.  W is only used
 * to form the name of the generated function.
 */
#define mc_rep_func2(name, bitd, step1, step2, W, opt)                                  \
void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *dst,                          \
                                                uint8_t *src, ptrdiff_t _srcstride,    \
                                                int height,                            \
                                                intptr_t mx, intptr_t my, int width)   \
{                                                                                       \
    const int byte_off = step1 * ((bitd + 7) / 8);                                      \
    ff_hevc_put_hevc_##name##step1##_##bitd##_##opt(dst, src, _srcstride,               \
                                                    height, mx, my, width);            \
    ff_hevc_put_hevc_##name##step2##_##bitd##_##opt(dst + step1, src + byte_off,        \
                                                    _srcstride,                        \
                                                    height, mx, my, width);            \
}

#define mc_rep_uni_func2(name, bitd, step1, step2, W, opt)                              \
void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, \
                                                    uint8_t *src, ptrdiff_t _srcstride,\
                                                    int height,                        \
                                                    intptr_t mx, intptr_t my, int width)\
{                                                                                       \
    const int byte_off = step1 * ((bitd + 7) / 8);                                      \
    ff_hevc_put_hevc_uni_##name##step1##_##bitd##_##opt(dst, dststride,                 \
                                                        src, _srcstride,               \
                                                        height, mx, my, width);        \
    ff_hevc_put_hevc_uni_##name##step2##_##bitd##_##opt(dst + byte_off, dststride,      \
                                                        src + byte_off, _srcstride,    \
                                                        height, mx, my, width);        \
}

#define mc_rep_bi_func2(name, bitd, step1, step2, W, opt)                               \
void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride,  \
                                                   uint8_t *src, ptrdiff_t _srcstride, \
                                                   int16_t* src2,                      \
                                                   int height,                         \
                                                   intptr_t mx, intptr_t my, int width)\
{                                                                                       \
    const int byte_off = step1 * ((bitd + 7) / 8);                                      \
    ff_hevc_put_hevc_bi_##name##step1##_##bitd##_##opt(dst, dststride,                  \
                                                       src, _srcstride, src2,          \
                                                       height, mx, my, width);         \
    ff_hevc_put_hevc_bi_##name##step2##_##bitd##_##opt(dst + byte_off, dststride,       \
                                                       src + byte_off, _srcstride,     \
                                                       src2 + step1,                   \
                                                       height, mx, my, width);         \
}

/*
 * Emit the plain, uni and bi wrapper for one (name, bitd, W) combination.
 * Each of these two macros is defined exactly once; the last generator in
 * the list is left without a semicolon so that the macro is used like a
 * statement.
 */
#define mc_rep_funcs(name, bitd, step, W, opt)                                          \
    mc_rep_func(name, bitd, step, W, opt);                                              \
    mc_rep_uni_func(name, bitd, step, W, opt);                                          \
    mc_rep_bi_func(name, bitd, step, W, opt)

#define mc_rep_funcs2(name, bitd, step1, step2, W, opt)                                 \
    mc_rep_func2(name, bitd, step1, step2, W, opt);                                     \
    mc_rep_uni_func2(name, bitd, step1, step2, W, opt);                                 \
    mc_rep_bi_func2(name, bitd, step1, step2, W, opt)
165 | ||
166 | #if ARCH_X86_64 && HAVE_SSE4_EXTERNAL | |
167 | ||
168 | mc_rep_funcs(pel_pixels, 8, 16, 64, sse4); | |
169 | mc_rep_funcs(pel_pixels, 8, 16, 48, sse4); | |
170 | mc_rep_funcs(pel_pixels, 8, 16, 32, sse4); | |
171 | mc_rep_funcs(pel_pixels, 8, 8, 24, sse4); | |
172 | mc_rep_funcs(pel_pixels,10, 8, 64, sse4); | |
173 | mc_rep_funcs(pel_pixels,10, 8, 48, sse4); | |
174 | mc_rep_funcs(pel_pixels,10, 8, 32, sse4); | |
175 | mc_rep_funcs(pel_pixels,10, 8, 24, sse4); | |
176 | mc_rep_funcs(pel_pixels,10, 8, 16, sse4); | |
177 | mc_rep_funcs(pel_pixels,10, 4, 12, sse4); | |
178 | mc_rep_funcs(pel_pixels,12, 8, 64, sse4); | |
179 | mc_rep_funcs(pel_pixels,12, 8, 48, sse4); | |
180 | mc_rep_funcs(pel_pixels,12, 8, 32, sse4); | |
181 | mc_rep_funcs(pel_pixels,12, 8, 24, sse4); | |
182 | mc_rep_funcs(pel_pixels,12, 8, 16, sse4); | |
183 | mc_rep_funcs(pel_pixels,12, 4, 12, sse4); | |
184 | ||
185 | mc_rep_funcs(epel_h, 8, 16, 64, sse4); | |
186 | mc_rep_funcs(epel_h, 8, 16, 48, sse4); | |
187 | mc_rep_funcs(epel_h, 8, 16, 32, sse4); | |
188 | mc_rep_funcs(epel_h, 8, 8, 24, sse4); | |
189 | mc_rep_funcs(epel_h,10, 8, 64, sse4); | |
190 | mc_rep_funcs(epel_h,10, 8, 48, sse4); | |
191 | mc_rep_funcs(epel_h,10, 8, 32, sse4); | |
192 | mc_rep_funcs(epel_h,10, 8, 24, sse4); | |
193 | mc_rep_funcs(epel_h,10, 8, 16, sse4); | |
194 | mc_rep_funcs(epel_h,10, 4, 12, sse4); | |
195 | mc_rep_funcs(epel_h,12, 8, 64, sse4); | |
196 | mc_rep_funcs(epel_h,12, 8, 48, sse4); | |
197 | mc_rep_funcs(epel_h,12, 8, 32, sse4); | |
198 | mc_rep_funcs(epel_h,12, 8, 24, sse4); | |
199 | mc_rep_funcs(epel_h,12, 8, 16, sse4); | |
200 | mc_rep_funcs(epel_h,12, 4, 12, sse4); | |
201 | mc_rep_funcs(epel_v, 8, 16, 64, sse4); | |
202 | mc_rep_funcs(epel_v, 8, 16, 48, sse4); | |
203 | mc_rep_funcs(epel_v, 8, 16, 32, sse4); | |
204 | mc_rep_funcs(epel_v, 8, 8, 24, sse4); | |
205 | mc_rep_funcs(epel_v,10, 8, 64, sse4); | |
206 | mc_rep_funcs(epel_v,10, 8, 48, sse4); | |
207 | mc_rep_funcs(epel_v,10, 8, 32, sse4); | |
208 | mc_rep_funcs(epel_v,10, 8, 24, sse4); | |
209 | mc_rep_funcs(epel_v,10, 8, 16, sse4); | |
210 | mc_rep_funcs(epel_v,10, 4, 12, sse4); | |
211 | mc_rep_funcs(epel_v,12, 8, 64, sse4); | |
212 | mc_rep_funcs(epel_v,12, 8, 48, sse4); | |
213 | mc_rep_funcs(epel_v,12, 8, 32, sse4); | |
214 | mc_rep_funcs(epel_v,12, 8, 24, sse4); | |
215 | mc_rep_funcs(epel_v,12, 8, 16, sse4); | |
216 | mc_rep_funcs(epel_v,12, 4, 12, sse4); | |
217 | mc_rep_funcs(epel_hv, 8, 8, 64, sse4); | |
218 | mc_rep_funcs(epel_hv, 8, 8, 48, sse4); | |
219 | mc_rep_funcs(epel_hv, 8, 8, 32, sse4); | |
220 | mc_rep_funcs(epel_hv, 8, 8, 24, sse4); | |
221 | mc_rep_funcs(epel_hv, 8, 8, 16, sse4); | |
222 | mc_rep_funcs2(epel_hv,8, 8, 4, 12, sse4); | |
223 | mc_rep_funcs(epel_hv,10, 8, 64, sse4); | |
224 | mc_rep_funcs(epel_hv,10, 8, 48, sse4); | |
225 | mc_rep_funcs(epel_hv,10, 8, 32, sse4); | |
226 | mc_rep_funcs(epel_hv,10, 8, 24, sse4); | |
227 | mc_rep_funcs(epel_hv,10, 8, 16, sse4); | |
228 | mc_rep_funcs(epel_hv,10, 4, 12, sse4); | |
229 | mc_rep_funcs(epel_hv,12, 8, 64, sse4); | |
230 | mc_rep_funcs(epel_hv,12, 8, 48, sse4); | |
231 | mc_rep_funcs(epel_hv,12, 8, 32, sse4); | |
232 | mc_rep_funcs(epel_hv,12, 8, 24, sse4); | |
233 | mc_rep_funcs(epel_hv,12, 8, 16, sse4); | |
234 | mc_rep_funcs(epel_hv,12, 4, 12, sse4); | |
235 | ||
236 | mc_rep_funcs(qpel_h, 8, 16, 64, sse4); | |
237 | mc_rep_funcs(qpel_h, 8, 16, 48, sse4); | |
238 | mc_rep_funcs(qpel_h, 8, 16, 32, sse4); | |
239 | mc_rep_funcs(qpel_h, 8, 8, 24, sse4); | |
240 | mc_rep_funcs(qpel_h,10, 8, 64, sse4); | |
241 | mc_rep_funcs(qpel_h,10, 8, 48, sse4); | |
242 | mc_rep_funcs(qpel_h,10, 8, 32, sse4); | |
243 | mc_rep_funcs(qpel_h,10, 8, 24, sse4); | |
244 | mc_rep_funcs(qpel_h,10, 8, 16, sse4); | |
245 | mc_rep_funcs(qpel_h,10, 4, 12, sse4); | |
246 | mc_rep_funcs(qpel_h,12, 8, 64, sse4); | |
247 | mc_rep_funcs(qpel_h,12, 8, 48, sse4); | |
248 | mc_rep_funcs(qpel_h,12, 8, 32, sse4); | |
249 | mc_rep_funcs(qpel_h,12, 8, 24, sse4); | |
250 | mc_rep_funcs(qpel_h,12, 8, 16, sse4); | |
251 | mc_rep_funcs(qpel_h,12, 4, 12, sse4); | |
252 | mc_rep_funcs(qpel_v, 8, 16, 64, sse4); | |
253 | mc_rep_funcs(qpel_v, 8, 16, 48, sse4); | |
254 | mc_rep_funcs(qpel_v, 8, 16, 32, sse4); | |
255 | mc_rep_funcs(qpel_v, 8, 8, 24, sse4); | |
256 | mc_rep_funcs(qpel_v,10, 8, 64, sse4); | |
257 | mc_rep_funcs(qpel_v,10, 8, 48, sse4); | |
258 | mc_rep_funcs(qpel_v,10, 8, 32, sse4); | |
259 | mc_rep_funcs(qpel_v,10, 8, 24, sse4); | |
260 | mc_rep_funcs(qpel_v,10, 8, 16, sse4); | |
261 | mc_rep_funcs(qpel_v,10, 4, 12, sse4); | |
262 | mc_rep_funcs(qpel_v,12, 8, 64, sse4); | |
263 | mc_rep_funcs(qpel_v,12, 8, 48, sse4); | |
264 | mc_rep_funcs(qpel_v,12, 8, 32, sse4); | |
265 | mc_rep_funcs(qpel_v,12, 8, 24, sse4); | |
266 | mc_rep_funcs(qpel_v,12, 8, 16, sse4); | |
267 | mc_rep_funcs(qpel_v,12, 4, 12, sse4); | |
268 | mc_rep_funcs(qpel_hv, 8, 8, 64, sse4); | |
269 | mc_rep_funcs(qpel_hv, 8, 8, 48, sse4); | |
270 | mc_rep_funcs(qpel_hv, 8, 8, 32, sse4); | |
271 | mc_rep_funcs(qpel_hv, 8, 8, 24, sse4); | |
272 | mc_rep_funcs(qpel_hv, 8, 8, 16, sse4); | |
273 | mc_rep_funcs2(qpel_hv,8, 8, 4, 12, sse4); | |
274 | mc_rep_funcs(qpel_hv,10, 8, 64, sse4); | |
275 | mc_rep_funcs(qpel_hv,10, 8, 48, sse4); | |
276 | mc_rep_funcs(qpel_hv,10, 8, 32, sse4); | |
277 | mc_rep_funcs(qpel_hv,10, 8, 24, sse4); | |
278 | mc_rep_funcs(qpel_hv,10, 8, 16, sse4); | |
279 | mc_rep_funcs(qpel_hv,10, 4, 12, sse4); | |
280 | mc_rep_funcs(qpel_hv,12, 8, 64, sse4); | |
281 | mc_rep_funcs(qpel_hv,12, 8, 48, sse4); | |
282 | mc_rep_funcs(qpel_hv,12, 8, 32, sse4); | |
283 | mc_rep_funcs(qpel_hv,12, 8, 24, sse4); | |
284 | mc_rep_funcs(qpel_hv,12, 8, 16, sse4); | |
285 | mc_rep_funcs(qpel_hv,12, 4, 12, sse4); | |
286 | ||
/*
 * Generate a W-wide uni-directional weighting function by calling the
 * `step`-wide one once per `step` columns.  The int16_t source advances by
 * the column index; the uint8_t destination advances by the column index
 * times (bitd + 7) / 8, i.e. times 1 for 8-bit and 2 for 10/12-bit.
 */
#define mc_rep_uni_w(bitd, step, W, opt)                                                \
void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride,     \
                                               int16_t *_src, ptrdiff_t _srcstride,    \
                                               int height, int denom,                  \
                                               int _wx, int _ox)                       \
{                                                                                       \
    int x;                                                                              \
    for (x = 0; x < W; x += step)                                                       \
        ff_hevc_put_hevc_uni_w##step##_##bitd##_##opt(_dst + (x * ((bitd + 7) / 8)),    \
                                                      dststride,                       \
                                                      _src + x, _srcstride,            \
                                                      height, denom, _wx, _ox);        \
}

/* Widths built for every bit depth: 12 from 6-wide calls, the rest from 8-wide calls. */
#define mc_rep_uni_w_widths(bitd, opt)                                     \
    mc_rep_uni_w(bitd, 6, 12, opt);                                        \
    mc_rep_uni_w(bitd, 8, 16, opt);                                        \
    mc_rep_uni_w(bitd, 8, 24, opt);                                        \
    mc_rep_uni_w(bitd, 8, 32, opt);                                        \
    mc_rep_uni_w(bitd, 8, 48, opt);                                        \
    mc_rep_uni_w(bitd, 8, 64, opt)

mc_rep_uni_w_widths(8,  sse4);
mc_rep_uni_w_widths(10, sse4);
mc_rep_uni_w_widths(12, sse4);
322 | ||
/*
 * Generate a W-wide bi-directional weighting function by calling the
 * `step`-wide one once per `step` columns.  Both int16_t sources advance
 * by the column index; the uint8_t destination advances by the column
 * index times (bitd + 7) / 8, i.e. times 1 for 8-bit and 2 for 10/12-bit.
 */
#define mc_rep_bi_w(bitd, step, W, opt)                                                 \
void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride,      \
                                              int16_t *_src, ptrdiff_t _srcstride,     \
                                              int16_t *_src2, int height,              \
                                              int denom, int _wx0, int _wx1,           \
                                              int _ox0, int _ox1)                      \
{                                                                                       \
    int x;                                                                              \
    for (x = 0; x < W; x += step)                                                       \
        ff_hevc_put_hevc_bi_w##step##_##bitd##_##opt(_dst + (x * ((bitd + 7) / 8)),     \
                                                     dststride,                        \
                                                     _src + x, _srcstride,             \
                                                     _src2 + x,                        \
                                                     height, denom,                    \
                                                     _wx0, _wx1, _ox0, _ox1);          \
}

/* Widths built for every bit depth: 12 from 6-wide calls, the rest from 8-wide calls. */
#define mc_rep_bi_w_widths(bitd, opt)                                      \
    mc_rep_bi_w(bitd, 6, 12, opt);                                         \
    mc_rep_bi_w(bitd, 8, 16, opt);                                         \
    mc_rep_bi_w(bitd, 8, 24, opt);                                         \
    mc_rep_bi_w(bitd, 8, 32, opt);                                         \
    mc_rep_bi_w(bitd, 8, 48, opt);                                         \
    mc_rep_bi_w(bitd, 8, 64, opt)

mc_rep_bi_w_widths(8,  sse4);
mc_rep_bi_w_widths(10, sse4);
mc_rep_bi_w_widths(12, sse4);
361 | ||
362 | #define mc_uni_w_func(name, bitd, W, opt) \ | |
363 | void ff_hevc_put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \ | |
364 | uint8_t *_src, ptrdiff_t _srcstride, \ | |
365 | int height, int denom, \ | |
366 | int _wx, int _ox, \ | |
367 | intptr_t mx, intptr_t my, int width) \ | |
368 | { \ | |
369 | LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]); \ | |
370 | ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width); \ | |
371 | ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(_dst, _dststride, temp, MAX_PB_SIZE, height, denom, _wx, _ox);\ | |
372 | } | |
373 | ||
374 | #define mc_uni_w_funcs(name, bitd, opt) \ | |
375 | mc_uni_w_func(name, bitd, 4, opt); \ | |
376 | mc_uni_w_func(name, bitd, 8, opt); \ | |
377 | mc_uni_w_func(name, bitd, 12, opt); \ | |
378 | mc_uni_w_func(name, bitd, 16, opt); \ | |
379 | mc_uni_w_func(name, bitd, 24, opt); \ | |
380 | mc_uni_w_func(name, bitd, 32, opt); \ | |
381 | mc_uni_w_func(name, bitd, 48, opt); \ | |
382 | mc_uni_w_func(name, bitd, 64, opt) | |
383 | ||
384 | mc_uni_w_funcs(pel_pixels, 8, sse4); | |
385 | mc_uni_w_func(pel_pixels, 8, 6, sse4); | |
386 | mc_uni_w_funcs(epel_h, 8, sse4); | |
387 | mc_uni_w_func(epel_h, 8, 6, sse4); | |
388 | mc_uni_w_funcs(epel_v, 8, sse4); | |
389 | mc_uni_w_func(epel_v, 8, 6, sse4); | |
390 | mc_uni_w_funcs(epel_hv, 8, sse4); | |
391 | mc_uni_w_func(epel_hv, 8, 6, sse4); | |
392 | mc_uni_w_funcs(qpel_h, 8, sse4); | |
393 | mc_uni_w_funcs(qpel_v, 8, sse4); | |
394 | mc_uni_w_funcs(qpel_hv, 8, sse4); | |
395 | ||
396 | mc_uni_w_funcs(pel_pixels, 10, sse4); | |
397 | mc_uni_w_func(pel_pixels, 10, 6, sse4); | |
398 | mc_uni_w_funcs(epel_h, 10, sse4); | |
399 | mc_uni_w_func(epel_h, 10, 6, sse4); | |
400 | mc_uni_w_funcs(epel_v, 10, sse4); | |
401 | mc_uni_w_func(epel_v, 10, 6, sse4); | |
402 | mc_uni_w_funcs(epel_hv, 10, sse4); | |
403 | mc_uni_w_func(epel_hv, 10, 6, sse4); | |
404 | mc_uni_w_funcs(qpel_h, 10, sse4); | |
405 | mc_uni_w_funcs(qpel_v, 10, sse4); | |
406 | mc_uni_w_funcs(qpel_hv, 10, sse4); | |
407 | ||
408 | mc_uni_w_funcs(pel_pixels, 12, sse4); | |
409 | mc_uni_w_func(pel_pixels, 12, 6, sse4); | |
410 | mc_uni_w_funcs(epel_h, 12, sse4); | |
411 | mc_uni_w_func(epel_h, 12, 6, sse4); | |
412 | mc_uni_w_funcs(epel_v, 12, sse4); | |
413 | mc_uni_w_func(epel_v, 12, 6, sse4); | |
414 | mc_uni_w_funcs(epel_hv, 12, sse4); | |
415 | mc_uni_w_func(epel_hv, 12, 6, sse4); | |
416 | mc_uni_w_funcs(qpel_h, 12, sse4); | |
417 | mc_uni_w_funcs(qpel_v, 12, sse4); | |
418 | mc_uni_w_funcs(qpel_hv, 12, sse4); | |
419 | ||
420 | #define mc_bi_w_func(name, bitd, W, opt) \ | |
421 | void ff_hevc_put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \ | |
422 | uint8_t *_src, ptrdiff_t _srcstride, \ | |
423 | int16_t *_src2, \ | |
424 | int height, int denom, \ | |
425 | int _wx0, int _wx1, int _ox0, int _ox1, \ | |
426 | intptr_t mx, intptr_t my, int width) \ | |
427 | { \ | |
428 | LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]); \ | |
429 | ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width); \ | |
430 | ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(_dst, _dststride, temp, MAX_PB_SIZE, _src2, \ | |
431 | height, denom, _wx0, _wx1, _ox0, _ox1); \ | |
432 | } | |
433 | ||
434 | #define mc_bi_w_funcs(name, bitd, opt) \ | |
435 | mc_bi_w_func(name, bitd, 4, opt); \ | |
436 | mc_bi_w_func(name, bitd, 8, opt); \ | |
437 | mc_bi_w_func(name, bitd, 12, opt); \ | |
438 | mc_bi_w_func(name, bitd, 16, opt); \ | |
439 | mc_bi_w_func(name, bitd, 24, opt); \ | |
440 | mc_bi_w_func(name, bitd, 32, opt); \ | |
441 | mc_bi_w_func(name, bitd, 48, opt); \ | |
442 | mc_bi_w_func(name, bitd, 64, opt) | |
443 | ||
444 | mc_bi_w_funcs(pel_pixels, 8, sse4); | |
445 | mc_bi_w_func(pel_pixels, 8, 6, sse4); | |
446 | mc_bi_w_funcs(epel_h, 8, sse4); | |
447 | mc_bi_w_func(epel_h, 8, 6, sse4); | |
448 | mc_bi_w_funcs(epel_v, 8, sse4); | |
449 | mc_bi_w_func(epel_v, 8, 6, sse4); | |
450 | mc_bi_w_funcs(epel_hv, 8, sse4); | |
451 | mc_bi_w_func(epel_hv, 8, 6, sse4); | |
452 | mc_bi_w_funcs(qpel_h, 8, sse4); | |
453 | mc_bi_w_funcs(qpel_v, 8, sse4); | |
454 | mc_bi_w_funcs(qpel_hv, 8, sse4); | |
455 | ||
456 | mc_bi_w_funcs(pel_pixels, 10, sse4); | |
457 | mc_bi_w_func(pel_pixels, 10, 6, sse4); | |
458 | mc_bi_w_funcs(epel_h, 10, sse4); | |
459 | mc_bi_w_func(epel_h, 10, 6, sse4); | |
460 | mc_bi_w_funcs(epel_v, 10, sse4); | |
461 | mc_bi_w_func(epel_v, 10, 6, sse4); | |
462 | mc_bi_w_funcs(epel_hv, 10, sse4); | |
463 | mc_bi_w_func(epel_hv, 10, 6, sse4); | |
464 | mc_bi_w_funcs(qpel_h, 10, sse4); | |
465 | mc_bi_w_funcs(qpel_v, 10, sse4); | |
466 | mc_bi_w_funcs(qpel_hv, 10, sse4); | |
467 | ||
468 | mc_bi_w_funcs(pel_pixels, 12, sse4); | |
469 | mc_bi_w_func(pel_pixels, 12, 6, sse4); | |
470 | mc_bi_w_funcs(epel_h, 12, sse4); | |
471 | mc_bi_w_func(epel_h, 12, 6, sse4); | |
472 | mc_bi_w_funcs(epel_v, 12, sse4); | |
473 | mc_bi_w_func(epel_v, 12, 6, sse4); | |
474 | mc_bi_w_funcs(epel_hv, 12, sse4); | |
475 | mc_bi_w_func(epel_hv, 12, 6, sse4); | |
476 | mc_bi_w_funcs(qpel_h, 12, sse4); | |
477 | mc_bi_w_funcs(qpel_v, 12, sse4); | |
478 | mc_bi_w_funcs(qpel_hv, 12, sse4); | |
479 | #endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL | |
480 | ||
481 | ||
/*
 * Invoke PEL_LINK once per supported block width for the function table
 * `tab`, forwarding the two selector arguments `v` and `h` unchanged.
 * The first index passed to PEL_LINK pairs with the width as follows:
 *
 *   1 -> 4,  2 -> 6,  3 -> 8,  4 -> 12,  5 -> 16,
 *   6 -> 24, 7 -> 32, 8 -> 48, 9 -> 64
 *
 * EPEL_LINKS covers all nine entries.
 */
#define EPEL_LINKS(tab, v, h, fname, bitd, opt)                            \
    PEL_LINK(tab, 1, v, h, fname##4,  bitd, opt);                          \
    PEL_LINK(tab, 2, v, h, fname##6,  bitd, opt);                          \
    PEL_LINK(tab, 3, v, h, fname##8,  bitd, opt);                          \
    PEL_LINK(tab, 4, v, h, fname##12, bitd, opt);                          \
    PEL_LINK(tab, 5, v, h, fname##16, bitd, opt);                          \
    PEL_LINK(tab, 6, v, h, fname##24, bitd, opt);                          \
    PEL_LINK(tab, 7, v, h, fname##32, bitd, opt);                          \
    PEL_LINK(tab, 8, v, h, fname##48, bitd, opt);                          \
    PEL_LINK(tab, 9, v, h, fname##64, bitd, opt)

/* Same as EPEL_LINKS, except that index 2 (width 6) is not linked. */
#define QPEL_LINKS(tab, v, h, fname, bitd, opt)                            \
    PEL_LINK(tab, 1, v, h, fname##4,  bitd, opt);                          \
    PEL_LINK(tab, 3, v, h, fname##8,  bitd, opt);                          \
    PEL_LINK(tab, 4, v, h, fname##12, bitd, opt);                          \
    PEL_LINK(tab, 5, v, h, fname##16, bitd, opt);                          \
    PEL_LINK(tab, 6, v, h, fname##24, bitd, opt);                          \
    PEL_LINK(tab, 7, v, h, fname##32, bitd, opt);                          \
    PEL_LINK(tab, 8, v, h, fname##48, bitd, opt);                          \
    PEL_LINK(tab, 9, v, h, fname##64, bitd, opt)
501 | ||
502 | ||
503 | void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) | |
504 | { | |
505 | int cpu_flags = av_get_cpu_flags(); | |
506 | ||
507 | if (bit_depth == 8) { | |
508 | if (EXTERNAL_MMXEXT(cpu_flags)) { | |
509 | c->idct_dc[0] = ff_hevc_idct4x4_dc_8_mmxext; | |
510 | c->idct_dc[1] = ff_hevc_idct8x8_dc_8_mmxext; | |
511 | c->transform_add[0] = ff_hevc_transform_add4_8_mmxext; | |
512 | } | |
513 | if (EXTERNAL_SSE2(cpu_flags)) { | |
514 | c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2; | |
515 | c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2; | |
516 | if (ARCH_X86_64) { | |
517 | c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2; | |
518 | c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2; | |
519 | } | |
520 | c->idct_dc[1] = ff_hevc_idct8x8_dc_8_sse2; | |
521 | c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2; | |
522 | c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2; | |
523 | ||
524 | c->transform_add[1] = ff_hevc_transform_add8_8_sse2; | |
525 | c->transform_add[2] = ff_hevc_transform_add16_8_sse2; | |
526 | c->transform_add[3] = ff_hevc_transform_add32_8_sse2; | |
527 | } | |
528 | if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) { | |
529 | c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3; | |
530 | c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3; | |
531 | } | |
532 | if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) { | |
533 | ||
534 | EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 8, sse4); | |
535 | EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 8, sse4); | |
536 | EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 8, sse4); | |
537 | EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 8, sse4); | |
538 | ||
539 | QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4); | |
540 | QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 8, sse4); | |
541 | QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 8, sse4); | |
542 | QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 8, sse4); | |
543 | } | |
544 | if (EXTERNAL_AVX(cpu_flags)) { | |
545 | c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx; | |
546 | c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_avx; | |
547 | if (ARCH_X86_64) { | |
548 | c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx; | |
549 | c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx; | |
550 | } | |
551 | c->transform_add[1] = ff_hevc_transform_add8_8_avx; | |
552 | c->transform_add[2] = ff_hevc_transform_add16_8_avx; | |
553 | c->transform_add[3] = ff_hevc_transform_add32_8_avx; | |
554 | } | |
555 | if (EXTERNAL_AVX2(cpu_flags)) { | |
556 | c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2; | |
557 | c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2; | |
558 | ||
559 | c->transform_add[3] = ff_hevc_transform_add32_8_avx2; | |
560 | } | |
561 | } else if (bit_depth == 10) { | |
562 | if (EXTERNAL_MMXEXT(cpu_flags)) { | |
563 | c->transform_add[0] = ff_hevc_transform_add4_10_mmxext; | |
564 | c->idct_dc[0] = ff_hevc_idct4x4_dc_10_mmxext; | |
565 | c->idct_dc[1] = ff_hevc_idct8x8_dc_10_mmxext; | |
566 | } | |
567 | if (EXTERNAL_SSE2(cpu_flags)) { | |
568 | c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2; | |
569 | c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2; | |
570 | if (ARCH_X86_64) { | |
571 | c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2; | |
572 | c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2; | |
573 | } | |
574 | ||
575 | c->idct_dc[1] = ff_hevc_idct8x8_dc_10_sse2; | |
576 | c->idct_dc[2] = ff_hevc_idct16x16_dc_10_sse2; | |
577 | c->idct_dc[3] = ff_hevc_idct32x32_dc_10_sse2; | |
578 | ||
579 | c->transform_add[1] = ff_hevc_transform_add8_10_sse2; | |
580 | c->transform_add[2] = ff_hevc_transform_add16_10_sse2; | |
581 | c->transform_add[3] = ff_hevc_transform_add32_10_sse2; | |
582 | } | |
583 | if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) { | |
584 | c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3; | |
585 | c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3; | |
586 | } | |
587 | if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) { | |
588 | EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4); | |
589 | EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 10, sse4); | |
590 | EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 10, sse4); | |
591 | EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 10, sse4); | |
592 | ||
593 | QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4); | |
594 | QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 10, sse4); | |
595 | QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 10, sse4); | |
596 | QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 10, sse4); | |
597 | } | |
598 | if (EXTERNAL_AVX(cpu_flags)) { | |
599 | c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx; | |
600 | c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_avx; | |
601 | if (ARCH_X86_64) { | |
602 | c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx; | |
603 | c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx; | |
604 | } | |
605 | } | |
606 | if (EXTERNAL_AVX2(cpu_flags)) { | |
607 | ||
608 | c->idct_dc[2] = ff_hevc_idct16x16_dc_10_avx2; | |
609 | c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2; | |
610 | ||
611 | c->transform_add[2] = ff_hevc_transform_add16_10_avx2; | |
612 | c->transform_add[3] = ff_hevc_transform_add32_10_avx2; | |
613 | ||
614 | } | |
615 | } else if (bit_depth == 12) { | |
616 | if (EXTERNAL_MMXEXT(cpu_flags)) { | |
617 | c->idct_dc[0] = ff_hevc_idct4x4_dc_12_mmxext; | |
618 | c->idct_dc[1] = ff_hevc_idct8x8_dc_12_mmxext; | |
619 | } | |
620 | if (EXTERNAL_SSE2(cpu_flags)) { | |
621 | c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2; | |
622 | c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2; | |
623 | if (ARCH_X86_64) { | |
624 | c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2; | |
625 | c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2; | |
626 | } | |
627 | ||
628 | c->idct_dc[1] = ff_hevc_idct8x8_dc_12_sse2; | |
629 | c->idct_dc[2] = ff_hevc_idct16x16_dc_12_sse2; | |
630 | c->idct_dc[3] = ff_hevc_idct32x32_dc_12_sse2; | |
631 | } | |
632 | if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) { | |
633 | c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3; | |
634 | c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3; | |
635 | } | |
636 | if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) { | |
637 | EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4); | |
638 | EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 12, sse4); | |
639 | EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 12, sse4); | |
640 | EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 12, sse4); | |
641 | ||
642 | QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4); | |
643 | QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 12, sse4); | |
644 | QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 12, sse4); | |
645 | QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 12, sse4); | |
646 | } | |
647 | if (EXTERNAL_AVX(cpu_flags)) { | |
648 | c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx; | |
649 | c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_avx; | |
650 | if (ARCH_X86_64) { | |
651 | c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx; | |
652 | c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx; | |
653 | } | |
654 | } | |
655 | if (EXTERNAL_AVX2(cpu_flags)) { | |
656 | c->idct_dc[2] = ff_hevc_idct16x16_dc_12_avx2; | |
657 | c->idct_dc[3] = ff_hevc_idct32x32_dc_12_avx2; | |
658 | } | |
659 | } | |
660 | } |