Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * RV40 decoder motion compensation functions x86-optimised | |
3 | * Copyright (c) 2008 Konstantin Shishkov | |
4 | * | |
5 | * This file is part of FFmpeg. | |
6 | * | |
7 | * FFmpeg is free software; you can redistribute it and/or | |
8 | * modify it under the terms of the GNU Lesser General Public | |
9 | * License as published by the Free Software Foundation; either | |
10 | * version 2.1 of the License, or (at your option) any later version. | |
11 | * | |
12 | * FFmpeg is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | * Lesser General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU Lesser General Public | |
18 | * License along with FFmpeg; if not, write to the Free Software | |
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | */ | |
21 | ||
22 | /** | |
23 | * @file | |
24 | * RV40 decoder motion compensation functions x86-optimised | |
25 | * 2,0 and 0,2 have h264 equivalents. | |
26 | * 3,3 is bugged in the rv40 format and maps to _xy2 version | |
27 | */ | |
28 | ||
29 | #include "libavcodec/rv34dsp.h" | |
30 | #include "libavutil/attributes.h" | |
31 | #include "libavutil/mem.h" | |
32 | #include "libavutil/x86/cpu.h" | |
33 | #include "hpeldsp.h" | |
34 | ||
/**
 * Define the qpel function for position 3,3 of one block size.
 * The generated OP_rv40_qpelSIZE_mc33_INSN() only forwards its arguments
 * to ff_OP_pixelsSIZE_xy2_INSN(), passing SIZE as the fourth argument.
 */
#define DEFINE_FN(OP, SIZE, INSN)                                          \
static void OP ## _rv40_qpel ## SIZE ## _mc33_ ## INSN(uint8_t *dst,       \
                                                       const uint8_t *src, \
                                                       ptrdiff_t stride)   \
{                                                                          \
    ff_ ## OP ## _pixels ## SIZE ## _xy2_ ## INSN(dst, src, stride, SIZE); \
}
41 | ||
42 | #if HAVE_YASM | |
/* Prototypes of the chroma motion compensation routines; their
 * definitions are not part of this file. */

/* put: MMX versions for both block widths */
void ff_put_rv40_chroma_mc8_mmx(uint8_t *dst, uint8_t *src,
                                int stride, int h, int x, int y);
void ff_put_rv40_chroma_mc4_mmx(uint8_t *dst, uint8_t *src,
                                int stride, int h, int x, int y);

/* avg: MMXEXT versions for both block widths */
void ff_avg_rv40_chroma_mc8_mmxext(uint8_t *dst, uint8_t *src,
                                   int stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_mmxext(uint8_t *dst, uint8_t *src,
                                   int stride, int h, int x, int y);

/* avg: 3DNow! versions for both block widths */
void ff_avg_rv40_chroma_mc8_3dnow(uint8_t *dst, uint8_t *src,
                                  int stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
                                  int stride, int h, int x, int y);
56 | ||
/** Declare one weight function: RND is rnd or nornd, SIZE is 16 or 8,
 *  OPT is the instruction set suffix (without leading underscore). */
#define DECLARE_WEIGHT_FN(RND, SIZE, OPT)                               \
void ff_rv40_weight_func_ ## RND ## _ ## SIZE ## _ ## OPT(uint8_t *dst, \
                                                          uint8_t *src1, \
                                                          uint8_t *src2, \
                                                          int w1, int w2, \
                                                          ptrdiff_t stride);

/** Declare the four weight functions (rnd/nornd, 16/8) of one
 *  instruction set. */
#define DECLARE_WEIGHT(OPT)                                             \
    DECLARE_WEIGHT_FN(rnd,   16, OPT)                                   \
    DECLARE_WEIGHT_FN(rnd,    8, OPT)                                   \
    DECLARE_WEIGHT_FN(nornd, 16, OPT)                                   \
    DECLARE_WEIGHT_FN(nornd,  8, OPT)

DECLARE_WEIGHT(mmxext)
DECLARE_WEIGHT(sse2)
DECLARE_WEIGHT(ssse3)
69 | ||
70 | /** @{ */ | |
/**
 * Define one qpel function.
 * LOOPSIZE must be already set to the number of pixels processed per
 * iteration in the inner loop of the called functions.
 * HCOFF(x) and VCOFF(x) must be already defined so as to provide the
 * offset into any array of coeffs used by the called horizontal and
 * vertical functions, respectively, for the qpel position x.
 *
 * PH and PV are the horizontal and vertical qpel positions. They are
 * compile-time constants, so each instantiation only ever takes one of
 * the three branches below.
 *
 * The expansion is a complete function definition and therefore does not
 * end in a semicolon: an extra ';' at file scope is not valid ISO C.
 */
#define QPEL_FUNC_DECL(OP, SIZE, PH, PV, OPT)                           \
static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst,  \
                                                         const uint8_t *src, \
                                                         ptrdiff_t stride)  \
{                                                                       \
    int i;                                                              \
    if (PH && PV) {                                                     \
        /* Both directions: the horizontal pass always uses the put_ */ \
        /* variant and fills the SIZE x (SIZE + 5) scratch buffer,   */ \
        /* reading from two rows above the block; the vertical pass  */ \
        /* then applies OP while reading from row 2 of the buffer.   */ \
        DECLARE_ALIGNED(16, uint8_t, tmp)[SIZE * (SIZE + 5)];           \
        uint8_t *tmpptr = tmp + SIZE * 2;                               \
        src -= stride * 2;                                              \
                                                                        \
        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
            ff_put_rv40_qpel_h ##OPT(tmp + i, SIZE, src + i, stride,    \
                                     SIZE + 5, HCOFF(PH));              \
        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
            ff_ ##OP ##rv40_qpel_v ##OPT(dst + i, stride, tmpptr + i,   \
                                         SIZE, SIZE, VCOFF(PV));        \
    } else if (PV) {                                                    \
        /* Vertical only: filter straight from src into dst. */         \
        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
            ff_ ##OP ##rv40_qpel_v ## OPT(dst + i, stride, src + i,     \
                                          stride, SIZE, VCOFF(PV));     \
    } else {                                                            \
        /* Horizontal only: filter straight from src into dst. */       \
        for (i = 0; i < SIZE; i += LOOPSIZE)                            \
            ff_ ##OP ##rv40_qpel_h ## OPT(dst + i, stride, src + i,     \
                                          stride, SIZE, HCOFF(PH));     \
    }                                                                   \
}
105 | ||
/** Define the qpel functions of one operation and one qpel position
 *  for both block sizes, 16 and 8. */
#define QPEL_FUNCS_DECL(OP, PH, PV, OPT)                                \
    QPEL_FUNC_DECL(OP, 16, PH, PV, OPT)                                 \
    QPEL_FUNC_DECL(OP,  8, PH, PV, OPT)
111 | ||
/**
 * Declare the external vertical and horizontal filter routines of one
 * operation and instruction set, then define the qpel functions built on
 * them for both block sizes.
 * Positions 0,0, 0,2, 2,0 and 3,3 are not generated by this macro.
 */
#define QPEL_MC_DECL(OP, OPT)                                           \
void ff_ ##OP ##rv40_qpel_v ##OPT(uint8_t *dst, ptrdiff_t dst_stride,   \
                                  const uint8_t *src,                   \
                                  ptrdiff_t src_stride,                 \
                                  int len, int m);                      \
void ff_ ##OP ##rv40_qpel_h ##OPT(uint8_t *dst, ptrdiff_t dst_stride,   \
                                  const uint8_t *src,                   \
                                  ptrdiff_t src_stride,                 \
                                  int len, int m);                      \
    QPEL_FUNCS_DECL(OP, 0, 1, OPT)                                      \
    QPEL_FUNCS_DECL(OP, 0, 3, OPT)                                      \
                                                                        \
    QPEL_FUNCS_DECL(OP, 1, 0, OPT)                                      \
    QPEL_FUNCS_DECL(OP, 1, 1, OPT)                                      \
    QPEL_FUNCS_DECL(OP, 1, 2, OPT)                                      \
    QPEL_FUNCS_DECL(OP, 1, 3, OPT)                                      \
                                                                        \
    QPEL_FUNCS_DECL(OP, 2, 1, OPT)                                      \
    QPEL_FUNCS_DECL(OP, 2, 2, OPT)                                      \
    QPEL_FUNCS_DECL(OP, 2, 3, OPT)                                      \
                                                                        \
    QPEL_FUNCS_DECL(OP, 3, 0, OPT)                                      \
    QPEL_FUNCS_DECL(OP, 3, 1, OPT)                                      \
    QPEL_FUNCS_DECL(OP, 3, 2, OPT)
134 | /** @} */ | |
135 | ||
136 | #define LOOPSIZE 8 | |
137 | #define HCOFF(x) (32 * ((x) - 1)) | |
138 | #define VCOFF(x) (32 * ((x) - 1)) | |
139 | QPEL_MC_DECL(put_, _ssse3) | |
140 | QPEL_MC_DECL(avg_, _ssse3) | |
141 | ||
142 | #undef LOOPSIZE | |
143 | #undef HCOFF | |
144 | #undef VCOFF | |
145 | #define LOOPSIZE 8 | |
146 | #define HCOFF(x) (64 * ((x) - 1)) | |
147 | #define VCOFF(x) (64 * ((x) - 1)) | |
148 | QPEL_MC_DECL(put_, _sse2) | |
149 | QPEL_MC_DECL(avg_, _sse2) | |
150 | ||
151 | #if ARCH_X86_32 | |
152 | #undef LOOPSIZE | |
153 | #undef HCOFF | |
154 | #undef VCOFF | |
155 | #define LOOPSIZE 4 | |
156 | #define HCOFF(x) (64 * ((x) - 1)) | |
157 | #define VCOFF(x) (64 * ((x) - 1)) | |
158 | ||
159 | QPEL_MC_DECL(put_, _mmx) | |
160 | ||
161 | #define ff_put_rv40_qpel_h_mmxext ff_put_rv40_qpel_h_mmx | |
162 | #define ff_put_rv40_qpel_v_mmxext ff_put_rv40_qpel_v_mmx | |
163 | QPEL_MC_DECL(avg_, _mmxext) | |
164 | ||
165 | #define ff_put_rv40_qpel_h_3dnow ff_put_rv40_qpel_h_mmx | |
166 | #define ff_put_rv40_qpel_v_3dnow ff_put_rv40_qpel_v_mmx | |
167 | QPEL_MC_DECL(avg_, _3dnow) | |
168 | #endif | |
169 | ||
170 | /** @{ */ | |
/**
 * Set one function.
 * Stores the qpel function of the given operation, size and position in
 * the OP pixels_tab of the context named c in the expanding scope.
 * The first index is 2 - SIZE / 8 (0 for size 16, 1 for size 8), the
 * second one is 4 * PV + PH.
 */
#define QPEL_FUNC_SET(OP, SIZE, PH, PV, OPT)                            \
    c->OP ## pixels_tab[2 - SIZE / 8][4 * PV + PH] =                    \
        OP ## rv40_qpel ## SIZE ## _mc ## PH ## PV ## OPT;
174 | ||
/** Set the functions of one operation and one qpel position for both
 *  block sizes, 16 and 8. */
#define QPEL_FUNCS_SET(OP, PH, PV, OPT)                                 \
    QPEL_FUNC_SET(OP, 16, PH, PV, OPT)                                  \
    QPEL_FUNC_SET(OP,  8, PH, PV, OPT)
179 | ||
/**
 * Set the functions of one operation and instruction set for both block
 * sizes and every qpel position handled by QPEL_MC_DECL.
 * Entries are listed in increasing table index (4 * PV + PH); each one
 * writes a different slot, so their order does not matter.
 */
#define QPEL_MC_SET(OP, OPT)                                            \
    /* PV = 0 */                                                        \
    QPEL_FUNCS_SET(OP, 1, 0, OPT)                                       \
    QPEL_FUNCS_SET(OP, 3, 0, OPT)                                       \
    /* PV = 1 */                                                        \
    QPEL_FUNCS_SET(OP, 0, 1, OPT)                                       \
    QPEL_FUNCS_SET(OP, 1, 1, OPT)                                       \
    QPEL_FUNCS_SET(OP, 2, 1, OPT)                                       \
    QPEL_FUNCS_SET(OP, 3, 1, OPT)                                       \
    /* PV = 2 */                                                        \
    QPEL_FUNCS_SET(OP, 1, 2, OPT)                                       \
    QPEL_FUNCS_SET(OP, 2, 2, OPT)                                       \
    QPEL_FUNCS_SET(OP, 3, 2, OPT)                                       \
    /* PV = 3 */                                                        \
    QPEL_FUNCS_SET(OP, 0, 3, OPT)                                       \
    QPEL_FUNCS_SET(OP, 1, 3, OPT)                                       \
    QPEL_FUNCS_SET(OP, 2, 3, OPT)
194 | /** @} */ | |
195 | ||
196 | DEFINE_FN(put, 8, ssse3) | |
197 | ||
198 | DEFINE_FN(put, 16, sse2) | |
199 | DEFINE_FN(put, 16, ssse3) | |
200 | ||
201 | DEFINE_FN(avg, 8, mmxext) | |
202 | DEFINE_FN(avg, 8, ssse3) | |
203 | ||
204 | DEFINE_FN(avg, 16, sse2) | |
205 | DEFINE_FN(avg, 16, ssse3) | |
206 | #endif /* HAVE_YASM */ | |
207 | ||
#if HAVE_MMX_INLINE
/* Position 3,3 wrappers for the MMX suffix, compiled only when
 * HAVE_MMX_INLINE is set. */
DEFINE_FN(put,  8, mmx)
DEFINE_FN(put, 16, mmx)
DEFINE_FN(avg,  8, mmx)
DEFINE_FN(avg, 16, mmx)
#endif
214 | ||
215 | av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c) | |
216 | { | |
217 | int cpu_flags = av_get_cpu_flags(); | |
218 | ||
219 | #if HAVE_MMX_INLINE | |
220 | if (INLINE_MMX(cpu_flags)) { | |
221 | c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_mmx; | |
222 | c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_mmx; | |
223 | c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_mmx; | |
224 | c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_mmx; | |
225 | } | |
226 | #endif /* HAVE_MMX_INLINE */ | |
227 | ||
228 | #if HAVE_YASM | |
229 | if (EXTERNAL_MMX(cpu_flags)) { | |
230 | c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx; | |
231 | c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx; | |
232 | #if ARCH_X86_32 | |
233 | QPEL_MC_SET(put_, _mmx) | |
234 | #endif | |
235 | } | |
236 | if (EXTERNAL_AMD3DNOW(cpu_flags)) { | |
237 | c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow; | |
238 | c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow; | |
239 | #if ARCH_X86_32 | |
240 | QPEL_MC_SET(avg_, _3dnow) | |
241 | #endif | |
242 | } | |
243 | if (EXTERNAL_MMXEXT(cpu_flags)) { | |
244 | c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_mmxext; | |
245 | c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmxext; | |
246 | c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmxext; | |
247 | c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmxext; | |
248 | c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmxext; | |
249 | c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmxext; | |
250 | c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmxext; | |
251 | #if ARCH_X86_32 | |
252 | QPEL_MC_SET(avg_, _mmxext) | |
253 | #endif | |
254 | } | |
255 | if (EXTERNAL_SSE2(cpu_flags)) { | |
256 | c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_sse2; | |
257 | c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_sse2; | |
258 | c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2; | |
259 | c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2; | |
260 | c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2; | |
261 | c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2; | |
262 | QPEL_MC_SET(put_, _sse2) | |
263 | QPEL_MC_SET(avg_, _sse2) | |
264 | } | |
265 | if (EXTERNAL_SSSE3(cpu_flags)) { | |
266 | c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_ssse3; | |
267 | c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_ssse3; | |
268 | c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_ssse3; | |
269 | c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_ssse3; | |
270 | c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3; | |
271 | c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3; | |
272 | c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3; | |
273 | c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3; | |
274 | QPEL_MC_SET(put_, _ssse3) | |
275 | QPEL_MC_SET(avg_, _ssse3) | |
276 | } | |
277 | #endif /* HAVE_YASM */ | |
278 | } |