/*
 * VC-1 and WMV3 - DSP functions MMX-optimized
 * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vc1dsp.h"
#include "constants.h"
#include "fpel.h"
#include "vc1dsp.h"

#if HAVE_6REGS && HAVE_INLINE_ASM

#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
/** Add the rounder from mm7 to mm3 and mm4, then shift both into range;
 *  packing to the destination is left to the TRANSFER_* macros below. */
#define NORMALIZE_MMX(SHIFT) \
    "paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \
    "paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \
    "psraw "SHIFT", %%mm3 \n\t" \
    "psraw "SHIFT", %%mm4 \n\t"

#define TRANSFER_DO_PACK(OP) \
    "packuswb %%mm4, %%mm3 \n\t" \
    OP((%2), %%mm3) \
    "movq %%mm3, (%2) \n\t"

#define TRANSFER_DONT_PACK(OP) \
    OP(0(%2), %%mm3) \
    OP(8(%2), %%mm4) \
    "movq %%mm3, 0(%2) \n\t" \
    "movq %%mm4, 8(%2) \n\t"

/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
#define DO_UNPACK(reg)   "punpcklbw %%mm0, " reg "\n\t"
#define DONT_UNPACK(reg)

/** Load the rounder, 32-r or 8-r, and broadcast it to every word of mm7 */
#define LOAD_ROUNDER_MMX(ROUND) \
    "movd "ROUND", %%mm7 \n\t" \
    "punpcklwd %%mm7, %%mm7 \n\t" \
    "punpckldq %%mm7, %%mm7 \n\t"

#define SHIFT2_LINE(OFF, R0,R1,R2,R3) \
    "paddw %%mm"#R2", %%mm"#R1" \n\t" \
    "movd (%0,%3), %%mm"#R0" \n\t" \
    "pmullw %%mm6, %%mm"#R1" \n\t" \
    "punpcklbw %%mm0, %%mm"#R0" \n\t" \
    "movd (%0,%2), %%mm"#R3" \n\t" \
    "psubw %%mm"#R0", %%mm"#R1" \n\t" \
    "punpcklbw %%mm0, %%mm"#R3" \n\t" \
    "paddw %%mm7, %%mm"#R1" \n\t" \
    "psubw %%mm"#R3", %%mm"#R1" \n\t" \
    "psraw %4, %%mm"#R1" \n\t" \
    "movq %%mm"#R1", "#OFF"(%1) \n\t" \
    "add %2, %0 \n\t"

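/*
 * One SHIFT2_LINE invocation produces one 4-pixel-wide row of the vertical
 * (-1,9,9,-1) half-pel filter: R1/R2 hold the two center rows, R0 reloads
 * the row above (via %3 = -2*stride), R3 fetches the row below (via
 * %2 = stride), and the rounder and shift come from mm7 and %4. The
 * R0..R3 roles rotate between invocations, see the caller below.
 */
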
/** Sacrificing mm6 allows pipelining loads from src */
static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
                                       const uint8_t *src, x86_reg stride,
                                       int rnd, int64_t shift)
{
    __asm__ volatile(
        "mov $3, %%"REG_c" \n\t"
        LOAD_ROUNDER_MMX("%5")
        "movq "MANGLE(ff_pw_9)", %%mm6 \n\t"
        "1: \n\t"
        "movd (%0), %%mm2 \n\t"
        "add %2, %0 \n\t"
        "movd (%0), %%mm3 \n\t"
        "punpcklbw %%mm0, %%mm2 \n\t"
        "punpcklbw %%mm0, %%mm3 \n\t"
        SHIFT2_LINE(  0, 1, 2, 3, 4)
        SHIFT2_LINE( 24, 2, 3, 4, 1)
        SHIFT2_LINE( 48, 3, 4, 1, 2)
        SHIFT2_LINE( 72, 4, 1, 2, 3)
        SHIFT2_LINE( 96, 1, 2, 3, 4)
        SHIFT2_LINE(120, 2, 3, 4, 1)
        SHIFT2_LINE(144, 3, 4, 1, 2)
        SHIFT2_LINE(168, 4, 1, 2, 3)
        "sub %6, %0 \n\t"
        "add $8, %1 \n\t"
        "dec %%"REG_c" \n\t"
        "jnz 1b \n\t"
        : "+r"(src), "+r"(dst)
        : "r"(stride), "r"(-2*stride),
          "m"(shift), "m"(rnd), "r"(9*stride-4)
          NAMED_CONSTRAINTS_ADD(ff_pw_9)
        : "%"REG_c, "memory"
    );
}
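
/*
 * The function above is the first, vertical pass of the separable half-pel
 * filter: it covers a 12x8 area in three 4-column strips (hence the outer
 * count of 3 and the "sub %6, %0" rewind of 9*stride-4) and leaves
 * unclipped 16-bit results in dst for the horizontal pass below.
 */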

/**
 * Data is already unpacked, so some operations can be performed directly
 * from memory.
 */
#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
                                             const int16_t *src, int rnd)\
{\
    int h = 8;\
\
    src -= 1;\
    rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
    __asm__ volatile(\
        LOAD_ROUNDER_MMX("%4")\
        "movq "MANGLE(ff_pw_128)", %%mm6\n\t"\
        "movq "MANGLE(ff_pw_9)", %%mm5 \n\t"\
        "1: \n\t"\
        "movq 2*0+0(%1), %%mm1 \n\t"\
        "movq 2*0+8(%1), %%mm2 \n\t"\
        "movq 2*1+0(%1), %%mm3 \n\t"\
        "movq 2*1+8(%1), %%mm4 \n\t"\
        "paddw 2*3+0(%1), %%mm1 \n\t"\
        "paddw 2*3+8(%1), %%mm2 \n\t"\
        "paddw 2*2+0(%1), %%mm3 \n\t"\
        "paddw 2*2+8(%1), %%mm4 \n\t"\
        "pmullw %%mm5, %%mm3 \n\t"\
        "pmullw %%mm5, %%mm4 \n\t"\
        "psubw %%mm1, %%mm3 \n\t"\
        "psubw %%mm2, %%mm4 \n\t"\
        NORMALIZE_MMX("$7")\
        /* Remove bias */\
        "paddw %%mm6, %%mm3 \n\t"\
        "paddw %%mm6, %%mm4 \n\t"\
        TRANSFER_DO_PACK(OP)\
        "add $24, %1 \n\t"\
        "add %3, %2 \n\t"\
        "decl %0 \n\t"\
        "jnz 1b \n\t"\
        : "+r"(h), "+r"(src), "+r"(dst)\
        : "r"(stride), "m"(rnd)\
          NAMED_CONSTRAINTS_ADD(ff_pw_128,ff_pw_9)\
        : "memory"\
    );\
}
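
/*
 * About the bias above: (-1+9+9-1)*1024 = 16384 is folded into the rounder
 * and 16384>>7 = 128 (ff_pw_128) is added back just before packing, so the
 * net effect on the result is zero; the offset keeps the 16-bit
 * intermediates of this second pass within signed range.
 */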

VC1_HOR_16b_SHIFT2(OP_PUT, put_)
VC1_HOR_16b_SHIFT2(OP_AVG, avg_)


/**
 * Purely vertical or horizontal 1/2 shift interpolation.
 * Sacrifice mm6 for the *9 factor.
 */
#define VC1_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
                                     x86_reg stride, int rnd, x86_reg offset)\
{\
    rnd = 8-rnd;\
    __asm__ volatile(\
        "mov $8, %%"REG_c" \n\t"\
        LOAD_ROUNDER_MMX("%5")\
        "movq "MANGLE(ff_pw_9)", %%mm6\n\t"\
        "1: \n\t"\
        "movd 0(%0 ), %%mm3 \n\t"\
        "movd 4(%0 ), %%mm4 \n\t"\
        "movd 0(%0,%2), %%mm1 \n\t"\
        "movd 4(%0,%2), %%mm2 \n\t"\
        "add %2, %0 \n\t"\
        "punpcklbw %%mm0, %%mm3 \n\t"\
        "punpcklbw %%mm0, %%mm4 \n\t"\
        "punpcklbw %%mm0, %%mm1 \n\t"\
        "punpcklbw %%mm0, %%mm2 \n\t"\
        "paddw %%mm1, %%mm3 \n\t"\
        "paddw %%mm2, %%mm4 \n\t"\
        "movd 0(%0,%3), %%mm1 \n\t"\
        "movd 4(%0,%3), %%mm2 \n\t"\
        "pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/\
        "pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/\
        "punpcklbw %%mm0, %%mm1 \n\t"\
        "punpcklbw %%mm0, %%mm2 \n\t"\
        "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/\
        "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/\
        "movd 0(%0,%2), %%mm1 \n\t"\
        "movd 4(%0,%2), %%mm2 \n\t"\
        "punpcklbw %%mm0, %%mm1 \n\t"\
        "punpcklbw %%mm0, %%mm2 \n\t"\
        "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/\
        "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/\
        NORMALIZE_MMX("$4")\
        "packuswb %%mm4, %%mm3 \n\t"\
        OP((%1), %%mm3)\
        "movq %%mm3, (%1) \n\t"\
        "add %6, %0 \n\t"\
        "add %4, %1 \n\t"\
        "dec %%"REG_c" \n\t"\
        "jnz 1b \n\t"\
        : "+r"(src), "+r"(dst)\
        : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
          "g"(stride-offset)\
          NAMED_CONSTRAINTS_ADD(ff_pw_9)\
        : "%"REG_c, "memory"\
    );\
}
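
/*
 * In scalar terms, each output pixel of the shift2 functions is, in effect
 * (o being the offset: stride for vertical filtering, 1 for horizontal):
 *
 *   dst[x] = av_clip_uint8((-src[x-o] + 9*src[x] + 9*src[x+o]
 *                           - src[x+2*o] + 8 - rnd) >> 4);
 */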

VC1_SHIFT2(OP_PUT, put_)
VC1_SHIFT2(OP_AVG, avg_)

/**
 * Core of the 1/4 and 3/4 shift bicubic interpolation.
 *
 * @param UNPACK Macro unpacking arguments from 8 to 16 bits (can be empty).
 * @param MOVQ   "movd 1" or "movq 2", if data read is already unpacked.
 * @param A1     Address of 1st tap (beware of unpacked/packed).
 * @param A2     Address of 2nd tap
 * @param A3     Address of 3rd tap
 * @param A4     Address of 4th tap
 */
#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \
    MOVQ "*0+"A1", %%mm1 \n\t" \
    MOVQ "*4+"A1", %%mm2 \n\t" \
    UNPACK("%%mm1") \
    UNPACK("%%mm2") \
    "pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \
    "pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \
    MOVQ "*0+"A2", %%mm3 \n\t" \
    MOVQ "*4+"A2", %%mm4 \n\t" \
    UNPACK("%%mm3") \
    UNPACK("%%mm4") \
    "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
    "pmullw %%mm6, %%mm4 \n\t" /* *18 */ \
    "psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \
    "psubw %%mm2, %%mm4 \n\t" /* 18,-3 */ \
    MOVQ "*0+"A4", %%mm1 \n\t" \
    MOVQ "*4+"A4", %%mm2 \n\t" \
    UNPACK("%%mm1") \
    UNPACK("%%mm2") \
    "psllw $2, %%mm1 \n\t" /* 4* */ \
    "psllw $2, %%mm2 \n\t" /* 4* */ \
    "psubw %%mm1, %%mm3 \n\t" /* -4,18,-3 */ \
    "psubw %%mm2, %%mm4 \n\t" /* -4,18,-3 */ \
    MOVQ "*0+"A3", %%mm1 \n\t" \
    MOVQ "*4+"A3", %%mm2 \n\t" \
    UNPACK("%%mm1") \
    UNPACK("%%mm2") \
    "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
    "pmullw %%mm5, %%mm2 \n\t" /* *53 */ \
    "paddw %%mm1, %%mm3 \n\t" /* 4,53,18,-3 */ \
    "paddw %%mm2, %%mm4 \n\t" /* 4,53,18,-3 */

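/*
 * The core leaves -3*A1 + 18*A2 + 53*A3 - 4*A4 in mm3/mm4; with the tap
 * order chosen by the instantiations below this is the VC-1 bicubic tap
 * set (-4, 53, 18, -3) for the 1/4 shift and its mirror for the 3/4
 * shift. mm5 and mm6 are reserved for the 53 and 18 factors, while the
 * *3 constant is applied straight from memory.
 */
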
/**
 * Macro to build the vertical 16-bit version of vc1_put_shift[13].
 * Here, offset=src_stride. Parameters passed A1 to A4 must use
 * %3 (src_stride) and %4 (3*src_stride).
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
static void \
vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
                                 x86_reg src_stride, \
                                 int rnd, int64_t shift) \
{ \
    int h = 8; \
    src -= src_stride; \
    __asm__ volatile( \
        LOAD_ROUNDER_MMX("%5") \
        "movq "MANGLE(ff_pw_53)", %%mm5\n\t" \
        "movq "MANGLE(ff_pw_18)", %%mm6\n\t" \
        ".p2align 3 \n\t" \
        "1: \n\t" \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
        NORMALIZE_MMX("%6") \
        TRANSFER_DONT_PACK(OP_PUT) \
        /* Last 3 (in fact 4) bytes on the line */ \
        "movd 8+"A1", %%mm1 \n\t" \
        DO_UNPACK("%%mm1") \
        "movq %%mm1, %%mm3 \n\t" \
        "paddw %%mm1, %%mm1 \n\t" \
        "paddw %%mm3, %%mm1 \n\t" /* 3* */ \
        "movd 8+"A2", %%mm3 \n\t" \
        DO_UNPACK("%%mm3") \
        "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
        "psubw %%mm1, %%mm3 \n\t" /*18,-3 */ \
        "movd 8+"A3", %%mm1 \n\t" \
        DO_UNPACK("%%mm1") \
        "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
        "paddw %%mm1, %%mm3 \n\t" /*53,18,-3 */ \
        "movd 8+"A4", %%mm1 \n\t" \
        DO_UNPACK("%%mm1") \
        "psllw $2, %%mm1 \n\t" /* 4* */ \
        "psubw %%mm1, %%mm3 \n\t" \
        "paddw %%mm7, %%mm3 \n\t" \
        "psraw %6, %%mm3 \n\t" \
        "movq %%mm3, 16(%2) \n\t" \
        "add %3, %1 \n\t" \
        "add $24, %2 \n\t" \
        "decl %0 \n\t" \
        "jnz 1b \n\t" \
        : "+r"(h), "+r"(src), "+r"(dst) \
        : "r"(src_stride), "r"(3*src_stride), \
          "m"(rnd), "m"(shift) \
          NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_53,ff_pw_18) \
        : "memory" \
    ); \
}

/**
 * Macro to build the horizontal 16-bit version of vc1_put_shift[13].
 * Here the source is made of 16-bit words, so the addresses passed as
 * A1 to A4 can be simple constant offsets.
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \
                                       const int16_t *src, int rnd) \
{ \
    int h = 8; \
    src -= 1; \
    rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
    __asm__ volatile( \
        LOAD_ROUNDER_MMX("%4") \
        "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
        "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
        ".p2align 3 \n\t" \
        "1: \n\t" \
        MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \
        NORMALIZE_MMX("$7") \
        /* Remove bias */ \
        "paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \
        "paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \
        TRANSFER_DO_PACK(OP) \
        "add $24, %1 \n\t" \
        "add %3, %2 \n\t" \
        "decl %0 \n\t" \
        "jnz 1b \n\t" \
        : "+r"(h), "+r"(src), "+r"(dst) \
        : "r"(stride), "m"(rnd) \
          NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_18,ff_pw_53,ff_pw_128) \
        : "memory" \
    ); \
}
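
/*
 * Same bias trick as in VC1_HOR_16b_SHIFT2 above: the factor is the tap
 * sum, and -4+58+13-3 = -4+53+18-3 = 64, so 64*256 = 16384 comes out as
 * 128 after the >>7 and is cancelled by the ff_pw_128 adds.
 */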

/**
 * Macro to build the 8-bit, any-direction version of vc1_put_shift[13].
 * Here, offset=src_stride. Parameters passed A1 to A4 must use
 * %3 (offset) and %4 (3*offset).
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
                              x86_reg stride, int rnd, x86_reg offset) \
{ \
    int h = 8; \
    src -= offset; \
    rnd = 32-rnd; \
    __asm__ volatile ( \
        LOAD_ROUNDER_MMX("%6") \
        "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
        "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
        ".p2align 3 \n\t" \
        "1: \n\t" \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
        NORMALIZE_MMX("$6") \
        TRANSFER_DO_PACK(OP) \
        "add %5, %1 \n\t" \
        "add %5, %2 \n\t" \
        "decl %0 \n\t" \
        "jnz 1b \n\t" \
        : "+r"(h), "+r"(src), "+r"(dst) \
        : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \
          NAMED_CONSTRAINTS_ADD(ff_pw_53,ff_pw_18,ff_pw_3) \
        : "memory" \
    ); \
}
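
/*
 * For the shift1 instantiation below, the 8-bit function therefore
 * computes, per pixel (o being the offset):
 *
 *   dst[x] = av_clip_uint8((-4*src[x-o] + 53*src[x] + 18*src[x+o]
 *                           - 3*src[x+2*o] + 32 - rnd) >> 6);
 *
 * and shift3 applies the same taps mirrored.
 */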

/** 1/4 shift bicubic interpolation */
MSPEL_FILTER13_8B     (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_)

/** 3/4 shift bicubic interpolation */
MSPEL_FILTER13_8B     (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)

typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);

/**
 * Interpolate fractional pel values by applying proper vertical then
 * horizontal filter.
 *
 * @param dst    Destination buffer for interpolated pels.
 * @param src    Source buffer.
 * @param stride Stride for both src and dst buffers.
 * @param hmode  Horizontal filter (expressed in quarter pixels shift).
 * @param vmode  Vertical filter.
 * @param rnd    Rounding bias.
 */
#define VC1_MSPEL_MC(OP)\
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
                               int hmode, int vmode, int rnd)\
{\
    static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
        { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
        { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
        { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
\
    __asm__ volatile(\
        "pxor %%mm0, %%mm0 \n\t"\
        ::: "memory"\
    );\
\
    if (vmode) { /* Vertical filter to apply */\
        if (hmode) { /* Horizontal filter to apply, output to tmp */\
            static const int shift_value[] = { 0, 5, 1, 5 };\
            int shift = (shift_value[hmode]+shift_value[vmode])>>1;\
            int r;\
            DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\
\
            r = (1<<(shift-1)) + rnd-1;\
            vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
\
            vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\
            return;\
        }\
        else { /* No horizontal filter, output 8 lines to dst */\
            vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
            return;\
        }\
    }\
\
    /* Horizontal mode with no vertical mode */\
    vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
} \
static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
                                  int stride, int hmode, int vmode, int rnd)\
{ \
    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
    dst += 8*stride; src += 8*stride; \
    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
}
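
/*
 * In the two-pass case above, the vertical pass filters a 12-column,
 * 8-row block of 16-bit intermediates into tmp (the 4-tap horizontal
 * filter needs 11 input columns for 8 outputs, padded to 12), and the
 * horizontal pass then reads from tmp+1 to produce the final 8x8 block.
 * The pxor clears mm0, which every filter relies on as the zero
 * register for unpacking.
 */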

VC1_MSPEL_MC(put_)
VC1_MSPEL_MC(avg_)

/** Macro to ease declaration of the bicubic filter interpolation functions */
#define DECLARE_FUNCTION(a, b) \
static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, \
                                               const uint8_t *src, \
                                               ptrdiff_t stride, \
                                               int rnd) \
{ \
    put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}\
static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst, \
                                                  const uint8_t *src, \
                                                  ptrdiff_t stride, \
                                                  int rnd) \
{ \
    avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}\
static void put_vc1_mspel_mc ## a ## b ## _16_mmx(uint8_t *dst, \
                                                  const uint8_t *src, \
                                                  ptrdiff_t stride, \
                                                  int rnd) \
{ \
    put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
}\
static void avg_vc1_mspel_mc ## a ## b ## _16_mmxext(uint8_t *dst, \
                                                     const uint8_t *src,\
                                                     ptrdiff_t stride, \
                                                     int rnd) \
{ \
    avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
}

DECLARE_FUNCTION(0, 1)
DECLARE_FUNCTION(0, 2)
DECLARE_FUNCTION(0, 3)

DECLARE_FUNCTION(1, 0)
DECLARE_FUNCTION(1, 1)
DECLARE_FUNCTION(1, 2)
DECLARE_FUNCTION(1, 3)

DECLARE_FUNCTION(2, 0)
DECLARE_FUNCTION(2, 1)
DECLARE_FUNCTION(2, 2)
DECLARE_FUNCTION(2, 3)

DECLARE_FUNCTION(3, 0)
DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)

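/*
 * The DC-only inverse transforms below all use the same trick to add a
 * signed DC to unsigned pixels with saturating instructions: mm0 holds
 * packuswb(dc) and mm1 holds packuswb(-dc), so exactly one of the two is
 * nonzero, and paddusb with mm0 followed by psubusb with mm1 applies the
 * DC with the correct sign and clipping.
 */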
static void vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = (17 * dc + 4) >> 3;
    dc = (17 * dc + 64) >> 7;
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movd %0, %%mm2 \n\t"
        "movd %1, %%mm3 \n\t"
        "movd %2, %%mm4 \n\t"
        "movd %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movd %%mm2, %0 \n\t"
        "movd %%mm3, %1 \n\t"
        "movd %%mm4, %2 \n\t"
        "movd %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

static void vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = (17 * dc + 4) >> 3;
    dc = (12 * dc + 64) >> 7;
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movd %0, %%mm2 \n\t"
        "movd %1, %%mm3 \n\t"
        "movd %2, %%mm4 \n\t"
        "movd %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movd %%mm2, %0 \n\t"
        "movd %%mm3, %1 \n\t"
        "movd %%mm4, %2 \n\t"
        "movd %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
    dest += 4*linesize;
    __asm__ volatile(
        "movd %0, %%mm2 \n\t"
        "movd %1, %%mm3 \n\t"
        "movd %2, %%mm4 \n\t"
        "movd %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movd %%mm2, %0 \n\t"
        "movd %%mm3, %1 \n\t"
        "movd %%mm4, %2 \n\t"
        "movd %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

static void vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = ( 3 * dc + 1) >> 1;
    dc = (17 * dc + 64) >> 7;
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    /* movq stores 8 bytes per row, so the constraints must cover 64 bits */
    __asm__ volatile(
        "movq %0, %%mm2 \n\t"
        "movq %1, %%mm3 \n\t"
        "movq %2, %%mm4 \n\t"
        "movq %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movq %%mm2, %0 \n\t"
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %3 \n\t"
        :"+m"(*(uint64_t*)(dest+0*linesize)),
         "+m"(*(uint64_t*)(dest+1*linesize)),
         "+m"(*(uint64_t*)(dest+2*linesize)),
         "+m"(*(uint64_t*)(dest+3*linesize))
    );
}

static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = (3 * dc +  1) >> 1;
    dc = (3 * dc + 16) >> 5;
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    /* movq stores 8 bytes per row, so the constraints must cover 64 bits */
    __asm__ volatile(
        "movq %0, %%mm2 \n\t"
        "movq %1, %%mm3 \n\t"
        "movq %2, %%mm4 \n\t"
        "movq %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movq %%mm2, %0 \n\t"
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %3 \n\t"
        :"+m"(*(uint64_t*)(dest+0*linesize)),
         "+m"(*(uint64_t*)(dest+1*linesize)),
         "+m"(*(uint64_t*)(dest+2*linesize)),
         "+m"(*(uint64_t*)(dest+3*linesize))
    );
    dest += 4*linesize;
    __asm__ volatile(
        "movq %0, %%mm2 \n\t"
        "movq %1, %%mm3 \n\t"
        "movq %2, %%mm4 \n\t"
        "movq %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movq %%mm2, %0 \n\t"
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %3 \n\t"
        :"+m"(*(uint64_t*)(dest+0*linesize)),
         "+m"(*(uint64_t*)(dest+1*linesize)),
         "+m"(*(uint64_t*)(dest+2*linesize)),
         "+m"(*(uint64_t*)(dest+3*linesize))
    );
}

#if HAVE_MMX_EXTERNAL
static void put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t stride, int rnd)
{
    ff_put_pixels8_mmx(dst, src, stride, 8);
}
static void put_vc1_mspel_mc00_16_mmx(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride, int rnd)
{
    ff_put_pixels16_mmx(dst, src, stride, 16);
}
static void avg_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t stride, int rnd)
{
    ff_avg_pixels8_mmx(dst, src, stride, 8);
}
static void avg_vc1_mspel_mc00_16_mmx(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride, int rnd)
{
    ff_avg_pixels16_mmx(dst, src, stride, 16);
}
#endif

#define FN_ASSIGN(OP, X, Y, INSN) \
    dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = OP##vc1_mspel_mc##X##Y##INSN; \
    dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = OP##vc1_mspel_mc##X##Y##_16##INSN
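
/*
 * vc1_mspel_pixels_tab[1] holds the 8x8 variants and [0] the 16x16 ones
 * (note the _16 suffix), both indexed by hmode + 4*vmode.
 */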

av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
{
#if HAVE_MMX_EXTERNAL
    FN_ASSIGN(put_, 0, 0, _mmx);
    FN_ASSIGN(avg_, 0, 0, _mmx);
#endif
    FN_ASSIGN(put_, 0, 1, _mmx);
    FN_ASSIGN(put_, 0, 2, _mmx);
    FN_ASSIGN(put_, 0, 3, _mmx);

    FN_ASSIGN(put_, 1, 0, _mmx);
    FN_ASSIGN(put_, 1, 1, _mmx);
    FN_ASSIGN(put_, 1, 2, _mmx);
    FN_ASSIGN(put_, 1, 3, _mmx);

    FN_ASSIGN(put_, 2, 0, _mmx);
    FN_ASSIGN(put_, 2, 1, _mmx);
    FN_ASSIGN(put_, 2, 2, _mmx);
    FN_ASSIGN(put_, 2, 3, _mmx);

    FN_ASSIGN(put_, 3, 0, _mmx);
    FN_ASSIGN(put_, 3, 1, _mmx);
    FN_ASSIGN(put_, 3, 2, _mmx);
    FN_ASSIGN(put_, 3, 3, _mmx);
}

av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
{
    FN_ASSIGN(avg_, 0, 1, _mmxext);
    FN_ASSIGN(avg_, 0, 2, _mmxext);
    FN_ASSIGN(avg_, 0, 3, _mmxext);

    FN_ASSIGN(avg_, 1, 0, _mmxext);
    FN_ASSIGN(avg_, 1, 1, _mmxext);
    FN_ASSIGN(avg_, 1, 2, _mmxext);
    FN_ASSIGN(avg_, 1, 3, _mmxext);

    FN_ASSIGN(avg_, 2, 0, _mmxext);
    FN_ASSIGN(avg_, 2, 1, _mmxext);
    FN_ASSIGN(avg_, 2, 2, _mmxext);
    FN_ASSIGN(avg_, 2, 3, _mmxext);

    FN_ASSIGN(avg_, 3, 0, _mmxext);
    FN_ASSIGN(avg_, 3, 1, _mmxext);
    FN_ASSIGN(avg_, 3, 2, _mmxext);
    FN_ASSIGN(avg_, 3, 3, _mmxext);

    dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext;
    dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext;
    dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
    dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
}
#endif /* HAVE_6REGS && HAVE_INLINE_ASM */