/*
 * VC-1 and WMV3 - DSP functions MMX-optimized
 * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vc1dsp.h"
#include "constants.h"
#include "fpel.h"
#include "vc1dsp.h"

#if HAVE_6REGS && HAVE_INLINE_ASM

#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"

/** Add the rounder in mm7 to mm3 and mm4, then shift right to normalize */
#define NORMALIZE_MMX(SHIFT) \
    "paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \
    "paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \
    "psraw "SHIFT", %%mm3 \n\t" \
    "psraw "SHIFT", %%mm4 \n\t"

#define TRANSFER_DO_PACK(OP) \
    "packuswb %%mm4, %%mm3 \n\t" \
    OP((%2), %%mm3) \
    "movq %%mm3, (%2) \n\t"

#define TRANSFER_DONT_PACK(OP) \
    OP(0(%2), %%mm3) \
    OP(8(%2), %%mm4) \
    "movq %%mm3, 0(%2) \n\t" \
    "movq %%mm4, 8(%2) \n\t"

/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
#define DO_UNPACK(reg) "punpcklbw %%mm0, " reg "\n\t"
#define DONT_UNPACK(reg)

/** Compute the rounder 32-r or 8-r and unpack it into mm7 */
#define LOAD_ROUNDER_MMX(ROUND) \
    "movd "ROUND", %%mm7 \n\t" \
    "punpcklwd %%mm7, %%mm7 \n\t" \
    "punpckldq %%mm7, %%mm7 \n\t"

#define SHIFT2_LINE(OFF, R0,R1,R2,R3) \
    "paddw %%mm"#R2", %%mm"#R1" \n\t" \
    "movd (%0,%3), %%mm"#R0" \n\t" \
    "pmullw %%mm6, %%mm"#R1" \n\t" \
    "punpcklbw %%mm0, %%mm"#R0" \n\t" \
    "movd (%0,%2), %%mm"#R3" \n\t" \
    "psubw %%mm"#R0", %%mm"#R1" \n\t" \
    "punpcklbw %%mm0, %%mm"#R3" \n\t" \
    "paddw %%mm7, %%mm"#R1" \n\t" \
    "psubw %%mm"#R3", %%mm"#R1" \n\t" \
    "psraw %4, %%mm"#R1" \n\t" \
    "movq %%mm"#R1", "#OFF"(%1) \n\t" \
    "add %2, %0 \n\t"

/** Sacrificing mm6 makes it possible to pipeline loads from src */
static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
                                       const uint8_t *src, x86_reg stride,
                                       int rnd, int64_t shift)
{
    __asm__ volatile(
        "mov $3, %%"REG_c" \n\t"
        LOAD_ROUNDER_MMX("%5")
        "movq "MANGLE(ff_pw_9)", %%mm6 \n\t"
        "1: \n\t"
        "movd (%0), %%mm2 \n\t"
        "add %2, %0 \n\t"
        "movd (%0), %%mm3 \n\t"
        "punpcklbw %%mm0, %%mm2 \n\t"
        "punpcklbw %%mm0, %%mm3 \n\t"
        SHIFT2_LINE(  0, 1, 2, 3, 4)
        SHIFT2_LINE( 24, 2, 3, 4, 1)
        SHIFT2_LINE( 48, 3, 4, 1, 2)
        SHIFT2_LINE( 72, 4, 1, 2, 3)
        SHIFT2_LINE( 96, 1, 2, 3, 4)
        SHIFT2_LINE(120, 2, 3, 4, 1)
        SHIFT2_LINE(144, 3, 4, 1, 2)
        SHIFT2_LINE(168, 4, 1, 2, 3)
        "sub %6, %0 \n\t"
        "add $8, %1 \n\t"
        "dec %%"REG_c" \n\t"
        "jnz 1b \n\t"
        : "+r"(src), "+r"(dst)
        : "r"(stride), "r"(-2*stride),
          "m"(shift), "m"(rnd), "r"(9*stride-4)
          NAMED_CONSTRAINTS_ADD(ff_pw_9)
        : "%"REG_c, "memory"
    );
}
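
/* In scalar terms, each of the 8 output rows produced above is the 16-bit
 * intermediate
 *
 *     dst[x] = (-src[x - stride] + 9*src[x] + 9*src[x + stride]
 *               - src[x + 2*stride] + r) >> shift;
 *
 * computed for 12 columns (three passes of 4), with rows stored 24 bytes
 * apart; r is the rnd argument, broadcast into mm7.
 */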

/**
 * The data is already unpacked, so some operations can be performed directly
 * from memory.
 */
#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
                                             const int16_t *src, int rnd)\
{\
    int h = 8;\
\
    src -= 1;\
    rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
    __asm__ volatile(\
        LOAD_ROUNDER_MMX("%4")\
        "movq "MANGLE(ff_pw_128)", %%mm6\n\t"\
        "movq "MANGLE(ff_pw_9)", %%mm5 \n\t"\
        "1: \n\t"\
        "movq 2*0+0(%1), %%mm1 \n\t"\
        "movq 2*0+8(%1), %%mm2 \n\t"\
        "movq 2*1+0(%1), %%mm3 \n\t"\
        "movq 2*1+8(%1), %%mm4 \n\t"\
        "paddw 2*3+0(%1), %%mm1 \n\t"\
        "paddw 2*3+8(%1), %%mm2 \n\t"\
        "paddw 2*2+0(%1), %%mm3 \n\t"\
        "paddw 2*2+8(%1), %%mm4 \n\t"\
        "pmullw %%mm5, %%mm3 \n\t"\
        "pmullw %%mm5, %%mm4 \n\t"\
        "psubw %%mm1, %%mm3 \n\t"\
        "psubw %%mm2, %%mm4 \n\t"\
        NORMALIZE_MMX("$7")\
        /* Remove bias */\
        "paddw %%mm6, %%mm3 \n\t"\
        "paddw %%mm6, %%mm4 \n\t"\
        TRANSFER_DO_PACK(OP)\
        "add $24, %1 \n\t"\
        "add %3, %2 \n\t"\
        "decl %0 \n\t"\
        "jnz 1b \n\t"\
        : "+r"(h), "+r" (src), "+r" (dst)\
        : "r"(stride), "m"(rnd)\
          NAMED_CONSTRAINTS_ADD(ff_pw_128,ff_pw_9)\
        : "memory"\
    );\
}

VC1_HOR_16b_SHIFT2(OP_PUT, put_)
VC1_HOR_16b_SHIFT2(OP_AVG, avg_)
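
/* Bias bookkeeping in the horizontal pass above: rnd is lowered by
 * (-1+9+9-1)*1024 = 16384 before the loop, and 16384 >> 7 is exactly the 128
 * that the paddw of ff_pw_128 adds back after NORMALIZE_MMX("$7"), right
 * before packing to unsigned bytes. The offset appears to be there to keep
 * the 16-bit intermediate sums in a range where the arithmetic shift still
 * yields the correct result.
 */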


/**
 * Purely vertical or horizontal 1/2 shift interpolation.
 * mm6 is sacrificed for the *9 factor.
 */
#define VC1_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
                                     x86_reg stride, int rnd, x86_reg offset)\
{\
    rnd = 8-rnd;\
    __asm__ volatile(\
        "mov $8, %%"REG_c" \n\t"\
        LOAD_ROUNDER_MMX("%5")\
        "movq "MANGLE(ff_pw_9)", %%mm6\n\t"\
        "1: \n\t"\
        "movd 0(%0   ), %%mm3 \n\t"\
        "movd 4(%0   ), %%mm4 \n\t"\
        "movd 0(%0,%2), %%mm1 \n\t"\
        "movd 4(%0,%2), %%mm2 \n\t"\
        "add %2, %0 \n\t"\
        "punpcklbw %%mm0, %%mm3 \n\t"\
        "punpcklbw %%mm0, %%mm4 \n\t"\
        "punpcklbw %%mm0, %%mm1 \n\t"\
        "punpcklbw %%mm0, %%mm2 \n\t"\
        "paddw %%mm1, %%mm3 \n\t"\
        "paddw %%mm2, %%mm4 \n\t"\
        "movd 0(%0,%3), %%mm1 \n\t"\
        "movd 4(%0,%3), %%mm2 \n\t"\
        "pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/\
        "pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/\
        "punpcklbw %%mm0, %%mm1 \n\t"\
        "punpcklbw %%mm0, %%mm2 \n\t"\
        "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/\
        "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/\
        "movd 0(%0,%2), %%mm1 \n\t"\
        "movd 4(%0,%2), %%mm2 \n\t"\
        "punpcklbw %%mm0, %%mm1 \n\t"\
        "punpcklbw %%mm0, %%mm2 \n\t"\
        "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/\
        "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/\
        NORMALIZE_MMX("$4")\
        "packuswb %%mm4, %%mm3 \n\t"\
        OP((%1), %%mm3)\
        "movq %%mm3, (%1) \n\t"\
        "add %6, %0 \n\t"\
        "add %4, %1 \n\t"\
        "dec %%"REG_c" \n\t"\
        "jnz 1b \n\t"\
        : "+r"(src), "+r"(dst)\
        : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
          "g"(stride-offset)\
          NAMED_CONSTRAINTS_ADD(ff_pw_9)\
        : "%"REG_c, "memory"\
    );\
}

VC1_SHIFT2(OP_PUT, put_)
VC1_SHIFT2(OP_AVG, avg_)
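
/* In scalar terms, each expansion of VC1_SHIFT2 writes an 8x8 block using
 * the half-pel filter (-1, 9, 9, -1), applied vertically (offset == stride)
 * or horizontally (offset == 1):
 *
 *     dst[x] = av_clip_uint8((-src[x - offset] + 9*src[x] + 9*src[x + offset]
 *                             - src[x + 2*offset] + 8 - rnd) >> 4);
 *
 * The avg_ variant additionally averages the result with dst via pavgb.
 */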

/**
 * Core of the 1/4 and 3/4 shift bicubic interpolation.
 *
 * @param UNPACK Macro unpacking arguments from 8 to 16 bits (can be empty).
 * @param MOVQ   "movd 1" when the data still has to be unpacked, or "movq 2"
 *               when the data read is already unpacked.
 * @param A1     Address of 1st tap (beware of unpacked/packed).
 * @param A2     Address of 2nd tap
 * @param A3     Address of 3rd tap
 * @param A4     Address of 4th tap
 */
#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \
    MOVQ "*0+"A1", %%mm1 \n\t" \
    MOVQ "*4+"A1", %%mm2 \n\t" \
    UNPACK("%%mm1") \
    UNPACK("%%mm2") \
    "pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \
    "pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \
    MOVQ "*0+"A2", %%mm3 \n\t" \
    MOVQ "*4+"A2", %%mm4 \n\t" \
    UNPACK("%%mm3") \
    UNPACK("%%mm4") \
    "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
    "pmullw %%mm6, %%mm4 \n\t" /* *18 */ \
    "psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \
    "psubw %%mm2, %%mm4 \n\t" /* 18,-3 */ \
    MOVQ "*0+"A4", %%mm1 \n\t" \
    MOVQ "*4+"A4", %%mm2 \n\t" \
    UNPACK("%%mm1") \
    UNPACK("%%mm2") \
    "psllw $2, %%mm1 \n\t" /* 4* */ \
    "psllw $2, %%mm2 \n\t" /* 4* */ \
    "psubw %%mm1, %%mm3 \n\t" /* -4,18,-3 */ \
    "psubw %%mm2, %%mm4 \n\t" /* -4,18,-3 */ \
    MOVQ "*0+"A3", %%mm1 \n\t" \
    MOVQ "*4+"A3", %%mm2 \n\t" \
    UNPACK("%%mm1") \
    UNPACK("%%mm2") \
    "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
    "pmullw %%mm5, %%mm2 \n\t" /* *53 */ \
    "paddw %%mm1, %%mm3 \n\t" /* 4,53,18,-3 */ \
    "paddw %%mm2, %%mm4 \n\t" /* 4,53,18,-3 */

/**
 * Macro to build the vertical 16-bit version of vc1_put_shift[13].
 * Here, offset=src_stride. Parameters passed A1 to A4 must use
 * %3 (src_stride) and %4 (3*src_stride).
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
static void \
vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
                                 x86_reg src_stride, \
                                 int rnd, int64_t shift) \
{ \
    int h = 8; \
    src -= src_stride; \
    __asm__ volatile( \
        LOAD_ROUNDER_MMX("%5") \
        "movq "MANGLE(ff_pw_53)", %%mm5\n\t" \
        "movq "MANGLE(ff_pw_18)", %%mm6\n\t" \
        ".p2align 3 \n\t" \
        "1: \n\t" \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
        NORMALIZE_MMX("%6") \
        TRANSFER_DONT_PACK(OP_PUT) \
        /* Last 3 (in fact 4) bytes on the line */ \
        "movd 8+"A1", %%mm1 \n\t" \
        DO_UNPACK("%%mm1") \
        "movq %%mm1, %%mm3 \n\t" \
        "paddw %%mm1, %%mm1 \n\t" \
        "paddw %%mm3, %%mm1 \n\t" /* 3* */ \
        "movd 8+"A2", %%mm3 \n\t" \
        DO_UNPACK("%%mm3") \
        "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
        "psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \
        "movd 8+"A3", %%mm1 \n\t" \
        DO_UNPACK("%%mm1") \
        "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
        "paddw %%mm1, %%mm3 \n\t" /* 53,18,-3 */ \
        "movd 8+"A4", %%mm1 \n\t" \
        DO_UNPACK("%%mm1") \
        "psllw $2, %%mm1 \n\t" /* 4* */ \
        "psubw %%mm1, %%mm3 \n\t" \
        "paddw %%mm7, %%mm3 \n\t" \
        "psraw %6, %%mm3 \n\t" \
        "movq %%mm3, 16(%2) \n\t" \
        "add %3, %1 \n\t" \
        "add $24, %2 \n\t" \
        "decl %0 \n\t" \
        "jnz 1b \n\t" \
        : "+r"(h), "+r" (src), "+r" (dst) \
        : "r"(src_stride), "r"(3*src_stride), \
          "m"(rnd), "m"(shift) \
          NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_53,ff_pw_18) \
        : "memory" \
    ); \
}

/**
 * Macro to build the horizontal 16-bit version of vc1_put_shift[13].
 * Here the offset between taps is one 16-bit element, so the parameters
 * passed as A1 to A4 should be simple.
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \
                                       const int16_t *src, int rnd) \
{ \
    int h = 8; \
    src -= 1; \
    rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
    __asm__ volatile( \
        LOAD_ROUNDER_MMX("%4") \
        "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
        "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
        ".p2align 3 \n\t" \
        "1: \n\t" \
        MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \
        NORMALIZE_MMX("$7") \
        /* Remove bias */ \
        "paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \
        "paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \
        TRANSFER_DO_PACK(OP) \
        "add $24, %1 \n\t" \
        "add %3, %2 \n\t" \
        "decl %0 \n\t" \
        "jnz 1b \n\t" \
        : "+r"(h), "+r" (src), "+r" (dst) \
        : "r"(stride), "m"(rnd) \
          NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_18,ff_pw_53,ff_pw_128) \
        : "memory" \
    ); \
}

/**
 * Macro to build the 8-bit, any-direction version of vc1_put_shift[13].
 * Here, offset=src_stride. Parameters passed A1 to A4 must use
 * %3 (offset) and %4 (3*offset).
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
                              x86_reg stride, int rnd, x86_reg offset) \
{ \
    int h = 8; \
    src -= offset; \
    rnd = 32-rnd; \
    __asm__ volatile ( \
        LOAD_ROUNDER_MMX("%6") \
        "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
        "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
        ".p2align 3 \n\t" \
        "1: \n\t" \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
        NORMALIZE_MMX("$6") \
        TRANSFER_DO_PACK(OP) \
        "add %5, %1 \n\t" \
        "add %5, %2 \n\t" \
        "decl %0 \n\t" \
        "jnz 1b \n\t" \
        : "+r"(h), "+r" (src), "+r" (dst) \
        : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \
          NAMED_CONSTRAINTS_ADD(ff_pw_53,ff_pw_18,ff_pw_3) \
        : "memory" \
    ); \
}

/** 1/4 shift bicubic interpolation */
MSPEL_FILTER13_8B     (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_)

/** 3/4 shift bicubic interpolation */
MSPEL_FILTER13_8B     (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)

typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);

/**
 * Interpolate fractional-pel values by applying the proper vertical filter
 * followed by the horizontal one.
 *
 * @param dst    Destination buffer for interpolated pels.
 * @param src    Source buffer.
 * @param stride Stride for both src and dst buffers.
 * @param hmode  Horizontal filter (expressed as a quarter-pel shift).
 * @param vmode  Vertical filter (expressed as a quarter-pel shift).
 * @param rnd    Rounding bias.
 */
#define VC1_MSPEL_MC(OP)\
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
                               int hmode, int vmode, int rnd)\
{\
    static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
         { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
         { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
         { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
\
    __asm__ volatile(\
        "pxor %%mm0, %%mm0 \n\t"\
        ::: "memory"\
    );\
\
    if (vmode) { /* Vertical filter to apply */\
        if (hmode) { /* Horizontal filter to apply, output to tmp */\
            static const int shift_value[] = { 0, 5, 1, 5 };\
            int shift = (shift_value[hmode]+shift_value[vmode])>>1;\
            int r;\
            DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\
\
            r = (1<<(shift-1)) + rnd-1;\
            vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
\
            vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\
            return;\
        }\
        else { /* No horizontal filter, output 8 lines to dst */\
            vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
            return;\
        }\
    }\
\
    /* Horizontal mode with no vertical mode */\
    vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
} \
static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
                                  int stride, int hmode, int vmode, int rnd)\
{ \
    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
    dst += 8*stride; src += 8*stride; \
    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
}

VC1_MSPEL_MC(put_)
VC1_MSPEL_MC(avg_)
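
/* Dispatch summary for {put_,avg_}vc1_mspel_mc: hmode and vmode are
 * quarter-pel shifts (0..3). When both are non-zero, the vertical filter
 * first writes an 8-row, 11-column (stored as 12) block of 16-bit
 * intermediates into tmp, rows 24 bytes apart, and the horizontal filter
 * then filters and packs it into dst. When only one of them is set, the
 * direct 8-bit filter is used with offset == stride (vertical) or
 * offset == 1 (horizontal); the 0,0 case is handled by the mc00 functions.
 */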

/** Macro to ease the declaration of the bicubic filter interpolation functions */
#define DECLARE_FUNCTION(a, b) \
static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, \
                                               const uint8_t *src, \
                                               ptrdiff_t stride, \
                                               int rnd) \
{ \
     put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}\
static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst, \
                                                  const uint8_t *src, \
                                                  ptrdiff_t stride, \
                                                  int rnd) \
{ \
     avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}\
static void put_vc1_mspel_mc ## a ## b ## _16_mmx(uint8_t *dst, \
                                                  const uint8_t *src, \
                                                  ptrdiff_t stride, \
                                                  int rnd) \
{ \
     put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
}\
static void avg_vc1_mspel_mc ## a ## b ## _16_mmxext(uint8_t *dst, \
                                                     const uint8_t *src,\
                                                     ptrdiff_t stride, \
                                                     int rnd) \
{ \
     avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
}

DECLARE_FUNCTION(0, 1)
DECLARE_FUNCTION(0, 2)
DECLARE_FUNCTION(0, 3)

DECLARE_FUNCTION(1, 0)
DECLARE_FUNCTION(1, 1)
DECLARE_FUNCTION(1, 2)
DECLARE_FUNCTION(1, 3)

DECLARE_FUNCTION(2, 0)
DECLARE_FUNCTION(2, 1)
DECLARE_FUNCTION(2, 2)
DECLARE_FUNCTION(2, 3)

DECLARE_FUNCTION(3, 0)
DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)
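
/* For illustration, DECLARE_FUNCTION(1, 2) defines put_vc1_mspel_mc12_mmx,
 * avg_vc1_mspel_mc12_mmxext, put_vc1_mspel_mc12_16_mmx and
 * avg_vc1_mspel_mc12_16_mmxext, each forwarding to the generic dispatcher
 * with hmode = 1 and vmode = 2.
 */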

static void vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = (17 * dc + 4) >> 3;
    dc = (17 * dc + 64) >> 7;
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movd %0, %%mm2 \n\t"
        "movd %1, %%mm3 \n\t"
        "movd %2, %%mm4 \n\t"
        "movd %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movd %%mm2, %0 \n\t"
        "movd %%mm3, %1 \n\t"
        "movd %%mm4, %2 \n\t"
        "movd %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}
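
/* Scalar equivalent of the DC-only add above (the same pattern is used by
 * the other *_dc functions below): after the two scaling steps, every pixel
 * of the block becomes
 *
 *     dest[y*linesize + x] = av_clip_uint8(dest[y*linesize + x] + dc);
 *
 * A negative dc needs no signed saturation: it is split into a positive part
 * in mm0 and its negation in mm1 (one of the two packs to zero), then applied
 * with unsigned saturating add and subtract.
 */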

static void vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = (17 * dc + 4) >> 3;
    dc = (12 * dc + 64) >> 7;
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movd %0, %%mm2 \n\t"
        "movd %1, %%mm3 \n\t"
        "movd %2, %%mm4 \n\t"
        "movd %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movd %%mm2, %0 \n\t"
        "movd %%mm3, %1 \n\t"
        "movd %%mm4, %2 \n\t"
        "movd %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
    dest += 4*linesize;
    __asm__ volatile(
        "movd %0, %%mm2 \n\t"
        "movd %1, %%mm3 \n\t"
        "movd %2, %%mm4 \n\t"
        "movd %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movd %%mm2, %0 \n\t"
        "movd %%mm3, %1 \n\t"
        "movd %%mm4, %2 \n\t"
        "movd %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

static void vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = ( 3 * dc + 1) >> 1;
    dc = (17 * dc + 64) >> 7;
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movq %0, %%mm2 \n\t"
        "movq %1, %%mm3 \n\t"
        "movq %2, %%mm4 \n\t"
        "movq %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movq %%mm2, %0 \n\t"
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = (3 * dc + 1) >> 1;
    dc = (3 * dc + 16) >> 5;
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movq %0, %%mm2 \n\t"
        "movq %1, %%mm3 \n\t"
        "movq %2, %%mm4 \n\t"
        "movq %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movq %%mm2, %0 \n\t"
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
    dest += 4*linesize;
    __asm__ volatile(
        "movq %0, %%mm2 \n\t"
        "movq %1, %%mm3 \n\t"
        "movq %2, %%mm4 \n\t"
        "movq %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movq %%mm2, %0 \n\t"
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

#if HAVE_MMX_EXTERNAL
static void put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t stride, int rnd)
{
    ff_put_pixels8_mmx(dst, src, stride, 8);
}
static void put_vc1_mspel_mc00_16_mmx(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride, int rnd)
{
    ff_put_pixels16_mmx(dst, src, stride, 16);
}
static void avg_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t stride, int rnd)
{
    ff_avg_pixels8_mmx(dst, src, stride, 8);
}
static void avg_vc1_mspel_mc00_16_mmx(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride, int rnd)
{
    ff_avg_pixels16_mmx(dst, src, stride, 16);
}
#endif

#define FN_ASSIGN(OP, X, Y, INSN) \
    dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = OP##vc1_mspel_mc##X##Y##INSN; \
    dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = OP##vc1_mspel_mc##X##Y##_16##INSN
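
/* vc1_mspel_pixels_tab[1] holds the 8x8 functions and vc1_mspel_pixels_tab[0]
 * the 16x16 (_16) ones, indexed by hmode + 4*vmode; FN_ASSIGN fills both
 * entries for a given (X, Y) pair at once.
 */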

av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
{
#if HAVE_MMX_EXTERNAL
    FN_ASSIGN(put_, 0, 0, _mmx);
    FN_ASSIGN(avg_, 0, 0, _mmx);
#endif
    FN_ASSIGN(put_, 0, 1, _mmx);
    FN_ASSIGN(put_, 0, 2, _mmx);
    FN_ASSIGN(put_, 0, 3, _mmx);

    FN_ASSIGN(put_, 1, 0, _mmx);
    FN_ASSIGN(put_, 1, 1, _mmx);
    FN_ASSIGN(put_, 1, 2, _mmx);
    FN_ASSIGN(put_, 1, 3, _mmx);

    FN_ASSIGN(put_, 2, 0, _mmx);
    FN_ASSIGN(put_, 2, 1, _mmx);
    FN_ASSIGN(put_, 2, 2, _mmx);
    FN_ASSIGN(put_, 2, 3, _mmx);

    FN_ASSIGN(put_, 3, 0, _mmx);
    FN_ASSIGN(put_, 3, 1, _mmx);
    FN_ASSIGN(put_, 3, 2, _mmx);
    FN_ASSIGN(put_, 3, 3, _mmx);
}

av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
{
    FN_ASSIGN(avg_, 0, 1, _mmxext);
    FN_ASSIGN(avg_, 0, 2, _mmxext);
    FN_ASSIGN(avg_, 0, 3, _mmxext);

    FN_ASSIGN(avg_, 1, 0, _mmxext);
    FN_ASSIGN(avg_, 1, 1, _mmxext);
    FN_ASSIGN(avg_, 1, 2, _mmxext);
    FN_ASSIGN(avg_, 1, 3, _mmxext);

    FN_ASSIGN(avg_, 2, 0, _mmxext);
    FN_ASSIGN(avg_, 2, 1, _mmxext);
    FN_ASSIGN(avg_, 2, 2, _mmxext);
    FN_ASSIGN(avg_, 2, 3, _mmxext);

    FN_ASSIGN(avg_, 3, 0, _mmxext);
    FN_ASSIGN(avg_, 3, 1, _mmxext);
    FN_ASSIGN(avg_, 3, 2, _mmxext);
    FN_ASSIGN(avg_, 3, 3, _mmxext);

    dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext;
    dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext;
    dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
    dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
}
#endif /* HAVE_6REGS && HAVE_INLINE_ASM */