| 1 | /* |
| 2 | * VC-1 and WMV3 - DSP functions MMX-optimized |
| 3 | * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr> |
| 4 | * |
| 5 | * Permission is hereby granted, free of charge, to any person |
| 6 | * obtaining a copy of this software and associated documentation |
| 7 | * files (the "Software"), to deal in the Software without |
| 8 | * restriction, including without limitation the rights to use, |
| 9 | * copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 10 | * copies of the Software, and to permit persons to whom the |
| 11 | * Software is furnished to do so, subject to the following |
| 12 | * conditions: |
| 13 | * |
| 14 | * The above copyright notice and this permission notice shall be |
| 15 | * included in all copies or substantial portions of the Software. |
| 16 | * |
| 17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
| 18 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |
| 19 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
| 20 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT |
| 21 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
| 22 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| 23 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
| 24 | * OTHER DEALINGS IN THE SOFTWARE. |
| 25 | */ |
| 26 | |
| 27 | #include "libavutil/cpu.h" |
| 28 | #include "libavutil/mem.h" |
| 29 | #include "libavutil/x86/asm.h" |
| 30 | #include "libavutil/x86/cpu.h" |
| 31 | #include "libavcodec/vc1dsp.h" |
| 32 | #include "constants.h" |
| 33 | #include "fpel.h" |
| 34 | #include "vc1dsp.h" |
| 35 | |
| 36 | #if HAVE_6REGS && HAVE_INLINE_ASM |
| 37 | |
/* OP_PUT expands to nothing (the following movq performs the store);
 * OP_AVG blends the computed pixels in D with the bytes already at S
 * using the rounded pavgb average, for the avg_ motion-comp variants. */
#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"

/** Add rounder from mm7 to mm3 and pack result at destination */
/* (mm4 gets the same treatment; SHIFT is an asm operand string, e.g. "$7") */
#define NORMALIZE_MMX(SHIFT) \
"paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \
"paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \
"psraw "SHIFT", %%mm3 \n\t" \
"psraw "SHIFT", %%mm4 \n\t"

/* Pack the 8 words in mm3/mm4 to unsigned bytes and store them at (%2);
 * OP may first blend with the existing destination (OP_AVG). */
#define TRANSFER_DO_PACK(OP) \
"packuswb %%mm4, %%mm3 \n\t" \
OP((%2), %%mm3) \
"movq %%mm3, (%2) \n\t"

/* Store mm3/mm4 as 16 bytes of 16-bit data at (%2), without packing. */
#define TRANSFER_DONT_PACK(OP) \
OP(0(%2), %%mm3) \
OP(8(%2), %%mm4) \
"movq %%mm3, 0(%2) \n\t" \
"movq %%mm4, 8(%2) \n\t"

/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
#define DO_UNPACK(reg) "punpcklbw %%mm0, " reg "\n\t"
#define DONT_UNPACK(reg)

/** Load the rounder (e.g. 32-r or 8-r) and broadcast it to all 4 words of mm7 */
#define LOAD_ROUNDER_MMX(ROUND) \
"movd "ROUND", %%mm7 \n\t" \
"punpcklwd %%mm7, %%mm7 \n\t" \
"punpckldq %%mm7, %%mm7 \n\t"
| 68 | |
/*
 * Emit one output row of the vertical shift2 filter and advance src.
 * As used by vc1_put_ver_16b_shift2_mmx (%2 = stride, %3 = -2*stride,
 * %4 = shift, mm6 = 9, mm7 = rounder, mm0 = 0):
 * mm<R1> holds tap b on entry; tap c (mm<R2>) is added, the sum scaled
 * by 9, then the outer taps a (reloaded into mm<R0> from two rows up)
 * and d (mm<R3>, next row) are subtracted; the rounder is added, the
 * value shifted right, and 4 words stored to OFF(%1).
 * Register roles rotate across consecutive invocations so loads pipeline.
 */
#define SHIFT2_LINE(OFF, R0,R1,R2,R3) \
"paddw %%mm"#R2", %%mm"#R1" \n\t" \
"movd (%0,%3), %%mm"#R0" \n\t" \
"pmullw %%mm6, %%mm"#R1" \n\t" \
"punpcklbw %%mm0, %%mm"#R0" \n\t" \
"movd (%0,%2), %%mm"#R3" \n\t" \
"psubw %%mm"#R0", %%mm"#R1" \n\t" \
"punpcklbw %%mm0, %%mm"#R3" \n\t" \
"paddw %%mm7, %%mm"#R1" \n\t" \
"psubw %%mm"#R3", %%mm"#R1" \n\t" \
"psraw %4, %%mm"#R1" \n\t" \
"movq %%mm"#R1", "#OFF"(%1) \n\t" \
"add %2, %0 \n\t"
| 82 | |
/** Sacrificing mm6 allows pipelining loads from src */
static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
                                       const uint8_t *src, x86_reg stride,
                                       int rnd, int64_t shift)
{
    /*
     * Vertical 1/2-pel ("shift2") filter producing 16-bit intermediates:
     * out = (9*(b+c) - a - d + rnd) >> shift, computed by SHIFT2_LINE
     * with mm6 = ff_pw_9 and mm7 = broadcast rnd (%5).
     * Three outer iterations handle 4 columns each, 8 output rows per
     * column group (dst rows are 24 bytes = 12 int16 apart); dst advances
     * 8 bytes per iteration while src rewinds by 9*stride-4, i.e. back up
     * 9 rows and right 4 pixels.
     * Assumes mm0 == 0 (cleared by the caller before dispatch).
     */
    __asm__ volatile(
        "mov $3, %%"REG_c" \n\t"
        LOAD_ROUNDER_MMX("%5")
        "movq "MANGLE(ff_pw_9)", %%mm6 \n\t"
        "1: \n\t"
        "movd (%0), %%mm2 \n\t"
        "add %2, %0 \n\t"
        "movd (%0), %%mm3 \n\t"
        "punpcklbw %%mm0, %%mm2 \n\t"
        "punpcklbw %%mm0, %%mm3 \n\t"
        SHIFT2_LINE(  0, 1, 2, 3, 4)
        SHIFT2_LINE( 24, 2, 3, 4, 1)
        SHIFT2_LINE( 48, 3, 4, 1, 2)
        SHIFT2_LINE( 72, 4, 1, 2, 3)
        SHIFT2_LINE( 96, 1, 2, 3, 4)
        SHIFT2_LINE(120, 2, 3, 4, 1)
        SHIFT2_LINE(144, 3, 4, 1, 2)
        SHIFT2_LINE(168, 4, 1, 2, 3)
        "sub %6, %0 \n\t" /* %6 = 9*stride-4: next 4-column group */
        "add $8, %1 \n\t"
        "dec %%"REG_c" \n\t"
        "jnz 1b \n\t"
        : "+r"(src), "+r"(dst)
        : "r"(stride), "r"(-2*stride),
          "m"(shift), "m"(rnd), "r"(9*stride-4)
          NAMED_CONSTRAINTS_ADD(ff_pw_9)
        : "%"REG_c, "memory"
    );
}
| 117 | |
| 118 | /** |
| 119 | * Data is already unpacked, so some operations can directly be made from |
| 120 | * memory. |
| 121 | */ |
| 122 | #define VC1_HOR_16b_SHIFT2(OP, OPNAME)\ |
| 123 | static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\ |
| 124 | const int16_t *src, int rnd)\ |
| 125 | {\ |
| 126 | int h = 8;\ |
| 127 | \ |
| 128 | src -= 1;\ |
| 129 | rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\ |
| 130 | __asm__ volatile(\ |
| 131 | LOAD_ROUNDER_MMX("%4")\ |
| 132 | "movq "MANGLE(ff_pw_128)", %%mm6\n\t"\ |
| 133 | "movq "MANGLE(ff_pw_9)", %%mm5 \n\t"\ |
| 134 | "1: \n\t"\ |
| 135 | "movq 2*0+0(%1), %%mm1 \n\t"\ |
| 136 | "movq 2*0+8(%1), %%mm2 \n\t"\ |
| 137 | "movq 2*1+0(%1), %%mm3 \n\t"\ |
| 138 | "movq 2*1+8(%1), %%mm4 \n\t"\ |
| 139 | "paddw 2*3+0(%1), %%mm1 \n\t"\ |
| 140 | "paddw 2*3+8(%1), %%mm2 \n\t"\ |
| 141 | "paddw 2*2+0(%1), %%mm3 \n\t"\ |
| 142 | "paddw 2*2+8(%1), %%mm4 \n\t"\ |
| 143 | "pmullw %%mm5, %%mm3 \n\t"\ |
| 144 | "pmullw %%mm5, %%mm4 \n\t"\ |
| 145 | "psubw %%mm1, %%mm3 \n\t"\ |
| 146 | "psubw %%mm2, %%mm4 \n\t"\ |
| 147 | NORMALIZE_MMX("$7")\ |
| 148 | /* Remove bias */\ |
| 149 | "paddw %%mm6, %%mm3 \n\t"\ |
| 150 | "paddw %%mm6, %%mm4 \n\t"\ |
| 151 | TRANSFER_DO_PACK(OP)\ |
| 152 | "add $24, %1 \n\t"\ |
| 153 | "add %3, %2 \n\t"\ |
| 154 | "decl %0 \n\t"\ |
| 155 | "jnz 1b \n\t"\ |
| 156 | : "+r"(h), "+r" (src), "+r" (dst)\ |
| 157 | : "r"(stride), "m"(rnd)\ |
| 158 | NAMED_CONSTRAINTS_ADD(ff_pw_128,ff_pw_9)\ |
| 159 | : "memory"\ |
| 160 | );\ |
| 161 | } |
| 162 | |
| 163 | VC1_HOR_16b_SHIFT2(OP_PUT, put_) |
| 164 | VC1_HOR_16b_SHIFT2(OP_AVG, avg_) |
| 165 | |
| 166 | |
| 167 | /** |
| 168 | * Purely vertical or horizontal 1/2 shift interpolation. |
| 169 | * Sacrify mm6 for *9 factor. |
| 170 | */ |
| 171 | #define VC1_SHIFT2(OP, OPNAME)\ |
| 172 | static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\ |
| 173 | x86_reg stride, int rnd, x86_reg offset)\ |
| 174 | {\ |
| 175 | rnd = 8-rnd;\ |
| 176 | __asm__ volatile(\ |
| 177 | "mov $8, %%"REG_c" \n\t"\ |
| 178 | LOAD_ROUNDER_MMX("%5")\ |
| 179 | "movq "MANGLE(ff_pw_9)", %%mm6\n\t"\ |
| 180 | "1: \n\t"\ |
| 181 | "movd 0(%0 ), %%mm3 \n\t"\ |
| 182 | "movd 4(%0 ), %%mm4 \n\t"\ |
| 183 | "movd 0(%0,%2), %%mm1 \n\t"\ |
| 184 | "movd 4(%0,%2), %%mm2 \n\t"\ |
| 185 | "add %2, %0 \n\t"\ |
| 186 | "punpcklbw %%mm0, %%mm3 \n\t"\ |
| 187 | "punpcklbw %%mm0, %%mm4 \n\t"\ |
| 188 | "punpcklbw %%mm0, %%mm1 \n\t"\ |
| 189 | "punpcklbw %%mm0, %%mm2 \n\t"\ |
| 190 | "paddw %%mm1, %%mm3 \n\t"\ |
| 191 | "paddw %%mm2, %%mm4 \n\t"\ |
| 192 | "movd 0(%0,%3), %%mm1 \n\t"\ |
| 193 | "movd 4(%0,%3), %%mm2 \n\t"\ |
| 194 | "pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/\ |
| 195 | "pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/\ |
| 196 | "punpcklbw %%mm0, %%mm1 \n\t"\ |
| 197 | "punpcklbw %%mm0, %%mm2 \n\t"\ |
| 198 | "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/\ |
| 199 | "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/\ |
| 200 | "movd 0(%0,%2), %%mm1 \n\t"\ |
| 201 | "movd 4(%0,%2), %%mm2 \n\t"\ |
| 202 | "punpcklbw %%mm0, %%mm1 \n\t"\ |
| 203 | "punpcklbw %%mm0, %%mm2 \n\t"\ |
| 204 | "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/\ |
| 205 | "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/\ |
| 206 | NORMALIZE_MMX("$4")\ |
| 207 | "packuswb %%mm4, %%mm3 \n\t"\ |
| 208 | OP((%1), %%mm3)\ |
| 209 | "movq %%mm3, (%1) \n\t"\ |
| 210 | "add %6, %0 \n\t"\ |
| 211 | "add %4, %1 \n\t"\ |
| 212 | "dec %%"REG_c" \n\t"\ |
| 213 | "jnz 1b \n\t"\ |
| 214 | : "+r"(src), "+r"(dst)\ |
| 215 | : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\ |
| 216 | "g"(stride-offset)\ |
| 217 | NAMED_CONSTRAINTS_ADD(ff_pw_9)\ |
| 218 | : "%"REG_c, "memory"\ |
| 219 | );\ |
| 220 | } |
| 221 | |
| 222 | VC1_SHIFT2(OP_PUT, put_) |
| 223 | VC1_SHIFT2(OP_AVG, avg_) |
| 224 | |
| 225 | /** |
| 226 | * Core of the 1/4 and 3/4 shift bicubic interpolation. |
| 227 | * |
| 228 | * @param UNPACK Macro unpacking arguments from 8 to 16bits (can be empty). |
| 229 | * @param MOVQ "movd 1" or "movq 2", if data read is already unpacked. |
| 230 | * @param A1 Address of 1st tap (beware of unpacked/packed). |
| 231 | * @param A2 Address of 2nd tap |
| 232 | * @param A3 Address of 3rd tap |
| 233 | * @param A4 Address of 4th tap |
| 234 | */ |
| 235 | #define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \ |
| 236 | MOVQ "*0+"A1", %%mm1 \n\t" \ |
| 237 | MOVQ "*4+"A1", %%mm2 \n\t" \ |
| 238 | UNPACK("%%mm1") \ |
| 239 | UNPACK("%%mm2") \ |
| 240 | "pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \ |
| 241 | "pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \ |
| 242 | MOVQ "*0+"A2", %%mm3 \n\t" \ |
| 243 | MOVQ "*4+"A2", %%mm4 \n\t" \ |
| 244 | UNPACK("%%mm3") \ |
| 245 | UNPACK("%%mm4") \ |
| 246 | "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \ |
| 247 | "pmullw %%mm6, %%mm4 \n\t" /* *18 */ \ |
| 248 | "psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \ |
| 249 | "psubw %%mm2, %%mm4 \n\t" /* 18,-3 */ \ |
| 250 | MOVQ "*0+"A4", %%mm1 \n\t" \ |
| 251 | MOVQ "*4+"A4", %%mm2 \n\t" \ |
| 252 | UNPACK("%%mm1") \ |
| 253 | UNPACK("%%mm2") \ |
| 254 | "psllw $2, %%mm1 \n\t" /* 4* */ \ |
| 255 | "psllw $2, %%mm2 \n\t" /* 4* */ \ |
| 256 | "psubw %%mm1, %%mm3 \n\t" /* -4,18,-3 */ \ |
| 257 | "psubw %%mm2, %%mm4 \n\t" /* -4,18,-3 */ \ |
| 258 | MOVQ "*0+"A3", %%mm1 \n\t" \ |
| 259 | MOVQ "*4+"A3", %%mm2 \n\t" \ |
| 260 | UNPACK("%%mm1") \ |
| 261 | UNPACK("%%mm2") \ |
| 262 | "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \ |
| 263 | "pmullw %%mm5, %%mm2 \n\t" /* *53 */ \ |
| 264 | "paddw %%mm1, %%mm3 \n\t" /* 4,53,18,-3 */ \ |
| 265 | "paddw %%mm2, %%mm4 \n\t" /* 4,53,18,-3 */ |
| 266 | |
| 267 | /** |
| 268 | * Macro to build the vertical 16bits version of vc1_put_shift[13]. |
| 269 | * Here, offset=src_stride. Parameters passed A1 to A4 must use |
| 270 | * %3 (src_stride) and %4 (3*src_stride). |
| 271 | * |
| 272 | * @param NAME Either 1 or 3 |
| 273 | * @see MSPEL_FILTER13_CORE for information on A1->A4 |
| 274 | */ |
| 275 | #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \ |
| 276 | static void \ |
| 277 | vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \ |
| 278 | x86_reg src_stride, \ |
| 279 | int rnd, int64_t shift) \ |
| 280 | { \ |
| 281 | int h = 8; \ |
| 282 | src -= src_stride; \ |
| 283 | __asm__ volatile( \ |
| 284 | LOAD_ROUNDER_MMX("%5") \ |
| 285 | "movq "MANGLE(ff_pw_53)", %%mm5\n\t" \ |
| 286 | "movq "MANGLE(ff_pw_18)", %%mm6\n\t" \ |
| 287 | ".p2align 3 \n\t" \ |
| 288 | "1: \n\t" \ |
| 289 | MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \ |
| 290 | NORMALIZE_MMX("%6") \ |
| 291 | TRANSFER_DONT_PACK(OP_PUT) \ |
| 292 | /* Last 3 (in fact 4) bytes on the line */ \ |
| 293 | "movd 8+"A1", %%mm1 \n\t" \ |
| 294 | DO_UNPACK("%%mm1") \ |
| 295 | "movq %%mm1, %%mm3 \n\t" \ |
| 296 | "paddw %%mm1, %%mm1 \n\t" \ |
| 297 | "paddw %%mm3, %%mm1 \n\t" /* 3* */ \ |
| 298 | "movd 8+"A2", %%mm3 \n\t" \ |
| 299 | DO_UNPACK("%%mm3") \ |
| 300 | "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \ |
| 301 | "psubw %%mm1, %%mm3 \n\t" /*18,-3 */ \ |
| 302 | "movd 8+"A3", %%mm1 \n\t" \ |
| 303 | DO_UNPACK("%%mm1") \ |
| 304 | "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \ |
| 305 | "paddw %%mm1, %%mm3 \n\t" /*53,18,-3 */ \ |
| 306 | "movd 8+"A4", %%mm1 \n\t" \ |
| 307 | DO_UNPACK("%%mm1") \ |
| 308 | "psllw $2, %%mm1 \n\t" /* 4* */ \ |
| 309 | "psubw %%mm1, %%mm3 \n\t" \ |
| 310 | "paddw %%mm7, %%mm3 \n\t" \ |
| 311 | "psraw %6, %%mm3 \n\t" \ |
| 312 | "movq %%mm3, 16(%2) \n\t" \ |
| 313 | "add %3, %1 \n\t" \ |
| 314 | "add $24, %2 \n\t" \ |
| 315 | "decl %0 \n\t" \ |
| 316 | "jnz 1b \n\t" \ |
| 317 | : "+r"(h), "+r" (src), "+r" (dst) \ |
| 318 | : "r"(src_stride), "r"(3*src_stride), \ |
| 319 | "m"(rnd), "m"(shift) \ |
| 320 | NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_53,ff_pw_18) \ |
| 321 | : "memory" \ |
| 322 | ); \ |
| 323 | } |
| 324 | |
| 325 | /** |
| 326 | * Macro to build the horizontal 16bits version of vc1_put_shift[13]. |
| 327 | * Here, offset=16bits, so parameters passed A1 to A4 should be simple. |
| 328 | * |
| 329 | * @param NAME Either 1 or 3 |
| 330 | * @see MSPEL_FILTER13_CORE for information on A1->A4 |
| 331 | */ |
| 332 | #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \ |
| 333 | static void \ |
| 334 | OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \ |
| 335 | const int16_t *src, int rnd) \ |
| 336 | { \ |
| 337 | int h = 8; \ |
| 338 | src -= 1; \ |
| 339 | rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \ |
| 340 | __asm__ volatile( \ |
| 341 | LOAD_ROUNDER_MMX("%4") \ |
| 342 | "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \ |
| 343 | "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \ |
| 344 | ".p2align 3 \n\t" \ |
| 345 | "1: \n\t" \ |
| 346 | MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \ |
| 347 | NORMALIZE_MMX("$7") \ |
| 348 | /* Remove bias */ \ |
| 349 | "paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \ |
| 350 | "paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \ |
| 351 | TRANSFER_DO_PACK(OP) \ |
| 352 | "add $24, %1 \n\t" \ |
| 353 | "add %3, %2 \n\t" \ |
| 354 | "decl %0 \n\t" \ |
| 355 | "jnz 1b \n\t" \ |
| 356 | : "+r"(h), "+r" (src), "+r" (dst) \ |
| 357 | : "r"(stride), "m"(rnd) \ |
| 358 | NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_18,ff_pw_53,ff_pw_128) \ |
| 359 | : "memory" \ |
| 360 | ); \ |
| 361 | } |
| 362 | |
| 363 | /** |
| 364 | * Macro to build the 8bits, any direction, version of vc1_put_shift[13]. |
| 365 | * Here, offset=src_stride. Parameters passed A1 to A4 must use |
| 366 | * %3 (offset) and %4 (3*offset). |
| 367 | * |
| 368 | * @param NAME Either 1 or 3 |
| 369 | * @see MSPEL_FILTER13_CORE for information on A1->A4 |
| 370 | */ |
| 371 | #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \ |
| 372 | static void \ |
| 373 | OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \ |
| 374 | x86_reg stride, int rnd, x86_reg offset) \ |
| 375 | { \ |
| 376 | int h = 8; \ |
| 377 | src -= offset; \ |
| 378 | rnd = 32-rnd; \ |
| 379 | __asm__ volatile ( \ |
| 380 | LOAD_ROUNDER_MMX("%6") \ |
| 381 | "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \ |
| 382 | "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \ |
| 383 | ".p2align 3 \n\t" \ |
| 384 | "1: \n\t" \ |
| 385 | MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \ |
| 386 | NORMALIZE_MMX("$6") \ |
| 387 | TRANSFER_DO_PACK(OP) \ |
| 388 | "add %5, %1 \n\t" \ |
| 389 | "add %5, %2 \n\t" \ |
| 390 | "decl %0 \n\t" \ |
| 391 | "jnz 1b \n\t" \ |
| 392 | : "+r"(h), "+r" (src), "+r" (dst) \ |
| 393 | : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \ |
| 394 | NAMED_CONSTRAINTS_ADD(ff_pw_53,ff_pw_18,ff_pw_3) \ |
| 395 | : "memory" \ |
| 396 | ); \ |
| 397 | } |
| 398 | |
| 399 | /** 1/4 shift bicubic interpolation */ |
| 400 | MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_) |
| 401 | MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_) |
| 402 | MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )") |
| 403 | MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_) |
| 404 | MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_) |
| 405 | |
| 406 | /** 3/4 shift bicubic interpolation */ |
| 407 | MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_) |
| 408 | MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_) |
| 409 | MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )") |
| 410 | MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_) |
| 411 | MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_) |
| 412 | |
/* Function pointer types for the three filter stages dispatched by
 * VC1_MSPEL_MC below: vertical 8->16 bit, horizontal 16->8 bit, and
 * single-pass 8->8 bit. */
typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);
| 416 | |
| 417 | /** |
| 418 | * Interpolate fractional pel values by applying proper vertical then |
| 419 | * horizontal filter. |
| 420 | * |
| 421 | * @param dst Destination buffer for interpolated pels. |
| 422 | * @param src Source buffer. |
| 423 | * @param stride Stride for both src and dst buffers. |
| 424 | * @param hmode Horizontal filter (expressed in quarter pixels shift). |
| 425 | * @param hmode Vertical filter. |
| 426 | * @param rnd Rounding bias. |
| 427 | */ |
| 428 | #define VC1_MSPEL_MC(OP)\ |
| 429 | static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\ |
| 430 | int hmode, int vmode, int rnd)\ |
| 431 | {\ |
| 432 | static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\ |
| 433 | { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\ |
| 434 | static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\ |
| 435 | { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\ |
| 436 | static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\ |
| 437 | { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\ |
| 438 | \ |
| 439 | __asm__ volatile(\ |
| 440 | "pxor %%mm0, %%mm0 \n\t"\ |
| 441 | ::: "memory"\ |
| 442 | );\ |
| 443 | \ |
| 444 | if (vmode) { /* Vertical filter to apply */\ |
| 445 | if (hmode) { /* Horizontal filter to apply, output to tmp */\ |
| 446 | static const int shift_value[] = { 0, 5, 1, 5 };\ |
| 447 | int shift = (shift_value[hmode]+shift_value[vmode])>>1;\ |
| 448 | int r;\ |
| 449 | DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\ |
| 450 | \ |
| 451 | r = (1<<(shift-1)) + rnd-1;\ |
| 452 | vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\ |
| 453 | \ |
| 454 | vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\ |
| 455 | return;\ |
| 456 | }\ |
| 457 | else { /* No horizontal filter, output 8 lines to dst */\ |
| 458 | vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\ |
| 459 | return;\ |
| 460 | }\ |
| 461 | }\ |
| 462 | \ |
| 463 | /* Horizontal mode with no vertical mode */\ |
| 464 | vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\ |
| 465 | } \ |
| 466 | static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \ |
| 467 | int stride, int hmode, int vmode, int rnd)\ |
| 468 | { \ |
| 469 | OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \ |
| 470 | OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \ |
| 471 | dst += 8*stride; src += 8*stride; \ |
| 472 | OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \ |
| 473 | OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \ |
| 474 | } |
| 475 | |
| 476 | VC1_MSPEL_MC(put_) |
| 477 | VC1_MSPEL_MC(avg_) |
| 478 | |
/** Macro to ease bicubic filter interpolation functions declarations */
/* put_ wrappers are MMX-only; avg_ wrappers are MMXEXT since OP_AVG
 * uses pavgb. (a, b) = (hmode, vmode) in quarter-pel shift units. */
#define DECLARE_FUNCTION(a, b) \
static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, \
                                               const uint8_t *src, \
                                               ptrdiff_t stride, \
                                               int rnd) \
{ \
    put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}\
static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst, \
                                                  const uint8_t *src, \
                                                  ptrdiff_t stride, \
                                                  int rnd) \
{ \
    avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}\
static void put_vc1_mspel_mc ## a ## b ## _16_mmx(uint8_t *dst, \
                                                  const uint8_t *src, \
                                                  ptrdiff_t stride, \
                                                  int rnd) \
{ \
    put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
}\
static void avg_vc1_mspel_mc ## a ## b ## _16_mmxext(uint8_t *dst, \
                                                     const uint8_t *src,\
                                                     ptrdiff_t stride, \
                                                     int rnd) \
{ \
    avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
}

DECLARE_FUNCTION(0, 1)
DECLARE_FUNCTION(0, 2)
DECLARE_FUNCTION(0, 3)

DECLARE_FUNCTION(1, 0)
DECLARE_FUNCTION(1, 1)
DECLARE_FUNCTION(1, 2)
DECLARE_FUNCTION(1, 3)

DECLARE_FUNCTION(2, 0)
DECLARE_FUNCTION(2, 1)
DECLARE_FUNCTION(2, 2)
DECLARE_FUNCTION(2, 3)

DECLARE_FUNCTION(3, 0)
DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)
| 528 | |
/* DC-only 4x4 inverse transform: scale the DC coefficient and add the
 * result to the 4x4 destination area with saturation. */
static void vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = (17 * dc + 4) >> 3;   /* row DC gain */
    dc = (17 * dc + 64) >> 7;  /* column DC gain */
    /* mm0 = dc clamped to [0,255] broadcast in every byte, mm1 = (-dc)
     * clamped likewise; the paddusb/psubusb pair below adds the signed
     * dc to each pixel with unsigned saturation. */
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    /* NOTE(review): mm0/mm1 are assumed to survive between the two asm
     * statements; nothing informs the compiler of this dependency. */
    __asm__ volatile(
        "movd %0, %%mm2 \n\t"
        "movd %1, %%mm3 \n\t"
        "movd %2, %%mm4 \n\t"
        "movd %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movd %%mm2, %0 \n\t"
        "movd %%mm3, %1 \n\t"
        "movd %%mm4, %2 \n\t"
        "movd %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}
| 567 | |
/* DC-only 4x8 inverse transform: scale the DC coefficient and add it to
 * the 4-wide, 8-tall destination area (two 4-row passes). */
static void vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = (17 * dc + 4) >> 3;   /* row (4-point) DC gain */
    dc = (12 * dc + 64) >> 7;  /* column (8-point) DC gain */
    /* Broadcast clamped +dc in mm0 and clamped -dc in mm1 (see 4x4). */
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    /* NOTE(review): mm0/mm1 are assumed to survive across all three asm
     * statements below; nothing informs the compiler of this. */
    __asm__ volatile(
        "movd %0, %%mm2 \n\t"
        "movd %1, %%mm3 \n\t"
        "movd %2, %%mm4 \n\t"
        "movd %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movd %%mm2, %0 \n\t"
        "movd %%mm3, %1 \n\t"
        "movd %%mm4, %2 \n\t"
        "movd %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
    /* Second half: rows 4..7. */
    dest += 4*linesize;
    __asm__ volatile(
        "movd %0, %%mm2 \n\t"
        "movd %1, %%mm3 \n\t"
        "movd %2, %%mm4 \n\t"
        "movd %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movd %%mm2, %0 \n\t"
        "movd %%mm3, %1 \n\t"
        "movd %%mm4, %2 \n\t"
        "movd %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}
| 629 | |
/* DC-only 8x4 inverse transform: scale the DC coefficient and add it to
 * the 8-wide, 4-tall destination area with saturation. */
static void vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = ( 3 * dc + 1) >> 1;   /* row (8-point) DC gain */
    dc = (17 * dc + 64) >> 7;  /* column (4-point) DC gain */
    /* Broadcast clamped +dc in mm0 and clamped -dc in mm1 (see 4x4). */
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    /* NOTE(review): mm0/mm1 are assumed to survive between the two asm
     * statements. Also, movq stores 8 bytes per row while the output
     * constraints only declare 4 (uint32_t) — verify against upstream
     * before touching the constraints. */
    __asm__ volatile(
        "movq %0, %%mm2 \n\t"
        "movq %1, %%mm3 \n\t"
        "movq %2, %%mm4 \n\t"
        "movq %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movq %%mm2, %0 \n\t"
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}
| 668 | |
/* DC-only 8x8 inverse transform: scale the DC coefficient and add it to
 * the 8x8 destination area (two 4-row passes) with saturation. */
static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
                                        int16_t *block)
{
    int dc = block[0];
    dc = (3 * dc + 1) >> 1;    /* row DC gain */
    dc = (3 * dc + 16) >> 5;   /* column DC gain */
    /* Broadcast clamped +dc in mm0 and clamped -dc in mm1 (see 4x4). */
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    /* NOTE(review): mm0/mm1 are assumed to survive across all three asm
     * statements, and movq stores 8 bytes per row while the output
     * constraints only declare 4 (uint32_t) — verify against upstream
     * before touching the constraints. */
    __asm__ volatile(
        "movq %0, %%mm2 \n\t"
        "movq %1, %%mm3 \n\t"
        "movq %2, %%mm4 \n\t"
        "movq %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movq %%mm2, %0 \n\t"
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
    /* Second half: rows 4..7. */
    dest += 4*linesize;
    __asm__ volatile(
        "movq %0, %%mm2 \n\t"
        "movq %1, %%mm3 \n\t"
        "movq %2, %%mm4 \n\t"
        "movq %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movq %%mm2, %0 \n\t"
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}
| 730 | |
#if HAVE_MMX_EXTERNAL
/* Integer-pel (mc00) cases: no filtering, just an 8x8 or 16x16 copy or
 * average via the external fpel helpers; the rnd argument is unused. */
static void put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t stride, int rnd)
{
    ff_put_pixels8_mmx(dst, src, stride, 8);
}
static void put_vc1_mspel_mc00_16_mmx(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride, int rnd)
{
    ff_put_pixels16_mmx(dst, src, stride, 16);
}
static void avg_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t stride, int rnd)
{
    ff_avg_pixels8_mmx(dst, src, stride, 8);
}
static void avg_vc1_mspel_mc00_16_mmx(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride, int rnd)
{
    ff_avg_pixels16_mmx(dst, src, stride, 16);
}
#endif
| 753 | |
/* Fill both mspel table slots for one (X, Y) quarter-pel position:
 * index [1] holds the 8x8 function, index [0] the 16x16 one; the table
 * is laid out as X + 4*Y. */
#define FN_ASSIGN(OP, X, Y, INSN) \
dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = OP##vc1_mspel_mc##X##Y##INSN; \
dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = OP##vc1_mspel_mc##X##Y##_16##INSN

/* Install the MMX put_ motion-compensation functions (avg_ needs pavgb
 * and is installed by ff_vc1dsp_init_mmxext below). */
av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
{
    /* (0,0) copy helpers come from external asm, gated separately. */
#if HAVE_MMX_EXTERNAL
    FN_ASSIGN(put_, 0, 0, _mmx);
    FN_ASSIGN(avg_, 0, 0, _mmx);
#endif
    FN_ASSIGN(put_, 0, 1, _mmx);
    FN_ASSIGN(put_, 0, 2, _mmx);
    FN_ASSIGN(put_, 0, 3, _mmx);

    FN_ASSIGN(put_, 1, 0, _mmx);
    FN_ASSIGN(put_, 1, 1, _mmx);
    FN_ASSIGN(put_, 1, 2, _mmx);
    FN_ASSIGN(put_, 1, 3, _mmx);

    FN_ASSIGN(put_, 2, 0, _mmx);
    FN_ASSIGN(put_, 2, 1, _mmx);
    FN_ASSIGN(put_, 2, 2, _mmx);
    FN_ASSIGN(put_, 2, 3, _mmx);

    FN_ASSIGN(put_, 3, 0, _mmx);
    FN_ASSIGN(put_, 3, 1, _mmx);
    FN_ASSIGN(put_, 3, 2, _mmx);
    FN_ASSIGN(put_, 3, 3, _mmx);
}
| 783 | |
/* Install the MMXEXT functions: the avg_ motion-compensation variants
 * (which use pavgb) and the DC-only inverse transforms (which use
 * pshufw). */
av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
{
    FN_ASSIGN(avg_, 0, 1, _mmxext);
    FN_ASSIGN(avg_, 0, 2, _mmxext);
    FN_ASSIGN(avg_, 0, 3, _mmxext);

    FN_ASSIGN(avg_, 1, 0, _mmxext);
    FN_ASSIGN(avg_, 1, 1, _mmxext);
    FN_ASSIGN(avg_, 1, 2, _mmxext);
    FN_ASSIGN(avg_, 1, 3, _mmxext);

    FN_ASSIGN(avg_, 2, 0, _mmxext);
    FN_ASSIGN(avg_, 2, 1, _mmxext);
    FN_ASSIGN(avg_, 2, 2, _mmxext);
    FN_ASSIGN(avg_, 2, 3, _mmxext);

    FN_ASSIGN(avg_, 3, 0, _mmxext);
    FN_ASSIGN(avg_, 3, 1, _mmxext);
    FN_ASSIGN(avg_, 3, 2, _mmxext);
    FN_ASSIGN(avg_, 3, 3, _mmxext);

    dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext;
    dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext;
    dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
    dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
}
| 810 | #endif /* HAVE_6REGS && HAVE_INLINE_ASM */ |