| 1 | ;****************************************************************************** |
| 2 | ;* 32 point SSE-optimized DCT transform |
| 3 | ;* Copyright (c) 2010 Vitor Sessak |
| 4 | ;* |
| 5 | ;* This file is part of FFmpeg. |
| 6 | ;* |
| 7 | ;* FFmpeg is free software; you can redistribute it and/or |
| 8 | ;* modify it under the terms of the GNU Lesser General Public |
| 9 | ;* License as published by the Free Software Foundation; either |
| 10 | ;* version 2.1 of the License, or (at your option) any later version. |
| 11 | ;* |
| 12 | ;* FFmpeg is distributed in the hope that it will be useful, |
| 13 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 15 | ;* Lesser General Public License for more details. |
| 16 | ;* |
| 17 | ;* You should have received a copy of the GNU Lesser General Public |
| 18 | ;* License along with FFmpeg; if not, write to the Free Software |
| 19 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 20 | ;****************************************************************************** |
| 21 | |
| 22 | %include "libavutil/x86/x86util.asm" |
| 23 | |
; Read-only constant tables for the 32-point DCT. Both tables are aligned to
; 32 bytes; the AVX function reads them with 32-byte operands and the SSE
; code with 16-byte operands.
| 24 | SECTION_RODATA 32 |
| 25 | |
| 26 | align 32 |
; ps_cos_vec: butterfly multipliers, four single-precision values (16 bytes)
; per row, always addressed by byte offset from the label.
; Rows at +0, +16, +32, +48: multipliers of pass 1.
| 27 | ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043 |
| 28 | dd 0.553104, 0.582935, 0.622504, 0.674808 |
| 29 | dd -10.190008, -3.407609, -2.057781, -1.484165 |
| 30 | dd -1.169440, -0.972568, -0.839350, -0.744536 |
; Rows at +64, +80: multipliers of pass 2.
| 31 | dd 0.502419, 0.522499, 0.566944, 0.646822 |
| 32 | dd 0.788155, 1.060678, 1.722447, 5.101149 |
; Rows at +96 / +112: multipliers of pass 3. The two rows are identical; the
; AVX function reads both with one 32-byte operand at +96.
| 33 | dd 0.509796, 0.601345, 0.899976, 2.562916 |
| 34 | dd 0.509796, 0.601345, 0.899976, 2.562916 |
; Rows at +128 / +144: multipliers of pass 4 (again duplicated for the
; 32-byte load at +128).
| 35 | dd 1.000000, 1.000000, 1.306563, 0.541196 |
| 36 | dd 1.000000, 1.000000, 1.306563, 0.541196 |
; Rows at +160 / +176: multipliers of pass 5 (duplicated for the 32-byte load
; at +160).
| 37 | dd 1.000000, 0.707107, 1.000000, -0.707107 |
| 38 | dd 1.000000, 0.707107, 1.000000, -0.707107 |
; Row at +192: the multiplier used by BUTTERFLY3V.
| 39 | dd 0.707107, 0.707107, 0.707107, 0.707107 |
| 40 | |
| 41 | align 32 |
; ps_p1p1m1m1: sign mask (0x80000000 is the sign bit of a single-precision
; value). XORing a vector with it negates elements 2 and 3 of each 128-bit
; half and leaves elements 0 and 1 unchanged. It is loaded in pass 4 as the
; mask operand of BUTTERFLY2; the AVX function and the x86-32 PASS5 shuffle
; the loaded copy with 0xcc to obtain the pass-5 mask.
| 42 | ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000 |
| 43 | |
; BUTTERFLY a, b, coef, tmp
;   %1 (a):    in: first operand;   out: (a - b) * coef
;   %2 (b):    in: second operand;  out: a + b
;   %3 (coef): multiplier vector, register or memory operand
;   %4 (tmp):  scratch register, left holding a - b
; All three instructions are written in three-operand form.
; NOTE(review): when assembled for plain SSE/SSE2 this presumably relies on
; the instruction wrappers of the included macro file to emit the extra
; register copy - confirm against x86util/x86inc.
| 44 | %macro BUTTERFLY 4 |
| 45 | subps %4, %1, %2 |
| 46 | addps %2, %2, %1 |
| 47 | mulps %1, %4, %3 |
| 48 | %endmacro |
| 49 | |
; BUTTERFLY0 data, mask, coef, tmp, imm
;   %1 (data): in/out vector register
;   %2 (mask): sign mask that is XORed into data
;   %3 (coef): multiplier vector
;   %4 (tmp):  scratch register, clobbered
;   %5 (imm):  shuffle immediate selecting the permutation of data
; Result: data = (shuffle(data, imm) + (data ^ mask)) * coef
; Two instruction sequences produce that same value in %1:
;   - plain SSE and AVX use shufps with both sources equal to data and keep
;     the running sum in tmp;
;   - SSE2 without AVX uses pshufd and two-operand arithmetic, keeping the
;     running sum in data.
%macro BUTTERFLY0 5
%if notcpuflag(sse2) || cpuflag(avx)
    shufps      %4, %1, %1, %5      ; tmp  = permuted copy of data
    xorps       %1, %1, %2          ; data = data with selected signs flipped
    addps       %4, %4, %1          ; tmp  = permuted + sign-flipped
    mulps       %1, %4, %3          ; data = tmp * coef
%else
    pshufd      %4, %1, %5          ; tmp  = permuted copy of data
    xorps       %1, %2              ; data = data with selected signs flipped
    addps       %1, %4              ; data = sign-flipped + permuted
    mulps       %1, %3              ; data = data * coef
%endif
%endmacro
| 63 | |
; BUTTERFLY2 data, mask, coef, tmp
; BUTTERFLY0 with shuffle immediate 0x1b, which reverses the order of the
; four elements (element i is taken from element 3-i). Used for pass 4.
| 64 | %macro BUTTERFLY2 4 |
| 65 | BUTTERFLY0 %1, %2, %3, %4, 0x1b |
| 66 | %endmacro |
| 67 | |
; BUTTERFLY3 data, mask, coef, tmp
; BUTTERFLY0 with shuffle immediate 0xb1, which exchanges the two elements of
; each adjacent pair (0 <-> 1 and 2 <-> 3). Used for pass 5 in the AVX
; function and in the x86-32 PASS5.
| 68 | %macro BUTTERFLY3 4 |
| 69 | BUTTERFLY0 %1, %2, %3, %4, 0xb1 |
| 70 | %endmacro |
| 71 | |
; BUTTERFLY3V a, b, c, d, tmp
; Arguments are register NUMBERS (the macro prepends "m" itself).
; With k = the row at ps_cos_vec+192 (0.707107 in every element):
;   m%1 = a + b
;   m%2 = (a - b) * k
;   m%3 = c + d
;   m%4 = (d - c) * k      (note the operand order differs from the first pair)
;   m%5 = scratch
; The difference a - b is computed in the scratch register and SWAP then
; exchanges the names %2 and %5, so after the macro the name m%2 refers to
; the register that held the scratch value.
; NOTE(review): SWAP is defined outside this file; it presumably renames
; registers at assembly time without emitting instructions - confirm.
| 72 | %macro BUTTERFLY3V 5 |
| 73 | movaps m%5, m%1 |
| 74 | addps m%1, m%2 |
| 75 | subps m%5, m%2 |
| 76 | SWAP %2, %5 |
| 77 | mulps m%2, [ps_cos_vec+192] |
| 78 | movaps m%5, m%3 |
| 79 | addps m%3, m%4 |
| 80 | subps m%4, m%5 |
| 81 | mulps m%4, [ps_cos_vec+192] |
| 82 | %endmacro |
| 83 | |
; PASS6_AND_PERMUTE
; Last pass of the transform, done entirely with scalar instructions
; (movss/addss), merged with moving every value to its final slot in out.
;
; Inputs:
;   - the 128 bytes at out+0 .. out+127, as stored by the preceding pass;
;   - the low elements of m0, m1, m4, m5 and m6, which are used as addends
;     before this macro ever loads them (m2, m3 and m7 are loaded first).
; Clobbers: m0-m7 and tmpd.
;
; tmpd carries 32-bit values that are only relocated, never added:
;   out+4 -> out+64, out+12 -> out+96, out+108 -> out+112, out+60 -> out+120.
; out+0 and out+124 are not written by this macro.
;
; The same offsets are read and overwritten at different points, so the
; instruction order must not be changed.
| 84 | %macro PASS6_AND_PERMUTE 0 |
| 85 | mov tmpd, [outq+4] |
| 86 | movss m7, [outq+72] |
| 87 | addss m7, [outq+76] |
| 88 | movss m3, [outq+56] |
| 89 | addss m3, [outq+60] |
| 90 | addss m4, m3 |
| 91 | movss m2, [outq+52] |
| 92 | addss m2, m3 |
| 93 | movss m3, [outq+104] |
| 94 | addss m3, [outq+108] |
| 95 | addss m1, m3 |
| 96 | addss m5, m4 |
| 97 | movss [outq+ 16], m1 |
| 98 | movss m1, [outq+100] |
| 99 | addss m1, m3 |
| 100 | movss m3, [outq+40] |
| 101 | movss [outq+ 48], m1 |
| 102 | addss m3, [outq+44] |
| 103 | movss m1, [outq+100] |
| 104 | addss m4, m3 |
| 105 | addss m3, m2 |
| 106 | addss m1, [outq+108] |
| 107 | movss [outq+ 40], m3 |
| 108 | addss m2, [outq+36] |
| 109 | movss m3, [outq+8] |
| 110 | movss [outq+ 56], m2 |
| 111 | addss m3, [outq+12] |
| 112 | movss [outq+ 32], m3 |
| 113 | movss m3, [outq+80] |
| 114 | movss [outq+ 8], m5 |
| 115 | movss [outq+ 80], m1 |
| 116 | movss m2, [outq+52] |
| 117 | movss m5, [outq+120] |
| 118 | addss m5, [outq+124] |
| 119 | movss m1, [outq+64] |
| 120 | addss m2, [outq+60] |
| 121 | addss m0, m5 |
| 122 | addss m5, [outq+116] |
| 123 | mov [outq+64], tmpd |
| 124 | addss m6, m0 |
| 125 | addss m1, m6 |
| 126 | mov tmpd, [outq+12] |
| 127 | mov [outq+ 96], tmpd |
| 128 | movss [outq+ 4], m1 |
| 129 | movss m1, [outq+24] |
| 130 | movss [outq+ 24], m4 |
| 131 | movss m4, [outq+88] |
| 132 | addss m4, [outq+92] |
| 133 | addss m3, m4 |
| 134 | addss m4, [outq+84] |
| 135 | mov tmpd, [outq+108] |
| 136 | addss m1, [outq+28] |
| 137 | addss m0, m1 |
| 138 | addss m1, m5 |
| 139 | addss m6, m3 |
| 140 | addss m3, m0 |
| 141 | addss m0, m7 |
| 142 | addss m5, [outq+20] |
| 143 | addss m7, m1 |
| 144 | movss [outq+ 12], m6 |
| 145 | mov [outq+112], tmpd |
| 146 | movss m6, [outq+28] |
| 147 | movss [outq+ 28], m0 |
| 148 | movss m0, [outq+36] |
| 149 | movss [outq+ 36], m7 |
| 150 | addss m1, m4 |
| 151 | movss m7, [outq+116] |
| 152 | addss m0, m2 |
| 153 | addss m7, [outq+124] |
| 154 | movss [outq+ 72], m0 |
| 155 | movss m0, [outq+44] |
| 156 | addss m2, m0 |
| 157 | movss [outq+ 44], m1 |
| 158 | movss [outq+ 88], m2 |
| 159 | addss m0, [outq+60] |
| 160 | mov tmpd, [outq+60] |
| 161 | mov [outq+120], tmpd |
| 162 | movss [outq+104], m0 |
| 163 | addss m4, m5 |
| 164 | addss m5, [outq+68] |
| 165 | movss [outq+52], m4 |
| 166 | movss [outq+60], m5 |
| 167 | movss m4, [outq+68] |
| 168 | movss m5, [outq+20] |
| 169 | movss [outq+ 20], m3 |
| 170 | addss m5, m7 |
| 171 | addss m7, m6 |
| 172 | addss m4, m5 |
| 173 | movss m2, [outq+84] |
| 174 | addss m2, [outq+92] |
| 175 | addss m5, m2 |
| 176 | movss [outq+ 68], m4 |
| 177 | addss m2, m7 |
| 178 | movss m4, [outq+76] |
| 179 | movss [outq+ 84], m2 |
| 180 | movss [outq+ 76], m5 |
| 181 | addss m7, m4 |
| 182 | addss m6, [outq+124] |
| 183 | addss m4, m6 |
| 184 | addss m6, [outq+92] |
| 185 | movss [outq+100], m4 |
| 186 | movss [outq+108], m6 |
| 187 | movss m6, [outq+92] |
| 188 | movss [outq+92], m7 |
| 189 | addss m6, [outq+124] |
| 190 | movss [outq+116], m6 |
| 191 | %endmacro |
| 192 | |
; AVX version of the 32-point DCT. Passes 1-5 work on 256-bit registers
; (eight values per register); pass 6 reuses the scalar PASS6_AND_PERMUTE
; macro. The whole function is only assembled when HAVE_AVX_EXTERNAL is set.
| 193 | INIT_YMM avx |
| 194 | SECTION_TEXT |
| 195 | %if HAVE_AVX_EXTERNAL |
| 196 | ; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in) |
; in and out are accessed with aligned vector moves (vmovaps).
; NOTE(review): the numeric cglobal arguments 2,3,8 are presumably the
; argument count, general-purpose register count and vector register count;
; cglobal is defined outside this file - confirm there.
| 197 | cglobal dct32_float, 2,3,8, out, in, tmp |
| 198 | ; pass 1 |
; m4 = in[0..7]. m5 is assembled from in+96 (upper 128 bits) and in+112
; (lower 128 bits) and then reversed inside each half (0x1b), which yields
; in[31], in[30], ..., in[24]. The first vinsertf128 reads m5 before it has
; been written; both halves are replaced, so the old contents never reach
; the result.
; After BUTTERFLY: m5 = in[i] + in[31-i], m4 = (in[i] - in[31-i]) * cos row.
| 199 | vmovaps m4, [inq+0] |
| 200 | vinsertf128 m5, m5, [inq+96], 1 |
| 201 | vinsertf128 m5, m5, [inq+112], 0 |
| 202 | vshufps m5, m5, m5, 0x1b |
| 203 | BUTTERFLY m4, m5, [ps_cos_vec], m6 |
| 204 | |
; Same scheme for the middle of the input: m2 = in[16..23], m6 = in[15..8].
; After BUTTERFLY: m6 = sums, m2 = scaled differences, m0 = scratch.
| 205 | vmovaps m2, [inq+64] |
| 206 | vinsertf128 m6, m6, [inq+32], 1 |
| 207 | vinsertf128 m6, m6, [inq+48], 0 |
| 208 | vshufps m6, m6, m6, 0x1b |
| 209 | BUTTERFLY m2, m6, [ps_cos_vec+32], m0 |
| 210 | |
| 211 | ; pass 2 |
| 212 | |
; Both butterflies use the 32 bytes at ps_cos_vec+64 as multipliers.
| 213 | BUTTERFLY m5, m6, [ps_cos_vec+64], m0 |
| 214 | BUTTERFLY m4, m2, [ps_cos_vec+64], m7 |
| 215 | |
| 216 | |
| 217 | ; pass 3 |
; Regroup by 128-bit half: m1 = lower halves of (m6, m4), m3 = upper halves
; of (m6, m4); m3 is then reversed inside each half before the butterfly.
| 218 | vperm2f128 m3, m6, m4, 0x31 |
| 219 | vperm2f128 m1, m6, m4, 0x20 |
| 220 | vshufps m3, m3, m3, 0x1b |
| 221 | |
| 222 | BUTTERFLY m1, m3, [ps_cos_vec+96], m6 |
| 223 | |
| 224 | |
; Same regrouping for (m5, m2): lower halves go to m4, upper halves to m5.
| 225 | vperm2f128 m4, m5, m2, 0x20 |
| 226 | vperm2f128 m5, m5, m2, 0x31 |
| 227 | vshufps m5, m5, m5, 0x1b |
| 228 | |
| 229 | BUTTERFLY m4, m5, [ps_cos_vec+96], m6 |
| 230 | |
| 231 | ; pass 4 |
; m6 = sign mask, m2 = multipliers, m7 = scratch for all four butterflies.
| 232 | vmovaps m6, [ps_p1p1m1m1+0] |
| 233 | vmovaps m2, [ps_cos_vec+128] |
| 234 | |
| 235 | BUTTERFLY2 m5, m6, m2, m7 |
| 236 | BUTTERFLY2 m4, m6, m2, m7 |
| 237 | BUTTERFLY2 m1, m6, m2, m7 |
| 238 | BUTTERFLY2 m3, m6, m2, m7 |
| 239 | |
| 240 | |
| 241 | ; pass 5 |
; Shuffling the pass-4 mask with 0xcc selects its elements 0,3,0,3 in each
; half, i.e. the new mask flips the sign of elements 1 and 3.
| 242 | vshufps m6, m6, m6, 0xcc |
| 243 | vmovaps m2, [ps_cos_vec+160] |
| 244 | |
| 245 | BUTTERFLY3 m5, m6, m2, m7 |
| 246 | BUTTERFLY3 m4, m6, m2, m7 |
| 247 | BUTTERFLY3 m1, m6, m2, m7 |
| 248 | BUTTERFLY3 m3, m6, m2, m7 |
| 249 | |
; Store the four result registers to out and, at the same time, prepare the
; register inputs of PASS6_AND_PERMUTE: that macro uses the low elements of
; registers 0, 1, 4, 5 and 6 before loading them. m6 receives the upper
; half of m3 (the data stored at out+16) and m0 the upper half of m1 (the
; data stored at out+112); the lower halves of m1, m4 and m5 already hold
; the data stored at out+96, out+48 and out+32.
| 250 | vperm2f128 m6, m3, m3, 0x31 |
| 251 | vmovaps [outq], m3 |
| 252 | |
| 253 | vextractf128 [outq+64], m5, 1 |
| 254 | vextractf128 [outq+32], m5, 0 |
| 255 | |
| 256 | vextractf128 [outq+80], m4, 1 |
| 257 | vextractf128 [outq+48], m4, 0 |
| 258 | |
| 259 | vperm2f128 m0, m1, m1, 0x31 |
| 260 | vmovaps [outq+96], m1 |
| 261 | |
; Clear the upper halves of the ymm registers before the scalar code below.
| 262 | vzeroupper |
| 263 | |
| 264 | ; pass 6, no SIMD... |
; NOTE(review): INIT_XMM is defined outside this file; it presumably switches
; the m# names to the xmm registers so that the macro below is assembled
; with 128-bit register operands - confirm.
| 265 | INIT_XMM |
| 266 | PASS6_AND_PERMUTE |
| 267 | RET |
| 268 | %endif |
| 269 | |
; x86-64 build: SPILL a, b and UNSPILL a, b are both aliases of SWAP a, b, so
; the "spill slots" 8-15 named in DCT32_FUNC are simply registers m8-m15.
; NOTE(review): SWAP comes from the included macro file and presumably only
; renames registers at assembly time - confirm.
| 270 | %if ARCH_X86_64 |
| 271 | %define SPILL SWAP |
| 272 | %define UNSPILL SWAP |
| 273 | |
; PASS5 (x86-64 variant)
; Works purely on registers: no value is stored to memory here, the only
; memory operands are the ps_cos_vec+192 multipliers read by BUTTERFLY3V.
; Steps, in order:
;   1. SWAP/PERMUTE place the eight pass-4 result vectors in m8-m15.
;   2. m8-m11 are transposed as a 4x4 block, then BUTTERFLY3V and one
;      addition are applied across the transposed registers.
;   3. The same is done for m12-m15, followed by three more additions.
; Register number 0 is passed as the scratch argument of both
; TRANSPOSE4x4PS and BUTTERFLY3V.
; NOTE(review): PERMUTE and TRANSPOSE4x4PS are defined outside this file;
; their exact argument conventions should be confirmed there.
| 274 | %macro PASS5 0 |
| 275 | nop ; FIXME code alignment |
| 276 | SWAP 5, 8 |
| 277 | SWAP 4, 12 |
| 278 | SWAP 6, 14 |
| 279 | SWAP 7, 13 |
| 280 | SWAP 0, 15 |
| 281 | PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13 |
| 282 | TRANSPOSE4x4PS 8, 9, 10, 11, 0 |
| 283 | BUTTERFLY3V 8, 9, 10, 11, 0 |
| 284 | addps m10, m11 |
| 285 | TRANSPOSE4x4PS 12, 13, 14, 15, 0 |
| 286 | BUTTERFLY3V 12, 13, 14, 15, 0 |
| 287 | addps m14, m15 |
| 288 | addps m12, m14 |
| 289 | addps m14, m13 |
| 290 | addps m13, m15 |
| 291 | %endmacro |
| 292 | |
; PASS6 (x86-64 variant)
; Final pass, working from the eight vectors left in m8-m15 by PASS5.
; Uses pshuflw and pshufd, which are SSE2 instructions.
;
; Part 1 - elements 0 and 1 of every vector:
;   element 0 of m8..m14 is stored at out+0x00, +0x10, ..., +0x60; m15 is
;   stored whole (16 bytes) at out+0x70. pshuflw with immediate 0xe copies
;   element 1 of each vector into element 0 of m0..m7. Neighbouring values
;   are then added (m0+m1, m1+m2, ..., m6+m7) and stored at out+0x08,
;   +0x18, ..., +0x68; m7 is stored unmodified at out+0x78.
; Part 2 - elements 2 and 3:
;   after the PERMUTE, each movhlps/pshufd pair extracts the upper two
;   elements of one vector (movhlps) and its element 3 (pshufd imm 3), and
;   the SWAP lines rotate the register names so the same two instructions
;   address the next vector on the following repetition.
; Part 3 - the last %rep emits 15 scalar add/store pairs for the byte
;   offsets i = 4, 12, ..., 116, rotating all sixteen register names by one
;   after each store.
; out+124 is not written by parts 1b-3; it keeps element 3 of m15 from the
; 16-byte store at out+0x70.
| 293 | %macro PASS6 0 |
| 294 | SWAP 9, 12 |
| 295 | SWAP 11, 14 |
| 296 | movss [outq+0x00], m8 |
| 297 | pshuflw m0, m8, 0xe |
| 298 | movss [outq+0x10], m9 |
| 299 | pshuflw m1, m9, 0xe |
| 300 | movss [outq+0x20], m10 |
| 301 | pshuflw m2, m10, 0xe |
| 302 | movss [outq+0x30], m11 |
| 303 | pshuflw m3, m11, 0xe |
| 304 | movss [outq+0x40], m12 |
| 305 | pshuflw m4, m12, 0xe |
| 306 | movss [outq+0x50], m13 |
| 307 | pshuflw m5, m13, 0xe |
| 308 | movss [outq+0x60], m14 |
| 309 | pshuflw m6, m14, 0xe |
| 310 | movaps [outq+0x70], m15 |
| 311 | pshuflw m7, m15, 0xe |
| 312 | addss m0, m1 |
| 313 | addss m1, m2 |
| 314 | movss [outq+0x08], m0 |
| 315 | addss m2, m3 |
| 316 | movss [outq+0x18], m1 |
| 317 | addss m3, m4 |
| 318 | movss [outq+0x28], m2 |
| 319 | addss m4, m5 |
| 320 | movss [outq+0x38], m3 |
| 321 | addss m5, m6 |
| 322 | movss [outq+0x48], m4 |
| 323 | addss m6, m7 |
| 324 | movss [outq+0x58], m5 |
| 325 | movss [outq+0x68], m6 |
| 326 | movss [outq+0x78], m7 |
| 327 | |
| 328 | PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7 |
| 329 | movhlps m0, m1 |
| 330 | pshufd m1, m1, 3 |
| 331 | SWAP 0, 2, 4, 6, 8, 10, 12, 14 |
| 332 | SWAP 1, 3, 5, 7, 9, 11, 13, 15 |
| 333 | %rep 7 |
| 334 | movhlps m0, m1 |
| 335 | pshufd m1, m1, 3 |
| 336 | addss m15, m1 |
| 337 | SWAP 0, 2, 4, 6, 8, 10, 12, 14 |
| 338 | SWAP 1, 3, 5, 7, 9, 11, 13, 15 |
| 339 | %endrep |
| 340 | %assign i 4 |
| 341 | %rep 15 |
| 342 | addss m0, m1 |
| 343 | movss [outq+i], m0 |
| 344 | SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| 345 | %assign i i+8 |
| 346 | %endrep |
| 347 | %endmacro |
| 348 | |
| 349 | %else ; ARCH_X86_32 |
; SPILL reg, slot (x86-32 variant)
;   %1 (reg):  number of the vector register to save
;   %2 (slot): spill slot; slot 8 maps to out+0 and every following slot
;              lies 16 bytes further into the out buffer
; Stores the register with an aligned 16-byte move; the register itself
; keeps its value.
%macro SPILL 2 ; xmm#, mempos
    movaps      [outq + 16*(%2 - 8)], m%1
%endmacro
; UNSPILL reg, slot (x86-32 variant)
;   %1 (reg):  number of the vector register to reload
;   %2 (slot): spill slot, using the same numbering as SPILL
;              (slot 8 is out+0, 16 bytes per slot)
; Reloads the register with an aligned 16-byte move.
%macro UNSPILL 2
    movaps      m%1, [outq + 16*(%2 - 8)]
%endmacro
| 356 | |
; x86-32 build: the last pass is the scalar PASS6_AND_PERMUTE macro.
| 357 | %define PASS6 PASS6_AND_PERMUTE |
; PASS5 (x86-32 variant)
; Applies BUTTERFLY3 to the eight vectors produced by pass 4 and stores each
; result in the out buffer through SPILL.
;   m2 = multipliers from ps_cos_vec+160
;   m3 = the pass-4 sign mask (loaded by DCT32_FUNC) shuffled with 0xcc,
;        which turns it into a mask flipping the sign of elements 1 and 3
; Resulting layout (SPILL slot -> byte offset in out):
;   8 -> +0, 9 -> +16, 10 -> +32, 11 -> +48, 12 -> +64, 13 -> +80,
;   14 -> +96, 15 -> +112
; SPILL does not modify the register, so on exit m0, m1, m4, m5 and m6 still
; hold the vectors stored in slots 15, 14, 11, 10 and 9; PASS6_AND_PERMUTE
; reads exactly those five registers before loading them.
| 358 | %macro PASS5 0 |
| 359 | movaps m2, [ps_cos_vec+160] |
| 360 | shufps m3, m3, 0xcc |
| 361 | |
| 362 | BUTTERFLY3 m5, m3, m2, m1 |
| 363 | SPILL 5, 8 |
| 364 | |
| 365 | UNSPILL 1, 9 |
| 366 | BUTTERFLY3 m1, m3, m2, m5 |
| 367 | SPILL 1, 14 |
| 368 | |
| 369 | BUTTERFLY3 m4, m3, m2, m5 |
| 370 | SPILL 4, 12 |
| 371 | |
| 372 | BUTTERFLY3 m7, m3, m2, m5 |
| 373 | SPILL 7, 13 |
| 374 | |
| 375 | UNSPILL 5, 10 |
| 376 | BUTTERFLY3 m5, m3, m2, m7 |
| 377 | SPILL 5, 10 |
| 378 | |
| 379 | UNSPILL 4, 11 |
| 380 | BUTTERFLY3 m4, m3, m2, m7 |
| 381 | SPILL 4, 11 |
| 382 | |
| 383 | BUTTERFLY3 m6, m3, m2, m7 |
| 384 | SPILL 6, 9 |
| 385 | |
| 386 | BUTTERFLY3 m0, m3, m2, m7 |
| 387 | SPILL 0, 15 |
| 388 | %endmacro |
| 389 | %endif |
| 390 | |
| 391 | |
; DCT32_FUNC emits one complete dct32_float function for the instruction set
; selected by the preceding INIT_XMM; it is instantiated at the end of the
; file (plain SSE on x86-32 only, and SSE2).
; NOTE(review): the exported symbol name presumably gets a suffix matching
; the selected instruction set from cglobal - confirm in the macro file.
;
; Structure: passes 1-4 are written out here; PASS5 and PASS6 expand to the
; architecture-specific macros defined above. in and out are accessed with
; aligned 16-byte moves. SPILL/UNSPILL slot numbers are 8-15: on x86-32 they
; are 16-byte slots inside the out buffer, on x86-64 they are registers.
| 392 | ; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in) |
| 393 | %macro DCT32_FUNC 0 |
| 394 | cglobal dct32_float, 2, 3, 16, out, in, tmp |
| 395 | ; pass 1 |
| 396 | |
; Each pass-1 butterfly pairs four inputs with four inputs loaded in
; reversed order by LOAD_INV; m3 is the scratch register throughout.
| 397 | movaps m0, [inq+0] |
| 398 | LOAD_INV m1, [inq+112] |
| 399 | BUTTERFLY m0, m1, [ps_cos_vec], m3 |
| 400 | |
| 401 | movaps m7, [inq+64] |
| 402 | LOAD_INV m4, [inq+48] |
| 403 | BUTTERFLY m7, m4, [ps_cos_vec+32], m3 |
| 404 | |
| 405 | ; pass 2 |
; The first pass-2 butterfly is issued here, before the remaining pass-1
; work, and its two results are parked in slots 11 and 8.
| 406 | movaps m2, [ps_cos_vec+64] |
| 407 | BUTTERFLY m1, m4, m2, m3 |
| 408 | SPILL 1, 11 |
| 409 | SPILL 4, 8 |
| 410 | |
| 411 | ; pass 1 |
| 412 | movaps m1, [inq+16] |
| 413 | LOAD_INV m6, [inq+96] |
| 414 | BUTTERFLY m1, m6, [ps_cos_vec+16], m3 |
| 415 | |
| 416 | movaps m4, [inq+80] |
| 417 | LOAD_INV m5, [inq+32] |
| 418 | BUTTERFLY m4, m5, [ps_cos_vec+48], m3 |
| 419 | |
| 420 | ; pass 2 |
; m2 still holds the multipliers from ps_cos_vec+64 for the first butterfly.
| 421 | BUTTERFLY m0, m7, m2, m3 |
| 422 | |
| 423 | movaps m2, [ps_cos_vec+80] |
| 424 | BUTTERFLY m6, m5, m2, m3 |
| 425 | |
| 426 | BUTTERFLY m1, m4, m2, m3 |
| 427 | |
| 428 | ; pass 3 |
; The second operand of every pass-3 butterfly is reversed (0x1b) first.
| 429 | movaps m2, [ps_cos_vec+96] |
| 430 | shufps m1, m1, 0x1b |
| 431 | BUTTERFLY m0, m1, m2, m3 |
| 432 | SPILL 0, 15 |
| 433 | SPILL 1, 14 |
| 434 | |
| 435 | UNSPILL 0, 8 |
| 436 | shufps m5, m5, 0x1b |
| 437 | BUTTERFLY m0, m5, m2, m3 |
| 438 | |
| 439 | UNSPILL 1, 11 |
| 440 | shufps m6, m6, 0x1b |
| 441 | BUTTERFLY m1, m6, m2, m3 |
| 442 | SPILL 1, 11 |
| 443 | |
| 444 | shufps m4, m4, 0x1b |
| 445 | BUTTERFLY m7, m4, m2, m3 |
| 446 | |
| 447 | ; pass 4 |
; m3 = sign mask, m2 = multipliers, m1 = scratch for all eight butterflies.
| 448 | movaps m3, [ps_p1p1m1m1+0] |
| 449 | movaps m2, [ps_cos_vec+128] |
| 450 | |
| 451 | BUTTERFLY2 m5, m3, m2, m1 |
| 452 | |
| 453 | BUTTERFLY2 m0, m3, m2, m1 |
| 454 | SPILL 0, 9 |
| 455 | |
| 456 | BUTTERFLY2 m6, m3, m2, m1 |
| 457 | SPILL 6, 10 |
| 458 | |
| 459 | UNSPILL 0, 11 |
| 460 | BUTTERFLY2 m0, m3, m2, m1 |
| 461 | SPILL 0, 11 |
| 462 | |
| 463 | BUTTERFLY2 m4, m3, m2, m1 |
| 464 | |
| 465 | BUTTERFLY2 m7, m3, m2, m1 |
| 466 | |
| 467 | UNSPILL 6, 14 |
| 468 | BUTTERFLY2 m6, m3, m2, m1 |
| 469 | |
| 470 | UNSPILL 0, 15 |
| 471 | BUTTERFLY2 m0, m3, m2, m1 |
| 472 | |
; On entry to PASS5: slots 9, 10 and 11 hold three pass-4 results, the other
; five are in m0, m4, m5, m6 and m7; m3 still holds the pass-4 sign mask.
| 473 | PASS5 |
| 474 | PASS6 |
| 475 | RET |
| 476 | %endmacro |
| 477 | |
; LOAD_INV dst, src
;   %1 (dst): destination vector register
;   %2 (src): source operand (a 16-byte memory operand at every call site)
; Loads src into dst with its four elements in reversed order (shuffle
; immediate 0x1b: element i is taken from element 3-i).
; With SSE2 a single pshufd does the load and the reversal; with plain SSE
; the value is loaded with movaps and reversed in place with shufps.
; Nothing is emitted when neither instruction set is selected.
%macro LOAD_INV 2
%if notcpuflag(sse2)
  %if cpuflag(sse)
    movaps      %1, %2
    shufps      %1, %1, 0x1b
  %endif
%else
    pshufd      %1, %2, 0x1b
%endif
%endmacro
| 486 | |
; Function instantiations. The plain-SSE build of DCT32_FUNC is emitted for
; x86-32 only; the SSE2 build is emitted for every target.
; (The x86-64 PASS6 macro uses pshuflw/pshufd, which are SSE2 instructions.)
| 487 | %if ARCH_X86_32 |
| 488 | INIT_XMM sse |
| 489 | DCT32_FUNC |
| 490 | %endif |
| 491 | INIT_XMM sse2 |
| 492 | DCT32_FUNC |