| 1 | ;****************************************************************************** |
;* FFT transform with SSE/3DNow/AVX optimizations
| 3 | ;* Copyright (c) 2008 Loren Merritt |
| 4 | ;* Copyright (c) 2011 Vitor Sessak |
| 5 | ;* |
| 6 | ;* This algorithm (though not any of the implementation details) is |
| 7 | ;* based on libdjbfft by D. J. Bernstein. |
| 8 | ;* |
| 9 | ;* This file is part of FFmpeg. |
| 10 | ;* |
| 11 | ;* FFmpeg is free software; you can redistribute it and/or |
| 12 | ;* modify it under the terms of the GNU Lesser General Public |
| 13 | ;* License as published by the Free Software Foundation; either |
| 14 | ;* version 2.1 of the License, or (at your option) any later version. |
| 15 | ;* |
| 16 | ;* FFmpeg is distributed in the hope that it will be useful, |
| 17 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 18 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 19 | ;* Lesser General Public License for more details. |
| 20 | ;* |
| 21 | ;* You should have received a copy of the GNU Lesser General Public |
| 22 | ;* License along with FFmpeg; if not, write to the Free Software |
| 23 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 24 | ;****************************************************************************** |
| 25 | |
; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow/AVX leave intermediate results
; in blocks as convenient to the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x each for 3DNow)
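;
; For example, after fft8_sse the buffer holds (illustrative only)
;   {z0.re,z1.re,z2.re,z3.re}{z0.im,z1.im,z2.im,z3.im}{z4.re,...}
; rather than the natural FFTComplex order {z0.re,z0.im}{z1.re,z1.im}...
; The _interleave dispatch entries (and the fixup loop in fft_calc) convert
; the final result back to interleaved order.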
| 30 | |
| 31 | %include "libavutil/x86/x86util.asm" |
| 32 | |
| 33 | %if ARCH_X86_64 |
| 34 | %define pointer resq |
| 35 | %else |
| 36 | %define pointer resd |
| 37 | %endif |
| 38 | |
| 39 | SECTION_RODATA 32 |
| 40 | |
| 41 | struc FFTContext |
| 42 | .nbits: resd 1 |
| 43 | .reverse: resd 1 |
| 44 | .revtab: pointer 1 |
| 45 | .tmpbuf: pointer 1 |
| 46 | .mdctsize: resd 1 |
| 47 | .mdctbits: resd 1 |
| 48 | .tcos: pointer 1 |
| 49 | .tsin: pointer 1 |
| 50 | .fftperm: pointer 1 |
| 51 | .fftcalc: pointer 1 |
.imdctcalc: pointer 1
.imdcthalf: pointer 1
| 54 | endstruc |
| 55 | |
| 56 | %define M_SQRT1_2 0.70710678118654752440 |
| 57 | %define M_COS_PI_1_8 0.923879532511287 |
| 58 | %define M_COS_PI_3_8 0.38268343236509 |
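; cos(pi/8) and cos(3*pi/8), the extra twiddle factors needed by the radix-16 step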
| 59 | |
| 60 | ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8 |
| 61 | ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8 |
| 62 | |
| 63 | ps_root2: times 8 dd M_SQRT1_2 |
| 64 | ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 |
| 65 | ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0 |
| 66 | |
| 67 | perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01 |
| 68 | perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03 |
| 69 | ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 |
| 70 | ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31 |
| 71 | ps_m1p1: dd 1<<31, 0 |
| 72 | |
| 73 | cextern ps_neg |
| 74 | |
| 75 | %assign i 16 |
| 76 | %rep 13 |
| 77 | cextern cos_ %+ i |
| 78 | %assign i i<<1 |
| 79 | %endrep |
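; declares the external twiddle tables cos_16, cos_32, ..., cos_65536 (13 in total)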
| 80 | |
| 81 | %if ARCH_X86_64 |
| 82 | %define pointer dq |
| 83 | %else |
| 84 | %define pointer dd |
| 85 | %endif |
| 86 | |
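; IF0 swallows its argument, IF1 emits it verbatim; the PASS macros below use
; "IF%1 <insn>" to include or drop individual loads/stores via a macro parameter.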
| 87 | %macro IF0 1+ |
| 88 | %endmacro |
| 89 | %macro IF1 1+ |
| 90 | %1 |
| 91 | %endmacro |
| 92 | |
| 93 | SECTION_TEXT |
| 94 | |
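; radix-2 butterfly on one packed {re,im} pair: %1 = %3 + %4, %2 = %3 - %4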
| 95 | %macro T2_3DNOW 4 ; z0, z1, mem0, mem1 |
| 96 | mova %1, %3 |
| 97 | mova %2, %1 |
| 98 | pfadd %1, %4 |
| 99 | pfsub %2, %4 |
| 100 | %endmacro |
| 101 | |
| 102 | %macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1 |
| 103 | mova %5, %3 |
| 104 | pfsub %3, %4 |
| 105 | pfadd %5, %4 ; {t6,t5} |
| 106 | pxor %3, [ps_m1p1] ; {t8,t7} |
| 107 | mova %6, %1 |
| 108 | movd [r0+12], %3 |
| 109 | punpckhdq %3, [r0+8] |
| 110 | pfadd %1, %5 ; {r0,i0} |
| 111 | pfsub %6, %5 ; {r2,i2} |
| 112 | mova %4, %2 |
| 113 | pfadd %2, %3 ; {r1,i1} |
| 114 | pfsub %4, %3 ; {r3,i3} |
| 115 | SWAP %3, %6 |
| 116 | %endmacro |
| 117 | |
| 118 | ; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6} |
| 119 | ; %2 = {r1,i1,r3,i3,r5,i5,r7,i7} |
| 120 | ; %3, %4, %5 tmp |
| 121 | ; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3} |
| 122 | ; %2 = {r4,r5,r6,r7,i4,i5,i6,i7} |
| 123 | %macro T8_AVX 5 |
| 124 | vsubps %5, %1, %2 ; v = %1 - %2 |
| 125 | vaddps %3, %1, %2 ; w = %1 + %2 |
| 126 | vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1 |
| 127 | vpermilps %2, %2, [perm1] |
| 128 | vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6} |
| 129 | vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5} |
| 130 | vsubps %4, %5, %1 ; s = r - q |
| 131 | vaddps %1, %5, %1 ; u = r + q |
| 132 | vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8} |
| 133 | vshufps %5, %4, %1, 0xbb |
| 134 | vshufps %3, %4, %1, 0xee |
| 135 | vperm2f128 %3, %3, %5, 0x13 |
| 136 | vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1} |
| 137 | vshufps %2, %1, %4, 0xdd |
| 138 | vshufps %1, %1, %4, 0x88 |
| 139 | vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4} |
| 140 | vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7} |
| 141 | vsubps %5, %1, %3 |
| 142 | vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8} |
| 143 | vsubps %2, %4, %1 ; %2 = v - w |
| 144 | vaddps %1, %4, %1 ; %1 = v + w |
| 145 | %endmacro |
| 146 | |
; In SSE mode do one fft4 transform
| 148 | ; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3} |
| 149 | ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} |
| 150 | ; |
| 151 | ; In AVX mode do two fft4 transforms |
| 152 | ; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7} |
| 153 | ; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7} |
| 154 | %macro T4_SSE 3 |
| 155 | subps %3, %1, %2 ; {t3,t4,-t8,t7} |
| 156 | addps %1, %1, %2 ; {t1,t2,t6,t5} |
| 157 | xorps %3, %3, [ps_p1p1m1p1] |
| 158 | shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8} |
| 159 | shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4} |
| 160 | subps %3, %1, %2 ; {r2,i2,r3,i3} |
| 161 | addps %1, %1, %2 ; {r0,i0,r1,i1} |
| 162 | shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3} |
| 163 | shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3} |
| 164 | %endmacro |
| 165 | |
| 166 | ; In SSE mode do one FFT8 |
| 167 | ; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7} |
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
| 169 | ; |
| 170 | ; In AVX mode do two FFT8 |
| 171 | ; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11} |
| 172 | ; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15} |
| 173 | ; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11} |
| 174 | ; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15} |
| 175 | %macro T8_SSE 6 |
| 176 | addps %6, %3, %4 ; {t1,t2,t3,t4} |
| 177 | subps %3, %3, %4 ; {r5,i5,r7,i7} |
| 178 | shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7} |
| 179 | mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} |
| 180 | mulps %4, %4, [ps_root2] |
| 181 | addps %3, %3, %4 ; {t8,t7,ta,t9} |
| 182 | shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta} |
| 183 | shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8} |
| 184 | subps %3, %6, %4 ; {t6,t5,tc,tb} |
| 185 | addps %6, %6, %4 ; {t1,t2,t9,ta} |
| 186 | shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc} |
| 187 | shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb} |
| 188 | subps %3, %1, %6 ; {r4,r5,r6,r7} |
| 189 | addps %1, %1, %6 ; {r0,r1,r2,r3} |
| 190 | subps %4, %2, %5 ; {i4,i5,i6,i7} |
| 191 | addps %2, %2, %5 ; {i0,i1,i2,i3} |
| 192 | %endmacro |
| 193 | |
| 194 | ; scheduled for cpu-bound sizes |
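; One split-radix combine step over eight vectors: the two upper quarters
; (r2/i2 and r3/i3 in the comments below) are complex-multiplied by the
; twiddles (one by wre+i*wim, the other by its conjugate) and the products
; are butterflied against the lower quarters held in Z(0..3).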
| 195 | %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim |
| 196 | IF%1 mova m4, Z(4) |
| 197 | IF%1 mova m5, Z(5) |
| 198 | mova m0, %2 ; wre |
| 199 | mova m1, %3 ; wim |
| 200 | mulps m2, m4, m0 ; r2*wre |
| 201 | IF%1 mova m6, Z2(6) |
| 202 | mulps m3, m5, m1 ; i2*wim |
| 203 | IF%1 mova m7, Z2(7) |
| 204 | mulps m4, m4, m1 ; r2*wim |
| 205 | mulps m5, m5, m0 ; i2*wre |
| 206 | addps m2, m2, m3 ; r2*wre + i2*wim |
| 207 | mulps m3, m1, m7 ; i3*wim |
| 208 | subps m5, m5, m4 ; i2*wre - r2*wim |
| 209 | mulps m1, m1, m6 ; r3*wim |
| 210 | mulps m4, m0, m6 ; r3*wre |
| 211 | mulps m0, m0, m7 ; i3*wre |
| 212 | subps m4, m4, m3 ; r3*wre - i3*wim |
| 213 | mova m3, Z(0) |
| 214 | addps m0, m0, m1 ; i3*wre + r3*wim |
| 215 | subps m1, m4, m2 ; t3 |
| 216 | addps m4, m4, m2 ; t5 |
| 217 | subps m3, m3, m4 ; r2 |
| 218 | addps m4, m4, Z(0) ; r0 |
| 219 | mova m6, Z(2) |
| 220 | mova Z(4), m3 |
| 221 | mova Z(0), m4 |
| 222 | subps m3, m5, m0 ; t4 |
| 223 | subps m4, m6, m3 ; r3 |
| 224 | addps m3, m3, m6 ; r1 |
| 225 | mova Z2(6), m4 |
| 226 | mova Z(2), m3 |
| 227 | mova m2, Z(3) |
| 228 | addps m3, m5, m0 ; t6 |
| 229 | subps m2, m2, m1 ; i3 |
| 230 | mova m7, Z(1) |
| 231 | addps m1, m1, Z(3) ; i1 |
| 232 | mova Z2(7), m2 |
| 233 | mova Z(3), m1 |
| 234 | subps m4, m7, m3 ; i2 |
| 235 | addps m3, m3, m7 ; i0 |
| 236 | mova Z(5), m4 |
| 237 | mova Z(1), m3 |
| 238 | %endmacro |
| 239 | |
| 240 | ; scheduled to avoid store->load aliasing |
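; %1 == 1 stores all results in place; %1 == 0 additionally interleaves re/im
; back into FFTComplex order through INTERL (used as the final pass of the
; _interleave variants).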
| 241 | %macro PASS_BIG 1 ; (!interleave) |
| 242 | mova m4, Z(4) ; r2 |
| 243 | mova m5, Z(5) ; i2 |
| 244 | mova m0, [wq] ; wre |
| 245 | mova m1, [wq+o1q] ; wim |
| 246 | mulps m2, m4, m0 ; r2*wre |
| 247 | mova m6, Z2(6) ; r3 |
| 248 | mulps m3, m5, m1 ; i2*wim |
| 249 | mova m7, Z2(7) ; i3 |
| 250 | mulps m4, m4, m1 ; r2*wim |
| 251 | mulps m5, m5, m0 ; i2*wre |
| 252 | addps m2, m2, m3 ; r2*wre + i2*wim |
| 253 | mulps m3, m1, m7 ; i3*wim |
| 254 | mulps m1, m1, m6 ; r3*wim |
| 255 | subps m5, m5, m4 ; i2*wre - r2*wim |
| 256 | mulps m4, m0, m6 ; r3*wre |
| 257 | mulps m0, m0, m7 ; i3*wre |
| 258 | subps m4, m4, m3 ; r3*wre - i3*wim |
| 259 | mova m3, Z(0) |
| 260 | addps m0, m0, m1 ; i3*wre + r3*wim |
| 261 | subps m1, m4, m2 ; t3 |
| 262 | addps m4, m4, m2 ; t5 |
| 263 | subps m3, m3, m4 ; r2 |
| 264 | addps m4, m4, Z(0) ; r0 |
| 265 | mova m6, Z(2) |
| 266 | mova Z(4), m3 |
| 267 | mova Z(0), m4 |
| 268 | subps m3, m5, m0 ; t4 |
| 269 | subps m4, m6, m3 ; r3 |
| 270 | addps m3, m3, m6 ; r1 |
| 271 | IF%1 mova Z2(6), m4 |
| 272 | IF%1 mova Z(2), m3 |
| 273 | mova m2, Z(3) |
| 274 | addps m5, m5, m0 ; t6 |
| 275 | subps m2, m2, m1 ; i3 |
| 276 | mova m7, Z(1) |
| 277 | addps m1, m1, Z(3) ; i1 |
| 278 | IF%1 mova Z2(7), m2 |
| 279 | IF%1 mova Z(3), m1 |
| 280 | subps m6, m7, m5 ; i2 |
| 281 | addps m5, m5, m7 ; i0 |
| 282 | IF%1 mova Z(5), m6 |
| 283 | IF%1 mova Z(1), m5 |
| 284 | %if %1==0 |
| 285 | INTERL m1, m3, m7, Z, 2 |
| 286 | INTERL m2, m4, m0, Z2, 6 |
| 287 | |
| 288 | mova m1, Z(0) |
| 289 | mova m2, Z(4) |
| 290 | |
| 291 | INTERL m5, m1, m3, Z, 0 |
| 292 | INTERL m6, m2, m7, Z, 4 |
| 293 | %endif |
| 294 | %endmacro |
| 295 | |
| 296 | %macro PUNPCK 3 |
| 297 | mova %3, %1 |
| 298 | punpckldq %1, %2 |
| 299 | punpckhdq %3, %2 |
| 300 | %endmacro |
| 301 | |
| 302 | %define Z(x) [r0+mmsize*x] |
| 303 | %define Z2(x) [r0+mmsize*x] |
| 304 | %define ZH(x) [r0+mmsize*x+mmsize/2] |
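; Z(x): the x-th vector-sized chunk of the in-place buffer. This contiguous
; addressing is only used by the small fixed-size transforms; the pass loops
; redefine Z/Z2 further down with strided addressing.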
| 305 | |
| 306 | INIT_YMM avx |
| 307 | |
| 308 | %if HAVE_AVX_EXTERNAL |
| 309 | align 16 |
| 310 | fft8_avx: |
| 311 | mova m0, Z(0) |
| 312 | mova m1, Z(1) |
| 313 | T8_AVX m0, m1, m2, m3, m4 |
| 314 | mova Z(0), m0 |
| 315 | mova Z(1), m1 |
| 316 | ret |
| 317 | |
| 318 | |
| 319 | align 16 |
| 320 | fft16_avx: |
| 321 | mova m2, Z(2) |
| 322 | mova m3, Z(3) |
| 323 | T4_SSE m2, m3, m7 |
| 324 | |
| 325 | mova m0, Z(0) |
| 326 | mova m1, Z(1) |
| 327 | T8_AVX m0, m1, m4, m5, m7 |
| 328 | |
| 329 | mova m4, [ps_cos16_1] |
| 330 | mova m5, [ps_cos16_2] |
| 331 | vmulps m6, m2, m4 |
| 332 | vmulps m7, m3, m5 |
| 333 | vaddps m7, m7, m6 |
| 334 | vmulps m2, m2, m5 |
| 335 | vmulps m3, m3, m4 |
| 336 | vsubps m3, m3, m2 |
| 337 | vblendps m2, m7, m3, 0xf0 |
| 338 | vperm2f128 m3, m7, m3, 0x21 |
| 339 | vaddps m4, m2, m3 |
| 340 | vsubps m2, m3, m2 |
| 341 | vperm2f128 m2, m2, m2, 0x01 |
| 342 | vsubps m3, m1, m2 |
| 343 | vaddps m1, m1, m2 |
| 344 | vsubps m5, m0, m4 |
| 345 | vaddps m0, m0, m4 |
| 346 | vextractf128 Z(0), m0, 0 |
| 347 | vextractf128 ZH(0), m1, 0 |
| 348 | vextractf128 Z(1), m0, 1 |
| 349 | vextractf128 ZH(1), m1, 1 |
| 350 | vextractf128 Z(2), m5, 0 |
| 351 | vextractf128 ZH(2), m3, 0 |
| 352 | vextractf128 Z(3), m5, 1 |
| 353 | vextractf128 ZH(3), m3, 1 |
| 354 | ret |
| 355 | |
| 356 | align 16 |
| 357 | fft32_avx: |
| 358 | call fft16_avx |
| 359 | |
| 360 | mova m0, Z(4) |
| 361 | mova m1, Z(5) |
| 362 | |
| 363 | T4_SSE m0, m1, m4 |
| 364 | |
| 365 | mova m2, Z(6) |
| 366 | mova m3, Z(7) |
| 367 | |
| 368 | T8_SSE m0, m1, m2, m3, m4, m6 |
| 369 | ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11} |
| 370 | ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15} |
| 371 | |
| 372 | vperm2f128 m4, m0, m2, 0x20 |
| 373 | vperm2f128 m5, m1, m3, 0x20 |
| 374 | vperm2f128 m6, m0, m2, 0x31 |
| 375 | vperm2f128 m7, m1, m3, 0x31 |
| 376 | |
| 377 | PASS_SMALL 0, [cos_32], [cos_32+32] |
| 378 | |
| 379 | ret |
| 380 | |
| 381 | fft32_interleave_avx: |
| 382 | call fft32_avx |
| 383 | mov r2d, 32 |
| 384 | .deint_loop: |
| 385 | mova m2, Z(0) |
| 386 | mova m3, Z(1) |
| 387 | vunpcklps m0, m2, m3 |
| 388 | vunpckhps m1, m2, m3 |
| 389 | vextractf128 Z(0), m0, 0 |
| 390 | vextractf128 ZH(0), m1, 0 |
| 391 | vextractf128 Z(1), m0, 1 |
| 392 | vextractf128 ZH(1), m1, 1 |
| 393 | add r0, mmsize*2 |
| 394 | sub r2d, mmsize/4 |
| 395 | jg .deint_loop |
| 396 | ret |
| 397 | |
| 398 | %endif |
| 399 | |
| 400 | INIT_XMM sse |
| 401 | |
| 402 | align 16 |
| 403 | fft4_avx: |
| 404 | fft4_sse: |
| 405 | mova m0, Z(0) |
| 406 | mova m1, Z(1) |
| 407 | T4_SSE m0, m1, m2 |
| 408 | mova Z(0), m0 |
| 409 | mova Z(1), m1 |
| 410 | ret |
| 411 | |
| 412 | align 16 |
| 413 | fft8_sse: |
| 414 | mova m0, Z(0) |
| 415 | mova m1, Z(1) |
| 416 | T4_SSE m0, m1, m2 |
| 417 | mova m2, Z(2) |
| 418 | mova m3, Z(3) |
| 419 | T8_SSE m0, m1, m2, m3, m4, m5 |
| 420 | mova Z(0), m0 |
| 421 | mova Z(1), m1 |
| 422 | mova Z(2), m2 |
| 423 | mova Z(3), m3 |
| 424 | ret |
| 425 | |
| 426 | align 16 |
| 427 | fft16_sse: |
| 428 | mova m0, Z(0) |
| 429 | mova m1, Z(1) |
| 430 | T4_SSE m0, m1, m2 |
| 431 | mova m2, Z(2) |
| 432 | mova m3, Z(3) |
| 433 | T8_SSE m0, m1, m2, m3, m4, m5 |
| 434 | mova m4, Z(4) |
| 435 | mova m5, Z(5) |
| 436 | mova Z(0), m0 |
| 437 | mova Z(1), m1 |
| 438 | mova Z(2), m2 |
| 439 | mova Z(3), m3 |
| 440 | T4_SSE m4, m5, m6 |
| 441 | mova m6, Z2(6) |
| 442 | mova m7, Z2(7) |
| 443 | T4_SSE m6, m7, m0 |
| 444 | PASS_SMALL 0, [cos_16], [cos_16+16] |
| 445 | ret |
| 446 | |
| 447 | |
| 448 | %macro FFT48_3DNOW 0 |
| 449 | align 16 |
| 450 | fft4 %+ SUFFIX: |
| 451 | T2_3DNOW m0, m1, Z(0), Z(1) |
| 452 | mova m2, Z(2) |
| 453 | mova m3, Z(3) |
| 454 | T4_3DNOW m0, m1, m2, m3, m4, m5 |
| 455 | PUNPCK m0, m1, m4 |
| 456 | PUNPCK m2, m3, m5 |
| 457 | mova Z(0), m0 |
| 458 | mova Z(1), m4 |
| 459 | mova Z(2), m2 |
| 460 | mova Z(3), m5 |
| 461 | ret |
| 462 | |
| 463 | align 16 |
| 464 | fft8 %+ SUFFIX: |
| 465 | T2_3DNOW m0, m1, Z(0), Z(1) |
| 466 | mova m2, Z(2) |
| 467 | mova m3, Z(3) |
| 468 | T4_3DNOW m0, m1, m2, m3, m4, m5 |
| 469 | mova Z(0), m0 |
| 470 | mova Z(2), m2 |
| 471 | T2_3DNOW m4, m5, Z(4), Z(5) |
| 472 | T2_3DNOW m6, m7, Z2(6), Z2(7) |
| 473 | PSWAPD m0, m5 |
| 474 | PSWAPD m2, m7 |
| 475 | pxor m0, [ps_m1p1] |
| 476 | pxor m2, [ps_m1p1] |
| 477 | pfsub m5, m0 |
| 478 | pfadd m7, m2 |
| 479 | pfmul m5, [ps_root2] |
| 480 | pfmul m7, [ps_root2] |
| 481 | T4_3DNOW m1, m3, m5, m7, m0, m2 |
| 482 | mova Z(5), m5 |
| 483 | mova Z2(7), m7 |
| 484 | mova m0, Z(0) |
| 485 | mova m2, Z(2) |
| 486 | T4_3DNOW m0, m2, m4, m6, m5, m7 |
| 487 | PUNPCK m0, m1, m5 |
| 488 | PUNPCK m2, m3, m7 |
| 489 | mova Z(0), m0 |
| 490 | mova Z(1), m5 |
| 491 | mova Z(2), m2 |
| 492 | mova Z(3), m7 |
| 493 | PUNPCK m4, Z(5), m5 |
| 494 | PUNPCK m6, Z2(7), m7 |
| 495 | mova Z(4), m4 |
| 496 | mova Z(5), m5 |
| 497 | mova Z2(6), m6 |
| 498 | mova Z2(7), m7 |
| 499 | ret |
| 500 | %endmacro |
| 501 | |
| 502 | %if ARCH_X86_32 |
| 503 | INIT_MMX 3dnowext |
| 504 | FFT48_3DNOW |
| 505 | |
| 506 | INIT_MMX 3dnow |
| 507 | FFT48_3DNOW |
| 508 | %endif |
| 509 | |
| 510 | %define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)] |
| 511 | %define Z2(x) [zcq + o3q + mmsize*(x&1)] |
| 512 | %define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2] |
| 513 | %define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2] |
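; Z(x) now indexes the quarters of the current block: o1q*(x&6) bytes plus one
; vector for odd x. Z2 covers x == 6/7 because a *6 scale cannot be encoded in
; x86 addressing, so o3q precomputes 6*o1q.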
| 514 | |
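; DECL_PASS wraps the given payload in a loop over one block: zcq advances by
; two vectors and wq (the twiddle pointer) by one vector per iteration until
; the butterfly count in nd is exhausted.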
| 515 | %macro DECL_PASS 2+ ; name, payload |
| 516 | align 16 |
| 517 | %1: |
| 518 | DEFINE_ARGS zc, w, n, o1, o3 |
| 519 | lea o3q, [nq*3] |
| 520 | lea o1q, [nq*8] |
| 521 | shl o3q, 4 |
| 522 | .loop: |
| 523 | %2 |
| 524 | add zcq, mmsize*2 |
| 525 | add wq, mmsize |
| 526 | sub nd, mmsize/8 |
| 527 | jg .loop |
| 528 | rep ret |
| 529 | %endmacro |
| 530 | |
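; Tail of the fft_calc/imdct_half setup: fetches fft%2 from dispatch_tab%1,
; indexed by nbits-2 (the smallest entry is fft4). In PIC builds the table
; stores $$-relative offsets (see SECTION_REL below), so the section base is
; added back before the call.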
| 531 | %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs |
| 532 | lea r2, [dispatch_tab%1] |
| 533 | mov r2, [r2 + (%2q-2)*gprsize] |
| 534 | %ifdef PIC |
| 535 | lea r3, [$$] |
| 536 | add r2, r3 |
| 537 | %endif |
| 538 | call r2 |
| 539 | %endmacro ; FFT_DISPATCH |
| 540 | |
| 541 | INIT_YMM avx |
| 542 | |
| 543 | %if HAVE_AVX_EXTERNAL |
| 544 | %macro INTERL_AVX 5 |
| 545 | vunpckhps %3, %2, %1 |
| 546 | vunpcklps %2, %2, %1 |
| 547 | vextractf128 %4(%5), %2, 0 |
| 548 | vextractf128 %4 %+ H(%5), %3, 0 |
| 549 | vextractf128 %4(%5 + 1), %2, 1 |
| 550 | vextractf128 %4 %+ H(%5 + 1), %3, 1 |
| 551 | %endmacro |
| 552 | |
| 553 | %define INTERL INTERL_AVX |
| 554 | |
| 555 | DECL_PASS pass_avx, PASS_BIG 1 |
| 556 | DECL_PASS pass_interleave_avx, PASS_BIG 0 |
| 557 | |
| 558 | cglobal fft_calc, 2,5,8 |
| 559 | mov r3d, [r0 + FFTContext.nbits] |
| 560 | mov r0, r1 |
| 561 | mov r1, r3 |
| 562 | FFT_DISPATCH _interleave %+ SUFFIX, r1 |
| 563 | REP_RET |
| 564 | |
| 565 | %endif |
| 566 | |
| 567 | INIT_XMM sse |
| 568 | |
| 569 | %macro INTERL_SSE 5 |
| 570 | mova %3, %2 |
| 571 | unpcklps %2, %1 |
| 572 | unpckhps %3, %1 |
| 573 | mova %4(%5), %2 |
| 574 | mova %4(%5+1), %3 |
| 575 | %endmacro |
| 576 | |
| 577 | %define INTERL INTERL_SSE |
| 578 | |
| 579 | DECL_PASS pass_sse, PASS_BIG 1 |
| 580 | DECL_PASS pass_interleave_sse, PASS_BIG 0 |
| 581 | |
| 582 | %macro FFT_CALC_FUNC 0 |
| 583 | cglobal fft_calc, 2,5,8 |
| 584 | mov r3d, [r0 + FFTContext.nbits] |
| 585 | PUSH r1 |
| 586 | PUSH r3 |
| 587 | mov r0, r1 |
| 588 | mov r1, r3 |
| 589 | FFT_DISPATCH _interleave %+ SUFFIX, r1 |
| 590 | POP rcx |
| 591 | POP r4 |
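; transforms of 16 points or fewer (8 for 3DNow) never run an interleave pass,
; so convert their split re/im blocks back to FFTComplex order here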
| 592 | cmp rcx, 3+(mmsize/16) |
| 593 | jg .end |
| 594 | mov r2, -1 |
| 595 | add rcx, 3 |
| 596 | shl r2, cl |
| 597 | sub r4, r2 |
| 598 | .loop: |
| 599 | %if mmsize == 8 |
| 600 | PSWAPD m0, [r4 + r2 + 4] |
| 601 | mova [r4 + r2 + 4], m0 |
| 602 | %else |
| 603 | movaps xmm0, [r4 + r2] |
| 604 | movaps xmm1, xmm0 |
| 605 | unpcklps xmm0, [r4 + r2 + 16] |
| 606 | unpckhps xmm1, [r4 + r2 + 16] |
| 607 | movaps [r4 + r2], xmm0 |
| 608 | movaps [r4 + r2 + 16], xmm1 |
| 609 | %endif |
| 610 | add r2, mmsize*2 |
| 611 | jl .loop |
| 612 | .end: |
| 613 | %if cpuflag(3dnow) |
| 614 | femms |
| 615 | RET |
| 616 | %else |
| 617 | REP_RET |
| 618 | %endif |
| 619 | %endmacro |
| 620 | |
| 621 | %if ARCH_X86_32 |
| 622 | INIT_MMX 3dnow |
| 623 | FFT_CALC_FUNC |
| 624 | INIT_MMX 3dnowext |
| 625 | FFT_CALC_FUNC |
| 626 | %endif |
| 627 | INIT_XMM sse |
| 628 | FFT_CALC_FUNC |
| 629 | |
| 630 | cglobal fft_permute, 2,7,1 |
| 631 | mov r4, [r0 + FFTContext.revtab] |
| 632 | mov r5, [r0 + FFTContext.tmpbuf] |
| 633 | mov ecx, [r0 + FFTContext.nbits] |
| 634 | mov r2, 1 |
| 635 | shl r2, cl |
| 636 | xor r0, r0 |
| 637 | %if ARCH_X86_32 |
| 638 | mov r1, r1m |
| 639 | %endif |
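; scatter the input into tmpbuf according to revtab, two complex values
; (one movlps/movhps pair) per iteration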
| 640 | .loop: |
| 641 | movaps xmm0, [r1 + 8*r0] |
| 642 | movzx r6, word [r4 + 2*r0] |
| 643 | movzx r3, word [r4 + 2*r0 + 2] |
| 644 | movlps [r5 + 8*r6], xmm0 |
| 645 | movhps [r5 + 8*r3], xmm0 |
| 646 | add r0, 2 |
| 647 | cmp r0, r2 |
| 648 | jl .loop |
| 649 | shl r2, 3 |
| 650 | add r1, r2 |
| 651 | add r5, r2 |
| 652 | neg r2 |
| 653 | ; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B |
| 654 | .loopcopy: |
| 655 | movaps xmm0, [r5 + r2] |
| 656 | movaps xmm1, [r5 + r2 + 16] |
| 657 | movaps [r1 + r2], xmm0 |
| 658 | movaps [r1 + r2 + 16], xmm1 |
| 659 | add r2, 32 |
| 660 | jl .loopcopy |
| 661 | REP_RET |
| 662 | |
| 663 | %macro IMDCT_CALC_FUNC 0 |
| 664 | cglobal imdct_calc, 3,5,3 |
| 665 | mov r3d, [r0 + FFTContext.mdctsize] |
| 666 | mov r4, [r0 + FFTContext.imdcthalf] |
| 667 | add r1, r3 |
| 668 | PUSH r3 |
| 669 | PUSH r1 |
| 670 | %if ARCH_X86_32 |
| 671 | push r2 |
| 672 | push r1 |
| 673 | push r0 |
| 674 | %else |
| 675 | sub rsp, 8+32*WIN64 ; allocate win64 shadow space |
| 676 | %endif |
| 677 | call r4 |
| 678 | %if ARCH_X86_32 |
| 679 | add esp, 12 |
| 680 | %else |
| 681 | add rsp, 8+32*WIN64 |
| 682 | %endif |
| 683 | POP r1 |
| 684 | POP r3 |
| 685 | lea r0, [r1 + 2*r3] |
| 686 | mov r2, r3 |
| 687 | sub r3, mmsize |
| 688 | neg r2 |
| 689 | mova m2, [ps_neg] |
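; expand the half transform into the full-length IMDCT output, roughly as the
; C imdct_calc does: mirror the computed half outward, negating one side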
| 690 | .loop: |
| 691 | %if mmsize == 8 |
| 692 | PSWAPD m0, [r1 + r3] |
| 693 | PSWAPD m1, [r0 + r2] |
| 694 | pxor m0, m2 |
| 695 | %else |
| 696 | mova m0, [r1 + r3] |
| 697 | mova m1, [r0 + r2] |
| 698 | shufps m0, m0, 0x1b |
| 699 | shufps m1, m1, 0x1b |
| 700 | xorps m0, m2 |
| 701 | %endif |
| 702 | mova [r0 + r3], m1 |
| 703 | mova [r1 + r2], m0 |
| 704 | sub r3, mmsize |
| 705 | add r2, mmsize |
| 706 | jl .loop |
| 707 | %if cpuflag(3dnow) |
| 708 | femms |
| 709 | RET |
| 710 | %else |
| 711 | REP_RET |
| 712 | %endif |
| 713 | %endmacro |
| 714 | |
| 715 | %if ARCH_X86_32 |
| 716 | INIT_MMX 3dnow |
| 717 | IMDCT_CALC_FUNC |
| 718 | INIT_MMX 3dnowext |
| 719 | IMDCT_CALC_FUNC |
| 720 | %endif |
| 721 | |
| 722 | INIT_XMM sse |
| 723 | IMDCT_CALC_FUNC |
| 724 | |
| 725 | %if ARCH_X86_32 |
| 726 | INIT_MMX 3dnow |
| 727 | %define mulps pfmul |
| 728 | %define addps pfadd |
| 729 | %define subps pfsub |
| 730 | %define unpcklps punpckldq |
| 731 | %define unpckhps punpckhdq |
| 732 | DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q] |
| 733 | DECL_PASS pass_interleave_3dnow, PASS_BIG 0 |
| 734 | %define pass_3dnowext pass_3dnow |
| 735 | %define pass_interleave_3dnowext pass_interleave_3dnow |
| 736 | %endif |
| 737 | |
| 738 | %ifdef PIC |
| 739 | %define SECTION_REL - $$ |
| 740 | %else |
| 741 | %define SECTION_REL |
| 742 | %endif |
| 743 | |
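; DECL_FFT emits fft<N> for every N from 2^%1 up to 65536. Each fft<N> runs one
; half-size transform on the first half of the buffer and two quarter-size
; transforms on the second half, then tail-jumps into the pass routine to
; combine them with the cos_<N> twiddles (split-radix decomposition). It also
; emits the matching dispatch table consumed by FFT_DISPATCH.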
| 744 | %macro DECL_FFT 1-2 ; nbits, suffix |
| 745 | %ifidn %0, 1 |
| 746 | %xdefine fullsuffix SUFFIX |
| 747 | %else |
| 748 | %xdefine fullsuffix %2 %+ SUFFIX |
| 749 | %endif |
| 750 | %xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL |
| 751 | %if %1>=5 |
| 752 | %xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL |
| 753 | %endif |
| 754 | %if %1>=6 |
| 755 | %xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL |
| 756 | %endif |
| 757 | |
| 758 | %assign n 1<<%1 |
| 759 | %rep 17-%1 |
| 760 | %assign n2 n/2 |
| 761 | %assign n4 n/4 |
| 762 | %xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL |
| 763 | |
| 764 | align 16 |
| 765 | fft %+ n %+ fullsuffix: |
| 766 | call fft %+ n2 %+ SUFFIX |
| 767 | add r0, n*4 - (n&(-2<<%1)) |
| 768 | call fft %+ n4 %+ SUFFIX |
| 769 | add r0, n*2 - (n2&(-2<<%1)) |
| 770 | call fft %+ n4 %+ SUFFIX |
| 771 | sub r0, n*6 + (n2&(-2<<%1)) |
| 772 | lea r1, [cos_ %+ n] |
| 773 | mov r2d, n4/2 |
| 774 | jmp pass %+ fullsuffix |
| 775 | |
| 776 | %assign n n*2 |
| 777 | %endrep |
| 778 | %undef n |
| 779 | |
| 780 | align 8 |
| 781 | dispatch_tab %+ fullsuffix: pointer list_of_fft |
| 782 | %endmacro ; DECL_FFT |
| 783 | |
| 784 | %if HAVE_AVX_EXTERNAL |
| 785 | INIT_YMM avx |
| 786 | DECL_FFT 6 |
| 787 | DECL_FFT 6, _interleave |
| 788 | %endif |
| 789 | INIT_XMM sse |
| 790 | DECL_FFT 5 |
| 791 | DECL_FFT 5, _interleave |
| 792 | %if ARCH_X86_32 |
| 793 | INIT_MMX 3dnow |
| 794 | DECL_FFT 4 |
| 795 | DECL_FFT 4, _interleave |
| 796 | INIT_MMX 3dnowext |
| 797 | DECL_FFT 4 |
| 798 | DECL_FFT 4, _interleave |
| 799 | %endif |
| 800 | |
| 801 | INIT_XMM sse |
| 802 | %undef mulps |
| 803 | %undef addps |
| 804 | %undef subps |
| 805 | %undef unpcklps |
| 806 | %undef unpckhps |
| 807 | |
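; IMDCT pre-rotation: load two input pairs, one from each half of the input,
; and complex-multiply them by the tcos/tsin twiddles before the FFT.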
| 808 | %macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8 |
| 809 | %if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8 |
| 810 | PSWAPD m0, [%3+%2*4] |
| 811 | movq m2, [%3+%1*4-8] |
| 812 | movq m3, m0 |
| 813 | punpckldq m0, m2 |
| 814 | punpckhdq m2, m3 |
| 815 | movd m1, [%4+%1*2-4] ; tcos[j] |
| 816 | movd m3, [%4+%2*2] ; tcos[n4-j-1] |
| 817 | punpckldq m1, [%5+%1*2-4] ; tsin[j] |
| 818 | punpckldq m3, [%5+%2*2] ; tsin[n4-j-1] |
| 819 | |
| 820 | mova m4, m0 |
| 821 | PSWAPD m5, m1 |
| 822 | pfmul m0, m1 |
| 823 | pfmul m4, m5 |
| 824 | mova m6, m2 |
| 825 | PSWAPD m5, m3 |
| 826 | pfmul m2, m3 |
| 827 | pfmul m6, m5 |
| 828 | %if cpuflag(3dnowext) |
| 829 | pfpnacc m0, m4 |
| 830 | pfpnacc m2, m6 |
| 831 | %else |
| 832 | SBUTTERFLY dq, 0, 4, 1 |
| 833 | SBUTTERFLY dq, 2, 6, 3 |
| 834 | pxor m4, m7 |
| 835 | pxor m6, m7 |
| 836 | pfadd m0, m4 |
| 837 | pfadd m2, m6 |
| 838 | %endif |
| 839 | %else |
| 840 | movaps xmm0, [%3+%2*4] |
| 841 | movaps xmm1, [%3+%1*4-0x10] |
| 842 | movaps xmm2, xmm0 |
| 843 | shufps xmm0, xmm1, 0x88 |
| 844 | shufps xmm1, xmm2, 0x77 |
| 845 | movlps xmm4, [%4+%2*2] |
| 846 | movlps xmm5, [%5+%2*2+0x0] |
| 847 | movhps xmm4, [%4+%1*2-0x8] |
| 848 | movhps xmm5, [%5+%1*2-0x8] |
| 849 | movaps xmm2, xmm0 |
| 850 | movaps xmm3, xmm1 |
| 851 | mulps xmm0, xmm5 |
| 852 | mulps xmm1, xmm4 |
| 853 | mulps xmm2, xmm4 |
| 854 | mulps xmm3, xmm5 |
| 855 | subps xmm1, xmm0 |
| 856 | addps xmm2, xmm3 |
| 857 | movaps xmm0, xmm1 |
| 858 | unpcklps xmm1, xmm2 |
| 859 | unpckhps xmm0, xmm2 |
| 860 | %endif |
| 861 | %endmacro |
| 862 | |
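; CMUL: in-place complex multiply of the (%2, %3) register pair by the
; post-rotation twiddles at byte offset %1 into tcos (%5) and tsin (%6),
; i.e. new %2 = %2*tsin - %3*tcos, new %3 = %3*tsin + %2*tcos.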
| 863 | %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5 |
| 864 | mulps m6, %3, [%5+%1] |
| 865 | mulps m7, %2, [%5+%1] |
| 866 | mulps %2, %2, [%6+%1] |
| 867 | mulps %3, %3, [%6+%1] |
| 868 | subps %2, %2, m6 |
| 869 | addps %3, %3, m7 |
| 870 | %endmacro |
| 871 | |
| 872 | %macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8 |
| 873 | .post: |
| 874 | vmovaps ymm1, [%3+%1*2] |
| 875 | vmovaps ymm0, [%3+%1*2+0x20] |
| 876 | vmovaps ymm3, [%3+%2*2] |
| 877 | vmovaps ymm2, [%3+%2*2+0x20] |
| 878 | |
| 879 | CMUL %1, ymm0, ymm1, %3, %4, %5 |
| 880 | CMUL %2, ymm2, ymm3, %3, %4, %5 |
| 881 | vshufps ymm1, ymm1, ymm1, 0x1b |
| 882 | vshufps ymm3, ymm3, ymm3, 0x1b |
| 883 | vperm2f128 ymm1, ymm1, ymm1, 0x01 |
| 884 | vperm2f128 ymm3, ymm3, ymm3, 0x01 |
| 885 | vunpcklps ymm6, ymm2, ymm1 |
| 886 | vunpckhps ymm4, ymm2, ymm1 |
| 887 | vunpcklps ymm7, ymm0, ymm3 |
| 888 | vunpckhps ymm5, ymm0, ymm3 |
| 889 | |
| 890 | vextractf128 [%3+%1*2], ymm7, 0 |
| 891 | vextractf128 [%3+%1*2+0x10], ymm5, 0 |
| 892 | vextractf128 [%3+%1*2+0x20], ymm7, 1 |
| 893 | vextractf128 [%3+%1*2+0x30], ymm5, 1 |
| 894 | |
| 895 | vextractf128 [%3+%2*2], ymm6, 0 |
| 896 | vextractf128 [%3+%2*2+0x10], ymm4, 0 |
| 897 | vextractf128 [%3+%2*2+0x20], ymm6, 1 |
| 898 | vextractf128 [%3+%2*2+0x30], ymm4, 1 |
| 899 | sub %2, 0x20 |
| 900 | add %1, 0x20 |
| 901 | jl .post |
| 902 | %endmacro |
| 903 | |
| 904 | %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8 |
| 905 | .post: |
| 906 | movaps xmm1, [%3+%1*2] |
| 907 | movaps xmm0, [%3+%1*2+0x10] |
| 908 | CMUL %1, xmm0, xmm1, %3, %4, %5 |
| 909 | movaps xmm5, [%3+%2*2] |
| 910 | movaps xmm4, [%3+%2*2+0x10] |
| 911 | CMUL %2, xmm4, xmm5, %3, %4, %5 |
| 912 | shufps xmm1, xmm1, 0x1b |
| 913 | shufps xmm5, xmm5, 0x1b |
| 914 | movaps xmm6, xmm4 |
| 915 | unpckhps xmm4, xmm1 |
| 916 | unpcklps xmm6, xmm1 |
| 917 | movaps xmm2, xmm0 |
| 918 | unpcklps xmm0, xmm5 |
| 919 | unpckhps xmm2, xmm5 |
| 920 | movaps [%3+%2*2], xmm6 |
| 921 | movaps [%3+%2*2+0x10], xmm4 |
| 922 | movaps [%3+%1*2], xmm0 |
| 923 | movaps [%3+%1*2+0x10], xmm2 |
| 924 | sub %2, 0x10 |
| 925 | add %1, 0x10 |
| 926 | jl .post |
| 927 | %endmacro |
| 928 | |
| 929 | %macro CMUL_3DNOW 6 |
| 930 | mova m6, [%1+%2*2] |
| 931 | mova %3, [%1+%2*2+8] |
| 932 | mova %4, m6 |
| 933 | mova m7, %3 |
| 934 | pfmul m6, [%5+%2] |
| 935 | pfmul %3, [%6+%2] |
| 936 | pfmul %4, [%6+%2] |
| 937 | pfmul m7, [%5+%2] |
| 938 | pfsub %3, m6 |
| 939 | pfadd %4, m7 |
| 940 | %endmacro |
| 941 | |
| 942 | %macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8 |
| 943 | .post: |
| 944 | CMUL_3DNOW %3, %1, m0, m1, %4, %5 |
| 945 | CMUL_3DNOW %3, %2, m2, m3, %4, %5 |
| 946 | movd [%3+%1*2+ 0], m0 |
| 947 | movd [%3+%2*2+12], m1 |
| 948 | movd [%3+%2*2+ 0], m2 |
| 949 | movd [%3+%1*2+12], m3 |
| 950 | psrlq m0, 32 |
| 951 | psrlq m1, 32 |
| 952 | psrlq m2, 32 |
| 953 | psrlq m3, 32 |
| 954 | movd [%3+%1*2+ 8], m0 |
| 955 | movd [%3+%2*2+ 4], m1 |
| 956 | movd [%3+%2*2+ 8], m2 |
| 957 | movd [%3+%1*2+ 4], m3 |
| 958 | sub %2, 8 |
| 959 | add %1, 8 |
| 960 | jl .post |
| 961 | %endmacro |
| 962 | |
| 963 | %macro DECL_IMDCT 1 |
| 964 | cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input |
| 965 | %if ARCH_X86_64 |
| 966 | %define rrevtab r7 |
| 967 | %define rtcos r8 |
| 968 | %define rtsin r9 |
| 969 | %else |
| 970 | %define rrevtab r6 |
| 971 | %define rtsin r6 |
| 972 | %define rtcos r5 |
| 973 | %endif |
| 974 | mov r3d, [r0+FFTContext.mdctsize] |
| 975 | add r2, r3 |
| 976 | shr r3, 1 |
| 977 | mov rtcos, [r0+FFTContext.tcos] |
| 978 | mov rtsin, [r0+FFTContext.tsin] |
| 979 | add rtcos, r3 |
| 980 | add rtsin, r3 |
| 981 | %if ARCH_X86_64 == 0 |
| 982 | push rtcos |
| 983 | push rtsin |
| 984 | %endif |
| 985 | shr r3, 1 |
| 986 | mov rrevtab, [r0+FFTContext.revtab] |
| 987 | add rrevtab, r3 |
| 988 | %if ARCH_X86_64 == 0 |
| 989 | push rrevtab |
| 990 | %endif |
| 991 | |
| 992 | %if mmsize == 8 |
| 993 | sub r3, 2 |
| 994 | %else |
| 995 | sub r3, 4 |
| 996 | %endif |
| 997 | %if ARCH_X86_64 || mmsize == 8 |
| 998 | xor r4, r4 |
| 999 | sub r4, r3 |
| 1000 | %endif |
| 1001 | %if notcpuflag(3dnowext) && mmsize == 8 |
| 1002 | movd m7, [ps_neg] |
| 1003 | %endif |
| 1004 | .pre: |
| 1005 | %if ARCH_X86_64 == 0 |
| 1006 | ;unspill |
| 1007 | %if mmsize != 8 |
| 1008 | xor r4, r4 |
| 1009 | sub r4, r3 |
| 1010 | %endif |
| 1011 | mov rtcos, [esp+8] |
| 1012 | mov rtsin, [esp+4] |
| 1013 | %endif |
| 1014 | |
| 1015 | PREROTATER r4, r3, r2, rtcos, rtsin |
| 1016 | %if mmsize == 8 |
| 1017 | mov r6, [esp] ; rrevtab = ptr+n8 |
| 1018 | movzx r5, word [rrevtab+r4-2] ; rrevtab[j] |
| 1019 | movzx r6, word [rrevtab+r3] ; rrevtab[n4-j-1] |
| 1020 | mova [r1+r5*8], m0 |
| 1021 | mova [r1+r6*8], m2 |
| 1022 | add r4, 2 |
| 1023 | sub r3, 2 |
| 1024 | %else |
| 1025 | %if ARCH_X86_64 |
| 1026 | movzx r5, word [rrevtab+r4-4] |
| 1027 | movzx r6, word [rrevtab+r4-2] |
| 1028 | movzx r10, word [rrevtab+r3] |
| 1029 | movzx r11, word [rrevtab+r3+2] |
| 1030 | movlps [r1+r5 *8], xmm0 |
| 1031 | movhps [r1+r6 *8], xmm0 |
| 1032 | movlps [r1+r10*8], xmm1 |
| 1033 | movhps [r1+r11*8], xmm1 |
| 1034 | add r4, 4 |
| 1035 | %else |
| 1036 | mov r6, [esp] |
| 1037 | movzx r5, word [r6+r4-4] |
| 1038 | movzx r4, word [r6+r4-2] |
| 1039 | movlps [r1+r5*8], xmm0 |
| 1040 | movhps [r1+r4*8], xmm0 |
| 1041 | movzx r5, word [r6+r3] |
| 1042 | movzx r4, word [r6+r3+2] |
| 1043 | movlps [r1+r5*8], xmm1 |
| 1044 | movhps [r1+r4*8], xmm1 |
| 1045 | %endif |
| 1046 | sub r3, 4 |
| 1047 | %endif |
| 1048 | jns .pre |
| 1049 | |
| 1050 | mov r5, r0 |
| 1051 | mov r6, r1 |
| 1052 | mov r0, r1 |
| 1053 | mov r1d, [r5+FFTContext.nbits] |
| 1054 | |
| 1055 | FFT_DISPATCH SUFFIX, r1 |
| 1056 | |
| 1057 | mov r0d, [r5+FFTContext.mdctsize] |
| 1058 | add r6, r0 |
| 1059 | shr r0, 1 |
| 1060 | %if ARCH_X86_64 == 0 |
| 1061 | %define rtcos r2 |
| 1062 | %define rtsin r3 |
| 1063 | mov rtcos, [esp+8] |
| 1064 | mov rtsin, [esp+4] |
| 1065 | %endif |
| 1066 | neg r0 |
| 1067 | mov r1, -mmsize |
| 1068 | sub r1, r0 |
| 1069 | %1 r0, r1, r6, rtcos, rtsin |
| 1070 | %if ARCH_X86_64 == 0 |
| 1071 | add esp, 12 |
| 1072 | %endif |
| 1073 | %if mmsize == 8 |
| 1074 | femms |
| 1075 | %endif |
| 1076 | RET |
| 1077 | %endmacro |
| 1078 | |
| 1079 | DECL_IMDCT POSROTATESHUF |
| 1080 | |
| 1081 | %if ARCH_X86_32 |
| 1082 | INIT_MMX 3dnow |
| 1083 | DECL_IMDCT POSROTATESHUF_3DNOW |
| 1084 | |
| 1085 | INIT_MMX 3dnowext |
| 1086 | DECL_IMDCT POSROTATESHUF_3DNOW |
| 1087 | %endif |
| 1088 | |
| 1089 | INIT_YMM avx |
| 1090 | |
| 1091 | %if HAVE_AVX_EXTERNAL |
| 1092 | DECL_IMDCT POSROTATESHUF_AVX |
| 1093 | %endif |