| 1 | ;****************************************************************************** |
| 2 | ;* AAC Spectral Band Replication decoding functions |
| 3 | ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> |
| 4 | ;* |
| 5 | ;* This file is part of FFmpeg. |
| 6 | ;* |
| 7 | ;* FFmpeg is free software; you can redistribute it and/or |
| 8 | ;* modify it under the terms of the GNU Lesser General Public |
| 9 | ;* License as published by the Free Software Foundation; either |
| 10 | ;* version 2.1 of the License, or (at your option) any later version. |
| 11 | ;* |
| 12 | ;* FFmpeg is distributed in the hope that it will be useful, |
| 13 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 15 | ;* Lesser General Public License for more details. |
| 16 | ;* |
| 17 | ;* You should have received a copy of the GNU Lesser General Public |
| 18 | ;* License along with FFmpeg; if not, write to the Free Software |
| 19 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 20 | ;****************************************************************************** |
| 21 | |
%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; Sign-bit masks: XORing a float lane with 1<<31 flips its sign, which is
; the cheap equivalent of multiplying the lanes by -1.0 / 1.0.
ps_mask         times 2 dd 1<<31, 0     ; flips even-indexed lanes (0 and 2)
ps_mask2        times 2 dd 0, 1<<31     ; flips odd-indexed lanes (1 and 3)
; Phase-sign patterns loaded into m0 by the sbr_hf_apply_noise_* entry points.
; (Fixed: the original ps_noise0 line carried a stray trailing comma.)
ps_noise0       times 2 dd  1.0, 0.0
ps_noise2       times 2 dd -1.0, 0.0
ps_noise13      dd  0.0,  1.0, 0.0, -1.0    ; rows selected by kx parity
                dd  0.0, -1.0, 0.0,  1.0    ; (see LOAD_NST uses: ps_noise13
                dd  0.0,  1.0, 0.0, -1.0    ;  and ps_noise13+16)
cextern sbr_noise_table
cextern ps_neg

SECTION_TEXT
| 37 | |
;------------------------------------------------------------------------------
; Sum of squares of 2*r1 floats at r0 — the summed squared magnitudes of r1
; interleaved re/im pairs (C prototype presumably
; float ff_sbr_sum_square(float (*x)[2], int n) — confirm against sbrdsp.h).
; In:  r0 = x, r1 = n (pairs; tail loop assumes n is even, see comment below)
; Out: x86-64: result in xmm0 (m0); x86-32: returned in st0 via the
;      store/fld sequence at the end.
;------------------------------------------------------------------------------
INIT_XMM sse
cglobal sbr_sum_square, 2, 3, 6
    mov         r2, r1
    xorps       m0, m0          ; accumulator 0
    xorps       m1, m1          ; accumulator 1 (two chains hide addps latency)
    sar         r2, 3           ; r2 = n/8: main loop eats 8 pairs (64 B) / iter
    jz          .prepare        ; fewer than 8 pairs: tail loop only
.loop:
    movu        m2, [r0 + 0]
    movu        m3, [r0 + 16]
    movu        m4, [r0 + 32]
    movu        m5, [r0 + 48]
    mulps       m2, m2          ; square every float
    mulps       m3, m3
    mulps       m4, m4
    mulps       m5, m5
    addps       m0, m2          ; accumulate, alternating between the two sums
    addps       m1, m3
    addps       m0, m4
    addps       m1, m5
    add         r0, 64
    dec         r2
    jnz         .loop
.prepare:
    and         r1, 7           ; r1 = leftover pairs (0..7)
    sar         r1, 1           ; tail loop handles 2 pairs (16 B) per iter
    jz          .end
    ; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
    movu        m2, [r0]
    add         r0, 16
    mulps       m2, m2
    dec         r1
    addps       m0, m2
    jnz         .endloop
.end:
    addps       m0, m1          ; merge the two accumulators, then do a
    movhlps     m2, m0          ; horizontal sum of m0's four lanes:
    addps       m0, m2          ;   fold high half onto low half
    movss       m1, m0
    shufps      m0, m0, 1       ;   fold lane 1 onto lane 0
    addss       m0, m1
%if ARCH_X86_64 == 0
    movss       r0m, m0         ; x86-32 float return goes through x87: spill
    fld         dword r0m       ; to the stack arg slot and reload into st0
%endif
    RET
| 85 | |
; Byte stride between consecutive rows of X_high: 40 complex floats per row
; (40 * 2 floats * 4 bytes).
%define STEP  40*4*2
;------------------------------------------------------------------------------
; Scales complex values from X_high by real gains:
;   Y[m] = X_high[m][ixh] * g_filt[m]  for m in [0, m_max)
; (argument names inferred from the loads below — confirm against sbrdsp.h)
; In: r0 = Y (float[2] out), r1 = X_high, r2 = g_filt, r3 = m_max, r4 = ixh
;------------------------------------------------------------------------------
cglobal sbr_hf_g_filt, 5, 6, 5
    lea         r1, [r1 + 8*r4] ; offset by ixh elements into X_high
    mov         r5, r3          ; keep m_max for the scalar tail
    and         r3, 0xFC        ; r3 = m_max rounded down to a multiple of 4
    lea         r2, [r2 + r3*4] ; point pointers at the end of the 4-at-a-time
    lea         r0, [r0 + r3*8] ; region and loop with negative offsets
    neg         r3
    jz          .loop1          ; fewer than 4 elements: scalar loop only
.loop4:
    movlps      m0, [r2 + 4*r3 + 0]     ; 4 real gains
    movlps      m1, [r2 + 4*r3 + 8]
    movlps      m2, [r1 + 0*STEP]       ; one complex value from each of
    movlps      m3, [r1 + 2*STEP]       ; 4 consecutive X_high rows
    movhps      m2, [r1 + 1*STEP]
    movhps      m3, [r1 + 3*STEP]
    unpcklps    m0, m0                  ; duplicate each gain over re and im
    unpcklps    m1, m1
    mulps       m0, m2
    mulps       m1, m3
    movu        [r0 + 8*r3 + 0], m0
    movu        [r0 + 8*r3 + 16], m1
    add         r1, 4*STEP
    add         r3, 4
    jnz         .loop4
    and         r5, 3 ; number of single element loops
    jz          .end
.loop1: ; element 0 and 1 can be computed at the same time
    movss       m0, [r2]        ; one gain ...
    movlps      m2, [r1]        ; ... times one complex value
    unpcklps    m0, m0
    mulps       m2, m0
    movlps      [r0], m2
    add         r0, 8
    add         r2, 4
    add         r1, STEP
    dec         r5
    jnz         .loop1
.end:
    RET
| 126 | |
; void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
;                        const float alpha0[2], const float alpha1[2],
;                        float bw, int start, int end)
;
; Second-order complex linear prediction (per the alpha scaling below):
;   X_high[i] = X_low[i] + X_low[i-1]*(alpha0*bw) + X_low[i-2]*(alpha1*bw^2)
; for i in [start, end), all products complex. Two outputs per iteration.
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
    ; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
    movss     bw, BWm           ; bw passed in memory; on UNIX64 it already
%endif                          ; arrives in xmm0 (== m0 == bw)
    movlps    m2, [alpha1q]
    movlps    m1, [alpha0q]
    shufps    bw, bw, 0         ; broadcast bw to all four lanes
    mulps     m2, bw            ; (a1[0] a1[1])*bw
    mulps     m1, bw            ; (a0[0] a0[1])*bw = (a2 a3)
    mulps     m2, bw            ; (a1[0] a1[1])*bw*bw = (a0 a1)
    mova      m3, m1            ; copies for the imaginary-part broadcasts
    mova      m4, m2

    ; Set pointers
%if ARCH_X86_64 == 0 || WIN64
    ; start and end 6th and 7th args on stack
    mov       r2d, Sm
    mov       r3d, Em
%define start r2q
%define end   r3q
%else
; BW does not actually occupy a register, so shift by 1
%define start BWq
%define end   Sq
%endif
    sub    start, end           ; neg num of loops
    lea  X_highq, [X_highq + end*2*4]           ; both buffers addressed from
    lea   X_lowq, [X_lowq + end*2*4 - 2*2*4]    ; the end via the negative
    shl    start, 3             ; offset from num loops      (byte) offset

    mova      m0, [X_lowq + start]  ; initial window: X_low[start-2..start-1]
    shufps    m3, m3, q1111     ; m3/m4: broadcast imaginary parts of the
    shufps    m4, m4, q1111     ; scaled alphas; xor with ps_mask flips the
    xorps     m3, [ps_mask]     ; even lanes so mulps+addps below realise the
    shufps    m1, m1, q0000     ; complex product. m1/m2: broadcast real parts.
    shufps    m2, m2, q0000
    xorps     m4, [ps_mask]
.loop2:
    movu      m7, [X_lowq + start + 8]          ; BbCc
    mova      m6, m0
    mova      m5, m7
    shufps    m0, m0, q2301                     ; aAbB (re/im swapped for the
    shufps    m7, m7, q2301                     ; bBcC  imaginary-part terms)
    mulps     m0, m4            ; X_low[i-2] * +-Im(alpha1*bw^2)
    mulps     m7, m3            ; X_low[i-1] * +-Im(alpha0*bw)
    mulps     m6, m2            ; X_low[i-2] *   Re(alpha1*bw^2)
    mulps     m5, m1            ; X_low[i-1] *   Re(alpha0*bw)
    addps     m7, m0
    mova      m0, [X_lowq + start +16]          ; CcDd (becomes next window)
    addps     m7, m0            ; + X_low[i] itself
    addps     m6, m5
    addps     m7, m6            ; all predictor terms summed
    mova      [X_highq + start], m7
    add    start, 16            ; 16 bytes = 2 complex outputs per iteration
    jnz .loop2
    RET
| 189 | |
;------------------------------------------------------------------------------
; In-place sum of five 64-float planes spaced 256 bytes apart:
;   z[i] += z[i+64] + z[i+128] + z[i+192] + z[i+256]  for i in [0, 64)
; In: zq = z (16-byte aligned; mova loads require it)
; Still assembled under the INIT_XMM sse above.
;------------------------------------------------------------------------------
cglobal sbr_sum64x5, 1,2,4,z
    lea       r1q, [zq+ 256]    ; end of the first 64-float plane
.loop:
    mova       m0, [zq+   0]    ; 8 floats per iteration, two accumulator
    mova       m2, [zq+  16]    ; pairs to overlap the addps chains
    mova       m1, [zq+ 256]
    mova       m3, [zq+ 272]
    addps      m0, [zq+ 512]
    addps      m2, [zq+ 528]
    addps      m1, [zq+ 768]
    addps      m3, [zq+ 784]
    addps      m0, [zq+1024]
    addps      m2, [zq+1040]
    addps      m0, m1
    addps      m2, m3
    mova     [zq], m0
    mova  [zq+16], m2
    add        zq, 32
    cmp        zq, r1q
    jne .loop
    REP_RET
| 211 | |
;------------------------------------------------------------------------------
; Builds 32 pairs in W from the two ends of z, walking them towards the middle:
;   W[k] = { -z[63 - k], z[k] }  for k in [0, 32)
; (ps_neg is external; presumably an all-lane sign mask — it must negate all
;  four floats here, confirm against the constants it is defined with)
; In: Wq = output (16-byte aligned), zq = input (16-byte aligned, 64 floats)
;------------------------------------------------------------------------------
INIT_XMM sse
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
    lea            r2q, [zq + (64-4)*4]     ; last 4 floats of z
    mova            m3, [ps_neg]
.loop:
    mova            m1, [zq]
    xorps           m0, m3, [r2q]           ; 3-op form: x86inc emulates it as
                                            ; mova+xorps on plain SSE
    shufps          m0, m0, m0, q0123       ; reverse the 4 negated floats
    unpcklps        m2, m0, m1              ; interleave reversed-negated with
    unpckhps        m0, m0, m1              ; forward values into pairs
    mova      [Wq + 0], m2
    mova     [Wq + 16], m0
    add             Wq, 32
    sub            r2q, 16                  ; r2 walks down, zq walks up;
    add             zq, 16                  ; stop when they meet
    cmp             zq, r2q
    jl .loop
    REP_RET
| 230 | |
;------------------------------------------------------------------------------
; Negates the odd-indexed floats of a 64-float buffer in place:
;   z[2*i+1] = -z[2*i+1]  for i in [0, 32)
; In: zq = z (16-byte aligned)
;------------------------------------------------------------------------------
INIT_XMM sse
cglobal sbr_neg_odd_64, 1,2,4,z
    lea        r1q, [zq+256]    ; 64 floats = 256 bytes
.loop:
    mova        m0, [zq+ 0]     ; 16 floats per iteration
    mova        m1, [zq+16]
    mova        m2, [zq+32]
    mova        m3, [zq+48]
    xorps       m0, [ps_mask2]  ; ps_mask2 flips the sign bit of odd lanes
    xorps       m1, [ps_mask2]
    xorps       m2, [ps_mask2]
    xorps       m3, [ps_mask2]
    mova   [zq+ 0], m0
    mova   [zq+16], m1
    mova   [zq+32], m2
    mova   [zq+48], m3
    add         zq, 64
    cmp         zq, r1q
    jne .loop
    REP_RET
| 251 | |
; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
;
; Butterfly deinterleave: writes src0 - reversed(src1) into the first 64
; floats of v (walking down from the end) and src1 + reversed(src0) into the
; next 64 (walking up), 2*mmsize bytes of each input per iteration.
; (Exact index mapping: confirm against the C reference ff_sbr_qmf_deint_bfly.)
; Instantiated twice below: plain SSE (shufps reversal) and SSE2 (pshufd).
%macro SBR_QMF_DEINT_BFLY  0
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
    mov               cq, 64*4-2*mmsize ; byte offset into src0/v, counts down
    lea            vrevq, [vq + 64*4]   ; second half of v, counts up
.loop:
    mova              m0, [src0q+cq]
    mova              m1, [src1q]
    mova              m4, [src0q+cq+mmsize]
    mova              m5, [src1q+mmsize]
%if cpuflag(sse2)
    pshufd            m2, m0, q0123     ; reverse 4 floats; pshufd saves the
    pshufd            m3, m1, q0123     ; extra mova the SSE shufps form needs
    pshufd            m6, m4, q0123
    pshufd            m7, m5, q0123
%else
    shufps            m2, m0, m0, q0123
    shufps            m3, m1, m1, q0123
    shufps            m6, m4, m4, q0123
    shufps            m7, m5, m5, q0123
%endif
    addps             m5, m2            ; src1 + reversed src0
    subps             m0, m7            ; src0 - reversed src1
    addps             m1, m6            ; (chunks crossed so outputs land in
    subps             m4, m3            ;  the right order)
    mova         [vrevq], m1
    mova  [vrevq+mmsize], m5
    mova         [vq+cq], m0
    mova  [vq+cq+mmsize], m4
    add            src1q, 2*mmsize
    add            vrevq, 2*mmsize
    sub               cq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
SBR_QMF_DEINT_BFLY

INIT_XMM sse2
SBR_QMF_DEINT_BFLY
| 293 | |
INIT_XMM sse2
;------------------------------------------------------------------------------
; Pre-QMF shuffle: writes, starting at z + 64*4 bytes, interleaved pairs built
; from negated+reversed values beginning at z[33] and forward values beginning
; at z[1], then copies z[0]/z[1] unchanged to the end of the output region.
; (Exact index mapping: confirm against the C reference ff_sbr_qmf_pre_shuffle.
;  ps_neg is external; presumably an all-lane sign mask.)
; In: zq = z; output region must be 16-byte aligned (mova stores), inputs are
;     read unaligned (movu).
;------------------------------------------------------------------------------
cglobal sbr_qmf_pre_shuffle, 1,4,6,z
%define OFFSET  (32*4-2*mmsize)
    mov           r3q, OFFSET           ; forward byte offset, counts down
    lea           r1q, [zq + (32+1)*4]  ; reversed-negated source: z[33...]
    lea           r2q, [zq + 64*4]      ; output region
    mova           m5, [ps_neg]
.loop:
    movu           m0, [r1q]
    movu           m2, [r1q + mmsize]
    movu           m1, [zq + r3q + 4 + mmsize]  ; forward source: z[1 + ...]
    movu           m3, [zq + r3q + 4]

    pxor           m2, m5               ; negate the z[33...] values ...
    pxor           m0, m5
    pshufd         m2, m2, q0123        ; ... and reverse them
    pshufd         m0, m0, q0123
    SBUTTERFLY dq, 2, 3, 4              ; interleave negated-reversed values
    SBUTTERFLY dq, 0, 1, 4              ; with the forward ones into pairs
    mova  [r2q + 2*r3q + 0*mmsize], m2  ; output advances at twice the input
    mova  [r2q + 2*r3q + 1*mmsize], m3  ; rate (pairs)
    mova  [r2q + 2*r3q + 2*mmsize], m0
    mova  [r2q + 2*r3q + 3*mmsize], m1
    add           r1q, 2*mmsize
    sub           r3q, 2*mmsize
    jge .loop
    movq           m2, [zq]             ; final pair: z[0], z[1] unchanged
    movq        [r2q], m2
    REP_RET
| 323 | |
; PIC handling for addressing the external tables: with PIC one extra GPR is
; needed to hold a lea'd address (r6 on UNIX64, where r5 already holds m_max;
; r5 elsewhere). Without PIC the symbol is addressed directly and no extra
; register is reserved (NREGS is added to the cglobal register counts below).
%ifdef PIC
%define NREGS 1
%if UNIX64
%define NOISE_TABLE r6q ; r5q is m_max
%else
%define NOISE_TABLE r5q
%endif
%else
%define NREGS 0
%define NOISE_TABLE sbr_noise_table
%endif

; LOAD_NST addr: load the 16 bytes at addr + kxq into m0, going through
; NOISE_TABLE as a scratch register when PIC addressing is required.
%macro LOAD_NST  1
%ifdef PIC
    lea  NOISE_TABLE, [%1]
    mova          m0, [kxq + NOISE_TABLE]
%else
    mova          m0, [kxq + %1]
%endif
%endmacro
| 344 | |
INIT_XMM sse2
; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
; The four entry points differ only in the phase-sign pattern placed in m0
; (ps_noise0/ps_noise2 fixed; ps_noise13 rows selected by kx parity for the
; odd phases); they all fall into the shared apply_noise_main below, which
; computes, for m in [0, m_max):
;   Y[m] += s_m[m]*phi_sign + (s_m[m] == 0 ? q_filt[m]*noise_table[idx] : 0)
; with idx pre-incremented and wrapped modulo 512 (see the loop comments).
cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova       m0, [ps_noise0]
    jmp apply_noise_main

; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and       kxq, 1
    shl       kxq, 4            ; row 0 or 1 of ps_noise13 by kx parity
    LOAD_NST  ps_noise13
    jmp apply_noise_main

; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova       m0, [ps_noise2]
    jmp apply_noise_main

; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and       kxq, 1
    shl       kxq, 4            ; row 1 or 2 of ps_noise13 by kx parity
    LOAD_NST  ps_noise13+16

apply_noise_main:
%if ARCH_X86_64 == 0 || WIN64
    mov       kxd, m_maxm       ; kx is dead now; reuse its register as counter
%define count kxq
%else
%define count m_maxq
%endif
    dec    noiseq               ; -1 so the +1/+2*mmsize loads below give the
                                ; table's pre-increment indexing
    shl    count, 2             ; count in bytes (m_max floats)
%ifdef PIC
    lea NOISE_TABLE, [sbr_noise_table]
%endif
    lea        Yq, [Yq + 2*count]   ; point all buffers at their ends and
    add      s_mq, count            ; loop with a negative byte offset
    add   q_filtq, count
    shl    noiseq, 3            ; noise index -> byte offset (8 B per entry)
    pxor       m5, m5           ; zero, for the s_m[m] == 0 compares
    neg    count
.loop:
    mova       m1, [q_filtq + count]
    movu       m3, [noiseq + NOISE_TABLE + 1*mmsize]    ; entries noise+1..+2
    movu       m4, [noiseq + NOISE_TABLE + 2*mmsize]    ; entries noise+3..+4
    add    noiseq, 2*mmsize     ; 4 table entries consumed per iteration
    and    noiseq, 0x1ff<<3     ; wrap: the noise table has 512 entries
    punpckhdq  m2, m1, m1       ; duplicate each q_filt[m] over re and im
    punpckldq  m1, m1
    mulps      m1, m3           ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
    mulps      m2, m4           ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
    mova       m3, [s_mq + count]
    ; TODO: replace by a vpermd in AVX2
    punpckhdq  m4, m3, m3       ; duplicate each s_m[m] over re and im
    punpckldq  m3, m3
    pcmpeqd    m6, m3, m5       ; m6 == 0
    pcmpeqd    m7, m4, m5       ; m7 == 0
    mulps      m3, m0           ; s_m[m] * phi_sign
    mulps      m4, m0           ; s_m[m] * phi_sign
    pand       m1, m6           ; noise term only where s_m[m] == 0
    pand       m2, m7
    movu       m6, [Yq + 2*count]
    movu       m7, [Yq + 2*count + mmsize]
    addps      m3, m1
    addps      m4, m2
    addps      m6, m3           ; Y[m] += selected contribution
    addps      m7, m4
    movu  [Yq + 2*count], m6
    movu  [Yq + 2*count + mmsize], m7
    add     count, mmsize
    jl .loop
    RET
| 426 | |
INIT_XMM sse
;------------------------------------------------------------------------------
; Deinterleaves 64 interleaved float pairs from src into two 32-float halves
; of v: the odd-indexed floats go into the half below v+OFFSET walking down,
; the even-indexed floats are negated and written into the half above it
; walking up. (Exact index mapping: confirm against the C reference
; ff_sbr_qmf_deint_neg. ps_neg is external; presumably an all-lane sign mask.)
; In: vq = v, srcq = src (16-byte aligned; mova used throughout)
;------------------------------------------------------------------------------
cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
%define COUNT  32*4             ; bytes written to each half of v
%define OFFSET 32*4
    mov        cq, -COUNT       ; negative byte offset, counts up to 0
    lea     vrevq, [vq + OFFSET + COUNT]    ; upper half, addressed via cq
    add        vq, OFFSET-mmsize            ; lower half, walks down
    add      srcq, 2*COUNT      ; end of src; reads use negative offsets
    mova       m3, [ps_neg]
.loop:
    mova       m0, [srcq + 2*cq + 0*mmsize] ; 8 floats = 4 pairs per iteration
    mova       m1, [srcq + 2*cq + 1*mmsize]
    shufps     m2, m0, m1, q2020    ; even-indexed floats of the 8
    shufps     m1, m0, q1313        ; odd-indexed floats, chunk order swapped
    xorps      m2, m3               ; negate the even-indexed ones
    mova     [vq], m1
    mova [vrevq + cq], m2
    sub        vq, mmsize
    add        cq, mmsize
    jl .loop
    REP_RET