;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void ff_vector_fmul(float *dst, const float *src0, const float *src1,
;                     int len)
;-----------------------------------------------------------------------------
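; Scalar C equivalent (a sketch; len a multiple of 16 and SIMD-aligned
; pointers are assumptions taken from the 64-byte unrolled loop below):
;
;     for (i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[i];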
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
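    ; lenq becomes the byte offset of the last 64-byte (16-float) block;
    ; the loop walks it back down to 0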
    lea lenq, [lend*4 - 64]
ALIGN 16
.loop:
%assign a 0
%rep 32/mmsize
    mova    m0, [src0q + lenq + (a+0)*mmsize]
    mova    m1, [src0q + lenq + (a+1)*mmsize]
    mulps   m0, m0, [src1q + lenq + (a+0)*mmsize]
    mulps   m1, m1, [src1q + lenq + (a+1)*mmsize]
    mova    [dstq + lenq + (a+0)*mmsize], m0
    mova    [dstq + lenq + (a+1)*mmsize], m1
%assign a a+2
%endrep

    sub lenq, 64
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
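; Scalar C equivalent (a sketch; same length/alignment assumptions as
; ff_vector_fmul above, since this loop also consumes 64 bytes per pass):
;
;     for (i = 0; i < len; i++)
;         dst[i] += src[i] * mul;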

%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
cglobal vector_fmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
%endif
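; broadcast mul into every lane of m0; per the ABI it arrives in xmm0
; (UNIX64), in xmm2 (WIN64, hence the SWAP), or on the stack (x86-32)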
%if ARCH_X86_32
    VBROADCASTSS m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    shufps      xm0, xm0, 0
%if cpuflag(avx)
    vinsertf128  m0, m0, xm0, 1
%endif
%endif
    lea lenq, [lend*4-64]
.loop:
%if cpuflag(fma3)
    mova    m1,     [dstq+lenq]
    mova    m2,     [dstq+lenq+1*mmsize]
    fmaddps m1, m0, [srcq+lenq], m1
    fmaddps m2, m0, [srcq+lenq+1*mmsize], m2
%else ; cpuflag
    mulps   m1, m0, [srcq+lenq]
    mulps   m2, m0, [srcq+lenq+1*mmsize]
%if mmsize < 32
    mulps   m3, m0, [srcq+lenq+2*mmsize]
    mulps   m4, m0, [srcq+lenq+3*mmsize]
%endif ; mmsize
    addps   m1, m1, [dstq+lenq]
    addps   m2, m2, [dstq+lenq+1*mmsize]
%if mmsize < 32
    addps   m3, m3, [dstq+lenq+2*mmsize]
    addps   m4, m4, [dstq+lenq+3*mmsize]
%endif ; mmsize
%endif ; cpuflag
    mova    [dstq+lenq],          m1
    mova    [dstq+lenq+1*mmsize], m2
%if mmsize < 32
    mova    [dstq+lenq+2*mmsize], m3
    mova    [dstq+lenq+3*mmsize], m4
%endif ; mmsize
    sub lenq, 64
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
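; Scalar C equivalent (a sketch; len a multiple of 4 is an assumption
; from the one-register-per-iteration SSE loop below):
;
;     for (i = 0; i < len; i++)
;         dst[i] = src[i] * mul;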

%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    movss   m0, mulm
%elif WIN64
    SWAP 0, 2
%endif
    shufps  m0, m0, 0
    lea lenq, [lend*4-mmsize]
.loop:
    mova    m1, [srcq+lenq]
    mulps   m1, m0
    mova    [dstq+lenq], m1
    sub lenq, mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_SCALAR

;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------
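; Scalar C equivalent (a sketch; the double-precision counterpart of the
; above; len a multiple of 8 is an assumption sized to the widest (AVX)
; variant, which consumes 64 bytes per iteration):
;
;     for (i = 0; i < len; i++)
;         dst[i] = src[i] * mul;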

%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    mov lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSD m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    movlhps     xm0, xm0
%if cpuflag(avx)
    vinsertf128 ym0, ym0, xm0, 1
%endif
%endif
    lea lenq, [lend*8-2*mmsize]
.loop:
    mulpd   m1, m0, [srcq+lenq       ]
    mulpd   m2, m0, [srcq+lenq+mmsize]
    mova    [dstq+lenq       ], m1
    mova    [dstq+lenq+mmsize], m2
    sub lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif

;-----------------------------------------------------------------------------
; void ff_vector_fmul_window(float *dst, const float *src0,
;                            const float *src1, const float *win, int len)
;-----------------------------------------------------------------------------
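; Scalar C sketch of the overlap-window operation (mirrors the usual
; FFmpeg scalar form; note that dst and win each span 2*len elements,
; while src0 and src1 span len each):
;
;     dst += len; win += len; src0 += len;
;     for (i = -len, j = len - 1; i < 0; i++, j--) {
;         float s0 = src0[i], s1 = src1[j];
;         float wi = win[i],  wj = win[j];
;         dst[i] = s0 * wj - s1 * wi;
;         dst[j] = s0 * wi + s1 * wj;
;     }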
%macro VECTOR_FMUL_WINDOW 0
cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
    shl lend, 2
    lea len1q, [lenq - mmsize]
    add src0q, lenq
    add dstq, lenq
    add winq, lenq
    neg lenq
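    ; lenq (negative, counting up to 0) walks forward through src0 and the
    ; first halves of dst/win; len1q walks backward through the second
    ; halves of dst/win and reads src1 in reverse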
.loop:
    mova    m0, [winq  + lenq]
    mova    m4, [src0q + lenq]
%if cpuflag(sse)
    mova    m1, [winq  + len1q]
    mova    m5, [src1q + len1q]
    shufps  m1, m1, 0x1b
    shufps  m5, m5, 0x1b
    mova    m2, m0
    mova    m3, m1
    mulps   m2, m4
    mulps   m3, m5
    mulps   m1, m4
    mulps   m0, m5
    addps   m2, m3
    subps   m1, m0
    shufps  m2, m2, 0x1b
%else
    pswapd  m1, [winq  + len1q]
    pswapd  m5, [src1q + len1q]
    mova    m2, m0
    mova    m3, m1
    pfmul   m2, m4
    pfmul   m3, m5
    pfmul   m1, m4
    pfmul   m0, m5
    pfadd   m2, m3
    pfsub   m1, m0
    pswapd  m2, m2
%endif
    mova    [dstq + lenq],  m1
    mova    [dstq + len1q], m2
    sub len1q, mmsize
    add lenq,  mmsize
    jl .loop
%if mmsize == 8
    femms
%endif
    REP_RET
%endmacro

INIT_MMX 3dnowext
VECTOR_FMUL_WINDOW
INIT_XMM sse
VECTOR_FMUL_WINDOW

;-----------------------------------------------------------------------------
; void ff_vector_fmul_add(float *dst, const float *src0, const float *src1,
;                         const float *src2, int len)
;-----------------------------------------------------------------------------
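; Scalar C equivalent (a sketch; len a multiple of 16 is an assumption
; sized to the widest (YMM) variants, which consume 64 bytes per pass):
;
;     for (i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[i] + src2[i];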
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
    lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova    m0, [src0q + lenq]
    mova    m1, [src0q + lenq + mmsize]
%if cpuflag(fma3)
    mova    m2, [src2q + lenq]
    mova    m3, [src2q + lenq + mmsize]
    fmaddps m0, m0, [src1q + lenq], m2
    fmaddps m1, m1, [src1q + lenq + mmsize], m3
%else
    mulps   m0, m0, [src1q + lenq]
    mulps   m1, m1, [src1q + lenq + mmsize]
    addps   m0, m0, [src2q + lenq]
    addps   m1, m1, [src2q + lenq + mmsize]
%endif
    mova    [dstq + lenq], m0
    mova    [dstq + lenq + mmsize], m1

    sub lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMUL_ADD
%endif

;-----------------------------------------------------------------------------
; void ff_vector_fmul_reverse(float *dst, const float *src0,
;                             const float *src1, int len)
;-----------------------------------------------------------------------------
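; Scalar C equivalent (a sketch; src1 is consumed back to front while
; src0 and dst advance normally):
;
;     for (i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[len - 1 - i];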
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx)
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova    m0, [src1q]
    mova    m1, [src1q + mmsize]
    shufps  m0, m0, q0123
    shufps  m1, m1, q0123
%endif
    mulps   m0, m0, [src0q + lenq + mmsize]
    mulps   m1, m1, [src0q + lenq]
    mova    [dstq + lenq + mmsize], m0
    mova    [dstq + lenq], m1
    add src1q, 2*mmsize
    sub lenq,  2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif

; float ff_scalarproduct_float_sse(const float *v1, const float *v2, int len)
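; Scalar C equivalent (a sketch; len a multiple of 4 and 16-byte-aligned
; pointers are assumptions from the movaps loop below):
;
;     float sum = 0;
;     for (i = 0; i < len; i++)
;         sum += v1[i] * v2[i];
;     return sum;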
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    neg offsetq
    shl offsetq, 2
    sub v1q, offsetq
    sub v2q, offsetq
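    ; v1q/v2q now point one past the end of each array; offsetq is a
    ; negative byte index counting up to 0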
    xorps xmm0, xmm0
.loop:
    movaps  xmm1, [v1q+offsetq]
    mulps   xmm1, [v2q+offsetq]
    addps   xmm0, xmm1
    add offsetq, 16
    js .loop
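    ; horizontal sum: fold the four partial sums in xmm0 into one float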
    movhlps xmm1, xmm0
    addps   xmm0, xmm1
    movss   xmm1, xmm0
    shufps  xmm0, xmm0, 1
    addss   xmm0, xmm1
%if ARCH_X86_64 == 0
    ; on x86-32 a float return value travels in st0, not xmm0
    movss   r0m, xmm0
    fld dword r0m
%endif
    RET

;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len)
;-----------------------------------------------------------------------------
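; Scalar C equivalent (a sketch; an in-place add/subtract butterfly; len a
; multiple of 4 is an assumption from the one-xmm-per-pass loop below):
;
;     for (i = 0; i < len; i++) {
;         float t = src0[i] - src1[i];
;         src0[i] = src0[i] + src1[i];
;         src1[i] = t;
;     }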
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
%if ARCH_X86_64
    movsxd lenq, lend
%endif
    test lenq, lenq
    jz .end
    shl lenq, 2
    add src0q, lenq
    add src1q, lenq
    neg lenq
.loop:
    mova    m0, [src0q + lenq]
    mova    m1, [src1q + lenq]
    subps   m2, m0, m1
    addps   m0, m0, m1
    mova    [src1q + lenq], m2
    mova    [src0q + lenq], m0
    add lenq, mmsize
    jl .loop
.end:
    REP_RET