| 1 | ;****************************************************************************** |
| 2 | ;* FLAC DSP SIMD optimizations |
| 3 | ;* |
| 4 | ;* Copyright (C) 2014 Loren Merritt |
| 5 | ;* Copyright (C) 2014 James Almer |
| 6 | ;* |
| 7 | ;* This file is part of FFmpeg. |
| 8 | ;* |
| 9 | ;* FFmpeg is free software; you can redistribute it and/or |
| 10 | ;* modify it under the terms of the GNU Lesser General Public |
| 11 | ;* License as published by the Free Software Foundation; either |
| 12 | ;* version 2.1 of the License, or (at your option) any later version. |
| 13 | ;* |
| 14 | ;* FFmpeg is distributed in the hope that it will be useful, |
| 15 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 16 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 17 | ;* Lesser General Public License for more details. |
| 18 | ;* |
| 19 | ;* You should have received a copy of the GNU Lesser General Public |
| 20 | ;* License along with FFmpeg; if not, write to the Free Software |
| 21 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 22 | ;****************************************************************************** |
| 23 | |
| 24 | %include "libavutil/x86/x86util.asm" |
| 25 | |
| 26 | SECTION .text |
| 27 | |
; LPC_32 <cpu>
;   %1 = instruction-set name handed to INIT_XMM (the file uses xop and sse4).
; Emits flac_lpc_32. Register names come from the cglobal line: decoded,
; coeffs, pred_order, qlevel and len are the five arguments, j is a scratch
; index.
;
; For every dword of decoded from index pred_order onwards, the code sums the
; products of the pred_order preceding dwords of decoded with the pred_order
; dwords of coeffs (coeffs[0] pairs with the oldest dword of the window),
; shifts the 64-bit sum right by qlevel and adds its low dword to the value
; already stored at that position.
; Two positions are produced per .loop_sample pass: m2 accumulates the first,
; m3 the second, so each coefficient is loaded once and used for both.
; NOTE(review): PMACSDQL is defined outside this file (x86util.asm is
; included at the top); it is assumed to be a signed 32x32->64
; multiply-accumulate on the low dwords -- confirm against its definition.
| 28 | %macro LPC_32 1 |
| 29 | INIT_XMM %1 |
| 30 | cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j |
; len -= pred_order; nothing to do when no positions remain.
| 31 | sub lend, pred_orderd |
| 32 | jle .ret |
; Bias both pointers so that a negative index (pred_order is negated below)
; walks the window upwards towards zero. The extra -8 on decoded is undone by
; the "add decodedq, 8" at the top of each pass.
| 33 | lea decodedq, [decodedq+pred_orderq*4-8] |
| 34 | lea coeffsq, [coeffsq+pred_orderq*4] |
| 35 | neg pred_orderq |
; m4 = shift count used by both psrlq below.
| 36 | movd m4, qlevelm |
| 37 | ALIGN 16 |
| 38 | .loop_sample: |
; m0 = oldest dword of the window, m1 = first coefficient.
| 39 | movd m0, [decodedq+pred_orderq*4+8] |
; decodedq now addresses the first of the two positions written in this pass.
| 40 | add decodedq, 8 |
| 41 | movd m1, [coeffsq+pred_orderq*4] |
; Clear both accumulators.
| 42 | pxor m2, m2 |
| 43 | pxor m3, m3 |
; j = 1 - pred_order (pred_orderq is negative here); j == 0 means a
; single-tap predictor, so the inner loop is skipped.
| 44 | lea jq, [pred_orderq+1] |
| 45 | test jq, jq |
| 46 | jz .end_order |
| 47 | .loop_order: |
; m2 += window dword * coefficient (first output position).
| 48 | PMACSDQL m2, m0, m1, m2, m0 |
| 49 | movd m0, [decodedq+jq*4] |
; m3 += same coefficient * next window dword (second output position).
| 50 | PMACSDQL m3, m1, m0, m3, m1 |
| 51 | movd m1, [coeffsq+jq*4] |
; Loop while j is still negative after the increment.
| 52 | inc jq |
| 53 | jl .loop_order |
| 54 | .end_order: |
; Final tap for the first position, then shift and add to the stored dword.
| 55 | PMACSDQL m2, m0, m1, m2, m0 |
| 56 | psrlq m2, m4 |
| 57 | movd m0, [decodedq] |
| 58 | paddd m0, m2 |
| 59 | movd [decodedq], m0 |
; len -= 2. The flags set here are tested twice: by jl right below (only one
; position was left) and by jg at the bottom of the pass. No instruction
; written between them is an integer ALU operation.
; NOTE(review): this relies on the PMACSDQL expansion leaving EFLAGS
; untouched -- confirm against its definition.
| 60 | sub lend, 2 |
| 61 | jl .ret |
; Final tap for the second position uses m0, the value just stored above.
| 62 | PMACSDQL m3, m1, m0, m3, m1 |
| 63 | psrlq m3, m4 |
| 64 | movd m1, [decodedq+4] |
| 65 | paddd m1, m3 |
| 66 | movd [decodedq+4], m1 |
; Continue while len (after the subtraction above) is still positive.
| 67 | jg .loop_sample |
| 68 | .ret: |
| 69 | REP_RET |
| 70 | %endmacro |
| 71 | |
; Instantiate flac_lpc_32 once per instruction set: the xop variant is only
; assembled when HAVE_XOP_EXTERNAL is set, the sse4 variant unconditionally.
| 72 | %if HAVE_XOP_EXTERNAL |
| 73 | LPC_32 xop |
| 74 | %endif |
| 75 | LPC_32 sse4 |
| 76 | |
| 77 | ;---------------------------------------------------------------------------------- |
| 78 | ;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels, |
| 79 | ; int len, int shift); |
| 80 | ;---------------------------------------------------------------------------------- |
; FLAC_DECORRELATE_16 <name>, <first>, <second> [, <op>]
;   %1 = function-name infix (the file uses ls, rs, ms and indep2)
;   %2 = number of the xmm register supplying the first word of each output pair
;   %3 = number of the xmm register supplying the second word of each output pair
;   %4 = "add" or "sub": selects paddd/psubd for m2 = m0 op m1 (not used and
;        not required when %1 is indep2)
; Each loop pass loads four dwords from in[0] (m0) and four from in[1] (m1),
; derives m2 where applicable, saturates m%2 and m%3 to signed words,
; interleaves them, shifts every word left by the count in m3 and stores
; 16 bytes to out[0].
| 81 | %macro FLAC_DECORRELATE_16 3-4 |
| 82 | cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len |
; m3 = shift count (argument index 4). It is read through its memory form r4m
; on x86-32 and WIN64 and through register r4d on UNIX64. On x86-32, len is
; also fetched from its memory form lenm.
; NOTE(review): presumably cglobal loads only the first two arguments into
; registers here (its first numeric field is 2) -- confirm in x86inc.
| 83 | %if ARCH_X86_32 || WIN64 |
| 84 | movd m3, r4m |
| 85 | %if ARCH_X86_32 |
| 86 | mov lend, lenm |
| 87 | %endif |
| 88 | %else ; UNIX64 |
| 89 | movd m3, r4d |
| 90 | %endif |
; len *= 4: length of each input plane in bytes (dword elements).
| 91 | shl lend, 2 |
; in1 = in[1], in0 = in[0], out = out[0]; in0q initially holds the "in" array.
| 92 | mov in1q, [in0q + gprsize] |
| 93 | mov in0q, [in0q] |
| 94 | mov outq, [outq] |
; Point all three at their ends and count a negative byte offset up to zero.
; The same offset serves the output: each pass reads 16 bytes per input and
; writes 16 bytes (four pairs of words).
| 95 | add in1q, lenq |
| 96 | add in0q, lenq |
| 97 | add outq, lenq |
| 98 | neg lenq |
| 99 | |
| 100 | align 16 |
| 101 | .loop: |
| 102 | mova m0, [in0q + lenq] |
| 103 | mova m1, [in1q + lenq] |
; ms only: m0 = in0 - (in1 >> 1), arithmetic shift.
| 104 | %ifidn %1, ms |
| 105 | psrad m2, m1, 1 |
| 106 | psubd m0, m2 |
| 107 | %endif |
; All variants except indep2: m2 = m0 + m1 or m0 - m1, as chosen by %4.
| 108 | %ifnidn %1, indep2 |
| 109 | p%4d m2, m0, m1 |
| 110 | %endif |
; Saturate both selected vectors to signed 16-bit, interleave as
; (first, second) pairs, apply the left shift and store.
| 111 | packssdw m%2, m%2 |
| 112 | packssdw m%3, m%3 |
| 113 | punpcklwd m%2, m%3 |
| 114 | psllw m%2, m3 |
| 115 | mova [outq + lenq], m%2 |
; Advance by four samples; loop while the offset is still negative. A length
; that is not a multiple of four samples is therefore rounded up.
| 116 | add lenq, 16 |
| 117 | jl .loop |
| 118 | REP_RET |
| 119 | %endmacro |
| 120 | |
; SSE2 instances of the 16-bit stereo routine. Output pair (first, second):
;   ls: (in0,       in0 - in1)
;   rs: (in0 + in1, in1)
;   ms: with t = in0 - (in1 >> 1): (t + in1, t)
| 121 | INIT_XMM sse2 |
| 122 | FLAC_DECORRELATE_16 ls, 0, 2, sub |
| 123 | FLAC_DECORRELATE_16 rs, 2, 1, add |
| 124 | FLAC_DECORRELATE_16 ms, 2, 0, add |
| 125 | |
| 126 | ;---------------------------------------------------------------------------------- |
| 127 | ;void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels, |
| 128 | ; int len, int shift); |
| 129 | ;---------------------------------------------------------------------------------- |
; FLAC_DECORRELATE_32 <name>, <first>, <second>, <tmp>, <op>
;   %1 = function-name infix (the file uses ls, rs and ms)
;   %2 = number of the xmm register holding the first dword of each output pair
;   %3 = number of the xmm register holding the second dword of each output pair
;   %4 = number of an xmm register SBUTTERFLY may use as scratch
;   %5 = "add" or "sub": selects paddd/psubd for m2 = m0 op m1
; Each loop pass loads mmsize bytes from in[0] (m0) and in[1] (m1), derives
; m2, shifts both selected vectors left by the count in m3, interleaves them
; dword-wise and stores two vectors to out[0].
| 130 | %macro FLAC_DECORRELATE_32 5 |
| 131 | cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len |
; m3 = shift count (argument index 4): memory form on x86-32 and WIN64,
; register form on UNIX64. On x86-32, len is also fetched from lenm.
| 132 | %if ARCH_X86_32 || WIN64 |
| 133 | movd m3, r4m |
| 134 | %if ARCH_X86_32 |
| 135 | mov lend, lenm |
| 136 | %endif |
| 137 | %else ; UNIX64 |
| 138 | movd m3, r4d |
| 139 | %endif |
; in1 = in[1], in0 = in[0], out = out[0]; in0q initially holds the "in" array.
| 140 | mov in1q, [in0q + gprsize] |
| 141 | mov in0q, [in0q] |
| 142 | mov outq, [outq] |
; Turn in1 into an offset from in0 so only in0q has to advance in the loop.
| 143 | sub in1q, in0q |
| 144 | |
| 145 | align 16 |
| 146 | .loop: |
| 147 | mova m0, [in0q] |
| 148 | mova m1, [in0q + in1q] |
; ms only: m0 = in0 - (in1 >> 1), arithmetic shift.
| 149 | %ifidn %1, ms |
| 150 | psrad m2, m1, 1 |
| 151 | psubd m0, m2 |
| 152 | %endif |
; m2 = m0 + m1 or m0 - m1, as chosen by %5.
| 153 | p%5d m2, m0, m1 |
| 154 | pslld m%2, m3 |
| 155 | pslld m%3, m3 |
| 156 | |
; NOTE(review): SBUTTERFLY is defined outside this file; it is assumed to
; interleave the dwords of m%2 and m%3 (low half into %2, high half into %3)
; using m%4 as scratch -- confirm in x86util.asm.
| 157 | SBUTTERFLY dq, %2, %3, %4 |
| 158 | |
| 159 | mova [outq ], m%2 |
| 160 | mova [outq + mmsize], m%3 |
| 161 | |
; One input vector consumed, two output vectors produced, mmsize/4 samples done.
| 162 | add in0q, mmsize |
| 163 | add outq, mmsize*2 |
| 164 | sub lend, mmsize/4 |
| 165 | jg .loop |
| 166 | REP_RET |
| 167 | %endmacro |
| 168 | |
; SSE2 instances of the 32-bit stereo routine. Output pair (first, second):
;   ls: (in0,       in0 - in1)                 scratch m1
;   rs: (in0 + in1, in1)                       scratch m0
;   ms: with t = in0 - (in1 >> 1): (t + in1, t) scratch m1
; The scratch register is in each case one whose value is no longer needed
; once m2 has been computed.
| 169 | INIT_XMM sse2 |
| 170 | FLAC_DECORRELATE_32 ls, 0, 2, 1, sub |
| 171 | FLAC_DECORRELATE_32 rs, 2, 1, 0, add |
| 172 | FLAC_DECORRELATE_32 ms, 2, 0, 1, add |
| 173 | |
| 174 | ;----------------------------------------------------------------------------------------- |
| 175 | ;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels, |
| 176 | ; int len, int shift); |
| 177 | ;----------------------------------------------------------------------------------------- |
; TRANSPOSE8x4D r0, r1, r2, r3, r4, r5, r6, r7, tmp
;   %1-%8 = numbers of the eight xmm registers to rearrange
;   %9    = number of an xmm register used as scratch by every SBUTTERFLY
; Performs four dword-level butterflies on adjacent register pairs, then four
; qword-level butterflies, then two register-name swaps.
; NOTE(review): judging by its name and its single use (the 8-channel, 32-bit
; path of FLAC_DECORRELATE_INDEP), this is presumably a transpose of eight
; rows of four dwords into sample-interleaved order -- the exact element order
; depends on SBUTTERFLY/SWAP, which are defined outside this file.
| 178 | %macro TRANSPOSE8x4D 9 |
| 179 | SBUTTERFLY dq, %1, %2, %9 |
| 180 | SBUTTERFLY dq, %3, %4, %9 |
| 181 | SBUTTERFLY dq, %5, %6, %9 |
| 182 | SBUTTERFLY dq, %7, %8, %9 |
| 183 | SBUTTERFLY qdq, %1, %3, %9 |
| 184 | SBUTTERFLY qdq, %2, %4, %9 |
| 185 | SBUTTERFLY qdq, %5, %7, %9 |
| 186 | SBUTTERFLY qdq, %6, %8, %9 |
; Rename registers only (no data movement is written here) so the results end
; up in the numeric register order the caller stores them in.
| 187 | SWAP %2, %5 |
| 188 | SWAP %4, %7 |
| 189 | %endmacro |
| 190 | |
| 191 | ;%1 = bps |
| 192 | ;%2 = channels |
| 193 | ;%3 = last xmm reg used |
| 194 | ;%4 = word/dword (shift instruction) |
; FLAC_DECORRELATE_INDEP <bps>, <channels>, <shiftreg>, <w|d>
; Emits flac_decorrelate_indep<channels>_<bps>: reads <channels> separate
; dword planes (in[0] .. in[channels-1]), rearranges them with the
; butterfly/shuffle sequences below, shifts each element left by the count
; held in m<shiftreg> and writes REPCOUNT vectors per pass to out[0].
; For bps == 16 the dwords are first narrowed to words with signed saturation
; (packssdw). Each pass consumes mmsize/4 samples from every plane.
| 195 | %macro FLAC_DECORRELATE_INDEP 4 |
; REPCOUNT = vectors held in m0.. and stored per pass.
| 196 | %define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels |
; Requests channels+2 general-purpose registers and %3+1 xmm registers.
| 197 | cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7 |
; m%3 = shift count (argument index 4): memory form on x86-32 and WIN64,
; register form on UNIX64.
| 198 | %if ARCH_X86_32 |
| 199 | movd m%3, r4m |
; x86-32 with six channels: the register names are redefined without len, and
; lend becomes the in-memory argument r3m, so the loop counter lives on the
; stack instead of in a register.
| 200 | %if %2 == 6 |
| 201 | DEFINE_ARGS out, in0, in1, in2, in3, in4, in5 |
| 202 | %define lend dword r3m |
| 203 | %else |
| 204 | mov lend, lenm |
| 205 | %endif |
| 206 | %elif WIN64 |
| 207 | movd m%3, r4m |
| 208 | %else ; UNIX64 |
| 209 | movd m%3, r4d |
| 210 | %endif |
| 211 | |
; Load in[1] .. in[channels-1]; in0q still holds the "in" array at this point.
| 212 | %assign %%i 1 |
| 213 | %rep %2-1 |
| 214 | mov in %+ %%i %+ q, [in0q+%%i*gprsize] |
| 215 | %assign %%i %%i+1 |
| 216 | %endrep |
| 217 | |
; in0 = in[0], out = out[0].
| 218 | mov in0q, [in0q] |
| 219 | mov outq, [outq] |
| 220 | |
; Convert every other plane pointer into an offset from in0, so only in0q has
; to advance inside the loop.
| 221 | %assign %%i 1 |
| 222 | %rep %2-1 |
| 223 | sub in %+ %%i %+ q, in0q |
| 224 | %assign %%i %%i+1 |
| 225 | %endrep |
| 226 | |
| 227 | align 16 |
| 228 | .loop: |
; Load the first REPCOUNT planes into m0 .. m(REPCOUNT-1). For 32 bps that is
; every plane; for 16 bps it is the first half, and the second half is merged
; in by the packssdw memory operands further down.
| 229 | mova m0, [in0q] |
| 230 | |
| 231 | %assign %%i 1 |
| 232 | %rep REPCOUNT-1 |
| 233 | mova m %+ %%i, [in0q + in %+ %%i %+ q] |
| 234 | %assign %%i %%i+1 |
| 235 | %endrep |
| 236 | |
; ---- 32 bps: rearrange dwords, one variant per channel count ----
| 237 | %if %1 == 32 |
| 238 | |
| 239 | %if %2 == 8 |
| 240 | TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8 |
| 241 | %elif %2 == 6 |
; Six planes: three dword butterflies followed by qword-level merges; m6 is
; the scratch/extra register. The trailing SWAP renames registers so that
; m0..m5 are in store order.
| 242 | SBUTTERFLY dq, 0, 1, 6 |
| 243 | SBUTTERFLY dq, 2, 3, 6 |
| 244 | SBUTTERFLY dq, 4, 5, 6 |
| 245 | |
| 246 | punpcklqdq m6, m0, m2 |
| 247 | punpckhqdq m2, m4 |
| 248 | shufps m4, m0, 0xe4 |
| 249 | punpcklqdq m0, m1, m3 |
| 250 | punpckhqdq m3, m5 |
| 251 | shufps m5, m1, 0xe4 |
| 252 | SWAP 0,6,1,4,5,3 |
| 253 | %elif %2 == 4 |
; NOTE(review): TRANSPOSE4x4D is defined outside this file.
| 254 | TRANSPOSE4x4D 0, 1, 2, 3, 4 |
| 255 | %else ; %2 == 2 |
| 256 | SBUTTERFLY dq, 0, 1, 2 |
| 257 | %endif |
| 258 | |
; ---- 16 bps: pack plane i with plane i+REPCOUNT into words, then rearrange ----
| 259 | %else ; %1 == 16 |
| 260 | |
| 261 | %if %2 == 8 |
| 262 | packssdw m0, [in0q + in4q] |
| 263 | packssdw m1, [in0q + in5q] |
| 264 | packssdw m2, [in0q + in6q] |
| 265 | packssdw m3, [in0q + in7q] |
; NOTE(review): TRANSPOSE2x4x4W is defined outside this file.
| 266 | TRANSPOSE2x4x4W 0, 1, 2, 3, 4 |
| 267 | %elif %2 == 6 |
| 268 | packssdw m0, [in0q + in3q] |
| 269 | packssdw m1, [in0q + in4q] |
| 270 | packssdw m2, [in0q + in5q] |
; Word-level interleave of the three packed vectors; m3 is a copy of m0 with
; its dwords permuted by q1032.
| 271 | pshufd m3, m0, q1032 |
| 272 | punpcklwd m0, m1 |
| 273 | punpckhwd m1, m2 |
| 274 | punpcklwd m2, m3 |
| 275 | |
; Dword-level shuffles that gather the pieces into three output vectors; the
; SWAP afterwards renames registers so that m0..m2 are in store order.
| 276 | shufps m3, m0, m2, q2020 |
| 277 | shufps m0, m1, q2031 |
| 278 | shufps m2, m1, q3131 |
| 279 | shufps m1, m2, m3, q3120 |
| 280 | shufps m3, m0, q0220 |
| 281 | shufps m0, m2, q3113 |
| 282 | SWAP 2, 0, 3 |
| 283 | %else ; %2 == 4 |
| 284 | packssdw m0, [in0q + in2q] |
| 285 | packssdw m1, [in0q + in3q] |
| 286 | SBUTTERFLY wd, 0, 1, 2 |
| 287 | SBUTTERFLY dq, 0, 1, 2 |
| 288 | %endif |
| 289 | |
| 290 | %endif |
| 291 | |
; Apply the left shift (psllw or pslld, chosen by %4) to every output vector.
| 292 | %assign %%i 0 |
| 293 | %rep REPCOUNT |
| 294 | psll%4 m %+ %%i, m%3 |
| 295 | %assign %%i %%i+1 |
| 296 | %endrep |
| 297 | |
; Store the REPCOUNT output vectors back to back.
| 298 | %assign %%i 0 |
| 299 | %rep REPCOUNT |
| 300 | mova [outq + %%i*mmsize], m %+ %%i |
| 301 | %assign %%i %%i+1 |
| 302 | %endrep |
| 303 | |
; Advance: one vector per input plane, REPCOUNT vectors of output, mmsize/4
; samples done. On x86-32 with six channels "lend" is a memory operand.
| 304 | add in0q, mmsize |
| 305 | add outq, mmsize*REPCOUNT |
| 306 | sub lend, mmsize/4 |
| 307 | jg .loop |
| 308 | REP_RET |
| 309 | %endmacro |
| 310 | |
; Independent-channel instances.
; SSE2: 2 channels (16 bps via the stereo macro with the indep2 name, which
; skips the add/sub step; 32 bps via FLAC_DECORRELATE_INDEP), then 4 and 6
; channels at both depths. The 8-channel variants are assembled on x86-64
; only; they request 10 general-purpose registers (channels+2), and the
; 32 bps one uses xmm registers up to m9.
| 311 | INIT_XMM sse2 |
| 312 | FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro |
| 313 | FLAC_DECORRELATE_INDEP 32, 2, 3, d |
| 314 | FLAC_DECORRELATE_INDEP 16, 4, 3, w |
| 315 | FLAC_DECORRELATE_INDEP 32, 4, 5, d |
| 316 | FLAC_DECORRELATE_INDEP 16, 6, 4, w |
| 317 | FLAC_DECORRELATE_INDEP 32, 6, 7, d |
| 318 | %if ARCH_X86_64 |
| 319 | FLAC_DECORRELATE_INDEP 16, 8, 5, w |
| 320 | FLAC_DECORRELATE_INDEP 32, 8, 9, d |
| 321 | %endif |
| 322 | |
; AVX: only the 32 bps 4/6/8-channel and the 16 bps 8-channel variants are
; instantiated again; the 8-channel ones remain x86-64 only.
| 323 | INIT_XMM avx |
| 324 | FLAC_DECORRELATE_INDEP 32, 4, 5, d |
| 325 | FLAC_DECORRELATE_INDEP 32, 6, 7, d |
| 326 | %if ARCH_X86_64 |
| 327 | FLAC_DECORRELATE_INDEP 16, 8, 5, w |
| 328 | FLAC_DECORRELATE_INDEP 32, 8, 9, d |
| 329 | %endif |