| 1 | ;****************************************************************************** |
| 2 | ;* Copyright (c) 2012 Michael Niedermayer |
| 3 | ;* Copyright (c) 2014 James Almer <jamrial <at> gmail.com> |
| 4 | ;* Copyright (c) 2014 Ronald S. Bultje <rsbultje@gmail.com> |
| 5 | ;* |
| 6 | ;* This file is part of FFmpeg. |
| 7 | ;* |
| 8 | ;* FFmpeg is free software; you can redistribute it and/or |
| 9 | ;* modify it under the terms of the GNU Lesser General Public |
| 10 | ;* License as published by the Free Software Foundation; either |
| 11 | ;* version 2.1 of the License, or (at your option) any later version. |
| 12 | ;* |
| 13 | ;* FFmpeg is distributed in the hope that it will be useful, |
| 14 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 15 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 16 | ;* Lesser General Public License for more details. |
| 17 | ;* |
| 18 | ;* You should have received a copy of the GNU Lesser General Public |
| 19 | ;* License along with FFmpeg; if not, write to the Free Software |
| 20 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 21 | ;****************************************************************************** |
| 22 | |
| 23 | %include "libavutil/x86/x86util.asm" |
| 24 | |
| 25 | %if ARCH_X86_64 |
| 26 | %define pointer resq |
| 27 | %else |
| 28 | %define pointer resd |
| 29 | %endif |
| 30 | |
| 31 | struc ResampleContext |
| 32 | .av_class: pointer 1 |
| 33 | .filter_bank: pointer 1 |
| 34 | .filter_length: resd 1 |
| 35 | .filter_alloc: resd 1 |
| 36 | .ideal_dst_incr: resd 1 |
| 37 | .dst_incr: resd 1 |
| 38 | .dst_incr_div: resd 1 |
| 39 | .dst_incr_mod: resd 1 |
| 40 | .index: resd 1 |
| 41 | .frac: resd 1 |
| 42 | .src_incr: resd 1 |
| 43 | .compensation_distance: resd 1 |
| 44 | .phase_shift: resd 1 |
| 45 | .phase_mask: resd 1 |
| 46 | |
| 47 | ; there's a few more here but we only care about the first few |
| 48 | endstruc |
| 49 | |
| 50 | SECTION_RODATA |
| 51 | |
| 52 | pf_1: dd 1.0 |
| 53 | pdbl_1: dq 1.0 |
| 54 | pd_0x4000: dd 0x4000 |
| 55 | |
| 56 | SECTION .text |
| 57 | |
| 58 | %macro RESAMPLE_FNS 3-5 ; format [float or int16], bps, log2_bps, float op suffix [s or d], 1.0 constant |
| 59 | ; int resample_common_$format(ResampleContext *ctx, $format *dst, |
| 60 | ; const $format *src, int size, int update_ctx) |
| 61 | %if ARCH_X86_64 ; unix64 and win64 |
| 62 | cglobal resample_common_%1, 0, 15, 2, ctx, dst, src, phase_shift, index, frac, \ |
| 63 | dst_incr_mod, size, min_filter_count_x4, \ |
| 64 | min_filter_len_x4, dst_incr_div, src_incr, \ |
| 65 | phase_mask, dst_end, filter_bank |
| 66 | |
| 67 | ; use red-zone for variable storage |
| 68 | %define ctx_stackq [rsp-0x8] |
| 69 | %define src_stackq [rsp-0x10] |
| 70 | %if WIN64 |
| 71 | %define update_context_stackd r4m |
| 72 | %else ; unix64 |
| 73 | %define update_context_stackd [rsp-0x14] |
| 74 | %endif |
| 75 | |
| 76 | ; load as many variables in registers as possible; for the rest, store |
| 77 | ; on stack so that we have 'ctx' available as one extra register |
| 78 | mov sized, r3d |
| 79 | mov phase_maskd, [ctxq+ResampleContext.phase_mask] |
| 80 | %if UNIX64 |
| 81 | mov update_context_stackd, r4d |
| 82 | %endif |
| 83 | mov indexd, [ctxq+ResampleContext.index] |
| 84 | mov fracd, [ctxq+ResampleContext.frac] |
| 85 | mov dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod] |
| 86 | mov filter_bankq, [ctxq+ResampleContext.filter_bank] |
| 87 | mov src_incrd, [ctxq+ResampleContext.src_incr] |
| 88 | mov ctx_stackq, ctxq |
| 89 | mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length] |
| 90 | mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div] |
| 91 | shl min_filter_len_x4d, %3 |
| 92 | lea dst_endq, [dstq+sizeq*%2] |
| 93 | |
| 94 | %if UNIX64 |
| 95 | mov ecx, [ctxq+ResampleContext.phase_shift] |
| 96 | mov edi, [ctxq+ResampleContext.filter_alloc] |
| 97 | |
| 98 | DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \ |
| 99 | filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \ |
| 100 | src_incr, phase_mask, dst_end, filter_bank |
| 101 | %elif WIN64 |
| 102 | mov R9d, [ctxq+ResampleContext.filter_alloc] |
| 103 | mov ecx, [ctxq+ResampleContext.phase_shift] |
| 104 | |
| 105 | DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \ |
| 106 | filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \ |
| 107 | src_incr, phase_mask, dst_end, filter_bank |
| 108 | %endif |
| 109 | |
| 110 | neg min_filter_len_x4q |
| 111 | sub filter_bankq, min_filter_len_x4q |
| 112 | sub srcq, min_filter_len_x4q |
| 113 | mov src_stackq, srcq |
| 114 | %else ; x86-32 |
| 115 | cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \ |
| 116 | index, min_filter_length_x4, filter_bank |
| 117 | |
| 118 | ; push temp variables to stack |
| 119 | %define ctx_stackq r0mp |
| 120 | %define src_stackq r2mp |
| 121 | %define update_context_stackd r4m |
| 122 | |
| 123 | mov dstq, r1mp |
| 124 | mov r3, r3mp |
| 125 | lea r3, [dstq+r3*%2] |
| 126 | PUSH dword [ctxq+ResampleContext.dst_incr_div] |
| 127 | PUSH dword [ctxq+ResampleContext.dst_incr_mod] |
| 128 | PUSH dword [ctxq+ResampleContext.filter_alloc] |
| 129 | PUSH r3 |
| 130 | PUSH dword [ctxq+ResampleContext.phase_mask] |
| 131 | PUSH dword [ctxq+ResampleContext.src_incr] |
| 132 | mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length] |
| 133 | mov indexd, [ctxq+ResampleContext.index] |
| 134 | shl min_filter_length_x4d, %3 |
| 135 | mov fracd, [ctxq+ResampleContext.frac] |
| 136 | neg min_filter_length_x4q |
| 137 | mov filter_bankq, [ctxq+ResampleContext.filter_bank] |
| 138 | sub r2mp, min_filter_length_x4q |
| 139 | sub filter_bankq, min_filter_length_x4q |
| 140 | PUSH min_filter_length_x4q |
| 141 | PUSH filter_bankq |
| 142 | mov phase_shiftd, [ctxq+ResampleContext.phase_shift] |
| 143 | |
| 144 | DEFINE_ARGS src, phase_shift, dst, frac, index, min_filter_count_x4, filter |
| 145 | |
| 146 | %define filter_bankq dword [rsp+0x0] |
| 147 | %define min_filter_length_x4q dword [rsp+0x4] |
| 148 | %define src_incrd dword [rsp+0x8] |
| 149 | %define phase_maskd dword [rsp+0xc] |
| 150 | %define dst_endq dword [rsp+0x10] |
| 151 | %define filter_allocd dword [rsp+0x14] |
| 152 | %define dst_incr_modd dword [rsp+0x18] |
| 153 | %define dst_incr_divd dword [rsp+0x1c] |
| 154 | |
| 155 | mov srcq, r2mp |
| 156 | %endif |
| 157 | |
| 158 | .loop: |
| 159 | mov filterd, filter_allocd |
| 160 | imul filterd, indexd |
| 161 | %if ARCH_X86_64 |
| 162 | mov min_filter_count_x4q, min_filter_len_x4q |
| 163 | lea filterq, [filter_bankq+filterq*%2] |
| 164 | %else ; x86-32 |
| 165 | mov min_filter_count_x4q, filter_bankq |
| 166 | lea filterq, [min_filter_count_x4q+filterq*%2] |
| 167 | mov min_filter_count_x4q, min_filter_length_x4q |
| 168 | %endif |
| 169 | %ifidn %1, int16 |
| 170 | movd m0, [pd_0x4000] |
| 171 | %else ; float/double |
| 172 | xorps m0, m0, m0 |
| 173 | %endif |
| 174 | |
| 175 | align 16 |
| 176 | .inner_loop: |
| 177 | movu m1, [srcq+min_filter_count_x4q*1] |
| 178 | %ifidn %1, int16 |
| 179 | PMADCSWD m0, m1, [filterq+min_filter_count_x4q*1], m0, m1 |
| 180 | %else ; float/double |
| 181 | %if cpuflag(fma4) || cpuflag(fma3) |
| 182 | fmaddp%4 m0, m1, [filterq+min_filter_count_x4q*1], m0 |
| 183 | %else |
| 184 | mulp%4 m1, m1, [filterq+min_filter_count_x4q*1] |
| 185 | addp%4 m0, m0, m1 |
| 186 | %endif ; cpuflag |
| 187 | %endif |
| 188 | add min_filter_count_x4q, mmsize |
| 189 | js .inner_loop |
| 190 | |
| 191 | %ifidn %1, int16 |
| 192 | HADDD m0, m1 |
| 193 | psrad m0, 15 |
| 194 | add fracd, dst_incr_modd |
| 195 | packssdw m0, m0 |
| 196 | add indexd, dst_incr_divd |
| 197 | movd [dstq], m0 |
| 198 | %else ; float/double |
| 199 | ; horizontal sum & store |
| 200 | %if mmsize == 32 |
| 201 | vextractf128 xm1, m0, 0x1 |
| 202 | addps xm0, xm1 |
| 203 | %endif |
| 204 | movhlps xm1, xm0 |
| 205 | %ifidn %1, float |
| 206 | addps xm0, xm1 |
| 207 | shufps xm1, xm0, xm0, q0001 |
| 208 | %endif |
| 209 | add fracd, dst_incr_modd |
| 210 | addp%4 xm0, xm1 |
| 211 | add indexd, dst_incr_divd |
| 212 | movs%4 [dstq], xm0 |
| 213 | %endif |
| 214 | cmp fracd, src_incrd |
| 215 | jl .skip |
| 216 | sub fracd, src_incrd |
| 217 | inc indexd |
| 218 | |
| 219 | %if UNIX64 |
| 220 | DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \ |
| 221 | index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \ |
| 222 | src_incr, phase_mask, dst_end, filter_bank |
| 223 | %elif WIN64 |
| 224 | DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \ |
| 225 | index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \ |
| 226 | src_incr, phase_mask, dst_end, filter_bank |
| 227 | %else ; x86-32 |
| 228 | DEFINE_ARGS src, phase_shift, dst, frac, index, index_incr |
| 229 | %endif |
| 230 | |
| 231 | .skip: |
| 232 | mov index_incrd, indexd |
| 233 | add dstq, %2 |
| 234 | and indexd, phase_maskd |
| 235 | sar index_incrd, phase_shiftb |
| 236 | lea srcq, [srcq+index_incrq*%2] |
| 237 | cmp dstq, dst_endq |
| 238 | jne .loop |
| 239 | |
| 240 | %if ARCH_X86_64 |
| 241 | DEFINE_ARGS ctx, dst, src, phase_shift, index, frac |
| 242 | %else ; x86-32 |
| 243 | DEFINE_ARGS src, ctx, update_context, frac, index |
| 244 | %endif |
| 245 | |
| 246 | cmp dword update_context_stackd, 0 |
| 247 | jz .skip_store |
| 248 | ; strictly speaking, the function should always return the consumed |
| 249 | ; number of bytes; however, we only use the value if update_context |
| 250 | ; is true, so let's just leave it uninitialized otherwise |
| 251 | mov ctxq, ctx_stackq |
| 252 | movifnidn rax, srcq |
| 253 | mov [ctxq+ResampleContext.frac ], fracd |
| 254 | sub rax, src_stackq |
| 255 | mov [ctxq+ResampleContext.index], indexd |
| 256 | shr rax, %3 |
| 257 | |
| 258 | .skip_store: |
| 259 | %if ARCH_X86_32 |
| 260 | ADD rsp, 0x20 |
| 261 | %endif |
| 262 | RET |
| 263 | |
| 264 | ; int resample_linear_$format(ResampleContext *ctx, float *dst, |
| 265 | ; const float *src, int size, int update_ctx) |
| 266 | %if ARCH_X86_64 ; unix64 and win64 |
| 267 | %if UNIX64 |
| 268 | cglobal resample_linear_%1, 0, 15, 5, ctx, dst, phase_mask, phase_shift, index, frac, \ |
| 269 | size, dst_incr_mod, min_filter_count_x4, \ |
| 270 | min_filter_len_x4, dst_incr_div, src_incr, \ |
| 271 | src, dst_end, filter_bank |
| 272 | |
| 273 | mov srcq, r2mp |
| 274 | %else ; win64 |
| 275 | cglobal resample_linear_%1, 0, 15, 5, ctx, phase_mask, src, phase_shift, index, frac, \ |
| 276 | size, dst_incr_mod, min_filter_count_x4, \ |
| 277 | min_filter_len_x4, dst_incr_div, src_incr, \ |
| 278 | dst, dst_end, filter_bank |
| 279 | |
| 280 | mov dstq, r1mp |
| 281 | %endif |
| 282 | |
| 283 | ; use red-zone for variable storage |
| 284 | %define ctx_stackq [rsp-0x8] |
| 285 | %define src_stackq [rsp-0x10] |
| 286 | %define phase_mask_stackd [rsp-0x14] |
| 287 | %if WIN64 |
| 288 | %define update_context_stackd r4m |
| 289 | %else ; unix64 |
| 290 | %define update_context_stackd [rsp-0x18] |
| 291 | %endif |
| 292 | |
| 293 | ; load as many variables in registers as possible; for the rest, store |
| 294 | ; on stack so that we have 'ctx' available as one extra register |
| 295 | mov sized, r3d |
| 296 | mov phase_maskd, [ctxq+ResampleContext.phase_mask] |
| 297 | %if UNIX64 |
| 298 | mov update_context_stackd, r4d |
| 299 | %endif |
| 300 | mov indexd, [ctxq+ResampleContext.index] |
| 301 | mov fracd, [ctxq+ResampleContext.frac] |
| 302 | mov dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod] |
| 303 | mov filter_bankq, [ctxq+ResampleContext.filter_bank] |
| 304 | mov src_incrd, [ctxq+ResampleContext.src_incr] |
| 305 | mov ctx_stackq, ctxq |
| 306 | mov phase_mask_stackd, phase_maskd |
| 307 | mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length] |
| 308 | %ifidn %1, int16 |
| 309 | movd m4, [pd_0x4000] |
| 310 | %else ; float/double |
| 311 | cvtsi2s%4 xm0, src_incrd |
| 312 | movs%4 xm4, [%5] |
| 313 | divs%4 xm4, xm0 |
| 314 | %endif |
| 315 | mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div] |
| 316 | shl min_filter_len_x4d, %3 |
| 317 | lea dst_endq, [dstq+sizeq*%2] |
| 318 | |
| 319 | %if UNIX64 |
| 320 | mov ecx, [ctxq+ResampleContext.phase_shift] |
| 321 | mov edi, [ctxq+ResampleContext.filter_alloc] |
| 322 | |
| 323 | DEFINE_ARGS filter_alloc, dst, filter2, phase_shift, index, frac, filter1, \ |
| 324 | dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \ |
| 325 | dst_incr_div, src_incr, src, dst_end, filter_bank |
| 326 | %elif WIN64 |
| 327 | mov R9d, [ctxq+ResampleContext.filter_alloc] |
| 328 | mov ecx, [ctxq+ResampleContext.phase_shift] |
| 329 | |
| 330 | DEFINE_ARGS phase_shift, filter2, src, filter_alloc, index, frac, filter1, \ |
| 331 | dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \ |
| 332 | dst_incr_div, src_incr, dst, dst_end, filter_bank |
| 333 | %endif |
| 334 | |
| 335 | neg min_filter_len_x4q |
| 336 | sub filter_bankq, min_filter_len_x4q |
| 337 | sub srcq, min_filter_len_x4q |
| 338 | mov src_stackq, srcq |
| 339 | %else ; x86-32 |
| 340 | cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \ |
| 341 | frac, index, dst, filter_bank |
| 342 | |
| 343 | ; push temp variables to stack |
| 344 | %define ctx_stackq r0mp |
| 345 | %define src_stackq r2mp |
| 346 | %define update_context_stackd r4m |
| 347 | |
| 348 | mov dstq, r1mp |
| 349 | mov r3, r3mp |
| 350 | lea r3, [dstq+r3*%2] |
| 351 | PUSH dword [ctxq+ResampleContext.dst_incr_div] |
| 352 | PUSH r3 |
| 353 | mov r3, dword [ctxq+ResampleContext.filter_alloc] |
| 354 | PUSH dword [ctxq+ResampleContext.dst_incr_mod] |
| 355 | PUSH r3 |
| 356 | shl r3, %3 |
| 357 | PUSH r3 |
| 358 | mov r3, dword [ctxq+ResampleContext.src_incr] |
| 359 | PUSH dword [ctxq+ResampleContext.phase_mask] |
| 360 | PUSH r3d |
| 361 | %ifidn %1, int16 |
| 362 | movd m4, [pd_0x4000] |
| 363 | %else ; float/double |
| 364 | cvtsi2s%4 xm0, r3d |
| 365 | movs%4 xm4, [%5] |
| 366 | divs%4 xm4, xm0 |
| 367 | %endif |
| 368 | mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length] |
| 369 | mov indexd, [ctxq+ResampleContext.index] |
| 370 | shl min_filter_length_x4d, %3 |
| 371 | mov fracd, [ctxq+ResampleContext.frac] |
| 372 | neg min_filter_length_x4q |
| 373 | mov filter_bankq, [ctxq+ResampleContext.filter_bank] |
| 374 | sub r2mp, min_filter_length_x4q |
| 375 | sub filter_bankq, min_filter_length_x4q |
| 376 | PUSH min_filter_length_x4q |
| 377 | PUSH filter_bankq |
| 378 | PUSH dword [ctxq+ResampleContext.phase_shift] |
| 379 | |
| 380 | DEFINE_ARGS filter1, min_filter_count_x4, filter2, frac, index, dst, src |
| 381 | |
| 382 | %define phase_shift_stackd dword [rsp+0x0] |
| 383 | %define filter_bankq dword [rsp+0x4] |
| 384 | %define min_filter_length_x4q dword [rsp+0x8] |
| 385 | %define src_incrd dword [rsp+0xc] |
| 386 | %define phase_mask_stackd dword [rsp+0x10] |
| 387 | %define filter_alloc_x4q dword [rsp+0x14] |
| 388 | %define filter_allocd dword [rsp+0x18] |
| 389 | %define dst_incr_modd dword [rsp+0x1c] |
| 390 | %define dst_endq dword [rsp+0x20] |
| 391 | %define dst_incr_divd dword [rsp+0x24] |
| 392 | |
| 393 | mov srcq, r2mp |
| 394 | %endif |
| 395 | |
| 396 | .loop: |
| 397 | mov filter1d, filter_allocd |
| 398 | imul filter1d, indexd |
| 399 | %if ARCH_X86_64 |
| 400 | mov min_filter_count_x4q, min_filter_len_x4q |
| 401 | lea filter1q, [filter_bankq+filter1q*%2] |
| 402 | lea filter2q, [filter1q+filter_allocq*%2] |
| 403 | %else ; x86-32 |
| 404 | mov min_filter_count_x4q, filter_bankq |
| 405 | lea filter1q, [min_filter_count_x4q+filter1q*%2] |
| 406 | mov min_filter_count_x4q, min_filter_length_x4q |
| 407 | mov filter2q, filter1q |
| 408 | add filter2q, filter_alloc_x4q |
| 409 | %endif |
| 410 | %ifidn %1, int16 |
| 411 | mova m0, m4 |
| 412 | mova m2, m4 |
| 413 | %else ; float/double |
| 414 | xorps m0, m0, m0 |
| 415 | xorps m2, m2, m2 |
| 416 | %endif |
| 417 | |
| 418 | align 16 |
| 419 | .inner_loop: |
| 420 | movu m1, [srcq+min_filter_count_x4q*1] |
| 421 | %ifidn %1, int16 |
| 422 | %if cpuflag(xop) |
| 423 | vpmadcswd m2, m1, [filter2q+min_filter_count_x4q*1], m2 |
| 424 | vpmadcswd m0, m1, [filter1q+min_filter_count_x4q*1], m0 |
| 425 | %else |
| 426 | pmaddwd m3, m1, [filter2q+min_filter_count_x4q*1] |
| 427 | pmaddwd m1, [filter1q+min_filter_count_x4q*1] |
| 428 | paddd m2, m3 |
| 429 | paddd m0, m1 |
| 430 | %endif ; cpuflag |
| 431 | %else ; float/double |
| 432 | %if cpuflag(fma4) || cpuflag(fma3) |
| 433 | fmaddp%4 m2, m1, [filter2q+min_filter_count_x4q*1], m2 |
| 434 | fmaddp%4 m0, m1, [filter1q+min_filter_count_x4q*1], m0 |
| 435 | %else |
| 436 | mulp%4 m3, m1, [filter2q+min_filter_count_x4q*1] |
| 437 | mulp%4 m1, m1, [filter1q+min_filter_count_x4q*1] |
| 438 | addp%4 m2, m2, m3 |
| 439 | addp%4 m0, m0, m1 |
| 440 | %endif ; cpuflag |
| 441 | %endif |
| 442 | add min_filter_count_x4q, mmsize |
| 443 | js .inner_loop |
| 444 | |
| 445 | %ifidn %1, int16 |
| 446 | %if mmsize == 16 |
| 447 | %if cpuflag(xop) |
| 448 | vphadddq m2, m2 |
| 449 | vphadddq m0, m0 |
| 450 | %endif |
| 451 | pshufd m3, m2, q0032 |
| 452 | pshufd m1, m0, q0032 |
| 453 | paddd m2, m3 |
| 454 | paddd m0, m1 |
| 455 | %endif |
| 456 | %if notcpuflag(xop) |
| 457 | PSHUFLW m3, m2, q0032 |
| 458 | PSHUFLW m1, m0, q0032 |
| 459 | paddd m2, m3 |
| 460 | paddd m0, m1 |
| 461 | %endif |
| 462 | psubd m2, m0 |
| 463 | ; This is probably a really bad idea on atom and other machines with a |
| 464 | ; long transfer latency between GPRs and XMMs (atom). However, it does |
| 465 | ; make the clip a lot simpler... |
| 466 | movd eax, m2 |
| 467 | add indexd, dst_incr_divd |
| 468 | imul fracd |
| 469 | idiv src_incrd |
| 470 | movd m1, eax |
| 471 | add fracd, dst_incr_modd |
| 472 | paddd m0, m1 |
| 473 | psrad m0, 15 |
| 474 | packssdw m0, m0 |
| 475 | movd [dstq], m0 |
| 476 | |
| 477 | ; note that for imul/idiv, I need to move filter to edx/eax for each: |
| 478 | ; - 32bit: eax=r0[filter1], edx=r2[filter2] |
| 479 | ; - win64: eax=r6[filter1], edx=r1[todo] |
| 480 | ; - unix64: eax=r6[filter1], edx=r2[todo] |
| 481 | %else ; float/double |
| 482 | ; val += (v2 - val) * (FELEML) frac / c->src_incr; |
| 483 | %if mmsize == 32 |
| 484 | vextractf128 xm1, m0, 0x1 |
| 485 | vextractf128 xm3, m2, 0x1 |
| 486 | addps xm0, xm1 |
| 487 | addps xm2, xm3 |
| 488 | %endif |
| 489 | cvtsi2s%4 xm1, fracd |
| 490 | subp%4 xm2, xm0 |
| 491 | mulp%4 xm1, xm4 |
| 492 | shufp%4 xm1, xm1, q0000 |
| 493 | %if cpuflag(fma4) || cpuflag(fma3) |
| 494 | fmaddp%4 xm0, xm2, xm1, xm0 |
| 495 | %else |
| 496 | mulp%4 xm2, xm1 |
| 497 | addp%4 xm0, xm2 |
| 498 | %endif ; cpuflag |
| 499 | |
| 500 | ; horizontal sum & store |
| 501 | movhlps xm1, xm0 |
| 502 | %ifidn %1, float |
| 503 | addps xm0, xm1 |
| 504 | shufps xm1, xm0, xm0, q0001 |
| 505 | %endif |
| 506 | add fracd, dst_incr_modd |
| 507 | addp%4 xm0, xm1 |
| 508 | add indexd, dst_incr_divd |
| 509 | movs%4 [dstq], xm0 |
| 510 | %endif |
| 511 | cmp fracd, src_incrd |
| 512 | jl .skip |
| 513 | sub fracd, src_incrd |
| 514 | inc indexd |
| 515 | |
| 516 | %if UNIX64 |
| 517 | DEFINE_ARGS filter_alloc, dst, filter2, phase_shift, index, frac, index_incr, \ |
| 518 | dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \ |
| 519 | dst_incr_div, src_incr, src, dst_end, filter_bank |
| 520 | %elif WIN64 |
| 521 | DEFINE_ARGS phase_shift, filter2, src, filter_alloc, index, frac, index_incr, \ |
| 522 | dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \ |
| 523 | dst_incr_div, src_incr, dst, dst_end, filter_bank |
| 524 | %else ; x86-32 |
| 525 | DEFINE_ARGS filter1, phase_shift, index_incr, frac, index, dst, src |
| 526 | %endif |
| 527 | |
| 528 | .skip: |
| 529 | %if ARCH_X86_32 |
| 530 | mov phase_shiftd, phase_shift_stackd |
| 531 | %endif |
| 532 | mov index_incrd, indexd |
| 533 | add dstq, %2 |
| 534 | and indexd, phase_mask_stackd |
| 535 | sar index_incrd, phase_shiftb |
| 536 | lea srcq, [srcq+index_incrq*%2] |
| 537 | cmp dstq, dst_endq |
| 538 | jne .loop |
| 539 | |
| 540 | %if UNIX64 |
| 541 | DEFINE_ARGS ctx, dst, filter2, phase_shift, index, frac, index_incr, \ |
| 542 | dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \ |
| 543 | dst_incr_div, src_incr, src, dst_end, filter_bank |
| 544 | %elif WIN64 |
| 545 | DEFINE_ARGS ctx, filter2, src, phase_shift, index, frac, index_incr, \ |
| 546 | dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \ |
| 547 | dst_incr_div, src_incr, dst, dst_end, filter_bank |
| 548 | %else ; x86-32 |
| 549 | DEFINE_ARGS filter1, ctx, update_context, frac, index, dst, src |
| 550 | %endif |
| 551 | |
| 552 | cmp dword update_context_stackd, 0 |
| 553 | jz .skip_store |
| 554 | ; strictly speaking, the function should always return the consumed |
| 555 | ; number of bytes; however, we only use the value if update_context |
| 556 | ; is true, so let's just leave it uninitialized otherwise |
| 557 | mov ctxq, ctx_stackq |
| 558 | movifnidn rax, srcq |
| 559 | mov [ctxq+ResampleContext.frac ], fracd |
| 560 | sub rax, src_stackq |
| 561 | mov [ctxq+ResampleContext.index], indexd |
| 562 | shr rax, %3 |
| 563 | |
| 564 | .skip_store: |
| 565 | %if ARCH_X86_32 |
| 566 | ADD rsp, 0x28 |
| 567 | %endif |
| 568 | RET |
| 569 | %endmacro |
| 570 | |
| 571 | INIT_XMM sse |
| 572 | RESAMPLE_FNS float, 4, 2, s, pf_1 |
| 573 | |
| 574 | %if HAVE_AVX_EXTERNAL |
| 575 | INIT_YMM avx |
| 576 | RESAMPLE_FNS float, 4, 2, s, pf_1 |
| 577 | %endif |
| 578 | %if HAVE_FMA3_EXTERNAL |
| 579 | INIT_YMM fma3 |
| 580 | RESAMPLE_FNS float, 4, 2, s, pf_1 |
| 581 | %endif |
| 582 | %if HAVE_FMA4_EXTERNAL |
| 583 | INIT_XMM fma4 |
| 584 | RESAMPLE_FNS float, 4, 2, s, pf_1 |
| 585 | %endif |
| 586 | |
| 587 | %if ARCH_X86_32 |
| 588 | INIT_MMX mmxext |
| 589 | RESAMPLE_FNS int16, 2, 1 |
| 590 | %endif |
| 591 | |
| 592 | INIT_XMM sse2 |
| 593 | RESAMPLE_FNS int16, 2, 1 |
| 594 | %if HAVE_XOP_EXTERNAL |
| 595 | INIT_XMM xop |
| 596 | RESAMPLE_FNS int16, 2, 1 |
| 597 | %endif |
| 598 | |
| 599 | INIT_XMM sse2 |
| 600 | RESAMPLE_FNS double, 8, 3, d, pdbl_1 |