1 ;*****************************************************************************
2 ;* x86-optimized functions for volume filter
3 ;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
5 ;* This file is part of FFmpeg.
7 ;* FFmpeg is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* FFmpeg is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with FFmpeg; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
22 %include "libavutil/x86/x86util.asm"
; Packed SIMD constants. The 128 values equal the rounding term in the
; "(src[i] * volume + 128) >> 8" formula quoted in the function comments.
; 0x3F70000000000000 is the IEEE-754 double 2^-8 = 1.0/256, replicated 4 times.
26 pd_1_256: times 4 dq 0x3F70000000000000
; 0x41DFFFFFFFC00000 is the IEEE-754 double 2147483647.0 (2^31 - 1), replicated 4 times.
27 pd_int32_max: times 4 dq 0x41DFFFFFFFC00000
; Eight 16-bit words, each holding 128.
29 pw_128: times 8 dw 128
; Two 64-bit quadwords, each holding 128.
30 pq_128: times 2 dq 128
34 ;------------------------------------------------------------------------------
35 ; void ff_scale_samples_s16(uint8_t *dst, const uint8_t *src, int len, int volume)
37 ;------------------------------------------------------------------------------
; x86inc cglobal: declares scale_samples_s16 with 4 arguments, 4 general-purpose
; registers and 4 vector registers; the arguments are named dst, src, len, volume.
40 cglobal scale_samples_s16, 4,4,4, dst, src, len, volume
; lenq = len * 2 - mmsize: byte offset of the final mmsize-byte chunk of a
; buffer of len samples at 2 bytes per sample.
; NOTE(review): the loop that consumes lenq is not visible in this excerpt;
; presumably it walks from this offset toward 0 -- confirm.
45 lea lenq, [lend*2-mmsize]
47 ; dst[i] = av_clip_int16((src[i] * volume + 128) >> 8);
61 ;------------------------------------------------------------------------------
62 ; void ff_scale_samples_s32(uint8_t *dst, const uint8_t *src, int len, int volume)
64 ;------------------------------------------------------------------------------
; Parameterless macro that emits a scale_samples_s32 function. It tests
; cpuflag(avx) and uses the CVTDQ2PD alias, so it is presumably instantiated
; once per instruction set -- the instantiations are not visible here.
66 %macro SCALE_SAMPLES_S32 0
; 4 arguments (dst, src, len, volume), 4 general-purpose registers,
; 4 vector registers.
67 cglobal scale_samples_s32, 4,4,4, dst, src, len, volume
; 32-bit x86 with AVX only: broadcast the 32-bit volume value from its memory
; operand (volumem) into every lane of xmm2.
; NOTE(review): the alternative path for other configurations is not visible.
68 %if ARCH_X86_32 && cpuflag(avx)
69 vbroadcastss xmm2, volumem
; m2 = m2 * (1.0/256) on packed doubles.
; NOTE(review): m2 presumably holds volume converted to double at this point;
; the conversion instruction is not visible in this excerpt -- confirm.
75 mulpd m2, m2, [pd_1_256]
; m3 = 2147483647.0 in every double lane (presumably the clipping bound -- confirm).
76 mova m3, [pd_int32_max]
; lenq = len * 4 - mmsize: byte offset of the final mmsize-byte chunk of a
; buffer of len samples at 4 bytes per sample.
77 lea lenq, [lend*4-mmsize]
; Convert packed 32-bit integers from src to packed doubles: m0 from offset
; lenq, m1 from offset lenq + mmsize/2.
79 CVTDQ2PD m0, [srcq+lenq ]
80 CVTDQ2PD m1, [srcq+lenq+mmsize/2]
; Store xmm0 and xmm1 to dst at offsets lenq and lenq + mmsize/2, as full
; aligned 128-bit stores (vmovdqa) or as 64-bit stores (movq).
; NOTE(review): the conditional that selects between the two store forms, and
; the arithmetic between the loads and these stores, are not visible here.
88 vmovdqa [dstq+lenq ], xmm0
89 vmovdqa [dstq+lenq+mmsize/2], xmm1
91 movq [dstq+lenq ], xmm0
92 movq [dstq+lenq+mmsize/2], xmm1
; CVTDQ2PD is the conversion mnemonic used inside SCALE_SAMPLES_S32.
; First definition: the non-VEX cvtdq2pd instruction.
100 %define CVTDQ2PD cvtdq2pd
; When HAVE_AVX_EXTERNAL is set, redefine the alias to the VEX-encoded
; vcvtdq2pd.
; NOTE(review): the macro instantiations and the closing %endif are not
; visible in this excerpt.
102 %if HAVE_AVX_EXTERNAL
103 %define CVTDQ2PD vcvtdq2pd
109 ; NOTE: This is not bit-identical with the C version because it clips to
110 ; [-INT_MAX, INT_MAX] instead of [INT_MIN, INT_MAX]
113 cglobal scale_samples_s32, 4,4,8, dst, src, len, volume
118 lea lenq, [lend*4-mmsize]
120 ; dst[i] = av_clipl_int32((src[i] * volume + 128) >> 8);
131 shufps m2, m0, m1, q3131
132 shufps m0, m0, m1, q2020