1 ;******************************************************************************
2 ;* FLAC DSP SIMD optimizations
4 ;* Copyright (C) 2014 Loren Merritt
5 ;* Copyright (C) 2014 James Almer
7 ;* This file is part of FFmpeg.
9 ;* FFmpeg is free software; you can redistribute it and/or
10 ;* modify it under the terms of the GNU Lesser General Public
11 ;* License as published by the Free Software Foundation; either
12 ;* version 2.1 of the License, or (at your option) any later version.
14 ;* FFmpeg is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 ;* Lesser General Public License for more details.
19 ;* You should have received a copy of the GNU Lesser General Public
20 ;* License along with FFmpeg; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 ;******************************************************************************
24 %include "libavutil/x86/x86util.asm"
; flac_lpc_32: entry point declared through the cglobal helper with the named
; operands decoded, coeffs, pred_order, qlevel and len, plus one extra scratch
; register named j.  The "5,6,5" triple follows the usual x86inc order of
; (arguments loaded, GPRs used, vector registers used) -- confirm against
; x86inc.asm.
; NOTE(review): this view of the routine is not contiguous.  Loop labels,
; branches, accumulator clearing and the final store are not shown, so only
; the visible instructions are described below.
30 cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
; Advance both base pointers by pred_order 32-bit elements; decoded is
; additionally biased by -8 bytes.
33 lea decodedq, [decodedq+pred_orderq*4-8]
34 lea coeffsq, [coeffsq+pred_orderq*4]
; Load one dword from each array into the low lane of m0 / m1.
; NOTE(review): indexing by pred_orderq again after the lea pair above
; suggests pred_orderq is modified (presumably negated) by a line that is not
; visible here -- TODO confirm.
39 movd m0, [decodedq+pred_orderq*4+8]
41 movd m1, [coeffsq+pred_orderq*4]
; j = pred_order + 1; used below as the element index into both arrays.
44 lea jq, [pred_orderq+1]
; PMACSDQL is defined outside this excerpt.  From the operand pattern it looks
; like a multiply-accumulate of operands 2 and 3 into operand 1 (operand 4 is
; the addend, operand 5 a scratch register) -- TODO confirm.
; The invocations alternate between two accumulators, m2 and m3, while m0 and
; m1 are reloaded in between with the next decoded sample and coefficient.
48 PMACSDQL m2, m0, m1, m2, m0
49 movd m0, [decodedq+jq*4]
50 PMACSDQL m3, m1, m0, m3, m1
51 movd m1, [coeffsq+jq*4]
; Two further accumulation steps; the instructions between them are not shown.
55 PMACSDQL m2, m0, m1, m2, m0
62 PMACSDQL m3, m1, m0, m3, m1
77 ;----------------------------------------------------------------------------------
78 ;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
79 ; int len, int shift);
80 ;----------------------------------------------------------------------------------
; FLAC_DECORRELATE_16: macro taking 3 or 4 parameters that emits one function
; named flac_decorrelate_<%1>_16.
;   %1 = name infix of the generated function
;   %2 = index of the vector register that is stored to out below
;   %3, %4 = not referenced by any line visible in this excerpt
; NOTE(review): parts of the macro body (conditional bodies, loop control,
; %endmacro) are not visible here.
81 %macro FLAC_DECORRELATE_16 3-4
; Named operands: out, in0, in1, len.  Per the usual x86inc order the "2, 4, 4"
; triple is (arguments loaded, GPRs used, vector registers used) -- confirm.
82 cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
; Special-case for 32-bit x86 and Win64 builds; the guarded lines and the
; closing directive are not shown.
83 %if ARCH_X86_32 || WIN64
; Fetch the second pointer-sized entry of the array in0q points at.
92 mov in1q, [in0q + gprsize]
; Aligned vector loads from both inputs at byte offset lenq.
102 mova m0, [in0q + lenq]
103 mova m1, [in1q + lenq]
; Aligned store of register m<%2> to the output at the same offset.
115 mova [outq + lenq], m%2
; Instantiate the 16-bit macro three times, producing
; flac_decorrelate_ls_16, flac_decorrelate_rs_16 and flac_decorrelate_ms_16.
; The numeric arguments select register indices inside the macro and the last
; argument names an arithmetic mnemonic (sub/add); how %3 and %4 are consumed
; is not visible in this excerpt.
122 FLAC_DECORRELATE_16 ls, 0, 2, sub
123 FLAC_DECORRELATE_16 rs, 2, 1, add
124 FLAC_DECORRELATE_16 ms, 2, 0, add
126 ;----------------------------------------------------------------------------------
127 ;void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels,
128 ; int len, int shift);
129 ;----------------------------------------------------------------------------------
; FLAC_DECORRELATE_32: macro taking 5 parameters that emits one function named
; flac_decorrelate_<%1>_32.
;   %1 = name infix of the generated function
;   %2, %3 = register indices interleaved by SBUTTERFLY below
;   %4 = register index passed to SBUTTERFLY as its last (scratch) operand
;   %5 = not referenced by any line visible in this excerpt
; NOTE(review): parts of the macro body (conditional bodies, loop control,
; %endmacro) are not visible here.
130 %macro FLAC_DECORRELATE_32 5
; Named operands: out, in0, in1, len.
131 cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len
; Special-case for 32-bit x86 and Win64 builds; the guarded lines are not shown.
132 %if ARCH_X86_32 || WIN64
; Fetch the second pointer-sized entry of the array in0q points at.
140 mov in1q, [in0q + gprsize]
; in1q is used as an offset from in0q here, so it has presumably been made
; relative to in0q by a line that is not visible -- TODO confirm.
148 mova m1, [in0q + in1q]
; Interleave the dwords of m<%2> and m<%3>, using m<%4> as scratch.
157 SBUTTERFLY dq, %2, %3, %4
; Store m<%3> as the second vector of the output.
160 mova [outq + mmsize], m%3
; Instantiate the 32-bit macro three times, producing
; flac_decorrelate_ls_32, flac_decorrelate_rs_32 and flac_decorrelate_ms_32.
; Arguments 2-4 are register indices (see SBUTTERFLY use inside the macro);
; the last argument names an arithmetic mnemonic whose use is not visible in
; this excerpt.
170 FLAC_DECORRELATE_32 ls, 0, 2, 1, sub
171 FLAC_DECORRELATE_32 rs, 2, 1, 0, add
172 FLAC_DECORRELATE_32 ms, 2, 0, 1, add
174 ;-----------------------------------------------------------------------------------------
175 ;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels,
176 ; int len, int shift);
177 ;-----------------------------------------------------------------------------------------
; TRANSPOSE8x4D: macro taking 9 register indices.
;   %1-%8 = the eight vector registers that are rearranged in place
;   %9    = register handed to every SBUTTERFLY as its last (scratch) operand
; Implemented as two passes of SBUTTERFLY (a helper not defined in this file
; excerpt; presumably provided by the included x86util.asm).
178 %macro TRANSPOSE8x4D 9
; Pass 1: dword interleave of adjacent register pairs (1,2) (3,4) (5,6) (7,8).
179 SBUTTERFLY dq, %1, %2, %9
180 SBUTTERFLY dq, %3, %4, %9
181 SBUTTERFLY dq, %5, %6, %9
182 SBUTTERFLY dq, %7, %8, %9
; Pass 2: qword interleave of registers two apart: (1,3) (2,4) (5,7) (6,8).
183 SBUTTERFLY qdq, %1, %3, %9
184 SBUTTERFLY qdq, %2, %4, %9
185 SBUTTERFLY qdq, %5, %7, %9
186 SBUTTERFLY qdq, %6, %8, %9
; FLAC_DECORRELATE_INDEP: emits one function named flac_decorrelate_indep<%2>_<%1>.
; Going by that name pattern and the indep<ch>_<bps> prototype banner above:
;   %1 = bits per sample (16 or 32 in the invocations in this file)
;   %2 = number of channels
; NOTE(review): the conditional directives that choose between the per-channel
; count variants below, the repeat directives that drive the macro-local
; counter i, the loop control and %endmacro are not visible in this excerpt.
; The variant labels in the comments below are inferred from the register and
; pointer indices each group touches -- TODO confirm against the full file.
193 ;%3 = last xmm reg used
194 ;%4 = word/dword (shift instruction)
195 %macro FLAC_DECORRELATE_INDEP 4
; REPCOUNT is used at the end of the macro as the number of vector-sized
; chunks by which the output pointer advances.
196 %define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels
; Named operands: out, in0..in7 and len; %2+2 GPRs and %3+1 vector registers
; are requested.
197 cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7
; Alternative register naming without a len register; len is instead read
; through r3m, the memory form of the fourth argument.  The condition under
; which this applies is not shown.
201 DEFINE_ARGS out, in0, in1, in2, in3, in4, in5
202 %define lend dword r3m
; Load the i-th pointer-sized entry of the array in0q points at into in<i>.
214 mov in %+ %%i %+ q, [in0q+%%i*gprsize]
; Make in<i> relative to in0q so [in0q + in<i>q] addresses that input.
223 sub in %+ %%i %+ q, in0q
; Aligned vector load from input i into m<i>.
233 mova m %+ %%i, [in0q + in %+ %%i %+ q]
; Uses eight vectors (m0-m7) with m8 as scratch: appears to be the 8-channel
; 32-bit path.
240 TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
; Uses six vectors (m0-m5) with m6 as scratch, followed by qword merges:
; appears to be the 6-channel 32-bit path.
242 SBUTTERFLY dq, 0, 1, 6
243 SBUTTERFLY dq, 2, 3, 6
244 SBUTTERFLY dq, 4, 5, 6
246 punpcklqdq m6, m0, m2
249 punpcklqdq m0, m1, m3
; Four vectors with m4 as scratch: appears to be the 4-channel 32-bit path.
254 TRANSPOSE4x4D 0, 1, 2, 3, 4
; Two vectors with m2 as scratch: appears to be the 2-channel 32-bit path.
256 SBUTTERFLY dq, 0, 1, 2
; Pack dwords to signed-saturated words, pairing m0-m3 with inputs 4-7, then
; transpose words: appears to be the 8-channel 16-bit path.
262 packssdw m0, [in0q + in4q]
263 packssdw m1, [in0q + in5q]
264 packssdw m2, [in0q + in6q]
265 packssdw m3, [in0q + in7q]
266 TRANSPOSE2x4x4W 0, 1, 2, 3, 4
; Pack m0-m2 with inputs 3-5, then shuffle: appears to be the 6-channel
; 16-bit path.
268 packssdw m0, [in0q + in3q]
269 packssdw m1, [in0q + in4q]
270 packssdw m2, [in0q + in5q]
276 shufps m3, m0, m2, q2020
279 shufps m1, m2, m3, q3120
; Pack m0-m1 with inputs 2-3, then interleave words and dwords: appears to be
; the 4-channel 16-bit path.
284 packssdw m0, [in0q + in2q]
285 packssdw m1, [in0q + in3q]
286 SBUTTERFLY wd, 0, 1, 2
287 SBUTTERFLY dq, 0, 1, 2
; Aligned store of m<i> to the i-th vector-sized slot of the output.
300 mova [outq + %%i*mmsize], m %+ %%i
; Advance the output pointer past the REPCOUNT vectors written.
305 add outq, mmsize*REPCOUNT
; Instantiations of the independent-channel functions.
; FLAC_DECORRELATE_INDEP arguments are (bps, channels, last xmm reg, w/d), so
; each line emits flac_decorrelate_indep<channels>_<bps>.
312 FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
313 FLAC_DECORRELATE_INDEP 32, 2, 3, d
314 FLAC_DECORRELATE_INDEP 16, 4, 3, w
315 FLAC_DECORRELATE_INDEP 32, 4, 5, d
316 FLAC_DECORRELATE_INDEP 16, 6, 4, w
317 FLAC_DECORRELATE_INDEP 32, 6, 7, d
319 FLAC_DECORRELATE_INDEP 16, 8, 5, w
320 FLAC_DECORRELATE_INDEP 32, 8, 9, d
; NOTE(review): the four invocations below repeat argument lists already used
; above.  The prototype banner names the functions with an <opt> suffix, so
; these are presumably built for a second instruction set selected by a line
; not visible in this excerpt (the embedded line numbers skip between the two
; groups) -- TODO confirm; without such a selector they would redefine the
; same symbols.
324 FLAC_DECORRELATE_INDEP 32, 4, 5, d
325 FLAC_DECORRELATE_INDEP 32, 6, 7, d
327 FLAC_DECORRELATE_INDEP 16, 8, 5, w
328 FLAC_DECORRELATE_INDEP 32, 8, 9, d