Imported Debian version 2.5.0~trusty1.1
[deb_ffmpeg.git] / ffmpeg / libavcodec / x86 / flacdsp.asm
CommitLineData
2ba45a60
DM
1;******************************************************************************
2;* FLAC DSP SIMD optimizations
3;*
4;* Copyright (C) 2014 Loren Merritt
f6fa7814 5;* Copyright (C) 2014 James Almer
2ba45a60
DM
6;*
7;* This file is part of FFmpeg.
8;*
9;* FFmpeg is free software; you can redistribute it and/or
10;* modify it under the terms of the GNU Lesser General Public
11;* License as published by the Free Software Foundation; either
12;* version 2.1 of the License, or (at your option) any later version.
13;*
14;* FFmpeg is distributed in the hope that it will be useful,
15;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17;* Lesser General Public License for more details.
18;*
19;* You should have received a copy of the GNU Lesser General Public
20;* License along with FFmpeg; if not, write to the Free Software
21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22;******************************************************************************
23
24%include "libavutil/x86/x86util.asm"
25
26SECTION .text
27
;------------------------------------------------------------------------------
; flac_lpc_32(decoded, coeffs, pred_order, qlevel, len)
;
; In-place LPC reconstruction using 64-bit accumulation.
; For each sample past the first pred_order ones, the products of the
; pred_order preceding samples with coeffs[0..pred_order-1] are summed in a
; qword, the sum is shifted right by qlevel (psrlq) and its low dword is added
; to the value already stored at that position.
;
; Two consecutive samples are produced per .loop_sample iteration; their dot
; products are interleaved so each loaded sample/coefficient is used twice:
;   m2 = accumulator for the sample at [decodedq]
;   m3 = accumulator for the sample at [decodedq+4]
;   m0 = most recently loaded sample, m1 = most recently loaded coefficient
;   m4 = qlevel (shift count)
;   j  = negative index running up towards zero
;
; %1 = instruction set passed to INIT_XMM (xop / sse4).
; PMACSDQL is provided by the included x86util.asm; its last operand is
; presumably a scratch register (it is always reloaded right after use here).
;------------------------------------------------------------------------------
%macro LPC_32 1
INIT_XMM %1
cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
    sub        lend, pred_orderd            ; len = number of samples to predict
    jle .ret                                ; nothing to do
%if ARCH_X86_64
    ; pred_order is a 32-bit argument: the upper half of its register is not
    ; guaranteed on x86-64, and it is used below for 64-bit address arithmetic.
    movsxd     pred_orderq, pred_orderd
%endif
    lea    decodedq, [decodedq+pred_orderq*4-8] ; -8 compensates the "add decodedq, 8" below
    lea     coeffsq, [coeffsq+pred_orderq*4]    ; one past the last coefficient
    neg    pred_orderq                          ; negative index, counts up to 0
    movd         m4, qlevelm
ALIGN 16
.loop_sample:
    movd         m0, [decodedq+pred_orderq*4+8] ; oldest sample of the window
    add    decodedq, 8                          ; now points at the first of the two outputs
    movd         m1, [coeffsq+pred_orderq*4]    ; coeffs[0]
    pxor         m2, m2
    pxor         m3, m3
    lea          jq, [pred_orderq+1]
    test         jq, jq
    jz .end_order                               ; pred_order == 1: no inner iterations
.loop_order:
    PMACSDQL     m2, m0, m1, m2, m0             ; first output  += sample * coeff
    movd         m0, [decodedq+jq*4]            ; next sample
    PMACSDQL     m3, m1, m0, m3, m1             ; second output += same coeff * next sample
    movd         m1, [coeffsq+jq*4]             ; next coeff
    inc          jq
    jl .loop_order
.end_order:
    PMACSDQL     m2, m0, m1, m2, m0             ; last term of the first output
    psrlq        m2, m4
    movd         m0, [decodedq]
    paddd        m0, m2
    movd [decodedq], m0                         ; m0 = finished first output
    sub        lend, 2
    jl .ret                                     ; odd count: second output not wanted
    ; None of the SIMD instructions below modify EFLAGS, so the final jg still
    ; tests the result of "sub lend, 2".
    PMACSDQL     m3, m1, m0, m3, m1             ; last term uses the sample just written
    psrlq        m3, m4
    movd         m1, [decodedq+4]
    paddd        m1, m3
    movd [decodedq+4], m1
    jg .loop_sample
.ret:
    REP_RET
%endmacro

%if HAVE_XOP_EXTERNAL
LPC_32 xop
%endif
LPC_32 sse4
f6fa7814
DM
76
;----------------------------------------------------------------------------------
; void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
;                                         int len, int shift);
;
; Macro arguments:
;   %1 = name infix of the generated function (ls / rs / ms / indep2)
;   %2 = index of the register that supplies the first word of each output pair
;   %3 = index of the register that supplies the second word of each output pair
;   %4 = add / sub: picks the paddd / psubd that fills m2 (omitted for indep2)
;
; Register use: m0 = 4 dwords from in[0], m1 = 4 dwords from in[1],
;               m2 = derived channel, m3 = shift count.
;----------------------------------------------------------------------------------
%macro FLAC_DECORRELATE_16 3-4
cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
    ; Fetch the shift count (argument index 4); x86-32 also has to load len.
%if ARCH_X86_32
    movd         m3, r4m
    mov        lend, lenm
%elif WIN64
    movd         m3, r4m
%else ; UNIX64
    movd         m3, r4d
%endif

    ; Dereference the pointer arrays. in[1] is read first because in0q still
    ; holds the address of the array itself at that point.
    mov        in1q, [in0q + gprsize]
    mov        in0q, [in0q]
    mov        outq, [outq]

    ; len samples -> len*4 bytes. Every pointer is moved to the end of its
    ; buffer and a negative byte offset is walked up to zero. The same offset
    ; serves the output because each iteration reads 16 bytes per input and
    ; writes 16 bytes.
    shl        lend, 2
    add        in0q, lenq
    add        in1q, lenq
    add        outq, lenq
    neg        lenq

align 16
.next_block:
    mova         m0, [in0q + lenq]
    mova         m1, [in1q + lenq]
%ifidn %1, ms
    psrad        m2, m1, 1          ; m2 = in[1] >> 1 (arithmetic)
    psubd        m0, m2             ; m0 = in[0] - (in[1] >> 1)
%endif
%ifnidn %1, indep2
    p%4d         m2, m0, m1         ; m2 = m0 +/- m1
%endif
    packssdw    m%2, m%2            ; dwords -> signed-saturated words
    packssdw    m%3, m%3
    punpcklwd   m%2, m%3            ; interleave the two channels
    psllw       m%2, m3
    mova [outq + lenq], m%2
    add        lenq, 16
    jl .next_block
    REP_RET
%endmacro

INIT_XMM sse2
FLAC_DECORRELATE_16 ls, 0, 2, sub
FLAC_DECORRELATE_16 rs, 2, 1, add
FLAC_DECORRELATE_16 ms, 2, 0, add
125
;----------------------------------------------------------------------------------
; void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels,
;                                         int len, int shift);
;
; Macro arguments:
;   %1 = name infix of the generated function (ls / rs / ms)
;   %2 = index of the register holding the first dword of each output pair
;   %3 = index of the register holding the second dword of each output pair
;   %4 = index of the scratch register handed to SBUTTERFLY
;   %5 = add / sub: picks the paddd / psubd that fills m2
;
; Register use: m0 = dwords from in[0], m1 = dwords from in[1],
;               m2 = derived channel, m3 = shift count.
;----------------------------------------------------------------------------------
%macro FLAC_DECORRELATE_32 5
cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len
    ; Fetch the shift count (argument index 4); x86-32 also has to load len.
%if ARCH_X86_32
    movd         m3, r4m
    mov        lend, lenm
%elif WIN64
    movd         m3, r4m
%else ; UNIX64
    movd         m3, r4d
%endif

    ; Dereference the pointer arrays (in[1] first, while in0q is still the
    ; array address), then turn in1q into a byte offset relative to in0q so
    ; that only in0q needs to advance inside the loop.
    mov        in1q, [in0q + gprsize]
    mov        in0q, [in0q]
    mov        outq, [outq]
    sub        in1q, in0q

align 16
.next_block:
    mova         m0, [in0q]
    mova         m1, [in0q + in1q]
%ifidn %1, ms
    psrad        m2, m1, 1          ; m2 = in[1] >> 1 (arithmetic)
    psubd        m0, m2             ; m0 = in[0] - (in[1] >> 1)
%endif
    p%5d         m2, m0, m1         ; m2 = m0 +/- m1
    pslld       m%2, m3
    pslld       m%3, m3

    SBUTTERFLY   dq, %2, %3, %4     ; interleave the dwords of the two channels

    mova [outq         ], m%2
    mova [outq + mmsize], m%3

    add        in0q, mmsize
    add        outq, mmsize*2
    sub        lend, mmsize/4       ; mmsize/4 samples per channel consumed
    jg .next_block
    REP_RET
%endmacro

INIT_XMM sse2
FLAC_DECORRELATE_32 ls, 0, 2, 1, sub
FLAC_DECORRELATE_32 rs, 2, 1, 0, add
FLAC_DECORRELATE_32 ms, 2, 0, 1, add
173
174;-----------------------------------------------------------------------------------------
175;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels,
176; int len, int shift);
177;-----------------------------------------------------------------------------------------
; TRANSPOSE8x4D r0, r1, r2, r3, r4, r5, r6, r7, tmp
;   %1-%8 = indices of the eight registers to interleave
;   %9    = index of the scratch register handed to every SBUTTERFLY
; Two butterfly stages (dword granularity, then qword granularity) followed by
; two register renames via SWAP.
%macro TRANSPOSE8x4D 9
    ; stage 1: dword interleave of neighbouring registers
    SBUTTERFLY  dq, %1, %2, %9
    SBUTTERFLY  dq, %3, %4, %9
    SBUTTERFLY  dq, %5, %6, %9
    SBUTTERFLY  dq, %7, %8, %9

    ; stage 2: qword interleave across the stage-1 pairs
    SBUTTERFLY qdq, %1, %3, %9
    SBUTTERFLY qdq, %2, %4, %9
    SBUTTERFLY qdq, %5, %7, %9
    SBUTTERFLY qdq, %6, %8, %9

    ; rename registers only, no instructions are emitted for these
    SWAP %2, %5
    SWAP %4, %7
%endmacro
190
; FLAC_DECORRELATE_INDEP bps, channels, shift_reg, w|d
;   %1 = bps (16 or 32)
;   %2 = channels
;   %3 = last xmm reg used (it holds the shift count)
;   %4 = word/dword (suffix of the psll shift instruction)
;
; Generates flac_decorrelate_indep<channels>_<bps>: reads mmsize bytes from
; every input channel per iteration, interleaves them (after packing to words
; for 16 bps), shifts left by the shift argument and stores REPCOUNT registers
; to the output.
%macro FLAC_DECORRELATE_INDEP 4
%define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels
cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7
    ; Shift count (argument index 4) goes into the last xmm register.
%if ARCH_X86_32 || WIN64
    movd       m%3, r4m
%else ; UNIX64
    movd       m%3, r4d
%endif

    ; On x86-32 len has to come from its argument slot. The 6-channel variant
    ; renames the registers so that all of them serve as pointers and operates
    ; on len directly in memory instead of loading it.
%if ARCH_X86_32
%if %2 == 6
    DEFINE_ARGS out, in0, in1, in2, in3, in4, in5
    %define  lend  dword r3m
%else
    mov       lend, lenm
%endif
%endif

    ; Load in[1] .. in[channels-1] while in0q is still the array address.
%assign %%n 1
%rep %2-1
    mov      in %+ %%n %+ q, [in0q + %%n*gprsize]
%assign %%n %%n+1
%endrep

    mov       in0q, [in0q]
    mov       outq, [outq]

    ; Convert the other channel pointers into byte offsets relative to in0q,
    ; so only in0q needs to advance inside the loop.
%assign %%n 1
%rep %2-1
    sub      in %+ %%n %+ q, in0q
%assign %%n %%n+1
%endrep

align 16
.next_block:
    mova        m0, [in0q]

    ; Load the channels that get a register of their own.
%assign %%n 1
%rep REPCOUNT-1
    mova     m %+ %%n, [in0q + in %+ %%n %+ q]
%assign %%n %%n+1
%endrep

%if %1 == 32

    ; 32 bps: interleave dwords across all channel registers.
%if %2 == 8
    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
%elif %2 == 6
    SBUTTERFLY  dq, 0, 1, 6
    SBUTTERFLY  dq, 2, 3, 6
    SBUTTERFLY  dq, 4, 5, 6

    ; Hand-written shuffle sequence; instruction order must be preserved.
    punpcklqdq  m6, m0, m2
    punpckhqdq  m2, m4
    shufps      m4, m0, 0xe4
    punpcklqdq  m0, m1, m3
    punpckhqdq  m3, m5
    shufps      m5, m1, 0xe4
    SWAP 0,6,1,4,5,3
%elif %2 == 4
    TRANSPOSE4x4D 0, 1, 2, 3, 4
%else ; %2 == 2
    SBUTTERFLY  dq, 0, 1, 2
%endif

%else ; %1 == 16

    ; 16 bps: the upper half of the channels is packed straight from memory
    ; into the registers of the lower half (dwords -> saturated words), then
    ; the words are interleaved.
%if %2 == 8
    packssdw    m0, [in0q + in4q]
    packssdw    m1, [in0q + in5q]
    packssdw    m2, [in0q + in6q]
    packssdw    m3, [in0q + in7q]
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
%elif %2 == 6
    packssdw    m0, [in0q + in3q]
    packssdw    m1, [in0q + in4q]
    packssdw    m2, [in0q + in5q]
    pshufd      m3, m0, q1032
    punpcklwd   m0, m1
    punpckhwd   m1, m2
    punpcklwd   m2, m3

    ; Hand-written shuffle sequence; instruction order must be preserved.
    shufps      m3, m0, m2, q2020
    shufps      m0, m1, q2031
    shufps      m2, m1, q3131
    shufps      m1, m2, m3, q3120
    shufps      m3, m0, q0220
    shufps      m0, m2, q3113
    SWAP 2, 0, 3
%else ; %2 == 4
    packssdw    m0, [in0q + in2q]
    packssdw    m1, [in0q + in3q]
    SBUTTERFLY  wd, 0, 1, 2
    SBUTTERFLY  dq, 0, 1, 2
%endif

%endif

    ; Apply the left shift to every output register ...
%assign %%n 0
%rep REPCOUNT
    psll%4   m %+ %%n, m%3
%assign %%n %%n+1
%endrep

    ; ... then store them back to back.
%assign %%n 0
%rep REPCOUNT
    mova [outq + %%n*mmsize], m %+ %%n
%assign %%n %%n+1
%endrep

    add       in0q, mmsize
    add       outq, mmsize*REPCOUNT
    sub       lend, mmsize/4        ; mmsize/4 samples per channel consumed
    jg .next_block
    REP_RET
%endmacro

INIT_XMM sse2
FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
FLAC_DECORRELATE_INDEP 32, 2, 3, d
FLAC_DECORRELATE_INDEP 16, 4, 3, w
FLAC_DECORRELATE_INDEP 32, 4, 5, d
FLAC_DECORRELATE_INDEP 16, 6, 4, w
FLAC_DECORRELATE_INDEP 32, 6, 7, d
%if ARCH_X86_64
FLAC_DECORRELATE_INDEP 16, 8, 5, w
FLAC_DECORRELATE_INDEP 32, 8, 9, d
%endif

INIT_XMM avx
FLAC_DECORRELATE_INDEP 32, 4, 5, d
FLAC_DECORRELATE_INDEP 32, 6, 7, d
%if ARCH_X86_64
FLAC_DECORRELATE_INDEP 16, 8, 5, w
FLAC_DECORRELATE_INDEP 32, 8, 9, d
%endif
329%endif