; Imported Debian version 2.4.3~trusty1
; deb_ffmpeg.git / ffmpeg/libavcodec/x86/dcadsp.asm
1;******************************************************************************
2;* SSE-optimized functions for the DCA decoder
3;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
4;*
5;* This file is part of FFmpeg.
6;*
7;* FFmpeg is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* FFmpeg is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with FFmpeg; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
22%include "libavutil/x86/x86util.asm"
23
SECTION_RODATA
; 0x3D800000 = 1.0f/16.0f as IEEE-754 single precision, replicated to 4 lanes;
; used by decode_hf to fold the /16 of the scale factor into one mulss.
pf_inv16:  times 4 dd 0x3D800000 ; 1/16

SECTION_TEXT
28
; void decode_hf(float dst[DCA_SUBBANDS][8], const int32_t vq_num[DCA_SUBBANDS],
;                const int8_t hf_vq[1024][32], intptr_t vq_offset,
;                int32_t scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end)
;
; High-frequency VQ dequantization: for each subband in [start, end), fetch
; the 32-byte int8 codebook row selected by vq_num[band], sign-extend the
; first 8 entries to float, and scale them by scale[band][0] / 16 into
; dst[band][0..7].
;
; Register reuse is heavy: offsetq first holds vq_offset (folded into srcq),
; is then briefly used to move end*4 onto the stack slot endm, and finally
; serves as the per-iteration codebook index ("DICT") inside the loop.
%macro DECODE_HF 0
cglobal decode_hf, 6,6,5, dst, num, src, offset, scale, start, end
    lea        srcq, [srcq + offsetq]       ; srcq = hf_vq + vq_offset
    shl        startq, 2                    ; index -> int32 byte offset
    mov        offsetd, endm                ; offsetq free from here on
%define DICT offsetq
    shl        offsetq, 2
    mov        endm, offsetq                ; endm = end * 4, loop bound below
.loop:
%if ARCH_X86_64
    ; load the int32 through a GPR so cvtsi2ss uses a plain 32-bit source
    mov        offsetd, [scaleq + 2 * startq]   ; scale[band][0] (stride 8)
    cvtsi2ss   m0, offsetd
%else
    cvtsi2ss   m0, [scaleq + 2 * startq]
%endif
    mov        offsetd, [numq + startq]     ; DICT = vq_num[band]
    mulss      m0, [pf_inv16]               ; scale /= 16
    shl        DICT, 5                      ; * 32 = byte offset of codebook row
    shufps     m0, m0, 0                    ; broadcast scale to all 4 lanes
%if cpuflag(sse2)
%if cpuflag(sse4)
    pmovsxbd   m1, [srcq + DICT + 0]        ; sign-extend 4 x int8 -> int32
    pmovsxbd   m2, [srcq + DICT + 4]
%else
    ; SSE2: widen 8 x int8 to 2 x 4 x int32 by byte/word duplication,
    ; then arithmetic right shift to recover the sign extension
    movq       m1, [srcq + DICT]
    punpcklbw  m1, m1
    mova       m2, m1
    punpcklwd  m1, m1
    punpckhwd  m2, m2
    psrad      m1, 24
    psrad      m2, 24
%endif
    cvtdq2ps   m1, m1
    cvtdq2ps   m2, m2
%else
    ; SSE1 path: integer widening happens in MMX registers, conversion to
    ; float via cvtpi2ps (which only fills the low two lanes of an xmm reg)
    movd       mm0, [srcq + DICT + 0]
    movd       mm1, [srcq + DICT + 4]
    punpcklbw  mm0, mm0
    punpcklbw  mm1, mm1
    movq       mm2, mm0
    movq       mm3, mm1
    punpcklwd  mm0, mm0
    punpcklwd  mm1, mm1
    punpckhwd  mm2, mm2
    punpckhwd  mm3, mm3
    psrad      mm0, 24
    psrad      mm1, 24
    psrad      mm2, 24
    psrad      mm3, 24
    cvtpi2ps   m1, mm0
    cvtpi2ps   m2, mm1
    cvtpi2ps   m3, mm2
    cvtpi2ps   m4, mm3
    shufps     m0, m0, 0                    ; NOTE(review): m0 already broadcast
                                            ; above; looks redundant but harmless
    shufps     m1, m3, q1010                ; merge low halves -> 4 floats
    shufps     m2, m4, q1010
%endif
    mulps      m1, m0
    mulps      m2, m0
    mova       [dstq + 8 * startq +  0], m1 ; dst stride is 8 floats per band
    mova       [dstq + 8 * startq + 16], m2
    add        startq, 4
    cmp        startq, endm
    jl .loop
.end:
%if notcpuflag(sse2)
    emms                                    ; MMX was used: restore x87 state
%endif
    REP_RET
%endmacro
103
; Instantiate decode_hf per SIMD level.  The SSE1 (MMX-widening) variant is
; only useful on x86-32; x86-64 always has at least SSE2.
%if ARCH_X86_32
INIT_XMM sse
DECODE_HF
%endif

INIT_XMM sse2
DECODE_HF

INIT_XMM sse4
DECODE_HF
114
; %1=v0/v1 %2=in1 %3=in2
;
; Inner loop of the LFE interpolation FIR.  Each iteration computes two
; adjacent outputs (va, vb) as dot products of the reversed input vector(s)
; with two coefficient rows, horizontally sums them in parallel, and stores
; the pair with movlps.  With 3 args (the 8-coefficient variant) a second
; mmsize chunk of coefficients is accumulated, using FMA when available.
; "count" is a negative byte offset counting up to 0, so jl loops until done.
; For %1==1 (second half) the roles of va/vb are swapped and the coefficient
; pointer walks backwards through the table.
%macro FIR_LOOP 2-3
.loop%1:
%define va          m1
%define vb          m2
%if %1
%define OFFSET      0
%else
%define OFFSET      NUM_COEF*count
%endif
; for v0, incrementing and for v1, decrementing
    mova        va, [cf0q + OFFSET]
    mova        vb, [cf0q + OFFSET + 4*NUM_COEF]
%if %0 == 3
    mova        m4, [cf0q + OFFSET + mmsize]
    mova        m0, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
%endif
    mulps       va, %2
    mulps       vb, %2
%if %0 == 3
%if cpuflag(fma3)
    fmaddps     va, m4, %3, va
    fmaddps     vb, m0, %3, vb
%else
    mulps       m4, %3
    mulps       m0, %3
    addps       va, m4
    addps       vb, m0
%endif
%endif
    ; va = va1 va2 va3 va4
    ; vb = vb1 vb2 vb3 vb4
%if %1
    SWAP        va, vb
%endif
    ; parallel horizontal sum of va and vb:
    mova        m4, va
    unpcklps    va, vb      ; va3 vb3 va4 vb4
    unpckhps    m4, vb      ; va1 vb1 va2 vb2
    addps       m4, va      ; va1+3 vb1+3 va2+4 vb2+4
    movhlps     vb, m4      ; va1+3 vb1+3
    addps       vb, m4      ; va0..4 vb0..4
    movlps      [outq + count], vb
%if %1
    sub         cf0q, 8*NUM_COEF
%endif
    add         count, 8    ; two float outputs written per iteration
    jl .loop%1
%endmacro
163
; void dca_lfe_fir(float *out, float *in, float *coefs)
;
; LFE channel interpolation FIR.  %1 selects the variant:
;   %1==0: NUM_COEF=8 coefficients per output, NUM_OUT=32 outputs per half
;   %1==1: NUM_COEF=4 coefficients per output, NUM_OUT=64 outputs per half
; The input samples are loaded once into IN1 (and IN2 for the 8-coef case)
; and lane-reversed; the in pointer register is then recycled as the loop
; counter ("count"), so inq is dead after the initial loads.
; Two passes: FIR_LOOP 0 produces the first NUM_OUT outputs walking the
; coefficient table forward, FIR_LOOP 1 the second NUM_OUT walking backward
; (with the inputs re-reversed in between).
%macro DCA_LFE_FIR 1
cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0
%define IN1       m3
%define IN2       m5
%define count     inq
%define NUM_COEF  4*(2-%1)
%define NUM_OUT   32*(%1+1)

    movu     IN1, [inq + 4 - 1*mmsize]   ; last 4 input floats ...
    shufps   IN1, IN1, q0123             ; ... in reversed lane order
%if %1 == 0
    movu     IN2, [inq + 4 - 2*mmsize]   ; preceding 4 floats, also reversed
    shufps   IN2, IN2, q0123
%endif

    mov    count, -4*NUM_OUT             ; negative byte counter -> 0
    add     cf0q, 4*NUM_COEF*NUM_OUT     ; point past the first half's coeffs
    add     outq, 4*NUM_OUT              ; bias out so out+count is in range
    ; compute v0 first
%if %1 == 0
    FIR_LOOP   0, IN1, IN2
%else
    FIR_LOOP   0, IN1
%endif
    shufps   IN1, IN1, q0123             ; restore original sample order
    mov    count, -4*NUM_OUT
    ; cf1 already correctly positioned
    add     outq, 4*NUM_OUT              ; outq now at out2
    sub     cf0q, 8*NUM_COEF
%if %1 == 0
    shufps   IN2, IN2, q0123
    FIR_LOOP   1, IN2, IN1
%else
    FIR_LOOP   1, IN1
%endif
    RET
%endmacro
202
; Instantiate the LFE FIRs: SSE covers both variants; an FMA3 build of the
; 8-coefficient variant is added when the toolchain supports FMA3.
INIT_XMM sse
DCA_LFE_FIR 0
DCA_LFE_FIR 1
%if HAVE_FMA3_EXTERNAL
INIT_XMM fma3
DCA_LFE_FIR 0
%endif
210
; Clear one SIMD register to all-zero bits.
; AVX builds, and pre-SSE2 builds (where pxor cannot target an xmm reg),
; take the float-domain xorps form — x86inc lowers the 3-operand spelling
; to a legacy 2-operand op on non-AVX targets.  Plain SSE2 builds use the
; integer-domain pxor.
%macro SETZERO 1
%if cpuflag(avx) || notcpuflag(sse2)
    xorps       %1, %1, %1
%else
    pxor        %1, %1
%endif
%endmacro
218
; SHUF dst, srcaddr, tmp
; Load one register's worth of floats ending at [srcaddr] region and reverse
; the element order (used for the backwards synth_buf reads).
;   AVX (ymm):  read from srcaddr-16, swap the 128-bit halves with
;               vperm2f128, then reverse within each lane; %3 is scratch.
;   SSE2:       single pshufd straight from memory.
;   SSE1:       load then shufps in place (%3 unused in both xmm paths).
%macro SHUF 3
%if cpuflag(avx)
    mova        %3, [%2 - 16]
    vperm2f128  %1, %3, %3, 1
    vshufps     %1, %1, %1, q0123
%elif cpuflag(sse2)
    pshufd      %1, [%2], q0123
%else
    mova        %1, [%2]
    shufps      %1, %1, q0123
%endif
%endmacro
231
; One accumulation step of the synthesis filter inner loop.  Consumes one
; group of window/synth_buf entries and steps j down by 64 floats; on
; x86-64 a second group at +mmsize is processed in parallel (m7-m12).
; %1 is an extra byte offset into the window table (0 in .loop1, 64*4 in
; .loop2 after the pointers have been rewound).
; Accumulators: m1 = a, m2 = b, m3 = c, m4 = d (m7/m8/m9/m10 on x86-64).
; The trailing sub sets the flags consumed by the caller's jge.
%macro INNER_LOOP 1
    ; reading backwards:  ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
    ;~ a += window[i + j]      * (-synth_buf[15 - i + j])
    ;~ b += window[i + j + 16] * (synth_buf[i + j])
    SHUF          m5,  ptr2 + j + (15 - 3) * 4, m6
    mova          m6, [ptr1 + j]
%if ARCH_X86_64
    SHUF         m11,  ptr2 + j + (15 - 3) * 4 - mmsize, m12
    mova         m12, [ptr1 + j + mmsize]
%endif
%if cpuflag(fma3)
    ; fnmaddps realizes the negation of the "a" term in one instruction
    fmaddps       m2, m6,  [win + %1 + j + 16 * 4], m2
    fnmaddps      m1, m5,  [win + %1 + j], m1
%if ARCH_X86_64
    fmaddps       m8, m12, [win + %1 + j + mmsize + 16 * 4], m8
    fnmaddps      m7, m11, [win + %1 + j + mmsize], m7
%endif
%else ; non-FMA
    mulps         m6, m6,  [win + %1 + j + 16 * 4]
    mulps         m5, m5,  [win + %1 + j]
%if ARCH_X86_64
    mulps        m12, m12, [win + %1 + j + mmsize + 16 * 4]
    mulps        m11, m11, [win + %1 + j + mmsize]
%endif
    addps         m2, m2, m6
    subps         m1, m1, m5            ; subtraction implements the negation
%if ARCH_X86_64
    addps         m8, m8, m12
    subps         m7, m7, m11
%endif
%endif ; cpuflag(fma3)
    ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
    ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
    SHUF          m6,  ptr2 + j + (31 - 3) * 4, m5
    mova          m5, [ptr1 + j + 16 * 4]
%if ARCH_X86_64
    SHUF         m12,  ptr2 + j + (31 - 3) * 4 - mmsize, m11
    mova         m11, [ptr1 + j + mmsize + 16 * 4]
%endif
%if cpuflag(fma3)
    fmaddps       m3, m5,  [win + %1 + j + 32 * 4], m3
    fmaddps       m4, m6,  [win + %1 + j + 48 * 4], m4
%if ARCH_X86_64
    fmaddps       m9, m11, [win + %1 + j + mmsize + 32 * 4], m9
    fmaddps      m10, m12, [win + %1 + j + mmsize + 48 * 4], m10
%endif
%else ; non-FMA
    mulps         m5, m5,  [win + %1 + j + 32 * 4]
    mulps         m6, m6,  [win + %1 + j + 48 * 4]
%if ARCH_X86_64
    mulps        m11, m11, [win + %1 + j + mmsize + 32 * 4]
    mulps        m12, m12, [win + %1 + j + mmsize + 48 * 4]
%endif
    addps         m3, m3, m5
    addps         m4, m4, m6
%if ARCH_X86_64
    addps         m9, m9, m11
    addps        m10, m10, m12
%endif
%endif ; cpuflag(fma3)
    sub            j, 64 * 4            ; flags feed the caller's jge
%endmacro
294
; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
;                                  const float window[512], float out[32],
;                                  intptr_t offset, float scale)
;
; QMF synthesis window/accumulate pass.  For each output position the code
; keeps four accumulators (a, b, c, d), sweeps the windowed products over
; synth_buf in two inner passes (.loop1 up to the offset-dependent limit,
; .loop2 for the remainder), then writes out[] = {a, b} * scale and stores
; {c, d} back into synth_buf2[] for the next call.
;
; Register layout inside .mainloop:
;   m0     = scale, broadcast to all lanes
;   m1-m4  = accumulators a, b, c, d
;   m7-m10 = second accumulator set (x86-64 only: a second vector of output
;            positions at +mmsize is processed per iteration)
;   i      = outer-loop byte offset, counting down — except on AVX/FMA3
;            x86-64, where one ymm iteration covers everything and i is the
;            constant 0 (no outer loop branch is emitted)
;   j      = inner-loop byte offset, stepped down 64 floats per INNER_LOOP
%macro SYNTH_FILTER 0
cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
                              synth_buf, synth_buf2, window, out, off, scale
%define scale m0
%if ARCH_X86_32 || WIN64
%if cpuflag(sse2) && notcpuflag(avx)
    movd       scale, scalem
    SPLATD      m0
%else
    VBROADCASTSS m0, scalem
%endif
; Make sure offset is in a register and not on the stack
%define OFFQ  r4q
%else
    ; UNIX64: scale already arrives in xmm0 per the SysV FP-arg convention
    SPLATD      xmm0
%if cpuflag(avx)
    vinsertf128  m0, m0, xmm0, 1
%endif
%define OFFQ  offq
%endif
    ; first inner-loop limit: OFFQ = ((480 - offset) & -64) * 4 bytes
    mov          r5q, 480
    sub          r5q, offmp
    and          r5q, -64
    shl          r5q, 2
%if ARCH_X86_32 || notcpuflag(avx)
    mov         OFFQ, r5q
%define i        r5q
    mov           i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize ; main loop counter
%else
%define i 0
%define OFFQ  r5q
%endif

%define buf2     synth_buf2q
%if ARCH_X86_32
    mov        buf2, synth_buf2mp
%endif
.mainloop:
    ; m1 = a  m2 = b  m3 = c  m4 = d
    SETZERO m3
    SETZERO m4
    mova          m1, [buf2 + i]
    mova          m2, [buf2 + i + 16 * 4]
%if ARCH_X86_32
%define ptr1     r0q
%define ptr2     r1q
%define win      r2q
%define j        r3q
    mov         win, windowm
    mov        ptr1, synth_bufm
%if ARCH_X86_32 || notcpuflag(avx)
    add         win, i
    add        ptr1, i
%endif
%else ; ARCH_X86_64
%define ptr1     r6q
%define ptr2     r7q ; must be loaded
%define win      r8q
%define j        r9q
    SETZERO m9
    SETZERO m10
    mova          m7, [buf2 + i + mmsize]
    mova          m8, [buf2 + i + mmsize + 16 * 4]
    lea         win, [windowq + i]
    lea        ptr1, [synth_bufq + i]
%endif
    mov        ptr2, synth_bufmp
    ; prepare the inner loop counter
    mov           j, OFFQ
%if ARCH_X86_32 || notcpuflag(avx)
    sub        ptr2, i
%endif
.loop1:
    INNER_LOOP  0
    jge .loop1

    ; second pass over the remaining (448*4 - OFFQ) bytes, if any
    mov           j, 448 * 4
    sub           j, OFFQ
    jz  .end
    sub        ptr1, j
    sub        ptr2, j
    add         win, OFFQ ; now at j-64, so define OFFSET
    sub           j, 64 * 4
.loop2:
    INNER_LOOP  64 * 4
    jge .loop2

.end:
%if ARCH_X86_32
    ; x86-32: buf2 (r1) was clobbered by ptr2 and outq (r3) by j,
    ; so reload both from their stack slots before storing results.
    mov        buf2, synth_buf2m
    mov        outq, outmp
%endif
    ;~ out[i]      = a * scale;
    ;~ out[i + 16] = b * scale;
    mulps         m1, m1, scale
    mulps         m2, m2, scale
%if ARCH_X86_64
    mulps         m7, m7, scale
    mulps         m8, m8, scale
%endif
    ;~ synth_buf2[i]      = c;
    ;~ synth_buf2[i + 16] = d;
    mova  [buf2 + i +  0 * 4], m3
    mova  [buf2 + i + 16 * 4], m4
%if ARCH_X86_64
    mova  [buf2 + i +  0 * 4 + mmsize], m9
    mova  [buf2 + i + 16 * 4 + mmsize], m10
%endif
    ;~ out[i]      = a;
    ;~ out[i + 16] = b;
    mova  [outq + i +  0 * 4], m1
    mova  [outq + i + 16 * 4], m2
%if ARCH_X86_64
    mova  [outq + i +  0 * 4 + mmsize], m7
    mova  [outq + i + 16 * 4 + mmsize], m8
%endif
%if ARCH_X86_32 || notcpuflag(avx)
    sub           i, (ARCH_X86_64 + 1) * mmsize
    jge .mainloop
%endif
    RET
%endmacro
421
; Instantiate synth_filter_inner per SIMD level: xmm SSE (x86-32 only) and
; SSE2, plus ymm AVX and FMA3 builds.
%if ARCH_X86_32
INIT_XMM sse
SYNTH_FILTER
%endif
INIT_XMM sse2
SYNTH_FILTER
INIT_YMM avx
SYNTH_FILTER
INIT_YMM fma3
SYNTH_FILTER