; Imported Debian version 2.4.3~trusty1
; [deb_ffmpeg.git] / ffmpeg / libavcodec / x86 / sbrdsp.asm
; (web-viewer residue: commit 2ba45a60, DM)
;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; mask equivalent for multiply by -1.0 1.0
ps_mask    times 2 dd 1<<31, 0
ps_mask2   times 2 dd 0, 1<<31
ps_noise0  times 2 dd  1.0, 0.0
ps_noise2  times 2 dd -1.0, 0.0
; phi_sign patterns for the noise_1/noise_3 cases, indexed by (kx & 1) << 4
ps_noise13 dd  0.0, 1.0, 0.0, -1.0
           dd  0.0,-1.0, 0.0,  1.0
           dd  0.0, 1.0, 0.0, -1.0
cextern sbr_noise_table
cextern ps_neg

SECTION_TEXT

INIT_XMM sse
; float ff_sbr_sum_square_sse(float (*x)[2], int n)
; Sum of squares over 2*n packed floats; scalar result in xmm0 (x86-64)
; or on the x87 stack (x86-32, via the r0m spill below).
cglobal sbr_sum_square, 2, 3, 6
    mov         r2, r1
    xorps       m0, m0
    xorps       m1, m1
    sar         r2, 3               ; r2 = number of 16-float (64-byte) chunks
    jz          .prepare
.loop:
    movu        m2, [r0 +  0]
    movu        m3, [r0 + 16]
    movu        m4, [r0 + 32]
    movu        m5, [r0 + 48]
    mulps       m2, m2
    mulps       m3, m3
    mulps       m4, m4
    mulps       m5, m5
    addps       m0, m2
    addps       m1, m3
    addps       m0, m4
    addps       m1, m5
    add         r0, 64
    dec         r2
    jnz         .loop
.prepare:
    and         r1, 7               ; remaining complex elements (< 8)
    sar         r1, 1               ; process them 2 complex (4 floats) at a time
    jz          .end
; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
    movu        m2, [r0]
    add         r0, 16
    mulps       m2, m2
    dec         r1
    addps       m0, m2
    jnz         .endloop
.end:
    ; horizontal reduction of the two accumulators into one scalar
    addps       m0, m1
    movhlps     m2, m0
    addps       m0, m2
    movss       m1, m0
    shufps      m0, m0, 1
    addss       m0, m1
%if ARCH_X86_64 == 0
    ; x86-32 returns floats on the x87 stack: spill to the stack arg slot
    movss       r0m,  m0
    fld         dword r0m
%endif
    RET

; void ff_sbr_hf_g_filt_sse(float (*Y)[2], const float (*X_high)[40][2],
;                           const float *g_filt, int m_max, intptr_t ixh)
%define STEP  40*4*2                ; stride of one X_high row (40 complex floats)
cglobal sbr_hf_g_filt, 5, 6, 5
    lea         r1, [r1 + 8*r4]     ; offset by ixh elements into X_high
    mov         r5, r3
    and         r3, 0xFC            ; r3 = m_max rounded down to a multiple of 4
    lea         r2, [r2 + r3*4]
    lea         r0, [r0 + r3*8]
    neg         r3                  ; loop upward through negative offsets
    jz          .loop1
.loop4:                             ; 4 output elements per iteration
    movlps      m0, [r2 + 4*r3 + 0]
    movlps      m1, [r2 + 4*r3 + 8]
    movlps      m2, [r1 + 0*STEP]
    movlps      m3, [r1 + 2*STEP]
    movhps      m2, [r1 + 1*STEP]
    movhps      m3, [r1 + 3*STEP]
    unpcklps    m0, m0              ; duplicate each gain for re/im lanes
    unpcklps    m1, m1
    mulps       m0, m2
    mulps       m1, m3
    movu        [r0 + 8*r3 +  0], m0
    movu        [r0 + 8*r3 + 16], m1
    add         r1, 4*STEP
    add         r3, 4
    jnz         .loop4
    and         r5, 3               ; number of single element loops
    jz          .end
.loop1: ; element 0 and 1 can be computed at the same time
    movss       m0, [r2]
    movlps      m2, [r1]
    unpcklps    m0, m0
    mulps       m2, m0
    movlps      [r0], m2
    add         r0, 8
    add         r2, 4
    add         r1, STEP
    dec         r5
    jnz         .loop1
.end:
    RET

; void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
;                        const float alpha0[2], const float alpha1[2],
;                        float bw, int start, int end)
;
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
    ; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
    ; bw arrives on the stack (x86-32) or in a spilled reg slot (Win64)
    movss      bw, BWm
%endif
    movlps     m2, [alpha1q]
    movlps     m1, [alpha0q]
    shufps     bw, bw, 0
    mulps      m2, bw               ; (a1[0] a1[1])*bw
    mulps      m1, bw               ; (a0[0] a0[1])*bw    = (a2 a3)
    mulps      m2, bw               ; (a1[0] a1[1])*bw*bw = (a0 a1)
    mova       m3, m1
    mova       m4, m2

    ; Set pointers
%if ARCH_X86_64 == 0 || WIN64
    ; start and end 6th and 7th args on stack
    mov        r2d, Sm
    mov        r3d, Em
%define  start r2q
%define  end   r3q
%else
; BW does not actually occupy a register, so shift by 1
%define  start BWq
%define  end   Sq
%endif
    sub      start, end             ; neg num of loops
    lea    X_highq, [X_highq + end*2*4]
    lea     X_lowq, [X_lowq  + end*2*4 - 2*2*4]
    shl      start, 3               ; offset from num loops

    mova        m0, [X_lowq + start]
    ; build the complex-multiply constant vectors; the ps_mask xor flips the
    ; sign of the lanes needed for the imaginary cross terms
    shufps      m3, m3, q1111
    shufps      m4, m4, q1111
    xorps       m3, [ps_mask]
    shufps      m1, m1, q0000
    shufps      m2, m2, q0000
    xorps       m4, [ps_mask]
.loop2:
    movu        m7, [X_lowq + start + 8]    ; BbCc
    mova        m6, m0
    mova        m5, m7
    shufps      m0, m0, q2301               ; aAbB (swap re/im within pairs)
    shufps      m7, m7, q2301               ; bBcC
    mulps       m0, m4
    mulps       m7, m3
    mulps       m6, m2
    mulps       m5, m1
    addps       m7, m0
    mova        m0, [X_lowq + start + 16]   ; CcDd (also next iteration's input)
    addps       m7, m0
    addps       m6, m5
    addps       m7, m6
    mova  [X_highq + start], m7
    add      start, 16
    jnz         .loop2
    RET

; void ff_sbr_sum64x5_sse(float *z)
; z[k] += z[k+256] + z[k+512] + z[k+768] + z[k+1024] for k = 0..63,
; processed 8 floats per iteration with two parallel accumulator chains.
cglobal sbr_sum64x5, 1,2,4,z
    lea    r1q, [zq+ 256]           ; end pointer: 64 floats processed
.loop:
    mova    m0, [zq+   0]
    mova    m2, [zq+  16]
    mova    m1, [zq+ 256]
    mova    m3, [zq+ 272]
    addps   m0, [zq+ 512]
    addps   m2, [zq+ 528]
    addps   m1, [zq+ 768]
    addps   m3, [zq+ 784]
    addps   m0, [zq+1024]
    addps   m2, [zq+1040]
    addps   m0, m1
    addps   m2, m3
    mova  [zq], m0
    mova  [zq+16], m2
    add     zq, 32
    cmp     zq, r1q
    jne  .loop
    REP_RET

INIT_XMM sse
; void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z)
; Interleaves -z[63-k] with z[k] into complex pairs, walking z from both ends.
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
    lea              r2q, [zq + (64-4)*4]   ; r2q walks down from z[60]
    mova              m3, [ps_neg]
.loop:
    mova              m1, [zq]
    xorps             m0, m3, [r2q]         ; negate the high-end samples
    shufps            m0, m0, m0, q0123     ; reverse their order
    unpcklps          m2, m0, m1
    unpckhps          m0, m0, m1
    mova  [Wq +  0], m2
    mova  [Wq + 16], m0
    add               Wq, 32
    sub              r2q, 16
    add               zq, 16
    cmp               zq, r2q               ; stop when the pointers meet
    jl               .loop
    REP_RET

INIT_XMM sse
; void ff_sbr_neg_odd_64_sse(float *z)
; Flips the sign bit of every odd-indexed float in z[0..63]
; (ps_mask2 has 1<<31 in the odd lanes only).
cglobal sbr_neg_odd_64, 1,2,4,z
    lea        r1q, [zq+256]
.loop:
    mova        m0, [zq+ 0]
    mova        m1, [zq+16]
    mova        m2, [zq+32]
    mova        m3, [zq+48]
    xorps       m0, [ps_mask2]
    xorps       m1, [ps_mask2]
    xorps       m2, [ps_mask2]
    xorps       m3, [ps_mask2]
    mova  [zq+ 0], m0
    mova  [zq+16], m1
    mova  [zq+32], m2
    mova  [zq+48], m3
    add         zq, 64
    cmp         zq, r1q
    jne         .loop
    REP_RET

; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
; Butterfly deinterleave: v[i] = src0[i] - rev(src1)[i], v[64+i] = src0[i] + rev(src1)[i],
; walking src0 backwards (via cq) and src1 forwards, two vectors per iteration.
%macro SBR_QMF_DEINT_BFLY  0
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
    mov               cq, 64*4-2*mmsize
    lea            vrevq, [vq + 64*4]
.loop:
    mova              m0, [src0q+cq]
    mova              m1, [src1q]
    mova              m4, [src0q+cq+mmsize]
    mova              m5, [src1q+mmsize]
%if cpuflag(sse2)
    pshufd            m2, m0, q0123
    pshufd            m3, m1, q0123
    pshufd            m6, m4, q0123
    pshufd            m7, m5, q0123
%else
    ; SSE1 fallback: shufps needs src==dst for a full reverse
    shufps            m2, m0, m0, q0123
    shufps            m3, m1, m1, q0123
    shufps            m6, m4, m4, q0123
    shufps            m7, m5, m5, q0123
%endif
    addps             m5, m2
    subps             m0, m7
    addps             m1, m6
    subps             m4, m3
    mova       [vrevq], m1
    mova [vrevq+mmsize], m5
    mova        [vq+cq], m0
    mova [vq+cq+mmsize], m4
    add            src1q, 2*mmsize
    add            vrevq, 2*mmsize
    sub               cq, 2*mmsize
    jge            .loop
    REP_RET
%endmacro

INIT_XMM sse
SBR_QMF_DEINT_BFLY

INIT_XMM sse2
SBR_QMF_DEINT_BFLY

INIT_XMM sse2
; void ff_sbr_qmf_pre_shuffle_sse2(float *z)
; Builds z[64..127] from z[0..63]: interleaves z[1..32] (negated, reversed)
; with z[63..32]; finally copies z[0..1] to z[64..65].
cglobal sbr_qmf_pre_shuffle, 1,4,6,z
%define OFFSET  (32*4-2*mmsize)
    mov       r3q, OFFSET
    lea       r1q, [zq + (32+1)*4]  ; reads upward from z[33]
    lea       r2q, [zq + 64*4]      ; output base: z[64]
    mova       m5, [ps_neg]
.loop:
    movu       m0, [r1q]
    movu       m2, [r1q + mmsize]
    movu       m1, [zq + r3q + 4 + mmsize]
    movu       m3, [zq + r3q + 4]

    pxor       m2, m5               ; negate the ascending samples
    pxor       m0, m5
    pshufd     m2, m2, q0123        ; and reverse them
    pshufd     m0, m0, q0123
    SBUTTERFLY dq, 2, 3, 4          ; interleave with the descending samples
    SBUTTERFLY dq, 0, 1, 4
    mova  [r2q + 2*r3q + 0*mmsize], m2
    mova  [r2q + 2*r3q + 1*mmsize], m3
    mova  [r2q + 2*r3q + 2*mmsize], m0
    mova  [r2q + 2*r3q + 3*mmsize], m1
    add       r1q, 2*mmsize
    sub       r3q, 2*mmsize
    jge      .loop
    ; copy the first complex pair of z into the second half
    movq       m2, [zq]
    movq    [r2q], m2
    REP_RET

; In PIC builds sbr_noise_table cannot be addressed absolutely, so reserve one
; extra GPR (NOISE_TABLE) to hold its address; otherwise address it directly.
%ifdef PIC
%define NREGS 1
%if UNIX64
%define NOISE_TABLE r6q ; r5q is m_max
%else
%define NOISE_TABLE r5q
%endif
%else
%define NREGS 0
%define NOISE_TABLE sbr_noise_table
%endif

; Load the phi_sign constant at offset kxq into %1 (PIC-safe), into m0.
%macro LOAD_NST  1
%ifdef PIC
    lea  NOISE_TABLE, [%1]
    mova          m0, [kxq + NOISE_TABLE]
%else
    mova          m0, [kxq + %1]
%endif
%endmacro

INIT_XMM sse2
; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova       m0, [ps_noise0]      ; phi_sign = (1, 0) repeated
    jmp apply_noise_main

; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and       kxq, 1
    shl       kxq, 4                ; select ps_noise13 row by kx parity
    LOAD_NST  ps_noise13
    jmp apply_noise_main

; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova       m0, [ps_noise2]      ; phi_sign = (-1, 0) repeated
    jmp apply_noise_main

; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and       kxq, 1
    shl       kxq, 4
    LOAD_NST  ps_noise13+16         ; second/third rows of ps_noise13

; Shared tail for all four variants: m0 holds phi_sign, then for each m:
;   Y[m] += s_m[m]*phi_sign  if s_m[m] != 0,
;   else Y[m] += q_filt[m] * sbr_noise_table[noise], noise wrapping mod 512.
apply_noise_main:
%if ARCH_X86_64 == 0 || WIN64
    mov       kxd, m_maxm           ; kx is dead now; reuse it as the counter
%define count kxq
%else
%define count m_maxq
%endif
    dec    noiseq                   ; pre-decrement: the loop advances first
    shl     count, 2
%ifdef PIC
    lea NOISE_TABLE, [sbr_noise_table]
%endif
    lea        Yq, [Yq + 2*count]
    add      s_mq, count
    add   q_filtq, count
    shl    noiseq, 3                ; index into complex-float table entries
    pxor       m5, m5
    neg     count                   ; loop upward through negative offsets
.loop:
    mova       m1, [q_filtq + count]
    movu       m3, [noiseq + NOISE_TABLE + 1*mmsize]
    movu       m4, [noiseq + NOISE_TABLE + 2*mmsize]
    add    noiseq, 2*mmsize
    and    noiseq, 0x1ff<<3         ; wrap noise index at 512 entries
    punpckhdq  m2, m1, m1
    punpckldq  m1, m1
    mulps      m1, m3 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
    mulps      m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
    mova       m3, [s_mq + count]
    ; TODO: replace by a vpermd in AVX2
    punpckhdq  m4, m3, m3
    punpckldq  m3, m3
    pcmpeqd    m6, m3, m5 ; m6 == 0
    pcmpeqd    m7, m4, m5 ; m7 == 0
    mulps      m3, m0 ; s_m[m] * phi_sign
    mulps      m4, m0 ; s_m[m] * phi_sign
    pand       m1, m6               ; keep noise term only where s_m[m] == 0
    pand       m2, m7
    movu       m6, [Yq + 2*count]
    movu       m7, [Yq + 2*count + mmsize]
    addps      m3, m1
    addps      m4, m2
    addps      m6, m3
    addps      m7, m4
    movu  [Yq + 2*count], m6
    movu  [Yq + 2*count + mmsize], m7
    add     count, mmsize
    jl .loop
    RET

INIT_XMM sse
; void ff_sbr_qmf_deint_neg_sse(float *v, const float *src)
; Deinterleaves src into v: odd samples (reversed) go to v[0..31],
; even samples (negated) go to v[32..63].
cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
%define COUNT  32*4
%define OFFSET 32*4
    mov        cq, -COUNT
    lea     vrevq, [vq + OFFSET + COUNT]
    add        vq, OFFSET-mmsize    ; vq walks downward
    add      srcq, 2*COUNT
    mova       m3, [ps_neg]
.loop:
    mova       m0, [srcq + 2*cq + 0*mmsize]
    mova       m1, [srcq + 2*cq + 1*mmsize]
    shufps     m2, m0, m1, q2020    ; even lanes of both vectors
    shufps     m1, m0, q1313        ; odd lanes, reversed
    xorps      m2, m3               ; negate the even-lane samples
    mova     [vq], m1
    mova [vrevq + cq], m2
    sub        vq, mmsize
    add        cq, mmsize
    jl        .loop
    REP_RET
