Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | ;****************************************************************************** |
2 | ;* AAC Spectral Band Replication decoding functions | |
3 | ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> | |
4 | ;* | |
5 | ;* This file is part of FFmpeg. | |
6 | ;* | |
7 | ;* FFmpeg is free software; you can redistribute it and/or | |
8 | ;* modify it under the terms of the GNU Lesser General Public | |
9 | ;* License as published by the Free Software Foundation; either | |
10 | ;* version 2.1 of the License, or (at your option) any later version. | |
11 | ;* | |
12 | ;* FFmpeg is distributed in the hope that it will be useful, | |
13 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | ;* Lesser General Public License for more details. | |
16 | ;* | |
17 | ;* You should have received a copy of the GNU Lesser General Public | |
18 | ;* License along with FFmpeg; if not, write to the Free Software | |
19 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | ;****************************************************************************** | |
21 | ||
22 | %include "libavutil/x86/x86util.asm" | |
23 | ||
SECTION_RODATA
; Sign-bit masks: XORing packed floats with one of these negates the selected
; lanes, which is equivalent to multiplying each pair by (-1.0, 1.0) for
; ps_mask or by (1.0, -1.0) for ps_mask2.
ps_mask         times 2 dd 1<<31, 0
ps_mask2        times 2 dd 0, 1<<31
; Per-pair multipliers loaded into m0 by sbr_hf_apply_noise_0 / _2.
; (No trailing comma after the last operand: it would either be rejected or
; emit an extra element and shift every constant that follows.)
ps_noise0       times 2 dd  1.0,  0.0
ps_noise2       times 2 dd -1.0,  0.0
; Three consecutive 16-byte rows. sbr_hf_apply_noise_1 reads row 0 or 1 and
; sbr_hf_apply_noise_3 reads row 1 or 2, the choice depending on kx & 1.
ps_noise13      dd  0.0,  1.0,  0.0, -1.0
                dd  0.0, -1.0,  0.0,  1.0
                dd  0.0,  1.0,  0.0, -1.0
cextern sbr_noise_table
cextern ps_neg
35 | ||
36 | SECTION_TEXT | |
37 | ||
INIT_XMM sse
;------------------------------------------------------------------------------
; sbr_sum_square: sum of the squares of an array of packed floats.
; In:  r0 = pointer to the data (read with unaligned loads)
;      r1 = element count (int); one element is a pair of floats (8 bytes)
; Out: the sum in the low float of m0; on x86-32 it is additionally loaded
;      onto the x87 stack.
; The tail loop consumes two elements per step, so an odd trailing element is
; not added to the sum.
;------------------------------------------------------------------------------
cglobal sbr_sum_square, 2, 3, 6
    ; The count is a 32-bit int: the upper half of its register is undefined
    ; on x86-64, so every operation on it uses the 32-bit view.
    mov        r2d, r1d
    xorps       m0, m0              ; accumulator A
    xorps       m1, m1              ; accumulator B
    sar        r2d, 3               ; number of 8-element (64-byte) chunks
    jz .prepare
.loop:
    movu        m2, [r0 +  0]
    movu        m3, [r0 + 16]
    movu        m4, [r0 + 32]
    movu        m5, [r0 + 48]
    mulps       m2, m2
    mulps       m3, m3
    mulps       m4, m4
    mulps       m5, m5
    addps       m0, m2
    addps       m1, m3
    addps       m0, m4
    addps       m1, m5
    add         r0, 64
    dec        r2d
    jnz .loop
.prepare:
    and        r1d, 7               ; elements left after the 8-wide loop
    sar        r1d, 1               ; handled two at a time (16 bytes)
    jz .end
.endloop:
    movu        m2, [r0]
    add         r0, 16
    mulps       m2, m2
    dec        r1d
    addps       m0, m2              ; addps leaves the flags from dec intact
    jnz .endloop
.end:
    ; horizontal reduction of the two accumulators into lane 0 of m0
    addps       m0, m1
    movhlps     m2, m0
    addps       m0, m2
    movss       m1, m0
    shufps      m0, m0, 1
    addss       m0, m1
%if ARCH_X86_64 == 0
    ; x86-32 returns floats on the x87 stack: bounce through the arg slot
    movss      r0m, m0
    fld  dword r0m
%endif
    RET
85 | ||
%define STEP  40*4*2
;------------------------------------------------------------------------------
; sbr_hf_g_filt: scale float pairs gathered with a fixed stride by per-element
; gains.
; In:  r0 = destination, 8 bytes written per element
;      r1 = source base; offset by 8*r4 bytes on entry, then read every STEP
;           bytes
;      r2 = gains, one float per element
;      r3 = element count (int). The vector loop masks it with 0xFC, so
;           counts of 256 or more are not supported.
;      r4 = start index into the source
; A count of zero returns without touching memory.
;------------------------------------------------------------------------------
cglobal sbr_hf_g_filt, 5, 6, 5
    lea         r1, [r1 + 8*r4]     ; offset by ixh elements into X_high
    ; The count is a 32-bit int whose upper register half is undefined on
    ; x86-64; the 32-bit operations below also zero-extend into r3/r5.
    mov        r5d, r3d
    and        r3d, 0xFC            ; elements handled four at a time
    and        r5d, 3               ; elements left for the single-step loop
    lea         r2, [r2 + r3*4]     ; point past the vectorised part and ...
    lea         r0, [r0 + r3*8]
    neg         r3                  ; ... index it with a negative offset
    jz .tail
.loop4:
    movlps      m0, [r2 + 4*r3 + 0] ; gains 0,1
    movlps      m1, [r2 + 4*r3 + 8] ; gains 2,3
    movlps      m2, [r1 + 0*STEP]
    movlps      m3, [r1 + 2*STEP]
    movhps      m2, [r1 + 1*STEP]
    movhps      m3, [r1 + 3*STEP]
    unpcklps    m0, m0              ; g0 g0 g1 g1
    unpcklps    m1, m1              ; g2 g2 g3 g3
    mulps       m0, m2
    mulps       m1, m3
    movu      [r0 + 8*r3 +  0], m0
    movu      [r0 + 8*r3 + 16], m1
    add         r1, 4*STEP
    add         r3, 4
    jnz .loop4
.tail:
    test       r5d, r5d             ; number of single element loops
    jz .end
.loop1:
    ; both floats of the pair are scaled by the same gain
    movss       m0, [r2]
    movlps      m2, [r1]
    unpcklps    m0, m0
    mulps       m2, m0
    movlps    [r0], m2
    add         r0, 8
    add         r2, 4
    add         r1, STEP
    dec        r5d
    jnz .loop1
.end:
    RET
126 | ||
; void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
;                        const float alpha0[2], const float alpha1[2],
;                        float bw, int start, int end)
;
; Two [2]-float elements are produced per iteration and the loop stops only
; when the running byte offset reaches exactly zero, so end - start must be a
; positive even number. X_low + start and X_high + start are accessed with
; aligned moves.
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
    ; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
    movss       bw, BWm             ; bw is passed on the stack here
%endif
    movlps      m2, [alpha1q]
    movlps      m1, [alpha0q]
    shufps      bw, bw, 0           ; broadcast bw to every lane
    mulps       m2, bw              ; (a1[0] a1[1])*bw
    mulps       m1, bw              ; (a0[0] a0[1])*bw    = (a2 a3)
    mulps       m2, bw              ; (a1[0] a1[1])*bw*bw = (a0 a1)
    mova        m3, m1
    mova        m4, m2

    ; Set pointers
%if ARCH_X86_64 == 0 || WIN64
    ; start and end 6th and 7th args on stack
    ; (32-bit loads zero-extend; alpha0/alpha1 in r2/r3 are dead by now)
    mov        r2d, Sm
    mov        r3d, Em
%define start r2q
%define end   r3q
%else
    ; BW does not actually occupy a register, so shift by 1
    ; start/end are 32-bit ints: the upper halves of their registers are
    ; undefined, so widen them before they take part in address arithmetic.
    movsxd     BWq, BWd
    movsxd      Sq, Sd
%define start BWq
%define end   Sq
%endif
    sub      start, end             ; neg num of loops
    lea    X_highq, [X_highq + end*2*4]
    lea     X_lowq, [X_lowq  + end*2*4 - 2*2*4]
    shl      start, 3               ; offset from num loops

    mova        m0, [X_lowq + start]    ; bw is no longer needed in m0
    shufps      m3, m3, q1111
    shufps      m4, m4, q1111
    xorps       m3, [ps_mask]
    shufps      m1, m1, q0000
    shufps      m2, m2, q0000
    xorps       m4, [ps_mask]
.loop2:
    movu        m7, [X_lowq + start + 8]    ; BbCc
    mova        m6, m0
    mova        m5, m7
    shufps      m0, m0, q2301               ; aAbB
    shufps      m7, m7, q2301               ; bBcC
    mulps       m0, m4
    mulps       m7, m3
    mulps       m6, m2
    mulps       m5, m1
    addps       m7, m0
    mova        m0, [X_lowq + start + 16]   ; CcDd, reused by the next pass
    addps       m7, m0
    addps       m6, m5
    addps       m7, m6
    mova  [X_highq + start], m7
    add      start, 16
    jnz .loop2
    RET
189 | ||
;------------------------------------------------------------------------------
; sbr_sum64x5: fold five consecutive 64-float (256-byte) rows into the first.
; For every float k in 0..63 (byte offsets in brackets):
;   z[k] = ((z[k] + z[k+128]) + z[k+256]) + (z[k+64] + z[k+192])
; In:  z = buffer accessed with aligned moves; updated in place
;------------------------------------------------------------------------------
cglobal sbr_sum64x5, 1,2,4,z
    lea        r1q, [zq + 256]      ; one past the first row: loop bound
.next:
    ; first vector of the pair
    mova        m0, [zq +    0]
    addps       m0, [zq +  512]
    addps       m0, [zq + 1024]
    mova        m1, [zq +  256]
    addps       m1, [zq +  768]
    addps       m0, m1
    ; second vector of the pair
    mova        m2, [zq +   16]
    addps       m2, [zq +  528]
    addps       m2, [zq + 1040]
    mova        m3, [zq +  272]
    addps       m3, [zq +  784]
    addps       m2, m3
    ; write back only after every load of this pass has been issued
    mova  [zq +  0], m0
    mova  [zq + 16], m2
    add         zq, 32
    cmp         zq, r1q
    jne .next
    REP_RET
211 | ||
INIT_XMM sse
;------------------------------------------------------------------------------
; sbr_qmf_post_shuffle
; Walks z forwards from its start and backwards from float 60. Each pass takes
; four floats from the back, XORs them with ps_neg and reverses their order,
; then interleaves them with four floats from the front into W.
; In:  W = destination, 32 bytes written per pass (aligned moves)
;      z = source (aligned moves)
;------------------------------------------------------------------------------
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
    mova        m3, [ps_neg]
    lea        r2q, [zq + (64-4)*4] ; last vector of the 64-float input
.next:
    mova        m0, [r2q]           ; back half, still in memory order
    mova        m1, [zq]            ; front half
    xorps       m0, m3
    shufps      m0, m0, q0123       ; reverse the four floats
    mova        m2, m0
    unpckhps    m0, m1              ; interleave upper pairs
    unpcklps    m2, m1              ; interleave lower pairs
    mova  [Wq +  0], m2
    mova  [Wq + 16], m0
    add         zq, 16
    sub        r2q, 16
    add         Wq, 32
    cmp         zq, r2q             ; stop once the two cursors meet
    jl .next
    REP_RET
230 | ||
INIT_XMM sse
;------------------------------------------------------------------------------
; sbr_neg_odd_64: flip the sign bit of every odd-indexed float in a 64-float
; (256-byte) buffer, in place.
; In:  z = buffer accessed with aligned moves
;------------------------------------------------------------------------------
cglobal sbr_neg_odd_64, 1,2,5,z
    mova        m4, [ps_mask2]      ; sign bit set in lanes 1 and 3 only
    lea        r1q, [zq + 256]      ; end of buffer
.next:
    mova        m0, [zq +  0]
    mova        m1, [zq + 16]
    mova        m2, [zq + 32]
    mova        m3, [zq + 48]
    xorps       m0, m4
    xorps       m1, m4
    xorps       m2, m4
    xorps       m3, m4
    mova  [zq +  0], m0
    mova  [zq + 16], m1
    mova  [zq + 32], m2
    mova  [zq + 48], m3
    add         zq, 64
    cmp         zq, r1q
    jne .next
    REP_RET
251 | ||
; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
;
; src0 is walked downwards from float 56 and src1 upwards from float 0, eight
; floats per pass. With rev() meaning "four floats in reverse order":
;   v[64 + i .. ]     = src1[i .. i+3]   + rev(src0[c+4 .. c+7])
;   v[64 + i + 4 .. ] = src1[i+4 .. i+7] + rev(src0[c .. c+3])
;   v[c .. c+3]       = src0[c .. c+3]   - rev(src1[i+4 .. i+7])
;   v[c+4 .. c+7]     = src0[c+4 .. c+7] - rev(src1[i .. i+3])
; All three buffers are accessed with aligned moves.

; %1 = %2 with its four floats in reverse order
%macro BFLY_REVERSE 2 ; dst, src
%if cpuflag(sse2)
    pshufd      %1, %2, q0123
%else
    shufps      %1, %2, %2, q0123
%endif
%endmacro

%macro SBR_QMF_DEINT_BFLY 0
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
    lea      vrevq, [vq + 64*4]         ; upper half of v, filled upwards
    mov         cq, 64*4-2*mmsize       ; byte offset of the last vector pair
.next:
    mova        m1, [src1q]
    mova        m5, [src1q+mmsize]
    mova        m0, [src0q+cq]
    mova        m4, [src0q+cq+mmsize]
    BFLY_REVERSE m3, m1
    BFLY_REVERSE m7, m5
    BFLY_REVERSE m2, m0
    BFLY_REVERSE m6, m4
    addps       m5, m2
    subps       m0, m7
    addps       m1, m6
    subps       m4, m3
    mova  [vrevq], m1
    mova  [vrevq+mmsize], m5
    mova  [vq+cq], m0
    mova  [vq+cq+mmsize], m4
    add      src1q, 2*mmsize
    add      vrevq, 2*mmsize
    sub         cq, 2*mmsize            ; last pass runs with cq == 0
    jge .next
    REP_RET
%endmacro

INIT_XMM sse
SBR_QMF_DEINT_BFLY

INIT_XMM sse2
SBR_QMF_DEINT_BFLY
293 | ||
INIT_XMM sse2
;------------------------------------------------------------------------------
; sbr_qmf_pre_shuffle
; Each pass reads eight floats going upwards from z[33] and eight going
; downwards from z[32]. The upward ones are XORed with ps_neg and reversed,
; then dword-interleaved with the downward ones and stored in the area
; starting at z[64]. Finally the first two floats of z are copied to z[64].
; In:  z = buffer; sources are read unaligned, destinations written aligned
;------------------------------------------------------------------------------
cglobal sbr_qmf_pre_shuffle, 1,4,6,z
%define OFFSET  (32*4-2*mmsize)
    mova        m5, [ps_neg]
    lea        r2q, [zq + 64*4]         ; output area
    lea        r1q, [zq + (32+1)*4]     ; upward cursor
    mov        r3q, OFFSET              ; downward byte offset
.next:
    movu        m3, [zq + r3q + 4]
    movu        m1, [zq + r3q + 4 + mmsize]
    movu        m2, [r1q + mmsize]
    movu        m0, [r1q]

    pxor        m0, m5
    pshufd      m0, m0, q0123
    pxor        m2, m5
    pshufd      m2, m2, q0123
    ; SBUTTERFLY renames registers at assembly time, so the order of the two
    ; invocations and of the stores below must stay as is.
    SBUTTERFLY  dq, 2, 3, 4
    SBUTTERFLY  dq, 0, 1, 4
    mova  [r2q + 2*r3q + 0*mmsize], m2
    mova  [r2q + 2*r3q + 1*mmsize], m3
    mova  [r2q + 2*r3q + 2*mmsize], m0
    mova  [r2q + 2*r3q + 3*mmsize], m1
    add        r1q, 2*mmsize
    sub        r3q, 2*mmsize            ; last pass runs with r3q == 0
    jge .next
    ; z[64], z[65] = z[0], z[1]
    movq        m2, [zq]
    movq     [r2q], m2
    REP_RET
323 | ||
; In PIC builds the noise table cannot be addressed as [index + symbol], so
; one extra general-purpose register is reserved to hold its address.
%ifdef PIC
%define NREGS 1
%if UNIX64
%define NOISE_TABLE r6q ; r5q is m_max
%else
%define NOISE_TABLE r5q
%endif
%else
%define NREGS 0
%define NOISE_TABLE sbr_noise_table
%endif

; m0 = 16 bytes at [%1 + kxq]. In PIC builds NOISE_TABLE is borrowed as a
; scratch register; apply_noise_main reloads it afterwards.
%macro LOAD_NST 1
%ifdef PIC
    lea  NOISE_TABLE, [%1]
    mova          m0, [kxq + NOISE_TABLE]
%else
    mova          m0, [kxq + %1]
%endif
%endmacro

; The four entry points below only differ in the multiplier vector they place
; in m0 before joining the shared body at apply_noise_main. They all use the
; same cglobal register layout, so the single RET at the end serves them all.

INIT_XMM sse2
; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova       m0, [ps_noise0]
    jmp apply_noise_main

; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and       kxq, 1                ; parity of kx ...
    shl       kxq, 4                ; ... selects one of two 16-byte rows
    LOAD_NST  ps_noise13
    jmp apply_noise_main

; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova       m0, [ps_noise2]
    jmp apply_noise_main

; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and       kxq, 1
    shl       kxq, 4
    LOAD_NST  ps_noise13+16

apply_noise_main:
    ; kx is dead from here on; where m_max lives on the stack its register
    ; is reused for the loop counter.
%if ARCH_X86_64 == 0 || WIN64
    mov       kxd, m_maxm
%define count  kxq
%define countd kxd
%else
%define count  m_maxq
%define countd m_maxd
%endif
%if ARCH_X86_64
    ; noise is a 32-bit int used as a 64-bit table offset below; the upper
    ; half of its register is undefined on entry, and noise - 1 may be -1.
    movsxd noiseq, noised
%endif
    dec    noiseq
    ; 32-bit shift: m_max is an int too, and this zero-extends into count.
    shl    countd, 2                ; m_max floats -> bytes
%ifdef PIC
    lea NOISE_TABLE, [sbr_noise_table]
%endif
    ; point every array at its end and index with a negative offset
    lea        Yq, [Yq + 2*count]
    add      s_mq, count
    add   q_filtq, count
    shl    noiseq, 3                ; 8 bytes per noise table entry
    pxor       m5, m5               ; zero, for the s_m == 0 test
    neg    count
.loop:
    mova       m1, [q_filtq + count]
    movu       m3, [noiseq + NOISE_TABLE + 1*mmsize]
    movu       m4, [noiseq + NOISE_TABLE + 2*mmsize]
    add    noiseq, 2*mmsize
    and    noiseq, 0x1ff<<3         ; wrap around after 512 entries
    punpckhdq  m2, m1, m1           ; duplicate each q_filt value over a pair
    punpckldq  m1, m1
    mulps      m1, m3 ; m1 = q_filt[m] * ff_sbr_noise_table[noise]
    mulps      m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
    mova       m3, [s_mq + count]
    ; TODO: replace by a vpermd in AVX2
    punpckhdq  m4, m3, m3
    punpckldq  m3, m3
    pcmpeqd    m6, m3, m5 ; m6 == 0
    pcmpeqd    m7, m4, m5 ; m7 == 0
    mulps      m3, m0 ; s_m[m] * phi_sign
    mulps      m4, m0 ; s_m[m] * phi_sign
    pand       m1, m6               ; keep the noise term only where s_m == 0
    pand       m2, m7
    movu       m6, [Yq + 2*count]
    movu       m7, [Yq + 2*count + mmsize]
    addps      m3, m1
    addps      m4, m2
    addps      m6, m3
    addps      m7, m4
    movu    [Yq + 2*count], m6
    movu    [Yq + 2*count + mmsize], m7
    add    count, mmsize            ; four s_m / q_filt values per pass
    jl .loop
    RET
426 | ||
INIT_XMM sse
;------------------------------------------------------------------------------
; sbr_qmf_deint_neg: de-interleave 64 floats of src into the two halves of v.
; Each pass reads eight floats of src. The even-indexed four are XORed with
; ps_neg and stored upwards from v[64]; the odd-indexed four are stored in
; reverse order, walking downwards from v[28].
; In:  v   = destination (aligned moves)
;      src = source (aligned moves)
;------------------------------------------------------------------------------
cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
%define COUNT  32*4
%define OFFSET 32*4
    mova        m3, [ps_neg]
    lea      vrevq, [vq + OFFSET + COUNT]   ; end of the ascending half
    add       srcq, 2*COUNT                 ; end of src; indexed with cq < 0
    add         vq, OFFSET-mmsize           ; last vector of the lower half
    mov         cq, -COUNT
.next:
    mova        m1, [srcq + 2*cq + 1*mmsize]
    mova        m0, [srcq + 2*cq + 0*mmsize]
    mova        m2, m0
    shufps      m2, m1, q2020               ; even-indexed floats, in order
    shufps      m1, m0, q1313               ; odd-indexed floats, reversed
    xorps       m2, m3
    mova      [vq], m1
    mova [vrevq + cq], m2
    sub         vq, mmsize
    add         cq, mmsize
    jl .next
    REP_RET