; Source: FFmpeg libavcodec/x86/ac3dsp.asm
; (imported from Debian package ffmpeg 2.4.3~trusty1, deb_ffmpeg.git)
;*****************************************************************************
;* x86-optimized AC-3 DSP functions
;* Copyright (c) 2011 Justin Ruggles
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; 16777216.0f (2.0^24) as an IEEE-754 single-precision bit pattern;
; scale factor used in ff_float_to_fixed24()
pf_1_24: times 4 dd 0x4B800000

; used in ff_ac3_compute_mantissa_size()
; ac3_bap_bits is defined in C code (bit cost per bit-allocation pointer value)
cextern ac3_bap_bits
; pmulhuw multipliers: 21846/65536 ~= 1/3 and 32768/65536 = 1/2; the 0 lanes
; zero out columns that need no grouping.
; NOTE(review): lane semantics inferred from the C reference
; ac3_compute_mantissa_size() — confirm against libavcodec/ac3dsp.c.
pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
; bits per mantissa group for the grouped baps
pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7

; used in ff_ac3_extract_exponents()
pd_1:   times 4 dd 1
pd_151: times 4 dd 151

; used in ff_apply_window_int16()
pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0 ; reverse the 8 words of an xmm reg
pd_16384:    times 4 dd 16384                      ; rounding bias: 1 << 14

SECTION .text

;-----------------------------------------------------------------------------
; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
;
; For each coefficient, reduce the exponent in block 0 to the minimum across
; block 0 and the following num_reuse_blocks blocks.  Consecutive blocks of
; exponents are 256 bytes apart in memory.
;-----------------------------------------------------------------------------

%macro AC3_EXPONENT_MIN 0
cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset
    shl  reuse_blksq, 8        ; blocks -> byte offset (256 exponents per block)
    jz .end                    ; nothing to do if num_reuse_blocks == 0
    LOOP_ALIGN
.nextexp:
    mov  offsetq, reuse_blksq  ; start at the last reuse block
    mova m0, [expq+offsetq]    ; running minimum, seeded from the last block
    sub  offsetq, 256          ; step back one block
    LOOP_ALIGN
.nextblk:
    PMINUB m0, [expq+offsetq], m1 ; unsigned byte-wise min with this block
    sub  offsetq, 256
    jae .nextblk               ; until offset wraps below 0 (block 0 included)
    mova [expq], m0            ; write the minimum back into block 0
    add  expq, mmsize          ; advance to the next group of exponents
    sub  expnq, mmsize
    jg .nextexp
.end:
    REP_RET
%endmacro

; Instantiate ac3_exponent_min for each SIMD level.
; The inner loop is only aligned for mmxext/sse2.
%define LOOP_ALIGN
INIT_MMX mmx
AC3_EXPONENT_MIN
%if HAVE_MMXEXT_EXTERNAL
%define LOOP_ALIGN ALIGN 16
INIT_MMX mmxext
AC3_EXPONENT_MIN
%endif
%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXPONENT_MIN
%endif
%undef LOOP_ALIGN

;-----------------------------------------------------------------------------
; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
;
; This function uses 2 different methods to calculate a valid result.
; 1) logical 'or' of abs of each element
;        This is used for ssse3 because of the pabsw instruction.
;        It is also used for mmx because of the lack of min/max instructions.
; 2) calculate min/max for the array, then or(abs(min),abs(max))
;        This is used for mmxext and sse2 because they have pminsw/pmaxsw.
;-----------------------------------------------------------------------------

; logical 'or' of 4 or 8 words in an mmx or xmm register into the low word
%macro OR_WORDS_HORIZ 2 ; src, tmp
%if cpuflag(sse2)
    ; fold high qword into low qword, then high dword, then high word
    movhlps     %2, %1
    por         %1, %2
    pshuflw     %2, %1, q0032
    por         %1, %2
    pshuflw     %2, %1, q0001
    por         %1, %2
%elif cpuflag(mmxext)
    ; pshufw folds the 4 words of the mmx register in two steps
    pshufw      %2, %1, q0032
    por         %1, %2
    pshufw      %2, %1, q0001
    por         %1, %2
%else ; mmx
    ; plain mmx has no word shuffle: shift-and-or instead
    movq        %2, %1
    psrlq       %2, 32
    por         %1, %2
    movq        %2, %1
    psrlq       %2, 16
    por         %1, %2
%endif
%endmacro

%macro AC3_MAX_MSB_ABS_INT16 1 ; method selector: or_abs / min_max
cglobal ac3_max_msb_abs_int16, 2,2,5, src, len
    pxor        m2, m2              ; m2 = running 'or' of abs (or running min)
    pxor        m3, m3              ; m3 = running max (min_max method only)
.loop:
%ifidn %1, min_max
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    pminsw      m2, m0
    pminsw      m2, m1
    pmaxsw      m3, m0
    pmaxsw      m3, m1
%else ; or_abs
%if notcpuflag(ssse3)
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    ABS2        m0, m1, m3, m4      ; abs of both vectors (m3/m4 are scratch)
%else ; ssse3
    ; using memory args is faster for ssse3
    pabsw       m0, [srcq]
    pabsw       m1, [srcq+mmsize]
%endif
    por         m2, m0
    por         m2, m1
%endif
    add         srcq, mmsize*2      ; 2 vectors = mmsize int16 elements/iter
    sub         lend, mmsize        ; len counts int16 elements
    ja          .loop
%ifidn %1, min_max
    ; combine the two running extremes: or(abs(min), abs(max))
    ABS2        m2, m3, m0, m1
    por         m2, m3
%endif
    OR_WORDS_HORIZ m2, m0           ; fold all words into the low word
    movd        eax, m2
    and         eax, 0xFFFF         ; return only the low 16 bits
    RET
%endmacro

INIT_MMX mmx
AC3_MAX_MSB_ABS_INT16 or_abs    ; mmx lacks pminsw/pmaxsw
INIT_MMX mmxext
AC3_MAX_MSB_ABS_INT16 min_max
INIT_XMM sse2
AC3_MAX_MSB_ABS_INT16 min_max
INIT_XMM ssse3
AC3_MAX_MSB_ABS_INT16 or_abs    ; pabsw makes or_abs the cheaper method

;-----------------------------------------------------------------------------
; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
;-----------------------------------------------------------------------------

%macro AC3_SHIFT 3 ; l/r, 16/32, shift instruction
cglobal ac3_%1shift_int%2, 3, 3, 5, src, len, shift
    movd        m0, shiftd          ; shift count in the low dword of m0
.loop:
    ; process 4 vectors per iteration, in place
    mova        m1, [srcq         ]
    mova        m2, [srcq+mmsize  ]
    mova        m3, [srcq+mmsize*2]
    mova        m4, [srcq+mmsize*3]
    %3          m1, m0
    %3          m2, m0
    %3          m3, m0
    %3          m4, m0
    mova        [srcq         ], m1
    mova        [srcq+mmsize  ], m2
    mova        [srcq+mmsize*2], m3
    mova        [srcq+mmsize*3], m4
    add         srcq, mmsize*4
    sub         lend, mmsize*32/%2  ; 4 vectors = mmsize*32/%2 elements of %2 bits
    ja          .loop
.end:
    REP_RET
%endmacro

;-----------------------------------------------------------------------------
; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
;-----------------------------------------------------------------------------

INIT_MMX mmx
AC3_SHIFT l, 16, psllw
INIT_XMM sse2
AC3_SHIFT l, 16, psllw

;-----------------------------------------------------------------------------
; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
;-----------------------------------------------------------------------------

INIT_MMX mmx
AC3_SHIFT r, 32, psrad
INIT_XMM sse2
AC3_SHIFT r, 32, psrad

;-----------------------------------------------------------------------------
; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
;-----------------------------------------------------------------------------

; The 3DNow! version is not bit-identical because pf2id uses truncation rather
; than round-to-nearest.
INIT_MMX 3dnow
cglobal float_to_fixed24, 3, 3, 0, dst, src, len
    movq        m0, [pf_1_24]       ; scale factor 2^24
.loop:
    movq        m1, [srcq   ]
    movq        m2, [srcq+8 ]
    movq        m3, [srcq+16]
    movq        m4, [srcq+24]
    pfmul       m1, m0
    pfmul       m2, m0
    pfmul       m3, m0
    pfmul       m4, m0
    pf2id       m1, m1              ; float -> int32, truncating
    pf2id       m2, m2
    pf2id       m3, m3
    pf2id       m4, m4
    movq        [dstq   ], m1
    movq        [dstq+8 ], m2
    movq        [dstq+16], m3
    movq        [dstq+24], m4
    add         srcq, 32
    add         dstq, 32
    sub         lend, 8             ; 8 floats per iteration
    ja          .loop
    femms                           ; leave 3DNow!/MMX state before returning
    RET

INIT_XMM sse
cglobal float_to_fixed24, 3, 3, 3, dst, src, len
    movaps      m0, [pf_1_24]       ; scale factor 2^24
.loop:
    movaps      m1, [srcq   ]
    movaps      m2, [srcq+16]
    mulps       m1, m0
    mulps       m2, m0
    ; cvtps2pi converts only 2 floats at a time into an mmx register,
    ; so each xmm vector is converted in low/high halves
    cvtps2pi    mm0, m1
    movhlps     m1, m1
    cvtps2pi    mm1, m1
    cvtps2pi    mm2, m2
    movhlps     m2, m2
    cvtps2pi    mm3, m2
    movq        [dstq    ], mm0
    movq        [dstq+ 8], mm1
    movq        [dstq+16], mm2
    movq        [dstq+24], mm3
    add         srcq, 32
    add         dstq, 32
    sub         lend, 8             ; 8 floats per iteration
    ja          .loop
    emms                            ; clear MMX state (cvtps2pi used mmx regs)
    RET

INIT_XMM sse2
cglobal float_to_fixed24, 3, 3, 9, dst, src, len
    movaps      m0, [pf_1_24]       ; scale factor 2^24
.loop:
    movaps      m1, [srcq    ]
    movaps      m2, [srcq+16 ]
    movaps      m3, [srcq+32 ]
    movaps      m4, [srcq+48 ]
%ifdef m8 ; x86-64: xmm8 exists, so unroll twice as far
    movaps      m5, [srcq+64 ]
    movaps      m6, [srcq+80 ]
    movaps      m7, [srcq+96 ]
    movaps      m8, [srcq+112]
%endif
    mulps       m1, m0
    mulps       m2, m0
    mulps       m3, m0
    mulps       m4, m0
%ifdef m8
    mulps       m5, m0
    mulps       m6, m0
    mulps       m7, m0
    mulps       m8, m0
%endif
    cvtps2dq    m1, m1              ; float -> int32, round-to-nearest
    cvtps2dq    m2, m2
    cvtps2dq    m3, m3
    cvtps2dq    m4, m4
%ifdef m8
    cvtps2dq    m5, m5
    cvtps2dq    m6, m6
    cvtps2dq    m7, m7
    cvtps2dq    m8, m8
%endif
    movdqa      [dstq    ], m1
    movdqa      [dstq+16 ], m2
    movdqa      [dstq+32 ], m3
    movdqa      [dstq+48 ], m4
%ifdef m8
    movdqa      [dstq+64 ], m5
    movdqa      [dstq+80 ], m6
    movdqa      [dstq+96 ], m7
    movdqa      [dstq+112], m8
    add         srcq, 128
    add         dstq, 128
    sub         lenq, 32            ; 32 floats per iteration
%else
    add         srcq, 64
    add         dstq, 64
    sub         lenq, 16            ; 16 floats per iteration
%endif
    ja          .loop
    REP_RET

;------------------------------------------------------------------------------
; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
;------------------------------------------------------------------------------

; horizontal sum of the 4 dwords of an xmm register into its low dword
%macro PHADDD4 2 ; xmm src, xmm tmp
    movhlps     %2, %1          ; tmp = [x2 x3 . .]
    paddd       %1, %2          ; [x0+x2 x1+x3 . .]
    pshufd      %2, %1, 0x1     ; tmp low dword = x1+x3
    paddd       %1, %2          ; low dword = x0+x1+x2+x3
%endmacro

INIT_XMM sse2
cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum
    ; mant_cnt is 6 rows of 16 uint16 counts (12 xmm loads of 8 words each).
    ; Sum the 6 rows column-wise into m0/m1, then weight each column by the
    ; per-bap bit cost with pmaddwd against ac3_bap_bits.
    movdqa      m0, [mant_cntq      ]
    movdqa      m1, [mant_cntq+ 1*16]
    paddw       m0, [mant_cntq+ 2*16]
    paddw       m1, [mant_cntq+ 3*16]
    paddw       m0, [mant_cntq+ 4*16]
    paddw       m1, [mant_cntq+ 5*16]
    paddw       m0, [mant_cntq+ 6*16]
    paddw       m1, [mant_cntq+ 7*16]
    paddw       m0, [mant_cntq+ 8*16]
    paddw       m1, [mant_cntq+ 9*16]
    paddw       m0, [mant_cntq+10*16]
    paddw       m1, [mant_cntq+11*16]
    pmaddwd     m0, [ac3_bap_bits   ]
    pmaddwd     m1, [ac3_bap_bits+16]
    paddd       m0, m1
    PHADDD4     m0, m1              ; horizontal sum -> low dword
    movd        sumd, m0            ; partial bit count
    ; Extra bits for the grouped mantissas: gather the count word at column
    ; offset +2 of each of the 6 rows (rows are 32 bytes apart), scale by
    ; pw_bap_mul1 via pmulhuw (approximate divide by group size), then
    ; multiply by bits-per-group (pw_bap_mul2).
    ; NOTE(review): grouping semantics inferred from the C reference
    ; ac3_compute_mantissa_size() — confirm against libavcodec/ac3dsp.c.
    movdqa      m3, [pw_bap_mul1]
    movhpd      m0, [mant_cntq     +2] ; high qword first; low filled below
    movlpd      m0, [mant_cntq+1*32+2]
    movhpd      m1, [mant_cntq+2*32+2]
    movlpd      m1, [mant_cntq+3*32+2]
    movhpd      m2, [mant_cntq+4*32+2]
    movlpd      m2, [mant_cntq+5*32+2]
    pmulhuw     m0, m3
    pmulhuw     m1, m3
    pmulhuw     m2, m3
    paddusw     m0, m1
    paddusw     m0, m2
    pmaddwd     m0, [pw_bap_mul2]
    PHADDD4     m0, m1
    movd        eax, m0
    add         eax, sumd           ; total mantissa bits
    RET

;------------------------------------------------------------------------------
; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
;------------------------------------------------------------------------------

; absolute value of 4 packed signed dwords
%macro PABSD 1-2 ; src/dst, unused
%if cpuflag(ssse3)
    pabsd       %1, %1
%else ; src/dst, tmp
    ; abs(x) = (x ^ mask) - mask, where mask = (x < 0) ? -1 : 0
    pxor        %2, %2
    pcmpgtd     %2, %1          ; tmp = sign mask
    pxor        %1, %2
    psubd       %1, %2
%endif
%endmacro

%macro AC3_EXTRACT_EXPONENTS 0
cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
    ; iterate with a negative index so the counter doubles as the offset
    add         expq, lenq
    lea         coefq, [coefq+4*lenq]
    neg         lenq
    mova        m2, [pd_1]
    mova        m3, [pd_151]
.loop:
    ; move 4 32-bit coefs to xmm0
    mova        m0, [coefq+4*lenq]
    ; absolute value
    PABSD       m0, m1
    ; convert to float and extract exponents
    pslld       m0, 1           ; drop the (now clear) sign bit position
    por         m0, m2          ; force nonzero so the float exponent is valid
    cvtdq2ps    m1, m0
    psrld       m1, 23          ; biased IEEE-754 exponent field
    mova        m0, m3
    psubd       m0, m1          ; exponent = 151 - e (151 = 127 bias + 24)
    ; move the lowest byte in each of 4 dwords to the low dword
    ; NOTE: We cannot just extract the low bytes with pshufb because the dword
    ;       result for 16777215 is -1 due to float inaccuracy. Using packuswb
    ;       clips this to 0, which is the correct exponent.
    packssdw    m0, m0
    packuswb    m0, m0
    movd        [expq+lenq], m0

    add         lenq, 4         ; 4 coefficients per iteration
    jl          .loop
    REP_RET
%endmacro

%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXTRACT_EXPONENTS
%endif
%if HAVE_SSSE3_EXTERNAL
INIT_XMM ssse3
AC3_EXTRACT_EXPONENTS   ; ssse3 version uses pabsd inside PABSD
%endif

;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------

; reverse the order of the 16-bit words in a register
%macro REVERSE_WORDS 1-2 ; dst, [pb_revwords shuffle mask]
%if cpuflag(ssse3) && notcpuflag(atom)
    pshufb  %1, %2              ; one shuffle with the pb_revwords mask
%elif cpuflag(sse2)
    ; reverse within each qword, then swap the two qwords
    pshuflw %1, %1, 0x1B
    pshufhw %1, %1, 0x1B
    pshufd  %1, %1, 0x4E
%elif cpuflag(mmxext)
    pshufw  %1, %1, 0x1B
%endif
%endmacro

; fixed-point 16x16->16 multiply
%macro MUL16FIXED 3
%if cpuflag(ssse3) ; dst, src, unused
; dst = ((dst * src) + (1<<14)) >> 15
    pmulhrsw   %1, %2
%elif cpuflag(mmxext) ; dst, src, temp
; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
    mova    %3, %1
    pmulhw  %1, %2
    pmullw  %3, %2
    psrlw   %3, 15
    psllw   %1, 1
    por     %1, %3
%endif
%endmacro

%macro APPLY_WINDOW_INT16 1 ; %1 = 1 selects the bit-exact variant's name/path
%if %1
cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
%else
cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
%endif
    ; offset starts at len (the 4th argument) and walks forward; offset2
    ; starts at len-mmsize and walks backward.  Each iteration processes one
    ; vector from each half, reusing (reversed) window values for the second.
    lea     offset2q, [offsetq-mmsize]
%if cpuflag(ssse3) && notcpuflag(atom)
    mova      m5, [pb_revwords]     ; word-reversal mask for REVERSE_WORDS
    ALIGN 16
%elif %1
    mova      m5, [pd_16384]        ; rounding bias for the expand path
%endif
.loop:
%if cpuflag(ssse3)
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The ssse3 version is bit-identical.
    mova      m0, [windowq+offset2q]
    mova      m1, [ inputq+offset2q]
    pmulhrsw  m1, m0
    REVERSE_WORDS m0, m5
    pmulhrsw  m0, [ inputq+offsetq ]
    mova  [outputq+offset2q], m1
    mova  [outputq+offsetq ], m0
%elif %1
    ; This version expands 16-bit to 32-bit, multiplies by the window,
    ; adds 16384 for rounding, right shifts 15, then repacks back to words to
    ; save to the output. The window is reversed for the second half.
    mova      m3, [windowq+offset2q]
    mova      m4, [ inputq+offset2q]
    pxor      m0, m0
    punpcklwd m0, m3                ; m0 dwords = window word in the high half
    punpcklwd m1, m4                ; m1 low words are stale, but harmless:
                                    ; pmaddwd pairs them with m0's zero words
    pmaddwd   m0, m1
    paddd     m0, m5
    psrad     m0, 15
    pxor      m2, m2
    punpckhwd m2, m3
    punpckhwd m1, m4
    pmaddwd   m2, m1
    paddd     m2, m5
    psrad     m2, 15
    packssdw  m0, m2
    mova  [outputq+offset2q], m0
    REVERSE_WORDS m3                ; reversed window for the second half
    mova     m4, [ inputq+offsetq]
    pxor      m0, m0
    punpcklwd m0, m3
    punpcklwd m1, m4
    pmaddwd   m0, m1
    paddd     m0, m5
    psrad     m0, 15
    pxor      m2, m2
    punpckhwd m2, m3
    punpckhwd m1, m4
    pmaddwd   m2, m1
    paddd     m2, m5
    psrad     m2, 15
    packssdw  m0, m2
    mova  [outputq+offsetq], m0
%else
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
    ; therefore are not bit-identical to the C version.
    mova      m0, [windowq+offset2q]
    mova      m1, [ inputq+offset2q]
    mova      m2, [ inputq+offsetq ]
    MUL16FIXED m1, m0, m3
    REVERSE_WORDS m0
    MUL16FIXED m2, m0, m3
    mova  [outputq+offset2q], m1
    mova  [outputq+offsetq ], m2
%endif
    add      offsetd, mmsize
    sub      offset2d, mmsize
    jae      .loop                  ; until offset2 wraps below 0
    REP_RET
%endmacro

; non-bit-exact variants (%1 = 0: MUL16FIXED path without rounding)
INIT_MMX mmxext
APPLY_WINDOW_INT16 0
INIT_XMM sse2
APPLY_WINDOW_INT16 0

; bit-exact variants (%1 = 1)
INIT_MMX mmxext
APPLY_WINDOW_INT16 1
INIT_XMM sse2
APPLY_WINDOW_INT16 1
INIT_XMM ssse3
APPLY_WINDOW_INT16 1
INIT_XMM ssse3, atom
APPLY_WINDOW_INT16 1