;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"
%include "util.asm"

SECTION_RODATA 32

pf_s32_inv_scale: times 8 dd 0x30000000
pf_s32_scale:     times 8 dd 0x4f000000
pf_s32_clip:      times 8 dd 0x4effffff
pf_s16_inv_scale: times 4 dd 0x38000000
pf_s16_scale:     times 4 dd 0x47000000
pb_shuf_unpack_even:   db -1, -1,  0,  1, -1, -1,  2,  3, -1, -1,  8,  9, -1, -1, 10, 11
pb_shuf_unpack_odd:    db -1, -1,  4,  5, -1, -1,  6,  7, -1, -1, 12, 13, -1, -1, 14, 15
pb_interleave_words:   SHUFFLE_MASK_W 0, 4, 1, 5, 2, 6, 3, 7
pb_deinterleave_words: SHUFFLE_MASK_W 0, 2, 4, 6, 1, 3, 5, 7
pw_zero_even: times 4 dw 0x0000, 0xffff

SECTION_TEXT

;------------------------------------------------------------------------------
; void ff_conv_s16_to_s32(int32_t *dst, const int16_t *src, int len);
;------------------------------------------------------------------------------
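; Semantics in scalar C (a reference sketch only, not FFmpeg's C fallback):
;     for (int i = 0; i < len; i++)
;         dst[i] = src[i] * 65536;    /* widen into the top 16 bits */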

INIT_XMM sse2
cglobal conv_s16_to_s32, 3,3,3, dst, src, len
    lea       lenq, [2*lend]
    lea       dstq, [dstq+2*lenq]
    add       srcq, lenq
    neg       lenq
.loop:
    mova      m2, [srcq+lenq]
    pxor      m0, m0
    pxor      m1, m1
    punpcklwd m0, m2
    punpckhwd m1, m2
    mova      [dstq+2*lenq       ], m0
    mova      [dstq+2*lenq+mmsize], m1
    add       lenq, mmsize
    jl .loop
    REP_RET

;------------------------------------------------------------------------------
; void ff_conv_s16_to_flt(float *dst, const int16_t *src, int len);
;------------------------------------------------------------------------------
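; pf_s16_inv_scale is 2^-15 as a float, so samples map into [-1.0, 1.0).
; Scalar sketch (reference only):
;     for (int i = 0; i < len; i++)
;         dst[i] = src[i] * (1.0f / 32768.0f);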

%macro CONV_S16_TO_FLT 0
cglobal conv_s16_to_flt, 3,3,3, dst, src, len
    lea       lenq, [2*lend]
    add       srcq, lenq
    lea       dstq, [dstq + 2*lenq]
    neg       lenq
    mova      m2, [pf_s16_inv_scale]
    ALIGN 16
.loop:
    mova      m0, [srcq+lenq]
    S16_TO_S32_SX 0, 1
    cvtdq2ps  m0, m0
    cvtdq2ps  m1, m1
    mulps     m0, m2
    mulps     m1, m2
    mova      [dstq+2*lenq       ], m0
    mova      [dstq+2*lenq+mmsize], m1
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_FLT
INIT_XMM sse4
CONV_S16_TO_FLT

;------------------------------------------------------------------------------
; void ff_conv_s32_to_s16(int16_t *dst, const int32_t *src, int len);
;------------------------------------------------------------------------------
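; The psrad/packssdw pair keeps the top 16 bits of each sample.
; Scalar sketch (reference only):
;     for (int i = 0; i < len; i++)
;         dst[i] = src[i] >> 16;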

%macro CONV_S32_TO_S16 0
cglobal conv_s32_to_s16, 3,3,4, dst, src, len
    lea       lenq, [2*lend]
    lea       srcq, [srcq+2*lenq]
    add       dstq, lenq
    neg       lenq
.loop:
    mova      m0, [srcq+2*lenq         ]
    mova      m1, [srcq+2*lenq+  mmsize]
    mova      m2, [srcq+2*lenq+2*mmsize]
    mova      m3, [srcq+2*lenq+3*mmsize]
    psrad     m0, 16
    psrad     m1, 16
    psrad     m2, 16
    psrad     m3, 16
    packssdw  m0, m1
    packssdw  m2, m3
    mova      [dstq+lenq       ], m0
    mova      [dstq+lenq+mmsize], m2
    add       lenq, mmsize*2
    jl .loop
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX mmx
CONV_S32_TO_S16
INIT_XMM sse2
CONV_S32_TO_S16

;------------------------------------------------------------------------------
; void ff_conv_s32_to_flt(float *dst, const int32_t *src, int len);
;------------------------------------------------------------------------------
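; pf_s32_inv_scale is 2^-31 as a float, mapping s32 into [-1.0, 1.0].
; Scalar sketch (reference only):
;     for (int i = 0; i < len; i++)
;         dst[i] = src[i] * (1.0f / 2147483648.0f);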

%macro CONV_S32_TO_FLT 0
cglobal conv_s32_to_flt, 3,3,3, dst, src, len
    lea       lenq, [4*lend]
    add       srcq, lenq
    add       dstq, lenq
    neg       lenq
    mova      m0, [pf_s32_inv_scale]
    ALIGN 16
.loop:
    cvtdq2ps  m1, [srcq+lenq       ]
    cvtdq2ps  m2, [srcq+lenq+mmsize]
    mulps     m1, m1, m0
    mulps     m2, m2, m0
    mova      [dstq+lenq       ], m1
    mova      [dstq+lenq+mmsize], m2
    add       lenq, mmsize*2
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S32_TO_FLT
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
CONV_S32_TO_FLT
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16(int16_t *dst, const float *src, int len);
;------------------------------------------------------------------------------
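; Scale by 2^15 (pf_s16_scale), round, and let packssdw saturate to s16.
; Scalar sketch (reference only; cvtps2dq rounds to nearest-even):
;     long v = lrintf(src[i] * 32768.0f);
;     dst[i] = v < -32768 ? -32768 : v > 32767 ? 32767 : v;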

INIT_XMM sse2
cglobal conv_flt_to_s16, 3,3,5, dst, src, len
    lea       lenq, [2*lend]
    lea       srcq, [srcq+2*lenq]
    add       dstq, lenq
    neg       lenq
    mova      m4, [pf_s16_scale]
.loop:
    mova      m0, [srcq+2*lenq         ]
    mova      m1, [srcq+2*lenq+1*mmsize]
    mova      m2, [srcq+2*lenq+2*mmsize]
    mova      m3, [srcq+2*lenq+3*mmsize]
    mulps     m0, m4
    mulps     m1, m4
    mulps     m2, m4
    mulps     m3, m4
    cvtps2dq  m0, m0
    cvtps2dq  m1, m1
    cvtps2dq  m2, m2
    cvtps2dq  m3, m3
    packssdw  m0, m1
    packssdw  m2, m3
    mova      [dstq+lenq       ], m0
    mova      [dstq+lenq+mmsize], m2
    add       lenq, mmsize*2
    jl .loop
    REP_RET

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s32(int32_t *dst, const float *src, int len);
;------------------------------------------------------------------------------
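; pf_s32_scale is 2^31; pf_s32_clip (2147483520.0f) is the largest float
; below 2^31, so minps keeps positive values from overflowing in cvtps2dq
; (negative overflow already saturates to INT32_MIN). Scalar sketch:
;     float v = src[i] * 2147483648.0f;
;     dst[i] = (int32_t)lrintf(v > 2147483520.0f ? 2147483520.0f : v);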

%macro CONV_FLT_TO_S32 0
cglobal conv_flt_to_s32, 3,3,6, dst, src, len
    lea       lenq, [lend*4]
    add       srcq, lenq
    add       dstq, lenq
    neg       lenq
    mova      m4, [pf_s32_scale]
    mova      m5, [pf_s32_clip]
.loop:
    mulps     m0, m4, [srcq+lenq         ]
    mulps     m1, m4, [srcq+lenq+1*mmsize]
    mulps     m2, m4, [srcq+lenq+2*mmsize]
    mulps     m3, m4, [srcq+lenq+3*mmsize]
    minps     m0, m0, m5
    minps     m1, m1, m5
    minps     m2, m2, m5
    minps     m3, m3, m5
    cvtps2dq  m0, m0
    cvtps2dq  m1, m1
    cvtps2dq  m2, m2
    cvtps2dq  m3, m3
    mova      [dstq+lenq         ], m0
    mova      [dstq+lenq+1*mmsize], m1
    mova      [dstq+lenq+2*mmsize], m2
    mova      [dstq+lenq+3*mmsize], m3
    add       lenq, mmsize*4
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_S32
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
CONV_FLT_TO_S32
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_s16_2ch(int16_t *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
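; Interleaves two planar channels. Scalar sketch (reference only):
;     for (int i = 0; i < len; i++) {
;         dst[2*i  ] = src[0][i];
;         dst[2*i+1] = src[1][i];
;     }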

%macro CONV_S16P_TO_S16_2CH 0
cglobal conv_s16p_to_s16_2ch, 3,4,5, dst, src0, len, src1
    mov       src1q, [src0q+gprsize]
    mov       src0q, [src0q        ]
    lea       lenq, [2*lend]
    add       src0q, lenq
    add       src1q, lenq
    lea       dstq, [dstq+2*lenq]
    neg       lenq
.loop:
    mova      m0, [src0q+lenq       ]
    mova      m1, [src1q+lenq       ]
    mova      m2, [src0q+lenq+mmsize]
    mova      m3, [src1q+lenq+mmsize]
    SBUTTERFLY2 wd, 0, 1, 4
    SBUTTERFLY2 wd, 2, 3, 4
    mova      [dstq+2*lenq+0*mmsize], m0
    mova      [dstq+2*lenq+1*mmsize], m1
    mova      [dstq+2*lenq+2*mmsize], m2
    mova      [dstq+2*lenq+3*mmsize], m3
    add       lenq, 2*mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_S16_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_S16_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_s16_6ch(int16_t *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

;------------------------------------------------------------------------------
; NOTE: In the 6-channel functions, len could be used as an index on x86-64
;       instead of just a counter, which would avoid incrementing the
;       pointers, but the extra complexity and amount of code is not worth
;       the small gain. On x86-32 there are not enough registers to use len
;       as an index without keeping two of the pointers on the stack and
;       loading them in each iteration.
;------------------------------------------------------------------------------
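; All of the 6-channel interleaving below implements, in scalar terms:
;     for (int i = 0; i < len; i++)
;         for (int ch = 0; ch < 6; ch++)
;             dst[6*i+ch] = src[ch][i];
; (reference sketch only; the SIMD code handles 4 or 8 samples per iteration)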

%macro CONV_S16P_TO_S16_6CH 0
%if ARCH_X86_64
cglobal conv_s16p_to_s16_6ch, 3,8,7, dst, src0, len, src1, src2, src3, src4, src5
%else
cglobal conv_s16p_to_s16_6ch, 2,7,7, dst, src0, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    mov       src1q, [src0q+1*gprsize]
    mov       src2q, [src0q+2*gprsize]
    mov       src3q, [src0q+3*gprsize]
    mov       src4q, [src0q+4*gprsize]
    mov       src5q, [src0q+5*gprsize]
    mov       src0q, [src0q]
    sub       src1q, src0q
    sub       src2q, src0q
    sub       src3q, src0q
    sub       src4q, src0q
    sub       src5q, src0q
.loop:
%if cpuflag(sse2slow)
    movq      m0, [src0q      ]    ; m0 =  0,  6, 12, 18,  x,  x,  x,  x
    movq      m1, [src0q+src1q]    ; m1 =  1,  7, 13, 19,  x,  x,  x,  x
    movq      m2, [src0q+src2q]    ; m2 =  2,  8, 14, 20,  x,  x,  x,  x
    movq      m3, [src0q+src3q]    ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
    movq      m4, [src0q+src4q]    ; m4 =  4, 10, 16, 22,  x,  x,  x,  x
    movq      m5, [src0q+src5q]    ; m5 =  5, 11, 17, 23,  x,  x,  x,  x
    ; unpack words:
    punpcklwd m0, m1               ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
    punpcklwd m2, m3               ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
    punpcklwd m4, m5               ; m4 =  4,  5, 10, 11, 16, 17, 22, 23
    ; blend dwords
    shufps    m1, m0, m2, q2020    ; m1 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps    m0, m4, q2031        ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
    shufps    m2, m4, q3131        ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
    ; shuffle dwords
    pshufd    m0, m0, q1302        ; m0 =  4,  5,  6,  7, 16, 17, 18, 19
    pshufd    m1, m1, q3120        ; m1 =  0,  1,  2,  3, 12, 13, 14, 15
    pshufd    m2, m2, q3120        ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
    movq      [dstq+0*mmsize/2], m1
    movq      [dstq+1*mmsize/2], m0
    movq      [dstq+2*mmsize/2], m2
    movhps    [dstq+3*mmsize/2], m1
    movhps    [dstq+4*mmsize/2], m0
    movhps    [dstq+5*mmsize/2], m2
    add       src0q, mmsize/2
    add       dstq, mmsize*3
    sub       lend, mmsize/4
%else
    mova      m0, [src0q      ]    ; m0 =  0,  6, 12, 18, 24, 30, 36, 42
    mova      m1, [src0q+src1q]    ; m1 =  1,  7, 13, 19, 25, 31, 37, 43
    mova      m2, [src0q+src2q]    ; m2 =  2,  8, 14, 20, 26, 32, 38, 44
    mova      m3, [src0q+src3q]    ; m3 =  3,  9, 15, 21, 27, 33, 39, 45
    mova      m4, [src0q+src4q]    ; m4 =  4, 10, 16, 22, 28, 34, 40, 46
    mova      m5, [src0q+src5q]    ; m5 =  5, 11, 17, 23, 29, 35, 41, 47
    ; unpack words:
    SBUTTERFLY2 wd, 0, 1, 6        ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
                                   ; m1 = 24, 25, 30, 31, 36, 37, 42, 43
    SBUTTERFLY2 wd, 2, 3, 6        ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
                                   ; m3 = 26, 27, 32, 33, 38, 39, 44, 45
    SBUTTERFLY2 wd, 4, 5, 6        ; m4 =  4,  5, 10, 11, 16, 17, 22, 23
                                   ; m5 = 28, 29, 34, 35, 40, 41, 46, 47
    ; blend dwords
    shufps    m6, m0, m2, q2020    ; m6 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps    m0, m4, q2031        ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
    shufps    m2, m4, q3131        ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
    SWAP 4,6                       ; m4 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps    m6, m1, m3, q2020    ; m6 = 24, 25, 36, 37, 26, 27, 38, 39
    shufps    m1, m5, q2031        ; m1 = 30, 31, 42, 43, 28, 29, 40, 41
    shufps    m3, m5, q3131        ; m3 = 32, 33, 44, 45, 34, 35, 46, 47
    SWAP 5,6                       ; m5 = 24, 25, 36, 37, 26, 27, 38, 39
    ; shuffle dwords
    pshufd    m0, m0, q1302        ; m0 =  4,  5,  6,  7, 16, 17, 18, 19
    pshufd    m2, m2, q3120        ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
    pshufd    m4, m4, q3120        ; m4 =  0,  1,  2,  3, 12, 13, 14, 15
    pshufd    m1, m1, q1302        ; m1 = 28, 29, 30, 31, 40, 41, 42, 43
    pshufd    m3, m3, q3120        ; m3 = 32, 33, 34, 35, 44, 45, 46, 47
    pshufd    m5, m5, q3120        ; m5 = 24, 25, 26, 27, 36, 37, 38, 39
    ; shuffle qwords
    punpcklqdq m6, m4, m0          ; m6 =  0,  1,  2,  3,  4,  5,  6,  7
    punpckhqdq m0, m2              ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    shufps    m2, m4, q3210        ; m2 =  8,  9, 10, 11, 12, 13, 14, 15
    SWAP 4,6                       ; m4 =  0,  1,  2,  3,  4,  5,  6,  7
    punpcklqdq m6, m5, m1          ; m6 = 24, 25, 26, 27, 28, 29, 30, 31
    punpckhqdq m1, m3              ; m1 = 40, 41, 42, 43, 44, 45, 46, 47
    shufps    m3, m5, q3210        ; m3 = 32, 33, 34, 35, 36, 37, 38, 39
    SWAP 5,6                       ; m5 = 24, 25, 26, 27, 28, 29, 30, 31
    mova      [dstq+0*mmsize], m4
    mova      [dstq+1*mmsize], m2
    mova      [dstq+2*mmsize], m0
    mova      [dstq+3*mmsize], m5
    mova      [dstq+4*mmsize], m3
    mova      [dstq+5*mmsize], m1
    add       src0q, mmsize
    add       dstq, mmsize*6
    sub       lend, mmsize/2
%endif
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_S16_6CH
INIT_XMM sse2slow
CONV_S16P_TO_S16_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_S16_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_flt_2ch(float *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
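; The samples are placed in the high 16 bits of each dword, so scaling by
; pf_s32_inv_scale (2^-31) divides by 32768. Scalar sketch (reference only):
;     for (int i = 0; i < len; i++) {
;         dst[2*i  ] = src[0][i] * (1.0f / 32768.0f);
;         dst[2*i+1] = src[1][i] * (1.0f / 32768.0f);
;     }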

%macro CONV_S16P_TO_FLT_2CH 0
cglobal conv_s16p_to_flt_2ch, 3,4,6, dst, src0, len, src1
    lea       lenq, [2*lend]
    mov       src1q, [src0q+gprsize]
    mov       src0q, [src0q        ]
    lea       dstq, [dstq+4*lenq]
    add       src0q, lenq
    add       src1q, lenq
    neg       lenq
    mova      m5, [pf_s32_inv_scale]
.loop:
    mova      m2, [src0q+lenq]    ; m2 =  0,  2,  4,  6,  8, 10, 12, 14
    mova      m4, [src1q+lenq]    ; m4 =  1,  3,  5,  7,  9, 11, 13, 15
    SBUTTERFLY2 wd, 2, 4, 3       ; m2 =  0,  1,  2,  3,  4,  5,  6,  7
                                  ; m4 =  8,  9, 10, 11, 12, 13, 14, 15
    pxor      m3, m3
    punpcklwd m0, m3, m2          ; m0 =  0,  1,  2,  3
    punpckhwd m1, m3, m2          ; m1 =  4,  5,  6,  7
    punpcklwd m2, m3, m4          ; m2 =  8,  9, 10, 11
    punpckhwd m3, m4              ; m3 = 12, 13, 14, 15
    cvtdq2ps  m0, m0
    cvtdq2ps  m1, m1
    cvtdq2ps  m2, m2
    cvtdq2ps  m3, m3
    mulps     m0, m5
    mulps     m1, m5
    mulps     m2, m5
    mulps     m3, m5
    mova      [dstq+4*lenq         ], m0
    mova      [dstq+4*lenq+  mmsize], m1
    mova      [dstq+4*lenq+2*mmsize], m2
    mova      [dstq+4*lenq+3*mmsize], m3
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_FLT_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_FLT_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16p_to_flt_6ch(float *dst, int16_t *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_S16P_TO_FLT_6CH 0
%if ARCH_X86_64
cglobal conv_s16p_to_flt_6ch, 3,8,8, dst, src, len, src1, src2, src3, src4, src5
%else
cglobal conv_s16p_to_flt_6ch, 2,7,8, dst, src, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    mov       src1q, [srcq+1*gprsize]
    mov       src2q, [srcq+2*gprsize]
    mov       src3q, [srcq+3*gprsize]
    mov       src4q, [srcq+4*gprsize]
    mov       src5q, [srcq+5*gprsize]
    mov       srcq, [srcq]
    sub       src1q, srcq
    sub       src2q, srcq
    sub       src3q, srcq
    sub       src4q, srcq
    sub       src5q, srcq
    mova      m7, [pf_s32_inv_scale]
%if cpuflag(ssse3)
    %define unpack_even m6
    mova      m6, [pb_shuf_unpack_even]
%if ARCH_X86_64
    %define unpack_odd m8
    mova      m8, [pb_shuf_unpack_odd]
%else
    %define unpack_odd [pb_shuf_unpack_odd]
%endif
%endif
.loop:
    movq      m0, [srcq      ]    ; m0 =  0,  6, 12, 18,  x,  x,  x,  x
    movq      m1, [srcq+src1q]    ; m1 =  1,  7, 13, 19,  x,  x,  x,  x
    movq      m2, [srcq+src2q]    ; m2 =  2,  8, 14, 20,  x,  x,  x,  x
    movq      m3, [srcq+src3q]    ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
    movq      m4, [srcq+src4q]    ; m4 =  4, 10, 16, 22,  x,  x,  x,  x
    movq      m5, [srcq+src5q]    ; m5 =  5, 11, 17, 23,  x,  x,  x,  x
    ; unpack words:
    punpcklwd m0, m1              ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
    punpcklwd m2, m3              ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
    punpcklwd m4, m5              ; m4 =  4,  5, 10, 11, 16, 17, 22, 23
    ; blend dwords
    shufps    m1, m4, m0, q3120   ; m1 =  4,  5, 16, 17,  6,  7, 18, 19
    shufps    m0, m2, q2020       ; m0 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps    m2, m4, q3131       ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
%if cpuflag(ssse3)
    pshufb    m3, m0, unpack_odd  ; m3 = 12, 13, 14, 15
    pshufb    m0, unpack_even     ; m0 =  0,  1,  2,  3
    pshufb    m4, m1, unpack_odd  ; m4 = 16, 17, 18, 19
    pshufb    m1, unpack_even     ; m1 =  4,  5,  6,  7
    pshufb    m5, m2, unpack_odd  ; m5 = 20, 21, 22, 23
    pshufb    m2, unpack_even     ; m2 =  8,  9, 10, 11
%else
    ; shuffle dwords
    pshufd    m0, m0, q3120       ; m0 =  0,  1,  2,  3, 12, 13, 14, 15
    pshufd    m1, m1, q3120       ; m1 =  4,  5,  6,  7, 16, 17, 18, 19
    pshufd    m2, m2, q3120       ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
    pxor      m6, m6              ; convert s16 in m0-m2 to s32 in m0-m5
    punpcklwd m3, m6, m0          ; m3 =  0,  1,  2,  3
    punpckhwd m4, m6, m0          ; m4 = 12, 13, 14, 15
    punpcklwd m0, m6, m1          ; m0 =  4,  5,  6,  7
    punpckhwd m5, m6, m1          ; m5 = 16, 17, 18, 19
    punpcklwd m1, m6, m2          ; m1 =  8,  9, 10, 11
    punpckhwd m6, m2              ; m6 = 20, 21, 22, 23
    SWAP 6,2,1,0,3,4,5            ; swap registers 3,0,1,4,5,6 to 0,1,2,3,4,5
%endif
    cvtdq2ps  m0, m0              ; convert s32 to float
    cvtdq2ps  m1, m1
    cvtdq2ps  m2, m2
    cvtdq2ps  m3, m3
    cvtdq2ps  m4, m4
    cvtdq2ps  m5, m5
    mulps     m0, m7              ; scale float from s32 range to [-1.0,1.0]
    mulps     m1, m7
    mulps     m2, m7
    mulps     m3, m7
    mulps     m4, m7
    mulps     m5, m7
    mova      [dstq         ], m0
    mova      [dstq+  mmsize], m1
    mova      [dstq+2*mmsize], m2
    mova      [dstq+3*mmsize], m3
    mova      [dstq+4*mmsize], m4
    mova      [dstq+5*mmsize], m5
    add       srcq, mmsize/2
    add       dstq, mmsize*6
    sub       lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_FLT_6CH
INIT_XMM ssse3
CONV_S16P_TO_FLT_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_FLT_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_fltp_to_s16_2ch(int16_t *dst, float *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
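; Scale, saturate, and interleave. Scalar sketch (reference only):
;     for (int i = 0; i < len; i++)
;         for (int c = 0; c < 2; c++) {
;             long v = lrintf(src[c][i] * 32768.0f);
;             dst[2*i+c] = v < -32768 ? -32768 : v > 32767 ? 32767 : v;
;         }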

%macro CONV_FLTP_TO_S16_2CH 0
cglobal conv_fltp_to_s16_2ch, 3,4,3, dst, src0, len, src1
    lea       lenq, [4*lend]
    mov       src1q, [src0q+gprsize]
    mov       src0q, [src0q        ]
    add       dstq, lenq
    add       src0q, lenq
    add       src1q, lenq
    neg       lenq
    mova      m2, [pf_s16_scale]
%if cpuflag(ssse3)
    mova      m3, [pb_interleave_words]
%endif
.loop:
    mulps     m0, m2, [src0q+lenq] ; m0 = 0, 2, 4, 6
    mulps     m1, m2, [src1q+lenq] ; m1 = 1, 3, 5, 7
    cvtps2dq  m0, m0
    cvtps2dq  m1, m1
%if cpuflag(ssse3)
    packssdw  m0, m1               ; m0 = 0, 2, 4, 6, 1, 3, 5, 7
    pshufb    m0, m3               ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
%else
    packssdw  m0, m0               ; m0 = 0, 2, 4, 6, x, x, x, x
    packssdw  m1, m1               ; m1 = 1, 3, 5, 7, x, x, x, x
    punpcklwd m0, m1               ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
%endif
    mova      [dstq+lenq], m0
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLTP_TO_S16_2CH
INIT_XMM ssse3
CONV_FLTP_TO_S16_2CH

;------------------------------------------------------------------------------
; void ff_conv_fltp_to_s16_6ch(int16_t *dst, float *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_FLTP_TO_S16_6CH 0
%if ARCH_X86_64
cglobal conv_fltp_to_s16_6ch, 3,8,7, dst, src, len, src1, src2, src3, src4, src5
%else
cglobal conv_fltp_to_s16_6ch, 2,7,7, dst, src, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    mov       src1q, [srcq+1*gprsize]
    mov       src2q, [srcq+2*gprsize]
    mov       src3q, [srcq+3*gprsize]
    mov       src4q, [srcq+4*gprsize]
    mov       src5q, [srcq+5*gprsize]
    mov       srcq, [srcq]
    sub       src1q, srcq
    sub       src2q, srcq
    sub       src3q, srcq
    sub       src4q, srcq
    sub       src5q, srcq
    movaps    xmm6, [pf_s16_scale]
.loop:
%if cpuflag(sse2)
    mulps     m0, m6, [srcq      ]
    mulps     m1, m6, [srcq+src1q]
    mulps     m2, m6, [srcq+src2q]
    mulps     m3, m6, [srcq+src3q]
    mulps     m4, m6, [srcq+src4q]
    mulps     m5, m6, [srcq+src5q]
    cvtps2dq  m0, m0
    cvtps2dq  m1, m1
    cvtps2dq  m2, m2
    cvtps2dq  m3, m3
    cvtps2dq  m4, m4
    cvtps2dq  m5, m5
    packssdw  m0, m3               ; m0 =  0,  6, 12, 18,  3,  9, 15, 21
    packssdw  m1, m4               ; m1 =  1,  7, 13, 19,  4, 10, 16, 22
    packssdw  m2, m5               ; m2 =  2,  8, 14, 20,  5, 11, 17, 23
    ; unpack words:
    movhlps   m3, m0               ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
    punpcklwd m0, m1               ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
    punpckhwd m1, m2               ; m1 =  4,  5, 10, 11, 16, 17, 22, 23
    punpcklwd m2, m3               ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
    ; blend dwords:
    shufps    m3, m0, m2, q2020    ; m3 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps    m0, m1, q2031        ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
    shufps    m2, m1, q3131        ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
    ; shuffle dwords:
    shufps    m1, m2, m3, q3120    ; m1 =  8,  9, 10, 11, 12, 13, 14, 15
    shufps    m3, m0, q0220        ; m3 =  0,  1,  2,  3,  4,  5,  6,  7
    shufps    m0, m2, q3113        ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    mova      [dstq+0*mmsize], m3
    mova      [dstq+1*mmsize], m1
    mova      [dstq+2*mmsize], m0
%else ; sse
    movlps    xmm0, [srcq      ]
    movlps    xmm1, [srcq+src1q]
    movlps    xmm2, [srcq+src2q]
    movlps    xmm3, [srcq+src3q]
    movlps    xmm4, [srcq+src4q]
    movlps    xmm5, [srcq+src5q]
    mulps     xmm0, xmm6
    mulps     xmm1, xmm6
    mulps     xmm2, xmm6
    mulps     xmm3, xmm6
    mulps     xmm4, xmm6
    mulps     xmm5, xmm6
    cvtps2pi  mm0, xmm0
    cvtps2pi  mm1, xmm1
    cvtps2pi  mm2, xmm2
    cvtps2pi  mm3, xmm3
    cvtps2pi  mm4, xmm4
    cvtps2pi  mm5, xmm5
    packssdw  mm0, mm3             ; m0 =  0,  6,  3,  9
    packssdw  mm1, mm4             ; m1 =  1,  7,  4, 10
    packssdw  mm2, mm5             ; m2 =  2,  8,  5, 11
    ; unpack words
    pshufw    mm3, mm0, q1032      ; m3 =  3,  9,  0,  6
    punpcklwd mm0, mm1             ; m0 =  0,  1,  6,  7
    punpckhwd mm1, mm2             ; m1 =  4,  5, 10, 11
    punpcklwd mm2, mm3             ; m2 =  2,  3,  8,  9
    ; unpack dwords
    pshufw    mm3, mm0, q1032      ; m3 =  6,  7,  0,  1
    punpckldq mm0, mm2             ; m0 =  0,  1,  2,  3 (final)
    punpckhdq mm2, mm1             ; m2 =  8,  9, 10, 11 (final)
    punpckldq mm1, mm3             ; m1 =  4,  5,  6,  7 (final)
    mova      [dstq+0*mmsize], mm0
    mova      [dstq+1*mmsize], mm1
    mova      [dstq+2*mmsize], mm2
%endif
    add       srcq, mmsize
    add       dstq, mmsize*3
    sub       lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX sse
CONV_FLTP_TO_S16_6CH
INIT_XMM sse2
CONV_FLTP_TO_S16_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLTP_TO_S16_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_fltp_to_flt_2ch(float *dst, float *const *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
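; Pure interleave, no scaling. Scalar sketch (reference only):
;     for (int i = 0; i < len; i++) {
;         dst[2*i  ] = src[0][i];
;         dst[2*i+1] = src[1][i];
;     }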

%macro CONV_FLTP_TO_FLT_2CH 0
cglobal conv_fltp_to_flt_2ch, 3,4,5, dst, src0, len, src1
    mov       src1q, [src0q+gprsize]
    mov       src0q, [src0q]
    lea       lenq, [4*lend]
    add       src0q, lenq
    add       src1q, lenq
    lea       dstq, [dstq+2*lenq]
    neg       lenq
.loop:
    mova      m0, [src0q+lenq       ]
    mova      m1, [src1q+lenq       ]
    mova      m2, [src0q+lenq+mmsize]
    mova      m3, [src1q+lenq+mmsize]
    SBUTTERFLYPS 0, 1, 4
    SBUTTERFLYPS 2, 3, 4
    mova      [dstq+2*lenq+0*mmsize], m0
    mova      [dstq+2*lenq+1*mmsize], m1
    mova      [dstq+2*lenq+2*mmsize], m2
    mova      [dstq+2*lenq+3*mmsize], m3
    add       lenq, 2*mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
CONV_FLTP_TO_FLT_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLTP_TO_FLT_2CH
%endif

;-----------------------------------------------------------------------------
; void ff_conv_fltp_to_flt_6ch(float *dst, float *const *src, int len,
;                              int channels);
;-----------------------------------------------------------------------------

%macro CONV_FLTP_TO_FLT_6CH 0
cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov       lend, r2d
%else
    %define lend dword r2m
%endif
    mov       src1q, [srcq+1*gprsize]
    mov       src2q, [srcq+2*gprsize]
    mov       src3q, [srcq+3*gprsize]
    mov       src4q, [srcq+4*gprsize]
    mov       src5q, [srcq+5*gprsize]
    mov       srcq, [srcq]
    sub       src1q, srcq
    sub       src2q, srcq
    sub       src3q, srcq
    sub       src4q, srcq
    sub       src5q, srcq
.loop:
    mova      m0, [srcq      ]
    mova      m1, [srcq+src1q]
    mova      m2, [srcq+src2q]
    mova      m3, [srcq+src3q]
    mova      m4, [srcq+src4q]
    mova      m5, [srcq+src5q]
%if cpuflag(sse4)
    SBUTTERFLYPS 0, 1, 6
    SBUTTERFLYPS 2, 3, 6
    SBUTTERFLYPS 4, 5, 6

    blendps   m6, m4, m0, 1100b
    movlhps   m0, m2
    movhlps   m4, m2
    blendps   m2, m5, m1, 1100b
    movlhps   m1, m3
    movhlps   m5, m3

    movaps  [dstq   ], m0
    movaps  [dstq+16], m6
    movaps  [dstq+32], m4
    movaps  [dstq+48], m1
    movaps  [dstq+64], m2
    movaps  [dstq+80], m5
%else ; mmx
    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6

    movq    [dstq   ], m0
    movq    [dstq+ 8], m2
    movq    [dstq+16], m4
    movq    [dstq+24], m1
    movq    [dstq+32], m3
    movq    [dstq+40], m5
%endif
    add       srcq, mmsize
    add       dstq, mmsize*6
    sub       lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX mmx
CONV_FLTP_TO_FLT_6CH
INIT_XMM sse4
CONV_FLTP_TO_FLT_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLTP_TO_FLT_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16_to_s16p_2ch(int16_t *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
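; Deinterleaves to two planar channels. Scalar sketch (reference only):
;     for (int i = 0; i < len; i++) {
;         dst[0][i] = src[2*i  ];
;         dst[1][i] = src[2*i+1];
;     }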

%macro CONV_S16_TO_S16P_2CH 0
cglobal conv_s16_to_s16p_2ch, 3,4,4, dst0, src, len, dst1
    lea       lenq, [2*lend]
    mov       dst1q, [dst0q+gprsize]
    mov       dst0q, [dst0q        ]
    lea       srcq, [srcq+2*lenq]
    add       dst0q, lenq
    add       dst1q, lenq
    neg       lenq
%if cpuflag(ssse3)
    mova      m3, [pb_deinterleave_words]
%endif
.loop:
    mova      m0, [srcq+2*lenq       ]  ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
    mova      m1, [srcq+2*lenq+mmsize]  ; m1 =  8,  9, 10, 11, 12, 13, 14, 15
%if cpuflag(ssse3)
    pshufb    m0, m3                    ; m0 =  0,  2,  4,  6,  1,  3,  5,  7
    pshufb    m1, m3                    ; m1 =  8, 10, 12, 14,  9, 11, 13, 15
    SBUTTERFLY2 qdq, 0, 1, 2            ; m0 =  0,  2,  4,  6,  8, 10, 12, 14
                                        ; m1 =  1,  3,  5,  7,  9, 11, 13, 15
%else ; sse2
    pshuflw   m0, m0, q3120             ; m0 =  0,  2,  1,  3,  4,  5,  6,  7
    pshufhw   m0, m0, q3120             ; m0 =  0,  2,  1,  3,  4,  6,  5,  7
    pshuflw   m1, m1, q3120             ; m1 =  8, 10,  9, 11, 12, 13, 14, 15
    pshufhw   m1, m1, q3120             ; m1 =  8, 10,  9, 11, 12, 14, 13, 15
    DEINT2_PS 0, 1, 2                   ; m0 =  0,  2,  4,  6,  8, 10, 12, 14
                                        ; m1 =  1,  3,  5,  7,  9, 11, 13, 15
%endif
    mova      [dst0q+lenq], m0
    mova      [dst1q+lenq], m1
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_S16P_2CH
INIT_XMM ssse3
CONV_S16_TO_S16P_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_S16P_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16_to_s16p_6ch(int16_t *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_S16_TO_S16P_6CH 0
%if ARCH_X86_64
cglobal conv_s16_to_s16p_6ch, 3,8,5, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_s16_to_s16p_6ch, 2,7,5, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov       dst1q, [dstq+  gprsize]
    mov       dst2q, [dstq+2*gprsize]
    mov       dst3q, [dstq+3*gprsize]
    mov       dst4q, [dstq+4*gprsize]
    mov       dst5q, [dstq+5*gprsize]
    mov       dstq, [dstq          ]
    sub       dst1q, dstq
    sub       dst2q, dstq
    sub       dst3q, dstq
    sub       dst4q, dstq
    sub       dst5q, dstq
.loop:
    mova      m0, [srcq+0*mmsize]      ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
    mova      m3, [srcq+1*mmsize]      ; m3 =  8,  9, 10, 11, 12, 13, 14, 15
    mova      m2, [srcq+2*mmsize]      ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR   m1, m3, m0, 12, m4       ; m1 =  6,  7,  8,  9, 10, 11,  x,  x
    shufps    m3, m2, q1032            ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq    m2, 4                    ; m2 = 18, 19, 20, 21, 22, 23,  x,  x
    SBUTTERFLY2 wd, 0, 1, 4            ; m0 =  0,  6,  1,  7,  2,  8,  3,  9
                                       ; m1 =  4, 10,  5, 11,  x,  x,  x,  x
    SBUTTERFLY2 wd, 3, 2, 4            ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
                                       ; m2 = 16, 22, 17, 23,  x,  x,  x,  x
    SBUTTERFLY2 dq, 0, 3, 4            ; m0 =  0,  6, 12, 18,  1,  7, 13, 19
                                       ; m3 =  2,  8, 14, 20,  3,  9, 15, 21
    punpckldq m1, m2                   ; m1 =  4, 10, 16, 22,  5, 11, 17, 23
    movq    [dstq      ], m0
    movhps  [dstq+dst1q], m0
    movq    [dstq+dst2q], m3
    movhps  [dstq+dst3q], m3
    movq    [dstq+dst4q], m1
    movhps  [dstq+dst5q], m1
    add       srcq, mmsize*3
    add       dstq, mmsize/2
    sub       lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_S16P_6CH
INIT_XMM ssse3
CONV_S16_TO_S16P_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_S16P_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16_to_fltp_2ch(float *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
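; pslld/pand split even and odd samples into the high 16 bits of separate
; dword vectors, so a single multiply by 2^-31 both converts and scales.
; Scalar sketch (reference only):
;     for (int i = 0; i < len; i++) {
;         dst[0][i] = src[2*i  ] * (1.0f / 32768.0f);
;         dst[1][i] = src[2*i+1] * (1.0f / 32768.0f);
;     }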

%macro CONV_S16_TO_FLTP_2CH 0
cglobal conv_s16_to_fltp_2ch, 3,4,5, dst0, src, len, dst1
    lea       lenq, [4*lend]
    mov       dst1q, [dst0q+gprsize]
    mov       dst0q, [dst0q        ]
    add       srcq, lenq
    add       dst0q, lenq
    add       dst1q, lenq
    neg       lenq
    mova      m3, [pf_s32_inv_scale]
    mova      m4, [pw_zero_even]
.loop:
    mova      m1, [srcq+lenq]
    pslld     m0, m1, 16
    pand      m1, m4
    cvtdq2ps  m0, m0
    cvtdq2ps  m1, m1
    mulps     m0, m0, m3
    mulps     m1, m1, m3
    mova      [dst0q+lenq], m0
    mova      [dst1q+lenq], m1
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_FLTP_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_FLTP_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_s16_to_fltp_6ch(float *const *dst, int16_t *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_S16_TO_FLTP_6CH 0
%if ARCH_X86_64
cglobal conv_s16_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_s16_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov       dst1q, [dstq+  gprsize]
    mov       dst2q, [dstq+2*gprsize]
    mov       dst3q, [dstq+3*gprsize]
    mov       dst4q, [dstq+4*gprsize]
    mov       dst5q, [dstq+5*gprsize]
    mov       dstq, [dstq          ]
    sub       dst1q, dstq
    sub       dst2q, dstq
    sub       dst3q, dstq
    sub       dst4q, dstq
    sub       dst5q, dstq
    mova      m6, [pf_s16_inv_scale]
.loop:
    mova      m0, [srcq+0*mmsize]  ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
    mova      m3, [srcq+1*mmsize]  ; m3 =  8,  9, 10, 11, 12, 13, 14, 15
    mova      m2, [srcq+2*mmsize]  ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR   m1, m3, m0, 12, m4   ; m1 =  6,  7,  8,  9, 10, 11,  x,  x
    shufps    m3, m2, q1032        ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq    m2, 4                ; m2 = 18, 19, 20, 21, 22, 23,  x,  x
    SBUTTERFLY2 wd, 0, 1, 4        ; m0 =  0,  6,  1,  7,  2,  8,  3,  9
                                   ; m1 =  4, 10,  5, 11,  x,  x,  x,  x
    SBUTTERFLY2 wd, 3, 2, 4        ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
                                   ; m2 = 16, 22, 17, 23,  x,  x,  x,  x
    SBUTTERFLY2 dq, 0, 3, 4        ; m0 =  0,  6, 12, 18,  1,  7, 13, 19
                                   ; m3 =  2,  8, 14, 20,  3,  9, 15, 21
    punpckldq m1, m2               ; m1 =  4, 10, 16, 22,  5, 11, 17, 23
    S16_TO_S32_SX 0, 2             ; m0 =  0,  6, 12, 18
                                   ; m2 =  1,  7, 13, 19
    S16_TO_S32_SX 3, 4             ; m3 =  2,  8, 14, 20
                                   ; m4 =  3,  9, 15, 21
    S16_TO_S32_SX 1, 5             ; m1 =  4, 10, 16, 22
                                   ; m5 =  5, 11, 17, 23
    SWAP 1,2,3,4
    cvtdq2ps  m0, m0
    cvtdq2ps  m1, m1
    cvtdq2ps  m2, m2
    cvtdq2ps  m3, m3
    cvtdq2ps  m4, m4
    cvtdq2ps  m5, m5
    mulps     m0, m6
    mulps     m1, m6
    mulps     m2, m6
    mulps     m3, m6
    mulps     m4, m6
    mulps     m5, m6
    mova      [dstq      ], m0
    mova      [dstq+dst1q], m1
    mova      [dstq+dst2q], m2
    mova      [dstq+dst3q], m3
    mova      [dstq+dst4q], m4
    mova      [dstq+dst5q], m5
    add       srcq, mmsize*3
    add       dstq, mmsize
    sub       lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_FLTP_6CH
INIT_XMM ssse3
CONV_S16_TO_FLTP_6CH
INIT_XMM sse4
CONV_S16_TO_FLTP_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_FLTP_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16p_2ch(int16_t *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
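; Deinterleave, scale, and saturate. Scalar sketch (reference only):
;     for (int i = 0; i < len; i++)
;         for (int c = 0; c < 2; c++) {
;             long v = lrintf(src[2*i+c] * 32768.0f);
;             dst[c][i] = v < -32768 ? -32768 : v > 32767 ? 32767 : v;
;         }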

%macro CONV_FLT_TO_S16P_2CH 0
cglobal conv_flt_to_s16p_2ch, 3,4,6, dst0, src, len, dst1
    lea       lenq, [2*lend]
    mov       dst1q, [dst0q+gprsize]
    mov       dst0q, [dst0q        ]
    lea       srcq, [srcq+4*lenq]
    add       dst0q, lenq
    add       dst1q, lenq
    neg       lenq
    mova      m5, [pf_s16_scale]
.loop:
    mova      m0, [srcq+4*lenq         ]
    mova      m1, [srcq+4*lenq+  mmsize]
    mova      m2, [srcq+4*lenq+2*mmsize]
    mova      m3, [srcq+4*lenq+3*mmsize]
    DEINT2_PS 0, 1, 4
    DEINT2_PS 2, 3, 4
    mulps     m0, m0, m5
    mulps     m1, m1, m5
    mulps     m2, m2, m5
    mulps     m3, m3, m5
    cvtps2dq  m0, m0
    cvtps2dq  m1, m1
    cvtps2dq  m2, m2
    cvtps2dq  m3, m3
    packssdw  m0, m2
    packssdw  m1, m3
    mova      [dst0q+lenq], m0
    mova      [dst1q+lenq], m1
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_S16P_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_S16P_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16p_6ch(int16_t *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_FLT_TO_S16P_6CH 0
%if ARCH_X86_64
cglobal conv_flt_to_s16p_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_flt_to_s16p_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov       dst1q, [dstq+  gprsize]
    mov       dst2q, [dstq+2*gprsize]
    mov       dst3q, [dstq+3*gprsize]
    mov       dst4q, [dstq+4*gprsize]
    mov       dst5q, [dstq+5*gprsize]
    mov       dstq, [dstq          ]
    sub       dst1q, dstq
    sub       dst2q, dstq
    sub       dst3q, dstq
    sub       dst4q, dstq
    sub       dst5q, dstq
    mova      m6, [pf_s16_scale]
.loop:
    mulps     m0, m6, [srcq+0*mmsize]
    mulps     m3, m6, [srcq+1*mmsize]
    mulps     m1, m6, [srcq+2*mmsize]
    mulps     m4, m6, [srcq+3*mmsize]
    mulps     m2, m6, [srcq+4*mmsize]
    mulps     m5, m6, [srcq+5*mmsize]
    cvtps2dq  m0, m0
    cvtps2dq  m1, m1
    cvtps2dq  m2, m2
    cvtps2dq  m3, m3
    cvtps2dq  m4, m4
    cvtps2dq  m5, m5
    packssdw  m0, m3               ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
    packssdw  m1, m4               ; m1 =  8,  9, 10, 11, 12, 13, 14, 15
    packssdw  m2, m5               ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR   m3, m1, m0, 12, m4   ; m3 =  6,  7,  8,  9, 10, 11,  x,  x
    shufps    m1, m2, q1032        ; m1 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq    m2, 4                ; m2 = 18, 19, 20, 21, 22, 23,  x,  x
    SBUTTERFLY2 wd, 0, 3, 4        ; m0 =  0,  6,  1,  7,  2,  8,  3,  9
                                   ; m3 =  4, 10,  5, 11,  x,  x,  x,  x
    SBUTTERFLY2 wd, 1, 2, 4        ; m1 = 12, 18, 13, 19, 14, 20, 15, 21
                                   ; m2 = 16, 22, 17, 23,  x,  x,  x,  x
    SBUTTERFLY2 dq, 0, 1, 4        ; m0 =  0,  6, 12, 18,  1,  7, 13, 19
                                   ; m1 =  2,  8, 14, 20,  3,  9, 15, 21
    punpckldq m3, m2               ; m3 =  4, 10, 16, 22,  5, 11, 17, 23
    movq    [dstq      ], m0
    movhps  [dstq+dst1q], m0
    movq    [dstq+dst2q], m1
    movhps  [dstq+dst3q], m1
    movq    [dstq+dst4q], m3
    movhps  [dstq+dst5q], m3
    add       srcq, mmsize*6
    add       dstq, mmsize/2
    sub       lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_S16P_6CH
INIT_XMM ssse3
CONV_FLT_TO_S16P_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_S16P_6CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_fltp_2ch(float *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------
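; Pure deinterleave, no scaling. Scalar sketch (reference only):
;     for (int i = 0; i < len; i++) {
;         dst[0][i] = src[2*i  ];
;         dst[1][i] = src[2*i+1];
;     }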

%macro CONV_FLT_TO_FLTP_2CH 0
cglobal conv_flt_to_fltp_2ch, 3,4,3, dst0, src, len, dst1
    lea       lenq, [4*lend]
    mov       dst1q, [dst0q+gprsize]
    mov       dst0q, [dst0q        ]
    lea       srcq, [srcq+2*lenq]
    add       dst0q, lenq
    add       dst1q, lenq
    neg       lenq
.loop:
    mova      m0, [srcq+2*lenq       ]
    mova      m1, [srcq+2*lenq+mmsize]
    DEINT2_PS 0, 1, 2
    mova      [dst0q+lenq], m0
    mova      [dst1q+lenq], m1
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
CONV_FLT_TO_FLTP_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_FLTP_2CH
%endif

;------------------------------------------------------------------------------
; void ff_conv_flt_to_fltp_6ch(float *const *dst, float *src, int len,
;                              int channels);
;------------------------------------------------------------------------------

%macro CONV_FLT_TO_FLTP_6CH 0
%if ARCH_X86_64
cglobal conv_flt_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_flt_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m
%endif
    mov       dst1q, [dstq+  gprsize]
    mov       dst2q, [dstq+2*gprsize]
    mov       dst3q, [dstq+3*gprsize]
    mov       dst4q, [dstq+4*gprsize]
    mov       dst5q, [dstq+5*gprsize]
    mov       dstq, [dstq          ]
    sub       dst1q, dstq
    sub       dst2q, dstq
    sub       dst3q, dstq
    sub       dst4q, dstq
    sub       dst5q, dstq
.loop:
    mova      m0, [srcq+0*mmsize]  ; m0 =  0,  1,  2,  3
    mova      m1, [srcq+1*mmsize]  ; m1 =  4,  5,  6,  7
    mova      m2, [srcq+2*mmsize]  ; m2 =  8,  9, 10, 11
    mova      m3, [srcq+3*mmsize]  ; m3 = 12, 13, 14, 15
    mova      m4, [srcq+4*mmsize]  ; m4 = 16, 17, 18, 19
    mova      m5, [srcq+5*mmsize]  ; m5 = 20, 21, 22, 23

    SBUTTERFLY2 dq, 0, 3, 6        ; m0 =  0, 12,  1, 13
                                   ; m3 =  2, 14,  3, 15
    SBUTTERFLY2 dq, 1, 4, 6        ; m1 =  4, 16,  5, 17
                                   ; m4 =  6, 18,  7, 19
    SBUTTERFLY2 dq, 2, 5, 6        ; m2 =  8, 20,  9, 21
                                   ; m5 = 10, 22, 11, 23
    SBUTTERFLY2 dq, 0, 4, 6        ; m0 =  0,  6, 12, 18
                                   ; m4 =  1,  7, 13, 19
    SBUTTERFLY2 dq, 3, 2, 6        ; m3 =  2,  8, 14, 20
                                   ; m2 =  3,  9, 15, 21
    SBUTTERFLY2 dq, 1, 5, 6        ; m1 =  4, 10, 16, 22
                                   ; m5 =  5, 11, 17, 23
    mova      [dstq      ], m0
    mova      [dstq+dst1q], m4
    mova      [dstq+dst2q], m3
    mova      [dstq+dst3q], m2
    mova      [dstq+dst4q], m1
    mova      [dstq+dst5q], m5
    add       srcq, mmsize*6
    add       dstq, mmsize
    sub       lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_FLTP_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_FLTP_6CH
%endif