;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
22 | ||
23 | %include "libavutil/x86/x86util.asm" | |
24 | %include "util.asm" | |
25 | ||
SECTION_RODATA 32

; Scale factors stored as raw IEEE-754 single-precision bit patterns:
pf_s32_inv_scale: times 8 dd 0x30000000 ; 2^-31: maps s32 range onto [-1.0,1.0)
pf_s32_scale:     times 8 dd 0x4f000000 ; 2^31:  maps [-1.0,1.0) onto s32 range
pf_s32_clip:      times 8 dd 0x4effffff ; largest float < 2^31; upper clip so
                                        ; cvtps2dq cannot overflow for +1.0
pf_s16_inv_scale: times 4 dd 0x38000000 ; 2^-15: maps s16 range onto [-1.0,1.0)
pf_s16_scale:     times 4 dd 0x47000000 ; 2^15:  maps [-1.0,1.0) onto s16 range
; pshufb control masks; a byte of -1 zeroes the corresponding output byte
pb_shuf_unpack_even:   db -1, -1,  0,  1, -1, -1,  2,  3, -1, -1,  8,  9, -1, -1, 10, 11
pb_shuf_unpack_odd:    db -1, -1,  4,  5, -1, -1,  6,  7, -1, -1, 12, 13, -1, -1, 14, 15
; word-granularity shuffle masks (SHUFFLE_MASK_W from x86util.asm)
pb_interleave_words:   SHUFFLE_MASK_W 0, 4, 1, 5, 2, 6, 3, 7
pb_deinterleave_words: SHUFFLE_MASK_W 0, 2, 4, 6, 1, 3, 5, 7
; AND mask: even-indexed words 0x0000, odd-indexed words 0xffff
; (not referenced in this chunk; presumably used elsewhere in the file)
pw_zero_even: times 4 dw 0x0000, 0xffff
39 | SECTION_TEXT | |
40 | ||
;------------------------------------------------------------------------------
; void ff_conv_s16_to_s32(int32_t *dst, const int16_t *src, int len);
;
; Packed s16 -> s32: each output is the input sample shifted left by 16
; (placed in the high word of a zeroed dword), preserving full amplitude.
;------------------------------------------------------------------------------

INIT_XMM sse2
cglobal conv_s16_to_s32, 3,3,3, dst, src, len
    lea      lenq, [2*lend]          ; lenq = byte size of the s16 input
    lea      dstq, [dstq+2*lenq]     ; s32 output is twice as large
    add      srcq, lenq              ; point both past the end ...
    neg      lenq                    ; ... and count a negative index up to 0
.loop:
    mova       m2, [srcq+lenq]
    pxor       m0, m0
    pxor       m1, m1
    punpcklwd  m0, m2                ; zeros in low words, sample in high word
    punpckhwd  m1, m2                ;   == sample << 16, sign preserved
    mova  [dstq+2*lenq       ], m0
    mova  [dstq+2*lenq+mmsize], m1
    add      lenq, mmsize
    jl .loop
    REP_RET
62 | ||
;------------------------------------------------------------------------------
; void ff_conv_s16_to_flt(float *dst, const int16_t *src, int len);
;
; Packed s16 -> float, scaled by 2^-15 so full-scale input maps to [-1.0,1.0).
;------------------------------------------------------------------------------

%macro CONV_S16_TO_FLT 0
cglobal conv_s16_to_flt, 3,3,3, dst, src, len
    lea      lenq, [2*lend]          ; byte size of the s16 input
    add      srcq, lenq
    lea      dstq, [dstq + 2*lenq]   ; float output is twice as large
    neg      lenq
    mova       m2, [pf_s16_inv_scale] ; 2^-15
    ALIGN 16
.loop:
    mova       m0, [srcq+lenq]
    S16_TO_S32_SX 0, 1               ; util.asm macro: widen 8 x s16 in m0 to
                                     ; s32 in m0/m1 (presumably sign-extending;
                                     ; sse2 vs sse4 variant differs -- see util.asm)
    cvtdq2ps   m0, m0
    cvtdq2ps   m1, m1
    mulps      m0, m2                ; scale into [-1.0,1.0)
    mulps      m1, m2
    mova  [dstq+2*lenq       ], m0
    mova  [dstq+2*lenq+mmsize], m1
    add      lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16_TO_FLT
INIT_XMM sse4
CONV_S16_TO_FLT
93 | ||
;------------------------------------------------------------------------------
; void ff_conv_s32_to_s16(int16_t *dst, const int32_t *src, int len);
;
; Packed s32 -> s16 by keeping the high 16 bits of each sample (truncation,
; no rounding or dithering).
;------------------------------------------------------------------------------

%macro CONV_S32_TO_S16 0
cglobal conv_s32_to_s16, 3,3,4, dst, src, len
    lea      lenq, [2*lend]          ; byte size of the s16 output
    lea      srcq, [srcq+2*lenq]     ; s32 input is twice as large
    add      dstq, lenq
    neg      lenq
.loop:
    mova       m0, [srcq+2*lenq         ]
    mova       m1, [srcq+2*lenq+  mmsize]
    mova       m2, [srcq+2*lenq+2*mmsize]
    mova       m3, [srcq+2*lenq+3*mmsize]
    psrad      m0, 16                ; arithmetic shift keeps the sign and
    psrad      m1, 16                ; leaves a value that already fits in
    psrad      m2, 16                ; 16 bits, so the saturating pack below
    psrad      m3, 16                ; never actually clips
    packssdw   m0, m1
    packssdw   m2, m3
    mova  [dstq+lenq       ], m0
    mova  [dstq+lenq+mmsize], m2
    add      lenq, mmsize*2
    jl .loop
%if mmsize == 8
    emms                             ; MMX variant must reset the x87 state
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX mmx
CONV_S32_TO_S16
INIT_XMM sse2
CONV_S32_TO_S16
131 | ||
;------------------------------------------------------------------------------
; void ff_conv_s32_to_flt(float *dst, const int32_t *src, int len);
;
; Packed s32 -> float, scaled by 2^-31 so full-scale input maps to [-1.0,1.0).
;------------------------------------------------------------------------------

%macro CONV_S32_TO_FLT 0
cglobal conv_s32_to_flt, 3,3,3, dst, src, len
    lea      lenq, [4*lend]          ; byte size (same for s32 input and
    add      srcq, lenq              ; float output)
    add      dstq, lenq
    neg      lenq
    mova       m0, [pf_s32_inv_scale] ; 2^-31
    ALIGN 16
.loop:
    cvtdq2ps   m1, [srcq+lenq       ]
    cvtdq2ps   m2, [srcq+lenq+mmsize]
    mulps      m1, m1, m0            ; 3-operand form: native on AVX,
    mulps      m2, m2, m0            ; emulated by x86inc for SSE2
    mova  [dstq+lenq       ], m1
    mova  [dstq+lenq+mmsize], m2
    add      lenq, mmsize*2
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S32_TO_FLT
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
CONV_S32_TO_FLT
%endif
162 | ||
;------------------------------------------------------------------------------
; void ff_conv_flt_to_s16(int16_t *dst, const float *src, int len);
;
; Packed float -> s16: scale by 2^15, convert with rounding (cvtps2dq), then
; pack with signed saturation so out-of-range input clamps to the s16 range.
;------------------------------------------------------------------------------

INIT_XMM sse2
cglobal conv_flt_to_s16, 3,3,5, dst, src, len
    lea      lenq, [2*lend]          ; byte size of the s16 output
    lea      srcq, [srcq+2*lenq]     ; float input is twice as large
    add      dstq, lenq
    neg      lenq
    mova       m4, [pf_s16_scale]    ; 2^15
.loop:
    mova       m0, [srcq+2*lenq         ]
    mova       m1, [srcq+2*lenq+1*mmsize]
    mova       m2, [srcq+2*lenq+2*mmsize]
    mova       m3, [srcq+2*lenq+3*mmsize]
    mulps      m0, m4
    mulps      m1, m4
    mulps      m2, m4
    mulps      m3, m4
    cvtps2dq   m0, m0
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    packssdw   m0, m1                ; saturating pack clips to [-32768,32767]
    packssdw   m2, m3
    mova  [dstq+lenq       ], m0
    mova  [dstq+lenq+mmsize], m2
    add      lenq, mmsize*2
    jl .loop
    REP_RET
194 | ||
;------------------------------------------------------------------------------
; void ff_conv_flt_to_s32(int32_t *dst, const float *src, int len);
;
; Packed float -> s32: scale by 2^31, clip, convert with rounding.
; Only an upper clip is needed: +1.0 * 2^31 does not fit in int32 and
; cvtps2dq would return 0x80000000 (INT_MIN) for it, so positive values are
; clamped to the largest float below 2^31 first.  Negative overflow needs no
; clip because 0x80000000 is already the correct saturation for <= -2^31.
;------------------------------------------------------------------------------

%macro CONV_FLT_TO_S32 0
cglobal conv_flt_to_s32, 3,3,6, dst, src, len
    lea      lenq, [lend*4]          ; byte size (same for input and output)
    add      srcq, lenq
    add      dstq, lenq
    neg      lenq
    mova       m4, [pf_s32_scale]    ; 2^31
    mova       m5, [pf_s32_clip]     ; largest float < 2^31
.loop:
    mulps      m0, m4, [srcq+lenq         ]
    mulps      m1, m4, [srcq+lenq+1*mmsize]
    mulps      m2, m4, [srcq+lenq+2*mmsize]
    mulps      m3, m4, [srcq+lenq+3*mmsize]
    minps      m0, m0, m5            ; upper clip (see header comment)
    minps      m1, m1, m5
    minps      m2, m2, m5
    minps      m3, m3, m5
    cvtps2dq   m0, m0
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    mova  [dstq+lenq         ], m0
    mova  [dstq+lenq+1*mmsize], m1
    mova  [dstq+lenq+2*mmsize], m2
    mova  [dstq+lenq+3*mmsize], m3
    add      lenq, mmsize*4
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLT_TO_S32
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
CONV_FLT_TO_S32
%endif
235 | ||
;------------------------------------------------------------------------------
; void ff_conv_s16p_to_s16_2ch(int16_t *dst, int16_t *const *src, int len,
;                              int channels);
;
; Interleave 2 planar s16 channels into packed s16 (L0 R0 L1 R1 ...).
; The 'channels' argument is unused here (this kernel is 2-channel only).
;------------------------------------------------------------------------------

%macro CONV_S16P_TO_S16_2CH 0
cglobal conv_s16p_to_s16_2ch, 3,4,5, dst, src0, len, src1
    mov     src1q, [src0q+gprsize]   ; src1 = src[1]
    mov     src0q, [src0q         ]  ; src0 = src[0]
    lea      lenq, [2*lend]          ; byte size per input plane
    add     src0q, lenq
    add     src1q, lenq
    lea      dstq, [dstq+2*lenq]     ; packed output is 2 planes wide
    neg      lenq
.loop:
    mova       m0, [src0q+lenq       ]
    mova       m1, [src1q+lenq       ]
    mova       m2, [src0q+lenq+mmsize]
    mova       m3, [src1q+lenq+mmsize]
    SBUTTERFLY2 wd, 0, 1, 4          ; word-interleave the two channels
    SBUTTERFLY2 wd, 2, 3, 4          ; (m4 is scratch)
    mova  [dstq+2*lenq+0*mmsize], m0
    mova  [dstq+2*lenq+1*mmsize], m1
    mova  [dstq+2*lenq+2*mmsize], m2
    mova  [dstq+2*lenq+3*mmsize], m3
    add      lenq, 2*mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_S16_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_S16_2CH
%endif
272 | ||
;------------------------------------------------------------------------------
; void ff_conv_s16p_to_s16_6ch(int16_t *dst, int16_t *const *src, int len,
;                              int channels);
;
; Interleave 6 planar s16 channels into packed s16.  Sample numbers in the
; lane comments below refer to packed-output positions: sample k of channel c
; is packed index 6*k + c.
;------------------------------------------------------------------------------

;------------------------------------------------------------------------------
; NOTE: In the 6-channel functions, len could be used as an index on x86-64
;       instead of just a counter, which would avoid incrementing the
;       pointers, but the extra complexity and amount of code is not worth
;       the small gain. On x86-32 there are not enough registers to use len
;       as an index without keeping two of the pointers on the stack and
;       loading them in each iteration.
;------------------------------------------------------------------------------

%macro CONV_S16P_TO_S16_6CH 0
%if ARCH_X86_64
cglobal conv_s16p_to_s16_6ch, 3,8,7, dst, src0, len, src1, src2, src3, src4, src5
%else
cglobal conv_s16p_to_s16_6ch, 2,7,7, dst, src0, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    ; load the 6 plane pointers, then keep planes 1-5 as offsets from plane 0
    ; so only src0q needs incrementing in the loop
    mov     src1q, [src0q+1*gprsize]
    mov     src2q, [src0q+2*gprsize]
    mov     src3q, [src0q+3*gprsize]
    mov     src4q, [src0q+4*gprsize]
    mov     src5q, [src0q+5*gprsize]
    mov     src0q, [src0q]
    sub     src1q, src0q
    sub     src2q, src0q
    sub     src3q, src0q
    sub     src4q, src0q
    sub     src5q, src0q
.loop:
%if cpuflag(sse2slow)
    ; half-width variant: 64-bit loads/stores to avoid slow full-width
    ; shuffles on "sse2slow" CPUs
    movq       m0, [src0q      ]  ; m0 =  0,  6, 12, 18,  x,  x,  x,  x
    movq       m1, [src0q+src1q]  ; m1 =  1,  7, 13, 19,  x,  x,  x,  x
    movq       m2, [src0q+src2q]  ; m2 =  2,  8, 14, 20,  x,  x,  x,  x
    movq       m3, [src0q+src3q]  ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
    movq       m4, [src0q+src4q]  ; m4 =  4, 10, 16, 22,  x,  x,  x,  x
    movq       m5, [src0q+src5q]  ; m5 =  5, 11, 17, 23,  x,  x,  x,  x
                                  ; unpack words:
    punpcklwd  m0, m1             ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
    punpcklwd  m2, m3             ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
    punpcklwd  m4, m5             ; m4 =  4,  5, 10, 11, 16, 17, 22, 23
                                  ; blend dwords
    shufps     m1, m0, m2, q2020  ; m1 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps     m0, m4, q2031      ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
    shufps     m2, m4, q3131      ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
                                  ; shuffle dwords
    pshufd     m0, m0, q1302      ; m0 =  4,  5,  6,  7, 16, 17, 18, 19
    pshufd     m1, m1, q3120      ; m1 =  0,  1,  2,  3, 12, 13, 14, 15
    pshufd     m2, m2, q3120      ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
    movq    [dstq+0*mmsize/2], m1
    movq    [dstq+1*mmsize/2], m0
    movq    [dstq+2*mmsize/2], m2
    movhps  [dstq+3*mmsize/2], m1
    movhps  [dstq+4*mmsize/2], m0
    movhps  [dstq+5*mmsize/2], m2
    add     src0q, mmsize/2
    add      dstq, mmsize*3
    sub      lend, mmsize/4
%else
    mova       m0, [src0q      ]  ; m0 =  0,  6, 12, 18, 24, 30, 36, 42
    mova       m1, [src0q+src1q]  ; m1 =  1,  7, 13, 19, 25, 31, 37, 43
    mova       m2, [src0q+src2q]  ; m2 =  2,  8, 14, 20, 26, 32, 38, 44
    mova       m3, [src0q+src3q]  ; m3 =  3,  9, 15, 21, 27, 33, 39, 45
    mova       m4, [src0q+src4q]  ; m4 =  4, 10, 16, 22, 28, 34, 40, 46
    mova       m5, [src0q+src5q]  ; m5 =  5, 11, 17, 23, 29, 35, 41, 47
                                  ; unpack words:
    SBUTTERFLY2 wd, 0, 1, 6       ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
                                  ; m1 = 24, 25, 30, 31, 36, 37, 42, 43
    SBUTTERFLY2 wd, 2, 3, 6       ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
                                  ; m3 = 26, 27, 32, 33, 38, 39, 44, 45
    SBUTTERFLY2 wd, 4, 5, 6       ; m4 =  4,  5, 10, 11, 16, 17, 22, 23
                                  ; m5 = 28, 29, 34, 35, 40, 41, 46, 47
                                  ; blend dwords
    shufps     m6, m0, m2, q2020  ; m6 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps     m0, m4, q2031      ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
    shufps     m2, m4, q3131      ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
    SWAP 4,6                      ; m4 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps     m6, m1, m3, q2020  ; m6 = 24, 25, 36, 37, 26, 27, 38, 39
    shufps     m1, m5, q2031      ; m1 = 30, 31, 42, 43, 28, 29, 40, 41
    shufps     m3, m5, q3131      ; m3 = 32, 33, 44, 45, 34, 35, 46, 47
    SWAP 5,6                      ; m5 = 24, 25, 36, 37, 26, 27, 38, 39
                                  ; shuffle dwords
    pshufd     m0, m0, q1302      ; m0 =  4,  5,  6,  7, 16, 17, 18, 19
    pshufd     m2, m2, q3120      ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
    pshufd     m4, m4, q3120      ; m4 =  0,  1,  2,  3, 12, 13, 14, 15
    pshufd     m1, m1, q1302      ; m1 = 28, 29, 30, 31, 40, 41, 42, 43
    pshufd     m3, m3, q3120      ; m3 = 32, 33, 34, 35, 44, 45, 46, 47
    pshufd     m5, m5, q3120      ; m5 = 24, 25, 26, 27, 36, 37, 38, 39
                                  ; shuffle qwords
    punpcklqdq m6, m4, m0         ; m6 =  0,  1,  2,  3,  4,  5,  6,  7
    punpckhqdq m0, m2             ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    shufps     m2, m4, q3210      ; m2 =  8,  9, 10, 11, 12, 13, 14, 15
    SWAP 4,6                      ; m4 =  0,  1,  2,  3,  4,  5,  6,  7
    punpcklqdq m6, m5, m1         ; m6 = 24, 25, 26, 27, 28, 29, 30, 31
    punpckhqdq m1, m3             ; m1 = 40, 41, 42, 43, 44, 45, 46, 47
    shufps     m3, m5, q3210      ; m3 = 32, 33, 34, 35, 36, 37, 38, 39
    SWAP 5,6                      ; m5 = 24, 25, 26, 27, 28, 29, 30, 31
    mova  [dstq+0*mmsize], m4
    mova  [dstq+1*mmsize], m2
    mova  [dstq+2*mmsize], m0
    mova  [dstq+3*mmsize], m5
    mova  [dstq+4*mmsize], m3
    mova  [dstq+5*mmsize], m1
    add     src0q, mmsize
    add      dstq, mmsize*6
    sub      lend, mmsize/2
%endif
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_S16_6CH
INIT_XMM sse2slow
CONV_S16P_TO_S16_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_S16_6CH
%endif
395 | ||
;------------------------------------------------------------------------------
; void ff_conv_s16p_to_flt_2ch(float *dst, int16_t *const *src, int len,
;                              int channels);
;
; Interleave 2 planar s16 channels into packed float in [-1.0,1.0).
; Trick: unpacking against zeros puts each s16 sample in the HIGH word of a
; dword (value << 16), so multiplying by 2^-31 equals dividing by 2^15.
;------------------------------------------------------------------------------

%macro CONV_S16P_TO_FLT_2CH 0
cglobal conv_s16p_to_flt_2ch, 3,4,6, dst, src0, len, src1
    lea      lenq, [2*lend]          ; byte size per input plane
    mov     src1q, [src0q+gprsize]   ; src1 = src[1]
    mov     src0q, [src0q         ]  ; src0 = src[0]
    lea      dstq, [dstq+4*lenq]     ; float output: 2 planes x 2x element size
    add     src0q, lenq
    add     src1q, lenq
    neg      lenq
    mova       m5, [pf_s32_inv_scale] ; 2^-31 (see header comment)
.loop:
    mova       m2, [src0q+lenq]      ; m2 =  0,  2,  4,  6,  8, 10, 12, 14
    mova       m4, [src1q+lenq]      ; m4 =  1,  3,  5,  7,  9, 11, 13, 15
    SBUTTERFLY2 wd, 2, 4, 3          ; m2 =  0,  1,  2,  3,  4,  5,  6,  7
                                     ; m4 =  8,  9, 10, 11, 12, 13, 14, 15
    pxor       m3, m3
    punpcklwd  m0, m3, m2            ; m0 =      0,      1,      2,      3
    punpckhwd  m1, m3, m2            ; m1 =      4,      5,      6,      7
    punpcklwd  m2, m3, m4            ; m2 =      8,      9,     10,     11
    punpckhwd  m3, m4                ; m3 =     12,     13,     14,     15
    cvtdq2ps   m0, m0
    cvtdq2ps   m1, m1
    cvtdq2ps   m2, m2
    cvtdq2ps   m3, m3
    mulps      m0, m5
    mulps      m1, m5
    mulps      m2, m5
    mulps      m3, m5
    mova  [dstq+4*lenq         ], m0
    mova  [dstq+4*lenq+  mmsize], m1
    mova  [dstq+4*lenq+2*mmsize], m2
    mova  [dstq+4*lenq+3*mmsize], m3
    add      lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_FLT_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_FLT_2CH
%endif
444 | ||
;------------------------------------------------------------------------------
; void ff_conv_s16p_to_flt_6ch(float *dst, int16_t *const *src, int len,
;                              int channels);
;
; Interleave 6 planar s16 channels into packed float in [-1.0,1.0).
; Like the 2ch version, samples are widened into the high word of each dword
; (value << 16) and scaled by 2^-31.  The ssse3 path does the widening and
; reordering in one pshufb; the sse2 path uses pshufd + punpck against zeros.
;------------------------------------------------------------------------------

%macro CONV_S16P_TO_FLT_6CH 0
%if ARCH_X86_64
cglobal conv_s16p_to_flt_6ch, 3,8,8, dst, src, len, src1, src2, src3, src4, src5
%else
cglobal conv_s16p_to_flt_6ch, 2,7,8, dst, src, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    ; keep planes 1-5 as offsets from plane 0 (only srcq is incremented)
    mov     src1q, [srcq+1*gprsize]
    mov     src2q, [srcq+2*gprsize]
    mov     src3q, [srcq+3*gprsize]
    mov     src4q, [srcq+4*gprsize]
    mov     src5q, [srcq+5*gprsize]
    mov      srcq, [srcq]
    sub     src1q, srcq
    sub     src2q, srcq
    sub     src3q, srcq
    sub     src4q, srcq
    sub     src5q, srcq
    mova       m7, [pf_s32_inv_scale] ; 2^-31
%if cpuflag(ssse3)
    %define unpack_even m6
    mova       m6, [pb_shuf_unpack_even]
%if ARCH_X86_64
    %define unpack_odd m8            ; x86-64 has a spare register for it
    mova       m8, [pb_shuf_unpack_odd]
%else
    %define unpack_odd [pb_shuf_unpack_odd] ; x86-32: use a memory operand
%endif
%endif
.loop:
    movq       m0, [srcq      ]      ; m0 =  0,  6, 12, 18,  x,  x,  x,  x
    movq       m1, [srcq+src1q]      ; m1 =  1,  7, 13, 19,  x,  x,  x,  x
    movq       m2, [srcq+src2q]      ; m2 =  2,  8, 14, 20,  x,  x,  x,  x
    movq       m3, [srcq+src3q]      ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
    movq       m4, [srcq+src4q]      ; m4 =  4, 10, 16, 22,  x,  x,  x,  x
    movq       m5, [srcq+src5q]      ; m5 =  5, 11, 17, 23,  x,  x,  x,  x
                                     ; unpack words:
    punpcklwd  m0, m1                ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
    punpcklwd  m2, m3                ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
    punpcklwd  m4, m5                ; m4 =  4,  5, 10, 11, 16, 17, 22, 23
                                     ; blend dwords
    shufps     m1, m4, m0, q3120     ; m1 =  4,  5, 16, 17,  6,  7, 18, 19
    shufps     m0, m2, q2020         ; m0 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps     m2, m4, q3131         ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
%if cpuflag(ssse3)
    pshufb     m3, m0, unpack_odd    ; m3 =     12,     13,     14,     15
    pshufb     m0, unpack_even       ; m0 =      0,      1,      2,      3
    pshufb     m4, m1, unpack_odd    ; m4 =     16,     17,     18,     19
    pshufb     m1, unpack_even       ; m1 =      4,      5,      6,      7
    pshufb     m5, m2, unpack_odd    ; m5 =     20,     21,     22,     23
    pshufb     m2, unpack_even       ; m2 =      8,      9,     10,     11
%else
                                     ; shuffle dwords
    pshufd     m0, m0, q3120         ; m0 =  0,  1,  2,  3, 12, 13, 14, 15
    pshufd     m1, m1, q3120         ; m1 =  4,  5,  6,  7, 16, 17, 18, 19
    pshufd     m2, m2, q3120         ; m2 =  8,  9, 10, 11, 20, 21, 22, 23
    pxor       m6, m6                ; convert s16 in m0-m2 to s32 in m0-m5
    punpcklwd  m3, m6, m0            ; m3 =      0,      1,      2,      3
    punpckhwd  m4, m6, m0            ; m4 =     12,     13,     14,     15
    punpcklwd  m0, m6, m1            ; m0 =      4,      5,      6,      7
    punpckhwd  m5, m6, m1            ; m5 =     16,     17,     18,     19
    punpcklwd  m1, m6, m2            ; m1 =      8,      9,     10,     11
    punpckhwd  m6, m2                ; m6 =     20,     21,     22,     23
    SWAP 6,2,1,0,3,4,5               ; swap registers 3,0,1,4,5,6 to 0,1,2,3,4,5
%endif
    cvtdq2ps   m0, m0                ; convert s32 to float
    cvtdq2ps   m1, m1
    cvtdq2ps   m2, m2
    cvtdq2ps   m3, m3
    cvtdq2ps   m4, m4
    cvtdq2ps   m5, m5
    mulps      m0, m7                ; scale float from s32 range to [-1.0,1.0]
    mulps      m1, m7
    mulps      m2, m7
    mulps      m3, m7
    mulps      m4, m7
    mulps      m5, m7
    mova  [dstq         ], m0
    mova  [dstq+  mmsize], m1
    mova  [dstq+2*mmsize], m2
    mova  [dstq+3*mmsize], m3
    mova  [dstq+4*mmsize], m4
    mova  [dstq+5*mmsize], m5
    add      srcq, mmsize/2
    add      dstq, mmsize*6
    sub      lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_S16P_TO_FLT_6CH
INIT_XMM ssse3
CONV_S16P_TO_FLT_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16P_TO_FLT_6CH
%endif
548 | ||
;------------------------------------------------------------------------------
; void ff_conv_fltp_to_s16_2ch(int16_t *dst, float *const *src, int len,
;                              int channels);
;
; Interleave 2 planar float channels into packed s16: scale by 2^15, convert
; with rounding, pack with signed saturation, then word-interleave L/R.
;------------------------------------------------------------------------------

%macro CONV_FLTP_TO_S16_2CH 0
cglobal conv_fltp_to_s16_2ch, 3,4,3, dst, src0, len, src1
    lea      lenq, [4*lend]          ; byte size per float input plane
    mov     src1q, [src0q+gprsize]   ; src1 = src[1]
    mov     src0q, [src0q         ]  ; src0 = src[0]
    add      dstq, lenq              ; output: 2 planes x half element size
    add     src0q, lenq              ; = same total byte size as one plane
    add     src1q, lenq
    neg      lenq
    mova       m2, [pf_s16_scale]    ; 2^15
%if cpuflag(ssse3)
    mova       m3, [pb_interleave_words]
%endif
.loop:
    mulps      m0, m2, [src0q+lenq]  ; m0 =    0,    2,    4,    6
    mulps      m1, m2, [src1q+lenq]  ; m1 =    1,    3,    5,    7
    cvtps2dq   m0, m0
    cvtps2dq   m1, m1
%if cpuflag(ssse3)
    packssdw   m0, m1                ; m0 = 0, 2, 4, 6, 1, 3, 5, 7
    pshufb     m0, m3                ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
%else
    ; sse2 has no pshufb; pack each half against itself and word-interleave
    packssdw   m0, m0                ; m0 = 0, 2, 4, 6, x, x, x, x
    packssdw   m1, m1                ; m1 = 1, 3, 5, 7, x, x, x, x
    punpcklwd  m0, m1                ; m0 = 0, 1, 2, 3, 4, 5, 6, 7
%endif
    mova  [dstq+lenq], m0
    add      lenq, mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
CONV_FLTP_TO_S16_2CH
INIT_XMM ssse3
CONV_FLTP_TO_S16_2CH
590 | ||
;------------------------------------------------------------------------------
; void ff_conv_fltp_to_s16_6ch(int16_t *dst, float *const *src, int len,
;                              int channels);
;
; Interleave 6 planar float channels into packed s16 (scale 2^15, rounding
; conversion, saturating pack, then a 6-way word transpose).  Sample numbers
; in the lane comments are packed-output positions (6*k + channel).
; The 'sse' variant works on MMX registers via cvtps2pi.
;------------------------------------------------------------------------------

%macro CONV_FLTP_TO_S16_6CH 0
%if ARCH_X86_64
cglobal conv_fltp_to_s16_6ch, 3,8,7, dst, src, len, src1, src2, src3, src4, src5
%else
cglobal conv_fltp_to_s16_6ch, 2,7,7, dst, src, src1, src2, src3, src4, src5
%define lend dword r2m
%endif
    ; keep planes 1-5 as offsets from plane 0 (only srcq is incremented)
    mov     src1q, [srcq+1*gprsize]
    mov     src2q, [srcq+2*gprsize]
    mov     src3q, [srcq+3*gprsize]
    mov     src4q, [srcq+4*gprsize]
    mov     src5q, [srcq+5*gprsize]
    mov      srcq, [srcq]
    sub     src1q, srcq
    sub     src2q, srcq
    sub     src3q, srcq
    sub     src4q, srcq
    sub     src5q, srcq
    movaps   xmm6, [pf_s16_scale]    ; 2^15; explicit xmm: also used by the
                                     ; MMX-register 'sse' variant below
.loop:
%if cpuflag(sse2)
    mulps      m0, m6, [srcq      ]
    mulps      m1, m6, [srcq+src1q]
    mulps      m2, m6, [srcq+src2q]
    mulps      m3, m6, [srcq+src3q]
    mulps      m4, m6, [srcq+src4q]
    mulps      m5, m6, [srcq+src5q]
    cvtps2dq   m0, m0
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    cvtps2dq   m4, m4
    cvtps2dq   m5, m5
    packssdw   m0, m3                ; m0 =  0,  6, 12, 18,  3,  9, 15, 21
    packssdw   m1, m4                ; m1 =  1,  7, 13, 19,  4, 10, 16, 22
    packssdw   m2, m5                ; m2 =  2,  8, 14, 20,  5, 11, 17, 23
                                     ; unpack words:
    movhlps    m3, m0                ; m3 =  3,  9, 15, 21,  x,  x,  x,  x
    punpcklwd  m0, m1                ; m0 =  0,  1,  6,  7, 12, 13, 18, 19
    punpckhwd  m1, m2                ; m1 =  4,  5, 10, 11, 16, 17, 22, 23
    punpcklwd  m2, m3                ; m2 =  2,  3,  8,  9, 14, 15, 20, 21
                                     ; blend dwords:
    shufps     m3, m0, m2, q2020     ; m3 =  0,  1, 12, 13,  2,  3, 14, 15
    shufps     m0, m1, q2031         ; m0 =  6,  7, 18, 19,  4,  5, 16, 17
    shufps     m2, m1, q3131         ; m2 =  8,  9, 20, 21, 10, 11, 22, 23
                                     ; shuffle dwords:
    shufps     m1, m2, m3, q3120     ; m1 =  8,  9, 10, 11, 12, 13, 14, 15
    shufps     m3, m0, q0220         ; m3 =  0,  1,  2,  3,  4,  5,  6,  7
    shufps     m0, m2, q3113         ; m0 = 16, 17, 18, 19, 20, 21, 22, 23
    mova  [dstq+0*mmsize], m3
    mova  [dstq+1*mmsize], m1
    mova  [dstq+2*mmsize], m0
%else ; sse
    ; scale in xmm, convert 2 floats at a time into MMX registers
    movlps   xmm0, [srcq      ]
    movlps   xmm1, [srcq+src1q]
    movlps   xmm2, [srcq+src2q]
    movlps   xmm3, [srcq+src3q]
    movlps   xmm4, [srcq+src4q]
    movlps   xmm5, [srcq+src5q]
    mulps    xmm0, xmm6
    mulps    xmm1, xmm6
    mulps    xmm2, xmm6
    mulps    xmm3, xmm6
    mulps    xmm4, xmm6
    mulps    xmm5, xmm6
    cvtps2pi  mm0, xmm0
    cvtps2pi  mm1, xmm1
    cvtps2pi  mm2, xmm2
    cvtps2pi  mm3, xmm3
    cvtps2pi  mm4, xmm4
    cvtps2pi  mm5, xmm5
    packssdw  mm0, mm3               ; m0 =  0,  6,  3,  9
    packssdw  mm1, mm4               ; m1 =  1,  7,  4, 10
    packssdw  mm2, mm5               ; m2 =  2,  8,  5, 11
                                     ; unpack words
    pshufw    mm3, mm0, q1032        ; m3 =  3,  9,  0,  6
    punpcklwd mm0, mm1               ; m0 =  0,  1,  6,  7
    punpckhwd mm1, mm2               ; m1 =  4,  5, 10, 11
    punpcklwd mm2, mm3               ; m2 =  2,  3,  8,  9
                                     ; unpack dwords
    pshufw    mm3, mm0, q1032        ; m3 =  6,  7,  0,  1
    punpckldq mm0, mm2               ; m0 =  0,  1,  2,  3 (final)
    punpckhdq mm2, mm1               ; m2 =  8,  9, 10, 11 (final)
    punpckldq mm1, mm3               ; m1 =  4,  5,  6,  7 (final)
    mova  [dstq+0*mmsize], mm0
    mova  [dstq+1*mmsize], mm1
    mova  [dstq+2*mmsize], mm2
%endif
    add      srcq, mmsize
    add      dstq, mmsize*3
    sub      lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms                             ; MMX-register variant must reset x87 state
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX sse
CONV_FLTP_TO_S16_6CH
INIT_XMM sse2
CONV_FLTP_TO_S16_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLTP_TO_S16_6CH
%endif
704 | ||
;------------------------------------------------------------------------------
; void ff_conv_fltp_to_flt_2ch(float *dst, float *const *src, int len,
;                              int channels);
;
; Interleave 2 planar float channels into packed float (L0 R0 L1 R1 ...).
;------------------------------------------------------------------------------

%macro CONV_FLTP_TO_FLT_2CH 0
cglobal conv_fltp_to_flt_2ch, 3,4,5, dst, src0, len, src1
    mov     src1q, [src0q+gprsize]   ; src1 = src[1]
    mov     src0q, [src0q]           ; src0 = src[0]
    lea      lenq, [4*lend]          ; byte size per input plane
    add     src0q, lenq
    add     src1q, lenq
    lea      dstq, [dstq+2*lenq]     ; packed output is 2 planes wide
    neg      lenq
.loop:
    mova       m0, [src0q+lenq       ]
    mova       m1, [src1q+lenq       ]
    mova       m2, [src0q+lenq+mmsize]
    mova       m3, [src1q+lenq+mmsize]
    SBUTTERFLYPS 0, 1, 4             ; dword-interleave the two channels
    SBUTTERFLYPS 2, 3, 4             ; (m4 is scratch)
    mova  [dstq+2*lenq+0*mmsize], m0
    mova  [dstq+2*lenq+1*mmsize], m1
    mova  [dstq+2*lenq+2*mmsize], m2
    mova  [dstq+2*lenq+3*mmsize], m3
    add      lenq, 2*mmsize
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
CONV_FLTP_TO_FLT_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLTP_TO_FLT_2CH
%endif
741 | ||
;-----------------------------------------------------------------------------
; void ff_conv_fltp_to_flt_6ch(float *dst, float *const *src, int len,
;                              int channels);
;
; Interleave 6 planar float channels into packed float: a 4-sample x
; 6-channel transpose per iteration (one mmsize load from each plane).
;-----------------------------------------------------------------------------

%macro CONV_FLTP_TO_FLT_6CH 0
cglobal conv_fltp_to_flt_6ch, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov      lend, r2d
%else
    %define lend dword r2m
%endif
    ; keep planes 1-5 as offsets from plane 0 (only srcq is incremented)
    mov     src1q, [srcq+1*gprsize]
    mov     src2q, [srcq+2*gprsize]
    mov     src3q, [srcq+3*gprsize]
    mov     src4q, [srcq+4*gprsize]
    mov     src5q, [srcq+5*gprsize]
    mov      srcq, [srcq]
    sub     src1q, srcq
    sub     src2q, srcq
    sub     src3q, srcq
    sub     src4q, srcq
    sub     src5q, srcq
.loop:
    mova       m0, [srcq      ]      ; 4 samples of channel 0
    mova       m1, [srcq+src1q]      ; ... channel 1
    mova       m2, [srcq+src2q]      ; ... channel 2
    mova       m3, [srcq+src3q]      ; ... channel 3
    mova       m4, [srcq+src4q]      ; ... channel 4
    mova       m5, [srcq+src5q]      ; ... channel 5
%if cpuflag(sse4)
    ; pair up channels (0/1, 2/3, 4/5), then assemble each output sample's
    ; 6 consecutive channel values with blend/movlhps/movhlps
    SBUTTERFLYPS 0, 1, 6
    SBUTTERFLYPS 2, 3, 6
    SBUTTERFLYPS 4, 5, 6

    blendps    m6, m4, m0, 1100b
    movlhps    m0, m2
    movhlps    m4, m2
    blendps    m2, m5, m1, 1100b
    movlhps    m1, m3
    movhlps    m5, m3

    movaps [dstq   ], m0
    movaps [dstq+16], m6
    movaps [dstq+32], m4
    movaps [dstq+48], m1
    movaps [dstq+64], m2
    movaps [dstq+80], m5
%else ; mmx
    ; MMX regs hold 2 samples each; dword-interleave the channel pairs
    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6

    movq  [dstq   ], m0
    movq  [dstq+ 8], m2
    movq  [dstq+16], m4
    movq  [dstq+24], m1
    movq  [dstq+32], m3
    movq  [dstq+40], m5
%endif
    add      srcq, mmsize
    add      dstq, mmsize*6
    sub      lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms                             ; MMX variant must reset x87 state
    RET
%else
    REP_RET
%endif
%endmacro

INIT_MMX mmx
CONV_FLTP_TO_FLT_6CH
INIT_XMM sse4
CONV_FLTP_TO_FLT_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLTP_TO_FLT_6CH
%endif
822 | ||
823 | ;------------------------------------------------------------------------------ | |
824 | ; void ff_conv_s16_to_s16p_2ch(int16_t *const *dst, int16_t *src, int len, | |
825 | ; int channels); | |
826 | ;------------------------------------------------------------------------------ | |
827 | ||
; Deinterleave packed stereo s16 into two planar s16 buffers.
; All pointers are advanced to their end and indexed with the negative byte
; offset lenq, which doubles as the loop counter (counts up to 0).
%macro CONV_S16_TO_S16P_2CH 0
cglobal conv_s16_to_s16p_2ch, 3,4,4, dst0, src, len, dst1
    lea       lenq, [2*lend]              ; lenq = output bytes per plane
    mov      dst1q, [dst0q+gprsize]       ; dst1 = dst[1]
    mov      dst0q, [dst0q        ]       ; dst0 = dst[0]
    lea       srcq, [srcq+2*lenq]         ; interleaved input = 2x plane size
    add      dst0q, lenq
    add      dst1q, lenq
    neg       lenq
%if cpuflag(ssse3)
    mova        m3, [pb_deinterleave_words] ; loop-invariant shuffle mask
%endif
.loop:
    mova        m0, [srcq+2*lenq       ]  ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
    mova        m1, [srcq+2*lenq+mmsize]  ; m1 =  8,  9, 10, 11, 12, 13, 14, 15
%if cpuflag(ssse3)
    pshufb      m0, m3                    ; m0 =  0,  2,  4,  6,  1,  3,  5,  7
    pshufb      m1, m3                    ; m1 =  8, 10, 12, 14,  9, 11, 13, 15
    SBUTTERFLY2 qdq, 0, 1, 2              ; m0 =  0,  2,  4,  6,  8, 10, 12, 14
                                          ; m1 =  1,  3,  5,  7,  9, 11, 13, 15
%else ; sse2
    pshuflw     m0, m0, q3120             ; m0 =  0,  2,  1,  3,  4,  5,  6,  7
    pshufhw     m0, m0, q3120             ; m0 =  0,  2,  1,  3,  4,  6,  5,  7
    pshuflw     m1, m1, q3120             ; m1 =  8, 10,  9, 11, 12, 13, 14, 15
    pshufhw     m1, m1, q3120             ; m1 =  8, 10,  9, 11, 12, 14, 13, 15
    DEINT2_PS    0, 1, 2                  ; m0 =  0,  2,  4,  6,  8, 10, 12, 14
                                          ; m1 =  1,  3,  5,  7,  9, 11, 13, 15
%endif
    mova  [dst0q+lenq], m0                ; even samples -> channel 0 plane
    mova  [dst1q+lenq], m1                ; odd samples  -> channel 1 plane
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro
862 | ||
; Instantiate s16 -> planar s16 2ch conversion per instruction set.
INIT_XMM sse2
CONV_S16_TO_S16P_2CH
INIT_XMM ssse3
CONV_S16_TO_S16P_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_S16P_2CH
%endif
871 | ||
872 | ;------------------------------------------------------------------------------ | |
873 | ; void ff_conv_s16_to_s16p_6ch(int16_t *const *dst, int16_t *src, int len, | |
874 | ; int channels); | |
875 | ;------------------------------------------------------------------------------ | |
876 | ||
; Deinterleave packed 6-channel s16 into six planar s16 buffers.
; The five secondary dst pointers are converted to byte offsets relative to
; dst[0], so only one pointer (dstq) is advanced in the loop.
%macro CONV_S16_TO_S16P_6CH 0
%if ARCH_X86_64
cglobal conv_s16_to_s16p_6ch, 3,8,5, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_s16_to_s16p_6ch, 2,7,5, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m              ; x86-32: not enough GPRs, keep len in memory
%endif
    mov     dst1q, [dstq+  gprsize]
    mov     dst2q, [dstq+2*gprsize]
    mov     dst3q, [dstq+3*gprsize]
    mov     dst4q, [dstq+4*gprsize]
    mov     dst5q, [dstq+5*gprsize]
    mov      dstq, [dstq          ]
    sub     dst1q, dstq             ; dstNq = dst[N] - dst[0] (byte offsets)
    sub     dst2q, dstq
    sub     dst3q, dstq
    sub     dst4q, dstq
    sub     dst5q, dstq
.loop:
    mova       m0, [srcq+0*mmsize]  ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
    mova       m3, [srcq+1*mmsize]  ; m3 =  8,  9, 10, 11, 12, 13, 14, 15
    mova       m2, [srcq+2*mmsize]  ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR    m1, m3, m0, 12, m4   ; m1 =  6,  7,  8,  9, 10, 11,  x,  x
    shufps     m3, m2, q1032        ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq     m2, 4                ; m2 = 18, 19, 20, 21, 22, 23,  x,  x
    SBUTTERFLY2 wd, 0, 1, 4         ; m0 =  0,  6,  1,  7,  2,  8,  3,  9
                                    ; m1 =  4, 10,  5, 11,  x,  x,  x,  x
    SBUTTERFLY2 wd, 3, 2, 4         ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
                                    ; m2 = 16, 22, 17, 23,  x,  x,  x,  x
    SBUTTERFLY2 dq, 0, 3, 4         ; m0 =  0,  6, 12, 18,  1,  7, 13, 19
                                    ; m3 =  2,  8, 14, 20,  3,  9, 15, 21
    punpckldq  m1, m2               ; m1 =  4, 10, 16, 22,  5, 11, 17, 23
    movq    [dstq      ], m0        ; low/high halves hold 4 s16 per channel
    movhps  [dstq+dst1q], m0
    movq    [dstq+dst2q], m3
    movhps  [dstq+dst3q], m3
    movq    [dstq+dst4q], m1
    movhps  [dstq+dst5q], m1
    add      srcq, mmsize*3         ; consumed 24 samples (4 frames x 6ch)
    add      dstq, mmsize/2         ; wrote 4 s16 (8 bytes) per plane
    sub      lend, mmsize/4         ; 4 frames per iteration
    jg .loop
    REP_RET
%endmacro
921 | ||
; Instantiate s16 -> planar s16 6ch conversion per instruction set.
INIT_XMM sse2
CONV_S16_TO_S16P_6CH
INIT_XMM ssse3
CONV_S16_TO_S16P_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_S16P_6CH
%endif
930 | ||
931 | ;------------------------------------------------------------------------------ | |
932 | ; void ff_conv_s16_to_fltp_2ch(float *const *dst, int16_t *src, int len, | |
933 | ; int channels); | |
934 | ;------------------------------------------------------------------------------ | |
935 | ||
; Convert packed stereo s16 to two planar float buffers.
; Trick: each channel's 16-bit sample is placed in the HIGH half of a dword
; (value = sample << 16), converted with cvtdq2ps, then scaled by 2^-31,
; which is equivalent to sample / 32768.0.
%macro CONV_S16_TO_FLTP_2CH 0
cglobal conv_s16_to_fltp_2ch, 3,4,5, dst0, src, len, dst1
    lea       lenq, [4*lend]            ; lenq = output bytes per float plane
    mov      dst1q, [dst0q+gprsize]     ; dst1 = dst[1]
    mov      dst0q, [dst0q        ]     ; dst0 = dst[0]
    add       srcq, lenq                ; s16 stereo input is also 4*len bytes
    add      dst0q, lenq
    add      dst1q, lenq
    neg       lenq
    mova        m3, [pf_s32_inv_scale]  ; 2^-31
    mova        m4, [pw_zero_even]      ; keeps only the odd (ch1) word per dword
.loop:
    mova        m1, [srcq+lenq]
    pslld       m0, m1, 16              ; ch0 word -> high half: int32 = s << 16
    pand        m1, m4                  ; ch1 word already in the high half
    cvtdq2ps    m0, m0
    cvtdq2ps    m1, m1
    mulps       m0, m0, m3              ; (s << 16) * 2^-31 == s / 32768
    mulps       m1, m1, m3
    mova  [dst0q+lenq], m0
    mova  [dst1q+lenq], m1
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro
961 | ||
; Instantiate s16 -> planar float 2ch conversion per instruction set.
INIT_XMM sse2
CONV_S16_TO_FLTP_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_FLTP_2CH
%endif
968 | ||
969 | ;------------------------------------------------------------------------------ | |
970 | ; void ff_conv_s16_to_fltp_6ch(float *const *dst, int16_t *src, int len, | |
971 | ; int channels); | |
972 | ;------------------------------------------------------------------------------ | |
973 | ||
; Convert packed 6-channel s16 to six planar float buffers.
; Secondary dst pointers are pre-converted to byte offsets from dst[0] so a
; single pointer advance serves all six planes. The s16 samples are first
; deinterleaved (same shuffle network as CONV_S16_TO_S16P_6CH), sign-extended
; to s32, converted to float and scaled by 2^-15.
%macro CONV_S16_TO_FLTP_6CH 0
%if ARCH_X86_64
cglobal conv_s16_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_s16_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m              ; x86-32: not enough GPRs, keep len in memory
%endif
    mov     dst1q, [dstq+  gprsize]
    mov     dst2q, [dstq+2*gprsize]
    mov     dst3q, [dstq+3*gprsize]
    mov     dst4q, [dstq+4*gprsize]
    mov     dst5q, [dstq+5*gprsize]
    mov      dstq, [dstq          ]
    sub     dst1q, dstq             ; dstNq = dst[N] - dst[0] (byte offsets)
    sub     dst2q, dstq
    sub     dst3q, dstq
    sub     dst4q, dstq
    sub     dst5q, dstq
    mova       m6, [pf_s16_inv_scale] ; 2^-15: maps s16 range to [-1.0, 1.0)
.loop:
    mova       m0, [srcq+0*mmsize]  ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
    mova       m3, [srcq+1*mmsize]  ; m3 =  8,  9, 10, 11, 12, 13, 14, 15
    mova       m2, [srcq+2*mmsize]  ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
    PALIGNR    m1, m3, m0, 12, m4   ; m1 =  6,  7,  8,  9, 10, 11,  x,  x
    shufps     m3, m2, q1032        ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq     m2, 4                ; m2 = 18, 19, 20, 21, 22, 23,  x,  x
    SBUTTERFLY2 wd, 0, 1, 4         ; m0 =  0,  6,  1,  7,  2,  8,  3,  9
                                    ; m1 =  4, 10,  5, 11,  x,  x,  x,  x
    SBUTTERFLY2 wd, 3, 2, 4         ; m3 = 12, 18, 13, 19, 14, 20, 15, 21
                                    ; m2 = 16, 22, 17, 23,  x,  x,  x,  x
    SBUTTERFLY2 dq, 0, 3, 4         ; m0 =  0,  6, 12, 18,  1,  7, 13, 19
                                    ; m3 =  2,  8, 14, 20,  3,  9, 15, 21
    punpckldq  m1, m2               ; m1 =  4, 10, 16, 22,  5, 11, 17, 23
    S16_TO_S32_SX 0, 2              ; sign-extend s16 -> s32:
                                    ; m0 =  0,  6, 12, 18
                                    ; m2 =  1,  7, 13, 19
    S16_TO_S32_SX 3, 4              ; m3 =  2,  8, 14, 20
                                    ; m4 =  3,  9, 15, 21
    S16_TO_S32_SX 1, 5              ; m1 =  4, 10, 16, 22
                                    ; m5 =  5, 11, 17, 23
    SWAP 1,2,3,4                    ; rotate so m0..m5 = ch0..ch5 in order
    cvtdq2ps   m0, m0
    cvtdq2ps   m1, m1
    cvtdq2ps   m2, m2
    cvtdq2ps   m3, m3
    cvtdq2ps   m4, m4
    cvtdq2ps   m5, m5
    mulps      m0, m6
    mulps      m1, m6
    mulps      m2, m6
    mulps      m3, m6
    mulps      m4, m6
    mulps      m5, m6
    mova  [dstq      ], m0
    mova  [dstq+dst1q], m1
    mova  [dstq+dst2q], m2
    mova  [dstq+dst3q], m3
    mova  [dstq+dst4q], m4
    mova  [dstq+dst5q], m5
    add      srcq, mmsize*3         ; consumed 24 s16 samples (4 frames)
    add      dstq, mmsize           ; wrote 4 floats per plane
    sub      lend, mmsize/4         ; 4 frames per iteration
    jg .loop
    REP_RET
%endmacro
1038 | ||
; Instantiate s16 -> planar float 6ch conversion per instruction set.
INIT_XMM sse2
CONV_S16_TO_FLTP_6CH
INIT_XMM ssse3
CONV_S16_TO_FLTP_6CH
INIT_XMM sse4
CONV_S16_TO_FLTP_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_S16_TO_FLTP_6CH
%endif
1049 | ||
1050 | ;------------------------------------------------------------------------------ | |
1051 | ; void ff_conv_flt_to_s16p_2ch(int16_t *const *dst, float *src, int len, | |
1052 | ; int channels); | |
1053 | ;------------------------------------------------------------------------------ | |
1054 | ||
; Convert packed stereo float to two planar s16 buffers.
; Scale by 32768, convert to s32 and pack with signed saturation.
%macro CONV_FLT_TO_S16P_2CH 0
cglobal conv_flt_to_s16p_2ch, 3,4,6, dst0, src, len, dst1
    lea       lenq, [2*lend]            ; lenq = output bytes per s16 plane
    mov      dst1q, [dst0q+gprsize]     ; dst1 = dst[1]
    mov      dst0q, [dst0q        ]     ; dst0 = dst[0]
    lea       srcq, [srcq+4*lenq]       ; float stereo input = 4x plane size
    add      dst0q, lenq
    add      dst1q, lenq
    neg       lenq
    mova        m5, [pf_s16_scale]      ; 32768.0
.loop:
    mova        m0, [srcq+4*lenq         ]
    mova        m1, [srcq+4*lenq+  mmsize]
    mova        m2, [srcq+4*lenq+2*mmsize]
    mova        m3, [srcq+4*lenq+3*mmsize]
    DEINT2_PS    0, 1, 4                ; split L/R of first two vectors
    DEINT2_PS    2, 3, 4                ; split L/R of next two vectors
    mulps       m0, m0, m5
    mulps       m1, m1, m5
    mulps       m2, m2, m5
    mulps       m3, m3, m5
    cvtps2dq    m0, m0
    cvtps2dq    m1, m1
    cvtps2dq    m2, m2
    cvtps2dq    m3, m3
    packssdw    m0, m2                  ; saturate s32 -> s16, channel 0
    packssdw    m1, m3                  ; saturate s32 -> s16, channel 1
    mova  [dst0q+lenq], m0
    mova  [dst1q+lenq], m1
    add       lenq, mmsize
    jl .loop
    REP_RET
%endmacro
1088 | ||
; Instantiate float -> planar s16 2ch conversion per instruction set.
INIT_XMM sse2
CONV_FLT_TO_S16P_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_S16P_2CH
%endif
1095 | ||
1096 | ;------------------------------------------------------------------------------ | |
1097 | ; void ff_conv_flt_to_s16p_6ch(int16_t *const *dst, float *src, int len, | |
1098 | ; int channels); | |
1099 | ;------------------------------------------------------------------------------ | |
1100 | ||
; Convert packed 6-channel float to six planar s16 buffers.
; Scale/convert/pack first, then deinterleave the resulting s16 samples with
; the same shuffle network used by CONV_S16_TO_S16P_6CH. Secondary dst
; pointers are pre-converted to byte offsets from dst[0].
%macro CONV_FLT_TO_S16P_6CH 0
%if ARCH_X86_64
cglobal conv_flt_to_s16p_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_flt_to_s16p_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m              ; x86-32: not enough GPRs, keep len in memory
%endif
    mov     dst1q, [dstq+  gprsize]
    mov     dst2q, [dstq+2*gprsize]
    mov     dst3q, [dstq+3*gprsize]
    mov     dst4q, [dstq+4*gprsize]
    mov     dst5q, [dstq+5*gprsize]
    mov      dstq, [dstq          ]
    sub     dst1q, dstq             ; dstNq = dst[N] - dst[0] (byte offsets)
    sub     dst2q, dstq
    sub     dst3q, dstq
    sub     dst4q, dstq
    sub     dst5q, dstq
    mova       m6, [pf_s16_scale]   ; 32768.0
.loop:
    mulps      m0, m6, [srcq+0*mmsize] ; scale to s16 range while loading
    mulps      m3, m6, [srcq+1*mmsize]
    mulps      m1, m6, [srcq+2*mmsize]
    mulps      m4, m6, [srcq+3*mmsize]
    mulps      m2, m6, [srcq+4*mmsize]
    mulps      m5, m6, [srcq+5*mmsize]
    cvtps2dq   m0, m0
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    cvtps2dq   m4, m4
    cvtps2dq   m5, m5
    packssdw   m0, m3               ; saturating pack; m0 =  0 .. 7
    packssdw   m1, m4               ; m1 =  8 .. 15
    packssdw   m2, m5               ; m2 = 16 .. 23
    PALIGNR    m3, m1, m0, 12, m4   ; m3 =  6,  7,  8,  9, 10, 11,  x,  x
    shufps     m1, m2, q1032        ; m1 = 12, 13, 14, 15, 16, 17, 18, 19
    psrldq     m2, 4                ; m2 = 18, 19, 20, 21, 22, 23,  x,  x
    SBUTTERFLY2 wd, 0, 3, 4         ; m0 =  0,  6,  1,  7,  2,  8,  3,  9
                                    ; m3 =  4, 10,  5, 11,  x,  x,  x,  x
    SBUTTERFLY2 wd, 1, 2, 4         ; m1 = 12, 18, 13, 19, 14, 20, 15, 21
                                    ; m2 = 16, 22, 17, 23,  x,  x,  x,  x
    SBUTTERFLY2 dq, 0, 1, 4         ; m0 =  0,  6, 12, 18,  1,  7, 13, 19
                                    ; m1 =  2,  8, 14, 20,  3,  9, 15, 21
    punpckldq  m3, m2               ; m3 =  4, 10, 16, 22,  5, 11, 17, 23
    movq    [dstq      ], m0        ; low/high halves hold 4 s16 per channel
    movhps  [dstq+dst1q], m0
    movq    [dstq+dst2q], m1
    movhps  [dstq+dst3q], m1
    movq    [dstq+dst4q], m3
    movhps  [dstq+dst5q], m3
    add      srcq, mmsize*6         ; consumed 24 floats (4 frames x 6ch)
    add      dstq, mmsize/2         ; wrote 4 s16 (8 bytes) per plane
    sub      lend, mmsize/4         ; 4 frames per iteration
    jg .loop
    REP_RET
%endmacro
1158 | ||
; Instantiate float -> planar s16 6ch conversion per instruction set.
INIT_XMM sse2
CONV_FLT_TO_S16P_6CH
INIT_XMM ssse3
CONV_FLT_TO_S16P_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_S16P_6CH
%endif
1167 | ||
1168 | ;------------------------------------------------------------------------------ | |
1169 | ; void ff_conv_flt_to_fltp_2ch(float *const *dst, float *src, int len, | |
1170 | ; int channels); | |
1171 | ;------------------------------------------------------------------------------ | |
1172 | ||
; Deinterleave packed stereo float into two planar float buffers.
; Pointers are advanced to their ends and indexed with the negative byte
; offset lenq, which doubles as the loop counter.
%macro CONV_FLT_TO_FLTP_2CH 0
cglobal conv_flt_to_fltp_2ch, 3,4,3, dst0, src, len, dst1
    lea     lenq, [4*lend]              ; lenq = output bytes per plane
    mov    dst1q, [dst0q+gprsize]       ; dst1 = dst[1]
    mov    dst0q, [dst0q        ]       ; dst0 = dst[0]
    lea     srcq, [srcq+2*lenq]         ; interleaved input = 2x plane size
    add    dst0q, lenq
    add    dst1q, lenq
    neg     lenq
.loop:
    mova      m0, [srcq+2*lenq       ]
    mova      m1, [srcq+2*lenq+mmsize]
    DEINT2_PS  0, 1, 2                  ; m0 = left samples, m1 = right samples
    mova  [dst0q+lenq], m0
    mova  [dst1q+lenq], m1
    add     lenq, mmsize
    jl .loop
    REP_RET
%endmacro
1192 | ||
; Instantiate float -> planar float 2ch conversion per instruction set.
INIT_XMM sse
CONV_FLT_TO_FLTP_2CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_FLTP_2CH
%endif
1199 | ||
1200 | ;------------------------------------------------------------------------------ | |
1201 | ; void ff_conv_flt_to_fltp_6ch(float *const *dst, float *src, int len, | |
1202 | ; int channels); | |
1203 | ;------------------------------------------------------------------------------ | |
1204 | ||
; Deinterleave packed 6-channel float into six planar float buffers using a
; two-stage dword butterfly network (6-way transpose of 4 frames). Secondary
; dst pointers are pre-converted to byte offsets from dst[0].
%macro CONV_FLT_TO_FLTP_6CH 0
%if ARCH_X86_64
cglobal conv_flt_to_fltp_6ch, 3,8,7, dst, src, len, dst1, dst2, dst3, dst4, dst5
%else
cglobal conv_flt_to_fltp_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5
%define lend dword r2m              ; x86-32: not enough GPRs, keep len in memory
%endif
    mov     dst1q, [dstq+  gprsize]
    mov     dst2q, [dstq+2*gprsize]
    mov     dst3q, [dstq+3*gprsize]
    mov     dst4q, [dstq+4*gprsize]
    mov     dst5q, [dstq+5*gprsize]
    mov      dstq, [dstq          ]
    sub     dst1q, dstq             ; dstNq = dst[N] - dst[0] (byte offsets)
    sub     dst2q, dstq
    sub     dst3q, dstq
    sub     dst4q, dstq
    sub     dst5q, dstq
.loop:
    mova       m0, [srcq+0*mmsize]  ; m0 =  0,  1,  2,  3
    mova       m1, [srcq+1*mmsize]  ; m1 =  4,  5,  6,  7
    mova       m2, [srcq+2*mmsize]  ; m2 =  8,  9, 10, 11
    mova       m3, [srcq+3*mmsize]  ; m3 = 12, 13, 14, 15
    mova       m4, [srcq+4*mmsize]  ; m4 = 16, 17, 18, 19
    mova       m5, [srcq+5*mmsize]  ; m5 = 20, 21, 22, 23

    ; stage 1: interleave vectors 3 apart
    SBUTTERFLY2 dq, 0, 3, 6         ; m0 =  0, 12,  1, 13
                                    ; m3 =  2, 14,  3, 15
    SBUTTERFLY2 dq, 1, 4, 6         ; m1 =  4, 16,  5, 17
                                    ; m4 =  6, 18,  7, 19
    SBUTTERFLY2 dq, 2, 5, 6         ; m2 =  8, 20,  9, 21
                                    ; m5 = 10, 22, 11, 23
    ; stage 2: finish the transpose; each reg now holds one channel
    SBUTTERFLY2 dq, 0, 4, 6         ; m0 =  0,  6, 12, 18 (ch0)
                                    ; m4 =  1,  7, 13, 19 (ch1)
    SBUTTERFLY2 dq, 3, 2, 6         ; m3 =  2,  8, 14, 20 (ch2)
                                    ; m2 =  3,  9, 15, 21 (ch3)
    SBUTTERFLY2 dq, 1, 5, 6         ; m1 =  4, 10, 16, 22 (ch4)
                                    ; m5 =  5, 11, 17, 23 (ch5)
    mova  [dstq      ], m0
    mova  [dstq+dst1q], m4
    mova  [dstq+dst2q], m3
    mova  [dstq+dst3q], m2
    mova  [dstq+dst4q], m1
    mova  [dstq+dst5q], m5
    add      srcq, mmsize*6         ; consumed 24 floats (4 frames x 6ch)
    add      dstq, mmsize           ; wrote 4 floats per plane
    sub      lend, mmsize/4         ; 4 frames per iteration
    jg .loop
    REP_RET
%endmacro
1255 | ||
; Instantiate float -> planar float 6ch conversion per instruction set.
INIT_XMM sse2
CONV_FLT_TO_FLTP_6CH
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CONV_FLT_TO_FLTP_6CH
%endif