;******************************************************************************
;* x86 optimized channel mixing
;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"
%include "util.asm"

SECTION_TEXT

;-----------------------------------------------------------------------------
; void ff_mix_2_to_1_fltp_flt(float **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;-----------------------------------------------------------------------------
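; In-place 2 -> 1 downmix of planar float data, roughly the scalar equivalent
; of this illustrative C sketch (not part of the original source):
;
;     for (i = 0; i < len; i++)
;         src[0][i] = src[0][i] * matrix[0][0] + src[1][i] * matrix[0][1];
;
; src[1] is converted to a byte offset from src[0] so only one pointer is
; advanced in the loop; two vectors are processed per iteration.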

%macro MIX_2_TO_1_FLTP_FLT 0
cglobal mix_2_to_1_fltp_flt, 3,4,6, src, matrix, len, src1
    mov src1q, [srcq+gprsize]
    mov srcq, [srcq]
    sub src1q, srcq
    mov matrixq, [matrixq]
    VBROADCASTSS m4, [matrixq]
    VBROADCASTSS m5, [matrixq+4]
ALIGN 16
.loop:
    mulps m0, m4, [srcq]
    mulps m1, m5, [srcq+src1q]
    mulps m2, m4, [srcq+mmsize]
    mulps m3, m5, [srcq+src1q+mmsize]
    addps m0, m0, m1
    addps m2, m2, m3
    mova [srcq], m0
    mova [srcq+mmsize], m2
    add srcq, mmsize*2
    sub lend, mmsize*2/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse
MIX_2_TO_1_FLTP_FLT
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
MIX_2_TO_1_FLTP_FLT
%endif

;-----------------------------------------------------------------------------
; void ff_mix_2_to_1_s16p_flt(int16_t **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;-----------------------------------------------------------------------------
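; Same 2 -> 1 downmix for planar s16 input with float coefficients: samples
; are sign-extended to 32 bits, converted to float, scaled and summed, then
; converted back and packed with signed saturation. Roughly (illustrative C
; sketch, not part of the original source):
;
;     for (i = 0; i < len; i++)
;         src[0][i] = av_clip_int16(lrintf(src[0][i] * matrix[0][0] +
;                                          src[1][i] * matrix[0][1]));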

%macro MIX_2_TO_1_S16P_FLT 0
cglobal mix_2_to_1_s16p_flt, 3,4,6, src, matrix, len, src1
    mov src1q, [srcq+gprsize]
    mov srcq, [srcq]
    sub src1q, srcq
    mov matrixq, [matrixq]
    VBROADCASTSS m4, [matrixq]
    VBROADCASTSS m5, [matrixq+4]
ALIGN 16
.loop:
    mova m0, [srcq]
    mova m2, [srcq+src1q]
    S16_TO_S32_SX 0, 1
    S16_TO_S32_SX 2, 3
    cvtdq2ps m0, m0
    cvtdq2ps m1, m1
    cvtdq2ps m2, m2
    cvtdq2ps m3, m3
    mulps m0, m4
    mulps m1, m4
    mulps m2, m5
    mulps m3, m5
    addps m0, m2
    addps m1, m3
    cvtps2dq m0, m0
    cvtps2dq m1, m1
    packssdw m0, m1
    mova [srcq], m0
    add srcq, mmsize
    sub lend, mmsize/2
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
MIX_2_TO_1_S16P_FLT
INIT_XMM sse4
MIX_2_TO_1_S16P_FLT

;-----------------------------------------------------------------------------
; void ff_mix_2_to_1_s16p_q8(int16_t **src, int16_t **matrix, int len,
;                            int out_ch, int in_ch);
;-----------------------------------------------------------------------------
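; 2 -> 1 downmix for planar s16 input with Q8 fixed-point coefficients,
; roughly (illustrative C sketch, not part of the original source):
;
;     for (i = 0; i < len; i++)
;         src[0][i] = av_clip_int16((src[0][i] * matrix[0][0] +
;                                    src[1][i] * matrix[0][1]) >> 8);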

INIT_XMM sse2
cglobal mix_2_to_1_s16p_q8, 3,4,6, src, matrix, len, src1
    mov src1q, [srcq+gprsize]
    mov srcq, [srcq]
    sub src1q, srcq
    mov matrixq, [matrixq]
    movd m4, [matrixq]
    movd m5, [matrixq]
    SPLATW m4, m4, 0
    SPLATW m5, m5, 1
    pxor m0, m0
    punpcklwd m4, m0
    punpcklwd m5, m0
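    ; after the interleave with zeros each dword lane holds {coeff, 0}, so
    ; PMADDWD against a duplicated sample pair {s, s} in the loop below
    ; yields the 32-bit product s*coeff per lane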
ALIGN 16
.loop:
    mova m0, [srcq]
    mova m2, [srcq+src1q]
    punpckhwd m1, m0, m0
    punpcklwd m0, m0
    punpckhwd m3, m2, m2
    punpcklwd m2, m2
    pmaddwd m0, m4
    pmaddwd m1, m4
    pmaddwd m2, m5
    pmaddwd m3, m5
    paddd m0, m2
    paddd m1, m3
    psrad m0, 8
    psrad m1, 8
    packssdw m0, m1
    mova [srcq], m0
    add srcq, mmsize
    sub lend, mmsize/2
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_mix_1_to_2_fltp_flt(float **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;-----------------------------------------------------------------------------
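; In-place 1 -> 2 upmix of planar float data, roughly (illustrative C sketch,
; not part of the original source):
;
;     for (i = 0; i < len; i++) {
;         src[1][i] = src[0][i] * matrix[1][0];
;         src[0][i] = src[0][i] * matrix[0][0];
;     }
;
; The input sample is loaded once and both outputs are computed from the
; register copy, so the in-place store to src[0] is safe.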

%macro MIX_1_TO_2_FLTP_FLT 0
cglobal mix_1_to_2_fltp_flt, 3,5,4, src0, matrix0, len, src1, matrix1
    mov src1q, [src0q+gprsize]
    mov src0q, [src0q]
    sub src1q, src0q
    mov matrix1q, [matrix0q+gprsize]
    mov matrix0q, [matrix0q]
    VBROADCASTSS m2, [matrix0q]
    VBROADCASTSS m3, [matrix1q]
ALIGN 16
.loop:
    mova m0, [src0q]
    mulps m1, m0, m3
    mulps m0, m0, m2
    mova [src0q], m0
    mova [src0q+src1q], m1
    add src0q, mmsize
    sub lend, mmsize/4
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse
MIX_1_TO_2_FLTP_FLT
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
MIX_1_TO_2_FLTP_FLT
%endif

;-----------------------------------------------------------------------------
; void ff_mix_1_to_2_s16p_flt(int16_t **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;-----------------------------------------------------------------------------
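; 1 -> 2 upmix for planar s16 input with float coefficients: the same
; s16 -> float -> scale -> s16 pipeline as the 2 -> 1 s16p version, writing
; one scaled copy to each output channel. Roughly (illustrative C sketch):
;
;     for (i = 0; i < len; i++) {
;         src[1][i] = av_clip_int16(lrintf(src[0][i] * matrix[1][0]));
;         src[0][i] = av_clip_int16(lrintf(src[0][i] * matrix[0][0]));
;     }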

%macro MIX_1_TO_2_S16P_FLT 0
cglobal mix_1_to_2_s16p_flt, 3,5,6, src0, matrix0, len, src1, matrix1
    mov src1q, [src0q+gprsize]
    mov src0q, [src0q]
    sub src1q, src0q
    mov matrix1q, [matrix0q+gprsize]
    mov matrix0q, [matrix0q]
    VBROADCASTSS m4, [matrix0q]
    VBROADCASTSS m5, [matrix1q]
ALIGN 16
.loop:
    mova m0, [src0q]
    S16_TO_S32_SX 0, 2
    cvtdq2ps m0, m0
    cvtdq2ps m2, m2
    mulps m1, m0, m5
    mulps m0, m0, m4
    mulps m3, m2, m5
    mulps m2, m2, m4
    cvtps2dq m0, m0
    cvtps2dq m1, m1
    cvtps2dq m2, m2
    cvtps2dq m3, m3
    packssdw m0, m2
    packssdw m1, m3
    mova [src0q], m0
    mova [src0q+src1q], m1
    add src0q, mmsize
    sub lend, mmsize/2
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
MIX_1_TO_2_S16P_FLT
INIT_XMM sse4
MIX_1_TO_2_S16P_FLT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
MIX_1_TO_2_S16P_FLT
%endif

;-----------------------------------------------------------------------------
; void ff_mix_3_8_to_1_2_fltp/s16p_flt(float/int16_t **src, float **matrix,
;                                      int len, int out_ch, int in_ch);
;-----------------------------------------------------------------------------
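; Generic mix of 3-8 input channels to 1 or 2 output channels, for planar
; float or planar s16 input with float coefficients. Roughly (illustrative C
; sketch for the fltp case, not part of the original source):
;
;     for (i = 0; i < len; i++) {
;         float out0 = 0, out1 = 0;
;         for (c = 0; c < in_ch; c++) {
;             out0 += src[c][i] * matrix[0][c];
;             if (out_ch == 2)
;                 out1 += src[c][i] * matrix[1][c];
;         }
;         src[0][i] = out0;
;         if (out_ch == 2)
;             src[1][i] = out1;
;     }
;
; The macro below generates one specialized function per (in_ch, out_ch,
; sample format) combination, keeping matrix coefficients in vector registers
; where possible and spilling the rest to the stack.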

%macro MIX_3_8_TO_1_2_FLT 3 ; %1 = in channels, %2 = out channels, %3 = s16p or fltp
; define some names to make the code clearer
%assign  in_channels %1
%assign out_channels %2
%assign stereo out_channels - 1
%ifidn %3, s16p
    %assign is_s16 1
%else
    %assign is_s16 0
%endif

; determine how many matrix elements must go on the stack vs. mmregs
%assign matrix_elements in_channels * out_channels
%if is_s16
    %if stereo
        %assign needed_mmregs 7
    %else
        %assign needed_mmregs 5
    %endif
%else
    %if stereo
        %assign needed_mmregs 4
    %else
        %assign needed_mmregs 3
    %endif
%endif
%assign matrix_elements_mm num_mmregs - needed_mmregs
%if matrix_elements < matrix_elements_mm
    %assign matrix_elements_mm matrix_elements
%endif
%if matrix_elements_mm < matrix_elements
    %assign matrix_elements_stack matrix_elements - matrix_elements_mm
%else
    %assign matrix_elements_stack 0
%endif
%assign matrix_stack_size matrix_elements_stack * mmsize

%assign needed_stack_size -1 * matrix_stack_size
%if ARCH_X86_32 && in_channels >= 7
%assign needed_stack_size needed_stack_size - 16
%endif
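; For example (assuming x86inc's 16 vector registers on x86-64 and mmsize of
; 16 for the XMM s16p variants): the 8-to-2 s16p version has matrix_elements
; of 16 and needed_mmregs of 7, so 9 coefficient vectors stay in registers and
; the remaining 7 spill to a 112-byte stack area; on x86-32 only 8 vector
; registers are available, so more coefficients spill, and an extra 16 bytes
; are reserved for the src5-src7 pointers.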

cglobal mix_%1_to_%2_%3_flt, 3,in_channels+2,needed_mmregs+matrix_elements_mm, needed_stack_size, src0, src1, len, src2, src3, src4, src5, src6, src7

; define src pointers on stack if needed
%if matrix_elements_stack > 0 && ARCH_X86_32 && in_channels >= 7
    %define src5m [rsp+matrix_stack_size+0]
    %define src6m [rsp+matrix_stack_size+4]
    %define src7m [rsp+matrix_stack_size+8]
%endif

; load matrix pointers
%define matrix0q r1q
%define matrix1q r3q
%if stereo
    mov matrix1q, [matrix0q+gprsize]
%endif
    mov matrix0q, [matrix0q]

; define matrix coeff names
%assign %%i 0
%assign %%j needed_mmregs
%rep in_channels
    %if %%i >= matrix_elements_mm
        CAT_XDEFINE mx_stack_0_, %%i, 1
        CAT_XDEFINE mx_0_, %%i, [rsp+(%%i-matrix_elements_mm)*mmsize]
    %else
        CAT_XDEFINE mx_stack_0_, %%i, 0
        CAT_XDEFINE mx_0_, %%i, m %+ %%j
        %assign %%j %%j+1
    %endif
    %assign %%i %%i+1
%endrep
%if stereo
%assign %%i 0
%rep in_channels
    %if in_channels + %%i >= matrix_elements_mm
        CAT_XDEFINE mx_stack_1_, %%i, 1
        CAT_XDEFINE mx_1_, %%i, [rsp+(in_channels+%%i-matrix_elements_mm)*mmsize]
    %else
        CAT_XDEFINE mx_stack_1_, %%i, 0
        CAT_XDEFINE mx_1_, %%i, m %+ %%j
        %assign %%j %%j+1
    %endif
    %assign %%i %%i+1
%endrep
%endif

; load/splat matrix coeffs
%assign %%i 0
%rep in_channels
    %if mx_stack_0_ %+ %%i
        VBROADCASTSS m0, [matrix0q+4*%%i]
        mova mx_0_ %+ %%i, m0
    %else
        VBROADCASTSS mx_0_ %+ %%i, [matrix0q+4*%%i]
    %endif
    %if stereo
    %if mx_stack_1_ %+ %%i
        VBROADCASTSS m0, [matrix1q+4*%%i]
        mova mx_1_ %+ %%i, m0
    %else
        VBROADCASTSS mx_1_ %+ %%i, [matrix1q+4*%%i]
    %endif
    %endif
    %assign %%i %%i+1
%endrep

; load channel pointers to registers as offsets from the first channel pointer
%if ARCH_X86_64
    movsxd lenq, r2d
%endif
    shl lenq, 2-is_s16
%assign %%i 1
%rep (in_channels - 1)
    %if ARCH_X86_32 && in_channels >= 7 && %%i >= 5
        mov src5q, [src0q+%%i*gprsize]
        add src5q, lenq
        mov src %+ %%i %+ m, src5q
    %else
        mov src %+ %%i %+ q, [src0q+%%i*gprsize]
        add src %+ %%i %+ q, lenq
    %endif
    %assign %%i %%i+1
%endrep
    mov src0q, [src0q]
    add src0q, lenq
    neg lenq
.loop:
; for x86-32 with 7-8 channels we do not have enough gp registers for all src
; pointers, so we have to load some of them from the stack each time
%define copy_src_from_stack ARCH_X86_32 && in_channels >= 7 && %%i >= 5
%if is_s16
    ; mix with s16p input
    mova m0, [src0q+lenq]
    S16_TO_S32_SX 0, 1
    cvtdq2ps m0, m0
    cvtdq2ps m1, m1
    %if stereo
    mulps m2, m0, mx_1_0
    mulps m3, m1, mx_1_0
    %endif
    mulps m0, m0, mx_0_0
    mulps m1, m1, mx_0_0
%assign %%i 1
%rep (in_channels - 1)
    %if copy_src_from_stack
        %define src_ptr src5q
    %else
        %define src_ptr src %+ %%i %+ q
    %endif
    %if stereo
        %if copy_src_from_stack
            mov src_ptr, src %+ %%i %+ m
        %endif
        mova m4, [src_ptr+lenq]
        S16_TO_S32_SX 4, 5
        cvtdq2ps m4, m4
        cvtdq2ps m5, m5
        FMULADD_PS m2, m4, mx_1_ %+ %%i, m2, m6
        FMULADD_PS m3, m5, mx_1_ %+ %%i, m3, m6
        FMULADD_PS m0, m4, mx_0_ %+ %%i, m0, m4
        FMULADD_PS m1, m5, mx_0_ %+ %%i, m1, m5
    %else
        %if copy_src_from_stack
            mov src_ptr, src %+ %%i %+ m
        %endif
        mova m2, [src_ptr+lenq]
        S16_TO_S32_SX 2, 3
        cvtdq2ps m2, m2
        cvtdq2ps m3, m3
        FMULADD_PS m0, m2, mx_0_ %+ %%i, m0, m4
        FMULADD_PS m1, m3, mx_0_ %+ %%i, m1, m4
    %endif
    %assign %%i %%i+1
%endrep
    %if stereo
    cvtps2dq m2, m2
    cvtps2dq m3, m3
    packssdw m2, m3
    mova [src1q+lenq], m2
    %endif
    cvtps2dq m0, m0
    cvtps2dq m1, m1
    packssdw m0, m1
    mova [src0q+lenq], m0
%else
    ; mix with fltp input
    %if stereo || mx_stack_0_0
    mova m0, [src0q+lenq]
    %endif
    %if stereo
    mulps m1, m0, mx_1_0
    %endif
    %if stereo || mx_stack_0_0
    mulps m0, m0, mx_0_0
    %else
    mulps m0, mx_0_0, [src0q+lenq]
    %endif
%assign %%i 1
%rep (in_channels - 1)
    %if copy_src_from_stack
        %define src_ptr src5q
        mov src_ptr, src %+ %%i %+ m
    %else
        %define src_ptr src %+ %%i %+ q
    %endif
    ; avoid extra load for mono if matrix is in a mm register
    %if stereo || mx_stack_0_ %+ %%i
        mova m2, [src_ptr+lenq]
    %endif
    %if stereo
        FMULADD_PS m1, m2, mx_1_ %+ %%i, m1, m3
    %endif
    %if stereo || mx_stack_0_ %+ %%i
        FMULADD_PS m0, m2, mx_0_ %+ %%i, m0, m2
    %else
        FMULADD_PS m0, mx_0_ %+ %%i, [src_ptr+lenq], m0, m1
    %endif
    %assign %%i %%i+1
%endrep
    mova [src0q+lenq], m0
    %if stereo
    mova [src1q+lenq], m1
    %endif
%endif

    add lenq, mmsize
    jl .loop
; zero ymm high halves
%if mmsize == 32
    vzeroupper
%endif
    RET
%endmacro

%macro MIX_3_8_TO_1_2_FLT_FUNCS 0
%assign %%i 3
%rep 6
    INIT_XMM sse
    MIX_3_8_TO_1_2_FLT %%i, 1, fltp
    MIX_3_8_TO_1_2_FLT %%i, 2, fltp
    INIT_XMM sse2
    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    INIT_XMM sse4
    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    ; do not use ymm AVX or FMA4 in x86-32 for 6 or more channels due to stack alignment issues
    %if HAVE_AVX_EXTERNAL
    %if ARCH_X86_64 || %%i < 6
        INIT_YMM avx
    %else
        INIT_XMM avx
    %endif
    MIX_3_8_TO_1_2_FLT %%i, 1, fltp
    MIX_3_8_TO_1_2_FLT %%i, 2, fltp
    INIT_XMM avx
    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    %endif
    %if HAVE_FMA4_EXTERNAL
    %if ARCH_X86_64 || %%i < 6
        INIT_YMM fma4
    %else
        INIT_XMM fma4
    %endif
    MIX_3_8_TO_1_2_FLT %%i, 1, fltp
    MIX_3_8_TO_1_2_FLT %%i, 2, fltp
    INIT_XMM fma4
    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    %endif
    %assign %%i %%i+1
%endrep
%endmacro

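; Expand the macro above for every supported combination: 3-8 input channels,
; 1-2 output channels, fltp and s16p input, across the SSE/SSE2/SSE4/AVX/FMA4
; variants enabled at build time.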
MIX_3_8_TO_1_2_FLT_FUNCS