;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;-----------------------------------------------------------------------------
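; Rough C equivalent of the operation, added purely for orientation (a sketch;
; the function name is illustrative, the actual reference implementation lives
; in libavutil/float_dsp.c). The asm below unrolls to 64 bytes per iteration
; and uses aligned loads/stores, so it assumes aligned pointers and a len that
; is a multiple of 16.
;
;     static void vector_fmul_ref(float *dst, const float *src0,
;                                 const float *src1, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src0[i] * src1[i];
;     }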
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
    lea     lenq, [lend*4 - 64]
ALIGN 16
.loop:
%assign a 0
%rep 32/mmsize
    mova    m0, [src0q + lenq + (a+0)*mmsize]
    mova    m1, [src0q + lenq + (a+1)*mmsize]
    mulps   m0, m0, [src1q + lenq + (a+0)*mmsize]
    mulps   m1, m1, [src1q + lenq + (a+1)*mmsize]
    mova    [dstq + lenq + (a+0)*mmsize], m0
    mova    [dstq + lenq + (a+1)*mmsize], m1
%assign a a+2
%endrep

    sub     lenq, 64
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
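; Rough C equivalent (a sketch for orientation; name illustrative):
;
;     static void vector_fmac_scalar_ref(float *dst, const float *src,
;                                        float mul, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] += src[i] * mul;
;     }
;
; The SSE/AVX paths below compute mul*src and add dst with separate
; mulps/addps; the FMA3 path fuses both steps with fmaddps. Every path
; consumes 64 bytes of dst per iteration, so len is again assumed to be a
; multiple of 16.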

%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
cglobal vector_fmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSS m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    shufps      xm0, xm0, 0
%if cpuflag(avx)
    vinsertf128 m0, m0, xm0, 1
%endif
%endif
    lea     lenq, [lend*4-64]
.loop:
%if cpuflag(fma3)
    mova    m1, [dstq+lenq]
    mova    m2, [dstq+lenq+1*mmsize]
    fmaddps m1, m0, [srcq+lenq], m1
    fmaddps m2, m0, [srcq+lenq+1*mmsize], m2
%else ; cpuflag
    mulps   m1, m0, [srcq+lenq]
    mulps   m2, m0, [srcq+lenq+1*mmsize]
%if mmsize < 32
    mulps   m3, m0, [srcq+lenq+2*mmsize]
    mulps   m4, m0, [srcq+lenq+3*mmsize]
%endif ; mmsize
    addps   m1, m1, [dstq+lenq]
    addps   m2, m2, [dstq+lenq+1*mmsize]
%if mmsize < 32
    addps   m3, m3, [dstq+lenq+2*mmsize]
    addps   m4, m4, [dstq+lenq+3*mmsize]
%endif ; mmsize
%endif ; cpuflag
    mova    [dstq+lenq], m1
    mova    [dstq+lenq+1*mmsize], m2
%if mmsize < 32
    mova    [dstq+lenq+2*mmsize], m3
    mova    [dstq+lenq+3*mmsize], m4
%endif ; mmsize
    sub     lenq, 64
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
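; Rough C equivalent (a sketch for orientation; name illustrative):
;
;     static void vector_fmul_scalar_ref(float *dst, const float *src,
;                                        float mul, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src[i] * mul;
;     }
;
; The shufps m0, m0, 0 below broadcasts the scalar into all four lanes of the
; xmm register before the per-vector multiply.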

%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    movss   m0, mulm
%elif WIN64
    SWAP 0, 2
%endif
    shufps  m0, m0, 0
    lea     lenq, [lend*4-mmsize]
.loop:
    mova    m1, [srcq+lenq]
    mulps   m1, m0
    mova    [dstq+lenq], m1
    sub     lenq, mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_SCALAR

;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------
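; Rough C equivalent, operating on doubles (a sketch; name illustrative):
;
;     static void vector_dmul_scalar_ref(double *dst, const double *src,
;                                        double mul, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src[i] * mul;
;     }
;
; movlhps duplicates the scalar into both 64-bit lanes of xm0; the AVX build
; additionally mirrors that pair into the upper half of the ymm register with
; vinsertf128.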

%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    mov     lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSD m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    movlhps xm0, xm0
%if cpuflag(avx)
    vinsertf128 ym0, ym0, xm0, 1
%endif
%endif
    lea     lenq, [lend*8-2*mmsize]
.loop:
    mulpd   m1, m0, [srcq+lenq]
    mulpd   m2, m0, [srcq+lenq+mmsize]
    mova    [dstq+lenq], m1
    mova    [dstq+lenq+mmsize], m2
    sub     lenq, 2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif

;-----------------------------------------------------------------------------
; vector_fmul_window(float *dst, const float *src0,
;                    const float *src1, const float *win, int len);
;-----------------------------------------------------------------------------
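; Rough C equivalent of the windowing butterfly (a sketch; name illustrative,
; roughly mirroring the generic C version in libavutil/float_dsp.c):
;
;     static void vector_fmul_window_ref(float *dst, const float *src0,
;                                        const float *src1, const float *win,
;                                        int len)
;     {
;         dst  += len;
;         win  += len;
;         src0 += len;
;         for (int i = -len, j = len - 1; i < 0; i++, j--) {
;             float s0 = src0[i], s1 = src1[j];
;             float wi = win[i],  wj = win[j];
;             dst[i] = s0 * wj - s1 * wi;
;             dst[j] = s0 * wi + s1 * wj;
;         }
;     }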
%macro VECTOR_FMUL_WINDOW 0
cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
    shl     lend, 2
    lea     len1q, [lenq - mmsize]
    add     src0q, lenq
    add     dstq,  lenq
    add     winq,  lenq
    neg     lenq
.loop:
    mova    m0, [winq  + lenq]
    mova    m4, [src0q + lenq]
%if cpuflag(sse)
    mova    m1, [winq  + len1q]
    mova    m5, [src1q + len1q]
    shufps  m1, m1, 0x1b
    shufps  m5, m5, 0x1b
    mova    m2, m0
    mova    m3, m1
    mulps   m2, m4
    mulps   m3, m5
    mulps   m1, m4
    mulps   m0, m5
    addps   m2, m3
    subps   m1, m0
    shufps  m2, m2, 0x1b
%else
    pswapd  m1, [winq  + len1q]
    pswapd  m5, [src1q + len1q]
    mova    m2, m0
    mova    m3, m1
    pfmul   m2, m4
    pfmul   m3, m5
    pfmul   m1, m4
    pfmul   m0, m5
    pfadd   m2, m3
    pfsub   m1, m0
    pswapd  m2, m2
%endif
    mova    [dstq + lenq], m1
    mova    [dstq + len1q], m2
    sub     len1q, mmsize
    add     lenq,  mmsize
    jl      .loop
%if mmsize == 8
    femms
%endif
    REP_RET
%endmacro

INIT_MMX 3dnowext
VECTOR_FMUL_WINDOW
INIT_XMM sse
VECTOR_FMUL_WINDOW

;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
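; Rough C equivalent (a sketch for orientation; name illustrative):
;
;     static void vector_fmul_add_ref(float *dst, const float *src0,
;                                     const float *src1, const float *src2,
;                                     int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src0[i] * src1[i] + src2[i];
;     }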
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
    lea     lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova    m0, [src0q + lenq]
    mova    m1, [src0q + lenq + mmsize]
%if cpuflag(fma3)
    mova    m2, [src2q + lenq]
    mova    m3, [src2q + lenq + mmsize]
    fmaddps m0, m0, [src1q + lenq], m2
    fmaddps m1, m1, [src1q + lenq + mmsize], m3
%else
    mulps   m0, m0, [src1q + lenq]
    mulps   m1, m1, [src1q + lenq + mmsize]
    addps   m0, m0, [src2q + lenq]
    addps   m1, m1, [src2q + lenq + mmsize]
%endif
    mova    [dstq + lenq], m0
    mova    [dstq + lenq + mmsize], m1

    sub     lenq, 2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMUL_ADD
%endif

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
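; Rough C equivalent (a sketch for orientation; name illustrative):
;
;     static void vector_fmul_reverse_ref(float *dst, const float *src0,
;                                         const float *src1, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src0[i] * src1[len - i - 1];
;     }
;
; Instead of reversing src0, the asm walks src1 forwards and dst/src0
; backwards, flipping each loaded vector with shufps q0123 (on AVX the two
; 128-bit halves are swapped first via vinsertf128).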
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    lea     lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx)
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova    m0, [src1q]
    mova    m1, [src1q + mmsize]
    shufps  m0, m0, q0123
    shufps  m1, m1, q0123
%endif
    mulps   m0, m0, [src0q + lenq + mmsize]
    mulps   m1, m1, [src0q + lenq]
    mova    [dstq + lenq + mmsize], m0
    mova    [dstq + lenq], m1
    add     src1q, 2*mmsize
    sub     lenq,  2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
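; Rough C equivalent (a sketch; name illustrative). The third argument is
; turned into a negative byte offset inside the asm, hence the register name
; "offset" in the cglobal line below:
;
;     static float scalarproduct_float_ref(const float *v1, const float *v2,
;                                          int len)
;     {
;         float p = 0.0f;
;         for (int i = 0; i < len; i++)
;             p += v1[i] * v2[i];
;         return p;
;     }
;
; The trailing movhlps/shufps/addss sequence reduces the four partial sums in
; xmm0 to a single scalar; on x86-32 the result is additionally routed through
; the x87 stack (fld) to satisfy the calling convention for float returns.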
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    neg     offsetq
    shl     offsetq, 2
    sub     v1q, offsetq
    sub     v2q, offsetq
    xorps   xmm0, xmm0
.loop:
    movaps  xmm1, [v1q+offsetq]
    mulps   xmm1, [v2q+offsetq]
    addps   xmm0, xmm1
    add     offsetq, 16
    js      .loop
    movhlps xmm1, xmm0
    addps   xmm0, xmm1
    movss   xmm1, xmm0
    shufps  xmm0, xmm0, 1
    addss   xmm0, xmm1
%if ARCH_X86_64 == 0
    movss   r0m,  xmm0
    fld     dword r0m
%endif
    RET

;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
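; Rough C equivalent of the in-place butterfly (a sketch; name illustrative):
;
;     static void butterflies_float_ref(float *src0, float *src1, int len)
;     {
;         for (int i = 0; i < len; i++) {
;             float t  = src0[i] - src1[i];
;             src0[i] += src1[i];
;             src1[i]  = t;
;         }
;     }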
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
%if ARCH_X86_64
    movsxd  lenq, lend
%endif
    test    lenq, lenq
    jz      .end
    shl     lenq, 2
    add     src0q, lenq
    add     src1q, lenq
    neg     lenq
.loop:
    mova    m0, [src0q + lenq]
    mova    m1, [src1q + lenq]
    subps   m2, m0, m1
    addps   m0, m0, m1
    mova    [src1q + lenq], m2
    mova    [src0q + lenq], m0
    add     lenq, mmsize
    jl      .loop
.end:
    REP_RET