Imported Debian version 2.5.0~trusty1.1
[deb_ffmpeg.git] / ffmpeg / libavcodec / x86 / fmtconvert.asm
CommitLineData
2ba45a60
DM
1;******************************************************************************
2;* x86 optimized Format Conversion Utils
3;* Copyright (c) 2008 Loren Merritt
4;*
5;* This file is part of FFmpeg.
6;*
7;* FFmpeg is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* FFmpeg is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with FFmpeg; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
22%include "libavutil/x86/x86util.asm"
23
24SECTION_TEXT
25
;------------------------------------------------------------------------------
; CVTPS2PI dst, src
;
; Convert two packed single-precision floats (%2) to two packed int32 in the
; MMX register %1, using the instruction matching the active cpuflags.
;
; NOTE: the two instructions do not round identically: cvtps2pi follows the
; current MXCSR rounding mode while pf2id truncates toward zero, so the SSE
; and 3DNow! builds of a function may produce results that differ by one.
;
; Instantiating this macro without either instruction set is a build error
; rather than a silent no-op.
;------------------------------------------------------------------------------
%macro CVTPS2PI 2
%if cpuflag(sse)
    cvtps2pi  %1, %2
%elif cpuflag(3dnow)
    pf2id     %1, %2
%else
    %error "CVTPS2PI requires the sse or 3dnow cpuflag"
%endif
%endmacro
33
;------------------------------------------------------------------------------
; void ff_int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul,
;                                    int len);
;
; dst[i] = (float)src[i] * mul
;
; Each loop iteration converts 8 elements (32 bytes) and the loop is
; bottom-tested, so the body always runs at least once.
; dst is written with mova and src is read as a full-register memory operand,
; so both are presumably required to be 16-byte aligned by the callers.
;
; Macro argument %1: number of XMM registers handed to cglobal.
;------------------------------------------------------------------------------
%macro INT32_TO_FLOAT_FMUL_SCALAR 1
%if UNIX64
; mul is not declared as a named argument here: the code below uses m0
; without loading it, i.e. the float argument is expected to arrive in xmm0.
cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
%else
cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
%endif
%if WIN64
    SWAP        0, 2                    ; mul arrives in xmm2; rename it to m0
%elif ARCH_X86_32
    movss      m0, mulm                 ; mul is a stack argument on x86-32
%endif
    SPLATD     m0                       ; broadcast mul to every lane
    ; len is a C int: operate on the 32-bit register so that the upper half
    ; of the 64-bit register (undefined on entry) is zeroed before lenq is
    ; used for addressing. This matches int32_to_float_fmul_array8.
    shl      lend, 2                    ; element count -> byte count
    add      srcq, lenq                 ; point both buffers at their end ...
    add      dstq, lenq
    neg      lenq                       ; ... and count a negative offset up to 0
.loop:
%if cpuflag(sse2)
    cvtdq2ps   m1, [srcq+lenq   ]
    cvtdq2ps   m2, [srcq+lenq+16]
%else
    ; SSE1 only converts two int32 at a time into the low half of an XMM
    ; register; build each 4-float vector from two halves.
    cvtpi2ps   m1, [srcq+lenq   ]
    cvtpi2ps   m3, [srcq+lenq+ 8]
    cvtpi2ps   m2, [srcq+lenq+16]
    cvtpi2ps   m4, [srcq+lenq+24]
    movlhps    m1, m3
    movlhps    m2, m4
%endif
    mulps      m1, m0
    mulps      m2, m0
    mova  [dstq+lenq   ], m1
    mova  [dstq+lenq+16], m2
    add      lenq, 32
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
INT32_TO_FLOAT_FMUL_SCALAR 5
INIT_XMM sse2
INT32_TO_FLOAT_FMUL_SCALAR 3
79
f6fa7814
DM
;------------------------------------------------------------------------------
; void ff_int32_to_float_fmul_array8(FmtConvertContext *c, float *dst, const int32_t *src,
;                                    const float *mul, int len);
;
; Like the scalar variant, but a new multiplier is fetched from mul[] for
; every group of 8 elements:
;     dst[8*k + j] = (float)src[8*k + j] * mul[k],   j = 0..7
; The context pointer c is accepted but not used by this code.
;------------------------------------------------------------------------------
%macro INT32_TO_FLOAT_FMUL_ARRAY8 0
cglobal int32_to_float_fmul_array8, 5, 5, 5, c, dst, src, mul, len
    shl      lend, 2                    ; element count -> byte count
    add      dstq, lenq                 ; address both buffers from their end
    add      srcq, lenq
    neg      lenq                       ; negative byte offset, counts up to 0
.loop:
    movss      m0, [mulq]               ; multiplier for this group of 8
    add      mulq, 4                    ; advance to the next multiplier
    SPLATD     m0
%if cpuflag(sse2)
    cvtdq2ps   m1, [srcq+lenq   ]
    cvtdq2ps   m2, [srcq+lenq+16]
%else
    ; two int32 per cvtpi2ps; low halves first, then the halves that are
    ; merged into the upper lanes
    cvtpi2ps   m1, [srcq+lenq   ]
    cvtpi2ps   m2, [srcq+lenq+16]
    cvtpi2ps   m3, [srcq+lenq+ 8]
    cvtpi2ps   m4, [srcq+lenq+24]
    movlhps    m1, m3
    movlhps    m2, m4
%endif
    mulps      m1, m0
    mova  [dstq+lenq   ], m1
    mulps      m2, m0
    mova  [dstq+lenq+16], m2
    add      lenq, 32                   ; 8 elements done; sets flags for jl
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
INT32_TO_FLOAT_FMUL_ARRAY8
INIT_XMM sse2
INT32_TO_FLOAT_FMUL_ARRAY8
2ba45a60
DM
118
;------------------------------------------------------------------------------
; void ff_float_to_int16(int16_t *dst, const float *src, long len);
;
; Convert len floats to int16 with signed saturation (packssdw).
; 8 samples are handled per iteration; the loop is bottom-tested.
;
; Macro argument %1: number of XMM registers handed to cglobal.
;------------------------------------------------------------------------------
%macro FLOAT_TO_INT16 1
cglobal float_to_int16, 3, 3, %1, dst, src, len
    add      lenq, lenq                 ; lenq = 2*len = size of dst in bytes
    add      dstq, lenq                 ; dst end
    lea      srcq, [srcq+2*lenq]        ; src end (4 bytes per float)
    neg      lenq                       ; negative dst byte offset
.loop:
%if cpuflag(sse2)
    cvtps2dq   m0, [srcq+2*lenq   ]
    cvtps2dq   m1, [srcq+2*lenq+16]
    packssdw   m0, m1
    mova  [dstq+lenq], m0
%else
    ; MMX-register path: two floats per conversion, four int16 per store
    CVTPS2PI   m0, [srcq+2*lenq   ]
    CVTPS2PI   m1, [srcq+2*lenq+ 8]
    packssdw   m0, m1
    CVTPS2PI   m2, [srcq+2*lenq+16]
    CVTPS2PI   m3, [srcq+2*lenq+24]
    packssdw   m2, m3
    mova  [dstq+lenq  ], m0
    mova  [dstq+lenq+8], m2
%endif
    add      lenq, 16                   ; 8 int16 written
    js .loop
%if mmsize == 8
    emms                                ; leave MMX state clean for FPU users
%endif
    REP_RET
%endmacro

INIT_XMM sse2
FLOAT_TO_INT16 2
INIT_MMX sse
FLOAT_TO_INT16 0
INIT_MMX 3dnow
FLOAT_TO_INT16 0
158
;------------------------------------------------------------------------------
; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
;
; Convert len floats to saturated int16 and scatter them to dst with a stride
; of step int16 elements:  dst[i*step] = (int16_t)src[i].
; 8 samples are handled per iteration; the loop is bottom-tested.
;
; Macro argument %1: number of XMM registers handed to cglobal.
;------------------------------------------------------------------------------

; Helper for FLOAT_TO_INT16_STEP, only meaningful inside that function.
; In:  v1d = samples 0 (low word) and 1 (high word)
;      v2d = samples 2 (low word) and 3 (high word)
; Writes the four samples to dst[0], dst[step], dst[2*step], dst[3*step] and
; advances dstq by 4*step elements. Clobbers v1d, v2d and flags.
%macro FLOAT_TO_INT16_STEP_STORE4 0
    mov [dstq],           v1w
    mov [dstq+stepq*4],   v2w
    shr       v1d, 16
    shr       v2d, 16
    mov [dstq+stepq*2],   v1w
    mov [dstq+step3q*2],  v2w
    lea      dstq, [dstq+stepq*8]
%endmacro

%macro FLOAT_TO_INT16_STEP 1
cglobal float_to_int16_step, 4, 7, %1, dst, src, len, step, step3, v1, v2
    add      lenq, lenq                 ; lenq = 2*len
    lea    step3q, [stepq*3]
    lea      srcq, [srcq+2*lenq]        ; src end (4 bytes per float)
    neg      lenq                       ; srcq+2*lenq walks src from its start
.loop:
%if cpuflag(sse2)
    cvtps2dq   m0, [srcq+2*lenq   ]
    cvtps2dq   m1, [srcq+2*lenq+16]
    packssdw   m0, m1                   ; m0 = 8 saturated int16
    ; samples 0..3
    movd      v1d, m0
    psrldq     m0, 4
    movd      v2d, m0
    FLOAT_TO_INT16_STEP_STORE4
    ; samples 4..7
    psrldq     m0, 4
    movd      v1d, m0
    psrldq     m0, 4
    movd      v2d, m0
    FLOAT_TO_INT16_STEP_STORE4
%else
    CVTPS2PI   m0, [srcq+2*lenq   ]
    CVTPS2PI   m1, [srcq+2*lenq+ 8]
    CVTPS2PI   m2, [srcq+2*lenq+16]
    CVTPS2PI   m3, [srcq+2*lenq+24]
    packssdw   m0, m1                   ; m0 = samples 0..3
    packssdw   m2, m3                   ; m2 = samples 4..7
    ; samples 0..3
    movd      v1d, m0
    psrlq      m0, 32
    movd      v2d, m0
    FLOAT_TO_INT16_STEP_STORE4
    ; samples 4..7
    movd      v1d, m2
    psrlq      m2, 32
    movd      v2d, m2
    FLOAT_TO_INT16_STEP_STORE4
%endif
    add      lenq, 16                   ; 8 samples consumed
    js .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_XMM sse2
FLOAT_TO_INT16_STEP 2
INIT_MMX sse
FLOAT_TO_INT16_STEP 0
INIT_MMX 3dnow
FLOAT_TO_INT16_STEP 0
236
;-------------------------------------------------------------------------------
; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
;
; Convert two planar float channels (src[0], src[1]) of len samples each into
; one interleaved, saturated int16 stream:
;     dst[2*i] = src[0][i],  dst[2*i+1] = src[1][i]
; 4 samples per channel are handled per iteration; the loop is bottom-tested.
;-------------------------------------------------------------------------------
%macro FLOAT_TO_INT16_INTERLEAVE2 0
cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0, src1, len
    ; The third C argument (len) arrives in r2, which is named src1 here.
    ; Scale it into lenq before r2 is reused for the second channel pointer.
    lea      lenq, [4*r2q]              ; bytes per channel == bytes of dst
    add      dstq, lenq
    mov     src1q, [src0q+gprsize]      ; src[1] (must precede the src[0] load)
    mov     src0q, [src0q]              ; src[0]
    add     src1q, lenq
    add     src0q, lenq
    neg      lenq                       ; negative byte offset, counts up to 0
.loop:
%if cpuflag(sse2)
    cvtps2dq   m0, [src0q+lenq]
    cvtps2dq   m1, [src1q+lenq]
    packssdw   m0, m1                   ; low half: channel 0, high half: channel 1
    movhlps    m1, m0                   ; m1 low half = channel 1 samples
    punpcklwd  m0, m1                   ; interleave the two channels
    mova  [dstq+lenq], m0
%else
    CVTPS2PI   m0, [src0q+lenq  ]
    CVTPS2PI   m1, [src0q+lenq+8]
    packssdw   m0, m1                   ; m0 = 4 samples of channel 0
    CVTPS2PI   m2, [src1q+lenq  ]
    CVTPS2PI   m3, [src1q+lenq+8]
    packssdw   m2, m3                   ; m2 = 4 samples of channel 1
    mova       m1, m0
    punpcklwd  m0, m2
    punpckhwd  m1, m2
    mova  [dstq+lenq  ], m0
    mova  [dstq+lenq+8], m1
%endif
    add      lenq, 16
    js .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_MMX 3dnow
FLOAT_TO_INT16_INTERLEAVE2
INIT_MMX sse
FLOAT_TO_INT16_INTERLEAVE2
INIT_XMM sse2
FLOAT_TO_INT16_INTERLEAVE2
284
;-----------------------------------------------------------------------------
; void ff_float_to_int16_interleave6(int16_t *dst, const float **src, int len)
;
; Convert six planar float channels into one interleaved, saturated int16
; stream. Two samples per channel (24 output bytes) are produced per
; iteration; the loop is bottom-tested. Uses MMX registers only.
;-----------------------------------------------------------------------------
%macro FLOAT_TO_INT16_INTERLEAVE6 0
cglobal float_to_int16_interleave6, 2, 8, 0, dst, src, src1, src2, src3, src4, src5, len
    ; Only dst and src are loaded by cglobal; len is fetched by hand because
    ; its argument register (r2) doubles as src1.
%if ARCH_X86_64
    mov      lend, r2d
%else
    %define lend dword r2m              ; keep the counter in its stack slot
%endif
    ; Load the six channel pointers; srcq itself is replaced last because it
    ; is the base of the pointer array.
    mov     src5q, [srcq+5*gprsize]
    mov     src4q, [srcq+4*gprsize]
    mov     src3q, [srcq+3*gprsize]
    mov     src2q, [srcq+2*gprsize]
    mov     src1q, [srcq+1*gprsize]
    mov      srcq, [srcq]
    ; Turn channels 1..5 into offsets relative to channel 0 so that a single
    ; pointer increment advances all six.
    sub     src5q, srcq
    sub     src4q, srcq
    sub     src3q, srcq
    sub     src2q, srcq
    sub     src1q, srcq
.loop:
    CVTPS2PI  mm0, [srcq]
    CVTPS2PI  mm1, [srcq+src1q]
    CVTPS2PI  mm2, [srcq+src2q]
    CVTPS2PI  mm3, [srcq+src3q]
    CVTPS2PI  mm4, [srcq+src4q]
    CVTPS2PI  mm5, [srcq+src5q]
    packssdw  mm0, mm3                  ; ch0 pair | ch3 pair
    packssdw  mm1, mm4                  ; ch1 pair | ch4 pair
    packssdw  mm2, mm5                  ; ch2 pair | ch5 pair
    ; 3x2 -> 2x6 word transpose (PSWAPD presumably swaps the two dwords;
    ; it is defined outside this file)
    PSWAPD    mm3, mm0
    punpcklwd mm0, mm1
    punpckhwd mm1, mm2
    punpcklwd mm2, mm3
    PSWAPD    mm3, mm0
    punpckldq mm0, mm2
    punpckhdq mm2, mm1
    punpckldq mm1, mm3
    movq [dstq   ], mm0
    movq [dstq+ 8], mm1
    movq [dstq+16], mm2
    add      srcq, 8                    ; 2 floats per channel
    add      dstq, 24                   ; 2 samples * 6 channels * 2 bytes
    sub      lend, 2
    jg .loop
    emms
    RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6

INIT_MMX sse
FLOAT_TO_INT16_INTERLEAVE6
INIT_MMX 3dnow
FLOAT_TO_INT16_INTERLEAVE6
INIT_MMX 3dnowext
FLOAT_TO_INT16_INTERLEAVE6
341
;-----------------------------------------------------------------------------
; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
;
; Interleave six planar float channels into dst without converting them.
; Each iteration consumes mmsize bytes per channel (mmsize/4 floats) and
; writes 6*mmsize bytes; the loop is bottom-tested.
;
; Macro argument %1: number of XMM registers handed to cglobal.
;-----------------------------------------------------------------------------

%macro FLOAT_INTERLEAVE6 1
cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2, src3, src4, src5, len
    ; len is fetched by hand: its argument register (r2) doubles as src1.
%if ARCH_X86_64
    mov      lend, r2d
%else
    %define lend dword r2m              ; keep the counter in its stack slot
%endif
    ; Load the channel pointers; srcq goes last because it is the array base.
    mov     src5q, [srcq+5*gprsize]
    mov     src4q, [srcq+4*gprsize]
    mov     src3q, [srcq+3*gprsize]
    mov     src2q, [srcq+2*gprsize]
    mov     src1q, [srcq+1*gprsize]
    mov      srcq, [srcq]
    ; Channels 1..5 become offsets relative to channel 0.
    sub     src5q, srcq
    sub     src4q, srcq
    sub     src3q, srcq
    sub     src2q, srcq
    sub     src1q, srcq
.loop:
%if cpuflag(sse)
    movaps     m0, [srcq]
    movaps     m1, [srcq+src1q]
    movaps     m2, [srcq+src2q]
    movaps     m3, [srcq+src3q]
    movaps     m4, [srcq+src4q]
    movaps     m5, [srcq+src5q]

    ; Pairwise interleave of channels (0,1), (2,3), (4,5); m6 is scratch.
    ; SBUTTERFLYPS is defined outside this file - presumably it leaves the
    ; low interleave in the first and the high interleave in the second
    ; register.
    SBUTTERFLYPS 0, 1, 6
    SBUTTERFLYPS 2, 3, 6
    SBUTTERFLYPS 4, 5, 6

    ; first 12 output floats, assembled from m0/m2/m4
    movaps     m6, m4
    movhlps    m6, m2
    shufps     m4, m0, 0xe4             ; must read m0 before it is modified
    movlhps    m0, m2
    movaps [dstq   ], m0
    movaps [dstq+16], m4
    movaps [dstq+32], m6

    ; last 12 output floats, assembled from m1/m3/m5
    movaps     m6, m5
    movhlps    m6, m3
    shufps     m5, m1, 0xe4             ; must read m1 before it is modified
    movlhps    m1, m3
    movaps [dstq+48], m1
    movaps [dstq+64], m5
    movaps [dstq+80], m6
%else ; mmx
    movq       m0, [srcq]
    movq       m1, [srcq+src1q]
    movq       m2, [srcq+src2q]
    movq       m3, [srcq+src3q]
    movq       m4, [srcq+src4q]
    movq       m5, [srcq+src5q]

    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6
    movq [dstq   ], m0
    movq [dstq+ 8], m2
    movq [dstq+16], m4
    movq [dstq+24], m1
    movq [dstq+32], m3
    movq [dstq+40], m5
%endif
    add      srcq, mmsize
    add      dstq, mmsize*6
    sub      lend, mmsize/4             ; floats consumed per channel
    jg .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_MMX mmx
FLOAT_INTERLEAVE6 0
INIT_XMM sse
FLOAT_INTERLEAVE6 7
424
;-----------------------------------------------------------------------------
; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
;
; Interleave two planar float channels:
;     dst[2*i] = src[0][i],  dst[2*i+1] = src[1][i]
; Each iteration consumes 2*mmsize bytes per channel (mmsize/2 floats) and
; writes 4*mmsize bytes; the loop is bottom-tested.
;
; The macro is register-size agnostic: PUNPCKLDQ / PUNPCKHDQ must be defined
; by the instantiation to the dword-interleave instruction of the target ISA.
; Macro argument %1: number of XMM registers handed to cglobal.
;-----------------------------------------------------------------------------

%macro FLOAT_INTERLEAVE2 1
cglobal float_interleave2, 3, 4, %1, dst, src, len, src1
    mov     src1q, [srcq+gprsize]       ; src[1] (read before srcq is replaced)
    mov      srcq, [srcq        ]       ; src[0]
    sub     src1q, srcq                 ; channel 1 as an offset from channel 0
.loop:
    ; All loads are issued before any store.
    mova       m0, [srcq             ]
    mova       m1, [srcq+src1q       ]
    mova       m3, [srcq      +mmsize]
    mova       m4, [srcq+src1q+mmsize]

    ; first register pair
    mova       m2, m0
    PUNPCKLDQ  m0, m1
    PUNPCKHDQ  m2, m1
    mova [dstq         ], m0
    mova [dstq+1*mmsize], m2

    ; second register pair (m1 is free again and reused as the copy)
    mova       m1, m3
    PUNPCKLDQ  m3, m4
    PUNPCKHDQ  m1, m4
    mova [dstq+2*mmsize], m3
    mova [dstq+3*mmsize], m1

    add      srcq, mmsize*2
    add      dstq, mmsize*4
    sub      lend, mmsize/2             ; floats consumed per channel
    jg .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_MMX mmx
%define PUNPCKLDQ punpckldq
%define PUNPCKHDQ punpckhdq
FLOAT_INTERLEAVE2 0
INIT_XMM sse
%define PUNPCKLDQ unpcklps
%define PUNPCKHDQ unpckhps
FLOAT_INTERLEAVE2 5