;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
21 | ||
22 | %include "libavutil/x86/x86util.asm" | |
23 | ||
24 | SECTION_TEXT | |
25 | ||
26 | %macro CVTPS2PI 2 | |
27 | %if cpuflag(sse) | |
28 | cvtps2pi %1, %2 | |
29 | %elif cpuflag(3dnow) | |
30 | pf2id %1, %2 | |
31 | %endif | |
32 | %endmacro | |
33 | ||
34 | ;------------------------------------------------------------------------------ | |
35 | ; void ff_int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul, | |
36 | ; int len); | |
37 | ;------------------------------------------------------------------------------ | |
38 | %macro INT32_TO_FLOAT_FMUL_SCALAR 1 | |
39 | %if UNIX64 | |
40 | cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len | |
41 | %else | |
42 | cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len | |
43 | %endif | |
44 | %if WIN64 | |
45 | SWAP 0, 2 | |
46 | %elif ARCH_X86_32 | |
47 | movss m0, mulm | |
48 | %endif | |
49 | SPLATD m0 | |
50 | shl lenq, 2 | |
51 | add srcq, lenq | |
52 | add dstq, lenq | |
53 | neg lenq | |
54 | .loop: | |
55 | %if cpuflag(sse2) | |
56 | cvtdq2ps m1, [srcq+lenq ] | |
57 | cvtdq2ps m2, [srcq+lenq+16] | |
58 | %else | |
59 | cvtpi2ps m1, [srcq+lenq ] | |
60 | cvtpi2ps m3, [srcq+lenq+ 8] | |
61 | cvtpi2ps m2, [srcq+lenq+16] | |
62 | cvtpi2ps m4, [srcq+lenq+24] | |
63 | movlhps m1, m3 | |
64 | movlhps m2, m4 | |
65 | %endif | |
66 | mulps m1, m0 | |
67 | mulps m2, m0 | |
68 | mova [dstq+lenq ], m1 | |
69 | mova [dstq+lenq+16], m2 | |
70 | add lenq, 32 | |
71 | jl .loop | |
72 | REP_RET | |
73 | %endmacro | |
74 | ||
75 | INIT_XMM sse | |
76 | INT32_TO_FLOAT_FMUL_SCALAR 5 | |
77 | INIT_XMM sse2 | |
78 | INT32_TO_FLOAT_FMUL_SCALAR 3 | |
79 | ||
80 | ;------------------------------------------------------------------------------ |
81 | ; void ff_int32_to_float_fmul_array8(FmtConvertContext *c, float *dst, const int32_t *src, | |
82 | ; const float *mul, int len); | |
83 | ;------------------------------------------------------------------------------ | |
84 | %macro INT32_TO_FLOAT_FMUL_ARRAY8 0 | |
85 | cglobal int32_to_float_fmul_array8, 5, 5, 5, c, dst, src, mul, len | |
86 | shl lend, 2 | |
87 | add srcq, lenq | |
88 | add dstq, lenq | |
89 | neg lenq | |
90 | .loop: | |
91 | movss m0, [mulq] | |
92 | SPLATD m0 | |
93 | %if cpuflag(sse2) | |
94 | cvtdq2ps m1, [srcq+lenq ] | |
95 | cvtdq2ps m2, [srcq+lenq+16] | |
96 | %else | |
97 | cvtpi2ps m1, [srcq+lenq ] | |
98 | cvtpi2ps m3, [srcq+lenq+ 8] | |
99 | cvtpi2ps m2, [srcq+lenq+16] | |
100 | cvtpi2ps m4, [srcq+lenq+24] | |
101 | movlhps m1, m3 | |
102 | movlhps m2, m4 | |
103 | %endif | |
104 | mulps m1, m0 | |
105 | mulps m2, m0 | |
106 | mova [dstq+lenq ], m1 | |
107 | mova [dstq+lenq+16], m2 | |
108 | add mulq, 4 | |
109 | add lenq, 32 | |
110 | jl .loop | |
111 | REP_RET | |
112 | %endmacro | |
113 | ||
114 | INIT_XMM sse | |
115 | INT32_TO_FLOAT_FMUL_ARRAY8 | |
116 | INIT_XMM sse2 | |
117 | INT32_TO_FLOAT_FMUL_ARRAY8 | |
118 | |
119 | ;------------------------------------------------------------------------------ | |
120 | ; void ff_float_to_int16(int16_t *dst, const float *src, long len); | |
121 | ;------------------------------------------------------------------------------ | |
122 | %macro FLOAT_TO_INT16 1 | |
123 | cglobal float_to_int16, 3, 3, %1, dst, src, len | |
124 | add lenq, lenq | |
125 | lea srcq, [srcq+2*lenq] | |
126 | add dstq, lenq | |
127 | neg lenq | |
128 | .loop: | |
129 | %if cpuflag(sse2) | |
130 | cvtps2dq m0, [srcq+2*lenq ] | |
131 | cvtps2dq m1, [srcq+2*lenq+16] | |
132 | packssdw m0, m1 | |
133 | mova [dstq+lenq], m0 | |
134 | %else | |
135 | CVTPS2PI m0, [srcq+2*lenq ] | |
136 | CVTPS2PI m1, [srcq+2*lenq+ 8] | |
137 | CVTPS2PI m2, [srcq+2*lenq+16] | |
138 | CVTPS2PI m3, [srcq+2*lenq+24] | |
139 | packssdw m0, m1 | |
140 | packssdw m2, m3 | |
141 | mova [dstq+lenq ], m0 | |
142 | mova [dstq+lenq+8], m2 | |
143 | %endif | |
144 | add lenq, 16 | |
145 | js .loop | |
146 | %if mmsize == 8 | |
147 | emms | |
148 | %endif | |
149 | REP_RET | |
150 | %endmacro | |
151 | ||
152 | INIT_XMM sse2 | |
153 | FLOAT_TO_INT16 2 | |
154 | INIT_MMX sse | |
155 | FLOAT_TO_INT16 0 | |
156 | INIT_MMX 3dnow | |
157 | FLOAT_TO_INT16 0 | |
158 | ||
159 | ;------------------------------------------------------------------------------ | |
160 | ; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step); | |
161 | ;------------------------------------------------------------------------------ | |
162 | %macro FLOAT_TO_INT16_STEP 1 | |
163 | cglobal float_to_int16_step, 4, 7, %1, dst, src, len, step, step3, v1, v2 | |
164 | add lenq, lenq | |
165 | lea srcq, [srcq+2*lenq] | |
166 | lea step3q, [stepq*3] | |
167 | neg lenq | |
168 | .loop: | |
169 | %if cpuflag(sse2) | |
170 | cvtps2dq m0, [srcq+2*lenq ] | |
171 | cvtps2dq m1, [srcq+2*lenq+16] | |
172 | packssdw m0, m1 | |
173 | movd v1d, m0 | |
174 | psrldq m0, 4 | |
175 | movd v2d, m0 | |
176 | psrldq m0, 4 | |
177 | mov [dstq], v1w | |
178 | mov [dstq+stepq*4], v2w | |
179 | shr v1d, 16 | |
180 | shr v2d, 16 | |
181 | mov [dstq+stepq*2], v1w | |
182 | mov [dstq+step3q*2], v2w | |
183 | lea dstq, [dstq+stepq*8] | |
184 | movd v1d, m0 | |
185 | psrldq m0, 4 | |
186 | movd v2d, m0 | |
187 | mov [dstq], v1w | |
188 | mov [dstq+stepq*4], v2w | |
189 | shr v1d, 16 | |
190 | shr v2d, 16 | |
191 | mov [dstq+stepq*2], v1w | |
192 | mov [dstq+step3q*2], v2w | |
193 | lea dstq, [dstq+stepq*8] | |
194 | %else | |
195 | CVTPS2PI m0, [srcq+2*lenq ] | |
196 | CVTPS2PI m1, [srcq+2*lenq+ 8] | |
197 | CVTPS2PI m2, [srcq+2*lenq+16] | |
198 | CVTPS2PI m3, [srcq+2*lenq+24] | |
199 | packssdw m0, m1 | |
200 | packssdw m2, m3 | |
201 | movd v1d, m0 | |
202 | psrlq m0, 32 | |
203 | movd v2d, m0 | |
204 | mov [dstq], v1w | |
205 | mov [dstq+stepq*4], v2w | |
206 | shr v1d, 16 | |
207 | shr v2d, 16 | |
208 | mov [dstq+stepq*2], v1w | |
209 | mov [dstq+step3q*2], v2w | |
210 | lea dstq, [dstq+stepq*8] | |
211 | movd v1d, m2 | |
212 | psrlq m2, 32 | |
213 | movd v2d, m2 | |
214 | mov [dstq], v1w | |
215 | mov [dstq+stepq*4], v2w | |
216 | shr v1d, 16 | |
217 | shr v2d, 16 | |
218 | mov [dstq+stepq*2], v1w | |
219 | mov [dstq+step3q*2], v2w | |
220 | lea dstq, [dstq+stepq*8] | |
221 | %endif | |
222 | add lenq, 16 | |
223 | js .loop | |
224 | %if mmsize == 8 | |
225 | emms | |
226 | %endif | |
227 | REP_RET | |
228 | %endmacro | |
229 | ||
230 | INIT_XMM sse2 | |
231 | FLOAT_TO_INT16_STEP 2 | |
232 | INIT_MMX sse | |
233 | FLOAT_TO_INT16_STEP 0 | |
234 | INIT_MMX 3dnow | |
235 | FLOAT_TO_INT16_STEP 0 | |
236 | ||
237 | ;------------------------------------------------------------------------------- | |
238 | ; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len); | |
239 | ;------------------------------------------------------------------------------- | |
240 | %macro FLOAT_TO_INT16_INTERLEAVE2 0 | |
241 | cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0, src1, len | |
242 | lea lenq, [4*r2q] | |
243 | mov src1q, [src0q+gprsize] | |
244 | mov src0q, [src0q] | |
245 | add dstq, lenq | |
246 | add src0q, lenq | |
247 | add src1q, lenq | |
248 | neg lenq | |
249 | .loop: | |
250 | %if cpuflag(sse2) | |
251 | cvtps2dq m0, [src0q+lenq] | |
252 | cvtps2dq m1, [src1q+lenq] | |
253 | packssdw m0, m1 | |
254 | movhlps m1, m0 | |
255 | punpcklwd m0, m1 | |
256 | mova [dstq+lenq], m0 | |
257 | %else | |
258 | CVTPS2PI m0, [src0q+lenq ] | |
259 | CVTPS2PI m1, [src0q+lenq+8] | |
260 | CVTPS2PI m2, [src1q+lenq ] | |
261 | CVTPS2PI m3, [src1q+lenq+8] | |
262 | packssdw m0, m1 | |
263 | packssdw m2, m3 | |
264 | mova m1, m0 | |
265 | punpcklwd m0, m2 | |
266 | punpckhwd m1, m2 | |
267 | mova [dstq+lenq ], m0 | |
268 | mova [dstq+lenq+8], m1 | |
269 | %endif | |
270 | add lenq, 16 | |
271 | js .loop | |
272 | %if mmsize == 8 | |
273 | emms | |
274 | %endif | |
275 | REP_RET | |
276 | %endmacro | |
277 | ||
278 | INIT_MMX 3dnow | |
279 | FLOAT_TO_INT16_INTERLEAVE2 | |
280 | INIT_MMX sse | |
281 | FLOAT_TO_INT16_INTERLEAVE2 | |
282 | INIT_XMM sse2 | |
283 | FLOAT_TO_INT16_INTERLEAVE2 | |
284 | ||
285 | ;----------------------------------------------------------------------------- | |
286 | ; void ff_float_to_int16_interleave6(int16_t *dst, const float **src, int len) | |
287 | ;----------------------------------------------------------------------------- | |
288 | %macro FLOAT_TO_INT16_INTERLEAVE6 0 | |
289 | cglobal float_to_int16_interleave6, 2, 8, 0, dst, src, src1, src2, src3, src4, src5, len | |
290 | %if ARCH_X86_64 | |
291 | mov lend, r2d | |
292 | %else | |
293 | %define lend dword r2m | |
294 | %endif | |
295 | mov src1q, [srcq+1*gprsize] | |
296 | mov src2q, [srcq+2*gprsize] | |
297 | mov src3q, [srcq+3*gprsize] | |
298 | mov src4q, [srcq+4*gprsize] | |
299 | mov src5q, [srcq+5*gprsize] | |
300 | mov srcq, [srcq] | |
301 | sub src1q, srcq | |
302 | sub src2q, srcq | |
303 | sub src3q, srcq | |
304 | sub src4q, srcq | |
305 | sub src5q, srcq | |
306 | .loop: | |
307 | CVTPS2PI mm0, [srcq] | |
308 | CVTPS2PI mm1, [srcq+src1q] | |
309 | CVTPS2PI mm2, [srcq+src2q] | |
310 | CVTPS2PI mm3, [srcq+src3q] | |
311 | CVTPS2PI mm4, [srcq+src4q] | |
312 | CVTPS2PI mm5, [srcq+src5q] | |
313 | packssdw mm0, mm3 | |
314 | packssdw mm1, mm4 | |
315 | packssdw mm2, mm5 | |
316 | PSWAPD mm3, mm0 | |
317 | punpcklwd mm0, mm1 | |
318 | punpckhwd mm1, mm2 | |
319 | punpcklwd mm2, mm3 | |
320 | PSWAPD mm3, mm0 | |
321 | punpckldq mm0, mm2 | |
322 | punpckhdq mm2, mm1 | |
323 | punpckldq mm1, mm3 | |
324 | movq [dstq ], mm0 | |
325 | movq [dstq+16], mm2 | |
326 | movq [dstq+ 8], mm1 | |
327 | add srcq, 8 | |
328 | add dstq, 24 | |
329 | sub lend, 2 | |
330 | jg .loop | |
331 | emms | |
332 | RET | |
333 | %endmacro ; FLOAT_TO_INT16_INTERLEAVE6 | |
334 | ||
335 | INIT_MMX sse | |
336 | FLOAT_TO_INT16_INTERLEAVE6 | |
337 | INIT_MMX 3dnow | |
338 | FLOAT_TO_INT16_INTERLEAVE6 | |
339 | INIT_MMX 3dnowext | |
340 | FLOAT_TO_INT16_INTERLEAVE6 | |
341 | ||
342 | ;----------------------------------------------------------------------------- | |
343 | ; void ff_float_interleave6(float *dst, const float **src, unsigned int len); | |
344 | ;----------------------------------------------------------------------------- | |
345 | ||
346 | %macro FLOAT_INTERLEAVE6 1 | |
347 | cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2, src3, src4, src5, len | |
348 | %if ARCH_X86_64 | |
349 | mov lend, r2d | |
350 | %else | |
351 | %define lend dword r2m | |
352 | %endif | |
353 | mov src1q, [srcq+1*gprsize] | |
354 | mov src2q, [srcq+2*gprsize] | |
355 | mov src3q, [srcq+3*gprsize] | |
356 | mov src4q, [srcq+4*gprsize] | |
357 | mov src5q, [srcq+5*gprsize] | |
358 | mov srcq, [srcq] | |
359 | sub src1q, srcq | |
360 | sub src2q, srcq | |
361 | sub src3q, srcq | |
362 | sub src4q, srcq | |
363 | sub src5q, srcq | |
364 | .loop: | |
365 | %if cpuflag(sse) | |
366 | movaps m0, [srcq] | |
367 | movaps m1, [srcq+src1q] | |
368 | movaps m2, [srcq+src2q] | |
369 | movaps m3, [srcq+src3q] | |
370 | movaps m4, [srcq+src4q] | |
371 | movaps m5, [srcq+src5q] | |
372 | ||
373 | SBUTTERFLYPS 0, 1, 6 | |
374 | SBUTTERFLYPS 2, 3, 6 | |
375 | SBUTTERFLYPS 4, 5, 6 | |
376 | ||
377 | movaps m6, m4 | |
378 | shufps m4, m0, 0xe4 | |
379 | movlhps m0, m2 | |
380 | movhlps m6, m2 | |
381 | movaps [dstq ], m0 | |
382 | movaps [dstq+16], m4 | |
383 | movaps [dstq+32], m6 | |
384 | ||
385 | movaps m6, m5 | |
386 | shufps m5, m1, 0xe4 | |
387 | movlhps m1, m3 | |
388 | movhlps m6, m3 | |
389 | movaps [dstq+48], m1 | |
390 | movaps [dstq+64], m5 | |
391 | movaps [dstq+80], m6 | |
392 | %else ; mmx | |
393 | movq m0, [srcq] | |
394 | movq m1, [srcq+src1q] | |
395 | movq m2, [srcq+src2q] | |
396 | movq m3, [srcq+src3q] | |
397 | movq m4, [srcq+src4q] | |
398 | movq m5, [srcq+src5q] | |
399 | ||
400 | SBUTTERFLY dq, 0, 1, 6 | |
401 | SBUTTERFLY dq, 2, 3, 6 | |
402 | SBUTTERFLY dq, 4, 5, 6 | |
403 | movq [dstq ], m0 | |
404 | movq [dstq+ 8], m2 | |
405 | movq [dstq+16], m4 | |
406 | movq [dstq+24], m1 | |
407 | movq [dstq+32], m3 | |
408 | movq [dstq+40], m5 | |
409 | %endif | |
410 | add srcq, mmsize | |
411 | add dstq, mmsize*6 | |
412 | sub lend, mmsize/4 | |
413 | jg .loop | |
414 | %if mmsize == 8 | |
415 | emms | |
416 | %endif | |
417 | REP_RET | |
418 | %endmacro | |
419 | ||
420 | INIT_MMX mmx | |
421 | FLOAT_INTERLEAVE6 0 | |
422 | INIT_XMM sse | |
423 | FLOAT_INTERLEAVE6 7 | |
424 | ||
425 | ;----------------------------------------------------------------------------- | |
426 | ; void ff_float_interleave2(float *dst, const float **src, unsigned int len); | |
427 | ;----------------------------------------------------------------------------- | |
428 | ||
429 | %macro FLOAT_INTERLEAVE2 1 | |
430 | cglobal float_interleave2, 3, 4, %1, dst, src, len, src1 | |
431 | mov src1q, [srcq+gprsize] | |
432 | mov srcq, [srcq ] | |
433 | sub src1q, srcq | |
434 | .loop: | |
435 | mova m0, [srcq ] | |
436 | mova m1, [srcq+src1q ] | |
437 | mova m3, [srcq +mmsize] | |
438 | mova m4, [srcq+src1q+mmsize] | |
439 | ||
440 | mova m2, m0 | |
441 | PUNPCKLDQ m0, m1 | |
442 | PUNPCKHDQ m2, m1 | |
443 | ||
444 | mova m1, m3 | |
445 | PUNPCKLDQ m3, m4 | |
446 | PUNPCKHDQ m1, m4 | |
447 | ||
448 | mova [dstq ], m0 | |
449 | mova [dstq+1*mmsize], m2 | |
450 | mova [dstq+2*mmsize], m3 | |
451 | mova [dstq+3*mmsize], m1 | |
452 | ||
453 | add srcq, mmsize*2 | |
454 | add dstq, mmsize*4 | |
455 | sub lend, mmsize/2 | |
456 | jg .loop | |
457 | %if mmsize == 8 | |
458 | emms | |
459 | %endif | |
460 | REP_RET | |
461 | %endmacro | |
462 | ||
463 | INIT_MMX mmx | |
464 | %define PUNPCKLDQ punpckldq | |
465 | %define PUNPCKHDQ punpckhdq | |
466 | FLOAT_INTERLEAVE2 0 | |
467 | INIT_XMM sse | |
468 | %define PUNPCKLDQ unpcklps | |
469 | %define PUNPCKHDQ unpckhps | |
470 | FLOAT_INTERLEAVE2 5 |