Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | ;****************************************************************************** |
2 | ;* x86 optimized Format Conversion Utils | |
3 | ;* Copyright (c) 2008 Loren Merritt | |
4 | ;* | |
5 | ;* This file is part of FFmpeg. | |
6 | ;* | |
7 | ;* FFmpeg is free software; you can redistribute it and/or | |
8 | ;* modify it under the terms of the GNU Lesser General Public | |
9 | ;* License as published by the Free Software Foundation; either | |
10 | ;* version 2.1 of the License, or (at your option) any later version. | |
11 | ;* | |
12 | ;* FFmpeg is distributed in the hope that it will be useful, | |
13 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | ;* Lesser General Public License for more details. | |
16 | ;* | |
17 | ;* You should have received a copy of the GNU Lesser General Public | |
18 | ;* License along with FFmpeg; if not, write to the Free Software | |
19 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | ;****************************************************************************** | |
21 | ||
22 | %include "libavutil/x86/x86util.asm" | |
23 | ||
24 | SECTION_TEXT | |
25 | ||
; CVTPS2PI dst, src
; Emits the packed single-float -> packed int32 conversion that matches the
; active cpuflags: cvtps2pi when sse is set, otherwise pf2id when 3dnow is
; set. Expands to nothing when neither flag is set.
;   %1 = destination register, %2 = source operand
; Note: cvtps2pi rounds according to MXCSR whereas pf2id truncates, so the
; two variants are not bit-exact with each other on non-integral inputs.
26 | %macro CVTPS2PI 2 | |
27 | %if cpuflag(sse) | |
28 | cvtps2pi %1, %2 | |
29 | %elif cpuflag(3dnow) | |
30 | pf2id %1, %2 | |
31 | %endif | |
32 | %endmacro | |
33 | ||
34 | ;------------------------------------------------------------------------------ | |
35 | ; void ff_int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul, | |
36 | ; int len); | |
37 | ;------------------------------------------------------------------------------ | |
; INT32_TO_FLOAT_FMUL_SCALAR num_xmm_regs
; Emits int32_to_float_fmul_scalar for the active instruction set:
;     dst[i] = (float)src[i] * mul
; %1 is forwarded to cglobal as the number of XMM registers used
; (m0-m4 on the sse path, m0-m2 on the sse2 path).
;
; Each iteration converts 8 values (32 bytes) and the loop body runs before
; the counter is tested, so len is expected to be a non-zero multiple of 8.
; Results are stored with mova and the sse2 path reads src through a
; cvtdq2ps memory operand, so both buffers are expected to be 16-byte
; aligned.
38 | %macro INT32_TO_FLOAT_FMUL_SCALAR 1 | |
; UNIX64: mul arrives in xmm0 (= m0) and takes no integer argument slot, so
; only dst/src/len are declared. Elsewhere mul occupies the third slot.
39 | %if UNIX64 | |
40 | cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len | |
41 | %else | |
42 | cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len | |
43 | %endif | |
; Get mul into m0: on WIN64 the third argument is in xmm2, so rename
; m0 <-> m2; on x86_32 load it from its stack slot.
44 | %if WIN64 | |
45 | SWAP 0, 2 | |
46 | %elif ARCH_X86_32 | |
47 | movss m0, mulm | |
48 | %endif | |
; SPLATD comes from x86util.asm; it is expected to replicate the low dword
; (mul) into every lane of m0.
49 | SPLATD m0 | |
; len is a 32-bit int: scale it with a 32-bit shift so that, on x86-64, the
; upper half of lenq (not defined by the caller for an int argument) is
; zeroed before lenq is used as a 64-bit byte offset.
50 | shl lend, 2 | |
; Point src/dst at the end of the buffers and index with a negative byte
; offset that counts up to zero.
51 | add srcq, lenq | |
52 | add dstq, lenq | |
53 | neg lenq | |
54 | .loop: | |
55 | %if cpuflag(sse2) | |
56 | cvtdq2ps m1, [srcq+lenq ] | |
57 | cvtdq2ps m2, [srcq+lenq+16] | |
58 | %else | |
; Without sse2, convert two int32 at a time into the low half of four
; registers, then merge pairs into full 4-float vectors with movlhps.
59 | cvtpi2ps m1, [srcq+lenq ] | |
60 | cvtpi2ps m3, [srcq+lenq+ 8] | |
61 | cvtpi2ps m2, [srcq+lenq+16] | |
62 | cvtpi2ps m4, [srcq+lenq+24] | |
63 | movlhps m1, m3 | |
64 | movlhps m2, m4 | |
65 | %endif | |
66 | mulps m1, m0 | |
67 | mulps m2, m0 | |
68 | mova [dstq+lenq ], m1 | |
69 | mova [dstq+lenq+16], m2 | |
; 32 bytes (8 values) consumed; continue while the offset is still negative.
70 | add lenq, 32 | |
71 | jl .loop | |
72 | REP_RET | |
73 | %endmacro | |
74 | ||
75 | INIT_XMM sse | |
76 | INT32_TO_FLOAT_FMUL_SCALAR 5 | |
77 | INIT_XMM sse2 | |
78 | INT32_TO_FLOAT_FMUL_SCALAR 3 | |
79 | ||
80 | ||
81 | ;------------------------------------------------------------------------------ | |
82 | ; void ff_float_to_int16(int16_t *dst, const float *src, long len); | |
83 | ;------------------------------------------------------------------------------ | |
; FLOAT_TO_INT16 num_xmm_regs
; Emits float_to_int16 for the active instruction set: converts len floats
; from src to int32 and packs them to int16 with signed saturation
; (packssdw) into dst. %1 is forwarded to cglobal as the XMM register count.
;
; Each iteration produces 8 samples (16 output bytes) and the loop body runs
; before the counter is tested, so len is expected to be a non-zero multiple
; of 8. Stores use mova, so dst is expected to be aligned to mmsize.
; NOTE(review): len is declared long and used as a full-width register
; (lenq) — confirm callers provide a full-width value on targets where long
; is narrower than a pointer.
84 | %macro FLOAT_TO_INT16 1 | |
85 | cglobal float_to_int16, 3, 3, %1, dst, src, len | |
; lenq = 2*len = output size in bytes; the float input is twice that, hence
; the 2*lenq scaling on src. Both pointers are moved to the end of their
; buffers and lenq becomes a negative offset counting up to zero.
86 | add lenq, lenq | |
87 | lea srcq, [srcq+2*lenq] | |
88 | add dstq, lenq | |
89 | neg lenq | |
90 | .loop: | |
91 | %if cpuflag(sse2) | |
92 | cvtps2dq m0, [srcq+2*lenq ] | |
93 | cvtps2dq m1, [srcq+2*lenq+16] | |
94 | packssdw m0, m1 | |
95 | mova [dstq+lenq], m0 | |
96 | %else | |
; MMX-register path: four 2-float conversions, packed pairwise into two
; 4-sample registers.
97 | CVTPS2PI m0, [srcq+2*lenq ] | |
98 | CVTPS2PI m1, [srcq+2*lenq+ 8] | |
99 | CVTPS2PI m2, [srcq+2*lenq+16] | |
100 | CVTPS2PI m3, [srcq+2*lenq+24] | |
101 | packssdw m0, m1 | |
102 | packssdw m2, m3 | |
103 | mova [dstq+lenq ], m0 | |
104 | mova [dstq+lenq+8], m2 | |
105 | %endif | |
; 16 output bytes done; loop while the offset is still negative.
106 | add lenq, 16 | |
107 | js .loop | |
; MMX registers were used: clear the MMX state before returning.
108 | %if mmsize == 8 | |
109 | emms | |
110 | %endif | |
111 | REP_RET | |
112 | %endmacro | |
113 | ||
114 | INIT_XMM sse2 | |
115 | FLOAT_TO_INT16 2 | |
116 | INIT_MMX sse | |
117 | FLOAT_TO_INT16 0 | |
118 | INIT_MMX 3dnow | |
119 | FLOAT_TO_INT16 0 | |
120 | ||
121 | ;------------------------------------------------------------------------------ | |
122 | ; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step); | |
123 | ;------------------------------------------------------------------------------ | |
; FLOAT_TO_INT16_STEP num_xmm_regs
; Emits float_to_int16_step for the active instruction set: converts len
; floats from src to saturated int16 and writes sample k to
; dst + k*step*2 bytes, i.e. step is a stride in int16 elements.
; %1 is forwarded to cglobal as the XMM register count.
;
; Each iteration handles 8 samples and the loop body runs before the counter
; is tested, so len is expected to be a non-zero multiple of 8.
; NOTE(review): len and step are declared long and used as full-width
; registers — confirm on targets where long is narrower than a pointer.
124 | %macro FLOAT_TO_INT16_STEP 1 | |
; Four arguments are loaded; step3, v1 and v2 are scratch registers.
125 | cglobal float_to_int16_step, 4, 7, %1, dst, src, len, step, step3, v1, v2 | |
; lenq = 2*len; src is moved to the end of the input (4*len bytes) and read
; with the negative offset 2*lenq. dst is not pre-advanced: it is stepped
; forward inside the loop.
126 | add lenq, lenq | |
127 | lea srcq, [srcq+2*lenq] | |
128 | lea step3q, [stepq*3] | |
129 | neg lenq | |
130 | .loop: | |
131 | %if cpuflag(sse2) | |
132 | cvtps2dq m0, [srcq+2*lenq ] | |
133 | cvtps2dq m1, [srcq+2*lenq+16] | |
134 | packssdw m0, m1 | |
; m0 = samples 0..7 as words. Pull them out two at a time through GPRs:
; v1d = samples 0,1 and v2d = samples 2,3 (low word first).
135 | movd v1d, m0 | |
136 | psrldq m0, 4 | |
137 | movd v2d, m0 | |
138 | psrldq m0, 4 | |
; Store samples 0 and 2, shift the high words down, then store 1 and 3.
139 | mov [dstq], v1w | |
140 | mov [dstq+stepq*4], v2w | |
141 | shr v1d, 16 | |
142 | shr v2d, 16 | |
143 | mov [dstq+stepq*2], v1w | |
144 | mov [dstq+step3q*2], v2w | |
; Advance dst by four strided samples.
145 | lea dstq, [dstq+stepq*8] | |
; Same sequence for samples 4..7.
146 | movd v1d, m0 | |
147 | psrldq m0, 4 | |
148 | movd v2d, m0 | |
149 | mov [dstq], v1w | |
150 | mov [dstq+stepq*4], v2w | |
151 | shr v1d, 16 | |
152 | shr v2d, 16 | |
153 | mov [dstq+stepq*2], v1w | |
154 | mov [dstq+step3q*2], v2w | |
155 | lea dstq, [dstq+stepq*8] | |
156 | %else | |
; MMX-register path: m0 = samples 0..3, m2 = samples 4..7 after packing.
157 | CVTPS2PI m0, [srcq+2*lenq ] | |
158 | CVTPS2PI m1, [srcq+2*lenq+ 8] | |
159 | CVTPS2PI m2, [srcq+2*lenq+16] | |
160 | CVTPS2PI m3, [srcq+2*lenq+24] | |
161 | packssdw m0, m1 | |
162 | packssdw m2, m3 | |
; Scatter samples 0..3 from m0.
163 | movd v1d, m0 | |
164 | psrlq m0, 32 | |
165 | movd v2d, m0 | |
166 | mov [dstq], v1w | |
167 | mov [dstq+stepq*4], v2w | |
168 | shr v1d, 16 | |
169 | shr v2d, 16 | |
170 | mov [dstq+stepq*2], v1w | |
171 | mov [dstq+step3q*2], v2w | |
172 | lea dstq, [dstq+stepq*8] | |
; Scatter samples 4..7 from m2.
173 | movd v1d, m2 | |
174 | psrlq m2, 32 | |
175 | movd v2d, m2 | |
176 | mov [dstq], v1w | |
177 | mov [dstq+stepq*4], v2w | |
178 | shr v1d, 16 | |
179 | shr v2d, 16 | |
180 | mov [dstq+stepq*2], v1w | |
181 | mov [dstq+step3q*2], v2w | |
182 | lea dstq, [dstq+stepq*8] | |
183 | %endif | |
; lenq counts 2 per sample: 16 == 8 samples. Loop while still negative.
184 | add lenq, 16 | |
185 | js .loop | |
; MMX registers were used: clear the MMX state before returning.
186 | %if mmsize == 8 | |
187 | emms | |
188 | %endif | |
189 | REP_RET | |
190 | %endmacro | |
191 | ||
192 | INIT_XMM sse2 | |
193 | FLOAT_TO_INT16_STEP 2 | |
194 | INIT_MMX sse | |
195 | FLOAT_TO_INT16_STEP 0 | |
196 | INIT_MMX 3dnow | |
197 | FLOAT_TO_INT16_STEP 0 | |
198 | ||
199 | ;------------------------------------------------------------------------------- | |
200 | ; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len); | |
201 | ;------------------------------------------------------------------------------- | |
; FLOAT_TO_INT16_INTERLEAVE2
; Emits float_to_int16_interleave2 for the active instruction set: converts
; the two float arrays src[0] and src[1] to saturated int16 and writes them
; interleaved (src[0][i], src[1][i], ...) to dst.
;
; Each iteration handles 4 samples per channel (16 output bytes) and the
; loop body runs before the counter is tested, so len is expected to be a
; non-zero multiple of 4. dst is written with mova, so it is expected to be
; aligned to mmsize.
; NOTE(review): len is declared long and read as a full-width register
; (r2q) — confirm on targets where long is narrower than a pointer.
202 | %macro FLOAT_TO_INT16_INTERLEAVE2 0 | |
; Register naming: the second argument (the pointer array) is called src0
; and the third argument register (which holds len on entry) is called
; src1; len itself lives in a fourth register.
203 | cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0, src1, len | |
; lenq = 4*len, read from r2q before that register is reused as src1.
; 4*len is both the size of one input channel and of the output in bytes.
204 | lea lenq, [4*r2q] | |
; Load the two channel pointers; src0q is overwritten last because it is
; the base of the pointer array.
205 | mov src1q, [src0q+gprsize] | |
206 | mov src0q, [src0q] | |
; Point everything at the end and index with a negative offset.
207 | add dstq, lenq | |
208 | add src0q, lenq | |
209 | add src1q, lenq | |
210 | neg lenq | |
211 | .loop: | |
212 | %if cpuflag(sse2) | |
213 | cvtps2dq m0, [src0q+lenq] | |
214 | cvtps2dq m1, [src1q+lenq] | |
; After packing, the low half of m0 holds channel 0 and the high half holds
; channel 1; copy the high half down and interleave the words.
215 | packssdw m0, m1 | |
216 | movhlps m1, m0 | |
217 | punpcklwd m0, m1 | |
218 | mova [dstq+lenq], m0 | |
219 | %else | |
; MMX-register path: m0 = 4 samples of channel 0, m2 = 4 samples of
; channel 1; interleave low and high word pairs separately.
220 | CVTPS2PI m0, [src0q+lenq ] | |
221 | CVTPS2PI m1, [src0q+lenq+8] | |
222 | CVTPS2PI m2, [src1q+lenq ] | |
223 | CVTPS2PI m3, [src1q+lenq+8] | |
224 | packssdw m0, m1 | |
225 | packssdw m2, m3 | |
226 | mova m1, m0 | |
227 | punpcklwd m0, m2 | |
228 | punpckhwd m1, m2 | |
229 | mova [dstq+lenq ], m0 | |
230 | mova [dstq+lenq+8], m1 | |
231 | %endif | |
; 16 bytes consumed per channel / produced; loop while still negative.
232 | add lenq, 16 | |
233 | js .loop | |
; MMX registers were used: clear the MMX state before returning.
234 | %if mmsize == 8 | |
235 | emms | |
236 | %endif | |
237 | REP_RET | |
238 | %endmacro | |
239 | ||
240 | INIT_MMX 3dnow | |
241 | FLOAT_TO_INT16_INTERLEAVE2 | |
242 | INIT_MMX sse | |
243 | FLOAT_TO_INT16_INTERLEAVE2 | |
244 | INIT_XMM sse2 | |
245 | FLOAT_TO_INT16_INTERLEAVE2 | |
246 | ||
247 | ;----------------------------------------------------------------------------- | |
248 | ; void ff_float_to_int16_interleave6(int16_t *dst, const float **src, int len) | |
249 | ;----------------------------------------------------------------------------- | |
; FLOAT_TO_INT16_INTERLEAVE6
; Emits float_to_int16_interleave6 for the active instruction set: converts
; six float arrays src[0..5] to saturated int16 and writes them interleaved
; to dst. Always uses MMX registers (mm0-mm5), hence the unconditional emms.
;
; Each iteration consumes 2 samples from every channel and writes 24 bytes;
; the loop body runs before the counter is tested, so len is expected to be
; a positive even number.
250 | %macro FLOAT_TO_INT16_INTERLEAVE6 0 | |
; Only dst and src are loaded as arguments; the remaining names are scratch
; registers. The third argument (len) is fetched manually below.
251 | cglobal float_to_int16_interleave6, 2, 8, 0, dst, src, src1, src2, src3, src4, src5, len | |
; x86-64: copy len out of the third argument register before that register
; is reused as src1. x86-32: keep len in its stack slot and make "lend"
; refer to that memory operand.
252 | %if ARCH_X86_64 | |
253 | mov lend, r2d | |
254 | %else | |
255 | %define lend dword r2m | |
256 | %endif | |
; Load the six channel pointers (srcq last, since it is the array base).
257 | mov src1q, [srcq+1*gprsize] | |
258 | mov src2q, [srcq+2*gprsize] | |
259 | mov src3q, [srcq+3*gprsize] | |
260 | mov src4q, [srcq+4*gprsize] | |
261 | mov src5q, [srcq+5*gprsize] | |
262 | mov srcq, [srcq] | |
; Turn channels 1..5 into offsets relative to channel 0 so that advancing
; srcq alone advances all six read positions.
263 | sub src1q, srcq | |
264 | sub src2q, srcq | |
265 | sub src3q, srcq | |
266 | sub src4q, srcq | |
267 | sub src5q, srcq | |
268 | .loop: | |
; Two samples from each channel; call the channels a..f.
269 | CVTPS2PI mm0, [srcq] | |
270 | CVTPS2PI mm1, [srcq+src1q] | |
271 | CVTPS2PI mm2, [srcq+src2q] | |
272 | CVTPS2PI mm3, [srcq+src3q] | |
273 | CVTPS2PI mm4, [srcq+src4q] | |
274 | CVTPS2PI mm5, [srcq+src5q] | |
; Pack to words: mm0 = a0 a1 d0 d1, mm1 = b0 b1 e0 e1, mm2 = c0 c1 f0 f1.
275 | packssdw mm0, mm3 | |
276 | packssdw mm1, mm4 | |
277 | packssdw mm2, mm5 | |
; PSWAPD comes from x86util.asm; the lane notes below assume it writes the
; source with its two dwords exchanged.
; mm3 = d0 d1 a0 a1
278 | PSWAPD mm3, mm0 | |
; mm0 = a0 b0 a1 b1, mm1 = e0 f0 e1 f1, mm2 = c0 d0 c1 d1
279 | punpcklwd mm0, mm1 | |
280 | punpckhwd mm1, mm2 | |
281 | punpcklwd mm2, mm3 | |
; mm3 = a1 b1 a0 b0
282 | PSWAPD mm3, mm0 | |
; mm0 = a0 b0 c0 d0, mm2 = c1 d1 e1 f1, mm1 = e0 f0 a1 b1
283 | punpckldq mm0, mm2 | |
284 | punpckhdq mm2, mm1 | |
285 | punpckldq mm1, mm3 | |
; Output order is mm0, mm1, mm2 (note the store at +16 precedes +8).
286 | movq [dstq ], mm0 | |
287 | movq [dstq+16], mm2 | |
288 | movq [dstq+ 8], mm1 | |
; 2 floats (8 bytes) per channel in, 12 int16 (24 bytes) out.
289 | add srcq, 8 | |
290 | add dstq, 24 | |
291 | sub lend, 2 | |
292 | jg .loop | |
293 | emms | |
294 | RET | |
295 | %endmacro ; FLOAT_TO_INT16_INTERLEAVE6 | |
296 | ||
297 | INIT_MMX sse | |
298 | FLOAT_TO_INT16_INTERLEAVE6 | |
299 | INIT_MMX 3dnow | |
300 | FLOAT_TO_INT16_INTERLEAVE6 | |
301 | INIT_MMX 3dnowext | |
302 | FLOAT_TO_INT16_INTERLEAVE6 | |
303 | ||
304 | ;----------------------------------------------------------------------------- | |
305 | ; void ff_float_interleave6(float *dst, const float **src, unsigned int len); | |
306 | ;----------------------------------------------------------------------------- | |
307 | ||
; FLOAT_INTERLEAVE6 num_xmm_regs
; Emits float_interleave6 for the active instruction set: reads six float
; arrays src[0..5] and writes them to dst through a register shuffle, with
; no format conversion. %1 is forwarded to cglobal as the XMM register
; count.
;
; Each iteration consumes mmsize bytes from every channel (mmsize/4 floats)
; and writes 6*mmsize bytes; the loop body runs before the counter is
; tested, so len is expected to be a non-zero multiple of mmsize/4. The sse
; path uses movaps for all loads and stores, so every channel pointer and
; dst are expected to be 16-byte aligned.
308 | %macro FLOAT_INTERLEAVE6 1 | |
; Only dst and src are loaded as arguments; the remaining names are scratch
; registers. The third argument (len) is fetched manually below.
309 | cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2, src3, src4, src5, len | |
; x86-64: copy len out of the third argument register before that register
; is reused as src1. x86-32: keep len in its stack slot and make "lend"
; refer to that memory operand.
310 | %if ARCH_X86_64 | |
311 | mov lend, r2d | |
312 | %else | |
313 | %define lend dword r2m | |
314 | %endif | |
; Load the six channel pointers (srcq last, since it is the array base).
315 | mov src1q, [srcq+1*gprsize] | |
316 | mov src2q, [srcq+2*gprsize] | |
317 | mov src3q, [srcq+3*gprsize] | |
318 | mov src4q, [srcq+4*gprsize] | |
319 | mov src5q, [srcq+5*gprsize] | |
320 | mov srcq, [srcq] | |
; Turn channels 1..5 into offsets relative to channel 0 so that advancing
; srcq alone advances all six read positions.
321 | sub src1q, srcq | |
322 | sub src2q, srcq | |
323 | sub src3q, srcq | |
324 | sub src4q, srcq | |
325 | sub src5q, srcq | |
326 | .loop: | |
327 | %if cpuflag(sse) | |
; Four floats from each channel.
328 | movaps m0, [srcq] | |
329 | movaps m1, [srcq+src1q] | |
330 | movaps m2, [srcq+src2q] | |
331 | movaps m3, [srcq+src3q] | |
332 | movaps m4, [srcq+src4q] | |
333 | movaps m5, [srcq+src5q] | |
334 | ||
; SBUTTERFLYPS comes from x86util.asm (third operand is a temporary). It
; presumably interleaves each channel pair into a low/high result — verify
; against its definition.
335 | SBUTTERFLYPS 0, 1, 6 | |
336 | SBUTTERFLYPS 2, 3, 6 | |
337 | SBUTTERFLYPS 4, 5, 6 | |
338 | ||
; Combine m0/m2/m4 into the first three output vectors (48 bytes).
339 | movaps m6, m4 | |
340 | shufps m4, m0, 0xe4 | |
341 | movlhps m0, m2 | |
342 | movhlps m6, m2 | |
343 | movaps [dstq ], m0 | |
344 | movaps [dstq+16], m4 | |
345 | movaps [dstq+32], m6 | |
346 | ||
; Same combination on m1/m3/m5 for the last three output vectors.
347 | movaps m6, m5 | |
348 | shufps m5, m1, 0xe4 | |
349 | movlhps m1, m3 | |
350 | movhlps m6, m3 | |
351 | movaps [dstq+48], m1 | |
352 | movaps [dstq+64], m5 | |
353 | movaps [dstq+80], m6 | |
354 | %else ; mmx | |
; Two floats from each channel.
355 | movq m0, [srcq] | |
356 | movq m1, [srcq+src1q] | |
357 | movq m2, [srcq+src2q] | |
358 | movq m3, [srcq+src3q] | |
359 | movq m4, [srcq+src4q] | |
360 | movq m5, [srcq+src5q] | |
361 | ||
; SBUTTERFLY comes from x86util.asm; presumably the first register of each
; pair ends up with the low dwords interleaved and the second with the high
; dwords — verify against its definition. The even registers are stored
; first, then the odd ones.
362 | SBUTTERFLY dq, 0, 1, 6 | |
363 | SBUTTERFLY dq, 2, 3, 6 | |
364 | SBUTTERFLY dq, 4, 5, 6 | |
365 | movq [dstq ], m0 | |
366 | movq [dstq+ 8], m2 | |
367 | movq [dstq+16], m4 | |
368 | movq [dstq+24], m1 | |
369 | movq [dstq+32], m3 | |
370 | movq [dstq+40], m5 | |
371 | %endif | |
; mmsize bytes read per channel, 6*mmsize written, mmsize/4 samples done.
372 | add srcq, mmsize | |
373 | add dstq, mmsize*6 | |
374 | sub lend, mmsize/4 | |
375 | jg .loop | |
; MMX registers were used: clear the MMX state before returning.
376 | %if mmsize == 8 | |
377 | emms | |
378 | %endif | |
379 | REP_RET | |
380 | %endmacro | |
381 | ||
382 | INIT_MMX mmx | |
383 | FLOAT_INTERLEAVE6 0 | |
384 | INIT_XMM sse | |
385 | FLOAT_INTERLEAVE6 7 | |
386 | ||
387 | ;----------------------------------------------------------------------------- | |
388 | ; void ff_float_interleave2(float *dst, const float **src, unsigned int len); | |
389 | ;----------------------------------------------------------------------------- | |
390 | ||
; FLOAT_INTERLEAVE2 num_xmm_regs
; Emits float_interleave2 for the active instruction set: reads the two
; float arrays src[0] and src[1] and writes them interleaved to dst, with no
; format conversion. %1 is forwarded to cglobal as the XMM register count.
;
; PUNPCKLDQ / PUNPCKHDQ are not instructions here: they must be %define'd
; before the macro is instantiated (see the instantiations below) to the
; dword-interleave instruction of the target register type.
;
; Each iteration consumes 2*mmsize bytes per channel (mmsize/2 floats) and
; writes 4*mmsize bytes; the loop body runs before the counter is tested, so
; len is expected to be a non-zero multiple of mmsize/2. All accesses use
; mova, so both channels and dst are expected to be aligned to mmsize.
391 | %macro FLOAT_INTERLEAVE2 1 | |
; Three arguments are loaded (dst, src, len); src1 is a scratch register.
392 | cglobal float_interleave2, 3, 4, %1, dst, src, len, src1 | |
; Load both channel pointers, then keep channel 1 as an offset from
; channel 0 so that only srcq needs to be advanced.
393 | mov src1q, [srcq+gprsize] | |
394 | mov srcq, [srcq ] | |
395 | sub src1q, srcq | |
396 | .loop: | |
; m0/m3 = two consecutive vectors of channel 0, m1/m4 = same for channel 1.
397 | mova m0, [srcq ] | |
398 | mova m1, [srcq+src1q ] | |
399 | mova m3, [srcq +mmsize] | |
400 | mova m4, [srcq+src1q+mmsize] | |
401 | ||
; First vector pair: m0 = low interleave, m2 = high interleave.
402 | mova m2, m0 | |
403 | PUNPCKLDQ m0, m1 | |
404 | PUNPCKHDQ m2, m1 | |
405 | ||
; Second vector pair: m3 = low interleave, m1 = high interleave.
406 | mova m1, m3 | |
407 | PUNPCKLDQ m3, m4 | |
408 | PUNPCKHDQ m1, m4 | |
409 | ||
410 | mova [dstq ], m0 | |
411 | mova [dstq+1*mmsize], m2 | |
412 | mova [dstq+2*mmsize], m3 | |
413 | mova [dstq+3*mmsize], m1 | |
414 | ||
; 2*mmsize bytes read per channel, 4*mmsize written, mmsize/2 samples done.
415 | add srcq, mmsize*2 | |
416 | add dstq, mmsize*4 | |
417 | sub lend, mmsize/2 | |
418 | jg .loop | |
; MMX registers were used: clear the MMX state before returning.
419 | %if mmsize == 8 | |
420 | emms | |
421 | %endif | |
422 | REP_RET | |
423 | %endmacro | |
424 | ||
; MMX build: integer dword unpack instructions.
425 | INIT_MMX mmx | |
426 | %define PUNPCKLDQ punpckldq | |
427 | %define PUNPCKHDQ punpckhdq | |
428 | FLOAT_INTERLEAVE2 0 | |
; SSE build: the single-precision unpack instructions play the same role.
429 | INIT_XMM sse | |
430 | %define PUNPCKLDQ unpcklps | |
431 | %define PUNPCKHDQ unpckhps | |
432 | FLOAT_INTERLEAVE2 5 |