; Imported Debian version 2.4.3~trusty1
; Origin: deb_ffmpeg.git, ffmpeg/libavcodec/x86/dct32.asm (blame view, commit 2ba45a60, DM)
;******************************************************************************
;* 32 point SSE-optimized DCT transform
;* Copyright (c) 2010 Vitor Sessak
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; Coefficient table, addressed in the code as [ps_cos_vec + byte offset].
; Every dd row holds four floats, i.e. 16 bytes. The rows at +96, +128 and
; +160 are stored twice in a row so that a 32-byte (ymm) load in the AVX
; version sees the same four coefficients in both 128-bit lanes.
align 32
ps_cos_vec:     dd      0.500603,  0.505471,  0.515447,  0.531043   ; +0   pass 1
                dd      0.553104,  0.582935,  0.622504,  0.674808   ; +16  pass 1
                dd    -10.190008, -3.407609, -2.057781, -1.484165   ; +32  pass 1
                dd     -1.169440, -0.972568, -0.839350, -0.744536   ; +48  pass 1
                dd      0.502419,  0.522499,  0.566944,  0.646822   ; +64  pass 2
                dd      0.788155,  1.060678,  1.722447,  5.101149   ; +80  pass 2
                dd      0.509796,  0.601345,  0.899976,  2.562916   ; +96  pass 3
                dd      0.509796,  0.601345,  0.899976,  2.562916   ;      (lane copy)
                dd      1.000000,  1.000000,  1.306563,  0.541196   ; +128 pass 4
                dd      1.000000,  1.000000,  1.306563,  0.541196   ;      (lane copy)
                dd      1.000000,  0.707107,  1.000000, -0.707107   ; +160 pass 5
                dd      1.000000,  0.707107,  1.000000, -0.707107   ;      (lane copy)
                dd      0.707107,  0.707107,  0.707107,  0.707107   ; +192 BUTTERFLY3V

; Sign-bit mask: XORing a float vector with it negates elements 2 and 3 of
; each 128-bit lane and leaves elements 0 and 1 untouched (+1, +1, -1, -1).
align 32
ps_p1p1m1m1:    dd      0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000

; BUTTERFLY a, b, coef, tmp
;   tmp = a - b
;   b   = b + a
;   a   = tmp * coef            ; i.e. (a - b) * coef
; a, b and tmp are vector registers; coef may be a register or a memory
; operand. tmp is clobbered (it is left holding a - b).
; The three-operand spelling is used for every instruction set; for non-AVX
; builds this presumably relies on x86inc's three-operand emulation.
%macro BUTTERFLY 4
    subps   %4, %1, %2
    addps   %2, %2, %1
    mulps   %1, %4, %3
%endmacro

; BUTTERFLY0 x, signmask, coef, tmp, shuf
;   x = (shuffle(x, shuf) + (x XOR signmask)) * coef
; shuf is the 8-bit immediate selecting the element permutation applied to x.
; tmp is clobbered; what it holds afterwards differs between the two paths
; (the shuffled copy on the SSE2 path, the pre-multiply sum otherwise).
%macro BUTTERFLY0 5
%if cpuflag(sse2) && notcpuflag(avx)
    ; SSE2 without AVX: pshufd shuffles into a separate destination directly.
    pshufd  %4, %1, %5
    xorps   %1, %2
    addps   %1, %4
    mulps   %1, %3
%else
    ; Plain SSE and AVX: shufps with both sources equal to x.
    shufps  %4, %1, %1, %5
    xorps   %1, %1, %2
    addps   %4, %4, %1
    mulps   %1, %4, %3
%endif
%endmacro

; BUTTERFLY2 x, signmask, coef, tmp
; BUTTERFLY0 with shuffle immediate 0x1b, which reverses the four elements
; within each 128-bit lane (3,2,1,0). Used for pass 4.
%macro BUTTERFLY2 4
    BUTTERFLY0 %1, %2, %3, %4, 0x1b
%endmacro

; BUTTERFLY3 x, signmask, coef, tmp
; BUTTERFLY0 with shuffle immediate 0xb1, which swaps the elements of each
; adjacent pair within a 128-bit lane (1,0,3,2). Used for pass 5.
%macro BUTTERFLY3 4
    BUTTERFLY0 %1, %2, %3, %4, 0xb1
%endmacro

; BUTTERFLY3V a, b, c, d, t
; Arguments are register NUMBERS (the macro prepends "m" itself).
; With A, B, C, D the values on entry and K = [ps_cos_vec+192]:
;   a = A + B
;   b = (A - B) * K
;   c = C + D
;   d = (D - C) * K             ; note the operand order differs from b
; The SWAP renames registers at assembly time: after it, the name of the
; second argument refers to the register holding A - B and the name of the
; fifth argument to the register that held B, which is then reused as scratch.
%macro BUTTERFLY3V 5
    movaps  m%5, m%1
    addps   m%1, m%2
    subps   m%5, m%2
    SWAP    %2, %5
    mulps   m%2, [ps_cos_vec+192]
    movaps  m%5, m%3
    addps   m%3, m%4
    subps   m%4, m%5
    mulps   m%4, [ps_cos_vec+192]
%endmacro

; PASS6_AND_PERMUTE
; Scalar final pass: combines the 32 floats that pass 5 left in the out buffer
; with addss and writes every result to its final position, working in place.
;
; Inputs:
;   outq            buffer holding the pass-5 results (32 floats)
;   m0,m1,m4,m5,m6  read before being loaded here; only element 0 is used.
;                   In both callers in this file they are the registers most
;                   recently stored at out+112, out+96, out+48, out+32 and
;                   out+16 respectively.
; Clobbers: m0-m7, tmpd.
; Offsets 0 and 124 (out[0] and out[31]) are never written by this macro.
;
; The order of loads and stores matters: many offsets are read as input and
; later overwritten as output, so the instructions must not be reordered.
%macro PASS6_AND_PERMUTE 0
    mov     tmpd, [outq+4]
    movss   m7, [outq+72]
    addss   m7, [outq+76]
    movss   m3, [outq+56]
    addss   m3, [outq+60]
    addss   m4, m3
    movss   m2, [outq+52]
    addss   m2, m3
    movss   m3, [outq+104]
    addss   m3, [outq+108]
    addss   m1, m3
    addss   m5, m4
    movss   [outq+ 16], m1
    movss   m1, [outq+100]
    addss   m1, m3
    movss   m3, [outq+40]
    movss   [outq+ 48], m1
    addss   m3, [outq+44]
    movss   m1, [outq+100]
    addss   m4, m3
    addss   m3, m2
    addss   m1, [outq+108]
    movss   [outq+ 40], m3
    addss   m2, [outq+36]
    movss   m3, [outq+8]
    movss   [outq+ 56], m2
    addss   m3, [outq+12]
    movss   [outq+ 32], m3
    movss   m3, [outq+80]
    movss   [outq+ 8], m5
    movss   [outq+ 80], m1
    movss   m2, [outq+52]
    movss   m5, [outq+120]
    addss   m5, [outq+124]
    movss   m1, [outq+64]
    addss   m2, [outq+60]
    addss   m0, m5
    addss   m5, [outq+116]
    mov     [outq+64], tmpd         ; plain 32-bit move: old out[1] -> out[16]
    addss   m6, m0
    addss   m1, m6
    mov     tmpd, [outq+12]
    mov     [outq+ 96], tmpd        ; old out[3] -> out[24]
    movss   [outq+ 4], m1
    movss   m1, [outq+24]
    movss   [outq+ 24], m4
    movss   m4, [outq+88]
    addss   m4, [outq+92]
    addss   m3, m4
    addss   m4, [outq+84]
    mov     tmpd, [outq+108]
    addss   m1, [outq+28]
    addss   m0, m1
    addss   m1, m5
    addss   m6, m3
    addss   m3, m0
    addss   m0, m7
    addss   m5, [outq+20]
    addss   m7, m1
    movss   [outq+ 12], m6
    mov     [outq+112], tmpd        ; old out[27] -> out[28]
    movss   m6, [outq+28]
    movss   [outq+ 28], m0
    movss   m0, [outq+36]
    movss   [outq+ 36], m7
    addss   m1, m4
    movss   m7, [outq+116]
    addss   m0, m2
    addss   m7, [outq+124]
    movss   [outq+ 72], m0
    movss   m0, [outq+44]
    addss   m2, m0
    movss   [outq+ 44], m1
    movss   [outq+ 88], m2
    addss   m0, [outq+60]
    mov     tmpd, [outq+60]
    mov     [outq+120], tmpd        ; old out[15] -> out[30]
    movss   [outq+104], m0
    addss   m4, m5
    addss   m5, [outq+68]
    movss   [outq+52], m4
    movss   [outq+60], m5
    movss   m4, [outq+68]
    movss   m5, [outq+20]
    movss   [outq+ 20], m3
    addss   m5, m7
    addss   m7, m6
    addss   m4, m5
    movss   m2, [outq+84]
    addss   m2, [outq+92]
    addss   m5, m2
    movss   [outq+ 68], m4
    addss   m2, m7
    movss   m4, [outq+76]
    movss   [outq+ 84], m2
    movss   [outq+ 76], m5
    addss   m7, m4
    addss   m6, [outq+124]
    addss   m4, m6
    addss   m6, [outq+92]
    movss   [outq+100], m4
    movss   [outq+108], m6
    movss   m6, [outq+92]
    movss   [outq+92], m7
    addss   m6, [outq+124]
    movss   [outq+116], m6
%endmacro

INIT_YMM avx
SECTION_TEXT
%if HAVE_AVX_EXTERNAL
; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
;
; 32-point DCT on floats using 256-bit registers for passes 1-5 and the
; scalar PASS6_AND_PERMUTE macro for the last pass.
;   out  receives the 32 results; also used as scratch between pass 5 and 6
;   in   32 input floats
; Both pointers are accessed with vmovaps on ymm registers, so they need to
; be 32-byte aligned. tmp is a scratch GPR used by PASS6_AND_PERMUTE.
cglobal dct32_float, 2,3,8, out, in, tmp
    ; pass 1
    ; m4 = in[0..7]; m5 = in[24..31] with the element order fully reversed
    ; (the two 128-bit halves are inserted swapped, then each is reversed).
    ; NOTE(review): vinsertf128 reads m5/m6 as a source before they are
    ; written; both halves are overwritten, so no stale data survives.
    vmovaps     m4, [inq+0]
    vinsertf128 m5, m5, [inq+96], 1
    vinsertf128 m5, m5, [inq+112], 0
    vshufps     m5, m5, m5, 0x1b
    BUTTERFLY   m4, m5, [ps_cos_vec], m6

    ; m2 = in[16..23]; m6 = in[8..15] reversed
    vmovaps     m2, [inq+64]
    vinsertf128 m6, m6, [inq+32], 1
    vinsertf128 m6, m6, [inq+48], 0
    vshufps     m6, m6, m6, 0x1b
    BUTTERFLY   m2, m6, [ps_cos_vec+32], m0

    ; pass 2

    BUTTERFLY   m5, m6, [ps_cos_vec+64], m0
    BUTTERFLY   m4, m2, [ps_cos_vec+64], m7


    ; pass 3
    ; Regroup 128-bit halves so each butterfly pairs a vector of low halves
    ; with a vector of (element-reversed) high halves.
    vperm2f128  m3, m6, m4, 0x31
    vperm2f128  m1, m6, m4, 0x20
    vshufps     m3, m3, m3, 0x1b

    BUTTERFLY   m1, m3, [ps_cos_vec+96], m6


    vperm2f128  m4, m5, m2, 0x20
    vperm2f128  m5, m5, m2, 0x31
    vshufps     m5, m5, m5, 0x1b

    BUTTERFLY   m4, m5, [ps_cos_vec+96], m6

    ; pass 4
    vmovaps     m6, [ps_p1p1m1m1+0]     ; sign mask (+,+,-,-) per lane
    vmovaps     m2, [ps_cos_vec+128]

    BUTTERFLY2  m5, m6, m2, m7
    BUTTERFLY2  m4, m6, m2, m7
    BUTTERFLY2  m1, m6, m2, m7
    BUTTERFLY2  m3, m6, m2, m7


    ; pass 5
    vshufps     m6, m6, m6, 0xcc        ; turn the mask into (+,-,+,-)
    vmovaps     m2, [ps_cos_vec+160]

    BUTTERFLY3  m5, m6, m2, m7
    BUTTERFLY3  m4, m6, m2, m7
    BUTTERFLY3  m1, m6, m2, m7
    BUTTERFLY3  m3, m6, m2, m7

    ; Write the pass-5 results to out and leave in m0, m1, m4, m5, m6 the
    ; 128-bit values PASS6_AND_PERMUTE expects to find in registers.
    vperm2f128  m6, m3, m3, 0x31        ; m6 = high half of m3 (stored at out+16)
    vmovaps     [outq], m3

    vextractf128  [outq+64], m5, 1
    vextractf128  [outq+32], m5, 0

    vextractf128  [outq+80], m4, 1
    vextractf128  [outq+48], m4, 0

    vperm2f128  m0, m1, m1, 0x31        ; m0 = high half of m1 (stored at out+112)
    vmovaps     [outq+96], m1

    ; Clear the upper ymm halves before the xmm-only scalar code that follows.
    vzeroupper

    ; pass 6, no SIMD...
INIT_XMM
    PASS6_AND_PERMUTE
    RET
%endif

%if ARCH_X86_64
; x86-64: SPILL n, k and UNSPILL n, k are both SWAP, so nothing is written
; to memory. The value is kept in a register and only the assembly-time
; register names n and k are exchanged (k is one of the high slots 8-15).
%define SPILL SWAP
%define UNSPILL SWAP

; PASS5 (x86-64)
; Pass 5 for the 16-register build. The pass-4 results still sitting in
; m0 and m4-m7 are first renamed into the m8-m15 slots, the slots are put in
; the required order, and each group of four vectors is transposed so the
; butterflies can be done between whole registers with BUTTERFLY3V.
; SWAP, PERMUTE and TRANSPOSE4x4PS come from the included x86util/x86inc
; helpers (presumably: PERMUTE renames registers, TRANSPOSE4x4PS transposes
; a 4x4 float matrix using its last argument as scratch) -- verify there.
; Results are left in m8-m15; m0 is used as scratch.
%macro PASS5 0
    nop ; FIXME code alignment
    SWAP 5, 8
    SWAP 4, 12
    SWAP 6, 14
    SWAP 7, 13
    SWAP 0, 15
    PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
    TRANSPOSE4x4PS 8, 9, 10, 11, 0
    BUTTERFLY3V    8, 9, 10, 11, 0
    addps   m10, m11
    TRANSPOSE4x4PS 12, 13, 14, 15, 0
    BUTTERFLY3V    12, 13, 14, 15, 0
    addps   m14, m15
    addps   m12, m14
    addps   m14, m13
    addps   m13, m15
%endmacro

; PASS6 (x86-64)
; Final pass for the 16-register build: takes the eight vectors left in
; m8-m15 by PASS5 and writes all 32 output floats to out.
; Uses pshuflw/pshufd, i.e. SSE2 integer shuffles on float data.
; out is written with movaps at +0x70, so it must be 16-byte aligned.
%macro PASS6 0
    SWAP 9, 12
    SWAP 11, 14
    ; Element 0 of m8..m15 goes straight to out+0x00, +0x10, ..., +0x70
    ; (m15 is stored whole, which also fills +0x74..+0x7c for now).
    ; pshuflw 0xe copies element 1 of each vector into element 0 of m0..m7.
    movss   [outq+0x00], m8
    pshuflw m0, m8, 0xe
    movss   [outq+0x10], m9
    pshuflw m1, m9, 0xe
    movss   [outq+0x20], m10
    pshuflw m2, m10, 0xe
    movss   [outq+0x30], m11
    pshuflw m3, m11, 0xe
    movss   [outq+0x40], m12
    pshuflw m4, m12, 0xe
    movss   [outq+0x50], m13
    pshuflw m5, m13, 0xe
    movss   [outq+0x60], m14
    pshuflw m6, m14, 0xe
    movaps  [outq+0x70], m15
    pshuflw m7, m15, 0xe
    ; Sum neighbouring "element 1" values (m0+m1, m1+m2, ...) and store them
    ; at out+0x08, +0x18, ..., +0x68; the last one (m7) is stored unsummed.
    addss   m0, m1
    addss   m1, m2
    movss   [outq+0x08], m0
    addss   m2, m3
    movss   [outq+0x18], m1
    addss   m3, m4
    movss   [outq+0x28], m2
    addss   m4, m5
    movss   [outq+0x38], m3
    addss   m5, m6
    movss   [outq+0x48], m4
    addss   m6, m7
    movss   [outq+0x58], m5
    movss   [outq+0x68], m6
    movss   [outq+0x78], m7

    ; Interleave the source vectors into the odd register names, then peel
    ; off elements 2 and 3 of each (movhlps / pshufd ..., 3) while rotating
    ; the register names so every iteration can be written with m0/m1.
    PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
    movhlps m0, m1
    pshufd  m1, m1, 3
    SWAP 0, 2, 4, 6, 8, 10, 12, 14
    SWAP 1, 3, 5, 7, 9, 11, 13, 15
%rep 7
    movhlps m0, m1
    pshufd  m1, m1, 3
    addss   m15, m1
    SWAP 0, 2, 4, 6, 8, 10, 12, 14
    SWAP 1, 3, 5, 7, 9, 11, 13, 15
%endrep
    ; Store the 15 odd-indexed outputs at out+4, out+12, ..., out+116, each
    ; the sum of two neighbouring registers in the rotated naming.
%assign i 4
%rep 15
    addss   m0, m1
    movss   [outq+i], m0
    SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    %assign i i+8
%endrep
%endmacro

%else ; ARCH_X86_32
; x86-32: vector registers that do not fit are kept in the output buffer.
; Slot numbers 8..15 map to out+0, out+16, ..., out+112, so the same slot
; numbers can be used as on x86-64, where they are register names.
; out must be 16-byte aligned (movaps).
%macro SPILL 2 ; xmm#, mempos
    movaps  [outq+(%2-8)*16], m%1
%endmacro
%macro UNSPILL 2
    movaps  m%1, [outq+(%2-8)*16]
%endmacro

; x86-32 has no dedicated PASS6; the scalar in-place version is used.
%define PASS6 PASS6_AND_PERMUTE
; PASS5 (x86-32)
; Pass 5 for the 8-register build: runs BUTTERFLY3 on each of the eight
; pass-4 vectors and spills every result to its slot in the out buffer,
; where PASS6_AND_PERMUTE picks them up.
; On entry m3 holds the ps_p1p1m1m1 sign mask loaded for pass 4; shufps 0xcc
; turns it into the (+,-,+,-) mask needed here. m2 holds the coefficients.
; On exit m0, m1, m4, m5 and m6 still hold the last values spilled to slots
; 15, 14, 11, 10 and 9, which PASS6_AND_PERMUTE reads from the registers.
%macro PASS5 0
    movaps  m2, [ps_cos_vec+160]
    shufps  m3, m3, 0xcc

    BUTTERFLY3 m5, m3, m2, m1
    SPILL 5, 8

    UNSPILL 1, 9
    BUTTERFLY3 m1, m3, m2, m5
    SPILL 1, 14

    BUTTERFLY3 m4, m3, m2, m5
    SPILL 4, 12

    BUTTERFLY3 m7, m3, m2, m5
    SPILL 7, 13

    UNSPILL 5, 10
    BUTTERFLY3 m5, m3, m2, m7
    SPILL 5, 10

    UNSPILL 4, 11
    BUTTERFLY3 m4, m3, m2, m7
    SPILL 4, 11

    BUTTERFLY3 m6, m3, m2, m7
    SPILL 6, 9

    BUTTERFLY3 m0, m3, m2, m7
    SPILL 0, 15
%endmacro
%endif


; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
;
; DCT32_FUNC emits dct32_float for the instruction set selected by the
; preceding INIT_XMM. Passes 1-4 are written once here; PASS5 and PASS6
; expand to the x86-64 or x86-32 variants defined above.
;   out  receives the 32 results (on x86-32 it is also the spill area)
;   in   32 input floats
; Both pointers are accessed with movaps and must be 16-byte aligned.
; SPILL/UNSPILL slot numbers (8-15) mean registers on x86-64 and 16-byte
; slots in out on x86-32, so the sequence below must keep its exact order.
%macro DCT32_FUNC 0
cglobal dct32_float, 2, 3, 16, out, in, tmp
    ; pass 1
    ; Each butterfly pairs four inputs with the mirrored four (loaded in
    ; reverse element order by LOAD_INV).

    movaps      m0, [inq+0]
    LOAD_INV    m1, [inq+112]
    BUTTERFLY   m0, m1, [ps_cos_vec], m3

    movaps      m7, [inq+64]
    LOAD_INV    m4, [inq+48]
    BUTTERFLY   m7, m4, [ps_cos_vec+32], m3

    ; pass 2
    movaps      m2, [ps_cos_vec+64]
    BUTTERFLY   m1, m4, m2, m3
    SPILL       1, 11
    SPILL       4, 8

    ; pass 1
    ; Second half of pass 1, interleaved here to free registers first.
    movaps      m1, [inq+16]
    LOAD_INV    m6, [inq+96]
    BUTTERFLY   m1, m6, [ps_cos_vec+16], m3

    movaps      m4, [inq+80]
    LOAD_INV    m5, [inq+32]
    BUTTERFLY   m4, m5, [ps_cos_vec+48], m3

    ; pass 2
    BUTTERFLY   m0, m7, m2, m3

    movaps      m2, [ps_cos_vec+80]
    BUTTERFLY   m6, m5, m2, m3

    BUTTERFLY   m1, m4, m2, m3

    ; pass 3
    ; The second operand of each butterfly is element-reversed first (0x1b).
    movaps      m2, [ps_cos_vec+96]
    shufps      m1, m1, 0x1b
    BUTTERFLY   m0, m1, m2, m3
    SPILL       0, 15
    SPILL       1, 14

    UNSPILL     0, 8
    shufps      m5, m5, 0x1b
    BUTTERFLY   m0, m5, m2, m3

    UNSPILL     1, 11
    shufps      m6, m6, 0x1b
    BUTTERFLY   m1, m6, m2, m3
    SPILL       1, 11

    shufps      m4, m4, 0x1b
    BUTTERFLY   m7, m4, m2, m3

    ; pass 4
    ; m3 = sign mask (+,+,-,-), m2 = coefficients, m1 = scratch.
    movaps      m3, [ps_p1p1m1m1+0]
    movaps      m2, [ps_cos_vec+128]

    BUTTERFLY2  m5, m3, m2, m1

    BUTTERFLY2  m0, m3, m2, m1
    SPILL       0, 9

    BUTTERFLY2  m6, m3, m2, m1
    SPILL       6, 10

    UNSPILL     0, 11
    BUTTERFLY2  m0, m3, m2, m1
    SPILL       0, 11

    BUTTERFLY2  m4, m3, m2, m1

    BUTTERFLY2  m7, m3, m2, m1

    UNSPILL     6, 14
    BUTTERFLY2  m6, m3, m2, m1

    UNSPILL     0, 15
    BUTTERFLY2  m0, m3, m2, m1

    PASS5
    PASS6
    RET
%endmacro

; LOAD_INV dst, src
; Loads four floats from src into dst with the element order reversed
; (shuffle immediate 0x1b = 3,2,1,0).
; SSE2 does it in one pshufd; plain SSE needs a load followed by shufps.
; If neither cpuflag is set, the macro emits nothing.
%macro LOAD_INV 2
%if cpuflag(sse2)
    pshufd  %1, %2, 0x1b
%elif cpuflag(sse)
    movaps  %1, %2
    shufps  %1, %1, 0x1b
%endif
%endmacro

; Instantiate the function. The plain-SSE build is only emitted on x86-32;
; the x86-64 PASS6 uses pshuflw/pshufd and is therefore only built as SSE2.
%if ARCH_X86_32
INIT_XMM sse
DCT32_FUNC
%endif
INIT_XMM sse2
DCT32_FUNC