; Provenance (repository web-viewer header, kept as comments):
;   Imported Debian version 2.4.3~trusty1
;   [deb_ffmpeg.git] / ffmpeg / libavcodec / x86 / imdct36.asm
;   CommitLineData 2ba45a60 DM
;******************************************************************************
;* 36 point SSE-optimized IMDCT transform
;* Copyright (c) 2011 Vitor Sessak
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

align 16
; Lane masks for andps: ~0 keeps a lane, 0 clears it.  The first dd is the
; lowest lane of the vector.
ps_mask:  dd 0, ~0, ~0, ~0
ps_mask2: dd 0, ~0,  0, ~0
ps_mask3: dd 0,  0,  0, ~0
ps_mask4: dd 0, ~0,  0,  0

; Per-lane multipliers for the single-block transform (imdct36_float).
; Numerically these are +/-0.5, +/-1.0 and cosines/sines of multiples of 10
; degrees (0.8660254038 = cos 30, 0.1736481777 = cos 80, 0.3420201433 = cos 70,
; 0.7660444431 = cos 40, 0.9396926208 = cos 20, 0.9848077530 = cos 10,
; 0.6427876097 = cos 50).
ps_val1:  dd          -0.5,          -0.5, -0.8660254038, -0.8660254038
ps_val2:  dd           1.0,           1.0,  0.8660254038,  0.8660254038
ps_val3:  dd  0.1736481777,  0.1736481777,  0.3420201433,  0.3420201433
ps_val4:  dd -0.7660444431, -0.7660444431,  0.8660254038,  0.8660254038
ps_val5:  dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
ps_val6:  dd           0.5,           0.5, -0.6427876097, -0.6427876097
ps_val7:  dd           1.0,           1.0, -0.6427876097, -0.6427876097

; Sign-bit masks for xorps: 0x80000000 negates a lane, 0 leaves it unchanged.
ps_p1p1m1m1: dd 0,          0, 0x80000000, 0x80000000
ps_p1m1p1m1: dd 0, 0x80000000,          0, 0x80000000

; Coefficient rows for BUTTERF/BUTTERF2 (non-SSE3 path), one 16-byte row per
; call, selected by the byte offset passed as the macro's third argument.
; The magnitudes in lanes 1 and 3 numerically equal 0.5/cos((2i+1)*5 degrees).
ps_cosh:  dd 1.0, 0.50190991877167369479,  1.0,  5.73685662283492756461
          dd 1.0, 0.51763809020504152469,  1.0,  1.93185165257813657349
          dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
          dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
          dd 1.0, 0.70710678118654752439,  0.0,  0.0

; Same rows with the odd lanes (1 and 3) negated.  The SSE3 path finishes with
; addsubps, which subtracts in even lanes and adds in odd lanes, so the sign
; flip is folded into the table instead of a separate xorps.
ps_cosh_sse3: dd 1.0, -0.50190991877167369479,  1.0, -5.73685662283492756461
              dd 1.0, -0.51763809020504152469,  1.0, -1.93185165257813657349
              dd 1.0, -0.55168895948124587824, -1.0,  1.18310079157624925896
              dd 1.0, -0.61038729438072803416, -1.0,  0.87172339781054900991
              dd 1.0, -0.70710678118654752439,  0.0,  0.0

; Broadcast constants for four_imdct36_float; entry k lives at costabs + 16*k.
; Entries 0-8 are +/- cosines of multiples of 10 degrees, entries 9-17 are
; numerically 0.5/cos((2i+1)*5 degrees) for i = 0..8.
costabs:  times 4 dd  0.98480773    ;  0
          times 4 dd  0.93969262    ;  1
          times 4 dd  0.86602539    ;  2
          times 4 dd -0.76604444    ;  3
          times 4 dd -0.64278764    ;  4
          times 4 dd  0.50000000    ;  5
          times 4 dd -0.50000000    ;  6
          times 4 dd -0.34202015    ;  7
          times 4 dd -0.17364818    ;  8
          times 4 dd  0.50190992    ;  9
          times 4 dd  0.51763808    ; 10
          times 4 dd  0.55168896    ; 11
          times 4 dd  0.61038726    ; 12
          times 4 dd  0.70710677    ; 13
          times 4 dd  0.87172341    ; 14
          times 4 dd  1.18310082    ; 15
          times 4 dd  1.93185163    ; 16
          times 4 dd  5.73685646    ; 17

; Used below to form the byte offsets of the output stores in imdct36_float.
%define SBLIMIT 32
SECTION_TEXT

; PSHUFD dst, src, imm8
; Dword shuffle of src into dst using the immediate selector.
; pshufd is used when the current cpuflags include sse2 but not avx; in every
; other case (plain SSE, or AVX) shufps with src as both inputs is used, which
; selects the same lanes.
; NOTE(review): the 4-operand shufps form presumably relies on the instruction
; wrappers from x86inc when assembling for non-AVX targets - confirm.
%macro PSHUFD 3
%if cpuflag(sse2) && notcpuflag(avx)
    pshufd %1, %2, %3
%else
    shufps %1, %2, %2, %3
%endif
%endmacro

; BUILDINVHIGHLOW dst, x, y
; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x3,x4,y1,y2}
; AVX: a single 3-operand shufps.
; Otherwise: movlhps puts y's low half into dst's high half, then movhlps puts
; x's high half into dst's low half; dst must therefore not alias %2.
%macro BUILDINVHIGHLOW 3
%if cpuflag(avx)
    shufps  %1, %2, %3, 0x4e
%else
    movlhps %1, %3
    movhlps %1, %2
%endif
%endmacro

; ROTLEFT dst, x, y
; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x4,y1,y2,y3}
; SSSE3 and later: palignr by 12 bytes over the pair y:x.
; Otherwise: build {x3,x4,y1,y2} first, then shufps 0x99 picks lanes 1,2 of
; that intermediate and lanes 1,2 of y.
%macro ROTLEFT 3
%if cpuflag(ssse3)
    palignr  %1, %3, %2, 12
%else
    BUILDINVHIGHLOW %1, %2, %3
    shufps  %1, %1, %3, 0x99
%endif
%endmacro

; INVERTHL dst, src
; dst = src with its 64-bit halves exchanged: {s0,s1,s2,s3} -> {s2,s3,s0,s1}.
; With sse2 or later this is one shuffle (selector 0x4e); on plain SSE it is
; done with movhlps + movlhps.
%macro INVERTHL 2
%if cpuflag(sse2)
    PSHUFD  %1, %2, 0x4e
%else
    movhlps %1, %2
    movlhps %1, %2
%endif
%endmacro

; BUTTERF data, scratch, coef_offset
;   %1  in/out vector {a0,a1,a2,a3}
;   %2  scratch register (clobbered)
;   %3  byte offset of the coefficient row in ps_cosh / ps_cosh_sse3
;
; Step 1: %1 = {a0+a2, a1+a3, a0-a2, a1-a3}
;         (half-swapped copy plus the original with its upper lanes negated).
; Step 2: multiply by the coefficient row, giving p = {p0,p1,p2,p3}.
; Step 3: combine adjacent lanes: %1 = {p0+p1, p0-p1, p2+p3, p2-p3}.
;         The SSE3 path gets the signs from addsubps together with the
;         pre-negated odd lanes of ps_cosh_sse3; the fallback path negates the
;         odd lanes with ps_p1m1p1m1 and uses a plain add.
%macro BUTTERF 3
    INVERTHL %2, %1
    xorps    %1, [ps_p1p1m1m1]
    addps    %1, %2
%if cpuflag(sse3)
    mulps    %1, %1, [ps_cosh_sse3 + %3]
    PSHUFD   %2, %1, 0xb1
    addsubps %1, %1, %2
%else
    mulps    %1, [ps_cosh + %3]
    PSHUFD   %2, %1, 0xb1
    xorps    %1, [ps_p1m1p1m1]
    addps    %1, %2
%endif
%endmacro

; BUTTERF2 data, scratch, coef_offset
;   %1  in/out vector, %2 scratch (clobbered), %3 byte offset of the
;   coefficient row in ps_cosh / ps_cosh_sse3.
;
; Reduced form of BUTTERF without the initial half-swap/add step: %1 is
; multiplied by the coefficient row and only the low pair is exchanged
; (selector 0xe1), so the low two lanes become {p0+p1, p0-p1}.
; The upper two lanes are not a meaningful butterfly result; the only caller in
; this file uses the row at offset 64, whose upper coefficients are 0.0.
%macro BUTTERF2 3
%if cpuflag(sse3)
    mulps    %1, %1, [ps_cosh_sse3 + %3]
    PSHUFD   %2, %1, 0xe1
    addsubps %1, %1, %2
%else
    mulps    %1, [ps_cosh + %3]
    PSHUFD   %2, %1, 0xe1
    xorps    %1, [ps_p1m1p1m1]
    addps    %1, %2
%endif
%endmacro

; STORE src, scratch, base, stride
; Scatter the four floats of %1 to [%3], [%3+%4], [%3+2*%4], [%3+3*%4]
; (lane i goes to base + i*stride, stride in bytes).
; Clobbers %2, and leaves %1 with each adjacent lane pair swapped.
%macro STORE 4
    movhlps %2, %1                  ; %2[0] = lane 2
    movss   [%3       ], %1         ; lane 0
    movss   [%3 + 2*%4], %2         ; lane 2
    shufps  %1, %1, 0xb1            ; swap lanes 0<->1 and 2<->3
    movss   [%3 +   %4], %1         ; lane 1
    movhlps %2, %1                  ; %2[0] = lane 3
    movss   [%3 + 3*%4], %2         ; lane 3
%endmacro

; LOAD dst, scratch, base, stride
; Gather four floats spaced %4 bytes apart starting at %3 into %1:
;   %1 = {[%3], [%3+%4], [%3+2*%4], [%3+3*%4]}
; Each access reads 8 bytes (movlps/movhps), i.e. the wanted float plus the one
; after it; the final shufps 0x88 keeps only the even lanes of the two
; temporaries.  Clobbers %2.
%macro LOAD 4
    movlps  %1, [%3       ]
    movhps  %1, [%3 +   %4]
    movlps  %2, [%3 + 2*%4]
    movhps  %2, [%3 + 3*%4]
    shufps  %1, %2, 0x88
%endmacro

; LOADA64 dst, addr
; Load 16 bytes from [addr] into dst without requiring 16-byte alignment.
; AVX uses one unaligned move; other targets load the two 8-byte halves
; separately with movlps/movhps.
; NOTE(review): the name suggests the source is expected to be 8-byte aligned;
; nothing in this file enforces that - confirm against callers.
%macro LOADA64 2
%if cpuflag(avx)
   movu     %1, [%2]
%else
   movlps   %1, [%2]
   movhps   %1, [%2 + 8]
%endif
%endmacro

;------------------------------------------------------------------------------
; DEFINE_IMDCT: emits imdct36_float for the instruction set selected by the
; preceding INIT_XMM.
;
; cglobal declares 4 arguments, 4 GPRs and 9 XMM registers; the arguments are
; named out, buf, in, win (accessed as outq, bufq, inq, winq).
;
; Memory accessed, as visible in the code below:
;   in   18 floats read (bytes 0..71)
;   buf  read, then rewritten, at float indices 0,4,...,68 (16-byte stride);
;        the LOAD macro reads 8 bytes at each of those positions
;   win  read at float offsets 0..39 in 16-byte groups
;   out  18 floats written, spaced 4*SBLIMIT bytes apart
; m8 is only touched on x86-64; the x86-32 path recomputes the value instead.
; NOTE(review): pointer types and alignment guarantees come from the C
; prototype, which is not visible here - confirm against the caller.
;------------------------------------------------------------------------------
%macro DEFINE_IMDCT 0
cglobal imdct36_float, 4,4,9, out, buf, in, win

    ; for(i=17;i>=1;i--) in[i] += in[i-1];
    LOADA64 m0, inq                 ; m0 = in[0..3]
    LOADA64 m1, inq + 16            ; m1 = in[4..7]

    ROTLEFT m5, m0, m1              ; m5 = in[3..6]

    PSHUFD  m6, m0, 0x93            ; m6 = {in3,in0,in1,in2}
    andps   m6, m6, [ps_mask]       ; clear lane 0: in[0] has no predecessor
    addps   m0, m0, m6

    LOADA64 m2, inq + 32            ; m2 = in[8..11]

    ROTLEFT m7, m1, m2              ; m7 = in[7..10]

    addps   m1, m1, m5
    LOADA64 m3, inq + 48            ; m3 = in[12..15]

    ROTLEFT m5, m2, m3              ; m5 = in[11..14]

    xorps   m4, m4, m4
    movlps  m4, [inq+64]            ; m4 = {in16,in17,0,0}
    BUILDINVHIGHLOW m6, m3, m4
    shufps  m6, m6, m4, 0xa9        ; m6 = {in15,in16,0,0}

    addps   m4, m4, m6
    addps   m2, m2, m7
    addps   m3, m3, m5

    ; for(i=17;i>=3;i-=2) in[i] += in[i-2];
    movlhps m5, m5, m0
    andps   m5, m5, [ps_mask3]

    BUILDINVHIGHLOW m7, m0, m1
    andps   m7, m7, [ps_mask2]

    addps   m0, m0, m5

    BUILDINVHIGHLOW m6, m1, m2
    andps   m6, m6, [ps_mask2]

    addps  m1, m1, m7

    BUILDINVHIGHLOW m7, m2, m3
    andps   m7, m7, [ps_mask2]

    addps   m2, m2, m6

    movhlps m6, m6, m3
    andps   m6, m6, [ps_mask4]

    addps  m3, m3, m7
    addps  m4, m4, m6

    ; Populate tmp[]
    movlhps m6, m1, m5    ; zero out high values
    subps   m6, m6, m4

    subps  m5, m0, m3

    ; x86-64 parks m0-m3 in m8 for reuse below; x86-32 has no m8 and
    ; recomputes the difference instead.
%if ARCH_X86_64
    SWAP   m5, m8
%endif

    mulps  m7, m2, [ps_val1]

%if ARCH_X86_64
    mulps  m5, m8, [ps_val2]
%else
    mulps  m5, m5, [ps_val2]
%endif
    addps  m7, m7, m5

    mulps  m5, m6, [ps_val1]
    subps  m7, m7, m5

%if ARCH_X86_64
    SWAP   m5, m8
%else
    subps  m5, m0, m3
%endif

    subps  m5, m5, m6
    addps  m5, m5, m2

    shufps m6, m4, m3, 0xe4
    subps  m6, m6, m2
    mulps  m6, m6, [ps_val3]

    addps  m4, m4, m1
    mulps  m4, m4, [ps_val4]

    shufps m1, m1, m0, 0xe4
    addps  m1, m1, m2
    mulps  m1, m1, [ps_val5]

    mulps  m3, m3, [ps_val6]
    mulps  m0, m0, [ps_val7]
    addps  m0, m0, m3

    xorps  m2, m1, [ps_p1p1m1m1]
    subps  m2, m2, m4
    addps  m2, m2, m0

    addps  m3, m4, m0
    subps  m3, m3, m6
    xorps  m3, m3, [ps_p1p1m1m1]

    shufps m0, m0, m4, 0xe4
    subps  m0, m0, m1
    addps  m0, m0, m6

    BUILDINVHIGHLOW m4, m2, m3
    shufps  m3, m3, m2, 0x4e

    ; we have tmp = {SwAPLH(m0), SwAPLH(m7), m3, m4, m5}

    ; Third argument is the byte offset of the coefficient row
    ; (rows 0..4 of ps_cosh / ps_cosh_sse3).
    BUTTERF  m0, m1, 0
    BUTTERF  m7, m2, 16
    BUTTERF  m3, m6, 32
    BUTTERF  m4, m1, 48
    BUTTERF2 m5, m1, 64

    ; permutates:
    ; m0    0  1  2  3     =>     2  6 10 14   m1
    ; m7    4  5  6  7     =>     3  7 11 15   m2
    ; m3    8  9 10 11     =>    17 13  9  5   m3
    ; m4   12 13 14 15     =>    16 12  8  4   m5
    ; m5   16 17 xx xx     =>     0  1 xx xx   m0

    unpckhps m1, m0, m7
    unpckhps m6, m3, m4
    movhlps  m2, m6, m1
    movlhps  m1, m1, m6

    unpcklps m5, m5, m4
    unpcklps m3, m3, m7
    movhlps  m4, m3, m5
    movlhps  m5, m5, m3
    SWAP m4, m3
    ; permutation done

    ; Windowing, first half: out = value * win + buf.
    ; The last two outputs are handled on their own (two scalar stores); the
    ; other sixteen go through LOAD/STORE in groups of four.
    PSHUFD  m6, m2, 0xb1
    movss   m4, [bufq + 4*68]
    movss   m7, [bufq + 4*64]
    unpcklps  m7, m7, m4
    mulps   m6, m6, [winq + 16*4]
    addps   m6, m6, m7
    movss   [outq + 64*SBLIMIT], m6
    shufps  m6, m6, m6, 0xb1
    movss   [outq + 68*SBLIMIT], m6

    mulps   m6, m3, [winq + 4*4]
    LOAD    m4, m7, bufq + 4*16, 16
    addps   m6, m6, m4
    STORE   m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT

    shufps  m4, m0, m3, 0xb5
    mulps   m4, m4, [winq + 8*4]
    LOAD    m7, m6, bufq + 4*32, 16
    addps   m4, m4, m7
    STORE   m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT

    shufps  m3, m3, m2, 0xb1
    mulps   m3, m3, [winq + 12*4]
    LOAD    m7, m6, bufq + 4*48, 16
    addps   m3, m3, m7
    STORE   m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT

    mulps   m2, m2, [winq]
    LOAD    m6, m7, bufq, 16
    addps   m2, m2, m6
    STORE   m2, m7, outq, 4*SBLIMIT

    ; Windowing, second half: buf = value * win (overwrites the buf entries
    ; that were consumed above).
    mulps    m4, m1, [winq + 20*4]
    STORE    m4, m7, bufq, 16

    mulps    m3, m5, [winq + 24*4]
    STORE    m3, m7, bufq + 4*16, 16

    shufps   m0, m0, m5, 0xb0
    mulps    m0, m0, [winq + 28*4]
    STORE    m0, m7, bufq + 4*32, 16

    shufps   m5, m5, m1, 0xb1
    mulps    m5, m5, [winq + 32*4]
    STORE    m5, m7, bufq + 4*48, 16

    shufps   m1, m1, m1, 0xb1
    mulps    m1, m1, [winq + 36*4]
    movss    [bufq + 4*64], m1
    shufps   m1, m1, 0xb1
    movss    [bufq + 4*68], m1
    RET
%endmacro

; Instantiate imdct36_float once per instruction set.  Each INIT_XMM selects
; the cpuflags that the cpuflag()/notcpuflag() tests inside the macros above
; evaluate.  The plain-SSE version is only built for 32-bit x86, and the AVX
; version only when the build enables HAVE_AVX_EXTERNAL.
%if ARCH_X86_32
INIT_XMM sse
DEFINE_IMDCT
%endif

INIT_XMM sse2
DEFINE_IMDCT

INIT_XMM sse3
DEFINE_IMDCT

INIT_XMM ssse3
DEFINE_IMDCT

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_IMDCT
%endif

INIT_XMM sse

; Register-spill helpers for four_imdct36_float, which uses logical registers
; 8..15 in addition to m0..m7.
;
;   SPILL   reg, slot   - move m<reg> into spill slot <slot>
;   UNSPILL reg, slot   - move spill slot <slot> into m<reg>
;   SPILLED(slot)       - operand naming the slot
;
; x86-64: slots are the real registers m8..m15; SPILL/UNSPILL are plain SWAPs
;         (register renaming, no instruction emitted) and SPILLED(x) is m<x>.
; x86-32: slots are 16-byte cells in the tmp buffer, starting 32*4 bytes in
;         (slot 8 at tmpq+128 ... slot 15 at tmpq+240); movaps is used, so
;         tmpq must be 16-byte aligned.
%if ARCH_X86_64
%define SPILL SWAP
%define UNSPILL SWAP
%define SPILLED(x) m %+ x
%else
%define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
%macro SPILL 2 ; xmm#, mempos
    movaps SPILLED(%2), m%1
%endmacro
%macro UNSPILL 2
    movaps m%1, SPILLED(%2)
%endmacro
%endif

;------------------------------------------------------------------------------
; DEFINE_FOUR_IMDCT: emits four_imdct36_float for the instruction set selected
; by the preceding INIT_XMM.  Four input blocks are processed at once, one per
; vector lane.
;
; cglobal declares 5 arguments, 5 GPRs and 16 XMM registers; the arguments are
; named out, buf, in, win, tmp.
;
; Memory accessed, as visible in the code below:
;   in   four blocks of 18 floats, 72 bytes apart; blocks 0 and 2 are read with
;        mova (aligned), blocks 1 and 3 with movu
;   tmp  scratch: 16-byte cells at byte offsets 0..112, plus (x86-32 only) the
;        SPILLED() cells at byte offsets 128..240; accessed with aligned moves
;   buf  18 cells of 16 bytes, read then rewritten with aligned moves
;   win  read in 16-byte groups at float offsets 0..68 (for out) and 80..148
;        (for buf)
;   out  18 aligned 16-byte stores, 128 bytes apart
; SPILL/UNSPILL/SPILLED abstract over extra registers (x86-64) versus tmp
; memory (x86-32).
; NOTE(review): TRANSPOSE4x4PS comes from x86util.asm; it is assumed to
; transpose the four named registers using the fifth as scratch - confirm.
;------------------------------------------------------------------------------
%macro DEFINE_FOUR_IMDCT 0
cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
    ; in[16], in[17] of all four blocks
    movlps  m0, [inq+64]
    movhps  m0, [inq+64 +   72]
    movlps  m3, [inq+64 + 2*72]
    movhps  m3, [inq+64 + 3*72]

    shufps  m5, m0, m3, 0xdd        ; m5 = in[17] of each block
    shufps  m0, m0, m3, 0x88        ; m0 = in[16] of each block

    ; in[12..15] of all four blocks
    mova    m1, [inq+48]
    movu    m6, [inq+48 +   72]
    mova    m7, [inq+48 + 2*72]
    movu    m3, [inq+48 + 3*72]

    TRANSPOSE4x4PS 1, 6, 7, 3, 4

    addps   m4, m6, m7
    mova    [tmpq+4*28], m4

    addps    m7, m3
    addps    m6, m1
    addps    m3, m0
    addps    m0, m5
    addps    m0, m7
    addps    m7, m6
    mova    [tmpq+4*12], m7
    SPILL   3, 12

    ; in[8..11] of all four blocks
    mova    m4, [inq+32]
    movu    m5, [inq+32 +   72]
    mova    m2, [inq+32 + 2*72]
    movu    m7, [inq+32 + 3*72]

    TRANSPOSE4x4PS 4, 5, 2, 7, 3

    addps   m1, m7
    SPILL   1, 11

    addps   m3, m5, m2
    SPILL   3, 13

    addps    m7, m2
    addps    m5, m4
    addps    m6, m7
    mova    [tmpq], m6
    addps   m7, m5
    mova    [tmpq+4*16], m7

    ; in[4..7] of all four blocks
    mova    m2, [inq+16]
    movu    m7, [inq+16 +   72]
    mova    m1, [inq+16 + 2*72]
    movu    m6, [inq+16 + 3*72]

    TRANSPOSE4x4PS 2, 7, 1, 6, 3

    addps   m4, m6
    addps   m6, m1
    addps   m1, m7
    addps   m7, m2
    addps   m5, m6
    SPILL   5, 15
    addps   m6, m7
    mulps   m6, [costabs + 16*2]
    mova    [tmpq+4*8], m6
    SPILL   1, 10
    SPILL   0, 14

    ; in[0..3] of all four blocks
    mova    m1, [inq]
    movu    m6, [inq +   72]
    mova    m3, [inq + 2*72]
    movu    m5, [inq + 3*72]

    TRANSPOSE4x4PS 1, 6, 3, 5, 0

    addps    m2, m5
    addps    m5, m3
    addps    m7, m5
    addps    m3, m6
    addps    m6, m1
    SPILL    7, 8
    addps    m5, m6
    SPILL    6, 9
    addps    m6, m4, SPILLED(12)
    subps    m6, m2
    UNSPILL  7, 11
    SPILL    5, 11
    subps    m5, m1, m7
    mulps    m7, [costabs + 16*5]
    addps    m7, m1
    mulps    m0, m6, [costabs + 16*6]
    addps    m0, m5
    mova     [tmpq+4*24], m0
    addps    m6, m5
    mova     [tmpq+4*4], m6
    addps    m6, m4, m2
    mulps    m6, [costabs + 16*1]
    subps    m4, SPILLED(12)
    mulps    m4, [costabs + 16*8]
    addps    m2, SPILLED(12)
    mulps    m2, [costabs + 16*3]
    subps    m5, m7, m6
    subps    m5, m2
    addps    m6, m7
    addps    m6, m4
    addps    m7, m2
    subps    m7, m4
    mova     [tmpq+4*20], m7
    mova     m2, [tmpq+4*28]
    mova     [tmpq+4*28], m5
    UNSPILL  7, 13
    subps    m5, m7, m2
    mulps    m5, [costabs + 16*7]
    UNSPILL  1, 10
    mulps    m1, [costabs + 16*2]
    addps    m4, m3, m2
    mulps    m4, [costabs + 16*4]
    addps    m2, m7
    addps    m7, m3
    mulps    m7, [costabs]
    subps    m3, m2
    mulps    m3, [costabs + 16*2]
    addps    m2, m7, m5
    addps    m2, m1
    SPILL    2, 10
    addps    m7, m4
    subps    m7, m1
    SPILL    7, 12
    subps    m5, m4
    subps    m5, m1
    UNSPILL  0, 14
    SPILL    5, 13
    addps    m1, m0, SPILLED(15)
    subps    m1, SPILLED(8)
    mova     m4, [costabs + 16*5]
    mulps    m4, [tmpq]
    UNSPILL  2, 9
    addps    m4, m2
    subps    m2, [tmpq]
    mulps    m5, m1, [costabs + 16*6]
    addps    m5, m2
    SPILL    5, 9
    addps    m2, m1
    SPILL    2, 14
    UNSPILL  5, 15
    subps    m7, m5, m0
    addps    m5, SPILLED(8)
    mulps    m5, [costabs + 16*1]
    mulps    m7, [costabs + 16*8]
    addps    m0, SPILLED(8)
    mulps    m0, [costabs + 16*3]
    subps    m2, m4, m5
    subps    m2, m0
    SPILL    2, 15
    addps    m5, m4
    addps    m5, m7
    addps    m4, m0
    subps    m4, m7
    SPILL    4, 8
    mova     m7, [tmpq+4*16]
    mova     m2, [tmpq+4*12]
    addps    m0, m7, m2
    subps    m0, SPILLED(11)
    mulps    m0, [costabs + 16*2]
    addps    m4, m7, SPILLED(11)
    mulps    m4, [costabs]
    subps    m7, m2
    mulps    m7, [costabs + 16*7]
    addps    m2, SPILLED(11)
    mulps    m2, [costabs + 16*4]
    addps    m1, m7, [tmpq+4*8]
    addps    m1, m4
    addps    m4, m2
    subps    m4, [tmpq+4*8]
    SPILL    4, 11
    subps    m7, m2
    subps    m7, [tmpq+4*8]
    addps    m4, m6, SPILLED(10)
    subps    m6, SPILLED(10)
    addps    m2, m5, m1
    mulps    m2, [costabs + 16*9]
    subps    m5, m1
    mulps    m5, [costabs + 16*17]

    ; Output stage.  Each group below takes a sum/difference pair, writes
    ; out = value*win + buf for two output rows and then overwrites the same
    ; two buf cells with value*win taken from the upper part of win.
    subps    m1, m4, m2
    addps    m4, m2
    mulps    m2, m1, [winq+4*36]
    addps    m2, [bufq+4*36]
    mova     [outq+1152], m2
    mulps    m1, [winq+4*32]
    addps    m1, [bufq+4*32]
    mova     [outq+1024], m1
    mulps    m1, m4, [winq+4*116]
    mova     [bufq+4*36], m1
    mulps    m4, [winq+4*112]
    mova     [bufq+4*32], m4
    addps    m2, m6, m5
    subps    m6, m5
    mulps    m1, m6, [winq+4*68]
    addps    m1, [bufq+4*68]
    mova     [outq+2176], m1
    mulps    m6, [winq]
    addps    m6, [bufq]
    mova     [outq], m6
    mulps    m1, m2, [winq+4*148]
    mova     [bufq+4*68], m1
    mulps    m2, [winq+4*80]
    mova     [bufq], m2
    addps    m5, m3, [tmpq+4*24]
    mova     m2, [tmpq+4*24]
    subps    m2, m3
    mova     m1, SPILLED(9)
    subps    m1, m0
    mulps    m1, [costabs + 16*10]
    addps    m0, SPILLED(9)
    mulps    m0, [costabs + 16*16]
    addps    m6, m5, m1
    subps    m5, m1
    mulps    m3, m5, [winq+4*40]
    addps    m3, [bufq+4*40]
    mova     [outq+1280], m3
    mulps    m5, [winq+4*28]
    addps    m5, [bufq+4*28]
    mova     [outq+896], m5
    mulps    m1, m6, [winq+4*120]
    mova     [bufq+4*40], m1
    mulps    m6, [winq+4*108]
    mova     [bufq+4*28], m6
    addps    m1, m2, m0
    subps    m2, m0
    mulps    m5, m2, [winq+4*64]
    addps    m5, [bufq+4*64]
    mova     [outq+2048], m5
    mulps    m2, [winq+4*4]
    addps    m2, [bufq+4*4]
    mova     [outq+128], m2
    mulps    m0, m1, [winq+4*144]
    mova     [bufq+4*64], m0
    mulps    m1, [winq+4*84]
    mova     [bufq+4*4], m1
    mova     m1, [tmpq+4*28]
    mova     m5, m1
    addps    m1, SPILLED(13)
    subps    m5, SPILLED(13)
    UNSPILL  3, 15
    addps    m2, m7, m3
    mulps    m2, [costabs + 16*11]
    subps    m3, m7
    mulps    m3, [costabs + 16*15]
    addps    m0, m2, m1
    subps    m1, m2
    SWAP     m0, m2
    mulps    m6, m1, [winq+4*44]
    addps    m6, [bufq+4*44]
    mova     [outq+1408], m6
    mulps    m1, [winq+4*24]
    addps    m1, [bufq+4*24]
    mova     [outq+768], m1
    mulps    m0, m2, [winq+4*124]
    mova     [bufq+4*44], m0
    mulps    m2, [winq+4*104]
    mova     [bufq+4*24], m2
    addps    m0, m5, m3
    subps    m5, m3
    mulps    m1, m5, [winq+4*60]
    addps    m1, [bufq+4*60]
    mova     [outq+1920], m1
    mulps    m5, [winq+4*8]
    addps    m5, [bufq+4*8]
    mova     [outq+256], m5
    mulps    m1, m0, [winq+4*140]
    mova     [bufq+4*60], m1
    mulps    m0, [winq+4*88]
    mova     [bufq+4*8], m0
    mova     m1, [tmpq+4*20]
    addps    m1, SPILLED(12)
    mova     m2, [tmpq+4*20]
    subps    m2, SPILLED(12)
    UNSPILL  7, 8
    subps    m0, m7, SPILLED(11)
    addps    m7, SPILLED(11)
    mulps    m4, m7, [costabs + 16*12]
    mulps    m0, [costabs + 16*14]
    addps    m5, m1, m4
    subps    m1, m4
    mulps    m7, m1, [winq+4*48]
    addps    m7, [bufq+4*48]
    mova     [outq+1536], m7
    mulps    m1, [winq+4*20]
    addps    m1, [bufq+4*20]
    mova     [outq+640], m1
    mulps    m1, m5, [winq+4*128]
    mova     [bufq+4*48], m1
    mulps    m5, [winq+4*100]
    mova     [bufq+4*20], m5
    addps    m6, m2, m0
    subps    m2, m0
    mulps    m1, m2, [winq+4*56]
    addps    m1, [bufq+4*56]
    mova     [outq+1792], m1
    mulps    m2, [winq+4*12]
    addps    m2, [bufq+4*12]
    mova     [outq+384], m2
    mulps    m0, m6, [winq+4*136]
    mova     [bufq+4*56], m0
    mulps    m6, [winq+4*92]
    mova     [bufq+4*12], m6
    UNSPILL  0, 14
    mulps    m0, [costabs + 16*13]
    mova     m3, [tmpq+4*4]
    addps    m2, m0, m3
    subps    m3, m0
    mulps    m0, m3, [winq+4*52]
    addps    m0, [bufq+4*52]
    mova     [outq+1664], m0
    mulps    m3, [winq+4*16]
    addps    m3, [bufq+4*16]
    mova     [outq+512], m3
    mulps    m0, m2, [winq+4*132]
    mova     [bufq+4*52], m0
    mulps    m2, [winq+4*96]
    mova     [bufq+4*16], m2
    RET
%endmacro

; Instantiate four_imdct36_float for plain SSE and, when the build enables
; HAVE_AVX_EXTERNAL, for AVX.
INIT_XMM sse
DEFINE_FOUR_IMDCT

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_FOUR_IMDCT
%endif