Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | ;****************************************************************************** |
2 | ;* 36 point SSE-optimized IMDCT transform | |
3 | ;* Copyright (c) 2011 Vitor Sessak | |
4 | ;* | |
5 | ;* This file is part of FFmpeg. | |
6 | ;* | |
7 | ;* FFmpeg is free software; you can redistribute it and/or | |
8 | ;* modify it under the terms of the GNU Lesser General Public | |
9 | ;* License as published by the Free Software Foundation; either | |
10 | ;* version 2.1 of the License, or (at your option) any later version. | |
11 | ;* | |
12 | ;* FFmpeg is distributed in the hope that it will be useful, | |
13 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | ;* Lesser General Public License for more details. | |
16 | ;* | |
17 | ;* You should have received a copy of the GNU Lesser General Public | |
18 | ;* License along with FFmpeg; if not, write to the Free Software | |
19 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | ;****************************************************************************** | |
21 | ||
22 | %include "libavutil/x86/x86util.asm" | |
23 | ||
; Read-only constant tables used by the two IMDCT routines in this file.
24 | SECTION_RODATA | |
25 | ||
26 | align 16 | |
; Dword lane masks for andps: a ~0 entry keeps that lane, a 0 entry clears it
; (lane 0 is listed first).
27 | ps_mask: dd 0, ~0, ~0, ~0 | |
28 | ps_mask2: dd 0, ~0, 0, ~0 | |
29 | ps_mask3: dd 0, 0, 0, ~0 | |
30 | ps_mask4: dd 0, ~0, 0, 0 | |
31 | ||
; Multiplier vectors used by imdct36_float. The magnitudes are numerically equal
; to sines/cosines of multiples of 10 degrees (e.g. 0.8660254038 = cos 30 deg,
; 0.9848077530 = cos 10 deg, 0.1736481777 = sin 10 deg).
32 | ps_val1: dd -0.5, -0.5, -0.8660254038, -0.8660254038 | |
33 | ps_val2: dd 1.0, 1.0, 0.8660254038, 0.8660254038 | |
34 | ps_val3: dd 0.1736481777, 0.1736481777, 0.3420201433, 0.3420201433 | |
35 | ps_val4: dd -0.7660444431, -0.7660444431, 0.8660254038, 0.8660254038 | |
36 | ps_val5: dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530 | |
37 | ps_val6: dd 0.5, 0.5, -0.6427876097, -0.6427876097 | |
38 | ps_val7: dd 1.0, 1.0, -0.6427876097, -0.6427876097 | |
39 | ||
; Sign-bit masks for xorps: a 0x80000000 entry flips the sign of that float lane.
; The names appear to spell the resulting per-lane sign (p1 = +1, m1 = -1).
40 | ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 | |
41 | ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000 | |
42 | ||
; Five 16-byte rows selected by the byte offset passed to BUTTERF/BUTTERF2
; (0, 16, 32, 48, 64). The non-unit entries are numerically equal in magnitude
; to 0.5/cos((2k+1)*pi/36) for k = 0..8.
43 | ps_cosh: dd 1.0, 0.50190991877167369479, 1.0, 5.73685662283492756461 | |
44 | dd 1.0, 0.51763809020504152469, 1.0, 1.93185165257813657349 | |
45 | dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896 | |
46 | dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991 | |
47 | dd 1.0, 0.70710678118654752439, 0.0, 0.0 | |
48 | ||
; Same rows as ps_cosh with the sign of lanes 1 and 3 inverted; used by the
; addsubps-based SSE3 code path of BUTTERF/BUTTERF2.
49 | ps_cosh_sse3: dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461 | |
50 | dd 1.0, -0.51763809020504152469, 1.0, -1.93185165257813657349 | |
51 | dd 1.0, -0.55168895948124587824, -1.0, 1.18310079157624925896 | |
52 | dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991 | |
53 | dd 1.0, -0.70710678118654752439, 0.0, 0.0 | |
54 | ||
; 18 constants, each replicated into all four lanes, used by four_imdct36_float
; and addressed there as [costabs + 16*i] with i = 0..17.
55 | costabs: times 4 dd 0.98480773 | |
56 | times 4 dd 0.93969262 | |
57 | times 4 dd 0.86602539 | |
58 | times 4 dd -0.76604444 | |
59 | times 4 dd -0.64278764 | |
60 | times 4 dd 0.50000000 | |
61 | times 4 dd -0.50000000 | |
62 | times 4 dd -0.34202015 | |
63 | times 4 dd -0.17364818 | |
64 | times 4 dd 0.50190992 | |
65 | times 4 dd 0.51763808 | |
66 | times 4 dd 0.55168896 | |
67 | times 4 dd 0.61038726 | |
68 | times 4 dd 0.70710677 | |
69 | times 4 dd 0.87172341 | |
70 | times 4 dd 1.18310082 | |
71 | times 4 dd 1.93185163 | |
72 | times 4 dd 5.73685646 | |
73 | ||
; SBLIMIT scales the byte offsets and the store stride used for writes to out
; in imdct36_float (offsets k*4*SBLIMIT bytes).
74 | %define SBLIMIT 32 | |
75 | SECTION_TEXT | |
76 | ||
; PSHUFD dst, src, imm8
; Permutes the four dwords of src into dst according to imm8.
; Uses the integer pshufd when SSE2 is enabled and AVX is not; otherwise falls
; back to shufps with src given as both inputs, which yields the same permutation.
77 | %macro PSHUFD 3 | |
78 | %if cpuflag(sse2) && notcpuflag(avx) | |
79 | pshufd %1, %2, %3 | |
80 | %else | |
81 | shufps %1, %2, %2, %3 | |
82 | %endif | |
83 | %endmacro | |
84 | ||
; BUILDINVHIGHLOW dst, x, y
; Concatenates the high half of x with the low half of y.
85 | ; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4} | |
86 | ; output %1={x3,x4,y1,y2} | |
87 | %macro BUILDINVHIGHLOW 3 | |
88 | %if cpuflag(avx) | |
; 0x4e selects lanes 2,3 of the first source and lanes 0,1 of the second.
89 | shufps %1, %2, %3, 0x4e | |
90 | %else | |
; Two half-register moves: high(%1) = low(%3), then low(%1) = high(%2).
91 | movlhps %1, %3 | |
92 | movhlps %1, %2 | |
93 | %endif | |
94 | %endmacro | |
95 | ||
; ROTLEFT dst, x, y
; Extracts the four floats starting at the last lane of x from the pair x:y.
96 | ; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4} | |
97 | ; output %1={x4,y1,y2,y3} | |
98 | %macro ROTLEFT 3 | |
99 | %if cpuflag(ssse3) | |
; Byte-wise alignment of the %3:%2 pair, shifted right by 12 bytes.
100 | palignr %1, %3, %2, 12 | |
101 | %else | |
; First build {x3,x4,y1,y2}, then 0x99 picks lanes 1,2 of it and lanes 1,2 of y.
102 | BUILDINVHIGHLOW %1, %2, %3 | |
103 | shufps %1, %1, %3, 0x99 | |
104 | %endif | |
105 | %endmacro | |
106 | ||
; INVERTHL dst, src
; dst = src with its 64-bit halves swapped: {s2,s3,s0,s1}.
107 | %macro INVERTHL 2 | |
108 | %if cpuflag(sse2) | |
109 | PSHUFD %1, %2, 0x4e | |
110 | %else | |
; low(%1) = high(%2), then high(%1) = low(%2).
111 | movhlps %1, %2 | |
112 | movlhps %1, %2 | |
113 | %endif | |
114 | %endmacro | |
115 | ||
; BUTTERF data, scratch, offset
; %1 = data register (input and output), %2 = scratch register (clobbered),
; %3 = byte offset of the row to use in ps_cosh / ps_cosh_sse3.
; Stage 1: with %1 = {a0,a1,a2,a3}, forms {a0+a2, a1+a3, a0-a2, a1-a3}.
; Stage 2: multiplies by the table row giving p, then combines adjacent lanes
; into {p0+p1, p0-p1, p2+p3, p2-p3} (p taken with the ps_cosh signs).
116 | %macro BUTTERF 3 | |
117 | INVERTHL %2, %1 | |
; Negate the two upper lanes so the following add produces sums and differences.
118 | xorps %1, [ps_p1p1m1m1] | |
119 | addps %1, %2 | |
120 | %if cpuflag(sse3) | |
; SSE3 path: the table already carries negated odd lanes, so addsubps
; (subtract in even lanes, add in odd lanes) replaces the explicit sign flip.
121 | mulps %1, %1, [ps_cosh_sse3 + %3] | |
122 | PSHUFD %2, %1, 0xb1 | |
123 | addsubps %1, %1, %2 | |
124 | %else | |
; Generic path: swap neighbours into %2, negate odd lanes of %1, then add.
125 | mulps %1, [ps_cosh + %3] | |
126 | PSHUFD %2, %1, 0xb1 | |
127 | xorps %1, [ps_p1m1p1m1] | |
128 | addps %1, %2 | |
129 | %endif | |
130 | %endmacro | |
131 | ||
; BUTTERF2 data, scratch, offset
; Variant of BUTTERF without the first sum/difference stage.
; The 0xe1 shuffle swaps only lanes 0 and 1, so only the two low lanes end up
; as {p0+p1, p0-p1}, where p = %1 * table row (with the ps_cosh signs).
; The only invocation in this file passes offset 64, whose two upper table
; entries are 0.0.
132 | %macro BUTTERF2 3 | |
133 | %if cpuflag(sse3) | |
134 | mulps %1, %1, [ps_cosh_sse3 + %3] | |
135 | PSHUFD %2, %1, 0xe1 | |
136 | addsubps %1, %1, %2 | |
137 | %else | |
138 | mulps %1, [ps_cosh + %3] | |
139 | PSHUFD %2, %1, 0xe1 | |
140 | xorps %1, [ps_p1m1p1m1] | |
141 | addps %1, %2 | |
142 | %endif | |
143 | %endmacro | |
144 | ||
; STORE data, scratch, base, stride
; Scatters the four floats of %1 to [base + k*stride], k = 0..3 (lane k goes to
; slot k), using scalar movss stores. %1 is left pair-swapped and %2 is
; clobbered.
145 | %macro STORE 4 | |
; %2 low half = lanes 2,3 of %1
146 | movhlps %2, %1 | |
147 | movss [%3 ], %1 | |
148 | movss [%3 + 2*%4], %2 | |
; swap neighbouring lanes so lanes 1 and 3 become reachable by movss
149 | shufps %1, %1, 0xb1 | |
150 | movss [%3 + %4], %1 | |
151 | movhlps %2, %1 | |
152 | movss [%3 + 3*%4], %2 | |
153 | %endmacro | |
154 | ||
; LOAD dst, scratch, base, stride
; Gathers the float at [base + k*stride], k = 0..3, into lanes 0..3 of %1.
; Each access is an 8-byte load, so the 4 bytes following every element are
; read as well and then discarded by the shuffle. %2 is clobbered.
155 | %macro LOAD 4 | |
156 | movlps %1, [%3 ] | |
157 | movhps %1, [%3 + %4] | |
158 | movlps %2, [%3 + 2*%4] | |
159 | movhps %2, [%3 + 3*%4] | |
; 0x88 keeps lanes 0 and 2 of each register, i.e. the first float of every load.
160 | shufps %1, %2, 0x88 | |
161 | %endmacro | |
162 | ||
; LOADA64 dst, addr
; Loads 16 bytes from [addr] without requiring 16-byte alignment: a single
; unaligned move on AVX, otherwise two 8-byte half loads.
163 | %macro LOADA64 2 | |
164 | %if cpuflag(avx) | |
165 | movu %1, [%2] | |
166 | %else | |
167 | movlps %1, [%2] | |
168 | movhps %1, [%2 + 8] | |
169 | %endif | |
170 | %endmacro | |
171 | ||
; DEFINE_IMDCT: expands to imdct36_float for the instruction set chosen by the
; preceding INIT_XMM. The cglobal line declares 4 arguments (out, buf, in, win),
; 4 general-purpose registers and 9 XMM registers.
; Data flow visible in the body:
; - reads 18 floats from in (four 16-byte loads plus one 8-byte load at +64)
; - computes 18 values, multiplies them by vectors read from win, adds the
; floats found at buf[4*k] and stores the sums to out at byte offsets
; k*4*SBLIMIT
; - multiplies the values by further win vectors (win + 20*4 onwards) and
; writes the products back to buf[4*k], k = 0..17
172 | %macro DEFINE_IMDCT 0 | |
173 | cglobal imdct36_float, 4,4,9, out, buf, in, win | |
174 | ||
175 | ; for(i=17;i>=1;i--) in[i] += in[i-1]; | |
176 | LOADA64 m0, inq | |
177 | LOADA64 m1, inq + 16 | |
178 | ||
179 | ROTLEFT m5, m0, m1 | |
180 | ||
; m6 = m0 rotated by one lane with lane 0 cleared, i.e. {0, in0, in1, in2}
181 | PSHUFD m6, m0, 0x93 | |
182 | andps m6, m6, [ps_mask] | |
183 | addps m0, m0, m6 | |
184 | ||
185 | LOADA64 m2, inq + 32 | |
186 | ||
187 | ROTLEFT m7, m1, m2 | |
188 | ||
189 | addps m1, m1, m5 | |
190 | LOADA64 m3, inq + 48 | |
191 | ||
192 | ROTLEFT m5, m2, m3 | |
193 | ||
; m4 = {in16, in17, 0, 0}: the upper lanes stay zero from the xorps
194 | xorps m4, m4, m4 | |
195 | movlps m4, [inq+64] | |
196 | BUILDINVHIGHLOW m6, m3, m4 | |
197 | shufps m6, m6, m4, 0xa9 | |
198 | ||
199 | addps m4, m4, m6 | |
200 | addps m2, m2, m7 | |
201 | addps m3, m3, m5 | |
202 | ||
203 | ; for(i=17;i>=3;i-=2) in[i] += in[i-2]; | |
204 | movlhps m5, m5, m0 | |
205 | andps m5, m5, [ps_mask3] | |
206 | ||
207 | BUILDINVHIGHLOW m7, m0, m1 | |
208 | andps m7, m7, [ps_mask2] | |
209 | ||
210 | addps m0, m0, m5 | |
211 | ||
212 | BUILDINVHIGHLOW m6, m1, m2 | |
213 | andps m6, m6, [ps_mask2] | |
214 | ||
215 | addps m1, m1, m7 | |
216 | ||
217 | BUILDINVHIGHLOW m7, m2, m3 | |
218 | andps m7, m7, [ps_mask2] | |
219 | ||
220 | addps m2, m2, m6 | |
221 | ||
222 | movhlps m6, m6, m3 | |
223 | andps m6, m6, [ps_mask4] | |
224 | ||
225 | addps m3, m3, m7 | |
226 | addps m4, m4, m6 | |
227 | ||
228 | ; Populate tmp[] | |
229 | movlhps m6, m1, m5 ; zero out high values | |
230 | subps m6, m6, m4 | |
231 | ||
232 | subps m5, m0, m3 | |
233 | ||
; On x86-64 the m0-m3 difference is parked in m8 (via register renaming) so it
; can be reused below; the 32-bit build has no m8 and recomputes it instead.
234 | %if ARCH_X86_64 | |
235 | SWAP m5, m8 | |
236 | %endif | |
237 | ||
238 | mulps m7, m2, [ps_val1] | |
239 | ||
240 | %if ARCH_X86_64 | |
241 | mulps m5, m8, [ps_val2] | |
242 | %else | |
243 | mulps m5, m5, [ps_val2] | |
244 | %endif | |
245 | addps m7, m7, m5 | |
246 | ||
247 | mulps m5, m6, [ps_val1] | |
248 | subps m7, m7, m5 | |
249 | ||
250 | %if ARCH_X86_64 | |
251 | SWAP m5, m8 | |
252 | %else | |
253 | subps m5, m0, m3 | |
254 | %endif | |
255 | ||
256 | subps m5, m5, m6 | |
257 | addps m5, m5, m2 | |
258 | ||
259 | shufps m6, m4, m3, 0xe4 | |
260 | subps m6, m6, m2 | |
261 | mulps m6, m6, [ps_val3] | |
262 | ||
263 | addps m4, m4, m1 | |
264 | mulps m4, m4, [ps_val4] | |
265 | ||
266 | shufps m1, m1, m0, 0xe4 | |
267 | addps m1, m1, m2 | |
268 | mulps m1, m1, [ps_val5] | |
269 | ||
270 | mulps m3, m3, [ps_val6] | |
271 | mulps m0, m0, [ps_val7] | |
272 | addps m0, m0, m3 | |
273 | ||
274 | xorps m2, m1, [ps_p1p1m1m1] | |
275 | subps m2, m2, m4 | |
276 | addps m2, m2, m0 | |
277 | ||
278 | addps m3, m4, m0 | |
279 | subps m3, m3, m6 | |
280 | xorps m3, m3, [ps_p1p1m1m1] | |
281 | ||
282 | shufps m0, m0, m4, 0xe4 | |
283 | subps m0, m0, m1 | |
284 | addps m0, m0, m6 | |
285 | ||
286 | BUILDINVHIGHLOW m4, m2, m3 | |
287 | shufps m3, m3, m2, 0x4e | |
288 | ||
289 | ; we have tmp = {SwAPLH(m0), SwAPLH(m7), m3, m4, m5} | |
290 | ||
; The third argument is the byte offset of the ps_cosh row; the second is scratch.
291 | BUTTERF m0, m1, 0 | |
292 | BUTTERF m7, m2, 16 | |
293 | BUTTERF m3, m6, 32 | |
294 | BUTTERF m4, m1, 48 | |
295 | BUTTERF2 m5, m1, 64 | |
296 | ||
297 | ; permutates: | |
298 | ; m0 0 1 2 3 => 2 6 10 14 m1 | |
299 | ; m7 4 5 6 7 => 3 7 11 15 m2 | |
300 | ; m3 8 9 10 11 => 17 13 9 5 m3 | |
301 | ; m4 12 13 14 15 => 16 12 8 4 m5 | |
302 | ; m5 16 17 xx xx => 0 1 xx xx m0 | |
303 | ||
304 | unpckhps m1, m0, m7 | |
305 | unpckhps m6, m3, m4 | |
306 | movhlps m2, m6, m1 | |
307 | movlhps m1, m1, m6 | |
308 | ||
309 | unpcklps m5, m5, m4 | |
310 | unpcklps m3, m3, m7 | |
311 | movhlps m4, m3, m5 | |
312 | movlhps m5, m5, m3 | |
313 | SWAP m4, m3 | |
314 | ; permutation done | |
315 | ||
; Output stage: value * win + buf[4*k] is stored to out at byte offset
; k*4*SBLIMIT. The two values at byte offsets 64*SBLIMIT and 68*SBLIMIT are
; handled first with scalar loads/stores.
316 | PSHUFD m6, m2, 0xb1 | |
317 | movss m4, [bufq + 4*68] | |
318 | movss m7, [bufq + 4*64] | |
319 | unpcklps m7, m7, m4 | |
320 | mulps m6, m6, [winq + 16*4] | |
321 | addps m6, m6, m7 | |
322 | movss [outq + 64*SBLIMIT], m6 | |
323 | shufps m6, m6, m6, 0xb1 | |
324 | movss [outq + 68*SBLIMIT], m6 | |
325 | ||
326 | mulps m6, m3, [winq + 4*4] | |
327 | LOAD m4, m7, bufq + 4*16, 16 | |
328 | addps m6, m6, m4 | |
329 | STORE m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT | |
330 | ||
331 | shufps m4, m0, m3, 0xb5 | |
332 | mulps m4, m4, [winq + 8*4] | |
333 | LOAD m7, m6, bufq + 4*32, 16 | |
334 | addps m4, m4, m7 | |
335 | STORE m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT | |
336 | ||
337 | shufps m3, m3, m2, 0xb1 | |
338 | mulps m3, m3, [winq + 12*4] | |
339 | LOAD m7, m6, bufq + 4*48, 16 | |
340 | addps m3, m3, m7 | |
341 | STORE m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT | |
342 | ||
343 | mulps m2, m2, [winq] | |
344 | LOAD m6, m7, bufq, 16 | |
345 | addps m2, m2, m6 | |
346 | STORE m2, m7, outq, 4*SBLIMIT | |
347 | ||
; Refill buf[4*k] with values scaled by the win vectors at win + 20*4 and up.
348 | mulps m4, m1, [winq + 20*4] | |
349 | STORE m4, m7, bufq, 16 | |
350 | ||
351 | mulps m3, m5, [winq + 24*4] | |
352 | STORE m3, m7, bufq + 4*16, 16 | |
353 | ||
354 | shufps m0, m0, m5, 0xb0 | |
355 | mulps m0, m0, [winq + 28*4] | |
356 | STORE m0, m7, bufq + 4*32, 16 | |
357 | ||
358 | shufps m5, m5, m1, 0xb1 | |
359 | mulps m5, m5, [winq + 32*4] | |
360 | STORE m5, m7, bufq + 4*48, 16 | |
361 | ||
362 | shufps m1, m1, m1, 0xb1 | |
363 | mulps m1, m1, [winq + 36*4] | |
364 | movss [bufq + 4*64], m1 | |
365 | shufps m1, m1, 0xb1 | |
366 | movss [bufq + 4*68], m1 | |
367 | RET | |
368 | %endmacro | |
369 | ||
; Instantiate imdct36_float once per instruction set. INIT_XMM selects the
; cpuflags that the cpuflag()/notcpuflag() tests in the macros above evaluate.
; The plain SSE version is assembled only for 32-bit x86, and the AVX version
; only when HAVE_AVX_EXTERNAL is set.
370 | %if ARCH_X86_32 | |
371 | INIT_XMM sse | |
372 | DEFINE_IMDCT | |
373 | %endif | |
374 | ||
375 | INIT_XMM sse2 | |
376 | DEFINE_IMDCT | |
377 | ||
378 | INIT_XMM sse3 | |
379 | DEFINE_IMDCT | |
380 | ||
381 | INIT_XMM ssse3 | |
382 | DEFINE_IMDCT | |
383 | ||
384 | %if HAVE_AVX_EXTERNAL | |
385 | INIT_XMM avx | |
386 | DEFINE_IMDCT | |
387 | %endif | |
388 | ||
; Helpers for the "virtual" registers 8..15 used by four_imdct36_float.
; x86-64: SPILLED(x) is simply register m<x>, and SPILL/UNSPILL are SWAP, i.e.
; they exchange the two register names rather than copying data.
; x86-32: SPILLED(x) is a 16-byte memory slot at tmpq + (x-8)*16 + 32*4, so
; slots 8..15 occupy bytes 128..255 of tmp; SPILL stores a register to the slot
; and UNSPILL loads it back, both with movaps (the slot must be 16-byte aligned).
389 | INIT_XMM sse | |
390 | ||
391 | %if ARCH_X86_64 | |
392 | %define SPILL SWAP | |
393 | %define UNSPILL SWAP | |
394 | %define SPILLED(x) m %+ x | |
395 | %else | |
396 | %define SPILLED(x) [tmpq+(x-8)*16 + 32*4] | |
397 | %macro SPILL 2 ; xmm#, mempos | |
398 | movaps SPILLED(%2), m%1 | |
399 | %endmacro | |
400 | %macro UNSPILL 2 | |
401 | movaps m%1, SPILLED(%2) | |
402 | %endmacro | |
403 | %endif | |
404 | ||
; DEFINE_FOUR_IMDCT: expands to four_imdct36_float for the instruction set
; chosen by the preceding INIT_XMM. The cglobal line declares 5 arguments
; (out, buf, in, win, tmp), 5 general-purpose registers and 16 XMM registers.
; What the body shows:
; - four input blocks spaced 72 bytes (18 floats) apart are loaded and passed
; through TRANSPOSE4x4PS (defined in x86util.asm; presumably a 4x4 float
; transpose so that each register holds one coefficient of all four blocks -
; TODO confirm against x86util.asm)
; - blocks at in + 0 and in + 2*72 are read with aligned mova, blocks at
; in + 72 and in + 3*72 with movu (72 and 3*72 are not multiples of 16)
; - tmp is used as scratch storage ([tmpq] .. [tmpq+4*28], plus the SPILLED
; slots on x86-32)
; - each result is multiplied by a win vector, added to a buf vector and
; stored to out with mova; buf is then overwritten with win-scaled values
405 | %macro DEFINE_FOUR_IMDCT 0 | |
406 | cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp | |
; Floats 16 and 17 of each of the four blocks (8 bytes per block). The shufps
; pair splits them: m5 = float 17 of every block, m0 = float 16 of every block.
407 | movlps m0, [inq+64] | |
408 | movhps m0, [inq+64 + 72] | |
409 | movlps m3, [inq+64 + 2*72] | |
410 | movhps m3, [inq+64 + 3*72] | |
411 | ||
412 | shufps m5, m0, m3, 0xdd | |
413 | shufps m0, m0, m3, 0x88 | |
414 | ||
; Floats 12..15 of the four blocks.
415 | mova m1, [inq+48] | |
416 | movu m6, [inq+48 + 72] | |
417 | mova m7, [inq+48 + 2*72] | |
418 | movu m3, [inq+48 + 3*72] | |
419 | ||
420 | TRANSPOSE4x4PS 1, 6, 7, 3, 4 | |
421 | ||
422 | addps m4, m6, m7 | |
423 | mova [tmpq+4*28], m4 | |
424 | ||
425 | addps m7, m3 | |
426 | addps m6, m1 | |
427 | addps m3, m0 | |
428 | addps m0, m5 | |
429 | addps m0, m7 | |
430 | addps m7, m6 | |
431 | mova [tmpq+4*12], m7 | |
432 | SPILL 3, 12 | |
433 | ||
; Floats 8..11 of the four blocks.
434 | mova m4, [inq+32] | |
435 | movu m5, [inq+32 + 72] | |
436 | mova m2, [inq+32 + 2*72] | |
437 | movu m7, [inq+32 + 3*72] | |
438 | ||
439 | TRANSPOSE4x4PS 4, 5, 2, 7, 3 | |
440 | ||
441 | addps m1, m7 | |
442 | SPILL 1, 11 | |
443 | ||
444 | addps m3, m5, m2 | |
445 | SPILL 3, 13 | |
446 | ||
447 | addps m7, m2 | |
448 | addps m5, m4 | |
449 | addps m6, m7 | |
450 | mova [tmpq], m6 | |
451 | addps m7, m5 | |
452 | mova [tmpq+4*16], m7 | |
453 | ||
; Floats 4..7 of the four blocks.
454 | mova m2, [inq+16] | |
455 | movu m7, [inq+16 + 72] | |
456 | mova m1, [inq+16 + 2*72] | |
457 | movu m6, [inq+16 + 3*72] | |
458 | ||
459 | TRANSPOSE4x4PS 2, 7, 1, 6, 3 | |
460 | ||
461 | addps m4, m6 | |
462 | addps m6, m1 | |
463 | addps m1, m7 | |
464 | addps m7, m2 | |
465 | addps m5, m6 | |
466 | SPILL 5, 15 | |
467 | addps m6, m7 | |
468 | mulps m6, [costabs + 16*2] | |
469 | mova [tmpq+4*8], m6 | |
470 | SPILL 1, 10 | |
471 | SPILL 0, 14 | |
472 | ||
; Floats 0..3 of the four blocks.
473 | mova m1, [inq] | |
474 | movu m6, [inq + 72] | |
475 | mova m3, [inq + 2*72] | |
476 | movu m5, [inq + 3*72] | |
477 | ||
478 | TRANSPOSE4x4PS 1, 6, 3, 5, 0 | |
479 | ||
480 | addps m2, m5 | |
481 | addps m5, m3 | |
482 | addps m7, m5 | |
483 | addps m3, m6 | |
484 | addps m6, m1 | |
485 | SPILL 7, 8 | |
486 | addps m5, m6 | |
487 | SPILL 6, 9 | |
488 | addps m6, m4, SPILLED(12) | |
489 | subps m6, m2 | |
490 | UNSPILL 7, 11 | |
491 | SPILL 5, 11 | |
492 | subps m5, m1, m7 | |
493 | mulps m7, [costabs + 16*5] | |
494 | addps m7, m1 | |
495 | mulps m0, m6, [costabs + 16*6] | |
496 | addps m0, m5 | |
497 | mova [tmpq+4*24], m0 | |
498 | addps m6, m5 | |
499 | mova [tmpq+4*4], m6 | |
500 | addps m6, m4, m2 | |
501 | mulps m6, [costabs + 16*1] | |
502 | subps m4, SPILLED(12) | |
503 | mulps m4, [costabs + 16*8] | |
504 | addps m2, SPILLED(12) | |
505 | mulps m2, [costabs + 16*3] | |
506 | subps m5, m7, m6 | |
507 | subps m5, m2 | |
508 | addps m6, m7 | |
509 | addps m6, m4 | |
510 | addps m7, m2 | |
511 | subps m7, m4 | |
512 | mova [tmpq+4*20], m7 | |
513 | mova m2, [tmpq+4*28] | |
514 | mova [tmpq+4*28], m5 | |
515 | UNSPILL 7, 13 | |
516 | subps m5, m7, m2 | |
517 | mulps m5, [costabs + 16*7] | |
518 | UNSPILL 1, 10 | |
519 | mulps m1, [costabs + 16*2] | |
520 | addps m4, m3, m2 | |
521 | mulps m4, [costabs + 16*4] | |
522 | addps m2, m7 | |
523 | addps m7, m3 | |
524 | mulps m7, [costabs] | |
525 | subps m3, m2 | |
526 | mulps m3, [costabs + 16*2] | |
527 | addps m2, m7, m5 | |
528 | addps m2, m1 | |
529 | SPILL 2, 10 | |
530 | addps m7, m4 | |
531 | subps m7, m1 | |
532 | SPILL 7, 12 | |
533 | subps m5, m4 | |
534 | subps m5, m1 | |
535 | UNSPILL 0, 14 | |
536 | SPILL 5, 13 | |
537 | addps m1, m0, SPILLED(15) | |
538 | subps m1, SPILLED(8) | |
539 | mova m4, [costabs + 16*5] | |
540 | mulps m4, [tmpq] | |
541 | UNSPILL 2, 9 | |
542 | addps m4, m2 | |
543 | subps m2, [tmpq] | |
544 | mulps m5, m1, [costabs + 16*6] | |
545 | addps m5, m2 | |
546 | SPILL 5, 9 | |
547 | addps m2, m1 | |
548 | SPILL 2, 14 | |
549 | UNSPILL 5, 15 | |
550 | subps m7, m5, m0 | |
551 | addps m5, SPILLED(8) | |
552 | mulps m5, [costabs + 16*1] | |
553 | mulps m7, [costabs + 16*8] | |
554 | addps m0, SPILLED(8) | |
555 | mulps m0, [costabs + 16*3] | |
556 | subps m2, m4, m5 | |
557 | subps m2, m0 | |
558 | SPILL 2, 15 | |
559 | addps m5, m4 | |
560 | addps m5, m7 | |
561 | addps m4, m0 | |
562 | subps m4, m7 | |
563 | SPILL 4, 8 | |
564 | mova m7, [tmpq+4*16] | |
565 | mova m2, [tmpq+4*12] | |
566 | addps m0, m7, m2 | |
567 | subps m0, SPILLED(11) | |
568 | mulps m0, [costabs + 16*2] | |
569 | addps m4, m7, SPILLED(11) | |
570 | mulps m4, [costabs] | |
571 | subps m7, m2 | |
572 | mulps m7, [costabs + 16*7] | |
573 | addps m2, SPILLED(11) | |
574 | mulps m2, [costabs + 16*4] | |
575 | addps m1, m7, [tmpq+4*8] | |
576 | addps m1, m4 | |
577 | addps m4, m2 | |
578 | subps m4, [tmpq+4*8] | |
579 | SPILL 4, 11 | |
580 | subps m7, m2 | |
581 | subps m7, [tmpq+4*8] | |
582 | addps m4, m6, SPILLED(10) | |
583 | subps m6, SPILLED(10) | |
584 | addps m2, m5, m1 | |
585 | mulps m2, [costabs + 16*9] | |
586 | subps m5, m1 | |
587 | mulps m5, [costabs + 16*17] | |
588 | subps m1, m4, m2 | |
589 | addps m4, m2 | |
; Windowing pattern repeated for every pair below: the difference vector is
; scaled by two win vectors, added to the matching buf vectors and stored to
; out; the sum vector is scaled by two other win vectors and written back to
; those same buf slots.
590 | mulps m2, m1, [winq+4*36] | |
591 | addps m2, [bufq+4*36] | |
592 | mova [outq+1152], m2 | |
593 | mulps m1, [winq+4*32] | |
594 | addps m1, [bufq+4*32] | |
595 | mova [outq+1024], m1 | |
596 | mulps m1, m4, [winq+4*116] | |
597 | mova [bufq+4*36], m1 | |
598 | mulps m4, [winq+4*112] | |
599 | mova [bufq+4*32], m4 | |
600 | addps m2, m6, m5 | |
601 | subps m6, m5 | |
602 | mulps m1, m6, [winq+4*68] | |
603 | addps m1, [bufq+4*68] | |
604 | mova [outq+2176], m1 | |
605 | mulps m6, [winq] | |
606 | addps m6, [bufq] | |
607 | mova [outq], m6 | |
608 | mulps m1, m2, [winq+4*148] | |
609 | mova [bufq+4*68], m1 | |
610 | mulps m2, [winq+4*80] | |
611 | mova [bufq], m2 | |
612 | addps m5, m3, [tmpq+4*24] | |
613 | mova m2, [tmpq+4*24] | |
614 | subps m2, m3 | |
615 | mova m1, SPILLED(9) | |
616 | subps m1, m0 | |
617 | mulps m1, [costabs + 16*10] | |
618 | addps m0, SPILLED(9) | |
619 | mulps m0, [costabs + 16*16] | |
620 | addps m6, m5, m1 | |
621 | subps m5, m1 | |
622 | mulps m3, m5, [winq+4*40] | |
623 | addps m3, [bufq+4*40] | |
624 | mova [outq+1280], m3 | |
625 | mulps m5, [winq+4*28] | |
626 | addps m5, [bufq+4*28] | |
627 | mova [outq+896], m5 | |
628 | mulps m1, m6, [winq+4*120] | |
629 | mova [bufq+4*40], m1 | |
630 | mulps m6, [winq+4*108] | |
631 | mova [bufq+4*28], m6 | |
632 | addps m1, m2, m0 | |
633 | subps m2, m0 | |
634 | mulps m5, m2, [winq+4*64] | |
635 | addps m5, [bufq+4*64] | |
636 | mova [outq+2048], m5 | |
637 | mulps m2, [winq+4*4] | |
638 | addps m2, [bufq+4*4] | |
639 | mova [outq+128], m2 | |
640 | mulps m0, m1, [winq+4*144] | |
641 | mova [bufq+4*64], m0 | |
642 | mulps m1, [winq+4*84] | |
643 | mova [bufq+4*4], m1 | |
644 | mova m1, [tmpq+4*28] | |
645 | mova m5, m1 | |
646 | addps m1, SPILLED(13) | |
647 | subps m5, SPILLED(13) | |
648 | UNSPILL 3, 15 | |
649 | addps m2, m7, m3 | |
650 | mulps m2, [costabs + 16*11] | |
651 | subps m3, m7 | |
652 | mulps m3, [costabs + 16*15] | |
653 | addps m0, m2, m1 | |
654 | subps m1, m2 | |
655 | SWAP m0, m2 | |
656 | mulps m6, m1, [winq+4*44] | |
657 | addps m6, [bufq+4*44] | |
658 | mova [outq+1408], m6 | |
659 | mulps m1, [winq+4*24] | |
660 | addps m1, [bufq+4*24] | |
661 | mova [outq+768], m1 | |
662 | mulps m0, m2, [winq+4*124] | |
663 | mova [bufq+4*44], m0 | |
664 | mulps m2, [winq+4*104] | |
665 | mova [bufq+4*24], m2 | |
666 | addps m0, m5, m3 | |
667 | subps m5, m3 | |
668 | mulps m1, m5, [winq+4*60] | |
669 | addps m1, [bufq+4*60] | |
670 | mova [outq+1920], m1 | |
671 | mulps m5, [winq+4*8] | |
672 | addps m5, [bufq+4*8] | |
673 | mova [outq+256], m5 | |
674 | mulps m1, m0, [winq+4*140] | |
675 | mova [bufq+4*60], m1 | |
676 | mulps m0, [winq+4*88] | |
677 | mova [bufq+4*8], m0 | |
678 | mova m1, [tmpq+4*20] | |
679 | addps m1, SPILLED(12) | |
680 | mova m2, [tmpq+4*20] | |
681 | subps m2, SPILLED(12) | |
682 | UNSPILL 7, 8 | |
683 | subps m0, m7, SPILLED(11) | |
684 | addps m7, SPILLED(11) | |
685 | mulps m4, m7, [costabs + 16*12] | |
686 | mulps m0, [costabs + 16*14] | |
687 | addps m5, m1, m4 | |
688 | subps m1, m4 | |
689 | mulps m7, m1, [winq+4*48] | |
690 | addps m7, [bufq+4*48] | |
691 | mova [outq+1536], m7 | |
692 | mulps m1, [winq+4*20] | |
693 | addps m1, [bufq+4*20] | |
694 | mova [outq+640], m1 | |
695 | mulps m1, m5, [winq+4*128] | |
696 | mova [bufq+4*48], m1 | |
697 | mulps m5, [winq+4*100] | |
698 | mova [bufq+4*20], m5 | |
699 | addps m6, m2, m0 | |
700 | subps m2, m0 | |
701 | mulps m1, m2, [winq+4*56] | |
702 | addps m1, [bufq+4*56] | |
703 | mova [outq+1792], m1 | |
704 | mulps m2, [winq+4*12] | |
705 | addps m2, [bufq+4*12] | |
706 | mova [outq+384], m2 | |
707 | mulps m0, m6, [winq+4*136] | |
708 | mova [bufq+4*56], m0 | |
709 | mulps m6, [winq+4*92] | |
710 | mova [bufq+4*12], m6 | |
711 | UNSPILL 0, 14 | |
712 | mulps m0, [costabs + 16*13] | |
713 | mova m3, [tmpq+4*4] | |
714 | addps m2, m0, m3 | |
715 | subps m3, m0 | |
716 | mulps m0, m3, [winq+4*52] | |
717 | addps m0, [bufq+4*52] | |
718 | mova [outq+1664], m0 | |
719 | mulps m3, [winq+4*16] | |
720 | addps m3, [bufq+4*16] | |
721 | mova [outq+512], m3 | |
722 | mulps m0, m2, [winq+4*132] | |
723 | mova [bufq+4*52], m0 | |
724 | mulps m2, [winq+4*96] | |
725 | mova [bufq+4*16], m2 | |
726 | RET | |
727 | %endmacro | |
728 | ||
; Instantiate four_imdct36_float for SSE, and additionally for AVX when
; HAVE_AVX_EXTERNAL is set.
729 | INIT_XMM sse | |
730 | DEFINE_FOUR_IMDCT | |
731 | ||
732 | %if HAVE_AVX_EXTERNAL | |
733 | INIT_XMM avx | |
734 | DEFINE_FOUR_IMDCT | |
735 | %endif |