;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001.163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

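; scan8 maps a 4x4 block index (16 blocks per plane, up to 3 planes) to its
; position in the nnzc/non_zero_count cache, which is laid out with a stride
; of 8 bytes per row (mirrors ff_h264_scan8 in the C decoder).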
scan8_mem: db  4+ 1*8,  5+ 1*8,  4+ 2*8,  5+ 2*8
           db  6+ 1*8,  7+ 1*8,  6+ 2*8,  7+ 2*8
           db  4+ 3*8,  5+ 3*8,  4+ 4*8,  5+ 4*8
           db  6+ 3*8,  7+ 3*8,  6+ 4*8,  7+ 4*8
           db  4+ 6*8,  5+ 6*8,  4+ 7*8,  5+ 7*8
           db  6+ 6*8,  7+ 6*8,  6+ 7*8,  7+ 7*8
           db  4+ 8*8,  5+ 8*8,  4+ 9*8,  5+ 9*8
           db  6+ 8*8,  7+ 8*8,  6+ 9*8,  7+ 9*8
           db  4+11*8,  5+11*8,  4+12*8,  5+12*8
           db  6+11*8,  7+11*8,  6+12*8,  7+12*8
           db  4+13*8,  5+13*8,  4+14*8,  5+14*8
           db  6+13*8,  7+13*8,  6+14*8,  7+14*8
%ifdef PIC
%define npicregs 1
%define scan8 picregq
%else
%define npicregs 0
%define scan8 scan8_mem
%endif

cextern pw_32
cextern pw_1

SECTION .text

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
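; A single pass of the H.264 4x4 inverse transform as computed by IDCT4_1D
; (from x86util.asm), per row/column of words:
;   z0 = w0 + w2            z1 = w0 - w2
;   z2 = (w1 >> 1) - w3     z3 = w1 + (w3 >> 1)
;   out = { z0+z3, z1+z2, z1-z2, z0-z3 }
; Two such passes with a transpose in between, a +32 bias on the first row,
; and the final >>6 inside STORE_DIFFx2 give the rounded 2-D transform.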
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq         m0, [%2]
    movq         m1, [%2+8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]

    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    mova         m6, [pw_32]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw        m0, m6
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7
    movq    [%2+ 0], m7
    movq    [%2+ 8], m7
    movq    [%2+16], m7
    movq    [%2+24], m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro

INIT_MMX mmx
; void ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_8, 3, 3, 0
    IDCT4_ADD    r0, r1, r2
    RET

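; One pass of the 8-point iDCT. Rows 1-3 and 5-7 are expected in m1-m3 and
; m5-m7; rows 0 and 4 are passed as memory operands (%1, %2), presumably
; because the odd-part butterflies alone occupy all eight MMX registers.
; The trailing SWAP renames the scrambled outputs back to m0-m7.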
%macro IDCT8_1D 2
    mova         m0, m1
    psraw        m1, 1
    mova         m4, m5
    psraw        m4, 1
    paddw        m4, m5
    paddw        m1, m0
    paddw        m4, m7
    paddw        m1, m5
    psubw        m4, m0
    paddw        m1, m3

    psubw        m0, m3
    psubw        m5, m3
    psraw        m3, 1
    paddw        m0, m7
    psubw        m5, m7
    psraw        m7, 1
    psubw        m0, m3
    psubw        m5, m7

    mova         m7, m1
    psraw        m1, 2
    mova         m3, m4
    psraw        m3, 2
    paddw        m3, m0
    psraw        m0, 2
    paddw        m1, m5
    psraw        m5, 2
    psubw        m0, m4
    psubw        m7, m5

    mova         m5, m6
    psraw        m6, 1
    mova         m4, m2
    psraw        m4, 1
    paddw        m6, m2
    psubw        m4, m5

    mova         m2, %1
    mova         m5, %2
    SUMSUB_BA     w, 5, 2
    SUMSUB_BA     w, 6, 5
    SUMSUB_BA     w, 4, 2
    SUMSUB_BA     w, 7, 6
    SUMSUB_BA     w, 0, 4
    SUMSUB_BA     w, 3, 2
    SUMSUB_BA     w, 1, 5
    SWAP          7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112]
    mova         m6, [%1+ 96]
    mova         m5, [%1+ 80]
    mova         m3, [%1+ 48]
    mova         m2, [%1+ 32]
    mova         m1, [%1+ 16]
    IDCT8_1D   [%1], [%1+ 64]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova       [%1], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova         m7, [%1]
    mova    [%2   ], m0
    mova    [%2+16], m1
    mova    [%2+32], m2
    mova    [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova    [%2+ 8], m4
    mova    [%2+24], m5
    mova    [%2+40], m6
    mova    [%2+56], m7
%endmacro

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3-4
    IDCT8_1D_FULL %2
    mova    [%2   ], m5
    mova    [%2+16], m6
    mova    [%2+32], m7

    pxor         m7, m7
%if %0 == 4
    movq   [%4+  0], m7
    movq   [%4+  8], m7
    movq   [%4+ 16], m7
    movq   [%4+ 24], m7
    movq   [%4+ 32], m7
    movq   [%4+ 40], m7
    movq   [%4+ 48], m7
    movq   [%4+ 56], m7
    movq   [%4+ 64], m7
    movq   [%4+ 72], m7
    movq   [%4+ 80], m7
    movq   [%4+ 88], m7
    movq   [%4+ 96], m7
    movq   [%4+104], m7
    movq   [%4+112], m7
    movq   [%4+120], m7
%endif
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova         m0, [%2   ]
    mova         m1, [%2+16]
    mova         m2, [%2+32]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro

INIT_MMX mmx
; void ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 0
%assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    add   word [r1], 32
    IDCT8_ADD_MMX_START r1,   rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea          r3, [r0+4]
    IDCT8_ADD_MMX_END   r0, rsp,   r2, r1
    IDCT8_ADD_MMX_END   r3, rsp+8, r2

    ADD         rsp, pad
    RET

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride, %4=gpr scratch
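; On x86-64 the whole 8x8 stays in registers: m8/m9 carry rows 0 and 4
; through the second IDCT8_1D pass. On x86-32 those rows are spilled into
; the block buffer instead, which is also why the 32-bit transpose variant
; takes two memory operands.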
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw        m0, [pw_32]

%if ARCH_X86_64 == 0
    mova    [%2   ], m0
    mova    [%2+16], m4
    IDCT8_1D   [%2], [%2+16]
    mova    [%2   ], m6
    mova    [%2+16], m7
%else
    SWAP          0, 8
    SWAP          4, 9
    IDCT8_1D     m8, m9
    SWAP          6, 8
    SWAP          7, 9
%endif

    pxor         m7, m7
    lea          %4, [%3*3]
    STORE_DIFF   m0, m6, m7, [%1     ]
    STORE_DIFF   m1, m6, m7, [%1+%3  ]
    STORE_DIFF   m2, m6, m7, [%1+%3*2]
    STORE_DIFF   m3, m6, m7, [%1+%4  ]
%if ARCH_X86_64 == 0
    mova         m0, [%2   ]
    mova         m1, [%2+16]
%else
    SWAP          0, 8
    SWAP          1, 9
%endif
    mova    [%2+  0], m7
    mova    [%2+ 16], m7
    mova    [%2+ 32], m7
    mova    [%2+ 48], m7
    mova    [%2+ 64], m7
    mova    [%2+ 80], m7
    mova    [%2+ 96], m7
    mova    [%2+112], m7
    lea          %1, [%1+%3*4]
    STORE_DIFF   m4, m6, m7, [%1     ]
    STORE_DIFF   m5, m6, m7, [%1+%3  ]
    STORE_DIFF   m0, m6, m7, [%1+%3*2]
    STORE_DIFF   m1, m6, m7, [%1+%4  ]
%endmacro

INIT_XMM sse2
; void ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 10
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET

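; Broadcast a rounded DC term, dc = (dc + 32) >> 6, as packed bytes:
; m0 holds the unsigned clamp of +dc and m1 that of -dc, so that
; paddusb m0 followed by psubusb m1 adds the signed dc to each pixel
; with saturation to [0,255]. %1 is then repurposed as stride*3.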
%macro DC_ADD_MMXEXT_INIT 2
    add          %1, 32
    sar          %1, 6
    movd         m0, %1d
    lea          %1, [%2*3]
    pshufw       m0, m0, 0
    pxor         m1, m1
    psubw        m1, m0
    packuswb     m0, m0
    packuswb     m1, m1
%endmacro

%macro DC_ADD_MMXEXT_OP 4
    %1           m2, [%2     ]
    %1           m3, [%2+%3  ]
    %1           m4, [%2+%3*2]
    %1           m5, [%2+%4  ]
    paddusb      m2, m0
    paddusb      m3, m0
    paddusb      m4, m0
    paddusb      m5, m0
    psubusb      m2, m1
    psubusb      m3, m1
    psubusb      m4, m1
    psubusb      m5, m1
    %1    [%2     ], m2
    %1    [%2+%3  ], m3
    %1    [%2+%3*2], m4
    %1    [%2+%4  ], m5
%endmacro

INIT_MMX mmxext
; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
%if ARCH_X86_64
cglobal h264_idct_dc_add_8, 3, 4, 0
    movsx        r3, word [r1]
    mov  dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP movh, r0, r2, r3
    RET

; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 3, 4, 0
    movsx        r3, word [r1]
    mov  dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    lea          r0, [r0+r2*4]
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    RET
%else
; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov  dword [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP movh, r0, r1, r2
    RET

; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov  dword [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    lea          r0, [r0+r1*4]
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    RET
%endif

INIT_MMX mmx
; void ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; void ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
%assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2,   rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6, rsp, r3, r2
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6, rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET

INIT_MMX mmxext
; void ff_h264_idct_add16_8_mmxext(uint8_t *dst, const int *block_offset,
;                                  int16_t *block, int stride,
;                                  const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
; void ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset,
;                                    int16_t *block, int stride,
;                                    const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmxext
; void ff_h264_idct_add16intra_8_mmxext(uint8_t *dst, const int *block_offset,
;                                       int16_t *block, int stride,
;                                       const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; void ff_h264_idct8_add4_8_mmxext(uint8_t *dst, const int *block_offset,
;                                  int16_t *block, int stride,
;                                  const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
%assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2,   rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6, rsp, r3, r2
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6, rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET

INIT_XMM sse2
; void ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
INIT_MMX cpuname
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
INIT_XMM cpuname
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    IDCT8_ADD_SSE dst2q, r2, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
h264_idct_add8_mmx_plane:
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

; void ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset,
;                              int16_t *block, int stride,
;                              const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    mov          r5, 16
    add          r2, 512
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
%if ARCH_X86_64
    mov       dst2q, r0
%endif
    call h264_idct_add8_mmx_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add        r0mp, gprsize
%endif
    call h264_idct_add8_mmx_plane
    RET

h264_idct_add8_mmxext_plane:
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    DC_ADD_MMXEXT_OP movh, r0, r3, r6
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

INIT_MMX mmxext
; void ff_h264_idct_add8_8_mmxext(uint8_t **dest, const int *block_offset,
;                                 int16_t *block, int stride,
;                                 const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    mov          r5, 16
    add          r2, 512
%if ARCH_X86_64
    mov       dst2q, r0
%endif
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
    call h264_idct_add8_mmxext_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add        r0mp, gprsize
%endif
    call h264_idct_add8_mmxext_plane
    RET

; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6 = clobbered
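; Handles the rounded DC terms, (dc + 32) >> 6, of two horizontally
; adjacent 4x4 blocks at once: the two DCs (32 bytes apart in the block
; array) are interleaved so the +dc bytes of both blocks end up side by
; side in m0 (and the -dc bytes in m1), and one movq per row covers all
; 8 pixels of the pair.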
h264_idct_dc_add8_mmxext:
    movd         m0, [r2   ]          ;  0 0 X D
    mov word [r2+ 0], 0
    punpcklwd    m0, [r2+32]          ;  x X d D
    mov word [r2+32], 0
    paddsw       m0, [pw_32]
    psraw        m0, 6
    punpcklwd    m0, m0               ;  d d D D
    pxor         m1, m1               ;  0 0 0 0
    psubw        m1, m0               ; -d-d-D-D
    packuswb     m0, m1               ; -d-d-D-D d d D D
    pshufw       m1, m0, 0xFA         ; -d-d-d-d-D-D-D-D
    punpcklwd    m0, m0               ;  d d d d D D D D
    lea          r6, [r3*3]
    DC_ADD_MMXEXT_OP movq, r0, r3, r6
    ret

ALIGN 16
INIT_XMM sse2
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
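; Transforms two consecutive 4x4 blocks (at r2 and r2+32) in one pass:
; each xmm row holds a row of the first block in its low half (movq) and
; of the second block in its high half (movhps); TRANSPOSE2x4x4W then
; transposes the two 4x4 halves independently, so the blocks never mix.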
h264_add8x4_idct_sse2:
    movq         m0, [r2+ 0]
    movq         m1, [r2+ 8]
    movq         m2, [r2+16]
    movq         m3, [r2+24]
    movhps       m0, [r2+32]
    movhps       m1, [r2+40]
    movhps       m2, [r2+48]
    movhps       m3, [r2+56]
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
    paddw        m0, [pw_32]
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7
    mova    [r2+ 0], m7
    mova    [r2+16], m7
    mova    [r2+32], m7
    mova    [r2+48], m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea          r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret

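; %2 is the scan8 cache offset of a pair of horizontally adjacent 4x4
; blocks, so a single word load tests both nnzc entries at once (e.g.
; 0xc = 4+1*8 = scan8[0]). If either block has coefficients, the 8x4
; helper above transforms the pair.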
%macro add16_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r5
%else
    add          r0, r0m
%endif
    call h264_add8x4_idct_sse2
.cycle%1end:
%if %1 < 7
    add          r2, 64
%endif
%endmacro

; void ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
%if ARCH_X86_64
    mov          r5, r0
%endif
    ; unrolling of the loop leads to an average performance gain of
    ; 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
    RET

%macro add16intra_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r7
%else
    add          r0, r0m
%endif
    call h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r7
%else
    add          r0, r0m
%endif
    call h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 < 7
    add          r2, 64
%endif
%endmacro

; void ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
;                                     int16_t *block, int stride,
;                                     const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
%if ARCH_X86_64
    mov          r7, r0
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
    RET

%macro add8_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
%if ARCH_X86_64
    mov         r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add          r0, [r7]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]
    jz .cycle%1end
%if ARCH_X86_64
    mov         r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add          r0, [r7]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 == 1
    add          r2, 384+64
%elif %1 < 3
    add          r2, 64
%endif
%endmacro

; void ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
    add          r2, 512
%if ARCH_X86_64
    mov          r7, r0
%endif
    add8_sse2_cycle 0, 0x34
    add8_sse2_cycle 1, 0x3c
%if ARCH_X86_64
    add          r7, gprsize
%else
    add        r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x5c
    add8_sse2_cycle 3, 0x64
    RET

; void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)

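; One pass of the 4-point (Walsh-)Hadamard transform used to invert the
; luma DC transform: pure butterflies, no shifts or rounding.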
%macro WALSH4_1D 5
    SUMSUB_BADC   w, %4, %3, %2, %1, %5
    SUMSUB_BADC   w, %4, %2, %3, %1, %5
    SWAP         %1, %4, %3
%endmacro

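; Dequantize four DC words per register: out = (coef * qmul + 128) >> %3.
; t3d (prepared by the caller) holds qmul in its low word and the rounding
; constant 128 in its high word, so after interleaving the coefficients
; with pw_1, a single pmaddwd yields coef*qmul + 1*128.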
%macro DEQUANT_MMX 3
    mova         m7, [pw_1]
    mova         m4, %1
    punpcklwd    %1, m7
    punpckhwd    m4, m7
    mova         m5, %2
    punpcklwd    %2, m7
    punpckhwd    m5, m7
    movd         m7, t3d
    punpckldq    m7, m7
    pmaddwd      %1, m7
    pmaddwd      %2, m7
    pmaddwd      m4, m7
    pmaddwd      m5, m7
    psrad        %1, %3
    psrad        %2, %3
    psrad        m4, %3
    psrad        m5, %3
    packssdw     %1, m4
    packssdw     %2, m5
%endmacro

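; Scatter dequantized DC words back into the block array: each 4x4 block
; is 32 bytes (16 int16_t), so the DC of block n lands at t2 + n*32.
; The SSE path stores eight words per invocation, the MMX path four.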
%macro STORE_WORDS 5-9
%if cpuflag(sse)
    movd        t0d, %1
    psrldq       %1, 4
    movd        t1d, %1
    psrldq       %1, 4
    mov  [t2+%2*32], t0w
    mov  [t2+%4*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov  [t2+%3*32], t0w
    mov  [t2+%5*32], t1w
    movd        t0d, %1
    psrldq       %1, 4
    movd        t1d, %1
    mov  [t2+%6*32], t0w
    mov  [t2+%8*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov  [t2+%7*32], t0w
    mov  [t2+%9*32], t1w
%else
    movd        t0d, %1
    psrlq        %1, 32
    movd        t1d, %1
    mov  [t2+%2*32], t0w
    mov  [t2+%4*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov  [t2+%3*32], t0w
    mov  [t2+%5*32], t1w
%endif
%endmacro

%macro DEQUANT_STORE 1
%if cpuflag(sse2)
    movd       xmm4, t3d
    movq       xmm5, [pw_1]
    pshufd     xmm4, xmm4, 0
    movq2dq    xmm0, m0
    movq2dq    xmm1, m1
    movq2dq    xmm2, m2
    movq2dq    xmm3, m3
    punpcklwd  xmm0, xmm5
    punpcklwd  xmm1, xmm5
    punpcklwd  xmm2, xmm5
    punpcklwd  xmm3, xmm5
    pmaddwd    xmm0, xmm4
    pmaddwd    xmm1, xmm4
    pmaddwd    xmm2, xmm4
    pmaddwd    xmm3, xmm4
    psrad      xmm0, %1
    psrad      xmm1, %1
    psrad      xmm2, %1
    psrad      xmm3, %1
    packssdw   xmm0, xmm1
    packssdw   xmm2, xmm3
    STORE_WORDS xmm0,  0,  1,  4,  5,  2,  3,  6,  7
    STORE_WORDS xmm2,  8,  9, 12, 13, 10, 11, 14, 15
%else
    DEQUANT_MMX  m0, m1, %1
    STORE_WORDS  m0,  0,  1,  4,  5
    STORE_WORDS  m1,  2,  3,  6,  7

    DEQUANT_MMX  m2, m3, %1
    STORE_WORDS  m2,  8,  9, 12, 13
    STORE_WORDS  m3, 10, 11, 14, 15
%endif
%endmacro

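; 4x4 Hadamard along rows and columns, then dequantize and scatter the
; DCs. qmul values above 0x7fff would not fit pmaddwd's signed 16-bit
; operands, so .big_qmul pre-shifts qmul right and compensates by
; reducing the final shift accordingly.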
%macro IDCT_DC_DEQUANT 1
cglobal h264_luma_dc_dequant_idct, 3, 4, %1
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    WIN64_SPILL_XMM %1
    movq         m3, [r1+24]
    movq         m2, [r1+16]
    movq         m1, [r1+ 8]
    movq         m0, [r1+ 0]
    WALSH4_1D     0, 1, 2, 3, 4
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    WALSH4_1D     0, 1, 2, 3, 4

; shift, tmp, output, qmul
%if WIN64
    DECLARE_REG_TMP 0, 3, 1, 2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg         r0, t2
%elif ARCH_X86_64
    DECLARE_REG_TMP 3, 1, 0, 2
%else
    DECLARE_REG_TMP 1, 3, 0, 2
%endif

    cmp         t3d, 32767
    jg .big_qmul
    add         t3d, 128 << 16
    DEQUANT_STORE 8
    RET
.big_qmul:
    bsr         t0d, t3d
    add         t3d, 128 << 16
    mov         t1d, 7
    cmp         t0d, t1d
    cmovg       t0d, t1d
    inc         t1d
    shr         t3d, t0b
    sub         t1d, t0d
%if cpuflag(sse2)
    movd       xmm6, t1d
    DEQUANT_STORE xmm6
%else
    movd         m6, t1d
    DEQUANT_STORE m6
%endif
    RET
%endmacro

INIT_MMX mmx
IDCT_DC_DEQUANT 0
INIT_MMX sse2
IDCT_DC_DEQUANT 7