; ffmpeg/libavcodec/x86/h264_idct.asm
;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001.163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

scan8_mem: db  4+ 1*8,  5+ 1*8,  4+ 2*8,  5+ 2*8
           db  6+ 1*8,  7+ 1*8,  6+ 2*8,  7+ 2*8
           db  4+ 3*8,  5+ 3*8,  4+ 4*8,  5+ 4*8
           db  6+ 3*8,  7+ 3*8,  6+ 4*8,  7+ 4*8
           db  4+ 6*8,  5+ 6*8,  4+ 7*8,  5+ 7*8
           db  6+ 6*8,  7+ 6*8,  6+ 7*8,  7+ 7*8
           db  4+ 8*8,  5+ 8*8,  4+ 9*8,  5+ 9*8
           db  6+ 8*8,  7+ 8*8,  6+ 9*8,  7+ 9*8
           db  4+11*8,  5+11*8,  4+12*8,  5+12*8
           db  6+11*8,  7+11*8,  6+12*8,  7+12*8
           db  4+13*8,  5+13*8,  4+14*8,  5+14*8
           db  6+13*8,  7+13*8,  6+14*8,  7+14*8
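
; scan8_mem maps a 4x4 block index (0-15 for luma, higher indices for the
; chroma planes) to its position in the caller's non-zero-count cache, which
; is laid out eight entries per row (the "nnzc[6 * 8]" array in the C
; prototypes below); the loops read nnzc[scan8[i]] to decide whether block i
; has any coefficients at all.
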
%ifdef PIC
%define npicregs 1
%define scan8 picregq
%else
%define npicregs 0
%define scan8 scan8_mem
%endif

cextern pw_32
cextern pw_1

SECTION .text

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
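; The two IDCT4_1D passes (the macro comes from x86util.asm) implement the
; H.264 4x4 inverse transform; for one row/column d0..d3 it computes
;     z0 = d0 + d2            z1 = d0 - d2
;     z2 = (d1 >> 1) - d3     z3 = d1 + (d3 >> 1)
;     out = { z0+z3, z1+z2, z1-z2, z0-z3 }
; The +32 added before the second pass and the shift of 6 passed to
; STORE_DIFFx2 give the final rounding (res + 32) >> 6 before the result is
; added to the prediction in *dst with unsigned saturation.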
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq         m0, [%2]
    movq         m1, [%2+8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]

    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    mova         m6, [pw_32]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw        m0, m6
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7
    movq    [%2+ 0], m7
    movq    [%2+ 8], m7
    movq    [%2+16], m7
    movq    [%2+24], m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro

INIT_MMX mmx
; void ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_8, 3, 3, 0
    IDCT4_ADD    r0, r1, r2
    RET
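
; One 1D pass of the H.264 8x8 inverse transform. On entry rows 1, 2, 3, 5,
; 6 and 7 of the (partially transformed) block are expected in m1-m3 and
; m5-m7; rows 0 and 4 are taken from the memory operands %1 and %2 so that
; all eight rows fit in the eight mm/xmm registers. The multiplications of
; the transform are realised with arithmetic shifts by 1 and 2, and the final
; SUMSUB_BA/SWAP sequence leaves the eight output rows in m0-m7 in order.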
%macro IDCT8_1D 2
    mova         m0, m1
    psraw        m1, 1
    mova         m4, m5
    psraw        m4, 1
    paddw        m4, m5
    paddw        m1, m0
    paddw        m4, m7
    paddw        m1, m5
    psubw        m4, m0
    paddw        m1, m3

    psubw        m0, m3
    psubw        m5, m3
    psraw        m3, 1
    paddw        m0, m7
    psubw        m5, m7
    psraw        m7, 1
    psubw        m0, m3
    psubw        m5, m7

    mova         m7, m1
    psraw        m1, 2
    mova         m3, m4
    psraw        m3, 2
    paddw        m3, m0
    psraw        m0, 2
    paddw        m1, m5
    psraw        m5, 2
    psubw        m0, m4
    psubw        m7, m5

    mova         m5, m6
    psraw        m6, 1
    mova         m4, m2
    psraw        m4, 1
    paddw        m6, m2
    psubw        m4, m5

    mova         m2, %1
    mova         m5, %2
    SUMSUB_BA     w, 5, 2
    SUMSUB_BA     w, 6, 5
    SUMSUB_BA     w, 4, 2
    SUMSUB_BA     w, 7, 6
    SUMSUB_BA     w, 0, 4
    SUMSUB_BA     w, 3, 2
    SUMSUB_BA     w, 1, 5
    SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112]
    mova         m6, [%1+ 96]
    mova         m5, [%1+ 80]
    mova         m3, [%1+ 48]
    mova         m2, [%1+ 32]
    mova         m1, [%1+ 16]
    IDCT8_1D  [%1], [%1+ 64]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
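; With MMX registers holding only four coefficients, the 8x8 block is
; processed as two 8x4 halves: IDCT8_ADD_MMX_START runs the first 1D pass on
; one half and transposes it into a 128-byte scratch buffer on the stack,
; then IDCT8_ADD_MMX_END runs the second pass from that buffer and adds the
; rounded result to the destination (and, when a fourth argument is given,
; also clears the original coefficient block).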
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova       [%1], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova         m7, [%1]
    mova    [%2   ], m0
    mova    [%2+16], m1
    mova    [%2+32], m2
    mova    [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova    [%2+ 8], m4
    mova    [%2+24], m5
    mova    [%2+40], m6
    mova    [%2+56], m7
%endmacro

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3-4
    IDCT8_1D_FULL %2
    mova    [%2   ], m5
    mova    [%2+16], m6
    mova    [%2+32], m7

    pxor         m7, m7
%if %0 == 4
    movq   [%4+  0], m7
    movq   [%4+  8], m7
    movq   [%4+ 16], m7
    movq   [%4+ 24], m7
    movq   [%4+ 32], m7
    movq   [%4+ 40], m7
    movq   [%4+ 48], m7
    movq   [%4+ 56], m7
    movq   [%4+ 64], m7
    movq   [%4+ 72], m7
    movq   [%4+ 80], m7
    movq   [%4+ 88], m7
    movq   [%4+ 96], m7
    movq   [%4+104], m7
    movq   [%4+112], m7
    movq   [%4+120], m7
%endif
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova         m0, [%2   ]
    mova         m1, [%2+16]
    mova         m2, [%2+32]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro

INIT_MMX mmx
; void ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 0
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    add word [r1], 32
    IDCT8_ADD_MMX_START r1  , rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea          r3, [r0+4]
    IDCT8_ADD_MMX_END   r0  , rsp,   r2, r1
    IDCT8_ADD_MMX_END   r3  , rsp+8, r2

    ADD         rsp, pad
    RET

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
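; (%4 is a spare GPR that the macro clobbers with 3*stride.)
; In the SSE2 version a whole 8x8 row pass fits in one register set: on
; x86-64 the transpose and the second IDCT8_1D keep rows 0 and 4 in m8/m9,
; while on 32-bit x86 they are spilled to the coefficient buffer instead.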
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw        m0, [pw_32]

%if ARCH_X86_64 == 0
    mova    [%2   ], m0
    mova    [%2+16], m4
    IDCT8_1D  [%2], [%2+ 16]
    mova    [%2   ], m6
    mova    [%2+16], m7
%else
    SWAP          0, 8
    SWAP          4, 9
    IDCT8_1D     m8, m9
    SWAP          6, 8
    SWAP          7, 9
%endif

    pxor         m7, m7
    lea          %4, [%3*3]
    STORE_DIFF   m0, m6, m7, [%1     ]
    STORE_DIFF   m1, m6, m7, [%1+%3  ]
    STORE_DIFF   m2, m6, m7, [%1+%3*2]
    STORE_DIFF   m3, m6, m7, [%1+%4  ]
%if ARCH_X86_64 == 0
    mova         m0, [%2   ]
    mova         m1, [%2+16]
%else
    SWAP          0, 8
    SWAP          1, 9
%endif
    mova    [%2+  0], m7
    mova    [%2+ 16], m7
    mova    [%2+ 32], m7
    mova    [%2+ 48], m7
    mova    [%2+ 64], m7
    mova    [%2+ 80], m7
    mova    [%2+ 96], m7
    mova    [%2+112], m7
    lea          %1, [%1+%3*4]
    STORE_DIFF   m4, m6, m7, [%1     ]
    STORE_DIFF   m5, m6, m7, [%1+%3  ]
    STORE_DIFF   m0, m6, m7, [%1+%3*2]
    STORE_DIFF   m1, m6, m7, [%1+%4  ]
%endmacro

INIT_XMM sse2
; void ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 10
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET
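
; DC-only blocks: %1 holds the DC coefficient and is turned into
; dc = (%1 + 32) >> 6, broadcast as packed bytes with m0 = max(dc, 0) and
; m1 = max(-dc, 0); %1 is then reused for 3*stride (%2). DC_ADD_MMXEXT_OP
; adds m0 and subtracts m1 with unsigned saturation over four rows, which is
; the same as clipping dst[x] + dc to the 0..255 range.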
%macro DC_ADD_MMXEXT_INIT 2
    add          %1, 32
    sar          %1, 6
    movd         m0, %1d
    lea          %1, [%2*3]
    pshufw       m0, m0, 0
    pxor         m1, m1
    psubw        m1, m0
    packuswb     m0, m0
    packuswb     m1, m1
%endmacro

%macro DC_ADD_MMXEXT_OP 4
    %1           m2, [%2     ]
    %1           m3, [%2+%3  ]
    %1           m4, [%2+%3*2]
    %1           m5, [%2+%4  ]
    paddusb      m2, m0
    paddusb      m3, m0
    paddusb      m4, m0
    paddusb      m5, m0
    psubusb      m2, m1
    psubusb      m3, m1
    psubusb      m4, m1
    psubusb      m5, m1
    %1    [%2     ], m2
    %1    [%2+%3  ], m3
    %1    [%2+%3*2], m4
    %1    [%2+%4  ], m5
%endmacro

INIT_MMX mmxext
; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
%if ARCH_X86_64
cglobal h264_idct_dc_add_8, 3, 4, 0
    movsx        r3, word [r1]
    mov dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP movh, r0, r2, r3
    RET

; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 3, 4, 0
    movsx        r3, word [r1]
    mov dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    lea          r0, [r0+r2*4]
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    RET
%else
; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov dword [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP movh, r0, r1, r2
    RET

; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov dword [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    lea          r0, [r0+r1*4]
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    RET
%endif

INIT_MMX mmx
; void ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
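; All of the add16/add16intra/add8 entry points below share the same loop
; shape: r5 counts scan8 indices, nnzc[scan8[r5]] (via r4) tells whether the
; 4x4 block has coefficients, block_offset[r5] (via r1) gives the pixel
; offset of the block inside dst, and r2 steps through the coefficients in
; 32-byte (16 int16_t) chunks.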
cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; void ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp, r3, r2
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET

INIT_MMX mmxext
; void ff_h264_idct_add16_8_mmxext(uint8_t *dst, const int *block_offset,
;                                  int16_t *block, int stride,
;                                  const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    mov word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
; void ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset,
;                                    int16_t *block, int stride,
;                                    const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmxext
; void ff_h264_idct_add16intra_8_mmxext(uint8_t *dst, const int *block_offset,
;                                       int16_t *block, int stride,
;                                       const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; void ff_h264_idct8_add4_8_mmxext(uint8_t *dst, const int *block_offset,
;                                  int16_t *block, int stride,
;                                  const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    mov word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp, r3, r2
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET

INIT_XMM sse2
; void ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
INIT_MMX cpuname
    mov word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
INIT_XMM cpuname
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    IDCT8_ADD_SSE dst2q, r2, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
h264_idct_add8_mmx_plane:
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

; void ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset,
;                              int16_t *block, int stride,
;                              const uint8_t nnzc[6 * 8])
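; The chroma entry points work on dest[], an array of plane pointers: r2 is
; advanced past the 16 luma blocks (16 * 32 bytes = 512), r5 starts at scan8
; index 16 for the first chroma plane and at 32 for the second, and between
; the two plane calls the dest pointer is stepped to dest[1].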
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    mov          r5, 16
    add          r2, 512
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
%if ARCH_X86_64
    mov       dst2q, r0
%endif
    call         h264_idct_add8_mmx_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add        r0mp, gprsize
%endif
    call         h264_idct_add8_mmx_plane
    RET

h264_idct_add8_mmxext_plane:
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    DC_ADD_MMXEXT_OP movh, r0, r3, r6
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

INIT_MMX mmxext
; void ff_h264_idct_add8_8_mmxext(uint8_t **dest, const int *block_offset,
;                                 int16_t *block, int stride,
;                                 const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    mov          r5, 16
    add          r2, 512
%if ARCH_X86_64
    mov       dst2q, r0
%endif
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
    call         h264_idct_add8_mmxext_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add        r0mp, gprsize
%endif
    call         h264_idct_add8_mmxext_plane
    RET

; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
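; Applies the DC terms of two horizontally adjacent 4x4 blocks (the words at
; [r2] and [r2+32]) to an 8x4 pixel area in one pass, using the usual
; (dc + 32) >> 6 rounding and the saturated byte add/subtract from
; DC_ADD_MMXEXT_OP.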
h264_idct_dc_add8_mmxext:
    movd         m0, [r2   ]        ;  0 0 X D
    mov word [r2+ 0], 0
    punpcklwd    m0, [r2+32]        ;  x X d D
    mov word [r2+32], 0
    paddsw       m0, [pw_32]
    psraw        m0, 6
    punpcklwd    m0, m0             ;  d d D D
    pxor         m1, m1             ;  0 0 0 0
    psubw        m1, m0             ; -d-d-D-D
    packuswb     m0, m1             ; -d-d-D-D d d D D
    pshufw       m1, m0, 0xFA       ; -d-d-d-d-D-D-D-D
    punpcklwd    m0, m0             ;  d d d d D D D D
    lea          r6, [r3*3]
    DC_ADD_MMXEXT_OP movq, r0, r3, r6
    ret

ALIGN 16
INIT_XMM sse2
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
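; Does two 4x4 blocks at once: the block at r2 goes into the low halves of
; the XMM registers and the block at r2+32 into the high halves (movq +
; movhps), so a single pair of IDCT4_1D passes covers an 8x4 pixel area.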
h264_add8x4_idct_sse2:
    movq         m0, [r2+ 0]
    movq         m1, [r2+ 8]
    movq         m2, [r2+16]
    movq         m3, [r2+24]
    movhps       m0, [r2+32]
    movhps       m1, [r2+40]
    movhps       m2, [r2+48]
    movhps       m3, [r2+56]
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
    paddw        m0, [pw_32]
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7
    mova    [r2+ 0], m7
    mova    [r2+16], m7
    mova    [r2+32], m7
    mova    [r2+48], m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea          r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret
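
; The SSE2 add16/add16intra/add8 versions unroll the block loop completely:
; each add*_sse2_cycle invocation checks one 16-bit word of the nnzc cache
; (two neighbouring scan8 entries, e.g. 0xc = scan8[0]) and then handles the
; corresponding pair of 4x4 blocks with h264_add8x4_idct_sse2; the intra and
; chroma variants fall back to h264_idct_dc_add8_mmxext when only the DC
; coefficients are set.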
%macro add16_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r5
%else
    add          r0, r0m
%endif
    call         h264_add8x4_idct_sse2
.cycle%1end:
%if %1 < 7
    add          r2, 64
%endif
%endmacro

; void ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
%if ARCH_X86_64
    mov          r5, r0
%endif
    ; unrolling of the loop leads to an average performance gain of
    ; 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
    RET

%macro add16intra_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r7
%else
    add          r0, r0m
%endif
    call         h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r7
%else
    add          r0, r0m
%endif
    call         h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 < 7
    add          r2, 64
%endif
%endmacro

; void ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
;                                     int16_t *block, int stride,
;                                     const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
%if ARCH_X86_64
    mov          r7, r0
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
    RET

%macro add8_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
%if ARCH_X86_64
    mov         r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add          r0, [r7]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call         h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]
    jz .cycle%1end
%if ARCH_X86_64
    mov         r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add          r0, [r7]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call         h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 == 1
    add          r2, 384+64
%elif %1 < 3
    add          r2, 64
%endif
%endmacro

; void ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
    add          r2, 512
%if ARCH_X86_64
    mov          r7, r0
%endif
    add8_sse2_cycle 0, 0x34
    add8_sse2_cycle 1, 0x3c
%if ARCH_X86_64
    add          r7, gprsize
%else
    add        r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x5c
    add8_sse2_cycle 3, 0x64
    RET

;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)
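;
; The 16 luma DC coefficients are run through a 4x4 Hadamard (Walsh)
; transform twice (rows, transpose, columns) and then dequantized as
; (dc * qmul + 128) >> 8; interleaving with pw_1 and pmaddwd computes the
; multiply and the rounding add in one step. For qmul values that do not fit
; in a signed 16-bit word, .big_qmul pre-shifts qmul and reduces the final
; shift accordingly. Each result is written to the first coefficient of its
; 4x4 block, i.e. output[i * 16].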

%macro WALSH4_1D 5
    SUMSUB_BADC   w, %4, %3, %2, %1, %5
    SUMSUB_BADC   w, %4, %2, %3, %1, %5
    SWAP %1, %4, %3
%endmacro

%macro DEQUANT_MMX 3
    mova         m7, [pw_1]
    mova         m4, %1
    punpcklwd    %1, m7
    punpckhwd    m4, m7
    mova         m5, %2
    punpcklwd    %2, m7
    punpckhwd    m5, m7
    movd         m7, t3d
    punpckldq    m7, m7
    pmaddwd      %1, m7
    pmaddwd      %2, m7
    pmaddwd      m4, m7
    pmaddwd      m5, m7
    psrad        %1, %3
    psrad        %2, %3
    psrad        m4, %3
    psrad        m5, %3
    packssdw     %1, m4
    packssdw     %2, m5
%endmacro
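
; Scatters the dequantized DC values: each 16-bit result is stored 32 bytes
; (one 4x4 block) apart in the output, at the block positions given by the
; remaining arguments. t0/t1 are scratch GPRs and t2 is the output pointer,
; as set up by DECLARE_REG_TMP in IDCT_DC_DEQUANT below.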
%macro STORE_WORDS 5-9
%if cpuflag(sse)
    movd        t0d, %1
    psrldq       %1, 4
    movd        t1d, %1
    psrldq       %1, 4
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
    movd        t0d, %1
    psrldq       %1, 4
    movd        t1d, %1
    mov [t2+%6*32], t0w
    mov [t2+%8*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov [t2+%7*32], t0w
    mov [t2+%9*32], t1w
%else
    movd        t0d, %1
    psrlq        %1, 32
    movd        t1d, %1
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
%endif
%endmacro

%macro DEQUANT_STORE 1
%if cpuflag(sse2)
    movd       xmm4, t3d
    movq       xmm5, [pw_1]
    pshufd     xmm4, xmm4, 0
    movq2dq    xmm0, m0
    movq2dq    xmm1, m1
    movq2dq    xmm2, m2
    movq2dq    xmm3, m3
    punpcklwd  xmm0, xmm5
    punpcklwd  xmm1, xmm5
    punpcklwd  xmm2, xmm5
    punpcklwd  xmm3, xmm5
    pmaddwd    xmm0, xmm4
    pmaddwd    xmm1, xmm4
    pmaddwd    xmm2, xmm4
    pmaddwd    xmm3, xmm4
    psrad      xmm0, %1
    psrad      xmm1, %1
    psrad      xmm2, %1
    psrad      xmm3, %1
    packssdw   xmm0, xmm1
    packssdw   xmm2, xmm3
    STORE_WORDS xmm0,  0,  1,  4,  5,  2,  3,  6,  7
    STORE_WORDS xmm2,  8,  9, 12, 13, 10, 11, 14, 15
%else
    DEQUANT_MMX  m0, m1, %1
    STORE_WORDS  m0,  0,  1,  4,  5
    STORE_WORDS  m1,  2,  3,  6,  7

    DEQUANT_MMX  m2, m3, %1
    STORE_WORDS  m2,  8,  9, 12, 13
    STORE_WORDS  m3, 10, 11, 14, 15
%endif
%endmacro

%macro IDCT_DC_DEQUANT 1
cglobal h264_luma_dc_dequant_idct, 3, 4, %1
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    WIN64_SPILL_XMM %1
    movq         m3, [r1+24]
    movq         m2, [r1+16]
    movq         m1, [r1+ 8]
    movq         m0, [r1+ 0]
    WALSH4_1D     0, 1, 2, 3, 4
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    WALSH4_1D     0, 1, 2, 3, 4

; shift, tmp, output, qmul
%if WIN64
    DECLARE_REG_TMP 0, 3, 1, 2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg         r0, t2
%elif ARCH_X86_64
    DECLARE_REG_TMP 3, 1, 0, 2
%else
    DECLARE_REG_TMP 1, 3, 0, 2
%endif

    cmp         t3d, 32767
    jg .big_qmul
    add         t3d, 128 << 16
    DEQUANT_STORE 8
    RET
.big_qmul:
    bsr         t0d, t3d
    add         t3d, 128 << 16
    mov         t1d, 7
    cmp         t0d, t1d
    cmovg       t0d, t1d
    inc         t1d
    shr         t3d, t0b
    sub         t1d, t0d
%if cpuflag(sse2)
    movd       xmm6, t1d
    DEQUANT_STORE xmm6
%else
    movd         m6, t1d
    DEQUANT_STORE m6
%endif
    RET
%endmacro

INIT_MMX mmx
IDCT_DC_DEQUANT 0
INIT_MMX sse2
IDCT_DC_DEQUANT 7