;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_pixel_max: times 8 dw ((1 << 10)-1)
pd_32:        times 4 dd 32

SECTION .text
;-----------------------------------------------------------------------------
; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
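; STORE_DIFFx2: shift two rows of four 32-bit residuals down by 6, add them to
; two rows of dst and clip to the 10-bit pixel range. Roughly the following C,
; as a reference sketch only (stride in pixels here; the macro takes a byte
; stride, and the intermediate adds saturate):
;   for (i = 0; i < 4; i++) {
;       dst[i]          = av_clip(dst[i]          + (row0[i] >> 6), 0, 1023);
;       dst[stride + i] = av_clip(dst[stride + i] + (row1[i] >> 6), 0, 1023);
;   }
; %1/%2 = coeff rows, %3 = scratch, %4 = zero reg, %5 = dst, %6 = byte stride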
%macro STORE_DIFFx2 6
    psrad     %1, 6
    psrad     %2, 6
    packssdw  %1, %2
    movq      %3, [%5]
    movhps    %3, [%5+%6]
    paddsw    %1, %3
    CLIPW     %1, %4, [pw_pixel_max]
    movq      [%5], %1
    movhps    [%5+%6], %1
%endmacro

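; STORE_DIFF16 is the full-register variant: one xmm of eight pixels per call,
; with the clip bounds passed in registers (%3 = min, %4 = max) and %5 = dst.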
%macro STORE_DIFF16 5
    psrad     %1, 6
    psrad     %2, 6
    packssdw  %1, %2
    paddsw    %1, [%5]
    CLIPW     %1, %3, %4
    mova      [%5], %1
%endmacro

; %1=dst, %2=block (in), %3=stride
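; Two 1D passes with a transpose in between: the first IDCT4_1D works on the
; four coefficient rows held one per register, TRANSPOSE4x4D flips the block,
; the +32 bias supplies the final rounding, and STORE_DIFFx2 applies the >>6,
; the add to dst and the clip. The coefficient block is zeroed on the way out,
; as the decoder expects.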
%macro IDCT4_ADD_10 3
    mova   m0, [%2+ 0]
    mova   m1, [%2+16]
    mova   m2, [%2+32]
    mova   m3, [%2+48]
    IDCT4_1D d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd  m0, [pd_32]
    IDCT4_1D d,0,1,2,3,4,5
    pxor   m5, m5
    mova   [%2+ 0], m5
    mova   [%2+16], m5
    mova   [%2+32], m5
    mova   [%2+48], m5
    STORE_DIFFx2 m0, m1, m4, m5, %1, %3
    lea    %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, %1, %3
%endmacro

%macro IDCT_ADD_10 0
cglobal h264_idct_add_10, 3,3
    IDCT4_ADD_10 r0, r1, r2
    RET
%endmacro

INIT_XMM sse2
IDCT_ADD_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD_10
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_add16_10(pixel *dst, const int *block_offset,
;                            int16_t *block, int stride,
;                            const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
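; add4x4_idct: shared helper for the add16/add16intra/add8 entry points.
; Internal calling convention: r0 = dst base, r5 = block_offset entry for this
; block (added to r0 on entry), r2 = coefficients, r3 = stride. Reached via
; call, so it ends with a plain ret.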
%macro ADD4x4IDCT 0
add4x4_idct %+ SUFFIX:
    add    r5, r0
    mova   m0, [r2+ 0]
    mova   m1, [r2+16]
    mova   m2, [r2+32]
    mova   m3, [r2+48]
    IDCT4_1D d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd  m0, [pd_32]
    IDCT4_1D d,0,1,2,3,4,5
    pxor   m5, m5
    mova   [r2+ 0], m5
    mova   [r2+16], m5
    mova   [r2+32], m5
    mova   [r2+48], m5
    STORE_DIFFx2 m0, m1, m4, m5, r5, r3
    lea    r5, [r5+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, r5, r3
    ret
%endmacro

INIT_XMM sse2
ALIGN 16
ADD4x4IDCT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
ALIGN 16
ADD4x4IDCT
%endif

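; ADD16_OP: %2 is the scan8[] position of 4x4 block %1 in the nnzc table. A
; zero nnz count skips the block; otherwise the dst offset is loaded from
; block_offset and the helper is called. r2 advances 64 bytes per block (16
; coefficients of 4 bytes each: the high-bit-depth decoder stores 32-bit
; coefficients behind the int16_t prototype).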
%macro ADD16_OP 2
    cmp    byte [r4+%2], 0
    jz     .skipblock%1
    mov    r5d, [r1+%1*4]
    call   add4x4_idct %+ SUFFIX
.skipblock%1:
%if %1<15
    add    r2, 64
%endif
%endmacro

%macro IDCT_ADD16_10 0
cglobal h264_idct_add16_10, 5,6
    ADD16_OP 0, 4+1*8
    ADD16_OP 1, 5+1*8
    ADD16_OP 2, 4+2*8
    ADD16_OP 3, 5+2*8
    ADD16_OP 4, 6+1*8
    ADD16_OP 5, 7+1*8
    ADD16_OP 6, 6+2*8
    ADD16_OP 7, 7+2*8
    ADD16_OP 8, 4+3*8
    ADD16_OP 9, 5+3*8
    ADD16_OP 10, 4+4*8
    ADD16_OP 11, 5+4*8
    ADD16_OP 12, 6+3*8
    ADD16_OP 13, 7+3*8
    ADD16_OP 14, 6+4*8
    ADD16_OP 15, 7+4*8
    REP_RET
%endmacro

INIT_XMM sse2
IDCT_ADD16_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16_10
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
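; IDCT_DC_ADD_OP_10: the callers compute dc = (block[0] + 32) >> 6, broadcast
; it across m0 and put the clip maximum in m6; this macro then adds the DC to
; four rows of pixels and clips. In rough C terms (reference sketch):
;   for (y = 0; y < 4; y++)
;       for (x = 0; x < width; x++)
;           dst[y*stride + x] = av_clip(dst[y*stride + x] + dc, 0, 1023);
; %1 = dst, %2 = stride, %3 = 3*stride. The AVX path uses three-operand paddw
; to skip the separate loads. The mmxext routine below covers the 4x4 case,
; IDCT8_DC_ADD the 8x8 case via two 4-row passes.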
%macro IDCT_DC_ADD_OP_10 3
    pxor   m5, m5
%if avx_enabled
    paddw  m1, m0, [%1+0   ]
    paddw  m2, m0, [%1+%2  ]
    paddw  m3, m0, [%1+%2*2]
    paddw  m4, m0, [%1+%3  ]
%else
    mova   m1, [%1+0   ]
    mova   m2, [%1+%2  ]
    mova   m3, [%1+%2*2]
    mova   m4, [%1+%3  ]
    paddw  m1, m0
    paddw  m2, m0
    paddw  m3, m0
    paddw  m4, m0
%endif
    CLIPW  m1, m5, m6
    CLIPW  m2, m5, m6
    CLIPW  m3, m5, m6
    CLIPW  m4, m5, m6
    mova   [%1+0   ], m1
    mova   [%1+%2  ], m2
    mova   [%1+%2*2], m3
    mova   [%1+%3  ], m4
%endmacro

INIT_MMX mmxext
cglobal h264_idct_dc_add_10,3,3
    movd   m0, [r1]
    mov    dword [r1], 0
    paddd  m0, [pd_32]
    psrad  m0, 6
    lea    r1, [r2*3]
    pshufw m0, m0, 0
    mova   m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET

;-----------------------------------------------------------------------------
; void ff_h264_idct8_dc_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT8_DC_ADD 0
cglobal h264_idct8_dc_add_10,3,4,7
    movd   m0, [r1]
    mov    dword [r1], 0
    paddd  m0, [pd_32]
    psrad  m0, 6
    lea    r1, [r2*3]
    SPLATW m0, m0, 0
    mova   m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    lea    r0, [r0+r2*4]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET
%endmacro

INIT_XMM sse2
IDCT8_DC_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_DC_ADD
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_add16intra_10(pixel *dst, const int *block_offset,
;                                 int16_t *block, int stride,
;                                 const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
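; Intra path: blocks are handled in horizontally adjacent pairs, and the
; cmp word tests the nnz bytes of both blocks of a pair at once. If either is
; nonzero, .ac runs the full 4x4 iDCT on both; otherwise, if at least one of
; the two DC coefficients is nonzero, a single DC-add pass covers the
; 8-pixel-wide pair; otherwise the pair is skipped entirely.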
%macro AC 1
.ac%1:
    mov    r5d, [r1+(%1+0)*4]
    call   add4x4_idct %+ SUFFIX
    mov    r5d, [r1+(%1+1)*4]
    add    r2, 64
    call   add4x4_idct %+ SUFFIX
    add    r2, 64
    jmp    .skipadd%1
%endmacro

%assign last_block 16
%macro ADD16_OP_INTRA 2
    cmp    word [r4+%2], 0
    jnz    .ac%1
    mov    r5d, [r2+ 0]
    or     r5d, [r2+64]
    jz     .skipblock%1
    mov    r5d, [r1+(%1+0)*4]
    call   idct_dc_add %+ SUFFIX
.skipblock%1:
%if %1<last_block-2
    add    r2, 128
%endif
.skipadd%1:
%endmacro

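; idct_dc_add: internal helper handling a pair of DC-only blocks in one pass.
; movq/movhps pick up the two DC coefficients (r2 and r2+64), both are rounded
; and shifted together, and pshuflw/pshufhw broadcast one DC into each xmm
; half, so IDCT_DC_ADD_OP_10 updates eight pixels per row spanning both
; blocks. r5 = block_offset entry, r3 = stride, r6 = 3*stride.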
%macro IDCT_ADD16INTRA_10 0
idct_dc_add %+ SUFFIX:
    add    r5, r0
    movq   m0, [r2+ 0]
    movhps m0, [r2+64]
    mov    dword [r2+ 0], 0
    mov    dword [r2+64], 0
    paddd  m0, [pd_32]
    psrad  m0, 6
    pshufhw m0, m0, 0
    pshuflw m0, m0, 0
    lea    r6, [r3*3]
    mova   m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r5, r3, r6
    ret

cglobal h264_idct_add16intra_10,5,7,8
    ADD16_OP_INTRA 0, 4+1*8
    ADD16_OP_INTRA 2, 4+2*8
    ADD16_OP_INTRA 4, 6+1*8
    ADD16_OP_INTRA 6, 6+2*8
    ADD16_OP_INTRA 8, 4+3*8
    ADD16_OP_INTRA 10, 4+4*8
    ADD16_OP_INTRA 12, 6+3*8
    ADD16_OP_INTRA 14, 6+4*8
    REP_RET
    AC 8
    AC 10
    AC 12
    AC 14
    AC 0
    AC 2
    AC 4
    AC 6
%endmacro

INIT_XMM sse2
IDCT_ADD16INTRA_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16INTRA_10
%endif

%assign last_block 36
;-----------------------------------------------------------------------------
; void ff_h264_idct_add8_10(pixel **dst, const int *block_offset,
;                           int16_t *block, int stride,
;                           const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
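; Unlike the luma entry points, dst is an array of two chroma plane pointers:
; r0 is loaded with dst[0] for the first plane and reloaded with dst[1] (via
; r7 on x86_64, via the stack argument on x86_32) for the second. r2 first
; skips the 1024 bytes of the 16 luma blocks to reach the chroma coefficients.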
%macro IDCT_ADD8 0
cglobal h264_idct_add8_10,5,8,7
%if ARCH_X86_64
    mov    r7, r0
%endif
    add    r2, 1024
    mov    r0, [r0]
    ADD16_OP_INTRA 16, 4+ 6*8
    ADD16_OP_INTRA 18, 4+ 7*8
    add    r2, 1024-128*2
%if ARCH_X86_64
    mov    r0, [r7+gprsize]
%else
    mov    r0, r0m
    mov    r0, [r0+gprsize]
%endif
    ADD16_OP_INTRA 32, 4+11*8
    ADD16_OP_INTRA 34, 4+12*8
    REP_RET
    AC 16
    AC 18
    AC 32
    AC 34

%endmacro ; IDCT_ADD8

INIT_XMM sse2
IDCT_ADD8
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD8
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct8_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
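; IDCT8_1D: one 8-point pass of the 8x8 inverse transform. Rows 0 and 4 come
; in as memory operands (%1, %2); rows 1,2,3,5,6,7 are expected in registers
; m1,m2,m3,m5,m6,m7. The math, per the H.264 spec (reference sketch, s0..s7
; being the eight inputs of one column):
;   even: a0 = s0+s4;  a2 = s0-s4;  a4 = (s2>>1)-s6;  a6 = (s6>>1)+s2
;         b0 = a0+a6;  b2 = a2+a4;  b4 = a2-a4;       b6 = a0-a6
;   odd:  a1 = -s3+s5-s7-(s7>>1);   a3 = s1+s7-s3-(s3>>1)
;         a5 = -s1+s7+s5+(s5>>1);   a7 = s3+s5+s1+(s1>>1)
;         b1 = (a7>>2)+a1;  b3 = a3+(a5>>2);  b5 = (a3>>2)-a5;  b7 = a7-(a1>>2)
;   out:  o0/o7 = b0±b7;  o1/o6 = b2±b5;  o2/o5 = b4±b3;  o3/o4 = b6±b1
; The SWAPs are compile-time register renames; the last one restores 0..7
; order.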
%macro IDCT8_1D 2
    SWAP 0, 1
    psrad  m4, m5, 1
    psrad  m1, m0, 1
    paddd  m4, m5
    paddd  m1, m0
    paddd  m4, m7
    paddd  m1, m5
    psubd  m4, m0
    paddd  m1, m3

    psubd  m0, m3
    psubd  m5, m3
    paddd  m0, m7
    psubd  m5, m7
    psrad  m3, 1
    psrad  m7, 1
    psubd  m0, m3
    psubd  m5, m7

    SWAP 1, 7
    psrad  m1, m7, 2
    psrad  m3, m4, 2
    paddd  m3, m0
    psrad  m0, 2
    paddd  m1, m5
    psrad  m5, 2
    psubd  m0, m4
    psubd  m7, m5

    SWAP 5, 6
    psrad  m4, m2, 1
    psrad  m6, m5, 1
    psubd  m4, m5
    paddd  m6, m2

    mova   m2, %1
    mova   m5, %2
    SUMSUB_BA d, 5, 2
    SUMSUB_BA d, 6, 5
    SUMSUB_BA d, 4, 2
    SUMSUB_BA d, 7, 6
    SUMSUB_BA d, 0, 4
    SUMSUB_BA d, 3, 2
    SUMSUB_BA d, 1, 5
    SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

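; IDCT8_1D_FULL: loads the six in-register rows for IDCT8_1D; rows 0 and 4
; stay as memory operands and are only loaded once the odd-part math has
; freed a pair of registers (register pressure on 32-bit x86).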
%macro IDCT8_1D_FULL 1
    mova   m7, [%1+112*2]
    mova   m6, [%1+ 96*2]
    mova   m5, [%1+ 80*2]
    mova   m3, [%1+ 48*2]
    mova   m2, [%1+ 32*2]
    mova   m1, [%1+ 16*2]
    IDCT8_1D [%1], [%1+ 64*2]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
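; First (column) pass over a 4-column half of the block, plus the transpose.
; On x86_64, m8 serves as transpose scratch and only two of the eight output
; rows touch memory; on x86_32 all eight transposed rows are spilled to the
; dstblock buffer.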
%macro IDCT8_ADD_SSE_START 2
    IDCT8_1D_FULL %1
%if ARCH_X86_64
    TRANSPOSE4x4D 0,1,2,3,8
    mova   [%2    ], m0
    TRANSPOSE4x4D 4,5,6,7,8
    mova   [%2+8*2], m4
%else
    mova   [%1], m7
    TRANSPOSE4x4D 0,1,2,3,7
    mova   m7, [%1]
    mova   [%2     ], m0
    mova   [%2+16*2], m1
    mova   [%2+32*2], m2
    mova   [%2+48*2], m3
    TRANSPOSE4x4D 4,5,6,7,3
    mova   [%2+ 8*2], m4
    mova   [%2+24*2], m5
    mova   [%2+40*2], m6
    mova   [%2+56*2], m7
%endif
%endmacro

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
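; Second pass plus the clipped add to dst. m6/m7 are spilled into the block
; buffer first because STORE_DIFFx2 needs them as scratch and zero registers;
; they are reloaded as m0/m1 for the last two rows.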
%macro IDCT8_ADD_SSE_END 3
    IDCT8_1D_FULL %2
    mova   [%2     ], m6
    mova   [%2+16*2], m7

    pxor   m7, m7
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
    lea    %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m6, m7, %1, %3
    mova   m0, [%2     ]
    mova   m1, [%2+16*2]
    lea    %1, [%1+%3*2]
    STORE_DIFFx2 m4, m5, m6, m7, %1, %3
    lea    %1, [%1+%3*2]
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
%endmacro

%macro IDCT8_ADD 0
cglobal h264_idct8_add_10, 3,4,16
%if UNIX64 == 0
    %assign pad 16-gprsize-(stack_offset&15)
    sub    rsp, pad
    call   h264_idct8_add1_10 %+ SUFFIX
    add    rsp, pad
    RET
%endif

ALIGN 16
; TODO: does not need to use stack
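; h264_idct8_add1_10: the actual 8x8 iDCT, also reused by idct8_add4 below.
; It allocates a 256-byte transposed-coefficient buffer on the stack and folds
; the +32 rounding bias into the DC coefficient before the first pass. On
; x86_64 the first half's transposed rows are parked in m8-m15 (the SWAPs and
; PERMUTE only rename registers), so the second passes run almost entirely in
; registers; on x86_32 everything goes through the stack buffer and
; IDCT8_ADD_SSE_END. Either way the coefficient block is cleared afterwards.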
h264_idct8_add1_10 %+ SUFFIX:
%assign pad 256+16-gprsize
    sub    rsp, pad
    add    dword [r1], 32

%if ARCH_X86_64
    IDCT8_ADD_SSE_START r1, rsp
    SWAP 1, 9
    SWAP 2, 10
    SWAP 3, 11
    SWAP 5, 13
    SWAP 6, 14
    SWAP 7, 15
    IDCT8_ADD_SSE_START r1+16, rsp+128
    PERMUTE 1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7
    IDCT8_1D [rsp], [rsp+128]
    SWAP 0, 8
    SWAP 1, 9
    SWAP 2, 10
    SWAP 3, 11
    SWAP 4, 12
    SWAP 5, 13
    SWAP 6, 14
    SWAP 7, 15
    IDCT8_1D [rsp+16], [rsp+144]
    psrad  m8, 6
    psrad  m0, 6
    packssdw m8, m0
    paddsw m8, [r0]
    pxor   m0, m0
    mova   [r1+  0], m0
    mova   [r1+ 16], m0
    mova   [r1+ 32], m0
    mova   [r1+ 48], m0
    mova   [r1+ 64], m0
    mova   [r1+ 80], m0
    mova   [r1+ 96], m0
    mova   [r1+112], m0
    mova   [r1+128], m0
    mova   [r1+144], m0
    mova   [r1+160], m0
    mova   [r1+176], m0
    mova   [r1+192], m0
    mova   [r1+208], m0
    mova   [r1+224], m0
    mova   [r1+240], m0
    CLIPW  m8, m0, [pw_pixel_max]
    mova   [r0], m8
    mova   m8, [pw_pixel_max]
    STORE_DIFF16 m9, m1, m0, m8, r0+r2
    lea    r0, [r0+r2*2]
    STORE_DIFF16 m10, m2, m0, m8, r0
    STORE_DIFF16 m11, m3, m0, m8, r0+r2
    lea    r0, [r0+r2*2]
    STORE_DIFF16 m12, m4, m0, m8, r0
    STORE_DIFF16 m13, m5, m0, m8, r0+r2
    lea    r0, [r0+r2*2]
    STORE_DIFF16 m14, m6, m0, m8, r0
    STORE_DIFF16 m15, m7, m0, m8, r0+r2
%else
    IDCT8_ADD_SSE_START r1,    rsp
    IDCT8_ADD_SSE_START r1+16, rsp+128
    lea    r3, [r0+8]
    IDCT8_ADD_SSE_END r0, rsp,    r2
    IDCT8_ADD_SSE_END r3, rsp+16, r2
    mova   [r1+  0], m7
    mova   [r1+ 16], m7
    mova   [r1+ 32], m7
    mova   [r1+ 48], m7
    mova   [r1+ 64], m7
    mova   [r1+ 80], m7
    mova   [r1+ 96], m7
    mova   [r1+112], m7
    mova   [r1+128], m7
    mova   [r1+144], m7
    mova   [r1+160], m7
    mova   [r1+176], m7
    mova   [r1+192], m7
    mova   [r1+208], m7
    mova   [r1+224], m7
    mova   [r1+240], m7
%endif ; ARCH_X86_64

    add    rsp, pad
    ret
%endmacro

INIT_XMM sse2
IDCT8_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct8_add4_10(pixel *dst, const int *block_offset,
;                            int16_t *block, int stride,
;                            const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
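; IDCT8_ADD4_OP: %2 is the scan8[] position of 8x8 block %1/4 in the nnzc
; table; a zero count skips the whole 8x8 block. The prologue below loads all
; arguments manually (cglobal is asked for 0 register arguments) so that r0 is
; free to serve as the per-block dst pointer passed to h264_idct8_add1_10.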
%macro IDCT8_ADD4_OP 2
    cmp    byte [r4+%2], 0
    jz     .skipblock%1
    mov    r0d, [r6+%1*4]
    add    r0, r5
    call   h264_idct8_add1_10 %+ SUFFIX
.skipblock%1:
%if %1<12
    add    r1, 256
%endif
%endmacro

%macro IDCT8_ADD4 0
cglobal h264_idct8_add4_10, 0,7,16
    %assign pad 16-gprsize-(stack_offset&15)
    SUB    rsp, pad
    mov    r5, r0mp
    mov    r6, r1mp
    mov    r1, r2mp
    mov    r2d, r3m
    movifnidn r4, r4mp
    IDCT8_ADD4_OP  0, 4+1*8
    IDCT8_ADD4_OP  4, 6+1*8
    IDCT8_ADD4_OP  8, 4+3*8
    IDCT8_ADD4_OP 12, 6+3*8
    ADD    rsp, pad
    RET
%endmacro ; IDCT8_ADD4

INIT_XMM sse2
IDCT8_ADD4
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD4
%endif