Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | ; /* |
2 | ; * SIMD optimized idct functions for HEVC decoding | |
3 | ; * Copyright (c) 2014 Pierre-Edouard LEPERE | |
4 | ; * Copyright (c) 2014 James Almer | |
5 | ; * | |
6 | ; * This file is part of FFmpeg. | |
7 | ; * | |
8 | ; * FFmpeg is free software; you can redistribute it and/or | |
9 | ; * modify it under the terms of the GNU Lesser General Public | |
10 | ; * License as published by the Free Software Foundation; either | |
11 | ; * version 2.1 of the License, or (at your option) any later version. | |
12 | ; * | |
13 | ; * FFmpeg is distributed in the hope that it will be useful, | |
14 | ; * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 | ; * Lesser General Public License for more details. | |
17 | ; * | |
18 | ; * You should have received a copy of the GNU Lesser General Public | |
19 | ; * License along with FFmpeg; if not, write to the Free Software | |
20 | ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 | ; */ | |
22 | %include "libavutil/x86/x86util.asm" | |
23 | ||
24 | SECTION_TEXT 32 | |
25 | ||
; void ff_hevc_idctHxW_dc_{8,10,12}_<opt>(int16_t *coeffs)
;
; DC-only inverse transform: reads coeffs[0], computes
;     dc = (coeffs[0] + (1 << (14 - bitdepth)) + 1) >> (15 - bitdepth)
; and overwrites the block with dc, storing %2 * 8 vector registers
; (%2 * 8 * mmsize bytes) starting at coeffs.
;
; %1 = HxW
; %2 = number of loops (each iteration stores 8 vector registers)
; %3 = bitdepth
;
; NOTE: stores use mova, so coeffs is assumed to be aligned to mmsize.
%macro IDCT_DC 3
cglobal hevc_idct%1x%1_dc_%3, 1, 2, 1, coeff, tmp
    ; Round and scale in 32-bit: a 16-bit add wraps when coeffs[0] is close
    ; to INT16_MAX (e.g. > 32702 at 8-bit) and would flip the sign of dc.
    movsx             tmpd, word [coeffq]
    add               tmpd, (1 << (14 - %3)) + 1
    sar               tmpd, (15 - %3)
    movd               xm0, tmpd
    SPLATW              m0, xm0             ; replicate the low word across m0
    DEFINE_ARGS coeff, cnt                  ; tmp is dead; reuse its register as the counter
    mov               cntd, %2
.loop:
    mova [coeffq+mmsize*0], m0
    mova [coeffq+mmsize*1], m0
    mova [coeffq+mmsize*2], m0
    mova [coeffq+mmsize*3], m0
    mova [coeffq+mmsize*4], m0
    mova [coeffq+mmsize*5], m0
    mova [coeffq+mmsize*6], m0
    mova [coeffq+mmsize*7], m0
    add             coeffq, mmsize*8
    dec               cntd
    jg .loop
    RET
%endmacro
53 | ||
; Loop-free variant of the DC-only inverse transform for small blocks.
; Computes
;     dc = (coeffs[0] + (1 << (14 - bitdepth)) + 1) >> (15 - bitdepth)
; and fills the whole HxW block of 16-bit coefficients with it.
;
; %1 = HxW
; %2 = bitdepth
;
; NOTE: stores use mova, so coeffs is assumed to be aligned to mmsize.
%macro IDCT_DC_NL 2 ; No loop
cglobal hevc_idct%1x%1_dc_%2, 1, 2, 1, coeff, tmp
    ; Round and scale in 32-bit: a 16-bit add wraps when coeffs[0] is close
    ; to INT16_MAX (e.g. > 32702 at 8-bit) and would flip the sign of dc.
    movsx             tmpd, word [coeffq]
    add               tmpd, (1 << (14 - %2)) + 1
    sar               tmpd, (15 - %2)
    movd               xm0, tmpd
    SPLATW              m0, xm0             ; replicate the low word across m0
    ; One store per vector register needed to cover %1 * %1 int16_t values
    ; (4 stores for 4x4 with MMX, 8 stores for 8x8 with XMM).
%assign %%offset 0
%rep (%1 * %1 * 2) / mmsize
    mova [coeffq+%%offset], m0
%assign %%offset %%offset + mmsize
%endrep
    RET
%endmacro
75 | ||
; Instantiate every DC-only IDCT function for one bit depth:
;   mmxext: 4x4 (no loop), 8x8
;   sse2:   8x8 (no loop), 16x16, 32x32
;   avx2:   16x16, 32x32 (only when HAVE_AVX2_EXTERNAL is set)
; The loop counts passed to IDCT_DC are block bytes / (8 * mmsize).
; %1 = bitdepth
%macro INIT_IDCT_DC 1
INIT_MMX mmxext
IDCT_DC_NL  4,      %1
IDCT_DC     8,  2,  %1

INIT_XMM sse2
IDCT_DC_NL  8,      %1
IDCT_DC    16,  4,  %1
IDCT_DC    32, 16,  %1

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
IDCT_DC    16,  2,  %1
IDCT_DC    32,  8,  %1
%endif ;HAVE_AVX2_EXTERNAL
%endmacro

INIT_IDCT_DC  8 ; 8-bit
INIT_IDCT_DC 10 ; 10-bit
INIT_IDCT_DC 12 ; 12-bit