Commit | Line | Data |
---|---|---|
72b9787e JB |
1 | /***************************************************************************** |
2 | * Copyright (C) 2013 x265 project | |
3 | * | |
4 | * Authors: Steve Borho <steve@borho.org> | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License as published by | |
8 | * the Free Software Foundation; either version 2 of the License, or | |
9 | * (at your option) any later version. | |
10 | * | |
11 | * This program is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | * GNU General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * along with this program; if not, write to the Free Software | |
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
19 | * | |
20 | * This program is also available under a commercial proprietary license. | |
21 | * For more information, contact us at license @ x265.com. | |
22 | *****************************************************************************/ | |
23 | ||
24 | #include "common.h" | |
25 | #include "primitives.h" | |
26 | ||
27 | namespace x265 { | |
28 | // x265 private namespace | |
29 | ||
30 | extern const uint8_t lumaPartitionMapTable[] = | |
31 | { | |
32 | // 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60 64 | |
33 | LUMA_4x4, LUMA_4x8, 255, LUMA_4x16, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 4 | |
34 | LUMA_8x4, LUMA_8x8, 255, LUMA_8x16, 255, 255, 255, LUMA_8x32, 255, 255, 255, 255, 255, 255, 255, 255, // 8 | |
35 | 255, 255, 255, LUMA_12x16, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 12 | |
36 | LUMA_16x4, LUMA_16x8, LUMA_16x12, LUMA_16x16, 255, 255, 255, LUMA_16x32, 255, 255, 255, 255, 255, 255, 255, LUMA_16x64, // 16 | |
37 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 20 | |
38 | 255, 255, 255, 255, 255, 255, 255, LUMA_24x32, 255, 255, 255, 255, 255, 255, 255, 255, // 24 | |
39 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 28 | |
40 | 255, LUMA_32x8, 255, LUMA_32x16, 255, LUMA_32x24, 255, LUMA_32x32, 255, 255, 255, 255, 255, 255, 255, LUMA_32x64, // 32 | |
41 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 36 | |
42 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 40 | |
43 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 44 | |
44 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, LUMA_48x64, // 48 | |
45 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 52 | |
46 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 56 | |
47 | 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 60 | |
48 | 255, 255, 255, LUMA_64x16, 255, 255, 255, LUMA_64x32, 255, 255, 255, LUMA_64x48, 255, 255, 255, LUMA_64x64 // 64 | |
49 | }; | |
50 | ||
51 | /* the "authoritative" set of encoder primitives */ | |
52 | EncoderPrimitives primitives; | |
53 | ||
54 | void Setup_C_PixelPrimitives(EncoderPrimitives &p); | |
55 | void Setup_C_DCTPrimitives(EncoderPrimitives &p); | |
56 | void Setup_C_IPFilterPrimitives(EncoderPrimitives &p); | |
57 | void Setup_C_IPredPrimitives(EncoderPrimitives &p); | |
58 | void Setup_C_LoopFilterPrimitives(EncoderPrimitives &p); | |
59 | ||
60 | void Setup_C_Primitives(EncoderPrimitives &p) | |
61 | { | |
62 | Setup_C_PixelPrimitives(p); // pixel.cpp | |
63 | Setup_C_DCTPrimitives(p); // dct.cpp | |
64 | Setup_C_IPFilterPrimitives(p); // ipfilter.cpp | |
65 | Setup_C_IPredPrimitives(p); // intrapred.cpp | |
66 | Setup_C_LoopFilterPrimitives(p); // loopfilter.cpp | |
67 | } | |
68 | ||
69 | void Setup_Alias_Primitives(EncoderPrimitives &p) | |
70 | { | |
71 | /* copy reusable luma primitives to chroma 4:4:4 */ | |
72 | for (int i = 0; i < NUM_LUMA_PARTITIONS; i++) | |
73 | { | |
74 | p.chroma[X265_CSP_I444].copy_pp[i] = p.luma_copy_pp[i]; | |
75 | p.chroma[X265_CSP_I444].copy_ps[i] = p.luma_copy_ps[i]; | |
76 | p.chroma[X265_CSP_I444].copy_sp[i] = p.luma_copy_sp[i]; | |
77 | p.chroma[X265_CSP_I444].copy_ss[i] = p.luma_copy_ss[i]; | |
78 | p.chroma[X265_CSP_I444].addAvg[i] = p.luma_addAvg[i]; | |
79 | } | |
80 | ||
81 | for (int i = 0; i < NUM_SQUARE_BLOCKS; i++) | |
82 | { | |
83 | p.chroma[X265_CSP_I444].add_ps[i] = p.luma_add_ps[i]; | |
84 | p.chroma[X265_CSP_I444].sub_ps[i] = p.luma_sub_ps[i]; | |
85 | } | |
86 | ||
87 | for (int i = 0; i < NUM_SQUARE_BLOCKS; i++) | |
88 | { | |
89 | int partL = partitionFromLog2Size(i + 2); | |
90 | p.square_copy_pp[i] = p.luma_copy_pp[partL]; | |
91 | p.square_copy_ps[i] = p.luma_copy_ps[partL]; | |
92 | p.square_copy_sp[i] = p.luma_copy_sp[partL]; | |
93 | p.square_copy_ss[i] = p.luma_copy_ss[partL]; | |
94 | } | |
95 | ||
96 | primitives.sa8d[BLOCK_4x4] = primitives.sa8d_inter[LUMA_4x4]; | |
97 | primitives.sa8d[BLOCK_8x8] = primitives.sa8d_inter[LUMA_8x8]; | |
98 | primitives.sa8d[BLOCK_16x16] = primitives.sa8d_inter[LUMA_16x16]; | |
99 | primitives.sa8d[BLOCK_32x32] = primitives.sa8d_inter[LUMA_32x32]; | |
100 | primitives.sa8d[BLOCK_64x64] = primitives.sa8d_inter[LUMA_64x64]; | |
101 | ||
102 | // SA8D devolves to SATD for blocks not even multiples of 8x8 | |
103 | primitives.sa8d_inter[LUMA_4x4] = primitives.satd[LUMA_4x4]; | |
104 | primitives.sa8d_inter[LUMA_4x8] = primitives.satd[LUMA_4x8]; | |
105 | primitives.sa8d_inter[LUMA_4x16] = primitives.satd[LUMA_4x16]; | |
106 | primitives.sa8d_inter[LUMA_8x4] = primitives.satd[LUMA_8x4]; | |
107 | primitives.sa8d_inter[LUMA_16x4] = primitives.satd[LUMA_16x4]; | |
108 | primitives.sa8d_inter[LUMA_16x12] = primitives.satd[LUMA_16x12]; | |
109 | primitives.sa8d_inter[LUMA_12x16] = primitives.satd[LUMA_12x16]; | |
110 | } | |
111 | } | |
112 | using namespace x265; | |
113 | ||
114 | /* cpuid >= 0 - force CPU type | |
115 | * cpuid < 0 - auto-detect if uninitialized */ | |
116 | extern "C" | |
117 | void x265_setup_primitives(x265_param *param, int cpuid) | |
118 | { | |
119 | if (cpuid < 0) | |
120 | cpuid = x265::cpu_detect(); | |
121 | ||
122 | // initialize global variables | |
123 | if (!primitives.sad[0]) | |
124 | { | |
125 | Setup_C_Primitives(primitives); | |
126 | Setup_Instrinsic_Primitives(primitives, cpuid); | |
127 | ||
128 | #if ENABLE_ASSEMBLY | |
129 | Setup_Assembly_Primitives(primitives, cpuid); | |
130 | #else | |
131 | x265_log(param, X265_LOG_WARNING, "Assembly not supported in this binary\n"); | |
132 | #endif | |
133 | ||
134 | Setup_Alias_Primitives(primitives); | |
135 | ||
136 | initROM(); | |
137 | } | |
138 | ||
139 | if (param->logLevel >= X265_LOG_INFO) | |
140 | { | |
141 | char buf[1000]; | |
142 | char *p = buf + sprintf(buf, "using cpu capabilities:"); | |
143 | char *none = p; | |
144 | for (int i = 0; x265::cpu_names[i].flags; i++) | |
145 | { | |
146 | if (!strcmp(x265::cpu_names[i].name, "SSE") | |
147 | && (cpuid & X265_CPU_SSE2)) | |
148 | continue; | |
149 | if (!strcmp(x265::cpu_names[i].name, "SSE2") | |
150 | && (cpuid & (X265_CPU_SSE2_IS_FAST | X265_CPU_SSE2_IS_SLOW))) | |
151 | continue; | |
152 | if (!strcmp(x265::cpu_names[i].name, "SSE3") | |
153 | && (cpuid & X265_CPU_SSSE3 || !(cpuid & X265_CPU_CACHELINE_64))) | |
154 | continue; | |
155 | if (!strcmp(x265::cpu_names[i].name, "SSE4.1") | |
156 | && (cpuid & X265_CPU_SSE42)) | |
157 | continue; | |
158 | if (!strcmp(x265::cpu_names[i].name, "BMI1") | |
159 | && (cpuid & X265_CPU_BMI2)) | |
160 | continue; | |
161 | if ((cpuid & x265::cpu_names[i].flags) == x265::cpu_names[i].flags | |
162 | && (!i || x265::cpu_names[i].flags != x265::cpu_names[i - 1].flags)) | |
163 | p += sprintf(p, " %s", x265::cpu_names[i].name); | |
164 | } | |
165 | ||
166 | if (p == none) | |
167 | sprintf(p, " none!"); | |
168 | x265_log(param, X265_LOG_INFO, "%s\n", buf); | |
169 | } | |
170 | } | |
171 | ||
172 | #if !defined(ENABLE_ASSEMBLY) | |
173 | #if defined(_MSC_VER) | |
174 | #include <intrin.h> | |
175 | #endif | |
176 | ||
177 | extern "C" { | |
/* No-op EMMS replacement for builds without assembly: the intrinsic
 * primitives will not use MMX instructions, so there should be no reason to
 * execute EMMS. */
void x265_cpu_emms(void)
{
}
181 | ||
182 | #if defined(X265_ARCH_X86) | |
183 | ||
/* Make __cpuidex(regsArray, level, index) available to x265_cpu_cpuid(). */
#if defined(_MSC_VER)
/* MSVC: <intrin.h> is included above; only warning C4100 is silenced here
 * (presumably for the unused parameters of the stubs in this section). */
# pragma warning(disable: 4100)
#elif defined(__GNUC__) || defined(__clang__)
/* GCC/clang: use inline assembly, Gnu/AT&T syntax. `level` is loaded into EAX
 * and `index` into ECX; EAX, EBX, ECX and EDX are stored to regsArray[0..3]. */
# define __cpuidex(regsArray, level, index) \
    __asm__ __volatile__ ("cpuid" \
                          : "=a" ((regsArray)[0]), "=b" ((regsArray)[1]), "=c" ((regsArray)[2]), "=d" ((regsArray)[3]) \
                          : "0" (level), "2" (index));
#else
# error "compiler not supported"
#endif
194 | ||
/* Stand-in for the assembly routine of the same name in builds without
 * assembly; performs no check and always returns 0. */
int x265_cpu_cpuid_test(void)
{
    return 0;
}
199 | ||
200 | void x265_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) | |
201 | { | |
202 | int output[4]; | |
203 | ||
204 | __cpuidex(output, op, 0); | |
205 | *eax = output[0]; | |
206 | *ebx = output[1]; | |
207 | *ecx = output[2]; | |
208 | *edx = output[3]; | |
209 | } | |
210 | ||
/* Execute XGETBV for register index `op` and split the result.
 *
 * op  - index handed to _xgetbv() / loaded into ECX
 * eax - receives the low 32 bits of the result
 * edx - receives the high 32 bits of the result
 *
 * When no implementation is compiled in (X265_ARCH_X86 is zero or undefined,
 * or the compiler matches none of the branches) both outputs are set to 0. */
void x265_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx)
{
    uint64_t xcr = 0;

#if X265_ARCH_X86

#if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1200)

    // MSVC 2010 SP1 or later, or similar Intel release
    xcr = _xgetbv(op);

#elif defined(__GNUC__) || defined(__clang__)

    // use inline assembly, Gnu/AT&T syntax; pack EDX:EAX into the 64-bit
    // accumulator so every branch shares the stores below
    uint32_t lo, hi;
    __asm("xgetbv" : "=a" (lo), "=d" (hi) : "c" (op) :);
    xcr = ((uint64_t)hi << 32) | lo;

#elif defined(_WIN64)

    // On x64 with older compilers, this is impossible

#endif // compiler selection

#endif // if x86

    *eax = (uint32_t)xcr;
    *edx = (uint32_t)(xcr >> 32);
}
239 | ||
240 | #endif // X265_ARCH_X86 | |
241 | } | |
242 | #endif // if !ENABLE_ASSEMBLY |