Commit | Line | Data |
---|---|---|
72b9787e JB |
1 | /***************************************************************************** |
2 | * Copyright (C) 2013 x265 project | |
3 | * | |
4 | * Authors: Loren Merritt <lorenm@u.washington.edu> | |
5 | * Laurent Aimar <fenrir@via.ecp.fr> | |
6 | * Fiona Glaser <fiona@x264.com> | |
7 | * Steve Borho <steve@borho.org> | |
8 | * | |
9 | * This program is free software; you can redistribute it and/or modify | |
10 | * it under the terms of the GNU General Public License as published by | |
11 | * the Free Software Foundation; either version 2 of the License, or | |
12 | * (at your option) any later version. | |
13 | * | |
14 | * This program is distributed in the hope that it will be useful, | |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 | * GNU General Public License for more details. | |
18 | * | |
19 | * You should have received a copy of the GNU General Public License | |
20 | * along with this program; if not, write to the Free Software | |
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
22 | * | |
23 | * This program is also available under a commercial proprietary license. | |
24 | * For more information, contact us at license @ x265.com. | |
25 | *****************************************************************************/ | |
26 | ||
27 | #include "cpu.h" | |
28 | #include "common.h" | |
29 | ||
30 | #if MACOS || SYS_FREEBSD | |
31 | #include <sys/types.h> | |
32 | #include <sys/sysctl.h> | |
33 | #endif | |
34 | #if SYS_OPENBSD | |
35 | #include <sys/param.h> | |
36 | #include <sys/sysctl.h> | |
37 | #include <machine/cpu.h> | |
38 | #endif | |
39 | ||
40 | #if X265_ARCH_ARM && !defined(HAVE_NEON) | |
41 | #include <signal.h> | |
42 | #include <setjmp.h> | |
43 | static sigjmp_buf jmpbuf; | |
44 | static volatile sig_atomic_t canjump = 0; | |
45 | ||
46 | static void sigill_handler(int sig) | |
47 | { | |
48 | if (!canjump) | |
49 | { | |
50 | signal(sig, SIG_DFL); | |
51 | raise(sig); | |
52 | } | |
53 | ||
54 | canjump = 0; | |
55 | siglongjmp(jmpbuf, 1); | |
56 | } | |
57 | ||
58 | #endif // if X265_ARCH_ARM | |
59 | ||
60 | namespace x265 { | |
61 | const cpu_name_t cpu_names[] = | |
62 | { | |
63 | #if X265_ARCH_X86 | |
64 | #define MMX2 X265_CPU_MMX | X265_CPU_MMX2 | X265_CPU_CMOV | |
65 | { "MMX2", MMX2 }, | |
66 | { "MMXEXT", MMX2 }, | |
67 | { "SSE", MMX2 | X265_CPU_SSE }, | |
68 | #define SSE2 MMX2 | X265_CPU_SSE | X265_CPU_SSE2 | |
69 | { "SSE2Slow", SSE2 | X265_CPU_SSE2_IS_SLOW }, | |
70 | { "SSE2", SSE2 }, | |
71 | { "SSE2Fast", SSE2 | X265_CPU_SSE2_IS_FAST }, | |
72 | { "SSE3", SSE2 | X265_CPU_SSE3 }, | |
73 | { "SSSE3", SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 }, | |
74 | { "SSE4.1", SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 }, | |
75 | { "SSE4", SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 }, | |
76 | { "SSE4.2", SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 | X265_CPU_SSE42 }, | |
77 | #define AVX SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 | X265_CPU_SSE42 | X265_CPU_AVX | |
78 | { "AVX", AVX }, | |
79 | { "XOP", AVX | X265_CPU_XOP }, | |
80 | { "FMA4", AVX | X265_CPU_FMA4 }, | |
81 | { "AVX2", AVX | X265_CPU_AVX2 }, | |
82 | { "FMA3", AVX | X265_CPU_FMA3 }, | |
83 | #undef AVX | |
84 | #undef SSE2 | |
85 | #undef MMX2 | |
86 | { "Cache32", X265_CPU_CACHELINE_32 }, | |
87 | { "Cache64", X265_CPU_CACHELINE_64 }, | |
88 | { "LZCNT", X265_CPU_LZCNT }, | |
89 | { "BMI1", X265_CPU_BMI1 }, | |
90 | { "BMI2", X265_CPU_BMI1 | X265_CPU_BMI2 }, | |
91 | { "SlowCTZ", X265_CPU_SLOW_CTZ }, | |
92 | { "SlowAtom", X265_CPU_SLOW_ATOM }, | |
93 | { "SlowPshufb", X265_CPU_SLOW_PSHUFB }, | |
94 | { "SlowPalignr", X265_CPU_SLOW_PALIGNR }, | |
95 | { "SlowShuffle", X265_CPU_SLOW_SHUFFLE }, | |
96 | { "UnalignedStack", X265_CPU_STACK_MOD4 }, | |
97 | ||
98 | #elif X265_ARCH_ARM | |
99 | { "ARMv6", X265_CPU_ARMV6 }, | |
100 | { "NEON", X265_CPU_NEON }, | |
101 | { "FastNeonMRC", X265_CPU_FAST_NEON_MRC }, | |
102 | #endif // if X265_ARCH_X86 | |
103 | { "", 0 }, | |
104 | }; | |
105 | ||
106 | #if X265_ARCH_X86 | |
107 | ||
/* CPUID / XGETBV helpers with C linkage, implemented in cpu-a.asm.
 * Each helper writes its results through the pointer arguments. */
extern "C" int x265_cpu_cpuid_test(void);
extern "C" void x265_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
extern "C" void x265_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx);
114 | ||
115 | #if defined(_MSC_VER) | |
116 | #pragma warning(disable: 4309) // truncation of constant value | |
117 | #endif | |
118 | ||
119 | uint32_t cpu_detect(void) | |
120 | { | |
121 | uint32_t cpu = 0; | |
122 | ||
123 | uint32_t eax, ebx, ecx, edx; | |
124 | uint32_t vendor[4] = { 0 }; | |
125 | uint32_t max_extended_cap, max_basic_cap; | |
126 | ||
127 | #if !X86_64 | |
128 | if (!x265_cpu_cpuid_test()) | |
129 | return 0; | |
130 | #endif | |
131 | ||
132 | x265_cpu_cpuid(0, &eax, vendor + 0, vendor + 2, vendor + 1); | |
133 | max_basic_cap = eax; | |
134 | if (max_basic_cap == 0) | |
135 | return 0; | |
136 | ||
137 | x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx); | |
138 | if (edx & 0x00800000) | |
139 | cpu |= X265_CPU_MMX; | |
140 | else | |
141 | return cpu; | |
142 | if (edx & 0x02000000) | |
143 | cpu |= X265_CPU_MMX2 | X265_CPU_SSE; | |
144 | if (edx & 0x00008000) | |
145 | cpu |= X265_CPU_CMOV; | |
146 | else | |
147 | return cpu; | |
148 | if (edx & 0x04000000) | |
149 | cpu |= X265_CPU_SSE2; | |
150 | if (ecx & 0x00000001) | |
151 | cpu |= X265_CPU_SSE3; | |
152 | if (ecx & 0x00000200) | |
153 | cpu |= X265_CPU_SSSE3; | |
154 | if (ecx & 0x00080000) | |
155 | cpu |= X265_CPU_SSE4; | |
156 | if (ecx & 0x00100000) | |
157 | cpu |= X265_CPU_SSE42; | |
158 | /* Check OXSAVE and AVX bits */ | |
159 | if ((ecx & 0x18000000) == 0x18000000) | |
160 | { | |
161 | /* Check for OS support */ | |
162 | x265_cpu_xgetbv(0, &eax, &edx); | |
163 | if ((eax & 0x6) == 0x6) | |
164 | { | |
165 | cpu |= X265_CPU_AVX; | |
166 | if (ecx & 0x00001000) | |
167 | cpu |= X265_CPU_FMA3; | |
168 | } | |
169 | } | |
170 | ||
171 | if (max_basic_cap >= 7) | |
172 | { | |
173 | x265_cpu_cpuid(7, &eax, &ebx, &ecx, &edx); | |
174 | /* AVX2 requires OS support, but BMI1/2 don't. */ | |
175 | if ((cpu & X265_CPU_AVX) && (ebx & 0x00000020)) | |
176 | cpu |= X265_CPU_AVX2; | |
177 | if (ebx & 0x00000008) | |
178 | { | |
179 | cpu |= X265_CPU_BMI1; | |
180 | if (ebx & 0x00000100) | |
181 | cpu |= X265_CPU_BMI2; | |
182 | } | |
183 | } | |
184 | ||
185 | if (cpu & X265_CPU_SSSE3) | |
186 | cpu |= X265_CPU_SSE2_IS_FAST; | |
187 | ||
188 | x265_cpu_cpuid(0x80000000, &eax, &ebx, &ecx, &edx); | |
189 | max_extended_cap = eax; | |
190 | ||
191 | if (max_extended_cap >= 0x80000001) | |
192 | { | |
193 | x265_cpu_cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | |
194 | ||
195 | if (ecx & 0x00000020) | |
196 | cpu |= X265_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */ | |
197 | if (ecx & 0x00000040) /* SSE4a, AMD only */ | |
198 | { | |
199 | int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); | |
200 | cpu |= X265_CPU_SSE2_IS_FAST; /* Phenom and later CPUs have fast SSE units */ | |
201 | if (family == 0x14) | |
202 | { | |
203 | cpu &= ~X265_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */ | |
204 | cpu |= X265_CPU_SSE2_IS_SLOW; /* Bobcat has 64-bit SIMD units */ | |
205 | cpu |= X265_CPU_SLOW_PALIGNR; /* palignr is insanely slow on Bobcat */ | |
206 | } | |
207 | if (family == 0x16) | |
208 | { | |
209 | cpu |= X265_CPU_SLOW_PSHUFB; /* Jaguar's pshufb isn't that slow, but it's slow enough | |
210 | * compared to alternate instruction sequences that this | |
211 | * is equal or faster on almost all such functions. */ | |
212 | } | |
213 | } | |
214 | ||
215 | if (cpu & X265_CPU_AVX) | |
216 | { | |
217 | if (ecx & 0x00000800) /* XOP */ | |
218 | cpu |= X265_CPU_XOP; | |
219 | if (ecx & 0x00010000) /* FMA4 */ | |
220 | cpu |= X265_CPU_FMA4; | |
221 | } | |
222 | ||
223 | if (!strcmp((char*)vendor, "AuthenticAMD")) | |
224 | { | |
225 | if (edx & 0x00400000) | |
226 | cpu |= X265_CPU_MMX2; | |
227 | if (!(cpu & X265_CPU_LZCNT)) | |
228 | cpu |= X265_CPU_SLOW_CTZ; | |
229 | if ((cpu & X265_CPU_SSE2) && !(cpu & X265_CPU_SSE2_IS_FAST)) | |
230 | cpu |= X265_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */ | |
231 | } | |
232 | } | |
233 | ||
234 | if (!strcmp((char*)vendor, "GenuineIntel")) | |
235 | { | |
236 | x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx); | |
237 | int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); | |
238 | int model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0); | |
239 | if (family == 6) | |
240 | { | |
241 | /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah") | |
242 | * theoretically support sse2, but it's significantly slower than mmx for | |
243 | * almost all of x264's functions, so let's just pretend they don't. */ | |
244 | if (model == 9 || model == 13 || model == 14) | |
245 | { | |
246 | cpu &= ~(X265_CPU_SSE2 | X265_CPU_SSE3); | |
247 | X265_CHECK(!(cpu & (X265_CPU_SSSE3 | X265_CPU_SSE4)), "unexpected CPU ID %d\n", cpu); | |
248 | } | |
249 | /* Detect Atom CPU */ | |
250 | else if (model == 28) | |
251 | { | |
252 | cpu |= X265_CPU_SLOW_ATOM; | |
253 | cpu |= X265_CPU_SLOW_CTZ; | |
254 | cpu |= X265_CPU_SLOW_PSHUFB; | |
255 | } | |
256 | ||
257 | /* Conroe has a slow shuffle unit. Check the model number to make sure not | |
258 | * to include crippled low-end Penryns and Nehalems that don't have SSE4. */ | |
259 | else if ((cpu & X265_CPU_SSSE3) && !(cpu & X265_CPU_SSE4) && model < 23) | |
260 | cpu |= X265_CPU_SLOW_SHUFFLE; | |
261 | } | |
262 | } | |
263 | ||
264 | if ((!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu & X265_CPU_SSE42)) | |
265 | { | |
266 | /* cacheline size is specified in 3 places, any of which may be missing */ | |
267 | x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx); | |
268 | int cache = (ebx & 0xff00) >> 5; // cflush size | |
269 | if (!cache && max_extended_cap >= 0x80000006) | |
270 | { | |
271 | x265_cpu_cpuid(0x80000006, &eax, &ebx, &ecx, &edx); | |
272 | cache = ecx & 0xff; // cacheline size | |
273 | } | |
274 | if (!cache && max_basic_cap >= 2) | |
275 | { | |
276 | // Cache and TLB Information | |
277 | static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 }; | |
278 | static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67, | |
279 | 0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 }; | |
280 | uint32_t buf[4]; | |
281 | int max, i = 0; | |
282 | do | |
283 | { | |
284 | x265_cpu_cpuid(2, buf + 0, buf + 1, buf + 2, buf + 3); | |
285 | max = buf[0] & 0xff; | |
286 | buf[0] &= ~0xff; | |
287 | for (int j = 0; j < 4; j++) | |
288 | { | |
289 | if (!(buf[j] >> 31)) | |
290 | while (buf[j]) | |
291 | { | |
292 | if (strchr(cache32_ids, buf[j] & 0xff)) | |
293 | cache = 32; | |
294 | if (strchr(cache64_ids, buf[j] & 0xff)) | |
295 | cache = 64; | |
296 | buf[j] >>= 8; | |
297 | } | |
298 | } | |
299 | } | |
300 | while (++i < max); | |
301 | } | |
302 | ||
303 | if (cache == 32) | |
304 | cpu |= X265_CPU_CACHELINE_32; | |
305 | else if (cache == 64) | |
306 | cpu |= X265_CPU_CACHELINE_64; | |
307 | else | |
308 | x265_log(NULL, X265_LOG_WARNING, "unable to determine cacheline size\n"); | |
309 | } | |
310 | ||
311 | #if BROKEN_STACK_ALIGNMENT | |
312 | cpu |= X265_CPU_STACK_MOD4; | |
313 | #endif | |
314 | ||
315 | return cpu; | |
316 | } | |
317 | ||
318 | #elif X265_ARCH_ARM | |
319 | ||
/* NEON probe routines with C linkage, defined outside this file. */
extern "C" void x265_cpu_neon_test(void);
extern "C" int x265_cpu_fast_neon_mrc_test(void);
324 | ||
/* Detect ARM CPU capabilities and return them as X265_CPU_* flags.
 * Without HAVE_ARMV6 nothing is probed and 0 is returned. */
uint32_t cpu_detect(void)
{
    int caps = 0;

#if HAVE_ARMV6
    caps |= X265_CPU_ARMV6;

#if !HAVE_NEON
    // Runtime NEON probe; skipped when the build already assumes NEON
    // (e.g. compiled with -mfpu=neon).  A SIGILL handler is installed and
    // the probe routine is called: if it traps, the handler jumps back to
    // the sigsetjmp() below and we report ARMv6 only.
    static void (*prev_handler)(int);
    prev_handler = signal(SIGILL, sigill_handler);
    if (sigsetjmp(jmpbuf, 1) != 0)
    {
        // Arrived here via siglongjmp(): the probe raised SIGILL.
        signal(SIGILL, prev_handler);
        return caps;
    }

    canjump = 1;
    x265_cpu_neon_test();
    canjump = 0;
    signal(SIGILL, prev_handler);
#endif // !HAVE_NEON

    caps |= X265_CPU_NEON;

    // Fast NEON -> ARM transfer (Cortex-A9) detection relies on user-space
    // access to the cycle counter and assumes ARMv7 performance counters.
    // NEON needs at least ARMv7; ARMv8 may require changes here.  The test is
    // racy: if another program or encoder instance disables or re-initialises
    // the counters while it runs, detection may be wrong and the counters may
    // be left enabled.  Apple platforms do not appear to support the
    // performance counters, so the test is skipped there.
#ifndef __MACH__
    if (x265_cpu_fast_neon_mrc_test())
        caps |= X265_CPU_FAST_NEON_MRC;
#endif
    // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
#endif // HAVE_ARMV6
    return caps;
}
365 | ||
366 | #else // if X265_ARCH_X86 | |
367 | ||
/* Fallback for architectures with no detection code: reports no
 * capabilities. */
uint32_t cpu_detect(void)
{
    const uint32_t no_capabilities = 0;

    return no_capabilities;
}
372 | ||
373 | #endif // if X265_ARCH_X86 | |
374 | } |