1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
4 * Authors: Loren Merritt <lorenm@u.washington.edu>
5 * Laurent Aimar <fenrir@via.ecp.fr>
6 * Fiona Glaser <fiona@x264.com>
7 * Steve Borho <steve@borho.org>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 * This program is also available under a commercial proprietary license.
24 * For more information, contact us at license @ x265.com.
25 *****************************************************************************/
30 #if MACOS || SYS_FREEBSD
31 #include <sys/types.h>
32 #include <sys/sysctl.h>
35 #include <sys/param.h>
36 #include <sys/sysctl.h>
37 #include <machine/cpu.h>
40 #if X265_ARCH_ARM && !defined(HAVE_NEON)
/* Jump buffer for the SIGILL-based probe: sigill_handler() siglongjmp()s to it
 * and the ARM cpu_detect() arms it with sigsetjmp() before running the NEON test. */
43 static sigjmp_buf jmpbuf
;
/* NOTE(review): no use of canjump is visible in this excerpt; presumably it tells
 * the handler whether jmpbuf is currently armed -- confirm against the full file. */
44 static volatile sig_atomic_t canjump
= 0;
46 static void sigill_handler(int sig
)
55 siglongjmp(jmpbuf
, 1);
58 #endif // if X265_ARCH_ARM
61 const cpu_name_t cpu_names
[] =
64 #define MMX2 X265_CPU_MMX | X265_CPU_MMX2 | X265_CPU_CMOV
67 { "SSE", MMX2
| X265_CPU_SSE
},
68 #define SSE2 MMX2 | X265_CPU_SSE | X265_CPU_SSE2
69 { "SSE2Slow", SSE2
| X265_CPU_SSE2_IS_SLOW
},
71 { "SSE2Fast", SSE2
| X265_CPU_SSE2_IS_FAST
},
72 { "SSE3", SSE2
| X265_CPU_SSE3
},
73 { "SSSE3", SSE2
| X265_CPU_SSE3
| X265_CPU_SSSE3
},
74 { "SSE4.1", SSE2
| X265_CPU_SSE3
| X265_CPU_SSSE3
| X265_CPU_SSE4
},
75 { "SSE4", SSE2
| X265_CPU_SSE3
| X265_CPU_SSSE3
| X265_CPU_SSE4
},
76 { "SSE4.2", SSE2
| X265_CPU_SSE3
| X265_CPU_SSSE3
| X265_CPU_SSE4
| X265_CPU_SSE42
},
77 #define AVX SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 | X265_CPU_SSE42 | X265_CPU_AVX
79 { "XOP", AVX
| X265_CPU_XOP
},
80 { "FMA4", AVX
| X265_CPU_FMA4
},
81 { "AVX2", AVX
| X265_CPU_AVX2
},
82 { "FMA3", AVX
| X265_CPU_FMA3
},
86 { "Cache32", X265_CPU_CACHELINE_32
},
87 { "Cache64", X265_CPU_CACHELINE_64
},
88 { "LZCNT", X265_CPU_LZCNT
},
89 { "BMI1", X265_CPU_BMI1
},
90 { "BMI2", X265_CPU_BMI1
| X265_CPU_BMI2
},
91 { "SlowCTZ", X265_CPU_SLOW_CTZ
},
92 { "SlowAtom", X265_CPU_SLOW_ATOM
},
93 { "SlowPshufb", X265_CPU_SLOW_PSHUFB
},
94 { "SlowPalignr", X265_CPU_SLOW_PALIGNR
},
95 { "SlowShuffle", X265_CPU_SLOW_SHUFFLE
},
96 { "UnalignedStack", X265_CPU_STACK_MOD4
},
99 { "ARMv6", X265_CPU_ARMV6
},
100 { "NEON", X265_CPU_NEON
},
101 { "FastNeonMRC", X265_CPU_FAST_NEON_MRC
},
102 #endif // if X265_ARCH_X86
/* Low-level x86 probes used by cpu_detect(). Only the prototypes live here; the
 * definitions are not in this file (presumably assembly -- confirm). */

/* Consulted by cpu_detect() before any CPUID query is issued; a zero return is
 * handled there as a special case. */
110 int x265_cpu_cpuid_test(void);
/* Issues a CPUID query for leaf `op` and writes the four result registers
 * through eax/ebx/ecx/edx. cpu_detect() calls it with leaves 0, 1, 2, 7,
 * 0x80000000, 0x80000001 and 0x80000006. All four output pointers are passed
 * non-NULL by every caller in this file. */
111 void x265_cpu_cpuid(uint32_t op
, uint32_t *eax
, uint32_t *ebx
, uint32_t *ecx
, uint32_t *edx
);
/* Reads the extended control register selected by `op` and writes the result
 * through eax/edx. cpu_detect() calls it with op 0 and tests (eax & 0x6) == 0x6
 * as its OS-support check once the OSXSAVE and AVX CPUID bits are set. */
112 void x265_cpu_xgetbv(uint32_t op
, uint32_t *eax
, uint32_t *edx
);
115 #if defined(_MSC_VER)
116 #pragma warning(disable: 4309) // truncation of constant value
119 uint32_t cpu_detect(void)
123 uint32_t eax
, ebx
, ecx
, edx
;
124 uint32_t vendor
[4] = { 0 };
125 uint32_t max_extended_cap
, max_basic_cap
;
128 if (!x265_cpu_cpuid_test())
132 x265_cpu_cpuid(0, &eax
, vendor
+ 0, vendor
+ 2, vendor
+ 1);
134 if (max_basic_cap
== 0)
137 x265_cpu_cpuid(1, &eax
, &ebx
, &ecx
, &edx
);
138 if (edx
& 0x00800000)
142 if (edx
& 0x02000000)
143 cpu
|= X265_CPU_MMX2
| X265_CPU_SSE
;
144 if (edx
& 0x00008000)
145 cpu
|= X265_CPU_CMOV
;
148 if (edx
& 0x04000000)
149 cpu
|= X265_CPU_SSE2
;
150 if (ecx
& 0x00000001)
151 cpu
|= X265_CPU_SSE3
;
152 if (ecx
& 0x00000200)
153 cpu
|= X265_CPU_SSSE3
;
154 if (ecx
& 0x00080000)
155 cpu
|= X265_CPU_SSE4
;
156 if (ecx
& 0x00100000)
157 cpu
|= X265_CPU_SSE42
;
158 /* Check OXSAVE and AVX bits */
159 if ((ecx
& 0x18000000) == 0x18000000)
161 /* Check for OS support */
162 x265_cpu_xgetbv(0, &eax
, &edx
);
163 if ((eax
& 0x6) == 0x6)
166 if (ecx
& 0x00001000)
167 cpu
|= X265_CPU_FMA3
;
171 if (max_basic_cap
>= 7)
173 x265_cpu_cpuid(7, &eax
, &ebx
, &ecx
, &edx
);
174 /* AVX2 requires OS support, but BMI1/2 don't. */
175 if ((cpu
& X265_CPU_AVX
) && (ebx
& 0x00000020))
176 cpu
|= X265_CPU_AVX2
;
177 if (ebx
& 0x00000008)
179 cpu
|= X265_CPU_BMI1
;
180 if (ebx
& 0x00000100)
181 cpu
|= X265_CPU_BMI2
;
185 if (cpu
& X265_CPU_SSSE3
)
186 cpu
|= X265_CPU_SSE2_IS_FAST
;
188 x265_cpu_cpuid(0x80000000, &eax
, &ebx
, &ecx
, &edx
);
189 max_extended_cap
= eax
;
191 if (max_extended_cap
>= 0x80000001)
193 x265_cpu_cpuid(0x80000001, &eax
, &ebx
, &ecx
, &edx
);
195 if (ecx
& 0x00000020)
196 cpu
|= X265_CPU_LZCNT
; /* Supported by Intel chips starting with Haswell */
197 if (ecx
& 0x00000040) /* SSE4a, AMD only */
199 int family
= ((eax
>> 8) & 0xf) + ((eax
>> 20) & 0xff);
200 cpu
|= X265_CPU_SSE2_IS_FAST
; /* Phenom and later CPUs have fast SSE units */
203 cpu
&= ~X265_CPU_SSE2_IS_FAST
; /* SSSE3 doesn't imply fast SSE anymore... */
204 cpu
|= X265_CPU_SSE2_IS_SLOW
; /* Bobcat has 64-bit SIMD units */
205 cpu
|= X265_CPU_SLOW_PALIGNR
; /* palignr is insanely slow on Bobcat */
209 cpu
|= X265_CPU_SLOW_PSHUFB
; /* Jaguar's pshufb isn't that slow, but it's slow enough
210 * compared to alternate instruction sequences that this
211 * is equal or faster on almost all such functions. */
215 if (cpu
& X265_CPU_AVX
)
217 if (ecx
& 0x00000800) /* XOP */
219 if (ecx
& 0x00010000) /* FMA4 */
220 cpu
|= X265_CPU_FMA4
;
223 if (!strcmp((char*)vendor
, "AuthenticAMD"))
225 if (edx
& 0x00400000)
226 cpu
|= X265_CPU_MMX2
;
227 if (!(cpu
& X265_CPU_LZCNT
))
228 cpu
|= X265_CPU_SLOW_CTZ
;
229 if ((cpu
& X265_CPU_SSE2
) && !(cpu
& X265_CPU_SSE2_IS_FAST
))
230 cpu
|= X265_CPU_SSE2_IS_SLOW
; /* AMD CPUs come in two types: terrible at SSE and great at it */
234 if (!strcmp((char*)vendor
, "GenuineIntel"))
236 x265_cpu_cpuid(1, &eax
, &ebx
, &ecx
, &edx
);
237 int family
= ((eax
>> 8) & 0xf) + ((eax
>> 20) & 0xff);
238 int model
= ((eax
>> 4) & 0xf) + ((eax
>> 12) & 0xf0);
241 /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
242 * theoretically support sse2, but it's significantly slower than mmx for
243 * almost all of x264's functions, so let's just pretend they don't. */
244 if (model
== 9 || model
== 13 || model
== 14)
246 cpu
&= ~(X265_CPU_SSE2
| X265_CPU_SSE3
);
247 X265_CHECK(!(cpu
& (X265_CPU_SSSE3
| X265_CPU_SSE4
)), "unexpected CPU ID %d\n", cpu
);
249 /* Detect Atom CPU */
250 else if (model
== 28)
252 cpu
|= X265_CPU_SLOW_ATOM
;
253 cpu
|= X265_CPU_SLOW_CTZ
;
254 cpu
|= X265_CPU_SLOW_PSHUFB
;
257 /* Conroe has a slow shuffle unit. Check the model number to make sure not
258 * to include crippled low-end Penryns and Nehalems that don't have SSE4. */
259 else if ((cpu
& X265_CPU_SSSE3
) && !(cpu
& X265_CPU_SSE4
) && model
< 23)
260 cpu
|= X265_CPU_SLOW_SHUFFLE
;
264 if ((!strcmp((char*)vendor
, "GenuineIntel") || !strcmp((char*)vendor
, "CyrixInstead")) && !(cpu
& X265_CPU_SSE42
))
266 /* cacheline size is specified in 3 places, any of which may be missing */
267 x265_cpu_cpuid(1, &eax
, &ebx
, &ecx
, &edx
);
268 int cache
= (ebx
& 0xff00) >> 5; // cflush size
269 if (!cache
&& max_extended_cap
>= 0x80000006)
271 x265_cpu_cpuid(0x80000006, &eax
, &ebx
, &ecx
, &edx
);
272 cache
= ecx
& 0xff; // cacheline size
274 if (!cache
&& max_basic_cap
>= 2)
276 // Cache and TLB Information
277 static const char cache32_ids
[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
278 static const char cache64_ids
[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67,
279 0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
284 x265_cpu_cpuid(2, buf
+ 0, buf
+ 1, buf
+ 2, buf
+ 3);
287 for (int j
= 0; j
< 4; j
++)
292 if (strchr(cache32_ids
, buf
[j
] & 0xff))
294 if (strchr(cache64_ids
, buf
[j
] & 0xff))
304 cpu
|= X265_CPU_CACHELINE_32
;
305 else if (cache
== 64)
306 cpu
|= X265_CPU_CACHELINE_64
;
308 x265_log(NULL
, X265_LOG_WARNING
, "unable to determine cacheline size\n");
311 #if BROKEN_STACK_ALIGNMENT
312 cpu
|= X265_CPU_STACK_MOD4
;
/* ARM probes used by cpu_detect(). Only the prototypes live here; the
 * definitions are not in this file (presumably assembly -- confirm). */

/* Called by cpu_detect() while sigill_handler is installed for SIGILL.
 * NOTE(review): presumably executes a NEON instruction so that a CPU without
 * NEON raises SIGILL -- confirm against the implementation. */
321 void x265_cpu_neon_test(void);
/* A nonzero return makes cpu_detect() set X265_CPU_FAST_NEON_MRC; per the
 * comments at the call site the test relies on user access to the cycle counter. */
322 int x265_cpu_fast_neon_mrc_test(void);
325 uint32_t cpu_detect(void)
330 flags
|= X265_CPU_ARMV6
;
332 // don't do this hack if compiled with -mfpu=neon
334 static void (* oldsig
)(int);
335 oldsig
= signal(SIGILL
, sigill_handler
);
336 if (sigsetjmp(jmpbuf
, 1))
338 signal(SIGILL
, oldsig
);
343 x265_cpu_neon_test();
345 signal(SIGILL
, oldsig
);
346 #endif // if !HAVE_NEON
348 flags
|= X265_CPU_NEON
;
350 // fast neon -> arm (Cortex-A9) detection relies on user access to the
351 // cycle counter; this assumes ARMv7 performance counters.
352 // NEON requires at least ARMv7, ARMv8 may require changes here, but
353 // hopefully this hacky detection method will have been replaced by then.
354 // Note that there is potential for a race condition if another program or
355 // x264 instance disables or reinits the counters while x264 is using them,
356 // which may result in incorrect detection and the counters stuck enabled.
357 // right now Apple does not seem to support performance counters for this test
359 flags
|= x265_cpu_fast_neon_mrc_test() ? X265_CPU_FAST_NEON_MRC
: 0;
361 // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
362 #endif // if HAVE_ARMV6
366 #else // if X265_ARCH_X86
368 uint32_t cpu_detect(void)
373 #endif // if X265_ARCH_X86