/*****************************************************************************
 * Copyright (C) 2013 x265 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *          Fiona Glaser <fiona@x264.com>
 *          Steve Borho <steve@borho.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/
#include "cpu.h"
#include "common.h"

#if MACOS || SYS_FREEBSD
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
#if SYS_OPENBSD
#include <sys/param.h>
#include <sys/sysctl.h>
#include <machine/cpu.h>
#endif

#if X265_ARCH_ARM && !defined(HAVE_NEON)
#include <signal.h>
#include <setjmp.h>
static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;

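/* NEON support is probed by executing a NEON instruction and trapping the
 * SIGILL it raises on CPUs without NEON; this handler longjmps back to the
 * probe site in cpu_detect() so the probe fails cleanly instead of crashing. */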
static void sigill_handler(int sig)
{
    if (!canjump)
    {
        signal(sig, SIG_DFL);
        raise(sig);
    }

    canjump = 0;
    siglongjmp(jmpbuf, 1);
}

#endif // if X265_ARCH_ARM

namespace x265 {
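/* Human-readable capability names; each entry carries the complete set of flags
 * implied by that name (a superset chain: SSE2 implies MMX2, AVX implies SSE4.2,
 * and so on), presumably so the string form can be parsed or printed elsewhere. */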
const cpu_name_t cpu_names[] =
{
#if X265_ARCH_X86
#define MMX2 X265_CPU_MMX | X265_CPU_MMX2 | X265_CPU_CMOV
    { "MMX2", MMX2 },
    { "MMXEXT", MMX2 },
    { "SSE", MMX2 | X265_CPU_SSE },
#define SSE2 MMX2 | X265_CPU_SSE | X265_CPU_SSE2
    { "SSE2Slow", SSE2 | X265_CPU_SSE2_IS_SLOW },
    { "SSE2", SSE2 },
    { "SSE2Fast", SSE2 | X265_CPU_SSE2_IS_FAST },
    { "SSE3", SSE2 | X265_CPU_SSE3 },
    { "SSSE3", SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 },
    { "SSE4.1", SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 },
    { "SSE4", SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 },
    { "SSE4.2", SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 | X265_CPU_SSE42 },
#define AVX SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 | X265_CPU_SSE42 | X265_CPU_AVX
    { "AVX", AVX },
    { "XOP", AVX | X265_CPU_XOP },
    { "FMA4", AVX | X265_CPU_FMA4 },
    { "AVX2", AVX | X265_CPU_AVX2 },
    { "FMA3", AVX | X265_CPU_FMA3 },
#undef AVX
#undef SSE2
#undef MMX2
    { "Cache32", X265_CPU_CACHELINE_32 },
    { "Cache64", X265_CPU_CACHELINE_64 },
    { "LZCNT", X265_CPU_LZCNT },
    { "BMI1", X265_CPU_BMI1 },
    { "BMI2", X265_CPU_BMI1 | X265_CPU_BMI2 },
    { "SlowCTZ", X265_CPU_SLOW_CTZ },
    { "SlowAtom", X265_CPU_SLOW_ATOM },
    { "SlowPshufb", X265_CPU_SLOW_PSHUFB },
    { "SlowPalignr", X265_CPU_SLOW_PALIGNR },
    { "SlowShuffle", X265_CPU_SLOW_SHUFFLE },
    { "UnalignedStack", X265_CPU_STACK_MOD4 },

#elif X265_ARCH_ARM
    { "ARMv6", X265_CPU_ARMV6 },
    { "NEON", X265_CPU_NEON },
    { "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
#endif // if X265_ARCH_X86
    { "", 0 },
};

#if X265_ARCH_X86

extern "C" {
/* cpu-a.asm */
int x265_cpu_cpuid_test(void);
void x265_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
void x265_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx);
}

#if defined(_MSC_VER)
#pragma warning(disable: 4309) // truncation of constant value
#endif

uint32_t cpu_detect(void)
{
    uint32_t cpu = 0;

    uint32_t eax, ebx, ecx, edx;
    uint32_t vendor[4] = { 0 };
    uint32_t max_extended_cap, max_basic_cap;

#if !X86_64
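    /* On 32-bit builds the CPUID instruction itself may be absent on very old
     * CPUs; x265_cpu_cpuid_test() (cpu-a.asm) is expected to verify it,
     * classically by toggling the EFLAGS.ID bit. */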
    if (!x265_cpu_cpuid_test())
        return 0;
#endif

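    /* CPUID leaf 0: the 12-byte vendor string lives in EBX, EDX, ECX in that
     * order, so the registers are written into vendor[0..2] out of argument
     * order to make the string directly comparable with strcmp() below. */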
    x265_cpu_cpuid(0, &eax, vendor + 0, vendor + 2, vendor + 1);
    max_basic_cap = eax;
    if (max_basic_cap == 0)
        return 0;

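    /* CPUID leaf 1, standard feature flags. Bits tested below:
     *   EDX bit 23 = MMX, bit 25 = SSE, bit 15 = CMOV, bit 26 = SSE2
     *   ECX bit 0 = SSE3, bit 9 = SSSE3, bit 19 = SSE4.1, bit 20 = SSE4.2,
     *       bit 12 = FMA3, bits 27/28 = OSXSAVE/AVX (checked further down) */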
    x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
    if (edx & 0x00800000)
        cpu |= X265_CPU_MMX;
    else
        return cpu;
    if (edx & 0x02000000)
        cpu |= X265_CPU_MMX2 | X265_CPU_SSE;
    if (edx & 0x00008000)
        cpu |= X265_CPU_CMOV;
    else
        return cpu;
    if (edx & 0x04000000)
        cpu |= X265_CPU_SSE2;
    if (ecx & 0x00000001)
        cpu |= X265_CPU_SSE3;
    if (ecx & 0x00000200)
        cpu |= X265_CPU_SSSE3;
    if (ecx & 0x00080000)
        cpu |= X265_CPU_SSE4;
    if (ecx & 0x00100000)
        cpu |= X265_CPU_SSE42;
    /* Check OSXSAVE and AVX bits */
    if ((ecx & 0x18000000) == 0x18000000)
    {
        /* Check for OS support: XGETBV(0) returns XCR0, and bits 1 (XMM) and
         * 2 (YMM) must both be set, meaning the OS saves/restores AVX state */
        x265_cpu_xgetbv(0, &eax, &edx);
        if ((eax & 0x6) == 0x6)
        {
            cpu |= X265_CPU_AVX;
            if (ecx & 0x00001000)
                cpu |= X265_CPU_FMA3;
        }
    }

    if (max_basic_cap >= 7)
    {
        x265_cpu_cpuid(7, &eax, &ebx, &ecx, &edx);
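        /* Structured extended feature flags (leaf 7, sub-leaf 0):
         *   EBX bit 3 = BMI1, bit 5 = AVX2, bit 8 = BMI2 */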
        /* AVX2 requires OS support, but BMI1/2 don't. */
        if ((cpu & X265_CPU_AVX) && (ebx & 0x00000020))
            cpu |= X265_CPU_AVX2;
        if (ebx & 0x00000008)
        {
            cpu |= X265_CPU_BMI1;
            if (ebx & 0x00000100)
                cpu |= X265_CPU_BMI2;
        }
    }

    if (cpu & X265_CPU_SSSE3)
        cpu |= X265_CPU_SSE2_IS_FAST;

    x265_cpu_cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
    max_extended_cap = eax;

    if (max_extended_cap >= 0x80000001)
    {
        x265_cpu_cpuid(0x80000001, &eax, &ebx, &ecx, &edx);

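        /* Extended feature flags (leaf 0x80000001):
         *   ECX bit 5 = LZCNT/ABM, bit 6 = SSE4a (AMD), bit 11 = XOP, bit 16 = FMA4
         *   EDX bit 22 = AMD MMX extensions */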
        if (ecx & 0x00000020)
            cpu |= X265_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */
        if (ecx & 0x00000040) /* SSE4a, AMD only */
        {
            int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
            cpu |= X265_CPU_SSE2_IS_FAST; /* Phenom and later CPUs have fast SSE units */
            if (family == 0x14)
            {
                cpu &= ~X265_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
                cpu |= X265_CPU_SSE2_IS_SLOW;  /* Bobcat has 64-bit SIMD units */
                cpu |= X265_CPU_SLOW_PALIGNR;  /* palignr is insanely slow on Bobcat */
            }
            if (family == 0x16)
            {
                cpu |= X265_CPU_SLOW_PSHUFB; /* Jaguar's pshufb isn't that slow, but it's slow enough
                                              * compared to alternate instruction sequences that this
                                              * is equal or faster on almost all such functions. */
            }
        }

        if (cpu & X265_CPU_AVX)
        {
            if (ecx & 0x00000800) /* XOP */
                cpu |= X265_CPU_XOP;
            if (ecx & 0x00010000) /* FMA4 */
                cpu |= X265_CPU_FMA4;
        }

        if (!strcmp((char*)vendor, "AuthenticAMD"))
        {
            if (edx & 0x00400000)
                cpu |= X265_CPU_MMX2;
            if (!(cpu & X265_CPU_LZCNT))
                cpu |= X265_CPU_SLOW_CTZ;
            if ((cpu & X265_CPU_SSE2) && !(cpu & X265_CPU_SSE2_IS_FAST))
                cpu |= X265_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
        }
    }

    if (!strcmp((char*)vendor, "GenuineIntel"))
    {
        x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
        int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
        int model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
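        /* family/model combine the base and extended CPUID fields:
         * family = Family_ID + Extended_Family_ID,
         * model  = (Extended_Model_ID << 4) + Model_ID
         * (the same family formula is used in the AMD path above) */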
        if (family == 6)
        {
            /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
             * theoretically support sse2, but it's significantly slower than mmx for
             * almost all of x264's functions, so let's just pretend they don't. */
            if (model == 9 || model == 13 || model == 14)
            {
                cpu &= ~(X265_CPU_SSE2 | X265_CPU_SSE3);
                X265_CHECK(!(cpu & (X265_CPU_SSSE3 | X265_CPU_SSE4)), "unexpected CPU ID %d\n", cpu);
            }
            /* Detect Atom CPU */
            else if (model == 28)
            {
                cpu |= X265_CPU_SLOW_ATOM;
                cpu |= X265_CPU_SLOW_CTZ;
                cpu |= X265_CPU_SLOW_PSHUFB;
            }

            /* Conroe has a slow shuffle unit. Check the model number to make sure not
             * to include crippled low-end Penryns and Nehalems that don't have SSE4. */
            else if ((cpu & X265_CPU_SSSE3) && !(cpu & X265_CPU_SSE4) && model < 23)
                cpu |= X265_CPU_SLOW_SHUFFLE;
        }
    }

    if ((!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu & X265_CPU_SSE42))
    {
        /* cacheline size is specified in 3 places, any of which may be missing */
        x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
        int cache = (ebx & 0xff00) >> 5; // CLFLUSH line size: EBX[15:8] counts 8-byte chunks, so this extracts the field and scales it to bytes
        if (!cache && max_extended_cap >= 0x80000006)
        {
            x265_cpu_cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
            cache = ecx & 0xff; // cacheline size
        }
        if (!cache && max_basic_cap >= 2)
        {
            // Cache and TLB Information
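            // CPUID leaf 2 packs one-byte cache/TLB descriptors into EAX..EDX;
            // the low byte of EAX says how many times the leaf must be queried,
            // and any register with bit 31 set carries no valid descriptors.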
            static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
            static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67,
                                                0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
            uint32_t buf[4];
            int max, i = 0;
            do
            {
                x265_cpu_cpuid(2, buf + 0, buf + 1, buf + 2, buf + 3);
                max = buf[0] & 0xff;
                buf[0] &= ~0xff;
                for (int j = 0; j < 4; j++)
                {
                    if (!(buf[j] >> 31))
                        while (buf[j])
                        {
                            if (strchr(cache32_ids, buf[j] & 0xff))
                                cache = 32;
                            if (strchr(cache64_ids, buf[j] & 0xff))
                                cache = 64;
                            buf[j] >>= 8;
                        }
                }
            }
            while (++i < max);
        }

        if (cache == 32)
            cpu |= X265_CPU_CACHELINE_32;
        else if (cache == 64)
            cpu |= X265_CPU_CACHELINE_64;
        else
            x265_log(NULL, X265_LOG_WARNING, "unable to determine cacheline size\n");
    }

#if BROKEN_STACK_ALIGNMENT
    cpu |= X265_CPU_STACK_MOD4;
#endif

    return cpu;
}

#elif X265_ARCH_ARM

extern "C" {
void x265_cpu_neon_test(void);
int x265_cpu_fast_neon_mrc_test(void);
}
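
/* Both probes live in the assembly sources: x265_cpu_neon_test() executes a
 * NEON instruction (the SIGILL handler above catches it on CPUs without NEON),
 * and x265_cpu_fast_neon_mrc_test() presumably times cycle-counter reads to
 * tell fast-MRC cores (Cortex-A9) from slower ones. */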

uint32_t cpu_detect(void)
{
    int flags = 0;

#if HAVE_ARMV6
    flags |= X265_CPU_ARMV6;

    // don't do this hack if compiled with -mfpu=neon
#if !HAVE_NEON
    static void (* oldsig)(int);
    oldsig = signal(SIGILL, sigill_handler);
    if (sigsetjmp(jmpbuf, 1))
    {
        signal(SIGILL, oldsig);
        return flags;
    }

    canjump = 1;
    x265_cpu_neon_test();
    canjump = 0;
    signal(SIGILL, oldsig);
#endif // if !HAVE_NEON

    flags |= X265_CPU_NEON;

    // fast neon -> arm (Cortex-A9) detection relies on user access to the
    // cycle counter; this assumes ARMv7 performance counters.
    // NEON requires at least ARMv7, ARMv8 may require changes here, but
    // hopefully this hacky detection method will have been replaced by then.
    // Note that there is potential for a race condition if another program or
    // x264 instance disables or reinits the counters while x264 is using them,
    // which may result in incorrect detection and the counters stuck enabled.
    // right now Apple does not seem to support performance counters for this test
#ifndef __MACH__
    flags |= x265_cpu_fast_neon_mrc_test() ? X265_CPU_FAST_NEON_MRC : 0;
#endif
    // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
#endif // if HAVE_ARMV6
    return flags;
}

#else // if X265_ARCH_X86

uint32_t cpu_detect(void)
{
    return 0;
}

#endif // if X265_ARCH_X86
}