source/common/cpu.cpp

   1 /*****************************************************************************
   2  * Copyright (C) 2013 x265 project
   3  *
   4  * Authors: Loren Merritt <lorenm@u.washington.edu>
   5  *          Laurent Aimar <fenrir@via.ecp.fr>
   6  *          Fiona Glaser <fiona@x264.com>
   7  *          Steve Borho <steve@borho.org>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  22  *
  23  * This program is also available under a commercial proprietary license.
  24  * For more information, contact us at license @ x265.com.
  25  *****************************************************************************/
  26
  27 #include "cpu.h"
  28 #include "common.h"
  29
  30 #if MACOS || SYS_FREEBSD
  31 #include <sys/types.h>
  32 #include <sys/sysctl.h>
  33 #endif
  34 #if SYS_OPENBSD
  35 #include <sys/param.h>
  36 #include <sys/sysctl.h>
  37 #include <machine/cpu.h>
  38 #endif
  39
  40 #if X265_ARCH_ARM && !defined(HAVE_NEON)
  41 #include <signal.h>
  42 #include <setjmp.h>
  43 static sigjmp_buf jmpbuf;
  44 static volatile sig_atomic_t canjump = 0;
  45
  46 static void sigill_handler(int sig)
  47 {
  48     if (!canjump)
  49     {
  50         signal(sig, SIG_DFL);
  51         raise(sig);
  52     }
  53
  54     canjump = 0;
  55     siglongjmp(jmpbuf, 1);
  56 }
  57
  58 #endif // if X265_ARCH_ARM
  59
  60 namespace x265 {
  61 const cpu_name_t cpu_names[] =
  62 {
  63 #if X265_ARCH_X86
  64 #define MMX2 X265_CPU_MMX | X265_CPU_MMX2 | X265_CPU_CMOV
  65     { "MMX2",        MMX2 },
  66     { "MMXEXT",      MMX2 },
  67     { "SSE",         MMX2 | X265_CPU_SSE },
  68 #define SSE2 MMX2 | X265_CPU_SSE | X265_CPU_SSE2
  69     { "SSE2Slow",    SSE2 | X265_CPU_SSE2_IS_SLOW },
  70     { "SSE2",        SSE2 },
  71     { "SSE2Fast",    SSE2 | X265_CPU_SSE2_IS_FAST },
  72     { "SSE3",        SSE2 | X265_CPU_SSE3 },
  73     { "SSSE3",       SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 },
  74     { "SSE4.1",      SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 },
  75     { "SSE4",        SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 },
  76     { "SSE4.2",      SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 | X265_CPU_SSE42 },
  77 #define AVX SSE2 | X265_CPU_SSE3 | X265_CPU_SSSE3 | X265_CPU_SSE4 | X265_CPU_SSE42 | X265_CPU_AVX
  78     { "AVX",         AVX },
  79     { "XOP",         AVX | X265_CPU_XOP },
  80     { "FMA4",        AVX | X265_CPU_FMA4 },
  81     { "AVX2",        AVX | X265_CPU_AVX2 },
  82     { "FMA3",        AVX | X265_CPU_FMA3 },
  83 #undef AVX
  84 #undef SSE2
  85 #undef MMX2
  86     { "Cache32",         X265_CPU_CACHELINE_32 },
  87     { "Cache64",         X265_CPU_CACHELINE_64 },
  88     { "LZCNT",           X265_CPU_LZCNT },
  89     { "BMI1",            X265_CPU_BMI1 },
  90     { "BMI2",            X265_CPU_BMI1 | X265_CPU_BMI2 },
  91     { "SlowCTZ",         X265_CPU_SLOW_CTZ },
  92     { "SlowAtom",        X265_CPU_SLOW_ATOM },
  93     { "SlowPshufb",      X265_CPU_SLOW_PSHUFB },
  94     { "SlowPalignr",     X265_CPU_SLOW_PALIGNR },
  95     { "SlowShuffle",     X265_CPU_SLOW_SHUFFLE },
  96     { "UnalignedStack",  X265_CPU_STACK_MOD4 },
  97
  98 #elif X265_ARCH_ARM
  99     { "ARMv6",           X265_CPU_ARMV6 },
 100     { "NEON",            X265_CPU_NEON },
 101     { "FastNeonMRC",     X265_CPU_FAST_NEON_MRC },
 102 #endif // if X265_ARCH_X86
 103     { "", 0 },
 104 };
 105
 106 #if X265_ARCH_X86
 107
 108 extern "C" {
 109 /* cpu-a.asm */
 110 int x265_cpu_cpuid_test(void);
 111 void x265_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
 112 void x265_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx);
 113 }
 114
 115 #if defined(_MSC_VER)
 116 #pragma warning(disable: 4309) // truncation of constant value
 117 #endif
 118
 119 uint32_t cpu_detect(void)
 120 {
 121     uint32_t cpu = 0;
 122
 123     uint32_t eax, ebx, ecx, edx;
 124     uint32_t vendor[4] = { 0 };
 125     uint32_t max_extended_cap, max_basic_cap;
 126
 127 #if !X86_64
 128     if (!x265_cpu_cpuid_test())
 129         return 0;
 130 #endif
 131
 132     x265_cpu_cpuid(0, &eax, vendor + 0, vendor + 2, vendor + 1);
 133     max_basic_cap = eax;
 134     if (max_basic_cap == 0)
 135         return 0;
 136
 137     x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
 138     if (edx & 0x00800000)
 139         cpu |= X265_CPU_MMX;
 140     else
 141         return cpu;
 142     if (edx & 0x02000000)
 143         cpu |= X265_CPU_MMX2 | X265_CPU_SSE;
 144     if (edx & 0x00008000)
 145         cpu |= X265_CPU_CMOV;
 146     else
 147         return cpu;
 148     if (edx & 0x04000000)
 149         cpu |= X265_CPU_SSE2;
 150     if (ecx & 0x00000001)
 151         cpu |= X265_CPU_SSE3;
 152     if (ecx & 0x00000200)
 153         cpu |= X265_CPU_SSSE3;
 154     if (ecx & 0x00080000)
 155         cpu |= X265_CPU_SSE4;
 156     if (ecx & 0x00100000)
 157         cpu |= X265_CPU_SSE42;
 158     /* Check OXSAVE and AVX bits */
 159     if ((ecx & 0x18000000) == 0x18000000)
 160     {
 161         /* Check for OS support */
 162         x265_cpu_xgetbv(0, &eax, &edx);
 163         if ((eax & 0x6) == 0x6)
 164         {
 165             cpu |= X265_CPU_AVX;
 166             if (ecx & 0x00001000)
 167                 cpu |= X265_CPU_FMA3;
 168         }
 169     }
 170
 171     if (max_basic_cap >= 7)
 172     {
 173         x265_cpu_cpuid(7, &eax, &ebx, &ecx, &edx);
 174         /* AVX2 requires OS support, but BMI1/2 don't. */
 175         if ((cpu & X265_CPU_AVX) && (ebx & 0x00000020))
 176             cpu |= X265_CPU_AVX2;
 177         if (ebx & 0x00000008)
 178         {
 179             cpu |= X265_CPU_BMI1;
 180             if (ebx & 0x00000100)
 181                 cpu |= X265_CPU_BMI2;
 182         }
 183     }
 184
 185     if (cpu & X265_CPU_SSSE3)
 186         cpu |= X265_CPU_SSE2_IS_FAST;
 187
 188     x265_cpu_cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
 189     max_extended_cap = eax;
 190
 191     if (max_extended_cap >= 0x80000001)
 192     {
 193         x265_cpu_cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
 194
 195         if (ecx & 0x00000020)
 196             cpu |= X265_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */
 197         if (ecx & 0x00000040) /* SSE4a, AMD only */
 198         {
 199             int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
 200             cpu |= X265_CPU_SSE2_IS_FAST;      /* Phenom and later CPUs have fast SSE units */
 201             if (family == 0x14)
 202             {
 203                 cpu &= ~X265_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
 204                 cpu |= X265_CPU_SSE2_IS_SLOW;  /* Bobcat has 64-bit SIMD units */
 205                 cpu |= X265_CPU_SLOW_PALIGNR;  /* palignr is insanely slow on Bobcat */
 206             }
 207             if (family == 0x16)
 208             {
 209                 cpu |= X265_CPU_SLOW_PSHUFB;   /* Jaguar's pshufb isn't that slow, but it's slow enough
 210                                                 * compared to alternate instruction sequences that this
 211                                                 * is equal or faster on almost all such functions. */
 212             }
 213         }
 214
 215         if (cpu & X265_CPU_AVX)
 216         {
 217             if (ecx & 0x00000800) /* XOP */
 218                 cpu |= X265_CPU_XOP;
 219             if (ecx & 0x00010000) /* FMA4 */
 220                 cpu |= X265_CPU_FMA4;
 221         }
 222
 223         if (!strcmp((char*)vendor, "AuthenticAMD"))
 224         {
 225             if (edx & 0x00400000)
 226                 cpu |= X265_CPU_MMX2;
 227             if (!(cpu & X265_CPU_LZCNT))
 228                 cpu |= X265_CPU_SLOW_CTZ;
 229             if ((cpu & X265_CPU_SSE2) && !(cpu & X265_CPU_SSE2_IS_FAST))
 230                 cpu |= X265_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
 231         }
 232     }
 233
 234     if (!strcmp((char*)vendor, "GenuineIntel"))
 235     {
 236         x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
 237         int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
 238         int model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
 239         if (family == 6)
 240         {
 241             /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
 242              * theoretically support sse2, but it's significantly slower than mmx for
 243              * almost all of x264's functions, so let's just pretend they don't. */
 244             if (model == 9 || model == 13 || model == 14)
 245             {
 246                 cpu &= ~(X265_CPU_SSE2 | X265_CPU_SSE3);
 247                 X265_CHECK(!(cpu & (X265_CPU_SSSE3 | X265_CPU_SSE4)), "unexpected CPU ID %d\n", cpu);
 248             }
 249             /* Detect Atom CPU */
 250             else if (model == 28)
 251             {
 252                 cpu |= X265_CPU_SLOW_ATOM;
 253                 cpu |= X265_CPU_SLOW_CTZ;
 254                 cpu |= X265_CPU_SLOW_PSHUFB;
 255             }
 256
 257             /* Conroe has a slow shuffle unit. Check the model number to make sure not
 258              * to include crippled low-end Penryns and Nehalems that don't have SSE4. */
 259             else if ((cpu & X265_CPU_SSSE3) && !(cpu & X265_CPU_SSE4) && model < 23)
 260                 cpu |= X265_CPU_SLOW_SHUFFLE;
 261         }
 262     }
 263
 264     if ((!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu & X265_CPU_SSE42))
 265     {
 266         /* cacheline size is specified in 3 places, any of which may be missing */
 267         x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
 268         int cache = (ebx & 0xff00) >> 5; // cflush size
 269         if (!cache && max_extended_cap >= 0x80000006)
 270         {
 271             x265_cpu_cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
 272             cache = ecx & 0xff; // cacheline size
 273         }
 274         if (!cache && max_basic_cap >= 2)
 275         {
 276             // Cache and TLB Information
 277             static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
 278             static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67,
 279                                                 0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
 280             uint32_t buf[4];
 281             int max, i = 0;
 282             do
 283             {
 284                 x265_cpu_cpuid(2, buf + 0, buf + 1, buf + 2, buf + 3);
 285                 max = buf[0] & 0xff;
 286                 buf[0] &= ~0xff;
 287                 for (int j = 0; j < 4; j++)
 288                 {
 289                     if (!(buf[j] >> 31))
 290                         while (buf[j])
 291                         {
 292                             if (strchr(cache32_ids, buf[j] & 0xff))
 293                                 cache = 32;
 294                             if (strchr(cache64_ids, buf[j] & 0xff))
 295                                 cache = 64;
 296                             buf[j] >>= 8;
 297                         }
 298                 }
 299             }
 300             while (++i < max);
 301         }
 302
 303         if (cache == 32)
 304             cpu |= X265_CPU_CACHELINE_32;
 305         else if (cache == 64)
 306             cpu |= X265_CPU_CACHELINE_64;
 307         else
 308             x265_log(NULL, X265_LOG_WARNING, "unable to determine cacheline size\n");
 309     }
 310
 311 #if BROKEN_STACK_ALIGNMENT
 312     cpu |= X265_CPU_STACK_MOD4;
 313 #endif
 314
 315     return cpu;
 316 }
 317
 318 #elif X265_ARCH_ARM
 319
 320 extern "C" {
 321 void x265_cpu_neon_test(void);
 322 int x265_cpu_fast_neon_mrc_test(void);
 323 }
 324
 325 uint32_t cpu_detect(void)
 326 {
 327     int flags = 0;
 328
 329 #if HAVE_ARMV6
 330     flags |= X265_CPU_ARMV6;
 331
 332     // don't do this hack if compiled with -mfpu=neon
 333 #if !HAVE_NEON
 334     static void (* oldsig)(int);
 335     oldsig = signal(SIGILL, sigill_handler);
 336     if (sigsetjmp(jmpbuf, 1))
 337     {
 338         signal(SIGILL, oldsig);
 339         return flags;
 340     }
 341
 342     canjump = 1;
 343     x265_cpu_neon_test();
 344     canjump = 0;
 345     signal(SIGILL, oldsig);
 346 #endif // if !HAVE_NEON
 347
 348     flags |= X265_CPU_NEON;
 349
 350     // fast neon -> arm (Cortex-A9) detection relies on user access to the
 351     // cycle counter; this assumes ARMv7 performance counters.
 352     // NEON requires at least ARMv7, ARMv8 may require changes here, but
 353     // hopefully this hacky detection method will have been replaced by then.
 354     // Note that there is potential for a race condition if another program or
 355     // x264 instance disables or reinits the counters while x264 is using them,
 356     // which may result in incorrect detection and the counters stuck enabled.
 357     // right now Apple does not seem to support performance counters for this test
 358 #ifndef __MACH__
 359     flags |= x265_cpu_fast_neon_mrc_test() ? X265_CPU_FAST_NEON_MRC : 0;
 360 #endif
 361     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
 362 #endif // if HAVE_ARMV6
 363     return flags;
 364 }
 365
 366 #else // if X265_ARCH_X86
 367
 368 uint32_t cpu_detect(void)
 369 {
 370     return 0;
 371 }
 372
 373 #endif // if X265_ARCH_X86
 374 }