[deb_x265.git] / source / common / cpu.cpp

/*****************************************************************************
 * Copyright (C) 2013 x265 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *          Fiona Glaser <fiona@x264.com>
 *          Steve Borho <steve@borho.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

#include "cpu.h"
#include "common.h"

#if MACOS || SYS_FREEBSD
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
#if SYS_OPENBSD
#include <sys/param.h>
#include <sys/sysctl.h>
#include <machine/cpu.h>
#endif

/* Support code for the SIGILL-based NEON probe on ARM. It is only needed when
 * NEON availability is not fixed at build time, so the guard uses the same
 * `!HAVE_NEON` test as the point in cpu_detect() where the handler is
 * installed (an undefined HAVE_NEON evaluates to 0 in both places). */
#if X265_ARCH_ARM && !HAVE_NEON
#include <signal.h>
#include <setjmp.h>
static sigjmp_buf jmpbuf;                 /* jump target, armed by sigsetjmp() in cpu_detect() */
static volatile sig_atomic_t canjump = 0; /* non-zero only while a probe is running and jmpbuf is armed */

/* Signal handler installed around the NEON probe instruction.
 *
 * sig: the signal being delivered (SIGILL when installed by cpu_detect()).
 *
 * While a probe is in flight (canjump != 0) the handler disarms itself and
 * jumps back to the sigsetjmp() call site, which then sees a non-zero return
 * value. Any other delivery is not ours to handle: the default disposition is
 * restored and the signal is re-raised. */
static void sigill_handler(int sig)
{
    if (!canjump)
    {
        signal(sig, SIG_DFL);
        raise(sig);
        /* If the signal is blocked while this handler runs, raise() only
         * leaves it pending. Return so it is delivered with the default
         * action instead of jumping through a jmpbuf that is unset or that
         * belongs to a stack frame which no longer exists. */
        return;
    }

    canjump = 0;
    siglongjmp(jmpbuf, 1);
}

#endif // if X265_ARCH_ARM && !HAVE_NEON

namespace x265 {
/* Table pairing each CPU capability name with the flag mask it stands for.
 * For the x86 SIMD levels the mask of a level also contains the flags of the
 * levels it builds on (MMX2 -> SSE -> SSE2 -> SSE3 -> SSSE3 -> SSE4.1 ->
 * SSE4.2 -> AVX). The last entry, with an empty name and a zero mask, acts
 * as the end-of-table sentinel. */
const cpu_name_t cpu_names[] =
{
#if X265_ARCH_X86
/* Cumulative masks, private to this table and removed again below. */
#define CPUSET_MMX2  (X265_CPU_MMX | X265_CPU_MMX2 | X265_CPU_CMOV)
#define CPUSET_SSE   (CPUSET_MMX2 | X265_CPU_SSE)
#define CPUSET_SSE2  (CPUSET_SSE | X265_CPU_SSE2)
#define CPUSET_SSE3  (CPUSET_SSE2 | X265_CPU_SSE3)
#define CPUSET_SSSE3 (CPUSET_SSE3 | X265_CPU_SSSE3)
#define CPUSET_SSE41 (CPUSET_SSSE3 | X265_CPU_SSE4)
#define CPUSET_SSE42 (CPUSET_SSE41 | X265_CPU_SSE42)
#define CPUSET_AVX   (CPUSET_SSE42 | X265_CPU_AVX)

    /* SIMD instruction-set levels */
    { "MMX2",        CPUSET_MMX2 },
    { "MMXEXT",      CPUSET_MMX2 },
    { "SSE",         CPUSET_SSE },
    { "SSE2Slow",    CPUSET_SSE2 | X265_CPU_SSE2_IS_SLOW },
    { "SSE2",        CPUSET_SSE2 },
    { "SSE2Fast",    CPUSET_SSE2 | X265_CPU_SSE2_IS_FAST },
    { "SSE3",        CPUSET_SSE3 },
    { "SSSE3",       CPUSET_SSSE3 },
    { "SSE4.1",      CPUSET_SSE41 },
    { "SSE4",        CPUSET_SSE41 },
    { "SSE4.2",      CPUSET_SSE42 },
    { "AVX",         CPUSET_AVX },
    { "XOP",         CPUSET_AVX | X265_CPU_XOP },
    { "FMA4",        CPUSET_AVX | X265_CPU_FMA4 },
    { "AVX2",        CPUSET_AVX | X265_CPU_AVX2 },
    { "FMA3",        CPUSET_AVX | X265_CPU_FMA3 },

#undef CPUSET_AVX
#undef CPUSET_SSE42
#undef CPUSET_SSE41
#undef CPUSET_SSSE3
#undef CPUSET_SSE3
#undef CPUSET_SSE2
#undef CPUSET_SSE
#undef CPUSET_MMX2

    /* Stand-alone flags: cache line size, bit-manipulation extensions and
     * per-microarchitecture performance hints */
    { "Cache32",         X265_CPU_CACHELINE_32 },
    { "Cache64",         X265_CPU_CACHELINE_64 },
    { "LZCNT",           X265_CPU_LZCNT },
    { "BMI1",            X265_CPU_BMI1 },
    { "BMI2",            X265_CPU_BMI1 | X265_CPU_BMI2 },
    { "SlowCTZ",         X265_CPU_SLOW_CTZ },
    { "SlowAtom",        X265_CPU_SLOW_ATOM },
    { "SlowPshufb",      X265_CPU_SLOW_PSHUFB },
    { "SlowPalignr",     X265_CPU_SLOW_PALIGNR },
    { "SlowShuffle",     X265_CPU_SLOW_SHUFFLE },
    { "UnalignedStack",  X265_CPU_STACK_MOD4 },

#elif X265_ARCH_ARM
    { "ARMv6",           X265_CPU_ARMV6 },
    { "NEON",            X265_CPU_NEON },
    { "FastNeonMRC",     X265_CPU_FAST_NEON_MRC },
#endif // if X265_ARCH_X86

    /* sentinel */
    { "", 0 },
};

#if X265_ARCH_X86

/* Low-level x86 helpers implemented in assembly; C linkage so the symbol
 * names are not mangled. */
extern "C" {
/* cpu-a.asm */

/* cpu_detect() treats a zero return as "CPUID is not usable" and reports no
 * capabilities; it is only consulted on non-x86-64 builds. */
int x265_cpu_cpuid_test(void);

/* Queries CPUID leaf `op`; cpu_detect() reads the four register values back
 * through the eax/ebx/ecx/edx pointers after the call. */
void x265_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);

/* Queries extended control register `op`; cpu_detect() reads the result back
 * through eax (and passes edx for the other half). */
void x265_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx);
}

/* NOTE(review): presumably needed for the `char` cache-descriptor tables in
 * cpu_detect(), which contain constants above 0x7f — confirm. */
#if defined(_MSC_VER)
#pragma warning(disable: 4309) // truncation of constant value
#endif

/* Probes the running x86 CPU via CPUID/XGETBV and returns the detected
 * capabilities as a bitmask of X265_CPU_* flags.
 *
 * Returns 0 when CPUID is unavailable (32-bit builds only), when the maximum
 * basic CPUID leaf is 0, or when MMX is missing. Detection also stops early,
 * returning what has been gathered so far, when CMOV is missing.
 *
 * Besides instruction-set flags the result carries performance hints
 * (SSE2_IS_SLOW/FAST, SLOW_* flags) derived from vendor/family/model, and,
 * for Intel/Cyrix parts without SSE4.2, the cache line size. A warning is
 * logged when that cache line size cannot be determined. */
uint32_t cpu_detect(void)
{
    uint32_t cpu = 0;

    uint32_t eax, ebx, ecx, edx;
    uint32_t vendor[4] = { 0 }; /* 12 vendor-id bytes; the 4th word stays 0 and NUL-terminates the string */
    uint32_t max_extended_cap, max_basic_cap;

#if !X86_64
    if (!x265_cpu_cpuid_test())
        return 0;
#endif

    /* Leaf 0: maximum basic leaf in eax; the vendor string is assembled in
     * ebx, edx, ecx order. */
    x265_cpu_cpuid(0, &eax, vendor + 0, vendor + 2, vendor + 1);
    max_basic_cap = eax;
    if (max_basic_cap == 0)
        return 0;

    /* Leaf 1: baseline feature bits. */
    x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
    if (edx & 0x00800000)
        cpu |= X265_CPU_MMX;
    else
        return cpu;
    if (edx & 0x02000000)
        cpu |= X265_CPU_MMX2 | X265_CPU_SSE;
    if (edx & 0x00008000)
        cpu |= X265_CPU_CMOV;
    else
        return cpu;
    if (edx & 0x04000000)
        cpu |= X265_CPU_SSE2;
    if (ecx & 0x00000001)
        cpu |= X265_CPU_SSE3;
    if (ecx & 0x00000200)
        cpu |= X265_CPU_SSSE3;
    if (ecx & 0x00080000)
        cpu |= X265_CPU_SSE4;
    if (ecx & 0x00100000)
        cpu |= X265_CPU_SSE42;
    /* Check OXSAVE and AVX bits */
    if ((ecx & 0x18000000) == 0x18000000)
    {
        /* Check for OS support. xgetbv overwrites eax/edx only, so ecx still
         * holds the leaf-1 feature bits for the FMA3 test below. */
        x265_cpu_xgetbv(0, &eax, &edx);
        if ((eax & 0x6) == 0x6)
        {
            cpu |= X265_CPU_AVX;
            if (ecx & 0x00001000)
                cpu |= X265_CPU_FMA3;
        }
    }

    if (max_basic_cap >= 7)
    {
        x265_cpu_cpuid(7, &eax, &ebx, &ecx, &edx);
        /* AVX2 requires OS support, but BMI1/2 don't. */
        if ((cpu & X265_CPU_AVX) && (ebx & 0x00000020))
            cpu |= X265_CPU_AVX2;
        if (ebx & 0x00000008)
        {
            cpu |= X265_CPU_BMI1;
            if (ebx & 0x00000100)
                cpu |= X265_CPU_BMI2;
        }
    }

    /* Default assumption; revoked below for AMD family 0x14. */
    if (cpu & X265_CPU_SSSE3)
        cpu |= X265_CPU_SSE2_IS_FAST;

    x265_cpu_cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
    max_extended_cap = eax;

    if (max_extended_cap >= 0x80000001)
    {
        x265_cpu_cpuid(0x80000001, &eax, &ebx, &ecx, &edx);

        if (ecx & 0x00000020)
            cpu |= X265_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */
        if (ecx & 0x00000040) /* SSE4a, AMD only */
        {
            int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
            cpu |= X265_CPU_SSE2_IS_FAST;      /* Phenom and later CPUs have fast SSE units */
            if (family == 0x14)
            {
                cpu &= ~X265_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
                cpu |= X265_CPU_SSE2_IS_SLOW;  /* Bobcat has 64-bit SIMD units */
                cpu |= X265_CPU_SLOW_PALIGNR;  /* palignr is insanely slow on Bobcat */
            }
            if (family == 0x16)
            {
                cpu |= X265_CPU_SLOW_PSHUFB;   /* Jaguar's pshufb isn't that slow, but it's slow enough
                                                * compared to alternate instruction sequences that this
                                                * is equal or faster on almost all such functions. */
            }
        }

        /* XOP and FMA4 are only reported when AVX (and thus OS support for
         * it) has been established above. */
        if (cpu & X265_CPU_AVX)
        {
            if (ecx & 0x00000800) /* XOP */
                cpu |= X265_CPU_XOP;
            if (ecx & 0x00010000) /* FMA4 */
                cpu |= X265_CPU_FMA4;
        }

        if (!strcmp((char*)vendor, "AuthenticAMD"))
        {
            if (edx & 0x00400000)
                cpu |= X265_CPU_MMX2;
            if (!(cpu & X265_CPU_LZCNT))
                cpu |= X265_CPU_SLOW_CTZ;
            if ((cpu & X265_CPU_SSE2) && !(cpu & X265_CPU_SSE2_IS_FAST))
                cpu |= X265_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
        }
    }

    if (!strcmp((char*)vendor, "GenuineIntel"))
    {
        /* Re-read leaf 1: eax was overwritten by the later queries. */
        x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
        int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
        int model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
        if (family == 6)
        {
            /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
             * theoretically support sse2, but it's significantly slower than mmx for
             * almost all of x264's functions, so let's just pretend they don't. */
            if (model == 9 || model == 13 || model == 14)
            {
                cpu &= ~(X265_CPU_SSE2 | X265_CPU_SSE3);
                X265_CHECK(!(cpu & (X265_CPU_SSSE3 | X265_CPU_SSE4)), "unexpected CPU ID %d\n", cpu);
            }
            /* Detect Atom CPU */
            else if (model == 28)
            {
                cpu |= X265_CPU_SLOW_ATOM;
                cpu |= X265_CPU_SLOW_CTZ;
                cpu |= X265_CPU_SLOW_PSHUFB;
            }

            /* Conroe has a slow shuffle unit. Check the model number to make sure not
             * to include crippled low-end Penryns and Nehalems that don't have SSE4. */
            else if ((cpu & X265_CPU_SSSE3) && !(cpu & X265_CPU_SSE4) && model < 23)
                cpu |= X265_CPU_SLOW_SHUFFLE;
        }
    }

    if ((!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu & X265_CPU_SSE42))
    {
        /* cacheline size is specified in 3 places, any of which may be missing */
        x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
        int cache = (ebx & 0xff00) >> 5; // cflush size
        if (!cache && max_extended_cap >= 0x80000006)
        {
            x265_cpu_cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
            cache = ecx & 0xff; // cacheline size
        }
        if (!cache && max_basic_cap >= 2)
        {
            // Cache and TLB Information
            // Both tables are NUL-terminated so they can be searched with strchr().
            static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
            static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67,
                                                0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
            uint32_t buf[4];
            int max, i = 0;
            do
            {
                x265_cpu_cpuid(2, buf + 0, buf + 1, buf + 2, buf + 3);
                max = buf[0] & 0xff; // low byte of eax: how many times leaf 2 has to be queried
                buf[0] &= ~0xff;
                for (int j = 0; j < 4; j++)
                {
                    // a register with bit 31 set is skipped entirely
                    if (!(buf[j] >> 31))
                    {
                        // walk the register one descriptor byte at a time, lowest byte first
                        while (buf[j])
                        {
                            const int id = buf[j] & 0xff;

                            // strchr() also matches the terminating NUL of the tables,
                            // so a null descriptor (such as the cleared low byte of eax)
                            // must be skipped rather than looked up.
                            if (id)
                            {
                                if (strchr(cache32_ids, id))
                                    cache = 32;
                                if (strchr(cache64_ids, id))
                                    cache = 64;
                            }
                            buf[j] >>= 8;
                        }
                    }
                }
            }
            while (++i < max);
        }

        if (cache == 32)
            cpu |= X265_CPU_CACHELINE_32;
        else if (cache == 64)
            cpu |= X265_CPU_CACHELINE_64;
        else
            x265_log(NULL, X265_LOG_WARNING, "unable to determine cacheline size\n");
    }

#if BROKEN_STACK_ALIGNMENT
    cpu |= X265_CPU_STACK_MOD4;
#endif

    return cpu;
}

#elif X265_ARCH_ARM

/* ARM probe routines with C linkage; their definitions are not in this file
 * (presumably assembly — verify). */
extern "C" {
/* Called by cpu_detect() with a SIGILL handler installed; cpu_detect() treats
 * a SIGILL raised during this call as "NEON not available". */
void x265_cpu_neon_test(void);
/* A non-zero return makes cpu_detect() set X265_CPU_FAST_NEON_MRC. */
int x265_cpu_fast_neon_mrc_test(void);
}

/* Probes the running ARM CPU and returns the detected capabilities as a
 * bitmask of X265_CPU_* flags.
 *
 * Without HAVE_ARMV6 the result is always 0. Otherwise X265_CPU_ARMV6 is
 * always set, X265_CPU_NEON is set when NEON is assumed at build time
 * (HAVE_NEON) or when the runtime probe completes without SIGILL, and
 * X265_CPU_FAST_NEON_MRC is set when the cycle-counter based test reports it
 * (never on __MACH__ targets, where that test is skipped). */
uint32_t cpu_detect(void)
{
    int flags = 0;

#if HAVE_ARMV6
    flags |= X265_CPU_ARMV6;

    // don't do this hack if compiled with -mfpu=neon
#if !HAVE_NEON
    // NOTE(review): oldsig is static presumably so that its value is still
    // well-defined after siglongjmp() returns control to sigsetjmp() — confirm.
    static void (* oldsig)(int);
    oldsig = signal(SIGILL, sigill_handler);
    if (sigsetjmp(jmpbuf, 1))
    {
        // Reached via siglongjmp() from sigill_handler: the probe faulted, so
        // restore the previous handler and report the flags without NEON.
        signal(SIGILL, oldsig);
        return flags;
    }

    // canjump tells sigill_handler that jmpbuf is armed for the probe call
    canjump = 1;
    x265_cpu_neon_test();
    canjump = 0;
    signal(SIGILL, oldsig);
#endif // if !HAVE_NEON

    flags |= X265_CPU_NEON;

    // fast neon -> arm (Cortex-A9) detection relies on user access to the
    // cycle counter; this assumes ARMv7 performance counters.
    // NEON requires at least ARMv7, ARMv8 may require changes here, but
    // hopefully this hacky detection method will have been replaced by then.
    // Note that there is potential for a race condition if another program or
    // x265 instance disables or reinits the counters while x265 is using them,
    // which may result in incorrect detection and the counters stuck enabled.
    // right now Apple does not seem to support performance counters for this test
#ifndef __MACH__
    flags |= x265_cpu_fast_neon_mrc_test() ? X265_CPU_FAST_NEON_MRC : 0;
#endif
    // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
#endif // if HAVE_ARMV6
    return flags;
}

#else // if X265_ARCH_X86

/* Fallback used when neither the x86 nor the ARM probe is compiled in:
 * reports an empty capability mask. */
uint32_t cpu_detect(void)
{
    const uint32_t no_flags = 0;

    return no_flags;
}

#endif // if X265_ARCH_X86
}
Commit	Line	Data
	1	/*****************************************************************************
	2	* Copyright (C) 2013 x265 project
	3	*
	4	* Authors: Loren Merritt <lorenm@u.washington.edu>
	5	* Laurent Aimar <fenrir@via.ecp.fr>
	6	* Fiona Glaser <fiona@x264.com>
	7	* Steve Borho <steve@borho.org>
	8	*
	9	* This program is free software; you can redistribute it and/or modify
	10	* it under the terms of the GNU General Public License as published by
	11	* the Free Software Foundation; either version 2 of the License, or
	12	* (at your option) any later version.
	13	*
	14	* This program is distributed in the hope that it will be useful,
	15	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	16	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	17	* GNU General Public License for more details.
	18	*
	19	* You should have received a copy of the GNU General Public License
	20	* along with this program; if not, write to the Free Software
	21	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
	22	*
	23	* This program is also available under a commercial proprietary license.
	24	* For more information, contact us at license @ x265.com.
	25	*****************************************************************************/
	26
	27	#include "cpu.h"
	28	#include "common.h"
	29
	30	#if MACOS \|\| SYS_FREEBSD
	31	#include <sys/types.h>
	32	#include <sys/sysctl.h>
	33	#endif
	34	#if SYS_OPENBSD
	35	#include <sys/param.h>
	36	#include <sys/sysctl.h>
	37	#include <machine/cpu.h>
	38	#endif
	39
	40	#if X265_ARCH_ARM && !defined(HAVE_NEON)
	41	#include <signal.h>
	42	#include <setjmp.h>
	43	static sigjmp_buf jmpbuf;
	44	static volatile sig_atomic_t canjump = 0;
	45
	46	static void sigill_handler(int sig)
	47	{
	48	if (!canjump)
	49	{
	50	signal(sig, SIG_DFL);
	51	raise(sig);
	52	}
	53
	54	canjump = 0;
	55	siglongjmp(jmpbuf, 1);
	56	}
	57
	58	#endif // if X265_ARCH_ARM
	59
	60	namespace x265 {
	61	const cpu_name_t cpu_names[] =
	62	{
	63	#if X265_ARCH_X86
	64	#define MMX2 X265_CPU_MMX \| X265_CPU_MMX2 \| X265_CPU_CMOV
	65	{ "MMX2", MMX2 },
	66	{ "MMXEXT", MMX2 },
	67	{ "SSE", MMX2 \| X265_CPU_SSE },
	68	#define SSE2 MMX2 \| X265_CPU_SSE \| X265_CPU_SSE2
	69	{ "SSE2Slow", SSE2 \| X265_CPU_SSE2_IS_SLOW },
	70	{ "SSE2", SSE2 },
	71	{ "SSE2Fast", SSE2 \| X265_CPU_SSE2_IS_FAST },
	72	{ "SSE3", SSE2 \| X265_CPU_SSE3 },
	73	{ "SSSE3", SSE2 \| X265_CPU_SSE3 \| X265_CPU_SSSE3 },
	74	{ "SSE4.1", SSE2 \| X265_CPU_SSE3 \| X265_CPU_SSSE3 \| X265_CPU_SSE4 },
	75	{ "SSE4", SSE2 \| X265_CPU_SSE3 \| X265_CPU_SSSE3 \| X265_CPU_SSE4 },
	76	{ "SSE4.2", SSE2 \| X265_CPU_SSE3 \| X265_CPU_SSSE3 \| X265_CPU_SSE4 \| X265_CPU_SSE42 },
	77	#define AVX SSE2 \| X265_CPU_SSE3 \| X265_CPU_SSSE3 \| X265_CPU_SSE4 \| X265_CPU_SSE42 \| X265_CPU_AVX
	78	{ "AVX", AVX },
	79	{ "XOP", AVX \| X265_CPU_XOP },
	80	{ "FMA4", AVX \| X265_CPU_FMA4 },
	81	{ "AVX2", AVX \| X265_CPU_AVX2 },
	82	{ "FMA3", AVX \| X265_CPU_FMA3 },
	83	#undef AVX
	84	#undef SSE2
	85	#undef MMX2
	86	{ "Cache32", X265_CPU_CACHELINE_32 },
	87	{ "Cache64", X265_CPU_CACHELINE_64 },
	88	{ "LZCNT", X265_CPU_LZCNT },
	89	{ "BMI1", X265_CPU_BMI1 },
	90	{ "BMI2", X265_CPU_BMI1 \| X265_CPU_BMI2 },
	91	{ "SlowCTZ", X265_CPU_SLOW_CTZ },
	92	{ "SlowAtom", X265_CPU_SLOW_ATOM },
	93	{ "SlowPshufb", X265_CPU_SLOW_PSHUFB },
	94	{ "SlowPalignr", X265_CPU_SLOW_PALIGNR },
	95	{ "SlowShuffle", X265_CPU_SLOW_SHUFFLE },
	96	{ "UnalignedStack", X265_CPU_STACK_MOD4 },
	97
	98	#elif X265_ARCH_ARM
	99	{ "ARMv6", X265_CPU_ARMV6 },
	100	{ "NEON", X265_CPU_NEON },
	101	{ "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
	102	#endif // if X265_ARCH_X86
	103	{ "", 0 },
	104	};
	105
	106	#if X265_ARCH_X86
	107
	108	extern "C" {
	109	/* cpu-a.asm */
	110	int x265_cpu_cpuid_test(void);
	111	void x265_cpu_cpuid(uint32_t op, uint32_t eax, uint32_t ebx, uint32_t ecx, uint32_t edx);
	112	void x265_cpu_xgetbv(uint32_t op, uint32_t eax, uint32_t edx);
	113	}
	114
	115	#if defined(_MSC_VER)
	116	#pragma warning(disable: 4309) // truncation of constant value
	117	#endif
	118
	119	uint32_t cpu_detect(void)
	120	{
	121	uint32_t cpu = 0;
	122
	123	uint32_t eax, ebx, ecx, edx;
	124	uint32_t vendor[4] = { 0 };
	125	uint32_t max_extended_cap, max_basic_cap;
	126
	127	#if !X86_64
	128	if (!x265_cpu_cpuid_test())
	129	return 0;
	130	#endif
	131
	132	x265_cpu_cpuid(0, &eax, vendor + 0, vendor + 2, vendor + 1);
	133	max_basic_cap = eax;
	134	if (max_basic_cap == 0)
	135	return 0;
	136
	137	x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
	138	if (edx & 0x00800000)
	139	cpu \|= X265_CPU_MMX;
	140	else
	141	return cpu;
	142	if (edx & 0x02000000)
	143	cpu \|= X265_CPU_MMX2 \| X265_CPU_SSE;
	144	if (edx & 0x00008000)
	145	cpu \|= X265_CPU_CMOV;
	146	else
	147	return cpu;
	148	if (edx & 0x04000000)
	149	cpu \|= X265_CPU_SSE2;
	150	if (ecx & 0x00000001)
	151	cpu \|= X265_CPU_SSE3;
	152	if (ecx & 0x00000200)
	153	cpu \|= X265_CPU_SSSE3;
	154	if (ecx & 0x00080000)
	155	cpu \|= X265_CPU_SSE4;
	156	if (ecx & 0x00100000)
	157	cpu \|= X265_CPU_SSE42;
	158	/* Check OXSAVE and AVX bits */
	159	if ((ecx & 0x18000000) == 0x18000000)
	160	{
	161	/* Check for OS support */
	162	x265_cpu_xgetbv(0, &eax, &edx);
	163	if ((eax & 0x6) == 0x6)
	164	{
	165	cpu \|= X265_CPU_AVX;
	166	if (ecx & 0x00001000)
	167	cpu \|= X265_CPU_FMA3;
	168	}
	169	}
	170
	171	if (max_basic_cap >= 7)
	172	{
	173	x265_cpu_cpuid(7, &eax, &ebx, &ecx, &edx);
	174	/* AVX2 requires OS support, but BMI1/2 don't. */
	175	if ((cpu & X265_CPU_AVX) && (ebx & 0x00000020))
	176	cpu \|= X265_CPU_AVX2;
	177	if (ebx & 0x00000008)
	178	{
	179	cpu \|= X265_CPU_BMI1;
	180	if (ebx & 0x00000100)
	181	cpu \|= X265_CPU_BMI2;
	182	}
	183	}
	184
	185	if (cpu & X265_CPU_SSSE3)
	186	cpu \|= X265_CPU_SSE2_IS_FAST;
	187
	188	x265_cpu_cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
	189	max_extended_cap = eax;
	190
	191	if (max_extended_cap >= 0x80000001)
	192	{
	193	x265_cpu_cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
	194
	195	if (ecx & 0x00000020)
	196	cpu \|= X265_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */
	197	if (ecx & 0x00000040) /* SSE4a, AMD only */
	198	{
	199	int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
	200	cpu \|= X265_CPU_SSE2_IS_FAST; /* Phenom and later CPUs have fast SSE units */
	201	if (family == 0x14)
	202	{
	203	cpu &= ~X265_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
	204	cpu \|= X265_CPU_SSE2_IS_SLOW; /* Bobcat has 64-bit SIMD units */
	205	cpu \|= X265_CPU_SLOW_PALIGNR; /* palignr is insanely slow on Bobcat */
	206	}
	207	if (family == 0x16)
	208	{
	209	cpu \|= X265_CPU_SLOW_PSHUFB; /* Jaguar's pshufb isn't that slow, but it's slow enough
	210	* compared to alternate instruction sequences that this
	211	* is equal or faster on almost all such functions. */
	212	}
	213	}
	214
	215	if (cpu & X265_CPU_AVX)
	216	{
	217	if (ecx & 0x00000800) /* XOP */
	218	cpu \|= X265_CPU_XOP;
	219	if (ecx & 0x00010000) /* FMA4 */
	220	cpu \|= X265_CPU_FMA4;
	221	}
	222
	223	if (!strcmp((char*)vendor, "AuthenticAMD"))
	224	{
	225	if (edx & 0x00400000)
	226	cpu \|= X265_CPU_MMX2;
	227	if (!(cpu & X265_CPU_LZCNT))
	228	cpu \|= X265_CPU_SLOW_CTZ;
	229	if ((cpu & X265_CPU_SSE2) && !(cpu & X265_CPU_SSE2_IS_FAST))
	230	cpu \|= X265_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
	231	}
	232	}
	233
	234	if (!strcmp((char*)vendor, "GenuineIntel"))
	235	{
	236	x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
	237	int family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
	238	int model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
	239	if (family == 6)
	240	{
	241	/* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
	242	* theoretically support sse2, but it's significantly slower than mmx for
	243	* almost all of x264's functions, so let's just pretend they don't. */
	244	if (model == 9 \|\| model == 13 \|\| model == 14)
	245	{
	246	cpu &= ~(X265_CPU_SSE2 \| X265_CPU_SSE3);
	247	X265_CHECK(!(cpu & (X265_CPU_SSSE3 \| X265_CPU_SSE4)), "unexpected CPU ID %d\n", cpu);
	248	}
	249	/* Detect Atom CPU */
	250	else if (model == 28)
	251	{
	252	cpu \|= X265_CPU_SLOW_ATOM;
	253	cpu \|= X265_CPU_SLOW_CTZ;
	254	cpu \|= X265_CPU_SLOW_PSHUFB;
	255	}
	256
	257	/* Conroe has a slow shuffle unit. Check the model number to make sure not
	258	* to include crippled low-end Penryns and Nehalems that don't have SSE4. */
	259	else if ((cpu & X265_CPU_SSSE3) && !(cpu & X265_CPU_SSE4) && model < 23)
	260	cpu \|= X265_CPU_SLOW_SHUFFLE;
	261	}
	262	}
	263
	264	if ((!strcmp((char)vendor, "GenuineIntel") \|\| !strcmp((char)vendor, "CyrixInstead")) && !(cpu & X265_CPU_SSE42))
	265	{
	266	/* cacheline size is specified in 3 places, any of which may be missing */
	267	x265_cpu_cpuid(1, &eax, &ebx, &ecx, &edx);
	268	int cache = (ebx & 0xff00) >> 5; // cflush size
	269	if (!cache && max_extended_cap >= 0x80000006)
	270	{
	271	x265_cpu_cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
	272	cache = ecx & 0xff; // cacheline size
	273	}
	274	if (!cache && max_basic_cap >= 2)
	275	{
	276	// Cache and TLB Information
	277	static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
	278	static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67,
	279	0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
	280	uint32_t buf[4];
	281	int max, i = 0;
	282	do
	283	{
	284	x265_cpu_cpuid(2, buf + 0, buf + 1, buf + 2, buf + 3);
	285	max = buf[0] & 0xff;
	286	buf[0] &= ~0xff;
	287	for (int j = 0; j < 4; j++)
	288	{
	289	if (!(buf[j] >> 31))
	290	while (buf[j])
	291	{
	292	if (strchr(cache32_ids, buf[j] & 0xff))
	293	cache = 32;
	294	if (strchr(cache64_ids, buf[j] & 0xff))
	295	cache = 64;
	296	buf[j] >>= 8;
	297	}
	298	}
	299	}
	300	while (++i < max);
	301	}
	302
	303	if (cache == 32)
	304	cpu \|= X265_CPU_CACHELINE_32;
	305	else if (cache == 64)
	306	cpu \|= X265_CPU_CACHELINE_64;
	307	else
	308	x265_log(NULL, X265_LOG_WARNING, "unable to determine cacheline size\n");
	309	}
	310
	311	#if BROKEN_STACK_ALIGNMENT
	312	cpu \|= X265_CPU_STACK_MOD4;
	313	#endif
	314
	315	return cpu;
	316	}
	317
	318	#elif X265_ARCH_ARM
	319
	320	extern "C" {
	321	void x265_cpu_neon_test(void);
	322	int x265_cpu_fast_neon_mrc_test(void);
	323	}
	324
	325	uint32_t cpu_detect(void)
	326	{
	327	int flags = 0;
	328
	329	#if HAVE_ARMV6
	330	flags \|= X265_CPU_ARMV6;
	331
	332	// don't do this hack if compiled with -mfpu=neon
	333	#if !HAVE_NEON
	334	static void (* oldsig)(int);
	335	oldsig = signal(SIGILL, sigill_handler);
	336	if (sigsetjmp(jmpbuf, 1))
	337	{
	338	signal(SIGILL, oldsig);
	339	return flags;
	340	}
	341
	342	canjump = 1;
	343	x265_cpu_neon_test();
	344	canjump = 0;
	345	signal(SIGILL, oldsig);
	346	#endif // if !HAVE_NEON
	347
	348	flags \|= X265_CPU_NEON;
	349
	350	// fast neon -> arm (Cortex-A9) detection relies on user access to the
	351	// cycle counter; this assumes ARMv7 performance counters.
	352	// NEON requires at least ARMv7, ARMv8 may require changes here, but
	353	// hopefully this hacky detection method will have been replaced by then.
	354	// Note that there is potential for a race condition if another program or
	355	// x264 instance disables or reinits the counters while x264 is using them,
	356	// which may result in incorrect detection and the counters stuck enabled.
	357	// right now Apple does not seem to support performance counters for this test
	358	#ifndef __MACH__
	359	flags \|= x265_cpu_fast_neon_mrc_test() ? X265_CPU_FAST_NEON_MRC : 0;
	360	#endif
	361	// TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
	362	#endif // if HAVE_ARMV6
	363	return flags;
	364	}
	365
	366	#else // if X265_ARCH_X86
	367
	368	uint32_t cpu_detect(void)
	369	{
	370	return 0;
	371	}
	372
	373	#endif // if X265_ARCH_X86
	374	}