| 1 | /***************************************************************************** |
| 2 | * Copyright (C) 2013 x265 project |
| 3 | * |
| 4 | * Authors: Steve Borho <steve@borho.org> |
| 5 | * |
| 6 | * This program is free software; you can redistribute it and/or modify |
| 7 | * it under the terms of the GNU General Public License as published by |
| 8 | * the Free Software Foundation; either version 2 of the License, or |
| 9 | * (at your option) any later version. |
| 10 | * |
| 11 | * This program is distributed in the hope that it will be useful, |
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 14 | * GNU General Public License for more details. |
| 15 | * |
| 16 | * You should have received a copy of the GNU General Public License |
| 17 | * along with this program; if not, write to the Free Software |
| 18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
| 19 | * |
| 20 | * This program is also available under a commercial proprietary license. |
| 21 | * For more information, contact us at license @ x265.com. |
| 22 | *****************************************************************************/ |
| 23 | |
| 24 | #include "common.h" |
| 25 | #include "primitives.h" |
| 26 | |
| 27 | namespace x265 { |
| 28 | // x265 private namespace |
| 29 | |
/* Flattened 16 x 16 lookup table of luma partition identifiers.
 *
 * As the row and column labels below show, rows select the block width and
 * columns select the block height, both stepping by 4 from 4 to 64.  The
 * entry for a W x H block therefore sits at index
 *     ((W / 4) - 1) * 16 + ((H / 4) - 1)
 * Every slot that has no LUMA_WxH identifier holds 255.
 * NOTE(review): 255 presumably means "no such partition" -- confirm against
 * the code that indexes this table. */
extern const uint8_t lumaPartitionMapTable[] =
{
    // 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60 64
    LUMA_4x4, LUMA_4x8, 255, LUMA_4x16, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 4
    LUMA_8x4, LUMA_8x8, 255, LUMA_8x16, 255, 255, 255, LUMA_8x32, 255, 255, 255, 255, 255, 255, 255, 255, // 8
    255, 255, 255, LUMA_12x16, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 12
    LUMA_16x4, LUMA_16x8, LUMA_16x12, LUMA_16x16, 255, 255, 255, LUMA_16x32, 255, 255, 255, 255, 255, 255, 255, LUMA_16x64, // 16
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 20
    255, 255, 255, 255, 255, 255, 255, LUMA_24x32, 255, 255, 255, 255, 255, 255, 255, 255, // 24
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 28
    255, LUMA_32x8, 255, LUMA_32x16, 255, LUMA_32x24, 255, LUMA_32x32, 255, 255, 255, 255, 255, 255, 255, LUMA_32x64, // 32
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 36
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 40
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 44
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, LUMA_48x64, // 48
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 52
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 56
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 60
    255, 255, 255, LUMA_64x16, 255, 255, 255, LUMA_64x32, 255, 255, 255, LUMA_64x48, 255, 255, 255, LUMA_64x64 // 64
};
| 50 | |
/* the "authoritative" set of encoder primitives.
 *
 * x265_setup_primitives() populates this object the first time it runs and
 * treats a null primitives.sad[0] as "not populated yet".
 * NOTE(review): that check relies on this namespace-scope object starting
 * out zeroed -- confirm EncoderPrimitives has no constructor that changes
 * that. */
EncoderPrimitives primitives;
| 53 | |
| 54 | void Setup_C_PixelPrimitives(EncoderPrimitives &p); |
| 55 | void Setup_C_DCTPrimitives(EncoderPrimitives &p); |
| 56 | void Setup_C_IPFilterPrimitives(EncoderPrimitives &p); |
| 57 | void Setup_C_IPredPrimitives(EncoderPrimitives &p); |
| 58 | void Setup_C_LoopFilterPrimitives(EncoderPrimitives &p); |
| 59 | |
| 60 | void Setup_C_Primitives(EncoderPrimitives &p) |
| 61 | { |
| 62 | Setup_C_PixelPrimitives(p); // pixel.cpp |
| 63 | Setup_C_DCTPrimitives(p); // dct.cpp |
| 64 | Setup_C_IPFilterPrimitives(p); // ipfilter.cpp |
| 65 | Setup_C_IPredPrimitives(p); // intrapred.cpp |
| 66 | Setup_C_LoopFilterPrimitives(p); // loopfilter.cpp |
| 67 | } |
| 68 | |
| 69 | void Setup_Alias_Primitives(EncoderPrimitives &p) |
| 70 | { |
| 71 | /* copy reusable luma primitives to chroma 4:4:4 */ |
| 72 | for (int i = 0; i < NUM_LUMA_PARTITIONS; i++) |
| 73 | { |
| 74 | p.chroma[X265_CSP_I444].copy_pp[i] = p.luma_copy_pp[i]; |
| 75 | p.chroma[X265_CSP_I444].copy_ps[i] = p.luma_copy_ps[i]; |
| 76 | p.chroma[X265_CSP_I444].copy_sp[i] = p.luma_copy_sp[i]; |
| 77 | p.chroma[X265_CSP_I444].copy_ss[i] = p.luma_copy_ss[i]; |
| 78 | p.chroma[X265_CSP_I444].addAvg[i] = p.luma_addAvg[i]; |
| 79 | } |
| 80 | |
| 81 | for (int i = 0; i < NUM_SQUARE_BLOCKS; i++) |
| 82 | { |
| 83 | p.chroma[X265_CSP_I444].add_ps[i] = p.luma_add_ps[i]; |
| 84 | p.chroma[X265_CSP_I444].sub_ps[i] = p.luma_sub_ps[i]; |
| 85 | } |
| 86 | |
| 87 | for (int i = 0; i < NUM_SQUARE_BLOCKS; i++) |
| 88 | { |
| 89 | int partL = partitionFromLog2Size(i + 2); |
| 90 | p.square_copy_pp[i] = p.luma_copy_pp[partL]; |
| 91 | p.square_copy_ps[i] = p.luma_copy_ps[partL]; |
| 92 | p.square_copy_sp[i] = p.luma_copy_sp[partL]; |
| 93 | p.square_copy_ss[i] = p.luma_copy_ss[partL]; |
| 94 | } |
| 95 | |
| 96 | primitives.sa8d[BLOCK_4x4] = primitives.sa8d_inter[LUMA_4x4]; |
| 97 | primitives.sa8d[BLOCK_8x8] = primitives.sa8d_inter[LUMA_8x8]; |
| 98 | primitives.sa8d[BLOCK_16x16] = primitives.sa8d_inter[LUMA_16x16]; |
| 99 | primitives.sa8d[BLOCK_32x32] = primitives.sa8d_inter[LUMA_32x32]; |
| 100 | primitives.sa8d[BLOCK_64x64] = primitives.sa8d_inter[LUMA_64x64]; |
| 101 | |
| 102 | // SA8D devolves to SATD for blocks not even multiples of 8x8 |
| 103 | primitives.sa8d_inter[LUMA_4x4] = primitives.satd[LUMA_4x4]; |
| 104 | primitives.sa8d_inter[LUMA_4x8] = primitives.satd[LUMA_4x8]; |
| 105 | primitives.sa8d_inter[LUMA_4x16] = primitives.satd[LUMA_4x16]; |
| 106 | primitives.sa8d_inter[LUMA_8x4] = primitives.satd[LUMA_8x4]; |
| 107 | primitives.sa8d_inter[LUMA_16x4] = primitives.satd[LUMA_16x4]; |
| 108 | primitives.sa8d_inter[LUMA_16x12] = primitives.satd[LUMA_16x12]; |
| 109 | primitives.sa8d_inter[LUMA_12x16] = primitives.satd[LUMA_12x16]; |
| 110 | } |
| 111 | } |
| 112 | using namespace x265; |
| 113 | |
| 114 | /* cpuid >= 0 - force CPU type |
| 115 | * cpuid < 0 - auto-detect if uninitialized */ |
| 116 | extern "C" |
| 117 | void x265_setup_primitives(x265_param *param, int cpuid) |
| 118 | { |
| 119 | if (cpuid < 0) |
| 120 | cpuid = x265::cpu_detect(); |
| 121 | |
| 122 | // initialize global variables |
| 123 | if (!primitives.sad[0]) |
| 124 | { |
| 125 | Setup_C_Primitives(primitives); |
| 126 | Setup_Instrinsic_Primitives(primitives, cpuid); |
| 127 | |
| 128 | #if ENABLE_ASSEMBLY |
| 129 | Setup_Assembly_Primitives(primitives, cpuid); |
| 130 | #else |
| 131 | x265_log(param, X265_LOG_WARNING, "Assembly not supported in this binary\n"); |
| 132 | #endif |
| 133 | |
| 134 | Setup_Alias_Primitives(primitives); |
| 135 | |
| 136 | initROM(); |
| 137 | } |
| 138 | |
| 139 | if (param->logLevel >= X265_LOG_INFO) |
| 140 | { |
| 141 | char buf[1000]; |
| 142 | char *p = buf + sprintf(buf, "using cpu capabilities:"); |
| 143 | char *none = p; |
| 144 | for (int i = 0; x265::cpu_names[i].flags; i++) |
| 145 | { |
| 146 | if (!strcmp(x265::cpu_names[i].name, "SSE") |
| 147 | && (cpuid & X265_CPU_SSE2)) |
| 148 | continue; |
| 149 | if (!strcmp(x265::cpu_names[i].name, "SSE2") |
| 150 | && (cpuid & (X265_CPU_SSE2_IS_FAST | X265_CPU_SSE2_IS_SLOW))) |
| 151 | continue; |
| 152 | if (!strcmp(x265::cpu_names[i].name, "SSE3") |
| 153 | && (cpuid & X265_CPU_SSSE3 || !(cpuid & X265_CPU_CACHELINE_64))) |
| 154 | continue; |
| 155 | if (!strcmp(x265::cpu_names[i].name, "SSE4.1") |
| 156 | && (cpuid & X265_CPU_SSE42)) |
| 157 | continue; |
| 158 | if (!strcmp(x265::cpu_names[i].name, "BMI1") |
| 159 | && (cpuid & X265_CPU_BMI2)) |
| 160 | continue; |
| 161 | if ((cpuid & x265::cpu_names[i].flags) == x265::cpu_names[i].flags |
| 162 | && (!i || x265::cpu_names[i].flags != x265::cpu_names[i - 1].flags)) |
| 163 | p += sprintf(p, " %s", x265::cpu_names[i].name); |
| 164 | } |
| 165 | |
| 166 | if (p == none) |
| 167 | sprintf(p, " none!"); |
| 168 | x265_log(param, X265_LOG_INFO, "%s\n", buf); |
| 169 | } |
| 170 | } |
| 171 | |
| 172 | #if !defined(ENABLE_ASSEMBLY) |
| 173 | #if defined(_MSC_VER) |
| 174 | #include <intrin.h> |
| 175 | #endif |
| 176 | |
| 177 | extern "C" { |
/* No-op x265_cpu_emms() for builds without assembly: the intrinsic
 * primitives will not use MMX instructions, so there should be no reason to
 * use EMMS. */
void x265_cpu_emms(void)
{
}
| 181 | |
| 182 | #if defined(X265_ARCH_X86) |
| 183 | |
#if defined(_MSC_VER)
/* MSVC: __cpuidex() comes from the compiler (this file includes <intrin.h>
 * for MSVC).  C4100 is "unreferenced formal parameter". */
# pragma warning(disable: 4100)
#elif defined(__GNUC__) || defined(__clang__) // use inline assembly, Gnu/AT&T syntax
/* Same call shape as the MSVC intrinsic: runs CPUID with EAX = level and
 * ECX = index and stores EAX, EBX, ECX, EDX into regsArray[0..3].
 * The definition deliberately has no trailing semicolon, so the macro is
 * used like a function call (`__cpuidex(r, l, i);`) and remains valid as the
 * unbraced body of an if/else. */
# define __cpuidex(regsArray, level, index) \
    __asm__ __volatile__ ("cpuid" \
                          : "=a" ((regsArray)[0]), "=b" ((regsArray)[1]), "=c" ((regsArray)[2]), "=d" ((regsArray)[3]) \
                          : "0" (level), "2" (index))
#else
# error "compiler not supported"
#endif
| 194 | |
/* Version of x265_cpu_cpuid_test() compiled when assembly is disabled:
 * performs no check and unconditionally returns 0. */
int x265_cpu_cpuid_test(void)
{
    const int result = 0;

    return result;
}
| 199 | |
| 200 | void x265_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) |
| 201 | { |
| 202 | int output[4]; |
| 203 | |
| 204 | __cpuidex(output, op, 0); |
| 205 | *eax = output[0]; |
| 206 | *ebx = output[1]; |
| 207 | *ecx = output[2]; |
| 208 | *edx = output[3]; |
| 209 | } |
| 210 | |
/* Read the extended control register selected by `op` with XGETBV.
 *
 * op  - register index handed to XGETBV
 * eax - receives the low 32 bits of the result
 * edx - receives the high 32 bits of the result
 *
 * When X265_ARCH_X86 is not set, or the compiler offers neither _xgetbv()
 * nor GNU inline assembly, both outputs are set to 0. */
void x265_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx)
{
#if X265_ARCH_X86 && ((defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1200))

    // MSVC 2010 SP1 or later, or similar Intel release
    const uint64_t xcr = _xgetbv(op);
    *eax = (uint32_t)xcr;
    *edx = (uint32_t)(xcr >> 32);

#elif X265_ARCH_X86 && (defined(__GNUC__) || defined(__clang__))

    // use inline assembly, Gnu/AT&T syntax
    uint32_t lo, hi;
    __asm("xgetbv" : "=a" (lo), "=d" (hi) : "c" (op) :);
    *eax = lo;
    *edx = hi;

#else

    // Not x86, or an older compiler (on x64 with older compilers this is
    // impossible): report zero.
    (void)op;
    *eax = 0;
    *edx = 0;

#endif
}
| 239 | |
| 240 | #endif // X265_ARCH_X86 |
| 241 | } |
#endif // if !defined(ENABLE_ASSEMBLY)