/*****************************************************************************
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/
| 23 | |
| 24 | #include "common.h" |
| 25 | #include "primitives.h" |
| 26 | |
| 27 | namespace x265 { |
| 28 | // x265 private namespace |
| 29 | |
/* Maps a luma block size (width, height) to its LUMA_WxH partition enum.
 * Rows correspond to block width and columns to block height, both in
 * steps of 4 (see the header comment row), so the lookup index appears to
 * be ((width >> 2) - 1) * 16 + ((height >> 2) - 1) -- NOTE(review):
 * formula inferred from the table layout; confirm against callers.
 * Entries of 255 mark (width, height) pairs that are not valid HEVC luma
 * inter partitions. */
extern const uint8_t lumaPartitionMapTable[] =
{
//  4          8          12          16          20  24          28  32          36  40  44  48          52  56  60  64
    LUMA_4x4,  LUMA_4x8,  255,        LUMA_4x16,  255, 255,        255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 4
    LUMA_8x4,  LUMA_8x8,  255,        LUMA_8x16,  255, 255,        255, LUMA_8x32,  255, 255, 255, 255,        255, 255, 255, 255,        // 8
    255,       255,       255,        LUMA_12x16, 255, 255,        255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 12
    LUMA_16x4, LUMA_16x8, LUMA_16x12, LUMA_16x16, 255, 255,        255, LUMA_16x32, 255, 255, 255, 255,        255, 255, 255, LUMA_16x64, // 16
    255,       255,       255,        255,        255, 255,        255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 20
    255,       255,       255,        255,        255, 255,        255, LUMA_24x32, 255, 255, 255, 255,        255, 255, 255, 255,        // 24
    255,       255,       255,        255,        255, 255,        255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 28
    255,       LUMA_32x8, 255,        LUMA_32x16, 255, LUMA_32x24, 255, LUMA_32x32, 255, 255, 255, 255,        255, 255, 255, LUMA_32x64, // 32
    255,       255,       255,        255,        255, 255,        255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 36
    255,       255,       255,        255,        255, 255,        255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 40
    255,       255,       255,        255,        255, 255,        255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 44
    255,       255,       255,        255,        255, 255,        255, 255,        255, 255, 255, 255,        255, 255, 255, LUMA_48x64, // 48
    255,       255,       255,        255,        255, 255,        255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 52
    255,       255,       255,        255,        255, 255,        255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 56
    255,       255,       255,        255,        255, 255,        255, 255,        255, 255, 255, 255,        255, 255, 255, 255,        // 60
    255,       255,       255,        LUMA_64x16, 255, 255,        255, LUMA_64x32, 255, 255, 255, LUMA_64x48, 255, 255, 255, LUMA_64x64  // 64
};
| 50 | |
/* the "authoritative" set of encoder primitives */
EncoderPrimitives primitives;

/* Forward declarations for the per-module C reference initializers.
 * Each populates one family of function pointers in EncoderPrimitives;
 * the defining .cpp file is noted where each is invoked. */
void Setup_C_PixelPrimitives(EncoderPrimitives &p);
void Setup_C_DCTPrimitives(EncoderPrimitives &p);
void Setup_C_IPFilterPrimitives(EncoderPrimitives &p);
void Setup_C_IPredPrimitives(EncoderPrimitives &p);
void Setup_C_LoopFilterPrimitives(EncoderPrimitives &p);
| 59 | |
/* Install the portable C reference implementation of every encoder
 * primitive into 'p'.  Run before any intrinsic/assembly overrides so
 * that every function pointer has a valid fallback target. */
void Setup_C_Primitives(EncoderPrimitives &p)
{
    Setup_C_PixelPrimitives(p);      // pixel.cpp
    Setup_C_DCTPrimitives(p);        // dct.cpp
    Setup_C_IPFilterPrimitives(p);   // ipfilter.cpp
    Setup_C_IPredPrimitives(p);      // intrapred.cpp
    Setup_C_LoopFilterPrimitives(p); // loopfilter.cpp
}
| 68 | |
| 69 | void Setup_Alias_Primitives(EncoderPrimitives &p) |
| 70 | { |
| 71 | /* copy reusable luma primitives to chroma 4:4:4 */ |
| 72 | for (int i = 0; i < NUM_LUMA_PARTITIONS; i++) |
| 73 | { |
| 74 | p.chroma[X265_CSP_I444].copy_pp[i] = p.luma_copy_pp[i]; |
| 75 | p.chroma[X265_CSP_I444].copy_ps[i] = p.luma_copy_ps[i]; |
| 76 | p.chroma[X265_CSP_I444].copy_sp[i] = p.luma_copy_sp[i]; |
| 77 | p.chroma[X265_CSP_I444].copy_ss[i] = p.luma_copy_ss[i]; |
| 78 | p.chroma[X265_CSP_I444].addAvg[i] = p.luma_addAvg[i]; |
| 79 | p.chroma[X265_CSP_I444].satd[i] = p.satd[i]; |
| 80 | } |
| 81 | |
| 82 | for (int i = 0; i < NUM_SQUARE_BLOCKS; i++) |
| 83 | { |
| 84 | p.chroma[X265_CSP_I444].add_ps[i] = p.luma_add_ps[i]; |
| 85 | p.chroma[X265_CSP_I444].sub_ps[i] = p.luma_sub_ps[i]; |
| 86 | } |
| 87 | |
| 88 | primitives.sa8d[BLOCK_4x4] = primitives.sa8d_inter[LUMA_4x4]; |
| 89 | primitives.sa8d[BLOCK_8x8] = primitives.sa8d_inter[LUMA_8x8]; |
| 90 | primitives.sa8d[BLOCK_16x16] = primitives.sa8d_inter[LUMA_16x16]; |
| 91 | primitives.sa8d[BLOCK_32x32] = primitives.sa8d_inter[LUMA_32x32]; |
| 92 | primitives.sa8d[BLOCK_64x64] = primitives.sa8d_inter[LUMA_64x64]; |
| 93 | |
| 94 | // SA8D devolves to SATD for blocks not even multiples of 8x8 |
| 95 | primitives.sa8d_inter[LUMA_4x4] = primitives.satd[LUMA_4x4]; |
| 96 | primitives.sa8d_inter[LUMA_4x8] = primitives.satd[LUMA_4x8]; |
| 97 | primitives.sa8d_inter[LUMA_4x16] = primitives.satd[LUMA_4x16]; |
| 98 | primitives.sa8d_inter[LUMA_8x4] = primitives.satd[LUMA_8x4]; |
| 99 | primitives.sa8d_inter[LUMA_16x4] = primitives.satd[LUMA_16x4]; |
| 100 | primitives.sa8d_inter[LUMA_16x12] = primitives.satd[LUMA_16x12]; |
| 101 | primitives.sa8d_inter[LUMA_12x16] = primitives.satd[LUMA_12x16]; |
| 102 | |
| 103 | // Chroma SATD can often reuse luma primitives |
| 104 | p.chroma[X265_CSP_I420].satd[CHROMA_4x4] = primitives.satd[LUMA_4x4]; |
| 105 | p.chroma[X265_CSP_I420].satd[CHROMA_8x8] = primitives.satd[LUMA_8x8]; |
| 106 | p.chroma[X265_CSP_I420].satd[CHROMA_16x16] = primitives.satd[LUMA_16x16]; |
| 107 | p.chroma[X265_CSP_I420].satd[CHROMA_32x32] = primitives.satd[LUMA_32x32]; |
| 108 | |
| 109 | p.chroma[X265_CSP_I420].satd[CHROMA_8x4] = primitives.satd[LUMA_8x4]; |
| 110 | p.chroma[X265_CSP_I420].satd[CHROMA_4x8] = primitives.satd[LUMA_4x8]; |
| 111 | p.chroma[X265_CSP_I420].satd[CHROMA_16x8] = primitives.satd[LUMA_16x8]; |
| 112 | p.chroma[X265_CSP_I420].satd[CHROMA_8x16] = primitives.satd[LUMA_8x16]; |
| 113 | p.chroma[X265_CSP_I420].satd[CHROMA_32x16] = primitives.satd[LUMA_32x16]; |
| 114 | p.chroma[X265_CSP_I420].satd[CHROMA_16x32] = primitives.satd[LUMA_16x32]; |
| 115 | |
| 116 | p.chroma[X265_CSP_I420].satd[CHROMA_16x12] = primitives.satd[LUMA_16x12]; |
| 117 | p.chroma[X265_CSP_I420].satd[CHROMA_12x16] = primitives.satd[LUMA_12x16]; |
| 118 | p.chroma[X265_CSP_I420].satd[CHROMA_16x4] = primitives.satd[LUMA_16x4]; |
| 119 | p.chroma[X265_CSP_I420].satd[CHROMA_4x16] = primitives.satd[LUMA_4x16]; |
| 120 | p.chroma[X265_CSP_I420].satd[CHROMA_32x24] = primitives.satd[LUMA_32x24]; |
| 121 | p.chroma[X265_CSP_I420].satd[CHROMA_24x32] = primitives.satd[LUMA_24x32]; |
| 122 | p.chroma[X265_CSP_I420].satd[CHROMA_32x8] = primitives.satd[LUMA_32x8]; |
| 123 | p.chroma[X265_CSP_I420].satd[CHROMA_8x32] = primitives.satd[LUMA_8x32]; |
| 124 | |
| 125 | p.chroma[X265_CSP_I422].satd[CHROMA422_4x8] = primitives.satd[LUMA_4x8]; |
| 126 | p.chroma[X265_CSP_I422].satd[CHROMA422_8x16] = primitives.satd[LUMA_8x16]; |
| 127 | p.chroma[X265_CSP_I422].satd[CHROMA422_16x32] = primitives.satd[LUMA_16x32]; |
| 128 | p.chroma[X265_CSP_I422].satd[CHROMA422_32x64] = primitives.satd[LUMA_32x64]; |
| 129 | |
| 130 | p.chroma[X265_CSP_I422].satd[CHROMA422_4x4] = primitives.satd[LUMA_4x4]; |
| 131 | p.chroma[X265_CSP_I422].satd[CHROMA422_8x8] = primitives.satd[LUMA_8x8]; |
| 132 | p.chroma[X265_CSP_I422].satd[CHROMA422_4x16] = primitives.satd[LUMA_4x16]; |
| 133 | p.chroma[X265_CSP_I422].satd[CHROMA422_16x16] = primitives.satd[LUMA_16x16]; |
| 134 | p.chroma[X265_CSP_I422].satd[CHROMA422_8x32] = primitives.satd[LUMA_8x32]; |
| 135 | p.chroma[X265_CSP_I422].satd[CHROMA422_32x32] = primitives.satd[LUMA_32x32]; |
| 136 | p.chroma[X265_CSP_I422].satd[CHROMA422_16x64] = primitives.satd[LUMA_16x64]; |
| 137 | |
| 138 | //p.chroma[X265_CSP_I422].satd[CHROMA422_8x12] = satd4<8, 12>; |
| 139 | p.chroma[X265_CSP_I422].satd[CHROMA422_8x4] = primitives.satd[LUMA_8x4]; |
| 140 | //p.chroma[X265_CSP_I422].satd[CHROMA422_16x24] = satd8<16, 24>; |
| 141 | //p.chroma[X265_CSP_I422].satd[CHROMA422_12x32] = satd4<12, 32>; |
| 142 | p.chroma[X265_CSP_I422].satd[CHROMA422_16x8] = primitives.satd[LUMA_16x8]; |
| 143 | //p.chroma[X265_CSP_I422].satd[CHROMA422_4x32] = satd4<4, 32>; |
| 144 | //p.chroma[X265_CSP_I422].satd[CHROMA422_32x48] = satd8<32, 48>; |
| 145 | //p.chroma[X265_CSP_I422].satd[CHROMA422_24x64] = satd8<24, 64>; |
| 146 | p.chroma[X265_CSP_I422].satd[CHROMA422_32x16] = primitives.satd[LUMA_32x16]; |
| 147 | //p.chroma[X265_CSP_I422].satd[CHROMA422_8x64] = satd8<8, 64>; |
| 148 | } |
| 149 | } |
| 150 | using namespace x265; |
| 151 | |
/* Public C entry point: one-time initialization of the global primitive
 * function-pointer tables, then (at INFO log level) report the CPU
 * capabilities in use.
 *
 * cpuid >= 0 - force CPU type
 * cpuid < 0  - auto-detect if uninitialized */
extern "C"
void x265_setup_primitives(x265_param *param, int cpuid)
{
    if (cpuid < 0)
        cpuid = x265::cpu_detect();

    // initialize global variables
    // sad[0] is used as a one-shot guard: it is null only before setup has run
    if (!primitives.sad[0])
    {
        // C reference implementations first, so every pointer is valid...
        Setup_C_Primitives(primitives);

#if ENABLE_ASSEMBLY
        // ...then overwrite with the fastest available optimized versions
        Setup_Instrinsic_Primitives(primitives, cpuid);
        Setup_Assembly_Primitives(primitives, cpuid);
#else
        x265_log(param, X265_LOG_WARNING, "Assembly not supported in this binary\n");
#endif

        // aliases must be wired up last so they capture the final pointers
        Setup_Alias_Primitives(primitives);
    }

    if (param->logLevel >= X265_LOG_INFO)
    {
        // build a space-separated capability list; 'p' walks the write cursor
        char buf[1000];
        char *p = buf + sprintf(buf, "using cpu capabilities:");
        char *none = p;  // remember start so we can detect an empty list
        for (int i = 0; x265::cpu_names[i].flags; i++)
        {
            // suppress flag names that are implied by a stronger flag also
            // present in cpuid (e.g. don't print SSE when SSE2 is set)
            if (!strcmp(x265::cpu_names[i].name, "SSE")
                && (cpuid & X265_CPU_SSE2))
                continue;
            if (!strcmp(x265::cpu_names[i].name, "SSE2")
                && (cpuid & (X265_CPU_SSE2_IS_FAST | X265_CPU_SSE2_IS_SLOW)))
                continue;
            if (!strcmp(x265::cpu_names[i].name, "SSE3")
                && (cpuid & X265_CPU_SSSE3 || !(cpuid & X265_CPU_CACHELINE_64)))
                continue;
            if (!strcmp(x265::cpu_names[i].name, "SSE4.1")
                && (cpuid & X265_CPU_SSE42))
                continue;
            if (!strcmp(x265::cpu_names[i].name, "BMI1")
                && (cpuid & X265_CPU_BMI2))
                continue;
            // print a name only if all of its flag bits are set, skipping
            // duplicate-flag aliases (entries sharing the previous entry's flags)
            if ((cpuid & x265::cpu_names[i].flags) == x265::cpu_names[i].flags
                && (!i || x265::cpu_names[i].flags != x265::cpu_names[i - 1].flags))
                p += sprintf(p, " %s", x265::cpu_names[i].name);
        }

        if (p == none)
            sprintf(p, " none!");
        x265_log(param, X265_LOG_INFO, "%s\n", buf);
    }
}
| 207 | |
#if ENABLE_ASSEMBLY
/* these functions are implemented in assembly. When assembly is not being
 * compiled, they are unnecessary and can be NOPs */
#else
extern "C" {
/* no-op stubs so the linker can resolve the assembly entry points in
 * builds without assembly; they report no CPU features and do nothing */
int x265_cpu_cpuid_test(void) { return 0; }
void x265_cpu_emms(void) {}
void x265_cpu_cpuid(uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *) {}
void x265_cpu_xgetbv(uint32_t, uint32_t *, uint32_t *) {}
}
#endif