Imported Upstream version 1.4
[deb_x265.git] / source / common / primitives.cpp
1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
3 *
4 * Authors: Steve Borho <steve@borho.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
19 *
20 * This program is also available under a commercial proprietary license.
21 * For more information, contact us at license @ x265.com.
22 *****************************************************************************/
23
24 #include "common.h"
25 #include "primitives.h"
26
27 namespace x265 {
28 // x265 private namespace
29
30 extern const uint8_t lumaPartitionMapTable[] =
31 {
32 // 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60 64
33 LUMA_4x4, LUMA_4x8, 255, LUMA_4x16, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 4
34 LUMA_8x4, LUMA_8x8, 255, LUMA_8x16, 255, 255, 255, LUMA_8x32, 255, 255, 255, 255, 255, 255, 255, 255, // 8
35 255, 255, 255, LUMA_12x16, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 12
36 LUMA_16x4, LUMA_16x8, LUMA_16x12, LUMA_16x16, 255, 255, 255, LUMA_16x32, 255, 255, 255, 255, 255, 255, 255, LUMA_16x64, // 16
37 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 20
38 255, 255, 255, 255, 255, 255, 255, LUMA_24x32, 255, 255, 255, 255, 255, 255, 255, 255, // 24
39 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 28
40 255, LUMA_32x8, 255, LUMA_32x16, 255, LUMA_32x24, 255, LUMA_32x32, 255, 255, 255, 255, 255, 255, 255, LUMA_32x64, // 32
41 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 36
42 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 40
43 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 44
44 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, LUMA_48x64, // 48
45 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 52
46 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 56
47 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 60
48 255, 255, 255, LUMA_64x16, 255, 255, 255, LUMA_64x32, 255, 255, 255, LUMA_64x48, 255, 255, 255, LUMA_64x64 // 64
49 };
50
51 /* the "authoritative" set of encoder primitives */
52 EncoderPrimitives primitives;
53
54 void Setup_C_PixelPrimitives(EncoderPrimitives &p);
55 void Setup_C_DCTPrimitives(EncoderPrimitives &p);
56 void Setup_C_IPFilterPrimitives(EncoderPrimitives &p);
57 void Setup_C_IPredPrimitives(EncoderPrimitives &p);
58 void Setup_C_LoopFilterPrimitives(EncoderPrimitives &p);
59
60 void Setup_C_Primitives(EncoderPrimitives &p)
61 {
62 Setup_C_PixelPrimitives(p); // pixel.cpp
63 Setup_C_DCTPrimitives(p); // dct.cpp
64 Setup_C_IPFilterPrimitives(p); // ipfilter.cpp
65 Setup_C_IPredPrimitives(p); // intrapred.cpp
66 Setup_C_LoopFilterPrimitives(p); // loopfilter.cpp
67 }
68
69 void Setup_Alias_Primitives(EncoderPrimitives &p)
70 {
71 /* copy reusable luma primitives to chroma 4:4:4 */
72 for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
73 {
74 p.chroma[X265_CSP_I444].copy_pp[i] = p.luma_copy_pp[i];
75 p.chroma[X265_CSP_I444].copy_ps[i] = p.luma_copy_ps[i];
76 p.chroma[X265_CSP_I444].copy_sp[i] = p.luma_copy_sp[i];
77 p.chroma[X265_CSP_I444].copy_ss[i] = p.luma_copy_ss[i];
78 p.chroma[X265_CSP_I444].addAvg[i] = p.luma_addAvg[i];
79 }
80
81 for (int i = 0; i < NUM_SQUARE_BLOCKS; i++)
82 {
83 p.chroma[X265_CSP_I444].add_ps[i] = p.luma_add_ps[i];
84 p.chroma[X265_CSP_I444].sub_ps[i] = p.luma_sub_ps[i];
85 }
86
87 for (int i = 0; i < NUM_SQUARE_BLOCKS; i++)
88 {
89 int partL = partitionFromLog2Size(i + 2);
90 p.square_copy_pp[i] = p.luma_copy_pp[partL];
91 p.square_copy_ps[i] = p.luma_copy_ps[partL];
92 p.square_copy_sp[i] = p.luma_copy_sp[partL];
93 p.square_copy_ss[i] = p.luma_copy_ss[partL];
94 }
95
96 primitives.sa8d[BLOCK_4x4] = primitives.sa8d_inter[LUMA_4x4];
97 primitives.sa8d[BLOCK_8x8] = primitives.sa8d_inter[LUMA_8x8];
98 primitives.sa8d[BLOCK_16x16] = primitives.sa8d_inter[LUMA_16x16];
99 primitives.sa8d[BLOCK_32x32] = primitives.sa8d_inter[LUMA_32x32];
100 primitives.sa8d[BLOCK_64x64] = primitives.sa8d_inter[LUMA_64x64];
101
102 // SA8D devolves to SATD for blocks not even multiples of 8x8
103 primitives.sa8d_inter[LUMA_4x4] = primitives.satd[LUMA_4x4];
104 primitives.sa8d_inter[LUMA_4x8] = primitives.satd[LUMA_4x8];
105 primitives.sa8d_inter[LUMA_4x16] = primitives.satd[LUMA_4x16];
106 primitives.sa8d_inter[LUMA_8x4] = primitives.satd[LUMA_8x4];
107 primitives.sa8d_inter[LUMA_16x4] = primitives.satd[LUMA_16x4];
108 primitives.sa8d_inter[LUMA_16x12] = primitives.satd[LUMA_16x12];
109 primitives.sa8d_inter[LUMA_12x16] = primitives.satd[LUMA_12x16];
110 }
111 }
112 using namespace x265;
113
114 /* cpuid >= 0 - force CPU type
115 * cpuid < 0 - auto-detect if uninitialized */
116 extern "C"
117 void x265_setup_primitives(x265_param *param, int cpuid)
118 {
119 if (cpuid < 0)
120 cpuid = x265::cpu_detect();
121
122 // initialize global variables
123 if (!primitives.sad[0])
124 {
125 Setup_C_Primitives(primitives);
126 Setup_Instrinsic_Primitives(primitives, cpuid);
127
128 #if ENABLE_ASSEMBLY
129 Setup_Assembly_Primitives(primitives, cpuid);
130 #else
131 x265_log(param, X265_LOG_WARNING, "Assembly not supported in this binary\n");
132 #endif
133
134 Setup_Alias_Primitives(primitives);
135
136 initROM();
137 }
138
139 if (param->logLevel >= X265_LOG_INFO)
140 {
141 char buf[1000];
142 char *p = buf + sprintf(buf, "using cpu capabilities:");
143 char *none = p;
144 for (int i = 0; x265::cpu_names[i].flags; i++)
145 {
146 if (!strcmp(x265::cpu_names[i].name, "SSE")
147 && (cpuid & X265_CPU_SSE2))
148 continue;
149 if (!strcmp(x265::cpu_names[i].name, "SSE2")
150 && (cpuid & (X265_CPU_SSE2_IS_FAST | X265_CPU_SSE2_IS_SLOW)))
151 continue;
152 if (!strcmp(x265::cpu_names[i].name, "SSE3")
153 && (cpuid & X265_CPU_SSSE3 || !(cpuid & X265_CPU_CACHELINE_64)))
154 continue;
155 if (!strcmp(x265::cpu_names[i].name, "SSE4.1")
156 && (cpuid & X265_CPU_SSE42))
157 continue;
158 if (!strcmp(x265::cpu_names[i].name, "BMI1")
159 && (cpuid & X265_CPU_BMI2))
160 continue;
161 if ((cpuid & x265::cpu_names[i].flags) == x265::cpu_names[i].flags
162 && (!i || x265::cpu_names[i].flags != x265::cpu_names[i - 1].flags))
163 p += sprintf(p, " %s", x265::cpu_names[i].name);
164 }
165
166 if (p == none)
167 sprintf(p, " none!");
168 x265_log(param, X265_LOG_INFO, "%s\n", buf);
169 }
170 }
171
172 #if !defined(ENABLE_ASSEMBLY)
173 #if defined(_MSC_VER)
174 #include <intrin.h>
175 #endif
176
177 extern "C" {
/* No-op EMMS for builds without assembly: the intrinsic primitives will not
 * use MMX instructions, so there should be no reason to use EMMS. */
void x265_cpu_emms(void)
{
}
181
182 #if defined(X265_ARCH_X86)
183
#if defined(_MSC_VER)
// MSVC: silence C4100 (unreferenced formal parameter) for the stubs below.
# pragma warning(disable: 4100)
#elif defined(__GNUC__) || defined(__clang__) // use inline assembly, Gnu/AT&T syntax
// Fallback __cpuidex(): runs CPUID with `level` in EAX and `index` in ECX and
// stores EAX, EBX, ECX, EDX into regsArray[0..3].
// The expansion deliberately has no trailing semicolon, so the macro behaves
// like a single statement and stays valid inside an unbraced if/else.
# define __cpuidex(regsArray, level, index) \
    __asm__ __volatile__ ("cpuid" \
                          : "=a" ((regsArray)[0]), "=b" ((regsArray)[1]), "=c" ((regsArray)[2]), "=d" ((regsArray)[3]) \
                          : "0" (level), "2" (index))
#else
# error "compiler not supported"
#endif
194
/* Stub used when assembly is disabled: unconditionally returns 0.
 * NOTE(review): the assembly version presumably probes for CPUID support and
 * returns non-zero when it is available -- confirm how cpu_detect() treats a
 * zero result. */
int x265_cpu_cpuid_test(void) { return 0; }
199
200 void x265_cpu_cpuid(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
201 {
202 int output[4];
203
204 __cpuidex(output, op, 0);
205 *eax = output[0];
206 *ebx = output[1];
207 *ecx = output[2];
208 *edx = output[3];
209 }
210
/* Execute XGETBV with `op` as the selector and return the low and high 32 bits
 * of the result through *eax and *edx.  When no implementation is available
 * for the current compiler/architecture, both outputs are set to zero. */
void x265_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx)
{
#if X265_ARCH_X86 && ((defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1200))

    // MSVC 2010 SP1 or later, or similar Intel release
    const uint64_t value = _xgetbv(op);
    *eax = (uint32_t)value;
    *edx = (uint32_t)(value >> 32);

#elif X265_ARCH_X86 && (defined(__GNUC__) || defined(__clang__))

    // use inline assembly, Gnu/AT&T syntax
    uint32_t lo, hi;
    __asm("xgetbv" : "=a" (lo), "=d" (hi) : "c" (op) :);
    *eax = lo;
    *edx = hi;

#else

    // Not x86, or an older compiler (on x64 with older compilers, this is
    // impossible): report zero in both halves.
    *eax = 0;
    *edx = 0;

#endif
}
239
240 #endif // X265_ARCH_X86
241 }
242 #endif // if !ENABLE_ASSEMBLY