/*****************************************************************************
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
 *          Mandar Gurav <mandar@multicorewareinc.com>
 *          Mahesh Pittala <mahesh@multicorewareinc.com>
 *          Min Chen <min.chen@multicorewareinc.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

#include "common.h"
#include "primitives.h"
#include "x265.h"

#include <cstdlib> // abs()

using namespace x265;
#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, DATA_TYPE1, DATA_TYPE2) \
    p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX_DEF<4, 4, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x8] = FUNC_PREFIX_DEF<8, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x4] = FUNC_PREFIX_DEF<8, 4, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_4x8] = FUNC_PREFIX_DEF<4, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x8] = FUNC_PREFIX_DEF<16, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x16] = FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x4] = FUNC_PREFIX_DEF<16, 4, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_4x16] = FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x8] = FUNC_PREFIX_DEF<32, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x32] = FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;

#define SET_FUNC_PRIMITIVE_TABLE_C2(FUNC_PREFIX) \
    p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX<4, 4>; \
    p.FUNC_PREFIX[LUMA_8x8] = FUNC_PREFIX<8, 8>; \
    p.FUNC_PREFIX[LUMA_8x4] = FUNC_PREFIX<8, 4>; \
    p.FUNC_PREFIX[LUMA_4x8] = FUNC_PREFIX<4, 8>; \
    p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX<16, 16>; \
    p.FUNC_PREFIX[LUMA_16x8] = FUNC_PREFIX<16, 8>; \
    p.FUNC_PREFIX[LUMA_8x16] = FUNC_PREFIX<8, 16>; \
    p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX<16, 12>; \
    p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX<12, 16>; \
    p.FUNC_PREFIX[LUMA_16x4] = FUNC_PREFIX<16, 4>; \
    p.FUNC_PREFIX[LUMA_4x16] = FUNC_PREFIX<4, 16>; \
    p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX<32, 32>; \
    p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX<32, 16>; \
    p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX<16, 32>; \
    p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX<32, 24>; \
    p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX<24, 32>; \
    p.FUNC_PREFIX[LUMA_32x8] = FUNC_PREFIX<32, 8>; \
    p.FUNC_PREFIX[LUMA_8x32] = FUNC_PREFIX<8, 32>; \
    p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX<64, 64>; \
    p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX<64, 32>; \
    p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX<32, 64>; \
    p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX<64, 48>; \
    p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX<48, 64>; \
    p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX<64, 16>; \
    p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX<16, 64>;

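/* The two macros above stamp out one primitive-table entry per luma
 * partition size.  For example, SET_FUNC_PRIMITIVE_TABLE_C2(sad) expands to
 * a series of assignments of the form
 *
 *     p.sad[LUMA_16x8] = sad<16, 8>;
 *
 * so every partition gets its own template instantiation, specialized on
 * width and height at compile time. */
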
namespace {
// place functions in anonymous namespace (file static)

template<int lx, int ly>
int sad(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int sum = 0;

    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
            sum += abs(pix1[x] - pix2[x]);

        pix1 += stride_pix1;
        pix2 += stride_pix2;
    }

    return sum;
}

template<int lx, int ly>
int sad(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
{
    int sum = 0;

    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
            sum += abs(pix1[x] - pix2[x]);

        pix1 += stride_pix1;
        pix2 += stride_pix2;
    }

    return sum;
}

template<int lx, int ly>
void sad_x3(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
{
    res[0] = 0;
    res[1] = 0;
    res[2] = 0;
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            res[0] += abs(pix1[x] - pix2[x]);
            res[1] += abs(pix1[x] - pix3[x]);
            res[2] += abs(pix1[x] - pix4[x]);
        }

        pix1 += FENC_STRIDE;
        pix2 += frefstride;
        pix3 += frefstride;
        pix4 += frefstride;
    }
}

template<int lx, int ly>
void sad_x4(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
{
    res[0] = 0;
    res[1] = 0;
    res[2] = 0;
    res[3] = 0;
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            res[0] += abs(pix1[x] - pix2[x]);
            res[1] += abs(pix1[x] - pix3[x]);
            res[2] += abs(pix1[x] - pix4[x]);
            res[3] += abs(pix1[x] - pix5[x]);
        }

        pix1 += FENC_STRIDE;
        pix2 += frefstride;
        pix3 += frefstride;
        pix4 += frefstride;
        pix5 += frefstride;
    }
}
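
/* sad_x3 and sad_x4 score one encode block (always laid out at FENC_STRIDE)
 * against three or four reference candidates in a single pass, so the loads
 * of the fenc rows are shared across all candidates; motion search uses
 * these to evaluate several predictors at once. */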

template<int lx, int ly, class T1, class T2>
int sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
{
    int sum = 0;
    int tmp;

    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            tmp = pix1[x] - pix2[x];
            sum += (tmp * tmp);
        }

        pix1 += stride_pix1;
        pix2 += stride_pix2;
    }

    return sum;
}
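
/* sse() returns the sum of squared differences.  The element types are
 * template parameters so one body serves the sse_pp (pixel/pixel),
 * sse_sp (int16_t/pixel), and sse_ss (int16_t/int16_t) primitive tables
 * instantiated at the bottom of this file. */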

#define BITS_PER_SUM (8 * sizeof(sum_t))

#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) { \
        sum2_t t0 = s0 + s1; \
        sum2_t t1 = s0 - s1; \
        sum2_t t2 = s2 + s3; \
        sum2_t t3 = s2 - s3; \
        d0 = t0 + t2; \
        d2 = t0 - t2; \
        d1 = t1 + t3; \
        d3 = t1 - t3; \
}
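
/* HADAMARD4 is a single 4-point butterfly: given s0..s3 it produces
 *   d0 = s0 + s1 + s2 + s3
 *   d1 = s0 - s1 + s2 - s3
 *   d2 = s0 + s1 - s2 - s3
 *   d3 = s0 - s1 - s2 + s3
 * Applying it once across rows and once across columns yields a 4x4
 * Hadamard transform (up to output ordering). */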

// in: a pseudo-simd number of the form x+(y<<16)
// return: abs(x)+(abs(y)<<16)
inline sum2_t abs2(sum2_t a)
{
    sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1);

    return (a + s) ^ s;
}
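
/* A note on the SWAR scheme used below: two 16-bit lanes travel in one
 * sum2_t as x + (y << 16).  In abs2(), the mask s has every bit of a lane
 * set exactly when that lane is negative, so (a + s) ^ s applies the
 * two's-complement identity |v| = (v + mask) ^ mask to both lanes at once,
 * with no branches.  satd_4x4() packs two coefficients of the horizontal
 * transform into the low and high halves of each sum2_t, halving the
 * arithmetic; the final ((sum_t)a0) + (a0 >> BITS_PER_SUM) folds the two
 * lanes back into one total. */
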
215 | ||
b53f7c52 | 216 | int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2) |
72b9787e JB |
217 | { |
218 | sum2_t tmp[4][2]; | |
219 | sum2_t a0, a1, a2, a3, b0, b1; | |
220 | sum2_t sum = 0; | |
221 | ||
222 | for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2) | |
223 | { | |
224 | a0 = pix1[0] - pix2[0]; | |
225 | a1 = pix1[1] - pix2[1]; | |
226 | b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM); | |
227 | a2 = pix1[2] - pix2[2]; | |
228 | a3 = pix1[3] - pix2[3]; | |
229 | b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM); | |
230 | tmp[i][0] = b0 + b1; | |
231 | tmp[i][1] = b0 - b1; | |
232 | } | |
233 | ||
234 | for (int i = 0; i < 2; i++) | |
235 | { | |
236 | HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]); | |
237 | a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3); | |
238 | sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM); | |
239 | } | |
240 | ||
241 | return (int)(sum >> 1); | |
242 | } | |
243 | ||
b53f7c52 | 244 | int satd_4x4(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2) |
72b9787e JB |
245 | { |
246 | ssum2_t tmp[4][2]; | |
247 | ssum2_t a0, a1, a2, a3, b0, b1; | |
248 | ssum2_t sum = 0; | |
249 | ||
250 | for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2) | |
251 | { | |
252 | a0 = pix1[0] - pix2[0]; | |
253 | a1 = pix1[1] - pix2[1]; | |
254 | b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM); | |
255 | a2 = pix1[2] - pix2[2]; | |
256 | a3 = pix1[3] - pix2[3]; | |
257 | b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM); | |
258 | tmp[i][0] = b0 + b1; | |
259 | tmp[i][1] = b0 - b1; | |
260 | } | |
261 | ||
262 | for (int i = 0; i < 2; i++) | |
263 | { | |
264 | HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]); | |
265 | a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3); | |
266 | sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM); | |
267 | } | |
268 | ||
269 | return (int)(sum >> 1); | |
270 | } | |
271 | ||
272 | // x264's SWAR version of satd 8x4, performs two 4x4 SATDs at once | |
b53f7c52 | 273 | int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2) |
72b9787e JB |
274 | { |
275 | sum2_t tmp[4][4]; | |
276 | sum2_t a0, a1, a2, a3; | |
277 | sum2_t sum = 0; | |
278 | ||
279 | for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2) | |
280 | { | |
281 | a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM); | |
282 | a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM); | |
283 | a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM); | |
284 | a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM); | |
285 | HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3); | |
286 | } | |
287 | ||
288 | for (int i = 0; i < 4; i++) | |
289 | { | |
290 | HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]); | |
291 | sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3); | |
292 | } | |
293 | ||
294 | return (((sum_t)sum) + (sum >> BITS_PER_SUM)) >> 1; | |
295 | } | |

template<int w, int h>
// calculate satd in blocks of 4x4
int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;

    for (int row = 0; row < h; row += 4)
        for (int col = 0; col < w; col += 4)
            satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
                             pix2 + row * stride_pix2 + col, stride_pix2);

    return satd;
}

template<int w, int h>
// calculate satd in blocks of 8x4
int satd8(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;

    for (int row = 0; row < h; row += 4)
        for (int col = 0; col < w; col += 8)
            satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
                             pix2 + row * stride_pix2 + col, stride_pix2);

    return satd;
}
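
/* satd4() and satd8() tile an arbitrary w x h partition with the 4x4 and
 * 8x4 SATD kernels and accumulate the results.  satd8 is used wherever the
 * width is a multiple of 8, since satd_8x4 computes two 4x4 transforms per
 * call; satd4 covers the remaining widths (4 and 12). */
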
324 | ||
b53f7c52 | 325 | inline int _sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2) |
72b9787e JB |
326 | { |
327 | sum2_t tmp[8][4]; | |
328 | sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3; | |
329 | sum2_t sum = 0; | |
330 | ||
331 | for (int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2) | |
332 | { | |
333 | a0 = pix1[0] - pix2[0]; | |
334 | a1 = pix1[1] - pix2[1]; | |
335 | b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM); | |
336 | a2 = pix1[2] - pix2[2]; | |
337 | a3 = pix1[3] - pix2[3]; | |
338 | b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM); | |
339 | a4 = pix1[4] - pix2[4]; | |
340 | a5 = pix1[5] - pix2[5]; | |
341 | b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM); | |
342 | a6 = pix1[6] - pix2[6]; | |
343 | a7 = pix1[7] - pix2[7]; | |
344 | b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM); | |
345 | HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3); | |
346 | } | |
347 | ||
348 | for (int i = 0; i < 4; i++) | |
349 | { | |
350 | HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]); | |
351 | HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]); | |
352 | b0 = abs2(a0 + a4) + abs2(a0 - a4); | |
353 | b0 += abs2(a1 + a5) + abs2(a1 - a5); | |
354 | b0 += abs2(a2 + a6) + abs2(a2 - a6); | |
355 | b0 += abs2(a3 + a7) + abs2(a3 - a7); | |
356 | sum += (sum_t)b0 + (b0 >> BITS_PER_SUM); | |
357 | } | |
358 | ||
359 | return (int)sum; | |
360 | } | |
361 | ||
b53f7c52 | 362 | int sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2) |
72b9787e JB |
363 | { |
364 | return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2); | |
365 | } | |
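
/* _sa8d_8x8() returns the raw sum of absolute 8x8 Hadamard coefficients;
 * the public sa8d_8x8() normalizes it with (sum + 2) >> 2, a divide by four
 * with rounding that puts the result on roughly the same scale as SATD. */
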
366 | ||
b53f7c52 | 367 | inline int _sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2) |
72b9787e JB |
368 | { |
369 | ssum2_t tmp[8][4]; | |
370 | ssum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3; | |
371 | ssum2_t sum = 0; | |
372 | ||
373 | for (int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2) | |
374 | { | |
375 | a0 = pix1[0] - pix2[0]; | |
376 | a1 = pix1[1] - pix2[1]; | |
377 | b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM); | |
378 | a2 = pix1[2] - pix2[2]; | |
379 | a3 = pix1[3] - pix2[3]; | |
380 | b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM); | |
381 | a4 = pix1[4] - pix2[4]; | |
382 | a5 = pix1[5] - pix2[5]; | |
383 | b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM); | |
384 | a6 = pix1[6] - pix2[6]; | |
385 | a7 = pix1[7] - pix2[7]; | |
386 | b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM); | |
387 | HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3); | |
388 | } | |
389 | ||
390 | for (int i = 0; i < 4; i++) | |
391 | { | |
392 | HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]); | |
393 | HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]); | |
394 | b0 = abs2(a0 + a4) + abs2(a0 - a4); | |
395 | b0 += abs2(a1 + a5) + abs2(a1 - a5); | |
396 | b0 += abs2(a2 + a6) + abs2(a2 - a6); | |
397 | b0 += abs2(a3 + a7) + abs2(a3 - a7); | |
398 | sum += (sum_t)b0 + (b0 >> BITS_PER_SUM); | |
399 | } | |
400 | ||
401 | return (int)sum; | |
402 | } | |
403 | ||
b53f7c52 | 404 | int sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2) |
72b9787e JB |
405 | { |
406 | return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2); | |
407 | } | |
408 | ||
b53f7c52 | 409 | int sa8d_16x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2) |
72b9787e JB |
410 | { |
411 | int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2) | |
412 | + _sa8d_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2) | |
413 | + _sa8d_8x8(pix1 + 8 * i_pix1, i_pix1, pix2 + 8 * i_pix2, i_pix2) | |
414 | + _sa8d_8x8(pix1 + 8 + 8 * i_pix1, i_pix1, pix2 + 8 + 8 * i_pix2, i_pix2); | |
415 | ||
416 | // This matches x264 sa8d_16x16, but is slightly different from HM's behavior because | |
417 | // this version only rounds once at the end | |
418 | return (sum + 2) >> 2; | |
419 | } | |
420 | ||
421 | template<int w, int h> | |
422 | // Calculate sa8d in blocks of 8x8 | |
b53f7c52 | 423 | int sa8d8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2) |
72b9787e JB |
424 | { |
425 | int cost = 0; | |
426 | ||
427 | for (int y = 0; y < h; y += 8) | |
72b9787e | 428 | for (int x = 0; x < w; x += 8) |
72b9787e | 429 | cost += sa8d_8x8(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2); |
72b9787e JB |
430 | |
431 | return cost; | |
432 | } | |
433 | ||
434 | template<int w, int h> | |
435 | // Calculate sa8d in blocks of 16x16 | |
b53f7c52 | 436 | int sa8d16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2) |
72b9787e JB |
437 | { |
438 | int cost = 0; | |
439 | ||
440 | for (int y = 0; y < h; y += 16) | |
72b9787e | 441 | for (int x = 0; x < w; x += 16) |
72b9787e | 442 | cost += sa8d_16x16(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2); |
72b9787e JB |
443 | |
444 | return cost; | |
445 | } | |

template<int size>
int pixel_ssd_s_c(const int16_t* a, intptr_t dstride)
{
    int sum = 0;
    for (int y = 0; y < size; y++)
    {
        for (int x = 0; x < size; x++)
            sum += a[x] * a[x];

        a += dstride;
    }
    return sum;
}

template<int size>
void blockfil_s_c(int16_t* dst, intptr_t dstride, int16_t val)
{
    for (int y = 0; y < size; y++)
        for (int x = 0; x < size; x++)
            dst[y * dstride + x] = val;
}

template<int size>
void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
{
    X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
    X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
    X265_CHECK(shift >= 0, "invalid shift\n");

    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
            dst[j] = src[j] << shift;

        src += srcStride;
        dst += size;
    }
}

template<int size>
void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
{
    X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
    X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
    X265_CHECK(shift > 0, "invalid shift\n");

    int16_t round = 1 << (shift - 1);
    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
            dst[j] = (src[j] + round) >> shift;

        src += srcStride;
        dst += size;
    }
}

template<int size>
void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
{
    X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
    X265_CHECK(shift >= 0, "invalid shift\n");

    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
            dst[j] = src[j] << shift;

        src += size;
        dst += dstStride;
    }
}

template<int size>
void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
{
    X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
    X265_CHECK(shift > 0, "invalid shift\n");

    int16_t round = 1 << (shift - 1);
    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
            dst[j] = (src[j] + round) >> shift;

        src += size;
        dst += dstStride;
    }
}
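
/* The cpy2Dto1D/cpy1Dto2D families move residual coefficients between a
 * strided (2D) block and a packed (1D) buffer around the transform stages.
 * The _shl variants scale up with a plain left shift; the _shr variants
 * scale down with rounding, e.g. for shift == 3 (round == 4) a value of 12
 * becomes (12 + 4) >> 3 == 2. */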

template<int blockSize>
void getResidual(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)
{
    for (int y = 0; y < blockSize; y++)
    {
        for (int x = 0; x < blockSize; x++)
            residual[x] = static_cast<int16_t>(fenc[x]) - static_cast<int16_t>(pred[x]);

        fenc += stride;
        residual += stride;
        pred += stride;
    }
}

template<int blockSize>
void transpose(pixel* dst, const pixel* src, intptr_t stride)
{
    for (int k = 0; k < blockSize; k++)
        for (int l = 0; l < blockSize; l++)
            dst[k * blockSize + l] = src[l * stride + k];
}

void weight_sp_c(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
{
    for (int y = 0; y < height; y++)
    {
        // note: width can be odd
        for (int x = 0; x < width; x++)
            dst[x] = (pixel)Clip3(0, ((1 << X265_DEPTH) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);

        src += srcStride;
        dst += dstStride;
    }
}

void weight_pp_c(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
{
    X265_CHECK(!(width & 15), "weightp alignment error\n");
    X265_CHECK(!((w0 << 6) > 32767), "w0 using more than 16 bits, asm output will mismatch\n");
    X265_CHECK(!(round > 32767), "round using more than 16 bits, asm output will mismatch\n");

    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
        {
            // simulating pixel to short conversion
            int16_t val = src[x] << (IF_INTERNAL_PREC - X265_DEPTH);
            dst[x] = (pixel)Clip3(0, ((1 << X265_DEPTH) - 1), ((w0 * (val) + round) >> shift) + offset);
        }

        src += stride;
        dst += stride;
    }
}
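
/* A worked example for weight_pp_c at 8-bit depth, using hypothetical
 * unity-weight parameters (w0 = 64, shift = 12, round = 1 << 11,
 * offset = 0): each pixel is first promoted by
 * IF_INTERNAL_PREC (14) - X265_DEPTH (8) = 6 bits, so a source pixel of
 * 100 yields
 *     ((64 * (100 << 6) + 2048) >> 12) + 0 = 100,
 * the identity mapping, before clipping to [0, 255]. */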

template<int lx, int ly>
void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
{
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
            dst[x] = (src0[x] + src1[x] + 1) >> 1;

        src0 += sstride0;
        src1 += sstride1;
        dst += dstride;
    }
}

void scale1D_128to64(pixel* dst, const pixel* src, intptr_t /*stride*/)
{
    for (int x = 0; x < 128; x += 2)
    {
        pixel pix0 = src[(x + 0)];
        pixel pix1 = src[(x + 1)];
        int sum = pix0 + pix1;

        dst[x >> 1] = (pixel)((sum + 1) >> 1);
    }
}

void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
{
    for (uint32_t y = 0; y < 64; y += 2)
    {
        for (uint32_t x = 0; x < 64; x += 2)
        {
            pixel pix0 = src[(y + 0) * stride + (x + 0)];
            pixel pix1 = src[(y + 0) * stride + (x + 1)];
            pixel pix2 = src[(y + 1) * stride + (x + 0)];
            pixel pix3 = src[(y + 1) * stride + (x + 1)];
            int sum = pix0 + pix1 + pix2 + pix3;

            dst[y / 2 * 32 + x / 2] = (pixel)((sum + 2) >> 2);
        }
    }
}

void frame_init_lowres_core(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc,
                            intptr_t src_stride, intptr_t dst_stride, int width, int height)
{
    for (int y = 0; y < height; y++)
    {
        const pixel* src1 = src0 + src_stride;
        const pixel* src2 = src1 + src_stride;
        for (int x = 0; x < width; x++)
        {
            // slower than naive bilinear, but matches asm
#define FILTER(a, b, c, d) ((((a + b + 1) >> 1) + ((c + d + 1) >> 1) + 1) >> 1)
            dst0[x] = FILTER(src0[2 * x], src1[2 * x], src0[2 * x + 1], src1[2 * x + 1]);
            dsth[x] = FILTER(src0[2 * x + 1], src1[2 * x + 1], src0[2 * x + 2], src1[2 * x + 2]);
            dstv[x] = FILTER(src1[2 * x], src2[2 * x], src1[2 * x + 1], src2[2 * x + 1]);
            dstc[x] = FILTER(src1[2 * x + 1], src2[2 * x + 1], src1[2 * x + 2], src2[2 * x + 2]);
#undef FILTER
        }
        src0 += src_stride * 2;
        dst0 += dst_stride;
        dsth += dst_stride;
        dstv += dst_stride;
        dstc += dst_stride;
    }
}
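
/* frame_init_lowres_core builds the four half-pel planes of the
 * half-resolution lookahead image: dst0 is the fullpel decimation, dsth and
 * dstv are offset by half a pixel horizontally and vertically, and dstc by
 * half a pixel in both.  The two-stage FILTER rounds each pairwise average
 * separately so that, as noted in the loop, the C output bit-matches the
 * assembly implementations. */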

/* structural similarity metric */
void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
{
    for (int z = 0; z < 2; z++)
    {
        uint32_t s1 = 0, s2 = 0, ss = 0, s12 = 0;
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                int a = pix1[x + y * stride1];
                int b = pix2[x + y * stride2];
                s1 += a;
                s2 += b;
                ss += a * a;
                ss += b * b;
                s12 += a * b;
            }
        }

        sums[z][0] = s1;
        sums[z][1] = s2;
        sums[z][2] = ss;
        sums[z][3] = s12;
        pix1 += 4;
        pix2 += 4;
    }
}

float ssim_end_1(int s1, int s2, int ss, int s12)
{
    /* Maximum value for 10-bit is: ss*64 = (2^10-1)^2*16*4*64 = 4286582784, which will overflow in some cases.
     * s1*s1, s2*s2, and s1*s2 also obtain this value for edge cases: ((2^10-1)*16*4)^2 = 4286582784.
     * Maximum value for 9-bit is: ss*64 = (2^9-1)^2*16*4*64 = 1069551616, which will not overflow. */

#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
#if HIGH_BIT_DEPTH
    X265_CHECK(X265_DEPTH == 10, "ssim invalid depth\n");
#define type float
    static const float ssim_c1 = (float)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64);
    static const float ssim_c2 = (float)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63);
#else
    X265_CHECK(X265_DEPTH == 8, "ssim invalid depth\n");
#define type int
    static const int ssim_c1 = (int)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64 + .5);
    static const int ssim_c2 = (int)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63 + .5);
#endif
    type fs1 = (type)s1;
    type fs2 = (type)s2;
    type fss = (type)ss;
    type fs12 = (type)s12;
    type vars = (type)(fss * 64 - fs1 * fs1 - fs2 * fs2);
    type covar = (type)(fs12 * 64 - fs1 * fs2);
    return (float)(2 * fs1 * fs2 + ssim_c1) * (float)(2 * covar + ssim_c2)
           / ((float)(fs1 * fs1 + fs2 * fs2 + ssim_c1) * (float)(vars + ssim_c2));
#undef type
#undef PIXEL_MAX
}
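
/* ssim_end_1 evaluates the SSIM kernel
 *
 *       (2*mu1*mu2 + C1) * (2*cov + C2)
 *   -----------------------------------------
 *   (mu1^2 + mu2^2 + C1) * (var1 + var2 + C2)
 *
 * directly on the pooled integer sums of an 8x8 window (four adjacent 4x4
 * sub-sums), with ssim_c1/ssim_c2 pre-scaled to match those raw sums rather
 * than normalized means. */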

float ssim_end_4(int sum0[5][4], int sum1[5][4], int width)
{
    float ssim = 0.0;

    for (int i = 0; i < width; i++)
    {
        ssim += ssim_end_1(sum0[i][0] + sum0[i + 1][0] + sum1[i][0] + sum1[i + 1][0],
                           sum0[i][1] + sum0[i + 1][1] + sum1[i][1] + sum1[i + 1][1],
                           sum0[i][2] + sum0[i + 1][2] + sum1[i][2] + sum1[i + 1][2],
                           sum0[i][3] + sum0[i + 1][3] + sum1[i][3] + sum1[i + 1][3]);
    }

    return ssim;
}

template<int size>
uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
{
    uint32_t sum = 0, sqr = 0;

    for (int y = 0; y < size; y++)
    {
        for (int x = 0; x < size; x++)
        {
            sum += pix[x];
            sqr += pix[x] * pix[x];
        }

        pix += i_stride;
    }

    return sum + ((uint64_t)sqr << 32);
}
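
/* pixel_var packs two accumulators into its uint64_t return value: the
 * pixel sum in the low 32 bits and the sum of squares in the high 32 bits.
 * A caller can unpack them and derive the variance as:
 *
 *     uint32_t sum = (uint32_t)v;
 *     uint32_t ssq = (uint32_t)(v >> 32);
 *     double var = (double)ssq / N - ((double)sum / N) * ((double)sum / N);
 *
 * where N = size * size. */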

#if defined(_MSC_VER)
#pragma warning(disable: 4127) // conditional expression is constant
#endif

template<int size>
int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
{
    static pixel zeroBuf[8] /* = { 0 } */;

    if (size)
    {
        int dim = 1 << (size + 2);
        uint32_t totEnergy = 0;
        for (int i = 0; i < dim; i += 8)
        {
            for (int j = 0; j < dim; j += 8)
            {
                /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
                int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride, zeroBuf, 0) -
                                   (sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
                int reconEnergy = sa8d_8x8(recon + i * rstride + j, rstride, zeroBuf, 0) -
                                  (sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);

                totEnergy += abs(sourceEnergy - reconEnergy);
            }
        }
        return totEnergy;
    }
    else
    {
        /* 4x4 is too small for sa8d */
        int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
        int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
        return abs(sourceEnergy - reconEnergy);
    }
}
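
/* How the psy-cost AC-energy measure works: SAD against the all-zero block
 * is simply the pixel sum, which is the DC Hadamard coefficient of the
 * block.  Since sa8d's output is already normalized by >> 2, subtracting
 * (sad >> 2) removes the DC contribution and leaves only the AC (texture)
 * energy; the psy cost is then the absolute difference between source and
 * reconstruction AC energies, penalizing reconstructions that lose detail. */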

template<int size>
int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
{
    static int16_t zeroBuf[8] /* = { 0 } */;

    if (size)
    {
        int dim = 1 << (size + 2);
        uint32_t totEnergy = 0;
        for (int i = 0; i < dim; i += 8)
        {
            for (int j = 0; j < dim; j += 8)
            {
                /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
                int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride, zeroBuf, 0) -
                                   (sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
                int reconEnergy = sa8d_8x8(recon + i * rstride + j, rstride, zeroBuf, 0) -
                                  (sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);

                totEnergy += abs(sourceEnergy - reconEnergy);
            }
        }
        return totEnergy;
    }
    else
    {
        /* 4x4 is too small for sa8d */
        int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
        int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
        return abs(sourceEnergy - reconEnergy);
    }
}
836 | ||
72b9787e | 837 | template<int bx, int by> |
b53f7c52 | 838 | void blockcopy_pp_c(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb) |
72b9787e JB |
839 | { |
840 | for (int y = 0; y < by; y++) | |
841 | { | |
842 | for (int x = 0; x < bx; x++) | |
72b9787e | 843 | a[x] = b[x]; |
72b9787e JB |
844 | |
845 | a += stridea; | |
846 | b += strideb; | |
847 | } | |
848 | } | |
849 | ||
850 | template<int bx, int by> | |
b53f7c52 | 851 | void blockcopy_ss_c(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb) |
72b9787e JB |
852 | { |
853 | for (int y = 0; y < by; y++) | |
854 | { | |
855 | for (int x = 0; x < bx; x++) | |
72b9787e | 856 | a[x] = b[x]; |
72b9787e JB |
857 | |
858 | a += stridea; | |
859 | b += strideb; | |
860 | } | |
861 | } | |
862 | ||
863 | template<int bx, int by> | |
b53f7c52 | 864 | void blockcopy_sp_c(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb) |
72b9787e JB |
865 | { |
866 | for (int y = 0; y < by; y++) | |
867 | { | |
868 | for (int x = 0; x < bx; x++) | |
869 | { | |
870 | X265_CHECK((b[x] >= 0) && (b[x] <= ((1 << X265_DEPTH) - 1)), "blockcopy pixel size fail\n"); | |
871 | a[x] = (pixel)b[x]; | |
872 | } | |
873 | ||
874 | a += stridea; | |
875 | b += strideb; | |
876 | } | |
877 | } | |
878 | ||
879 | template<int bx, int by> | |
b53f7c52 | 880 | void blockcopy_ps_c(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb) |
72b9787e JB |
881 | { |
882 | for (int y = 0; y < by; y++) | |
883 | { | |
884 | for (int x = 0; x < bx; x++) | |
72b9787e | 885 | a[x] = (int16_t)b[x]; |
72b9787e JB |
886 | |
887 | a += stridea; | |
888 | b += strideb; | |
889 | } | |
890 | } | |
891 | ||
892 | template<int bx, int by> | |
b53f7c52 | 893 | void pixel_sub_ps_c(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1) |
72b9787e JB |
894 | { |
895 | for (int y = 0; y < by; y++) | |
896 | { | |
897 | for (int x = 0; x < bx; x++) | |
72b9787e | 898 | a[x] = (int16_t)(b0[x] - b1[x]); |
72b9787e JB |
899 | |
900 | b0 += sstride0; | |
901 | b1 += sstride1; | |
902 | a += dstride; | |
903 | } | |
904 | } | |
905 | ||
906 | template<int bx, int by> | |
b53f7c52 | 907 | void pixel_add_ps_c(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1) |
72b9787e JB |
908 | { |
909 | for (int y = 0; y < by; y++) | |
910 | { | |
911 | for (int x = 0; x < bx; x++) | |
72b9787e | 912 | a[x] = Clip(b0[x] + b1[x]); |
72b9787e JB |
913 | |
914 | b0 += sstride0; | |
915 | b1 += sstride1; | |
916 | a += dstride; | |
917 | } | |
918 | } | |

template<int bx, int by>
void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
{
    int shiftNum, offset;

    shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
    offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;

    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x += 2)
        {
            dst[x + 0] = Clip((src0[x + 0] + src1[x + 0] + offset) >> shiftNum);
            dst[x + 1] = Clip((src0[x + 1] + src1[x + 1] + offset) >> shiftNum);
        }

        src0 += src0Stride;
        src1 += src1Stride;
        dst += dstStride;
    }
}

void planecopy_cp_c(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
{
    for (int r = 0; r < height; r++)
    {
        for (int c = 0; c < width; c++)
            dst[c] = ((pixel)src[c]) << shift;

        dst += dstStride;
        src += srcStride;
    }
}

void planecopy_sp_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
{
    for (int r = 0; r < height; r++)
    {
        for (int c = 0; c < width; c++)
            dst[c] = (pixel)((src[c] >> shift) & mask);

        dst += dstStride;
        src += srcStride;
    }
}

/* Estimate the total amount of influence on future quality that could be had if we
 * were to improve the reference samples used to inter predict any given CU. */
void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
                             const int32_t* invQscales, const double* fpsFactor, int len)
{
    double fps = *fpsFactor / 256;

    for (int i = 0; i < len; i++)
    {
        double intraCost = intraCosts[i] * invQscales[i];
        double propagateAmount = (double)propagateIn[i] + intraCost * fps;
        double propagateNum = (double)intraCosts[i] - (interCosts[i] & ((1 << 14) - 1));
        double propagateDenom = (double)intraCosts[i];
        dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5);
    }
}
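
/* A worked example with hypothetical values: intraCosts[i] = 1000,
 * invQscales[i] = 1, propagateIn[i] = 500, fps = 1.0, and an interCosts[i]
 * whose low 14 bits are 600 (the upper bits carry reference metadata, hence
 * the mask).  Then propagateAmount = 500 + 1000 = 1500, and the fraction
 * charged onward to this block's references is (1000 - 600) / 1000 = 0.4,
 * giving dst[i] = (int)(1500 * 0.4 + 0.5) = 600. */
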
} // end anonymous namespace

namespace x265 {
// x265 private namespace

/* Extend the edges of a picture so that it may safely be used for motion
 * compensation. This function assumes the picture is stored in a buffer with
 * sufficient padding for the X and Y margins */
void extendPicBorder(pixel* pic, intptr_t stride, int width, int height, int marginX, int marginY)
{
    /* extend left and right margins */
    primitives.extendRowBorder(pic, stride, width, height, marginX);

    /* copy top row to create above margin */
    pixel* top = pic - marginX;
    for (int y = 0; y < marginY; y++)
        memcpy(top - (y + 1) * stride, top, stride * sizeof(pixel));

    /* copy bottom row to create below margin */
    pixel* bot = pic - marginX + (height - 1) * stride;
    for (int y = 0; y < marginY; y++)
        memcpy(bot + (y + 1) * stride, bot, stride * sizeof(pixel));
}

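/* Everything below fills the EncoderPrimitives function-pointer tables with
 * the C reference implementations.  Platform-specific setup runs afterwards
 * and overwrites any entry for which an optimized (intrinsic or assembly)
 * version exists, so these C functions double as the fallback path and as
 * the golden reference for the test bench. */
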
/* Initialize entries for pixel functions defined in this file */
void Setup_C_PixelPrimitives(EncoderPrimitives &p)
{
    SET_FUNC_PRIMITIVE_TABLE_C2(sad)
    SET_FUNC_PRIMITIVE_TABLE_C2(sad_x3)
    SET_FUNC_PRIMITIVE_TABLE_C2(sad_x4)
    SET_FUNC_PRIMITIVE_TABLE_C2(pixelavg_pp)

    // satd
    p.satd[LUMA_4x4] = satd_4x4;
    p.satd[LUMA_8x8] = satd8<8, 8>;
    p.satd[LUMA_8x4] = satd_8x4;
    p.satd[LUMA_4x8] = satd4<4, 8>;
    p.satd[LUMA_16x16] = satd8<16, 16>;
    p.satd[LUMA_16x8] = satd8<16, 8>;
    p.satd[LUMA_8x16] = satd8<8, 16>;
    p.satd[LUMA_16x12] = satd8<16, 12>;
    p.satd[LUMA_12x16] = satd4<12, 16>;
    p.satd[LUMA_16x4] = satd8<16, 4>;
    p.satd[LUMA_4x16] = satd4<4, 16>;
    p.satd[LUMA_32x32] = satd8<32, 32>;
    p.satd[LUMA_32x16] = satd8<32, 16>;
    p.satd[LUMA_16x32] = satd8<16, 32>;
    p.satd[LUMA_32x24] = satd8<32, 24>;
    p.satd[LUMA_24x32] = satd8<24, 32>;
    p.satd[LUMA_32x8] = satd8<32, 8>;
    p.satd[LUMA_8x32] = satd8<8, 32>;
    p.satd[LUMA_64x64] = satd8<64, 64>;
    p.satd[LUMA_64x32] = satd8<64, 32>;
    p.satd[LUMA_32x64] = satd8<32, 64>;
    p.satd[LUMA_64x48] = satd8<64, 48>;
    p.satd[LUMA_48x64] = satd8<48, 64>;
    p.satd[LUMA_64x16] = satd8<64, 16>;
    p.satd[LUMA_16x64] = satd8<16, 64>;

    p.chroma[X265_CSP_I420].satd[CHROMA_2x2] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_4x4] = satd_4x4;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x8] = satd8<8, 8>;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x16] = satd8<16, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_32x32] = satd8<32, 32>;

    p.chroma[X265_CSP_I420].satd[CHROMA_4x2] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_2x4] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x4] = satd_8x4;
    p.chroma[X265_CSP_I420].satd[CHROMA_4x8] = satd4<4, 8>;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x8] = satd8<16, 8>;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x16] = satd8<8, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_32x16] = satd8<32, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x32] = satd8<16, 32>;

    p.chroma[X265_CSP_I420].satd[CHROMA_8x6] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_6x8] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x2] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_2x8] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x12] = satd4<16, 12>;
    p.chroma[X265_CSP_I420].satd[CHROMA_12x16] = satd4<12, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x4] = satd4<16, 4>;
    p.chroma[X265_CSP_I420].satd[CHROMA_4x16] = satd4<4, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_32x24] = satd8<32, 24>;
    p.chroma[X265_CSP_I420].satd[CHROMA_24x32] = satd8<24, 32>;
    p.chroma[X265_CSP_I420].satd[CHROMA_32x8] = satd8<32, 8>;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x32] = satd8<8, 32>;

    p.chroma[X265_CSP_I422].satd[CHROMA422_2x4] = NULL;
    p.chroma[X265_CSP_I422].satd[CHROMA422_4x8] = satd4<4, 8>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x16] = satd8<8, 16>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x32] = satd8<16, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_32x64] = satd8<32, 64>;

    p.chroma[X265_CSP_I422].satd[CHROMA422_4x4] = satd_4x4;
    p.chroma[X265_CSP_I422].satd[CHROMA422_2x8] = NULL;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x8] = satd8<8, 8>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_4x16] = satd4<4, 16>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x16] = satd8<16, 16>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x32] = satd8<8, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_32x32] = satd8<32, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x64] = satd8<16, 64>;

    p.chroma[X265_CSP_I422].satd[CHROMA422_8x12] = satd4<8, 12>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_6x16] = NULL;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x4] = satd4<8, 4>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_2x16] = NULL;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x24] = satd8<16, 24>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_12x32] = satd4<12, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x8] = satd8<16, 8>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_4x32] = satd4<4, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_32x48] = satd8<32, 48>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_24x64] = satd8<24, 64>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_32x16] = satd8<32, 16>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x64] = satd8<8, 64>;

#define CHROMA_420(W, H) \
    p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H] = addAvg<W, H>; \
    p.chroma[X265_CSP_I420].copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.chroma[X265_CSP_I420].copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.chroma[X265_CSP_I420].copy_ss[CHROMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define CHROMA_422(W, H) \
    p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = addAvg<W, H>; \
    p.chroma[X265_CSP_I422].copy_pp[CHROMA422_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.chroma[X265_CSP_I422].copy_ps[CHROMA422_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.chroma[X265_CSP_I422].copy_ss[CHROMA422_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define CHROMA_444(W, H) \
    p.chroma[X265_CSP_I444].satd[LUMA_ ## W ## x ## H] = p.satd[LUMA_ ## W ## x ## H]; \
    p.chroma[X265_CSP_I444].addAvg[LUMA_ ## W ## x ## H] = addAvg<W, H>; \
    p.chroma[X265_CSP_I444].copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.chroma[X265_CSP_I444].copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.chroma[X265_CSP_I444].copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.chroma[X265_CSP_I444].copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define LUMA(W, H) \
    p.luma_addAvg[LUMA_ ## W ## x ## H] = addAvg<W, H>; \
    p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.luma_copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.luma_copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define LUMA_PIXELSUB(W, H) \
    p.luma_sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.luma_add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;

#define CHROMA_PIXELSUB_420(W, H) \
    p.chroma[X265_CSP_I420].sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.chroma[X265_CSP_I420].add_ps[CHROMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;

#define CHROMA_PIXELSUB_422(W, H) \
    p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## W ## x ## H] = pixel_add_ps_c<W, H>;

#define CHROMA_PIXELSUB_444(W, H) \
    p.chroma[X265_CSP_I444].sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.chroma[X265_CSP_I444].add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;

    LUMA(4, 4);
    LUMA(8, 8);
    CHROMA_420(4, 4);
    LUMA(4, 8);
    CHROMA_420(2, 4);
    LUMA(8, 4);
    CHROMA_420(4, 2);
    LUMA(16, 16);
    CHROMA_420(8, 8);
    LUMA(16, 8);
    CHROMA_420(8, 4);
    LUMA(8, 16);
    CHROMA_420(4, 8);
    LUMA(16, 12);
    CHROMA_420(8, 6);
    LUMA(12, 16);
    CHROMA_420(6, 8);
    LUMA(16, 4);
    CHROMA_420(8, 2);
    LUMA(4, 16);
    CHROMA_420(2, 8);
    LUMA(32, 32);
    CHROMA_420(16, 16);
    LUMA(32, 16);
    CHROMA_420(16, 8);
    LUMA(16, 32);
    CHROMA_420(8, 16);
    LUMA(32, 24);
    CHROMA_420(16, 12);
    LUMA(24, 32);
    CHROMA_420(12, 16);
    LUMA(32, 8);
    CHROMA_420(16, 4);
    LUMA(8, 32);
    CHROMA_420(4, 16);
    LUMA(64, 64);
    CHROMA_420(32, 32);
    LUMA(64, 32);
    CHROMA_420(32, 16);
    LUMA(32, 64);
    CHROMA_420(16, 32);
    LUMA(64, 48);
    CHROMA_420(32, 24);
    LUMA(48, 64);
    CHROMA_420(24, 32);
    LUMA(64, 16);
    CHROMA_420(32, 8);
    LUMA(16, 64);
    CHROMA_420(8, 32);

    LUMA_PIXELSUB(4, 4);
    LUMA_PIXELSUB(8, 8);
    LUMA_PIXELSUB(16, 16);
    LUMA_PIXELSUB(32, 32);
    LUMA_PIXELSUB(64, 64);
    CHROMA_PIXELSUB_420(4, 4)
    CHROMA_PIXELSUB_420(8, 8)
    CHROMA_PIXELSUB_420(16, 16)
    CHROMA_PIXELSUB_420(32, 32)
    CHROMA_PIXELSUB_422(4, 8)
    CHROMA_PIXELSUB_422(8, 16)
    CHROMA_PIXELSUB_422(16, 32)
    CHROMA_PIXELSUB_422(32, 64)
    CHROMA_PIXELSUB_444(8, 8)
    CHROMA_PIXELSUB_444(16, 16)
    CHROMA_PIXELSUB_444(32, 32)
    CHROMA_PIXELSUB_444(64, 64)

    CHROMA_422(4, 8);
    CHROMA_422(4, 4);
    CHROMA_422(2, 8);
    CHROMA_422(8, 16);
    CHROMA_422(8, 8);
    CHROMA_422(4, 16);
    CHROMA_422(8, 12);
    CHROMA_422(6, 16);
    CHROMA_422(8, 4);
    CHROMA_422(2, 16);
    CHROMA_422(16, 32);
    CHROMA_422(16, 16);
    CHROMA_422(8, 32);
    CHROMA_422(16, 24);
    CHROMA_422(12, 32);
    CHROMA_422(16, 8);
    CHROMA_422(4, 32);
    CHROMA_422(32, 64);
    CHROMA_422(32, 32);
    CHROMA_422(16, 64);
    CHROMA_422(32, 48);
    CHROMA_422(24, 64);
    CHROMA_422(32, 16);
    CHROMA_422(8, 64);

    CHROMA_444(4, 4);
    CHROMA_444(8, 8);
    CHROMA_444(4, 8);
    CHROMA_444(8, 4);
    CHROMA_444(16, 16);
    CHROMA_444(16, 8);
    CHROMA_444(8, 16);
    CHROMA_444(16, 12);
    CHROMA_444(12, 16);
    CHROMA_444(16, 4);
    CHROMA_444(4, 16);
    CHROMA_444(32, 32);
    CHROMA_444(32, 16);
    CHROMA_444(16, 32);
    CHROMA_444(32, 24);
    CHROMA_444(24, 32);
    CHROMA_444(32, 8);
    CHROMA_444(8, 32);
    CHROMA_444(64, 64);
    CHROMA_444(64, 32);
    CHROMA_444(32, 64);
    CHROMA_444(64, 48);
    CHROMA_444(48, 64);
    CHROMA_444(64, 16);
    CHROMA_444(16, 64);

    SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixel, pixel)
    SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, int16_t, pixel)
    SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, int16_t, int16_t)

    p.blockfill_s[BLOCK_4x4] = blockfil_s_c<4>;
    p.blockfill_s[BLOCK_8x8] = blockfil_s_c<8>;
    p.blockfill_s[BLOCK_16x16] = blockfil_s_c<16>;
    p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>;
    p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>;

    p.cpy2Dto1D_shl[BLOCK_4x4] = cpy2Dto1D_shl<4>;
    p.cpy2Dto1D_shl[BLOCK_8x8] = cpy2Dto1D_shl<8>;
    p.cpy2Dto1D_shl[BLOCK_16x16] = cpy2Dto1D_shl<16>;
    p.cpy2Dto1D_shl[BLOCK_32x32] = cpy2Dto1D_shl<32>;
    p.cpy2Dto1D_shr[BLOCK_4x4] = cpy2Dto1D_shr<4>;
    p.cpy2Dto1D_shr[BLOCK_8x8] = cpy2Dto1D_shr<8>;
    p.cpy2Dto1D_shr[BLOCK_16x16] = cpy2Dto1D_shr<16>;
    p.cpy2Dto1D_shr[BLOCK_32x32] = cpy2Dto1D_shr<32>;
    p.cpy1Dto2D_shl[BLOCK_4x4] = cpy1Dto2D_shl<4>;
    p.cpy1Dto2D_shl[BLOCK_8x8] = cpy1Dto2D_shl<8>;
    p.cpy1Dto2D_shl[BLOCK_16x16] = cpy1Dto2D_shl<16>;
    p.cpy1Dto2D_shl[BLOCK_32x32] = cpy1Dto2D_shl<32>;
    p.cpy1Dto2D_shr[BLOCK_4x4] = cpy1Dto2D_shr<4>;
    p.cpy1Dto2D_shr[BLOCK_8x8] = cpy1Dto2D_shr<8>;
    p.cpy1Dto2D_shr[BLOCK_16x16] = cpy1Dto2D_shr<16>;
    p.cpy1Dto2D_shr[BLOCK_32x32] = cpy1Dto2D_shr<32>;

    p.sa8d[BLOCK_4x4] = satd_4x4;
    p.sa8d[BLOCK_8x8] = sa8d_8x8;
    p.sa8d[BLOCK_16x16] = sa8d_16x16;
    p.sa8d[BLOCK_32x32] = sa8d16<32, 32>;
    p.sa8d[BLOCK_64x64] = sa8d16<64, 64>;

    p.psy_cost_pp[BLOCK_4x4] = psyCost_pp<BLOCK_4x4>;
    p.psy_cost_pp[BLOCK_8x8] = psyCost_pp<BLOCK_8x8>;
    p.psy_cost_pp[BLOCK_16x16] = psyCost_pp<BLOCK_16x16>;
    p.psy_cost_pp[BLOCK_32x32] = psyCost_pp<BLOCK_32x32>;
    p.psy_cost_pp[BLOCK_64x64] = psyCost_pp<BLOCK_64x64>;

    p.psy_cost_ss[BLOCK_4x4] = psyCost_ss<BLOCK_4x4>;
    p.psy_cost_ss[BLOCK_8x8] = psyCost_ss<BLOCK_8x8>;
    p.psy_cost_ss[BLOCK_16x16] = psyCost_ss<BLOCK_16x16>;
    p.psy_cost_ss[BLOCK_32x32] = psyCost_ss<BLOCK_32x32>;
    p.psy_cost_ss[BLOCK_64x64] = psyCost_ss<BLOCK_64x64>;

    p.sa8d_inter[LUMA_4x4] = satd_4x4;
    p.sa8d_inter[LUMA_8x8] = sa8d_8x8;
    p.sa8d_inter[LUMA_8x4] = satd_8x4;
    p.sa8d_inter[LUMA_4x8] = satd4<4, 8>;
    p.sa8d_inter[LUMA_16x16] = sa8d_16x16;
    p.sa8d_inter[LUMA_16x8] = sa8d8<16, 8>;
    p.sa8d_inter[LUMA_8x16] = sa8d8<8, 16>;
    p.sa8d_inter[LUMA_16x12] = satd8<16, 12>;
    p.sa8d_inter[LUMA_12x16] = satd4<12, 16>;
    p.sa8d_inter[LUMA_4x16] = satd4<4, 16>;
    p.sa8d_inter[LUMA_16x4] = satd8<16, 4>;
    p.sa8d_inter[LUMA_32x32] = sa8d16<32, 32>;
    p.sa8d_inter[LUMA_32x16] = sa8d16<32, 16>;
    p.sa8d_inter[LUMA_16x32] = sa8d16<16, 32>;
    p.sa8d_inter[LUMA_32x24] = sa8d8<32, 24>;
    p.sa8d_inter[LUMA_24x32] = sa8d8<24, 32>;
    p.sa8d_inter[LUMA_32x8] = sa8d8<32, 8>;
    p.sa8d_inter[LUMA_8x32] = sa8d8<8, 32>;
    p.sa8d_inter[LUMA_64x64] = sa8d16<64, 64>;
    p.sa8d_inter[LUMA_64x32] = sa8d16<64, 32>;
    p.sa8d_inter[LUMA_32x64] = sa8d16<32, 64>;
    p.sa8d_inter[LUMA_64x48] = sa8d16<64, 48>;
    p.sa8d_inter[LUMA_48x64] = sa8d16<48, 64>;
    p.sa8d_inter[LUMA_64x16] = sa8d16<64, 16>;
    p.sa8d_inter[LUMA_16x64] = sa8d16<16, 64>;

    p.calcresidual[BLOCK_4x4] = getResidual<4>;
    p.calcresidual[BLOCK_8x8] = getResidual<8>;
    p.calcresidual[BLOCK_16x16] = getResidual<16>;
    p.calcresidual[BLOCK_32x32] = getResidual<32>;
    p.calcresidual[BLOCK_64x64] = NULL;

    p.transpose[BLOCK_4x4] = transpose<4>;
    p.transpose[BLOCK_8x8] = transpose<8>;
    p.transpose[BLOCK_16x16] = transpose<16>;
    p.transpose[BLOCK_32x32] = transpose<32>;
    p.transpose[BLOCK_64x64] = transpose<64>;

    p.ssd_s[BLOCK_4x4] = pixel_ssd_s_c<4>;
    p.ssd_s[BLOCK_8x8] = pixel_ssd_s_c<8>;
    p.ssd_s[BLOCK_16x16] = pixel_ssd_s_c<16>;
    p.ssd_s[BLOCK_32x32] = pixel_ssd_s_c<32>;

    p.weight_pp = weight_pp_c;
    p.weight_sp = weight_sp_c;

    p.scale1D_128to64 = scale1D_128to64;
    p.scale2D_64to32 = scale2D_64to32;
    p.frameInitLowres = frame_init_lowres_core;
    p.ssim_4x4x2_core = ssim_4x4x2_core;
    p.ssim_end_4 = ssim_end_4;

    p.var[BLOCK_8x8] = pixel_var<8>;
    p.var[BLOCK_16x16] = pixel_var<16>;
    p.var[BLOCK_32x32] = pixel_var<32>;
    p.var[BLOCK_64x64] = pixel_var<64>;
    p.planecopy_cp = planecopy_cp_c;
    p.planecopy_sp = planecopy_sp_c;
    p.propagateCost = estimateCUPropagateCost;
}
}