1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
3 *
4 * Authors: Steve Borho <steve@borho.org>
5 * Mandar Gurav <mandar@multicorewareinc.com>
6 * Mahesh Pittala <mahesh@multicorewareinc.com>
7 * Min Chen <min.chen@multicorewareinc.com>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 *
23 * This program is also available under a commercial proprietary license.
24 * For more information, contact us at license @ x265.com.
25 *****************************************************************************/
26
27 #include "common.h"
28 #include "primitives.h"
29 #include "x265.h"
30
31 #include <cstdlib> // abs()
32
33 using namespace x265;
34
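// These SET_FUNC_PRIMITIVE_TABLE macros instantiate a templated kernel for every
// luma partition size and store the resulting function pointers in the
// EncoderPrimitives table.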
35 #define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, DATA_TYPE1, DATA_TYPE2) \
36 p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX_DEF<4, 4, DATA_TYPE1, DATA_TYPE2>; \
37 p.FUNC_PREFIX[LUMA_8x8] = FUNC_PREFIX_DEF<8, 8, DATA_TYPE1, DATA_TYPE2>; \
38 p.FUNC_PREFIX[LUMA_8x4] = FUNC_PREFIX_DEF<8, 4, DATA_TYPE1, DATA_TYPE2>; \
39 p.FUNC_PREFIX[LUMA_4x8] = FUNC_PREFIX_DEF<4, 8, DATA_TYPE1, DATA_TYPE2>; \
40 p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
41 p.FUNC_PREFIX[LUMA_16x8] = FUNC_PREFIX_DEF<16, 8, DATA_TYPE1, DATA_TYPE2>; \
42 p.FUNC_PREFIX[LUMA_8x16] = FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
43 p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
44 p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
45 p.FUNC_PREFIX[LUMA_16x4] = FUNC_PREFIX_DEF<16, 4, DATA_TYPE1, DATA_TYPE2>; \
46 p.FUNC_PREFIX[LUMA_4x16] = FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
47 p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
48 p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
49 p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
50 p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
51 p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
52 p.FUNC_PREFIX[LUMA_32x8] = FUNC_PREFIX_DEF<32, 8, DATA_TYPE1, DATA_TYPE2>; \
53 p.FUNC_PREFIX[LUMA_8x32] = FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
54 p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
55 p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
56 p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
57 p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
58 p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
59 p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
60 p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
61
62 #define SET_FUNC_PRIMITIVE_TABLE_C2(FUNC_PREFIX) \
63 p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX<4, 4>; \
64 p.FUNC_PREFIX[LUMA_8x8] = FUNC_PREFIX<8, 8>; \
65 p.FUNC_PREFIX[LUMA_8x4] = FUNC_PREFIX<8, 4>; \
66 p.FUNC_PREFIX[LUMA_4x8] = FUNC_PREFIX<4, 8>; \
67 p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX<16, 16>; \
68 p.FUNC_PREFIX[LUMA_16x8] = FUNC_PREFIX<16, 8>; \
69 p.FUNC_PREFIX[LUMA_8x16] = FUNC_PREFIX<8, 16>; \
70 p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX<16, 12>; \
71 p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX<12, 16>; \
72 p.FUNC_PREFIX[LUMA_16x4] = FUNC_PREFIX<16, 4>; \
73 p.FUNC_PREFIX[LUMA_4x16] = FUNC_PREFIX<4, 16>; \
74 p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX<32, 32>; \
75 p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX<32, 16>; \
76 p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX<16, 32>; \
77 p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX<32, 24>; \
78 p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX<24, 32>; \
79 p.FUNC_PREFIX[LUMA_32x8] = FUNC_PREFIX<32, 8>; \
80 p.FUNC_PREFIX[LUMA_8x32] = FUNC_PREFIX<8, 32>; \
81 p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX<64, 64>; \
82 p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX<64, 32>; \
83 p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX<32, 64>; \
84 p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX<64, 48>; \
85 p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX<48, 64>; \
86 p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX<64, 16>; \
87 p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX<16, 64>;
88
89 namespace {
90 // place functions in anonymous namespace (file static)
91
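// sum of absolute differences over an lx-by-ly block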
92 template<int lx, int ly>
93 int sad(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
94 {
95 int sum = 0;
96
97 for (int y = 0; y < ly; y++)
98 {
99 for (int x = 0; x < lx; x++)
100 sum += abs(pix1[x] - pix2[x]);
101
102 pix1 += stride_pix1;
103 pix2 += stride_pix2;
104 }
105
106 return sum;
107 }
108
109 template<int lx, int ly>
110 int sad(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
111 {
112 int sum = 0;
113
114 for (int y = 0; y < ly; y++)
115 {
116 for (int x = 0; x < lx; x++)
117 sum += abs(pix1[x] - pix2[x]);
118
119 pix1 += stride_pix1;
120 pix2 += stride_pix2;
121 }
122
123 return sum;
124 }
125
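// SAD of one source block (rows FENC_STRIDE apart) against three reference
// blocks that share a common stride; the three sums are returned in res[0..2]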
126 template<int lx, int ly>
127 void sad_x3(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
128 {
129 res[0] = 0;
130 res[1] = 0;
131 res[2] = 0;
132 for (int y = 0; y < ly; y++)
133 {
134 for (int x = 0; x < lx; x++)
135 {
136 res[0] += abs(pix1[x] - pix2[x]);
137 res[1] += abs(pix1[x] - pix3[x]);
138 res[2] += abs(pix1[x] - pix4[x]);
139 }
140
141 pix1 += FENC_STRIDE;
142 pix2 += frefstride;
143 pix3 += frefstride;
144 pix4 += frefstride;
145 }
146 }
147
148 template<int lx, int ly>
149 void sad_x4(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
150 {
151 res[0] = 0;
152 res[1] = 0;
153 res[2] = 0;
154 res[3] = 0;
155 for (int y = 0; y < ly; y++)
156 {
157 for (int x = 0; x < lx; x++)
158 {
159 res[0] += abs(pix1[x] - pix2[x]);
160 res[1] += abs(pix1[x] - pix3[x]);
161 res[2] += abs(pix1[x] - pix4[x]);
162 res[3] += abs(pix1[x] - pix5[x]);
163 }
164
165 pix1 += FENC_STRIDE;
166 pix2 += frefstride;
167 pix3 += frefstride;
168 pix4 += frefstride;
169 pix5 += frefstride;
170 }
171 }
172
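// sum of squared errors; the T1/T2 parameters let one template serve the
// pixel/pixel, short/pixel and short/short variants registered below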
173 template<int lx, int ly, class T1, class T2>
174 int sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
175 {
176 int sum = 0;
177 int tmp;
178
179 for (int y = 0; y < ly; y++)
180 {
181 for (int x = 0; x < lx; x++)
182 {
183 tmp = pix1[x] - pix2[x];
184 sum += (tmp * tmp);
185 }
186
187 pix1 += stride_pix1;
188 pix2 += stride_pix2;
189 }
190
191 return sum;
192 }
193
194 #define BITS_PER_SUM (8 * sizeof(sum_t))
195
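// 4-point Hadamard butterfly: d0..d3 receive the four transform outputs for
// inputs s0..s3. In this file it is fed SWAR-packed operands (two
// BITS_PER_SUM-wide lanes per sum2_t), so each invocation handles two lanes.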
196 #define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) { \
197 sum2_t t0 = s0 + s1; \
198 sum2_t t1 = s0 - s1; \
199 sum2_t t2 = s2 + s3; \
200 sum2_t t3 = s2 - s3; \
201 d0 = t0 + t2; \
202 d2 = t0 - t2; \
203 d1 = t1 + t3; \
204 d3 = t1 - t3; \
205 }
206
207 // in: a pseudo-simd number of the form x+(y<<BITS_PER_SUM)
208 // return: abs(x)+(abs(y)<<BITS_PER_SUM)
209 inline sum2_t abs2(sum2_t a)
210 {
211 sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1);
212
213 return (a + s) ^ s;
214 }
215
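// 4x4 SATD: Hadamard-transform the 4x4 difference block and sum the absolute
// coefficients. The horizontal pass is folded into the SWAR packing (sum and
// difference share one sum2_t); the vertical pass uses HADAMARD4.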
216 int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
217 {
218 sum2_t tmp[4][2];
219 sum2_t a0, a1, a2, a3, b0, b1;
220 sum2_t sum = 0;
221
222 for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
223 {
224 a0 = pix1[0] - pix2[0];
225 a1 = pix1[1] - pix2[1];
226 b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
227 a2 = pix1[2] - pix2[2];
228 a3 = pix1[3] - pix2[3];
229 b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
230 tmp[i][0] = b0 + b1;
231 tmp[i][1] = b0 - b1;
232 }
233
234 for (int i = 0; i < 2; i++)
235 {
236 HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
237 a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
238 sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM);
239 }
240
241 return (int)(sum >> 1);
242 }
243
244 int satd_4x4(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
245 {
246 ssum2_t tmp[4][2];
247 ssum2_t a0, a1, a2, a3, b0, b1;
248 ssum2_t sum = 0;
249
250 for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
251 {
252 a0 = pix1[0] - pix2[0];
253 a1 = pix1[1] - pix2[1];
254 b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
255 a2 = pix1[2] - pix2[2];
256 a3 = pix1[3] - pix2[3];
257 b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
258 tmp[i][0] = b0 + b1;
259 tmp[i][1] = b0 - b1;
260 }
261
262 for (int i = 0; i < 2; i++)
263 {
264 HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
265 a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
266 sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM);
267 }
268
269 return (int)(sum >> 1);
270 }
271
272 // x264's SWAR version of satd 8x4, performs two 4x4 SATDs at once
273 int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
274 {
275 sum2_t tmp[4][4];
276 sum2_t a0, a1, a2, a3;
277 sum2_t sum = 0;
278
279 for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
280 {
281 a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);
282 a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);
283 a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);
284 a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);
285 HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
286 }
287
288 for (int i = 0; i < 4; i++)
289 {
290 HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
291 sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
292 }
293
294 return (((sum_t)sum) + (sum >> BITS_PER_SUM)) >> 1;
295 }
296
297 template<int w, int h>
298 // calculate satd in blocks of 4x4
299 int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
300 {
301 int satd = 0;
302
303 for (int row = 0; row < h; row += 4)
304 for (int col = 0; col < w; col += 4)
305 satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
306 pix2 + row * stride_pix2 + col, stride_pix2);
307
308 return satd;
309 }
310
311 template<int w, int h>
312 // calculate satd in blocks of 8x4
313 int satd8(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
314 {
315 int satd = 0;
316
317 for (int row = 0; row < h; row += 4)
318 for (int col = 0; col < w; col += 8)
319 satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
320 pix2 + row * stride_pix2 + col, stride_pix2);
321
322 return satd;
323 }
324
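// 8x8 SA8D core: sum of absolute 8x8 Hadamard-transformed differences, using
// the same SWAR packing as satd_4x4. Callers normalize with (x + 2) >> 2, as
// in sa8d_8x8 and sa8d_16x16 below.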
325 inline int _sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
326 {
327 sum2_t tmp[8][4];
328 sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
329 sum2_t sum = 0;
330
331 for (int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2)
332 {
333 a0 = pix1[0] - pix2[0];
334 a1 = pix1[1] - pix2[1];
335 b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
336 a2 = pix1[2] - pix2[2];
337 a3 = pix1[3] - pix2[3];
338 b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
339 a4 = pix1[4] - pix2[4];
340 a5 = pix1[5] - pix2[5];
341 b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM);
342 a6 = pix1[6] - pix2[6];
343 a7 = pix1[7] - pix2[7];
344 b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM);
345 HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3);
346 }
347
348 for (int i = 0; i < 4; i++)
349 {
350 HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
351 HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]);
352 b0 = abs2(a0 + a4) + abs2(a0 - a4);
353 b0 += abs2(a1 + a5) + abs2(a1 - a5);
354 b0 += abs2(a2 + a6) + abs2(a2 - a6);
355 b0 += abs2(a3 + a7) + abs2(a3 - a7);
356 sum += (sum_t)b0 + (b0 >> BITS_PER_SUM);
357 }
358
359 return (int)sum;
360 }
361
362 int sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
363 {
364 return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
365 }
366
367 inline int _sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2)
368 {
369 ssum2_t tmp[8][4];
370 ssum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
371 ssum2_t sum = 0;
372
373 for (int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2)
374 {
375 a0 = pix1[0] - pix2[0];
376 a1 = pix1[1] - pix2[1];
377 b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
378 a2 = pix1[2] - pix2[2];
379 a3 = pix1[3] - pix2[3];
380 b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
381 a4 = pix1[4] - pix2[4];
382 a5 = pix1[5] - pix2[5];
383 b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM);
384 a6 = pix1[6] - pix2[6];
385 a7 = pix1[7] - pix2[7];
386 b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM);
387 HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3);
388 }
389
390 for (int i = 0; i < 4; i++)
391 {
392 HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
393 HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]);
394 b0 = abs2(a0 + a4) + abs2(a0 - a4);
395 b0 += abs2(a1 + a5) + abs2(a1 - a5);
396 b0 += abs2(a2 + a6) + abs2(a2 - a6);
397 b0 += abs2(a3 + a7) + abs2(a3 - a7);
398 sum += (sum_t)b0 + (b0 >> BITS_PER_SUM);
399 }
400
401 return (int)sum;
402 }
403
404 int sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2)
405 {
406 return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
407 }
408
409 int sa8d_16x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
410 {
411 int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2)
412 + _sa8d_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2)
413 + _sa8d_8x8(pix1 + 8 * i_pix1, i_pix1, pix2 + 8 * i_pix2, i_pix2)
414 + _sa8d_8x8(pix1 + 8 + 8 * i_pix1, i_pix1, pix2 + 8 + 8 * i_pix2, i_pix2);
415
416 // This matches x264 sa8d_16x16, but is slightly different from HM's behavior because
417 // this version only rounds once at the end
418 return (sum + 2) >> 2;
419 }
420
421 template<int w, int h>
422 // Calculate sa8d in blocks of 8x8
423 int sa8d8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
424 {
425 int cost = 0;
426
427 for (int y = 0; y < h; y += 8)
428 for (int x = 0; x < w; x += 8)
429 cost += sa8d_8x8(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
430
431 return cost;
432 }
433
434 template<int w, int h>
435 // Calculate sa8d in blocks of 16x16
436 int sa8d16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
437 {
438 int cost = 0;
439
440 for (int y = 0; y < h; y += 16)
441 for (int x = 0; x < w; x += 16)
442 cost += sa8d_16x16(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
443
444 return cost;
445 }
446
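// sum of squared values of a coefficient/residual block (SSD against zero)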
447 template<int size>
448 int pixel_ssd_s_c(const int16_t* a, intptr_t dstride)
449 {
450 int sum = 0;
451 for (int y = 0; y < size; y++)
452 {
453 for (int x = 0; x < size; x++)
454 sum += a[x] * a[x];
455
456 a += dstride;
457 }
458 return sum;
459 }
460
461 template<int size>
462 void blockfil_s_c(int16_t* dst, intptr_t dstride, int16_t val)
463 {
464 for (int y = 0; y < size; y++)
465 for (int x = 0; x < size; x++)
466 dst[y * dstride + x] = val;
467 }
468
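// The cpy2Dto1D/cpy1Dto2D helpers below move coefficient blocks between a
// strided 2D layout and a packed 1D layout, applying either a plain left
// shift or a rounded right shift.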
469 template<int size>
470 void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
471 {
472 X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
473 X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
474 X265_CHECK(shift >= 0, "invalid shift\n");
475
476 for (int i = 0; i < size; i++)
477 {
478 for (int j = 0; j < size; j++)
479 dst[j] = src[j] << shift;
480
481 src += srcStride;
482 dst += size;
483 }
484 }
485
486 template<int size>
487 void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
488 {
489 X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
490 X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
491 X265_CHECK(shift > 0, "invalid shift\n");
492
493 int16_t round = 1 << (shift - 1);
494 for (int i = 0; i < size; i++)
495 {
496 for (int j = 0; j < size; j++)
497 dst[j] = (src[j] + round) >> shift;
498
499 src += srcStride;
500 dst += size;
501 }
502 }
503
504 template<int size>
505 void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
506 {
507 X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
508 X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
509 X265_CHECK(shift >= 0, "invalid shift\n");
510
511 for (int i = 0; i < size; i++)
512 {
513 for (int j = 0; j < size; j++)
514 dst[j] = src[j] << shift;
515
516 src += size;
517 dst += dstStride;
518 }
519 }
520
521 template<int size>
522 void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
523 {
524 X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
525 X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
526 X265_CHECK(shift > 0, "invalid shift\n");
527
528 int16_t round = 1 << (shift - 1);
529 for (int i = 0; i < size; i++)
530 {
531 for (int j = 0; j < size; j++)
532 dst[j] = (src[j] + round) >> shift;
533
534 src += size;
535 dst += dstStride;
536 }
537 }
538
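// prediction residual: fenc minus pred, widened to int16_t (all three
// buffers use the same stride)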
539 template<int blockSize>
540 void getResidual(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)
541 {
542 for (int y = 0; y < blockSize; y++)
543 {
544 for (int x = 0; x < blockSize; x++)
545 residual[x] = static_cast<int16_t>(fenc[x]) - static_cast<int16_t>(pred[x]);
546
547 fenc += stride;
548 residual += stride;
549 pred += stride;
550 }
551 }
552
553 template<int blockSize>
554 void transpose(pixel* dst, const pixel* src, intptr_t stride)
555 {
556 for (int k = 0; k < blockSize; k++)
557 for (int l = 0; l < blockSize; l++)
558 dst[k * blockSize + l] = src[l * stride + k];
559 }
560
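// weighted prediction from 16-bit intermediate samples to pixels:
// dst = clip(((w0 * (src + IF_INTERNAL_OFFS) + round) >> shift) + offset)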
561 void weight_sp_c(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
562 {
563 int x, y;
564
565 for (y = 0; y <= height - 1; y++)
566 {
567 for (x = 0; x <= width - 1; )
568 {
569 // note: width can be odd
570 dst[x] = (pixel)Clip3(0, ((1 << X265_DEPTH) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);
571 x++;
572 }
573
574 src += srcStride;
575 dst += dstStride;
576 }
577 }
578
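// weighted prediction applied directly to pixels; the source is first scaled
// up to the IF_INTERNAL_PREC intermediate depth (the "pixel to short
// conversion" noted in the loop) before the same weighting formula is applied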
579 void weight_pp_c(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
580 {
581 int x, y;
582
583 X265_CHECK(!(width & 15), "weightp alignment error\n");
584 X265_CHECK(!((w0 << 6) > 32767), "w0 using more than 16 bits, asm output will mismatch\n");
585 X265_CHECK(!(round > 32767), "round using more than 16 bits, asm output will mismatch\n");
586
587 for (y = 0; y <= height - 1; y++)
588 {
589 for (x = 0; x <= width - 1; )
590 {
591 // simulating pixel to short conversion
592 int16_t val = src[x] << (IF_INTERNAL_PREC - X265_DEPTH);
593 dst[x] = (pixel)Clip3(0, ((1 << X265_DEPTH) - 1), ((w0 * (val) + round) >> shift) + offset);
594 x++;
595 }
596
597 src += stride;
598 dst += stride;
599 }
600 }
601
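// rounded per-pixel average of two source blocks; the trailing int argument
// is unused in this C version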
602 template<int lx, int ly>
603 void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
604 {
605 for (int y = 0; y < ly; y++)
606 {
607 for (int x = 0; x < lx; x++)
608 dst[x] = (src0[x] + src1[x] + 1) >> 1;
609
610 src0 += sstride0;
611 src1 += sstride1;
612 dst += dstride;
613 }
614 }
615
616 void scale1D_128to64(pixel* dst, const pixel* src, intptr_t /*stride*/)
617 {
618 int x;
619
620 for (x = 0; x < 128; x += 2)
621 {
622 pixel pix0 = src[(x + 0)];
623 pixel pix1 = src[(x + 1)];
624 int sum = pix0 + pix1;
625
626 dst[x >> 1] = (pixel)((sum + 1) >> 1);
627 }
628 }
629
630 void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
631 {
632 uint32_t x, y;
633
634 for (y = 0; y < 64; y += 2)
635 {
636 for (x = 0; x < 64; x += 2)
637 {
638 pixel pix0 = src[(y + 0) * stride + (x + 0)];
639 pixel pix1 = src[(y + 0) * stride + (x + 1)];
640 pixel pix2 = src[(y + 1) * stride + (x + 0)];
641 pixel pix3 = src[(y + 1) * stride + (x + 1)];
642 int sum = pix0 + pix1 + pix2 + pix3;
643
644 dst[y / 2 * 32 + x / 2] = (pixel)((sum + 2) >> 2);
645 }
646 }
647 }
648
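// Build the four half-resolution planes used for lowres (lookahead) analysis:
// each output pixel averages a 2x2 source neighborhood, with dst0/dsth/dstv/dstc
// taken at the integer, half-H, half-V and half-HV phase offsets respectively.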
649 void frame_init_lowres_core(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc,
650 intptr_t src_stride, intptr_t dst_stride, int width, int height)
651 {
652 for (int y = 0; y < height; y++)
653 {
654 const pixel* src1 = src0 + src_stride;
655 const pixel* src2 = src1 + src_stride;
656 for (int x = 0; x < width; x++)
657 {
658 // slower than naive bilinear, but matches asm
659 #define FILTER(a, b, c, d) ((((a + b + 1) >> 1) + ((c + d + 1) >> 1) + 1) >> 1)
660 dst0[x] = FILTER(src0[2 * x], src1[2 * x], src0[2 * x + 1], src1[2 * x + 1]);
661 dsth[x] = FILTER(src0[2 * x + 1], src1[2 * x + 1], src0[2 * x + 2], src1[2 * x + 2]);
662 dstv[x] = FILTER(src1[2 * x], src2[2 * x], src1[2 * x + 1], src2[2 * x + 1]);
663 dstc[x] = FILTER(src1[2 * x + 1], src2[2 * x + 1], src1[2 * x + 2], src2[2 * x + 2]);
664 #undef FILTER
665 }
666 src0 += src_stride * 2;
667 dst0 += dst_stride;
668 dsth += dst_stride;
669 dstv += dst_stride;
670 dstc += dst_stride;
671 }
672 }
673
674 /* structural similarity metric */
675 void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
676 {
677 for (int z = 0; z < 2; z++)
678 {
679 uint32_t s1 = 0, s2 = 0, ss = 0, s12 = 0;
680 for (int y = 0; y < 4; y++)
681 {
682 for (int x = 0; x < 4; x++)
683 {
684 int a = pix1[x + y * stride1];
685 int b = pix2[x + y * stride2];
686 s1 += a;
687 s2 += b;
688 ss += a * a;
689 ss += b * b;
690 s12 += a * b;
691 }
692 }
693
694 sums[z][0] = s1;
695 sums[z][1] = s2;
696 sums[z][2] = ss;
697 sums[z][3] = s12;
698 pix1 += 4;
699 pix2 += 4;
700 }
701 }
702
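// Evaluate the SSIM formula from statistics accumulated over 64 pixels (four
// overlapping 4x4 windows): (2*s1*s2 + C1)(2*cov + C2) / ((s1^2 + s2^2 + C1)(var + C2)),
// working on the raw sums with correspondingly scaled C1/C2 constants rather
// than on normalized means and variances.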
703 float ssim_end_1(int s1, int s2, int ss, int s12)
704 {
705 /* Maximum value for 10-bit is: ss*64 = (2^10-1)^2*16*4*64 = 4286582784, which will overflow in some cases.
706 * s1*s1, s2*s2, and s1*s2 also obtain this value for edge cases: ((2^10-1)*16*4)^2 = 4286582784.
707 * Maximum value for 9-bit is: ss*64 = (2^9-1)^2*16*4*64 = 1069551616, which will not overflow. */
708
709 #define PIXEL_MAX ((1 << X265_DEPTH) - 1)
710 #if HIGH_BIT_DEPTH
711 X265_CHECK(X265_DEPTH == 10, "ssim invalid depth\n");
712 #define type float
713 static const float ssim_c1 = (float)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64);
714 static const float ssim_c2 = (float)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63);
715 #else
716 X265_CHECK(X265_DEPTH == 8, "ssim invalid depth\n");
717 #define type int
718 static const int ssim_c1 = (int)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64 + .5);
719 static const int ssim_c2 = (int)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63 + .5);
720 #endif
721 type fs1 = (type)s1;
722 type fs2 = (type)s2;
723 type fss = (type)ss;
724 type fs12 = (type)s12;
725 type vars = (type)(fss * 64 - fs1 * fs1 - fs2 * fs2);
726 type covar = (type)(fs12 * 64 - fs1 * fs2);
727 return (float)(2 * fs1 * fs2 + ssim_c1) * (float)(2 * covar + ssim_c2)
728 / ((float)(fs1 * fs1 + fs2 * fs2 + ssim_c1) * (float)(vars + ssim_c2));
729 #undef type
730 #undef PIXEL_MAX
731 }
732
733 float ssim_end_4(int sum0[5][4], int sum1[5][4], int width)
734 {
735 float ssim = 0.0;
736
737 for (int i = 0; i < width; i++)
738 {
739 ssim += ssim_end_1(sum0[i][0] + sum0[i + 1][0] + sum1[i][0] + sum1[i + 1][0],
740 sum0[i][1] + sum0[i + 1][1] + sum1[i][1] + sum1[i + 1][1],
741 sum0[i][2] + sum0[i + 1][2] + sum1[i][2] + sum1[i + 1][2],
742 sum0[i][3] + sum0[i + 1][3] + sum1[i][3] + sum1[i + 1][3]);
743 }
744
745 return ssim;
746 }
747
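// block "variance" helper: returns the pixel sum in the low 32 bits and the
// sum of squared pixels in the high 32 bits, leaving the variance computation
// to the caller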
748 template<int size>
749 uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
750 {
751 uint32_t sum = 0, sqr = 0;
752
753 for (int y = 0; y < size; y++)
754 {
755 for (int x = 0; x < size; x++)
756 {
757 sum += pix[x];
758 sqr += pix[x] * pix[x];
759 }
760
761 pix += i_stride;
762 }
763
764 return sum + ((uint64_t)sqr << 32);
765 }
766
767 #if defined(_MSC_VER)
768 #pragma warning(disable: 4127) // conditional expression is constant
769 #endif
770
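// Psycho-visual cost: compares the AC energy of the source and reconstructed
// blocks. The template argument is a BLOCK_NxN enum index (BLOCK_4x4 == 0),
// so the block width is 1 << (size + 2); 4x4 falls back to satd_4x4 because
// sa8d needs at least 8x8.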
771 template<int size>
772 int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
773 {
774 static pixel zeroBuf[8] /* = { 0 } */;
775
776 if (size)
777 {
778 int dim = 1 << (size + 2);
779 uint32_t totEnergy = 0;
780 for (int i = 0; i < dim; i += 8)
781 {
782             for (int j = 0; j < dim; j += 8)
783 {
784 /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
785 int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride, zeroBuf, 0) -
786 (sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
787 int reconEnergy = sa8d_8x8(recon + i * rstride + j, rstride, zeroBuf, 0) -
788 (sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);
789
790 totEnergy += abs(sourceEnergy - reconEnergy);
791 }
792 }
793 return totEnergy;
794 }
795 else
796 {
797 /* 4x4 is too small for sa8d */
798 int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
799 int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
800 return abs(sourceEnergy - reconEnergy);
801 }
802 }
803
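// same AC-energy comparison as psyCost_pp, but on 16-bit inputs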
804 template<int size>
805 int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
806 {
807 static int16_t zeroBuf[8] /* = { 0 } */;
808
809 if (size)
810 {
811 int dim = 1 << (size + 2);
812 uint32_t totEnergy = 0;
813 for (int i = 0; i < dim; i += 8)
814 {
815             for (int j = 0; j < dim; j += 8)
816 {
817 /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
818 int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride, zeroBuf, 0) -
819 (sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
820 int reconEnergy = sa8d_8x8(recon + i * rstride + j, rstride, zeroBuf, 0) -
821 (sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);
822
823 totEnergy += abs(sourceEnergy - reconEnergy);
824 }
825 }
826 return totEnergy;
827 }
828 else
829 {
830 /* 4x4 is too small for sa8d */
831 int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
832 int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
833 return abs(sourceEnergy - reconEnergy);
834 }
835 }
836
837 template<int bx, int by>
838 void blockcopy_pp_c(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb)
839 {
840 for (int y = 0; y < by; y++)
841 {
842 for (int x = 0; x < bx; x++)
843 a[x] = b[x];
844
845 a += stridea;
846 b += strideb;
847 }
848 }
849
850 template<int bx, int by>
851 void blockcopy_ss_c(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
852 {
853 for (int y = 0; y < by; y++)
854 {
855 for (int x = 0; x < bx; x++)
856 a[x] = b[x];
857
858 a += stridea;
859 b += strideb;
860 }
861 }
862
863 template<int bx, int by>
864 void blockcopy_sp_c(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
865 {
866 for (int y = 0; y < by; y++)
867 {
868 for (int x = 0; x < bx; x++)
869 {
870 X265_CHECK((b[x] >= 0) && (b[x] <= ((1 << X265_DEPTH) - 1)), "blockcopy pixel size fail\n");
871 a[x] = (pixel)b[x];
872 }
873
874 a += stridea;
875 b += strideb;
876 }
877 }
878
879 template<int bx, int by>
880 void blockcopy_ps_c(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
881 {
882 for (int y = 0; y < by; y++)
883 {
884 for (int x = 0; x < bx; x++)
885 a[x] = (int16_t)b[x];
886
887 a += stridea;
888 b += strideb;
889 }
890 }
891
892 template<int bx, int by>
893 void pixel_sub_ps_c(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1)
894 {
895 for (int y = 0; y < by; y++)
896 {
897 for (int x = 0; x < bx; x++)
898 a[x] = (int16_t)(b0[x] - b1[x]);
899
900 b0 += sstride0;
901 b1 += sstride1;
902 a += dstride;
903 }
904 }
905
906 template<int bx, int by>
907 void pixel_add_ps_c(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1)
908 {
909 for (int y = 0; y < by; y++)
910 {
911 for (int x = 0; x < bx; x++)
912 a[x] = Clip(b0[x] + b1[x]);
913
914 b0 += sstride0;
915 b1 += sstride1;
916 a += dstride;
917 }
918 }
919
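// average two 16-bit intermediate prediction blocks and round/clip the result
// back to pixel depth (bi-prediction merge)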
920 template<int bx, int by>
921 void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
922 {
923 int shiftNum, offset;
924
925 shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
926 offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
927
928 for (int y = 0; y < by; y++)
929 {
930 for (int x = 0; x < bx; x += 2)
931 {
932 dst[x + 0] = Clip((src0[x + 0] + src1[x + 0] + offset) >> shiftNum);
933 dst[x + 1] = Clip((src0[x + 1] + src1[x + 1] + offset) >> shiftNum);
934 }
935
936 src0 += src0Stride;
937 src1 += src1Stride;
938 dst += dstStride;
939 }
940 }
941
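// widen an 8-bit plane into the pixel type, shifting up by 'shift' bits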
942 void planecopy_cp_c(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
943 {
944 for (int r = 0; r < height; r++)
945 {
946 for (int c = 0; c < width; c++)
947 dst[c] = ((pixel)src[c]) << shift;
948
949 dst += dstStride;
950 src += srcStride;
951 }
952 }
953
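// narrow a 16-bit plane to pixels: shift right by 'shift', then mask to the
// target bit depth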
954 void planecopy_sp_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
955 {
956 for (int r = 0; r < height; r++)
957 {
958 for (int c = 0; c < width; c++)
959 dst[c] = (pixel)((src[c] >> shift) & mask);
960
961 dst += dstStride;
962 src += srcStride;
963 }
964 }
965
966 /* Estimate the total amount of influence on future quality that could be had if we
967 * were to improve the reference samples used to inter predict any given CU. */
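/* Note: the division by 256 suggests fpsFactor arrives in 8.8 fixed point, and
 * the (1 << 14) - 1 mask appears to strip reference-list bits from interCosts,
 * following the x264 lookahead convention; both are inferences, not guarantees. */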
968 void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
969 const int32_t* invQscales, const double* fpsFactor, int len)
970 {
971 double fps = *fpsFactor / 256;
972
973 for (int i = 0; i < len; i++)
974 {
975 double intraCost = intraCosts[i] * invQscales[i];
976 double propagateAmount = (double)propagateIn[i] + intraCost * fps;
977 double propagateNum = (double)intraCosts[i] - (interCosts[i] & ((1 << 14) - 1));
978 double propagateDenom = (double)intraCosts[i];
979 dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5);
980 }
981 }
982 } // end anonymous namespace
983
984 namespace x265 {
985 // x265 private namespace
986
987 /* Extend the edges of a picture so that it may safely be used for motion
988 * compensation. This function assumes the picture is stored in a buffer with
989 * sufficient padding for the X and Y margins */
990 void extendPicBorder(pixel* pic, intptr_t stride, int width, int height, int marginX, int marginY)
991 {
992 /* extend left and right margins */
993 primitives.extendRowBorder(pic, stride, width, height, marginX);
994
995 /* copy top row to create above margin */
996 pixel* top = pic - marginX;
997 for (int y = 0; y < marginY; y++)
998 memcpy(top - (y + 1) * stride, top, stride * sizeof(pixel));
999
1000 /* copy bottom row to create below margin */
1001 pixel* bot = pic - marginX + (height - 1) * stride;
1002 for (int y = 0; y < marginY; y++)
1003 memcpy(bot + (y + 1) * stride, bot, stride * sizeof(pixel));
1004 }
1005
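/* Rough usage sketch (not part of this file): primitives.cpp installs these C
 * reference kernels first, then lets the intrinsic/assembly setup routines
 * overwrite whichever entries they implement, approximately:
 *
 *     EncoderPrimitives p;
 *     memset(&p, 0, sizeof(p));
 *     Setup_C_Primitives(p);                   // calls Setup_C_PixelPrimitives(p)
 *     Setup_Assembly_Primitives(p, cpuMask);   // asm overrides, when built
 *
 * The names other than Setup_C_PixelPrimitives are recalled from the x265 1.x
 * primitives.cpp and may differ slightly. */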
1006 /* Initialize entries for pixel functions defined in this file */
1007 void Setup_C_PixelPrimitives(EncoderPrimitives &p)
1008 {
1009 SET_FUNC_PRIMITIVE_TABLE_C2(sad)
1010 SET_FUNC_PRIMITIVE_TABLE_C2(sad_x3)
1011 SET_FUNC_PRIMITIVE_TABLE_C2(sad_x4)
1012 SET_FUNC_PRIMITIVE_TABLE_C2(pixelavg_pp)
1013
1014 // satd
1015 p.satd[LUMA_4x4] = satd_4x4;
1016 p.satd[LUMA_8x8] = satd8<8, 8>;
1017 p.satd[LUMA_8x4] = satd_8x4;
1018 p.satd[LUMA_4x8] = satd4<4, 8>;
1019 p.satd[LUMA_16x16] = satd8<16, 16>;
1020 p.satd[LUMA_16x8] = satd8<16, 8>;
1021 p.satd[LUMA_8x16] = satd8<8, 16>;
1022 p.satd[LUMA_16x12] = satd8<16, 12>;
1023 p.satd[LUMA_12x16] = satd4<12, 16>;
1024 p.satd[LUMA_16x4] = satd8<16, 4>;
1025 p.satd[LUMA_4x16] = satd4<4, 16>;
1026 p.satd[LUMA_32x32] = satd8<32, 32>;
1027 p.satd[LUMA_32x16] = satd8<32, 16>;
1028 p.satd[LUMA_16x32] = satd8<16, 32>;
1029 p.satd[LUMA_32x24] = satd8<32, 24>;
1030 p.satd[LUMA_24x32] = satd8<24, 32>;
1031 p.satd[LUMA_32x8] = satd8<32, 8>;
1032 p.satd[LUMA_8x32] = satd8<8, 32>;
1033 p.satd[LUMA_64x64] = satd8<64, 64>;
1034 p.satd[LUMA_64x32] = satd8<64, 32>;
1035 p.satd[LUMA_32x64] = satd8<32, 64>;
1036 p.satd[LUMA_64x48] = satd8<64, 48>;
1037 p.satd[LUMA_48x64] = satd8<48, 64>;
1038 p.satd[LUMA_64x16] = satd8<64, 16>;
1039 p.satd[LUMA_16x64] = satd8<16, 64>;
1040
1041 p.chroma[X265_CSP_I420].satd[CHROMA_2x2] = NULL;
1042 p.chroma[X265_CSP_I420].satd[CHROMA_4x4] = satd_4x4;
1043 p.chroma[X265_CSP_I420].satd[CHROMA_8x8] = satd8<8, 8>;
1044 p.chroma[X265_CSP_I420].satd[CHROMA_16x16] = satd8<16, 16>;
1045 p.chroma[X265_CSP_I420].satd[CHROMA_32x32] = satd8<32, 32>;
1046
1047 p.chroma[X265_CSP_I420].satd[CHROMA_4x2] = NULL;
1048 p.chroma[X265_CSP_I420].satd[CHROMA_2x4] = NULL;
1049 p.chroma[X265_CSP_I420].satd[CHROMA_8x4] = satd_8x4;
1050 p.chroma[X265_CSP_I420].satd[CHROMA_4x8] = satd4<4, 8>;
1051 p.chroma[X265_CSP_I420].satd[CHROMA_16x8] = satd8<16, 8>;
1052 p.chroma[X265_CSP_I420].satd[CHROMA_8x16] = satd8<8, 16>;
1053 p.chroma[X265_CSP_I420].satd[CHROMA_32x16] = satd8<32, 16>;
1054 p.chroma[X265_CSP_I420].satd[CHROMA_16x32] = satd8<16, 32>;
1055
1056 p.chroma[X265_CSP_I420].satd[CHROMA_8x6] = NULL;
1057 p.chroma[X265_CSP_I420].satd[CHROMA_6x8] = NULL;
1058 p.chroma[X265_CSP_I420].satd[CHROMA_8x2] = NULL;
1059 p.chroma[X265_CSP_I420].satd[CHROMA_2x8] = NULL;
1060 p.chroma[X265_CSP_I420].satd[CHROMA_16x12] = satd4<16, 12>;
1061 p.chroma[X265_CSP_I420].satd[CHROMA_12x16] = satd4<12, 16>;
1062 p.chroma[X265_CSP_I420].satd[CHROMA_16x4] = satd4<16, 4>;
1063 p.chroma[X265_CSP_I420].satd[CHROMA_4x16] = satd4<4, 16>;
1064 p.chroma[X265_CSP_I420].satd[CHROMA_32x24] = satd8<32, 24>;
1065 p.chroma[X265_CSP_I420].satd[CHROMA_24x32] = satd8<24, 32>;
1066 p.chroma[X265_CSP_I420].satd[CHROMA_32x8] = satd8<32, 8>;
1067 p.chroma[X265_CSP_I420].satd[CHROMA_8x32] = satd8<8, 32>;
1068
1069 p.chroma[X265_CSP_I422].satd[CHROMA422_2x4] = NULL;
1070 p.chroma[X265_CSP_I422].satd[CHROMA422_4x8] = satd4<4, 8>;
1071 p.chroma[X265_CSP_I422].satd[CHROMA422_8x16] = satd8<8, 16>;
1072 p.chroma[X265_CSP_I422].satd[CHROMA422_16x32] = satd8<16, 32>;
1073 p.chroma[X265_CSP_I422].satd[CHROMA422_32x64] = satd8<32, 64>;
1074
1075 p.chroma[X265_CSP_I422].satd[CHROMA422_4x4] = satd_4x4;
1076 p.chroma[X265_CSP_I422].satd[CHROMA422_2x8] = NULL;
1077 p.chroma[X265_CSP_I422].satd[CHROMA422_8x8] = satd8<8, 8>;
1078 p.chroma[X265_CSP_I422].satd[CHROMA422_4x16] = satd4<4, 16>;
1079 p.chroma[X265_CSP_I422].satd[CHROMA422_16x16] = satd8<16, 16>;
1080 p.chroma[X265_CSP_I422].satd[CHROMA422_8x32] = satd8<8, 32>;
1081 p.chroma[X265_CSP_I422].satd[CHROMA422_32x32] = satd8<32, 32>;
1082 p.chroma[X265_CSP_I422].satd[CHROMA422_16x64] = satd8<16, 64>;
1083
1084 p.chroma[X265_CSP_I422].satd[CHROMA422_8x12] = satd4<8, 12>;
1085 p.chroma[X265_CSP_I422].satd[CHROMA422_6x16] = NULL;
1086 p.chroma[X265_CSP_I422].satd[CHROMA422_8x4] = satd4<8, 4>;
1087 p.chroma[X265_CSP_I422].satd[CHROMA422_2x16] = NULL;
1088 p.chroma[X265_CSP_I422].satd[CHROMA422_16x24] = satd8<16, 24>;
1089 p.chroma[X265_CSP_I422].satd[CHROMA422_12x32] = satd4<12, 32>;
1090 p.chroma[X265_CSP_I422].satd[CHROMA422_16x8] = satd8<16, 8>;
1091 p.chroma[X265_CSP_I422].satd[CHROMA422_4x32] = satd4<4, 32>;
1092 p.chroma[X265_CSP_I422].satd[CHROMA422_32x48] = satd8<32, 48>;
1093 p.chroma[X265_CSP_I422].satd[CHROMA422_24x64] = satd8<24, 64>;
1094 p.chroma[X265_CSP_I422].satd[CHROMA422_32x16] = satd8<32, 16>;
1095 p.chroma[X265_CSP_I422].satd[CHROMA422_8x64] = satd8<8, 64>;
1096
1097 #define CHROMA_420(W, H) \
1098 p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H] = addAvg<W, H>; \
1099 p.chroma[X265_CSP_I420].copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
1100 p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
1101 p.chroma[X265_CSP_I420].copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
1102 p.chroma[X265_CSP_I420].copy_ss[CHROMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
1103
1104 #define CHROMA_422(W, H) \
1105 p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = addAvg<W, H>; \
1106 p.chroma[X265_CSP_I422].copy_pp[CHROMA422_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
1107 p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
1108 p.chroma[X265_CSP_I422].copy_ps[CHROMA422_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
1109 p.chroma[X265_CSP_I422].copy_ss[CHROMA422_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
1110
1111 #define CHROMA_444(W, H) \
1112 p.chroma[X265_CSP_I444].satd[LUMA_ ## W ## x ## H] = p.satd[LUMA_ ## W ## x ## H]; \
1113 p.chroma[X265_CSP_I444].addAvg[LUMA_ ## W ## x ## H] = addAvg<W, H>; \
1114 p.chroma[X265_CSP_I444].copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
1115 p.chroma[X265_CSP_I444].copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
1116 p.chroma[X265_CSP_I444].copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
1117 p.chroma[X265_CSP_I444].copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
1118
1119 #define LUMA(W, H) \
1120 p.luma_addAvg[LUMA_ ## W ## x ## H] = addAvg<W, H>; \
1121 p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
1122 p.luma_copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
1123 p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
1124 p.luma_copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
1125
1126 #define LUMA_PIXELSUB(W, H) \
1127 p.luma_sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
1128 p.luma_add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
1129
1130 #define CHROMA_PIXELSUB_420(W, H) \
1131 p.chroma[X265_CSP_I420].sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
1132 p.chroma[X265_CSP_I420].add_ps[CHROMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
1133
1134 #define CHROMA_PIXELSUB_422(W, H) \
1135 p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
1136 p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
1137
1138 #define CHROMA_PIXELSUB_444(W, H) \
1139 p.chroma[X265_CSP_I444].sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
1140 p.chroma[X265_CSP_I444].add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
1141
1142 LUMA(4, 4);
1143 LUMA(8, 8);
1144 CHROMA_420(4, 4);
1145 LUMA(4, 8);
1146 CHROMA_420(2, 4);
1147 LUMA(8, 4);
1148 CHROMA_420(4, 2);
1149 LUMA(16, 16);
1150 CHROMA_420(8, 8);
1151 LUMA(16, 8);
1152 CHROMA_420(8, 4);
1153 LUMA(8, 16);
1154 CHROMA_420(4, 8);
1155 LUMA(16, 12);
1156 CHROMA_420(8, 6);
1157 LUMA(12, 16);
1158 CHROMA_420(6, 8);
1159 LUMA(16, 4);
1160 CHROMA_420(8, 2);
1161 LUMA(4, 16);
1162 CHROMA_420(2, 8);
1163 LUMA(32, 32);
1164 CHROMA_420(16, 16);
1165 LUMA(32, 16);
1166 CHROMA_420(16, 8);
1167 LUMA(16, 32);
1168 CHROMA_420(8, 16);
1169 LUMA(32, 24);
1170 CHROMA_420(16, 12);
1171 LUMA(24, 32);
1172 CHROMA_420(12, 16);
1173 LUMA(32, 8);
1174 CHROMA_420(16, 4);
1175 LUMA(8, 32);
1176 CHROMA_420(4, 16);
1177 LUMA(64, 64);
1178 CHROMA_420(32, 32);
1179 LUMA(64, 32);
1180 CHROMA_420(32, 16);
1181 LUMA(32, 64);
1182 CHROMA_420(16, 32);
1183 LUMA(64, 48);
1184 CHROMA_420(32, 24);
1185 LUMA(48, 64);
1186 CHROMA_420(24, 32);
1187 LUMA(64, 16);
1188 CHROMA_420(32, 8);
1189 LUMA(16, 64);
1190 CHROMA_420(8, 32);
1191
1192 LUMA_PIXELSUB(4, 4);
1193 LUMA_PIXELSUB(8, 8);
1194 LUMA_PIXELSUB(16, 16);
1195 LUMA_PIXELSUB(32, 32);
1196 LUMA_PIXELSUB(64, 64);
1197 CHROMA_PIXELSUB_420(4, 4)
1198 CHROMA_PIXELSUB_420(8, 8)
1199 CHROMA_PIXELSUB_420(16, 16)
1200 CHROMA_PIXELSUB_420(32, 32)
1201 CHROMA_PIXELSUB_422(4, 8)
1202 CHROMA_PIXELSUB_422(8, 16)
1203 CHROMA_PIXELSUB_422(16, 32)
1204 CHROMA_PIXELSUB_422(32, 64)
1205 CHROMA_PIXELSUB_444(8, 8)
1206 CHROMA_PIXELSUB_444(16, 16)
1207 CHROMA_PIXELSUB_444(32, 32)
1208 CHROMA_PIXELSUB_444(64, 64)
1209
1210 CHROMA_422(4, 8);
1211 CHROMA_422(4, 4);
1212 CHROMA_422(2, 8);
1213 CHROMA_422(8, 16);
1214 CHROMA_422(8, 8);
1215 CHROMA_422(4, 16);
1216 CHROMA_422(8, 12);
1217 CHROMA_422(6, 16);
1218 CHROMA_422(8, 4);
1219 CHROMA_422(2, 16);
1220 CHROMA_422(16, 32);
1221 CHROMA_422(16, 16);
1222 CHROMA_422(8, 32);
1223 CHROMA_422(16, 24);
1224 CHROMA_422(12, 32);
1225 CHROMA_422(16, 8);
1226 CHROMA_422(4, 32);
1227 CHROMA_422(32, 64);
1228 CHROMA_422(32, 32);
1229 CHROMA_422(16, 64);
1230 CHROMA_422(32, 48);
1231 CHROMA_422(24, 64);
1232 CHROMA_422(32, 16);
1233 CHROMA_422(8, 64);
1234
1235 CHROMA_444(4, 4);
1236 CHROMA_444(8, 8);
1237 CHROMA_444(4, 8);
1238 CHROMA_444(8, 4);
1239 CHROMA_444(16, 16);
1240 CHROMA_444(16, 8);
1241 CHROMA_444(8, 16);
1242 CHROMA_444(16, 12);
1243 CHROMA_444(12, 16);
1244 CHROMA_444(16, 4);
1245 CHROMA_444(4, 16);
1246 CHROMA_444(32, 32);
1247 CHROMA_444(32, 16);
1248 CHROMA_444(16, 32);
1249 CHROMA_444(32, 24);
1250 CHROMA_444(24, 32);
1251 CHROMA_444(32, 8);
1252 CHROMA_444(8, 32);
1253 CHROMA_444(64, 64);
1254 CHROMA_444(64, 32);
1255 CHROMA_444(32, 64);
1256 CHROMA_444(64, 48);
1257 CHROMA_444(48, 64);
1258 CHROMA_444(64, 16);
1259 CHROMA_444(16, 64);
1260
1261 SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixel, pixel)
1262 SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, int16_t, pixel)
1263 SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, int16_t, int16_t)
1264
1265 p.blockfill_s[BLOCK_4x4] = blockfil_s_c<4>;
1266 p.blockfill_s[BLOCK_8x8] = blockfil_s_c<8>;
1267 p.blockfill_s[BLOCK_16x16] = blockfil_s_c<16>;
1268 p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>;
1269 p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>;
1270
1271 p.cpy2Dto1D_shl[BLOCK_4x4] = cpy2Dto1D_shl<4>;
1272 p.cpy2Dto1D_shl[BLOCK_8x8] = cpy2Dto1D_shl<8>;
1273 p.cpy2Dto1D_shl[BLOCK_16x16] = cpy2Dto1D_shl<16>;
1274 p.cpy2Dto1D_shl[BLOCK_32x32] = cpy2Dto1D_shl<32>;
1275 p.cpy2Dto1D_shr[BLOCK_4x4] = cpy2Dto1D_shr<4>;
1276 p.cpy2Dto1D_shr[BLOCK_8x8] = cpy2Dto1D_shr<8>;
1277 p.cpy2Dto1D_shr[BLOCK_16x16] = cpy2Dto1D_shr<16>;
1278 p.cpy2Dto1D_shr[BLOCK_32x32] = cpy2Dto1D_shr<32>;
1279 p.cpy1Dto2D_shl[BLOCK_4x4] = cpy1Dto2D_shl<4>;
1280 p.cpy1Dto2D_shl[BLOCK_8x8] = cpy1Dto2D_shl<8>;
1281 p.cpy1Dto2D_shl[BLOCK_16x16] = cpy1Dto2D_shl<16>;
1282 p.cpy1Dto2D_shl[BLOCK_32x32] = cpy1Dto2D_shl<32>;
1283 p.cpy1Dto2D_shr[BLOCK_4x4] = cpy1Dto2D_shr<4>;
1284 p.cpy1Dto2D_shr[BLOCK_8x8] = cpy1Dto2D_shr<8>;
1285 p.cpy1Dto2D_shr[BLOCK_16x16] = cpy1Dto2D_shr<16>;
1286 p.cpy1Dto2D_shr[BLOCK_32x32] = cpy1Dto2D_shr<32>;
1287
1288 p.sa8d[BLOCK_4x4] = satd_4x4;
1289 p.sa8d[BLOCK_8x8] = sa8d_8x8;
1290 p.sa8d[BLOCK_16x16] = sa8d_16x16;
1291 p.sa8d[BLOCK_32x32] = sa8d16<32, 32>;
1292 p.sa8d[BLOCK_64x64] = sa8d16<64, 64>;
1293
1294 p.psy_cost_pp[BLOCK_4x4] = psyCost_pp<BLOCK_4x4>;
1295 p.psy_cost_pp[BLOCK_8x8] = psyCost_pp<BLOCK_8x8>;
1296 p.psy_cost_pp[BLOCK_16x16] = psyCost_pp<BLOCK_16x16>;
1297 p.psy_cost_pp[BLOCK_32x32] = psyCost_pp<BLOCK_32x32>;
1298 p.psy_cost_pp[BLOCK_64x64] = psyCost_pp<BLOCK_64x64>;
1299
1300 p.psy_cost_ss[BLOCK_4x4] = psyCost_ss<BLOCK_4x4>;
1301 p.psy_cost_ss[BLOCK_8x8] = psyCost_ss<BLOCK_8x8>;
1302 p.psy_cost_ss[BLOCK_16x16] = psyCost_ss<BLOCK_16x16>;
1303 p.psy_cost_ss[BLOCK_32x32] = psyCost_ss<BLOCK_32x32>;
1304 p.psy_cost_ss[BLOCK_64x64] = psyCost_ss<BLOCK_64x64>;
1305
1306 p.sa8d_inter[LUMA_4x4] = satd_4x4;
1307 p.sa8d_inter[LUMA_8x8] = sa8d_8x8;
1308 p.sa8d_inter[LUMA_8x4] = satd_8x4;
1309 p.sa8d_inter[LUMA_4x8] = satd4<4, 8>;
1310 p.sa8d_inter[LUMA_16x16] = sa8d_16x16;
1311 p.sa8d_inter[LUMA_16x8] = sa8d8<16, 8>;
1312 p.sa8d_inter[LUMA_8x16] = sa8d8<8, 16>;
1313 p.sa8d_inter[LUMA_16x12] = satd8<16, 12>;
1314 p.sa8d_inter[LUMA_12x16] = satd4<12, 16>;
1315 p.sa8d_inter[LUMA_4x16] = satd4<4, 16>;
1316 p.sa8d_inter[LUMA_16x4] = satd8<16, 4>;
1317 p.sa8d_inter[LUMA_32x32] = sa8d16<32, 32>;
1318 p.sa8d_inter[LUMA_32x16] = sa8d16<32, 16>;
1319 p.sa8d_inter[LUMA_16x32] = sa8d16<16, 32>;
1320 p.sa8d_inter[LUMA_32x24] = sa8d8<32, 24>;
1321 p.sa8d_inter[LUMA_24x32] = sa8d8<24, 32>;
1322 p.sa8d_inter[LUMA_32x8] = sa8d8<32, 8>;
1323 p.sa8d_inter[LUMA_8x32] = sa8d8<8, 32>;
1324 p.sa8d_inter[LUMA_64x64] = sa8d16<64, 64>;
1325 p.sa8d_inter[LUMA_64x32] = sa8d16<64, 32>;
1326 p.sa8d_inter[LUMA_32x64] = sa8d16<32, 64>;
1327 p.sa8d_inter[LUMA_64x48] = sa8d16<64, 48>;
1328 p.sa8d_inter[LUMA_48x64] = sa8d16<48, 64>;
1329 p.sa8d_inter[LUMA_64x16] = sa8d16<64, 16>;
1330 p.sa8d_inter[LUMA_16x64] = sa8d16<16, 64>;
1331
1332 p.calcresidual[BLOCK_4x4] = getResidual<4>;
1333 p.calcresidual[BLOCK_8x8] = getResidual<8>;
1334 p.calcresidual[BLOCK_16x16] = getResidual<16>;
1335 p.calcresidual[BLOCK_32x32] = getResidual<32>;
1336 p.calcresidual[BLOCK_64x64] = NULL;
1337
1338 p.transpose[BLOCK_4x4] = transpose<4>;
1339 p.transpose[BLOCK_8x8] = transpose<8>;
1340 p.transpose[BLOCK_16x16] = transpose<16>;
1341 p.transpose[BLOCK_32x32] = transpose<32>;
1342 p.transpose[BLOCK_64x64] = transpose<64>;
1343
1344 p.ssd_s[BLOCK_4x4] = pixel_ssd_s_c<4>;
1345 p.ssd_s[BLOCK_8x8] = pixel_ssd_s_c<8>;
1346 p.ssd_s[BLOCK_16x16] = pixel_ssd_s_c<16>;
1347 p.ssd_s[BLOCK_32x32] = pixel_ssd_s_c<32>;
1348
1349 p.weight_pp = weight_pp_c;
1350 p.weight_sp = weight_sp_c;
1351
1352 p.scale1D_128to64 = scale1D_128to64;
1353 p.scale2D_64to32 = scale2D_64to32;
1354 p.frameInitLowres = frame_init_lowres_core;
1355 p.ssim_4x4x2_core = ssim_4x4x2_core;
1356 p.ssim_end_4 = ssim_end_4;
1357
1358 p.var[BLOCK_8x8] = pixel_var<8>;
1359 p.var[BLOCK_16x16] = pixel_var<16>;
1360 p.var[BLOCK_32x32] = pixel_var<32>;
1361 p.var[BLOCK_64x64] = pixel_var<64>;
1362 p.planecopy_cp = planecopy_cp_c;
1363 p.planecopy_sp = planecopy_sp_c;
1364 p.propagateCost = estimateCUPropagateCost;
1365 }
1366 }