1/*****************************************************************************
2 * Copyright (C) 2013 x265 project
3 *
4 * Authors: Steve Borho <steve@borho.org>
5 * Mandar Gurav <mandar@multicorewareinc.com>
6 * Mahesh Pittala <mahesh@multicorewareinc.com>
7 * Min Chen <min.chen@multicorewareinc.com>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 *
23 * This program is also available under a commercial proprietary license.
24 * For more information, contact us at license @ x265.com.
25 *****************************************************************************/
26
27#include "common.h"
28#include "primitives.h"
29#include "x265.h"
30
31#include <cstdlib> // abs()
32
33using namespace x265;
34
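// These helper macros stamp out one primitive-table entry per luma partition
// size (4x4 through 64x64). The first form casts a templated function that takes
// extra data-type parameters to the table's function-pointer type; the second
// instantiates a plain <width, height> template directly.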
35#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, FUNC_TYPE_CAST, DATA_TYPE1, DATA_TYPE2) \
36 p.FUNC_PREFIX[LUMA_4x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 4, DATA_TYPE1, DATA_TYPE2>; \
37 p.FUNC_PREFIX[LUMA_8x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 8, DATA_TYPE1, DATA_TYPE2>; \
38 p.FUNC_PREFIX[LUMA_8x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 4, DATA_TYPE1, DATA_TYPE2>; \
39 p.FUNC_PREFIX[LUMA_4x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 8, DATA_TYPE1, DATA_TYPE2>; \
40 p.FUNC_PREFIX[LUMA_16x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
41 p.FUNC_PREFIX[LUMA_16x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 8, DATA_TYPE1, DATA_TYPE2>; \
42 p.FUNC_PREFIX[LUMA_8x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
43 p.FUNC_PREFIX[LUMA_16x12] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
44 p.FUNC_PREFIX[LUMA_12x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
45 p.FUNC_PREFIX[LUMA_16x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 4, DATA_TYPE1, DATA_TYPE2>; \
46 p.FUNC_PREFIX[LUMA_4x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
47 p.FUNC_PREFIX[LUMA_32x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
48 p.FUNC_PREFIX[LUMA_32x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
49 p.FUNC_PREFIX[LUMA_16x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
50 p.FUNC_PREFIX[LUMA_32x24] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
51 p.FUNC_PREFIX[LUMA_24x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
52 p.FUNC_PREFIX[LUMA_32x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 8, DATA_TYPE1, DATA_TYPE2>; \
53 p.FUNC_PREFIX[LUMA_8x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
54 p.FUNC_PREFIX[LUMA_64x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
55 p.FUNC_PREFIX[LUMA_64x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
56 p.FUNC_PREFIX[LUMA_32x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
57 p.FUNC_PREFIX[LUMA_64x48] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
58 p.FUNC_PREFIX[LUMA_48x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
59 p.FUNC_PREFIX[LUMA_64x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
60 p.FUNC_PREFIX[LUMA_16x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
61
62#define SET_FUNC_PRIMITIVE_TABLE_C2(FUNC_PREFIX) \
63 p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX<4, 4>; \
64 p.FUNC_PREFIX[LUMA_8x8] = FUNC_PREFIX<8, 8>; \
65 p.FUNC_PREFIX[LUMA_8x4] = FUNC_PREFIX<8, 4>; \
66 p.FUNC_PREFIX[LUMA_4x8] = FUNC_PREFIX<4, 8>; \
67 p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX<16, 16>; \
68 p.FUNC_PREFIX[LUMA_16x8] = FUNC_PREFIX<16, 8>; \
69 p.FUNC_PREFIX[LUMA_8x16] = FUNC_PREFIX<8, 16>; \
70 p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX<16, 12>; \
71 p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX<12, 16>; \
72 p.FUNC_PREFIX[LUMA_16x4] = FUNC_PREFIX<16, 4>; \
73 p.FUNC_PREFIX[LUMA_4x16] = FUNC_PREFIX<4, 16>; \
74 p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX<32, 32>; \
75 p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX<32, 16>; \
76 p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX<16, 32>; \
77 p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX<32, 24>; \
78 p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX<24, 32>; \
79 p.FUNC_PREFIX[LUMA_32x8] = FUNC_PREFIX<32, 8>; \
80 p.FUNC_PREFIX[LUMA_8x32] = FUNC_PREFIX<8, 32>; \
81 p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX<64, 64>; \
82 p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX<64, 32>; \
83 p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX<32, 64>; \
84 p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX<64, 48>; \
85 p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX<48, 64>; \
86 p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX<64, 16>; \
87 p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX<16, 64>;
88
89namespace {
90// place functions in anonymous namespace (file static)
91
92template<int lx, int ly>
93int sad(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
94{
95 int sum = 0;
96
97 for (int y = 0; y < ly; y++)
98 {
99 for (int x = 0; x < lx; x++)
100 {
101 sum += abs(pix1[x] - pix2[x]);
102 }
103
104 pix1 += stride_pix1;
105 pix2 += stride_pix2;
106 }
107
108 return sum;
109}
110
111template<int lx, int ly>
112int sad(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2)
113{
114 int sum = 0;
115
116 for (int y = 0; y < ly; y++)
117 {
118 for (int x = 0; x < lx; x++)
119 {
120 sum += abs(pix1[x] - pix2[x]);
121 }
122
123 pix1 += stride_pix1;
124 pix2 += stride_pix2;
125 }
126
127 return sum;
128}
129
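// sad_x3 / sad_x4 score one encoder block (always laid out with FENC_STRIDE)
// against three or four reference candidates in a single pass, matching how the
// motion search batches its SAD evaluations.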
130template<int lx, int ly>
131void sad_x3(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, intptr_t frefstride, int32_t *res)
132{
133 res[0] = 0;
134 res[1] = 0;
135 res[2] = 0;
136 for (int y = 0; y < ly; y++)
137 {
138 for (int x = 0; x < lx; x++)
139 {
140 res[0] += abs(pix1[x] - pix2[x]);
141 res[1] += abs(pix1[x] - pix3[x]);
142 res[2] += abs(pix1[x] - pix4[x]);
143 }
144
145 pix1 += FENC_STRIDE;
146 pix2 += frefstride;
147 pix3 += frefstride;
148 pix4 += frefstride;
149 }
150}
151
152template<int lx, int ly>
153void sad_x4(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, pixel *pix5, intptr_t frefstride, int32_t *res)
154{
155 res[0] = 0;
156 res[1] = 0;
157 res[2] = 0;
158 res[3] = 0;
159 for (int y = 0; y < ly; y++)
160 {
161 for (int x = 0; x < lx; x++)
162 {
163 res[0] += abs(pix1[x] - pix2[x]);
164 res[1] += abs(pix1[x] - pix3[x]);
165 res[2] += abs(pix1[x] - pix4[x]);
166 res[3] += abs(pix1[x] - pix5[x]);
167 }
168
169 pix1 += FENC_STRIDE;
170 pix2 += frefstride;
171 pix3 += frefstride;
172 pix4 += frefstride;
173 pix5 += frefstride;
174 }
175}
176
177template<int lx, int ly, class T1, class T2>
178int sse(T1 *pix1, intptr_t stride_pix1, T2 *pix2, intptr_t stride_pix2)
179{
180 int sum = 0;
181 int iTemp;
182
183 for (int y = 0; y < ly; y++)
184 {
185 for (int x = 0; x < lx; x++)
186 {
187 iTemp = pix1[x] - pix2[x];
188 sum += (iTemp * iTemp);
189 }
190
191 pix1 += stride_pix1;
192 pix2 += stride_pix2;
193 }
194
195 return sum;
196}
197
198#define BITS_PER_SUM (8 * sizeof(sum_t))
199
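// HADAMARD4 performs one 4-point (unnormalized) Hadamard transform stage as a
// two-level butterfly: d0..d3 become the four +/- combinations of s0..s3 using
// 8 additions/subtractions instead of the 12 a direct evaluation would need.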
200#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) { \
201 sum2_t t0 = s0 + s1; \
202 sum2_t t1 = s0 - s1; \
203 sum2_t t2 = s2 + s3; \
204 sum2_t t3 = s2 - s3; \
205 d0 = t0 + t2; \
206 d2 = t0 - t2; \
207 d1 = t1 + t3; \
208 d3 = t1 - t3; \
209}
210
211// in: a pseudo-simd number of the form x+(y<<16)
212// return: abs(x)+(abs(y)<<16)
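// how: shifting by BITS_PER_SUM - 1 and masking with ((sum2_t)1 << BITS_PER_SUM) + 1
// extracts the sign bit of each packed half; multiplying by (sum_t)-1 expands every
// set sign bit into an all-ones half-mask, so (a + s) ^ s applies the usual
// branchless "negate if negative" identity to both halves at once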
213inline sum2_t abs2(sum2_t a)
214{
215 sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1);
216
217 return (a + s) ^ s;
218}
219
220int satd_4x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
221{
222 sum2_t tmp[4][2];
223 sum2_t a0, a1, a2, a3, b0, b1;
224 sum2_t sum = 0;
225
226 for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
227 {
228 a0 = pix1[0] - pix2[0];
229 a1 = pix1[1] - pix2[1];
230 b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
231 a2 = pix1[2] - pix2[2];
232 a3 = pix1[3] - pix2[3];
233 b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
234 tmp[i][0] = b0 + b1;
235 tmp[i][1] = b0 - b1;
236 }
237
238 for (int i = 0; i < 2; i++)
239 {
240 HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
241 a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
242 sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM);
243 }
244
245 return (int)(sum >> 1);
246}
247
248int satd_4x4(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2)
249{
250 ssum2_t tmp[4][2];
251 ssum2_t a0, a1, a2, a3, b0, b1;
252 ssum2_t sum = 0;
253
254 for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
255 {
256 a0 = pix1[0] - pix2[0];
257 a1 = pix1[1] - pix2[1];
258 b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
259 a2 = pix1[2] - pix2[2];
260 a3 = pix1[3] - pix2[3];
261 b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
262 tmp[i][0] = b0 + b1;
263 tmp[i][1] = b0 - b1;
264 }
265
266 for (int i = 0; i < 2; i++)
267 {
268 HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
269 a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
270 sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM);
271 }
272
273 return (int)(sum >> 1);
274}
275
276// x264's SWAR version of satd 8x4, performs two 4x4 SATDs at once
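// each a0..a3 packs the left 4x4 block's difference in the low half of a sum2_t
// and the right 4x4 block's difference in the high half, so one set of butterflies
// transforms both blocks; the return statement folds the two accumulator halves
// back together before the final >> 1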
277int satd_8x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
278{
279 sum2_t tmp[4][4];
280 sum2_t a0, a1, a2, a3;
281 sum2_t sum = 0;
282
283 for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
284 {
285 a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);
286 a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);
287 a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);
288 a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);
289 HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
290 }
291
292 for (int i = 0; i < 4; i++)
293 {
294 HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
295 sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
296 }
297
298 return (((sum_t)sum) + (sum >> BITS_PER_SUM)) >> 1;
299}
300
301template<int w, int h>
302// calculate satd in blocks of 4x4
303int satd4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
304{
305 int satd = 0;
306
307 for (int row = 0; row < h; row += 4)
308 {
309 for (int col = 0; col < w; col += 4)
310 {
311 satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
312 pix2 + row * stride_pix2 + col, stride_pix2);
313 }
314 }
315
316 return satd;
317}
318
319template<int w, int h>
320// calculate satd in blocks of 8x4
321int satd8(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
322{
323 int satd = 0;
324
325 for (int row = 0; row < h; row += 4)
326 {
327 for (int col = 0; col < w; col += 8)
328 {
329 satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
330 pix2 + row * stride_pix2 + col, stride_pix2);
331 }
332 }
333
334 return satd;
335}
336
337inline int _sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
338{
339 sum2_t tmp[8][4];
340 sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
341 sum2_t sum = 0;
342
343 for (int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2)
344 {
345 a0 = pix1[0] - pix2[0];
346 a1 = pix1[1] - pix2[1];
347 b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
348 a2 = pix1[2] - pix2[2];
349 a3 = pix1[3] - pix2[3];
350 b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
351 a4 = pix1[4] - pix2[4];
352 a5 = pix1[5] - pix2[5];
353 b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM);
354 a6 = pix1[6] - pix2[6];
355 a7 = pix1[7] - pix2[7];
356 b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM);
357 HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3);
358 }
359
360 for (int i = 0; i < 4; i++)
361 {
362 HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
363 HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]);
364 b0 = abs2(a0 + a4) + abs2(a0 - a4);
365 b0 += abs2(a1 + a5) + abs2(a1 - a5);
366 b0 += abs2(a2 + a6) + abs2(a2 - a6);
367 b0 += abs2(a3 + a7) + abs2(a3 - a7);
368 sum += (sum_t)b0 + (b0 >> BITS_PER_SUM);
369 }
370
371 return (int)sum;
372}
373
374int sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
375{
376 return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
377}
378
379inline int _sa8d_8x8(int16_t *pix1, intptr_t i_pix1, int16_t *pix2, intptr_t i_pix2)
380{
381 ssum2_t tmp[8][4];
382 ssum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
383 ssum2_t sum = 0;
384
385 for (int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2)
386 {
387 a0 = pix1[0] - pix2[0];
388 a1 = pix1[1] - pix2[1];
389 b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
390 a2 = pix1[2] - pix2[2];
391 a3 = pix1[3] - pix2[3];
392 b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
393 a4 = pix1[4] - pix2[4];
394 a5 = pix1[5] - pix2[5];
395 b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM);
396 a6 = pix1[6] - pix2[6];
397 a7 = pix1[7] - pix2[7];
398 b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM);
399 HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3);
400 }
401
402 for (int i = 0; i < 4; i++)
403 {
404 HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
405 HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]);
406 b0 = abs2(a0 + a4) + abs2(a0 - a4);
407 b0 += abs2(a1 + a5) + abs2(a1 - a5);
408 b0 += abs2(a2 + a6) + abs2(a2 - a6);
409 b0 += abs2(a3 + a7) + abs2(a3 - a7);
410 sum += (sum_t)b0 + (b0 >> BITS_PER_SUM);
411 }
412
413 return (int)sum;
414}
415
416int sa8d_8x8(int16_t *pix1, intptr_t i_pix1, int16_t *pix2, intptr_t i_pix2)
417{
418 return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
419}
420
421int sa8d_16x16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
422{
423 int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2)
424 + _sa8d_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2)
425 + _sa8d_8x8(pix1 + 8 * i_pix1, i_pix1, pix2 + 8 * i_pix2, i_pix2)
426 + _sa8d_8x8(pix1 + 8 + 8 * i_pix1, i_pix1, pix2 + 8 + 8 * i_pix2, i_pix2);
427
428 // This matches x264 sa8d_16x16, but is slightly different from HM's behavior because
429 // this version only rounds once at the end
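    // (e.g. four raw 8x8 sums of 3 give (12 + 2) >> 2 = 3 here, whereas rounding
    // each block separately would give 4 * ((3 + 2) >> 2) = 4)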
430 return (sum + 2) >> 2;
431}
432
433template<int w, int h>
434// Calculate sa8d in blocks of 8x8
435int sa8d8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
436{
437 int cost = 0;
438
439 for (int y = 0; y < h; y += 8)
440 {
441 for (int x = 0; x < w; x += 8)
442 {
443 cost += sa8d_8x8(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
444 }
445 }
446
447 return cost;
448}
449
450template<int w, int h>
451// Calculate sa8d in blocks of 16x16
452int sa8d16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
453{
454 int cost = 0;
455
456 for (int y = 0; y < h; y += 16)
457 {
458 for (int x = 0; x < w; x += 16)
459 {
460 cost += sa8d_16x16(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
461 }
462 }
463
464 return cost;
465}
466
467template<int size>
468int pixel_ssd_s_c(int16_t *a, intptr_t dstride)

469{
470 int sum = 0;
471 for (int y = 0; y < size; y++)
472 {
473 for (int x = 0; x < size; x++)
474 {
475 sum += a[x] * a[x];
476 }
477 a += dstride;
478 }
479 return sum;
480}
481
482template<int size>
483void blockfil_s_c(int16_t *dst, intptr_t dstride, int16_t val)
484{
485 for (int y = 0; y < size; y++)
486 {
487 for (int x = 0; x < size; x++)
488 {
489 dst[y * dstride + x] = val;
490 }
491 }
492}
493
494void convert16to32_shl(int32_t *dst, int16_t *src, intptr_t stride, int shift, int size)
495{
496 for (int i = 0; i < size; i++)
497 {
498 for (int j = 0; j < size; j++)
499 {
500 dst[i * size + j] = ((int)src[i * stride + j]) << shift;
501 }
502 }
503}
504
505template<int size>
506void convert16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset)
507{
508 for (int i = 0; i < size; i++)
509 {
510 for (int j = 0; j < size; j++)
511 {
512 dst[i * size + j] = ((int)src[i * stride + j] + offset) >> shift;
513 }
514 }
515}
516
517void convert32to16_shr(int16_t *dst, int32_t *src, intptr_t stride, int shift, int size)
518{
519 int round = 1 << (shift - 1);
520
521 for (int i = 0; i < size; i++)
522 {
523 for (int j = 0; j < size; j++)
524 {
525 dst[j] = (int16_t)((src[j] + round) >> shift);
526 }
527
528 src += size;
529 dst += stride;
530 }
531}
532
533void copy_shr(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size)
534{
535 int round = 1 << (shift - 1);
536
537 for (int i = 0; i < size; i++)
538 {
539 for (int j = 0; j < size; j++)
540 {
541 dst[j] = (int16_t)((src[j] + round) >> shift);
542 }
543
544 src += size;
545 dst += stride;
546 }
547}
548
549template<int size>
550void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
551{
552 for (int i = 0; i < size; i++)
553 {
554 for (int j = 0; j < size; j++)
555 {
556 dst[j] = ((int16_t)src[j] << shift);
557 }
558
559 src += size;
560 dst += stride;
561 }
562}
563
564template<int size>
565void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
566{
567 for (int i = 0; i < size; i++)
568 {
569 for (int j = 0; j < size; j++)
570 {
571 dst[j] = (src[j] << shift);
572 }
573
574 src += size;
575 dst += stride;
576 }
577}
578
579template<int blockSize>
580void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
581{
582 for (int y = 0; y < blockSize; y++)
583 {
584 for (int x = 0; x < blockSize; x++)
585 {
586 residual[x] = static_cast<int16_t>(fenc[x]) - static_cast<int16_t>(pred[x]);
587 }
588
589 fenc += stride;
590 residual += stride;
591 pred += stride;
592 }
593}
594
595template<int blockSize>
596void transpose(pixel* dst, pixel* src, intptr_t stride)
597{
598 for (int k = 0; k < blockSize; k++)
599 {
600 for (int l = 0; l < blockSize; l++)
601 {
602 dst[k * blockSize + l] = src[l * stride + k];
603 }
604 }
605}
606
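// Weighted prediction from 16-bit intermediate-precision input: adding
// IF_INTERNAL_OFFS undoes the bias carried by interpolation output, the sample is
// scaled by the weight w0, rounded and shifted back down, the prediction offset is
// added, and the result is clipped to the valid pixel range.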
607void weight_sp_c(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
608{
609 int x, y;
610
611 for (y = 0; y <= height - 1; y++)
612 {
613 for (x = 0; x <= width - 1; )
614 {
615 // note: width can be odd
616 dst[x] = (pixel)Clip3(0, ((1 << X265_DEPTH) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);
617 x++;
618 }
619
620 src += srcStride;
621 dst += dstStride;
622 }
623}
624
625void weight_pp_c(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
626{
627 int x, y;
628
629 X265_CHECK(!(width & 15), "weightp alignment error\n");
630 X265_CHECK(!((w0 << 6) > 32767), "w0 using more than 16 bits, asm output will mismatch\n");
631 X265_CHECK(!(round > 32767), "round using more than 16 bits, asm output will mismatch\n");
632
633 for (y = 0; y <= height - 1; y++)
634 {
635 for (x = 0; x <= width - 1; )
636 {
637 // simulating pixel to short conversion
638 int16_t val = src[x] << (IF_INTERNAL_PREC - X265_DEPTH);
639 dst[x] = (pixel)Clip3(0, ((1 << X265_DEPTH) - 1), ((w0 * (val) + round) >> shift) + offset);
640 x++;
641 }
642
643 src += stride;
644 dst += stride;
645 }
646}
647
648template<int lx, int ly>
649void pixelavg_pp(pixel* dst, intptr_t dstride, pixel* src0, intptr_t sstride0, pixel* src1, intptr_t sstride1, int)
650{
651 for (int y = 0; y < ly; y++)
652 {
653 for (int x = 0; x < lx; x++)
654 {
655 dst[x] = (src0[x] + src1[x] + 1) >> 1;
656 }
657
658 src0 += sstride0;
659 src1 += sstride1;
660 dst += dstride;
661 }
662}
663
664void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/)
665{
666 int x;
667
668 for (x = 0; x < 128; x += 2)
669 {
670 pixel pix0 = src[(x + 0)];
671 pixel pix1 = src[(x + 1)];
672 int sum = pix0 + pix1;
673
674 dst[x >> 1] = (pixel)((sum + 1) >> 1);
675 }
676}
677
678void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
679{
680 int x, y;
681
682 for (y = 0; y < 64; y += 2)
683 {
684 for (x = 0; x < 64; x += 2)
685 {
686 pixel pix0 = src[(y + 0) * stride + (x + 0)];
687 pixel pix1 = src[(y + 0) * stride + (x + 1)];
688 pixel pix2 = src[(y + 1) * stride + (x + 0)];
689 pixel pix3 = src[(y + 1) * stride + (x + 1)];
690 int sum = pix0 + pix1 + pix2 + pix3;
691
692 dst[y / 2 * 32 + x / 2] = (pixel)((sum + 2) >> 2);
693 }
694 }
695}
696
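// Builds the four half-resolution planes used by the lookahead: dst0 is the
// downscaled picture at integer positions, while dsth/dstv/dstc are shifted by
// half a pel horizontally, vertically, and diagonally. FILTER is a 2x2 box
// average computed as two rounded pairwise averages (the same order the
// pavg-style assembly uses) rather than a single (a + b + c + d + 2) >> 2.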
697void frame_init_lowres_core(pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
698 intptr_t src_stride, intptr_t dst_stride, int width, int height)
699{
700 for (int y = 0; y < height; y++)
701 {
702 pixel *src1 = src0 + src_stride;
703 pixel *src2 = src1 + src_stride;
704 for (int x = 0; x < width; x++)
705 {
706 // slower than naive bilinear, but matches asm
707#define FILTER(a, b, c, d) ((((a + b + 1) >> 1) + ((c + d + 1) >> 1) + 1) >> 1)
708 dst0[x] = FILTER(src0[2 * x], src1[2 * x], src0[2 * x + 1], src1[2 * x + 1]);
709 dsth[x] = FILTER(src0[2 * x + 1], src1[2 * x + 1], src0[2 * x + 2], src1[2 * x + 2]);
710 dstv[x] = FILTER(src1[2 * x], src2[2 * x], src1[2 * x + 1], src2[2 * x + 1]);
711 dstc[x] = FILTER(src1[2 * x + 1], src2[2 * x + 1], src1[2 * x + 2], src2[2 * x + 2]);
712#undef FILTER
713 }
714 src0 += src_stride * 2;
715 dst0 += dst_stride;
716 dsth += dst_stride;
717 dstv += dst_stride;
718 dstc += dst_stride;
719 }
720}
721
722/* structural similarity metric */
723void ssim_4x4x2_core(const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, int sums[2][4])
724{
725 for (int z = 0; z < 2; z++)
726 {
727 uint32_t s1 = 0, s2 = 0, ss = 0, s12 = 0;
728 for (int y = 0; y < 4; y++)
729 {
730 for (int x = 0; x < 4; x++)
731 {
732 int a = pix1[x + y * stride1];
733 int b = pix2[x + y * stride2];
734 s1 += a;
735 s2 += b;
736 ss += a * a;
737 ss += b * b;
738 s12 += a * b;
739 }
740 }
741
742 sums[z][0] = s1;
743 sums[z][1] = s2;
744 sums[z][2] = ss;
745 sums[z][3] = s12;
746 pix1 += 4;
747 pix2 += 4;
748 }
749}
750
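// Computes SSIM for one 8x8 window assembled (in ssim_end_4) from four adjacent
// 4x4 sub-block sums: s1/s2 are the pixel sums of the two images, ss the combined
// sum of squares, s12 the sum of cross-products. vars and covar are the variance
// and covariance terms kept in integer-sum scale, with ssim_c1/ssim_c2 pre-scaled
// to match, so the returned ratio follows the standard SSIM formula.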
751float ssim_end_1(int s1, int s2, int ss, int s12)
752{
753/* Maximum value for 10-bit is: ss*64 = (2^10-1)^2*16*4*64 = 4286582784, which will overflow in some cases.
754 * s1*s1, s2*s2, and s1*s2 also obtain this value for edge cases: ((2^10-1)*16*4)^2 = 4286582784.
755 * Maximum value for 9-bit is: ss*64 = (2^9-1)^2*16*4*64 = 1069551616, which will not overflow. */
756
757#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
758#if HIGH_BIT_DEPTH
759 X265_CHECK(X265_DEPTH == 10, "ssim invalid depth\n");
760#define type float
761 static const float ssim_c1 = (float)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64);
762 static const float ssim_c2 = (float)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63);
763#else
764 X265_CHECK(X265_DEPTH == 8, "ssim invalid depth\n");
765#define type int
766 static const int ssim_c1 = (int)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64 + .5);
767 static const int ssim_c2 = (int)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63 + .5);
768#endif
769 type fs1 = (type)s1;
770 type fs2 = (type)s2;
771 type fss = (type)ss;
772 type fs12 = (type)s12;
773 type vars = (type)(fss * 64 - fs1 * fs1 - fs2 * fs2);
774 type covar = (type)(fs12 * 64 - fs1 * fs2);
775 return (float)(2 * fs1 * fs2 + ssim_c1) * (float)(2 * covar + ssim_c2)
776 / ((float)(fs1 * fs1 + fs2 * fs2 + ssim_c1) * (float)(vars + ssim_c2));
777#undef type
778#undef PIXEL_MAX
779}
780
781float ssim_end_4(int sum0[5][4], int sum1[5][4], int width)
782{
783 float ssim = 0.0;
784
785 for (int i = 0; i < width; i++)
786 {
787 ssim += ssim_end_1(sum0[i][0] + sum0[i + 1][0] + sum1[i][0] + sum1[i + 1][0],
788 sum0[i][1] + sum0[i + 1][1] + sum1[i][1] + sum1[i + 1][1],
789 sum0[i][2] + sum0[i + 1][2] + sum1[i][2] + sum1[i + 1][2],
790 sum0[i][3] + sum0[i + 1][3] + sum1[i][3] + sum1[i + 1][3]);
791 }
792
793 return ssim;
794}
795
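// Returns the block's pixel sum in the low 32 bits and the sum of squared pixels
// in the high 32 bits of a single uint64_t, so the caller can unpack both
// statistics (e.g. to derive the variance) from one call.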
796template<int size>
797uint64_t pixel_var(pixel *pix, intptr_t i_stride)
798{
799 uint32_t sum = 0, sqr = 0;
800
801 for (int y = 0; y < size; y++)
802 {
803 for (int x = 0; x < size; x++)
804 {
805 sum += pix[x];
806 sqr += pix[x] * pix[x];
807 }
808
809 pix += i_stride;
810 }
811
812 return sum + ((uint64_t)sqr << 32);
813}
814
815#if defined(_MSC_VER)
816#pragma warning(disable: 4127) // conditional expression is constant
817#endif
818
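// Psy cost: a block's perceptual energy is estimated as sa8d against an all-zero
// block (total Hadamard energy) minus sad >> 2 (approximately the DC term, matching
// sa8d's /4 normalization), leaving only the AC energy. The cost is the absolute
// difference between source and reconstruction energy, so detail that is lost or
// invented is penalized; 4x4 blocks fall back to satd because they are too small
// for sa8d.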
819template<int size>
820int psyCost_pp(pixel *source, intptr_t sstride, pixel *recon, intptr_t rstride)
821{
822 static pixel zeroBuf[8] /* = { 0 } */;
823
824 if (size)
825 {
826 int dim = 1 << (size + 2);
827 uint32_t totEnergy = 0;
828 for (int i = 0; i < dim; i += 8)
829 {
830 for (int j = 0; j < dim; j+= 8)
831 {
832 /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
833 int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride, zeroBuf, 0) -
834 (sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
835 int reconEnergy = sa8d_8x8(recon + i * rstride + j, rstride, zeroBuf, 0) -
836 (sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);
837
838 totEnergy += abs(sourceEnergy - reconEnergy);
839 }
840 }
841 return totEnergy;
842 }
843 else
844 {
845 /* 4x4 is too small for sa8d */
846 int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
847 int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
848 return abs(sourceEnergy - reconEnergy);
849 }
850}
851
852template<int size>
853int psyCost_ss(int16_t *source, intptr_t sstride, int16_t *recon, intptr_t rstride)
854{
855 static int16_t zeroBuf[8] /* = { 0 } */;
856
857 if (size)
858 {
859 int dim = 1 << (size + 2);
860 uint32_t totEnergy = 0;
861 for (int i = 0; i < dim; i += 8)
862 {
863 for (int j = 0; j < dim; j+= 8)
864 {
865 /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
866 int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride, zeroBuf, 0) -
867 (sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
868 int reconEnergy = sa8d_8x8(recon + i * rstride + j, rstride, zeroBuf, 0) -
869 (sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);
870
871 totEnergy += abs(sourceEnergy - reconEnergy);
872 }
873 }
874 return totEnergy;
875 }
876 else
877 {
878 /* 4x4 is too small for sa8d */
879 int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
880 int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
881 return abs(sourceEnergy - reconEnergy);
882 }
883}
884
885void plane_copy_deinterleave_chroma(pixel *dstu, intptr_t dstuStride, pixel *dstv, intptr_t dstvStride,
886 pixel *src, intptr_t srcStride, int w, int h)
887{
888 for (int y = 0; y < h; y++, dstu += dstuStride, dstv += dstvStride, src += srcStride)
889 {
890 for (int x = 0; x < w; x++)
891 {
892 dstu[x] = src[2 * x];
893 dstv[x] = src[2 * x + 1];
894 }
895 }
896}
897
898template<int bx, int by>
899void blockcopy_pp_c(pixel *a, intptr_t stridea, pixel *b, intptr_t strideb)
900{
901 for (int y = 0; y < by; y++)
902 {
903 for (int x = 0; x < bx; x++)
904 {
905 a[x] = b[x];
906 }
907
908 a += stridea;
909 b += strideb;
910 }
911}
912
913template<int bx, int by>
914void blockcopy_ss_c(int16_t *a, intptr_t stridea, int16_t *b, intptr_t strideb)
915{
916 for (int y = 0; y < by; y++)
917 {
918 for (int x = 0; x < bx; x++)
919 {
920 a[x] = b[x];
921 }
922
923 a += stridea;
924 b += strideb;
925 }
926}
927
928template<int bx, int by>
929void blockcopy_sp_c(pixel *a, intptr_t stridea, int16_t *b, intptr_t strideb)
930{
931 for (int y = 0; y < by; y++)
932 {
933 for (int x = 0; x < bx; x++)
934 {
935 X265_CHECK((b[x] >= 0) && (b[x] <= ((1 << X265_DEPTH) - 1)), "blockcopy pixel size fail\n");
936 a[x] = (pixel)b[x];
937 }
938
939 a += stridea;
940 b += strideb;
941 }
942}
943
944template<int bx, int by>
945void blockcopy_ps_c(int16_t *a, intptr_t stridea, pixel *b, intptr_t strideb)
946{
947 for (int y = 0; y < by; y++)
948 {
949 for (int x = 0; x < bx; x++)
950 {
951 a[x] = (int16_t)b[x];
952 }
953
954 a += stridea;
955 b += strideb;
956 }
957}
958
959template<int bx, int by>
960void pixel_sub_ps_c(int16_t *a, intptr_t dstride, pixel *b0, pixel *b1, intptr_t sstride0, intptr_t sstride1)
961{
962 for (int y = 0; y < by; y++)
963 {
964 for (int x = 0; x < bx; x++)
965 {
966 a[x] = (int16_t)(b0[x] - b1[x]);
967 }
968
969 b0 += sstride0;
970 b1 += sstride1;
971 a += dstride;
972 }
973}
974
975template<int bx, int by>
976void pixel_add_ps_c(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t sstride0, intptr_t sstride1)
977{
978 for (int y = 0; y < by; y++)
979 {
980 for (int x = 0; x < bx; x++)
981 {
982 a[x] = Clip(b0[x] + b1[x]);
983 }
984
985 b0 += sstride0;
986 b1 += sstride1;
987 a += dstride;
988 }
989}
990
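// Bi-prediction average of two 16-bit intermediate predictions: the offset folds
// in 2 * IF_INTERNAL_OFFS to cancel the bias carried by each source plus the usual
// rounding constant, and shiftNum (the extra +1 performs the averaging) brings the
// result back down to X265_DEPTH before clipping.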
991template<int bx, int by>
992void addAvg(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
993{
994 int shiftNum, offset;
995
996 shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
997 offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
998
999 for (int y = 0; y < by; y++)
1000 {
1001 for (int x = 0; x < bx; x += 2)
1002 {
1003 dst[x + 0] = Clip((src0[x + 0] + src1[x + 0] + offset) >> shiftNum);
1004 dst[x + 1] = Clip((src0[x + 1] + src1[x + 1] + offset) >> shiftNum);
1005 }
1006
1007 src0 += src0Stride;
1008 src1 += src1Stride;
1009 dst += dstStride;
1010 }
1011}
1012
1013void planecopy_cp_c(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
1014{
1015 for (int r = 0; r < height; r++)
1016 {
1017 for (int c = 0; c < width; c++)
1018 {
1019 dst[c] = ((pixel)src[c]) << shift;
1020 }
1021
1022 dst += dstStride;
1023 src += srcStride;
1024 }
1025}
1026
1027void planecopy_sp_c(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
1028{
1029 for (int r = 0; r < height; r++)
1030 {
1031 for (int c = 0; c < width; c++)
1032 {
1033 dst[c] = (pixel)((src[c] >> shift) & mask);
1034 }
1035
1036 dst += dstStride;
1037 src += srcStride;
1038 }
1039}
1040
1041/* Estimate the total amount of influence on future quality that could be had if we
1042 * were to improve the reference samples used to inter predict any given CU. */
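// propagateAmount = propagateIn + intraCost * invQscale * (fpsFactor / 256): the
// cost already flowing into this block plus its own weighted intra cost. The
// fraction passed on to its references is (intraCost - interCost) / intraCost,
// i.e. how much of the block's information is predicted rather than coded fresh.
// interCosts are masked with (1 << 14) - 1 because the upper bits of the lowres
// cost words carry reference-list flags rather than cost.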
1043void estimateCUPropagateCost(int *dst, uint16_t *propagateIn, int32_t *intraCosts, uint16_t *interCosts,
1044 int32_t *invQscales, double *fpsFactor, int len)
1045{
1046 double fps = *fpsFactor / 256;
1047
1048 for (int i = 0; i < len; i++)
1049 {
1050 double intraCost = intraCosts[i] * invQscales[i];
1051 double propagateAmount = (double)propagateIn[i] + intraCost * fps;
1052 double propagateNum = (double)intraCosts[i] - (interCosts[i] & ((1 << 14) - 1));
1053 double propagateDenom = (double)intraCosts[i];
1054 dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5);
1055 }
1056}
1057} // end anonymous namespace
1058
1059namespace x265 {
1060// x265 private namespace
1061
1062/* Extend the edges of a picture so that it may safely be used for motion
1063 * compensation. This function assumes the picture is stored in a buffer with
1064 * sufficient padding for the X and Y margins */
1065void extendPicBorder(pixel* pic, intptr_t stride, int width, int height, int marginX, int marginY)
1066{
1067 /* extend left and right margins */
1068 primitives.extendRowBorder(pic, stride, width, height, marginX);
1069
1070 /* copy top row to create above margin */
1071 pixel *top = pic - marginX;
1072 for (int y = 0; y < marginY; y++)
1073 memcpy(top - (y + 1) * stride, top, stride * sizeof(pixel));
1074
1075 /* copy bottom row to create below margin */
1076 pixel *bot = pic - marginX + (height - 1) * stride;
1077 for (int y = 0; y < marginY; y++)
1078 memcpy(bot + (y + 1) * stride, bot, stride * sizeof(pixel));
1079}
1080
1081/* Initialize entries for pixel functions defined in this file */
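/* These C reference implementations populate the primitive table first; the
 * platform-specific setup routines (intrinsics/assembly) later overwrite
 * individual entries when faster versions are available. */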
1082void Setup_C_PixelPrimitives(EncoderPrimitives &p)
1083{
1084 SET_FUNC_PRIMITIVE_TABLE_C2(sad)
1085 SET_FUNC_PRIMITIVE_TABLE_C2(sad_x3)
1086 SET_FUNC_PRIMITIVE_TABLE_C2(sad_x4)
1087 SET_FUNC_PRIMITIVE_TABLE_C2(pixelavg_pp)
1088
1089 // satd
1090 p.satd[LUMA_4x4] = satd_4x4;
1091 p.satd[LUMA_8x8] = satd8<8, 8>;
1092 p.satd[LUMA_8x4] = satd_8x4;
1093 p.satd[LUMA_4x8] = satd4<4, 8>;
1094 p.satd[LUMA_16x16] = satd8<16, 16>;
1095 p.satd[LUMA_16x8] = satd8<16, 8>;
1096 p.satd[LUMA_8x16] = satd8<8, 16>;
1097 p.satd[LUMA_16x12] = satd8<16, 12>;
1098 p.satd[LUMA_12x16] = satd4<12, 16>;
1099 p.satd[LUMA_16x4] = satd8<16, 4>;
1100 p.satd[LUMA_4x16] = satd4<4, 16>;
1101 p.satd[LUMA_32x32] = satd8<32, 32>;
1102 p.satd[LUMA_32x16] = satd8<32, 16>;
1103 p.satd[LUMA_16x32] = satd8<16, 32>;
1104 p.satd[LUMA_32x24] = satd8<32, 24>;
1105 p.satd[LUMA_24x32] = satd8<24, 32>;
1106 p.satd[LUMA_32x8] = satd8<32, 8>;
1107 p.satd[LUMA_8x32] = satd8<8, 32>;
1108 p.satd[LUMA_64x64] = satd8<64, 64>;
1109 p.satd[LUMA_64x32] = satd8<64, 32>;
1110 p.satd[LUMA_32x64] = satd8<32, 64>;
1111 p.satd[LUMA_64x48] = satd8<64, 48>;
1112 p.satd[LUMA_48x64] = satd8<48, 64>;
1113 p.satd[LUMA_64x16] = satd8<64, 16>;
1114 p.satd[LUMA_16x64] = satd8<16, 64>;
1115
1116#define CHROMA_420(W, H) \
1117 p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H] = addAvg<W, H>; \
1118 p.chroma[X265_CSP_I420].copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
1119 p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
1120 p.chroma[X265_CSP_I420].copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
1121 p.chroma[X265_CSP_I420].copy_ss[CHROMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
1122
1123#define CHROMA_422(W, H) \
1124 p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = addAvg<W, H>; \
1125 p.chroma[X265_CSP_I422].copy_pp[CHROMA422_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
1126 p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
1127 p.chroma[X265_CSP_I422].copy_ps[CHROMA422_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
1128 p.chroma[X265_CSP_I422].copy_ss[CHROMA422_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
1129
1130#define CHROMA_444(W, H) \
1131 p.chroma[X265_CSP_I444].addAvg[LUMA_ ## W ## x ## H] = addAvg<W, H>; \
1132 p.chroma[X265_CSP_I444].copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
1133 p.chroma[X265_CSP_I444].copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
1134 p.chroma[X265_CSP_I444].copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
1135 p.chroma[X265_CSP_I444].copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
1136
1137#define LUMA(W, H) \
1138 p.luma_addAvg[LUMA_ ## W ## x ## H] = addAvg<W, H>; \
1139 p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
1140 p.luma_copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
1141 p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
1142 p.luma_copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
1143
1144#define LUMA_PIXELSUB(W, H) \
1145 p.luma_sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
1146 p.luma_add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
1147
1148#define CHROMA_PIXELSUB_420(W, H) \
1149 p.chroma[X265_CSP_I420].sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
1150 p.chroma[X265_CSP_I420].add_ps[CHROMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
1151
1152#define CHROMA_PIXELSUB_422(W, H) \
1153 p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
1154 p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
1155
1156#define CHROMA_PIXELSUB_444(W, H) \
1157 p.chroma[X265_CSP_I444].sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
1158 p.chroma[X265_CSP_I444].add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
1159
1160
1161
1162 LUMA(4, 4);
1163 LUMA(8, 8);
1164 CHROMA_420(4, 4);
1165 LUMA(4, 8);
1166 CHROMA_420(2, 4);
1167 LUMA(8, 4);
1168 CHROMA_420(4, 2);
1169 LUMA(16, 16);
1170 CHROMA_420(8, 8);
1171 LUMA(16, 8);
1172 CHROMA_420(8, 4);
1173 LUMA(8, 16);
1174 CHROMA_420(4, 8);
1175 LUMA(16, 12);
1176 CHROMA_420(8, 6);
1177 LUMA(12, 16);
1178 CHROMA_420(6, 8);
1179 LUMA(16, 4);
1180 CHROMA_420(8, 2);
1181 LUMA(4, 16);
1182 CHROMA_420(2, 8);
1183 LUMA(32, 32);
1184 CHROMA_420(16, 16);
1185 LUMA(32, 16);
1186 CHROMA_420(16, 8);
1187 LUMA(16, 32);
1188 CHROMA_420(8, 16);
1189 LUMA(32, 24);
1190 CHROMA_420(16, 12);
1191 LUMA(24, 32);
1192 CHROMA_420(12, 16);
1193 LUMA(32, 8);
1194 CHROMA_420(16, 4);
1195 LUMA(8, 32);
1196 CHROMA_420(4, 16);
1197 LUMA(64, 64);
1198 CHROMA_420(32, 32);
1199 LUMA(64, 32);
1200 CHROMA_420(32, 16);
1201 LUMA(32, 64);
1202 CHROMA_420(16, 32);
1203 LUMA(64, 48);
1204 CHROMA_420(32, 24);
1205 LUMA(48, 64);
1206 CHROMA_420(24, 32);
1207 LUMA(64, 16);
1208 CHROMA_420(32, 8);
1209 LUMA(16, 64);
1210 CHROMA_420(8, 32);
1211
1212 LUMA_PIXELSUB(4, 4);
1213 LUMA_PIXELSUB(8, 8);
1214 LUMA_PIXELSUB(16, 16);
1215 LUMA_PIXELSUB(32, 32);
1216 LUMA_PIXELSUB(64, 64);
1217 CHROMA_PIXELSUB_420(4, 4)
1218 CHROMA_PIXELSUB_420(8, 8)
1219 CHROMA_PIXELSUB_420(16, 16)
1220 CHROMA_PIXELSUB_420(32, 32)
1221 CHROMA_PIXELSUB_422(4, 8)
1222 CHROMA_PIXELSUB_422(8, 16)
1223 CHROMA_PIXELSUB_422(16, 32)
1224 CHROMA_PIXELSUB_422(32, 64)
1225 CHROMA_PIXELSUB_444(8, 8)
1226 CHROMA_PIXELSUB_444(16, 16)
1227 CHROMA_PIXELSUB_444(32, 32)
1228 CHROMA_PIXELSUB_444(64, 64)
1229
1230 CHROMA_422(4, 8);
1231 CHROMA_422(4, 4);
1232 CHROMA_422(2, 8);
1233 CHROMA_422(8, 16);
1234 CHROMA_422(8, 8);
1235 CHROMA_422(4, 16);
1236 CHROMA_422(8, 12);
1237 CHROMA_422(6, 16);
1238 CHROMA_422(8, 4);
1239 CHROMA_422(2, 16);
1240 CHROMA_422(16, 32);
1241 CHROMA_422(16, 16);
1242 CHROMA_422(8, 32);
1243 CHROMA_422(16, 24);
1244 CHROMA_422(12, 32);
1245 CHROMA_422(16, 8);
1246 CHROMA_422(4, 32);
1247 CHROMA_422(32, 64);
1248 CHROMA_422(32, 32);
1249 CHROMA_422(16, 64);
1250 CHROMA_422(32, 48);
1251 CHROMA_422(24, 64);
1252 CHROMA_422(32, 16);
1253 CHROMA_422(8, 64);
1254
1255 CHROMA_444(4, 4);
1256 CHROMA_444(8, 8);
1257 CHROMA_444(4, 8);
1258 CHROMA_444(8, 4);
1259 CHROMA_444(16, 16);
1260 CHROMA_444(16, 8);
1261 CHROMA_444(8, 16);
1262 CHROMA_444(16, 12);
1263 CHROMA_444(12, 16);
1264 CHROMA_444(16, 4);
1265 CHROMA_444(4, 16);
1266 CHROMA_444(32, 32);
1267 CHROMA_444(32, 16);
1268 CHROMA_444(16, 32);
1269 CHROMA_444(32, 24);
1270 CHROMA_444(24, 32);
1271 CHROMA_444(32, 8);
1272 CHROMA_444(8, 32);
1273 CHROMA_444(64, 64);
1274 CHROMA_444(64, 32);
1275 CHROMA_444(32, 64);
1276 CHROMA_444(64, 48);
1277 CHROMA_444(48, 64);
1278 CHROMA_444(64, 16);
1279 CHROMA_444(16, 64);
1280
1281 SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixelcmp_t, pixel, pixel)
1282 SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, pixelcmp_sp_t, int16_t, pixel)
1283 SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, pixelcmp_ss_t, int16_t, int16_t)
1284
1285 p.blockfill_s[BLOCK_4x4] = blockfil_s_c<4>;
1286 p.blockfill_s[BLOCK_8x8] = blockfil_s_c<8>;
1287 p.blockfill_s[BLOCK_16x16] = blockfil_s_c<16>;
1288 p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>;
1289 p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>;
1290
1291 p.cvt16to32_shl = convert16to32_shl;
1292 p.cvt16to32_shr[BLOCK_4x4] = convert16to32_shr<4>;
1293 p.cvt16to32_shr[BLOCK_8x8] = convert16to32_shr<8>;
1294 p.cvt16to32_shr[BLOCK_16x16] = convert16to32_shr<16>;
1295 p.cvt16to32_shr[BLOCK_32x32] = convert16to32_shr<32>;
1296 p.cvt32to16_shr = convert32to16_shr;
1297 p.cvt32to16_shl[BLOCK_4x4] = convert32to16_shl<4>;
1298 p.cvt32to16_shl[BLOCK_8x8] = convert32to16_shl<8>;
1299 p.cvt32to16_shl[BLOCK_16x16] = convert32to16_shl<16>;
1300 p.cvt32to16_shl[BLOCK_32x32] = convert32to16_shl<32>;
1301
1302 p.copy_shr = copy_shr;
1303 p.copy_shl[BLOCK_4x4] = copy_shl<4>;
1304 p.copy_shl[BLOCK_8x8] = copy_shl<8>;
1305 p.copy_shl[BLOCK_16x16] = copy_shl<16>;
1306 p.copy_shl[BLOCK_32x32] = copy_shl<32>;
1307
1308 p.sa8d[BLOCK_4x4] = satd_4x4;
1309 p.sa8d[BLOCK_8x8] = sa8d_8x8;
1310 p.sa8d[BLOCK_16x16] = sa8d_16x16;
1311 p.sa8d[BLOCK_32x32] = sa8d16<32, 32>;
1312 p.sa8d[BLOCK_64x64] = sa8d16<64, 64>;
1313
1314 p.psy_cost_pp[BLOCK_4x4] = psyCost_pp<BLOCK_4x4>;
1315 p.psy_cost_pp[BLOCK_8x8] = psyCost_pp<BLOCK_8x8>;
1316 p.psy_cost_pp[BLOCK_16x16] = psyCost_pp<BLOCK_16x16>;
1317 p.psy_cost_pp[BLOCK_32x32] = psyCost_pp<BLOCK_32x32>;
1318 p.psy_cost_pp[BLOCK_64x64] = psyCost_pp<BLOCK_64x64>;
1319
1320 p.psy_cost_ss[BLOCK_4x4] = psyCost_ss<BLOCK_4x4>;
1321 p.psy_cost_ss[BLOCK_8x8] = psyCost_ss<BLOCK_8x8>;
1322 p.psy_cost_ss[BLOCK_16x16] = psyCost_ss<BLOCK_16x16>;
1323 p.psy_cost_ss[BLOCK_32x32] = psyCost_ss<BLOCK_32x32>;
1324 p.psy_cost_ss[BLOCK_64x64] = psyCost_ss<BLOCK_64x64>;
1325
1326 p.sa8d_inter[LUMA_4x4] = satd_4x4;
1327 p.sa8d_inter[LUMA_8x8] = sa8d_8x8;
1328 p.sa8d_inter[LUMA_8x4] = satd_8x4;
1329 p.sa8d_inter[LUMA_4x8] = satd4<4, 8>;
1330 p.sa8d_inter[LUMA_16x16] = sa8d_16x16;
1331 p.sa8d_inter[LUMA_16x8] = sa8d8<16, 8>;
1332 p.sa8d_inter[LUMA_8x16] = sa8d8<8, 16>;
1333 p.sa8d_inter[LUMA_16x12] = satd8<16, 12>;
1334 p.sa8d_inter[LUMA_12x16] = satd4<12, 16>;
1335 p.sa8d_inter[LUMA_4x16] = satd4<4, 16>;
1336 p.sa8d_inter[LUMA_16x4] = satd8<16, 4>;
1337 p.sa8d_inter[LUMA_32x32] = sa8d16<32, 32>;
1338 p.sa8d_inter[LUMA_32x16] = sa8d16<32, 16>;
1339 p.sa8d_inter[LUMA_16x32] = sa8d16<16, 32>;
1340 p.sa8d_inter[LUMA_32x24] = sa8d8<32, 24>;
1341 p.sa8d_inter[LUMA_24x32] = sa8d8<24, 32>;
1342 p.sa8d_inter[LUMA_32x8] = sa8d8<32, 8>;
1343 p.sa8d_inter[LUMA_8x32] = sa8d8<8, 32>;
1344 p.sa8d_inter[LUMA_64x64] = sa8d16<64, 64>;
1345 p.sa8d_inter[LUMA_64x32] = sa8d16<64, 32>;
1346 p.sa8d_inter[LUMA_32x64] = sa8d16<32, 64>;
1347 p.sa8d_inter[LUMA_64x48] = sa8d16<64, 48>;
1348 p.sa8d_inter[LUMA_48x64] = sa8d16<48, 64>;
1349 p.sa8d_inter[LUMA_64x16] = sa8d16<64, 16>;
1350 p.sa8d_inter[LUMA_16x64] = sa8d16<16, 64>;
1351
1352 p.calcresidual[BLOCK_4x4] = getResidual<4>;
1353 p.calcresidual[BLOCK_8x8] = getResidual<8>;
1354 p.calcresidual[BLOCK_16x16] = getResidual<16>;
1355 p.calcresidual[BLOCK_32x32] = getResidual<32>;
1356 p.calcresidual[BLOCK_64x64] = NULL;
1357
1358 p.transpose[BLOCK_4x4] = transpose<4>;
1359 p.transpose[BLOCK_8x8] = transpose<8>;
1360 p.transpose[BLOCK_16x16] = transpose<16>;
1361 p.transpose[BLOCK_32x32] = transpose<32>;
1362 p.transpose[BLOCK_64x64] = transpose<64>;
1363
1364 p.ssd_s[BLOCK_4x4] = pixel_ssd_s_c<4>;
1365 p.ssd_s[BLOCK_8x8] = pixel_ssd_s_c<8>;
1366 p.ssd_s[BLOCK_16x16] = pixel_ssd_s_c<16>;
1367 p.ssd_s[BLOCK_32x32] = pixel_ssd_s_c<32>;
1368
1369 p.weight_pp = weight_pp_c;
1370 p.weight_sp = weight_sp_c;
1371
1372 p.scale1D_128to64 = scale1D_128to64;
1373 p.scale2D_64to32 = scale2D_64to32;
1374 p.frame_init_lowres_core = frame_init_lowres_core;
1375 p.ssim_4x4x2_core = ssim_4x4x2_core;
1376 p.ssim_end_4 = ssim_end_4;
1377
1378 p.var[BLOCK_8x8] = pixel_var<8>;
1379 p.var[BLOCK_16x16] = pixel_var<16>;
1380 p.var[BLOCK_32x32] = pixel_var<32>;
1381 p.var[BLOCK_64x64] = pixel_var<64>;
1382 p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma;
1383 p.planecopy_cp = planecopy_cp_c;
1384 p.planecopy_sp = planecopy_sp_c;
1385 p.propagateCost = estimateCUPropagateCost;
1386}
1387}