/*****************************************************************************
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
 *          Mandar Gurav <mandar@multicorewareinc.com>
 *          Mahesh Pittala <mahesh@multicorewareinc.com>
 *          Min Chen <min.chen@multicorewareinc.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

#include "common.h"
#include "primitives.h"
#include "x265.h"

#include <cstdlib> // abs()

using namespace x265;

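/* These macros populate an EncoderPrimitives function table with one template
 * instantiation per luma partition size (LUMA_4x4 through LUMA_64x64). */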
#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, DATA_TYPE1, DATA_TYPE2) \
    p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX_DEF<4, 4, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x8] = FUNC_PREFIX_DEF<8, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x4] = FUNC_PREFIX_DEF<8, 4, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_4x8] = FUNC_PREFIX_DEF<4, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x8] = FUNC_PREFIX_DEF<16, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x16] = FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x4] = FUNC_PREFIX_DEF<16, 4, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_4x16] = FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x8] = FUNC_PREFIX_DEF<32, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x32] = FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;

#define SET_FUNC_PRIMITIVE_TABLE_C2(FUNC_PREFIX) \
    p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX<4, 4>; \
    p.FUNC_PREFIX[LUMA_8x8] = FUNC_PREFIX<8, 8>; \
    p.FUNC_PREFIX[LUMA_8x4] = FUNC_PREFIX<8, 4>; \
    p.FUNC_PREFIX[LUMA_4x8] = FUNC_PREFIX<4, 8>; \
    p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX<16, 16>; \
    p.FUNC_PREFIX[LUMA_16x8] = FUNC_PREFIX<16, 8>; \
    p.FUNC_PREFIX[LUMA_8x16] = FUNC_PREFIX<8, 16>; \
    p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX<16, 12>; \
    p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX<12, 16>; \
    p.FUNC_PREFIX[LUMA_16x4] = FUNC_PREFIX<16, 4>; \
    p.FUNC_PREFIX[LUMA_4x16] = FUNC_PREFIX<4, 16>; \
    p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX<32, 32>; \
    p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX<32, 16>; \
    p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX<16, 32>; \
    p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX<32, 24>; \
    p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX<24, 32>; \
    p.FUNC_PREFIX[LUMA_32x8] = FUNC_PREFIX<32, 8>; \
    p.FUNC_PREFIX[LUMA_8x32] = FUNC_PREFIX<8, 32>; \
    p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX<64, 64>; \
    p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX<64, 32>; \
    p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX<32, 64>; \
    p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX<64, 48>; \
    p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX<48, 64>; \
    p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX<64, 16>; \
    p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX<16, 64>;

namespace {
// place functions in anonymous namespace (file static)

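/* Sum of absolute differences (SAD), the basic distortion metric for motion
 * search; templated on block width (lx) and height (ly), with overloads for
 * pixel and 16-bit inputs. */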
template<int lx, int ly>
int sad(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int sum = 0;

    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
            sum += abs(pix1[x] - pix2[x]);

        pix1 += stride_pix1;
        pix2 += stride_pix2;
    }

    return sum;
}

template<int lx, int ly>
int sad(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
{
    int sum = 0;

    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
            sum += abs(pix1[x] - pix2[x]);

        pix1 += stride_pix1;
        pix2 += stride_pix2;
    }

    return sum;
}

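/* SAD of one encode block (fixed FENC_STRIDE) against three or four candidate
 * reference blocks sharing a stride; the costs land in res[0..2] (or
 * res[0..3]), amortizing the loads of pix1 across candidates. */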
template<int lx, int ly>
void sad_x3(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
{
    res[0] = 0;
    res[1] = 0;
    res[2] = 0;
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            res[0] += abs(pix1[x] - pix2[x]);
            res[1] += abs(pix1[x] - pix3[x]);
            res[2] += abs(pix1[x] - pix4[x]);
        }

        pix1 += FENC_STRIDE;
        pix2 += frefstride;
        pix3 += frefstride;
        pix4 += frefstride;
    }
}

template<int lx, int ly>
void sad_x4(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
{
    res[0] = 0;
    res[1] = 0;
    res[2] = 0;
    res[3] = 0;
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            res[0] += abs(pix1[x] - pix2[x]);
            res[1] += abs(pix1[x] - pix3[x]);
            res[2] += abs(pix1[x] - pix4[x]);
            res[3] += abs(pix1[x] - pix5[x]);
        }

        pix1 += FENC_STRIDE;
        pix2 += frefstride;
        pix3 += frefstride;
        pix4 += frefstride;
        pix5 += frefstride;
    }
}

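/* Sum of squared errors; the T1/T2 template types let one definition serve
 * the sse_pp (pixel/pixel), sse_sp (int16_t/pixel), and sse_ss
 * (int16_t/int16_t) primitive tables. */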
template<int lx, int ly, class T1, class T2>
int sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
{
    int sum = 0;
    int tmp;

    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            tmp = pix1[x] - pix2[x];
            sum += (tmp * tmp);
        }

        pix1 += stride_pix1;
        pix2 += stride_pix2;
    }

    return sum;
}

#define BITS_PER_SUM (8 * sizeof(sum_t))

#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) { \
        sum2_t t0 = s0 + s1; \
        sum2_t t1 = s0 - s1; \
        sum2_t t2 = s2 + s3; \
        sum2_t t3 = s2 - s3; \
        d0 = t0 + t2; \
        d2 = t0 - t2; \
        d1 = t1 + t3; \
        d3 = t1 - t3; \
}

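/* SWAR helpers: a sum2_t carries two sum_t lanes. In abs2() the shift moves
 * each lane's sign bit to that lane's bottom bit, the (1 << BITS_PER_SUM) + 1
 * mask keeps just those two bits, and multiplying by (sum_t)-1 expands each
 * set bit into an all-ones lane, so (a + s) ^ s performs a per-lane
 * conditional negation, i.e. a packed absolute value. */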
// in: a pseudo-simd number of the form x+(y<<16)
// return: abs(x)+(abs(y)<<16)
inline sum2_t abs2(sum2_t a)
{
    sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1);

    return (a + s) ^ s;
}

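/* SATD 4x4: Hadamard-transform the 4x4 difference block and sum the absolute
 * transform coefficients, carrying two coefficients per sum2_t lane; the
 * final >> 1 compensates the transform gain. */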
int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    sum2_t tmp[4][2];
    sum2_t a0, a1, a2, a3, b0, b1;
    sum2_t sum = 0;

    for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
        tmp[i][0] = b0 + b1;
        tmp[i][1] = b0 - b1;
    }

    for (int i = 0; i < 2; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
        sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM);
    }

    return (int)(sum >> 1);
}

int satd_4x4(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
{
    ssum2_t tmp[4][2];
    ssum2_t a0, a1, a2, a3, b0, b1;
    ssum2_t sum = 0;

    for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
        tmp[i][0] = b0 + b1;
        tmp[i][1] = b0 - b1;
    }

    for (int i = 0; i < 2; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
        sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM);
    }

    return (int)(sum >> 1);
}

// x264's SWAR version of satd 8x4, performs two 4x4 SATDs at once
int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    sum2_t tmp[4][4];
    sum2_t a0, a1, a2, a3;
    sum2_t sum = 0;

    for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
    {
        a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);
        a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);
        a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);
        a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);
        HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
    }

    for (int i = 0; i < 4; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }

    return (((sum_t)sum) + (sum >> BITS_PER_SUM)) >> 1;
}

template<int w, int h>
// calculate satd in blocks of 4x4
int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;

    for (int row = 0; row < h; row += 4)
        for (int col = 0; col < w; col += 4)
            satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
                             pix2 + row * stride_pix2 + col, stride_pix2);

    return satd;
}

template<int w, int h>
// calculate satd in blocks of 8x4
int satd8(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;

    for (int row = 0; row < h; row += 4)
        for (int col = 0; col < w; col += 8)
            satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
                             pix2 + row * stride_pix2 + col, stride_pix2);

    return satd;
}

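/* sa8d: SATD computed with an 8x8 Hadamard transform rather than 4x4 blocks.
 * _sa8d_8x8() returns the unscaled transform sum; the public wrappers apply
 * the (+2) >> 2 normalization. */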
inline int _sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    sum2_t tmp[8][4];
    sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
    sum2_t sum = 0;

    for (int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2)
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
        a4 = pix1[4] - pix2[4];
        a5 = pix1[5] - pix2[5];
        b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM);
        a6 = pix1[6] - pix2[6];
        a7 = pix1[7] - pix2[7];
        b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM);
        HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3);
    }

    for (int i = 0; i < 4; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]);
        b0 = abs2(a0 + a4) + abs2(a0 - a4);
        b0 += abs2(a1 + a5) + abs2(a1 - a5);
        b0 += abs2(a2 + a6) + abs2(a2 - a6);
        b0 += abs2(a3 + a7) + abs2(a3 - a7);
        sum += (sum_t)b0 + (b0 >> BITS_PER_SUM);
    }

    return (int)sum;
}

int sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
}

inline int _sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2)
{
    ssum2_t tmp[8][4];
    ssum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
    ssum2_t sum = 0;

    for (int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2)
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
        a4 = pix1[4] - pix2[4];
        a5 = pix1[5] - pix2[5];
        b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM);
        a6 = pix1[6] - pix2[6];
        a7 = pix1[7] - pix2[7];
        b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM);
        HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3);
    }

    for (int i = 0; i < 4; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]);
        b0 = abs2(a0 + a4) + abs2(a0 - a4);
        b0 += abs2(a1 + a5) + abs2(a1 - a5);
        b0 += abs2(a2 + a6) + abs2(a2 - a6);
        b0 += abs2(a3 + a7) + abs2(a3 - a7);
        sum += (sum_t)b0 + (b0 >> BITS_PER_SUM);
    }

    return (int)sum;
}

int sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2)
{
    return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
}

int sa8d_16x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2)
        + _sa8d_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2)
        + _sa8d_8x8(pix1 + 8 * i_pix1, i_pix1, pix2 + 8 * i_pix2, i_pix2)
        + _sa8d_8x8(pix1 + 8 + 8 * i_pix1, i_pix1, pix2 + 8 + 8 * i_pix2, i_pix2);

    // This matches x264 sa8d_16x16, but is slightly different from HM's behavior because
    // this version only rounds once at the end
    return (sum + 2) >> 2;
}

template<int w, int h>
// Calculate sa8d in blocks of 8x8
int sa8d8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    int cost = 0;

    for (int y = 0; y < h; y += 8)
        for (int x = 0; x < w; x += 8)
            cost += sa8d_8x8(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);

    return cost;
}

template<int w, int h>
// Calculate sa8d in blocks of 16x16
int sa8d16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    int cost = 0;

    for (int y = 0; y < h; y += 16)
        for (int x = 0; x < w; x += 16)
            cost += sa8d_16x16(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);

    return cost;
}

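/* sum of squared values of one strided int16_t block (the ssd_s primitive) */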
template<int size>
int pixel_ssd_s_c(const int16_t* a, intptr_t dstride)
{
    int sum = 0;
    for (int y = 0; y < size; y++)
    {
        for (int x = 0; x < size; x++)
            sum += a[x] * a[x];

        a += dstride;
    }
    return sum;
}

template<int size>
void blockfil_s_c(int16_t* dst, intptr_t dstride, int16_t val)
{
    for (int y = 0; y < size; y++)
        for (int x = 0; x < size; x++)
            dst[y * dstride + x] = val;
}

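/* The cpy2Dto1D/cpy1Dto2D helpers copy coefficient blocks between a strided
 * 2-D layout and a packed 1-D buffer while applying a shift: _shl scales up,
 * _shr scales down with rounding, dst = (src + (1 << (shift - 1))) >> shift. */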
template<int size>
void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
{
    X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
    X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
    X265_CHECK(shift >= 0, "invalid shift\n");

    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
            dst[j] = src[j] << shift;

        src += srcStride;
        dst += size;
    }
}

template<int size>
void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
{
    X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
    X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
    X265_CHECK(shift > 0, "invalid shift\n");

    int16_t round = 1 << (shift - 1);
    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
            dst[j] = (src[j] + round) >> shift;

        src += srcStride;
        dst += size;
    }
}

template<int size>
void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
{
    X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
    X265_CHECK(shift >= 0, "invalid shift\n");

    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
            dst[j] = src[j] << shift;

        src += size;
        dst += dstStride;
    }
}

template<int size>
void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
{
    X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
    X265_CHECK(shift > 0, "invalid shift\n");

    int16_t round = 1 << (shift - 1);
    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
            dst[j] = (src[j] + round) >> shift;

        src += size;
        dst += dstStride;
    }
}

template<int blockSize>
void getResidual(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)
{
    for (int y = 0; y < blockSize; y++)
    {
        for (int x = 0; x < blockSize; x++)
            residual[x] = static_cast<int16_t>(fenc[x]) - static_cast<int16_t>(pred[x]);

        fenc += stride;
        residual += stride;
        pred += stride;
    }
}

template<int blockSize>
void transpose(pixel* dst, const pixel* src, intptr_t stride)
{
    for (int k = 0; k < blockSize; k++)
        for (int l = 0; l < blockSize; l++)
            dst[k * blockSize + l] = src[l * stride + k];
}

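/* Weighted prediction: dst = clip(((w0 * src + round) >> shift) + offset).
 * weight_sp_c takes 16-bit intermediates and adds IF_INTERNAL_OFFS back
 * before weighting; weight_pp_c first up-converts pixels to the same
 * intermediate precision. */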
void weight_sp_c(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
{
    for (int y = 0; y < height; y++)
    {
        // note: width can be odd
        for (int x = 0; x < width; x++)
            dst[x] = (pixel)Clip3(0, ((1 << X265_DEPTH) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);

        src += srcStride;
        dst += dstStride;
    }
}

void weight_pp_c(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
{
    X265_CHECK(!(width & 15), "weightp alignment error\n");
    X265_CHECK(!((w0 << 6) > 32767), "w0 using more than 16 bits, asm output will mismatch\n");
    X265_CHECK(!(round > 32767), "round using more than 16 bits, asm output will mismatch\n");

    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
        {
            // simulating pixel to short conversion
            int16_t val = src[x] << (IF_INTERNAL_PREC - X265_DEPTH);
            dst[x] = (pixel)Clip3(0, ((1 << X265_DEPTH) - 1), ((w0 * (val) + round) >> shift) + offset);
        }

        src += stride;
        dst += stride;
    }
}

template<int lx, int ly>
void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
{
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
            dst[x] = (src0[x] + src1[x] + 1) >> 1;

        src0 += sstride0;
        src1 += sstride1;
        dst += dstride;
    }
}

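/* 2:1 decimation: scale1D_128to64 averages horizontal pixel pairs of a
 * 128-wide row; scale2D_64to32 averages 2x2 blocks of a 64x64 area, both
 * with round-to-nearest. */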
void scale1D_128to64(pixel* dst, const pixel* src, intptr_t /*stride*/)
{
    for (int x = 0; x < 128; x += 2)
    {
        pixel pix0 = src[(x + 0)];
        pixel pix1 = src[(x + 1)];
        int sum = pix0 + pix1;

        dst[x >> 1] = (pixel)((sum + 1) >> 1);
    }
}

void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
{
    for (uint32_t y = 0; y < 64; y += 2)
    {
        for (uint32_t x = 0; x < 64; x += 2)
        {
            pixel pix0 = src[(y + 0) * stride + (x + 0)];
            pixel pix1 = src[(y + 0) * stride + (x + 1)];
            pixel pix2 = src[(y + 1) * stride + (x + 0)];
            pixel pix3 = src[(y + 1) * stride + (x + 1)];
            int sum = pix0 + pix1 + pix2 + pix3;

            dst[y / 2 * 32 + x / 2] = (pixel)((sum + 2) >> 2);
        }
    }
}

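/* Build the half-resolution lookahead planes: dst0 holds the downscaled
 * image, and dsth/dstv/dstc are its horizontal, vertical, and diagonal
 * half-pel variants, all produced by the same two-stage averaging filter. */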
void frame_init_lowres_core(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc,
                            intptr_t src_stride, intptr_t dst_stride, int width, int height)
{
    for (int y = 0; y < height; y++)
    {
        const pixel* src1 = src0 + src_stride;
        const pixel* src2 = src1 + src_stride;
        for (int x = 0; x < width; x++)
        {
            // slower than naive bilinear, but matches asm
#define FILTER(a, b, c, d) ((((a + b + 1) >> 1) + ((c + d + 1) >> 1) + 1) >> 1)
            dst0[x] = FILTER(src0[2 * x], src1[2 * x], src0[2 * x + 1], src1[2 * x + 1]);
            dsth[x] = FILTER(src0[2 * x + 1], src1[2 * x + 1], src0[2 * x + 2], src1[2 * x + 2]);
            dstv[x] = FILTER(src1[2 * x], src2[2 * x], src1[2 * x + 1], src2[2 * x + 1]);
            dstc[x] = FILTER(src1[2 * x + 1], src2[2 * x + 1], src1[2 * x + 2], src2[2 * x + 2]);
#undef FILTER
        }

        src0 += src_stride * 2;
        dst0 += dst_stride;
        dsth += dst_stride;
        dstv += dst_stride;
        dstc += dst_stride;
    }
}

/* structural similarity metric */
void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
{
    for (int z = 0; z < 2; z++)
    {
        uint32_t s1 = 0, s2 = 0, ss = 0, s12 = 0;
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                int a = pix1[x + y * stride1];
                int b = pix2[x + y * stride2];
                s1 += a;
                s2 += b;
                ss += a * a;
                ss += b * b;
                s12 += a * b;
            }
        }

        sums[z][0] = s1;
        sums[z][1] = s2;
        sums[z][2] = ss;
        sums[z][3] = s12;
        pix1 += 4;
        pix2 += 4;
    }
}

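/* Combine the sums of four neighboring 4x4 windows (an 8x8 area) into one
 * SSIM term, (2*s1*s2 + C1)(2*covar + C2) / ((s1*s1 + s2*s2 + C1)(vars + C2)),
 * with the constants pre-scaled to match summed rather than averaged
 * statistics. */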
float ssim_end_1(int s1, int s2, int ss, int s12)
{
/* Maximum value for 10-bit is: ss*64 = (2^10-1)^2*16*4*64 = 4286582784, which will overflow in some cases.
 * s1*s1, s2*s2, and s1*s2 also obtain this value for edge cases: ((2^10-1)*16*4)^2 = 4286582784.
 * Maximum value for 9-bit is: ss*64 = (2^9-1)^2*16*4*64 = 1069551616, which will not overflow. */

#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
#if HIGH_BIT_DEPTH
    X265_CHECK(X265_DEPTH == 10, "ssim invalid depth\n");
#define type float
    static const float ssim_c1 = (float)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64);
    static const float ssim_c2 = (float)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63);
#else
    X265_CHECK(X265_DEPTH == 8, "ssim invalid depth\n");
#define type int
    static const int ssim_c1 = (int)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64 + .5);
    static const int ssim_c2 = (int)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63 + .5);
#endif
    type fs1 = (type)s1;
    type fs2 = (type)s2;
    type fss = (type)ss;
    type fs12 = (type)s12;
    type vars = (type)(fss * 64 - fs1 * fs1 - fs2 * fs2);
    type covar = (type)(fs12 * 64 - fs1 * fs2);
    return (float)(2 * fs1 * fs2 + ssim_c1) * (float)(2 * covar + ssim_c2)
           / ((float)(fs1 * fs1 + fs2 * fs2 + ssim_c1) * (float)(vars + ssim_c2));
#undef type
#undef PIXEL_MAX
}

float ssim_end_4(int sum0[5][4], int sum1[5][4], int width)
{
    float ssim = 0.0;

    for (int i = 0; i < width; i++)
    {
        ssim += ssim_end_1(sum0[i][0] + sum0[i + 1][0] + sum1[i][0] + sum1[i + 1][0],
                           sum0[i][1] + sum0[i + 1][1] + sum1[i][1] + sum1[i + 1][1],
                           sum0[i][2] + sum0[i + 1][2] + sum1[i][2] + sum1[i + 1][2],
                           sum0[i][3] + sum0[i + 1][3] + sum1[i][3] + sum1[i + 1][3]);
    }

    return ssim;
}

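/* per-block variance statistics: the pixel sum is returned in the low 32
 * bits and the sum of squared pixels in the high 32 bits of one uint64_t */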
template<int size>
uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
{
    uint32_t sum = 0, sqr = 0;

    for (int y = 0; y < size; y++)
    {
        for (int x = 0; x < size; x++)
        {
            sum += pix[x];
            sqr += pix[x] * pix[x];
        }

        pix += i_stride;
    }

    return sum + ((uint64_t)sqr << 32);
}

#if defined(_MSC_VER)
#pragma warning(disable: 4127) // conditional expression is constant
#endif

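/* psy-rd cost: measures how much AC energy (sa8d, which sees AC + DC, minus
 * SAD >> 2 as a DC estimate) the reconstruction loses relative to the source.
 * The template parameter is the BLOCK_NxN enum value, so size == 0 denotes
 * 4x4, where satd_4x4 substitutes for sa8d. */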
template<int size>
int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
{
    static pixel zeroBuf[8] /* = { 0 } */;

    if (size)
    {
        int dim = 1 << (size + 2);
        uint32_t totEnergy = 0;
        for (int i = 0; i < dim; i += 8)
        {
            for (int j = 0; j < dim; j += 8)
            {
                /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
                int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride, zeroBuf, 0) -
                                   (sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
                int reconEnergy = sa8d_8x8(recon + i * rstride + j, rstride, zeroBuf, 0) -
                                  (sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);

                totEnergy += abs(sourceEnergy - reconEnergy);
            }
        }
        return totEnergy;
    }
    else
    {
        /* 4x4 is too small for sa8d */
        int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
        int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
        return abs(sourceEnergy - reconEnergy);
    }
}

template<int size>
int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
{
    static int16_t zeroBuf[8] /* = { 0 } */;

    if (size)
    {
        int dim = 1 << (size + 2);
        uint32_t totEnergy = 0;
        for (int i = 0; i < dim; i += 8)
        {
            for (int j = 0; j < dim; j += 8)
            {
                /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
                int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride, zeroBuf, 0) -
                                   (sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
                int reconEnergy = sa8d_8x8(recon + i * rstride + j, rstride, zeroBuf, 0) -
                                  (sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);

                totEnergy += abs(sourceEnergy - reconEnergy);
            }
        }
        return totEnergy;
    }
    else
    {
        /* 4x4 is too small for sa8d */
        int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
        int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
        return abs(sourceEnergy - reconEnergy);
    }
}

template<int bx, int by>
void blockcopy_pp_c(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            a[x] = b[x];

        a += stridea;
        b += strideb;
    }
}

template<int bx, int by>
void blockcopy_ss_c(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            a[x] = b[x];

        a += stridea;
        b += strideb;
    }
}

template<int bx, int by>
void blockcopy_sp_c(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
        {
            X265_CHECK((b[x] >= 0) && (b[x] <= ((1 << X265_DEPTH) - 1)), "blockcopy pixel size fail\n");
            a[x] = (pixel)b[x];
        }

        a += stridea;
        b += strideb;
    }
}

template<int bx, int by>
void blockcopy_ps_c(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            a[x] = (int16_t)b[x];

        a += stridea;
        b += strideb;
    }
}

template<int bx, int by>
void pixel_sub_ps_c(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            a[x] = (int16_t)(b0[x] - b1[x]);

        b0 += sstride0;
        b1 += sstride1;
        a += dstride;
    }
}

template<int bx, int by>
void pixel_add_ps_c(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            a[x] = Clip(b0[x] + b1[x]);

        b0 += sstride0;
        b1 += sstride1;
        a += dstride;
    }
}

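/* bi-prediction average: sums two 16-bit interpolator outputs, cancels the
 * doubled IF_INTERNAL_OFFS bias inside the rounding offset, and shifts back
 * down to pixel precision */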
template<int bx, int by>
void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
{
    int shiftNum, offset;

    shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
    offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;

    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x += 2)
        {
            dst[x + 0] = Clip((src0[x + 0] + src1[x + 0] + offset) >> shiftNum);
            dst[x + 1] = Clip((src0[x + 1] + src1[x + 1] + offset) >> shiftNum);
        }

        src0 += src0Stride;
        src1 += src1Stride;
        dst += dstStride;
    }
}

void planecopy_cp_c(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
{
    for (int r = 0; r < height; r++)
    {
        for (int c = 0; c < width; c++)
            dst[c] = ((pixel)src[c]) << shift;

        dst += dstStride;
        src += srcStride;
    }
}

void planecopy_sp_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
{
    for (int r = 0; r < height; r++)
    {
        for (int c = 0; c < width; c++)
            dst[c] = (pixel)((src[c] >> shift) & mask);

        dst += dstStride;
        src += srcStride;
    }
}

/* Estimate the total amount of influence on future quality that could be had if we
 * were to improve the reference samples used to inter predict any given CU. */
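/* Each block propagates propagateAmount * (intraCost - interCost) / intraCost,
 * the share of its cost that inter prediction fails to explain; only the low
 * 14 bits of interCosts hold the cost, the upper bits carry reference info. */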
void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
                             const int32_t* invQscales, const double* fpsFactor, int len)
{
    double fps = *fpsFactor / 256;

    for (int i = 0; i < len; i++)
    {
        double intraCost = intraCosts[i] * invQscales[i];
        double propagateAmount = (double)propagateIn[i] + intraCost * fps;
        double propagateNum = (double)intraCosts[i] - (interCosts[i] & ((1 << 14) - 1));
        double propagateDenom = (double)intraCosts[i];
        dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5);
    }
}
} // end anonymous namespace

namespace x265 {
// x265 private namespace

/* Extend the edges of a picture so that it may safely be used for motion
 * compensation. This function assumes the picture is stored in a buffer with
 * sufficient padding for the X and Y margins */
void extendPicBorder(pixel* pic, intptr_t stride, int width, int height, int marginX, int marginY)
{
    /* extend left and right margins */
    primitives.extendRowBorder(pic, stride, width, height, marginX);

    /* copy top row to create above margin */
    pixel* top = pic - marginX;
    for (int y = 0; y < marginY; y++)
        memcpy(top - (y + 1) * stride, top, stride * sizeof(pixel));

    /* copy bottom row to create below margin */
    pixel* bot = pic - marginX + (height - 1) * stride;
    for (int y = 0; y < marginY; y++)
        memcpy(bot + (y + 1) * stride, bot, stride * sizeof(pixel));
}

/* Initialize entries for pixel functions defined in this file */
void Setup_C_PixelPrimitives(EncoderPrimitives &p)
{
    SET_FUNC_PRIMITIVE_TABLE_C2(sad)
    SET_FUNC_PRIMITIVE_TABLE_C2(sad_x3)
    SET_FUNC_PRIMITIVE_TABLE_C2(sad_x4)
    SET_FUNC_PRIMITIVE_TABLE_C2(pixelavg_pp)

    // satd
    p.satd[LUMA_4x4] = satd_4x4;
    p.satd[LUMA_8x8] = satd8<8, 8>;
    p.satd[LUMA_8x4] = satd_8x4;
    p.satd[LUMA_4x8] = satd4<4, 8>;
    p.satd[LUMA_16x16] = satd8<16, 16>;
    p.satd[LUMA_16x8] = satd8<16, 8>;
    p.satd[LUMA_8x16] = satd8<8, 16>;
    p.satd[LUMA_16x12] = satd8<16, 12>;
    p.satd[LUMA_12x16] = satd4<12, 16>;
    p.satd[LUMA_16x4] = satd8<16, 4>;
    p.satd[LUMA_4x16] = satd4<4, 16>;
    p.satd[LUMA_32x32] = satd8<32, 32>;
    p.satd[LUMA_32x16] = satd8<32, 16>;
    p.satd[LUMA_16x32] = satd8<16, 32>;
    p.satd[LUMA_32x24] = satd8<32, 24>;
    p.satd[LUMA_24x32] = satd8<24, 32>;
    p.satd[LUMA_32x8] = satd8<32, 8>;
    p.satd[LUMA_8x32] = satd8<8, 32>;
    p.satd[LUMA_64x64] = satd8<64, 64>;
    p.satd[LUMA_64x32] = satd8<64, 32>;
    p.satd[LUMA_32x64] = satd8<32, 64>;
    p.satd[LUMA_64x48] = satd8<64, 48>;
    p.satd[LUMA_48x64] = satd8<48, 64>;
    p.satd[LUMA_64x16] = satd8<64, 16>;
    p.satd[LUMA_16x64] = satd8<16, 64>;

    p.chroma[X265_CSP_I420].satd[CHROMA_2x2] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_4x4] = satd_4x4;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x8] = satd8<8, 8>;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x16] = satd8<16, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_32x32] = satd8<32, 32>;

    p.chroma[X265_CSP_I420].satd[CHROMA_4x2] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_2x4] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x4] = satd_8x4;
    p.chroma[X265_CSP_I420].satd[CHROMA_4x8] = satd4<4, 8>;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x8] = satd8<16, 8>;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x16] = satd8<8, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_32x16] = satd8<32, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x32] = satd8<16, 32>;

    p.chroma[X265_CSP_I420].satd[CHROMA_8x6] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_6x8] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x2] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_2x8] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x12] = satd4<16, 12>;
    p.chroma[X265_CSP_I420].satd[CHROMA_12x16] = satd4<12, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x4] = satd4<16, 4>;
    p.chroma[X265_CSP_I420].satd[CHROMA_4x16] = satd4<4, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_32x24] = satd8<32, 24>;
    p.chroma[X265_CSP_I420].satd[CHROMA_24x32] = satd8<24, 32>;
    p.chroma[X265_CSP_I420].satd[CHROMA_32x8] = satd8<32, 8>;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x32] = satd8<8, 32>;

    p.chroma[X265_CSP_I422].satd[CHROMA422_2x4] = NULL;
    p.chroma[X265_CSP_I422].satd[CHROMA422_4x8] = satd4<4, 8>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x16] = satd8<8, 16>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x32] = satd8<16, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_32x64] = satd8<32, 64>;

    p.chroma[X265_CSP_I422].satd[CHROMA422_4x4] = satd_4x4;
    p.chroma[X265_CSP_I422].satd[CHROMA422_2x8] = NULL;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x8] = satd8<8, 8>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_4x16] = satd4<4, 16>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x16] = satd8<16, 16>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x32] = satd8<8, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_32x32] = satd8<32, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x64] = satd8<16, 64>;

    p.chroma[X265_CSP_I422].satd[CHROMA422_8x12] = satd4<8, 12>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_6x16] = NULL;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x4] = satd4<8, 4>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_2x16] = NULL;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x24] = satd8<16, 24>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_12x32] = satd4<12, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x8] = satd8<16, 8>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_4x32] = satd4<4, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_32x48] = satd8<32, 48>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_24x64] = satd8<24, 64>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_32x16] = satd8<32, 16>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x64] = satd8<8, 64>;

#define CHROMA_420(W, H) \
    p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H] = addAvg<W, H>; \
    p.chroma[X265_CSP_I420].copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.chroma[X265_CSP_I420].copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.chroma[X265_CSP_I420].copy_ss[CHROMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define CHROMA_422(W, H) \
    p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = addAvg<W, H>; \
    p.chroma[X265_CSP_I422].copy_pp[CHROMA422_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.chroma[X265_CSP_I422].copy_ps[CHROMA422_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.chroma[X265_CSP_I422].copy_ss[CHROMA422_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define CHROMA_444(W, H) \
    p.chroma[X265_CSP_I444].satd[LUMA_ ## W ## x ## H] = p.satd[LUMA_ ## W ## x ## H]; \
    p.chroma[X265_CSP_I444].addAvg[LUMA_ ## W ## x ## H] = addAvg<W, H>; \
    p.chroma[X265_CSP_I444].copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.chroma[X265_CSP_I444].copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.chroma[X265_CSP_I444].copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.chroma[X265_CSP_I444].copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define LUMA(W, H) \
    p.luma_addAvg[LUMA_ ## W ## x ## H] = addAvg<W, H>; \
    p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.luma_copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.luma_copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define LUMA_PIXELSUB(W, H) \
    p.luma_sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.luma_add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;

#define CHROMA_PIXELSUB_420(W, H) \
    p.chroma[X265_CSP_I420].sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.chroma[X265_CSP_I420].add_ps[CHROMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;

#define CHROMA_PIXELSUB_422(W, H) \
    p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## W ## x ## H] = pixel_add_ps_c<W, H>;

#define CHROMA_PIXELSUB_444(W, H) \
    p.chroma[X265_CSP_I444].sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.chroma[X265_CSP_I444].add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;

    LUMA(4, 4);
    LUMA(8, 8);
    CHROMA_420(4, 4);
    LUMA(4, 8);
    CHROMA_420(2, 4);
    LUMA(8, 4);
    CHROMA_420(4, 2);
    LUMA(16, 16);
    CHROMA_420(8, 8);
    LUMA(16, 8);
    CHROMA_420(8, 4);
    LUMA(8, 16);
    CHROMA_420(4, 8);
    LUMA(16, 12);
    CHROMA_420(8, 6);
    LUMA(12, 16);
    CHROMA_420(6, 8);
    LUMA(16, 4);
    CHROMA_420(8, 2);
    LUMA(4, 16);
    CHROMA_420(2, 8);
    LUMA(32, 32);
    CHROMA_420(16, 16);
    LUMA(32, 16);
    CHROMA_420(16, 8);
    LUMA(16, 32);
    CHROMA_420(8, 16);
    LUMA(32, 24);
    CHROMA_420(16, 12);
    LUMA(24, 32);
    CHROMA_420(12, 16);
    LUMA(32, 8);
    CHROMA_420(16, 4);
    LUMA(8, 32);
    CHROMA_420(4, 16);
    LUMA(64, 64);
    CHROMA_420(32, 32);
    LUMA(64, 32);
    CHROMA_420(32, 16);
    LUMA(32, 64);
    CHROMA_420(16, 32);
    LUMA(64, 48);
    CHROMA_420(32, 24);
    LUMA(48, 64);
    CHROMA_420(24, 32);
    LUMA(64, 16);
    CHROMA_420(32, 8);
    LUMA(16, 64);
    CHROMA_420(8, 32);

    LUMA_PIXELSUB(4, 4);
    LUMA_PIXELSUB(8, 8);
    LUMA_PIXELSUB(16, 16);
    LUMA_PIXELSUB(32, 32);
    LUMA_PIXELSUB(64, 64);
    CHROMA_PIXELSUB_420(4, 4)
    CHROMA_PIXELSUB_420(8, 8)
    CHROMA_PIXELSUB_420(16, 16)
    CHROMA_PIXELSUB_420(32, 32)
    CHROMA_PIXELSUB_422(4, 8)
    CHROMA_PIXELSUB_422(8, 16)
    CHROMA_PIXELSUB_422(16, 32)
    CHROMA_PIXELSUB_422(32, 64)
    CHROMA_PIXELSUB_444(8, 8)
    CHROMA_PIXELSUB_444(16, 16)
    CHROMA_PIXELSUB_444(32, 32)
    CHROMA_PIXELSUB_444(64, 64)

    CHROMA_422(4, 8);
    CHROMA_422(4, 4);
    CHROMA_422(2, 8);
    CHROMA_422(8, 16);
    CHROMA_422(8, 8);
    CHROMA_422(4, 16);
    CHROMA_422(8, 12);
    CHROMA_422(6, 16);
    CHROMA_422(8, 4);
    CHROMA_422(2, 16);
    CHROMA_422(16, 32);
    CHROMA_422(16, 16);
    CHROMA_422(8, 32);
    CHROMA_422(16, 24);
    CHROMA_422(12, 32);
    CHROMA_422(16, 8);
    CHROMA_422(4, 32);
    CHROMA_422(32, 64);
    CHROMA_422(32, 32);
    CHROMA_422(16, 64);
    CHROMA_422(32, 48);
    CHROMA_422(24, 64);
    CHROMA_422(32, 16);
    CHROMA_422(8, 64);

    CHROMA_444(4, 4);
    CHROMA_444(8, 8);
    CHROMA_444(4, 8);
    CHROMA_444(8, 4);
    CHROMA_444(16, 16);
    CHROMA_444(16, 8);
    CHROMA_444(8, 16);
    CHROMA_444(16, 12);
    CHROMA_444(12, 16);
    CHROMA_444(16, 4);
    CHROMA_444(4, 16);
    CHROMA_444(32, 32);
    CHROMA_444(32, 16);
    CHROMA_444(16, 32);
    CHROMA_444(32, 24);
    CHROMA_444(24, 32);
    CHROMA_444(32, 8);
    CHROMA_444(8, 32);
    CHROMA_444(64, 64);
    CHROMA_444(64, 32);
    CHROMA_444(32, 64);
    CHROMA_444(64, 48);
    CHROMA_444(48, 64);
    CHROMA_444(64, 16);
    CHROMA_444(16, 64);

    SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixel, pixel)
    SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, int16_t, pixel)
    SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, int16_t, int16_t)

    p.blockfill_s[BLOCK_4x4] = blockfil_s_c<4>;
    p.blockfill_s[BLOCK_8x8] = blockfil_s_c<8>;
    p.blockfill_s[BLOCK_16x16] = blockfil_s_c<16>;
    p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>;
    p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>;

    p.cpy2Dto1D_shl[BLOCK_4x4] = cpy2Dto1D_shl<4>;
    p.cpy2Dto1D_shl[BLOCK_8x8] = cpy2Dto1D_shl<8>;
    p.cpy2Dto1D_shl[BLOCK_16x16] = cpy2Dto1D_shl<16>;
    p.cpy2Dto1D_shl[BLOCK_32x32] = cpy2Dto1D_shl<32>;
    p.cpy2Dto1D_shr[BLOCK_4x4] = cpy2Dto1D_shr<4>;
    p.cpy2Dto1D_shr[BLOCK_8x8] = cpy2Dto1D_shr<8>;
    p.cpy2Dto1D_shr[BLOCK_16x16] = cpy2Dto1D_shr<16>;
    p.cpy2Dto1D_shr[BLOCK_32x32] = cpy2Dto1D_shr<32>;
    p.cpy1Dto2D_shl[BLOCK_4x4] = cpy1Dto2D_shl<4>;
    p.cpy1Dto2D_shl[BLOCK_8x8] = cpy1Dto2D_shl<8>;
    p.cpy1Dto2D_shl[BLOCK_16x16] = cpy1Dto2D_shl<16>;
    p.cpy1Dto2D_shl[BLOCK_32x32] = cpy1Dto2D_shl<32>;
    p.cpy1Dto2D_shr[BLOCK_4x4] = cpy1Dto2D_shr<4>;
    p.cpy1Dto2D_shr[BLOCK_8x8] = cpy1Dto2D_shr<8>;
    p.cpy1Dto2D_shr[BLOCK_16x16] = cpy1Dto2D_shr<16>;
    p.cpy1Dto2D_shr[BLOCK_32x32] = cpy1Dto2D_shr<32>;

    p.sa8d[BLOCK_4x4] = satd_4x4;
    p.sa8d[BLOCK_8x8] = sa8d_8x8;
    p.sa8d[BLOCK_16x16] = sa8d_16x16;
    p.sa8d[BLOCK_32x32] = sa8d16<32, 32>;
    p.sa8d[BLOCK_64x64] = sa8d16<64, 64>;

    p.psy_cost_pp[BLOCK_4x4] = psyCost_pp<BLOCK_4x4>;
    p.psy_cost_pp[BLOCK_8x8] = psyCost_pp<BLOCK_8x8>;
    p.psy_cost_pp[BLOCK_16x16] = psyCost_pp<BLOCK_16x16>;
    p.psy_cost_pp[BLOCK_32x32] = psyCost_pp<BLOCK_32x32>;
    p.psy_cost_pp[BLOCK_64x64] = psyCost_pp<BLOCK_64x64>;

    p.psy_cost_ss[BLOCK_4x4] = psyCost_ss<BLOCK_4x4>;
    p.psy_cost_ss[BLOCK_8x8] = psyCost_ss<BLOCK_8x8>;
    p.psy_cost_ss[BLOCK_16x16] = psyCost_ss<BLOCK_16x16>;
    p.psy_cost_ss[BLOCK_32x32] = psyCost_ss<BLOCK_32x32>;
    p.psy_cost_ss[BLOCK_64x64] = psyCost_ss<BLOCK_64x64>;

    p.sa8d_inter[LUMA_4x4] = satd_4x4;
    p.sa8d_inter[LUMA_8x8] = sa8d_8x8;
    p.sa8d_inter[LUMA_8x4] = satd_8x4;
    p.sa8d_inter[LUMA_4x8] = satd4<4, 8>;
    p.sa8d_inter[LUMA_16x16] = sa8d_16x16;
    p.sa8d_inter[LUMA_16x8] = sa8d8<16, 8>;
    p.sa8d_inter[LUMA_8x16] = sa8d8<8, 16>;
    p.sa8d_inter[LUMA_16x12] = satd8<16, 12>;
    p.sa8d_inter[LUMA_12x16] = satd4<12, 16>;
    p.sa8d_inter[LUMA_4x16] = satd4<4, 16>;
    p.sa8d_inter[LUMA_16x4] = satd8<16, 4>;
    p.sa8d_inter[LUMA_32x32] = sa8d16<32, 32>;
    p.sa8d_inter[LUMA_32x16] = sa8d16<32, 16>;
    p.sa8d_inter[LUMA_16x32] = sa8d16<16, 32>;
    p.sa8d_inter[LUMA_32x24] = sa8d8<32, 24>;
    p.sa8d_inter[LUMA_24x32] = sa8d8<24, 32>;
    p.sa8d_inter[LUMA_32x8] = sa8d8<32, 8>;
    p.sa8d_inter[LUMA_8x32] = sa8d8<8, 32>;
    p.sa8d_inter[LUMA_64x64] = sa8d16<64, 64>;
    p.sa8d_inter[LUMA_64x32] = sa8d16<64, 32>;
    p.sa8d_inter[LUMA_32x64] = sa8d16<32, 64>;
    p.sa8d_inter[LUMA_64x48] = sa8d16<64, 48>;
    p.sa8d_inter[LUMA_48x64] = sa8d16<48, 64>;
    p.sa8d_inter[LUMA_64x16] = sa8d16<64, 16>;
    p.sa8d_inter[LUMA_16x64] = sa8d16<16, 64>;

    p.calcresidual[BLOCK_4x4] = getResidual<4>;
    p.calcresidual[BLOCK_8x8] = getResidual<8>;
    p.calcresidual[BLOCK_16x16] = getResidual<16>;
    p.calcresidual[BLOCK_32x32] = getResidual<32>;
    p.calcresidual[BLOCK_64x64] = NULL;

    p.transpose[BLOCK_4x4] = transpose<4>;
    p.transpose[BLOCK_8x8] = transpose<8>;
    p.transpose[BLOCK_16x16] = transpose<16>;
    p.transpose[BLOCK_32x32] = transpose<32>;
    p.transpose[BLOCK_64x64] = transpose<64>;

    p.ssd_s[BLOCK_4x4] = pixel_ssd_s_c<4>;
    p.ssd_s[BLOCK_8x8] = pixel_ssd_s_c<8>;
    p.ssd_s[BLOCK_16x16] = pixel_ssd_s_c<16>;
    p.ssd_s[BLOCK_32x32] = pixel_ssd_s_c<32>;

    p.weight_pp = weight_pp_c;
    p.weight_sp = weight_sp_c;

    p.scale1D_128to64 = scale1D_128to64;
    p.scale2D_64to32 = scale2D_64to32;
    p.frameInitLowres = frame_init_lowres_core;
    p.ssim_4x4x2_core = ssim_4x4x2_core;
    p.ssim_end_4 = ssim_end_4;

    p.var[BLOCK_8x8] = pixel_var<8>;
    p.var[BLOCK_16x16] = pixel_var<16>;
    p.var[BLOCK_32x32] = pixel_var<32>;
    p.var[BLOCK_64x64] = pixel_var<64>;
    p.planecopy_cp = planecopy_cp_c;
    p.planecopy_sp = planecopy_sp_c;
    p.propagateCost = estimateCUPropagateCost;
}
}