/*****************************************************************************
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
 *          Mandar Gurav <mandar@multicorewareinc.com>
 *          Mahesh Pittala <mahesh@multicorewareinc.com>
 *          Min Chen <min.chen@multicorewareinc.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

#include "primitives.h"

#include <cstdlib> // abs()

using namespace x265;

#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, FUNC_TYPE_CAST, DATA_TYPE1, DATA_TYPE2) \
    p.FUNC_PREFIX[LUMA_4x4]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 4, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x8]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x4]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 4, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_4x8]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x8]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x16]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x12] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_12x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x4]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 4, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_4x16]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x24] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_24x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x8]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x32]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x48] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_48x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;

#define SET_FUNC_PRIMITIVE_TABLE_C2(FUNC_PREFIX) \
    p.FUNC_PREFIX[LUMA_4x4]   = FUNC_PREFIX<4, 4>; \
    p.FUNC_PREFIX[LUMA_8x8]   = FUNC_PREFIX<8, 8>; \
    p.FUNC_PREFIX[LUMA_8x4]   = FUNC_PREFIX<8, 4>; \
    p.FUNC_PREFIX[LUMA_4x8]   = FUNC_PREFIX<4, 8>; \
    p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX<16, 16>; \
    p.FUNC_PREFIX[LUMA_16x8]  = FUNC_PREFIX<16, 8>; \
    p.FUNC_PREFIX[LUMA_8x16]  = FUNC_PREFIX<8, 16>; \
    p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX<16, 12>; \
    p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX<12, 16>; \
    p.FUNC_PREFIX[LUMA_16x4]  = FUNC_PREFIX<16, 4>; \
    p.FUNC_PREFIX[LUMA_4x16]  = FUNC_PREFIX<4, 16>; \
    p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX<32, 32>; \
    p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX<32, 16>; \
    p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX<16, 32>; \
    p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX<32, 24>; \
    p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX<24, 32>; \
    p.FUNC_PREFIX[LUMA_32x8]  = FUNC_PREFIX<32, 8>; \
    p.FUNC_PREFIX[LUMA_8x32]  = FUNC_PREFIX<8, 32>; \
    p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX<64, 64>; \
    p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX<64, 32>; \
    p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX<32, 64>; \
    p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX<64, 48>; \
    p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX<48, 64>; \
    p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX<64, 16>; \
    p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX<16, 64>;

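// For example, SET_FUNC_PRIMITIVE_TABLE_C2(sad) expands to one assignment per
// luma partition size, e.g. p.sad[LUMA_16x16] = sad<16, 16>;, so each C
// reference function below is instantiated once per block size.
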
namespace {
// place functions in anonymous namespace (file static)

template<int lx, int ly>
int sad(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
{
    int sum = 0;

    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            sum += abs(pix1[x] - pix2[x]);
        }

        pix1 += stride_pix1;
        pix2 += stride_pix2;
    }

    return sum;
}

template<int lx, int ly>
int sad(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2)
{
    int sum = 0;

    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            sum += abs(pix1[x] - pix2[x]);
        }

        pix1 += stride_pix1;
        pix2 += stride_pix2;
    }

    return sum;
}

template<int lx, int ly>
void sad_x3(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, intptr_t frefstride, int32_t *res)
{
    res[0] = 0;
    res[1] = 0;
    res[2] = 0;
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            res[0] += abs(pix1[x] - pix2[x]);
            res[1] += abs(pix1[x] - pix3[x]);
            res[2] += abs(pix1[x] - pix4[x]);
        }

        pix1 += FENC_STRIDE;
        pix2 += frefstride;
        pix3 += frefstride;
        pix4 += frefstride;
    }
}

template<int lx, int ly>
void sad_x4(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, pixel *pix5, intptr_t frefstride, int32_t *res)
{
    res[0] = 0;
    res[1] = 0;
    res[2] = 0;
    res[3] = 0;
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            res[0] += abs(pix1[x] - pix2[x]);
            res[1] += abs(pix1[x] - pix3[x]);
            res[2] += abs(pix1[x] - pix4[x]);
            res[3] += abs(pix1[x] - pix5[x]);
        }

        pix1 += FENC_STRIDE;
        pix2 += frefstride;
        pix3 += frefstride;
        pix4 += frefstride;
        pix5 += frefstride;
    }
}

template<int lx, int ly, class T1, class T2>
int sse(T1 *pix1, intptr_t stride_pix1, T2 *pix2, intptr_t stride_pix2)
{
    int sum = 0;
    int iTemp;

    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            iTemp = pix1[x] - pix2[x];
            sum += (iTemp * iTemp);
        }

        pix1 += stride_pix1;
        pix2 += stride_pix2;
    }

    return sum;
}

typedef uint16_t sum_t;
typedef uint32_t sum2_t;
typedef int32_t ssum2_t; // signed variant for int16_t inputs
#define BITS_PER_SUM (8 * sizeof(sum_t))

#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) { \
        sum2_t t0 = s0 + s1; \
        sum2_t t1 = s0 - s1; \
        sum2_t t2 = s2 + s3; \
        sum2_t t3 = s2 - s3; \
        d0 = t0 + t2; \
        d2 = t0 - t2; \
        d1 = t1 + t3; \
        d3 = t1 - t3; \
}

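// HADAMARD4 is a single 4-point Hadamard butterfly; expanding the temporaries
// gives
//     d0 = s0 + s1 + s2 + s3
//     d1 = s0 - s1 + s2 - s3
//     d2 = s0 + s1 - s2 - s3
//     d3 = s0 - s1 - s2 + s3
// Applying it once across rows and once across columns yields the 2D
// transform used by the satd/sa8d functions below.
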
// in: a pseudo-simd number of the form x+(y<<16)
// return: abs(x)+(abs(y)<<16)
inline sum2_t abs2(sum2_t a)
{
    sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1);

    return (a + s) ^ s;
}

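// Worked example with 8-bit pixels (sum_t is 16 bits wide): the shift, mask
// and multiply build a per-half sign mask s (0x0000 or 0xFFFF in each half),
// so (a + s) ^ s is the branchless abs identity applied to both packed halves
// at once. E.g. for x = -3, y = 2, a = 0x0001FFFD and abs2(a) = 0x00020003,
// i.e. abs(x) + (abs(y) << 16).

// satd_4x4: sum of absolute values of the 4x4 2D Hadamard transform of the
// pixel differences. Each row is transformed with its four coefficients
// packed two per word (sum half / difference half), so only two column
// butterflies are needed; the final ">> 1" is the usual x264-style
// normalization of the unnormalized transform.
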
int satd_4x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
{
    sum2_t tmp[4][2];
    sum2_t a0, a1, a2, a3, b0, b1;
    sum2_t sum = 0;

    for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
        tmp[i][0] = b0 + b1;
        tmp[i][1] = b0 - b1;
    }

    for (int i = 0; i < 2; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
        sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM);
    }

    return (int)(sum >> 1);
}

int satd_4x4(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2)
{
    ssum2_t tmp[4][2];
    ssum2_t a0, a1, a2, a3, b0, b1;
    ssum2_t sum = 0;

    for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
        tmp[i][0] = b0 + b1;
        tmp[i][1] = b0 - b1;
    }

    for (int i = 0; i < 2; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
        sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM);
    }

    return (int)(sum >> 1);
}

// x264's SWAR version of satd 8x4, performs two 4x4 SATDs at once
int satd_8x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
{
    sum2_t tmp[4][4];
    sum2_t a0, a1, a2, a3;
    sum2_t sum = 0;

    for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
    {
        a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);
        a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);
        a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);
        a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);
        HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
    }

    for (int i = 0; i < 4; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }

    return (((sum_t)sum) + (sum >> BITS_PER_SUM)) >> 1;
}

template<int w, int h>
// calculate satd in blocks of 4x4
int satd4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
{
    int satd = 0;

    for (int row = 0; row < h; row += 4)
    {
        for (int col = 0; col < w; col += 4)
        {
            satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
                             pix2 + row * stride_pix2 + col, stride_pix2);
        }
    }

    return satd;
}

template<int w, int h>
// calculate satd in blocks of 8x4
int satd8(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
{
    int satd = 0;

    for (int row = 0; row < h; row += 4)
    {
        for (int col = 0; col < w; col += 8)
        {
            satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
                             pix2 + row * stride_pix2 + col, stride_pix2);
        }
    }

    return satd;
}

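// sa8d: sum of absolute values of the 8x8 Hadamard transform of the pixel
// differences, using the same SWAR packing as satd above. _sa8d_8x8 does the
// horizontal pass row by row into tmp[][], then the vertical pass combines
// rows 0-3 with rows 4-7 through abs2(a +/- b) as the final butterfly stage;
// the sa8d_8x8 wrappers round the raw sum with (+2) >> 2.
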
inline int _sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
{
    sum2_t tmp[8][4];
    sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
    sum2_t sum = 0;

    for (int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2)
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
        a4 = pix1[4] - pix2[4];
        a5 = pix1[5] - pix2[5];
        b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM);
        a6 = pix1[6] - pix2[6];
        a7 = pix1[7] - pix2[7];
        b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM);
        HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3);
    }

    for (int i = 0; i < 4; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]);
        b0 = abs2(a0 + a4) + abs2(a0 - a4);
        b0 += abs2(a1 + a5) + abs2(a1 - a5);
        b0 += abs2(a2 + a6) + abs2(a2 - a6);
        b0 += abs2(a3 + a7) + abs2(a3 - a7);
        sum += (sum_t)b0 + (b0 >> BITS_PER_SUM);
    }

    return (int)sum;
}

int sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
{
    return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
}

inline int _sa8d_8x8(int16_t *pix1, intptr_t i_pix1, int16_t *pix2, intptr_t i_pix2)
{
    ssum2_t tmp[8][4];
    ssum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
    ssum2_t sum = 0;

    for (int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2)
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
        a4 = pix1[4] - pix2[4];
        a5 = pix1[5] - pix2[5];
        b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM);
        a6 = pix1[6] - pix2[6];
        a7 = pix1[7] - pix2[7];
        b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM);
        HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3);
    }

    for (int i = 0; i < 4; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]);
        b0 = abs2(a0 + a4) + abs2(a0 - a4);
        b0 += abs2(a1 + a5) + abs2(a1 - a5);
        b0 += abs2(a2 + a6) + abs2(a2 - a6);
        b0 += abs2(a3 + a7) + abs2(a3 - a7);
        sum += (sum_t)b0 + (b0 >> BITS_PER_SUM);
    }

    return (int)sum;
}

int sa8d_8x8(int16_t *pix1, intptr_t i_pix1, int16_t *pix2, intptr_t i_pix2)
{
    return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
}

int sa8d_16x16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
{
    int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2)
        + _sa8d_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2)
        + _sa8d_8x8(pix1 + 8 * i_pix1, i_pix1, pix2 + 8 * i_pix2, i_pix2)
        + _sa8d_8x8(pix1 + 8 + 8 * i_pix1, i_pix1, pix2 + 8 + 8 * i_pix2, i_pix2);

    // This matches x264 sa8d_16x16, but is slightly different from HM's behavior because
    // this version only rounds once at the end
    return (sum + 2) >> 2;
}

template<int w, int h>
// Calculate sa8d in blocks of 8x8
int sa8d8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
{
    int cost = 0;

    for (int y = 0; y < h; y += 8)
    {
        for (int x = 0; x < w; x += 8)
        {
            cost += sa8d_8x8(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
        }
    }

    return cost;
}

template<int w, int h>
// Calculate sa8d in blocks of 16x16
int sa8d16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
{
    int cost = 0;

    for (int y = 0; y < h; y += 16)
    {
        for (int x = 0; x < w; x += 16)
        {
            cost += sa8d_16x16(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
        }
    }

    return cost;
}

template<int size>
int pixel_ssd_s_c(short *a, intptr_t dstride)
{
    int sum = 0;

    for (int y = 0; y < size; y++)
    {
        for (int x = 0; x < size; x++)
        {
            sum += a[x] * a[x];
        }

        a += dstride;
    }

    return sum;
}

template<int size>
void blockfil_s_c(int16_t *dst, intptr_t dstride, int16_t val)
{
    for (int y = 0; y < size; y++)
    {
        for (int x = 0; x < size; x++)
        {
            dst[y * dstride + x] = val;
        }
    }
}

void convert16to32_shl(int32_t *dst, int16_t *src, intptr_t stride, int shift, int size)
{
    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
        {
            dst[i * size + j] = ((int)src[i * stride + j]) << shift;
        }
    }
}

template<int size>
void convert16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset)
{
    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
        {
            dst[i * size + j] = ((int)src[i * stride + j] + offset) >> shift;
        }
    }
}

void convert32to16_shr(int16_t *dst, int32_t *src, intptr_t stride, int shift, int size)
{
    int round = 1 << (shift - 1);

    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
        {
            dst[j] = (int16_t)((src[j] + round) >> shift);
        }

        src += size;
        dst += stride;
    }
}

void copy_shr(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size)
{
    int round = 1 << (shift - 1);

    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
        {
            dst[j] = (int16_t)((src[j] + round) >> shift);
        }

        src += size;
        dst += stride;
    }
}

template<int size>
void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
{
    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
        {
            dst[j] = ((int16_t)src[j] << shift);
        }

        src += size;
        dst += stride;
    }
}

template<int size>
void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
{
    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
        {
            dst[j] = (src[j] << shift);
        }

        src += size;
        dst += stride;
    }
}

template<int blockSize>
void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
{
    for (int y = 0; y < blockSize; y++)
    {
        for (int x = 0; x < blockSize; x++)
        {
            residual[x] = static_cast<int16_t>(fenc[x]) - static_cast<int16_t>(pred[x]);
        }

        fenc += stride;
        pred += stride;
        residual += stride;
    }
}

template<int blockSize>
void transpose(pixel* dst, pixel* src, intptr_t stride)
{
    for (int k = 0; k < blockSize; k++)
    {
        for (int l = 0; l < blockSize; l++)
        {
            dst[k * blockSize + l] = src[l * stride + k];
        }
    }
}

void weight_sp_c(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
{
    int x, y;

    for (y = 0; y <= height - 1; y++)
    {
        for (x = 0; x <= width - 1; )
        {
            // note: width can be odd
            dst[x] = (pixel)Clip3(0, ((1 << X265_DEPTH) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);
            x++;
        }

        src += srcStride;
        dst += dstStride;
    }
}

void weight_pp_c(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
{
    int x, y;

    X265_CHECK(!(width & 15), "weightp alignment error\n");
    X265_CHECK(!((w0 << 6) > 32767), "w0 using more than 16 bits, asm output will mismatch\n");
    X265_CHECK(!(round > 32767), "round using more than 16 bits, asm output will mismatch\n");

    for (y = 0; y <= height - 1; y++)
    {
        for (x = 0; x <= width - 1; )
        {
            // simulating pixel to short conversion
            int16_t val = src[x] << (IF_INTERNAL_PREC - X265_DEPTH);
            dst[x] = (pixel)Clip3(0, ((1 << X265_DEPTH) - 1), ((w0 * (val) + round) >> shift) + offset);
            x++;
        }

        src += stride;
        dst += stride;
    }
}

template<int lx, int ly>
void pixelavg_pp(pixel* dst, intptr_t dstride, pixel* src0, intptr_t sstride0, pixel* src1, intptr_t sstride1, int)
{
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            dst[x] = (src0[x] + src1[x] + 1) >> 1;
        }

        src0 += sstride0;
        src1 += sstride1;
        dst += dstride;
    }
}

void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/)
{
    int x;

    for (x = 0; x < 128; x += 2)
    {
        pixel pix0 = src[(x + 0)];
        pixel pix1 = src[(x + 1)];
        int sum = pix0 + pix1;

        dst[x >> 1] = (pixel)((sum + 1) >> 1);
    }
}

void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
{
    int x, y;

    for (y = 0; y < 64; y += 2)
    {
        for (x = 0; x < 64; x += 2)
        {
            pixel pix0 = src[(y + 0) * stride + (x + 0)];
            pixel pix1 = src[(y + 0) * stride + (x + 1)];
            pixel pix2 = src[(y + 1) * stride + (x + 0)];
            pixel pix3 = src[(y + 1) * stride + (x + 1)];
            int sum = pix0 + pix1 + pix2 + pix3;

            dst[y / 2 * 32 + x / 2] = (pixel)((sum + 2) >> 2);
        }
    }
}

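// Builds the 2x-decimated lowres planes used by the lookahead: dst0 holds the
// half-scale image at integer positions, while dsth/dstv/dstc hold the same
// image at half-pel horizontal, vertical and diagonal offsets, so the lowres
// motion search can do cheap subpel refinement.
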
void frame_init_lowres_core(pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
                            intptr_t src_stride, intptr_t dst_stride, int width, int height)
{
    for (int y = 0; y < height; y++)
    {
        pixel *src1 = src0 + src_stride;
        pixel *src2 = src1 + src_stride;

        for (int x = 0; x < width; x++)
        {
            // slower than naive bilinear, but matches asm
#define FILTER(a, b, c, d) ((((a + b + 1) >> 1) + ((c + d + 1) >> 1) + 1) >> 1)
            dst0[x] = FILTER(src0[2 * x], src1[2 * x], src0[2 * x + 1], src1[2 * x + 1]);
            dsth[x] = FILTER(src0[2 * x + 1], src1[2 * x + 1], src0[2 * x + 2], src1[2 * x + 2]);
            dstv[x] = FILTER(src1[2 * x], src2[2 * x], src1[2 * x + 1], src2[2 * x + 1]);
            dstc[x] = FILTER(src1[2 * x + 1], src2[2 * x + 1], src1[2 * x + 2], src2[2 * x + 2]);
#undef FILTER
        }

        src0 += src_stride * 2;
        dst0 += dst_stride;
        dsth += dst_stride;
        dstv += dst_stride;
        dstc += dst_stride;
    }
}

/* structural similarity metric */
void ssim_4x4x2_core(const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, int sums[2][4])
{
    for (int z = 0; z < 2; z++)
    {
        uint32_t s1 = 0, s2 = 0, ss = 0, s12 = 0;

        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                int a = pix1[x + y * stride1];
                int b = pix2[x + y * stride2];

                s1 += a;
                s2 += b;
                ss += a * a;
                ss += b * b;
                s12 += a * b;
            }
        }

        sums[z][0] = s1;
        sums[z][1] = s2;
        sums[z][2] = ss;
        sums[z][3] = s12;
        pix1 += 4;
        pix2 += 4;
    }
}

float ssim_end_1(int s1, int s2, int ss, int s12)
{
/* Maximum value for 10-bit is: ss*64 = (2^10-1)^2*16*4*64 = 4286582784, which will overflow in some cases.
 * s1*s1, s2*s2, and s1*s2 also obtain this value for edge cases: ((2^10-1)*16*4)^2 = 4286582784.
 * Maximum value for 9-bit is: ss*64 = (2^9-1)^2*16*4*64 = 1069551616, which will not overflow. */

#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
#if HIGH_BIT_DEPTH
    X265_CHECK(X265_DEPTH == 10, "ssim invalid depth\n");
#define type float
    static const float ssim_c1 = (float)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64);
    static const float ssim_c2 = (float)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63);
#else
    X265_CHECK(X265_DEPTH == 8, "ssim invalid depth\n");
#define type int
    static const int ssim_c1 = (int)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64 + .5);
    static const int ssim_c2 = (int)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63 + .5);
#endif
    type fs1 = (type)s1;
    type fs2 = (type)s2;
    type fss = (type)ss;
    type fs12 = (type)s12;
    type vars = (type)(fss * 64 - fs1 * fs1 - fs2 * fs2);
    type covar = (type)(fs12 * 64 - fs1 * fs2);

    return (float)(2 * fs1 * fs2 + ssim_c1) * (float)(2 * covar + ssim_c2)
           / ((float)(fs1 * fs1 + fs2 * fs2 + ssim_c1) * (float)(vars + ssim_c2));
#undef type
#undef PIXEL_MAX
}

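/* The return value above is the standard SSIM ratio
 *     (2*mu1*mu2 + C1) * (2*cov + C2) / ((mu1^2 + mu2^2 + C1) * (var1 + var2 + C2))
 * evaluated on raw sums over one 8x8 window (a 2x2 group of 4x4 blocks,
 * N = 64 pixels): vars and covar are the variances and covariance in the
 * scaled sum domain (N*ss - s1*s1 - s2*s2 and N*s12 - s1*s2), with ssim_c1
 * and ssim_c2 playing the role of the C1/C2 stabilizers folded into the same
 * fixed-point scale. */
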
float ssim_end_4(int sum0[5][4], int sum1[5][4], int width)
{
    float ssim = 0.0;

    for (int i = 0; i < width; i++)
    {
        ssim += ssim_end_1(sum0[i][0] + sum0[i + 1][0] + sum1[i][0] + sum1[i + 1][0],
                           sum0[i][1] + sum0[i + 1][1] + sum1[i][1] + sum1[i + 1][1],
                           sum0[i][2] + sum0[i + 1][2] + sum1[i][2] + sum1[i + 1][2],
                           sum0[i][3] + sum0[i + 1][3] + sum1[i][3] + sum1[i + 1][3]);
    }

    return ssim;
}

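// pixel_var below packs two accumulators into one uint64_t: the pixel sum in
// the low 32 bits and the sum of squares in the high 32 bits; the caller
// unpacks both halves to derive the block variance (e.g. for AQ energy).
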
template<int size>
uint64_t pixel_var(pixel *pix, intptr_t i_stride)
{
    uint32_t sum = 0, sqr = 0;

    for (int y = 0; y < size; y++)
    {
        for (int x = 0; x < size; x++)
        {
            sum += pix[x];
            sqr += pix[x] * pix[x];
        }

        pix += i_stride;
    }

    return sum + ((uint64_t)sqr << 32);
}

#if defined(_MSC_VER)
#pragma warning(disable: 4127) // conditional expression is constant
#endif

template<int size>
int psyCost_pp(pixel *source, intptr_t sstride, pixel *recon, intptr_t rstride)
{
    static pixel zeroBuf[8] /* = { 0 } */;

    if (size)
    {
        int dim = 1 << (size + 2);
        uint32_t totEnergy = 0;
        for (int i = 0; i < dim; i += 8)
        {
            for (int j = 0; j < dim; j += 8)
            {
                /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
                int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride, zeroBuf, 0) -
                                   (sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
                int reconEnergy = sa8d_8x8(recon + i * rstride + j, rstride, zeroBuf, 0) -
                                  (sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);

                totEnergy += abs(sourceEnergy - reconEnergy);
            }
        }

        return totEnergy;
    }
    else
    {
        /* 4x4 is too small for sa8d */
        int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
        int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);

        return abs(sourceEnergy - reconEnergy);
    }
}

template<int size>
int psyCost_ss(int16_t *source, intptr_t sstride, int16_t *recon, intptr_t rstride)
{
    static int16_t zeroBuf[8] /* = { 0 } */;

    if (size)
    {
        int dim = 1 << (size + 2);
        uint32_t totEnergy = 0;
        for (int i = 0; i < dim; i += 8)
        {
            for (int j = 0; j < dim; j += 8)
            {
                /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
                int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride, zeroBuf, 0) -
                                   (sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
                int reconEnergy = sa8d_8x8(recon + i * rstride + j, rstride, zeroBuf, 0) -
                                  (sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);

                totEnergy += abs(sourceEnergy - reconEnergy);
            }
        }

        return totEnergy;
    }
    else
    {
        /* 4x4 is too small for sa8d */
        int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
        int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);

        return abs(sourceEnergy - reconEnergy);
    }
}

void plane_copy_deinterleave_chroma(pixel *dstu, intptr_t dstuStride, pixel *dstv, intptr_t dstvStride,
                                    pixel *src, intptr_t srcStride, int w, int h)
{
    for (int y = 0; y < h; y++, dstu += dstuStride, dstv += dstvStride, src += srcStride)
    {
        for (int x = 0; x < w; x++)
        {
            dstu[x] = src[2 * x];
            dstv[x] = src[2 * x + 1];
        }
    }
}

template<int bx, int by>
void blockcopy_pp_c(pixel *a, intptr_t stridea, pixel *b, intptr_t strideb)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
        {
            a[x] = b[x];
        }

        a += stridea;
        b += strideb;
    }
}

template<int bx, int by>
void blockcopy_ss_c(int16_t *a, intptr_t stridea, int16_t *b, intptr_t strideb)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
        {
            a[x] = b[x];
        }

        a += stridea;
        b += strideb;
    }
}

template<int bx, int by>
void blockcopy_sp_c(pixel *a, intptr_t stridea, int16_t *b, intptr_t strideb)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
        {
            X265_CHECK((b[x] >= 0) && (b[x] <= ((1 << X265_DEPTH) - 1)), "blockcopy pixel size fail\n");
            a[x] = (pixel)b[x];
        }

        a += stridea;
        b += strideb;
    }
}

template<int bx, int by>
void blockcopy_ps_c(int16_t *a, intptr_t stridea, pixel *b, intptr_t strideb)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
        {
            a[x] = (int16_t)b[x];
        }

        a += stridea;
        b += strideb;
    }
}

template<int bx, int by>
void pixel_sub_ps_c(int16_t *a, intptr_t dstride, pixel *b0, pixel *b1, intptr_t sstride0, intptr_t sstride1)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
        {
            a[x] = (int16_t)(b0[x] - b1[x]);
        }

        b0 += sstride0;
        b1 += sstride1;
        a += dstride;
    }
}

template<int bx, int by>
void pixel_add_ps_c(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t sstride0, intptr_t sstride1)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
        {
            a[x] = Clip(b0[x] + b1[x]);
        }

        b0 += sstride0;
        b1 += sstride1;
        a += dstride;
    }
}

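// addAvg below averages two 16-bit interpolation intermediates (each biased
// by -IF_INTERNAL_OFFS) and converts back to pixel depth in one step:
// shiftNum removes the extra IF_INTERNAL_PREC precision plus the /2 of the
// average, while offset supplies the rounding term and restores both biases.
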
template<int bx, int by>
void addAvg(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
{
    int shiftNum, offset;

    shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
    offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;

    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x += 2)
        {
            dst[x + 0] = Clip((src0[x + 0] + src1[x + 0] + offset) >> shiftNum);
            dst[x + 1] = Clip((src0[x + 1] + src1[x + 1] + offset) >> shiftNum);
        }

        src0 += src0Stride;
        src1 += src1Stride;
        dst += dstStride;
    }
}

void planecopy_cp_c(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
{
    for (int r = 0; r < height; r++)
    {
        for (int c = 0; c < width; c++)
        {
            dst[c] = ((pixel)src[c]) << shift;
        }

        dst += dstStride;
        src += srcStride;
    }
}

void planecopy_sp_c(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
{
    for (int r = 0; r < height; r++)
    {
        for (int c = 0; c < width; c++)
        {
            dst[c] = (pixel)((src[c] >> shift) & mask);
        }

        dst += dstStride;
        src += srcStride;
    }
}

/* Estimate the total amount of influence on future quality that could be had if we
 * were to improve the reference samples used to inter predict any given CU. */
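/* propagateAmount is the block's accumulated future cost: the propagate-in
 * inherited from later frames plus its own intra cost weighted by the fps
 * factor. The fraction (intraCost - interCost) / intraCost estimates how much
 * of that cost inter prediction pulls from the reference frames (as in
 * x264's MB-tree); the inter cost itself lives in the low 14 bits of
 * interCosts, the upper bits being used for reference bookkeeping. */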
void estimateCUPropagateCost(int *dst, uint16_t *propagateIn, int32_t *intraCosts, uint16_t *interCosts,
                             int32_t *invQscales, double *fpsFactor, int len)
{
    double fps = *fpsFactor / 256;

    for (int i = 0; i < len; i++)
    {
        double intraCost = intraCosts[i] * invQscales[i];
        double propagateAmount = (double)propagateIn[i] + intraCost * fps;
        double propagateNum = (double)intraCosts[i] - (interCosts[i] & ((1 << 14) - 1));
        double propagateDenom = (double)intraCosts[i];
        dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5);
    }
}
}  // end anonymous namespace

namespace x265 {
// x265 private namespace

/* Extend the edges of a picture so that it may safely be used for motion
 * compensation. This function assumes the picture is stored in a buffer with
 * sufficient padding for the X and Y margins */
void extendPicBorder(pixel* pic, intptr_t stride, int width, int height, int marginX, int marginY)
{
    /* extend left and right margins */
    primitives.extendRowBorder(pic, stride, width, height, marginX);

    /* copy top row to create above margin */
    pixel *top = pic - marginX;
    for (int y = 0; y < marginY; y++)
        memcpy(top - (y + 1) * stride, top, stride * sizeof(pixel));

    /* copy bottom row to create below margin */
    pixel *bot = pic - marginX + (height - 1) * stride;
    for (int y = 0; y < marginY; y++)
        memcpy(bot + (y + 1) * stride, bot, stride * sizeof(pixel));
}

/* Initialize entries for pixel functions defined in this file */
void Setup_C_PixelPrimitives(EncoderPrimitives &p)
{
    SET_FUNC_PRIMITIVE_TABLE_C2(sad)
    SET_FUNC_PRIMITIVE_TABLE_C2(sad_x3)
    SET_FUNC_PRIMITIVE_TABLE_C2(sad_x4)
    SET_FUNC_PRIMITIVE_TABLE_C2(pixelavg_pp)

    p.satd[LUMA_4x4]   = satd_4x4;
    p.satd[LUMA_8x8]   = satd8<8, 8>;
    p.satd[LUMA_8x4]   = satd_8x4;
    p.satd[LUMA_4x8]   = satd4<4, 8>;
    p.satd[LUMA_16x16] = satd8<16, 16>;
    p.satd[LUMA_16x8]  = satd8<16, 8>;
    p.satd[LUMA_8x16]  = satd8<8, 16>;
    p.satd[LUMA_16x12] = satd8<16, 12>;
    p.satd[LUMA_12x16] = satd4<12, 16>;
    p.satd[LUMA_16x4]  = satd8<16, 4>;
    p.satd[LUMA_4x16]  = satd4<4, 16>;
    p.satd[LUMA_32x32] = satd8<32, 32>;
    p.satd[LUMA_32x16] = satd8<32, 16>;
    p.satd[LUMA_16x32] = satd8<16, 32>;
    p.satd[LUMA_32x24] = satd8<32, 24>;
    p.satd[LUMA_24x32] = satd8<24, 32>;
    p.satd[LUMA_32x8]  = satd8<32, 8>;
    p.satd[LUMA_8x32]  = satd8<8, 32>;
    p.satd[LUMA_64x64] = satd8<64, 64>;
    p.satd[LUMA_64x32] = satd8<64, 32>;
    p.satd[LUMA_32x64] = satd8<32, 64>;
    p.satd[LUMA_64x48] = satd8<64, 48>;
    p.satd[LUMA_48x64] = satd8<48, 64>;
    p.satd[LUMA_64x16] = satd8<64, 16>;
    p.satd[LUMA_16x64] = satd8<16, 64>;

#define CHROMA_420(W, H) \
    p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H]  = addAvg<W, H>; \
    p.chroma[X265_CSP_I420].copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.chroma[X265_CSP_I420].copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.chroma[X265_CSP_I420].copy_ss[CHROMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define CHROMA_422(W, H) \
    p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H]  = addAvg<W, H>; \
    p.chroma[X265_CSP_I422].copy_pp[CHROMA422_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.chroma[X265_CSP_I422].copy_ps[CHROMA422_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.chroma[X265_CSP_I422].copy_ss[CHROMA422_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define CHROMA_444(W, H) \
    p.chroma[X265_CSP_I444].addAvg[LUMA_ ## W ## x ## H]  = addAvg<W, H>; \
    p.chroma[X265_CSP_I444].copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.chroma[X265_CSP_I444].copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.chroma[X265_CSP_I444].copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.chroma[X265_CSP_I444].copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define LUMA(W, H) \
    p.luma_addAvg[LUMA_ ## W ## x ## H]  = addAvg<W, H>; \
    p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.luma_copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.luma_copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define LUMA_PIXELSUB(W, H) \
    p.luma_sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.luma_add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;

#define CHROMA_PIXELSUB_420(W, H) \
    p.chroma[X265_CSP_I420].sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.chroma[X265_CSP_I420].add_ps[CHROMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;

#define CHROMA_PIXELSUB_422(W, H) \
    p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## W ## x ## H] = pixel_add_ps_c<W, H>;

#define CHROMA_PIXELSUB_444(W, H) \
    p.chroma[X265_CSP_I444].sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.chroma[X265_CSP_I444].add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;

    LUMA_PIXELSUB(4, 4);
    LUMA_PIXELSUB(8, 8);
    LUMA_PIXELSUB(16, 16);
    LUMA_PIXELSUB(32, 32);
    LUMA_PIXELSUB(64, 64);

    CHROMA_PIXELSUB_420(4, 4)
    CHROMA_PIXELSUB_420(8, 8)
    CHROMA_PIXELSUB_420(16, 16)
    CHROMA_PIXELSUB_420(32, 32)

    CHROMA_PIXELSUB_422(4, 8)
    CHROMA_PIXELSUB_422(8, 16)
    CHROMA_PIXELSUB_422(16, 32)
    CHROMA_PIXELSUB_422(32, 64)

    CHROMA_PIXELSUB_444(8, 8)
    CHROMA_PIXELSUB_444(16, 16)
    CHROMA_PIXELSUB_444(32, 32)
    CHROMA_PIXELSUB_444(64, 64)

    SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixelcmp_t, pixel, pixel)
    SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, pixelcmp_sp_t, int16_t, pixel)
    SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, pixelcmp_ss_t, int16_t, int16_t)

    p.blockfill_s[BLOCK_4x4]   = blockfil_s_c<4>;
    p.blockfill_s[BLOCK_8x8]   = blockfil_s_c<8>;
    p.blockfill_s[BLOCK_16x16] = blockfil_s_c<16>;
    p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>;
    p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>;

    p.cvt16to32_shl = convert16to32_shl;
    p.cvt16to32_shr[BLOCK_4x4]   = convert16to32_shr<4>;
    p.cvt16to32_shr[BLOCK_8x8]   = convert16to32_shr<8>;
    p.cvt16to32_shr[BLOCK_16x16] = convert16to32_shr<16>;
    p.cvt16to32_shr[BLOCK_32x32] = convert16to32_shr<32>;
    p.cvt32to16_shr = convert32to16_shr;
    p.cvt32to16_shl[BLOCK_4x4]   = convert32to16_shl<4>;
    p.cvt32to16_shl[BLOCK_8x8]   = convert32to16_shl<8>;
    p.cvt32to16_shl[BLOCK_16x16] = convert32to16_shl<16>;
    p.cvt32to16_shl[BLOCK_32x32] = convert32to16_shl<32>;

    p.copy_shr = copy_shr;
    p.copy_shl[BLOCK_4x4]   = copy_shl<4>;
    p.copy_shl[BLOCK_8x8]   = copy_shl<8>;
    p.copy_shl[BLOCK_16x16] = copy_shl<16>;
    p.copy_shl[BLOCK_32x32] = copy_shl<32>;

    p.sa8d[BLOCK_4x4]   = satd_4x4;
    p.sa8d[BLOCK_8x8]   = sa8d_8x8;
    p.sa8d[BLOCK_16x16] = sa8d_16x16;
    p.sa8d[BLOCK_32x32] = sa8d16<32, 32>;
    p.sa8d[BLOCK_64x64] = sa8d16<64, 64>;

    p.psy_cost_pp[BLOCK_4x4]   = psyCost_pp<BLOCK_4x4>;
    p.psy_cost_pp[BLOCK_8x8]   = psyCost_pp<BLOCK_8x8>;
    p.psy_cost_pp[BLOCK_16x16] = psyCost_pp<BLOCK_16x16>;
    p.psy_cost_pp[BLOCK_32x32] = psyCost_pp<BLOCK_32x32>;
    p.psy_cost_pp[BLOCK_64x64] = psyCost_pp<BLOCK_64x64>;

    p.psy_cost_ss[BLOCK_4x4]   = psyCost_ss<BLOCK_4x4>;
    p.psy_cost_ss[BLOCK_8x8]   = psyCost_ss<BLOCK_8x8>;
    p.psy_cost_ss[BLOCK_16x16] = psyCost_ss<BLOCK_16x16>;
    p.psy_cost_ss[BLOCK_32x32] = psyCost_ss<BLOCK_32x32>;
    p.psy_cost_ss[BLOCK_64x64] = psyCost_ss<BLOCK_64x64>;

    p.sa8d_inter[LUMA_4x4]   = satd_4x4;
    p.sa8d_inter[LUMA_8x8]   = sa8d_8x8;
    p.sa8d_inter[LUMA_8x4]   = satd_8x4;
    p.sa8d_inter[LUMA_4x8]   = satd4<4, 8>;
    p.sa8d_inter[LUMA_16x16] = sa8d_16x16;
    p.sa8d_inter[LUMA_16x8]  = sa8d8<16, 8>;
    p.sa8d_inter[LUMA_8x16]  = sa8d8<8, 16>;
    p.sa8d_inter[LUMA_16x12] = satd8<16, 12>;
    p.sa8d_inter[LUMA_12x16] = satd4<12, 16>;
    p.sa8d_inter[LUMA_4x16]  = satd4<4, 16>;
    p.sa8d_inter[LUMA_16x4]  = satd8<16, 4>;
    p.sa8d_inter[LUMA_32x32] = sa8d16<32, 32>;
    p.sa8d_inter[LUMA_32x16] = sa8d16<32, 16>;
    p.sa8d_inter[LUMA_16x32] = sa8d16<16, 32>;
    p.sa8d_inter[LUMA_32x24] = sa8d8<32, 24>;
    p.sa8d_inter[LUMA_24x32] = sa8d8<24, 32>;
    p.sa8d_inter[LUMA_32x8]  = sa8d8<32, 8>;
    p.sa8d_inter[LUMA_8x32]  = sa8d8<8, 32>;
    p.sa8d_inter[LUMA_64x64] = sa8d16<64, 64>;
    p.sa8d_inter[LUMA_64x32] = sa8d16<64, 32>;
    p.sa8d_inter[LUMA_32x64] = sa8d16<32, 64>;
    p.sa8d_inter[LUMA_64x48] = sa8d16<64, 48>;
    p.sa8d_inter[LUMA_48x64] = sa8d16<48, 64>;
    p.sa8d_inter[LUMA_64x16] = sa8d16<64, 16>;
    p.sa8d_inter[LUMA_16x64] = sa8d16<16, 64>;

    p.calcresidual[BLOCK_4x4]   = getResidual<4>;
    p.calcresidual[BLOCK_8x8]   = getResidual<8>;
    p.calcresidual[BLOCK_16x16] = getResidual<16>;
    p.calcresidual[BLOCK_32x32] = getResidual<32>;
    p.calcresidual[BLOCK_64x64] = NULL;

    p.transpose[BLOCK_4x4]   = transpose<4>;
    p.transpose[BLOCK_8x8]   = transpose<8>;
    p.transpose[BLOCK_16x16] = transpose<16>;
    p.transpose[BLOCK_32x32] = transpose<32>;
    p.transpose[BLOCK_64x64] = transpose<64>;

    p.ssd_s[BLOCK_4x4]   = pixel_ssd_s_c<4>;
    p.ssd_s[BLOCK_8x8]   = pixel_ssd_s_c<8>;
    p.ssd_s[BLOCK_16x16] = pixel_ssd_s_c<16>;
    p.ssd_s[BLOCK_32x32] = pixel_ssd_s_c<32>;

    p.weight_pp = weight_pp_c;
    p.weight_sp = weight_sp_c;

    p.scale1D_128to64 = scale1D_128to64;
    p.scale2D_64to32 = scale2D_64to32;
    p.frame_init_lowres_core = frame_init_lowres_core;
    p.ssim_4x4x2_core = ssim_4x4x2_core;
    p.ssim_end_4 = ssim_end_4;

    p.var[BLOCK_8x8]   = pixel_var<8>;
    p.var[BLOCK_16x16] = pixel_var<16>;
    p.var[BLOCK_32x32] = pixel_var<32>;
    p.var[BLOCK_64x64] = pixel_var<64>;
    p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma;
    p.planecopy_cp = planecopy_cp_c;
    p.planecopy_sp = planecopy_sp_c;
    p.propagateCost = estimateCUPropagateCost;
}
}