/*****************************************************************************
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
 *          Mandar Gurav <mandar@multicorewareinc.com>
 *          Mahesh Pittala <mahesh@multicorewareinc.com>
 *          Min Chen <min.chen@multicorewareinc.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/
#include "common.h"
#include "primitives.h"

#include <cstdlib> // abs()
#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, DATA_TYPE1, DATA_TYPE2) \
    p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX_DEF<4, 4, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x8] = FUNC_PREFIX_DEF<8, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x4] = FUNC_PREFIX_DEF<8, 4, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_4x8] = FUNC_PREFIX_DEF<4, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x8] = FUNC_PREFIX_DEF<16, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x16] = FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x4] = FUNC_PREFIX_DEF<16, 4, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_4x16] = FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x8] = FUNC_PREFIX_DEF<32, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x32] = FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
#define SET_FUNC_PRIMITIVE_TABLE_C2(FUNC_PREFIX) \
    p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX<4, 4>; \
    p.FUNC_PREFIX[LUMA_8x8] = FUNC_PREFIX<8, 8>; \
    p.FUNC_PREFIX[LUMA_8x4] = FUNC_PREFIX<8, 4>; \
    p.FUNC_PREFIX[LUMA_4x8] = FUNC_PREFIX<4, 8>; \
    p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX<16, 16>; \
    p.FUNC_PREFIX[LUMA_16x8] = FUNC_PREFIX<16, 8>; \
    p.FUNC_PREFIX[LUMA_8x16] = FUNC_PREFIX<8, 16>; \
    p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX<16, 12>; \
    p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX<12, 16>; \
    p.FUNC_PREFIX[LUMA_16x4] = FUNC_PREFIX<16, 4>; \
    p.FUNC_PREFIX[LUMA_4x16] = FUNC_PREFIX<4, 16>; \
    p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX<32, 32>; \
    p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX<32, 16>; \
    p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX<16, 32>; \
    p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX<32, 24>; \
    p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX<24, 32>; \
    p.FUNC_PREFIX[LUMA_32x8] = FUNC_PREFIX<32, 8>; \
    p.FUNC_PREFIX[LUMA_8x32] = FUNC_PREFIX<8, 32>; \
    p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX<64, 64>; \
    p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX<64, 32>; \
    p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX<32, 64>; \
    p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX<64, 48>; \
    p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX<48, 64>; \
    p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX<64, 16>; \
    p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX<16, 64>;
using namespace x265;

namespace {
// place functions in anonymous namespace (file static)
template<int lx, int ly>
int sad(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int sum = 0;

    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
            sum += abs(pix1[x] - pix2[x]);

        pix1 += stride_pix1;
        pix2 += stride_pix2;
    }

    return sum;
}
template<int lx, int ly>
int sad(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
{
    int sum = 0;

    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
            sum += abs(pix1[x] - pix2[x]);

        pix1 += stride_pix1;
        pix2 += stride_pix2;
    }

    return sum;
}
template<int lx, int ly>
void sad_x3(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
{
    res[0] = 0;
    res[1] = 0;
    res[2] = 0;
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            res[0] += abs(pix1[x] - pix2[x]);
            res[1] += abs(pix1[x] - pix3[x]);
            res[2] += abs(pix1[x] - pix4[x]);
        }

        pix1 += FENC_STRIDE;
        pix2 += frefstride;
        pix3 += frefstride;
        pix4 += frefstride;
    }
}
template<int lx, int ly>
void sad_x4(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
{
    res[0] = 0;
    res[1] = 0;
    res[2] = 0;
    res[3] = 0;
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            res[0] += abs(pix1[x] - pix2[x]);
            res[1] += abs(pix1[x] - pix3[x]);
            res[2] += abs(pix1[x] - pix4[x]);
            res[3] += abs(pix1[x] - pix5[x]);
        }

        pix1 += FENC_STRIDE;
        pix2 += frefstride;
        pix3 += frefstride;
        pix4 += frefstride;
        pix5 += frefstride;
    }
}
template<int lx, int ly, class T1, class T2>
int sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
{
    int sum = 0;
    int tmp;

    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            tmp = pix1[x] - pix2[x];
            sum += (tmp * tmp);
        }

        pix1 += stride_pix1;
        pix2 += stride_pix2;
    }

    return sum;
}
#define BITS_PER_SUM (8 * sizeof(sum_t))
#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) { \
        sum2_t t0 = s0 + s1; \
        sum2_t t1 = s0 - s1; \
        sum2_t t2 = s2 + s3; \
        sum2_t t3 = s2 - s3; \
        d0 = t0 + t2; \
        d2 = t0 - t2; \
        d1 = t1 + t3; \
        d3 = t1 - t3; \
}
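
/* The butterfly above expands to a 4-point Hadamard transform:
 *   d0 = s0 + s1 + s2 + s3
 *   d1 = s0 - s1 + s2 - s3
 *   d2 = s0 + s1 - s2 - s3
 *   d3 = s0 - s1 - s2 + s3 */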
// in: a pseudo-simd number of the form x+(y<<16)
// return: abs(x)+(abs(y)<<16)
inline sum2_t abs2(sum2_t a)
{
    sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1);

    return (a + s) ^ s;
}
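
/* Worked example, assuming a build where sum_t is 16 bits wide
 * (BITS_PER_SUM == 16): x = -3, y = 5 gives a = x + (y << 16) = 0x0004FFFD.
 * The mask s replicates each lane's sign bit across that lane, so
 * s = 0x0000FFFF, and (a + s) ^ s = 0x00050003 = abs(x) + (abs(y) << 16);
 * the carry out of the low lane restores the borrow taken when x < 0. */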
int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    sum2_t tmp[4][2];
    sum2_t a0, a1, a2, a3, b0, b1;
    sum2_t sum = 0;

    for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
        tmp[i][0] = b0 + b1;
        tmp[i][1] = b0 - b1;
    }

    for (int i = 0; i < 2; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
        sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM);
    }

    return (int)(sum >> 1);
}
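
/* In satd_4x4 above, each sum2_t packs two results: b0 holds (a0 + a1) in its
 * low lane and (a0 - a1) in its high lane, so the horizontal and vertical
 * transforms run on two packed columns at once. The fold
 * ((sum_t)a0) + (a0 >> BITS_PER_SUM) then adds the two lanes together. */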
int satd_4x4(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
{
    ssum2_t tmp[4][2];
    ssum2_t a0, a1, a2, a3, b0, b1;
    ssum2_t sum = 0;

    for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
        tmp[i][0] = b0 + b1;
        tmp[i][1] = b0 - b1;
    }

    for (int i = 0; i < 2; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
        sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM);
    }

    return (int)(sum >> 1);
}
// x264's SWAR version of satd 8x4, performs two 4x4 SATDs at once
int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    sum2_t tmp[4][4];
    sum2_t a0, a1, a2, a3;
    sum2_t sum = 0;

    for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
    {
        a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);
        a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);
        a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);
        a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);
        HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
    }

    for (int i = 0; i < 4; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }

    return (((sum_t)sum) + (sum >> BITS_PER_SUM)) >> 1;
}
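
/* satd_8x4 above packs columns 0-3 into the low lanes and columns 4-7 into
 * the high lanes, so the two 4x4 SATDs are computed simultaneously; the
 * return statement folds the two lane totals together before the final
 * normalizing shift. */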
template<int w, int h>
// calculate satd in blocks of 4x4
int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;

    for (int row = 0; row < h; row += 4)
        for (int col = 0; col < w; col += 4)
            satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
                             pix2 + row * stride_pix2 + col, stride_pix2);

    return satd;
}
template<int w, int h>
// calculate satd in blocks of 8x4
int satd8(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;

    for (int row = 0; row < h; row += 4)
        for (int col = 0; col < w; col += 8)
            satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
                             pix2 + row * stride_pix2 + col, stride_pix2);

    return satd;
}
inline int _sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    sum2_t tmp[8][4];
    sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
    sum2_t sum = 0;

    for (int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2)
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
        a4 = pix1[4] - pix2[4];
        a5 = pix1[5] - pix2[5];
        b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM);
        a6 = pix1[6] - pix2[6];
        a7 = pix1[7] - pix2[7];
        b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM);
        HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3);
    }

    for (int i = 0; i < 4; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]);
        b0 = abs2(a0 + a4) + abs2(a0 - a4);
        b0 += abs2(a1 + a5) + abs2(a1 - a5);
        b0 += abs2(a2 + a6) + abs2(a2 - a6);
        b0 += abs2(a3 + a7) + abs2(a3 - a7);
        sum += (sum_t)b0 + (b0 >> BITS_PER_SUM);
    }

    return (int)sum;
}
int sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
}
inline int _sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2)
{
    ssum2_t tmp[8][4];
    ssum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
    ssum2_t sum = 0;

    for (int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2)
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
        a4 = pix1[4] - pix2[4];
        a5 = pix1[5] - pix2[5];
        b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM);
        a6 = pix1[6] - pix2[6];
        a7 = pix1[7] - pix2[7];
        b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM);
        HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3);
    }

    for (int i = 0; i < 4; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]);
        b0 = abs2(a0 + a4) + abs2(a0 - a4);
        b0 += abs2(a1 + a5) + abs2(a1 - a5);
        b0 += abs2(a2 + a6) + abs2(a2 - a6);
        b0 += abs2(a3 + a7) + abs2(a3 - a7);
        sum += (sum_t)b0 + (b0 >> BITS_PER_SUM);
    }

    return (int)sum;
}
int sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2)
{
    return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
}
int sa8d_16x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2)
        + _sa8d_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2)
        + _sa8d_8x8(pix1 + 8 * i_pix1, i_pix1, pix2 + 8 * i_pix2, i_pix2)
        + _sa8d_8x8(pix1 + 8 + 8 * i_pix1, i_pix1, pix2 + 8 + 8 * i_pix2, i_pix2);

    // This matches x264 sa8d_16x16, but is slightly different from HM's behavior because
    // this version only rounds once at the end
    return (sum + 2) >> 2;
}
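
/* Rounding once matters: four raw 8x8 sums of 10 give (40 + 2) >> 2 = 10
 * here, while rounding each block first, as HM does, would give
 * ((10 + 2) >> 2) * 4 = 12. */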
template<int w, int h>
// Calculate sa8d in blocks of 8x8
int sa8d8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    int cost = 0;

    for (int y = 0; y < h; y += 8)
        for (int x = 0; x < w; x += 8)
            cost += sa8d_8x8(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);

    return cost;
}
template<int w, int h>
// Calculate sa8d in blocks of 16x16
int sa8d16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    int cost = 0;

    for (int y = 0; y < h; y += 16)
        for (int x = 0; x < w; x += 16)
            cost += sa8d_16x16(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);

    return cost;
}
template<int size>
int pixel_ssd_s_c(const int16_t* a, intptr_t dstride)
{
    int sum = 0;

    for (int y = 0; y < size; y++)
    {
        for (int x = 0; x < size; x++)
            sum += a[x] * a[x];

        a += dstride;
    }

    return sum;
}
template<int size>
void blockfil_s_c(int16_t* dst, intptr_t dstride, int16_t val)
{
    for (int y = 0; y < size; y++)
        for (int x = 0; x < size; x++)
            dst[y * dstride + x] = val;
}
template<int size>
void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
{
    X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
    X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
    X265_CHECK(shift >= 0, "invalid shift\n");

    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
            dst[j] = src[j] << shift;

        src += srcStride;
        dst += size;
    }
}
template<int size>
void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
{
    X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
    X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
    X265_CHECK(shift > 0, "invalid shift\n");

    int16_t round = 1 << (shift - 1);
    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
            dst[j] = (src[j] + round) >> shift;

        src += srcStride;
        dst += size;
    }
}
template<int size>
void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
{
    X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
    X265_CHECK(shift >= 0, "invalid shift\n");

    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
            dst[j] = src[j] << shift;

        src += size;
        dst += dstStride;
    }
}
template<int size>
void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
{
    X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
    X265_CHECK(shift > 0, "invalid shift\n");

    int16_t round = 1 << (shift - 1);
    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
            dst[j] = (src[j] + round) >> shift;

        src += size;
        dst += dstStride;
    }
}
template<int blockSize>
void getResidual(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)
{
    for (int y = 0; y < blockSize; y++)
    {
        for (int x = 0; x < blockSize; x++)
            residual[x] = static_cast<int16_t>(fenc[x]) - static_cast<int16_t>(pred[x]);

        fenc += stride;
        residual += stride;
        pred += stride;
    }
}
template<int blockSize>
void transpose(pixel* dst, const pixel* src, intptr_t stride)
{
    for (int k = 0; k < blockSize; k++)
        for (int l = 0; l < blockSize; l++)
            dst[k * blockSize + l] = src[l * stride + k];
}
void weight_sp_c(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
{
    int x, y;

    for (y = 0; y <= height - 1; y++)
    {
        for (x = 0; x <= width - 1; )
        {
            // note: width can be odd
            dst[x] = (pixel)Clip3(0, ((1 << X265_DEPTH) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);
            x++;
        }

        src += srcStride;
        dst += dstStride;
    }
}
void weight_pp_c(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
{
    int x, y;

    X265_CHECK(!(width & 15), "weightp alignment error\n");
    X265_CHECK(!((w0 << 6) > 32767), "w0 using more than 16 bits, asm output will mismatch\n");
    X265_CHECK(!(round > 32767), "round using more than 16 bits, asm output will mismatch\n");

    for (y = 0; y <= height - 1; y++)
    {
        for (x = 0; x <= width - 1; )
        {
            // simulating pixel to short conversion
            int16_t val = src[x] << (IF_INTERNAL_PREC - X265_DEPTH);
            dst[x] = (pixel)Clip3(0, ((1 << X265_DEPTH) - 1), ((w0 * (val) + round) >> shift) + offset);
            x++;
        }

        src += stride;
        dst += stride;
    }
}
template<int lx, int ly>
void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
{
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
            dst[x] = (src0[x] + src1[x] + 1) >> 1;

        src0 += sstride0;
        src1 += sstride1;
        dst += dstride;
    }
}
void scale1D_128to64(pixel* dst, const pixel* src, intptr_t /*stride*/)
{
    int x;

    for (x = 0; x < 128; x += 2)
    {
        pixel pix0 = src[(x + 0)];
        pixel pix1 = src[(x + 1)];
        int sum = pix0 + pix1;

        dst[x >> 1] = (pixel)((sum + 1) >> 1);
    }
}
void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
{
    int x, y;

    for (y = 0; y < 64; y += 2)
    {
        for (x = 0; x < 64; x += 2)
        {
            pixel pix0 = src[(y + 0) * stride + (x + 0)];
            pixel pix1 = src[(y + 0) * stride + (x + 1)];
            pixel pix2 = src[(y + 1) * stride + (x + 0)];
            pixel pix3 = src[(y + 1) * stride + (x + 1)];
            int sum = pix0 + pix1 + pix2 + pix3;

            dst[y / 2 * 32 + x / 2] = (pixel)((sum + 2) >> 2);
        }
    }
}
void frame_init_lowres_core(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc,
                            intptr_t src_stride, intptr_t dst_stride, int width, int height)
{
    for (int y = 0; y < height; y++)
    {
        const pixel* src1 = src0 + src_stride;
        const pixel* src2 = src1 + src_stride;
        for (int x = 0; x < width; x++)
        {
            // slower than naive bilinear, but matches asm
#define FILTER(a, b, c, d) ((((a + b + 1) >> 1) + ((c + d + 1) >> 1) + 1) >> 1)
            dst0[x] = FILTER(src0[2 * x], src1[2 * x], src0[2 * x + 1], src1[2 * x + 1]);
            dsth[x] = FILTER(src0[2 * x + 1], src1[2 * x + 1], src0[2 * x + 2], src1[2 * x + 2]);
            dstv[x] = FILTER(src1[2 * x], src2[2 * x], src1[2 * x + 1], src2[2 * x + 1]);
            dstc[x] = FILTER(src1[2 * x + 1], src2[2 * x + 1], src1[2 * x + 2], src2[2 * x + 2]);
#undef FILTER
        }

        src0 += src_stride * 2;
        dst0 += dst_stride;
        dsth += dst_stride;
        dstv += dst_stride;
        dstc += dst_stride;
    }
}
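
/* frame_init_lowres_core produces the four half-pel planes of the
 * half-resolution lookahead image: dst0 at integer positions, dsth offset a
 * half pixel horizontally, dstv vertically, and dstc diagonally, each output
 * pixel being the FILTER average of a 2x2 neighborhood of full-res samples. */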
/* structural similarity metric */
void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
{
    for (int z = 0; z < 2; z++)
    {
        uint32_t s1 = 0, s2 = 0, ss = 0, s12 = 0;

        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                int a = pix1[x + y * stride1];
                int b = pix2[x + y * stride2];

                s1 += a;
                s2 += b;
                ss += a * a;
                ss += b * b;
                s12 += a * b;
            }
        }

        sums[z][0] = s1;
        sums[z][1] = s2;
        sums[z][2] = ss;
        sums[z][3] = s12;
        pix1 += 4;
        pix2 += 4;
    }
}
float ssim_end_1(int s1, int s2, int ss, int s12)
{
/* Maximum value for 10-bit is: ss*64 = (2^10-1)^2*16*4*64 = 4286582784, which will overflow in some cases.
 * s1*s1, s2*s2, and s1*s2 also obtain this value for edge cases: ((2^10-1)*16*4)^2 = 4286582784.
 * Maximum value for 9-bit is: ss*64 = (2^9-1)^2*16*4*64 = 1069551616, which will not overflow. */

#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
#if HIGH_BIT_DEPTH
    X265_CHECK(X265_DEPTH == 10, "ssim invalid depth\n");
#define type float
    static const float ssim_c1 = (float)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64);
    static const float ssim_c2 = (float)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63);
#else
    X265_CHECK(X265_DEPTH == 8, "ssim invalid depth\n");
#define type int
    static const int ssim_c1 = (int)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64 + .5);
    static const int ssim_c2 = (int)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63 + .5);
#endif
    type fs1 = (type)s1;
    type fs2 = (type)s2;
    type fss = (type)ss;
    type fs12 = (type)s12;
    type vars = (type)(fss * 64 - fs1 * fs1 - fs2 * fs2);
    type covar = (type)(fs12 * 64 - fs1 * fs2);
    return (float)(2 * fs1 * fs2 + ssim_c1) * (float)(2 * covar + ssim_c2)
           / ((float)(fs1 * fs1 + fs2 * fs2 + ssim_c1) * (float)(vars + ssim_c2));
#undef type
#undef PIXEL_MAX
}
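
/* ssim_end_1 evaluates the usual SSIM term
 *   (2*m1*m2 + C1) * (2*covar + C2) / ((m1^2 + m2^2 + C1) * (var + C2))
 * on unnormalized sums over a 64-sample window; the window-size factors are
 * folded into ssim_c1 and ssim_c2 rather than dividing each statistic
 * through by 64 (the same scheme x264 uses). */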
float ssim_end_4(int sum0[5][4], int sum1[5][4], int width)
{
    float ssim = 0.0;

    for (int i = 0; i < width; i++)
    {
        ssim += ssim_end_1(sum0[i][0] + sum0[i + 1][0] + sum1[i][0] + sum1[i + 1][0],
                           sum0[i][1] + sum0[i + 1][1] + sum1[i][1] + sum1[i + 1][1],
                           sum0[i][2] + sum0[i + 1][2] + sum1[i][2] + sum1[i + 1][2],
                           sum0[i][3] + sum0[i + 1][3] + sum1[i][3] + sum1[i + 1][3]);
    }

    return ssim;
}
template<int size>
uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
{
    uint32_t sum = 0, sqr = 0;

    for (int y = 0; y < size; y++)
    {
        for (int x = 0; x < size; x++)
        {
            sum += pix[x];
            sqr += pix[x] * pix[x];
        }

        pix += i_stride;
    }

    return sum + ((uint64_t)sqr << 32);
}
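
/* The return value packs both statistics in one uint64_t: the pixel sum in
 * the low 32 bits and the sum of squares in the high 32 bits, so the caller
 * can derive the variance as (sqr - sum * sum / n) / n. */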
#if defined(_MSC_VER)
#pragma warning(disable: 4127) // conditional expression is constant
#endif
template<int size>
int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
{
    static pixel zeroBuf[8] /* = { 0 } */;

    if (size)
    {
        int dim = 1 << (size + 2);
        uint32_t totEnergy = 0;
        for (int i = 0; i < dim; i += 8)
        {
            for (int j = 0; j < dim; j += 8)
            {
                /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
                int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride, zeroBuf, 0) -
                                   (sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
                int reconEnergy = sa8d_8x8(recon + i * rstride + j, rstride, zeroBuf, 0) -
                                  (sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);

                totEnergy += abs(sourceEnergy - reconEnergy);
            }
        }
        return totEnergy;
    }
    else
    {
        /* 4x4 is too small for sa8d */
        int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
        int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
        return abs(sourceEnergy - reconEnergy);
    }
}
template<int size>
int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
{
    static int16_t zeroBuf[8] /* = { 0 } */;

    if (size)
    {
        int dim = 1 << (size + 2);
        uint32_t totEnergy = 0;
        for (int i = 0; i < dim; i += 8)
        {
            for (int j = 0; j < dim; j += 8)
            {
                /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
                int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride, zeroBuf, 0) -
                                   (sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
                int reconEnergy = sa8d_8x8(recon + i * rstride + j, rstride, zeroBuf, 0) -
                                  (sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);

                totEnergy += abs(sourceEnergy - reconEnergy);
            }
        }
        return totEnergy;
    }
    else
    {
        /* 4x4 is too small for sa8d */
        int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
        int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
        return abs(sourceEnergy - reconEnergy);
    }
}
template<int bx, int by>
void blockcopy_pp_c(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            a[x] = b[x];

        a += stridea;
        b += strideb;
    }
}
template<int bx, int by>
void blockcopy_ss_c(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            a[x] = b[x];

        a += stridea;
        b += strideb;
    }
}
template<int bx, int by>
void blockcopy_sp_c(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
        {
            X265_CHECK((b[x] >= 0) && (b[x] <= ((1 << X265_DEPTH) - 1)), "blockcopy pixel size fail\n");
            a[x] = (pixel)b[x];
        }

        a += stridea;
        b += strideb;
    }
}
template<int bx, int by>
void blockcopy_ps_c(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            a[x] = (int16_t)b[x];

        a += stridea;
        b += strideb;
    }
}
template<int bx, int by>
void pixel_sub_ps_c(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            a[x] = (int16_t)(b0[x] - b1[x]);

        b0 += sstride0;
        b1 += sstride1;
        a += dstride;
    }
}
template<int bx, int by>
void pixel_add_ps_c(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            a[x] = Clip(b0[x] + b1[x]);

        b0 += sstride0;
        b1 += sstride1;
        a += dstride;
    }
}
template<int bx, int by>
void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
{
    int shiftNum, offset;

    shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
    offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;

    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x += 2)
        {
            dst[x + 0] = Clip((src0[x + 0] + src1[x + 0] + offset) >> shiftNum);
            dst[x + 1] = Clip((src0[x + 1] + src1[x + 1] + offset) >> shiftNum);
        }

        src0 += src0Stride;
        src1 += src1Stride;
        dst += dstStride;
    }
}
void planecopy_cp_c(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
{
    for (int r = 0; r < height; r++)
    {
        for (int c = 0; c < width; c++)
            dst[c] = ((pixel)src[c]) << shift;

        dst += dstStride;
        src += srcStride;
    }
}
void planecopy_sp_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
{
    for (int r = 0; r < height; r++)
    {
        for (int c = 0; c < width; c++)
            dst[c] = (pixel)((src[c] >> shift) & mask);

        dst += dstStride;
        src += srcStride;
    }
}
/* Estimate the total amount of influence on future quality that could be had if we
 * were to improve the reference samples used to inter predict any given CU. */
void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
                             const int32_t* invQscales, const double* fpsFactor, int len)
{
    double fps = *fpsFactor / 256;

    for (int i = 0; i < len; i++)
    {
        double intraCost = intraCosts[i] * invQscales[i];
        double propagateAmount = (double)propagateIn[i] + intraCost * fps;
        double propagateNum = (double)intraCosts[i] - (interCosts[i] & ((1 << 14) - 1));
        double propagateDenom = (double)intraCosts[i];
        dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5);
    }
}
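
/* In effect each CU propagates
 *   (propagateIn + intraCost * fps) * (intraCost - interCost) / intraCost
 * to its references; the & ((1 << 14) - 1) mask assumes the low 14 bits of
 * interCosts hold the actual cost, with reference information packed in the
 * upper bits. */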
}  // end anonymous namespace

namespace x265 {
// x265 private namespace

/* Extend the edges of a picture so that it may safely be used for motion
 * compensation. This function assumes the picture is stored in a buffer with
 * sufficient padding for the X and Y margins */
void extendPicBorder(pixel* pic, intptr_t stride, int width, int height, int marginX, int marginY)
{
    /* extend left and right margins */
    primitives.extendRowBorder(pic, stride, width, height, marginX);

    /* copy top row to create above margin */
    pixel* top = pic - marginX;
    for (int y = 0; y < marginY; y++)
        memcpy(top - (y + 1) * stride, top, stride * sizeof(pixel));

    /* copy bottom row to create below margin */
    pixel* bot = pic - marginX + (height - 1) * stride;
    for (int y = 0; y < marginY; y++)
        memcpy(bot + (y + 1) * stride, bot, stride * sizeof(pixel));
}
/* Initialize entries for pixel functions defined in this file */
void Setup_C_PixelPrimitives(EncoderPrimitives &p)
{
    SET_FUNC_PRIMITIVE_TABLE_C2(sad)
    SET_FUNC_PRIMITIVE_TABLE_C2(sad_x3)
    SET_FUNC_PRIMITIVE_TABLE_C2(sad_x4)
    SET_FUNC_PRIMITIVE_TABLE_C2(pixelavg_pp)
    p.satd[LUMA_4x4] = satd_4x4;
    p.satd[LUMA_8x8] = satd8<8, 8>;
    p.satd[LUMA_8x4] = satd_8x4;
    p.satd[LUMA_4x8] = satd4<4, 8>;
    p.satd[LUMA_16x16] = satd8<16, 16>;
    p.satd[LUMA_16x8] = satd8<16, 8>;
    p.satd[LUMA_8x16] = satd8<8, 16>;
    p.satd[LUMA_16x12] = satd8<16, 12>;
    p.satd[LUMA_12x16] = satd4<12, 16>;
    p.satd[LUMA_16x4] = satd8<16, 4>;
    p.satd[LUMA_4x16] = satd4<4, 16>;
    p.satd[LUMA_32x32] = satd8<32, 32>;
    p.satd[LUMA_32x16] = satd8<32, 16>;
    p.satd[LUMA_16x32] = satd8<16, 32>;
    p.satd[LUMA_32x24] = satd8<32, 24>;
    p.satd[LUMA_24x32] = satd8<24, 32>;
    p.satd[LUMA_32x8] = satd8<32, 8>;
    p.satd[LUMA_8x32] = satd8<8, 32>;
    p.satd[LUMA_64x64] = satd8<64, 64>;
    p.satd[LUMA_64x32] = satd8<64, 32>;
    p.satd[LUMA_32x64] = satd8<32, 64>;
    p.satd[LUMA_64x48] = satd8<64, 48>;
    p.satd[LUMA_48x64] = satd8<48, 64>;
    p.satd[LUMA_64x16] = satd8<64, 16>;
    p.satd[LUMA_16x64] = satd8<16, 64>;
    p.chroma[X265_CSP_I420].satd[CHROMA_2x2] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_4x4] = satd_4x4;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x8] = satd8<8, 8>;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x16] = satd8<16, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_32x32] = satd8<32, 32>;

    p.chroma[X265_CSP_I420].satd[CHROMA_4x2] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_2x4] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x4] = satd_8x4;
    p.chroma[X265_CSP_I420].satd[CHROMA_4x8] = satd4<4, 8>;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x8] = satd8<16, 8>;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x16] = satd8<8, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_32x16] = satd8<32, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x32] = satd8<16, 32>;

    p.chroma[X265_CSP_I420].satd[CHROMA_8x6] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_6x8] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x2] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_2x8] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x12] = satd4<16, 12>;
    p.chroma[X265_CSP_I420].satd[CHROMA_12x16] = satd4<12, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x4] = satd4<16, 4>;
    p.chroma[X265_CSP_I420].satd[CHROMA_4x16] = satd4<4, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_32x24] = satd8<32, 24>;
    p.chroma[X265_CSP_I420].satd[CHROMA_24x32] = satd8<24, 32>;
    p.chroma[X265_CSP_I420].satd[CHROMA_32x8] = satd8<32, 8>;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x32] = satd8<8, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_2x4] = NULL;
    p.chroma[X265_CSP_I422].satd[CHROMA422_4x8] = satd4<4, 8>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x16] = satd8<8, 16>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x32] = satd8<16, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_32x64] = satd8<32, 64>;

    p.chroma[X265_CSP_I422].satd[CHROMA422_4x4] = satd_4x4;
    p.chroma[X265_CSP_I422].satd[CHROMA422_2x8] = NULL;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x8] = satd8<8, 8>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_4x16] = satd4<4, 16>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x16] = satd8<16, 16>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x32] = satd8<8, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_32x32] = satd8<32, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x64] = satd8<16, 64>;

    p.chroma[X265_CSP_I422].satd[CHROMA422_8x12] = satd4<8, 12>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_6x16] = NULL;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x4] = satd4<8, 4>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_2x16] = NULL;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x24] = satd8<16, 24>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_12x32] = satd4<12, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x8] = satd8<16, 8>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_4x32] = satd4<4, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_32x48] = satd8<32, 48>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_24x64] = satd8<24, 64>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_32x16] = satd8<32, 16>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x64] = satd8<8, 64>;
#define CHROMA_420(W, H) \
    p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H] = addAvg<W, H>; \
    p.chroma[X265_CSP_I420].copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.chroma[X265_CSP_I420].copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.chroma[X265_CSP_I420].copy_ss[CHROMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define CHROMA_422(W, H) \
    p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = addAvg<W, H>; \
    p.chroma[X265_CSP_I422].copy_pp[CHROMA422_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.chroma[X265_CSP_I422].copy_ps[CHROMA422_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.chroma[X265_CSP_I422].copy_ss[CHROMA422_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define CHROMA_444(W, H) \
    p.chroma[X265_CSP_I444].satd[LUMA_ ## W ## x ## H] = p.satd[LUMA_ ## W ## x ## H]; \
    p.chroma[X265_CSP_I444].addAvg[LUMA_ ## W ## x ## H] = addAvg<W, H>; \
    p.chroma[X265_CSP_I444].copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.chroma[X265_CSP_I444].copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.chroma[X265_CSP_I444].copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.chroma[X265_CSP_I444].copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define LUMA(W, H) \
    p.luma_addAvg[LUMA_ ## W ## x ## H] = addAvg<W, H>; \
    p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.luma_copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.luma_copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define LUMA_PIXELSUB(W, H) \
    p.luma_sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.luma_add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;

#define CHROMA_PIXELSUB_420(W, H) \
    p.chroma[X265_CSP_I420].sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.chroma[X265_CSP_I420].add_ps[CHROMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;

#define CHROMA_PIXELSUB_422(W, H) \
    p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## W ## x ## H] = pixel_add_ps_c<W, H>;

#define CHROMA_PIXELSUB_444(W, H) \
    p.chroma[X265_CSP_I444].sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.chroma[X265_CSP_I444].add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
    LUMA_PIXELSUB(4, 4);
    LUMA_PIXELSUB(8, 8);
    LUMA_PIXELSUB(16, 16);
    LUMA_PIXELSUB(32, 32);
    LUMA_PIXELSUB(64, 64);

    CHROMA_PIXELSUB_420(4, 4)
    CHROMA_PIXELSUB_420(8, 8)
    CHROMA_PIXELSUB_420(16, 16)
    CHROMA_PIXELSUB_420(32, 32)

    CHROMA_PIXELSUB_422(4, 8)
    CHROMA_PIXELSUB_422(8, 16)
    CHROMA_PIXELSUB_422(16, 32)
    CHROMA_PIXELSUB_422(32, 64)

    CHROMA_PIXELSUB_444(8, 8)
    CHROMA_PIXELSUB_444(16, 16)
    CHROMA_PIXELSUB_444(32, 32)
    CHROMA_PIXELSUB_444(64, 64)
    SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixel, pixel)
    SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, int16_t, pixel)
    SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, int16_t, int16_t)
    p.blockfill_s[BLOCK_4x4] = blockfil_s_c<4>;
    p.blockfill_s[BLOCK_8x8] = blockfil_s_c<8>;
    p.blockfill_s[BLOCK_16x16] = blockfil_s_c<16>;
    p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>;
    p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>;
    p.cpy2Dto1D_shl[BLOCK_4x4] = cpy2Dto1D_shl<4>;
    p.cpy2Dto1D_shl[BLOCK_8x8] = cpy2Dto1D_shl<8>;
    p.cpy2Dto1D_shl[BLOCK_16x16] = cpy2Dto1D_shl<16>;
    p.cpy2Dto1D_shl[BLOCK_32x32] = cpy2Dto1D_shl<32>;
    p.cpy2Dto1D_shr[BLOCK_4x4] = cpy2Dto1D_shr<4>;
    p.cpy2Dto1D_shr[BLOCK_8x8] = cpy2Dto1D_shr<8>;
    p.cpy2Dto1D_shr[BLOCK_16x16] = cpy2Dto1D_shr<16>;
    p.cpy2Dto1D_shr[BLOCK_32x32] = cpy2Dto1D_shr<32>;
    p.cpy1Dto2D_shl[BLOCK_4x4] = cpy1Dto2D_shl<4>;
    p.cpy1Dto2D_shl[BLOCK_8x8] = cpy1Dto2D_shl<8>;
    p.cpy1Dto2D_shl[BLOCK_16x16] = cpy1Dto2D_shl<16>;
    p.cpy1Dto2D_shl[BLOCK_32x32] = cpy1Dto2D_shl<32>;
    p.cpy1Dto2D_shr[BLOCK_4x4] = cpy1Dto2D_shr<4>;
    p.cpy1Dto2D_shr[BLOCK_8x8] = cpy1Dto2D_shr<8>;
    p.cpy1Dto2D_shr[BLOCK_16x16] = cpy1Dto2D_shr<16>;
    p.cpy1Dto2D_shr[BLOCK_32x32] = cpy1Dto2D_shr<32>;
    p.sa8d[BLOCK_4x4] = satd_4x4;
    p.sa8d[BLOCK_8x8] = sa8d_8x8;
    p.sa8d[BLOCK_16x16] = sa8d_16x16;
    p.sa8d[BLOCK_32x32] = sa8d16<32, 32>;
    p.sa8d[BLOCK_64x64] = sa8d16<64, 64>;
    p.psy_cost_pp[BLOCK_4x4] = psyCost_pp<BLOCK_4x4>;
    p.psy_cost_pp[BLOCK_8x8] = psyCost_pp<BLOCK_8x8>;
    p.psy_cost_pp[BLOCK_16x16] = psyCost_pp<BLOCK_16x16>;
    p.psy_cost_pp[BLOCK_32x32] = psyCost_pp<BLOCK_32x32>;
    p.psy_cost_pp[BLOCK_64x64] = psyCost_pp<BLOCK_64x64>;

    p.psy_cost_ss[BLOCK_4x4] = psyCost_ss<BLOCK_4x4>;
    p.psy_cost_ss[BLOCK_8x8] = psyCost_ss<BLOCK_8x8>;
    p.psy_cost_ss[BLOCK_16x16] = psyCost_ss<BLOCK_16x16>;
    p.psy_cost_ss[BLOCK_32x32] = psyCost_ss<BLOCK_32x32>;
    p.psy_cost_ss[BLOCK_64x64] = psyCost_ss<BLOCK_64x64>;
    p.sa8d_inter[LUMA_4x4] = satd_4x4;
    p.sa8d_inter[LUMA_8x8] = sa8d_8x8;
    p.sa8d_inter[LUMA_8x4] = satd_8x4;
    p.sa8d_inter[LUMA_4x8] = satd4<4, 8>;
    p.sa8d_inter[LUMA_16x16] = sa8d_16x16;
    p.sa8d_inter[LUMA_16x8] = sa8d8<16, 8>;
    p.sa8d_inter[LUMA_8x16] = sa8d8<8, 16>;
    p.sa8d_inter[LUMA_16x12] = satd8<16, 12>;
    p.sa8d_inter[LUMA_12x16] = satd4<12, 16>;
    p.sa8d_inter[LUMA_4x16] = satd4<4, 16>;
    p.sa8d_inter[LUMA_16x4] = satd8<16, 4>;
    p.sa8d_inter[LUMA_32x32] = sa8d16<32, 32>;
    p.sa8d_inter[LUMA_32x16] = sa8d16<32, 16>;
    p.sa8d_inter[LUMA_16x32] = sa8d16<16, 32>;
    p.sa8d_inter[LUMA_32x24] = sa8d8<32, 24>;
    p.sa8d_inter[LUMA_24x32] = sa8d8<24, 32>;
    p.sa8d_inter[LUMA_32x8] = sa8d8<32, 8>;
    p.sa8d_inter[LUMA_8x32] = sa8d8<8, 32>;
    p.sa8d_inter[LUMA_64x64] = sa8d16<64, 64>;
    p.sa8d_inter[LUMA_64x32] = sa8d16<64, 32>;
    p.sa8d_inter[LUMA_32x64] = sa8d16<32, 64>;
    p.sa8d_inter[LUMA_64x48] = sa8d16<64, 48>;
    p.sa8d_inter[LUMA_48x64] = sa8d16<48, 64>;
    p.sa8d_inter[LUMA_64x16] = sa8d16<64, 16>;
    p.sa8d_inter[LUMA_16x64] = sa8d16<16, 64>;
    p.calcresidual[BLOCK_4x4] = getResidual<4>;
    p.calcresidual[BLOCK_8x8] = getResidual<8>;
    p.calcresidual[BLOCK_16x16] = getResidual<16>;
    p.calcresidual[BLOCK_32x32] = getResidual<32>;
    p.calcresidual[BLOCK_64x64] = NULL;
    p.transpose[BLOCK_4x4] = transpose<4>;
    p.transpose[BLOCK_8x8] = transpose<8>;
    p.transpose[BLOCK_16x16] = transpose<16>;
    p.transpose[BLOCK_32x32] = transpose<32>;
    p.transpose[BLOCK_64x64] = transpose<64>;
    p.ssd_s[BLOCK_4x4] = pixel_ssd_s_c<4>;
    p.ssd_s[BLOCK_8x8] = pixel_ssd_s_c<8>;
    p.ssd_s[BLOCK_16x16] = pixel_ssd_s_c<16>;
    p.ssd_s[BLOCK_32x32] = pixel_ssd_s_c<32>;
    p.weight_pp = weight_pp_c;
    p.weight_sp = weight_sp_c;

    p.scale1D_128to64 = scale1D_128to64;
    p.scale2D_64to32 = scale2D_64to32;
    p.frameInitLowres = frame_init_lowres_core;
    p.ssim_4x4x2_core = ssim_4x4x2_core;
    p.ssim_end_4 = ssim_end_4;

    p.var[BLOCK_8x8] = pixel_var<8>;
    p.var[BLOCK_16x16] = pixel_var<16>;
    p.var[BLOCK_32x32] = pixel_var<32>;
    p.var[BLOCK_64x64] = pixel_var<64>;
    p.planecopy_cp = planecopy_cp_c;
    p.planecopy_sp = planecopy_sp_c;
    p.propagateCost = estimateCUPropagateCost;
}
}