/*****************************************************************************
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
 *          Mandar Gurav <mandar@multicorewareinc.com>
 *          Mahesh Pittala <mahesh@multicorewareinc.com>
 *          Min Chen <min.chen@multicorewareinc.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

#include "common.h"
#include "primitives.h"
#include "x265.h"

#include <cstdlib> // abs()

using namespace x265;

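/* Populate every luma partition entry of a primitives table with the matching
 * template instantiation */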
#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, DATA_TYPE1, DATA_TYPE2) \
    p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX_DEF<4, 4, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x8] = FUNC_PREFIX_DEF<8, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x4] = FUNC_PREFIX_DEF<8, 4, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_4x8] = FUNC_PREFIX_DEF<4, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x8] = FUNC_PREFIX_DEF<16, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x16] = FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x4] = FUNC_PREFIX_DEF<16, 4, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_4x16] = FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x8] = FUNC_PREFIX_DEF<32, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x32] = FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;

#define SET_FUNC_PRIMITIVE_TABLE_C2(FUNC_PREFIX) \
    p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX<4, 4>; \
    p.FUNC_PREFIX[LUMA_8x8] = FUNC_PREFIX<8, 8>; \
    p.FUNC_PREFIX[LUMA_8x4] = FUNC_PREFIX<8, 4>; \
    p.FUNC_PREFIX[LUMA_4x8] = FUNC_PREFIX<4, 8>; \
    p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX<16, 16>; \
    p.FUNC_PREFIX[LUMA_16x8] = FUNC_PREFIX<16, 8>; \
    p.FUNC_PREFIX[LUMA_8x16] = FUNC_PREFIX<8, 16>; \
    p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX<16, 12>; \
    p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX<12, 16>; \
    p.FUNC_PREFIX[LUMA_16x4] = FUNC_PREFIX<16, 4>; \
    p.FUNC_PREFIX[LUMA_4x16] = FUNC_PREFIX<4, 16>; \
    p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX<32, 32>; \
    p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX<32, 16>; \
    p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX<16, 32>; \
    p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX<32, 24>; \
    p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX<24, 32>; \
    p.FUNC_PREFIX[LUMA_32x8] = FUNC_PREFIX<32, 8>; \
    p.FUNC_PREFIX[LUMA_8x32] = FUNC_PREFIX<8, 32>; \
    p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX<64, 64>; \
    p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX<64, 32>; \
    p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX<32, 64>; \
    p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX<64, 48>; \
    p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX<48, 64>; \
    p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX<64, 16>; \
    p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX<16, 64>;

namespace {
// place functions in anonymous namespace (file static)

template<int lx, int ly>
int sad(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int sum = 0;

    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
            sum += abs(pix1[x] - pix2[x]);

        pix1 += stride_pix1;
        pix2 += stride_pix2;
    }

    return sum;
}

template<int lx, int ly>
int sad(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
{
    int sum = 0;

    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
            sum += abs(pix1[x] - pix2[x]);

        pix1 += stride_pix1;
        pix2 += stride_pix2;
    }

    return sum;
}

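/* Multi-candidate SAD: pix1 is the encoder's fenc block (fixed FENC_STRIDE)
 * and pix2..pix4 are candidate reference blocks advancing by frefstride. The
 * interface exists so that optimized kernels can amortize the loads of the
 * fenc rows across several motion candidates. */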
template<int lx, int ly>
void sad_x3(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
{
    res[0] = 0;
    res[1] = 0;
    res[2] = 0;
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            res[0] += abs(pix1[x] - pix2[x]);
            res[1] += abs(pix1[x] - pix3[x]);
            res[2] += abs(pix1[x] - pix4[x]);
        }

        pix1 += FENC_STRIDE;
        pix2 += frefstride;
        pix3 += frefstride;
        pix4 += frefstride;
    }
}

template<int lx, int ly>
void sad_x4(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
{
    res[0] = 0;
    res[1] = 0;
    res[2] = 0;
    res[3] = 0;
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            res[0] += abs(pix1[x] - pix2[x]);
            res[1] += abs(pix1[x] - pix3[x]);
            res[2] += abs(pix1[x] - pix4[x]);
            res[3] += abs(pix1[x] - pix5[x]);
        }

        pix1 += FENC_STRIDE;
        pix2 += frefstride;
        pix3 += frefstride;
        pix4 += frefstride;
        pix5 += frefstride;
    }
}

template<int lx, int ly, class T1, class T2>
int sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
{
    int sum = 0;
    int tmp;

    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            tmp = pix1[x] - pix2[x];
            sum += (tmp * tmp);
        }

        pix1 += stride_pix1;
        pix2 += stride_pix2;
    }

    return sum;
}

#define BITS_PER_SUM (8 * sizeof(sum_t))

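// 4-point butterfly: writes the (unnormalized) 4-point Hadamard transform of
// s0..s3 into d0..d3; applied across rows and then columns it yields the 2D
// transforms used by satd/sa8d below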
#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) { \
        sum2_t t0 = s0 + s1; \
        sum2_t t1 = s0 - s1; \
        sum2_t t2 = s2 + s3; \
        sum2_t t3 = s2 - s3; \
        d0 = t0 + t2; \
        d2 = t0 - t2; \
        d1 = t1 + t3; \
        d3 = t1 - t3; \
}

// in: a pseudo-simd number of the form x+(y<<16)
// return: abs(x)+(abs(y)<<16)
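// e.g. in 8-bit builds (sum2_t == uint32_t), abs2(0xFFFD0002) == 0x00030002:
// the packed halves (x = 2, y = -3) become (|x| = 2, |y| = 3)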
inline sum2_t abs2(sum2_t a)
{
    sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1);

    return (a + s) ^ s;
}

int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    sum2_t tmp[4][2];
    sum2_t a0, a1, a2, a3, b0, b1;
    sum2_t sum = 0;

    for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
        tmp[i][0] = b0 + b1;
        tmp[i][1] = b0 - b1;
    }

    for (int i = 0; i < 2; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
        sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM);
    }

    return (int)(sum >> 1);
}

int satd_4x4(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
{
    ssum2_t tmp[4][2];
    ssum2_t a0, a1, a2, a3, b0, b1;
    ssum2_t sum = 0;

    for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
        tmp[i][0] = b0 + b1;
        tmp[i][1] = b0 - b1;
    }

    for (int i = 0; i < 2; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
        sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM);
    }

    return (int)(sum >> 1);
}

// x264's SWAR version of satd 8x4, which performs two 4x4 SATDs at once
int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    sum2_t tmp[4][4];
    sum2_t a0, a1, a2, a3;
    sum2_t sum = 0;

    for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
    {
        a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);
        a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);
        a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);
        a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);
        HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
    }

    for (int i = 0; i < 4; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }

    return (((sum_t)sum) + (sum >> BITS_PER_SUM)) >> 1;
}

template<int w, int h>
// calculate satd in blocks of 4x4
int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;

    for (int row = 0; row < h; row += 4)
        for (int col = 0; col < w; col += 4)
            satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
                             pix2 + row * stride_pix2 + col, stride_pix2);

    return satd;
}

template<int w, int h>
// calculate satd in blocks of 8x4
int satd8(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;

    for (int row = 0; row < h; row += 4)
        for (int col = 0; col < w; col += 8)
            satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
                             pix2 + row * stride_pix2 + col, stride_pix2);

    return satd;
}

inline int _sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    sum2_t tmp[8][4];
    sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
    sum2_t sum = 0;

    for (int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2)
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
        a4 = pix1[4] - pix2[4];
        a5 = pix1[5] - pix2[5];
        b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM);
        a6 = pix1[6] - pix2[6];
        a7 = pix1[7] - pix2[7];
        b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM);
        HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3);
    }

    for (int i = 0; i < 4; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]);
        b0 = abs2(a0 + a4) + abs2(a0 - a4);
        b0 += abs2(a1 + a5) + abs2(a1 - a5);
        b0 += abs2(a2 + a6) + abs2(a2 - a6);
        b0 += abs2(a3 + a7) + abs2(a3 - a7);
        sum += (sum_t)b0 + (b0 >> BITS_PER_SUM);
    }

    return (int)sum;
}

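/* The raw 8x8 Hadamard sum is scaled up relative to the pixel domain, so the
 * public entry points round and divide it by 4 to keep sa8d on roughly the
 * same scale as satd */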
int sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
}

inline int _sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2)
{
    ssum2_t tmp[8][4];
    ssum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
    ssum2_t sum = 0;

    for (int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2)
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
        a4 = pix1[4] - pix2[4];
        a5 = pix1[5] - pix2[5];
        b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM);
        a6 = pix1[6] - pix2[6];
        a7 = pix1[7] - pix2[7];
        b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM);
        HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3);
    }

    for (int i = 0; i < 4; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]);
        b0 = abs2(a0 + a4) + abs2(a0 - a4);
        b0 += abs2(a1 + a5) + abs2(a1 - a5);
        b0 += abs2(a2 + a6) + abs2(a2 - a6);
        b0 += abs2(a3 + a7) + abs2(a3 - a7);
        sum += (sum_t)b0 + (b0 >> BITS_PER_SUM);
    }

    return (int)sum;
}

int sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2)
{
    return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
}

int sa8d_16x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2)
        + _sa8d_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2)
        + _sa8d_8x8(pix1 + 8 * i_pix1, i_pix1, pix2 + 8 * i_pix2, i_pix2)
        + _sa8d_8x8(pix1 + 8 + 8 * i_pix1, i_pix1, pix2 + 8 + 8 * i_pix2, i_pix2);

    // This matches x264 sa8d_16x16, but is slightly different from HM's behavior because
    // this version only rounds once at the end
    return (sum + 2) >> 2;
}

template<int w, int h>
// Calculate sa8d in blocks of 8x8
int sa8d8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    int cost = 0;

    for (int y = 0; y < h; y += 8)
        for (int x = 0; x < w; x += 8)
            cost += sa8d_8x8(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);

    return cost;
}

template<int w, int h>
// Calculate sa8d in blocks of 16x16
int sa8d16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    int cost = 0;

    for (int y = 0; y < h; y += 16)
        for (int x = 0; x < w; x += 16)
            cost += sa8d_16x16(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);

    return cost;
}

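/* Sum of squared values of a signed 16-bit (residual) block, i.e. its SSD
 * against an all-zero block */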
template<int size>
int pixel_ssd_s_c(const int16_t* a, intptr_t dstride)
{
    int sum = 0;
    for (int y = 0; y < size; y++)
    {
        for (int x = 0; x < size; x++)
            sum += a[x] * a[x];

        a += dstride;
    }
    return sum;
}

template<int size>
void blockfil_s_c(int16_t* dst, intptr_t dstride, int16_t val)
{
    for (int y = 0; y < size; y++)
        for (int x = 0; x < size; x++)
            dst[y * dstride + x] = val;
}

template<int size>
void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
{
    X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
    X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
    X265_CHECK(shift >= 0, "invalid shift\n");

    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
            dst[j] = src[j] << shift;

        src += srcStride;
        dst += size;
    }
}

template<int size>
void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
{
    X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
    X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
    X265_CHECK(shift > 0, "invalid shift\n");

    int16_t round = 1 << (shift - 1);
    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
            dst[j] = (src[j] + round) >> shift;

        src += srcStride;
        dst += size;
    }
}

template<int size>
void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
{
    X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
    X265_CHECK(shift >= 0, "invalid shift\n");

    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
            dst[j] = src[j] << shift;

        src += size;
        dst += dstStride;
    }
}

template<int size>
void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
{
    X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
    X265_CHECK(shift > 0, "invalid shift\n");

    int16_t round = 1 << (shift - 1);
    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
            dst[j] = (src[j] + round) >> shift;

        src += size;
        dst += dstStride;
    }
}

template<int blockSize>
void getResidual(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)
{
    for (int y = 0; y < blockSize; y++)
    {
        for (int x = 0; x < blockSize; x++)
            residual[x] = static_cast<int16_t>(fenc[x]) - static_cast<int16_t>(pred[x]);

        fenc += stride;
        residual += stride;
        pred += stride;
    }
}

template<int blockSize>
void transpose(pixel* dst, const pixel* src, intptr_t stride)
{
    for (int k = 0; k < blockSize; k++)
        for (int l = 0; l < blockSize; l++)
            dst[k * blockSize + l] = src[l * stride + k];
}

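/* Explicit weighted prediction on a plane of 16-bit intermediate samples:
 * dst = clip(((w0 * (src + IF_INTERNAL_OFFS) + round) >> shift) + offset);
 * adding IF_INTERNAL_OFFS restores the bias removed when the samples were
 * converted to the 16-bit intermediate representation */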
void weight_sp_c(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
{
    for (int y = 0; y < height; y++)
    {
        // note: width can be odd
        for (int x = 0; x < width; x++)
            dst[x] = (pixel)Clip3(0, ((1 << X265_DEPTH) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);

        src += srcStride;
        dst += dstStride;
    }
}

void weight_pp_c(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
{
    X265_CHECK(!(width & 15), "weightp alignment error\n");
    X265_CHECK(!((w0 << 6) > 32767), "w0 using more than 16 bits, asm output will mismatch\n");
    X265_CHECK(!(round > 32767), "round using more than 16 bits, asm output will mismatch\n");

    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
        {
            // simulate the pixel-to-short conversion
            int16_t val = src[x] << (IF_INTERNAL_PREC - X265_DEPTH);
            dst[x] = (pixel)Clip3(0, ((1 << X265_DEPTH) - 1), ((w0 * (val) + round) >> shift) + offset);
        }

        src += stride;
        dst += stride;
    }
}

template<int lx, int ly>
void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
{
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
            dst[x] = (src0[x] + src1[x] + 1) >> 1;

        src0 += sstride0;
        src1 += sstride1;
        dst += dstride;
    }
}

void scale1D_128to64(pixel* dst, const pixel* src, intptr_t /*stride*/)
{
    for (int x = 0; x < 128; x += 2)
    {
        pixel pix0 = src[(x + 0)];
        pixel pix1 = src[(x + 1)];
        int sum = pix0 + pix1;

        dst[x >> 1] = (pixel)((sum + 1) >> 1);
    }
}

void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
{
    for (uint32_t y = 0; y < 64; y += 2)
    {
        for (uint32_t x = 0; x < 64; x += 2)
        {
            pixel pix0 = src[(y + 0) * stride + (x + 0)];
            pixel pix1 = src[(y + 0) * stride + (x + 1)];
            pixel pix2 = src[(y + 1) * stride + (x + 0)];
            pixel pix3 = src[(y + 1) * stride + (x + 1)];
            int sum = pix0 + pix1 + pix2 + pix3;

            dst[y / 2 * 32 + x / 2] = (pixel)((sum + 2) >> 2);
        }
    }
}

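/* Build the four half-resolution planes used by the lookahead: each output
 * pixel is a 2x2 box average, and the four planes (dst0/dsth/dstv/dstc) are
 * phase-shifted by a half pixel horizontally and/or vertically so lowres
 * motion search can address half-pel positions */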
void frame_init_lowres_core(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc,
                            intptr_t src_stride, intptr_t dst_stride, int width, int height)
{
    for (int y = 0; y < height; y++)
    {
        const pixel* src1 = src0 + src_stride;
        const pixel* src2 = src1 + src_stride;
        for (int x = 0; x < width; x++)
        {
            // slower than naive bilinear, but matches asm
#define FILTER(a, b, c, d) ((((a + b + 1) >> 1) + ((c + d + 1) >> 1) + 1) >> 1)
            dst0[x] = FILTER(src0[2 * x], src1[2 * x], src0[2 * x + 1], src1[2 * x + 1]);
            dsth[x] = FILTER(src0[2 * x + 1], src1[2 * x + 1], src0[2 * x + 2], src1[2 * x + 2]);
            dstv[x] = FILTER(src1[2 * x], src2[2 * x], src1[2 * x + 1], src2[2 * x + 1]);
            dstc[x] = FILTER(src1[2 * x + 1], src2[2 * x + 1], src1[2 * x + 2], src2[2 * x + 2]);
#undef FILTER
        }

        src0 += src_stride * 2;
        dst0 += dst_stride;
        dsth += dst_stride;
        dstv += dst_stride;
        dstc += dst_stride;
    }
}

/* structural similarity metric */
void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
{
    for (int z = 0; z < 2; z++)
    {
        uint32_t s1 = 0, s2 = 0, ss = 0, s12 = 0;
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                int a = pix1[x + y * stride1];
                int b = pix2[x + y * stride2];
                s1 += a;
                s2 += b;
                ss += a * a;
                ss += b * b;
                s12 += a * b;
            }
        }

        sums[z][0] = s1;
        sums[z][1] = s2;
        sums[z][2] = ss;
        sums[z][3] = s12;
        pix1 += 4;
        pix2 += 4;
    }
}

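/* Combine the sums of one 8x8 window (four neighbouring 4x4 blocks) into its
 * SSIM score:
 *   ssim = (2*u1*u2 + C1) * (2*cov + C2) / ((u1^2 + u2^2 + C1) * (var1 + var2 + C2))
 * evaluated on 64x-scaled sums, which is why the constants below carry
 * factors of 64 */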
float ssim_end_1(int s1, int s2, int ss, int s12)
{
    /* Maximum value for 10-bit is: ss*64 = (2^10-1)^2*16*4*64 = 4286582784, which will overflow in some cases.
     * s1*s1, s2*s2, and s1*s2 also obtain this value for edge cases: ((2^10-1)*16*4)^2 = 4286582784.
     * Maximum value for 9-bit is: ss*64 = (2^9-1)^2*16*4*64 = 1069551616, which will not overflow. */

#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
#if HIGH_BIT_DEPTH
    X265_CHECK(X265_DEPTH == 10, "ssim invalid depth\n");
#define type float
    static const float ssim_c1 = (float)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64);
    static const float ssim_c2 = (float)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63);
#else
    X265_CHECK(X265_DEPTH == 8, "ssim invalid depth\n");
#define type int
    static const int ssim_c1 = (int)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64 + .5);
    static const int ssim_c2 = (int)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63 + .5);
#endif
    type fs1 = (type)s1;
    type fs2 = (type)s2;
    type fss = (type)ss;
    type fs12 = (type)s12;
    type vars = (type)(fss * 64 - fs1 * fs1 - fs2 * fs2);
    type covar = (type)(fs12 * 64 - fs1 * fs2);
    return (float)(2 * fs1 * fs2 + ssim_c1) * (float)(2 * covar + ssim_c2)
           / ((float)(fs1 * fs1 + fs2 * fs2 + ssim_c1) * (float)(vars + ssim_c2));
#undef type
#undef PIXEL_MAX
}

float ssim_end_4(int sum0[5][4], int sum1[5][4], int width)
{
    float ssim = 0.0;

    for (int i = 0; i < width; i++)
    {
        ssim += ssim_end_1(sum0[i][0] + sum0[i + 1][0] + sum1[i][0] + sum1[i + 1][0],
                           sum0[i][1] + sum0[i + 1][1] + sum1[i][1] + sum1[i + 1][1],
                           sum0[i][2] + sum0[i + 1][2] + sum1[i][2] + sum1[i + 1][2],
                           sum0[i][3] + sum0[i + 1][3] + sum1[i][3] + sum1[i + 1][3]);
    }

    return ssim;
}

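/* Returns the block's pixel sum in the low 32 bits and its sum of squared
 * pixels in the high 32 bits; the caller unpacks these to derive the variance */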
template<int size>
uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
{
    uint32_t sum = 0, sqr = 0;

    for (int y = 0; y < size; y++)
    {
        for (int x = 0; x < size; x++)
        {
            sum += pix[x];
            sqr += pix[x] * pix[x];
        }

        pix += i_stride;
    }

    return sum + ((uint64_t)sqr << 32);
}

#if defined(_MSC_VER)
#pragma warning(disable: 4127) // conditional expression is constant
#endif

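/* 'size' is a BLOCK_ enum value (BLOCK_4x4 == 0), so dim = 1 << (size + 2) is
 * the block width and the zero case falls through to the 4x4 path below */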
template<int size>
int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
{
    static pixel zeroBuf[8] /* = { 0 } */;

    if (size)
    {
        int dim = 1 << (size + 2);
        uint32_t totEnergy = 0;
        for (int i = 0; i < dim; i += 8)
        {
            for (int j = 0; j < dim; j += 8)
            {
                /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
                int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride, zeroBuf, 0) -
                                   (sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
                int reconEnergy = sa8d_8x8(recon + i * rstride + j, rstride, zeroBuf, 0) -
                                  (sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);

                totEnergy += abs(sourceEnergy - reconEnergy);
            }
        }
        return totEnergy;
    }
    else
    {
        /* 4x4 is too small for sa8d */
        int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
        int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
        return abs(sourceEnergy - reconEnergy);
    }
}

template<int size>
int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
{
    static int16_t zeroBuf[8] /* = { 0 } */;

    if (size)
    {
        int dim = 1 << (size + 2);
        uint32_t totEnergy = 0;
        for (int i = 0; i < dim; i += 8)
        {
            for (int j = 0; j < dim; j += 8)
            {
                /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
                int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride, zeroBuf, 0) -
                                   (sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
                int reconEnergy = sa8d_8x8(recon + i * rstride + j, rstride, zeroBuf, 0) -
                                  (sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);

                totEnergy += abs(sourceEnergy - reconEnergy);
            }
        }
        return totEnergy;
    }
    else
    {
        /* 4x4 is too small for sa8d */
        int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
        int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
        return abs(sourceEnergy - reconEnergy);
    }
}

template<int bx, int by>
void blockcopy_pp_c(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            a[x] = b[x];

        a += stridea;
        b += strideb;
    }
}

template<int bx, int by>
void blockcopy_ss_c(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            a[x] = b[x];

        a += stridea;
        b += strideb;
    }
}

template<int bx, int by>
void blockcopy_sp_c(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
        {
            X265_CHECK((b[x] >= 0) && (b[x] <= ((1 << X265_DEPTH) - 1)), "blockcopy pixel size fail\n");
            a[x] = (pixel)b[x];
        }

        a += stridea;
        b += strideb;
    }
}

template<int bx, int by>
void blockcopy_ps_c(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            a[x] = (int16_t)b[x];

        a += stridea;
        b += strideb;
    }
}

template<int bx, int by>
void pixel_sub_ps_c(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            a[x] = (int16_t)(b0[x] - b1[x]);

        b0 += sstride0;
        b1 += sstride1;
        a += dstride;
    }
}

template<int bx, int by>
void pixel_add_ps_c(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            a[x] = Clip(b0[x] + b1[x]);

        b0 += sstride0;
        b1 += sstride1;
        a += dstride;
    }
}

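/* Round a pair of 16-bit bi-prediction intermediates back to pixel depth:
 * dst = clip((src0 + src1 + offset) >> shiftNum), where offset folds the
 * rounding term together with removal of both sources' IF_INTERNAL_OFFS bias;
 * bx is even for every partition size, so the inner loop unrolls by two */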
template<int bx, int by>
void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
{
    int shiftNum, offset;

    shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
    offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;

    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x += 2)
        {
            dst[x + 0] = Clip((src0[x + 0] + src1[x + 0] + offset) >> shiftNum);
            dst[x + 1] = Clip((src0[x + 1] + src1[x + 1] + offset) >> shiftNum);
        }

        src0 += src0Stride;
        src1 += src1Stride;
        dst += dstStride;
    }
}

void planecopy_cp_c(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
{
    for (int r = 0; r < height; r++)
    {
        for (int c = 0; c < width; c++)
            dst[c] = ((pixel)src[c]) << shift;

        dst += dstStride;
        src += srcStride;
    }
}

void planecopy_sp_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
{
    for (int r = 0; r < height; r++)
    {
        for (int c = 0; c < width; c++)
            dst[c] = (pixel)((src[c] >> shift) & mask);

        dst += dstStride;
        src += srcStride;
    }
}

/* Estimate the total amount of influence on future quality that could be had if we
 * were to improve the reference samples used to inter predict any given CU. */
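/* propagateAmount combines the cost propagated from future frames with this
 * block's own intra cost weighted by the frame duration factor. The fraction
 * (intra - inter) / intra estimates how much of the block is newly coded
 * rather than inherited from its references; only the low 14 bits of
 * interCosts hold the cost itself (the upper bits carry reference metadata),
 * hence the mask below. */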
void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
                             const int32_t* invQscales, const double* fpsFactor, int len)
{
    double fps = *fpsFactor / 256;

    for (int i = 0; i < len; i++)
    {
        double intraCost = intraCosts[i] * invQscales[i];
        double propagateAmount = (double)propagateIn[i] + intraCost * fps;
        double propagateNum = (double)intraCosts[i] - (interCosts[i] & ((1 << 14) - 1));
        double propagateDenom = (double)intraCosts[i];
        dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5);
    }
}
} // end anonymous namespace

namespace x265 {
// x265 private namespace

/* Extend the edges of a picture so that it may safely be used for motion
 * compensation. This function assumes the picture is stored in a buffer with
 * sufficient padding for the X and Y margins */
void extendPicBorder(pixel* pic, intptr_t stride, int width, int height, int marginX, int marginY)
{
    /* extend left and right margins */
    primitives.extendRowBorder(pic, stride, width, height, marginX);

    /* copy top row to create above margin */
    pixel* top = pic - marginX;
    for (int y = 0; y < marginY; y++)
        memcpy(top - (y + 1) * stride, top, stride * sizeof(pixel));

    /* copy bottom row to create below margin */
    pixel* bot = pic - marginX + (height - 1) * stride;
    for (int y = 0; y < marginY; y++)
        memcpy(bot + (y + 1) * stride, bot, stride * sizeof(pixel));
}

/* Initialize entries for pixel functions defined in this file */
void Setup_C_PixelPrimitives(EncoderPrimitives &p)
{
    SET_FUNC_PRIMITIVE_TABLE_C2(sad)
    SET_FUNC_PRIMITIVE_TABLE_C2(sad_x3)
    SET_FUNC_PRIMITIVE_TABLE_C2(sad_x4)
    SET_FUNC_PRIMITIVE_TABLE_C2(pixelavg_pp)

    // satd
    p.satd[LUMA_4x4] = satd_4x4;
    p.satd[LUMA_8x8] = satd8<8, 8>;
    p.satd[LUMA_8x4] = satd_8x4;
    p.satd[LUMA_4x8] = satd4<4, 8>;
    p.satd[LUMA_16x16] = satd8<16, 16>;
    p.satd[LUMA_16x8] = satd8<16, 8>;
    p.satd[LUMA_8x16] = satd8<8, 16>;
    p.satd[LUMA_16x12] = satd8<16, 12>;
    p.satd[LUMA_12x16] = satd4<12, 16>;
    p.satd[LUMA_16x4] = satd8<16, 4>;
    p.satd[LUMA_4x16] = satd4<4, 16>;
    p.satd[LUMA_32x32] = satd8<32, 32>;
    p.satd[LUMA_32x16] = satd8<32, 16>;
    p.satd[LUMA_16x32] = satd8<16, 32>;
    p.satd[LUMA_32x24] = satd8<32, 24>;
    p.satd[LUMA_24x32] = satd8<24, 32>;
    p.satd[LUMA_32x8] = satd8<32, 8>;
    p.satd[LUMA_8x32] = satd8<8, 32>;
    p.satd[LUMA_64x64] = satd8<64, 64>;
    p.satd[LUMA_64x32] = satd8<64, 32>;
    p.satd[LUMA_32x64] = satd8<32, 64>;
    p.satd[LUMA_64x48] = satd8<64, 48>;
    p.satd[LUMA_48x64] = satd8<48, 64>;
    p.satd[LUMA_64x16] = satd8<64, 16>;
    p.satd[LUMA_16x64] = satd8<16, 64>;

    p.chroma[X265_CSP_I420].satd[CHROMA_2x2] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_4x4] = satd_4x4;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x8] = satd8<8, 8>;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x16] = satd8<16, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_32x32] = satd8<32, 32>;

    p.chroma[X265_CSP_I420].satd[CHROMA_4x2] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_2x4] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x4] = satd_8x4;
    p.chroma[X265_CSP_I420].satd[CHROMA_4x8] = satd4<4, 8>;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x8] = satd8<16, 8>;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x16] = satd8<8, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_32x16] = satd8<32, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x32] = satd8<16, 32>;

    p.chroma[X265_CSP_I420].satd[CHROMA_8x6] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_6x8] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x2] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_2x8] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x12] = satd4<16, 12>;
    p.chroma[X265_CSP_I420].satd[CHROMA_12x16] = satd4<12, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x4] = satd4<16, 4>;
    p.chroma[X265_CSP_I420].satd[CHROMA_4x16] = satd4<4, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_32x24] = satd8<32, 24>;
    p.chroma[X265_CSP_I420].satd[CHROMA_24x32] = satd8<24, 32>;
    p.chroma[X265_CSP_I420].satd[CHROMA_32x8] = satd8<32, 8>;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x32] = satd8<8, 32>;

    p.chroma[X265_CSP_I422].satd[CHROMA422_2x4] = NULL;
    p.chroma[X265_CSP_I422].satd[CHROMA422_4x8] = satd4<4, 8>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x16] = satd8<8, 16>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x32] = satd8<16, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_32x64] = satd8<32, 64>;

    p.chroma[X265_CSP_I422].satd[CHROMA422_4x4] = satd_4x4;
    p.chroma[X265_CSP_I422].satd[CHROMA422_2x8] = NULL;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x8] = satd8<8, 8>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_4x16] = satd4<4, 16>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x16] = satd8<16, 16>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x32] = satd8<8, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_32x32] = satd8<32, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x64] = satd8<16, 64>;

    p.chroma[X265_CSP_I422].satd[CHROMA422_8x12] = satd4<8, 12>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_6x16] = NULL;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x4] = satd4<8, 4>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_2x16] = NULL;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x24] = satd8<16, 24>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_12x32] = satd4<12, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x8] = satd8<16, 8>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_4x32] = satd4<4, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_32x48] = satd8<32, 48>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_24x64] = satd8<24, 64>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_32x16] = satd8<32, 16>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x64] = satd8<8, 64>;

#define CHROMA_420(W, H) \
    p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H] = addAvg<W, H>; \
    p.chroma[X265_CSP_I420].copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.chroma[X265_CSP_I420].copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.chroma[X265_CSP_I420].copy_ss[CHROMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define CHROMA_422(W, H) \
    p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = addAvg<W, H>; \
    p.chroma[X265_CSP_I422].copy_pp[CHROMA422_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.chroma[X265_CSP_I422].copy_ps[CHROMA422_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.chroma[X265_CSP_I422].copy_ss[CHROMA422_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define CHROMA_444(W, H) \
    p.chroma[X265_CSP_I444].satd[LUMA_ ## W ## x ## H] = p.satd[LUMA_ ## W ## x ## H]; \
    p.chroma[X265_CSP_I444].addAvg[LUMA_ ## W ## x ## H] = addAvg<W, H>; \
    p.chroma[X265_CSP_I444].copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.chroma[X265_CSP_I444].copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.chroma[X265_CSP_I444].copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.chroma[X265_CSP_I444].copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define LUMA(W, H) \
    p.luma_addAvg[LUMA_ ## W ## x ## H] = addAvg<W, H>; \
    p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.luma_copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.luma_copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define LUMA_PIXELSUB(W, H) \
    p.luma_sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.luma_add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;

#define CHROMA_PIXELSUB_420(W, H) \
    p.chroma[X265_CSP_I420].sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.chroma[X265_CSP_I420].add_ps[CHROMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;

#define CHROMA_PIXELSUB_422(W, H) \
    p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## W ## x ## H] = pixel_add_ps_c<W, H>;

#define CHROMA_PIXELSUB_444(W, H) \
    p.chroma[X265_CSP_I444].sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.chroma[X265_CSP_I444].add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;

    LUMA(4, 4);
    LUMA(8, 8);
    CHROMA_420(4, 4);
    LUMA(4, 8);
    CHROMA_420(2, 4);
    LUMA(8, 4);
    CHROMA_420(4, 2);
    LUMA(16, 16);
    CHROMA_420(8, 8);
    LUMA(16, 8);
    CHROMA_420(8, 4);
    LUMA(8, 16);
    CHROMA_420(4, 8);
    LUMA(16, 12);
    CHROMA_420(8, 6);
    LUMA(12, 16);
    CHROMA_420(6, 8);
    LUMA(16, 4);
    CHROMA_420(8, 2);
    LUMA(4, 16);
    CHROMA_420(2, 8);
    LUMA(32, 32);
    CHROMA_420(16, 16);
    LUMA(32, 16);
    CHROMA_420(16, 8);
    LUMA(16, 32);
    CHROMA_420(8, 16);
    LUMA(32, 24);
    CHROMA_420(16, 12);
    LUMA(24, 32);
    CHROMA_420(12, 16);
    LUMA(32, 8);
    CHROMA_420(16, 4);
    LUMA(8, 32);
    CHROMA_420(4, 16);
    LUMA(64, 64);
    CHROMA_420(32, 32);
    LUMA(64, 32);
    CHROMA_420(32, 16);
    LUMA(32, 64);
    CHROMA_420(16, 32);
    LUMA(64, 48);
    CHROMA_420(32, 24);
    LUMA(48, 64);
    CHROMA_420(24, 32);
    LUMA(64, 16);
    CHROMA_420(32, 8);
    LUMA(16, 64);
    CHROMA_420(8, 32);

    LUMA_PIXELSUB(4, 4);
    LUMA_PIXELSUB(8, 8);
    LUMA_PIXELSUB(16, 16);
    LUMA_PIXELSUB(32, 32);
    LUMA_PIXELSUB(64, 64);
    CHROMA_PIXELSUB_420(4, 4)
    CHROMA_PIXELSUB_420(8, 8)
    CHROMA_PIXELSUB_420(16, 16)
    CHROMA_PIXELSUB_420(32, 32)
    CHROMA_PIXELSUB_422(4, 8)
    CHROMA_PIXELSUB_422(8, 16)
    CHROMA_PIXELSUB_422(16, 32)
    CHROMA_PIXELSUB_422(32, 64)
    CHROMA_PIXELSUB_444(8, 8)
    CHROMA_PIXELSUB_444(16, 16)
    CHROMA_PIXELSUB_444(32, 32)
    CHROMA_PIXELSUB_444(64, 64)

    CHROMA_422(4, 8);
    CHROMA_422(4, 4);
    CHROMA_422(2, 8);
    CHROMA_422(8, 16);
    CHROMA_422(8, 8);
    CHROMA_422(4, 16);
    CHROMA_422(8, 12);
    CHROMA_422(6, 16);
    CHROMA_422(8, 4);
    CHROMA_422(2, 16);
    CHROMA_422(16, 32);
    CHROMA_422(16, 16);
    CHROMA_422(8, 32);
    CHROMA_422(16, 24);
    CHROMA_422(12, 32);
    CHROMA_422(16, 8);
    CHROMA_422(4, 32);
    CHROMA_422(32, 64);
    CHROMA_422(32, 32);
    CHROMA_422(16, 64);
    CHROMA_422(32, 48);
    CHROMA_422(24, 64);
    CHROMA_422(32, 16);
    CHROMA_422(8, 64);

    CHROMA_444(4, 4);
    CHROMA_444(8, 8);
    CHROMA_444(4, 8);
    CHROMA_444(8, 4);
    CHROMA_444(16, 16);
    CHROMA_444(16, 8);
    CHROMA_444(8, 16);
    CHROMA_444(16, 12);
    CHROMA_444(12, 16);
    CHROMA_444(16, 4);
    CHROMA_444(4, 16);
    CHROMA_444(32, 32);
    CHROMA_444(32, 16);
    CHROMA_444(16, 32);
    CHROMA_444(32, 24);
    CHROMA_444(24, 32);
    CHROMA_444(32, 8);
    CHROMA_444(8, 32);
    CHROMA_444(64, 64);
    CHROMA_444(64, 32);
    CHROMA_444(32, 64);
    CHROMA_444(64, 48);
    CHROMA_444(48, 64);
    CHROMA_444(64, 16);
    CHROMA_444(16, 64);

    SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixel, pixel)
    SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, int16_t, pixel)
    SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, int16_t, int16_t)

    p.blockfill_s[BLOCK_4x4] = blockfil_s_c<4>;
    p.blockfill_s[BLOCK_8x8] = blockfil_s_c<8>;
    p.blockfill_s[BLOCK_16x16] = blockfil_s_c<16>;
    p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>;
    p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>;

    p.cpy2Dto1D_shl[BLOCK_4x4] = cpy2Dto1D_shl<4>;
    p.cpy2Dto1D_shl[BLOCK_8x8] = cpy2Dto1D_shl<8>;
    p.cpy2Dto1D_shl[BLOCK_16x16] = cpy2Dto1D_shl<16>;
    p.cpy2Dto1D_shl[BLOCK_32x32] = cpy2Dto1D_shl<32>;
    p.cpy2Dto1D_shr[BLOCK_4x4] = cpy2Dto1D_shr<4>;
    p.cpy2Dto1D_shr[BLOCK_8x8] = cpy2Dto1D_shr<8>;
    p.cpy2Dto1D_shr[BLOCK_16x16] = cpy2Dto1D_shr<16>;
    p.cpy2Dto1D_shr[BLOCK_32x32] = cpy2Dto1D_shr<32>;
    p.cpy1Dto2D_shl[BLOCK_4x4] = cpy1Dto2D_shl<4>;
    p.cpy1Dto2D_shl[BLOCK_8x8] = cpy1Dto2D_shl<8>;
    p.cpy1Dto2D_shl[BLOCK_16x16] = cpy1Dto2D_shl<16>;
    p.cpy1Dto2D_shl[BLOCK_32x32] = cpy1Dto2D_shl<32>;
    p.cpy1Dto2D_shr[BLOCK_4x4] = cpy1Dto2D_shr<4>;
    p.cpy1Dto2D_shr[BLOCK_8x8] = cpy1Dto2D_shr<8>;
    p.cpy1Dto2D_shr[BLOCK_16x16] = cpy1Dto2D_shr<16>;
    p.cpy1Dto2D_shr[BLOCK_32x32] = cpy1Dto2D_shr<32>;

    p.sa8d[BLOCK_4x4] = satd_4x4;
    p.sa8d[BLOCK_8x8] = sa8d_8x8;
    p.sa8d[BLOCK_16x16] = sa8d_16x16;
    p.sa8d[BLOCK_32x32] = sa8d16<32, 32>;
    p.sa8d[BLOCK_64x64] = sa8d16<64, 64>;

    p.psy_cost_pp[BLOCK_4x4] = psyCost_pp<BLOCK_4x4>;
    p.psy_cost_pp[BLOCK_8x8] = psyCost_pp<BLOCK_8x8>;
    p.psy_cost_pp[BLOCK_16x16] = psyCost_pp<BLOCK_16x16>;
    p.psy_cost_pp[BLOCK_32x32] = psyCost_pp<BLOCK_32x32>;
    p.psy_cost_pp[BLOCK_64x64] = psyCost_pp<BLOCK_64x64>;

    p.psy_cost_ss[BLOCK_4x4] = psyCost_ss<BLOCK_4x4>;
    p.psy_cost_ss[BLOCK_8x8] = psyCost_ss<BLOCK_8x8>;
    p.psy_cost_ss[BLOCK_16x16] = psyCost_ss<BLOCK_16x16>;
    p.psy_cost_ss[BLOCK_32x32] = psyCost_ss<BLOCK_32x32>;
    p.psy_cost_ss[BLOCK_64x64] = psyCost_ss<BLOCK_64x64>;

    p.sa8d_inter[LUMA_4x4] = satd_4x4;
    p.sa8d_inter[LUMA_8x8] = sa8d_8x8;
    p.sa8d_inter[LUMA_8x4] = satd_8x4;
    p.sa8d_inter[LUMA_4x8] = satd4<4, 8>;
    p.sa8d_inter[LUMA_16x16] = sa8d_16x16;
    p.sa8d_inter[LUMA_16x8] = sa8d8<16, 8>;
    p.sa8d_inter[LUMA_8x16] = sa8d8<8, 16>;
    p.sa8d_inter[LUMA_16x12] = satd8<16, 12>;
    p.sa8d_inter[LUMA_12x16] = satd4<12, 16>;
    p.sa8d_inter[LUMA_4x16] = satd4<4, 16>;
    p.sa8d_inter[LUMA_16x4] = satd8<16, 4>;
    p.sa8d_inter[LUMA_32x32] = sa8d16<32, 32>;
    p.sa8d_inter[LUMA_32x16] = sa8d16<32, 16>;
    p.sa8d_inter[LUMA_16x32] = sa8d16<16, 32>;
    p.sa8d_inter[LUMA_32x24] = sa8d8<32, 24>;
    p.sa8d_inter[LUMA_24x32] = sa8d8<24, 32>;
    p.sa8d_inter[LUMA_32x8] = sa8d8<32, 8>;
    p.sa8d_inter[LUMA_8x32] = sa8d8<8, 32>;
    p.sa8d_inter[LUMA_64x64] = sa8d16<64, 64>;
    p.sa8d_inter[LUMA_64x32] = sa8d16<64, 32>;
    p.sa8d_inter[LUMA_32x64] = sa8d16<32, 64>;
    p.sa8d_inter[LUMA_64x48] = sa8d16<64, 48>;
    p.sa8d_inter[LUMA_48x64] = sa8d16<48, 64>;
    p.sa8d_inter[LUMA_64x16] = sa8d16<64, 16>;
    p.sa8d_inter[LUMA_16x64] = sa8d16<16, 64>;

    p.calcresidual[BLOCK_4x4] = getResidual<4>;
    p.calcresidual[BLOCK_8x8] = getResidual<8>;
    p.calcresidual[BLOCK_16x16] = getResidual<16>;
    p.calcresidual[BLOCK_32x32] = getResidual<32>;
    p.calcresidual[BLOCK_64x64] = NULL;

    p.transpose[BLOCK_4x4] = transpose<4>;
    p.transpose[BLOCK_8x8] = transpose<8>;
    p.transpose[BLOCK_16x16] = transpose<16>;
    p.transpose[BLOCK_32x32] = transpose<32>;
    p.transpose[BLOCK_64x64] = transpose<64>;

    p.ssd_s[BLOCK_4x4] = pixel_ssd_s_c<4>;
    p.ssd_s[BLOCK_8x8] = pixel_ssd_s_c<8>;
    p.ssd_s[BLOCK_16x16] = pixel_ssd_s_c<16>;
    p.ssd_s[BLOCK_32x32] = pixel_ssd_s_c<32>;

    p.weight_pp = weight_pp_c;
    p.weight_sp = weight_sp_c;

    p.scale1D_128to64 = scale1D_128to64;
    p.scale2D_64to32 = scale2D_64to32;
    p.frameInitLowres = frame_init_lowres_core;
    p.ssim_4x4x2_core = ssim_4x4x2_core;
    p.ssim_end_4 = ssim_end_4;

    p.var[BLOCK_8x8] = pixel_var<8>;
    p.var[BLOCK_16x16] = pixel_var<16>;
    p.var[BLOCK_32x32] = pixel_var<32>;
    p.var[BLOCK_64x64] = pixel_var<64>;
    p.planecopy_cp = planecopy_cp_c;
    p.planecopy_sp = planecopy_sp_c;
    p.propagateCost = estimateCUPropagateCost;
}
}