/*****************************************************************************
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
 *          Mandar Gurav <mandar@multicorewareinc.com>
 *          Mahesh Pittala <mahesh@multicorewareinc.com>
 *          Min Chen <min.chen@multicorewareinc.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/
#include "common.h"
#include "primitives.h"

#include <cstdlib> // abs()
#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, DATA_TYPE1, DATA_TYPE2) \
    p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX_DEF<4, 4, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x8] = FUNC_PREFIX_DEF<8, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x4] = FUNC_PREFIX_DEF<8, 4, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_4x8] = FUNC_PREFIX_DEF<4, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x8] = FUNC_PREFIX_DEF<16, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x16] = FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x4] = FUNC_PREFIX_DEF<16, 4, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_4x16] = FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x8] = FUNC_PREFIX_DEF<32, 8, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_8x32] = FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
    p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
#define SET_FUNC_PRIMITIVE_TABLE_C2(FUNC_PREFIX) \
    p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX<4, 4>; \
    p.FUNC_PREFIX[LUMA_8x8] = FUNC_PREFIX<8, 8>; \
    p.FUNC_PREFIX[LUMA_8x4] = FUNC_PREFIX<8, 4>; \
    p.FUNC_PREFIX[LUMA_4x8] = FUNC_PREFIX<4, 8>; \
    p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX<16, 16>; \
    p.FUNC_PREFIX[LUMA_16x8] = FUNC_PREFIX<16, 8>; \
    p.FUNC_PREFIX[LUMA_8x16] = FUNC_PREFIX<8, 16>; \
    p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX<16, 12>; \
    p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX<12, 16>; \
    p.FUNC_PREFIX[LUMA_16x4] = FUNC_PREFIX<16, 4>; \
    p.FUNC_PREFIX[LUMA_4x16] = FUNC_PREFIX<4, 16>; \
    p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX<32, 32>; \
    p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX<32, 16>; \
    p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX<16, 32>; \
    p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX<32, 24>; \
    p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX<24, 32>; \
    p.FUNC_PREFIX[LUMA_32x8] = FUNC_PREFIX<32, 8>; \
    p.FUNC_PREFIX[LUMA_8x32] = FUNC_PREFIX<8, 32>; \
    p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX<64, 64>; \
    p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX<64, 32>; \
    p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX<32, 64>; \
    p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX<64, 48>; \
    p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX<48, 64>; \
    p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX<64, 16>; \
    p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX<16, 64>;
using namespace x265;

namespace {
// place functions in anonymous namespace (file static)
template<int lx, int ly>
int sad(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int sum = 0;

    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
            sum += abs(pix1[x] - pix2[x]);

        pix1 += stride_pix1;
        pix2 += stride_pix2;
    }

    return sum;
}
template<int lx, int ly>
int sad(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
{
    int sum = 0;

    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
            sum += abs(pix1[x] - pix2[x]);

        pix1 += stride_pix1;
        pix2 += stride_pix2;
    }

    return sum;
}
template<int lx, int ly>
void sad_x3(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
{
    res[0] = 0;
    res[1] = 0;
    res[2] = 0;
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            res[0] += abs(pix1[x] - pix2[x]);
            res[1] += abs(pix1[x] - pix3[x]);
            res[2] += abs(pix1[x] - pix4[x]);
        }

        pix1 += FENC_STRIDE;
        pix2 += frefstride;
        pix3 += frefstride;
        pix4 += frefstride;
    }
}
template<int lx, int ly>
void sad_x4(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
{
    res[0] = 0;
    res[1] = 0;
    res[2] = 0;
    res[3] = 0;
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            res[0] += abs(pix1[x] - pix2[x]);
            res[1] += abs(pix1[x] - pix3[x]);
            res[2] += abs(pix1[x] - pix4[x]);
            res[3] += abs(pix1[x] - pix5[x]);
        }

        pix1 += FENC_STRIDE;
        pix2 += frefstride;
        pix3 += frefstride;
        pix4 += frefstride;
        pix5 += frefstride;
    }
}
template<int lx, int ly, class T1, class T2>
int sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
{
    int sum = 0;
    int tmp;

    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            tmp = pix1[x] - pix2[x];
            sum += (tmp * tmp);
        }

        pix1 += stride_pix1;
        pix2 += stride_pix2;
    }

    return sum;
}
#define BITS_PER_SUM (8 * sizeof(sum_t))
#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) { \
        sum2_t t0 = s0 + s1; \
        sum2_t t1 = s0 - s1; \
        sum2_t t2 = s2 + s3; \
        sum2_t t3 = s2 - s3; \
        d0 = t0 + t2; \
        d2 = t0 - t2; \
        d1 = t1 + t3; \
        d3 = t1 - t3; \
}
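
/* The butterfly above expands to a 4-point Hadamard transform:
 *   d0 = s0 + s1 + s2 + s3
 *   d1 = s0 - s1 + s2 - s3
 *   d2 = s0 + s1 - s2 - s3
 *   d3 = s0 - s1 - s2 + s3 */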
// in: a pseudo-simd number of the form x+(y<<16)
// return: abs(x)+(abs(y)<<16)
inline sum2_t abs2(sum2_t a)
{
    sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1);

    return (a + s) ^ s;
}
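
/* Worked example, assuming a build where sum_t is 16 bits wide
 * (BITS_PER_SUM == 16): x = -3, y = 5 gives a = x + (y << 16) = 0x0004FFFD.
 * The mask s replicates each lane's sign bit across that lane, so
 * s = 0x0000FFFF, and (a + s) ^ s = 0x00050003 = abs(x) + (abs(y) << 16);
 * the carry out of the low lane restores the borrow taken when x < 0. */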
int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    sum2_t tmp[4][2];
    sum2_t a0, a1, a2, a3, b0, b1;
    sum2_t sum = 0;

    for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
        tmp[i][0] = b0 + b1;
        tmp[i][1] = b0 - b1;
    }

    for (int i = 0; i < 2; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
        sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM);
    }

    return (int)(sum >> 1);
}
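
/* In satd_4x4 above, each sum2_t packs two results: b0 holds (a0 + a1) in its
 * low lane and (a0 - a1) in its high lane, so the horizontal and vertical
 * transforms run on two packed columns at once. The fold
 * ((sum_t)a0) + (a0 >> BITS_PER_SUM) then adds the two lanes together. */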
int satd_4x4(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
{
    ssum2_t tmp[4][2];
    ssum2_t a0, a1, a2, a3, b0, b1;
    ssum2_t sum = 0;

    for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
        tmp[i][0] = b0 + b1;
        tmp[i][1] = b0 - b1;
    }

    for (int i = 0; i < 2; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
        sum += ((sum_t)a0) + (a0 >> BITS_PER_SUM);
    }

    return (int)(sum >> 1);
}
// x264's SWAR version of satd 8x4, performs two 4x4 SATDs at once
int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    sum2_t tmp[4][4];
    sum2_t a0, a1, a2, a3;
    sum2_t sum = 0;

    for (int i = 0; i < 4; i++, pix1 += stride_pix1, pix2 += stride_pix2)
    {
        a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);
        a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);
        a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);
        a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);
        HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
    }

    for (int i = 0; i < 4; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }

    return (((sum_t)sum) + (sum >> BITS_PER_SUM)) >> 1;
}
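
/* satd_8x4 above packs columns 0-3 into the low lanes and columns 4-7 into
 * the high lanes, so the two 4x4 SATDs are computed simultaneously; the
 * return statement folds the two lane totals together before the final
 * normalizing shift. */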
template<int w, int h>
// calculate satd in blocks of 4x4
int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;

    for (int row = 0; row < h; row += 4)
        for (int col = 0; col < w; col += 4)
            satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
                             pix2 + row * stride_pix2 + col, stride_pix2);

    return satd;
}
template<int w, int h>
// calculate satd in blocks of 8x4
int satd8(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;

    for (int row = 0; row < h; row += 4)
        for (int col = 0; col < w; col += 8)
            satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
                             pix2 + row * stride_pix2 + col, stride_pix2);

    return satd;
}
inline int _sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    sum2_t tmp[8][4];
    sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
    sum2_t sum = 0;

    for (int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2)
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
        a4 = pix1[4] - pix2[4];
        a5 = pix1[5] - pix2[5];
        b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM);
        a6 = pix1[6] - pix2[6];
        a7 = pix1[7] - pix2[7];
        b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM);
        HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3);
    }

    for (int i = 0; i < 4; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]);
        b0 = abs2(a0 + a4) + abs2(a0 - a4);
        b0 += abs2(a1 + a5) + abs2(a1 - a5);
        b0 += abs2(a2 + a6) + abs2(a2 - a6);
        b0 += abs2(a3 + a7) + abs2(a3 - a7);
        sum += (sum_t)b0 + (b0 >> BITS_PER_SUM);
    }

    return (int)sum;
}
int sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
}
inline int _sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2)
{
    ssum2_t tmp[8][4];
    ssum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
    ssum2_t sum = 0;

    for (int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2)
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0 + a1) + ((a0 - a1) << BITS_PER_SUM);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2 + a3) + ((a2 - a3) << BITS_PER_SUM);
        a4 = pix1[4] - pix2[4];
        a5 = pix1[5] - pix2[5];
        b2 = (a4 + a5) + ((a4 - a5) << BITS_PER_SUM);
        a6 = pix1[6] - pix2[6];
        a7 = pix1[7] - pix2[7];
        b3 = (a6 + a7) + ((a6 - a7) << BITS_PER_SUM);
        HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0, b1, b2, b3);
    }

    for (int i = 0; i < 4; i++)
    {
        HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
        HADAMARD4(a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i]);
        b0 = abs2(a0 + a4) + abs2(a0 - a4);
        b0 += abs2(a1 + a5) + abs2(a1 - a5);
        b0 += abs2(a2 + a6) + abs2(a2 - a6);
        b0 += abs2(a3 + a7) + abs2(a3 - a7);
        sum += (sum_t)b0 + (b0 >> BITS_PER_SUM);
    }

    return (int)sum;
}
int sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2)
{
    return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
}
int sa8d_16x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2)
        + _sa8d_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2)
        + _sa8d_8x8(pix1 + 8 * i_pix1, i_pix1, pix2 + 8 * i_pix2, i_pix2)
        + _sa8d_8x8(pix1 + 8 + 8 * i_pix1, i_pix1, pix2 + 8 + 8 * i_pix2, i_pix2);

    // This matches x264 sa8d_16x16, but is slightly different from HM's behavior because
    // this version only rounds once at the end
    return (sum + 2) >> 2;
}
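
/* Rounding once matters: four raw 8x8 sums of 10 give (40 + 2) >> 2 = 10
 * here, while rounding each block first, as HM does, would give
 * ((10 + 2) >> 2) * 4 = 12. */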
template<int w, int h>
// Calculate sa8d in blocks of 8x8
int sa8d8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    int cost = 0;

    for (int y = 0; y < h; y += 8)
        for (int x = 0; x < w; x += 8)
            cost += sa8d_8x8(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);

    return cost;
}
template<int w, int h>
// Calculate sa8d in blocks of 16x16
int sa8d16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    int cost = 0;

    for (int y = 0; y < h; y += 16)
        for (int x = 0; x < w; x += 16)
            cost += sa8d_16x16(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);

    return cost;
}
template<int size>
int pixel_ssd_s_c(const int16_t* a, intptr_t dstride)
{
    int sum = 0;

    for (int y = 0; y < size; y++)
    {
        for (int x = 0; x < size; x++)
            sum += a[x] * a[x];

        a += dstride;
    }

    return sum;
}
template<int size>
void blockfil_s_c(int16_t* dst, intptr_t dstride, int16_t val)
{
    for (int y = 0; y < size; y++)
        for (int x = 0; x < size; x++)
            dst[y * dstride + x] = val;
}
template<int size>
void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
{
    X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
    X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
    X265_CHECK(shift >= 0, "invalid shift\n");

    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
            dst[j] = src[j] << shift;

        src += srcStride;
        dst += size;
    }
}
template<int size>
void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
{
    X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
    X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
    X265_CHECK(shift > 0, "invalid shift\n");

    int16_t round = 1 << (shift - 1);
    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
            dst[j] = (src[j] + round) >> shift;

        src += srcStride;
        dst += size;
    }
}
template<int size>
void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
{
    X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
    X265_CHECK(shift >= 0, "invalid shift\n");

    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
            dst[j] = src[j] << shift;

        src += size;
        dst += dstStride;
    }
}
template<int size>
void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
{
    X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
    X265_CHECK(shift > 0, "invalid shift\n");

    int16_t round = 1 << (shift - 1);
    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++)
            dst[j] = (src[j] + round) >> shift;

        src += size;
        dst += dstStride;
    }
}
template<int blockSize>
void getResidual(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)
{
    for (int y = 0; y < blockSize; y++)
    {
        for (int x = 0; x < blockSize; x++)
            residual[x] = static_cast<int16_t>(fenc[x]) - static_cast<int16_t>(pred[x]);

        fenc += stride;
        residual += stride;
        pred += stride;
    }
}
template<int blockSize>
void transpose(pixel* dst, const pixel* src, intptr_t stride)
{
    for (int k = 0; k < blockSize; k++)
        for (int l = 0; l < blockSize; l++)
            dst[k * blockSize + l] = src[l * stride + k];
}
void weight_sp_c(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
{
    int x, y;

    for (y = 0; y <= height - 1; y++)
    {
        for (x = 0; x <= width - 1; )
        {
            // note: width can be odd
            dst[x] = (pixel)Clip3(0, ((1 << X265_DEPTH) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);
            x++;
        }

        src += srcStride;
        dst += dstStride;
    }
}
void weight_pp_c(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
{
    int x, y;

    X265_CHECK(!(width & 15), "weightp alignment error\n");
    X265_CHECK(!((w0 << 6) > 32767), "w0 using more than 16 bits, asm output will mismatch\n");
    X265_CHECK(!(round > 32767), "round using more than 16 bits, asm output will mismatch\n");

    for (y = 0; y <= height - 1; y++)
    {
        for (x = 0; x <= width - 1; )
        {
            // simulating pixel to short conversion
            int16_t val = src[x] << (IF_INTERNAL_PREC - X265_DEPTH);
            dst[x] = (pixel)Clip3(0, ((1 << X265_DEPTH) - 1), ((w0 * (val) + round) >> shift) + offset);
            x++;
        }

        src += stride;
        dst += stride;
    }
}
template<int lx, int ly>
void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
{
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
            dst[x] = (src0[x] + src1[x] + 1) >> 1;

        src0 += sstride0;
        src1 += sstride1;
        dst += dstride;
    }
}
void scale1D_128to64(pixel* dst, const pixel* src, intptr_t /*stride*/)
{
    int x;

    for (x = 0; x < 128; x += 2)
    {
        pixel pix0 = src[(x + 0)];
        pixel pix1 = src[(x + 1)];
        int sum = pix0 + pix1;

        dst[x >> 1] = (pixel)((sum + 1) >> 1);
    }
}
void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
{
    int x, y;

    for (y = 0; y < 64; y += 2)
    {
        for (x = 0; x < 64; x += 2)
        {
            pixel pix0 = src[(y + 0) * stride + (x + 0)];
            pixel pix1 = src[(y + 0) * stride + (x + 1)];
            pixel pix2 = src[(y + 1) * stride + (x + 0)];
            pixel pix3 = src[(y + 1) * stride + (x + 1)];
            int sum = pix0 + pix1 + pix2 + pix3;

            dst[y / 2 * 32 + x / 2] = (pixel)((sum + 2) >> 2);
        }
    }
}
void frame_init_lowres_core(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc,
                            intptr_t src_stride, intptr_t dst_stride, int width, int height)
{
    for (int y = 0; y < height; y++)
    {
        const pixel* src1 = src0 + src_stride;
        const pixel* src2 = src1 + src_stride;
        for (int x = 0; x < width; x++)
        {
            // slower than naive bilinear, but matches asm
#define FILTER(a, b, c, d) ((((a + b + 1) >> 1) + ((c + d + 1) >> 1) + 1) >> 1)
            dst0[x] = FILTER(src0[2 * x], src1[2 * x], src0[2 * x + 1], src1[2 * x + 1]);
            dsth[x] = FILTER(src0[2 * x + 1], src1[2 * x + 1], src0[2 * x + 2], src1[2 * x + 2]);
            dstv[x] = FILTER(src1[2 * x], src2[2 * x], src1[2 * x + 1], src2[2 * x + 1]);
            dstc[x] = FILTER(src1[2 * x + 1], src2[2 * x + 1], src1[2 * x + 2], src2[2 * x + 2]);
#undef FILTER
        }

        src0 += src_stride * 2;
        dst0 += dst_stride;
        dsth += dst_stride;
        dstv += dst_stride;
        dstc += dst_stride;
    }
}
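
/* frame_init_lowres_core produces the four half-pel planes of the
 * half-resolution lookahead image: dst0 at integer positions, dsth offset a
 * half pixel horizontally, dstv vertically, and dstc diagonally, each output
 * pixel being the FILTER average of a 2x2 neighborhood of full-res samples. */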
/* structural similarity metric */
void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
{
    for (int z = 0; z < 2; z++)
    {
        uint32_t s1 = 0, s2 = 0, ss = 0, s12 = 0;

        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                int a = pix1[x + y * stride1];
                int b = pix2[x + y * stride2];

                s1 += a;
                s2 += b;
                ss += a * a;
                ss += b * b;
                s12 += a * b;
            }
        }

        sums[z][0] = s1;
        sums[z][1] = s2;
        sums[z][2] = ss;
        sums[z][3] = s12;
        pix1 += 4;
        pix2 += 4;
    }
}
float ssim_end_1(int s1, int s2, int ss, int s12)
{
/* Maximum value for 10-bit is: ss*64 = (2^10-1)^2*16*4*64 = 4286582784, which will overflow in some cases.
 * s1*s1, s2*s2, and s1*s2 also obtain this value for edge cases: ((2^10-1)*16*4)^2 = 4286582784.
 * Maximum value for 9-bit is: ss*64 = (2^9-1)^2*16*4*64 = 1069551616, which will not overflow. */

#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
#if HIGH_BIT_DEPTH
    X265_CHECK(X265_DEPTH == 10, "ssim invalid depth\n");
#define type float
    static const float ssim_c1 = (float)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64);
    static const float ssim_c2 = (float)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63);
#else
    X265_CHECK(X265_DEPTH == 8, "ssim invalid depth\n");
#define type int
    static const int ssim_c1 = (int)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64 + .5);
    static const int ssim_c2 = (int)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63 + .5);
#endif
    type fs1 = (type)s1;
    type fs2 = (type)s2;
    type fss = (type)ss;
    type fs12 = (type)s12;
    type vars = (type)(fss * 64 - fs1 * fs1 - fs2 * fs2);
    type covar = (type)(fs12 * 64 - fs1 * fs2);
    return (float)(2 * fs1 * fs2 + ssim_c1) * (float)(2 * covar + ssim_c2)
           / ((float)(fs1 * fs1 + fs2 * fs2 + ssim_c1) * (float)(vars + ssim_c2));
#undef type
#undef PIXEL_MAX
}
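
/* ssim_end_1 evaluates the usual SSIM term
 *   (2*m1*m2 + C1) * (2*covar + C2) / ((m1^2 + m2^2 + C1) * (var + C2))
 * on unnormalized sums over a 64-sample window; the window-size factors are
 * folded into ssim_c1 and ssim_c2 rather than dividing each statistic
 * through by 64 (the same scheme x264 uses). */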
float ssim_end_4(int sum0[5][4], int sum1[5][4], int width)
{
    float ssim = 0.0;

    for (int i = 0; i < width; i++)
    {
        ssim += ssim_end_1(sum0[i][0] + sum0[i + 1][0] + sum1[i][0] + sum1[i + 1][0],
                           sum0[i][1] + sum0[i + 1][1] + sum1[i][1] + sum1[i + 1][1],
                           sum0[i][2] + sum0[i + 1][2] + sum1[i][2] + sum1[i + 1][2],
                           sum0[i][3] + sum0[i + 1][3] + sum1[i][3] + sum1[i + 1][3]);
    }

    return ssim;
}
template<int size>
uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
{
    uint32_t sum = 0, sqr = 0;

    for (int y = 0; y < size; y++)
    {
        for (int x = 0; x < size; x++)
        {
            sum += pix[x];
            sqr += pix[x] * pix[x];
        }

        pix += i_stride;
    }

    return sum + ((uint64_t)sqr << 32);
}
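
/* The return value packs both statistics in one uint64_t: the pixel sum in
 * the low 32 bits and the sum of squares in the high 32 bits, so the caller
 * can derive the variance as (sqr - sum * sum / n) / n. */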
#if defined(_MSC_VER)
#pragma warning(disable: 4127) // conditional expression is constant
#endif
template<int size>
int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
{
    static pixel zeroBuf[8] /* = { 0 } */;

    if (size)
    {
        int dim = 1 << (size + 2);
        uint32_t totEnergy = 0;
        for (int i = 0; i < dim; i += 8)
        {
            for (int j = 0; j < dim; j += 8)
            {
                /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
                int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride, zeroBuf, 0) -
                                   (sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
                int reconEnergy = sa8d_8x8(recon + i * rstride + j, rstride, zeroBuf, 0) -
                                  (sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);

                totEnergy += abs(sourceEnergy - reconEnergy);
            }
        }
        return totEnergy;
    }
    else
    {
        /* 4x4 is too small for sa8d */
        int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
        int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
        return abs(sourceEnergy - reconEnergy);
    }
}
template<int size>
int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
{
    static int16_t zeroBuf[8] /* = { 0 } */;

    if (size)
    {
        int dim = 1 << (size + 2);
        uint32_t totEnergy = 0;
        for (int i = 0; i < dim; i += 8)
        {
            for (int j = 0; j < dim; j += 8)
            {
                /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
                int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride, zeroBuf, 0) -
                                   (sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
                int reconEnergy = sa8d_8x8(recon + i * rstride + j, rstride, zeroBuf, 0) -
                                  (sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);

                totEnergy += abs(sourceEnergy - reconEnergy);
            }
        }
        return totEnergy;
    }
    else
    {
        /* 4x4 is too small for sa8d */
        int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
        int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
        return abs(sourceEnergy - reconEnergy);
    }
}
template<int bx, int by>
void blockcopy_pp_c(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            a[x] = b[x];

        a += stridea;
        b += strideb;
    }
}
template<int bx, int by>
void blockcopy_ss_c(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            a[x] = b[x];

        a += stridea;
        b += strideb;
    }
}
template<int bx, int by>
void blockcopy_sp_c(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
        {
            X265_CHECK((b[x] >= 0) && (b[x] <= ((1 << X265_DEPTH) - 1)), "blockcopy pixel size fail\n");
            a[x] = (pixel)b[x];
        }

        a += stridea;
        b += strideb;
    }
}
template<int bx, int by>
void blockcopy_ps_c(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            a[x] = (int16_t)b[x];

        a += stridea;
        b += strideb;
    }
}
template<int bx, int by>
void pixel_sub_ps_c(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            a[x] = (int16_t)(b0[x] - b1[x]);

        b0 += sstride0;
        b1 += sstride1;
        a += dstride;
    }
}
template<int bx, int by>
void pixel_add_ps_c(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            a[x] = Clip(b0[x] + b1[x]);

        b0 += sstride0;
        b1 += sstride1;
        a += dstride;
    }
}
template<int bx, int by>
void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
{
    int shiftNum, offset;

    shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
    offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;

    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x += 2)
        {
            dst[x + 0] = Clip((src0[x + 0] + src1[x + 0] + offset) >> shiftNum);
            dst[x + 1] = Clip((src0[x + 1] + src1[x + 1] + offset) >> shiftNum);
        }

        src0 += src0Stride;
        src1 += src1Stride;
        dst += dstStride;
    }
}
void planecopy_cp_c(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
{
    for (int r = 0; r < height; r++)
    {
        for (int c = 0; c < width; c++)
            dst[c] = ((pixel)src[c]) << shift;

        dst += dstStride;
        src += srcStride;
    }
}
void planecopy_sp_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
{
    for (int r = 0; r < height; r++)
    {
        for (int c = 0; c < width; c++)
            dst[c] = (pixel)((src[c] >> shift) & mask);

        dst += dstStride;
        src += srcStride;
    }
}
/* Estimate the total amount of influence on future quality that could be had if we
 * were to improve the reference samples used to inter predict any given CU. */
void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
                             const int32_t* invQscales, const double* fpsFactor, int len)
{
    double fps = *fpsFactor / 256;

    for (int i = 0; i < len; i++)
    {
        double intraCost = intraCosts[i] * invQscales[i];
        double propagateAmount = (double)propagateIn[i] + intraCost * fps;
        double propagateNum = (double)intraCosts[i] - (interCosts[i] & ((1 << 14) - 1));
        double propagateDenom = (double)intraCosts[i];
        dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5);
    }
}
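
/* In effect each CU propagates
 *   (propagateIn + intraCost * fps) * (intraCost - interCost) / intraCost
 * to its references; the & ((1 << 14) - 1) mask assumes the low 14 bits of
 * interCosts hold the actual cost, with reference information packed in the
 * upper bits. */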
}  // end anonymous namespace

namespace x265 {
// x265 private namespace

/* Extend the edges of a picture so that it may safely be used for motion
 * compensation. This function assumes the picture is stored in a buffer with
 * sufficient padding for the X and Y margins */
void extendPicBorder(pixel* pic, intptr_t stride, int width, int height, int marginX, int marginY)
{
    /* extend left and right margins */
    primitives.extendRowBorder(pic, stride, width, height, marginX);

    /* copy top row to create above margin */
    pixel* top = pic - marginX;
    for (int y = 0; y < marginY; y++)
        memcpy(top - (y + 1) * stride, top, stride * sizeof(pixel));

    /* copy bottom row to create below margin */
    pixel* bot = pic - marginX + (height - 1) * stride;
    for (int y = 0; y < marginY; y++)
        memcpy(bot + (y + 1) * stride, bot, stride * sizeof(pixel));
}
/* Initialize entries for pixel functions defined in this file */
void Setup_C_PixelPrimitives(EncoderPrimitives &p)
{
    SET_FUNC_PRIMITIVE_TABLE_C2(sad)
    SET_FUNC_PRIMITIVE_TABLE_C2(sad_x3)
    SET_FUNC_PRIMITIVE_TABLE_C2(sad_x4)
    SET_FUNC_PRIMITIVE_TABLE_C2(pixelavg_pp)
    p.satd[LUMA_4x4] = satd_4x4;
    p.satd[LUMA_8x8] = satd8<8, 8>;
    p.satd[LUMA_8x4] = satd_8x4;
    p.satd[LUMA_4x8] = satd4<4, 8>;
    p.satd[LUMA_16x16] = satd8<16, 16>;
    p.satd[LUMA_16x8] = satd8<16, 8>;
    p.satd[LUMA_8x16] = satd8<8, 16>;
    p.satd[LUMA_16x12] = satd8<16, 12>;
    p.satd[LUMA_12x16] = satd4<12, 16>;
    p.satd[LUMA_16x4] = satd8<16, 4>;
    p.satd[LUMA_4x16] = satd4<4, 16>;
    p.satd[LUMA_32x32] = satd8<32, 32>;
    p.satd[LUMA_32x16] = satd8<32, 16>;
    p.satd[LUMA_16x32] = satd8<16, 32>;
    p.satd[LUMA_32x24] = satd8<32, 24>;
    p.satd[LUMA_24x32] = satd8<24, 32>;
    p.satd[LUMA_32x8] = satd8<32, 8>;
    p.satd[LUMA_8x32] = satd8<8, 32>;
    p.satd[LUMA_64x64] = satd8<64, 64>;
    p.satd[LUMA_64x32] = satd8<64, 32>;
    p.satd[LUMA_32x64] = satd8<32, 64>;
    p.satd[LUMA_64x48] = satd8<64, 48>;
    p.satd[LUMA_48x64] = satd8<48, 64>;
    p.satd[LUMA_64x16] = satd8<64, 16>;
    p.satd[LUMA_16x64] = satd8<16, 64>;
    p.chroma[X265_CSP_I420].satd[CHROMA_2x2] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_4x4] = satd_4x4;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x8] = satd8<8, 8>;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x16] = satd8<16, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_32x32] = satd8<32, 32>;

    p.chroma[X265_CSP_I420].satd[CHROMA_4x2] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_2x4] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x4] = satd_8x4;
    p.chroma[X265_CSP_I420].satd[CHROMA_4x8] = satd4<4, 8>;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x8] = satd8<16, 8>;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x16] = satd8<8, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_32x16] = satd8<32, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x32] = satd8<16, 32>;

    p.chroma[X265_CSP_I420].satd[CHROMA_8x6] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_6x8] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x2] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_2x8] = NULL;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x12] = satd4<16, 12>;
    p.chroma[X265_CSP_I420].satd[CHROMA_12x16] = satd4<12, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_16x4] = satd4<16, 4>;
    p.chroma[X265_CSP_I420].satd[CHROMA_4x16] = satd4<4, 16>;
    p.chroma[X265_CSP_I420].satd[CHROMA_32x24] = satd8<32, 24>;
    p.chroma[X265_CSP_I420].satd[CHROMA_24x32] = satd8<24, 32>;
    p.chroma[X265_CSP_I420].satd[CHROMA_32x8] = satd8<32, 8>;
    p.chroma[X265_CSP_I420].satd[CHROMA_8x32] = satd8<8, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_2x4] = NULL;
    p.chroma[X265_CSP_I422].satd[CHROMA422_4x8] = satd4<4, 8>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x16] = satd8<8, 16>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x32] = satd8<16, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_32x64] = satd8<32, 64>;

    p.chroma[X265_CSP_I422].satd[CHROMA422_4x4] = satd_4x4;
    p.chroma[X265_CSP_I422].satd[CHROMA422_2x8] = NULL;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x8] = satd8<8, 8>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_4x16] = satd4<4, 16>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x16] = satd8<16, 16>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x32] = satd8<8, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_32x32] = satd8<32, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x64] = satd8<16, 64>;

    p.chroma[X265_CSP_I422].satd[CHROMA422_8x12] = satd4<8, 12>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_6x16] = NULL;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x4] = satd4<8, 4>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_2x16] = NULL;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x24] = satd8<16, 24>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_12x32] = satd4<12, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_16x8] = satd8<16, 8>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_4x32] = satd4<4, 32>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_32x48] = satd8<32, 48>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_24x64] = satd8<24, 64>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_32x16] = satd8<32, 16>;
    p.chroma[X265_CSP_I422].satd[CHROMA422_8x64] = satd8<8, 64>;
#define CHROMA_420(W, H) \
    p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H] = addAvg<W, H>; \
    p.chroma[X265_CSP_I420].copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.chroma[X265_CSP_I420].copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.chroma[X265_CSP_I420].copy_ss[CHROMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define CHROMA_422(W, H) \
    p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = addAvg<W, H>; \
    p.chroma[X265_CSP_I422].copy_pp[CHROMA422_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.chroma[X265_CSP_I422].copy_ps[CHROMA422_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.chroma[X265_CSP_I422].copy_ss[CHROMA422_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define CHROMA_444(W, H) \
    p.chroma[X265_CSP_I444].satd[LUMA_ ## W ## x ## H] = p.satd[LUMA_ ## W ## x ## H]; \
    p.chroma[X265_CSP_I444].addAvg[LUMA_ ## W ## x ## H] = addAvg<W, H>; \
    p.chroma[X265_CSP_I444].copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.chroma[X265_CSP_I444].copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.chroma[X265_CSP_I444].copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.chroma[X265_CSP_I444].copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define LUMA(W, H) \
    p.luma_addAvg[LUMA_ ## W ## x ## H] = addAvg<W, H>; \
    p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
    p.luma_copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
    p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
    p.luma_copy_ss[LUMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;

#define LUMA_PIXELSUB(W, H) \
    p.luma_sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.luma_add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;

#define CHROMA_PIXELSUB_420(W, H) \
    p.chroma[X265_CSP_I420].sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.chroma[X265_CSP_I420].add_ps[CHROMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;

#define CHROMA_PIXELSUB_422(W, H) \
    p.chroma[X265_CSP_I422].sub_ps[CHROMA422_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.chroma[X265_CSP_I422].add_ps[CHROMA422_ ## W ## x ## H] = pixel_add_ps_c<W, H>;

#define CHROMA_PIXELSUB_444(W, H) \
    p.chroma[X265_CSP_I444].sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
    p.chroma[X265_CSP_I444].add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
    LUMA_PIXELSUB(4, 4);
    LUMA_PIXELSUB(8, 8);
    LUMA_PIXELSUB(16, 16);
    LUMA_PIXELSUB(32, 32);
    LUMA_PIXELSUB(64, 64);

    CHROMA_PIXELSUB_420(4, 4)
    CHROMA_PIXELSUB_420(8, 8)
    CHROMA_PIXELSUB_420(16, 16)
    CHROMA_PIXELSUB_420(32, 32)

    CHROMA_PIXELSUB_422(4, 8)
    CHROMA_PIXELSUB_422(8, 16)
    CHROMA_PIXELSUB_422(16, 32)
    CHROMA_PIXELSUB_422(32, 64)

    CHROMA_PIXELSUB_444(8, 8)
    CHROMA_PIXELSUB_444(16, 16)
    CHROMA_PIXELSUB_444(32, 32)
    CHROMA_PIXELSUB_444(64, 64)
    SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixel, pixel)
    SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, int16_t, pixel)
    SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, int16_t, int16_t)
    p.blockfill_s[BLOCK_4x4] = blockfil_s_c<4>;
    p.blockfill_s[BLOCK_8x8] = blockfil_s_c<8>;
    p.blockfill_s[BLOCK_16x16] = blockfil_s_c<16>;
    p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>;
    p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>;
    p.cpy2Dto1D_shl[BLOCK_4x4] = cpy2Dto1D_shl<4>;
    p.cpy2Dto1D_shl[BLOCK_8x8] = cpy2Dto1D_shl<8>;
    p.cpy2Dto1D_shl[BLOCK_16x16] = cpy2Dto1D_shl<16>;
    p.cpy2Dto1D_shl[BLOCK_32x32] = cpy2Dto1D_shl<32>;
    p.cpy2Dto1D_shr[BLOCK_4x4] = cpy2Dto1D_shr<4>;
    p.cpy2Dto1D_shr[BLOCK_8x8] = cpy2Dto1D_shr<8>;
    p.cpy2Dto1D_shr[BLOCK_16x16] = cpy2Dto1D_shr<16>;
    p.cpy2Dto1D_shr[BLOCK_32x32] = cpy2Dto1D_shr<32>;
    p.cpy1Dto2D_shl[BLOCK_4x4] = cpy1Dto2D_shl<4>;
    p.cpy1Dto2D_shl[BLOCK_8x8] = cpy1Dto2D_shl<8>;
    p.cpy1Dto2D_shl[BLOCK_16x16] = cpy1Dto2D_shl<16>;
    p.cpy1Dto2D_shl[BLOCK_32x32] = cpy1Dto2D_shl<32>;
    p.cpy1Dto2D_shr[BLOCK_4x4] = cpy1Dto2D_shr<4>;
    p.cpy1Dto2D_shr[BLOCK_8x8] = cpy1Dto2D_shr<8>;
    p.cpy1Dto2D_shr[BLOCK_16x16] = cpy1Dto2D_shr<16>;
    p.cpy1Dto2D_shr[BLOCK_32x32] = cpy1Dto2D_shr<32>;
    p.sa8d[BLOCK_4x4] = satd_4x4;
    p.sa8d[BLOCK_8x8] = sa8d_8x8;
    p.sa8d[BLOCK_16x16] = sa8d_16x16;
    p.sa8d[BLOCK_32x32] = sa8d16<32, 32>;
    p.sa8d[BLOCK_64x64] = sa8d16<64, 64>;
    p.psy_cost_pp[BLOCK_4x4] = psyCost_pp<BLOCK_4x4>;
    p.psy_cost_pp[BLOCK_8x8] = psyCost_pp<BLOCK_8x8>;
    p.psy_cost_pp[BLOCK_16x16] = psyCost_pp<BLOCK_16x16>;
    p.psy_cost_pp[BLOCK_32x32] = psyCost_pp<BLOCK_32x32>;
    p.psy_cost_pp[BLOCK_64x64] = psyCost_pp<BLOCK_64x64>;

    p.psy_cost_ss[BLOCK_4x4] = psyCost_ss<BLOCK_4x4>;
    p.psy_cost_ss[BLOCK_8x8] = psyCost_ss<BLOCK_8x8>;
    p.psy_cost_ss[BLOCK_16x16] = psyCost_ss<BLOCK_16x16>;
    p.psy_cost_ss[BLOCK_32x32] = psyCost_ss<BLOCK_32x32>;
    p.psy_cost_ss[BLOCK_64x64] = psyCost_ss<BLOCK_64x64>;
    p.sa8d_inter[LUMA_4x4] = satd_4x4;
    p.sa8d_inter[LUMA_8x8] = sa8d_8x8;
    p.sa8d_inter[LUMA_8x4] = satd_8x4;
    p.sa8d_inter[LUMA_4x8] = satd4<4, 8>;
    p.sa8d_inter[LUMA_16x16] = sa8d_16x16;
    p.sa8d_inter[LUMA_16x8] = sa8d8<16, 8>;
    p.sa8d_inter[LUMA_8x16] = sa8d8<8, 16>;
    p.sa8d_inter[LUMA_16x12] = satd8<16, 12>;
    p.sa8d_inter[LUMA_12x16] = satd4<12, 16>;
    p.sa8d_inter[LUMA_4x16] = satd4<4, 16>;
    p.sa8d_inter[LUMA_16x4] = satd8<16, 4>;
    p.sa8d_inter[LUMA_32x32] = sa8d16<32, 32>;
    p.sa8d_inter[LUMA_32x16] = sa8d16<32, 16>;
    p.sa8d_inter[LUMA_16x32] = sa8d16<16, 32>;
    p.sa8d_inter[LUMA_32x24] = sa8d8<32, 24>;
    p.sa8d_inter[LUMA_24x32] = sa8d8<24, 32>;
    p.sa8d_inter[LUMA_32x8] = sa8d8<32, 8>;
    p.sa8d_inter[LUMA_8x32] = sa8d8<8, 32>;
    p.sa8d_inter[LUMA_64x64] = sa8d16<64, 64>;
    p.sa8d_inter[LUMA_64x32] = sa8d16<64, 32>;
    p.sa8d_inter[LUMA_32x64] = sa8d16<32, 64>;
    p.sa8d_inter[LUMA_64x48] = sa8d16<64, 48>;
    p.sa8d_inter[LUMA_48x64] = sa8d16<48, 64>;
    p.sa8d_inter[LUMA_64x16] = sa8d16<64, 16>;
    p.sa8d_inter[LUMA_16x64] = sa8d16<16, 64>;
    p.calcresidual[BLOCK_4x4] = getResidual<4>;
    p.calcresidual[BLOCK_8x8] = getResidual<8>;
    p.calcresidual[BLOCK_16x16] = getResidual<16>;
    p.calcresidual[BLOCK_32x32] = getResidual<32>;
    p.calcresidual[BLOCK_64x64] = NULL;
    p.transpose[BLOCK_4x4] = transpose<4>;
    p.transpose[BLOCK_8x8] = transpose<8>;
    p.transpose[BLOCK_16x16] = transpose<16>;
    p.transpose[BLOCK_32x32] = transpose<32>;
    p.transpose[BLOCK_64x64] = transpose<64>;
    p.ssd_s[BLOCK_4x4] = pixel_ssd_s_c<4>;
    p.ssd_s[BLOCK_8x8] = pixel_ssd_s_c<8>;
    p.ssd_s[BLOCK_16x16] = pixel_ssd_s_c<16>;
    p.ssd_s[BLOCK_32x32] = pixel_ssd_s_c<32>;
    p.weight_pp = weight_pp_c;
    p.weight_sp = weight_sp_c;

    p.scale1D_128to64 = scale1D_128to64;
    p.scale2D_64to32 = scale2D_64to32;
    p.frameInitLowres = frame_init_lowres_core;
    p.ssim_4x4x2_core = ssim_4x4x2_core;
    p.ssim_end_4 = ssim_end_4;

    p.var[BLOCK_8x8] = pixel_var<8>;
    p.var[BLOCK_16x16] = pixel_var<16>;
    p.var[BLOCK_32x32] = pixel_var<32>;
    p.var[BLOCK_64x64] = pixel_var<64>;
    p.planecopy_cp = planecopy_cp_c;
    p.planecopy_sp = planecopy_sp_c;
    p.propagateCost = estimateCUPropagateCost;
}
}