Imported Upstream version 1.4+222+hg5f9f7194267b

[deb_x265.git] / source / common / pixel.cpp
diff --git a/source/common/pixel.cpp b/source/common/pixel.cpp

index 3e0530dddb5f162d27249e782c1d6ae4ed86528f..a56b8d7e712e155ed17074f07640aa8a5bc8aaea 100644 (file)
--- a/source/common/pixel.cpp
+++ b/source/common/pixel.cpp
@@ -32,32 +32,32 @@
  
  using namespace x265;
  
-#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, FUNC_TYPE_CAST, DATA_TYPE1, DATA_TYPE2) \
-    p.FUNC_PREFIX[LUMA_4x4]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4,  4, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_8x8]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8,  8, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_8x4]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8,  4, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_4x8]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4,  8, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x8]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16,  8, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_8x16]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x12] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_12x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x4]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16,  4, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_4x16]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x24] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_24x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x8]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32,  8, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_8x32]  = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_64x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_64x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_64x48] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_48x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_64x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
+#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, DATA_TYPE1, DATA_TYPE2) \
+    p.FUNC_PREFIX[LUMA_4x4]   = FUNC_PREFIX_DEF<4,  4, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_8x8]   = FUNC_PREFIX_DEF<8,  8, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_8x4]   = FUNC_PREFIX_DEF<8,  4, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_4x8]   = FUNC_PREFIX_DEF<4,  8, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_16x8]  = FUNC_PREFIX_DEF<16,  8, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_8x16]  = FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_16x4]  = FUNC_PREFIX_DEF<16,  4, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_4x16]  = FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_32x8]  = FUNC_PREFIX_DEF<32,  8, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_8x32]  = FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
  
  #define SET_FUNC_PRIMITIVE_TABLE_C2(FUNC_PREFIX) \
      p.FUNC_PREFIX[LUMA_4x4]   = FUNC_PREFIX<4,  4>; \
@@ -90,16 +90,14 @@ namespace {
  // place functions in anonymous namespace (file static)
  
  template<int lx, int ly>
-int sad(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int sad(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
  {
      int sum = 0;
  
      for (int y = 0; y < ly; y++)
      {
          for (int x = 0; x < lx; x++)
-        {
              sum += abs(pix1[x] - pix2[x]);
-        }
  
          pix1 += stride_pix1;
          pix2 += stride_pix2;
@@ -109,16 +107,14 @@ int sad(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
  }
  
  template<int lx, int ly>
-int sad(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2)
+int sad(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
  {
      int sum = 0;
  
      for (int y = 0; y < ly; y++)
      {
          for (int x = 0; x < lx; x++)
-        {
              sum += abs(pix1[x] - pix2[x]);
-        }
  
          pix1 += stride_pix1;
          pix2 += stride_pix2;
@@ -128,7 +124,7 @@ int sad(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2
  }
  
  template<int lx, int ly>
-void sad_x3(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, intptr_t frefstride, int32_t *res)
+void sad_x3(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
  {
      res[0] = 0;
      res[1] = 0;
@@ -150,7 +146,7 @@ void sad_x3(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, intptr_t frefstr
  }
  
  template<int lx, int ly>
-void sad_x4(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, pixel *pix5, intptr_t frefstride, int32_t *res)
+void sad_x4(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
  {
      res[0] = 0;
      res[1] = 0;
@@ -175,17 +171,17 @@ void sad_x4(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, pixel *pix5, int
  }
  
  template<int lx, int ly, class T1, class T2>
-int sse(T1 *pix1, intptr_t stride_pix1, T2 *pix2, intptr_t stride_pix2)
+int sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
  {
      int sum = 0;
-    int iTemp;
+    int tmp;
  
      for (int y = 0; y < ly; y++)
      {
          for (int x = 0; x < lx; x++)
          {
-            iTemp = pix1[x] - pix2[x];
-            sum += (iTemp * iTemp);
+            tmp = pix1[x] - pix2[x];
+            sum += (tmp * tmp);
          }
  
          pix1 += stride_pix1;
@@ -217,7 +213,7 @@ inline sum2_t abs2(sum2_t a)
      return (a + s) ^ s;
  }
  
-int satd_4x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
  {
      sum2_t tmp[4][2];
      sum2_t a0, a1, a2, a3, b0, b1;
@@ -245,7 +241,7 @@ int satd_4x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix
      return (int)(sum >> 1);
  }
  
-int satd_4x4(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2)
+int satd_4x4(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
  {
      ssum2_t tmp[4][2];
      ssum2_t a0, a1, a2, a3, b0, b1;
@@ -274,7 +270,7 @@ int satd_4x4(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride
  }
  
  // x264's SWAR version of satd 8x4, performs two 4x4 SATDs at once
-int satd_8x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
  {
      sum2_t tmp[4][4];
      sum2_t a0, a1, a2, a3;
@@ -300,41 +296,33 @@ int satd_8x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix
  
  template<int w, int h>
  // calculate satd in blocks of 4x4
-int satd4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
  {
      int satd = 0;
  
      for (int row = 0; row < h; row += 4)
-    {
          for (int col = 0; col < w; col += 4)
-        {
              satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
                               pix2 + row * stride_pix2 + col, stride_pix2);
-        }
-    }
  
      return satd;
  }
  
  template<int w, int h>
  // calculate satd in blocks of 8x4
-int satd8(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int satd8(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
  {
      int satd = 0;
  
      for (int row = 0; row < h; row += 4)
-    {
          for (int col = 0; col < w; col += 8)
-        {
              satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
                               pix2 + row * stride_pix2 + col, stride_pix2);
-        }
-    }
  
      return satd;
  }
  
-inline int _sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+inline int _sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
  {
      sum2_t tmp[8][4];
      sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
@@ -371,12 +359,12 @@ inline int _sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
      return (int)sum;
  }
  
-int sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+int sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
  {
      return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
  }
  
-inline int _sa8d_8x8(int16_t *pix1, intptr_t i_pix1, int16_t *pix2, intptr_t i_pix2)
+inline int _sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2)
  {
      ssum2_t tmp[8][4];
      ssum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
@@ -413,12 +401,12 @@ inline int _sa8d_8x8(int16_t *pix1, intptr_t i_pix1, int16_t *pix2, intptr_t i_p
      return (int)sum;
  }
  
-int sa8d_8x8(int16_t *pix1, intptr_t i_pix1, int16_t *pix2, intptr_t i_pix2)
+int sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2)
  {
      return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
  }
  
-int sa8d_16x16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+int sa8d_16x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
  {
      int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2)
          + _sa8d_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2)
@@ -432,159 +420,129 @@ int sa8d_16x16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
  
  template<int w, int h>
  // Calculate sa8d in blocks of 8x8
-int sa8d8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+int sa8d8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
  {
      int cost = 0;
  
      for (int y = 0; y < h; y += 8)
-    {
          for (int x = 0; x < w; x += 8)
-        {
              cost += sa8d_8x8(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
-        }
-    }
  
      return cost;
  }
  
  template<int w, int h>
  // Calculate sa8d in blocks of 16x16
-int sa8d16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+int sa8d16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
  {
      int cost = 0;
  
      for (int y = 0; y < h; y += 16)
-    {
          for (int x = 0; x < w; x += 16)
-        {
              cost += sa8d_16x16(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
-        }
-    }
  
      return cost;
  }
  
  template<int size>
-int pixel_ssd_s_c(short *a, intptr_t dstride)
+int pixel_ssd_s_c(const int16_t* a, intptr_t dstride)
  {
      int sum = 0;
      for (int y = 0; y < size; y++)
      {
          for (int x = 0; x < size; x++)
-        {
              sum += a[x] * a[x];
-        }
+
          a += dstride;
      }
      return sum;
  }
  
  template<int size>
-void blockfil_s_c(int16_t *dst, intptr_t dstride, int16_t val)
+void blockfil_s_c(int16_t* dst, intptr_t dstride, int16_t val)
  {
      for (int y = 0; y < size; y++)
-    {
          for (int x = 0; x < size; x++)
-        {
              dst[y * dstride + x] = val;
-        }
-    }
-}
-
-void convert16to32_shl(int32_t *dst, int16_t *src, intptr_t stride, int shift, int size)
-{
-    for (int i = 0; i < size; i++)
-    {
-        for (int j = 0; j < size; j++)
-        {
-            dst[i * size + j] = ((int)src[i * stride + j]) << shift;
-        }
-    }
  }
  
  template<int size>
-void convert16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset)
+void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
  {
-    for (int i = 0; i < size; i++)
-    {
-        for (int j = 0; j < size; j++)
-        {
-            dst[i * size + j] = ((int)src[i * stride + j] + offset) >> shift;
-        }
-    }
-}
-
-void convert32to16_shr(int16_t *dst, int32_t *src, intptr_t stride, int shift, int size)
-{
-    int round = 1 << (shift - 1);
+    X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
+    X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
+    X265_CHECK(shift >= 0, "invalid shift\n");
  
      for (int i = 0; i < size; i++)
      {
          for (int j = 0; j < size; j++)
-        {
-            dst[j] = (int16_t)((src[j] + round) >> shift);
-        }
+            dst[j] = src[j] << shift;
  
-        src += size;
-        dst += stride;
+        src += srcStride;
+        dst += size;
      }
  }
  
-void copy_shr(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size)
+template<int size>
+void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
  {
-    int round = 1 << (shift - 1);
+    X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
+    X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
+    X265_CHECK(shift > 0, "invalid shift\n");
  
+    int16_t round = 1 << (shift - 1);
      for (int i = 0; i < size; i++)
      {
          for (int j = 0; j < size; j++)
-        {
-            dst[j] = (int16_t)((src[j] + round) >> shift);
-        }
+            dst[j] = (src[j] + round) >> shift;
  
-        src += size;
-        dst += stride;
+        src += srcStride;
+        dst += size;
      }
  }
  
  template<int size>
-void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
+void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
  {
+    X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
+    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
+    X265_CHECK(shift >= 0, "invalid shift\n");
+
      for (int i = 0; i < size; i++)
      {
          for (int j = 0; j < size; j++)
-        {
-            dst[j] = ((int16_t)src[j] << shift);
-        }
+            dst[j] = src[j] << shift;
  
          src += size;
-        dst += stride;
+        dst += dstStride;
      }
  }
  
  template<int size>
-void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
  {
+    X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
+    X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
+    X265_CHECK(shift > 0, "invalid shift\n");
+
+    int16_t round = 1 << (shift - 1);
      for (int i = 0; i < size; i++)
      {
          for (int j = 0; j < size; j++)
-        {
-            dst[j] = (src[j] << shift);
-        }
+            dst[j] = (src[j] + round) >> shift;
  
          src += size;
-        dst += stride;
+        dst += dstStride;
      }
  }
  
  template<int blockSize>
-void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
+void getResidual(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)
  {
      for (int y = 0; y < blockSize; y++)
      {
          for (int x = 0; x < blockSize; x++)
-        {
              residual[x] = static_cast<int16_t>(fenc[x]) - static_cast<int16_t>(pred[x]);
-        }
  
          fenc += stride;
          residual += stride;
@@ -593,18 +551,14 @@ void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
  }
  
  template<int blockSize>
-void transpose(pixel* dst, pixel* src, intptr_t stride)
+void transpose(pixel* dst, const pixel* src, intptr_t stride)
  {
      for (int k = 0; k < blockSize; k++)
-    {
          for (int l = 0; l < blockSize; l++)
-        {
              dst[k * blockSize + l] = src[l * stride + k];
-        }
-    }
  }
  
-void weight_sp_c(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
+void weight_sp_c(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
  {
      int x, y;
  
@@ -622,7 +576,7 @@ void weight_sp_c(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStrid
      }
  }
  
-void weight_pp_c(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
+void weight_pp_c(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
  {
      int x, y;
  
@@ -646,14 +600,12 @@ void weight_pp_c(pixel *src, pixel *dst, intptr_t stride, int width, int height,
  }
  
  template<int lx, int ly>
-void pixelavg_pp(pixel* dst, intptr_t dstride, pixel* src0, intptr_t sstride0, pixel* src1, intptr_t sstride1, int)
+void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
  {
      for (int y = 0; y < ly; y++)
      {
          for (int x = 0; x < lx; x++)
-        {
              dst[x] = (src0[x] + src1[x] + 1) >> 1;
-        }
  
          src0 += sstride0;
          src1 += sstride1;
@@ -661,7 +613,7 @@ void pixelavg_pp(pixel* dst, intptr_t dstride, pixel* src0, intptr_t sstride0, p
      }
  }
  
-void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/)
+void scale1D_128to64(pixel* dst, const pixel* src, intptr_t /*stride*/)
  {
      int x;
  
@@ -675,9 +627,9 @@ void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/)
      }
  }
  
-void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
+void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
  {
-    int x, y;
+    uint32_t x, y;
  
      for (y = 0; y < 64; y += 2)
      {
@@ -694,13 +646,13 @@ void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
      }
  }
  
-void frame_init_lowres_core(pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
+void frame_init_lowres_core(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc,
                              intptr_t src_stride, intptr_t dst_stride, int width, int height)
  {
      for (int y = 0; y < height; y++)
      {
-        pixel *src1 = src0 + src_stride;
-        pixel *src2 = src1 + src_stride;
+        const pixel* src1 = src0 + src_stride;
+        const pixel* src2 = src1 + src_stride;
          for (int x = 0; x < width; x++)
          {
              // slower than naive bilinear, but matches asm
@@ -720,7 +672,7 @@ void frame_init_lowres_core(pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv,
  }
  
  /* structural similarity metric */
-void ssim_4x4x2_core(const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, int sums[2][4])
+void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
  {
      for (int z = 0; z < 2; z++)
      {
@@ -794,7 +746,7 @@ float ssim_end_4(int sum0[5][4], int sum1[5][4], int width)
  }
  
  template<int size>
-uint64_t pixel_var(pixel *pix, intptr_t i_stride)
+uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
  {
      uint32_t sum = 0, sqr = 0;
  
@@ -817,7 +769,7 @@ uint64_t pixel_var(pixel *pix, intptr_t i_stride)
  #endif
  
  template<int size>
-int psyCost_pp(pixel *source, intptr_t sstride, pixel *recon, intptr_t rstride)
+int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
  {
      static pixel zeroBuf[8] /* = { 0 } */;
  
@@ -850,7 +802,7 @@ int psyCost_pp(pixel *source, intptr_t sstride, pixel *recon, intptr_t rstride)
  }
  
  template<int size>
-int psyCost_ss(int16_t *source, intptr_t sstride, int16_t *recon, intptr_t rstride)
+int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
  {
      static int16_t zeroBuf[8] /* = { 0 } */;
  
@@ -882,28 +834,13 @@ int psyCost_ss(int16_t *source, intptr_t sstride, int16_t *recon, intptr_t rstri
      }
  }
  
-void plane_copy_deinterleave_chroma(pixel *dstu, intptr_t dstuStride, pixel *dstv, intptr_t dstvStride,
-                                    pixel *src,  intptr_t srcStride, int w, int h)
-{
-    for (int y = 0; y < h; y++, dstu += dstuStride, dstv += dstvStride, src += srcStride)
-    {
-        for (int x = 0; x < w; x++)
-        {
-            dstu[x] = src[2 * x];
-            dstv[x] = src[2 * x + 1];
-        }
-    }
-}
-
  template<int bx, int by>
-void blockcopy_pp_c(pixel *a, intptr_t stridea, pixel *b, intptr_t strideb)
+void blockcopy_pp_c(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb)
  {
      for (int y = 0; y < by; y++)
      {
          for (int x = 0; x < bx; x++)
-        {
              a[x] = b[x];
-        }
  
          a += stridea;
          b += strideb;
@@ -911,14 +848,12 @@ void blockcopy_pp_c(pixel *a, intptr_t stridea, pixel *b, intptr_t strideb)
  }
  
  template<int bx, int by>
-void blockcopy_ss_c(int16_t *a, intptr_t stridea, int16_t *b, intptr_t strideb)
+void blockcopy_ss_c(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
  {
      for (int y = 0; y < by; y++)
      {
          for (int x = 0; x < bx; x++)
-        {
              a[x] = b[x];
-        }
  
          a += stridea;
          b += strideb;
@@ -926,7 +861,7 @@ void blockcopy_ss_c(int16_t *a, intptr_t stridea, int16_t *b, intptr_t strideb)
  }
  
  template<int bx, int by>
-void blockcopy_sp_c(pixel *a, intptr_t stridea, int16_t *b, intptr_t strideb)
+void blockcopy_sp_c(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
  {
      for (int y = 0; y < by; y++)
      {
@@ -942,14 +877,12 @@ void blockcopy_sp_c(pixel *a, intptr_t stridea, int16_t *b, intptr_t strideb)
  }
  
  template<int bx, int by>
-void blockcopy_ps_c(int16_t *a, intptr_t stridea, pixel *b, intptr_t strideb)
+void blockcopy_ps_c(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
  {
      for (int y = 0; y < by; y++)
      {
          for (int x = 0; x < bx; x++)
-        {
              a[x] = (int16_t)b[x];
-        }
  
          a += stridea;
          b += strideb;
@@ -957,14 +890,12 @@ void blockcopy_ps_c(int16_t *a, intptr_t stridea, pixel *b, intptr_t strideb)
  }
  
  template<int bx, int by>
-void pixel_sub_ps_c(int16_t *a, intptr_t dstride, pixel *b0, pixel *b1, intptr_t sstride0, intptr_t sstride1)
+void pixel_sub_ps_c(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1)
  {
      for (int y = 0; y < by; y++)
      {
          for (int x = 0; x < bx; x++)
-        {
              a[x] = (int16_t)(b0[x] - b1[x]);
-        }
  
          b0 += sstride0;
          b1 += sstride1;
@@ -973,14 +904,12 @@ void pixel_sub_ps_c(int16_t *a, intptr_t dstride, pixel *b0, pixel *b1, intptr_t
  }
  
  template<int bx, int by>
-void pixel_add_ps_c(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t sstride0, intptr_t sstride1)
+void pixel_add_ps_c(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1)
  {
      for (int y = 0; y < by; y++)
      {
          for (int x = 0; x < bx; x++)
-        {
              a[x] = Clip(b0[x] + b1[x]);
-        }
  
          b0 += sstride0;
          b1 += sstride1;
@@ -989,7 +918,7 @@ void pixel_add_ps_c(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t
  }
  
  template<int bx, int by>
-void addAvg(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
+void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
  {
      int shiftNum, offset;
  
@@ -1010,28 +939,24 @@ void addAvg(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intpt
      }
  }
  
-void planecopy_cp_c(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
+void planecopy_cp_c(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
  {
      for (int r = 0; r < height; r++)
      {
          for (int c = 0; c < width; c++)
-        {
              dst[c] = ((pixel)src[c]) << shift;
-        }
  
          dst += dstStride;
          src += srcStride;
      }
  }
  
-void planecopy_sp_c(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
+void planecopy_sp_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
  {
      for (int r = 0; r < height; r++)
      {
          for (int c = 0; c < width; c++)
-        {
              dst[c] = (pixel)((src[c] >> shift) & mask);
-        }
  
          dst += dstStride;
          src += srcStride;
@@ -1040,8 +965,8 @@ void planecopy_sp_c(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstS
  
  /* Estimate the total amount of influence on future quality that could be had if we
   * were to improve the reference samples used to inter predict any given CU. */
-void estimateCUPropagateCost(int *dst, uint16_t *propagateIn, int32_t *intraCosts, uint16_t *interCosts,
-                             int32_t *invQscales, double *fpsFactor, int len)
+void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
+                             const int32_t* invQscales, const double* fpsFactor, int len)
  {
      double fps = *fpsFactor / 256;
  
@@ -1068,12 +993,12 @@ void extendPicBorder(pixel* pic, intptr_t stride, int width, int height, int mar
      primitives.extendRowBorder(pic, stride, width, height, marginX);
  
      /* copy top row to create above margin */
-    pixel *top = pic - marginX;
+    pixel* top = pic - marginX;
      for (int y = 0; y < marginY; y++)
          memcpy(top - (y + 1) * stride, top, stride * sizeof(pixel));
  
      /* copy bottom row to create below margin */
-    pixel *bot = pic - marginX + (height - 1) * stride;
+    pixel* bot = pic - marginX + (height - 1) * stride;
      for (int y = 0; y < marginY; y++)
          memcpy(bot + (y + 1) * stride, bot, stride * sizeof(pixel));
  }
@@ -1113,6 +1038,62 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p)
      p.satd[LUMA_64x16] = satd8<64, 16>;
      p.satd[LUMA_16x64] = satd8<16, 64>;
  
+    p.chroma[X265_CSP_I420].satd[CHROMA_2x2]   = NULL;
+    p.chroma[X265_CSP_I420].satd[CHROMA_4x4]   = satd_4x4;
+    p.chroma[X265_CSP_I420].satd[CHROMA_8x8]   = satd8<8, 8>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_16x16] = satd8<16, 16>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_32x32] = satd8<32, 32>;
+
+    p.chroma[X265_CSP_I420].satd[CHROMA_4x2]   = NULL;
+    p.chroma[X265_CSP_I420].satd[CHROMA_2x4]   = NULL;
+    p.chroma[X265_CSP_I420].satd[CHROMA_8x4]   = satd_8x4;
+    p.chroma[X265_CSP_I420].satd[CHROMA_4x8]   = satd4<4, 8>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_16x8]  = satd8<16, 8>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_8x16]  = satd8<8, 16>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_32x16] = satd8<32, 16>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_16x32] = satd8<16, 32>;
+
+    p.chroma[X265_CSP_I420].satd[CHROMA_8x6]   = NULL;
+    p.chroma[X265_CSP_I420].satd[CHROMA_6x8]   = NULL;
+    p.chroma[X265_CSP_I420].satd[CHROMA_8x2]   = NULL;
+    p.chroma[X265_CSP_I420].satd[CHROMA_2x8]   = NULL;
+    p.chroma[X265_CSP_I420].satd[CHROMA_16x12] = satd4<16, 12>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_12x16] = satd4<12, 16>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_16x4]  = satd4<16, 4>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_4x16]  = satd4<4, 16>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_32x24] = satd8<32, 24>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_24x32] = satd8<24, 32>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_32x8]  = satd8<32, 8>;
+    p.chroma[X265_CSP_I420].satd[CHROMA_8x32]  = satd8<8, 32>;
+
+    p.chroma[X265_CSP_I422].satd[CHROMA422_2x4]   = NULL;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_4x8]   = satd4<4, 8>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_8x16]  = satd8<8, 16>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_16x32] = satd8<16, 32>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_32x64] = satd8<32, 64>;
+
+    p.chroma[X265_CSP_I422].satd[CHROMA422_4x4]   = satd_4x4;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_2x8]   = NULL;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_8x8]   = satd8<8, 8>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_4x16]  = satd4<4, 16>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_16x16] = satd8<16, 16>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_8x32]  = satd8<8, 32>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_32x32] = satd8<32, 32>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_16x64] = satd8<16, 64>;
+
+    p.chroma[X265_CSP_I422].satd[CHROMA422_8x12]  = satd4<8, 12>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_6x16]  = NULL;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_8x4]   = satd4<8, 4>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_2x16]  = NULL;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_16x24] = satd8<16, 24>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_12x32] = satd4<12, 32>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_16x8]  = satd8<16, 8>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_4x32]  = satd4<4, 32>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_32x48] = satd8<32, 48>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_24x64] = satd8<24, 64>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_32x16] = satd8<32, 16>;
+    p.chroma[X265_CSP_I422].satd[CHROMA422_8x64]  = satd8<8, 64>;
+
  #define CHROMA_420(W, H) \
      p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H]  = addAvg<W, H>;         \
      p.chroma[X265_CSP_I420].copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
@@ -1121,13 +1102,14 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p)
      p.chroma[X265_CSP_I420].copy_ss[CHROMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
  
  #define CHROMA_422(W, H) \
-    p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = addAvg<W, H>;         \
+    p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H]  = addAvg<W, H>;         \
      p.chroma[X265_CSP_I422].copy_pp[CHROMA422_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
      p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
      p.chroma[X265_CSP_I422].copy_ps[CHROMA422_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
      p.chroma[X265_CSP_I422].copy_ss[CHROMA422_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
  
  #define CHROMA_444(W, H) \
+    p.chroma[X265_CSP_I444].satd[LUMA_ ## W ## x ## H]    = p.satd[LUMA_ ## W ## x ## H]; \
      p.chroma[X265_CSP_I444].addAvg[LUMA_ ## W ## x ## H]  = addAvg<W, H>; \
      p.chroma[X265_CSP_I444].copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
      p.chroma[X265_CSP_I444].copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
@@ -1157,8 +1139,6 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p)
      p.chroma[X265_CSP_I444].sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
      p.chroma[X265_CSP_I444].add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
  
-
-
      LUMA(4, 4);
      LUMA(8, 8);
      CHROMA_420(4, 4);
@@ -1278,9 +1258,9 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p)
      CHROMA_444(64, 16);
      CHROMA_444(16, 64);
  
-    SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixelcmp_t, pixel, pixel)
-    SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, pixelcmp_sp_t, int16_t, pixel)
-    SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, pixelcmp_ss_t, int16_t, int16_t)
+    SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixel, pixel)
+    SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, int16_t, pixel)
+    SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, int16_t, int16_t)
  
      p.blockfill_s[BLOCK_4x4]   = blockfil_s_c<4>;
      p.blockfill_s[BLOCK_8x8]   = blockfil_s_c<8>;
@@ -1288,22 +1268,22 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p)
      p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>;
      p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>;
  
-    p.cvt16to32_shl = convert16to32_shl;
-    p.cvt16to32_shr[BLOCK_4x4] = convert16to32_shr<4>;
-    p.cvt16to32_shr[BLOCK_8x8] = convert16to32_shr<8>;
-    p.cvt16to32_shr[BLOCK_16x16] = convert16to32_shr<16>;
-    p.cvt16to32_shr[BLOCK_32x32] = convert16to32_shr<32>;
-    p.cvt32to16_shr = convert32to16_shr;
-    p.cvt32to16_shl[BLOCK_4x4] = convert32to16_shl<4>;
-    p.cvt32to16_shl[BLOCK_8x8] = convert32to16_shl<8>;
-    p.cvt32to16_shl[BLOCK_16x16] = convert32to16_shl<16>;
-    p.cvt32to16_shl[BLOCK_32x32] = convert32to16_shl<32>;
-
-    p.copy_shr = copy_shr;
-    p.copy_shl[BLOCK_4x4] = copy_shl<4>;
-    p.copy_shl[BLOCK_8x8] = copy_shl<8>;
-    p.copy_shl[BLOCK_16x16] = copy_shl<16>;
-    p.copy_shl[BLOCK_32x32] = copy_shl<32>;
+    p.cpy2Dto1D_shl[BLOCK_4x4] = cpy2Dto1D_shl<4>;
+    p.cpy2Dto1D_shl[BLOCK_8x8] = cpy2Dto1D_shl<8>;
+    p.cpy2Dto1D_shl[BLOCK_16x16] = cpy2Dto1D_shl<16>;
+    p.cpy2Dto1D_shl[BLOCK_32x32] = cpy2Dto1D_shl<32>;
+    p.cpy2Dto1D_shr[BLOCK_4x4] = cpy2Dto1D_shr<4>;
+    p.cpy2Dto1D_shr[BLOCK_8x8] = cpy2Dto1D_shr<8>;
+    p.cpy2Dto1D_shr[BLOCK_16x16] = cpy2Dto1D_shr<16>;
+    p.cpy2Dto1D_shr[BLOCK_32x32] = cpy2Dto1D_shr<32>;
+    p.cpy1Dto2D_shl[BLOCK_4x4] = cpy1Dto2D_shl<4>;
+    p.cpy1Dto2D_shl[BLOCK_8x8] = cpy1Dto2D_shl<8>;
+    p.cpy1Dto2D_shl[BLOCK_16x16] = cpy1Dto2D_shl<16>;
+    p.cpy1Dto2D_shl[BLOCK_32x32] = cpy1Dto2D_shl<32>;
+    p.cpy1Dto2D_shr[BLOCK_4x4] = cpy1Dto2D_shr<4>;
+    p.cpy1Dto2D_shr[BLOCK_8x8] = cpy1Dto2D_shr<8>;
+    p.cpy1Dto2D_shr[BLOCK_16x16] = cpy1Dto2D_shr<16>;
+    p.cpy1Dto2D_shr[BLOCK_32x32] = cpy1Dto2D_shr<32>;
  
      p.sa8d[BLOCK_4x4]   = satd_4x4;
      p.sa8d[BLOCK_8x8]   = sa8d_8x8;
@@ -1371,7 +1351,7 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p)
  
      p.scale1D_128to64 = scale1D_128to64;
      p.scale2D_64to32 = scale2D_64to32;
-    p.frame_init_lowres_core = frame_init_lowres_core;
+    p.frameInitLowres = frame_init_lowres_core;
      p.ssim_4x4x2_core = ssim_4x4x2_core;
      p.ssim_end_4 = ssim_end_4;
  
@@ -1379,7 +1359,6 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p)
      p.var[BLOCK_16x16] = pixel_var<16>;
      p.var[BLOCK_32x32] = pixel_var<32>;
      p.var[BLOCK_64x64] = pixel_var<64>;
-    p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma;
      p.planecopy_cp = planecopy_cp_c;
      p.planecopy_sp = planecopy_sp_c;
      p.propagateCost = estimateCUPropagateCost;