using namespace x265;
-#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, FUNC_TYPE_CAST, DATA_TYPE1, DATA_TYPE2) \
- p.FUNC_PREFIX[LUMA_4x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 4, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_8x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 8, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_8x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 4, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_4x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 8, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 8, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_8x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x12] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_12x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 4, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_4x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_32x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_32x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_32x24] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_24x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_32x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 8, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_8x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_64x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_64x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_32x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_64x48] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_48x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_64x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
- p.FUNC_PREFIX[LUMA_16x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
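+// instantiates FUNC_PREFIX_DEF<W, H, ...> for every luma partition; the
+// template instantiations match the primitive table's function-pointer
+// types directly, so no cast parameter is needed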
+#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, DATA_TYPE1, DATA_TYPE2) \
+ p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX_DEF<4, 4, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_8x8] = FUNC_PREFIX_DEF<8, 8, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_8x4] = FUNC_PREFIX_DEF<8, 4, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_4x8] = FUNC_PREFIX_DEF<4, 8, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_16x8] = FUNC_PREFIX_DEF<16, 8, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_8x16] = FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_16x4] = FUNC_PREFIX_DEF<16, 4, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_4x16] = FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_32x8] = FUNC_PREFIX_DEF<32, 8, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_8x32] = FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
+ p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
#define SET_FUNC_PRIMITIVE_TABLE_C2(FUNC_PREFIX) \
p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX<4, 4>; \
// place functions in anonymous namespace (file static)
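+// sum of absolute differences (SAD): reference C for an lx-by-ly block;
+// the overloads below differ only in sample type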
template<int lx, int ly>
-int sad(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int sad(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
int sum = 0;
for (int y = 0; y < ly; y++)
{
for (int x = 0; x < lx; x++)
- {
sum += abs(pix1[x] - pix2[x]);
- }
pix1 += stride_pix1;
pix2 += stride_pix2;
}
template<int lx, int ly>
-int sad(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2)
+int sad(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
{
int sum = 0;
for (int y = 0; y < ly; y++)
{
for (int x = 0; x < lx; x++)
- {
sum += abs(pix1[x] - pix2[x]);
- }
pix1 += stride_pix1;
pix2 += stride_pix2;
}
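+// sad_x3/sad_x4 compare one source block (pix1) against three or four
+// reference blocks that share frefstride, writing each SAD into res[]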
template<int lx, int ly>
-void sad_x3(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, intptr_t frefstride, int32_t *res)
+void sad_x3(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
{
res[0] = 0;
res[1] = 0;
}
template<int lx, int ly>
-void sad_x4(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, pixel *pix5, intptr_t frefstride, int32_t *res)
+void sad_x4(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
{
res[0] = 0;
res[1] = 0;
}
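+// sum of squared errors; templated on both sample types so one body serves
+// the pp, sp and ss comparisons registered below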
template<int lx, int ly, class T1, class T2>
-int sse(T1 *pix1, intptr_t stride_pix1, T2 *pix2, intptr_t stride_pix2)
+int sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
{
int sum = 0;
- int iTemp;
+ int tmp;
for (int y = 0; y < ly; y++)
{
for (int x = 0; x < lx; x++)
{
- iTemp = pix1[x] - pix2[x];
- sum += (iTemp * iTemp);
+ tmp = pix1[x] - pix2[x];
+ sum += (tmp * tmp);
}
pix1 += stride_pix1;
return (a + s) ^ s;
}
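+// satd_4x4: sum of absolute Hadamard-transformed differences of a 4x4
+// block (x264's butterfly scheme); the final >> 1 normalizes the transform gain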
-int satd_4x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
sum2_t tmp[4][2];
sum2_t a0, a1, a2, a3, b0, b1;
return (int)(sum >> 1);
}
-int satd_4x4(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2)
+int satd_4x4(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
{
ssum2_t tmp[4][2];
ssum2_t a0, a1, a2, a3, b0, b1;
}
// x264's SWAR version of satd_8x4; it performs two 4x4 SATDs at once
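+// (SWAR: SIMD within a register; the two 4x4 halves travel in the low and
+// high 16 bits of each 32-bit sum2_t lane, so one transform pass covers both)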
-int satd_8x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
sum2_t tmp[4][4];
sum2_t a0, a1, a2, a3;
template<int w, int h>
// calculate satd in blocks of 4x4
-int satd4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
int satd = 0;
for (int row = 0; row < h; row += 4)
- {
for (int col = 0; col < w; col += 4)
- {
satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
pix2 + row * stride_pix2 + col, stride_pix2);
- }
- }
return satd;
}
template<int w, int h>
// calculate satd in blocks of 8x4
-int satd8(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
+int satd8(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
int satd = 0;
for (int row = 0; row < h; row += 4)
- {
for (int col = 0; col < w; col += 8)
- {
satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
pix2 + row * stride_pix2 + col, stride_pix2);
- }
- }
return satd;
}
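+// sa8d: sum of absolute values of an 8x8 Hadamard transform of the
+// difference block; the wrappers' (sum + 2) >> 2 normalizes the transform gain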
-inline int _sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+inline int _sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
sum2_t tmp[8][4];
sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
return (int)sum;
}
-int sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+int sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
}
-inline int _sa8d_8x8(int16_t *pix1, intptr_t i_pix1, int16_t *pix2, intptr_t i_pix2)
+inline int _sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2)
{
ssum2_t tmp[8][4];
ssum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
return (int)sum;
}
-int sa8d_8x8(int16_t *pix1, intptr_t i_pix1, int16_t *pix2, intptr_t i_pix2)
+int sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2)
{
return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
}
-int sa8d_16x16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+int sa8d_16x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2)
+ _sa8d_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2)
template<int w, int h>
// Calculate sa8d in blocks of 8x8
-int sa8d8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+int sa8d8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
int cost = 0;
for (int y = 0; y < h; y += 8)
- {
for (int x = 0; x < w; x += 8)
- {
cost += sa8d_8x8(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
- }
- }
return cost;
}
template<int w, int h>
// Calculate sa8d in blocks of 16x16
-int sa8d16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2)
+int sa8d16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
int cost = 0;
for (int y = 0; y < h; y += 16)
- {
for (int x = 0; x < w; x += 16)
- {
cost += sa8d_16x16(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2);
- }
- }
return cost;
}
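+// sum of squared 16-bit coefficients of a strided block (its energy)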
template<int size>
-int pixel_ssd_s_c(short *a, intptr_t dstride)
+int pixel_ssd_s_c(const int16_t* a, intptr_t dstride)
{
int sum = 0;
for (int y = 0; y < size; y++)
{
for (int x = 0; x < size; x++)
- {
sum += a[x] * a[x];
- }
+
a += dstride;
}
return sum;
}
template<int size>
-void blockfil_s_c(int16_t *dst, intptr_t dstride, int16_t val)
+void blockfil_s_c(int16_t* dst, intptr_t dstride, int16_t val)
{
for (int y = 0; y < size; y++)
- {
for (int x = 0; x < size; x++)
- {
dst[y * dstride + x] = val;
- }
- }
-}
-
-void convert16to32_shl(int32_t *dst, int16_t *src, intptr_t stride, int shift, int size)
-{
- for (int i = 0; i < size; i++)
- {
- for (int j = 0; j < size; j++)
- {
- dst[i * size + j] = ((int)src[i * stride + j]) << shift;
- }
- }
}
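+// the cpy2Dto1D_*/cpy1Dto2D_* primitives below copy coefficient blocks
+// between a strided 2D buffer and a dense size*size 1D buffer, shifting
+// left, or right with rounding, along the way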
template<int size>
-void convert16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset)
+void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
{
- for (int i = 0; i < size; i++)
- {
- for (int j = 0; j < size; j++)
- {
- dst[i * size + j] = ((int)src[i * stride + j] + offset) >> shift;
- }
- }
-}
-
-void convert32to16_shr(int16_t *dst, int32_t *src, intptr_t stride, int shift, int size)
-{
- int round = 1 << (shift - 1);
+ X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
+ X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
+ X265_CHECK(shift >= 0, "invalid shift\n");
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++)
- {
- dst[j] = (int16_t)((src[j] + round) >> shift);
- }
+ dst[j] = src[j] << shift;
- src += size;
- dst += stride;
+ src += srcStride;
+ dst += size;
}
}
-void copy_shr(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size)
+template<int size>
+void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
{
- int round = 1 << (shift - 1);
+ X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
+ X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
+ X265_CHECK(shift > 0, "invalid shift\n");
+ int16_t round = 1 << (shift - 1);
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++)
- {
- dst[j] = (int16_t)((src[j] + round) >> shift);
- }
+ dst[j] = (src[j] + round) >> shift;
- src += size;
- dst += stride;
+ src += srcStride;
+ dst += size;
}
}
template<int size>
-void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
+void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
{
+ X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
+ X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
+ X265_CHECK(shift >= 0, "invalid shift\n");
+
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++)
- {
- dst[j] = ((int16_t)src[j] << shift);
- }
+ dst[j] = src[j] << shift;
src += size;
- dst += stride;
+ dst += dstStride;
}
}
template<int size>
-void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
{
+ X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
+ X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
+ X265_CHECK(shift > 0, "invalid shift\n");
+
+ int16_t round = 1 << (shift - 1);
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++)
- {
- dst[j] = (src[j] << shift);
- }
+ dst[j] = (src[j] + round) >> shift;
src += size;
- dst += stride;
+ dst += dstStride;
}
}
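+// per-sample difference fenc - pred, widened to a 16-bit residual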
template<int blockSize>
-void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
+void getResidual(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)
{
for (int y = 0; y < blockSize; y++)
{
for (int x = 0; x < blockSize; x++)
- {
residual[x] = static_cast<int16_t>(fenc[x]) - static_cast<int16_t>(pred[x]);
- }
fenc += stride;
residual += stride;
}
template<int blockSize>
-void transpose(pixel* dst, pixel* src, intptr_t stride)
+void transpose(pixel* dst, const pixel* src, intptr_t stride)
{
for (int k = 0; k < blockSize; k++)
- {
for (int l = 0; l < blockSize; l++)
- {
dst[k * blockSize + l] = src[l * stride + k];
- }
- }
}
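+// weighted prediction: roughly dst = Clip(((w0 * src + round) >> shift) + offset)
+// per sample; _sp consumes 16-bit intermediates, _pp plain pixels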
-void weight_sp_c(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
+void weight_sp_c(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
{
int x, y;
}
}
-void weight_pp_c(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
+void weight_pp_c(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
{
int x, y;
}
template<int lx, int ly>
-void pixelavg_pp(pixel* dst, intptr_t dstride, pixel* src0, intptr_t sstride0, pixel* src1, intptr_t sstride1, int)
+void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
{
for (int y = 0; y < ly; y++)
{
for (int x = 0; x < lx; x++)
- {
dst[x] = (src0[x] + src1[x] + 1) >> 1;
- }
src0 += sstride0;
src1 += sstride1;
}
}
-void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/)
+void scale1D_128to64(pixel* dst, const pixel* src, intptr_t /*stride*/)
{
int x;
}
}
-void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
+void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
{
- int x, y;
+ uint32_t x, y;
for (y = 0; y < 64; y += 2)
{
}
}
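+// builds the half-resolution lookahead planes: dst0 is the full-pel lowres
+// image; dsth, dstv and dstc are its H, V and diagonal half-pel interpolations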
-void frame_init_lowres_core(pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
+void frame_init_lowres_core(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc,
intptr_t src_stride, intptr_t dst_stride, int width, int height)
{
for (int y = 0; y < height; y++)
{
- pixel *src1 = src0 + src_stride;
- pixel *src2 = src1 + src_stride;
+ const pixel* src1 = src0 + src_stride;
+ const pixel* src2 = src1 + src_stride;
for (int x = 0; x < width; x++)
{
// slower than naive bilinear, but matches asm
}
/* structural similarity metric */
-void ssim_4x4x2_core(const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, int sums[2][4])
+void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4])
{
for (int z = 0; z < 2; z++)
{
}
template<int size>
-uint64_t pixel_var(pixel *pix, intptr_t i_stride)
+uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
{
uint32_t sum = 0, sqr = 0;
#endif
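+// psy-rd cost: block energy is roughly sa8d(block, zeroBuf) minus a DC
+// estimate (sad(block, zeroBuf) >> 2); the cost is |energy(source) - energy(recon)|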
template<int size>
-int psyCost_pp(pixel *source, intptr_t sstride, pixel *recon, intptr_t rstride)
+int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
{
static pixel zeroBuf[8] /* = { 0 } */;
}
template<int size>
-int psyCost_ss(int16_t *source, intptr_t sstride, int16_t *recon, intptr_t rstride)
+int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
{
static int16_t zeroBuf[8] /* = { 0 } */;
}
}
-void plane_copy_deinterleave_chroma(pixel *dstu, intptr_t dstuStride, pixel *dstv, intptr_t dstvStride,
- pixel *src, intptr_t srcStride, int w, int h)
-{
- for (int y = 0; y < h; y++, dstu += dstuStride, dstv += dstvStride, src += srcStride)
- {
- for (int x = 0; x < w; x++)
- {
- dstu[x] = src[2 * x];
- dstv[x] = src[2 * x + 1];
- }
- }
-}
-
template<int bx, int by>
-void blockcopy_pp_c(pixel *a, intptr_t stridea, pixel *b, intptr_t strideb)
+void blockcopy_pp_c(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb)
{
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x++)
- {
a[x] = b[x];
- }
a += stridea;
b += strideb;
}
template<int bx, int by>
-void blockcopy_ss_c(int16_t *a, intptr_t stridea, int16_t *b, intptr_t strideb)
+void blockcopy_ss_c(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
{
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x++)
- {
a[x] = b[x];
- }
a += stridea;
b += strideb;
}
template<int bx, int by>
-void blockcopy_sp_c(pixel *a, intptr_t stridea, int16_t *b, intptr_t strideb)
+void blockcopy_sp_c(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
{
for (int y = 0; y < by; y++)
{
}
template<int bx, int by>
-void blockcopy_ps_c(int16_t *a, intptr_t stridea, pixel *b, intptr_t strideb)
+void blockcopy_ps_c(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
{
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x++)
- {
a[x] = (int16_t)b[x];
- }
a += stridea;
b += strideb;
}
template<int bx, int by>
-void pixel_sub_ps_c(int16_t *a, intptr_t dstride, pixel *b0, pixel *b1, intptr_t sstride0, intptr_t sstride1)
+void pixel_sub_ps_c(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1)
{
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x++)
- {
a[x] = (int16_t)(b0[x] - b1[x]);
- }
b0 += sstride0;
b1 += sstride1;
}
template<int bx, int by>
-void pixel_add_ps_c(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t sstride0, intptr_t sstride1)
+void pixel_add_ps_c(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1)
{
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x++)
- {
a[x] = Clip(b0[x] + b1[x]);
- }
b0 += sstride0;
b1 += sstride1;
}
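+// bi-prediction average: two 16-bit intermediates are summed, rounded and
+// shifted back to pixel depth, then clipped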
template<int bx, int by>
-void addAvg(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
+void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
{
int shiftNum, offset;
}
}
-void planecopy_cp_c(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
+void planecopy_cp_c(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
{
for (int r = 0; r < height; r++)
{
for (int c = 0; c < width; c++)
- {
dst[c] = ((pixel)src[c]) << shift;
- }
dst += dstStride;
src += srcStride;
}
}
-void planecopy_sp_c(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
+void planecopy_sp_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
{
for (int r = 0; r < height; r++)
{
for (int c = 0; c < width; c++)
- {
dst[c] = (pixel)((src[c] >> shift) & mask);
- }
dst += dstStride;
src += srcStride;
/* Estimate the total influence on future quality that could be gained by
 * improving the reference samples used to inter-predict any given CU. */
-void estimateCUPropagateCost(int *dst, uint16_t *propagateIn, int32_t *intraCosts, uint16_t *interCosts,
- int32_t *invQscales, double *fpsFactor, int len)
+void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
+ const int32_t* invQscales, const double* fpsFactor, int len)
{
double fps = *fpsFactor / 256;
primitives.extendRowBorder(pic, stride, width, height, marginX);
/* copy top row to create above margin */
- pixel *top = pic - marginX;
+ pixel* top = pic - marginX;
for (int y = 0; y < marginY; y++)
memcpy(top - (y + 1) * stride, top, stride * sizeof(pixel));
/* copy bottom row to create below margin */
- pixel *bot = pic - marginX + (height - 1) * stride;
+ pixel* bot = pic - marginX + (height - 1) * stride;
for (int y = 0; y < marginY; y++)
memcpy(bot + (y + 1) * stride, bot, stride * sizeof(pixel));
}
p.satd[LUMA_64x16] = satd8<64, 16>;
p.satd[LUMA_16x64] = satd8<16, 64>;
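+
+ /* chroma satd: partitions with a 2- or 6-sample dimension cannot be tiled
+ by the 4x4/8x4 kernels, so those entries stay NULL */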
+ p.chroma[X265_CSP_I420].satd[CHROMA_2x2] = NULL;
+ p.chroma[X265_CSP_I420].satd[CHROMA_4x4] = satd_4x4;
+ p.chroma[X265_CSP_I420].satd[CHROMA_8x8] = satd8<8, 8>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_16x16] = satd8<16, 16>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_32x32] = satd8<32, 32>;
+
+ p.chroma[X265_CSP_I420].satd[CHROMA_4x2] = NULL;
+ p.chroma[X265_CSP_I420].satd[CHROMA_2x4] = NULL;
+ p.chroma[X265_CSP_I420].satd[CHROMA_8x4] = satd_8x4;
+ p.chroma[X265_CSP_I420].satd[CHROMA_4x8] = satd4<4, 8>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_16x8] = satd8<16, 8>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_8x16] = satd8<8, 16>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_32x16] = satd8<32, 16>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_16x32] = satd8<16, 32>;
+
+ p.chroma[X265_CSP_I420].satd[CHROMA_8x6] = NULL;
+ p.chroma[X265_CSP_I420].satd[CHROMA_6x8] = NULL;
+ p.chroma[X265_CSP_I420].satd[CHROMA_8x2] = NULL;
+ p.chroma[X265_CSP_I420].satd[CHROMA_2x8] = NULL;
+ p.chroma[X265_CSP_I420].satd[CHROMA_16x12] = satd4<16, 12>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_12x16] = satd4<12, 16>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_16x4] = satd4<16, 4>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_4x16] = satd4<4, 16>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_32x24] = satd8<32, 24>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_24x32] = satd8<24, 32>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_32x8] = satd8<32, 8>;
+ p.chroma[X265_CSP_I420].satd[CHROMA_8x32] = satd8<8, 32>;
+
+ p.chroma[X265_CSP_I422].satd[CHROMA422_2x4] = NULL;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_4x8] = satd4<4, 8>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_8x16] = satd8<8, 16>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_16x32] = satd8<16, 32>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_32x64] = satd8<32, 64>;
+
+ p.chroma[X265_CSP_I422].satd[CHROMA422_4x4] = satd_4x4;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_2x8] = NULL;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_8x8] = satd8<8, 8>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_4x16] = satd4<4, 16>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_16x16] = satd8<16, 16>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_8x32] = satd8<8, 32>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_32x32] = satd8<32, 32>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_16x64] = satd8<16, 64>;
+
+ p.chroma[X265_CSP_I422].satd[CHROMA422_8x12] = satd4<8, 12>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_6x16] = NULL;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_8x4] = satd4<8, 4>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_2x16] = NULL;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_16x24] = satd8<16, 24>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_12x32] = satd4<12, 32>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_16x8] = satd8<16, 8>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_4x32] = satd4<4, 32>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_32x48] = satd8<32, 48>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_24x64] = satd8<24, 64>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_32x16] = satd8<32, 16>;
+ p.chroma[X265_CSP_I422].satd[CHROMA422_8x64] = satd8<8, 64>;
+
#define CHROMA_420(W, H) \
p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H] = addAvg<W, H>; \
p.chroma[X265_CSP_I420].copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
p.chroma[X265_CSP_I420].copy_ss[CHROMA_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
#define CHROMA_422(W, H) \
- p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = addAvg<W, H>; \
+ p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = addAvg<W, H>; \
p.chroma[X265_CSP_I422].copy_pp[CHROMA422_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
p.chroma[X265_CSP_I422].copy_ps[CHROMA422_ ## W ## x ## H] = blockcopy_ps_c<W, H>; \
p.chroma[X265_CSP_I422].copy_ss[CHROMA422_ ## W ## x ## H] = blockcopy_ss_c<W, H>;
#define CHROMA_444(W, H) \
+ p.chroma[X265_CSP_I444].satd[LUMA_ ## W ## x ## H] = p.satd[LUMA_ ## W ## x ## H]; \
p.chroma[X265_CSP_I444].addAvg[LUMA_ ## W ## x ## H] = addAvg<W, H>; \
p.chroma[X265_CSP_I444].copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
p.chroma[X265_CSP_I444].copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
p.chroma[X265_CSP_I444].sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>; \
p.chroma[X265_CSP_I444].add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c<W, H>;
-
-
LUMA(4, 4);
LUMA(8, 8);
CHROMA_420(4, 4);
CHROMA_444(64, 16);
CHROMA_444(16, 64);
- SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixelcmp_t, pixel, pixel)
- SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, pixelcmp_sp_t, int16_t, pixel)
- SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, pixelcmp_ss_t, int16_t, int16_t)
+ SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixel, pixel)
+ SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, int16_t, pixel)
+ SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, int16_t, int16_t)
p.blockfill_s[BLOCK_4x4] = blockfil_s_c<4>;
p.blockfill_s[BLOCK_8x8] = blockfil_s_c<8>;
p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>;
p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>;
- p.cvt16to32_shl = convert16to32_shl;
- p.cvt16to32_shr[BLOCK_4x4] = convert16to32_shr<4>;
- p.cvt16to32_shr[BLOCK_8x8] = convert16to32_shr<8>;
- p.cvt16to32_shr[BLOCK_16x16] = convert16to32_shr<16>;
- p.cvt16to32_shr[BLOCK_32x32] = convert16to32_shr<32>;
- p.cvt32to16_shr = convert32to16_shr;
- p.cvt32to16_shl[BLOCK_4x4] = convert32to16_shl<4>;
- p.cvt32to16_shl[BLOCK_8x8] = convert32to16_shl<8>;
- p.cvt32to16_shl[BLOCK_16x16] = convert32to16_shl<16>;
- p.cvt32to16_shl[BLOCK_32x32] = convert32to16_shl<32>;
-
- p.copy_shr = copy_shr;
- p.copy_shl[BLOCK_4x4] = copy_shl<4>;
- p.copy_shl[BLOCK_8x8] = copy_shl<8>;
- p.copy_shl[BLOCK_16x16] = copy_shl<16>;
- p.copy_shl[BLOCK_32x32] = copy_shl<32>;
+ p.cpy2Dto1D_shl[BLOCK_4x4] = cpy2Dto1D_shl<4>;
+ p.cpy2Dto1D_shl[BLOCK_8x8] = cpy2Dto1D_shl<8>;
+ p.cpy2Dto1D_shl[BLOCK_16x16] = cpy2Dto1D_shl<16>;
+ p.cpy2Dto1D_shl[BLOCK_32x32] = cpy2Dto1D_shl<32>;
+ p.cpy2Dto1D_shr[BLOCK_4x4] = cpy2Dto1D_shr<4>;
+ p.cpy2Dto1D_shr[BLOCK_8x8] = cpy2Dto1D_shr<8>;
+ p.cpy2Dto1D_shr[BLOCK_16x16] = cpy2Dto1D_shr<16>;
+ p.cpy2Dto1D_shr[BLOCK_32x32] = cpy2Dto1D_shr<32>;
+ p.cpy1Dto2D_shl[BLOCK_4x4] = cpy1Dto2D_shl<4>;
+ p.cpy1Dto2D_shl[BLOCK_8x8] = cpy1Dto2D_shl<8>;
+ p.cpy1Dto2D_shl[BLOCK_16x16] = cpy1Dto2D_shl<16>;
+ p.cpy1Dto2D_shl[BLOCK_32x32] = cpy1Dto2D_shl<32>;
+ p.cpy1Dto2D_shr[BLOCK_4x4] = cpy1Dto2D_shr<4>;
+ p.cpy1Dto2D_shr[BLOCK_8x8] = cpy1Dto2D_shr<8>;
+ p.cpy1Dto2D_shr[BLOCK_16x16] = cpy1Dto2D_shr<16>;
+ p.cpy1Dto2D_shr[BLOCK_32x32] = cpy1Dto2D_shr<32>;
p.sa8d[BLOCK_4x4] = satd_4x4;
p.sa8d[BLOCK_8x8] = sa8d_8x8;
p.scale1D_128to64 = scale1D_128to64;
p.scale2D_64to32 = scale2D_64to32;
- p.frame_init_lowres_core = frame_init_lowres_core;
+ p.frameInitLowres = frame_init_lowres_core;
p.ssim_4x4x2_core = ssim_4x4x2_core;
p.ssim_end_4 = ssim_end_4;
p.var[BLOCK_16x16] = pixel_var<16>;
p.var[BLOCK_32x32] = pixel_var<32>;
p.var[BLOCK_64x64] = pixel_var<64>;
- p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma;
p.planecopy_cp = planecopy_cp_c;
p.planecopy_sp = planecopy_sp_c;
p.propagateCost = estimateCUPropagateCost;