diff --git a/source/common/pixel.cpp b/source/common/pixel.cpp
index 3e0530d..a56b8d7 100644
--- a/source/common/pixel.cpp
+++ b/source/common/pixel.cpp
@@ -32,32 +32,32 @@ using namespace x265;
-#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, FUNC_TYPE_CAST, DATA_TYPE1, DATA_TYPE2) \
-    p.FUNC_PREFIX[LUMA_4x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 4, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_8x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 8, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_8x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 4, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_4x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 8, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 8, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_8x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x12] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_12x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 4, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_4x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x24] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_24x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 8, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_8x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_64x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_64x32] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_32x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_64x48] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_48x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_64x16] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \
-    p.FUNC_PREFIX[LUMA_16x64] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>;
+#define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, DATA_TYPE1, DATA_TYPE2) \
+    p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX_DEF<4, 4, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_8x8] = FUNC_PREFIX_DEF<8, 8, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_8x4] = FUNC_PREFIX_DEF<8, 4, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_4x8] = FUNC_PREFIX_DEF<4, 8, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_16x16] = FUNC_PREFIX_DEF<16, 16, DATA_TYPE1, DATA_TYPE2>; \
+    p.FUNC_PREFIX[LUMA_16x8] = FUNC_PREFIX_DEF<16, 8, DATA_TYPE1, DATA_TYPE2>; \
+
p.FUNC_PREFIX[LUMA_8x16] = FUNC_PREFIX_DEF<8, 16, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_16x12] = FUNC_PREFIX_DEF<16, 12, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_12x16] = FUNC_PREFIX_DEF<12, 16, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_16x4] = FUNC_PREFIX_DEF<16, 4, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_4x16] = FUNC_PREFIX_DEF<4, 16, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_32x32] = FUNC_PREFIX_DEF<32, 32, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_32x16] = FUNC_PREFIX_DEF<32, 16, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_16x32] = FUNC_PREFIX_DEF<16, 32, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_32x24] = FUNC_PREFIX_DEF<32, 24, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_24x32] = FUNC_PREFIX_DEF<24, 32, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_32x8] = FUNC_PREFIX_DEF<32, 8, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_8x32] = FUNC_PREFIX_DEF<8, 32, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_64x64] = FUNC_PREFIX_DEF<64, 64, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_64x32] = FUNC_PREFIX_DEF<64, 32, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_32x64] = FUNC_PREFIX_DEF<32, 64, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_64x48] = FUNC_PREFIX_DEF<64, 48, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_48x64] = FUNC_PREFIX_DEF<48, 64, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_64x16] = FUNC_PREFIX_DEF<64, 16, DATA_TYPE1, DATA_TYPE2>; \ + p.FUNC_PREFIX[LUMA_16x64] = FUNC_PREFIX_DEF<16, 64, DATA_TYPE1, DATA_TYPE2>; #define SET_FUNC_PRIMITIVE_TABLE_C2(FUNC_PREFIX) \ p.FUNC_PREFIX[LUMA_4x4] = FUNC_PREFIX<4, 4>; \ @@ -90,16 +90,14 @@ namespace { // place functions in anonymous namespace (file static) template -int sad(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2) +int sad(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2) { int sum = 0; for (int y = 0; y < ly; y++) { for (int x = 0; x < lx; x++) - { sum += abs(pix1[x] - pix2[x]); - } pix1 += stride_pix1; pix2 += stride_pix2; @@ -109,16 +107,14 @@ int sad(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2) } template -int sad(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2) +int sad(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2) { int sum = 0; for (int y = 0; y < ly; y++) { for (int x = 0; x < lx; x++) - { sum += abs(pix1[x] - pix2[x]); - } pix1 += stride_pix1; pix2 += stride_pix2; @@ -128,7 +124,7 @@ int sad(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2 } template -void sad_x3(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, intptr_t frefstride, int32_t *res) +void sad_x3(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res) { res[0] = 0; res[1] = 0; @@ -150,7 +146,7 @@ void sad_x3(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, intptr_t frefstr } template -void sad_x4(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, pixel *pix5, intptr_t frefstride, int32_t *res) +void sad_x4(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res) { res[0] = 0; res[1] = 0; @@ -175,17 +171,17 @@ void sad_x4(pixel *pix1, pixel *pix2, pixel *pix3, pixel *pix4, pixel *pix5, int } template -int sse(T1 *pix1, intptr_t stride_pix1, T2 *pix2, intptr_t stride_pix2) +int sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2) { int sum = 0; - int iTemp; + int 
tmp; for (int y = 0; y < ly; y++) { for (int x = 0; x < lx; x++) { - iTemp = pix1[x] - pix2[x]; - sum += (iTemp * iTemp); + tmp = pix1[x] - pix2[x]; + sum += (tmp * tmp); } pix1 += stride_pix1; @@ -217,7 +213,7 @@ inline sum2_t abs2(sum2_t a) return (a + s) ^ s; } -int satd_4x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2) +int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2) { sum2_t tmp[4][2]; sum2_t a0, a1, a2, a3, b0, b1; @@ -245,7 +241,7 @@ int satd_4x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix return (int)(sum >> 1); } -int satd_4x4(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride_pix2) +int satd_4x4(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2) { ssum2_t tmp[4][2]; ssum2_t a0, a1, a2, a3, b0, b1; @@ -274,7 +270,7 @@ int satd_4x4(int16_t *pix1, intptr_t stride_pix1, int16_t *pix2, intptr_t stride } // x264's SWAR version of satd 8x4, performs two 4x4 SATDs at once -int satd_8x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2) +int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2) { sum2_t tmp[4][4]; sum2_t a0, a1, a2, a3; @@ -300,41 +296,33 @@ int satd_8x4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix template // calculate satd in blocks of 4x4 -int satd4(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2) +int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2) { int satd = 0; for (int row = 0; row < h; row += 4) - { for (int col = 0; col < w; col += 4) - { satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1, pix2 + row * stride_pix2 + col, stride_pix2); - } - } return satd; } template // calculate satd in blocks of 8x4 -int satd8(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2) +int satd8(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2) { int satd = 0; for (int row = 0; row < h; row += 4) - { for (int col = 0; col < w; col += 8) - { satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1, pix2 + row * stride_pix2 + col, stride_pix2); - } - } return satd; } -inline int _sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2) +inline int _sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2) { sum2_t tmp[8][4]; sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3; @@ -371,12 +359,12 @@ inline int _sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2) return (int)sum; } -int sa8d_8x8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2) +int sa8d_8x8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2) { return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2); } -inline int _sa8d_8x8(int16_t *pix1, intptr_t i_pix1, int16_t *pix2, intptr_t i_pix2) +inline int _sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2) { ssum2_t tmp[8][4]; ssum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3; @@ -413,12 +401,12 @@ inline int _sa8d_8x8(int16_t *pix1, intptr_t i_pix1, int16_t *pix2, intptr_t i_p return (int)sum; } -int sa8d_8x8(int16_t *pix1, intptr_t i_pix1, int16_t *pix2, intptr_t i_pix2) +int sa8d_8x8(const int16_t* pix1, intptr_t i_pix1, const int16_t* pix2, intptr_t i_pix2) { return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2); } -int sa8d_16x16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2) +int 
sa8d_16x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2) { int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + _sa8d_8x8(pix1 + 8, i_pix1, pix2 + 8, i_pix2) @@ -432,159 +420,129 @@ int sa8d_16x16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2) template // Calculate sa8d in blocks of 8x8 -int sa8d8(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2) +int sa8d8(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2) { int cost = 0; for (int y = 0; y < h; y += 8) - { for (int x = 0; x < w; x += 8) - { cost += sa8d_8x8(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2); - } - } return cost; } template // Calculate sa8d in blocks of 16x16 -int sa8d16(pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2) +int sa8d16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2) { int cost = 0; for (int y = 0; y < h; y += 16) - { for (int x = 0; x < w; x += 16) - { cost += sa8d_16x16(pix1 + i_pix1 * y + x, i_pix1, pix2 + i_pix2 * y + x, i_pix2); - } - } return cost; } template -int pixel_ssd_s_c(short *a, intptr_t dstride) +int pixel_ssd_s_c(const int16_t* a, intptr_t dstride) { int sum = 0; for (int y = 0; y < size; y++) { for (int x = 0; x < size; x++) - { sum += a[x] * a[x]; - } + a += dstride; } return sum; } template -void blockfil_s_c(int16_t *dst, intptr_t dstride, int16_t val) +void blockfil_s_c(int16_t* dst, intptr_t dstride, int16_t val) { for (int y = 0; y < size; y++) - { for (int x = 0; x < size; x++) - { dst[y * dstride + x] = val; - } - } -} - -void convert16to32_shl(int32_t *dst, int16_t *src, intptr_t stride, int shift, int size) -{ - for (int i = 0; i < size; i++) - { - for (int j = 0; j < size; j++) - { - dst[i * size + j] = ((int)src[i * stride + j]) << shift; - } - } } template -void convert16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset) +void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift) { - for (int i = 0; i < size; i++) - { - for (int j = 0; j < size; j++) - { - dst[i * size + j] = ((int)src[i * stride + j] + offset) >> shift; - } - } -} - -void convert32to16_shr(int16_t *dst, int32_t *src, intptr_t stride, int shift, int size) -{ - int round = 1 << (shift - 1); + X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n"); + X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n"); + X265_CHECK(shift >= 0, "invalid shift\n"); for (int i = 0; i < size; i++) { for (int j = 0; j < size; j++) - { - dst[j] = (int16_t)((src[j] + round) >> shift); - } + dst[j] = src[j] << shift; - src += size; - dst += stride; + src += srcStride; + dst += size; } } -void copy_shr(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size) +template +void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift) { - int round = 1 << (shift - 1); + X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n"); + X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n"); + X265_CHECK(shift > 0, "invalid shift\n"); + int16_t round = 1 << (shift - 1); for (int i = 0; i < size; i++) { for (int j = 0; j < size; j++) - { - dst[j] = (int16_t)((src[j] + round) >> shift); - } + dst[j] = (src[j] + round) >> shift; - src += size; - dst += stride; + src += srcStride; + dst += size; } } template -void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift) +void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) { + 
X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n"); + X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n"); + X265_CHECK(shift >= 0, "invalid shift\n"); + for (int i = 0; i < size; i++) { for (int j = 0; j < size; j++) - { - dst[j] = ((int16_t)src[j] << shift); - } + dst[j] = src[j] << shift; src += size; - dst += stride; + dst += dstStride; } } template -void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift) +void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) { + X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n"); + X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n"); + X265_CHECK(shift > 0, "invalid shift\n"); + + int16_t round = 1 << (shift - 1); for (int i = 0; i < size; i++) { for (int j = 0; j < size; j++) - { - dst[j] = (src[j] << shift); - } + dst[j] = (src[j] + round) >> shift; src += size; - dst += stride; + dst += dstStride; } } template -void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride) +void getResidual(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride) { for (int y = 0; y < blockSize; y++) { for (int x = 0; x < blockSize; x++) - { residual[x] = static_cast(fenc[x]) - static_cast(pred[x]); - } fenc += stride; residual += stride; @@ -593,18 +551,14 @@ void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride) } template -void transpose(pixel* dst, pixel* src, intptr_t stride) +void transpose(pixel* dst, const pixel* src, intptr_t stride) { for (int k = 0; k < blockSize; k++) - { for (int l = 0; l < blockSize; l++) - { dst[k * blockSize + l] = src[l * stride + k]; - } - } } -void weight_sp_c(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset) +void weight_sp_c(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset) { int x, y; @@ -622,7 +576,7 @@ void weight_sp_c(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStrid } } -void weight_pp_c(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset) +void weight_pp_c(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset) { int x, y; @@ -646,14 +600,12 @@ void weight_pp_c(pixel *src, pixel *dst, intptr_t stride, int width, int height, } template -void pixelavg_pp(pixel* dst, intptr_t dstride, pixel* src0, intptr_t sstride0, pixel* src1, intptr_t sstride1, int) +void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int) { for (int y = 0; y < ly; y++) { for (int x = 0; x < lx; x++) - { dst[x] = (src0[x] + src1[x] + 1) >> 1; - } src0 += sstride0; src1 += sstride1; @@ -661,7 +613,7 @@ void pixelavg_pp(pixel* dst, intptr_t dstride, pixel* src0, intptr_t sstride0, p } } -void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/) +void scale1D_128to64(pixel* dst, const pixel* src, intptr_t /*stride*/) { int x; @@ -675,9 +627,9 @@ void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/) } } -void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride) +void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride) { - int x, y; + uint32_t x, y; for (y = 0; y < 64; y += 2) { @@ -694,13 +646,13 @@ void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride) } } -void 
frame_init_lowres_core(pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc, +void frame_init_lowres_core(const pixel* src0, pixel* dst0, pixel* dsth, pixel* dstv, pixel* dstc, intptr_t src_stride, intptr_t dst_stride, int width, int height) { for (int y = 0; y < height; y++) { - pixel *src1 = src0 + src_stride; - pixel *src2 = src1 + src_stride; + const pixel* src1 = src0 + src_stride; + const pixel* src2 = src1 + src_stride; for (int x = 0; x < width; x++) { // slower than naive bilinear, but matches asm @@ -720,7 +672,7 @@ void frame_init_lowres_core(pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, } /* structural similarity metric */ -void ssim_4x4x2_core(const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, int sums[2][4]) +void ssim_4x4x2_core(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]) { for (int z = 0; z < 2; z++) { @@ -794,7 +746,7 @@ float ssim_end_4(int sum0[5][4], int sum1[5][4], int width) } template -uint64_t pixel_var(pixel *pix, intptr_t i_stride) +uint64_t pixel_var(const pixel* pix, intptr_t i_stride) { uint32_t sum = 0, sqr = 0; @@ -817,7 +769,7 @@ uint64_t pixel_var(pixel *pix, intptr_t i_stride) #endif template -int psyCost_pp(pixel *source, intptr_t sstride, pixel *recon, intptr_t rstride) +int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride) { static pixel zeroBuf[8] /* = { 0 } */; @@ -850,7 +802,7 @@ int psyCost_pp(pixel *source, intptr_t sstride, pixel *recon, intptr_t rstride) } template -int psyCost_ss(int16_t *source, intptr_t sstride, int16_t *recon, intptr_t rstride) +int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride) { static int16_t zeroBuf[8] /* = { 0 } */; @@ -882,28 +834,13 @@ int psyCost_ss(int16_t *source, intptr_t sstride, int16_t *recon, intptr_t rstri } } -void plane_copy_deinterleave_chroma(pixel *dstu, intptr_t dstuStride, pixel *dstv, intptr_t dstvStride, - pixel *src, intptr_t srcStride, int w, int h) -{ - for (int y = 0; y < h; y++, dstu += dstuStride, dstv += dstvStride, src += srcStride) - { - for (int x = 0; x < w; x++) - { - dstu[x] = src[2 * x]; - dstv[x] = src[2 * x + 1]; - } - } -} - template -void blockcopy_pp_c(pixel *a, intptr_t stridea, pixel *b, intptr_t strideb) +void blockcopy_pp_c(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb) { for (int y = 0; y < by; y++) { for (int x = 0; x < bx; x++) - { a[x] = b[x]; - } a += stridea; b += strideb; @@ -911,14 +848,12 @@ void blockcopy_pp_c(pixel *a, intptr_t stridea, pixel *b, intptr_t strideb) } template -void blockcopy_ss_c(int16_t *a, intptr_t stridea, int16_t *b, intptr_t strideb) +void blockcopy_ss_c(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb) { for (int y = 0; y < by; y++) { for (int x = 0; x < bx; x++) - { a[x] = b[x]; - } a += stridea; b += strideb; @@ -926,7 +861,7 @@ void blockcopy_ss_c(int16_t *a, intptr_t stridea, int16_t *b, intptr_t strideb) } template -void blockcopy_sp_c(pixel *a, intptr_t stridea, int16_t *b, intptr_t strideb) +void blockcopy_sp_c(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb) { for (int y = 0; y < by; y++) { @@ -942,14 +877,12 @@ void blockcopy_sp_c(pixel *a, intptr_t stridea, int16_t *b, intptr_t strideb) } template -void blockcopy_ps_c(int16_t *a, intptr_t stridea, pixel *b, intptr_t strideb) +void blockcopy_ps_c(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb) { for (int y = 0; y < by; y++) { for (int x = 0; x < bx; x++) - { 
a[x] = (int16_t)b[x]; - } a += stridea; b += strideb; @@ -957,14 +890,12 @@ void blockcopy_ps_c(int16_t *a, intptr_t stridea, pixel *b, intptr_t strideb) } template -void pixel_sub_ps_c(int16_t *a, intptr_t dstride, pixel *b0, pixel *b1, intptr_t sstride0, intptr_t sstride1) +void pixel_sub_ps_c(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1) { for (int y = 0; y < by; y++) { for (int x = 0; x < bx; x++) - { a[x] = (int16_t)(b0[x] - b1[x]); - } b0 += sstride0; b1 += sstride1; @@ -973,14 +904,12 @@ void pixel_sub_ps_c(int16_t *a, intptr_t dstride, pixel *b0, pixel *b1, intptr_t } template -void pixel_add_ps_c(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t sstride0, intptr_t sstride1) +void pixel_add_ps_c(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1) { for (int y = 0; y < by; y++) { for (int x = 0; x < bx; x++) - { a[x] = Clip(b0[x] + b1[x]); - } b0 += sstride0; b1 += sstride1; @@ -989,7 +918,7 @@ void pixel_add_ps_c(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t } template -void addAvg(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride) +void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride) { int shiftNum, offset; @@ -1010,28 +939,24 @@ void addAvg(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intpt } } -void planecopy_cp_c(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift) +void planecopy_cp_c(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift) { for (int r = 0; r < height; r++) { for (int c = 0; c < width; c++) - { dst[c] = ((pixel)src[c]) << shift; - } dst += dstStride; src += srcStride; } } -void planecopy_sp_c(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask) +void planecopy_sp_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask) { for (int r = 0; r < height; r++) { for (int c = 0; c < width; c++) - { dst[c] = (pixel)((src[c] >> shift) & mask); - } dst += dstStride; src += srcStride; @@ -1040,8 +965,8 @@ void planecopy_sp_c(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstS /* Estimate the total amount of influence on future quality that could be had if we * were to improve the reference samples used to inter predict any given CU. 
*/ -void estimateCUPropagateCost(int *dst, uint16_t *propagateIn, int32_t *intraCosts, uint16_t *interCosts, - int32_t *invQscales, double *fpsFactor, int len) +void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, + const int32_t* invQscales, const double* fpsFactor, int len) { double fps = *fpsFactor / 256; @@ -1068,12 +993,12 @@ void extendPicBorder(pixel* pic, intptr_t stride, int width, int height, int mar primitives.extendRowBorder(pic, stride, width, height, marginX); /* copy top row to create above margin */ - pixel *top = pic - marginX; + pixel* top = pic - marginX; for (int y = 0; y < marginY; y++) memcpy(top - (y + 1) * stride, top, stride * sizeof(pixel)); /* copy bottom row to create below margin */ - pixel *bot = pic - marginX + (height - 1) * stride; + pixel* bot = pic - marginX + (height - 1) * stride; for (int y = 0; y < marginY; y++) memcpy(bot + (y + 1) * stride, bot, stride * sizeof(pixel)); } @@ -1113,6 +1038,62 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p) p.satd[LUMA_64x16] = satd8<64, 16>; p.satd[LUMA_16x64] = satd8<16, 64>; + p.chroma[X265_CSP_I420].satd[CHROMA_2x2] = NULL; + p.chroma[X265_CSP_I420].satd[CHROMA_4x4] = satd_4x4; + p.chroma[X265_CSP_I420].satd[CHROMA_8x8] = satd8<8, 8>; + p.chroma[X265_CSP_I420].satd[CHROMA_16x16] = satd8<16, 16>; + p.chroma[X265_CSP_I420].satd[CHROMA_32x32] = satd8<32, 32>; + + p.chroma[X265_CSP_I420].satd[CHROMA_4x2] = NULL; + p.chroma[X265_CSP_I420].satd[CHROMA_2x4] = NULL; + p.chroma[X265_CSP_I420].satd[CHROMA_8x4] = satd_8x4; + p.chroma[X265_CSP_I420].satd[CHROMA_4x8] = satd4<4, 8>; + p.chroma[X265_CSP_I420].satd[CHROMA_16x8] = satd8<16, 8>; + p.chroma[X265_CSP_I420].satd[CHROMA_8x16] = satd8<8, 16>; + p.chroma[X265_CSP_I420].satd[CHROMA_32x16] = satd8<32, 16>; + p.chroma[X265_CSP_I420].satd[CHROMA_16x32] = satd8<16, 32>; + + p.chroma[X265_CSP_I420].satd[CHROMA_8x6] = NULL; + p.chroma[X265_CSP_I420].satd[CHROMA_6x8] = NULL; + p.chroma[X265_CSP_I420].satd[CHROMA_8x2] = NULL; + p.chroma[X265_CSP_I420].satd[CHROMA_2x8] = NULL; + p.chroma[X265_CSP_I420].satd[CHROMA_16x12] = satd4<16, 12>; + p.chroma[X265_CSP_I420].satd[CHROMA_12x16] = satd4<12, 16>; + p.chroma[X265_CSP_I420].satd[CHROMA_16x4] = satd4<16, 4>; + p.chroma[X265_CSP_I420].satd[CHROMA_4x16] = satd4<4, 16>; + p.chroma[X265_CSP_I420].satd[CHROMA_32x24] = satd8<32, 24>; + p.chroma[X265_CSP_I420].satd[CHROMA_24x32] = satd8<24, 32>; + p.chroma[X265_CSP_I420].satd[CHROMA_32x8] = satd8<32, 8>; + p.chroma[X265_CSP_I420].satd[CHROMA_8x32] = satd8<8, 32>; + + p.chroma[X265_CSP_I422].satd[CHROMA422_2x4] = NULL; + p.chroma[X265_CSP_I422].satd[CHROMA422_4x8] = satd4<4, 8>; + p.chroma[X265_CSP_I422].satd[CHROMA422_8x16] = satd8<8, 16>; + p.chroma[X265_CSP_I422].satd[CHROMA422_16x32] = satd8<16, 32>; + p.chroma[X265_CSP_I422].satd[CHROMA422_32x64] = satd8<32, 64>; + + p.chroma[X265_CSP_I422].satd[CHROMA422_4x4] = satd_4x4; + p.chroma[X265_CSP_I422].satd[CHROMA422_2x8] = NULL; + p.chroma[X265_CSP_I422].satd[CHROMA422_8x8] = satd8<8, 8>; + p.chroma[X265_CSP_I422].satd[CHROMA422_4x16] = satd4<4, 16>; + p.chroma[X265_CSP_I422].satd[CHROMA422_16x16] = satd8<16, 16>; + p.chroma[X265_CSP_I422].satd[CHROMA422_8x32] = satd8<8, 32>; + p.chroma[X265_CSP_I422].satd[CHROMA422_32x32] = satd8<32, 32>; + p.chroma[X265_CSP_I422].satd[CHROMA422_16x64] = satd8<16, 64>; + + p.chroma[X265_CSP_I422].satd[CHROMA422_8x12] = satd4<8, 12>; + p.chroma[X265_CSP_I422].satd[CHROMA422_6x16] = NULL; + 
p.chroma[X265_CSP_I422].satd[CHROMA422_8x4] = satd4<8, 4>; + p.chroma[X265_CSP_I422].satd[CHROMA422_2x16] = NULL; + p.chroma[X265_CSP_I422].satd[CHROMA422_16x24] = satd8<16, 24>; + p.chroma[X265_CSP_I422].satd[CHROMA422_12x32] = satd4<12, 32>; + p.chroma[X265_CSP_I422].satd[CHROMA422_16x8] = satd8<16, 8>; + p.chroma[X265_CSP_I422].satd[CHROMA422_4x32] = satd4<4, 32>; + p.chroma[X265_CSP_I422].satd[CHROMA422_32x48] = satd8<32, 48>; + p.chroma[X265_CSP_I422].satd[CHROMA422_24x64] = satd8<24, 64>; + p.chroma[X265_CSP_I422].satd[CHROMA422_32x16] = satd8<32, 16>; + p.chroma[X265_CSP_I422].satd[CHROMA422_8x64] = satd8<8, 64>; + #define CHROMA_420(W, H) \ p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H] = addAvg; \ p.chroma[X265_CSP_I420].copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c; \ @@ -1121,13 +1102,14 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p) p.chroma[X265_CSP_I420].copy_ss[CHROMA_ ## W ## x ## H] = blockcopy_ss_c; #define CHROMA_422(W, H) \ - p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = addAvg; \ + p.chroma[X265_CSP_I422].addAvg[CHROMA422_ ## W ## x ## H] = addAvg; \ p.chroma[X265_CSP_I422].copy_pp[CHROMA422_ ## W ## x ## H] = blockcopy_pp_c; \ p.chroma[X265_CSP_I422].copy_sp[CHROMA422_ ## W ## x ## H] = blockcopy_sp_c; \ p.chroma[X265_CSP_I422].copy_ps[CHROMA422_ ## W ## x ## H] = blockcopy_ps_c; \ p.chroma[X265_CSP_I422].copy_ss[CHROMA422_ ## W ## x ## H] = blockcopy_ss_c; #define CHROMA_444(W, H) \ + p.chroma[X265_CSP_I444].satd[LUMA_ ## W ## x ## H] = p.satd[LUMA_ ## W ## x ## H]; \ p.chroma[X265_CSP_I444].addAvg[LUMA_ ## W ## x ## H] = addAvg; \ p.chroma[X265_CSP_I444].copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c; \ p.chroma[X265_CSP_I444].copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_c; \ @@ -1157,8 +1139,6 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p) p.chroma[X265_CSP_I444].sub_ps[LUMA_ ## W ## x ## H] = pixel_sub_ps_c; \ p.chroma[X265_CSP_I444].add_ps[LUMA_ ## W ## x ## H] = pixel_add_ps_c; - - LUMA(4, 4); LUMA(8, 8); CHROMA_420(4, 4); @@ -1278,9 +1258,9 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p) CHROMA_444(64, 16); CHROMA_444(16, 64); - SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixelcmp_t, pixel, pixel) - SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, pixelcmp_sp_t, int16_t, pixel) - SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, pixelcmp_ss_t, int16_t, int16_t) + SET_FUNC_PRIMITIVE_TABLE_C(sse_pp, sse, pixel, pixel) + SET_FUNC_PRIMITIVE_TABLE_C(sse_sp, sse, int16_t, pixel) + SET_FUNC_PRIMITIVE_TABLE_C(sse_ss, sse, int16_t, int16_t) p.blockfill_s[BLOCK_4x4] = blockfil_s_c<4>; p.blockfill_s[BLOCK_8x8] = blockfil_s_c<8>; @@ -1288,22 +1268,22 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p) p.blockfill_s[BLOCK_32x32] = blockfil_s_c<32>; p.blockfill_s[BLOCK_64x64] = blockfil_s_c<64>; - p.cvt16to32_shl = convert16to32_shl; - p.cvt16to32_shr[BLOCK_4x4] = convert16to32_shr<4>; - p.cvt16to32_shr[BLOCK_8x8] = convert16to32_shr<8>; - p.cvt16to32_shr[BLOCK_16x16] = convert16to32_shr<16>; - p.cvt16to32_shr[BLOCK_32x32] = convert16to32_shr<32>; - p.cvt32to16_shr = convert32to16_shr; - p.cvt32to16_shl[BLOCK_4x4] = convert32to16_shl<4>; - p.cvt32to16_shl[BLOCK_8x8] = convert32to16_shl<8>; - p.cvt32to16_shl[BLOCK_16x16] = convert32to16_shl<16>; - p.cvt32to16_shl[BLOCK_32x32] = convert32to16_shl<32>; - - p.copy_shr = copy_shr; - p.copy_shl[BLOCK_4x4] = copy_shl<4>; - p.copy_shl[BLOCK_8x8] = copy_shl<8>; - p.copy_shl[BLOCK_16x16] = copy_shl<16>; - p.copy_shl[BLOCK_32x32] = copy_shl<32>; + p.cpy2Dto1D_shl[BLOCK_4x4] = cpy2Dto1D_shl<4>; + 
p.cpy2Dto1D_shl[BLOCK_8x8] = cpy2Dto1D_shl<8>; + p.cpy2Dto1D_shl[BLOCK_16x16] = cpy2Dto1D_shl<16>; + p.cpy2Dto1D_shl[BLOCK_32x32] = cpy2Dto1D_shl<32>; + p.cpy2Dto1D_shr[BLOCK_4x4] = cpy2Dto1D_shr<4>; + p.cpy2Dto1D_shr[BLOCK_8x8] = cpy2Dto1D_shr<8>; + p.cpy2Dto1D_shr[BLOCK_16x16] = cpy2Dto1D_shr<16>; + p.cpy2Dto1D_shr[BLOCK_32x32] = cpy2Dto1D_shr<32>; + p.cpy1Dto2D_shl[BLOCK_4x4] = cpy1Dto2D_shl<4>; + p.cpy1Dto2D_shl[BLOCK_8x8] = cpy1Dto2D_shl<8>; + p.cpy1Dto2D_shl[BLOCK_16x16] = cpy1Dto2D_shl<16>; + p.cpy1Dto2D_shl[BLOCK_32x32] = cpy1Dto2D_shl<32>; + p.cpy1Dto2D_shr[BLOCK_4x4] = cpy1Dto2D_shr<4>; + p.cpy1Dto2D_shr[BLOCK_8x8] = cpy1Dto2D_shr<8>; + p.cpy1Dto2D_shr[BLOCK_16x16] = cpy1Dto2D_shr<16>; + p.cpy1Dto2D_shr[BLOCK_32x32] = cpy1Dto2D_shr<32>; p.sa8d[BLOCK_4x4] = satd_4x4; p.sa8d[BLOCK_8x8] = sa8d_8x8; @@ -1371,7 +1351,7 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p) p.scale1D_128to64 = scale1D_128to64; p.scale2D_64to32 = scale2D_64to32; - p.frame_init_lowres_core = frame_init_lowres_core; + p.frameInitLowres = frame_init_lowres_core; p.ssim_4x4x2_core = ssim_4x4x2_core; p.ssim_end_4 = ssim_end_4; @@ -1379,7 +1359,6 @@ void Setup_C_PixelPrimitives(EncoderPrimitives &p) p.var[BLOCK_16x16] = pixel_var<16>; p.var[BLOCK_32x32] = pixel_var<32>; p.var[BLOCK_64x64] = pixel_var<64>; - p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma; p.planecopy_cp = planecopy_cp_c; p.planecopy_sp = planecopy_sp_c; p.propagateCost = estimateCUPropagateCost;