X-Git-Url: https://git.piment-noir.org/?p=deb_x265.git;a=blobdiff_plain;f=source%2Fcommon%2Fx86%2Fblockcopy8.h;h=9fbbeea555ff270af29b22f673f69e04eb820060;hp=115e3406b2aab93a1cb8708a90998fc713ac62b8;hb=b53f7c52d8280ab63876efd6eb292c21430ac607;hpb=5c9b45285dd64723ad1dac380b98a7b1f3095674 diff --git a/source/common/x86/blockcopy8.h b/source/common/x86/blockcopy8.h index 115e340..9fbbeea 100644 --- a/source/common/x86/blockcopy8.h +++ b/source/common/x86/blockcopy8.h @@ -24,48 +24,53 @@ #ifndef X265_BLOCKCOPY8_H #define X265_BLOCKCOPY8_H -void x265_cvt32to16_shr_sse2(int16_t * dst, int *src, intptr_t, int, int); -void x265_cvt32to16_shl_4_sse2(int16_t * dst, int *src, intptr_t, int); -void x265_cvt32to16_shl_8_sse2(int16_t * dst, int *src, intptr_t, int); -void x265_cvt32to16_shl_16_sse2(int16_t * dst, int *src, intptr_t, int); -void x265_cvt32to16_shl_32_sse2(int16_t * dst, int *src, intptr_t, int); -void x265_cvt32to16_shl_4_avx2(int16_t * dst, int *src, intptr_t, int); -void x265_cvt32to16_shl_8_avx2(int16_t * dst, int *src, intptr_t, int); -void x265_cvt32to16_shl_16_avx2(int16_t * dst, int *src, intptr_t, int); -void x265_cvt32to16_shl_32_avx2(int16_t * dst, int *src, intptr_t, int); -void x265_cvt16to32_shl_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t); -void x265_cvt16to32_shr_4_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t); -void x265_cvt16to32_shr_8_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t); -void x265_cvt16to32_shr_16_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t); -void x265_cvt16to32_shr_32_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t); -void x265_copy_shr_sse4(int16_t * dst, int16_t *src, intptr_t, int, int); -void x265_copy_shl_4_sse2(int16_t * dst, int16_t *src, intptr_t, int); -void x265_copy_shl_8_sse2(int16_t * dst, int16_t *src, intptr_t, int); -void x265_copy_shl_16_sse2(int16_t * dst, int16_t *src, intptr_t, int); -void x265_copy_shl_32_sse2(int16_t * dst, 
int16_t *src, intptr_t, int); -uint32_t x265_copy_cnt_4_sse4(int16_t * dst, int16_t * src, intptr_t); -uint32_t x265_copy_cnt_8_sse4(int16_t * dst, int16_t * src, intptr_t); -uint32_t x265_copy_cnt_16_sse4(int16_t * dst, int16_t * src, intptr_t); -uint32_t x265_copy_cnt_32_sse4(int16_t * dst, int16_t * src, intptr_t); -uint32_t x265_copy_cnt_4_avx2(int16_t * dst, int16_t * src, intptr_t); -uint32_t x265_copy_cnt_8_avx2(int16_t * dst, int16_t * src, intptr_t); -uint32_t x265_copy_cnt_16_avx2(int16_t * dst, int16_t * src, intptr_t); -uint32_t x265_copy_cnt_32_avx2(int16_t * dst, int16_t * src, intptr_t); +void x265_cpy2Dto1D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +void x265_cpy2Dto1D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +void x265_cpy2Dto1D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +void x265_cpy2Dto1D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +void x265_cpy2Dto1D_shr_4_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +void x265_cpy2Dto1D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +void x265_cpy2Dto1D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +void x265_cpy2Dto1D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +void x265_cpy1Dto2D_shl_4_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shl_8_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shl_16_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void 
x265_cpy1Dto2D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shr_4_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shr_8_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shr_16_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shr_32_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shr_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy1Dto2D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride); +uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride); +uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride); +uint32_t x265_copy_cnt_32_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride); +uint32_t x265_copy_cnt_4_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride); +uint32_t x265_copy_cnt_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride); +uint32_t x265_copy_cnt_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride); +uint32_t x265_copy_cnt_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride); #define SETUP_BLOCKCOPY_FUNC(W, H, cpu) \ - void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); \ - void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, int16_t * b, intptr_t strideb); \ - void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t * a, 
intptr_t stridea, int16_t * b, intptr_t strideb); + void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); \ + void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb); \ + void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb); #define SETUP_BLOCKCOPY_PS(W, H, cpu) \ - void x265_blockcopy_ps_ ## W ## x ## H ## cpu(int16_t * dst, intptr_t dstStride, pixel * src, intptr_t srcStride); + void x265_blockcopy_ps_ ## W ## x ## H ## cpu(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); #define SETUP_BLOCKCOPY_SP(W, H, cpu) \ - void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, int16_t * b, intptr_t strideb); + void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb); #define SETUP_BLOCKCOPY_SS_PP(W, H, cpu) \ - void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); \ - void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t * a, intptr_t stridea, int16_t * b, intptr_t strideb); + void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); \ + void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb); #define BLOCKCOPY_COMMON(cpu) \ SETUP_BLOCKCOPY_FUNC(4, 4, cpu); \ @@ -178,31 +183,31 @@ BLOCKCOPY_PS(_sse4); BLOCKCOPY_SP(_sse2); -void x265_blockfill_s_4x4_sse2(int16_t *dst, intptr_t dstride, int16_t val); -void x265_blockfill_s_8x8_sse2(int16_t *dst, intptr_t dstride, int16_t val); -void x265_blockfill_s_16x16_sse2(int16_t *dst, intptr_t dstride, int16_t val); -void x265_blockfill_s_32x32_sse2(int16_t *dst, intptr_t dstride, int16_t val); -void x265_blockcopy_ss_16x4_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); -void 
x265_blockcopy_ss_16x8_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); -void x265_blockcopy_ss_16x12_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); -void x265_blockcopy_ss_16x16_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); -void x265_blockcopy_ss_16x24_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); -void x265_blockcopy_ss_16x32_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); -void x265_blockcopy_ss_16x64_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); -void x265_blockcopy_ss_64x16_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); -void x265_blockcopy_ss_64x32_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); -void x265_blockcopy_ss_64x48_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); -void x265_blockcopy_ss_64x64_avx(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride); - -void x265_blockcopy_pp_32x8_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); -void x265_blockcopy_pp_32x16_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); -void x265_blockcopy_pp_32x24_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); -void x265_blockcopy_pp_32x32_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); -void x265_blockcopy_pp_32x48_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); -void x265_blockcopy_pp_32x64_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); - -void x265_blockfill_s_16x16_avx2(int16_t *dst, intptr_t dstride, int16_t val); -void x265_blockfill_s_32x32_avx2(int16_t *dst, intptr_t dstride, int16_t val); +void x265_blockfill_s_4x4_sse2(int16_t* dst, intptr_t dstride, int16_t val); +void x265_blockfill_s_8x8_sse2(int16_t* dst, intptr_t dstride, int16_t val); +void x265_blockfill_s_16x16_sse2(int16_t* dst, intptr_t dstride, int16_t val); 
+void x265_blockfill_s_32x32_sse2(int16_t* dst, intptr_t dstride, int16_t val); +void x265_blockcopy_ss_16x4_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_16x8_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_16x12_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_16x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_16x24_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_16x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_16x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_64x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_64x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_64x48_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_64x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); + +void x265_blockcopy_pp_32x8_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); +void x265_blockcopy_pp_32x16_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); +void x265_blockcopy_pp_32x24_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); +void x265_blockcopy_pp_32x32_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); +void x265_blockcopy_pp_32x48_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); +void x265_blockcopy_pp_32x64_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); + +void x265_blockfill_s_16x16_avx2(int16_t* dst, intptr_t dstride, int16_t val); +void x265_blockfill_s_32x32_avx2(int16_t* dst, intptr_t 
dstride, int16_t val); #undef BLOCKCOPY_COMMON #undef BLOCKCOPY_SS_PP