X-Git-Url: https://git.piment-noir.org/?p=deb_x265.git;a=blobdiff_plain;f=source%2Fcommon%2Fx86%2Fpixel-util8.asm;h=8adeb84f900985379694804e1a621fbcc9606643;hp=38fb52e8b5726027ea34f64ad5bc6467f8f07dc7;hb=b53f7c52d8280ab63876efd6eb292c21430ac607;hpb=5c9b45285dd64723ad1dac380b98a7b1f3095674 diff --git a/source/common/x86/pixel-util8.asm b/source/common/x86/pixel-util8.asm index 38fb52e..8adeb84 100644 --- a/source/common/x86/pixel-util8.asm +++ b/source/common/x86/pixel-util8.asm @@ -61,447 +61,6 @@ cextern pd_1 cextern pd_32767 cextern pd_n32768 -;----------------------------------------------------------------------------- -; void calcrecon(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred) -;----------------------------------------------------------------------------- -INIT_XMM sse2 -%if HIGH_BIT_DEPTH -%if ARCH_X86_64 == 1 -cglobal calcRecons4, 5,8,4 - %define t7b r7b -%else -cglobal calcRecons4, 5,7,4,0-1 - %define t7b byte [rsp] -%endif - mov r4d, r4m - mov r5d, r5m - mov r6d, r6m - add r4d, r4d - add r5d, r5d - add r6d, r6d - - pxor m4, m4 - mova m5, [pw_pixel_max] - mov t7b, 4/2 -.loop: - movh m0, [r0] - movh m1, [r0 + r4] - punpcklqdq m0, m1 - movh m2, [r1] - movh m3, [r1 + r4] - punpcklqdq m2, m3 - paddw m0, m2 - CLIPW m0, m4, m5 - - ; store recipred[] - movh [r3], m0 - movhps [r3 + r6], m0 - - ; store recqt[] - movh [r2], m0 - movhps [r2 + r5], m0 - - lea r0, [r0 + r4 * 2] - lea r1, [r1 + r4 * 2] - lea r2, [r2 + r5 * 2] - lea r3, [r3 + r6 * 2] - - dec t7b - jnz .loop - RET -%else ;HIGH_BIT_DEPTH - -%if ARCH_X86_64 == 1 -cglobal calcRecons4, 5,8,4 - %define t7b r7b -%else -cglobal calcRecons4, 5,7,4,0-1 - %define t7b byte [rsp] -%endif - mov r4d, r4m - mov r5d, r5m - mov r6d, r6m - add r5d, r5d - - pxor m0, m0 - mov t7b, 4/2 -.loop: - movd m1, [r0] - movd m2, [r0 + r4] - punpckldq m1, m2 - punpcklbw m1, m0 - movh m2, [r1] - movh m3, [r1 + r4 * 2] - punpcklqdq m2, m3 - paddw m1, m2 - packuswb m1, m1 - - ; store recon[] and recipred[] - movd [r3], m1 - pshufd m2, m1, 1 - movd [r3 + r6], m2 - - ; store recqt[] - punpcklbw m1, m0 - movh [r2], m1 - movhps [r2 + r5], m1 - - lea r0, [r0 + r4 * 2] - lea r1, [r1 + r4 * 4] - lea r2, [r2 + r5 * 2] - lea r3, [r3 + r6 * 2] - - dec t7b - jnz .loop - RET -%endif ;HIGH_BIT_DEPTH - - -INIT_XMM sse2 -%if ARCH_X86_64 == 1 -cglobal calcRecons8, 5,8,4 - %define t7b r7b -%else -cglobal calcRecons8, 5,7,4,0-1 - %define t7b byte [rsp] -%endif - -%if HIGH_BIT_DEPTH - mov r4d, r4m - mov r5d, r5m - mov r6d, r6m - add r4d, r4d - add r5d, r5d - add r6d, r6d - - pxor m4, m4 - mova m5, [pw_pixel_max] - mov t7b, 8/2 -.loop: - movu m0, [r0] - movu m1, [r0 + r4] - movu m2, [r1] - movu m3, [r1 + r4] - paddw m0, m2 - paddw m1, m3 - CLIPW2 m0, m1, m4, m5 - - ; store recipred[] - movu [r3], m0 - movu [r3 + r6], m1 - - ; store recqt[] - movu [r2], m0 - movu [r2 + r5], m1 - - lea r0, [r0 + r4 * 2] - lea r1, [r1 + r4 * 2] - lea r2, [r2 + r5 * 2] - lea r3, [r3 + r6 * 2] - - dec t7b - jnz .loop - RET -%else ;HIGH_BIT_DEPTH - - mov r4d, r4m - mov r5d, r5m - mov r6d, r6m - add r5d, r5d - - pxor m0, m0 - mov t7b, 8/2 -.loop: - movh m1, [r0] - movh m2, [r0 + r4] - punpcklbw m1, m0 - punpcklbw m2, m0 - movu m3, [r1] - movu m4, [r1 + r4 * 2] - paddw m1, m3 - paddw m2, m4 - packuswb m1, m2 - - ; store recon[] and recipred[] - movh [r3], m1 - movhps [r3 + r6], m1 - - ; store recqt[] - punpcklbw m2, m1, m0 - punpckhbw m1, m0 - movu [r2], m2 - movu [r2 + r5], m1 - - lea r0, [r0 + r4 * 2] - lea r1, [r1 + r4 * 4] - lea r2, [r2 + r5 * 2] - lea r3, [r3 + r6 * 2] - - dec t7b - jnz .loop - RET -%endif ;HIGH_BIT_DEPTH - - - -%if HIGH_BIT_DEPTH -INIT_XMM sse2 -%if ARCH_X86_64 == 1 -cglobal calcRecons16, 5,8,4 - %define t7b r7b -%else -cglobal calcRecons16, 5,7,4,0-1 - %define t7b byte [rsp] -%endif - - mov r4d, r4m - mov r5d, r5m - mov r6d, r6m - add r4d, r4d - add r5d, r5d - add r6d, r6d - - pxor m4, m4 - mova m5, [pw_pixel_max] - mov t7b, 16/2 -.loop: - movu m0, [r0] - movu m1, [r0 + 16] - movu m2, [r1] - movu m3, [r1 + 16] - paddw m0, m2 - paddw m1, m3 - CLIPW2 m0, m1, m4, m5 - - ; store recipred[] - movu [r3], m0 - movu [r3 + 16], m1 - - ; store recqt[] - movu [r2], m0 - movu [r2 + 16], m1 - - movu m0, [r0 + r4] - movu m1, [r0 + r4 + 16] - movu m2, [r1 + r4] - movu m3, [r1 + r4 + 16] - paddw m0, m2 - paddw m1, m3 - CLIPW2 m0, m1, m4, m5 - - ; store recon[] and recipred[] - movu [r3 + r6], m0 - movu [r3 + r6 + 16], m1 - - ; store recqt[] - movu [r2 + r5], m0 - movu [r2 + r5 + 16], m1 - - lea r0, [r0 + r4 * 2] - lea r1, [r1 + r4 * 2] - lea r2, [r2 + r5 * 2] - lea r3, [r3 + r6 * 2] - - dec t7b - jnz .loop - RET -%else ;HIGH_BIT_DEPTH - -INIT_XMM sse4 -%if ARCH_X86_64 == 1 -cglobal calcRecons16, 5,8,4 - %define t7b r7b -%else -cglobal calcRecons16, 5,7,4,0-1 - %define t7b byte [rsp] -%endif - - mov r4d, r4m - mov r5d, r5m - mov r6d, r6m - add r5d, r5d - - pxor m0, m0 - mov t7b, 16 -.loop: - movu m2, [r0] - pmovzxbw m1, m2 - punpckhbw m2, m0 - paddw m1, [r1] - paddw m2, [r1 + 16] - packuswb m1, m2 - - ; store recon[] and recipred[] - movu [r3], m1 - - ; store recqt[] - pmovzxbw m2, m1 - punpckhbw m1, m0 - movu [r2], m2 - movu [r2 + 16], m1 - - add r2, r5 - add r3, r6 - add r0, r4 - lea r1, [r1 + r4 * 2] - - dec t7b - jnz .loop - RET -%endif ;HIGH_BIT_DEPTH - -%if HIGH_BIT_DEPTH -INIT_XMM sse2 -%if ARCH_X86_64 == 1 -cglobal calcRecons32, 5,8,4 - %define t7b r7b -%else -cglobal calcRecons32, 5,7,4,0-1 - %define t7b byte [rsp] -%endif - - mov r4d, r4m - mov r5d, r5m - mov r6d, r6m - add r4d, r4d - add r5d, r5d - add r6d, r6d - - pxor m4, m4 - mova m5, [pw_pixel_max] - mov t7b, 32/2 -.loop: - - movu m0, [r0] - movu m1, [r0 + 16] - movu m2, [r1] - movu m3, [r1 + 16] - paddw m0, m2 - paddw m1, m3 - CLIPW2 m0, m1, m4, m5 - - ; store recipred[] - movu [r3], m0 - movu [r3 + 16], m1 - - ; store recqt[] - movu [r2], m0 - movu [r2 + 16], m1 - - movu m0, [r0 + 32] - movu m1, [r0 + 48] - movu m2, [r1 + 32] - movu m3, [r1 + 48] - paddw m0, m2 - paddw m1, m3 - CLIPW2 m0, m1, m4, m5 - - ; store recon[] and recipred[] - movu [r3 + 32], m0 - movu [r3 + 48], m1 - - ; store recqt[] - movu [r2 + 32], m0 - movu [r2 + 48], m1 - add r2, r5 - - movu m0, [r0 + r4] - movu m1, [r0 + r4 + 16] - movu m2, [r1 + r4] - movu m3, [r1 + r4 + 16] - paddw m0, m2 - paddw m1, m3 - CLIPW2 m0, m1, m4, m5 - - ; store recon[] and recipred[] - movu [r3 + r6], m0 - movu [r3 + r6 + 16], m1 - - ; store recqt[] - movu [r2], m0 - movu [r2 + 16], m1 - - movu m0, [r0 + r4 + 32] - movu m1, [r0 + r4 + 48] - movu m2, [r1 + r4 + 32] - movu m3, [r1 + r4 + 48] - paddw m0, m2 - paddw m1, m3 - CLIPW2 m0, m1, m4, m5 - - ; store recon[] and recipred[] - movu [r3 + r6 + 32], m0 - movu [r3 + r6 + 48], m1 - lea r3, [r3 + r6 * 2] - - ; store recqt[] - movu [r2 + 32], m0 - movu [r2 + 48], m1 - add r2, r5 - - lea r0, [r0 + r4 * 2] - lea r1, [r1 + r4 * 2] - - dec t7b - jnz .loop - RET -%else ;HIGH_BIT_DEPTH -INIT_XMM sse4 -%if ARCH_X86_64 == 1 -cglobal calcRecons32, 5,8,4 - %define t7b r7b -%else -cglobal calcRecons32, 5,7,4,0-1 - %define t7b byte [rsp] -%endif - - mov r4d, r4m - mov r5d, r5m - mov r6d, r6m - add r5d, r5d - - pxor m0, m0 - mov t7b, 32 -.loop: - movu m2, [r0] - movu m4, [r0 + 16] - pmovzxbw m1, m2 - punpckhbw m2, m0 - pmovzxbw m3, m4 - punpckhbw m4, m0 - - paddw m1, [r1 + 0 * 16] - paddw m2, [r1 + 1 * 16] - packuswb m1, m2 - - paddw m3, [r1 + 2 * 16] - paddw m4, [r1 + 3 * 16] - packuswb m3, m4 - - ; store recon[] and recipred[] - movu [r3], m1 - movu [r3 + 16], m3 - - ; store recqt[] - pmovzxbw m2, m1 - punpckhbw m1, m0 - movu [r2 + 0 * 16], m2 - movu [r2 + 1 * 16], m1 - pmovzxbw m4, m3 - punpckhbw m3, m0 - movu [r2 + 2 * 16], m4 - movu [r2 + 3 * 16], m3 - - add r2, r5 - add r3, r6 - add r0, r4 - lea r1, [r1 + r4 * 2] - - dec t7b - jnz .loop - RET -%endif ;HIGH_BIT_DEPTH - ;----------------------------------------------------------------------------- ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride) @@ -861,7 +420,7 @@ cglobal getResidual32, 4,5,7 ;----------------------------------------------------------------------------- -; uint32_t quant(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff); +; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal quant, 5,6,8 @@ -883,7 +442,7 @@ cglobal quant, 5,6,8 pxor m7, m7 ; m7 = numZero .loop: ; 4 coeff - movu m0, [r0] ; m0 = level + pmovsxwd m0, [r0] ; m0 = level pabsd m1, m0 pmulld m1, [r1] ; m0 = tmpLevel1 paddd m2, m1, m5 @@ -901,7 +460,7 @@ cglobal quant, 5,6,8 movh [r3], m3 ; 4 coeff - movu m0, [r0 + 16] ; m0 = level + pmovsxwd m0, [r0 + 8] ; m0 = level pabsd m1, m0 pmulld m1, [r1 + 16] ; m0 = tmpLevel1 paddd m2, m1, m5 @@ -916,7 +475,7 @@ cglobal quant, 5,6,8 packssdw m3, m3 movh [r3 + 8], m3 - add r0, 32 + add r0, 16 add r1, 32 add r2, 32 add r3, 16 @@ -953,7 +512,7 @@ cglobal quant, 5,5,10 pxor m7, m7 ; m7 = numZero .loop: ; 8 coeff - movu m0, [r0] ; m0 = level + pmovsxwd m0, [r0] ; m0 = level pabsd m1, m0 pmulld m1, [r1] ; m0 = tmpLevel1 paddd m2, m1, m5 @@ -966,7 +525,7 @@ cglobal quant, 5,5,10 psignd m2, m0 ; 8 coeff - movu m0, [r0 + mmsize] ; m0 = level + pmovsxwd m0, [r0 + mmsize/2] ; m0 = level pabsd m1, m0 pmulld m1, [r1 + mmsize] ; m0 = tmpLevel1 paddd m3, m1, m5 @@ -987,7 +546,7 @@ cglobal quant, 5,5,10 pminuw m2, m9 paddw m7, m2 - add r0, mmsize*2 + add r0, mmsize add r1, mmsize*2 add r2, mmsize*2 add r3, mmsize @@ -1025,7 +584,7 @@ cglobal quant, 5,6,8 pxor m7, m7 ; m7 = numZero .loop: ; 8 coeff - movu m0, [r0] ; m0 = level + pmovsxwd m0, [r0] ; m0 = level pabsd m1, m0 pmulld m1, [r1] ; m0 = tmpLevel1 paddd m2, m1, m5 @@ -1044,7 +603,7 @@ cglobal quant, 5,6,8 movu [r3], xm3 ; 8 coeff - movu m0, [r0 + mmsize] ; m0 = level + pmovsxwd m0, [r0 + mmsize/2] ; m0 = level pabsd m1, m0 pmulld m1, [r1 + mmsize] ; m0 = tmpLevel1 paddd m2, m1, m5 @@ -1062,7 +621,7 @@ cglobal quant, 5,6,8 vpermq m3, m3, q0020 movu [r3 + mmsize/2], xm3 - add r0, mmsize*2 + add r0, mmsize add r1, mmsize*2 add r2, mmsize*2 add r3, mmsize @@ -1083,7 +642,7 @@ IACA_END ;----------------------------------------------------------------------------- -; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff); +; uint32_t nquant(int16_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff); ;----------------------------------------------------------------------------- INIT_XMM sse4 cglobal nquant, 3,5,8 @@ -1096,8 +655,8 @@ cglobal nquant, 3,5,8 shr r4d, 3 .loop: - movu m0, [r0] ; m0 = level - movu m1, [r0 + 16] ; m1 = level + pmovsxwd m0, [r0] ; m0 = level + pmovsxwd m1, [r0 + 8] ; m1 = level pabsd m2, m0 pmulld m2, [r1] ; m0 = tmpLevel1 * qcoeff @@ -1114,7 +673,7 @@ cglobal nquant, 3,5,8 packssdw m2, m3 movu [r2], m2 - add r0, 32 + add r0, 16 add r1, 32 add r2, 16 @@ -1144,14 +703,14 @@ cglobal nquant, 3,5,7 shr r4d, 4 .loop: - movu m0, [r0] ; m0 = level + pmovsxwd m0, [r0] ; m0 = level pabsd m1, m0 pmulld m1, [r1] ; m0 = tmpLevel1 * qcoeff paddd m1, m4 psrad m1, xm3 ; m0 = level1 psignd m1, m0 - movu m0, [r0 + mmsize] ; m0 = level + pmovsxwd m0, [r0 + mmsize/2] ; m0 = level pabsd m2, m0 pmulld m2, [r1 + mmsize] ; m0 = tmpLevel1 * qcoeff paddd m2, m4 @@ -1162,7 +721,7 @@ cglobal nquant, 3,5,7 vpermq m2, m1, q3120 movu [r2], m2 - add r0, mmsize * 2 + add r0, mmsize add r1, mmsize * 2 add r2, mmsize @@ -1211,15 +770,11 @@ cglobal dequant_normal, 5,5,5 pmaddwd m4, m1 psrad m3, m0 psrad m4, m0 - packssdw m3, m3 ; OPT_ME: store must be 32 bits - pmovsxwd m3, m3 - packssdw m4, m4 - pmovsxwd m4, m4 + packssdw m3, m4 mova [r1], m3 - mova [r1 + 16], m4 add r0, 16 - add r1, 32 + add r1, 16 sub r2d, 8 jnz .loop @@ -1259,13 +814,12 @@ cglobal dequant_normal, 5,5,7 pmaxsd m3, m6 pminsd m4, m5 pmaxsd m4, m6 + packssdw m3, m4 mova [r1 + 0 * mmsize/2], xm3 - mova [r1 + 1 * mmsize/2], xm4 - vextracti128 [r1 + 2 * mmsize/2], m3, 1 - vextracti128 [r1 + 3 * mmsize/2], m4, 1 + vextracti128 [r1 + 1 * mmsize/2], m3, 1 add r0, mmsize - add r1, mmsize * 2 + add r1, mmsize dec r2d jnz .loop