cextern pd_32767
cextern pd_n32768
-;-----------------------------------------------------------------------------
-; void calcrecon(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
-;-----------------------------------------------------------------------------
-INIT_XMM sse2
-%if HIGH_BIT_DEPTH
-%if ARCH_X86_64 == 1
-cglobal calcRecons4, 5,8,4
- %define t7b r7b
-%else
-cglobal calcRecons4, 5,7,4,0-1
- %define t7b byte [rsp]
-%endif
- mov r4d, r4m ; r4 = stride (argument names follow the prototype comment above)
- mov r5d, r5m ; r5 = strideqt
- mov r6d, r6m ; r6 = strideipred
- add r4d, r4d ; double all three strides (presumably element -> byte offsets for 16-bit data; confirm)
- add r5d, r5d
- add r6d, r6d
-; HIGH_BIT_DEPTH path: each iteration sums two rows of pred (r0) and residual (r1), clamps with CLIPW, and stores the same result to r3 and r2
- pxor m4, m4 ; m4 = 0, passed to CLIPW (presumably the lower clamp bound)
- mova m5, [pw_pixel_max] ; m5 = pw_pixel_max, passed to CLIPW (presumably the upper clamp bound)
- mov t7b, 4/2 ; loop counter = 2 (a byte register on x86-64, a stack byte on x86-32)
-.loop:
- movh m0, [r0] ; 8 bytes of pred, first row
- movh m1, [r0 + r4] ; 8 bytes of pred, second row
- punpcklqdq m0, m1 ; m0 = first row | second row
- movh m2, [r1] ; 8 bytes of residual, first row
- movh m3, [r1 + r4] ; residual second row, addressed with the same r4 offset as pred
- punpcklqdq m2, m3 ; m2 = first row | second row
- paddw m0, m2 ; 16-bit add: pred + residual
- CLIPW m0, m4, m5 ; clamp via the CLIPW macro (defined outside this view)
-
- ; store recipred[] (r3): first row from the low half, second row from the high half
- movh [r3], m0
- movhps [r3 + r6], m0
-
- ; store recqt[] (r2): same clamped words as written to recipred[]
- movh [r2], m0
- movhps [r2 + r5], m0
-
- lea r0, [r0 + r4 * 2] ; advance all four pointers by two rows
- lea r1, [r1 + r4 * 2]
- lea r2, [r2 + r5 * 2]
- lea r3, [r3 + r6 * 2]
-
- dec t7b
- jnz .loop
- RET
-%else ;HIGH_BIT_DEPTH
-
-%if ARCH_X86_64 == 1
-cglobal calcRecons4, 5,8,4
- %define t7b r7b
-%else
-cglobal calcRecons4, 5,7,4,0-1
- %define t7b byte [rsp]
-%endif
- mov r4d, r4m ; r4 = stride (argument names follow the prototype comment above)
- mov r5d, r5m ; r5 = strideqt
- mov r6d, r6m ; r6 = strideipred
- add r5d, r5d ; only strideqt is doubled here (reconqt is int16_t* in the prototype comment)
-; 8-bit path: each iteration widens two 4-byte pred rows to words, adds residual, packs back to bytes for r3, and re-widens for r2
- pxor m0, m0 ; m0 = 0, supplies the zero bytes for punpcklbw widening
- mov t7b, 4/2 ; loop counter = 2 (a byte register on x86-64, a stack byte on x86-32)
-.loop:
- movd m1, [r0] ; 4 bytes of pred, first row
- movd m2, [r0 + r4] ; 4 bytes of pred, second row
- punpckldq m1, m2 ; low 8 bytes of m1 = first row | second row
- punpcklbw m1, m0 ; zero-extend the 8 bytes to 8 words
- movh m2, [r1] ; 8 bytes of residual, first row
- movh m3, [r1 + r4 * 2] ; residual second row (offset is r4 * 2)
- punpcklqdq m2, m3 ; m2 = first row | second row
- paddw m1, m2 ; 16-bit add: pred + residual
- packuswb m1, m1 ; pack words to bytes with unsigned saturation
-
- ; store recipred[] (r3); only the r3 stores are visible in this block
- movd [r3], m1 ; first row = dword 0
- pshufd m2, m1, 1 ; bring dword 1 (second row) into the low dword of m2
- movd [r3 + r6], m2
-
- ; store recqt[] (r2) as 16-bit words
- punpcklbw m1, m0 ; widen the saturated bytes back to words
- movh [r2], m1
- movhps [r2 + r5], m1
-
- lea r0, [r0 + r4 * 2] ; advance two rows
- lea r1, [r1 + r4 * 4] ; residual advances by r4 * 4 for two rows
- lea r2, [r2 + r5 * 2]
- lea r3, [r3 + r6 * 2]
-
- dec t7b
- jnz .loop
- RET
-%endif ;HIGH_BIT_DEPTH
-
-
-INIT_XMM sse2
-%if ARCH_X86_64 == 1
-cglobal calcRecons8, 5,8,4
- %define t7b r7b
-%else
-cglobal calcRecons8, 5,7,4,0-1
- %define t7b byte [rsp]
-%endif
-; calcRecons8: per iteration, adds two rows of residual (r1) to pred (r0) and stores the result to both r3 and r2; argument roles assume the calcrecon prototype comment earlier in this file applies
-%if HIGH_BIT_DEPTH
- mov r4d, r4m ; r4 = stride
- mov r5d, r5m ; r5 = strideqt
- mov r6d, r6m ; r6 = strideipred
- add r4d, r4d ; double all three strides (presumably element -> byte offsets for 16-bit data; confirm)
- add r5d, r5d
- add r6d, r6d
-
- pxor m4, m4 ; m4 = 0, passed to CLIPW2 (presumably the lower clamp bound)
- mova m5, [pw_pixel_max] ; m5 = pw_pixel_max, passed to CLIPW2 (presumably the upper clamp bound)
- mov t7b, 8/2 ; loop counter = 4, two rows per iteration
-.loop:
- movu m0, [r0] ; 16 bytes of pred, first row
- movu m1, [r0 + r4] ; 16 bytes of pred, second row
- movu m2, [r1] ; 16 bytes of residual, first row
- movu m3, [r1 + r4] ; residual second row, addressed with the same r4 offset as pred
- paddw m0, m2 ; 16-bit add: pred + residual
- paddw m1, m3
- CLIPW2 m0, m1, m4, m5 ; clamp both rows via the CLIPW2 macro (defined outside this view)
-
- ; store recipred[] (r3)
- movu [r3], m0
- movu [r3 + r6], m1
-
- ; store recqt[] (r2): same clamped words as written to recipred[]
- movu [r2], m0
- movu [r2 + r5], m1
-
- lea r0, [r0 + r4 * 2] ; advance all four pointers by two rows
- lea r1, [r1 + r4 * 2]
- lea r2, [r2 + r5 * 2]
- lea r3, [r3 + r6 * 2]
-
- dec t7b
- jnz .loop
- RET
-%else ;HIGH_BIT_DEPTH
-
- mov r4d, r4m ; r4 = stride
- mov r5d, r5m ; r5 = strideqt
- mov r6d, r6m ; r6 = strideipred
- add r5d, r5d ; only strideqt is doubled here (reconqt is int16_t* in the prototype comment)
-
- pxor m0, m0 ; m0 = 0, supplies the zero bytes for byte -> word widening
- mov t7b, 8/2 ; loop counter = 4, two rows per iteration
-.loop:
- movh m1, [r0] ; 8 bytes of pred, first row
- movh m2, [r0 + r4] ; 8 bytes of pred, second row
- punpcklbw m1, m0 ; zero-extend each row to 8 words
- punpcklbw m2, m0
- movu m3, [r1] ; 16 bytes of residual, first row
- movu m4, [r1 + r4 * 2] ; residual second row (offset is r4 * 2)
- paddw m1, m3 ; 16-bit add: pred + residual
- paddw m2, m4
- packuswb m1, m2 ; saturate to bytes: low half = first row, high half = second row
-
- ; store recipred[] (r3); only the r3 stores are visible in this block
- movh [r3], m1
- movhps [r3 + r6], m1
-
- ; store recqt[] (r2) as 16-bit words
- punpcklbw m2, m1, m0 ; m2 = low 8 bytes of m1 (first row) widened to words
- punpckhbw m1, m0 ; m1 = high 8 bytes (second row) widened to words
- movu [r2], m2
- movu [r2 + r5], m1
-
- lea r0, [r0 + r4 * 2] ; advance two rows
- lea r1, [r1 + r4 * 4] ; residual advances by r4 * 4 for two rows
- lea r2, [r2 + r5 * 2]
- lea r3, [r3 + r6 * 2]
-
- dec t7b
- jnz .loop
- RET
-%endif ;HIGH_BIT_DEPTH
-
-
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-%if ARCH_X86_64 == 1
-cglobal calcRecons16, 5,8,4
- %define t7b r7b
-%else
-cglobal calcRecons16, 5,7,4,0-1
- %define t7b byte [rsp]
-%endif
-; calcRecons16, HIGH_BIT_DEPTH build: per iteration, adds two 32-byte rows of residual (r1) to pred (r0), clamps with CLIPW2, and stores to both r3 and r2; argument roles assume the calcrecon prototype comment earlier in this file applies
- mov r4d, r4m ; r4 = stride
- mov r5d, r5m ; r5 = strideqt
- mov r6d, r6m ; r6 = strideipred
- add r4d, r4d ; double all three strides (presumably element -> byte offsets for 16-bit data; confirm)
- add r5d, r5d
- add r6d, r6d
-
- pxor m4, m4 ; m4 = 0, passed to CLIPW2 (presumably the lower clamp bound)
- mova m5, [pw_pixel_max] ; m5 = pw_pixel_max, passed to CLIPW2 (presumably the upper clamp bound)
- mov t7b, 16/2 ; loop counter = 8, two rows per iteration
-.loop:
- movu m0, [r0] ; first row of pred, bytes 0-15
- movu m1, [r0 + 16] ; first row of pred, bytes 16-31
- movu m2, [r1] ; first row of residual, bytes 0-15
- movu m3, [r1 + 16] ; first row of residual, bytes 16-31
- paddw m0, m2 ; 16-bit add: pred + residual
- paddw m1, m3
- CLIPW2 m0, m1, m4, m5 ; clamp via the CLIPW2 macro (defined outside this view)
-
- ; store recipred[] (r3), first row
- movu [r3], m0
- movu [r3 + 16], m1
-
- ; store recqt[] (r2), first row
- movu [r2], m0
- movu [r2 + 16], m1
-
- movu m0, [r0 + r4] ; second row of pred
- movu m1, [r0 + r4 + 16]
- movu m2, [r1 + r4] ; second row of residual, addressed with the same r4 offset as pred
- movu m3, [r1 + r4 + 16]
- paddw m0, m2
- paddw m1, m3
- CLIPW2 m0, m1, m4, m5
-
- ; store recipred[] (r3), second row; only the r3 stores are visible here
- movu [r3 + r6], m0
- movu [r3 + r6 + 16], m1
-
- ; store recqt[] (r2), second row
- movu [r2 + r5], m0
- movu [r2 + r5 + 16], m1
-
- lea r0, [r0 + r4 * 2] ; advance all four pointers by two rows
- lea r1, [r1 + r4 * 2]
- lea r2, [r2 + r5 * 2]
- lea r3, [r3 + r6 * 2]
-
- dec t7b
- jnz .loop
- RET
-%else ;HIGH_BIT_DEPTH
-
-INIT_XMM sse4
-%if ARCH_X86_64 == 1
-cglobal calcRecons16, 5,8,4
- %define t7b r7b
-%else
-cglobal calcRecons16, 5,7,4,0-1
- %define t7b byte [rsp]
-%endif
-; calcRecons16, 8-bit build: per iteration, widens one 16-byte pred row to words, adds residual, packs to bytes for r3, and re-widens for r2
- mov r4d, r4m ; r4 = stride
- mov r5d, r5m ; r5 = strideqt
- mov r6d, r6m ; r6 = strideipred
- add r5d, r5d ; only strideqt is doubled here (reconqt is int16_t* in the prototype comment)
-
- pxor m0, m0 ; m0 = 0, supplies the zero bytes for punpckhbw widening
- mov t7b, 16 ; loop counter = 16, one row per iteration
-.loop:
- movu m2, [r0] ; 16 bytes of pred
- pmovzxbw m1, m2 ; m1 = low 8 bytes zero-extended to words
- punpckhbw m2, m0 ; m2 = high 8 bytes zero-extended to words
- paddw m1, [r1] ; add residual words 0-7 straight from memory
- paddw m2, [r1 + 16] ; add residual words 8-15
- packuswb m1, m2 ; pack to 16 bytes with unsigned saturation
-
- ; store recipred[] (r3); only the r3 store is visible in this block
- movu [r3], m1
-
- ; store recqt[] (r2) as 16-bit words
- pmovzxbw m2, m1 ; low 8 bytes widened to words
- punpckhbw m1, m0 ; high 8 bytes widened to words
- movu [r2], m2
- movu [r2 + 16], m1
-
- add r2, r5 ; advance every pointer by one row
- add r3, r6
- add r0, r4
- lea r1, [r1 + r4 * 2] ; residual advances by r4 * 2 per row
-
- dec t7b
- jnz .loop
- RET
-%endif ;HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH
-INIT_XMM sse2
-%if ARCH_X86_64 == 1
-cglobal calcRecons32, 5,8,4
- %define t7b r7b
-%else
-cglobal calcRecons32, 5,7,4,0-1
- %define t7b byte [rsp]
-%endif
-; calcRecons32, HIGH_BIT_DEPTH build: per iteration, processes two 64-byte rows in 32-byte halves: pred (r0) + residual (r1), clamped with CLIPW2, stored to both r3 and r2; argument roles assume the calcrecon prototype comment earlier in this file applies
- mov r4d, r4m ; r4 = stride
- mov r5d, r5m ; r5 = strideqt
- mov r6d, r6m ; r6 = strideipred
- add r4d, r4d ; double all three strides (presumably element -> byte offsets for 16-bit data; confirm)
- add r5d, r5d
- add r6d, r6d
-
- pxor m4, m4 ; m4 = 0, passed to CLIPW2 (presumably the lower clamp bound)
- mova m5, [pw_pixel_max] ; m5 = pw_pixel_max, passed to CLIPW2 (presumably the upper clamp bound)
- mov t7b, 32/2 ; loop counter = 16, two rows per iteration
-.loop:
-
- movu m0, [r0] ; first row of pred, bytes 0-31
- movu m1, [r0 + 16]
- movu m2, [r1] ; first row of residual, bytes 0-31
- movu m3, [r1 + 16]
- paddw m0, m2 ; 16-bit add: pred + residual
- paddw m1, m3
- CLIPW2 m0, m1, m4, m5 ; clamp via the CLIPW2 macro (defined outside this view)
-
- ; store recipred[] (r3), first row, bytes 0-31
- movu [r3], m0
- movu [r3 + 16], m1
-
- ; store recqt[] (r2), first row, bytes 0-31
- movu [r2], m0
- movu [r2 + 16], m1
-
- movu m0, [r0 + 32] ; first row of pred, bytes 32-63
- movu m1, [r0 + 48]
- movu m2, [r1 + 32] ; first row of residual, bytes 32-63
- movu m3, [r1 + 48]
- paddw m0, m2
- paddw m1, m3
- CLIPW2 m0, m1, m4, m5
-
- ; store recipred[] (r3), first row, bytes 32-63; only the r3 stores are visible here
- movu [r3 + 32], m0
- movu [r3 + 48], m1
-
- ; store recqt[] (r2), first row, bytes 32-63
- movu [r2 + 32], m0
- movu [r2 + 48], m1
- add r2, r5 ; r2 steps to the second row now, so the stores below use plain [r2]
-
- movu m0, [r0 + r4] ; second row of pred, bytes 0-31
- movu m1, [r0 + r4 + 16]
- movu m2, [r1 + r4] ; second row of residual, addressed with the same r4 offset as pred
- movu m3, [r1 + r4 + 16]
- paddw m0, m2
- paddw m1, m3
- CLIPW2 m0, m1, m4, m5
-
- ; store recipred[] (r3), second row, bytes 0-31
- movu [r3 + r6], m0
- movu [r3 + r6 + 16], m1
-
- ; store recqt[] (r2), second row, bytes 0-31 (r2 was already advanced)
- movu [r2], m0
- movu [r2 + 16], m1
-
- movu m0, [r0 + r4 + 32] ; second row of pred, bytes 32-63
- movu m1, [r0 + r4 + 48]
- movu m2, [r1 + r4 + 32] ; second row of residual, bytes 32-63
- movu m3, [r1 + r4 + 48]
- paddw m0, m2
- paddw m1, m3
- CLIPW2 m0, m1, m4, m5
-
- ; store recipred[] (r3), second row, bytes 32-63
- movu [r3 + r6 + 32], m0
- movu [r3 + r6 + 48], m1
- lea r3, [r3 + r6 * 2] ; r3 advances two rows once its last store of the iteration is done
-
- ; store recqt[] (r2), second row, bytes 32-63
- movu [r2 + 32], m0
- movu [r2 + 48], m1
- add r2, r5 ; second single-row step for r2 in this iteration
-
- lea r0, [r0 + r4 * 2] ; pred and residual advance two rows
- lea r1, [r1 + r4 * 2]
-
- dec t7b
- jnz .loop
- RET
-%else ;HIGH_BIT_DEPTH
-INIT_XMM sse4
-%if ARCH_X86_64 == 1
-cglobal calcRecons32, 5,8,4
- %define t7b r7b
-%else
-cglobal calcRecons32, 5,7,4,0-1
- %define t7b byte [rsp]
-%endif
-; calcRecons32, 8-bit build: per iteration, widens one 32-byte pred row to words, adds residual, packs to bytes for r3, and re-widens for r2
- mov r4d, r4m ; r4 = stride
- mov r5d, r5m ; r5 = strideqt
- mov r6d, r6m ; r6 = strideipred
- add r5d, r5d ; only strideqt is doubled here (reconqt is int16_t* in the prototype comment)
-
- pxor m0, m0 ; m0 = 0, supplies the zero bytes for punpckhbw widening
- mov t7b, 32 ; loop counter = 32, one row per iteration
-.loop:
- movu m2, [r0] ; pred bytes 0-15
- movu m4, [r0 + 16] ; pred bytes 16-31
- pmovzxbw m1, m2 ; m1 = bytes 0-7 as words
- punpckhbw m2, m0 ; m2 = bytes 8-15 as words
- pmovzxbw m3, m4 ; m3 = bytes 16-23 as words
- punpckhbw m4, m0 ; m4 = bytes 24-31 as words
-
- paddw m1, [r1 + 0 * 16] ; add residual straight from memory, 8 words per load
- paddw m2, [r1 + 1 * 16]
- packuswb m1, m2 ; m1 = result bytes 0-15, unsigned-saturated
-
- paddw m3, [r1 + 2 * 16]
- paddw m4, [r1 + 3 * 16]
- packuswb m3, m4 ; m3 = result bytes 16-31, unsigned-saturated
-
- ; store recipred[] (r3); only the r3 stores are visible in this block
- movu [r3], m1
- movu [r3 + 16], m3
-
- ; store recqt[] (r2) as 16-bit words
- pmovzxbw m2, m1 ; re-widen bytes 0-7
- punpckhbw m1, m0 ; re-widen bytes 8-15
- movu [r2 + 0 * 16], m2
- movu [r2 + 1 * 16], m1
- pmovzxbw m4, m3 ; re-widen bytes 16-23
- punpckhbw m3, m0 ; re-widen bytes 24-31
- movu [r2 + 2 * 16], m4
- movu [r2 + 3 * 16], m3
-
- add r2, r5 ; advance every pointer by one row
- add r3, r6
- add r0, r4
- lea r1, [r1 + r4 * 2] ; residual advances by r4 * 2 per row
-
- dec t7b
- jnz .loop
- RET
-%endif ;HIGH_BIT_DEPTH
-
;-----------------------------------------------------------------------------
; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
;-----------------------------------------------------------------------------
-; uint32_t quant(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
+; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal quant, 5,6,8
pxor m7, m7 ; m7 = numZero
.loop:
; 4 coeff
- movu m0, [r0] ; m0 = level
+ pmovsxwd m0, [r0] ; m0 = level
pabsd m1, m0
pmulld m1, [r1] ; m0 = tmpLevel1
paddd m2, m1, m5
movh [r3], m3
; 4 coeff
- movu m0, [r0 + 16] ; m0 = level
+ pmovsxwd m0, [r0 + 8] ; m0 = level
pabsd m1, m0
pmulld m1, [r1 + 16] ; m0 = tmpLevel1
paddd m2, m1, m5
packssdw m3, m3
movh [r3 + 8], m3
- add r0, 32
+ add r0, 16
add r1, 32
add r2, 32
add r3, 16
pxor m7, m7 ; m7 = numZero
.loop:
; 8 coeff
- movu m0, [r0] ; m0 = level
+ pmovsxwd m0, [r0] ; m0 = level
pabsd m1, m0
pmulld m1, [r1] ; m0 = tmpLevel1
paddd m2, m1, m5
psignd m2, m0
; 8 coeff
- movu m0, [r0 + mmsize] ; m0 = level
+ pmovsxwd m0, [r0 + mmsize/2] ; m0 = level
pabsd m1, m0
pmulld m1, [r1 + mmsize] ; m0 = tmpLevel1
paddd m3, m1, m5
pminuw m2, m9
paddw m7, m2
- add r0, mmsize*2
+ add r0, mmsize
add r1, mmsize*2
add r2, mmsize*2
add r3, mmsize
pxor m7, m7 ; m7 = numZero
.loop:
; 8 coeff
- movu m0, [r0] ; m0 = level
+ pmovsxwd m0, [r0] ; m0 = level
pabsd m1, m0
pmulld m1, [r1] ; m0 = tmpLevel1
paddd m2, m1, m5
movu [r3], xm3
; 8 coeff
- movu m0, [r0 + mmsize] ; m0 = level
+ pmovsxwd m0, [r0 + mmsize/2] ; m0 = level
pabsd m1, m0
pmulld m1, [r1 + mmsize] ; m0 = tmpLevel1
paddd m2, m1, m5
vpermq m3, m3, q0020
movu [r3 + mmsize/2], xm3
- add r0, mmsize*2
+ add r0, mmsize
add r1, mmsize*2
add r2, mmsize*2
add r3, mmsize
;-----------------------------------------------------------------------------
-; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
+; uint32_t nquant(int16_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal nquant, 3,5,8
shr r4d, 3
.loop:
- movu m0, [r0] ; m0 = level
- movu m1, [r0 + 16] ; m1 = level
+ pmovsxwd m0, [r0] ; m0 = level
+ pmovsxwd m1, [r0 + 8] ; m1 = level
pabsd m2, m0
pmulld m2, [r1] ; m0 = tmpLevel1 * qcoeff
packssdw m2, m3
movu [r2], m2
- add r0, 32
+ add r0, 16
add r1, 32
add r2, 16
shr r4d, 4
.loop:
- movu m0, [r0] ; m0 = level
+ pmovsxwd m0, [r0] ; m0 = level
pabsd m1, m0
pmulld m1, [r1] ; m0 = tmpLevel1 * qcoeff
paddd m1, m4
psrad m1, xm3 ; m0 = level1
psignd m1, m0
- movu m0, [r0 + mmsize] ; m0 = level
+ pmovsxwd m0, [r0 + mmsize/2] ; m0 = level
pabsd m2, m0
pmulld m2, [r1 + mmsize] ; m0 = tmpLevel1 * qcoeff
paddd m2, m4
vpermq m2, m1, q3120
movu [r2], m2
- add r0, mmsize * 2
+ add r0, mmsize
add r1, mmsize * 2
add r2, mmsize
pmaddwd m4, m1
psrad m3, m0
psrad m4, m0
- packssdw m3, m3 ; OPT_ME: store must be 32 bits
- pmovsxwd m3, m3
- packssdw m4, m4
- pmovsxwd m4, m4
+ packssdw m3, m4
mova [r1], m3
- mova [r1 + 16], m4
add r0, 16
- add r1, 32
+ add r1, 16
sub r2d, 8
jnz .loop
pmaxsd m3, m6
pminsd m4, m5
pmaxsd m4, m6
+ packssdw m3, m4
mova [r1 + 0 * mmsize/2], xm3
- mova [r1 + 1 * mmsize/2], xm4
- vextracti128 [r1 + 2 * mmsize/2], m3, 1
- vextracti128 [r1 + 3 * mmsize/2], m4, 1
+ vextracti128 [r1 + 1 * mmsize/2], m3, 1
add r0, mmsize
- add r1, mmsize * 2
+ add r1, mmsize
dec r2d
jnz .loop