-;-----------------------------------------------------------------------------
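-; void calcrecon(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
-; Provided below as calcRecons4, calcRecons8, calcRecons16 and calcRecons32 (one entry point per block width).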
-;-----------------------------------------------------------------------------
-INIT_XMM sse2
-%if HIGH_BIT_DEPTH
-; calcRecons4, HIGH_BIT_DEPTH build: 4x4 block of 16-bit samples.
-;   r0 = pred, r1 = residual, r2 = reconqt, r3 = reconipred
-;   r4 = stride (used for both pred and residual), r5 = strideqt, r6 = strideipred
-; Each output sample is pred + residual run through CLIPW with the bounds
-; 0 (m4) and pw_pixel_max (m5); the same vector is stored to reconipred and
-; to reconqt.
-; Strides are fetched with 32-bit moves, which zero-extend on x86-64, so
-; they are assumed to be non-negative.
-; XMM count is 6: m0-m3 carry data, m4/m5 carry the clip bounds.
-; On x86-64 the row counter t7b is r7b; on x86-32 only 7 GPRs are requested
-; and t7b is a single stack byte at [rsp].
-%if ARCH_X86_64 == 1
-cglobal calcRecons4, 5,8,6
-    %define t7b r7b
-%else
-cglobal calcRecons4, 5,7,6,0-1
-    %define t7b byte [rsp]
-%endif
-    mov         r4d, r4m
-    mov         r5d, r5m
-    mov         r6d, r6m
-    add         r4d, r4d                ; strides doubled: 2 bytes per sample
-    add         r5d, r5d
-    add         r6d, r6d
-
-    pxor        m4, m4                  ; lower clip bound
-    mova        m5, [pw_pixel_max]      ; upper clip bound
-    mov         t7b, 4/2                ; two rows are handled per iteration
-.loop:
-    movh        m0, [r0]                ; pred row 0 (8 bytes = 4 samples)
-    movh        m1, [r0 + r4]           ; pred row 1
-    punpcklqdq  m0, m1                  ; m0 = pred row 0 | pred row 1
-    movh        m2, [r1]                ; residual row 0
-    movh        m3, [r1 + r4]           ; residual row 1
-    punpcklqdq  m2, m3                  ; m2 = residual row 0 | residual row 1
-    paddw       m0, m2
-    CLIPW       m0, m4, m5
-
-    ; store recipred[]
-    movh        [r3], m0
-    movhps      [r3 + r6], m0
-
-    ; store recqt[]
-    movh        [r2], m0
-    movhps      [r2 + r5], m0
-
-    ; advance all four pointers by two rows
-    lea         r0, [r0 + r4 * 2]
-    lea         r1, [r1 + r4 * 2]
-    lea         r2, [r2 + r5 * 2]
-    lea         r3, [r3 + r6 * 2]
-
-    dec         t7b
-    jnz         .loop
-    RET
-%else ;HIGH_BIT_DEPTH
-
-; calcRecons4, 8-bit build: 4x4 block.
-;   r0 = pred (bytes), r1 = residual (16-bit), r2 = reconqt (16-bit),
-;   r3 = reconipred (bytes), r4 = stride, r5 = strideqt, r6 = strideipred
-; Two rows are processed per loop pass: pred is widened to words, the
-; residual is added, and packuswb saturates the sums back to unsigned bytes.
-; The bytes go to reconipred; the same values, widened to words again, go
-; to reconqt.
-; r4 is used unscaled for pred and scaled by 2 for residual; r5 is doubled
-; up front because reconqt holds 16-bit samples.
-%if ARCH_X86_64 == 1
-cglobal calcRecons4, 5,8,4
-    %define t7b r7b
-%else
-cglobal calcRecons4, 5,7,4,0-1
-    %define t7b byte [rsp]
-%endif
-    mov         r6d, r6m
-    mov         r5d, r5m
-    mov         r4d, r4m
-    add         r5d, r5d                ; reconqt stride in bytes
-
-    pxor        m0, m0                  ; zero, used for byte -> word widening
-    mov         t7b, 4/2
-.loop:
-    movd        m1, [r0]                ; pred row 0, 4 bytes
-    movd        m3, [r0 + r4]           ; pred row 1, 4 bytes
-    movh        m2, [r1]                ; residual row 0, 4 words
-    punpckldq   m1, m3                  ; m1 = 8 pred bytes (row 0, row 1)
-    movh        m3, [r1 + r4 * 2]       ; residual row 1, 4 words
-    punpcklbw   m1, m0                  ; pred as 8 words
-    punpcklqdq  m2, m3                  ; residual as 8 words
-    paddw       m1, m2
-    packuswb    m1, m1                  ; saturate to bytes; row 0 in dword 0, row 1 in dword 1
-
-    ; reconipred rows
-    pshufd      m2, m1, 1               ; bring row 1 down to dword 0
-    movd        [r3], m1
-    movd        [r3 + r6], m2
-
-    ; reconqt rows (bytes widened back to words)
-    punpcklbw   m1, m0
-    movh        [r2], m1
-    movhps      [r2 + r5], m1
-
-    ; step every pointer forward by two rows
-    lea         r3, [r3 + r6 * 2]
-    lea         r2, [r2 + r5 * 2]
-    lea         r1, [r1 + r4 * 4]
-    lea         r0, [r0 + r4 * 2]
-
-    dec         t7b
-    jnz         .loop
-    RET
-%endif ;HIGH_BIT_DEPTH
-
-
-; calcRecons8: 8x8 block, two rows per loop pass.
-;   r0 = pred, r1 = residual, r2 = reconqt, r3 = reconipred
-;   r4 = stride, r5 = strideqt, r6 = strideipred
-; The prologue is shared by both bit-depth variants, so the XMM count is 6
-; to cover the larger of the two (HIGH_BIT_DEPTH uses m0-m5, 8-bit uses m0-m4).
-; Strides are fetched with 32-bit moves, which zero-extend on x86-64, so
-; they are assumed to be non-negative.
-INIT_XMM sse2
-%if ARCH_X86_64 == 1
-cglobal calcRecons8, 5,8,6
-    %define t7b r7b
-%else
-cglobal calcRecons8, 5,7,6,0-1
-    %define t7b byte [rsp]
-%endif
-
-%if HIGH_BIT_DEPTH
-    ; 16-bit samples: one 16-byte load is one full row of 8 samples.
-    mov         r4d, r4m
-    mov         r5d, r5m
-    mov         r6d, r6m
-    add         r4d, r4d                ; strides doubled: 2 bytes per sample
-    add         r5d, r5d
-    add         r6d, r6d
-
-    pxor        m4, m4                  ; lower clip bound
-    mova        m5, [pw_pixel_max]      ; upper clip bound
-    mov         t7b, 8/2
-.loop:
-    movu        m0, [r0]                ; pred row 0
-    movu        m1, [r0 + r4]           ; pred row 1
-    movu        m2, [r1]                ; residual row 0
-    movu        m3, [r1 + r4]           ; residual row 1
-    paddw       m0, m2
-    paddw       m1, m3
-    CLIPW2      m0, m1, m4, m5
-
-    ; store recipred[]
-    movu        [r3], m0
-    movu        [r3 + r6], m1
-
-    ; store recqt[]
-    movu        [r2], m0
-    movu        [r2 + r5], m1
-
-    lea         r0, [r0 + r4 * 2]
-    lea         r1, [r1 + r4 * 2]
-    lea         r2, [r2 + r5 * 2]
-    lea         r3, [r3 + r6 * 2]
-
-    dec         t7b
-    jnz         .loop
-    RET
-%else ;HIGH_BIT_DEPTH
-
-    ; 8-bit samples: pred/reconipred are bytes, residual/reconqt are words.
-    mov         r4d, r4m
-    mov         r5d, r5m
-    mov         r6d, r6m
-    add         r5d, r5d                ; reconqt stride in bytes
-
-    pxor        m0, m0                  ; zero, used for byte -> word widening
-    mov         t7b, 8/2
-.loop:
-    movh        m1, [r0]                ; pred row 0, 8 bytes
-    movh        m2, [r0 + r4]           ; pred row 1, 8 bytes
-    punpcklbw   m1, m0
-    punpcklbw   m2, m0
-    movu        m3, [r1]                ; residual row 0
-    movu        m4, [r1 + r4 * 2]       ; residual row 1
-    paddw       m1, m3
-    paddw       m2, m4
-    packuswb    m1, m2                  ; low half = row 0, high half = row 1
-
-    ; store recipred[]
-    movh        [r3], m1
-    movhps      [r3 + r6], m1
-
-    ; store recqt[] (saturated bytes widened back to words)
-    punpcklbw   m2, m1, m0
-    punpckhbw   m1, m0
-    movu        [r2], m2
-    movu        [r2 + r5], m1
-
-    lea         r0, [r0 + r4 * 2]
-    lea         r1, [r1 + r4 * 4]
-    lea         r2, [r2 + r5 * 2]
-    lea         r3, [r3 + r6 * 2]
-
-    dec         t7b
-    jnz         .loop
-    RET
-%endif ;HIGH_BIT_DEPTH
-
-
-
-%if HIGH_BIT_DEPTH
-; calcRecons16, HIGH_BIT_DEPTH build: 16x16 block of 16-bit samples.
-;   r0 = pred, r1 = residual, r2 = reconqt, r3 = reconipred
-;   r4 = stride (used for both pred and residual), r5 = strideqt, r6 = strideipred
-; One row is 32 bytes (two XMM vectors); each loop pass handles two rows.
-; Each output sample is pred + residual run through CLIPW2 with the bounds
-; 0 (m4) and pw_pixel_max (m5), stored to both reconipred and reconqt.
-; Strides are fetched with 32-bit moves, which zero-extend on x86-64, so
-; they are assumed to be non-negative.
-; XMM count is 6: m0-m3 carry data, m4/m5 carry the clip bounds.
-INIT_XMM sse2
-%if ARCH_X86_64 == 1
-cglobal calcRecons16, 5,8,6
-    %define t7b r7b
-%else
-cglobal calcRecons16, 5,7,6,0-1
-    %define t7b byte [rsp]
-%endif
-
-    mov         r4d, r4m
-    mov         r5d, r5m
-    mov         r6d, r6m
-    add         r4d, r4d                ; strides doubled: 2 bytes per sample
-    add         r5d, r5d
-    add         r6d, r6d
-
-    pxor        m4, m4                  ; lower clip bound
-    mova        m5, [pw_pixel_max]      ; upper clip bound
-    mov         t7b, 16/2
-.loop:
-    ; first row of the pair
-    movu        m0, [r0]
-    movu        m1, [r0 + 16]
-    movu        m2, [r1]
-    movu        m3, [r1 + 16]
-    paddw       m0, m2
-    paddw       m1, m3
-    CLIPW2      m0, m1, m4, m5
-
-    ; store recipred[]
-    movu        [r3], m0
-    movu        [r3 + 16], m1
-
-    ; store recqt[]
-    movu        [r2], m0
-    movu        [r2 + 16], m1
-
-    ; second row of the pair
-    movu        m0, [r0 + r4]
-    movu        m1, [r0 + r4 + 16]
-    movu        m2, [r1 + r4]
-    movu        m3, [r1 + r4 + 16]
-    paddw       m0, m2
-    paddw       m1, m3
-    CLIPW2      m0, m1, m4, m5
-
-    ; store recipred[]
-    movu        [r3 + r6], m0
-    movu        [r3 + r6 + 16], m1
-
-    ; store recqt[]
-    movu        [r2 + r5], m0
-    movu        [r2 + r5 + 16], m1
-
-    lea         r0, [r0 + r4 * 2]
-    lea         r1, [r1 + r4 * 2]
-    lea         r2, [r2 + r5 * 2]
-    lea         r3, [r3 + r6 * 2]
-
-    dec         t7b
-    jnz         .loop
-    RET
-%else ;HIGH_BIT_DEPTH
-
-; calcRecons16, 8-bit build (SSE4): 16x16 block, one row per loop pass.
-;   r0 = pred (bytes), r1 = residual (16-bit), r2 = reconqt (16-bit),
-;   r3 = reconipred (bytes), r4 = stride, r5 = strideqt, r6 = strideipred
-; The 16 pred bytes are widened to two word vectors, the residual is added,
-; and packuswb saturates the sums back to unsigned bytes for reconipred.
-; Those bytes are widened once more and written to reconqt.
-; NOTE: the residual is consumed as a paddw memory operand; in this non-VEX
-; SSE encoding that requires [r1] and [r1 + 16] to be 16-byte aligned.
-INIT_XMM sse4
-%if ARCH_X86_64 == 1
-cglobal calcRecons16, 5,8,4
-    %define t7b r7b
-%else
-cglobal calcRecons16, 5,7,4,0-1
-    %define t7b byte [rsp]
-%endif
-
-    mov         r6d, r6m
-    mov         r5d, r5m
-    mov         r4d, r4m
-    add         r5d, r5d                ; reconqt stride in bytes
-
-    pxor        m0, m0                  ; zero, used for the high-half unpack
-    mov         t7b, 16                 ; row counter
-.loop:
-    movu        m3, [r0]                ; 16 pred bytes
-    pmovzxbw    m1, m3                  ; low 8 bytes -> words
-    punpckhbw   m3, m0                  ; high 8 bytes -> words
-    paddw       m1, [r1]
-    paddw       m3, [r1 + 16]
-    packuswb    m1, m3                  ; 16 saturated result bytes
-
-    ; reconipred row
-    movu        [r3], m1
-
-    ; reconqt row (result bytes widened to words)
-    pmovzxbw    m3, m1
-    punpckhbw   m1, m0
-    movu        [r2], m3
-    movu        [r2 + 16], m1
-
-    ; step every pointer forward by one row
-    add         r0, r4
-    lea         r1, [r1 + r4 * 2]
-    add         r2, r5
-    add         r3, r6
-
-    dec         t7b
-    jnz         .loop
-    RET
-%endif ;HIGH_BIT_DEPTH
-
-%if HIGH_BIT_DEPTH
-; calcRecons32, HIGH_BIT_DEPTH build: 32x32 block of 16-bit samples.
-;   r0 = pred, r1 = residual, r2 = reconqt, r3 = reconipred
-;   r4 = stride (used for both pred and residual), r5 = strideqt, r6 = strideipred
-; One row is 64 bytes (four XMM vectors), processed as two 32-byte halves;
-; each loop pass handles two rows.
-; Each output sample is pred + residual run through CLIPW2 with the bounds
-; 0 (m4) and pw_pixel_max (m5), stored to both reconipred and reconqt.
-; r2 is stepped one row at a time inside the loop, while r3 is addressed as
-; [r3] / [r3 + r6] and stepped by two rows at once.
-; Strides are fetched with 32-bit moves, which zero-extend on x86-64, so
-; they are assumed to be non-negative.
-; XMM count is 6: m0-m3 carry data, m4/m5 carry the clip bounds.
-INIT_XMM sse2
-%if ARCH_X86_64 == 1
-cglobal calcRecons32, 5,8,6
-    %define t7b r7b
-%else
-cglobal calcRecons32, 5,7,6,0-1
-    %define t7b byte [rsp]
-%endif
-
-    mov         r4d, r4m
-    mov         r5d, r5m
-    mov         r6d, r6m
-    add         r4d, r4d                ; strides doubled: 2 bytes per sample
-    add         r5d, r5d
-    add         r6d, r6d
-
-    pxor        m4, m4                  ; lower clip bound
-    mova        m5, [pw_pixel_max]      ; upper clip bound
-    mov         t7b, 32/2
-.loop:
-
-    ; row 0, samples 0-15
-    movu        m0, [r0]
-    movu        m1, [r0 + 16]
-    movu        m2, [r1]
-    movu        m3, [r1 + 16]
-    paddw       m0, m2
-    paddw       m1, m3
-    CLIPW2      m0, m1, m4, m5
-
-    ; store recipred[]
-    movu        [r3], m0
-    movu        [r3 + 16], m1
-
-    ; store recqt[]
-    movu        [r2], m0
-    movu        [r2 + 16], m1
-
-    ; row 0, samples 16-31
-    movu        m0, [r0 + 32]
-    movu        m1, [r0 + 48]
-    movu        m2, [r1 + 32]
-    movu        m3, [r1 + 48]
-    paddw       m0, m2
-    paddw       m1, m3
-    CLIPW2      m0, m1, m4, m5
-
-    ; store recipred[]
-    movu        [r3 + 32], m0
-    movu        [r3 + 48], m1
-
-    ; store recqt[]
-    movu        [r2 + 32], m0
-    movu        [r2 + 48], m1
-    add         r2, r5                  ; r2 now points at row 1
-
-    ; row 1, samples 0-15
-    movu        m0, [r0 + r4]
-    movu        m1, [r0 + r4 + 16]
-    movu        m2, [r1 + r4]
-    movu        m3, [r1 + r4 + 16]
-    paddw       m0, m2
-    paddw       m1, m3
-    CLIPW2      m0, m1, m4, m5
-
-    ; store recipred[]
-    movu        [r3 + r6], m0
-    movu        [r3 + r6 + 16], m1
-
-    ; store recqt[]
-    movu        [r2], m0
-    movu        [r2 + 16], m1
-
-    ; row 1, samples 16-31
-    movu        m0, [r0 + r4 + 32]
-    movu        m1, [r0 + r4 + 48]
-    movu        m2, [r1 + r4 + 32]
-    movu        m3, [r1 + r4 + 48]
-    paddw       m0, m2
-    paddw       m1, m3
-    CLIPW2      m0, m1, m4, m5
-
-    ; store recipred[]
-    movu        [r3 + r6 + 32], m0
-    movu        [r3 + r6 + 48], m1
-    lea         r3, [r3 + r6 * 2]
-
-    ; store recqt[]
-    movu        [r2 + 32], m0
-    movu        [r2 + 48], m1
-    add         r2, r5
-
-    lea         r0, [r0 + r4 * 2]
-    lea         r1, [r1 + r4 * 2]
-
-    dec         t7b
-    jnz         .loop
-    RET
-%else ;HIGH_BIT_DEPTH
-; calcRecons32, 8-bit build (SSE4): 32x32 block, one row per loop pass.
-;   r0 = pred (bytes), r1 = residual (16-bit), r2 = reconqt (16-bit),
-;   r3 = reconipred (bytes), r4 = stride, r5 = strideqt, r6 = strideipred
-; The 32 pred bytes are widened to four word vectors, the residual is added,
-; and packuswb saturates the sums back to unsigned bytes for reconipred.
-; Those bytes are widened once more and written to reconqt.
-; NOTE: the residual is consumed as paddw memory operands; in this non-VEX
-; SSE encoding that requires [r1 + k*16] to be 16-byte aligned.
-; Strides are fetched with 32-bit moves, which zero-extend on x86-64, so
-; they are assumed to be non-negative.
-; XMM count is 5: m0 is the zero vector, m1-m4 carry data.
-INIT_XMM sse4
-%if ARCH_X86_64 == 1
-cglobal calcRecons32, 5,8,5
-    %define t7b r7b
-%else
-cglobal calcRecons32, 5,7,5,0-1
-    %define t7b byte [rsp]
-%endif
-
-    mov         r4d, r4m
-    mov         r5d, r5m
-    mov         r6d, r6m
-    add         r5d, r5d                ; reconqt stride in bytes
-
-    pxor        m0, m0                  ; zero, used for the high-half unpacks
-    mov         t7b, 32                 ; row counter
-.loop:
-    movu        m2, [r0]                ; pred bytes 0-15
-    movu        m4, [r0 + 16]           ; pred bytes 16-31
-    pmovzxbw    m1, m2
-    punpckhbw   m2, m0
-    pmovzxbw    m3, m4
-    punpckhbw   m4, m0
-
-    paddw       m1, [r1 + 0 * 16]
-    paddw       m2, [r1 + 1 * 16]
-    packuswb    m1, m2                  ; result bytes 0-15
-
-    paddw       m3, [r1 + 2 * 16]
-    paddw       m4, [r1 + 3 * 16]
-    packuswb    m3, m4                  ; result bytes 16-31
-
-    ; store recipred[]
-    movu        [r3], m1
-    movu        [r3 + 16], m3
-
-    ; store recqt[] (result bytes widened back to words)
-    pmovzxbw    m2, m1
-    punpckhbw   m1, m0
-    movu        [r2 + 0 * 16], m2
-    movu        [r2 + 1 * 16], m1
-    pmovzxbw    m4, m3
-    punpckhbw   m3, m0
-    movu        [r2 + 2 * 16], m4
-    movu        [r2 + 3 * 16], m3
-
-    add         r2, r5
-    add         r3, r6
-    add         r0, r4
-    lea         r1, [r1 + r4 * 2]       ; residual row is 2 bytes per sample
-
-    dec         t7b
-    jnz         .loop
-    RET
-%endif ;HIGH_BIT_DEPTH
-