- movu m1, [r1 + 4 * mmsize]
- movu m2, [r1 + 5 * mmsize]
- movu m3, [r1 + 6 * mmsize]
- movu m4, [r1 + 7 * mmsize]
- psllw m1, m0
- psllw m2, m0
- psllw m3, m0
- psllw m4, m0
- movu [r0 + r2 * 2], m1
- lea r0, [r0 + 2 * r2]
- movu [r0 + r2], m2
- movu [r0 + 2 * r2], m3
- lea r0, [r0 + 2 * r2]
- movu [r0 + r2], m4
+ mova m2, [r1 + 4 * mmsize]
+ mova m3, [r1 + 5 * mmsize]
+ mova m4, [r1 + 6 * mmsize]
+ mova m5, [r1 + 7 * mmsize]
+ lea r0, [r0 + r2 * 4]
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+ psraw m2, m0
+ psraw m3, m0
+ psraw m4, m0
+ psraw m5, m0
+ mova [r0], m2
+ mova [r0 + r2], m3
+ mova [r0 + r2 * 2], m4
+ mova [r0 + r3], m5
+ RET
+
+
+INIT_YMM avx2
+cglobal cpy1Dto2D_shr_8, 3, 4, 4
+ add r2d, r2d
+ movd xm0, r3m
+ pcmpeqw m1, m1
+ psllw m1, xm0
+ psraw m1, 1
+ lea r3, [r2 * 3]
+
+ ; Row 0-3
+ movu m2, [r1 + 0 * mmsize]
+ movu m3, [r1 + 1 * mmsize]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, xm0
+ psraw m3, xm0
+ movu [r0], xm2
+ vextracti128 [r0 + r2], m2, 1
+ movu [r0 + r2 * 2], xm3
+ vextracti128 [r0 + r3], m3, 1
+
+ ; Row 4-7
+ movu m2, [r1 + 2 * mmsize]
+ movu m3, [r1 + 3 * mmsize]
+ lea r0, [r0 + r2 * 4]
+ psubw m2, m1
+ psubw m3, m1
+ psraw m2, xm0
+ psraw m3, xm0
+ movu [r0], xm2
+ vextracti128 [r0 + r2], m2, 1
+ movu [r0 + r2 * 2], xm3
+ vextracti128 [r0 + r3], m3, 1