1 ;*****************************************************************************
2 ;* Copyright (C) 2013 x265 project
4 ;* Authors: Min Chen <chenm003@163.com>
5 ;* Nabajit Deka <nabajit@multicorewareinc.com>
6 ;* Praveen Kumar Tiwari <praveen@multicorewareinc.com>
8 ;* This program is free software; you can redistribute it and/or modify
9 ;* it under the terms of the GNU General Public License as published by
10 ;* the Free Software Foundation; either version 2 of the License, or
11 ;* (at your option) any later version.
13 ;* This program is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ;* GNU General Public License for more details.
18 ;* You should have received a copy of the GNU General Public License
19 ;* along with this program; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 ;* This program is also available under a commercial proprietary license.
23 ;* For more information, contact us at license @ x265.com.
24 ;*****************************************************************************/
27 %include "x86util.asm"
; ---- Shuffle-index and coefficient tables shared by the interpolation kernels ----
; tab_Tm: pshufb byte indices. Each group of four indices is a 4-byte window that
; slides right by one byte; rows are addressed as tab_Tm + 0/16/32.
30 tab_Tm: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
31 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
32 db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14
; interp4_vpp_shuf: byte pattern (emitted twice) that interleaves byte i with byte i+4
; inside each 8-byte half.
35 const interp4_vpp_shuf, times 2 db 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15
; interp4_vpp_shuf1: two rows of dword indices, used as vpermd control vectors.
38 const interp4_vpp_shuf1, dd 0, 1, 1, 2, 2, 3, 3, 4
39 dd 2, 3, 3, 4, 4, 5, 5, 6
; tab_Lm: pshufb byte indices forming 8-byte windows that slide by one byte
; (two windows per 16-byte row); used by FILTER_H8_W8 / FILTER_H8_W4.
42 tab_Lm: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8
43 db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10
44 db 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12
45 db 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14
; tab_Vm: row 0 replicates bytes 0..1 of the source register, row 1 replicates bytes 2..3.
47 tab_Vm: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
48 db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3
; tab_Cm: the byte order 0, 2, 1, 3 repeated four times.
50 tab_Cm: db 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3
; tab_c_526336: four dwords of 8192*64 + 2048 (= 526336); added to the accumulators in FILTER_HV8_END.
52 tab_c_526336: times 4 dd 8192*64+2048
; tab_ChromaCoeff: 4 byte taps per entry; kernels read an entry at tab_ChromaCoeff + index * 4.
54 tab_ChromaCoeff: db 0, 64, 0, 0
; tab_ChromaCoeffV: the word pair (0, 64) repeated four times.
63 tab_ChromaCoeffV: times 4 dw 0, 64
; tab_LumaCoeff: 8 signed byte taps per row; kernels read a row at tab_LumaCoeff + index * 8.
; The taps of every row sum to 64.
87 tab_LumaCoeff: db 0, 0, 0, 64, 0, 0, 0, 0
88 db -1, 4, -10, 58, 17, -5, 1, 0
89 db -1, 4, -11, 40, 40, -11, 4, -1
90 db 0, 1, -5, 17, 58, -10, 4, -1
; Coefficient pairs replicated across a register width with `times`
; (word pairs for tab_LumaCoeffV, byte pairs for the *Ver tables).
92 tab_LumaCoeffV: times 4 dw 0, 0
112 tab_LumaCoeffVer: times 8 db 0, 0
133 tab_LumaCoeffVer_32: times 16 db 0, 0
154 tab_ChromaCoeffVer_32: times 16 db 0, 64
; tab_c_64_n64: the byte pair (64, -64) repeated eight times.
178 tab_c_64_n64: times 8 db 64, -64
; interp4_shuf: pshufb pattern (emitted twice) applied to packed 4x4 results before they are stored.
180 const interp4_shuf, times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
; interp4_horiz_shuf1: sliding 4-byte windows; the second row starts at byte 8.
; Loaded whole into m1 by the 32x32 and 16x16 interp_4tap_horiz_pp kernels.
183 interp4_horiz_shuf1: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
184 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
; FILTER_H4_w2_2 t0, t1, t2
; Horizontal 4-tap step for a 2-pixel-wide block. The accesses shown here read the
; row at srcq + srcstrideq and write two bytes to the row at dstq + dststrideq.
193 %macro FILTER_H4_w2_2 3
196 movh %1, [srcq + srcstrideq - 1] ; next source row, starting one byte left of src
206 mov [dstq + dststrideq], r4w ; r4w carries the two output bytes for the next dst row
209 ;-----------------------------------------------------------------------------
210 ; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
211 ;-----------------------------------------------------------------------------
; cglobal operands (x86inc convention): 4 arguments loaded, 6 general-purpose
; registers, 5 vector registers, then names for the first four arguments.
213 cglobal interp_4tap_horiz_pp_2x4, 4, 6, 5, src, srcstride, dst, dststride
; NOTE(review): the two loads below read the same 4-byte table entry (index r4);
; presumably alternatives selected by a PIC conditional - confirm.
223 lea r5, [tab_ChromaCoeff]
224 movd coef2, [r5 + r4 * 4]
226 movd coef2, [tab_ChromaCoeff + r4 * 4]
229 pshufd coef2, coef2, 0 ; replicate the 4 tap bytes into every dword
234 FILTER_H4_w2_2 t0, t1, t2
235 lea srcq, [srcq + srcstrideq * 2] ; advance src by two rows
236 lea dstq, [dstq + dststrideq * 2] ; advance dst by two rows
241 ;-----------------------------------------------------------------------------
242 ; void interp_4tap_horiz_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
243 ;-----------------------------------------------------------------------------
245 cglobal interp_4tap_horiz_pp_2x8, 4, 6, 5, src, srcstride, dst, dststride
; Same coefficient setup as the 2x4 kernel: 4-byte entry at index r4, then broadcast.
255 lea r5, [tab_ChromaCoeff]
256 movd coef2, [r5 + r4 * 4]
258 movd coef2, [tab_ChromaCoeff + r4 * 4]
261 pshufd coef2, coef2, 0 ; replicate the 4 tap bytes into every dword
266 FILTER_H4_w2_2 t0, t1, t2
267 lea srcq, [srcq + srcstrideq * 2] ; advance src by two rows
268 lea dstq, [dstq + dststrideq * 2] ; advance dst by two rows
273 ;-----------------------------------------------------------------------------
274 ; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
275 ;-----------------------------------------------------------------------------
277 cglobal interp_4tap_horiz_pp_2x16, 4, 6, 5, src, srcstride, dst, dststride
; Same coefficient setup as the 2x4 kernel: 4-byte entry at index r4, then broadcast.
287 lea r5, [tab_ChromaCoeff]
288 movd coef2, [r5 + r4 * 4]
290 movd coef2, [tab_ChromaCoeff + r4 * 4]
293 pshufd coef2, coef2, 0 ; replicate the 4 tap bytes into every dword
300 FILTER_H4_w2_2 t0, t1, t2
301 lea srcq, [srcq + srcstrideq * 2] ; advance src by two rows
302 lea dstq, [dstq + dststrideq * 2] ; advance dst by two rows
; FILTER_H4_w4_2 t0, t1, t2
; Horizontal 4-tap step for a 4-pixel-wide block. The accesses shown here read the
; row at srcq + srcstrideq and store one dword (4 bytes) to the row at dstq + dststrideq.
308 %macro FILTER_H4_w4_2 3
312 movh %1, [srcq + srcstrideq - 1] ; next source row, starting one byte left of src
320 movd [dstq + dststrideq], %2 ; four output bytes to the next dst row
323 ;-----------------------------------------------------------------------------
324 ; void interp_4tap_horiz_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
325 ;-----------------------------------------------------------------------------
; cglobal operands (x86inc convention): 4 arguments loaded, 6 general-purpose
; registers, 5 vector registers, then names for the first four arguments.
327 cglobal interp_4tap_horiz_pp_4x2, 4, 6, 5, src, srcstride, dst, dststride
; NOTE(review): the two loads below read the same 4-byte table entry (index r4);
; presumably alternatives selected by a PIC conditional - confirm.
337 lea r5, [tab_ChromaCoeff]
338 movd coef2, [r5 + r4 * 4]
340 movd coef2, [tab_ChromaCoeff + r4 * 4]
343 pshufd coef2, coef2, 0 ; replicate the 4 tap bytes into every dword
; A single macro call and no pointer advance is shown for the 2-row kernel.
347 FILTER_H4_w4_2 t0, t1, t2
351 ;-----------------------------------------------------------------------------
352 ; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
353 ;-----------------------------------------------------------------------------
355 cglobal interp_4tap_horiz_pp_4x4, 4, 6, 5, src, srcstride, dst, dststride
; Same coefficient setup as the 4x2 kernel: 4-byte entry at index r4, then broadcast.
365 lea r5, [tab_ChromaCoeff]
366 movd coef2, [r5 + r4 * 4]
368 movd coef2, [tab_ChromaCoeff + r4 * 4]
371 pshufd coef2, coef2, 0 ; replicate the 4 tap bytes into every dword
376 FILTER_H4_w4_2 t0, t1, t2
377 lea srcq, [srcq + srcstrideq * 2] ; advance src by two rows
378 lea dstq, [dstq + dststrideq * 2] ; advance dst by two rows
383 ;-----------------------------------------------------------------------------
384 ; void interp_4tap_horiz_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
385 ;-----------------------------------------------------------------------------
387 cglobal interp_4tap_horiz_pp_4x8, 4, 6, 5, src, srcstride, dst, dststride
; Same coefficient setup as the 4x2 kernel: 4-byte entry at index r4, then broadcast.
397 lea r5, [tab_ChromaCoeff]
398 movd coef2, [r5 + r4 * 4]
400 movd coef2, [tab_ChromaCoeff + r4 * 4]
403 pshufd coef2, coef2, 0 ; replicate the 4 tap bytes into every dword
408 FILTER_H4_w4_2 t0, t1, t2
409 lea srcq, [srcq + srcstrideq * 2] ; advance src by two rows
410 lea dstq, [dstq + dststrideq * 2] ; advance dst by two rows
415 ;-----------------------------------------------------------------------------
416 ; void interp_4tap_horiz_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
417 ;-----------------------------------------------------------------------------
419 cglobal interp_4tap_horiz_pp_4x16, 4, 6, 5, src, srcstride, dst, dststride
; Same coefficient setup as the 4x2 kernel: 4-byte entry at index r4, then broadcast.
429 lea r5, [tab_ChromaCoeff]
430 movd coef2, [r5 + r4 * 4]
432 movd coef2, [tab_ChromaCoeff + r4 * 4]
435 pshufd coef2, coef2, 0 ; replicate the 4 tap bytes into every dword
440 FILTER_H4_w4_2 t0, t1, t2
441 lea srcq, [srcq + srcstrideq * 2] ; advance src by two rows
442 lea dstq, [dstq + dststrideq * 2] ; advance dst by two rows
447 ;-----------------------------------------------------------------------------
448 ; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
449 ;-----------------------------------------------------------------------------
451 cglobal interp_4tap_horiz_pp_4x32, 4, 6, 5, src, srcstride, dst, dststride
; Same coefficient setup as the 4x2 kernel: 4-byte entry at index r4, then broadcast.
461 lea r5, [tab_ChromaCoeff]
462 movd coef2, [r5 + r4 * 4]
464 movd coef2, [tab_ChromaCoeff + r4 * 4]
467 pshufd coef2, coef2, 0 ; replicate the 4 tap bytes into every dword
474 FILTER_H4_w4_2 t0, t1, t2
475 lea srcq, [srcq + srcstrideq * 2] ; advance src by two rows
476 lea dstq, [dstq + dststrideq * 2] ; advance dst by two rows
; Dword index pattern pairing dword i with dword i+4; loaded into m5 by interp_4tap_horiz_pp_8x8.
483 const interp_4tap_8x8_horiz_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7
; ---- Per-width horizontal 4-tap row macros ----
; They are selected by name through `FILTER_H4_w%1` in IPFILTER_CHROMA / IPFILTER_CHROMA_W.
; Widths 6, 8 and 12 take three scratch registers; widths 16 and up take four.
486 %macro FILTER_H4_w6 3
496 pextrw [dstq + 4], %2, 2 ; word 2 of the result -> dst bytes 4..5
499 %macro FILTER_H4_w8 3
511 %macro FILTER_H4_w12 3
519 movu %1, [srcq - 1 + 8] ; unaligned load of the source window starting at column 8 (minus one)
526 pextrd [dstq + 8], %2, 2 ; dword 2 of the result -> dst bytes 8..11
529 %macro FILTER_H4_w16 4
536 movu %1, [srcq - 1 + 8] ; second 8-column group
548 %macro FILTER_H4_w24 4
555 movu %1, [srcq - 1 + 8] ; second 8-column group
565 movu %1, [srcq - 1 + 16] ; third 8-column group
576 %macro FILTER_H4_w32 4
583 movu %1, [srcq - 1 + 8] ; second 8-column group
593 movu %1, [srcq - 1 + 16] ; third 8-column group
599 movu %1, [srcq - 1 + 24] ; fourth 8-column group
; FILTER_H4_w16o t0, t1, t2, t3, offset
; 16-column step whose source window starts at byte offset %5 within the row.
611 %macro FILTER_H4_w16o 5
612 movu %1, [srcq + %5 - 1] ; columns %5 .. (window starts one byte left)
618 movu %1, [srcq + %5 - 1 + 8] ; columns %5+8 ..
; Width 48 = three 16-column steps at offsets 0, 16, 32.
630 %macro FILTER_H4_w48 4
631 FILTER_H4_w16o %1, %2, %3, %4, 0
632 FILTER_H4_w16o %1, %2, %3, %4, 16
633 FILTER_H4_w16o %1, %2, %3, %4, 32
; Width 64 = four 16-column steps at offsets 0, 16, 32, 48.
636 %macro FILTER_H4_w64 4
637 FILTER_H4_w16o %1, %2, %3, %4, 0
638 FILTER_H4_w16o %1, %2, %3, %4, 16
639 FILTER_H4_w16o %1, %2, %3, %4, 32
640 FILTER_H4_w16o %1, %2, %3, %4, 48
643 ;-----------------------------------------------------------------------------
644 ; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
645 ;-----------------------------------------------------------------------------
; IPFILTER_CHROMA width, height
; Emits interp_4tap_horiz_pp_<width>x<height>; the per-row work is delegated to the
; three-register macro FILTER_H4_w<width>.
646 %macro IPFILTER_CHROMA 2
648 cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
; NOTE(review): the two loads below read the same 4-byte table entry (index r4);
; presumably alternatives selected by a PIC conditional - confirm.
659 lea r5, [tab_ChromaCoeff]
660 movd coef2, [r5 + r4 * 4]
662 movd coef2, [tab_ChromaCoeff + r4 * 4]
667 pshufd coef2, coef2, 0 ; replicate the 4 tap bytes into every dword
670 mova Tm1, [tab_Tm + 16] ; second row of the sliding-window shuffle table
673 FILTER_H4_w%1 t0, t1, t2 ; macro name is chosen by the width parameter
; Block sizes generated with the three-register row macros (widths 6, 8 and 12).
689 IPFILTER_CHROMA 8, 16
690 IPFILTER_CHROMA 8, 32
691 IPFILTER_CHROMA 12, 16
693 IPFILTER_CHROMA 6, 16
694 IPFILTER_CHROMA 8, 12
695 IPFILTER_CHROMA 8, 64
696 IPFILTER_CHROMA 12, 32
698 ;-----------------------------------------------------------------------------
699 ; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
700 ;-----------------------------------------------------------------------------
; IPFILTER_CHROMA_W width, height
; Wide-block counterpart of IPFILTER_CHROMA: declares 7 vector registers and calls
; the four-register row macro FILTER_H4_w<width>.
701 %macro IPFILTER_CHROMA_W 2
703 cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 7, src, srcstride, dst, dststride
; NOTE(review): the two loads below read the same 4-byte table entry (index r4);
; presumably alternatives selected by a PIC conditional - confirm.
715 lea r5, [tab_ChromaCoeff]
716 movd coef2, [r5 + r4 * 4]
718 movd coef2, [tab_ChromaCoeff + r4 * 4]
723 pshufd coef2, coef2, 0 ; replicate the 4 tap bytes into every dword
726 mova Tm1, [tab_Tm + 16] ; second row of the sliding-window shuffle table
729 FILTER_H4_w%1 t0, t1, t2, t3 ; macro name is chosen by the width parameter
; Block sizes generated with the four-register row macros (widths 16, 24, 32, 48, 64).
739 IPFILTER_CHROMA_W 16, 4
740 IPFILTER_CHROMA_W 16, 8
741 IPFILTER_CHROMA_W 16, 12
742 IPFILTER_CHROMA_W 16, 16
743 IPFILTER_CHROMA_W 16, 32
744 IPFILTER_CHROMA_W 32, 8
745 IPFILTER_CHROMA_W 32, 16
746 IPFILTER_CHROMA_W 32, 24
747 IPFILTER_CHROMA_W 24, 32
748 IPFILTER_CHROMA_W 32, 32
750 IPFILTER_CHROMA_W 16, 24
751 IPFILTER_CHROMA_W 16, 64
752 IPFILTER_CHROMA_W 32, 48
753 IPFILTER_CHROMA_W 24, 64
754 IPFILTER_CHROMA_W 32, 64
756 IPFILTER_CHROMA_W 64, 64
757 IPFILTER_CHROMA_W 64, 32
758 IPFILTER_CHROMA_W 64, 48
759 IPFILTER_CHROMA_W 48, 64
760 IPFILTER_CHROMA_W 64, 16
; FILTER_H8_W8: horizontal 8-tap step for 8 output columns.
; Takes 7 or 8 parameters (the 8th, dst, is optional). The source bytes in %1 are
; expanded into four registers of sliding 8-byte windows, one per tab_Lm row.
763 %macro FILTER_H8_W8 7-8 ; t0, t1, t2, t3, coef, c512, src, dst
765 pshufb %2, %1, [tab_Lm + 0] ; windows starting at bytes 0 and 1
767 pshufb %3, %1, [tab_Lm + 16] ; windows starting at bytes 2 and 3
770 pshufb %4, %1, [tab_Lm + 32] ; windows starting at bytes 4 and 5
772 pshufb %1, %1, [tab_Lm + 48] ; windows starting at bytes 6 and 7 (overwrites the source register)
; FILTER_H8_W4 t0, t1: 4-column variant. Reads the source at r0 - 3 + r5 and uses m7
; as an extra scratch register, so callers must not keep a live value in m7.
783 %macro FILTER_H8_W4 2
784 movu %1, [r0 - 3 + r5]
785 pshufb %2, %1, [tab_Lm] ; windows starting at bytes 0 and 1
787 pshufb m7, %1, [tab_Lm + 16] ; windows starting at bytes 2 and 3
793 ;----------------------------------------------------------------------------------------------------------------------------
794 ; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
795 ;----------------------------------------------------------------------------------------------------------------------------
; IPFILTER_LUMA width, height, type
; type is the `pp` or `ps` token that becomes part of the generated symbol name.
796 %macro IPFILTER_LUMA 3
798 cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8
; NOTE(review): the two loads below read the same 8-byte coefficient row (index r4);
; presumably alternatives selected by a PIC conditional - confirm.
803 lea r6, [tab_LumaCoeff]
804 movh m3, [r6 + r4 * 8]
806 movh m3, [tab_LumaCoeff + r4 * 8]
821 lea r6, [r1 + 2 * r1] ; r6 = 3 * r1
; Two call forms of FILTER_H8_W8: with a dst operand and the 512 constant in m2,
; or without dst and with the c512 slot unused (the result is then stored below).
830 FILTER_H8_W8 m0, m1, m4, m5, m3, m2, [r0 - 3 + r5], [r2 + r5]
832 FILTER_H8_W8 m0, m1, m4, m5, m3, UNUSED, [r0 - 3 + r5]
834 movu [r2 + 2 * r5], m1 ; full-register store; r5 is scaled by 2 for this destination
847 movh [r2 + 2 * r5], m1 ; low-half store for the narrower tail
; interp_8tap_horiz_pp_4x4: 8-tap horizontal filter of a 4x4 block using 256-bit registers.
; Rows are processed in pairs and all four rows are packed into one register before the stores.
861 cglobal interp_8tap_horiz_pp_4x4, 4,6,6
; NOTE(review): the two broadcasts below read the same 8-byte coefficient row (index r4);
; presumably alternatives selected by a PIC conditional - confirm.
865 lea r5, [tab_LumaCoeff]
866 vpbroadcastq m0, [r5 + r4 * 8]
868 vpbroadcastq m0, [tab_LumaCoeff + r4 * 8]
872 vpbroadcastd m2, [pw_1]
875 ; m0 - interpolate coeff
876 ; m1 - shuffle order table
877 ; m2 - constant word 1
881 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
885 vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
889 phaddd m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]
892 lea r0, [r0 + r1 * 2] ; advance src by two rows
893 vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
897 vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
901 phaddd m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]
903 packssdw m3, m4 ; WORD [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
904 pmulhrsw m3, [pw_512] ; rounded shift: (x + 32) >> 6
905 vextracti128 xm4, m3, 1
906 packuswb xm3, xm4 ; BYTE [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
907 pshufb xm3, [interp4_shuf] ; [row3 row1 row2 row0]
; Dword 2 holds row 1, dword 1 holds row 2, dword 3 holds row 3 (see the layout above).
; NOTE(review): r0 is used as a dst offset for row 3, so it must hold 3 * r3 here - confirm.
911 pextrd [r2+r3], xm3, 2
912 pextrd [r2+r3*2], xm3, 1
913 pextrd [r2+r0], xm3, 3
; interp_8tap_horiz_pp_8x4: 8-tap horizontal filter of an 8x4 block using 256-bit registers.
; Rows 0/1 accumulate in m3 and rows 2/3 in m4.
917 cglobal interp_8tap_horiz_pp_8x4, 4, 6, 7
; NOTE(review): the two broadcasts below read the same 8-byte coefficient row (index r4);
; presumably alternatives selected by a PIC conditional - confirm.
921 lea r5, [tab_LumaCoeff]
922 vpbroadcastq m0, [r5 + r4 * 8]
924 vpbroadcastq m0, [tab_LumaCoeff + r4 * 8]
928 mova m2, [tab_Lm + 32]
931 ; m0 - interpolate coeff
932 ; m1, m2 - shuffle order table
939 vbroadcasti128 m3, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
946 vbroadcasti128 m4, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
953 phaddw m3, m4 ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A]
954 pmulhrsw m3, [pw_512] ; rounded shift: (x + 32) >> 6
957 vbroadcasti128 m4, [r0 + r1 * 2] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
; NOTE(review): r5 is used as a src row offset here, so it must hold 3 * r1 by now - confirm.
964 vbroadcasti128 m5, [r0 + r5] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
971 phaddw m4, m5 ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A]
972 pmulhrsw m4, [pw_512] ; rounded shift: (x + 32) >> 6
975 vextracti128 xm4, m3, 1
976 punpcklwd xm5, xm3, xm4
979 movhps [r2 + r3], xm5 ; high 8 bytes -> dst row 1
981 punpckhwd xm5, xm3, xm4
982 movq [r2 + r3 * 2], xm5 ; low 8 bytes -> dst row 2
; NOTE(review): r4 is used as a dst offset for row 3, so it must hold 3 * r3 here - confirm.
983 movhps [r2 + r4], xm5 ; high 8 bytes -> dst row 3
; IPFILTER_LUMA_AVX2_8xN width, height
; Emits interp_8tap_horiz_pp_<width>x<height>. Each pass filters four rows, then
; advances src and dst by four rows.
986 %macro IPFILTER_LUMA_AVX2_8xN 2
988 cglobal interp_8tap_horiz_pp_%1x%2, 4, 7, 7
; NOTE(review): the two broadcasts below read the same 8-byte coefficient row (index r4);
; presumably alternatives selected by a PIC conditional - confirm.
992 lea r5, [tab_LumaCoeff]
993 vpbroadcastq m0, [r5 + r4 * 8]
995 vpbroadcastq m0, [tab_LumaCoeff + r4 * 8]
999 mova m2, [tab_Lm + 32]
1002 ; m0 - interpolate coeff
1003 ; m1, m2 - shuffle order table
1011 vbroadcasti128 m3, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
1018 vbroadcasti128 m4, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
1025 phaddw m3, m4 ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A]
1026 pmulhrsw m3, [pw_512] ; rounded shift: (x + 32) >> 6
1029 vbroadcasti128 m4, [r0 + r1 * 2] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
; NOTE(review): r5 is used as a src row offset here, so it must hold 3 * r1 by now - confirm.
1036 vbroadcasti128 m5, [r0 + r5] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
1043 phaddw m4, m5 ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A]
1044 pmulhrsw m4, [pw_512] ; rounded shift: (x + 32) >> 6
1047 vextracti128 xm4, m3, 1
1048 punpcklwd xm5, xm3, xm4
1051 movhps [r2 + r3], xm5 ; high 8 bytes -> dst row 1
1053 punpckhwd xm5, xm3, xm4
1054 movq [r2 + r3 * 2], xm5 ; low 8 bytes -> dst row 2
; NOTE(review): r6 is used as a dst offset for row 3, so it must hold 3 * r3 here - confirm.
1055 movhps [r2 + r6], xm5 ; high 8 bytes -> dst row 3
1057 lea r0, [r0 + r1 * 4] ; advance src by four rows
1058 lea r2, [r2 + r3 * 4] ; advance dst by four rows
; 8-wide block heights generated from this macro.
1064 IPFILTER_LUMA_AVX2_8xN 8, 8
1065 IPFILTER_LUMA_AVX2_8xN 8, 16
1066 IPFILTER_LUMA_AVX2_8xN 8, 32
; IPFILTER_LUMA_AVX2 width, height
; Emits interp_8tap_horiz_pp_<width>x<height>. Each pass filters two rows, 16 columns
; per row as two 8-column groups, then advances src and dst by two rows.
1068 %macro IPFILTER_LUMA_AVX2 2
1070 cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8
; The 8 taps are split into two broadcast dwords: taps 0..3 in m0, taps 4..7 in m1.
; NOTE(review): the r5-based and absolute forms read the same data; presumably
; alternatives selected by a PIC conditional - confirm.
1074 lea r5, [tab_LumaCoeff]
1075 vpbroadcastd m0, [r5 + r4 * 8]
1076 vpbroadcastd m1, [r5 + r4 * 8 + 4]
1078 vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
1079 vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
1081 movu m3, [tab_Tm + 16]
1082 vpbroadcastd m7, [pw_1]
1085 ; m0 , m1 interpolate coeff
1086 ; m3 - shuffle order table (tab_Tm + 16); m2 is loaded with row data below
1092 vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
1099 vbroadcasti128 m5, [r0 + 8] ; second 8 elements in Row0
1106 packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
1107 pmulhrsw m4, [pw_512] ; rounded shift: (x + 32) >> 6
1108 vbroadcasti128 m2, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
1115 vbroadcasti128 m5, [r0 + r1 + 8] ; second 8 elements in Row1
1122 packssdw m2, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
1123 pmulhrsw m2, [pw_512] ; rounded shift: (x + 32) >> 6
; Undo the lane interleave left by the packs: qword order 0,2,1,3, then the same
; 0,2,1,3 order on the dwords of each 128-bit half.
1125 vpermq m4, m4, 11011000b
1126 vextracti128 xm5, m4, 1
1127 pshufd xm4, xm4, 11011000b
1128 pshufd xm5, xm5, 11011000b
1131 lea r0, [r0 + r1 * 2] ; advance src by two rows
1132 lea r2, [r2 + r3 * 2] ; advance dst by two rows
; IPFILTER_LUMA_32x_avx2 width, height
; Emits interp_8tap_horiz_pp_<width>x<height>. One row is handled as four 8-column
; groups read at byte offsets 0, 8, 16 and 24.
1138 %macro IPFILTER_LUMA_32x_avx2 2
1140 cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8
; The 8 taps are split into two broadcast dwords: taps 0..3 in m0, taps 4..7 in m1.
; NOTE(review): the r5-based and absolute forms read the same data; presumably
; alternatives selected by a PIC conditional - confirm.
1144 lea r5, [tab_LumaCoeff]
1145 vpbroadcastd m0, [r5 + r4 * 8]
1146 vpbroadcastd m1, [r5 + r4 * 8 + 4]
1148 vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
1149 vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
1151 movu m3, [tab_Tm + 16]
1152 vpbroadcastd m7, [pw_1]
1155 ; m0 , m1 interpolate coeff
1156 ; m3 - shuffle order table (tab_Tm + 16); m2 is loaded with row data below
1162 vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
1169 vbroadcasti128 m5, [r0 + 8] ; columns 8..
1176 packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
1177 pmulhrsw m4, [pw_512] ; rounded shift: (x + 32) >> 6
1178 vbroadcasti128 m2, [r0 + 16] ; columns 16..
1185 vbroadcasti128 m5, [r0 + 24] ; columns 24..
1193 pmulhrsw m2, [pw_512] ; rounded shift: (x + 32) >> 6
; Undo the lane interleave left by the packs: qword order 0,2,1,3, then the same
; 0,2,1,3 order on the dwords of each 128-bit half.
1195 vpermq m4, m4, 11011000b
1196 vextracti128 xm5, m4, 1
1197 pshufd xm4, xm4, 11011000b
1198 pshufd xm5, xm5, 11011000b
; IPFILTER_LUMA_64x_avx2 width, height
; Emits interp_8tap_horiz_pp_<width>x<height>. One row is handled as eight 8-column
; groups read at byte offsets 0, 8, ..., 56, in two passes of 32 columns each.
1208 %macro IPFILTER_LUMA_64x_avx2 2
1210 cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8
; The 8 taps are split into two broadcast dwords: taps 0..3 in m0, taps 4..7 in m1.
; NOTE(review): the r5-based and absolute forms read the same data; presumably
; alternatives selected by a PIC conditional - confirm.
1214 lea r5, [tab_LumaCoeff]
1215 vpbroadcastd m0, [r5 + r4 * 8]
1216 vpbroadcastd m1, [r5 + r4 * 8 + 4]
1218 vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
1219 vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
1221 movu m3, [tab_Tm + 16]
1222 vpbroadcastd m7, [pw_1]
1225 ; m0 , m1 interpolate coeff
1226 ; m3 - shuffle order table (tab_Tm + 16); m2 is loaded with row data below
; ---- columns 0..31 ----
1232 vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
1239 vbroadcasti128 m5, [r0 + 8]
1246 packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
1247 pmulhrsw m4, [pw_512] ; rounded shift: (x + 32) >> 6
1248 vbroadcasti128 m2, [r0 + 16]
1255 vbroadcasti128 m5, [r0 + 24]
1263 pmulhrsw m2, [pw_512] ; rounded shift: (x + 32) >> 6
; Undo the lane interleave left by the packs (order 0,2,1,3 on qwords, then on dwords).
1265 vpermq m4, m4, 11011000b
1266 vextracti128 xm5, m4, 1
1267 pshufd xm4, xm4, 11011000b
1268 pshufd xm5, xm5, 11011000b
; ---- columns 32..63 ----
1272 vbroadcasti128 m4, [r0 + 32]
1279 vbroadcasti128 m5, [r0 + 40]
1287 pmulhrsw m4, [pw_512] ; rounded shift: (x + 32) >> 6
1288 vbroadcasti128 m2, [r0 + 48]
1295 vbroadcasti128 m5, [r0 + 56]
1303 pmulhrsw m2, [pw_512] ; rounded shift: (x + 32) >> 6
1305 vpermq m4, m4, 11011000b
1306 vextracti128 xm5, m4, 1
1307 pshufd xm4, xm4, 11011000b
1308 pshufd xm5, xm5, 11011000b
; interp_8tap_horiz_pp_48x64: 8-tap horizontal filter for 48-wide rows. One row is
; handled as six 8-column groups read at byte offsets 0, 8, ..., 40.
1320 cglobal interp_8tap_horiz_pp_48x64, 4,6,8
; The 8 taps are split into two broadcast dwords: taps 0..3 in m0, taps 4..7 in m1.
; NOTE(review): the r5-based and absolute forms read the same data; presumably
; alternatives selected by a PIC conditional - confirm.
1324 lea r5, [tab_LumaCoeff]
1325 vpbroadcastd m0, [r5 + r4 * 8]
1326 vpbroadcastd m1, [r5 + r4 * 8 + 4]
1328 vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
1329 vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
1331 movu m3, [tab_Tm + 16]
1332 vpbroadcastd m7, [pw_1]
1335 ; m0 , m1 interpolate coeff
1336 ; m3 - shuffle order table (tab_Tm + 16); m2 is loaded with row data below
; ---- columns 0..31 ----
1342 vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
1349 vbroadcasti128 m5, [r0 + 8]
1356 packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
1357 pmulhrsw m4, [pw_512] ; rounded shift: (x + 32) >> 6
1359 vbroadcasti128 m2, [r0 + 16]
1366 vbroadcasti128 m5, [r0 + 24]
1374 pmulhrsw m2, [pw_512] ; rounded shift: (x + 32) >> 6
; Undo the lane interleave left by the packs (order 0,2,1,3 on qwords, then on dwords).
1376 vpermq m4, m4, 11011000b
1377 vextracti128 xm5, m4, 1
1378 pshufd xm4, xm4, 11011000b
1379 pshufd xm5, xm5, 11011000b
; ---- columns 32..47 ----
1383 vbroadcasti128 m4, [r0 + 32]
1390 vbroadcasti128 m5, [r0 + 40]
1398 pmulhrsw m4, [pw_512] ; rounded shift: (x + 32) >> 6
1400 vpermq m4, m4, 11011000b
1401 pshufd xm4, xm4, 11011000b
; interp_4tap_horiz_pp_4x4 (256-bit register variant): two source rows share one
; register, the lower row in the low 128-bit lane and the next row in the high lane.
1411 cglobal interp_4tap_horiz_pp_4x4, 4,6,6
; NOTE(review): the two broadcasts below read the same 4-byte tap entry (index r4);
; presumably alternatives selected by a PIC conditional - confirm.
1415 lea r5, [tab_ChromaCoeff]
1416 vpbroadcastd m0, [r5 + r4 * 4]
1418 vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
1421 vpbroadcastd m2, [pw_1]
1422 vbroadcasti128 m1, [tab_Tm] ; first tab_Tm row in both lanes
1425 ; m0 - interpolate coeff
1426 ; m1 - shuffle order table
1427 ; m2 - constant word 1
1432 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
1433 vinserti128 m3, m3, [r0 + r1], 1 ; high lane <- next row
1439 lea r0, [r0 + r1 * 2] ; advance src by two rows
1440 vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
1441 vinserti128 m4, m4, [r0 + r1], 1 ; high lane <- next row
1447 pmulhrsw m3, [pw_512] ; rounded shift: (x + 32) >> 6
1448 vextracti128 xm4, m3, 1
; The dword-to-row mapping follows the same [row3 row1 row2 row0] order as the 8-tap 4x4 kernel.
; NOTE(review): r0 is used as a dst offset for the last row, so it must hold 3 * r3 here - confirm.
1453 pextrd [r2+r3], xm3, 2
1454 pextrd [r2+r3*2], xm3, 1
1455 pextrd [r2+r0], xm3, 3
; interp_4tap_horiz_pp_32x32: 4-tap horizontal filter for 32-wide rows. The source row
; is read at byte offsets 0, 4, 16 and 20.
1459 cglobal interp_4tap_horiz_pp_32x32, 4,6,7
; NOTE(review): the two broadcasts below read the same 4-byte tap entry (index r4);
; presumably alternatives selected by a PIC conditional - confirm.
1463 lea r5, [tab_ChromaCoeff]
1464 vpbroadcastd m0, [r5 + r4 * 4]
1466 vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
1469 mova m1, [interp4_horiz_shuf1]
1470 vpbroadcastd m2, [pw_1]
1473 ; m0 - interpolate coeff
1474 ; m1 - shuffle order table
1475 ; m2 - constant word 1
1482 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
1486 vbroadcasti128 m4, [r0 + 4]
1493 vbroadcasti128 m4, [r0 + 16]
1497 vbroadcasti128 m5, [r0 + 20]
1505 vpermq m3, m3, 11011000b ; qword order 0,2,1,3
; interp_4tap_horiz_pp_16x16: 4-tap horizontal filter for 16-wide rows. Each pass
; filters two rows (source read at offsets 0 and 4 of each), then advances by two rows.
1516 cglobal interp_4tap_horiz_pp_16x16, 4, 6, 7
; NOTE(review): the two broadcasts below read the same 4-byte tap entry (index r4);
; presumably alternatives selected by a PIC conditional - confirm.
1520 lea r5, [tab_ChromaCoeff]
1521 vpbroadcastd m0, [r5 + r4 * 4]
1523 vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
1527 mova m1, [interp4_horiz_shuf1]
1528 vpbroadcastd m2, [pw_1]
1531 ; m0 - interpolate coeff
1532 ; m1 - shuffle order table
1533 ; m2 - constant word 1
1540 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
1544 vbroadcasti128 m4, [r0 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
1552 vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
1556 vbroadcasti128 m5, [r0 + r1 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
1564 vpermq m3, m3, 11011000b ; qword order 0,2,1,3
1566 vextracti128 xm4, m3, 1
1569 lea r2, [r2 + r3 * 2] ; advance dst by two rows
1570 lea r0, [r0 + r1 * 2] ; advance src by two rows
1574 ;--------------------------------------------------------------------------------------------------------------
1575 ; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1576 ;--------------------------------------------------------------------------------------------------------------
; `pp` block sizes generated from the generic IPFILTER_LUMA macro (widths 4 and 12).
1577 IPFILTER_LUMA 4, 4, pp
1578 IPFILTER_LUMA 4, 8, pp
1579 IPFILTER_LUMA 12, 16, pp
1580 IPFILTER_LUMA 4, 16, pp
; interp_4tap_horiz_pp_8x8: 4-tap horizontal filter of 8-wide rows using 256-bit
; registers; each pass covers four rows (two into m3, two into m4).
1583 cglobal interp_4tap_horiz_pp_8x8, 4,6,6
; NOTE(review): the two broadcasts below read the same 4-byte tap entry (index r4);
; presumably alternatives selected by a PIC conditional - confirm.
1587 lea r5, [tab_ChromaCoeff]
1588 vpbroadcastd m0, [r5 + r4 * 4]
1590 vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
1594 vpbroadcastd m2, [pw_1]
1597 ; m0 - interpolate coeff
1598 ; m1 - shuffle order table
1599 ; m2 - constant word 1
1606 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
1612 vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
1617 pmulhrsw m3, [pw_512] ; rounded shift: (x + 32) >> 6
1618 lea r0, [r0 + r1 * 2] ; advance src by two rows
1621 vbroadcasti128 m4, [r0 ] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
1627 vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
1632 pmulhrsw m4, [pw_512] ; rounded shift: (x + 32) >> 6
1635 mova m5, [interp_4tap_8x8_horiz_shuf] ; dword reorder pattern 0,4,1,5,2,6,3,7
1637 vextracti128 xm4, m3, 1
1639 movhps [r2 + r3], xm3 ; high 8 bytes of xm3 -> second row of the pair
1640 lea r2, [r2 + r3 * 2] ; advance dst by two rows
1642 movhps [r2 + r3], xm4 ; high 8 bytes of xm4 -> second row of the pair
1643 lea r2, [r2 + r3 * 2] ; advance dst by two rows
; interp_8tap_horiz_pp kernels for 16-wide blocks (two rows per pass).
1649 IPFILTER_LUMA_AVX2 16, 4
1650 IPFILTER_LUMA_AVX2 16, 8
1651 IPFILTER_LUMA_AVX2 16, 12
1652 IPFILTER_LUMA_AVX2 16, 16
1653 IPFILTER_LUMA_AVX2 16, 32
1654 IPFILTER_LUMA_AVX2 16, 64
; 32-wide blocks.
1656 IPFILTER_LUMA_32x_avx2 32 , 8
1657 IPFILTER_LUMA_32x_avx2 32 , 16
1658 IPFILTER_LUMA_32x_avx2 32 , 24
1659 IPFILTER_LUMA_32x_avx2 32 , 32
1660 IPFILTER_LUMA_32x_avx2 32 , 64
; 64-wide blocks.
1662 IPFILTER_LUMA_64x_avx2 64 , 64
1663 IPFILTER_LUMA_64x_avx2 64 , 48
1664 IPFILTER_LUMA_64x_avx2 64 , 32
1665 IPFILTER_LUMA_64x_avx2 64 , 16
1667 ;--------------------------------------------------------------------------------------------------------------
1668 ; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1669 ;--------------------------------------------------------------------------------------------------------------
; IPFILTER_LUMA_PP_W8 width, height
; Emits interp_8tap_horiz_pp_<width>x<height> for widths that are multiples of 8
; (per the instantiation list). `x` is the column offset of the group being filtered.
1670 %macro IPFILTER_LUMA_PP_W8 2
1672 cglobal interp_8tap_horiz_pp_%1x%2, 4,6,7
; NOTE(review): the two loads below read the same 8-byte coefficient row (index r4);
; presumably alternatives selected by a PIC conditional - confirm.
1676 lea r5, [tab_LumaCoeff]
1677 movh m3, [r5 + r4 * 8]
1679 movh m3, [tab_LumaCoeff + r4 * 8]
1681 pshufd m0, m3, 0 ; m0 = coeff-L
1682 pshufd m1, m3, 0x55 ; m1 = coeff-H
1683 lea r5, [tab_Tm] ; r5 = shuffle
1684 mova m2, [pw_512] ; m2 = 512
1690 movu m3, [r0 - 3 + x] ; m3 = [F E D C B A 9 8 7 6 5 4 3 2 1 0]
1691 pshufb m4, m3, [r5 + 0*16] ; m4 = [6 5 4 3 5 4 3 2 4 3 2 1 3 2 1 0]
1692 pshufb m5, m3, [r5 + 1*16] ; m5 = [A 9 8 7 9 8 7 6 8 7 6 5 7 6 5 4]
1693 pshufb m3, [r5 + 2*16] ; m3 = [E D C B D C B A C B A 9 B A 9 8]
1695 pmaddubsw m6, m5, m1 ; middle windows (bytes 4..A) times the high taps
; Block sizes generated from this macro.
1715 IPFILTER_LUMA_PP_W8 8, 4
1716 IPFILTER_LUMA_PP_W8 8, 8
1717 IPFILTER_LUMA_PP_W8 8, 16
1718 IPFILTER_LUMA_PP_W8 8, 32
1719 IPFILTER_LUMA_PP_W8 16, 4
1720 IPFILTER_LUMA_PP_W8 16, 8
1721 IPFILTER_LUMA_PP_W8 16, 12
1722 IPFILTER_LUMA_PP_W8 16, 16
1723 IPFILTER_LUMA_PP_W8 16, 32
1724 IPFILTER_LUMA_PP_W8 16, 64
1725 IPFILTER_LUMA_PP_W8 24, 32
1726 IPFILTER_LUMA_PP_W8 32, 8
1727 IPFILTER_LUMA_PP_W8 32, 16
1728 IPFILTER_LUMA_PP_W8 32, 24
1729 IPFILTER_LUMA_PP_W8 32, 32
1730 IPFILTER_LUMA_PP_W8 32, 64
1731 IPFILTER_LUMA_PP_W8 48, 64
1732 IPFILTER_LUMA_PP_W8 64, 16
1733 IPFILTER_LUMA_PP_W8 64, 32
1734 IPFILTER_LUMA_PP_W8 64, 48
1735 IPFILTER_LUMA_PP_W8 64, 64
1737 ;----------------------------------------------------------------------------------------------------------------------------
1738 ; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
1739 ;----------------------------------------------------------------------------------------------------------------------------
; `ps` block sizes generated from the generic IPFILTER_LUMA macro.
1740 IPFILTER_LUMA 4, 4, ps
1741 IPFILTER_LUMA 8, 8, ps
1742 IPFILTER_LUMA 8, 4, ps
1743 IPFILTER_LUMA 4, 8, ps
1744 IPFILTER_LUMA 16, 16, ps
1745 IPFILTER_LUMA 16, 8, ps
1746 IPFILTER_LUMA 8, 16, ps
1747 IPFILTER_LUMA 16, 12, ps
1748 IPFILTER_LUMA 12, 16, ps
1749 IPFILTER_LUMA 16, 4, ps
1750 IPFILTER_LUMA 4, 16, ps
1751 IPFILTER_LUMA 32, 32, ps
1752 IPFILTER_LUMA 32, 16, ps
1753 IPFILTER_LUMA 16, 32, ps
1754 IPFILTER_LUMA 32, 24, ps
1755 IPFILTER_LUMA 24, 32, ps
1756 IPFILTER_LUMA 32, 8, ps
1757 IPFILTER_LUMA 8, 32, ps
1758 IPFILTER_LUMA 64, 64, ps
1759 IPFILTER_LUMA 64, 32, ps
1760 IPFILTER_LUMA 32, 64, ps
1761 IPFILTER_LUMA 64, 48, ps
1762 IPFILTER_LUMA 48, 64, ps
1763 IPFILTER_LUMA 64, 16, ps
1764 IPFILTER_LUMA 16, 64, ps
1766 ;-----------------------------------------------------------------------------
1768 ;-----------------------------------------------------------------------------
; FILTER_HV8_START: first tap pair of the vertical pass over 16-bit intermediate rows.
; Loads three consecutive 16-byte rows from r0 (row index %6, aligned loads),
; interleaves neighbouring rows word-wise and multiplies by the coefficient pair
; at r5 + %7 * 16, starting the sums for output rows 0 and 1.
1769 %macro FILTER_HV8_START 7 ; (t0, t1, t2, t3, t4, off_src, off_coeff) -> (t3, t5), (t4, t1), [2]
1770 mova %5, [r0 + (%6 + 0) * 16]
1771 mova %1, [r0 + (%6 + 1) * 16]
1772 mova %2, [r0 + (%6 + 2) * 16]
1773 punpcklwd %3, %5, %1
1775 pmaddwd %3, [r5 + (%7) * 16] ; R3 = L[0+1] -- Row 0
1776 pmaddwd %5, [r5 + (%7) * 16] ; R0 = H[0+1]
1777 punpcklwd %4, %1, %2
1779 pmaddwd %4, [r5 + (%7) * 16] ; R4 = L[1+2] -- Row 1
1780 pmaddwd %1, [r5 + (%7) * 16] ; R1 = H[1+2]
1781 %endmacro ; FILTER_HV8_START
; FILTER_HV8_MID: adds the next tap pair to the four running sums. Loads two more
; rows (row index %9), pairs them with the previous row %2, multiplies by the
; coefficient pair at r5 + %10 * 16 and accumulates into %3..%6.
1783 %macro FILTER_HV8_MID 10 ; (Row3, prevRow, sum0L, sum1L, sum0H, sum1H, t6, t7, off_src, off_coeff) -> [6]
1784 mova %8, [r0 + (%9 + 0) * 16]
1785 mova %1, [r0 + (%9 + 1) * 16]
1786 punpcklwd %7, %2, %8
1788 pmaddwd %7, [r5 + %10 * 16]
1789 pmaddwd %2, [r5 + %10 * 16]
1790 paddd %3, %7 ; R3 = L[0+1+2+3] -- Row 0
1791 paddd %5, %2 ; R0 = H[0+1+2+3]
1792 punpcklwd %7, %8, %1
1794 pmaddwd %7, [r5 + %10 * 16]
1795 pmaddwd %8, [r5 + %10 * 16]
1796 paddd %4, %7 ; R4 = L[1+2+3+4] -- Row 1
1797 paddd %6, %8 ; R1 = H[1+2+3+4]
1798 %endmacro ; FILTER_HV8_MID
1800 ; Round and Saturate
; Adds the constant 8192*64 + 2048 (tab_c_526336) to each of the four dword sums.
1801 %macro FILTER_HV8_END 4 ; output in [1, 3]
1802 paddd %1, [tab_c_526336]
1803 paddd %2, [tab_c_526336]
1804 paddd %3, [tab_c_526336]
1805 paddd %4, [tab_c_526336]
1813 ; TODO: would merging be better? Keeping these separate gives a shorter dependency chain.
1815 %endmacro ; FILTER_HV8_END
1817 ;-----------------------------------------------------------------------------
1818 ; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
1819 ;-----------------------------------------------------------------------------
; Two-pass 8x8 kernel: a horizontal 8-tap pass (FILTER_H8_W8), then a vertical pass
; built from FILTER_HV8_START / _MID / _END over 16-byte intermediate rows.
; The last cglobal operand requests 15*16 bytes of stack.
; NOTE(review): presumably the intermediate row buffer (8 + 7 rows of 16 bytes) - confirm.
1821 cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
; NOTE(review): the two loads below read the same 8-byte coefficient row (index r4);
; presumably alternatives selected by a PIC conditional - confirm.
1829 lea r6, [tab_LumaCoeff]
1830 movh coef, [r6 + r4 * 8]
1832 movh coef, [tab_LumaCoeff + r4 * 8]
1834 punpcklqdq coef, coef ; copy the 8 taps into both halves of the register
1837 lea r6, [r1 + r1 * 2] ; r6 = 3 * r1
; 7-argument form of FILTER_H8_W8: no dst operand is passed.
1844 FILTER_H8_W8 m0, m1, m2, m3, coef, [pw_512], [r0 - 3]
1855 ; From here on, every mN register is free
1859 lea r6, [tab_LumaCoeffV]
1862 ; load the intermediate buffer
1873 ; TODO: this loop has more than 70 instructions, which may exceed the Intel loop decode cache
; Vertical pass: four tap pairs (coefficient slots 0..3) over source rows 0..8
; produce two output rows per iteration.
1876 FILTER_HV8_START m1, m2, m3, m4, m0, 0, 0
1877 FILTER_HV8_MID m6, m2, m3, m4, m0, m1, m7, m5, 3, 1
1878 FILTER_HV8_MID m5, m6, m3, m4, m0, m1, m7, m2, 5, 2
1879 FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3
1880 FILTER_HV8_END m3, m0, m4, m1
1883 movhps [r2 + r3], m3 ; high 8 bytes -> second output row of the pair
1885 lea r0, [r0 + 16 * 2] ; advance two 16-byte intermediate rows
1886 lea r2, [r2 + r3 * 2] ; advance dst by two rows
1894 ;-----------------------------------------------------------------------------
1895 ; void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1896 ;-----------------------------------------------------------------------------
; Vertical 4-tap filter of a 2x4 block. Consecutive rows are byte-interleaved with
; punpcklbw, and the four output rows end up packed as words of m2.
1898 cglobal interp_4tap_vert_pp_2x4, 4, 6, 8
; NOTE(review): the two loads below read the same 4-byte tap entry (index r4);
; presumably alternatives selected by a PIC conditional - confirm.
1904 lea r5, [tab_ChromaCoeff]
1905 movd m0, [r5 + r4 * 4]
1907 movd m0, [tab_ChromaCoeff + r4 * 4]
1910 lea r5, [r0 + 4 * r1] ; r5 = src + 4 rows
1916 movd m4, [r0 + 2 * r1] ; row at src + 2 * stride
1920 punpcklbw m6, m4, m5
1928 punpcklbw m7, m5, m6
1940 punpcklbw m3, m6, m7
1945 movd m3, [r5 + 2 * r1] ; row at src + 6 * stride
1959 pextrw [r2 + r3], m2, 2 ; word 2 -> dst row 1
1960 lea r2, [r2 + 2 * r3] ; advance dst by two rows
1962 pextrw [r2 + r3], m2, 6 ; word 6 -> dst row 3
1966 ;-----------------------------------------------------------------------------
1967 ; void interp_4tap_vert_pp_2x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1968 ;-----------------------------------------------------------------------------
; FILTER_V4_W2_H4 width, height
; Emits interp_4tap_vert_pp_2x<height>. Each pass produces four 2-pixel rows,
; advancing src by four rows and dst by two rows twice.
1969 %macro FILTER_V4_W2_H4 2
1971 cglobal interp_4tap_vert_pp_2x%2, 4, 6, 8
; NOTE(review): the two loads below read the same 4-byte tap entry (index r4);
; presumably alternatives selected by a PIC conditional - confirm.
1977 lea r5, [tab_ChromaCoeff]
1978 movd m0, [r5 + r4 * 4]
1980 movd m0, [tab_ChromaCoeff + r4 * 4]
1993 movd m4, [r0 + 2 * r1] ; row at src + 2 * stride
1997 punpcklbw m6, m4, m5
2002 lea r0, [r0 + 4 * r1] ; advance src by four rows
2006 punpcklbw m7, m5, m6
2018 punpcklbw m3, m6, m7
2023 movd m3, [r0 + 2 * r1] ; row two strides past the advanced src
2037 pextrw [r2 + r3], m2, 2 ; word 2 -> second row of this group
2038 lea r2, [r2 + 2 * r3] ; advance dst by two rows
2040 pextrw [r2 + r3], m2, 6 ; word 6 -> fourth row of this group
2042 lea r2, [r2 + 2 * r3] ; advance dst by two rows
; Heights generated for the 2-wide vertical kernel.
2049 FILTER_V4_W2_H4 2, 8
2051 FILTER_V4_W2_H4 2, 16
2053 ;-----------------------------------------------------------------------------
2054 ; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2055 ;-----------------------------------------------------------------------------
; Vertical 4-tap filter of a 4x2 block; both output rows end up as dwords of m2.
2057 cglobal interp_4tap_vert_pp_4x2, 4, 6, 6
; NOTE(review): the two loads below read the same 4-byte tap entry (index r4);
; presumably alternatives selected by a PIC conditional - confirm.
2063 lea r5, [tab_ChromaCoeff]
2064 movd m0, [r5 + r4 * 4]
2066 movd m0, [tab_ChromaCoeff + r4 * 4]
2070 lea r5, [r0 + 2 * r1] ; r5 = src + 2 rows
2078 punpcklbw m1, m4, m5
2083 movd m1, [r0 + 4 * r1] ; row at src + 4 * stride
2093 pmulhrsw m2, [pw_512] ; rounded shift: (x + 32) >> 6
2096 pextrd [r2 + r3], m2, 1 ; dword 1 -> dst row 1
2100 ;-----------------------------------------------------------------------------
2101 ; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2102 ;-----------------------------------------------------------------------------
; Vertical 4-tap filter of a 4x4 block. Consecutive rows are byte-interleaved with
; punpcklbw, and the four output rows end up packed as dwords of m2.
2104 cglobal interp_4tap_vert_pp_4x4, 4, 6, 8
; NOTE(review): the two loads below read the same 4-byte tap entry (index r4);
; presumably alternatives selected by a PIC conditional - confirm.
2110 lea r5, [tab_ChromaCoeff]
2111 movd m0, [r5 + r4 * 4]
2113 movd m0, [tab_ChromaCoeff + r4 * 4]
2118 lea r5, [r0 + 4 * r1] ; r5 = src + 4 rows
2123 movd m4, [r0 + 2 * r1] ; row at src + 2 * stride
2127 punpcklbw m6, m4, m5
2135 punpcklbw m7, m5, m6
2147 punpcklbw m3, m6, m7
2152 movd m3, [r5 + 2 * r1] ; row at src + 6 * stride
2166 pextrd [r2 + r3], m2, 1 ; dword 1 -> dst row 1
2167 lea r2, [r2 + 2 * r3] ; advance dst by two rows
2169 pextrd [r2 + r3], m2, 3 ; dword 3 -> dst row 3
; interp_4tap_vert_pp_4x4 (256-bit register variant of the kernel above).
; Seven 4-byte source rows are gathered into one register, vpermd builds the row
; pairs each tap pair needs, and two pmaddubsw products are summed.
2174 cglobal interp_4tap_vert_pp_4x4, 4, 6, 3
; NOTE(review): two ways of forming the coefficient base in r5 (with and without
; adding r4); presumably alternatives selected by a PIC conditional - confirm.
2180 lea r5, [tab_ChromaCoeffVer_32]
2183 lea r5, [tab_ChromaCoeffVer_32 + r4]
2189 pinsrd xm1, [r0 + r1], 1
2190 pinsrd xm1, [r0 + r1 * 2], 2
; NOTE(review): r4 is used as a src row offset here, so it must hold 3 * r1 by now - confirm.
2191 pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0]
2192 lea r0, [r0 + r1 * 4] ; advance src by four rows
2194 pinsrd xm2, [r0 + r1], 1
2195 pinsrd xm2, [r0 + r1 * 2], 2 ; m2 = row[x 6 5 4]
2196 vinserti128 m1, m1, xm2, 1 ; m1 = row[x 6 5 4 3 2 1 0]
2197 mova m2, [interp4_vpp_shuf1]
2198 vpermd m0, m2, m1 ; m0 = row[4 3 3 2 2 1 1 0]
2199 mova m2, [interp4_vpp_shuf1 + mmsize]
2200 vpermd m1, m2, m1 ; m1 = row[6 5 5 4 4 3 3 2]
2202 mova m2, [interp4_vpp_shuf] ; byte interleave pattern (i, i+4)
2206 pmaddubsw m1, [r5 + mmsize] ; second coefficient pair, one register width past r5
2207 paddw m0, m1 ; m0 = WORD ROW[3 2 1 0]
2208 pmulhrsw m0, [pw_512] ; rounded shift: (x + 32) >> 6
2209 vextracti128 xm1, m0, 1
2213 pextrd [r2 + r3], xm0, 1 ; dword 1 -> dst row 1
2214 pextrd [r2 + r3 * 2], xm0, 2 ; dword 2 -> dst row 2
; NOTE(review): r5 is used as a dst offset for row 3, so it must hold 3 * r3 here - confirm.
2215 pextrd [r2 + r5], xm0, 3 ; dword 3 -> dst row 3
2218 ;-----------------------------------------------------------------------------
2219 ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2220 ;-----------------------------------------------------------------------------
; FILTER_V4_W4_H4 width, height
; Emits interp_4tap_vert_pp_<width>x<height>. Each pass produces four 4-pixel rows,
; advancing src by four rows and dst by two rows twice.
2221 %macro FILTER_V4_W4_H4 2
2223 cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
; NOTE(review): the two loads below read the same 4-byte tap entry (index r4);
; presumably alternatives selected by a PIC conditional - confirm.
2229 lea r5, [tab_ChromaCoeff]
2230 movd m0, [r5 + r4 * 4]
2232 movd m0, [tab_ChromaCoeff + r4 * 4]
2246 movd m4, [r0 + 2 * r1] ; row at src + 2 * stride
2250 punpcklbw m6, m4, m5
2255 lea r0, [r0 + 4 * r1] ; advance src by four rows
2259 punpcklbw m7, m5, m6
2271 punpcklbw m3, m6, m7
2276 movd m3, [r0 + 2 * r1] ; row two strides past the advanced src
2289 pextrd [r2 + r3], m2, 1 ; dword 1 -> second row of this group
2290 lea r2, [r2 + 2 * r3] ; advance dst by two rows
2292 pextrd [r2 + r3], m2, 3 ; dword 3 -> fourth row of this group
2294 lea r2, [r2 + 2 * r3] ; advance dst by two rows
; Heights generated for the 4-wide vertical kernel.
2301 FILTER_V4_W4_H4 4, 8
2302 FILTER_V4_W4_H4 4, 16
2304 FILTER_V4_W4_H4 4, 32
; Zero-argument helper macros FILTER_V4_W8_H2 .. H5 for the 8-wide vertical filter.
; The one visible line of each interleaves the low bytes of two registers into m7;
; the source pair rotates m3/m0 -> m0/m1 -> m1/m2 -> m2/m3 from one helper to the next.
; NOTE(review): the remaining body lines and macro terminators are not visible in this excerpt.
2306 %macro FILTER_V4_W8_H2 0
2308 punpcklbw m7, m3, m0
2319 %macro FILTER_V4_W8_H3 0
2321 punpcklbw m7, m0, m1
2332 %macro FILTER_V4_W8_H4 0
2334 punpcklbw m7, m1, m2
2345 %macro FILTER_V4_W8_H5 0
2347 punpcklbw m7, m2, m3
; Body macro for the 8x2 case (two parameters, forwarded by the 8x4 variant).
; NOTE(review): only one body line is visible in this excerpt.
2358 %macro FILTER_V4_W8_8x2 2
2360 movq m0, [r0 + 4 * r1] ; 8 bytes from the row at src + 4 * srcStride
; Body macro for the 8x4 case: expands the 8x2 body first, then continues with further rows.
; NOTE(review): only part of the body is visible in this excerpt.
2367 %macro FILTER_V4_W8_8x4 2
2368 FILTER_V4_W8_8x2 %1, %2
2370 lea r6, [r0 + 4 * r1] ; r6 = src + 4 rows
2375 movh [r2 + 2 * r3], m2 ; store 8 bytes to the row at dst + 2 * dstStride
2378 movq m2, [r6 + 2 * r1]
2382 lea r5, [r2 + 2 * r3] ; r5 = dst + 2 rows
; Body macro for the 8x6 case: expands the 8x4 body first, then continues with further rows.
; NOTE(review): only part of the body is visible in this excerpt.
2386 %macro FILTER_V4_W8_8x6 2
2387 FILTER_V4_W8_8x4 %1, %2
2389 lea r6, [r6 + 2 * r1] ; advance the secondary source pointer by two rows
2394 movh [r2 + 4 * r3], m0 ; store 8 bytes to the row at dst + 4 * dstStride
2397 movq m0, [r0 + 8 * r1]
2401 lea r5, [r2 + 4 * r3] ; r5 = dst + 4 rows
2405 ;-----------------------------------------------------------------------------
2406 ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2407 ;-----------------------------------------------------------------------------
; Common entry/setup macro for the small 8-wide pixel->pixel filters; %1 = width, %2 = height.
; cglobal operands (x86inc convention): 4 arguments loaded, 7 GPRs, 8 vector regs.
; NOTE(review): only part of the body is visible in this excerpt.
2408 %macro FILTER_V4_W8 2
2410 cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
2417 movq m2, [r0 + 2 * r1]
2418 lea r5, [r0 + 2 * r1]
2422 punpcklbw m4, m2, m3
; Fetch the 4-byte coefficient entry selected by r4. The lea/movd pair and the single movd
; appear to be PIC / non-PIC alternatives of the same load -- TODO confirm the guards.
2425 lea r6, [tab_ChromaCoeff]
2426 movd m5, [r6 + r4 * 4]
2428 movd m5, [tab_ChromaCoeff + r4 * 4]
; tab_Vm row 0 repeats byte indices (0,1), row 1 repeats (2,3): the shuffles broadcast
; coefficient bytes 0-1 into m6 and bytes 2-3 into m5.
2431 pshufb m6, m5, [tab_Vm]
2434 pshufb m5, [tab_Vm + 16]
; The three invocations below expand the 8x2 / 8x4 / 8x6 body macros.
; NOTE(review): the lines that open and close each function around them are not visible here.
2446 ;-----------------------------------------------------------------------------
2447 ; void interp_4tap_vert_pp_8x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2448 ;-----------------------------------------------------------------------------
2449 FILTER_V4_W8_8x2 8, 2
2453 ;-----------------------------------------------------------------------------
2454 ; void interp_4tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2455 ;-----------------------------------------------------------------------------
2456 FILTER_V4_W8_8x4 8, 4
2460 ;-----------------------------------------------------------------------------
2461 ; void interp_4tap_vert_pp_8x6(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2462 ;-----------------------------------------------------------------------------
2463 FILTER_V4_W8_8x6 8, 6
2467 ;-------------------------------------------------------------------------------------------------------------
2468 ; void interp_4tap_vert_ps_4x2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2469 ;-------------------------------------------------------------------------------------------------------------
; 4x2 vertical 4-tap filter, pixel in / int16 out.
; cglobal operands (x86inc convention): 4 arguments loaded, 6 GPRs, 6 vector regs.
; NOTE(review): only part of the body is visible in this excerpt.
2471 cglobal interp_4tap_vert_ps_4x2, 4, 6, 6
; Fetch the 4-byte coefficient entry selected by r4. The lea/movd pair and the single movd
; appear to be PIC / non-PIC alternatives of the same load -- TODO confirm the guards.
2478 lea r5, [tab_ChromaCoeff]
2479 movd m0, [r5 + r4 * 4]
2481 movd m0, [tab_ChromaCoeff + r4 * 4]
2488 lea r5, [r0 + 2 * r1] ; r5 = src + 2 rows
2493 punpcklbw m1, m4, m5
2498 movd m1, [r0 + 4 * r1]
2510 movhps [r2 + r3], m2 ; high 8 bytes (four int16) -> row at dst + r3
2514 ;-------------------------------------------------------------------------------------------------------------
2515 ; void interp_4tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2516 ;-------------------------------------------------------------------------------------------------------------
; 4x4 vertical 4-tap filter, pixel in / int16 out.
; cglobal operands (x86inc convention): 4 arguments loaded, 6 GPRs, 7 vector regs.
; NOTE(review): only part of the body is visible in this excerpt.
2518 cglobal interp_4tap_vert_ps_4x4, 4, 6, 7
; Fetch the 4-byte coefficient entry selected by r4. The lea/movd pair and the single movd
; appear to be PIC / non-PIC alternatives of the same load -- TODO confirm the guards.
2525 lea r5, [tab_ChromaCoeff]
2526 movd m0, [r5 + r4 * 4]
2528 movd m0, [tab_ChromaCoeff + r4 * 4]
2534 lea r5, [r0 + 4 * r1] ; r5 = src + 4 rows
2538 movd m4, [r0 + 2 * r1]
2542 punpcklbw m6, m4, m5
2550 punpcklbw m1, m5, m6
2561 movhps [r2 + r3], m2 ; high 8 bytes (four int16) -> row at dst + r3
2566 punpcklbw m3, m6, m2
2571 movd m3, [r5 + 2 * r1]
2582 lea r2, [r2 + 2 * r3] ; dst += 2 rows
2584 movhps [r2 + r3], m4
2588 ;---------------------------------------------------------------------------------------------------------------
2589 ; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2590 ;---------------------------------------------------------------------------------------------------------------
; Macro generating the 4-wide vertical 4-tap pixel->int16 filters; %1 = width, %2 = height.
; cglobal operands (x86inc convention): 4 arguments loaded, 6 GPRs, 8 vector regs.
; NOTE(review): only part of the body is visible in this excerpt.
2591 %macro FILTER_V_PS_W4_H4 2
2593 cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
; Fetch the 4-byte coefficient entry selected by r4. The lea/movd pair and the single movd
; appear to be PIC / non-PIC alternatives of the same load -- TODO confirm the guards.
2600 lea r5, [tab_ChromaCoeff]
2601 movd m0, [r5 + r4 * 4]
2603 movd m0, [tab_ChromaCoeff + r4 * 4]
2616 movd m4, [r0 + 2 * r1]
2620 punpcklbw m6, m4, m5
2625 lea r0, [r0 + 4 * r1] ; src += 4 rows
2629 punpcklbw m7, m5, m6
2638 movhps [r2 + r3], m2 ; high 8 bytes (four int16) -> row at dst + r3
2643 punpcklbw m3, m6, m2
2648 movd m3, [r0 + 2 * r1]
2659 lea r2, [r2 + 2 * r3] ; dst += 2 rows
2661 movhps [r2 + r3], m4
2663 lea r2, [r2 + 2 * r3] ; dst += 2 rows
; Instantiations visible here: 4x8, 4x16 and 4x32.
2670 FILTER_V_PS_W4_H4 4, 8
2671 FILTER_V_PS_W4_H4 4, 16
2673 FILTER_V_PS_W4_H4 4, 32
2675 ;--------------------------------------------------------------------------------------------------------------
2676 ; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2677 ;--------------------------------------------------------------------------------------------------------------
; Macro generating 8-wide vertical 4-tap pixel->int16 filters; %1 = width, %2 = height.
; cglobal operands (x86inc convention): 4 arguments loaded, 6 GPRs, 7 vector regs.
; NOTE(review): only part of the body is visible in this excerpt.
2678 %macro FILTER_V_PS_W8_H8_H16_H2 2
2680 cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 7
; Fetch the 4-byte coefficient entry selected by r4. The lea/movd pair and the single movd
; appear to be PIC / non-PIC alternatives of the same load -- TODO confirm the guards.
2687 lea r5, [tab_ChromaCoeff]
2688 movd m5, [r5 + r4 * 4]
2690 movd m5, [tab_ChromaCoeff + r4 * 4]
; tab_Vm row 0 repeats byte indices (0,1), row 1 repeats (2,3): the shuffles broadcast
; coefficient bytes 0-1 into m6 and bytes 2-3 into m5.
2693 pshufb m6, m5, [tab_Vm]
2694 pshufb m5, [tab_Vm + 16]
2703 movq m2, [r0 + 2 * r1]
2718 movq m0, [r0 + 4 * r1]
2730 lea r0, [r0 + 2 * r1] ; src += 2 rows
2731 lea r2, [r2 + 2 * r3] ; dst += 2 rows
; Instantiations visible here: heights 2, 4, 6, 12 and 64.
2739 FILTER_V_PS_W8_H8_H16_H2 8, 2
2740 FILTER_V_PS_W8_H8_H16_H2 8, 4
2741 FILTER_V_PS_W8_H8_H16_H2 8, 6
2743 FILTER_V_PS_W8_H8_H16_H2 8, 12
2744 FILTER_V_PS_W8_H8_H16_H2 8, 64
2746 ;--------------------------------------------------------------------------------------------------------------
2747 ; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2748 ;--------------------------------------------------------------------------------------------------------------
; Macro generating 8-wide vertical 4-tap pixel->int16 filters; %1 = width, %2 = height.
; cglobal operands (x86inc convention): 4 arguments loaded, 6 GPRs, 8 vector regs.
; NOTE(review): only part of the body is visible in this excerpt.
2749 %macro FILTER_V_PS_W8_H8_H16_H32 2
2751 cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
; Fetch the 4-byte coefficient entry selected by r4. The lea/movd pair and the single movd
; appear to be PIC / non-PIC alternatives of the same load -- TODO confirm the guards.
2758 lea r5, [tab_ChromaCoeff]
2759 movd m5, [r5 + r4 * 4]
2761 movd m5, [tab_ChromaCoeff + r4 * 4]
; tab_Vm row 0 repeats byte indices (0,1), row 1 repeats (2,3): the shuffles broadcast
; coefficient bytes 0-1 into m6 and bytes 2-3 into m5.
2764 pshufb m6, m5, [tab_Vm]
2765 pshufb m5, [tab_Vm + 16]
2774 movq m2, [r0 + 2 * r1]
2782 pmaddubsw m7, m2, m5 ; multiply by the second coefficient pair (m5)
2789 lea r0, [r0 + 4 * r1] ; src += 4 rows
2795 pmaddubsw m7, m3, m5
2812 lea r2, [r2 + 2 * r3] ; dst += 2 rows
2815 movq m2, [r0 + 2 * r1]
2827 lea r2, [r2 + 2 * r3] ; dst += 2 rows
; Instantiations visible here: heights 8, 16 and 32.
2834 FILTER_V_PS_W8_H8_H16_H32 8, 8
2835 FILTER_V_PS_W8_H8_H16_H32 8, 16
2836 FILTER_V_PS_W8_H8_H16_H32 8, 32
2838 ;------------------------------------------------------------------------------------------------------------
2839 ;void interp_4tap_vert_ps_6x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2840 ;------------------------------------------------------------------------------------------------------------
; Macro generating 6-wide vertical 4-tap pixel->int16 filters. The function name uses
; only %2 (height); %1 is accepted but not used in the visible lines.
; cglobal operands (x86inc convention): 4 arguments loaded, 6 GPRs, 8 vector regs.
; NOTE(review): only part of the body is visible in this excerpt.
2841 %macro FILTER_V_PS_W6 2
2843 cglobal interp_4tap_vert_ps_6x%2, 4, 6, 8
; Fetch the 4-byte coefficient entry selected by r4. The lea/movd pair and the single movd
; appear to be PIC / non-PIC alternatives of the same load -- TODO confirm the guards.
2850 lea r5, [tab_ChromaCoeff]
2851 movd m5, [r5 + r4 * 4]
2853 movd m5, [tab_ChromaCoeff + r4 * 4]
; tab_Vm row 0 repeats byte indices (0,1), row 1 repeats (2,3): the shuffles broadcast
; coefficient bytes 0-1 into m6 and bytes 2-3 into m5.
2856 pshufb m6, m5, [tab_Vm]
2857 pshufb m5, [tab_Vm + 16]
2865 movq m2, [r0 + 2 * r1]
2873 pmaddubsw m7, m2, m5
2882 lea r0, [r0 + 4 * r1] ; src += 4 rows
2887 pmaddubsw m7, m3, m5
; 4-byte store at byte offset 8: with int16 output these are columns 4 and 5 of the row.
2894 movd [r2 + r3 + 8], m1
2905 lea r2,[r2 + 2 * r3]
2910 movq m2,[r0 + 2 * r1]
2921 movd [r2 + r3 + 8], m3
2923 lea r2, [r2 + 2 * r3]
; Instantiation visible here: 6x16.
2931 FILTER_V_PS_W6 6, 16
2933 ;---------------------------------------------------------------------------------------------------------------
2934 ; void interp_4tap_vert_ps_12x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2935 ;---------------------------------------------------------------------------------------------------------------
; Macro generating 12-wide vertical 4-tap pixel->int16 filters. The function name uses
; only %2 (height); %1 is accepted but not used in the visible lines.
; cglobal operands (x86inc convention): 4 arguments loaded, 6 GPRs, 8 vector regs.
; NOTE(review): only part of the body is visible in this excerpt.
2936 %macro FILTER_V_PS_W12 2
2938 cglobal interp_4tap_vert_ps_12x%2, 4, 6, 8
; Fetch the 4-byte coefficient entry selected by r4. The lea/movd pair and the single movd
; appear to be PIC / non-PIC alternatives of the same load -- TODO confirm the guards.
2945 lea r5, [tab_ChromaCoeff]
2946 movd m0, [r5 + r4 * 4]
2948 movd m0, [tab_ChromaCoeff + r4 * 4]
; tab_Vm row 0 repeats byte indices (0,1), row 1 repeats (2,3): the shuffles broadcast
; coefficient bytes 0-1 into m1 and bytes 2-3 into m0.
2951 pshufb m1, m0, [tab_Vm]
2952 pshufb m0, [tab_Vm + 16]
2960 punpcklbw m4, m2, m3
2966 lea r0, [r0 + 2 * r1] ; src += 2 rows
2970 punpcklbw m6, m5, m7 ; low-half byte interleave
2974 punpckhbw m6, m5, m7 ; high-half byte interleave of the same pair
2986 punpcklbw m4, m3, m5
2992 movu m2, [r0 + 2 * r1]
2994 punpcklbw m5, m7, m2
; 8-byte store at byte offset 16: with int16 output these are columns 8..11 of the row.
3007 movh [r2 + r3 + 16], m3
3009 lea r2, [r2 + 2 * r3] ; dst += 2 rows
; Instantiations visible here: 12x16 and 12x32.
3016 FILTER_V_PS_W12 12, 16
3017 FILTER_V_PS_W12 12, 32
3019 ;---------------------------------------------------------------------------------------------------------------
3020 ; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3021 ;---------------------------------------------------------------------------------------------------------------
; Macro generating 16-wide vertical 4-tap pixel->int16 filters; %1 = width, %2 = height.
; cglobal operands (x86inc convention): 4 arguments loaded, 6 GPRs, 8 vector regs.
; NOTE(review): only part of the body is visible in this excerpt.
3022 %macro FILTER_V_PS_W16 2
3024 cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
; Fetch the 4-byte coefficient entry selected by r4. The lea/movd pair and the single movd
; appear to be PIC / non-PIC alternatives of the same load -- TODO confirm the guards.
3031 lea r5, [tab_ChromaCoeff]
3032 movd m0, [r5 + r4 * 4]
3034 movd m0, [tab_ChromaCoeff + r4 * 4]
; tab_Vm row 0 repeats byte indices (0,1), row 1 repeats (2,3): the shuffles broadcast
; coefficient bytes 0-1 into m1 and bytes 2-3 into m0.
3037 pshufb m1, m0, [tab_Vm]
3038 pshufb m0, [tab_Vm + 16]
3045 punpcklbw m4, m2, m3
3051 lea r0, [r0 + 2 * r1] ; src += 2 rows
3055 punpcklbw m6, m5, m7 ; low-half byte interleave
3059 punpckhbw m6, m5, m7 ; high-half byte interleave of the same pair
3071 punpcklbw m4, m3, m5
3077 movu m5, [r0 + 2 * r1]
3079 punpcklbw m2, m7, m5
; 16-byte store at byte offset 16: with int16 output these are columns 8..15 of the row.
3092 movu [r2 + r3 + 16], m3
3094 lea r2, [r2 + 2 * r3] ; dst += 2 rows
; Instantiations visible here: heights 4, 8, 12, 16, 32, 24 and 64.
3101 FILTER_V_PS_W16 16, 4
3102 FILTER_V_PS_W16 16, 8
3103 FILTER_V_PS_W16 16, 12
3104 FILTER_V_PS_W16 16, 16
3105 FILTER_V_PS_W16 16, 32
3107 FILTER_V_PS_W16 16, 24
3108 FILTER_V_PS_W16 16, 64
3110 ;--------------------------------------------------------------------------------------------------------------
3111 ;void interp_4tap_vert_ps_24x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3112 ;--------------------------------------------------------------------------------------------------------------
; Macro generating 24-wide vertical 4-tap pixel->int16 filters. The function name uses
; only %2 (height); %1 is accepted but not used in the visible lines.
; cglobal operands (x86inc convention): 4 arguments loaded, 6 GPRs, 8 vector regs.
; NOTE(review): only part of the body is visible in this excerpt.
3113 %macro FILTER_V4_PS_W24 2
3115 cglobal interp_4tap_vert_ps_24x%2, 4, 6, 8
; Fetch the 4-byte coefficient entry selected by r4. The lea/movd pair and the single movd
; appear to be PIC / non-PIC alternatives of the same load -- TODO confirm the guards.
3122 lea r5, [tab_ChromaCoeff]
3123 movd m0, [r5 + r4 * 4]
3125 movd m0, [tab_ChromaCoeff + r4 * 4]
; tab_Vm row 0 repeats byte indices (0,1), row 1 repeats (2,3): the shuffles broadcast
; coefficient bytes 0-1 into m1 and bytes 2-3 into m0.
3128 pshufb m1, m0, [tab_Vm]
3129 pshufb m0, [tab_Vm + 16]
3137 punpcklbw m4, m2, m3
3143 lea r5, [r0 + 2 * r1] ; r5 = src + 2 rows (r5 is reused as a row pointer from here)
3148 punpcklbw m6, m5, m7 ; low-half byte interleave
3152 punpckhbw m6, m5, m7 ; high-half byte interleave of the same pair
3164 punpcklbw m4, m3, m5
3170 movu m2, [r5 + 2 * r1]
3172 punpcklbw m5, m7, m2
3185 movu [r2 + r3 + 16], m3 ; int16 columns 8..15 of the row at dst + r3
; Remaining 8 source columns (byte offset 16) are loaded with 8-byte movq.
3188 movq m3, [r0 + r1 + 16]
3190 movq m5, [r5 + r1 + 16]
3193 punpcklbw m7, m4, m5
3203 movq m2, [r5 + 2 * r1 + 16]
3214 movu [r2 + r3 + 32], m3 ; int16 columns 16..23 of the row at dst + r3
3217 lea r2, [r2 + 2 * r3] ; dst += 2 rows
; Instantiations visible here: 24x32 and 24x64.
3224 FILTER_V4_PS_W24 24, 32
3226 FILTER_V4_PS_W24 24, 64
3228 ;---------------------------------------------------------------------------------------------------------------
3229 ; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3230 ;---------------------------------------------------------------------------------------------------------------
; Macro generating 32-wide vertical 4-tap pixel->int16 filters; %1 = width, %2 = height.
; cglobal operands (x86inc convention): 4 arguments loaded, 6 GPRs, 8 vector regs.
; NOTE(review): only part of the body is visible in this excerpt.
3231 %macro FILTER_V_PS_W32 2
3233 cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
; Fetch the 4-byte coefficient entry selected by r4. The lea/movd pair and the single movd
; appear to be PIC / non-PIC alternatives of the same load -- TODO confirm the guards.
3240 lea r5, [tab_ChromaCoeff]
3241 movd m0, [r5 + r4 * 4]
3243 movd m0, [tab_ChromaCoeff + r4 * 4]
; tab_Vm row 0 repeats byte indices (0,1), row 1 repeats (2,3): the shuffles broadcast
; coefficient bytes 0-1 into m1 and bytes 2-3 into m0.
3246 pshufb m1, m0, [tab_Vm]
3247 pshufb m0, [tab_Vm + 16]
3257 punpcklbw m4, m2, m3
3263 lea r5, [r0 + 2 * r1] ; r5 = src + 2 rows
3267 punpcklbw m6, m3, m5
; Second 16-column half of the row (byte offset 16).
3283 movu m3, [r0 + r1 + 16]
3285 punpcklbw m4, m2, m3
3292 movu m5, [r5 + r1 + 16]
3294 punpcklbw m6, m3, m5
; Instantiations visible here: heights 8, 16, 24, 32, 48 and 64.
3317 FILTER_V_PS_W32 32, 8
3318 FILTER_V_PS_W32 32, 16
3319 FILTER_V_PS_W32 32, 24
3320 FILTER_V_PS_W32 32, 32
3322 FILTER_V_PS_W32 32, 48
3323 FILTER_V_PS_W32 32, 64
3325 ;-----------------------------------------------------------------------------
3326 ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3327 ;-----------------------------------------------------------------------------
; Macro generating 8-wide vertical 4-tap pixel->pixel filters; %1 = width, %2 = height.
; cglobal operands (x86inc convention): 4 arguments loaded, 6 GPRs, 8 vector regs.
; NOTE(review): only part of the body is visible in this excerpt.
3328 %macro FILTER_V4_W8_H8_H16_H32 2
3330 cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
; Fetch the 4-byte coefficient entry selected by r4. The lea/movd pair and the single movd
; appear to be PIC / non-PIC alternatives of the same load -- TODO confirm the guards.
3336 lea r5, [tab_ChromaCoeff]
3337 movd m5, [r5 + r4 * 4]
3339 movd m5, [tab_ChromaCoeff + r4 * 4]
; tab_Vm row 0 repeats byte indices (0,1), row 1 repeats (2,3): the shuffles broadcast
; coefficient bytes 0-1 into m6 and bytes 2-3 into m5.
3342 pshufb m6, m5, [tab_Vm]
3343 pshufb m5, [tab_Vm + 16]
3352 movq m2, [r0 + 2 * r1]
3360 pmaddubsw m7, m2, m5
3368 lea r0, [r0 + 4 * r1] ; src += 4 rows
3374 pmaddubsw m7, m3, m5
3393 movq m7, [r0 + 2 * r1]
3404 lea r2, [r2 + 2 * r3] ; dst += 2 rows
3406 movhps [r2 + r3], m2 ; high 8 bytes (8 pixels) -> row at dst + r3
3408 lea r2, [r2 + 2 * r3] ; dst += 2 rows
; Instantiations visible here: heights 8, 16, 32, 12 and 64.
3415 FILTER_V4_W8_H8_H16_H32 8, 8
3416 FILTER_V4_W8_H8_H16_H32 8, 16
3417 FILTER_V4_W8_H8_H16_H32 8, 32
3419 FILTER_V4_W8_H8_H16_H32 8, 12
3420 FILTER_V4_W8_H8_H16_H32 8, 64
; Shared AVX2 body for 8-wide chroma filtering. It loads 8 bytes from each of source rows
; 0..10, interleaves neighbouring rows bytewise, and packs two interleaved pairs per ymm
; register (low lane / high lane). Advances r0 by 8 rows in total.
; In the layout comments "RC" means row R, column C; each list runs from the highest byte to the lowest.
; [r0 + r4] is used as the address of rows 3 and 7, so r4 presumably holds 3 * srcStride
; on entry -- TODO confirm against the caller.
; NOTE(review): the first-tap multiplies and the accumulating adds are on lines not shown here.
3422 %macro PROCESS_CHROMA_AVX2_W8_8R 0
3423 movq xm1, [r0] ; m1 = row 0
3424 movq xm2, [r0 + r1] ; m2 = row 1
3425 punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
3426 movq xm3, [r0 + r1 * 2] ; m3 = row 2
3427 punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10]
3428 vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
3430 movq xm4, [r0 + r4] ; m4 = row 3
3431 punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
3432 lea r0, [r0 + r1 * 4]
3433 movq xm1, [r0] ; m1 = row 4
3434 punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30]
3435 vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
3436 pmaddubsw m0, m2, [r5 + 1 * mmsize] ; second coefficient block applied to rows 2..4
3439 movq xm3, [r0 + r1] ; m3 = row 5
3440 punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
3441 movq xm4, [r0 + r1 * 2] ; m4 = row 6
3442 punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50]
3443 vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
3444 pmaddubsw m0, m1, [r5 + 1 * mmsize] ; second coefficient block applied to rows 4..6
3447 movq xm3, [r0 + r4] ; m3 = row 7
3448 punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
3449 lea r0, [r0 + r1 * 4]
3450 movq xm0, [r0] ; m0 = row 8
3451 punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70]
3452 vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
3453 pmaddubsw m3, m4, [r5 + 1 * mmsize] ; second coefficient block applied to rows 6..8
3456 movq xm3, [r0 + r1] ; m3 = row 9
3457 punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
3458 movq xm6, [r0 + r1 * 2] ; m6 = row 10
3459 punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90]
3460 vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
3461 pmaddubsw m0, [r5 + 1 * mmsize] ; second coefficient block applied to rows 8..10
; 8x8 vertical 4-tap filter, pixel in / pixel out, built on PROCESS_CHROMA_AVX2_W8_8R.
; cglobal operands (x86inc convention): 4 arguments loaded, 6 GPRs, 7 vector regs.
; NOTE(review): only part of the routine is visible in this excerpt.
3466 cglobal interp_4tap_vert_pp_8x8, 4, 6, 7
; The two lea lines below both point r5 at the coefficient table; they look like the
; PIC / non-PIC alternatives of one address computation -- TODO confirm the guarding directives.
3471 lea r5, [tab_ChromaCoeffVer_32]
3474 lea r5, [tab_ChromaCoeffVer_32 + r4]
3479 PROCESS_CHROMA_AVX2_W8_8R
; m3 is the pmulhrsw multiplier here (loaded on a line not shown; presumably pw_512,
; i.e. a rounding shift right by 6) -- TODO confirm.
3482 pmulhrsw m5, m3 ; m5 = word: row 0, row 1
3483 pmulhrsw m2, m3 ; m2 = word: row 2, row 3
3484 pmulhrsw m1, m3 ; m1 = word: row 4, row 5
3485 pmulhrsw m4, m3 ; m4 = word: row 6, row 7
3488 vextracti128 xm2, m5, 1
3489 vextracti128 xm4, m1, 1
; 8-byte row stores; [r2 + r4] is used as a row address, so r4 presumably holds
; 3 * dstStride at this point -- TODO confirm.
3492 movhps [r2 + r3 * 2], xm5
3493 movhps [r2 + r4], xm2
3494 lea r2, [r2 + r3 * 4] ; dst += 4 rows
3497 movhps [r2 + r3 * 2], xm1
3498 movhps [r2 + r4], xm4
3501 ;-----------------------------------------------------------------------------
3502 ;void interp_4tap_vert_pp_6x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3503 ;-----------------------------------------------------------------------------
; Macro generating 6-wide vertical 4-tap pixel->pixel filters. The function name uses
; only %2 (height); %1 is accepted but not used in the visible lines.
; cglobal operands (x86inc convention): 4 arguments loaded, 6 GPRs, 8 vector regs.
; NOTE(review): only part of the body is visible in this excerpt.
3504 %macro FILTER_V4_W6_H4 2
3506 cglobal interp_4tap_vert_pp_6x%2, 4, 6, 8
; Fetch the 4-byte coefficient entry selected by r4. The lea/movd pair and the single movd
; appear to be PIC / non-PIC alternatives of the same load -- TODO confirm the guards.
3512 lea r5, [tab_ChromaCoeff]
3513 movd m5, [r5 + r4 * 4]
3515 movd m5, [tab_ChromaCoeff + r4 * 4]
; tab_Vm row 0 repeats byte indices (0,1), row 1 repeats (2,3): the shuffles broadcast
; coefficient bytes 0-1 into m6 and bytes 2-3 into m5.
3518 pshufb m6, m5, [tab_Vm]
3519 pshufb m5, [tab_Vm + 16]
3528 movq m2, [r0 + 2 * r1]
3536 pmaddubsw m7, m2, m5
; pextrw ..., 2 stores word 2 (bytes 4-5) at byte offset 4: pixels 4 and 5 of the row.
3543 pextrw [r2 + 4], m0, 2
3545 lea r0, [r0 + 4 * r1] ; src += 4 rows
3551 pmaddubsw m7, m3, m5
3558 pextrw [r2 + r3 + 4], m1, 2
3561 punpcklbw m7, m0, m1
3570 lea r2, [r2 + 2 * r3] ; dst += 2 rows
3572 pextrw [r2 + 4], m2, 2
3574 movq m2, [r0 + 2 * r1]
3586 pextrw [r2 + r3 + 4], m3, 2
3588 lea r2, [r2 + 2 * r3] ; dst += 2 rows
; Instantiations visible here: 6x8 and 6x16.
3595 FILTER_V4_W6_H4 6, 8
3597 FILTER_V4_W6_H4 6, 16
3599 ;-----------------------------------------------------------------------------
3600 ; void interp_4tap_vert_pp_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3601 ;-----------------------------------------------------------------------------
; Macro generating 12-wide vertical 4-tap pixel->pixel filters. The function name uses
; only %2 (height); %1 is accepted but not used in the visible lines.
; cglobal operands (x86inc convention): 4 arguments loaded, 6 GPRs, 8 vector regs.
; NOTE(review): only part of the body is visible in this excerpt.
3602 %macro FILTER_V4_W12_H2 2
3604 cglobal interp_4tap_vert_pp_12x%2, 4, 6, 8
; Fetch the 4-byte coefficient entry selected by r4. The lea/movd pair and the single movd
; appear to be PIC / non-PIC alternatives of the same load -- TODO confirm the guards.
3610 lea r5, [tab_ChromaCoeff]
3611 movd m0, [r5 + r4 * 4]
3613 movd m0, [tab_ChromaCoeff + r4 * 4]
; tab_Vm row 0 repeats byte indices (0,1), row 1 repeats (2,3): the shuffles broadcast
; coefficient bytes 0-1 into m1 and bytes 2-3 into m0.
3616 pshufb m1, m0, [tab_Vm]
3617 pshufb m0, [tab_Vm + 16]
3625 punpcklbw m4, m2, m3
3631 lea r0, [r0 + 2 * r1] ; src += 2 rows
3635 punpcklbw m6, m5, m7 ; low-half byte interleave
3639 punpckhbw m6, m5, m7 ; high-half byte interleave of the same pair
; pextrd ..., 2 stores dword 2 (bytes 8-11) at byte offset 8: pixels 8..11 of the row.
3651 pextrd [r2 + 8], m4, 2
3653 punpcklbw m4, m3, m5
3659 movu m5, [r0 + 2 * r1]
3661 punpcklbw m2, m7, m5
3676 pextrd [r2 + r3 + 8], m4, 2
3678 lea r2, [r2 + 2 * r3] ; dst += 2 rows
; Instantiations visible here: 12x16 and 12x32.
3685 FILTER_V4_W12_H2 12, 16
3687 FILTER_V4_W12_H2 12, 32
3689 ;-----------------------------------------------------------------------------
3690 ; void interp_4tap_vert_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3691 ;-----------------------------------------------------------------------------
; Macro generating 16-wide vertical 4-tap pixel->pixel filters. The function name uses
; only %2 (height); %1 is accepted but not used in the visible lines.
; cglobal operands (x86inc convention): 4 arguments loaded, 6 GPRs, 8 vector regs.
; NOTE(review): only part of the body is visible in this excerpt.
3692 %macro FILTER_V4_W16_H2 2
3694 cglobal interp_4tap_vert_pp_16x%2, 4, 6, 8
; Fetch the 4-byte coefficient entry selected by r4. The lea/movd pair and the single movd
; appear to be PIC / non-PIC alternatives of the same load -- TODO confirm the guards.
3700 lea r5, [tab_ChromaCoeff]
3701 movd m0, [r5 + r4 * 4]
3703 movd m0, [tab_ChromaCoeff + r4 * 4]
; tab_Vm row 0 repeats byte indices (0,1), row 1 repeats (2,3): the shuffles broadcast
; coefficient bytes 0-1 into m1 and bytes 2-3 into m0.
3706 pshufb m1, m0, [tab_Vm]
3707 pshufb m0, [tab_Vm + 16]
3715 punpcklbw m4, m2, m3
3721 lea r0, [r0 + 2 * r1] ; src += 2 rows
3725 punpckhbw m7, m5, m6 ; high-half byte interleave
3729 punpcklbw m7, m5, m6 ; low-half byte interleave of the same pair
3742 punpcklbw m4, m3, m5
3748 movu m5, [r0 + 2 * r1]
3750 punpcklbw m2, m6, m5
3766 lea r2, [r2 + 2 * r3] ; dst += 2 rows
; Instantiations visible here: heights 4, 8, 12, 16, 32, 24 and 64.
3773 FILTER_V4_W16_H2 16, 4
3774 FILTER_V4_W16_H2 16, 8
3775 FILTER_V4_W16_H2 16, 12
3776 FILTER_V4_W16_H2 16, 16
3777 FILTER_V4_W16_H2 16, 32
3779 FILTER_V4_W16_H2 16, 24
3780 FILTER_V4_W16_H2 16, 64
; 16x16 vertical 4-tap filter, pixel in / pixel out, AVX2. Built for x86-64 only: the
; routine uses vector registers up to m14.
; cglobal operands (x86inc convention): 4 arguments loaded, 6 GPRs, 15 vector regs.
; Register pattern visible below: each source row is loaded into an xmm register, the
; high-half byte interleave with the next row goes into lane 1 via vinserti128, and the
; result is multiplied by m13.
; NOTE(review): the matching low-half interleaves, first-tap multiplies, accumulating adds,
; packs and several stores are on lines not shown in this excerpt.
3783 %if ARCH_X86_64 == 1
3784 cglobal interp_4tap_vert_pp_16x16, 4, 6, 15
; The two lea lines below both point r5 at the coefficient table; they look like the
; PIC / non-PIC alternatives of one address computation -- TODO confirm the guarding directives.
3789 lea r5, [tab_ChromaCoeffVer_32]
3792 lea r5, [tab_ChromaCoeffVer_32 + r4]
3796 mova m13, [r5 + mmsize] ; m13 = second coefficient block (one vector past r5)
3802 movu xm0, [r0] ; m0 = row 0
3803 movu xm1, [r0 + r1] ; m1 = row 1
3804 punpckhbw xm2, xm0, xm1
3806 vinserti128 m0, m0, xm2, 1
3808 movu xm2, [r0 + r1 * 2] ; m2 = row 2
3809 punpckhbw xm3, xm1, xm2
3811 vinserti128 m1, m1, xm3, 1
; [r0 + r4] addresses row 3 (and later rows 7, 11, 15): r4 presumably holds
; 3 * srcStride from here on -- TODO confirm.
3813 movu xm3, [r0 + r4] ; m3 = row 3
3814 punpckhbw xm4, xm2, xm3
3816 vinserti128 m2, m2, xm4, 1
3817 pmaddubsw m4, m2, m13
3820 lea r0, [r0 + r1 * 4]
3821 movu xm4, [r0] ; m4 = row 4
3822 punpckhbw xm5, xm3, xm4
3824 vinserti128 m3, m3, xm5, 1
3825 pmaddubsw m5, m3, m13
3828 movu xm5, [r0 + r1] ; m5 = row 5
3829 punpckhbw xm6, xm4, xm5
3831 vinserti128 m4, m4, xm6, 1
3832 pmaddubsw m6, m4, m13
3835 movu xm6, [r0 + r1 * 2] ; m6 = row 6
3836 punpckhbw xm7, xm5, xm6
3838 vinserti128 m5, m5, xm7, 1
3839 pmaddubsw m7, m5, m13
3842 movu xm7, [r0 + r4] ; m7 = row 7
3843 punpckhbw xm8, xm6, xm7
3845 vinserti128 m6, m6, xm8, 1
3846 pmaddubsw m8, m6, m13
3849 lea r0, [r0 + r1 * 4]
3850 movu xm8, [r0] ; m8 = row 8
3851 punpckhbw xm9, xm7, xm8
3853 vinserti128 m7, m7, xm9, 1
3854 pmaddubsw m9, m7, m13
3857 movu xm9, [r0 + r1] ; m9 = row 9
3858 punpckhbw xm10, xm8, xm9
3860 vinserti128 m8, m8, xm10, 1
3861 pmaddubsw m10, m8, m13
3864 movu xm10, [r0 + r1 * 2] ; m10 = row 10
3865 punpckhbw xm11, xm9, xm10
3867 vinserti128 m9, m9, xm11, 1
3868 pmaddubsw m11, m9, m13
; m14 is the pmulhrsw multiplier (loaded on a line not shown; presumably pw_512,
; i.e. a rounding shift right by 6) -- TODO confirm.
3872 pmulhrsw m0, m14 ; m0 = word: row 0
3873 pmulhrsw m1, m14 ; m1 = word: row 1
3874 pmulhrsw m2, m14 ; m2 = word: row 2
3875 pmulhrsw m3, m14 ; m3 = word: row 3
3876 pmulhrsw m4, m14 ; m4 = word: row 4
3877 pmulhrsw m5, m14 ; m5 = word: row 5
3878 pmulhrsw m6, m14 ; m6 = word: row 6
3879 pmulhrsw m7, m14 ; m7 = word: row 7
; vpermq 11011000b reorders the four qwords as 0,2,1,3; the upper lane is then
; split off into the odd-numbered register.
3884 vpermq m0, m0, 11011000b
3885 vpermq m2, m2, 11011000b
3886 vpermq m4, m4, 11011000b
3887 vpermq m6, m6, 11011000b
3888 vextracti128 xm1, m0, 1
3889 vextracti128 xm3, m2, 1
3890 vextracti128 xm5, m4, 1
3891 vextracti128 xm7, m6, 1
3894 movu [r2 + r3 * 2], xm2
3896 lea r2, [r2 + r3 * 4] ; dst += 4 rows
3899 movu [r2 + r3 * 2], xm6
3901 lea r2, [r2 + r3 * 4] ; dst += 4 rows
; Second half: source rows 11..18, reusing the registers freed by the stores above.
3903 movu xm11, [r0 + r4] ; m11 = row 11
3904 punpckhbw xm6, xm10, xm11
3905 punpcklbw xm10, xm11
3906 vinserti128 m10, m10, xm6, 1
3907 pmaddubsw m6, m10, m13
3910 lea r0, [r0 + r1 * 4]
3911 movu xm6, [r0] ; m6 = row 12
3912 punpckhbw xm7, xm11, xm6
3914 vinserti128 m11, m11, xm7, 1
3915 pmaddubsw m7, m11, m13
3919 movu xm7, [r0 + r1] ; m7 = row 13
3920 punpckhbw xm0, xm6, xm7
3922 vinserti128 m6, m6, xm0, 1
3923 pmaddubsw m0, m6, m13
3926 movu xm0, [r0 + r1 * 2] ; m0 = row 14
3927 punpckhbw xm1, xm7, xm0
3929 vinserti128 m7, m7, xm1, 1
3930 pmaddubsw m1, m7, m13
3933 movu xm1, [r0 + r4] ; m1 = row 15
3934 punpckhbw xm2, xm0, xm1
3936 vinserti128 m0, m0, xm2, 1
3937 pmaddubsw m2, m0, m13
3940 lea r0, [r0 + r1 * 4]
3941 movu xm2, [r0] ; m2 = row 16
3942 punpckhbw xm3, xm1, xm2
3944 vinserti128 m1, m1, xm3, 1
3945 pmaddubsw m3, m1, m13
3948 movu xm3, [r0 + r1] ; m3 = row 17
3949 punpckhbw xm4, xm2, xm3
3951 vinserti128 m2, m2, xm4, 1
3954 movu xm4, [r0 + r1 * 2] ; m4 = row 18
3955 punpckhbw xm5, xm3, xm4
3957 vinserti128 m3, m3, xm5, 1
3961 pmulhrsw m8, m14 ; m8 = word: row 8
3962 pmulhrsw m9, m14 ; m9 = word: row 9
3963 pmulhrsw m10, m14 ; m10 = word: row 10
3964 pmulhrsw m11, m14 ; m11 = word: row 11
3965 pmulhrsw m6, m14 ; m6 = word: row 12
3966 pmulhrsw m7, m14 ; m7 = word: row 13
3967 pmulhrsw m0, m14 ; m0 = word: row 14
3968 pmulhrsw m1, m14 ; m1 = word: row 15
3973 vpermq m8, m8, 11011000b
3974 vpermq m10, m10, 11011000b
3975 vpermq m6, m6, 11011000b
3976 vpermq m0, m0, 11011000b
3977 vextracti128 xm9, m8, 1
3978 vextracti128 xm11, m10, 1
3979 vextracti128 xm7, m6, 1
3980 vextracti128 xm1, m0, 1
; r5 is used as a store offset below; presumably 3 * dstStride, set on a line not shown.
3983 movu [r2 + r3 * 2], xm10
3984 movu [r2 + r5], xm11
3985 lea r2, [r2 + r3 * 4] ; dst += 4 rows
3988 movu [r2 + r3 * 2], xm0
3993 ;-----------------------------------------------------------------------------
3994 ;void interp_4tap_vert_pp_24x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3995 ;-----------------------------------------------------------------------------
; Macro generating 24-wide vertical 4-tap pixel->pixel filters. The function name uses
; only %2 (height); %1 is accepted but not used in the visible lines.
; cglobal operands (x86inc convention): 4 arguments loaded, 6 GPRs, 8 vector regs.
; NOTE(review): only part of the body is visible in this excerpt.
3996 %macro FILTER_V4_W24 2
3998 cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8
; Fetch the 4-byte coefficient entry selected by r4. The lea/movd pair and the single movd
; appear to be PIC / non-PIC alternatives of the same load -- TODO confirm the guards.
4004 lea r5, [tab_ChromaCoeff]
4005 movd m0, [r5 + r4 * 4]
4007 movd m0, [tab_ChromaCoeff + r4 * 4]
; tab_Vm row 0 repeats byte indices (0,1), row 1 repeats (2,3): the shuffles broadcast
; coefficient bytes 0-1 into m1 and bytes 2-3 into m0.
4010 pshufb m1, m0, [tab_Vm]
4011 pshufb m0, [tab_Vm + 16]
4019 punpcklbw m4, m2, m3
4025 lea r5, [r0 + 2 * r1] ; r5 = src + 2 rows (r5 is reused as a row pointer from here)
4029 punpcklbw m6, m5, m7 ; low-half byte interleave
4033 punpckhbw m6, m5, m7 ; high-half byte interleave of the same pair
4046 punpcklbw m4, m3, m5
4052 movu m2, [r5 + 2 * r1]
4054 punpcklbw m5, m7, m2
; Remaining 8 columns (byte offset 16) are loaded with 8-byte movq.
4071 movq m3, [r0 + r1 + 16]
4073 movq m5, [r5 + r1 + 16]
4085 movq m3, [r0 + r1 + 16]
4087 movq m5, [r5 + r1 + 16]
4088 movq m7, [r5 + 2 * r1 + 16]
4102 movhps [r2 + r3 + 16], m2 ; pixels 16..23 of the row at dst + r3
4105 lea r2, [r2 + 2 * r3] ; dst += 2 rows
; Instantiations visible here: 24x32 and 24x64.
4112 FILTER_V4_W24 24, 32
4114 FILTER_V4_W24 24, 64
4116 ;-----------------------------------------------------------------------------
4117 ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4118 ;-----------------------------------------------------------------------------
; Macro generating 32-wide vertical 4-tap pixel->pixel filters; %1 = width, %2 = height.
; cglobal operands (x86inc convention): 4 arguments loaded, 6 GPRs, 8 vector regs.
; NOTE(review): only part of the body is visible in this excerpt.
4119 %macro FILTER_V4_W32 2
4121 cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
; Fetch the 4-byte coefficient entry selected by r4. The lea/movd pair and the single movd
; appear to be PIC / non-PIC alternatives of the same load -- TODO confirm the guards.
4127 lea r5, [tab_ChromaCoeff]
4128 movd m0, [r5 + r4 * 4]
4130 movd m0, [tab_ChromaCoeff + r4 * 4]
; tab_Vm row 0 repeats byte indices (0,1), row 1 repeats (2,3): the shuffles broadcast
; coefficient bytes 0-1 into m1 and bytes 2-3 into m0.
4133 pshufb m1, m0, [tab_Vm]
4134 pshufb m0, [tab_Vm + 16]
4144 punpcklbw m4, m2, m3
4150 lea r5, [r0 + 2 * r1] ; r5 = src + 2 rows
4154 punpcklbw m6, m3, m5
; Second 16-column half of the row (byte offset 16).
4171 movu m3, [r0 + r1 + 16]
4173 punpcklbw m4, m2, m3
4180 movu m5, [r5 + r1 + 16]
4182 punpcklbw m6, m3, m5
; Instantiations visible here: heights 16, 24, 32, 48 and 64.
4207 FILTER_V4_W32 32, 16
4208 FILTER_V4_W32 32, 24
4209 FILTER_V4_W32 32, 32
4211 FILTER_V4_W32 32, 48
4212 FILTER_V4_W32 32, 64
; 32x32 vertical 4-tap filter, pixel in / pixel out, using full-width vector loads.
; Built for x86-64 only: the routine uses vector registers up to m11.
; cglobal operands (x86inc convention): 4 arguments loaded, 7 GPRs, 13 vector regs.
; Pattern visible below: m0 / m1 alternate as "current" and "next" source row; each
; neighbouring pair is byte-interleaved into a low-half and a high-half register.
; NOTE(review): first-tap multiplies, adds, rounding, packs, most stores and the loop
; are on lines not shown in this excerpt.
4215 %if ARCH_X86_64 == 1
4216 cglobal interp_4tap_vert_pp_32x32, 4, 7, 13
; The two lea lines below both point r5 at the coefficient table; they look like the
; PIC / non-PIC alternatives of one address computation -- TODO confirm the guarding directives.
4221 lea r5, [tab_ChromaCoeffVer_32]
4224 lea r5, [tab_ChromaCoeffVer_32 + r4]
4228 mova m11, [r5 + mmsize] ; m11 = second coefficient block (one vector past r5)
4235 movu m0, [r0] ; m0 = row 0
4236 movu m1, [r0 + r1] ; m1 = row 1
4237 punpcklbw m2, m0, m1
4238 punpckhbw m3, m0, m1
4241 movu m0, [r0 + r1 * 2] ; m0 = row 2
4242 punpcklbw m4, m1, m0
4243 punpckhbw m5, m1, m0
; [r0 + r4] addresses row 3, so r4 presumably holds 3 * srcStride here -- TODO confirm.
4246 movu m1, [r0 + r4] ; m1 = row 3
4247 punpcklbw m6, m0, m1
4248 punpckhbw m7, m0, m1
4249 pmaddubsw m8, m6, m11
4250 pmaddubsw m9, m7, m11
4260 lea r0, [r0 + r1 * 4]
4261 movu m0, [r0] ; m0 = row 4
4262 punpcklbw m2, m1, m0
4263 punpckhbw m3, m1, m0
4264 pmaddubsw m8, m2, m11
4265 pmaddubsw m9, m3, m11
4275 movu m1, [r0 + r1] ; m1 = row 5
4276 punpcklbw m4, m0, m1
4277 punpckhbw m5, m0, m1
4285 movu [r2 + r3 * 2], m6 ; store the row at dst + 2 * dstStride
4287 movu m0, [r0 + r1 * 2] ; m0 = row 6
4288 punpcklbw m6, m1, m0
4289 punpckhbw m7, m1, m0
4299 lea r2, [r2 + r3 * 4] ; dst += 4 rows
4305 ;-----------------------------------------------------------------------------
4306 ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4307 ;-----------------------------------------------------------------------------
; Macro generating wide (multiple-of-16) vertical 4-tap pixel->pixel filters;
; %1 = width, %2 = height.
; cglobal operands (x86inc convention): 4 arguments loaded, 7 GPRs, 8 vector regs.
; NOTE(review): only part of the body (and none of the loop control) is visible in this excerpt.
4308 %macro FILTER_V4_W16n_H2 2
4310 cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
; Fetch the 4-byte coefficient entry selected by r4. The lea/movd pair and the single movd
; appear to be PIC / non-PIC alternatives of the same load -- TODO confirm the guards.
4316 lea r5, [tab_ChromaCoeff]
4317 movd m0, [r5 + r4 * 4]
4319 movd m0, [tab_ChromaCoeff + r4 * 4]
; tab_Vm row 0 repeats byte indices (0,1), row 1 repeats (2,3): the shuffles broadcast
; coefficient bytes 0-1 into m1 and bytes 2-3 into m0.
4322 pshufb m1, m0, [tab_Vm]
4323 pshufb m0, [tab_Vm + 16]
4336 punpcklbw m4, m2, m3
4342 lea r5, [r0 + 2 * r1] ; r5 = src + 2 rows
4346 punpckhbw m7, m5, m6 ; high-half byte interleave
4350 punpcklbw m7, m5, m6 ; low-half byte interleave of the same pair
4363 punpcklbw m4, m3, m5
4369 movu m5, [r5 + 2 * r1]
4371 punpcklbw m2, m6, m5
; Move both pointers down two rows and back by %1 bytes; presumably this undoes a
; per-column advance made on lines not shown -- TODO confirm.
4392 lea r0, [r0 + r1 * 2 - %1]
4393 lea r2, [r2 + r3 * 2 - %1]
; Instantiations visible here: 64x64, 64x32, 64x48, 48x64 and 64x16.
4400 FILTER_V4_W16n_H2 64, 64
4401 FILTER_V4_W16n_H2 64, 32
4402 FILTER_V4_W16n_H2 64, 48
4403 FILTER_V4_W16n_H2 48, 64
4404 FILTER_V4_W16n_H2 64, 16
4407 ;-----------------------------------------------------------------------------
4408 ; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
4409 ;-----------------------------------------------------------------------------
; Pixel -> int16 conversion, four rows per pass.
; cglobal operands (x86inc convention): 3 arguments loaded, 7 GPRs, 6 vector regs;
; width and height are fetched separately (see the comment below; the loads are not shown).
; NOTE(review): only part of the body is visible in this excerpt.
4411 cglobal luma_p2s, 3, 7, 6
4413 ; load width and height
4419 mova m5, [tab_c_64_n64]
4435 movh m2, [r6 + r1 * 2]
4439 lea r6, [r6 + r1 * 2]
; Destination rows are FENC_STRIDE * 2 bytes apart (offsets 0, 2, 4, 6 times FENC_STRIDE),
; i.e. the output uses a fixed pitch of FENC_STRIDE int16 elements rather than a dstStride
; argument. Full 16-byte stores:
4447 movu [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
4448 movu [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
4449 movu [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
4450 movu [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
; 8-byte stores to the same addresses; presumably the narrower tail case, selected by a
; branch that is not visible here -- TODO confirm.
4455 movh [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
4456 movh [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
4457 movh [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
4458 movh [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
4461 lea r0, [r0 + r1 * 4] ; src += 4 rows
4462 add r2, FENC_STRIDE * 8 ; dst += 4 rows (4 * FENC_STRIDE * 2 bytes)
; Shared body for the 4-wide 8-tap luma vertical filter: produces four output rows.
; Rows 1-2 accumulate in m4 and rows 3-4 in m5. r6 points at the coefficient table,
; with one 16-byte entry per tap pair ([r6 + 0..3 * 16]).
; In the comments, [a b] is the byte interleave of source rows a and b, and "a+b" is
; the pmaddubsw partial sum for that row pair.
; NOTE(review): the row loads into m0 / m1 are on lines not shown in this excerpt.
4469 %macro PROCESS_LUMA_W4_4R 0
4472 punpcklbw m2, m0, m1 ; m2=[0 1]
4474 lea r0, [r0 + 2 * r1]
4476 punpcklbw m1, m0 ; m1=[1 2]
4477 punpcklqdq m2, m1 ; m2=[0 1 1 2]
4478 pmaddubsw m4, m2, [r6 + 0 * 16] ; m4=[0+1 1+2]
4481 punpcklbw m5, m0, m1 ; m5=[2 3]
4482 lea r0, [r0 + 2 * r1]
4484 punpcklbw m1, m0 ; m1=[3 4]
4485 punpcklqdq m5, m1 ; m5=[2 3 3 4]
4486 pmaddubsw m2, m5, [r6 + 1 * 16] ; m2=[2+3 3+4]
4487 paddw m4, m2 ; m4=[0+1+2+3 1+2+3+4] Row1-2
4488 pmaddubsw m5, [r6 + 0 * 16] ; m5=[2+3 3+4] Row3-4
4491 punpcklbw m2, m0, m1 ; m2=[4 5]
4492 lea r0, [r0 + 2 * r1]
4494 punpcklbw m1, m0 ; m1=[5 6]
4495 punpcklqdq m2, m1 ; m2=[4 5 5 6]
4496 pmaddubsw m1, m2, [r6 + 2 * 16] ; m1=[4+5 5+6]
4497 paddw m4, m1 ; m4=[0+1+2+3+4+5 1+2+3+4+5+6] Row1-2
4498 pmaddubsw m2, [r6 + 1 * 16] ; m2=[4+5 5+6]
4499 paddw m5, m2 ; m5=[2+3+4+5 3+4+5+6] Row3-4
4502 punpcklbw m2, m0, m1 ; m2=[6 7]
4503 lea r0, [r0 + 2 * r1]
4505 punpcklbw m1, m0 ; m1=[7 8]
4506 punpcklqdq m2, m1 ; m2=[6 7 7 8]
4507 pmaddubsw m1, m2, [r6 + 3 * 16] ; m1=[6+7 7+8]
4508 paddw m4, m1 ; m4=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8] Row1-2 end
4509 pmaddubsw m2, [r6 + 2 * 16] ; m2=[6+7 7+8]
4510 paddw m5, m2 ; m5=[2+3+4+5+6+7 3+4+5+6+7+8] Row3-4
4513 punpcklbw m2, m0, m1 ; m2=[8 9]
4514 movd m0, [r0 + 2 * r1]
4515 punpcklbw m1, m0 ; m1=[9 10]
4516 punpcklqdq m2, m1 ; m2=[8 9 9 10]
4517 pmaddubsw m2, [r6 + 3 * 16] ; m2=[8+9 9+10]
4518 paddw m5, m2 ; m5=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10] Row3-4 end
; Shared body for the 8-wide 8-tap luma vertical filter: produces four output rows,
; accumulated in m7 (row 1), m6 (row 2), m5 (row 3) and m4 (row 4).
; r6 points at the coefficient table, with one 16-byte entry per tap pair
; ([r6 + 0..3 * 16]). In the comments, "a+b" is the pmaddubsw partial sum for source
; rows a and b.
; NOTE(review): the row loads and byte interleaves feeding m0 / m1 are on lines not
; shown in this excerpt.
4521 %macro PROCESS_LUMA_W8_4R 0
4525 pmaddubsw m7, m0, [r6 + 0 *16] ;m7=[0+1] Row1
4527 lea r0, [r0 + 2 * r1]
4530 pmaddubsw m6, m1, [r6 + 0 *16] ;m6=[1+2] Row2
4534 pmaddubsw m5, m0, [r6 + 0 *16] ;m5=[2+3] Row3
4535 pmaddubsw m0, [r6 + 1 * 16]
4536 paddw m7, m0 ;m7=[0+1+2+3] Row1
4538 lea r0, [r0 + 2 * r1]
4541 pmaddubsw m4, m1, [r6 + 0 *16] ;m4=[3+4] Row4
4542 pmaddubsw m1, [r6 + 1 * 16]
4543 paddw m6, m1 ;m6 = [1+2+3+4] Row2
4547 pmaddubsw m2, m0, [r6 + 1 * 16]
4548 pmaddubsw m0, [r6 + 2 * 16]
4549 paddw m7, m0 ;m7=[0+1+2+3+4+5] Row1
4550 paddw m5, m2 ;m5=[2+3+4+5] Row3
4552 lea r0, [r0 + 2 * r1]
4555 pmaddubsw m2, m1, [r6 + 1 * 16]
4556 pmaddubsw m1, [r6 + 2 * 16]
4557 paddw m6, m1 ;m6=[1+2+3+4+5+6] Row2
4558 paddw m4, m2 ;m4=[3+4+5+6] Row4
4562 pmaddubsw m2, m0, [r6 + 2 * 16]
4563 pmaddubsw m0, [r6 + 3 * 16]
4564 paddw m7, m0 ;m7=[0+1+2+3+4+5+6+7] Row1 end
4565 paddw m5, m2 ;m5=[2+3+4+5+6+7] Row3
4567 lea r0, [r0 + 2 * r1]
4570 pmaddubsw m2, m1, [r6 + 2 * 16]
4571 pmaddubsw m1, [r6 + 3 * 16]
4572 paddw m6, m1 ;m6=[1+2+3+4+5+6+7+8] Row2 end
4573 paddw m4, m2 ;m4=[3+4+5+6+7+8] Row4
4577 pmaddubsw m0, [r6 + 3 * 16]
4578 paddw m5, m0 ;m5=[2+3+4+5+6+7+8+9] Row3 end
4580 movq m0, [r0 + 2 * r1]
4582 pmaddubsw m1, [r6 + 3 * 16]
4583 paddw m4, m1 ;m4=[3+4+5+6+7+8+9+10] Row4 end
4586 ;-------------------------------------------------------------------------------------------------------------
4587 ; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4588 ;-------------------------------------------------------------------------------------------------------------
; Macro generating the 4-wide 8-tap luma vertical filters; %1 = width, %2 = height,
; %3 = variant suffix used in the function name (invoked with "pp" below this macro).
; cglobal operands (x86inc convention): 5 arguments loaded, 7 GPRs, 6 vector regs.
; NOTE(review): only part of the body is visible in this excerpt.
4589 %macro FILTER_VER_LUMA_4xN 3
4591 cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6
; Both lea lines reference tab_LumaCoeffVer (into r5 and r6 respectively); they look like
; PIC / non-PIC alternatives for setting up the r6 coefficient pointer -- TODO confirm.
4600 lea r5, [tab_LumaCoeffVer]
4603 lea r6, [tab_LumaCoeffVer + r4]
; Dword (4-byte) row stores from m4 ...
4625 pextrd [r2 + r3], m4, 1
4626 lea r2, [r2 + 2 * r3]
4628 pextrd [r2 + r3], m4, 3
; ... and 8-byte row stores from m4 / m5. These look like the store paths of two
; different variants (4 pixels vs. 4 int16 per row); the directives selecting between
; them are not visible here -- TODO confirm.
4634 movhps [r2 + r3], m4
4635 lea r2, [r2 + 2 * r3]
4637 movhps [r2 + r3], m5
4641 lea r2, [r2 + 2 * r3]
; 4x4 vertical 8-tap luma filter, pixel in / pixel out, AVX2.
; cglobal operands (x86inc convention): 4 arguments loaded, 6 GPRs, 8 vector regs.
; Loads 4 bytes from each of source rows 0..10 and transposes them so that every dword
; holds four vertically adjacent pixels of one column. Layout comments use "RC" = row R,
; column C, listed from the highest byte to the lowest; "A" is row 10.
; NOTE(review): the row-3 load, the multiplies and a few other lines are not visible
; in this excerpt.
4651 cglobal interp_8tap_vert_pp_4x4, 4,6,8
4657 movd xm1, [r0] ; m1 = row0
4658 movd xm2, [r0 + r1] ; m2 = row1
4659 punpcklbw xm1, xm2 ; m1 = [13 03 12 02 11 01 10 00]
4661 movd xm3, [r0 + r1 * 2] ; m3 = row2
4662 punpcklbw xm2, xm3 ; m2 = [23 13 22 12 21 11 20 10]
4664 punpcklbw xm3, xm4 ; m3 = [33 23 32 22 31 21 30 20]
4665 punpcklwd xm1, xm3 ; m1 = [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]
4667 lea r0, [r0 + r1 * 4]
4668 movd xm5, [r0] ; m5 = row4
4669 punpcklbw xm4, xm5 ; m4 = [43 33 42 32 41 31 40 30]
4670 punpcklwd xm2, xm4 ; m2 = [43 33 23 13 42 32 22 12 41 31 21 11 40 30 20 10]
4671 vinserti128 m1, m1, xm2, 1 ; m1 = [43 33 23 13 42 32 22 12 41 31 21 11 40 30 20 10] - [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]
4672 movd xm2, [r0 + r1] ; m2 = row5
4673 punpcklbw xm5, xm2 ; m5 = [53 43 52 42 51 41 50 40]
4674 punpcklwd xm3, xm5 ; m3 = [53 43 33 23 52 42 32 22 51 41 31 21 50 40 30 20]
4675 movd xm6, [r0 + r1 * 2] ; m6 = row6
4676 punpcklbw xm2, xm6 ; m2 = [63 53 62 52 61 51 60 50]
4677 punpcklwd xm4, xm2 ; m4 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30]
4678 vinserti128 m3, m3, xm4, 1 ; m3 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] - [53 43 33 23 52 42 32 22 51 41 31 21 50 40 30 20]
; [r0 + r5] addresses row 7, so r5 presumably holds 3 * srcStride here -- TODO confirm.
4679 movd xm4, [r0 + r5] ; m4 = row7
4680 punpcklbw xm6, xm4 ; m6 = [73 63 72 62 71 61 70 60]
4681 punpcklwd xm5, xm6 ; m5 = [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40]
4683 lea r0, [r0 + r1 * 4]
4684 movd xm7, [r0] ; m7 = row8
4685 punpcklbw xm4, xm7 ; m4 = [83 73 82 72 81 71 80 70]
4686 punpcklwd xm2, xm4 ; m2 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50]
4687 vinserti128 m5, m5, xm2, 1 ; m5 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] - [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40]
4688 movd xm2, [r0 + r1] ; m2 = row9
4689 punpcklbw xm7, xm2 ; m7 = [93 83 92 82 91 81 90 80]
4690 punpcklwd xm6, xm7 ; m6 = [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60]
4691 movd xm7, [r0 + r1 * 2] ; m7 = rowA
4692 punpcklbw xm2, xm7 ; m2 = [A3 93 A2 92 A1 91 A0 90]
4693 punpcklwd xm4, xm2 ; m4 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70]
4694 vinserti128 m6, m6, xm4, 1 ; m6 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] - [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60]
; Broadcast the two 4-byte halves of the 8-byte coefficient entry selected by r4
; (taps 0-3 into m0, taps 4-7 into m2). The lea + two loads and the following two
; direct loads appear to be PIC / non-PIC alternatives -- TODO confirm the guards.
4698 lea r5, [tab_LumaCoeff]
4699 vpbroadcastd m0, [r5 + r4 * 8 + 0]
4700 vpbroadcastd m2, [r5 + r4 * 8 + 4]
4702 vpbroadcastd m0, [tab_LumaCoeff + r4 * 8 + 0]
4703 vpbroadcastd m2, [tab_LumaCoeff + r4 * 8 + 4]
4710 vbroadcasti128 m0, [pw_1]
4715 paddd m1, m5 ; m1 = DQWORD ROW[1 0]
4716 paddd m3, m6 ; m3 = DQWORD ROW[3 2]
; packssdw works per 128-bit lane, hence the interleaved row order below.
4717 packssdw m1, m3 ; m1 = QWORD ROW[3 1 2 0]
4719 ; TODO: does it overflow?
4720 pmulhrsw m1, [pw_512]
4721 vextracti128 xm2, m1, 1
4722 packuswb xm1, xm2 ; m1 = DWORD ROW[3 1 2 0]
; Dword 2 holds row 1 and dword 1 holds row 2, so the extract indices are swapped
; relative to the destination row order.
4724 pextrd [r2 + r3], xm1, 2
4725 pextrd [r2 + r3 * 2], xm1, 1
4727 pextrd [r2 + r4], xm1, 3
; 4x4 vertical 8-tap luma filter, AVX2; results are stored as 8 bytes per row
; (movq / movhps), i.e. four 16-bit values.
; cglobal operands (x86inc convention): 4 arguments loaded, 6 GPRs, 5 vector regs.
; NOTE(review): only part of the routine is visible in this excerpt.
4731 cglobal interp_8tap_vert_ps_4x4, 4, 6, 5
; The two lea lines below both point r5 at the coefficient table; they look like the
; PIC / non-PIC alternatives of one address computation -- TODO confirm the guarding directives.
4736 lea r5, [tab_LumaCoeffVer_32]
4739 lea r5, [tab_LumaCoeffVer_32 + r4]
; Gather one dword (4 pixels) per source row. [r0 + r4] is used as the address of
; rows 3 and 7, so r4 presumably holds 3 * srcStride here -- TODO confirm.
4748 pinsrd xm1, [r0 + r1], 1
4749 pinsrd xm1, [r0 + r1 * 2], 2
4750 pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0]
4751 lea r0, [r0 + r1 * 4]
4753 pinsrd xm2, [r0 + r1], 1
4754 pinsrd xm2, [r0 + r1 * 2], 2
4755 pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4]
4756 vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0]
4757 lea r0, [r0 + r1 * 4]
4759 pinsrd xm3, [r0 + r1], 1
4760 pinsrd xm3, [r0 + r1 * 2], 2 ; m3 = row[x 10 9 8]
4761 vinserti128 m2, m2, xm3, 1 ; m2 = row[x 10 9 8 7 6 5 4]
; interp4_vpp_shuf1 holds dword indices (0,1,1,2,2,3,3,4) and (2,3,3,4,4,5,5,6):
; vpermd builds the overlapping row pairs needed by each tap pair.
4762 mova m3, [interp4_vpp_shuf1]
4763 vpermd m0, m3, m1 ; m0 = row[4 3 3 2 2 1 1 0]
4764 vpermd m4, m3, m2 ; m4 = row[8 7 7 6 6 5 5 4]
4765 mova m3, [interp4_vpp_shuf1 + mmsize]
4766 vpermd m1, m3, m1 ; m1 = row[6 5 5 4 4 3 3 2]
4767 vpermd m2, m3, m2 ; m2 = row[10 9 9 8 8 7 7 6]
4769 mova m3, [interp4_vpp_shuf]
; Coefficient blocks 1..3, each one vector (mmsize bytes) apart; block 0 is applied
; on a line not shown.
4775 pmaddubsw m1, [r5 + mmsize]
4776 pmaddubsw m4, [r5 + 2 * mmsize]
4777 pmaddubsw m2, [r5 + 3 * mmsize]
4780 paddw m0, m2 ; m0 = WORD ROW[3 2 1 0]
4782 vbroadcasti128 m3, [pw_2000]
4784 vextracti128 xm2, m0, 1
; r5 is reused as a store offset below; presumably 3 * dstStride -- TODO confirm.
4787 movhps [r2 + r3], xm0
4788 movq [r2 + r3 * 2], xm2
4789 movhps [r2 + r5], xm2
; Instantiations of FILTER_VER_LUMA_4xN for width 4 and heights 4, 8 and 16,
; once for the "pp" variant and once for the "ps" variant.
; NOTE(review): the prototypes of the ps variants below say `pixel *dst`, but the
; ps 4x4 code visible in this file stores 8 bytes per 4-sample row, which
; suggests a 16-bit destination type - confirm against the C declarations.
4792 ;-------------------------------------------------------------------------------------------------------------
4793 ; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4794 ;-------------------------------------------------------------------------------------------------------------
4795 FILTER_VER_LUMA_4xN 4, 4, pp
4797 ;-------------------------------------------------------------------------------------------------------------
4798 ; void interp_8tap_vert_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4799 ;-------------------------------------------------------------------------------------------------------------
4800 FILTER_VER_LUMA_4xN 4, 8, pp
4802 ;-------------------------------------------------------------------------------------------------------------
4803 ; void interp_8tap_vert_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4804 ;-------------------------------------------------------------------------------------------------------------
4805 FILTER_VER_LUMA_4xN 4, 16, pp
4807 ;-------------------------------------------------------------------------------------------------------------
4808 ; void interp_8tap_vert_ps_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4809 ;-------------------------------------------------------------------------------------------------------------
4810 FILTER_VER_LUMA_4xN 4, 4, ps
4812 ;-------------------------------------------------------------------------------------------------------------
4813 ; void interp_8tap_vert_ps_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4814 ;-------------------------------------------------------------------------------------------------------------
4815 FILTER_VER_LUMA_4xN 4, 8, ps
4817 ;-------------------------------------------------------------------------------------------------------------
4818 ; void interp_8tap_vert_ps_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4819 ;-------------------------------------------------------------------------------------------------------------
4820 FILTER_VER_LUMA_4xN 4, 16, ps
; PROCESS_LUMA_AVX2_W8_8R
; Loads source rows 0..14 (8 bytes each, via movq) relative to r0, interleaves
; every pair of vertically adjacent rows byte-wise (punpcklbw), packs two such
; interleaved pairs into one ymm register (vinserti128) and multiplies them
; with the coefficient tables at [r5 + k * mmsize] using pmaddubsw.
; Register use visible below:
;   r0    - source pointer, advanced by 4 * r1 three times inside the macro
;   r1    - row stride
;   r4    - offset of the fourth row of each group of four (rows 3, 7, 11)
;   r5    - base of the coefficient tables
;   m0-m6 - scratch / accumulators
; NOTE(review): the accumulating adds and the %endmacro are not visible in
; this listing. Comments at the use sites say the results are left in
; m5 (rows 0-1), m2 (rows 2-3), m1 (rows 4-5) and m4 (rows 6-7).
4822 %macro PROCESS_LUMA_AVX2_W8_8R 0
; rows 0..4 -> m5 = pairs (0,1),(1,2); m2 = pairs (2,3),(3,4)
4823 movq xm1, [r0] ; m1 = row 0
4824 movq xm2, [r0 + r1] ; m2 = row 1
4825 punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
4826 movq xm3, [r0 + r1 * 2] ; m3 = row 2
4827 punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10]
4828 vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
4830 movq xm4, [r0 + r4] ; m4 = row 3
4831 punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
4832 lea r0, [r0 + r1 * 4]
4833 movq xm1, [r0] ; m1 = row 4
4834 punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30]
4835 vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
4836 pmaddubsw m0, m2, [r5 + 1 * mmsize]
; rows 5..6 -> m1 = pairs (4,5),(5,6)
4839 movq xm3, [r0 + r1] ; m3 = row 5
4840 punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
4841 movq xm4, [r0 + r1 * 2] ; m4 = row 6
4842 punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50]
4843 vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
4844 pmaddubsw m3, m1, [r5 + 2 * mmsize]
4846 pmaddubsw m0, m1, [r5 + 1 * mmsize]
; rows 7..8 -> m4 = pairs (6,7),(7,8)
4849 movq xm3, [r0 + r4] ; m3 = row 7
4850 punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
4851 lea r0, [r0 + r1 * 4]
4852 movq xm0, [r0] ; m0 = row 8
4853 punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70]
4854 vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
4855 pmaddubsw m3, m4, [r5 + 3 * mmsize]
4857 pmaddubsw m3, m4, [r5 + 2 * mmsize]
4859 pmaddubsw m3, m4, [r5 + 1 * mmsize]
; rows 9..10 -> m0 = pairs (8,9),(9,10)
4862 movq xm3, [r0 + r1] ; m3 = row 9
4863 punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
4864 movq xm6, [r0 + r1 * 2] ; m6 = row 10
4865 punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90]
4866 vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
4867 pmaddubsw m3, m0, [r5 + 3 * mmsize]
4869 pmaddubsw m3, m0, [r5 + 2 * mmsize]
4871 pmaddubsw m0, [r5 + 1 * mmsize]
; rows 11..12 -> m6 = pairs (10,11),(11,12)
4874 movq xm3, [r0 + r4] ; m3 = row 11
4875 punpcklbw xm6, xm3 ; m6 = [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0]
4876 lea r0, [r0 + r1 * 4]
4877 movq xm0, [r0] ; m0 = row 12
4878 punpcklbw xm3, xm0 ; m3 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0]
4879 vinserti128 m6, m6, xm3, 1 ; m6 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] - [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0]
4880 pmaddubsw m3, m6, [r5 + 3 * mmsize]
4882 pmaddubsw m6, [r5 + 2 * mmsize]
; rows 13..14 -> m0 = pairs (12,13),(13,14); only the last coefficient table is needed
4884 movq xm3, [r0 + r1] ; m3 = row 13
4885 punpcklbw xm0, xm3 ; m0 = [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0]
4886 movq xm6, [r0 + r1 * 2] ; m6 = row 14
4887 punpcklbw xm3, xm6 ; m3 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0]
4888 vinserti128 m0, m0, xm3, 1 ; m0 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] - [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0]
4889 pmaddubsw m0, [r5 + 3 * mmsize]
; PROCESS_LUMA_AVX2_W8_4R
; Four-output-row counterpart of the 8-row helper: loads source rows 0..10
; (8 bytes each, via movq) relative to r0, interleaves vertically adjacent rows
; byte-wise, packs two interleaved pairs per ymm register and multiplies them
; with the coefficient tables at [r5 + k * mmsize] using pmaddubsw.
; Register use visible below:
;   r0    - source pointer, advanced by 4 * r1 twice inside the macro
;   r1    - row stride
;   r4    - offset of the fourth row of each group of four (rows 3 and 7)
;   r5    - base of the coefficient tables
;   m0-m6 - scratch / accumulators
; NOTE(review): the accumulating adds and the %endmacro are not visible in
; this listing. Comments at the use site say the results are left in
; m5 (rows 0-1) and m2 (rows 2-3).
4893 %macro PROCESS_LUMA_AVX2_W8_4R 0
; rows 0..4 -> m5 = pairs (0,1),(1,2); m2 = pairs (2,3),(3,4)
4894 movq xm1, [r0] ; m1 = row 0
4895 movq xm2, [r0 + r1] ; m2 = row 1
4896 punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
4897 movq xm3, [r0 + r1 * 2] ; m3 = row 2
4898 punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10]
4899 vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
4901 movq xm4, [r0 + r4] ; m4 = row 3
4902 punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
4903 lea r0, [r0 + r1 * 4]
4904 movq xm1, [r0] ; m1 = row 4
4905 punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30]
4906 vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
4907 pmaddubsw m0, m2, [r5 + 1 * mmsize]
; rows 5..6 -> m1 = pairs (4,5),(5,6)
4910 movq xm3, [r0 + r1] ; m3 = row 5
4911 punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
4912 movq xm4, [r0 + r1 * 2] ; m4 = row 6
4913 punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50]
4914 vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
4915 pmaddubsw m3, m1, [r5 + 2 * mmsize]
4917 pmaddubsw m0, m1, [r5 + 1 * mmsize]
; rows 7..8 -> m4 = pairs (6,7),(7,8)
4919 movq xm3, [r0 + r4] ; m3 = row 7
4920 punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
4921 lea r0, [r0 + r1 * 4]
4922 movq xm0, [r0] ; m0 = row 8
4923 punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70]
4924 vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
4925 pmaddubsw m3, m4, [r5 + 3 * mmsize]
4927 pmaddubsw m3, m4, [r5 + 2 * mmsize]
; rows 9..10 -> m0 = pairs (8,9),(9,10); only the last coefficient table is needed
4929 movq xm3, [r0 + r1] ; m3 = row 9
4930 punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
4931 movq xm6, [r0 + r1 * 2] ; m6 = row 10
4932 punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90]
4933 vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
4934 pmaddubsw m3, m0, [r5 + 3 * mmsize]
4938 ;-------------------------------------------------------------------------------------------------------------
4939 ; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4940 ;-------------------------------------------------------------------------------------------------------------
; Macro parameters: %1 = block width, %2 = block height, %3 = variant suffix
; (pp or ps); together they form the generated function name below.
; NOTE(review): most of the macro body (instruction-set selection, filtering
; loop, return and %endmacro) is not visible in this listing.
4941 %macro FILTER_VER_LUMA_8xN 3
4943 cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
; Two alternative ways of obtaining the coefficient table address; presumably
; the two arms of a PIC conditional that is not visible here - confirm.
4953 lea r5, [tab_LumaCoeffVer]
4956 lea r6, [tab_LumaCoeffVer + r4]
; Visible stores: the high 8 bytes of a register go to the row at r2 + r3, and
; r2 is then advanced by two destination rows (r3 = destination stride).
4981 movhps [r2 + r3], m7
4982 lea r2, [r2 + 2 * r3]
4984 movhps [r2 + r3], m5
4993 lea r2, [r2 + 2 * r3]
4999 lea r2, [r2 + 2 * r3]
; FILTER_VER_LUMA_AVX2_8xN %1, %2
; Generates interp_8tap_vert_pp_%1x%2 on top of PROCESS_LUMA_AVX2_W8_8R, which
; filters 8 output rows per invocation.
; NOTE(review): the loop label, loop control, byte packing, half of the stores,
; the return and the %endmacro are not visible in this listing.
5007 %macro FILTER_VER_LUMA_AVX2_8xN 2
; The fifth cglobal argument (0-gprsize) requests stack space; the word at
; [rsp] is used below as a local variable.
5009 cglobal interp_8tap_vert_pp_%1x%2, 4, 7, 8, 0-gprsize
; Two alternative ways of forming the coefficient-table address; presumably
; the two arms of a PIC conditional that is not visible here - confirm.
5014 lea r5, [tab_LumaCoeffVer_32]
5017 lea r5, [tab_LumaCoeffVer_32 + r4]
; Stores height / 8 on the stack - presumably the number of 8-row iterations
; still to run (the decrement and branch are not visible) - confirm.
5022 mov word [rsp], %2 / 8
5026 PROCESS_LUMA_AVX2_W8_8R
; Scale the four word accumulators by the multiplier held in m7 (the load of
; m7 is not visible here).
5027 pmulhrsw m5, m7 ; m5 = word: row 0, row 1
5028 pmulhrsw m2, m7 ; m2 = word: row 2, row 3
5029 pmulhrsw m1, m7 ; m1 = word: row 4, row 5
5030 pmulhrsw m4, m7 ; m4 = word: row 6, row 7
5033 vextracti128 xm2, m5, 1
5034 vextracti128 xm4, m1, 1
; r2 is advanced two destination rows at a time while the 8 rows are stored.
5037 lea r2, [r2 + r3 * 2]
5039 movhps [r2 + r3], xm2
5040 lea r2, [r2 + r3 * 2]
5043 lea r2, [r2 + r3 * 2]
5045 movhps [r2 + r3], xm4
5046 lea r2, [r2 + r3 * 2]
; interp_8tap_vert_pp_8x8: 8-tap vertical luma filter for one 8x8 block, built
; on a single invocation of PROCESS_LUMA_AVX2_W8_8R.
; NOTE(review): the argument setup, the load of m3, the byte packing, half of
; the stores and the return are not visible in this listing.
5054 cglobal interp_8tap_vert_pp_8x8, 4, 6, 7
; Two alternative ways of forming the coefficient-table address; presumably
; the two arms of a PIC conditional that is not visible here - confirm.
5059 lea r5, [tab_LumaCoeffVer_32]
5062 lea r5, [tab_LumaCoeffVer_32 + r4]
5067 PROCESS_LUMA_AVX2_W8_8R
; Scale the word accumulators by the multiplier in m3 (presumably pw_512, i.e.
; a rounded shift right by 6 - confirm, the load is not visible).
5070 pmulhrsw m5, m3 ; m5 = word: row 0, row 1
5071 pmulhrsw m2, m3 ; m2 = word: row 2, row 3
5072 pmulhrsw m1, m3 ; m1 = word: row 4, row 5
5073 pmulhrsw m4, m3 ; m4 = word: row 6, row 7
5076 vextracti128 xm2, m5, 1
5077 vextracti128 xm4, m1, 1
; 8-byte stores, one per output row; r4 is used here as the offset of the
; fourth destination row of each group of four.
5080 movhps [r2 + r3 * 2], xm5
5081 movhps [r2 + r4], xm2
5082 lea r2, [r2 + r3 * 4]
5085 movhps [r2 + r3 * 2], xm1
5086 movhps [r2 + r4], xm4
; interp_8tap_vert_pp_8x4: 8-tap vertical luma filter for one 8x4 block, built
; on a single invocation of PROCESS_LUMA_AVX2_W8_4R.
; NOTE(review): the argument setup, the load of m3, the byte packing, half of
; the stores and the return are not visible in this listing.
5090 cglobal interp_8tap_vert_pp_8x4, 4, 6, 7
; Two alternative ways of forming the coefficient-table address; presumably
; the two arms of a PIC conditional that is not visible here - confirm.
5095 lea r5, [tab_LumaCoeffVer_32]
5098 lea r5, [tab_LumaCoeffVer_32 + r4]
5103 PROCESS_LUMA_AVX2_W8_4R
; Scale the word accumulators by the multiplier in m3 (presumably pw_512 -
; confirm, the load is not visible).
5106 pmulhrsw m5, m3 ; m5 = word: row 0, row 1
5107 pmulhrsw m2, m3 ; m2 = word: row 2, row 3
5109 vextracti128 xm2, m5, 1
; 8-byte stores for the third and fourth output rows; r4 is used here as the
; offset of the fourth destination row.
5112 movhps [r2 + r3 * 2], xm5
5113 movhps [r2 + r4], xm2
; Instantiations of the 8-wide vertical luma filters for heights 4, 8, 16 and
; 32 in the pp and ps variants. For pp 8x16 and pp 8x32 both the generic macro
; and the AVX2 macro are instantiated; presumably the two expansions are built
; for different instruction sets (the selection is not visible here) - confirm.
5116 ;-------------------------------------------------------------------------------------------------------------
5117 ; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5118 ;-------------------------------------------------------------------------------------------------------------
5119 FILTER_VER_LUMA_8xN 8, 4, pp
5121 ;-------------------------------------------------------------------------------------------------------------
5122 ; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5123 ;-------------------------------------------------------------------------------------------------------------
5124 FILTER_VER_LUMA_8xN 8, 8, pp
5126 ;-------------------------------------------------------------------------------------------------------------
5127 ; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5128 ;-------------------------------------------------------------------------------------------------------------
5129 FILTER_VER_LUMA_8xN 8, 16, pp
5130 FILTER_VER_LUMA_AVX2_8xN 8, 16
5132 ;-------------------------------------------------------------------------------------------------------------
5133 ; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5134 ;-------------------------------------------------------------------------------------------------------------
5135 FILTER_VER_LUMA_8xN 8, 32, pp
5136 FILTER_VER_LUMA_AVX2_8xN 8, 32
5138 ;-------------------------------------------------------------------------------------------------------------
5139 ; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5140 ;-------------------------------------------------------------------------------------------------------------
5141 FILTER_VER_LUMA_8xN 8, 4, ps
5143 ;-------------------------------------------------------------------------------------------------------------
5144 ; void interp_8tap_vert_ps_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5145 ;-------------------------------------------------------------------------------------------------------------
5146 FILTER_VER_LUMA_8xN 8, 8, ps
5148 ;-------------------------------------------------------------------------------------------------------------
5149 ; void interp_8tap_vert_ps_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5150 ;-------------------------------------------------------------------------------------------------------------
5151 FILTER_VER_LUMA_8xN 8, 16, ps
5153 ;-------------------------------------------------------------------------------------------------------------
5154 ; void interp_8tap_vert_ps_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5155 ;-------------------------------------------------------------------------------------------------------------
5156 FILTER_VER_LUMA_8xN 8, 32, ps
5158 ;-------------------------------------------------------------------------------------------------------------
5159 ; void interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5160 ;-------------------------------------------------------------------------------------------------------------
; Macro parameters: %1 = block width, %2 = block height, %3 = variant suffix
; (pp or ps); together they form the generated function name below.
; NOTE(review): most of the macro body (filtering code, conditionals, loop
; control, return and %endmacro) is not visible in this listing, so the
; comments below only describe the visible address arithmetic and stores.
5161 %macro FILTER_VER_LUMA_12xN 3
5163 cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
; Two alternative ways of obtaining the coefficient table address; presumably
; the two arms of a PIC conditional that is not visible here - confirm.
5172 lea r5, [tab_LumaCoeffVer]
5175 lea r6, [tab_LumaCoeffVer + r4]
; r5 is reused as a temporary pointer two destination rows below r2.
5199 movhps [r2 + r3], m7
5200 lea r5, [r2 + 2 * r3]
5202 movhps [r5 + r3], m5
5211 lea r5, [r2 + 2 * r3]
; r5 = 8 * source stride - 8; how it is applied is not visible here.
5216 lea r5, [8 * r1 - 8]
; 4-byte stores (pextrd) - presumably the remaining 4 columns of the 12-wide
; rows - confirm.
5233 pextrd [r2 + r3], m4, 1
5234 lea r5, [r2 + 2 * r3]
5236 pextrd [r5 + r3], m4, 3
5242 movhps [r2 + r3], m4
5243 lea r5, [r2 + 2 * r3]
5245 movhps [r5 + r3], m5
; r5 = 4 * source stride + 8; how it is applied is not visible here.
5248 lea r5, [4 * r1 + 8]
; Two alternative destination advances (4 rows down, minus 8 or 16 bytes);
; presumably selected by a conditional that is not visible here - confirm.
5251 lea r2, [r2 + 4 * r3 - 8]
5253 lea r2, [r2 + 4 * r3 - 16]
; Instantiations of FILTER_VER_LUMA_12xN for the 12x16 block size, once for the
; pp variant and once for the ps variant.
5262 ;-------------------------------------------------------------------------------------------------------------
5263 ; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5264 ;-------------------------------------------------------------------------------------------------------------
5265 FILTER_VER_LUMA_12xN 12, 16, pp
5267 ;-------------------------------------------------------------------------------------------------------------
5268 ; void interp_8tap_vert_ps_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
5269 ;-------------------------------------------------------------------------------------------------------------
5270 FILTER_VER_LUMA_12xN 12, 16, ps
; interp_8tap_vert_pp_12x16: fully unrolled 8-tap vertical luma filter for a
; 12x16 block. Built only when ARCH_X86_64 == 1; it requests 15 vector
; registers and uses m8-m14.
; Scheme visible below: each source row is loaded as 16 bytes (movu), byte-
; interleaved with the next row (punpckhbw for the high half into the upper
; lane; the low-half interleave is visible for some rows only), multiplied with
; the coefficient tables at [r5 + k * mmsize] (pmaddubsw), scaled by the
; multiplier in m14 (pmulhrsw) and stored as 8 + 4 bytes per output row.
; Source rows 0..22 are read. r4 and r6 are used as the offsets of the fourth
; source / destination row of each group of four.
; NOTE(review): many instructions (argument setup, load of m14, low-half
; interleaves, adds, packs, some stores, return and %endif) are not visible in
; this listing.
5273 %if ARCH_X86_64 == 1
5274 cglobal interp_8tap_vert_pp_12x16, 4, 7, 15
; Two alternative ways of forming the coefficient-table address; presumably
; the two arms of a PIC conditional that is not visible here - confirm.
5279 lea r5, [tab_LumaCoeffVer_32]
5282 lea r5, [tab_LumaCoeffVer_32 + r4]
5290 movu xm0, [r0] ; m0 = row 0
5291 movu xm1, [r0 + r1] ; m1 = row 1
5292 punpckhbw xm2, xm0, xm1
5294 vinserti128 m0, m0, xm2, 1
5296 movu xm2, [r0 + r1 * 2] ; m2 = row 2
5297 punpckhbw xm3, xm1, xm2
5299 vinserti128 m1, m1, xm3, 1
5301 movu xm3, [r0 + r4] ; m3 = row 3
5302 punpckhbw xm4, xm2, xm3
5304 vinserti128 m2, m2, xm4, 1
5305 pmaddubsw m4, m2, [r5 + 1 * mmsize]
5308 lea r0, [r0 + r1 * 4]
5309 movu xm4, [r0] ; m4 = row 4
5310 punpckhbw xm5, xm3, xm4
5312 vinserti128 m3, m3, xm5, 1
5313 pmaddubsw m5, m3, [r5 + 1 * mmsize]
5316 movu xm5, [r0 + r1] ; m5 = row 5
5317 punpckhbw xm6, xm4, xm5
5319 vinserti128 m4, m4, xm6, 1
5320 pmaddubsw m6, m4, [r5 + 2 * mmsize]
5322 pmaddubsw m6, m4, [r5 + 1 * mmsize]
5325 movu xm6, [r0 + r1 * 2] ; m6 = row 6
5326 punpckhbw xm7, xm5, xm6
5328 vinserti128 m5, m5, xm7, 1
5329 pmaddubsw m7, m5, [r5 + 2 * mmsize]
5331 pmaddubsw m7, m5, [r5 + 1 * mmsize]
5334 movu xm7, [r0 + r4] ; m7 = row 7
5335 punpckhbw xm8, xm6, xm7
5337 vinserti128 m6, m6, xm8, 1
5338 pmaddubsw m8, m6, [r5 + 3 * mmsize]
5340 pmaddubsw m8, m6, [r5 + 2 * mmsize]
5342 pmaddubsw m8, m6, [r5 + 1 * mmsize]
5345 lea r0, [r0 + r1 * 4]
5346 movu xm8, [r0] ; m8 = row 8
5347 punpckhbw xm9, xm7, xm8
5349 vinserti128 m7, m7, xm9, 1
5350 pmaddubsw m9, m7, [r5 + 3 * mmsize]
5352 pmaddubsw m9, m7, [r5 + 2 * mmsize]
5354 pmaddubsw m9, m7, [r5 + 1 * mmsize]
5357 movu xm9, [r0 + r1] ; m9 = row 9
5358 punpckhbw xm10, xm8, xm9
5360 vinserti128 m8, m8, xm10, 1
5361 pmaddubsw m10, m8, [r5 + 3 * mmsize]
5363 pmaddubsw m10, m8, [r5 + 2 * mmsize]
5365 pmaddubsw m10, m8, [r5 + 1 * mmsize]
5368 movu xm10, [r0 + r1 * 2] ; m10 = row 10
5369 punpckhbw xm11, xm9, xm10
5371 vinserti128 m9, m9, xm11, 1
5372 pmaddubsw m11, m9, [r5 + 3 * mmsize]
5374 pmaddubsw m11, m9, [r5 + 2 * mmsize]
5376 pmaddubsw m11, m9, [r5 + 1 * mmsize]
5379 movu xm11, [r0 + r4] ; m11 = row 11
5380 punpckhbw xm12, xm10, xm11
5381 punpcklbw xm10, xm11
5382 vinserti128 m10, m10, xm12, 1
5383 pmaddubsw m12, m10, [r5 + 3 * mmsize]
5385 pmaddubsw m12, m10, [r5 + 2 * mmsize]
5387 pmaddubsw m12, m10, [r5 + 1 * mmsize]
5390 lea r0, [r0 + r1 * 4]
5391 movu xm12, [r0] ; m12 = row 12
5392 punpckhbw xm13, xm11, xm12
5393 punpcklbw xm11, xm12
5394 vinserti128 m11, m11, xm13, 1
5395 pmaddubsw m13, m11, [r5 + 3 * mmsize]
5397 pmaddubsw m13, m11, [r5 + 2 * mmsize]
5399 pmaddubsw m13, m11, [r5 + 1 * mmsize]
; Output rows 0..5 are complete: scale by the multiplier in m14 (presumably
; pw_512 - confirm, the load is not visible), reorder and store.
5403 pmulhrsw m0, m14 ; m0 = word: row 0
5404 pmulhrsw m1, m14 ; m1 = word: row 1
5405 pmulhrsw m2, m14 ; m2 = word: row 2
5406 pmulhrsw m3, m14 ; m3 = word: row 3
5407 pmulhrsw m4, m14 ; m4 = word: row 4
5408 pmulhrsw m5, m14 ; m5 = word: row 5
5412 vpermq m0, m0, 11011000b
5413 vpermq m2, m2, 11011000b
5414 vpermq m4, m4, 11011000b
5415 vextracti128 xm1, m0, 1
5416 vextracti128 xm3, m2, 1
5417 vextracti128 xm5, m4, 1
; Each 12-byte output row is written as an 8-byte store plus a 4-byte pextrd
; at offset 8.
5419 pextrd [r2 + 8], xm0, 2
5421 pextrd [r2 + r3 + 8], xm1, 2
5422 movq [r2 + r3 * 2], xm2
5423 pextrd [r2 + r3 * 2 + 8], xm2, 2
5425 pextrd [r2 + r6 + 8], xm3, 2
5426 lea r2, [r2 + r3 * 4]
5428 pextrd [r2 + 8], xm4, 2
5430 pextrd [r2 + r3 + 8], xm5, 2
; m0-m5 are free again and are reused for source rows 13 onwards.
5432 movu xm13, [r0 + r1] ; m13 = row 13
5433 punpckhbw xm0, xm12, xm13
5434 punpcklbw xm12, xm13
5435 vinserti128 m12, m12, xm0, 1
5436 pmaddubsw m0, m12, [r5 + 3 * mmsize]
5438 pmaddubsw m0, m12, [r5 + 2 * mmsize]
5440 pmaddubsw m0, m12, [r5 + 1 * mmsize]
5443 movu xm0, [r0 + r1 * 2] ; m0 = row 14
5444 punpckhbw xm1, xm13, xm0
5446 vinserti128 m13, m13, xm1, 1
5447 pmaddubsw m1, m13, [r5 + 3 * mmsize]
5449 pmaddubsw m1, m13, [r5 + 2 * mmsize]
5451 pmaddubsw m1, m13, [r5 + 1 * mmsize]
; Output rows 6 and 7.
5455 pmulhrsw m6, m14 ; m6 = word: row 6
5456 pmulhrsw m7, m14 ; m7 = word: row 7
5458 vpermq m6, m6, 11011000b
5459 vextracti128 xm7, m6, 1
5460 movq [r2 + r3 * 2], xm6
5461 pextrd [r2 + r3 * 2 + 8], xm6, 2
5463 pextrd [r2 + r6 + 8], xm7, 2
5464 lea r2, [r2 + r3 * 4]
5466 movu xm1, [r0 + r4] ; m1 = row 15
5467 punpckhbw xm2, xm0, xm1
5469 vinserti128 m0, m0, xm2, 1
5470 pmaddubsw m2, m0, [r5 + 3 * mmsize]
5472 pmaddubsw m2, m0, [r5 + 2 * mmsize]
5474 pmaddubsw m2, m0, [r5 + 1 * mmsize]
5477 lea r0, [r0 + r1 * 4]
5478 movu xm2, [r0] ; m2 = row 16
5479 punpckhbw xm3, xm1, xm2
5481 vinserti128 m1, m1, xm3, 1
5482 pmaddubsw m3, m1, [r5 + 3 * mmsize]
5484 pmaddubsw m3, m1, [r5 + 2 * mmsize]
5486 pmaddubsw m3, m1, [r5 + 1 * mmsize]
5489 movu xm3, [r0 + r1] ; m3 = row 17
5490 punpckhbw xm4, xm2, xm3
5492 vinserti128 m2, m2, xm4, 1
5493 pmaddubsw m4, m2, [r5 + 3 * mmsize]
5495 pmaddubsw m4, m2, [r5 + 2 * mmsize]
5497 pmaddubsw m2, [r5 + 1 * mmsize]
5499 movu xm4, [r0 + r1 * 2] ; m4 = row 18
5500 punpckhbw xm5, xm3, xm4
5502 vinserti128 m3, m3, xm5, 1
5503 pmaddubsw m5, m3, [r5 + 3 * mmsize]
5505 pmaddubsw m5, m3, [r5 + 2 * mmsize]
5507 pmaddubsw m3, [r5 + 1 * mmsize]
5509 movu xm5, [r0 + r4] ; m5 = row 19
5510 punpckhbw xm6, xm4, xm5
5512 vinserti128 m4, m4, xm6, 1
5513 pmaddubsw m6, m4, [r5 + 3 * mmsize]
5515 pmaddubsw m4, [r5 + 2 * mmsize]
5517 lea r0, [r0 + r1 * 4]
5518 movu xm6, [r0] ; m6 = row 20
5519 punpckhbw xm7, xm5, xm6
5521 vinserti128 m5, m5, xm7, 1
5522 pmaddubsw m7, m5, [r5 + 3 * mmsize]
5524 pmaddubsw m5, [r5 + 2 * mmsize]
5526 movu xm7, [r0 + r1] ; m7 = row 21
5527 punpckhbw xm2, xm6, xm7
5529 vinserti128 m6, m6, xm2, 1
5530 pmaddubsw m6, [r5 + 3 * mmsize]
5532 movu xm2, [r0 + r1 * 2] ; m2 = row 22
5533 punpckhbw xm3, xm7, xm2
5535 vinserti128 m7, m7, xm3, 1
5536 pmaddubsw m7, [r5 + 3 * mmsize]
; Output rows 8..15: scale, reorder and store.
5539 pmulhrsw m8, m14 ; m8 = word: row 8
5540 pmulhrsw m9, m14 ; m9 = word: row 9
5541 pmulhrsw m10, m14 ; m10 = word: row 10
5542 pmulhrsw m11, m14 ; m11 = word: row 11
5543 pmulhrsw m12, m14 ; m12 = word: row 12
5544 pmulhrsw m13, m14 ; m13 = word: row 13
5545 pmulhrsw m0, m14 ; m0 = word: row 14
5546 pmulhrsw m1, m14 ; m1 = word: row 15
5551 vpermq m8, m8, 11011000b
5552 vpermq m10, m10, 11011000b
5553 vpermq m12, m12, 11011000b
5554 vpermq m0, m0, 11011000b
5555 vextracti128 xm9, m8, 1
5556 vextracti128 xm11, m10, 1
5557 vextracti128 xm13, m12, 1
5558 vextracti128 xm1, m0, 1
5560 pextrd [r2 + 8], xm8, 2
5562 pextrd [r2 + r3 + 8], xm9, 2
5563 movq [r2 + r3 * 2], xm10
5564 pextrd [r2 + r3 * 2 + 8], xm10, 2
5565 movq [r2 + r6], xm11
5566 pextrd [r2 + r6 + 8], xm11, 2
5567 lea r2, [r2 + r3 * 4]
5569 pextrd [r2 + 8], xm12, 2
5570 movq [r2 + r3], xm13
5571 pextrd [r2 + r3 + 8], xm13, 2
5572 movq [r2 + r3 * 2], xm0
5573 pextrd [r2 + r3 * 2 + 8], xm0, 2
5575 pextrd [r2 + r6 + 8], xm1, 2
; interp_8tap_vert_pp_16x16: fully unrolled 8-tap vertical luma filter for a
; 16x16 block. Built only when ARCH_X86_64 == 1; it requests 15 vector
; registers and uses m8-m14.
; Scheme visible below: each source row is loaded as 16 bytes (movu), byte-
; interleaved with the next row (punpckhbw for the high half into the upper
; lane; the low-half interleave is visible for some rows only), multiplied with
; the coefficient tables at [r5 + k * mmsize] (pmaddubsw), scaled by the
; multiplier in m14 (pmulhrsw) and stored as one 16-byte movu per output row.
; Source rows 0..22 are read. r4 and r6 are used as the offsets of the fourth
; source / destination row of each group of four.
; NOTE(review): many instructions (argument setup, load of m14, low-half
; interleaves, adds, packs, some stores, return and %endif) are not visible in
; this listing.
5580 %if ARCH_X86_64 == 1
5581 cglobal interp_8tap_vert_pp_16x16, 4, 7, 15
; Two alternative ways of forming the coefficient-table address; presumably
; the two arms of a PIC conditional that is not visible here - confirm.
5586 lea r5, [tab_LumaCoeffVer_32]
5589 lea r5, [tab_LumaCoeffVer_32 + r4]
5597 movu xm0, [r0] ; m0 = row 0
5598 movu xm1, [r0 + r1] ; m1 = row 1
5599 punpckhbw xm2, xm0, xm1
5601 vinserti128 m0, m0, xm2, 1
5603 movu xm2, [r0 + r1 * 2] ; m2 = row 2
5604 punpckhbw xm3, xm1, xm2
5606 vinserti128 m1, m1, xm3, 1
5608 movu xm3, [r0 + r4] ; m3 = row 3
5609 punpckhbw xm4, xm2, xm3
5611 vinserti128 m2, m2, xm4, 1
5612 pmaddubsw m4, m2, [r5 + 1 * mmsize]
5615 lea r0, [r0 + r1 * 4]
5616 movu xm4, [r0] ; m4 = row 4
5617 punpckhbw xm5, xm3, xm4
5619 vinserti128 m3, m3, xm5, 1
5620 pmaddubsw m5, m3, [r5 + 1 * mmsize]
5623 movu xm5, [r0 + r1] ; m5 = row 5
5624 punpckhbw xm6, xm4, xm5
5626 vinserti128 m4, m4, xm6, 1
5627 pmaddubsw m6, m4, [r5 + 2 * mmsize]
5629 pmaddubsw m6, m4, [r5 + 1 * mmsize]
5632 movu xm6, [r0 + r1 * 2] ; m6 = row 6
5633 punpckhbw xm7, xm5, xm6
5635 vinserti128 m5, m5, xm7, 1
5636 pmaddubsw m7, m5, [r5 + 2 * mmsize]
5638 pmaddubsw m7, m5, [r5 + 1 * mmsize]
5641 movu xm7, [r0 + r4] ; m7 = row 7
5642 punpckhbw xm8, xm6, xm7
5644 vinserti128 m6, m6, xm8, 1
5645 pmaddubsw m8, m6, [r5 + 3 * mmsize]
5647 pmaddubsw m8, m6, [r5 + 2 * mmsize]
5649 pmaddubsw m8, m6, [r5 + 1 * mmsize]
5652 lea r0, [r0 + r1 * 4]
5653 movu xm8, [r0] ; m8 = row 8
5654 punpckhbw xm9, xm7, xm8
5656 vinserti128 m7, m7, xm9, 1
5657 pmaddubsw m9, m7, [r5 + 3 * mmsize]
5659 pmaddubsw m9, m7, [r5 + 2 * mmsize]
5661 pmaddubsw m9, m7, [r5 + 1 * mmsize]
5664 movu xm9, [r0 + r1] ; m9 = row 9
5665 punpckhbw xm10, xm8, xm9
5667 vinserti128 m8, m8, xm10, 1
5668 pmaddubsw m10, m8, [r5 + 3 * mmsize]
5670 pmaddubsw m10, m8, [r5 + 2 * mmsize]
5672 pmaddubsw m10, m8, [r5 + 1 * mmsize]
5675 movu xm10, [r0 + r1 * 2] ; m10 = row 10
5676 punpckhbw xm11, xm9, xm10
5678 vinserti128 m9, m9, xm11, 1
5679 pmaddubsw m11, m9, [r5 + 3 * mmsize]
5681 pmaddubsw m11, m9, [r5 + 2 * mmsize]
5683 pmaddubsw m11, m9, [r5 + 1 * mmsize]
5686 movu xm11, [r0 + r4] ; m11 = row 11
5687 punpckhbw xm12, xm10, xm11
5688 punpcklbw xm10, xm11
5689 vinserti128 m10, m10, xm12, 1
5690 pmaddubsw m12, m10, [r5 + 3 * mmsize]
5692 pmaddubsw m12, m10, [r5 + 2 * mmsize]
5694 pmaddubsw m12, m10, [r5 + 1 * mmsize]
5697 lea r0, [r0 + r1 * 4]
5698 movu xm12, [r0] ; m12 = row 12
5699 punpckhbw xm13, xm11, xm12
5700 punpcklbw xm11, xm12
5701 vinserti128 m11, m11, xm13, 1
5702 pmaddubsw m13, m11, [r5 + 3 * mmsize]
5704 pmaddubsw m13, m11, [r5 + 2 * mmsize]
5706 pmaddubsw m13, m11, [r5 + 1 * mmsize]
; Output rows 0..5 are complete: scale by the multiplier in m14 (presumably
; pw_512 - confirm, the load is not visible), reorder and store.
5710 pmulhrsw m0, m14 ; m0 = word: row 0
5711 pmulhrsw m1, m14 ; m1 = word: row 1
5712 pmulhrsw m2, m14 ; m2 = word: row 2
5713 pmulhrsw m3, m14 ; m3 = word: row 3
5714 pmulhrsw m4, m14 ; m4 = word: row 4
5715 pmulhrsw m5, m14 ; m5 = word: row 5
5719 vpermq m0, m0, 11011000b
5720 vpermq m2, m2, 11011000b
5721 vpermq m4, m4, 11011000b
5722 vextracti128 xm1, m0, 1
5723 vextracti128 xm3, m2, 1
5724 vextracti128 xm5, m4, 1
5727 movu [r2 + r3 * 2], xm2
5729 lea r2, [r2 + r3 * 4]
; m0-m5 are free again and are reused for source rows 13 onwards.
5733 movu xm13, [r0 + r1] ; m13 = row 13
5734 punpckhbw xm0, xm12, xm13
5735 punpcklbw xm12, xm13
5736 vinserti128 m12, m12, xm0, 1
5737 pmaddubsw m0, m12, [r5 + 3 * mmsize]
5739 pmaddubsw m0, m12, [r5 + 2 * mmsize]
5741 pmaddubsw m0, m12, [r5 + 1 * mmsize]
5744 movu xm0, [r0 + r1 * 2] ; m0 = row 14
5745 punpckhbw xm1, xm13, xm0
5747 vinserti128 m13, m13, xm1, 1
5748 pmaddubsw m1, m13, [r5 + 3 * mmsize]
5750 pmaddubsw m1, m13, [r5 + 2 * mmsize]
5752 pmaddubsw m1, m13, [r5 + 1 * mmsize]
; Output rows 6 and 7.
5756 pmulhrsw m6, m14 ; m6 = word: row 6
5757 pmulhrsw m7, m14 ; m7 = word: row 7
5759 vpermq m6, m6, 11011000b
5760 vextracti128 xm7, m6, 1
5761 movu [r2 + r3 * 2], xm6
5763 lea r2, [r2 + r3 * 4]
5765 movu xm1, [r0 + r4] ; m1 = row 15
5766 punpckhbw xm2, xm0, xm1
5768 vinserti128 m0, m0, xm2, 1
5769 pmaddubsw m2, m0, [r5 + 3 * mmsize]
5771 pmaddubsw m2, m0, [r5 + 2 * mmsize]
5773 pmaddubsw m2, m0, [r5 + 1 * mmsize]
5776 lea r0, [r0 + r1 * 4]
5777 movu xm2, [r0] ; m2 = row 16
5778 punpckhbw xm3, xm1, xm2
5780 vinserti128 m1, m1, xm3, 1
5781 pmaddubsw m3, m1, [r5 + 3 * mmsize]
5783 pmaddubsw m3, m1, [r5 + 2 * mmsize]
5785 pmaddubsw m3, m1, [r5 + 1 * mmsize]
5788 movu xm3, [r0 + r1] ; m3 = row 17
5789 punpckhbw xm4, xm2, xm3
5791 vinserti128 m2, m2, xm4, 1
5792 pmaddubsw m4, m2, [r5 + 3 * mmsize]
5794 pmaddubsw m4, m2, [r5 + 2 * mmsize]
5796 pmaddubsw m2, [r5 + 1 * mmsize]
5798 movu xm4, [r0 + r1 * 2] ; m4 = row 18
5799 punpckhbw xm5, xm3, xm4
5801 vinserti128 m3, m3, xm5, 1
5802 pmaddubsw m5, m3, [r5 + 3 * mmsize]
5804 pmaddubsw m5, m3, [r5 + 2 * mmsize]
5806 pmaddubsw m3, [r5 + 1 * mmsize]
5808 movu xm5, [r0 + r4] ; m5 = row 19
5809 punpckhbw xm6, xm4, xm5
5811 vinserti128 m4, m4, xm6, 1
5812 pmaddubsw m6, m4, [r5 + 3 * mmsize]
5814 pmaddubsw m4, [r5 + 2 * mmsize]
5816 lea r0, [r0 + r1 * 4]
5817 movu xm6, [r0] ; m6 = row 20
5818 punpckhbw xm7, xm5, xm6
5820 vinserti128 m5, m5, xm7, 1
5821 pmaddubsw m7, m5, [r5 + 3 * mmsize]
5823 pmaddubsw m5, [r5 + 2 * mmsize]
5825 movu xm7, [r0 + r1] ; m7 = row 21
5826 punpckhbw xm2, xm6, xm7
5828 vinserti128 m6, m6, xm2, 1
5829 pmaddubsw m6, [r5 + 3 * mmsize]
5831 movu xm2, [r0 + r1 * 2] ; m2 = row 22
5832 punpckhbw xm3, xm7, xm2
5834 vinserti128 m7, m7, xm3, 1
5835 pmaddubsw m7, [r5 + 3 * mmsize]
; Output rows 8..15: scale, reorder and store.
5838 pmulhrsw m8, m14 ; m8 = word: row 8
5839 pmulhrsw m9, m14 ; m9 = word: row 9
5840 pmulhrsw m10, m14 ; m10 = word: row 10
5841 pmulhrsw m11, m14 ; m11 = word: row 11
5842 pmulhrsw m12, m14 ; m12 = word: row 12
5843 pmulhrsw m13, m14 ; m13 = word: row 13
5844 pmulhrsw m0, m14 ; m0 = word: row 14
5845 pmulhrsw m1, m14 ; m1 = word: row 15
5850 vpermq m8, m8, 11011000b
5851 vpermq m10, m10, 11011000b
5852 vpermq m12, m12, 11011000b
5853 vpermq m0, m0, 11011000b
5854 vextracti128 xm9, m8, 1
5855 vextracti128 xm11, m10, 1
5856 vextracti128 xm13, m12, 1
5857 vextracti128 xm1, m0, 1
5860 movu [r2 + r3 * 2], xm10
5861 movu [r2 + r6], xm11
5862 lea r2, [r2 + r3 * 4]
5864 movu [r2 + r3], xm13
5865 movu [r2 + r3 * 2], xm0
; interp_8tap_vert_pp_16x12: fully unrolled 8-tap vertical luma filter for a
; 16x12 block. Built only when ARCH_X86_64 == 1; it requests 15 vector
; registers and uses m8-m14.
; Scheme visible below: each source row is loaded as 16 bytes (movu), byte-
; interleaved with the next row (punpckhbw for the high half into the upper
; lane; the low-half interleave is visible for some rows only), multiplied with
; the coefficient tables at [r5 + k * mmsize] (pmaddubsw), scaled by the
; multiplier in m14 (pmulhrsw) and stored as one 16-byte movu per output row.
; Source rows 0..18 are read. r4 and r6 are used as the offsets of the fourth
; source / destination row of each group of four.
; NOTE(review): many instructions (argument setup, load of m14, low-half
; interleaves, adds, packs, some stores, return and %endif) are not visible in
; this listing.
5871 %if ARCH_X86_64 == 1
5872 cglobal interp_8tap_vert_pp_16x12, 4, 7, 15
; Two alternative ways of forming the coefficient-table address; presumably
; the two arms of a PIC conditional that is not visible here - confirm.
5877 lea r5, [tab_LumaCoeffVer_32]
5880 lea r5, [tab_LumaCoeffVer_32 + r4]
5888 movu xm0, [r0] ; m0 = row 0
5889 movu xm1, [r0 + r1] ; m1 = row 1
5890 punpckhbw xm2, xm0, xm1
5892 vinserti128 m0, m0, xm2, 1
5894 movu xm2, [r0 + r1 * 2] ; m2 = row 2
5895 punpckhbw xm3, xm1, xm2
5897 vinserti128 m1, m1, xm3, 1
5899 movu xm3, [r0 + r4] ; m3 = row 3
5900 punpckhbw xm4, xm2, xm3
5902 vinserti128 m2, m2, xm4, 1
5903 pmaddubsw m4, m2, [r5 + 1 * mmsize]
5906 lea r0, [r0 + r1 * 4]
5907 movu xm4, [r0] ; m4 = row 4
5908 punpckhbw xm5, xm3, xm4
5910 vinserti128 m3, m3, xm5, 1
5911 pmaddubsw m5, m3, [r5 + 1 * mmsize]
5914 movu xm5, [r0 + r1] ; m5 = row 5
5915 punpckhbw xm6, xm4, xm5
5917 vinserti128 m4, m4, xm6, 1
5918 pmaddubsw m6, m4, [r5 + 2 * mmsize]
5920 pmaddubsw m6, m4, [r5 + 1 * mmsize]
5923 movu xm6, [r0 + r1 * 2] ; m6 = row 6
5924 punpckhbw xm7, xm5, xm6
5926 vinserti128 m5, m5, xm7, 1
5927 pmaddubsw m7, m5, [r5 + 2 * mmsize]
5929 pmaddubsw m7, m5, [r5 + 1 * mmsize]
5932 movu xm7, [r0 + r4] ; m7 = row 7
5933 punpckhbw xm8, xm6, xm7
5935 vinserti128 m6, m6, xm8, 1
5936 pmaddubsw m8, m6, [r5 + 3 * mmsize]
5938 pmaddubsw m8, m6, [r5 + 2 * mmsize]
5940 pmaddubsw m8, m6, [r5 + 1 * mmsize]
5943 lea r0, [r0 + r1 * 4]
5944 movu xm8, [r0] ; m8 = row 8
5945 punpckhbw xm9, xm7, xm8
5947 vinserti128 m7, m7, xm9, 1
5948 pmaddubsw m9, m7, [r5 + 3 * mmsize]
5950 pmaddubsw m9, m7, [r5 + 2 * mmsize]
5952 pmaddubsw m9, m7, [r5 + 1 * mmsize]
5955 movu xm9, [r0 + r1] ; m9 = row 9
5956 punpckhbw xm10, xm8, xm9
5958 vinserti128 m8, m8, xm10, 1
5959 pmaddubsw m10, m8, [r5 + 3 * mmsize]
5961 pmaddubsw m10, m8, [r5 + 2 * mmsize]
5963 pmaddubsw m10, m8, [r5 + 1 * mmsize]
5966 movu xm10, [r0 + r1 * 2] ; m10 = row 10
5967 punpckhbw xm11, xm9, xm10
5969 vinserti128 m9, m9, xm11, 1
5970 pmaddubsw m11, m9, [r5 + 3 * mmsize]
5972 pmaddubsw m11, m9, [r5 + 2 * mmsize]
5974 pmaddubsw m11, m9, [r5 + 1 * mmsize]
5977 movu xm11, [r0 + r4] ; m11 = row 11
5978 punpckhbw xm12, xm10, xm11
5979 punpcklbw xm10, xm11
5980 vinserti128 m10, m10, xm12, 1
5981 pmaddubsw m12, m10, [r5 + 3 * mmsize]
5983 pmaddubsw m12, m10, [r5 + 2 * mmsize]
5985 pmaddubsw m12, m10, [r5 + 1 * mmsize]
5988 lea r0, [r0 + r1 * 4]
5989 movu xm12, [r0] ; m12 = row 12
5990 punpckhbw xm13, xm11, xm12
5991 punpcklbw xm11, xm12
5992 vinserti128 m11, m11, xm13, 1
5993 pmaddubsw m13, m11, [r5 + 3 * mmsize]
5995 pmaddubsw m13, m11, [r5 + 2 * mmsize]
5997 pmaddubsw m13, m11, [r5 + 1 * mmsize]
; Output rows 0..5 are complete: scale by the multiplier in m14 (presumably
; pw_512 - confirm, the load is not visible), reorder and store.
6001 pmulhrsw m0, m14 ; m0 = word: row 0
6002 pmulhrsw m1, m14 ; m1 = word: row 1
6003 pmulhrsw m2, m14 ; m2 = word: row 2
6004 pmulhrsw m3, m14 ; m3 = word: row 3
6005 pmulhrsw m4, m14 ; m4 = word: row 4
6006 pmulhrsw m5, m14 ; m5 = word: row 5
6010 vpermq m0, m0, 11011000b
6011 vpermq m2, m2, 11011000b
6012 vpermq m4, m4, 11011000b
6013 vextracti128 xm1, m0, 1
6014 vextracti128 xm3, m2, 1
6015 vextracti128 xm5, m4, 1
6018 movu [r2 + r3 * 2], xm2
6020 lea r2, [r2 + r3 * 4]
; m0-m5 are free again and are reused for source rows 13 onwards.
6024 movu xm13, [r0 + r1] ; m13 = row 13
6025 punpckhbw xm0, xm12, xm13
6026 punpcklbw xm12, xm13
6027 vinserti128 m12, m12, xm0, 1
6028 pmaddubsw m0, m12, [r5 + 3 * mmsize]
6030 pmaddubsw m0, m12, [r5 + 2 * mmsize]
6032 pmaddubsw m0, m12, [r5 + 1 * mmsize]
6034 movu xm0, [r0 + r1 * 2] ; m0 = row 14
6035 punpckhbw xm1, xm13, xm0
6037 vinserti128 m13, m13, xm1, 1
6038 pmaddubsw m1, m13, [r5 + 3 * mmsize]
6040 pmaddubsw m1, m13, [r5 + 2 * mmsize]
6042 pmaddubsw m1, m13, [r5 + 1 * mmsize]
; Output rows 6 and 7.
6045 pmulhrsw m6, m14 ; m6 = word: row 6
6046 pmulhrsw m7, m14 ; m7 = word: row 7
6048 vpermq m6, m6, 11011000b
6049 vextracti128 xm7, m6, 1
6050 movu [r2 + r3 * 2], xm6
6052 lea r2, [r2 + r3 * 4]
; Only 12 output rows are produced, so the remaining source rows (15..18) are
; multiplied with the later coefficient tables only.
6054 movu xm1, [r0 + r4] ; m1 = row 15
6055 punpckhbw xm2, xm0, xm1
6057 vinserti128 m0, m0, xm2, 1
6058 pmaddubsw m2, m0, [r5 + 3 * mmsize]
6060 pmaddubsw m2, m0, [r5 + 2 * mmsize]
6062 lea r0, [r0 + r1 * 4]
6063 movu xm2, [r0] ; m2 = row 16
6064 punpckhbw xm3, xm1, xm2
6066 vinserti128 m1, m1, xm3, 1
6067 pmaddubsw m3, m1, [r5 + 3 * mmsize]
6069 pmaddubsw m3, m1, [r5 + 2 * mmsize]
6071 movu xm3, [r0 + r1] ; m3 = row 17
6072 punpckhbw xm4, xm2, xm3
6074 vinserti128 m2, m2, xm4, 1
6075 pmaddubsw m4, m2, [r5 + 3 * mmsize]
6077 movu xm4, [r0 + r1 * 2] ; m4 = row 18
6078 punpckhbw xm5, xm3, xm4
6080 vinserti128 m3, m3, xm5, 1
6081 pmaddubsw m5, m3, [r5 + 3 * mmsize]
; Output rows 8..11: scale, reorder and store.
6084 pmulhrsw m8, m14 ; m8 = word: row 8
6085 pmulhrsw m9, m14 ; m9 = word: row 9
6086 pmulhrsw m10, m14 ; m10 = word: row 10
6087 pmulhrsw m11, m14 ; m11 = word: row 11
6090 vpermq m8, m8, 11011000b
6091 vpermq m10, m10, 11011000b
6092 vextracti128 xm9, m8, 1
6093 vextracti128 xm11, m10, 1
6096 movu [r2 + r3 * 2], xm10
6097 movu [r2 + r6], xm11
;-----------------------------------------------------------------------------
; interp_8tap_vert_pp_16x8: vertical luma filter over 16-byte-wide rows using
; AVX2 forms (vinserti128 / vpermq / vextracti128).
; Assembled only when ARCH_X86_64 == 1.
;
; Register use visible in this block:
;   r0      source row pointer; rows are read at r0, r0 + r1, r0 + r1 * 2 and
;           r0 + r4, then r0 is advanced by r1 * 4
;   r1      source row pitch
;   r2, r3  destination pointer and destination pitch
;   r4      first indexes tab_LumaCoeffVer_32, later is the offset of every
;           fourth row -- presumably 3 * r1; the reload is not shown here
;   r5      coefficient pointer; coefficient pairs are read at r5 + k * mmsize
;   m14     multiplier for pmulhrsw -- NOTE(review): its load is not visible
;           in this listing, confirm the constant
; NOTE(review): several pmaddubsw results below target the same register
; back-to-back; the accumulation between them is not visible here -- confirm.
;-----------------------------------------------------------------------------
6102 %if ARCH_X86_64 == 1
6103 cglobal interp_8tap_vert_pp_16x8, 4, 7, 15
; Two forms of the coefficient-pointer setup.
; NOTE(review): the condition selecting between them is not visible here.
6108 lea r5, [tab_LumaCoeffVer_32]
6111 lea r5, [tab_LumaCoeffVer_32 + r4]
; Row pipeline: load 16 bytes of the next row, interleave the high 8 bytes of
; row n and row n+1 (punpckhbw), and place that in the upper 128-bit lane of
; the row-n register (vinserti128). pmaddubsw then multiplies the unsigned
; source bytes by the signed table bytes and sums adjacent pairs into words.
6119 movu xm0, [r0] ; m0 = row 0
6120 movu xm1, [r0 + r1] ; m1 = row 1
6121 punpckhbw xm2, xm0, xm1
6123 vinserti128 m0, m0, xm2, 1
6125 movu xm2, [r0 + r1 * 2] ; m2 = row 2
6126 punpckhbw xm3, xm1, xm2
6128 vinserti128 m1, m1, xm3, 1
6130 movu xm3, [r0 + r4] ; m3 = row 3
6131 punpckhbw xm4, xm2, xm3
6133 vinserti128 m2, m2, xm4, 1
6134 pmaddubsw m4, m2, [r5 + 1 * mmsize]
; Step the source pointer down four rows.
6137 lea r0, [r0 + r1 * 4]
6138 movu xm4, [r0] ; m4 = row 4
6139 punpckhbw xm5, xm3, xm4
6141 vinserti128 m3, m3, xm5, 1
6142 pmaddubsw m5, m3, [r5 + 1 * mmsize]
6145 movu xm5, [r0 + r1] ; m5 = row 5
6146 punpckhbw xm6, xm4, xm5
6148 vinserti128 m4, m4, xm6, 1
6149 pmaddubsw m6, m4, [r5 + 2 * mmsize]
6151 pmaddubsw m6, m4, [r5 + 1 * mmsize]
6154 movu xm6, [r0 + r1 * 2] ; m6 = row 6
6155 punpckhbw xm7, xm5, xm6
6157 vinserti128 m5, m5, xm7, 1
6158 pmaddubsw m7, m5, [r5 + 2 * mmsize]
6160 pmaddubsw m7, m5, [r5 + 1 * mmsize]
6163 movu xm7, [r0 + r4] ; m7 = row 7
6164 punpckhbw xm8, xm6, xm7
6166 vinserti128 m6, m6, xm8, 1
6167 pmaddubsw m8, m6, [r5 + 3 * mmsize]
6169 pmaddubsw m8, m6, [r5 + 2 * mmsize]
6171 pmaddubsw m8, m6, [r5 + 1 * mmsize]
; Step the source pointer down four rows.
6174 lea r0, [r0 + r1 * 4]
6175 movu xm8, [r0] ; m8 = row 8
6176 punpckhbw xm9, xm7, xm8
6178 vinserti128 m7, m7, xm9, 1
6179 pmaddubsw m9, m7, [r5 + 3 * mmsize]
6181 pmaddubsw m9, m7, [r5 + 2 * mmsize]
6183 pmaddubsw m9, m7, [r5 + 1 * mmsize]
6186 movu xm9, [r0 + r1] ; m9 = row 9
6187 punpckhbw xm10, xm8, xm9
6189 vinserti128 m8, m8, xm10, 1
6190 pmaddubsw m10, m8, [r5 + 3 * mmsize]
6192 pmaddubsw m10, m8, [r5 + 2 * mmsize]
6194 pmaddubsw m10, m8, [r5 + 1 * mmsize]
6196 movu xm10, [r0 + r1 * 2] ; m10 = row 10
6197 punpckhbw xm11, xm9, xm10
6199 vinserti128 m9, m9, xm11, 1
6200 pmaddubsw m11, m9, [r5 + 3 * mmsize]
6202 pmaddubsw m11, m9, [r5 + 2 * mmsize]
6204 pmaddubsw m11, m9, [r5 + 1 * mmsize]
6206 movu xm11, [r0 + r4] ; m11 = row 11
6207 punpckhbw xm12, xm10, xm11
6208 punpcklbw xm10, xm11
6209 vinserti128 m10, m10, xm12, 1
6210 pmaddubsw m12, m10, [r5 + 3 * mmsize]
6212 pmaddubsw m12, m10, [r5 + 2 * mmsize]
; Step the source pointer down four rows.
6214 lea r0, [r0 + r1 * 4]
6215 movu xm12, [r0] ; m12 = row 12
6216 punpckhbw xm13, xm11, xm12
6217 punpcklbw xm11, xm12
6218 vinserti128 m11, m11, xm13, 1
6219 pmaddubsw m13, m11, [r5 + 3 * mmsize]
6221 pmaddubsw m13, m11, [r5 + 2 * mmsize]
; Round/scale output rows 0-5 by the multiplier in m14.
6224 pmulhrsw m0, m14 ; m0 = word: row 0
6225 pmulhrsw m1, m14 ; m1 = word: row 1
6226 pmulhrsw m2, m14 ; m2 = word: row 2
6227 pmulhrsw m3, m14 ; m3 = word: row 3
6228 pmulhrsw m4, m14 ; m4 = word: row 4
6229 pmulhrsw m5, m14 ; m5 = word: row 5
; vpermq 11011000b reorders the four 64-bit quarters as 0, 2, 1, 3;
; vextracti128 then copies the upper 128 bits into the odd-numbered register.
6233 vpermq m0, m0, 11011000b
6234 vpermq m2, m2, 11011000b
6235 vpermq m4, m4, 11011000b
6236 vextracti128 xm1, m0, 1
6237 vextracti128 xm3, m2, 1
6238 vextracti128 xm5, m4, 1
; Store one 16-byte output row, then move the destination down four rows.
6241 movu [r2 + r3 * 2], xm2
6243 lea r2, [r2 + r3 * 4]
; Remaining source rows needed by the last visible output rows (6 and 7).
6247 movu xm13, [r0 + r1] ; m13 = row 13
6248 punpckhbw xm0, xm12, xm13
6249 punpcklbw xm12, xm13
6250 vinserti128 m12, m12, xm0, 1
6251 pmaddubsw m0, m12, [r5 + 3 * mmsize]
6253 movu xm0, [r0 + r1 * 2] ; m0 = row 14
6254 punpckhbw xm1, xm13, xm0
6256 vinserti128 m13, m13, xm1, 1
6257 pmaddubsw m1, m13, [r5 + 3 * mmsize]
6260 pmulhrsw m6, m14 ; m6 = word: row 6
6261 pmulhrsw m7, m14 ; m7 = word: row 7
6263 vpermq m6, m6, 11011000b
6264 vextracti128 xm7, m6, 1
6265 movu [r2 + r3 * 2], xm6
;-----------------------------------------------------------------------------
; interp_8tap_vert_pp_16x4: same row pipeline as the wider/taller AVX2 luma
; vertical filters, but only output rows 0-3 are scaled and stored here.
; Assembled only when ARCH_X86_64 == 1; declares 13 vector registers.
;
; Visible register roles:
;   r0 / r1  source row pointer / source pitch (r0 advanced by r1 * 4)
;   r2 / r3  destination pointer / destination pitch
;   r4       table index in the second lea, then offset of every fourth row
;            -- presumably 3 * r1; the reload is not shown here
;   r5       coefficient pointer (pairs at r5 + k * mmsize)
;   m12      multiplier for pmulhrsw -- NOTE(review): load not visible, confirm
;-----------------------------------------------------------------------------
6271 %if ARCH_X86_64 == 1
6272 cglobal interp_8tap_vert_pp_16x4, 4, 7, 13
; Two forms of the coefficient-pointer setup.
; NOTE(review): the selecting condition is not visible in this listing.
6277 lea r5, [tab_LumaCoeffVer_32]
6280 lea r5, [tab_LumaCoeffVer_32 + r4]
; For each new row: punpckhbw interleaves the high 8 bytes with the previous
; row, vinserti128 puts that into the previous row register's upper lane, and
; pmaddubsw forms word sums of (unsigned source byte * signed table byte) pairs.
6288 movu xm0, [r0] ; m0 = row 0
6289 movu xm1, [r0 + r1] ; m1 = row 1
6290 punpckhbw xm2, xm0, xm1
6292 vinserti128 m0, m0, xm2, 1
6294 movu xm2, [r0 + r1 * 2] ; m2 = row 2
6295 punpckhbw xm3, xm1, xm2
6297 vinserti128 m1, m1, xm3, 1
6299 movu xm3, [r0 + r4] ; m3 = row 3
6300 punpckhbw xm4, xm2, xm3
6302 vinserti128 m2, m2, xm4, 1
6303 pmaddubsw m4, m2, [r5 + 1 * mmsize]
; Step the source pointer down four rows.
6306 lea r0, [r0 + r1 * 4]
6307 movu xm4, [r0] ; m4 = row 4
6308 punpckhbw xm5, xm3, xm4
6310 vinserti128 m3, m3, xm5, 1
6311 pmaddubsw m5, m3, [r5 + 1 * mmsize]
6314 movu xm5, [r0 + r1] ; m5 = row 5
6315 punpckhbw xm6, xm4, xm5
6317 vinserti128 m4, m4, xm6, 1
6318 pmaddubsw m6, m4, [r5 + 2 * mmsize]
6320 pmaddubsw m6, m4, [r5 + 1 * mmsize]
6322 movu xm6, [r0 + r1 * 2] ; m6 = row 6
6323 punpckhbw xm7, xm5, xm6
6325 vinserti128 m5, m5, xm7, 1
6326 pmaddubsw m7, m5, [r5 + 2 * mmsize]
6328 pmaddubsw m7, m5, [r5 + 1 * mmsize]
6330 movu xm7, [r0 + r4] ; m7 = row 7
6331 punpckhbw xm8, xm6, xm7
6333 vinserti128 m6, m6, xm8, 1
6334 pmaddubsw m8, m6, [r5 + 3 * mmsize]
6336 pmaddubsw m8, m6, [r5 + 2 * mmsize]
; Step the source pointer down four rows.
6338 lea r0, [r0 + r1 * 4]
6339 movu xm8, [r0] ; m8 = row 8
6340 punpckhbw xm9, xm7, xm8
6342 vinserti128 m7, m7, xm9, 1
6343 pmaddubsw m9, m7, [r5 + 3 * mmsize]
6345 pmaddubsw m9, m7, [r5 + 2 * mmsize]
6347 movu xm9, [r0 + r1] ; m9 = row 9
6348 punpckhbw xm10, xm8, xm9
6350 vinserti128 m8, m8, xm10, 1
6351 pmaddubsw m10, m8, [r5 + 3 * mmsize]
6353 movu xm10, [r0 + r1 * 2] ; m10 = row 10
6354 punpckhbw xm11, xm9, xm10
6356 vinserti128 m9, m9, xm11, 1
6357 pmaddubsw m11, m9, [r5 + 3 * mmsize]
; Round/scale the four output rows with the multiplier in m12.
6360 pmulhrsw m0, m12 ; m0 = word: row 0
6361 pmulhrsw m1, m12 ; m1 = word: row 1
6362 pmulhrsw m2, m12 ; m2 = word: row 2
6363 pmulhrsw m3, m12 ; m3 = word: row 3
; Reorder 64-bit quarters as 0, 2, 1, 3, then split off the upper 128 bits.
6366 vpermq m0, m0, 11011000b
6367 vpermq m2, m2, 11011000b
6368 vextracti128 xm1, m0, 1
6369 vextracti128 xm3, m2, 1
6372 movu [r2 + r3 * 2], xm2
;-----------------------------------------------------------------------------
; FILTER_VER_LUMA_AVX2_16xN width, height
;   %1, %2  substituted into the emitted symbol interp_8tap_vert_pp_%1x%2
; Emits (only when ARCH_X86_64 == 1) a function that declares 9 GPRs and
; 15 vector registers. The visible body handles 16 output rows per pass and
; advances both r0 (source) and r2 (destination) in place.
; NOTE(review): the loop over %2 and the use of the extra GPRs are not visible
; in this listing -- confirm how the pass count is derived from %2.
;
; Visible register roles:
;   r0 / r1  source row pointer / source pitch
;   r2 / r3  destination pointer / destination pitch
;   r4       offset of every fourth source row -- presumably 3 * r1
;   r6       offset of every fourth destination row -- presumably 3 * r3
;   r5       coefficient pointer (pairs at r5 + k * mmsize)
;   m14      multiplier for pmulhrsw -- load not visible here, confirm
;-----------------------------------------------------------------------------
6377 %macro FILTER_VER_LUMA_AVX2_16xN 2
6379 %if ARCH_X86_64 == 1
6380 cglobal interp_8tap_vert_pp_%1x%2, 4, 9, 15
; Two forms of the coefficient-pointer setup (selecting condition not shown).
6385 lea r5, [tab_LumaCoeffVer_32]
6388 lea r5, [tab_LumaCoeffVer_32 + r4]
; Row pipeline: each newly loaded 16-byte row is interleaved with the previous
; one (punpckhbw for the high 8 bytes), merged into a 256-bit register with
; vinserti128, and multiplied against the table with pmaddubsw.
6399 movu xm0, [r0] ; m0 = row 0
6400 movu xm1, [r0 + r1] ; m1 = row 1
6401 punpckhbw xm2, xm0, xm1
6403 vinserti128 m0, m0, xm2, 1
6405 movu xm2, [r0 + r1 * 2] ; m2 = row 2
6406 punpckhbw xm3, xm1, xm2
6408 vinserti128 m1, m1, xm3, 1
6410 movu xm3, [r0 + r4] ; m3 = row 3
6411 punpckhbw xm4, xm2, xm3
6413 vinserti128 m2, m2, xm4, 1
6414 pmaddubsw m4, m2, [r5 + 1 * mmsize]
6417 lea r0, [r0 + r1 * 4]
6418 movu xm4, [r0] ; m4 = row 4
6419 punpckhbw xm5, xm3, xm4
6421 vinserti128 m3, m3, xm5, 1
6422 pmaddubsw m5, m3, [r5 + 1 * mmsize]
6425 movu xm5, [r0 + r1] ; m5 = row 5
6426 punpckhbw xm6, xm4, xm5
6428 vinserti128 m4, m4, xm6, 1
6429 pmaddubsw m6, m4, [r5 + 2 * mmsize]
6431 pmaddubsw m6, m4, [r5 + 1 * mmsize]
6434 movu xm6, [r0 + r1 * 2] ; m6 = row 6
6435 punpckhbw xm7, xm5, xm6
6437 vinserti128 m5, m5, xm7, 1
6438 pmaddubsw m7, m5, [r5 + 2 * mmsize]
6440 pmaddubsw m7, m5, [r5 + 1 * mmsize]
6443 movu xm7, [r0 + r4] ; m7 = row 7
6444 punpckhbw xm8, xm6, xm7
6446 vinserti128 m6, m6, xm8, 1
6447 pmaddubsw m8, m6, [r5 + 3 * mmsize]
6449 pmaddubsw m8, m6, [r5 + 2 * mmsize]
6451 pmaddubsw m8, m6, [r5 + 1 * mmsize]
6454 lea r0, [r0 + r1 * 4]
6455 movu xm8, [r0] ; m8 = row 8
6456 punpckhbw xm9, xm7, xm8
6458 vinserti128 m7, m7, xm9, 1
6459 pmaddubsw m9, m7, [r5 + 3 * mmsize]
6461 pmaddubsw m9, m7, [r5 + 2 * mmsize]
6463 pmaddubsw m9, m7, [r5 + 1 * mmsize]
6466 movu xm9, [r0 + r1] ; m9 = row 9
6467 punpckhbw xm10, xm8, xm9
6469 vinserti128 m8, m8, xm10, 1
6470 pmaddubsw m10, m8, [r5 + 3 * mmsize]
6472 pmaddubsw m10, m8, [r5 + 2 * mmsize]
6474 pmaddubsw m10, m8, [r5 + 1 * mmsize]
6477 movu xm10, [r0 + r1 * 2] ; m10 = row 10
6478 punpckhbw xm11, xm9, xm10
6480 vinserti128 m9, m9, xm11, 1
6481 pmaddubsw m11, m9, [r5 + 3 * mmsize]
6483 pmaddubsw m11, m9, [r5 + 2 * mmsize]
6485 pmaddubsw m11, m9, [r5 + 1 * mmsize]
6488 movu xm11, [r0 + r4] ; m11 = row 11
6489 punpckhbw xm12, xm10, xm11
6490 punpcklbw xm10, xm11
6491 vinserti128 m10, m10, xm12, 1
6492 pmaddubsw m12, m10, [r5 + 3 * mmsize]
6494 pmaddubsw m12, m10, [r5 + 2 * mmsize]
6496 pmaddubsw m12, m10, [r5 + 1 * mmsize]
6499 lea r0, [r0 + r1 * 4]
6500 movu xm12, [r0] ; m12 = row 12
6501 punpckhbw xm13, xm11, xm12
6502 punpcklbw xm11, xm12
6503 vinserti128 m11, m11, xm13, 1
6504 pmaddubsw m13, m11, [r5 + 3 * mmsize]
6506 pmaddubsw m13, m11, [r5 + 2 * mmsize]
6508 pmaddubsw m13, m11, [r5 + 1 * mmsize]
; Output rows 0-5: scale/round, reorder quarters (0, 2, 1, 3), split and store.
6512 pmulhrsw m0, m14 ; m0 = word: row 0
6513 pmulhrsw m1, m14 ; m1 = word: row 1
6514 pmulhrsw m2, m14 ; m2 = word: row 2
6515 pmulhrsw m3, m14 ; m3 = word: row 3
6516 pmulhrsw m4, m14 ; m4 = word: row 4
6517 pmulhrsw m5, m14 ; m5 = word: row 5
6521 vpermq m0, m0, 11011000b
6522 vpermq m2, m2, 11011000b
6523 vpermq m4, m4, 11011000b
6524 vextracti128 xm1, m0, 1
6525 vextracti128 xm3, m2, 1
6526 vextracti128 xm5, m4, 1
6529 movu [r2 + r3 * 2], xm2
6531 lea r2, [r2 + r3 * 4]
; m0 and m1 are free again after the stores above and are reused for the
; row 12/13 and row 13/14 products.
6535 movu xm13, [r0 + r1] ; m13 = row 13
6536 punpckhbw xm0, xm12, xm13
6537 punpcklbw xm12, xm13
6538 vinserti128 m12, m12, xm0, 1
6539 pmaddubsw m0, m12, [r5 + 3 * mmsize]
6541 pmaddubsw m0, m12, [r5 + 2 * mmsize]
6543 pmaddubsw m0, m12, [r5 + 1 * mmsize]
6546 movu xm0, [r0 + r1 * 2] ; m0 = row 14
6547 punpckhbw xm1, xm13, xm0
6549 vinserti128 m13, m13, xm1, 1
6550 pmaddubsw m1, m13, [r5 + 3 * mmsize]
6552 pmaddubsw m1, m13, [r5 + 2 * mmsize]
6554 pmaddubsw m1, m13, [r5 + 1 * mmsize]
; Output rows 6-7.
6558 pmulhrsw m6, m14 ; m6 = word: row 6
6559 pmulhrsw m7, m14 ; m7 = word: row 7
6561 vpermq m6, m6, 11011000b
6562 vextracti128 xm7, m6, 1
6563 movu [r2 + r3 * 2], xm6
6565 lea r2, [r2 + r3 * 4]
; Source rows 15-22 feed the tail taps of output rows 8-15.
6567 movu xm1, [r0 + r4] ; m1 = row 15
6568 punpckhbw xm2, xm0, xm1
6570 vinserti128 m0, m0, xm2, 1
6571 pmaddubsw m2, m0, [r5 + 3 * mmsize]
6573 pmaddubsw m2, m0, [r5 + 2 * mmsize]
6575 pmaddubsw m2, m0, [r5 + 1 * mmsize]
6578 lea r0, [r0 + r1 * 4]
6579 movu xm2, [r0] ; m2 = row 16
6580 punpckhbw xm3, xm1, xm2
6582 vinserti128 m1, m1, xm3, 1
6583 pmaddubsw m3, m1, [r5 + 3 * mmsize]
6585 pmaddubsw m3, m1, [r5 + 2 * mmsize]
6587 pmaddubsw m3, m1, [r5 + 1 * mmsize]
6590 movu xm3, [r0 + r1] ; m3 = row 17
6591 punpckhbw xm4, xm2, xm3
6593 vinserti128 m2, m2, xm4, 1
6594 pmaddubsw m4, m2, [r5 + 3 * mmsize]
6596 pmaddubsw m4, m2, [r5 + 2 * mmsize]
6598 pmaddubsw m2, [r5 + 1 * mmsize]
6600 movu xm4, [r0 + r1 * 2] ; m4 = row 18
6601 punpckhbw xm5, xm3, xm4
6603 vinserti128 m3, m3, xm5, 1
6604 pmaddubsw m5, m3, [r5 + 3 * mmsize]
6606 pmaddubsw m5, m3, [r5 + 2 * mmsize]
6608 pmaddubsw m3, [r5 + 1 * mmsize]
6610 movu xm5, [r0 + r4] ; m5 = row 19
6611 punpckhbw xm6, xm4, xm5
6613 vinserti128 m4, m4, xm6, 1
6614 pmaddubsw m6, m4, [r5 + 3 * mmsize]
6616 pmaddubsw m4, [r5 + 2 * mmsize]
6618 lea r0, [r0 + r1 * 4]
6619 movu xm6, [r0] ; m6 = row 20
6620 punpckhbw xm7, xm5, xm6
6622 vinserti128 m5, m5, xm7, 1
6623 pmaddubsw m7, m5, [r5 + 3 * mmsize]
6625 pmaddubsw m5, [r5 + 2 * mmsize]
6627 movu xm7, [r0 + r1] ; m7 = row 21
6628 punpckhbw xm2, xm6, xm7
6630 vinserti128 m6, m6, xm2, 1
6631 pmaddubsw m6, [r5 + 3 * mmsize]
6633 movu xm2, [r0 + r1 * 2] ; m2 = row 22
6634 punpckhbw xm3, xm7, xm2
6636 vinserti128 m7, m7, xm3, 1
6637 pmaddubsw m7, [r5 + 3 * mmsize]
; Output rows 8-15: scale/round, reorder quarters, split and store.
6640 pmulhrsw m8, m14 ; m8 = word: row 8
6641 pmulhrsw m9, m14 ; m9 = word: row 9
6642 pmulhrsw m10, m14 ; m10 = word: row 10
6643 pmulhrsw m11, m14 ; m11 = word: row 11
6644 pmulhrsw m12, m14 ; m12 = word: row 12
6645 pmulhrsw m13, m14 ; m13 = word: row 13
6646 pmulhrsw m0, m14 ; m0 = word: row 14
6647 pmulhrsw m1, m14 ; m1 = word: row 15
6652 vpermq m8, m8, 11011000b
6653 vpermq m10, m10, 11011000b
6654 vpermq m12, m12, 11011000b
6655 vpermq m0, m0, 11011000b
6656 vextracti128 xm9, m8, 1
6657 vextracti128 xm11, m10, 1
6658 vextracti128 xm13, m12, 1
6659 vextracti128 xm1, m0, 1
6662 movu [r2 + r3 * 2], xm10
6663 movu [r2 + r6], xm11
6664 lea r2, [r2 + r3 * 4]
6666 movu [r2 + r3], xm13
6667 movu [r2 + r3 * 2], xm0
6669 lea r2, [r2 + r3 * 4]
; Instantiations: emit interp_8tap_vert_pp_16x32 and interp_8tap_vert_pp_16x64.
6677 FILTER_VER_LUMA_AVX2_16xN 16, 32
6678 FILTER_VER_LUMA_AVX2_16xN 16, 64
;-----------------------------------------------------------------------------
; PROCESS_LUMA_AVX2_W16_16R (no parameters)
; Filters a 16-byte-wide column for output rows 0-15 (per the row comments).
; Unlike the stand-alone 16xN functions, the visible lines never write r0 or
; r2: r7 walks the source (starting at r0 + r1 * 4) and r8 walks the
; destination (starting at r2 + r3 * 4), so the caller keeps its base pointers.
;
; Expected on entry, as read by the visible lines:
;   r0 / r1  source base / source pitch
;   r2 / r3  destination base / destination pitch
;   r4, r6   offsets of every fourth source / destination row
;            -- presumably 3 * r1 and 3 * r3; confirm against the callers
;   r5       coefficient pointer (pairs at r5 + k * mmsize)
;   m14      multiplier for pmulhrsw -- confirm the constant at the call sites
; Writes r7, r8 and m0-m13.
;-----------------------------------------------------------------------------
6680 %macro PROCESS_LUMA_AVX2_W16_16R 0
6681 movu xm0, [r0] ; m0 = row 0
6682 movu xm1, [r0 + r1] ; m1 = row 1
6683 punpckhbw xm2, xm0, xm1
6685 vinserti128 m0, m0, xm2, 1
6687 movu xm2, [r0 + r1 * 2] ; m2 = row 2
6688 punpckhbw xm3, xm1, xm2
6690 vinserti128 m1, m1, xm3, 1
6692 movu xm3, [r0 + r4] ; m3 = row 3
6693 punpckhbw xm4, xm2, xm3
6695 vinserti128 m2, m2, xm4, 1
6696 pmaddubsw m4, m2, [r5 + 1 * mmsize]
; r7 becomes the private source cursor, four rows below r0.
6699 lea r7, [r0 + r1 * 4]
6700 movu xm4, [r7] ; m4 = row 4
6701 punpckhbw xm5, xm3, xm4
6703 vinserti128 m3, m3, xm5, 1
6704 pmaddubsw m5, m3, [r5 + 1 * mmsize]
6707 movu xm5, [r7 + r1] ; m5 = row 5
6708 punpckhbw xm6, xm4, xm5
6710 vinserti128 m4, m4, xm6, 1
6711 pmaddubsw m6, m4, [r5 + 2 * mmsize]
6713 pmaddubsw m6, m4, [r5 + 1 * mmsize]
6716 movu xm6, [r7 + r1 * 2] ; m6 = row 6
6717 punpckhbw xm7, xm5, xm6
6719 vinserti128 m5, m5, xm7, 1
6720 pmaddubsw m7, m5, [r5 + 2 * mmsize]
6722 pmaddubsw m7, m5, [r5 + 1 * mmsize]
6725 movu xm7, [r7 + r4] ; m7 = row 7
6726 punpckhbw xm8, xm6, xm7
6728 vinserti128 m6, m6, xm8, 1
6729 pmaddubsw m8, m6, [r5 + 3 * mmsize]
6731 pmaddubsw m8, m6, [r5 + 2 * mmsize]
6733 pmaddubsw m8, m6, [r5 + 1 * mmsize]
6736 lea r7, [r7 + r1 * 4]
6737 movu xm8, [r7] ; m8 = row 8
6738 punpckhbw xm9, xm7, xm8
6740 vinserti128 m7, m7, xm9, 1
6741 pmaddubsw m9, m7, [r5 + 3 * mmsize]
6743 pmaddubsw m9, m7, [r5 + 2 * mmsize]
6745 pmaddubsw m9, m7, [r5 + 1 * mmsize]
6748 movu xm9, [r7 + r1] ; m9 = row 9
6749 punpckhbw xm10, xm8, xm9
6751 vinserti128 m8, m8, xm10, 1
6752 pmaddubsw m10, m8, [r5 + 3 * mmsize]
6754 pmaddubsw m10, m8, [r5 + 2 * mmsize]
6756 pmaddubsw m10, m8, [r5 + 1 * mmsize]
6759 movu xm10, [r7 + r1 * 2] ; m10 = row 10
6760 punpckhbw xm11, xm9, xm10
6762 vinserti128 m9, m9, xm11, 1
6763 pmaddubsw m11, m9, [r5 + 3 * mmsize]
6765 pmaddubsw m11, m9, [r5 + 2 * mmsize]
6767 pmaddubsw m11, m9, [r5 + 1 * mmsize]
6770 movu xm11, [r7 + r4] ; m11 = row 11
6771 punpckhbw xm12, xm10, xm11
6772 punpcklbw xm10, xm11
6773 vinserti128 m10, m10, xm12, 1
6774 pmaddubsw m12, m10, [r5 + 3 * mmsize]
6776 pmaddubsw m12, m10, [r5 + 2 * mmsize]
6778 pmaddubsw m12, m10, [r5 + 1 * mmsize]
6781 lea r7, [r7 + r1 * 4]
6782 movu xm12, [r7] ; m12 = row 12
6783 punpckhbw xm13, xm11, xm12
6784 punpcklbw xm11, xm12
6785 vinserti128 m11, m11, xm13, 1
6786 pmaddubsw m13, m11, [r5 + 3 * mmsize]
6788 pmaddubsw m13, m11, [r5 + 2 * mmsize]
6790 pmaddubsw m13, m11, [r5 + 1 * mmsize]
; Output rows 0-5: scale/round, reorder quarters (0, 2, 1, 3), split, store.
6794 pmulhrsw m0, m14 ; m0 = word: row 0
6795 pmulhrsw m1, m14 ; m1 = word: row 1
6796 pmulhrsw m2, m14 ; m2 = word: row 2
6797 pmulhrsw m3, m14 ; m3 = word: row 3
6798 pmulhrsw m4, m14 ; m4 = word: row 4
6799 pmulhrsw m5, m14 ; m5 = word: row 5
6803 vpermq m0, m0, 11011000b
6804 vpermq m2, m2, 11011000b
6805 vpermq m4, m4, 11011000b
6806 vextracti128 xm1, m0, 1
6807 vextracti128 xm3, m2, 1
6808 vextracti128 xm5, m4, 1
6811 movu [r2 + r3 * 2], xm2
; r8 becomes the private destination cursor, four rows below r2.
6813 lea r8, [r2 + r3 * 4]
6817 movu xm13, [r7 + r1] ; m13 = row 13
6818 punpckhbw xm0, xm12, xm13
6819 punpcklbw xm12, xm13
6820 vinserti128 m12, m12, xm0, 1
6821 pmaddubsw m0, m12, [r5 + 3 * mmsize]
6823 pmaddubsw m0, m12, [r5 + 2 * mmsize]
6825 pmaddubsw m0, m12, [r5 + 1 * mmsize]
6828 movu xm0, [r7 + r1 * 2] ; m0 = row 14
6829 punpckhbw xm1, xm13, xm0
6831 vinserti128 m13, m13, xm1, 1
6832 pmaddubsw m1, m13, [r5 + 3 * mmsize]
6834 pmaddubsw m1, m13, [r5 + 2 * mmsize]
6836 pmaddubsw m1, m13, [r5 + 1 * mmsize]
; Output rows 6-7.
6840 pmulhrsw m6, m14 ; m6 = word: row 6
6841 pmulhrsw m7, m14 ; m7 = word: row 7
6843 vpermq m6, m6, 11011000b
6844 vextracti128 xm7, m6, 1
6845 movu [r8 + r3 * 2], xm6
6847 lea r8, [r8 + r3 * 4]
; Source rows 15-22 feed the tail taps of output rows 8-15.
6849 movu xm1, [r7 + r4] ; m1 = row 15
6850 punpckhbw xm2, xm0, xm1
6852 vinserti128 m0, m0, xm2, 1
6853 pmaddubsw m2, m0, [r5 + 3 * mmsize]
6855 pmaddubsw m2, m0, [r5 + 2 * mmsize]
6857 pmaddubsw m2, m0, [r5 + 1 * mmsize]
6860 lea r7, [r7 + r1 * 4]
6861 movu xm2, [r7] ; m2 = row 16
6862 punpckhbw xm3, xm1, xm2
6864 vinserti128 m1, m1, xm3, 1
6865 pmaddubsw m3, m1, [r5 + 3 * mmsize]
6867 pmaddubsw m3, m1, [r5 + 2 * mmsize]
6869 pmaddubsw m3, m1, [r5 + 1 * mmsize]
6872 movu xm3, [r7 + r1] ; m3 = row 17
6873 punpckhbw xm4, xm2, xm3
6875 vinserti128 m2, m2, xm4, 1
6876 pmaddubsw m4, m2, [r5 + 3 * mmsize]
6878 pmaddubsw m4, m2, [r5 + 2 * mmsize]
6880 pmaddubsw m2, [r5 + 1 * mmsize]
6882 movu xm4, [r7 + r1 * 2] ; m4 = row 18
6883 punpckhbw xm5, xm3, xm4
6885 vinserti128 m3, m3, xm5, 1
6886 pmaddubsw m5, m3, [r5 + 3 * mmsize]
6888 pmaddubsw m5, m3, [r5 + 2 * mmsize]
6890 pmaddubsw m3, [r5 + 1 * mmsize]
6892 movu xm5, [r7 + r4] ; m5 = row 19
6893 punpckhbw xm6, xm4, xm5
6895 vinserti128 m4, m4, xm6, 1
6896 pmaddubsw m6, m4, [r5 + 3 * mmsize]
6898 pmaddubsw m4, [r5 + 2 * mmsize]
6900 lea r7, [r7 + r1 * 4]
6901 movu xm6, [r7] ; m6 = row 20
6902 punpckhbw xm7, xm5, xm6
6904 vinserti128 m5, m5, xm7, 1
6905 pmaddubsw m7, m5, [r5 + 3 * mmsize]
6907 pmaddubsw m5, [r5 + 2 * mmsize]
6909 movu xm7, [r7 + r1] ; m7 = row 21
6910 punpckhbw xm2, xm6, xm7
6912 vinserti128 m6, m6, xm2, 1
6913 pmaddubsw m6, [r5 + 3 * mmsize]
6915 movu xm2, [r7 + r1 * 2] ; m2 = row 22
6916 punpckhbw xm3, xm7, xm2
6918 vinserti128 m7, m7, xm3, 1
6919 pmaddubsw m7, [r5 + 3 * mmsize]
; Output rows 8-15: scale/round, reorder quarters, split and store via r8.
6922 pmulhrsw m8, m14 ; m8 = word: row 8
6923 pmulhrsw m9, m14 ; m9 = word: row 9
6924 pmulhrsw m10, m14 ; m10 = word: row 10
6925 pmulhrsw m11, m14 ; m11 = word: row 11
6926 pmulhrsw m12, m14 ; m12 = word: row 12
6927 pmulhrsw m13, m14 ; m13 = word: row 13
6928 pmulhrsw m0, m14 ; m0 = word: row 14
6929 pmulhrsw m1, m14 ; m1 = word: row 15
6934 vpermq m8, m8, 11011000b
6935 vpermq m10, m10, 11011000b
6936 vpermq m12, m12, 11011000b
6937 vpermq m0, m0, 11011000b
6938 vextracti128 xm9, m8, 1
6939 vextracti128 xm11, m10, 1
6940 vextracti128 xm13, m12, 1
6941 vextracti128 xm1, m0, 1
6944 movu [r8 + r3 * 2], xm10
6945 movu [r8 + r6], xm11
6946 lea r8, [r8 + r3 * 4]
6948 movu [r8 + r3], xm13
6949 movu [r8 + r3 * 2], xm0
;-----------------------------------------------------------------------------
; PROCESS_LUMA_AVX2_W16_8R (no parameters)
; Eight-output-row variant of the 16-byte-wide column filter: source rows
; 0-14 are consumed and output rows 0-7 are scaled and stored.
; The visible lines leave r0 and r2 untouched; r7 is the source cursor
; (starting at r0 + r1 * 4) and r8 the destination cursor (r2 + r3 * 4).
;
; Expected on entry, as read by the visible lines:
;   r0 / r1  source base / source pitch
;   r2 / r3  destination base / destination pitch
;   r4       offset of every fourth source row -- presumably 3 * r1
;   r5       coefficient pointer (pairs at r5 + k * mmsize)
;   m14      multiplier for pmulhrsw -- confirm the constant at the call sites
; Writes r7, r8 and m0-m13.
;-----------------------------------------------------------------------------
6953 %macro PROCESS_LUMA_AVX2_W16_8R 0
6954 movu xm0, [r0] ; m0 = row 0
6955 movu xm1, [r0 + r1] ; m1 = row 1
6956 punpckhbw xm2, xm0, xm1
6958 vinserti128 m0, m0, xm2, 1
6960 movu xm2, [r0 + r1 * 2] ; m2 = row 2
6961 punpckhbw xm3, xm1, xm2
6963 vinserti128 m1, m1, xm3, 1
6965 movu xm3, [r0 + r4] ; m3 = row 3
6966 punpckhbw xm4, xm2, xm3
6968 vinserti128 m2, m2, xm4, 1
6969 pmaddubsw m4, m2, [r5 + 1 * mmsize]
; r7 becomes the private source cursor, four rows below r0.
6972 lea r7, [r0 + r1 * 4]
6973 movu xm4, [r7] ; m4 = row 4
6974 punpckhbw xm5, xm3, xm4
6976 vinserti128 m3, m3, xm5, 1
6977 pmaddubsw m5, m3, [r5 + 1 * mmsize]
6980 movu xm5, [r7 + r1] ; m5 = row 5
6981 punpckhbw xm6, xm4, xm5
6983 vinserti128 m4, m4, xm6, 1
6984 pmaddubsw m6, m4, [r5 + 2 * mmsize]
6986 pmaddubsw m6, m4, [r5 + 1 * mmsize]
6989 movu xm6, [r7 + r1 * 2] ; m6 = row 6
6990 punpckhbw xm7, xm5, xm6
6992 vinserti128 m5, m5, xm7, 1
6993 pmaddubsw m7, m5, [r5 + 2 * mmsize]
6995 pmaddubsw m7, m5, [r5 + 1 * mmsize]
6998 movu xm7, [r7 + r4] ; m7 = row 7
6999 punpckhbw xm8, xm6, xm7
7001 vinserti128 m6, m6, xm8, 1
7002 pmaddubsw m8, m6, [r5 + 3 * mmsize]
7004 pmaddubsw m8, m6, [r5 + 2 * mmsize]
7006 pmaddubsw m8, m6, [r5 + 1 * mmsize]
7009 lea r7, [r7 + r1 * 4]
7010 movu xm8, [r7] ; m8 = row 8
7011 punpckhbw xm9, xm7, xm8
7013 vinserti128 m7, m7, xm9, 1
7014 pmaddubsw m9, m7, [r5 + 3 * mmsize]
7016 pmaddubsw m9, m7, [r5 + 2 * mmsize]
7018 pmaddubsw m9, m7, [r5 + 1 * mmsize]
7021 movu xm9, [r7 + r1] ; m9 = row 9
7022 punpckhbw xm10, xm8, xm9
7024 vinserti128 m8, m8, xm10, 1
7025 pmaddubsw m10, m8, [r5 + 3 * mmsize]
7027 pmaddubsw m10, m8, [r5 + 2 * mmsize]
7029 pmaddubsw m10, m8, [r5 + 1 * mmsize]
7031 movu xm10, [r7 + r1 * 2] ; m10 = row 10
7032 punpckhbw xm11, xm9, xm10
7034 vinserti128 m9, m9, xm11, 1
7035 pmaddubsw m11, m9, [r5 + 3 * mmsize]
7037 pmaddubsw m11, m9, [r5 + 2 * mmsize]
7039 pmaddubsw m11, m9, [r5 + 1 * mmsize]
7041 movu xm11, [r7 + r4] ; m11 = row 11
7042 punpckhbw xm12, xm10, xm11
7043 punpcklbw xm10, xm11
7044 vinserti128 m10, m10, xm12, 1
7045 pmaddubsw m12, m10, [r5 + 3 * mmsize]
7047 pmaddubsw m12, m10, [r5 + 2 * mmsize]
7049 lea r7, [r7 + r1 * 4]
7050 movu xm12, [r7] ; m12 = row 12
7051 punpckhbw xm13, xm11, xm12
7052 punpcklbw xm11, xm12
7053 vinserti128 m11, m11, xm13, 1
7054 pmaddubsw m13, m11, [r5 + 3 * mmsize]
7056 pmaddubsw m13, m11, [r5 + 2 * mmsize]
; Output rows 0-5: scale/round, reorder quarters (0, 2, 1, 3), split, store.
7059 pmulhrsw m0, m14 ; m0 = word: row 0
7060 pmulhrsw m1, m14 ; m1 = word: row 1
7061 pmulhrsw m2, m14 ; m2 = word: row 2
7062 pmulhrsw m3, m14 ; m3 = word: row 3
7063 pmulhrsw m4, m14 ; m4 = word: row 4
7064 pmulhrsw m5, m14 ; m5 = word: row 5
7068 vpermq m0, m0, 11011000b
7069 vpermq m2, m2, 11011000b
7070 vpermq m4, m4, 11011000b
7071 vextracti128 xm1, m0, 1
7072 vextracti128 xm3, m2, 1
7073 vextracti128 xm5, m4, 1
7076 movu [r2 + r3 * 2], xm2
; r8 becomes the private destination cursor, four rows below r2.
7078 lea r8, [r2 + r3 * 4]
7082 movu xm13, [r7 + r1] ; m13 = row 13
7083 punpckhbw xm0, xm12, xm13
7084 punpcklbw xm12, xm13
7085 vinserti128 m12, m12, xm0, 1
7086 pmaddubsw m0, m12, [r5 + 3 * mmsize]
7088 movu xm0, [r7 + r1 * 2] ; m0 = row 14
7089 punpckhbw xm1, xm13, xm0
7091 vinserti128 m13, m13, xm1, 1
7092 pmaddubsw m1, m13, [r5 + 3 * mmsize]
; Output rows 6-7.
7095 pmulhrsw m6, m14 ; m6 = word: row 6
7096 pmulhrsw m7, m14 ; m7 = word: row 7
7098 vpermq m6, m6, 11011000b
7099 vextracti128 xm7, m6, 1
7100 movu [r8 + r3 * 2], xm6
;-----------------------------------------------------------------------------
; interp_8tap_vert_pp_24x32 (assembled only when ARCH_X86_64 == 1)
; Declares 11 GPRs and 15 vector registers. The visible body has two parts:
;   1. PROCESS_LUMA_AVX2_W16_16R for a 16-byte-wide column;
;   2. an 8-byte-wide column: movq loads 8 bytes per row, vinserti128 pairs
;      two values per 256-bit register, and movhps stores 8 bytes per row.
; NOTE(review): the pointer adjustment between the two parts and the loop that
; covers all 32 rows are not visible in this listing -- confirm.
; r7 is the source cursor and r8 the destination cursor for part 2; the final
; lea rebuilds r2 four rows below r8 and 16 bytes to the left.
;-----------------------------------------------------------------------------
7105 %if ARCH_X86_64 == 1
7106 cglobal interp_8tap_vert_pp_24x32, 4, 11, 15
; Two forms of the coefficient-pointer setup (selecting condition not shown).
7111 lea r5, [tab_LumaCoeffVer_32]
7114 lea r5, [tab_LumaCoeffVer_32 + r4]
; Part 1: 16-byte-wide column, 16 output rows.
7124 PROCESS_LUMA_AVX2_W16_16R
; Part 2: 8-byte-wide column; two rows share each 256-bit register.
7128 movq xm1, [r0] ; m1 = row 0
7129 movq xm2, [r0 + r1] ; m2 = row 1
7131 movq xm3, [r0 + r1 * 2] ; m3 = row 2
7133 vinserti128 m5, m1, xm2, 1
7135 movq xm4, [r0 + r4] ; m4 = row 3
7137 lea r7, [r0 + r1 * 4]
7138 movq xm1, [r7] ; m1 = row 4
7140 vinserti128 m2, m3, xm4, 1
7141 pmaddubsw m0, m2, [r5 + 1 * mmsize]
7144 movq xm3, [r7 + r1] ; m3 = row 5
7146 movq xm4, [r7 + r1 * 2] ; m4 = row 6
7148 vinserti128 m1, m1, xm3, 1
7149 pmaddubsw m3, m1, [r5 + 2 * mmsize]
7151 pmaddubsw m0, m1, [r5 + 1 * mmsize]
7154 movq xm3, [r7 + r4] ; m3 = row 7
7156 lea r7, [r7 + r1 * 4]
7157 movq xm0, [r7] ; m0 = row 8
7159 vinserti128 m4, m4, xm3, 1
7160 pmaddubsw m3, m4, [r5 + 3 * mmsize]
7162 pmaddubsw m3, m4, [r5 + 2 * mmsize]
7164 pmaddubsw m3, m4, [r5 + 1 * mmsize]
7167 movq xm3, [r7 + r1] ; m3 = row 9
7169 movq xm6, [r7 + r1 * 2] ; m6 = row 10
7171 vinserti128 m0, m0, xm3, 1
7172 pmaddubsw m3, m0, [r5 + 3 * mmsize]
7174 pmaddubsw m3, m0, [r5 + 2 * mmsize]
7176 pmaddubsw m3, m0, [r5 + 1 * mmsize]
7180 movq xm3, [r7 + r4] ; m3 = row 11
7182 lea r7, [r7 + r1 * 4]
7183 movq xm7, [r7] ; m7 = row 12
7185 vinserti128 m6, m6, xm3, 1
7186 pmaddubsw m3, m6, [r5 + 3 * mmsize]
7188 pmaddubsw m3, m6, [r5 + 2 * mmsize]
7190 pmaddubsw m3, m6, [r5 + 1 * mmsize]
7193 movq xm3, [r7 + r1] ; m3 = row 13
7195 movq xm8, [r7 + r1 * 2] ; m8 = row 14
7197 vinserti128 m7, m7, xm3, 1
7198 pmaddubsw m3, m7, [r5 + 3 * mmsize]
7200 pmaddubsw m3, m7, [r5 + 2 * mmsize]
7202 pmaddubsw m3, m7, [r5 + 1 * mmsize]
7205 movq xm3, [r7 + r4] ; m3 = row 15
7207 lea r7, [r7 + r1 * 4]
7208 movq xm9, [r7] ; m9 = row 16
7210 vinserti128 m8, m8, xm3, 1
7211 pmaddubsw m3, m8, [r5 + 3 * mmsize]
7213 pmaddubsw m3, m8, [r5 + 2 * mmsize]
7215 pmaddubsw m3, m8, [r5 + 1 * mmsize]
7218 movq xm3, [r7 + r1] ; m3 = row 17
7220 movq xm10, [r7 + r1 * 2] ; m10 = row 18
7222 vinserti128 m9, m9, xm3, 1
7223 pmaddubsw m3, m9, [r5 + 3 * mmsize]
7225 pmaddubsw m3, m9, [r5 + 2 * mmsize]
7227 pmaddubsw m3, m9, [r5 + 1 * mmsize]
7229 movq xm3, [r7 + r4] ; m3 = row 19
7231 lea r7, [r7 + r1 * 4]
7232 movq xm9, [r7] ; m9 = row 20
7234 vinserti128 m10, m10, xm3, 1
7235 pmaddubsw m3, m10, [r5 + 3 * mmsize]
7237 pmaddubsw m3, m10, [r5 + 2 * mmsize]
7239 movq xm3, [r7 + r1] ; m3 = row 21
7241 movq xm10, [r7 + r1 * 2] ; m10 = row 22
7243 vinserti128 m9, m9, xm3, 1
7244 pmaddubsw m3, m9, [r5 + 3 * mmsize]
; Scale/round all eight row-pair registers with the multiplier in m14.
7247 pmulhrsw m5, m14 ; m5 = word: row 0, row 1
7248 pmulhrsw m2, m14 ; m2 = word: row 2, row 3
7249 pmulhrsw m1, m14 ; m1 = word: row 4, row 5
7250 pmulhrsw m4, m14 ; m4 = word: row 6, row 7
7251 pmulhrsw m0, m14 ; m0 = word: row 8, row 9
7252 pmulhrsw m6, m14 ; m6 = word: row 10, row 11
7253 pmulhrsw m7, m14 ; m7 = word: row 12, row 13
7254 pmulhrsw m8, m14 ; m8 = word: row 14, row 15
; Copy the upper 128-bit lane of each kept register into a free xmm register.
7259 vextracti128 xm2, m5, 1
7260 vextracti128 xm4, m1, 1
7261 vextracti128 xm6, m0, 1
7262 vextracti128 xm8, m7, 1
; movhps writes the high 8 bytes of each xmm register to one output row.
7265 movhps [r2 + r3 * 2], xm5
7266 movhps [r2 + r6], xm2
7267 lea r8, [r2 + r3 * 4]
7270 movhps [r8 + r3 * 2], xm1
7271 movhps [r8 + r6], xm4
7272 lea r8, [r8 + r3 * 4]
7275 movhps [r8 + r3 * 2], xm0
7276 movhps [r8 + r6], xm6
7277 lea r8, [r8 + r3 * 4]
7280 movhps [r8 + r3 * 2], xm7
7281 movhps [r8 + r6], xm8
; Next destination base: four rows below r8, 16 bytes back.
7285 lea r2, [r8 + r3 * 4 - 16]
;-----------------------------------------------------------------------------
; FILTER_VER_LUMA_AVX2_32xN width, height
;   %1, %2  substituted into the emitted symbol interp_8tap_vert_pp_%1x%2
; Emits (only when ARCH_X86_64 == 1) a function declaring 12 GPRs and
; 15 vector registers, built around PROCESS_LUMA_AVX2_W16_16R.
; NOTE(review): the column/row loops are not visible in this listing; the
; "- 16" in the final lea presumably returns to the first 16-byte column
; after the second one has been processed -- confirm.
;-----------------------------------------------------------------------------
7291 %macro FILTER_VER_LUMA_AVX2_32xN 2
7293 %if ARCH_X86_64 == 1
7294 cglobal interp_8tap_vert_pp_%1x%2, 4, 12, 15
; Two forms of the coefficient-pointer setup (selecting condition not shown).
7299 lea r5, [tab_LumaCoeffVer_32]
7302 lea r5, [tab_LumaCoeffVer_32 + r4]
7314 PROCESS_LUMA_AVX2_W16_16R
; Next destination base: four rows below the macro's cursor r8, 16 bytes back.
7321 lea r2, [r8 + r3 * 4 - 16]
; Instantiations: emit interp_8tap_vert_pp_32x32 and interp_8tap_vert_pp_32x64.
7328 FILTER_VER_LUMA_AVX2_32xN 32, 32
7329 FILTER_VER_LUMA_AVX2_32xN 32, 64
;-----------------------------------------------------------------------------
; interp_8tap_vert_pp_32x16 (assembled only when ARCH_X86_64 == 1)
; Declares 10 GPRs and 15 vector registers; the visible body is one expansion
; of PROCESS_LUMA_AVX2_W16_16R (16-byte-wide column, 16 output rows).
; NOTE(review): the iteration over the two 16-byte columns is not visible in
; this listing -- confirm.
;-----------------------------------------------------------------------------
7332 %if ARCH_X86_64 == 1
7333 cglobal interp_8tap_vert_pp_32x16, 4, 10, 15
; Two forms of the coefficient-pointer setup (selecting condition not shown).
7338 lea r5, [tab_LumaCoeffVer_32]
7341 lea r5, [tab_LumaCoeffVer_32 + r4]
7350 PROCESS_LUMA_AVX2_W16_16R
;-----------------------------------------------------------------------------
; interp_8tap_vert_pp_32x24 (assembled only when ARCH_X86_64 == 1)
; Declares 10 GPRs and 15 vector registers. The visible body runs the
; 16-output-row column macro, rebases the destination, then runs the
; 8-output-row column macro (16 + 8 = 24 rows).
; NOTE(review): the iteration over the two 16-byte columns and the matching
; source-pointer update are not visible in this listing -- confirm.
;-----------------------------------------------------------------------------
7359 %if ARCH_X86_64 == 1
7360 cglobal interp_8tap_vert_pp_32x24, 4, 10, 15
; Two forms of the coefficient-pointer setup (selecting condition not shown).
7365 lea r5, [tab_LumaCoeffVer_32]
7368 lea r5, [tab_LumaCoeffVer_32 + r4]
7377 PROCESS_LUMA_AVX2_W16_16R
; Next destination base: four rows below the macro's cursor r8, 16 bytes back.
7385 lea r2, [r8 + r3 * 4 - 16]
7388 PROCESS_LUMA_AVX2_W16_8R
;-----------------------------------------------------------------------------
; interp_8tap_vert_pp_32x8 (assembled only when ARCH_X86_64 == 1)
; Declares 10 GPRs and 15 vector registers; the visible body is one expansion
; of PROCESS_LUMA_AVX2_W16_8R (16-byte-wide column, 8 output rows).
; NOTE(review): the iteration over the two 16-byte columns is not visible in
; this listing -- confirm.
;-----------------------------------------------------------------------------
7397 %if ARCH_X86_64 == 1
7398 cglobal interp_8tap_vert_pp_32x8, 4, 10, 15
; Two forms of the coefficient-pointer setup (selecting condition not shown).
7403 lea r5, [tab_LumaCoeffVer_32]
7406 lea r5, [tab_LumaCoeffVer_32 + r4]
7415 PROCESS_LUMA_AVX2_W16_8R
;-----------------------------------------------------------------------------
; interp_8tap_vert_pp_48x64 (assembled only when ARCH_X86_64 == 1)
; Declares 12 GPRs and 15 vector registers, built around
; PROCESS_LUMA_AVX2_W16_16R.
; NOTE(review): the column/row loops are not visible in this listing; the
; "- 32" in the final lea presumably undoes two 16-byte column steps so the
; next row band starts at the first column -- confirm.
;-----------------------------------------------------------------------------
7424 %if ARCH_X86_64 == 1
7425 cglobal interp_8tap_vert_pp_48x64, 4, 12, 15
; Two forms of the coefficient-pointer setup (selecting condition not shown).
7430 lea r5, [tab_LumaCoeffVer_32]
7433 lea r5, [tab_LumaCoeffVer_32 + r4]
7445 PROCESS_LUMA_AVX2_W16_16R
; Next destination base: four rows below the macro's cursor r8, 32 bytes back.
7452 lea r2, [r8 + r3 * 4 - 32]
;-----------------------------------------------------------------------------
; FILTER_VER_LUMA_AVX2_64xN width, height
;   %1, %2  substituted into the emitted symbol interp_8tap_vert_pp_%1x%2
; Emits (only when ARCH_X86_64 == 1) a function declaring 12 GPRs and
; 15 vector registers, built around PROCESS_LUMA_AVX2_W16_16R.
; NOTE(review): the column/row loops are not visible in this listing; the
; "- 48" in the final lea presumably undoes three 16-byte column steps so the
; next row band starts at the first column -- confirm.
;-----------------------------------------------------------------------------
7458 %macro FILTER_VER_LUMA_AVX2_64xN 2
7460 %if ARCH_X86_64 == 1
7461 cglobal interp_8tap_vert_pp_%1x%2, 4, 12, 15
; Two forms of the coefficient-pointer setup (selecting condition not shown).
7466 lea r5, [tab_LumaCoeffVer_32]
7469 lea r5, [tab_LumaCoeffVer_32 + r4]
7481 PROCESS_LUMA_AVX2_W16_16R
; Next destination base: four rows below the macro's cursor r8, 48 bytes back.
7488 lea r2, [r8 + r3 * 4 - 48]
; Instantiations: emit the 64x32, 64x48 and 64x64 variants.
7495 FILTER_VER_LUMA_AVX2_64xN 64, 32
7496 FILTER_VER_LUMA_AVX2_64xN 64, 48
7497 FILTER_VER_LUMA_AVX2_64xN 64, 64
;-----------------------------------------------------------------------------
; interp_8tap_vert_pp_64x16 (assembled only when ARCH_X86_64 == 1)
; Declares 10 GPRs and 15 vector registers; the visible body is one expansion
; of PROCESS_LUMA_AVX2_W16_16R (16-byte-wide column, 16 output rows).
; NOTE(review): the iteration over the four 16-byte columns is not visible in
; this listing -- confirm.
;-----------------------------------------------------------------------------
7500 %if ARCH_X86_64 == 1
7501 cglobal interp_8tap_vert_pp_64x16, 4, 10, 15
; Two forms of the coefficient-pointer setup (selecting condition not shown).
7506 lea r5, [tab_LumaCoeffVer_32]
7509 lea r5, [tab_LumaCoeffVer_32 + r4]
7518 PROCESS_LUMA_AVX2_W16_16R
7526 ;-------------------------------------------------------------------------------------------------------------
7527 ; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
7528 ;-------------------------------------------------------------------------------------------------------------
; FILTER_VER_LUMA width, height, suffix
;   %1  block width  -- also subtracted from the row-advance of r0 and r2
;   %2  block height -- %2/4 is written to the stack slot at [rsp]
;   %3  name suffix (instantiated below with pp and ps)
; The emitted function loads 5 arguments, declares 7 GPRs and 8 vector
; registers, and reserves gprsize bytes of stack (the "0-gprsize" operand).
; NOTE(review): the filter arithmetic and loop labels are not visible in this
; listing; [rsp] presumably counts groups of four rows -- confirm.
7529 %macro FILTER_VER_LUMA 3
7531 cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-gprsize
; Two forms of the coefficient-table setup (selecting condition not shown).
7540 lea r5, [tab_LumaCoeffVer]
7543 lea r6, [tab_LumaCoeffVer + r4]
7551 mov dword [rsp], %2/4
; movhps stores the high 8 bytes of a result register to the following row.
7567 movhps [r2 + r3], m7
7568 lea r5, [r2 + 2 * r3]
7570 movhps [r5 + r3], m5
7579 lea r5, [r2 + 2 * r3]
7584 lea r5, [8 * r1 - 8]
; Row advance: down four rows and back by the block width. The second r2 form
; backs up by 2 * %1 -- NOTE(review): presumably for 16-bit (ps) output; the
; condition choosing between the two r2 forms is not visible here.
7594 lea r0, [r0 + 4 * r1 - %1]
7596 lea r2, [r2 + 4 * r3 - %1]
7598 lea r2, [r2 + 4 * r3 - 2 * %1]
; Instantiations with the pp suffix.
7607 FILTER_VER_LUMA 16, 4, pp
7608 FILTER_VER_LUMA 16, 8, pp
7609 FILTER_VER_LUMA 16, 12, pp
7610 FILTER_VER_LUMA 16, 16, pp
7611 FILTER_VER_LUMA 16, 32, pp
7612 FILTER_VER_LUMA 16, 64, pp
7613 FILTER_VER_LUMA 24, 32, pp
7614 FILTER_VER_LUMA 32, 8, pp
7615 FILTER_VER_LUMA 32, 16, pp
7616 FILTER_VER_LUMA 32, 24, pp
7617 FILTER_VER_LUMA 32, 32, pp
7618 FILTER_VER_LUMA 32, 64, pp
7619 FILTER_VER_LUMA 48, 64, pp
7620 FILTER_VER_LUMA 64, 16, pp
7621 FILTER_VER_LUMA 64, 32, pp
7622 FILTER_VER_LUMA 64, 48, pp
7623 FILTER_VER_LUMA 64, 64, pp
; Instantiations with the ps suffix (same size list as above).
7625 FILTER_VER_LUMA 16, 4, ps
7626 FILTER_VER_LUMA 16, 8, ps
7627 FILTER_VER_LUMA 16, 12, ps
7628 FILTER_VER_LUMA 16, 16, ps
7629 FILTER_VER_LUMA 16, 32, ps
7630 FILTER_VER_LUMA 16, 64, ps
7631 FILTER_VER_LUMA 24, 32, ps
7632 FILTER_VER_LUMA 32, 8, ps
7633 FILTER_VER_LUMA 32, 16, ps
7634 FILTER_VER_LUMA 32, 24, ps
7635 FILTER_VER_LUMA 32, 32, ps
7636 FILTER_VER_LUMA 32, 64, ps
7637 FILTER_VER_LUMA 48, 64, ps
7638 FILTER_VER_LUMA 64, 16, ps
7639 FILTER_VER_LUMA 64, 32, ps
7640 FILTER_VER_LUMA 64, 48, ps
7641 FILTER_VER_LUMA 64, 64, ps
;-----------------------------------------------------------------------------
; PROCESS_LUMA_SP_W4_4R (no parameters)
; Builds four rows of 32-bit filter sums (Row1..Row4 in m0..m3).
; punpcklwd interleaves the 16-bit samples of two consecutive source rows;
; pmaddwd multiplies each interleaved pair by a coefficient pair from
; [r6 + k * 16] (k = 0..3) and adds the two products; paddd accumulates the
; four partial sums per output row. The bracketed lists in the trailing
; comments name the source rows already accumulated into each register.
;
; Visible register roles:
;   r0  source pointer, advanced by 2 * r1 four times
;   r1  source pitch
;   r6  coefficient pointer, 16 bytes per tap pair
;   m4, m5, m6  scratch
; NOTE(review): most of the row loads are not visible in this listing (only
; the final movq is) -- confirm the load sequence before editing.
;-----------------------------------------------------------------------------
7643 %macro PROCESS_LUMA_SP_W4_4R 0
7646 punpcklwd m0, m1 ;m0=[0 1]
7647 pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
7649 lea r0, [r0 + 2 * r1]
7651 punpcklwd m1, m4 ;m1=[1 2]
7652 pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
7655 punpcklwd m4, m5 ;m4=[2 3]
7656 pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
7657 pmaddwd m4, [r6 + 1 * 16]
7658 paddd m0, m4 ;m0=[0+1+2+3] Row1
7660 lea r0, [r0 + 2 * r1]
7662 punpcklwd m5, m4 ;m5=[3 4]
7663 pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
7664 pmaddwd m5, [r6 + 1 * 16]
7665 paddd m1, m5 ;m1 = [1+2+3+4] Row2
7668 punpcklwd m4, m5 ;m4=[4 5]
7669 pmaddwd m6, m4, [r6 + 1 * 16]
7670 paddd m2, m6 ;m2=[2+3+4+5] Row3
7671 pmaddwd m4, [r6 + 2 * 16]
7672 paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1
7674 lea r0, [r0 + 2 * r1]
7676 punpcklwd m5, m4 ;m5=[5 6]
7677 pmaddwd m6, m5, [r6 + 1 * 16]
7678 paddd m3, m6 ;m3=[3+4+5+6] Row4
7679 pmaddwd m5, [r6 + 2 * 16]
7680 paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2
7683 punpcklwd m4, m5 ;m4=[6 7]
7684 pmaddwd m6, m4, [r6 + 2 * 16]
7685 paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3
7686 pmaddwd m4, [r6 + 3 * 16]
7687 paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end
7689 lea r0, [r0 + 2 * r1]
7691 punpcklwd m5, m4 ;m5=[7 8]
7692 pmaddwd m6, m5, [r6 + 2 * 16]
7693 paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4
7694 pmaddwd m5, [r6 + 3 * 16]
7695 paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end
7698 punpcklwd m4, m5 ;m4=[8 9]
7699 pmaddwd m4, [r6 + 3 * 16]
7700 paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end
; Last source row of the group: 8 bytes (four 16-bit samples) at r0 + 2 * r1.
7702 movq m4, [r0 + 2 * r1]
7703 punpcklwd m5, m4 ;m5=[9 10]
7704 pmaddwd m5, [r6 + 3 * 16]
7705 paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end
7708 ;--------------------------------------------------------------------------------------------------------------
7709 ; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
7710 ;--------------------------------------------------------------------------------------------------------------
; FILTER_VER_LUMA_SP width, height
;   %1  block width  -- the row advance subtracts 2 * %1 bytes from the source
;       and %1 bytes from the destination
;   %2  block height -- %2/4 is written to the stack slot at [rsp]
; The emitted function loads 5 arguments, declares 7 GPRs and 8 vector
; registers, and reserves gprsize bytes of stack. Each inner step expands
; PROCESS_LUMA_SP_W4_4R (four output rows of a 4-sample-wide column).
; NOTE(review): the rounding/packing between the macro and the pextrd stores
; and the loop labels are not visible in this listing -- confirm; [rsp]
; presumably counts groups of four rows.
7711 %macro FILTER_VER_LUMA_SP 2
7713 cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize
; r5 = 3 * r1
7716 lea r5, [r1 + 2 * r1]
; Two forms of the coefficient-table setup (selecting condition not shown).
7721 lea r5, [tab_LumaCoeffV]
7724 lea r6, [tab_LumaCoeffV + r4]
; m7 holds the constant at tab_c_526336 -- presumably the rounding offset;
; its use is not visible here.
7727 mova m7, [tab_c_526336]
7729 mov dword [rsp], %2/4
7733 PROCESS_LUMA_SP_W4_4R
; pextrd writes dword 1 and dword 3 of m0 (4 bytes each) to two output rows.
7751 pextrd [r2 + r3], m0, 1
7752 lea r5, [r2 + 2 * r3]
7754 pextrd [r5 + r3], m0, 3
7756 lea r5, [8 * r1 - 2 * 4]
; Row advance: down four rows and back to the left edge of the block.
7763 lea r0, [r0 + 4 * r1 - 2 * %1]
7764 lea r2, [r2 + 4 * r3 - %1]
7772 ;--------------------------------------------------------------------------------------------------------------
7773 ; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
7774 ;--------------------------------------------------------------------------------------------------------------
7775 FILTER_VER_LUMA_SP 4, 4
7776 FILTER_VER_LUMA_SP 8, 8
7777 FILTER_VER_LUMA_SP 8, 4
7778 FILTER_VER_LUMA_SP 4, 8
7779 FILTER_VER_LUMA_SP 16, 16
7780 FILTER_VER_LUMA_SP 16, 8
7781 FILTER_VER_LUMA_SP 8, 16
7782 FILTER_VER_LUMA_SP 16, 12
7783 FILTER_VER_LUMA_SP 12, 16
7784 FILTER_VER_LUMA_SP 16, 4
7785 FILTER_VER_LUMA_SP 4, 16
7786 FILTER_VER_LUMA_SP 32, 32
7787 FILTER_VER_LUMA_SP 32, 16
7788 FILTER_VER_LUMA_SP 16, 32
7789 FILTER_VER_LUMA_SP 32, 24
7790 FILTER_VER_LUMA_SP 24, 32
7791 FILTER_VER_LUMA_SP 32, 8
7792 FILTER_VER_LUMA_SP 8, 32
7793 FILTER_VER_LUMA_SP 64, 64
7794 FILTER_VER_LUMA_SP 64, 32
7795 FILTER_VER_LUMA_SP 32, 64
7796 FILTER_VER_LUMA_SP 64, 48
7797 FILTER_VER_LUMA_SP 48, 64
7798 FILTER_VER_LUMA_SP 64, 16
7799 FILTER_VER_LUMA_SP 16, 64
7801 ; TODO: processing U and V together would be faster, but needs more registers
7802 ; TODO: using two paths (height aligned to 4, and otherwise) may improve performance by about 10%, but the code is more complex, so it is disabled
7804 cglobal chroma_p2s, 3, 7, 4
7806 ; load width and height
7812 mova m3, [tab_c_64_n64] ; m3 = constant vector tab_c_64_n64
7830 lea r6, [r2 + r5 * 2] ; r6 = r2 + 2 * r5
; Three store variants follow (movu, movh, movd). Each writes m0 and m1 to the same
; two addresses relative to r6, which are FENC_STRIDE / 2 * 2 bytes apart.
7832 movu [r6 + FENC_STRIDE / 2 * 0 - 16], m0
7833 movu [r6 + FENC_STRIDE / 2 * 2 - 16], m1
7841 movh [r6 + FENC_STRIDE / 2 * 0 - 16], m0
7842 movh [r6 + FENC_STRIDE / 2 * 2 - 16], m1
7849 movd [r6 + FENC_STRIDE / 2 * 0 - 16], m0
7850 movd [r6 + FENC_STRIDE / 2 * 2 - 16], m1
7853 lea r0, [r0 + r1 * 2] ; r0 += 2 * r1
7854 add r2, FENC_STRIDE / 2 * 4 ; r2 += FENC_STRIDE / 2 * 4 (twice the spacing of the two stores above)
; Accumulates four 4-tap row sums in m0..m3 (Row1..Row4 in the comments below).
; Interleaved word pairs are multiplied with the coefficients at [r6 + 0 * 16] and
; [r6 + 1 * 16]. r0 is advanced by 2 * r1 twice, so r0 is modified on exit; m4 and m5
; are used as scratch.
7861 %macro PROCESS_CHROMA_SP_W4_4R 0
7864 punpcklwd m0, m1 ;m0=[0 1]
7865 pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
7867 lea r0, [r0 + 2 * r1] ; r0 += 2 * r1
7869 punpcklwd m1, m4 ;m1=[1 2]
7870 pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
7873 punpcklwd m4, m5 ;m4=[2 3]
7874 pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
7875 pmaddwd m4, [r6 + 1 * 16] ;m4=[2+3] weighted with the second coefficient pair
7876 paddd m0, m4 ;m0=[0+1+2+3] Row1 done
7878 lea r0, [r0 + 2 * r1] ; r0 += 2 * r1
7880 punpcklwd m5, m4 ;m5=[3 4]
7881 pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
7882 pmaddwd m5, [r6 + 1 * 16] ;m5=[3+4] weighted with the second coefficient pair
7883 paddd m1, m5 ;m1=[1+2+3+4] Row2 done
7886 punpcklwd m4, m5 ;m4=[4 5]
7887 pmaddwd m4, [r6 + 1 * 16]
7888 paddd m2, m4 ;m2=[2+3+4+5] Row3 done
7890 movq m4, [r0 + 2 * r1] ; load 8 bytes from r0 + 2 * r1
7891 punpcklwd m5, m4 ;m5=[5 6]
7892 pmaddwd m5, [r6 + 1 * 16]
7893 paddd m3, m5 ;m3=[3+4+5+6] Row4 done
7896 ;--------------------------------------------------------------------------------------------------------------
7897 ; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; %1 and %2 are the two block dimensions pasted into the function name.
; %2 / 4 is written to the dword at [rsp]; %1 is used in the closing src/dst pointer adjustments.
7898 ;--------------------------------------------------------------------------------------------------------------
7899 %macro FILTER_VER_CHROMA_SP 2
7901 cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7 ,0-gprsize
7908 lea r5, [tab_ChromaCoeffV] ; r5 = address of tab_ChromaCoeffV
7911 lea r6, [tab_ChromaCoeffV + r4] ; r6 = tab_ChromaCoeffV + r4 (base used by PROCESS_CHROMA_SP_W4_4R)
7914 mova m6, [tab_c_526336] ; m6 = constant vector tab_c_526336
7916 mov dword [rsp], %2/4 ; stack slot = %2 / 4
7921 PROCESS_CHROMA_SP_W4_4R
7939 pextrd [r2 + r3], m0, 1 ; store dword 1 of m0 at r2 + r3
7940 lea r5, [r2 + 2 * r3] ; r5 = r2 + 2 * r3
7942 pextrd [r5 + r3], m0, 3 ; store dword 3 of m0 at r5 + r3
7944 lea r5, [4 * r1 - 2 * 4] ; r5 = 4 * r1 - 8
7951 lea r0, [r0 + 4 * r1 - 2 * %1] ; r0 += 4 * r1 - 2 * %1
7952 lea r2, [r2 + 4 * r3 - %1] ; r2 += 4 * r3 - %1
; Each line below expands the macro once, defining one function per (%1, %2) pair.
7960 FILTER_VER_CHROMA_SP 4, 4
7961 FILTER_VER_CHROMA_SP 4, 8
7962 FILTER_VER_CHROMA_SP 16, 16
7963 FILTER_VER_CHROMA_SP 16, 8
7964 FILTER_VER_CHROMA_SP 16, 12
7965 FILTER_VER_CHROMA_SP 12, 16
7966 FILTER_VER_CHROMA_SP 16, 4
7967 FILTER_VER_CHROMA_SP 4, 16
7968 FILTER_VER_CHROMA_SP 32, 32
7969 FILTER_VER_CHROMA_SP 32, 16
7970 FILTER_VER_CHROMA_SP 16, 32
7971 FILTER_VER_CHROMA_SP 32, 24
7972 FILTER_VER_CHROMA_SP 24, 32
7973 FILTER_VER_CHROMA_SP 32, 8
7975 FILTER_VER_CHROMA_SP 16, 24
7976 FILTER_VER_CHROMA_SP 16, 64
7977 FILTER_VER_CHROMA_SP 12, 32
7978 FILTER_VER_CHROMA_SP 4, 32
7979 FILTER_VER_CHROMA_SP 32, 64
7980 FILTER_VER_CHROMA_SP 32, 48
7981 FILTER_VER_CHROMA_SP 24, 64
7983 FILTER_VER_CHROMA_SP 64, 64
7984 FILTER_VER_CHROMA_SP 64, 32
7985 FILTER_VER_CHROMA_SP 64, 48
7986 FILTER_VER_CHROMA_SP 48, 64
7987 FILTER_VER_CHROMA_SP 64, 16
; Accumulates four 4-tap row sums, two rows per register: m0 holds Row 1-2 and
; m2 holds Row 3-4 (see the comments below).
; %1 = register holding the coefficient base; [%1 + 0 * 16] and [%1 + 1 * 16] are read.
; r0 is advanced by 2 * r1 twice, so r0 is modified on exit; m1, m3 and m4 are used as scratch.
7990 %macro PROCESS_CHROMA_SP_W2_4R 1
7993 punpcklwd m0, m1 ;m0=[0 1]
7995 lea r0, [r0 + 2 * r1] ; r0 += 2 * r1
7997 punpcklwd m1, m2 ;m1=[1 2]
7998 punpcklqdq m0, m1 ;m0=[0 1 1 2]
7999 pmaddwd m0, [%1 + 0 *16] ;m0=[0+1 1+2] Row 1-2
8002 punpcklwd m2, m1 ;m2=[2 3]
8004 lea r0, [r0 + 2 * r1] ; r0 += 2 * r1
8006 punpcklwd m1, m3 ;m1=[3 4]
8007 punpcklqdq m2, m1 ;m2=[2 3 3 4]
8009 pmaddwd m4, m2, [%1 + 1 * 16] ;m4=[2+3 3+4] Row 1-2
8010 pmaddwd m2, [%1 + 0 * 16] ;m2=[2+3 3+4] Row 3-4
8011 paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2
8014 punpcklwd m3, m1 ;m3=[4 5]
8016 movd m4, [r0 + 2 * r1] ; load 4 bytes from r0 + 2 * r1
8017 punpcklwd m1, m4 ;m1=[5 6]
8018 punpcklqdq m3, m1 ;m3=[4 5 5 6]
8019 pmaddwd m3, [%1 + 1 * 16] ;m3=[4+5 5+6] Row 3-4
8020 paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4
8023 ;-------------------------------------------------------------------------------------------------------------------
8024 ; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
8025 ;-------------------------------------------------------------------------------------------------------------------
8026 %macro FILTER_VER_CHROMA_SP_W2_4R 2
8028 cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 6
8035 lea r5, [tab_ChromaCoeffV] ; r5 = address of tab_ChromaCoeffV
8038 lea r5, [tab_ChromaCoeffV + r4] ; r5 = tab_ChromaCoeffV + r4
8041 mova m5, [tab_c_526336] ; m5 = constant vector tab_c_526336
8046 PROCESS_CHROMA_SP_W2_4R r5 ; r5 is passed as the coefficient base
8058 pextrw [r2 + r3], m0, 1 ; store word 1 of m0 at r2 + r3
8059 lea r2, [r2 + 2 * r3] ; r2 += 2 * r3
8061 pextrw [r2 + r3], m0, 3 ; store word 3 of m0 at r2 + r3
8063 lea r2, [r2 + 2 * r3] ; r2 += 2 * r3
; Each line below expands the macro once, defining one function per (%1, %2) pair.
8071 FILTER_VER_CHROMA_SP_W2_4R 2, 4
8072 FILTER_VER_CHROMA_SP_W2_4R 2, 8
8074 FILTER_VER_CHROMA_SP_W2_4R 2, 16
8076 ;--------------------------------------------------------------------------------------------------------------
8077 ; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
8078 ;--------------------------------------------------------------------------------------------------------------
8080 cglobal interp_4tap_vert_sp_4x2, 5, 6, 5
8087 lea r5, [tab_ChromaCoeffV] ; r5 = address of tab_ChromaCoeffV
8090 lea r5, [tab_ChromaCoeffV + r4] ; r5 = tab_ChromaCoeffV + r4
8093 mova m4, [tab_c_526336] ; m4 = constant vector tab_c_526336
8097 punpcklwd m0, m1 ;m0=[0 1]
8098 pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1
8100 lea r0, [r0 + 2 * r1] ; r0 += 2 * r1
8102 punpcklwd m1, m2 ;m1=[1 2]
8103 pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2
8106 punpcklwd m2, m3 ;m2=[2 3]
8107 pmaddwd m2, [r5 + 1 * 16]
8108 paddd m0, m2 ;m0=[0+1+2+3] Row1 done
8112 movq m2, [r0 + 2 * r1] ; load 8 bytes from r0 + 2 * r1
8113 punpcklwd m3, m2 ;m3=[3 4]
8114 pmaddwd m3, [r5 + 1 * 16]
8115 paddd m1, m3 ;m1 = [1+2+3+4] Row2 done
8123 pextrd [r2 + r3], m0, 1 ; store dword 1 of m0 at r2 + r3
8127 ;-------------------------------------------------------------------------------------------------------------------
8128 ; void interp_4tap_vert_sp_6x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; Only %2 is pasted into the function name; the leading 6 is fixed.
; The body runs PROCESS_CHROMA_SP_W4_4R and then PROCESS_CHROMA_SP_W2_4R, both with r6 as coefficient base.
8129 ;-------------------------------------------------------------------------------------------------------------------
8130 %macro FILTER_VER_CHROMA_SP_W6_H4 2
8132 cglobal interp_4tap_vert_sp_6x%2, 5, 7, 7
8139 lea r5, [tab_ChromaCoeffV] ; r5 = address of tab_ChromaCoeffV
8142 lea r6, [tab_ChromaCoeffV + r4] ; r6 = tab_ChromaCoeffV + r4
8145 mova m6, [tab_c_526336] ; m6 = constant vector tab_c_526336
8150 PROCESS_CHROMA_SP_W4_4R
8168 pextrd [r2 + r3], m0, 1 ; store dword 1 of m0 at r2 + r3
8169 lea r5, [r2 + 2 * r3] ; r5 = r2 + 2 * r3
8171 pextrd [r5 + r3], m0, 3 ; store dword 3 of m0 at r5 + r3
8173 lea r5, [4 * r1 - 2 * 4] ; r5 = 4 * r1 - 8
8177 PROCESS_CHROMA_SP_W2_4R r6
8189 pextrw [r2 + r3], m0, 1 ; store word 1 of m0 at r2 + r3
8190 lea r2, [r2 + 2 * r3] ; r2 += 2 * r3
8192 pextrw [r2 + r3], m0, 3 ; store word 3 of m0 at r2 + r3
8195 lea r2, [r2 + 2 * r3 - 4] ; r2 += 2 * r3 - 4
; Each line below expands the macro once.
8203 FILTER_VER_CHROMA_SP_W6_H4 6, 8
8205 FILTER_VER_CHROMA_SP_W6_H4 6, 16
; Accumulates 4-tap sums for two rows, split into low and high halves:
; m0/m1 = Row1 low/high, m2/m3 = Row2 low/high (see the comments below).
; Coefficients are read from [r5 + 0 * 16] and [r5 + 1 * 16]. r0 is advanced by
; 2 * r1 once, so r0 is modified on exit; m4, m5 and m6 are used as scratch.
8207 %macro PROCESS_CHROMA_SP_W8_2R 0
8210 punpcklwd m0, m1, m3
8211 pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l
8213 pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h
8215 movu m4, [r0 + 2 * r1] ; unaligned load from r0 + 2 * r1
8216 punpcklwd m2, m3, m4
8217 pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l
8219 pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h
8221 lea r0, [r0 + 2 * r1] ; r0 += 2 * r1
8223 punpcklwd m6, m4, m5
8224 pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l
8225 paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum
8227 pmaddwd m4, [r5 + 1 * 16] ;m4 = [2h+3h] Row1h
8228 paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum
8230 movu m4, [r0 + 2 * r1] ; unaligned load from r0 + 2 * r1
8231 punpcklwd m6, m5, m4
8232 pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l
8233 paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum
8235 pmaddwd m5, [r5 + 1 * 16] ;m5 = [3h+4h] Row2h
8236 paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum
8239 ;--------------------------------------------------------------------------------------------------------------
8240 ; void interp_4tap_vert_sp_8x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; The symbol is built as %1x%2; every instantiation below passes %1 = 8.
8241 ;--------------------------------------------------------------------------------------------------------------
8242 %macro FILTER_VER_CHROMA_SP_W8_H2 2
8244 cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 8
8251 lea r5, [tab_ChromaCoeffV] ; r5 = address of tab_ChromaCoeffV
8254 lea r5, [tab_ChromaCoeffV + r4] ; r5 = tab_ChromaCoeffV + r4 (base used by PROCESS_CHROMA_SP_W8_2R)
8257 mova m7, [tab_c_526336] ; m7 = constant vector tab_c_526336
8261 PROCESS_CHROMA_SP_W8_2R
8279 movhps [r2 + r3], m0 ; store the high 8 bytes of m0 at r2 + r3
8281 lea r2, [r2 + 2 * r3] ; r2 += 2 * r3
; Each line below expands the macro once, defining one function per (%1, %2) pair.
8289 FILTER_VER_CHROMA_SP_W8_H2 8, 2
8290 FILTER_VER_CHROMA_SP_W8_H2 8, 4
8291 FILTER_VER_CHROMA_SP_W8_H2 8, 6
8292 FILTER_VER_CHROMA_SP_W8_H2 8, 8
8293 FILTER_VER_CHROMA_SP_W8_H2 8, 16
8294 FILTER_VER_CHROMA_SP_W8_H2 8, 32
8296 FILTER_VER_CHROMA_SP_W8_H2 8, 12
8297 FILTER_VER_CHROMA_SP_W8_H2 8, 64
8300 ;-----------------------------------------------------------------------------------------------------------------------------
8301 ; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
; The symbol is built as %1x%2; every instantiation below passes %1 = 2.
8302 ;-----------------------------------------------------------------------------------------------------------------------------
8303 %macro FILTER_HORIZ_CHROMA_2xN 2
8305 cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride
8313 add dststrided, dststrided ; dststride *= 2 (presumably int16_t elements -> bytes; confirm)
8316 lea r6, [tab_ChromaCoeff] ; r6 = address of tab_ChromaCoeff
8317 movd coef2, [r6 + r4 * 4] ; coef2 = 4-byte table entry at index r4
8319 movd coef2, [tab_ChromaCoeff + r4 * 4] ; same load with the table addressed directly
8322 pshufd coef2, coef2, 0 ; replicate dword 0 of coef2 into all four dwords
8329 sub srcq, srcstrideq ; src -= srcstride
8340 lea srcq, [srcq + srcstrideq] ; src += srcstride
8341 lea dstq, [dstq + dststrideq] ; dst += dststride
; Each line below expands the macro once, defining one function per (%1, %2) pair.
8349 FILTER_HORIZ_CHROMA_2xN 2, 4
8350 FILTER_HORIZ_CHROMA_2xN 2, 8
8352 FILTER_HORIZ_CHROMA_2xN 2, 16
8354 ;-----------------------------------------------------------------------------------------------------------------------------
8355 ; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
; The symbol is built as %1x%2; every instantiation below passes %1 = 4.
8356 ;-----------------------------------------------------------------------------------------------------------------------------
8357 %macro FILTER_HORIZ_CHROMA_4xN 2
8359 cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride
8367 add dststrided, dststrided ; dststride *= 2 (presumably int16_t elements -> bytes; confirm)
8370 lea r6, [tab_ChromaCoeff] ; r6 = address of tab_ChromaCoeff
8371 movd coef2, [r6 + r4 * 4] ; coef2 = 4-byte table entry at index r4
8373 movd coef2, [tab_ChromaCoeff + r4 * 4] ; same load with the table addressed directly
8376 pshufd coef2, coef2, 0 ; replicate dword 0 of coef2 into all four dwords
8383 sub srcq, srcstrideq ; src -= srcstride
8394 lea srcq, [srcq + srcstrideq] ; src += srcstride
8395 lea dstq, [dstq + dststrideq] ; dst += dststride
; Each line below expands the macro once, defining one function per (%1, %2) pair.
8402 FILTER_HORIZ_CHROMA_4xN 4, 2
8403 FILTER_HORIZ_CHROMA_4xN 4, 4
8404 FILTER_HORIZ_CHROMA_4xN 4, 8
8405 FILTER_HORIZ_CHROMA_4xN 4, 16
8407 FILTER_HORIZ_CHROMA_4xN 4, 32
; Per-row helpers taking three register arguments; FILTER_HORIZ_CHROMA selects one of
; them through "PROCESS_CHROMA_W%1".
8409 %macro PROCESS_CHROMA_W6 3
8422 %macro PROCESS_CHROMA_W12 3
8436 movh [dstq + 16], %1 ; movh store of %1 at dst + 16
8439 ;-----------------------------------------------------------------------------------------------------------------------------
8440 ; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
; %1 also selects the per-row helper: PROCESS_CHROMA_W%1 resolves to PROCESS_CHROMA_W6 or
; PROCESS_CHROMA_W12 for the instantiations below.
8441 ;-----------------------------------------------------------------------------------------------------------------------------
8442 %macro FILTER_HORIZ_CHROMA 2
8444 cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride
8454 add dststrided, dststrided ; dststride *= 2 (presumably int16_t elements -> bytes; confirm)
8457 lea r6, [tab_ChromaCoeff] ; r6 = address of tab_ChromaCoeff
8458 movd coef2, [r6 + r4 * 4] ; coef2 = 4-byte table entry at index r4
8460 movd coef2, [tab_ChromaCoeff + r4 * 4] ; same load with the table addressed directly
8463 pshufd coef2, coef2, 0 ; replicate dword 0 of coef2 into all four dwords
8466 mova Tm1, [tab_Tm + 16] ; Tm1 = second 16-byte row of tab_Tm
8471 sub srcq, srcstrideq ; src -= srcstride
8475 PROCESS_CHROMA_W%1 t0, t1, t2
8476 add srcq, srcstrideq ; src += srcstride
8477 add dstq, dststrideq ; dst += dststride
; Each line below expands the macro once, defining one function per (%1, %2) pair.
8485 FILTER_HORIZ_CHROMA 6, 8
8486 FILTER_HORIZ_CHROMA 12, 16
8488 FILTER_HORIZ_CHROMA 6, 16
8489 FILTER_HORIZ_CHROMA 12, 32
; Per-row helper taking three register arguments; invoked by FILTER_HORIZ_CHROMA_8xN below.
8491 %macro PROCESS_CHROMA_W8 3
8502 ;-----------------------------------------------------------------------------------------------------------------------------
8503 ; void interp_4tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
; The symbol is built as %1x%2; every instantiation below passes %1 = 8.
8504 ;-----------------------------------------------------------------------------------------------------------------------------
8505 %macro FILTER_HORIZ_CHROMA_8xN 2
8507 cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride
8517 add dststrided, dststrided ; dststride *= 2 (presumably int16_t elements -> bytes; confirm)
8520 lea r6, [tab_ChromaCoeff] ; r6 = address of tab_ChromaCoeff
8521 movd coef2, [r6 + r4 * 4] ; coef2 = 4-byte table entry at index r4
8523 movd coef2, [tab_ChromaCoeff + r4 * 4] ; same load with the table addressed directly
8526 pshufd coef2, coef2, 0 ; replicate dword 0 of coef2 into all four dwords
8529 mova Tm1, [tab_Tm + 16] ; Tm1 = second 16-byte row of tab_Tm
8534 sub srcq, srcstrideq ; src -= srcstride
8538 PROCESS_CHROMA_W8 t0, t1, t2
8539 add srcq, srcstrideq ; src += srcstride
8540 add dstq, dststrideq ; dst += dststride
; Each line below expands the macro once, defining one function per (%1, %2) pair.
8548 FILTER_HORIZ_CHROMA_8xN 8, 2
8549 FILTER_HORIZ_CHROMA_8xN 8, 4
8550 FILTER_HORIZ_CHROMA_8xN 8, 6
8551 FILTER_HORIZ_CHROMA_8xN 8, 8
8552 FILTER_HORIZ_CHROMA_8xN 8, 16
8553 FILTER_HORIZ_CHROMA_8xN 8, 32
8555 FILTER_HORIZ_CHROMA_8xN 8, 12
8556 FILTER_HORIZ_CHROMA_8xN 8, 64
; Per-row helper taking four register arguments; selected by FILTER_HORIZ_CHROMA_WxN via PROCESS_CHROMA_W%1.
8558 %macro PROCESS_CHROMA_W16 4
8574 movu [dstq + 16], %4 ; unaligned store of %4 at dst + 16
; Per-row helper taking four register arguments; selected by FILTER_HORIZ_CHROMA_WxN via PROCESS_CHROMA_W%1.
8577 %macro PROCESS_CHROMA_W24 4
8593 movu [dstq + 16], %4 ; unaligned store of %4 at dst + 16
8594 movu %1, [srcq + 16] ; unaligned load of %1 from src + 16
8601 movu [dstq + 32], %2 ; unaligned store of %2 at dst + 32
; Per-row helper taking four register arguments; selected by FILTER_HORIZ_CHROMA_WxN via PROCESS_CHROMA_W%1.
8604 %macro PROCESS_CHROMA_W32 4
8620 movu [dstq + 16], %4 ; unaligned store of %4 at dst + 16
8621 movu %1, [srcq + 16] ; unaligned load of %1 from src + 16
8627 movu %1, [srcq + 24] ; unaligned load of %1 from src + 24
8635 movu [dstq + 32], %2 ; unaligned store of %2 at dst + 32
8636 movu [dstq + 48], %4 ; unaligned store of %4 at dst + 48
; Offset variant used by PROCESS_CHROMA_W48 / PROCESS_CHROMA_W64.
; %1..%4 = registers, %5 = byte offset applied to src; the dst offset is %5 * 2.
8639 %macro PROCESS_CHROMA_W16o 5
8640 movu %1, [srcq + %5] ; unaligned load of %1 from src + %5
8646 movu %1, [srcq + %5 + 8] ; unaligned load of %1 from src + %5 + 8
8654 movu [dstq + %5 * 2], %2 ; unaligned store of %2 at dst + 2 * %5
8655 movu [dstq + %5 * 2 + 16], %4 ; unaligned store of %4 at dst + 2 * %5 + 16
; Expands PROCESS_CHROMA_W16o three times, at src offsets 0, 16 and 32.
8658 %macro PROCESS_CHROMA_W48 4
8659 PROCESS_CHROMA_W16o %1, %2, %3, %4, 0
8660 PROCESS_CHROMA_W16o %1, %2, %3, %4, 16
8661 PROCESS_CHROMA_W16o %1, %2, %3, %4, 32
; Expands PROCESS_CHROMA_W16o four times, at src offsets 0, 16, 32 and 48.
8664 %macro PROCESS_CHROMA_W64 4
8665 PROCESS_CHROMA_W16o %1, %2, %3, %4, 0
8666 PROCESS_CHROMA_W16o %1, %2, %3, %4, 16
8667 PROCESS_CHROMA_W16o %1, %2, %3, %4, 32
8668 PROCESS_CHROMA_W16o %1, %2, %3, %4, 48
8671 ;------------------------------------------------------------------------------------------------------------------------------
8672 ; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
; %1 also selects the per-row helper: PROCESS_CHROMA_W%1 resolves to the W16, W24, W32, W48
; or W64 macro for the instantiations below.
8673 ;------------------------------------------------------------------------------------------------------------------------------
8674 %macro FILTER_HORIZ_CHROMA_WxN 2
8676 cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 7, src, srcstride, dst, dststride
8687 add dststrided, dststrided ; dststride *= 2 (presumably int16_t elements -> bytes; confirm)
8690 lea r6, [tab_ChromaCoeff] ; r6 = address of tab_ChromaCoeff
8691 movd coef2, [r6 + r4 * 4] ; coef2 = 4-byte table entry at index r4
8693 movd coef2, [tab_ChromaCoeff + r4 * 4] ; same load with the table addressed directly
8696 pshufd coef2, coef2, 0 ; replicate dword 0 of coef2 into all four dwords
8699 mova Tm1, [tab_Tm + 16] ; Tm1 = second 16-byte row of tab_Tm
8704 sub srcq, srcstrideq ; src -= srcstride
8708 PROCESS_CHROMA_W%1 t0, t1, t2, t3
8709 add srcq, srcstrideq ; src += srcstride
8710 add dstq, dststrideq ; dst += dststride
; Each line below expands the macro once, defining one function per (%1, %2) pair.
8718 FILTER_HORIZ_CHROMA_WxN 16, 4
8719 FILTER_HORIZ_CHROMA_WxN 16, 8
8720 FILTER_HORIZ_CHROMA_WxN 16, 12
8721 FILTER_HORIZ_CHROMA_WxN 16, 16
8722 FILTER_HORIZ_CHROMA_WxN 16, 32
8723 FILTER_HORIZ_CHROMA_WxN 24, 32
8724 FILTER_HORIZ_CHROMA_WxN 32, 8
8725 FILTER_HORIZ_CHROMA_WxN 32, 16
8726 FILTER_HORIZ_CHROMA_WxN 32, 24
8727 FILTER_HORIZ_CHROMA_WxN 32, 32
8729 FILTER_HORIZ_CHROMA_WxN 16, 24
8730 FILTER_HORIZ_CHROMA_WxN 16, 64
8731 FILTER_HORIZ_CHROMA_WxN 24, 64
8732 FILTER_HORIZ_CHROMA_WxN 32, 48
8733 FILTER_HORIZ_CHROMA_WxN 32, 64
8735 FILTER_HORIZ_CHROMA_WxN 64, 64
8736 FILTER_HORIZ_CHROMA_WxN 64, 32
8737 FILTER_HORIZ_CHROMA_WxN 64, 48
8738 FILTER_HORIZ_CHROMA_WxN 48, 64
8739 FILTER_HORIZ_CHROMA_WxN 64, 16
8742 ;---------------------------------------------------------------------------------------------------------------
8743 ; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
; %1 and %2 are the two block dimensions pasted into the function name; %1 is used in the
; closing pointer adjustments (src by %1, dst by %1 * 2).
8744 ;---------------------------------------------------------------------------------------------------------------
8745 %macro FILTER_V_PS_W16n 2
8747 cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8
8754 lea r5, [tab_ChromaCoeff] ; r5 = address of tab_ChromaCoeff
8755 movd m0, [r5 + r4 * 4] ; m0 = 4-byte table entry at index r4
8757 movd m0, [tab_ChromaCoeff + r4 * 4] ; same load with the table addressed directly
8760 pshufb m1, m0, [tab_Vm] ; m1 = bytes 0,1 of m0 repeated (first row of tab_Vm)
8761 pshufb m0, [tab_Vm + 16] ; m0 = bytes 2,3 of m0 repeated (second row of tab_Vm)
8773 punpcklbw m4, m2, m3 ; m4 = low bytes of m2 and m3 interleaved
8779 lea r5, [r0 + 2 * r1] ; r5 = r0 + 2 * r1
8783 punpcklbw m6, m5, m7 ; m6 = low bytes of m5 and m7 interleaved
8787 punpckhbw m6, m5, m7 ; m6 = high bytes of m5 and m7 interleaved
8799 punpcklbw m4, m3, m5 ; m4 = low bytes of m3 and m5 interleaved
8805 movu m5, [r5 + 2 * r1] ; unaligned load from r5 + 2 * r1
8807 punpcklbw m2, m7, m5 ; m2 = low bytes of m7 and m5 interleaved
8820 movu [r2 + r3 + 16], m3 ; unaligned store of m3 at r2 + r3 + 16
8827 lea r0, [r0 + r1 * 2 - %1] ; r0 += 2 * r1 - %1
8828 lea r2, [r2 + r3 * 2 - %1 * 2] ; r2 += 2 * r3 - 2 * %1
; Each line below expands the macro once, defining one function per (%1, %2) pair.
8835 FILTER_V_PS_W16n 64, 64
8836 FILTER_V_PS_W16n 64, 32
8837 FILTER_V_PS_W16n 64, 48
8838 FILTER_V_PS_W16n 48, 64
8839 FILTER_V_PS_W16n 64, 16
8842 ;------------------------------------------------------------------------------------------------------------
8843 ;void interp_4tap_vert_ps_2x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
8844 ;------------------------------------------------------------------------------------------------------------
8846 cglobal interp_4tap_vert_ps_2x4, 4, 6, 7
8853 lea r5, [tab_ChromaCoeff] ; r5 = address of tab_ChromaCoeff
8854 movd m0, [r5 + r4 * 4] ; m0 = 4-byte table entry at index r4
8856 movd m0, [tab_ChromaCoeff + r4 * 4] ; same load with the table addressed directly
8865 movd m4, [r0 + 2 * r1] ; load 4 bytes from r0 + 2 * r1
8869 punpcklbw m6, m4, m5 ; m6 = low bytes of m4 and m5 interleaved
8874 lea r0, [r0 + 4 * r1] ; r0 += 4 * r1
8878 punpcklbw m1, m5, m6 ; m1 = low bytes of m5 and m6 interleaved
8889 pextrd [r2 + r3], m2, 2 ; store dword 2 of m2 at r2 + r3
8894 punpcklbw m3, m6, m2 ; m3 = low bytes of m6 and m2 interleaved
8899 movd m3, [r0 + 2 * r1] ; load 4 bytes from r0 + 2 * r1
8909 lea r2, [r2 + 2 * r3] ; r2 += 2 * r3
8911 pextrd [r2 + r3], m4, 2 ; store dword 2 of m4 at r2 + r3
8915 ;-------------------------------------------------------------------------------------------------------------
8916 ; void interp_4tap_vert_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
; Only %2 is pasted into the function name; the leading 2 is fixed.
8917 ;-------------------------------------------------------------------------------------------------------------
8918 %macro FILTER_V_PS_W2 2
8920 cglobal interp_4tap_vert_ps_2x%2, 4, 6, 8
8927 lea r5, [tab_ChromaCoeff] ; r5 = address of tab_ChromaCoeff
8928 movd m0, [r5 + r4 * 4] ; m0 = 4-byte table entry at index r4
8930 movd m0, [tab_ChromaCoeff + r4 * 4] ; same load with the table addressed directly
8941 movd m4, [r0 + 2 * r1] ; load 4 bytes from r0 + 2 * r1
8945 punpcklbw m6, m4, m5 ; m6 = low bytes of m4 and m5 interleaved
8950 lea r0, [r0 + 4 * r1] ; r0 += 4 * r1
8954 punpcklbw m7, m5, m6 ; m7 = low bytes of m5 and m6 interleaved
8970 punpcklbw m3, m6, m2 ; m3 = low bytes of m6 and m2 interleaved
8975 movd m3, [r0 + 2 * r1] ; load 4 bytes from r0 + 2 * r1
8987 lea r2, [r2 + 2 * r3] ; r2 += 2 * r3
8992 lea r2, [r2 + 2 * r3] ; r2 += 2 * r3
; Macro expansion: defines interp_4tap_vert_ps_2x16.
9002 FILTER_V_PS_W2 2, 16
9004 ;-----------------------------------------------------------------------------------------------------------------
9005 ; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
; %1 and %2 are the two block dimensions pasted into the function name.
; %2 / 4 is written to the dword at [rsp]; both src and dst are stepped back by 2 * %1 at the end.
9006 ;-----------------------------------------------------------------------------------------------------------------
9007 %macro FILTER_VER_CHROMA_SS 2
9009 cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-gprsize
9017 lea r5, [tab_ChromaCoeffV] ; r5 = address of tab_ChromaCoeffV
9020 lea r6, [tab_ChromaCoeffV + r4] ; r6 = tab_ChromaCoeffV + r4 (base used by PROCESS_CHROMA_SP_W4_4R)
9023 mov dword [rsp], %2/4 ; stack slot = %2 / 4
9028 PROCESS_CHROMA_SP_W4_4R
9039 movhps [r2 + r3], m0 ; store the high 8 bytes of m0 at r2 + r3
9040 lea r5, [r2 + 2 * r3] ; r5 = r2 + 2 * r3
9042 movhps [r5 + r3], m2 ; store the high 8 bytes of m2 at r5 + r3
9044 lea r5, [4 * r1 - 2 * 4] ; r5 = 4 * r1 - 8
9051 lea r0, [r0 + 4 * r1 - 2 * %1] ; r0 += 4 * r1 - 2 * %1
9052 lea r2, [r2 + 4 * r3 - 2 * %1] ; r2 += 4 * r3 - 2 * %1
; Each line below expands the macro once, defining one function per (%1, %2) pair.
9060 FILTER_VER_CHROMA_SS 4, 4
9061 FILTER_VER_CHROMA_SS 4, 8
9062 FILTER_VER_CHROMA_SS 16, 16
9063 FILTER_VER_CHROMA_SS 16, 8
9064 FILTER_VER_CHROMA_SS 16, 12
9065 FILTER_VER_CHROMA_SS 12, 16
9066 FILTER_VER_CHROMA_SS 16, 4
9067 FILTER_VER_CHROMA_SS 4, 16
9068 FILTER_VER_CHROMA_SS 32, 32
9069 FILTER_VER_CHROMA_SS 32, 16
9070 FILTER_VER_CHROMA_SS 16, 32
9071 FILTER_VER_CHROMA_SS 32, 24
9072 FILTER_VER_CHROMA_SS 24, 32
9073 FILTER_VER_CHROMA_SS 32, 8
9075 FILTER_VER_CHROMA_SS 16, 24
9076 FILTER_VER_CHROMA_SS 12, 32
9077 FILTER_VER_CHROMA_SS 4, 32
9078 FILTER_VER_CHROMA_SS 32, 64
9079 FILTER_VER_CHROMA_SS 16, 64
9080 FILTER_VER_CHROMA_SS 32, 48
9081 FILTER_VER_CHROMA_SS 24, 64
9083 FILTER_VER_CHROMA_SS 64, 64
9084 FILTER_VER_CHROMA_SS 64, 32
9085 FILTER_VER_CHROMA_SS 64, 48
9086 FILTER_VER_CHROMA_SS 48, 64
9087 FILTER_VER_CHROMA_SS 64, 16
9090 ;---------------------------------------------------------------------------------------------------------------------
9091 ; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
9092 ;---------------------------------------------------------------------------------------------------------------------
9093 %macro FILTER_VER_CHROMA_SS_W2_4R 2
9095 cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 5
9103 lea r5, [tab_ChromaCoeffV] ; r5 = address of tab_ChromaCoeffV
9106 lea r5, [tab_ChromaCoeffV + r4] ; r5 = tab_ChromaCoeffV + r4
9112 PROCESS_CHROMA_SP_W2_4R r5 ; r5 is passed as the coefficient base
9120 pextrd [r2 + r3], m0, 1 ; store dword 1 of m0 at r2 + r3
9121 lea r2, [r2 + 2 * r3] ; r2 += 2 * r3
9123 pextrd [r2 + r3], m0, 3 ; store dword 3 of m0 at r2 + r3
9125 lea r2, [r2 + 2 * r3] ; r2 += 2 * r3
; Each line below expands the macro once, defining one function per (%1, %2) pair.
9133 FILTER_VER_CHROMA_SS_W2_4R 2, 4
9134 FILTER_VER_CHROMA_SS_W2_4R 2, 8
9136 FILTER_VER_CHROMA_SS_W2_4R 2, 16
9138 ;---------------------------------------------------------------------------------------------------------------
9139 ; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
9140 ;---------------------------------------------------------------------------------------------------------------
9142 cglobal interp_4tap_vert_ss_4x2, 5, 6, 4
9150 lea r5, [tab_ChromaCoeffV] ; r5 = address of tab_ChromaCoeffV
9153 lea r5, [tab_ChromaCoeffV + r4] ; r5 = tab_ChromaCoeffV + r4
9158 punpcklwd m0, m1 ;m0=[0 1]
9159 pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1
9161 lea r0, [r0 + 2 * r1] ; r0 += 2 * r1
9163 punpcklwd m1, m2 ;m1=[1 2]
9164 pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2
9167 punpcklwd m2, m3 ;m2=[2 3]
9168 pmaddwd m2, [r5 + 1 * 16]
9169 paddd m0, m2 ;m0=[0+1+2+3] Row1 done
9172 movq m2, [r0 + 2 * r1] ; load 8 bytes from r0 + 2 * r1
9173 punpcklwd m3, m2 ;m3=[3 4]
9174 pmaddwd m3, [r5 + 1 * 16]
9175 paddd m1, m3 ;m1=[1+2+3+4] Row2 done
9181 movhps [r2 + r3], m0 ; store the high 8 bytes of m0 at r2 + r3
9185 ;-------------------------------------------------------------------------------------------------------------------
9186 ; void interp_4tap_vert_ss_6x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
; Only %2 is pasted into the function name; the leading 6 is fixed.
; The body runs PROCESS_CHROMA_SP_W4_4R and then PROCESS_CHROMA_SP_W2_4R, both with r6 as coefficient base.
9187 ;-------------------------------------------------------------------------------------------------------------------
9188 %macro FILTER_VER_CHROMA_SS_W6_H4 2
9190 cglobal interp_4tap_vert_ss_6x%2, 5, 7, 6
9198 lea r5, [tab_ChromaCoeffV] ; r5 = address of tab_ChromaCoeffV
9201 lea r6, [tab_ChromaCoeffV + r4] ; r6 = tab_ChromaCoeffV + r4
9207 PROCESS_CHROMA_SP_W4_4R
9218 movhps [r2 + r3], m0 ; store the high 8 bytes of m0 at r2 + r3
9219 lea r5, [r2 + 2 * r3] ; r5 = r2 + 2 * r3
9221 movhps [r5 + r3], m2 ; store the high 8 bytes of m2 at r5 + r3
9223 lea r5, [4 * r1 - 2 * 4] ; r5 = 4 * r1 - 8
9227 PROCESS_CHROMA_SP_W2_4R r6
9235 pextrd [r2 + r3], m0, 1 ; store dword 1 of m0 at r2 + r3
9236 lea r2, [r2 + 2 * r3] ; r2 += 2 * r3
9238 pextrd [r2 + r3], m0, 3 ; store dword 3 of m0 at r2 + r3
9241 lea r2, [r2 + 2 * r3 - 2 * 4] ; r2 += 2 * r3 - 8
; Each line below expands the macro once.
9249 FILTER_VER_CHROMA_SS_W6_H4 6, 8
9251 FILTER_VER_CHROMA_SS_W6_H4 6, 16
9254 ;----------------------------------------------------------------------------------------------------------------
9255 ; void interp_4tap_vert_ss_8x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
; The symbol is built as %1x%2; every instantiation below passes %1 = 8.
9256 ;----------------------------------------------------------------------------------------------------------------
9257 %macro FILTER_VER_CHROMA_SS_W8_H2 2
9259 cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 7
9267 lea r5, [tab_ChromaCoeffV] ; r5 = address of tab_ChromaCoeffV
9270 lea r5, [tab_ChromaCoeffV + r4] ; r5 = tab_ChromaCoeffV + r4 (base used by PROCESS_CHROMA_SP_W8_2R)
9275 PROCESS_CHROMA_SP_W8_2R
9288 lea r2, [r2 + 2 * r3] ; r2 += 2 * r3
; Each line below expands the macro once, defining one function per (%1, %2) pair.
9296 FILTER_VER_CHROMA_SS_W8_H2 8, 2
9297 FILTER_VER_CHROMA_SS_W8_H2 8, 4
9298 FILTER_VER_CHROMA_SS_W8_H2 8, 6
9299 FILTER_VER_CHROMA_SS_W8_H2 8, 8
9300 FILTER_VER_CHROMA_SS_W8_H2 8, 16
9301 FILTER_VER_CHROMA_SS_W8_H2 8, 32
9303 FILTER_VER_CHROMA_SS_W8_H2 8, 12
9304 FILTER_VER_CHROMA_SS_W8_H2 8, 64
9306 ;-----------------------------------------------------------------------------------------------------------------
9307 ; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
; %1 and %2 are the two block dimensions pasted into the function name.
; %2 / 4 is written to the dword at [rsp]; both src and dst are stepped back by 2 * %1 at the end.
; The inline body accumulates four 8-tap row sums in m0..m3 using the coefficient
; pairs at [r6 + 0 * 16] .. [r6 + 3 * 16]; m4, m5 and m6 are scratch.
9308 ;-----------------------------------------------------------------------------------------------------------------
9309 %macro FILTER_VER_LUMA_SS 2
9311 cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-gprsize
9320 lea r5, [tab_LumaCoeffV] ; r5 = address of tab_LumaCoeffV
9323 lea r6, [tab_LumaCoeffV + r4] ; r6 = tab_LumaCoeffV + r4
9326 mov dword [rsp], %2/4 ; stack slot = %2 / 4
9332 punpcklwd m0, m1 ;m0=[0 1]
9333 pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
9335 lea r0, [r0 + 2 * r1] ; r0 += 2 * r1
9337 punpcklwd m1, m4 ;m1=[1 2]
9338 pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
9341 punpcklwd m4, m5 ;m4=[2 3]
9342 pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
9343 pmaddwd m4, [r6 + 1 * 16]
9344 paddd m0, m4 ;m0=[0+1+2+3] Row1
9346 lea r0, [r0 + 2 * r1] ; r0 += 2 * r1
9348 punpcklwd m5, m4 ;m5=[3 4]
9349 pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
9350 pmaddwd m5, [r6 + 1 * 16]
9351 paddd m1, m5 ;m1 = [1+2+3+4] Row2
9354 punpcklwd m4, m5 ;m4=[4 5]
9355 pmaddwd m6, m4, [r6 + 1 * 16]
9356 paddd m2, m6 ;m2=[2+3+4+5] Row3
9357 pmaddwd m4, [r6 + 2 * 16]
9358 paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1
9360 lea r0, [r0 + 2 * r1] ; r0 += 2 * r1
9362 punpcklwd m5, m4 ;m5=[5 6]
9363 pmaddwd m6, m5, [r6 + 1 * 16]
9364 paddd m3, m6 ;m3=[3+4+5+6] Row4
9365 pmaddwd m5, [r6 + 2 * 16]
9366 paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2
9369 punpcklwd m4, m5 ;m4=[6 7]
9370 pmaddwd m6, m4, [r6 + 2 * 16]
9371 paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3
9372 pmaddwd m4, [r6 + 3 * 16]
9373 paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end
9376 lea r0, [r0 + 2 * r1] ; r0 += 2 * r1
9378 punpcklwd m5, m4 ;m5=[7 8]
9379 pmaddwd m6, m5, [r6 + 2 * 16]
9380 paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4
9381 pmaddwd m5, [r6 + 3 * 16]
9382 paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end
9388 movhps [r2 + r3], m0 ; store the high 8 bytes of m0 at r2 + r3
9391 punpcklwd m4, m5 ;m4=[8 9]
9392 pmaddwd m4, [r6 + 3 * 16]
9393 paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end
9396 movq m4, [r0 + 2 * r1] ; load 8 bytes from r0 + 2 * r1
9397 punpcklwd m5, m4 ;m5=[9 10]
9398 pmaddwd m5, [r6 + 3 * 16]
9399 paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end
9404 movlps [r2 + 2 * r3], m2 ; store the low 8 bytes of m2 at r2 + 2 * r3
9406 movhps [r2 + r5], m2 ; store the high 8 bytes of m2 at r2 + r5
9408 lea r5, [8 * r1 - 2 * 4] ; r5 = 8 * r1 - 8
9415 lea r0, [r0 + 4 * r1 - 2 * %1] ; r0 += 4 * r1 - 2 * %1
9416 lea r2, [r2 + 4 * r3 - 2 * %1] ; r2 += 4 * r3 - 2 * %1
; Each line below expands the macro once, defining one function per (%1, %2) pair.
9424 FILTER_VER_LUMA_SS 4, 4
9425 FILTER_VER_LUMA_SS 8, 8
9426 FILTER_VER_LUMA_SS 8, 4
9427 FILTER_VER_LUMA_SS 4, 8
9428 FILTER_VER_LUMA_SS 16, 16
9429 FILTER_VER_LUMA_SS 16, 8
9430 FILTER_VER_LUMA_SS 8, 16
9431 FILTER_VER_LUMA_SS 16, 12
9432 FILTER_VER_LUMA_SS 12, 16
9433 FILTER_VER_LUMA_SS 16, 4
9434 FILTER_VER_LUMA_SS 4, 16
9435 FILTER_VER_LUMA_SS 32, 32
9436 FILTER_VER_LUMA_SS 32, 16
9437 FILTER_VER_LUMA_SS 16, 32
9438 FILTER_VER_LUMA_SS 32, 24
9439 FILTER_VER_LUMA_SS 24, 32
9440 FILTER_VER_LUMA_SS 32, 8
9441 FILTER_VER_LUMA_SS 8, 32
9442 FILTER_VER_LUMA_SS 64, 64
9443 FILTER_VER_LUMA_SS 64, 32
9444 FILTER_VER_LUMA_SS 32, 64
9445 FILTER_VER_LUMA_SS 64, 48
9446 FILTER_VER_LUMA_SS 48, 64
9447 FILTER_VER_LUMA_SS 64, 16
9448 FILTER_VER_LUMA_SS 16, 64