1 ;*****************************************************************************
2 ;* Copyright (C) 2013 x265 project
4 ;* Authors: Min Chen <chenm003@163.com>
5 ;* Nabajit Deka <nabajit@multicorewareinc.com>
6 ;* Praveen Kumar Tiwari <praveen@multicorewareinc.com>
8 ;* This program is free software; you can redistribute it and/or modify
9 ;* it under the terms of the GNU General Public License as published by
10 ;* the Free Software Foundation; either version 2 of the License, or
11 ;* (at your option) any later version.
13 ;* This program is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ;* GNU General Public License for more details.
18 ;* You should have received a copy of the GNU General Public License
19 ;* along with this program; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 ;* This program is also available under a commercial proprietary license.
23 ;* For more information, contact us at license @ x265.com.
24 ;*****************************************************************************/
27 %include "x86util.asm"
; ---------------------------------------------------------------------------
; Constant tables shared by the interpolation kernels in this file.
; ---------------------------------------------------------------------------
; tab_Tm: pshufb index rows; row k gathers four overlapping 4-byte windows
; whose first source bytes are 4k, 4k+1, 4k+2 and 4k+3.
30 tab_Tm: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
31 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
32 db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14
; tab_Lm: pshufb index rows; row k gathers two overlapping 8-byte windows
; whose first source bytes are 2k and 2k+1.
35 tab_Lm: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8
36 db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10
37 db 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12
38 db 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14
; tab_Vm: pshufb index rows that replicate one byte pair across a register:
; row 0 repeats source bytes (0,1), row 1 repeats source bytes (2,3).
40 tab_Vm: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
41 db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3
; tab_Cm: byte order 0,2,1,3 repeated four times.
43 tab_Cm: db 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3
45 tab_c_512: times 8 dw 512 ; 8 words of 512; as a pmulhrsw multiplier this is a rounded >> 6
46 tab_c_526336: times 4 dd 8192*64+2048 ; 4 dwords of 526336; added to the 32-bit sums in FILTER_HV8_END
; tab_ChromaCoeff: byte taps, read as 4-byte entries via [tab_ChromaCoeff + idx * 4].
48 tab_ChromaCoeff: db 0, 64, 0, 0
57 tab_ChromaCoeffV: times 4 dw 0, 64 ; word pair (0, 64) repeated 4 times
; tab_LumaCoeff: byte taps, read as 8-byte entries via [tab_LumaCoeff + idx * 8];
; each of the four rows below sums to 64.
81 tab_LumaCoeff: db 0, 0, 0, 64, 0, 0, 0, 0
82 db -1, 4, -10, 58, 17, -5, 1, 0
83 db -1, 4, -11, 40, 40, -11, 4, -1
84 db 0, 1, -5, 17, 58, -10, 4, -1
86 tab_LumaCoeffV: times 4 dw 0, 0 ; word pair (0, 0) repeated 4 times
106 tab_LumaCoeffVer: times 8 db 0, 0 ; byte pair (0, 0) repeated 8 times
126 tab_c_128: times 16 db 0x80 ; 16 bytes of 0x80
127 tab_c_64_n64: times 8 db 64, -64 ; byte pair (64, -64) repeated 8 times
; FILTER_H4_w2_2 t0, t1, t2
; Row-pair helper used by the 2-pixel-wide horizontal 4-tap kernels; the three
; parameters are vector registers supplied by the caller. Uses srcq/srcstrideq,
; dstq/dststrideq and r4 directly.
137 %macro FILTER_H4_w2_2 3
140 movh %1, [srcq + srcstrideq - 1] ; low-half load from the second row, starting at src - 1
150 mov [dstq + dststrideq], r4w ; 16-bit store to the second dst row
153 ;-----------------------------------------------------------------------------
154 ; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
155 ;-----------------------------------------------------------------------------
; x86inc cglobal: 4 named args (src, srcstride, dst, dststride), 6 GPRs, 5 vector regs.
157 cglobal interp_4tap_horiz_pp_2x4, 4, 6, 5, src, srcstride, dst, dststride
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
167 lea r5, [tab_ChromaCoeff]
168 movd coef2, [r5 + r4 * 4]
170 movd coef2, [tab_ChromaCoeff + r4 * 4]
173 pshufd coef2, coef2, 0 ; replicate the 4-byte coefficient entry into every dword
178 FILTER_H4_w2_2 t0, t1, t2 ; filter one pair of rows
179 lea srcq, [srcq + srcstrideq * 2] ; src += 2 rows
180 lea dstq, [dstq + dststrideq * 2] ; dst += 2 rows
185 ;-----------------------------------------------------------------------------
186 ; void interp_4tap_horiz_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
187 ;-----------------------------------------------------------------------------
; x86inc cglobal: 4 named args (src, srcstride, dst, dststride), 6 GPRs, 5 vector regs.
189 cglobal interp_4tap_horiz_pp_2x8, 4, 6, 5, src, srcstride, dst, dststride
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
199 lea r5, [tab_ChromaCoeff]
200 movd coef2, [r5 + r4 * 4]
202 movd coef2, [tab_ChromaCoeff + r4 * 4]
205 pshufd coef2, coef2, 0 ; replicate the 4-byte coefficient entry into every dword
210 FILTER_H4_w2_2 t0, t1, t2 ; filter one pair of rows
211 lea srcq, [srcq + srcstrideq * 2] ; src += 2 rows
212 lea dstq, [dstq + dststrideq * 2] ; dst += 2 rows
217 ;-----------------------------------------------------------------------------
218 ; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
219 ;-----------------------------------------------------------------------------
; x86inc cglobal: 4 named args (src, srcstride, dst, dststride), 6 GPRs, 5 vector regs.
221 cglobal interp_4tap_horiz_pp_2x16, 4, 6, 5, src, srcstride, dst, dststride
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
231 lea r5, [tab_ChromaCoeff]
232 movd coef2, [r5 + r4 * 4]
234 movd coef2, [tab_ChromaCoeff + r4 * 4]
237 pshufd coef2, coef2, 0 ; replicate the 4-byte coefficient entry into every dword
244 FILTER_H4_w2_2 t0, t1, t2 ; filter one pair of rows
245 lea srcq, [srcq + srcstrideq * 2] ; src += 2 rows
246 lea dstq, [dstq + dststrideq * 2] ; dst += 2 rows
; FILTER_H4_w4_2 t0, t1, t2
; Row-pair helper used by the 4-pixel-wide horizontal 4-tap kernels; the three
; parameters are vector registers supplied by the caller.
252 %macro FILTER_H4_w4_2 3
256 movh %1, [srcq + srcstrideq - 1] ; low-half load from the second row, starting at src - 1
264 movd [dstq + dststrideq], %2 ; 4-byte store of the low dword of %2 to the second dst row
267 ;-----------------------------------------------------------------------------
268 ; void interp_4tap_horiz_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
269 ;-----------------------------------------------------------------------------
; x86inc cglobal: 4 named args (src, srcstride, dst, dststride), 6 GPRs, 5 vector regs.
271 cglobal interp_4tap_horiz_pp_4x2, 4, 6, 5, src, srcstride, dst, dststride
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
281 lea r5, [tab_ChromaCoeff]
282 movd coef2, [r5 + r4 * 4]
284 movd coef2, [tab_ChromaCoeff + r4 * 4]
287 pshufd coef2, coef2, 0 ; replicate the 4-byte coefficient entry into every dword
291 FILTER_H4_w4_2 t0, t1, t2 ; filter the pair of rows
295 ;-----------------------------------------------------------------------------
296 ; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
297 ;-----------------------------------------------------------------------------
; x86inc cglobal: 4 named args (src, srcstride, dst, dststride), 6 GPRs, 5 vector regs.
299 cglobal interp_4tap_horiz_pp_4x4, 4, 6, 5, src, srcstride, dst, dststride
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
309 lea r5, [tab_ChromaCoeff]
310 movd coef2, [r5 + r4 * 4]
312 movd coef2, [tab_ChromaCoeff + r4 * 4]
315 pshufd coef2, coef2, 0 ; replicate the 4-byte coefficient entry into every dword
320 FILTER_H4_w4_2 t0, t1, t2 ; filter one pair of rows
321 lea srcq, [srcq + srcstrideq * 2] ; src += 2 rows
322 lea dstq, [dstq + dststrideq * 2] ; dst += 2 rows
327 ;-----------------------------------------------------------------------------
328 ; void interp_4tap_horiz_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
329 ;-----------------------------------------------------------------------------
; x86inc cglobal: 4 named args (src, srcstride, dst, dststride), 6 GPRs, 5 vector regs.
331 cglobal interp_4tap_horiz_pp_4x8, 4, 6, 5, src, srcstride, dst, dststride
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
341 lea r5, [tab_ChromaCoeff]
342 movd coef2, [r5 + r4 * 4]
344 movd coef2, [tab_ChromaCoeff + r4 * 4]
347 pshufd coef2, coef2, 0 ; replicate the 4-byte coefficient entry into every dword
352 FILTER_H4_w4_2 t0, t1, t2 ; filter one pair of rows
353 lea srcq, [srcq + srcstrideq * 2] ; src += 2 rows
354 lea dstq, [dstq + dststrideq * 2] ; dst += 2 rows
359 ;-----------------------------------------------------------------------------
360 ; void interp_4tap_horiz_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
361 ;-----------------------------------------------------------------------------
; x86inc cglobal: 4 named args (src, srcstride, dst, dststride), 6 GPRs, 5 vector regs.
363 cglobal interp_4tap_horiz_pp_4x16, 4, 6, 5, src, srcstride, dst, dststride
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
373 lea r5, [tab_ChromaCoeff]
374 movd coef2, [r5 + r4 * 4]
376 movd coef2, [tab_ChromaCoeff + r4 * 4]
379 pshufd coef2, coef2, 0 ; replicate the 4-byte coefficient entry into every dword
384 FILTER_H4_w4_2 t0, t1, t2 ; filter one pair of rows
385 lea srcq, [srcq + srcstrideq * 2] ; src += 2 rows
386 lea dstq, [dstq + dststrideq * 2] ; dst += 2 rows
391 ;-----------------------------------------------------------------------------
392 ; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
393 ;-----------------------------------------------------------------------------
; x86inc cglobal: 4 named args (src, srcstride, dst, dststride), 6 GPRs, 5 vector regs.
395 cglobal interp_4tap_horiz_pp_4x32, 4, 6, 5, src, srcstride, dst, dststride
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
405 lea r5, [tab_ChromaCoeff]
406 movd coef2, [r5 + r4 * 4]
408 movd coef2, [tab_ChromaCoeff + r4 * 4]
411 pshufd coef2, coef2, 0 ; replicate the 4-byte coefficient entry into every dword
418 FILTER_H4_w4_2 t0, t1, t2 ; filter one pair of rows
419 lea srcq, [srcq + srcstrideq * 2] ; src += 2 rows
420 lea dstq, [dstq + dststrideq * 2] ; dst += 2 rows
; FILTER_H4_w6 t0, t1, t2
; Row helper selected by IPFILTER_CHROMA for width 6 (via FILTER_H4_w%1).
427 %macro FILTER_H4_w6 3
437 pextrw [dstq + 4], %2, 2 ; store word 2 of %2 (2 bytes) at dst + 4
; FILTER_H4_w8 t0, t1, t2
; Row helper selected by IPFILTER_CHROMA for width 8 (via FILTER_H4_w%1).
440 %macro FILTER_H4_w8 3
; FILTER_H4_w12 t0, t1, t2
; Row helper selected by IPFILTER_CHROMA for width 12 (via FILTER_H4_w%1).
452 %macro FILTER_H4_w12 3
460 movu %1, [srcq - 1 + 8] ; unaligned load starting at src + 7 (source for output bytes from column 8)
467 pextrd [dstq + 8], %2, 2 ; store dword 2 of %2 (4 bytes) at dst + 8
; FILTER_H4_w16 t0, t1, t2, t3
; Row helper selected by IPFILTER_CHROMA_W for width 16 (via FILTER_H4_w%1).
470 %macro FILTER_H4_w16 4
477 movu %1, [srcq - 1 + 8] ; unaligned load starting at src + 7 (second 8-column group)
; FILTER_H4_w24 t0, t1, t2, t3
; Row helper selected by IPFILTER_CHROMA_W for width 24 (via FILTER_H4_w%1).
489 %macro FILTER_H4_w24 4
496 movu %1, [srcq - 1 + 8] ; unaligned load starting at src + 7
506 movu %1, [srcq - 1 + 16] ; unaligned load starting at src + 15
; FILTER_H4_w32 t0, t1, t2, t3
; Row helper selected by IPFILTER_CHROMA_W for width 32 (via FILTER_H4_w%1).
517 %macro FILTER_H4_w32 4
524 movu %1, [srcq - 1 + 8] ; unaligned load starting at src + 7
534 movu %1, [srcq - 1 + 16] ; unaligned load starting at src + 15
540 movu %1, [srcq - 1 + 24] ; unaligned load starting at src + 23
; FILTER_H4_w16o t0, t1, t2, t3, offset
; 16-column row helper with a byte offset (%5) added to the source address;
; building block for FILTER_H4_w48 and FILTER_H4_w64.
552 %macro FILTER_H4_w16o 5
553 movu %1, [srcq + %5 - 1] ; unaligned load starting at src + offset - 1
559 movu %1, [srcq + %5 - 1 + 8] ; unaligned load starting at src + offset + 7
; FILTER_H4_w48 t0, t1, t2, t3
; One 48-column row = three FILTER_H4_w16o expansions at offsets 0, 16 and 32.
571 %macro FILTER_H4_w48 4
572 FILTER_H4_w16o %1, %2, %3, %4, 0
573 FILTER_H4_w16o %1, %2, %3, %4, 16
574 FILTER_H4_w16o %1, %2, %3, %4, 32
; FILTER_H4_w64 t0, t1, t2, t3
; One 64-column row = four FILTER_H4_w16o expansions at offsets 0, 16, 32 and 48.
577 %macro FILTER_H4_w64 4
578 FILTER_H4_w16o %1, %2, %3, %4, 0
579 FILTER_H4_w16o %1, %2, %3, %4, 16
580 FILTER_H4_w16o %1, %2, %3, %4, 32
581 FILTER_H4_w16o %1, %2, %3, %4, 48
584 ;-----------------------------------------------------------------------------
585 ; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
586 ;-----------------------------------------------------------------------------
; IPFILTER_CHROMA width, height
; Emits interp_4tap_horiz_pp_<width>x<height>. Per-row work is delegated to the
; width-specific FILTER_H4_w<width> macro, called with three vector registers.
587 %macro IPFILTER_CHROMA 2
; x86inc cglobal: 4 named args, 6 GPRs, 6 vector regs.
589 cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
600 lea r5, [tab_ChromaCoeff]
601 movd coef2, [r5 + r4 * 4]
603 movd coef2, [tab_ChromaCoeff + r4 * 4]
608 pshufd coef2, coef2, 0 ; replicate the 4-byte coefficient entry into every dword
611 mova Tm1, [tab_Tm + 16] ; second row of the tab_Tm shuffle table
614 FILTER_H4_w%1 t0, t1, t2 ; row filter chosen by the width parameter
; Instantiations (width, height):
630 IPFILTER_CHROMA 8, 16
631 IPFILTER_CHROMA 8, 32
632 IPFILTER_CHROMA 12, 16
634 IPFILTER_CHROMA 6, 16
635 IPFILTER_CHROMA 8, 12
636 IPFILTER_CHROMA 8, 64
637 IPFILTER_CHROMA 12, 32
639 ;-----------------------------------------------------------------------------
640 ; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
641 ;-----------------------------------------------------------------------------
; IPFILTER_CHROMA_W width, height
; Wide-block counterpart of IPFILTER_CHROMA: emits
; interp_4tap_horiz_pp_<width>x<height> and delegates each row to
; FILTER_H4_w<width>, called with four vector registers.
642 %macro IPFILTER_CHROMA_W 2
; x86inc cglobal: 4 named args, 6 GPRs, 7 vector regs.
644 cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 7, src, srcstride, dst, dststride
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
656 lea r5, [tab_ChromaCoeff]
657 movd coef2, [r5 + r4 * 4]
659 movd coef2, [tab_ChromaCoeff + r4 * 4]
664 pshufd coef2, coef2, 0 ; replicate the 4-byte coefficient entry into every dword
667 mova Tm1, [tab_Tm + 16] ; second row of the tab_Tm shuffle table
670 FILTER_H4_w%1 t0, t1, t2, t3 ; row filter chosen by the width parameter
; Instantiations (width, height):
680 IPFILTER_CHROMA_W 16, 4
681 IPFILTER_CHROMA_W 16, 8
682 IPFILTER_CHROMA_W 16, 12
683 IPFILTER_CHROMA_W 16, 16
684 IPFILTER_CHROMA_W 16, 32
685 IPFILTER_CHROMA_W 32, 8
686 IPFILTER_CHROMA_W 32, 16
687 IPFILTER_CHROMA_W 32, 24
688 IPFILTER_CHROMA_W 24, 32
689 IPFILTER_CHROMA_W 32, 32
691 IPFILTER_CHROMA_W 16, 24
692 IPFILTER_CHROMA_W 16, 64
693 IPFILTER_CHROMA_W 32, 48
694 IPFILTER_CHROMA_W 24, 64
695 IPFILTER_CHROMA_W 32, 64
697 IPFILTER_CHROMA_W 64, 64
698 IPFILTER_CHROMA_W 64, 32
699 IPFILTER_CHROMA_W 64, 48
700 IPFILTER_CHROMA_W 48, 64
701 IPFILTER_CHROMA_W 64, 16
; FILTER_H8_W8: horizontal 8-tap helper for 8 output columns. Takes 7 or 8
; parameters (the 8th, dst, is optional). %1 holds the source bytes; the four
; tab_Lm rows spread them into overlapping 8-byte windows (two per register).
704 %macro FILTER_H8_W8 7-8 ; t0, t1, t2, t3, coef, c512, src, dst
706 pshufb %2, %1, [tab_Lm + 0] ; windows starting at bytes 0 and 1
708 pshufb %3, %1, [tab_Lm + 16] ; windows starting at bytes 2 and 3
711 pshufb %4, %1, [tab_Lm + 32] ; windows starting at bytes 4 and 5
713 pshufb %1, %1, [tab_Lm + 48] ; windows starting at bytes 6 and 7 (overwrites the source)
; FILTER_H8_W4 t0, t1
; Horizontal 8-tap helper for 4 output columns. Reads r0/r5 directly and also
; writes m7, which is not one of its parameters.
724 %macro FILTER_H8_W4 2
725 movu %1, [r0 - 3 + r5] ; unaligned load starting 3 bytes before r0 + r5
726 pshufb %2, %1, [tab_Lm] ; windows starting at bytes 0 and 1
728 pshufb m7, %1, [tab_Lm + 16] ; windows starting at bytes 2 and 3
734 ;----------------------------------------------------------------------------------------------------------------------------
735 ; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
736 ;----------------------------------------------------------------------------------------------------------------------------
; IPFILTER_LUMA width, height, kind
; Emits interp_8tap_horiz_<kind>_<width>x<height>; this file instantiates it
; with kind = pp and kind = ps.
737 %macro IPFILTER_LUMA 3
; x86inc cglobal: 4 args loaded, 7 GPRs, 8 vector regs.
739 cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8
; The lea+movh pair and the direct movh below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects an 8-byte entry.
744 lea r6, [tab_LumaCoeff]
745 movh m3, [r6 + r4 * 8]
747 movh m3, [tab_LumaCoeff + r4 * 8]
762 lea r6, [r1 + 2 * r1] ; r6 = 3 * r1
; 8-parameter form: m2 is passed in the c512 slot and a dst operand is given.
771 FILTER_H8_W8 m0, m1, m4, m5, m3, m2, [r0 - 3 + r5], [r2 + r5]
; 7-parameter form: c512 slot is UNUSED and there is no dst operand, so the
; result in m1 is stored by the caller-side code below.
773 FILTER_H8_W8 m0, m1, m4, m5, m3, UNUSED, [r0 - 3 + r5]
775 movu [r2 + 2 * r5], m1 ; 16-byte store at r2 + 2*r5 (index scaled by 2; presumably 16-bit outputs -- confirm)
788 movh [r2 + 2 * r5], m1 ; 8-byte store at r2 + 2*r5
; 4x4 horizontal 8-tap filter using 256-bit instructions (vpbroadcast*,
; vbroadcasti128, vextracti128). x86inc cglobal: 4 args loaded, 6 GPRs, 6 vector regs.
802 cglobal interp_8tap_horiz_pp_4x4, 4,6,6
; The lea+vpbroadcastq pair and the direct vpbroadcastq below are two forms of
; one load (presumably PIC vs. non-PIC -- TODO confirm); r4 selects an 8-byte entry.
806 lea r5, [tab_LumaCoeff]
807 vpbroadcastq m0, [r5 + r4 * 8]
809 vpbroadcastq m0, [tab_LumaCoeff + r4 * 8]
813 vpbroadcastd m2, [pw_1]
816 ; m0 - interpolate coeff
817 ; m1 - shuffle order table
818 ; m2 - constant word 1
822 vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
826 vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
830 phaddd m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]
833 lea r0, [r0 + r1 * 2] ; advance source by two rows
834 vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
838 vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
842 phaddd m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]
844 packssdw m3, m4 ; WORD [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
845 pmulhrsw m3, [pw_512] ; assuming pw_512 holds words of 512 (defined outside this file), this is a rounded >> 6
846 vextracti128 xm4, m3, 1 ; xm4 = upper 128 bits of m3
847 packuswb xm3, xm4 ; BYTE [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
848 pshufb xm3, [idct4_shuf1] ; [row3 row1 row2 row0]
; Per the layout above: dword 2 = row 1, dword 1 = row 2, dword 3 = row 3.
852 pextrd [r2+r3], xm3, 2
853 pextrd [r2+r3*2], xm3, 1
854 pextrd [r2+r0], xm3, 3 ; NOTE(review): r0 is used as a dst offset here; presumably reloaded with 3 * r3 -- confirm
858 ;--------------------------------------------------------------------------------------------------------------
859 ; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
860 ;--------------------------------------------------------------------------------------------------------------
; pp instantiations of IPFILTER_LUMA (width, height, kind):
861 IPFILTER_LUMA 4, 4, pp
862 IPFILTER_LUMA 4, 8, pp
863 IPFILTER_LUMA 12, 16, pp
864 IPFILTER_LUMA 4, 16, pp
866 ;--------------------------------------------------------------------------------------------------------------
867 ; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
868 ;--------------------------------------------------------------------------------------------------------------
; IPFILTER_LUMA_PP_W8 width, height
; Emits interp_8tap_horiz_pp_<width>x<height> for the widths instantiated
; below (all multiples of 8).
869 %macro IPFILTER_LUMA_PP_W8 2
; x86inc cglobal: 4 args loaded, 6 GPRs, 7 vector regs.
871 cglobal interp_8tap_horiz_pp_%1x%2, 4,6,7
; The lea+movh pair and the direct movh below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects an 8-byte entry.
875 lea r5, [tab_LumaCoeff]
876 movh m3, [r5 + r4 * 8]
878 movh m3, [tab_LumaCoeff + r4 * 8]
880 pshufd m0, m3, 0 ; m0 = coeff-L
881 pshufd m1, m3, 0x55 ; m1 = coeff-H
882 lea r5, [tab_Tm] ; r5 = shuffle
883 mova m2, [pw_512] ; m2 = 512
; Source is read from 3 bytes before column x; the three tab_Tm rows then
; produce the overlapping 4-byte windows shown on each line.
889 movu m3, [r0 - 3 + x] ; m3 = [F E D C B A 9 8 7 6 5 4 3 2 1 0]
890 pshufb m4, m3, [r5 + 0*16] ; m4 = [6 5 4 3 5 4 3 2 4 3 2 1 3 2 1 0]
891 pshufb m5, m3, [r5 + 1*16] ; m5 = [A 9 8 7 9 8 7 6 8 7 6 5 7 6 5 4]
892 pshufb m3, [r5 + 2*16] ; m3 = [E D C B D C B A C B A 9 B A 9 8]
; Instantiations (width, height):
914 IPFILTER_LUMA_PP_W8 8, 4
915 IPFILTER_LUMA_PP_W8 8, 8
916 IPFILTER_LUMA_PP_W8 8, 16
917 IPFILTER_LUMA_PP_W8 8, 32
918 IPFILTER_LUMA_PP_W8 16, 4
919 IPFILTER_LUMA_PP_W8 16, 8
920 IPFILTER_LUMA_PP_W8 16, 12
921 IPFILTER_LUMA_PP_W8 16, 16
922 IPFILTER_LUMA_PP_W8 16, 32
923 IPFILTER_LUMA_PP_W8 16, 64
924 IPFILTER_LUMA_PP_W8 24, 32
925 IPFILTER_LUMA_PP_W8 32, 8
926 IPFILTER_LUMA_PP_W8 32, 16
927 IPFILTER_LUMA_PP_W8 32, 24
928 IPFILTER_LUMA_PP_W8 32, 32
929 IPFILTER_LUMA_PP_W8 32, 64
930 IPFILTER_LUMA_PP_W8 48, 64
931 IPFILTER_LUMA_PP_W8 64, 16
932 IPFILTER_LUMA_PP_W8 64, 32
933 IPFILTER_LUMA_PP_W8 64, 48
934 IPFILTER_LUMA_PP_W8 64, 64
936 ;----------------------------------------------------------------------------------------------------------------------------
937 ; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
938 ;----------------------------------------------------------------------------------------------------------------------------
; ps instantiations of IPFILTER_LUMA (width, height, kind):
939 IPFILTER_LUMA 4, 4, ps
940 IPFILTER_LUMA 8, 8, ps
941 IPFILTER_LUMA 8, 4, ps
942 IPFILTER_LUMA 4, 8, ps
943 IPFILTER_LUMA 16, 16, ps
944 IPFILTER_LUMA 16, 8, ps
945 IPFILTER_LUMA 8, 16, ps
946 IPFILTER_LUMA 16, 12, ps
947 IPFILTER_LUMA 12, 16, ps
948 IPFILTER_LUMA 16, 4, ps
949 IPFILTER_LUMA 4, 16, ps
950 IPFILTER_LUMA 32, 32, ps
951 IPFILTER_LUMA 32, 16, ps
952 IPFILTER_LUMA 16, 32, ps
953 IPFILTER_LUMA 32, 24, ps
954 IPFILTER_LUMA 24, 32, ps
955 IPFILTER_LUMA 32, 8, ps
956 IPFILTER_LUMA 8, 32, ps
957 IPFILTER_LUMA 64, 64, ps
958 IPFILTER_LUMA 64, 32, ps
959 IPFILTER_LUMA 32, 64, ps
960 IPFILTER_LUMA 64, 48, ps
961 IPFILTER_LUMA 48, 64, ps
962 IPFILTER_LUMA 64, 16, ps
963 IPFILTER_LUMA 16, 64, ps
965 ;-----------------------------------------------------------------------------
967 ;-----------------------------------------------------------------------------
; First stage of the vertical pass of the 8-tap HV filter. Reads 16-byte rows
; from the buffer at r0 (row index = off_src + n) and multiplies word pairs
; by the 16-byte coefficient row at r5 + off_coeff * 16, starting the running
; sums for output rows 0 and 1.
968 %macro FILTER_HV8_START 7 ; (t0, t1, t2, t3, t4, off_src, off_coeff) -> (t3, t5), (t4, t1), [2]
969 mova %5, [r0 + (%6 + 0) * 16]
970 mova %1, [r0 + (%6 + 1) * 16]
971 mova %2, [r0 + (%6 + 2) * 16]
974 pmaddwd %3, [r5 + (%7) * 16] ; R3 = L[0+1] -- Row 0
975 pmaddwd %5, [r5 + (%7) * 16] ; R0 = H[0+1]
978 pmaddwd %4, [r5 + (%7) * 16] ; R4 = L[1+2] -- Row 1
979 pmaddwd %1, [r5 + (%7) * 16] ; R1 = H[1+2]
980 %endmacro ; FILTER_HV8_START
; Middle stage of the vertical pass: loads two more 16-byte rows from the
; buffer at r0, multiplies by the coefficient row at r5 + off_coeff * 16 and
; accumulates into the four running sums (low/high halves of rows 0 and 1).
982 %macro FILTER_HV8_MID 10 ; (Row3, prevRow, sum0L, sum1L, sum0H, sum1H, t6, t7, off_src, off_coeff) -> [6]
983 mova %8, [r0 + (%9 + 0) * 16]
984 mova %1, [r0 + (%9 + 1) * 16]
987 pmaddwd %7, [r5 + %10 * 16]
988 pmaddwd %2, [r5 + %10 * 16]
989 paddd %3, %7 ; R3 = L[0+1+2+3] -- Row 0
990 paddd %5, %2 ; R0 = H[0+1+2+3]
993 pmaddwd %7, [r5 + %10 * 16]
994 pmaddwd %8, [r5 + %10 * 16]
995 paddd %4, %7 ; R4 = L[1+2+3+4] -- Row 1
996 paddd %6, %8 ; R1 = H[1+2+3+4]
997 %endmacro ; FILTER_HV8_MID
; Final stage of the vertical pass: adds the constant 8192*64+2048 (526336)
; to each of the four 32-bit sum registers passed in.
1000 %macro FILTER_HV8_END 4 ; output in [1, 3]
1001 paddd %1, [tab_c_526336]
1002 paddd %2, [tab_c_526336]
1003 paddd %3, [tab_c_526336]
1004 paddd %4, [tab_c_526336]
1012 ; TODO: would merging these be better? This form keeps the dependency chains short.
1014 %endmacro ; FILTER_HV8_END
1016 ;-----------------------------------------------------------------------------
1017 ; void interp_8tap_hv_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
1018 ;-----------------------------------------------------------------------------
; Combined horizontal + vertical 8-tap filter for one 8x8 block.
; x86inc cglobal: 4 args loaded, 7 GPRs, 8 vector regs; the fifth argument
; (0-15*16) is the x86inc stack-size field, i.e. 15 * 16 bytes of stack scratch.
1020 cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
; The lea+movh pair and the direct movh below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects an 8-byte entry.
1028 lea r6, [tab_LumaCoeff]
1029 movh coef, [r6 + r4 * 8]
1031 movh coef, [tab_LumaCoeff + r4 * 8]
1033 punpcklqdq coef, coef ; copy the 8 taps into both 64-bit halves
1036 lea r6, [r1 + r1 * 2] ; r6 = 3 * r1
1043 FILTER_H8_W8 m0, m1, m2, m3, coef, [tab_c_512], [r0 - 3] ; horizontal pass, 7-parameter form (no dst operand)
1054 ; From here on all of the mN registers are free
1058 lea r6, [tab_LumaCoeffV]
1061 ; load intermediate buffer
1072 ; TODO: this loop has more than 70 instructions, which probably exceeds the Intel loop decode cache
; Vertical pass: START/MID/MID/MID consume buffer rows at offsets 0, 3, 5, 7
; with coefficient rows 0..3; END adds the rounding constant.
1075 FILTER_HV8_START m1, m2, m3, m4, m0, 0, 0
1076 FILTER_HV8_MID m6, m2, m3, m4, m0, m1, m7, m5, 3, 1
1077 FILTER_HV8_MID m5, m6, m3, m4, m0, m1, m7, m2, 5, 2
1078 FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3
1079 FILTER_HV8_END m3, m0, m4, m1
1082 movhps [r2 + r3], m3 ; upper 8 bytes of m3 -> second dst row
1084 lea r0, [r0 + 16 * 2] ; advance the buffer pointer by two 16-byte rows
1085 lea r2, [r2 + r3 * 2] ; dst += 2 rows
1093 ;-----------------------------------------------------------------------------
1094 ; void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1095 ;-----------------------------------------------------------------------------
; Vertical 4-tap filter, 2x4 block. x86inc cglobal: 4 args loaded
; (r0..r3 = src, srcStride, dst, dstStride), 6 GPRs, 8 vector regs.
1097 cglobal interp_4tap_vert_pp_2x4, 4, 6, 8
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
1103 lea r5, [tab_ChromaCoeff]
1104 movd m0, [r5 + r4 * 4]
1106 movd m0, [tab_ChromaCoeff + r4 * 4]
1109 lea r5, [r0 + 4 * r1] ; r5 = r0 + 4 rows
1111 mova m1, [tab_c_512] ; words of 512 (pmulhrsw multiplier = rounded >> 6)
1115 movd m4, [r0 + 2 * r1] ; 4 bytes from the row at r0 + 2 * stride
1119 punpcklbw m6, m4, m5 ; m6 = low bytes of m4 and m5 interleaved
1127 punpcklbw m7, m5, m6
1139 punpcklbw m3, m6, m7
1144 movd m3, [r5 + 2 * r1] ; 4 bytes from the row at r5 + 2 * stride
1158 pextrw [r2 + r3], m2, 2 ; word 2 of m2 (2 bytes) -> dst row 1
1159 lea r2, [r2 + 2 * r3] ; dst += 2 rows
1161 pextrw [r2 + r3], m2, 6 ; word 6 of m2 (2 bytes) -> next odd dst row
1165 ;-----------------------------------------------------------------------------
1166 ; void interp_4tap_vert_pp_2x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1167 ;-----------------------------------------------------------------------------
; FILTER_V4_W2_H4 width, height
; Emits interp_4tap_vert_pp_2x<height>; only %2 is used in the function name.
1168 %macro FILTER_V4_W2_H4 2
; x86inc cglobal: 4 args loaded, 6 GPRs, 8 vector regs.
1170 cglobal interp_4tap_vert_pp_2x%2, 4, 6, 8
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
1176 lea r5, [tab_ChromaCoeff]
1177 movd m0, [r5 + r4 * 4]
1179 movd m0, [tab_ChromaCoeff + r4 * 4]
1184 mova m1, [tab_c_512] ; words of 512 (pmulhrsw multiplier = rounded >> 6)
1192 movd m4, [r0 + 2 * r1] ; 4 bytes from the row at r0 + 2 * stride
1196 punpcklbw m6, m4, m5 ; m6 = low bytes of m4 and m5 interleaved
1201 lea r0, [r0 + 4 * r1] ; src += 4 rows
1205 punpcklbw m7, m5, m6
1217 punpcklbw m3, m6, m7
1222 movd m3, [r0 + 2 * r1] ; 4 bytes from the row at (advanced) r0 + 2 * stride
1236 pextrw [r2 + r3], m2, 2 ; word 2 of m2 (2 bytes) -> dst row 1
1237 lea r2, [r2 + 2 * r3] ; dst += 2 rows
1239 pextrw [r2 + r3], m2, 6 ; word 6 of m2 (2 bytes) -> next odd dst row
1241 lea r2, [r2 + 2 * r3] ; dst += 2 rows
; Instantiations (width, height):
1248 FILTER_V4_W2_H4 2, 8
1250 FILTER_V4_W2_H4 2, 16
1252 ;-----------------------------------------------------------------------------
1253 ; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1254 ;-----------------------------------------------------------------------------
; Vertical 4-tap filter, 4x2 block. x86inc cglobal: 4 args loaded, 6 GPRs, 6 vector regs.
1256 cglobal interp_4tap_vert_pp_4x2, 4, 6, 6
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
1262 lea r5, [tab_ChromaCoeff]
1263 movd m0, [r5 + r4 * 4]
1265 movd m0, [tab_ChromaCoeff + r4 * 4]
1269 lea r5, [r0 + 2 * r1] ; r5 = r0 + 2 rows
1277 punpcklbw m1, m4, m5 ; m1 = low bytes of m4 and m5 interleaved
1282 movd m1, [r0 + 4 * r1] ; 4 bytes from the row at r0 + 4 * stride
1292 pmulhrsw m2, [tab_c_512] ; rounded >> 6 of the word sums
1295 pextrd [r2 + r3], m2, 1 ; dword 1 of m2 (4 bytes) -> dst row 1
1299 ;-----------------------------------------------------------------------------
1300 ; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1301 ;-----------------------------------------------------------------------------
; Vertical 4-tap filter, 4x4 block. x86inc cglobal: 4 args loaded, 6 GPRs, 8 vector regs.
1303 cglobal interp_4tap_vert_pp_4x4, 4, 6, 8
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
1309 lea r5, [tab_ChromaCoeff]
1310 movd m0, [r5 + r4 * 4]
1312 movd m0, [tab_ChromaCoeff + r4 * 4]
1316 mova m1, [tab_c_512] ; words of 512 (pmulhrsw multiplier = rounded >> 6)
1317 lea r5, [r0 + 4 * r1] ; r5 = r0 + 4 rows
1322 movd m4, [r0 + 2 * r1] ; 4 bytes from the row at r0 + 2 * stride
1326 punpcklbw m6, m4, m5 ; m6 = low bytes of m4 and m5 interleaved
1334 punpcklbw m7, m5, m6
1346 punpcklbw m3, m6, m7
1351 movd m3, [r5 + 2 * r1] ; 4 bytes from the row at r5 + 2 * stride
1365 pextrd [r2 + r3], m2, 1 ; dword 1 of m2 (4 bytes) -> dst row 1
1366 lea r2, [r2 + 2 * r3] ; dst += 2 rows
1368 pextrd [r2 + r3], m2, 3 ; dword 3 of m2 (4 bytes) -> next odd dst row
1372 ;-----------------------------------------------------------------------------
1373 ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1374 ;-----------------------------------------------------------------------------
; FILTER_V4_W4_H4 width, height
; Emits interp_4tap_vert_pp_<width>x<height>; instantiated below for width 4.
1375 %macro FILTER_V4_W4_H4 2
; x86inc cglobal: 4 args loaded, 6 GPRs, 8 vector regs.
1377 cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
1383 lea r5, [tab_ChromaCoeff]
1384 movd m0, [r5 + r4 * 4]
1386 movd m0, [tab_ChromaCoeff + r4 * 4]
1391 mova m1, [tab_c_512] ; words of 512 (pmulhrsw multiplier = rounded >> 6)
1400 movd m4, [r0 + 2 * r1] ; 4 bytes from the row at r0 + 2 * stride
1404 punpcklbw m6, m4, m5 ; m6 = low bytes of m4 and m5 interleaved
1409 lea r0, [r0 + 4 * r1] ; src += 4 rows
1413 punpcklbw m7, m5, m6
1425 punpcklbw m3, m6, m7
1430 movd m3, [r0 + 2 * r1] ; 4 bytes from the row at (advanced) r0 + 2 * stride
1443 pextrd [r2 + r3], m2, 1 ; dword 1 of m2 (4 bytes) -> dst row 1
1444 lea r2, [r2 + 2 * r3] ; dst += 2 rows
1446 pextrd [r2 + r3], m2, 3 ; dword 3 of m2 (4 bytes) -> next odd dst row
1448 lea r2, [r2 + 2 * r3] ; dst += 2 rows
; Instantiations (width, height):
1455 FILTER_V4_W4_H4 4, 8
1456 FILTER_V4_W4_H4 4, 16
1458 FILTER_V4_W4_H4 4, 32
; FILTER_V4_W8_H2 .. FILTER_V4_W8_H5: parameterless row steps for the 8-wide
; vertical 4-tap filter. Each one interleaves a different pair of the row
; registers m0..m3 into m7, rotating through (m3,m0), (m0,m1), (m1,m2), (m2,m3).
1460 %macro FILTER_V4_W8_H2 0
1462 punpcklbw m7, m3, m0
1473 %macro FILTER_V4_W8_H3 0
1475 punpcklbw m7, m0, m1
1486 %macro FILTER_V4_W8_H4 0
1488 punpcklbw m7, m1, m2
1499 %macro FILTER_V4_W8_H5 0
1501 punpcklbw m7, m2, m3
; FILTER_V4_W8_8x2 width, height
; Body for the first two output rows of the 8-wide vertical 4-tap filter;
; reused as the first stage of FILTER_V4_W8_8x4.
1512 %macro FILTER_V4_W8_8x2 2
1514 movq m0, [r0 + 4 * r1] ; 8 bytes from the row at r0 + 4 * stride
; FILTER_V4_W8_8x4 width, height
; Extends FILTER_V4_W8_8x2 by two more output rows; reused by FILTER_V4_W8_8x6.
1521 %macro FILTER_V4_W8_8x4 2
1522 FILTER_V4_W8_8x2 %1, %2 ; rows 0-1
1524 lea r6, [r0 + 4 * r1] ; r6 = r0 + 4 rows
1529 movh [r2 + 2 * r3], m2 ; 8 bytes -> dst row 2
1532 movq m2, [r6 + 2 * r1] ; 8 bytes from the row at r6 + 2 * stride
1536 lea r5, [r2 + 2 * r3] ; r5 = dst + 2 rows
; FILTER_V4_W8_8x6 width, height
; Extends FILTER_V4_W8_8x4 by two more output rows.
1540 %macro FILTER_V4_W8_8x6 2
1541 FILTER_V4_W8_8x4 %1, %2 ; rows 0-3
1543 lea r6, [r6 + 2 * r1] ; r6 += 2 rows
1548 movh [r2 + 4 * r3], m0 ; 8 bytes -> dst row 4
1551 movq m0, [r0 + 8 * r1] ; 8 bytes from the row at r0 + 8 * stride
1555 lea r5, [r2 + 4 * r3] ; r5 = dst + 4 rows
1559 ;-----------------------------------------------------------------------------
1560 ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1561 ;-----------------------------------------------------------------------------
; FILTER_V4_W8 width, height
; Common entry/setup for interp_4tap_vert_pp_<width>x<height> (8-wide blocks):
; loads the first source rows and splits the four taps into two pair registers.
1562 %macro FILTER_V4_W8 2
; x86inc cglobal: 4 args loaded, 7 GPRs, 8 vector regs.
1564 cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
1571 movq m2, [r0 + 2 * r1] ; 8 bytes from the row at r0 + 2 * stride
1572 lea r5, [r0 + 2 * r1] ; r5 = r0 + 2 rows
1576 punpcklbw m4, m2, m3 ; m4 = low bytes of m2 and m3 interleaved
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
1579 lea r6, [tab_ChromaCoeff]
1580 movd m5, [r6 + r4 * 4]
1582 movd m5, [tab_ChromaCoeff + r4 * 4]
1585 pshufb m6, m5, [tab_Vm] ; m6 = coefficient bytes (0,1) repeated
1588 pshufb m5, [tab_Vm + 16] ; m5 = coefficient bytes (2,3) repeated
1593 mova m4, [tab_c_512] ; words of 512 (pmulhrsw multiplier = rounded >> 6)
1600 ;-----------------------------------------------------------------------------
1601 ; void interp_4tap_vert_pp_8x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1602 ;-----------------------------------------------------------------------------
1603 FILTER_V4_W8_8x2 8, 2 ; expand the 2-row body
1607 ;-----------------------------------------------------------------------------
1608 ; void interp_4tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1609 ;-----------------------------------------------------------------------------
1610 FILTER_V4_W8_8x4 8, 4 ; expand the 4-row body
1614 ;-----------------------------------------------------------------------------
1615 ; void interp_4tap_vert_pp_8x6(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1616 ;-----------------------------------------------------------------------------
1617 FILTER_V4_W8_8x6 8, 6 ; expand the 6-row body
1621 ;-------------------------------------------------------------------------------------------------------------
1622 ; void interp_4tap_vert_ps_4x2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
1623 ;-------------------------------------------------------------------------------------------------------------
; Vertical 4-tap filter, 4x2 block, 16-bit output (int16_t *dst per the
; prototype above). x86inc cglobal: 4 args loaded, 6 GPRs, 6 vector regs.
1625 cglobal interp_4tap_vert_ps_4x2, 4, 6, 6
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
1632 lea r5, [tab_ChromaCoeff]
1633 movd m0, [r5 + r4 * 4]
1635 movd m0, [tab_ChromaCoeff + r4 * 4]
1642 lea r5, [r0 + 2 * r1] ; r5 = r0 + 2 rows
1647 punpcklbw m1, m4, m5 ; m1 = low bytes of m4 and m5 interleaved
1652 movd m1, [r0 + 4 * r1] ; 4 bytes from the row at r0 + 4 * stride
1664 movhps [r2 + r3], m2 ; upper 8 bytes of m2 (four 16-bit values) -> dst row 1
1668 ;-------------------------------------------------------------------------------------------------------------
1669 ; void interp_4tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
1670 ;-------------------------------------------------------------------------------------------------------------
; Vertical 4-tap filter, 4x4 block, 16-bit output (int16_t *dst per the
; prototype above). x86inc cglobal: 4 args loaded, 6 GPRs, 7 vector regs.
1672 cglobal interp_4tap_vert_ps_4x4, 4, 6, 7
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
1679 lea r5, [tab_ChromaCoeff]
1680 movd m0, [r5 + r4 * 4]
1682 movd m0, [tab_ChromaCoeff + r4 * 4]
1688 lea r5, [r0 + 4 * r1] ; r5 = r0 + 4 rows
1692 movd m4, [r0 + 2 * r1] ; 4 bytes from the row at r0 + 2 * stride
1696 punpcklbw m6, m4, m5 ; m6 = low bytes of m4 and m5 interleaved
1704 punpcklbw m1, m5, m6
1715 movhps [r2 + r3], m2 ; upper 8 bytes of m2 (four 16-bit values) -> dst row 1
1720 punpcklbw m3, m6, m2
1725 movd m3, [r5 + 2 * r1] ; 4 bytes from the row at r5 + 2 * stride
1736 lea r2, [r2 + 2 * r3] ; dst += 2 rows
1738 movhps [r2 + r3], m4 ; upper 8 bytes of m4 -> next odd dst row
1742 ;---------------------------------------------------------------------------------------------------------------
1743 ; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
1744 ;---------------------------------------------------------------------------------------------------------------
; FILTER_V_PS_W4_H4 width, height
; Emits interp_4tap_vert_ps_<width>x<height>; instantiated below for width 4.
1745 %macro FILTER_V_PS_W4_H4 2
; x86inc cglobal: 4 args loaded, 6 GPRs, 8 vector regs.
1747 cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
1754 lea r5, [tab_ChromaCoeff]
1755 movd m0, [r5 + r4 * 4]
1757 movd m0, [tab_ChromaCoeff + r4 * 4]
1770 movd m4, [r0 + 2 * r1] ; 4 bytes from the row at r0 + 2 * stride
1774 punpcklbw m6, m4, m5 ; m6 = low bytes of m4 and m5 interleaved
1779 lea r0, [r0 + 4 * r1] ; src += 4 rows
1783 punpcklbw m7, m5, m6
1792 movhps [r2 + r3], m2 ; upper 8 bytes of m2 (four 16-bit values) -> dst row 1
1797 punpcklbw m3, m6, m2
1802 movd m3, [r0 + 2 * r1] ; 4 bytes from the row at (advanced) r0 + 2 * stride
1813 lea r2, [r2 + 2 * r3] ; dst += 2 rows
1815 movhps [r2 + r3], m4 ; upper 8 bytes of m4 -> next odd dst row
1817 lea r2, [r2 + 2 * r3] ; dst += 2 rows
; Instantiations (width, height):
1824 FILTER_V_PS_W4_H4 4, 8
1825 FILTER_V_PS_W4_H4 4, 16
1827 FILTER_V_PS_W4_H4 4, 32
1829 ;--------------------------------------------------------------------------------------------------------------
1830 ; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
1831 ;--------------------------------------------------------------------------------------------------------------
; FILTER_V_PS_W8_H8_H16_H2 width, height
; Emits interp_4tap_vert_ps_<width>x<height> for 8-wide blocks, advancing
; source and destination two rows at a time.
1832 %macro FILTER_V_PS_W8_H8_H16_H2 2
; x86inc cglobal: 4 args loaded, 6 GPRs, 7 vector regs.
1834 cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 7
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
1841 lea r5, [tab_ChromaCoeff]
1842 movd m5, [r5 + r4 * 4]
1844 movd m5, [tab_ChromaCoeff + r4 * 4]
1847 pshufb m6, m5, [tab_Vm] ; m6 = coefficient bytes (0,1) repeated
1848 pshufb m5, [tab_Vm + 16] ; m5 = coefficient bytes (2,3) repeated
1857 movq m2, [r0 + 2 * r1] ; 8 bytes from the row at r0 + 2 * stride
1872 movq m0, [r0 + 4 * r1] ; 8 bytes from the row at r0 + 4 * stride
1884 lea r0, [r0 + 2 * r1] ; src += 2 rows
1885 lea r2, [r2 + 2 * r3] ; dst += 2 rows
; Instantiations (width, height):
1893 FILTER_V_PS_W8_H8_H16_H2 8, 2
1894 FILTER_V_PS_W8_H8_H16_H2 8, 4
1895 FILTER_V_PS_W8_H8_H16_H2 8, 6
1897 FILTER_V_PS_W8_H8_H16_H2 8, 12
1898 FILTER_V_PS_W8_H8_H16_H2 8, 64
1900 ;--------------------------------------------------------------------------------------------------------------
1901 ; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
1902 ;--------------------------------------------------------------------------------------------------------------
; FILTER_V_PS_W8_H8_H16_H32 width, height
; Emits interp_4tap_vert_ps_<width>x<height> for 8-wide blocks, advancing the
; source four rows at a time.
1903 %macro FILTER_V_PS_W8_H8_H16_H32 2
; x86inc cglobal: 4 args loaded, 6 GPRs, 8 vector regs.
1905 cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
1912 lea r5, [tab_ChromaCoeff]
1913 movd m5, [r5 + r4 * 4]
1915 movd m5, [tab_ChromaCoeff + r4 * 4]
1918 pshufb m6, m5, [tab_Vm] ; m6 = coefficient bytes (0,1) repeated
1919 pshufb m5, [tab_Vm + 16] ; m5 = coefficient bytes (2,3) repeated
1928 movq m2, [r0 + 2 * r1] ; 8 bytes from the row at r0 + 2 * stride
1936 pmaddubsw m7, m2, m5 ; multiply-add byte pairs of m2 with the (2,3) coefficient pair
1943 lea r0, [r0 + 4 * r1] ; src += 4 rows
1949 pmaddubsw m7, m3, m5 ; multiply-add byte pairs of m3 with the (2,3) coefficient pair
1966 lea r2, [r2 + 2 * r3] ; dst += 2 rows
1969 movq m2, [r0 + 2 * r1] ; 8 bytes from the row at (advanced) r0 + 2 * stride
1981 lea r2, [r2 + 2 * r3] ; dst += 2 rows
; Instantiations (width, height):
1988 FILTER_V_PS_W8_H8_H16_H32 8, 8
1989 FILTER_V_PS_W8_H8_H16_H32 8, 16
1990 FILTER_V_PS_W8_H8_H16_H32 8, 32
1992 ;------------------------------------------------------------------------------------------------------------
1993 ; void interp_4tap_vert_ps_6x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
1994 ;------------------------------------------------------------------------------------------------------------
; FILTER_V_PS_W6 width, height
; Emits interp_4tap_vert_ps_6x<height>; only %2 is used in the function name.
1995 %macro FILTER_V_PS_W6 2
; x86inc cglobal: 4 args loaded, 6 GPRs, 8 vector regs.
1997 cglobal interp_4tap_vert_ps_6x%2, 4, 6, 8
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
2004 lea r5, [tab_ChromaCoeff]
2005 movd m5, [r5 + r4 * 4]
2007 movd m5, [tab_ChromaCoeff + r4 * 4]
2010 pshufb m6, m5, [tab_Vm] ; m6 = coefficient bytes (0,1) repeated
2011 pshufb m5, [tab_Vm + 16] ; m5 = coefficient bytes (2,3) repeated
2019 movq m2, [r0 + 2 * r1] ; 8 bytes from the row at r0 + 2 * stride
2027 pmaddubsw m7, m2, m5 ; multiply-add byte pairs of m2 with the (2,3) coefficient pair
2036 lea r0, [r0 + 4 * r1] ; src += 4 rows
2041 pmaddubsw m7, m3, m5 ; multiply-add byte pairs of m3 with the (2,3) coefficient pair
2048 movd [r2 + r3 + 8], m1 ; 4 bytes at byte offset 8 of dst row 1 (16-bit outputs 4-5)
2059 lea r2,[r2 + 2 * r3] ; dst += 2 rows
2064 movq m2,[r0 + 2 * r1] ; 8 bytes from the row at (advanced) r0 + 2 * stride
2075 movd [r2 + r3 + 8], m3 ; 4 bytes at byte offset 8 of the next odd dst row
2077 lea r2, [r2 + 2 * r3] ; dst += 2 rows
; Instantiation (width, height):
2085 FILTER_V_PS_W6 6, 16
2087 ;---------------------------------------------------------------------------------------------------------------
2088 ; void interp_4tap_vert_ps_12x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2089 ;---------------------------------------------------------------------------------------------------------------
; FILTER_V_PS_W12 width, height
; Emits interp_4tap_vert_ps_12x<height>; only %2 is used in the function name.
2090 %macro FILTER_V_PS_W12 2
; x86inc cglobal: 4 args loaded, 6 GPRs, 8 vector regs.
2092 cglobal interp_4tap_vert_ps_12x%2, 4, 6, 8
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
2099 lea r5, [tab_ChromaCoeff]
2100 movd m0, [r5 + r4 * 4]
2102 movd m0, [tab_ChromaCoeff + r4 * 4]
2105 pshufb m1, m0, [tab_Vm] ; m1 = coefficient bytes (0,1) repeated
2106 pshufb m0, [tab_Vm + 16] ; m0 = coefficient bytes (2,3) repeated
2114 punpcklbw m4, m2, m3 ; low 8 bytes of m2 and m3 interleaved
2120 lea r0, [r0 + 2 * r1] ; src += 2 rows
2124 punpcklbw m6, m5, m7 ; low 8 bytes of m5 and m7 interleaved
2128 punpckhbw m6, m5, m7 ; high 8 bytes of m5 and m7 interleaved
2140 punpcklbw m4, m3, m5
2146 movu m2, [r0 + 2 * r1] ; 16 bytes from the row at (advanced) r0 + 2 * stride
2148 punpcklbw m5, m7, m2
2161 movh [r2 + r3 + 16], m3 ; 8 bytes at byte offset 16 of dst row 1 (16-bit outputs 8-11)
2163 lea r2, [r2 + 2 * r3] ; dst += 2 rows
; Instantiations (width, height):
2170 FILTER_V_PS_W12 12, 16
2171 FILTER_V_PS_W12 12, 32
2173 ;---------------------------------------------------------------------------------------------------------------
2174 ; void interp_4tap_vert_ps_16x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2175 ;---------------------------------------------------------------------------------------------------------------
; FILTER_V_PS_W16 width, height
; Emits interp_4tap_vert_ps_<width>x<height>; instantiated below for width 16.
2176 %macro FILTER_V_PS_W16 2
; x86inc cglobal: 4 args loaded, 6 GPRs, 8 vector regs.
2178 cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
2185 lea r5, [tab_ChromaCoeff]
2186 movd m0, [r5 + r4 * 4]
2188 movd m0, [tab_ChromaCoeff + r4 * 4]
2191 pshufb m1, m0, [tab_Vm] ; m1 = coefficient bytes (0,1) repeated
2192 pshufb m0, [tab_Vm + 16] ; m0 = coefficient bytes (2,3) repeated
2199 punpcklbw m4, m2, m3 ; low 8 bytes of m2 and m3 interleaved
2205 lea r0, [r0 + 2 * r1] ; src += 2 rows
2209 punpcklbw m6, m5, m7 ; low 8 bytes of m5 and m7 interleaved
2213 punpckhbw m6, m5, m7 ; high 8 bytes of m5 and m7 interleaved
2225 punpcklbw m4, m3, m5
2231 movu m5, [r0 + 2 * r1] ; 16 bytes from the row at (advanced) r0 + 2 * stride
2233 punpcklbw m2, m7, m5
2246 movu [r2 + r3 + 16], m3 ; 16 bytes at byte offset 16 of dst row 1 (16-bit outputs 8-15)
2248 lea r2, [r2 + 2 * r3] ; dst += 2 rows
; Instantiations (width, height):
2255 FILTER_V_PS_W16 16, 4
2256 FILTER_V_PS_W16 16, 8
2257 FILTER_V_PS_W16 16, 12
2258 FILTER_V_PS_W16 16, 16
2259 FILTER_V_PS_W16 16, 32
2261 FILTER_V_PS_W16 16, 24
2262 FILTER_V_PS_W16 16, 64
2264 ;--------------------------------------------------------------------------------------------------------------
2265 ; void interp_4tap_vert_ps_24x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2266 ;--------------------------------------------------------------------------------------------------------------
; FILTER_V4_PS_W24 width, height
; Emits interp_4tap_vert_ps_24x<height>; only %2 is used in the function name.
; Source columns 0-15 are read with 16-byte loads, columns 16-23 with 8-byte
; loads at byte offset 16.
2267 %macro FILTER_V4_PS_W24 2
; x86inc cglobal: 4 args loaded, 6 GPRs, 8 vector regs.
2269 cglobal interp_4tap_vert_ps_24x%2, 4, 6, 8
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
2276 lea r5, [tab_ChromaCoeff]
2277 movd m0, [r5 + r4 * 4]
2279 movd m0, [tab_ChromaCoeff + r4 * 4]
2282 pshufb m1, m0, [tab_Vm] ; m1 = coefficient bytes (0,1) repeated
2283 pshufb m0, [tab_Vm + 16] ; m0 = coefficient bytes (2,3) repeated
2291 punpcklbw m4, m2, m3 ; low 8 bytes of m2 and m3 interleaved
2297 lea r5, [r0 + 2 * r1] ; r5 = r0 + 2 rows
2302 punpcklbw m6, m5, m7 ; low 8 bytes of m5 and m7 interleaved
2306 punpckhbw m6, m5, m7 ; high 8 bytes of m5 and m7 interleaved
2318 punpcklbw m4, m3, m5
2324 movu m2, [r5 + 2 * r1] ; 16 bytes from the row at r5 + 2 * stride
2326 punpcklbw m5, m7, m2
2339 movu [r2 + r3 + 16], m3 ; 16 bytes at byte offset 16 of dst row 1 (16-bit outputs 8-15)
2342 movq m3, [r0 + r1 + 16] ; 8 source bytes at column 16 of the row at r0 + stride
2344 movq m5, [r5 + r1 + 16] ; 8 source bytes at column 16 of the row at r5 + stride
2347 punpcklbw m7, m4, m5
2357 movq m2, [r5 + 2 * r1 + 16] ; 8 source bytes at column 16 of the row at r5 + 2 * stride
2368 movu [r2 + r3 + 32], m3 ; 16 bytes at byte offset 32 of dst row 1 (16-bit outputs 16-23)
2371 lea r2, [r2 + 2 * r3] ; dst += 2 rows
; Instantiations (width, height):
2378 FILTER_V4_PS_W24 24, 32
2380 FILTER_V4_PS_W24 24, 64
2382 ;---------------------------------------------------------------------------------------------------------------
2383 ; void interp_4tap_vert_ps_32x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2384 ;---------------------------------------------------------------------------------------------------------------
; FILTER_V_PS_W32 width, height
; Emits interp_4tap_vert_ps_<width>x<height>; instantiated below for width 32.
; The second group of 16 source columns is read with 16-byte loads at byte offset 16.
2385 %macro FILTER_V_PS_W32 2
; x86inc cglobal: 4 args loaded, 6 GPRs, 8 vector regs.
2387 cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
; The lea+movd pair and the direct movd below are two forms of one load
; (presumably PIC vs. non-PIC -- TODO confirm); r4 selects a 4-byte entry.
2394 lea r5, [tab_ChromaCoeff]
2395 movd m0, [r5 + r4 * 4]
2397 movd m0, [tab_ChromaCoeff + r4 * 4]
2400 pshufb m1, m0, [tab_Vm] ; m1 = coefficient bytes (0,1) repeated
2401 pshufb m0, [tab_Vm + 16] ; m0 = coefficient bytes (2,3) repeated
2411 punpcklbw m4, m2, m3 ; low 8 bytes of m2 and m3 interleaved
2417 lea r5, [r0 + 2 * r1] ; r5 = r0 + 2 rows
2421 punpcklbw m6, m3, m5 ; low 8 bytes of m3 and m5 interleaved
2437 movu m3, [r0 + r1 + 16] ; 16 source bytes at column 16 of the row at r0 + stride
2439 punpcklbw m4, m2, m3
2446 movu m5, [r5 + r1 + 16] ; 16 source bytes at column 16 of the row at r5 + stride
2448 punpcklbw m6, m3, m5
; Instantiations (width, height):
2471 FILTER_V_PS_W32 32, 8
2472 FILTER_V_PS_W32 32, 16
2473 FILTER_V_PS_W32 32, 24
2474 FILTER_V_PS_W32 32, 32
2476 FILTER_V_PS_W32 32, 48
2477 FILTER_V_PS_W32 32, 64
2479 ;-----------------------------------------------------------------------------
2480 ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2481 ;-----------------------------------------------------------------------------
2482 %macro FILTER_V4_W8_H8_H16_H32 2
2484 cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
2490 lea r5, [tab_ChromaCoeff]
2491 movd m5, [r5 + r4 * 4]
2493 movd m5, [tab_ChromaCoeff + r4 * 4]
2496 pshufb m6, m5, [tab_Vm]
2497 pshufb m5, [tab_Vm + 16]
2498 mova m4, [tab_c_512]
2506 movq m2, [r0 + 2 * r1]
2514 pmaddubsw m7, m2, m5
2522 lea r0, [r0 + 4 * r1]
2528 pmaddubsw m7, m3, m5
2547 movq m7, [r0 + 2 * r1]
2558 lea r2, [r2 + 2 * r3]
2560 movhps [r2 + r3], m2
2562 lea r2, [r2 + 2 * r3]
2569 FILTER_V4_W8_H8_H16_H32 8, 8
2570 FILTER_V4_W8_H8_H16_H32 8, 16
2571 FILTER_V4_W8_H8_H16_H32 8, 32
2573 FILTER_V4_W8_H8_H16_H32 8, 12
2574 FILTER_V4_W8_H8_H16_H32 8, 64
2577 ;-----------------------------------------------------------------------------
2578 ;void interp_4tap_vert_pp_6x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2579 ;-----------------------------------------------------------------------------
2580 %macro FILTER_V4_W6_H4 2
2582 cglobal interp_4tap_vert_pp_6x%2, 4, 6, 8
2588 lea r5, [tab_ChromaCoeff]
2589 movd m5, [r5 + r4 * 4]
2591 movd m5, [tab_ChromaCoeff + r4 * 4]
2594 pshufb m6, m5, [tab_Vm]
2595 pshufb m5, [tab_Vm + 16]
2596 mova m4, [tab_c_512]
2604 movq m2, [r0 + 2 * r1]
2612 pmaddubsw m7, m2, m5
2619 pextrw [r2 + 4], m0, 2
2621 lea r0, [r0 + 4 * r1]
2627 pmaddubsw m7, m3, m5
2634 pextrw [r2 + r3 + 4], m1, 2
2637 punpcklbw m7, m0, m1
2646 lea r2, [r2 + 2 * r3]
2648 pextrw [r2 + 4], m2, 2
2650 movq m2, [r0 + 2 * r1]
2662 pextrw [r2 + r3 + 4], m3, 2
2664 lea r2, [r2 + 2 * r3]
2671 FILTER_V4_W6_H4 6, 8
2673 FILTER_V4_W6_H4 6, 16
2675 ;-----------------------------------------------------------------------------
2676 ; void interp_4tap_vert_pp_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2677 ;-----------------------------------------------------------------------------
2678 %macro FILTER_V4_W12_H2 2
2680 cglobal interp_4tap_vert_pp_12x%2, 4, 6, 8
2686 lea r5, [tab_ChromaCoeff]
2687 movd m0, [r5 + r4 * 4]
2689 movd m0, [tab_ChromaCoeff + r4 * 4]
2692 pshufb m1, m0, [tab_Vm]
2693 pshufb m0, [tab_Vm + 16]
2701 punpcklbw m4, m2, m3
2707 lea r0, [r0 + 2 * r1]
2711 punpcklbw m6, m5, m7
2715 punpckhbw m6, m5, m7
2719 mova m6, [tab_c_512]
2727 pextrd [r2 + 8], m4, 2
2729 punpcklbw m4, m3, m5
2735 movu m5, [r0 + 2 * r1]
2737 punpcklbw m2, m7, m5
2752 pextrd [r2 + r3 + 8], m4, 2
2754 lea r2, [r2 + 2 * r3]
2761 FILTER_V4_W12_H2 12, 16
2763 FILTER_V4_W12_H2 12, 32
2765 ;-----------------------------------------------------------------------------
2766 ; void interp_4tap_vert_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2767 ;-----------------------------------------------------------------------------
2768 %macro FILTER_V4_W16_H2 2
2770 cglobal interp_4tap_vert_pp_16x%2, 4, 6, 8
2776 lea r5, [tab_ChromaCoeff]
2777 movd m0, [r5 + r4 * 4]
2779 movd m0, [tab_ChromaCoeff + r4 * 4]
2782 pshufb m1, m0, [tab_Vm]
2783 pshufb m0, [tab_Vm + 16]
2791 punpcklbw m4, m2, m3
2797 lea r0, [r0 + 2 * r1]
2801 punpckhbw m7, m5, m6
2805 punpcklbw m7, m5, m6
2809 mova m7, [tab_c_512]
2818 punpcklbw m4, m3, m5
2824 movu m5, [r0 + 2 * r1]
2826 punpcklbw m2, m6, m5
2842 lea r2, [r2 + 2 * r3]
2849 FILTER_V4_W16_H2 16, 4
2850 FILTER_V4_W16_H2 16, 8
2851 FILTER_V4_W16_H2 16, 12
2852 FILTER_V4_W16_H2 16, 16
2853 FILTER_V4_W16_H2 16, 32
2855 FILTER_V4_W16_H2 16, 24
2856 FILTER_V4_W16_H2 16, 64
2858 ;-----------------------------------------------------------------------------
2859 ;void interp_4tap_vert_pp_24x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2860 ;-----------------------------------------------------------------------------
2861 %macro FILTER_V4_W24 2
2863 cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8
2869 lea r5, [tab_ChromaCoeff]
2870 movd m0, [r5 + r4 * 4]
2872 movd m0, [tab_ChromaCoeff + r4 * 4]
2875 pshufb m1, m0, [tab_Vm]
2876 pshufb m0, [tab_Vm + 16]
2884 punpcklbw m4, m2, m3
2890 lea r5, [r0 + 2 * r1]
2894 punpcklbw m6, m5, m7
2898 punpckhbw m6, m5, m7
2902 mova m6, [tab_c_512]
2911 punpcklbw m4, m3, m5
2917 movu m2, [r5 + 2 * r1]
2919 punpcklbw m5, m7, m2
2936 movq m3, [r0 + r1 + 16]
2938 movq m5, [r5 + r1 + 16]
2950 movq m3, [r0 + r1 + 16]
2952 movq m5, [r5 + r1 + 16]
2953 movq m7, [r5 + 2 * r1 + 16]
2967 movhps [r2 + r3 + 16], m2
2970 lea r2, [r2 + 2 * r3]
2977 FILTER_V4_W24 24, 32
2979 FILTER_V4_W24 24, 64
2981 ;-----------------------------------------------------------------------------
2982 ; void interp_4tap_vert_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2983 ;-----------------------------------------------------------------------------
2984 %macro FILTER_V4_W32 2
2986 cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
2992 lea r5, [tab_ChromaCoeff]
2993 movd m0, [r5 + r4 * 4]
2995 movd m0, [tab_ChromaCoeff + r4 * 4]
2998 pshufb m1, m0, [tab_Vm]
2999 pshufb m0, [tab_Vm + 16]
3001 mova m7, [tab_c_512]
3009 punpcklbw m4, m2, m3
3015 lea r5, [r0 + 2 * r1]
3019 punpcklbw m6, m3, m5
3036 movu m3, [r0 + r1 + 16]
3038 punpcklbw m4, m2, m3
3045 movu m5, [r5 + r1 + 16]
3047 punpcklbw m6, m3, m5
3072 FILTER_V4_W32 32, 16
3073 FILTER_V4_W32 32, 24
3074 FILTER_V4_W32 32, 32
3076 FILTER_V4_W32 32, 48
3077 FILTER_V4_W32 32, 64
3080 ;-----------------------------------------------------------------------------
3081 ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3082 ;-----------------------------------------------------------------------------
3083 %macro FILTER_V4_W16n_H2 2
3085 cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
3091 lea r5, [tab_ChromaCoeff]
3092 movd m0, [r5 + r4 * 4]
3094 movd m0, [tab_ChromaCoeff + r4 * 4]
3097 pshufb m1, m0, [tab_Vm]
3098 pshufb m0, [tab_Vm + 16]
3111 punpcklbw m4, m2, m3
3117 lea r5, [r0 + 2 * r1]
3121 punpckhbw m7, m5, m6
3125 punpcklbw m7, m5, m6
3129 mova m7, [tab_c_512]
3138 punpcklbw m4, m3, m5
3144 movu m5, [r5 + 2 * r1]
3146 punpcklbw m2, m6, m5
3167 lea r0, [r0 + r1 * 2 - %1]
3168 lea r2, [r2 + r3 * 2 - %1]
3175 FILTER_V4_W16n_H2 64, 64
3176 FILTER_V4_W16n_H2 64, 32
3177 FILTER_V4_W16n_H2 64, 48
3178 FILTER_V4_W16n_H2 48, 64
3179 FILTER_V4_W16n_H2 64, 16
3182 ;-----------------------------------------------------------------------------
3183 ; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
3184 ;-----------------------------------------------------------------------------
3186 cglobal luma_p2s, 3, 7, 6
3188 ; load width and height
3193 mova m4, [tab_c_128]
3194 mova m5, [tab_c_64_n64]
3210 movh m2, [r6 + r1 * 2]
3214 lea r6, [r6 + r1 * 2]
3222 movu [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
3223 movu [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
3224 movu [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
3225 movu [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
3230 movh [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
3231 movh [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
3232 movh [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
3233 movh [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
3236 lea r0, [r0 + r1 * 4]
3237 add r2, FENC_STRIDE * 8
3244 %macro PROCESS_LUMA_W4_4R 0
; Accumulates the 8-tap vertical sums for four 4-pixel-wide output rows.
;   m4 = rows 1-2 (two rows packed side by side), m5 = rows 3-4.
;   r6 = base of the coefficient rows; tap pair k is read at [r6 + k * 16].
;   r0 = source pointer, advanced by 2 * r1 four times; the last source row
;        is fetched from [r0 + 2 * r1] without moving r0 again.
;   m0, m1 and m2 are overwritten as scratch.
; In the comments below [a b] means source rows a and b byte-interleaved,
; and a+b means the pmaddubsw products of that pair already summed.
3247 punpcklbw m2, m0, m1 ; m2=[0 1]
3249 lea r0, [r0 + 2 * r1]
3251 punpcklbw m1, m0 ; m1=[1 2]
3252 punpcklqdq m2, m1 ; m2=[0 1 1 2]
3253 pmaddubsw m4, m2, [r6 + 0 * 16] ; m4=[0+1 1+2]
3256 punpcklbw m5, m0, m1 ; m5=[2 3]
3257 lea r0, [r0 + 2 * r1]
3259 punpcklbw m1, m0 ; m1=[3 4]
3260 punpcklqdq m5, m1 ; m5=[2 3 3 4]
3261 pmaddubsw m2, m5, [r6 + 1 * 16] ; m2=[2+3 3+4]
3262 paddw m4, m2 ; m4=[0+1+2+3 1+2+3+4] Row1-2
3263 pmaddubsw m5, [r6 + 0 * 16] ; m5=[2+3 3+4] Row3-4
3266 punpcklbw m2, m0, m1 ; m2=[4 5]
3267 lea r0, [r0 + 2 * r1]
3269 punpcklbw m1, m0 ; m1=[5 6]
3270 punpcklqdq m2, m1 ; m2=[4 5 5 6]
3271 pmaddubsw m1, m2, [r6 + 2 * 16] ; m1=[4+5 5+6]
3272 paddw m4, m1 ; m4=[0+1+2+3+4+5 1+2+3+4+5+6] Row1-2
3273 pmaddubsw m2, [r6 + 1 * 16] ; m2=[4+5 5+6]
3274 paddw m5, m2 ; m5=[2+3+4+5 3+4+5+6] Row3-4
3277 punpcklbw m2, m0, m1 ; m2=[6 7]
3278 lea r0, [r0 + 2 * r1]
3280 punpcklbw m1, m0 ; m1=[7 8]
3281 punpcklqdq m2, m1 ; m2=[6 7 7 8]
3282 pmaddubsw m1, m2, [r6 + 3 * 16] ; m1=[6+7 7+8]
3283 paddw m4, m1 ; m4=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8] Row1-2 end
3284 pmaddubsw m2, [r6 + 2 * 16] ; m2=[6+7 7+8]
3285 paddw m5, m2 ; m5=[2+3+4+5+6+7 3+4+5+6+7+8] Row3-4
3288 punpcklbw m2, m0, m1 ; m2=[8 9]
3289 movd m0, [r0 + 2 * r1] ; last source row; r0 is not advanced here
3290 punpcklbw m1, m0 ; m1=[9 10]
3291 punpcklqdq m2, m1 ; m2=[8 9 9 10]
3292 pmaddubsw m2, [r6 + 3 * 16] ; m2=[8+9 9+10]
3293 paddw m5, m2 ; m5=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10] Row3-4 end
3296 %macro PROCESS_LUMA_W8_4R 0
; Accumulates the 8-tap vertical sums for four 8-pixel-wide output rows.
;   m7 = Row1, m6 = Row2, m5 = Row3, m4 = Row4 (16-bit sums, paddw).
;   r6 = base of the coefficient rows; tap pair k is read at [r6 + k * 16].
;   r0 = source pointer, advanced by 2 * r1 four times; the final source row
;        is fetched from [r0 + 2 * r1] without moving r0 again.
;   m0, m1 and m2 are overwritten as scratch.
; Each interleaved row pair is multiplied by up to two tap pairs: the higher
; tap pair feeds the earlier output row, the lower one the later output row.
3300 pmaddubsw m7, m0, [r6 + 0 *16] ;m7=[0+1] Row1
3302 lea r0, [r0 + 2 * r1]
3305 pmaddubsw m6, m1, [r6 + 0 *16] ;m6=[1+2] Row2
3309 pmaddubsw m5, m0, [r6 + 0 *16] ;m5=[2+3] Row3
3310 pmaddubsw m0, [r6 + 1 * 16] ;m0=[2+3] with tap pair 1, for Row1
3311 paddw m7, m0 ;m7=[0+1+2+3] Row1
3313 lea r0, [r0 + 2 * r1]
3316 pmaddubsw m4, m1, [r6 + 0 *16] ;m4=[3+4] Row4
3317 pmaddubsw m1, [r6 + 1 * 16] ;m1=[3+4] with tap pair 1, for Row2
3318 paddw m6, m1 ;m6 = [1+2+3+4] Row2
3322 pmaddubsw m2, m0, [r6 + 1 * 16] ;m2=[4+5] with tap pair 1, for Row3
3323 pmaddubsw m0, [r6 + 2 * 16] ;m0=[4+5] with tap pair 2, for Row1
3324 paddw m7, m0 ;m7=[0+1+2+3+4+5] Row1
3325 paddw m5, m2 ;m5=[2+3+4+5] Row3
3327 lea r0, [r0 + 2 * r1]
3330 pmaddubsw m2, m1, [r6 + 1 * 16] ;m2=[5+6] with tap pair 1, for Row4
3331 pmaddubsw m1, [r6 + 2 * 16] ;m1=[5+6] with tap pair 2, for Row2
3332 paddw m6, m1 ;m6=[1+2+3+4+5+6] Row2
3333 paddw m4, m2 ;m4=[3+4+5+6] Row4
3337 pmaddubsw m2, m0, [r6 + 2 * 16] ;m2=[6+7] with tap pair 2, for Row3
3338 pmaddubsw m0, [r6 + 3 * 16] ;m0=[6+7] with tap pair 3, for Row1
3339 paddw m7, m0 ;m7=[0+1+2+3+4+5+6+7] Row1 end
3340 paddw m5, m2 ;m5=[2+3+4+5+6+7] Row3
3342 lea r0, [r0 + 2 * r1]
3345 pmaddubsw m2, m1, [r6 + 2 * 16] ;m2=[7+8] with tap pair 2, for Row4
3346 pmaddubsw m1, [r6 + 3 * 16] ;m1=[7+8] with tap pair 3, for Row2
3347 paddw m6, m1 ;m6=[1+2+3+4+5+6+7+8] Row2 end
3348 paddw m4, m2 ;m4=[3+4+5+6+7+8] Row4
3352 pmaddubsw m0, [r6 + 3 * 16] ;m0=[8+9] with tap pair 3, for Row3
3353 paddw m5, m0 ;m5=[2+3+4+5+6+7+8+9] Row3 end
3355 movq m0, [r0 + 2 * r1] ;last source row; r0 is not advanced here
3357 pmaddubsw m1, [r6 + 3 * 16] ;m1=[9+10] with tap pair 3, for Row4
3358 paddw m4, m1 ;m4=[3+4+5+6+7+8+9+10] Row4 end
3361 ;-------------------------------------------------------------------------------------------------------------
3362 ; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ; dst is int16_t* when %3 = ps
3363 ;-------------------------------------------------------------------------------------------------------------
3364 %macro FILTER_VER_LUMA_4xN 3
3366 cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6
3375 lea r5, [tab_LumaCoeffVer]
3378 lea r6, [tab_LumaCoeffVer + r4]
3382 mova m3, [tab_c_512]
3400 pextrd [r2 + r3], m4, 1
3401 lea r2, [r2 + 2 * r3]
3403 pextrd [r2 + r3], m4, 3
3409 movhps [r2 + r3], m4
3410 lea r2, [r2 + 2 * r3]
3412 movhps [r2 + r3], m5
3416 lea r2, [r2 + 2 * r3]
3424 ;-------------------------------------------------------------------------------------------------------------
3425 ; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3426 ;-------------------------------------------------------------------------------------------------------------
3427 FILTER_VER_LUMA_4xN 4, 4, pp
3429 ;-------------------------------------------------------------------------------------------------------------
3430 ; void interp_8tap_vert_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3431 ;-------------------------------------------------------------------------------------------------------------
3432 FILTER_VER_LUMA_4xN 4, 8, pp
3434 ;-------------------------------------------------------------------------------------------------------------
3435 ; void interp_8tap_vert_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3436 ;-------------------------------------------------------------------------------------------------------------
3437 FILTER_VER_LUMA_4xN 4, 16, pp
3439 ;-------------------------------------------------------------------------------------------------------------
3440 ; void interp_8tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3441 ;-------------------------------------------------------------------------------------------------------------
3442 FILTER_VER_LUMA_4xN 4, 4, ps
3444 ;-------------------------------------------------------------------------------------------------------------
3445 ; void interp_8tap_vert_ps_4x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3446 ;-------------------------------------------------------------------------------------------------------------
3447 FILTER_VER_LUMA_4xN 4, 8, ps
3449 ;-------------------------------------------------------------------------------------------------------------
3450 ; void interp_8tap_vert_ps_4x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3451 ;-------------------------------------------------------------------------------------------------------------
3452 FILTER_VER_LUMA_4xN 4, 16, ps
3454 ;-------------------------------------------------------------------------------------------------------------
3455 ; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ; dst is int16_t* when %3 = ps
3456 ;-------------------------------------------------------------------------------------------------------------
3457 %macro FILTER_VER_LUMA_8xN 3
3459 cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
3469 lea r5, [tab_LumaCoeffVer]
3472 lea r6, [tab_LumaCoeffVer + r4]
3476 mova m3, [tab_c_512]
3497 movhps [r2 + r3], m7
3498 lea r2, [r2 + 2 * r3]
3500 movhps [r2 + r3], m5
3509 lea r2, [r2 + 2 * r3]
3515 lea r2, [r2 + 2 * r3]
3523 ;-------------------------------------------------------------------------------------------------------------
3524 ; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3525 ;-------------------------------------------------------------------------------------------------------------
3526 FILTER_VER_LUMA_8xN 8, 4, pp
3528 ;-------------------------------------------------------------------------------------------------------------
3529 ; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3530 ;-------------------------------------------------------------------------------------------------------------
3531 FILTER_VER_LUMA_8xN 8, 8, pp
3533 ;-------------------------------------------------------------------------------------------------------------
3534 ; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3535 ;-------------------------------------------------------------------------------------------------------------
3536 FILTER_VER_LUMA_8xN 8, 16, pp
3538 ;-------------------------------------------------------------------------------------------------------------
3539 ; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3540 ;-------------------------------------------------------------------------------------------------------------
3541 FILTER_VER_LUMA_8xN 8, 32, pp
3543 ;-------------------------------------------------------------------------------------------------------------
3544 ; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3545 ;-------------------------------------------------------------------------------------------------------------
3546 FILTER_VER_LUMA_8xN 8, 4, ps
3548 ;-------------------------------------------------------------------------------------------------------------
3549 ; void interp_8tap_vert_ps_8x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3550 ;-------------------------------------------------------------------------------------------------------------
3551 FILTER_VER_LUMA_8xN 8, 8, ps
3553 ;-------------------------------------------------------------------------------------------------------------
3554 ; void interp_8tap_vert_ps_8x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3555 ;-------------------------------------------------------------------------------------------------------------
3556 FILTER_VER_LUMA_8xN 8, 16, ps
3558 ;-------------------------------------------------------------------------------------------------------------
3559 ; void interp_8tap_vert_ps_8x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3560 ;-------------------------------------------------------------------------------------------------------------
3561 FILTER_VER_LUMA_8xN 8, 32, ps
3563 ;-------------------------------------------------------------------------------------------------------------
3564 ; void interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ; dst is int16_t* when %3 = ps
3565 ;-------------------------------------------------------------------------------------------------------------
3566 %macro FILTER_VER_LUMA_12xN 3
3568 cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
3577 lea r5, [tab_LumaCoeffVer]
3580 lea r6, [tab_LumaCoeffVer + r4]
3584 mova m3, [tab_c_512]
3604 movhps [r2 + r3], m7
3605 lea r5, [r2 + 2 * r3]
3607 movhps [r5 + r3], m5
3616 lea r5, [r2 + 2 * r3]
3621 lea r5, [8 * r1 - 8]
3638 pextrd [r2 + r3], m4, 1
3639 lea r5, [r2 + 2 * r3]
3641 pextrd [r5 + r3], m4, 3
3647 movhps [r2 + r3], m4
3648 lea r5, [r2 + 2 * r3]
3650 movhps [r5 + r3], m5
3653 lea r5, [4 * r1 + 8]
3656 lea r2, [r2 + 4 * r3 - 8]
3658 lea r2, [r2 + 4 * r3 - 16]
3667 ;-------------------------------------------------------------------------------------------------------------
3668 ; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3669 ;-------------------------------------------------------------------------------------------------------------
3670 FILTER_VER_LUMA_12xN 12, 16, pp
3672 ;-------------------------------------------------------------------------------------------------------------
3673 ; void interp_8tap_vert_ps_12x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3674 ;-------------------------------------------------------------------------------------------------------------
3675 FILTER_VER_LUMA_12xN 12, 16, ps
3677 ;-------------------------------------------------------------------------------------------------------------
3678 ; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ; dst is int16_t* when %3 = ps
3679 ;-------------------------------------------------------------------------------------------------------------
3680 %macro FILTER_VER_LUMA 3
3682 cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-gprsize
3691 lea r5, [tab_LumaCoeffVer]
3694 lea r6, [tab_LumaCoeffVer + r4]
3698 mova m3, [tab_c_512]
3702 mov dword [rsp], %2/4
3718 movhps [r2 + r3], m7
3719 lea r5, [r2 + 2 * r3]
3721 movhps [r5 + r3], m5
3730 lea r5, [r2 + 2 * r3]
3735 lea r5, [8 * r1 - 8]
3745 lea r0, [r0 + 4 * r1 - %1]
3747 lea r2, [r2 + 4 * r3 - %1]
3749 lea r2, [r2 + 4 * r3 - 2 * %1]
3758 FILTER_VER_LUMA 16, 4, pp
3759 FILTER_VER_LUMA 16, 8, pp
3760 FILTER_VER_LUMA 16, 12, pp
3761 FILTER_VER_LUMA 16, 16, pp
3762 FILTER_VER_LUMA 16, 32, pp
3763 FILTER_VER_LUMA 16, 64, pp
3764 FILTER_VER_LUMA 24, 32, pp
3765 FILTER_VER_LUMA 32, 8, pp
3766 FILTER_VER_LUMA 32, 16, pp
3767 FILTER_VER_LUMA 32, 24, pp
3768 FILTER_VER_LUMA 32, 32, pp
3769 FILTER_VER_LUMA 32, 64, pp
3770 FILTER_VER_LUMA 48, 64, pp
3771 FILTER_VER_LUMA 64, 16, pp
3772 FILTER_VER_LUMA 64, 32, pp
3773 FILTER_VER_LUMA 64, 48, pp
3774 FILTER_VER_LUMA 64, 64, pp
3776 FILTER_VER_LUMA 16, 4, ps
3777 FILTER_VER_LUMA 16, 8, ps
3778 FILTER_VER_LUMA 16, 12, ps
3779 FILTER_VER_LUMA 16, 16, ps
3780 FILTER_VER_LUMA 16, 32, ps
3781 FILTER_VER_LUMA 16, 64, ps
3782 FILTER_VER_LUMA 24, 32, ps
3783 FILTER_VER_LUMA 32, 8, ps
3784 FILTER_VER_LUMA 32, 16, ps
3785 FILTER_VER_LUMA 32, 24, ps
3786 FILTER_VER_LUMA 32, 32, ps
3787 FILTER_VER_LUMA 32, 64, ps
3788 FILTER_VER_LUMA 48, 64, ps
3789 FILTER_VER_LUMA 64, 16, ps
3790 FILTER_VER_LUMA 64, 32, ps
3791 FILTER_VER_LUMA 64, 48, ps
3792 FILTER_VER_LUMA 64, 64, ps
3794 %macro PROCESS_LUMA_SP_W4_4R 0
; Accumulates the 8-tap vertical sums for four 4-sample-wide output rows
; from 16-bit source samples (word interleave + pmaddwd, 32-bit sums).
;   m0 = Row1, m1 = Row2, m2 = Row3, m3 = Row4.
;   r6 = base of the coefficient rows; tap pair k is read at [r6 + k * 16].
;   r0 = source pointer, advanced by 2 * r1 four times; the final source row
;        is fetched from [r0 + 2 * r1] without moving r0 again.
;   m4, m5 and m6 are overwritten as scratch.
3797 punpcklwd m0, m1 ;m0=[0 1]
3798 pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
3800 lea r0, [r0 + 2 * r1]
3802 punpcklwd m1, m4 ;m1=[1 2]
3803 pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
3806 punpcklwd m4, m5 ;m4=[2 3]
3807 pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
3808 pmaddwd m4, [r6 + 1 * 16] ;m4=[2+3] with tap pair 1, for Row1
3809 paddd m0, m4 ;m0=[0+1+2+3] Row1
3811 lea r0, [r0 + 2 * r1]
3813 punpcklwd m5, m4 ;m5=[3 4]
3814 pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
3815 pmaddwd m5, [r6 + 1 * 16] ;m5=[3+4] with tap pair 1, for Row2
3816 paddd m1, m5 ;m1 = [1+2+3+4] Row2
3819 punpcklwd m4, m5 ;m4=[4 5]
3820 pmaddwd m6, m4, [r6 + 1 * 16] ;m6=[4+5] with tap pair 1, for Row3
3821 paddd m2, m6 ;m2=[2+3+4+5] Row3
3822 pmaddwd m4, [r6 + 2 * 16] ;m4=[4+5] with tap pair 2, for Row1
3823 paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1
3825 lea r0, [r0 + 2 * r1]
3827 punpcklwd m5, m4 ;m5=[5 6]
3828 pmaddwd m6, m5, [r6 + 1 * 16] ;m6=[5+6] with tap pair 1, for Row4
3829 paddd m3, m6 ;m3=[3+4+5+6] Row4
3830 pmaddwd m5, [r6 + 2 * 16] ;m5=[5+6] with tap pair 2, for Row2
3831 paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2
3834 punpcklwd m4, m5 ;m4=[6 7]
3835 pmaddwd m6, m4, [r6 + 2 * 16] ;m6=[6+7] with tap pair 2, for Row3
3836 paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3
3837 pmaddwd m4, [r6 + 3 * 16] ;m4=[6+7] with tap pair 3, for Row1
3838 paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end
3840 lea r0, [r0 + 2 * r1]
3842 punpcklwd m5, m4 ;m5=[7 8]
3843 pmaddwd m6, m5, [r6 + 2 * 16] ;m6=[7+8] with tap pair 2, for Row4
3844 paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4
3845 pmaddwd m5, [r6 + 3 * 16] ;m5=[7+8] with tap pair 3, for Row2
3846 paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end
3849 punpcklwd m4, m5 ;m4=[8 9]
3850 pmaddwd m4, [r6 + 3 * 16] ;m4=[8+9] with tap pair 3, for Row3
3851 paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end
3853 movq m4, [r0 + 2 * r1] ;last source row; r0 is not advanced here
3854 punpcklwd m5, m4 ;m5=[9 10]
3855 pmaddwd m5, [r6 + 3 * 16] ;m5=[9+10] with tap pair 3, for Row4
3856 paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end
3859 ;--------------------------------------------------------------------------------------------------------------
3860 ; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3861 ;--------------------------------------------------------------------------------------------------------------
3862 %macro FILTER_VER_LUMA_SP 2
3864 cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize
3867 lea r5, [r1 + 2 * r1]
3872 lea r5, [tab_LumaCoeffV]
3875 lea r6, [tab_LumaCoeffV + r4]
3878 mova m7, [tab_c_526336]
3880 mov dword [rsp], %2/4
3884 PROCESS_LUMA_SP_W4_4R
3902 pextrd [r2 + r3], m0, 1
3903 lea r5, [r2 + 2 * r3]
3905 pextrd [r5 + r3], m0, 3
3907 lea r5, [8 * r1 - 2 * 4]
3914 lea r0, [r0 + 4 * r1 - 2 * %1]
3915 lea r2, [r2 + 4 * r3 - %1]
3923 ;--------------------------------------------------------------------------------------------------------------
3924 ; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
3925 ;--------------------------------------------------------------------------------------------------------------
3926 FILTER_VER_LUMA_SP 4, 4
3927 FILTER_VER_LUMA_SP 8, 8
3928 FILTER_VER_LUMA_SP 8, 4
3929 FILTER_VER_LUMA_SP 4, 8
3930 FILTER_VER_LUMA_SP 16, 16
3931 FILTER_VER_LUMA_SP 16, 8
3932 FILTER_VER_LUMA_SP 8, 16
3933 FILTER_VER_LUMA_SP 16, 12
3934 FILTER_VER_LUMA_SP 12, 16
3935 FILTER_VER_LUMA_SP 16, 4
3936 FILTER_VER_LUMA_SP 4, 16
3937 FILTER_VER_LUMA_SP 32, 32
3938 FILTER_VER_LUMA_SP 32, 16
3939 FILTER_VER_LUMA_SP 16, 32
3940 FILTER_VER_LUMA_SP 32, 24
3941 FILTER_VER_LUMA_SP 24, 32
3942 FILTER_VER_LUMA_SP 32, 8
3943 FILTER_VER_LUMA_SP 8, 32
3944 FILTER_VER_LUMA_SP 64, 64
3945 FILTER_VER_LUMA_SP 64, 32
3946 FILTER_VER_LUMA_SP 32, 64
3947 FILTER_VER_LUMA_SP 64, 48
3948 FILTER_VER_LUMA_SP 48, 64
3949 FILTER_VER_LUMA_SP 64, 16
3950 FILTER_VER_LUMA_SP 16, 64
3952 ; TODO: combining U and V would be faster, but needs more registers
3953 ; TODO: using two paths (height a multiple of 4, and otherwise) may improve performance by about 10%, but the code is more complex, so it is disabled
3955 cglobal chroma_p2s, 3, 7, 4
3957 ; load width and height
3962 mova m2, [tab_c_128]
3963 mova m3, [tab_c_64_n64]
3981 lea r6, [r2 + r5 * 2]
3983 movu [r6 + FENC_STRIDE / 2 * 0 - 16], m0
3984 movu [r6 + FENC_STRIDE / 2 * 2 - 16], m1
3992 movh [r6 + FENC_STRIDE / 2 * 0 - 16], m0
3993 movh [r6 + FENC_STRIDE / 2 * 2 - 16], m1
4000 movd [r6 + FENC_STRIDE / 2 * 0 - 16], m0
4001 movd [r6 + FENC_STRIDE / 2 * 2 - 16], m1
4004 lea r0, [r0 + r1 * 2]
4005 add r2, FENC_STRIDE / 2 * 4
4012 %macro PROCESS_CHROMA_SP_W4_4R 0
; Accumulates the 4-tap vertical sums for four 4-sample-wide output rows
; from 16-bit source samples (word interleave + pmaddwd, 32-bit sums).
;   m0 = Row1, m1 = Row2, m2 = Row3, m3 = Row4.
;   r6 = base of the coefficient rows; tap pair k (k = 0, 1) is at [r6 + k * 16].
;   r0 = source pointer, advanced by 2 * r1 twice; the final source row is
;        fetched from [r0 + 2 * r1] without moving r0 again.
;   m4 and m5 are overwritten as scratch.
4015 punpcklwd m0, m1 ;m0=[0 1]
4016 pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
4018 lea r0, [r0 + 2 * r1]
4020 punpcklwd m1, m4 ;m1=[1 2]
4021 pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
4024 punpcklwd m4, m5 ;m4=[2 3]
4025 pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
4026 pmaddwd m4, [r6 + 1 * 16] ;m4=[2+3] with tap pair 1, for Row1
4027 paddd m0, m4 ;m0=[0+1+2+3] Row1 done
4029 lea r0, [r0 + 2 * r1]
4031 punpcklwd m5, m4 ;m5=[3 4]
4032 pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
4033 pmaddwd m5, [r6 + 1 * 16] ;m5=[3+4] with tap pair 1, for Row2
4034 paddd m1, m5 ;m1 = [1+2+3+4] Row2 done
4037 punpcklwd m4, m5 ;m4=[4 5]
4038 pmaddwd m4, [r6 + 1 * 16] ;m4=[4+5] with tap pair 1, for Row3
4039 paddd m2, m4 ;m2=[2+3+4+5] Row3 done
4041 movq m4, [r0 + 2 * r1] ;last source row; r0 is not advanced here
4042 punpcklwd m5, m4 ;m5=[5 6]
4043 pmaddwd m5, [r6 + 1 * 16] ;m5=[5+6] with tap pair 1, for Row4
4044 paddd m3, m5 ;m3=[3+4+5+6] Row4 done
4047 ;--------------------------------------------------------------------------------------------------------------
4048 ; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4049 ;--------------------------------------------------------------------------------------------------------------
4050 %macro FILTER_VER_CHROMA_SP 2
4052 cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7 ,0-gprsize
4059 lea r5, [tab_ChromaCoeffV]
4062 lea r6, [tab_ChromaCoeffV + r4]
4065 mova m6, [tab_c_526336]
4067 mov dword [rsp], %2/4
4072 PROCESS_CHROMA_SP_W4_4R
4090 pextrd [r2 + r3], m0, 1
4091 lea r5, [r2 + 2 * r3]
4093 pextrd [r5 + r3], m0, 3
4095 lea r5, [4 * r1 - 2 * 4]
4102 lea r0, [r0 + 4 * r1 - 2 * %1]
4103 lea r2, [r2 + 4 * r3 - %1]
4111 FILTER_VER_CHROMA_SP 4, 4
4112 FILTER_VER_CHROMA_SP 4, 8
4113 FILTER_VER_CHROMA_SP 16, 16
4114 FILTER_VER_CHROMA_SP 16, 8
4115 FILTER_VER_CHROMA_SP 16, 12
4116 FILTER_VER_CHROMA_SP 12, 16
4117 FILTER_VER_CHROMA_SP 16, 4
4118 FILTER_VER_CHROMA_SP 4, 16
4119 FILTER_VER_CHROMA_SP 32, 32
4120 FILTER_VER_CHROMA_SP 32, 16
4121 FILTER_VER_CHROMA_SP 16, 32
4122 FILTER_VER_CHROMA_SP 32, 24
4123 FILTER_VER_CHROMA_SP 24, 32
4124 FILTER_VER_CHROMA_SP 32, 8
4126 FILTER_VER_CHROMA_SP 16, 24
4127 FILTER_VER_CHROMA_SP 16, 64
4128 FILTER_VER_CHROMA_SP 12, 32
4129 FILTER_VER_CHROMA_SP 4, 32
4130 FILTER_VER_CHROMA_SP 32, 64
4131 FILTER_VER_CHROMA_SP 32, 48
4132 FILTER_VER_CHROMA_SP 24, 64
4134 FILTER_VER_CHROMA_SP 64, 64
4135 FILTER_VER_CHROMA_SP 64, 32
4136 FILTER_VER_CHROMA_SP 64, 48
4137 FILTER_VER_CHROMA_SP 48, 64
4138 FILTER_VER_CHROMA_SP 64, 16
4141 %macro PROCESS_CHROMA_SP_W2_4R 1
; Accumulates the 4-tap vertical sums for four 2-sample-wide output rows
; from 16-bit source samples (word interleave + pmaddwd, 32-bit sums).
;   %1 = register holding the coefficient base; tap pair k is at [%1 + k * 16].
;   m0 = rows 1-2 (packed side by side), m2 = rows 3-4.
;   r0 = source pointer, advanced by 2 * r1 twice; the final source row is
;        fetched from [r0 + 2 * r1] without moving r0 again.
;   m1, m3 and m4 are overwritten as scratch.
4144 punpcklwd m0, m1 ;m0=[0 1]
4146 lea r0, [r0 + 2 * r1]
4148 punpcklwd m1, m2 ;m1=[1 2]
4149 punpcklqdq m0, m1 ;m0=[0 1 1 2]
4150 pmaddwd m0, [%1 + 0 *16] ;m0=[0+1 1+2] Row 1-2
4153 punpcklwd m2, m1 ;m2=[2 3]
4155 lea r0, [r0 + 2 * r1]
4157 punpcklwd m1, m3 ;m1=[3 4]
4158 punpcklqdq m2, m1 ;m2=[2 3 3 4]
4160 pmaddwd m4, m2, [%1 + 1 * 16] ;m4=[2+3 3+4] Row 1-2
4161 pmaddwd m2, [%1 + 0 * 16] ;m2=[2+3 3+4] Row 3-4
4162 paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2
4165 punpcklwd m3, m1 ;m3=[4 5]
4167 movd m4, [r0 + 2 * r1] ;last source row; r0 is not advanced here
4168 punpcklwd m1, m4 ;m1=[5 6]
4169 punpcklqdq m3, m1 ;m3=[4 5 5 6]
4170 pmaddwd m3, [%1 + 1 * 16] ;m3=[4+5 5+6] Row 3-4
4171 paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4
4174 ;-------------------------------------------------------------------------------------------------------------------
4175 ; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4176 ;-------------------------------------------------------------------------------------------------------------------
4177 %macro FILTER_VER_CHROMA_SP_W2_4R 2
4179 cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 6
4186 lea r5, [tab_ChromaCoeffV]
4189 lea r5, [tab_ChromaCoeffV + r4]
4192 mova m5, [tab_c_526336]
4197 PROCESS_CHROMA_SP_W2_4R r5
4209 pextrw [r2 + r3], m0, 1
4210 lea r2, [r2 + 2 * r3]
4212 pextrw [r2 + r3], m0, 3
4214 lea r2, [r2 + 2 * r3]
4222 FILTER_VER_CHROMA_SP_W2_4R 2, 4
4223 FILTER_VER_CHROMA_SP_W2_4R 2, 8
4225 FILTER_VER_CHROMA_SP_W2_4R 2, 16
4227 ;--------------------------------------------------------------------------------------------------------------
4228 ; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4229 ;--------------------------------------------------------------------------------------------------------------
4231 cglobal interp_4tap_vert_sp_4x2, 5, 6, 5
; 4-tap vertical filter for a 4x2 block of 16-bit source samples.
;   m0 accumulates Row1 and m1 accumulates Row2 (32-bit sums, paddd).
;   m2 and m3 hold word-interleaved source row pairs.
;   r5 = coefficient base; tap pair k is read at [r5 + k * 16].
; NOTE(review): the two `lea r5` lines below look like alternative address
; setups whose selecting directives are not visible here -- confirm.
4238 lea r5, [tab_ChromaCoeffV]
4241 lea r5, [tab_ChromaCoeffV + r4]
4244 mova m4, [tab_c_526336] ;m4 = 4 dwords of 8192*64+2048
4248 punpcklwd m0, m1 ;m0=[0 1]
4249 pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1
4251 lea r0, [r0 + 2 * r1]
4253 punpcklwd m1, m2 ;m1=[1 2]
4254 pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2
4257 punpcklwd m2, m3 ;m2=[2 3]
4258 pmaddwd m2, [r5 + 1 * 16] ;m2=[2+3] with tap pair 1, for Row1
4259 paddd m0, m2 ;m0=[0+1+2+3] Row1 done
4263 movq m2, [r0 + 2 * r1] ;last source row; r0 is not advanced here
4264 punpcklwd m3, m2 ;m3=[3 4]
4265 pmaddwd m3, [r5 + 1 * 16] ;m3=[3+4] with tap pair 1, for Row2
4266 paddd m1, m3 ;m1 = [1+2+3+4] Row2 done
4274 pextrd [r2 + r3], m0, 1 ;store dword 1 of m0 (4 bytes) to the second dst row
4278 ;-------------------------------------------------------------------------------------------------------------------
4279 ; void interp_4tap_vert_sp_6x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4280 ;-------------------------------------------------------------------------------------------------------------------
4281 %macro FILTER_VER_CHROMA_SP_W6_H4 2
4283 cglobal interp_4tap_vert_sp_6x%2, 5, 7, 7
4290 lea r5, [tab_ChromaCoeffV]
4293 lea r6, [tab_ChromaCoeffV + r4]
4296 mova m6, [tab_c_526336]
4301 PROCESS_CHROMA_SP_W4_4R
4319 pextrd [r2 + r3], m0, 1
4320 lea r5, [r2 + 2 * r3]
4322 pextrd [r5 + r3], m0, 3
4324 lea r5, [4 * r1 - 2 * 4]
4328 PROCESS_CHROMA_SP_W2_4R r6
4340 pextrw [r2 + r3], m0, 1
4341 lea r2, [r2 + 2 * r3]
4343 pextrw [r2 + r3], m0, 3
4346 lea r2, [r2 + 2 * r3 - 4]
4354 FILTER_VER_CHROMA_SP_W6_H4 6, 8
4356 FILTER_VER_CHROMA_SP_W6_H4 6, 16
4358 %macro PROCESS_CHROMA_SP_W8_2R 0
; Accumulates the 4-tap vertical sums for two 8-sample-wide output rows
; from 16-bit source samples (32-bit sums, paddd).
;   m0 / m1 = Row1 low / high four samples, m2 / m3 = Row2 low / high.
;   r5 = coefficient base; tap pair k (k = 0, 1) is read at [r5 + k * 16].
;   r0 = source pointer, advanced by 2 * r1 once; rows are also fetched
;        from [r0 + 2 * r1] before and after that advance.
;   m4, m5 and m6 are overwritten as scratch.
; "l" / "h" in the comments below are the low / high halves of a row pair.
4361 punpcklwd m0, m1, m3
4362 pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l
4364 pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h
4366 movu m4, [r0 + 2 * r1]
4367 punpcklwd m2, m3, m4
4368 pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l
4370 pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h
4372 lea r0, [r0 + 2 * r1]
4374 punpcklwd m6, m4, m5
4375 pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l
4376 paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum
4378 pmaddwd m4, [r5 + 1 * 16] ;m4 = [2h+3h] Row1h
4379 paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum
4381 movu m4, [r0 + 2 * r1]
4382 punpcklwd m6, m5, m4
4383 pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l
4384 paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum
4386 pmaddwd m5, [r5 + 1 * 16] ;m5 = [3h+4h] Row2h
4387 paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum
4390 ;--------------------------------------------------------------------------------------------------------------
4391 ; void interp_4tap_vert_sp_8x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
4392 ;--------------------------------------------------------------------------------------------------------------
; FILTER_VER_CHROMA_SP_W8_H2 <width>, <height>
; Defines interp_4tap_vert_sp_%1x%2; every instantiation below passes width 8.
; Rows are produced two at a time by PROCESS_CHROMA_SP_W8_2R.
4393 %macro FILTER_VER_CHROMA_SP_W8_H2 2
4395 cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 8
; NOTE(review): r5 is set twice (table base, then table base + r4);
; presumably alternative arms of a conditional not shown here -- confirm.
4402 lea r5, [tab_ChromaCoeffV]
4405 lea r5, [tab_ChromaCoeffV + r4]
; m7 = tab_c_526336, i.e. 8192*64+2048 in every dword
4408 mova m7, [tab_c_526336]
4412 PROCESS_CHROMA_SP_W8_2R
; high 8 bytes of m0 -> second row (r2 + r3)
4430 movhps [r2 + r3], m0
; advance the destination by the two rows just written
4432 lea r2, [r2 + 2 * r3]
; Instantiations (width 8, assorted heights)
4440 FILTER_VER_CHROMA_SP_W8_H2 8, 2
4441 FILTER_VER_CHROMA_SP_W8_H2 8, 4
4442 FILTER_VER_CHROMA_SP_W8_H2 8, 6
4443 FILTER_VER_CHROMA_SP_W8_H2 8, 8
4444 FILTER_VER_CHROMA_SP_W8_H2 8, 16
4445 FILTER_VER_CHROMA_SP_W8_H2 8, 32
4447 FILTER_VER_CHROMA_SP_W8_H2 8, 12
4448 FILTER_VER_CHROMA_SP_W8_H2 8, 64
4451 ;-----------------------------------------------------------------------------------------------------------------------------
4452 ; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
4453 ;-----------------------------------------------------------------------------------------------------------------------------
; FILTER_HORIZ_CHROMA_2xN <width>, <height>
; Defines interp_4tap_horiz_ps_%1x%2; every instantiation below passes width 2.
; The first four arguments are named src, srcstride, dst, dststride.
4454 %macro FILTER_HORIZ_CHROMA_2xN 2
4456 cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride
; dststride *= 2 (dst is int16_t per the signature above)
4464 add dststrided, dststrided
; coef2 = 4-byte entry r4 of tab_ChromaCoeff.
; NOTE(review): two addressing forms follow (via r6, and direct); presumably
; alternative arms of a conditional not shown here -- confirm.
4467 lea r6, [tab_ChromaCoeff]
4468 movd coef2, [r6 + r4 * 4]
4470 movd coef2, [tab_ChromaCoeff + r4 * 4]
; replicate dword 0 of coef2 into all four dwords
4473 pshufd coef2, coef2, 0
; step src back one row
; NOTE(review): presumably tied to isRowExt; the guarding test is not shown here -- confirm.
4480 sub srcq, srcstrideq
; advance src and dst to the next row
4491 lea srcq, [srcq + srcstrideq]
4492 lea dstq, [dstq + dststrideq]
; Instantiations (width 2)
4500 FILTER_HORIZ_CHROMA_2xN 2, 4
4501 FILTER_HORIZ_CHROMA_2xN 2, 8
4503 FILTER_HORIZ_CHROMA_2xN 2, 16
4505 ;-----------------------------------------------------------------------------------------------------------------------------
4506 ; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
4507 ;-----------------------------------------------------------------------------------------------------------------------------
; FILTER_HORIZ_CHROMA_4xN <width>, <height>
; Defines interp_4tap_horiz_ps_%1x%2; every instantiation below passes width 4.
; The first four arguments are named src, srcstride, dst, dststride.
4508 %macro FILTER_HORIZ_CHROMA_4xN 2
4510 cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride
; dststride *= 2 (dst is int16_t per the signature above)
4518 add dststrided, dststrided
; coef2 = 4-byte entry r4 of tab_ChromaCoeff.
; NOTE(review): two addressing forms follow (via r6, and direct); presumably
; alternative arms of a conditional not shown here -- confirm.
4521 lea r6, [tab_ChromaCoeff]
4522 movd coef2, [r6 + r4 * 4]
4524 movd coef2, [tab_ChromaCoeff + r4 * 4]
; replicate dword 0 of coef2 into all four dwords
4527 pshufd coef2, coef2, 0
; step src back one row
; NOTE(review): presumably tied to isRowExt; the guarding test is not shown here -- confirm.
4534 sub srcq, srcstrideq
; advance src and dst to the next row
4545 lea srcq, [srcq + srcstrideq]
4546 lea dstq, [dstq + dststrideq]
; Instantiations (width 4)
4553 FILTER_HORIZ_CHROMA_4xN 4, 2
4554 FILTER_HORIZ_CHROMA_4xN 4, 4
4555 FILTER_HORIZ_CHROMA_4xN 4, 8
4556 FILTER_HORIZ_CHROMA_4xN 4, 16
4558 FILTER_HORIZ_CHROMA_4xN 4, 32
; PROCESS_CHROMA_W6 (3 parameters): per-row helper that FILTER_HORIZ_CHROMA
; expands as "PROCESS_CHROMA_W%1 t0, t1, t2" when its width argument is 6.
4560 %macro PROCESS_CHROMA_W6 3
; PROCESS_CHROMA_W12 (3 parameters): per-row helper that FILTER_HORIZ_CHROMA
; expands as "PROCESS_CHROMA_W%1 t0, t1, t2" when its width argument is 12.
4573 %macro PROCESS_CHROMA_W12 3
; low half of %1 -> dstq + 16
; NOTE(review): with int16_t output that would be columns 8..11 -- confirm.
4587 movh [dstq + 16], %1
4590 ;-----------------------------------------------------------------------------------------------------------------------------
4591 ; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
4592 ;-----------------------------------------------------------------------------------------------------------------------------
; FILTER_HORIZ_CHROMA <width>, <height>
; Defines interp_4tap_horiz_ps_%1x%2 and dispatches each row to
; PROCESS_CHROMA_W%1, so <width> must match an existing PROCESS_CHROMA_W*
; helper; the instantiations below use widths 6 and 12.
4593 %macro FILTER_HORIZ_CHROMA 2
4595 cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride
; dststride *= 2 (dst is int16_t per the signature above)
4605 add dststrided, dststrided
; coef2 = 4-byte entry r4 of tab_ChromaCoeff.
; NOTE(review): two addressing forms follow (via r6, and direct); presumably
; alternative arms of a conditional not shown here -- confirm.
4608 lea r6, [tab_ChromaCoeff]
4609 movd coef2, [r6 + r4 * 4]
4611 movd coef2, [tab_ChromaCoeff + r4 * 4]
; replicate dword 0 of coef2 into all four dwords
4614 pshufd coef2, coef2, 0
; Tm1 = second 16-byte row of tab_Tm (byte indices 4,5,6,7, 5,6,7,8, ...)
4617 mova Tm1, [tab_Tm + 16]
; step src back one row
; NOTE(review): presumably tied to isRowExt; the guarding test is not shown here -- confirm.
4622 sub srcq, srcstrideq
; one row of output via the width-specific helper
4626 PROCESS_CHROMA_W%1 t0, t1, t2
; advance src and dst to the next row
4627 add srcq, srcstrideq
4628 add dstq, dststrideq
; Instantiations (widths 6 and 12)
4636 FILTER_HORIZ_CHROMA 6, 8
4637 FILTER_HORIZ_CHROMA 12, 16
4639 FILTER_HORIZ_CHROMA 6, 16
4640 FILTER_HORIZ_CHROMA 12, 32
; PROCESS_CHROMA_W8 (3 parameters): per-row helper that
; FILTER_HORIZ_CHROMA_8xN expands as "PROCESS_CHROMA_W8 t0, t1, t2".
4642 %macro PROCESS_CHROMA_W8 3
4653 ;-----------------------------------------------------------------------------------------------------------------------------
4654 ; void interp_4tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
4655 ;-----------------------------------------------------------------------------------------------------------------------------
; FILTER_HORIZ_CHROMA_8xN <width>, <height>
; Defines interp_4tap_horiz_ps_%1x%2; every instantiation below passes width 8.
; Each row is produced by PROCESS_CHROMA_W8.
4656 %macro FILTER_HORIZ_CHROMA_8xN 2
4658 cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride
; dststride *= 2 (dst is int16_t per the signature above)
4668 add dststrided, dststrided
; coef2 = 4-byte entry r4 of tab_ChromaCoeff.
; NOTE(review): two addressing forms follow (via r6, and direct); presumably
; alternative arms of a conditional not shown here -- confirm.
4671 lea r6, [tab_ChromaCoeff]
4672 movd coef2, [r6 + r4 * 4]
4674 movd coef2, [tab_ChromaCoeff + r4 * 4]
; replicate dword 0 of coef2 into all four dwords
4677 pshufd coef2, coef2, 0
; Tm1 = second 16-byte row of tab_Tm (byte indices 4,5,6,7, 5,6,7,8, ...)
4680 mova Tm1, [tab_Tm + 16]
; step src back one row
; NOTE(review): presumably tied to isRowExt; the guarding test is not shown here -- confirm.
4685 sub srcq, srcstrideq
4689 PROCESS_CHROMA_W8 t0, t1, t2
; advance src and dst to the next row
4690 add srcq, srcstrideq
4691 add dstq, dststrideq
; Instantiations (width 8, assorted heights)
4699 FILTER_HORIZ_CHROMA_8xN 8, 2
4700 FILTER_HORIZ_CHROMA_8xN 8, 4
4701 FILTER_HORIZ_CHROMA_8xN 8, 6
4702 FILTER_HORIZ_CHROMA_8xN 8, 8
4703 FILTER_HORIZ_CHROMA_8xN 8, 16
4704 FILTER_HORIZ_CHROMA_8xN 8, 32
4706 FILTER_HORIZ_CHROMA_8xN 8, 12
4707 FILTER_HORIZ_CHROMA_8xN 8, 64
; PROCESS_CHROMA_W16 (4 parameters): per-row helper selected by
; FILTER_HORIZ_CHROMA_WxN ("PROCESS_CHROMA_W%1 t0, t1, t2, t3") for width 16.
4709 %macro PROCESS_CHROMA_W16 4
; %4 -> dstq + 16 (unaligned store)
4725 movu [dstq + 16], %4
; PROCESS_CHROMA_W24 (4 parameters): per-row helper selected by
; FILTER_HORIZ_CHROMA_WxN ("PROCESS_CHROMA_W%1 t0, t1, t2, t3") for width 24.
4728 %macro PROCESS_CHROMA_W24 4
; %4 -> dstq + 16
4744 movu [dstq + 16], %4
; reload %1 from srcq + 16 for the next group of columns
4745 movu %1, [srcq + 16]
; %2 -> dstq + 32
4752 movu [dstq + 32], %2
; PROCESS_CHROMA_W32 (4 parameters): per-row helper selected by
; FILTER_HORIZ_CHROMA_WxN ("PROCESS_CHROMA_W%1 t0, t1, t2, t3") for width 32.
4755 %macro PROCESS_CHROMA_W32 4
; %4 -> dstq + 16
4771 movu [dstq + 16], %4
; reload %1 from srcq + 16, then from srcq + 24
4772 movu %1, [srcq + 16]
4778 movu %1, [srcq + 24]
; %2 -> dstq + 32, %4 -> dstq + 48
4786 movu [dstq + 32], %2
4787 movu [dstq + 48], %4
; PROCESS_CHROMA_W16o (5 parameters): offset-taking helper used by
; PROCESS_CHROMA_W48 and PROCESS_CHROMA_W64. %1..%4 are registers; %5 is a
; byte offset into the source row. Destination offsets use %5 * 2.
4790 %macro PROCESS_CHROMA_W16o 5
; load from srcq + %5, then from srcq + %5 + 8
4791 movu %1, [srcq + %5]
4797 movu %1, [srcq + %5 + 8]
; %2 -> dstq + %5 * 2, %4 -> the following 16 bytes
4805 movu [dstq + %5 * 2], %2
4806 movu [dstq + %5 * 2 + 16], %4
; PROCESS_CHROMA_W48 (4 parameters): one row handled as three
; PROCESS_CHROMA_W16o expansions at source offsets 0, 16 and 32.
4809 %macro PROCESS_CHROMA_W48 4
4810 PROCESS_CHROMA_W16o %1, %2, %3, %4, 0
4811 PROCESS_CHROMA_W16o %1, %2, %3, %4, 16
4812 PROCESS_CHROMA_W16o %1, %2, %3, %4, 32
; PROCESS_CHROMA_W64 (4 parameters): one row handled as four
; PROCESS_CHROMA_W16o expansions at source offsets 0, 16, 32 and 48.
4815 %macro PROCESS_CHROMA_W64 4
4816 PROCESS_CHROMA_W16o %1, %2, %3, %4, 0
4817 PROCESS_CHROMA_W16o %1, %2, %3, %4, 16
4818 PROCESS_CHROMA_W16o %1, %2, %3, %4, 32
4819 PROCESS_CHROMA_W16o %1, %2, %3, %4, 48
4822 ;------------------------------------------------------------------------------------------------------------------------------
4823 ; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
4824 ;------------------------------------------------------------------------------------------------------------------------------
; FILTER_HORIZ_CHROMA_WxN <width>, <height>
; Defines interp_4tap_horiz_ps_%1x%2 and dispatches each row to
; PROCESS_CHROMA_W%1 with four register arguments. The instantiations below
; use widths 16, 24, 32, 48 and 64.
4825 %macro FILTER_HORIZ_CHROMA_WxN 2
4827 cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 7, src, srcstride, dst, dststride
; dststride *= 2 (dst is int16_t per the signature above)
4838 add dststrided, dststrided
; coef2 = 4-byte entry r4 of tab_ChromaCoeff.
; NOTE(review): two addressing forms follow (via r6, and direct); presumably
; alternative arms of a conditional not shown here -- confirm.
4841 lea r6, [tab_ChromaCoeff]
4842 movd coef2, [r6 + r4 * 4]
4844 movd coef2, [tab_ChromaCoeff + r4 * 4]
; replicate dword 0 of coef2 into all four dwords
4847 pshufd coef2, coef2, 0
; Tm1 = second 16-byte row of tab_Tm (byte indices 4,5,6,7, 5,6,7,8, ...)
4850 mova Tm1, [tab_Tm + 16]
; step src back one row
; NOTE(review): presumably tied to isRowExt; the guarding test is not shown here -- confirm.
4855 sub srcq, srcstrideq
; one row of output via the width-specific helper
4859 PROCESS_CHROMA_W%1 t0, t1, t2, t3
; advance src and dst to the next row
4860 add srcq, srcstrideq
4861 add dstq, dststrideq
; Instantiations
4869 FILTER_HORIZ_CHROMA_WxN 16, 4
4870 FILTER_HORIZ_CHROMA_WxN 16, 8
4871 FILTER_HORIZ_CHROMA_WxN 16, 12
4872 FILTER_HORIZ_CHROMA_WxN 16, 16
4873 FILTER_HORIZ_CHROMA_WxN 16, 32
4874 FILTER_HORIZ_CHROMA_WxN 24, 32
4875 FILTER_HORIZ_CHROMA_WxN 32, 8
4876 FILTER_HORIZ_CHROMA_WxN 32, 16
4877 FILTER_HORIZ_CHROMA_WxN 32, 24
4878 FILTER_HORIZ_CHROMA_WxN 32, 32
4880 FILTER_HORIZ_CHROMA_WxN 16, 24
4881 FILTER_HORIZ_CHROMA_WxN 16, 64
4882 FILTER_HORIZ_CHROMA_WxN 24, 64
4883 FILTER_HORIZ_CHROMA_WxN 32, 48
4884 FILTER_HORIZ_CHROMA_WxN 32, 64
4886 FILTER_HORIZ_CHROMA_WxN 64, 64
4887 FILTER_HORIZ_CHROMA_WxN 64, 32
4888 FILTER_HORIZ_CHROMA_WxN 64, 48
4889 FILTER_HORIZ_CHROMA_WxN 48, 64
4890 FILTER_HORIZ_CHROMA_WxN 64, 16
4893 ;---------------------------------------------------------------------------------------------------------------
4894 ; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
4895 ;---------------------------------------------------------------------------------------------------------------
; FILTER_V_PS_W16n <width>, <height>
; Defines interp_4tap_vert_ps_%1x%2. The pointer updates at the end move both
; pointers down two rows while rewinding r0 by %1 bytes and r2 by %1 * 2
; bytes. Instantiated below for widths 48 and 64.
4896 %macro FILTER_V_PS_W16n 2
4898 cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8
; m0 = 4-byte entry r4 of tab_ChromaCoeff.
; NOTE(review): two addressing forms follow (via r5, and direct); presumably
; alternative arms of a conditional not shown here -- confirm.
4905 lea r5, [tab_ChromaCoeff]
4906 movd m0, [r5 + r4 * 4]
4908 movd m0, [tab_ChromaCoeff + r4 * 4]
; tab_Vm row 0 repeats byte indices 0,1 and row 1 repeats 2,3, so
; m1 = coefficient bytes 0/1 replicated, m0 = coefficient bytes 2/3 replicated
4911 pshufb m1, m0, [tab_Vm]
4912 pshufb m0, [tab_Vm + 16]
; byte-interleave the low halves of m2 and m3 into m4
4924 punpcklbw m4, m2, m3
; r5 = r0 + 2 rows
4930 lea r5, [r0 + 2 * r1]
; byte-interleave m5 with m7: low halves, then high halves
4934 punpcklbw m6, m5, m7
4938 punpckhbw m6, m5, m7
4950 punpcklbw m4, m3, m5
; load 16 bytes from two rows below r5
4956 movu m5, [r5 + 2 * r1]
4958 punpcklbw m2, m7, m5
; m3 -> second output row, 16 bytes in
4971 movu [r2 + r3 + 16], m3
; down two rows; rewind r0 by %1 bytes and r2 by %1 * 2 bytes
4978 lea r0, [r0 + r1 * 2 - %1]
4979 lea r2, [r2 + r3 * 2 - %1 * 2]
; Instantiations
4986 FILTER_V_PS_W16n 64, 64
4987 FILTER_V_PS_W16n 64, 32
4988 FILTER_V_PS_W16n 64, 48
4989 FILTER_V_PS_W16n 48, 64
4990 FILTER_V_PS_W16n 64, 16
4993 ;------------------------------------------------------------------------------------------------------------
4994 ;void interp_4tap_vert_ps_2x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
4995 ;------------------------------------------------------------------------------------------------------------
; Stand-alone (non-macro) definition. Source rows are read relative to r0
; with stride r1; results are stored relative to r2 with stride r3.
4997 cglobal interp_4tap_vert_ps_2x4, 4, 6, 7
; m0 = 4-byte entry r4 of tab_ChromaCoeff.
; NOTE(review): two addressing forms follow (via r5, and direct); presumably
; alternative arms of a conditional not shown here -- confirm.
5004 lea r5, [tab_ChromaCoeff]
5005 movd m0, [r5 + r4 * 4]
5007 movd m0, [tab_ChromaCoeff + r4 * 4]
; 4 bytes from two rows below r0
5016 movd m4, [r0 + 2 * r1]
5020 punpcklbw m6, m4, m5
; move r0 down four rows
5025 lea r0, [r0 + 4 * r1]
5029 punpcklbw m1, m5, m6
; dword 2 of m2 -> second output row
5040 pextrd [r2 + r3], m2, 2
5045 punpcklbw m3, m6, m2
5050 movd m3, [r0 + 2 * r1]
; advance dst two rows, then dword 2 of m4 -> the row after that
5060 lea r2, [r2 + 2 * r3]
5062 pextrd [r2 + r3], m4, 2
5066 ;-------------------------------------------------------------------------------------------------------------
5067 ; void interp_4tap_vert_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
5068 ;-------------------------------------------------------------------------------------------------------------
; FILTER_V_PS_W2 <arg1>, <height>
; Defines interp_4tap_vert_ps_2x%2. The lines below reference only %2; the
; width is fixed at 2 in the symbol name.
5069 %macro FILTER_V_PS_W2 2
5071 cglobal interp_4tap_vert_ps_2x%2, 4, 6, 8
; m0 = 4-byte entry r4 of tab_ChromaCoeff.
; NOTE(review): two addressing forms follow (via r5, and direct); presumably
; alternative arms of a conditional not shown here -- confirm.
5078 lea r5, [tab_ChromaCoeff]
5079 movd m0, [r5 + r4 * 4]
5081 movd m0, [tab_ChromaCoeff + r4 * 4]
; 4 bytes from two rows below r0
5092 movd m4, [r0 + 2 * r1]
5096 punpcklbw m6, m4, m5
; move r0 down four rows
5101 lea r0, [r0 + 4 * r1]
5105 punpcklbw m7, m5, m6
5121 punpcklbw m3, m6, m2
5126 movd m3, [r0 + 2 * r1]
; dst advances two rows at a time (twice here)
5138 lea r2, [r2 + 2 * r3]
5143 lea r2, [r2 + 2 * r3]
; Instantiation
5153 FILTER_V_PS_W2 2, 16
5155 ;-----------------------------------------------------------------------------------------------------------------
5156 ; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
5157 ;-----------------------------------------------------------------------------------------------------------------
; FILTER_VER_CHROMA_SS <width>, <height>
; Defines interp_4tap_vert_ss_%1x%2. Work is done by PROCESS_CHROMA_SP_W4_4R;
; the pointer updates at the end move r0 and r2 down four rows while
; rewinding each by 2 * %1 bytes. One gprsize stack slot is reserved and
; initialised with %2/4.
5158 %macro FILTER_VER_CHROMA_SS 2
5160 cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-gprsize
; NOTE(review): two forms of the tab_ChromaCoeffV address follow (r5 = table
; base, r6 = table base + r4); presumably alternative arms of a conditional
; whose directives are not shown here -- confirm.
5168 lea r5, [tab_ChromaCoeffV]
5171 lea r6, [tab_ChromaCoeffV + r4]
; [rsp] = height / 4
; NOTE(review): presumably the count of 4-row iterations -- confirm.
5174 mov dword [rsp], %2/4
5179 PROCESS_CHROMA_SP_W4_4R
; high 8 bytes of m0 -> row 1
5190 movhps [r2 + r3], m0
; r5 = r2 + 2 rows, so the next store lands on row 3
5191 lea r5, [r2 + 2 * r3]
5193 movhps [r5 + r3], m2
; r5 = 4 * r1 - 8
; NOTE(review): presumably a source-pointer adjustment; its consumer is not shown here -- confirm.
5195 lea r5, [4 * r1 - 2 * 4]
; down four rows; rewind both pointers by 2 * %1 bytes
5202 lea r0, [r0 + 4 * r1 - 2 * %1]
5203 lea r2, [r2 + 4 * r3 - 2 * %1]
; Instantiations
5211 FILTER_VER_CHROMA_SS 4, 4
5212 FILTER_VER_CHROMA_SS 4, 8
5213 FILTER_VER_CHROMA_SS 16, 16
5214 FILTER_VER_CHROMA_SS 16, 8
5215 FILTER_VER_CHROMA_SS 16, 12
5216 FILTER_VER_CHROMA_SS 12, 16
5217 FILTER_VER_CHROMA_SS 16, 4
5218 FILTER_VER_CHROMA_SS 4, 16
5219 FILTER_VER_CHROMA_SS 32, 32
5220 FILTER_VER_CHROMA_SS 32, 16
5221 FILTER_VER_CHROMA_SS 16, 32
5222 FILTER_VER_CHROMA_SS 32, 24
5223 FILTER_VER_CHROMA_SS 24, 32
5224 FILTER_VER_CHROMA_SS 32, 8
5226 FILTER_VER_CHROMA_SS 16, 24
5227 FILTER_VER_CHROMA_SS 12, 32
5228 FILTER_VER_CHROMA_SS 4, 32
5229 FILTER_VER_CHROMA_SS 32, 64
5230 FILTER_VER_CHROMA_SS 16, 64
5231 FILTER_VER_CHROMA_SS 32, 48
5232 FILTER_VER_CHROMA_SS 24, 64
5234 FILTER_VER_CHROMA_SS 64, 64
5235 FILTER_VER_CHROMA_SS 64, 32
5236 FILTER_VER_CHROMA_SS 64, 48
5237 FILTER_VER_CHROMA_SS 48, 64
5238 FILTER_VER_CHROMA_SS 64, 16
5241 ;---------------------------------------------------------------------------------------------------------------------
5242 ; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
5243 ;---------------------------------------------------------------------------------------------------------------------
; FILTER_VER_CHROMA_SS_W2_4R <width>, <height>
; Defines interp_4tap_vert_ss_%1x%2; every instantiation below passes width 2.
; Work is done by PROCESS_CHROMA_SP_W2_4R, followed by dword stores.
5244 %macro FILTER_VER_CHROMA_SS_W2_4R 2
5246 cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 5
; NOTE(review): r5 is set twice (table base, then table base + r4);
; presumably alternative arms of a conditional not shown here -- confirm.
5254 lea r5, [tab_ChromaCoeffV]
5257 lea r5, [tab_ChromaCoeffV + r4]
; r5 is handed to the helper as its argument
5263 PROCESS_CHROMA_SP_W2_4R r5
; dword 1 of m0 -> row 1
5271 pextrd [r2 + r3], m0, 1
5272 lea r2, [r2 + 2 * r3]
; dword 3 of m0 -> one row past the freshly advanced r2
5274 pextrd [r2 + r3], m0, 3
; advance dst by two more rows
5276 lea r2, [r2 + 2 * r3]
; Instantiations (width 2)
5284 FILTER_VER_CHROMA_SS_W2_4R 2, 4
5285 FILTER_VER_CHROMA_SS_W2_4R 2, 8
5287 FILTER_VER_CHROMA_SS_W2_4R 2, 16
5289 ;---------------------------------------------------------------------------------------------------------------
5290 ; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
5291 ;---------------------------------------------------------------------------------------------------------------
; Stand-alone (non-macro) definition. Two output rows are accumulated in
; m0 (Row1) and m1 (Row2) from word-interleaved row pairs multiplied by the
; coefficient pairs at [r5 + 0 * 16] and [r5 + 1 * 16].
5293 cglobal interp_4tap_vert_ss_4x2, 5, 6, 4
; NOTE(review): r5 is set twice (table base, then table base + r4);
; presumably alternative arms of a conditional not shown here -- confirm.
5301 lea r5, [tab_ChromaCoeffV]
5304 lea r5, [tab_ChromaCoeffV + r4]
5309 punpcklwd m0, m1 ;m0=[0 1]
5310 pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1
5312 lea r0, [r0 + 2 * r1]
5314 punpcklwd m1, m2 ;m1=[1 2]
5315 pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2
5318 punpcklwd m2, m3 ;m2=[2 3]
5319 pmaddwd m2, [r5 + 1 * 16]
5320 paddd m0, m2 ;m0=[0+1+2+3] Row1 done
5323 movq m2, [r0 + 2 * r1]
5324 punpcklwd m3, m2 ;m3=[3 4]
5325 pmaddwd m3, [r5 + 1 * 16]
5326 paddd m1, m3 ;m1=[1+2+3+4] Row2 done
; high 8 bytes of m0 -> second output row
5332 movhps [r2 + r3], m0
5336 ;-------------------------------------------------------------------------------------------------------------------
5337 ; void interp_4tap_vert_ss_6x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
5338 ;-------------------------------------------------------------------------------------------------------------------
; FILTER_VER_CHROMA_SS_W6_H4 <arg1>, <height>
; Defines interp_4tap_vert_ss_6x%2. The lines below reference only %2; the
; width is fixed at 6 in the symbol name. Output is written in two passes:
; the W4 helper followed by 8-byte stores, then the W2 helper followed by
; dword stores.
5339 %macro FILTER_VER_CHROMA_SS_W6_H4 2
5341 cglobal interp_4tap_vert_ss_6x%2, 5, 7, 6
; NOTE(review): two forms of the tab_ChromaCoeffV address follow (r5 = table
; base, r6 = table base + r4); presumably alternative arms of a conditional
; whose directives are not shown here -- confirm.
5349 lea r5, [tab_ChromaCoeffV]
5352 lea r6, [tab_ChromaCoeffV + r4]
5358 PROCESS_CHROMA_SP_W4_4R
; high 8 bytes of m0 -> row 1
5369 movhps [r2 + r3], m0
; r5 = r2 + 2 rows, so the next store lands on row 3
5370 lea r5, [r2 + 2 * r3]
5372 movhps [r5 + r3], m2
; r5 = 4 * r1 - 8
; NOTE(review): presumably the source-pointer adjustment applied before the
; W2 pass; its consumer is not shown here -- confirm.
5374 lea r5, [4 * r1 - 2 * 4]
; r6 (table address with r4 folded in) is handed to the W2 helper
5378 PROCESS_CHROMA_SP_W2_4R r6
; dword 1 of m0 -> row 1
5386 pextrd [r2 + r3], m0, 1
5387 lea r2, [r2 + 2 * r3]
; dword 3 of m0 -> one row past the freshly advanced r2
5389 pextrd [r2 + r3], m0, 3
; advance r2 by two more rows and step back 8 bytes
5392 lea r2, [r2 + 2 * r3 - 2 * 4]
; Instantiations for heights 8 and 16
5400 FILTER_VER_CHROMA_SS_W6_H4 6, 8
5402 FILTER_VER_CHROMA_SS_W6_H4 6, 16
5405 ;----------------------------------------------------------------------------------------------------------------
5406 ; void interp_4tap_vert_ss_8x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
5407 ;----------------------------------------------------------------------------------------------------------------
; FILTER_VER_CHROMA_SS_W8_H2 <width>, <height>
; Defines interp_4tap_vert_ss_%1x%2; every instantiation below passes width 8.
; Rows are produced two at a time by PROCESS_CHROMA_SP_W8_2R.
5408 %macro FILTER_VER_CHROMA_SS_W8_H2 2
5410 cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 7
; NOTE(review): r5 is set twice (table base, then table base + r4);
; presumably alternative arms of a conditional not shown here -- confirm.
5418 lea r5, [tab_ChromaCoeffV]
5421 lea r5, [tab_ChromaCoeffV + r4]
5426 PROCESS_CHROMA_SP_W8_2R
; advance dst by two rows
5439 lea r2, [r2 + 2 * r3]
; Instantiations (width 8, assorted heights)
5447 FILTER_VER_CHROMA_SS_W8_H2 8, 2
5448 FILTER_VER_CHROMA_SS_W8_H2 8, 4
5449 FILTER_VER_CHROMA_SS_W8_H2 8, 6
5450 FILTER_VER_CHROMA_SS_W8_H2 8, 8
5451 FILTER_VER_CHROMA_SS_W8_H2 8, 16
5452 FILTER_VER_CHROMA_SS_W8_H2 8, 32
5454 FILTER_VER_CHROMA_SS_W8_H2 8, 12
5455 FILTER_VER_CHROMA_SS_W8_H2 8, 64
5457 ;-----------------------------------------------------------------------------------------------------------------
5458 ; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
5459 ;-----------------------------------------------------------------------------------------------------------------
; FILTER_VER_LUMA_SS <width>, <height>
; Defines interp_8tap_vert_ss_%1x%2. Four output rows are accumulated at a
; time in m0..m3 (Row1..Row4 per the line notes below) from word-interleaved
; row pairs multiplied by the four coefficient pairs at [r6 + k * 16],
; k = 0..3. The pointer updates at the end move r0 and r2 down four rows
; while rewinding each by 2 * %1 bytes. One gprsize stack slot is reserved
; and initialised with %2/4.
5460 %macro FILTER_VER_LUMA_SS 2
5462 cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-gprsize
; NOTE(review): two forms of the tab_LumaCoeffV address follow (r5 = table
; base, r6 = table base + r4); presumably alternative arms of a conditional
; whose directives are not shown here -- confirm.
5471 lea r5, [tab_LumaCoeffV]
5474 lea r6, [tab_LumaCoeffV + r4]
; [rsp] = height / 4
; NOTE(review): presumably the count of 4-row iterations -- confirm.
5477 mov dword [rsp], %2/4
5483 punpcklwd m0, m1 ;m0=[0 1]
5484 pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
5486 lea r0, [r0 + 2 * r1]
5488 punpcklwd m1, m4 ;m1=[1 2]
5489 pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
5492 punpcklwd m4, m5 ;m4=[2 3]
5493 pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
5494 pmaddwd m4, [r6 + 1 * 16]
5495 paddd m0, m4 ;m0=[0+1+2+3] Row1
5497 lea r0, [r0 + 2 * r1]
5499 punpcklwd m5, m4 ;m5=[3 4]
5500 pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
5501 pmaddwd m5, [r6 + 1 * 16]
5502 paddd m1, m5 ;m1 = [1+2+3+4] Row2
5505 punpcklwd m4, m5 ;m4=[4 5]
5506 pmaddwd m6, m4, [r6 + 1 * 16]
5507 paddd m2, m6 ;m2=[2+3+4+5] Row3
5508 pmaddwd m4, [r6 + 2 * 16]
5509 paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1
5511 lea r0, [r0 + 2 * r1]
5513 punpcklwd m5, m4 ;m5=[5 6]
5514 pmaddwd m6, m5, [r6 + 1 * 16]
5515 paddd m3, m6 ;m3=[3+4+5+6] Row4
5516 pmaddwd m5, [r6 + 2 * 16]
5517 paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2
5520 punpcklwd m4, m5 ;m4=[6 7]
5521 pmaddwd m6, m4, [r6 + 2 * 16]
5522 paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3
5523 pmaddwd m4, [r6 + 3 * 16]
5524 paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end
5527 lea r0, [r0 + 2 * r1]
5529 punpcklwd m5, m4 ;m5=[7 8]
5530 pmaddwd m6, m5, [r6 + 2 * 16]
5531 paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4
5532 pmaddwd m5, [r6 + 3 * 16]
5533 paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end
; high 8 bytes of m0 -> row 1
5539 movhps [r2 + r3], m0
5542 punpcklwd m4, m5 ;m4=[8 9]
5543 pmaddwd m4, [r6 + 3 * 16]
5544 paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end
5547 movq m4, [r0 + 2 * r1]
5548 punpcklwd m5, m4 ;m5=[9 10]
5549 pmaddwd m5, [r6 + 3 * 16]
5550 paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end
; low 8 bytes of m2 -> row 2; high 8 bytes -> r2 + r5
; NOTE(review): r5 presumably holds 3 * r3 here; its setup is not shown -- confirm.
5555 movlps [r2 + 2 * r3], m2
5557 movhps [r2 + r5], m2
; r5 = 8 * r1 - 8
; NOTE(review): presumably a source-pointer adjustment; its consumer is not shown here -- confirm.
5559 lea r5, [8 * r1 - 2 * 4]
; down four rows; rewind both pointers by 2 * %1 bytes
5566 lea r0, [r0 + 4 * r1 - 2 * %1]
5567 lea r2, [r2 + 4 * r3 - 2 * %1]
; Instantiations
5575 FILTER_VER_LUMA_SS 4, 4
5576 FILTER_VER_LUMA_SS 8, 8
5577 FILTER_VER_LUMA_SS 8, 4
5578 FILTER_VER_LUMA_SS 4, 8
5579 FILTER_VER_LUMA_SS 16, 16
5580 FILTER_VER_LUMA_SS 16, 8
5581 FILTER_VER_LUMA_SS 8, 16
5582 FILTER_VER_LUMA_SS 16, 12
5583 FILTER_VER_LUMA_SS 12, 16
5584 FILTER_VER_LUMA_SS 16, 4
5585 FILTER_VER_LUMA_SS 4, 16
5586 FILTER_VER_LUMA_SS 32, 32
5587 FILTER_VER_LUMA_SS 32, 16
5588 FILTER_VER_LUMA_SS 16, 32
5589 FILTER_VER_LUMA_SS 32, 24
5590 FILTER_VER_LUMA_SS 24, 32
5591 FILTER_VER_LUMA_SS 32, 8
5592 FILTER_VER_LUMA_SS 8, 32
5593 FILTER_VER_LUMA_SS 64, 64
5594 FILTER_VER_LUMA_SS 64, 32
5595 FILTER_VER_LUMA_SS 32, 64
5596 FILTER_VER_LUMA_SS 64, 48
5597 FILTER_VER_LUMA_SS 48, 64
5598 FILTER_VER_LUMA_SS 64, 16
5599 FILTER_VER_LUMA_SS 16, 64