1 ;*****************************************************************************
2 ;* Copyright (C) 2013 x265 project
4 ;* Authors: Nabajit Deka <nabajit@multicorewareinc.com>
5 ;* Murugan Vairavel <murugan@multicorewareinc.com>
7 ;* This program is free software; you can redistribute it and/or modify
8 ;* it under the terms of the GNU General Public License as published by
9 ;* the Free Software Foundation; either version 2 of the License, or
10 ;* (at your option) any later version.
12 ;* This program is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ;* GNU General Public License for more details.
17 ;* You should have received a copy of the GNU General Public License
18 ;* along with this program; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21 ;* This program is also available under a commercial proprietary license.
22 ;* For more information, contact us at license @ x265.com.
23 ;*****************************************************************************/
26 %include "x86util.asm"
; Constant vectors; each definition below fills 16 bytes (4 dwords or 8 words).
30 tab_c_32: times 4 dd 32
31 tab_c_n32768: times 4 dd -32768
32 tab_c_524800: times 4 dd 524800
33 tab_c_n8192: times 8 dw -8192
; 16 byte indices: bytes 0-7 followed by bytes 2-9 (presumably a shuffle mask; its user is not visible in this excerpt).
35 tab_Tm16: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
; 4-tap chroma coefficients as 16-bit words.
; NOTE(review): only the first row is visible here; further rows appear to be elided from this excerpt.
37 tab_ChromaCoeff: dw 0, 64, 0, 0
; Chroma coefficients as a word pair repeated 4 times (16 bytes per entry); the vertical
; filters read entries at [base + 0 * 16] and [base + 1 * 16].
46 tab_ChromaCoeffV: times 4 dw 0, 64
; 8-tap luma coefficients: one row of eight 16-bit words per filter; every row sums to 64.
70 tab_LumaCoeff: dw 0, 0, 0, 64, 0, 0, 0, 0
71 dw -1, 4, -10, 58, 17, -5, 1, 0
72 dw -1, 4, -11, 40, 40, -11, 4, -1
73 dw 0, 1, -5, 17, 58, -10, 4, -1
; Luma coefficients in repeated word-pair form.
; NOTE(review): only the first entry is visible in this excerpt.
75 tab_LumaCoeffV: times 4 dw 0, 0
101 ;------------------------------------------------------------------------------------------------------------
102 ; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
103 ;------------------------------------------------------------------------------------------------------------
104 %macro FILTER_HOR_LUMA_W4 3
; Macro arguments, as consumed by the cglobal line: %1 and %2 form the "%1x%2" size in the
; symbol name, %3 is the variant suffix (pp or ps at the call sites).
106 cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
; NOTE(review): the lea and the direct mova below both address tab_LumaCoeff; presumably they
; belong to alternative (PIC / non-PIC) branches whose conditional lines are elided here.
115 lea r6, [tab_LumaCoeff]
118 mova m0, [tab_LumaCoeff + r4]
124 mova m7, [pw_pixel_max]
133 lea r6, [r1 + 2 * r1] ; r6 = 3 * r1
139 movu m2, [r0] ; m2 = src[0-7]
140 movu m3, [r0 + 16] ; m3 = src[8-15]
; palignr shifts the m3:m2 pair right by N bytes; pixels are 16-bit, so N = 2 * pixel offset.
143 palignr m5, m3, m2, 2 ; m5 = src[1-8]
147 palignr m5, m3, m2, 4 ; m5 = src[2-9]
149 palignr m3, m2, 6 ; m3 = src[3-10]
174 ;------------------------------------------------------------------------------------------------------------
175 ; void interp_8tap_horiz_pp_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
176 ;------------------------------------------------------------------------------------------------------------
177 FILTER_HOR_LUMA_W4 4, 4, pp
178 FILTER_HOR_LUMA_W4 4, 8, pp
179 FILTER_HOR_LUMA_W4 4, 16, pp
181 ;---------------------------------------------------------------------------------------------------------------------------
182 ; void interp_8tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
183 ;---------------------------------------------------------------------------------------------------------------------------
184 FILTER_HOR_LUMA_W4 4, 4, ps
185 FILTER_HOR_LUMA_W4 4, 8, ps
186 FILTER_HOR_LUMA_W4 4, 16, ps
188 ;------------------------------------------------------------------------------------------------------------
189 ; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
190 ;------------------------------------------------------------------------------------------------------------
191 %macro FILTER_HOR_LUMA_W8 3
; Macro arguments: %1 and %2 form the "%1x%2" size in the symbol name, %3 is the variant
; suffix (pp or ps at the call sites).
193 cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
; NOTE(review): the lea and the direct mova below presumably belong to alternative
; (PIC / non-PIC) branches whose conditional lines are elided from this excerpt.
202 lea r6, [tab_LumaCoeff]
205 mova m0, [tab_LumaCoeff + r4]
219 lea r6, [r1 + 2 * r1] ; r6 = 3 * r1
225 movu m2, [r0] ; m2 = src[0-7]
226 movu m3, [r0 + 16] ; m3 = src[8-15]
; Eight shifted windows of the m3:m2 pair (byte shift = 2 * pixel offset, pixels are 16-bit).
229 palignr m5, m3, m2, 2 ; m5 = src[1-8]
233 palignr m5, m3, m2, 4 ; m5 = src[2-9]
235 palignr m6, m3, m2, 6 ; m6 = src[3-10]
241 palignr m5, m3, m2, 8 ; m5 = src[4-11]
243 palignr m6, m3, m2, 10 ; m6 = src[5-12]
247 palignr m6, m3, m2, 12 ; m6 = src[6-13]
249 palignr m3, m2, 14 ; m3 = src[7-14]
; CLIPW comes from x86util.asm; presumably clamps the words of m4 between m7 and pw_pixel_max -- confirm against its definition.
258 CLIPW m4, m7, [pw_pixel_max]
275 ;------------------------------------------------------------------------------------------------------------
276 ; void interp_8tap_horiz_pp_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
277 ;------------------------------------------------------------------------------------------------------------
278 FILTER_HOR_LUMA_W8 8, 4, pp
279 FILTER_HOR_LUMA_W8 8, 8, pp
280 FILTER_HOR_LUMA_W8 8, 16, pp
281 FILTER_HOR_LUMA_W8 8, 32, pp
283 ;---------------------------------------------------------------------------------------------------------------------------
284 ; void interp_8tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
285 ;---------------------------------------------------------------------------------------------------------------------------
286 FILTER_HOR_LUMA_W8 8, 4, ps
287 FILTER_HOR_LUMA_W8 8, 8, ps
288 FILTER_HOR_LUMA_W8 8, 16, ps
289 FILTER_HOR_LUMA_W8 8, 32, ps
291 ;--------------------------------------------------------------------------------------------------------------
292 ; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
293 ;--------------------------------------------------------------------------------------------------------------
294 %macro FILTER_HOR_LUMA_W12 3
; Macro arguments: %1 and %2 form the "%1x%2" size in the symbol name, %3 is the variant
; suffix (pp or ps at the call sites).
296 cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
; NOTE(review): the lea and the direct mova below presumably belong to alternative
; (PIC / non-PIC) branches whose conditional lines are elided from this excerpt.
305 lea r6, [tab_LumaCoeff]
308 mova m0, [tab_LumaCoeff + r4]
320 lea r6, [r1 + 2 * r1] ; r6 = 3 * r1
; First column group: windows over src[0-15].
326 movu m2, [r0] ; m2 = src[0-7]
327 movu m3, [r0 + 16] ; m3 = src[8-15]
330 palignr m5, m3, m2, 2 ; m5 = src[1-8]
334 palignr m5, m3, m2, 4 ; m5 = src[2-9]
336 palignr m6, m3, m2, 6 ; m6 = src[3-10]
342 palignr m5, m3, m2, 8 ; m5 = src[4-11]
344 palignr m6, m3, m2, 10 ; m6 = src[5-12]
348 palignr m6, m3, m2, 12 ; m6 = src[6-13]
350 palignr m7, m3, m2, 14 ; m7 = src[7-14]
360 CLIPW m4, m5, [pw_pixel_max]
; Second column group: windows over src[8-23]; only offsets 0-3 are formed in the visible lines.
369 movu m2, [r0 + 32] ; m2 = src[16-23]
371 pmaddwd m4, m3, m0 ; m3 = src[8-15]
372 palignr m5, m2, m3, 2 ; m5 = src[9-16]
376 palignr m5, m2, m3, 4 ; m5 = src[10-17]
378 palignr m2, m3, 6 ; m2 = src[11-18]
387 CLIPW m4, m5, [pw_pixel_max]
403 ;-------------------------------------------------------------------------------------------------------------
404 ; void interp_8tap_horiz_pp_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
405 ;-------------------------------------------------------------------------------------------------------------
406 FILTER_HOR_LUMA_W12 12, 16, pp
408 ;----------------------------------------------------------------------------------------------------------------------------
409 ; void interp_8tap_horiz_ps_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
410 ;----------------------------------------------------------------------------------------------------------------------------
411 FILTER_HOR_LUMA_W12 12, 16, ps
413 ;--------------------------------------------------------------------------------------------------------------
414 ; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
415 ;--------------------------------------------------------------------------------------------------------------
416 %macro FILTER_HOR_LUMA_W16 3
; Macro arguments: %1 and %2 form the "%1x%2" size in the symbol name, %3 is the variant
; suffix (pp or ps at the call sites). The call sites use widths 16, 32, 48 and 64.
418 cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
; NOTE(review): the lea and the direct mova below presumably belong to alternative
; (PIC / non-PIC) branches whose conditional lines are elided from this excerpt.
427 lea r6, [tab_LumaCoeff]
430 mova m0, [tab_LumaCoeff + r4]
443 lea r6, [r1 + 2 * r1] ; r6 = 3 * r1
; x is an assembly-time byte offset added to every load/store address below.
; NOTE(review): its definition is not visible here; presumably it steps across 16-pixel column groups -- confirm.
; The src[...] indices in the comments are relative to that offset.
451 movu m2, [r0 + x] ; m2 = src[0-7]
452 movu m3, [r0 + 16 + x] ; m3 = src[8-15]
455 palignr m5, m3, m2, 2 ; m5 = src[1-8]
459 palignr m5, m3, m2, 4 ; m5 = src[2-9]
461 palignr m6, m3, m2, 6 ; m6 = src[3-10]
467 palignr m5, m3, m2, 8 ; m5 = src[4-11]
469 palignr m6, m3, m2, 10 ; m6 = src[5-12]
473 palignr m6, m3, m2, 12 ; m6 = src[6-13]
475 palignr m7, m3, m2, 14 ; m7 = src[7-14]
485 CLIPW m4, m5, [pw_pixel_max]
; Second half of the 16-pixel group: windows over src[8-23].
493 movu m2, [r0 + 32 + x] ; m2 = src[16-23]
495 pmaddwd m4, m3, m0 ; m3 = src[8-15]
496 palignr m5, m2, m3, 2 ; m5 = src[9-16]
500 palignr m5, m2, m3, 4 ; m5 = src[10-17]
502 palignr m6, m2, m3, 6 ; m6 = src[11-18]
508 palignr m5, m2, m3, 8 ; m5 = src[12-19]
510 palignr m6, m2, m3, 10 ; m6 = src[13-20]
514 palignr m6, m2, m3, 12 ; m6 = src[14-21]
516 palignr m2, m3, 14 ; m2 = src[15-22]
526 CLIPW m4, m5, [pw_pixel_max]
532 movu [r2 + 16 + x], m4 ; store at dst byte offset 16 + x
545 ;-------------------------------------------------------------------------------------------------------------
546 ; void interp_8tap_horiz_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
547 ;-------------------------------------------------------------------------------------------------------------
548 FILTER_HOR_LUMA_W16 16, 4, pp
549 FILTER_HOR_LUMA_W16 16, 8, pp
550 FILTER_HOR_LUMA_W16 16, 12, pp
551 FILTER_HOR_LUMA_W16 16, 16, pp
552 FILTER_HOR_LUMA_W16 16, 32, pp
553 FILTER_HOR_LUMA_W16 16, 64, pp
555 ;----------------------------------------------------------------------------------------------------------------------------
556 ; void interp_8tap_horiz_ps_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
557 ;----------------------------------------------------------------------------------------------------------------------------
558 FILTER_HOR_LUMA_W16 16, 4, ps
559 FILTER_HOR_LUMA_W16 16, 8, ps
560 FILTER_HOR_LUMA_W16 16, 12, ps
561 FILTER_HOR_LUMA_W16 16, 16, ps
562 FILTER_HOR_LUMA_W16 16, 32, ps
563 FILTER_HOR_LUMA_W16 16, 64, ps
565 ;-------------------------------------------------------------------------------------------------------------
566 ; void interp_8tap_horiz_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
567 ;-------------------------------------------------------------------------------------------------------------
568 FILTER_HOR_LUMA_W16 32, 8, pp
569 FILTER_HOR_LUMA_W16 32, 16, pp
570 FILTER_HOR_LUMA_W16 32, 24, pp
571 FILTER_HOR_LUMA_W16 32, 32, pp
572 FILTER_HOR_LUMA_W16 32, 64, pp
574 ;----------------------------------------------------------------------------------------------------------------------------
575 ; void interp_8tap_horiz_ps_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
576 ;----------------------------------------------------------------------------------------------------------------------------
577 FILTER_HOR_LUMA_W16 32, 8, ps
578 FILTER_HOR_LUMA_W16 32, 16, ps
579 FILTER_HOR_LUMA_W16 32, 24, ps
580 FILTER_HOR_LUMA_W16 32, 32, ps
581 FILTER_HOR_LUMA_W16 32, 64, ps
583 ;-------------------------------------------------------------------------------------------------------------
584 ; void interp_8tap_horiz_pp_48x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
585 ;-------------------------------------------------------------------------------------------------------------
586 FILTER_HOR_LUMA_W16 48, 64, pp
588 ;----------------------------------------------------------------------------------------------------------------------------
589 ; void interp_8tap_horiz_ps_48x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
590 ;----------------------------------------------------------------------------------------------------------------------------
591 FILTER_HOR_LUMA_W16 48, 64, ps
593 ;-------------------------------------------------------------------------------------------------------------
594 ; void interp_8tap_horiz_pp_64x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
595 ;-------------------------------------------------------------------------------------------------------------
596 FILTER_HOR_LUMA_W16 64, 16, pp
597 FILTER_HOR_LUMA_W16 64, 32, pp
598 FILTER_HOR_LUMA_W16 64, 48, pp
599 FILTER_HOR_LUMA_W16 64, 64, pp
601 ;----------------------------------------------------------------------------------------------------------------------------
602 ; void interp_8tap_horiz_ps_64x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
603 ;----------------------------------------------------------------------------------------------------------------------------
604 FILTER_HOR_LUMA_W16 64, 16, ps
605 FILTER_HOR_LUMA_W16 64, 32, ps
606 FILTER_HOR_LUMA_W16 64, 48, ps
607 FILTER_HOR_LUMA_W16 64, 64, ps
609 ;--------------------------------------------------------------------------------------------------------------
610 ; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
611 ;--------------------------------------------------------------------------------------------------------------
612 %macro FILTER_HOR_LUMA_W24 3
; Macro arguments: %1 and %2 form the "%1x%2" size in the symbol name, %3 is the variant
; suffix (pp or ps at the call sites).
614 cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
; NOTE(review): the lea and the direct mova below presumably belong to alternative
; (PIC / non-PIC) branches whose conditional lines are elided from this excerpt.
623 lea r6, [tab_LumaCoeff]
626 mova m0, [tab_LumaCoeff + r4]
638 lea r6, [r1 + 2 * r1] ; r6 = 3 * r1
; Column group 1: windows over src[0-15].
644 movu m2, [r0] ; m2 = src[0-7]
645 movu m3, [r0 + 16] ; m3 = src[8-15]
648 palignr m5, m3, m2, 2 ; m5 = src[1-8]
652 palignr m5, m3, m2, 4 ; m5 = src[2-9]
654 palignr m6, m3, m2, 6 ; m6 = src[3-10]
660 palignr m5, m3, m2, 8 ; m5 = src[4-11]
662 palignr m6, m3, m2, 10 ; m6 = src[5-12]
666 palignr m6, m3, m2, 12 ; m6 = src[6-13]
668 palignr m7, m3, m2, 14 ; m7 = src[7-14]
678 CLIPW m4, m5, [pw_pixel_max]
; Column group 2: windows over src[8-23] (low half m3, high half m2).
686 movu m2, [r0 + 32] ; m2 = src[16-23]
688 pmaddwd m4, m3, m0 ; m3 = src[8-15]
689 palignr m5, m2, m3, 2 ; m5 = src[9-16]
693 palignr m5, m2, m3, 4 ; m5 = src[10-17]
695 palignr m6, m2, m3, 6 ; m6 = src[11-18]
701 palignr m5, m2, m3, 8 ; m5 = src[12-19]
703 palignr m6, m2, m3, 10 ; m6 = src[13-20]
707 palignr m6, m2, m3, 12 ; m6 = src[14-21]
709 palignr m7, m2, m3, 14 ; m7 = src[15-22]
719 CLIPW m4, m5, [pw_pixel_max]
; Column group 3: windows over src[16-31] (low half m2, high half m3).
727 movu m3, [r0 + 48] ; m3 = src[24-31]
729 pmaddwd m4, m2, m0 ; m2 = src[16-23]
730 palignr m5, m3, m2, 2 ; m5 = src[17-24]
734 palignr m5, m3, m2, 4 ; m5 = src[18-25]
736 palignr m6, m3, m2, 6 ; m6 = src[19-26]
742 palignr m5, m3, m2, 8 ; m5 = src[20-27]
744 palignr m6, m3, m2, 10 ; m6 = src[21-28]
748 palignr m6, m3, m2, 12 ; m6 = src[22-29]
750 palignr m7, m3, m2, 14 ; m7 = src[23-30]
760 CLIPW m4, m5, [pw_pixel_max]
776 ;-------------------------------------------------------------------------------------------------------------
777 ; void interp_8tap_horiz_pp_24x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
778 ;-------------------------------------------------------------------------------------------------------------
779 FILTER_HOR_LUMA_W24 24, 32, pp
781 ;----------------------------------------------------------------------------------------------------------------------------
782 ; void interp_8tap_horiz_ps_24x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
783 ;----------------------------------------------------------------------------------------------------------------------------
784 FILTER_HOR_LUMA_W24 24, 32, ps
804 pextrd [r2 + r3], m3, 1
820 movu m4, [r0 + r1 + 4]
839 ;-----------------------------------------------------------------------------
840 ; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
841 ;-----------------------------------------------------------------------------
842 %macro FILTER_CHROMA_H 6
; Macro arguments, as consumed below:
;   %1, %2 = the "%1x%2" size in the symbol name
;   %3     = variant suffix (pp or ps at the call sites)
;   %4     = GPR count passed to cglobal
;   %5     = XMM register count passed to cglobal
;   %6     = index of the GPR that receives the tab_ChromaCoeff base address
844 cglobal interp_4tap_horiz_%3_%1x%2, 4, %4, %5
; NOTE(review): the lea + movh pair and the direct movh below load the same table entry;
; presumably they are PIC / non-PIC alternatives whose conditional lines are elided here.
853 lea r%6, [tab_ChromaCoeff]
854 movh m0, [r%6 + r4 * 4]
856 movh m0, [tab_ChromaCoeff + r4 * 4]
863 mova m1, [tab_c_n32768]
893 lea r0, [r0 + 2 * r1] ; r0 += 2 * r1
894 lea r2, [r2 + 2 * r3] ; r2 += 2 * r3
900 mova m6, [pw_pixel_max]
902 %endif ;%ifidn %3, ps
907 lea r0, [r0 + 2 * r1] ; r0 += 2 * r1
908 lea r2, [r2 + 2 * r3] ; r2 += 2 * r3
; FILTER_CHROMA_H arguments: width, height, variant, GPR count, XMM count, index of the GPR
; that holds the coefficient-table base.
915 FILTER_CHROMA_H 2, 4, pp, 6, 8, 5
916 FILTER_CHROMA_H 2, 8, pp, 6, 8, 5
917 FILTER_CHROMA_H 4, 2, pp, 6, 8, 5
918 FILTER_CHROMA_H 4, 4, pp, 6, 8, 5
919 FILTER_CHROMA_H 4, 8, pp, 6, 8, 5
920 FILTER_CHROMA_H 4, 16, pp, 6, 8, 5
922 FILTER_CHROMA_H 2, 4, ps, 7, 5, 6
923 FILTER_CHROMA_H 2, 8, ps, 7, 5, 6
924 FILTER_CHROMA_H 4, 2, ps, 7, 6, 6
925 FILTER_CHROMA_H 4, 4, ps, 7, 6, 6
926 FILTER_CHROMA_H 4, 8, ps, 7, 6, 6
927 FILTER_CHROMA_H 4, 16, ps, 7, 6, 6
929 FILTER_CHROMA_H 2, 16, pp, 6, 8, 5
930 FILTER_CHROMA_H 4, 32, pp, 6, 8, 5
931 FILTER_CHROMA_H 2, 16, ps, 7, 5, 6
932 FILTER_CHROMA_H 4, 32, ps, 7, 6, 6
961 pextrd [r2 + 8], m3, 2
964 cglobal chroma_filter_pp_6x1_internal
968 cglobal chroma_filter_ps_6x1_internal
1004 cglobal chroma_filter_pp_8x1_internal
1008 cglobal chroma_filter_ps_8x1_internal
1012 %macro FILTER_W12_1 1
1063 cglobal chroma_filter_pp_12x1_internal
1067 cglobal chroma_filter_ps_12x1_internal
1071 %macro FILTER_W16_1 1
1130 movhps [r2 + 24], m3
1133 cglobal chroma_filter_pp_16x1_internal
1137 cglobal chroma_filter_ps_16x1_internal
1141 %macro FILTER_W24_1 1
1200 movhps [r2 + 24], m3
1230 movhps [r2 + 40], m3
1233 cglobal chroma_filter_pp_24x1_internal
1237 cglobal chroma_filter_ps_24x1_internal
1241 %macro FILTER_W32_1 1
1300 movhps [r2 + 24], m3
1330 movhps [r2 + 40], m3
1360 movhps [r2 + 56], m3
1363 cglobal chroma_filter_pp_32x1_internal
1367 cglobal chroma_filter_ps_32x1_internal
1371 %macro FILTER_W8o_1 2
; %2 is added to every r0 load address and r2 store address in the lines visible here, so it
; acts as a byte offset; an invocation further down the file passes 112 for it.
; NOTE(review): %1 is not used in the visible lines -- its role cannot be confirmed from this excerpt.
1375 movu m4, [r0 + %2 + 4]
1381 movu m5, [r0 + %2 + 8]
1384 movu m4, [r0 + %2 + 12]
1400 movhps [r2 + %2 + 8], m3 ; store the high 8 bytes of m3
1403 %macro FILTER_W48_1 1
1412 cglobal chroma_filter_pp_48x1_internal
1416 cglobal chroma_filter_ps_48x1_internal
1420 %macro FILTER_W64_1 1
1428 FILTER_W8o_1 %1, 112
1431 cglobal chroma_filter_pp_64x1_internal
1435 cglobal chroma_filter_ps_64x1_internal
1440 ;-----------------------------------------------------------------------------
1441 ; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
1442 ;-----------------------------------------------------------------------------
1445 %macro IPFILTER_CHROMA 6
; Macro arguments, as consumed below (note: the order of the last three differs from FILTER_CHROMA_H):
;   %1, %2 = the "%1x%2" size in the symbol name; %1 also selects the chroma_filter_%3_%1x1_internal helper
;   %3     = variant suffix (pp or ps at the call sites)
;   %4     = index of the GPR that receives the tab_ChromaCoeff base address
;   %5     = GPR count passed to cglobal
;   %6     = XMM register count passed to cglobal
1446 cglobal interp_4tap_horiz_%3_%1x%2, 4, %5, %6
; NOTE(review): the lea + movh pair and the direct movh below load the same table entry;
; presumably they are PIC / non-PIC alternatives whose conditional lines are elided here.
1455 lea r%4, [tab_ChromaCoeff]
1456 movh m0, [r%4 + r4 * 4]
1458 movh m0, [tab_ChromaCoeff + r4 * 4]
1465 mova m1, [tab_c_n32768]
; The per-width helper is called several times. NOTE(review): the surrounding loop/conditional
; lines are not visible, so which calls belong to which variant cannot be confirmed from here.
1469 call chroma_filter_%3_%1x1_internal
1472 call chroma_filter_%3_%1x1_internal
1475 call chroma_filter_%3_%1x1_internal
1482 mova m7, [pw_pixel_max]
1485 call chroma_filter_%3_%1x1_internal
1489 call chroma_filter_%3_%1x1_internal
; IPFILTER_CHROMA arguments: width, height, variant, index of the GPR that holds the
; coefficient-table base, GPR count, XMM count.
1493 IPFILTER_CHROMA 6, 8, pp, 5, 6, 8
1494 IPFILTER_CHROMA 8, 2, pp, 5, 6, 8
1495 IPFILTER_CHROMA 8, 4, pp, 5, 6, 8
1496 IPFILTER_CHROMA 8, 6, pp, 5, 6, 8
1497 IPFILTER_CHROMA 8, 8, pp, 5, 6, 8
1498 IPFILTER_CHROMA 8, 16, pp, 5, 6, 8
1499 IPFILTER_CHROMA 8, 32, pp, 5, 6, 8
1500 IPFILTER_CHROMA 12, 16, pp, 5, 6, 8
1501 IPFILTER_CHROMA 16, 4, pp, 5, 6, 8
1502 IPFILTER_CHROMA 16, 8, pp, 5, 6, 8
1503 IPFILTER_CHROMA 16, 12, pp, 5, 6, 8
1504 IPFILTER_CHROMA 16, 16, pp, 5, 6, 8
1505 IPFILTER_CHROMA 16, 32, pp, 5, 6, 8
1506 IPFILTER_CHROMA 24, 32, pp, 5, 6, 8
1507 IPFILTER_CHROMA 32, 8, pp, 5, 6, 8
1508 IPFILTER_CHROMA 32, 16, pp, 5, 6, 8
1509 IPFILTER_CHROMA 32, 24, pp, 5, 6, 8
1510 IPFILTER_CHROMA 32, 32, pp, 5, 6, 8
1512 IPFILTER_CHROMA 6, 8, ps, 6, 7, 6
1513 IPFILTER_CHROMA 8, 2, ps, 6, 7, 6
1514 IPFILTER_CHROMA 8, 4, ps, 6, 7, 6
1515 IPFILTER_CHROMA 8, 6, ps, 6, 7, 6
1516 IPFILTER_CHROMA 8, 8, ps, 6, 7, 6
1517 IPFILTER_CHROMA 8, 16, ps, 6, 7, 6
1518 IPFILTER_CHROMA 8, 32, ps, 6, 7, 6
1519 IPFILTER_CHROMA 12, 16, ps, 6, 7, 6
1520 IPFILTER_CHROMA 16, 4, ps, 6, 7, 6
1521 IPFILTER_CHROMA 16, 8, ps, 6, 7, 6
1522 IPFILTER_CHROMA 16, 12, ps, 6, 7, 6
1523 IPFILTER_CHROMA 16, 16, ps, 6, 7, 6
1524 IPFILTER_CHROMA 16, 32, ps, 6, 7, 6
1525 IPFILTER_CHROMA 24, 32, ps, 6, 7, 6
1526 IPFILTER_CHROMA 32, 8, ps, 6, 7, 6
1527 IPFILTER_CHROMA 32, 16, ps, 6, 7, 6
1528 IPFILTER_CHROMA 32, 24, ps, 6, 7, 6
1529 IPFILTER_CHROMA 32, 32, ps, 6, 7, 6
1531 IPFILTER_CHROMA 6, 16, pp, 5, 6, 8
1532 IPFILTER_CHROMA 8, 12, pp, 5, 6, 8
1533 IPFILTER_CHROMA 8, 64, pp, 5, 6, 8
1534 IPFILTER_CHROMA 12, 32, pp, 5, 6, 8
1535 IPFILTER_CHROMA 16, 24, pp, 5, 6, 8
1536 IPFILTER_CHROMA 16, 64, pp, 5, 6, 8
1537 IPFILTER_CHROMA 24, 64, pp, 5, 6, 8
1538 IPFILTER_CHROMA 32, 48, pp, 5, 6, 8
1539 IPFILTER_CHROMA 32, 64, pp, 5, 6, 8
1540 IPFILTER_CHROMA 6, 16, ps, 6, 7, 6
1541 IPFILTER_CHROMA 8, 12, ps, 6, 7, 6
1542 IPFILTER_CHROMA 8, 64, ps, 6, 7, 6
1543 IPFILTER_CHROMA 12, 32, ps, 6, 7, 6
1544 IPFILTER_CHROMA 16, 24, ps, 6, 7, 6
1545 IPFILTER_CHROMA 16, 64, ps, 6, 7, 6
1546 IPFILTER_CHROMA 24, 64, ps, 6, 7, 6
1547 IPFILTER_CHROMA 32, 48, ps, 6, 7, 6
1548 IPFILTER_CHROMA 32, 64, ps, 6, 7, 6
1550 IPFILTER_CHROMA 48, 64, pp, 5, 6, 8
1551 IPFILTER_CHROMA 64, 48, pp, 5, 6, 8
1552 IPFILTER_CHROMA 64, 64, pp, 5, 6, 8
1553 IPFILTER_CHROMA 64, 32, pp, 5, 6, 8
1554 IPFILTER_CHROMA 64, 16, pp, 5, 6, 8
1555 IPFILTER_CHROMA 48, 64, ps, 6, 7, 6
1556 IPFILTER_CHROMA 64, 48, ps, 6, 7, 6
1557 IPFILTER_CHROMA 64, 64, ps, 6, 7, 6
1558 IPFILTER_CHROMA 64, 32, ps, 6, 7, 6
1559 IPFILTER_CHROMA 64, 16, ps, 6, 7, 6
1562 %macro PROCESS_CHROMA_SP_W4_4R 0
; Accumulates the 4-tap vertical sums for four output rows of a 4-wide column.
; Reads: r0 (source pointer, advanced by 2 * r1 twice), r1 (row step), r6 (coefficient base:
; first tap pair at [r6 + 0 * 16], second tap pair at [r6 + 1 * 16]).
; Results: m0..m3 hold the dword sums for rows 1..4. m4 and m5 are scratch.
; NOTE(review): the row loads between the visible lines are elided from this excerpt.
1565 punpcklwd m0, m1 ;m0=[0 1]
1566 pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
1568 lea r0, [r0 + 2 * r1]
1570 punpcklwd m1, m4 ;m1=[1 2]
1571 pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
1574 punpcklwd m4, m5 ;m4=[2 3]
1575 pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
1576 pmaddwd m4, [r6 + 1 * 16]
1577 paddd m0, m4 ;m0=[0+1+2+3] Row1 done
1579 lea r0, [r0 + 2 * r1]
1581 punpcklwd m5, m4 ;m5=[3 4]
1582 pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
1583 pmaddwd m5, [r6 + 1 * 16]
1584 paddd m1, m5 ;m1 = [1+2+3+4] Row2
1587 punpcklwd m4, m5 ;m4=[4 5]
1588 pmaddwd m4, [r6 + 1 * 16]
1589 paddd m2, m4 ;m2=[2+3+4+5] Row3
1591 movq m4, [r0 + 2 * r1]
1592 punpcklwd m5, m4 ;m5=[5 6]
1593 pmaddwd m5, [r6 + 1 * 16]
1594 paddd m3, m5 ;m3=[3+4+5+6] Row4
1597 ;-----------------------------------------------------------------------------------------------------------------
1598 ; void interp_4tap_vert_%3_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
1599 ;-----------------------------------------------------------------------------------------------------------------
1600 %macro FILTER_VER_CHROMA_SS 4
; Macro arguments: %1, %2 = the "%1x%2" size in the symbol name, %3 = variant suffix
; (ss, ps, sp or pp at the call sites), %4 = XMM register count passed to cglobal.
; The trailing "0-gprsize" on the cglobal line requests one GPR-sized stack slot.
1602 cglobal interp_4tap_vert_%3_%1x%2, 5, 7, %4 ,0-gprsize
; NOTE(review): the two lea lines below presumably belong to alternative (PIC / non-PIC)
; branches whose conditional lines are elided; r6 is the coefficient base used by PROCESS_CHROMA_SP_W4_4R.
1610 lea r5, [tab_ChromaCoeffV]
1613 lea r6, [tab_ChromaCoeffV + r4]
; Stack slot = %2 / 4 (presumably the count of 4-row groups; the loop consuming it is not visible here).
1616 mov dword [rsp], %2/4
1620 mova m7, [pw_pixel_max]
; NOTE(review): m6 is loaded from two different constants; presumably each load sits in a
; variant-specific branch whose %if lines are elided from this excerpt.
1624 mova m6, [tab_c_524800]
1627 mova m6, [tab_c_n32768]
1634 PROCESS_CHROMA_SP_W4_4R
; CLIPW2 comes from x86util.asm; presumably clamps m0 and m2 between m5 and m7 -- confirm against its definition.
1675 CLIPW2 m0, m2, m5, m7
1679 movhps [r2 + r3], m0 ; high 8 bytes of m0 -> row at r2 + r3
1680 lea r5, [r2 + 2 * r3]
1682 movhps [r5 + r3], m2 ; high 8 bytes of m2 -> row at r2 + 3 * r3
1684 lea r5, [4 * r1 - 2 * 4]
1691 lea r0, [r0 + 4 * r1 - 2 * %1]
1692 lea r2, [r2 + 4 * r3 - 2 * %1]
; FILTER_VER_CHROMA_SS arguments: width, height, variant (ss / ps / sp / pp), XMM register count.
1700 FILTER_VER_CHROMA_SS 4, 4, ss, 6
1701 FILTER_VER_CHROMA_SS 4, 8, ss, 6
1702 FILTER_VER_CHROMA_SS 16, 16, ss, 6
1703 FILTER_VER_CHROMA_SS 16, 8, ss, 6
1704 FILTER_VER_CHROMA_SS 16, 12, ss, 6
1705 FILTER_VER_CHROMA_SS 12, 16, ss, 6
1706 FILTER_VER_CHROMA_SS 16, 4, ss, 6
1707 FILTER_VER_CHROMA_SS 4, 16, ss, 6
1708 FILTER_VER_CHROMA_SS 32, 32, ss, 6
1709 FILTER_VER_CHROMA_SS 32, 16, ss, 6
1710 FILTER_VER_CHROMA_SS 16, 32, ss, 6
1711 FILTER_VER_CHROMA_SS 32, 24, ss, 6
1712 FILTER_VER_CHROMA_SS 24, 32, ss, 6
1713 FILTER_VER_CHROMA_SS 32, 8, ss, 6
1715 FILTER_VER_CHROMA_SS 4, 4, ps, 7
1716 FILTER_VER_CHROMA_SS 4, 8, ps, 7
1717 FILTER_VER_CHROMA_SS 16, 16, ps, 7
1718 FILTER_VER_CHROMA_SS 16, 8, ps, 7
1719 FILTER_VER_CHROMA_SS 16, 12, ps, 7
1720 FILTER_VER_CHROMA_SS 12, 16, ps, 7
1721 FILTER_VER_CHROMA_SS 16, 4, ps, 7
1722 FILTER_VER_CHROMA_SS 4, 16, ps, 7
1723 FILTER_VER_CHROMA_SS 32, 32, ps, 7
1724 FILTER_VER_CHROMA_SS 32, 16, ps, 7
1725 FILTER_VER_CHROMA_SS 16, 32, ps, 7
1726 FILTER_VER_CHROMA_SS 32, 24, ps, 7
1727 FILTER_VER_CHROMA_SS 24, 32, ps, 7
1728 FILTER_VER_CHROMA_SS 32, 8, ps, 7
1730 FILTER_VER_CHROMA_SS 4, 4, sp, 8
1731 FILTER_VER_CHROMA_SS 4, 8, sp, 8
1732 FILTER_VER_CHROMA_SS 16, 16, sp, 8
1733 FILTER_VER_CHROMA_SS 16, 8, sp, 8
1734 FILTER_VER_CHROMA_SS 16, 12, sp, 8
1735 FILTER_VER_CHROMA_SS 12, 16, sp, 8
1736 FILTER_VER_CHROMA_SS 16, 4, sp, 8
1737 FILTER_VER_CHROMA_SS 4, 16, sp, 8
1738 FILTER_VER_CHROMA_SS 32, 32, sp, 8
1739 FILTER_VER_CHROMA_SS 32, 16, sp, 8
1740 FILTER_VER_CHROMA_SS 16, 32, sp, 8
1741 FILTER_VER_CHROMA_SS 32, 24, sp, 8
1742 FILTER_VER_CHROMA_SS 24, 32, sp, 8
1743 FILTER_VER_CHROMA_SS 32, 8, sp, 8
1745 FILTER_VER_CHROMA_SS 4, 4, pp, 8
1746 FILTER_VER_CHROMA_SS 4, 8, pp, 8
1747 FILTER_VER_CHROMA_SS 16, 16, pp, 8
1748 FILTER_VER_CHROMA_SS 16, 8, pp, 8
1749 FILTER_VER_CHROMA_SS 16, 12, pp, 8
1750 FILTER_VER_CHROMA_SS 12, 16, pp, 8
1751 FILTER_VER_CHROMA_SS 16, 4, pp, 8
1752 FILTER_VER_CHROMA_SS 4, 16, pp, 8
1753 FILTER_VER_CHROMA_SS 32, 32, pp, 8
1754 FILTER_VER_CHROMA_SS 32, 16, pp, 8
1755 FILTER_VER_CHROMA_SS 16, 32, pp, 8
1756 FILTER_VER_CHROMA_SS 32, 24, pp, 8
1757 FILTER_VER_CHROMA_SS 24, 32, pp, 8
1758 FILTER_VER_CHROMA_SS 32, 8, pp, 8
1761 FILTER_VER_CHROMA_SS 16, 24, ss, 6
1762 FILTER_VER_CHROMA_SS 12, 32, ss, 6
1763 FILTER_VER_CHROMA_SS 4, 32, ss, 6
1764 FILTER_VER_CHROMA_SS 32, 64, ss, 6
1765 FILTER_VER_CHROMA_SS 16, 64, ss, 6
1766 FILTER_VER_CHROMA_SS 32, 48, ss, 6
1767 FILTER_VER_CHROMA_SS 24, 64, ss, 6
1769 FILTER_VER_CHROMA_SS 16, 24, ps, 7
1770 FILTER_VER_CHROMA_SS 12, 32, ps, 7
1771 FILTER_VER_CHROMA_SS 4, 32, ps, 7
1772 FILTER_VER_CHROMA_SS 32, 64, ps, 7
1773 FILTER_VER_CHROMA_SS 16, 64, ps, 7
1774 FILTER_VER_CHROMA_SS 32, 48, ps, 7
1775 FILTER_VER_CHROMA_SS 24, 64, ps, 7
1777 FILTER_VER_CHROMA_SS 16, 24, sp, 8
1778 FILTER_VER_CHROMA_SS 12, 32, sp, 8
1779 FILTER_VER_CHROMA_SS 4, 32, sp, 8
1780 FILTER_VER_CHROMA_SS 32, 64, sp, 8
1781 FILTER_VER_CHROMA_SS 16, 64, sp, 8
1782 FILTER_VER_CHROMA_SS 32, 48, sp, 8
1783 FILTER_VER_CHROMA_SS 24, 64, sp, 8
1785 FILTER_VER_CHROMA_SS 16, 24, pp, 8
1786 FILTER_VER_CHROMA_SS 12, 32, pp, 8
1787 FILTER_VER_CHROMA_SS 4, 32, pp, 8
1788 FILTER_VER_CHROMA_SS 32, 64, pp, 8
1789 FILTER_VER_CHROMA_SS 16, 64, pp, 8
1790 FILTER_VER_CHROMA_SS 32, 48, pp, 8
1791 FILTER_VER_CHROMA_SS 24, 64, pp, 8
1794 FILTER_VER_CHROMA_SS 48, 64, ss, 6
1795 FILTER_VER_CHROMA_SS 64, 48, ss, 6
1796 FILTER_VER_CHROMA_SS 64, 64, ss, 6
1797 FILTER_VER_CHROMA_SS 64, 32, ss, 6
1798 FILTER_VER_CHROMA_SS 64, 16, ss, 6
1800 FILTER_VER_CHROMA_SS 48, 64, ps, 7
1801 FILTER_VER_CHROMA_SS 64, 48, ps, 7
1802 FILTER_VER_CHROMA_SS 64, 64, ps, 7
1803 FILTER_VER_CHROMA_SS 64, 32, ps, 7
1804 FILTER_VER_CHROMA_SS 64, 16, ps, 7
1806 FILTER_VER_CHROMA_SS 48, 64, sp, 8
1807 FILTER_VER_CHROMA_SS 64, 48, sp, 8
1808 FILTER_VER_CHROMA_SS 64, 64, sp, 8
1809 FILTER_VER_CHROMA_SS 64, 32, sp, 8
1810 FILTER_VER_CHROMA_SS 64, 16, sp, 8
1812 FILTER_VER_CHROMA_SS 48, 64, pp, 8
1813 FILTER_VER_CHROMA_SS 64, 48, pp, 8
1814 FILTER_VER_CHROMA_SS 64, 64, pp, 8
1815 FILTER_VER_CHROMA_SS 64, 32, pp, 8
1816 FILTER_VER_CHROMA_SS 64, 16, pp, 8
1819 %macro PROCESS_CHROMA_SP_W2_4R 1
; Accumulates the 4-tap vertical sums for four output rows of a 2-wide column, two rows per register.
; %1 = register holding the coefficient base (first tap pair at [%1 + 0 * 16], second at [%1 + 1 * 16]).
; Reads r0 (source pointer, advanced by 2 * r1 twice) and r1 (row step).
; Results: m0 = rows 1-2, m2 = rows 3-4. m1, m3 and m4 are scratch.
; NOTE(review): the row loads between the visible lines are elided from this excerpt.
1822 punpcklwd m0, m1 ;m0=[0 1]
1824 lea r0, [r0 + 2 * r1]
1826 punpcklwd m1, m2 ;m1=[1 2]
1827 punpcklqdq m0, m1 ;m0=[0 1 1 2]
1828 pmaddwd m0, [%1 + 0 *16] ;m0=[0+1 1+2] Row 1-2
1831 punpcklwd m2, m1 ;m2=[2 3]
1833 lea r0, [r0 + 2 * r1]
1835 punpcklwd m1, m3 ;m1=[3 4]
1836 punpcklqdq m2, m1 ;m2=[2 3 3 4]
1838 pmaddwd m4, m2, [%1 + 1 * 16] ;m4=[2+3 3+4] Row 1-2
1839 pmaddwd m2, [%1 + 0 * 16] ;m2=[2+3 3+4] Row 3-4
1840 paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2
1843 punpcklwd m3, m1 ;m3=[4 5]
1845 movd m4, [r0 + 2 * r1]
1846 punpcklwd m1, m4 ;m1=[5 6]
1847 punpcklqdq m3, m1 ;m3=[4 5 5 6]
1848 pmaddwd m3, [%1 + 1 * 16] ;m3=[4+5 5+6] Row 3-4
1849 paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4
1852 ;---------------------------------------------------------------------------------------------------------------------
1853 ; void interp_4tap_vert_%2_2x%1(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
1854 ;---------------------------------------------------------------------------------------------------------------------
1855 %macro FILTER_VER_CHROMA_W2 3
; Macro arguments: %1 = height in the "2x%1" symbol name, %2 = variant suffix
; (ss, pp, ps or sp at the call sites), %3 = XMM register count passed to cglobal.
1857 cglobal interp_4tap_vert_%2_2x%1, 5, 6, %3
; NOTE(review): the two lea lines below presumably belong to alternative (PIC / non-PIC)
; branches whose conditional lines are elided from this excerpt.
1865 lea r5, [tab_ChromaCoeffV]
1868 lea r5, [tab_ChromaCoeffV + r4]
1875 mova m6, [pw_pixel_max]
; NOTE(review): m5 is loaded from two different constants; presumably each load sits in a
; variant-specific branch whose %if lines are elided.
1879 mova m5, [tab_c_524800]
1882 mova m5, [tab_c_n32768]
1887 PROCESS_CHROMA_SP_W2_4R r5
1913 pextrd [r2 + r3], m0, 1 ; dword 1 of m0 -> row at r2 + r3
1914 lea r2, [r2 + 2 * r3]
1916 pextrd [r2 + r3], m0, 3 ; dword 3 of m0 -> row at r2 + r3
1918 lea r2, [r2 + 2 * r3]
; FILTER_VER_CHROMA_W2 arguments: height, variant (ss / pp / ps / sp), XMM register count.
1926 FILTER_VER_CHROMA_W2 4, ss, 5
1927 FILTER_VER_CHROMA_W2 8, ss, 5
1929 FILTER_VER_CHROMA_W2 4, pp, 8
1930 FILTER_VER_CHROMA_W2 8, pp, 8
1932 FILTER_VER_CHROMA_W2 4, ps, 6
1933 FILTER_VER_CHROMA_W2 8, ps, 6
1935 FILTER_VER_CHROMA_W2 4, sp, 8
1936 FILTER_VER_CHROMA_W2 8, sp, 8
1938 FILTER_VER_CHROMA_W2 16, ss, 5
1939 FILTER_VER_CHROMA_W2 16, pp, 8
1940 FILTER_VER_CHROMA_W2 16, ps, 6
1941 FILTER_VER_CHROMA_W2 16, sp, 8
1944 ;---------------------------------------------------------------------------------------------------------------
1945 ; void interp_4tap_vert_%2_4x%1(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
1946 ;---------------------------------------------------------------------------------------------------------------
1947 %macro FILTER_VER_CHROMA_W4 3
; Macro arguments: %1 = height in the "4x%1" symbol name, %2 = variant suffix
; (ss, pp, ps or sp at the call sites), %3 = XMM register count passed to cglobal.
1949 cglobal interp_4tap_vert_%2_4x%1, 5, 6, %3
; NOTE(review): the two lea lines below presumably belong to alternative (PIC / non-PIC)
; branches whose conditional lines are elided from this excerpt.
1957 lea r5, [tab_ChromaCoeffV]
1960 lea r5, [tab_ChromaCoeffV + r4]
1970 mova m5, [pw_pixel_max]
; NOTE(review): m4 is loaded from two different constants; presumably each load sits in a
; variant-specific branch whose %if lines are elided.
1974 mova m4, [tab_c_524800]
1977 mova m4, [tab_c_n32768]
; Two output rows per pass: first tap pair at [r5 + 0 * 16], second tap pair at [r5 + 1 * 16].
1987 punpcklwd m0, m1 ;m0=[0 1]
1988 pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1
1990 lea r0, [r0 + 2 * r1]
1992 punpcklwd m1, m2 ;m1=[1 2]
1993 pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2
1996 punpcklwd m2, m3 ;m2=[2 3]
1997 pmaddwd m2, [r5 + 1 * 16]
1998 paddd m0, m2 ;m0=[0+1+2+3] Row1 done
2000 movh m2, [r0 + 2 * r1]
2001 punpcklwd m3, m2 ;m3=[3 4]
2002 pmaddwd m3, [r5 + 1 * 16]
2003 paddd m1, m3 ;m1=[1+2+3+4] Row2 done
2030 movhps [r2 + r3], m0 ; high 8 bytes of m0 -> row at r2 + r3
2033 lea r2, [r2 + r3 * 2]
; FILTER_VER_CHROMA_W4 arguments: height, variant (ss / pp / ps / sp), XMM register count.
2041 FILTER_VER_CHROMA_W4 2, ss, 4
2042 FILTER_VER_CHROMA_W4 2, pp, 7
2043 FILTER_VER_CHROMA_W4 2, ps, 5
2044 FILTER_VER_CHROMA_W4 2, sp, 7
2046 FILTER_VER_CHROMA_W4 4, ss, 4
2047 FILTER_VER_CHROMA_W4 4, pp, 7
2048 FILTER_VER_CHROMA_W4 4, ps, 5
2049 FILTER_VER_CHROMA_W4 4, sp, 7
2051 ;-------------------------------------------------------------------------------------------------------------------
2052 ; void interp_4tap_vert_%2_6x%1(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2053 ;-------------------------------------------------------------------------------------------------------------------
2054 %macro FILTER_VER_CHROMA_W6 3
; Macro arguments: %1 = height in the "6x%1" symbol name, %2 = variant suffix
; (ss, ps, sp or pp at the call sites), %3 = XMM register count passed to cglobal.
; The 6-wide block is handled as a 4-wide part (PROCESS_CHROMA_SP_W4_4R) followed by a
; 2-wide part (PROCESS_CHROMA_SP_W2_4R), both using r6 as the coefficient base.
2056 cglobal interp_4tap_vert_%2_6x%1, 5, 7, %3
; NOTE(review): the two lea lines below presumably belong to alternative (PIC / non-PIC)
; branches whose conditional lines are elided from this excerpt.
2064 lea r5, [tab_ChromaCoeffV]
2067 lea r6, [tab_ChromaCoeffV + r4]
2074 mova m7, [pw_pixel_max]
; NOTE(review): m6 is loaded from two different constants; presumably each load sits in a
; variant-specific branch whose %if lines are elided.
2078 mova m6, [tab_c_524800]
2081 mova m6, [tab_c_n32768]
2086 PROCESS_CHROMA_SP_W4_4R
2127 CLIPW2 m0, m2, m5, m7
2131 movhps [r2 + r3], m0 ; high 8 bytes of m0 -> row at r2 + r3
2132 lea r5, [r2 + 2 * r3]
2134 movhps [r5 + r3], m2 ; high 8 bytes of m2 -> row at r2 + 3 * r3
2136 lea r5, [4 * r1 - 2 * 4]
2140 PROCESS_CHROMA_SP_W2_4R r6
2167 pextrd [r2 + r3], m0, 1 ; dword 1 of m0 -> row at r2 + r3
2168 lea r2, [r2 + 2 * r3]
2170 pextrd [r2 + r3], m0, 3 ; dword 3 of m0 -> row at r2 + r3
2173 lea r2, [r2 + 2 * r3 - 2 * 4] ; next row pair, stepping back 8 bytes
; FILTER_VER_CHROMA_W6 arguments: height, variant (ss / ps / sp / pp), XMM register count.
2181 FILTER_VER_CHROMA_W6 8, ss, 6
2182 FILTER_VER_CHROMA_W6 8, ps, 7
2183 FILTER_VER_CHROMA_W6 8, sp, 8
2184 FILTER_VER_CHROMA_W6 8, pp, 8
2186 FILTER_VER_CHROMA_W6 16, ss, 6
2187 FILTER_VER_CHROMA_W6 16, ps, 7
2188 FILTER_VER_CHROMA_W6 16, sp, 8
2189 FILTER_VER_CHROMA_W6 16, pp, 8
2191 %macro PROCESS_CHROMA_SP_W8_2R 0
; Accumulates the 4-tap vertical sums for two output rows of an 8-wide column.
; Reads r0 (source pointer, advanced by 2 * r1 once), r1 (row step) and r5 (coefficient base:
; first tap pair at [r5 + 0 * 16], second tap pair at [r5 + 1 * 16]).
; Results: m0/m1 = row 1 low/high halves, m2/m3 = row 2 low/high halves. m4..m6 are scratch.
; NOTE(review): the loads and high-half unpacks between the visible lines are elided from this excerpt.
2194 punpcklwd m0, m1, m3
2195 pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l
2197 pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h
2199 movu m4, [r0 + 2 * r1]
2200 punpcklwd m2, m3, m4
2201 pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l
2203 pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h
2205 lea r0, [r0 + 2 * r1]
2207 punpcklwd m6, m4, m5
2208 pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l
2209 paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum
2211 pmaddwd m4, [r5 + 1 * 16] ;m4 = [2h+3h] Row1h
2212 paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum
2214 movu m4, [r0 + 2 * r1]
2215 punpcklwd m6, m5, m4
2216 pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l
2217 paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum
2219 pmaddwd m5, [r5 + 1 * 16] ;m5 = [3h+4h] Row2h
2220 paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum
2223 ;----------------------------------------------------------------------------------------------------------------
2224 ; void interp_4tap_vert_%3_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2225 ;----------------------------------------------------------------------------------------------------------------
; FILTER_VER_CHROMA_W8 width, height, variant, xmm_count
;   %1 = block width and %2 = block height, both used in the symbol name (%1x%2)
;   %3 = variant tag used in the symbol name (instantiated with ss / sp / ps / pp)
;   %4 = forwarded as the last cglobal argument (XMM register count in x86inc)
; Emits one 4-tap vertical chroma filter entry point built on PROCESS_CHROMA_SP_W8_2R.
2226 %macro FILTER_VER_CHROMA_W8 4
; x86inc cglobal: 5 arguments placed in r0-r4, 6 general-purpose registers, %4 XMM registers.
2228 cglobal interp_4tap_vert_%3_%1x%2, 5, 6, %4
; r5 = tab_ChromaCoeffV + r4; PROCESS_CHROMA_SP_W8_2R reads its coefficient pairs through r5.
; NOTE(review): the two lea forms look like alternative branches of a conditional whose
; directives are not shown in this span - confirm.
2236 lea r5, [tab_ChromaCoeffV]
2239 lea r5, [tab_ChromaCoeffV + r4]
; m7 is loaded from one of two constants.
; NOTE(review): presumably chosen per variant (%3) by conditionals not shown in this span - confirm.
2247 mova m7, [tab_c_524800]
2249 mova m7, [tab_c_n32768]
; Leaves the two output rows' dword sums in m0/m1 and m2/m3.
2253 PROCESS_CHROMA_SP_W8_2R
; m6 = pw_pixel_max; clamp the packed words of m0 and m2 with m5 and m6 as the bounds.
2294 mova m6, [pw_pixel_max]
2295 CLIPW2 m0, m2, m5, m6
; Advance r2 by 2*r3.
2301 lea r2, [r2 + 2 * r3]
; Instantiate FILTER_VER_CHROMA_W8 as (width, height, variant, XMM register count).
; Width is always 8; heights 2, 4, 6, 8, 16 and 32 come first for each variant.
2309 FILTER_VER_CHROMA_W8 8, 2, ss, 7
2310 FILTER_VER_CHROMA_W8 8, 4, ss, 7
2311 FILTER_VER_CHROMA_W8 8, 6, ss, 7
2312 FILTER_VER_CHROMA_W8 8, 8, ss, 7
2313 FILTER_VER_CHROMA_W8 8, 16, ss, 7
2314 FILTER_VER_CHROMA_W8 8, 32, ss, 7
2316 FILTER_VER_CHROMA_W8 8, 2, sp, 8
2317 FILTER_VER_CHROMA_W8 8, 4, sp, 8
2318 FILTER_VER_CHROMA_W8 8, 6, sp, 8
2319 FILTER_VER_CHROMA_W8 8, 8, sp, 8
2320 FILTER_VER_CHROMA_W8 8, 16, sp, 8
2321 FILTER_VER_CHROMA_W8 8, 32, sp, 8
2323 FILTER_VER_CHROMA_W8 8, 2, ps, 8
2324 FILTER_VER_CHROMA_W8 8, 4, ps, 8
2325 FILTER_VER_CHROMA_W8 8, 6, ps, 8
2326 FILTER_VER_CHROMA_W8 8, 8, ps, 8
2327 FILTER_VER_CHROMA_W8 8, 16, ps, 8
2328 FILTER_VER_CHROMA_W8 8, 32, ps, 8
2330 FILTER_VER_CHROMA_W8 8, 2, pp, 8
2331 FILTER_VER_CHROMA_W8 8, 4, pp, 8
2332 FILTER_VER_CHROMA_W8 8, 6, pp, 8
2333 FILTER_VER_CHROMA_W8 8, 8, pp, 8
2334 FILTER_VER_CHROMA_W8 8, 16, pp, 8
2335 FILTER_VER_CHROMA_W8 8, 32, pp, 8
; Additional heights 12 and 64 for every variant.
2337 FILTER_VER_CHROMA_W8 8, 12, ss, 7
2338 FILTER_VER_CHROMA_W8 8, 64, ss, 7
2339 FILTER_VER_CHROMA_W8 8, 12, sp, 8
2340 FILTER_VER_CHROMA_W8 8, 64, sp, 8
2341 FILTER_VER_CHROMA_W8 8, 12, ps, 8
2342 FILTER_VER_CHROMA_W8 8, 64, ps, 8
2343 FILTER_VER_CHROMA_W8 8, 12, pp, 8
2344 FILTER_VER_CHROMA_W8 8, 64, pp, 8
; chroma_p2s
; x86inc cglobal: 3 arguments placed in r0-r2, 7 general-purpose registers, 3 XMM registers.
; In the lines shown, r0 is read as the source base, r2 as the destination base and r5 as a
; column index scaled by 2 bytes. Results are written two destination rows at a time.
; NOTE(review): the full prototype is not shown in this span; it presumably matches the
; filterConvertPelToShort signature documented at luma_p2s - confirm.
2348 cglobal chroma_p2s, 3, 7, 3
2350 ; load width and height
; m2 = eight words of -8192 (tab_c_n8192).
2356 mova m2, [tab_c_n8192]
; r6 = address of the current source column: r0 + 2*r5.
2362 lea r6, [r0 + r5 * 2]
; r6 = address of the current destination column: r2 + 2*r5.
2374 lea r6, [r2 + r5 * 2]
; Three store widths for the same pair of rows (m0 -> first row, m1 -> the row
; FENC_STRIDE / 2 * 2 bytes further on), each addressed 16 bytes before r6:
; 16 bytes (movu), 8 bytes (movh) and 4 bytes (movd).
; NOTE(review): presumably one pair per remaining-width case; the branches that select
; between them are not shown in this span - confirm.
2376 movu [r6 + FENC_STRIDE / 2 * 0 - 16], m0
2377 movu [r6 + FENC_STRIDE / 2 * 2 - 16], m1
2385 movh [r6 + FENC_STRIDE / 2 * 0 - 16], m0
2386 movh [r6 + FENC_STRIDE / 2 * 2 - 16], m1
2393 movd [r6 + FENC_STRIDE / 2 * 0 - 16], m0
2394 movd [r6 + FENC_STRIDE / 2 * 2 - 16], m1
; Advance the source by 2*r1 and the destination by FENC_STRIDE / 2 * 4 bytes, i.e. twice
; the spacing between the two stores above.
2397 lea r0, [r0 + r1 * 2]
2398 add r2, FENC_STRIDE / 2 * 4
; PROCESS_LUMA_VER_W4_4R (no arguments)
; Builds the 8-tap vertical sums for four consecutive output rows as dword accumulators:
;   m0 = Row1, m1 = Row2, m2 = Row3, m3 = Row4.
; Each row's sum is made of four pmaddwd products, one per coefficient pair stored at
; [r6 + 0*16], [r6 + 1*16], [r6 + 2*16] and [r6 + 3*16]. An interleaved pair of source
; rows is reused by up to four output rows, each time with a different coefficient pair.
; r0 is advanced by 2*r1 four times (8*r1 in total); m4, m5 and m6 are scratch.
; In the register comments "[a b]" is an interleaved pair of source rows and "[a+b+...]"
; lists the source rows already accumulated.
; NOTE(review): only the final source load (movq m4) appears in this span; the earlier loads
; are assumed to sit between the lines shown - confirm against the full macro.
2405 %macro PROCESS_LUMA_VER_W4_4R 0
2408 punpcklwd m0, m1 ;m0=[0 1]
2409 pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
2411 lea r0, [r0 + 2 * r1]
2413 punpcklwd m1, m4 ;m1=[1 2]
2414 pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
2417 punpcklwd m4, m5 ;m4=[2 3]
2418 pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
; m4=[2 3] weighted with the second coefficient pair, added into Row1.
2419 pmaddwd m4, [r6 + 1 * 16]
2420 paddd m0, m4 ;m0=[0+1+2+3] Row1
2422 lea r0, [r0 + 2 * r1]
2424 punpcklwd m5, m4 ;m5=[3 4]
2425 pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
; m5=[3 4] weighted with the second coefficient pair, added into Row2.
2426 pmaddwd m5, [r6 + 1 * 16]
2427 paddd m1, m5 ;m1 = [1+2+3+4] Row2
2430 punpcklwd m4, m5 ;m4=[4 5]
; m6 = [4 5] with the second coefficient pair (Row3); m4 itself then takes the third pair (Row1).
2431 pmaddwd m6, m4, [r6 + 1 * 16]
2432 paddd m2, m6 ;m2=[2+3+4+5] Row3
2433 pmaddwd m4, [r6 + 2 * 16]
2434 paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1
2436 lea r0, [r0 + 2 * r1]
2438 punpcklwd m5, m4 ;m5=[5 6]
; m6 = [5 6] with the second coefficient pair (Row4); m5 itself then takes the third pair (Row2).
2439 pmaddwd m6, m5, [r6 + 1 * 16]
2440 paddd m3, m6 ;m3=[3+4+5+6] Row4
2441 pmaddwd m5, [r6 + 2 * 16]
2442 paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2
2445 punpcklwd m4, m5 ;m4=[6 7]
; m6 = [6 7] with the third coefficient pair (Row3); m4 itself then takes the fourth pair (Row1).
2446 pmaddwd m6, m4, [r6 + 2 * 16]
2447 paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3
2448 pmaddwd m4, [r6 + 3 * 16]
2449 paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end
2451 lea r0, [r0 + 2 * r1]
2453 punpcklwd m5, m4 ;m5=[7 8]
; m6 = [7 8] with the third coefficient pair (Row4); m5 itself then takes the fourth pair (Row2).
2454 pmaddwd m6, m5, [r6 + 2 * 16]
2455 paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4
2456 pmaddwd m5, [r6 + 3 * 16]
2457 paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end
2460 punpcklwd m4, m5 ;m4=[8 9]
2461 pmaddwd m4, [r6 + 3 * 16]
2462 paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end
; Last source row needed: 8 bytes (four words) from r0 + 2*r1.
2464 movq m4, [r0 + 2 * r1]
2465 punpcklwd m5, m4 ;m5=[9 10]
2466 pmaddwd m5, [r6 + 3 * 16]
2467 paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end
2470 ;--------------------------------------------------------------------------------------------------------------
2471 ; void interp_8tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2472 ;--------------------------------------------------------------------------------------------------------------
; FILTER_VER_LUMA_PP width, height
;   %1 = block width  (symbol name; also the 2*%1-byte rewind at the end of a row group)
;   %2 = block height (symbol name; %2/4 is the count stored on the stack)
; Emits one 8-tap vertical luma filter entry point (pixel in, pixel out) that processes the
; block in 4-wide by 4-row tiles via PROCESS_LUMA_VER_W4_4R.
2473 %macro FILTER_VER_LUMA_PP 2
; x86inc cglobal: 5 arguments in r0-r4, 7 general-purpose registers, 8 XMM registers, and a
; stack request written as 0-gprsize (one GPR-sized slot, addressed as [rsp] below).
2475 cglobal interp_8tap_vert_pp_%1x%2, 5, 7, 8 ,0-gprsize
; r5 = 3*r1.
2479 lea r5, [r1 + 2 * r1]
; r6 = tab_LumaCoeffV + r4; PROCESS_LUMA_VER_W4_4R reads its coefficient pairs through r6.
; NOTE(review): the two lea forms look like alternative branches of a conditional whose
; directives are not shown in this span - confirm.
2484 lea r5, [tab_LumaCoeffV]
2487 lea r6, [tab_LumaCoeffV + r4]
; Stack slot = %2/4.
; NOTE(review): presumably the outer-loop counter of 4-row groups; the decrement is not shown in this span - confirm.
2492 mov dword [rsp], %2/4
; Leaves the four rows' dword sums in m0-m3 and advances r0 by 8*r1.
2496 PROCESS_LUMA_VER_W4_4R
; Clamp the packed words of m0 and m2 with m1 and [pw_pixel_max] as the bounds.
2512 CLIPW2 m0, m2, m1, [pw_pixel_max]
; High 8 bytes of m0 -> r2 + r3; r5 = r2 + 2*r3, so high 8 bytes of m2 -> r2 + 3*r3.
2515 movhps [r2 + r3], m0
2516 lea r5, [r2 + 2 * r3]
2518 movhps [r5 + r3], m2
; r5 = 8*r1 - 8.
; NOTE(review): presumably subtracted from r0 to undo the 8*r1 advance of the helper while
; stepping 8 bytes to the next tile; the subtraction is not shown in this span - confirm.
2520 lea r5, [8 * r1 - 2 * 4]
; After a full row of tiles: move r0 by 4*r1 and r2 by 4*r3, and step both back by 2*%1 bytes.
2527 lea r0, [r0 + 4 * r1 - 2 * %1]
2528 lea r2, [r2 + 4 * r3 - 2 * %1]
2536 ;-------------------------------------------------------------------------------------------------------------
2537 ; void interp_8tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2538 ;-------------------------------------------------------------------------------------------------------------
; Instantiate FILTER_VER_LUMA_PP as (width, height) for every block size listed below.
2539 FILTER_VER_LUMA_PP 4, 4
2540 FILTER_VER_LUMA_PP 8, 8
2541 FILTER_VER_LUMA_PP 8, 4
2542 FILTER_VER_LUMA_PP 4, 8
2543 FILTER_VER_LUMA_PP 16, 16
2544 FILTER_VER_LUMA_PP 16, 8
2545 FILTER_VER_LUMA_PP 8, 16
2546 FILTER_VER_LUMA_PP 16, 12
2547 FILTER_VER_LUMA_PP 12, 16
2548 FILTER_VER_LUMA_PP 16, 4
2549 FILTER_VER_LUMA_PP 4, 16
2550 FILTER_VER_LUMA_PP 32, 32
2551 FILTER_VER_LUMA_PP 32, 16
2552 FILTER_VER_LUMA_PP 16, 32
2553 FILTER_VER_LUMA_PP 32, 24
2554 FILTER_VER_LUMA_PP 24, 32
2555 FILTER_VER_LUMA_PP 32, 8
2556 FILTER_VER_LUMA_PP 8, 32
2557 FILTER_VER_LUMA_PP 64, 64
2558 FILTER_VER_LUMA_PP 64, 32
2559 FILTER_VER_LUMA_PP 32, 64
2560 FILTER_VER_LUMA_PP 64, 48
2561 FILTER_VER_LUMA_PP 48, 64
2562 FILTER_VER_LUMA_PP 64, 16
2563 FILTER_VER_LUMA_PP 16, 64
2565 ;---------------------------------------------------------------------------------------------------------------
2566 ; void interp_8tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2567 ;---------------------------------------------------------------------------------------------------------------
; FILTER_VER_LUMA_PS width, height
;   %1 = block width  (symbol name; also the 2*%1-byte rewind at the end of a row group)
;   %2 = block height (symbol name; %2/4 is the count stored on the stack)
; Emits one 8-tap vertical luma filter entry point (pixel in, int16_t out) that processes the
; block in 4-wide by 4-row tiles via PROCESS_LUMA_VER_W4_4R. No CLIPW2 appears in the lines
; shown for this variant.
2568 %macro FILTER_VER_LUMA_PS 2
; x86inc cglobal: 5 arguments in r0-r4, 7 general-purpose registers, 8 XMM registers, and a
; stack request written as 0-gprsize (one GPR-sized slot, addressed as [rsp] below).
2570 cglobal interp_8tap_vert_ps_%1x%2, 5, 7, 8 ,0-gprsize
; r5 = 3*r1.
2574 lea r5, [r1 + 2 * r1]
; r6 = tab_LumaCoeffV + r4; PROCESS_LUMA_VER_W4_4R reads its coefficient pairs through r6.
; NOTE(review): the two lea forms look like alternative branches of a conditional whose
; directives are not shown in this span - confirm.
2579 lea r5, [tab_LumaCoeffV]
2582 lea r6, [tab_LumaCoeffV + r4]
; m7 = pd_n32768 (constant defined outside this span).
; NOTE(review): presumably the offset added to each dword sum before narrowing - confirm.
2585 mova m7, [pd_n32768]
; Stack slot = %2/4.
; NOTE(review): presumably the outer-loop counter of 4-row groups; the decrement is not shown in this span - confirm.
2587 mov dword [rsp], %2/4
; Leaves the four rows' dword sums in m0-m3 and advances r0 by 8*r1.
2591 PROCESS_LUMA_VER_W4_4R
; High 8 bytes of m0 -> r2 + r3; r5 = r2 + 2*r3, so high 8 bytes of m2 -> r2 + 3*r3.
2607 movhps [r2 + r3], m0
2608 lea r5, [r2 + 2 * r3]
2610 movhps [r5 + r3], m2
; r5 = 8*r1 - 8.
; NOTE(review): presumably subtracted from r0 to undo the 8*r1 advance of the helper while
; stepping 8 bytes to the next tile; the subtraction is not shown in this span - confirm.
2612 lea r5, [8 * r1 - 2 * 4]
; After a full row of tiles: move r0 by 4*r1 and r2 by 4*r3, and step both back by 2*%1 bytes.
2619 lea r0, [r0 + 4 * r1 - 2 * %1]
2620 lea r2, [r2 + 4 * r3 - 2 * %1]
2628 ;---------------------------------------------------------------------------------------------------------------
2629 ; void interp_8tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2630 ;---------------------------------------------------------------------------------------------------------------
; Instantiate FILTER_VER_LUMA_PS as (width, height) for every block size listed below.
2631 FILTER_VER_LUMA_PS 4, 4
2632 FILTER_VER_LUMA_PS 8, 8
2633 FILTER_VER_LUMA_PS 8, 4
2634 FILTER_VER_LUMA_PS 4, 8
2635 FILTER_VER_LUMA_PS 16, 16
2636 FILTER_VER_LUMA_PS 16, 8
2637 FILTER_VER_LUMA_PS 8, 16
2638 FILTER_VER_LUMA_PS 16, 12
2639 FILTER_VER_LUMA_PS 12, 16
2640 FILTER_VER_LUMA_PS 16, 4
2641 FILTER_VER_LUMA_PS 4, 16
2642 FILTER_VER_LUMA_PS 32, 32
2643 FILTER_VER_LUMA_PS 32, 16
2644 FILTER_VER_LUMA_PS 16, 32
2645 FILTER_VER_LUMA_PS 32, 24
2646 FILTER_VER_LUMA_PS 24, 32
2647 FILTER_VER_LUMA_PS 32, 8
2648 FILTER_VER_LUMA_PS 8, 32
2649 FILTER_VER_LUMA_PS 64, 64
2650 FILTER_VER_LUMA_PS 64, 32
2651 FILTER_VER_LUMA_PS 32, 64
2652 FILTER_VER_LUMA_PS 64, 48
2653 FILTER_VER_LUMA_PS 48, 64
2654 FILTER_VER_LUMA_PS 64, 16
2655 FILTER_VER_LUMA_PS 16, 64
2657 ;--------------------------------------------------------------------------------------------------------------
2658 ; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2659 ;--------------------------------------------------------------------------------------------------------------
; FILTER_VER_LUMA_SP width, height
;   %1 = block width  (symbol name; also the 2*%1-byte rewind at the end of a row group)
;   %2 = block height (symbol name; %2/4 is the count stored on the stack)
; Emits one 8-tap vertical luma filter entry point (int16_t in, pixel out) that processes the
; block in 4-wide by 4-row tiles via PROCESS_LUMA_VER_W4_4R and clamps the result.
2660 %macro FILTER_VER_LUMA_SP 2
; x86inc cglobal: 5 arguments in r0-r4, 7 general-purpose registers, 8 XMM registers, and a
; stack request written as 0-gprsize (one GPR-sized slot, addressed as [rsp] below).
2662 cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize
; r5 = 3*r1.
2666 lea r5, [r1 + 2 * r1]
; r6 = tab_LumaCoeffV + r4; PROCESS_LUMA_VER_W4_4R reads its coefficient pairs through r6.
; NOTE(review): the two lea forms look like alternative branches of a conditional whose
; directives are not shown in this span - confirm.
2671 lea r5, [tab_LumaCoeffV]
2674 lea r6, [tab_LumaCoeffV + r4]
; m7 = four dwords of 524800 (tab_c_524800).
; NOTE(review): presumably the offset added to each dword sum before shifting - confirm.
2677 mova m7, [tab_c_524800]
; Stack slot = %2/4.
; NOTE(review): presumably the outer-loop counter of 4-row groups; the decrement is not shown in this span - confirm.
2679 mov dword [rsp], %2/4
; Leaves the four rows' dword sums in m0-m3 and advances r0 by 8*r1.
2683 PROCESS_LUMA_VER_W4_4R
; Clamp the packed words of m0 and m2 with m1 and [pw_pixel_max] as the bounds.
2699 CLIPW2 m0, m2, m1, [pw_pixel_max]
; High 8 bytes of m0 -> r2 + r3; r5 = r2 + 2*r3, so high 8 bytes of m2 -> r2 + 3*r3.
2702 movhps [r2 + r3], m0
2703 lea r5, [r2 + 2 * r3]
2705 movhps [r5 + r3], m2
; r5 = 8*r1 - 8.
; NOTE(review): presumably subtracted from r0 to undo the 8*r1 advance of the helper while
; stepping 8 bytes to the next tile; the subtraction is not shown in this span - confirm.
2707 lea r5, [8 * r1 - 2 * 4]
; After a full row of tiles: move r0 by 4*r1 and r2 by 4*r3, and step both back by 2*%1 bytes.
2714 lea r0, [r0 + 4 * r1 - 2 * %1]
2715 lea r2, [r2 + 4 * r3 - 2 * %1]
2723 ;--------------------------------------------------------------------------------------------------------------
2724 ; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
2725 ;--------------------------------------------------------------------------------------------------------------
; Instantiate FILTER_VER_LUMA_SP as (width, height) for every block size listed below.
2726 FILTER_VER_LUMA_SP 4, 4
2727 FILTER_VER_LUMA_SP 8, 8
2728 FILTER_VER_LUMA_SP 8, 4
2729 FILTER_VER_LUMA_SP 4, 8
2730 FILTER_VER_LUMA_SP 16, 16
2731 FILTER_VER_LUMA_SP 16, 8
2732 FILTER_VER_LUMA_SP 8, 16
2733 FILTER_VER_LUMA_SP 16, 12
2734 FILTER_VER_LUMA_SP 12, 16
2735 FILTER_VER_LUMA_SP 16, 4
2736 FILTER_VER_LUMA_SP 4, 16
2737 FILTER_VER_LUMA_SP 32, 32
2738 FILTER_VER_LUMA_SP 32, 16
2739 FILTER_VER_LUMA_SP 16, 32
2740 FILTER_VER_LUMA_SP 32, 24
2741 FILTER_VER_LUMA_SP 24, 32
2742 FILTER_VER_LUMA_SP 32, 8
2743 FILTER_VER_LUMA_SP 8, 32
2744 FILTER_VER_LUMA_SP 64, 64
2745 FILTER_VER_LUMA_SP 64, 32
2746 FILTER_VER_LUMA_SP 32, 64
2747 FILTER_VER_LUMA_SP 64, 48
2748 FILTER_VER_LUMA_SP 48, 64
2749 FILTER_VER_LUMA_SP 64, 16
2750 FILTER_VER_LUMA_SP 16, 64
2752 ;-----------------------------------------------------------------------------------------------------------------
2753 ; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2754 ;-----------------------------------------------------------------------------------------------------------------
; FILTER_VER_LUMA_SS width, height
;   %1 = block width  (symbol name; also the 2*%1-byte rewind at the end of a row group)
;   %2 = block height (symbol name; %2/4 is the count stored on the stack)
; Emits one 8-tap vertical luma filter entry point (int16_t in, int16_t out) that processes
; the block in 4-wide by 4-row tiles via PROCESS_LUMA_VER_W4_4R. No CLIPW2 and no rounding
; constant load appear in the lines shown for this variant.
2755 %macro FILTER_VER_LUMA_SS 2
; x86inc cglobal: 5 arguments in r0-r4, 7 general-purpose registers, 7 XMM registers, and a
; stack request written as 0-gprsize (one GPR-sized slot, addressed as [rsp] below).
2757 cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-gprsize
; r6 = tab_LumaCoeffV + r4; PROCESS_LUMA_VER_W4_4R reads its coefficient pairs through r6.
; NOTE(review): the two lea forms look like alternative branches of a conditional whose
; directives are not shown in this span - confirm.
2766 lea r5, [tab_LumaCoeffV]
2769 lea r6, [tab_LumaCoeffV + r4]
; Stack slot = %2/4.
; NOTE(review): presumably the outer-loop counter of 4-row groups; the decrement is not shown in this span - confirm.
2772 mov dword [rsp], %2/4
; Leaves the four rows' dword sums in m0-m3 and advances r0 by 8*r1.
2776 PROCESS_LUMA_VER_W4_4R
; High 8 bytes of m0 -> r2 + r3.
2782 movhps [r2 + r3], m0
; Low 8 bytes of m2 -> r2 + 2*r3, high 8 bytes of m2 -> r2 + r5.
; NOTE(review): r5 is presumably 3*r3 at this point; the instruction that sets it is not shown in this span - confirm.
2787 movlps [r2 + 2 * r3], m2
2789 movhps [r2 + r5], m2
; r5 = 8*r1 - 8.
; NOTE(review): presumably subtracted from r0 to undo the 8*r1 advance of the helper while
; stepping 8 bytes to the next tile; the subtraction is not shown in this span - confirm.
2791 lea r5, [8 * r1 - 2 * 4]
; After a full row of tiles: move r0 by 4*r1 and r2 by 4*r3, and step both back by 2*%1 bytes.
2798 lea r0, [r0 + 4 * r1 - 2 * %1]
2799 lea r2, [r2 + 4 * r3 - 2 * %1]
; Instantiate FILTER_VER_LUMA_SS as (width, height) for every block size listed below.
2807 FILTER_VER_LUMA_SS 4, 4
2808 FILTER_VER_LUMA_SS 8, 8
2809 FILTER_VER_LUMA_SS 8, 4
2810 FILTER_VER_LUMA_SS 4, 8
2811 FILTER_VER_LUMA_SS 16, 16
2812 FILTER_VER_LUMA_SS 16, 8
2813 FILTER_VER_LUMA_SS 8, 16
2814 FILTER_VER_LUMA_SS 16, 12
2815 FILTER_VER_LUMA_SS 12, 16
2816 FILTER_VER_LUMA_SS 16, 4
2817 FILTER_VER_LUMA_SS 4, 16
2818 FILTER_VER_LUMA_SS 32, 32
2819 FILTER_VER_LUMA_SS 32, 16
2820 FILTER_VER_LUMA_SS 16, 32
2821 FILTER_VER_LUMA_SS 32, 24
2822 FILTER_VER_LUMA_SS 24, 32
2823 FILTER_VER_LUMA_SS 32, 8
2824 FILTER_VER_LUMA_SS 8, 32
2825 FILTER_VER_LUMA_SS 64, 64
2826 FILTER_VER_LUMA_SS 64, 32
2827 FILTER_VER_LUMA_SS 32, 64
2828 FILTER_VER_LUMA_SS 64, 48
2829 FILTER_VER_LUMA_SS 48, 64
2830 FILTER_VER_LUMA_SS 64, 16
2831 FILTER_VER_LUMA_SS 16, 64
2833 ;--------------------------------------------------------------------------------------------------
2834 ; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
2835 ;--------------------------------------------------------------------------------------------------
; luma_p2s is the assembly symbol emitted for the prototype above.
; x86inc cglobal: 3 arguments placed in r0-r2 (src, srcStride, dst), 7 general-purpose
; registers, 5 XMM registers. In the lines shown, r5 is a column index scaled by 2 bytes and
; results are written four destination rows at a time.
2837 cglobal luma_p2s, 3, 7, 5
2841 ; load width and height
; m4 = eight words of -8192 (tab_c_n8192).
2846 mova m4, [tab_c_n8192]
; r6 = address of the current source column: r0 + 2*r5.
2852 lea r6, [r0 + r5 * 2]
; m2 is loaded from 2*r1 past r6, then r6 itself moves on by 2*r1.
2862 movu m2, [r6 + r1 * 2]
2866 lea r6, [r6 + r1 * 2]
; Two store widths for the same four rows (m0-m3, spaced FENC_STRIDE * 2 bytes apart), each
; addressed 16 bytes before r2 + 2*r5: 16 bytes (movu) and 8 bytes (movh).
; NOTE(review): presumably one group per remaining-width case; the branch that selects
; between them is not shown in this span - confirm.
2874 movu [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
2875 movu [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
2876 movu [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
2877 movu [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
2882 movh [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
2883 movh [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
2884 movh [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
2885 movh [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
; Advance the source by 4*r1 and the destination by FENC_STRIDE * 8 bytes, i.e. four times
; the spacing between consecutive stores above.
2888 lea r0, [r0 + r1 * 4]
2889 add r2, FENC_STRIDE * 8