;*****************************************************************************
;* mc-a.asm: x86 motion compensation
;*****************************************************************************
;* Copyright (C) 2003-2013 x264 project
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Dylan Yudaken <dyudaken@gmail.com>
;*          Holger Lubitz <holger@lubitz.org>
;*          Min Chen <chenm001@163.com>
;*          Oskar Arvidsson <oskar@irock.se>
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************
%include "x86util.asm"
ch_shuf:     times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
ch_shuf_adj: times 8 db 0
cextern deinterleave_shufd
;====================================================================================================================
;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
;====================================================================================================================
; r0 = pSrc0, r1 = pSrc1
; r2 = pDst,  r3 = iStride0
; r4 = iStride1, r5 = iDstStride
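;
; A sketch of the intended arithmetic (not lifted verbatim from the C
; reference): the int16_t inputs are intermediate interpolation values, and
; each output sample is their average, rounded and clipped back to pixel
; range, roughly
;
;     dst[x] = clip_pixel((src0[x] + src1[x] + offset) >> shift)
;
; where shift and offset depend on the bit depth. For example the
; "pmulhrsw mN, [pw_1024]" used below computes (x*1024 + 2^14) >> 15, which
; equals (x + 16) >> 5, i.e. the rounded shift by 5 needed for 10-bit samples.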
cglobal addAvg_2x4, 6,6,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
    pmulhrsw m1, [pw_1024]
    pextrd   [r2 + r5], m1, 1
    lea      r2, [r2 + 2 * r5]
    pextrd   [r2 + r5], m1, 3
;-----------------------------------------------------------------------------
cglobal addAvg_2x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
    lea      r0, [r0 + 2 * r3]
    lea      r1, [r1 + 2 * r4]
    pmulhrsw m1, [pw_1024]
    pextrd   [r2 + r5], m1, 1
    lea      r2, [r2 + 2 * r5]
    pextrd   [r2 + r5], m1, 3
    lea      r0, [r0 + 2 * r3]
    lea      r1, [r1 + 2 * r4]
    lea      r2, [r2 + 2 * r5]
;-----------------------------------------------------------------------------
cglobal addAvg_2x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
    lea      r0, [r0 + r3 * 2]
    lea      r1, [r1 + r4 * 2]
    lea      r0, [r0 + r3 * 2]
    lea      r1, [r1 + r4 * 2]
    pextrd   [r2 + r5], m1, 1
    lea      r2, [r2 + r5 * 2]
    pextrd   [r2 + r5], m1, 3
    lea      r2, [r2 + r5 * 2]
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
cglobal addAvg_4x2, 6,6,7, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
    pmulhrsw m0, [pw_1024]
;-----------------------------------------------------------------------------
cglobal addAvg_6x8, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
    pextrd   [r2 + 8], m0, 2
    pextrd   [r2 + r5 + 8], m1, 2
    lea      r2, [r2 + 2 * r5]
    lea      r0, [r0 + 2 * r3]
    lea      r1, [r1 + 2 * r4]
;-----------------------------------------------------------------------------
cglobal addAvg_6x16, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
    lea      r0, [r0 + r3 * 2]
    lea      r1, [r1 + r4 * 2]
    pextrd   [r2 + 8], m0, 2
    pextrd   [r2 + r5 + 8], m1, 2
    lea      r2, [r2 + r5 * 2]
;-----------------------------------------------------------------------------
cglobal addAvg_8x2, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
;-----------------------------------------------------------------------------
cglobal addAvg_8x6, 6,6,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
    lea      r2, [r2 + 2 * r5]
    lea      r0, [r0 + 2 * r3]
    lea      r1, [r1 + 2 * r4]
;-----------------------------------------------------------------------------
%macro ADDAVG_W4_H4 1
cglobal addAvg_4x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
    lea      r2, [r2 + 2 * r5]
    lea      r0, [r0 + 2 * r3]
    lea      r1, [r1 + 2 * r4]
;-----------------------------------------------------------------------------
%macro ADDAVG_W8_H4 1
cglobal addAvg_8x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
    lea      r2, [r2 + 2 * r5]
    lea      r0, [r0 + 2 * r3]
    lea      r1, [r1 + 2 * r4]
;-----------------------------------------------------------------------------
%macro ADDAVG_W12_H4 1
cglobal addAvg_12x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
    movh     m1, [r0 + 16 + r3]
    movh     m3, [r1 + 16 + r4]
    movhps   [r2 + r5 + 16], m0
    lea      r2, [r2 + 2 * r5]
    lea      r0, [r0 + 2 * r3]
    lea      r1, [r1 + 2 * r4]
;-----------------------------------------------------------------------------
%macro ADDAVG_W16_H4 1
cglobal addAvg_16x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
    movu     m2, [r0 + 16 + r3]
    movu     m3, [r1 + 16 + r4]
    movu     [r2 + r5 + 16], m2
    lea      r2, [r2 + 2 * r5]
    lea      r0, [r0 + 2 * r3]
    lea      r1, [r1 + 2 * r4]
;-----------------------------------------------------------------------------
%macro ADDAVG_W24_H2 2
cglobal addAvg_%1x%2, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
    movu     m2, [r0 + r3 + 16]
    movu     m3, [r1 + r4 + 16]
    movu     [r2 + r5 + 16], m2
    movu     m1, [r0 + r3 + 32]
    movu     m3, [r1 + r4 + 32]
    movu     [r2 + r5 + 32], m1
    lea      r2, [r2 + 2 * r5]
    lea      r0, [r0 + 2 * r3]
    lea      r1, [r1 + 2 * r4]
;-----------------------------------------------------------------------------
%macro ADDAVG_W32_H2 1
cglobal addAvg_32x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
    movu     m2, [r0 + 16 + r3]
    movu     m3, [r1 + 16 + r4]
    movu     [r2 + r5 + 16], m2
    movu     m1, [r0 + 32 + r3]
    movu     m3, [r1 + 32 + r4]
    movu     [r2 + r5 + 32], m1
    movu     m2, [r0 + 48 + r3]
    movu     m3, [r1 + 48 + r4]
    movu     [r2 + r5 + 48], m2
    lea      r2, [r2 + 2 * r5]
    lea      r0, [r0 + 2 * r3]
    lea      r1, [r1 + 2 * r4]
;-----------------------------------------------------------------------------
%macro ADDAVG_W48_H2 1
cglobal addAvg_48x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
    movu     m2, [r0 + 16 + r3]
    movu     m3, [r1 + 16 + r4]
    movu     [r2 + 16 + r5], m2
    movu     m1, [r0 + 32 + r3]
    movu     m3, [r1 + 32 + r4]
    movu     [r2 + 32 + r5], m1
    movu     m2, [r0 + 48 + r3]
    movu     m3, [r1 + 48 + r4]
    movu     [r2 + 48 + r5], m2
    movu     m1, [r0 + 64 + r3]
    movu     m3, [r1 + 64 + r4]
    movu     [r2 + 64 + r5], m1
    movu     m2, [r0 + 80 + r3]
    movu     m3, [r1 + 80 + r4]
    movu     [r2 + 80 + r5], m2
    lea      r2, [r2 + 2 * r5]
    lea      r0, [r0 + 2 * r3]
    lea      r1, [r1 + 2 * r4]
;-----------------------------------------------------------------------------
%macro ADDAVG_W64_H1 1
cglobal addAvg_64x%1, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
;-----------------------------------------------------------------------------
%else ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
cglobal addAvg_2x4, 6,6,8, src0, src1, dst, src0Stride, src1Stride, dstStride
    lea    r0, [r0 + 2 * r3]
    lea    r1, [r1 + 2 * r4]
    pextrw [r2 + r5], m1, 1
    lea    r2, [r2 + 2 * r5]
    pextrw [r2 + r5], m1, 3
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
cglobal addAvg_2x8, 6,6,8, src0, src1, dst, src0Stride, src1Stride, dstStride
    lea    r0, [r0 + 2 * r3]
    lea    r1, [r1 + 2 * r4]
    pextrw [r2 + r5], m1, 1
    lea    r2, [r2 + 2 * r5]
    pextrw [r2 + r5], m1, 3
    lea    r2, [r2 + 2 * r5]
    lea    r0, [r0 + 2 * r3]
    lea    r1, [r1 + 2 * r4]
    lea    r0, [r0 + 2 * r3]
    lea    r1, [r1 + 2 * r4]
    pextrw [r2 + r5], m1, 1
    lea    r2, [r2 + 2 * r5]
    pextrw [r2 + r5], m1, 3
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
cglobal addAvg_2x16, 6,7,8, src0, src1, dst, src0Stride, src1Stride, dstStride
    lea    r0, [r0 + r3 * 2]
    lea    r1, [r1 + r4 * 2]
    lea    r0, [r0 + r3 * 2]
    lea    r1, [r1 + r4 * 2]
    pextrw [r2 + r5], m1, 1
    lea    r2, [r2 + r5 * 2]
    pextrw [r2 + r5], m1, 3
    lea    r2, [r2 + r5 * 2]
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
cglobal addAvg_4x2, 6,6,4, src0, src1, dst, src0Stride, src1Stride, dstStride
    movhps m0, [r0 + r3]
    movhps m2, [r1 + r4]
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro ADDAVG_W4_H4 1
cglobal addAvg_4x%1, 6,7,4, src0, src1, dst, src0Stride, src1Stride, dstStride
    movhps m0, [r0 + r3]
    movhps m2, [r1 + r4]
    lea    r2, [r2 + 2 * r5]
    lea    r0, [r0 + 2 * r3]
    lea    r1, [r1 + 2 * r4]
    movhps m0, [r0 + r3]
    movhps m2, [r1 + r4]
    lea    r2, [r2 + 2 * r5]
    lea    r0, [r0 + 2 * r3]
    lea    r1, [r1 + 2 * r4]
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
cglobal addAvg_6x8, 6,6,6, src0, src1, dst, src0Stride, src1Stride, dstStride
    pextrw [r2 + 4], m0, 2
    pextrw [r2 + r5 + 4], m1, 2
    lea    r2, [r2 + 2 * r5]
    lea    r0, [r0 + 2 * r3]
    lea    r1, [r1 + 2 * r4]
    pextrw [r2 + 4], m0, 2
    pextrw [r2 + r5 + 4], m1, 2
    lea    r2, [r2 + 2 * r5]
    lea    r0, [r0 + 2 * r3]
    lea    r1, [r1 + 2 * r4]
    pextrw [r2 + 4], m0, 2
    pextrw [r2 + r5 + 4], m1, 2
    lea    r2, [r2 + 2 * r5]
    lea    r0, [r0 + 2 * r3]
    lea    r1, [r1 + 2 * r4]
    pextrw [r2 + 4], m0, 2
    pextrw [r2 + r5 + 4], m1, 2
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
cglobal addAvg_6x16, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
    lea    r0, [r0 + r3 * 2]
    lea    r1, [r1 + r4 * 2]
    pextrw [r2 + 4], m0, 2
    pextrw [r2 + r5 + 4], m1, 2
    lea    r2, [r2 + r5 * 2]
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
cglobal addAvg_8x2, 6,6,6, src0, src1, dst, src0Stride, src1Stride, dstStride
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
cglobal addAvg_8x6, 6,6,6, src0, src1, dst, src0Stride, src1Stride, dstStride
    lea r2, [r2 + 2 * r5]
    lea r0, [r0 + 2 * r3]
    lea r1, [r1 + 2 * r4]
    lea r2, [r2 + 2 * r5]
    lea r0, [r0 + 2 * r3]
    lea r1, [r1 + 2 * r4]
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro ADDAVG_W8_H4 1
cglobal addAvg_8x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
    lea r2, [r2 + 2 * r5]
    lea r0, [r0 + 2 * r3]
    lea r1, [r1 + 2 * r4]
    lea r2, [r2 + 2 * r5]
    lea r0, [r0 + 2 * r3]
    lea r1, [r1 + 2 * r4]
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro ADDAVG_W12_H4 1
cglobal addAvg_12x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
    movhps m0, [r0 + 16 + r3]
    movhps m2, [r1 + 16 + r4]
    movd   [r2 + 8 + r5], m0
    lea    r2, [r2 + 2 * r5]
    lea    r0, [r0 + 2 * r3]
    lea    r1, [r1 + 2 * r4]
    movhps m0, [r0 + 16 + r3]
    movhps m2, [r1 + 16 + r4]
    movd   [r2 + 8 + r5], m0
    lea    r2, [r2 + 2 * r5]
    lea    r0, [r0 + 2 * r3]
    lea    r1, [r1 + 2 * r4]
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro ADDAVG_W16_H4 1
cglobal addAvg_16x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
    movu m2, [r0 + 16 + r3]
    movu m3, [r1 + 16 + r4]
    lea  r2, [r2 + 2 * r5]
    lea  r0, [r0 + 2 * r3]
    lea  r1, [r1 + 2 * r4]
    movu m2, [r0 + 16 + r3]
    movu m3, [r1 + 16 + r4]
    lea  r2, [r2 + 2 * r5]
    lea  r0, [r0 + 2 * r3]
    lea  r1, [r1 + 2 * r4]
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro ADDAVG_W24_H2 2
cglobal addAvg_%1x%2, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
    movu m2, [r0 + 16 + r3]
    movu m3, [r1 + 16 + r4]
    movu m1, [r0 + 32 + r3]
    movu m3, [r1 + 32 + r4]
    movh [r2 + 16 + r5], m1
    lea  r2, [r2 + 2 * r5]
    lea  r0, [r0 + 2 * r3]
    lea  r1, [r1 + 2 * r4]
ADDAVG_W24_H2 24, 32
ADDAVG_W24_H2 24, 64
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro ADDAVG_W32_H2 1
cglobal addAvg_32x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
    movu m2, [r0 + 16 + r3]
    movu m3, [r1 + 16 + r4]
    movu m1, [r0 + 32 + r3]
    movu m3, [r1 + 32 + r4]
    movu m2, [r0 + 48 + r3]
    movu m3, [r1 + 48 + r4]
    movu [r2 + 16 + r5], m1
    lea  r2, [r2 + 2 * r5]
    lea  r0, [r0 + 2 * r3]
    lea  r1, [r1 + 2 * r4]
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro ADDAVG_W48_H2 1
cglobal addAvg_48x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
    movu m2, [r0 + 16 + r3]
    movu m3, [r1 + 16 + r4]
    movu m1, [r0 + 32 + r3]
    movu m3, [r1 + 32 + r4]
    movu m2, [r0 + 48 + r3]
    movu m3, [r1 + 48 + r4]
    movu [r2 + 16 + r5], m1
    movu m1, [r0 + 64 + r3]
    movu m3, [r1 + 64 + r4]
    movu m2, [r0 + 80 + r3]
    movu m3, [r1 + 80 + r4]
    movu [r2 + 32 + r5], m1
    lea  r2, [r2 + 2 * r5]
    lea  r0, [r0 + 2 * r3]
    lea  r1, [r1 + 2 * r4]
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro ADDAVG_W64_H1 1
cglobal addAvg_64x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
;-----------------------------------------------------------------------------
%endif ; HIGH_BIT_DEPTH
;=============================================================================
; implicit weighted biprediction
;=============================================================================
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
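; with those assumptions each output sample is, roughly,
;
;     dst[x] = clip_pixel((src1[x]*weight1 + src2[x]*weight2 + 32) >> 6)
;
; weight1 + weight2 == 64, so the "+ 32, >> 6" is just the rounded division
; by the weight sum (the pw_32 constant loaded in BIWEIGHT_START below is
; that rounding term).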
DECLARE_REG_TMP 0,1,2,3,4,5,4,5
%macro AVG_START 0-1 0
DECLARE_REG_TMP 0,1,2,3,4,5,7,8
%macro AVG_START 0-1 0
DECLARE_REG_TMP 1,2,3,4,5,6,1,2
%macro AVG_START 0-1 0
    lea  t4, [t4+t5*2*SIZEOF_PIXEL]
    lea  t2, [t2+t3*2*SIZEOF_PIXEL]
    lea  t0, [t0+t1*2*SIZEOF_PIXEL]
%ifidn movu,movq ; detect MMX
%macro BIWEIGHT_MMX 2
%macro BIWEIGHT_START_MMX 0
%else ;!HIGH_BIT_DEPTH
%macro BIWEIGHT_MMX 2
%macro BIWEIGHT_START_MMX 0
    SPLATW m2, m2       ; weight_dst
    psubw  m3, m2       ; weight_src
    mova   m4, [pw_32]  ; rounding
%endif ;HIGH_BIT_DEPTH
%macro BIWEIGHT_SSSE3 2
%macro BIWEIGHT_START_SSSE3 0
    movzx  t6d, byte r6m ; FIXME x86_64
    vpbroadcastw m3, xm3
    SPLATW m3, m3       ; weight_dst,src
%macro BIWEIGHT_ROW 4
    BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
%else ;!HIGH_BIT_DEPTH
%macro BIWEIGHT_ROW 4
    BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
    movd [%1+mmsize/2], m6
%endif ;HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int pixel_avg_weight_w16( pixel *dst, intptr_t, pixel *src1, intptr_t, pixel *src2, intptr_t, int i_weight )
;-----------------------------------------------------------------------------
%macro AVG_WEIGHT 1-2 0
cglobal pixel_avg_weight_w%1
    mova m7, [pw_pixel_max]
%if mmsize==16 && %1==mmsize/(2*SIZEOF_PIXEL)
    BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5]
%else ;!HIGH_BIT_DEPTH
%endif ;HIGH_BIT_DEPTH
    movhps [t0+SIZEOF_PIXEL*t1], m6
%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
%if (%1 == 12) && (%1*SIZEOF_PIXEL-x < mmsize)
%assign y (%1*SIZEOF_PIXEL-x)
    BIWEIGHT_ROW t0+x, t2+x, t4+x, y
    BIWEIGHT_ROW t0+x+SIZEOF_PIXEL*t1, t2+x+SIZEOF_PIXEL*t3, t4+x+SIZEOF_PIXEL*t5, y
%define BIWEIGHT BIWEIGHT_MMX
%define BIWEIGHT_START BIWEIGHT_START_MMX
%else ;!HIGH_BIT_DEPTH
%define BIWEIGHT BIWEIGHT_SSSE3
%define BIWEIGHT_START BIWEIGHT_START_SSSE3
cglobal pixel_avg_weight_w16
    vinserti128 m0, m0, [t2+t3], 1
    vinserti128 m1, m1, [t4+t5], 1
    SBUTTERFLY bw, 0, 1, 2
    vextracti128 [t0+t1], m0, 1
%endif ;HIGH_BIT_DEPTH
;=============================================================================
; P frame explicit weighted prediction
;=============================================================================
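; The weight_t structure passed in r4 carries the scale, denom and offset.
; Assuming the usual H.264 explicit weighted-prediction definition, each
; sample is computed roughly as
;
;     dst[x] = clip_pixel(((src[x] * scale + (1 << (denom - 1))) >> denom) + offset)
;
; (the "1<<(denom-1)+(offset<<denom)" constant noted further down folds the
; rounding and offset terms into a single addition before the shift).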
%macro WEIGHT_START 1
    mova  m0, [r4+ 0]   ; 1<<denom
    movd  m2, [r4+32]   ; denom
    mova  m4, [pw_pixel_max]
    paddw m2, [sq_1]    ; denom+1
%macro WEIGHT_TWO_ROW 4
%rep (%3+mmsize/2-1)/(mmsize/2)
%if %3-x/2 <= 4 && mmsize == 16
    WEIGHT %1+x, %1+r3+x
    CLIPW  m5, [pb_0], m4
    movhps [%2+r1+x], m5
    WEIGHT %1+x, %1+x+mmsize/2
    WEIGHT %1+r3+x, %1+r3+x+mmsize/2
    CLIPW  m5, [pb_0], m4
    CLIPW  m7, [pb_0], m4
%else ; !HIGH_BIT_DEPTH
%macro WEIGHT_START 1
    vbroadcasti128 m3, [r4]
    vbroadcasti128 m4, [r4+16]
%if notcpuflag(ssse3)
; src1, src2, dst1, dst2, fast
%macro WEIGHT_ROWx2 5
    movh   m1, [%1+mmsize/2]
    movh   m7, [%2+mmsize/2]
    paddsw m0, m4       ; 1<<(denom-1)+(offset<<denom)
; src1, src2, dst1, dst2, width, fast
    vinserti128 m0, m0, [%2], 1
    punpckhbw m1, m0, m2
    punpcklbw m0, m0, m2
    vextracti128 [%4], m0, 1
    vinserti128 m0, m0, [%2], 1
    vextracti128 xm1, m0, 1
    paddsw m0, m4       ; 1<<(denom-1)+(offset<<denom)
    movd   [%3], m0     ; width 2 can write garbage for the last 2 bytes
%macro WEIGHT_TWO_ROW 4
%if (%3-x) >= mmsize
    WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4
%assign x (x+mmsize)
    WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, w, %4
%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
;void mc_weight_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *weight, int h )
;-----------------------------------------------------------------------------
cglobal mc_weight_w%1, 6,6,8
%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
; we can merge the shift step into the scale factor
; if (m3<<7) doesn't overflow an int16_t
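; (in other words, when scale << 7 still fits in an int16_t the >> denom step
; can be folded into the multiply itself; the trailing "fast" argument of
; WEIGHT_TWO_ROW below appears to select between the folded and the two-step
; variants)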
    WEIGHT_TWO_ROW r2, r0, %1, 0
%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
    WEIGHT_TWO_ROW r2, r0, %1, 1
%macro OFFSET_TWO_ROW 4
%if (%3*SIZEOF_PIXEL-x) >= mmsize
    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
%assign x (x+mmsize)
    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h
    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
%if x >= %3*SIZEOF_PIXEL
;-----------------------------------------------------------------------------
;void mc_offset_wX( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, weight_t *w, int h )
;-----------------------------------------------------------------------------
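; A sketch of the per-sample operation; the %2 parameter of the macro below
; picks the add or sub variant:
;
;     mc_offsetadd:  dst[x] = clip_pixel(src[x] + offset)
;     mc_offsetsub:  dst[x] = clip_pixel(src[x] - offset)
;
; (high-bit-depth builds additionally clip against pw_pixel_max, loaded below)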
cglobal mc_offset%2_w%1, 6,6
    mova m3, [pw_pixel_max]
    OFFSET_TWO_ROW r2, r0, %1, %2
;=============================================================================
;=============================================================================
;-----------------------------------------------------------------------------
; void pixel_avg_4x4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
;                     pixel *src2, intptr_t src2_stride, int weight );
;-----------------------------------------------------------------------------
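; Generic entry point: if the weight argument requests unequal weighting the
; code tail-calls the matching pixel_avg_weight_wN routine (the jne below);
; otherwise it falls through to the plain pavg-based copy, with the helper
; picked by width, SIMD size and an alignment test (the jz/jmp pair below).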
cglobal pixel_avg_%1x%2
    jne pixel_avg_weight_w%1 %+ SUFFIX
%if cpuflag(avx2) && %1 == 16 ; all AVX2 machines can do fast 16-byte unaligned loads
    jmp pixel_avg_w%1_avx2
%if mmsize == 16 && (%1 % 16 == 0)
    jz  pixel_avg_w%1_sse2
    jmp pixel_avg_w%1_mmx2
;-----------------------------------------------------------------------------
; void pixel_avg_w4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
;                    pixel *src2, intptr_t src2_stride, int height, int weight );
;-----------------------------------------------------------------------------
cglobal pixel_avg_w%1
%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
    %2    m1, [t2+x+SIZEOF_PIXEL*t3]
    pavgw m1, [t4+x+SIZEOF_PIXEL*t5]
%else ;!HIGH_BIT_DEPTH
    pavgb m1, [t4+x+SIZEOF_PIXEL*t5]
%if (%1 == 12) && (%1-x/SIZEOF_PIXEL < mmsize)
    %4    [t0+x+SIZEOF_PIXEL*t1], m1
    %3    [t0+x+SIZEOF_PIXEL*t1], m1
AVG_FUNC 4, movq, movq
AVG_FUNC 8, movq, movq
AVG_FUNC 16, movq, movq
AVG_FUNC 24, movq, movq
AVG_FUNC 32, movq, movq
AVG_FUNC 48, movq, movq
AVG_FUNC 64, movq, movq
AVG_FUNC 12, movq, movq, movq
AVG_FUNC 4, movq, movq
AVG_FUNC 8, movdqu, movdqa
AVG_FUNC 16, movdqu, movdqa
AVG_FUNC 24, movdqu, movdqa
AVG_FUNC 32, movdqu, movdqa
AVG_FUNC 48, movdqu, movdqa
AVG_FUNC 64, movdqu, movdqa
AVG_FUNC 12, movdqu, movdqa, movq
%else ;!HIGH_BIT_DEPTH
AVG_FUNC 4, movd, movd
AVG_FUNC 8, movq, movq
AVG_FUNC 12, movq, movq, movd
AVG_FUNC 16, movq, movq
AVG_FUNC 32, movq, movq
AVG_FUNC 64, movq, movq
AVG_FUNC 24, movq, movq
AVG_FUNC 48, movq, movq
AVG_FUNC 64, movdqu, movdqa
AVG_FUNC 32, movdqu, movdqa
AVG_FUNC 24, movdqu, movdqa
AVG_FUNC 16, movdqu, movdqa
AVG_FUNC 48, movdqu, movdqa
AVG_FUNC 12, movdqu, movdqa, movq
; TODO: activate AVX2 after debugging
;AVG_FUNC 24, movdqu, movdqa
;AVG_FUNC 64, movdqu, movdqa
;AVG_FUNC 32, movdqu, movdqa
AVG_FUNC 16, movdqu, movdqa
%endif ;HIGH_BIT_DEPTH
;=============================================================================
;=============================================================================
;-----------------------------------------------------------------------------
; void pixel_avg2_wN( uint16_t *dst, intptr_t dst_stride,
;                     uint16_t *src1, intptr_t src_stride,
;                     uint16_t *src2, int height );
;-----------------------------------------------------------------------------
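; Each of these computes, per sample, the rounded average of the two sources
; (pavgw/pavgb implement (a + b + 1) >> 1):
;
;     dst[x] = (src1[x] + src2[x] + 1) >> 1
;
; In the loop bodies src2 is addressed as an offset from the src1 pointer
; (r4/r6 hold precomputed deltas), so only one source pointer is advanced.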
cglobal pixel_avg2_w%1, 6,7,4
%if cpuflag(avx) || mmsize == 8
cglobal pixel_avg2_w%1, 6,7,8
    %2    m3, [r2+r3*2+mmsize]
    pavgw m1, [r2+r4+mmsize]
    pavgw m3, [r2+r6+mmsize]
    %2    m5, [r2+r4+mmsize]
    %2    m7, [r2+r6+mmsize]
    %3    [r0+r1*2+mmsize], m3
AVG2_W_TWO 8, movu, mova
AVG2_W_TWO 10, movd, movd
AVG2_W_TWO 16, movu, mova
cglobal pixel_avg2_w10_mmx2, 6,7
    movu  m3, [r2+r3*2+ 0]
    movu  m4, [r2+r3*2+ 8]
    movh  m5, [r2+r3*2+16]
    pavgw m0, [r2+r4+ 0]
    pavgw m1, [r2+r4+ 8]
    pavgw m2, [r2+r4+16]
    pavgw m3, [r2+r6+ 0]
    pavgw m4, [r2+r6+ 8]
    pavgw m5, [r2+r6+16]
    mova  [r0+r1*2+ 0], m3
    mova  [r0+r1*2+ 8], m4
    movh  [r0+r1*2+16], m5
cglobal pixel_avg2_w16_mmx2, 6,7
    movu  m4, [r2+r3*2+ 0]
    movu  m5, [r2+r3*2+ 8]
    movu  m6, [r2+r3*2+16]
    movu  m7, [r2+r3*2+24]
    pavgw m0, [r2+r4+ 0]
    pavgw m1, [r2+r4+ 8]
    pavgw m2, [r2+r4+16]
    pavgw m3, [r2+r4+24]
    pavgw m4, [r2+r6+ 0]
    pavgw m5, [r2+r6+ 8]
    pavgw m6, [r2+r6+16]
    pavgw m7, [r2+r6+24]
    mova  [r0+r1*2+ 0], m4
    mova  [r0+r1*2+ 8], m5
    mova  [r0+r1*2+16], m6
    mova  [r0+r1*2+24], m7
cglobal pixel_avg2_w18_mmx2, 6,7
    pavgw m0, [r2+r4+ 0]
    pavgw m1, [r2+r4+ 8]
    pavgw m2, [r2+r4+16]
    pavgw m3, [r2+r4+24]
    pavgw m4, [r2+r4+32]
%macro PIXEL_AVG_W18 0
cglobal pixel_avg2_w18, 6,7
    pavgw m0, [r2+r4+ 0]
    movd  xm1, [r2+r4+32]
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void pixel_avg2_w4( uint8_t *dst, intptr_t dst_stride,
;                     uint8_t *src1, intptr_t src_stride,
;                     uint8_t *src2, int height );
;-----------------------------------------------------------------------------
cglobal pixel_avg2_w%1_mmx2, 6,7
cglobal pixel_avg2_w%1_mmx2, 6,7
    pavgb mm1, [r4+r2+8]
    pavgb mm3, [r4+r6+8]
cglobal pixel_avg2_w20_mmx2, 6,7
    movd  mm5, [r4+r3+16]
    pavgb mm1, [r4+r2+8]
    pavgb mm2, [r4+r2+16]
    pavgb mm4, [r4+r6+8]
    pavgb mm5, [r4+r6+16]
    movd  [r0+r1+16], mm5
cglobal pixel_avg2_w16_sse2, 6,7
cglobal pixel_avg2_w20_sse2, 6,7
    movd  mm5, [r4+r3+16]
    pavgb mm4, [r4+r2+16]
    pavgb mm5, [r4+r6+16]
    movd  [r0+r1+16], mm5
cglobal pixel_avg2_w20, 6,7
; Cacheline split code for processors with high latencies for loads
; split over cache lines. See sad-a.asm for a more detailed explanation.
; This particular instance is complicated by the fact that src1 and src2
; can have different alignments. For simplicity and code size, only the
; MMX cacheline workaround is used. As a result, in the case of SSE2
; pixel_avg, the cacheline check function calls the SSE2 version if there
; is no cacheline split, and the MMX workaround if there is.
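; Roughly, the check is (src & (cacheline-1)) <= cacheline - width (adjusted
; for widths that aren't a multiple of 8): the row load then cannot straddle
; a cacheline boundary and the plain routine is taken; e.g. for w16 with
; 64-byte lines the threshold is 48. Otherwise the pixel_avg2_wN_cache_mmx2
; fallback, built from 8-byte MMX loads, is used instead.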
%macro AVG_CACHELINE_START 0
%assign stack_offset 0
%macro AVG_CACHELINE_LOOP 2
    movq  mm3, [r2+r4+%1]
    movq  mm2, [r2+r4+8+%1]
%macro AVG_CACHELINE_FUNC 2
pixel_avg2_w%1_cache_mmx2:
    AVG_CACHELINE_LOOP 0, movq
    AVG_CACHELINE_LOOP 8, movq
    AVG_CACHELINE_LOOP 16, movd
%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
; w12 isn't needed because w16 is just as fast if there's no cacheline split
%define cachesplit pixel_avg2_w16_cache_mmx2
%define cachesplit pixel_avg2_w%1_cache_mmx2
cglobal pixel_avg2_w%1_cache%2_%3
    cmp   eax, (%2-%1-(%1 % 8))
    jbe   pixel_avg2_w%1_%3
    jb    pixel_avg2_w%1_%3
%if 0 ; or %1==8 - but the extra branch seems too expensive
    jz    pixel_avg2_w%1_%3
    jz    pixel_avg2_w%1_%3
%if mmsize==16 || (%1==8 && %2==64)
AVG_CACHELINE_FUNC %1, %2
AVG_CACHELINE_CHECK  8, 64, mmx2
AVG_CACHELINE_CHECK 12, 64, mmx2
%if ARCH_X86_64 == 0
AVG_CACHELINE_CHECK 16, 64, mmx2
AVG_CACHELINE_CHECK 20, 64, mmx2
AVG_CACHELINE_CHECK  8, 32, mmx2
AVG_CACHELINE_CHECK 12, 32, mmx2
AVG_CACHELINE_CHECK 16, 32, mmx2
AVG_CACHELINE_CHECK 20, 32, mmx2
AVG_CACHELINE_CHECK 16, 64, sse2
AVG_CACHELINE_CHECK 20, 64, sse2
; computed jump assumes this loop is exactly 48 bytes
%macro AVG16_CACHELINE_LOOP_SSSE3 2 ; alignment
avg_w16_align%1_%2_ssse3:
    movdqa  xmm1, [r2+r4+16]
    palignr xmm1, [r2+r4], %2
    movdqa  xmm1, [r2+16]
    palignr xmm1, [r2], %1
    movdqa  xmm1, [r2+16]
    movdqa  xmm2, [r2+r4+16]
    palignr xmm1, [r2], %1
    palignr xmm2, [r2+r4], %2&15
    jg      avg_w16_align%1_%2_ssse3
; make sure the first ones don't end up short
    times (48-($-avg_w16_align%1_%2_ssse3))>>4 nop
cglobal pixel_avg2_w16_cache64_ssse3
%if 0 ; seems both tests aren't worth it if src1%16==0 is optimized
    jb    x265_pixel_avg2_w16_sse2
    jz    x265_pixel_avg2_w16_sse2
    lea   r6, [r6*3]        ; (offset + align*2)*3
    shl   r6, 4             ; jump = (offset + align*2)*48
%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
    lea   r7, [avg_w16_addr]
    lea   r6, [avg_w16_addr + r6]
AVG16_CACHELINE_LOOP_SSSE3 j, j
AVG16_CACHELINE_LOOP_SSSE3 j, k
%endif ; !HIGH_BIT_DEPTH
;=============================================================================
;=============================================================================
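; COPY2 below copies four rows, one or two mmsize-wide chunks per invocation;
; %1 and %2 appear to hold the precomputed 3*dst_stride and 3*src_stride
; offsets so all four rows can be addressed without advancing the pointers,
; and %3/%4 select which chunk of a wide row is being copied.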
%macro COPY2 2-4 0, 1
    movu  m0, [r2+%3*mmsize]
    movu  m1, [r2+%4*mmsize]
    movu  m2, [r2+r3+%3*mmsize]
    movu  m3, [r2+r3+%4*mmsize]
    mova  [r0+%3*mmsize], m0
    mova  [r0+%4*mmsize], m1
    mova  [r0+r1+%3*mmsize], m2
    mova  [r0+r1+%4*mmsize], m3
    movu  m0, [r2+r3*2+%3*mmsize]
    movu  m1, [r2+r3*2+%4*mmsize]
    movu  m2, [r2+%2+%3*mmsize]
    movu  m3, [r2+%2+%4*mmsize]
    mova  [r0+r1*2+%3*mmsize], m0
    mova  [r0+r1*2+%4*mmsize], m1
    mova  [r0+%1+%3*mmsize], m2
    mova  [r0+%1+%4*mmsize], m3
;-----------------------------------------------------------------------------
; void mc_copy_w4( uint8_t *dst, intptr_t i_dst_stride,
;                  uint8_t *src, intptr_t i_src_stride, int i_height )
;-----------------------------------------------------------------------------
cglobal mc_copy_w4_mmx, 4,6
%if HIGH_BIT_DEPTH == 0
%assign %%w %1*SIZEOF_PIXEL/mmsize
cglobal mc_copy_w%1, 5,7
INIT_XMM aligned, sse
INIT_YMM aligned, avx
;=============================================================================
;=============================================================================
; assumes 64 byte cachelines
; FIXME doesn't cover all pixels in high depth and/or 4:4:4
;-----------------------------------------------------------------------------
; void prefetch_fenc( pixel *pix_y, intptr_t stride_y,
;                     pixel *pix_uv, intptr_t stride_uv, int mb_x )
;-----------------------------------------------------------------------------
%macro PREFETCH_FENC 1
cglobal prefetch_fenc_%1, 5,5
    lea   r0, [r0+r4*4+64*SIZEOF_PIXEL]
    lea   r2, [r2+rax*2+64*SIZEOF_PIXEL]
cglobal prefetch_fenc_%1, 0,3
    lea   r0, [r0+r2*4+64*SIZEOF_PIXEL]
    lea   r0, [r0+r2*2+64*SIZEOF_PIXEL]
%endif ; ARCH_X86_64
;-----------------------------------------------------------------------------
; void prefetch_ref( pixel *pix, intptr_t stride, int parity )
;-----------------------------------------------------------------------------
cglobal prefetch_ref, 3,3
    lea        r0, [r0+r2*8+64*SIZEOF_PIXEL]
    prefetcht0 [r0+r1*2]
    prefetcht0 [r0+r1*2]