1 ;*****************************************************************************
2 ;* ssd-a.asm: x86 ssd functions
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2013 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Fiona Glaser <fiona@x264.com>
8 ;* Laurent Aimar <fenrir@via.ecp.fr>
9 ;* Alex Izvorski <aizvorksi@gmail.com>
11 ;* This program is free software; you can redistribute it and/or modify
12 ;* it under the terms of the GNU General Public License as published by
13 ;* the Free Software Foundation; either version 2 of the License, or
14 ;* (at your option) any later version.
16 ;* This program is distributed in the hope that it will be useful,
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ;* GNU General Public License for more details.
21 ;* You should have received a copy of the GNU General Public License
22 ;* along with this program; if not, write to the Free Software
23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
25 ;* This program is also available under a commercial proprietary license.
26 ;* For more information, contact us at license @ x265.com.
27 ;*****************************************************************************
30 %include "x86util.asm"
39 ;=============================================================================
41 ;=============================================================================
44 ;-----------------------------------------------------------------------------
45 ; int pixel_ssd_ss_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t )
46 ;-----------------------------------------------------------------------------
48 cglobal pixel_ssd_ss_%1x%2, 4,7,8
52 %define offset0_2 r1*2
55 %define offset1_2 r3*2
60 %define offset0_1 mmsize
62 %define offset0_3 r1+mmsize
63 %define offset1_1 mmsize
65 %define offset1_3 r3+mmsize
67 %define offset0_1 mmsize
68 %define offset0_2 mmsize*2
69 %define offset0_3 mmsize*3
70 %define offset1_1 mmsize
71 %define offset1_2 mmsize*2
72 %define offset1_3 mmsize*3
74 %assign %%n %2/(2*mmsize/%1)
81 movu m2, [r0+offset0_1]
82 movu m3, [r0+offset0_2]
83 movu m4, [r0+offset0_3]
85 movu m7, [r2+offset1_1]
88 movu m6, [r2+offset1_2]
89 movu m7, [r2+offset1_3]
93 lea r0, [r0+r1*(%2/%%n)]
94 lea r2, [r2+r3*(%2/%%n)]
110 %ifidn movu,movq ; detect MMX
117 cglobal pixel_ssd_ss_%1x%2, 4,7,8
167 movu m2, [r0 + r1 + 16]
168 movu m3, [r0 + r1 + 32]
169 movu m4, [r0 + r1 + 48]
171 movu m7, [r2 + r3 + 16]
174 movu m6, [r2 + r3 + 32]
175 movu m7, [r2 + r3 + 48]
186 movu m1, [r0 + r1 + 64]
187 movu m2, [r0 + r1 + 80]
188 movu m6, [r2 + r3 + 64]
189 movu m7, [r2 + r3 + 80]
197 movu m3, [r0 + r1 + 96]
198 movu m4, [r0 + r1 + 112]
199 movu m6, [r2 + r3 + 96]
200 movu m7, [r2 + r3 + 112]
217 cglobal pixel_ssd_ss_%1x%2, 4,7,8
239 movu m2, [r0 + r1 + 16]
240 movu m4, [r0 + r1 + 32]
242 movu m6, [r2 + r3 + 16]
243 movu m7, [r2 + r3 + 32]
263 cglobal pixel_ssd_ss_%1x%2, 4,7,8
273 punpcklqdq m2, [r0 + r1 + 16]
279 punpcklqdq m4, [r2 + r3 + 16]
288 movh m2, [r0 + r5 + 16]
291 punpcklqdq m2, [r0 + r1 + 16]
294 movh m4, [r2 + r6 + 16]
298 punpcklqdq m4, [r2 + r3 + 16]
350 %endif ; HIGH_BIT_DEPTH
352 ;-----------------------------------------------------------------------------
353 ; int pixel_ssd_ss_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t )
354 ;-----------------------------------------------------------------------------
355 %if HIGH_BIT_DEPTH == 0
357 cglobal pixel_ssd_ss_%1x%2, 4,7,6
359 %if mmsize == %1*4 || mmsize == %1*2
360 %define offset0_1 r1*2
361 %define offset0_2 r1*4
363 %define offset1_1 r3*2
364 %define offset1_2 r3*4
372 %define offset0_2 r1*2
373 %define offset0_3 r1*2+16
375 %define offset1_2 r3*2
376 %define offset1_3 r3*2+16
379 %assign %%n %2/(mmsize/%1)
381 %assign %%n %2/(2*mmsize/%1)
394 movh m1, [r0 + offset0_1]
395 movh m2, [r2 + offset1_1]
399 movh m1, [r0 + offset0_2]
400 movh m2, [r2 + offset1_2]
404 movh m1, [r0 + offset0_3]
405 movh m2, [r2 + offset1_3]
415 movu m1, [r0 + offset0_1]
416 movu m2, [r2 + offset1_1]
420 movu m1, [r0 + offset0_2]
421 movu m2, [r2 + offset1_2]
425 movu m1, [r0 + offset0_3]
426 movu m2, [r2 + offset1_3]
431 lea r0, [r0+r1*(%2/%%n)*2]
432 lea r2, [r2+r3*(%2/%%n)*2]
438 %if notcpuflag(ssse3)
466 %macro SSD_SS_12x16 0
467 cglobal pixel_ssd_ss_12x16, 4,7,6
508 cglobal pixel_ssd_ss_32x%1, 4,7,6
573 cglobal pixel_ssd_ss_24x32, 4,7,6
620 cglobal pixel_ssd_ss_48x64, 4,7,6
697 cglobal pixel_ssd_ss_64x%1, 4,7,6
821 %endif ; !HIGH_BIT_DEPTH
823 %if HIGH_BIT_DEPTH == 0
824 %macro SSD_LOAD_FULL 5
868 DEINTB %2, %1, %4, %3, 7
885 vinserti128 m%1, m%1, %4, 1
893 vinserti128 m%2, m%2, %6, 1
897 SBUTTERFLY bw, %1, %2, %3
900 %macro SSD_LOAD_HALF 5
901 LOAD 1, 2, [t0+%1], [t0+%3], 1
902 JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1
903 LOAD 3, 4, [t0+%1], [t0+%3], %5
904 JOIN 3, 4, 5, 6, [t2+%2], [t2+%4], %5
917 punpcklbw m%2, m%1, m%5
919 punpcklbw m%4, m%3, m%5
928 %macro SSD_CORE_SSE2 7-8
930 DEINTB %6, %1, %7, %2, %5
934 DEINTB %6, %3, %7, %4, %5
945 %macro SSD_CORE_SSSE3 7-8
947 punpckhbw m%6, m%1, m%2
948 punpckhbw m%7, m%3, m%4
965 SSD_LOAD_%1 %2,%3,%4,%5,%6
966 SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1
973 ;-----------------------------------------------------------------------------
974 ; int pixel_ssd_WxH( uint8_t *, intptr_t, uint8_t *, intptr_t )
975 ;-----------------------------------------------------------------------------
978 %assign function_align 8
980 %assign function_align 16
982 cglobal pixel_ssd_%1x%2, 0,0,0
983 mov al, %1*%2/mmsize/2
986 jmp mangle(x265_pixel_ssd_%1x%1 %+ SUFFIX %+ .startloop)
991 DECLARE_REG_TMP 0,1,2,3
995 DECLARE_REG_TMP 1,2,3,4
1004 %elifidn cpuname, sse2
1014 SSD_ITER FULL, 0, 0, mmsize, mmsize, 1
1016 SSD_ITER FULL, 0, 0, t1, t3, 2
1018 SSD_ITER HALF, 0, 0, t1, t3, 2
1023 vextracti128 xm1, m0, 1
1047 SSD 24, 24 ; not used, but resolves x265_pixel_ssd_24x24_sse2.startloop symbol
1073 %define SSD_CORE SSD_CORE_SSE2
1074 %define JOIN JOIN_SSE2
1077 %define SSD_CORE SSD_CORE_SSSE3
1078 %define JOIN JOIN_SSSE3
1092 %define LOAD LOAD_AVX2
1093 %define JOIN JOIN_AVX2
1097 %assign function_align 16
1098 %endif ; !HIGH_BIT_DEPTH
1100 ;-----------------------------------------------------------------------------
1101 ; int pixel_ssd_12x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
1102 ;-----------------------------------------------------------------------------
1104 cglobal pixel_ssd_12x16, 4, 5, 7, src1, stride1, src2, stride2
1115 punpckhdq m4, m0, m2
1116 punpckhdq m5, m1, m3
1137 movu m0, [r0 + 2 * r1]
1138 movu m1, [r2 + 2 * r3]
1139 lea r0, [r0 + 2 * r1]
1140 lea r2, [r2 + 2 * r3]
1144 punpckhdq m4, m0, m2
1145 punpckhdq m5, m1, m3
1167 lea r0, [r0 + 2 * r1]
1168 lea r2, [r2 + 2 * r3]
1176 ;-----------------------------------------------------------------------------
1177 ; int pixel_ssd_24x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
1178 ;-----------------------------------------------------------------------------
1180 cglobal pixel_ssd_24x32, 4, 5, 8, src1, stride1, src2, stride2
1190 pmovzxbw m2, [r0 + 16]
1194 pmovzxbw m5, [r2 + 16]
1211 pmovzxbw m2, [r0 + r1 + 16]
1215 pmovzxbw m5, [r2 + r3 + 16]
1230 lea r0, [r0 + 2 * r1]
1231 lea r2, [r2 + 2 * r3]
1239 %macro PIXEL_SSD_16x4 0
1273 movu m3, [r2 + 2 * r3]
1281 lea r2, [r2 + 2 * r3]
1303 cglobal pixel_ssd_16x16_internal
1306 lea r2, [r2 + 2 * r3]
1309 lea r2, [r2 + 2 * r3]
1312 lea r2, [r2 + 2 * r3]
1316 ;-----------------------------------------------------------------------------
1317 ; int pixel_ssd_48x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
1318 ;-----------------------------------------------------------------------------
1320 cglobal pixel_ssd_48x64, 4, 7, 8, src1, stride1, src2, stride2
1328 call pixel_ssd_16x16_internal
1330 lea r2, [r2 + 2 * r3]
1331 call pixel_ssd_16x16_internal
1333 lea r2, [r2 + 2 * r3]
1334 call pixel_ssd_16x16_internal
1336 lea r2, [r2 + 2 * r3]
1337 call pixel_ssd_16x16_internal
1340 call pixel_ssd_16x16_internal
1342 lea r2, [r2 + 2 * r3]
1343 call pixel_ssd_16x16_internal
1345 lea r2, [r2 + 2 * r3]
1346 call pixel_ssd_16x16_internal
1348 lea r2, [r2 + 2 * r3]
1349 call pixel_ssd_16x16_internal
1352 call pixel_ssd_16x16_internal
1354 lea r2, [r2 + 2 * r3]
1355 call pixel_ssd_16x16_internal
1357 lea r2, [r2 + 2 * r3]
1358 call pixel_ssd_16x16_internal
1360 lea r2, [r2 + 2 * r3]
1361 call pixel_ssd_16x16_internal
1368 ;-----------------------------------------------------------------------------
1369 ; int pixel_ssd_64x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
1370 ;-----------------------------------------------------------------------------
1372 cglobal pixel_ssd_64x16, 4, 7, 8, src1, stride1, src2, stride2
1380 call pixel_ssd_16x16_internal
1383 call pixel_ssd_16x16_internal
1386 call pixel_ssd_16x16_internal
1389 call pixel_ssd_16x16_internal
1396 ;-----------------------------------------------------------------------------
1397 ; int pixel_ssd_64x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
1398 ;-----------------------------------------------------------------------------
1400 cglobal pixel_ssd_64x32, 4, 7, 8, src1, stride1, src2, stride2
1408 call pixel_ssd_16x16_internal
1410 lea r2, [r2 + 2 * r3]
1411 call pixel_ssd_16x16_internal
1414 call pixel_ssd_16x16_internal
1416 lea r2, [r2 + 2 * r3]
1417 call pixel_ssd_16x16_internal
1420 call pixel_ssd_16x16_internal
1422 lea r2, [r2 + 2 * r3]
1423 call pixel_ssd_16x16_internal
1426 call pixel_ssd_16x16_internal
1428 lea r2, [r2 + 2 * r3]
1429 call pixel_ssd_16x16_internal
1436 ;-----------------------------------------------------------------------------
1437 ; int pixel_ssd_64x48( uint8_t *, intptr_t, uint8_t *, intptr_t )
1438 ;-----------------------------------------------------------------------------
1440 cglobal pixel_ssd_64x48, 4, 7, 8, src1, stride1, src2, stride2
1448 call pixel_ssd_16x16_internal
1450 lea r2, [r2 + 2 * r3]
1451 call pixel_ssd_16x16_internal
1453 lea r2, [r2 + 2 * r3]
1454 call pixel_ssd_16x16_internal
1457 call pixel_ssd_16x16_internal
1459 lea r2, [r2 + 2 * r3]
1460 call pixel_ssd_16x16_internal
1462 lea r2, [r2 + 2 * r3]
1463 call pixel_ssd_16x16_internal
1466 call pixel_ssd_16x16_internal
1468 lea r2, [r2 + 2 * r3]
1469 call pixel_ssd_16x16_internal
1471 lea r2, [r2 + 2 * r3]
1472 call pixel_ssd_16x16_internal
1475 call pixel_ssd_16x16_internal
1477 lea r2, [r2 + 2 * r3]
1478 call pixel_ssd_16x16_internal
1480 lea r2, [r2 + 2 * r3]
1481 call pixel_ssd_16x16_internal
1488 ;-----------------------------------------------------------------------------
1489 ; int pixel_ssd_64x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
1490 ;-----------------------------------------------------------------------------
1492 cglobal pixel_ssd_64x64, 4, 7, 8, src1, stride1, src2, stride2
1500 call pixel_ssd_16x16_internal
1502 lea r2, [r2 + 2 * r3]
1503 call pixel_ssd_16x16_internal
1505 lea r2, [r2 + 2 * r3]
1506 call pixel_ssd_16x16_internal
1508 lea r2, [r2 + 2 * r3]
1509 call pixel_ssd_16x16_internal
1512 call pixel_ssd_16x16_internal
1514 lea r2, [r2 + 2 * r3]
1515 call pixel_ssd_16x16_internal
1517 lea r2, [r2 + 2 * r3]
1518 call pixel_ssd_16x16_internal
1520 lea r2, [r2 + 2 * r3]
1521 call pixel_ssd_16x16_internal
1524 call pixel_ssd_16x16_internal
1526 lea r2, [r2 + 2 * r3]
1527 call pixel_ssd_16x16_internal
1529 lea r2, [r2 + 2 * r3]
1530 call pixel_ssd_16x16_internal
1532 lea r2, [r2 + 2 * r3]
1533 call pixel_ssd_16x16_internal
1536 call pixel_ssd_16x16_internal
1538 lea r2, [r2 + 2 * r3]
1539 call pixel_ssd_16x16_internal
1541 lea r2, [r2 + 2 * r3]
1542 call pixel_ssd_16x16_internal
1544 lea r2, [r2 + 2 * r3]
1545 call pixel_ssd_16x16_internal
1552 ;-----------------------------------------------------------------------------
1553 ; int pixel_ssd_sp ( int16_t *, intptr_t, uint8_t *, intptr_t )
1554 ;-----------------------------------------------------------------------------
1556 cglobal pixel_ssd_sp_4x4_internal
1565 movh m4, [r0 + 2 * r1]
1568 movd m6, [r2 + 2 * r3]
1569 lea r2, [r2 + 2 * r3]
1580 ;-----------------------------------------------------------------------------
1581 ; int pixel_ssd_sp_4x4( int16_t *, intptr_t, uint8_t *, intptr_t )
1582 ;-----------------------------------------------------------------------------
1584 cglobal pixel_ssd_sp_4x4, 4, 5, 8, src1, stride1, src2, stride2
1588 call pixel_ssd_sp_4x4_internal
1593 ;-----------------------------------------------------------------------------
1594 ; int pixel_ssd_sp_4x8( int16_t *, intptr_t, uint8_t *, intptr_t )
1595 ;-----------------------------------------------------------------------------
1597 cglobal pixel_ssd_sp_4x8, 4, 5, 8, src1, stride1, src2, stride2
1601 call pixel_ssd_sp_4x4_internal
1602 lea r0, [r0 + 4 * r1]
1603 lea r2, [r2 + 2 * r3]
1604 call pixel_ssd_sp_4x4_internal
1609 ;-----------------------------------------------------------------------------
1610 ; int pixel_ssd_sp_4x16( int16_t *, intptr_t, uint8_t *, intptr_t )
1611 ;-----------------------------------------------------------------------------
1613 cglobal pixel_ssd_sp_4x16, 4, 5, 8, src1, stride1, src2, stride2
1617 call pixel_ssd_sp_4x4_internal
1618 lea r0, [r0 + 4 * r1]
1619 lea r2, [r2 + 2 * r3]
1620 call pixel_ssd_sp_4x4_internal
1621 lea r0, [r0 + 4 * r1]
1622 lea r2, [r2 + 2 * r3]
1623 call pixel_ssd_sp_4x4_internal
1624 lea r0, [r0 + 4 * r1]
1625 lea r2, [r2 + 2 * r3]
1626 call pixel_ssd_sp_4x4_internal
1631 cglobal pixel_ssd_sp_8x4_internal
1642 movu m4, [r0 + 2 * r1]
1644 movh m2, [r2 + 2 * r3]
1663 ;-----------------------------------------------------------------------------
1664 ; int pixel_ssd_sp_8x4( int16_t *, intptr_t, uint8_t *, intptr_t )
1665 ;-----------------------------------------------------------------------------
1667 cglobal pixel_ssd_sp_8x4, 4, 6, 8, src1, stride1, src2, stride2
1672 call pixel_ssd_sp_8x4_internal
1677 ;-----------------------------------------------------------------------------
1678 ; int pixel_ssd_sp_8x8( int16_t *, intptr_t, uint8_t *, intptr_t )
1679 ;-----------------------------------------------------------------------------
1681 cglobal pixel_ssd_sp_8x8, 4, 6, 8, src1, stride1, src2, stride2
1686 call pixel_ssd_sp_8x4_internal
1687 lea r0, [r0 + 4 * r1]
1688 lea r2, [r2 + 4 * r3]
1689 call pixel_ssd_sp_8x4_internal
1694 ;-----------------------------------------------------------------------------
1695 ; int pixel_ssd_sp_8x16( int16_t *, intptr_t, uint8_t *, intptr_t )
1696 ;-----------------------------------------------------------------------------
1698 cglobal pixel_ssd_sp_8x16, 4, 6, 8, src1, stride1, src2, stride2
1703 call pixel_ssd_sp_8x4_internal
1704 lea r0, [r0 + 4 * r1]
1705 lea r2, [r2 + 4 * r3]
1706 call pixel_ssd_sp_8x4_internal
1707 lea r0, [r0 + 4 * r1]
1708 lea r2, [r2 + 4 * r3]
1709 call pixel_ssd_sp_8x4_internal
1710 lea r0, [r0 + 4 * r1]
1711 lea r2, [r2 + 4 * r3]
1712 call pixel_ssd_sp_8x4_internal
1717 ;-----------------------------------------------------------------------------
1718 ; int pixel_ssd_sp_8x32( int16_t *, intptr_t, uint8_t *, intptr_t )
1719 ;-----------------------------------------------------------------------------
1721 cglobal pixel_ssd_sp_8x32, 4, 6, 8, src1, stride1, src2, stride2
1726 call pixel_ssd_sp_8x4_internal
1727 lea r0, [r0 + 4 * r1]
1728 lea r2, [r2 + 4 * r3]
1729 call pixel_ssd_sp_8x4_internal
1730 lea r0, [r0 + 4 * r1]
1731 lea r2, [r2 + 4 * r3]
1732 call pixel_ssd_sp_8x4_internal
1733 lea r0, [r0 + 4 * r1]
1734 lea r2, [r2 + 4 * r3]
1735 call pixel_ssd_sp_8x4_internal
1736 lea r0, [r0 + 4 * r1]
1737 lea r2, [r2 + 4 * r3]
1738 call pixel_ssd_sp_8x4_internal
1739 lea r0, [r0 + 4 * r1]
1740 lea r2, [r2 + 4 * r3]
1741 call pixel_ssd_sp_8x4_internal
1742 lea r0, [r0 + 4 * r1]
1743 lea r2, [r2 + 4 * r3]
1744 call pixel_ssd_sp_8x4_internal
1745 lea r0, [r0 + 4 * r1]
1746 lea r2, [r2 + 4 * r3]
1747 call pixel_ssd_sp_8x4_internal
1752 ;-----------------------------------------------------------------------------
1753 ; int pixel_ssd_sp_12x16( int16_t *, intptr_t, uint8_t *, intptr_t )
1754 ;-----------------------------------------------------------------------------
1756 cglobal pixel_ssd_sp_12x16, 4, 7, 8, src1, stride1, src2, stride2
1762 call pixel_ssd_sp_4x4_internal
1763 lea r0, [r0 + 4 * r1]
1764 lea r2, [r2 + 2 * r3]
1765 call pixel_ssd_sp_4x4_internal
1766 lea r0, [r0 + 4 * r1]
1767 lea r2, [r2 + 2 * r3]
1768 call pixel_ssd_sp_4x4_internal
1769 lea r0, [r0 + 4 * r1]
1770 lea r2, [r2 + 2 * r3]
1771 call pixel_ssd_sp_4x4_internal
1775 call pixel_ssd_sp_8x4_internal
1776 lea r0, [r0 + 4 * r1]
1777 lea r2, [r2 + 4 * r3]
1778 call pixel_ssd_sp_8x4_internal
1779 lea r0, [r0 + 4 * r1]
1780 lea r2, [r2 + 4 * r3]
1781 call pixel_ssd_sp_8x4_internal
1782 lea r0, [r0 + 4 * r1]
1783 lea r2, [r2 + 4 * r3]
1784 call pixel_ssd_sp_8x4_internal
1789 %macro PIXEL_SSD_SP_16x4 0
1800 movu m5, [r0 + r1 +16]
1818 movu m0, [r0 + 2 * r1]
1819 movu m1, [r0 + 2 * r1 + 16]
1820 movu m3, [r2 + 2 * r3]
1827 lea r0, [r0 + 2 * r1]
1828 lea r2, [r2 + 2 * r3]
1830 movu m5, [r0 + r1 + 16]
1849 ;-----------------------------------------------------------------------------
1850 ; int pixel_ssd_sp_16x4( int16_t *, intptr_t, uint8_t *, intptr_t )
1851 ;-----------------------------------------------------------------------------
1853 cglobal pixel_ssd_sp_16x4, 4, 6, 8, src1, stride1, src2, stride2
1864 ;-----------------------------------------------------------------------------
1865 ; int pixel_ssd_sp_16x8( int16_t *, intptr_t, uint8_t *, intptr_t )
1866 ;-----------------------------------------------------------------------------
1868 cglobal pixel_ssd_sp_16x8, 4, 4, 8, src1, stride1, src2, stride2
1874 lea r0, [r0 + 2 * r1]
1875 lea r2, [r2 + 2 * r3]
1881 ;-----------------------------------------------------------------------------
1882 ; int pixel_ssd_sp_16x12( int16_t *, intptr_t, uint8_t *, intptr_t )
1883 ;-----------------------------------------------------------------------------
1885 cglobal pixel_ssd_sp_16x12, 4, 6, 8, src1, stride1, src2, stride2
1903 ;-----------------------------------------------------------------------------
1904 ; int pixel_ssd_sp_16x16( int16_t *, intptr_t, uint8_t *, intptr_t )
1905 ;-----------------------------------------------------------------------------
1907 cglobal pixel_ssd_sp_16x16, 4, 6, 8, src1, stride1, src2, stride2
1928 cglobal pixel_ssd_sp_16x16_internal
1931 lea r2, [r2 + 2 * r3]
1934 lea r2, [r2 + 2 * r3]
1937 lea r2, [r2 + 2 * r3]
1941 ;-----------------------------------------------------------------------------
1942 ; int pixel_ssd_sp_16x32( int16_t *, intptr_t, uint8_t *, intptr_t )
1943 ;-----------------------------------------------------------------------------
1945 cglobal pixel_ssd_sp_16x32, 4, 5, 8, src1, stride1, src2, stride2
1951 call pixel_ssd_sp_16x16_internal
1953 lea r2, [r2 + 2 * r3]
1954 call pixel_ssd_sp_16x16_internal
1959 ;-----------------------------------------------------------------------------
1960 ; int pixel_ssd_sp_16x64( int16_t *, intptr_t, uint8_t *, intptr_t )
1961 ;-----------------------------------------------------------------------------
1963 cglobal pixel_ssd_sp_16x64, 4, 6, 8, src1, stride1, src2, stride2
1970 call pixel_ssd_sp_16x16_internal
1973 call pixel_ssd_sp_16x16_internal
1976 call pixel_ssd_sp_16x16_internal
1979 call pixel_ssd_sp_16x16_internal
1985 ;-----------------------------------------------------------------------------
1986 ; int pixel_ssd_sp_24x32( int16_t *, intptr_t, uint8_t *, intptr_t )
1987 ;-----------------------------------------------------------------------------
1989 cglobal pixel_ssd_sp_24x32, 4, 7, 8, src1, stride1, src2, stride2
1996 call pixel_ssd_sp_16x16_internal
1998 lea r2, [r2 + 2 * r3]
1999 call pixel_ssd_sp_16x16_internal
2004 call pixel_ssd_sp_8x4_internal
2005 lea r0, [r0 + 4 * r1]
2006 lea r2, [r2 + 4 * r3]
2007 call pixel_ssd_sp_8x4_internal
2008 lea r0, [r0 + 4 * r1]
2009 lea r2, [r2 + 4 * r3]
2010 call pixel_ssd_sp_8x4_internal
2011 lea r0, [r0 + 4 * r1]
2012 lea r2, [r2 + 4 * r3]
2013 call pixel_ssd_sp_8x4_internal
2014 lea r0, [r0 + 4 * r1]
2015 lea r2, [r2 + 4 * r3]
2016 call pixel_ssd_sp_8x4_internal
2017 lea r0, [r0 + 4 * r1]
2018 lea r2, [r2 + 4 * r3]
2019 call pixel_ssd_sp_8x4_internal
2020 lea r0, [r0 + 4 * r1]
2021 lea r2, [r2 + 4 * r3]
2022 call pixel_ssd_sp_8x4_internal
2023 lea r0, [r0 + 4 * r1]
2024 lea r2, [r2 + 4 * r3]
2025 call pixel_ssd_sp_8x4_internal
2030 ;-----------------------------------------------------------------------------
2031 ; int pixel_ssd_sp_32x8( int16_t *, intptr_t, uint8_t *, intptr_t )
2032 ;-----------------------------------------------------------------------------
2034 cglobal pixel_ssd_sp_32x8, 4, 7, 8, src1, stride1, src2, stride2
2044 lea r2, [r2 + 2 * r3]
2050 lea r2, [r2 + 2 * r3]
2056 ;-----------------------------------------------------------------------------
2057 ; int pixel_ssd_sp_32x16( int16_t *, intptr_t, uint8_t *, intptr_t )
2058 ;-----------------------------------------------------------------------------
2060 cglobal pixel_ssd_sp_32x16, 4, 7, 8, src1, stride1, src2, stride2
2068 call pixel_ssd_sp_16x16_internal
2071 call pixel_ssd_sp_16x16_internal
2076 ;-----------------------------------------------------------------------------
2077 ; int pixel_ssd_sp_32x24( int16_t *, intptr_t, uint8_t *, intptr_t )
2078 ;-----------------------------------------------------------------------------
2080 cglobal pixel_ssd_sp_32x24, 4, 7, 8, src1, stride1, src2, stride2
2088 call pixel_ssd_sp_16x16_internal
2090 lea r2, [r2 + 2 * r3]
2093 lea r2, [r2 + 2 * r3]
2097 call pixel_ssd_sp_16x16_internal
2099 lea r2, [r2 + 2 * r3]
2102 lea r2, [r2 + 2 * r3]
2108 ;-----------------------------------------------------------------------------
2109 ; int pixel_ssd_sp_32x32( int16_t *, intptr_t, uint8_t *, intptr_t )
2110 ;-----------------------------------------------------------------------------
2112 cglobal pixel_ssd_sp_32x32, 4, 7, 8, src1, stride1, src2, stride2
2120 call pixel_ssd_sp_16x16_internal
2122 lea r2, [r2 + 2 * r3]
2123 call pixel_ssd_sp_16x16_internal
2126 call pixel_ssd_sp_16x16_internal
2128 lea r2, [r2 + 2 * r3]
2129 call pixel_ssd_sp_16x16_internal
2134 ;-----------------------------------------------------------------------------
2135 ; int pixel_ssd_sp_32x64( int16_t *, intptr_t, uint8_t *, intptr_t )
2136 ;-----------------------------------------------------------------------------
2138 cglobal pixel_ssd_sp_32x64, 4, 7, 8, src1, stride1, src2, stride2
2146 call pixel_ssd_sp_16x16_internal
2148 lea r2, [r2 + 2 * r3]
2149 call pixel_ssd_sp_16x16_internal
2151 lea r2, [r2 + 2 * r3]
2152 call pixel_ssd_sp_16x16_internal
2154 lea r2, [r2 + 2 * r3]
2155 call pixel_ssd_sp_16x16_internal
2158 call pixel_ssd_sp_16x16_internal
2160 lea r2, [r2 + 2 * r3]
2161 call pixel_ssd_sp_16x16_internal
2163 lea r2, [r2 + 2 * r3]
2164 call pixel_ssd_sp_16x16_internal
2166 lea r2, [r2 + 2 * r3]
2167 call pixel_ssd_sp_16x16_internal
2172 ;-----------------------------------------------------------------------------
2173 ; int pixel_ssd_sp_48x64( int16_t *, intptr_t, uint8_t *, intptr_t )
2174 ;-----------------------------------------------------------------------------
2176 cglobal pixel_ssd_sp_48x64, 4, 7, 8, src1, stride1, src2, stride2
2184 call pixel_ssd_sp_16x16_internal
2186 lea r2, [r2 + 2 * r3]
2187 call pixel_ssd_sp_16x16_internal
2189 lea r2, [r2 + 2 * r3]
2190 call pixel_ssd_sp_16x16_internal
2192 lea r2, [r2 + 2 * r3]
2193 call pixel_ssd_sp_16x16_internal
2196 call pixel_ssd_sp_16x16_internal
2198 lea r2, [r2 + 2 * r3]
2199 call pixel_ssd_sp_16x16_internal
2201 lea r2, [r2 + 2 * r3]
2202 call pixel_ssd_sp_16x16_internal
2204 lea r2, [r2 + 2 * r3]
2205 call pixel_ssd_sp_16x16_internal
2208 call pixel_ssd_sp_16x16_internal
2210 lea r2, [r2 + 2 * r3]
2211 call pixel_ssd_sp_16x16_internal
2213 lea r2, [r2 + 2 * r3]
2214 call pixel_ssd_sp_16x16_internal
2216 lea r2, [r2 + 2 * r3]
2217 call pixel_ssd_sp_16x16_internal
2222 ;-----------------------------------------------------------------------------
2223 ; int pixel_ssd_sp_64x16( int16_t *, intptr_t, uint8_t *, intptr_t )
2224 ;-----------------------------------------------------------------------------
2226 cglobal pixel_ssd_sp_64x16, 4, 7, 8, src1, stride1, src2, stride2
2234 call pixel_ssd_sp_16x16_internal
2237 call pixel_ssd_sp_16x16_internal
2240 call pixel_ssd_sp_16x16_internal
2243 call pixel_ssd_sp_16x16_internal
2248 ;-----------------------------------------------------------------------------
2249 ; int pixel_ssd_sp_64x32( int16_t *, intptr_t, uint8_t *, intptr_t )
2250 ;-----------------------------------------------------------------------------
2252 cglobal pixel_ssd_sp_64x32, 4, 7, 8, src1, stride1, src2, stride2
2260 call pixel_ssd_sp_16x16_internal
2262 lea r2, [r2 + 2 * r3]
2263 call pixel_ssd_sp_16x16_internal
2266 call pixel_ssd_sp_16x16_internal
2268 lea r2, [r2 + 2 * r3]
2269 call pixel_ssd_sp_16x16_internal
2272 call pixel_ssd_sp_16x16_internal
2274 lea r2, [r2 + 2 * r3]
2275 call pixel_ssd_sp_16x16_internal
2278 call pixel_ssd_sp_16x16_internal
2280 lea r2, [r2 + 2 * r3]
2281 call pixel_ssd_sp_16x16_internal
2286 ;-----------------------------------------------------------------------------
2287 ; int pixel_ssd_sp_64x48( int16_t *, intptr_t, uint8_t *, intptr_t )
2288 ;-----------------------------------------------------------------------------
2290 cglobal pixel_ssd_sp_64x48, 4, 7, 8, src1, stride1, src2, stride2
2298 call pixel_ssd_sp_16x16_internal
2300 lea r2, [r2 + 2 * r3]
2301 call pixel_ssd_sp_16x16_internal
2303 lea r2, [r2 + 2 * r3]
2304 call pixel_ssd_sp_16x16_internal
2307 call pixel_ssd_sp_16x16_internal
2309 lea r2, [r2 + 2 * r3]
2310 call pixel_ssd_sp_16x16_internal
2312 lea r2, [r2 + 2 * r3]
2313 call pixel_ssd_sp_16x16_internal
2316 call pixel_ssd_sp_16x16_internal
2318 lea r2, [r2 + 2 * r3]
2319 call pixel_ssd_sp_16x16_internal
2321 lea r2, [r2 + 2 * r3]
2322 call pixel_ssd_sp_16x16_internal
2325 call pixel_ssd_sp_16x16_internal
2327 lea r2, [r2 + 2 * r3]
2328 call pixel_ssd_sp_16x16_internal
2330 lea r2, [r2 + 2 * r3]
2331 call pixel_ssd_sp_16x16_internal
2336 ;-----------------------------------------------------------------------------
2337 ; int pixel_ssd_sp_64x64( int16_t *, intptr_t, uint8_t *, intptr_t )
2338 ;-----------------------------------------------------------------------------
2340 cglobal pixel_ssd_sp_64x64, 4, 7, 8, src1, stride1, src2, stride2
2348 call pixel_ssd_sp_16x16_internal
2350 lea r2, [r2 + 2 * r3]
2351 call pixel_ssd_sp_16x16_internal
2353 lea r2, [r2 + 2 * r3]
2354 call pixel_ssd_sp_16x16_internal
2356 lea r2, [r2 + 2 * r3]
2357 call pixel_ssd_sp_16x16_internal
2360 call pixel_ssd_sp_16x16_internal
2362 lea r2, [r2 + 2 * r3]
2363 call pixel_ssd_sp_16x16_internal
2365 lea r2, [r2 + 2 * r3]
2366 call pixel_ssd_sp_16x16_internal
2368 lea r2, [r2 + 2 * r3]
2369 call pixel_ssd_sp_16x16_internal
2372 call pixel_ssd_sp_16x16_internal
2374 lea r2, [r2 + 2 * r3]
2375 call pixel_ssd_sp_16x16_internal
2377 lea r2, [r2 + 2 * r3]
2378 call pixel_ssd_sp_16x16_internal
2380 lea r2, [r2 + 2 * r3]
2381 call pixel_ssd_sp_16x16_internal
2384 call pixel_ssd_sp_16x16_internal
2386 lea r2, [r2 + 2 * r3]
2387 call pixel_ssd_sp_16x16_internal
2389 lea r2, [r2 + 2 * r3]
2390 call pixel_ssd_sp_16x16_internal
2392 lea r2, [r2 + 2 * r3]
2393 call pixel_ssd_sp_16x16_internal
2399 ;-----------------------------------------------------------------------------
2400 ; int pixel_ssd_s( int16_t *ref, intptr_t i_stride )
2401 ;-----------------------------------------------------------------------------
2403 cglobal pixel_ssd_s_4, 2,2,2
2406 movhps m0, [r0 + r1]
2408 lea r0, [r0 + r1 * 2]
2410 movhps m1, [r0 + r1]
2416 ; calculate sum and return
2423 cglobal pixel_ssd_s_8, 2,3,5
2428 movu m2, [r0 + r1 * 2]
2439 lea r0, [r0 + r1 * 4]
2442 movu m2, [r0 + r1 * 2]
2454 ; calculate sum and return
2461 cglobal pixel_ssd_s_16, 2,3,5
2468 movu m2, [r0 + mmsize]
2470 movu m4, [r0 + r1 + mmsize]
2471 lea r0, [r0 + r1 * 2]
2483 movu m2, [r0 + mmsize]
2485 movu m4, [r0 + r1 + mmsize]
2486 lea r0, [r0 + r1 * 2]
2500 ; calculate sum and return
2507 cglobal pixel_ssd_s_32, 2,3,5
2513 movu m1, [r0 + 0 * mmsize]
2514 movu m2, [r0 + 1 * mmsize]
2515 movu m3, [r0 + 2 * mmsize]
2516 movu m4, [r0 + 3 * mmsize]
2528 movu m1, [r0 + 0 * mmsize]
2529 movu m2, [r0 + 1 * mmsize]
2530 movu m3, [r0 + 2 * mmsize]
2531 movu m4, [r0 + 3 * mmsize]
2546 ; calculate sum and return
2553 cglobal pixel_ssd_s_32, 2,4,5
2560 movu m1, [r0 + 0 * mmsize]
2561 movu m2, [r0 + 1 * mmsize]
2562 movu m3, [r0 + r1 + 0 * mmsize]
2563 movu m4, [r0 + r1 + 1 * mmsize]
2574 movu m1, [r0 + r1 * 2 + 0 * mmsize]
2575 movu m2, [r0 + r1 * 2 + 1 * mmsize]
2576 movu m3, [r0 + r3 + 0 * mmsize]
2577 movu m4, [r0 + r3 + 1 * mmsize]
2578 lea r0, [r0 + 4 * r1]
2592 ; calculate sum and return