;*****************************************************************************
;* sad-a.asm: x86 sad functions
;*****************************************************************************
;* Copyright (C) 2003-2013 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Alex Izvorski <aizvorksi@gmail.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************

%include "x86util.asm"

MSK:          db 255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0
pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1
hpred_shuf:   db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11
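; (MSK keeps the low 12 bytes of a 16-byte lane and zeroes the top 4;
; presumably it is ANDed with loaded rows to discard the unused column in
; the 12-pixel-wide sads below)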

;=============================================================================
; SAD MMX
;=============================================================================

%macro SAD_INC_2x16P 0
    punpckldq mm1, [r0+r1]
    punpckldq mm2, [r2+r3]

;-----------------------------------------------------------------------------
; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
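; A minimal C sketch of what each pixel_sad_WxH kernel computes (for
; illustration only; W, H and the pix1/pix2 names are generic, not from
; this file):
;     int sad = 0;
;     for( int y = 0; y < H; y++, pix1 += stride1, pix2 += stride2 )
;         for( int x = 0; x < W; x++ )
;             sad += abs( pix1[x] - pix2[x] );
;     return sad;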
cglobal pixel_sad_%1x%2_mmx2, 4,4

;=============================================================================
; SAD SSE2
;=============================================================================

%macro SAD_END_SSE2 0

%macro PROCESS_SAD_12x4 0

%macro PROCESS_SAD_16x4 0
    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]

%macro PROCESS_SAD_24x4 0

%macro PROCESS_SAD_32x4 0

%macro PROCESS_SAD_48x4 0

%macro PROCESS_SAD_8x4 0
    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]

%macro PROCESS_SAD_64x4 0

;-----------------------------------------------------------------------------
; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x16, 4,4,8

;-----------------------------------------------------------------------------
; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x8, 4,4

;-----------------------------------------------------------------------------
; int pixel_sad_16x12( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x12, 4,4,3

;-----------------------------------------------------------------------------
; int pixel_sad_16x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x32, 4,5,3

;-----------------------------------------------------------------------------
; int pixel_sad_16x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x64, 4,5,3

;-----------------------------------------------------------------------------
; int pixel_sad_16x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x4, 4,4,3
    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]

;-----------------------------------------------------------------------------
; int pixel_sad_32x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_32x8, 4,4,3

;-----------------------------------------------------------------------------
; int pixel_sad_32x24( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_32x24, 4,5,3

;-----------------------------------------------------------------------------
; int pixel_sad_32x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_32x32, 4,5,3

;-----------------------------------------------------------------------------
; int pixel_sad_32x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_32x16, 4,4,3

;-----------------------------------------------------------------------------
; int pixel_sad_32x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_32x64, 4,5,3

;-----------------------------------------------------------------------------
; int pixel_sad_8x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_8x32, 4,5,3

;-----------------------------------------------------------------------------
; int pixel_sad_64x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_64x16, 4,4,5

;-----------------------------------------------------------------------------
; int pixel_sad_64x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_64x32, 4,5,5

;-----------------------------------------------------------------------------
; int pixel_sad_64x48( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_64x48, 4,5,5

;-----------------------------------------------------------------------------
; int pixel_sad_64x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_64x64, 4,5,5

;-----------------------------------------------------------------------------
; int pixel_sad_48x64( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_48x64, 4,5,5

;-----------------------------------------------------------------------------
; int pixel_sad_24x32( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_24x32, 4,5,4

;-----------------------------------------------------------------------------
; int pixel_sad_12x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_12x16, 4,4,4

INIT_XMM sse2, aligned

%macro SAD_INC_4x8P_SSE 1
    ACCUM paddw, 0, 1, %1

; Even on Nehalem, no sizes other than 8x16 benefit from this method.
cglobal pixel_sad_8x16_sse2, 4,4

;=============================================================================
; SAD x3/x4 MMX
;=============================================================================

%macro SAD_X3_START_1x8P 0

%macro SAD_X3_START_2x4P 3
    punpckldq mm3, [r0+FENC_STRIDE]
    punpckldq %1, [r1+r4]
    punpckldq %2, [r2+r4]
    punpckldq %3, [r3+r4]

%macro SAD_X3_2x16P 1
    SAD_X3_1x8P FENC_STRIDE, r4
    SAD_X3_1x8P FENC_STRIDE+8, r4+8
    add r0, 2*FENC_STRIDE

    SAD_X3_1x8P FENC_STRIDE, r4
    add r0, 2*FENC_STRIDE

    SAD_X3_START_2x4P mm0, mm1, mm2
    SAD_X3_START_2x4P mm4, mm5, mm6
    add r0, 2*FENC_STRIDE

%macro SAD_X4_START_1x8P 0

%macro SAD_X4_START_2x4P 0
    punpckldq mm7, [r0+FENC_STRIDE]
    punpckldq mm0, [r1+r5]
    punpckldq mm1, [r2+r5]
    punpckldq mm2, [r3+r5]
    punpckldq mm3, [r4+r5]

%macro SAD_X4_INC_2x4P 0
    punpckldq mm7, [r0+FENC_STRIDE]
    punpckldq mm4, [r1+r5]
    punpckldq mm5, [r2+r5]

    punpckldq mm4, [r3+r5]
    punpckldq mm5, [r4+r5]

%macro SAD_X4_2x16P 1
    SAD_X4_1x8P FENC_STRIDE, r5
    SAD_X4_1x8P FENC_STRIDE+8, r5+8
    add r0, 2*FENC_STRIDE

%macro SAD_X4_2x8P 1
    SAD_X4_1x8P FENC_STRIDE, r5
    add r0, 2*FENC_STRIDE

%macro SAD_X4_2x4P 1
    add r0, 2*FENC_STRIDE

%macro SAD_X3_12x4 0
    mova m3, [r0 + FENC_STRIDE]
    mova m3, [r0 + FENC_STRIDE * 2]
    movu m5, [r1 + r4 * 2]
    movu m5, [r2 + r4 * 2]
    movu m5, [r3 + r4 * 2]
    lea r1, [r1 + r4 * 2]
    lea r2, [r2 + r4 * 2]
    lea r3, [r3 + r4 * 2]
    mova m3, [r0 + FENC_STRIDE + FENC_STRIDE * 2]
    lea r0, [r0 + FENC_STRIDE * 4]
    lea r1, [r1 + r4 * 2]
    lea r2, [r2 + r4 * 2]
    lea r3, [r3 + r4 * 2]

%macro SAD_X4_12x4 0
    mova m4, [r0 + FENC_STRIDE]
    mova m4, [r0 + FENC_STRIDE * 2]
    movu m5, [r1 + r5 * 2]
    movu m5, [r2 + r5 * 2]
    movu m5, [r3 + r5 * 2]
    movu m5, [r4 + r5 * 2]
    lea r1, [r1 + r5 * 2]
    lea r2, [r2 + r5 * 2]
    lea r3, [r3 + r5 * 2]
    lea r4, [r4 + r5 * 2]
    mova m4, [r0 + FENC_STRIDE + FENC_STRIDE * 2]
    lea r0, [r0 + FENC_STRIDE * 4]
    lea r1, [r1 + r5 * 2]
    lea r2, [r2 + r5 * 2]
    lea r3, [r3 + r5 * 2]
    lea r4, [r4 + r5 * 2]

%macro SAD_X3_24x4 0
    mova m3, [r0 + FENC_STRIDE]
    mova m4, [r0 + 16 + FENC_STRIDE]
    movu m6, [r1 + 16 + r4]
    movu m6, [r2 + 16 + r4]
    movu m6, [r3 + 16 + r4]
    mova m3, [r0 + FENC_STRIDE * 2]
    mova m4, [r0 + 16 + FENC_STRIDE * 2]
    movu m5, [r1 + r4 * 2]
    movu m6, [r1 + 16 + r4 * 2]
    movu m5, [r2 + r4 * 2]
    movu m6, [r2 + 16 + r4 * 2]
    movu m5, [r3 + r4 * 2]
    movu m6, [r3 + 16 + r4 * 2]
    lea r0, [r0 + FENC_STRIDE * 2]
    lea r1, [r1 + r4 * 2]
    lea r2, [r2 + r4 * 2]
    lea r3, [r3 + r4 * 2]
    mova m3, [r0 + FENC_STRIDE]
    mova m4, [r0 + 16 + FENC_STRIDE]
    movu m6, [r1 + 16 + r4]
    movu m6, [r2 + 16 + r4]
    movu m6, [r3 + 16 + r4]
    lea r0, [r0 + FENC_STRIDE * 2]
    lea r1, [r1 + r4 * 2]
    lea r2, [r2 + r4 * 2]
    lea r3, [r3 + r4 * 2]

%macro SAD_X4_24x4 0
    mova m4, [r0 + FENC_STRIDE]
    mova m5, [r0 + 16 + FENC_STRIDE]
    movu m7, [r1 + 16 + r5]
    movu m7, [r2 + 16 + r5]
    movu m7, [r3 + 16 + r5]
    movu m7, [r4 + 16 + r5]
    mova m4, [r0 + FENC_STRIDE * 2]
    mova m5, [r0 + 16 + FENC_STRIDE * 2]
    movu m6, [r1 + r5 * 2]
    movu m7, [r1 + 16 + r5 * 2]
    movu m6, [r2 + r5 * 2]
    movu m7, [r2 + 16 + r5 * 2]
    movu m6, [r3 + r5 * 2]
    movu m7, [r3 + 16 + r5 * 2]
    movu m6, [r4 + r5 * 2]
    movu m7, [r4 + 16 + r5 * 2]
    lea r0, [r0 + FENC_STRIDE * 2]
    lea r1, [r1 + r5 * 2]
    lea r2, [r2 + r5 * 2]
    lea r3, [r3 + r5 * 2]
    lea r4, [r4 + r5 * 2]
    mova m4, [r0 + FENC_STRIDE]
    mova m5, [r0 + 16 + FENC_STRIDE]
    movu m7, [r1 + 16 + r5]
    movu m7, [r2 + 16 + r5]
    movu m7, [r3 + 16 + r5]
    movu m7, [r4 + 16 + r5]
    lea r0, [r0 + FENC_STRIDE * 2]
    lea r1, [r1 + r5 * 2]
    lea r2, [r2 + r5 * 2]
    lea r3, [r3 + r5 * 2]
    lea r4, [r4 + r5 * 2]

%macro SAD_X3_32x4 0
    lea r0, [r0 + FENC_STRIDE]
    lea r0, [r0 + FENC_STRIDE]
    lea r0, [r0 + FENC_STRIDE]
    lea r0, [r0 + FENC_STRIDE]

%macro SAD_X4_32x4 0
    lea r0, [r0 + FENC_STRIDE]
    lea r0, [r0 + FENC_STRIDE]
    lea r0, [r0 + FENC_STRIDE]
    lea r0, [r0 + FENC_STRIDE]

%macro SAD_X3_48x4 0
    mova m3, [r0 + FENC_STRIDE]
    mova m4, [r0 + 16 + FENC_STRIDE]
    mova m5, [r0 + 32 + FENC_STRIDE]
    movu m6, [r1 + 16 + r4]
    movu m6, [r1 + 32 + r4]
    movu m6, [r2 + 16 + r4]
    movu m6, [r2 + 32 + r4]
    movu m6, [r3 + 16 + r4]
    movu m6, [r3 + 32 + r4]
    mova m3, [r0 + FENC_STRIDE * 2]
    mova m4, [r0 + 16 + FENC_STRIDE * 2]
    mova m5, [r0 + 32 + FENC_STRIDE * 2]
    movu m6, [r1 + r4 * 2]
    movu m6, [r1 + 16 + r4 * 2]
    movu m6, [r1 + 32 + r4 * 2]
    movu m6, [r2 + r4 * 2]
    movu m6, [r2 + 16 + r4 * 2]
    movu m6, [r2 + 32 + r4 * 2]
    movu m6, [r3 + r4 * 2]
    movu m6, [r3 + 16 + r4 * 2]
    movu m6, [r3 + 32 + r4 * 2]
    lea r0, [r0 + FENC_STRIDE * 2]
    lea r1, [r1 + r4 * 2]
    lea r2, [r2 + r4 * 2]
    lea r3, [r3 + r4 * 2]
    mova m3, [r0 + FENC_STRIDE]
    mova m4, [r0 + 16 + FENC_STRIDE]
    mova m5, [r0 + 32 + FENC_STRIDE]
    movu m6, [r1 + 16 + r4]
    movu m6, [r1 + 32 + r4]
    movu m6, [r2 + 16 + r4]
    movu m6, [r2 + 32 + r4]
    movu m6, [r3 + 16 + r4]
    movu m6, [r3 + 32 + r4]
    lea r0, [r0 + FENC_STRIDE * 2]
    lea r1, [r1 + r4 * 2]
    lea r2, [r2 + r4 * 2]
    lea r3, [r3 + r4 * 2]

%macro SAD_X4_48x4 0
    mova m4, [r0 + FENC_STRIDE]
    mova m5, [r0 + 16 + FENC_STRIDE]
    mova m6, [r0 + 32 + FENC_STRIDE]
    movu m7, [r1 + 16 + r5]
    movu m7, [r1 + 32 + r5]
    movu m7, [r2 + 16 + r5]
    movu m7, [r2 + 32 + r5]
    movu m7, [r3 + 16 + r5]
    movu m7, [r3 + 32 + r5]
    movu m7, [r4 + 16 + r5]
    movu m7, [r4 + 32 + r5]
    mova m4, [r0 + FENC_STRIDE * 2]
    mova m5, [r0 + 16 + FENC_STRIDE * 2]
    mova m6, [r0 + 32 + FENC_STRIDE * 2]
    movu m7, [r1 + r5 * 2]
    movu m7, [r1 + 16 + r5 * 2]
    movu m7, [r1 + 32 + r5 * 2]
    movu m7, [r2 + r5 * 2]
    movu m7, [r2 + 16 + r5 * 2]
    movu m7, [r2 + 32 + r5 * 2]
    movu m7, [r3 + r5 * 2]
    movu m7, [r3 + 16 + r5 * 2]
    movu m7, [r3 + 32 + r5 * 2]
    movu m7, [r4 + r5 * 2]
    movu m7, [r4 + 16 + r5 * 2]
    movu m7, [r4 + 32 + r5 * 2]
    lea r0, [r0 + FENC_STRIDE * 2]
    lea r1, [r1 + r5 * 2]
    lea r2, [r2 + r5 * 2]
    lea r3, [r3 + r5 * 2]
    lea r4, [r4 + r5 * 2]
    mova m4, [r0 + FENC_STRIDE]
    mova m5, [r0 + 16 + FENC_STRIDE]
    mova m6, [r0 + 32 + FENC_STRIDE]
    movu m7, [r1 + 16 + r5]
    movu m7, [r1 + 32 + r5]
    movu m7, [r2 + 16 + r5]
    movu m7, [r2 + 32 + r5]
    movu m7, [r3 + 16 + r5]
    movu m7, [r3 + 32 + r5]
    movu m7, [r4 + 16 + r5]
    movu m7, [r4 + 32 + r5]
    lea r0, [r0 + FENC_STRIDE * 2]
    lea r1, [r1 + r5 * 2]
    lea r2, [r2 + r5 * 2]
    lea r3, [r3 + r5 * 2]
    lea r4, [r4 + r5 * 2]

%macro SAD_X3_64x4 0
    mova m3, [r0 + FENC_STRIDE]
    mova m4, [r0 + 16 + FENC_STRIDE]
    movu m5, [r1 + 16 + r4]
    movu m5, [r2 + 16 + r4]
    movu m5, [r3 + 16 + r4]
    mova m3, [r0 + 32 + FENC_STRIDE]
    mova m4, [r0 + 48 + FENC_STRIDE]
    movu m5, [r1 + 32 + r4]
    movu m5, [r1 + 48 + r4]
    movu m5, [r2 + 32 + r4]
    movu m5, [r2 + 48 + r4]
    movu m5, [r3 + 32 + r4]
    movu m5, [r3 + 48 + r4]
    mova m3, [r0 + FENC_STRIDE * 2]
    mova m4, [r0 + 16 + FENC_STRIDE * 2]
    movu m5, [r1 + r4 * 2]
    movu m5, [r1 + 16 + r4 * 2]
    movu m5, [r2 + r4 * 2]
    movu m5, [r2 + 16 + r4 * 2]
    movu m5, [r3 + r4 * 2]
    movu m5, [r3 + 16 + r4 * 2]
    mova m3, [r0 + 32 + FENC_STRIDE * 2]
    mova m4, [r0 + 48 + FENC_STRIDE * 2]
    movu m5, [r1 + 32 + r4 * 2]
    movu m5, [r1 + 48 + r4 * 2]
    movu m5, [r2 + 32 + r4 * 2]
    movu m5, [r2 + 48 + r4 * 2]
    movu m5, [r3 + 32 + r4 * 2]
    movu m5, [r3 + 48 + r4 * 2]
    lea r0, [r0 + FENC_STRIDE * 2]
    lea r1, [r1 + r4 * 2]
    lea r2, [r2 + r4 * 2]
    lea r3, [r3 + r4 * 2]
    mova m3, [r0 + FENC_STRIDE]
    mova m4, [r0 + 16 + FENC_STRIDE]
    movu m5, [r1 + 16 + r4]
    movu m5, [r2 + 16 + r4]
    movu m5, [r3 + 16 + r4]
    mova m3, [r0 + 32 + FENC_STRIDE]
    mova m4, [r0 + 48 + FENC_STRIDE]
    movu m5, [r1 + 32 + r4]
    movu m5, [r1 + 48 + r4]
    movu m5, [r2 + 32 + r4]
    movu m5, [r2 + 48 + r4]
    movu m5, [r3 + 32 + r4]
    movu m5, [r3 + 48 + r4]
    lea r0, [r0 + FENC_STRIDE * 2]
    lea r1, [r1 + r4 * 2]
    lea r2, [r2 + r4 * 2]
    lea r3, [r3 + r4 * 2]

%macro SAD_X4_64x4 0
    mova m4, [r0 + FENC_STRIDE]
    mova m5, [r0 + 16 + FENC_STRIDE]
    movu m6, [r1 + 16 + r5]
    movu m6, [r2 + 16 + r5]
    movu m6, [r3 + 16 + r5]
    movu m6, [r4 + 16 + r5]
    mova m4, [r0 + 32 + FENC_STRIDE]
    mova m5, [r0 + 48 + FENC_STRIDE]
    movu m6, [r1 + 32 + r5]
    movu m6, [r1 + 48 + r5]
    movu m6, [r2 + 32 + r5]
    movu m6, [r2 + 48 + r5]
    movu m6, [r3 + 32 + r5]
    movu m6, [r3 + 48 + r5]
    movu m6, [r4 + 32 + r5]
    movu m6, [r4 + 48 + r5]
    mova m4, [r0 + FENC_STRIDE * 2]
    mova m5, [r0 + 16 + FENC_STRIDE * 2]
    movu m6, [r1 + r5 * 2]
    movu m6, [r1 + 16 + r5 * 2]
    movu m6, [r2 + r5 * 2]
    movu m6, [r2 + 16 + r5 * 2]
    movu m6, [r3 + r5 * 2]
    movu m6, [r3 + 16 + r5 * 2]
    movu m6, [r4 + r5 * 2]
    movu m6, [r4 + 16 + r5 * 2]
    mova m4, [r0 + 32 + FENC_STRIDE * 2]
    mova m5, [r0 + 48 + FENC_STRIDE * 2]
    movu m6, [r1 + 32 + r5 * 2]
    movu m6, [r1 + 48 + r5 * 2]
    movu m6, [r2 + 32 + r5 * 2]
    movu m6, [r2 + 48 + r5 * 2]
    movu m6, [r3 + 32 + r5 * 2]
    movu m6, [r3 + 48 + r5 * 2]
    movu m6, [r4 + 32 + r5 * 2]
    movu m6, [r4 + 48 + r5 * 2]
    lea r0, [r0 + FENC_STRIDE * 2]
    lea r1, [r1 + r5 * 2]
    lea r2, [r2 + r5 * 2]
    lea r3, [r3 + r5 * 2]
    lea r4, [r4 + r5 * 2]
    mova m4, [r0 + FENC_STRIDE]
    mova m5, [r0 + 16 + FENC_STRIDE]
    movu m6, [r1 + 16 + r5]
    movu m6, [r2 + 16 + r5]
    movu m6, [r3 + 16 + r5]
    movu m6, [r4 + 16 + r5]
    mova m4, [r0 + 32 + FENC_STRIDE]
    mova m5, [r0 + 48 + FENC_STRIDE]
    movu m6, [r1 + 32 + r5]
    movu m6, [r1 + 48 + r5]
    movu m6, [r2 + 32 + r5]
    movu m6, [r2 + 48 + r5]
    movu m6, [r3 + 32 + r5]
    movu m6, [r3 + 48 + r5]
    movu m6, [r4 + 32 + r5]
    movu m6, [r4 + 48 + r5]
    lea r0, [r0 + FENC_STRIDE * 2]
    lea r1, [r1 + r5 * 2]
    lea r2, [r2 + r5 * 2]
    lea r3, [r3 + r5 * 2]
    lea r4, [r4 + r5 * 2]

;-----------------------------------------------------------------------------
; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                          uint8_t *pix2, intptr_t i_stride, int scores[3] )
;-----------------------------------------------------------------------------
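; The x3/x4 variants score several candidate blocks against the same fenc
; block in one pass. A minimal C sketch of the intended semantics (for
; illustration only; the pix[] array notation is generic, not from this file):
;     for( int i = 0; i < 3; i++ )   /* or 4 for the x4 variants */
;         scores[i] = sad( fenc, FENC_STRIDE, pix[i], i_stride );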
cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2

;=============================================================================
; SAD x3/x4 XMM
;=============================================================================

%macro SAD_X3_START_1x16P_SSE2 0

%macro SAD_X3_1x16P_SSE2 2
    psadbw m4, m3, [r1+%2]
    psadbw m5, m3, [r2+%2]

%macro SAD_X3_4x16P_SSE2 2
    SAD_X3_START_1x16P_SSE2
    SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0
    SAD_X3_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r4*1
    SAD_X3_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2
    SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), t0
    add r0, 8*FENC_STRIDE

%macro SAD_X3_START_2x8P_SSE2 0
    movhps m3, [r0+FENC_STRIDE]

%macro SAD_X3_2x8P_SSE2 4

%macro SAD_X4_START_2x8P_SSE2 0
    movhps m4, [r0+FENC_STRIDE]

%macro SAD_X4_2x8P_SSE2 4

%macro SAD_X4_START_1x16P_SSE2 0

%macro SAD_X4_1x16P_SSE2 2
    psadbw m4, m6, [r1+%2]
    psadbw m5, m6, [r2+%2]

    psadbw m4, m6, [r3+%2]
    psadbw m5, m6, [r4+%2]

%macro SAD_X4_4x16P_SSE2 2
    SAD_X4_START_1x16P_SSE2
    SAD_X4_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0
    SAD_X4_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r5*1
    SAD_X4_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2
    SAD_X4_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r6
    add r0, 8*FENC_STRIDE

%macro SAD_X3_4x8P_SSE2 2
    SAD_X3_START_2x8P_SSE2
    SAD_X3_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0, FENC_STRIDE*(1+(%1&1)*4), r4*1
    SAD_X3_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2, FENC_STRIDE*(3+(%1&1)*4), t0
    add r0, 8*FENC_STRIDE

%macro SAD_X4_4x8P_SSE2 2
    SAD_X4_START_2x8P_SSE2
    SAD_X4_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
    SAD_X4_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
    add r0, 8*FENC_STRIDE

%macro SAD_X3_END_SSE2 1

%macro SAD_X4_END_SSE2 1

%macro SAD_X3_START_2x16P_AVX2 0
    movu m3, [r0] ; assumes FENC_STRIDE == 16
    vinserti128 m0, m0, [r1+r4], 1
    vinserti128 m1, m1, [r2+r4], 1
    vinserti128 m2, m2, [r3+r4], 1

%macro SAD_X3_2x16P_AVX2 3
    movu m3, [r0+%1] ; assumes FENC_STRIDE == 16
    vinserti128 m4, m4, [r1+%3], 1
    vinserti128 m5, m5, [r2+%3], 1
    vinserti128 m6, m6, [r3+%3], 1

%macro SAD_X3_4x16P_AVX2 2
    SAD_X3_START_2x16P_AVX2
    SAD_X3_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r4*0, r4*1
    SAD_X3_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r4*2, t0
    add r0, 8*FENC_STRIDE

%macro SAD_X4_START_2x16P_AVX2 0
    vbroadcasti128 m4, [r0]
    vbroadcasti128 m5, [r0+FENC_STRIDE]
    vinserti128 m0, m0, [r3], 1
    vinserti128 m1, m1, [r4], 1
    vinserti128 m2, m2, [r3+r5], 1
    vinserti128 m3, m3, [r4+r5], 1

%macro SAD_X4_2x16P_AVX2 4
    vbroadcasti128 m6, [r0+%1]
    vbroadcasti128 m7, [r0+%3]
    vinserti128 m2, m2, [r3+%2], 1
    vinserti128 m3, m3, [r4+%2], 1
    vinserti128 m4, m4, [r3+%4], 1
    vinserti128 m5, m5, [r4+%4], 1

%macro SAD_X4_4x16P_AVX2 2
    SAD_X4_START_2x16P_AVX2
    SAD_X4_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
    SAD_X4_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
    add r0, 8*FENC_STRIDE

%macro SAD_X3_END_AVX2 0
    packssdw m0, m1 ; 0 0 1 1 0 0 1 1
    packssdw m2, m2 ; 2 2 _ _ 2 2 _ _
    phaddd m0, m2   ; 0 1 2 _ 0 1 2 _
    vextracti128 xm1, m0, 1
    paddd xm0, xm1  ; 0 1 2 _

%macro SAD_X4_END_AVX2 0
    vextracti128 xm2, m0, 1
    vextracti128 xm3, m1, 1
    phaddd xm0, xm2 ; 0 1 2 3

;-----------------------------------------------------------------------------
; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                          uint8_t *pix2, intptr_t i_stride, int scores[3] )
;-----------------------------------------------------------------------------
cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
    SAD_X%1_4x%2P_SSE2 x, %3/4

cglobal pixel_sad_x3_12x16, 5, 7, 8

cglobal pixel_sad_x4_12x16, 6, 8, 8

cglobal pixel_sad_x3_24x32, 5, 7, 8

%if ARCH_X86_64 == 1
cglobal pixel_sad_x4_24x32, 6, 8, 8
%else
cglobal pixel_sad_x4_24x32, 6, 7, 8, 0-4
    %define count dword [rsp]
%endif

cglobal pixel_sad_x3_32x8, 5, 6, 8

cglobal pixel_sad_x3_32x16, 5, 6, 8

cglobal pixel_sad_x3_32x24, 5, 6, 8

cglobal pixel_sad_x3_32x32, 5, 7, 8

cglobal pixel_sad_x3_32x64, 5, 7, 8

cglobal pixel_sad_x4_32x8, 6, 7, 8

cglobal pixel_sad_x4_32x16, 6, 7, 8

cglobal pixel_sad_x4_32x24, 6, 7, 8

%if ARCH_X86_64 == 1
cglobal pixel_sad_x4_32x32, 6, 8, 8
%else
cglobal pixel_sad_x4_32x32, 6, 7, 8, 0-4
    %define count dword [rsp]
%endif

%if ARCH_X86_64 == 1
cglobal pixel_sad_x4_32x64, 6, 8, 8
%else
cglobal pixel_sad_x4_32x64, 6, 7, 8, 0-4
    %define count dword [rsp]
%endif

cglobal pixel_sad_x3_48x64, 5, 7, 8

%if ARCH_X86_64 == 1
cglobal pixel_sad_x4_48x64, 6, 8, 8
%else
cglobal pixel_sad_x4_48x64, 6, 7, 8, 0-4
    %define count dword [rsp]
%endif

cglobal pixel_sad_x3_64x16, 5, 7, 7

cglobal pixel_sad_x3_64x32, 5, 7, 7

cglobal pixel_sad_x3_64x48, 5, 7, 7

cglobal pixel_sad_x3_64x64, 5, 7, 7

%if ARCH_X86_64 == 1
cglobal pixel_sad_x4_64x16, 6, 8, 8
%else
cglobal pixel_sad_x4_64x16, 6, 7, 8, 0-4
    %define count dword [rsp]
%endif

%if ARCH_X86_64 == 1
cglobal pixel_sad_x4_64x32, 6, 8, 8
%else
cglobal pixel_sad_x4_64x32, 6, 7, 8, 0-4
    %define count dword [rsp]
%endif

%if ARCH_X86_64 == 1
cglobal pixel_sad_x4_64x48, 6, 8, 8
%else
cglobal pixel_sad_x4_64x48, 6, 7, 8, 0-4
    %define count dword [rsp]
%endif

%if ARCH_X86_64 == 1
cglobal pixel_sad_x4_64x64, 6, 8, 8
%else
cglobal pixel_sad_x4_64x64, 6, 7, 8, 0-4
    %define count dword [rsp]
%endif

SAD_X_SSE2 3, 16, 16, 7
SAD_X_SSE2 3, 16,  8, 7
SAD_X_SSE2 3,  8, 16, 7
SAD_X_SSE2 3,  8,  8, 7
SAD_X_SSE2 3,  8,  4, 7
SAD_X_SSE2 4, 16, 16, 7
SAD_X_SSE2 4, 16,  8, 7
SAD_X_SSE2 4,  8, 16, 7
SAD_X_SSE2 4,  8,  8, 7
SAD_X_SSE2 4,  8,  4, 7

SAD_X_SSE2 3, 16, 16, 7
SAD_X_SSE2 3, 16,  8, 7
SAD_X_SSE2 3, 16,  4, 7
SAD_X_SSE2 4, 16, 16, 7
SAD_X_SSE2 4, 16,  8, 7
SAD_X_SSE2 4, 16,  4, 7

SAD_X_SSE2 3, 16, 64, 7
SAD_X_SSE2 3, 16, 32, 7
SAD_X_SSE2 3, 16, 16, 7
SAD_X_SSE2 3, 16, 12, 7
SAD_X_SSE2 3, 16,  8, 7
SAD_X_SSE2 3,  8, 32, 7
SAD_X_SSE2 3,  8, 16, 7

SAD_X_SSE2 4, 16, 64, 7
SAD_X_SSE2 4, 16, 32, 7
SAD_X_SSE2 4, 16, 16, 7
SAD_X_SSE2 4, 16, 12, 7
SAD_X_SSE2 4, 16,  8, 7
SAD_X_SSE2 4,  8, 32, 7
SAD_X_SSE2 4,  8, 16, 7
SAD_X_SSE2 4,  8,  8, 7
SAD_X_SSE2 4,  8,  4, 7

SAD_X_SSE2 3, 16, 64, 7
SAD_X_SSE2 3, 16, 32, 6
SAD_X_SSE2 3, 16, 16, 6
SAD_X_SSE2 3, 16, 12, 6
SAD_X_SSE2 3, 16,  8, 6
SAD_X_SSE2 3, 16,  4, 6

SAD_X_SSE2 4, 16, 64, 7
SAD_X_SSE2 4, 16, 32, 7
SAD_X_SSE2 4, 16, 16, 7
SAD_X_SSE2 4, 16, 12, 7
SAD_X_SSE2 4, 16,  8, 7
SAD_X_SSE2 4, 16,  4, 7

cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
    SAD_X%1_4x%2P_AVX2 x, %3/4

SAD_X_AVX2 3, 16, 32, 7
SAD_X_AVX2 3, 16, 16, 7
SAD_X_AVX2 3, 16, 12, 7
SAD_X_AVX2 3, 16,  8, 7
SAD_X_AVX2 4, 16, 32, 8
SAD_X_AVX2 4, 16, 16, 8
SAD_X_AVX2 4, 16, 12, 8
SAD_X_AVX2 4, 16,  8, 8

;=============================================================================
; SAD cacheline split
;=============================================================================

; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
; unless the unaligned data spans the border between 2 cachelines, in which
; case it's really slow. The exact numbers may differ, but all Intel cpus prior
; to Nehalem have a large penalty for cacheline splits.
; (8-byte alignment exactly half way between two cachelines is ok though.)
; LDDQU was supposed to fix this, but it only works on Pentium 4.
; So in the split case we load aligned data and explicitly perform the
; alignment between registers. Like on archs that have only aligned loads,
; except complicated by the fact that PALIGNR takes only an immediate, not
; a variable alignment.
; It is also possible to hoist the realignment to the macroblock level (keep
; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
; needed for that method makes it often slower.

; sad 16x16 costs on Core2:
; good offsets: 49 cycles (50/64 of all mvs)
; cacheline split: 234 cycles (14/64 of all mvs. amortized: +40 cycles)
; page split: 3600 cycles (14/4096 of all mvs. amortized: +11.5 cycles)
; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
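; (Worked instance of the amortization above: a cacheline split costs
; 234-49 = 185 extra cycles on 14/64 of mvs, i.e. 185*14/64 ~= 40 extra
; cycles per average sad; with palignr it is 57-49 = 8 extra cycles,
; i.e. 8*14/64 ~= 2.)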

; computed jump assumes this loop is exactly 80 bytes
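; (each nonzero alignment gets its own copy of the loop, laid out back to
; back, so SAD16_CACHELINE_FUNC below can enter at base + align*80 here,
; or base + align*64 for the 64-byte ssse3 variant)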
%macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
sad_w16_align%1_sse2:
    movdqa xmm1, [r2+16]
    movdqa xmm2, [r2+r3+16]
    movdqa xmm4, [r2+r3]
    psadbw xmm2, [r0+r1]
    jg sad_w16_align%1_sse2

; computed jump assumes this loop is exactly 64 bytes
%macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
sad_w16_align%1_ssse3:
    movdqa xmm1, [r2+16]
    movdqa xmm2, [r2+r3+16]
    palignr xmm1, [r2], %1
    palignr xmm2, [r2+r3], %1
    psadbw xmm2, [r0+r1]
    jg sad_w16_align%1_ssse3

%macro SAD16_CACHELINE_FUNC 2 ; cpu, height
cglobal pixel_sad_16x%2_cache64_%1
    jle pixel_sad_16x%2_sse2
    shl r4d, 6 ; code size = 64
    shl r4d, 4 ; code size = 80
%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
    lea r5, [sad_w16_addr]
    lea r5, [sad_w16_addr + r4]

%macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
    and eax, 0x17|%1|(%4>>1)
    cmp eax, 0x10|%1|(%4>>1)
    jle pixel_sad_%1x%2_mmx2

%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
cglobal pixel_sad_16x%1_cache%2_mmx2
    SAD_CACHELINE_START_MMX2 16, %1, %1, %2

%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
cglobal pixel_sad_8x%1_cache%2_mmx2
    SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2

; sad_x3/x4_cache64: check each mv.
; if they're all within a cacheline, use normal sad_x3/x4.
; otherwise, send them individually to sad_cache64.
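; For example, with width 16 and cacheline 64, CHECK_SPLIT below tests
; (addr & (0x17|16|32)) > (0x10|16|32), i.e. (addr & 0x37) > 0x30, which
; flags addresses whose 16-byte load would straddle a 64-byte cacheline.
; Bit 3 is deliberately absent from the mask, so 8-byte-aligned addresses
; always pass (the "half way between two cachelines" exception noted above).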
%macro CHECK_SPLIT 3 ; pix, width, cacheline
    and eax, 0x17|%2|(%3>>1)
    cmp eax, 0x10|%2|(%3>>1)

%macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
cglobal pixel_sad_x3_%1x%2_cache%3_%6
    CHECK_SPLIT r1m, %1, %3
    CHECK_SPLIT r2m, %1, %3
    CHECK_SPLIT r3m, %1, %3
    jmp pixel_sad_x3_%1x%2_%4

    sub rsp, 40 ; shadow space and alignment
    call pixel_sad_%1x%2_cache%3_%5
    mov r2, [rsp+40+0*8]
    call pixel_sad_%1x%2_cache%3_%5
    mov r2, [rsp+40+1*8]
    call pixel_sad_%1x%2_cache%3_%5

    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5

%macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
cglobal pixel_sad_x4_%1x%2_cache%3_%6
    CHECK_SPLIT r1m, %1, %3
    CHECK_SPLIT r2m, %1, %3
    CHECK_SPLIT r3m, %1, %3
    CHECK_SPLIT r4m, %1, %3
    jmp pixel_sad_x4_%1x%2_%4

    sub rsp, 32 ; shadow space
    call pixel_sad_%1x%2_cache%3_%5
    mov r2, [rsp+32+0*8]
    call pixel_sad_%1x%2_cache%3_%5
    mov r2, [rsp+32+1*8]
    call pixel_sad_%1x%2_cache%3_%5
    mov r2, [rsp+32+2*8]
    call pixel_sad_%1x%2_cache%3_%5

    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5
    call pixel_sad_%1x%2_cache%3_%5

%macro SADX34_CACHELINE_FUNC 1+
    SADX3_CACHELINE_FUNC %1
    SADX4_CACHELINE_FUNC %1

; instantiate the aligned sads

%if ARCH_X86_64 == 0
SAD16_CACHELINE_FUNC_MMX2  8, 32
SAD16_CACHELINE_FUNC_MMX2 16, 32
SAD8_CACHELINE_FUNC_MMX2   4, 32
SAD8_CACHELINE_FUNC_MMX2   8, 32
SAD8_CACHELINE_FUNC_MMX2  16, 32
SAD16_CACHELINE_FUNC_MMX2  8, 64
SAD16_CACHELINE_FUNC_MMX2 16, 64
%endif ; !ARCH_X86_64
SAD8_CACHELINE_FUNC_MMX2   4, 64
SAD8_CACHELINE_FUNC_MMX2   8, 64
SAD8_CACHELINE_FUNC_MMX2  16, 64

%if ARCH_X86_64 == 0
SADX34_CACHELINE_FUNC 16, 16, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 16,  8, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC  8, 16, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC  8,  8, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 16, 16, 64, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 16,  8, 64, mmx2, mmx2, mmx2
%endif ; !ARCH_X86_64
SADX34_CACHELINE_FUNC  8, 16, 64, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC  8,  8, 64, mmx2, mmx2, mmx2

%if ARCH_X86_64 == 0
SAD16_CACHELINE_FUNC sse2, 8
SAD16_CACHELINE_FUNC sse2, 16
SAD16_CACHELINE_LOOP_SSE2 i
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2, sse2
SADX34_CACHELINE_FUNC 16,  8, 64, sse2, sse2, sse2
%endif ; !ARCH_X86_64
SADX34_CACHELINE_FUNC  8, 16, 64, sse2, mmx2, sse2

SAD16_CACHELINE_FUNC ssse3, 8
SAD16_CACHELINE_FUNC ssse3, 16
SAD16_CACHELINE_LOOP_SSSE3 i
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3
SADX34_CACHELINE_FUNC 16,  8, 64, sse2, ssse3, ssse3