1 ;*****************************************************************************
2 ;* sad16-a.asm: x86 high depth sad functions
3 ;*****************************************************************************
4 ;* Copyright (C) 2010-2013 x264 project
6 ;* Authors: Oskar Arvidsson <oskar@irock.se>
7 ;* Henrik Gramner <henrik@gramner.com>
8 ;* Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
10 ;* This program is free software; you can redistribute it and/or modify
11 ;* it under the terms of the GNU General Public License as published by
12 ;* the Free Software Foundation; either version 2 of the License, or
13 ;* (at your option) any later version.
15 ;* This program is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;* GNU General Public License for more details.
20 ;* You should have received a copy of the GNU General Public License
21 ;* along with this program; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 ;* This program is also available under a commercial proprietary license.
25 ;* For more information, contact us at license @ x265.com.
26 ;*****************************************************************************
29 %include "x86util.asm"
35 ;=============================================================================
37 ;=============================================================================
39 %macro SAD_INC_1x16P_MMX 0
48 ABSW2 m1, m2, m1, m2, m5, m6
49 ABSW2 m3, m4, m3, m4, m7, m5
58 %macro SAD_INC_2x8P_MMX 0
67 ABSW2 m1, m2, m1, m2, m5, m6
68 ABSW2 m3, m4, m3, m4, m7, m5
77 %macro SAD_INC_2x4P_MMX 0
82 ABSW2 m1, m2, m1, m2, m3, m4
89 ;-----------------------------------------------------------------------------
90 ; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
91 ;-----------------------------------------------------------------------------
; NOTE(review): the cglobal operands are assumed to follow the x86inc
; convention (argument count, then general-purpose register count) -- confirm
; against x86inc.asm.
; NOTE(review): NASM gives '/' higher precedence than '&', so "%2&4/4" is
; evaluated as "%2&(4/4)" == "%2&1", not "(%2&4)/4". For every even %2 the
; register-count expression below is therefore always 5. If one fewer register
; was intended for some heights, check that the function body (not visible in
; this excerpt) never touches the fifth register for every height with bit 2
; set before adding parentheses.
93 cglobal pixel_sad_%1x%2, 4,5-(%2&4/4)
127 ;=============================================================================
129 ;=============================================================================
140 ABSW2 m1, m2, m1, m2, m5, m6
145 ABSW2 m3, m4, m3, m4, m7, m5
161 ABSW2 m1, m2, m1, m2, m4, m6
184 ABSW2 m1, m2, m1, m2, m5, m6
187 ABSW2 m3, m4, m3, m4, m7, m5
198 ABSW2 m1, m2, m1, m2, m3, m4
216 ABSW2 m1, m2, m1, m2, m5, m6
219 ABSW2 m3, m4, m3, m4, m7, m5
234 ABSW2 m1, m2, m1, m2, m5, m6
237 ABSW2 m3, m4, m3, m4, m7, m5
254 ABSW2 m1, m2, m1, m2, m4, m6
; SAD_INC_2ROW -- takes one macro parameter.
; Only part of the macro body is visible in this excerpt (the loads feeding
; m1/m2, any accumulation, and the closing %endmacro are not shown), so the
; notes below describe the visible lines only.
; NOTE(review): ABSW2 is provided by x86util.asm; presumably it computes the
; packed-word absolute value of two registers using the last two operands as
; scratch -- confirm against its definition.
263 %macro SAD_INC_2ROW 1
267 movu m3, [r2+2*r3+ 0] ; unaligned load from r2 + 2*r3, byte offset 0
268 movu m4, [r2+2*r3+16] ; unaligned load from r2 + 2*r3, byte offset 16
271 psubw m3, [r0+2*r1+ 0] ; packed 16-bit subtract of the data at r0 + 2*r1
272 psubw m4, [r0+2*r1+16] ; same, byte offset 16
273 ABSW2 m1, m2, m1, m2, m5, m6 ; m1/m2 are set up on lines not visible here
276 ABSW2 m3, m4, m3, m4, m7, m5 ; applied to the two differences computed above
287 ABSW2 m1, m2, m1, m2, m3, m4
296 ;-----------------------------------------------------------------------------
297 ; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
298 ;-----------------------------------------------------------------------------
; NOTE(review): the cglobal operands are assumed to follow the x86inc
; convention (argument count, general-purpose register count, vector register
; count) -- confirm against x86inc.asm.
; NOTE(review): NASM gives '/' higher precedence than '&', so "%2&4/4" is
; evaluated as "%2&(4/4)" == "%2&1", not "(%2&4)/4". For every even %2 the
; second operand is therefore always 5. If one fewer register was intended for
; some heights, check that the function body (not visible in this excerpt)
; never touches the fifth register for every height with bit 2 set before
; adding parentheses.
; The third operand, 8*(%1/mmsize), uses integer division: it is 0 when
; %1 < mmsize and 8 when %1 == mmsize.
300 cglobal pixel_sad_%1x%2, 4,5-(%2&4/4),8*(%1/mmsize)
332 ;------------------------------------------------------------------
333 ; int pixel_sad_32xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
334 ;------------------------------------------------------------------
336 cglobal pixel_sad_%1x%2, 4,5,8
359 ;------------------------------------------------------------------
360 ; int pixel_sad_64xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
361 ;------------------------------------------------------------------
363 cglobal pixel_sad_%1x%2, 4,5,8
385 ;------------------------------------------------------------------
386 ; int pixel_sad_48xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
387 ;------------------------------------------------------------------
389 cglobal pixel_sad_%1x%2, 4,5,8
408 ;------------------------------------------------------------------
409 ; int pixel_sad_24xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
410 ;------------------------------------------------------------------
412 cglobal pixel_sad_%1x%2, 4,5,8
431 ;------------------------------------------------------------------
432 ; int pixel_sad_12xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
433 ;------------------------------------------------------------------
435 cglobal pixel_sad_%1x%2, 4,5,8
455 ;=============================================================================
457 ;=============================================================================
459 %macro SAD_X3_INC_P 0
460 add r0, 4*FENC_STRIDE
466 %macro SAD_X3_ONE_START 0
474 ABSW2 m0, m1, m0, m1, m4, m5
489 ABSW2 m3, m4, m3, m4, m7, m6
500 %if mmsize == 8 && %1*%2 == 256
522 %macro SAD_X4_INC_P 0
523 add r0, 4*FENC_STRIDE
530 %macro SAD_X4_ONE_START 0
540 ABSW2 m0, m1, m0, m1, m5, m6
541 ABSW2 m2, m3, m2, m3, m4, m7
559 ABSW2 m5, m6, m5, m6, m9, m10
560 ABSW2 m7, m8, m7, m8, m9, m10
588 %else ; num_mmregs == 8 && !ssse3
601 ABSW2 m5, m6, m5, m6, m7, m4
610 %if mmsize == 8 && %1*%2 == 256
632 SAD_X%1_ONE x*mmsize, x*mmsize
633 SAD_X%1_ONE 2*FENC_STRIDE+x*mmsize, 2*%2+x*mmsize
; pixel_vsad -- only a few lines of the body are visible in this excerpt, so
; the notes below are limited to what those lines show.
639 cglobal pixel_vsad, 3,3,8
643 mova m3, [r0+2*r1+16] ; aligned-form load from r0 + 2*r1, byte offset 16
647 ABSW2 m0, m1, m0, m1, m4, m5
655 mova m7, [r0+2*r1+16] ; same address expression as above; r0 presumably advanced in between -- confirm
; The two reductions below must hold different ranges: 31682 fits in a signed
; 16-bit word (< 32768), whereas 63426 fits only in an unsigned 16-bit word
; (< 65536).
; NOTE(review): HADDW / HADDUW come from x86util.asm; presumably they are the
; signed and unsigned horizontal word sums, selected by a bit-depth condition
; on lines not visible here -- confirm.
675 HADDW m0, m1 ; max sum: 62(pixel diffs)*511(pixel_max)=31682
677 HADDUW m0, m1 ; max sum: 62(pixel diffs)*1023(pixel_max)=63426
690 cglobal pixel_vsad, 3,3
720 ;-----------------------------------------------------------------------------
721 ; void pixel_sad_xN_WxH( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
722 ; uint16_t *pix2, intptr_t i_stride, int scores[3] )
723 ;-----------------------------------------------------------------------------
; NOTE(review): the prototype above lists three candidate pointers and
; scores[3], i.e. the N == 3 form. The symbol below is parameterised on %1, so
; other values of N presumably take a different argument list -- confirm
; against the C declarations.
; The enclosing %macro line and most of the body are not visible in this
; excerpt.
725 cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
727 %xdefine STRIDE r %+ regnum ; STRIDE names register r<regnum>; regnum is assigned on a line not shown here
730 SAD_X%1_ONE 2*FENC_STRIDE, 2*STRIDE ; offsets scaled by 2, matching the uint16_t pixels in the prototype
731 SAD_X_2xNP %1, STRIDE, 1, %2/(mmsize/2)-1 ; %2/(mmsize/2): %2 16-bit pixels divided by pixels per vector register, minus one
734 SAD_X_2xNP %1, STRIDE, 0, %2/(mmsize/2) ; same count without the -1; third argument 0 instead of 1