1 ;*****************************************************************************
2 ;* x86-optimized functions for idet filter
4 ;* Copyright (C) 2014 Pascal Massimino (pascal.massimino@gmail.com)
5 ;* Copyright (c) 2014 Neil Birkbeck (birkbeck@google.com)
7 ;* This file is part of FFmpeg.
9 ;* FFmpeg is free software; you can redistribute it and/or
10 ;* modify it under the terms of the GNU Lesser General Public
11 ;* License as published by the Free Software Foundation; either
12 ;* version 2.1 of the License, or (at your option) any later version.
14 ;* FFmpeg is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 ;* Lesser General Public License for more details.
19 ;* You should have received a copy of the GNU Lesser General Public
20 ;* License along with FFmpeg; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 ;******************************************************************************
24 %include "libavutil/x86/x86util.asm"
28 ; Implementation that processes 8 bytes at a time using single-word operations.
29 %macro IDET_FILTER_LINE 1
31 cglobal idet_filter_line, 4, 5, 0, a, b, c, width, index
39 movu m0, [aq + indexq*1]
40 punpckhbw m1, m0, m_zero
43 movu m3, [cq + indexq*1]
44 punpckhbw m4, m3, m_zero
50 movu m3, [bq + indexq*1]
51 punpckhbw m4, m3, m_zero
62 punpckhwd m1, m0, m_zero
78 IDET_FILTER_LINE mmxext
82 ;******************************************************************************
83 ; 16-bit implementation that processes 4 or 8 pixels at a time
85 %macro PABS_DIFF_WD 3 ; a, b, junk , output=a
96 %macro IDET_FILTER_LINE_16BIT 1 ; %1=increment (4 or 8 words)
97 cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
105 movu m2, [bq + indexq * 2] ; B
106 movu m3, [aq + indexq * 2] ; A
108 psubusw m5, m2, m3 ; ba
110 movu m4, [cq + indexq * 2] ; C
118 PABS_DIFF_WD m3, m6, m7 ; |ab - bc|
119 PABS_DIFF_WD m5, m4, m7 ; |ba - cb|
130 IDET_FILTER_LINE_16BIT 8
133 IDET_FILTER_LINE_16BIT 4
136 ;******************************************************************************
137 ; SSE2 8-bit implementation that processes 16 bytes at a time:
140 cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total
146 movu m2, [bq + indexq*1] ; B
147 movu m3, [aq + indexq*1] ; A
150 psubusb m5, m2, m3 ; ba
152 movu m3, [cq + indexq*1] ; C
160 psadbw m4, m6 ; |ab - bc|
162 psadbw m5, m3 ; |ba - cb|