Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libavfilter / x86 / vf_idet.asm
CommitLineData
2ba45a60
DM
1;*****************************************************************************
2;* x86-optimized functions for idet filter
3;*
4;* Copyright (C) 2014 Pascal Massimino (pascal.massimino@gmail.com)
5;* Copyright (c) 2014 Neil Birkbeck (birkbeck@google.com)
6;*
7;* This file is part of FFmpeg.
8;*
9;* FFmpeg is free software; you can redistribute it and/or
10;* modify it under the terms of the GNU Lesser General Public
11;* License as published by the Free Software Foundation; either
12;* version 2.1 of the License, or (at your option) any later version.
13;*
14;* FFmpeg is distributed in the hope that it will be useful,
15;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17;* Lesser General Public License for more details.
18;*
19;* You should have received a copy of the GNU Lesser General Public
20;* License along with FFmpeg; if not, write to the Free Software
21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22;******************************************************************************
23
24%include "libavutil/x86/x86util.asm"
25
26SECTION_TEXT
27
28; Implementation that does 8-bytes at a time using single-word operations.
;
; IDET_FILTER_LINE %1
;   %1 = instruction-set name forwarded to INIT_MMX.
;
; Emits idet_filter_line(a, b, c, width). For every byte position it forms
; |a[i] + c[i] - 2*b[i]| in 16-bit lanes and returns the 32-bit total of
; those values in eax.
;
; The loop body runs before the first width test and always consumes a full
; 8-byte group, so the rows are read up to the next multiple of 8.
; NOTE(review): presumably the caller guarantees width > 0 and that such
; over-reads are safe - confirm against the C wrapper.
29%macro IDET_FILTER_LINE 1 ; %1 = cpu flavour for INIT_MMX
30INIT_MMX %1
31cglobal idet_filter_line, 4, 5, 0, a, b, c, width, index
32 xor indexq, indexq ; index = 0 (byte offset into the rows)
; m_zero is the zero source for the unpack (widening) steps, m_sum the
; running dword accumulator.
33%define m_zero m2
34%define m_sum m5
35 pxor m_sum, m_sum
36 pxor m_zero, m_zero
37
38.loop:
39 movu m0, [aq + indexq*1] ; 8 bytes of row a
40 punpckhbw m1, m0, m_zero ; m1 = upper 4 bytes of a, widened to words
41 punpcklbw m0, m_zero ; m0 = lower 4 bytes of a, widened to words
42
43 movu m3, [cq + indexq*1] ; 8 bytes of row c
44 punpckhbw m4, m3, m_zero ; m4 = upper half of c as words
45 punpcklbw m3, m_zero ; m3 = lower half of c as words
46
47 paddsw m1, m4 ; a + c (upper half); inputs <= 255, so no saturation
48 paddsw m0, m3 ; a + c (lower half)
49
50 movu m3, [bq + indexq*1] ; 8 bytes of row b
51 punpckhbw m4, m3, m_zero ; m4 = upper half of b as words
52 punpcklbw m3, m_zero ; m3 = lower half of b as words
53
54 paddw m4, m4 ; 2*b (upper half)
55 paddw m3, m3 ; 2*b (lower half)
56 psubsw m1, m4 ; a + c - 2*b (upper half, signed)
57 psubsw m0, m3 ; a + c - 2*b (lower half, signed)
58
; ABS2 is defined in x86util.asm; presumably it replaces m1 and m0 with
; their per-word absolute values using m4/m3 as scratch - verify there.
59 ABS2 m1, m0, m4, m3
60
61 paddw m0, m1 ; fold upper and lower halves together (still words)
62 punpckhwd m1, m0, m_zero ; widen the word sums to dwords ...
63 punpcklwd m0, m_zero
64
65 paddd m0, m1 ; ... and add the two dword halves
66 paddd m_sum, m0 ; accumulate into the running total
67
68 add indexq, 0x8 ; advance by one 8-byte group
69 CMP widthd, indexd
70 jg .loop ; continue while width > index (signed compare)
71
; HADDD is defined in x86util.asm; presumably it reduces the dword lanes of
; m_sum into its low dword with m0 as scratch - verify there.
72 HADDD m_sum, m0
73 movd eax, m_sum ; return value
74 RET
75%endmacro
76
; Expand the 8-bit MMX-register kernel twice, once per instruction-set name
; (mmxext, then mmx). Each expansion issues its own INIT_MMX. Both are
; assembled only when ARCH_X86_32 is non-zero.
77%if ARCH_X86_32
78IDET_FILTER_LINE mmxext
79IDET_FILTER_LINE mmx
80%endif
81
82;******************************************************************************
83; 16bit implementation that does 4/8-pixels at a time
84
; PABS_DIFF_WD a, b, junk
;   Computes |a - b| for unsigned words, widens the result to dwords and
;   adds the upper-half dwords to the lower-half dwords.
;   Output is left in %1; %2 and %3 are overwritten.
;   Relies on m_zero being %define'd by the invoking code as the zero source
;   for the unpacks.
85%macro PABS_DIFF_WD 3 ; a, b, junk , output=a
86 psubusw %3, %2, %1 ; junk = b - a, clamped at 0
87 psubusw %1, %2 ; a = a - b, clamped at 0
88 por %1, %3 ; at most one of the two is non-zero => |a - b|
89
90 mova %2, %1 ; keep a copy so both halves can be widened
91 punpcklwd %1, m_zero ; lower words -> dwords
92 punpckhwd %2, m_zero ; upper words -> dwords
93 paddd %1, %2 ; pairwise sum of the two halves
94%endmacro
95
; IDET_FILTER_LINE_16BIT %1
;   %1 = number of 16-bit elements consumed per loop iteration.
;
; Emits idet_filter_line_16bit(a, b, c, width) for whichever instruction set
; the preceding INIT_* selected (the macro does not call INIT_* itself).
; a, b and c are indexed as rows of 16-bit elements (index * 2 addressing).
;
; With ab = sat(A-B), ba = sat(B-A), bc = sat(B-C), cb = sat(C-B) the loop
; accumulates |ab - bc| + |ba - cb|, which for unsigned inputs equals
; |A + C - 2*B| per element. The 32-bit total is returned in eax.
;
; The body executes before the first width test and always consumes %1
; elements. NOTE(review): presumably the caller guarantees width > 0 and
; that reading up to the next multiple of %1 is safe - confirm.
96%macro IDET_FILTER_LINE_16BIT 1 ; %1=increment (4 or 8 words)
97cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
98 xor indexq, indexq ; index = 0 (counted in 16-bit elements)
; m_zero is also consumed by PABS_DIFF_WD; m_sum is the dword accumulator.
99%define m_zero m1
100%define m_sum m0
101 pxor m_sum, m_sum
102 pxor m_zero, m_zero
103
104.loop_16bit:
105 movu m2, [bq + indexq * 2] ; B
106 movu m3, [aq + indexq * 2] ; A
107 mova m6, m2 ; second copy of B, consumed by the bc subtraction
108 psubusw m5, m2, m3 ; ba = B - A, clamped at 0
109
110 movu m4, [cq + indexq * 2] ; C
111 add indexq, %1 ; advance before the compare below
112 psubusw m3, m2 ; ab = A - B, clamped at 0
; Flags set here are consumed by the jl at the bottom of the loop; the
; MMX/SSE instructions in between do not modify EFLAGS.
113 CMP indexd, widthd
114
115 psubusw m6, m4 ; bc = B - C, clamped at 0
116 psubusw m4, m2 ; cb = C - B, clamped at 0
117
118 PABS_DIFF_WD m3, m6, m7 ; |ab - bc| as dwords in m3 (m6, m7 clobbered)
119 PABS_DIFF_WD m5, m4, m7 ; |ba - cb| as dwords in m5 (m4, m7 clobbered)
120 paddd m_sum, m3
121 paddd m_sum, m5
122 jl .loop_16bit ; continue while index < width (signed compare)
123
; HADDD is defined in x86util.asm; presumably it reduces the dword lanes of
; m_sum into its low dword with m2 as scratch - verify there.
124 HADDD m_sum, m2
125 movd eax, m_sum ; return value
126 RET
127%endmacro
128
; IDET_FILTER_LINE_16BIT does not select an instruction set itself, so an
; INIT_* line precedes each expansion. The macro argument is the number of
; 16-bit elements handled per iteration. The MMX variant is assembled only
; when ARCH_X86_32 is non-zero.
129INIT_XMM sse2
130IDET_FILTER_LINE_16BIT 8 ; 8 words per iteration
131%if ARCH_X86_32
132INIT_MMX mmx
133IDET_FILTER_LINE_16BIT 4 ; 4 words per iteration
134%endif
135
136;******************************************************************************
137; SSE2 8-bit implementation that does 16-bytes at a time:
138
139INIT_XMM sse2
; idet_filter_line(a, b, c, width), SSE2, 16 bytes per iteration.
;
; a, b and c are indexed as byte rows. With ab = sat(A-B), ba = sat(B-A),
; bc = sat(B-C), cb = sat(C-B), psadbw sums |ab - bc| and |ba - cb| over the
; bytes of each 8-byte half; for unsigned inputs the two terms together equal
; |A + C - 2*B| per byte. The low 32 bits of the grand total are returned in
; eax.
;
; Only five general-purpose registers are requested: the four arguments plus
; index. No further GPR is referenced by the body, so no extra callee-saved
; register has to be preserved on targets where that costs a push/pop.
;
; The body executes before the first width test and always consumes 16
; bytes. NOTE(review): presumably the caller guarantees width > 0 and that
; reading up to the next multiple of 16 is safe - confirm.
140cglobal idet_filter_line, 4, 5, 7, a, b, c, width, index
141 xor indexq, indexq ; index = 0 (byte offset into the rows)
142 pxor m0, m0 ; accumulator for the |ab - bc| sums
143 pxor m1, m1 ; accumulator for the |ba - cb| sums
144
145.sse2_loop:
146 movu m2, [bq + indexq*1] ; B
147 movu m3, [aq + indexq*1] ; A
148 mova m6, m2 ; copy of B for the bc subtraction
149 mova m4, m3 ; copy of A for the ab subtraction
150 psubusb m5, m2, m3 ; ba = B - A, clamped at 0
151
152 movu m3, [cq + indexq*1] ; C (reuses m3; A lives on in m4)
153 add indexq, 0x10 ; advance before the compare below
154 psubusb m4, m2 ; ab = A - B, clamped at 0
; Flags set here are consumed by the jl at the bottom of the loop; the SSE2
; instructions in between do not modify EFLAGS.
155 CMP indexd, widthd
156
157 psubusb m6, m3 ; bc = B - C, clamped at 0
158 psubusb m3, m2 ; cb = C - B, clamped at 0
159
160 psadbw m4, m6 ; sum of |ab - bc| per 8-byte half
161 paddq m0, m4
162 psadbw m5, m3 ; sum of |ba - cb| per 8-byte half
163 paddq m1, m5
164 jl .sse2_loop ; continue while index < width (signed compare)
165
166 paddq m0, m1 ; merge the two accumulators
167 movhlps m1, m0 ; bring the upper qword down ...
168 paddq m0, m1 ; ... and add it to the lower qword
169 movd eax, m0 ; return value (low 32 bits of the total)
170 RET