Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libavfilter / x86 / yadif-16.asm
CommitLineData
2ba45a60
DM
1;*****************************************************************************
2;* x86-optimized functions for yadif filter
3;*
4;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
5;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
6;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
7;*
8;* This file is part of FFmpeg.
9;*
10;* FFmpeg is free software; you can redistribute it and/or
11;* modify it under the terms of the GNU Lesser General Public
12;* License as published by the Free Software Foundation; either
13;* version 2.1 of the License, or (at your option) any later version.
14;*
15;* FFmpeg is distributed in the hope that it will be useful,
16;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18;* Lesser General Public License for more details.
19;*
20;* You should have received a copy of the GNU Lesser General Public
21;* License along with FFmpeg; if not, write to the Free Software
22;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23;******************************************************************************
24
25%include "libavutil/x86/x86util.asm"
26
; Read-only constant vectors used as memory operands by the macros in this
; file. The pw_* labels hold eight 16-bit words, the pd_* labels hold four
; 32-bit dwords.
27SECTION_RODATA
28
; pw_1: low-bit mask applied to the XOR of the two inputs in CHECK.
29pw_1: times 8 dw 1
; pw_8000: added back to the packed words in the pre-SSE4 path of PACK.
30pw_8000: times 8 dw 0x8000
; pd_1: subtracted from m0 in FILTER and added to the mask in CHECK2.
31pd_1: times 4 dd 1
; pd_8000: subtracted from the dwords before packing in the pre-SSE4 path of PACK.
32pd_8000: times 4 dd 0x8000
33
34SECTION .text
35
; PABS dst, tmp
; Replaces every signed 32-bit lane of dst with its absolute value.
;   %1  register to convert in place
;   %2  scratch register; only written on the pre-SSSE3 path
36%macro PABS 2
37%if cpuflag(ssse3)
38 pabsd %1, %1
39%else
; Build the sign mask in %2 (all ones in lanes where 0 > %1), then compute
; (x ^ mask) - mask, which negates exactly the negative lanes.
40 pxor %2, %2
41 pcmpgtd %2, %1
42 pxor %1, %2
43 psubd %1, %2
44%endif
45%endmacro
46
; PACK reg
; Narrows the 32-bit lanes of reg to 16-bit words with unsigned saturation.
; reg is passed as both pack operands, so the packed words are duplicated in
; the low and high halves of the register.
47%macro PACK 1
48%if cpuflag(sse4)
49 packusdw %1, %1
50%else
; Without packusdw only a signed saturating pack is available: bias the
; dwords down by 0x8000, pack with signed saturation, then add 0x8000 back to
; every word (the word add wraps, restoring the unsigned range).
51 psubd %1, [pd_8000]
52 packssdw %1, %1
53 paddw %1, [pw_8000]
54%endif
55%endmacro
56
; PMINSD dst, src, tmp
; Per-lane signed 32-bit minimum: dst = min(dst, src).
;   %1  destination and first operand
;   %2  second operand (not modified)
;   %3  scratch register; only written on the pre-SSE4 path
57%macro PMINSD 3
58%if cpuflag(sse4)
59 pminsd %1, %2
60%else
; %3 = mask of lanes where src > dst. Keep dst in those lanes, take src in
; the remaining lanes, and merge the two halves.
61 mova %3, %2
62 pcmpgtd %3, %1
63 pand %1, %3
64 pandn %3, %2
65 por %1, %3
66%endif
67%endmacro
68
; PMAXSD dst, src, tmp
; Per-lane signed 32-bit maximum: dst = max(dst, src).
;   %1  destination and first operand
;   %2  second operand (not modified)
;   %3  scratch register; only written on the pre-SSE4 path
69%macro PMAXSD 3
70%if cpuflag(sse4)
71 pmaxsd %1, %2
72%else
; %3 = mask of lanes where dst > src. Keep dst in those lanes, take src in
; the remaining lanes, and merge the two halves.
73 mova %3, %1
74 pcmpgtd %3, %2
75 pand %1, %3
76 pandn %3, %2
77 por %1, %3
78%endif
79%endmacro
80
; PMAXUW dst, src
; Per-lane unsigned 16-bit maximum: dst = max(dst, src). src is not modified.
81%macro PMAXUW 2
82%if cpuflag(sse4)
83 pmaxuw %1, %2
84%else
; The saturating subtract leaves max(dst - src, 0); adding src back yields
; max(dst, src) without needing a compare.
85 psubusw %1, %2
86 paddusw %1, %2
87%endif
88%endmacro
89
; CHECK off1, off0
; Evaluates one pair of horizontally displaced lines.
;   %1  word offset applied to the line at curq+t1
;   %2  word offset applied to the line at curq+t0
; (both offsets are multiplied by 2 to form the byte displacement)
;
; Inputs:  curq, t0, t1; m7 is interleaved as the high half of every dword,
;          so it must be zero for the widening to be a zero-extension.
; Outputs: m5 = average of the two loaded lines, widened to dwords
;          m2 = sum of three absolute differences, widened to dwords
; Clobbers m3 and m4.
; NOTE(review): RSHIFT comes from the included x86util.asm and is not visible
; here; the comments below assume it shifts the whole register right by the
; given number of bytes - confirm against x86util.asm.
90%macro CHECK 2
91 movu m2, [curq+t1+%1*2]
92 movu m3, [curq+t0+%2*2]
93 mova m4, m2
94 mova m5, m2
; pavgw rounds up; subtracting the low bit of (a ^ b) turns the result into
; the truncating average (a + b) >> 1.
95 pxor m4, m3
96 pavgw m5, m3
97 pand m4, [pw_1]
98 psubusw m5, m4
; Drop the first word so the average lines up with the next word position,
; then widen the low words to dwords.
99 RSHIFT m5, 2
100 punpcklwd m5, m7
; m2 = |a - b| per word: the larger of the two saturating differences.
101 mova m4, m2
102 psubusw m2, m3
103 psubusw m3, m4
104 PMAXUW m2, m3
; Copies displaced by one and two words, so each dword lane of the final m2
; sums the absolute differences of three consecutive word positions.
105 mova m3, m2
106 mova m4, m2
107 RSHIFT m3, 2
108 RSHIFT m4, 4
109 punpcklwd m2, m7
110 punpcklwd m3, m7
111 punpcklwd m4, m7
112 paddd m2, m3
113 paddd m2, m4
114%endmacro
115
; CHECK1
; Folds the candidate produced by CHECK into the running result.
; Inputs:  m2 = candidate value to minimise, m5 = candidate payload (both
;          written by CHECK), m0 = running minimum, m1 = payload that
;          accompanies the running minimum.
; Outputs: m0 = min(m0, m2)
;          m1 = m5 in lanes where m2 was smaller than the old m0, else unchanged
;          m6 = mask of those lanes (all ones where the candidate won); the
;               following CHECK2 consumes it
; Clobbers m3 and m5.
116%macro CHECK1 0
; m3 = lanes where the old minimum is greater than the candidate.
117 mova m3, m0
118 pcmpgtd m3, m2
; m6 is only scratch for the pre-SSE4 fallback here; it is overwritten with
; the mask on the next line.
119 PMINSD m0, m2, m6
120 mova m6, m3
; Select m5 where the candidate won and the previous m1 elsewhere.
121 pand m5, m3
122 pandn m3, m1
123 por m3, m5
124 mova m1, m3
125%endmacro
126
; CHECK2
; Like CHECK1, but the candidate is only allowed to win in lanes where the
; preceding CHECK1 won.
; Inputs:  m6 = mask left by CHECK1 (-1 where its candidate won, 0 elsewhere),
;          m2/m5 = candidate value/payload from CHECK, m0/m1 = running
;          minimum and its payload.
; Outputs: m0 = min(m0, penalised m2), m1 = selected payload.
; Clobbers m2, m3, m5 and m6; m4 is scratch for the pre-SSE4 PMINSD fallback.
127%macro CHECK2 0
; (mask + 1) << 30 is 0 where CHECK1 won and 0x40000000 where it did not.
; Adding that to the candidate keeps it from winning the comparison in the
; latter lanes - assumes m0 stays below 2^30; TODO confirm value range.
128 paddd m6, [pd_1]
129 pslld m6, 30
130 paddd m2, m6
; Same compare / minimum / select sequence as CHECK1, without saving the mask.
131 mova m3, m0
132 pcmpgtd m3, m2
133 PMINSD m0, m2, m4
134 pand m5, m3
135 pandn m3, m1
136 por m3, m5
137 mova m1, m3
138%endmacro
139
140; This version of CHECK2 has 3 fewer instructions on instruction sets older than
141; SSE4, but I am not sure whether it is any faster. A rewrite or refactor of the
142; filter code should make it possible to eliminate the move instruction at the
143; end, which exists only so that the "score" values end up in m1 as expected.
144
145; %macro CHECK2 0
146; mova m3, m0
147; pcmpgtd m0, m2
148; pand m0, m6
149; mova m6, m0
150; pand m5, m6
151; pand m2, m0
152; pandn m6, m1
153; pandn m0, m3
154; por m6, m5
155; por m0, m2
156; mova m1, m6
157; %endmacro
158
; LOAD dst, mem
; Loads from mem with movh and interleaves the loaded words with m7, so with
; m7 == 0 each 16-bit word becomes a zero-extended 32-bit lane of dst.
;   %1  destination register
;   %2  memory operand
; NOTE(review): movh is provided by the included x86inc framework; presumably
; it loads half a register width - confirm there.
159%macro LOAD 2
160 movh %1, %2
161 punpcklwd %1, m7
162%endmacro
163
; FILTER suffix, ptrA, ptrB
; Body of the line filter loop; expanded once per parity inside YADIF.
;   %1      suffix appended to the local labels .loop and .end so the macro
;           can be expanded more than once in the same function
;   %2, %3  pointer registers of the two lines that are averaged
;
; Uses dstq, prevq, curq, nextq, the line offsets t0/t1, the stack argument
; slots r4m (loop counter) and r8m (compared against 2), and four 16-byte
; scratch slots on the stack:
;   [rsp+ 0]  widened line at curq+t1
;   [rsp+16]  (%2 + %3) >> 1
;   [rsp+32]  widened line at curq+t0
;   [rsp+48]  maximum of the three half-differences computed below
; Every iteration stores mmsize/2 bytes to dstq, advances all four pointers by
; mmsize/2 bytes and subtracts mmsize/4 from r4m; it loops while r4m stays > 0.
; All vector registers m0-m7 are clobbered.
164%macro FILTER 3
165.loop%1:
; m7 = 0 is the zero-extension source for LOAD and CHECK. It has to be
; re-cleared every iteration because later code reuses m7 as scratch.
166 pxor m7, m7
167 LOAD m0, [curq+t1]
168 LOAD m1, [curq+t0]
169 LOAD m2, [%2]
170 LOAD m3, [%3]
; m3 = (%2 + %3) >> 1, kept at [rsp+16]; m2 = |%2 - %3|.
171 mova m4, m3
172 paddd m3, m2
173 psrad m3, 1
174 mova [rsp+ 0], m0
175 mova [rsp+16], m3
176 mova [rsp+32], m1
177 psubd m2, m4
178 PABS m2, m4
; m3 = (|prev[t1] - cur[t1]| + |prev[t0] - cur[t0]|) >> 1
179 LOAD m3, [prevq+t1]
180 LOAD m4, [prevq+t0]
181 psubd m3, m0
182 psubd m4, m1
183 PABS m3, m5
184 PABS m4, m5
185 paddd m3, m4
186 psrld m2, 1
187 psrld m3, 1
188 PMAXSD m2, m3, m6
; Same measurement against the next frame; m2 accumulates the maximum.
189 LOAD m3, [nextq+t1]
190 LOAD m4, [nextq+t0]
191 psubd m3, m0
192 psubd m4, m1
193 PABS m3, m5
194 PABS m4, m5
195 paddd m3, m4
196 psrld m3, 1
197 PMAXSD m2, m3, m6
198 mova [rsp+48], m2
199
; m1 = (cur[t1] + cur[t0]) >> 1, m0 = |cur[t1] - cur[t0]|
200 paddd m1, m0
201 paddd m0, m0
202 psubd m0, m1
203 psrld m1, 1
204 PABS m0, m2
205
; Add the absolute differences of the neighbouring word positions (one word
; to the left, and - via the 4-byte RSHIFT - one word to the right) to m0,
; then subtract 1. m0/m1 become the starting point for the CHECK sequence.
206 movu m2, [curq+t1-1*2]
207 movu m3, [curq+t0-1*2]
208 mova m4, m2
209 psubusw m2, m3
210 psubusw m3, m4
211 PMAXUW m2, m3
212 mova m3, m2
213 RSHIFT m3, 4
214 punpcklwd m2, m7
215 punpcklwd m3, m7
216 paddd m0, m2
217 paddd m0, m3
218 psubd m0, [pd_1]
219
; Try the displaced line pairs. Each CHECK2 candidate can only win in lanes
; where the CHECK1 immediately before it won.
220 CHECK -2, 0
221 CHECK1
222 CHECK -3, 1
223 CHECK2
224 CHECK 0, -2
225 CHECK1
226 CHECK 1, -3
227 CHECK2
228
; m6 = maximum saved at [rsp+48]. The block up to .end widens it further and
; is skipped when the value in r8m is 2 or greater.
229 mova m6, [rsp+48]
230 cmp DWORD r8m, 2
231 jge .end%1
; m2 = (%2[2*t1] + %3[2*t1]) >> 1, m3 = (%2[2*t0] + %3[2*t0]) >> 1
232 LOAD m2, [%2+t1*2]
233 LOAD m4, [%3+t1*2]
234 LOAD m3, [%2+t0*2]
235 LOAD m5, [%3+t0*2]
236 paddd m2, m4
237 paddd m3, m5
238 psrld m2, 1
239 psrld m3, 1
; Reload the saved lines. m7 stops being zero from here on and serves as
; scratch until the next iteration clears it again.
240 mova m4, [rsp+ 0]
241 mova m5, [rsp+16]
242 mova m7, [rsp+32]
; Differences against the saved lines: m2 -= cur[t1], m3 -= cur[t0],
; m5 = avg - cur[t1], m0 = avg - cur[t0], where avg is [rsp+16].
243 psubd m2, m4
244 psubd m3, m7
245 mova m0, m5
246 psubd m5, m4
247 psubd m0, m7
; m2 = max(min(m2, m3), m5, m0); m3 = min(max(m2, m3), m5, m0)
248 mova m4, m2
249 PMINSD m2, m3, m7
250 PMAXSD m3, m4, m7
251 PMAXSD m2, m5, m7
252 PMINSD m3, m5, m7
253 PMAXSD m2, m0, m7
254 PMINSD m3, m0, m7
; m6 = max(m6, m3, -m2)
255 pxor m4, m4
256 PMAXSD m6, m3, m7
257 psubd m4, m2
258 PMAXSD m6, m4, m7
259
260.end%1:
; Clamp the selected value in m1 to the range [avg - m6, avg + m6], where
; avg is the value saved at [rsp+16].
261 mova m2, [rsp+16]
262 mova m3, m2
263 psubd m2, m6
264 paddd m3, m6
265 PMAXSD m1, m2, m7
266 PMINSD m1, m3, m7
267 PACK m1
268
; Store the packed words and step to the next group of mmsize/4 words.
269 movh [dstq], m1
270 add dstq, mmsize/2
271 add prevq, mmsize/2
272 add curq, mmsize/2
273 add nextq, mmsize/2
274 sub DWORD r4m, mmsize/4
275 jg .loop%1
276%endmacro
277
; YADIF
; Emits the function yadif_filter_line_16bit for the instruction set selected
; by the preceding INIT_* directive.
; Named arguments, in order: dst, prev, cur, next, w, prefs, mrefs, parity, mode.
; NOTE(review): the numeric cglobal fields (4, 6 or 7, 8, 80) presumably follow
; the x86inc convention of arguments loaded into registers, general-purpose
; registers used, vector registers used and stack bytes reserved - confirm
; against x86inc.asm. FILTER addresses its scratch slots at rsp+0 .. rsp+48.
278%macro YADIF 0
279%if ARCH_X86_32
280cglobal yadif_filter_line_16bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
281 prefs, mrefs, parity, mode
282%else
283cglobal yadif_filter_line_16bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
284 prefs, mrefs, parity, mode
285%endif
; Fetch arguments 5 and 6 (prefs and mrefs) into registers and name them
; t0 and t1 for FILTER, CHECK and LOAD: t0 = prefs, t1 = mrefs.
286%if ARCH_X86_32
287 mov r4, r5mp
288 mov r5, r6mp
289 DECLARE_REG_TMP 4,5
290%else
; On 64-bit builds the arguments are read as 32-bit values and sign-extended
; before they are used in address arithmetic.
291 movsxd r5, DWORD r5m
292 movsxd r6, DWORD r6m
293 DECLARE_REG_TMP 5,6
294%endif
295
; parity selects the pair of lines that FILTER averages:
; parity != 0 uses (prev, cur), parity == 0 uses (cur, next).
296 cmp DWORD paritym, 0
297 je .parity0
298 FILTER 1, prevq, curq
299 jmp .ret
300
301.parity0:
302 FILTER 0, curq, nextq
303
304.ret:
305 RET
306%endmacro
307
; Expand YADIF once per supported instruction set. The cpuflag() tests inside
; PABS, PACK, PMINSD, PMAXSD and PMAXUW pick the native instruction or the
; fallback sequence for each expansion.
308INIT_XMM sse4
309YADIF
310INIT_XMM ssse3
311YADIF
312INIT_XMM sse2
313YADIF
; The MMX-register (mmxext) variant is only assembled for 32-bit x86 builds.
314%if ARCH_X86_32
315INIT_MMX mmxext
316YADIF
317%endif