; Imported Debian version 2.4.3~trusty1
; [deb_ffmpeg.git] / ffmpeg / libavfilter / x86 / yadif-10.asm
; Commit | Line | Data
; 2ba45a60 DM
;*****************************************************************************
;* x86-optimized functions for yadif filter
;*
;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Eight 16-bit words, each holding the value 1. Used as a low-bit mask in
; CHECK and as the constant 1 subtracted from the initial score in FILTER.
pw_1: times 8 dw 1

SECTION .text

; PMAXUW dst, src
;   Per-word unsigned maximum: dst = max(dst, src).
;   When the sse4 cpuflag is set the native pmaxuw is emitted. Otherwise the
;   unsigned-saturating subtract leaves max(dst - src, 0) in dst, and adding
;   src back gives max(dst, src).
;   Clobbers: dst only.
%macro PMAXUW 2
%if cpuflag(sse4)
    pmaxuw    %1, %2
%else
    psubusw   %1, %2                ; dst = max(dst - src, 0)
    paddusw   %1, %2                ; dst = max(dst, src)
%endif
%endmacro

; CHECK a, b
;   Computes a score and a prediction for one candidate pair of rows.
;   %1 and %2 are offsets, in 16-bit samples, applied to the vectors loaded
;   from curq+t1 and curq+t0 respectively.
;   In:   curq, t0, t1
;   Out:  m5 = floor((A + B) / 2) per word, then RSHIFT by 2
;         m2 = D + (D RSHIFT 2) + (D RSHIFT 4), where D = |A - B| per word
;         (A, B are the two loaded vectors)
;   Clobbers: m3, m4
;   NOTE(review): RSHIFT is defined in x86util.asm, which is not visible
;   here. It is presumably a whole-register shift counted in bytes, which
;   would make m2 the sum of three neighbouring absolute differences and put
;   m5 in the same lanes as the middle one - confirm.
%macro CHECK 2
    movu      m2, [curq+t1+%1*2]    ; A
    movu      m3, [curq+t0+%2*2]    ; B
    mova      m4, m2
    mova      m5, m2
    pxor      m4, m3
    pavgw     m5, m3                ; (A + B + 1) >> 1
    pand      m4, [pw_1]            ; 1 where A + B is odd
    psubusw   m5, m4                ; undo the round-up: (A + B) >> 1
    RSHIFT    m5, 2
    mova      m4, m2
    psubusw   m2, m3                ; max(A - B, 0)
    psubusw   m3, m4                ; max(B - A, 0)
    PMAXUW    m2, m3                ; D = |A - B|
    mova      m3, m2
    mova      m4, m2
    RSHIFT    m3, 2
    RSHIFT    m4, 4
    paddw     m2, m3
    paddw     m2, m4                ; D + (D RSHIFT 2) + (D RSHIFT 4)
%endmacro

; CHECK1
;   Accepts the candidate left in m2/m5 wherever its score is lower than the
;   best score so far.
;   In:   m0 = best score, m1 = best prediction,
;         m2 = candidate score, m5 = candidate prediction
;   Out:  m0 = min(m0, m2) (signed words)
;         m1 = m5 in lanes where the old m0 > m2, unchanged elsewhere
;         m6 = all-ones in the lanes that were replaced, zero elsewhere
;              (read by CHECK2)
;   Clobbers: m3, m5
%macro CHECK1 0
    mova      m3, m0
    pcmpgtw   m3, m2                ; mask = best score > candidate score
    pminsw    m0, m2
    mova      m6, m3                ; keep the mask for CHECK2
    pand      m5, m3                ; candidate prediction where accepted
    pandn     m3, m1                ; old prediction elsewhere
    por       m3, m5
    mova      m1, m3
%endmacro

; Previous implementation of CHECK2, kept for reference:
;
; %macro CHECK2 0
;     paddw     m6, [pw_1]
;     psllw     m6, 14
;     paddsw    m2, m6
;     mova      m3, m0
;     pcmpgtw   m3, m2
;     pminsw    m0, m2
;     pand      m5, m3
;     pandn     m3, m1
;     por       m3, m5
;     mova      m1, m3
; %endmacro

; This version of CHECK2 is required for 14-bit samples. The left-shift trick
; in the old code is not large enough to correctly select pixels or scores.

; CHECK2
;   Accepts the candidate left in m2/m5 only in lanes where the preceding
;   CHECK1 accepted its candidate (mask in m6) and the new score is lower
;   than the current best.
;   In:   m0 = best score, m1 = best prediction,
;         m2 = candidate score, m5 = candidate prediction,
;         m6 = acceptance mask written by CHECK1
;   Out:  m0 = m2 in accepted lanes, unchanged elsewhere
;         m1 = m5 in accepted lanes, unchanged elsewhere
;         m6 = copy of the new m1 (no longer a mask)
;   Clobbers: m2, m3, m5, m6
%macro CHECK2 0
    mova      m3, m0                ; save the old best score
    pcmpgtw   m0, m2                ; best score > candidate score
    pand      m0, m6                ; ... and CHECK1 accepted this lane
    mova      m6, m0                ; m0 and m6 both hold the combined mask
    pand      m5, m6                ; candidate prediction where accepted
    pand      m2, m0                ; candidate score where accepted
    pandn     m6, m1                ; old prediction elsewhere
    pandn     m0, m3                ; old score elsewhere
    por       m6, m5
    por       m0, m2
    mova      m1, m6
%endmacro

; LOAD dst, src
;   Loads one vector from src into dst with movu, so src does not need to be
;   aligned.
%macro LOAD 2
    movu      %1, %2
%endmacro

; FILTER id, ref_a, ref_b
;   Emits the per-line processing loop.
;     %1      suffix appended to the .loop/.end labels so that the macro can
;             be expanded more than once inside one function
;     %2, %3  pointer registers of the two rows whose average is the
;             temporal reference d
;
;   Inputs (named by the cglobal line in YADIF):
;     dstq, prevq, curq, nextq  row pointers, advanced every iteration
;     t0, t1                    row offsets (YADIF loads prefs into t0 and
;                               mrefs into t1)
;     r4m                       remaining width w, decremented in memory
;     r8m                       mode; values >= 2 skip the extra check below
;
;   Stack scratch:
;     [rsp+ 0] = c    = cur[t1]
;     [rsp+16] = d    = (ref_a + ref_b) >> 1
;     [rsp+32] = e    = cur[t0]
;     [rsp+48] = diff = temporal difference
;
;   Notation: c/d/e/diff above and b/f below are names used only in these
;   comments.
;
;   Each iteration stores one full vector to dst but advances the pointers by
;   mmsize-4 bytes, which is the same mmsize/2-2 samples that are subtracted
;   from w.
;   NOTE(review): presumably the last two words of each stored vector are not
;   valid because of the RSHIFTed operands and are rewritten by the next
;   iteration - confirm against x86util.asm and the caller's buffer padding.
;   NOTE(review): ABS1, ABS2 and RSHIFT come from x86util.asm (not visible
;   here); ABS1/ABS2 are presumably per-word absolute values that use their
;   extra operands as scratch.
%macro FILTER 3
.loop%1:
    pxor         m7, m7             ; NOTE(review): m7 is overwritten by the
                                    ; [rsp+32] reload before any read visible
                                    ; in this macro; likely dead unless an
                                    ; x86util helper reads it - confirm
    LOAD         m0, [curq+t1]      ; c
    LOAD         m1, [curq+t0]      ; e
    LOAD         m2, [%2]
    LOAD         m3, [%3]
    mova         m4, m3
    paddw        m3, m2
    psraw        m3, 1              ; d = (ref_a + ref_b) >> 1
    mova   [rsp+ 0], m0
    mova   [rsp+16], m3
    mova   [rsp+32], m1
    psubw        m2, m4
    ABS1         m2, m4             ; |ref_a - ref_b|
    LOAD         m3, [prevq+t1]
    LOAD         m4, [prevq+t0]
    psubw        m3, m0
    psubw        m4, m1
    ABS2         m3, m4, m5, m6
    paddw        m3, m4             ; |prev[t1] - c| + |prev[t0] - e|
    psrlw        m2, 1
    psrlw        m3, 1
    pmaxsw       m2, m3
    LOAD         m3, [nextq+t1]
    LOAD         m4, [nextq+t0]
    psubw        m3, m0
    psubw        m4, m1
    ABS2         m3, m4, m5, m6
    paddw        m3, m4             ; |next[t1] - c| + |next[t0] - e|
    psrlw        m3, 1
    pmaxsw       m2, m3             ; diff = max of the three halved terms
    mova   [rsp+48], m2

    ; Initial prediction in m1 and |c - e| in m0.
    paddw        m1, m0             ; c + e
    paddw        m0, m0
    psubw        m0, m1             ; 2c - (c + e) = c - e
    psrlw        m1, 1              ; prediction = (c + e) >> 1
    ABS1         m0, m2

    ; Initial score: |c - e| plus the absolute row difference taken one
    ; sample to the left and its RSHIFT-by-4 copy, minus 1.
    movu         m2, [curq+t1-1*2]
    movu         m3, [curq+t0-1*2]
    mova         m4, m2
    psubusw      m2, m3
    psubusw      m3, m4
    PMAXUW       m2, m3
    mova         m3, m2
    RSHIFT       m3, 4
    paddw        m0, m2
    paddw        m0, m3
    psubw        m0, [pw_1]

    ; Try the candidates. Each CHECK2 candidate is accepted only in lanes
    ; where the CHECK1 immediately before it accepted its own candidate.
    CHECK -2, 0
    CHECK1
    CHECK -3, 1
    CHECK2
    CHECK 0, -2
    CHECK1
    CHECK 1, -3
    CHECK2

    mova         m6, [rsp+48]       ; diff
    cmp   DWORD r8m, 2
    jge .end%1                      ; mode >= 2: keep diff as it is
    LOAD         m2, [%2+t1*2]
    LOAD         m4, [%3+t1*2]
    LOAD         m3, [%2+t0*2]
    LOAD         m5, [%3+t0*2]
    paddw        m2, m4
    paddw        m3, m5
    psrlw        m2, 1              ; b = average of the rows at offset 2*t1
    psrlw        m3, 1              ; f = average of the rows at offset 2*t0
    mova         m4, [rsp+ 0]       ; c
    mova         m5, [rsp+16]       ; d
    mova         m7, [rsp+32]       ; e
    psubw        m2, m4             ; b - c
    psubw        m3, m7             ; f - e
    mova         m0, m5
    psubw        m5, m4             ; d - c
    psubw        m0, m7             ; d - e
    mova         m4, m2
    pminsw       m2, m3             ; min(b - c, f - e)
    pmaxsw       m3, m4             ; max(b - c, f - e)
    pmaxsw       m2, m5
    pminsw       m3, m5
    pmaxsw       m2, m0             ; hi = max(d - e, d - c, min(b - c, f - e))
    pminsw       m3, m0             ; lo = min(d - e, d - c, max(b - c, f - e))
    pxor         m4, m4
    pmaxsw       m6, m3
    psubw        m4, m2
    pmaxsw       m6, m4             ; diff = max(diff, lo, -hi)

.end%1:
    ; Clamp the prediction to [d - diff, d + diff] and store it.
    mova         m2, [rsp+16]
    mova         m3, m2
    psubw        m2, m6
    paddw        m3, m6
    pmaxsw       m1, m2
    pminsw       m1, m3

    movu     [dstq], m1
    add        dstq, mmsize-4
    add       prevq, mmsize-4
    add        curq, mmsize-4
    add       nextq, mmsize-4
    sub   DWORD r4m, mmsize/2-2
    jg .loop%1                      ; loop while w > 0 (signed)
%endmacro

; YADIF
;   Defines yadif_filter_line_10bit for the instruction set selected by the
;   preceding INIT_* line.
;   Named arguments, in order: dst, prev, cur, next, w, prefs, mrefs,
;   parity, mode. The cglobal line asks for the first 4 arguments in
;   registers, 6 (x86-32) or 7 (x86-64) general-purpose registers, 8 vector
;   registers and 80 bytes of stack. w, parity and mode are read from their
;   argument slots (r4m, paritym, r8m).
;   After setup t0 holds prefs and t1 holds mrefs.
;   parity != 0 runs FILTER on (prev, cur); parity == 0 runs it on
;   (cur, next).
;   NOTE(review): FILTER stores to [rsp+N] with mova, so this relies on
;   cglobal (x86inc, not visible here) aligning the reserved stack - confirm.
%macro YADIF 0
%if ARCH_X86_32
cglobal yadif_filter_line_10bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
%else
cglobal yadif_filter_line_10bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
%endif
%if ARCH_X86_32
    mov            r4, r5mp         ; prefs
    mov            r5, r6mp         ; mrefs
    DECLARE_REG_TMP 4,5             ; t0 = r4, t1 = r5
%else
    movsxd         r5, DWORD r5m    ; prefs, sign-extended to 64 bits
    movsxd         r6, DWORD r6m    ; mrefs, sign-extended to 64 bits
    DECLARE_REG_TMP 5,6             ; t0 = r5, t1 = r6
%endif

    cmp DWORD paritym, 0
    je .parity0
    FILTER 1, prevq, curq
    jmp .ret

.parity0:
    FILTER 0, curq, nextq

.ret:
    RET
%endmacro

; Expand YADIF once per supported instruction set: SSSE3 and SSE2 with XMM
; registers on every target, plus an MMXEXT build on x86-32 only.
; NOTE(review): INIT_XMM/INIT_MMX come from x86inc (not visible here);
; presumably they set mmsize, the mN register aliases and the cpuflags that
; the following expansion is built for - confirm.
INIT_XMM ssse3
YADIF
INIT_XMM sse2
YADIF
%if ARCH_X86_32
INIT_MMX mmxext
YADIF
%endif