Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | ;***************************************************************************** |
2 | ;* x86-optimized functions for yadif filter | |
3 | ;* | |
4 | ;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at> | |
5 | ;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com> | |
6 | ;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com> | |
7 | ;* | |
8 | ;* This file is part of FFmpeg. | |
9 | ;* | |
10 | ;* FFmpeg is free software; you can redistribute it and/or | |
11 | ;* modify it under the terms of the GNU Lesser General Public | |
12 | ;* License as published by the Free Software Foundation; either | |
13 | ;* version 2.1 of the License, or (at your option) any later version. | |
14 | ;* | |
15 | ;* FFmpeg is distributed in the hope that it will be useful, | |
16 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
18 | ;* Lesser General Public License for more details. | |
19 | ;* | |
20 | ;* You should have received a copy of the GNU Lesser General Public | |
21 | ;* License along with FFmpeg; if not, write to the Free Software | |
22 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
23 | ;****************************************************************************** | |
24 | ||
25 | %include "libavutil/x86/x86util.asm" | |
26 | ||
; Constant vectors (16 bytes each) used as memory operands by the macros below:
;   pw_1    - 8 words of 1        (CHECK: isolates the low bit of a xor b)
;   pw_8000 - 8 words of 0x8000   (PACK fallback: added back after packssdw)
;   pd_1    - 4 dwords of 1       (CHECK2 and FILTER)
;   pd_8000 - 4 dwords of 0x8000  (PACK fallback: subtracted before packssdw)
27 | SECTION_RODATA | |
28 | ||
29 | pw_1: times 8 dw 1 | |
30 | pw_8000: times 8 dw 0x8000 | |
31 | pd_1: times 4 dd 1 | |
32 | pd_8000: times 4 dd 0x8000 | |
33 | ||
34 | SECTION .text | |
35 | ||
; PABS reg, tmp
; Replaces every signed dword in %1 with its absolute value.
;   %1 = register to process (overwritten with the result)
;   %2 = scratch register, clobbered only on the non-SSSE3 path
36 | %macro PABS 2 | |
37 | %if cpuflag(ssse3) | |
38 | pabsd %1, %1 | |
39 | %else | |
; Fallback: s = (0 > x) ? -1 : 0 per dword, then result = (x xor s) - s.
40 | pxor %2, %2 | |
41 | pcmpgtd %2, %1 | |
42 | pxor %1, %2 | |
43 | psubd %1, %2 | |
44 | %endif | |
45 | %endmacro | |
46 | ||
; PACK reg
; Narrows the dwords of %1 to words with unsigned saturation. Source and
; destination are the same register, so the packed words appear in the low
; half of %1 (and are repeated in the high half).
;   %1 = register to pack in place
47 | %macro PACK 1 | |
48 | %if cpuflag(sse4) | |
49 | packusdw %1, %1 | |
50 | %else | |
; Fallback without packusdw: bias the dwords by -0x8000 so the signed
; saturating pack clamps at the right bounds, then add 0x8000 back per word.
51 | psubd %1, [pd_8000] | |
52 | packssdw %1, %1 | |
53 | paddw %1, [pw_8000] | |
54 | %endif | |
55 | %endmacro | |
56 | ||
; PMINSD dst, src, tmp
; Per-dword signed minimum: %1 = min(%1, %2).
;   %1 = first operand and destination
;   %2 = second operand (not modified)
;   %3 = scratch register, clobbered only on the non-SSE4 path
57 | %macro PMINSD 3 | |
58 | %if cpuflag(sse4) | |
59 | pminsd %1, %2 | |
60 | %else | |
; Fallback: mask = (%2 > %1); keep %1 where the mask is set, take %2 elsewhere.
61 | mova %3, %2 | |
62 | pcmpgtd %3, %1 | |
63 | pand %1, %3 | |
64 | pandn %3, %2 | |
65 | por %1, %3 | |
66 | %endif | |
67 | %endmacro | |
68 | ||
; PMAXSD dst, src, tmp
; Per-dword signed maximum: %1 = max(%1, %2).
;   %1 = first operand and destination
;   %2 = second operand (not modified)
;   %3 = scratch register, clobbered only on the non-SSE4 path
69 | %macro PMAXSD 3 | |
70 | %if cpuflag(sse4) | |
71 | pmaxsd %1, %2 | |
72 | %else | |
; Fallback: mask = (%1 > %2); keep %1 where the mask is set, take %2 elsewhere.
73 | mova %3, %1 | |
74 | pcmpgtd %3, %2 | |
75 | pand %1, %3 | |
76 | pandn %3, %2 | |
77 | por %1, %3 | |
78 | %endif | |
79 | %endmacro | |
80 | ||
; PMAXUW dst, src
; Per-word unsigned maximum: %1 = max(%1, %2). No scratch register is needed.
;   %1 = first operand and destination
;   %2 = second operand (not modified)
81 | %macro PMAXUW 2 | |
82 | %if cpuflag(sse4) | |
83 | pmaxuw %1, %2 | |
84 | %else | |
; Fallback: the saturating subtract gives max(a - b, 0); adding b back yields max(a, b).
85 | psubusw %1, %2 | |
86 | paddusw %1, %2 | |
87 | %endif | |
88 | %endmacro | |
89 | ||
; CHECK off1, off2
; Evaluates one candidate pair of sample runs taken from the two cur rows.
;   %1 = sample offset applied to the row at curq+t1 (scaled by 2 bytes per sample)
;   %2 = sample offset applied to the row at curq+t0 (scaled by 2 bytes per sample)
; Results:
;   m5 = rounded-down average of the two loaded vectors, shifted by RSHIFT 2,
;        low words widened to dwords (the candidate value)
;   m2 = sum of three copies of the per-word absolute difference (unshifted,
;        RSHIFT 2 and RSHIFT 4), widened to dwords (the candidate score)
; Expects m7 to be zero so punpcklwd acts as a zero-extension (FILTER clears it
; at the top of each iteration). Clobbers m3 and m4.
; NOTE(review): RSHIFT comes from x86util.asm; it is presumably a whole-register
; byte shift, so 2 and 4 would step by one and two 16-bit samples - confirm there.
90 | %macro CHECK 2 | |
91 | movu m2, [curq+t1+%1*2] | |
92 | movu m3, [curq+t0+%2*2] | |
93 | mova m4, m2 | |
94 | mova m5, m2 | |
; pavgw rounds up; subtracting (a xor b) and 1 turns it into a truncating average.
95 | pxor m4, m3 | |
96 | pavgw m5, m3 | |
97 | pand m4, [pw_1] | |
98 | psubusw m5, m4 | |
99 | RSHIFT m5, 2 | |
100 | punpcklwd m5, m7 | |
; Unsigned abs(a - b) per word: max of the two saturating differences.
101 | mova m4, m2 | |
102 | psubusw m2, m3 | |
103 | psubusw m3, m4 | |
104 | PMAXUW m2, m3 | |
; Add the difference vector to two shifted copies of itself, as dwords.
105 | mova m3, m2 | |
106 | mova m4, m2 | |
107 | RSHIFT m3, 2 | |
108 | RSHIFT m4, 4 | |
109 | punpcklwd m2, m7 | |
110 | punpcklwd m3, m7 | |
111 | punpcklwd m4, m7 | |
112 | paddd m2, m3 | |
113 | paddd m2, m4 | |
114 | %endmacro | |
115 | ||
; CHECK1
; Folds the candidate produced by CHECK into the running best.
; Inputs:  m0 = best score so far, m1 = value belonging to that score,
;          m2 = candidate score,   m5 = candidate value
; Outputs: m0 = min(m0, m2)
;          m1 = m5 where the old m0 was greater than m2, otherwise unchanged
;          m6 = that comparison mask (all ones where the candidate won),
;               which CHECK2 reads afterwards
; Clobbers m3 and m5.
116 | %macro CHECK1 0 | |
117 | mova m3, m0 | |
118 | pcmpgtd m3, m2 | |
; m6 is only scratch here; it is overwritten with the mask on the next line.
119 | PMINSD m0, m2, m6 | |
120 | mova m6, m3 | |
; Blend: m1 = (m5 and mask) or (m1 and not mask).
121 | pand m5, m3 | |
122 | pandn m3, m1 | |
123 | por m3, m5 | |
124 | mova m1, m3 | |
125 | %endmacro | |
126 | ||
; CHECK2
; Like CHECK1, but first penalises the candidate wherever the preceding CHECK1
; did not accept its own candidate.
; Inputs:  m0 = best score so far, m1 = value belonging to that score,
;          m2 = candidate score,   m5 = candidate value,
;          m6 = mask left by CHECK1 (all ones = accepted, zero = rejected)
; Outputs: m0 = min(m0, penalised m2); m1 = m5 where the penalised candidate won
; Clobbers m2, m3, m5, m6, and m4 on the non-SSE4 path (PMINSD scratch).
127 | %macro CHECK2 0 | |
; (mask + 1) << 30 is 0 where the mask was all ones and 0x40000000 where it was 0.
128 | paddd m6, [pd_1] | |
129 | pslld m6, 30 | |
130 | paddd m2, m6 | |
131 | mova m3, m0 | |
132 | pcmpgtd m3, m2 | |
133 | PMINSD m0, m2, m4 | |
; Blend: m1 = (m5 and mask) or (m1 and not mask).
134 | pand m5, m3 | |
135 | pandn m3, m1 | |
136 | por m3, m5 | |
137 | mova m1, m3 | |
138 | %endmacro | |
139 | ||
140 | ; This version of CHECK2 uses 3 fewer instructions on instruction sets older | |
141 | ; than SSE4, but I am not sure whether it is actually any faster. A rewrite or | |
142 | ; refactor of the filter code should make it possible to eliminate the move | |
143 | ; instruction at the end, which exists only to satisfy the expectation that the "score" values are in m1. | |
144 | ||
145 | ; %macro CHECK2 0 | |
146 | ; mova m3, m0 | |
147 | ; pcmpgtd m0, m2 | |
148 | ; pand m0, m6 | |
149 | ; mova m6, m0 | |
150 | ; pand m5, m6 | |
151 | ; pand m2, m0 | |
152 | ; pandn m6, m1 | |
153 | ; pandn m0, m3 | |
154 | ; por m6, m5 | |
155 | ; por m0, m2 | |
156 | ; mova m1, m6 | |
157 | ; %endmacro | |
158 | ||
; LOAD dst, mem
; Loads packed 16-bit samples from %2 with movh and interleaves them with the
; words of m7, which zero-extends them to dwords provided m7 is zero.
;   %1 = destination register
;   %2 = memory operand
; NOTE(review): movh is an x86inc alias; presumably it loads half a register's
; worth of data (mmsize/2 bytes) - confirm in x86inc.asm.
159 | %macro LOAD 2 | |
160 | movh %1, %2 | |
161 | punpcklwd %1, m7 | |
162 | %endmacro | |
163 | ||
; FILTER suffix, rowA, rowB
; Emits the main loop for one line. Each iteration widens 16-bit samples to
; dwords, computes mmsize/4 output samples and stores them as mmsize/2 bytes.
;   %1 = suffix appended to the .loop/.end labels so the macro can be expanded
;        more than once inside the same function
;   %2 = pointer register of the first line that is averaged into d
;   %3 = pointer register of the second line that is averaged into d
; Uses dstq, prevq, curq, nextq, the row offsets t0/t1 (YADIF binds them to
; prefs and mrefs), the width argument through r4m and the mode argument
; through r8m. All of m0-m7 are clobbered.
; Stack slots: [rsp+0] = c (cur row at t1), [rsp+16] = d, [rsp+32] = e (cur row
; at t0), [rsp+48] = diff (largest of the three halved differences).
164 | %macro FILTER 3 | |
165 | .loop%1: | |
; m7 = 0: needed by LOAD and by the punpcklwd zero-extensions below.
166 | pxor m7, m7 | |
; m0 = c, m1 = e, m2 = line %2, m3 = line %3.
167 | LOAD m0, [curq+t1] | |
168 | LOAD m1, [curq+t0] | |
169 | LOAD m2, [%2] | |
170 | LOAD m3, [%3] | |
; d = (%2 + %3) >> 1; m4 keeps a copy of %3 for the difference taken below.
171 | mova m4, m3 | |
172 | paddd m3, m2 | |
173 | psrad m3, 1 | |
174 | mova [rsp+ 0], m0 | |
175 | mova [rsp+16], m3 | |
176 | mova [rsp+32], m1 | |
; m2 = abs(%2 - %3); it is halved a few lines further down.
177 | psubd m2, m4 | |
178 | PABS m2, m4 | |
; m3 = (abs(prev[t1] - c) + abs(prev[t0] - e)) >> 1
179 | LOAD m3, [prevq+t1] | |
180 | LOAD m4, [prevq+t0] | |
181 | psubd m3, m0 | |
182 | psubd m4, m1 | |
183 | PABS m3, m5 | |
184 | PABS m4, m5 | |
185 | paddd m3, m4 | |
186 | psrld m2, 1 | |
187 | psrld m3, 1 | |
188 | PMAXSD m2, m3, m6 | |
; m3 = (abs(next[t1] - c) + abs(next[t0] - e)) >> 1, folded into the maximum.
189 | LOAD m3, [nextq+t1] | |
190 | LOAD m4, [nextq+t0] | |
191 | psubd m3, m0 | |
192 | psubd m4, m1 | |
193 | PABS m3, m5 | |
194 | PABS m4, m5 | |
195 | paddd m3, m4 | |
196 | psrld m3, 1 | |
197 | PMAXSD m2, m3, m6 | |
; diff = largest of the three halved differences.
198 | mova [rsp+48], m2 | |
199 | ||
; Initial candidate: m1 = (c + e) >> 1 and m0 = abs(c - e).
; m0 is computed as 2*c - (c + e) before taking the absolute value.
200 | paddd m1, m0 | |
201 | paddd m0, m0 | |
202 | psubd m0, m1 | |
203 | psrld m1, 1 | |
204 | PABS m0, m2 | |
205 | ||
; Add the absolute row differences taken one sample to the left (offset -1*2
; bytes) and at the position RSHIFT 4 moves into place, then subtract 1.
; The result in m0 is the initial score.
206 | movu m2, [curq+t1-1*2] | |
207 | movu m3, [curq+t0-1*2] | |
208 | mova m4, m2 | |
209 | psubusw m2, m3 | |
210 | psubusw m3, m4 | |
211 | PMAXUW m2, m3 | |
212 | mova m3, m2 | |
213 | RSHIFT m3, 4 | |
214 | punpcklwd m2, m7 | |
215 | punpcklwd m3, m7 | |
216 | paddd m0, m2 | |
217 | paddd m0, m3 | |
218 | psubd m0, [pd_1] | |
219 | ||
; Try four shifted row pairs. Each CHECK yields a candidate (score m2, value
; m5); CHECK1/CHECK2 keep it in m0/m1 when its score is lower. CHECK2 also
; penalises its candidate where the preceding CHECK1 rejected its own.
220 | CHECK -2, 0 | |
221 | CHECK1 | |
222 | CHECK -3, 1 | |
223 | CHECK2 | |
224 | CHECK 0, -2 | |
225 | CHECK1 | |
226 | CHECK 1, -3 | |
227 | CHECK2 | |
228 | ||
; m6 = diff. When mode >= 2 the refinement of diff below is skipped.
229 | mova m6, [rsp+48] | |
230 | cmp DWORD r8m, 2 | |
231 | jge .end%1 | |
; m2 = ((%2 + %3) >> 1 at row offset t1*2) - c
; m3 = ((%2 + %3) >> 1 at row offset t0*2) - e
232 | LOAD m2, [%2+t1*2] | |
233 | LOAD m4, [%3+t1*2] | |
234 | LOAD m3, [%2+t0*2] | |
235 | LOAD m5, [%3+t0*2] | |
236 | paddd m2, m4 | |
237 | paddd m3, m5 | |
238 | psrld m2, 1 | |
239 | psrld m3, 1 | |
; From here on m7 is no longer zero: it holds e and then serves as scratch.
240 | mova m4, [rsp+ 0] | |
241 | mova m5, [rsp+16] | |
242 | mova m7, [rsp+32] | |
243 | psubd m2, m4 | |
244 | psubd m3, m7 | |
; m5 = d - c, m0 = d - e
245 | mova m0, m5 | |
246 | psubd m5, m4 | |
247 | psubd m0, m7 | |
; m2 = max(min(m2, m3), d - c, d - e)
; m3 = min(max(m2, m3), d - c, d - e)   (m4 holds the original m2)
248 | mova m4, m2 | |
249 | PMINSD m2, m3, m7 | |
250 | PMAXSD m3, m4, m7 | |
251 | PMAXSD m2, m5, m7 | |
252 | PMINSD m3, m5, m7 | |
253 | PMAXSD m2, m0, m7 | |
254 | PMINSD m3, m0, m7 | |
; diff = max(diff, m3, -m2)
255 | pxor m4, m4 | |
256 | PMAXSD m6, m3, m7 | |
257 | psubd m4, m2 | |
258 | PMAXSD m6, m4, m7 | |
259 | ||
260 | .end%1: | |
; Clamp the chosen value m1 to the range [d - diff, d + diff].
261 | mova m2, [rsp+16] | |
262 | mova m3, m2 | |
263 | psubd m2, m6 | |
264 | paddd m3, m6 | |
265 | PMAXSD m1, m2, m7 | |
266 | PMINSD m1, m3, m7 | |
; Narrow back to 16-bit samples and store the low half of the register.
267 | PACK m1 | |
268 | ||
269 | movh [dstq], m1 | |
; Advance every pointer by the bytes just produced (mmsize/2) and count the
; width down by the same number of samples (mmsize/4); loop while it is > 0.
270 | add dstq, mmsize/2 | |
271 | add prevq, mmsize/2 | |
272 | add curq, mmsize/2 | |
273 | add nextq, mmsize/2 | |
274 | sub DWORD r4m, mmsize/4 | |
275 | jg .loop%1 | |
276 | %endmacro | |
277 | ||
; YADIF
; Defines the function yadif_filter_line_16bit for whichever instruction set
; the preceding INIT_* line selected.
; Arguments, in the order named on the cglobal line:
;   dst, prev, cur, next, w, prefs, mrefs, parity, mode
; NOTE(review): the numeric cglobal fields are read per the x86inc convention
; (args loaded, GPRs used, vector regs used, stack bytes): 4 args loaded, 6 GPRs
; on x86-32 or 7 on x86-64, 8 vector registers, 80 bytes of stack - confirm
; against x86inc.asm.
; The remaining arguments are read explicitly: prefs and mrefs below, parity
; through paritym, and w / mode through r4m / r8m inside FILTER.
278 | %macro YADIF 0 | |
279 | %if ARCH_X86_32 | |
280 | cglobal yadif_filter_line_16bit, 4, 6, 8, 80, dst, prev, cur, next, w, \ | |
281 | prefs, mrefs, parity, mode | |
282 | %else | |
283 | cglobal yadif_filter_line_16bit, 4, 7, 8, 80, dst, prev, cur, next, w, \ | |
284 | prefs, mrefs, parity, mode | |
285 | %endif | |
; Fetch prefs (argument 5) and mrefs (argument 6) into the registers declared
; as temporaries, so that t0 = prefs and t1 = mrefs in FILTER and CHECK.
286 | %if ARCH_X86_32 | |
287 | mov r4, r5mp | |
288 | mov r5, r6mp | |
289 | DECLARE_REG_TMP 4,5 | |
290 | %else | |
; On x86-64 the 32-bit values are sign-extended to full register width before
; being used in address expressions.
291 | movsxd r5, DWORD r5m | |
292 | movsxd r6, DWORD r6m | |
293 | DECLARE_REG_TMP 5,6 | |
294 | %endif | |
295 | ||
; parity != 0: d is averaged from prev and cur; parity == 0: from cur and next.
296 | cmp DWORD paritym, 0 | |
297 | je .parity0 | |
298 | FILTER 1, prevq, curq | |
299 | jmp .ret | |
300 | ||
301 | .parity0: | |
302 | FILTER 0, curq, nextq | |
303 | ||
304 | .ret: | |
305 | RET | |
306 | %endmacro | |
307 | ||
; Expand YADIF once per supported instruction-set level. The cpuflag() tests
; inside PABS, PACK, PMINSD, PMAXSD and PMAXUW choose between the native
; instruction and the emulated sequence for each expansion.
; NOTE(review): INIT_XMM / INIT_MMX are x86inc macros; presumably they set the
; register width (mmsize) and the active cpuflags - confirm in x86inc.asm.
308 | INIT_XMM sse4 | |
309 | YADIF | |
310 | INIT_XMM ssse3 | |
311 | YADIF | |
312 | INIT_XMM sse2 | |
313 | YADIF | |
; The MMX-register variant is only assembled when ARCH_X86_32 is set.
314 | %if ARCH_X86_32 | |
315 | INIT_MMX mmxext | |
316 | YADIF | |
317 | %endif |