Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | ;****************************************************************************** |
2 | ;* VC1 deblocking optimizations | |
3 | ;* Copyright (c) 2009 David Conrad | |
4 | ;* | |
5 | ;* This file is part of FFmpeg. | |
6 | ;* | |
7 | ;* FFmpeg is free software; you can redistribute it and/or | |
8 | ;* modify it under the terms of the GNU Lesser General Public | |
9 | ;* License as published by the Free Software Foundation; either | |
10 | ;* version 2.1 of the License, or (at your option) any later version. | |
11 | ;* | |
12 | ;* FFmpeg is distributed in the hope that it will be useful, | |
13 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | ;* Lesser General Public License for more details. | |
16 | ;* | |
17 | ;* You should have received a copy of the GNU Lesser General Public | |
18 | ;* License along with FFmpeg; if not, write to the Free Software | |
19 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | ;****************************************************************************** | |
21 | ||
22 | %include "libavutil/x86/x86util.asm" | |
23 | ||
24 | cextern pw_4 | |
25 | cextern pw_5 | |
26 | ||
27 | section .text | |
28 | ||
; UNPACK_8TO16 suffix, dst_low, dst_high (src), zero
;   %1: punpck suffix selecting the interleave width ("bw" at every call site
;       in this file)
;   %2: number of the register that receives the low half of the source
;   %3: number of the source register; on exit it holds the high half
;   %4: number of a register the caller has set to zero
; Zero-extends one vector from 8 to 16 bits, splitting it over two registers.
%macro UNPACK_8TO16 4
    mova        m%2, m%3        ; copy the source before m%3 is overwritten
    punpckh%1   m%3, m%4        ; m%3 = upper half interleaved with m%4
    punpckl%1   m%2, m%4        ; m%2 = lower half interleaved with m%4
%endmacro
36 | ||
; STORE_4_WORDS dst0, dst1, dst2, dst3, src, tmp_or_index
;   %1-%4: the four 16-bit destinations (memory operands at the call sites)
;   %5:    vector register holding the words to store
;   %6:    with SSE4: index of the first word to extract from %5
;          otherwise: general-purpose scratch register (used as %6, %6d, %6w)
; Without SSE4 the words are moved out two at a time through the scratch
; register, so %5 ends up shifted right by 4 bytes and %6 is clobbered.
%macro STORE_4_WORDS 6
%if cpuflag(sse4)
    pextrw  %1, %5, %6+0
    pextrw  %2, %5, %6+1
    pextrw  %3, %5, %6+2
    pextrw  %4, %5, %6+3
%else
    movd    %6d, %5             ; scratch = words 0 and 1
%if mmsize != 16
    psrlq   %5, 32              ; 64-bit register: drop the two words just read
%else
    psrldq  %5, 4               ; 128-bit register: drop the two words just read
%endif
    mov     %1, %6w
    shr     %6, 16
    mov     %2, %6w
    movd    %6d, %5             ; scratch = words 2 and 3
    mov     %3, %6w
    shr     %6, 16
    mov     %4, %6w
%endif
%endmacro
59 | ||
; VC1_LOOP_FILTER_A0 p1, p0, q0, q1
;   in:  four registers of 16-bit words
;   out: %1 (p1) = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3
;   clobbers %2 (p0), which is left holding 5*(p0 - q0); %3 and %4 are only read
%macro VC1_LOOP_FILTER_A0 4
    psubw   %1, %4              ; p1 - q1
    psubw   %2, %3              ; p0 - q0
    paddw   %1, %1              ; 2*(p1 - q1)
    pmullw  %2, [pw_5]          ; 5*(p0 - q0)
    psubw   %1, %2
    paddw   %1, [pw_4]          ; rounding term ahead of the shift
    psraw   %1, 3
%endmacro
71 | ||
; VC1_FILTER size
;   in:  m0 = p0, m1 = q0, m7 = a0, m6 = a1, m5 = a2 (16-bit words)
;        r2d = pq, read with movd and widened to words below
;   %1:  size of the filter; values above 4 enable the 8-wide handling
;   out: m0 = p0', m1 = q0', packed back to bytes with unsigned saturation
;   clobbers m2-m7
%macro VC1_FILTER 1
    ; --- magnitudes and the first filtering condition ---
    PABSW       m4, m7              ; |a0|
    PABSW       m3, m6              ; |a1|
    PABSW       m2, m5              ; |a2|
    mova        m6, m4
    pminsw      m3, m2              ; a3 = min(|a1|, |a2|)
    pcmpgtw     m6, m3              ; if (a2 < a0 || a1 < a0)
    psubw       m3, m4
    pmullw      m3, [pw_5]          ; 5*(a3 - a0)
    PABSW       m2, m3
    pxor        m7, m3              ; d_sign ^= a0_sign
    psraw       m2, 3               ; abs(d/8)

    ; --- widen pq to one word per pixel and compare against |a0| ---
    movd        m3, r2d
    pxor        m5, m5
%if %1 > 4
    punpcklbw   m3, m3              ; 8-wide: double up the bytes of pq first
%endif
    punpcklbw   m3, m5              ; zero-extend the pq bytes to words
    pcmpgtw     m3, m4              ; if (a0 < pq)
    pand        m6, m3

    ; --- clip = |p0 - q0| >> 1 and its sign test ---
    mova        m3, m0
    psubw       m3, m1
    PABSW       m4, m3
    psraw       m4, 1
    pxor        m3, m7              ; d_sign ^ clip_sign
    psraw       m3, 15
    pminsw      m2, m4              ; min(d, clip)
    pcmpgtw     m4, m5              ; clip > 0
    pand        m6, m4              ; filt3 (C return value)

    ; each set of 4 pixels is not filtered if the 3rd is not:
    ; broadcast word 2 of every group of four mask words
%if mmsize != 16
    pshufw      m4, m6, 0xaa
%else
    pshuflw     m4, m6, 0xaa
%if %1 > 4
    pshufhw     m4, m4, 0xaa
%endif
%endif
    pandn       m3, m4
    pand        m2, m6
    pand        m3, m2              ; d final

    ; --- negate d where the sign word is set, then apply and repack ---
    psraw       m7, 15
    pxor        m3, m7
    psubw       m3, m7
    psubw       m0, m3
    packuswb    m0, m0
    paddw       m1, m3
    packuswb    m1, m1
%endmacro
129 | ||
; VC1_V_LOOP_FILTER size, mov_suffix
;   %1: size of the filter
;   %2: mov suffix equivalent to the filter size (d or q at the call sites)
; Expects r0/r4 to address the two groups of four rows, r1 = stride and
; r3 = 3*stride (see START_V_FILTER). Reads eight rows, writes back the
; filtered rows at [r4+r3] (p0) and [r0] (q0).
%macro VC1_V_LOOP_FILTER 2
    pxor        m5, m5              ; zero, for widening bytes to words

    ; four rows addressed from r4
    mov%2       m6, [r4]
    punpcklbw   m6, m5
    mov%2       m4, [r4+r1]
    punpcklbw   m4, m5
    mov%2       m7, [r4+2*r1]
    punpcklbw   m7, m5
    mov%2       m0, [r4+r3]         ; p0
    punpcklbw   m0, m5
    VC1_LOOP_FILTER_A0 m6, m4, m7, m0   ; m6 = a1

    ; first two rows addressed from r0
    mov%2       m1, [r0]            ; q0
    punpcklbw   m1, m5
    mov%2       m2, [r0+r1]
    punpcklbw   m2, m5
    mova        m4, m0              ; A0 clobbers its 2nd operand: pass a copy of p0
    VC1_LOOP_FILTER_A0 m7, m4, m1, m2   ; m7 = a0

    ; last two rows addressed from r0
    mov%2       m3, [r0+2*r1]
    punpcklbw   m3, m5
    mov%2       m4, [r0+r3]
    punpcklbw   m4, m5
    mova        m5, m1              ; m5 stops being zero here: copy of q0
    VC1_LOOP_FILTER_A0 m5, m2, m3, m4   ; m5 = a2

    VC1_FILTER  %1
    mov%2       [r4+r3], m0         ; p0'
    mov%2       [r0], m1            ; q0'
%endmacro
161 | ||
; VC1_H_LOOP_FILTER size [, tmp]
;   %1: size of the filter
;       NOTE(review): per the original note this is also the number of 8-bit
;       values that sit in half a register when UNPACK_8TO16 splits it
;   %2: (optional) temp register to use for storing words; when it is omitted
;       the words are stored by index instead (STORE_4_WORDS SSE4 path)
; Expects r1 = stride, r3 = 3*stride and, for the 8-row case, r4 = r0 + 4*stride
; (see START_H_FILTER). Each row is read from 4 bytes left of r0/r4; the
; filtered p0'/q0' pair of each row is written as one word at offset -1.
%macro VC1_H_LOOP_FILTER 1-2
%if %1 != 4
    ; eight rows: interleave row pairs bytewise, then transpose as words
    movq        m0, [r0     -4]
    movq        m4, [r0+  r1-4]
    punpcklbw   m0, m4
    movq        m1, [r0+2*r1-4]
    movq        m5, [r0+  r3-4]
    punpcklbw   m1, m5
    movq        m2, [r4     -4]
    movq        m6, [r4+  r1-4]
    punpcklbw   m2, m6
    movq        m3, [r4+2*r1-4]
    movq        m7, [r4+  r3-4]
    punpcklbw   m3, m7
    TRANSPOSE4x4W 0, 1, 2, 3, 4
%else
    ; four rows: a byte transpose is enough
    movq        m0, [r0     -4]
    movq        m1, [r0+  r1-4]
    movq        m2, [r0+2*r1-4]
    movq        m3, [r0+  r3-4]
    TRANSPOSE4x4B 0, 1, 2, 3, 4
%endif
    pxor        m5, m5              ; zero for UNPACK_8TO16

    UNPACK_8TO16 bw, 6, 0, 5
    UNPACK_8TO16 bw, 7, 1, 5
    VC1_LOOP_FILTER_A0 m6, m0, m7, m1   ; m6 = a1 (m0 is clobbered)
    UNPACK_8TO16 bw, 4, 2, 5
    mova        m0, m1                  ; m0 = p0
    VC1_LOOP_FILTER_A0 m7, m1, m4, m2   ; m7 = a0 (m1 is clobbered)
    UNPACK_8TO16 bw, 1, 3, 5
    mova        m5, m4                  ; m5 stops being zero here
    VC1_LOOP_FILTER_A0 m5, m2, m1, m3   ; m5 = a2
    SWAP 1, 4                           ; m1 = q0

    VC1_FILTER  %1
    punpcklbw   m0, m1              ; one (p0', q0') byte pair per row
%if %0 < 2
    ; no temp register: store by word index, rows at r0 then rows at r4
    STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0
    STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4
%else
    STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2
%if %1 > 4
    ; the first store left m0 shifted by two words; skip the two it read last
    psrldq      m0, 4
    STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2
%endif
%endif
%endmacro
213 | ||
214 | ||
; START_V_FILTER
; Common entry setup for the vertical loop filters.
;   in:  r0 = src, r1 = stride, r2 = pq
;   out: r1 = stride sign-extended to pointer width
;        r4 = src - 4*stride
;        r3 = 3*stride
;        r2 = pq * 0x01010101 (low byte of pq copied to all four bytes of r2d,
;             given that pq fits in a byte)
%macro START_V_FILTER 0
    ; The C prototypes pass stride as a 32-bit int; on x86-64 the upper half of
    ; the argument register is undefined, yet r1 is used below as a full-width
    ; address component. Sign-extend it first (expands to nothing on x86-32).
    movsxdifnidn r1, r1d
    mov     r4, r0
    lea     r3, [4*r1]
    sub     r4, r3              ; r4 = src - 4*stride
    lea     r3, [r1+2*r1]       ; r3 = 3*stride
    imul    r2, 0x01010101      ; only the low 32 bits (r2d) are read later
%endmacro
222 | ||
; START_H_FILTER size
; Common entry setup for the horizontal loop filters.
;   %1:  number of rows handled per VC1_H_LOOP_FILTER expansion
;   in:  r0 = src, r1 = stride, r2 = pq
;   out: r1 = stride sign-extended to pointer width
;        r3 = 3*stride
;        r4 = src + 4*stride (only when %1 > 4; otherwise r4 is left untouched)
;        r2 = pq * 0x01010101 (low byte of pq copied to all four bytes of r2d,
;             given that pq fits in a byte)
%macro START_H_FILTER 1
    ; The C prototypes pass stride as a 32-bit int; on x86-64 the upper half of
    ; the argument register is undefined, yet r1 is used below as a full-width
    ; address component. Sign-extend it first (expands to nothing on x86-32).
    movsxdifnidn r1, r1d
    lea     r3, [r1+2*r1]       ; r3 = 3*stride
%if %1 > 4
    lea     r4, [r0+4*r1]       ; second group of four rows
%endif
    imul    r2, 0x01010101      ; only the low 32 bits (r2d) are read later
%endmacro
230 | ||
; VC1_LF
; Emits the 4-wide filter bodies as two internal subroutines plus the four
; public entry points built on top of them. Instantiated below for MMXEXT.
%macro VC1_LF 0
; Internal bodies: reached with `call` from the entry points below, so they
; end in a plain `ret`. They rely on the registers prepared by
; START_V_FILTER / START_H_FILTER.
cglobal vc1_v_loop_filter_internal
    VC1_V_LOOP_FILTER 4, d
    ret

cglobal vc1_h_loop_filter_internal
    VC1_H_LOOP_FILTER 4, r4         ; r4 doubles as the word-store scratch register
    ret

; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter4, 3,5,0
    START_V_FILTER
    call    vc1_v_loop_filter_internal
    RET

; void ff_vc1_h_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter4, 3,5,0
    START_H_FILTER 4
    call    vc1_h_loop_filter_internal
    RET

; void ff_vc1_v_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
; Two 4-pixel passes; both row pointers move 4 bytes to the right in between.
cglobal vc1_v_loop_filter8, 3,5,0
    START_V_FILTER
    call    vc1_v_loop_filter_internal
    add     r4, 4
    add     r0, 4
    call    vc1_v_loop_filter_internal
    RET

; void ff_vc1_h_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
; Two 4-row passes; src moves down four rows in between. START_H_FILTER is
; given 4, so r4 is not set up as a row pointer and stays free as scratch.
cglobal vc1_h_loop_filter8, 3,5,0
    START_H_FILTER 4
    call    vc1_h_loop_filter_internal
    lea     r0, [r0+4*r1]
    call    vc1_h_loop_filter_internal
    RET
%endmacro
269 | ||
; MMXEXT build of the whole 4/8-wide function family defined by VC1_LF.
INIT_MMX mmxext
VC1_LF
272 | ||
; SSE2: 8-wide filters done in a single pass with 128-bit registers.
INIT_XMM sse2
; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8, 3,5,8
    START_V_FILTER
    VC1_V_LOOP_FILTER 8, q          ; 8 pixels per row, loaded with movq
    RET

; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
; Requests a sixth GPR: r4 is a row pointer here, so r5 is the store scratch.
cglobal vc1_h_loop_filter8, 3,6,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8, r5
    RET
285 | ||
; SSSE3 on 64-bit MMX registers: 4-wide filters expanded inline (no internal
; subroutine call, unlike the MMXEXT versions).
INIT_MMX ssse3
; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter4, 3,5,0
    START_V_FILTER
    VC1_V_LOOP_FILTER 4, d          ; 4 pixels per row, loaded with movd
    RET

; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter4, 3,5,0
    START_H_FILTER 4
    VC1_H_LOOP_FILTER 4, r4         ; r4 is not a row pointer for size 4: use it as scratch
    RET
298 | ||
; SSSE3 on 128-bit registers: same structure as the SSE2 functions, assembled
; with the ssse3 cpuflag set.
INIT_XMM ssse3
; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8, 3,5,8
    START_V_FILTER
    VC1_V_LOOP_FILTER 8, q          ; 8 pixels per row, loaded with movq
    RET

; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
; Requests a sixth GPR: r4 is a row pointer here, so r5 is the store scratch.
cglobal vc1_h_loop_filter8, 3,6,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8, r5
    RET
311 | ||
; SSE4: only the horizontal 8-wide filter gets its own version. No temp
; register is passed, so VC1_H_LOOP_FILTER stores by word index and five GPRs
; are enough.
INIT_XMM sse4
; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,5,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8
    RET