; Imported Debian version 2.4.3~trusty1
; Source path: [deb_ffmpeg.git] / ffmpeg / libavcodec / x86 / vc1dsp.asm
; (gitweb viewer residue removed; original commit hash: 2ba45a60)
1;******************************************************************************
2;* VC1 deblocking optimizations
3;* Copyright (c) 2009 David Conrad
4;*
5;* This file is part of FFmpeg.
6;*
7;* FFmpeg is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* FFmpeg is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with FFmpeg; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
22%include "libavutil/x86/x86util.asm"
23
24cextern pw_4
25cextern pw_5
26
27section .text
28
; Zero-extend one vector of 8-bit values to 16 bits, splitting it across
; two registers.
; %1: punpck size suffix (bw)
; %2: dst_low  — receives the zero-extended low half
; %3: src on entry, becomes dst_high (zero-extended high half)
; %4: register holding all-zero bytes
%macro UNPACK_8TO16 4
    mova      m%2, m%3            ; copy src so both halves survive
    punpckh%1 m%3, m%4            ; high half: interleave with zero = zero-extend
    punpckl%1 m%2, m%4            ; low half: interleave with zero
%endmacro
36
; Store 4 words from vector %5 to four independent (unaligned) memory
; destinations %1..%4.
; sse4:   %6 = index of the first word to extract; pextrw stores straight
;         to memory (SSE4.1 form), leaving %5 intact.
; others: %6 = scratch GPR (clobbered); words travel through the GPR two
;         at a time, and %5 is clobbered by the shift.
%macro STORE_4_WORDS 6
%if cpuflag(sse4)
    pextrw %1, %5, %6+0
    pextrw %2, %5, %6+1
    pextrw %3, %5, %6+2
    pextrw %4, %5, %6+3
%else
    movd %6d, %5                  ; words 0-1 -> GPR
%if mmsize==16
    psrldq %5, 4                  ; expose words 2-3 (xmm)
%else
    psrlq %5, 32                  ; expose words 2-3 (mmx)
%endif
    mov %1, %6w                   ; word 0
    shr %6, 16
    mov %2, %6w                   ; word 1
    movd %6d, %5                  ; words 2-3 -> GPR
    mov %3, %6w                   ; word 2
    shr %6, 16
    mov %4, %6w                   ; word 3
%endif
%endmacro
59
; Compute one VC-1 edge-activity value "a" over vectors of words.
; in: p1 p0 q0 q1 (%1..%4), clobbers p0 (%2)
; out: %1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3
%macro VC1_LOOP_FILTER_A0 4
    psubw  %1, %4                 ; p1 - q1
    psubw  %2, %3                 ; p0 - q0
    paddw  %1, %1                 ; 2*(p1 - q1)
    pmullw %2, [pw_5]             ; 5*(p0 - q0)
    psubw  %1, %2
    paddw  %1, [pw_4]             ; rounding bias before the shift
    psraw  %1, 3
%endmacro
71
; VC-1 in-loop deblocking decision + filter for one edge.
; in: p0 q0 a0 a1 a2
;     m0 m1 m7 m6 m5   (words; a0/a1/a2 are still signed here, pre-abs)
;     r2d = pq (quantizer) splatted into its low bytes by START_*_FILTER
; %1: size (4 or 8 pixels handled per invocation)
; out: m0=p0' m1=q0', both packed back down to bytes (low half)
; clobbers m2-m7
%macro VC1_FILTER 1
    PABSW   m4, m7                ; m4 = |a0|
    PABSW   m3, m6                ; m3 = |a1|
    PABSW   m2, m5                ; m2 = |a2|
    mova    m6, m4                ; keep |a0| for the comparison
    pminsw  m3, m2                ; a3 = min(|a1|, |a2|)
    pcmpgtw m6, m3 ; if (a2 < a0 || a1 < a0)
    psubw   m3, m4                ; a3 - a0 (negative when filtering applies)
    pmullw  m3, [pw_5] ; 5*(a3 - a0)
    PABSW   m2, m3
    psraw   m2, 3 ; abs(d/8)
    pxor    m7, m3 ; d_sign ^= a0_sign

    pxor      m5, m5
    movd      m3, r2d             ; pq bytes (pre-splatted)
%if %1 > 4
    punpcklbw m3, m3              ; widen the splat to 8 bytes
%endif
    punpcklbw m3, m5              ; pq as words
    pcmpgtw   m3, m4 ; if (a0 < pq)
    pand      m6, m3              ; combine both filter conditions

    mova    m3, m0
    psubw   m3, m1                ; p0 - q0
    PABSW   m4, m3
    psraw   m4, 1                 ; clip = |p0 - q0| >> 1
    pxor    m3, m7 ; d_sign ^ clip_sign
    psraw   m3, 15                ; all-ones where the signs disagree
    pminsw  m2, m4 ; min(d, clip)
    pcmpgtw m4, m5                ; clip > 0 (zero clip suppresses filtering)
    pand    m6, m4 ; filt3 (C return value)

; each set of 4 pixels is not filtered if the 3rd is not
%if mmsize==16
    pshuflw m4, m6, 0xaa          ; broadcast word 2 of each 4-word group
%if %1 > 4
    pshufhw m4, m4, 0xaa
%endif
%else
    pshufw  m4, m6, 0xaa
%endif
    pandn   m3, m4
    pand    m2, m6
    pand    m3, m2 ; d final

    psraw    m7, 15               ; sign mask of d
    pxor     m3, m7               ; conditional negate: d = d_sign ? -d : d
    psubw    m3, m7
    psubw    m0, m3               ; p0' = p0 - d
    paddw    m1, m3               ; q0' = q0 + d
    packuswb m0, m0               ; back to bytes, unsigned saturation
    packuswb m1, m1
%endmacro
129
; Vertical loop filter (horizontal edge): filters the edge between row
; [r4+r3] (p0) and row [r0] (q0), reading 4 rows on each side.
; Expects r0 = src, r4 = src - 4*stride, r1 = stride, r3 = 3*stride,
; r2 = splatted pq (all set up by START_V_FILTER).
; 1st param: size of filter
; 2nd param: mov suffix equivalent to the filter size (d for 4, q for 8)
%macro VC1_V_LOOP_FILTER 2
    pxor      m5, m5
    mov%2     m6, [r4]            ; four rows above the edge
    mov%2     m4, [r4+r1]
    mov%2     m7, [r4+2*r1]
    mov%2     m0, [r4+r3]         ; p0
    punpcklbw m6, m5              ; widen every row to words
    punpcklbw m4, m5
    punpcklbw m7, m5
    punpcklbw m0, m5

    VC1_LOOP_FILTER_A0 m6, m4, m7, m0   ; m6 = a1 (upper 4 rows)
    mov%2     m1, [r0]            ; q0
    mov%2     m2, [r0+r1]
    punpcklbw m1, m5
    punpcklbw m2, m5
    mova      m4, m0              ; copy p0: A0 clobbers its 2nd argument
    VC1_LOOP_FILTER_A0 m7, m4, m1, m2   ; m7 = a0 (rows straddling the edge)
    mov%2     m3, [r0+2*r1]
    mov%2     m4, [r0+r3]
    punpcklbw m3, m5
    punpcklbw m4, m5
    mova      m5, m1              ; copy q0 for the same reason
    VC1_LOOP_FILTER_A0 m5, m2, m3, m4   ; m5 = a2 (lower 4 rows)

    VC1_FILTER %1                 ; m0 = p0', m1 = q0'
    mov%2 [r4+r3], m0             ; write back filtered p0
    mov%2 [r0], m1                ; write back filtered q0
%endmacro
161
; Horizontal loop filter (vertical edge at column r0): loads the 8 pixels
; around the edge from each row, transposes them into registers, filters,
; then stores the two changed columns (p0'/q0') back with STORE_4_WORDS.
; Expects r1 = stride, r3 = 3*stride, and for %1 > 4 also
; r4 = src + 4*stride (set up by START_H_FILTER); r2 = splatted pq.
; 1st param: size of filter
; NOTE: UNPACK_8TO16 this number of 8 bit numbers are in half a register
; 2nd (optional) param: temp register to use for storing words; omitted
;     on sse4, where STORE_4_WORDS takes a pextrw start index instead
%macro VC1_H_LOOP_FILTER 1-2
%if %1 == 4
    movq m0, [r0 -4]              ; 4 rows x 8 columns around the edge
    movq m1, [r0+ r1-4]
    movq m2, [r0+2*r1-4]
    movq m3, [r0+ r3-4]
    TRANSPOSE4x4B 0, 1, 2, 3, 4
%else
    movq m0, [r0 -4]              ; 8 rows: interleave row pairs bytewise,
    movq m4, [r0+ r1-4]           ; then a word transpose finishes the job
    movq m1, [r0+2*r1-4]
    movq m5, [r0+ r3-4]
    movq m2, [r4 -4]
    movq m6, [r4+ r1-4]
    movq m3, [r4+2*r1-4]
    movq m7, [r4+ r3-4]
    punpcklbw m0, m4
    punpcklbw m1, m5
    punpcklbw m2, m6
    punpcklbw m3, m7
    TRANSPOSE4x4W 0, 1, 2, 3, 4
%endif
    pxor m5, m5                   ; zero register for the unpacks

    UNPACK_8TO16 bw, 6, 0, 5      ; widen transposed columns to words
    UNPACK_8TO16 bw, 7, 1, 5
    VC1_LOOP_FILTER_A0 m6, m0, m7, m1   ; m6 = a1
    UNPACK_8TO16 bw, 4, 2, 5
    mova m0, m1 ; m0 = p0
    VC1_LOOP_FILTER_A0 m7, m1, m4, m2   ; m7 = a0
    UNPACK_8TO16 bw, 1, 3, 5
    mova m5, m4                   ; copy q0: A0 clobbers its 1st argument
    VC1_LOOP_FILTER_A0 m5, m2, m1, m3   ; m5 = a2
    SWAP 1, 4 ; m1 = q0

    VC1_FILTER %1                 ; m0 = p0', m1 = q0'
    punpcklbw m0, m1              ; interleave p0'/q0' into byte pairs
%if %0 > 1
    STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2
%if %1 > 4
    psrldq m0, 4                  ; advance to the next four row pairs
    STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2
%endif
%else
; sse4 path: final argument is the starting pextrw word index
    STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0
    STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4
%endif
%endmacro
213
214
; Register setup shared by the vertical filters:
; r4 = src - 4*stride (top of the 8-row window), r3 = 3*stride,
; r2 = pq splatted across its low bytes for VC1_FILTER's movd.
%macro START_V_FILTER 0
    mov  r4, r0
    lea  r3, [4*r1]
    sub  r4, r3                   ; r4 = src - 4*stride
    lea  r3, [r1+2*r1]            ; r3 = 3*stride
    imul r2, 0x01010101           ; splat pq byte into every byte of r2d
%endmacro
222
; Register setup shared by the horizontal filters:
; r3 = 3*stride; for the 8-row variants, r4 = src + 4*stride (second
; half of the rows); r2 = pq splatted across its low bytes.
; %1: filter size (4 or 8)
%macro START_H_FILTER 1
    lea r3, [r1+2*r1]             ; r3 = 3*stride
%if %1 > 4
    lea r4, [r0+4*r1]             ; r4 = start of the lower 4 rows
%endif
    imul r2, 0x01010101           ; splat pq byte into every byte of r2d
%endmacro
230
; Instantiate, for the current instruction set (selected by INIT_*),
; two non-ABI internal helpers plus the four public entry points:
;   void ff_vc1_{v,h}_loop_filter{4,8}_<isa>(uint8_t *src, int stride, int pq)
; The 8-pixel variants run the internal 4-pixel helper twice.
%macro VC1_LF 0
cglobal vc1_v_loop_filter_internal
    VC1_V_LOOP_FILTER 4, d
    ret

cglobal vc1_h_loop_filter_internal
    VC1_H_LOOP_FILTER 4, r4       ; r4 = scratch GPR for STORE_4_WORDS
    ret

; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter4, 3,5,0
    START_V_FILTER
    call vc1_v_loop_filter_internal
    RET

; void ff_vc1_h_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter4, 3,5,0
    START_H_FILTER 4
    call vc1_h_loop_filter_internal
    RET

; void ff_vc1_v_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
; two 4-pixel passes, offset 4 pixels apart along the edge
cglobal vc1_v_loop_filter8, 3,5,0
    START_V_FILTER
    call vc1_v_loop_filter_internal
    add r4, 4
    add r0, 4
    call vc1_v_loop_filter_internal
    RET

; void ff_vc1_h_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
; two 4-row passes, offset 4 rows apart down the edge
cglobal vc1_h_loop_filter8, 3,5,0
    START_H_FILTER 4
    call vc1_h_loop_filter_internal
    lea r0, [r0+4*r1]
    call vc1_h_loop_filter_internal
    RET
%endmacro
269
; mmxext: instantiate all four entry points via the 4-pixel internal helpers
INIT_MMX mmxext
VC1_LF
272
INIT_XMM sse2
; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
; 16-byte registers: all 8 pixels of the edge filtered in a single pass
cglobal vc1_v_loop_filter8, 3,5,8
    START_V_FILTER
    VC1_V_LOOP_FILTER 8, q
    RET

; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
; needs 6 GPRs: r5 is the store scratch for STORE_4_WORDS
cglobal vc1_h_loop_filter8, 3,6,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8, r5
    RET
285
; ssse3 MMX: same flow as mmxext, but PABSW maps to the native pabsw
INIT_MMX ssse3
; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter4, 3,5,0
    START_V_FILTER
    VC1_V_LOOP_FILTER 4, d
    RET

; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter4, 3,5,0
    START_H_FILTER 4
    VC1_H_LOOP_FILTER 4, r4       ; r4 = scratch GPR for STORE_4_WORDS
    RET
298
; ssse3 XMM: single-pass 8-pixel variants with native pabsw
INIT_XMM ssse3
; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8, 3,5,8
    START_V_FILTER
    VC1_V_LOOP_FILTER 8, q
    RET

; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,6,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8, r5       ; r5 = scratch GPR for STORE_4_WORDS
    RET
311
INIT_XMM sse4
; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
; no store-scratch GPR needed: pextrw writes words straight to memory,
; so only 5 GPRs are used (see STORE_4_WORDS sse4 path)
cglobal vc1_h_loop_filter8, 3,5,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8
    RET