Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libavcodec / x86 / vp9mc.asm
;******************************************************************************
;* VP9 MC SIMD optimizations
;*
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_256

%macro F8_TAPS 8
times 8 db %1, %2
times 8 db %3, %4
times 8 db %5, %6
times 8 db %7, %8
%endmacro
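; Each F8_TAPS row stores one 8-tap filter as four pairs of adjacent taps,
; each pair repeated 8 times, so a single pmaddubsw can multiply eight
; interleaved pixel pairs against one 16-byte tap register. The table below
; holds the three VP9 filter banks (smooth, regular, sharp) for fractional
; positions 1..15; position 0 (full-pel) needs no filtering and is handled
; by the fpel functions at the end of this file.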
; int8_t ff_filters_ssse3[3][15][4][16]
const filters_ssse3 ; smooth
        F8_TAPS -3, -1, 32, 64, 38, 1, -3, 0
        F8_TAPS -2, -2, 29, 63, 41, 2, -3, 0
        F8_TAPS -2, -2, 26, 63, 43, 4, -4, 0
        F8_TAPS -2, -3, 24, 62, 46, 5, -4, 0
        F8_TAPS -2, -3, 21, 60, 49, 7, -4, 0
        F8_TAPS -1, -4, 18, 59, 51, 9, -4, 0
        F8_TAPS -1, -4, 16, 57, 53, 12, -4, -1
        F8_TAPS -1, -4, 14, 55, 55, 14, -4, -1
        F8_TAPS -1, -4, 12, 53, 57, 16, -4, -1
        F8_TAPS 0, -4, 9, 51, 59, 18, -4, -1
        F8_TAPS 0, -4, 7, 49, 60, 21, -3, -2
        F8_TAPS 0, -4, 5, 46, 62, 24, -3, -2
        F8_TAPS 0, -4, 4, 43, 63, 26, -2, -2
        F8_TAPS 0, -3, 2, 41, 63, 29, -2, -2
        F8_TAPS 0, -3, 1, 38, 64, 32, -1, -3
        ; regular
        F8_TAPS 0, 1, -5, 126, 8, -3, 1, 0
        F8_TAPS -1, 3, -10, 122, 18, -6, 2, 0
        F8_TAPS -1, 4, -13, 118, 27, -9, 3, -1
        F8_TAPS -1, 4, -16, 112, 37, -11, 4, -1
        F8_TAPS -1, 5, -18, 105, 48, -14, 4, -1
        F8_TAPS -1, 5, -19, 97, 58, -16, 5, -1
        F8_TAPS -1, 6, -19, 88, 68, -18, 5, -1
        F8_TAPS -1, 6, -19, 78, 78, -19, 6, -1
        F8_TAPS -1, 5, -18, 68, 88, -19, 6, -1
        F8_TAPS -1, 5, -16, 58, 97, -19, 5, -1
        F8_TAPS -1, 4, -14, 48, 105, -18, 5, -1
        F8_TAPS -1, 4, -11, 37, 112, -16, 4, -1
        F8_TAPS -1, 3, -9, 27, 118, -13, 4, -1
        F8_TAPS 0, 2, -6, 18, 122, -10, 3, -1
        F8_TAPS 0, 1, -3, 8, 126, -5, 1, 0
        ; sharp
        F8_TAPS -1, 3, -7, 127, 8, -3, 1, 0
        F8_TAPS -2, 5, -13, 125, 17, -6, 3, -1
        F8_TAPS -3, 7, -17, 121, 27, -10, 5, -2
        F8_TAPS -4, 9, -20, 115, 37, -13, 6, -2
        F8_TAPS -4, 10, -23, 108, 48, -16, 8, -3
        F8_TAPS -4, 10, -24, 100, 59, -19, 9, -3
        F8_TAPS -4, 11, -24, 90, 70, -21, 10, -4
        F8_TAPS -4, 11, -23, 80, 80, -23, 11, -4
        F8_TAPS -4, 10, -21, 70, 90, -24, 11, -4
        F8_TAPS -3, 9, -19, 59, 100, -24, 10, -4
        F8_TAPS -3, 8, -16, 48, 108, -23, 10, -4
        F8_TAPS -2, 6, -13, 37, 115, -20, 9, -4
        F8_TAPS -2, 5, -10, 27, 121, -17, 7, -3
        F8_TAPS -1, 3, -6, 17, 125, -13, 5, -2
        F8_TAPS 0, 1, -3, 8, 127, -7, 3, -1

SECTION .text

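; 1-D horizontal 8-tap subpel filter, %1 = put or avg. The block width is
; half a register: 4 pixels per row with MMX, 8 with XMM. For each output
; pixel the eight inputs srcq-3..srcq+4 are interleaved with punpcklbw,
; multiplied against the packed tap pairs with pmaddubsw and summed;
; pmulhrsw against pw_256 then applies the VP9 rounding (x + 64) >> 7.
; When only 8 SIMD registers are available (x86-32 or MMX), the upper
; three tap registers are re-read from memory inside the loop.
; Roughly equivalent scalar C, for illustration only (not built):
;   int sum = 0;
;   for (int i = -3; i <= 4; i++)
;       sum += src[x + i] * filter[i + 3];
;   dst[x] = av_clip_uint8((sum + 64) >> 7);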
%macro filter_h_fn 1
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, dstride, src, sstride, h, filtery
    mova      m6, [pw_256]
    mova      m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova      m8, [filteryq+16]
    mova      m9, [filteryq+32]
    mova      m10, [filteryq+48]
%endif
.loop:
    movh      m0, [srcq-3]
    movh      m1, [srcq-2]
    movh      m2, [srcq-1]
    movh      m3, [srcq+0]
    movh      m4, [srcq+1]
    movh      m5, [srcq+2]
    punpcklbw m0, m1
    punpcklbw m2, m3
    movh      m1, [srcq+3]
    movh      m3, [srcq+4]
    add       srcq, sstrideq
    punpcklbw m4, m5
    punpcklbw m1, m3
    pmaddubsw m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddubsw m2, m8
    pmaddubsw m4, m9
    pmaddubsw m1, m10
%else
    pmaddubsw m2, [filteryq+16]
    pmaddubsw m4, [filteryq+32]
    pmaddubsw m1, [filteryq+48]
%endif
    paddw     m0, m2
    paddw     m4, m1
    paddsw    m0, m4
    pmulhrsw  m0, m6
%ifidn %1, avg
    movh      m1, [dstq]
%endif
    packuswb  m0, m0
%ifidn %1, avg
    pavgb     m0, m1
%endif
    movh      [dstq], m0
    add       dstq, dstrideq
    dec       hd
    jg .loop
    RET
%endmacro

INIT_MMX ssse3
filter_h_fn put
filter_h_fn avg

INIT_XMM ssse3
filter_h_fn put
filter_h_fn avg

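; Full-register-width horizontal variant: 16 pixels per row with XMM. It
; keeps all four tap registers and both halves of every pixel pair live at
; once (14 XMM registers), so it is only built on x86-64.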
%if ARCH_X86_64
%macro filter_hx2_fn 1
%assign %%px mmsize
cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 14, dst, dstride, src, sstride, h, filtery
    mova      m13, [pw_256]
    mova      m8, [filteryq+ 0]
    mova      m9, [filteryq+16]
    mova      m10, [filteryq+32]
    mova      m11, [filteryq+48]
.loop:
    movu      m0, [srcq-3]
    movu      m1, [srcq-2]
    movu      m2, [srcq-1]
    movu      m3, [srcq+0]
    movu      m4, [srcq+1]
    movu      m5, [srcq+2]
    movu      m6, [srcq+3]
    movu      m7, [srcq+4]
    add       srcq, sstrideq
    SBUTTERFLY bw, 0, 1, 12
    SBUTTERFLY bw, 2, 3, 12
    SBUTTERFLY bw, 4, 5, 12
    SBUTTERFLY bw, 6, 7, 12
    pmaddubsw m0, m8
    pmaddubsw m1, m8
    pmaddubsw m2, m9
    pmaddubsw m3, m9
    pmaddubsw m4, m10
    pmaddubsw m5, m10
    pmaddubsw m6, m11
    pmaddubsw m7, m11
    paddw     m0, m2
    paddw     m1, m3
    paddw     m4, m6
    paddw     m5, m7
    paddsw    m0, m4
    paddsw    m1, m5
    pmulhrsw  m0, m13
    pmulhrsw  m1, m13
    packuswb  m0, m1
%ifidn %1, avg
    pavgb     m0, [dstq]
%endif
    mova      [dstq], m0
    add       dstq, dstrideq
    dec       hd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
filter_hx2_fn put
filter_hx2_fn avg

%endif ; ARCH_X86_64

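; 1-D vertical 8-tap subpel filter, %1 = put or avg. Same arithmetic as the
; horizontal version, but the eight inputs for each column come from eight
; consecutive rows: srcq is moved three rows above the output row and src4q
; one row below it, so srcq/src4q plus 0..3 strides cover taps -3..+4.
; On x86-32 only four register arguments are available, so filtery and h
; are read from their stack slots (r5mp/r4mp).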
%macro filter_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
    mov       filteryq, r5mp
%define hd r4mp
%endif
    mova      m6, [pw_256]
    lea       sstride3q, [sstrideq*3]
    lea       src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova      m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova      m8, [filteryq+16]
    mova      m9, [filteryq+32]
    mova      m10, [filteryq+48]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movh      m0, [srcq]
    movh      m1, [srcq+sstrideq]
    movh      m2, [srcq+sstrideq*2]
    movh      m3, [srcq+sstride3q]
    movh      m4, [src4q]
    movh      m5, [src4q+sstrideq]
    punpcklbw m0, m1
    punpcklbw m2, m3
    movh      m1, [src4q+sstrideq*2]
    movh      m3, [src4q+sstride3q]
    add       srcq, sstrideq
    add       src4q, sstrideq
    punpcklbw m4, m5
    punpcklbw m1, m3
    pmaddubsw m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddubsw m2, m8
    pmaddubsw m4, m9
    pmaddubsw m1, m10
%else
    pmaddubsw m2, [filteryq+16]
    pmaddubsw m4, [filteryq+32]
    pmaddubsw m1, [filteryq+48]
%endif
    paddw     m0, m2
    paddw     m4, m1
    paddsw    m0, m4
    pmulhrsw  m0, m6
%ifidn %1, avg
    movh      m1, [dstq]
%endif
    packuswb  m0, m0
%ifidn %1, avg
    pavgb     m0, m1
%endif
    movh      [dstq], m0
    add       dstq, dstrideq
    dec       hd
    jg .loop
    RET
%endmacro

INIT_MMX ssse3
filter_v_fn put
filter_v_fn avg

INIT_XMM ssse3
filter_v_fn put
filter_v_fn avg

%if ARCH_X86_64

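; Full-register-width vertical variant (16 pixels per row with XMM); as with
; filter_hx2_fn it needs 14 XMM registers, hence x86-64 only.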
%macro filter_vx2_fn 1
%assign %%px mmsize
cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
    mova      m13, [pw_256]
    lea       sstride3q, [sstrideq*3]
    lea       src4q, [srcq+sstrideq]
    sub       srcq, sstride3q
    mova      m8, [filteryq+ 0]
    mova      m9, [filteryq+16]
    mova      m10, [filteryq+32]
    mova      m11, [filteryq+48]
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movu      m0, [srcq]
    movu      m1, [srcq+sstrideq]
    movu      m2, [srcq+sstrideq*2]
    movu      m3, [srcq+sstride3q]
    movu      m4, [src4q]
    movu      m5, [src4q+sstrideq]
    movu      m6, [src4q+sstrideq*2]
    movu      m7, [src4q+sstride3q]
    add       srcq, sstrideq
    add       src4q, sstrideq
    SBUTTERFLY bw, 0, 1, 12
    SBUTTERFLY bw, 2, 3, 12
    SBUTTERFLY bw, 4, 5, 12
    SBUTTERFLY bw, 6, 7, 12
    pmaddubsw m0, m8
    pmaddubsw m1, m8
    pmaddubsw m2, m9
    pmaddubsw m3, m9
    pmaddubsw m4, m10
    pmaddubsw m5, m10
    pmaddubsw m6, m11
    pmaddubsw m7, m11
    paddw     m0, m2
    paddw     m1, m3
    paddw     m4, m6
    paddw     m5, m7
    paddsw    m0, m4
    paddsw    m1, m5
    pmulhrsw  m0, m13
    pmulhrsw  m1, m13
    packuswb  m0, m1
%ifidn %1, avg
    pavgb     m0, [dstq]
%endif
    mova      [dstq], m0
    add       dstq, dstrideq
    dec       hd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
filter_vx2_fn put
filter_vx2_fn avg

%endif ; ARCH_X86_64

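; Full-pel copy/average. %1 = put or avg, %2 = block width in pixels,
; %3-%5 = offsets of the 2nd/3rd/4th load-store of an iteration (row strides
; for widths up to 16, column offsets within a row for 32/64), %6 = number
; of rows advanced per iteration. 4-pixel blocks move 4 bytes at a time;
; wider blocks use unaligned loads and aligned stores.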
%macro fpel_fn 6
%if %2 == 4
%define %%srcfn movh
%define %%dstfn movh
%else
%define %%srcfn movu
%define %%dstfn mova
%endif

%if %2 <= 16
cglobal vp9_%1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
    lea       sstride3q, [sstrideq*3]
    lea       dstride3q, [dstrideq*3]
%else
cglobal vp9_%1%2, 5, 5, 4, dst, dstride, src, sstride, h
%endif
.loop:
    %%srcfn   m0, [srcq]
    %%srcfn   m1, [srcq+s%3]
    %%srcfn   m2, [srcq+s%4]
    %%srcfn   m3, [srcq+s%5]
    lea       srcq, [srcq+sstrideq*%6]
%ifidn %1, avg
    pavgb     m0, [dstq]
    pavgb     m1, [dstq+d%3]
    pavgb     m2, [dstq+d%4]
    pavgb     m3, [dstq+d%5]
%endif
    %%dstfn   [dstq], m0
    %%dstfn   [dstq+d%3], m1
    %%dstfn   [dstq+d%4], m2
    %%dstfn   [dstq+d%5], m3
    lea       dstq, [dstq+dstrideq*%6]
    sub       hd, %6
    jnz .loop
    RET
%endmacro

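; The s16/d16 defines make the offset arguments work for both pointers:
; s%3/d%3 expand to sstrideq/dstrideq when the argument is strideq, and to
; the plain constant 16 (via s16/d16) when it is mmsize. put only needs
; plain loads/stores (mmx/sse); avg needs pavgb, hence mmxext for the MMX
; sizes and sse2 for the XMM sizes.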
%define d16 16
%define s16 16
INIT_MMX mmx
fpel_fn put, 4, strideq, strideq*2, stride3q, 4
fpel_fn put, 8, strideq, strideq*2, stride3q, 4
INIT_MMX mmxext
fpel_fn avg, 4, strideq, strideq*2, stride3q, 4
fpel_fn avg, 8, strideq, strideq*2, stride3q, 4
INIT_XMM sse
fpel_fn put, 16, strideq, strideq*2, stride3q, 4
fpel_fn put, 32, mmsize, strideq, strideq+mmsize, 2
fpel_fn put, 64, mmsize, mmsize*2, mmsize*3, 1
INIT_XMM sse2
fpel_fn avg, 16, strideq, strideq*2, stride3q, 4
fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2
fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1
%undef s16
%undef d16