;******************************************************************************
;* VP9 MC SIMD optimizations
;*
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

cextern pw_256

%macro F8_TAPS 8
times 16 db %1, %2
times 16 db %3, %4
times 16 db %5, %6
times 16 db %7, %8
%endmacro
; int8_t ff_filters_ssse3[3][15][4][32]
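; Each F8_TAPS row expands to 4x32 bytes: one tap pair broadcast across a full
; register per vector, so pmaddubsw can multiply adjacent source pixels by the
; matching coefficient pair. All eight taps of a phase sum to 128, i.e. unity
; gain at the 7-bit fixed-point precision used below.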
const filters_ssse3 ; smooth
        F8_TAPS -3, -1, 32, 64, 38, 1, -3, 0
        F8_TAPS -2, -2, 29, 63, 41, 2, -3, 0
        F8_TAPS -2, -2, 26, 63, 43, 4, -4, 0
        F8_TAPS -2, -3, 24, 62, 46, 5, -4, 0
        F8_TAPS -2, -3, 21, 60, 49, 7, -4, 0
        F8_TAPS -1, -4, 18, 59, 51, 9, -4, 0
        F8_TAPS -1, -4, 16, 57, 53, 12, -4, -1
        F8_TAPS -1, -4, 14, 55, 55, 14, -4, -1
        F8_TAPS -1, -4, 12, 53, 57, 16, -4, -1
        F8_TAPS 0, -4, 9, 51, 59, 18, -4, -1
        F8_TAPS 0, -4, 7, 49, 60, 21, -3, -2
        F8_TAPS 0, -4, 5, 46, 62, 24, -3, -2
        F8_TAPS 0, -4, 4, 43, 63, 26, -2, -2
        F8_TAPS 0, -3, 2, 41, 63, 29, -2, -2
        F8_TAPS 0, -3, 1, 38, 64, 32, -1, -3
        ; regular
        F8_TAPS 0, 1, -5, 126, 8, -3, 1, 0
        F8_TAPS -1, 3, -10, 122, 18, -6, 2, 0
        F8_TAPS -1, 4, -13, 118, 27, -9, 3, -1
        F8_TAPS -1, 4, -16, 112, 37, -11, 4, -1
        F8_TAPS -1, 5, -18, 105, 48, -14, 4, -1
        F8_TAPS -1, 5, -19, 97, 58, -16, 5, -1
        F8_TAPS -1, 6, -19, 88, 68, -18, 5, -1
        F8_TAPS -1, 6, -19, 78, 78, -19, 6, -1
        F8_TAPS -1, 5, -18, 68, 88, -19, 6, -1
        F8_TAPS -1, 5, -16, 58, 97, -19, 5, -1
        F8_TAPS -1, 4, -14, 48, 105, -18, 5, -1
        F8_TAPS -1, 4, -11, 37, 112, -16, 4, -1
        F8_TAPS -1, 3, -9, 27, 118, -13, 4, -1
        F8_TAPS 0, 2, -6, 18, 122, -10, 3, -1
        F8_TAPS 0, 1, -3, 8, 126, -5, 1, 0
        ; sharp
        F8_TAPS -1, 3, -7, 127, 8, -3, 1, 0
        F8_TAPS -2, 5, -13, 125, 17, -6, 3, -1
        F8_TAPS -3, 7, -17, 121, 27, -10, 5, -2
        F8_TAPS -4, 9, -20, 115, 37, -13, 6, -2
        F8_TAPS -4, 10, -23, 108, 48, -16, 8, -3
        F8_TAPS -4, 10, -24, 100, 59, -19, 9, -3
        F8_TAPS -4, 11, -24, 90, 70, -21, 10, -4
        F8_TAPS -4, 11, -23, 80, 80, -23, 11, -4
        F8_TAPS -4, 10, -21, 70, 90, -24, 11, -4
        F8_TAPS -3, 9, -19, 59, 100, -24, 10, -4
        F8_TAPS -3, 8, -16, 48, 108, -23, 10, -4
        F8_TAPS -2, 6, -13, 37, 115, -20, 9, -4
        F8_TAPS -2, 5, -10, 27, 121, -17, 7, -3
        F8_TAPS -1, 3, -6, 17, 125, -13, 5, -2
        F8_TAPS 0, 1, -3, 8, 127, -7, 3, -1

SECTION .text

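; 8-tap horizontal filter, mmsize/2 pixels per row (4 for MMX, 8 for XMM).
; Eight overlapping loads are interleaved pairwise with punpcklbw, multiplied
; against the four tap-pair vectors with pmaddubsw, summed, and rounded back
; to 8 bits with pmulhrsw by pw_256, i.e. (sum + 64) >> 7. %1 selects put or
; avg; avg additionally averages the result with the destination via pavgb.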
%macro filter_h_fn 1
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, dstride, src, sstride, h, filtery
    mova m6, [pw_256]
    mova m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova m8, [filteryq+32]
    mova m9, [filteryq+64]
    mova m10, [filteryq+96]
%endif
.loop:
    movh m0, [srcq-3]
    movh m1, [srcq-2]
    movh m2, [srcq-1]
    movh m3, [srcq+0]
    movh m4, [srcq+1]
    movh m5, [srcq+2]
    punpcklbw m0, m1
    punpcklbw m2, m3
    movh m1, [srcq+3]
    movh m3, [srcq+4]
    add srcq, sstrideq
    punpcklbw m4, m5
    punpcklbw m1, m3
    pmaddubsw m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddubsw m2, m8
    pmaddubsw m4, m9
    pmaddubsw m1, m10
%else
    pmaddubsw m2, [filteryq+32]
    pmaddubsw m4, [filteryq+64]
    pmaddubsw m1, [filteryq+96]
%endif
    paddw m0, m2
    paddw m4, m1
    paddsw m0, m4
    pmulhrsw m0, m6
%ifidn %1, avg
    movh m1, [dstq]
%endif
    packuswb m0, m0
%ifidn %1, avg
    pavgb m0, m1
%endif
    movh [dstq], m0
    add dstq, dstrideq
    dec hd
    jg .loop
    RET
%endmacro

INIT_MMX ssse3
filter_h_fn put
filter_h_fn avg

INIT_XMM ssse3
filter_h_fn put
filter_h_fn avg

%if ARCH_X86_64
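; Full-register-width horizontal variant (mmsize pixels per row), x86-64 only
; since it needs SIMD registers m8-m13: each batch of rows is split into two
; interleaved halves with SBUTTERFLY, filtered in parallel and repacked with
; packuswb.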
%macro filter_hx2_fn 1
%assign %%px mmsize
cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 14, dst, dstride, src, sstride, h, filtery
    mova m13, [pw_256]
    mova m8, [filteryq+ 0]
    mova m9, [filteryq+32]
    mova m10, [filteryq+64]
    mova m11, [filteryq+96]
.loop:
    movu m0, [srcq-3]
    movu m1, [srcq-2]
    movu m2, [srcq-1]
    movu m3, [srcq+0]
    movu m4, [srcq+1]
    movu m5, [srcq+2]
    movu m6, [srcq+3]
    movu m7, [srcq+4]
    add srcq, sstrideq
    SBUTTERFLY bw, 0, 1, 12
    SBUTTERFLY bw, 2, 3, 12
    SBUTTERFLY bw, 4, 5, 12
    SBUTTERFLY bw, 6, 7, 12
    pmaddubsw m0, m8
    pmaddubsw m1, m8
    pmaddubsw m2, m9
    pmaddubsw m3, m9
    pmaddubsw m4, m10
    pmaddubsw m5, m10
    pmaddubsw m6, m11
    pmaddubsw m7, m11
    paddw m0, m2
    paddw m1, m3
    paddw m4, m6
    paddw m5, m7
    paddsw m0, m4
    paddsw m1, m5
    pmulhrsw m0, m13
    pmulhrsw m1, m13
    packuswb m0, m1
%ifidn %1, avg
    pavgb m0, [dstq]
%endif
    mova [dstq], m0
    add dstq, dstrideq
    dec hd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
filter_hx2_fn put
filter_hx2_fn avg

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_hx2_fn put
filter_hx2_fn avg
%endif

%endif ; ARCH_X86_64

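; 8-tap vertical filter, mmsize/2 pixels per row. Same arithmetic as the
; horizontal filter, but the eight taps span eight source rows addressed via
; srcq, src4q and sstride3q. On x86-32 the filtery and h arguments are taken
; from the stack (r5mp/r4mp) to free up registers.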
%macro filter_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
    mov filteryq, r5mp
%define hd r4mp
%endif
    mova m6, [pw_256]
    lea sstride3q, [sstrideq*3]
    lea src4q, [srcq+sstrideq]
    sub srcq, sstride3q
    mova m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova m8, [filteryq+32]
    mova m9, [filteryq+64]
    mova m10, [filteryq+96]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movh m0, [srcq]
    movh m1, [srcq+sstrideq]
    movh m2, [srcq+sstrideq*2]
    movh m3, [srcq+sstride3q]
    movh m4, [src4q]
    movh m5, [src4q+sstrideq]
    punpcklbw m0, m1
    punpcklbw m2, m3
    movh m1, [src4q+sstrideq*2]
    movh m3, [src4q+sstride3q]
    add srcq, sstrideq
    add src4q, sstrideq
    punpcklbw m4, m5
    punpcklbw m1, m3
    pmaddubsw m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddubsw m2, m8
    pmaddubsw m4, m9
    pmaddubsw m1, m10
%else
    pmaddubsw m2, [filteryq+32]
    pmaddubsw m4, [filteryq+64]
    pmaddubsw m1, [filteryq+96]
%endif
    paddw m0, m2
    paddw m4, m1
    paddsw m0, m4
    pmulhrsw m0, m6
%ifidn %1, avg
    movh m1, [dstq]
%endif
    packuswb m0, m0
%ifidn %1, avg
    pavgb m0, m1
%endif
    movh [dstq], m0
    add dstq, dstrideq
    dec hd
    jg .loop
    RET
%endmacro

INIT_MMX ssse3
filter_v_fn put
filter_v_fn avg

INIT_XMM ssse3
filter_v_fn put
filter_v_fn avg

%if ARCH_X86_64

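; Full-register-width vertical variant (x86-64 only), mirroring filter_hx2_fn:
; eight whole rows are loaded, interleaved with SBUTTERFLY and filtered as two
; parallel halves.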
%macro filter_vx2_fn 1
%assign %%px mmsize
cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
    mova m13, [pw_256]
    lea sstride3q, [sstrideq*3]
    lea src4q, [srcq+sstrideq]
    sub srcq, sstride3q
    mova m8, [filteryq+ 0]
    mova m9, [filteryq+32]
    mova m10, [filteryq+64]
    mova m11, [filteryq+96]
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movu m0, [srcq]
    movu m1, [srcq+sstrideq]
    movu m2, [srcq+sstrideq*2]
    movu m3, [srcq+sstride3q]
    movu m4, [src4q]
    movu m5, [src4q+sstrideq]
    movu m6, [src4q+sstrideq*2]
    movu m7, [src4q+sstride3q]
    add srcq, sstrideq
    add src4q, sstrideq
    SBUTTERFLY bw, 0, 1, 12
    SBUTTERFLY bw, 2, 3, 12
    SBUTTERFLY bw, 4, 5, 12
    SBUTTERFLY bw, 6, 7, 12
    pmaddubsw m0, m8
    pmaddubsw m1, m8
    pmaddubsw m2, m9
    pmaddubsw m3, m9
    pmaddubsw m4, m10
    pmaddubsw m5, m10
    pmaddubsw m6, m11
    pmaddubsw m7, m11
    paddw m0, m2
    paddw m1, m3
    paddw m4, m6
    paddw m5, m7
    paddsw m0, m4
    paddsw m1, m5
    pmulhrsw m0, m13
    pmulhrsw m1, m13
    packuswb m0, m1
%ifidn %1, avg
    pavgb m0, [dstq]
%endif
    mova [dstq], m0
    add dstq, dstrideq
    dec hd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
filter_vx2_fn put
filter_vx2_fn avg

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_vx2_fn put
filter_vx2_fn avg
%endif

%endif ; ARCH_X86_64

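; Fullpel copy/average: %1 = put/avg, %2 = block width, %3-%5 = offsets of the
; 2nd-4th load/store per iteration (stride multiples, or raw byte offsets for
; blocks wider than one register), %6 = rows advanced per iteration.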
%macro fpel_fn 6
%if %2 == 4
%define %%srcfn movh
%define %%dstfn movh
%else
%define %%srcfn movu
%define %%dstfn mova
%endif

%if %2 <= mmsize
cglobal vp9_%1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
    lea sstride3q, [sstrideq*3]
    lea dstride3q, [dstrideq*3]
%else
cglobal vp9_%1%2, 5, 5, 4, dst, dstride, src, sstride, h
%endif
.loop:
    %%srcfn m0, [srcq]
    %%srcfn m1, [srcq+s%3]
    %%srcfn m2, [srcq+s%4]
    %%srcfn m3, [srcq+s%5]
    lea srcq, [srcq+sstrideq*%6]
%ifidn %1, avg
    pavgb m0, [dstq]
    pavgb m1, [dstq+d%3]
    pavgb m2, [dstq+d%4]
    pavgb m3, [dstq+d%5]
%endif
    %%dstfn [dstq], m0
    %%dstfn [dstq+d%3], m1
    %%dstfn [dstq+d%4], m2
    %%dstfn [dstq+d%5], m3
    lea dstq, [dstq+dstrideq*%6]
    sub hd, %6
    jnz .loop
    RET
%endmacro

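; The s16/d16 and s32/d32 defines let the s%3/d%3 token pastes in fpel_fn
; resolve to plain byte offsets when a constant such as mmsize is passed;
; when strideq is passed, the same paste yields sstrideq/dstrideq instead.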
%define d16 16
%define s16 16
%define d32 32
%define s32 32
INIT_MMX mmx
fpel_fn put, 4, strideq, strideq*2, stride3q, 4
fpel_fn put, 8, strideq, strideq*2, stride3q, 4
INIT_MMX mmxext
fpel_fn avg, 4, strideq, strideq*2, stride3q, 4
fpel_fn avg, 8, strideq, strideq*2, stride3q, 4
INIT_XMM sse
fpel_fn put, 16, strideq, strideq*2, stride3q, 4
fpel_fn put, 32, mmsize, strideq, strideq+mmsize, 2
fpel_fn put, 64, mmsize, mmsize*2, mmsize*3, 1
INIT_XMM sse2
fpel_fn avg, 16, strideq, strideq*2, stride3q, 4
fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2
fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1
INIT_YMM avx
fpel_fn put, 32, strideq, strideq*2, stride3q, 4
fpel_fn put, 64, mmsize, strideq, strideq+mmsize, 2
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
fpel_fn avg, 32, strideq, strideq*2, stride3q, 4
fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2
%endif
%undef s16
%undef d16
%undef s32
%undef d32