; Imported Debian version 2.4.3~trusty1
; deb_ffmpeg.git / ffmpeg / libswscale / x86 / output.asm
;******************************************************************************
;* x86-optimized vertical line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*                    Kieran Kunhya <kieran@kunhya.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Rounding/offset and clipping constants for the scaling kernels below.
minshort:           times 8 dw 0x8000          ; re-bias word after signed pack (16-bit path)
yuv2yuvX_16_start:  times 4 dd 0x4000 - 0x40000000 ; dword accumulator init, 16-bit output
yuv2yuvX_10_start:  times 4 dd 0x10000         ; dword accumulator init, 10-bit output
yuv2yuvX_9_start:   times 4 dd 0x20000         ; dword accumulator init, 9-bit output
yuv2yuvX_10_upper:  times 8 dw 0x3ff           ; clip ceiling, 10-bit output
yuv2yuvX_9_upper:   times 8 dw 0x1ff           ; clip ceiling, 9-bit output
pd_4:               times 4 dd 4
pd_4min0x40000:     times 4 dd 4 - (0x40000)
pw_16:              times 8 dw 16
pw_32:              times 8 dw 32
pw_512:             times 8 dw 512
pw_1024:            times 8 dw 1024

SECTION .text

;-----------------------------------------------------------------------------
; vertical line scaling
;
; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
; and
; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
;                                     const int16_t **src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
;
; Scale one or $filterSize lines of source data to generate one line of output
; data. The input is 15 bits in int16_t if $output_size is [8,10] and 19 bits in
; int32_t if $output_size is 16. $filter is 12 bits. $filterSize is a multiple
; of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------

; yuv2planeX_fn: emit one yuv2planeX_<bpc> function for the current INIT_* mode.
; %1 = output bit depth (8, 9, 10 or 16)
; %2 = number of xmm registers used (cglobal xmm count)
; %3 = number of arguments loaded into registers
%macro yuv2planeX_fn 3

%if ARCH_X86_32
%define cntr_reg fltsizeq
%define movsx mov
%else
%define cntr_reg r7
%define movsx movsxd
%endif

cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%if %1 == 8 || %1 == 9 || %1 == 10
    pxor            m6,  m6              ; zero register (unpack / clip floor)
%endif ; %1 == 8/9/10

%if %1 == 8
%if ARCH_X86_32
%assign pad 0x2c - (stack_offset & 15)
    SUB             rsp, pad             ; aligned scratch for the dither planes
%define m_dith m7
%else ; x86-64
%define m_dith m9
%endif ; x86-32

    ; create registers holding dither
    movq        m_dith, [ditherq]        ; load the 8 dither bytes
    test       offsetd, offsetd
    jz              .no_rot
%if mmsize == 16
    punpcklqdq  m_dith, m_dith
%endif ; mmsize == 16
    PALIGNR     m_dith, m_dith, 3, m0    ; rotate dither for offset == 3
.no_rot:
%if mmsize == 16
    ; widen dither bytes to dwords, pre-shifted into the accumulator scale
    punpcklbw   m_dith, m6
%if ARCH_X86_64
    punpcklwd       m8, m_dith, m6
    pslld           m8, 12
%else ; x86-32
    punpcklwd       m5, m_dith, m6
    pslld           m5, 12
%endif ; x86-32/64
    punpckhwd   m_dith, m6
    pslld       m_dith, 12
%if ARCH_X86_32
    mova      [rsp+ 0], m5              ; not enough regs on x86-32; spill
    mova      [rsp+16], m_dith
%endif
%else ; mmsize == 8
    punpcklbw       m5, m_dith, m6
    punpckhbw   m_dith, m6
    punpcklwd       m4, m5, m6
    punpckhwd       m5, m6
    punpcklwd       m3, m_dith, m6
    punpckhwd   m_dith, m6
    pslld           m4, 12
    pslld           m5, 12
    pslld           m3, 12
    pslld       m_dith, 12
    mova      [rsp+ 0], m4
    mova      [rsp+ 8], m5
    mova      [rsp+16], m3
    mova      [rsp+24], m_dith
%endif ; mmsize == 8/16
%endif ; %1 == 8

    xor             r5, r5              ; r5 = output pixel index

.pixelloop:
%assign %%i 0
    ; the rep here is for the 8bit output mmx case, where dither covers
    ; 8 pixels but we can only handle 2 pixels per register, and thus 4
    ; pixels per iteration. In order to not have to keep track of where
    ; we are w.r.t. dithering, we unroll the mmx/8bit loop x2.
%if %1 == 8
%assign %%repcnt 16/mmsize
%else
%assign %%repcnt 1
%endif

%rep %%repcnt

%if %1 == 8
    ; seed accumulators with the pre-scaled dither values
%if ARCH_X86_32
    mova            m2, [rsp+mmsize*(0+%%i)]
    mova            m1, [rsp+mmsize*(1+%%i)]
%else ; x86-64
    mova            m2, m8
    mova            m1, m_dith
%endif ; x86-32/64
%else ; %1 == 9/10/16
    ; seed accumulators with the rounding constant for this bit depth
    mova            m1, [yuv2yuvX_%1_start]
    mova            m2, m1
%endif ; %1 == 8/9/10/16
    movsx     cntr_reg, fltsizem        ; filter-tap counter, consumed 2 at a time
.filterloop_ %+ %%i:
    ; input pixels: two source lines per iteration (filterSize is even)
    mov             r6, [srcq+gprsize*cntr_reg-2*gprsize]
%if %1 == 16
    mova            m3, [r6+r5*4]       ; int32_t input for 16-bit output
    mova            m5, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova            m3, [r6+r5*2]       ; int16_t input otherwise
%endif ; %1 == 8/9/10/16
    mov             r6, [srcq+gprsize*cntr_reg-gprsize]
%if %1 == 16
    mova            m4, [r6+r5*4]
    mova            m6, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova            m4, [r6+r5*2]
%endif ; %1 == 8/9/10/16

    ; coefficients
    movd            m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
%if %1 == 16
    pshuflw         m7, m0, 0           ; coeff[0]
    pshuflw         m0, m0, 0x55        ; coeff[1]
    pmovsxwd        m7, m7              ; word -> dword
    pmovsxwd        m0, m0              ; word -> dword

    pmulld          m3, m7
    pmulld          m5, m7
    pmulld          m4, m0
    pmulld          m6, m0

    paddd           m2, m3
    paddd           m1, m5
    paddd           m2, m4
    paddd           m1, m6
%else ; %1 == 10/9/8
    ; interleave the two lines so pmaddwd applies both taps at once
    punpcklwd       m5, m3, m4
    punpckhwd       m3, m4
    SPLATD          m0                  ; broadcast the coeff pair

    pmaddwd         m5, m0
    pmaddwd         m3, m0

    paddd           m2, m5
    paddd           m1, m3
%endif ; %1 == 8/9/10/16

    sub       cntr_reg, 2
    jg .filterloop_ %+ %%i

    ; shift accumulators back down to the output bit depth
%if %1 == 16
    psrad           m2, 31 - %1
    psrad           m1, 31 - %1
%else ; %1 == 10/9/8
    psrad           m2, 27 - %1
    psrad           m1, 27 - %1
%endif ; %1 == 8/9/10/16

%if %1 == 8
    packssdw        m2, m1
    packuswb        m2, m2              ; saturating pack handles the clip to [0,255]
    movh [dstq+r5*1], m2
%else ; %1 == 9/10/16
%if %1 == 16
    packssdw        m2, m1
    paddw           m2, [minshort]      ; undo the signed-pack bias
%else ; %1 == 9/10
%if cpuflag(sse4)
    packusdw        m2, m1              ; unsigned pack clips the floor for free
%else ; mmxext/sse2
    packssdw        m2, m1
    pmaxsw          m2, m6              ; manual clip to 0 without packusdw
%endif ; mmxext/sse2/sse4/avx
    pminsw          m2, [yuv2yuvX_%1_upper] ; clip to (1 << %1) - 1
%endif ; %1 == 9/10/16
    mova [dstq+r5*2], m2
%endif ; %1 == 8/9/10/16

    add             r5, mmsize/2
    sub             wd, mmsize/2

%assign %%i %%i+2
%endrep
    jg .pixelloop

%if %1 == 8
%if ARCH_X86_32
    ADD rsp, pad
    RET
%else ; x86-64
    REP_RET
%endif ; x86-32/64
%else ; %1 == 9/10/16
    REP_RET
%endif ; %1 == 8/9/10/16
%endmacro

; Instantiate yuv2planeX for each SIMD flavor. The 16-bit variant needs
; pmovsxwd/pmulld and therefore only exists from SSE4 up.
%if ARCH_X86_32
INIT_MMX mmxext
yuv2planeX_fn  8,  0, 7
yuv2planeX_fn  9,  0, 5
yuv2planeX_fn 10,  0, 5
%endif

INIT_XMM sse2
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5

INIT_XMM sse4
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5
yuv2planeX_fn 16,  8, 5

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5
%endif

; yuv2plane1 inner loop.
; %1 = output bit depth (8, 9, 10 or 16)
; %2 = store alignment suffix (a = aligned, u = unaligned)
; Expects wq negative and counting up to 0, src/dst pre-advanced past the end
; (see yuv2plane1_fn), and the bias/clip constants pre-loaded into m2-m5.
%macro yuv2plane1_mainloop 2
.loop_%2:
%if %1 == 8
    paddsw          m0, m2, [srcq+wq*2+mmsize*0] ; add dither (low half)
    paddsw          m1, m3, [srcq+wq*2+mmsize*1] ; add dither (high half)
    psraw           m0, 7
    psraw           m1, 7
    packuswb        m0, m1              ; saturating pack clips to [0,255]
    mov%2    [dstq+wq], m0
%elif %1 == 16
    paddd           m0, m4, [srcq+wq*4+mmsize*0] ; add rounding constant
    paddd           m1, m4, [srcq+wq*4+mmsize*1]
    paddd           m2, m4, [srcq+wq*4+mmsize*2]
    paddd           m3, m4, [srcq+wq*4+mmsize*3]
    psrad           m0, 3
    psrad           m1, 3
    psrad           m2, 3
    psrad           m3, 3
%if cpuflag(sse4) ; avx/sse4
    packusdw        m0, m1
    packusdw        m2, m3
%else ; mmx/sse2
    ; no packusdw: signed pack, then re-bias with minshort (in m5)
    packssdw        m0, m1
    packssdw        m2, m3
    paddw           m0, m5
    paddw           m2, m5
%endif ; mmx/sse2/sse4/avx
    mov%2 [dstq+wq*2+mmsize*0], m0
    mov%2 [dstq+wq*2+mmsize*1], m2
%else ; %1 == 9/10
    paddsw          m0, m2, [srcq+wq*2+mmsize*0] ; add rounding constant
    paddsw          m1, m2, [srcq+wq*2+mmsize*1]
    psraw           m0, 15 - %1
    psraw           m1, 15 - %1
    pmaxsw          m0, m4              ; clip floor (m4 = 0)
    pmaxsw          m1, m4
    pminsw          m0, m3              ; clip ceiling ((1 << %1) - 1)
    pminsw          m1, m3
    mov%2 [dstq+wq*2+mmsize*0], m0
    mov%2 [dstq+wq*2+mmsize*1], m1
%endif
    add             wq, mmsize
    jl .loop_%2
%endmacro

; yuv2plane1_fn: emit one yuv2plane1_<bpc> function for the current INIT_* mode.
; %1 = output bit depth (8, 9, 10 or 16)
; %2 = number of xmm registers used (cglobal xmm count)
; %3 = number of arguments loaded into registers
%macro yuv2plane1_fn 3
cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
    ; round w up to a whole vector, point src/dst past the end and run wq
    ; from -w up to 0 so the loop needs only one counter
    movsxdifnidn    wq, wd
    add             wq, mmsize - 1
    and             wq, ~(mmsize - 1)
%if %1 == 8
    add           dstq, wq
%else ; %1 != 8
    lea           dstq, [dstq+wq*2]
%endif ; %1 == 8
%if %1 == 16
    lea           srcq, [srcq+wq*4]
%else ; %1 != 16
    lea           srcq, [srcq+wq*2]
%endif ; %1 == 16
    neg             wq

%if %1 == 8
    pxor            m4, m4              ; zero

    ; create registers holding dither
    movq            m3, [ditherq]       ; load the 8 dither bytes
    test       offsetd, offsetd
    jz              .no_rot
%if mmsize == 16
    punpcklqdq      m3, m3
%endif ; mmsize == 16
    PALIGNR         m3, m3, 3, m2       ; rotate dither for offset == 3
.no_rot:
%if mmsize == 8
    mova            m2, m3
    punpckhbw       m3, m4              ; byte->word
    punpcklbw       m2, m4              ; byte->word
%else
    punpcklbw       m3, m4
    mova            m2, m3
%endif
%elif %1 == 9
    pxor            m4, m4
    mova            m3, [pw_512]        ; clip ceiling
    mova            m2, [pw_32]         ; rounding constant
%elif %1 == 10
    pxor            m4, m4
    mova            m3, [pw_1024]       ; clip ceiling
    mova            m2, [pw_16]         ; rounding constant
%else ; %1 == 16
%if cpuflag(sse4) ; sse4/avx
    mova            m4, [pd_4]
%else ; mmx/sse2
    ; pre-bias rounding constant so the signed pack + paddw minshort works
    mova            m4, [pd_4min0x40000]
    mova            m5, [minshort]
%endif ; mmx/sse2/sse4/avx
%endif ; %1 == ..

    ; actual pixel scaling
%if mmsize == 8
    yuv2plane1_mainloop %1, a
%else ; mmsize == 16
    test          dstq, 15
    jnz .unaligned
    yuv2plane1_mainloop %1, a
    REP_RET
.unaligned:
    yuv2plane1_mainloop %1, u
%endif ; mmsize == 8/16
    REP_RET
%endmacro

; Instantiate yuv2plane1 for each SIMD flavor. SSE4 adds only the 16-bit
; variant (packusdw); the lower depths reuse the SSE2 code paths.
%if ARCH_X86_32
INIT_MMX mmx
yuv2plane1_fn  8, 0, 5
yuv2plane1_fn 16, 0, 3

INIT_MMX mmxext
yuv2plane1_fn  9, 0, 3
yuv2plane1_fn 10, 0, 3
%endif

INIT_XMM sse2
yuv2plane1_fn  8, 5, 5
yuv2plane1_fn  9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 6, 3

INIT_XMM sse4
yuv2plane1_fn 16, 5, 3

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2plane1_fn  8, 5, 5
yuv2plane1_fn  9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
%endif