Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libavcodec / x86 / pngdsp.asm
CommitLineData
2ba45a60
DM
1;******************************************************************************
2;* x86 optimizations for PNG decoding
3;*
4;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
5;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
6;*
7;* This file is part of FFmpeg.
8;*
9;* FFmpeg is free software; you can redistribute it and/or
10;* modify it under the terms of the GNU Lesser General Public
11;* License as published by the Free Software Foundation; either
12;* version 2.1 of the License, or (at your option) any later version.
13;*
14;* FFmpeg is distributed in the hope that it will be useful,
15;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17;* Lesser General Public License for more details.
18;*
19;* You should have received a copy of the GNU Lesser General Public
20;* License along with FFmpeg; if not, write to the Free Software
21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22;******************************************************************************
23
24%include "libavutil/x86/x86util.asm"
25
26SECTION_RODATA
27
28cextern pw_255
29
30SECTION_TEXT
31
;------------------------------------------------------------------------------
; add_bytes_l2(dst, src1, src2, w)
;
; Byte-wise wrapping add: dst[i] = src1[i] + src2[i] for 0 <= i < w.
;
; %1 = nr. of xmm registers used
;
; Register roles:
;   w  = full byte count (w is sign-extended from 32 bits on x86-64)
;   wa = byte count rounded down to the current loop's step; its low byte is
;        reused as scratch in the scalar tail
;   i  = running byte offset shared by all three loops
;
; The work is split into up to three loops, each picking up at offset i where
; the previous one stopped:
;   1. 2*mmsize bytes per iteration (full vector registers)
;   2. 8 bytes per iteration        (only when mmsize == 16)
;   3. 1 byte per iteration         (leftover)
; Every loop is entered through its condition check, so a loop whose rounded
; count is not above i runs zero times.
;------------------------------------------------------------------------------
%macro ADD_BYTES_FN 1
cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
%if ARCH_X86_64
    movsxd             waq, wad
%endif
    xor                 iq, iq

    ; main vector loop: two full registers (2*mmsize bytes) per iteration
    ; NOTE(review): mova and the paddb memory operand require mmsize-aligned
    ; addresses when mmsize == 16 -- confirm that callers guarantee this for
    ; dst, src1 and src2.
    mov                 wq, waq
    and                waq, ~(mmsize*2-1)
    jmp .end_v
.loop_v:
    mova                m0, [src1q+iq]
    mova                m1, [src1q+iq+mmsize]
    paddb               m0, [src2q+iq]
    paddb               m1, [src2q+iq+mmsize]
    mova  [dstq+iq       ], m0
    mova  [dstq+iq+mmsize], m1
    add                 iq, mmsize*2
.end_v:
    cmp                 iq, waq
    jl .loop_v

%if mmsize == 16
    ; 8-byte tail loop: low quadword of the xmm registers only.
    ; This deliberately stays on xmm registers instead of switching to mm0:
    ; touching an MMX register here would leave the x87/MMX unit in MMX state
    ; on return from a function that is otherwise pure SSE2.
    ; src2 is loaded explicitly because a paddb memory operand would read a
    ; full 16 bytes.
    mov                waq, wq
    and                waq, ~7
    jmp .end_l
.loop_l:
    movq                m0, [src1q+iq]
    movq                m1, [src2q+iq]
    paddb               m0, m1
    movq  [dstq+iq       ], m0
    add                 iq, 8
.end_l:
    cmp                 iq, waq
    jl .loop_l
%endif

    ; scalar loop for leftover bytes; wa is no longer needed as a count, so
    ; its low byte serves as the accumulator
    jmp .end_s
.loop_s:
    mov                wab, [src1q+iq]
    add                wab, [src2q+iq]
    mov          [dstq+iq], wab
    inc                 iq
.end_s:
    cmp                 iq, wq
    jl .loop_s
    REP_RET
%endmacro
83
; add_bytes_l2, MMX flavour: assembled for 32-bit x86 only, no xmm registers.
%if ARCH_X86_32
INIT_MMX mmx
ADD_BYTES_FN 0
%endif

; add_bytes_l2, SSE2 flavour: declares two xmm registers (m0 and m1).
INIT_XMM sse2
ADD_BYTES_FN 2
91
;------------------------------------------------------------------------------
; add_png_paeth_prediction(dst, src, top, w, bpp)
;
; For every byte position, picks one of three neighbours -- the byte bpp
; positions to the left in dst (a), the byte at the same position in top (b),
; or the byte bpp positions to the left in top (c) -- adds it to the byte from
; src and stores the low 8 bits of the sum to dst.
;
; %1 = nr. of xmm registers used
;
; Setup:
;   end  = dst + w - (mmsize/2 - 1): last dst address at which a full
;          mmsize/2-byte group still starts inside the row
;   top, src are turned into offsets relative to dst, so one moving pointer
;          (dst) addresses all three rows
;   dst  is moved back by bpp so the first load fetches the left neighbours;
;          this base is kept on the stack and reloaded for every pass
;   m7   = 0, used to widen bytes to words with punpcklbw
;
; Each pass handles mmsize/2 bytes of every pixel and steps by bpp. cntr counts
; the passes needed to cover all bpp bytes, from the highest group down to 0.
;
; Register roles inside .loop (all widened to 16-bit words):
;   m0 = a (previous output), m1 = b, m2 = c
;   m3 = |c - b|, m4 = |c - a|, m5 = |2c - a - b|, later turned into masks
;------------------------------------------------------------------------------
%macro ADD_PAETH_PRED_FN 1
cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
%if ARCH_X86_64
    movsxd            bppq, bppd
    movsxd              wq, wd
%endif
    lea               endq, [dstq+wq-(mmsize/2-1)]
    sub               topq, dstq
    sub               srcq, dstq
    sub               dstq, bppq
    pxor                m7, m7

    PUSH              dstq                  ; saved base (dst - bpp), read back via [rsp]
    lea              cntrq, [bppq-1]
    shr              cntrq, 2 + mmsize/16   ; index of the last byte group within a pixel
.bpp_loop:
    lea               dstq, [dstq+cntrq*(mmsize/2)]
    movh                m0, [dstq]          ; a: bytes left of the first pixel
    movh                m1, [topq+dstq]     ; becomes c on the first iteration
    punpcklbw           m0, m7
    punpcklbw           m1, m7
    add               dstq, bppq
.loop:
    mova                m2, m1              ; c = previous b
    movh                m1, [topq+dstq]     ; b = top row at the current position
    mova                m3, m2
    punpcklbw           m1, m7
    mova                m4, m2
    psubw               m3, m1              ; c - b
    psubw               m4, m0              ; c - a
    mova                m5, m3
    paddw               m5, m4              ; 2c - a - b
%if cpuflag(ssse3)
    pabsw               m3, m3
    pabsw               m4, m4
    pabsw               m5, m5
%else ; !cpuflag(ssse3)
    ; abs(x) = max(x, 0 - x); m7 is borrowed as a temporary and re-zeroed
    psubw               m7, m5
    pmaxsw              m5, m7
    pxor                m6, m6
    pxor                m7, m7
    psubw               m6, m3
    psubw               m7, m4
    pmaxsw              m3, m6
    pmaxsw              m4, m7
    pxor                m7, m7
%endif ; cpuflag(ssse3)
    mova                m6, m4
    pminsw              m6, m5              ; min(|c-a|, |2c-a-b|)
    pcmpgtw             m3, m6              ; m3 = mask: a is NOT selected
    pcmpgtw             m4, m5              ; m4 = mask: |c-a| > |2c-a-b|
    mova                m6, m4
    pand                m4, m3              ; m4 = mask: select c
    pandn               m6, m3              ; m6 = mask: select b
    pandn               m3, m0              ; m3 = a where a is selected, else 0
    movh                m0, [srcq+dstq]
    pand                m6, m1              ; b where selected
    pand                m2, m4              ; c where selected
    punpcklbw           m0, m7
    paddw               m0, m6
    paddw               m3, m2
    paddw               m0, m3              ; src + selected neighbour
    pand                m0, [pw_255]        ; keep the low byte; m0 is the next a
    mova                m3, m0
    packuswb            m3, m3
    movh            [dstq], m3
    add               dstq, bppq
    cmp               dstq, endq
    ; dst and end are addresses, so the comparison must be unsigned: a signed
    ; jle would end the loop early if the row straddled the sign boundary of
    ; the address space (possible on 32-bit targets).
    jbe .loop

    mov               dstq, [rsp]           ; back to the saved base for the next pass
    dec              cntrq
    jge .bpp_loop
    POP               dstq
    RET
%endmacro
168
; add_png_paeth_prediction, MMXEXT flavour: takes the !cpuflag(ssse3) branch of
; the macro, where absolute values are built from psubw + pmaxsw.
INIT_MMX mmxext
ADD_PAETH_PRED_FN 0

; add_png_paeth_prediction, SSSE3 flavour: still on MMX registers (INIT_MMX),
; but absolute values come from pabsw.
INIT_MMX ssse3
ADD_PAETH_PRED_FN 0