Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | ;****************************************************************************** |
2 | ;* x86 optimizations for PNG decoding | |
3 | ;* | |
4 | ;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu> | |
5 | ;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> | |
6 | ;* | |
7 | ;* This file is part of FFmpeg. | |
8 | ;* | |
9 | ;* FFmpeg is free software; you can redistribute it and/or | |
10 | ;* modify it under the terms of the GNU Lesser General Public | |
11 | ;* License as published by the Free Software Foundation; either | |
12 | ;* version 2.1 of the License, or (at your option) any later version. | |
13 | ;* | |
14 | ;* FFmpeg is distributed in the hope that it will be useful, | |
15 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 | ;* Lesser General Public License for more details. | |
18 | ;* | |
19 | ;* You should have received a copy of the GNU Lesser General Public | |
20 | ;* License along with FFmpeg; if not, write to the Free Software | |
21 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
22 | ;****************************************************************************** | |
23 | ||
24 | %include "libavutil/x86/x86util.asm" | |
25 | ||
26 | SECTION_RODATA | |
27 | ||
28 | cextern pw_255 | |
29 | ||
30 | SECTION_TEXT | |
31 | ||
;------------------------------------------------------------------------------
; ADD_BYTES_FN nr_xmm
;   %1 = nr. of xmm registers used
;
; Emits add_bytes_l2(dst, src1, src2, w):
;     for (i = 0; i < w; i++)
;         dst[i] = src1[i] + src2[i];      // bytewise add, wraps modulo 256
;
; Named registers (cglobal: 4 arguments, 6 general-purpose registers):
;   dst, src1, src2   buffer pointers
;   wa                incoming width; reused as the bound of the current
;                     vector loop and, via its byte alias wab, as the
;                     temporary of the scalar loop
;   w                 copy of the full width; bound of the scalar loop
;   i                 byte offset shared by all loops
;
; Every loop is entered through its bottom test and uses a signed compare,
; so a width <= 0 stores nothing.
;
; NOTE(review): the main loop uses mova and a memory operand on paddb; for
; the XMM build this presumably requires mmsize-aligned dst/src1/src2 --
; confirm against the callers.
;------------------------------------------------------------------------------
%macro ADD_BYTES_FN 1
cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
%if ARCH_X86_64
    movsxd             waq, wad                 ; sign-extend 32-bit width
%endif
    xor                 iq, iq                  ; i = 0

    ; main vector loop: 2*mmsize bytes per iteration
    mov                 wq, waq                 ; keep the full width in w
    and                waq, ~(mmsize*2-1)       ; wa = w rounded down to 2*mmsize
    jmp .end_v
.loop_v:
    mova                m0, [src1q+iq]
    mova                m1, [src1q+iq+mmsize]
    paddb               m0, [src2q+iq]
    paddb               m1, [src2q+iq+mmsize]
    mova  [dstq+iq       ], m0
    mova  [dstq+iq+mmsize], m1
    add                 iq, mmsize*2
.end_v:
    cmp                 iq, waq
    jl .loop_v

%if mmsize == 16
    ; 8-byte loop (XMM build only): handles whole 8-byte groups left over by
    ; the main loop. It works on the low 64 bits of m0/m1 instead of an MMX
    ; register, so this build never touches MMX state and needs no EMMS.
    mov                waq, wq
    and                waq, ~7                  ; wa = w rounded down to 8
    jmp .end_l
.loop_l:
    movq                m0, [src1q+iq]
    movq                m1, [src2q+iq]          ; load first: a memory operand
                                                ; on paddb would read 16 bytes
    paddb               m0, m1
    movq  [dstq+iq       ], m0                  ; stores the low 8 bytes only
    add                 iq, 8
.end_l:
    cmp                 iq, waq
    jl .loop_l
%endif

    ; scalar loop for leftover
    jmp .end_s
.loop_s:
    mov                wab, [src1q+iq]
    add                wab, [src2q+iq]
    mov          [dstq+iq], wab
    inc                 iq
.end_s:
    cmp                 iq, wq
    jl .loop_s
    REP_RET
%endmacro

; The MMX flavour is only assembled for 32-bit x86.
%if ARCH_X86_32
INIT_MMX mmx
ADD_BYTES_FN 0
%endif

; SSE2 flavour: uses m0 and m1, hence two xmm registers.
INIT_XMM sse2
ADD_BYTES_FN 2
91 | ||
;------------------------------------------------------------------------------
; ADD_PAETH_PRED_FN nr_xmm
;   %1 = value forwarded to cglobal as the xmm register count
;
; Emits add_png_paeth_prediction(dst, src, top, w, bpp).
;
; For every byte position x the code builds, as 16-bit words,
;     a = dst[x - bpp]      (left, already reconstructed)
;     b = top[x]            (above)
;     c = top[x - bpp]      (above-left)
;     pa = |b - c|,  pb = |a - c|,  pc = |a + b - 2c|
; selects
;     a   if pa <= min(pb, pc)
;     b   else if pb <= pc
;     c   otherwise
; and stores dst[x] = (src[x] + selected) masked with pw_255, packed back to
; bytes.  pw_255 is an external constant (presumably 0x00FF in every word).
;
; Named registers (cglobal: 5 arguments, 7 general-purpose registers):
;   dst    write pointer; starts bpp bytes before the first output byte
;   src    turned into the offset (src - dst) so [srcq+dstq] tracks dst
;   top    turned into the offset (top - dst) so [topq+dstq] tracks dst
;   w      width in bytes, only used to compute end
;   bpp    step between two positions handled by one inner-loop pass
;   end    dst + w - (mmsize/2 - 1): last address at which a step may start
;   cntr   outer counter, (bpp - 1) >> (2 + mmsize/16), counting down to 0
;
; Each inner step works on one movh-sized group (mmsize/2 bytes under the
; x86inc abstraction, which is not shown here).  The outer loop restarts at
; the saved dst plus cntr*(mmsize/2) so that pixels wider than one group are
; covered group by group.
;
; NOTE(review): a step may start as late as dst + w - (mmsize/2 - 1) and
; always stores a full group; callers presumably choose w and buffer sizes
; with that in mind -- confirm.
;------------------------------------------------------------------------------
%macro ADD_PAETH_PRED_FN 1
cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
%if ARCH_X86_64
    movsxd              wq, wd                  ; sign-extend 32-bit arguments
    movsxd            bppq, bppd
%endif
    pxor                m7, m7                  ; m7 = 0, used for byte->word unpack
    lea               endq, [dstq+wq-(mmsize/2-1)]
    sub               srcq, dstq                ; src and top become offsets
    sub               topq, dstq                ;   relative to dst
    sub               dstq, bppq                ; dst -> left neighbour

    PUSH              dstq                      ; remember the start position
    lea              cntrq, [bppq-1]
    shr              cntrq, 2 + mmsize/16
.next_group:
    lea               dstq, [dstq+cntrq*(mmsize/2)]
    movh                m0, [dstq]              ; m0 = a (left)
    movh                m1, [topq+dstq]         ; m1 = top at the left position
    punpcklbw           m0, m7
    punpcklbw           m1, m7
    add               dstq, bppq                ; advance to the first output
.next_pixel:
    mova                m2, m1                  ; m2 = c (previous "above")
    movh                m1, [topq+dstq]         ; m1 = b (above)
    mova                m3, m2
    punpcklbw           m1, m7
    mova                m4, m2
    psubw               m3, m1                  ; m3 = c - b
    psubw               m4, m0                  ; m4 = c - a
    mova                m5, m3
    paddw               m5, m4                  ; m5 = 2c - a - b
%if cpuflag(ssse3)
    pabsw               m3, m3                  ; pa
    pabsw               m4, m4                  ; pb
    pabsw               m5, m5                  ; pc
%else ; !cpuflag(ssse3)
    ; abs(x) computed as max(x, 0 - x); m7 is borrowed as a temporary and
    ; cleared again at the end of this branch.
    psubw               m7, m5
    pmaxsw              m5, m7                  ; pc
    pxor                m6, m6
    pxor                m7, m7
    psubw               m6, m3
    psubw               m7, m4
    pmaxsw              m3, m6                  ; pa
    pmaxsw              m4, m7                  ; pb
    pxor                m7, m7
%endif ; cpuflag(ssse3)
    mova                m6, m4
    pminsw              m6, m5                  ; m6 = min(pb, pc)
    pcmpgtw             m3, m6                  ; m3 = mask: pa > min(pb, pc)
    pcmpgtw             m4, m5                  ; m4 = mask: pb > pc
    mova                m6, m4
    pand                m4, m3                  ; m4 = mask: pick c
    pandn               m6, m3                  ; m6 = mask: pick b
    pandn               m3, m0                  ; m3 = a where a is picked
    movh                m0, [srcq+dstq]         ; m0 = src bytes
    pand                m6, m1                  ; m6 = b where b is picked
    pand                m2, m4                  ; m2 = c where c is picked
    punpcklbw           m0, m7
    paddw               m0, m6
    paddw               m3, m2
    paddw               m0, m3                  ; m0 = src + selected predictor
    pand                m0, [pw_255]            ; m0 also serves as next "a"
    mova                m3, m0
    packuswb            m3, m3
    movh            [dstq], m3
    add               dstq, bppq
    cmp               dstq, endq
    jle .next_pixel

    mov               dstq, [rsp]               ; back to the saved start
    dec              cntrq
    jge .next_group
    POP               dstq
    RET
%endmacro

; Both flavours run on MMX registers; they differ only in how the absolute
; values are computed (see the cpuflag(ssse3) branch above).
INIT_MMX mmxext
ADD_PAETH_PRED_FN 0

INIT_MMX ssse3
ADD_PAETH_PRED_FN 0