Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | ;****************************************************************************** |
;* MMX optimized discrete wavelet transform
3 | ;* Copyright (c) 2010 David Conrad | |
4 | ;* | |
5 | ;* This file is part of FFmpeg. | |
6 | ;* | |
7 | ;* FFmpeg is free software; you can redistribute it and/or | |
8 | ;* modify it under the terms of the GNU Lesser General Public | |
9 | ;* License as published by the Free Software Foundation; either | |
10 | ;* version 2.1 of the License, or (at your option) any later version. | |
11 | ;* | |
12 | ;* FFmpeg is distributed in the hope that it will be useful, | |
13 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | ;* Lesser General Public License for more details. | |
16 | ;* | |
17 | ;* You should have received a copy of the GNU Lesser General Public | |
18 | ;* License along with FFmpeg; if not, write to the Free Software | |
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 | ;****************************************************************************** | |
21 | ||
22 | %include "libavutil/x86/x86util.asm" | |
23 | ||
24 | SECTION_RODATA | |
25 | pw_1991: times 4 dw 9,-1 | |
26 | ||
27 | cextern pw_1 | |
28 | cextern pw_2 | |
29 | cextern pw_8 | |
30 | cextern pw_16 | |
31 | ||
32 | section .text | |
33 | ||
34 | ; %1 -= (%2 + %3 + 2)>>2 %4 is pw_2 | |
35 | %macro COMPOSE_53iL0 4 | |
36 | paddw %2, %3 | |
37 | paddw %2, %4 | |
38 | psraw %2, 2 | |
39 | psubw %1, %2 | |
40 | %endm | |
41 | ||
42 | ; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4 | |
43 | ; if %4 is supplied, %1 is loaded unaligned from there | |
44 | ; m2: clobbered m3: pw_8 m4: pw_1991 | |
45 | %macro COMPOSE_DD97iH0 3-4 | |
46 | paddw m0, %3 | |
47 | paddw m1, %2 | |
48 | psubw m0, m3 | |
49 | mova m2, m1 | |
50 | punpcklwd m1, m0 | |
51 | punpckhwd m2, m0 | |
52 | pmaddwd m1, m4 | |
53 | pmaddwd m2, m4 | |
54 | %if %0 > 3 | |
55 | movu %1, %4 | |
56 | %endif | |
57 | psrad m1, 4 | |
58 | psrad m2, 4 | |
59 | packssdw m1, m2 | |
60 | paddw m1, %1 | |
61 | %endm | |
62 | ||
63 | %macro COMPOSE_VERTICAL 1 | |
64 | ; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, | |
65 | ; int width) | |
66 | cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width | |
67 | mova m2, [pw_2] | |
68 | %if ARCH_X86_64 | |
69 | mov widthd, widthd | |
70 | %endif | |
71 | .loop: | |
72 | sub widthq, mmsize/2 | |
73 | mova m1, [b0q+2*widthq] | |
74 | mova m0, [b1q+2*widthq] | |
75 | COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2 | |
76 | mova [b1q+2*widthq], m0 | |
77 | jg .loop | |
78 | REP_RET | |
79 | ||
80 | ; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, | |
81 | ; int width) | |
82 | cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width | |
83 | mova m1, [pw_1] | |
84 | %if ARCH_X86_64 | |
85 | mov widthd, widthd | |
86 | %endif | |
87 | .loop: | |
88 | sub widthq, mmsize/2 | |
89 | mova m0, [b0q+2*widthq] | |
90 | paddw m0, [b2q+2*widthq] | |
91 | paddw m0, m1 | |
92 | psraw m0, 1 | |
93 | paddw m0, [b1q+2*widthq] | |
94 | mova [b1q+2*widthq], m0 | |
95 | jg .loop | |
96 | REP_RET | |
97 | ||
98 | ; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, | |
99 | ; IDWTELEM *b3, IDWTELEM *b4, int width) | |
100 | cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width | |
101 | mova m3, [pw_8] | |
102 | mova m4, [pw_1991] | |
103 | %if ARCH_X86_64 | |
104 | mov widthd, widthd | |
105 | %endif | |
106 | .loop: | |
107 | sub widthq, mmsize/2 | |
108 | mova m0, [b0q+2*widthq] | |
109 | mova m1, [b1q+2*widthq] | |
110 | COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq] | |
111 | mova [b2q+2*widthq], m1 | |
112 | jg .loop | |
113 | REP_RET | |
114 | ||
115 | ; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, | |
116 | ; IDWTELEM *b3, IDWTELEM *b4, int width) | |
117 | cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width | |
118 | mova m3, [pw_16] | |
119 | mova m4, [pw_1991] | |
120 | %if ARCH_X86_64 | |
121 | mov widthd, widthd | |
122 | %endif | |
123 | .loop: | |
124 | sub widthq, mmsize/2 | |
125 | mova m0, [b0q+2*widthq] | |
126 | mova m1, [b1q+2*widthq] | |
127 | mova m5, [b2q+2*widthq] | |
128 | paddw m0, [b4q+2*widthq] | |
129 | paddw m1, [b3q+2*widthq] | |
130 | psubw m0, m3 | |
131 | mova m2, m1 | |
132 | punpcklwd m1, m0 | |
133 | punpckhwd m2, m0 | |
134 | pmaddwd m1, m4 | |
135 | pmaddwd m2, m4 | |
136 | psrad m1, 5 | |
137 | psrad m2, 5 | |
138 | packssdw m1, m2 | |
139 | psubw m5, m1 | |
140 | mova [b2q+2*widthq], m5 | |
141 | jg .loop | |
142 | REP_RET | |
143 | ||
144 | ; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width) | |
145 | cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width | |
146 | mova m3, [pw_1] | |
147 | %if ARCH_X86_64 | |
148 | mov widthd, widthd | |
149 | %endif | |
150 | .loop: | |
151 | sub widthq, mmsize/2 | |
152 | mova m1, [b1q+2*widthq] | |
153 | mova m0, [b0q+2*widthq] | |
154 | mova m2, m1 | |
155 | paddw m1, m3 | |
156 | psraw m1, 1 | |
157 | psubw m0, m1 | |
158 | mova [b0q+2*widthq], m0 | |
159 | paddw m2, m0 | |
160 | mova [b1q+2*widthq], m2 | |
161 | jg .loop | |
162 | REP_RET | |
163 | %endmacro | |
164 | ||
165 | ; extend the left and right edges of the tmp array by %1 and %2 respectively | |
166 | %macro EDGE_EXTENSION 3 | |
167 | mov %3, [tmpq] | |
168 | %assign %%i 1 | |
169 | %rep %1 | |
170 | mov [tmpq-2*%%i], %3 | |
171 | %assign %%i %%i+1 | |
172 | %endrep | |
173 | mov %3, [tmpq+2*w2q-2] | |
174 | %assign %%i 0 | |
175 | %rep %2 | |
176 | mov [tmpq+2*w2q+2*%%i], %3 | |
177 | %assign %%i %%i+1 | |
178 | %endrep | |
179 | %endmacro | |
180 | ||
181 | ||
182 | %macro HAAR_HORIZONTAL 2 | |
183 | ; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width) | |
184 | cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2 | |
185 | mov w2d, wd | |
186 | xor xq, xq | |
187 | shr w2d, 1 | |
188 | lea b_w2q, [bq+wq] | |
189 | mova m3, [pw_1] | |
190 | .lowpass_loop: | |
191 | movu m1, [b_w2q + 2*xq] | |
192 | mova m0, [bq + 2*xq] | |
193 | paddw m1, m3 | |
194 | psraw m1, 1 | |
195 | psubw m0, m1 | |
196 | mova [tmpq + 2*xq], m0 | |
197 | add xq, mmsize/2 | |
198 | cmp xq, w2q | |
199 | jl .lowpass_loop | |
200 | ||
201 | xor xq, xq | |
202 | and w2q, ~(mmsize/2 - 1) | |
203 | cmp w2q, mmsize/2 | |
204 | jl .end | |
205 | ||
206 | .highpass_loop: | |
207 | movu m1, [b_w2q + 2*xq] | |
208 | mova m0, [tmpq + 2*xq] | |
209 | paddw m1, m0 | |
210 | ||
211 | ; shift and interleave | |
212 | %if %2 == 1 | |
213 | paddw m0, m3 | |
214 | paddw m1, m3 | |
215 | psraw m0, 1 | |
216 | psraw m1, 1 | |
217 | %endif | |
218 | mova m2, m0 | |
219 | punpcklwd m0, m1 | |
220 | punpckhwd m2, m1 | |
221 | mova [bq+4*xq], m0 | |
222 | mova [bq+4*xq+mmsize], m2 | |
223 | ||
224 | add xq, mmsize/2 | |
225 | cmp xq, w2q | |
226 | jl .highpass_loop | |
227 | .end: | |
228 | REP_RET | |
229 | %endmacro | |
230 | ||
231 | ||
232 | INIT_XMM | |
233 | ; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width) | |
234 | cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2 | |
235 | mov w2d, wd | |
236 | xor xd, xd | |
237 | shr w2d, 1 | |
238 | lea b_w2q, [bq+wq] | |
239 | movu m4, [bq+wq] | |
240 | mova m7, [pw_2] | |
241 | pslldq m4, 14 | |
242 | .lowpass_loop: | |
243 | movu m1, [b_w2q + 2*xq] | |
244 | mova m0, [bq + 2*xq] | |
245 | mova m2, m1 | |
246 | palignr m1, m4, 14 | |
247 | mova m4, m2 | |
248 | COMPOSE_53iL0 m0, m1, m2, m7 | |
249 | mova [tmpq + 2*xq], m0 | |
250 | add xd, mmsize/2 | |
251 | cmp xd, w2d | |
252 | jl .lowpass_loop | |
253 | ||
254 | EDGE_EXTENSION 1, 2, xw | |
255 | ; leave the last up to 7 (sse) or 3 (mmx) values for C | |
256 | xor xd, xd | |
257 | and w2d, ~(mmsize/2 - 1) | |
258 | cmp w2d, mmsize/2 | |
259 | jl .end | |
260 | ||
261 | mova m7, [tmpq-mmsize] | |
262 | mova m0, [tmpq] | |
263 | mova m5, [pw_1] | |
264 | mova m3, [pw_8] | |
265 | mova m4, [pw_1991] | |
266 | .highpass_loop: | |
267 | mova m6, m0 | |
268 | palignr m0, m7, 14 | |
269 | mova m7, [tmpq + 2*xq + 16] | |
270 | mova m1, m7 | |
271 | mova m2, m7 | |
272 | palignr m1, m6, 2 | |
273 | palignr m2, m6, 4 | |
274 | COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq] | |
275 | mova m0, m7 | |
276 | mova m7, m6 | |
277 | ||
278 | ; shift and interleave | |
279 | paddw m6, m5 | |
280 | paddw m1, m5 | |
281 | psraw m6, 1 | |
282 | psraw m1, 1 | |
283 | mova m2, m6 | |
284 | punpcklwd m6, m1 | |
285 | punpckhwd m2, m1 | |
286 | mova [bq+4*xq], m6 | |
287 | mova [bq+4*xq+mmsize], m2 | |
288 | ||
289 | add xd, mmsize/2 | |
290 | cmp xd, w2d | |
291 | jl .highpass_loop | |
292 | .end: | |
293 | REP_RET | |
294 | ||
295 | ||
296 | %if ARCH_X86_64 == 0 | |
297 | INIT_MMX | |
298 | COMPOSE_VERTICAL mmx | |
299 | HAAR_HORIZONTAL mmx, 0 | |
300 | HAAR_HORIZONTAL mmx, 1 | |
301 | %endif | |
302 | ||
303 | ;;INIT_XMM | |
304 | INIT_XMM | |
305 | COMPOSE_VERTICAL sse2 | |
306 | HAAR_HORIZONTAL sse2, 0 | |
307 | HAAR_HORIZONTAL sse2, 1 |