; libavcodec/x86/vp3dsp.asm
1 | ;****************************************************************************** |
2 | ;* MMX/SSE2-optimized functions for the VP3 decoder | |
3 | ;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> | |
4 | ;* | |
5 | ;* This file is part of FFmpeg. | |
6 | ;* | |
7 | ;* FFmpeg is free software; you can redistribute it and/or | |
8 | ;* modify it under the terms of the GNU Lesser General Public | |
9 | ;* License as published by the Free Software Foundation; either | |
10 | ;* version 2.1 of the License, or (at your option) any later version. | |
11 | ;* | |
12 | ;* FFmpeg is distributed in the hope that it will be useful, | |
13 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | ;* Lesser General Public License for more details. | |
16 | ;* | |
17 | ;* You should have received a copy of the GNU Lesser General Public | |
18 | ;* License along with FFmpeg; if not, write to the Free Software | |
19 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | ;****************************************************************************** | |
21 | ||
22 | %include "libavutil/x86/x86util.asm" | |
23 | ||
24 | ; MMX-optimized functions cribbed from the original VP3 source code. | |
25 | ||
26 | SECTION_RODATA | |
27 | ||
28 | vp3_idct_data: times 8 dw 64277 | |
29 | times 8 dw 60547 | |
30 | times 8 dw 54491 | |
31 | times 8 dw 46341 | |
32 | times 8 dw 36410 | |
33 | times 8 dw 25080 | |
34 | times 8 dw 12785 | |
35 | ||
36 | pb_7: times 8 db 0x07 | |
37 | pb_1F: times 8 db 0x1f | |
38 | pb_81: times 8 db 0x81 | |
39 | ||
40 | cextern pb_1 | |
41 | cextern pb_3 | |
42 | cextern pb_80 | |
43 | ||
44 | cextern pw_8 | |
45 | ||
46 | SECTION .text | |
47 | ||
48 | ; this is off by one or two for some cases when filter_limit is greater than 63 | |
49 | ; in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1 | |
50 | ; out: p1 in mm4, p2 in mm3 | |
; this is off by one or two for some cases when filter_limit is greater than 63
; in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
; out: p1 in mm4, p2 in mm3
; Branchless VP3 deblocking of one 8-pixel edge.  Computes (approximately,
; see the note above) d = (p0 - p3 + 3*(p2 - p1) + 4) >> 3 using pavgb
; averaging tricks (the low bits lost by the averaging shifts are folded
; back in as "extra bits"), splits d into its positive/negative magnitudes
; biased around 0x81, clamps both against the bounding value read from
; [r2+516], and applies the result to p1/p2 with saturating byte adds.
%macro VP3_LOOP_FILTER 0
    movq          m7, m6
    pand          m6, [pb_7]      ; p0&7
    psrlw         m7, 3
    pand          m7, [pb_1F]     ; p0>>3
    movq          m3, m2          ; p2
    pxor          m2, m4
    pand          m2, [pb_1]      ; (p2^p1)&1
    movq          m5, m2
    paddb         m2, m2
    paddb         m2, m5          ; 3*(p2^p1)&1
    paddb         m2, m6          ; extra bits lost in shifts
    pcmpeqb       m0, m0
    pxor          m1, m0          ; 255 - p3
    pavgb         m1, m2          ; (256 - p3 + extrabits) >> 1
    pxor          m0, m4          ; 255 - p1
    pavgb         m0, m3          ; (256 + p2-p1) >> 1
    paddb         m1, [pb_3]
    pavgb         m1, m0          ; 128+2+(  p2-p1 - p3) >> 2
    pavgb         m1, m0          ; 128+1+(3*(p2-p1) - p3) >> 3
    paddusb       m7, m1          ; d+128+1
    movq          m6, [pb_81]
    psubusb       m6, m7          ; m6 = max(0, -d)  (negative magnitude)
    psubusb       m7, [pb_81]     ; m7 = max(0,  d)  (positive magnitude)

    movq          m5, [r2+516]    ; flim (bounding value for this edge)
    pminub        m6, m5          ; clamp both magnitudes to flim
    pminub        m7, m5
    movq          m0, m6
    movq          m1, m7
    paddb         m6, m6
    paddb         m7, m7
    pminub        m6, m5          ; min(2*d, flim) - d implements the
    pminub        m7, m5          ; "ramp down" of the bounding function
    psubb         m6, m0
    psubb         m7, m1
    paddusb       m4, m7          ; p1 += bounded d
    psubusb       m4, m6
    psubusb       m3, m7          ; p2 -= bounded d
    paddusb       m3, m6
%endmacro
92 | ||
; Scatters the four 16-bit lanes of MMX register %1 to four consecutive
; image rows, two bytes per row, starting at [r0-1].
; Expects r1 = stride, r3 = 3*stride.  Clobbers r2 and shifts %1.
%macro STORE_4_WORDS 1
    movd         r2d, %1
    mov     [r0   -1], r2w         ; row 0: low word
    psrlq         %1, 32
    shr           r2, 16
    mov  [r0+r1  -1], r2w          ; row 1: high word of low dword
    movd         r2d, %1
    mov  [r0+r1*2-1], r2w          ; row 2: low word of high dword
    shr           r2, 16
    mov  [r0+r3  -1], r2w          ; row 3: high word
%endmacro
104 | ||
INIT_MMX mmxext
; vp3_v_loop_filter(r0 = src, r1 = stride, r2 = bounding-values table)
; Filters the horizontal block edge passing just above r0: the two rows
; above the edge are p0/p1, the two rows below are p2/p3.  Only the rows
; adjacent to the edge (p1 and p2) are written back.  The filter limit
; is read from [r2+516] inside VP3_LOOP_FILTER.
cglobal vp3_v_loop_filter, 3, 4
%if ARCH_X86_64
    movsxd        r1, r1d         ; sign-extend the 32-bit stride argument
%endif
    mov           r3, r1
    neg           r1              ; r1 = -stride, r3 = +stride
    movq          m6, [r0+r1*2]   ; p0 = row -2
    movq          m4, [r0+r1 ]    ; p1 = row -1
    movq          m2, [r0    ]    ; p2 = row  0
    movq          m1, [r0+r3 ]    ; p3 = row +1

    VP3_LOOP_FILTER

    movq     [r0+r1], m4          ; store filtered p1
    movq     [r0   ], m3          ; store filtered p2
    RET
122 | ||
; vp3_h_loop_filter(r0 = src, r1 = stride, r2 = bounding-values table)
; Filters the vertical block edge immediately left of r0.  Loads 4 bytes
; from each of 8 rows (columns -2..+1), transposes so each column becomes
; one of the p0..p3 inputs of VP3_LOOP_FILTER, filters, re-interleaves
; the two modified columns with SBUTTERFLY and scatters them back with
; STORE_4_WORDS.
cglobal vp3_h_loop_filter, 3, 4
%if ARCH_X86_64
    movsxd        r1, r1d             ; sign-extend the 32-bit stride argument
%endif
    lea           r3, [r1*3]

    movd          m6, [r0     -2]     ; rows 0..3, columns -2..+1
    movd          m4, [r0+r1  -2]
    movd          m2, [r0+r1*2-2]
    movd          m1, [r0+r3  -2]
    lea           r0, [r0+r1*4 ]
    punpcklbw     m6, [r0     -2]     ; interleave with rows 4..7
    punpcklbw     m4, [r0+r1  -2]
    punpcklbw     m2, [r0+r1*2-2]
    punpcklbw     m1, [r0+r3  -2]
    sub           r0, r3
    sub           r0, r1               ; r0 back to row 0

    TRANSPOSE4x4B 6, 4, 2, 1, 0        ; columns -> p0..p3 in m6/m4/m2/m1
    VP3_LOOP_FILTER
    SBUTTERFLY    bw, 4, 3, 5          ; re-interleave filtered p1/p2 bytes

    STORE_4_WORDS m4                   ; rows 0..3
    lea           r0, [r0+r1*4 ]
    STORE_4_WORDS m3                   ; rows 4..7
    RET
149 | ||
; from original comments: The Macro does IDct on 4 1-D Dcts
; Shared head of the MMX 1-D VP3 IDCT, operating on 4 columns (words) at
; a time.  Inputs are read through the I()/J() address macros, cosine
; constants through C().  Constants with the top bit set are negative to
; signed pmulhw, so those products come out as "c*x - x" and are fixed up
; with a following paddw (see per-line comments).
; On exit: m0 = C., m1 = H., m2 = R2, m3 = i0+i4, m4 = E, m5 = B..,
; m6 = F., m7 = G; C. is also saved at I(1) and D. at I(2).
%macro BeginIDCT 0
    movq          m2, I(3)
    movq          m6, C(3)
    movq          m4, m2
    movq          m7, J(5)
    pmulhw        m4, m6          ; r4 = c3*i3 - i3
    movq          m1, C(5)
    pmulhw        m6, m7          ; r6 = c3*i5 - i5
    movq          m5, m1
    pmulhw        m1, m2          ; r1 = c5*i3 - i3
    movq          m3, I(1)
    pmulhw        m5, m7          ; r5 = c5*i5 - i5
    movq          m0, C(1)
    paddw         m4, m2          ; r4 = c3*i3
    paddw         m6, m7          ; r6 = c3*i5
    paddw         m2, m1          ; r2 = c5*i3
    movq          m1, J(7)
    paddw         m7, m5          ; r7 = c5*i5
    movq          m5, m0          ; r5 = c1
    pmulhw        m0, m3          ; r0 = c1*i1 - i1
    paddsw        m4, m7          ; r4 = C = c3*i3 + c5*i5
    pmulhw        m5, m1          ; r5 = c1*i7 - i7
    movq          m7, C(7)
    psubsw        m6, m2          ; r6 = D = c3*i5 - c5*i3
    paddw         m0, m3          ; r0 = c1*i1
    pmulhw        m3, m7          ; r3 = c7*i1
    movq          m2, I(2)
    pmulhw        m7, m1          ; r7 = c7*i7
    paddw         m5, m1          ; r5 = c1*i7
    movq          m1, m2          ; r1 = i2
    pmulhw        m2, C(2)        ; r2 = c2*i2 - i2
    psubsw        m3, m5          ; r3 = B = c7*i1 - c1*i7
    movq          m5, J(6)
    paddsw        m0, m7          ; r0 = A = c1*i1 + c7*i7
    movq          m7, m5          ; r7 = i6
    psubsw        m0, m4          ; r0 = A - C
    pmulhw        m5, C(2)        ; r5 = c2*i6 - i6
    paddw         m2, m1          ; r2 = c2*i2
    pmulhw        m1, C(6)        ; r1 = c6*i2
    paddsw        m4, m4          ; r4 = C + C
    paddsw        m4, m0          ; r4 = C. = A + C
    psubsw        m3, m6          ; r3 = B - D
    paddw         m5, m7          ; r5 = c2*i6
    paddsw        m6, m6          ; r6 = D + D
    pmulhw        m7, C(6)        ; r7 = c6*i6
    paddsw        m6, m3          ; r6 = D. = B + D
    movq        I(1), m4          ; save C. at I(1)
    psubsw        m1, m5          ; r1 = H = c6*i2 - c2*i6
    movq          m4, C(4)
    movq          m5, m3          ; r5 = B - D
    pmulhw        m3, m4          ; r3 = (c4 - 1) * (B - D)
    paddsw        m7, m2          ; r7 = G = c2*i2 + c6*i6
    movq        I(2), m6          ; save D. at I(2)
    movq          m2, m0          ; r2 = A - C
    movq          m6, I(0)
    pmulhw        m0, m4          ; r0 = (c4 - 1) * (A - C)
    paddw         m5, m3          ; r5 = B. = c4 * (B - D)
    movq          m3, J(4)
    psubsw        m5, m1          ; r5 = B.. = B. - H
    paddw         m2, m0          ; r2 = A. = c4 * (A - C)
    psubsw        m6, m3          ; r6 = i0 - i4
    movq          m0, m6
    pmulhw        m6, m4          ; r6 = (c4 - 1) * (i0 - i4)
    paddsw        m3, m3          ; r3 = i4 + i4
    paddsw        m1, m1          ; r1 = H + H
    paddsw        m3, m0          ; r3 = i0 + i4
    paddsw        m1, m5          ; r1 = H. = B + H
    pmulhw        m4, m3          ; r4 = (c4 - 1) * (i0 + i4)
    paddsw        m6, m0          ; r6 = F = c4 * (i0 - i4)
    psubsw        m6, m2          ; r6 = F. = F - A.
    paddsw        m2, m2          ; r2 = A. + A.
    movq          m0, I(1)        ; r0 = C.
    paddsw        m2, m6          ; r2 = A.. = F + A.
    paddw         m4, m3          ; r4 = E = c4 * (i0 + i4)
    psubsw        m2, m1          ; r2 = R2 = A.. - H.
%endmacro
227 | ||
; RowIDCT gets ready to transpose
; Row pass: runs BeginIDCT, then combines the partial results into the
; eight un-normalized outputs R0..R7 (no rounding/shift here — that is
; done by ColumnIDCT).  R1 is spilled to I(1); the rest stay in regs.
%macro RowIDCT 0
    BeginIDCT
    movq          m3, I(2)        ; r3 = D.
    psubsw        m4, m7          ; r4 = E. = E - G
    paddsw        m1, m1          ; r1 = H. + H.
    paddsw        m7, m7          ; r7 = G + G
    paddsw        m1, m2          ; r1 = R1 = A.. + H.
    paddsw        m7, m4          ; r7 = G. = E + G
    psubsw        m4, m3          ; r4 = R4 = E. - D.
    paddsw        m3, m3
    psubsw        m6, m5          ; r6 = R6 = F. - B..
    paddsw        m5, m5
    paddsw        m3, m4          ; r3 = R3 = E. + D.
    paddsw        m5, m6          ; r5 = R5 = F. + B..
    psubsw        m7, m0          ; r7 = R7 = G. - C.
    paddsw        m0, m0
    movq        I(1), m1          ; save R1
    paddsw        m0, m7          ; r0 = R0 = G. + C.
%endmacro
248 | ||
; Column IDCT normalizes and stores final results
; Column pass: runs BeginIDCT, forms R0..R7, adds the rounding constant
; OC_8 and arithmetic-shifts each result right by 4, then stores all
; eight normalized outputs through the I()/J() address macros.
%macro ColumnIDCT 0
    BeginIDCT
    paddsw        m2, OC_8        ; adjust R2 (and R1) for shift
    paddsw        m1, m1          ; r1 = H. + H.
    paddsw        m1, m2          ; r1 = R1 = A.. + H.
    psraw         m2, 4           ; r2 = NR2
    psubsw        m4, m7          ; r4 = E. = E - G
    psraw         m1, 4           ; r1 = NR1
    movq          m3, I(2)        ; r3 = D.
    paddsw        m7, m7          ; r7 = G + G
    movq        I(2), m2          ; store NR2 at I2
    paddsw        m7, m4          ; r7 = G. = E + G
    movq        I(1), m1          ; store NR1 at I1
    psubsw        m4, m3          ; r4 = R4 = E. - D.
    paddsw        m4, OC_8        ; adjust R4 (and R3) for shift
    paddsw        m3, m3          ; r3 = D. + D.
    paddsw        m3, m4          ; r3 = R3 = E. + D.
    psraw         m4, 4           ; r4 = NR4
    psubsw        m6, m5          ; r6 = R6 = F. - B..
    psraw         m3, 4           ; r3 = NR3
    paddsw        m6, OC_8        ; adjust R6 (and R5) for shift
    paddsw        m5, m5          ; r5 = B.. + B..
    paddsw        m5, m6          ; r5 = R5 = F. + B..
    psraw         m6, 4           ; r6 = NR6
    movq        J(4), m4          ; store NR4 at J4
    psraw         m5, 4           ; r5 = NR5
    movq        I(3), m3          ; store NR3 at I3
    psubsw        m7, m0          ; r7 = R7 = G. - C.
    paddsw        m7, OC_8        ; adjust R7 (and R0) for shift
    paddsw        m0, m0          ; r0 = C. + C.
    paddsw        m0, m7          ; r0 = R0 = G. + C.
    psraw         m7, 4           ; r7 = NR7
    movq        J(6), m6          ; store NR6 at J6
    psraw         m0, 4           ; r0 = NR0
    movq        J(5), m5          ; store NR5 at J5
    movq        J(7), m7          ; store NR7 at J7
    movq        I(0), m0          ; store NR0 at I0
%endmacro
288 | ||
289 | ; Following macro does two 4x4 transposes in place. | |
290 | ; | |
291 | ; At entry (we assume): | |
292 | ; | |
293 | ; r0 = a3 a2 a1 a0 | |
294 | ; I(1) = b3 b2 b1 b0 | |
295 | ; r2 = c3 c2 c1 c0 | |
296 | ; r3 = d3 d2 d1 d0 | |
297 | ; | |
298 | ; r4 = e3 e2 e1 e0 | |
299 | ; r5 = f3 f2 f1 f0 | |
300 | ; r6 = g3 g2 g1 g0 | |
301 | ; r7 = h3 h2 h1 h0 | |
302 | ; | |
303 | ; At exit, we have: | |
304 | ; | |
305 | ; I(0) = d0 c0 b0 a0 | |
306 | ; I(1) = d1 c1 b1 a1 | |
307 | ; I(2) = d2 c2 b2 a2 | |
308 | ; I(3) = d3 c3 b3 a3 | |
309 | ; | |
310 | ; J(4) = h0 g0 f0 e0 | |
311 | ; J(5) = h1 g1 f1 e1 | |
312 | ; J(6) = h2 g2 f2 e2 | |
313 | ; J(7) = h3 g3 f3 e3 | |
314 | ; | |
315 | ; I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. | |
316 | ; J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. | |
317 | ; | |
318 | ; Since r1 is free at entry, we calculate the Js first. | |
; Performs the two in-place 4x4 word transposes described in the comment
; block above.  The e/f/g/h block (Js) is done first because only r1 is
; free at entry; I(0) is used as a spill slot for row a meanwhile.
%macro Transpose 0
    movq          m1, m4          ; r1 = e3 e2 e1 e0
    punpcklwd     m4, m5          ; r4 = f1 e1 f0 e0
    movq        I(0), m0          ; save a3 a2 a1 a0
    punpckhwd     m1, m5          ; r1 = f3 e3 f2 e2
    movq          m0, m6          ; r0 = g3 g2 g1 g0
    punpcklwd     m6, m7          ; r6 = h1 g1 h0 g0
    movq          m5, m4          ; r5 = f1 e1 f0 e0
    punpckldq     m4, m6          ; r4 = h0 g0 f0 e0 = R4
    punpckhdq     m5, m6          ; r5 = h1 g1 f1 e1 = R5
    movq          m6, m1          ; r6 = f3 e3 f2 e2
    movq        J(4), m4
    punpckhwd     m0, m7          ; r0 = h3 g3 h2 g2
    movq        J(5), m5
    punpckhdq     m6, m0          ; r6 = h3 g3 f3 e3 = R7
    movq          m4, I(0)        ; r4 = a3 a2 a1 a0
    punpckldq     m1, m0          ; r1 = h2 g2 f2 e2 = R6
    movq          m5, I(1)        ; r5 = b3 b2 b1 b0
    movq          m0, m4          ; r0 = a3 a2 a1 a0
    movq        J(7), m6
    punpcklwd     m0, m5          ; r0 = b1 a1 b0 a0
    movq        J(6), m1
    punpckhwd     m4, m5          ; r4 = b3 a3 b2 a2
    movq          m5, m2          ; r5 = c3 c2 c1 c0
    punpcklwd     m2, m3          ; r2 = d1 c1 d0 c0
    movq          m1, m0          ; r1 = b1 a1 b0 a0
    punpckldq     m0, m2          ; r0 = d0 c0 b0 a0 = R0
    punpckhdq     m1, m2          ; r1 = d1 c1 b1 a1 = R1
    movq          m2, m4          ; r2 = b3 a3 b2 a2
    movq        I(0), m0
    punpckhwd     m5, m3          ; r5 = d3 c3 d2 c2
    movq        I(1), m1
    punpckhdq     m4, m5          ; r4 = d3 c3 b3 a3 = R3
    punpckldq     m2, m5          ; r2 = d2 c2 b2 a2 = R2
    movq        I(3), m4
    movq        I(2), m2
%endmacro
356 | ||
; One full 1-D VP3 IDCT pass over eight 16-bit lanes (SSE2).  Same
; algorithm as BeginIDCT + Row/ColumnIDCT above, but a whole 8x8 pass per
; invocation.  I()/C() address the coefficients and the cosine table; the
; ADD()/SHIFT() hooks are empty for the un-normalized first (row) pass
; and perform the +8 rounding / >>4 shift for the second (column) pass.
%macro VP3_1D_IDCT_SSE2 0
    movdqa        m2, I(3)        ; xmm2 = i3
    movdqa        m6, C(3)        ; xmm6 = c3
    movdqa        m4, m2          ; xmm4 = i3
    movdqa        m7, I(5)        ; xmm7 = i5
    pmulhw        m4, m6          ; xmm4 = c3 * i3 - i3
    movdqa        m1, C(5)        ; xmm1 = c5
    pmulhw        m6, m7          ; xmm6 = c3 * i5 - i5
    movdqa        m5, m1          ; xmm5 = c5
    pmulhw        m1, m2          ; xmm1 = c5 * i3 - i3
    movdqa        m3, I(1)        ; xmm3 = i1
    pmulhw        m5, m7          ; xmm5 = c5 * i5 - i5
    movdqa        m0, C(1)        ; xmm0 = c1
    paddw         m4, m2          ; xmm4 = c3 * i3
    paddw         m6, m7          ; xmm6 = c3 * i5
    paddw         m2, m1          ; xmm2 = c5 * i3
    movdqa        m1, I(7)        ; xmm1 = i7
    paddw         m7, m5          ; xmm7 = c5 * i5
    movdqa        m5, m0          ; xmm5 = c1
    pmulhw        m0, m3          ; xmm0 = c1 * i1 - i1
    paddsw        m4, m7          ; xmm4 = c3 * i3 + c5 * i5 = C
    pmulhw        m5, m1          ; xmm5 = c1 * i7 - i7
    movdqa        m7, C(7)        ; xmm7 = c7
    psubsw        m6, m2          ; xmm6 = c3 * i5 - c5 * i3 = D
    paddw         m0, m3          ; xmm0 = c1 * i1
    pmulhw        m3, m7          ; xmm3 = c7 * i1
    movdqa        m2, I(2)        ; xmm2 = i2
    pmulhw        m7, m1          ; xmm7 = c7 * i7
    paddw         m5, m1          ; xmm5 = c1 * i7
    movdqa        m1, m2          ; xmm1 = i2
    pmulhw        m2, C(2)        ; xmm2 = i2 * c2 - i2
    psubsw        m3, m5          ; xmm3 = c7 * i1 - c1 * i7 = B
    movdqa        m5, I(6)        ; xmm5 = i6
    paddsw        m0, m7          ; xmm0 = c1 * i1 + c7 * i7 = A
    movdqa        m7, m5          ; xmm7 = i6
    psubsw        m0, m4          ; xmm0 = A - C
    pmulhw        m5, C(2)        ; xmm5 = c2 * i6 - i6
    paddw         m2, m1          ; xmm2 = i2 * c2
    pmulhw        m1, C(6)        ; xmm1 = c6 * i2
    paddsw        m4, m4          ; xmm4 = C + C
    paddsw        m4, m0          ; xmm4 = A + C = C.
    psubsw        m3, m6          ; xmm3 = B - D
    paddw         m5, m7          ; xmm5 = c2 * i6
    paddsw        m6, m6          ; xmm6 = D + D
    pmulhw        m7, C(6)        ; xmm7 = c6 * i6
    paddsw        m6, m3          ; xmm6 = B + D = D.
    movdqa      I(1), m4          ; Save C. at I(1)
    psubsw        m1, m5          ; xmm1 = c6 * i2 - c2 * i6 = H
    movdqa        m4, C(4)        ; xmm4 = C4
    movdqa        m5, m3          ; xmm5 = B - D
    pmulhw        m3, m4          ; xmm3 = ( c4 - 1 ) * ( B - D )
    paddsw        m7, m2          ; xmm7 = c2 * i2 + c6 * i6 = G
    movdqa      I(2), m6          ; save D. at I(2)
    movdqa        m2, m0          ; xmm2 = A - C
    movdqa        m6, I(0)        ; xmm6 = i0
    pmulhw        m0, m4          ; xmm0 = ( c4 - 1 ) * ( A - C )
    paddw         m5, m3          ; xmm5 = c4 * ( B - D ) = B.
    movdqa        m3, I(4)        ; xmm3 = i4
    psubsw        m5, m1          ; xmm5 = B. - H = B..
    paddw         m2, m0          ; xmm2 = c4 * ( A - C ) = A.
    psubsw        m6, m3          ; xmm6 = i0 - i4
    movdqa        m0, m6          ; xmm0 = i0 - i4
    pmulhw        m6, m4          ; xmm6 = ( c4 - 1 ) * ( i0 - i4 )
    paddsw        m3, m3          ; xmm3 = i4 + i4
    paddsw        m1, m1          ; xmm1 = H + H
    paddsw        m3, m0          ; xmm3 = i0 + i4
    paddsw        m1, m5          ; xmm1 = B. + H = H.
    pmulhw        m4, m3          ; xmm4 = ( c4 - 1 ) * ( i0 + i4 )
    paddw         m6, m0          ; xmm6 = c4 * ( i0 - i4 ) = F
    psubsw        m6, m2          ; xmm6 = F - A. = F.
    paddsw        m2, m2          ; xmm2 = A. + A.
    movdqa        m0, I(1)        ; Load C. from I(1)
    paddsw        m2, m6          ; xmm2 = F + A. = A..
    paddw         m4, m3          ; xmm4 = c4 * ( i0 + i4 ) = E
    psubsw        m2, m1          ; xmm2 = A.. - H. = R2
    ADD(m2)                       ; Adjust R2 and R1 before shifting
    paddsw        m1, m1          ; xmm1 = H. + H.
    paddsw        m1, m2          ; xmm1 = A.. + H. = R1
    SHIFT(m2)                     ; xmm2 = op2
    psubsw        m4, m7          ; xmm4 = E - G = E.
    SHIFT(m1)                     ; xmm1 = op1
    movdqa        m3, I(2)        ; Load D. from I(2)
    paddsw        m7, m7          ; xmm7 = G + G
    paddsw        m7, m4          ; xmm7 = E + G = G.
    psubsw        m4, m3          ; xmm4 = E. - D. = R4
    ADD(m4)                       ; Adjust R4 and R3 before shifting
    paddsw        m3, m3          ; xmm3 = D. + D.
    paddsw        m3, m4          ; xmm3 = E. + D. = R3
    SHIFT(m4)                     ; xmm4 = op4
    psubsw        m6, m5          ; xmm6 = F. - B.. = R6
    SHIFT(m3)                     ; xmm3 = op3
    ADD(m6)                       ; Adjust R6 and R5 before shifting
    paddsw        m5, m5          ; xmm5 = B.. + B..
    paddsw        m5, m6          ; xmm5 = F. + B.. = R5
    SHIFT(m6)                     ; xmm6 = op6
    SHIFT(m5)                     ; xmm5 = op5
    psubsw        m7, m0          ; xmm7 = G. - C. = R7
    ADD(m7)                       ; Adjust R7 and R0 before shifting
    paddsw        m0, m0          ; xmm0 = C. + C.
    paddsw        m0, m7          ; xmm0 = G. + C. = R0
    SHIFT(m7)                     ; xmm7 = op7
    SHIFT(m0)                     ; xmm0 = op0
%endmacro
460 | ||
; Store the eight registers named by %1..%8 to output slots O(0)..O(7),
; in argument order.
%macro PUT_BLOCK 8
%assign %%slot 0
%rep 8
    movdqa  O(%%slot), m%1
    %rotate 1
%assign %%slot %%slot+1
%endrep
%endmacro
471 | ||
; Full in-place 2-D 8x8 IDCT of the int16 coefficient block at address %1.
; mmsize == 16: two SSE2 1-D passes with an 8x8 word transpose between
; them; the first pass is un-normalized (ADD/SHIFT defined empty), the
; second adds pw_8 and shifts right by 4.
; mmsize == 8: two MMX RowIDCT+Transpose passes over the 4x8 halves,
; then two ColumnIDCT passes which normalize while storing.
%macro VP3_IDCT 1
%if mmsize == 16
%define I(x) [%1+16*x]
%define O(x) [%1+16*x]
%define C(x) [vp3_idct_data+16*(x-1)]
%define SHIFT(x)
%define ADD(x)
        VP3_1D_IDCT_SSE2
%if ARCH_X86_64
        TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
        TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16]
%endif
        PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7

%define SHIFT(x) psraw x, 4
%define ADD(x)   paddsw x, [pw_8]
        VP3_1D_IDCT_SSE2
        PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%else ; mmsize == 8
        ; Historical register notes from the original VP3 source (the
        ; code below addresses everything relative to %1 instead):
        ;   eax = quantized input
        ;   ebx = dequantizer matrix
        ;   ecx = IDCT constants
        ;     M(I) = ecx + MaskOffset(0) + I * 8
        ;     C(I) = ecx + CosineOffset(32) + (I-1) * 8
        ;   edx = output
        ;   r0..r7 = mm0..mm7
%define OC_8 [pw_8]
%define C(x) [vp3_idct_data+16*(x-1)]

        ; at this point, function has completed dequantization + dezigzag +
        ; partial transposition; now do the idct itself
%define I(x) [%1+16*x]
%define J(x) [%1+16*x]
        RowIDCT                   ; row pass, first 4x8 half
        Transpose

%define I(x) [%1+16*x+8]
%define J(x) [%1+16*x+8]
        RowIDCT                   ; row pass, second 4x8 half
        Transpose

%define I(x) [%1+16* x]
%define J(x) [%1+16*(x-4)+8]
        ColumnIDCT                ; column pass + normalization, first half

%define I(x) [%1+16* x +64]
%define J(x) [%1+16*(x-4)+72]
        ColumnIDCT                ; column pass + normalization, second half
%endif ; mmsize == 16/8
%endmacro
523 | ||
; Emits vp3_idct_put / vp3_idct_add for the current instruction set
; (INIT_MMX mmx or INIT_XMM sse2).  Both take r0 = dest, r1 = stride,
; r2 = int16 coefficient block, and both zero the 128-byte coefficient
; block before returning.
%macro vp3_idct_funcs 0
; IDCT the block and write the result to dest, biasing the signed
; output back into the unsigned pixel range (+128 via pb_80).
cglobal vp3_idct_put, 3, 4, 9
    VP3_IDCT      r2

    movsxdifnidn  r1, r1d
    mova          m4, [pb_80]     ; +128 bias applied after packing
    lea           r3, [r1*3]
%assign %%i 0
%rep 16/mmsize
    ; pack pairs of word rows to signed bytes, bias, store 8 rows
    mova          m0, [r2+mmsize*0+%%i]
    mova          m1, [r2+mmsize*2+%%i]
    mova          m2, [r2+mmsize*4+%%i]
    mova          m3, [r2+mmsize*6+%%i]
%if mmsize == 8
    packsswb      m0, [r2+mmsize*8+%%i]
    packsswb      m1, [r2+mmsize*10+%%i]
    packsswb      m2, [r2+mmsize*12+%%i]
    packsswb      m3, [r2+mmsize*14+%%i]
%else
    packsswb      m0, [r2+mmsize*1+%%i]
    packsswb      m1, [r2+mmsize*3+%%i]
    packsswb      m2, [r2+mmsize*5+%%i]
    packsswb      m3, [r2+mmsize*7+%%i]
%endif
    paddb         m0, m4
    paddb         m1, m4
    paddb         m2, m4
    paddb         m3, m4
    movq   [r0     ], m0
%if mmsize == 8
    movq   [r0+r1  ], m1
    movq   [r0+r1*2], m2
    movq   [r0+r3  ], m3
%else
    movhps [r0+r1  ], m0
    movq   [r0+r1*2], m1
    movhps [r0+r3  ], m1
%endif
%if %%i == 0
    lea           r0, [r0+r1*4]  ; advance to rows 4-7
%endif
%if mmsize == 16
    movq   [r0     ], m2
    movhps [r0+r1  ], m2
    movq   [r0+r1*2], m3
    movhps [r0+r3  ], m3
%endif
%assign %%i %%i+8
%endrep

    ; clear the 128-byte coefficient block for the caller
    pxor          m0, m0
%assign %%offset 0
%rep 128/mmsize
    mova [r2+%%offset], m0
%assign %%offset %%offset+mmsize
%endrep
    RET

; IDCT the block and add the result to the pixels already at dest,
; clamping to the unsigned byte range via packuswb.
cglobal vp3_idct_add, 3, 4, 9
    VP3_IDCT      r2

    movsxdifnidn  r1, r1d
    lea           r3, [r1*3]
    pxor          m4, m4          ; zero register for byte->word unpack
%if mmsize == 16
%assign %%i 0
%rep 2
    ; widen 4 pixel rows to words, add coefficients, repack and store
    movq          m0, [r0]
    movq          m1, [r0+r1]
    movq          m2, [r0+r1*2]
    movq          m3, [r0+r3]
    punpcklbw     m0, m4
    punpcklbw     m1, m4
    punpcklbw     m2, m4
    punpcklbw     m3, m4
    paddsw        m0, [r2+ 0+%%i]
    paddsw        m1, [r2+16+%%i]
    paddsw        m2, [r2+32+%%i]
    paddsw        m3, [r2+48+%%i]
    packuswb      m0, m1
    packuswb      m2, m3
    movq   [r0     ], m0
    movhps [r0+r1  ], m0
    movq   [r0+r1*2], m2
    movhps [r0+r3  ], m2
%if %%i == 0
    lea           r0, [r0+r1*4]  ; advance to rows 4-7
%endif
%assign %%i %%i+64
%endrep
%else
%assign %%i 0
%rep 2
    ; MMX: low and high halves of each row handled separately
    movq          m0, [r0]
    movq          m1, [r0+r1]
    movq          m2, [r0+r1*2]
    movq          m3, [r0+r3]
    movq          m5, m0
    movq          m6, m1
    movq          m7, m2
    punpcklbw     m0, m4
    punpcklbw     m1, m4
    punpcklbw     m2, m4
    punpckhbw     m5, m4
    punpckhbw     m6, m4
    punpckhbw     m7, m4
    paddsw        m0, [r2+ 0+%%i]
    paddsw        m1, [r2+16+%%i]
    paddsw        m2, [r2+32+%%i]
    paddsw        m5, [r2+64+%%i]
    paddsw        m6, [r2+80+%%i]
    paddsw        m7, [r2+96+%%i]
    packuswb      m0, m5
    movq          m5, m3
    punpcklbw     m3, m4
    punpckhbw     m5, m4
    packuswb      m1, m6
    paddsw        m3, [r2+48+%%i]
    paddsw        m5, [r2+112+%%i]
    packuswb      m2, m7
    packuswb      m3, m5
    movq   [r0     ], m0
    movq   [r0+r1  ], m1
    movq   [r0+r1*2], m2
    movq   [r0+r3  ], m3
%if %%i == 0
    lea           r0, [r0+r1*4]  ; advance to rows 4-7
%endif
%assign %%i %%i+8
%endrep
%endif
    ; clear the 128-byte coefficient block (m4 is still zero)
%assign %%i 0
%rep 128/mmsize
    mova     [r2+%%i], m4
%assign %%i %%i+mmsize
%endrep
    RET
%endmacro
662 | ||
; Instantiate the IDCT put/add functions: a plain-MMX version only on
; x86-32 (x86-64 always has SSE2), plus an SSE2 version everywhere.
%if ARCH_X86_32
INIT_MMX mmx
vp3_idct_funcs
%endif

INIT_XMM sse2
vp3_idct_funcs
670 | ||
; Adds a broadcast DC value to 4 rows of 8 pixels at r0 (r1 = stride,
; r2 = 3*stride).  m0 holds max(dc, 0) and m1 holds max(-dc, 0) in every
; byte (set up by the caller), so paddusb followed by psubusb applies
; the signed dc with unsigned saturation.  Clobbers m2-m5.
%macro DC_ADD 0
    movq          m2, [r0     ]
    movq          m3, [r0+r1  ]
    paddusb       m2, m0
    movq          m4, [r0+r1*2]
    paddusb       m3, m0
    movq          m5, [r0+r2  ]
    paddusb       m4, m0
    paddusb       m5, m0
    psubusb       m2, m1
    psubusb       m3, m1
    movq   [r0     ], m2
    psubusb       m4, m1
    movq   [r0+r1  ], m3
    psubusb       m5, m1
    movq   [r0+r1*2], m4
    movq   [r0+r2  ], m5
%endmacro
689 | ||
INIT_MMX mmxext
; vp3_idct_dc_add(r0 = dest, r1 = stride, r2 = int16 coefficient block)
; DC-only inverse transform: dc = (block[0] + 15) >> 5 is added with
; saturation to all 8x8 pixels at dest, and block[0] is cleared.
cglobal vp3_idct_dc_add, 3, 4
%if ARCH_X86_64
    movsxd        r1, r1d             ; sign-extend the 32-bit stride argument
%endif
    movsx         r3, word [r2]       ; r3 = block[0] (DC coefficient)
    mov     word [r2], 0              ; clear the coefficient for the caller
    lea           r2, [r1*3]          ; block ptr no longer needed: r2 = 3*stride
    add           r3, 15
    sar           r3, 5               ; dc = (block[0] + 15) >> 5
    movd          m0, r3d
    pshufw        m0, m0, 0x0         ; broadcast dc to all 4 words
    pxor          m1, m1
    psubw         m1, m0              ; m1 = -dc
    packuswb      m0, m0              ; m0 bytes = clamp( dc, 0, 255)
    packuswb      m1, m1              ; m1 bytes = clamp(-dc, 0, 255)
    DC_ADD                            ; rows 0-3
    lea           r0, [r0+r1*4]
    DC_ADD                            ; rows 4-7
    RET