;******************************************************************************
;* MMX/SSE2-optimized functions for the VP3 decoder
;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

; MMX-optimized functions cribbed from the original VP3 source code.

SECTION_RODATA

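; The table below holds the IDCT cosine constants c1..c7, i.e.
; round(cos(k*pi/16) * 2^16) for k = 1..7.  The first five values do not fit
; in a signed 16-bit word, so a pmulhw against C(k) actually yields c*x - x,
; and the code adds x back in afterwards; hence the recurring "c3*i3 - i3"
; style comments in the IDCT macros further down.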
vp3_idct_data: times 8 dw 64277
               times 8 dw 60547
               times 8 dw 54491
               times 8 dw 46341
               times 8 dw 36410
               times 8 dw 25080
               times 8 dw 12785

pb_7:  times 8 db 0x07
pb_1F: times 8 db 0x1f
pb_81: times 8 db 0x81

cextern pb_1
cextern pb_3
cextern pb_80

cextern pw_8

SECTION .text

; this is off by one or two for some cases when filter_limit is greater than 63
; in:  p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
; out: p1 in mm4, p2 in mm3
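; A rough scalar sketch of the filter below (cf. the C version in vp3dsp.c):
;     f  = (p0 - p3) + 3 * (p2 - p1);
;     f  = bound((f + 4) >> 3);     // bound() folds f back towards zero once
;     p1 = clip_uint8(p1 + f);      // it exceeds the limit ("flim", loaded
;     p2 = clip_uint8(p2 - f);      // from [r2+516] below)
; The pavgb/bias tricks compute (f + 4) >> 3 branchlessly on bytes, and the
; pminub/paddb sequence approximates bound(), hence the off-by-one note above.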
%macro VP3_LOOP_FILTER 0
    movq       m7, m6
    pand       m6, [pb_7]    ; p0&7
    psrlw      m7, 3
    pand       m7, [pb_1F]   ; p0>>3
    movq       m3, m2        ; p2
    pxor       m2, m4
    pand       m2, [pb_1]    ; (p2^p1)&1
    movq       m5, m2
    paddb      m2, m2
    paddb      m2, m5        ; 3*(p2^p1)&1
    paddb      m2, m6        ; extra bits lost in shifts
    pcmpeqb    m0, m0
    pxor       m1, m0        ; 255 - p3
    pavgb      m1, m2        ; (256 - p3 + extrabits) >> 1
    pxor       m0, m4        ; 255 - p1
    pavgb      m0, m3        ; (256 + p2-p1) >> 1
    paddb      m1, [pb_3]
    pavgb      m1, m0        ; 128+2+(  p2-p1 - p3) >> 2
    pavgb      m1, m0        ; 128+1+(3*(p2-p1) - p3) >> 3
    paddusb    m7, m1        ; d+128+1
    movq       m6, [pb_81]
    psubusb    m6, m7
    psubusb    m7, [pb_81]

    movq       m5, [r2+516]  ; flim
    pminub     m6, m5
    pminub     m7, m5
    movq       m0, m6
    movq       m1, m7
    paddb      m6, m6
    paddb      m7, m7
    pminub     m6, m5
    pminub     m7, m5
    psubb      m6, m0
    psubb      m7, m1
    paddusb    m4, m7
    psubusb    m4, m6
    psubusb    m3, m7
    paddusb    m3, m6
%endmacro

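; Scatter one packed register back to memory for the horizontal filter:
; four 16-bit (p1,p2) byte pairs are written across the filtered edge,
; one store per row, starting at column -1.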
%macro STORE_4_WORDS 1
    movd       r2d, %1
    mov   [r0     -1], r2w
    psrlq      %1, 32
    shr        r2, 16
    mov   [r0+r1  -1], r2w
    movd       r2d, %1
    mov   [r0+r1*2-1], r2w
    shr        r2, 16
    mov   [r0+r3  -1], r2w
%endmacro

INIT_MMX mmxext
cglobal vp3_v_loop_filter, 3, 4
%if ARCH_X86_64
    movsxd     r1, r1d
%endif
    mov        r3, r1
    neg        r1
    movq       m6, [r0+r1*2]
    movq       m4, [r0+r1  ]
    movq       m2, [r0     ]
    movq       m1, [r0+r3  ]

    VP3_LOOP_FILTER

    movq  [r0+r1], m4
    movq  [r0   ], m3
    RET

cglobal vp3_h_loop_filter, 3, 4
%if ARCH_X86_64
    movsxd     r1, r1d
%endif
    lea        r3, [r1*3]

    movd       m6, [r0     -2]
    movd       m4, [r0+r1  -2]
    movd       m2, [r0+r1*2-2]
    movd       m1, [r0+r3  -2]
    lea        r0, [r0+r1*4  ]
    punpcklbw  m6, [r0     -2]
    punpcklbw  m4, [r0+r1  -2]
    punpcklbw  m2, [r0+r1*2-2]
    punpcklbw  m1, [r0+r3  -2]
    sub        r0, r3
    sub        r0, r1

    TRANSPOSE4x4B 6, 4, 2, 1, 0
    VP3_LOOP_FILTER
    SBUTTERFLY bw, 4, 3, 5

    STORE_4_WORDS m4
    lea        r0, [r0+r1*4  ]
    STORE_4_WORDS m3
    RET

; from original comments: The Macro does IDct on 4 1-D Dcts
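; Note: I(x), J(x) and C(x) are not registers but memory-operand macros
; %defined by the caller (see VP3_IDCT below); I()/J() select rows of the
; coefficient block and C(x) indexes the vp3_idct_data cosine table.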
%macro BeginIDCT 0
    movq       m2, I(3)
    movq       m6, C(3)
    movq       m4, m2
    movq       m7, J(5)
    pmulhw     m4, m6        ; r4 = c3*i3 - i3
    movq       m1, C(5)
    pmulhw     m6, m7        ; r6 = c3*i5 - i5
    movq       m5, m1
    pmulhw     m1, m2        ; r1 = c5*i3 - i3
    movq       m3, I(1)
    pmulhw     m5, m7        ; r5 = c5*i5 - i5
    movq       m0, C(1)
    paddw      m4, m2        ; r4 = c3*i3
    paddw      m6, m7        ; r6 = c3*i5
    paddw      m2, m1        ; r2 = c5*i3
    movq       m1, J(7)
    paddw      m7, m5        ; r7 = c5*i5
    movq       m5, m0        ; r5 = c1
    pmulhw     m0, m3        ; r0 = c1*i1 - i1
    paddsw     m4, m7        ; r4 = C = c3*i3 + c5*i5
    pmulhw     m5, m1        ; r5 = c1*i7 - i7
    movq       m7, C(7)
    psubsw     m6, m2        ; r6 = D = c3*i5 - c5*i3
    paddw      m0, m3        ; r0 = c1*i1
    pmulhw     m3, m7        ; r3 = c7*i1
    movq       m2, I(2)
    pmulhw     m7, m1        ; r7 = c7*i7
    paddw      m5, m1        ; r5 = c1*i7
    movq       m1, m2        ; r1 = i2
    pmulhw     m2, C(2)      ; r2 = c2*i2 - i2
    psubsw     m3, m5        ; r3 = B = c7*i1 - c1*i7
    movq       m5, J(6)
    paddsw     m0, m7        ; r0 = A = c1*i1 + c7*i7
    movq       m7, m5        ; r7 = i6
    psubsw     m0, m4        ; r0 = A - C
    pmulhw     m5, C(2)      ; r5 = c2*i6 - i6
    paddw      m2, m1        ; r2 = c2*i2
    pmulhw     m1, C(6)      ; r1 = c6*i2
    paddsw     m4, m4        ; r4 = C + C
    paddsw     m4, m0        ; r4 = C. = A + C
    psubsw     m3, m6        ; r3 = B - D
    paddw      m5, m7        ; r5 = c2*i6
    paddsw     m6, m6        ; r6 = D + D
    pmulhw     m7, C(6)      ; r7 = c6*i6
    paddsw     m6, m3        ; r6 = D. = B + D
    movq       I(1), m4      ; save C. at I(1)
    psubsw     m1, m5        ; r1 = H = c6*i2 - c2*i6
    movq       m4, C(4)
    movq       m5, m3        ; r5 = B - D
    pmulhw     m3, m4        ; r3 = (c4 - 1) * (B - D)
    paddsw     m7, m2        ; r7 = G = c2*i2 + c6*i6
    movq       I(2), m6      ; save D. at I(2)
    movq       m2, m0        ; r2 = A - C
    movq       m6, I(0)
    pmulhw     m0, m4        ; r0 = (c4 - 1) * (A - C)
    paddw      m5, m3        ; r5 = B. = c4 * (B - D)
    movq       m3, J(4)
    psubsw     m5, m1        ; r5 = B.. = B. - H
    paddw      m2, m0        ; r2 = A. = c4 * (A - C)
    psubsw     m6, m3        ; r6 = i0 - i4
    movq       m0, m6
    pmulhw     m6, m4        ; r6 = (c4 - 1) * (i0 - i4)
    paddsw     m3, m3        ; r3 = i4 + i4
    paddsw     m1, m1        ; r1 = H + H
    paddsw     m3, m0        ; r3 = i0 + i4
    paddsw     m1, m5        ; r1 = H. = B + H
    pmulhw     m4, m3        ; r4 = (c4 - 1) * (i0 + i4)
    paddsw     m6, m0        ; r6 = F = c4 * (i0 - i4)
    psubsw     m6, m2        ; r6 = F. = F - A.
    paddsw     m2, m2        ; r2 = A. + A.
    movq       m0, I(1)      ; r0 = C.
    paddsw     m2, m6        ; r2 = A.. = F + A.
    paddw      m4, m3        ; r4 = E = c4 * (i0 + i4)
    psubsw     m2, m1        ; r2 = R2 = A.. - H.
%endmacro

; RowIDCT gets ready to transpose
%macro RowIDCT 0
    BeginIDCT
    movq       m3, I(2)      ; r3 = D.
    psubsw     m4, m7        ; r4 = E. = E - G
    paddsw     m1, m1        ; r1 = H. + H.
    paddsw     m7, m7        ; r7 = G + G
    paddsw     m1, m2        ; r1 = R1 = A.. + H.
    paddsw     m7, m4        ; r7 = G. = E + G
    psubsw     m4, m3        ; r4 = R4 = E. - D.
    paddsw     m3, m3        ; r3 = D. + D.
    psubsw     m6, m5        ; r6 = R6 = F. - B..
    paddsw     m5, m5        ; r5 = B.. + B..
    paddsw     m3, m4        ; r3 = R3 = E. + D.
    paddsw     m5, m6        ; r5 = R5 = F. + B..
    psubsw     m7, m0        ; r7 = R7 = G. - C.
    paddsw     m0, m0        ; r0 = C. + C.
    movq       I(1), m1      ; save R1
    paddsw     m0, m7        ; r0 = R0 = G. + C.
%endmacro

; Column IDCT normalizes and stores final results
%macro ColumnIDCT 0
    BeginIDCT
    paddsw     m2, OC_8      ; adjust R2 (and R1) for shift
    paddsw     m1, m1        ; r1 = H. + H.
    paddsw     m1, m2        ; r1 = R1 = A.. + H.
    psraw      m2, 4         ; r2 = NR2
    psubsw     m4, m7        ; r4 = E. = E - G
    psraw      m1, 4         ; r1 = NR1
    movq       m3, I(2)      ; r3 = D.
    paddsw     m7, m7        ; r7 = G + G
    movq       I(2), m2      ; store NR2 at I2
    paddsw     m7, m4        ; r7 = G. = E + G
    movq       I(1), m1      ; store NR1 at I1
    psubsw     m4, m3        ; r4 = R4 = E. - D.
    paddsw     m4, OC_8      ; adjust R4 (and R3) for shift
    paddsw     m3, m3        ; r3 = D. + D.
    paddsw     m3, m4        ; r3 = R3 = E. + D.
    psraw      m4, 4         ; r4 = NR4
    psubsw     m6, m5        ; r6 = R6 = F. - B..
    psraw      m3, 4         ; r3 = NR3
    paddsw     m6, OC_8      ; adjust R6 (and R5) for shift
    paddsw     m5, m5        ; r5 = B.. + B..
    paddsw     m5, m6        ; r5 = R5 = F. + B..
    psraw      m6, 4         ; r6 = NR6
    movq       J(4), m4      ; store NR4 at J4
    psraw      m5, 4         ; r5 = NR5
    movq       I(3), m3      ; store NR3 at I3
    psubsw     m7, m0        ; r7 = R7 = G. - C.
    paddsw     m7, OC_8      ; adjust R7 (and R0) for shift
    paddsw     m0, m0        ; r0 = C. + C.
    paddsw     m0, m7        ; r0 = R0 = G. + C.
    psraw      m7, 4         ; r7 = NR7
    movq       J(6), m6      ; store NR6 at J6
    psraw      m0, 4         ; r0 = NR0
    movq       J(5), m5      ; store NR5 at J5
    movq       J(7), m7      ; store NR7 at J7
    movq       I(0), m0      ; store NR0 at I0
%endmacro

; The following macro does two 4x4 transposes in place.
;
; At entry (we assume):
;
;   r0 = a3 a2 a1 a0
; I(1) = b3 b2 b1 b0
;   r2 = c3 c2 c1 c0
;   r3 = d3 d2 d1 d0
;
;   r4 = e3 e2 e1 e0
;   r5 = f3 f2 f1 f0
;   r6 = g3 g2 g1 g0
;   r7 = h3 h2 h1 h0
;
; At exit, we have:
;
; I(0) = d0 c0 b0 a0
; I(1) = d1 c1 b1 a1
; I(2) = d2 c2 b2 a2
; I(3) = d3 c3 b3 a3
;
; J(4) = h0 g0 f0 e0
; J(5) = h1 g1 f1 e1
; J(6) = h2 g2 f2 e2
; J(7) = h3 g3 f3 e3
;
; I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
; J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
;
; Since r1 is free at entry, we calculate the Js first.
%macro Transpose 0
    movq       m1, m4        ; r1 = e3 e2 e1 e0
    punpcklwd  m4, m5        ; r4 = f1 e1 f0 e0
    movq       I(0), m0      ; save a3 a2 a1 a0
    punpckhwd  m1, m5        ; r1 = f3 e3 f2 e2
    movq       m0, m6        ; r0 = g3 g2 g1 g0
    punpcklwd  m6, m7        ; r6 = h1 g1 h0 g0
    movq       m5, m4        ; r5 = f1 e1 f0 e0
    punpckldq  m4, m6        ; r4 = h0 g0 f0 e0 = R4
    punpckhdq  m5, m6        ; r5 = h1 g1 f1 e1 = R5
    movq       m6, m1        ; r6 = f3 e3 f2 e2
    movq       J(4), m4
    punpckhwd  m0, m7        ; r0 = h3 g3 h2 g2
    movq       J(5), m5
    punpckhdq  m6, m0        ; r6 = h3 g3 f3 e3 = R7
    movq       m4, I(0)      ; r4 = a3 a2 a1 a0
    punpckldq  m1, m0        ; r1 = h2 g2 f2 e2 = R6
    movq       m5, I(1)      ; r5 = b3 b2 b1 b0
    movq       m0, m4        ; r0 = a3 a2 a1 a0
    movq       J(7), m6
    punpcklwd  m0, m5        ; r0 = b1 a1 b0 a0
    movq       J(6), m1
    punpckhwd  m4, m5        ; r4 = b3 a3 b2 a2
    movq       m5, m2        ; r5 = c3 c2 c1 c0
    punpcklwd  m2, m3        ; r2 = d1 c1 d0 c0
    movq       m1, m0        ; r1 = b1 a1 b0 a0
    punpckldq  m0, m2        ; r0 = d0 c0 b0 a0 = R0
    punpckhdq  m1, m2        ; r1 = d1 c1 b1 a1 = R1
    movq       m2, m4        ; r2 = b3 a3 b2 a2
    movq       I(0), m0
    punpckhwd  m5, m3        ; r5 = d3 c3 d2 c2
    movq       I(1), m1
    punpckhdq  m4, m5        ; r4 = d3 c3 b3 a3 = R3
    punpckldq  m2, m5        ; r2 = d2 c2 b2 a2 = R2
    movq       I(3), m4
    movq       I(2), m2
%endmacro

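; SSE2 version of the 1-D IDCT above: each xmm register holds a full row of
; eight coefficients, so a single pass through this macro transforms the
; whole block in one dimension.  ADD()/SHIFT() are hooks supplied by the
; caller (see VP3_IDCT below).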
%macro VP3_1D_IDCT_SSE2 0
    movdqa     m2, I(3)      ; xmm2 = i3
    movdqa     m6, C(3)      ; xmm6 = c3
    movdqa     m4, m2        ; xmm4 = i3
    movdqa     m7, I(5)      ; xmm7 = i5
    pmulhw     m4, m6        ; xmm4 = c3 * i3 - i3
    movdqa     m1, C(5)      ; xmm1 = c5
    pmulhw     m6, m7        ; xmm6 = c3 * i5 - i5
    movdqa     m5, m1        ; xmm5 = c5
    pmulhw     m1, m2        ; xmm1 = c5 * i3 - i3
    movdqa     m3, I(1)      ; xmm3 = i1
    pmulhw     m5, m7        ; xmm5 = c5 * i5 - i5
    movdqa     m0, C(1)      ; xmm0 = c1
    paddw      m4, m2        ; xmm4 = c3 * i3
    paddw      m6, m7        ; xmm6 = c3 * i5
    paddw      m2, m1        ; xmm2 = c5 * i3
    movdqa     m1, I(7)      ; xmm1 = i7
    paddw      m7, m5        ; xmm7 = c5 * i5
    movdqa     m5, m0        ; xmm5 = c1
    pmulhw     m0, m3        ; xmm0 = c1 * i1 - i1
    paddsw     m4, m7        ; xmm4 = c3 * i3 + c5 * i5 = C
    pmulhw     m5, m1        ; xmm5 = c1 * i7 - i7
    movdqa     m7, C(7)      ; xmm7 = c7
    psubsw     m6, m2        ; xmm6 = c3 * i5 - c5 * i3 = D
    paddw      m0, m3        ; xmm0 = c1 * i1
    pmulhw     m3, m7        ; xmm3 = c7 * i1
    movdqa     m2, I(2)      ; xmm2 = i2
    pmulhw     m7, m1        ; xmm7 = c7 * i7
    paddw      m5, m1        ; xmm5 = c1 * i7
    movdqa     m1, m2        ; xmm1 = i2
    pmulhw     m2, C(2)      ; xmm2 = c2 * i2 - i2
    psubsw     m3, m5        ; xmm3 = c7 * i1 - c1 * i7 = B
    movdqa     m5, I(6)      ; xmm5 = i6
    paddsw     m0, m7        ; xmm0 = c1 * i1 + c7 * i7 = A
    movdqa     m7, m5        ; xmm7 = i6
    psubsw     m0, m4        ; xmm0 = A - C
    pmulhw     m5, C(2)      ; xmm5 = c2 * i6 - i6
    paddw      m2, m1        ; xmm2 = c2 * i2
    pmulhw     m1, C(6)      ; xmm1 = c6 * i2
    paddsw     m4, m4        ; xmm4 = C + C
    paddsw     m4, m0        ; xmm4 = A + C = C.
    psubsw     m3, m6        ; xmm3 = B - D
    paddw      m5, m7        ; xmm5 = c2 * i6
    paddsw     m6, m6        ; xmm6 = D + D
    pmulhw     m7, C(6)      ; xmm7 = c6 * i6
    paddsw     m6, m3        ; xmm6 = B + D = D.
    movdqa     I(1), m4      ; Save C. at I(1)
    psubsw     m1, m5        ; xmm1 = c6 * i2 - c2 * i6 = H
    movdqa     m4, C(4)      ; xmm4 = c4
    movdqa     m5, m3        ; xmm5 = B - D
    pmulhw     m3, m4        ; xmm3 = ( c4 - 1 ) * ( B - D )
    paddsw     m7, m2        ; xmm7 = c2 * i2 + c6 * i6 = G
    movdqa     I(2), m6      ; save D. at I(2)
    movdqa     m2, m0        ; xmm2 = A - C
    movdqa     m6, I(0)      ; xmm6 = i0
    pmulhw     m0, m4        ; xmm0 = ( c4 - 1 ) * ( A - C )
    paddw      m5, m3        ; xmm5 = c4 * ( B - D ) = B.
    movdqa     m3, I(4)      ; xmm3 = i4
    psubsw     m5, m1        ; xmm5 = B. - H = B..
    paddw      m2, m0        ; xmm2 = c4 * ( A - C ) = A.
    psubsw     m6, m3        ; xmm6 = i0 - i4
    movdqa     m0, m6        ; xmm0 = i0 - i4
    pmulhw     m6, m4        ; xmm6 = ( c4 - 1 ) * ( i0 - i4 )
    paddsw     m3, m3        ; xmm3 = i4 + i4
    paddsw     m1, m1        ; xmm1 = H + H
    paddsw     m3, m0        ; xmm3 = i0 + i4
    paddsw     m1, m5        ; xmm1 = B. + H = H.
    pmulhw     m4, m3        ; xmm4 = ( c4 - 1 ) * ( i0 + i4 )
    paddw      m6, m0        ; xmm6 = c4 * ( i0 - i4 ) = F
    psubsw     m6, m2        ; xmm6 = F - A. = F.
    paddsw     m2, m2        ; xmm2 = A. + A.
    movdqa     m0, I(1)      ; Load C. from I(1)
    paddsw     m2, m6        ; xmm2 = F + A. = A..
    paddw      m4, m3        ; xmm4 = c4 * ( i0 + i4 ) = E
    psubsw     m2, m1        ; xmm2 = A.. - H. = R2
    ADD(m2)                  ; Adjust R2 and R1 before shifting
    paddsw     m1, m1        ; xmm1 = H. + H.
    paddsw     m1, m2        ; xmm1 = A.. + H. = R1
    SHIFT(m2)                ; xmm2 = op2
    psubsw     m4, m7        ; xmm4 = E - G = E.
    SHIFT(m1)                ; xmm1 = op1
    movdqa     m3, I(2)      ; Load D. from I(2)
    paddsw     m7, m7        ; xmm7 = G + G
    paddsw     m7, m4        ; xmm7 = E + G = G.
    psubsw     m4, m3        ; xmm4 = E. - D. = R4
    ADD(m4)                  ; Adjust R4 and R3 before shifting
    paddsw     m3, m3        ; xmm3 = D. + D.
    paddsw     m3, m4        ; xmm3 = E. + D. = R3
    SHIFT(m4)                ; xmm4 = op4
    psubsw     m6, m5        ; xmm6 = F. - B.. = R6
    SHIFT(m3)                ; xmm3 = op3
    ADD(m6)                  ; Adjust R6 and R5 before shifting
    paddsw     m5, m5        ; xmm5 = B.. + B..
    paddsw     m5, m6        ; xmm5 = F. + B.. = R5
    SHIFT(m6)                ; xmm6 = op6
    SHIFT(m5)                ; xmm5 = op5
    psubsw     m7, m0        ; xmm7 = G. - C. = R7
    ADD(m7)                  ; Adjust R7 and R0 before shifting
    paddsw     m0, m0        ; xmm0 = C. + C.
    paddsw     m0, m7        ; xmm0 = G. + C. = R0
    SHIFT(m7)                ; xmm7 = op7
    SHIFT(m0)                ; xmm0 = op0
%endmacro

%macro PUT_BLOCK 8
    movdqa     O(0), m%1
    movdqa     O(1), m%2
    movdqa     O(2), m%3
    movdqa     O(3), m%4
    movdqa     O(4), m%5
    movdqa     O(5), m%6
    movdqa     O(6), m%7
    movdqa     O(7), m%8
%endmacro

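; VP3_IDCT %1: full 8x8 inverse transform of the coefficient block at %1.
; SSE2 path: two VP3_1D_IDCT_SSE2 passes with an 8x8 word transpose in
; between; SHIFT/ADD are empty for the first pass and implement the final
; (x + 8) >> 4 normalization in the second.
; MMX path: the two halves of the block go through RowIDCT + Transpose,
; then ColumnIDCT normalizes and stores the final values.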
%macro VP3_IDCT 1
%if mmsize == 16
%define I(x) [%1+16*x]
%define O(x) [%1+16*x]
%define C(x) [vp3_idct_data+16*(x-1)]
%define SHIFT(x)
%define ADD(x)
    VP3_1D_IDCT_SSE2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16]
%endif
    PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7

%define SHIFT(x) psraw x, 4
%define ADD(x)   paddsw x, [pw_8]
    VP3_1D_IDCT_SSE2
    PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%else ; mmsize == 8
    ; eax = quantized input
    ; ebx = dequantizer matrix
    ; ecx = IDCT constants
    ;  M(I) = ecx + MaskOffset(0) + I * 8
    ;  C(I) = ecx + CosineOffset(32) + (I-1) * 8
    ; edx = output
    ; r0..r7 = mm0..mm7
%define OC_8 [pw_8]
%define C(x) [vp3_idct_data+16*(x-1)]

    ; at this point, function has completed dequantization + dezigzag +
    ; partial transposition; now do the idct itself
%define I(x) [%1+16*x]
%define J(x) [%1+16*x]
    RowIDCT
    Transpose

%define I(x) [%1+16*x+8]
%define J(x) [%1+16*x+8]
    RowIDCT
    Transpose

%define I(x) [%1+16* x]
%define J(x) [%1+16*(x-4)+8]
    ColumnIDCT

%define I(x) [%1+16* x +64]
%define J(x) [%1+16*(x-4)+72]
    ColumnIDCT
%endif ; mmsize == 16/8
%endmacro

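; vp3_idct_put: IDCT the block and store it as pixels; the results are
;               packed to signed bytes and biased by 0x80 (pb_80) to map
;               them into the unsigned 0..255 range.
; vp3_idct_add: IDCT the block and add it to the existing pixels, clamping
;               with packuswb.
; Both variants zero the coefficient block when they are done.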
%macro vp3_idct_funcs 0
cglobal vp3_idct_put, 3, 4, 9
    VP3_IDCT   r2

    movsxdifnidn r1, r1d
    mova       m4, [pb_80]
    lea        r3, [r1*3]
%assign %%i 0
%rep 16/mmsize
    mova       m0, [r2+mmsize*0+%%i]
    mova       m1, [r2+mmsize*2+%%i]
    mova       m2, [r2+mmsize*4+%%i]
    mova       m3, [r2+mmsize*6+%%i]
%if mmsize == 8
    packsswb   m0, [r2+mmsize*8+%%i]
    packsswb   m1, [r2+mmsize*10+%%i]
    packsswb   m2, [r2+mmsize*12+%%i]
    packsswb   m3, [r2+mmsize*14+%%i]
%else
    packsswb   m0, [r2+mmsize*1+%%i]
    packsswb   m1, [r2+mmsize*3+%%i]
    packsswb   m2, [r2+mmsize*5+%%i]
    packsswb   m3, [r2+mmsize*7+%%i]
%endif
    paddb      m0, m4
    paddb      m1, m4
    paddb      m2, m4
    paddb      m3, m4
    movq   [r0     ], m0
%if mmsize == 8
    movq   [r0+r1  ], m1
    movq   [r0+r1*2], m2
    movq   [r0+r3  ], m3
%else
    movhps [r0+r1  ], m0
    movq   [r0+r1*2], m1
    movhps [r0+r3  ], m1
%endif
%if %%i == 0
    lea        r0, [r0+r1*4]
%endif
%if mmsize == 16
    movq   [r0     ], m2
    movhps [r0+r1  ], m2
    movq   [r0+r1*2], m3
    movhps [r0+r3  ], m3
%endif
%assign %%i %%i+8
%endrep

    pxor       m0, m0
%assign %%offset 0
%rep 128/mmsize
    mova [r2+%%offset], m0
%assign %%offset %%offset+mmsize
%endrep
    RET

cglobal vp3_idct_add, 3, 4, 9
    VP3_IDCT   r2

    movsxdifnidn r1, r1d
    lea        r3, [r1*3]
    pxor       m4, m4
%if mmsize == 16
%assign %%i 0
%rep 2
    movq       m0, [r0]
    movq       m1, [r0+r1]
    movq       m2, [r0+r1*2]
    movq       m3, [r0+r3]
    punpcklbw  m0, m4
    punpcklbw  m1, m4
    punpcklbw  m2, m4
    punpcklbw  m3, m4
    paddsw     m0, [r2+ 0+%%i]
    paddsw     m1, [r2+16+%%i]
    paddsw     m2, [r2+32+%%i]
    paddsw     m3, [r2+48+%%i]
    packuswb   m0, m1
    packuswb   m2, m3
    movq   [r0     ], m0
    movhps [r0+r1  ], m0
    movq   [r0+r1*2], m2
    movhps [r0+r3  ], m2
%if %%i == 0
    lea        r0, [r0+r1*4]
%endif
%assign %%i %%i+64
%endrep
%else
%assign %%i 0
%rep 2
    movq       m0, [r0]
    movq       m1, [r0+r1]
    movq       m2, [r0+r1*2]
    movq       m3, [r0+r3]
    movq       m5, m0
    movq       m6, m1
    movq       m7, m2
    punpcklbw  m0, m4
    punpcklbw  m1, m4
    punpcklbw  m2, m4
    punpckhbw  m5, m4
    punpckhbw  m6, m4
    punpckhbw  m7, m4
    paddsw     m0, [r2+ 0+%%i]
    paddsw     m1, [r2+16+%%i]
    paddsw     m2, [r2+32+%%i]
    paddsw     m5, [r2+64+%%i]
    paddsw     m6, [r2+80+%%i]
    paddsw     m7, [r2+96+%%i]
    packuswb   m0, m5
    movq       m5, m3
    punpcklbw  m3, m4
    punpckhbw  m5, m4
    packuswb   m1, m6
    paddsw     m3, [r2+48+%%i]
    paddsw     m5, [r2+112+%%i]
    packuswb   m2, m7
    packuswb   m3, m5
    movq   [r0     ], m0
    movq   [r0+r1  ], m1
    movq   [r0+r1*2], m2
    movq   [r0+r3  ], m3
%if %%i == 0
    lea        r0, [r0+r1*4]
%endif
%assign %%i %%i+8
%endrep
%endif
%assign %%i 0
%rep 128/mmsize
    mova  [r2+%%i], m4
%assign %%i %%i+mmsize
%endrep
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
vp3_idct_funcs
%endif

INIT_XMM sse2
vp3_idct_funcs

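; Apply the broadcast DC value to four rows of pixels.  m0 holds max(dc, 0)
; and m1 holds max(-dc, 0) in every byte, so the unsigned saturating
; paddusb/psubusb pair adds a signed dc while clamping to 0..255.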
%macro DC_ADD 0
    movq       m2, [r0     ]
    movq       m3, [r0+r1  ]
    paddusb    m2, m0
    movq       m4, [r0+r1*2]
    paddusb    m3, m0
    movq       m5, [r0+r2  ]
    paddusb    m4, m0
    paddusb    m5, m0
    psubusb    m2, m1
    psubusb    m3, m1
    movq   [r0     ], m2
    psubusb    m4, m1
    movq   [r0+r1  ], m3
    psubusb    m5, m1
    movq   [r0+r1*2], m4
    movq   [r0+r2  ], m5
%endmacro

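; vp3_idct_dc_add: only the DC coefficient is non-zero, so the whole 8x8
; IDCT collapses to adding one rounded value to every pixel.  Roughly, in
; scalar terms:
;     dc = (block[0] + 15) >> 5;
;     dest[x] = clip_uint8(dest[x] + dc);   // for all 64 pixels
;     block[0] = 0;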
INIT_MMX mmxext
cglobal vp3_idct_dc_add, 3, 4
%if ARCH_X86_64
    movsxd     r1, r1d
%endif
    movsx      r3, word [r2]
    mov        word [r2], 0
    lea        r2, [r1*3]
    add        r3, 15
    sar        r3, 5
    movd       m0, r3d
    pshufw     m0, m0, 0x0
    pxor       m1, m1
    psubw      m1, m0
    packuswb   m0, m0
    packuswb   m1, m1
    DC_ADD
    lea        r0, [r0+r1*4]
    DC_ADD
    RET