Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libavcodec / x86 / simple_idct.c
1 /*
2 * Simple IDCT MMX
3 *
4 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23 #include "libavutil/mem.h"
24 #include "libavutil/x86/asm.h"
25
26 #include "libavcodec/idctdsp.h"
27
28 #include "idctdsp.h"
29 #include "simple_idct.h"
30
31 #if HAVE_INLINE_ASM
32
33 /* Exact values of cos(i*M_PI/16)*sqrt(2)*(1<<14) for i = 0..7, before conversion to integers:
34 23170.475006
35 22725.260826
36 21406.727617
37 19265.545870
38 16384.000000
39 12872.826198
40 8866.956905
41 4520.335430
42 */
/*
 * Fixed-point cosine factors Ci = cos(i*M_PI/16)*sqrt(2)*(1<<14), converted to
 * integers. All are formed by adding 0.5 and truncating, except C4, which is
 * formed by subtracting 0.5 and is therefore one below the exact value 16384.
 */
43 #define C0 23170 //cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5 (exact: 23170.475006)
44 #define C1 22725 //cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5 (exact: 22725.260826)
45 #define C2 21407 //cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5 (exact: 21406.727617)
46 #define C3 19266 //cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5 (exact: 19265.545870)
47 #define C4 16383 //cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5 (exact: 16384.000000; note the minus, giving 16383)
48 #define C5 12873 //cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5 (exact: 12872.826198)
49 #define C6 8867 //cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5 (exact: 8866.956905)
50 #define C7 4520 //cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5 (exact: 4520.335430)
51
/*
 * Right-shift amounts of the row and column passes. In this file the macros
 * themselves only appear in the coeffs[] rounding entries and their comments;
 * the asm macro invocations pass the literals 11 and 20 as their shift
 * arguments, so the two must be kept in sync by hand.
 */
52 #define ROW_SHIFT 11 // shift of the row pass; also sets the rounder 1<<(ROW_SHIFT-1) in coeffs[]
53 #define COL_SHIFT 20 // 6 (original note; NOTE(review): what the 6 refers to is not evident from this view)
54
/*
 * 64-bit constants referenced by name (through MANGLE()) from the inline asm.
 * NOTE(review): DECLARE_ASM_CONST is defined outside this file; its first
 * argument is presumably the alignment in bytes - confirm.
 *
 * wm1010: mask that keeps the upper 16-bit word of each 32-bit half. In
 *         DC_COND_IDCT it is ANDed with the first source quadword (annotated
 *         "R4 R0 r4 r0", highest word first), which clears the R0/r0 words;
 *         the other three source quadwords are then ORed in and the short
 *         "1:" path is taken when the combined result is zero.
 * d40000: 0x40000 in the low 32 bits, zero in the high 32 bits. Added with
 *         paddd in the short "1:" path of DC_COND_IDCT, between the pslld by
 *         16 and the psrad by 13. In the "#if 0" variant (DC_COND_ROW_IDCT)
 *         the same add is commented out inside the asm string.
 */
55 DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
56 DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
57
/*
 * Constant table for the MMX inline asm in idct(): two rounding quadwords
 * followed by the fixed-point cosine factors. Each source row below is four
 * int16_t values, i.e. one 8-byte quadword, used as a single movq / pmaddwd /
 * paddd memory operand. The byte offset of each row is noted as "+N".
 *
 * NOTE(review): the asm addresses its table as N(%2); the operand list that
 * binds %2 is outside this view, so "%2 is coeffs" is presumed - confirm.
 *
 * The asm's own comments write each quadword from the highest word down, so
 * e.g. the row "C2, C6, C2, C6" appears there as "C6 C2 C6 C2". The row order
 * and offsets are hard-coded in the asm and must not be changed independently.
 */
58 DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
59 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, /* +0: rounder passed as "paddd (%2)": 1<<(ROW_SHIFT-1) in each 32-bit half */
60 // 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
61 // 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
62 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, /* +8: rounder passed as "paddd 8(%2)" for the first row: as +0, plus a 1 in the second word (+(1<<16) in the low 32 bits) */
63 // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
// with the values above and integer division: ((1<<19)/16383)<<11 = 32<<11 = 1<<16
64 // 0, 0, 0, 0,
65 // 0, 0, 0, 0,
66
67 C4, C4, C4, C4, /* +16: C4R4+C4R0 (summed with +32 product for A0, subtracted for A3) */
68 C4, -C4, C4, -C4, /* +24: -C4R4+C4R0 (summed with +40 product for A1, subtracted for A2) */
69
70 C2, C6, C2, C6, /* +32: C6R6+C2R2 */
71 C6, -C2, C6, -C2, /* +40: -C2R6+C6R2 */
72
73 C1, C3, C1, C3, /* +48: C3R3+C1R1, first half of B0 */
74 C5, C7, C5, C7, /* +56: C7R7+C5R5, second half of B0 */
75
76 C3, -C7, C3, -C7, /* +64: -C7R3+C3R1, first half of B1 */
77 -C1, -C5, -C1, -C5, /* +72: -C5R7-C1R5, second half of B1 */
78
79 C5, -C1, C5, -C1, /* +80: -C1R3+C5R1, first half of B2 */
80 C7, C3, C7, C3, /* +88: C3R7+C7R5, second half of B2 */
81
82 C7, -C5, C7, -C5, /* +96: -C5R3+C7R1, first half of B3 */
83 C3, -C1, C3, -C1 /* +104: -C1R7+C3R5, second half of B3 */
84 };
85
86 static inline void idct(int16_t *block)
87 {
88 LOCAL_ALIGNED_8(int64_t, align_tmp, [16]);
89 int16_t * const temp= (int16_t*)align_tmp;
90
91 __asm__ volatile(
92 #if 0 //Alternative, simpler variant
93
94 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
95 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
96 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
97 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
98 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
99 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
100 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
101 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
102 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
103 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
104 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
105 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
106 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
107 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
108 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
109 #rounder ", %%mm4 \n\t"\
110 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
111 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
112 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
113 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
114 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
115 #rounder ", %%mm0 \n\t"\
116 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
117 "paddd %%mm0, %%mm0 \n\t" \
118 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
119 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
120 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
121 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
122 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
123 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
124 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
125 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
126 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
127 "psrad $" #shift ", %%mm7 \n\t"\
128 "psrad $" #shift ", %%mm4 \n\t"\
129 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
130 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
131 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
132 "psrad $" #shift ", %%mm1 \n\t"\
133 "psrad $" #shift ", %%mm2 \n\t"\
134 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
135 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
136 "movq %%mm7, " #dst " \n\t"\
137 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
138 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
139 "movq %%mm2, 24+" #dst " \n\t"\
140 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
141 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
142 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
143 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
144 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
145 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
146 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
147 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
148 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
149 "psrad $" #shift ", %%mm2 \n\t"\
150 "psrad $" #shift ", %%mm0 \n\t"\
151 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
152 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
153 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
154 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
155 "psrad $" #shift ", %%mm6 \n\t"\
156 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
157 "movq %%mm2, 8+" #dst " \n\t"\
158 "psrad $" #shift ", %%mm4 \n\t"\
159 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
160 "movq %%mm4, 16+" #dst " \n\t"\
161
162 #define COL_IDCT(src0, src4, src1, src5, dst, shift) \
163 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
164 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
165 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
166 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
167 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
168 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
169 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
170 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
171 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
172 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
173 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
174 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
175 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
176 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
177 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
178 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
179 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
180 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
181 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
182 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
183 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
184 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
185 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
186 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
187 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
188 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
189 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
190 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
191 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
192 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
193 "psrad $" #shift ", %%mm7 \n\t"\
194 "psrad $" #shift ", %%mm4 \n\t"\
195 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
196 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
197 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
198 "psrad $" #shift ", %%mm0 \n\t"\
199 "psrad $" #shift ", %%mm2 \n\t"\
200 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
201 "movd %%mm7, " #dst " \n\t"\
202 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
203 "movd %%mm0, 16+" #dst " \n\t"\
204 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
205 "movd %%mm2, 96+" #dst " \n\t"\
206 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
207 "movd %%mm4, 112+" #dst " \n\t"\
208 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
209 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
210 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
211 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
212 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
213 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
214 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
215 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
216 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
217 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
218 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
219 "psrad $" #shift ", %%mm2 \n\t"\
220 "psrad $" #shift ", %%mm5 \n\t"\
221 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
222 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
223 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
224 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
225 "psrad $" #shift ", %%mm6 \n\t"\
226 "psrad $" #shift ", %%mm4 \n\t"\
227 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
228 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
229 "movd %%mm2, 32+" #dst " \n\t"\
230 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
231 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
232 "movd %%mm6, 48+" #dst " \n\t"\
233 "movd %%mm4, 64+" #dst " \n\t"\
234 "movd %%mm5, 80+" #dst " \n\t"\
235
236
237 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
238 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
239 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
240 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
241 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
242 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
243 "pand %%mm0, %%mm4 \n\t"\
244 "por %%mm1, %%mm4 \n\t"\
245 "por %%mm2, %%mm4 \n\t"\
246 "por %%mm3, %%mm4 \n\t"\
247 "packssdw %%mm4,%%mm4 \n\t"\
248 "movd %%mm4, %%eax \n\t"\
249 "orl %%eax, %%eax \n\t"\
250 "jz 1f \n\t"\
251 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
252 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
253 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
254 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
255 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
256 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
257 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
258 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
259 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
260 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
261 #rounder ", %%mm4 \n\t"\
262 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
263 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
264 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
265 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
266 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
267 #rounder ", %%mm0 \n\t"\
268 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
269 "paddd %%mm0, %%mm0 \n\t" \
270 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
271 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
272 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
273 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
274 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
275 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
276 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
277 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
278 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
279 "psrad $" #shift ", %%mm7 \n\t"\
280 "psrad $" #shift ", %%mm4 \n\t"\
281 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
282 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
283 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
284 "psrad $" #shift ", %%mm1 \n\t"\
285 "psrad $" #shift ", %%mm2 \n\t"\
286 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
287 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
288 "movq %%mm7, " #dst " \n\t"\
289 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
290 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
291 "movq %%mm2, 24+" #dst " \n\t"\
292 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
293 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
294 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
295 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
296 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
297 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
298 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
299 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
300 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
301 "psrad $" #shift ", %%mm2 \n\t"\
302 "psrad $" #shift ", %%mm0 \n\t"\
303 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
304 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
305 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
306 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
307 "psrad $" #shift ", %%mm6 \n\t"\
308 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
309 "movq %%mm2, 8+" #dst " \n\t"\
310 "psrad $" #shift ", %%mm4 \n\t"\
311 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
312 "movq %%mm4, 16+" #dst " \n\t"\
313 "jmp 2f \n\t"\
314 "1: \n\t"\
315 "pslld $16, %%mm0 \n\t"\
316 "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
317 "psrad $13, %%mm0 \n\t"\
318 "packssdw %%mm0, %%mm0 \n\t"\
319 "movq %%mm0, " #dst " \n\t"\
320 "movq %%mm0, 8+" #dst " \n\t"\
321 "movq %%mm0, 16+" #dst " \n\t"\
322 "movq %%mm0, 24+" #dst " \n\t"\
323 "2: \n\t"
324
325
326 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
327 ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
328 /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
329 ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
330 ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
331
332 DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
333 DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
334 DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
335
336
337 //IDCT( src0, src4, src1, src5, dst, shift)
338 COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
339 COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
340 COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
341 COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
342
343 #else
344
345 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
346 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
347 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
348 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
349 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
350 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
351 "pand %%mm0, %%mm4 \n\t"\
352 "por %%mm1, %%mm4 \n\t"\
353 "por %%mm2, %%mm4 \n\t"\
354 "por %%mm3, %%mm4 \n\t"\
355 "packssdw %%mm4,%%mm4 \n\t"\
356 "movd %%mm4, %%eax \n\t"\
357 "orl %%eax, %%eax \n\t"\
358 "jz 1f \n\t"\
359 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
360 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
361 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
362 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
363 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
364 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
365 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
366 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
367 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
368 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
369 #rounder ", %%mm4 \n\t"\
370 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
371 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
372 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
373 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
374 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
375 #rounder ", %%mm0 \n\t"\
376 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
377 "paddd %%mm0, %%mm0 \n\t" \
378 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
379 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
380 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
381 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
382 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
383 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
384 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
385 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
386 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
387 "psrad $" #shift ", %%mm7 \n\t"\
388 "psrad $" #shift ", %%mm4 \n\t"\
389 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
390 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
391 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
392 "psrad $" #shift ", %%mm1 \n\t"\
393 "psrad $" #shift ", %%mm2 \n\t"\
394 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
395 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
396 "movq %%mm7, " #dst " \n\t"\
397 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
398 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
399 "movq %%mm2, 24+" #dst " \n\t"\
400 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
401 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
402 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
403 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
404 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
405 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
406 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
407 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
408 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
409 "psrad $" #shift ", %%mm2 \n\t"\
410 "psrad $" #shift ", %%mm0 \n\t"\
411 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
412 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
413 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
414 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
415 "psrad $" #shift ", %%mm6 \n\t"\
416 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
417 "movq %%mm2, 8+" #dst " \n\t"\
418 "psrad $" #shift ", %%mm4 \n\t"\
419 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
420 "movq %%mm4, 16+" #dst " \n\t"\
421 "jmp 2f \n\t"\
422 "1: \n\t"\
423 "pslld $16, %%mm0 \n\t"\
424 "paddd "MANGLE(d40000)", %%mm0 \n\t"\
425 "psrad $13, %%mm0 \n\t"\
426 "packssdw %%mm0, %%mm0 \n\t"\
427 "movq %%mm0, " #dst " \n\t"\
428 "movq %%mm0, 8+" #dst " \n\t"\
429 "movq %%mm0, 16+" #dst " \n\t"\
430 "movq %%mm0, 24+" #dst " \n\t"\
431 "2: \n\t"
432
433 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
434 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
435 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
436 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
437 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
438 "movq %%mm0, %%mm4 \n\t"\
439 "por %%mm1, %%mm4 \n\t"\
440 "por %%mm2, %%mm4 \n\t"\
441 "por %%mm3, %%mm4 \n\t"\
442 "packssdw %%mm4,%%mm4 \n\t"\
443 "movd %%mm4, %%eax \n\t"\
444 "orl %%eax, %%eax \n\t"\
445 "jz " #bt " \n\t"\
446 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
447 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
448 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
449 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
450 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
451 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
452 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
453 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
454 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
455 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
456 #rounder ", %%mm4 \n\t"\
457 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
458 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
459 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
460 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
461 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
462 #rounder ", %%mm0 \n\t"\
463 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
464 "paddd %%mm0, %%mm0 \n\t" \
465 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
466 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
467 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
468 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
469 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
470 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
471 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
472 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
473 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
474 "psrad $" #shift ", %%mm7 \n\t"\
475 "psrad $" #shift ", %%mm4 \n\t"\
476 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
477 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
478 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
479 "psrad $" #shift ", %%mm1 \n\t"\
480 "psrad $" #shift ", %%mm2 \n\t"\
481 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
482 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
483 "movq %%mm7, " #dst " \n\t"\
484 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
485 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
486 "movq %%mm2, 24+" #dst " \n\t"\
487 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
488 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
489 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
490 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
491 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
492 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
493 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
494 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
495 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
496 "psrad $" #shift ", %%mm2 \n\t"\
497 "psrad $" #shift ", %%mm0 \n\t"\
498 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
499 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
500 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
501 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
502 "psrad $" #shift ", %%mm6 \n\t"\
503 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
504 "movq %%mm2, 8+" #dst " \n\t"\
505 "psrad $" #shift ", %%mm4 \n\t"\
506 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
507 "movq %%mm4, 16+" #dst " \n\t"\
508
509 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
510 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
511 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
512 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
513 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
514 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
515 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
516 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
517 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
518 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
519 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
520 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
521 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
522 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
523 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
524 #rounder ", %%mm4 \n\t"\
525 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
526 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
527 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
528 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
529 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
530 #rounder ", %%mm0 \n\t"\
531 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
532 "paddd %%mm0, %%mm0 \n\t" \
533 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
534 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
535 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
536 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
537 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
538 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
539 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
540 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
541 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
542 "psrad $" #shift ", %%mm7 \n\t"\
543 "psrad $" #shift ", %%mm4 \n\t"\
544 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
545 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
546 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
547 "psrad $" #shift ", %%mm1 \n\t"\
548 "psrad $" #shift ", %%mm2 \n\t"\
549 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
550 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
551 "movq %%mm7, " #dst " \n\t"\
552 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
553 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
554 "movq %%mm2, 24+" #dst " \n\t"\
555 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
556 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
557 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
558 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
559 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
560 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
561 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
562 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
563 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
564 "psrad $" #shift ", %%mm2 \n\t"\
565 "psrad $" #shift ", %%mm0 \n\t"\
566 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
567 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
568 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
569 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
570 "psrad $" #shift ", %%mm6 \n\t"\
571 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
572 "movq %%mm2, 8+" #dst " \n\t"\
573 "psrad $" #shift ", %%mm4 \n\t"\
574 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
575 "movq %%mm4, 16+" #dst " \n\t"\
576
577 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
578 DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
579 Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
580 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
581 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
582
583 #undef IDCT
584 #define IDCT(src0, src4, src1, src5, dst, shift) \
585 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
586 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
587 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
588 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
589 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
590 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
591 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
592 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
593 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
594 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
595 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
596 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
597 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
598 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
599 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
600 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
601 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
602 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
603 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
604 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
605 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
606 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
607 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
608 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
609 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
610 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
611 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
612 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
613 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
614 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
615 "psrad $" #shift ", %%mm7 \n\t"\
616 "psrad $" #shift ", %%mm4 \n\t"\
617 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
618 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
619 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
620 "psrad $" #shift ", %%mm0 \n\t"\
621 "psrad $" #shift ", %%mm2 \n\t"\
622 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
623 "movd %%mm7, " #dst " \n\t"\
624 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
625 "movd %%mm0, 16+" #dst " \n\t"\
626 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
627 "movd %%mm2, 96+" #dst " \n\t"\
628 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
629 "movd %%mm4, 112+" #dst " \n\t"\
630 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
631 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
632 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
633 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
634 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
635 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
636 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
637 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
638 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
639 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
640 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
641 "psrad $" #shift ", %%mm2 \n\t"\
642 "psrad $" #shift ", %%mm5 \n\t"\
643 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
644 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
645 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
646 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
647 "psrad $" #shift ", %%mm6 \n\t"\
648 "psrad $" #shift ", %%mm4 \n\t"\
649 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
650 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
651 "movd %%mm2, 32+" #dst " \n\t"\
652 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
653 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
654 "movd %%mm6, 48+" #dst " \n\t"\
655 "movd %%mm4, 64+" #dst " \n\t"\
656 "movd %%mm5, 80+" #dst " \n\t"
657
658
659 //IDCT( src0, src4, src1, src5, dst, shift)
660 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
661 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
662 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
663 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
664 "jmp 9f \n\t"
665
666 "# .p2align 4 \n\t"\
667 "4: \n\t"
668 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
669 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
670
671 #undef IDCT
672 #define IDCT(src0, src4, src1, src5, dst, shift) \
673 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
674 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
675 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
676 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
677 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
678 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
679 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
680 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
681 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
682 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
683 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
684 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
685 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
686 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
687 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
688 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
689 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
690 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
691 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
692 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
693 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
694 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
695 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
696 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
697 "psrad $" #shift ", %%mm1 \n\t"\
698 "psrad $" #shift ", %%mm4 \n\t"\
699 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
700 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
701 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
702 "psrad $" #shift ", %%mm0 \n\t"\
703 "psrad $" #shift ", %%mm2 \n\t"\
704 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
705 "movd %%mm1, " #dst " \n\t"\
706 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
707 "movd %%mm0, 16+" #dst " \n\t"\
708 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
709 "movd %%mm2, 96+" #dst " \n\t"\
710 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
711 "movd %%mm4, 112+" #dst " \n\t"\
712 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
713 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
714 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
715 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
716 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
717 "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
718 "psrad $" #shift ", %%mm2 \n\t"\
719 "psrad $" #shift ", %%mm5 \n\t"\
720 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
721 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
722 "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
723 "psrad $" #shift ", %%mm6 \n\t"\
724 "psrad $" #shift ", %%mm1 \n\t"\
725 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
726 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
727 "movd %%mm2, 32+" #dst " \n\t"\
728 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
729 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
730 "movd %%mm6, 48+" #dst " \n\t"\
731 "movd %%mm1, 64+" #dst " \n\t"\
732 "movd %%mm5, 80+" #dst " \n\t"
733
734 //IDCT( src0, src4, src1, src5, dst, shift)
735 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
736 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
737 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
738 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
739 "jmp 9f \n\t"
740
741 "# .p2align 4 \n\t"\
742 "6: \n\t"
743 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
744
745 #undef IDCT
/*
 * IDCT variant that reads only src0 (R4/R0 pairs) and src5 (R7/R5 pairs);
 * src4 and src1 are accepted for a uniform signature but never referenced.
 * As a result A0 == A3 == C4*R4+C4*R0 and A1 == A2 == C4*R0-C4*R4, and the
 * B terms are built from R5/R7 alone.  Each expansion works on two 32-bit
 * lanes (upper-/lower-case names in the inline comments), shifts every
 * result right by `shift`, saturates to 16 bit and stores it with movd at:
 *   dst+0:  A0+B0   dst+16: A1+B1   dst+32: A2+B2   dst+48:  A3+B3
 *   dst+64: A3-B3   dst+80: A2-B2   dst+96: A1-B1   dst+112: A0-B0
 * %2 is the coefficient table; the offsets used are annotated per line.
 * NOTE(review): presumably this path is only reached when the src4/src1
 * inputs are zero (decided by Z_COND_IDCT, defined outside this excerpt)
 * -- confirm.
 */
746 #define IDCT(src0, src4, src1, src5, dst, shift) \
747 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
748 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
749 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
750 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
751 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
752 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
753 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 (copy kept as A3) */\
754 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 (copy kept as A2) */\
755 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
756 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
757 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
758 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
759 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
760 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
761 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
762 "psrad $" #shift ", %%mm1 \n\t"\
763 "psrad $" #shift ", %%mm4 \n\t"\
764 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
765 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
766 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
767 "psrad $" #shift ", %%mm0 \n\t"\
768 "psrad $" #shift ", %%mm2 \n\t"\
769 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
770 "movd %%mm1, " #dst " \n\t"\
771 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
772 "movd %%mm0, 16+" #dst " \n\t"\
773 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
774 "movd %%mm2, 96+" #dst " \n\t"\
775 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
776 "movd %%mm4, 112+" #dst " \n\t"\
777 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
778 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
779 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
780 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
781 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
782 "psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\
783 "psrad $" #shift ", %%mm2 \n\t"\
784 "psrad $" #shift ", %%mm5 \n\t"\
785 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
786 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
787 "psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\
788 "psrad $" #shift ", %%mm6 \n\t"\
789 "psrad $" #shift ", %%mm1 \n\t"\
790 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
791 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
792 "movd %%mm2, 32+" #dst " \n\t"\
793 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
794 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
795 "movd %%mm6, 48+" #dst " \n\t"\
796 "movd %%mm1, 64+" #dst " \n\t"\
797 "movd %%mm5, 80+" #dst " \n\t"
798
799
800 //IDCT( src0, src4, src1, src5, dst, shift)
801 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
802 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
803 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
804 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
805 "jmp 9f \n\t"
806
807 "# .p2align 4 \n\t"\
808 "2: \n\t"
809 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
810
811 #undef IDCT
/*
 * IDCT variant that reads src0 (R4/R0), src1 (R3/R1) and src5 (R7/R5);
 * src4 is accepted but never referenced, so the even part has no C2/C6
 * contribution: A0 == A3 == C4*R4+C4*R0 and A1 == A2 == C4*R0-C4*R4.
 * The odd part is complete:
 *   B0 =  C3R3+C1R1 + C7R7+C5R5      B1 = -C7R3+C3R1 - C5R7-C1R5
 *   B2 = -C1R3+C5R1 + C3R7+C7R5      B3 = -C5R3+C7R1 - C1R7+C3R5
 * Each expansion works on two 32-bit lanes, shifts every result right by
 * `shift`, saturates to 16 bit and stores it with movd at:
 *   dst+0:  A0+B0   dst+16: A1+B1   dst+32: A2+B2   dst+48:  A3+B3
 *   dst+64: A3-B3   dst+80: A2-B2   dst+96: A1-B1   dst+112: A0-B0
 * src1 is loaded twice because mm2 is reused as scratch in between.
 * NOTE(review): presumably only reached when the src4 inputs are zero
 * (decided by Z_COND_IDCT, defined outside this excerpt) -- confirm.
 */
812 #define IDCT(src0, src4, src1, src5, dst, shift) \
813 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
814 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
815 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
816 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
817 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
818 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
819 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
820 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 (copy kept as A3) */\
821 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
822 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
823 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 (copy kept as A2) */\
824 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
825 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
826 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
827 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
828 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
829 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
830 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
831 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
832 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
833 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
834 "psrad $" #shift ", %%mm7 \n\t"\
835 "psrad $" #shift ", %%mm4 \n\t"\
836 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
837 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
838 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
839 "psrad $" #shift ", %%mm0 \n\t"\
840 "psrad $" #shift ", %%mm2 \n\t"\
841 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
842 "movd %%mm7, " #dst " \n\t"\
843 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
844 "movd %%mm0, 16+" #dst " \n\t"\
845 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
846 "movd %%mm2, 96+" #dst " \n\t"\
847 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
848 "movd %%mm4, 112+" #dst " \n\t"\
849 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 (reloaded) */\
850 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
851 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
852 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
853 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
854 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
855 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
856 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
857 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
858 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
859 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
860 "psrad $" #shift ", %%mm2 \n\t"\
861 "psrad $" #shift ", %%mm5 \n\t"\
862 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
863 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
864 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
865 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
866 "psrad $" #shift ", %%mm6 \n\t"\
867 "psrad $" #shift ", %%mm4 \n\t"\
868 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
869 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
870 "movd %%mm2, 32+" #dst " \n\t"\
871 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
872 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
873 "movd %%mm6, 48+" #dst " \n\t"\
874 "movd %%mm4, 64+" #dst " \n\t"\
875 "movd %%mm5, 80+" #dst " \n\t"
876
877 //IDCT( src0, src4, src1, src5, dst, shift)
878 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
879 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
880 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
881 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
882 "jmp 9f \n\t"
883
884 "# .p2align 4 \n\t"\
885 "3: \n\t"
886 #undef IDCT
/*
 * IDCT variant that reads only src0 (R4/R0) and src1 (R3/R1); src4 and
 * src5 are accepted but never referenced.  Hence
 *   A0 == A3 == C4*R4+C4*R0,  A1 == A2 == C4*R0-C4*R4,
 *   B0 = C3R3+C1R1, B1 = -C7R3+C3R1, B2 = -C1R3+C5R1, B3 = -C5R3+C7R1.
 * Each expansion works on two 32-bit lanes, shifts every result right by
 * `shift`, saturates to 16 bit and stores it with movd at:
 *   dst+0:  A0+B0   dst+16: A1+B1   dst+32: A2+B2   dst+48:  A3+B3
 *   dst+64: A3-B3   dst+80: A2-B2   dst+96: A1-B1   dst+112: A0-B0
 * NOTE(review): presumably only reached when the src4/src5 inputs are zero
 * (decided by Z_COND_IDCT, defined outside this excerpt) -- confirm.
 */
887 #define IDCT(src0, src4, src1, src5, dst, shift) \
888 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
889 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
890 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
891 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
892 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
893 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
894 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 (copy kept as A3) */\
895 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
896 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
897 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 (copy kept as A2) */\
898 "movq 64(%2), %%mm3 \n\t" /* -C7 C3 -C7 C3 */\
899 "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
900 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
901 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
902 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
903 "psrad $" #shift ", %%mm7 \n\t"\
904 "psrad $" #shift ", %%mm4 \n\t"\
905 "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
906 "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
907 "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
908 "psrad $" #shift ", %%mm0 \n\t"\
909 "psrad $" #shift ", %%mm1 \n\t"\
910 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
911 "movd %%mm7, " #dst " \n\t"\
912 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
913 "movd %%mm0, 16+" #dst " \n\t"\
914 "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
915 "movd %%mm1, 96+" #dst " \n\t"\
916 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
917 "movd %%mm4, 112+" #dst " \n\t"\
918 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
919 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
920 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
921 "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
922 "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
923 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
924 "psrad $" #shift ", %%mm1 \n\t"\
925 "psrad $" #shift ", %%mm5 \n\t"\
926 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
927 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
928 "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
929 "psrad $" #shift ", %%mm6 \n\t"\
930 "psrad $" #shift ", %%mm4 \n\t"\
931 "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
932 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
933 "movd %%mm1, 32+" #dst " \n\t"\
934 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
935 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
936 "movd %%mm6, 48+" #dst " \n\t"\
937 "movd %%mm4, 64+" #dst " \n\t"\
938 "movd %%mm5, 80+" #dst " \n\t"
939
940
941 //IDCT( src0, src4, src1, src5, dst, shift)
942 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
943 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
944 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
945 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
946 "jmp 9f \n\t"
947
948 "# .p2align 4 \n\t"\
949 "5: \n\t"
950 #undef IDCT
/*
 * IDCT variant with an even part only: it reads src0 (R4/R0) and src4
 * (R6/R2) plus the next 8 bytes of each (8+src0, 8+src4); src1 and src5
 * are accepted but never referenced, so there are no B terms.
 *   A0 = C4R4+C4R0 + C6R6+C2R2      A3 = C4R4+C4R0 - (C6R6+C2R2)
 *   A1 = C4R0-C4R4 + C6R2-C2R6      A2 = C4R0-C4R4 - (C6R2-C2R6)
 * Because two 8-byte groups are processed per expansion, results are packed
 * four to a register and stored with 64-bit movq:
 *   dst+0 and dst+112: A0     dst+16 and dst+96: A1
 *   dst+32 and dst+80: A2     dst+48 and dst+64: A3
 * This is why only the 0(%1) and 16(%1) invocations below are active while
 * the 8(%1) and 24(%1) ones are commented out.
 * NOTE(review): presumably only reached when the src1/src5 inputs are zero
 * -- confirm against the branch logic outside this excerpt.
 */
951 #define IDCT(src0, src4, src1, src5, dst, shift) \
952 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
953 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
954 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
955 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
956 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
957 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
958 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
959 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
960 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
961 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
962 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
963 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
964 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
965 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
966 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
967 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
968 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 (second group) */\
969 "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 (second group) */\
970 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
971 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
972 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
973 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
974 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
975 "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
976 "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
977 "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
978 "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
979 "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
980 "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
981 "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
982 "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
983 "psrad $" #shift ", %%mm4 \n\t"\
984 "psrad $" #shift ", %%mm7 \n\t"\
985 "psrad $" #shift ", %%mm3 \n\t"\
986 "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 (both groups) */\
987 "movq %%mm4, " #dst " \n\t"\
988 "psrad $" #shift ", %%mm0 \n\t"\
989 "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 (both groups) */\
990 "movq %%mm0, 16+" #dst " \n\t"\
991 "movq %%mm0, 96+" #dst " \n\t"\
992 "movq %%mm4, 112+" #dst " \n\t"\
993 "psrad $" #shift ", %%mm5 \n\t"\
994 "psrad $" #shift ", %%mm6 \n\t"\
995 "psrad $" #shift ", %%mm2 \n\t"\
996 "packssdw %%mm2, %%mm5 \n\t" /* A2 a2 (both groups; no B terms here) */\
997 "movq %%mm5, 32+" #dst " \n\t"\
998 "psrad $" #shift ", %%mm1 \n\t"\
999 "packssdw %%mm1, %%mm6 \n\t" /* A3 a3 (both groups; no B terms here) */\
1000 "movq %%mm6, 48+" #dst " \n\t"\
1001 "movq %%mm6, 64+" #dst " \n\t"\
1002 "movq %%mm5, 80+" #dst " \n\t"
1003
1004
1005 //IDCT( src0, src4, src1, src5, dst, shift)
1006 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1007 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1008 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1009 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1010 "jmp 9f \n\t"
1011
1012
1013 "# .p2align 4 \n\t"\
1014 "1: \n\t"
1015 #undef IDCT
/*
 * IDCT variant that reads src0 (R4/R0), src4 (R6/R2) and src1 (R3/R1);
 * src5 is accepted but never referenced, so the odd part uses R1/R3 only.
 *   A0 = C4R4+C4R0 + C6R6+C2R2      A3 = C4R4+C4R0 - (C6R6+C2R2)
 *   A1 = C4R0-C4R4 + C6R2-C2R6      A2 = C4R0-C4R4 - (C6R2-C2R6)
 *   B0 = C3R3+C1R1, B1 = -C7R3+C3R1, B2 = -C1R3+C5R1, B3 = -C5R3+C7R1
 * Each expansion works on two 32-bit lanes, shifts every result right by
 * `shift`, saturates to 16 bit and stores it with movd at:
 *   dst+0:  A0+B0   dst+16: A1+B1   dst+32: A2+B2   dst+48:  A3+B3
 *   dst+64: A3-B3   dst+80: A2-B2   dst+96: A1-B1   dst+112: A0-B0
 * NOTE(review): presumably only reached when the src5 inputs are zero
 * -- confirm against the branch logic outside this excerpt.
 */
1016 #define IDCT(src0, src4, src1, src5, dst, shift) \
1017 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1018 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1019 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1020 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1021 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1022 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1023 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1024 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1025 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1026 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1027 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1028 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1029 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1030 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
1031 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1032 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1033 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1034 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1035 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1036 "movq 64(%2), %%mm1 \n\t" /* -C7 C3 -C7 C3 */\
1037 "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
1038 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1039 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1040 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1041 "psrad $" #shift ", %%mm7 \n\t"\
1042 "psrad $" #shift ", %%mm4 \n\t"\
1043 "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
1044 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1045 "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1046 "psrad $" #shift ", %%mm0 \n\t"\
1047 "psrad $" #shift ", %%mm3 \n\t"\
1048 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1049 "movd %%mm7, " #dst " \n\t"\
1050 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1051 "movd %%mm0, 16+" #dst " \n\t"\
1052 "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1053 "movd %%mm3, 96+" #dst " \n\t"\
1054 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1055 "movd %%mm4, 112+" #dst " \n\t"\
1056 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1057 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1058 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1059 "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
1060 "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1061 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1062 "psrad $" #shift ", %%mm3 \n\t"\
1063 "psrad $" #shift ", %%mm5 \n\t"\
1064 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1065 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1066 "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1067 "psrad $" #shift ", %%mm6 \n\t"\
1068 "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1069 "movd %%mm3, 32+" #dst " \n\t"\
1070 "psrad $" #shift ", %%mm4 \n\t"\
1071 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1072 "movd %%mm6, 48+" #dst " \n\t"\
1073 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1074 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1075 "movd %%mm4, 64+" #dst " \n\t"\
1076 "movd %%mm5, 80+" #dst " \n\t"
1077
1078
1079 //IDCT( src0, src4, src1, src5, dst, shift)
1080 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1081 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1082 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1083 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1084 "jmp 9f \n\t"
1085
1086
1087 "# .p2align 4 \n\t"
1088 "7: \n\t"
1089 #undef IDCT
/*
 * Smallest IDCT variant: only src0 (R4/R0) and the following 8 bytes
 * (8+src0) are read; src4, src1 and src5 are accepted but never referenced.
 * Two values are produced per lane, each shifted right by `shift`:
 *   P = C4*R4 + C4*R0   (mm4, second group from mm1)
 *   M = C4*R0 - C4*R4   (mm0, second group from mm2)
 * Both groups are packed together and stored with 64-bit movq:
 *   P -> dst+0, dst+48, dst+64, dst+112
 *   M -> dst+16, dst+32, dst+80, dst+96
 * Since each expansion covers two 8-byte groups, only the 0(%1) and 16(%1)
 * invocations below are active.
 * NOTE(review): the load of 32(%2) into mm7 is not read again anywhere in
 * this variant and looks like a leftover -- confirm before removing.
 */
1090 #define IDCT(src0, src4, src1, src5, dst, shift) \
1091 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1092 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1093 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1094 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1095 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1096 "psrad $" #shift ", %%mm4 \n\t"\
1097 "psrad $" #shift ", %%mm0 \n\t"\
1098 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 (second group) */\
1099 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1100 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1101 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1102 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1103 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 (mm7 is not read afterwards) */\
1104 "psrad $" #shift ", %%mm1 \n\t"\
1105 "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
1106 "movq %%mm4, " #dst " \n\t"\
1107 "psrad $" #shift ", %%mm2 \n\t"\
1108 "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
1109 "movq %%mm0, 16+" #dst " \n\t"\
1110 "movq %%mm0, 96+" #dst " \n\t"\
1111 "movq %%mm4, 112+" #dst " \n\t"\
1112 "movq %%mm0, 32+" #dst " \n\t"\
1113 "movq %%mm4, 48+" #dst " \n\t"\
1114 "movq %%mm4, 64+" #dst " \n\t"\
1115 "movq %%mm0, 80+" #dst " \n\t"
1116
1117 //IDCT( src0, src4, src1, src5, dst, shift)
1118 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1119 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1120 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1121 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1122
1123
1124 #endif
1125
1126 /*
1127 Input
1128 00 40 04 44 20 60 24 64
1129 10 30 14 34 50 70 54 74
1130 01 41 03 43 21 61 23 63
1131 11 31 13 33 51 71 53 73
1132 02 42 06 46 22 62 26 66
1133 12 32 16 36 52 72 56 76
1134 05 45 07 47 25 65 27 67
1135 15 35 17 37 55 75 57 77
1136
1137 Temp
1138 00 04 10 14 20 24 30 34
1139 40 44 50 54 60 64 70 74
1140 01 03 11 13 21 23 31 33
1141 41 43 51 53 61 63 71 73
1142 02 06 12 16 22 26 32 36
1143 42 46 52 56 62 66 72 76
1144 05 07 15 17 25 27 35 37
1145 45 47 55 57 65 67 75 77
1146 */
1147
1148 "9: \n\t"
1149 :: "r" (block), "r" (temp), "r" (coeffs)
1150 NAMED_CONSTRAINTS_ADD(wm1010,d40000)
1151 : "%eax"
1152 );
1153 }
1154
/**
 * Public entry point for the MMX simple IDCT.
 *
 * Thin wrapper that forwards to the file-local idct(), whose inline
 * assembly writes its results back into the same buffer.
 *
 * @param block coefficient buffer, transformed in place
 */
1155 void ff_simple_idct_mmx(int16_t *block)
1156 {
1157 idct(block);
1158 }
1159
1160 //FIXME merge add/put into the idct
1161
/**
 * Run the MMX simple IDCT on block, then hand the result to
 * ff_put_pixels_clamped().
 *
 * @param dest      destination passed through to ff_put_pixels_clamped()
 * @param line_size passed through unchanged; presumably the byte stride
 *                  between destination rows -- confirm against the helper
 * @param block     coefficient buffer; overwritten by idct() before being
 *                  read by the helper
 */
1162 void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block)
1163 {
1164 idct(block);
1165 ff_put_pixels_clamped(block, dest, line_size);
1166 }
/**
 * Run the MMX simple IDCT on block, then hand the result to
 * ff_add_pixels_clamped().
 *
 * @param dest      destination passed through to ff_add_pixels_clamped();
 *                  presumably the residual is added to the pixels already
 *                  there -- confirm against the helper
 * @param line_size passed through unchanged; presumably the byte stride
 *                  between destination rows -- confirm against the helper
 * @param block     coefficient buffer; overwritten by idct() before being
 *                  read by the helper
 */
1167 void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block)
1168 {
1169 idct(block);
1170 ff_add_pixels_clamped(block, dest, line_size);
1171 }
1172
1173 #endif /* HAVE_INLINE_ASM */