Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libavcodec / x86 / simple_idct.c
CommitLineData
2ba45a60
DM
1/*
2 * Simple IDCT MMX
3 *
4 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23#include "libavutil/mem.h"
24#include "libavutil/x86/asm.h"
25
26#include "libavcodec/idctdsp.h"
27
28#include "idctdsp.h"
29#include "simple_idct.h"
30
31#if HAVE_INLINE_ASM
32
33/*
3423170.475006
3522725.260826
3621406.727617
3719265.545870
3816384.000000
3912872.826198
408866.956905
414520.335430
42*/
/*
 * Fixed-point cosine constants for the IDCT:
 *   Ci = cos(i*M_PI/16) * sqrt(2) * (1<<14), converted to an integer.
 * C1..C7 are used to build the coeffs[] table below; C0 is not referenced
 * in the part of this file visible here.
 */
43#define C0 23170 // cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
44#define C1 22725 // cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
45#define C2 21407 // cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
46#define C3 19266 // cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
47#define C4 16383 // cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5; the exact value is 16384, so this is one less. NOTE(review): presumably deliberate -- confirm before "fixing".
48#define C5 12873 // cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
49#define C6 8867 // cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
50#define C7 4520 // cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
51
/*
 * Right-shift amounts of the row pass and the column pass.
 * ROW_SHIFT is used for the rounding constants at the start of coeffs[].
 * The macro invocations inside idct() pass the literal values 11 and 20
 * as their shift argument instead of these names, so the two must be kept
 * in sync by hand.
 */
52#define ROW_SHIFT 11
53#define COL_SHIFT 20 // 6
54
/*
 * 64-bit constants referenced from the inline asm through MANGLE().
 *
 * wm1010: and-ed (pand) in DC_COND_IDCT with the register that holds
 *         "R4 R0 r4 r0". The mask keeps the upper 16-bit word of each
 *         32-bit half (R4, r4) and clears R0/r0, so the DC coefficients do
 *         not take part in the "is everything else zero?" test that follows.
 * d40000: holds 0x40000 in the low 32-bit half only. It is added with paddd
 *         in the DC-only path of DC_COND_IDCT, between "pslld $16" and
 *         "psrad $13".
 */
55DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
56DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
57
/*
 * Constant table read by the inline asm in idct().
 *
 * Every row below is four int16_t values = 8 bytes, i.e. one movq/pmaddwd/
 * paddd memory operand. The asm addresses the rows with hard-coded byte
 * offsets of the form N(%2) (presumably %2 is this table -- the asm operand
 * list is outside the visible part of the file, TODO confirm), so rows must
 * not be reordered, inserted or removed without updating every offset.
 * The byte offset of each active row is noted next to it.
 */
58DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
59 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, /* offset 0: row-pass rounder, used as "paddd (%2)" */
60// 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
61// 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
62 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, /* offset 8: rounder for the first row group, used as "paddd 8(%2)" */
63 // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
 // Read as a 32-bit lane by paddd, the int16 value 1 in the second slot is
 // worth 1<<16, which equals ((1<<19)/16383)<<11 = 32<<11 for the current
 // COL_SHIFT/C4/ROW_SHIFT values.
64// 0, 0, 0, 0,
65// 0, 0, 0, 0,
66
67 C4, C4, C4, C4, /* offset 16 */
68 C4, -C4, C4, -C4, /* offset 24 */
69
70 C2, C6, C2, C6, /* offset 32 */
71 C6, -C2, C6, -C2, /* offset 40 */
72
73 C1, C3, C1, C3, /* offset 48 */
74 C5, C7, C5, C7, /* offset 56 */
75
76 C3, -C7, C3, -C7, /* offset 64 */
77-C1, -C5, -C1, -C5, /* offset 72 */
78
79 C5, -C1, C5, -C1, /* offset 80 */
80 C7, C3, C7, C3, /* offset 88 */
81
82 C7, -C5, C7, -C5, /* offset 96 */
83 C3, -C1, C3, -C1 /* offset 104 */
84};
85
86static inline void idct(int16_t *block)
87{
88 LOCAL_ALIGNED_8(int64_t, align_tmp, [16]);
89 int16_t * const temp= (int16_t*)align_tmp;
90
91 __asm__ volatile(
92#if 0 //Alternative, simpler variant
93
94#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
95 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
96 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
97 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
98 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
99 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
100 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
101 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
102 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
103 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
104 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
105 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
106 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
107 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
108 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
109 #rounder ", %%mm4 \n\t"\
110 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
111 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
112 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
113 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
114 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
115 #rounder ", %%mm0 \n\t"\
116 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
117 "paddd %%mm0, %%mm0 \n\t" \
118 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
119 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
120 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
121 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
122 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
123 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
124 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
125 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
126 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
127 "psrad $" #shift ", %%mm7 \n\t"\
128 "psrad $" #shift ", %%mm4 \n\t"\
129 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
130 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
131 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
132 "psrad $" #shift ", %%mm1 \n\t"\
133 "psrad $" #shift ", %%mm2 \n\t"\
134 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
135 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
136 "movq %%mm7, " #dst " \n\t"\
137 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
138 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
139 "movq %%mm2, 24+" #dst " \n\t"\
140 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
141 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
142 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
143 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
144 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
145 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
146 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
147 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
148 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
149 "psrad $" #shift ", %%mm2 \n\t"\
150 "psrad $" #shift ", %%mm0 \n\t"\
151 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
152 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
153 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
154 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
155 "psrad $" #shift ", %%mm6 \n\t"\
156 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
157 "movq %%mm2, 8+" #dst " \n\t"\
158 "psrad $" #shift ", %%mm4 \n\t"\
159 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
160 "movq %%mm4, 16+" #dst " \n\t"\
161
162#define COL_IDCT(src0, src4, src1, src5, dst, shift) \
163 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
164 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
165 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
166 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
167 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
168 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
169 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
170 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
171 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
172 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
173 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
174 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
175 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
176 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
177 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
178 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
179 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
180 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
181 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
182 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
183 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
184 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
185 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
186 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
187 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
188 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
189 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
190 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
191 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
192 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
193 "psrad $" #shift ", %%mm7 \n\t"\
194 "psrad $" #shift ", %%mm4 \n\t"\
195 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
196 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
197 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
198 "psrad $" #shift ", %%mm0 \n\t"\
199 "psrad $" #shift ", %%mm2 \n\t"\
200 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
201 "movd %%mm7, " #dst " \n\t"\
202 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
203 "movd %%mm0, 16+" #dst " \n\t"\
204 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
205 "movd %%mm2, 96+" #dst " \n\t"\
206 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
207 "movd %%mm4, 112+" #dst " \n\t"\
208 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
209 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
210 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
211 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
212 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
213 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
214 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
215 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
216 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
217 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
218 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
219 "psrad $" #shift ", %%mm2 \n\t"\
220 "psrad $" #shift ", %%mm5 \n\t"\
221 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
222 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
223 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
224 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
225 "psrad $" #shift ", %%mm6 \n\t"\
226 "psrad $" #shift ", %%mm4 \n\t"\
227 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
228 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
229 "movd %%mm2, 32+" #dst " \n\t"\
230 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
231 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
232 "movd %%mm6, 48+" #dst " \n\t"\
233 "movd %%mm4, 64+" #dst " \n\t"\
234 "movd %%mm5, 80+" #dst " \n\t"\
235
236
237#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
238 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
239 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
240 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
241 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
242 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
243 "pand %%mm0, %%mm4 \n\t"\
244 "por %%mm1, %%mm4 \n\t"\
245 "por %%mm2, %%mm4 \n\t"\
246 "por %%mm3, %%mm4 \n\t"\
247 "packssdw %%mm4,%%mm4 \n\t"\
248 "movd %%mm4, %%eax \n\t"\
249 "orl %%eax, %%eax \n\t"\
250 "jz 1f \n\t"\
251 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
252 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
253 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
254 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
255 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
256 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
257 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
258 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
259 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
260 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
261 #rounder ", %%mm4 \n\t"\
262 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
263 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
264 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
265 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
266 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
267 #rounder ", %%mm0 \n\t"\
268 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
269 "paddd %%mm0, %%mm0 \n\t" \
270 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
271 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
272 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
273 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
274 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
275 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
276 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
277 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
278 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
279 "psrad $" #shift ", %%mm7 \n\t"\
280 "psrad $" #shift ", %%mm4 \n\t"\
281 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
282 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
283 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
284 "psrad $" #shift ", %%mm1 \n\t"\
285 "psrad $" #shift ", %%mm2 \n\t"\
286 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
287 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
288 "movq %%mm7, " #dst " \n\t"\
289 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
290 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
291 "movq %%mm2, 24+" #dst " \n\t"\
292 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
293 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
294 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
295 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
296 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
297 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
298 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
299 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
300 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
301 "psrad $" #shift ", %%mm2 \n\t"\
302 "psrad $" #shift ", %%mm0 \n\t"\
303 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
304 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
305 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
306 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
307 "psrad $" #shift ", %%mm6 \n\t"\
308 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
309 "movq %%mm2, 8+" #dst " \n\t"\
310 "psrad $" #shift ", %%mm4 \n\t"\
311 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
312 "movq %%mm4, 16+" #dst " \n\t"\
313 "jmp 2f \n\t"\
314 "1: \n\t"\
315 "pslld $16, %%mm0 \n\t"\
316 "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
317 "psrad $13, %%mm0 \n\t"\
318 "packssdw %%mm0, %%mm0 \n\t"\
319 "movq %%mm0, " #dst " \n\t"\
320 "movq %%mm0, 8+" #dst " \n\t"\
321 "movq %%mm0, 16+" #dst " \n\t"\
322 "movq %%mm0, 24+" #dst " \n\t"\
323 "2: \n\t"
324
325
326//IDCT( src0, src4, src1, src5, dst, rounder, shift)
327ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
328/*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
329ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
330ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
331
332DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
333DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
334DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
335
336
337//IDCT( src0, src4, src1, src5, dst, shift)
338COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
339COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
340COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
341COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
342
343#else
344
345#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
346 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
347 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
348 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
349 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
350 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
351 "pand %%mm0, %%mm4 \n\t"\
352 "por %%mm1, %%mm4 \n\t"\
353 "por %%mm2, %%mm4 \n\t"\
354 "por %%mm3, %%mm4 \n\t"\
355 "packssdw %%mm4,%%mm4 \n\t"\
356 "movd %%mm4, %%eax \n\t"\
357 "orl %%eax, %%eax \n\t"\
358 "jz 1f \n\t"\
359 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
360 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
361 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
362 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
363 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
364 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
365 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
366 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
367 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
368 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
369 #rounder ", %%mm4 \n\t"\
370 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
371 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
372 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
373 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
374 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
375 #rounder ", %%mm0 \n\t"\
376 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
377 "paddd %%mm0, %%mm0 \n\t" \
378 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
379 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
380 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
381 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
382 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
383 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
384 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
385 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
386 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
387 "psrad $" #shift ", %%mm7 \n\t"\
388 "psrad $" #shift ", %%mm4 \n\t"\
389 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
390 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
391 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
392 "psrad $" #shift ", %%mm1 \n\t"\
393 "psrad $" #shift ", %%mm2 \n\t"\
394 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
395 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
396 "movq %%mm7, " #dst " \n\t"\
397 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
398 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
399 "movq %%mm2, 24+" #dst " \n\t"\
400 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
401 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
402 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
403 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
404 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
405 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
406 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
407 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
408 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
409 "psrad $" #shift ", %%mm2 \n\t"\
410 "psrad $" #shift ", %%mm0 \n\t"\
411 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
412 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
413 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
414 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
415 "psrad $" #shift ", %%mm6 \n\t"\
416 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
417 "movq %%mm2, 8+" #dst " \n\t"\
418 "psrad $" #shift ", %%mm4 \n\t"\
419 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
420 "movq %%mm4, 16+" #dst " \n\t"\
421 "jmp 2f \n\t"\
422 "1: \n\t"\
423 "pslld $16, %%mm0 \n\t"\
424 "paddd "MANGLE(d40000)", %%mm0 \n\t"\
425 "psrad $13, %%mm0 \n\t"\
426 "packssdw %%mm0, %%mm0 \n\t"\
427 "movq %%mm0, " #dst " \n\t"\
428 "movq %%mm0, 8+" #dst " \n\t"\
429 "movq %%mm0, 16+" #dst " \n\t"\
430 "movq %%mm0, 24+" #dst " \n\t"\
431 "2: \n\t"
432
433#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
434 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
435 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
436 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
437 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
438 "movq %%mm0, %%mm4 \n\t"\
439 "por %%mm1, %%mm4 \n\t"\
440 "por %%mm2, %%mm4 \n\t"\
441 "por %%mm3, %%mm4 \n\t"\
442 "packssdw %%mm4,%%mm4 \n\t"\
443 "movd %%mm4, %%eax \n\t"\
444 "orl %%eax, %%eax \n\t"\
445 "jz " #bt " \n\t"\
446 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
447 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
448 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
449 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
450 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
451 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
452 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
453 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
454 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
455 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
456 #rounder ", %%mm4 \n\t"\
457 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
458 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
459 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
460 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
461 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
462 #rounder ", %%mm0 \n\t"\
463 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
464 "paddd %%mm0, %%mm0 \n\t" \
465 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
466 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
467 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
468 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
469 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
470 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
471 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
472 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
473 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
474 "psrad $" #shift ", %%mm7 \n\t"\
475 "psrad $" #shift ", %%mm4 \n\t"\
476 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
477 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
478 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
479 "psrad $" #shift ", %%mm1 \n\t"\
480 "psrad $" #shift ", %%mm2 \n\t"\
481 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
482 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
483 "movq %%mm7, " #dst " \n\t"\
484 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
485 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
486 "movq %%mm2, 24+" #dst " \n\t"\
487 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
488 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
489 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
490 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
491 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
492 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
493 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
494 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
495 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
496 "psrad $" #shift ", %%mm2 \n\t"\
497 "psrad $" #shift ", %%mm0 \n\t"\
498 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
499 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
500 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
501 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
502 "psrad $" #shift ", %%mm6 \n\t"\
503 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
504 "movq %%mm2, 8+" #dst " \n\t"\
505 "psrad $" #shift ", %%mm4 \n\t"\
506 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
507 "movq %%mm4, 16+" #dst " \n\t"\
508
509#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
510 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
511 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
512 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
513 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
514 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
515 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
516 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
517 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
518 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
519 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
520 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
521 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
522 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
523 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
524 #rounder ", %%mm4 \n\t"\
525 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
526 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
527 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
528 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
529 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
530 #rounder ", %%mm0 \n\t"\
531 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
532 "paddd %%mm0, %%mm0 \n\t" \
533 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
534 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
535 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
536 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
537 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
538 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
539 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
540 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
541 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
542 "psrad $" #shift ", %%mm7 \n\t"\
543 "psrad $" #shift ", %%mm4 \n\t"\
544 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
545 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
546 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
547 "psrad $" #shift ", %%mm1 \n\t"\
548 "psrad $" #shift ", %%mm2 \n\t"\
549 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
550 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
551 "movq %%mm7, " #dst " \n\t"\
552 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
553 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
554 "movq %%mm2, 24+" #dst " \n\t"\
555 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
556 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
557 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
558 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
559 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
560 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
561 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
562 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
563 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
564 "psrad $" #shift ", %%mm2 \n\t"\
565 "psrad $" #shift ", %%mm0 \n\t"\
566 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
567 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
568 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
569 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
570 "psrad $" #shift ", %%mm6 \n\t"\
571 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
572 "movq %%mm2, 8+" #dst " \n\t"\
573 "psrad $" #shift ", %%mm4 \n\t"\
574 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
575 "movq %%mm4, 16+" #dst " \n\t"\
576
577//IDCT( src0, src4, src1, src5, dst, rounder, shift)
578DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
579Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
580Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
581Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
582
583#undef IDCT
584#define IDCT(src0, src4, src1, src5, dst, shift) \
585 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
586 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
587 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
588 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
589 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
590 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
591 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
592 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
593 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
594 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
595 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
596 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
597 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
598 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
599 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
600 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
601 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
602 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
603 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
604 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
605 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
606 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
607 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
608 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
609 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
610 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
611 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
612 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
613 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
614 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
615 "psrad $" #shift ", %%mm7 \n\t"\
616 "psrad $" #shift ", %%mm4 \n\t"\
617 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
618 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
619 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
620 "psrad $" #shift ", %%mm0 \n\t"\
621 "psrad $" #shift ", %%mm2 \n\t"\
622 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
623 "movd %%mm7, " #dst " \n\t"\
624 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
625 "movd %%mm0, 16+" #dst " \n\t"\
626 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
627 "movd %%mm2, 96+" #dst " \n\t"\
628 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
629 "movd %%mm4, 112+" #dst " \n\t"\
630 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
631 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
632 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
633 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
634 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
635 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
636 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
637 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
638 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
639 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
640 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
641 "psrad $" #shift ", %%mm2 \n\t"\
642 "psrad $" #shift ", %%mm5 \n\t"\
643 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
644 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
645 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
646 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
647 "psrad $" #shift ", %%mm6 \n\t"\
648 "psrad $" #shift ", %%mm4 \n\t"\
649 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
650 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
651 "movd %%mm2, 32+" #dst " \n\t"\
652 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
653 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
654 "movd %%mm6, 48+" #dst " \n\t"\
655 "movd %%mm4, 64+" #dst " \n\t"\
656 "movd %%mm5, 80+" #dst " \n\t"
657
658
659//IDCT( src0, src4, src1, src5, dst, shift)
660IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
661IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
662IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
663IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
664 "jmp 9f \n\t"
665
666 "# .p2align 4 \n\t"\
667 "4: \n\t"
668Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
669Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
670
671#undef IDCT
672#define IDCT(src0, src4, src1, src5, dst, shift) \
673 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
674 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
675 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
676 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
677 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
678 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
679 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
680 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
681 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
682 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
683 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
684 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
685 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
686 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
687 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
688 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
689 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
690 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
691 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
692 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
693 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
694 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
695 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
696 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
697 "psrad $" #shift ", %%mm1 \n\t"\
698 "psrad $" #shift ", %%mm4 \n\t"\
699 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
700 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
701 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
702 "psrad $" #shift ", %%mm0 \n\t"\
703 "psrad $" #shift ", %%mm2 \n\t"\
704 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
705 "movd %%mm1, " #dst " \n\t"\
706 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
707 "movd %%mm0, 16+" #dst " \n\t"\
708 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
709 "movd %%mm2, 96+" #dst " \n\t"\
710 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
711 "movd %%mm4, 112+" #dst " \n\t"\
712 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
713 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
714 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
715 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
716 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
717 "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
718 "psrad $" #shift ", %%mm2 \n\t"\
719 "psrad $" #shift ", %%mm5 \n\t"\
720 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
721 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
722 "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
723 "psrad $" #shift ", %%mm6 \n\t"\
724 "psrad $" #shift ", %%mm1 \n\t"\
725 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
726 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
727 "movd %%mm2, 32+" #dst " \n\t"\
728 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
729 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
730 "movd %%mm6, 48+" #dst " \n\t"\
731 "movd %%mm1, 64+" #dst " \n\t"\
732 "movd %%mm5, 80+" #dst " \n\t"
733
734//IDCT( src0, src4, src1, src5, dst, shift)
735IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
736IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
737IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
738IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
739 "jmp 9f \n\t"
740
741 "# .p2align 4 \n\t"\
742 "6: \n\t"
743Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
744
745#undef IDCT
746#define IDCT(src0, src4, src1, src5, dst, shift) \
747 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
748 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
749 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
750 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
751 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
752 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
753 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
754 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
755 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
756 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
757 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
758 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
759 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
760 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
761 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
762 "psrad $" #shift ", %%mm1 \n\t"\
763 "psrad $" #shift ", %%mm4 \n\t"\
764 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
765 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
766 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
767 "psrad $" #shift ", %%mm0 \n\t"\
768 "psrad $" #shift ", %%mm2 \n\t"\
769 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
770 "movd %%mm1, " #dst " \n\t"\
771 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
772 "movd %%mm0, 16+" #dst " \n\t"\
773 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
774 "movd %%mm2, 96+" #dst " \n\t"\
775 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
776 "movd %%mm4, 112+" #dst " \n\t"\
777 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
778 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
779 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
780 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
781 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
782 "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
783 "psrad $" #shift ", %%mm2 \n\t"\
784 "psrad $" #shift ", %%mm5 \n\t"\
785 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
786 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
787 "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
788 "psrad $" #shift ", %%mm6 \n\t"\
789 "psrad $" #shift ", %%mm1 \n\t"\
790 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
791 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
792 "movd %%mm2, 32+" #dst " \n\t"\
793 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
794 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
795 "movd %%mm6, 48+" #dst " \n\t"\
796 "movd %%mm1, 64+" #dst " \n\t"\
797 "movd %%mm5, 80+" #dst " \n\t"
798
799
800//IDCT( src0, src4, src1, src5, dst, shift)
801IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
802IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
803IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
804IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
805 "jmp 9f \n\t"
806
807 "# .p2align 4 \n\t"\
808 "2: \n\t"
809Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
810
811#undef IDCT
812#define IDCT(src0, src4, src1, src5, dst, shift) \
813 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
814 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
815 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
816 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
817 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
818 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
819 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
820 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
821 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
822 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
823 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
824 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
825 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
826 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
827 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
828 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
829 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
830 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
831 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
832 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
833 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
834 "psrad $" #shift ", %%mm7 \n\t"\
835 "psrad $" #shift ", %%mm4 \n\t"\
836 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
837 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
838 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
839 "psrad $" #shift ", %%mm0 \n\t"\
840 "psrad $" #shift ", %%mm2 \n\t"\
841 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
842 "movd %%mm7, " #dst " \n\t"\
843 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
844 "movd %%mm0, 16+" #dst " \n\t"\
845 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
846 "movd %%mm2, 96+" #dst " \n\t"\
847 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
848 "movd %%mm4, 112+" #dst " \n\t"\
849 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
850 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
851 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
852 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
853 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
854 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
855 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
856 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
857 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
858 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
859 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
860 "psrad $" #shift ", %%mm2 \n\t"\
861 "psrad $" #shift ", %%mm5 \n\t"\
862 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
863 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
864 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
865 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
866 "psrad $" #shift ", %%mm6 \n\t"\
867 "psrad $" #shift ", %%mm4 \n\t"\
868 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
869 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
870 "movd %%mm2, 32+" #dst " \n\t"\
871 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
872 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
873 "movd %%mm6, 48+" #dst " \n\t"\
874 "movd %%mm4, 64+" #dst " \n\t"\
875 "movd %%mm5, 80+" #dst " \n\t"
876
877//IDCT( src0, src4, src1, src5, dst, shift)
878IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
879IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
880IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
881IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
882 "jmp 9f \n\t"
883
884 "# .p2align 4 \n\t"\
885 "3: \n\t"
886#undef IDCT
887#define IDCT(src0, src4, src1, src5, dst, shift) \
888 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
889 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
890 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
891 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
892 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
893 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
894 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
895 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
896 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
897 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
898 "movq 64(%2), %%mm3 \n\t"\
899 "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
900 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
901 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
902 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
903 "psrad $" #shift ", %%mm7 \n\t"\
904 "psrad $" #shift ", %%mm4 \n\t"\
905 "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
906 "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
907 "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
908 "psrad $" #shift ", %%mm0 \n\t"\
909 "psrad $" #shift ", %%mm1 \n\t"\
910 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
911 "movd %%mm7, " #dst " \n\t"\
912 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
913 "movd %%mm0, 16+" #dst " \n\t"\
914 "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
915 "movd %%mm1, 96+" #dst " \n\t"\
916 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
917 "movd %%mm4, 112+" #dst " \n\t"\
918 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
919 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
920 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
921 "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
922 "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
923 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
924 "psrad $" #shift ", %%mm1 \n\t"\
925 "psrad $" #shift ", %%mm5 \n\t"\
926 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
927 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
928 "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
929 "psrad $" #shift ", %%mm6 \n\t"\
930 "psrad $" #shift ", %%mm4 \n\t"\
931 "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
932 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
933 "movd %%mm1, 32+" #dst " \n\t"\
934 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
935 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
936 "movd %%mm6, 48+" #dst " \n\t"\
937 "movd %%mm4, 64+" #dst " \n\t"\
938 "movd %%mm5, 80+" #dst " \n\t"
939
940
941//IDCT( src0, src4, src1, src5, dst, shift)
942IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
943IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
944IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
945IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
946 "jmp 9f \n\t"
947
948 "# .p2align 4 \n\t"\
949 "5: \n\t"
950#undef IDCT
951#define IDCT(src0, src4, src1, src5, dst, shift) \
952 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
953 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
954 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
955 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
956 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
957 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
958 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
959 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
960 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
961 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
962 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
963 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
964 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
965 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
966 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
967 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
968 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
969 "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
970 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
971 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
972 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
973 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
974 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
975 "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
976 "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
977 "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
978 "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
979 "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
980 "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
981 "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
982 "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
983 "psrad $" #shift ", %%mm4 \n\t"\
984 "psrad $" #shift ", %%mm7 \n\t"\
985 "psrad $" #shift ", %%mm3 \n\t"\
986 "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
987 "movq %%mm4, " #dst " \n\t"\
988 "psrad $" #shift ", %%mm0 \n\t"\
989 "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
990 "movq %%mm0, 16+" #dst " \n\t"\
991 "movq %%mm0, 96+" #dst " \n\t"\
992 "movq %%mm4, 112+" #dst " \n\t"\
993 "psrad $" #shift ", %%mm5 \n\t"\
994 "psrad $" #shift ", %%mm6 \n\t"\
995 "psrad $" #shift ", %%mm2 \n\t"\
996 "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\
997 "movq %%mm5, 32+" #dst " \n\t"\
998 "psrad $" #shift ", %%mm1 \n\t"\
999 "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1000 "movq %%mm6, 48+" #dst " \n\t"\
1001 "movq %%mm6, 64+" #dst " \n\t"\
1002 "movq %%mm5, 80+" #dst " \n\t"
1003
1004
1005//IDCT( src0, src4, src1, src5, dst, shift)
1006IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1007//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1008IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1009//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1010 "jmp 9f \n\t"
1011
1012
1013 "# .p2align 4 \n\t"\
1014 "1: \n\t"
1015#undef IDCT
1016#define IDCT(src0, src4, src1, src5, dst, shift) \
1017 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1018 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1019 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1020 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1021 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1022 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1023 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1024 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1025 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1026 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1027 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1028 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1029 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1030 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
1031 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1032 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1033 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1034 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1035 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1036 "movq 64(%2), %%mm1 \n\t"\
1037 "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
1038 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1039 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1040 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1041 "psrad $" #shift ", %%mm7 \n\t"\
1042 "psrad $" #shift ", %%mm4 \n\t"\
1043 "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
1044 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1045 "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1046 "psrad $" #shift ", %%mm0 \n\t"\
1047 "psrad $" #shift ", %%mm3 \n\t"\
1048 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1049 "movd %%mm7, " #dst " \n\t"\
1050 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1051 "movd %%mm0, 16+" #dst " \n\t"\
1052 "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1053 "movd %%mm3, 96+" #dst " \n\t"\
1054 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1055 "movd %%mm4, 112+" #dst " \n\t"\
1056 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1057 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1058 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1059 "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
1060 "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1061 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
1062 "psrad $" #shift ", %%mm3 \n\t"\
1063 "psrad $" #shift ", %%mm5 \n\t"\
1064 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1065 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1066 "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
1067 "psrad $" #shift ", %%mm6 \n\t"\
1068 "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1069 "movd %%mm3, 32+" #dst " \n\t"\
1070 "psrad $" #shift ", %%mm4 \n\t"\
1071 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1072 "movd %%mm6, 48+" #dst " \n\t"\
1073 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1074 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1075 "movd %%mm4, 64+" #dst " \n\t"\
1076 "movd %%mm5, 80+" #dst " \n\t"
1077
1078
1079//IDCT( src0, src4, src1, src5, dst, shift)
1080IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1081IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1082IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1083IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1084 "jmp 9f \n\t"
1085
1086
1087 "# .p2align 4 \n\t"
1088 "7: \n\t"
1089#undef IDCT
1090#define IDCT(src0, src4, src1, src5, dst, shift) \
1091 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1092 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1093 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1094 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1095 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1096 "psrad $" #shift ", %%mm4 \n\t"\
1097 "psrad $" #shift ", %%mm0 \n\t"\
1098 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
1099 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1100 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1101 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1102 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1103 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
1104 "psrad $" #shift ", %%mm1 \n\t"\
1105 "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
1106 "movq %%mm4, " #dst " \n\t"\
1107 "psrad $" #shift ", %%mm2 \n\t"\
1108 "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
1109 "movq %%mm0, 16+" #dst " \n\t"\
1110 "movq %%mm0, 96+" #dst " \n\t"\
1111 "movq %%mm4, 112+" #dst " \n\t"\
1112 "movq %%mm0, 32+" #dst " \n\t"\
1113 "movq %%mm4, 48+" #dst " \n\t"\
1114 "movq %%mm4, 64+" #dst " \n\t"\
1115 "movq %%mm0, 80+" #dst " \n\t"
1116
1117//IDCT( src0, src4, src1, src5, dst, shift)
1118IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1119//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1120IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1121//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1122
1123
1124#endif
1125
1126/*
1127Input
1128 00 40 04 44 20 60 24 64
1129 10 30 14 34 50 70 54 74
1130 01 41 03 43 21 61 23 63
1131 11 31 13 33 51 71 53 73
1132 02 42 06 46 22 62 26 66
1133 12 32 16 36 52 72 56 76
1134 05 45 07 47 25 65 27 67
1135 15 35 17 37 55 75 57 77
1136
1137Temp
1138 00 04 10 14 20 24 30 34
1139 40 44 50 54 60 64 70 74
1140 01 03 11 13 21 23 31 33
1141 41 43 51 53 61 63 71 73
1142 02 06 12 16 22 26 32 36
1143 42 46 52 56 62 66 72 76
1144 05 07 15 17 25 27 35 37
1145 45 47 55 57 65 67 75 77
1146*/
1147
1148"9: \n\t"
1149 :: "r" (block), "r" (temp), "r" (coeffs)
1150 NAMED_CONSTRAINTS_ADD(wm1010,d40000)
1151 : "%eax"
1152 );
1153}
1154
/**
 * Public entry point for the MMX simple IDCT.
 *
 * Thin wrapper that forwards to the file-local idct() routine. idct() is
 * handed block as its destination operand, so the transformed values end
 * up back in the same buffer.
 *
 * @param block coefficient buffer, transformed in place
 *              (NOTE(review): presumably an 8-byte aligned 8x8 int16_t
 *              block, as the asm uses 64-bit loads/stores - confirm
 *              against callers)
 */
1155void ff_simple_idct_mmx(int16_t *block)
1156{
1157 idct(block);
1158}
1159
1160//FIXME merge add/put into the idct
1161
/**
 * Run the MMX simple IDCT on block, then hand the result to
 * ff_put_pixels_clamped() together with dest and line_size.
 *
 * block is modified: idct() leaves the transformed values in it before
 * the put helper reads them.
 *
 * @param dest      destination passed through to ff_put_pixels_clamped()
 *                  (presumably the output pixel rows - helper is defined
 *                  outside this file, verify there)
 * @param line_size passed through unchanged to the helper (presumably the
 *                  distance between destination rows - TODO confirm)
 * @param block     coefficient buffer, transformed in place
 */
1162void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block)
1163{
1164 idct(block);
1165 ff_put_pixels_clamped(block, dest, line_size);
1166}
/**
 * Run the MMX simple IDCT on block, then hand the result to
 * ff_add_pixels_clamped() together with dest and line_size.
 *
 * block is modified: idct() leaves the transformed values in it before
 * the add helper reads them.
 *
 * @param dest      destination passed through to ff_add_pixels_clamped()
 *                  (presumably existing pixels that the result is added
 *                  to - helper is defined outside this file, verify there)
 * @param line_size passed through unchanged to the helper (presumably the
 *                  distance between destination rows - TODO confirm)
 * @param block     coefficient buffer, transformed in place
 */
1167void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block)
1168{
1169 idct(block);
1170 ff_add_pixels_clamped(block, dest, line_size);
1171}
1172
1173#endif /* HAVE_INLINE_ASM */