Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * Simple IDCT MMX | |
3 | * | |
4 | * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at> | |
5 | * | |
6 | * This file is part of FFmpeg. | |
7 | * | |
8 | * FFmpeg is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License as published by the Free Software Foundation; either | |
11 | * version 2.1 of the License, or (at your option) any later version. | |
12 | * | |
13 | * FFmpeg is distributed in the hope that it will be useful, | |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 | * Lesser General Public License for more details. | |
17 | * | |
18 | * You should have received a copy of the GNU Lesser General Public | |
19 | * License along with FFmpeg; if not, write to the Free Software | |
20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 | */ | |
22 | ||
23 | #include "libavutil/mem.h" | |
24 | #include "libavutil/x86/asm.h" | |
25 | ||
26 | #include "libavcodec/idctdsp.h" | |
27 | ||
28 | #include "idctdsp.h" | |
29 | #include "simple_idct.h" | |
30 | ||
31 | #if HAVE_INLINE_ASM | |
32 | ||
33 | /* Exact (unrounded) values of cos(i*M_PI/16)*sqrt(2)*(1<<14) for i = 0..7, in that order: | |
34 | 23170.475006 | |
35 | 22725.260826 | |
36 | 21406.727617 | |
37 | 19265.545870 | |
38 | 16384.000000 | |
39 | 12872.826198 | |
40 | 8866.956905 | |
41 | 4520.335430 | |
42 | */ | |
43 | #define C0 23170 // i=0: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer | |
44 | #define C1 22725 // i=1: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer | |
45 | #define C2 21407 // i=2: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer | |
46 | #define C3 19266 // i=3: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer | |
47 | #define C4 16383 // i=4: cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 (note the minus), i.e. one below the exact 16384 listed above | |
48 | #define C5 12873 // i=5: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer | |
49 | #define C6 8867 // i=6: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer | |
50 | #define C7 4520 // i=7: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer | |
51 | ||
52 | #define ROW_SHIFT 11 /* right shift of the row pass; also sizes the rounding term 1<<(ROW_SHIFT-1) in coeffs. The row-macro invocations in idct() pass the literal 11, not this name. */ | |
53 | #define COL_SHIFT 20 // right shift of the column pass (the column macros in idct() are invoked with the literal 20). Original note here reads just: 6 -- NOTE(review): its meaning is not evident from this file | |
54 | ||
55 | DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL; /* mask that keeps the upper 16-bit word of each 32-bit half and clears the lower one; DC_COND_IDCT ANDs it with the first source quad before OR-ing in the other three quads for its all-zero test */ | |
56 | DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL; /* 1<<18 in the low 32 bits only (upper 32 bits are zero); added with paddd in the DC-only shortcut of DC_COND_IDCT, between pslld $16 and psrad $13 */ | |
57 | ||
58 | DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= { /* Constant table for the MMX IDCT: rows of four int16_t, i.e. 8 bytes per row. The asm in idct() loads 8-byte rows at hard-coded byte offsets 0..104 from its operand 2, and the comments on those loads match the rows here, so row order and count must not change. NOTE(review): operand 2 is presumably this array; the asm operand list is outside this excerpt - confirm. */ | |
59 | 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, /* offset 0: rounding term 1<<(ROW_SHIFT-1) in the low word of each 32-bit half; matches the "paddd (%2)" rounder argument */ | |
60 | // 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0, | |
61 | // 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16), | |
62 | 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, /* offset 8: same rounder plus a 1 in the second word; matches the "paddd 8(%2)" rounder argument of the first DC_COND_IDCT invocation */ | |
63 | // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :) | |
64 | // 0, 0, 0, 0, | |
65 | // 0, 0, 0, 0, | |
66 | ||
67 | C4, C4, C4, C4, /* offset 16: pmaddwd with the (0,4) input pair -> C4*r0 + C4*r4 */ | |
68 | C4, -C4, C4, -C4, /* offset 24: pmaddwd with the (0,4) input pair -> C4*r0 - C4*r4 */ | |
69 | ||
70 | C2, C6, C2, C6, /* offset 32: pmaddwd with the (2,6) input pair -> C2*r2 + C6*r6 */ | |
71 | C6, -C2, C6, -C2, /* offset 40: pmaddwd with the (2,6) input pair -> C6*r2 - C2*r6 */ | |
72 | ||
73 | C1, C3, C1, C3, /* offset 48: pmaddwd with the (1,3) input pair; first half of B0 */ | |
74 | C5, C7, C5, C7, /* offset 56: pmaddwd with the (5,7) input pair; second half of B0 */ | |
75 | ||
76 | C3, -C7, C3, -C7, /* offset 64: pmaddwd with the (1,3) input pair; first half of B1 */ | |
77 | -C1, -C5, -C1, -C5, /* offset 72: pmaddwd with the (5,7) input pair; second half of B1 */ | |
78 | ||
79 | C5, -C1, C5, -C1, /* offset 80: pmaddwd with the (1,3) input pair; first half of B2 */ | |
80 | C7, C3, C7, C3, /* offset 88: pmaddwd with the (5,7) input pair; second half of B2 */ | |
81 | ||
82 | C7, -C5, C7, -C5, /* offset 96: pmaddwd with the (1,3) input pair; first half of B3 */ | |
83 | C3, -C1, C3, -C1 /* offset 104: pmaddwd with the (5,7) input pair; second half of B3 */ | |
84 | }; | |
85 | ||
86 | static inline void idct(int16_t *block) | |
87 | { | |
88 | LOCAL_ALIGNED_8(int64_t, align_tmp, [16]); | |
89 | int16_t * const temp= (int16_t*)align_tmp; | |
90 | ||
91 | __asm__ volatile( | |
92 | #if 0 //Alternative, simpler variant | |
93 | ||
94 | #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
95 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
96 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
97 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
98 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
99 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
100 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
101 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
102 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
103 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
104 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
105 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
106 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
107 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
108 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
109 | #rounder ", %%mm4 \n\t"\ | |
110 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
111 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
112 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
113 | "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
114 | "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
115 | #rounder ", %%mm0 \n\t"\ | |
116 | "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
117 | "paddd %%mm0, %%mm0 \n\t" \ | |
118 | "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
119 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
120 | "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
121 | "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
122 | "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
123 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
124 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
125 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
126 | "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
127 | "psrad $" #shift ", %%mm7 \n\t"\ | |
128 | "psrad $" #shift ", %%mm4 \n\t"\ | |
129 | "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ | |
130 | "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
131 | "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
132 | "psrad $" #shift ", %%mm1 \n\t"\ | |
133 | "psrad $" #shift ", %%mm2 \n\t"\ | |
134 | "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
135 | "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
136 | "movq %%mm7, " #dst " \n\t"\ | |
137 | "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
138 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
139 | "movq %%mm2, 24+" #dst " \n\t"\ | |
140 | "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
141 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
142 | "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
143 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
144 | "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ | |
145 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
146 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
147 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
148 | "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ | |
149 | "psrad $" #shift ", %%mm2 \n\t"\ | |
150 | "psrad $" #shift ", %%mm0 \n\t"\ | |
151 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
152 | "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
153 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
154 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
155 | "psrad $" #shift ", %%mm6 \n\t"\ | |
156 | "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
157 | "movq %%mm2, 8+" #dst " \n\t"\ | |
158 | "psrad $" #shift ", %%mm4 \n\t"\ | |
159 | "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
160 | "movq %%mm4, 16+" #dst " \n\t"\ | |
161 | ||
162 | #define COL_IDCT(src0, src4, src1, src5, dst, shift) \ | |
163 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
164 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
165 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
166 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
167 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
168 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
169 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
170 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
171 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
172 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
173 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
174 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
175 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
176 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
177 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
178 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
179 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
180 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
181 | "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
182 | "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
183 | "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
184 | "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
185 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
186 | "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ | |
187 | "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
188 | "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
189 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
190 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
191 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
192 | "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ | |
193 | "psrad $" #shift ", %%mm7 \n\t"\ | |
194 | "psrad $" #shift ", %%mm4 \n\t"\ | |
195 | "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ | |
196 | "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
197 | "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
198 | "psrad $" #shift ", %%mm0 \n\t"\ | |
199 | "psrad $" #shift ", %%mm2 \n\t"\ | |
200 | "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
201 | "movd %%mm7, " #dst " \n\t"\ | |
202 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
203 | "movd %%mm0, 16+" #dst " \n\t"\ | |
204 | "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
205 | "movd %%mm2, 96+" #dst " \n\t"\ | |
206 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
207 | "movd %%mm4, 112+" #dst " \n\t"\ | |
208 | "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ | |
209 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
210 | "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
211 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
212 | "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
213 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
214 | "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
215 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
216 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
217 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
218 | "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ | |
219 | "psrad $" #shift ", %%mm2 \n\t"\ | |
220 | "psrad $" #shift ", %%mm5 \n\t"\ | |
221 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
222 | "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ | |
223 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
224 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
225 | "psrad $" #shift ", %%mm6 \n\t"\ | |
226 | "psrad $" #shift ", %%mm4 \n\t"\ | |
227 | "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
228 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
229 | "movd %%mm2, 32+" #dst " \n\t"\ | |
230 | "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
231 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
232 | "movd %%mm6, 48+" #dst " \n\t"\ | |
233 | "movd %%mm4, 64+" #dst " \n\t"\ | |
234 | "movd %%mm5, 80+" #dst " \n\t"\ | |
235 | ||
236 | ||
237 | #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
238 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
239 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
240 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
241 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
242 | "movq "MANGLE(wm1010)", %%mm4 \n\t"\ | |
243 | "pand %%mm0, %%mm4 \n\t"\ | |
244 | "por %%mm1, %%mm4 \n\t"\ | |
245 | "por %%mm2, %%mm4 \n\t"\ | |
246 | "por %%mm3, %%mm4 \n\t"\ | |
247 | "packssdw %%mm4,%%mm4 \n\t"\ | |
248 | "movd %%mm4, %%eax \n\t"\ | |
249 | "orl %%eax, %%eax \n\t"\ | |
250 | "jz 1f \n\t"\ | |
251 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
252 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
253 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
254 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
255 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
256 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
257 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
258 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
259 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
260 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
261 | #rounder ", %%mm4 \n\t"\ | |
262 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
263 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
264 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
265 | "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
266 | "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
267 | #rounder ", %%mm0 \n\t"\ | |
268 | "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
269 | "paddd %%mm0, %%mm0 \n\t" \ | |
270 | "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
271 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
272 | "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
273 | "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
274 | "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
275 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
276 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
277 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
278 | "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
279 | "psrad $" #shift ", %%mm7 \n\t"\ | |
280 | "psrad $" #shift ", %%mm4 \n\t"\ | |
281 | "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ | |
282 | "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
283 | "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
284 | "psrad $" #shift ", %%mm1 \n\t"\ | |
285 | "psrad $" #shift ", %%mm2 \n\t"\ | |
286 | "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
287 | "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
288 | "movq %%mm7, " #dst " \n\t"\ | |
289 | "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
290 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
291 | "movq %%mm2, 24+" #dst " \n\t"\ | |
292 | "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
293 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
294 | "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
295 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
296 | "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ | |
297 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
298 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
299 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
300 | "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ | |
301 | "psrad $" #shift ", %%mm2 \n\t"\ | |
302 | "psrad $" #shift ", %%mm0 \n\t"\ | |
303 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
304 | "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
305 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
306 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
307 | "psrad $" #shift ", %%mm6 \n\t"\ | |
308 | "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
309 | "movq %%mm2, 8+" #dst " \n\t"\ | |
310 | "psrad $" #shift ", %%mm4 \n\t"\ | |
311 | "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
312 | "movq %%mm4, 16+" #dst " \n\t"\ | |
313 | "jmp 2f \n\t"\ | |
314 | "1: \n\t"\ | |
315 | "pslld $16, %%mm0 \n\t"\ | |
316 | "#paddd "MANGLE(d40000)", %%mm0 \n\t"\ | |
317 | "psrad $13, %%mm0 \n\t"\ | |
318 | "packssdw %%mm0, %%mm0 \n\t"\ | |
319 | "movq %%mm0, " #dst " \n\t"\ | |
320 | "movq %%mm0, 8+" #dst " \n\t"\ | |
321 | "movq %%mm0, 16+" #dst " \n\t"\ | |
322 | "movq %%mm0, 24+" #dst " \n\t"\ | |
323 | "2: \n\t" | |
324 | ||
325 | ||
326 | //IDCT( src0, src4, src1, src5, dst, rounder, shift) | |
327 | ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) | |
328 | /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11) | |
329 | ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11) | |
330 | ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/ | |
331 | ||
332 | DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11) | |
333 | DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11) | |
334 | DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11) | |
335 | ||
336 | ||
337 | //IDCT( src0, src4, src1, src5, dst, shift) | |
338 | COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) | |
339 | COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) | |
340 | COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) | |
341 | COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) | |
342 | ||
343 | #else | |
344 | ||
345 | #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
346 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
347 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
348 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
349 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
350 | "movq "MANGLE(wm1010)", %%mm4 \n\t"\ | |
351 | "pand %%mm0, %%mm4 \n\t"\ | |
352 | "por %%mm1, %%mm4 \n\t"\ | |
353 | "por %%mm2, %%mm4 \n\t"\ | |
354 | "por %%mm3, %%mm4 \n\t"\ | |
355 | "packssdw %%mm4,%%mm4 \n\t"\ | |
356 | "movd %%mm4, %%eax \n\t"\ | |
357 | "orl %%eax, %%eax \n\t"\ | |
358 | "jz 1f \n\t"\ | |
359 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
360 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
361 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
362 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
363 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
364 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
365 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
366 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
367 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
368 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
369 | #rounder ", %%mm4 \n\t"\ | |
370 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
371 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
372 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
373 | "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
374 | "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
375 | #rounder ", %%mm0 \n\t"\ | |
376 | "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
377 | "paddd %%mm0, %%mm0 \n\t" \ | |
378 | "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
379 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
380 | "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
381 | "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
382 | "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
383 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
384 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
385 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
386 | "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
387 | "psrad $" #shift ", %%mm7 \n\t"\ | |
388 | "psrad $" #shift ", %%mm4 \n\t"\ | |
389 | "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ | |
390 | "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
391 | "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
392 | "psrad $" #shift ", %%mm1 \n\t"\ | |
393 | "psrad $" #shift ", %%mm2 \n\t"\ | |
394 | "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
395 | "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
396 | "movq %%mm7, " #dst " \n\t"\ | |
397 | "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
398 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
399 | "movq %%mm2, 24+" #dst " \n\t"\ | |
400 | "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
401 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
402 | "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
403 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
404 | "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ | |
405 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
406 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
407 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
408 | "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ | |
409 | "psrad $" #shift ", %%mm2 \n\t"\ | |
410 | "psrad $" #shift ", %%mm0 \n\t"\ | |
411 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
412 | "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
413 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
414 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
415 | "psrad $" #shift ", %%mm6 \n\t"\ | |
416 | "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
417 | "movq %%mm2, 8+" #dst " \n\t"\ | |
418 | "psrad $" #shift ", %%mm4 \n\t"\ | |
419 | "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
420 | "movq %%mm4, 16+" #dst " \n\t"\ | |
421 | "jmp 2f \n\t"\ | |
422 | "1: \n\t"\ | |
423 | "pslld $16, %%mm0 \n\t"\ | |
424 | "paddd "MANGLE(d40000)", %%mm0 \n\t"\ | |
425 | "psrad $13, %%mm0 \n\t"\ | |
426 | "packssdw %%mm0, %%mm0 \n\t"\ | |
427 | "movq %%mm0, " #dst " \n\t"\ | |
428 | "movq %%mm0, 8+" #dst " \n\t"\ | |
429 | "movq %%mm0, 16+" #dst " \n\t"\ | |
430 | "movq %%mm0, 24+" #dst " \n\t"\ | |
431 | "2: \n\t" | |
432 | ||
433 | #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \ | |
434 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
435 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
436 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
437 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
438 | "movq %%mm0, %%mm4 \n\t"\ | |
439 | "por %%mm1, %%mm4 \n\t"\ | |
440 | "por %%mm2, %%mm4 \n\t"\ | |
441 | "por %%mm3, %%mm4 \n\t"\ | |
442 | "packssdw %%mm4,%%mm4 \n\t"\ | |
443 | "movd %%mm4, %%eax \n\t"\ | |
444 | "orl %%eax, %%eax \n\t"\ | |
445 | "jz " #bt " \n\t"\ | |
446 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
447 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
448 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
449 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
450 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
451 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
452 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
453 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
454 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
455 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
456 | #rounder ", %%mm4 \n\t"\ | |
457 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
458 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
459 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
460 | "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
461 | "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
462 | #rounder ", %%mm0 \n\t"\ | |
463 | "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
464 | "paddd %%mm0, %%mm0 \n\t" \ | |
465 | "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
466 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
467 | "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
468 | "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
469 | "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
470 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
471 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
472 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
473 | "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
474 | "psrad $" #shift ", %%mm7 \n\t"\ | |
475 | "psrad $" #shift ", %%mm4 \n\t"\ | |
476 | "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ | |
477 | "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
478 | "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
479 | "psrad $" #shift ", %%mm1 \n\t"\ | |
480 | "psrad $" #shift ", %%mm2 \n\t"\ | |
481 | "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
482 | "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
483 | "movq %%mm7, " #dst " \n\t"\ | |
484 | "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
485 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
486 | "movq %%mm2, 24+" #dst " \n\t"\ | |
487 | "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
488 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
489 | "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
490 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
491 | "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ | |
492 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
493 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
494 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
495 | "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ | |
496 | "psrad $" #shift ", %%mm2 \n\t"\ | |
497 | "psrad $" #shift ", %%mm0 \n\t"\ | |
498 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
499 | "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
500 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
501 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
502 | "psrad $" #shift ", %%mm6 \n\t"\ | |
503 | "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
504 | "movq %%mm2, 8+" #dst " \n\t"\ | |
505 | "psrad $" #shift ", %%mm4 \n\t"\ | |
506 | "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
507 | "movq %%mm4, 16+" #dst " \n\t"\ | |
508 | ||
509 | #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
510 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
511 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
512 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
513 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
514 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
515 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
516 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
517 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
518 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
519 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
520 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
521 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
522 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
523 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
524 | #rounder ", %%mm4 \n\t"\ | |
525 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
526 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
527 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
528 | "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
529 | "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
530 | #rounder ", %%mm0 \n\t"\ | |
531 | "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
532 | "paddd %%mm0, %%mm0 \n\t" \ | |
533 | "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
534 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
535 | "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
536 | "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
537 | "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
538 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
539 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
540 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
541 | "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
542 | "psrad $" #shift ", %%mm7 \n\t"\ | |
543 | "psrad $" #shift ", %%mm4 \n\t"\ | |
544 | "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ | |
545 | "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
546 | "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
547 | "psrad $" #shift ", %%mm1 \n\t"\ | |
548 | "psrad $" #shift ", %%mm2 \n\t"\ | |
549 | "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
550 | "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
551 | "movq %%mm7, " #dst " \n\t"\ | |
552 | "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
553 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
554 | "movq %%mm2, 24+" #dst " \n\t"\ | |
555 | "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
556 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
557 | "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
558 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
559 | "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ | |
560 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
561 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
562 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
563 | "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ | |
564 | "psrad $" #shift ", %%mm2 \n\t"\ | |
565 | "psrad $" #shift ", %%mm0 \n\t"\ | |
566 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
567 | "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
568 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
569 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
570 | "psrad $" #shift ", %%mm6 \n\t"\ | |
571 | "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
572 | "movq %%mm2, 8+" #dst " \n\t"\ | |
573 | "psrad $" #shift ", %%mm4 \n\t"\ | |
574 | "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
575 | "movq %%mm4, 16+" #dst " \n\t"\ | |
576 | ||
577 | //IDCT( src0, src4, src1, src5, dst, rounder, shift) | |
578 | DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) | |
579 | Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f) | |
580 | Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f) | |
581 | Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f) | |
582 | ||
583 | #undef IDCT | |
584 | #define IDCT(src0, src4, src1, src5, dst, shift) \ | |
585 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
586 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
587 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
588 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
589 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
590 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
591 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
592 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
593 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
594 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
595 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
596 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
597 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
598 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
599 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
600 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
601 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
602 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
603 | "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
604 | "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
605 | "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
606 | "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
607 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
608 | "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ | |
609 | "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
610 | "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
611 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
612 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
613 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
614 | "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ | |
615 | "psrad $" #shift ", %%mm7 \n\t"\ | |
616 | "psrad $" #shift ", %%mm4 \n\t"\ | |
617 | "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ | |
618 | "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
619 | "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
620 | "psrad $" #shift ", %%mm0 \n\t"\ | |
621 | "psrad $" #shift ", %%mm2 \n\t"\ | |
622 | "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
623 | "movd %%mm7, " #dst " \n\t"\ | |
624 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
625 | "movd %%mm0, 16+" #dst " \n\t"\ | |
626 | "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
627 | "movd %%mm2, 96+" #dst " \n\t"\ | |
628 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
629 | "movd %%mm4, 112+" #dst " \n\t"\ | |
630 | "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ | |
631 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
632 | "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
633 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
634 | "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
635 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
636 | "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
637 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
638 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
639 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
640 | "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ | |
641 | "psrad $" #shift ", %%mm2 \n\t"\ | |
642 | "psrad $" #shift ", %%mm5 \n\t"\ | |
643 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
644 | "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ | |
645 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
646 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
647 | "psrad $" #shift ", %%mm6 \n\t"\ | |
648 | "psrad $" #shift ", %%mm4 \n\t"\ | |
649 | "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
650 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
651 | "movd %%mm2, 32+" #dst " \n\t"\ | |
652 | "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
653 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
654 | "movd %%mm6, 48+" #dst " \n\t"\ | |
655 | "movd %%mm4, 64+" #dst " \n\t"\ | |
656 | "movd %%mm5, 80+" #dst " \n\t" | |
657 | ||
658 | ||
659 | //IDCT( src0, src4, src1, src5, dst, shift) | |
660 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) | |
661 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) | |
662 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) | |
663 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) | |
664 | "jmp 9f \n\t" | |
665 | ||
666 | "# .p2align 4 \n\t"\ | |
667 | "4: \n\t" | |
668 | Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f) | |
669 | Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f) | |
670 | ||
671 | #undef IDCT | |
672 | #define IDCT(src0, src4, src1, src5, dst, shift) \ | |
673 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
674 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
675 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
676 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
677 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
678 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
679 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
680 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
681 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
682 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
683 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
684 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
685 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
686 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
687 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
688 | "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
689 | "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
690 | "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
691 | "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
692 | "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
693 | "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
694 | "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ | |
695 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
696 | "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
697 | "psrad $" #shift ", %%mm1 \n\t"\ | |
698 | "psrad $" #shift ", %%mm4 \n\t"\ | |
699 | "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ | |
700 | "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
701 | "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
702 | "psrad $" #shift ", %%mm0 \n\t"\ | |
703 | "psrad $" #shift ", %%mm2 \n\t"\ | |
704 | "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ | |
705 | "movd %%mm1, " #dst " \n\t"\ | |
706 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
707 | "movd %%mm0, 16+" #dst " \n\t"\ | |
708 | "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
709 | "movd %%mm2, 96+" #dst " \n\t"\ | |
710 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
711 | "movd %%mm4, 112+" #dst " \n\t"\ | |
712 | "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ | |
713 | "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
714 | "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
715 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
716 | "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
717 | "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\ | |
718 | "psrad $" #shift ", %%mm2 \n\t"\ | |
719 | "psrad $" #shift ", %%mm5 \n\t"\ | |
720 | "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ | |
721 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
722 | "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\ | |
723 | "psrad $" #shift ", %%mm6 \n\t"\ | |
724 | "psrad $" #shift ", %%mm1 \n\t"\ | |
725 | "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
726 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
727 | "movd %%mm2, 32+" #dst " \n\t"\ | |
728 | "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ | |
729 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
730 | "movd %%mm6, 48+" #dst " \n\t"\ | |
731 | "movd %%mm1, 64+" #dst " \n\t"\ | |
732 | "movd %%mm5, 80+" #dst " \n\t" | |
733 | ||
734 | //IDCT( src0, src4, src1, src5, dst, shift) | |
735 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) | |
736 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) | |
737 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) | |
738 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) | |
739 | "jmp 9f \n\t" | |
740 | ||
741 | "# .p2align 4 \n\t"\ | |
742 | "6: \n\t" | |
743 | Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f) | |
744 | ||
745 | #undef IDCT | |
746 | #define IDCT(src0, src4, src1, src5, dst, shift) \ | |
747 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
748 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
749 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
750 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
751 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
752 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
753 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
754 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
755 | "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
756 | "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
757 | "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
758 | "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
759 | "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ | |
760 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
761 | "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
762 | "psrad $" #shift ", %%mm1 \n\t"\ | |
763 | "psrad $" #shift ", %%mm4 \n\t"\ | |
764 | "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ | |
765 | "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
766 | "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
767 | "psrad $" #shift ", %%mm0 \n\t"\ | |
768 | "psrad $" #shift ", %%mm2 \n\t"\ | |
769 | "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ | |
770 | "movd %%mm1, " #dst " \n\t"\ | |
771 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
772 | "movd %%mm0, 16+" #dst " \n\t"\ | |
773 | "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
774 | "movd %%mm2, 96+" #dst " \n\t"\ | |
775 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
776 | "movd %%mm4, 112+" #dst " \n\t"\ | |
777 | "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ | |
778 | "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
779 | "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
780 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
781 | "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
782 | "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\ | |
783 | "psrad $" #shift ", %%mm2 \n\t"\ | |
784 | "psrad $" #shift ", %%mm5 \n\t"\ | |
785 | "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ | |
786 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
787 | "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\ | |
788 | "psrad $" #shift ", %%mm6 \n\t"\ | |
789 | "psrad $" #shift ", %%mm1 \n\t"\ | |
790 | "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
791 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
792 | "movd %%mm2, 32+" #dst " \n\t"\ | |
793 | "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ | |
794 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
795 | "movd %%mm6, 48+" #dst " \n\t"\ | |
796 | "movd %%mm1, 64+" #dst " \n\t"\ | |
797 | "movd %%mm5, 80+" #dst " \n\t" | |
798 | ||
799 | ||
800 | //IDCT( src0, src4, src1, src5, dst, shift) | |
801 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) | |
802 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) | |
803 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) | |
804 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) | |
805 | "jmp 9f \n\t" | |
806 | ||
807 | "# .p2align 4 \n\t"\ | |
808 | "2: \n\t" | |
809 | Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f) | |
810 | ||
811 | #undef IDCT | |
812 | #define IDCT(src0, src4, src1, src5, dst, shift) \ | |
813 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
814 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
815 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
816 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
817 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
818 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
819 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
820 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
821 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
822 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
823 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
824 | "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
825 | "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
826 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
827 | "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ | |
828 | "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
829 | "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
830 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
831 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
832 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
833 | "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ | |
834 | "psrad $" #shift ", %%mm7 \n\t"\ | |
835 | "psrad $" #shift ", %%mm4 \n\t"\ | |
836 | "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ | |
837 | "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
838 | "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
839 | "psrad $" #shift ", %%mm0 \n\t"\ | |
840 | "psrad $" #shift ", %%mm2 \n\t"\ | |
841 | "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
842 | "movd %%mm7, " #dst " \n\t"\ | |
843 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
844 | "movd %%mm0, 16+" #dst " \n\t"\ | |
845 | "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
846 | "movd %%mm2, 96+" #dst " \n\t"\ | |
847 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
848 | "movd %%mm4, 112+" #dst " \n\t"\ | |
849 | "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ | |
850 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
851 | "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
852 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
853 | "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
854 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
855 | "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
856 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
857 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
858 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
859 | "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ | |
860 | "psrad $" #shift ", %%mm2 \n\t"\ | |
861 | "psrad $" #shift ", %%mm5 \n\t"\ | |
862 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
863 | "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ | |
864 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
865 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
866 | "psrad $" #shift ", %%mm6 \n\t"\ | |
867 | "psrad $" #shift ", %%mm4 \n\t"\ | |
868 | "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
869 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
870 | "movd %%mm2, 32+" #dst " \n\t"\ | |
871 | "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
872 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
873 | "movd %%mm6, 48+" #dst " \n\t"\ | |
874 | "movd %%mm4, 64+" #dst " \n\t"\ | |
875 | "movd %%mm5, 80+" #dst " \n\t" | |
876 | ||
877 | //IDCT( src0, src4, src1, src5, dst, shift) | |
878 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) | |
879 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) | |
880 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) | |
881 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) | |
882 | "jmp 9f \n\t" | |
883 | ||
884 | "# .p2align 4 \n\t"\ | |
885 | "3: \n\t" | |
886 | #undef IDCT | |
887 | #define IDCT(src0, src4, src1, src5, dst, shift) \ | |
888 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
889 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
890 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
891 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
892 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
893 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
894 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
895 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
896 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
897 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
898 | "movq 64(%2), %%mm3 \n\t"\ | |
899 | "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
900 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
901 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
902 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
903 | "psrad $" #shift ", %%mm7 \n\t"\ | |
904 | "psrad $" #shift ", %%mm4 \n\t"\ | |
905 | "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
906 | "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
907 | "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\ | |
908 | "psrad $" #shift ", %%mm0 \n\t"\ | |
909 | "psrad $" #shift ", %%mm1 \n\t"\ | |
910 | "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
911 | "movd %%mm7, " #dst " \n\t"\ | |
912 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
913 | "movd %%mm0, 16+" #dst " \n\t"\ | |
914 | "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\ | |
915 | "movd %%mm1, 96+" #dst " \n\t"\ | |
916 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
917 | "movd %%mm4, 112+" #dst " \n\t"\ | |
918 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
919 | "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
920 | "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
921 | "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\ | |
922 | "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\ | |
923 | "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ | |
924 | "psrad $" #shift ", %%mm1 \n\t"\ | |
925 | "psrad $" #shift ", %%mm5 \n\t"\ | |
926 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
927 | "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
928 | "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
929 | "psrad $" #shift ", %%mm6 \n\t"\ | |
930 | "psrad $" #shift ", %%mm4 \n\t"\ | |
931 | "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\ | |
932 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
933 | "movd %%mm1, 32+" #dst " \n\t"\ | |
934 | "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
935 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
936 | "movd %%mm6, 48+" #dst " \n\t"\ | |
937 | "movd %%mm4, 64+" #dst " \n\t"\ | |
938 | "movd %%mm5, 80+" #dst " \n\t" | |
939 | ||
940 | ||
941 | //IDCT( src0, src4, src1, src5, dst, shift) | |
942 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) | |
943 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) | |
944 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) | |
945 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) | |
946 | "jmp 9f \n\t" | |
947 | ||
948 | "# .p2align 4 \n\t"\ | |
949 | "5: \n\t" | |
950 | #undef IDCT | |
951 | #define IDCT(src0, src4, src1, src5, dst, shift) \ | |
952 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
953 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
954 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
955 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
956 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
957 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
958 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
959 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
960 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
961 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
962 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
963 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
964 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
965 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
966 | "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
967 | "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
968 | "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ | |
969 | "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\ | |
970 | "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ | |
971 | "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
972 | "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ | |
973 | "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
974 | "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ | |
975 | "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
976 | "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
977 | "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\ | |
978 | "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\ | |
979 | "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\ | |
980 | "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\ | |
981 | "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\ | |
982 | "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\ | |
983 | "psrad $" #shift ", %%mm4 \n\t"\ | |
984 | "psrad $" #shift ", %%mm7 \n\t"\ | |
985 | "psrad $" #shift ", %%mm3 \n\t"\ | |
986 | "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\ | |
987 | "movq %%mm4, " #dst " \n\t"\ | |
988 | "psrad $" #shift ", %%mm0 \n\t"\ | |
989 | "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\ | |
990 | "movq %%mm0, 16+" #dst " \n\t"\ | |
991 | "movq %%mm0, 96+" #dst " \n\t"\ | |
992 | "movq %%mm4, 112+" #dst " \n\t"\ | |
993 | "psrad $" #shift ", %%mm5 \n\t"\ | |
994 | "psrad $" #shift ", %%mm6 \n\t"\ | |
995 | "psrad $" #shift ", %%mm2 \n\t"\ | |
996 | "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
997 | "movq %%mm5, 32+" #dst " \n\t"\ | |
998 | "psrad $" #shift ", %%mm1 \n\t"\ | |
999 | "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1000 | "movq %%mm6, 48+" #dst " \n\t"\ | |
1001 | "movq %%mm6, 64+" #dst " \n\t"\ | |
1002 | "movq %%mm5, 80+" #dst " \n\t" | |
1003 | ||
1004 | ||
1005 | //IDCT( src0, src4, src1, src5, dst, shift) | |
1006 | IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) | |
1007 | //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) | |
1008 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) | |
1009 | //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) | |
1010 | "jmp 9f \n\t" | |
1011 | ||
1012 | ||
1013 | "# .p2align 4 \n\t"\ | |
1014 | "1: \n\t" | |
1015 | #undef IDCT | |
1016 | #define IDCT(src0, src4, src1, src5, dst, shift) \ | |
1017 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
1018 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
1019 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
1020 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
1021 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1022 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
1023 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1024 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
1025 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
1026 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
1027 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
1028 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1029 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
1030 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
1031 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
1032 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
1033 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1034 | "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
1035 | "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
1036 | "movq 64(%2), %%mm1 \n\t"\ | |
1037 | "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
1038 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
1039 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
1040 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
1041 | "psrad $" #shift ", %%mm7 \n\t"\ | |
1042 | "psrad $" #shift ", %%mm4 \n\t"\ | |
1043 | "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\ | |
1044 | "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
1045 | "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\ | |
1046 | "psrad $" #shift ", %%mm0 \n\t"\ | |
1047 | "psrad $" #shift ", %%mm3 \n\t"\ | |
1048 | "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
1049 | "movd %%mm7, " #dst " \n\t"\ | |
1050 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
1051 | "movd %%mm0, 16+" #dst " \n\t"\ | |
1052 | "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\ | |
1053 | "movd %%mm3, 96+" #dst " \n\t"\ | |
1054 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
1055 | "movd %%mm4, 112+" #dst " \n\t"\ | |
1056 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
1057 | "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
1058 | "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
1059 | "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\ | |
1060 | "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\ | |
1061 | "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ | |
1062 | "psrad $" #shift ", %%mm3 \n\t"\ | |
1063 | "psrad $" #shift ", %%mm5 \n\t"\ | |
1064 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
1065 | "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1066 | "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
1067 | "psrad $" #shift ", %%mm6 \n\t"\ | |
1068 | "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\ | |
1069 | "movd %%mm3, 32+" #dst " \n\t"\ | |
1070 | "psrad $" #shift ", %%mm4 \n\t"\ | |
1071 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1072 | "movd %%mm6, 48+" #dst " \n\t"\ | |
1073 | "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
1074 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
1075 | "movd %%mm4, 64+" #dst " \n\t"\ | |
1076 | "movd %%mm5, 80+" #dst " \n\t" | |
1077 | ||
1078 | ||
1079 | //IDCT( src0, src4, src1, src5, dst, shift) | |
1080 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) | |
1081 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) | |
1082 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) | |
1083 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) | |
1084 | "jmp 9f \n\t" | |
1085 | ||
1086 | ||
1087 | "# .p2align 4 \n\t" | |
1088 | "7: \n\t" | |
1089 | #undef IDCT | |
1090 | #define IDCT(src0, src4, src1, src5, dst, shift) \ | |
1091 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
1092 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
1093 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1094 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
1095 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1096 | "psrad $" #shift ", %%mm4 \n\t"\ | |
1097 | "psrad $" #shift ", %%mm0 \n\t"\ | |
1098 | "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ | |
1099 | "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ | |
1100 | "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1101 | "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ | |
1102 | "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1103 | "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ | |
1104 | "psrad $" #shift ", %%mm1 \n\t"\ | |
1105 | "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\ | |
1106 | "movq %%mm4, " #dst " \n\t"\ | |
1107 | "psrad $" #shift ", %%mm2 \n\t"\ | |
1108 | "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\ | |
1109 | "movq %%mm0, 16+" #dst " \n\t"\ | |
1110 | "movq %%mm0, 96+" #dst " \n\t"\ | |
1111 | "movq %%mm4, 112+" #dst " \n\t"\ | |
1112 | "movq %%mm0, 32+" #dst " \n\t"\ | |
1113 | "movq %%mm4, 48+" #dst " \n\t"\ | |
1114 | "movq %%mm4, 64+" #dst " \n\t"\ | |
1115 | "movq %%mm0, 80+" #dst " \n\t" | |
1116 | ||
1117 | //IDCT( src0, src4, src1, src5, dst, shift) | |
1118 | IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) | |
1119 | //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) | |
1120 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) | |
1121 | //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) | |
1122 | ||
1123 | ||
1124 | #endif | |
1125 | ||
1126 | /* | |
1127 | Input | |
1128 | 00 40 04 44 20 60 24 64 | |
1129 | 10 30 14 34 50 70 54 74 | |
1130 | 01 41 03 43 21 61 23 63 | |
1131 | 11 31 13 33 51 71 53 73 | |
1132 | 02 42 06 46 22 62 26 66 | |
1133 | 12 32 16 36 52 72 56 76 | |
1134 | 05 45 07 47 25 65 27 67 | |
1135 | 15 35 17 37 55 75 57 77 | |
1136 | ||
1137 | Temp | |
1138 | 00 04 10 14 20 24 30 34 | |
1139 | 40 44 50 54 60 64 70 74 | |
1140 | 01 03 11 13 21 23 31 33 | |
1141 | 41 43 51 53 61 63 71 73 | |
1142 | 02 06 12 16 22 26 32 36 | |
1143 | 42 46 52 56 62 66 72 76 | |
1144 | 05 07 15 17 25 27 35 37 | |
1145 | 45 47 55 57 65 67 75 77 | |
1146 | */ | |
1147 | ||
1148 | "9: \n\t" | |
1149 | :: "r" (block), "r" (temp), "r" (coeffs) | |
1150 | NAMED_CONSTRAINTS_ADD(wm1010,d40000) | |
1151 | : "%eax" | |
1152 | ); | |
1153 | } | |
1154 | ||
/**
 * Exported entry point for the MMX simple IDCT; it only forwards to idct().
 *
 * In the visible part of idct(), the final pass stores its results through
 * the asm operand bound to block, so the input coefficients are replaced by
 * the transform output in the same buffer.
 *
 * @param block coefficient buffer passed straight to idct() and overwritten
 *              with the result.
 *              NOTE(review): presumably a 64-entry (8x8) block whose element
 *              order matches the "Input" table in idct() - confirm.
 *
 * NOTE(review): this wrapper issues no emms after the MMX code; presumably
 * the caller is responsible for that - confirm.
 */
1155 | void ff_simple_idct_mmx(int16_t *block) | |
1156 | { | |
1157 | idct(block); | |
1158 | } | |
1159 | ||
1160 | //FIXME merge add/put into the idct | |
1161 | ||
/**
 * Run idct() on block, then pass the transformed block to
 * ff_put_pixels_clamped() together with dest and line_size.
 *
 * @param dest      destination pointer handed to ff_put_pixels_clamped().
 * @param line_size forwarded unchanged to ff_put_pixels_clamped().
 *                  NOTE(review): presumably the byte stride between rows of
 *                  dest - confirm against the helper's declaration.
 * @param block     coefficient buffer; idct() overwrites it with its output
 *                  before the helper reads it.
 *
 * NOTE(review): going by its name, the helper presumably stores the
 * transformed samples into dest with clamping (overwriting what was there);
 * it is defined outside this file - confirm.
 */
1162 | void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block) | |
1163 | { | |
/* The transform must finish first: the helper consumes idct()'s output. */
1164 | idct(block); | |
1165 | ff_put_pixels_clamped(block, dest, line_size); | |
1166 | } | |
/**
 * Run idct() on block, then pass the transformed block to
 * ff_add_pixels_clamped() together with dest and line_size.
 *
 * @param dest      destination pointer handed to ff_add_pixels_clamped().
 * @param line_size forwarded unchanged to ff_add_pixels_clamped().
 *                  NOTE(review): presumably the byte stride between rows of
 *                  dest - confirm against the helper's declaration.
 * @param block     coefficient buffer; idct() overwrites it with its output
 *                  before the helper reads it.
 *
 * NOTE(review): going by its name, the helper presumably adds the
 * transformed samples to what is already in dest and clamps the sum; it is
 * defined outside this file - confirm.
 */
1167 | void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block) | |
1168 | { | |
/* The transform must finish first: the helper consumes idct()'s output. */
1169 | idct(block); | |
1170 | ff_add_pixels_clamped(block, dest, line_size); | |
1171 | } | |
1172 | ||
1173 | #endif /* HAVE_INLINE_ASM */ |