Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * SIMD-optimized forward DCT | |
3 | * The gcc porting is Copyright (c) 2001 Fabrice Bellard. | |
4 | * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> | |
5 | * SSE2 optimization is Copyright (c) 2004 Denes Balatoni. | |
6 | * | |
7 | * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT | |
8 | * | |
9 | * Intel Application Note AP-922 - fast, precise implementation of DCT | |
10 | * http://developer.intel.com/vtune/cbts/appnotes.htm | |
11 | * | |
12 | * Also of inspiration: | |
13 | * a page about fdct at http://www.geocities.com/ssavekar/dct.htm | |
14 | * Skal's fdct at http://skal.planet-d.net/coding/dct.html | |
15 | * | |
16 | * This file is part of FFmpeg. | |
17 | * | |
18 | * FFmpeg is free software; you can redistribute it and/or | |
19 | * modify it under the terms of the GNU Lesser General Public | |
20 | * License as published by the Free Software Foundation; either | |
21 | * version 2.1 of the License, or (at your option) any later version. | |
22 | * | |
23 | * FFmpeg is distributed in the hope that it will be useful, | |
24 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
26 | * Lesser General Public License for more details. | |
27 | * | |
28 | * You should have received a copy of the GNU Lesser General Public | |
29 | * License along with FFmpeg; if not, write to the Free Software | |
30 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
31 | */ | |
32 | ||
33 | #include "libavutil/common.h" | |
34 | #include "libavutil/x86/asm.h" | |
35 | #include "fdct.h" | |
36 | ||
37 | #if HAVE_MMX_INLINE | |
38 | ||
39 | ////////////////////////////////////////////////////////////////////// | |
40 | // | |
41 | // constants for the forward DCT | |
42 | // ----------------------------- | |
43 | // | |
44 | // Be sure to check that your compiler is aligning all constants to QWORD | |
45 | // (8-byte) memory boundaries! Otherwise the unaligned memory access will | |
46 | // severely stall MMX execution. | |
47 | // | |
48 | ////////////////////////////////////////////////////////////////////// | |
49 | ||
/*
 * Fixed-point scaling of the two DCT passes.
 *   BITS_FRW_ACC  - number of extra precision bits carried through the transform.
 *   SHIFT_FRW_COL - left shift applied to the column-pass inputs (psllw in FDCT_COL).
 *   SHIFT_FRW_ROW - arithmetic right shift applied to the 32-bit row-pass sums
 *                   (psrad); evaluates to 17 while BITS_FRW_ACC is 3.
 *   RND_FRW_ROW   - half of 1 << SHIFT_FRW_ROW, added before that shift so the
 *                   row results are rounded instead of truncated.
 * RND_FRW_COL is present only as a disabled definition.
 */
50 | #define BITS_FRW_ACC 3 //; 2 or 3 for accuracy | |
51 | #define SHIFT_FRW_COL BITS_FRW_ACC | |
52 | #define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3) | |
53 | #define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) | |
54 | //#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) | |
55 | ||
/* Expands to eight comma-separated copies of x: one initializer per int16_t
 * lane of a 16-byte vector constant. */
56 | #define X8(x) x,x,x,x,x,x,x,x | |
57 | ||
/*
 * Tangent constants for the column pass, eight identical int16_t lanes each.
 * FDCT_COL addresses the three groups at byte offsets 0, 16 and 32 and
 * multiplies with pmulhw, which keeps the high 16 bits of the signed product,
 * i.e. multiplies by value / (1<<16).
 * tan(3*pi/16) * (1<<16) does not fit in int16_t, so the third group stores
 * that value minus (1<<16); FDCT_COL adds the multiplicand back with paddsw
 * after the pmulhw to compensate.
 */
58 | //concatenated table, for forward DCT transformation | |
59 | DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = { | |
60 | X8(13036), // tan(1*pi/16) * (1<<16) + 0.5 | |
61 | X8(27146), // tan(2*pi/16) * (1<<16) + 0.5 | |
62 | X8(-21746) // (tan(3*pi/16) - 1) * (1<<16) + 0.5 | |
63 | }; | |
64 | ||
/*
 * cos(pi/4) in eight identical int16_t lanes, scaled by 1<<15.
 * FDCT_COL multiplies it with pmulhw against operands that were shifted left
 * by SHIFT_FRW_COL+1 instead of SHIFT_FRW_COL; the extra bit makes up for the
 * 1<<15 (rather than 1<<16) scale of this constant.
 */
65 | DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = { | |
66 | X8(23170) //cos(pi/4) * (1<<15) + 0.5 | |
67 | }; | |
68 | ||
/* Eight int16_t lanes holding 1. FDCT_COL ORs this (por) into several of its
 * results, forcing their least-significant bit to 1.
 * NOTE(review): presumably a bias correction for the truncating pmulhw
 * multiplies - confirm against Intel AP-922. */
69 | DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) }; | |
70 | ||
/* Row-pass rounding constant in two int32_t lanes (one MMX register).
 * fdct_row_mmx and fdct_row_mmxext add it to every 32-bit sum before the
 * final psrad by SHIFT_FRW_ROW. */
71 | DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW }; | |
72 | ||
/*
 * Row-pass rounding constant in four int32_t lanes (one XMM register).
 * fdct_row_sse2 loads it once with movdqa and keeps it in xmm6.
 * The array is declared as the only member of a struct and therefore has to
 * be referenced as fdct_r_row_sse2.fdct_r_row_sse2.
 * NOTE(review): the struct wrapper presumably exists to get the 16-byte
 * alignment honoured by some compilers; the disabled line below is the plain
 * array form - confirm before simplifying.
 */
73 | static const struct | |
74 | { | |
75 | DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4]; | |
76 | } fdct_r_row_sse2 = | |
77 | {{ | |
78 | RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW | |
79 | }}; | |
80 | //DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW}; | |
81 | ||
/*
 * Row-pass coefficient table for the MMX and MMXEXT code paths.
 * It holds eight groups of 32 int16_t (64 bytes), one group per block row:
 * ff_fdct_mmx and ff_fdct_mmxext advance the table pointer by 32 entries
 * after each row. fdct_row_mmx / fdct_row_mmxext read a group as eight
 * 8-byte operands of pmaddwd (byte offsets 0..56), so the order of the
 * entries inside a group must not be changed.
 * The groups for rows 0/4, 1/7, 2/6 and 3/5 are identical.
 */
82 | DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct coeff table | |
/* row 0 */
83 | 16384, 16384, 22725, 19266, | |
84 | 16384, 16384, 12873, 4520, | |
85 | 21407, 8867, 19266, -4520, | |
86 | -8867, -21407, -22725, -12873, | |
87 | 16384, -16384, 12873, -22725, | |
88 | -16384, 16384, 4520, 19266, | |
89 | 8867, -21407, 4520, -12873, | |
90 | 21407, -8867, 19266, -22725, | |
91 | ||
/* row 1 */
92 | 22725, 22725, 31521, 26722, | |
93 | 22725, 22725, 17855, 6270, | |
94 | 29692, 12299, 26722, -6270, | |
95 | -12299, -29692, -31521, -17855, | |
96 | 22725, -22725, 17855, -31521, | |
97 | -22725, 22725, 6270, 26722, | |
98 | 12299, -29692, 6270, -17855, | |
99 | 29692, -12299, 26722, -31521, | |
100 | ||
/* row 2 */
101 | 21407, 21407, 29692, 25172, | |
102 | 21407, 21407, 16819, 5906, | |
103 | 27969, 11585, 25172, -5906, | |
104 | -11585, -27969, -29692, -16819, | |
105 | 21407, -21407, 16819, -29692, | |
106 | -21407, 21407, 5906, 25172, | |
107 | 11585, -27969, 5906, -16819, | |
108 | 27969, -11585, 25172, -29692, | |
109 | ||
/* row 3 */
110 | 19266, 19266, 26722, 22654, | |
111 | 19266, 19266, 15137, 5315, | |
112 | 25172, 10426, 22654, -5315, | |
113 | -10426, -25172, -26722, -15137, | |
114 | 19266, -19266, 15137, -26722, | |
115 | -19266, 19266, 5315, 22654, | |
116 | 10426, -25172, 5315, -15137, | |
117 | 25172, -10426, 22654, -26722, | |
118 | ||
/* row 4 (same values as row 0) */
119 | 16384, 16384, 22725, 19266, | |
120 | 16384, 16384, 12873, 4520, | |
121 | 21407, 8867, 19266, -4520, | |
122 | -8867, -21407, -22725, -12873, | |
123 | 16384, -16384, 12873, -22725, | |
124 | -16384, 16384, 4520, 19266, | |
125 | 8867, -21407, 4520, -12873, | |
126 | 21407, -8867, 19266, -22725, | |
127 | ||
/* row 5 (same values as row 3) */
128 | 19266, 19266, 26722, 22654, | |
129 | 19266, 19266, 15137, 5315, | |
130 | 25172, 10426, 22654, -5315, | |
131 | -10426, -25172, -26722, -15137, | |
132 | 19266, -19266, 15137, -26722, | |
133 | -19266, 19266, 5315, 22654, | |
134 | 10426, -25172, 5315, -15137, | |
135 | 25172, -10426, 22654, -26722, | |
136 | ||
/* row 6 (same values as row 2) */
137 | 21407, 21407, 29692, 25172, | |
138 | 21407, 21407, 16819, 5906, | |
139 | 27969, 11585, 25172, -5906, | |
140 | -11585, -27969, -29692, -16819, | |
141 | 21407, -21407, 16819, -29692, | |
142 | -21407, 21407, 5906, 25172, | |
143 | 11585, -27969, 5906, -16819, | |
144 | 27969, -11585, 25172, -29692, | |
145 | ||
/* row 7 (same values as row 1) */
146 | 22725, 22725, 31521, 26722, | |
147 | 22725, 22725, 17855, 6270, | |
148 | 29692, 12299, 26722, -6270, | |
149 | -12299, -29692, -31521, -17855, | |
150 | 22725, -22725, 17855, -31521, | |
151 | -22725, 22725, 6270, 26722, | |
152 | 12299, -29692, 6270, -17855, | |
153 | 29692, -12299, 26722, -31521, | |
154 | }; | |
155 | ||
/*
 * Row-pass coefficient table for the SSE2 code path: 256 int16_t built from
 * eight expansions of TABLE_SSE2, each contributing 32 entries (64 bytes).
 * TABLE_SSE2 fixes the lane order expected by the pmaddwd sequence in
 * fdct_row_sse2; C1..C7 are redefined before every expansion to select the
 * coefficient set. TABLE_SSE2 ends with a trailing comma, so consecutive
 * expansions concatenate into one initializer list.
 * The array is the only member of a struct and is referenced as
 * tab_frw_01234567_sse2.tab_frw_01234567_sse2.
 * NOTE(review): fdct_row_sse2 in this file only addresses byte offsets
 * 0..255, i.e. the first four expansions; the last four repeat earlier sets
 * and look unused here - confirm before removing.
 */
156 | static const struct | |
157 | { | |
158 | DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256]; | |
159 | } tab_frw_01234567_sse2 = | |
160 | {{ | |
161 | //DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table | |
162 | #define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \ | |
163 | C4, C4, C5, C7, C2, C6, C3, -C7, \ | |
164 | -C4, C4, C7, C3, C6, -C2, C7, -C5, \ | |
165 | C4, -C4, C5, -C1, C2, -C6, C3, -C1, | |
166 | // c1..c7 * cos(pi/4) * 2^15 | |
/* set at byte offset 0 */
167 | #define C1 22725 | |
168 | #define C2 21407 | |
169 | #define C3 19266 | |
170 | #define C4 16384 | |
171 | #define C5 12873 | |
172 | #define C6 8867 | |
173 | #define C7 4520 | |
174 | TABLE_SSE2 | |
175 | ||
176 | #undef C1 | |
177 | #undef C2 | |
178 | #undef C3 | |
179 | #undef C4 | |
180 | #undef C5 | |
181 | #undef C6 | |
182 | #undef C7 | |
/* set at byte offset 64 */
183 | #define C1 31521 | |
184 | #define C2 29692 | |
185 | #define C3 26722 | |
186 | #define C4 22725 | |
187 | #define C5 17855 | |
188 | #define C6 12299 | |
189 | #define C7 6270 | |
190 | TABLE_SSE2 | |
191 | ||
192 | #undef C1 | |
193 | #undef C2 | |
194 | #undef C3 | |
195 | #undef C4 | |
196 | #undef C5 | |
197 | #undef C6 | |
198 | #undef C7 | |
/* set at byte offset 128 */
199 | #define C1 29692 | |
200 | #define C2 27969 | |
201 | #define C3 25172 | |
202 | #define C4 21407 | |
203 | #define C5 16819 | |
204 | #define C6 11585 | |
205 | #define C7 5906 | |
206 | TABLE_SSE2 | |
207 | ||
208 | #undef C1 | |
209 | #undef C2 | |
210 | #undef C3 | |
211 | #undef C4 | |
212 | #undef C5 | |
213 | #undef C6 | |
214 | #undef C7 | |
/* set at byte offset 192 */
215 | #define C1 26722 | |
216 | #define C2 25172 | |
217 | #define C3 22654 | |
218 | #define C4 19266 | |
219 | #define C5 15137 | |
220 | #define C6 10426 | |
221 | #define C7 5315 | |
222 | TABLE_SSE2 | |
223 | ||
224 | #undef C1 | |
225 | #undef C2 | |
226 | #undef C3 | |
227 | #undef C4 | |
228 | #undef C5 | |
229 | #undef C6 | |
230 | #undef C7 | |
/* set at byte offset 256 (same values as offset 0) */
231 | #define C1 22725 | |
232 | #define C2 21407 | |
233 | #define C3 19266 | |
234 | #define C4 16384 | |
235 | #define C5 12873 | |
236 | #define C6 8867 | |
237 | #define C7 4520 | |
238 | TABLE_SSE2 | |
239 | ||
240 | #undef C1 | |
241 | #undef C2 | |
242 | #undef C3 | |
243 | #undef C4 | |
244 | #undef C5 | |
245 | #undef C6 | |
246 | #undef C7 | |
/* set at byte offset 320 (same values as offset 192) */
247 | #define C1 26722 | |
248 | #define C2 25172 | |
249 | #define C3 22654 | |
250 | #define C4 19266 | |
251 | #define C5 15137 | |
252 | #define C6 10426 | |
253 | #define C7 5315 | |
254 | TABLE_SSE2 | |
255 | ||
256 | #undef C1 | |
257 | #undef C2 | |
258 | #undef C3 | |
259 | #undef C4 | |
260 | #undef C5 | |
261 | #undef C6 | |
262 | #undef C7 | |
/* set at byte offset 384 (same values as offset 128) */
263 | #define C1 29692 | |
264 | #define C2 27969 | |
265 | #define C3 25172 | |
266 | #define C4 21407 | |
267 | #define C5 16819 | |
268 | #define C6 11585 | |
269 | #define C7 5906 | |
270 | TABLE_SSE2 | |
271 | ||
272 | #undef C1 | |
273 | #undef C2 | |
274 | #undef C3 | |
275 | #undef C4 | |
276 | #undef C5 | |
277 | #undef C6 | |
278 | #undef C7 | |
/* set at byte offset 448 (same values as offset 64) */
279 | #define C1 31521 | |
280 | #define C2 29692 | |
281 | #define C3 26722 | |
282 | #define C4 22725 | |
283 | #define C5 17855 | |
284 | #define C6 12299 | |
285 | #define C7 6270 | |
286 | TABLE_SSE2 | |
287 | }}; | |
288 | ||
/* Short alias for AV_TOSTRING, used to paste the numeric value of the shift
 * macros into the asm template strings as immediates.
 * NOTE(review): assumes AV_TOSTRING expands its argument before
 * stringifying - confirm in the libavutil headers. */
289 | #define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long | |
290 | ||
/*
 * FDCT_COL(cpu, mm, mov) defines fdct_col_<cpu>(in, out, offset), the column
 * (vertical) pass of the 8x8 forward DCT.
 *
 * Macro parameters:
 *   cpu - suffix of the generated function name
 *   mm  - register name prefix pasted into the asm ("mm" or "xmm")
 *   mov - full-register load/store mnemonic (movq or movdqa)
 *
 * Generated function:
 *   in     - source block; row r is read at byte offset 16*r (8 int16_t per row)
 *   out    - destination block, written with the same layout
 *   offset - element offset added to both in and out to select the first column
 *
 * asm operands: %0 = in + offset, %1 = fdct_tg_all_16, %2 = fdct_one_corr,
 * %3 = out + offset, %4 = ocos_4_16.
 *
 * One call transforms as many columns as fit in one register: four with
 * movq/mm (hence the callers' two calls with offset 0 and 4) and eight with
 * movdqa/xmm. The movdqa variant needs 16-byte aligned in/out.
 * Inputs are scaled up by SHIFT_FRW_COL (psllw) before the butterflies.
 *
 * The asm reads and writes the blocks through pointers that are passed only
 * as "r" inputs, so the compiler cannot see those accesses; the "memory"
 * clobber tells it not to cache or reorder memory accesses across the asm.
 * The vector registers used are not listed as clobbers.
 */
291 | #define FDCT_COL(cpu, mm, mov)\ | |
292 | static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\ | |
293 | {\ | |
294 | __asm__ volatile (\ | |
295 | #mov" 16(%0), %%"#mm"0 \n\t" \ | |
296 | #mov" 96(%0), %%"#mm"1 \n\t" \ | |
297 | #mov" %%"#mm"0, %%"#mm"2 \n\t" \ | |
298 | #mov" 32(%0), %%"#mm"3 \n\t" \ | |
299 | "paddsw %%"#mm"1, %%"#mm"0 \n\t" \ | |
300 | #mov" 80(%0), %%"#mm"4 \n\t" \ | |
301 | "psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \ | |
302 | #mov" (%0), %%"#mm"5 \n\t" \ | |
303 | "paddsw %%"#mm"3, %%"#mm"4 \n\t" \ | |
304 | "paddsw 112(%0), %%"#mm"5 \n\t" \ | |
305 | "psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \ | |
306 | #mov" %%"#mm"0, %%"#mm"6 \n\t" \ | |
307 | "psubsw %%"#mm"1, %%"#mm"2 \n\t" \ | |
308 | #mov" 16(%1), %%"#mm"1 \n\t" \ | |
309 | "psubsw %%"#mm"4, %%"#mm"0 \n\t" \ | |
310 | #mov" 48(%0), %%"#mm"7 \n\t" \ | |
311 | "pmulhw %%"#mm"0, %%"#mm"1 \n\t" \ | |
312 | "paddsw 64(%0), %%"#mm"7 \n\t" \ | |
313 | "psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \ | |
314 | "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ | |
315 | "psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \ | |
316 | #mov" %%"#mm"5, %%"#mm"4 \n\t" \ | |
317 | "psubsw %%"#mm"7, %%"#mm"5 \n\t" \ | |
318 | "paddsw %%"#mm"5, %%"#mm"1 \n\t" \ | |
319 | "paddsw %%"#mm"7, %%"#mm"4 \n\t" \ | |
320 | "por (%2), %%"#mm"1 \n\t" \ | |
321 | "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \ | |
322 | "pmulhw 16(%1), %%"#mm"5 \n\t" \ | |
323 | #mov" %%"#mm"4, %%"#mm"7 \n\t" \ | |
324 | "psubsw 80(%0), %%"#mm"3 \n\t" \ | |
325 | "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ | |
326 | #mov" %%"#mm"1, 32(%3) \n\t" \ | |
327 | "paddsw %%"#mm"6, %%"#mm"7 \n\t" \ | |
328 | #mov" 48(%0), %%"#mm"1 \n\t" \ | |
329 | "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \ | |
330 | "psubsw 64(%0), %%"#mm"1 \n\t" \ | |
331 | #mov" %%"#mm"2, %%"#mm"6 \n\t" \ | |
332 | #mov" %%"#mm"4, 64(%3) \n\t" \ | |
333 | "paddsw %%"#mm"3, %%"#mm"2 \n\t" \ | |
334 | "pmulhw (%4), %%"#mm"2 \n\t" \ | |
335 | "psubsw %%"#mm"3, %%"#mm"6 \n\t" \ | |
336 | "pmulhw (%4), %%"#mm"6 \n\t" \ | |
337 | "psubsw %%"#mm"0, %%"#mm"5 \n\t" \ | |
338 | "por (%2), %%"#mm"5 \n\t" \ | |
339 | "psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \ | |
340 | "por (%2), %%"#mm"2 \n\t" \ | |
341 | #mov" %%"#mm"1, %%"#mm"4 \n\t" \ | |
342 | #mov" (%0), %%"#mm"3 \n\t" \ | |
343 | "paddsw %%"#mm"6, %%"#mm"1 \n\t" \ | |
344 | "psubsw 112(%0), %%"#mm"3 \n\t" \ | |
345 | "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ | |
346 | #mov" (%1), %%"#mm"0 \n\t" \ | |
347 | "psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \ | |
348 | #mov" 32(%1), %%"#mm"6 \n\t" \ | |
349 | "pmulhw %%"#mm"1, %%"#mm"0 \n\t" \ | |
350 | #mov" %%"#mm"7, (%3) \n\t" \ | |
351 | "pmulhw %%"#mm"4, %%"#mm"6 \n\t" \ | |
352 | #mov" %%"#mm"5, 96(%3) \n\t" \ | |
353 | #mov" %%"#mm"3, %%"#mm"7 \n\t" \ | |
354 | #mov" 32(%1), %%"#mm"5 \n\t" \ | |
355 | "psubsw %%"#mm"2, %%"#mm"7 \n\t" \ | |
356 | "paddsw %%"#mm"2, %%"#mm"3 \n\t" \ | |
357 | "pmulhw %%"#mm"7, %%"#mm"5 \n\t" \ | |
358 | "paddsw %%"#mm"3, %%"#mm"0 \n\t" \ | |
359 | "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ | |
360 | "pmulhw (%1), %%"#mm"3 \n\t" \ | |
361 | "por (%2), %%"#mm"0 \n\t" \ | |
362 | "paddsw %%"#mm"7, %%"#mm"5 \n\t" \ | |
363 | "psubsw %%"#mm"6, %%"#mm"7 \n\t" \ | |
364 | #mov" %%"#mm"0, 16(%3) \n\t" \ | |
365 | "paddsw %%"#mm"4, %%"#mm"5 \n\t" \ | |
366 | #mov" %%"#mm"7, 48(%3) \n\t" \ | |
367 | "psubsw %%"#mm"1, %%"#mm"3 \n\t" \ | |
368 | #mov" %%"#mm"5, 80(%3) \n\t" \ | |
369 | #mov" %%"#mm"3, 112(%3) \n\t" \ | |
370 | : \ | |
371 | : "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \ | |
372 | "r" (out + offset), "r" (ocos_4_16) : "memory"); \ | |
373 | } | |
374 | ||
375 | FDCT_COL(mmx, mm, movq) | |
376 | FDCT_COL(sse2, xmm, movdqa) | |
377 | ||
/*
 * Row (horizontal) pass of the 8x8 forward DCT for SSE2; handles all eight
 * rows in one asm statement.
 *
 *   in  - block produced by the column pass; row r is read at byte offset 16*r
 *   out - destination block; each row is stored with movdqa, so it must be
 *         16-byte aligned
 *
 * asm operands: %0 = in, %1 = tab_frw_01234567_sse2 coefficients,
 * %2 = rounding constant (kept in xmm6 for the whole block),
 * %3 = SHIFT_FRW_ROW as an immediate, %4 = out.
 *
 * Rows are processed in the pairs (0,4), (1,7), (2,6), (3,5); both rows of a
 * pair use the same 64-byte coefficient set. FDCT_ROW_SSE2_H1 loads the row
 * and all four coefficient vectors; FDCT_ROW_SSE2_H2 reloads only xmm3/xmm7,
 * which the previous pmaddwd overwrote, and reuses xmm4/xmm5.
 * FDCT_ROW_SSE2 reverses the upper four samples (pshuflw $27), forms the sums
 * and differences with the lower four, multiplies by the coefficients
 * (pmaddwd), rounds, shifts right and packs to int16_t with saturation.
 *
 * The blocks are accessed through pointers passed only as "r" inputs, so a
 * "memory" clobber is declared in addition to the XMM registers.
 * XMM_CLOBBERS (from libavutil/x86/asm.h) is used instead of
 * XMM_CLOBBERS_ONLY so that the list can be followed by "memory".
 */
378 | static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) | |
379 | { | |
380 | __asm__ volatile( | |
381 | #define FDCT_ROW_SSE2_H1(i,t) \ | |
382 | "movq " #i "(%0), %%xmm2 \n\t" \ | |
383 | "movq " #i "+8(%0), %%xmm0 \n\t" \ | |
384 | "movdqa " #t "+32(%1), %%xmm3 \n\t" \ | |
385 | "movdqa " #t "+48(%1), %%xmm7 \n\t" \ | |
386 | "movdqa " #t "(%1), %%xmm4 \n\t" \ | |
387 | "movdqa " #t "+16(%1), %%xmm5 \n\t" | |
388 | ||
389 | #define FDCT_ROW_SSE2_H2(i,t) \ | |
390 | "movq " #i "(%0), %%xmm2 \n\t" \ | |
391 | "movq " #i "+8(%0), %%xmm0 \n\t" \ | |
392 | "movdqa " #t "+32(%1), %%xmm3 \n\t" \ | |
393 | "movdqa " #t "+48(%1), %%xmm7 \n\t" | |
394 | ||
395 | #define FDCT_ROW_SSE2(i) \ | |
396 | "movq %%xmm2, %%xmm1 \n\t" \ | |
397 | "pshuflw $27, %%xmm0, %%xmm0 \n\t" \ | |
398 | "paddsw %%xmm0, %%xmm1 \n\t" \ | |
399 | "psubsw %%xmm0, %%xmm2 \n\t" \ | |
400 | "punpckldq %%xmm2, %%xmm1 \n\t" \ | |
401 | "pshufd $78, %%xmm1, %%xmm2 \n\t" \ | |
402 | "pmaddwd %%xmm2, %%xmm3 \n\t" \ | |
403 | "pmaddwd %%xmm1, %%xmm7 \n\t" \ | |
404 | "pmaddwd %%xmm5, %%xmm2 \n\t" \ | |
405 | "pmaddwd %%xmm4, %%xmm1 \n\t" \ | |
406 | "paddd %%xmm7, %%xmm3 \n\t" \ | |
407 | "paddd %%xmm2, %%xmm1 \n\t" \ | |
408 | "paddd %%xmm6, %%xmm3 \n\t" \ | |
409 | "paddd %%xmm6, %%xmm1 \n\t" \ | |
410 | "psrad %3, %%xmm3 \n\t" \ | |
411 | "psrad %3, %%xmm1 \n\t" \ | |
412 | "packssdw %%xmm3, %%xmm1 \n\t" \ | |
413 | "movdqa %%xmm1, " #i "(%4) \n\t" | |
414 | ||
415 | "movdqa (%2), %%xmm6 \n\t" | |
416 | FDCT_ROW_SSE2_H1(0,0) | |
417 | FDCT_ROW_SSE2(0) | |
418 | FDCT_ROW_SSE2_H2(64,0) | |
419 | FDCT_ROW_SSE2(64) | |
420 | ||
421 | FDCT_ROW_SSE2_H1(16,64) | |
422 | FDCT_ROW_SSE2(16) | |
423 | FDCT_ROW_SSE2_H2(112,64) | |
424 | FDCT_ROW_SSE2(112) | |
425 | ||
426 | FDCT_ROW_SSE2_H1(32,128) | |
427 | FDCT_ROW_SSE2(32) | |
428 | FDCT_ROW_SSE2_H2(96,128) | |
429 | FDCT_ROW_SSE2(96) | |
430 | ||
431 | FDCT_ROW_SSE2_H1(48,192) | |
432 | FDCT_ROW_SSE2(48) | |
433 | FDCT_ROW_SSE2_H2(80,192) | |
434 | FDCT_ROW_SSE2(80) | |
435 | : | |
436 | : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), | |
437 | "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out) | |
438 | : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", | |
439 | "%xmm4", "%xmm5", "%xmm6", "%xmm7",) "memory" | |
440 | ); | |
441 | } | |
442 | ||
/*
 * Row (horizontal) pass of the forward DCT for one row of eight coefficients,
 * MMXEXT variant.
 *
 *   in    - the row to transform (8 int16_t)
 *   out   - where the 8 transformed int16_t are stored
 *   table - the 32-entry coefficient group for this row, read as eight
 *           8-byte operands at byte offsets 0..56
 *
 * asm operands: %0 = in, %1 = table, %2 = fdct_r_row, %3 = out.
 *
 * pshufw $0x1B reverses the upper four samples; their sums and differences
 * with the lower four are multiplied by the coefficients (pmaddwd), the
 * fdct_r_row rounding constant is added, and the 32-bit results are shifted
 * right by SHIFT_FRW_ROW and packed to int16_t with saturation.
 *
 * in and out are passed only as "r" inputs although the asm reads and writes
 * through them, so a "memory" clobber is declared to keep the compiler from
 * caching or reordering memory accesses across the asm.
 * The MMX registers used are not listed as clobbers.
 */
443 | static av_always_inline void fdct_row_mmxext(const int16_t *in, int16_t *out, | |
444 | const int16_t *table) | |
445 | { | |
446 | __asm__ volatile ( | |
447 | "pshufw $0x1B, 8(%0), %%mm5 \n\t" | |
448 | "movq (%0), %%mm0 \n\t" | |
449 | "movq %%mm0, %%mm1 \n\t" | |
450 | "paddsw %%mm5, %%mm0 \n\t" | |
451 | "psubsw %%mm5, %%mm1 \n\t" | |
452 | "movq %%mm0, %%mm2 \n\t" | |
453 | "punpckldq %%mm1, %%mm0 \n\t" | |
454 | "punpckhdq %%mm1, %%mm2 \n\t" | |
455 | "movq (%1), %%mm1 \n\t" | |
456 | "movq 8(%1), %%mm3 \n\t" | |
457 | "movq 16(%1), %%mm4 \n\t" | |
458 | "movq 24(%1), %%mm5 \n\t" | |
459 | "movq 32(%1), %%mm6 \n\t" | |
460 | "movq 40(%1), %%mm7 \n\t" | |
461 | "pmaddwd %%mm0, %%mm1 \n\t" | |
462 | "pmaddwd %%mm2, %%mm3 \n\t" | |
463 | "pmaddwd %%mm0, %%mm4 \n\t" | |
464 | "pmaddwd %%mm2, %%mm5 \n\t" | |
465 | "pmaddwd %%mm0, %%mm6 \n\t" | |
466 | "pmaddwd %%mm2, %%mm7 \n\t" | |
467 | "pmaddwd 48(%1), %%mm0 \n\t" | |
468 | "pmaddwd 56(%1), %%mm2 \n\t" | |
469 | "paddd %%mm1, %%mm3 \n\t" | |
470 | "paddd %%mm4, %%mm5 \n\t" | |
471 | "paddd %%mm6, %%mm7 \n\t" | |
472 | "paddd %%mm0, %%mm2 \n\t" | |
473 | "movq (%2), %%mm0 \n\t" | |
474 | "paddd %%mm0, %%mm3 \n\t" | |
475 | "paddd %%mm0, %%mm5 \n\t" | |
476 | "paddd %%mm0, %%mm7 \n\t" | |
477 | "paddd %%mm0, %%mm2 \n\t" | |
478 | "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t" | |
479 | "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t" | |
480 | "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t" | |
481 | "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t" | |
482 | "packssdw %%mm5, %%mm3 \n\t" | |
483 | "packssdw %%mm2, %%mm7 \n\t" | |
484 | "movq %%mm3, (%3) \n\t" | |
485 | "movq %%mm7, 8(%3) \n\t" | |
486 | : | |
487 | : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out) : "memory"); | |
488 | } | |
489 | ||
/*
 * Row (horizontal) pass of the forward DCT for one row of eight coefficients,
 * plain MMX variant.
 *
 *   in    - the row to transform (8 int16_t)
 *   out   - where the 8 transformed int16_t are stored
 *   table - the 32-entry coefficient group for this row, read as eight
 *           8-byte operands at byte offsets 0..56
 *
 * asm operands: %0 = in, %1 = table, %2 = fdct_r_row, %3 = out.
 *
 * The first six instructions build the upper four samples in reversed order
 * from movd/punpcklwd/psrlq, i.e. the same value fdct_row_mmxext obtains with
 * a single pshufw $0x1B. From the pmaddwd sequence onwards the code matches
 * fdct_row_mmxext: multiply by the coefficients, add the fdct_r_row rounding
 * constant, shift right by SHIFT_FRW_ROW and pack to int16_t with saturation.
 *
 * in and out are passed only as "r" inputs although the asm reads and writes
 * through them, so a "memory" clobber is declared to keep the compiler from
 * caching or reordering memory accesses across the asm.
 * The MMX registers used are not listed as clobbers.
 */
490 | static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table) | |
491 | { | |
492 | //FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...) | |
493 | __asm__ volatile( | |
494 | "movd 12(%0), %%mm1 \n\t" | |
495 | "punpcklwd 8(%0), %%mm1 \n\t" | |
496 | "movq %%mm1, %%mm2 \n\t" | |
497 | "psrlq $0x20, %%mm1 \n\t" | |
498 | "movq 0(%0), %%mm0 \n\t" | |
499 | "punpcklwd %%mm2, %%mm1 \n\t" | |
500 | "movq %%mm0, %%mm5 \n\t" | |
501 | "paddsw %%mm1, %%mm0 \n\t" | |
502 | "psubsw %%mm1, %%mm5 \n\t" | |
503 | "movq %%mm0, %%mm2 \n\t" | |
504 | "punpckldq %%mm5, %%mm0 \n\t" | |
505 | "punpckhdq %%mm5, %%mm2 \n\t" | |
506 | "movq 0(%1), %%mm1 \n\t" | |
507 | "movq 8(%1), %%mm3 \n\t" | |
508 | "movq 16(%1), %%mm4 \n\t" | |
509 | "movq 24(%1), %%mm5 \n\t" | |
510 | "movq 32(%1), %%mm6 \n\t" | |
511 | "movq 40(%1), %%mm7 \n\t" | |
512 | "pmaddwd %%mm0, %%mm1 \n\t" | |
513 | "pmaddwd %%mm2, %%mm3 \n\t" | |
514 | "pmaddwd %%mm0, %%mm4 \n\t" | |
515 | "pmaddwd %%mm2, %%mm5 \n\t" | |
516 | "pmaddwd %%mm0, %%mm6 \n\t" | |
517 | "pmaddwd %%mm2, %%mm7 \n\t" | |
518 | "pmaddwd 48(%1), %%mm0 \n\t" | |
519 | "pmaddwd 56(%1), %%mm2 \n\t" | |
520 | "paddd %%mm1, %%mm3 \n\t" | |
521 | "paddd %%mm4, %%mm5 \n\t" | |
522 | "paddd %%mm6, %%mm7 \n\t" | |
523 | "paddd %%mm0, %%mm2 \n\t" | |
524 | "movq (%2), %%mm0 \n\t" | |
525 | "paddd %%mm0, %%mm3 \n\t" | |
526 | "paddd %%mm0, %%mm5 \n\t" | |
527 | "paddd %%mm0, %%mm7 \n\t" | |
528 | "paddd %%mm0, %%mm2 \n\t" | |
529 | "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t" | |
530 | "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t" | |
531 | "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t" | |
532 | "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t" | |
533 | "packssdw %%mm5, %%mm3 \n\t" | |
534 | "packssdw %%mm2, %%mm7 \n\t" | |
535 | "movq %%mm3, 0(%3) \n\t" | |
536 | "movq %%mm7, 8(%3) \n\t" | |
537 | : | |
538 | : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out) : "memory"); | |
539 | } | |
540 | ||
/*
 * In-place 8x8 forward DCT of block (64 int16_t), MMX code path.
 * The column pass writes into an 8-byte aligned scratch buffer, four columns
 * per call; the row pass then transforms the scratch rows back into block,
 * using the 32-entry group of tab_frw_01234567 that belongs to each row.
 * No emms is issued here.
 */
541 | void ff_fdct_mmx(int16_t *block) | |
542 | { | |
543 | DECLARE_ALIGNED(8, int64_t, align_tmp)[16]; | |
544 | int16_t *const scratch = (int16_t *)align_tmp; | |
545 | int row; | |
546 | ||
547 | fdct_col_mmx(block, scratch, 0); | |
548 | fdct_col_mmx(block, scratch, 4); | |
549 | ||
550 | for (row = 0; row < 8; row++) | |
551 | fdct_row_mmx(scratch + 8 * row, block + 8 * row, tab_frw_01234567 + 32 * row); | |
552 | } | |
558 | ||
559 | #endif /* HAVE_MMX_INLINE */ | |
560 | ||
561 | #if HAVE_MMXEXT_INLINE | |
562 | ||
/*
 * In-place 8x8 forward DCT of block (64 int16_t), MMXEXT code path.
 * Uses the same MMX column pass as ff_fdct_mmx (two calls, four columns
 * each, into an 8-byte aligned scratch buffer) and then fdct_row_mmxext for
 * each of the eight rows, with the matching 32-entry group of
 * tab_frw_01234567. No emms is issued here.
 */
563 | void ff_fdct_mmxext(int16_t *block) | |
564 | { | |
565 | DECLARE_ALIGNED(8, int64_t, align_tmp)[16]; | |
566 | int16_t *const scratch = (int16_t *)align_tmp; | |
567 | int row; | |
568 | ||
569 | fdct_col_mmx(block, scratch, 0); | |
570 | fdct_col_mmx(block, scratch, 4); | |
571 | ||
572 | for (row = 0; row < 8; row++) | |
573 | fdct_row_mmxext(scratch + 8 * row, block + 8 * row, tab_frw_01234567 + 32 * row); | |
574 | } | |
580 | ||
581 | #endif /* HAVE_MMXEXT_INLINE */ | |
582 | ||
583 | #if HAVE_SSE2_INLINE | |
584 | ||
/*
 * In-place 8x8 forward DCT of block (64 int16_t), SSE2 code path.
 * One column-pass call covers all eight columns and writes into a 16-byte
 * aligned scratch buffer; one row-pass call transforms all eight rows back
 * into block. Both passes access block with movdqa, so block must be
 * 16-byte aligned.
 */
585 | void ff_fdct_sse2(int16_t *block) | |
586 | { | |
587 | DECLARE_ALIGNED(16, int64_t, align_tmp)[16]; | |
588 | ||
589 | fdct_col_sse2(block, (int16_t *)align_tmp, 0); | |
590 | fdct_row_sse2((int16_t *)align_tmp, block); | |
591 | } | |
593 | ||
594 | #endif /* HAVE_SSE2_INLINE */ |