/*
 * Copyright (C) 2003 James Klicman <james@klicman.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavcodec/fdctdsp.h"
#include "fdct.h"

#if HAVE_ALTIVEC

#define vs16(v) ((vector signed short) (v))
#define vs32(v) ((vector signed int) (v))
#define vu8(v)  ((vector unsigned char) (v))
#define vu16(v) ((vector unsigned short) (v))
#define vu32(v) ((vector unsigned int) (v))

#define C1     0.98078525066375732421875000 /* cos(1 * PI / 16) */
#define C2     0.92387950420379638671875000 /* cos(2 * PI / 16) */
#define C3     0.83146959543228149414062500 /* cos(3 * PI / 16) */
#define C4     0.70710676908493041992187500 /* cos(4 * PI / 16) */
#define C5     0.55557024478912353515625000 /* cos(5 * PI / 16) */
#define C6     0.38268342614173889160156250 /* cos(6 * PI / 16) */
#define C7     0.19509032368659973144531250 /* cos(7 * PI / 16) */
#define SQRT_2 1.41421353816986083984375000 /* sqrt(2) */

#define W0 -(2 * C2)
#define W1  (2 * C6)
#define W2 (SQRT_2 * C6)
#define W3 (SQRT_2 * C3)
#define W4 (SQRT_2 * (-C1 + C3 + C5 - C7))
#define W5 (SQRT_2 * (C1 + C3 - C5 + C7))
#define W6 (SQRT_2 * (C1 + C3 + C5 - C7))
#define W7 (SQRT_2 * (C1 + C3 - C5 - C7))
#define W8 (SQRT_2 * (C7 - C3))
#define W9 (SQRT_2 * (-C1 - C3))
#define WA (SQRT_2 * (-C3 - C5))
#define WB (SQRT_2 * (C5 - C3))
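
/* The W* factors fold the cosine terms together so that each
 * multiplication in the 1-D butterfly below can be issued as a single
 * fused multiply-add (vec_madd) against a splatted constant. */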

static const vector float fdctconsts[3] = {
    { W0, W1, W2, W3 },
    { W4, W5, W6, W7 },
    { W8, W9, WA, WB }
};

#define LD_W0 vec_splat(cnsts0, 0)
#define LD_W1 vec_splat(cnsts0, 1)
#define LD_W2 vec_splat(cnsts0, 2)
#define LD_W3 vec_splat(cnsts0, 3)
#define LD_W4 vec_splat(cnsts1, 0)
#define LD_W5 vec_splat(cnsts1, 1)
#define LD_W6 vec_splat(cnsts1, 2)
#define LD_W7 vec_splat(cnsts1, 3)
#define LD_W8 vec_splat(cnsts2, 0)
#define LD_W9 vec_splat(cnsts2, 1)
#define LD_WA vec_splat(cnsts2, 2)
#define LD_WB vec_splat(cnsts2, 3)

#define FDCTROW(b0, b1, b2, b3, b4, b5, b6, b7) /* {{{ */   \
    x0 = vec_add(b0, b7);             /* x0 = b0 + b7; */   \
    x7 = vec_sub(b0, b7);             /* x7 = b0 - b7; */   \
    x1 = vec_add(b1, b6);             /* x1 = b1 + b6; */   \
    x6 = vec_sub(b1, b6);             /* x6 = b1 - b6; */   \
    x2 = vec_add(b2, b5);             /* x2 = b2 + b5; */   \
    x5 = vec_sub(b2, b5);             /* x5 = b2 - b5; */   \
    x3 = vec_add(b3, b4);             /* x3 = b3 + b4; */   \
    x4 = vec_sub(b3, b4);             /* x4 = b3 - b4; */   \
                                                            \
    b7 = vec_add(x0, x3);             /* b7 = x0 + x3; */   \
    b1 = vec_add(x1, x2);             /* b1 = x1 + x2; */   \
    b0 = vec_add(b7, b1);             /* b0 = b7 + b1; */   \
    b4 = vec_sub(b7, b1);             /* b4 = b7 - b1; */   \
                                                            \
    b2 = vec_sub(x0, x3);             /* b2 = x0 - x3; */   \
    b6 = vec_sub(x1, x2);             /* b6 = x1 - x2; */   \
    b5 = vec_add(b6, b2);             /* b5 = b6 + b2; */   \
    cnst = LD_W2;                                           \
    b5 = vec_madd(cnst, b5, mzero);   /* b5 = b5 * W2; */   \
    cnst = LD_W1;                                           \
    b2 = vec_madd(cnst, b2, b5);      /* b2 = b5 + b2 * W1; */ \
    cnst = LD_W0;                                           \
    b6 = vec_madd(cnst, b6, b5);      /* b6 = b5 + b6 * W0; */ \
                                                            \
    x0 = vec_add(x4, x7);             /* x0 = x4 + x7; */   \
    x1 = vec_add(x5, x6);             /* x1 = x5 + x6; */   \
    x2 = vec_add(x4, x6);             /* x2 = x4 + x6; */   \
    x3 = vec_add(x5, x7);             /* x3 = x5 + x7; */   \
    x8 = vec_add(x2, x3);             /* x8 = x2 + x3; */   \
    cnst = LD_W3;                                           \
    x8 = vec_madd(cnst, x8, mzero);   /* x8 = x8 * W3; */   \
                                                            \
    cnst = LD_W8;                                           \
    x0 = vec_madd(cnst, x0, mzero);   /* x0 *= W8; */       \
    cnst = LD_W9;                                           \
    x1 = vec_madd(cnst, x1, mzero);   /* x1 *= W9; */       \
    cnst = LD_WA;                                           \
    x2 = vec_madd(cnst, x2, x8);      /* x2 = x2 * WA + x8; */ \
    cnst = LD_WB;                                           \
    x3 = vec_madd(cnst, x3, x8);      /* x3 = x3 * WB + x8; */ \
                                                            \
    cnst = LD_W4;                                           \
    b7 = vec_madd(cnst, x4, x0);      /* b7 = x4 * W4 + x0; */ \
    cnst = LD_W5;                                           \
    b5 = vec_madd(cnst, x5, x1);      /* b5 = x5 * W5 + x1; */ \
    cnst = LD_W6;                                           \
    b3 = vec_madd(cnst, x6, x1);      /* b3 = x6 * W6 + x1; */ \
    cnst = LD_W7;                                           \
    b1 = vec_madd(cnst, x7, x0);      /* b1 = x7 * W7 + x0; */ \
                                                            \
    b7 = vec_add(b7, x2);             /* b7 = b7 + x2; */   \
    b5 = vec_add(b5, x3);             /* b5 = b5 + x3; */   \
    b3 = vec_add(b3, x2);             /* b3 = b3 + x2; */   \
    b1 = vec_add(b1, x3)              /* b1 = b1 + x3; */   \
    /* }}} */

#define FDCTCOL(b0, b1, b2, b3, b4, b5, b6, b7) /* {{{ */   \
    x0 = vec_add(b0, b7);             /* x0 = b0 + b7; */   \
    x7 = vec_sub(b0, b7);             /* x7 = b0 - b7; */   \
    x1 = vec_add(b1, b6);             /* x1 = b1 + b6; */   \
    x6 = vec_sub(b1, b6);             /* x6 = b1 - b6; */   \
    x2 = vec_add(b2, b5);             /* x2 = b2 + b5; */   \
    x5 = vec_sub(b2, b5);             /* x5 = b2 - b5; */   \
    x3 = vec_add(b3, b4);             /* x3 = b3 + b4; */   \
    x4 = vec_sub(b3, b4);             /* x4 = b3 - b4; */   \
                                                            \
    b7 = vec_add(x0, x3);             /* b7 = x0 + x3; */   \
    b1 = vec_add(x1, x2);             /* b1 = x1 + x2; */   \
    b0 = vec_add(b7, b1);             /* b0 = b7 + b1; */   \
    b4 = vec_sub(b7, b1);             /* b4 = b7 - b1; */   \
                                                            \
    b2 = vec_sub(x0, x3);             /* b2 = x0 - x3; */   \
    b6 = vec_sub(x1, x2);             /* b6 = x1 - x2; */   \
    b5 = vec_add(b6, b2);             /* b5 = b6 + b2; */   \
    cnst = LD_W2;                                           \
    b5 = vec_madd(cnst, b5, mzero);   /* b5 = b5 * W2; */   \
    cnst = LD_W1;                                           \
    b2 = vec_madd(cnst, b2, b5);      /* b2 = b5 + b2 * W1; */ \
    cnst = LD_W0;                                           \
    b6 = vec_madd(cnst, b6, b5);      /* b6 = b5 + b6 * W0; */ \
                                                            \
    x0 = vec_add(x4, x7);             /* x0 = x4 + x7; */   \
    x1 = vec_add(x5, x6);             /* x1 = x5 + x6; */   \
    x2 = vec_add(x4, x6);             /* x2 = x4 + x6; */   \
    x3 = vec_add(x5, x7);             /* x3 = x5 + x7; */   \
    x8 = vec_add(x2, x3);             /* x8 = x2 + x3; */   \
    cnst = LD_W3;                                           \
    x8 = vec_madd(cnst, x8, mzero);   /* x8 = x8 * W3; */   \
                                                            \
    cnst = LD_W8;                                           \
    x0 = vec_madd(cnst, x0, mzero);   /* x0 *= W8; */       \
    cnst = LD_W9;                                           \
    x1 = vec_madd(cnst, x1, mzero);   /* x1 *= W9; */       \
    cnst = LD_WA;                                           \
    x2 = vec_madd(cnst, x2, x8);      /* x2 = x2 * WA + x8; */ \
    cnst = LD_WB;                                           \
    x3 = vec_madd(cnst, x3, x8);      /* x3 = x3 * WB + x8; */ \
                                                            \
    cnst = LD_W4;                                           \
    b7 = vec_madd(cnst, x4, x0);      /* b7 = x4 * W4 + x0; */ \
    cnst = LD_W5;                                           \
    b5 = vec_madd(cnst, x5, x1);      /* b5 = x5 * W5 + x1; */ \
    cnst = LD_W6;                                           \
    b3 = vec_madd(cnst, x6, x1);      /* b3 = x6 * W6 + x1; */ \
    cnst = LD_W7;                                           \
    b1 = vec_madd(cnst, x7, x0);      /* b1 = x7 * W7 + x0; */ \
                                                            \
    b7 = vec_add(b7, x2);             /* b7 += x2; */       \
    b5 = vec_add(b5, x3);             /* b5 += x3; */       \
    b3 = vec_add(b3, x2);             /* b3 += x2; */       \
    b1 = vec_add(b1, x3)              /* b1 += x3; */       \
    /* }}} */
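
/* FDCTROW and FDCTCOL expand to the same 8-point butterfly; the two
 * names only mark which pass of the 2-D transform is being performed. */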

/* two dimensional discrete cosine transform */
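/*
 * Pipeline: transpose the 8x8 int16 block so each butterfly operates on
 * whole vectors, run one 1-D pass (its first stage still in int16, the
 * rest in float), transpose the float data back, run the second 1-D
 * pass, then round and pack the result back to int16 in place.
 */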
void ff_fdct_altivec(int16_t *block)
{
    vector signed short *bp;
    const vector float *cp = fdctconsts;
    vector float b00, b10, b20, b30, b40, b50, b60, b70;
    vector float b01, b11, b21, b31, b41, b51, b61, b71;
    vector float mzero, cnst, cnsts0, cnsts1, cnsts2;
    vector float x0, x1, x2, x3, x4, x5, x6, x7, x8;

    /* setup constants {{{ */
    /* mzero = -0.0 */
    mzero = ((vector float) vec_splat_u32(-1));
    mzero = ((vector float) vec_sl(vu32(mzero), vu32(mzero)));
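    /* vec_splat_u32(-1) puts all-one bits in every lane; vec_sl then
     * shifts each lane left by 31 (the shift count is taken modulo 32),
     * leaving only the sign bit, i.e. -0.0f.  -0.0 is used because it is
     * the exact additive identity for vec_madd: x + -0.0 == x for all x. */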
    cnsts0 = vec_ld(0, cp);
    cp++;
    cnsts1 = vec_ld(0, cp);
    cp++;
    cnsts2 = vec_ld(0, cp);
    /* }}} */

    /* 8x8 matrix transpose (vector short[8]) {{{ */
#define MERGE_S16(hl, a, b) vec_merge ## hl(vs16(a), vs16(b))
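    /* Interleaving rows i and i+4, then the intermediate results twice
     * more, performs the 8x8 16-bit transpose entirely with
     * vec_mergeh/vec_mergel: three stages of eight merges each. */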

    bp  = (vector signed short *) block;
    b00 = ((vector float) vec_ld(0, bp));
    b40 = ((vector float) vec_ld(16 * 4, bp));
    b01 = ((vector float) MERGE_S16(h, b00, b40));
    b11 = ((vector float) MERGE_S16(l, b00, b40));
    bp++;
    b10 = ((vector float) vec_ld(0, bp));
    b50 = ((vector float) vec_ld(16 * 4, bp));
    b21 = ((vector float) MERGE_S16(h, b10, b50));
    b31 = ((vector float) MERGE_S16(l, b10, b50));
    bp++;
    b20 = ((vector float) vec_ld(0, bp));
    b60 = ((vector float) vec_ld(16 * 4, bp));
    b41 = ((vector float) MERGE_S16(h, b20, b60));
    b51 = ((vector float) MERGE_S16(l, b20, b60));
    bp++;
    b30 = ((vector float) vec_ld(0, bp));
    b70 = ((vector float) vec_ld(16 * 4, bp));
    b61 = ((vector float) MERGE_S16(h, b30, b70));
    b71 = ((vector float) MERGE_S16(l, b30, b70));

    x0 = ((vector float) MERGE_S16(h, b01, b41));
    x1 = ((vector float) MERGE_S16(l, b01, b41));
    x2 = ((vector float) MERGE_S16(h, b11, b51));
    x3 = ((vector float) MERGE_S16(l, b11, b51));
    x4 = ((vector float) MERGE_S16(h, b21, b61));
    x5 = ((vector float) MERGE_S16(l, b21, b61));
    x6 = ((vector float) MERGE_S16(h, b31, b71));
    x7 = ((vector float) MERGE_S16(l, b31, b71));

    b00 = ((vector float) MERGE_S16(h, x0, x4));
    b10 = ((vector float) MERGE_S16(l, x0, x4));
    b20 = ((vector float) MERGE_S16(h, x1, x5));
    b30 = ((vector float) MERGE_S16(l, x1, x5));
    b40 = ((vector float) MERGE_S16(h, x2, x6));
    b50 = ((vector float) MERGE_S16(l, x2, x6));
    b60 = ((vector float) MERGE_S16(h, x3, x7));
    b70 = ((vector float) MERGE_S16(l, x3, x7));

#undef MERGE_S16
    /* }}} */

    /* Some of the initial calculations can be done as vector short
     * before conversion to vector float.  The following code section
     * takes advantage of this. */

    /* fdct rows {{{ */
    x0 = ((vector float) vec_add(vs16(b00), vs16(b70)));
    x7 = ((vector float) vec_sub(vs16(b00), vs16(b70)));
    x1 = ((vector float) vec_add(vs16(b10), vs16(b60)));
    x6 = ((vector float) vec_sub(vs16(b10), vs16(b60)));
    x2 = ((vector float) vec_add(vs16(b20), vs16(b50)));
    x5 = ((vector float) vec_sub(vs16(b20), vs16(b50)));
    x3 = ((vector float) vec_add(vs16(b30), vs16(b40)));
    x4 = ((vector float) vec_sub(vs16(b30), vs16(b40)));

    b70 = ((vector float) vec_add(vs16(x0), vs16(x3)));
    b10 = ((vector float) vec_add(vs16(x1), vs16(x2)));

    b00 = ((vector float) vec_add(vs16(b70), vs16(b10)));
    b40 = ((vector float) vec_sub(vs16(b70), vs16(b10)));

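/* CTF0(n): convert the row-register pair n from int16 to float.
 * vec_unpackh/vec_unpackl sign-extend the eight int16 lanes into two
 * int32 vectors, and vec_ctf(x, 0) converts them to float without any
 * fixed-point scaling. */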
#define CTF0(n)                                                     \
    b ## n ## 1 = ((vector float) vec_unpackl(vs16(b ## n ## 0)));  \
    b ## n ## 0 = ((vector float) vec_unpackh(vs16(b ## n ## 0)));  \
    b ## n ## 1 = vec_ctf(vs32(b ## n ## 1), 0);                    \
    b ## n ## 0 = vec_ctf(vs32(b ## n ## 0), 0)

    CTF0(0);
    CTF0(4);

    b20 = ((vector float) vec_sub(vs16(x0), vs16(x3)));
    b60 = ((vector float) vec_sub(vs16(x1), vs16(x2)));

    CTF0(2);
    CTF0(6);

#undef CTF0

    x0 = vec_add(b60, b20);
    x1 = vec_add(b61, b21);

    cnst = LD_W2;
    x0   = vec_madd(cnst, x0, mzero);
    x1   = vec_madd(cnst, x1, mzero);
    cnst = LD_W1;
    b20  = vec_madd(cnst, b20, x0);
    b21  = vec_madd(cnst, b21, x1);
    cnst = LD_W0;
    b60  = vec_madd(cnst, b60, x0);
    b61  = vec_madd(cnst, b61, x1);

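/* CTFX(x, b): the same int16 -> float conversion as CTF0, reading from
 * a temporary x register into the high/low halves of a b register pair. */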
#define CTFX(x, b)                                          \
    b ## 0 = ((vector float) vec_unpackh(vs16(x)));         \
    b ## 1 = ((vector float) vec_unpackl(vs16(x)));         \
    b ## 0 = vec_ctf(vs32(b ## 0), 0);                      \
    b ## 1 = vec_ctf(vs32(b ## 1), 0)

    CTFX(x4, b7);
    CTFX(x5, b5);
    CTFX(x6, b3);
    CTFX(x7, b1);

#undef CTFX

    x0 = vec_add(b70, b10);
    x1 = vec_add(b50, b30);
    x2 = vec_add(b70, b30);
    x3 = vec_add(b50, b10);
    x8 = vec_add(x2, x3);
    cnst = LD_W3;
    x8   = vec_madd(cnst, x8, mzero);

    cnst = LD_W8;
    x0   = vec_madd(cnst, x0, mzero);
    cnst = LD_W9;
    x1   = vec_madd(cnst, x1, mzero);
    cnst = LD_WA;
    x2   = vec_madd(cnst, x2, x8);
    cnst = LD_WB;
    x3   = vec_madd(cnst, x3, x8);

    cnst = LD_W4;
    b70  = vec_madd(cnst, b70, x0);
    cnst = LD_W5;
    b50  = vec_madd(cnst, b50, x1);
    cnst = LD_W6;
    b30  = vec_madd(cnst, b30, x1);
    cnst = LD_W7;
    b10  = vec_madd(cnst, b10, x0);

    b70 = vec_add(b70, x2);
    b50 = vec_add(b50, x3);
    b30 = vec_add(b30, x2);
    b10 = vec_add(b10, x3);

    x0 = vec_add(b71, b11);
    x1 = vec_add(b51, b31);
    x2 = vec_add(b71, b31);
    x3 = vec_add(b51, b11);
    x8 = vec_add(x2, x3);
    cnst = LD_W3;
    x8   = vec_madd(cnst, x8, mzero);

    cnst = LD_W8;
    x0   = vec_madd(cnst, x0, mzero);
    cnst = LD_W9;
    x1   = vec_madd(cnst, x1, mzero);
    cnst = LD_WA;
    x2   = vec_madd(cnst, x2, x8);
    cnst = LD_WB;
    x3   = vec_madd(cnst, x3, x8);

    cnst = LD_W4;
    b71  = vec_madd(cnst, b71, x0);
    cnst = LD_W5;
    b51  = vec_madd(cnst, b51, x1);
    cnst = LD_W6;
    b31  = vec_madd(cnst, b31, x1);
    cnst = LD_W7;
    b11  = vec_madd(cnst, b11, x0);

    b71 = vec_add(b71, x2);
    b51 = vec_add(b51, x3);
    b31 = vec_add(b31, x2);
    b11 = vec_add(b11, x3);
    /* }}} */

    /* 8x8 matrix transpose (vector float[8][2]) {{{ */
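    /* Each row now lives in two float registers (b*0 = elements 0-3,
     * b*1 = elements 4-7), so the 8x8 transpose is done as four 4x4
     * merge-based transposes, with the two off-diagonal quadrants
     * exchanged between the register sets. */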
    x0 = vec_mergel(b00, b20);
    x1 = vec_mergeh(b00, b20);
    x2 = vec_mergel(b10, b30);
    x3 = vec_mergeh(b10, b30);

    b00 = vec_mergeh(x1, x3);
    b10 = vec_mergel(x1, x3);
    b20 = vec_mergeh(x0, x2);
    b30 = vec_mergel(x0, x2);

    x4 = vec_mergel(b41, b61);
    x5 = vec_mergeh(b41, b61);
    x6 = vec_mergel(b51, b71);
    x7 = vec_mergeh(b51, b71);

    b41 = vec_mergeh(x5, x7);
    b51 = vec_mergel(x5, x7);
    b61 = vec_mergeh(x4, x6);
    b71 = vec_mergel(x4, x6);

    x0 = vec_mergel(b01, b21);
    x1 = vec_mergeh(b01, b21);
    x2 = vec_mergel(b11, b31);
    x3 = vec_mergeh(b11, b31);

    x4 = vec_mergel(b40, b60);
    x5 = vec_mergeh(b40, b60);
    x6 = vec_mergel(b50, b70);
    x7 = vec_mergeh(b50, b70);

    b40 = vec_mergeh(x1, x3);
    b50 = vec_mergel(x1, x3);
    b60 = vec_mergeh(x0, x2);
    b70 = vec_mergel(x0, x2);

    b01 = vec_mergeh(x5, x7);
    b11 = vec_mergel(x5, x7);
    b21 = vec_mergeh(x4, x6);
    b31 = vec_mergel(x4, x6);
    /* }}} */

    FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70);
    FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71);

    /* round, convert back to short {{{ */
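/* CTS(n): vec_round rounds to nearest, vec_cts(x, 0) converts float to
 * int32 (saturating on overflow), and vec_pack narrows two int32 vectors
 * to eight int16 lanes; the DCT output is expected to fit in 16 bits, so
 * the modulo pack is sufficient here. */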
#define CTS(n)                                                      \
    b ## n ## 0 = vec_round(b ## n ## 0);                           \
    b ## n ## 1 = vec_round(b ## n ## 1);                           \
    b ## n ## 0 = ((vector float) vec_cts(b ## n ## 0, 0));         \
    b ## n ## 1 = ((vector float) vec_cts(b ## n ## 1, 0));         \
    b ## n ## 0 = ((vector float) vec_pack(vs32(b ## n ## 0),       \
                                           vs32(b ## n ## 1)));     \
    vec_st(vs16(b ## n ## 0), 0, bp)

    bp = (vector signed short *) block;
    CTS(0);
    bp++;
    CTS(1);
    bp++;
    CTS(2);
    bp++;
    CTS(3);
    bp++;
    CTS(4);
    bp++;
    CTS(5);
    bp++;
    CTS(6);
    bp++;
    CTS(7);

#undef CTS
    /* }}} */
}

#endif /* HAVE_ALTIVEC */

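/* Runtime dispatch: install the AltiVec FDCT only when the CPU reports
 * AltiVec support, the codec runs at 8-bit depth, and dct_algo is either
 * left on auto or explicitly set to FF_DCT_ALTIVEC. */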
av_cold void ff_fdctdsp_init_ppc(FDCTDSPContext *c, AVCodecContext *avctx,
                                 unsigned high_bit_depth)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;

    if (!high_bit_depth) {
        if (avctx->dct_algo == FF_DCT_AUTO ||
            avctx->dct_algo == FF_DCT_ALTIVEC) {
            c->fdct = ff_fdct_altivec;
        }
    }
#endif /* HAVE_ALTIVEC */
}
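
/*
 * Usage sketch (illustrative only): the transform works in place on a
 * 16-byte-aligned block of 64 int16 coefficients, as the vec_ld/vec_st
 * accesses above require:
 *
 *     DECLARE_ALIGNED(16, int16_t, block)[64];
 *     // ... fill block with pixel or residual values ...
 *     ff_fdct_altivec(block);  // block now holds the DCT coefficients
 */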