/* Source listing of commit 2ba45a60 (DM) */
1 | #ifndef AVCODEC_PPC_FFT_VSX_H |
2 | #define AVCODEC_PPC_FFT_VSX_H | |
3 | /* | |
4 | * FFT transform, optimized with VSX built-in functions | |
5 | * Copyright (c) 2014 Rong Yan Copyright (c) 2009 Loren Merritt | |
6 | * | |
7 | * This algorithm (though not any of the implementation details) is | |
8 | * based on libdjbfft by D. J. Bernstein, and fft_altivec_s.S. | |
9 | * | |
10 | * This file is part of FFmpeg. | |
11 | * | |
12 | * FFmpeg is free software; you can redistribute it and/or | |
13 | * modify it under the terms of the GNU Lesser General Public | |
14 | * License as published by the Free Software Foundation; either | |
15 | * version 2.1 of the License, or (at your option) any later version. | |
16 | * | |
17 | * FFmpeg is distributed in the hope that it will be useful, | |
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
20 | * Lesser General Public License for more details. | |
21 | * | |
22 | * You should have received a copy of the GNU Lesser General Public | |
23 | * License along with FFmpeg; if not, write to the Free Software | |
24 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
25 | */ | |
26 | ||
27 | ||
28 | #include "config.h" | |
29 | #include "libavutil/cpu.h" | |
30 | #include "libavutil/ppc/types_altivec.h" | |
31 | #include "libavutil/ppc/util_altivec.h" | |
32 | #include "libavcodec/fft.h" | |
33 | #include "libavcodec/fft-internal.h" | |
34 | ||
35 | #if HAVE_VSX | |
36 | ||
37 | void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z); | |
38 | void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z); | |
39 | ||
40 | ||
41 | #define byte_2complex (2*sizeof(FFTComplex)) | |
42 | #define byte_4complex (4*sizeof(FFTComplex)) | |
43 | #define byte_6complex (6*sizeof(FFTComplex)) | |
44 | #define byte_8complex (8*sizeof(FFTComplex)) | |
45 | #define byte_10complex (10*sizeof(FFTComplex)) | |
46 | #define byte_12complex (12*sizeof(FFTComplex)) | |
47 | #define byte_14complex (14*sizeof(FFTComplex)) | |
48 | ||
/**
 * One radix-4 combine pass over complex data stored interleaved
 * (re,im pairs), processing four butterflies (two vector lanes' worth)
 * per iteration using VSX vectors.
 *
 * @param z   in/out array of FFTComplex; the pass reads/writes the four
 *            quarters at offsets 0, o1, o2 and o3 elements.
 * @param wre twiddle-factor table; the imaginary part is read backwards
 *            starting at wre + 2*n (see wim below).
 * @param n   quarter length; NOTE(review): the first butterfly group is
 *            done before the loop and the loop then runs n-2 more in
 *            steps of 2, so n is presumably >= 4 and even — confirm
 *            against callers.
 */
inline static void pass_vsx_interleave(FFTComplex *z, const FFTSample *wre, unsigned int n)
{
    int o1 = n<<1;                       /* element offset of 2nd quarter */
    int o2 = n<<2;                       /* element offset of 3rd quarter */
    int o3 = o1+o2;                      /* element offset of 4th quarter */
    int i1, i2, i3;                      /* same offsets in bytes */
    FFTSample* out = (FFTSample*)z;
    const FFTSample *wim = wre+o1;       /* imaginary twiddles, read with negative indices */
    vec_f vz0, vzo1, vzo2, vzo3;
    vec_f x0, x1, x2, x3;
    vec_f x4, x5, x6, x7;
    vec_f x8, x9, x10, x11;
    vec_f x12, x13, x14, x15;
    vec_f x16, x17, x18, x19;
    vec_f x20, x21, x22, x23;
    vec_f vz0plus1, vzo1plus1, vzo2plus1, vzo3plus1;
    vec_f y0, y1, y2, y3;
    vec_f y4, y5, y8, y9;
    vec_f y10, y13, y14, y15;
    vec_f y16, y17, y18, y19;
    vec_f y20, y21, y22, y23;
    vec_f wr1, wi1, wr0, wi0;
    vec_f wr2, wi2, wr3, wi3;
    vec_f xmulwi0, xmulwi1, ymulwi2, ymulwi3;

    n = n-2;                             /* loop below handles the remaining n-2 pairs */
    i1 = o1*sizeof(FFTComplex);
    i2 = o2*sizeof(FFTComplex);
    i3 = o3*sizeof(FFTComplex);
    /* Load two complex values (16 bytes) from each quarter, plus the
     * following pair ("plus1" vectors). */
    vzo2 = vec_ld(i2, &(out[0]));        // zo2.r  zo2.i  z(o2+1).r  z(o2+1).i
    vzo2plus1 = vec_ld(i2+16, &(out[0]));
    vzo3 = vec_ld(i3, &(out[0]));        // zo3.r  zo3.i  z(o3+1).r  z(o3+1).i
    vzo3plus1 = vec_ld(i3+16, &(out[0]));
    vz0 = vec_ld(0, &(out[0]));          // z0.r  z0.i  z1.r  z1.i
    vz0plus1 = vec_ld(16, &(out[0]));
    vzo1 = vec_ld(i1, &(out[0]));        // zo1.r  zo1.i  z(o1+1).r  z(o1+1).i
    vzo1plus1 = vec_ld(i1+16, &(out[0]));

    /* Sum/difference of the two upper quarters. */
    x0 = vec_add(vzo2, vzo3);
    x1 = vec_sub(vzo2, vzo3);
    y0 = vec_add(vzo2plus1, vzo3plus1);
    y1 = vec_sub(vzo2plus1, vzo3plus1);

    /* Splat scalar twiddles across full vectors.  The first butterfly
     * (index 0) uses twiddle 1+0i, so wr0/wi0 are not needed here. */
    wr1 = vec_splats(wre[1]);
    wi1 = vec_splats(wim[-1]);
    wi2 = vec_splats(wim[-2]);
    wi3 = vec_splats(wim[-3]);
    wr2 = vec_splats(wre[2]);
    wr3 = vec_splats(wre[3]);

    /* Rearrange so the complex multiply by the twiddle can be done with
     * mul/madd/msub on whole vectors. */
    x2 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
    x3 = vec_perm(x0, x1, vcprm(s3,3,s2,2));

    y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
    y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
    y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
    y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));

    ymulwi2 = vec_mul(y4, wi2);
    ymulwi3 = vec_mul(y5, wi3);
    x4 = vec_mul(x2, wr1);
    x5 = vec_mul(x3, wi1);
    y8 = vec_madd(y2, wr2, ymulwi2);
    y9 = vec_msub(y2, wr2, ymulwi2);
    x6 = vec_add(x4, x5);
    x7 = vec_sub(x4, x5);
    y13 = vec_madd(y3, wr3, ymulwi3);
    y14 = vec_msub(y3, wr3, ymulwi3);

    x8 = vec_perm(x6, x7, vcprm(0,1,s2,s3));
    y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
    y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));

    x9 = vec_perm(x0, x8, vcprm(0,1,s0,s2));
    x10 = vec_perm(x1, x8, vcprm(1,0,s3,s1));

    y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
    y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));

    /* Final add/sub against the two lower quarters. */
    x11 = vec_add(vz0, x9);
    x12 = vec_sub(vz0, x9);
    x13 = vec_add(vzo1, x10);
    x14 = vec_sub(vzo1, x10);

    y18 = vec_add(vz0plus1, y16);
    y19 = vec_sub(vz0plus1, y16);
    y20 = vec_add(vzo1plus1, y17);
    y21 = vec_sub(vzo1plus1, y17);

    x15 = vec_perm(x13, x14, vcprm(0,s1,2,s3));
    x16 = vec_perm(x13, x14, vcprm(s0,1,s2,3));
    y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
    y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));

    /* Store results back to the four quarters. */
    vec_st(x11, 0, &(out[0]));
    vec_st(y18, 16, &(out[0]));
    vec_st(x15, i1, &(out[0]));
    vec_st(y22, i1+16, &(out[0]));
    vec_st(x12, i2, &(out[0]));
    vec_st(y19, i2+16, &(out[0]));
    vec_st(x16, i3, &(out[0]));
    vec_st(y23, i3+16, &(out[0]));

    /* Remaining butterflies: same computation as above, but all four
     * twiddles (including index 0) come from the tables. */
    do {
        out += 8;                        /* advance 4 complex values */
        wre += 4;
        wim -= 4;
        wr0 = vec_splats(wre[0]);
        wr1 = vec_splats(wre[1]);
        wi0 = vec_splats(wim[0]);
        wi1 = vec_splats(wim[-1]);

        wr2 = vec_splats(wre[2]);
        wr3 = vec_splats(wre[3]);
        wi2 = vec_splats(wim[-2]);
        wi3 = vec_splats(wim[-3]);

        vzo2 = vec_ld(i2, &(out[0]));    // zo2.r  zo2.i  z(o2+1).r  z(o2+1).i
        vzo2plus1 = vec_ld(i2+16, &(out[0]));
        vzo3 = vec_ld(i3, &(out[0]));    // zo3.r  zo3.i  z(o3+1).r  z(o3+1).i
        vzo3plus1 = vec_ld(i3+16, &(out[0]));
        vz0 = vec_ld(0, &(out[0]));      // z0.r  z0.i  z1.r  z1.i
        vz0plus1 = vec_ld(16, &(out[0]));
        vzo1 = vec_ld(i1, &(out[0]));    // zo1.r  zo1.i  z(o1+1).r  z(o1+1).i
        vzo1plus1 = vec_ld(i1+16, &(out[0]));

        x0 = vec_add(vzo2, vzo3);
        x1 = vec_sub(vzo2, vzo3);

        y0 = vec_add(vzo2plus1, vzo3plus1);
        y1 = vec_sub(vzo2plus1, vzo3plus1);

        x4 = vec_perm(x0, x1, vcprm(s1,1,s0,0));
        x5 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
        x2 = vec_perm(x0, x1, vcprm(0,s0,1,s1));
        x3 = vec_perm(x0, x1, vcprm(2,s2,3,s3));

        y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
        y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
        xmulwi0 = vec_mul(x4, wi0);
        xmulwi1 = vec_mul(x5, wi1);

        y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
        y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));

        x8 = vec_madd(x2, wr0, xmulwi0);
        x9 = vec_msub(x2, wr0, xmulwi0);
        ymulwi2 = vec_mul(y4, wi2);
        ymulwi3 = vec_mul(y5, wi3);

        x13 = vec_madd(x3, wr1, xmulwi1);
        x14 = vec_msub(x3, wr1, xmulwi1);

        y8 = vec_madd(y2, wr2, ymulwi2);
        y9 = vec_msub(y2, wr2, ymulwi2);
        y13 = vec_madd(y3, wr3, ymulwi3);
        y14 = vec_msub(y3, wr3, ymulwi3);

        x10 = vec_perm(x8, x9, vcprm(0,1,s2,s3));
        x15 = vec_perm(x13, x14, vcprm(0,1,s2,s3));

        y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
        y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));

        x16 = vec_perm(x10, x15, vcprm(0,2,s0,s2));
        x17 = vec_perm(x10, x15, vcprm(3,1,s3,s1));

        y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
        y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));

        x18 = vec_add(vz0, x16);
        x19 = vec_sub(vz0, x16);
        x20 = vec_add(vzo1, x17);
        x21 = vec_sub(vzo1, x17);

        y18 = vec_add(vz0plus1, y16);
        y19 = vec_sub(vz0plus1, y16);
        y20 = vec_add(vzo1plus1, y17);
        y21 = vec_sub(vzo1plus1, y17);

        x22 = vec_perm(x20, x21, vcprm(0,s1,2,s3));
        x23 = vec_perm(x20, x21, vcprm(s0,1,s2,3));

        y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
        y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));

        vec_st(x18, 0, &(out[0]));
        vec_st(y18, 16, &(out[0]));
        vec_st(x22, i1, &(out[0]));
        vec_st(y22, i1+16, &(out[0]));
        vec_st(x19, i2, &(out[0]));
        vec_st(y19, i2+16, &(out[0]));
        vec_st(x23, i3, &(out[0]));
        vec_st(y23, i3+16, &(out[0]));
    } while (n-=2);
}
246 | ||
247 | inline static void fft2_vsx_interleave(FFTComplex *z) | |
248 | { | |
249 | FFTSample r1, i1; | |
250 | ||
251 | r1 = z[0].re - z[1].re; | |
252 | z[0].re += z[1].re; | |
253 | z[1].re = r1; | |
254 | ||
255 | i1 = z[0].im - z[1].im; | |
256 | z[0].im += z[1].im; | |
257 | z[1].im = i1; | |
258 | } | |
259 | ||
260 | inline static void fft4_vsx_interleave(FFTComplex *z) | |
261 | { | |
262 | vec_f a, b, c, d; | |
263 | float* out= (float*)z; | |
264 | a = vec_ld(0, &(out[0])); | |
265 | b = vec_ld(byte_2complex, &(out[0])); | |
266 | ||
267 | c = vec_perm(a, b, vcprm(0,1,s2,s1)); | |
268 | d = vec_perm(a, b, vcprm(2,3,s0,s3)); | |
269 | a = vec_add(c, d); | |
270 | b = vec_sub(c, d); | |
271 | ||
272 | c = vec_perm(a, b, vcprm(0,1,s0,s1)); | |
273 | d = vec_perm(a, b, vcprm(2,3,s3,s2)); | |
274 | ||
275 | a = vec_add(c, d); | |
276 | b = vec_sub(c, d); | |
277 | vec_st(a, 0, &(out[0])); | |
278 | vec_st(b, byte_2complex, &(out[0])); | |
279 | } | |
280 | ||
/**
 * 8-point FFT on interleaved complex data, computed entirely in
 * registers with VSX permutes; the only non-trivial twiddle is
 * sqrt(1/2), splatted in vc1.
 */
inline static void fft8_vsx_interleave(FFTComplex *z)
{
    vec_f vz0, vz1, vz2, vz3;
    vec_f x0, x1, x2, x3;
    vec_f x4, x5, x6, x7;
    vec_f x8, x9, x10, x11;
    vec_f x12, x13, x14, x15;
    vec_f x16, x17, x18, x19;
    vec_f x20, x21, x22, x23;
    vec_f x24, x25, x26, x27;
    vec_f x28, x29, x30, x31;
    vec_f x32, x33, x34;

    float* out= (float*)z;
    vec_f vc1 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};

    /* load the 8 complex inputs (4 vectors of 2 complex values each) */
    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));

    x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
    x2 = vec_perm(vz2, vz3, vcprm(2,1,s0,s1));
    x3 = vec_perm(vz2, vz3, vcprm(0,3,s2,s3));

    x4 = vec_add(x0, x1);
    x5 = vec_sub(x0, x1);
    x6 = vec_add(x2, x3);
    x7 = vec_sub(x2, x3);

    x8 = vec_perm(x4, x5, vcprm(0,1,s0,s1));
    x9 = vec_perm(x4, x5, vcprm(2,3,s3,s2));
    x10 = vec_perm(x6, x7, vcprm(2,1,s2,s1));
    x11 = vec_perm(x6, x7, vcprm(0,3,s0,s3));

    x12 = vec_add(x8, x9);
    x13 = vec_sub(x8, x9);
    x14 = vec_add(x10, x11);
    x15 = vec_sub(x10, x11);
    x16 = vec_perm(x12, x13, vcprm(0,s0,1,s1));
    x17 = vec_perm(x14, x15, vcprm(0,s0,1,s1));
    x18 = vec_perm(x16, x17, vcprm(s0,s3,s2,s1));
    x19 = vec_add(x16, x18); // z0.r  z2.r  z0.i  z2.i
    x20 = vec_sub(x16, x18); // z4.r  z6.r  z4.i  z6.i

    x21 = vec_perm(x12, x13, vcprm(2,s2,3,s3));
    x22 = vec_perm(x14, x15, vcprm(2,3,s2,s3));
    x23 = vec_perm(x14, x15, vcprm(3,2,s3,s2));
    x24 = vec_add(x22, x23);
    x25 = vec_sub(x22, x23);
    /* odd-index outputs need the sqrt(1/2) twiddle */
    x26 = vec_mul( vec_perm(x24, x25, vcprm(2,s2,0,s0)), vc1);

    x27 = vec_add(x21, x26); // z1.r  z7.r  z1.i  z3.i
    x28 = vec_sub(x21, x26); // z5.r  z3.r  z5.i  z7.i

    /* re-interleave into natural z0..z7 order for the stores */
    x29 = vec_perm(x19, x27, vcprm(0,2,s0,s2)); // z0.r  z0.i  z1.r  z1.i
    x30 = vec_perm(x19, x27, vcprm(1,3,s1,s3)); // z2.r  z2.i  z7.r  z3.i
    x31 = vec_perm(x20, x28, vcprm(0,2,s0,s2)); // z4.r  z4.i  z5.r  z5.i
    x32 = vec_perm(x20, x28, vcprm(1,3,s1,s3)); // z6.r  z6.i  z3.r  z7.i
    x33 = vec_perm(x30, x32, vcprm(0,1,s2,3));  // z2.r  z2.i  z3.r  z3.i
    x34 = vec_perm(x30, x32, vcprm(s0,s1,2,s3)); // z6.r  z6.i  z7.r  z7.i

    vec_st(x29, 0, &(out[0]));
    vec_st(x33, byte_2complex, &(out[0]));
    vec_st(x31, byte_4complex, &(out[0]));
    vec_st(x34, byte_6complex, &(out[0]));
}
349 | ||
/**
 * 16-point FFT on interleaved complex data.  Twiddles used:
 * sqrt(1/2) (vc0) and cos(pi/8), cos(3*pi/8) from ff_cos_16 (vc1, vc2).
 * All work happens in registers between the 8 loads and 8 stores.
 */
inline static void fft16_vsx_interleave(FFTComplex *z)
{
    float* out= (float*)z;
    vec_f vc0 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
    vec_f vc1 = {ff_cos_16[1], ff_cos_16[1], ff_cos_16[1], ff_cos_16[1]};
    vec_f vc2 = {ff_cos_16[3], ff_cos_16[3], ff_cos_16[3], ff_cos_16[3]};
    vec_f vz0, vz1, vz2, vz3;
    vec_f vz4, vz5, vz6, vz7;
    vec_f x0, x1, x2, x3;
    vec_f x4, x5, x6, x7;
    vec_f x8, x9, x10, x11;
    vec_f x12, x13, x14, x15;
    vec_f x16, x17, x18, x19;
    vec_f x20, x21, x22, x23;
    vec_f x24, x25, x26, x27;
    vec_f x28, x29, x30, x31;
    vec_f x32, x33, x34, x35;
    vec_f x36, x37, x38, x39;
    vec_f x40, x41, x42, x43;
    vec_f x44, x45, x46, x47;
    vec_f x48, x49, x50, x51;
    vec_f x52, x53, x54, x55;
    vec_f x56, x57, x58, x59;
    vec_f x60, x61, x62, x63;
    vec_f x64, x65, x66, x67;
    vec_f x68, x69, x70, x71;
    vec_f x72, x73, x74, x75;
    vec_f x76, x77, x78, x79;
    vec_f x80, x81, x82, x83;
    vec_f x84, x85, x86;

    /* load all 16 complex inputs */
    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));
    vz4 = vec_ld(byte_8complex, &(out[0]));
    vz5 = vec_ld(byte_10complex, &(out[0]));
    vz6 = vec_ld(byte_12complex, &(out[0]));
    vz7 = vec_ld(byte_14complex, &(out[0]));

    x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
    x2 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
    x3 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));

    x4 = vec_perm(vz4, vz5, vcprm(0,1,s2,s1));
    x5 = vec_perm(vz4, vz5, vcprm(2,3,s0,s3));
    x6 = vec_perm(vz6, vz7, vcprm(0,1,s2,s1));
    x7 = vec_perm(vz6, vz7, vcprm(2,3,s0,s3));

    x8 = vec_add(x0, x1);
    x9 = vec_sub(x0, x1);
    x10 = vec_add(x2, x3);
    x11 = vec_sub(x2, x3);

    x12 = vec_add(x4, x5);
    x13 = vec_sub(x4, x5);
    x14 = vec_add(x6, x7);
    x15 = vec_sub(x6, x7);

    x16 = vec_perm(x8, x9, vcprm(0,1,s0,s1));
    x17 = vec_perm(x8, x9, vcprm(2,3,s3,s2));
    x18 = vec_perm(x10, x11, vcprm(2,1,s1,s2));
    x19 = vec_perm(x10, x11, vcprm(0,3,s0,s3));
    x20 = vec_perm(x12, x14, vcprm(0,1,s0, s1));
    x21 = vec_perm(x12, x14, vcprm(2,3,s2,s3));
    x22 = vec_perm(x13, x15, vcprm(0,1,s0,s1));
    x23 = vec_perm(x13, x15, vcprm(3,2,s3,s2));

    x24 = vec_add(x16, x17);
    x25 = vec_sub(x16, x17);
    x26 = vec_add(x18, x19);
    x27 = vec_sub(x18, x19);
    x28 = vec_add(x20, x21);
    x29 = vec_sub(x20, x21);
    x30 = vec_add(x22, x23);
    x31 = vec_sub(x22, x23);

    x32 = vec_add(x24, x26);
    x33 = vec_sub(x24, x26);
    x34 = vec_perm(x32, x33, vcprm(0,1,s0,s1));

    x35 = vec_perm(x28, x29, vcprm(2,1,s1,s2));
    x36 = vec_perm(x28, x29, vcprm(0,3,s0,s3));
    x37 = vec_add(x35, x36);
    x38 = vec_sub(x35, x36);
    x39 = vec_perm(x37, x38, vcprm(0,1,s1,s0));

    x40 = vec_perm(x27, x38, vcprm(3,2,s2,s3));
    x41 = vec_perm(x26, x37, vcprm(2,3,s3,s2));
    x42 = vec_add(x40, x41);
    x43 = vec_sub(x40, x41);
    /* apply the sqrt(1/2) twiddle */
    x44 = vec_mul(x42, vc0);
    x45 = vec_mul(x43, vc0);

    x46 = vec_add(x34, x39); // z0.r  z0.i  z4.r  z4.i
    x47 = vec_sub(x34, x39); // z8.r  z8.i  z12.r  z12.i

    x48 = vec_perm(x30, x31, vcprm(2,1,s1,s2));
    x49 = vec_perm(x30, x31, vcprm(0,3,s3,s0));
    x50 = vec_add(x48, x49);
    x51 = vec_sub(x48, x49);
    /* apply the cos(pi/8) / cos(3*pi/8) twiddles */
    x52 = vec_mul(x50, vc1);
    x53 = vec_mul(x50, vc2);
    x54 = vec_mul(x51, vc1);
    x55 = vec_mul(x51, vc2);

    x56 = vec_perm(x24, x25, vcprm(2,3,s2,s3));
    x57 = vec_perm(x44, x45, vcprm(0,1,s1,s0));
    x58 = vec_add(x56, x57);
    x59 = vec_sub(x56, x57);

    x60 = vec_perm(x54, x55, vcprm(1,0,3,2));
    x61 = vec_perm(x54, x55, vcprm(s1,s0,s3,s2));
    x62 = vec_add(x52, x61);
    x63 = vec_sub(x52, x61);
    x64 = vec_add(x60, x53);
    x65 = vec_sub(x60, x53);
    x66 = vec_perm(x62, x64, vcprm(0,1,s3,s2));
    x67 = vec_perm(x63, x65, vcprm(s0,s1,3,2));

    x68 = vec_add(x58, x66); // z1.r    z1.i  z3.r    z3.i
    x69 = vec_sub(x58, x66); // z9.r    z9.i  z11.r  z11.i
    x70 = vec_add(x59, x67); // z5.r    z5.i  z15.r  z15.i
    x71 = vec_sub(x59, x67); // z13.r  z13.i z7.r   z7.i

    x72 = vec_perm(x25, x27, vcprm(s1,s0,s2,s3));
    x73 = vec_add(x25, x72);
    x74 = vec_sub(x25, x72);
    x75 = vec_perm(x73, x74, vcprm(0,1,s0,s1));
    x76 = vec_perm(x44, x45, vcprm(3,2,s2,s3));
    x77 = vec_add(x75, x76); // z2.r   z2.i  z6.r    z6.i
    x78 = vec_sub(x75, x76); // z10.r  z10.i z14.r  z14.i

    /* re-interleave into natural z0..z15 order and store */
    x79 = vec_perm(x46, x68, vcprm(0,1,s0,s1)); // z0.r  z0.i  z1.r  z1.i
    x80 = vec_perm(x77, x68, vcprm(0,1,s2,s3)); // z2.r  z2.i  z3.r  z3.i
    x81 = vec_perm(x46, x70, vcprm(2,3,s0,s1)); // z4.r  z4.i  z5.r  z5.i
    x82 = vec_perm(x71, x77, vcprm(s2,s3,2,3)); // z6.r  z6.i  z7.r  z7.i
    vec_st(x79, 0, &(out[0]));
    vec_st(x80, byte_2complex, &(out[0]));
    vec_st(x81, byte_4complex, &(out[0]));
    vec_st(x82, byte_6complex, &(out[0]));
    x83 = vec_perm(x47, x69, vcprm(0,1,s0,s1)); // z8.r  z8.i  z9.r  z9.i
    x84 = vec_perm(x78, x69, vcprm(0,1,s2,s3)); // z10.r  z10.i  z11.r  z11.i
    x85 = vec_perm(x47, x71, vcprm(2,3,s0,s1)); // z12.r  z12.i  z13.r  z13.i
    x86 = vec_perm(x70, x78, vcprm(s2,s3,2,3)); // z14.r  z14.i  z15.r  z15.i
    vec_st(x83, byte_8complex, &(out[0]));
    vec_st(x84, byte_10complex, &(out[0]));
    vec_st(x85, byte_12complex, &(out[0]));
    vec_st(x86, byte_14complex, &(out[0]));
}
501 | ||
502 | inline static void fft4_vsx(FFTComplex *z) | |
503 | { | |
504 | vec_f a, b, c, d; | |
505 | float* out= (float*)z; | |
506 | a = vec_ld(0, &(out[0])); | |
507 | b = vec_ld(byte_2complex, &(out[0])); | |
508 | ||
509 | c = vec_perm(a, b, vcprm(0,1,s2,s1)); | |
510 | d = vec_perm(a, b, vcprm(2,3,s0,s3)); | |
511 | a = vec_add(c, d); | |
512 | b = vec_sub(c, d); | |
513 | ||
514 | c = vec_perm(a,b, vcprm(0,s0,1,s1)); | |
515 | d = vec_perm(a, b, vcprm(2,s3,3,s2)); | |
516 | ||
517 | a = vec_add(c, d); | |
518 | b = vec_sub(c, d); | |
519 | ||
520 | c = vec_perm(a, b, vcprm(0,1,s0,s1)); | |
521 | d = vec_perm(a, b, vcprm(2,3,s2,s3)); | |
522 | ||
523 | vec_st(c, 0, &(out[0])); | |
524 | vec_st(d, byte_2complex, &(out[0])); | |
525 | return; | |
526 | } | |
527 | ||
/**
 * 8-point FFT (non-interleaved variant).  The sqrt(1/2) twiddle is
 * applied with a signed constant vector (vc1) plus a rotated copy of
 * the same data multiplied by vc2.
 */
inline static void fft8_vsx(FFTComplex *z)
{
    vec_f vz0, vz1, vz2, vz3;
    vec_f vz4, vz5, vz6, vz7, vz8;

    float* out= (float*)z;
    vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
    vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
    vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));

    vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));

    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);
    vz8 = vec_perm(vz3, vz3, vcprm(2,3,0,1)); /* swap halves for the twiddle */

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);

    /* multiply by +/-sqrt(1/2): vz3*vc1 + vz8*vc2 */
    vz3 = vec_madd(vz3, vc1, vc0);
    vz3 = vec_madd(vz8, vc2, vz3);

    vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
    vz6 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
    vz7 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);
    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);

    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
    vz6 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
    vz7 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));

    /* final combine */
    vz2 = vec_sub(vz4, vz6);
    vz3 = vec_sub(vz5, vz7);

    vz0 = vec_add(vz4, vz6);
    vz1 = vec_add(vz5, vz7);

    vec_st(vz0, 0, &(out[0]));
    vec_st(vz1, byte_2complex, &(out[0]));
    vec_st(vz2, byte_4complex, &(out[0]));
    vec_st(vz3, byte_6complex, &(out[0]));
    return;
}
586 | ||
/**
 * 16-point FFT (non-interleaved variant).  The upper half (z[8..15]) is
 * transformed first and held in vz4..vz7; the lower half then goes
 * through an 8-point transform like fft8_vsx, and the halves are
 * combined using the cos(k*pi/8) twiddle vectors vc3..vc5
 * (1, 0.92387953 = cos(pi/8), sqrt(1/2), 0.38268343 = cos(3*pi/8)).
 */
inline static void fft16_vsx(FFTComplex *z)
{
    float* out= (float*)z;
    vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
    vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
    vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
    vec_f vc3 = {1.0, 0.92387953, sqrthalf, 0.38268343};
    vec_f vc4 = {0.0, 0.38268343, sqrthalf, 0.92387953};
    vec_f vc5 = {-0.0, -0.38268343, -sqrthalf, -0.92387953};

    vec_f vz0, vz1, vz2, vz3;
    vec_f vz4, vz5, vz6, vz7;
    vec_f vz8, vz9, vz10, vz11;
    vec_f vz12, vz13;

    /* upper half: two 4-point transforms on z[8..15] */
    vz0 = vec_ld(byte_8complex, &(out[0]));
    vz1 = vec_ld(byte_10complex, &(out[0]));
    vz2 = vec_ld(byte_12complex, &(out[0]));
    vz3 = vec_ld(byte_14complex, &(out[0]));

    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
    vz6 = vec_perm(vz2, vz3, vcprm(0,1,s2,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,3,s0,s3));

    vz0 = vec_add(vz4, vz5);
    vz1= vec_sub(vz4, vz5);
    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);

    vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
    vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,s3,3,s2));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);
    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);

    /* vz4..vz7 hold the transformed upper half until the combine below */
    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));

    vz6 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));

    /* lower half: 8-point transform on z[0..7] (same as fft8_vsx) */
    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));
    vz10 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
    vz11 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
    vz8 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    vz9 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));

    vz2 = vec_add(vz10, vz11);
    vz3 = vec_sub(vz10, vz11);
    vz12 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
    vz0 = vec_add(vz8, vz9);
    vz1 = vec_sub(vz8, vz9);

    vz3 = vec_madd(vz3, vc1, vc0);
    vz3 = vec_madd(vz12, vc2, vz3);
    vz8 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
    vz9 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
    vz10 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
    vz11 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));

    vz0 = vec_add(vz8, vz9);
    vz1 = vec_sub(vz8, vz9);
    vz2 = vec_add(vz10, vz11);
    vz3 = vec_sub(vz10, vz11);

    vz8 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
    vz9 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
    vz10 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
    vz11 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));

    vz2 = vec_sub(vz8, vz10);
    vz3 = vec_sub(vz9, vz11);
    vz0 = vec_add(vz8, vz10);
    vz1 = vec_add(vz9, vz11);

    /* complex multiply of the upper half by the twiddle vectors */
    vz8 = vec_madd(vz4, vc3, vc0);
    vz9 = vec_madd(vz5, vc3, vc0);
    vz10 = vec_madd(vz6, vc3, vc0);
    vz11 = vec_madd(vz7, vc3, vc0);

    vz8 = vec_madd(vz5, vc4, vz8);
    vz9 = vec_madd(vz4, vc5, vz9);
    vz10 = vec_madd(vz7, vc5, vz10);
    vz11 = vec_madd(vz6, vc4, vz11);

    /* combine halves into the 16 outputs */
    vz12 = vec_sub(vz10, vz8);
    vz10 = vec_add(vz10, vz8);

    vz13 = vec_sub(vz9, vz11);
    vz11 = vec_add(vz9, vz11);

    vz4 = vec_sub(vz0, vz10);
    vz0 = vec_add(vz0, vz10);

    vz7= vec_sub(vz3, vz12);
    vz3= vec_add(vz3, vz12);

    vz5 = vec_sub(vz1, vz11);
    vz1 = vec_add(vz1, vz11);

    vz6 = vec_sub(vz2, vz13);
    vz2 = vec_add(vz2, vz13);

    vec_st(vz0, 0, &(out[0]));
    vec_st(vz1, byte_2complex, &(out[0]));
    vec_st(vz2, byte_4complex, &(out[0]));
    vec_st(vz3, byte_6complex, &(out[0]));
    vec_st(vz4, byte_8complex, &(out[0]));
    vec_st(vz5, byte_10complex, &(out[0]));
    vec_st(vz6, byte_12complex, &(out[0]));
    vec_st(vz7, byte_14complex, &(out[0]));
    return;

}
/**
 * One radix-4 combine pass for the non-interleaved (split re/im) data
 * layout: each 32-byte group holds 4 reals followed by 4 imaginaries.
 * Processes four butterflies per iteration.
 *
 * @param z   in/out array; quarters live at element offsets 0, o1, o2, o3.
 * @param wre real twiddle table; imaginary parts are read backwards via
 *            wim = wre + 2*n (the vec_perm on v9 stitches wim[-4..0]
 *            into reversed order).
 * @param n   quarter length; NOTE(review): first group is unrolled
 *            before the loop, which then runs n-2 more in steps of 2 —
 *            presumably n >= 4 and even; confirm against callers.
 */
inline static void pass_vsx(FFTComplex * z, const FFTSample * wre, unsigned int n)
{
    int o1 = n<<1;                       /* element offset of 2nd quarter */
    int o2 = n<<2;                       /* element offset of 3rd quarter */
    int o3 = o1+o2;                      /* element offset of 4th quarter */
    int i1, i2, i3;                      /* byte offsets of the quarters */
    FFTSample* out = (FFTSample*)z;
    const FFTSample *wim = wre+o1;
    vec_f v0, v1, v2, v3;
    vec_f v4, v5, v6, v7;
    vec_f v8, v9, v10, v11;
    vec_f v12, v13;

    n = n-2;
    i1 = o1*sizeof(FFTComplex);
    i2 = o2*sizeof(FFTComplex);
    i3 = o3*sizeof(FFTComplex);

    /* v8 = 4 real twiddles, v9 = 4 imaginary twiddles in reversed order */
    v8 = vec_ld(0, &(wre[0]));
    v10 = vec_ld(0, &(wim[0]));
    v9 = vec_ld(0, &(wim[-4]));
    v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));

    v4 = vec_ld(i2, &(out[0]));
    v5 = vec_ld(i2+16, &(out[0]));
    v6 = vec_ld(i3, &(out[0]));
    v7 = vec_ld(i3+16, &(out[0]));
    v10 = vec_mul(v4, v8); // r2*wre
    v11 = vec_mul(v5, v8); // i2*wre
    v12 = vec_mul(v6, v8); // r3*wre
    v13 = vec_mul(v7, v8); // i3*wre

    v0 = vec_ld(0, &(out[0])); // r0
    v3 = vec_ld(i1+16, &(out[0])); // i1
    /* complete the complex multiplies by the twiddles */
    v10 = vec_madd(v5, v9, v10); // r2*wim
    v11 = vec_nmsub(v4, v9, v11); // i2*wim
    v12 = vec_nmsub(v7, v9, v12); // r3*wim
    v13 = vec_madd(v6, v9, v13); // i3*wim

    v1 = vec_ld(16, &(out[0])); // i0
    v2 = vec_ld(i1, &(out[0])); // r1
    v8 = vec_sub(v12, v10);
    v12 = vec_add(v12, v10);
    v9 = vec_sub(v11, v13);
    v13 = vec_add(v11, v13);
    v4 = vec_sub(v0, v12);
    v0 = vec_add(v0, v12);
    v7 = vec_sub(v3, v8);
    v3 = vec_add(v3, v8);

    vec_st(v0, 0, &(out[0])); // r0
    vec_st(v3, i1+16, &(out[0])); // i1
    vec_st(v4, i2, &(out[0])); // r2
    vec_st(v7, i3+16, &(out[0]));// i3

    v5 = vec_sub(v1, v13);
    v1 = vec_add(v1, v13);
    v6 = vec_sub(v2, v9);
    v2 = vec_add(v2, v9);

    vec_st(v1, 16, &(out[0])); // i0
    vec_st(v2, i1, &(out[0])); // r1
    vec_st(v5, i2+16, &(out[0])); // i2
    vec_st(v6, i3, &(out[0])); // r3

    /* remaining butterflies: identical computation advanced by 4
     * complex values / 4 twiddles per iteration */
    do {
        out += 8;
        wre += 4;
        wim -= 4;

        v8 = vec_ld(0, &(wre[0]));
        v10 = vec_ld(0, &(wim[0]));
        v9 = vec_ld(0, &(wim[-4]));
        v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));

        v4 = vec_ld(i2, &(out[0])); // r2
        v5 = vec_ld(i2+16, &(out[0])); // i2
        v6 = vec_ld(i3, &(out[0])); // r3
        v7 = vec_ld(i3+16, &(out[0]));// i3
        v10 = vec_mul(v4, v8); // r2*wre
        v11 = vec_mul(v5, v8); // i2*wre
        v12 = vec_mul(v6, v8); // r3*wre
        v13 = vec_mul(v7, v8); // i3*wre

        v0 = vec_ld(0, &(out[0])); // r0
        v3 = vec_ld(i1+16, &(out[0])); // i1
        v10 = vec_madd(v5, v9, v10); // r2*wim
        v11 = vec_nmsub(v4, v9, v11); // i2*wim
        v12 = vec_nmsub(v7, v9, v12); // r3*wim
        v13 = vec_madd(v6, v9, v13); // i3*wim

        v1 = vec_ld(16, &(out[0])); // i0
        v2 = vec_ld(i1, &(out[0])); // r1
        v8 = vec_sub(v12, v10);
        v12 = vec_add(v12, v10);
        v9 = vec_sub(v11, v13);
        v13 = vec_add(v11, v13);
        v4 = vec_sub(v0, v12);
        v0 = vec_add(v0, v12);
        v7 = vec_sub(v3, v8);
        v3 = vec_add(v3, v8);

        vec_st(v0, 0, &(out[0])); // r0
        vec_st(v3, i1+16, &(out[0])); // i1
        vec_st(v4, i2, &(out[0])); // r2
        vec_st(v7, i3+16, &(out[0])); // i3

        v5 = vec_sub(v1, v13);
        v1 = vec_add(v1, v13);
        v6 = vec_sub(v2, v9);
        v2 = vec_add(v2, v9);

        vec_st(v1, 16, &(out[0])); // i0
        vec_st(v2, i1, &(out[0])); // r1
        vec_st(v5, i2+16, &(out[0])); // i2
        vec_st(v6, i3, &(out[0])); // r3
    } while (n-=2);
}
827 | ||
828 | #endif | |
829 | ||
830 | #endif /* AVCODEC_PPC_FFT_VSX_H */ |