#ifndef AVCODEC_PPC_FFT_VSX_H
#define AVCODEC_PPC_FFT_VSX_H
/*
 * FFT transform, optimized with VSX built-in functions
 * Copyright (c) 2014 Rong Yan  Copyright (c) 2009 Loren Merritt
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein, and fft_altivec_s.S.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/fft.h"
#include "libavcodec/fft-internal.h"

void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z);
void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z);
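
/*
 * Usage sketch (not part of the original header; the surrounding API names
 * are assumptions based on libavcodec's internal FFT interface): these two
 * functions are the VSX back ends that the PPC initialization code is
 * expected to install as FFTContext.fft_calc, so they are normally reached
 * through the context rather than called directly.  Roughly:
 *
 *     FFTContext fft;
 *     FFTComplex buf[1 << 5];              // filled with input samples
 *     if (ff_fft_init(&fft, 5, 0) >= 0) {  // 32-point forward transform
 *         fft.fft_permute(&fft, buf);      // bit-reversal reordering
 *         fft.fft_calc(&fft, buf);         // may dispatch to the kernels below
 *         ff_fft_end(&fft);
 *     }
 */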

#define byte_2complex (2*sizeof(FFTComplex))
#define byte_4complex (4*sizeof(FFTComplex))
#define byte_6complex (6*sizeof(FFTComplex))
#define byte_8complex (8*sizeof(FFTComplex))
#define byte_10complex (10*sizeof(FFTComplex))
#define byte_12complex (12*sizeof(FFTComplex))
#define byte_14complex (14*sizeof(FFTComplex))
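
/* One split-radix combine pass over interleaved (re,im) data, as used by the
 * ff_fft_calc_interleave_vsx code path: the sub-transforms at offsets 0, o1,
 * o2 and o3 (in FFTComplex units) are merged using the twiddle factors wre
 * (read forwards) and wim (read backwards), two butterflies per iteration. */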
inline static void pass_vsx_interleave(FFTComplex *z, const FFTSample *wre, unsigned int n)
{
    int o1 = n<<1;
    int o2 = n<<2;
    int o3 = o1+o2;
    int i1, i2, i3;
    FFTSample* out = (FFTSample*)z;
    const FFTSample *wim = wre+o1;
    vec_f vz0, vzo1, vzo2, vzo3;
    vec_f x0, x1, x2, x3;
    vec_f x4, x5, x6, x7;
    vec_f x8, x9, x10, x11;
    vec_f x12, x13, x14, x15;
    vec_f x16, x17, x18, x19;
    vec_f x20, x21, x22, x23;
    vec_f vz0plus1, vzo1plus1, vzo2plus1, vzo3plus1;
    vec_f y0, y1, y2, y3;
    vec_f y4, y5, y8, y9;
    vec_f y10, y13, y14, y15;
    vec_f y16, y17, y18, y19;
    vec_f y20, y21, y22, y23;
    vec_f wr1, wi1, wr0, wi0;
    vec_f wr2, wi2, wr3, wi3;
    vec_f xmulwi0, xmulwi1, ymulwi2, ymulwi3;

    n = n-2;
    i1 = o1*sizeof(FFTComplex);
    i2 = o2*sizeof(FFTComplex);
    i3 = o3*sizeof(FFTComplex);
    vzo2 = vec_ld(i2, &(out[0])); // zo2.r zo2.i z(o2+1).r z(o2+1).i
    vzo2plus1 = vec_ld(i2+16, &(out[0]));
    vzo3 = vec_ld(i3, &(out[0])); // zo3.r zo3.i z(o3+1).r z(o3+1).i
    vzo3plus1 = vec_ld(i3+16, &(out[0]));
    vz0 = vec_ld(0, &(out[0])); // z0.r z0.i z1.r z1.i
    vz0plus1 = vec_ld(16, &(out[0]));
    vzo1 = vec_ld(i1, &(out[0])); // zo1.r zo1.i z(o1+1).r z(o1+1).i
    vzo1plus1 = vec_ld(i1+16, &(out[0]));

    x0 = vec_add(vzo2, vzo3);
    x1 = vec_sub(vzo2, vzo3);
    y0 = vec_add(vzo2plus1, vzo3plus1);
    y1 = vec_sub(vzo2plus1, vzo3plus1);

    wr1 = vec_splats(wre[1]);
    wi1 = vec_splats(wim[-1]);
    wi2 = vec_splats(wim[-2]);
    wi3 = vec_splats(wim[-3]);
    wr2 = vec_splats(wre[2]);
    wr3 = vec_splats(wre[3]);

    x2 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
    x3 = vec_perm(x0, x1, vcprm(s3,3,s2,2));

    y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
    y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
    y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
    y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));

    ymulwi2 = vec_mul(y4, wi2);
    ymulwi3 = vec_mul(y5, wi3);
    x4 = vec_mul(x2, wr1);
    x5 = vec_mul(x3, wi1);
    y8 = vec_madd(y2, wr2, ymulwi2);
    y9 = vec_msub(y2, wr2, ymulwi2);
    x6 = vec_add(x4, x5);
    x7 = vec_sub(x4, x5);
    y13 = vec_madd(y3, wr3, ymulwi3);
    y14 = vec_msub(y3, wr3, ymulwi3);

    x8 = vec_perm(x6, x7, vcprm(0,1,s2,s3));
    y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
    y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));

    x9 = vec_perm(x0, x8, vcprm(0,1,s0,s2));
    x10 = vec_perm(x1, x8, vcprm(1,0,s3,s1));

    y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
    y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));

    x11 = vec_add(vz0, x9);
    x12 = vec_sub(vz0, x9);
    x13 = vec_add(vzo1, x10);
    x14 = vec_sub(vzo1, x10);

    y18 = vec_add(vz0plus1, y16);
    y19 = vec_sub(vz0plus1, y16);
    y20 = vec_add(vzo1plus1, y17);
    y21 = vec_sub(vzo1plus1, y17);

    x15 = vec_perm(x13, x14, vcprm(0,s1,2,s3));
    x16 = vec_perm(x13, x14, vcprm(s0,1,s2,3));
    y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
    y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));

    vec_st(x11, 0, &(out[0]));
    vec_st(y18, 16, &(out[0]));
    vec_st(x15, i1, &(out[0]));
    vec_st(y22, i1+16, &(out[0]));
    vec_st(x12, i2, &(out[0]));
    vec_st(y19, i2+16, &(out[0]));
    vec_st(x16, i3, &(out[0]));
    vec_st(y23, i3+16, &(out[0]));

    do {
        out += 8;
        wre += 4;
        wim -= 4;

        wr0 = vec_splats(wre[0]);
        wr1 = vec_splats(wre[1]);
        wi0 = vec_splats(wim[0]);
        wi1 = vec_splats(wim[-1]);

        wr2 = vec_splats(wre[2]);
        wr3 = vec_splats(wre[3]);
        wi2 = vec_splats(wim[-2]);
        wi3 = vec_splats(wim[-3]);

        vzo2 = vec_ld(i2, &(out[0])); // zo2.r zo2.i z(o2+1).r z(o2+1).i
        vzo2plus1 = vec_ld(i2+16, &(out[0]));
        vzo3 = vec_ld(i3, &(out[0])); // zo3.r zo3.i z(o3+1).r z(o3+1).i
        vzo3plus1 = vec_ld(i3+16, &(out[0]));
        vz0 = vec_ld(0, &(out[0])); // z0.r z0.i z1.r z1.i
        vz0plus1 = vec_ld(16, &(out[0]));
        vzo1 = vec_ld(i1, &(out[0])); // zo1.r zo1.i z(o1+1).r z(o1+1).i
        vzo1plus1 = vec_ld(i1+16, &(out[0]));

        x0 = vec_add(vzo2, vzo3);
        x1 = vec_sub(vzo2, vzo3);

        y0 = vec_add(vzo2plus1, vzo3plus1);
        y1 = vec_sub(vzo2plus1, vzo3plus1);

        x4 = vec_perm(x0, x1, vcprm(s1,1,s0,0));
        x5 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
        x2 = vec_perm(x0, x1, vcprm(0,s0,1,s1));
        x3 = vec_perm(x0, x1, vcprm(2,s2,3,s3));

        y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
        y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
        xmulwi0 = vec_mul(x4, wi0);
        xmulwi1 = vec_mul(x5, wi1);

        y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
        y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));

        x8 = vec_madd(x2, wr0, xmulwi0);
        x9 = vec_msub(x2, wr0, xmulwi0);
        ymulwi2 = vec_mul(y4, wi2);
        ymulwi3 = vec_mul(y5, wi3);

        x13 = vec_madd(x3, wr1, xmulwi1);
        x14 = vec_msub(x3, wr1, xmulwi1);

        y8 = vec_madd(y2, wr2, ymulwi2);
        y9 = vec_msub(y2, wr2, ymulwi2);
        y13 = vec_madd(y3, wr3, ymulwi3);
        y14 = vec_msub(y3, wr3, ymulwi3);

        x10 = vec_perm(x8, x9, vcprm(0,1,s2,s3));
        x15 = vec_perm(x13, x14, vcprm(0,1,s2,s3));

        y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
        y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));

        x16 = vec_perm(x10, x15, vcprm(0,2,s0,s2));
        x17 = vec_perm(x10, x15, vcprm(3,1,s3,s1));

        y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
        y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));

        x18 = vec_add(vz0, x16);
        x19 = vec_sub(vz0, x16);
        x20 = vec_add(vzo1, x17);
        x21 = vec_sub(vzo1, x17);

        y18 = vec_add(vz0plus1, y16);
        y19 = vec_sub(vz0plus1, y16);
        y20 = vec_add(vzo1plus1, y17);
        y21 = vec_sub(vzo1plus1, y17);

        x22 = vec_perm(x20, x21, vcprm(0,s1,2,s3));
        x23 = vec_perm(x20, x21, vcprm(s0,1,s2,3));

        y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
        y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));

        vec_st(x18, 0, &(out[0]));
        vec_st(y18, 16, &(out[0]));
        vec_st(x22, i1, &(out[0]));
        vec_st(y22, i1+16, &(out[0]));
        vec_st(x19, i2, &(out[0]));
        vec_st(y19, i2+16, &(out[0]));
        vec_st(x23, i3, &(out[0]));
        vec_st(y23, i3+16, &(out[0]));
    } while (n-=2);
}
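
/* 2-point FFT (a single scalar butterfly) on interleaved data, in place. */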
inline static void fft2_vsx_interleave(FFTComplex *z)
{
    FFTSample r1, i1;

    r1 = z[0].re - z[1].re;
    z[0].re += z[1].re;
    z[1].re = r1;

    i1 = z[0].im - z[1].im;
    z[0].im += z[1].im;
    z[1].im = i1;
}
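
/* 4-point FFT on interleaved data, in place: two vector loads, two butterfly
 * stages expressed as permute/add/sub, two vector stores. */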
inline static void fft4_vsx_interleave(FFTComplex *z)
{
    vec_f a, b, c, d;
    float* out = (float*)z;
    a = vec_ld(0, &(out[0]));
    b = vec_ld(byte_2complex, &(out[0]));

    c = vec_perm(a, b, vcprm(0,1,s2,s1));
    d = vec_perm(a, b, vcprm(2,3,s0,s3));
    a = vec_add(c, d);
    b = vec_sub(c, d);

    c = vec_perm(a, b, vcprm(0,1,s0,s1));
    d = vec_perm(a, b, vcprm(2,3,s3,s2));

    a = vec_add(c, d);
    b = vec_sub(c, d);
    vec_st(a, 0, &(out[0]));
    vec_st(b, byte_2complex, &(out[0]));
}
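
/* 8-point FFT on interleaved data, in place; sqrthalf is the only twiddle
 * factor needed at this size. */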
inline static void fft8_vsx_interleave(FFTComplex *z)
{
    vec_f vz0, vz1, vz2, vz3;
    vec_f x0, x1, x2, x3;
    vec_f x4, x5, x6, x7;
    vec_f x8, x9, x10, x11;
    vec_f x12, x13, x14, x15;
    vec_f x16, x17, x18, x19;
    vec_f x20, x21, x22, x23;
    vec_f x24, x25, x26, x27;
    vec_f x28, x29, x30, x31;
    vec_f x32, x33, x34;

    float* out = (float*)z;
    vec_f vc1 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));

    x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
    x2 = vec_perm(vz2, vz3, vcprm(2,1,s0,s1));
    x3 = vec_perm(vz2, vz3, vcprm(0,3,s2,s3));

    x4 = vec_add(x0, x1);
    x5 = vec_sub(x0, x1);
    x6 = vec_add(x2, x3);
    x7 = vec_sub(x2, x3);

    x8 = vec_perm(x4, x5, vcprm(0,1,s0,s1));
    x9 = vec_perm(x4, x5, vcprm(2,3,s3,s2));
    x10 = vec_perm(x6, x7, vcprm(2,1,s2,s1));
    x11 = vec_perm(x6, x7, vcprm(0,3,s0,s3));

    x12 = vec_add(x8, x9);
    x13 = vec_sub(x8, x9);
    x14 = vec_add(x10, x11);
    x15 = vec_sub(x10, x11);
    x16 = vec_perm(x12, x13, vcprm(0,s0,1,s1));
    x17 = vec_perm(x14, x15, vcprm(0,s0,1,s1));
    x18 = vec_perm(x16, x17, vcprm(s0,s3,s2,s1));
    x19 = vec_add(x16, x18); // z0.r z2.r z0.i z2.i
    x20 = vec_sub(x16, x18); // z4.r z6.r z4.i z6.i

    x21 = vec_perm(x12, x13, vcprm(2,s2,3,s3));
    x22 = vec_perm(x14, x15, vcprm(2,3,s2,s3));
    x23 = vec_perm(x14, x15, vcprm(3,2,s3,s2));
    x24 = vec_add(x22, x23);
    x25 = vec_sub(x22, x23);
    x26 = vec_mul( vec_perm(x24, x25, vcprm(2,s2,0,s0)), vc1);

    x27 = vec_add(x21, x26); // z1.r z7.r z1.i z3.i
    x28 = vec_sub(x21, x26); // z5.r z3.r z5.i z7.i

    x29 = vec_perm(x19, x27, vcprm(0,2,s0,s2)); // z0.r z0.i z1.r z1.i
    x30 = vec_perm(x19, x27, vcprm(1,3,s1,s3)); // z2.r z2.i z7.r z3.i
    x31 = vec_perm(x20, x28, vcprm(0,2,s0,s2)); // z4.r z4.i z5.r z5.i
    x32 = vec_perm(x20, x28, vcprm(1,3,s1,s3)); // z6.r z6.i z3.r z7.i
    x33 = vec_perm(x30, x32, vcprm(0,1,s2,3));  // z2.r z2.i z3.r z3.i
    x34 = vec_perm(x30, x32, vcprm(s0,s1,2,s3)); // z6.r z6.i z7.r z7.i

    vec_st(x29, 0, &(out[0]));
    vec_st(x33, byte_2complex, &(out[0]));
    vec_st(x31, byte_4complex, &(out[0]));
    vec_st(x34, byte_6complex, &(out[0]));
}
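
/* 16-point FFT on interleaved data, in place; the twiddle factors are
 * sqrthalf plus the ff_cos_16[1] and ff_cos_16[3] table entries. */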
inline static void fft16_vsx_interleave(FFTComplex *z)
{
    float* out = (float*)z;
    vec_f vc0 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
    vec_f vc1 = {ff_cos_16[1], ff_cos_16[1], ff_cos_16[1], ff_cos_16[1]};
    vec_f vc2 = {ff_cos_16[3], ff_cos_16[3], ff_cos_16[3], ff_cos_16[3]};
    vec_f vz0, vz1, vz2, vz3;
    vec_f vz4, vz5, vz6, vz7;
    vec_f x0, x1, x2, x3;
    vec_f x4, x5, x6, x7;
    vec_f x8, x9, x10, x11;
    vec_f x12, x13, x14, x15;
    vec_f x16, x17, x18, x19;
    vec_f x20, x21, x22, x23;
    vec_f x24, x25, x26, x27;
    vec_f x28, x29, x30, x31;
    vec_f x32, x33, x34, x35;
    vec_f x36, x37, x38, x39;
    vec_f x40, x41, x42, x43;
    vec_f x44, x45, x46, x47;
    vec_f x48, x49, x50, x51;
    vec_f x52, x53, x54, x55;
    vec_f x56, x57, x58, x59;
    vec_f x60, x61, x62, x63;
    vec_f x64, x65, x66, x67;
    vec_f x68, x69, x70, x71;
    vec_f x72, x73, x74, x75;
    vec_f x76, x77, x78, x79;
    vec_f x80, x81, x82, x83;
    vec_f x84, x85, x86;

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));
    vz4 = vec_ld(byte_8complex, &(out[0]));
    vz5 = vec_ld(byte_10complex, &(out[0]));
    vz6 = vec_ld(byte_12complex, &(out[0]));
    vz7 = vec_ld(byte_14complex, &(out[0]));

    x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
    x2 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
    x3 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));

    x4 = vec_perm(vz4, vz5, vcprm(0,1,s2,s1));
    x5 = vec_perm(vz4, vz5, vcprm(2,3,s0,s3));
    x6 = vec_perm(vz6, vz7, vcprm(0,1,s2,s1));
    x7 = vec_perm(vz6, vz7, vcprm(2,3,s0,s3));

    x8 = vec_add(x0, x1);
    x9 = vec_sub(x0, x1);
    x10 = vec_add(x2, x3);
    x11 = vec_sub(x2, x3);

    x12 = vec_add(x4, x5);
    x13 = vec_sub(x4, x5);
    x14 = vec_add(x6, x7);
    x15 = vec_sub(x6, x7);

    x16 = vec_perm(x8, x9, vcprm(0,1,s0,s1));
    x17 = vec_perm(x8, x9, vcprm(2,3,s3,s2));
    x18 = vec_perm(x10, x11, vcprm(2,1,s1,s2));
    x19 = vec_perm(x10, x11, vcprm(0,3,s0,s3));
    x20 = vec_perm(x12, x14, vcprm(0,1,s0, s1));
    x21 = vec_perm(x12, x14, vcprm(2,3,s2,s3));
    x22 = vec_perm(x13, x15, vcprm(0,1,s0,s1));
    x23 = vec_perm(x13, x15, vcprm(3,2,s3,s2));

    x24 = vec_add(x16, x17);
    x25 = vec_sub(x16, x17);
    x26 = vec_add(x18, x19);
    x27 = vec_sub(x18, x19);
    x28 = vec_add(x20, x21);
    x29 = vec_sub(x20, x21);
    x30 = vec_add(x22, x23);
    x31 = vec_sub(x22, x23);

    x32 = vec_add(x24, x26);
    x33 = vec_sub(x24, x26);
    x34 = vec_perm(x32, x33, vcprm(0,1,s0,s1));

    x35 = vec_perm(x28, x29, vcprm(2,1,s1,s2));
    x36 = vec_perm(x28, x29, vcprm(0,3,s0,s3));
    x37 = vec_add(x35, x36);
    x38 = vec_sub(x35, x36);
    x39 = vec_perm(x37, x38, vcprm(0,1,s1,s0));

    x40 = vec_perm(x27, x38, vcprm(3,2,s2,s3));
    x41 = vec_perm(x26, x37, vcprm(2,3,s3,s2));
    x42 = vec_add(x40, x41);
    x43 = vec_sub(x40, x41);
    x44 = vec_mul(x42, vc0);
    x45 = vec_mul(x43, vc0);

    x46 = vec_add(x34, x39); // z0.r z0.i z4.r z4.i
    x47 = vec_sub(x34, x39); // z8.r z8.i z12.r z12.i

    x48 = vec_perm(x30, x31, vcprm(2,1,s1,s2));
    x49 = vec_perm(x30, x31, vcprm(0,3,s3,s0));
    x50 = vec_add(x48, x49);
    x51 = vec_sub(x48, x49);
    x52 = vec_mul(x50, vc1);
    x53 = vec_mul(x50, vc2);
    x54 = vec_mul(x51, vc1);
    x55 = vec_mul(x51, vc2);

    x56 = vec_perm(x24, x25, vcprm(2,3,s2,s3));
    x57 = vec_perm(x44, x45, vcprm(0,1,s1,s0));
    x58 = vec_add(x56, x57);
    x59 = vec_sub(x56, x57);

    x60 = vec_perm(x54, x55, vcprm(1,0,3,2));
    x61 = vec_perm(x54, x55, vcprm(s1,s0,s3,s2));
    x62 = vec_add(x52, x61);
    x63 = vec_sub(x52, x61);
    x64 = vec_add(x60, x53);
    x65 = vec_sub(x60, x53);
    x66 = vec_perm(x62, x64, vcprm(0,1,s3,s2));
    x67 = vec_perm(x63, x65, vcprm(s0,s1,3,2));

    x68 = vec_add(x58, x66); // z1.r z1.i z3.r z3.i
    x69 = vec_sub(x58, x66); // z9.r z9.i z11.r z11.i
    x70 = vec_add(x59, x67); // z5.r z5.i z15.r z15.i
    x71 = vec_sub(x59, x67); // z13.r z13.i z7.r z7.i

    x72 = vec_perm(x25, x27, vcprm(s1,s0,s2,s3));
    x73 = vec_add(x25, x72);
    x74 = vec_sub(x25, x72);
    x75 = vec_perm(x73, x74, vcprm(0,1,s0,s1));
    x76 = vec_perm(x44, x45, vcprm(3,2,s2,s3));
    x77 = vec_add(x75, x76); // z2.r z2.i z6.r z6.i
    x78 = vec_sub(x75, x76); // z10.r z10.i z14.r z14.i

    x79 = vec_perm(x46, x68, vcprm(0,1,s0,s1)); // z0.r z0.i z1.r z1.i
    x80 = vec_perm(x77, x68, vcprm(0,1,s2,s3)); // z2.r z2.i z3.r z3.i
    x81 = vec_perm(x46, x70, vcprm(2,3,s0,s1)); // z4.r z4.i z5.r z5.i
    x82 = vec_perm(x71, x77, vcprm(s2,s3,2,3)); // z6.r z6.i z7.r z7.i
    vec_st(x79, 0, &(out[0]));
    vec_st(x80, byte_2complex, &(out[0]));
    vec_st(x81, byte_4complex, &(out[0]));
    vec_st(x82, byte_6complex, &(out[0]));
    x83 = vec_perm(x47, x69, vcprm(0,1,s0,s1)); // z8.r z8.i z9.r z9.i
    x84 = vec_perm(x78, x69, vcprm(0,1,s2,s3)); // z10.r z10.i z11.r z11.i
    x85 = vec_perm(x47, x71, vcprm(2,3,s0,s1)); // z12.r z12.i z13.r z13.i
    x86 = vec_perm(x70, x78, vcprm(s2,s3,2,3)); // z14.r z14.i z15.r z15.i
    vec_st(x83, byte_8complex, &(out[0]));
    vec_st(x84, byte_10complex, &(out[0]));
    vec_st(x85, byte_12complex, &(out[0]));
    vec_st(x86, byte_14complex, &(out[0]));
}
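
/* 4-point FFT for the non-interleaved code path (ff_fft_calc_vsx), in place. */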
inline static void fft4_vsx(FFTComplex *z)
{
    vec_f a, b, c, d;
    float* out = (float*)z;
    a = vec_ld(0, &(out[0]));
    b = vec_ld(byte_2complex, &(out[0]));

    c = vec_perm(a, b, vcprm(0,1,s2,s1));
    d = vec_perm(a, b, vcprm(2,3,s0,s3));
    a = vec_add(c, d);
    b = vec_sub(c, d);

    c = vec_perm(a,b, vcprm(0,s0,1,s1));
    d = vec_perm(a, b, vcprm(2,s3,3,s2));

    a = vec_add(c, d);
    b = vec_sub(c, d);

    c = vec_perm(a, b, vcprm(0,1,s0,s1));
    d = vec_perm(a, b, vcprm(2,3,s2,s3));

    vec_st(c, 0, &(out[0]));
    vec_st(d, byte_2complex, &(out[0]));
}
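
/* 8-point FFT for the non-interleaved code path, in place; vc1/vc2 carry the
 * +/-sqrthalf factors of the final butterfly stage. */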
inline static void fft8_vsx(FFTComplex *z)
{
    vec_f vz0, vz1, vz2, vz3;
    vec_f vz4, vz5, vz6, vz7, vz8;

    float* out = (float*)z;
    vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
    vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
    vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));

    vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));

    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);
    vz8 = vec_perm(vz3, vz3, vcprm(2,3,0,1));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);

    vz3 = vec_madd(vz3, vc1, vc0);
    vz3 = vec_madd(vz8, vc2, vz3);

    vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
    vz6 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
    vz7 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);
    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);

    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
    vz6 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
    vz7 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));

    vz2 = vec_sub(vz4, vz6);
    vz3 = vec_sub(vz5, vz7);

    vz0 = vec_add(vz4, vz6);
    vz1 = vec_add(vz5, vz7);

    vec_st(vz0, 0, &(out[0]));
    vec_st(vz1, byte_2complex, &(out[0]));
    vec_st(vz2, byte_4complex, &(out[0]));
    vec_st(vz3, byte_6complex, &(out[0]));
}
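
/* 16-point FFT for the non-interleaved code path, in place; vc3, vc4 and vc5
 * pack the cos/sin twiddle factors used by the final combine stage. */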
inline static void fft16_vsx(FFTComplex *z)
{
    float* out = (float*)z;
    vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
    vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
    vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
    vec_f vc3 = {1.0, 0.92387953, sqrthalf, 0.38268343};
    vec_f vc4 = {0.0, 0.38268343, sqrthalf, 0.92387953};
    vec_f vc5 = {-0.0, -0.38268343, -sqrthalf, -0.92387953};

    vec_f vz0, vz1, vz2, vz3;
    vec_f vz4, vz5, vz6, vz7;
    vec_f vz8, vz9, vz10, vz11;
    vec_f vz12, vz13;

    vz0 = vec_ld(byte_8complex, &(out[0]));
    vz1 = vec_ld(byte_10complex, &(out[0]));
    vz2 = vec_ld(byte_12complex, &(out[0]));
    vz3 = vec_ld(byte_14complex, &(out[0]));

    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
    vz6 = vec_perm(vz2, vz3, vcprm(0,1,s2,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,3,s0,s3));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);
    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);

    vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
    vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,s3,3,s2));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);
    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);

    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));

    vz6 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));
    vz10 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
    vz11 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
    vz8 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    vz9 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));

    vz2 = vec_add(vz10, vz11);
    vz3 = vec_sub(vz10, vz11);
    vz12 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
    vz0 = vec_add(vz8, vz9);
    vz1 = vec_sub(vz8, vz9);

    vz3 = vec_madd(vz3, vc1, vc0);
    vz3 = vec_madd(vz12, vc2, vz3);
    vz8 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
    vz9 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
    vz10 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
    vz11 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));

    vz0 = vec_add(vz8, vz9);
    vz1 = vec_sub(vz8, vz9);
    vz2 = vec_add(vz10, vz11);
    vz3 = vec_sub(vz10, vz11);

    vz8 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
    vz9 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
    vz10 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
    vz11 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));

    vz2 = vec_sub(vz8, vz10);
    vz3 = vec_sub(vz9, vz11);
    vz0 = vec_add(vz8, vz10);
    vz1 = vec_add(vz9, vz11);

    vz8 = vec_madd(vz4, vc3, vc0);
    vz9 = vec_madd(vz5, vc3, vc0);
    vz10 = vec_madd(vz6, vc3, vc0);
    vz11 = vec_madd(vz7, vc3, vc0);

    vz8 = vec_madd(vz5, vc4, vz8);
    vz9 = vec_madd(vz4, vc5, vz9);
    vz10 = vec_madd(vz7, vc5, vz10);
    vz11 = vec_madd(vz6, vc4, vz11);

    vz12 = vec_sub(vz10, vz8);
    vz10 = vec_add(vz10, vz8);

    vz13 = vec_sub(vz9, vz11);
    vz11 = vec_add(vz9, vz11);

    vz4 = vec_sub(vz0, vz10);
    vz0 = vec_add(vz0, vz10);

    vz7 = vec_sub(vz3, vz12);
    vz3 = vec_add(vz3, vz12);

    vz5 = vec_sub(vz1, vz11);
    vz1 = vec_add(vz1, vz11);

    vz6 = vec_sub(vz2, vz13);
    vz2 = vec_add(vz2, vz13);

    vec_st(vz0, 0, &(out[0]));
    vec_st(vz1, byte_2complex, &(out[0]));
    vec_st(vz2, byte_4complex, &(out[0]));
    vec_st(vz3, byte_6complex, &(out[0]));
    vec_st(vz4, byte_8complex, &(out[0]));
    vec_st(vz5, byte_10complex, &(out[0]));
    vec_st(vz6, byte_12complex, &(out[0]));
    vec_st(vz7, byte_14complex, &(out[0]));
}
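
/* Split-radix combine pass for the non-interleaved code path: real and
 * imaginary parts are handled in separate vectors (see the r0/i0/r1/i1
 * comments below), with the first group of butterflies done before the
 * loop and the remaining ones two per iteration. */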
inline static void pass_vsx(FFTComplex * z, const FFTSample * wre, unsigned int n)
{
    int o1 = n<<1;
    int o2 = n<<2;
    int o3 = o1+o2;
    int i1, i2, i3;
    FFTSample* out = (FFTSample*)z;
    const FFTSample *wim = wre+o1;
    vec_f v0, v1, v2, v3;
    vec_f v4, v5, v6, v7;
    vec_f v8, v9, v10, v11;
    vec_f v12, v13;

    n = n-2;
    i1 = o1*sizeof(FFTComplex);
    i2 = o2*sizeof(FFTComplex);
    i3 = o3*sizeof(FFTComplex);

    v8 = vec_ld(0, &(wre[0]));
    v10 = vec_ld(0, &(wim[0]));
    v9 = vec_ld(0, &(wim[-4]));
    v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));

    v4 = vec_ld(i2, &(out[0]));
    v5 = vec_ld(i2+16, &(out[0]));
    v6 = vec_ld(i3, &(out[0]));
    v7 = vec_ld(i3+16, &(out[0]));
    v10 = vec_mul(v4, v8); // r2*wre
    v11 = vec_mul(v5, v8); // i2*wre
    v12 = vec_mul(v6, v8); // r3*wre
    v13 = vec_mul(v7, v8); // i3*wre

    v0 = vec_ld(0, &(out[0])); // r0
    v3 = vec_ld(i1+16, &(out[0])); // i1
    v10 = vec_madd(v5, v9, v10); // r2*wim
    v11 = vec_nmsub(v4, v9, v11); // i2*wim
    v12 = vec_nmsub(v7, v9, v12); // r3*wim
    v13 = vec_madd(v6, v9, v13); // i3*wim

    v1 = vec_ld(16, &(out[0])); // i0
    v2 = vec_ld(i1, &(out[0])); // r1
    v8 = vec_sub(v12, v10);
    v12 = vec_add(v12, v10);
    v9 = vec_sub(v11, v13);
    v13 = vec_add(v11, v13);
    v4 = vec_sub(v0, v12);
    v0 = vec_add(v0, v12);
    v7 = vec_sub(v3, v8);
    v3 = vec_add(v3, v8);

    vec_st(v0, 0, &(out[0])); // r0
    vec_st(v3, i1+16, &(out[0])); // i1
    vec_st(v4, i2, &(out[0])); // r2
    vec_st(v7, i3+16, &(out[0]));// i3

    v5 = vec_sub(v1, v13);
    v1 = vec_add(v1, v13);
    v6 = vec_sub(v2, v9);
    v2 = vec_add(v2, v9);

    vec_st(v1, 16, &(out[0])); // i0
    vec_st(v2, i1, &(out[0])); // r1
    vec_st(v5, i2+16, &(out[0])); // i2
    vec_st(v6, i3, &(out[0])); // r3

    do {
        out += 8;
        wre += 4;
        wim -= 4;

        v8 = vec_ld(0, &(wre[0]));
        v10 = vec_ld(0, &(wim[0]));
        v9 = vec_ld(0, &(wim[-4]));
        v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));

        v4 = vec_ld(i2, &(out[0])); // r2
        v5 = vec_ld(i2+16, &(out[0])); // i2
        v6 = vec_ld(i3, &(out[0])); // r3
        v7 = vec_ld(i3+16, &(out[0]));// i3
        v10 = vec_mul(v4, v8); // r2*wre
        v11 = vec_mul(v5, v8); // i2*wre
        v12 = vec_mul(v6, v8); // r3*wre
        v13 = vec_mul(v7, v8); // i3*wre

        v0 = vec_ld(0, &(out[0])); // r0
        v3 = vec_ld(i1+16, &(out[0])); // i1
        v10 = vec_madd(v5, v9, v10); // r2*wim
        v11 = vec_nmsub(v4, v9, v11); // i2*wim
        v12 = vec_nmsub(v7, v9, v12); // r3*wim
        v13 = vec_madd(v6, v9, v13); // i3*wim

        v1 = vec_ld(16, &(out[0])); // i0
        v2 = vec_ld(i1, &(out[0])); // r1
        v8 = vec_sub(v12, v10);
        v12 = vec_add(v12, v10);
        v9 = vec_sub(v11, v13);
        v13 = vec_add(v11, v13);
        v4 = vec_sub(v0, v12);
        v0 = vec_add(v0, v12);
        v7 = vec_sub(v3, v8);
        v3 = vec_add(v3, v8);

        vec_st(v0, 0, &(out[0])); // r0
        vec_st(v3, i1+16, &(out[0])); // i1
        vec_st(v4, i2, &(out[0])); // r2
        vec_st(v7, i3+16, &(out[0])); // i3

        v5 = vec_sub(v1, v13);
        v1 = vec_add(v1, v13);
        v6 = vec_sub(v2, v9);
        v2 = vec_add(v2, v9);

        vec_st(v1, 16, &(out[0])); // i0
        vec_st(v2, i1, &(out[0])); // r1
        vec_st(v5, i2+16, &(out[0])); // i2
        vec_st(v6, i3, &(out[0])); // r3
    } while (n-=2);
}

#endif /* AVCODEC_PPC_FFT_VSX_H */