Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * FFT/IFFT transforms | |
3 | * AltiVec-enabled | |
4 | * Copyright (c) 2009 Loren Merritt | |
5 | * | |
6 | * This file is part of FFmpeg. | |
7 | * | |
8 | * FFmpeg is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License as published by the Free Software Foundation; either | |
11 | * version 2.1 of the License, or (at your option) any later version. | |
12 | * | |
13 | * FFmpeg is distributed in the hope that it will be useful, | |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 | * Lesser General Public License for more details. | |
17 | * | |
18 | * You should have received a copy of the GNU Lesser General Public | |
19 | * License along with FFmpeg; if not, write to the Free Software | |
20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 | */ | |
22 | ||
23 | #include "config.h" | |
24 | #include "libavutil/cpu.h" | |
25 | #include "libavutil/ppc/cpu.h" | |
26 | #include "libavutil/ppc/types_altivec.h" | |
27 | #include "libavutil/ppc/util_altivec.h" | |
28 | #include "libavcodec/fft.h" | |
29 | ||
30 | /** | |
31 | * Do a complex FFT with the parameters defined in ff_fft_init(). | |
32 | * The input data must be permuted before with s->revtab table. | |
33 | * No 1.0 / sqrt(n) normalization is done. | |
34 | * AltiVec-enabled: | |
35 | * This code assumes that the 'z' pointer is 16 bytes-aligned. | |
36 | * It also assumes all FFTComplex are 8 bytes-aligned pairs of floats. | |
37 | */ | |
38 | ||
39 | #if HAVE_VSX | |
40 | #include "fft_vsx.h" | |
41 | #else | |
42 | void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z); | |
43 | void ff_fft_calc_interleave_altivec(FFTContext *s, FFTComplex *z); | |
44 | #endif | |
45 | ||
46 | #if HAVE_GNU_AS && HAVE_ALTIVEC | |
47 | static void imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input) | |
48 | { | |
49 | int j, k; | |
50 | int n = 1 << s->mdct_bits; | |
51 | int n4 = n >> 2; | |
52 | int n8 = n >> 3; | |
53 | int n32 = n >> 5; | |
54 | const uint16_t *revtabj = s->revtab; | |
55 | const uint16_t *revtabk = s->revtab+n4; | |
56 | const vec_f *tcos = (const vec_f*)(s->tcos+n8); | |
57 | const vec_f *tsin = (const vec_f*)(s->tsin+n8); | |
58 | const vec_f *pin = (const vec_f*)(input+n4); | |
59 | vec_f *pout = (vec_f*)(output+n4); | |
60 | ||
61 | /* pre rotation */ | |
62 | k = n32-1; | |
63 | do { | |
64 | vec_f cos,sin,cos0,sin0,cos1,sin1,re,im,r0,i0,r1,i1,a,b,c,d; | |
65 | #define CMULA(p,o0,o1,o2,o3)\ | |
66 | a = pin[ k*2+p]; /* { z[k].re, z[k].im, z[k+1].re, z[k+1].im } */\ | |
67 | b = pin[-k*2-p-1]; /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */\ | |
68 | re = vec_perm(a, b, vcprm(0,2,s0,s2)); /* { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re } */\ | |
69 | im = vec_perm(a, b, vcprm(s3,s1,3,1)); /* { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im } */\ | |
70 | cos = vec_perm(cos0, cos1, vcprm(o0,o1,s##o2,s##o3)); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */\ | |
71 | sin = vec_perm(sin0, sin1, vcprm(o0,o1,s##o2,s##o3));\ | |
72 | r##p = im*cos - re*sin;\ | |
73 | i##p = re*cos + im*sin; | |
74 | #define STORE2(v,dst)\ | |
75 | j = dst;\ | |
76 | vec_ste(v, 0, output+j*2);\ | |
77 | vec_ste(v, 4, output+j*2); | |
78 | #define STORE8(p)\ | |
79 | a = vec_perm(r##p, i##p, vcprm(0,s0,0,s0));\ | |
80 | b = vec_perm(r##p, i##p, vcprm(1,s1,1,s1));\ | |
81 | c = vec_perm(r##p, i##p, vcprm(2,s2,2,s2));\ | |
82 | d = vec_perm(r##p, i##p, vcprm(3,s3,3,s3));\ | |
83 | STORE2(a, revtabk[ p*2-4]);\ | |
84 | STORE2(b, revtabk[ p*2-3]);\ | |
85 | STORE2(c, revtabj[-p*2+2]);\ | |
86 | STORE2(d, revtabj[-p*2+3]); | |
87 | ||
88 | cos0 = tcos[k]; | |
89 | sin0 = tsin[k]; | |
90 | cos1 = tcos[-k-1]; | |
91 | sin1 = tsin[-k-1]; | |
92 | CMULA(0, 0,1,2,3); | |
93 | CMULA(1, 2,3,0,1); | |
94 | STORE8(0); | |
95 | STORE8(1); | |
96 | revtabj += 4; | |
97 | revtabk -= 4; | |
98 | k--; | |
99 | } while(k >= 0); | |
100 | ||
101 | #if HAVE_VSX | |
102 | ff_fft_calc_vsx(s, (FFTComplex*)output); | |
103 | #else | |
104 | ff_fft_calc_altivec(s, (FFTComplex*)output); | |
105 | #endif | |
106 | ||
107 | /* post rotation + reordering */ | |
108 | j = -n32; | |
109 | k = n32-1; | |
110 | do { | |
111 | vec_f cos,sin,re,im,a,b,c,d; | |
112 | #define CMULB(d0,d1,o)\ | |
113 | re = pout[o*2];\ | |
114 | im = pout[o*2+1];\ | |
115 | cos = tcos[o];\ | |
116 | sin = tsin[o];\ | |
117 | d0 = im*sin - re*cos;\ | |
118 | d1 = re*sin + im*cos; | |
119 | ||
120 | CMULB(a,b,j); | |
121 | CMULB(c,d,k); | |
122 | pout[2*j] = vec_perm(a, d, vcprm(0,s3,1,s2)); | |
123 | pout[2*j+1] = vec_perm(a, d, vcprm(2,s1,3,s0)); | |
124 | pout[2*k] = vec_perm(c, b, vcprm(0,s3,1,s2)); | |
125 | pout[2*k+1] = vec_perm(c, b, vcprm(2,s1,3,s0)); | |
126 | j++; | |
127 | k--; | |
128 | } while(k >= 0); | |
129 | } | |
130 | ||
131 | static void imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample *input) | |
132 | { | |
133 | int k; | |
134 | int n = 1 << s->mdct_bits; | |
135 | int n4 = n >> 2; | |
136 | int n16 = n >> 4; | |
137 | vec_u32 sign = {1U<<31,1U<<31,1U<<31,1U<<31}; | |
138 | vec_u32 *p0 = (vec_u32*)(output+n4); | |
139 | vec_u32 *p1 = (vec_u32*)(output+n4*3); | |
140 | ||
141 | imdct_half_altivec(s, output + n4, input); | |
142 | ||
143 | for (k = 0; k < n16; k++) { | |
144 | vec_u32 a = p0[k] ^ sign; | |
145 | vec_u32 b = p1[-k-1]; | |
146 | p0[-k-1] = vec_perm(a, a, vcprm(3,2,1,0)); | |
147 | p1[k] = vec_perm(b, b, vcprm(3,2,1,0)); | |
148 | } | |
149 | } | |
150 | #endif /* HAVE_GNU_AS && HAVE_ALTIVEC */ | |
151 | ||
152 | av_cold void ff_fft_init_ppc(FFTContext *s) | |
153 | { | |
154 | #if HAVE_GNU_AS && HAVE_ALTIVEC | |
155 | if (!PPC_ALTIVEC(av_get_cpu_flags())) | |
156 | return; | |
157 | ||
158 | #if HAVE_VSX | |
159 | s->fft_calc = ff_fft_calc_interleave_vsx; | |
160 | #else | |
161 | s->fft_calc = ff_fft_calc_interleave_altivec; | |
162 | #endif | |
163 | if (s->mdct_bits >= 5) { | |
164 | s->imdct_calc = imdct_calc_altivec; | |
165 | s->imdct_half = imdct_half_altivec; | |
166 | } | |
167 | #endif /* HAVE_GNU_AS && HAVE_ALTIVEC */ | |
168 | } |