| 1 | /* |
| 2 | * FFT transform, optimized with VSX built-in functions |
| 3 | * Copyright (c) 2014 Rong Yan |
| 4 | * |
| 5 | * This algorithm (though not any of the implementation details) is |
| 6 | * based on libdjbfft by D. J. Bernstein. |
| 7 | * |
| 8 | * This file is part of FFmpeg. |
| 9 | * |
| 10 | * FFmpeg is free software; you can redistribute it and/or |
| 11 | * modify it under the terms of the GNU Lesser General Public |
| 12 | * License as published by the Free Software Foundation; either |
| 13 | * version 2.1 of the License, or (at your option) any later version. |
| 14 | * |
| 15 | * FFmpeg is distributed in the hope that it will be useful, |
| 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 18 | * Lesser General Public License for more details. |
| 19 | * |
| 20 | * You should have received a copy of the GNU Lesser General Public |
| 21 | * License along with FFmpeg; if not, write to the Free Software |
| 22 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 23 | */ |
| 24 | |
| 25 | |
| 26 | #include "config.h" |
| 27 | #include "libavutil/cpu.h" |
| 28 | #include "libavutil/ppc/types_altivec.h" |
| 29 | #include "libavutil/ppc/util_altivec.h" |
| 30 | #include "libavcodec/fft.h" |
| 31 | #include "libavcodec/fft-internal.h" |
| 32 | #include "fft_vsx.h" |
| 33 | |
| 34 | #if HAVE_VSX |
| 35 | |
| 36 | static void fft32_vsx_interleave(FFTComplex *z) |
| 37 | { |
| 38 | fft16_vsx_interleave(z); |
| 39 | fft8_vsx_interleave(z+16); |
| 40 | fft8_vsx_interleave(z+24); |
| 41 | pass_vsx_interleave(z,ff_cos_32,4); |
| 42 | } |
| 43 | |
| 44 | static void fft64_vsx_interleave(FFTComplex *z) |
| 45 | { |
| 46 | fft32_vsx_interleave(z); |
| 47 | fft16_vsx_interleave(z+32); |
| 48 | fft16_vsx_interleave(z+48); |
| 49 | pass_vsx_interleave(z,ff_cos_64, 8); |
| 50 | } |
| 51 | static void fft128_vsx_interleave(FFTComplex *z) |
| 52 | { |
| 53 | fft64_vsx_interleave(z); |
| 54 | fft32_vsx_interleave(z+64); |
| 55 | fft32_vsx_interleave(z+96); |
| 56 | pass_vsx_interleave(z,ff_cos_128,16); |
| 57 | } |
| 58 | static void fft256_vsx_interleave(FFTComplex *z) |
| 59 | { |
| 60 | fft128_vsx_interleave(z); |
| 61 | fft64_vsx_interleave(z+128); |
| 62 | fft64_vsx_interleave(z+192); |
| 63 | pass_vsx_interleave(z,ff_cos_256,32); |
| 64 | } |
| 65 | static void fft512_vsx_interleave(FFTComplex *z) |
| 66 | { |
| 67 | fft256_vsx_interleave(z); |
| 68 | fft128_vsx_interleave(z+256); |
| 69 | fft128_vsx_interleave(z+384); |
| 70 | pass_vsx_interleave(z,ff_cos_512,64); |
| 71 | } |
| 72 | static void fft1024_vsx_interleave(FFTComplex *z) |
| 73 | { |
| 74 | fft512_vsx_interleave(z); |
| 75 | fft256_vsx_interleave(z+512); |
| 76 | fft256_vsx_interleave(z+768); |
| 77 | pass_vsx_interleave(z,ff_cos_1024,128); |
| 78 | |
| 79 | } |
| 80 | static void fft2048_vsx_interleave(FFTComplex *z) |
| 81 | { |
| 82 | fft1024_vsx_interleave(z); |
| 83 | fft512_vsx_interleave(z+1024); |
| 84 | fft512_vsx_interleave(z+1536); |
| 85 | pass_vsx_interleave(z,ff_cos_2048,256); |
| 86 | } |
| 87 | static void fft4096_vsx_interleave(FFTComplex *z) |
| 88 | { |
| 89 | fft2048_vsx_interleave(z); |
| 90 | fft1024_vsx_interleave(z+2048); |
| 91 | fft1024_vsx_interleave(z+3072); |
| 92 | pass_vsx_interleave(z,ff_cos_4096, 512); |
| 93 | } |
| 94 | static void fft8192_vsx_interleave(FFTComplex *z) |
| 95 | { |
| 96 | fft4096_vsx_interleave(z); |
| 97 | fft2048_vsx_interleave(z+4096); |
| 98 | fft2048_vsx_interleave(z+6144); |
| 99 | pass_vsx_interleave(z,ff_cos_8192,1024); |
| 100 | } |
| 101 | static void fft16384_vsx_interleave(FFTComplex *z) |
| 102 | { |
| 103 | fft8192_vsx_interleave(z); |
| 104 | fft4096_vsx_interleave(z+8192); |
| 105 | fft4096_vsx_interleave(z+12288); |
| 106 | pass_vsx_interleave(z,ff_cos_16384,2048); |
| 107 | } |
| 108 | static void fft32768_vsx_interleave(FFTComplex *z) |
| 109 | { |
| 110 | fft16384_vsx_interleave(z); |
| 111 | fft8192_vsx_interleave(z+16384); |
| 112 | fft8192_vsx_interleave(z+24576); |
| 113 | pass_vsx_interleave(z,ff_cos_32768,4096); |
| 114 | } |
| 115 | static void fft65536_vsx_interleave(FFTComplex *z) |
| 116 | { |
| 117 | fft32768_vsx_interleave(z); |
| 118 | fft16384_vsx_interleave(z+32768); |
| 119 | fft16384_vsx_interleave(z+49152); |
| 120 | pass_vsx_interleave(z,ff_cos_65536,8192); |
| 121 | } |
| 122 | |
| 123 | static void fft32_vsx(FFTComplex *z) |
| 124 | { |
| 125 | fft16_vsx(z); |
| 126 | fft8_vsx(z+16); |
| 127 | fft8_vsx(z+24); |
| 128 | pass_vsx(z,ff_cos_32,4); |
| 129 | } |
| 130 | |
| 131 | static void fft64_vsx(FFTComplex *z) |
| 132 | { |
| 133 | fft32_vsx(z); |
| 134 | fft16_vsx(z+32); |
| 135 | fft16_vsx(z+48); |
| 136 | pass_vsx(z,ff_cos_64, 8); |
| 137 | } |
| 138 | static void fft128_vsx(FFTComplex *z) |
| 139 | { |
| 140 | fft64_vsx(z); |
| 141 | fft32_vsx(z+64); |
| 142 | fft32_vsx(z+96); |
| 143 | pass_vsx(z,ff_cos_128,16); |
| 144 | } |
| 145 | static void fft256_vsx(FFTComplex *z) |
| 146 | { |
| 147 | fft128_vsx(z); |
| 148 | fft64_vsx(z+128); |
| 149 | fft64_vsx(z+192); |
| 150 | pass_vsx(z,ff_cos_256,32); |
| 151 | } |
| 152 | static void fft512_vsx(FFTComplex *z) |
| 153 | { |
| 154 | fft256_vsx(z); |
| 155 | fft128_vsx(z+256); |
| 156 | fft128_vsx(z+384); |
| 157 | pass_vsx(z,ff_cos_512,64); |
| 158 | } |
| 159 | static void fft1024_vsx(FFTComplex *z) |
| 160 | { |
| 161 | fft512_vsx(z); |
| 162 | fft256_vsx(z+512); |
| 163 | fft256_vsx(z+768); |
| 164 | pass_vsx(z,ff_cos_1024,128); |
| 165 | |
| 166 | } |
| 167 | static void fft2048_vsx(FFTComplex *z) |
| 168 | { |
| 169 | fft1024_vsx(z); |
| 170 | fft512_vsx(z+1024); |
| 171 | fft512_vsx(z+1536); |
| 172 | pass_vsx(z,ff_cos_2048,256); |
| 173 | } |
| 174 | static void fft4096_vsx(FFTComplex *z) |
| 175 | { |
| 176 | fft2048_vsx(z); |
| 177 | fft1024_vsx(z+2048); |
| 178 | fft1024_vsx(z+3072); |
| 179 | pass_vsx(z,ff_cos_4096, 512); |
| 180 | } |
| 181 | static void fft8192_vsx(FFTComplex *z) |
| 182 | { |
| 183 | fft4096_vsx(z); |
| 184 | fft2048_vsx(z+4096); |
| 185 | fft2048_vsx(z+6144); |
| 186 | pass_vsx(z,ff_cos_8192,1024); |
| 187 | } |
| 188 | static void fft16384_vsx(FFTComplex *z) |
| 189 | { |
| 190 | fft8192_vsx(z); |
| 191 | fft4096_vsx(z+8192); |
| 192 | fft4096_vsx(z+12288); |
| 193 | pass_vsx(z,ff_cos_16384,2048); |
| 194 | } |
| 195 | static void fft32768_vsx(FFTComplex *z) |
| 196 | { |
| 197 | fft16384_vsx(z); |
| 198 | fft8192_vsx(z+16384); |
| 199 | fft8192_vsx(z+24576); |
| 200 | pass_vsx(z,ff_cos_32768,4096); |
| 201 | } |
| 202 | static void fft65536_vsx(FFTComplex *z) |
| 203 | { |
| 204 | fft32768_vsx(z); |
| 205 | fft16384_vsx(z+32768); |
| 206 | fft16384_vsx(z+49152); |
| 207 | pass_vsx(z,ff_cos_65536,8192); |
| 208 | } |
| 209 | |
| 210 | static void (* const fft_dispatch_vsx[])(FFTComplex*) = { |
| 211 | fft4_vsx, fft8_vsx, fft16_vsx, fft32_vsx, fft64_vsx, fft128_vsx, fft256_vsx, fft512_vsx, fft1024_vsx, |
| 212 | fft2048_vsx, fft4096_vsx, fft8192_vsx, fft16384_vsx, fft32768_vsx, fft65536_vsx, |
| 213 | }; |
| 214 | static void (* const fft_dispatch_vsx_interleave[])(FFTComplex*) = { |
| 215 | fft4_vsx_interleave, fft8_vsx_interleave, fft16_vsx_interleave, fft32_vsx_interleave, fft64_vsx_interleave, |
| 216 | fft128_vsx_interleave, fft256_vsx_interleave, fft512_vsx_interleave, fft1024_vsx_interleave, |
| 217 | fft2048_vsx_interleave, fft4096_vsx_interleave, fft8192_vsx_interleave, fft16384_vsx_interleave, fft32768_vsx_interleave, fft65536_vsx_interleave, |
| 218 | }; |
| 219 | void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z) |
| 220 | { |
| 221 | fft_dispatch_vsx_interleave[s->nbits-2](z); |
| 222 | } |
| 223 | void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z) |
| 224 | { |
| 225 | fft_dispatch_vsx[s->nbits-2](z); |
| 226 | } |
| 227 | #endif /* HAVE_VSX */ |