/*
 * FFT transform with Altivec optimizations
 * Copyright (c) 2009 Loren Merritt
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * These functions are not individually interchangeable with the C versions.
 * While C takes arrays of FFTComplex, AltiVec leaves intermediate results
 * in blocks as convenient to the vector size,
 * i.e. {4x real, 4x imaginary, 4x real, ...}
 *
 * I ignore the standard calling convention.
 * Instead, the following registers are treated as global constants:
 *   v14:      zero
 *   v15..v18: cosines
 *   v19..v29: permutations
 *   r9:       16
 *   r12:      ff_cos_tabs
 * and the rest are free for local use.
 */

#include "config.h"

#if HAVE_GNU_AS && HAVE_ALTIVEC

#include "asm.S"

.text

.macro addi2 ra, imm // add 32-bit immediate
.if \imm & 0xffff
    addi  \ra, \ra, \imm@l
.endif
.if (\imm+0x8000)>>16
    addis \ra, \ra, \imm@ha
.endif
.endm
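
// addi alone only accepts a signed 16-bit immediate, so addi2 splits a
// 32-bit offset into addi (low half, @l) plus addis (high half, @ha); @ha is
// pre-adjusted to cancel the sign extension of the low half.  For example,
//     addi2 r3, 0x18000
// expands to
//     addi  r3, r3, -0x8000   // 0x18000@l
//     addis r3, r3, 2         // 0x18000@ha
// which adds 2*0x10000 - 0x8000 = 0x18000 in total.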

.macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3
    vperm   \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
    vperm   \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
    vaddfp  \a0,\a2,\a3                         // {t1,t2,t6,t5}
    vsubfp  \a1,\a2,\a3                         // {t3,t4,t8,t7}
    vmrghw  \a2,\a0,\a1     // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
    vperm   \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
    vaddfp  \a0,\a2,\a3                         // {r0,r1,i0,i1}
    vsubfp  \a1,\a2,\a3                         // {r2,r3,i2,i3}
    vperm   \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
    vperm   \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
.endm
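
// FFT4 computes one 4-point transform on interleaved input:
//   \a0 = {r0,i0,r1,i1}, \a1 = {r2,i2,r3,i3}
// Two vperms pair up the terms of the first butterfly stage, vaddfp/vsubfp
// perform it, a second shuffle and butterfly complete the transform, and the
// final two vperms split the result into an all-real and an all-imaginary
// vector, the block layout described at the top of this file.
// FFT4x2 below is the same sequence applied to two independent 4-point
// transforms, interleaved so the two dependency chains can overlap.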

.macro FFT4x2 a0, a1, b0, b1, a2, a3, b2, b3
    vperm   \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
    vperm   \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
    vperm   \b2,\b0,\b1,v20
    vperm   \b3,\b0,\b1,v21
    vaddfp  \a0,\a2,\a3                         // {t1,t2,t6,t5}
    vsubfp  \a1,\a2,\a3                         // {t3,t4,t8,t7}
    vaddfp  \b0,\b2,\b3
    vsubfp  \b1,\b2,\b3
    vmrghw  \a2,\a0,\a1     // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
    vperm   \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
    vmrghw  \b2,\b0,\b1
    vperm   \b3,\b0,\b1,v22
    vaddfp  \a0,\a2,\a3                         // {r0,r1,i0,i1}
    vsubfp  \a1,\a2,\a3                         // {r2,r3,i2,i3}
    vaddfp  \b0,\b2,\b3
    vsubfp  \b1,\b2,\b3
    vperm   \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
    vperm   \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
    vperm   \b2,\b0,\b1,v23
    vperm   \b3,\b0,\b1,v24
.endm

.macro FFT8 a0, a1, b0, b1, a2, a3, b2, b3, b4 // in,out:a0-b1
    vmrghw  \b2,\b0,\b1     // vcprm(0,s0,1,s1) // {r4,r6,i4,i6}
    vmrglw  \b3,\b0,\b1     // vcprm(2,s2,3,s3) // {r5,r7,i5,i7}
    vperm   \a2,\a0,\a1,v20 // FFT4 ...
    vperm   \a3,\a0,\a1,v21
    vaddfp  \b0,\b2,\b3                         // {t1,t3,t2,t4}
    vsubfp  \b1,\b2,\b3                         // {r5,r7,i5,i7}
    vperm   \b4,\b1,\b1,v25 // vcprm(2,3,0,1)   // {i5,i7,r5,r7}
    vaddfp  \a0,\a2,\a3
    vsubfp  \a1,\a2,\a3
    vmaddfp \b1,\b1,v17,v14 // * {-1,1,1,-1}/sqrt(2)
    vmaddfp \b1,\b4,v18,\b1 // * { 1,1,1,1 }/sqrt(2) // {t8,ta,t7,t9}
    vmrghw  \a2,\a0,\a1
    vperm   \a3,\a0,\a1,v22
    vperm   \b2,\b0,\b1,v26 // vcprm(1,2,s3,s0) // {t3,t2,t9,t8}
    vperm   \b3,\b0,\b1,v27 // vcprm(0,3,s2,s1) // {t1,t4,t7,ta}
    vaddfp  \a0,\a2,\a3
    vsubfp  \a1,\a2,\a3
    vaddfp  \b0,\b2,\b3                         // {t1,t2,t9,ta}
    vsubfp  \b1,\b2,\b3                         // {t6,t5,tc,tb}
    vperm   \a2,\a0,\a1,v23
    vperm   \a3,\a0,\a1,v24
    vperm   \b2,\b0,\b1,v28 // vcprm(0,2,s1,s3) // {t1,t9,t5,tb}
    vperm   \b3,\b0,\b1,v29 // vcprm(1,3,s0,s2) // {t2,ta,t6,tc}
    vsubfp  \b0,\a2,\b2                         // {r4,r5,r6,r7}
    vsubfp  \b1,\a3,\b3                         // {i4,i5,i6,i7}
    vaddfp  \a0,\a2,\b2                         // {r0,r1,r2,r3}
    vaddfp  \a1,\a3,\b3                         // {i0,i1,i2,i3}
.endm
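
// FFT8 interleaves an FFT4 on z[0..3] (the \a registers, the same shuffle
// sequence as above) with a 4-point half-transform of z[4..7] (the \b
// registers), whose butterfly differences are combined with the
// +/-1/sqrt(2) constants in v17 and v18 (the 45-degree twiddles).  The
// final vadd/vsub pairs merge the two halves into split real/imaginary
// blocks for all eight outputs.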

.macro BF d0,d1,s0,s1
    vsubfp  \d1,\s0,\s1
    vaddfp  \d0,\s0,\s1
.endm

.macro zip d0,d1,s0,s1
    vmrghw  \d0,\s0,\s1
    vmrglw  \d1,\s0,\s1
.endm
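
// BF is the basic butterfly: \d0 = \s0 + \s1, \d1 = \s0 - \s1 (elementwise).
// zip re-interleaves a {4x real} and a {4x imaginary} vector back into two
// vectors of {re,im} pairs; the *_interleave entry points below use it to
// return data in normal FFTComplex order.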

.macro def_fft4 interleave
fft4\interleave\()_altivec:
    lvx    v0, 0,r3
    lvx    v1,r9,r3
    FFT4   v0,v1,v2,v3
.ifnb \interleave
    zip    v0,v1,v2,v3
    stvx   v0, 0,r3
    stvx   v1,r9,r3
.else
    stvx   v2, 0,r3
    stvx   v3,r9,r3
.endif
    blr
.endm
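
// def_fft4/def_fft8/def_fft16 emit the leaf transforms.  Each expects the
// data pointer in r3 and uses r9 (always 16) as the second lvx/stvx index.
// With a non-blank \interleave argument the results are zipped back to
// interleaved {re,im} order before the store; otherwise they are stored in
// the split block layout consumed by the recursive passes.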

.macro def_fft8 interleave
fft8\interleave\()_altivec:
    addi   r4,r3,32
    lvx    v0, 0,r3
    lvx    v1,r9,r3
    lvx    v2, 0,r4
    lvx    v3,r9,r4
    FFT8   v0,v1,v2,v3,v4,v5,v6,v7,v8
.ifnb \interleave
    zip    v4,v5,v0,v1
    zip    v6,v7,v2,v3
    stvx   v4, 0,r3
    stvx   v5,r9,r3
    stvx   v6, 0,r4
    stvx   v7,r9,r4
.else
    stvx   v0, 0,r3
    stvx   v1,r9,r3
    stvx   v2, 0,r4
    stvx   v3,r9,r4
.endif
    blr
.endm

.macro def_fft16 interleave
fft16\interleave\()_altivec:
    addi   r5,r3,64
    addi   r6,r3,96
    addi   r4,r3,32
    lvx    v0, 0,r5
    lvx    v1,r9,r5
    lvx    v2, 0,r6
    lvx    v3,r9,r6
    FFT4x2 v0,v1,v2,v3,v4,v5,v6,v7
    lvx    v0, 0,r3
    lvx    v1,r9,r3
    lvx    v2, 0,r4
    lvx    v3,r9,r4
    FFT8   v0,v1,v2,v3,v8,v9,v10,v11,v12
    vmaddfp  v8,v4,v15,v14  // r2*wre
    vmaddfp  v9,v5,v15,v14  // i2*wre
    vmaddfp  v10,v6,v15,v14 // r3*wre
    vmaddfp  v11,v7,v15,v14 // i3*wre
    vmaddfp  v8,v5,v16,v8   // i2*wim
    vnmsubfp v9,v4,v16,v9   // r2*wim
    vnmsubfp v10,v7,v16,v10 // i3*wim
    vmaddfp  v11,v6,v16,v11 // r3*wim
    BF     v10,v12,v10,v8
    BF     v11,v13,v9,v11
    BF     v0,v4,v0,v10
    BF     v3,v7,v3,v12
    BF     v1,v5,v1,v11
    BF     v2,v6,v2,v13
.ifnb \interleave
    zip    v8, v9,v0,v1
    zip    v10,v11,v2,v3
    zip    v12,v13,v4,v5
    zip    v14,v15,v6,v7
    stvx   v8, 0,r3
    stvx   v9,r9,r3
    stvx   v10, 0,r4
    stvx   v11,r9,r4
    stvx   v12, 0,r5
    stvx   v13,r9,r5
    stvx   v14, 0,r6
    stvx   v15,r9,r6
.else
    stvx   v0, 0,r3
    stvx   v4, 0,r5
    stvx   v3,r9,r4
    stvx   v7,r9,r6
    stvx   v1,r9,r3
    stvx   v5,r9,r5
    stvx   v2, 0,r4
    stvx   v6, 0,r6
.endif
    blr
.endm
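
// fft16 = one FFT8 on z[0..7] plus two FFT4s (via FFT4x2) on z[8..11] and
// z[12..15], combined using the 16-point twiddles kept in v15 (cos) and
// v16 (sin).  vmaddfp/vnmsubfp form the re*wre +/- im*wim products of the
// complex rotation in one fused step each, and the BF chain then
// butterflies the twiddled upper half against the FFT8 results, the same
// combine step fft_pass performs for larger sizes.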

// void pass(float *z, float *wre, int n)
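//
// r3 = z (split-format data), r4 = wre (a cosine table read forwards),
// r5 = n = number of {4x real, 4x imaginary} blocks per quarter of z.
// o1/o2/o3 are the byte offsets of the three upper quarters; wim starts
// n*16 bytes further into the table and is read backwards.  Each iteration
// loads one real and one imaginary vector from every quarter, applies the
// wre/wim twiddles to the two upper quarters, butterflies them against the
// two lower quarters, and stores either split or interleaved output
// depending on \interleave.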
.macro PASS interleave, suffix
fft_pass\suffix\()_altivec:
    mtctr  r5
    slwi   r0,r5,4
    slwi   r7,r5,6    // o2
    slwi   r5,r5,5    // o1
    add    r10,r5,r7  // o3
    add    r0,r4,r0   // wim
    addi   r6,r5,16   // o1+16
    addi   r8,r7,16   // o2+16
    addi   r11,r10,16 // o3+16
1:
    lvx    v8, 0,r4   // wre
    lvx    v10, 0,r0  // wim
    sub    r0,r0,r9
    lvx    v9, 0,r0
    vperm  v9,v9,v10,v19 // vcprm(s0,3,2,1) => wim[0 .. -3]
    lvx    v4,r3,r7      // r2 = z[o2]
    lvx    v5,r3,r8      // i2 = z[o2+16]
    lvx    v6,r3,r10     // r3 = z[o3]
    lvx    v7,r3,r11     // i3 = z[o3+16]
    vmaddfp  v10,v4,v8,v14 // r2*wre
    vmaddfp  v11,v5,v8,v14 // i2*wre
    vmaddfp  v12,v6,v8,v14 // r3*wre
    vmaddfp  v13,v7,v8,v14 // i3*wre
    lvx    v0, 0,r3      // r0 = z[0]
    lvx    v3,r3,r6      // i1 = z[o1+16]
    vmaddfp  v10,v5,v9,v10 // i2*wim
    vnmsubfp v11,v4,v9,v11 // r2*wim
    vnmsubfp v12,v7,v9,v12 // i3*wim
    vmaddfp  v13,v6,v9,v13 // r3*wim
    lvx    v1,r3,r9      // i0 = z[16]
    lvx    v2,r3,r5      // r1 = z[o1]
    BF     v12,v8,v12,v10
    BF     v13,v9,v11,v13
    BF     v0,v4,v0,v12
    BF     v3,v7,v3,v8
.if !\interleave
    stvx   v0, 0,r3
    stvx   v4,r3,r7
    stvx   v3,r3,r6
    stvx   v7,r3,r11
.endif
    BF     v1,v5,v1,v13
    BF     v2,v6,v2,v9
.if !\interleave
    stvx   v1,r3,r9
    stvx   v2,r3,r5
    stvx   v5,r3,r8
    stvx   v6,r3,r10
.else
    vmrghw v8,v0,v1
    vmrglw v9,v0,v1
    stvx   v8, 0,r3
    stvx   v9,r3,r9
    vmrghw v8,v2,v3
    vmrglw v9,v2,v3
    stvx   v8,r3,r5
    stvx   v9,r3,r6
    vmrghw v8,v4,v5
    vmrglw v9,v4,v5
    stvx   v8,r3,r7
    stvx   v9,r3,r8
    vmrghw v8,v6,v7
    vmrglw v9,v6,v7
    stvx   v8,r3,r10
    stvx   v9,r3,r11
.endif
    addi   r3,r3,32
    addi   r4,r4,16
    bdnz   1b
    sub    r3,r3,r5
    blr
.endm

#define M_SQRT1_2 0.70710678118654752440  /* 1/sqrt(2) */

#define WORD_0  0x00,0x01,0x02,0x03
#define WORD_1  0x04,0x05,0x06,0x07
#define WORD_2  0x08,0x09,0x0a,0x0b
#define WORD_3  0x0c,0x0d,0x0e,0x0f
#define WORD_s0 0x10,0x11,0x12,0x13
#define WORD_s1 0x14,0x15,0x16,0x17
#define WORD_s2 0x18,0x19,0x1a,0x1b
#define WORD_s3 0x1c,0x1d,0x1e,0x1f

#define vcprm(a, b, c, d) .byte WORD_##a, WORD_##b, WORD_##c, WORD_##d
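
// vcprm(a,b,c,d) emits the 16 control bytes for a vperm that selects 32-bit
// words a,b,c,d from the concatenation of the two source vectors (0..3 from
// the first source, s0..s3 from the second).  For example,
//     vcprm(0,1,s2,s1)
// assembles to
//     .byte 0x00,0x01,0x02,0x03, 0x04,0x05,0x06,0x07,
//           0x18,0x19,0x1a,0x1b, 0x14,0x15,0x16,0x17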

.rodata
.align 4
fft_data:
    .float  0, 0, 0, 0
    .float  1, 0.92387953, M_SQRT1_2, 0.38268343
    .float  0, 0.38268343, M_SQRT1_2, 0.92387953
    .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2,-M_SQRT1_2
    .float  M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
    vcprm(s0,3,2,1)
    vcprm(0,1,s2,s1)
    vcprm(2,3,s0,s3)
    vcprm(2,s3,3,s2)
    vcprm(0,1,s0,s1)
    vcprm(2,3,s2,s3)
    vcprm(2,3,0,1)
    vcprm(1,2,s3,s0)
    vcprm(0,3,s2,s1)
    vcprm(0,2,s1,s3)
    vcprm(1,3,s0,s2)
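
// fft_calc loads this table sequentially into v14..v29:
//   v14: zero
//   v15: {1, cos(pi/8), cos(pi/4), cos(3pi/8)}  (wre for fft16)
//   v16: {0, sin(pi/8), sin(pi/4), sin(3pi/8)}  (wim for fft16)
//   v17: {-1, 1, 1,-1} / sqrt(2)
//   v18: { 1, 1, 1, 1} / sqrt(2)
//   v19..v29: the eleven vperm control vectors, in the order referenced by
//             the vcprm() comments in the macros above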

.macro lvm b, r, regs:vararg
    lvx    \r, 0, \b
    addi   \b, \b, 16
.ifnb \regs
    lvm    \b, \regs
.endif
.endm

.macro stvm b, r, regs:vararg
    stvx   \r, 0, \b
    addi   \b, \b, 16
.ifnb \regs
    stvm   \b, \regs
.endif
.endm
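
// lvm/stvm recurse over their vararg register list, loading/storing one
// vector per 16-byte slot and advancing the base register \b as they go,
// e.g. "lvm r6, v20, v21" expands to
//     lvx v20,0,r6; addi r6,r6,16; lvx v21,0,r6; addi r6,r6,16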
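// ff_fft_calc[_interleave]_altivec(FFTContext *s, FFTComplex *z):
// save the nonvolatile vectors v20..v29 and VRSAVE on the stack, load the
// constant table into v14..v29, set up r9/r12 as described at the top of
// the file, then read nbits from the start of the context and jump through
// fft_dispatch_tab with r3 = z (entry 0 of the table is fft4, hence the
// subtract of 2).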
.macro fft_calc interleave
extfunc ff_fft_calc\interleave\()_altivec
    mflr   r0
    stp    r0, 2*PS(r1)
    stpu   r1, -(160+16*PS)(r1)
    get_got r11
    addi   r6, r1, 16*PS
    stvm   r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
    mfvrsave r0
    stw    r0, 15*PS(r1)
    li     r6, 0xfffffffc
    mtvrsave r6

    movrel r6, fft_data, r11
    lvm    r6, v14, v15, v16, v17, v18, v19, v20, v21
    lvm    r6, v22, v23, v24, v25, v26, v27, v28, v29

    li     r9, 16
    movrel r12, X(ff_cos_tabs), r11

    movrel r6, fft_dispatch_tab\interleave\()_altivec, r11
    lwz    r3, 0(r3)
    subi   r3, r3, 2
    slwi   r3, r3, 2+ARCH_PPC64
    lpx    r3, r3, r6
    mtctr  r3
    mr     r3, r4
    bctrl

    addi   r6, r1, 16*PS
    lvm    r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
    lwz    r6, 15*PS(r1)
    mtvrsave r6
    lp     r1, 0(r1)
    lp     r0, 2*PS(r1)
    mtlr   r0
    blr
.endm
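// DECL_FFT emits one level of the split-radix recursion:
//   fftN(z): fftN/2(z); fftN/4(z + N/2); fftN/4(z + 3N/4); pass(z, cos_tab, N/16)
// The addi2 offsets are in bytes (8 per complex value), and the final addi2
// rewinds r3 to the start of the block.  LR is saved at a per-level slot,
// PS*(bits-3) bytes into the frame set up by fft_calc, so every recursion
// depth keeps its own return address.  The routine then tail-calls fft_pass
// with r4 = ff_cos_tabs[bits] and r5 = N/16.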
.macro DECL_FFT suffix, bits, n, n2, n4
fft\n\suffix\()_altivec:
    mflr  r0
    stp   r0,PS*(\bits-3)(r1)
    bl    fft\n2\()_altivec
    addi2 r3,\n*4
    bl    fft\n4\()_altivec
    addi2 r3,\n*2
    bl    fft\n4\()_altivec
    addi2 r3,\n*-6
    lp    r0,PS*(\bits-3)(r1)
    lp    r4,\bits*PS(r12)
    mtlr  r0
    li    r5,\n/16
    b     fft_pass\suffix\()_altivec
.endm
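// DECL_FFTS instantiates a full family: the fft4/fft8/fft16 leaves, the
// generic pass, every DECL_FFT size up to 65536, the ff_fft_calc entry
// point, and the dispatch table it indexes with nbits-2 (entry 0 = fft4).
// It is expanded twice below: once for split output and once for the
// _interleave variants that return normal FFTComplex order.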
.macro DECL_FFTS interleave, suffix
    .text
    def_fft4  \suffix
    def_fft8  \suffix
    def_fft16 \suffix
    PASS \interleave, \suffix
    DECL_FFT \suffix, 5,   32,   16,    8
    DECL_FFT \suffix, 6,   64,   32,   16
    DECL_FFT \suffix, 7,  128,   64,   32
    DECL_FFT \suffix, 8,  256,  128,   64
    DECL_FFT \suffix, 9,  512,  256,  128
    DECL_FFT \suffix,10, 1024,  512,  256
    DECL_FFT \suffix,11, 2048, 1024,  512
    DECL_FFT \suffix,12, 4096, 2048, 1024
    DECL_FFT \suffix,13, 8192, 4096, 2048
    DECL_FFT \suffix,14,16384, 8192, 4096
    DECL_FFT \suffix,15,32768,16384, 8192
    DECL_FFT \suffix,16,65536,32768,16384

    fft_calc \suffix

    .rodata
    .align 3
fft_dispatch_tab\suffix\()_altivec:
    PTR fft4\suffix\()_altivec
    PTR fft8\suffix\()_altivec
    PTR fft16\suffix\()_altivec
    PTR fft32\suffix\()_altivec
    PTR fft64\suffix\()_altivec
    PTR fft128\suffix\()_altivec
    PTR fft256\suffix\()_altivec
    PTR fft512\suffix\()_altivec
    PTR fft1024\suffix\()_altivec
    PTR fft2048\suffix\()_altivec
    PTR fft4096\suffix\()_altivec
    PTR fft8192\suffix\()_altivec
    PTR fft16384\suffix\()_altivec
    PTR fft32768\suffix\()_altivec
    PTR fft65536\suffix\()_altivec
.endm

DECL_FFTS 0
DECL_FFTS 1, _interleave

#endif /* HAVE_GNU_AS && HAVE_ALTIVEC */