Imported Debian version 2.5.0~trusty1.1
[deb_ffmpeg.git] / ffmpeg / libavcodec / aarch64 / fft_neon.S
CommitLineData
2ba45a60
DM
1/*
2 * ARM NEON optimised FFT
3 *
4 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
5 * Copyright (c) 2009 Naotoshi Nojiri
6 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
7 *
8 * This algorithm (though not any of the implementation details) is
9 * based on libdjbfft by D. J. Bernstein.
10 *
11 * This file is part of FFmpeg.
12 *
13 * FFmpeg is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU Lesser General Public
15 * License as published by the Free Software Foundation; either
16 * version 2.1 of the License, or (at your option) any later version.
17 *
18 * FFmpeg is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 * Lesser General Public License for more details.
22 *
23 * You should have received a copy of the GNU Lesser General Public
24 * License along with FFmpeg; if not, write to the Free Software
25 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 */
27
28#include "libavutil/aarch64/asm.S"
29
// 1/sqrt(2). Used at the end of this file to build the mppm constant table.
30#define M_SQRT1_2 0.70710678118654752440
31
// transpose d0, d1, s0, s1
//   d0 = trn1(s0, s1): the even-numbered elements of s0 and s1, interleaved
//   d1 = trn2(s0, s1): the odd-numbered elements of s0 and s1, interleaved
// Element size comes from the arrangement suffix on the operands; every use
// in this file passes .2d registers, giving d0 = {s0.d[0], s1.d[0]} and
// d1 = {s0.d[1], s1.d[1]}.
// d0 is written before the sources are read a second time, so d0 must not
// be the same register as s0 or s1.
32.macro transpose d0, d1, s0, s1
33 trn1 \d0, \s0, \s1
34 trn2 \d1, \s0, \s1
35.endm
36
37
// fft4_neon: in-place 4-point transform.
//   x0 = pointer to 8 consecutive single-precision floats. The lane comments
//        treat them as 4 interleaved {re, im} pairs z[0..3].
//   Reads and rewrites the 32 bytes at [x0]; x0 itself is not modified.
//   Writes v0-v7, v16, v17.
// Lane comments list the low lane first.
38function fft4_neon
39 ld1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0] // v0..v3 = z[0]..z[3], one pair per register
40
41 fadd v4.2s, v0.2s, v1.2s // r0+r1,i0+i1
42 fsub v6.2s, v0.2s, v1.2s // r0-r1,i0-i1
43
44 ext v16.8b, v2.8b, v3.8b, #4 // v16 = {i2,r3}
45 ext v17.8b, v3.8b, v2.8b, #4 // v17 = {i3,r2}
46
47 fadd v5.2s, v2.2s, v3.2s // r2+r3,i2+i3
48 fsub v7.2s, v16.2s, v17.2s // i2-i3,r3-r2
49
50 fadd v0.2s, v4.2s, v5.2s // stored to z[0]
51 fsub v2.2s, v4.2s, v5.2s // stored to z[2]
52 fadd v1.2s, v6.2s, v7.2s // stored to z[1]
53 fsub v3.2s, v6.2s, v7.2s // stored to z[3]
54
55 st1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]
56
57 ret
58endfunc
59
// fft8_neon: in-place 8-point transform.
//   x0  = pointer to 16 consecutive single-precision floats. The lane
//         comments treat them as 8 interleaved {re, im} pairs.
//   v28 = must already hold the mppm table {-w, +w, +w, -w}, w = M_SQRT1_2;
//         ff_fft_calc_neon loads it before dispatching here.
//   On return x0 has been advanced by 32 bytes and x1 holds the original x0.
//   Writes x0, x1, v0-v7, v16-v27.
// The instructions working on the lower four pairs (v0-v3) are interleaved
// with those working on the upper four (v16-v19).
60function fft8_neon
61 mov x1, x0 // keep the base address for the final store
62 ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32 // first 32 bytes; x0 += 32
63 ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0] // second 32 bytes
64 ext v22.8b, v2.8b, v3.8b, #4 // v22 = {i2,r3}
65 ext v23.8b, v3.8b, v2.8b, #4 // v23 = {i3,r2}
66 fadd v4.2s, v16.2s, v17.2s // r4+r5,i4+i5
67 fadd v5.2s, v18.2s, v19.2s // r6+r7,i6+i7
68 fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5
69 fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7
70 rev64 v27.2s, v28.2s // v27 = {+w,-w}: low pair of v28 with its lanes swapped
71 fadd v20.2s, v0.2s, v1.2s // r0+r1,i0+i1
72 fadd v21.2s, v2.2s, v3.2s // r2+r3,i2+i3
73 fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w
74 ext v6.8b, v4.8b, v5.8b, #4
75 ext v7.8b, v5.8b, v4.8b, #4
76 fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w
77 fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2
78 fsub v22.2s, v0.2s, v1.2s // r0-r1,i0-i1
79 fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w
80 fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w
81 fadd v0.2s, v20.2s, v21.2s
82 fsub v2.2s, v20.2s, v21.2s
83 fadd v1.2s, v22.2s, v23.2s
84 rev64 v26.2s, v26.2s // swap lanes: a2i*w,-a2r*w
85 rev64 v27.2s, v27.2s // swap lanes: -a3i*w,a3r*w
86 fsub v3.2s, v22.2s, v23.2s
87 fsub v6.2s, v6.2s, v7.2s
88 fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2
89 fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6
90 fadd v7.2s, v4.2s, v5.2s
91 fsub v18.2s, v2.2s, v6.2s
92 ext v26.8b, v24.8b, v25.8b, #4
93 ext v27.8b, v25.8b, v24.8b, #4
94 fadd v2.2s, v2.2s, v6.2s
95 fsub v16.2s, v0.2s, v7.2s
96 fadd v5.2s, v25.2s, v24.2s
97 fsub v4.2s, v26.2s, v27.2s
98 fadd v0.2s, v0.2s, v7.2s
99 fsub v17.2s, v1.2s, v5.2s
100 fsub v19.2s, v3.2s, v4.2s
101 fadd v3.2s, v3.2s, v4.2s
102 fadd v1.2s, v1.2s, v5.2s
103
104 st1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0] // upper 32 bytes (x0 was advanced by the first load)
105 st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x1] // lower 32 bytes
106
107 ret
108endfunc
109
// fft16_neon: in-place 16-point transform.
//   x0  = pointer to 32 consecutive single-precision floats. The lane
//         comments treat them as 16 interleaved {re, im} pairs z[0..15].
//   x14 = address of four twiddle floats; ff_fft_calc_neon points it at
//         ff_cos_16.
//   v28 = mppm, v29 = pmmp, v30 = trans4_float, v31 = trans8_float, all
//         loaded by ff_fft_calc_neon before dispatching here.
//   On return x0 and x1 both point 128 bytes past the original x0.
//   Writes x0, x1, v0-v7, v16-v27.
// Structure: the first 64 bytes go through the same instruction sequence as
// fft8_neon, interleaved with two 4-point transforms on the last 64 bytes
// (handled two at a time in .4s registers), followed by an inlined
// combining pass in two halves.
110function fft16_neon
111 mov x1, x0 // keep the base address for the final stores
112 ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32 // z[0..3]
113 ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32 // z[4..7]
114 ext v22.8b, v2.8b, v3.8b, #4 // v22 = {i2,r3}
115 ext v23.8b, v3.8b, v2.8b, #4 // v23 = {i3,r2}
116 fadd v4.2s, v16.2s, v17.2s // r4+r5,i4+i5
117 fadd v5.2s, v18.2s, v19.2s // r6+r7,i6+i7
118 fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5
119 fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7
120 rev64 v27.2s, v28.2s // v27 = {+w,-w}: low pair of v28 (mppm) with its lanes swapped
121 fadd v20.2s, v0.2s, v1.2s // r0+r1,i0+i1
122 fadd v21.2s, v2.2s, v3.2s // r2+r3,i2+i3
123 fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w
124 ext v6.8b, v4.8b, v5.8b, #4
125 ext v7.8b, v5.8b, v4.8b, #4
126 fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w
127 fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2
128 fsub v22.2s, v0.2s, v1.2s // r0-r1,i0-i1
129 fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w
130 fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w
131 fadd v0.2s, v20.2s, v21.2s
132 fsub v2.2s, v20.2s, v21.2s
133 fadd v1.2s, v22.2s, v23.2s
134 rev64 v26.2s, v26.2s
135 rev64 v27.2s, v27.2s
136 fsub v3.2s, v22.2s, v23.2s
137 fsub v6.2s, v6.2s, v7.2s
138 fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2
139 fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6
140 fadd v7.2s, v4.2s, v5.2s
141 fsub v18.2s, v2.2s, v6.2s
142 ld1 {v20.4s,v21.4s}, [x0], #32 // v20 = {z[8],z[9]}, v21 = {z[10],z[11]}
143 ld1 {v22.4s,v23.4s}, [x0], #32 // v22 = {z[12],z[13]}, v23 = {z[14],z[15]}
144 ext v26.8b, v24.8b, v25.8b, #4
145 ext v27.8b, v25.8b, v24.8b, #4
146 fadd v2.2s, v2.2s, v6.2s
147 fsub v16.2s, v0.2s, v7.2s
148 fadd v5.2s, v25.2s, v24.2s
149 fsub v4.2s, v26.2s, v27.2s
150 transpose v24.2d, v25.2d, v20.2d, v22.2d // v24 = {z[8],z[12]}, v25 = {z[9],z[13]}
151 transpose v26.2d, v27.2d, v21.2d, v23.2d // v26 = {z[10],z[14]}, v27 = {z[11],z[15]}
152 fadd v0.2s, v0.2s, v7.2s
153 fsub v17.2s, v1.2s, v5.2s
154 fsub v19.2s, v3.2s, v4.2s
155 fadd v3.2s, v3.2s, v4.2s
156 fadd v1.2s, v1.2s, v5.2s
157 ext v20.16b, v21.16b, v21.16b, #4 // v21 rotated by one float: lane 1 moves to lane 0
158 ext v21.16b, v23.16b, v23.16b, #4 // v23 rotated the same way
159
// Pack the eight 64-bit results of the first half into four 128-bit registers.
160 zip1 v0.2d, v0.2d, v1.2d // {z[0], z[1]}
161 zip1 v1.2d, v2.2d, v3.2d // {z[2], z[3]}
162 zip1 v2.2d, v16.2d, v17.2d // {z[o1], z[o1+1]}
163 zip1 v3.2d, v18.2d, v19.2d // {z[o1+2],z[o1+3]}
164
165 // 2 x fft4
166 transpose v22.2d, v23.2d, v20.2d, v21.2d
167
168 fadd v4.4s, v24.4s, v25.4s
169 fadd v5.4s, v26.4s, v27.4s
170 fsub v6.4s, v24.4s, v25.4s
171 fsub v7.4s, v22.4s, v23.4s
172
173 ld1 {v23.4s}, [x14] // v23 = four twiddle floats; lanes 1, 2 and 3 are used below
174
175 fadd v24.4s, v4.4s, v5.4s // {z[o2+0],z[o2+1]}
176 fsub v26.4s, v4.4s, v5.4s // {z[o2+2],z[o2+3]}
177 fadd v25.4s, v6.4s, v7.4s // {z[o3+0],z[o3+1]}
178 fsub v27.4s, v6.4s, v7.4s // {z[o3+2],z[o3+3]}
179
180 //fft_pass_neon_16
// First half: same instruction pattern as the peeled first iteration of
// fft_pass_neon (v24 is used unmultiplied, only v25 gets a twiddle).
181 rev64 v7.4s, v25.4s // swap the two floats of each pair
182 fmul v25.4s, v25.4s, v23.s[1]
183 fmul v7.4s, v7.4s, v29.4s // apply the pmmp sign pattern to the swapped copy
184 fmla v25.4s, v7.4s, v23.s[3] // {t1a,t2a,t5a,t6a}
185
186 zip1 v20.4s, v24.4s, v25.4s
187 zip2 v21.4s, v24.4s, v25.4s
188 fneg v22.4s, v20.4s
189 fadd v4.4s, v21.4s, v20.4s
190 fsub v6.4s, v20.4s, v21.4s // just the second half
191 fadd v5.4s, v21.4s, v22.4s // just the first half
192
193 tbl v4.16b, {v4.16b}, v30.16b // trans4_float
194 tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
195
196 fsub v20.4s, v0.4s, v4.4s // {z[o2],z[o2+1]}
197 fadd v16.4s, v0.4s, v4.4s // {z[0], z[1]}
198 fsub v22.4s, v2.4s, v5.4s // {z[o3],z[o3+1]}
199 fadd v18.4s, v2.4s, v5.4s // {z[o1],z[o1+1]}
200
201//second half
202 rev64 v6.4s, v26.4s
203 fmul v26.4s, v26.4s, v23.s[2]
204 rev64 v7.4s, v27.4s
205 fmul v27.4s, v27.4s, v23.s[3]
206 fmul v6.4s, v6.4s, v29.4s // pmmp sign pattern
207 fmul v7.4s, v7.4s, v29.4s // pmmp sign pattern
208 fmla v26.4s, v6.4s, v23.s[2] // {t1,t2,t5,t6}
209 fmla v27.4s, v7.4s, v23.s[1] // {t1a,t2a,t5a,t6a}
210
211 zip1 v24.4s, v26.4s, v27.4s
212 zip2 v25.4s, v26.4s, v27.4s
213 fneg v26.4s, v24.4s
214 fadd v4.4s, v25.4s, v24.4s
215 fsub v6.4s, v24.4s, v25.4s // just the second half
216 fadd v5.4s, v25.4s, v26.4s // just the first half
217
218 tbl v4.16b, {v4.16b}, v30.16b // trans4_float
219 tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
220
221 fadd v17.4s, v1.4s, v4.4s // {z[2], z[3]}
222 fsub v21.4s, v1.4s, v4.4s // {z[o2+2],z[o2+3]}
223 fadd v19.4s, v3.4s, v5.4s // {z[o1+2],z[o1+3]}
224 fsub v23.4s, v3.4s, v5.4s // {z[o3+2],z[o3+3]} (the twiddles in v23 are no longer needed)
225
// Write all 128 bytes back in memory order, starting at the original x0.
226 st1 {v16.4s,v17.4s}, [x1], #32
227 st1 {v18.4s,v19.4s}, [x1], #32
228 st1 {v20.4s,v21.4s}, [x1], #32
229 st1 {v22.4s,v23.4s}, [x1], #32
230
231 ret
232endfunc
233
234
// Byte-index vector for a one-register tbl; ff_fft_calc_neon loads it into
// v30. It selects 32-bit words 0, 2, 1, 3 of the source register, i.e. it
// swaps the two middle floats.
235const trans4_float, align=4
236 .byte 0, 1, 2, 3
237 .byte 8, 9, 10, 11
238 .byte 4, 5, 6, 7
239 .byte 12, 13, 14, 15
240endconst
241
// Byte-index vector for a two-register tbl; ff_fft_calc_neon loads it into
// v31. Indices 0-15 address the first table register and 16-31 the second,
// so the result is {second[2], first[0], second[3], first[1]} in 32-bit
// words.
242const trans8_float, align=4
243 .byte 24, 25, 26, 27
244 .byte 0, 1, 2, 3
245 .byte 28, 29, 30, 31
246 .byte 4, 5, 6, 7
247endconst
248
// fft_pass_neon: combining pass over four equally spaced regions of z.
//   x0  = z
//   x2  = n, the iteration count. The first iteration is peeled and the
//         loop at 1: is do-while shaped with an n - 1 counter, so n must be
//         at least 2 (with n = 1 the counter would pass through zero).
//   x4  = twiddle table; read forwards 8 bytes per iteration ("wre"). A
//         second pointer starting at x4 + 8*n walks backwards ("wim").
//   x7  = post-index step for the wim pointer; ff_fft_calc_neon sets it
//         to -8.
//   v29 = pmmp, v30 = trans4_float, v31 = trans8_float, all loaded by
//         ff_fft_calc_neon.
//   The four regions start at byte offsets 0, 16*n, 32*n and 48*n from x0
//   (z, z[o1], z[o2], z[o3]); each iteration handles 16 bytes of each.
//   Writes x0-x6, v4-v7, v16, v17, v20-v27 and the condition flags.
249function fft_pass_neon
250 sub x6, x2, #1 // n - 1, loop counter
251 lsl x5, x2, #3 // 8*n bytes = 2 * n * sizeof FFTSample
252 lsl x1, x2, #4 // 16*n bytes = 2 * n * sizeof FFTComplex
253 add x5, x4, x5 // wim
254 add x3, x1, x2, lsl #5 // 16*n + 32*n = 48*n bytes, the offset of z[o3]
255 add x2, x0, x2, lsl #5 // &z[o2]
256 add x3, x0, x3 // &z[o3]
257 add x1, x0, x1 // &z[o1]
// Peeled first iteration: only the v25 half is multiplied by twiddles, and
// a single wim float is loaded.
258 ld1 {v20.4s},[x2] // {z[o2],z[o2+1]}
259 ld1 {v22.4s},[x3] // {z[o3],z[o3+1]}
260 ld1 {v4.2s}, [x4], #8 // {wre[0],wre[1]}
261 trn2 v25.2d, v20.2d, v22.2d // v25 = {z[o2+1], z[o3+1]}
262 sub x5, x5, #4 // wim--
263 trn1 v24.2d, v20.2d, v22.2d // v24 = {z[o2], z[o3]}
264 ld1 {v5.s}[0], [x5], x7 // d5[0] = wim[-1]
265 rev64 v7.4s, v25.4s // swap the two floats of each pair
266 fmul v25.4s, v25.4s, v4.s[1]
267 ld1 {v16.4s}, [x0] // {z[0],z[1]}
268 fmul v7.4s, v7.4s, v29.4s // apply the pmmp sign pattern to the swapped copy
269 ld1 {v17.4s}, [x1] // {z[o1],z[o1+1]}
270 prfm pldl1keep, [x2, #16] // prefetch the next 16 bytes of each region
271 prfm pldl1keep, [x3, #16]
272 fmla v25.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a}
273 prfm pldl1keep, [x0, #16]
274 prfm pldl1keep, [x1, #16]
275
276 zip1 v20.4s, v24.4s, v25.4s
277 zip2 v21.4s, v24.4s, v25.4s
278 fneg v22.4s, v20.4s
279 fadd v4.4s, v21.4s, v20.4s
280 fsub v6.4s, v20.4s, v21.4s // just the second half
281 fadd v5.4s, v21.4s, v22.4s // just the first half
282
283 tbl v4.16b, {v4.16b}, v30.16b // trans4_float
284 tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
285
286 fadd v20.4s, v16.4s, v4.4s
287 fsub v22.4s, v16.4s, v4.4s
288 fadd v21.4s, v17.4s, v5.4s
289 st1 {v20.4s}, [x0], #16 // {z[0], z[1]}
290 fsub v23.4s, v17.4s, v5.4s
291
292 st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]}
293 st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]}
294 st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]}
// Remaining n - 1 iterations: both halves are multiplied by twiddles.
2951:
296 ld1 {v20.4s},[x2] // {z[o2],z[o2+1]}
297 ld1 {v22.4s},[x3] // {z[o3],z[o3+1]}
298 ld1 {v4.2s}, [x4], #8 // {wre[0],wre[1]}
299 transpose v26.2d, v27.2d, v20.2d, v22.2d // v26 = {z[o2], z[o3]}, v27 = {z[o2+1], z[o3+1]}
300 ld1 {v5.2s}, [x5], x7 // {wim[-1],wim[0]}
301 rev64 v6.4s, v26.4s
302 fmul v26.4s, v26.4s, v4.s[0]
303 rev64 v7.4s, v27.4s
304 fmul v27.4s, v27.4s, v4.s[1]
305 fmul v6.4s, v6.4s, v29.4s // pmmp sign pattern
306 fmul v7.4s, v7.4s, v29.4s // pmmp sign pattern
307 ld1 {v16.4s},[x0] // {z[0],z[1]}
308 fmla v26.4s, v6.4s, v5.s[1] // {t1,t2,t5,t6}
309 fmla v27.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a}
310 ld1 {v17.4s},[x1] // {z[o1],z[o1+1]}
311
312 subs x6, x6, #1 // n--; the flags are consumed by the b.ne at the bottom
313
314 zip1 v20.4s, v26.4s, v27.4s
315 zip2 v21.4s, v26.4s, v27.4s
316 fneg v22.4s, v20.4s
317 fadd v4.4s, v21.4s, v20.4s
318 fsub v6.4s, v20.4s, v21.4s // just the second half
319 fadd v5.4s, v21.4s, v22.4s // just the first half
320
321 tbl v4.16b, {v4.16b}, v30.16b // trans4_float
322 tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
323
324 fadd v20.4s, v16.4s, v4.4s
325 fsub v22.4s, v16.4s, v4.4s
326 fadd v21.4s, v17.4s, v5.4s
327 st1 {v20.4s}, [x0], #16 // {z[0], z[1]}
328 fsub v23.4s, v17.4s, v5.4s
329
330 st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]}
331 st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]}
332 st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]}
333 b.ne 1b
334
335 ret
336endfunc
337
// def_fft n, n2, n4: define fft<n>_neon in terms of smaller transforms.
//   n2 and n4 name the functions to call; every instantiation in this file
//   passes n2 = n/2 and n4 = n/4.
//   Generated function: x0 = z. It runs fft<n2>_neon at z, fft<n4>_neon at
//   z + 2*n4 and at z + 3*n4 elements (8 bytes each), then tail-calls
//   fft_pass_neon with x0 = z, x4 = ff_cos_<n> and x2 = n4/2.
//   x28 carries the address across the calls. That works because each
//   generated function saves and restores x28 and the hand-written
//   transforms in this file never write it.
//   Uses 16 bytes of stack.
338.macro def_fft n, n2, n4
339function fft\n\()_neon, align=6
340 sub sp, sp, #16
341 stp x28, x30, [sp] // save x28 and the return address
342 add x28, x0, #\n4*2*8 // x28 = z + 2*n4 elements
343 bl fft\n2\()_neon // size-n2 transform at z
344 mov x0, x28
345 bl fft\n4\()_neon // size-n4 transform at z + 2*n4 elements
346 add x0, x28, #\n4*1*8
347 bl fft\n4\()_neon // size-n4 transform at z + 3*n4 elements
348 sub x0, x28, #\n4*2*8 // x0 = original z
349 ldp x28, x30, [sp], #16 // restore x28 and x30, release the stack slot
350 movrel x4, X(ff_cos_\n) // twiddle table for this size
351 mov x2, #\n4>>1 // iteration count for fft_pass_neon
352 b fft_pass_neon // tail call: its ret returns to our caller
353endfunc
354.endm
355
// Instantiate fft32_neon .. fft65536_neon. Each size calls the n/2 and n/4
// functions, so the chain bottoms out in the hand-written fft16_neon and
// fft8_neon.
356 def_fft 32, 16, 8
357 def_fft 64, 32, 16
358 def_fft 128, 64, 32
359 def_fft 256, 128, 64
360 def_fft 512, 256, 128
361 def_fft 1024, 512, 256
362 def_fft 2048, 1024, 512
363 def_fft 4096, 2048, 1024
364 def_fft 8192, 4096, 2048
365 def_fft 16384, 8192, 4096
366 def_fft 32768, 16384, 8192
367 def_fft 65536, 32768, 16384
368
// ff_fft_calc_neon (exported): load the shared constants, then jump to the
// transform routine for the requested size.
//   x0 = pointer to a structure whose first 32-bit field is the size
//        exponent (ff_fft_permute_neon labels the same field "nbits")
//   x1 = z, the data buffer; handed to the transform in x0
//   The table index is nbits - 2 with no range check. fft_tab_neon has 15
//   entries (fft4 .. fft65536), so nbits must be in 2..16.
//   Register state handed to the fftN_neon routines:
//     x0 = z, x7 = -8, x14 = ff_cos_16,
//     v28 = mppm, v29 = pmmp, v30 = trans4_float, v31 = trans8_float
//   Leaves via br, so the selected routine returns straight to our caller.
369function ff_fft_calc_neon, export=1
370 prfm pldl1keep, [x1] // start fetching the data buffer
371 movrel x10, trans4_float
372 ldr w2, [x0] // nbits
373 movrel x11, trans8_float
374 sub w2, w2, #2 // table index; the write to w2 clears the upper half of x2
375 movrel x3, fft_tab_neon
376 ld1 {v30.16b}, [x10] // v30 = trans4_float
377 mov x7, #-8 // step used by fft_pass_neon for its wim pointer
378 movrel x12, pmmp
f6fa7814
DM
379 ldr x4, [x3, x2, lsl #3] // x4 = table entry: offset of the routine from fft_tab_neon
380 add x3, x3, x4 // x3 = absolute address of the routine
2ba45a60
DM
381 movrel x13, mppm
382 movrel x14, X(ff_cos_16) // read by fft16_neon
383 ld1 {v31.16b}, [x11] // v31 = trans8_float
384 mov x0, x1 // the transforms take z in x0
385 ld1 {v29.4s}, [x12] // pmmp
386 ld1 {v28.4s}, [x13] // mppm
387 br x3
388endfunc
389
// ff_fft_permute_neon (exported): scatter z into a temporary buffer
// according to an index table, then copy the buffer back over z.
//   x0 = pointer to a structure read at three offsets:
//          +0  32-bit nbits (element count n = 1 << nbits)
//          +8  pointer to revtab, read as packed 16-bit indices
//          +16 pointer to tmp_buf
//   x1 = z, an array of 8-byte elements
//   The scatter loop consumes 2 elements per iteration and the copy loop
//   4 elements (32 bytes), so the copy moves exactly n elements only when n
//   is a multiple of 4.
//   NOTE(review): presumably callers guarantee nbits >= 2; confirm.
//   Writes x0-x6, v0, v1 and the condition flags.
390function ff_fft_permute_neon, export=1
391 mov x6, #1
392 ldr w2, [x0] // nbits
393 ldr x3, [x0, #16] // tmp_buf
394 ldr x0, [x0, #8] // revtab
395 lsl x6, x6, x2 // n = 1 << nbits, scatter-loop counter
396 mov x2, x6 // second copy of n for the rewind and the copy loop
3971:
398 ld1 {v0.2s,v1.2s}, [x1], #16 // next two elements of z
399 ldr w4, [x0], #4 // two 16-bit destination indices
400 uxth w5, w4 // low half: index for the first element
401 lsr w4, w4, #16 // high half: index for the second element
402 add x5, x3, x5, lsl #3 // &tmp_buf[index], 8 bytes per element
403 add x4, x3, x4, lsl #3
404 st1 {v0.2s}, [x5]
405 st1 {v1.2s}, [x4]
406 subs x6, x6, #2
407 b.gt 1b
408
409 sub x1, x1, x2, lsl #3 // rewind x1 by n * 8 bytes, back to the start of z
4101:
411 ld1 {v0.4s,v1.4s}, [x3], #32 // copy tmp_buf back to z, 32 bytes at a time
412 st1 {v0.4s,v1.4s}, [x1], #32
413 subs x2, x2, #4
414 b.gt 1b
415
416 ret
417endfunc
418
// Dispatch table for ff_fft_calc_neon. Entry i is the 64-bit offset of the
// 2^(i+2)-point routine from the start of this table; ff_fft_calc_neon
// indexes it with nbits - 2 and adds the table address back before
// branching.
419const fft_tab_neon
f6fa7814
DM
420 .quad fft4_neon - fft_tab_neon
421 .quad fft8_neon - fft_tab_neon
422 .quad fft16_neon - fft_tab_neon
423 .quad fft32_neon - fft_tab_neon
424 .quad fft64_neon - fft_tab_neon
425 .quad fft128_neon - fft_tab_neon
426 .quad fft256_neon - fft_tab_neon
427 .quad fft512_neon - fft_tab_neon
428 .quad fft1024_neon - fft_tab_neon
429 .quad fft2048_neon - fft_tab_neon
430 .quad fft4096_neon - fft_tab_neon
431 .quad fft8192_neon - fft_tab_neon
432 .quad fft16384_neon - fft_tab_neon
433 .quad fft32768_neon - fft_tab_neon
434 .quad fft65536_neon - fft_tab_neon
2ba45a60
DM
435endconst
436
// Sign pattern {+1, -1, -1, +1} ("plus minus minus plus"). ff_fft_calc_neon
// loads it into v29; fft16_neon and fft_pass_neon multiply it into
// lane-swapped (rev64) vectors before the fmla with the twiddles.
437const pmmp, align=4
438 .float +1.0, -1.0, -1.0, +1.0
439endconst
440
// {-w, +w, +w, -w} with w = M_SQRT1_2 ("minus plus plus minus").
// ff_fft_calc_neon loads it into v28; fft8_neon and fft16_neon read the low
// pair and lane 1 from it.
441const mppm, align=4
442 .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
443endconst