/*
 * AArch64 NEON optimised MDCT
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

25 | function ff_imdct_half_neon, export=1 | |
26 | sub sp, sp, #32 | |
27 | stp x19, x20, [sp] | |
28 | str x30, [sp, #16] | |
29 | mov x12, #1 | |
30 | ldr w14, [x0, #28] // mdct_bits | |
31 | ldr x4, [x0, #32] // tcos | |
32 | ldr x3, [x0, #8] // revtab | |
33 | lsl x12, x12, x14 // n = 1 << nbits | |
34 | lsr x14, x12, #2 // n4 = n >> 2 | |
35 | add x7, x2, x12, lsl #1 | |
36 | mov x12, #-16 | |
37 | sub x7, x7, #16 | |
38 | ||
39 | ld2 {v16.2s,v17.2s}, [x7], x12 // d16=x,n1 d17=x,n0 | |
40 | ld2 {v0.2s,v1.2s}, [x2], #16 // d0 =m0,x d1 =m1,x | |
41 | rev64 v17.2s, v17.2s | |
42 | ld2 {v2.2s,v3.2s}, [x4], #16 // d2=c0,c1 d3=s0,s2 | |
43 | fmul v6.2s, v17.2s, v2.2s | |
44 | fmul v7.2s, v0.2s, v2.2s | |
45 | 1: | |
46 | subs x14, x14, #2 | |
47 | ldr w6, [x3], #4 | |
48 | fmul v4.2s, v0.2s, v3.2s | |
49 | fmul v5.2s, v17.2s, v3.2s | |
50 | fsub v4.2s, v6.2s, v4.2s | |
51 | fadd v5.2s, v5.2s, v7.2s | |
52 | ubfm x8, x6, #16, #31 | |
53 | ubfm x6, x6, #0, #15 | |
54 | add x8, x1, x8, lsl #3 | |
55 | add x6, x1, x6, lsl #3 | |
56 | b.eq 2f | |
57 | ld2 {v16.2s,v17.2s}, [x7], x12 | |
58 | ld2 {v0.2s,v1.2s}, [x2], #16 | |
59 | rev64 v17.2s, v17.2s | |
60 | ld2 {v2.2s,v3.2s}, [x4], #16 // d2=c0,c1 d3=s0,s2 | |
61 | fmul v6.2s, v17.2s, v2.2s | |
62 | fmul v7.2s, v0.2s, v2.2s | |
63 | st2 {v4.s,v5.s}[0], [x6] | |
64 | st2 {v4.s,v5.s}[1], [x8] | |
65 | b 1b | |
66 | 2: | |
67 | st2 {v4.s,v5.s}[0], [x6] | |
68 | st2 {v4.s,v5.s}[1], [x8] | |
69 | ||
70 | mov x19, x0 | |
71 | mov x20, x1 | |
72 | bl X(ff_fft_calc_neon) | |
73 | ||
74 | mov x12, #1 | |
75 | ldr w14, [x19, #28] // mdct_bits | |
76 | ldr x4, [x19, #32] // tcos | |
77 | lsl x12, x12, x14 // n = 1 << nbits | |
78 | lsr x14, x12, #3 // n8 = n >> 3 | |
79 | ||
80 | add x4, x4, x14, lsl #3 | |
81 | add x6, x20, x14, lsl #3 | |
82 | sub x1, x4, #16 | |
83 | sub x3, x6, #16 | |
84 | ||
85 | mov x7, #-16 | |
86 | mov x8, x6 | |
87 | mov x0, x3 | |
88 | ||
89 | ld2 {v0.2s,v1.2s}, [x3], x7 // d0 =i1,r1 d1 =i0,r0 | |
90 | ld2 {v20.2s,v21.2s},[x6], #16 // d20=i2,r2 d21=i3,r3 | |
91 | ld2 {v16.2s,v17.2s},[x1], x7 // d16=c1,c0 d18=s1,s0 | |
92 | 3: | |
93 | subs x14, x14, #2 | |
94 | fmul v7.2s, v0.2s, v17.2s | |
95 | ld2 {v18.2s,v19.2s},[x4], #16 // d17=c2,c3 d19=s2,s3 | |
96 | fmul v4.2s, v1.2s, v17.2s | |
97 | fmul v6.2s, v21.2s, v19.2s | |
98 | fmul v5.2s, v20.2s, v19.2s | |
99 | fmul v22.2s, v1.2s, v16.2s | |
100 | fmul v23.2s, v21.2s, v18.2s | |
101 | fmul v24.2s, v0.2s, v16.2s | |
102 | fmul v25.2s, v20.2s, v18.2s | |
103 | fadd v7.2s, v7.2s, v22.2s | |
104 | fadd v5.2s, v5.2s, v23.2s | |
105 | fsub v4.2s, v4.2s, v24.2s | |
106 | fsub v6.2s, v6.2s, v25.2s | |
107 | b.eq 4f | |
108 | ld2 {v0.2s,v1.2s}, [x3], x7 | |
109 | ld2 {v20.2s,v21.2s},[x6], #16 | |
110 | ld2 {v16.2s,v17.2s},[x1], x7 // d16=c1,c0 d18=s1,s0 | |
111 | rev64 v5.2s, v5.2s | |
112 | rev64 v7.2s, v7.2s | |
113 | st2 {v4.2s,v5.2s}, [x0], x7 | |
114 | st2 {v6.2s,v7.2s}, [x8], #16 | |
115 | b 3b | |
116 | 4: | |
117 | rev64 v5.2s, v5.2s | |
118 | rev64 v7.2s, v7.2s | |
119 | st2 {v4.2s,v5.2s}, [x0] | |
120 | st2 {v6.2s,v7.2s}, [x8] | |
121 | ||
122 | ldp x19, x20, [sp] | |
123 | ldr x30, [sp, #16] | |
124 | add sp, sp, #32 | |
125 | ||
126 | ret | |
127 | endfunc | |
129 | function ff_imdct_calc_neon, export=1 | |
130 | sub sp, sp, #32 | |
131 | stp x19, x20, [sp] | |
132 | str x30, [sp, #16] | |
133 | ldr w3, [x0, #28] // mdct_bits | |
134 | mov x19, #1 | |
135 | mov x20, x1 | |
136 | lsl x19, x19, x3 | |
137 | add x1, x1, x19 | |
138 | ||
139 | bl X(ff_imdct_half_neon) | |
140 | ||
141 | add x0, x20, x19, lsl #2 | |
142 | add x1, x20, x19, lsl #1 | |
143 | sub x0, x0, #8 | |
144 | sub x2, x1, #16 | |
145 | mov x3, #-16 | |
146 | mov x6, #-8 | |
147 | 1: | |
148 | ld1 {v0.4s}, [x2], x3 | |
149 | prfum pldl1keep, [x0, #-16] | |
150 | rev64 v0.4s, v0.4s | |
151 | ld1 {v2.2s,v3.2s}, [x1], #16 | |
152 | fneg v4.4s, v0.4s | |
153 | prfum pldl1keep, [x2, #-16] | |
154 | rev64 v2.2s, v2.2s | |
155 | rev64 v3.2s, v3.2s | |
156 | ext v4.16b, v4.16b, v4.16b, #8 | |
157 | st1 {v2.2s}, [x0], x6 | |
158 | st1 {v3.2s}, [x0], x6 | |
159 | st1 {v4.4s}, [x20], #16 | |
160 | subs x19, x19, #16 | |
161 | b.gt 1b | |
162 | ||
163 | ldp x19, x20, [sp], #16 | |
164 | ldr x30, [sp], #16 | |
165 | ||
166 | ret | |
167 | endfunc | |


170 | function ff_mdct_calc_neon, export=1 | |
171 | sub sp, sp, #32 | |
172 | stp x19, x20, [sp] | |
173 | str x30, [sp, #16] | |
174 | ||
175 | mov x12, #1 | |
176 | ldr w14, [x0, #28] // mdct_bits | |
177 | ldr x4, [x0, #32] // tcos | |
178 | ldr x3, [x0, #8] // revtab | |
179 | lsl x14, x12, x14 // n = 1 << nbits | |
180 | add x7, x2, x14 // in4u | |
181 | sub x9, x7, #16 // in4d | |
182 | add x2, x7, x14, lsl #1 // in3u | |
183 | add x8, x9, x14, lsl #1 // in3d | |
184 | add x5, x4, x14, lsl #1 | |
185 | sub x5, x5, #16 | |
186 | sub x3, x3, #4 | |
187 | mov x12, #-16 | |
188 | lsr x13, x14, #1 | |
189 | ||
190 | ld2 {v16.2s,v17.2s}, [x9], x12 // in0u0,in0u1 in4d1,in4d0 | |
191 | ld2 {v18.2s,v19.2s}, [x8], x12 // in2u0,in2u1 in3d1,in3d0 | |
192 | ld2 {v0.2s, v1.2s}, [x7], #16 // in4u0,in4u1 in2d1,in2d0 | |
193 | rev64 v17.2s, v17.2s // in4d0,in4d1 in3d0,in3d1 | |
194 | rev64 v19.2s, v19.2s // in4d0,in4d1 in3d0,in3d1 | |
195 | ld2 {v2.2s, v3.2s}, [x2], #16 // in3u0,in3u1 in1d1,in1d0 | |
196 | fsub v0.2s, v17.2s, v0.2s // in4d-in4u I | |
197 | ld2 {v20.2s,v21.2s}, [x4], #16 // c0,c1 s0,s1 | |
198 | rev64 v1.2s, v1.2s // in2d0,in2d1 in1d0,in1d1 | |
199 | rev64 v3.2s, v3.2s // in2d0,in2d1 in1d0,in1d1 | |
200 | ld2 {v30.2s,v31.2s}, [x5], x12 // c2,c3 s2,s3 | |
201 | fadd v2.2s, v2.2s, v19.2s // in3u+in3d -R | |
202 | fsub v16.2s, v16.2s, v1.2s // in0u-in2d R | |
203 | fadd v18.2s, v18.2s, v3.2s // in2u+in1d -I | |
204 | 1: | |
205 | fmul v7.2s, v0.2s, v21.2s // I*s | |
206 | ldr w10, [x3, x13] | |
207 | fmul v6.2s, v2.2s, v20.2s // -R*c | |
208 | ldr w6, [x3, #4]! | |
209 | fmul v4.2s, v2.2s, v21.2s // -R*s | |
210 | fmul v5.2s, v0.2s, v20.2s // I*c | |
211 | fmul v24.2s, v16.2s, v30.2s // R*c | |
212 | fmul v25.2s, v18.2s, v31.2s // -I*s | |
213 | fmul v22.2s, v16.2s, v31.2s // R*s | |
214 | fmul v23.2s, v18.2s, v30.2s // I*c | |
215 | subs x14, x14, #16 | |
216 | subs x13, x13, #8 | |
217 | fsub v6.2s, v6.2s, v7.2s // -R*c-I*s | |
218 | fadd v7.2s, v4.2s, v5.2s // -R*s+I*c | |
219 | fsub v24.2s, v25.2s, v24.2s // I*s-R*c | |
220 | fadd v25.2s, v22.2s, v23.2s // R*s-I*c | |
221 | b.eq 1f | |
222 | mov x12, #-16 | |
223 | ld2 {v16.2s,v17.2s}, [x9], x12 // in0u0,in0u1 in4d1,in4d0 | |
224 | ld2 {v18.2s,v19.2s}, [x8], x12 // in2u0,in2u1 in3d1,in3d0 | |
225 | fneg v7.2s, v7.2s // R*s-I*c | |
226 | ld2 {v0.2s, v1.2s}, [x7], #16 // in4u0,in4u1 in2d1,in2d0 | |
227 | rev64 v17.2s, v17.2s // in4d0,in4d1 in3d0,in3d1 | |
228 | rev64 v19.2s, v19.2s // in4d0,in4d1 in3d0,in3d1 | |
229 | ld2 {v2.2s, v3.2s}, [x2], #16 // in3u0,in3u1 in1d1,in1d0 | |
230 | fsub v0.2s, v17.2s, v0.2s // in4d-in4u I | |
231 | ld2 {v20.2s,v21.2s}, [x4], #16 // c0,c1 s0,s1 | |
232 | rev64 v1.2s, v1.2s // in2d0,in2d1 in1d0,in1d1 | |
233 | rev64 v3.2s, v3.2s // in2d0,in2d1 in1d0,in1d1 | |
234 | ld2 {v30.2s,v31.2s}, [x5], x12 // c2,c3 s2,s3 | |
235 | fadd v2.2s, v2.2s, v19.2s // in3u+in3d -R | |
236 | fsub v16.2s, v16.2s, v1.2s // in0u-in2d R | |
237 | fadd v18.2s, v18.2s, v3.2s // in2u+in1d -I | |
238 | ubfm x12, x6, #16, #31 | |
239 | ubfm x6, x6, #0, #15 | |
240 | add x12, x1, x12, lsl #3 | |
241 | add x6, x1, x6, lsl #3 | |
242 | st2 {v6.s,v7.s}[0], [x6] | |
243 | st2 {v6.s,v7.s}[1], [x12] | |
244 | ubfm x6, x10, #16, #31 | |
245 | ubfm x10, x10, #0, #15 | |
246 | add x6 , x1, x6, lsl #3 | |
247 | add x10, x1, x10, lsl #3 | |
248 | st2 {v24.s,v25.s}[0], [x10] | |
249 | st2 {v24.s,v25.s}[1], [x6] | |
250 | b 1b | |
251 | 1: | |
252 | fneg v7.2s, v7.2s // R*s-I*c | |
253 | ubfm x12, x6, #16, #31 | |
254 | ubfm x6, x6, #0, #15 | |
255 | add x12, x1, x12, lsl #3 | |
256 | add x6, x1, x6, lsl #3 | |
257 | st2 {v6.s,v7.s}[0], [x6] | |
258 | st2 {v6.s,v7.s}[1], [x12] | |
259 | ubfm x6, x10, #16, #31 | |
260 | ubfm x10, x10, #0, #15 | |
261 | add x6 , x1, x6, lsl #3 | |
262 | add x10, x1, x10, lsl #3 | |
263 | st2 {v24.s,v25.s}[0], [x10] | |
264 | st2 {v24.s,v25.s}[1], [x6] | |
265 | ||
266 | mov x19, x0 | |
267 | mov x20, x1 | |
268 | bl X(ff_fft_calc_neon) | |
269 | ||
270 | mov x12, #1 | |
271 | ldr w14, [x19, #28] // mdct_bits | |
272 | ldr x4, [x19, #32] // tcos | |
273 | lsl x12, x12, x14 // n = 1 << nbits | |
274 | lsr x14, x12, #3 // n8 = n >> 3 | |
275 | ||
276 | add x4, x4, x14, lsl #3 | |
277 | add x6, x20, x14, lsl #3 | |
278 | sub x1, x4, #16 | |
279 | sub x3, x6, #16 | |
280 | ||
281 | mov x7, #-16 | |
282 | mov x8, x6 | |
283 | mov x0, x3 | |
284 | ||
285 | ld2 {v0.2s,v1.2s}, [x3], x7 // d0 =r1,i1 d1 =r0,i0 | |
286 | ld2 {v20.2s,v21.2s}, [x6], #16 // d20=r2,i2 d21=r3,i3 | |
287 | ld2 {v16.2s,v17.2s}, [x1], x7 // c1,c0 s1,s0 | |
288 | 1: | |
289 | subs x14, x14, #2 | |
290 | fmul v7.2s, v0.2s, v17.2s // r1*s1,r0*s0 | |
291 | ld2 {v18.2s,v19.2s}, [x4], #16 // c2,c3 s2,s3 | |
292 | fmul v4.2s, v1.2s, v17.2s // i1*s1,i0*s0 | |
293 | fmul v6.2s, v21.2s, v19.2s // i2*s2,i3*s3 | |
294 | fmul v5.2s, v20.2s, v19.2s // r2*s2,r3*s3 | |
295 | fmul v24.2s, v0.2s, v16.2s // r1*c1,r0*c0 | |
296 | fmul v25.2s, v20.2s, v18.2s // r2*c2,r3*c3 | |
297 | fmul v22.2s, v21.2s, v18.2s // i2*c2,i3*c3 | |
298 | fmul v23.2s, v1.2s, v16.2s // i1*c1,i0*c0 | |
299 | fadd v4.2s, v4.2s, v24.2s // i1*s1+r1*c1,i0*s0+r0*c0 | |
300 | fadd v6.2s, v6.2s, v25.2s // i2*s2+r2*c2,i3*s3+r3*c3 | |
301 | fsub v5.2s, v22.2s, v5.2s // i2*c2-r2*s2,i3*c3-r3*s3 | |
302 | fsub v7.2s, v23.2s, v7.2s // i1*c1-r1*s1,i0*c0-r0*s0 | |
303 | fneg v4.2s, v4.2s | |
304 | fneg v6.2s, v6.2s | |
305 | b.eq 1f | |
306 | ld2 {v0.2s, v1.2s}, [x3], x7 | |
307 | ld2 {v20.2s,v21.2s}, [x6], #16 | |
308 | ld2 {v16.2s,v17.2s}, [x1], x7 // c1,c0 s1,s0 | |
309 | rev64 v5.2s, v5.2s | |
310 | rev64 v7.2s, v7.2s | |
311 | st2 {v4.2s,v5.2s}, [x0], x7 | |
312 | st2 {v6.2s,v7.2s}, [x8], #16 | |
313 | b 1b | |
314 | 1: | |
315 | rev64 v5.2s, v5.2s | |
316 | rev64 v7.2s, v7.2s | |
317 | st2 {v4.2s,v5.2s}, [x0] | |
318 | st2 {v6.2s,v7.2s}, [x8] | |
319 | ||
320 | ldp x19, x20, [sp], #16 | |
321 | ldr x30, [sp], #16 | |
322 | ret | |
323 | endfunc |