Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libavcodec / aarch64 / mdct_neon.S
CommitLineData
2ba45a60
DM
/*
 * AArch64 NEON optimised MDCT
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
22
23#include "libavutil/aarch64/asm.S"
24
// ---------------------------------------------------------------------------
// ff_imdct_half_neon
//
// Register arguments, as used by the code below:
//   x0 = context pointer; the code reads a 32-bit field at offset 28
//        (mdct_bits) and two pointers at offsets 32 (tcos) and 8 (revtab)
//   x1 = output buffer, written as interleaved pairs of 32-bit floats
//   x2 = input buffer of 32-bit floats
// NOTE(review): presumably the C prototype is
//   void ff_imdct_half_neon(FFTContext *s, FFTSample *out, const FFTSample *in)
// -- confirm against the C declaration and the struct layout.
//
// With n = 1 << mdct_bits the routine runs three phases:
//   1. Pre-rotation: walks the first n/2 input floats from both ends
//      (x2 upwards, x7 downwards), multiplies by the tcos table and scatters
//      the n/4 resulting float pairs into the output at the positions given
//      by the 16-bit indices packed two per word in revtab.
//   2. bl ff_fft_calc_neon with x0/x1 still holding context and output.
//   3. Post-rotation: rewrites the n/4 output pairs in place, walking outwards
//      from the middle of the buffer in both directions.
//
// Stack: 32-byte frame holding x19, x20 and x30 (sp stays 16-byte aligned).
// x19/x20 carry the context and output pointers across the call.
// Writes x0-x4, x6-x8, x12, x14, v0-v7, v16-v25 and NZCV, in addition to
// whatever ff_fft_calc_neon clobbers.
// ---------------------------------------------------------------------------
function ff_imdct_half_neon, export=1
        sub             sp,  sp,  #32
        stp             x19, x20, [sp]
        str             x30, [sp, #16]
        mov             x12, #1
        ldr             w14, [x0, #28]          // mdct_bits
        ldr             x4,  [x0, #32]          // tcos
        ldr             x3,  [x0, #8]           // revtab
        lsl             x12, x12, x14           // n  = 1 << nbits
        lsr             x14, x12, #2            // n4 = n >> 2 (loop counter)
        add             x7,  x2,  x12,  lsl #1  // x7 = input + 2*n bytes (n/2 floats)
        mov             x12, #-16               // post-index step for the downward walk
        sub             x7,  x7,  #16           // last 16 bytes of the first n/2 floats

        // Prime the software pipeline: the first loads and products are issued
        // here, the loop body then overlaps the stores of one iteration with
        // the loads of the next one.
        ld2             {v16.2s,v17.2s}, [x7], x12 // d16=x,n1 d17=x,n0
        ld2             {v0.2s,v1.2s},   [x2], #16 // d0 =m0,x d1 =m1,x
        rev64           v17.2s, v17.2s             // swap lanes of the data read downwards
        ld2             {v2.2s,v3.2s},   [x4], #16 // d2=c0,c1 d3=s0,s1
        fmul            v6.2s,  v17.2s, v2.2s      // n*c
        fmul            v7.2s,  v0.2s,  v2.2s      // m*c
1:
        subs            x14, x14, #2            // two output pairs per iteration; Z tested by b.eq below
        ldr             w6,  [x3], #4           // two packed 16-bit revtab indices
        fmul            v4.2s,  v0.2s,  v3.2s   // m*s
        fmul            v5.2s,  v17.2s, v3.2s   // n*s
        fsub            v4.2s,  v6.2s,  v4.2s   // n*c - m*s
        fadd            v5.2s,  v5.2s,  v7.2s   // n*s + m*c
        ubfm            x8,  x6,  #16, #31      // high half: index for lane 1
        ubfm            x6,  x6,  #0,  #15      // low half:  index for lane 0
        add             x8,  x1,  x8,  lsl #3   // 8 bytes per output pair
        add             x6,  x1,  x6,  lsl #3
        b.eq            2f                      // last iteration: store only, no further loads
        ld2             {v16.2s,v17.2s}, [x7], x12
        ld2             {v0.2s,v1.2s},   [x2], #16
        rev64           v17.2s, v17.2s
        ld2             {v2.2s,v3.2s},   [x4], #16 // d2=c0,c1 d3=s0,s1
        fmul            v6.2s,  v17.2s, v2.2s
        fmul            v7.2s,  v0.2s,  v2.2s
        st2             {v4.s,v5.s}[0],  [x6]
        st2             {v4.s,v5.s}[1],  [x8]
        b               1b
2:
        st2             {v4.s,v5.s}[0],  [x6]
        st2             {v4.s,v5.s}[1],  [x8]

        // x0/x1 are still the incoming context and output pointers; keep
        // copies in callee-saved registers for the post-rotation.
        mov             x19, x0
        mov             x20, x1
        bl              X(ff_fft_calc_neon)

        mov             x12, #1
        ldr             w14, [x19, #28]         // mdct_bits
        ldr             x4,  [x19, #32]         // tcos
        lsl             x12, x12, x14           // n  = 1 << nbits
        lsr             x14, x12, #3            // n8 = n >> 3

        add             x4,  x4,  x14,  lsl #3  // tcos   + n8 pairs: upward table pointer
        add             x6,  x20, x14,  lsl #3  // output + n8 pairs: upward data pointer
        sub             x1,  x4,  #16           // downward table pointer
        sub             x3,  x6,  #16           // downward data pointer

        mov             x7,  #-16
        mov             x8,  x6                 // upward store pointer
        mov             x0,  x3                 // downward store pointer

        ld2             {v0.2s,v1.2s},  [x3], x7 // d0 =i1,r1 d1 =i0,r0
        ld2             {v20.2s,v21.2s},[x6], #16 // d20=i2,r2 d21=i3,r3
        ld2             {v16.2s,v17.2s},[x1], x7 // d16=c1,c0 d17=s1,s0
3:
        subs            x14, x14, #2            // two pairs per direction per iteration
        fmul            v7.2s,  v0.2s,  v17.2s
        ld2             {v18.2s,v19.2s},[x4], #16 // d18=c2,c3 d19=s2,s3
        fmul            v4.2s,  v1.2s,  v17.2s
        fmul            v6.2s,  v21.2s, v19.2s
        fmul            v5.2s,  v20.2s, v19.2s
        fmul            v22.2s, v1.2s,  v16.2s
        fmul            v23.2s, v21.2s, v18.2s
        fmul            v24.2s, v0.2s,  v16.2s
        fmul            v25.2s, v20.2s, v18.2s
        fadd            v7.2s,  v7.2s,  v22.2s  // v0*s  + v1*c   (lower-half data)
        fadd            v5.2s,  v5.2s,  v23.2s  // v20*s + v21*c  (upper-half data)
        fsub            v4.2s,  v4.2s,  v24.2s  // v1*s  - v0*c   (lower-half data)
        fsub            v6.2s,  v6.2s,  v25.2s  // v21*s - v20*c  (upper-half data)
        b.eq            4f
        ld2             {v0.2s,v1.2s},  [x3], x7
        ld2             {v20.2s,v21.2s},[x6], #16
        ld2             {v16.2s,v17.2s},[x1], x7 // d16=c1,c0 d17=s1,s0
        // v5 and v7 swap halves: each is lane-reversed and stored in the half
        // of the buffer opposite to the one its inputs came from.
        rev64           v5.2s,  v5.2s
        rev64           v7.2s,  v7.2s
        st2             {v4.2s,v5.2s},  [x0], x7
        st2             {v6.2s,v7.2s},  [x8], #16
        b               3b
4:
        rev64           v5.2s,  v5.2s
        rev64           v7.2s,  v7.2s
        st2             {v4.2s,v5.2s},  [x0]
        st2             {v6.2s,v7.2s},  [x8]

        ldp             x19, x20, [sp]
        ldr             x30, [sp, #16]
        add             sp,  sp,  #32

        ret
endfunc
128
// ---------------------------------------------------------------------------
// ff_imdct_calc_neon
//
// Register arguments, as used by the code below:
//   x0 = context pointer (32-bit mdct_bits field read at offset 28); passed
//        on unchanged to ff_imdct_half_neon
//   x1 = output buffer of n = 1 << mdct_bits 32-bit floats
//   x2 = input buffer; not touched here, passed on to ff_imdct_half_neon
// NOTE(review): presumably the C prototype is
//   void ff_imdct_calc_neon(FFTContext *s, FFTSample *out, const FFTSample *in)
// -- confirm against the C declaration.
//
// ff_imdct_half_neon is called with its output pointer advanced by n bytes
// (n/4 floats), so its result lands in the middle of the buffer. The loop
// then fills the two outer quarters from that middle part:
//   out[k]       = -out[n/2 - 1 - k]
//   out[n-1 - k] =  out[n/2 + k]            for k = 0 .. n/4 - 1
//
// Stack: 32-byte frame holding x19, x20 and x30 (sp stays 16-byte aligned).
// x19 (n, then the loop counter) and x20 (output base, then the ascending
// store pointer) survive the call because they are callee-saved.
// ---------------------------------------------------------------------------
function ff_imdct_calc_neon, export=1
        sub             sp,  sp,  #32
        stp             x19, x20, [sp]
        str             x30, [sp, #16]
        ldr             w3,  [x0, #28]          // mdct_bits
        mov             x19, #1
        mov             x20, x1                 // keep the output base
        lsl             x19, x19, x3            // n = 1 << mdct_bits
        add             x1,  x1,  x19           // output + n bytes = &out[n/4]

        bl              X(ff_imdct_half_neon)

        add             x0,  x20, x19,  lsl #2  // output + 4*n bytes: end of buffer
        add             x1,  x20, x19,  lsl #1  // &out[n/2], read upwards
        sub             x0,  x0,  #8            // &out[n-2], written downwards
        sub             x2,  x1,  #16           // &out[n/2-4], read downwards
        mov             x3,  #-16
        mov             x6,  #-8
1:
        ld1             {v0.4s}, [x2], x3       // four floats just below the middle
        prfum           pldl1keep, [x0, #-16]   // prefetch hint for the next downward stores
        rev64           v0.4s,  v0.4s           // swap floats inside each 64-bit half
        ld1             {v2.2s,v3.2s}, [x1], #16 // four floats just above the middle
        fneg            v4.4s,  v0.4s
        prfum           pldl1keep, [x2, #-16]   // prefetch hint for the next downward load
        rev64           v2.2s,  v2.2s
        rev64           v3.2s,  v3.2s
        ext             v4.16b, v4.16b, v4.16b, #8 // swap the 64-bit halves: full 4-float reversal
        st1             {v2.2s}, [x0], x6       // mirrored copy into the last quarter
        st1             {v3.2s}, [x0], x6
        st1             {v4.4s}, [x20], #16     // negated mirror into the first quarter
        subs            x19, x19, #16           // 4 floats per quarter per pass: n/16 passes
        b.gt            1b

        // Post-indexed loads pop the 32-byte frame in two 16-byte steps.
        ldp             x19, x20, [sp], #16
        ldr             x30, [sp], #16

        ret
endfunc
168
169
// ---------------------------------------------------------------------------
// ff_mdct_calc_neon
//
// Register arguments, as used by the code below:
//   x0 = context pointer; the code reads a 32-bit field at offset 28
//        (mdct_bits) and two pointers at offsets 32 (tcos) and 8 (revtab)
//   x1 = output buffer, written as interleaved pairs of 32-bit floats
//   x2 = input buffer of 32-bit floats
// NOTE(review): presumably the C prototype is
//   void ff_mdct_calc_neon(FFTContext *s, FFTSample *out, const FFTSample *in)
// -- confirm against the C declaration and the struct layout.
//
// With n = 1 << mdct_bits the routine runs three phases:
//   1. Folding + pre-rotation: four input pointers (two walking up, two
//      walking down, see the in3/in4 labels below) are combined into the
//      R / I terms, rotated by table entries read from both ends of the
//      first 2*n bytes of tcos, and scattered into the output at the
//      positions given by the 16-bit indices packed two per word in revtab.
//      revtab is likewise read from both ends of its first n/2 bytes.
//   2. bl ff_fft_calc_neon with x0/x1 still holding context and output.
//   3. Post-rotation: rewrites the output pairs in place, walking outwards
//      from the middle of the buffer in both directions.
//
// All loops use the numeric local label 1; "1b" / "1f" always bind to the
// nearest definition backwards / forwards.
//
// Stack: 32-byte frame holding x19, x20 and x30 (sp stays 16-byte aligned).
// x19/x20 carry the context and output pointers across the call.
// ---------------------------------------------------------------------------
function ff_mdct_calc_neon, export=1
        sub             sp,  sp,  #32
        stp             x19, x20, [sp]
        str             x30, [sp, #16]

        mov             x12, #1
        ldr             w14, [x0, #28]          // mdct_bits
        ldr             x4,  [x0, #32]          // tcos
        ldr             x3,  [x0, #8]           // revtab
        lsl             x14, x12, x14           // n  = 1 << nbits
        add             x7,  x2,  x14           // in4u
        sub             x9,  x7,  #16           // in4d
        add             x2,  x7,  x14, lsl #1   // in3u
        add             x8,  x9,  x14, lsl #1   // in3d
        add             x5,  x4,  x14, lsl #1   // tcos + 2*n bytes ...
        sub             x5,  x5,  #16           // ... minus 16: table pointer walking down
        sub             x3,  x3,  #4            // bias for the pre-indexed "ldr w6, [x3, #4]!"
        mov             x12, #-16
        lsr             x13, x14, #1            // n/2: loop counter and byte offset of the
                                                // descending revtab read relative to x3

        ld2             {v16.2s,v17.2s}, [x9], x12  // in0u0,in0u1 in4d1,in4d0
        ld2             {v18.2s,v19.2s}, [x8], x12  // in2u0,in2u1 in3d1,in3d0
        ld2             {v0.2s, v1.2s},  [x7], #16  // in4u0,in4u1 in2d1,in2d0
        rev64           v17.2s, v17.2s              // in4d0,in4d1 in3d0,in3d1
        rev64           v19.2s, v19.2s              // in4d0,in4d1 in3d0,in3d1
        ld2             {v2.2s, v3.2s},  [x2], #16  // in3u0,in3u1 in1d1,in1d0
        fsub            v0.2s,  v17.2s, v0.2s       // in4d-in4u      I
        ld2             {v20.2s,v21.2s}, [x4], #16  // c0,c1 s0,s1
        rev64           v1.2s,  v1.2s               // in2d0,in2d1 in1d0,in1d1
        rev64           v3.2s,  v3.2s               // in2d0,in2d1 in1d0,in1d1
        ld2             {v30.2s,v31.2s}, [x5], x12  // c2,c3 s2,s3
        fadd            v2.2s,  v2.2s,  v19.2s      // in3u+in3d     -R
        fsub            v16.2s, v16.2s, v1.2s       // in0u-in2d      R
        fadd            v18.2s, v18.2s, v3.2s       // in2u+in1d     -I
1:
        fmul            v7.2s,  v0.2s,  v21.2s      //  I*s
        ldr             w10, [x3, x13]              // revtab pair read from the descending end;
                                                    // must precede the x3 update below
        fmul            v6.2s,  v2.2s,  v20.2s      // -R*c
        ldr             w6,  [x3, #4]!              // revtab pair read from the ascending end
        fmul            v4.2s,  v2.2s,  v21.2s      // -R*s
        fmul            v5.2s,  v0.2s,  v20.2s      //  I*c
        fmul            v24.2s, v16.2s, v30.2s      //  R*c
        fmul            v25.2s, v18.2s, v31.2s      // -I*s
        fmul            v22.2s, v16.2s, v31.2s      //  R*s
        fmul            v23.2s, v18.2s, v30.2s      // -I*c
        subs            x14, x14, #16               // flags from this one are overwritten ...
        subs            x13, x13, #8                // ... b.eq below tests x13 == 0
        fsub            v6.2s,  v6.2s,  v7.2s       // -R*c-I*s
        fadd            v7.2s,  v4.2s,  v5.2s       // -R*s+I*c
        fsub            v24.2s, v25.2s, v24.2s      // -I*s-R*c
        fadd            v25.2s, v22.2s, v23.2s      //  R*s-I*c
        b.eq            1f                          // last iteration: store only
        mov             x12, #-16                   // x12 was reused as a store address
        ld2             {v16.2s,v17.2s}, [x9], x12  // in0u0,in0u1 in4d1,in4d0
        ld2             {v18.2s,v19.2s}, [x8], x12  // in2u0,in2u1 in3d1,in3d0
        fneg            v7.2s,  v7.2s               //  R*s-I*c
        ld2             {v0.2s, v1.2s},  [x7], #16  // in4u0,in4u1 in2d1,in2d0
        rev64           v17.2s, v17.2s              // in4d0,in4d1 in3d0,in3d1
        rev64           v19.2s, v19.2s              // in4d0,in4d1 in3d0,in3d1
        ld2             {v2.2s, v3.2s},  [x2], #16  // in3u0,in3u1 in1d1,in1d0
        fsub            v0.2s,  v17.2s, v0.2s       // in4d-in4u      I
        ld2             {v20.2s,v21.2s}, [x4], #16  // c0,c1 s0,s1
        rev64           v1.2s,  v1.2s               // in2d0,in2d1 in1d0,in1d1
        rev64           v3.2s,  v3.2s               // in2d0,in2d1 in1d0,in1d1
        ld2             {v30.2s,v31.2s}, [x5], x12  // c2,c3 s2,s3
        fadd            v2.2s,  v2.2s,  v19.2s      // in3u+in3d     -R
        fsub            v16.2s, v16.2s, v1.2s       // in0u-in2d      R
        fadd            v18.2s, v18.2s, v3.2s       // in2u+in1d     -I
        ubfm            x12, x6,  #16, #31          // w6 high half: index for lane 1
        ubfm            x6,  x6,  #0,  #15          // w6 low half:  index for lane 0
        add             x12, x1,  x12, lsl #3       // 8 bytes per output pair
        add             x6,  x1,  x6,  lsl #3
        st2             {v6.s,v7.s}[0],   [x6]
        st2             {v6.s,v7.s}[1],   [x12]
        ubfm            x6,  x10, #16, #31          // w10 high half: index for lane 1
        ubfm            x10, x10, #0,  #15          // w10 low half:  index for lane 0
        add             x6 , x1,  x6,  lsl #3
        add             x10, x1,  x10, lsl #3
        st2             {v24.s,v25.s}[0], [x10]
        st2             {v24.s,v25.s}[1], [x6]
        b               1b
1:
        fneg            v7.2s,  v7.2s               //  R*s-I*c
        ubfm            x12, x6,  #16, #31
        ubfm            x6,  x6,  #0,  #15
        add             x12, x1,  x12, lsl #3
        add             x6,  x1,  x6,  lsl #3
        st2             {v6.s,v7.s}[0],   [x6]
        st2             {v6.s,v7.s}[1],   [x12]
        ubfm            x6,  x10, #16, #31
        ubfm            x10, x10, #0,  #15
        add             x6 , x1,  x6,  lsl #3
        add             x10, x1,  x10, lsl #3
        st2             {v24.s,v25.s}[0], [x10]
        st2             {v24.s,v25.s}[1], [x6]

        // x0/x1 are still the incoming context and output pointers; keep
        // copies in callee-saved registers for the post-rotation.
        mov             x19, x0
        mov             x20, x1
        bl              X(ff_fft_calc_neon)

        mov             x12, #1
        ldr             w14, [x19, #28]         // mdct_bits
        ldr             x4,  [x19, #32]         // tcos
        lsl             x12, x12, x14           // n  = 1 << nbits
        lsr             x14, x12, #3            // n8 = n >> 3

        add             x4,  x4,  x14, lsl #3   // tcos   + n8 pairs: upward table pointer
        add             x6,  x20, x14, lsl #3   // output + n8 pairs: upward data pointer
        sub             x1,  x4,  #16           // downward table pointer
        sub             x3,  x6,  #16           // downward data pointer

        mov             x7,  #-16
        mov             x8,  x6                 // upward store pointer
        mov             x0,  x3                 // downward store pointer

        ld2             {v0.2s,v1.2s},   [x3], x7   // d0 =r1,i1 d1 =r0,i0
        ld2             {v20.2s,v21.2s}, [x6], #16  // d20=r2,i2 d21=r3,i3
        ld2             {v16.2s,v17.2s}, [x1], x7   // c1,c0 s1,s0
1:
        subs            x14, x14, #2                // two pairs per direction per iteration
        fmul            v7.2s,  v0.2s,  v17.2s      // r1*s1,r0*s0
        ld2             {v18.2s,v19.2s}, [x4], #16  // c2,c3 s2,s3
        fmul            v4.2s,  v1.2s,  v17.2s      // i1*s1,i0*s0
        fmul            v6.2s,  v21.2s, v19.2s      // i2*s2,i3*s3
        fmul            v5.2s,  v20.2s, v19.2s      // r2*s2,r3*s3
        fmul            v24.2s, v0.2s,  v16.2s      // r1*c1,r0*c0
        fmul            v25.2s, v20.2s, v18.2s      // r2*c2,r3*c3
        fmul            v22.2s, v21.2s, v18.2s      // i2*c2,i3*c3
        fmul            v23.2s, v1.2s,  v16.2s      // i1*c1,i0*c0
        fadd            v4.2s,  v4.2s,  v24.2s      // i1*s1+r1*c1,i0*s0+r0*c0
        fadd            v6.2s,  v6.2s,  v25.2s      // i2*s2+r2*c2,i3*s3+r3*c3
        fsub            v5.2s,  v22.2s, v5.2s       // i2*c2-r2*s2,i3*c3-r3*s3
        fsub            v7.2s,  v23.2s, v7.2s       // i1*c1-r1*s1,i0*c0-r0*s0
        fneg            v4.2s,  v4.2s
        fneg            v6.2s,  v6.2s
        b.eq            1f
        ld2             {v0.2s, v1.2s},  [x3], x7
        ld2             {v20.2s,v21.2s}, [x6], #16
        ld2             {v16.2s,v17.2s}, [x1], x7   // c1,c0 s1,s0
        // v5 and v7 swap halves: each is lane-reversed and stored in the half
        // of the buffer opposite to the one its inputs came from.
        rev64           v5.2s,  v5.2s
        rev64           v7.2s,  v7.2s
        st2             {v4.2s,v5.2s},  [x0], x7
        st2             {v6.2s,v7.2s},  [x8], #16
        b               1b
1:
        rev64           v5.2s,  v5.2s
        rev64           v7.2s,  v7.2s
        st2             {v4.2s,v5.2s},  [x0]
        st2             {v6.2s,v7.2s},  [x8]

        // Post-indexed loads pop the 32-byte frame in two 16-byte steps.
        ldp             x19, x20, [sp], #16
        ldr             x30, [sp], #16
        ret
endfunc