/*
 * ARM NEON optimised MDCT
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

#define ff_fft_calc_neon X(ff_fft_calc_neon)

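@ void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input)
@
@ Half inverse MDCT (r0 = FFTContext, r1 = output, r2 = input): the input
@ is pre-rotated with the interleaved cos/sin twiddles from tcos and
@ scattered into the output buffer in revtab order, a complex FFT is run
@ over it in place, and the result is post-rotated.  The prototype matches
@ the FFTContext imdct_half callback; the C reference is ff_imdct_half()
@ in libavcodec/mdct.c.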
function ff_imdct_half_neon, export=1
        push            {r4-r8,lr}

        mov             r12, #1
        ldr             lr,  [r0, #20]          @ mdct_bits
        ldr             r4,  [r0, #24]          @ tcos
        ldr             r3,  [r0, #8]           @ revtab
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #2            @ n4 = n >> 2
        add             r7,  r2,  r12, lsl #1
        mov             r12, #-16
        sub             r7,  r7,  #16

        vld2.32         {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
        vld2.32         {d0-d1},  [r2,:128]!    @ d0 =m0,x d1 =m1,x
        vrev64.32       d17, d17
        vld2.32         {d2,d3},  [r4,:128]!    @ d2=c0,c1 d3=s0,s2
        vmul.f32        d6,  d17, d2
        vmul.f32        d7,  d0,  d2
1:
        subs            lr,  lr,  #2
        ldr             r6,  [r3], #4
        vmul.f32        d4,  d0,  d3
        vmul.f32        d5,  d17, d3
        vsub.f32        d4,  d6,  d4
        vadd.f32        d5,  d5,  d7
        uxth            r8,  r6,  ror #16
        uxth            r6,  r6
        add             r8,  r1,  r8,  lsl #3
        add             r6,  r1,  r6,  lsl #3
        beq             1f
        vld2.32         {d16-d17},[r7,:128],r12
        vld2.32         {d0-d1},  [r2,:128]!
        vrev64.32       d17, d17
        vld2.32         {d2,d3},  [r4,:128]!    @ d2=c0,c1 d3=s0,s2
        vmul.f32        d6,  d17, d2
        vmul.f32        d7,  d0,  d2
        vst2.32         {d4[0],d5[0]}, [r6,:64]
        vst2.32         {d4[1],d5[1]}, [r8,:64]
        b               1b
1:
        vst2.32         {d4[0],d5[0]}, [r6,:64]
        vst2.32         {d4[1],d5[1]}, [r8,:64]

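        @ The pre-rotated values now sit in the output buffer; save the
        @ context and output pointer across the call, then run the complex
        @ FFT in place (r0 = FFTContext, r1 = buffer).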
        mov             r4,  r0
        mov             r6,  r1
        bl              ff_fft_calc_neon

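        @ Post-rotation: walk the FFT result from both ends at once, two
        @ complex values per end per iteration, applying the remaining
        @ cos/sin twiddles; lr starts at n8 = n >> 3 and steps down by 2.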
        mov             r12, #1
        ldr             lr,  [r4, #20]          @ mdct_bits
        ldr             r4,  [r4, #24]          @ tcos
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #3            @ n8 = n >> 3

        add             r4,  r4,  lr,  lsl #3
        add             r6,  r6,  lr,  lsl #3
        sub             r1,  r4,  #16
        sub             r3,  r6,  #16

        mov             r7,  #-16
        mov             r8,  r6
        mov             r0,  r3

        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
        vld2.32         {d20-d21},[r6,:128]!    @ d20=i2,r2 d21=i3,r3
        vld2.32         {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
1:
        subs            lr,  lr,  #2
        vmul.f32        d7,  d0,  d18
        vld2.32         {d17,d19},[r4,:128]!    @ d17=c2,c3 d19=s2,s3
        vmul.f32        d4,  d1,  d18
        vmul.f32        d5,  d21, d19
        vmul.f32        d6,  d20, d19
        vmul.f32        d22, d1,  d16
        vmul.f32        d23, d21, d17
        vmul.f32        d24, d0,  d16
        vmul.f32        d25, d20, d17
        vadd.f32        d7,  d7,  d22
        vadd.f32        d6,  d6,  d23
        vsub.f32        d4,  d4,  d24
        vsub.f32        d5,  d5,  d25
        beq             1f
        vld2.32         {d0-d1},  [r3,:128], r7
        vld2.32         {d20-d21},[r6,:128]!
        vld2.32         {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128], r7
        vst2.32         {d5,d7},  [r8,:128]!
        b               1b
1:
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128]
        vst2.32         {d5,d7},  [r8,:128]

        pop             {r4-r8,pc}
endfunc

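@ void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input)
@
@ Full inverse MDCT: computes the half transform with ff_imdct_half_neon,
@ then fills in the remaining output samples from it using the IMDCT's
@ symmetry (a reversed copy and a reversed, sign-flipped copy).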
function ff_imdct_calc_neon, export=1
        push            {r4-r6,lr}

        ldr             r3,  [r0, #20]
        mov             r4,  #1
        mov             r5,  r1
        lsl             r4,  r4,  r3
        add             r1,  r1,  r4

        bl              X(ff_imdct_half_neon)

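        @ Derive the remaining samples from the half transform: a reversed,
        @ sign-flipped copy (sign bit toggled with veor against 1<<31) goes
        @ to the start of the output, and a reversed copy to the end.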
        add             r0,  r5,  r4,  lsl #2
        add             r1,  r5,  r4,  lsl #1
        sub             r0,  r0,  #8
        sub             r2,  r1,  #16
        mov             r3,  #-16
        mov             r6,  #-8
        vmov.i32        d30, #1<<31
1:
        vld1.32         {d0-d1},  [r2,:128], r3
        pld             [r0, #-16]
        vrev64.32       q0,  q0
        vld1.32         {d2-d3},  [r1,:128]!
        veor            d4,  d1,  d30
        pld             [r2, #-16]
        vrev64.32       q1,  q1
        veor            d5,  d0,  d30
        vst1.32         {d2},     [r0,:64], r6
        vst1.32         {d3},     [r0,:64], r6
        vst1.32         {d4-d5},  [r5,:128]!
        subs            r4,  r4,  #16
        bgt             1b

        pop             {r4-r6,pc}
endfunc

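@ void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input)
@
@ Forward MDCT: folds the input window into n/4 complex values while
@ applying the pre-rotation twiddles, runs the complex FFT in place,
@ then post-rotates the result; the C reference is ff_mdct_calc_c() in
@ libavcodec/mdct.c.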
function ff_mdct_calc_neon, export=1
        push            {r4-r10,lr}

        mov             r12, #1
        ldr             lr,  [r0, #20]          @ mdct_bits
        ldr             r4,  [r0, #24]          @ tcos
        ldr             r3,  [r0, #8]           @ revtab
        lsl             lr,  r12, lr            @ n = 1 << nbits
        add             r7,  r2,  lr            @ in4u
        sub             r9,  r7,  #16           @ in4d
        add             r2,  r7,  lr,  lsl #1   @ in3u
        add             r8,  r9,  lr,  lsl #1   @ in3d
        add             r5,  r4,  lr,  lsl #1
        sub             r5,  r5,  #16
        sub             r3,  r3,  #4
        mov             r12, #-16

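        @ Fold the window: in0u/in2u are read forwards, the *d pointers
        @ backwards (vrev64 restores their sample order), and quarter-wise
        @ sums/differences form the real (R) and imaginary (I) parts that
        @ feed the pre-rotation below.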
        vld2.32         {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
        vld2.32         {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
        vld2.32         {d0, d2}, [r7,:128]!    @ in4u0,in4u1 in2d1,in2d0
        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
        vld2.32         {d1, d3}, [r2,:128]!    @ in3u0,in3u1 in1d1,in1d0
        vsub.f32        d0,  d18, d0            @ in4d-in4u      I
        vld2.32         {d20,d21},[r4,:128]!    @ c0,c1 s0,s1
        vrev64.32       q1,  q1                 @ in2d0,in2d1 in1d0,in1d1
        vld2.32         {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
        vadd.f32        d1,  d1,  d19           @ in3u+in3d     -R
        vsub.f32        d16, d16, d2            @ in0u-in2d      R
        vadd.f32        d17, d17, d3            @ in2u+in1d     -I
1:
        vmul.f32        d7,  d0,  d21           @  I*s
A       ldr             r10, [r3, lr, lsr #1]
T       lsr             r10, lr,  #1
T       ldr             r10, [r3, r10]
        vmul.f32        d6,  d1,  d20           @ -R*c
        ldr             r6,  [r3, #4]!
        vmul.f32        d4,  d1,  d21           @ -R*s
        vmul.f32        d5,  d0,  d20           @  I*c
        vmul.f32        d24, d16, d30           @  R*c
        vmul.f32        d25, d17, d31           @ -I*s
        vmul.f32        d22, d16, d31           @  R*s
        vmul.f32        d23, d17, d30           @  I*c
        subs            lr,  lr,  #16
        vsub.f32        d6,  d6,  d7            @ -R*c-I*s
        vadd.f32        d7,  d4,  d5            @ -R*s+I*c
        vsub.f32        d24, d25, d24           @  I*s-R*c
        vadd.f32        d25, d22, d23           @  R*s-I*c
        beq             1f
        mov             r12, #-16
        vld2.32         {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
        vld2.32         {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
        vneg.f32        d7,  d7                 @  R*s-I*c
        vld2.32         {d0, d2}, [r7,:128]!    @ in4u0,in4u1 in2d1,in2d0
        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
        vld2.32         {d1, d3}, [r2,:128]!    @ in3u0,in3u1 in1d1,in1d0
        vsub.f32        d0,  d18, d0            @ in4d-in4u      I
        vld2.32         {d20,d21},[r4,:128]!    @ c0,c1 s0,s1
        vrev64.32       q1,  q1                 @ in2d0,in2d1 in1d0,in1d1
        vld2.32         {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
        vadd.f32        d1,  d1,  d19           @ in3u+in3d     -R
        vsub.f32        d16, d16, d2            @ in0u-in2d      R
        vadd.f32        d17, d17, d3            @ in2u+in1d     -I
        uxth            r12, r6,  ror #16
        uxth            r6,  r6
        add             r12, r1,  r12, lsl #3
        add             r6,  r1,  r6,  lsl #3
        vst2.32         {d6[0],d7[0]}, [r6,:64]
        vst2.32         {d6[1],d7[1]}, [r12,:64]
        uxth            r6,  r10, ror #16
        uxth            r10, r10
        add             r6,  r1,  r6,  lsl #3
        add             r10, r1,  r10, lsl #3
        vst2.32         {d24[0],d25[0]},[r10,:64]
        vst2.32         {d24[1],d25[1]},[r6,:64]
        b               1b
1:
        vneg.f32        d7,  d7                 @  R*s-I*c
        uxth            r12, r6,  ror #16
        uxth            r6,  r6
        add             r12, r1,  r12, lsl #3
        add             r6,  r1,  r6,  lsl #3
        vst2.32         {d6[0],d7[0]}, [r6,:64]
        vst2.32         {d6[1],d7[1]}, [r12,:64]
        uxth            r6,  r10, ror #16
        uxth            r10, r10
        add             r6,  r1,  r6,  lsl #3
        add             r10, r1,  r10, lsl #3
        vst2.32         {d24[0],d25[0]},[r10,:64]
        vst2.32         {d24[1],d25[1]},[r6,:64]

        mov             r4,  r0
        mov             r6,  r1
        bl              ff_fft_calc_neon

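        @ Post-rotation over the FFT result, the same walk from both ends
        @ as in ff_imdct_half_neon above, with an extra vneg on q2 to
        @ produce the forward-transform signs.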
        mov             r12, #1
        ldr             lr,  [r4, #20]          @ mdct_bits
        ldr             r4,  [r4, #24]          @ tcos
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #3            @ n8 = n >> 3

        add             r4,  r4,  lr,  lsl #3
        add             r6,  r6,  lr,  lsl #3
        sub             r1,  r4,  #16
        sub             r3,  r6,  #16

        mov             r7,  #-16
        mov             r8,  r6
        mov             r0,  r3

        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
        vld2.32         {d20-d21},[r6,:128]!    @ d20=r2,i2 d21=r3,i3
        vld2.32         {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
1:
        subs            lr,  lr,  #2
        vmul.f32        d7,  d0,  d18           @ r1*s1,r0*s0
        vld2.32         {d17,d19},[r4,:128]!    @ c2,c3 s2,s3
        vmul.f32        d4,  d1,  d18           @ i1*s1,i0*s0
        vmul.f32        d5,  d21, d19           @ i2*s2,i3*s3
        vmul.f32        d6,  d20, d19           @ r2*s2,r3*s3
        vmul.f32        d24, d0,  d16           @ r1*c1,r0*c0
        vmul.f32        d25, d20, d17           @ r2*c2,r3*c3
        vmul.f32        d22, d21, d17           @ i2*c2,i3*c3
        vmul.f32        d23, d1,  d16           @ i1*c1,i0*c0
        vadd.f32        d4,  d4,  d24           @ i1*s1+r1*c1,i0*s0+r0*c0
        vadd.f32        d5,  d5,  d25           @ i2*s2+r2*c2,i3*s3+r3*c3
        vsub.f32        d6,  d22, d6            @ i2*c2-r2*s2,i3*c3-r3*s3
        vsub.f32        d7,  d23, d7            @ i1*c1-r1*s1,i0*c0-r0*s0
        vneg.f32        q2,  q2
        beq             1f
        vld2.32         {d0-d1},  [r3,:128], r7
        vld2.32         {d20-d21},[r6,:128]!
        vld2.32         {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128], r7
        vst2.32         {d5,d7},  [r8,:128]!
        b               1b
1:
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128]
        vst2.32         {d5,d7},  [r8,:128]

        pop             {r4-r10,pc}
endfunc