Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libavcodec / arm / aacpsdsp_neon.S
CommitLineData
2ba45a60
DM
1/*
2 * Copyright (c) 2012 Mans Rullgard
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavutil/arm/asm.S"
22
/*
 * ff_ps_add_squares_neon
 *
 * Registers as used below:
 *   r0 - float array that is read and written back in place
 *   r1 - source floats, consumed two per output element
 *   r2 - element count
 *
 * For every element k:  dst[k] += src[2k]^2 + src[2k+1]^2
 * NOTE(review): the float pairs look like (re, im) samples - confirm against
 * the C prototype.
 *
 * Every load and store carries a :128 hint, so both pointers have to be
 * 16-byte aligned. Four outputs are produced per pass. The loop is software
 * pipelined: inputs for the next pass are loaded while the current one is
 * being finished, and the final group is completed after the loop. The loop
 * body runs at least once before the tail, so no fewer than 8 outputs are
 * written whatever r2 holds, and the count is effectively rounded up to a
 * multiple of 4. Presumably callers only pass multiples of 4 that are >= 8 -
 * verify.
 */
23function ff_ps_add_squares_neon, export=1
24 mov r3, r0 /* r3 = store pointer, r0 runs ahead as the load pointer */
25 sub r2, r2, #4 /* the last group of four is finished after the loop */
26 vld1.32 {q0}, [r1,:128]! /* first two source pairs */
27 vmul.f32 q0, q0, q0 /* square every component */
28 vld1.32 {q2}, [r1,:128]! /* next two source pairs */
29 vmul.f32 q2, q2, q2
30 vld1.32 {q1}, [r0,:128]! /* four current destination values */
311:
32 vpadd.f32 d6, d0, d1 /* pairwise add: squared magnitudes of pairs 0 and 1 */
33 vld1.32 {q0}, [r1,:128]! /* q0 is free again: start loading the next group */
34 vpadd.f32 d7, d4, d5 /* squared magnitudes of pairs 2 and 3 */
35 vmul.f32 q0, q0, q0
36 vld1.32 {q2}, [r1,:128]!
37 vadd.f32 q3, q1, q3 /* accumulate onto the old destination values */
38 vld1.32 {q1}, [r0,:128]! /* destination values for the next group */
39 vmul.f32 q2, q2, q2
40 vst1.32 {q3}, [r3,:128]!
41 subs r2, r2, #4
42 bgt 1b
/* Tail: finish the group whose inputs were loaded by the last loop pass. */
43 vpadd.f32 d6, d0, d1
44 vpadd.f32 d7, d4, d5
45 vadd.f32 q1, q1, q3
46 vst1.32 {q1}, [r3,:128]!
47 bx lr
48endfunc
49
/*
 * ff_ps_mul_pair_single_neon
 *
 * Registers as used below:
 *   r0 - destination, written as float pairs, 16-byte aligned (:128 stores)
 *   r1 - source float pairs
 *   r2 - source scalars, one per pair, 16-byte aligned (:128 loads)
 *   r3 - number of pairs
 *
 * For every pair k both floats of the r1 pair are multiplied by scalar k
 * from r2 and the result is stored as pair k at r0.
 *
 * Four pairs are handled per pass. Bit 3 of r1 selects one of two code
 * paths so that the 128-bit loads from r1 can keep their :128 hint:
 *   - bit clear: r1 is used as is (first path),
 *   - bit set:   one pair is loaded first with a :64 hint, which moves r1 up
 *                by 8 bytes before the 128-bit loads begin (path at "2:").
 * Both paths are software pipelined. Each loop body runs at least once
 * before its tail, so at least 8 pairs are processed whatever r3 holds.
 * Presumably callers pass a multiple of 4 that is >= 8 - verify.
 */
50function ff_ps_mul_pair_single_neon, export=1
51 sub r3, r3, #4 /* the last group of four is finished after the loop */
52 tst r1, #8
53 bne 2f /* r1 is 8 bytes off a 16-byte boundary: use the second path */
54 vld1.32 {q0}, [r1,:128]! /* pairs 0 and 1 of the first group */
551:
56 vld1.32 {q3}, [r2,:128]! /* four scalars in d6[0], d6[1], d7[0], d7[1] */
57 vmul.f32 d4, d0, d6[0]
58 vmul.f32 d5, d1, d6[1]
59 vld1.32 {q1}, [r1,:128]! /* pairs 2 and 3 */
60 vmul.f32 d6, d2, d7[0] /* d6 may be overwritten: its scalars are used up */
61 vmul.f32 d7, d3, d7[1] /* reads d7[1] and then replaces d7 */
62 vld1.32 {q0}, [r1,:128]! /* pairs 0 and 1 of the next group */
63 vst1.32 {q2,q3}, [r0,:128]! /* four result pairs */
64 subs r3, r3, #4
65 bgt 1b
/* Tail of the first path: same as the loop body without the look-ahead load. */
66 vld1.32 {q3}, [r2,:128]!
67 vmul.f32 d4, d0, d6[0]
68 vmul.f32 d5, d1, d6[1]
69 vld1.32 {q1}, [r1,:128]!
70 vmul.f32 d6, d2, d7[0]
71 vmul.f32 d7, d3, d7[1]
72 vst1.32 {q2,q3}, [r0,:128]!
73 bx lr
/*
 * Second path: d0, d1 and d2 hold pairs 0, 1 and 2 of the current group;
 * pair 3 arrives together with pair 0 of the following group.
 */
742:
75 vld1.32 {d0}, [r1,:64]! /* pair 0; r1 moves up by 8 bytes */
76 vld1.32 {d1,d2}, [r1,:128]! /* pairs 1 and 2 */
771:
78 vld1.32 {q3}, [r2,:128]! /* four scalars */
79 vmul.f32 d4, d0, d6[0]
80 vmul.f32 d5, d1, d6[1]
81 vld1.32 {d0,d1}, [r1,:128]! /* d0 = pair 3, d1 = pair 0 of the next group */
82 vmul.f32 d6, d2, d7[0]
83 vmul.f32 d7, d0, d7[1]
84 vmov d0, d1 /* move the next group's pair 0 into place */
85 vld1.32 {d1,d2}, [r1,:128]! /* pairs 1 and 2 of the next group */
86 vst1.32 {q2,q3}, [r0,:128]!
87 subs r3, r3, #4
88 bgt 1b
/* Tail of the second path: only pair 3 of the last group is still missing. */
89 vld1.32 {q3}, [r2,:128]!
90 vmul.f32 d4, d0, d6[0]
91 vmul.f32 d5, d1, d6[1]
92 vld1.32 {d0}, [r1,:64]! /* pair 3 */
93 vmul.f32 d6, d2, d7[0]
94 vmul.f32 d7, d0, d7[1]
95 vst1.32 {q2,q3}, [r0,:128]!
96 bx lr
97endfunc
98
/*
 * ff_ps_hybrid_synthesis_deint_neon
 *
 * Registers as used below:
 *   r0 - output base
 *   r1 - input base
 *   r2 - start index i
 *   r3 - len, the number of float pairs taken from each input row
 *
 * Input is read as rows of 32 float pairs (32*2*4 = 256 bytes per row),
 * starting at row i. The first float of every pair is written to the first
 * output plane and the second float to a second plane located 38*64*4 bytes
 * after it. Input row j is written at float offset j within each plane and
 * consecutive pairs of a row land 64*4 bytes apart, i.e. the data is
 * de-interleaved and transposed.
 *
 * Rows i .. 63 are processed in up to three stages so that the main loop
 * can use four-lane stores with a :128 hint:
 *   1. one single row,
 *   2. optionally two rows at once (label 6), chosen by bit 1 of the
 *      remaining row count,
 *   3. four rows per pass until the remaining count is <= 0.
 * NOTE(review): stage 3 only ends up 16-byte aligned if the output base is
 * 16-byte aligned and i is odd - verify against the callers.
 * NOTE(review): r1 is only stepped by the loads (len * 8 bytes per row)
 * plus whole-row constants, so the row arithmetic appears to rely on
 * len == 32 - verify.
 * Stage 3 is entered unconditionally and therefore runs at least once.
 *
 * r4-r8 and lr are saved on entry and restored on return.
 */
99function ff_ps_hybrid_synthesis_deint_neon, export=1
100 push {r4-r8,lr}
101 add r0, r0, r2, lsl #2 /* output += i floats */
102 add r1, r1, r2, lsl #5+1+2 /* input += i rows (i << 8 bytes) */
103 rsb r2, r2, #64 /* r2 = 64 - i rows left */
104 mov r5, #64*4 /* byte step between successive stores */
105 mov lr, r0 /* lr = write pointer, first plane */
106 add r4, r0, #38*64*4 /* r4 = write pointer, second plane */
107 mov r12, r3 /* r12 = pairs left in this row */
/* Stage 1: a single row, two pairs per pass. */
1082:
109 vld1.32 {d0,d1}, [r1,:128]!
110 vst1.32 {d0[0]}, [lr,:32], r5 /* first float of pair 0 */
111 vst1.32 {d0[1]}, [r4,:32], r5 /* second float of pair 0 */
112 vst1.32 {d1[0]}, [lr,:32], r5
113 vst1.32 {d1[1]}, [r4,:32], r5
114 subs r12, r12, #2
115 bgt 2b
116 add r0, r0, #4 /* move on by one output float */
117 sub r2, r2, #1
118 tst r2, #2
119 bne 6f /* bit 1 of the remaining count set: do two rows first */
/* Stage 3: four rows per pass. */
1201:
121 mov lr, r0
122 add r4, r0, #38*64*4
123 add r6, r1, # 32*2*4 /* r6, r7, r8 = the three rows following r1 */
124 add r7, r1, #2*32*2*4
125 add r8, r1, #3*32*2*4
126 mov r12, r3
1272:
128 vld1.32 {d0,d1}, [r1,:128]!
129 vld1.32 {d2,d3}, [r6,:128]!
130 vld1.32 {d4,d5}, [r7,:128]!
131 vld1.32 {d6,d7}, [r8,:128]!
/* Each vst4 writes one float from each of the four rows as 16 contiguous bytes. */
132 vst4.32 {d0[0],d2[0],d4[0],d6[0]}, [lr,:128], r5
133 vst4.32 {d0[1],d2[1],d4[1],d6[1]}, [r4,:128], r5
134 vst4.32 {d1[0],d3[0],d5[0],d7[0]}, [lr,:128], r5
135 vst4.32 {d1[1],d3[1],d5[1],d7[1]}, [r4,:128], r5
136 subs r12, r12, #2
137 bgt 2b
138 add r0, r0, #16 /* four output floats further */
139 add r1, r1, #3*32*2*4 /* skip the three rows read through r6-r8 */
140 subs r2, r2, #4
141 bgt 1b
142 pop {r4-r8,pc}
/* Stage 2: two rows at once, then continue with stage 3. */
1436:
144 mov lr, r0
145 add r4, r0, #38*64*4
146 add r6, r1, #32*2*4 /* r6 = the row following r1 */
147 mov r12, r3
1482:
149 vld1.32 {d0,d1}, [r1,:128]!
150 vld1.32 {d2,d3}, [r6,:128]!
151 vst2.32 {d0[0],d2[0]}, [lr,:64], r5
152 vst2.32 {d0[1],d2[1]}, [r4,:64], r5
153 vst2.32 {d1[0],d3[0]}, [lr,:64], r5
154 vst2.32 {d1[1],d3[1]}, [r4,:64], r5
155 subs r12, r12, #2
156 bgt 2b
157 add r0, r0, #8 /* two output floats further */
158 add r1, r1, #32*2*4 /* skip the row read through r6 */
159 sub r2, r2, #2
160 b 1b
161endfunc
162
/*
 * ff_ps_hybrid_analysis_neon
 *
 * Registers as used below:
 *   r0   - output; one float pair is stored per filter row, after which the
 *          pointer advances by r3 * 8 bytes
 *   r1   - input; 13 consecutive float pairs in[0..12] are loaded from it
 *   r2   - filter coefficients, 16-byte aligned, read sequentially as 16
 *          floats f[0..15] per output (f[13..15] are loaded but unused)
 *   r3   - output stride, counted in float pairs
 *   [sp] - number of outputs
 *
 * The inputs are folded around in[6]. For k = 0..5:
 *     s_k = in[k] + in[12 - k]        d_k = in[k] - in[12 - k]
 * Treating the two floats of a pair as (x, y), each output is
 *     out.x = sum_k (s_k.x * f[2k] - d_k.y * f[2k+1]) + in[6].x * f[12]
 *     out.y = sum_k (s_k.y * f[2k] + d_k.x * f[2k+1]) + in[6].y * f[12]
 * NOTE(review): this has the shape of a complex multiply-accumulate with a
 * symmetric filter - confirm against the C reference.
 *
 * The folded inputs are computed once and kept in q8-q11, q0, q1 and d25;
 * only the coefficients change per output. The loop is software pipelined
 * (the first two multiply-accumulates of the next output are issued before
 * the current one is stored) and its body runs at least once before the
 * tail, so at least two outputs are written.
 *
 * Only d0-d7 and d16-d31 are written; d8-d15 are left alone.
 */
163function ff_ps_hybrid_analysis_neon, export=1
164 vldm r1, {d19-d31} /* d19 + k = in[k], k = 0..12 */
165 ldr r12, [sp] /* output count */
166 lsl r3, r3, #3 /* stride: float pairs -> bytes */
/*
 * Sums and differences. Several destination registers still hold inputs, so
 * each one is only overwritten after its last use as a source.
 */
167 vadd.f32 d16, d19, d31 /* s_0 */
168 vadd.f32 d17, d20, d30 /* s_1 */
169 vsub.f32 d18, d19, d31 /* d_0 */
170 vsub.f32 d19, d20, d30 /* d_1 */
171 vsub.f32 d0, d21, d29 /* d_2 */
172 vsub.f32 d1, d22, d28 /* d_3 */
173 vadd.f32 d2, d21, d29 /* s_2 */
174 vadd.f32 d3, d22, d28 /* s_3 */
175 vadd.f32 d20, d23, d27 /* s_4 */
176 vadd.f32 d21, d24, d26 /* s_5 */
177 vsub.f32 d22, d23, d27 /* d_4 */
178 vsub.f32 d23, d24, d26 /* d_5 */
179 vmov.i32 d6, #1<<31
180 vmov.i32 d7, #0
181 vmov.f32 q14, #0.0 /* accumulator for out.x */
182 vmov.f32 q15, #0.0 /* accumulator for out.y */
183 vtrn.32 d6, d7 /* q3 = sign-bit mask on lanes 0 and 2 only */
/* Turn every difference (x, y) into (-y, x): swap the halves, flip lane 0. */
184 vrev64.32 q9, q9
185 vrev64.32 q0, q0
186 vrev64.32 q11, q11
187 veor q9, q9, q3
188 veor q0, q0, q3
189 veor q11, q11, q3
190 vld1.32 {q13}, [r2,:128]! /* f[0..3] */
/*
 * Interleave sums with rotated differences so that one register of each
 * pair feeds out.x and the other out.y when multiplied by (f[2k], f[2k+1]).
 */
191 vtrn.32 q8, q9
192 vtrn.32 q1, q0
193 vtrn.32 q10, q11
194 sub r12, r12, #1 /* the last output is finished after the loop */
195 vmla.f32 q14, q8, q13
196 vld1.32 {q2}, [r2,:128]! /* f[4..7] */
197 vmla.f32 q15, q9, q13
1981:
199 vmla.f32 q14, q1, q2
200 vld1.32 {q13}, [r2,:128]! /* f[8..11] */
201 vmla.f32 q15, q0, q2
202 vmla.f32 q14, q10, q13
203 vld1.32 {q2}, [r2,:128]! /* f[12..15]; only f[12] in d4[0] is used */
204 vmla.f32 q15, q11, q13
205 vld1.32 {q13}, [r2,:128]! /* f[0..3] of the next output */
206 vadd.f32 d6, d28, d29 /* fold each accumulator from four lanes to two */
207 vadd.f32 d7, d30, d31
208 vmov.f32 q14, #0.0 /* restart the accumulators for the next output */
209 vmov.f32 q15, #0.0
210 vmla.f32 q14, q8, q13
211 vpadd.f32 d6, d6, d7 /* d6 = (out.x, out.y) without the centre term */
212 vmla.f32 q15, q9, q13
213 vmla.f32 d6, d25, d4[0] /* add in[6] * f[12] */
214 vld1.32 {q2}, [r2,:128]! /* f[4..7] of the next output */
215 vst1.32 {d6}, [r0,:64], r3
216 subs r12, r12, #1
217 bgt 1b
/* Tail: finish the output already started by the last loop pass. */
218 vmla.f32 q14, q1, q2
219 vld1.32 {q13}, [r2,:128]!
220 vmla.f32 q15, q0, q2
221 vmla.f32 q14, q10, q13
222 vld1.32 {q2}, [r2,:128]!
223 vmla.f32 q15, q11, q13
224 vadd.f32 d6, d28, d29
225 vadd.f32 d7, d30, d31
226 vpadd.f32 d6, d6, d7
227 vmla.f32 d6, d25, d4[0]
228 vst1.32 {d6}, [r0,:64], r3
229 bx lr
230endfunc
231
/*
 * ff_ps_stereo_interpolate_neon
 *
 * Registers as used below:
 *   r0   - first buffer of float pairs ("l"), updated in place
 *   r1   - second buffer of float pairs ("r"), updated in place
 *   r2   - four coefficients h[0..3] (loaded without alignment hint)
 *   r3   - four per-sample increments step[0..3] for those coefficients
 *   [sp] - number of pairs
 *
 * For sample n (counting from 0) the coefficients are
 *     c[j] = h[j] + (n + 1) * step[j]
 * and both floats of each pair are transformed as
 *     l' = l * c[0] + r * c[2]
 *     r' = l * c[1] + r * c[3]
 *
 * Two samples are handled per pass: q1 holds the coefficients for the first
 * sample of the pass and q0 those for the second, and both advance by
 * 2 * step per pass. A trailing odd sample is handled at label 2.
 *
 * Buffer accesses carry a :64 hint (8-byte alignment). The loads run one
 * pass ahead of the stores, so up to 16 bytes past the last processed pair
 * are read from each buffer. NOTE(review): presumably the buffers are padded
 * so that this is harmless - verify.
 */
232function ff_ps_stereo_interpolate_neon, export=1
233 vld1.32 {q0}, [r2] /* q0 = h[0..3] */
234 vld1.32 {q14}, [r3] /* q14 = step[0..3] */
235 vadd.f32 q15, q14, q14 /* q15 = 2 * step, the per-pass increment */
236 mov r2, r0 /* r2 and r3 become the store pointers, */
237 mov r3, r1 /* r0 and r1 run ahead as load pointers */
238 ldr r12, [sp] /* number of pairs */
239 vadd.f32 q1, q0, q14 /* coefficients for the first sample of a pass */
240 vadd.f32 q0, q0, q15 /* coefficients for the second sample of a pass */
241 vld1.32 {q2}, [r0,:64]! /* two pairs from the first buffer */
242 vld1.32 {q3}, [r1,:64]! /* two pairs from the second buffer */
243 subs r12, r12, #1
244 beq 2f /* exactly one pair: only the single-sample tail runs */
2451:
246 vmul.f32 d16, d4, d2[0] /* l' = l * c[0] ... */
247 vmul.f32 d17, d5, d0[0]
248 vmul.f32 d18, d4, d2[1] /* r' = l * c[1] ... */
249 vmul.f32 d19, d5, d0[1]
250 vmla.f32 d16, d6, d3[0] /* ... + r * c[2] */
251 vmla.f32 d17, d7, d1[0]
252 vmla.f32 d18, d6, d3[1] /* ... + r * c[3] */
253 vmla.f32 d19, d7, d1[1]
254 vadd.f32 q1, q1, q15 /* step both coefficient sets by two samples */
255 vadd.f32 q0, q0, q15
256 vld1.32 {q2}, [r0,:64]! /* look-ahead loads for the next pass */
257 vld1.32 {q3}, [r1,:64]!
258 vst1.32 {q8}, [r2,:64]!
259 vst1.32 {q9}, [r3,:64]!
260 subs r12, r12, #2
261 bgt 1b
/* Negative count: everything is done. Zero: exactly one pair is left. */
262 it lt /* IT block for the conditional return when built as Thumb-2 */
263 bxlt lr
/* Single remaining pair: uses q1 and the low halves of the loaded data. */
2642:
265 vmul.f32 d16, d4, d2[0]
266 vmul.f32 d18, d4, d2[1]
267 vmla.f32 d16, d6, d3[0]
268 vmla.f32 d18, d6, d3[1]
269 vst1.32 {d16}, [r2,:64]!
270 vst1.32 {d18}, [r3,:64]!
271 bx lr
272endfunc