/* ffmpeg/libavcodec/arm/sbrdsp_neon.S */
/*
 * Copyright (c) 2012 Mans Rullgard
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

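/*
 * In-place sum of five 64-float blocks, cf. sbr_sum64x5_c() in
 * libavcodec/sbrdsp.c:
 *   z[k] += z[k+64] + z[k+128] + z[k+192] + z[k+256]   for k = 0..63
 */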
function ff_sbr_sum64x5_neon, export=1
        push {lr}
        add r1, r0, # 64*4
        add r2, r0, #128*4
        add r3, r0, #192*4
        add lr, r0, #256*4
        mov r12, #64
1:
        vld1.32 {q0}, [r0,:128]
        vld1.32 {q1}, [r1,:128]!
        vadd.f32 q0, q0, q1
        vld1.32 {q2}, [r2,:128]!
        vadd.f32 q0, q0, q2
        vld1.32 {q3}, [r3,:128]!
        vadd.f32 q0, q0, q3
        vld1.32 {q8}, [lr,:128]!
        vadd.f32 q0, q0, q8
        vst1.32 {q0}, [r0,:128]!
        subs r12, #4
        bgt 1b
        pop {pc}
endfunc

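/*
 * Energy of n complex samples, cf. sbr_sum_square_c():
 *   sum += x[i][0]*x[i][0] + x[i][1]*x[i][1]   for i = 0..n-1
 * The loop consumes two complex samples per iteration, so n is assumed even.
 */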
function ff_sbr_sum_square_neon, export=1
        vmov.f32 q0, #0.0
1:
        vld1.32 {q1}, [r0,:128]!
        vmla.f32 q0, q1, q1
        subs r1, r1, #2
        bgt 1b
        vadd.f32 d0, d0, d1
        vpadd.f32 d0, d0, d0
NOVFP   vmov.32 r0, d0[0]
        bx lr
endfunc

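/*
 * Flips the sign bit of every odd-indexed float in a 64-element buffer,
 * cf. sbr_neg_odd_64_c().
 */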
function ff_sbr_neg_odd_64_neon, export=1
        mov r1, r0
        vmov.i32 q8, #1<<31
        vld2.32 {q0,q1}, [r0,:128]!
        veor q1, q1, q8
        vld2.32 {q2,q3}, [r0,:128]!
        .rept 3
        vst2.32 {q0,q1}, [r1,:128]!
        veor q3, q3, q8
        vld2.32 {q0,q1}, [r0,:128]!
        vst2.32 {q2,q3}, [r1,:128]!
        veor q1, q1, q8
        vld2.32 {q2,q3}, [r0,:128]!
        .endr
        veor q3, q3, q8
        vst2.32 {q0,q1}, [r1,:128]!
        vst2.32 {q2,q3}, [r1,:128]!
        bx lr
endfunc

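/*
 * QMF pre-shuffle, cf. sbr_qmf_pre_shuffle_c(): extends z[0..63] into
 * z[64..127] as
 *   z[64]      =  z[0],  z[65] = z[1]
 *   z[64+2k]   = -z[64-k]
 *   z[64+2k+1] =  z[k+1]          for k = 1..31
 */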
function ff_sbr_qmf_pre_shuffle_neon, export=1
        add r1, r0, #60*4
        add r2, r0, #64*4
        vld1.32 {d0}, [r0,:64]!
        vst1.32 {d0}, [r2,:64]!
        mov r3, #-16
        mov r12, #24
        vmov.i32 q8, #1<<31
        vld1.32 {q0}, [r1,:128], r3
        vld1.32 {d2}, [r0,:64]!
1:
        vld1.32 {d3,d4}, [r0,:128]!
        vrev64.32 q0, q0
        vld1.32 {q9}, [r1,:128], r3
        veor q0, q0, q8
        vld1.32 {d5,d6}, [r0,:128]!
        vswp d0, d1
        vrev64.32 q9, q9
        vst2.32 {q0,q1}, [r2,:64]!
        vmov q10, q2
        veor q9, q9, q8
        vmov d2, d6
        vswp d18, d19
        vld1.32 {q0}, [r1,:128], r3
        vst2.32 {q9,q10}, [r2,:64]!
        subs r12, r12, #8
        bgt 1b
        vld1.32 {d3,d4}, [r0,:128]!
        vrev64.32 q0, q0
        vld1.32 {q9}, [r1,:128], r3
        veor q0, q0, q8
        vld1.32 {d5}, [r0,:64]!
        vswp d0, d1
        vrev64.32 q9, q9
        vst2.32 {q0,q1}, [r2,:64]!
        vswp d4, d5
        veor q1, q9, q8
        vst2.32 {d3,d5}, [r2,:64]!
        vst2.32 {d2[0],d4[0]}, [r2,:64]!
        bx lr
endfunc

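/*
 * QMF post-shuffle, cf. sbr_qmf_post_shuffle_c():
 *   W[k][0] = -z[63-k]
 *   W[k][1] =  z[k]               for k = 0..31
 */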
function ff_sbr_qmf_post_shuffle_neon, export=1
        add r2, r1, #60*4
        mov r3, #-16
        mov r12, #32
        vmov.i32 q8, #1<<31
        vld1.32 {q0}, [r2,:128], r3
        vld1.32 {q1}, [r1,:128]!
1:
        pld [r2, #-32]
        vrev64.32 q0, q0
        vswp d2, d3
        veor q0, q0, q8
        vld1.32 {q2}, [r2,:128], r3
        vld1.32 {q3}, [r1,:128]!
        vst2.32 {d1,d3}, [r0,:128]!
        vst2.32 {d0,d2}, [r0,:128]!
        pld [r2, #-32]
        vrev64.32 q2, q2
        vswp d6, d7
        veor q2, q2, q8
        vld1.32 {q0}, [r2,:128], r3
        vld1.32 {q1}, [r1,:128]!
        vst2.32 {d5,d7}, [r0,:128]!
        vst2.32 {d4,d6}, [r0,:128]!
        subs r12, r12, #8
        bgt 1b
        bx lr
endfunc

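/*
 * QMF deinterleave with negation, cf. sbr_qmf_deint_neg_c():
 *   v[i]    =  src[63-2i]
 *   v[63-i] = -src[62-2i]         for i = 0..31
 */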
function ff_sbr_qmf_deint_neg_neon, export=1
        add r1, r1, #60*4
        add r2, r0, #62*4
        mov r3, #-16
        mov r12, #32
        vmov.i32 d2, #1<<31
1:
        vld2.32 {d0,d1}, [r1,:128], r3
        veor d0, d0, d2
        vrev64.32 d1, d1
        vst1.32 {d0}, [r2,:64]
        vst1.32 {d1}, [r0,:64]!
        sub r2, r2, #8
        subs r12, r12, #2
        bgt 1b
        bx lr
endfunc

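/*
 * QMF deinterleave butterfly, cf. sbr_qmf_deint_bfly_c():
 *   v[i]     = src0[i] - src1[63-i]
 *   v[127-i] = src0[i] + src1[63-i]   for i = 0..63
 */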
function ff_sbr_qmf_deint_bfly_neon, export=1
        push {lr}
        add r2, r2, #60*4
        add r3, r0, #124*4
        mov r12, #64
        mov lr, #-16
1:
        vld1.32 {q0}, [r1,:128]!
        vld1.32 {q1}, [r2,:128], lr
        vrev64.32 q2, q0
        vrev64.32 q3, q1
        vadd.f32 d3, d4, d3
        vadd.f32 d2, d5, d2
        vsub.f32 d0, d0, d7
        vsub.f32 d1, d1, d6
        vst1.32 {q1}, [r3,:128], lr
        vst1.32 {q0}, [r0,:128]!
        subs r12, r12, #4
        bgt 1b
        pop {pc}
endfunc

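/*
 * HF gain filter, cf. sbr_hf_g_filt_c(): scales one column of X_high,
 *   Y[m][0] = X_high[m][ixh][0] * g_filt[m]
 *   Y[m][1] = X_high[m][ixh][1] * g_filt[m]   for m = 0..m_max-1
 * r0 = Y, r1 = X_high, r2 = g_filt, r3 = m_max, [sp] = ixh.
 */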
function ff_sbr_hf_g_filt_neon, export=1
        ldr r12, [sp]
        add r1, r1, r12, lsl #3
        mov r12, #40*2*4
        sub r3, r3, #1
        vld2.32 {d2[],d3[]}, [r2,:64]!
        vld1.32 {d0}, [r1,:64], r12
1:
        vld1.32 {d1}, [r1,:64], r12
        vmul.f32 q3, q0, q1
        vld2.32 {d2[],d3[]}, [r2,:64]!
        vld1.32 {d0}, [r1,:64], r12
        vst1.32 {q3}, [r0,:64]!
        subs r3, r3, #2
        bgt 1b
        it lt
        bxlt lr
        vmul.f32 d0, d0, d2
        vst1.32 {d0}, [r0,:64]!
        bx lr
endfunc

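/*
 * HF generator, cf. sbr_hf_gen_c(): second-order complex linear prediction,
 *   X_high[i] = X_low[i]
 *             + bw    * alpha0 * X_low[i-1]
 *             + bw*bw * alpha1 * X_low[i-2]   (complex multiplies)
 * for i = start..end-1.
 */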
function ff_sbr_hf_gen_neon, export=1
NOVFP   vld1.32 {d1[]}, [sp,:32]
VFP     vdup.32 d1, d0[0]
        vmul.f32 d0, d1, d1
        vld1.32 {d3}, [r2,:64]
        vld1.32 {d2}, [r3,:64]
        vmul.f32 q0, q0, q1
        ldrd r2, r3, [sp, #4*!HAVE_VFP_ARGS]
        vtrn.32 d0, d1
        vneg.f32 d18, d1
        vtrn.32 d18, d1
        add r0, r0, r2, lsl #3
        add r1, r1, r2, lsl #3
        sub r1, r1, #2*8
        sub r3, r3, r2
        vld1.32 {q1}, [r1,:128]!
1:
        vld1.32 {q3}, [r1,:128]!
        vrev64.32 q2, q1
        vmov q8, q3
        vrev64.32 d20, d3
        vrev64.32 d21, d6
        vmla.f32 q3, q1, d0[0]
        vmla.f32 d6, d4, d18
        vmla.f32 d7, d20, d18
        vmla.f32 d6, d3, d0[1]
        vmla.f32 d7, d16, d0[1]
        vmla.f32 d6, d5, d1
        vmla.f32 d7, d21, d1
        vmov q1, q8
        vst1.32 {q3}, [r0,:128]!
        subs r3, r3, #2
        bgt 1b
        bx lr
endfunc

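/*
 * Autocorrelation for HF inverse filtering, cf. sbr_autocorrelate_c():
 * accumulates the lag-0, lag-1 and lag-2 correlation sums of x[] and writes
 * them to phi[0][1], phi[1][0][0], phi[1][1] and phi[2][1][0].  The leading
 * 128-bit store also covers the phi[0][0] entries, which the C version
 * leaves untouched.
 */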
function ff_sbr_autocorrelate_neon, export=1
        vld1.32 {q0}, [r0,:128]!
        vmov.f32 q1, #0.0
        vmov.f32 q3, #0.0
        vmov.f32 d20, #0.0
        vmul.f32 d21, d1, d1
        vmov q8, q0
        vmov q11, q0
        mov r12, #36
1:
        vld1.32 {q2}, [r0,:128]!
        vrev64.32 q12, q2
        vmla.f32 q10, q2, q2
        vmla.f32 d2, d1, d4
        vmla.f32 d3, d1, d24
        vmla.f32 d6, d0, d4
        vmla.f32 d7, d0, d24
        vmla.f32 d2, d4, d5
        vmla.f32 d3, d4, d25
        vmla.f32 d6, d1, d5
        vmla.f32 d7, d1, d25
        vmov q0, q2
        subs r12, r12, #2
        bgt 1b
        vld1.32 {q2}, [r0,:128]!
        vrev64.32 q12, q2
        vmla.f32 d2, d1, d4
        vmla.f32 d3, d1, d24
        vmla.f32 d6, d0, d4
        vmla.f32 d7, d0, d24
        vadd.f32 d20, d20, d21
        vrev64.32 d18, d17
        vmla.f32 d6, d1, d5
        vmla.f32 d7, d1, d25
        vmov q0, q1
        vmla.f32 d0, d16, d17
        vmla.f32 d1, d16, d18
        vmla.f32 d2, d4, d5
        vmla.f32 d3, d4, d25
        vneg.f32 s15, s15
        vmov d21, d20
        vpadd.f32 d0, d0, d2
        vpadd.f32 d7, d6, d7
        vtrn.32 d1, d3
        vsub.f32 d6, d1, d3
        vmla.f32 d20, d22, d22
        vmla.f32 d21, d4, d4
        vtrn.32 d0, d6
        vpadd.f32 d20, d20, d21
        vst1.32 {q3}, [r1,:128]!
        vst1.32 {d20[1]}, [r1,:32]
        add r1, r1, #2*4
        vst1.32 {d0}, [r1,:64]
        add r1, r1, #4*4
        vst1.32 {d20[0]}, [r1,:32]
        bx lr
endfunc

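/*
 * sbr_hf_apply_noise, variants 0-3 (phase steps of 0, 90, 180 and 270
 * degrees), cf. sbr_hf_apply_noise_0..3 in libavcodec/sbrdsp.c.  For each m:
 * if s_m[m] != 0, add +/-s_m[m] to the real or imaginary part of Y[m] (which
 * part and sign depend on the variant and, for variants 1 and 3, on the
 * parity of kx); otherwise add q_filt[m] * ff_sbr_noise_table[noise].  The
 * noise index steps modulo 512 (the bfc instructions).
 */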
function ff_sbr_hf_apply_noise_0_neon, export=1
        vmov.i32 d3, #0
.Lhf_apply_noise_0:
        push {r4,lr}
        movrelx r4, X(ff_sbr_noise_table)
        ldr r12, [sp, #12]
        add r3, r3, #1
        bfc r3, #9, #23
        sub r12, r12, #1
1:
        add lr, r4, r3, lsl #3
        vld2.32 {q0}, [r0,:64]
        vld2.32 {q3}, [lr,:64]
        vld1.32 {d2}, [r1,:64]!
        vld1.32 {d18}, [r2,:64]!
        vceq.f32 d16, d2, #0
        veor d2, d2, d3
        vmov q2, q0
        vmla.f32 d0, d6, d18
        vmla.f32 d1, d7, d18
        vadd.f32 d4, d4, d2
        add r3, r3, #2
        bfc r3, #9, #23
        vbif d0, d4, d16
        vbif d1, d5, d16
        vst2.32 {q0}, [r0,:64]!
        subs r12, r12, #2
        bgt 1b
        blt 2f
        add lr, r4, r3, lsl #3
        vld1.32 {d0}, [r0,:64]
        vld1.32 {d6}, [lr,:64]
        vld1.32 {d2[]}, [r1,:32]!
        vld1.32 {d3[]}, [r2,:32]!
        vceq.f32 d4, d2, #0
        veor d2, d2, d3
        vmov d1, d0
        vmla.f32 d0, d6, d3
        vadd.f32 s2, s2, s4
        vbif d0, d1, d4
        vst1.32 {d0}, [r0,:64]!
2:
        pop {r4,pc}
endfunc

function ff_sbr_hf_apply_noise_1_neon, export=1
        ldr r12, [sp]
        push {r4,lr}
        lsl r12, r12, #31
        eor lr, r12, #1<<31
        vmov d3, r12, lr
.Lhf_apply_noise_1:
        movrelx r4, X(ff_sbr_noise_table)
        ldr r12, [sp, #12]
        add r3, r3, #1
        bfc r3, #9, #23
        sub r12, r12, #1
1:
        add lr, r4, r3, lsl #3
        vld2.32 {q0}, [r0,:64]
        vld2.32 {q3}, [lr,:64]
        vld1.32 {d2}, [r1,:64]!
        vld1.32 {d18}, [r2,:64]!
        vceq.f32 d16, d2, #0
        veor d2, d2, d3
        vmov q2, q0
        vmla.f32 d0, d6, d18
        vmla.f32 d1, d7, d18
        vadd.f32 d5, d5, d2
        add r3, r3, #2
        bfc r3, #9, #23
        vbif d0, d4, d16
        vbif d1, d5, d16
        vst2.32 {q0}, [r0,:64]!
        subs r12, r12, #2
        bgt 1b
        blt 2f
        add lr, r4, r3, lsl #3
        vld1.32 {d0}, [r0,:64]
        vld1.32 {d6}, [lr,:64]
        vld1.32 {d2[]}, [r1,:32]!
        vld1.32 {d18[]}, [r2,:32]!
        vceq.f32 d4, d2, #0
        veor d2, d2, d3
        vmov d1, d0
        vmla.f32 d0, d6, d18
        vadd.f32 s3, s3, s5
        vbif d0, d1, d4
        vst1.32 {d0}, [r0,:64]!
2:
        pop {r4,pc}
endfunc

function ff_sbr_hf_apply_noise_2_neon, export=1
        vmov.i32 d3, #1<<31
        b .Lhf_apply_noise_0
endfunc

function ff_sbr_hf_apply_noise_3_neon, export=1
        ldr r12, [sp]
        push {r4,lr}
        lsl r12, r12, #31
        eor lr, r12, #1<<31
        vmov d3, lr, r12
        b .Lhf_apply_noise_1
endfunc