/*
 * Copyright (c) 2012 Mans Rullgard
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

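@ Sum five 64-float blocks in place: x[n] += x[n+64] + x[n+128] + x[n+192] + x[n+256]
@ for n = 0..63, with x in r0.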
function ff_sbr_sum64x5_neon, export=1
        push            {lr}
        add             r1,  r0,  # 64*4
        add             r2,  r0,  #128*4
        add             r3,  r0,  #192*4
        add             lr,  r0,  #256*4
        mov             r12, #64
1:
        vld1.32         {q0},     [r0,:128]
        vld1.32         {q1},     [r1,:128]!
        vadd.f32        q0,  q0,  q1
        vld1.32         {q2},     [r2,:128]!
        vadd.f32        q0,  q0,  q2
        vld1.32         {q3},     [r3,:128]!
        vadd.f32        q0,  q0,  q3
        vld1.32         {q8},     [lr,:128]!
        vadd.f32        q0,  q0,  q8
        vst1.32         {q0},     [r0,:128]!
        subs            r12, #4
        bgt             1b
        pop             {pc}
endfunc

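@ Return the sum of squares of the n complex (re,im) float pairs at r0, with the
@ pair count in r1; the scalar result is returned in s0 (copied to r0 for softfp).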
function ff_sbr_sum_square_neon, export=1
        vmov.f32        q0,  #0.0
1:
        vld1.32         {q1},     [r0,:128]!
        vmla.f32        q0,  q1,  q1
        subs            r1,  r1,  #2
        bgt             1b
        vadd.f32        d0,  d0,  d1
        vpadd.f32       d0,  d0,  d0
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc

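@ Flip the sign bit of every odd-indexed element of the 64-float buffer in r0.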
function ff_sbr_neg_odd_64_neon, export=1
        mov             r1,  r0
        vmov.i32        q8,  #1<<31
        vld2.32         {q0,q1},  [r0,:128]!
        veor            q1,  q1,  q8
        vld2.32         {q2,q3},  [r0,:128]!
    .rept 3
        vst2.32         {q0,q1},  [r1,:128]!
        veor            q3,  q3,  q8
        vld2.32         {q0,q1},  [r0,:128]!
        vst2.32         {q2,q3},  [r1,:128]!
        veor            q1,  q1,  q8
        vld2.32         {q2,q3},  [r0,:128]!
    .endr
        veor            q3,  q3,  q8
        vst2.32         {q0,q1},  [r1,:128]!
        vst2.32         {q2,q3},  [r1,:128]!
        bx              lr
endfunc

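@ QMF pre-shuffle of the buffer in r0 (call it z): z[64] = z[0], z[65] = z[1],
@ then z[64+2k] = -z[64-k] and z[64+2k+1] = z[k+1] for k = 1..31.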
function ff_sbr_qmf_pre_shuffle_neon, export=1
        add             r1,  r0,  #60*4
        add             r2,  r0,  #64*4
        vld1.32         {d0},     [r0,:64]!
        vst1.32         {d0},     [r2,:64]!
        mov             r3,  #-16
        mov             r12, #24
        vmov.i32        q8,  #1<<31
        vld1.32         {q0},     [r1,:128], r3
        vld1.32         {d2},     [r0,:64]!
1:
        vld1.32         {d3,d4},  [r0,:128]!
        vrev64.32       q0,  q0
        vld1.32         {q9},     [r1,:128], r3
        veor            q0,  q0,  q8
        vld1.32         {d5,d6},  [r0,:128]!
        vswp            d0,  d1
        vrev64.32       q9,  q9
        vst2.32         {q0,q1},  [r2,:64]!
        vmov            q10, q2
        veor            q9,  q9,  q8
        vmov            d2,  d6
        vswp            d18, d19
        vld1.32         {q0},     [r1,:128], r3
        vst2.32         {q9,q10}, [r2,:64]!
        subs            r12, r12, #8
        bgt             1b
        vld1.32         {d3,d4},  [r0,:128]!
        vrev64.32       q0,  q0
        vld1.32         {q9},     [r1,:128], r3
        veor            q0,  q0,  q8
        vld1.32         {d5},     [r0,:64]!
        vswp            d0,  d1
        vrev64.32       q9,  q9
        vst2.32         {q0,q1},  [r2,:64]!
        vswp            d4,  d5
        veor            q1,  q9,  q8
        vst2.32         {d3,d5},  [r2,:64]!
        vst2.32         {d2[0],d4[0]}, [r2,:64]!
        bx              lr
endfunc

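@ QMF post-shuffle: with z the input at r1, write the 32 pairs { -z[63-k], z[k] },
@ k = 0..31, to the output at r0.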
function ff_sbr_qmf_post_shuffle_neon, export=1
        add             r2,  r1,  #60*4
        mov             r3,  #-16
        mov             r12, #32
        vmov.i32        q8,  #1<<31
        vld1.32         {q0},     [r2,:128], r3
        vld1.32         {q1},     [r1,:128]!
1:
        pld             [r2, #-32]
        vrev64.32       q0,  q0
        vswp            d2,  d3
        veor            q0,  q0,  q8
        vld1.32         {q2},     [r2,:128], r3
        vld1.32         {q3},     [r1,:128]!
        vst2.32         {d1,d3},  [r0,:128]!
        vst2.32         {d0,d2},  [r0,:128]!
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vswp            d6,  d7
        veor            q2,  q2,  q8
        vld1.32         {q0},     [r2,:128], r3
        vld1.32         {q1},     [r1,:128]!
        vst2.32         {d5,d7},  [r0,:128]!
        vst2.32         {d4,d6},  [r0,:128]!
        subs            r12, r12, #8
        bgt             1b
        bx              lr
endfunc

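@ Deinterleave with negation: with src at r1 and v at r0, v[i] = src[63-2i] and
@ v[63-i] = -src[62-2i] for i = 0..31.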
function ff_sbr_qmf_deint_neg_neon, export=1
        add             r1,  r1,  #60*4
        add             r2,  r0,  #62*4
        mov             r3,  #-16
        mov             r12, #32
        vmov.i32        d2,  #1<<31
1:
        vld2.32         {d0,d1},  [r1,:128], r3
        veor            d0,  d0,  d2
        vrev64.32       d1,  d1
        vst1.32         {d0},     [r2,:64]
        vst1.32         {d1},     [r0,:64]!
        sub             r2,  r2,  #8
        subs            r12, r12, #2
        bgt             1b
        bx              lr
endfunc

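@ Deinterleaving butterfly: v[i] = src0[i] - src1[63-i] and
@ v[127-i] = src0[i] + src1[63-i] for i = 0..63 (v, src0, src1 in r0, r1, r2).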
function ff_sbr_qmf_deint_bfly_neon, export=1
        push            {lr}
        add             r2,  r2,  #60*4
        add             r3,  r0,  #124*4
        mov             r12, #64
        mov             lr,  #-16
1:
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q1},     [r2,:128], lr
        vrev64.32       q2,  q0
        vrev64.32       q3,  q1
        vadd.f32        d3,  d4,  d3
        vadd.f32        d2,  d5,  d2
        vsub.f32        d0,  d0,  d7
        vsub.f32        d1,  d1,  d6
        vst1.32         {q1},     [r3,:128], lr
        vst1.32         {q0},     [r0,:128]!
        subs            r12, r12, #4
        bgt             1b
        pop             {pc}
endfunc

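@ For each of the r3 output samples written to r0: take the complex sample from one
@ column (index taken from the stack) of the 40-complex-wide rows at r1 and scale it
@ by the matching real gain from r2.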
function ff_sbr_hf_g_filt_neon, export=1
        ldr             r12, [sp]
        add             r1,  r1,  r12, lsl #3
        mov             r12, #40*2*4
        sub             r3,  r3,  #1
        vld2.32         {d2[],d3[]},   [r2,:64]!
        vld1.32         {d0},     [r1,:64], r12
1:
        vld1.32         {d1},     [r1,:64], r12
        vmul.f32        q3,  q0,  q1
        vld2.32         {d2[],d3[]},   [r2,:64]!
        vld1.32         {d0},     [r1,:64], r12
        vst1.32         {q3},     [r0,:64]!
        subs            r3,  r3,  #2
        bgt             1b
        it              lt
        bxlt            lr
        vmul.f32        d0,  d0,  d2
        vst1.32         {d0},     [r0,:64]!
        bx              lr
endfunc

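@ High-band generation: starting at the offset given by the first extra stack word,
@ each complex output written to r0 is the corresponding input sample from r1 plus a
@ second-order complex prediction from the two preceding samples, using the
@ coefficient pairs at r2 and r3 scaled by bw and bw*bw (bw is the float argument).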
function ff_sbr_hf_gen_neon, export=1
NOVFP   vld1.32         {d1[]},   [sp,:32]
VFP     vdup.32         d1,  d0[0]
        vmul.f32        d0,  d1,  d1
        vld1.32         {d3},     [r2,:64]
        vld1.32         {d2},     [r3,:64]
        vmul.f32        q0,  q0,  q1
        ldrd            r2,  r3,  [sp, #4*!HAVE_VFP_ARGS]
        vtrn.32         d0,  d1
        vneg.f32        d18, d1
        vtrn.32         d18, d1
        add             r0,  r0,  r2,  lsl #3
        add             r1,  r1,  r2,  lsl #3
        sub             r1,  r1,  #2*8
        sub             r3,  r3,  r2
        vld1.32         {q1},     [r1,:128]!
1:
        vld1.32         {q3},     [r1,:128]!
        vrev64.32       q2,  q1
        vmov            q8,  q3
        vrev64.32       d20, d3
        vrev64.32       d21, d6
        vmla.f32        q3,  q1,  d0[0]
        vmla.f32        d6,  d4,  d18
        vmla.f32        d7,  d20, d18
        vmla.f32        d6,  d3,  d0[1]
        vmla.f32        d7,  d16, d0[1]
        vmla.f32        d6,  d5,  d1
        vmla.f32        d7,  d21, d1
        vmov            q1,  q8
        vst1.32         {q3},     [r0,:128]!
        subs            r3,  r3,  #2
        bgt             1b
        bx              lr
endfunc

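@ Accumulate the complex autocorrelation of the input sequence at r0 over lags
@ 0, 1 and 2, and store the results at r1.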
function ff_sbr_autocorrelate_neon, export=1
        vld1.32         {q0},     [r0,:128]!
        vmov.f32        q1,  #0.0
        vmov.f32        q3,  #0.0
        vmov.f32        d20, #0.0
        vmul.f32        d21, d1,  d1
        vmov            q8,  q0
        vmov            q11, q0
        mov             r12, #36
1:
        vld1.32         {q2},     [r0,:128]!
        vrev64.32       q12, q2
        vmla.f32        q10, q2,  q2
        vmla.f32        d2,  d1,  d4
        vmla.f32        d3,  d1,  d24
        vmla.f32        d6,  d0,  d4
        vmla.f32        d7,  d0,  d24
        vmla.f32        d2,  d4,  d5
        vmla.f32        d3,  d4,  d25
        vmla.f32        d6,  d1,  d5
        vmla.f32        d7,  d1,  d25
        vmov            q0,  q2
        subs            r12, r12, #2
        bgt             1b
        vld1.32         {q2},     [r0,:128]!
        vrev64.32       q12, q2
        vmla.f32        d2,  d1,  d4
        vmla.f32        d3,  d1,  d24
        vmla.f32        d6,  d0,  d4
        vmla.f32        d7,  d0,  d24
        vadd.f32        d20, d20, d21
        vrev64.32       d18, d17
        vmla.f32        d6,  d1,  d5
        vmla.f32        d7,  d1,  d25
        vmov            q0,  q1
        vmla.f32        d0,  d16, d17
        vmla.f32        d1,  d16, d18
        vmla.f32        d2,  d4,  d5
        vmla.f32        d3,  d4,  d25
        vneg.f32        s15, s15
        vmov            d21, d20
        vpadd.f32       d0,  d0,  d2
        vpadd.f32       d7,  d6,  d7
        vtrn.32         d1,  d3
        vsub.f32        d6,  d1,  d3
        vmla.f32        d20, d22, d22
        vmla.f32        d21, d4,  d4
        vtrn.32         d0,  d6
        vpadd.f32       d20, d20, d21
        vst1.32         {q3},     [r1,:128]!
        vst1.32         {d20[1]}, [r1,:32]
        add             r1,  r1,  #2*4
        vst1.32         {d0},     [r1,:64]
        add             r1,  r1,  #4*4
        vst1.32         {d20[0]}, [r1,:32]
        bx              lr
endfunc

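@ The four hf_apply_noise variants share this body and differ only in the sign
@ pattern held in d3: for each output sample at r0, either the (signed) value from
@ the r1 array is added when it is non-zero, or the r2 value times an entry of
@ ff_sbr_noise_table is added, with the table index advancing modulo 512.
@ Variant 0 adds the r1 value to the real part with its sign unchanged.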
function ff_sbr_hf_apply_noise_0_neon, export=1
        vmov.i32        d3,  #0
.Lhf_apply_noise_0:
        push            {r4,lr}
        movrelx         r4,  X(ff_sbr_noise_table)
        ldr             r12, [sp, #12]
        add             r3,  r3,  #1
        bfc             r3,  #9,  #23
        sub             r12, r12, #1
1:
        add             lr,  r4,  r3,  lsl #3
        vld2.32         {q0},     [r0,:64]
        vld2.32         {q3},     [lr,:64]
        vld1.32         {d2},     [r1,:64]!
        vld1.32         {d18},    [r2,:64]!
        vceq.f32        d16, d2,  #0
        veor            d2,  d2,  d3
        vmov            q2,  q0
        vmla.f32        d0,  d6,  d18
        vmla.f32        d1,  d7,  d18
        vadd.f32        d4,  d4,  d2
        add             r3,  r3,  #2
        bfc             r3,  #9,  #23
        vbif            d0,  d4,  d16
        vbif            d1,  d5,  d16
        vst2.32         {q0},     [r0,:64]!
        subs            r12, r12, #2
        bgt             1b
        blt             2f
        add             lr,  r4,  r3,  lsl #3
        vld1.32         {d0},     [r0,:64]
        vld1.32         {d6},     [lr,:64]
        vld1.32         {d2[]},   [r1,:32]!
        vld1.32         {d18[]},  [r2,:32]!
        vceq.f32        d4,  d2,  #0
        veor            d2,  d2,  d3
        vmov            d1,  d0
        vmla.f32        d0,  d6,  d18
        vadd.f32        s2,  s2,  s4
        vbif            d0,  d1,  d4
        vst1.32         {d0},     [r0,:64]!
2:
        pop             {r4,pc}
endfunc

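@ Variant 1: the r1 value is added to the imaginary part, with a sign that depends
@ on the parity of the first stack argument.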
function ff_sbr_hf_apply_noise_1_neon, export=1
        ldr             r12, [sp]
        push            {r4,lr}
        lsl             r12, r12, #31
        eor             lr,  r12, #1<<31
        vmov            d3,  r12, lr
.Lhf_apply_noise_1:
        movrelx         r4,  X(ff_sbr_noise_table)
        ldr             r12, [sp, #12]
        add             r3,  r3,  #1
        bfc             r3,  #9,  #23
        sub             r12, r12, #1
1:
        add             lr,  r4,  r3,  lsl #3
        vld2.32         {q0},     [r0,:64]
        vld2.32         {q3},     [lr,:64]
        vld1.32         {d2},     [r1,:64]!
        vld1.32         {d18},    [r2,:64]!
        vceq.f32        d16, d2,  #0
        veor            d2,  d2,  d3
        vmov            q2,  q0
        vmla.f32        d0,  d6,  d18
        vmla.f32        d1,  d7,  d18
        vadd.f32        d5,  d5,  d2
        add             r3,  r3,  #2
        bfc             r3,  #9,  #23
        vbif            d0,  d4,  d16
        vbif            d1,  d5,  d16
        vst2.32         {q0},     [r0,:64]!
        subs            r12, r12, #2
        bgt             1b
        blt             2f
        add             lr,  r4,  r3,  lsl #3
        vld1.32         {d0},     [r0,:64]
        vld1.32         {d6},     [lr,:64]
        vld1.32         {d2[]},   [r1,:32]!
        vld1.32         {d18[]},  [r2,:32]!
        vceq.f32        d4,  d2,  #0
        veor            d2,  d2,  d3
        vmov            d1,  d0
        vmla.f32        d0,  d6,  d18
        vadd.f32        s3,  s3,  s5
        vbif            d0,  d1,  d4
        vst1.32         {d0},     [r0,:64]!
2:
        pop             {r4,pc}
endfunc

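@ Variant 2: same as variant 0, but with the sign of the added r1 value flipped.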
function ff_sbr_hf_apply_noise_2_neon, export=1
        vmov.i32        d3,  #1<<31
        b               .Lhf_apply_noise_0
endfunc

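@ Variant 3: same as variant 1, with the two sign words in d3 swapped.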
function ff_sbr_hf_apply_noise_3_neon, export=1
        ldr             r12, [sp]
        push            {r4,lr}
        lsl             r12, r12, #31
        eor             lr,  r12, #1<<31
        vmov            d3,  lr,  r12
        b               .Lhf_apply_noise_1
endfunc