/*
 * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
 *
 * This file is part of FFmpeg
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
/**
 * ARM VFP optimized implementation of 'vector_fmul_c' function
 * Assume that len is a positive number and is a multiple of 8
 */
@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len)
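@ For reference, the scalar operation being optimized is an element-wise
@ multiply; a minimal sketch of the usual vector_fmul_c semantics (my
@ paraphrase, not copied from the C source) is:
@
@     for (i = 0; i < len; i++)
@         dst[i] = src0[i] * src1[i];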
function ff_vector_fmul_vfp, export=1
        orr             r12, r12, #(3 << 16)    /* set vector size to 4 */
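        /* Note (added for clarity, based on the documented FPSCR layout rather
         * than this file): FPSCR bits [18:16] hold the short-vector length
         * minus one, so writing 3 selects length-4 vectors for the VFP
         * arithmetic below. */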
        vldmiage        r1!, {s16-s19}
        vldmiage        r2!, {s24-s27}
        vldmiage        r1!, {s20-s23}
        vldmiage        r2!, {s28-s31}
        vmulge.f32      s24, s16, s24
        vmulge.f32      s28, s20, s28
        vldmiagt        r2!, {s8-s11}
        vldmiagt        r2!, {s12-s15}
        vstmiage        r0!, {s24-s27}
        vstmiage        r0!, {s28-s31}
        bic             r12, r12, #(7 << 16)    /* set vector size back to 1 */
/**
 * ARM VFP implementation of 'vector_fmul_window_c' function
 * Assume that len is a positive non-zero number
 */
@ void ff_vector_fmul_window_vfp(float *dst, const float *src0,
@                                const float *src1, const float *win, int len)
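@ The scalar routine this mirrors walks the two sources from opposite ends
@ and applies the window both ways; a rough sketch of the usual
@ vector_fmul_window_c behaviour (my reading, not taken verbatim from the
@ C implementation):
@
@     dst += len; win += len; src0 += len;
@     for (i = -len, j = len - 1; i < 0; i++, j--) {
@         float s0 = src0[i], s1 = src1[j], wi = win[i], wj = win[j];
@         dst[i] = s0 * wj - s1 * wi;
@         dst[j] = s0 * wi + s1 * wj;
@     }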
function ff_vector_fmul_window_vfp, export=1
        add             DST1, DST0, LEN, lsl #3
        add             SRC1, SRC1, LEN, lsl #2
        add             WIN1, WIN0, LEN, lsl #3
        beq             4f                      @ common case: len is a multiple of 8
        ldr             lr, =0x03000000         @ RunFast mode, scalar mode
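        /* Assumed decoding of the constant, not part of the original comments:
         * 0x03000000 sets FZ (bit 24) and DN (bit 25), the flags VFP11 needs
         * for RunFast operation, while leaving the LEN/STRIDE fields zero
         * (plain scalar mode). */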
        vldmia          SRC0!, {s8-s9}
        vldmia          WIN0!, {s16-s17}
        vstmia          DST0!, {s24-s25}
        vldmia          SRC0!, {s8-s11}
        vldmia          WIN0!, {s16-s19}
        vstmia          DST0!, {s24-s27}
        ldr             lr, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
        vldmia          SRC0!, {s8-s11}
        vldmia          WIN0!, {s16-s19}
        vmul.f          s24, s0, s8             @ vector * vector
        vmul.f          s8, s16, s8             @ vector * vector
        vmls.f          s24, s16, s20           @ vector * vector
        vldmia          SRC0!, {s12-s13}
        vmla.f          s8, s0, s20             @ vector * vector
        vldmia          SRC0!, {s14-s15}
5:      vldmia          WIN0!, {s20-s23}
        vmul.f          s28, s4, s12            @ vector * vector
        vstmia          DST0!, {s24-s25}
        vmul.f          s12, s20, s12           @ vector * vector
        vstmia          DST0!, {s26-s27}
        vmls.f          s28, s20, s16           @ vector * vector
        vldmia          SRC0!, {s8-s9}
        vmla.f          s12, s4, s16            @ vector * vector
        vldmia          SRC0!, {s10-s11}
        vldmia          WIN0!, {s16-s19}
        vmul.f          s24, s0, s8             @ vector * vector
        vstmia          DST0!, {s28-s29}
        vmul.f          s8, s16, s8             @ vector * vector
        vstmia          DST0!, {s30-s31}
        vmls.f          s24, s16, s20           @ vector * vector
        vldmia          SRC0!, {s12-s13}
        vmla.f          s8, s0, s20             @ vector * vector
        vldmia          SRC0!, {s14-s15}
6:      vldmia          WIN0!, {s20-s23}
        vmul.f          s28, s4, s12            @ vector * vector
        vstmia          DST0!, {s24-s25}
        vmul.f          s12, s20, s12           @ vector * vector
        vstmia          DST0!, {s26-s27}
        vmls.f          s28, s20, s16           @ vector * vector
        vmla.f          s12, s4, s16            @ vector * vector
        vstmia          DST0!, {s28-s31}
/**
 * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
 * Assume that len is a positive number and is a multiple of 8
 */
@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
@                                 const float *src1, int len)
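@ As a reminder of the scalar behaviour: the second operand is read in
@ reverse order; a minimal sketch of the usual vector_fmul_reverse_c loop
@ (my paraphrase, not copied from the C source):
@
@     for (i = 0; i < len; i++)
@         dst[i] = src0[i] * src1[len - 1 - i];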
function ff_vector_fmul_reverse_vfp, export=1
        add             r2, r2, r3, lsl #2
        vldmia          r1!, {s12-s15}
        vmul.f32        s10, s1, s10
        vmul.f32        s11, s0, s11
        vldmdbge        r2!, {s16-s19}
        vmul.f32        s12, s7, s12
        vldmiage        r1!, {s24-s27}
        vmul.f32        s13, s6, s13
        vldmdbge        r2!, {s20-s23}
        vmul.f32        s14, s5, s14
        vldmiage        r1!, {s28-s31}
        vmul.f32        s15, s4, s15
        vmulge.f32      s24, s19, s24
        vldmdbgt        r2!, {s0-s3}
        vmulge.f32      s25, s18, s25
        vmulge.f32      s26, s17, s26
        vldmiagt        r1!, {s8-s11}
        vmulge.f32      s27, s16, s27
        vmulge.f32      s28, s23, s28
        vldmdbgt        r2!, {s4-s7}
        vmulge.f32      s29, s22, s29
        vstmia          r0!, {s14-s15}
        vmulge.f32      s30, s21, s30
        vmulge.f32      s31, s20, s31
        vmulge.f32      s8, s3, s8
        vldmiagt        r1!, {s12-s15}
        vmulge.f32      s9, s2, s9
        vmulge.f32      s10, s1, s10
        vstmiage        r0!, {s24-s27}
        vmulge.f32      s11, s0, s11
        vstmiage        r0!, {s28-s31}
/**
 * ARM VFP implementation of 'butterflies_float_c' function
 * Assume that len is a positive non-zero number
 */
@ void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len)
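@ The scalar routine computes an in-place add/subtract butterfly on the two
@ buffers; a minimal sketch of the usual butterflies_float_c semantics (my
@ paraphrase, not copied from the C source):
@
@     for (i = 0; i < len; i++) {
@         float t = v1[i] - v2[i];
@         v1[i]  += v2[i];
@         v2[i]   = t;
@     }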
function ff_butterflies_float_vfp, export=1
        beq             4f                      @ common case: len is a multiple of 8
        ldr             ip, =0x03000000         @ RunFast mode, scalar mode
        vstr            s16, [BASE1, #0-4*1]
        vstr            s24, [BASE2, #0-4*1]
        vldmia          BASE1!, {s0-s1}
        vldmia          BASE2!, {s8-s9}
        vstr            d8,  [BASE1, #0-8*1]    @ s16,s17
        vstr            d12, [BASE2, #0-8*1]    @ s24,s25
        vldmia          BASE1!, {s0-s1}
        vldmia          BASE2!, {s8-s9}
        vldmia          BASE1!, {s2-s3}
        vldmia          BASE2!, {s10-s11}
        vstr            d8,  [BASE1, #0-16*1]   @ s16,s17
        vstr            d12, [BASE2, #0-16*1]   @ s24,s25
        vstr            d9,  [BASE1, #8-16*1]   @ s18,s19
        vstr            d13, [BASE2, #8-16*1]   @ s26,s27
        ldr             ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
        vldmia          BASE1!, {s0-s1}
        vldmia          BASE2!, {s8-s9}
        vldmia          BASE1!, {s2-s3}
        vldmia          BASE2!, {s10-s11}
        vldmia          BASE1!, {s4-s5}
        vldmia          BASE2!, {s12-s13}
        vldmia          BASE1!, {s6-s7}
        vldmia          BASE2!, {s14-s15}
5:      vldmia          BASE1!, {s0-s3}
        vldmia          BASE2!, {s8-s11}
        vstr            d8,  [BASE1, #0-16*3]   @ s16,s17
        vstr            d9,  [BASE1, #8-16*3]   @ s18,s19
        vstr            d12, [BASE2, #0-16*3]   @ s24,s25
        vstr            d13, [BASE2, #8-16*3]   @ s26,s27
        vldmia          BASE1!, {s4-s7}
        vldmia          BASE2!, {s12-s15}
        vstr            d10, [BASE1, #0-16*3]   @ s20,s21
        vstr            d11, [BASE1, #8-16*3]   @ s22,s23
        vstr            d14, [BASE2, #0-16*3]   @ s28,s29
        vstr            d15, [BASE2, #8-16*3]   @ s30,s31
6:      vsub.f          s28, s4, s12
        vstr            d8,  [BASE1, #0-16*2]   @ s16,s17
        vstr            d9,  [BASE1, #8-16*2]   @ s18,s19
        vstr            d12, [BASE2, #0-16*2]   @ s24,s25
        vstr            d13, [BASE2, #8-16*2]   @ s26,s27
        vstr            d10, [BASE1, #0-16*1]   @ s20,s21
        vstr            d11, [BASE1, #8-16*1]   @ s22,s23
        vstr            d14, [BASE2, #0-16*1]   @ s28,s29
        vstr            d15, [BASE2, #8-16*1]   @ s30,s31