7db24522848b3126939ab2f223eafd741a070f0b
[deb_ffmpeg.git] / float_dsp_vfp.S
1 /*
2 * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
3 *
4 * This file is part of FFmpeg
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include "config.h"
22 #include "asm.S"
23
24 /**
25 * Assume that len is a positive number and is a multiple of 8
26 */
27 @ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len)
@ Element-wise product: dst[i] = src0[i] * src1[i] for i in [0, len).
@ Uses VFP short-vector mode: with FPSCR.LEN set to 4, each vmul.f32 below
@ operates on a bank of 4 consecutive s-registers at once. The loop is
@ software pipelined (loads for the next group overlap multiplies/stores of
@ the current one) and retires up to 16 floats per trip; the first group of
@ 8 is primed before entering the loop. ge/gt predication drains the
@ pipeline on the final iterations.
28 function ff_vector_fmul_vfp, export=1
29 vpush {d8-d15} @ d8-d15 (s16-s31) are callee-saved under AAPCS
30 fmrx r12, fpscr
31 orr r12, r12, #(3 << 16) /* set vector size to 4 */
32 fmxr fpscr, r12
33
@ Prime the pipeline: load the first 8 elements of each input.
34 vldmia r1!, {s0-s3}
35 vldmia r2!, {s8-s11}
36 vldmia r1!, {s4-s7}
37 vldmia r2!, {s12-s15}
38 vmul.f32 s8, s0, s8 @ vector op: s8-s11 = s0-s3 * s8-s11
39 1:
40 subs r3, r3, #16 @ flags: ge = >=16 left to load, gt = loop again
41 vmul.f32 s12, s4, s12 @ s12-s15 = s4-s7 * s12-s15
42 itttt ge
43 vldmiage r1!, {s16-s19} @ preload the next 16 inputs while results drain
44 vldmiage r2!, {s24-s27}
45 vldmiage r1!, {s20-s23}
46 vldmiage r2!, {s28-s31}
47 it ge
48 vmulge.f32 s24, s16, s24
49 vstmia r0!, {s8-s11} @ store the 8 products computed above
50 vstmia r0!, {s12-s15}
51 it ge
52 vmulge.f32 s28, s20, s28
53 itttt gt
54 vldmiagt r1!, {s0-s3} @ refill the first banks only if another full
55 vldmiagt r2!, {s8-s11} @ iteration follows (gt rather than ge)
56 vldmiagt r1!, {s4-s7}
57 vldmiagt r2!, {s12-s15}
58 ittt ge
59 vmulge.f32 s8, s0, s8
60 vstmiage r0!, {s24-s27}
61 vstmiage r0!, {s28-s31}
62 bgt 1b
63
64 bic r12, r12, #(7 << 16) /* set vector size back to 1 */
65 fmxr fpscr, r12
66 vpop {d8-d15}
67 bx lr
68 endfunc
69
70 /**
71 * ARM VFP implementation of 'vector_fmul_window_c' function
72 * Assume that len is a positive non-zero number
73 */
74 @ void ff_vector_fmul_window_vfp(float *dst, const float *src0,
75 @ const float *src1, const float *win, int len)
@ Overlap-add windowing, matching vector_fmul_window_c. For i in [0, len),
@ with j = 2*len-1-i (derived from the pointer setup and multiplies below):
@   dst[i] = src0[i]*win[j] - src1[len-1-i]*win[i]
@   dst[j] = src0[i]*win[i] + src1[len-1-i]*win[j]
@ The len%8 remainder (bits 1, 2 and 4 of len) is handled first in scalar
@ mode; the remaining multiple-of-8 part runs in VFP short-vector mode
@ (4-register banks), software pipelined: prologue after label 4, steady
@ state at label 5, drain at label 6.
76 function ff_vector_fmul_window_vfp, export=1
77 DST0 .req a1
78 SRC0 .req a2
79 SRC1 .req a3
80 WIN0 .req a4
81 LEN .req v1
82 DST1 .req v2
83 WIN1 .req v3
84 OLDFPSCR .req ip
85
86 push {v1-v3,lr}
87 ldr LEN, [sp, #4*4+0] @ 5th arg (len), above the 4 regs just pushed
88 vpush {s16-s31} @ callee-saved VFP registers (AAPCS)
89 fmrx OLDFPSCR, FPSCR @ saved so vector/RunFast mode can be undone
90 add DST1, DST0, LEN, lsl #3 @ DST1 = dst + 2*len floats (written downward)
91 add SRC1, SRC1, LEN, lsl #2 @ SRC1 = src1 + len floats (read downward)
92 add WIN1, WIN0, LEN, lsl #3 @ WIN1 = win + 2*len floats (read downward)
93
94 tst LEN, #7
95 beq 4f @ common case: len is a multiple of 8
96
97 ldr lr, =0x03000000 @ RunFast mode, scalar mode
98 fmxr FPSCR, lr
99
@ Remainder: one odd element, if len is odd.
100 tst LEN, #1
101 beq 1f
102 vldmdb WIN1!, {s0} @ s0 = win[j]
103 vldmia SRC0!, {s8} @ s8 = src0[i]
104 vldmia WIN0!, {s16} @ s16 = win[i]
105 vmul.f s24, s0, s8
106 vldmdb SRC1!, {s20} @ s20 = src1[len-1-i]
107 vmul.f s8, s16, s8
108 vmls.f s24, s16, s20 @ s24 = win[j]*src0 - win[i]*src1
109 vmla.f s8, s0, s20 @ s8 = win[i]*src0 + win[j]*src1
110 vstmia DST0!, {s24}
111 vstmdb DST1!, {s8}
112 1:
@ Remainder: two elements, if bit 1 of len is set.
113 tst LEN, #2
114 beq 2f
115 vldmdb WIN1!, {s0}
116 vldmdb WIN1!, {s1}
117 vldmia SRC0!, {s8-s9}
118 vldmia WIN0!, {s16-s17}
119 vmul.f s24, s0, s8
120 vmul.f s25, s1, s9
121 vldmdb SRC1!, {s20}
122 vldmdb SRC1!, {s21}
123 vmul.f s8, s16, s8
124 vmul.f s9, s17, s9
125 vmls.f s24, s16, s20
126 vmls.f s25, s17, s21
127 vmla.f s8, s0, s20
128 vmla.f s9, s1, s21
129 vstmia DST0!, {s24-s25}
130 vstmdb DST1!, {s8}
131 vstmdb DST1!, {s9}
132 2:
@ Remainder: four elements, if bit 2 of len is set.
133 tst LEN, #4
134 beq 3f
135 vldmdb WIN1!, {s0}
136 vldmdb WIN1!, {s1}
137 vldmdb WIN1!, {s2}
138 vldmdb WIN1!, {s3}
139 vldmia SRC0!, {s8-s11}
140 vldmia WIN0!, {s16-s19}
141 vmul.f s24, s0, s8
142 vmul.f s25, s1, s9
143 vmul.f s26, s2, s10
144 vmul.f s27, s3, s11
145 vldmdb SRC1!, {s20}
146 vldmdb SRC1!, {s21}
147 vldmdb SRC1!, {s22}
148 vldmdb SRC1!, {s23}
149 vmul.f s8, s16, s8
150 vmul.f s9, s17, s9
151 vmul.f s10, s18, s10
152 vmul.f s11, s19, s11
153 vmls.f s24, s16, s20
154 vmls.f s25, s17, s21
155 vmls.f s26, s18, s22
156 vmls.f s27, s19, s23
157 vmla.f s8, s0, s20
158 vmla.f s9, s1, s21
159 vmla.f s10, s2, s22
160 vmla.f s11, s3, s23
161 vstmia DST0!, {s24-s27}
162 vstmdb DST1!, {s8}
163 vstmdb DST1!, {s9}
164 vstmdb DST1!, {s10}
165 vstmdb DST1!, {s11}
166 3:
167 bics LEN, LEN, #7 @ drop remainder bits; done if nothing left
168 beq 7f
@ Main path: len (now a multiple of 8) elements in short-vector mode,
@ 8 elements per iteration, software pipelined.
169 4:
170 ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
171 fmxr FPSCR, lr
172
@ Pipeline prologue: first 8 inputs loaded, first half multiplied.
173 vldmdb WIN1!, {s0}
174 vldmdb WIN1!, {s1}
175 vldmdb WIN1!, {s2}
176 vldmdb WIN1!, {s3}
177 vldmia SRC0!, {s8-s11}
178 vldmia WIN0!, {s16-s19}
179 vmul.f s24, s0, s8 @ vector * vector
180 vldmdb SRC1!, {s20}
181 vldmdb SRC1!, {s21}
182 vldmdb SRC1!, {s22}
183 vldmdb SRC1!, {s23}
184 vmul.f s8, s16, s8 @ vector * vector
185 vmls.f s24, s16, s20 @ vector * vector
186 vldmdb WIN1!, {s4}
187 vldmdb WIN1!, {s5}
188 vldmdb WIN1!, {s6}
189 vldmdb WIN1!, {s7}
190 vldmia SRC0!, {s12-s13}
191 vmla.f s8, s0, s20 @ vector * vector
192 vldmia SRC0!, {s14-s15}
193 subs LEN, LEN, #8
194 beq 6f @ exactly 8 left: skip straight to the drain
@ Steady state: finish the previous group of 8 while loading/starting the next.
195 5: vldmia WIN0!, {s20-s23}
196 vmul.f s28, s4, s12 @ vector * vector
197 vstmia DST0!, {s24-s25}
198 vldmdb SRC1!, {s16}
199 vldmdb SRC1!, {s17}
200 vldmdb SRC1!, {s18}
201 vldmdb SRC1!, {s19}
202 vmul.f s12, s20, s12 @ vector * vector
203 vstmia DST0!, {s26-s27}
204 vstmdb DST1!, {s8}
205 vstmdb DST1!, {s9}
206 vstmdb DST1!, {s10}
207 vstmdb DST1!, {s11}
208 vmls.f s28, s20, s16 @ vector * vector
209 vldmdb WIN1!, {s0}
210 vldmdb WIN1!, {s1}
211 vldmdb WIN1!, {s2}
212 vldmdb WIN1!, {s3}
213 vldmia SRC0!, {s8-s9}
214 vmla.f s12, s4, s16 @ vector * vector
215 vldmia SRC0!, {s10-s11}
216 subs LEN, LEN, #8
217 vldmia WIN0!, {s16-s19}
218 vmul.f s24, s0, s8 @ vector * vector
219 vstmia DST0!, {s28-s29}
220 vldmdb SRC1!, {s20}
221 vldmdb SRC1!, {s21}
222 vldmdb SRC1!, {s22}
223 vldmdb SRC1!, {s23}
224 vmul.f s8, s16, s8 @ vector * vector
225 vstmia DST0!, {s30-s31}
226 vstmdb DST1!, {s12}
227 vstmdb DST1!, {s13}
228 vstmdb DST1!, {s14}
229 vstmdb DST1!, {s15}
230 vmls.f s24, s16, s20 @ vector * vector
231 vldmdb WIN1!, {s4}
232 vldmdb WIN1!, {s5}
233 vldmdb WIN1!, {s6}
234 vldmdb WIN1!, {s7}
235 vldmia SRC0!, {s12-s13}
236 vmla.f s8, s0, s20 @ vector * vector
237 vldmia SRC0!, {s14-s15}
238 bne 5b
@ Pipeline drain: compute and store the final group of 8.
239 6: vldmia WIN0!, {s20-s23}
240 vmul.f s28, s4, s12 @ vector * vector
241 vstmia DST0!, {s24-s25}
242 vldmdb SRC1!, {s16}
243 vldmdb SRC1!, {s17}
244 vldmdb SRC1!, {s18}
245 vldmdb SRC1!, {s19}
246 vmul.f s12, s20, s12 @ vector * vector
247 vstmia DST0!, {s26-s27}
248 vstmdb DST1!, {s8}
249 vstmdb DST1!, {s9}
250 vstmdb DST1!, {s10}
251 vstmdb DST1!, {s11}
252 vmls.f s28, s20, s16 @ vector * vector
253 vmla.f s12, s4, s16 @ vector * vector
254 vstmia DST0!, {s28-s31}
255 vstmdb DST1!, {s12}
256 vstmdb DST1!, {s13}
257 vstmdb DST1!, {s14}
258 vstmdb DST1!, {s15}
259 7:
260 fmxr FPSCR, OLDFPSCR @ restore caller's FP mode
261 vpop {s16-s31}
262 pop {v1-v3,pc}
263
264 .unreq DST0
265 .unreq SRC0
266 .unreq SRC1
267 .unreq WIN0
268 .unreq LEN
269 .unreq OLDFPSCR
270 .unreq DST1
271 .unreq WIN1
272 endfunc
273
274 /**
275 * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
276 * Assume that len is a positive number and is a multiple of 8
277 */
278 @ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
279 @ const float *src1, int len)
@ Reversed product: dst[i] = src0[i] * src1[len-1-i] for i in [0, len).
@ r2 is advanced to the end of src1 and read downward with vldmdb; since a
@ descending load still fills registers in ascending order, the per-element
@ reversal is done by the scalar multiplies pairing registers in opposite
@ order (s3 with s8, s2 with s9, ...). Unlike ff_vector_fmul_vfp this stays
@ in scalar VFP mode (FPSCR is never touched); the loop is software
@ pipelined, retiring 16 floats per steady-state trip, with ge/gt
@ predication (IT blocks) draining the final iterations.
280 function ff_vector_fmul_reverse_vfp, export=1
281 vpush {d8-d15} @ d8-d15 (s16-s31) are callee-saved under AAPCS
282 add r2, r2, r3, lsl #2 @ r2 = src1 + len floats (one past the end)
283 vldmdb r2!, {s0-s3} @ s0-s3 = src1[len-4 .. len-1]
284 vldmia r1!, {s8-s11} @ s8-s11 = src0[0 .. 3]
285 vldmdb r2!, {s4-s7}
286 vldmia r1!, {s12-s15}
287 vmul.f32 s8, s3, s8 @ reversed pairing: src1[len-1]*src0[0], ...
288 vmul.f32 s9, s2, s9
289 vmul.f32 s10, s1, s10
290 vmul.f32 s11, s0, s11
291 1:
292 subs r3, r3, #16 @ flags: ge = >=16 left to load, gt = loop again
293 it ge
294 vldmdbge r2!, {s16-s19} @ preload next inputs while results drain
295 vmul.f32 s12, s7, s12
296 it ge
297 vldmiage r1!, {s24-s27}
298 vmul.f32 s13, s6, s13
299 it ge
300 vldmdbge r2!, {s20-s23}
301 vmul.f32 s14, s5, s14
302 it ge
303 vldmiage r1!, {s28-s31}
304 vmul.f32 s15, s4, s15
305 it ge
306 vmulge.f32 s24, s19, s24
307 it gt
308 vldmdbgt r2!, {s0-s3} @ refill first banks only if looping again
309 it ge
310 vmulge.f32 s25, s18, s25
311 vstmia r0!, {s8-s13} @ store products from the previous group
312 it ge
313 vmulge.f32 s26, s17, s26
314 it gt
315 vldmiagt r1!, {s8-s11}
316 itt ge
317 vmulge.f32 s27, s16, s27
318 vmulge.f32 s28, s23, s28
319 it gt
320 vldmdbgt r2!, {s4-s7}
321 it ge
322 vmulge.f32 s29, s22, s29
323 vstmia r0!, {s14-s15}
324 ittt ge
325 vmulge.f32 s30, s21, s30
326 vmulge.f32 s31, s20, s31
327 vmulge.f32 s8, s3, s8
328 it gt
329 vldmiagt r1!, {s12-s15}
330 itttt ge
331 vmulge.f32 s9, s2, s9
332 vmulge.f32 s10, s1, s10
333 vstmiage r0!, {s24-s27}
334 vmulge.f32 s11, s0, s11
335 it ge
336 vstmiage r0!, {s28-s31}
337 bgt 1b
338
339 vpop {d8-d15}
340 bx lr
341 endfunc
342
343 /**
344 * ARM VFP implementation of 'butterflies_float_c' function
345 * Assume that len is a positive non-zero number
346 */
347 @ void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len)
@ In-place butterfly: for i in [0, len), with t = v1[i]:
@   v1[i] = t + v2[i];  v2[i] = t - v2[i]
@ (restrict: the two arrays must not overlap). The len%8 remainder (bits 1,
@ 2 and 4 of len) is handled first in scalar mode; the remaining
@ multiple-of-8 part runs in VFP short-vector mode (4-register banks),
@ 8 elements per iteration, software pipelined. Results are stored with
@ vstr through the d-register aliases (d8 = s16,s17 etc., as the existing
@ per-line comments note) at negative offsets, because the base pointers
@ have already been post-incremented past the slots being written.
348 function ff_butterflies_float_vfp, export=1
349 BASE1 .req a1
350 BASE2 .req a2
351 LEN .req a3
352 OLDFPSCR .req a4
353
354 vpush {s16-s31} @ callee-saved VFP registers (AAPCS)
355 fmrx OLDFPSCR, FPSCR @ saved so vector/RunFast mode can be undone
356
357 tst LEN, #7
358 beq 4f @ common case: len is a multiple of 8
359
360 ldr ip, =0x03000000 @ RunFast mode, scalar mode
361 fmxr FPSCR, ip
362
@ Remainder: one odd element, if len is odd.
363 tst LEN, #1
364 beq 1f
365 vldmia BASE1!, {s0}
366 vldmia BASE2!, {s8}
367 vadd.f s16, s0, s8
368 vsub.f s24, s0, s8
369 vstr s16, [BASE1, #0-4*1] @ write back behind the advanced pointer
370 vstr s24, [BASE2, #0-4*1]
371 1:
@ Remainder: two elements, if bit 1 of len is set.
372 tst LEN, #2
373 beq 2f
374 vldmia BASE1!, {s0-s1}
375 vldmia BASE2!, {s8-s9}
376 vadd.f s16, s0, s8
377 vadd.f s17, s1, s9
378 vsub.f s24, s0, s8
379 vsub.f s25, s1, s9
380 vstr d8, [BASE1, #0-8*1] @ s16,s17
381 vstr d12, [BASE2, #0-8*1] @ s24,s25
382 2:
@ Remainder: four elements, if bit 2 of len is set.
383 tst LEN, #4
384 beq 3f
385 vldmia BASE1!, {s0-s1}
386 vldmia BASE2!, {s8-s9}
387 vldmia BASE1!, {s2-s3}
388 vldmia BASE2!, {s10-s11}
389 vadd.f s16, s0, s8
390 vadd.f s17, s1, s9
391 vsub.f s24, s0, s8
392 vsub.f s25, s1, s9
393 vadd.f s18, s2, s10
394 vadd.f s19, s3, s11
395 vsub.f s26, s2, s10
396 vsub.f s27, s3, s11
397 vstr d8, [BASE1, #0-16*1] @ s16,s17
398 vstr d12, [BASE2, #0-16*1] @ s24,s25
399 vstr d9, [BASE1, #8-16*1] @ s18,s19
400 vstr d13, [BASE2, #8-16*1] @ s26,s27
401 3:
402 bics LEN, LEN, #7 @ drop remainder bits; done if nothing left
403 beq 7f
@ Main path: len (now a multiple of 8) elements in short-vector mode.
404 4:
405 ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
406 fmxr FPSCR, ip
407
@ Pipeline prologue: load first 16 values, start first 8 butterflies.
408 vldmia BASE1!, {s0-s1}
409 vldmia BASE2!, {s8-s9}
410 vldmia BASE1!, {s2-s3}
411 vldmia BASE2!, {s10-s11}
412 vadd.f s16, s0, s8 @ vector op: s16-s19 = s0-s3 + s8-s11
413 vldmia BASE1!, {s4-s5}
414 vldmia BASE2!, {s12-s13}
415 vldmia BASE1!, {s6-s7}
416 vldmia BASE2!, {s14-s15}
417 vsub.f s24, s0, s8
418 vadd.f s20, s4, s12
419 subs LEN, LEN, #8
420 beq 6f @ exactly 8 left: skip straight to the drain
@ Steady state: store the previous group while loading/computing the next.
421 5: vldmia BASE1!, {s0-s3}
422 vldmia BASE2!, {s8-s11}
423 vsub.f s28, s4, s12
424 vstr d8, [BASE1, #0-16*3] @ s16,s17
425 vstr d9, [BASE1, #8-16*3] @ s18,s19
426 vstr d12, [BASE2, #0-16*3] @ s24,s25
427 vstr d13, [BASE2, #8-16*3] @ s26,s27
428 vadd.f s16, s0, s8
429 vldmia BASE1!, {s4-s7}
430 vldmia BASE2!, {s12-s15}
431 vsub.f s24, s0, s8
432 vstr d10, [BASE1, #0-16*3] @ s20,s21
433 vstr d11, [BASE1, #8-16*3] @ s22,s23
434 vstr d14, [BASE2, #0-16*3] @ s28,s29
435 vstr d15, [BASE2, #8-16*3] @ s30,s31
436 vadd.f s20, s4, s12
437 subs LEN, LEN, #8
438 bne 5b
@ Pipeline drain: finish and store the last group of 8.
439 6: vsub.f s28, s4, s12
440 vstr d8, [BASE1, #0-16*2] @ s16,s17
441 vstr d9, [BASE1, #8-16*2] @ s18,s19
442 vstr d12, [BASE2, #0-16*2] @ s24,s25
443 vstr d13, [BASE2, #8-16*2] @ s26,s27
444 vstr d10, [BASE1, #0-16*1] @ s20,s21
445 vstr d11, [BASE1, #8-16*1] @ s22,s23
446 vstr d14, [BASE2, #0-16*1] @ s28,s29
447 vstr d15, [BASE2, #8-16*1] @ s30,s31
448 7:
449 fmxr FPSCR, OLDFPSCR @ restore caller's FP mode
450 vpop {s16-s31}
451 bx lr
452
453 .unreq BASE1
454 .unreq BASE2
455 .unreq LEN
456 .unreq OLDFPSCR
457 endfunc