Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net> | |
3 | * | |
4 | * This file is part of FFmpeg | |
5 | * | |
6 | * FFmpeg is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU Lesser General Public | |
8 | * License as published by the Free Software Foundation; either | |
9 | * version 2.1 of the License, or (at your option) any later version. | |
10 | * | |
11 | * FFmpeg is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | * Lesser General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU Lesser General Public | |
17 | * License along with FFmpeg; if not, write to the Free Software | |
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
19 | */ | |
20 | ||
21 | #include "config.h" | |
22 | #include "asm.S" | |
23 | ||
24 | /** | |
25 | * Assume that len is a positive number and is a multiple of 8 | |
26 | */ | |
27 | @ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len) | |
/*
 * Element-wise product of two float arrays: dst[i] = src0[i] * src1[i].
 *
 * Register use, as read from the code below:
 *   r0  = dst  (store pointer, post-incremented)
 *   r1  = src0 (load pointer, post-incremented)
 *   r2  = src1 (load pointer, post-incremented)
 *   r3  = len, used as the loop counter
 *   r12 = working copy of FPSCR
 *   d8-d15 (s16-s31) are pushed on entry and popped before returning.
 *
 * The FPSCR vector-length field is set for short vectors of 4, and each
 * vmul.f32 below is relied on to multiply four consecutive registers:
 * every multiply is fed by four-register loads and drained by a
 * four-register store.
 *
 * The loop is software-pipelined over two register sets (s0-s15 and
 * s16-s31) of eight elements each: while one set's products are being
 * stored, the other set is loaded and multiplied. r3 drops by 16 per
 * pass, and the flags from that single subs select which of the
 * conditional instructions execute.
 */
28 | function ff_vector_fmul_vfp, export=1 | |
29 | vpush {d8-d15} | |
/* orr can only set bits: whatever is already set in FPSCR is kept as is */
30 | fmrx r12, fpscr | |
31 | orr r12, r12, #(3 << 16) /* set vector size to 4 */ | |
32 | fmxr fpscr, r12 | |
33 | ||
/* prologue: load the first eight element pairs and start their first product */
34 | vldmia r1!, {s0-s3} | |
35 | vldmia r2!, {s8-s11} | |
36 | vldmia r1!, {s4-s7} | |
37 | vldmia r2!, {s12-s15} | |
38 | vmul.f32 s8, s0, s8 | |
39 | 1: | |
/* r3 -= 16; nothing else in the loop sets flags, so these drive every condition up to the bgt */
40 | subs r3, r3, #16 | |
41 | vmul.f32 s12, s4, s12 | |
/* r3 >= 0: a further group of eight exists; load it into s16-s31 */
42 | itttt ge | |
43 | vldmiage r1!, {s16-s19} | |
44 | vldmiage r2!, {s24-s27} | |
45 | vldmiage r1!, {s20-s23} | |
46 | vldmiage r2!, {s28-s31} | |
47 | it ge | |
48 | vmulge.f32 s24, s16, s24 | |
/* store the eight products held in s8-s15 */
49 | vstmia r0!, {s8-s11} | |
50 | vstmia r0!, {s12-s15} | |
51 | it ge | |
52 | vmulge.f32 s28, s20, s28 | |
/* r3 > 0: more input follows; refill s0-s15 with the next eight element pairs */
53 | itttt gt | |
54 | vldmiagt r1!, {s0-s3} | |
55 | vldmiagt r2!, {s8-s11} | |
56 | vldmiagt r1!, {s4-s7} | |
57 | vldmiagt r2!, {s12-s15} | |
/*
 * Start the next pass's first product and store the s24-s31 products.
 * When r3 == 0 nothing was reloaded above, so the s8 multiply then runs
 * on stale data; the loop exits without storing s8 again.
 */
58 | ittt ge | |
59 | vmulge.f32 s8, s0, s8 | |
60 | vstmiage r0!, {s24-s27} | |
61 | vstmiage r0!, {s28-s31} | |
62 | bgt 1b | |
63 | ||
/* clear the three vector-length bits again before returning */
64 | bic r12, r12, #(7 << 16) /* set vector size back to 1 */ | |
65 | fmxr fpscr, r12 | |
66 | vpop {d8-d15} | |
67 | bx lr | |
68 | endfunc | |
69 | ||
70 | /** | |
71 | * ARM VFP implementation of 'vector_fmul_window_c' function | |
72 | * Assume that len is a positive non-zero number | |
73 | */ | |
74 | @ void ff_vector_fmul_window_vfp(float *dst, const float *src0, | |
75 | @ const float *src1, const float *win, int len) | |
/*
 * For k in [0, len) this routine computes
 *   dst[k]         = src0[k] * win[2*len-1-k] - src1[len-1-k] * win[k]
 *   dst[2*len-1-k] = src0[k] * win[k]         + src1[len-1-k] * win[2*len-1-k]
 *
 * Arguments: a1 = dst, a2 = src0, a3 = src1, a4 = win; len is read from
 * the stack.
 *
 * Pointer roles (see the .req aliases below):
 *   DST0, SRC0, WIN0 walk upwards from the start of dst, src0 and win.
 *   DST1 and WIN1 walk downwards from dst + 2*len and win + 2*len.
 *   SRC1 walks downwards from src1 + len.
 *   The downward pointers are moved one float per instruction so that
 *   ascending register numbers receive descending addresses.
 *
 * Layout: (len & 1), (len & 2) and (len & 4) elements are handled first
 * with FPSCR in scalar mode; the remaining multiple of 8 is handled by
 * the software-pipelined short-vector code at labels 4, 5 and 6.
 * v1-v3, lr and s16-s31 are saved on entry; they and the FPSCR value
 * read on entry are restored at label 7.
 */
76 | function ff_vector_fmul_window_vfp, export=1 | |
77 | DST0 .req a1 | |
78 | SRC0 .req a2 | |
79 | SRC1 .req a3 | |
80 | WIN0 .req a4 | |
81 | LEN .req v1 | |
82 | DST1 .req v2 | |
83 | WIN1 .req v3 | |
84 | OLDFPSCR .req ip | |
85 | ||
/* len is loaded from the first stack slot above the four registers just pushed */
86 | push {v1-v3,lr} | |
87 | ldr LEN, [sp, #4*4+0] | |
88 | vpush {s16-s31} | |
89 | fmrx OLDFPSCR, FPSCR | |
/* lsl #3 / lsl #2 scale len to bytes: DST1 = dst + 2*len, SRC1 = src1 + len, WIN1 = win + 2*len (in floats) */
90 | add DST1, DST0, LEN, lsl #3 | |
91 | add SRC1, SRC1, LEN, lsl #2 | |
92 | add WIN1, WIN0, LEN, lsl #3 | |
93 | ||
94 | tst LEN, #7 | |
95 | beq 4f @ common case: len is a multiple of 8 | |
96 | ||
/* lr was saved above, so it is free to carry the new FPSCR value */
97 | ldr lr, =0x03000000 @ RunFast mode, scalar mode | |
98 | fmxr FPSCR, lr | |
99 | ||
/* one leftover element (len & 1) */
100 | tst LEN, #1 | |
101 | beq 1f | |
102 | vldmdb WIN1!, {s0} | |
103 | vldmia SRC0!, {s8} | |
104 | vldmia WIN0!, {s16} | |
/* s24 = src0 * win_hi - src1 * win_lo ; s8 = src0 * win_lo + src1 * win_hi */
105 | vmul.f s24, s0, s8 | |
106 | vldmdb SRC1!, {s20} | |
107 | vmul.f s8, s16, s8 | |
108 | vmls.f s24, s16, s20 | |
109 | vmla.f s8, s0, s20 | |
110 | vstmia DST0!, {s24} | |
111 | vstmdb DST1!, {s8} | |
112 | 1: | |
/* two leftover elements (len & 2), same arithmetic on register pairs */
113 | tst LEN, #2 | |
114 | beq 2f | |
115 | vldmdb WIN1!, {s0} | |
116 | vldmdb WIN1!, {s1} | |
117 | vldmia SRC0!, {s8-s9} | |
118 | vldmia WIN0!, {s16-s17} | |
119 | vmul.f s24, s0, s8 | |
120 | vmul.f s25, s1, s9 | |
121 | vldmdb SRC1!, {s20} | |
122 | vldmdb SRC1!, {s21} | |
123 | vmul.f s8, s16, s8 | |
124 | vmul.f s9, s17, s9 | |
125 | vmls.f s24, s16, s20 | |
126 | vmls.f s25, s17, s21 | |
127 | vmla.f s8, s0, s20 | |
128 | vmla.f s9, s1, s21 | |
129 | vstmia DST0!, {s24-s25} | |
130 | vstmdb DST1!, {s8} | |
131 | vstmdb DST1!, {s9} | |
132 | 2: | |
/* four leftover elements (len & 4), same arithmetic written out per register */
133 | tst LEN, #4 | |
134 | beq 3f | |
135 | vldmdb WIN1!, {s0} | |
136 | vldmdb WIN1!, {s1} | |
137 | vldmdb WIN1!, {s2} | |
138 | vldmdb WIN1!, {s3} | |
139 | vldmia SRC0!, {s8-s11} | |
140 | vldmia WIN0!, {s16-s19} | |
141 | vmul.f s24, s0, s8 | |
142 | vmul.f s25, s1, s9 | |
143 | vmul.f s26, s2, s10 | |
144 | vmul.f s27, s3, s11 | |
145 | vldmdb SRC1!, {s20} | |
146 | vldmdb SRC1!, {s21} | |
147 | vldmdb SRC1!, {s22} | |
148 | vldmdb SRC1!, {s23} | |
149 | vmul.f s8, s16, s8 | |
150 | vmul.f s9, s17, s9 | |
151 | vmul.f s10, s18, s10 | |
152 | vmul.f s11, s19, s11 | |
153 | vmls.f s24, s16, s20 | |
154 | vmls.f s25, s17, s21 | |
155 | vmls.f s26, s18, s22 | |
156 | vmls.f s27, s19, s23 | |
157 | vmla.f s8, s0, s20 | |
158 | vmla.f s9, s1, s21 | |
159 | vmla.f s10, s2, s22 | |
160 | vmla.f s11, s3, s23 | |
161 | vstmia DST0!, {s24-s27} | |
162 | vstmdb DST1!, {s8} | |
163 | vstmdb DST1!, {s9} | |
164 | vstmdb DST1!, {s10} | |
165 | vstmdb DST1!, {s11} | |
166 | 3: | |
/* drop the low three bits of len; if nothing is left, go straight to the exit */
167 | bics LEN, LEN, #7 | |
168 | beq 7f | |
169 | 4: | |
170 | ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 | |
171 | fmxr FPSCR, lr | |
172 | ||
/*
 * Pipeline prologue: load the first group of four (WIN1 -> s0-s3,
 * SRC0 -> s8-s11, WIN0 -> s16-s19, SRC1 -> s20-s23), compute it into
 * s24-s27 and s8-s11, and preload WIN1/SRC0 data for the second group
 * into s4-s7 and s12-s15.
 */
173 | vldmdb WIN1!, {s0} | |
174 | vldmdb WIN1!, {s1} | |
175 | vldmdb WIN1!, {s2} | |
176 | vldmdb WIN1!, {s3} | |
177 | vldmia SRC0!, {s8-s11} | |
178 | vldmia WIN0!, {s16-s19} | |
179 | vmul.f s24, s0, s8 @ vector * vector | |
180 | vldmdb SRC1!, {s20} | |
181 | vldmdb SRC1!, {s21} | |
182 | vldmdb SRC1!, {s22} | |
183 | vldmdb SRC1!, {s23} | |
184 | vmul.f s8, s16, s8 @ vector * vector | |
185 | vmls.f s24, s16, s20 @ vector * vector | |
186 | vldmdb WIN1!, {s4} | |
187 | vldmdb WIN1!, {s5} | |
188 | vldmdb WIN1!, {s6} | |
189 | vldmdb WIN1!, {s7} | |
190 | vldmia SRC0!, {s12-s13} | |
191 | vmla.f s8, s0, s20 @ vector * vector | |
192 | vldmia SRC0!, {s14-s15} | |
/* len -= 8; if those were the only eight elements, skip the loop and drain at 6 */
193 | subs LEN, LEN, #8 | |
194 | beq 6f | |
/*
 * Steady state. Each pass stores eight results through DST0 and eight
 * through DST1: the first half computes the group held in s4-s7/s12-s15
 * while storing the previous group, the second half does the same with
 * the two register sets swapped. The subs in the middle sets the flags
 * tested by the bne at the bottom.
 */
195 | 5: vldmia WIN0!, {s20-s23} | |
196 | vmul.f s28, s4, s12 @ vector * vector | |
197 | vstmia DST0!, {s24-s25} | |
198 | vldmdb SRC1!, {s16} | |
199 | vldmdb SRC1!, {s17} | |
200 | vldmdb SRC1!, {s18} | |
201 | vldmdb SRC1!, {s19} | |
202 | vmul.f s12, s20, s12 @ vector * vector | |
203 | vstmia DST0!, {s26-s27} | |
204 | vstmdb DST1!, {s8} | |
205 | vstmdb DST1!, {s9} | |
206 | vstmdb DST1!, {s10} | |
207 | vstmdb DST1!, {s11} | |
208 | vmls.f s28, s20, s16 @ vector * vector | |
209 | vldmdb WIN1!, {s0} | |
210 | vldmdb WIN1!, {s1} | |
211 | vldmdb WIN1!, {s2} | |
212 | vldmdb WIN1!, {s3} | |
213 | vldmia SRC0!, {s8-s9} | |
214 | vmla.f s12, s4, s16 @ vector * vector | |
215 | vldmia SRC0!, {s10-s11} | |
216 | subs LEN, LEN, #8 | |
217 | vldmia WIN0!, {s16-s19} | |
218 | vmul.f s24, s0, s8 @ vector * vector | |
219 | vstmia DST0!, {s28-s29} | |
220 | vldmdb SRC1!, {s20} | |
221 | vldmdb SRC1!, {s21} | |
222 | vldmdb SRC1!, {s22} | |
223 | vldmdb SRC1!, {s23} | |
224 | vmul.f s8, s16, s8 @ vector * vector | |
225 | vstmia DST0!, {s30-s31} | |
226 | vstmdb DST1!, {s12} | |
227 | vstmdb DST1!, {s13} | |
228 | vstmdb DST1!, {s14} | |
229 | vstmdb DST1!, {s15} | |
230 | vmls.f s24, s16, s20 @ vector * vector | |
231 | vldmdb WIN1!, {s4} | |
232 | vldmdb WIN1!, {s5} | |
233 | vldmdb WIN1!, {s6} | |
234 | vldmdb WIN1!, {s7} | |
235 | vldmia SRC0!, {s12-s13} | |
236 | vmla.f s8, s0, s20 @ vector * vector | |
237 | vldmia SRC0!, {s14-s15} | |
238 | bne 5b | |
/* drain: store the group already computed, then compute and store the final group of four */
239 | 6: vldmia WIN0!, {s20-s23} | |
240 | vmul.f s28, s4, s12 @ vector * vector | |
241 | vstmia DST0!, {s24-s25} | |
242 | vldmdb SRC1!, {s16} | |
243 | vldmdb SRC1!, {s17} | |
244 | vldmdb SRC1!, {s18} | |
245 | vldmdb SRC1!, {s19} | |
246 | vmul.f s12, s20, s12 @ vector * vector | |
247 | vstmia DST0!, {s26-s27} | |
248 | vstmdb DST1!, {s8} | |
249 | vstmdb DST1!, {s9} | |
250 | vstmdb DST1!, {s10} | |
251 | vstmdb DST1!, {s11} | |
252 | vmls.f s28, s20, s16 @ vector * vector | |
253 | vmla.f s12, s4, s16 @ vector * vector | |
254 | vstmia DST0!, {s28-s31} | |
255 | vstmdb DST1!, {s12} | |
256 | vstmdb DST1!, {s13} | |
257 | vstmdb DST1!, {s14} | |
258 | vstmdb DST1!, {s15} | |
259 | 7: | |
/* common exit: put back the FPSCR value read on entry, then the saved registers */
260 | fmxr FPSCR, OLDFPSCR | |
261 | vpop {s16-s31} | |
262 | pop {v1-v3,pc} | |
263 | ||
264 | .unreq DST0 | |
265 | .unreq SRC0 | |
266 | .unreq SRC1 | |
267 | .unreq WIN0 | |
268 | .unreq LEN | |
269 | .unreq OLDFPSCR | |
270 | .unreq DST1 | |
271 | .unreq WIN1 | |
272 | endfunc | |
273 | ||
274 | /** | |
275 | * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function. | |
276 | * Assume that len is a positive number and is a multiple of 8 | |
277 | */ | |
278 | @ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, | |
279 | @ const float *src1, int len) | |
/*
 * dst[i] = src0[i] * src1[len-1-i] for i in [0, len).
 *
 * Register use, as read from the code below:
 *   r0 = dst  (store pointer, post-incremented)
 *   r1 = src0 (read upwards with vldmia)
 *   r2 = src1 (read downwards with vldmdb, starting at src1 + len)
 *   r3 = len, used as the loop counter
 *   d8-d15 (s16-s31) are pushed on entry and popped before returning.
 *
 * FPSCR is not modified in this function; every product is written as
 * its own vmul.f32. A vldmdb fills its register list in ascending
 * address order, so the highest-numbered register of each group of four
 * holds the highest address; pairing src0 registers in ascending order
 * with src1 registers in descending order is what reverses src1.
 *
 * The loop is software-pipelined over two register sets (s0-s15 and
 * s16-s31) of eight element pairs each. r3 drops by 16 per pass and the
 * flags from that subs select the conditional instructions: "ge" work
 * belongs to the s16-s31 set, "gt" loads refill s0-s15 for another pass.
 */
280 | function ff_vector_fmul_reverse_vfp, export=1 | |
281 | vpush {d8-d15} | |
/* r2 = src1 + len floats: src1 is consumed from its end */
282 | add r2, r2, r3, lsl #2 | |
/* prologue: s0-s7 <- last eight src1 values, s8-s15 <- first eight src0 values */
283 | vldmdb r2!, {s0-s3} | |
284 | vldmia r1!, {s8-s11} | |
285 | vldmdb r2!, {s4-s7} | |
286 | vldmia r1!, {s12-s15} | |
287 | vmul.f32 s8, s3, s8 | |
288 | vmul.f32 s9, s2, s9 | |
289 | vmul.f32 s10, s1, s10 | |
290 | vmul.f32 s11, s0, s11 | |
291 | 1: | |
/* r3 -= 16; nothing else in the loop sets flags, so these drive every condition up to the bgt */
292 | subs r3, r3, #16 | |
/* r3 >= 0: a further group of eight exists; its loads into s16-s31 are interleaved with the s12-s15 products */
293 | it ge | |
294 | vldmdbge r2!, {s16-s19} | |
295 | vmul.f32 s12, s7, s12 | |
296 | it ge | |
297 | vldmiage r1!, {s24-s27} | |
298 | vmul.f32 s13, s6, s13 | |
299 | it ge | |
300 | vldmdbge r2!, {s20-s23} | |
301 | vmul.f32 s14, s5, s14 | |
302 | it ge | |
303 | vldmiage r1!, {s28-s31} | |
304 | vmul.f32 s15, s4, s15 | |
305 | it ge | |
306 | vmulge.f32 s24, s19, s24 | |
/* r3 > 0: more input follows; each s0-s15 register is refilled only after its old value was multiplied or stored */
307 | it gt | |
308 | vldmdbgt r2!, {s0-s3} | |
309 | it ge | |
310 | vmulge.f32 s25, s18, s25 | |
311 | vstmia r0!, {s8-s13} | |
312 | it ge | |
313 | vmulge.f32 s26, s17, s26 | |
314 | it gt | |
315 | vldmiagt r1!, {s8-s11} | |
316 | itt ge | |
317 | vmulge.f32 s27, s16, s27 | |
318 | vmulge.f32 s28, s23, s28 | |
319 | it gt | |
320 | vldmdbgt r2!, {s4-s7} | |
321 | it ge | |
322 | vmulge.f32 s29, s22, s29 | |
323 | vstmia r0!, {s14-s15} | |
/*
 * The s8-s11 products below belong to the next pass. When r3 == 0 no
 * reload happened, so they run on stale data; the loop exits without
 * storing s8-s11 again.
 */
324 | ittt ge | |
325 | vmulge.f32 s30, s21, s30 | |
326 | vmulge.f32 s31, s20, s31 | |
327 | vmulge.f32 s8, s3, s8 | |
328 | it gt | |
329 | vldmiagt r1!, {s12-s15} | |
330 | itttt ge | |
331 | vmulge.f32 s9, s2, s9 | |
332 | vmulge.f32 s10, s1, s10 | |
333 | vstmiage r0!, {s24-s27} | |
334 | vmulge.f32 s11, s0, s11 | |
335 | it ge | |
336 | vstmiage r0!, {s28-s31} | |
337 | bgt 1b | |
338 | ||
339 | vpop {d8-d15} | |
340 | bx lr | |
341 | endfunc | |
342 | ||
343 | /** | |
344 | * ARM VFP implementation of 'butterflies_float_c' function | |
345 | * Assume that len is a positive non-zero number | |
346 | */ | |
347 | @ void ff_butterflies_float_vfp(float *restrict v1, float *restrict v2, int len) | |
/*
 * In-place butterfly on two float arrays. For i in [0, len):
 *   sum  = v1[i] + v2[i]
 *   diff = v1[i] - v2[i]
 *   v1[i] = sum
 *   v2[i] = diff
 *
 * Register use (see the .req aliases below):
 *   BASE1 (a1) = v1, BASE2 (a2) = v2, LEN (a3) = len,
 *   OLDFPSCR (a4) keeps the FPSCR value read on entry,
 *   ip carries the new FPSCR values.
 *   s16-s31 are pushed on entry and popped before returning.
 *
 * Both pointers only move forwards, through post-incrementing loads.
 * Results are written back with vstr at negative offsets from the
 * already-advanced pointers, i.e. onto the elements just read.
 *
 * Layout: (len & 1), (len & 2) and (len & 4) elements are handled first
 * with FPSCR in scalar mode; the remaining multiple of 8 is handled by
 * the software-pipelined short-vector code at labels 4, 5 and 6.
 */
348 | function ff_butterflies_float_vfp, export=1 | |
349 | BASE1 .req a1 | |
350 | BASE2 .req a2 | |
351 | LEN .req a3 | |
352 | OLDFPSCR .req a4 | |
353 | ||
354 | vpush {s16-s31} | |
355 | fmrx OLDFPSCR, FPSCR | |
356 | ||
357 | tst LEN, #7 | |
358 | beq 4f @ common case: len is a multiple of 8 | |
359 | ||
360 | ldr ip, =0x03000000 @ RunFast mode, scalar mode | |
361 | fmxr FPSCR, ip | |
362 | ||
/* one leftover element (len & 1); the -4 offsets address the element just loaded */
363 | tst LEN, #1 | |
364 | beq 1f | |
365 | vldmia BASE1!, {s0} | |
366 | vldmia BASE2!, {s8} | |
367 | vadd.f s16, s0, s8 | |
368 | vsub.f s24, s0, s8 | |
369 | vstr s16, [BASE1, #0-4*1] | |
370 | vstr s24, [BASE2, #0-4*1] | |
371 | 1: | |
/* two leftover elements (len & 2), written back as one double-word per array */
372 | tst LEN, #2 | |
373 | beq 2f | |
374 | vldmia BASE1!, {s0-s1} | |
375 | vldmia BASE2!, {s8-s9} | |
376 | vadd.f s16, s0, s8 | |
377 | vadd.f s17, s1, s9 | |
378 | vsub.f s24, s0, s8 | |
379 | vsub.f s25, s1, s9 | |
380 | vstr d8, [BASE1, #0-8*1] @ s16,s17 | |
381 | vstr d12, [BASE2, #0-8*1] @ s24,s25 | |
382 | 2: | |
/* four leftover elements (len & 4), written back as two double-words per array */
383 | tst LEN, #4 | |
384 | beq 3f | |
385 | vldmia BASE1!, {s0-s1} | |
386 | vldmia BASE2!, {s8-s9} | |
387 | vldmia BASE1!, {s2-s3} | |
388 | vldmia BASE2!, {s10-s11} | |
389 | vadd.f s16, s0, s8 | |
390 | vadd.f s17, s1, s9 | |
391 | vsub.f s24, s0, s8 | |
392 | vsub.f s25, s1, s9 | |
393 | vadd.f s18, s2, s10 | |
394 | vadd.f s19, s3, s11 | |
395 | vsub.f s26, s2, s10 | |
396 | vsub.f s27, s3, s11 | |
397 | vstr d8, [BASE1, #0-16*1] @ s16,s17 | |
398 | vstr d12, [BASE2, #0-16*1] @ s24,s25 | |
399 | vstr d9, [BASE1, #8-16*1] @ s18,s19 | |
400 | vstr d13, [BASE2, #8-16*1] @ s26,s27 | |
401 | 3: | |
/* drop the low three bits of len; if nothing is left, go straight to the exit */
402 | bics LEN, LEN, #7 | |
403 | beq 7f | |
404 | 4: | |
405 | ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 | |
406 | fmxr FPSCR, ip | |
407 | ||
/*
 * Pipeline prologue: load eight element pairs (v1 -> s0-s7,
 * v2 -> s8-s15) and start on them. From here on each vadd.f / vsub.f is
 * relied on to produce four results (the stores below write s16-s19,
 * s24-s27, s20-s23 and s28-s31 after one instruction each).
 */
408 | vldmia BASE1!, {s0-s1} | |
409 | vldmia BASE2!, {s8-s9} | |
410 | vldmia BASE1!, {s2-s3} | |
411 | vldmia BASE2!, {s10-s11} | |
412 | vadd.f s16, s0, s8 | |
413 | vldmia BASE1!, {s4-s5} | |
414 | vldmia BASE2!, {s12-s13} | |
415 | vldmia BASE1!, {s6-s7} | |
416 | vldmia BASE2!, {s14-s15} | |
417 | vsub.f s24, s0, s8 | |
418 | vadd.f s20, s4, s12 | |
/* len -= 8; if those were the only eight elements, skip the loop and drain at 6 */
419 | subs LEN, LEN, #8 | |
420 | beq 6f | |
/*
 * Steady state, eight elements per pass. The pointers run 48 bytes
 * ahead of the group being written back at each store, hence the
 * -16*3 based offsets: the first four stores follow a 16-byte load on
 * top of the 32 bytes already read ahead, the second four follow
 * another 16-byte load and therefore hit the next group of four.
 */
421 | 5: vldmia BASE1!, {s0-s3} | |
422 | vldmia BASE2!, {s8-s11} | |
423 | vsub.f s28, s4, s12 | |
424 | vstr d8, [BASE1, #0-16*3] @ s16,s17 | |
425 | vstr d9, [BASE1, #8-16*3] @ s18,s19 | |
426 | vstr d12, [BASE2, #0-16*3] @ s24,s25 | |
427 | vstr d13, [BASE2, #8-16*3] @ s26,s27 | |
428 | vadd.f s16, s0, s8 | |
429 | vldmia BASE1!, {s4-s7} | |
430 | vldmia BASE2!, {s12-s15} | |
431 | vsub.f s24, s0, s8 | |
432 | vstr d10, [BASE1, #0-16*3] @ s20,s21 | |
433 | vstr d11, [BASE1, #8-16*3] @ s22,s23 | |
434 | vstr d14, [BASE2, #0-16*3] @ s28,s29 | |
435 | vstr d15, [BASE2, #8-16*3] @ s30,s31 | |
436 | vadd.f s20, s4, s12 | |
437 | subs LEN, LEN, #8 | |
438 | bne 5b | |
/* drain: the pointers are 32 bytes past the last eight elements; finish them and write them back */
439 | 6: vsub.f s28, s4, s12 | |
440 | vstr d8, [BASE1, #0-16*2] @ s16,s17 | |
441 | vstr d9, [BASE1, #8-16*2] @ s18,s19 | |
442 | vstr d12, [BASE2, #0-16*2] @ s24,s25 | |
443 | vstr d13, [BASE2, #8-16*2] @ s26,s27 | |
444 | vstr d10, [BASE1, #0-16*1] @ s20,s21 | |
445 | vstr d11, [BASE1, #8-16*1] @ s22,s23 | |
446 | vstr d14, [BASE2, #0-16*1] @ s28,s29 | |
447 | vstr d15, [BASE2, #8-16*1] @ s30,s31 | |
448 | 7: | |
/* common exit: put back the FPSCR value read on entry, then the saved registers */
449 | fmxr FPSCR, OLDFPSCR | |
450 | vpop {s16-s31} | |
451 | bx lr | |
452 | ||
453 | .unreq BASE1 | |
454 | .unreq BASE2 | |
455 | .unreq LEN | |
456 | .unreq OLDFPSCR | |
457 | endfunc | |