/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
23 | ||
24 | IMDCT .req r0 | |
25 | ORIG_P_SB .req r1 | |
26 | P_SB_OFF .req r2 | |
27 | I .req r0 | |
28 | P_SB2_UP .req r1 | |
29 | OLDFPSCR .req r2 | |
30 | P_SB2_DN .req r3 | |
31 | P_WIN_DN .req r4 | |
32 | P_OUT_DN .req r5 | |
33 | P_SB .req r6 | |
34 | J_WRAP .req r7 | |
35 | P_WIN_UP .req r12 | |
36 | P_OUT_UP .req r14 | |
37 | ||
38 | SCALE .req s0 | |
39 | SBUF_DAT_REV0 .req s4 | |
40 | SBUF_DAT_REV1 .req s5 | |
41 | SBUF_DAT_REV2 .req s6 | |
42 | SBUF_DAT_REV3 .req s7 | |
43 | VA0 .req s8 | |
44 | VA3 .req s11 | |
45 | VB0 .req s12 | |
46 | VB3 .req s15 | |
47 | VC0 .req s8 | |
48 | VC3 .req s11 | |
49 | VD0 .req s12 | |
50 | VD3 .req s15 | |
51 | SBUF_DAT0 .req s16 | |
52 | SBUF_DAT1 .req s17 | |
53 | SBUF_DAT2 .req s18 | |
54 | SBUF_DAT3 .req s19 | |
55 | SBUF_DAT_ALT0 .req s20 | |
56 | SBUF_DAT_ALT1 .req s21 | |
57 | SBUF_DAT_ALT2 .req s22 | |
58 | SBUF_DAT_ALT3 .req s23 | |
59 | WIN_DN_DAT0 .req s24 | |
60 | WIN_UP_DAT0 .req s28 | |
61 | ||
62 | ||
63 | .macro inner_loop half, tail, head | |
64 | .if (OFFSET & (64*4)) == 0 @ even numbered call | |
65 | SBUF_DAT_THIS0 .req SBUF_DAT0 | |
66 | SBUF_DAT_THIS1 .req SBUF_DAT1 | |
67 | SBUF_DAT_THIS2 .req SBUF_DAT2 | |
68 | SBUF_DAT_THIS3 .req SBUF_DAT3 | |
69 | .ifnc "\head","" | |
70 | vldr d8, [P_SB, #OFFSET] @ d8 = SBUF_DAT | |
71 | vldr d9, [P_SB, #OFFSET+8] | |
72 | .endif | |
73 | .else | |
74 | SBUF_DAT_THIS0 .req SBUF_DAT_ALT0 | |
75 | SBUF_DAT_THIS1 .req SBUF_DAT_ALT1 | |
76 | SBUF_DAT_THIS2 .req SBUF_DAT_ALT2 | |
77 | SBUF_DAT_THIS3 .req SBUF_DAT_ALT3 | |
78 | .ifnc "\head","" | |
79 | vldr d10, [P_SB, #OFFSET] @ d10 = SBUF_DAT_ALT | |
80 | vldr d11, [P_SB, #OFFSET+8] | |
81 | .endif | |
82 | .endif | |
83 | .ifnc "\tail","" | |
84 | .ifc "\half","ab" | |
85 | vmls.f VA0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors | |
86 | .else | |
87 | vmla.f VD0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors | |
88 | .endif | |
89 | .endif | |
90 | .ifnc "\head","" | |
91 | vldr d14, [P_WIN_UP, #OFFSET] @ d14 = WIN_UP_DAT | |
92 | vldr d15, [P_WIN_UP, #OFFSET+8] | |
93 | vldr d12, [P_WIN_DN, #OFFSET] @ d12 = WIN_DN_DAT | |
94 | vldr d13, [P_WIN_DN, #OFFSET+8] | |
95 | vmov SBUF_DAT_REV3, SBUF_DAT_THIS0 | |
96 | vmov SBUF_DAT_REV2, SBUF_DAT_THIS1 | |
97 | vmov SBUF_DAT_REV1, SBUF_DAT_THIS2 | |
98 | vmov SBUF_DAT_REV0, SBUF_DAT_THIS3 | |
99 | .ifc "\half","ab" | |
100 | vmla.f VB0, SBUF_DAT_THIS0, WIN_UP_DAT0 | |
101 | .else | |
102 | vmla.f VC0, SBUF_DAT_THIS0, WIN_UP_DAT0 | |
103 | .endif | |
104 | teq J_WRAP, #J | |
105 | bne 2f @ strongly predictable, so better than cond exec in this case | |
106 | sub P_SB, P_SB, #512*4 | |
107 | 2: | |
108 | .set J, J - 64 | |
109 | .set OFFSET, OFFSET + 64*4 | |
110 | .endif | |
111 | .unreq SBUF_DAT_THIS0 | |
112 | .unreq SBUF_DAT_THIS1 | |
113 | .unreq SBUF_DAT_THIS2 | |
114 | .unreq SBUF_DAT_THIS3 | |
115 | .endm | |
116 | ||
117 | ||
/* void ff_synth_filter_float_vfp(FFTContext *imdct,
 *                                float *synth_buf_ptr, int *synth_buf_offset,
 *                                float synth_buf2[32], const float window[512],
 *                                float out[32], const float in[32], float scale)
 */
123 | function ff_synth_filter_float_vfp, export=1 | |
124 | push {r3-r7,lr} | |
125 | vpush {s16-s31} | |
126 | ldr lr, [P_SB_OFF] | |
127 | add a2, ORIG_P_SB, lr, lsl #2 @ calculate synth_buf to pass to imdct_half | |
128 | mov P_SB, a2 @ and keep a copy for ourselves | |
129 | bic J_WRAP, lr, #63 @ mangled to make testing for wrap easier in inner loop | |
130 | sub lr, lr, #32 | |
131 | and lr, lr, #512-32 | |
132 | str lr, [P_SB_OFF] @ rotate offset, modulo buffer size, ready for next call | |
133 | ldr a3, [sp, #(16+6+2)*4] @ fetch in from stack, to pass to imdct_half | |
134 | VFP vmov s16, SCALE @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case | |
135 | bl X(ff_imdct_half_vfp) | |
136 | VFP vmov SCALE, s16 | |
137 | ||
138 | fmrx OLDFPSCR, FPSCR | |
139 | ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 | |
140 | fmxr FPSCR, lr | |
141 | ldr P_SB2_DN, [sp, #16*4] | |
142 | ldr P_WIN_DN, [sp, #(16+6+0)*4] | |
143 | ldr P_OUT_DN, [sp, #(16+6+1)*4] | |
144 | NOVFP vldr SCALE, [sp, #(16+6+3)*4] | |
145 | ||
146 | #define IMM_OFF_SKEW 956 /* also valid immediate constant when you add 16*4 */ | |
147 | add P_SB, P_SB, #IMM_OFF_SKEW @ so we can use -ve offsets to use full immediate offset range | |
148 | add P_SB2_UP, P_SB2_DN, #16*4 | |
149 | add P_WIN_UP, P_WIN_DN, #16*4+IMM_OFF_SKEW | |
150 | add P_OUT_UP, P_OUT_DN, #16*4 | |
151 | add P_SB2_DN, P_SB2_DN, #16*4 | |
152 | add P_WIN_DN, P_WIN_DN, #12*4+IMM_OFF_SKEW | |
153 | add P_OUT_DN, P_OUT_DN, #16*4 | |
154 | mov I, #4 | |
155 | 1: | |
156 | vldmia P_SB2_UP!, {VB0-VB3} | |
157 | vldmdb P_SB2_DN!, {VA0-VA3} | |
158 | .set J, 512 - 64 | |
159 | .set OFFSET, -IMM_OFF_SKEW | |
160 | inner_loop ab,, head | |
161 | .rept 7 | |
162 | inner_loop ab, tail, head | |
163 | .endr | |
164 | inner_loop ab, tail | |
165 | add P_WIN_UP, P_WIN_UP, #4*4 | |
166 | sub P_WIN_DN, P_WIN_DN, #4*4 | |
167 | vmul.f VB0, VB0, SCALE @ SCALE treated as scalar | |
168 | add P_SB, P_SB, #(512+4)*4 | |
169 | subs I, I, #1 | |
170 | vmul.f VA0, VA0, SCALE | |
171 | vstmia P_OUT_UP!, {VB0-VB3} | |
172 | vstmdb P_OUT_DN!, {VA0-VA3} | |
173 | bne 1b | |
174 | ||
175 | add P_SB2_DN, P_SB2_DN, #(16+28-12)*4 | |
176 | sub P_SB2_UP, P_SB2_UP, #(16+16)*4 | |
177 | add P_WIN_DN, P_WIN_DN, #(32+16+28-12)*4 | |
178 | mov I, #4 | |
179 | 1: | |
180 | vldr.d d4, zero @ d4 = VC0 | |
181 | vldr.d d5, zero | |
182 | vldr.d d6, zero @ d6 = VD0 | |
183 | vldr.d d7, zero | |
184 | .set J, 512 - 64 | |
185 | .set OFFSET, -IMM_OFF_SKEW | |
186 | inner_loop cd,, head | |
187 | .rept 7 | |
188 | inner_loop cd, tail, head | |
189 | .endr | |
190 | inner_loop cd, tail | |
191 | add P_WIN_UP, P_WIN_UP, #4*4 | |
192 | sub P_WIN_DN, P_WIN_DN, #4*4 | |
193 | add P_SB, P_SB, #(512+4)*4 | |
194 | subs I, I, #1 | |
195 | vstmia P_SB2_UP!, {VC0-VC3} | |
196 | vstmdb P_SB2_DN!, {VD0-VD3} | |
197 | bne 1b | |
198 | ||
199 | fmxr FPSCR, OLDFPSCR | |
200 | vpop {s16-s31} | |
201 | pop {r3-r7,pc} | |
202 | endfunc | |
203 | ||
204 | .unreq IMDCT | |
205 | .unreq ORIG_P_SB | |
206 | .unreq P_SB_OFF | |
207 | .unreq I | |
208 | .unreq P_SB2_UP | |
209 | .unreq OLDFPSCR | |
210 | .unreq P_SB2_DN | |
211 | .unreq P_WIN_DN | |
212 | .unreq P_OUT_DN | |
213 | .unreq P_SB | |
214 | .unreq J_WRAP | |
215 | .unreq P_WIN_UP | |
216 | .unreq P_OUT_UP | |
217 | ||
218 | .unreq SCALE | |
219 | .unreq SBUF_DAT_REV0 | |
220 | .unreq SBUF_DAT_REV1 | |
221 | .unreq SBUF_DAT_REV2 | |
222 | .unreq SBUF_DAT_REV3 | |
223 | .unreq VA0 | |
224 | .unreq VA3 | |
225 | .unreq VB0 | |
226 | .unreq VB3 | |
227 | .unreq VC0 | |
228 | .unreq VC3 | |
229 | .unreq VD0 | |
230 | .unreq VD3 | |
231 | .unreq SBUF_DAT0 | |
232 | .unreq SBUF_DAT1 | |
233 | .unreq SBUF_DAT2 | |
234 | .unreq SBUF_DAT3 | |
235 | .unreq SBUF_DAT_ALT0 | |
236 | .unreq SBUF_DAT_ALT1 | |
237 | .unreq SBUF_DAT_ALT2 | |
238 | .unreq SBUF_DAT_ALT3 | |
239 | .unreq WIN_DN_DAT0 | |
240 | .unreq WIN_UP_DAT0 | |
241 | ||
242 | .align 3 | |
243 | zero: .word 0, 0 |