Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * Copyright (c) 2013 RISC OS Open Ltd | |
3 | * Author: Ben Avison <bavison@riscosopen.org> | |
4 | * | |
5 | * This file is part of FFmpeg. | |
6 | * | |
7 | * FFmpeg is free software; you can redistribute it and/or | |
8 | * modify it under the terms of the GNU Lesser General Public | |
9 | * License as published by the Free Software Foundation; either | |
10 | * version 2.1 of the License, or (at your option) any later version. | |
11 | * | |
12 | * FFmpeg is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | * Lesser General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU Lesser General Public | |
18 | * License along with FFmpeg; if not, write to the Free Software | |
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | */ | |
21 | ||
22 | #include "libavutil/arm/asm.S" | |
23 | ||
/*
 * Symbolic register names used throughout this file.
 *
 * CONTEXT, ORIGOUT and IN are a1, a2 and a3, matching the order of the three
 * arguments (s, output, input) in the ff_imdct_half_vfp prototype.
 *
 * Several names deliberately share one physical register:
 *   - J0 shares a2 with ORIGOUT; the function copies ORIGOUT into OUT before
 *     any macro that writes J0 is expanded.
 *   - REVTAB_HI shares v5 with OLDFPSCR; the generic path pushes OLDFPSCR to
 *     the stack before REVTAB_HI is computed.
 *   - IN_HI and OUT_HI share v6; IN_HI is used by the pre-rotation loop and
 *     OUT_HI by the post-rotation loop.
 */
24 | CONTEXT .req a1 | |
25 | ORIGOUT .req a2 | |
26 | IN .req a3 | |
27 | OUT .req v1 | |
28 | REVTAB .req v2 | |
29 | TCOS .req v3 | |
30 | TSIN .req v4 | |
31 | OLDFPSCR .req v5 | |
32 | J0 .req a2 | |
33 | J1 .req a4 | |
34 | J2 .req ip | |
35 | J3 .req lr | |
36 | REVTAB_HI .req v5 | |
37 | IN_HI .req v6 | |
38 | OUT_HI .req v6 | |
39 | TCOS_HI .req sl | |
40 | TSIN_HI .req fp | |
41 | ||
/*
 * prerotation_innerloop: one fully unrolled pre-rotation step.
 *
 * All addresses are fixed at assembly time from the symbols k and n4, which
 * the caller must .set before expanding the macro; k is advanced by 2 at the
 * end of each expansion.
 *
 * The lines tagged "vector operation" rely on the short-vector FPSCR setting
 * (length 4) that the caller installs, so each one acts on four consecutive
 * registers. With cos in s16-s19 and sin in s20-s23 the step computes
 *     s8-s11  = s0-s3 * cos - s4-s7 * sin
 *     s12-s15 = s0-s3 * sin + s4-s7 * cos
 * and stores each (s8+i, s12+i) pair as two adjacent words at
 * OUT + 8 * index, where the four indices come from REVTAB.
 *
 * Writes: J0-J3 (a2, a4, ip, lr), s0-s15, s16-s23.
 */
42 | .macro prerotation_innerloop | |
43 | .set trig_lo, k | |
44 | .set trig_hi, n4 - k - 2 | |
45 | .set in_lo, trig_lo * 2 | |
46 | .set in_hi, trig_hi * 2 | |
47 | vldr d8, [TCOS, #trig_lo*4] @ s16,s17 | |
48 | vldr d9, [TCOS, #trig_hi*4] @ s18,s19 | |
49 | vldr s0, [IN, #in_hi*4 + 12] | |
50 | vldr s1, [IN, #in_hi*4 + 4] | |
51 | vldr s2, [IN, #in_lo*4 + 12] | |
52 | vldr s3, [IN, #in_lo*4 + 4] | |
53 | vmul.f s8, s0, s16 @ vector operation | |
54 | vldr d10, [TSIN, #trig_lo*4] @ s20,s21 | |
55 | vldr d11, [TSIN, #trig_hi*4] @ s22,s23 | |
56 | vldr s4, [IN, #in_lo*4] | |
57 | vldr s5, [IN, #in_lo*4 + 8] | |
58 | vldr s6, [IN, #in_hi*4] | |
59 | vldr s7, [IN, #in_hi*4 + 8] | |
/* Each 32-bit load from REVTAB fetches two 16-bit indices at once; the upper
 * one is split off with lsr #16 and the lower one is isolated with the mask. */
60 | ldr J0, [REVTAB, #trig_lo*2] | |
61 | vmul.f s12, s0, s20 @ vector operation | |
62 | ldr J2, [REVTAB, #trig_hi*2] | |
63 | mov J1, J0, lsr #16 | |
64 | and J0, J0, #255 @ halfword value will be < n4 | |
65 | vmls.f s8, s4, s20 @ vector operation | |
66 | mov J3, J2, lsr #16 | |
67 | and J2, J2, #255 @ halfword value will be < n4 | |
/* Turn each index into a destination address: OUT + index * 8. */
68 | add J0, OUT, J0, lsl #3 | |
69 | vmla.f s12, s4, s16 @ vector operation | |
70 | add J1, OUT, J1, lsl #3 | |
71 | add J2, OUT, J2, lsl #3 | |
72 | add J3, OUT, J3, lsl #3 | |
73 | vstr s8, [J0] | |
74 | vstr s9, [J1] | |
75 | vstr s10, [J2] | |
76 | vstr s11, [J3] | |
77 | vstr s12, [J0, #4] | |
78 | vstr s13, [J1, #4] | |
79 | vstr s14, [J2, #4] | |
80 | vstr s15, [J3, #4] | |
81 | .set k, k + 2 | |
82 | .endm | |
83 | ||
/*
 * prerotation_innerloop_rolled: loop-body form of the pre-rotation step.
 *
 * Performs the same arithmetic as the unrolled variant (s8-s11 and s12-s15
 * built from s0-s7 with cos in s16-s19 and sin in s20-s23), but addresses
 * memory through moving pointers instead of assembly-time offsets:
 *   - TCOS, TSIN and REVTAB are read upwards with post-increment;
 *   - TCOS_HI, TSIN_HI and REVTAB_HI are read downwards with pre-decrement;
 *   - IN is advanced by 16 bytes and IN_HI moved back by 16 bytes.
 * The caller supplies the loop label and the termination test.
 *
 * Writes: J0-J3, s0-s15, s16-s23 and all of the pointers listed above.
 */
84 | .macro prerotation_innerloop_rolled | |
85 | vldmia TCOS!, {s16,s17} | |
86 | vldmdb TCOS_HI!, {s18,s19} | |
87 | vldr s0, [IN_HI, #-4] | |
88 | vldr s1, [IN_HI, #-12] | |
89 | vldr s2, [IN, #12] | |
90 | vldr s3, [IN, #4] | |
91 | vmul.f s8, s0, s16 @ vector operation | |
92 | vldmia TSIN!, {s20,s21} | |
93 | vldmdb TSIN_HI!, {s22,s23} | |
94 | vldr s4, [IN] | |
95 | vldr s5, [IN, #8] | |
96 | vldr s6, [IN_HI, #-16] | |
97 | vldr s7, [IN_HI, #-8] | |
98 | vmul.f s12, s0, s20 @ vector operation | |
99 | add IN, IN, #16 | |
100 | sub IN_HI, IN_HI, #16 | |
101 | ldrh J0, [REVTAB], #2 | |
102 | ldrh J1, [REVTAB], #2 | |
103 | vmls.f s8, s4, s20 @ vector operation | |
/* REVTAB_HI walks downwards, so J3 is fetched before J2 to end up with J2
 * holding the entry at the lower address. */
104 | ldrh J3, [REVTAB_HI, #-2]! | |
105 | ldrh J2, [REVTAB_HI, #-2]! | |
106 | add J0, OUT, J0, lsl #3 | |
107 | vmla.f s12, s4, s16 @ vector operation | |
108 | add J1, OUT, J1, lsl #3 | |
109 | add J2, OUT, J2, lsl #3 | |
110 | add J3, OUT, J3, lsl #3 | |
111 | vstr s8, [J0] | |
112 | vstr s9, [J1] | |
113 | vstr s10, [J2] | |
114 | vstr s11, [J3] | |
115 | vstr s12, [J0, #4] | |
116 | vstr s13, [J1, #4] | |
117 | vstr s14, [J2, #4] | |
118 | vstr s15, [J3, #4] | |
119 | .endm | |
120 | ||
/*
 * postrotation_innerloop tail, head: one unrolled, software-pipelined
 * post-rotation step operating in place on OUT.
 *
 * Parameters (each is either empty or any non-empty token):
 *   head - when non-empty, emit the first half of step k: load the sin and
 *          cos values and eight words from OUT, then start the two products
 *          (s8-s11 = s4-s7 * sin, s12-s15 = s0-s3 * sin). k is advanced by 2
 *          only when head is given.
 *   tail - when non-empty, emit the second half of step k-2: finish the
 *          products (s8-s11 -= s0-s3 * cos, s12-s15 += s4-s7 * cos) and
 *          store the eight results back to OUT.
 *
 * Because the tail of one step is interleaved with the head of the next, the
 * cosine values are kept in two alternating register banks selected by
 * (k & 2): the head loads into d10/d11 or d12/d13, while the tail reads the
 * other bank, which the previous head filled.
 *
 * Offsets are computed at assembly time from the symbols k and n8, which the
 * caller must .set beforehand.
 */
121 | .macro postrotation_innerloop tail, head | |
122 | .set trig_lo_head, n8 - k - 2 | |
123 | .set trig_hi_head, n8 + k | |
124 | .set out_lo_head, trig_lo_head * 2 | |
125 | .set out_hi_head, trig_hi_head * 2 | |
126 | .set trig_lo_tail, n8 - (k - 2) - 2 | |
127 | .set trig_hi_tail, n8 + (k - 2) | |
128 | .set out_lo_tail, trig_lo_tail * 2 | |
129 | .set out_hi_tail, trig_hi_tail * 2 | |
130 | .if (k & 2) == 0 | |
131 | TCOS_D0_HEAD .req d10 @ s20,s21 | |
132 | TCOS_D1_HEAD .req d11 @ s22,s23 | |
133 | TCOS_S0_TAIL .req s24 | |
134 | .else | |
135 | TCOS_D0_HEAD .req d12 @ s24,s25 | |
136 | TCOS_D1_HEAD .req d13 @ s26,s27 | |
137 | TCOS_S0_TAIL .req s20 | |
138 | .endif | |
139 | .ifnc "\tail","" | |
140 | vmls.f s8, s0, TCOS_S0_TAIL @ vector operation | |
141 | .endif | |
142 | .ifnc "\head","" | |
143 | vldr d8, [TSIN, #trig_lo_head*4] @ s16,s17 | |
144 | vldr d9, [TSIN, #trig_hi_head*4] @ s18,s19 | |
145 | vldr TCOS_D0_HEAD, [TCOS, #trig_lo_head*4] | |
146 | .endif | |
147 | .ifnc "\tail","" | |
148 | vmla.f s12, s4, TCOS_S0_TAIL @ vector operation | |
149 | .endif | |
150 | .ifnc "\head","" | |
151 | vldr s0, [OUT, #out_lo_head*4] | |
152 | vldr s1, [OUT, #out_lo_head*4 + 8] | |
153 | vldr s2, [OUT, #out_hi_head*4] | |
154 | vldr s3, [OUT, #out_hi_head*4 + 8] | |
155 | vldr s4, [OUT, #out_lo_head*4 + 4] | |
156 | vldr s5, [OUT, #out_lo_head*4 + 12] | |
157 | vldr s6, [OUT, #out_hi_head*4 + 4] | |
158 | vldr s7, [OUT, #out_hi_head*4 + 12] | |
159 | .endif | |
160 | .ifnc "\tail","" | |
161 | vstr s8, [OUT, #out_lo_tail*4] | |
162 | vstr s9, [OUT, #out_lo_tail*4 + 8] | |
163 | vstr s10, [OUT, #out_hi_tail*4] | |
164 | vstr s11, [OUT, #out_hi_tail*4 + 8] | |
165 | .endif | |
166 | .ifnc "\head","" | |
167 | vmul.f s8, s4, s16 @ vector operation | |
168 | .endif | |
/* The s12-s15 results are written in the opposite order to s8-s11: the hi
 * half first (at +12, +4), then the lo half (at +12, +4). */
169 | .ifnc "\tail","" | |
170 | vstr s12, [OUT, #out_hi_tail*4 + 12] | |
171 | vstr s13, [OUT, #out_hi_tail*4 + 4] | |
172 | vstr s14, [OUT, #out_lo_tail*4 + 12] | |
173 | vstr s15, [OUT, #out_lo_tail*4 + 4] | |
174 | .endif | |
175 | .ifnc "\head","" | |
176 | vmul.f s12, s0, s16 @ vector operation | |
177 | vldr TCOS_D1_HEAD, [TCOS, #trig_hi_head*4] | |
178 | .endif | |
/* Release the per-expansion aliases so the next expansion can rebind them. */
179 | .unreq TCOS_D0_HEAD | |
180 | .unreq TCOS_D1_HEAD | |
181 | .unreq TCOS_S0_TAIL | |
182 | .ifnc "\head","" | |
183 | .set k, k + 2 | |
184 | .endif | |
185 | .endm | |
186 | ||
/*
 * postrotation_innerloop_rolled: loop-body form of the software-pipelined
 * post-rotation step.
 *
 * Parameters:
 *   tail, head       - as for postrotation_innerloop: a non-empty value
 *                      enables the finishing half of the previous step and
 *                      the starting half of the next step respectively.
 *   tcos_s0_head ..
 *   tcos_s3_head     - four single-precision registers that the head fills
 *                      with cosine values (two from TCOS going up, two from
 *                      TCOS_HI going down).
 *   tcos_s0_tail     - first register of the cosine bank the tail reads.
 *   out_offset_head  - byte offset applied to OUT (added) and OUT_HI
 *                      (subtracted) for the head's loads.
 *   out_offset_tail  - the same for the tail's stores.
 *
 * TSIN and TCOS are post-incremented and TSIN_HI and TCOS_HI pre-decremented
 * by the head. OUT and OUT_HI are not modified here; the caller steps them.
 */
187 | .macro postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head, tcos_s2_head, tcos_s3_head, tcos_s0_tail, out_offset_head, out_offset_tail | |
188 | .ifnc "\tail","" | |
189 | vmls.f s8, s0, \tcos_s0_tail @ vector operation | |
190 | .endif | |
191 | .ifnc "\head","" | |
192 | vldmia TSIN!, {s16,s17} | |
193 | vldmdb TSIN_HI!, {s18,s19} | |
194 | vldmia TCOS!, {\tcos_s0_head,\tcos_s1_head} | |
195 | .endif | |
196 | .ifnc "\tail","" | |
197 | vmla.f s12, s4, \tcos_s0_tail @ vector operation | |
198 | .endif | |
199 | .ifnc "\head","" | |
200 | vldr s0, [OUT, #+\out_offset_head+0] | |
201 | vldr s1, [OUT, #+\out_offset_head+8] | |
202 | vldr s2, [OUT_HI, #-\out_offset_head-16] | |
203 | vldr s3, [OUT_HI, #-\out_offset_head-8] | |
204 | vldr s4, [OUT, #+\out_offset_head+4] | |
205 | vldr s5, [OUT, #+\out_offset_head+12] | |
206 | vldr s6, [OUT_HI, #-\out_offset_head-12] | |
207 | vldr s7, [OUT_HI, #-\out_offset_head-4] | |
208 | .endif | |
209 | .ifnc "\tail","" | |
210 | vstr s8, [OUT, #+\out_offset_tail+0] | |
211 | vstr s9, [OUT, #+\out_offset_tail+8] | |
212 | vstr s10, [OUT_HI, #-\out_offset_tail-16] | |
213 | vstr s11, [OUT_HI, #-\out_offset_tail-8] | |
214 | .endif | |
215 | .ifnc "\head","" | |
216 | vmul.f s8, s4, s16 @ vector operation | |
217 | .endif | |
218 | .ifnc "\tail","" | |
219 | vstr s12, [OUT_HI, #-\out_offset_tail-4] | |
220 | vstr s13, [OUT_HI, #-\out_offset_tail-12] | |
221 | vstr s14, [OUT, #+\out_offset_tail+12] | |
222 | vstr s15, [OUT, #+\out_offset_tail+4] | |
223 | .endif | |
224 | .ifnc "\head","" | |
225 | vmul.f s12, s0, s16 @ vector operation | |
226 | vldmdb TCOS_HI!, {\tcos_s2_head,\tcos_s3_head} | |
227 | .endif | |
228 | .endm | |
229 | ||
230 | ||
/*
 * ff_imdct_half_vfp
 *
 * Structure: pre-rotation of the input into OUT (scattered through REVTAB),
 * an FFT over OUT, then an in-place post-rotation of OUT.
 *
 * Two code paths are selected on the mdct_bits field read from the context:
 *   - mdct_bits == 6: fully unrolled macros with n = 64 and a direct call to
 *     ff_fft16_vfp;
 *   - otherwise (label 10): rolled loops sized from 1 << mdct_bits and an
 *     indirect call through the function pointer at context offset 9*4.
 *
 * Context words read: 2*4 (loaded into REVTAB), 5*4 (mdct_bits), 6*4 (TCOS),
 * 7*4 (TSIN), 9*4 (fft_calc).
 *
 * FPSCR is switched to 0x03030000 around each rotation stage and restored to
 * its entry value before each call and before returning.
 */
231 | /* void ff_imdct_half_vfp(FFTContext *s, | |
232 | * FFTSample *output, | |
233 | * const FFTSample *input) | |
234 | */ | |
235 | function ff_imdct_half_vfp, export=1 | |
236 | ldr ip, [CONTEXT, #5*4] @ mdct_bits | |
237 | teq ip, #6 | |
238 | bne 10f | |
239 | ||
/* ---- Fixed-size path: sizes are assembly-time constants. ---- */
240 | .set n, 1<<6 | |
241 | .set n2, n/2 | |
242 | .set n4, n/4 | |
243 | .set n8, n/8 | |
244 | ||
245 | push {v1-v5,lr} | |
246 | vpush {s16-s27} | |
247 | fmrx OLDFPSCR, FPSCR | |
248 | ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 | |
249 | fmxr FPSCR, lr | |
/* ORIGOUT (a2) doubles as J0, so move the output pointer out of the way
 * before the pre-rotation macro overwrites it. */
250 | mov OUT, ORIGOUT | |
251 | ldr REVTAB, [CONTEXT, #2*4] | |
252 | ldr TCOS, [CONTEXT, #6*4] | |
253 | ldr TSIN, [CONTEXT, #7*4] | |
254 | ||
255 | .set k, 0 | |
256 | .rept n8/2 | |
257 | prerotation_innerloop | |
258 | .endr | |
259 | ||
/* Restore the caller's FPSCR for the FFT call. OUT, TCOS, TSIN and OLDFPSCR
 * are all used again afterwards, so this relies on the callee preserving
 * v1 and v3-v5. */
260 | fmxr FPSCR, OLDFPSCR | |
261 | mov a1, OUT | |
262 | bl X(ff_fft16_vfp) | |
263 | ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 | |
264 | fmxr FPSCR, lr | |
265 | ||
/* Software pipeline: a head-only prologue, n8/2 - 1 overlapped tail+head
 * steps, then a tail-only epilogue. */
266 | .set k, 0 | |
267 | postrotation_innerloop , head | |
268 | .rept n8/2 - 1 | |
269 | postrotation_innerloop tail, head | |
270 | .endr | |
271 | postrotation_innerloop tail | |
272 | ||
273 | fmxr FPSCR, OLDFPSCR | |
274 | vpop {s16-s27} | |
275 | pop {v1-v5,pc} | |
276 | ||
/* ---- Generic path: sizes derived at run time from mdct_bits (in ip). ---- */
277 | 10: | |
278 | push {v1-v6,sl,fp,lr} | |
279 | vpush {s16-s27} | |
280 | fmrx OLDFPSCR, FPSCR | |
281 | ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 | |
282 | fmxr FPSCR, lr | |
283 | mov lr, #1 | |
284 | mov OUT, ORIGOUT | |
285 | ldr REVTAB, [CONTEXT, #2*4] | |
286 | ldr TCOS, [CONTEXT, #6*4] | |
287 | ldr TSIN, [CONTEXT, #7*4] | |
/* lr = 1 << mdct_bits */
288 | mov lr, lr, lsl ip | |
289 | ||
/* CONTEXT (a1) and OLDFPSCR (v5) are stacked here: v5 is reused as REVTAB_HI
 * by the loop below, and both values are reloaded from the stack afterwards.
 * The *_HI pointers start lr*2, lr/2, lr and lr bytes above their bases. */
290 | push {CONTEXT,OLDFPSCR} | |
291 | add IN_HI, IN, lr, lsl #1 | |
292 | add REVTAB_HI, REVTAB, lr, lsr #1 | |
293 | add TCOS_HI, TCOS, lr | |
294 | add TSIN_HI, TSIN, lr | |
/* IN moves up and IN_HI moves down by 16 bytes per pass; stop when they meet. */
295 | 0: prerotation_innerloop_rolled | |
296 | teq IN, IN_HI | |
297 | bne 0b | |
/* Reload without popping: the saved pair is needed again after the call. */
298 | ldmia sp, {CONTEXT,OLDFPSCR} | |
299 | ||
300 | mov ORIGOUT, OUT | |
301 | fmxr FPSCR, OLDFPSCR | |
302 | ldr ip, [CONTEXT, #9*4] | |
/* NOTE(review): 9 + 12 + 2 = 23 words have been pushed since entry, an odd
 * count, so sp is not a multiple of 8 here if it was on entry. Confirm the
 * callee does not depend on 8-byte stack alignment. */
303 | blx ip @ s->fft_calc(s, output) | |
304 | ||
305 | pop {CONTEXT,OLDFPSCR} | |
306 | ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 | |
307 | ldr ip, [CONTEXT, #5*4] @ mdct_bits | |
308 | fmxr FPSCR, lr | |
309 | mov lr, #1 | |
310 | mov lr, lr, lsl ip | |
/* TCOS and TSIN were advanced by the pre-rotation loop; step them back by
 * lr/2 bytes and rebuild the *_HI pointers for the post-rotation pass. */
311 | sub TCOS, TCOS, lr, lsr #1 | |
312 | sub TSIN, TSIN, lr, lsr #1 | |
313 | add OUT_HI, OUT, lr, lsl #1 | |
314 | add TCOS_HI, TCOS, lr | |
315 | add TSIN_HI, TSIN, lr | |
/* Software-pipelined loop, two steps per pass. The prologue is head-only and
 * jumps into the middle of the loop body. The cosine bank alternates between
 * s20-s23 and s24-s27. After OUT/OUT_HI move by 32 bytes, the step that was
 * loaded at offset 16 is stored at offset -16. */
316 | postrotation_innerloop_rolled , head, s20, s21, s22, s23,, 0 | |
317 | b 1f | |
318 | 0: add OUT, OUT, #32 | |
319 | sub OUT_HI, OUT_HI, #32 | |
320 | postrotation_innerloop_rolled tail, head, s20, s21, s22, s23, s24, 0, -16 | |
321 | 1: postrotation_innerloop_rolled tail, head, s24, s25, s26, s27, s20, 16, 0 | |
322 | teq TSIN, TSIN_HI | |
323 | bne 0b | |
/* Epilogue: finish and store the last step started by the head at label 1. */
324 | postrotation_innerloop_rolled tail,,,,,, s24,, 16 | |
325 | ||
326 | fmxr FPSCR, OLDFPSCR | |
327 | vpop {s16-s27} | |
328 | pop {v1-v6,sl,fp,pc} | |
329 | endfunc | |
330 | ||
/* Release every register alias defined with .req at the top of this file. */
331 | .unreq CONTEXT | |
332 | .unreq ORIGOUT | |
333 | .unreq IN | |
334 | .unreq OUT | |
335 | .unreq REVTAB | |
336 | .unreq TCOS | |
337 | .unreq TSIN | |
338 | .unreq OLDFPSCR | |
339 | .unreq J0 | |
340 | .unreq J1 | |
341 | .unreq J2 | |
342 | .unreq J3 | |
343 | .unreq REVTAB_HI | |
344 | .unreq IN_HI | |
345 | .unreq OUT_HI | |
346 | .unreq TCOS_HI | |
347 | .unreq TSIN_HI |