Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libavcodec / arm / mdct_vfp.S
CommitLineData
2ba45a60
DM
1/*
2 * Copyright (c) 2013 RISC OS Open Ltd
3 * Author: Ben Avison <bavison@riscosopen.org>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22#include "libavutil/arm/asm.S"
23
/*
 * Symbolic names for the core registers used by the macros and by
 * ff_imdct_half_vfp below.
 *
 * Several names deliberately share one physical register, so they must
 * never be live at the same time:
 *   - ORIGOUT and J0        both map to a2
 *   - OLDFPSCR and REVTAB_HI both map to v5
 *   - IN_HI and OUT_HI      both map to v6
 * J0-J3 are scratch address registers written by the pre-rotation macros.
 * The *_HI names are only used by the "rolled" (looping) code path.
 */
24CONTEXT .req a1
25ORIGOUT .req a2
26IN .req a3
27OUT .req v1
28REVTAB .req v2
29TCOS .req v3
30TSIN .req v4
31OLDFPSCR .req v5
32J0 .req a2
33J1 .req a4
34J2 .req ip
35J3 .req lr
36REVTAB_HI .req v5
37IN_HI .req v6
38OUT_HI .req v6
39TCOS_HI .req sl
40TSIN_HI .req fp
41
/*
 * prerotation_innerloop - one fully unrolled pre-rotation step.
 *
 * Takes no macro arguments. It reads the assembly-time symbols k and n4
 * (set by the invoking code) to build constant offsets from TCOS, TSIN, IN
 * and REVTAB, and it advances k by 2 before ending.
 *
 * Per expansion:
 *   - s16,s17 / s18,s19 receive TCOS words at indices k and n4-k-2;
 *     s20,s21 / s22,s23 receive the TSIN words at the same indices.
 *   - s0-s3 receive IN words at offsets in_hi+3, in_hi+1, in_lo+3, in_lo+1;
 *     s4-s7 receive IN words at offsets in_lo, in_lo+2, in_hi, in_hi+2.
 *   - The instructions tagged "vector operation" then form, assuming the
 *     length-4 stride-1 short-vector mode that ff_imdct_half_vfp programs
 *     into FPSCR:
 *         s8..s11  = s0..s3 * s16..s19 - s4..s7 * s20..s23
 *         s12..s15 = s0..s3 * s20..s23 + s4..s7 * s16..s19
 *   - Each (s8+i, s12+i) pair is stored as two consecutive words at
 *     OUT + 8 * index, where the four indices come from REVTAB.
 *
 * Clobbers J0-J3 (a2, a4, ip, lr) and s0-s23. J0 is the same register as
 * ORIGOUT, so ORIGOUT is not preserved.
 * The integer and VFP instructions are interleaved on purpose; keep the
 * order when editing.
 */
42.macro prerotation_innerloop
43 .set trig_lo, k
44 .set trig_hi, n4 - k - 2
45 .set in_lo, trig_lo * 2
46 .set in_hi, trig_hi * 2
47 vldr d8, [TCOS, #trig_lo*4] @ s16,s17
48 vldr d9, [TCOS, #trig_hi*4] @ s18,s19
49 vldr s0, [IN, #in_hi*4 + 12]
50 vldr s1, [IN, #in_hi*4 + 4]
51 vldr s2, [IN, #in_lo*4 + 12]
52 vldr s3, [IN, #in_lo*4 + 4]
53 vmul.f s8, s0, s16 @ vector operation
54 vldr d10, [TSIN, #trig_lo*4] @ s20,s21
55 vldr d11, [TSIN, #trig_hi*4] @ s22,s23
56 vldr s4, [IN, #in_lo*4]
57 vldr s5, [IN, #in_lo*4 + 8]
58 vldr s6, [IN, #in_hi*4]
59 vldr s7, [IN, #in_hi*4 + 8]
/* One 32-bit load fetches two adjacent 16-bit REVTAB entries at once. */
60 ldr J0, [REVTAB, #trig_lo*2]
61 vmul.f s12, s0, s20 @ vector operation
62 ldr J2, [REVTAB, #trig_hi*2]
/* Split the word: upper halfword by shifting, lower one by masking to 8 bits. */
63 mov J1, J0, lsr #16
64 and J0, J0, #255 @ halfword value will be < n4
65 vmls.f s8, s4, s20 @ vector operation
66 mov J3, J2, lsr #16
67 and J2, J2, #255 @ halfword value will be < n4
/* Turn each index into a byte address: 8 bytes per output pair. */
68 add J0, OUT, J0, lsl #3
69 vmla.f s12, s4, s16 @ vector operation
70 add J1, OUT, J1, lsl #3
71 add J2, OUT, J2, lsl #3
72 add J3, OUT, J3, lsl #3
/* Scatter the results: s8-s11 to word 0, s12-s15 to word 1 of each slot. */
73 vstr s8, [J0]
74 vstr s9, [J1]
75 vstr s10, [J2]
76 vstr s11, [J3]
77 vstr s12, [J0, #4]
78 vstr s13, [J1, #4]
79 vstr s14, [J2, #4]
80 vstr s15, [J3, #4]
81 .set k, k + 2
82.endm
83
/*
 * prerotation_innerloop_rolled - loop-body form of the pre-rotation step.
 *
 * Same arithmetic and store pattern as prerotation_innerloop, but all
 * addressing is done through run-time pointers that this macro updates,
 * so it can be placed inside a loop:
 *   - TCOS, TSIN and REVTAB walk upwards (post-increment: 8, 8 and 2+2 bytes);
 *   - TCOS_HI, TSIN_HI and REVTAB_HI walk downwards (pre-decrement: 8, 8
 *     and 2+2 bytes);
 *   - IN is advanced by 16 bytes and IN_HI is moved back by 16 bytes.
 *
 * Results (assuming the length-4 short-vector FPSCR mode set by the caller):
 *     s8..s11  = s0..s3 * s16..s19 - s4..s7 * s20..s23
 *     s12..s15 = s0..s3 * s20..s23 + s4..s7 * s16..s19
 * stored as word pairs at OUT + 8 * (REVTAB entry).
 *
 * Clobbers J0-J3 (a2, a4, ip, lr) and s0-s23. REVTAB_HI shares v5 with
 * OLDFPSCR, so the caller must have saved OLDFPSCR elsewhere.
 */
84.macro prerotation_innerloop_rolled
85 vldmia TCOS!, {s16,s17}
86 vldmdb TCOS_HI!, {s18,s19}
87 vldr s0, [IN_HI, #-4]
88 vldr s1, [IN_HI, #-12]
89 vldr s2, [IN, #12]
90 vldr s3, [IN, #4]
91 vmul.f s8, s0, s16 @ vector operation
92 vldmia TSIN!, {s20,s21}
93 vldmdb TSIN_HI!, {s22,s23}
94 vldr s4, [IN]
95 vldr s5, [IN, #8]
96 vldr s6, [IN_HI, #-16]
97 vldr s7, [IN_HI, #-8]
98 vmul.f s12, s0, s20 @ vector operation
/* All eight input words are loaded; step both input pointers inwards. */
99 add IN, IN, #16
100 sub IN_HI, IN_HI, #16
/* Low-end indices are read in ascending order ... */
101 ldrh J0, [REVTAB], #2
102 ldrh J1, [REVTAB], #2
103 vmls.f s8, s4, s20 @ vector operation
/* ... high-end ones in descending order, hence J3 is fetched before J2. */
104 ldrh J3, [REVTAB_HI, #-2]!
105 ldrh J2, [REVTAB_HI, #-2]!
106 add J0, OUT, J0, lsl #3
107 vmla.f s12, s4, s16 @ vector operation
108 add J1, OUT, J1, lsl #3
109 add J2, OUT, J2, lsl #3
110 add J3, OUT, J3, lsl #3
111 vstr s8, [J0]
112 vstr s9, [J1]
113 vstr s10, [J2]
114 vstr s11, [J3]
115 vstr s12, [J0, #4]
116 vstr s13, [J1, #4]
117 vstr s14, [J2, #4]
118 vstr s15, [J3, #4]
119.endm
120
/*
 * postrotation_innerloop tail, head - software-pipelined, fully unrolled
 * post-rotation step operating in place on OUT.
 *
 * Arguments (each either empty or any non-empty token):
 *   tail - when non-empty, emit the second half of the PREVIOUS step:
 *          the vmls/vmla that finish the products, and the eight stores.
 *   head - when non-empty, emit the first half of the CURRENT step:
 *          the TSIN/TCOS loads, the eight loads from OUT, the two vmul,
 *          and finally k = k + 2.
 * Passing both interleaves the two halves. The invoking code is expected
 * to start with a head-only expansion and end with a tail-only one.
 *
 * Reads the assembly-time symbols k and n8. The head works on indices
 * n8-k-2 (low) and n8+k (high); the tail uses the same formulas with k-2.
 *
 * Register use (vector lanes assume the length-4 short-vector FPSCR mode):
 *   head: s16-s19 = TSIN values, s0-s7 = words from OUT,
 *         s8..s11 = s4..s7 * s16..s19,  s12..s15 = s0..s3 * s16..s19
 *   tail: s8..s11  -= s0..s3 * TCOS,    s12..s15 += s4..s7 * TCOS
 * The TCOS bank alternates between d10/d11 (s20-s23) and d12/d13 (s24-s27)
 * according to bit 1 of k, so a head can load the next TCOS values while
 * the bank loaded by the previous head is still being consumed by the tail.
 *
 * Ordering constraint: the tail's vmls/vmla read s0-s7 BEFORE the head
 * reloads them, and s8-s15 are stored BEFORE the head's vmul overwrite
 * them. Do not reorder the instructions.
 */
121.macro postrotation_innerloop tail, head
122 .set trig_lo_head, n8 - k - 2
123 .set trig_hi_head, n8 + k
124 .set out_lo_head, trig_lo_head * 2
125 .set out_hi_head, trig_hi_head * 2
126 .set trig_lo_tail, n8 - (k - 2) - 2
127 .set trig_hi_tail, n8 + (k - 2)
128 .set out_lo_tail, trig_lo_tail * 2
129 .set out_hi_tail, trig_hi_tail * 2
/* Pick the TCOS bank for this head; the tail uses the opposite bank. */
130 .if (k & 2) == 0
131 TCOS_D0_HEAD .req d10 @ s20,s21
132 TCOS_D1_HEAD .req d11 @ s22,s23
133 TCOS_S0_TAIL .req s24
134 .else
135 TCOS_D0_HEAD .req d12 @ s24,s25
136 TCOS_D1_HEAD .req d13 @ s26,s27
137 TCOS_S0_TAIL .req s20
138 .endif
139 .ifnc "\tail",""
140 vmls.f s8, s0, TCOS_S0_TAIL @ vector operation
141 .endif
142 .ifnc "\head",""
143 vldr d8, [TSIN, #trig_lo_head*4] @ s16,s17
144 vldr d9, [TSIN, #trig_hi_head*4] @ s18,s19
145 vldr TCOS_D0_HEAD, [TCOS, #trig_lo_head*4]
146 .endif
147 .ifnc "\tail",""
148 vmla.f s12, s4, TCOS_S0_TAIL @ vector operation
149 .endif
/* From here on s0-s7 belong to the new step. */
150 .ifnc "\head",""
151 vldr s0, [OUT, #out_lo_head*4]
152 vldr s1, [OUT, #out_lo_head*4 + 8]
153 vldr s2, [OUT, #out_hi_head*4]
154 vldr s3, [OUT, #out_hi_head*4 + 8]
155 vldr s4, [OUT, #out_lo_head*4 + 4]
156 vldr s5, [OUT, #out_lo_head*4 + 12]
157 vldr s6, [OUT, #out_hi_head*4 + 4]
158 vldr s7, [OUT, #out_hi_head*4 + 12]
159 .endif
/* Write back s8-s11 before the head's first vmul reuses them. */
160 .ifnc "\tail",""
161 vstr s8, [OUT, #out_lo_tail*4]
162 vstr s9, [OUT, #out_lo_tail*4 + 8]
163 vstr s10, [OUT, #out_hi_tail*4]
164 vstr s11, [OUT, #out_hi_tail*4 + 8]
165 .endif
166 .ifnc "\head",""
167 vmul.f s8, s4, s16 @ vector operation
168 .endif
/* s12-s15 go to the odd word slots, with low/high halves crossed over. */
169 .ifnc "\tail",""
170 vstr s12, [OUT, #out_hi_tail*4 + 12]
171 vstr s13, [OUT, #out_hi_tail*4 + 4]
172 vstr s14, [OUT, #out_lo_tail*4 + 12]
173 vstr s15, [OUT, #out_lo_tail*4 + 4]
174 .endif
175 .ifnc "\head",""
176 vmul.f s12, s0, s16 @ vector operation
177 vldr TCOS_D1_HEAD, [TCOS, #trig_hi_head*4]
178 .endif
/* Release the per-expansion aliases so the next expansion can redefine them. */
179 .unreq TCOS_D0_HEAD
180 .unreq TCOS_D1_HEAD
181 .unreq TCOS_S0_TAIL
182 .ifnc "\head",""
183 .set k, k + 2
184 .endif
185.endm
186
/*
 * postrotation_innerloop_rolled - loop-body form of the software-pipelined
 * post-rotation step; operates in place through OUT (ascending side) and
 * OUT_HI (descending side).
 *
 * Arguments:
 *   tail             - non-empty: emit the finishing half of the previous
 *                      step (vmls/vmla plus eight stores).
 *   head             - non-empty: emit the starting half of the current
 *                      step (table loads, eight loads, two vmul).
 *   tcos_s0_head,
 *   tcos_s1_head     - register pair filled from TCOS (post-increment).
 *   tcos_s2_head,
 *   tcos_s3_head     - register pair filled from TCOS_HI (pre-decrement).
 *   tcos_s0_tail     - first register of the TCOS bank filled by the
 *                      previous head; used by the tail's vmls/vmla.
 *   out_offset_head  - byte offset applied to the head's loads
 *                      (added for OUT, subtracted for OUT_HI).
 *   out_offset_tail  - byte offset applied to the tail's stores, same sign
 *                      convention.
 * Arguments belonging to an omitted half may be left empty.
 *
 * Side effects of a head: TSIN and TCOS advance by 8 bytes, TSIN_HI and
 * TCOS_HI move back by 8 bytes. OUT and OUT_HI are NOT modified here.
 *
 * Ordering constraint: the tail reads s0-s7 and stores s8-s15 before the
 * interleaved head overwrites them; do not reorder the instructions.
 */
187.macro postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head, tcos_s2_head, tcos_s3_head, tcos_s0_tail, out_offset_head, out_offset_tail
188 .ifnc "\tail",""
189 vmls.f s8, s0, \tcos_s0_tail @ vector operation
190 .endif
191 .ifnc "\head",""
192 vldmia TSIN!, {s16,s17}
193 vldmdb TSIN_HI!, {s18,s19}
194 vldmia TCOS!, {\tcos_s0_head,\tcos_s1_head}
195 .endif
196 .ifnc "\tail",""
197 vmla.f s12, s4, \tcos_s0_tail @ vector operation
198 .endif
/* From here on s0-s7 belong to the new step. */
199 .ifnc "\head",""
200 vldr s0, [OUT, #+\out_offset_head+0]
201 vldr s1, [OUT, #+\out_offset_head+8]
202 vldr s2, [OUT_HI, #-\out_offset_head-16]
203 vldr s3, [OUT_HI, #-\out_offset_head-8]
204 vldr s4, [OUT, #+\out_offset_head+4]
205 vldr s5, [OUT, #+\out_offset_head+12]
206 vldr s6, [OUT_HI, #-\out_offset_head-12]
207 vldr s7, [OUT_HI, #-\out_offset_head-4]
208 .endif
/* Write back s8-s11 before the head's first vmul reuses them. */
209 .ifnc "\tail",""
210 vstr s8, [OUT, #+\out_offset_tail+0]
211 vstr s9, [OUT, #+\out_offset_tail+8]
212 vstr s10, [OUT_HI, #-\out_offset_tail-16]
213 vstr s11, [OUT_HI, #-\out_offset_tail-8]
214 .endif
215 .ifnc "\head",""
216 vmul.f s8, s4, s16 @ vector operation
217 .endif
/* s12-s15 go to the odd word slots, with the OUT/OUT_HI sides crossed over. */
218 .ifnc "\tail",""
219 vstr s12, [OUT_HI, #-\out_offset_tail-4]
220 vstr s13, [OUT_HI, #-\out_offset_tail-12]
221 vstr s14, [OUT, #+\out_offset_tail+12]
222 vstr s15, [OUT, #+\out_offset_tail+4]
223 .endif
224 .ifnc "\head",""
225 vmul.f s12, s0, s16 @ vector operation
226 vldmdb TCOS_HI!, {\tcos_s2_head,\tcos_s3_head}
227 .endif
228.endm
229
230
231/* void ff_imdct_half_vfp(FFTContext *s,
232 * FFTSample *output,
233 * const FFTSample *input)
234 */
/*
 * Structure of the routine:
 *   1. pre-rotation:  input -> output, scattered through the REVTAB table;
 *   2. an in-place FFT call on output;
 *   3. post-rotation: in place on output.
 *
 * On entry a1 = s (CONTEXT), a2 = output (ORIGOUT), a3 = input (IN).
 * Fields are read from the context at fixed word offsets: 2 (REVTAB),
 * 5 (mdct_bits), 6 (TCOS), 7 (TSIN) and 9 (fft_calc function pointer).
 *
 * Two code paths:
 *   - mdct_bits == 6: fully unrolled macros with n = 64, calling
 *     ff_fft16_vfp directly;
 *   - otherwise (label 10): looping "rolled" macros, calling the function
 *     pointer stored in the context.
 *
 * FPSCR is switched to 0x03030000 (per the comments: RunFast mode, short
 * vectors of length 4, stride 1) around both rotation stages, and put back
 * to its entry value before the FFT call and before returning.
 * s16-s27 and the core registers used are saved and restored on the stack.
 */
235function ff_imdct_half_vfp, export=1
236 ldr ip, [CONTEXT, #5*4] @ mdct_bits
237 teq ip, #6
238 bne 10f
239
/* ---- Unrolled path: assembly-time sizes for mdct_bits == 6 ---- */
240 .set n, 1<<6
241 .set n2, n/2
242 .set n4, n/4
243 .set n8, n/8
244
245 push {v1-v5,lr}
246 vpush {s16-s27}
247 fmrx OLDFPSCR, FPSCR
248 ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
249 fmxr FPSCR, lr
/* Copy the output pointer out of a2: the pre-rotation macro reuses a2 as J0. */
250 mov OUT, ORIGOUT
251 ldr REVTAB, [CONTEXT, #2*4]
252 ldr TCOS, [CONTEXT, #6*4]
253 ldr TSIN, [CONTEXT, #7*4]
254
/* Each expansion advances k by 2, so n8/2 expansions cover k = 0 .. n8-2. */
255 .set k, 0
256 .rept n8/2
257 prerotation_innerloop
258 .endr
259
/* Restore the caller's FPSCR for the FFT, then re-enter short-vector mode. */
260 fmxr FPSCR, OLDFPSCR
261 mov a1, OUT
262 bl X(ff_fft16_vfp)
263 ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
264 fmxr FPSCR, lr
265
/* Pipelined post-rotation: head-only prologue, n8/2-1 combined steps, tail-only epilogue. */
266 .set k, 0
267 postrotation_innerloop , head
268 .rept n8/2 - 1
269 postrotation_innerloop tail, head
270 .endr
271 postrotation_innerloop tail
272
273 fmxr FPSCR, OLDFPSCR
274 vpop {s16-s27}
275 pop {v1-v5,pc}
276
/* ---- Rolled path: any other mdct_bits; ip still holds mdct_bits here ---- */
27710:
278 push {v1-v6,sl,fp,lr}
279 vpush {s16-s27}
280 fmrx OLDFPSCR, FPSCR
281 ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
282 fmxr FPSCR, lr
283 mov lr, #1
284 mov OUT, ORIGOUT
285 ldr REVTAB, [CONTEXT, #2*4]
286 ldr TCOS, [CONTEXT, #6*4]
287 ldr TSIN, [CONTEXT, #7*4]
/* lr = 1 << mdct_bits */
288 mov lr, lr, lsl ip
289
/*
 * OLDFPSCR must go to the stack because REVTAB_HI reuses its register (v5).
 * CONTEXT is saved alongside it so both can be reloaded together later.
 * NOTE(review): with 9 core registers, 12 single-precision registers and
 * these 2 words pushed, 92 bytes are on the stack at the indirect call
 * below, so sp is not a multiple of 8 relative to its entry value -
 * confirm that every fft_calc implementation tolerates this.
 */
290 push {CONTEXT,OLDFPSCR}
/* End pointers, in bytes: IN + 2*lr, REVTAB + lr/2, TCOS + lr, TSIN + lr. */
291 add IN_HI, IN, lr, lsl #1
292 add REVTAB_HI, REVTAB, lr, lsr #1
293 add TCOS_HI, TCOS, lr
294 add TSIN_HI, TSIN, lr
/* Each pass moves IN up and IN_HI down by 16 bytes; stop when they meet. */
2950: prerotation_innerloop_rolled
296 teq IN, IN_HI
297 bne 0b
/* Reload without popping: the saved pair is needed again after the call. */
298 ldmia sp, {CONTEXT,OLDFPSCR}
299
/* Arguments for the call: a1 = context (just reloaded), a2 = output. */
300 mov ORIGOUT, OUT
301 fmxr FPSCR, OLDFPSCR
302 ldr ip, [CONTEXT, #9*4]
303 blx ip @ s->fft_calc(s, output)
304
305 pop {CONTEXT,OLDFPSCR}
306 ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
307 ldr ip, [CONTEXT, #5*4] @ mdct_bits
308 fmxr FPSCR, lr
309 mov lr, #1
310 mov lr, lr, lsl ip
/*
 * Rewind TCOS/TSIN by lr/2 bytes. That equals the distance the
 * pre-rotation loop advanced them: 8 bytes per pass, with IN and IN_HI
 * closing a gap of 2*lr bytes at 32 bytes per pass.
 */
311 sub TCOS, TCOS, lr, lsr #1
312 sub TSIN, TSIN, lr, lsr #1
/* v6 is now free again (IN_HI is finished) and becomes OUT_HI = OUT + 2*lr bytes. */
313 add OUT_HI, OUT, lr, lsl #1
314 add TCOS_HI, TCOS, lr
315 add TSIN_HI, TSIN, lr
/*
 * Pipelined post-rotation loop, two steps per pass so that the two TCOS
 * register banks (s20-s23 and s24-s27) alternate:
 *   prologue : head only, bank s20-s23, then jump into the middle (1:)
 *   0:       : move OUT/OUT_HI inwards by 32 bytes, tail (bank s24) +
 *              head (bank s20-s23); the tail offset of -16 compensates
 *              for the pointers having just moved
 *   1:       : tail (bank s20) + head (bank s24-s27, offset 16)
 *   epilogue : tail only, bank s24, offset 16
 * The loop ends when the ascending TSIN pointer meets TSIN_HI.
 */
316 postrotation_innerloop_rolled , head, s20, s21, s22, s23,, 0
317 b 1f
3180: add OUT, OUT, #32
319 sub OUT_HI, OUT_HI, #32
320 postrotation_innerloop_rolled tail, head, s20, s21, s22, s23, s24, 0, -16
3211: postrotation_innerloop_rolled tail, head, s24, s25, s26, s27, s20, 16, 0
322 teq TSIN, TSIN_HI
323 bne 0b
324 postrotation_innerloop_rolled tail,,,,,, s24,, 16
325
/* Restore the caller's FPSCR and saved registers, returning via pc. */
326 fmxr FPSCR, OLDFPSCR
327 vpop {s16-s27}
328 pop {v1-v6,sl,fp,pc}
329endfunc
330
/*
 * Release every register alias declared with .req at the top of this
 * file, in the same order, so the names are free to be defined again
 * by any code assembled after this point.
 */
331 .unreq CONTEXT
332 .unreq ORIGOUT
333 .unreq IN
334 .unreq OUT
335 .unreq REVTAB
336 .unreq TCOS
337 .unreq TSIN
338 .unreq OLDFPSCR
339 .unreq J0
340 .unreq J1
341 .unreq J2
342 .unreq J3
343 .unreq REVTAB_HI
344 .unreq IN_HI
345 .unreq OUT_HI
346 .unreq TCOS_HI
347 .unreq TSIN_HI