/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

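@ VFP short-vector mode is used throughout (vector length 4). In that mode
@ s0-s7 always behave as scalars, so the input samples live there, while
@ s8-s15, s16-s23 and s24-s31 act as wrapping 4-element vector banks
@ (coefficients, accumulators and combined results respectively).
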
POUT     .req a1
PIN      .req a2
PCOEF    .req a3
OLDFPSCR .req a4
COUNTER  .req ip

IN0      .req s4
IN1      .req s5
IN2      .req s6
IN3      .req s7
IN4      .req s0
IN5      .req s1
IN6      .req s2
IN7      .req s3
COEF0    .req s8    @ coefficient elements
COEF1    .req s9
COEF2    .req s10
COEF3    .req s11
COEF4    .req s12
COEF5    .req s13
COEF6    .req s14
COEF7    .req s15
ACCUM0   .req s16   @ double-buffered multiply-accumulate results
ACCUM4   .req s20
POST0    .req s24   @ the two accumulator banks are summed into this vector,
POST1    .req s25   @ overlapping the next block's long-latency multiplies
POST2    .req s26
POST3    .req s27


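@ inner_loop - one stage of the software-pipelined FIR:
@   decifactor  32 or 64; selects 8 or 4 taps (JMAX) per output sample
@   dir         "up" walks the coefficient matrix forwards; "down" walks it
@               backwards to produce the mirrored second half of the outputs
@   tail        if non-empty, combine and store the previous stage's
@               accumulators
@   head        if non-empty, load coefficients and start this stage's
@               multiply-accumulates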
.macro inner_loop  decifactor, dir, tail, head
 .ifc "\dir","up"
  .set X, 0
  .set Y, 4
 .else
  .set X, 4*JMAX*4 - 4
  .set Y, -4
 .endif
 .ifnc "\head",""
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
 .endif
 .ifnc "\tail",""
        vadd.f  POST0, ACCUM0, ACCUM4   @ vector operation
 .endif
 .ifnc "\head",""
        vmul.f  ACCUM0, COEF0, IN0      @ vector = vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
 .endif
 .ifnc "\head",""
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
        @ the IN1 multiply is issued early when there is no tail to overlap
        @ with, and two loads later otherwise - a scheduling choice only
 .ifc "\tail",""
        vmul.f  ACCUM4, COEF4, IN1      @ vector = vector * scalar
 .endif
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
 .ifnc "\tail",""
        vmul.f  ACCUM4, COEF4, IN1      @ vector = vector * scalar
 .endif
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
 .endif
 .ifnc "\tail",""
        vstmia  POUT!, {POST0-POST3}
 .endif
 .ifnc "\head",""
        vmla.f  ACCUM0, COEF0, IN2      @ vector += vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
        vmla.f  ACCUM4, COEF4, IN3      @ vector += vector * scalar
 .if \decifactor == 32
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
        vmla.f  ACCUM0, COEF0, IN4      @ vector += vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
        vmla.f  ACCUM4, COEF4, IN5      @ vector += vector * scalar
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
        vmla.f  ACCUM0, COEF0, IN6      @ vector += vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
        vmla.f  ACCUM4, COEF4, IN7      @ vector += vector * scalar
 .endif
 .endif
.endm

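/* void ff_dca_lfe_fir64_vfp(float *out, const float *in, const float *coefs)
 * void ff_dca_lfe_fir32_vfp(float *out, const float *in, const float *coefs)
 *
 * Prototype assumed from the register usage below. With
 * nc = 256/decifactor taps per output, each call schematically computes
 *
 *     for (k = 0; k < decifactor; k++) {
 *         float v0 = 0, v1 = 0;
 *         for (j = 0; j < nc; j++) {
 *             v0 += coefs[k*nc + j]         * in[-j];
 *             v1 += coefs[255 - (k*nc + j)] * in[-j];
 *         }
 *         out[k]              = v0;    // "up" pass
 *         out[decifactor + k] = v1;    // "down" pass
 *     }
 */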
.macro dca_lfe_fir decifactor
function ff_dca_lfe_fir\decifactor\()_vfp, export=1
        fmrx    OLDFPSCR, FPSCR
        ldr     ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, ip
        vldr    IN0, [PIN, #-0*4]
        vldr    IN1, [PIN, #-1*4]
        vldr    IN2, [PIN, #-2*4]
        vldr    IN3, [PIN, #-3*4]
 .if \decifactor == 32
  .set JMAX, 8
        vpush   {s16-s31}
        vldr    IN4, [PIN, #-4*4]
        vldr    IN5, [PIN, #-5*4]
        vldr    IN6, [PIN, #-6*4]
        vldr    IN7, [PIN, #-7*4]
 .else
  .set JMAX, 4
        vpush   {s16-s27}
 .endif

        mov     COUNTER, #\decifactor/4 - 1
        inner_loop  \decifactor, up,, head          @ prime the pipeline
1:      add     PCOEF, PCOEF, #4*JMAX*4
        subs    COUNTER, COUNTER, #1
        inner_loop  \decifactor, up, tail, head     @ VFP ops preserve the flags
        bne     1b
        inner_loop  \decifactor, up, tail           @ drain the pipeline

        mov     COUNTER, #\decifactor/4 - 1
        inner_loop  \decifactor, down,, head
1:      sub     PCOEF, PCOEF, #4*JMAX*4
        subs    COUNTER, COUNTER, #1
        inner_loop  \decifactor, down, tail, head
        bne     1b
        inner_loop  \decifactor, down, tail

 .if \decifactor == 32
        vpop    {s16-s31}
 .else
        vpop    {s16-s27}
 .endif
        fmxr    FPSCR, OLDFPSCR
        bx      lr
endfunc
.endm

        dca_lfe_fir  64
        .ltorg          @ keep the preceding function's literal pool in range
        dca_lfe_fir  32

        .unreq  POUT
        .unreq  PIN
        .unreq  PCOEF
        .unreq  OLDFPSCR
        .unreq  COUNTER

        .unreq  IN0
        .unreq  IN1
        .unreq  IN2
        .unreq  IN3
        .unreq  IN4
        .unreq  IN5
        .unreq  IN6
        .unreq  IN7
        .unreq  COEF0
        .unreq  COEF1
        .unreq  COEF2
        .unreq  COEF3
        .unreq  COEF4
        .unreq  COEF5
        .unreq  COEF6
        .unreq  COEF7
        .unreq  ACCUM0
        .unreq  ACCUM4
        .unreq  POST0
        .unreq  POST1
        .unreq  POST2
        .unreq  POST3


IN       .req a1
SBACT    .req a2
OLDFPSCR .req a3
IMDCT    .req a4
WINDOW   .req v1
OUT      .req v2
BUF      .req v3
SCALEINT .req v4    @ only used in softfp case
COUNT    .req v5

SCALE    .req s0    @ only used in hardfp case

/* Stack layout differs in softfp and hardfp cases:
 *
 * hardfp
 *      fp -> 6 arg words saved by caller
 *            a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
 *            s16-s23 on entry
 *            align 16
 *     buf -> 8*32*4 bytes buffer
 *            s0 on entry
 *      sp -> 3 arg words for callee
 *
 * softfp
 *      fp -> 7 arg words saved by caller
 *            a4,v1-v5,fp,lr on entry
 *            s16-s23 on entry
 *            align 16
 *     buf -> 8*32*4 bytes buffer
 *      sp -> 4 arg words for callee
 */
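
/* With the layout above, the frame-relative offsets used in the body are:
 * imdct at [fp, #-7*4] (hardfp) or [fp, #-8*4] (softfp);
 * synth_buf_ptr, synth_buf_offset, synth_buf2 at [fp, #0] upwards;
 * window at [fp, #3*4], samples_out at [fp, #4*4], raXin at [fp, #5*4];
 * and, in the softfp case only, the scale bits at [fp, #6*4].
 */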

/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
 *                                 SynthFilterContext *synth, FFTContext *imdct,
 *                                 float (*synth_buf_ptr)[512],
 *                                 int *synth_buf_offset, float (*synth_buf2)[32],
 *                                 const float (*window)[512], float *samples_out,
 *                                 float (*raXin)[32], float scale);
 */
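/* The body first transposes the active subbands into a stack buffer,
 * flipping the sign of subbands 0 and 3 in every group of 4 and zeroing
 * the inactive remainder - schematically:
 *
 *     for (j = 0; j < 8; j++)
 *         for (i = 0; i < 32; i++)
 *             buf[j][i] = i >= sb_act ? 0 :
 *                         ((i - 1) & 2) ? -samples_in[i][j]
 *                                       :  samples_in[i][j];
 *
 * and then calls ff_synth_filter_float_vfp() once per row buf[j].
 */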
function ff_dca_qmf_32_subbands_vfp, export=1
VFP     push    {a3-a4,v1-v3,v5,fp,lr}
NOVFP   push    {a4,v1-v5,fp,lr}
        add     fp, sp, #8*4
        vpush   {s16-s23}
        @ The buffer pointed at by raXin isn't big enough for us to do a
        @ complete matrix transposition as we want to, so allocate an
        @ alternative buffer from the stack. Align to 4 words for speed.
        sub     BUF, sp, #8*32*4
        bic     BUF, BUF, #15
        mov     sp, BUF
        ldr     lr, =0x03330000         @ RunFast mode, short vectors of length 4, stride 2
        fmrx    OLDFPSCR, FPSCR
        fmxr    FPSCR, lr
        @ COUNT is used to count down 2 things at once:
        @ bits 0-4 are the number of word pairs remaining in the output row
        @ bits 5-31 are the number of words to copy (with possible negation)
        @ from the source matrix before we start zeroing the remainder
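        @ Worked example: sb_act = 10 gives COUNT = (6 << 5) + 16; the
        @ main loop below then runs twice (transposing subbands 0-7), the
        @ trailing code at 3: copies subbands 8-9, and 11 word pairs per
        @ row are finally zero-filled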
        mov     COUNT, #(-4 << 5) + 16
        adds    COUNT, COUNT, SBACT, lsl #5
        bmi     2f
1:
        vldr    s8, [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8                  @ stride-2 vector op: negates s8,s10,s12,s14
        vldr    s9, [IN, #(1*8+0)*4]
        vldr    s11, [IN, #(1*8+1)*4]
        vldr    s13, [IN, #(1*8+2)*4]
        vldr    s15, [IN, #(1*8+3)*4]
        vneg.f  s16, s16                @ negates s16,s18,s20,s22
        vldr    s17, [IN, #(1*8+4)*4]
        vldr    s19, [IN, #(1*8+5)*4]
        vldr    s21, [IN, #(1*8+6)*4]
        vldr    s23, [IN, #(1*8+7)*4]
        vstr    d4, [BUF, #(0*32+0)*4]  @ each dN holds one sample from 2 subbands
        vstr    d5, [BUF, #(1*32+0)*4]
        vstr    d6, [BUF, #(2*32+0)*4]
        vstr    d7, [BUF, #(3*32+0)*4]
        vstr    d8, [BUF, #(4*32+0)*4]
        vstr    d9, [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        vldr    s9, [IN, #(3*8+0)*4]
        vldr    s11, [IN, #(3*8+1)*4]
        vldr    s13, [IN, #(3*8+2)*4]
        vldr    s15, [IN, #(3*8+3)*4]
        vldr    s17, [IN, #(3*8+4)*4]
        vldr    s19, [IN, #(3*8+5)*4]
        vldr    s21, [IN, #(3*8+6)*4]
        vldr    s23, [IN, #(3*8+7)*4]
        vneg.f  s9, s9                  @ negates s9,s11,s13,s15
        vldr    s8, [IN, #(2*8+0)*4]
        vldr    s10, [IN, #(2*8+1)*4]
        vldr    s12, [IN, #(2*8+2)*4]
        vldr    s14, [IN, #(2*8+3)*4]
        vneg.f  s17, s17                @ negates s17,s19,s21,s23
        vldr    s16, [IN, #(2*8+4)*4]
        vldr    s18, [IN, #(2*8+5)*4]
        vldr    s20, [IN, #(2*8+6)*4]
        vldr    s22, [IN, #(2*8+7)*4]
        vstr    d4, [BUF, #(0*32+2)*4]
        vstr    d5, [BUF, #(1*32+2)*4]
        vstr    d6, [BUF, #(2*32+2)*4]
        vstr    d7, [BUF, #(3*32+2)*4]
        vstr    d8, [BUF, #(4*32+2)*4]
        vstr    d9, [BUF, #(5*32+2)*4]
        vstr    d10, [BUF, #(6*32+2)*4]
        vstr    d11, [BUF, #(7*32+2)*4]
        add     IN, IN, #4*8*4          @ next 4 subbands
        add     BUF, BUF, #4*4          @ next 4 columns of the output rows
        subs    COUNT, COUNT, #(4 << 5) + 2
        bpl     1b
2:      @ Now deal with trailing < 4 subbands
        adds    COUNT, COUNT, #3 << 5
        bmi     4f                      @ sb_act was a multiple of 4
        bics    lr, COUNT, #0x1F
        bne     3f
        @ sb_act was n*4+1
        vldr    s8, [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8
        vldr    s9, zero
        vldr    s11, zero
        vldr    s13, zero
        vldr    s15, zero
        vneg.f  s16, s16
        vldr    s17, zero
        vldr    s19, zero
        vldr    s21, zero
        vldr    s23, zero
        vstr    d4, [BUF, #(0*32+0)*4]
        vstr    d5, [BUF, #(1*32+0)*4]
        vstr    d6, [BUF, #(2*32+0)*4]
        vstr    d7, [BUF, #(3*32+0)*4]
        vstr    d8, [BUF, #(4*32+0)*4]
        vstr    d9, [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #1
        b       4f
3:      @ sb_act was n*4+2 or n*4+3, so do the first 2
        vldr    s8, [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8
        vldr    s9, [IN, #(1*8+0)*4]
        vldr    s11, [IN, #(1*8+1)*4]
        vldr    s13, [IN, #(1*8+2)*4]
        vldr    s15, [IN, #(1*8+3)*4]
        vneg.f  s16, s16
        vldr    s17, [IN, #(1*8+4)*4]
        vldr    s19, [IN, #(1*8+5)*4]
        vldr    s21, [IN, #(1*8+6)*4]
        vldr    s23, [IN, #(1*8+7)*4]
        vstr    d4, [BUF, #(0*32+0)*4]
        vstr    d5, [BUF, #(1*32+0)*4]
        vstr    d6, [BUF, #(2*32+0)*4]
        vstr    d7, [BUF, #(3*32+0)*4]
        vstr    d8, [BUF, #(4*32+0)*4]
        vstr    d9, [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #(2 << 5) + 1
        bics    lr, COUNT, #0x1F
        bne     4f
        @ sb_act was n*4+3
        vldr    s8, [IN, #(2*8+0)*4]
        vldr    s10, [IN, #(2*8+1)*4]
        vldr    s12, [IN, #(2*8+2)*4]
        vldr    s14, [IN, #(2*8+3)*4]
        vldr    s16, [IN, #(2*8+4)*4]
        vldr    s18, [IN, #(2*8+5)*4]
        vldr    s20, [IN, #(2*8+6)*4]
        vldr    s22, [IN, #(2*8+7)*4]
        vldr    s9, zero
        vldr    s11, zero
        vldr    s13, zero
        vldr    s15, zero
        vldr    s17, zero
        vldr    s19, zero
        vldr    s21, zero
        vldr    s23, zero
        vstr    d4, [BUF, #(0*32+0)*4]
        vstr    d5, [BUF, #(1*32+0)*4]
        vstr    d6, [BUF, #(2*32+0)*4]
        vstr    d7, [BUF, #(3*32+0)*4]
        vstr    d8, [BUF, #(4*32+0)*4]
        vstr    d9, [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #1
4:      @ Now fill the remainder with 0
        vldr    s8, zero
        vldr    s9, zero
        ands    COUNT, COUNT, #0x1F     @ word pairs left in the output row
        beq     6f
5:      vstr    d4, [BUF, #(0*32+0)*4]
        vstr    d4, [BUF, #(1*32+0)*4]
        vstr    d4, [BUF, #(2*32+0)*4]
        vstr    d4, [BUF, #(3*32+0)*4]
        vstr    d4, [BUF, #(4*32+0)*4]
        vstr    d4, [BUF, #(5*32+0)*4]
        vstr    d4, [BUF, #(6*32+0)*4]
        vstr    d4, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        subs    COUNT, COUNT, #1
        bne     5b
6:
        fmxr    FPSCR, OLDFPSCR         @ leave vector mode before calling out
        ldr     WINDOW, [fp, #3*4]
        ldr     OUT, [fp, #4*4]
        sub     BUF, BUF, #32*4         @ back to row 0 of the stack buffer
NOVFP   ldr     SCALEINT, [fp, #6*4]
        mov     COUNT, #8
VFP     vpush   {SCALE}
VFP     sub     sp, sp, #3*4
NOVFP   sub     sp, sp, #4*4
7:
VFP     ldr     a1, [fp, #-7*4]         @ imdct
NOVFP   ldr     a1, [fp, #-8*4]
        ldmia   fp, {a2-a4}             @ synth_buf_ptr, synth_buf_offset, synth_buf2
VFP     stmia   sp, {WINDOW, OUT, BUF}  @ stacked args: window, out, in (our buffer row)
NOVFP   stmia   sp, {WINDOW, OUT, BUF, SCALEINT}
VFP     vldr    SCALE, [sp, #3*4]       @ reload: the callee may clobber s0
        bl      X(ff_synth_filter_float_vfp)
        add     OUT, OUT, #32*4
        add     BUF, BUF, #32*4
        subs    COUNT, COUNT, #1
        bne     7b

A       sub     sp, fp, #(8+8)*4        @ point sp at the saved s16-s23
T       sub     fp, fp, #(8+8)*4        @ (Thumb cannot subtract from fp into
T       mov     sp, fp                  @  sp in one instruction)
        vpop    {s16-s23}
VFP     pop     {a3-a4,v1-v3,v5,fp,pc}
NOVFP   pop     {a4,v1-v5,fp,pc}
endfunc

        .unreq  IN
        .unreq  SBACT
        .unreq  OLDFPSCR
        .unreq  IMDCT
        .unreq  WINDOW
        .unreq  OUT
        .unreq  BUF
        .unreq  SCALEINT
        .unreq  COUNT

        .unreq  SCALE

        .align  2
zero:   .word   0