/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
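
/* Core register allocation. r0-r2 are deliberately aliased twice: the
 * incoming argument registers (IMDCT, ORIG_P_SB, P_SB_OFF) are consumed
 * before the filter loops begin, and are then reused as loop state
 * (I, P_SB2_UP, OLDFPSCR). */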
IMDCT           .req    r0
ORIG_P_SB       .req    r1
P_SB_OFF        .req    r2
I               .req    r0
P_SB2_UP        .req    r1
OLDFPSCR        .req    r2
P_SB2_DN        .req    r3
P_WIN_DN        .req    r4
P_OUT_DN        .req    r5
P_SB            .req    r6
J_WRAP          .req    r7
P_WIN_UP        .req    r12
P_OUT_UP        .req    r14
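
/* VFP register banks. In short-vector mode an arithmetic instruction whose
 * destination lies in s0-s7 executes as a scalar, which is why SCALE is kept
 * in s0. For a vector instruction, an Fm operand taken from s0-s7 is also
 * read as a scalar, but an Fn operand is not, which is why SBUF_DAT_REV can
 * live in s4-s7 and still behave as a 4-element vector. */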
SCALE           .req    s0
SBUF_DAT_REV0   .req    s4
SBUF_DAT_REV1   .req    s5
SBUF_DAT_REV2   .req    s6
SBUF_DAT_REV3   .req    s7
VA0             .req    s8
VA3             .req    s11
VB0             .req    s12
VB3             .req    s15
VC0             .req    s8
VC3             .req    s11
VD0             .req    s12
VD3             .req    s15
SBUF_DAT0       .req    s16
SBUF_DAT1       .req    s17
SBUF_DAT2       .req    s18
SBUF_DAT3       .req    s19
SBUF_DAT_ALT0   .req    s20
SBUF_DAT_ALT1   .req    s21
SBUF_DAT_ALT2   .req    s22
SBUF_DAT_ALT3   .req    s23
WIN_DN_DAT0     .req    s24
WIN_UP_DAT0     .req    s28
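
/* One step of the 8-tap inner FIR loop, software-pipelined at assembly time:
 *
 *   half  "ab" while accumulating VA/VB for out[], "cd" while accumulating
 *         VC/VD for the synth_buf2[] update
 *   tail  if non-empty, retire the multiply-accumulate on the previous
 *         step's element-reversed data (into VA or VD), which only becomes
 *         available one step late
 *   head  if non-empty, issue this step's loads and the forward
 *         multiply-accumulate (into VB or VC)
 *
 * Successive steps alternate between SBUF_DAT and SBUF_DAT_ALT so the loads
 * issued by one head can complete while the next step's arithmetic runs on
 * the other bank. OFFSET and J are assembly-time counters: OFFSET advances
 * the immediate load offset by 64 floats per step, while J counts down so
 * that the "teq J_WRAP, #J" test fires exactly once per pass, at the point
 * where the walk through the 512-float history ring buffer must wrap back
 * to its start.
 */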
.macro inner_loop half, tail, head
 .if (OFFSET & (64*4)) == 0             @ even numbered call
  SBUF_DAT_THIS0 .req SBUF_DAT0
  SBUF_DAT_THIS1 .req SBUF_DAT1
  SBUF_DAT_THIS2 .req SBUF_DAT2
  SBUF_DAT_THIS3 .req SBUF_DAT3
  .ifnc "\head",""
        vldr    d8, [P_SB, #OFFSET]     @ d8 = SBUF_DAT
        vldr    d9, [P_SB, #OFFSET+8]
  .endif
 .else
  SBUF_DAT_THIS0 .req SBUF_DAT_ALT0
  SBUF_DAT_THIS1 .req SBUF_DAT_ALT1
  SBUF_DAT_THIS2 .req SBUF_DAT_ALT2
  SBUF_DAT_THIS3 .req SBUF_DAT_ALT3
  .ifnc "\head",""
        vldr    d10, [P_SB, #OFFSET]    @ d10 = SBUF_DAT_ALT
        vldr    d11, [P_SB, #OFFSET+8]
  .endif
 .endif
 .ifnc "\tail",""
  .ifc "\half","ab"
        vmls.f  VA0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors
  .else
        vmla.f  VD0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors
  .endif
 .endif
 .ifnc "\head",""
        vldr    d14, [P_WIN_UP, #OFFSET]        @ d14 = WIN_UP_DAT
        vldr    d15, [P_WIN_UP, #OFFSET+8]
        vldr    d12, [P_WIN_DN, #OFFSET]        @ d12 = WIN_DN_DAT
        vldr    d13, [P_WIN_DN, #OFFSET+8]
        vmov    SBUF_DAT_REV3, SBUF_DAT_THIS0
        vmov    SBUF_DAT_REV2, SBUF_DAT_THIS1
        vmov    SBUF_DAT_REV1, SBUF_DAT_THIS2
        vmov    SBUF_DAT_REV0, SBUF_DAT_THIS3
  .ifc "\half","ab"
        vmla.f  VB0, SBUF_DAT_THIS0, WIN_UP_DAT0
  .else
        vmla.f  VC0, SBUF_DAT_THIS0, WIN_UP_DAT0
  .endif
        teq     J_WRAP, #J
        bne     2f                      @ strongly predictable, so better than cond exec in this case
        sub     P_SB, P_SB, #512*4
2:
  .set J, J - 64
  .set OFFSET, OFFSET + 64*4
 .endif
 .unreq SBUF_DAT_THIS0
 .unreq SBUF_DAT_THIS1
 .unreq SBUF_DAT_THIS2
 .unreq SBUF_DAT_THIS3
.endm

/* void ff_synth_filter_float_vfp(FFTContext *imdct,
 *                                float *synth_buf_ptr, int *synth_buf_offset,
 *                                float synth_buf2[32], const float window[512],
 *                                float out[32], const float in[32], float scale)
 */
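/* This is the DCA (DTS) 32-band QMF synthesis filter. A loose C sketch of
 * its structure follows; the reference implementation is
 * synth_filter_float() in libavcodec/synth_filter.c. The index helpers
 * dn()/dn2()/up()/up2()/rev()/rev2()/fwd()/fwd2() are illustrative
 * placeholders for the real index arithmetic, not actual functions, and the
 * ring-buffer wrap is abbreviated to "% 512":
 *
 *     float *synth_buf = synth_buf_ptr + *synth_buf_offset;
 *     imdct->imdct_half(imdct, synth_buf, in);   // 64 fresh subband samples
 *     for (i = 0; i < 16; i++) {                 // 4 lanes at a time below
 *         a = synth_buf2[i]; b = synth_buf2[i + 16];
 *         c = d = 0;
 *         for (j = 0; j < 512; j += 64) {        // 8 window taps per lane
 *             a -= window[dn(i, j)]  * synth_buf[rev(i, j)  % 512];
 *             b += window[up(i, j)]  * synth_buf[fwd(i, j)  % 512];
 *             c += window[up2(i, j)] * synth_buf[fwd2(i, j) % 512];
 *             d += window[dn2(i, j)] * synth_buf[rev2(i, j) % 512];
 *         }
 *         out[i] = a * scale; out[i + 16] = b * scale;  // "ab" pass
 *         synth_buf2[i] = c;  synth_buf2[i + 16] = d;   // "cd" pass
 *     }
 *
 * The assembly runs the "ab" and "cd" halves as two separate passes over the
 * ring buffer, with FPSCR switched into length-4 short-vector mode so each
 * VFP multiply-accumulate covers four lanes at once.
 */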
function ff_synth_filter_float_vfp, export=1
        push    {r3-r7,lr}
        vpush   {s16-s31}
        ldr     lr, [P_SB_OFF]
        add     a2, ORIG_P_SB, lr, lsl #2 @ calculate synth_buf to pass to imdct_half
        mov     P_SB, a2                @ and keep a copy for ourselves
        bic     J_WRAP, lr, #63         @ mangled to make testing for wrap easier in inner loop
        sub     lr, lr, #32
        and     lr, lr, #512-32
        str     lr, [P_SB_OFF]          @ rotate offset, modulo buffer size, ready for next call
        ldr     a3, [sp, #(16+6+2)*4]   @ fetch in from stack, to pass to imdct_half
VFP     vmov    s16, SCALE              @ imdct_half is free to corrupt s0, but it holds one of our arguments in the hardfp case
        bl      X(ff_imdct_half_vfp)
VFP     vmov    SCALE, s16

        fmrx    OLDFPSCR, FPSCR
        ldr     lr, =0x03030000         @ RunFast mode (FZ+DN); LEN=3, STRIDE=0: short vectors of length 4, stride 1
        fmxr    FPSCR, lr
        ldr     P_SB2_DN, [sp, #16*4]
        ldr     P_WIN_DN, [sp, #(16+6+0)*4]
        ldr     P_OUT_DN, [sp, #(16+6+1)*4]
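@ The VFP/NOVFP line prefixes (from libavutil/arm/asm.S) assemble a line only
@ for hardfp or softfp argument passing respectively: with hardfp, scale
@ arrived in s0 above; with softfp it is still on the stack at this point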
NOVFP   vldr    SCALE, [sp, #(16+6+3)*4]

#define IMM_OFF_SKEW 956 /* remains a valid immediate constant when you add 16*4 */
        add     P_SB, P_SB, #IMM_OFF_SKEW @ bias the base pointer so negative offsets expose the full immediate offset range
        add     P_SB2_UP, P_SB2_DN, #16*4
        add     P_WIN_UP, P_WIN_DN, #16*4+IMM_OFF_SKEW
        add     P_OUT_UP, P_OUT_DN, #16*4
        add     P_SB2_DN, P_SB2_DN, #16*4
        add     P_WIN_DN, P_WIN_DN, #12*4+IMM_OFF_SKEW
        add     P_OUT_DN, P_OUT_DN, #16*4
        mov     I, #4
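@ First pass ("ab"): accumulate the windowed FIR taps into VA/VB and write
@ all 32 output samples; 4 outer iterations each cover 4 lanes via the
@ length-4 short vectors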
1:
        vldmia  P_SB2_UP!, {VB0-VB3}
        vldmdb  P_SB2_DN!, {VA0-VA3}
 .set J, 512 - 64
 .set OFFSET, -IMM_OFF_SKEW
        inner_loop ab,, head
 .rept 7
        inner_loop ab, tail, head
 .endr
        inner_loop ab, tail
        add     P_WIN_UP, P_WIN_UP, #4*4
        sub     P_WIN_DN, P_WIN_DN, #4*4
        vmul.f  VB0, VB0, SCALE         @ SCALE treated as scalar
        add     P_SB, P_SB, #(512+4)*4
        subs    I, I, #1
        vmul.f  VA0, VA0, SCALE
        vstmia  P_OUT_UP!, {VB0-VB3}
        vstmdb  P_OUT_DN!, {VA0-VA3}
        bne     1b

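@ Second pass ("cd"): same traversal, but the accumulators start from zero
@ and the results refill synth_buf2[] ready for the next call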
        add     P_SB2_DN, P_SB2_DN, #(16+28-12)*4
        sub     P_SB2_UP, P_SB2_UP, #(16+16)*4
        add     P_WIN_DN, P_WIN_DN, #(32+16+28-12)*4
        mov     I, #4
1:
        vldr.d  d4, zero                @ d4 = VC0
        vldr.d  d5, zero
        vldr.d  d6, zero                @ d6 = VD0
        vldr.d  d7, zero
 .set J, 512 - 64
 .set OFFSET, -IMM_OFF_SKEW
        inner_loop cd,, head
 .rept 7
        inner_loop cd, tail, head
 .endr
        inner_loop cd, tail
        add     P_WIN_UP, P_WIN_UP, #4*4
        sub     P_WIN_DN, P_WIN_DN, #4*4
        add     P_SB, P_SB, #(512+4)*4
        subs    I, I, #1
        vstmia  P_SB2_UP!, {VC0-VC3}
        vstmdb  P_SB2_DN!, {VD0-VD3}
        bne     1b

        fmxr    FPSCR, OLDFPSCR
        vpop    {s16-s31}
        pop     {r3-r7,pc}
endfunc

.unreq IMDCT
.unreq ORIG_P_SB
.unreq P_SB_OFF
.unreq I
.unreq P_SB2_UP
.unreq OLDFPSCR
.unreq P_SB2_DN
.unreq P_WIN_DN
.unreq P_OUT_DN
.unreq P_SB
.unreq J_WRAP
.unreq P_WIN_UP
.unreq P_OUT_UP

.unreq SCALE
.unreq SBUF_DAT_REV0
.unreq SBUF_DAT_REV1
.unreq SBUF_DAT_REV2
.unreq SBUF_DAT_REV3
.unreq VA0
.unreq VA3
.unreq VB0
.unreq VB3
.unreq VC0
.unreq VC3
.unreq VD0
.unreq VD3
.unreq SBUF_DAT0
.unreq SBUF_DAT1
.unreq SBUF_DAT2
.unreq SBUF_DAT3
.unreq SBUF_DAT_ALT0
.unreq SBUF_DAT_ALT1
.unreq SBUF_DAT_ALT2
.unreq SBUF_DAT_ALT3
.unreq WIN_DN_DAT0
.unreq WIN_UP_DAT0

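@ Two zero words, loaded as a doubleword by the vldr.d instructions above to
@ zero-initialise the VC/VD accumulator pairs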
        .align 3
zero:   .word 0, 0