@ ffmpeg/libavcodec/arm/synth_filter_vfp.S
/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

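@ Register aliases. r0-r2 carry the incoming function arguments; once those
@ have been consumed, the same physical registers are re-aliased (as I,
@ P_SB2_UP and OLDFPSCR) for use by the main loops.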
IMDCT           .req    r0
ORIG_P_SB       .req    r1
P_SB_OFF        .req    r2
I               .req    r0
P_SB2_UP        .req    r1
OLDFPSCR        .req    r2
P_SB2_DN        .req    r3
P_WIN_DN        .req    r4
P_OUT_DN        .req    r5
P_SB            .req    r6
J_WRAP          .req    r7
P_WIN_UP        .req    r12
P_OUT_UP        .req    r14

SCALE           .req    s0
SBUF_DAT_REV0   .req    s4
SBUF_DAT_REV1   .req    s5
SBUF_DAT_REV2   .req    s6
SBUF_DAT_REV3   .req    s7
VA0             .req    s8
VA3             .req    s11
VB0             .req    s12
VB3             .req    s15
VC0             .req    s8
VC3             .req    s11
VD0             .req    s12
VD3             .req    s15
SBUF_DAT0       .req    s16
SBUF_DAT1       .req    s17
SBUF_DAT2       .req    s18
SBUF_DAT3       .req    s19
SBUF_DAT_ALT0   .req    s20
SBUF_DAT_ALT1   .req    s21
SBUF_DAT_ALT2   .req    s22
SBUF_DAT_ALT3   .req    s23
WIN_DN_DAT0     .req    s24
WIN_UP_DAT0     .req    s28

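@ Note that VA/VC and VB/VD deliberately alias the same physical registers
@ (s8-s11 and s12-s15): the first pass ("ab" halves) and the second pass
@ ("cd" halves) never run concurrently. The 0/3 suffixes mark the first
@ and last elements of length-4 VFP short vectors.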

.macro inner_loop half, tail, head
 .if (OFFSET & (64*4)) == 0             @ even numbered call
        SBUF_DAT_THIS0 .req SBUF_DAT0
        SBUF_DAT_THIS1 .req SBUF_DAT1
        SBUF_DAT_THIS2 .req SBUF_DAT2
        SBUF_DAT_THIS3 .req SBUF_DAT3
  .ifnc "\head",""
        vldr    d8, [P_SB, #OFFSET]     @ d8 = SBUF_DAT
        vldr    d9, [P_SB, #OFFSET+8]
  .endif
 .else
        SBUF_DAT_THIS0 .req SBUF_DAT_ALT0
        SBUF_DAT_THIS1 .req SBUF_DAT_ALT1
        SBUF_DAT_THIS2 .req SBUF_DAT_ALT2
        SBUF_DAT_THIS3 .req SBUF_DAT_ALT3
  .ifnc "\head",""
        vldr    d10, [P_SB, #OFFSET]    @ d10 = SBUF_DAT_ALT
        vldr    d11, [P_SB, #OFFSET+8]
  .endif
 .endif
 .ifnc "\tail",""
  .ifc "\half","ab"
        vmls.f  VA0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors
  .else
        vmla.f  VD0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors
  .endif
 .endif
 .ifnc "\head",""
        vldr    d14, [P_WIN_UP, #OFFSET]        @ d14 = WIN_UP_DAT
        vldr    d15, [P_WIN_UP, #OFFSET+8]
        vldr    d12, [P_WIN_DN, #OFFSET]        @ d12 = WIN_DN_DAT
        vldr    d13, [P_WIN_DN, #OFFSET+8]
        vmov    SBUF_DAT_REV3, SBUF_DAT_THIS0
        vmov    SBUF_DAT_REV2, SBUF_DAT_THIS1
        vmov    SBUF_DAT_REV1, SBUF_DAT_THIS2
        vmov    SBUF_DAT_REV0, SBUF_DAT_THIS3
  .ifc "\half","ab"
        vmla.f  VB0, SBUF_DAT_THIS0, WIN_UP_DAT0
  .else
        vmla.f  VC0, SBUF_DAT_THIS0, WIN_UP_DAT0
  .endif
        teq     J_WRAP, #J
        bne     2f              @ strongly predictable, so better than cond exec in this case
        sub     P_SB, P_SB, #512*4
2:
  .set  J, J - 64
  .set  OFFSET, OFFSET + 64*4
 .endif
 .unreq SBUF_DAT_THIS0
 .unreq SBUF_DAT_THIS1
 .unreq SBUF_DAT_THIS2
 .unreq SBUF_DAT_THIS3
.endm
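
/* One inner_loop call advances the windowed multiply-accumulate by one
 * 64-float stride of the 512-float ring buffer, four lanes at a time via
 * VFP short vectors. The head part issues this stride's loads (plus the
 * lane-reversing vmov shuffle); the tail part consumes the reversed data
 * loaded by the previous call, so loads and arithmetic overlap (software
 * pipelining). Alternate calls use the SBUF_DAT and SBUF_DAT_ALT banks so
 * a head never overwrites data that its predecessor's tail still needs,
 * and the teq/bne pair rewinds P_SB when the ring buffer wraps. In rough
 * C terms one head+tail pair performs (names illustrative, not taken from
 * the C sources):
 *
 *     for (k = 0; k < 4; k++) {
 *         up[k]   += sbuf[offset + k] * win_up[offset + k];
 *         down[k] += sign * sbuf_rev[k] * win_dn[offset + k];
 *         // sign is -1 in the "ab" pass (vmls) and +1 in "cd" (vmla)
 *     }
 *     // sbuf_rev[] then becomes this stride's loads in reversed order
 */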


/* void ff_synth_filter_float_vfp(FFTContext *imdct,
 *                                float *synth_buf_ptr, int *synth_buf_offset,
 *                                float synth_buf2[32], const float window[512],
 *                                float out[32], const float in[32], float scale)
 */
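/* The routine follows the same outline as FFmpeg's generic C synth filter
 * (libavcodec/synth_filter.c): run the half-size IMDCT into the ring
 * buffer at the current offset, window 512 buffered floats down to 32
 * output samples, and refresh synth_buf2 for the next call. A condensed,
 * non-authoritative C sketch -- the exact window tap indices and the
 * ring-buffer wraparound are elided; consult the C source for those:
 *
 *     float *synth_buf = synth_buf_ptr + *synth_buf_offset;
 *     imdct_half(imdct, synth_buf, in);
 *     for (i = 0; i < 16; i++) {
 *         float a = synth_buf2[i], b = synth_buf2[i + 16], c = 0, d = 0;
 *         for (j = 0; j < 512; j += 64) {
 *             a -= window[...] * synth_buf[...];  // "down" taps, reversed lanes
 *             b += window[...] * synth_buf[...];  // "up" taps
 *             c += window[...] * synth_buf[...];
 *             d += window[...] * synth_buf[...];  // reversed lanes
 *         }
 *         out[i] = a * scale;   out[i + 16] = b * scale;
 *         synth_buf2[i] = c;    synth_buf2[i + 16] = d;
 *     }
 *     *synth_buf_offset = (*synth_buf_offset - 32) & 511;
 */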
function ff_synth_filter_float_vfp, export=1
        push    {r3-r7,lr}
        vpush   {s16-s31}
        ldr     lr, [P_SB_OFF]
        add     a2, ORIG_P_SB, lr, lsl #2 @ calculate synth_buf to pass to imdct_half
        mov     P_SB, a2                @ and keep a copy for ourselves
        bic     J_WRAP, lr, #63         @ mangled to make testing for wrap easier in inner loop
        sub     lr, lr, #32
        and     lr, lr, #512-32
        str     lr, [P_SB_OFF]          @ rotate offset, modulo buffer size, ready for next call
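        @ The three instructions above are the C update
        @ *synth_buf_offset = (*synth_buf_offset - 32) & (512 - 32);
        @ the offset steps down by 32 floats per call, modulo the 512-float
        @ ring buffer (it is always a multiple of 32, so masking with 480
        @ is equivalent to masking with 511).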
        ldr     a3, [sp, #(16+6+2)*4]   @ fetch in from stack, to pass to imdct_half
VFP     vmov    s16, SCALE              @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case
        bl      X(ff_imdct_half_vfp)
VFP     vmov    SCALE, s16

        fmrx    OLDFPSCR, FPSCR
        ldr     lr, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr
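        @ 0x03030000 decodes as: FPSCR bits 16-17 (LEN field) = 3, giving
        @ short vectors of length 4 with stride 1 (STRIDE bits 20-21 = 0);
        @ bits 24-25 set flush-to-zero and default-NaN, i.e. RunFast mode.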
        ldr     P_SB2_DN, [sp, #16*4]
        ldr     P_WIN_DN, [sp, #(16+6+0)*4]
        ldr     P_OUT_DN, [sp, #(16+6+1)*4]
NOVFP   vldr    SCALE, [sp, #(16+6+3)*4]

#define IMM_OFF_SKEW 956 /* also valid immediate constant when you add 16*4 */
        add     P_SB, P_SB, #IMM_OFF_SKEW @ so we can use -ve offsets to use full immediate offset range
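        @ vldr encodes immediate offsets of +/-1020 in multiples of 4, so
        @ biasing the base pointers by 956 keeps every OFFSET inner_loop
        @ generates (-956 up through +836, plus the +8 second word) in
        @ range. 956 (0xEF << 2) and 956 + 16*4 = 1020 (0xFF << 2) are
        @ both encodable as ARM add immediates, which is what the comment
        @ on the #define refers to.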
        add     P_SB2_UP, P_SB2_DN, #16*4
        add     P_WIN_UP, P_WIN_DN, #16*4+IMM_OFF_SKEW
        add     P_OUT_UP, P_OUT_DN, #16*4
        add     P_SB2_DN, P_SB2_DN, #16*4
        add     P_WIN_DN, P_WIN_DN, #12*4+IMM_OFF_SKEW
        add     P_OUT_DN, P_OUT_DN, #16*4
        mov     I, #4
1:
        vldmia  P_SB2_UP!, {VB0-VB3}
        vldmdb  P_SB2_DN!, {VA0-VA3}
 .set   J, 512 - 64
 .set   OFFSET, -IMM_OFF_SKEW
        inner_loop ab,, head
 .rept  7
        inner_loop ab, tail, head
 .endr
        inner_loop ab, tail
        add     P_WIN_UP, P_WIN_UP, #4*4
        sub     P_WIN_DN, P_WIN_DN, #4*4
        vmul.f  VB0, VB0, SCALE         @ SCALE treated as scalar
        add     P_SB, P_SB, #(512+4)*4
        subs    I, I, #1
        vmul.f  VA0, VA0, SCALE
        vstmia  P_OUT_UP!, {VB0-VB3}
        vstmdb  P_OUT_DN!, {VA0-VA3}
        bne     1b

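        @ Second pass: the same pipelined inner loop, but the VC/VD
        @ accumulators are cleared rather than preloaded, the results are
        @ not scaled, and they are stored back to synth_buf2 (for the next
        @ call) instead of to out[].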
        add     P_SB2_DN, P_SB2_DN, #(16+28-12)*4
        sub     P_SB2_UP, P_SB2_UP, #(16+16)*4
        add     P_WIN_DN, P_WIN_DN, #(32+16+28-12)*4
        mov     I, #4
1:
        vldr.d  d4, zero                @ d4 = VC0
        vldr.d  d5, zero
        vldr.d  d6, zero                @ d6 = VD0
        vldr.d  d7, zero
 .set   J, 512 - 64
 .set   OFFSET, -IMM_OFF_SKEW
        inner_loop cd,, head
 .rept  7
        inner_loop cd, tail, head
 .endr
        inner_loop cd, tail
        add     P_WIN_UP, P_WIN_UP, #4*4
        sub     P_WIN_DN, P_WIN_DN, #4*4
        add     P_SB, P_SB, #(512+4)*4
        subs    I, I, #1
        vstmia  P_SB2_UP!, {VC0-VC3}
        vstmdb  P_SB2_DN!, {VD0-VD3}
        bne     1b

        fmxr    FPSCR, OLDFPSCR
        vpop    {s16-s31}
        pop     {r3-r7,pc}
endfunc

 .unreq IMDCT
 .unreq ORIG_P_SB
 .unreq P_SB_OFF
 .unreq I
 .unreq P_SB2_UP
 .unreq OLDFPSCR
 .unreq P_SB2_DN
 .unreq P_WIN_DN
 .unreq P_OUT_DN
 .unreq P_SB
 .unreq J_WRAP
 .unreq P_WIN_UP
 .unreq P_OUT_UP

 .unreq SCALE
 .unreq SBUF_DAT_REV0
 .unreq SBUF_DAT_REV1
 .unreq SBUF_DAT_REV2
 .unreq SBUF_DAT_REV3
 .unreq VA0
 .unreq VA3
 .unreq VB0
 .unreq VB3
 .unreq VC0
 .unreq VC3
 .unreq VD0
 .unreq VD3
 .unreq SBUF_DAT0
 .unreq SBUF_DAT1
 .unreq SBUF_DAT2
 .unreq SBUF_DAT3
 .unreq SBUF_DAT_ALT0
 .unreq SBUF_DAT_ALT1
 .unreq SBUF_DAT_ALT2
 .unreq SBUF_DAT_ALT3
 .unreq WIN_DN_DAT0
 .unreq WIN_UP_DAT0

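@ vldr has no immediate form, so the VC/VD accumulators are cleared by
@ loading from this 8-byte literal of zeros; .align 3 keeps the literal
@ doubleword-aligned for the vldr.d loads.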
 .align 3
zero:   .word   0, 0