382322760800011e0cc22947a27fdb515fec3a2e
[deb_ffmpeg.git] / float_dsp_neon.S
/*
 * ARM NEON optimised Float DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21
22 #include "config.h"
23 #include "asm.S"
24
/*
 * ff_vector_fmul_neon -- element-wise product of two float vectors
 *
 * Register use, as read from the code below:
 *   r0  (dst)   output cursor, advanced by the stores
 *   r1  (src0)  first input cursor, advanced by the loads
 *   r2  (src1)  second input cursor, advanced by the loads
 *   r3  (len)   element count
 *   r12 (blk)   scratch: part of the count covered by the 16-element loop
 *
 * All three pointers are accessed with the :128 alignment hint.  One
 * group of 8 floats is loaded and multiplied before the count is tested,
 * and every path ends with a store of 8 results, so at least 8 elements
 * are always processed.  The most recent 8 products are kept pending in
 * q8/q9 and are only stored once the loads for the following group have
 * been issued.
 * NOTE(review): callers presumably pass a non-zero multiple of 8 --
 * confirm against the C side.
 */
function ff_vector_fmul_neon, export=1
dst     .req            r0
src0    .req            r1
src1    .req            r2
len     .req            r3
blk     .req            r12
        subs            len,  len,  #8              @ account for the group handled right here
        vld1.32         {q0-q1},    [src0,:128]!
        vld1.32         {q2-q3},    [src1,:128]!
        vmul.f32        q8,   q0,   q2
        vmul.f32        q9,   q1,   q3
        beq             3f                          @ count was exactly 8: only the final store is left
        bics            blk,  len,  #15             @ blk = remaining count rounded down to a multiple of 16
        beq             2f                          @ fewer than 16 remain: one trailing group of 8
1:      subs            blk,  blk,  #16             @ flags are consumed by the bne at the loop end
        vld1.32         {q0},       [src0,:128]!
        vld1.32         {q2},       [src1,:128]!
        vmul.f32        q10,  q0,   q2
        vld1.32         {q1},       [src0,:128]!
        vld1.32         {q3},       [src1,:128]!
        vmul.f32        q11,  q1,   q3
        vst1.32         {q8-q9},    [dst,:128]!     @ flush the group that was pending on entry
        vld1.32         {q0},       [src0,:128]!
        vld1.32         {q2},       [src1,:128]!
        vmul.f32        q8,   q0,   q2
        vld1.32         {q1},       [src0,:128]!
        vld1.32         {q3},       [src1,:128]!
        vmul.f32        q9,   q1,   q3              @ q8/q9 are pending again
        vst1.32         {q10-q11},  [dst,:128]!
        bne             1b
        ands            len,  len,  #15             @ anything left after the 16-element blocks?
        beq             3f
2:      vld1.32         {q0},       [src0,:128]!    @ trailing group: stores of the pending
        vld1.32         {q2},       [src1,:128]!    @ results are interleaved with the new work
        vst1.32         {q8},       [dst,:128]!
        vmul.f32        q8,   q0,   q2
        vld1.32         {q1},       [src0,:128]!
        vld1.32         {q3},       [src1,:128]!
        vst1.32         {q9},       [dst,:128]!
        vmul.f32        q9,   q1,   q3
3:      vst1.32         {q8-q9},    [dst,:128]!     @ store the last pending group
        bx              lr
        .unreq          dst
        .unreq          src0
        .unreq          src1
        .unreq          len
        .unreq          blk
endfunc
63
/*
 * ff_vector_fmac_scalar_neon -- dst[i] += src[i] * scalar
 *
 *   r0        dst; old values are read through the `acc` cursor (a copy
 *             of r0) and the updated values are written back through r0
 *   r1        src
 *   scalar    d0[0] on VFP lines, r2 on NOVFP lines
 *   len       r2 on VFP lines, r3 on NOVFP lines
 *   r12       part of len covered by the 16-element loop
 *
 * VFP and NOVFP are line-prefix macros supplied by asm.S; presumably they
 * keep a line only for the hard-float resp. soft-float argument layout --
 * confirm against asm.S.  On NOVFP lines `acc` reuses r2, which is free
 * once the scalar has been broadcast into q15.
 *
 * The main loop handles 16 floats per pass; whatever is left is handled
 * 4 floats at a time by the loop at label 3.  All accesses carry the :128
 * alignment hint.
 */
function ff_vector_fmac_scalar_neon, export=1
VFP     len .req r2
VFP     acc .req r3
NOVFP   len .req r3
NOVFP   acc .req r2
VFP     vdup.32         q15, d0[0]                  @ broadcast the scalar to all four lanes
NOVFP   vdup.32         q15, r2
        bics            r12, len, #15               @ r12 = len rounded down to a multiple of 16
        mov             acc, r0                     @ mov leaves the flags from bics intact
        beq             3f                          @ fewer than 16 elements: tail loop only
        vld1.32         {q0},     [r1,:128]!        @ preload the first half of the first block
        vld1.32         {q8},     [acc,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
1:      vmla.f32        q8,  q0,  q15
        vld1.32         {q2},     [r1,:128]!
        vld1.32         {q10},    [acc,:128]!
        vmla.f32        q9,  q1,  q15
        vld1.32         {q3},     [r1,:128]!
        vld1.32         {q11},    [acc,:128]!
        vmla.f32        q10, q2,  q15
        vst1.32         {q8},     [r0,:128]!
        vmla.f32        q11, q3,  q15
        vst1.32         {q9},     [r0,:128]!
        subs            r12, r12, #16
        beq             2f                          @ last block: do not preload past it
        vld1.32         {q0},     [r1,:128]!        @ preload the next block while the second
        vld1.32         {q8},     [acc,:128]!       @ half of this one is being stored
        vst1.32         {q10},    [r0,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
        vst1.32         {q11},    [r0,:128]!
        b               1b
2:      vst1.32         {q10},    [r0,:128]!        @ second half of the final 16-element block
        vst1.32         {q11},    [r0,:128]!
        ands            len, len, #15               @ elements left for the tail loop
        it              eq
        bxeq            lr
3:      vld1.32         {q0},     [r1,:128]!        @ tail: 4 floats per pass
        vld1.32         {q8},     [acc,:128]!
        vmla.f32        q8,  q0,  q15
        vst1.32         {q8},     [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
        .unreq          acc                         @ release both aliases so none outlives the function
endfunc
111
/*
 * ff_vector_fmul_scalar_neon -- dst[i] = src[i] * scalar
 *
 *   r0  (dst)   output cursor
 *   r1  (src)   input cursor
 *   scalar      d0[0] on VFP lines, r2 on NOVFP lines
 *   len         r2 on VFP lines, r3 on NOVFP lines
 *   r12 (blk)   part of len covered by the 16-element loop
 *
 * VFP and NOVFP are line-prefix macros supplied by asm.S; presumably they
 * keep a line only for the hard-float resp. soft-float argument layout --
 * confirm against asm.S.
 *
 * 16 floats per pass in the main loop, then 4 floats per pass for the
 * remainder.  All accesses carry the :128 alignment hint.
 */
function ff_vector_fmul_scalar_neon, export=1
dst     .req            r0
src     .req            r1
blk     .req            r12
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]                  @ q8 = scalar in every lane
NOVFP   vdup.32         q8,  r2
        bics            blk, len, #15               @ blk = len rounded down to a multiple of 16
        beq             3f                          @ fewer than 16 elements: tail loop only
        vld1.32         {q0},     [src,:128]!       @ preload the first half of the first block
        vld1.32         {q1},     [src,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},     [src,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},     [src,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},     [dst,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},     [dst,:128]!
        subs            blk, blk, #16
        beq             2f                          @ last block: do not preload past it
        vld1.32         {q0},     [src,:128]!       @ preload the next block between the
        vst1.32         {q2},     [dst,:128]!       @ stores of the current second half
        vld1.32         {q1},     [src,:128]!
        vst1.32         {q3},     [dst,:128]!
        b               1b
2:      vst1.32         {q2},     [dst,:128]!       @ second half of the final 16-element block
        vst1.32         {q3},     [dst,:128]!
        ands            len, len, #15               @ elements left for the tail loop
        it              eq
        bxeq            lr
3:      vld1.32         {q0},     [src,:128]!       @ tail: 4 floats per pass
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},     [dst,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          dst
        .unreq          src
        .unreq          blk
        .unreq          len
endfunc
149
/*
 * ff_vector_fmul_window_neon -- windowed overlap of two float vectors
 *
 *   r0  (dst_lo)  output cursor moving forwards from the start of dst
 *   r1  (src0)    first input, read forwards
 *   r2  (src1)    second input, read backwards
 *   r3  (win_lo)  window cursor moving forwards
 *   [sp]          len (5th argument; found at [sp, #12] after the push)
 *
 * With len taken from the stack, the prologue derives three backward
 * cursors that each start on the last 4 floats of their buffer:
 *   src1   = src1 + 4*len - 16   (src1 holds len floats)
 *   win_hi = win  + 8*len - 16   (the window holds 2*len floats)
 *   dst_hi = dst  + 8*len - 16   (dst holds 2*len floats)
 * r5 then becomes the -16 byte post-index step for those cursors.
 *
 * Per pass, 4 elements from each end are produced, with rev() denoting
 * the 4-element block read through a backward cursor in reversed order:
 *   front = src0 * rev(win_hi) - rev(src1) * win_lo    -> dst_lo
 *   back  = src0 * win_lo      + rev(src1) * rev(win_hi),
 *           reversed again before it is stored         -> dst_hi
 * The loop is software pipelined: operands for the next pass are loaded
 * into d0/d1, q9, q12 and q3 while the current pass still finishes.
 * lr (r14) is saved on entry, serves as the element counter and is
 * restored into pc by the final pop.
 */
function ff_vector_fmul_window_neon, export=1
dst_lo  .req            r0
src0    .req            r1
src1    .req            r2
win_lo  .req            r3
win_hi  .req            r4
dst_hi  .req            r12
cnt     .req            r14
        push            {r4,r5,lr}
        ldr             cnt,    [sp, #12]           @ len sits above the three saved registers
        sub             src1,   src1,   #8
        sub             r5,     cnt,    #2          @ r5 = len - 2 (temporary)
        add             src1,   src1,   r5, lsl #2  @ last 4 floats of src1
        add             win_hi, win_lo, r5, lsl #3  @ last 4 floats of the window
        add             dst_hi, dst_lo, r5, lsl #3  @ last 4 floats of dst
        mov             r5,     #-16                @ backward step for the post-indexed accesses
        vld1.32         {q0},   [src0,:128]!
        vld1.32         {q1},   [src1,:128], r5
        vld1.32         {q2},   [win_lo,:128]!
        vld1.32         {q3},   [win_hi,:128], r5
1:      subs            cnt,    cnt,    #4          @ flags are consumed by the beq below
        vmul.f32        d22,    d0,     d4
        vrev64.32       q3,     q3                  @ swap within pairs; the pair order is
        vmul.f32        d23,    d1,     d5          @ handled by crossing d6/d7 and d2/d3
        vrev64.32       q1,     q1
        vmul.f32        d20,    d0,     d7
        vmul.f32        d21,    d1,     d6
        beq             2f                          @ final pass: finish without further loads
        vmla.f32        d22,    d3,     d7
        vld1.32         {q0},   [src0,:128]!
        vmla.f32        d23,    d2,     d6
        vld1.32         {q9},   [src1,:128], r5     @ next src1 block, parked in q9
        vmls.f32        d20,    d3,     d4
        vld1.32         {q12},  [win_lo,:128]!      @ next win_lo block, parked in q12
        vmls.f32        d21,    d2,     d5
        vld1.32         {q3},   [win_hi,:128], r5
        vmov            q1,     q9
        vrev64.32       q11,    q11                 @ reverse the 4 back results:
        vmov            q2,     q12                 @ pairs first, then the two halves
        vswp            d22,    d23
        vst1.32         {q10},  [dst_lo,:128]!
        vst1.32         {q11},  [dst_hi,:128], r5
        b               1b
2:      vmla.f32        d22,    d3,     d7
        vmla.f32        d23,    d2,     d6
        vmls.f32        d20,    d3,     d4
        vmls.f32        d21,    d2,     d5
        vrev64.32       q11,    q11
        vswp            d22,    d23
        vst1.32         {q10},  [dst_lo,:128]!
        vst1.32         {q11},  [dst_hi,:128], r5
        pop             {r4,r5,pc}
        .unreq          dst_lo
        .unreq          src0
        .unreq          src1
        .unreq          win_lo
        .unreq          win_hi
        .unreq          dst_hi
        .unreq          cnt
endfunc
196
/*
 * ff_vector_fmul_add_neon -- dst[i] = src0[i] * src1[i] + src2[i]
 *
 *   r0  (dst)    output cursor
 *   r1  (src0)   first factor
 *   r2  (src1)   second factor
 *   r3  (src2)   addend
 *   [sp]         element count (5th argument), loaded into r12 (cnt)
 *
 * 8 floats per pass, all accesses with the :128 alignment hint.  The
 * first group is loaded and multiplied before the loop; the loop exit
 * test is an equality test after subtracting 8, so the count has to
 * reach exactly zero.
 * NOTE(review): callers presumably pass a non-zero multiple of 8 --
 * confirm against the C side.
 */
function ff_vector_fmul_add_neon, export=1
dst     .req            r0
src0    .req            r1
src1    .req            r2
src2    .req            r3
cnt     .req            r12
        ldr             cnt, [sp]                   @ nothing was pushed, so the 5th argument is at [sp]
        vld1.32         {q0-q1},   [src0,:128]!
        vld1.32         {q8-q9},   [src1,:128]!
        vld1.32         {q2-q3},   [src2,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10               @ q12/q13 = finished results of this group
        vadd.f32        q13, q3,  q11
        pld             [src0, #16]
        pld             [src1, #16]
        pld             [src2, #16]
        subs            cnt, cnt, #8
        beq             2f                          @ last group: only the store remains
        vld1.32         {q0},      [src0,:128]!     @ start on the next group before the
        vld1.32         {q8},      [src1,:128]!     @ current results are written out
        vmul.f32        q10, q0,  q8
        vld1.32         {q1},      [src0,:128]!
        vld1.32         {q9},      [src1,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},   [src2,:128]!
        vst1.32         {q12-q13}, [dst,:128]!
        b               1b
2:      vst1.32         {q12-q13}, [dst,:128]!
        bx              lr
        .unreq          dst
        .unreq          src0
        .unreq          src1
        .unreq          src2
        .unreq          cnt
endfunc
223
/*
 * ff_vector_fmul_reverse_neon -- multiply src0 by src1 read back to front
 *
 *   r0  (dst)    output cursor
 *   r1  (src0)   first input, read forwards
 *   r2  (src1)   second input; repositioned to its last 8 floats
 *                (src1 + 4*len - 32) and then walked backwards
 *   r3  (len)    element count
 *   r12 (step)   -32 byte post-index step for src1
 *
 * 8 floats per pass, all accesses with the :128 alignment hint.  Each
 * src1 block of 8 is reversed by swapping within pairs (vrev64) and then
 * pairing its d registers with those of src0 in opposite order.  The
 * loop exit test is an equality test after subtracting 8.
 * NOTE(review): callers presumably pass a non-zero multiple of 8 --
 * confirm against the C side.
 */
function ff_vector_fmul_reverse_neon, export=1
dst     .req            r0
src0    .req            r1
src1    .req            r2
len     .req            r3
step    .req            r12
        add             src1, src1, len, lsl #2     @ one past the end of src1
        sub             src1, src1, #32             @ back up to its last 8 floats
        mov             step, #-32
        vld1.32         {q0-q1},  [src0,:128]!
        vld1.32         {q2-q3},  [src1,:128], step
1:      pld             [src0, #32]
        vrev64.32       q3,  q3
        vmul.f32        d16, d0,  d7                @ first src0 pair times last src1 pair
        vmul.f32        d17, d1,  d6
        pld             [src1, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4                @ last src0 pair times first src1 pair
        subs            len, len, #8
        beq             2f                          @ last group: only the store remains
        vld1.32         {q0-q1},  [src0,:128]!      @ fetch the next group before storing
        vld1.32         {q2-q3},  [src1,:128], step
        vst1.32         {q8-q9},  [dst,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [dst,:128]!
        bx              lr
        .unreq          dst
        .unreq          src0
        .unreq          src1
        .unreq          len
        .unreq          step
endfunc
247
/*
 * ff_butterflies_float_neon -- in-place sum/difference of two float vectors
 *
 *   r0  (buf_a)  first vector;  receives a[i] + b[i]
 *   r1  (buf_b)  second vector; receives a[i] - b[i]
 *   r2  (len)    element count
 *
 * 4 floats per pass with the :128 alignment hint.  Both blocks are loaded
 * without writeback; the stores advance the pointers.  The count is only
 * tested after a pass, so at least 4 elements are always processed.
 */
function ff_butterflies_float_neon, export=1
buf_a   .req            r0
buf_b   .req            r1
len     .req            r2
1:      vld1.32         {q0},     [buf_a,:128]
        vld1.32         {q1},     [buf_b,:128]
        vsub.f32        q2,  q0,  q1                @ difference goes to a scratch register first,
        vadd.f32        q1,  q0,  q1                @ because the sum overwrites q1
        vst1.32         {q2},     [buf_b,:128]!
        vst1.32         {q1},     [buf_a,:128]!
        subs            len, len, #4
        bgt             1b
        bx              lr
        .unreq          buf_a
        .unreq          buf_b
        .unreq          len
endfunc
259
/*
 * ff_scalarproduct_float_neon -- dot product of two float vectors
 *
 *   r0  (buf_a)  first vector
 *   r1  (buf_b)  second vector
 *   r2  (len)    element count
 *
 * Four partial sums are accumulated in q2, 4 floats per pass with the
 * :128 alignment hint, and folded into d0[0] at the end.  On NOVFP lines
 * the result is additionally copied to r0; otherwise it is left in
 * d0[0].  NOVFP is a line-prefix macro supplied by asm.S; presumably it
 * marks the soft-float return convention -- confirm against asm.S.
 * The count is only tested after a pass, so at least 4 elements are
 * always read.
 */
function ff_scalarproduct_float_neon, export=1
buf_a   .req            r0
buf_b   .req            r1
len     .req            r2
        vmov.f32        q2,  #0.0                   @ clear the four partial sums
1:      vld1.32         {q0},     [buf_a,:128]!
        vld1.32         {q1},     [buf_b,:128]!
        vmla.f32        q2,  q0,  q1
        subs            len, len, #4
        bgt             1b
        vadd.f32        d0,  d4,  d5                @ 4 partial sums -> 2
        vpadd.f32       d0,  d0,  d0                @ 2 -> 1, present in both lanes of d0
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
        .unreq          buf_a
        .unreq          buf_b
        .unreq          len
endfunc