Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libavutil / arm / float_dsp_neon.S
CommitLineData
2ba45a60
DM
/*
 * ARM NEON optimised Float DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21
22#include "config.h"
23#include "asm.S"
24
/*
 * ff_vector_fmul_neon: element-wise product of two float vectors.
 *
 * Registers on entry (as used by the code below):
 *   r0 = destination, written 32 or 16 bytes at a time
 *   r1 = first source,  read with post-increment
 *   r2 = second source, read with post-increment
 *   r3 = number of floats to process
 *
 * Every access carries a :128 alignment qualifier, so all three pointers
 * must be 16-byte aligned.  The routine always processes at least 8
 * elements and any remainder after the 16-wide loop is handled as one
 * 8-element step, so r3 is presumably required to be a non-zero multiple
 * of 8 -- confirm against the C caller.
 *
 * Clobbers: r0-r3, ip, q0-q3, q8-q11, flags.
 *
 * The code is software pipelined: q8/q9 always hold 8 finished products
 * that have not been stored yet; label 3 flushes them.
 */
function ff_vector_fmul_neon, export=1
        subs            r3,  r3,  #8            @ account for the 8 products computed up front
        vld1.32         {d0-d3},  [r1,:128]!
        vld1.32         {d4-d7},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f                      @ flags still from subs: nothing left, flush and return
        bics            ip,  r3,  #15           @ ip = remaining count rounded down to a multiple of 16
        beq             2f                      @ fewer than 16 left: go to the single 8-element step
1:      subs            ip,  ip,  #16           @ flags consumed by the bne at the bottom of the loop
        vld1.32         {d0-d1},  [r1,:128]!
        vld1.32         {d4-d5},  [r2,:128]!
        vmul.f32        q10, q0,  q2
        vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d6-d7},  [r2,:128]!
        vmul.f32        q11, q1,  q3
        vst1.32         {d16-d19},[r0,:128]!    @ store the 8 pending products (q8, q9)
        vld1.32         {d0-d1},  [r1,:128]!
        vld1.32         {d4-d5},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d6-d7},  [r2,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.32         {d20-d23},[r0,:128]!    @ store q10, q11; q8, q9 are pending again
        bne             1b
        ands            r3,  r3,  #15           @ anything left after the 16-wide loop?
        beq             3f
2:      vld1.32         {d0-d1},  [r1,:128]!    @ one more 8-element step
        vld1.32         {d4-d5},  [r2,:128]!
        vst1.32         {d16-d17},[r0,:128]!    @ q8 must be stored before it is overwritten
        vmul.f32        q8,  q0,  q2
        vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d6-d7},  [r2,:128]!
        vst1.32         {d18-d19},[r0,:128]!    @ likewise for q9
        vmul.f32        q9,  q1,  q3
3:      vst1.32         {d16-d19},[r0,:128]!    @ flush the last 8 products
        bx              lr
endfunc
63
/*
 * ff_vector_fmac_scalar_neon: dst[i] += src[i] * scalar.
 *
 * Registers on entry (as used by the code below):
 *   r0 = destination; read through the alias acc, written through r0
 *   r1 = source, read with post-increment
 *   VFP   variant: scalar in d0[0], element count in r2
 *   NOVFP variant: scalar in r2,    element count in r3
 *
 * VFP and NOVFP are macros from asm.S; presumably they keep the line for
 * the hard-float and soft-float calling convention respectively --
 * confirm in asm.S.
 *
 * Both pointers must be 16-byte aligned (:128 qualifiers).  The tail loop
 * at label 3 always runs at least once when it is reached and consumes 4
 * elements per pass, so the count is presumably a non-zero multiple of 4
 * -- confirm against the C caller.
 *
 * Clobbers: r0-r3, r12, q0-q3, q8-q11, q15, flags.
 */
function ff_vector_fmac_scalar_neon, export=1
VFP     len .req r2
VFP     acc .req r3
NOVFP   len .req r3
NOVFP   acc .req r2
VFP     vdup.32         q15, d0[0]              @ broadcast the scalar to all four lanes
NOVFP   vdup.32         q15, r2                 @ r2 is free afterwards and becomes acc
        bics            r12, len, #15           @ r12 = count rounded down to a multiple of 16
        mov             acc, r0                 @ separate read pointer over dst; r0 is the write pointer
        beq             3f                      @ flags from bics: fewer than 16, tail loop only
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
1:      vmla.f32        q8,  q0,  q15           @ 16 elements per pass, loads run ahead of stores
        vld1.32         {q2},     [r1,:128]!
        vld1.32         {q10},    [acc,:128]!
        vmla.f32        q9,  q1,  q15
        vld1.32         {q3},     [r1,:128]!
        vld1.32         {q11},    [acc,:128]!
        vmla.f32        q10, q2,  q15
        vst1.32         {q8},     [r0,:128]!
        vmla.f32        q11, q3,  q15
        vst1.32         {q9},     [r0,:128]!
        subs            r12, r12, #16
        beq             2f                      @ last 16-wide pass: do not preload further
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vst1.32         {q10},    [r0,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
        vst1.32         {q11},    [r0,:128]!
        b               1b
2:      vst1.32         {q10},    [r0,:128]!    @ drain the two results still in flight
        vst1.32         {q11},    [r0,:128]!
        ands            len, len, #15           @ leftover elements after the 16-wide loop
        it              eq
        bxeq            lr
3:      vld1.32         {q0},     [r1,:128]!    @ tail: 4 elements per pass
        vld1.32         {q8},     [acc,:128]!
        vmla.f32        q8,  q0,  q15
        vst1.32         {q8},     [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
        .unreq          acc                     @ release the alias so it does not leak past this function
endfunc
111
/*
 * ff_vector_fmul_scalar_neon: dst[i] = src[i] * scalar.
 *
 * Registers on entry (as used by the code below):
 *   r0 = destination, written with post-increment
 *   r1 = source, read with post-increment
 *   VFP   variant: scalar in d0[0], element count in r2
 *   NOVFP variant: scalar in r2,    element count in r3
 *
 * VFP and NOVFP are macros from asm.S; presumably they keep the line for
 * the hard-float and soft-float calling convention respectively --
 * confirm in asm.S.
 *
 * Both pointers must be 16-byte aligned (:128 qualifiers).  The tail loop
 * at label 3 always runs at least once when it is reached and consumes 4
 * elements per pass, so the count is presumably a non-zero multiple of 4
 * -- confirm against the C caller.
 *
 * Clobbers: r0-r3, r12, q0-q3, q8, flags.
 */
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]              @ broadcast the scalar to all four lanes
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15           @ r12 = count rounded down to a multiple of 16
        beq             3f                      @ fewer than 16: tail loop only
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0,  q0,  q8            @ 16 elements per pass, loads run ahead of stores
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f                      @ last 16-wide pass: do not preload further
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!         @ drain the two results still in flight
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15           @ leftover elements after the 16-wide loop
        it              eq
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]!         @ tail: 4 elements per pass
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc
149
/*
 * ff_vector_fmul_window_neon: windowed overlap of two float vectors.
 *
 * Registers on entry (as used by the code below):
 *   r0      = destination, 2*len floats
 *   r1      = first source,  len floats, read forwards
 *   r2      = second source, len floats, read backwards from its end
 *   r3      = window, 2*len floats, read from both ends at once
 *   [sp]    = len (first stack argument; it sits at [sp, #12] after the
 *             three-register push)
 *
 * With n = len, s0 = r1, s1 = r2, w = r3 and i counting up from 0, each
 * pass produces, four values of i at a time:
 *   dst[i]         = s0[i] * w[2n-1-i] - s1[n-1-i] * w[i]
 *   dst[2n-1-i]    = s0[i] * w[i]      + s1[n-1-i] * w[2n-1-i]
 *
 * All pointers must be 16-byte aligned (:128 qualifiers).  Four elements
 * are loaded before the count is examined and the loop exits only when
 * the count reaches exactly zero, so len is presumably a non-zero
 * multiple of 4 -- confirm against the C caller.
 *
 * Clobbers: r0-r3, ip, q0-q3, q9-q12, flags.  r4, r5 and lr are saved
 * and restored.
 */
function ff_vector_fmul_window_neon, export=1
        push            {r4,r5,lr}
        ldr             lr,  [sp, #12]          @ lr = len, used as the loop counter
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2    @ r2 = last 4 floats of the second source
        add             r4,  r3,  r5, lsl #3    @ r4 = last 4 floats of the window
        add             ip,  r0,  r5, lsl #3    @ ip = last 4 floats of the destination
        mov             r5,  #-16               @ post-index step for the descending streams
        vld1.32         {d0,d1},  [r1,:128]!
        vld1.32         {d2,d3},  [r2,:128], r5
        vld1.32         {d4,d5},  [r3,:128]!
        vld1.32         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4            @ flags consumed by the beq below
        vmul.f32        d22, d0,  d4
        vrev64.32       q3,  q3                 @ reverse within each half; crossing d6/d7 below
        vmul.f32        d23, d1,  d5            @ completes the 4-element reversal
        vrev64.32       q1,  q1
        vmul.f32        d20, d0,  d7
        vmul.f32        d21, d1,  d6
        beq             2f                      @ last group: finish without loading more
        vmla.f32        d22, d3,  d7
        vld1.32         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.32         {d18,d19},[r2,:128], r5 @ into q9: q1 is still needed by the vmls below
        vmls.f32        d20, d3,  d4
        vld1.32         {d24,d25},[r3,:128]!    @ into q12: q2 is still needed by the vmls below
        vmls.f32        d21, d2,  d5
        vld1.32         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11                @ vrev64 + vswp reverse all four lanes of q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.32         {d20,d21},[r0,:128]!    @ ascending half of the output
        vst1.32         {d22,d23},[ip,:128], r5 @ descending half of the output
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.32         {d20,d21},[r0,:128]!
        vst1.32         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
endfunc
196
/*
 * ff_vector_fmul_add_neon: dst[i] = a[i] * b[i] + c[i].
 *
 * Registers on entry (as used by the code below):
 *   r0   = destination, written with post-increment
 *   r1   = a, read with post-increment
 *   r2   = b, read with post-increment
 *   r3   = c, read with post-increment
 *   [sp] = number of floats (first stack argument)
 *
 * All pointers must be 16-byte aligned (:128 qualifiers).  Eight elements
 * are loaded before the count is examined and the loop exits only when
 * the count reaches exactly zero, so the count is presumably a non-zero
 * multiple of 8 -- confirm against the C caller.
 *
 * Clobbers: r0-r3, r12, q0-q3, q8-q13, flags.
 */
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]               @ r12 = element count
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10           @ q12, q13 = finished results for this group of 8
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]               @ prefetch hints only; no architectural effect
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f                      @ last group: store and return
        vld1.32         {q0},     [r1,:128]!    @ start the next group before storing this one
        vld1.32         {q8},     [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
endfunc
223
/*
 * ff_vector_fmul_reverse_neon: dst[i] = a[i] * b[len-1-i].
 *
 * Registers on entry (as used by the code below):
 *   r0 = destination, written with post-increment
 *   r1 = a, read forwards
 *   r2 = b, read backwards in 32-byte steps starting at its last 8 floats
 *   r3 = number of floats (len)
 *
 * All pointers must be 16-byte aligned (:128 qualifiers).  Eight elements
 * are loaded before the count is examined and the loop exits only when
 * the count reaches exactly zero, so len is presumably a non-zero
 * multiple of 8 -- confirm against the C caller.
 *
 * Clobbers: r0-r3, r12, q0-q3, q8-q9, flags.
 */
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        sub             r2,  r2,  #32           @ r2 = address of the last 8 floats of b
        mov             r12, #-32               @ post-index step for the descending stream
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
1:      pld             [r1, #32]               @ prefetch hint only
        vrev64.32       q3,  q3                 @ swap the two floats inside each d register ...
        vmul.f32        d16, d0,  d7            @ ... and pair d0..d3 with d7..d4 to reverse all 8
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]              @ prefetch hint only
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f                      @ last group: store and return
        vld1.32         {q0-q1},  [r1,:128]!    @ load the next group before storing this one
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [r0,:128]!
        bx              lr
endfunc
247
/*
 * ff_butterflies_float_neon: in-place sum/difference of two float vectors.
 *
 * Registers on entry (as used by the code below):
 *   r0 = first vector;  on return holds a[i] + b[i]
 *   r1 = second vector; on return holds a[i] - b[i]
 *   r2 = number of floats
 *
 * Both pointers must be 16-byte aligned (:128 qualifiers).  The loop body
 * runs before the count is tested and handles 4 elements per pass, so at
 * least 4 elements are always processed and the count is rounded up to a
 * multiple of 4.
 *
 * Clobbers: r0-r2, q0-q2, flags.
 */
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]          @ no writeback here: the stores advance the pointers
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2,  q0,  q1            @ difference first, while q1 still holds b
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2},[r1,:128]!         @ difference goes to the second vector
        vst1.32         {q1},[r0,:128]!         @ sum goes to the first vector
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc
259
/*
 * ff_scalarproduct_float_neon: dot product of two float vectors.
 *
 * Registers on entry (as used by the code below):
 *   r0 = first vector,  read with post-increment
 *   r1 = second vector, read with post-increment
 *   r2 = number of floats
 *
 * Result: the sum is left in d0[0]; the NOVFP variant additionally copies
 * it to r0.  VFP and NOVFP are macros from asm.S; presumably NOVFP marks
 * the soft-float calling convention -- confirm in asm.S.
 *
 * Both pointers must be 16-byte aligned (:128 qualifiers).  The loop body
 * runs before the count is tested and handles 4 elements per pass, so at
 * least 4 elements are always read and the count is rounded up to a
 * multiple of 4.  Products are accumulated in four independent lanes and
 * only combined at the end, so rounding can differ from a strictly
 * sequential sum.
 *
 * Clobbers: r0-r2, q0-q2, flags.
 */
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0               @ four-lane accumulator
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5            @ fold the four lanes down to two
        vpadd.f32       d0,  d0,  d0            @ and the two down to one (both lanes of d0)
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc