Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * ARM NEON optimised Float DSP functions | |
3 | * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | |
4 | * | |
5 | * This file is part of FFmpeg. | |
6 | * | |
7 | * FFmpeg is free software; you can redistribute it and/or | |
8 | * modify it under the terms of the GNU Lesser General Public | |
9 | * License as published by the Free Software Foundation; either | |
10 | * version 2.1 of the License, or (at your option) any later version. | |
11 | * | |
12 | * FFmpeg is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | * Lesser General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU Lesser General Public | |
18 | * License along with FFmpeg; if not, write to the Free Software | |
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | */ | |
21 | ||
22 | #include "config.h" | |
23 | #include "asm.S" | |
24 | ||
/*
 * ff_vector_fmul_neon: element-wise product of two float vectors,
 *     dst[i] = src0[i] * src1[i]
 *
 * Registers as used by the code:
 *   r0   dst  (stores, post-incremented)
 *   r1   src0 (loads, post-incremented)
 *   r2   src1 (loads, post-incremented)
 *   r3   number of floats
 *   r12  scratch: floats left for the 16-wide main loop
 *
 * Every load and store carries a :128 qualifier, so all three pointers
 * must be 16-byte aligned.  The first 8 floats are processed
 * unconditionally and the rest is consumed in steps of 16 plus at most one
 * step of 8, so the count has to be a non-zero multiple of 8.
 *
 * The routine is software pipelined: q8/q9 always hold a finished group of
 * 8 products that has not been stored yet; it is written out while the
 * following group is loaded and multiplied.
 *
 * Modifies r0-r3, r12, q0-q3, q8-q11 and the condition flags.
 */
function ff_vector_fmul_neon, export=1
        /* Prologue: products of the first 8 floats stay pending in q8/q9. */
        subs            r3,  r3,  #8
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f                      /* flags from subs: nothing more to load */
        bics            r12, r3,  #15           /* r12 = remainder rounded down to a multiple of 16 */
        beq             2f                      /* fewer than 16 left: take the 8-float step */

        /*
         * Main loop, 16 floats per pass.  The NEON instructions leave the
         * flags alone, so the bne at the bottom still tests this subs.
         */
1:      subs            r12, r12, #16
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q2},     [r2,:128]!
        vmul.f32        q10, q0,  q2
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q3},     [r2,:128]!
        vmul.f32        q11, q1,  q3
        vst1.32         {q8-q9},  [r0,:128]!    /* flush the pending group */
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q2},     [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q3},     [r2,:128]!
        vmul.f32        q9,  q1,  q3            /* q8/q9 are the new pending group */
        vst1.32         {q10-q11},[r0,:128]!
        bne             1b
        ands            r3,  r3,  #15
        beq             3f                      /* no 8-float remainder */

        /*
         * Remainder of 8 floats.  Each half of the pending group is stored
         * before the register holding it is overwritten by the new product.
         */
2:      vld1.32         {q0},     [r1,:128]!
        vld1.32         {q2},     [r2,:128]!
        vst1.32         {q8},     [r0,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q3},     [r2,:128]!
        vst1.32         {q9},     [r0,:128]!
        vmul.f32        q9,  q1,  q3

        /* Epilogue: write the last pending group. */
3:      vst1.32         {q8-q9},  [r0,:128]!
        bx              lr
endfunc
63 | ||
/*
 * ff_vector_fmac_scalar_neon: multiply a float vector by a scalar and
 * accumulate into the destination,
 *     dst[i] += src[i] * mul
 *
 * Registers as used by the code:
 *   r0   dst, write cursor (post-incremented stores)
 *   r1   src  (post-incremented loads)
 *   acc  dst, read cursor; initialised from r0
 *   len  number of floats
 *   r12  scratch: floats left for the 16-wide main loop
 *   q15  mul broadcast to all four lanes
 *
 * mul and len arrive in different registers depending on which of the
 * VFP / NOVFP lines is assembled:
 *   VFP:    mul = d0[0], len = r2, acc = r3
 *   NOVFP:  mul = r2,    len = r3, acc = r2
 * VFP and NOVFP are macros that are not defined in this file; presumably
 * asm.S enables exactly one of them according to the float calling
 * convention -- confirm there.
 *
 * All accesses carry a :128 qualifier, so dst and src must be 16-byte
 * aligned.  Blocks of 16 floats go through the pipelined main loop; what
 * is left is handled 4 floats at a time by a loop that tests its counter
 * only after the body, so whole groups of 4 are always processed and the
 * tail runs at least once whenever it is entered.
 *
 * Modifies r0-r3, r12, q0-q3, q8-q11, q15 and the condition flags.
 */
function ff_vector_fmac_scalar_neon, export=1
VFP     len .req r2
VFP     acc .req r3
NOVFP   len .req r3
NOVFP   acc .req r2
        /*
         * Broadcast the scalar first: in the NOVFP variant acc aliases r2,
         * so "mov acc, r0" below destroys the incoming scalar.
         */
VFP     vdup.32         q15, d0[0]
NOVFP   vdup.32         q15, r2
        bics            r12, len, #15           /* r12 = len rounded down to a multiple of 16 */
        mov             acc, r0                 /* plain mov: flags from bics are preserved */
        beq             3f                      /* fewer than 16 floats: tail loop only */

        /* Prime the pipeline with the first 8 floats of src and dst. */
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!

        /* Main loop, 16 floats per pass. */
1:      vmla.f32        q8,  q0,  q15
        vld1.32         {q2},     [r1,:128]!
        vld1.32         {q10},    [acc,:128]!
        vmla.f32        q9,  q1,  q15
        vld1.32         {q3},     [r1,:128]!
        vld1.32         {q11},    [acc,:128]!
        vmla.f32        q10, q2,  q15
        vst1.32         {q8},     [r0,:128]!
        vmla.f32        q11, q3,  q15
        vst1.32         {q9},     [r0,:128]!
        subs            r12, r12, #16
        beq             2f                      /* last block: no further preload */
        /* Preload the next block while storing the second half of this one. */
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vst1.32         {q10},    [r0,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
        vst1.32         {q11},    [r0,:128]!
        b               1b

        /* Drain the pipeline, then return unless a remainder is left. */
2:      vst1.32         {q10},    [r0,:128]!
        vst1.32         {q11},    [r0,:128]!
        ands            len, len, #15
        it              eq
        bxeq            lr

        /* Tail: 4 floats per pass. */
3:      vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vmla.f32        q8,  q0,  q15
        vst1.32         {q8},     [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        /* Release both aliases so neither name leaks into later code. */
        .unreq          len
        .unreq          acc
endfunc
111 | ||
/*
 * ff_vector_fmul_scalar_neon: multiply a float vector by a scalar,
 *     dst[i] = src[i] * mul
 *
 * Registers as used by the code:
 *   r0   dst  (post-incremented stores)
 *   r1   src  (post-incremented loads)
 *   len  number of floats
 *   r12  scratch: floats left for the 16-wide main loop
 *   q8   mul broadcast to all four lanes
 *
 * mul and len arrive in different registers depending on which of the
 * VFP / NOVFP lines is assembled:
 *   VFP:    mul = d0[0], len = r2
 *   NOVFP:  mul = r2,    len = r3
 * VFP and NOVFP are macros that are not defined in this file; presumably
 * asm.S enables exactly one of them according to the float calling
 * convention -- confirm there.
 *
 * All accesses carry a :128 qualifier, so dst and src must be 16-byte
 * aligned.  Blocks of 16 floats go through the pipelined main loop; what
 * is left is handled 4 floats at a time by a loop that tests its counter
 * only after the body, so whole groups of 4 are always processed and the
 * tail runs at least once whenever it is entered.
 *
 * Modifies r0-r3 (r2 or r3 as len), r12, q0-q3, q8 and the condition flags.
 */
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15           /* r12 = len rounded down to a multiple of 16 */
        beq             3f                      /* fewer than 16 floats: tail loop only */

        /* Prime the pipeline with the first 8 floats. */
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q1},     [r1,:128]!

        /* Main loop, 16 floats per pass; the products are formed in place. */
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},     [r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},     [r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},     [r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},     [r0,:128]!
        subs            r12, r12, #16
        beq             2f                      /* last block: no further preload */
        /* Preload the next block while storing the second half of this one. */
        vld1.32         {q0},     [r1,:128]!
        vst1.32         {q2},     [r0,:128]!
        vld1.32         {q1},     [r1,:128]!
        vst1.32         {q3},     [r0,:128]!
        b               1b

        /* Drain the pipeline, then return unless a remainder is left. */
2:      vst1.32         {q2},     [r0,:128]!
        vst1.32         {q3},     [r0,:128]!
        ands            len, len, #15
        it              eq
        bxeq            lr

        /* Tail: 4 floats per pass. */
3:      vld1.32         {q0},     [r1,:128]!
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},     [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc
149 | ||
/*
 * ff_vector_fmul_window_neon: windowed overlap of two float vectors.
 *
 * With n the count read from the stack, the code computes, for
 * i = 0 .. n-1:
 *     dst[i]        = src0[i] * win[2n-1-i] - src1[n-1-i] * win[i]
 *     dst[2n-1-i]   = src0[i] * win[i]      + src1[n-1-i] * win[2n-1-i]
 *
 * Registers as used by the code:
 *   r0   dst, forward cursor          r12  dst, backward cursor (&dst[2n-4])
 *   r1   src0, forward cursor         r2   src1, backward cursor (&src1[n-4])
 *   r3   win, forward cursor          r4   win, backward cursor (&win[2n-4])
 *   r5   -16, the byte stride of the backward cursors
 *   lr   n; loaded from [sp, #12] after three words were pushed, i.e. the
 *        word that was at [sp] on entry
 *
 * All accesses carry a :128 qualifier, so every cursor must be 16-byte
 * aligned.  Four values of i are handled per pass and the loop ends when
 * the counter reaches exactly zero, so n has to be a non-zero multiple
 * of 4.
 *
 * r4 and r5 are saved and restored; the return is done by popping the
 * saved lr into pc.  Also modifies r0-r3, r12, q0-q3, q9-q12 and the
 * condition flags.
 */
function ff_vector_fmul_window_neon, export=1
        push            {r4,r5,lr}
        ldr             lr,  [sp, #12]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5,  lsl #2   /* r2  = src1 + 4*n - 16 bytes */
        add             r4,  r3,  r5,  lsl #3   /* r4  = win  + 8*n - 16 bytes */
        add             r12, r0,  r5,  lsl #3   /* r12 = dst  + 8*n - 16 bytes */
        mov             r5,  #-16

        /* Prime: q0 = src0 fwd, q1 = src1 bwd, q2 = win fwd, q3 = win bwd. */
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q1},     [r2,:128], r5
        vld1.32         {q2},     [r3,:128]!
        vld1.32         {q3},     [r4,:128], r5

        /*
         * vrev64 swaps the two floats inside each d register; pairing the
         * low half of one operand with the high half of the other (d0 with
         * d7, d1 with d6, ...) completes the 4-element reversal.
         */
1:      subs            lr,  lr,  #4
        vmul.f32        d22, d0,  d4
        vrev64.32       q3,  q3
        vmul.f32        d23, d1,  d5
        vrev64.32       q1,  q1
        vmul.f32        d20, d0,  d7
        vmul.f32        d21, d1,  d6
        beq             2f                      /* last group: finish without loading */
        /*
         * Next src1/win data goes to q9/q12 first because q1 and q2 are
         * still read by the vmla/vmls interleaved with the loads.
         */
        vmla.f32        d22, d3,  d7
        vld1.32         {q0},     [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.32         {q9},     [r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.32         {q12},    [r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.32         {q3},     [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11                /* q11 was produced in descending index order; */
        vmov            q2,  q12
        vswp            d22, d23                /* vrev64 + vswp turn it into ascending order  */
        vst1.32         {q10},    [r0,:128]!
        vst1.32         {q11},    [r12,:128], r5
        b               1b

        /* Final group: same arithmetic as above, without the preloads. */
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.32         {q10},    [r0,:128]!
        vst1.32         {q11},    [r12,:128], r5
        pop             {r4,r5,pc}
endfunc
196 | ||
/*
 * ff_vector_fmul_add_neon: element-wise multiply-add of float vectors,
 *     dst[i] = src0[i] * src1[i] + src2[i]
 *
 * Registers as used by the code:
 *   r0   dst  (post-incremented stores)
 *   r1   src0 (post-incremented loads)
 *   r2   src1 (post-incremented loads)
 *   r3   src2 (post-incremented loads)
 *   r12  number of floats, read from the word at [sp] on entry
 *
 * All accesses carry a :128 qualifier, so the four pointers must be
 * 16-byte aligned.  Eight floats are handled per pass and the loop ends
 * when the counter reaches exactly zero, so the count has to be a non-zero
 * multiple of 8.
 *
 * The multiply for the next group is issued before the sum of the current
 * group is stored, so the products (q10/q11) and the sums (q12/q13) live
 * in separate registers.
 *
 * Modifies r0-r3, r12, q0-q3, q8-q13 and the condition flags.
 */
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]
        /* Prime: products of the first 8 floats plus the matching src2 data. */
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9

1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        /* Prefetch hints for the three input streams. */
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f                      /* last group: store and leave */
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b

2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
endfunc
223 | ||
/*
 * ff_vector_fmul_reverse_neon: multiply a float vector by a second vector
 * read back to front,
 *     dst[i] = src0[i] * src1[n-1-i]
 *
 * Registers as used by the code:
 *   r0   dst  (post-incremented stores)
 *   r1   src0 (post-incremented loads)
 *   r2   src1; moved to its last 8 floats, then stepped backwards
 *   r3   n, the number of floats
 *   r12  -32, the byte stride of the backward cursor
 *
 * All accesses carry a :128 qualifier, so the cursors must be 16-byte
 * aligned.  Eight floats are handled per pass and the loop ends when the
 * counter reaches exactly zero, so n has to be a non-zero multiple of 8.
 *
 * Modifies r0-r3, r12, q0-q3, q8-q9 and the condition flags.
 */
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        sub             r2,  r2,  #32           /* r2 = &src1[n-8] */
        mov             r12, #-32
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12

        /*
         * vrev64 swaps the two floats inside each d register; multiplying
         * d0..d3 by d7..d4 (in that order) completes the 8-element reversal.
         */
1:      pld             [r1, #32]
        vrev64.32       q3,  q3
        vmul.f32        d16, d0,  d7
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f                      /* last group: store and leave */
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b

2:      vst1.32         {q8-q9},  [r0,:128]!
        bx              lr
endfunc
247 | ||
/*
 * ff_butterflies_float_neon: in-place sum/difference of two float vectors.
 * For every element, using the values read before either is overwritten:
 *     t     = v1[i] - v2[i]
 *     v1[i] = v1[i] + v2[i]
 *     v2[i] = t
 *
 * Registers as used by the code:
 *   r0   v1 (read, then written with post-increment)
 *   r1   v2 (read, then written with post-increment)
 *   r2   number of floats
 *
 * Both pointers are accessed with a :128 qualifier and must be 16-byte
 * aligned.  Four floats are handled per pass; the counter is tested after
 * the body, so at least one group of 4 is always processed.
 *
 * Modifies r0-r2, q0-q2 and the condition flags.
 */
function ff_butterflies_float_neon, export=1
        /* Loads have no writeback: the stores below advance both pointers. */
1:      vld1.32         {q0},     [r0,:128]
        vld1.32         {q1},     [r1,:128]
        vsub.f32        q2,  q0,  q1
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2},     [r1,:128]!
        vst1.32         {q1},     [r0,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc
259 | ||
/*
 * ff_scalarproduct_float_neon: dot product of two float vectors,
 *     sum over i of v1[i] * v2[i]
 *
 * Registers as used by the code:
 *   r0   v1 (post-incremented loads)
 *   r1   v2 (post-incremented loads)
 *   r2   number of floats
 *
 * The result is left in d0[0]; the NOVFP line additionally copies it to
 * r0.  NOVFP is a macro that is not defined in this file; presumably asm.S
 * enables it only for the soft-float calling convention -- confirm there.
 *
 * Both pointers are accessed with a :128 qualifier and must be 16-byte
 * aligned.  Four floats are handled per pass; the counter is tested after
 * the body, so at least one group of 4 is always processed.
 *
 * Modifies r0-r2, q0-q2 and the condition flags.
 */
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0               /* four partial sums, one per lane */
1:      vld1.32         {q0},     [r0,:128]!
        vld1.32         {q1},     [r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        /* Horizontal reduction: 4 lanes -> 2 lanes -> 1 value in d0[0]. */
        vadd.f32        d0,  d4,  d5
        vpadd.f32       d0,  d0,  d0
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc