2 * ARM NEON optimised Float DSP functions
3 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
/* ff_vector_fmul_neon
 * In the lines visible here: r1 and r2 are read-only input pointers and r0 is
 * the output pointer. Every access carries a :128 alignment qualifier and
 * uses post-increment writeback ("!").
 * NOTE(review): the embedded line numbers skip, so the arithmetic that fills
 * d16-d23, the loop counter handling, the branches to labels 2/3 and the
 * return are not shown in this excerpt. Presumably dst[i] = src0[i] * src1[i]
 * -- confirm against the full file. */
25 function ff_vector_fmul_neon, export=1
/* Prologue: preload 32 bytes (8 x 32-bit elements) from each input. */
27 vld1.32 {d0-d3}, [r1,:128]!
28 vld1.32 {d4-d7}, [r2,:128]!
/* Unrolled body: inputs are reloaded in 16-byte pieces while previously
 * produced results in d16-d19 / d20-d23 are written out 32 bytes at a time,
 * i.e. loads for the next step are interleaved with stores of the last one. */
35 vld1.32 {d0-d1}, [r1,:128]!
36 vld1.32 {d4-d5}, [r2,:128]!
38 vld1.32 {d2-d3}, [r1,:128]!
39 vld1.32 {d6-d7}, [r2,:128]!
41 vst1.32 {d16-d19},[r0,:128]!
42 vld1.32 {d0-d1}, [r1,:128]!
43 vld1.32 {d4-d5}, [r2,:128]!
45 vld1.32 {d2-d3}, [r1,:128]!
46 vld1.32 {d6-d7}, [r2,:128]!
48 vst1.32 {d20-d23},[r0,:128]!
/* Label 2: second path that stores d16-d19 in two 16-byte halves, each
 * following a 16-byte load pair (presumably a shorter remainder loop --
 * the branch that reaches it is not visible here). */
52 2: vld1.32 {d0-d1}, [r1,:128]!
53 vld1.32 {d4-d5}, [r2,:128]!
54 vst1.32 {d16-d17},[r0,:128]!
56 vld1.32 {d2-d3}, [r1,:128]!
57 vld1.32 {d6-d7}, [r2,:128]!
58 vst1.32 {d18-d19},[r0,:128]!
/* Label 3: final 32-byte store of the pending results in d16-d19. */
60 3: vst1.32 {d16-d19},[r0,:128]!
/* ff_vector_fmac_scalar_neon
 * In the lines visible here: r1 is read as a stream of 16-byte vectors, the
 * location named by "acc" is read as a second stream, and results are stored
 * through r0. All accesses are :128-qualified with post-increment.
 * NOTE(review): "acc" is a register alias and VFP is a line prefix; both are
 * defined outside this excerpt. Only one vmla is visible; the remaining
 * multiply-accumulates, the loop control and the return are on lines that
 * are not shown. Presumably dst[i] += src[i] * scalar -- confirm. */
64 function ff_vector_fmac_scalar_neon, export=1
/* Broadcast lane 0 of d0 into all four lanes of q15 (the multiplier used by
 * the vmla below). Presumably the VFP prefix selects this line only for the
 * hard-float calling convention -- verify against the macro definition. */
69 VFP vdup.32 q15, d0[0]
/* Prologue: preload two source vectors (q0, q1) and two accumulator vectors
 * (q8, q9). */
74 vld1.32 {q0}, [r1,:128]!
75 vld1.32 {q8}, [acc,:128]!
76 vld1.32 {q1}, [r1,:128]!
77 vld1.32 {q9}, [acc,:128]!
/* Label 1: q8 += q0 * q15, then fetch the next two source/accumulator pairs
 * into q2/q10 and q3/q11 before storing q8 and q9. */
78 1: vmla.f32 q8, q0, q15
79 vld1.32 {q2}, [r1,:128]!
80 vld1.32 {q10}, [acc,:128]!
82 vld1.32 {q3}, [r1,:128]!
83 vld1.32 {q11}, [acc,:128]!
85 vst1.32 {q8}, [r0,:128]!
87 vst1.32 {q9}, [r0,:128]!
/* Reload q0/q8 and q1/q9 for the next step, interleaved with the stores of
 * q10 and q11. */
90 vld1.32 {q0}, [r1,:128]!
91 vld1.32 {q8}, [acc,:128]!
92 vst1.32 {q10}, [r0,:128]!
93 vld1.32 {q1}, [r1,:128]!
94 vld1.32 {q9}, [acc,:128]!
95 vst1.32 {q11}, [r0,:128]!
/* Label 2: store the pending q10/q11 results without loading further input. */
97 2: vst1.32 {q10}, [r0,:128]!
98 vst1.32 {q11}, [r0,:128]!
/* Label 3: one-vector-at-a-time path: load one source and one accumulator
 * vector, then store q8 (the operation between load and store is not
 * visible here). */
102 3: vld1.32 {q0}, [r1,:128]!
103 vld1.32 {q8}, [acc,:128]!
105 vst1.32 {q8}, [r0,:128]!
/* ff_vector_fmul_scalar_neon
 * In the lines visible here: 16-byte vectors are loaded from r1 and stored
 * through r0, all :128-qualified with post-increment, and q8 holds a
 * broadcast multiplier.
 * NOTE(review): only the multiply of q0 is visible; the multiplies of
 * q1-q3, the loop control and the return are on lines not shown in this
 * excerpt. Presumably dst[i] = src[i] * scalar -- confirm. */
112 function ff_vector_fmul_scalar_neon, export=1
/* Broadcast lane 0 of d0 into all four lanes of q8. Presumably the VFP
 * prefix selects this line only for the hard-float calling convention --
 * verify against the macro definition, which is outside this excerpt. */
115 VFP vdup.32 q8, d0[0]
/* Prologue: preload two input vectors. */
119 vld1.32 {q0},[r1,:128]!
120 vld1.32 {q1},[r1,:128]!
/* Label 1: q0 = q0 * q8, then load the next two inputs into q2/q3 and store
 * q0 and q1. */
121 1: vmul.f32 q0, q0, q8
122 vld1.32 {q2},[r1,:128]!
124 vld1.32 {q3},[r1,:128]!
126 vst1.32 {q0},[r0,:128]!
128 vst1.32 {q1},[r0,:128]!
/* Reload q0/q1 for the next step, interleaved with the stores of q2/q3. */
131 vld1.32 {q0},[r1,:128]!
132 vst1.32 {q2},[r0,:128]!
133 vld1.32 {q1},[r1,:128]!
134 vst1.32 {q3},[r0,:128]!
/* Label 2: store the pending q2/q3 without loading further input. */
136 2: vst1.32 {q2},[r0,:128]!
137 vst1.32 {q3},[r0,:128]!
/* Label 3: single-vector path: load q0, then store q0 (the operation in
 * between is not visible here). */
141 3: vld1.32 {q0},[r1,:128]!
143 vst1.32 {q0},[r0,:128]!
/* ff_vector_fmul_window_neon
 * In the lines visible here: four input streams are read through r1, r2, r3
 * and r4, and two output streams are written through r0 and ip. r1, r3 and
 * r0 advance by post-increment ("!"); r2, r4 and ip are post-indexed by the
 * register r5.
 * NOTE(review): the instructions that set r5 (both before the address setup
 * and before the loads), save/restore r4/r5, perform most of the arithmetic
 * and return are not shown in this excerpt -- do not assume r5 holds the
 * same value in the address setup and in the post-indexed accesses. */
150 function ff_vector_fmul_window_neon, export=1
/* Address setup: r2 += r5 * 4; r4 = r3 + r5 * 8; ip = r0 + r5 * 8. */
155 add r2, r2, r5, lsl #2
156 add r4, r3, r5, lsl #3
157 add ip, r0, r5, lsl #3
/* Prologue: preload one 16-byte vector from each of the four input streams. */
159 vld1.32 {d0,d1}, [r1,:128]!
160 vld1.32 {d2,d3}, [r2,:128], r5
161 vld1.32 {d4,d5}, [r3,:128]!
162 vld1.32 {d6,d7}, [r4,:128], r5
/* Loop body: the next vectors are loaded into d0-d1, d18-d19, d24-d25 and
 * d6-d7 before the results in d20-d21 and d22-d23 are written to the r0 and
 * ip streams respectively. */
172 vld1.32 {d0,d1}, [r1,:128]!
174 vld1.32 {d18,d19},[r2,:128], r5
176 vld1.32 {d24,d25},[r3,:128]!
178 vld1.32 {d6,d7}, [r4,:128], r5
183 vst1.32 {d20,d21},[r0,:128]!
184 vst1.32 {d22,d23},[ip,:128], r5
/* Label 2: tail; d22 += d3 * d7, followed (after lines not shown) by the
 * final stores of d20-d21 and d22-d23. */
186 2: vmla.f32 d22, d3, d7
192 vst1.32 {d20,d21},[r0,:128]!
193 vst1.32 {d22,d23},[ip,:128], r5
/* ff_vector_fmul_add_neon
 * In the lines visible here: three input streams are read through r1, r2 and
 * r3 and results are stored through r0, all :128-qualified with
 * post-increment. The visible arithmetic adds the r3 data (q2, q3) to q10
 * and q11.
 * NOTE(review): the instructions that produce q10/q11, the loop control and
 * the return are not shown in this excerpt. Presumably
 * dst[i] = src0[i] * src1[i] + src2[i] -- confirm against the full file. */
197 function ff_vector_fmul_add_neon, export=1
/* Prologue: preload 32 bytes (8 x 32-bit elements) from each input. */
199 vld1.32 {q0-q1}, [r1,:128]!
200 vld1.32 {q8-q9}, [r2,:128]!
201 vld1.32 {q2-q3}, [r3,:128]!
/* Label 1: q12 = q2 + q10, q13 = q3 + q11. */
204 1: vadd.f32 q12, q2, q10
205 vadd.f32 q13, q3, q11
/* Reload all three inputs for the next step, then store the sums just
 * computed in q12/q13. */
211 vld1.32 {q0}, [r1,:128]!
212 vld1.32 {q8}, [r2,:128]!
214 vld1.32 {q1}, [r1,:128]!
215 vld1.32 {q9}, [r2,:128]!
217 vld1.32 {q2-q3}, [r3,:128]!
218 vst1.32 {q12-q13},[r0,:128]!
/* Label 2: final store of q12/q13 without loading further input. */
220 2: vst1.32 {q12-q13},[r0,:128]!
/* ff_vector_fmul_reverse_neon
 * In the lines visible here: r1 is read with post-increment, r2 is read with
 * post-indexing by r12, and results in q8/q9 are stored through r0.
 * NOTE(review): the instructions that set r12, produce q8/q9, control the
 * loop and return are not shown in this excerpt. Presumably r12 is negative
 * so that r2 walks backwards through the second input -- confirm. */
224 function ff_vector_fmul_reverse_neon, export=1
/* r2 += r3 * 4 (r3 scaled by the 4-byte element size). */
225 add r2, r2, r3, lsl #2
/* Prologue: preload 32 bytes from each input. */
228 vld1.32 {q0-q1}, [r1,:128]!
229 vld1.32 {q2-q3}, [r2,:128], r12
/* Loop body: load the next 32 bytes from each input, then store the pending
 * results. */
240 vld1.32 {q0-q1}, [r1,:128]!
241 vld1.32 {q2-q3}, [r2,:128], r12
242 vst1.32 {q8-q9}, [r0,:128]!
/* Label 2: final store of q8/q9 without loading further input. */
244 2: vst1.32 {q8-q9}, [r0,:128]!
/* ff_butterflies_float_neon
 * In the lines visible here: both buffers (r0 and r1) are updated in place,
 * 16 bytes (4 x 32-bit elements) per iteration.
 * NOTE(review): the arithmetic that produces q2 and the new q1, the loop
 * control and the return are not shown in this excerpt. Presumably q1 holds
 * the sum and q2 the difference of the two inputs -- confirm. */
248 function ff_butterflies_float_neon, export=1
/* Label 1: load one vector from each buffer WITHOUT writeback, so the
 * pointers still address the same elements for the stores below. */
249 1: vld1.32 {q0},[r0,:128]
250 vld1.32 {q1},[r1,:128]
/* Write q2 back through r1 and q1 back through r0; the post-increment on
 * these stores is what advances both pointers to the next vector. */
253 vst1.32 {q2},[r1,:128]!
254 vst1.32 {q1},[r0,:128]!
/* ff_scalarproduct_float_neon
 * In the lines visible here: 16-byte vectors are read from r0 and r1 with
 * post-increment, and lane 0 of d0 is finally copied to r0.
 * NOTE(review): the accumulation, the horizontal reduction, the loop control
 * and the return are not shown in this excerpt. Presumably this computes the
 * dot product of the two float arrays -- confirm against the full file. */
260 function ff_scalarproduct_float_neon, export=1
/* Label 1: load the next vector from each input. */
262 1: vld1.32 {q0},[r0,:128]!
263 vld1.32 {q1},[r1,:128]!
/* Copy lane 0 of d0 into the core register r0. Presumably the NOVFP prefix
 * selects this line only for the soft-float calling convention, where a
 * float result is returned in r0 -- verify against the macro definition. */
269 NOVFP vmov.32 r0, d0[0]