Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * ARM NEON optimised Float DSP functions | |
3 | * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | |
4 | * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net> | |
5 | * | |
6 | * This file is part of FFmpeg. | |
7 | * | |
8 | * FFmpeg is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License as published by the Free Software Foundation; either | |
11 | * version 2.1 of the License, or (at your option) any later version. | |
12 | * | |
13 | * FFmpeg is distributed in the hope that it will be useful, | |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 | * Lesser General Public License for more details. | |
17 | * | |
18 | * You should have received a copy of the GNU Lesser General Public | |
19 | * License along with FFmpeg; if not, write to the Free Software | |
20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 | */ | |
22 | ||
23 | #include "config.h" | |
24 | #include "asm.S" | |
25 | ||
/*
 * ff_vector_fmul_neon: element-wise product of two float vectors.
 *
 *   x0  output pointer (written, advanced 64 bytes per iteration)
 *   x1  first input pointer
 *   x2  second input pointer
 *   w3  element counter; 16 floats are handled per iteration and the loop
 *       only exits when the counter reaches exactly zero
 *
 * Scratch: v16-v23, flags.
 */
function ff_vector_fmul_neon, export=1
1:      ld1             {v16.4s, v17.4s}, [x1], #32
        ld1             {v18.4s, v19.4s}, [x1], #32
        ld1             {v20.4s, v21.4s}, [x2], #32
        ld1             {v22.4s, v23.4s}, [x2], #32
        subs            w3,  w3,  #16                   // flags consumed by b.ne below
        fmul            v16.4s, v16.4s, v20.4s
        fmul            v17.4s, v17.4s, v21.4s
        fmul            v18.4s, v18.4s, v22.4s
        fmul            v19.4s, v19.4s, v23.4s
        st1             {v16.4s, v17.4s}, [x0], #32
        st1             {v18.4s, v19.4s}, [x0], #32
        b.ne            1b
        ret
endfunc
41 | ||
/*
 * ff_vector_fmac_scalar_neon: multiply-accumulate by a scalar, in place.
 * Each float at x0 becomes itself plus (matching float at x1) * v0.s[0].
 *
 *   x0     accumulator pointer (read, then overwritten)
 *   x1     input pointer
 *   v0.s0  scalar multiplier
 *   w2     element counter; 16 floats per iteration, loop exits only when
 *          the counter reaches exactly zero
 *
 * Scratch: v16-v23, flags.
 */
function ff_vector_fmac_scalar_neon, export=1
1:      ld1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x0]  // no write-back: x0 is reused by the store
        ld1             {v20.4s, v21.4s, v22.4s, v23.4s}, [x1], #64
        subs            w2,  w2,  #16
        fmla            v16.4s, v20.4s, v0.s[0]
        fmla            v17.4s, v21.4s, v0.s[0]
        fmla            v18.4s, v22.4s, v0.s[0]
        fmla            v19.4s, v23.4s, v0.s[0]
        st1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
        b.ne            1b
        ret
endfunc
58 | ||
/*
 * ff_vector_fmul_scalar_neon: multiply a float vector by a scalar.
 *
 *   x0     output pointer
 *   x1     input pointer
 *   v0.s0  scalar multiplier
 *   w2     element count
 *
 * The bulk of the data is handled 16 floats at a time by a software
 * pipelined loop (labels 1/2); whatever is left (count & 15) is handled
 * four floats at a time by the loop at label 3.
 *
 * Scratch: w3, v4, v16-v19, flags.
 */
function ff_vector_fmul_scalar_neon, export=1
        ands            w3,  w2,  #0xfffffff0           // w3 = count with the low 4 bits cleared
        dup             v4.4s,  v0.s[0]                 // broadcast the scalar (does not touch flags)
        b.eq            3f                              // fewer than 16 elements: tail loop only
        ld1             {v16.4s, v17.4s}, [x1], #32     // prime the pipeline
1:      subs            w3,  w3,  #16
        fmul            v16.4s, v16.4s, v4.4s
        ld1             {v18.4s, v19.4s}, [x1], #32
        fmul            v17.4s, v17.4s, v4.4s
        fmul            v18.4s, v18.4s, v4.4s
        st1             {v16.4s, v17.4s}, [x0], #32
        fmul            v19.4s, v19.4s, v4.4s
        b.eq            2f                              // last block: skip the look-ahead load
        ld1             {v16.4s, v17.4s}, [x1], #32
        st1             {v18.4s, v19.4s}, [x0], #32
        b               1b
2:      ands            w2,  w2,  #15                   // remaining elements after the 16-wide part
        st1             {v18.4s, v19.4s}, [x0], #32
        b.eq            4f
3:      ld1             {v16.4s}, [x1], #16
        fmul            v16.4s, v16.4s, v4.4s
        st1             {v16.4s}, [x0], #16
        subs            w2,  w2,  #4
        b.gt            3b
4:      ret
endfunc
86 | ||
/*
 * ff_vector_dmul_scalar_neon: multiply a vector of doubles by a scalar.
 *
 *   x0     output pointer
 *   x1     input pointer
 *   v0.d0  scalar multiplier
 *   w2     element counter; 8 doubles (64 bytes) are handled per iteration
 *
 * The loop is software pipelined: the first half of the next block is
 * loaded while the second half of the current block is still in flight.
 * The look-ahead load is skipped on the final iteration, so nothing is
 * read beyond the last block of the input.
 *
 * Scratch: v0-v3, v16, flags.
 */
function ff_vector_dmul_scalar_neon, export=1
        dup             v16.2d, v0.d[0]                 // broadcast the scalar before v0 is reused
        ld1             {v0.2d, v1.2d}, [x1], #32       // prime the pipeline
1:      subs            w2,  w2,  #8
        fmul            v0.2d,  v0.2d,  v16.2d
        ld1             {v2.2d, v3.2d}, [x1], #32
        fmul            v1.2d,  v1.2d,  v16.2d
        fmul            v2.2d,  v2.2d,  v16.2d
        st1             {v0.2d, v1.2d}, [x0], #32
        fmul            v3.2d,  v3.2d,  v16.2d
        b.le            2f                              // done: do not load past the end of the input
        ld1             {v0.2d, v1.2d}, [x1], #32
        st1             {v2.2d, v3.2d}, [x0], #32
        b               1b
2:      st1             {v2.2d, v3.2d}, [x0], #32
        ret
endfunc
102 | ||
/*
 * ff_vector_fmul_window_neon: windowed overlap of two float vectors.
 *
 *   x0  dst   (2 * len floats written: front half forwards via x0,
 *              back half backwards via x5)
 *   x1  src0  (read forwards)
 *   x2  src1  (read backwards, starting at its last four floats)
 *   x3  win   (2 * len floats: front half read forwards via x3,
 *              back half read backwards via x6)
 *   w4  len   (sign-extended; four elements per iteration, loop exits
 *              only when the counter reaches exactly zero)
 *
 * Register roles inside the loop:
 *   v20 = s0, v21 = s1, v22 = wi, v23 = wj,
 *   v24 = s1 reversed, v25 = wj reversed, v26/v27 = results.
 *
 * Scratch: x2-x7, v20-v27, flags.
 */
function ff_vector_fmul_window_neon, export=1
        sxtw            x4,  w4                         // len
        add             x2,  x2,  x4,  lsl #2
        add             x6,  x3,  x4,  lsl #3
        add             x5,  x0,  x4,  lsl #3
        sub             x2,  x2,  #16                   // src1 + 4 * (len - 4)
        sub             x6,  x6,  #16                   // win  + 8 * (len - 2)
        sub             x5,  x5,  #16                   // dst  + 8 * (len - 2)
        mov             x7,  #-16                       // stride for the backwards-moving pointers
        ld1             {v20.4s}, [x1], #16             // s0
        ld1             {v22.4s}, [x3], #16             // wi
        ld1             {v21.4s}, [x2], x7              // s1
1:      ld1             {v23.4s}, [x6], x7              // wj
        subs            x4,  x4,  #4
        fmul            v27.4s, v20.4s, v22.4s          // s0 * wi
        rev64           v24.4s, v21.4s
        rev64           v25.4s, v23.4s
        rev64           v27.4s, v27.4s
        ext             v24.16b, v24.16b, v24.16b, #8   // s1_r
        ext             v25.16b, v25.16b, v25.16b, #8   // wj_r
        ext             v27.16b, v27.16b, v27.16b, #8   // (s0 * wi)_rev
        fmul            v26.4s, v20.4s, v25.4s          // s0 * wj_r
        fmla            v27.4s, v21.4s, v23.4s          // (s0 * wi)_rev + s1 * wj
        b.eq            2f                              // last block: no further loads
        ld1             {v20.4s}, [x1], #16
        fmls            v26.4s, v24.4s, v22.4s          // s0 * wj_r - s1_r * wi
        st1             {v27.4s}, [x5], x7
        ld1             {v22.4s}, [x3], #16
        ld1             {v21.4s}, [x2], x7
        st1             {v26.4s}, [x0], #16
        b               1b
2:
        fmls            v26.4s, v24.4s, v22.4s          // s0 * wj_r - s1_r * wi
        st1             {v27.4s}, [x5], x7
        st1             {v26.4s}, [x0], #16
        ret
endfunc
139 | ||
/*
 * ff_vector_fmul_add_neon: element-wise multiply-add of float vectors.
 * Each output float is (float at x1) * (float at x2) + (float at x3).
 *
 *   x0  output pointer
 *   x1  first factor pointer
 *   x2  second factor pointer
 *   x3  addend pointer
 *   w4  element counter; 8 floats per iteration, loop exits only when
 *       the counter reaches exactly zero
 *
 * The next block of inputs is fetched before the current result is
 * stored; the fetch is skipped on the final iteration.
 *
 * Scratch: v16-v21, flags.
 */
function ff_vector_fmul_add_neon, export=1
        ld1             {v16.4s, v17.4s}, [x1], #32
        ld1             {v18.4s, v19.4s}, [x2], #32
        ld1             {v20.4s, v21.4s}, [x3], #32
1:      subs            w4,  w4,  #8
        fmla            v20.4s, v16.4s, v18.4s
        fmla            v21.4s, v17.4s, v19.4s
        b.eq            2f
        ld1             {v16.4s, v17.4s}, [x1], #32
        ld1             {v18.4s, v19.4s}, [x2], #32
        st1             {v20.4s, v21.4s}, [x0], #32
        ld1             {v20.4s, v21.4s}, [x3], #32
        b               1b
2:      st1             {v20.4s, v21.4s}, [x0], #32
        ret
endfunc
156 | ||
/*
 * ff_vector_fmul_reverse_neon: multiply one float vector by another one
 * that is traversed back to front.
 *
 *   x0  output pointer
 *   x1  input pointer, read forwards
 *   x2  input pointer, read backwards starting from its last 8 floats
 *   w3  element counter (sign-extended); 8 floats per iteration, loop
 *       exits only when the counter reaches exactly zero
 *
 * rev64 + ext #8 together reverse the four lanes of a vector; the two
 * registers of each backwards block are additionally used in swapped
 * order.
 *
 * Scratch: x2-x4, v16-v21, flags.
 */
function ff_vector_fmul_reverse_neon, export=1
        sxtw            x3,  w3
        sub             x2,  x2,  #32
        add             x2,  x2,  x3,  lsl #2           // last 8 floats of the reversed input
        mov             x4,  #-32                       // backwards stride
        ld1             {v18.4s, v19.4s}, [x2], x4
        ld1             {v16.4s, v17.4s}, [x1], #32
1:      subs            x3,  x3,  #8
        rev64           v19.4s, v19.4s
        rev64           v18.4s, v18.4s
        ext             v19.16b, v19.16b, v19.16b, #8
        ext             v18.16b, v18.16b, v18.16b, #8
        fmul            v20.4s, v16.4s, v19.4s
        fmul            v21.4s, v17.4s, v18.4s
        b.eq            2f                              // last block: no further loads
        ld1             {v18.4s, v19.4s}, [x2], x4
        ld1             {v16.4s, v17.4s}, [x1], #32
        st1             {v20.4s, v21.4s}, [x0], #32
        b               1b
2:      st1             {v20.4s, v21.4s}, [x0], #32
        ret
endfunc
179 | ||
/*
 * ff_butterflies_float_neon: in-place sum/difference of two float vectors.
 * After each step the buffer at x0 holds a + b and the buffer at x1 holds
 * a - b, where a and b are the previous contents of x0 and x1.
 *
 *   x0  first buffer  (read, then overwritten with the sums)
 *   x1  second buffer (read, then overwritten with the differences)
 *   w2  element counter; 4 floats per iteration, loop runs while the
 *       counter stays positive
 *
 * Scratch: v16-v19, flags.
 */
function ff_butterflies_float_neon, export=1
1:      ld1             {v16.4s}, [x0]
        ld1             {v17.4s}, [x1]
        subs            w2,  w2,  #4
        fsub            v18.4s, v16.4s, v17.4s
        fadd            v19.4s, v16.4s, v17.4s
        st1             {v18.4s}, [x1], #16
        st1             {v19.4s}, [x0], #16
        b.gt            1b
        ret
endfunc
191 | ||
/*
 * ff_scalarproduct_float_neon: dot product of two float vectors.
 *
 *   x0  first input pointer
 *   x1  second input pointer
 *   w2  element counter; 4 floats per iteration, loop runs while the
 *       counter stays positive
 *
 * Returns the sum in s0. Four per-lane partial sums are kept in v16 and
 * folded pairwise after the loop.
 *
 * Scratch: v0, v16-v18, flags.
 */
function ff_scalarproduct_float_neon, export=1
        movi            v16.4s, #0                      // per-lane accumulators
1:      ld1             {v17.4s}, [x0], #16
        ld1             {v18.4s}, [x1], #16
        subs            w2,  w2,  #4
        fmla            v16.4s, v17.4s, v18.4s
        b.gt            1b
        faddp           v0.4s,  v16.4s, v16.4s          // lanes (0+1, 2+3, 0+1, 2+3)
        faddp           s0,  v0.2s                      // final horizontal add
        ret
endfunc