Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | |
3 | * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net> | |
4 | * | |
5 | * This file is part of FFmpeg. | |
6 | * | |
7 | * FFmpeg is free software; you can redistribute it and/or | |
8 | * modify it under the terms of the GNU Lesser General Public | |
9 | * License as published by the Free Software Foundation; either | |
10 | * version 2.1 of the License, or (at your option) any later version. | |
11 | * | |
12 | * FFmpeg is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | * Lesser General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU Lesser General Public | |
18 | * License along with FFmpeg; if not, write to the Free Software | |
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | */ | |
21 | ||
22 | #include "config.h" | |
23 | #include "libavutil/aarch64/asm.S" | |
24 | ||
// ff_conv_flt_to_s16_neon: convert a packed float buffer to signed 16-bit.
//
// In:    x0  = destination (16-bit samples)
//        x1  = source (32-bit floats)
//        w2  = number of samples
// Clobb: x0, x1, w2, w12, v0-v7, flags
//
// Each sample goes through fcvtzs #31 (float -> saturated fixed point with
// 31 fractional bits, rounding toward zero) followed by sqrshrn #16
// (rounding, saturating narrow to 16 bits).
//
// The first 8 samples are always read and written, and both tails store
// 8 or 16 samples, so the count is presumably a non-zero multiple of 8 --
// confirm against the callers.
//
// The count is handled as a 32-bit value: ff_conv_fltp_to_s16_neon treats
// the same argument as w2 and tail-calls here without extending it, and
// AAPCS64 leaves the upper half of a register that carries a 32-bit
// argument unspecified.
function ff_conv_flt_to_s16_neon, export=1
        subs            w2,  w2,  #8                    // w2 = len - 8; Z set when len == 8
        ld1             {v0.4s},  [x1], #16
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x1], #16
        fcvtzs          v5.4s,  v1.4s,  #31
        b.eq            3f                              // exactly 8 samples
        ands            w12, w2,  #0xfffffff0           // w12 = (len - 8) & ~15
        b.eq            2f                              // fewer than 16 left after the first 8
        // Main loop: 16 samples are stored per iteration. v4/v5 hold the
        // next 8 converted (not yet narrowed) samples on entry and on exit.
1:      subs            w12, w12, #16
        sqrshrn         v4.4h,  v4.4s,  #16
        ld1             {v2.4s},  [x1], #16
        fcvtzs          v6.4s,  v2.4s,  #31
        sqrshrn2        v4.8h,  v5.4s,  #16
        ld1             {v3.4s},  [x1], #16
        fcvtzs          v7.4s,  v3.4s,  #31
        sqrshrn         v6.4h,  v6.4s,  #16
        st1             {v4.8h},  [x0], #16
        sqrshrn2        v6.8h,  v7.4s,  #16
        ld1             {v0.4s},  [x1], #16
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x1], #16
        fcvtzs          v5.4s,  v1.4s,  #31
        st1             {v6.8h},  [x0], #16
        b.ne            1b
        ands            w2,  w2,  #15                   // anything left beyond v4/v5?
        b.eq            3f
        // Tail: the 8 pending samples in v4/v5 plus 8 more loaded here.
2:      ld1             {v2.4s},  [x1], #16
        sqrshrn         v4.4h,  v4.4s,  #16
        fcvtzs          v6.4s,  v2.4s,  #31
        ld1             {v3.4s},  [x1], #16
        sqrshrn2        v4.8h,  v5.4s,  #16
        fcvtzs          v7.4s,  v3.4s,  #31
        sqrshrn         v6.4h,  v6.4s,  #16
        st1             {v4.8h},  [x0], #16
        sqrshrn2        v6.8h,  v7.4s,  #16
        st1             {v6.8h},  [x0]
        ret
        // Tail: only the 8 pending samples in v4/v5.
3:      sqrshrn         v4.4h,  v4.4s,  #16
        sqrshrn2        v4.8h,  v5.4s,  #16
        st1             {v4.8h},  [x0]
        ret
endfunc
68 | ||
// ff_conv_fltp_to_s16_2ch_neon: convert two planar float channels to
// interleaved signed 16-bit.
//
// In:    x0  = destination (interleaved 16-bit samples)
//        x1  = pointer to two source pointers (one per channel)
//        w2  = number of samples per channel
// Clobb: x0, w2, x4, x5, w12, v0-v7, v16-v23, flags
//
// Each sample is converted with fcvtzs #31 (float -> saturated fixed point
// with 31 fractional bits, rounding toward zero). "sri Vd, Vn, #16" then
// puts the top 16 bits of the first channel into the low half of every
// 32-bit lane of the second channel, whose own top 16 bits stay in the high
// half. The 16-bit result is therefore the truncated top half of the
// fixed-point value; no rounding is applied on this path.
//
// The first 8 samples per channel are always read and written and both
// tails store 8 or 16 sample pairs, so the count is presumably a non-zero
// multiple of 8 -- confirm against the callers.
//
// The count is handled as a 32-bit value: ff_conv_fltp_to_s16_neon treats
// the same argument as w2 and tail-calls here without extending it, and
// AAPCS64 leaves the upper half of a register that carries a 32-bit
// argument unspecified.
function ff_conv_fltp_to_s16_2ch_neon, export=1
        ldp             x4,  x5,  [x1]                  // x4 = channel 0, x5 = channel 1
        subs            w2,  w2,  #8                    // w2 = len - 8; Z set when len == 8
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v5.4s,  v1.4s,  #31
        ld1             {v2.4s},  [x5], #16
        fcvtzs          v6.4s,  v2.4s,  #31
        ld1             {v3.4s},  [x5], #16
        fcvtzs          v7.4s,  v3.4s,  #31
        b.eq            3f                              // exactly 8 samples per channel
        ands            w12, w2,  #0xfffffff0           // w12 = (len - 8) & ~15
        b.eq            2f                              // fewer than 16 left after the first 8
        // Main loop: 16 sample pairs are stored per iteration. v4/v5
        // (channel 0) and v6/v7 (channel 1) hold the next 8 converted
        // samples on entry and on exit.
1:      subs            w12, w12, #16
        ld1             {v16.4s}, [x4], #16
        fcvtzs          v20.4s, v16.4s, #31
        sri             v6.4s,  v4.4s,  #16
        ld1             {v17.4s}, [x4], #16
        fcvtzs          v21.4s, v17.4s, #31
        ld1             {v18.4s}, [x5], #16
        fcvtzs          v22.4s, v18.4s, #31
        ld1             {v19.4s}, [x5], #16
        sri             v7.4s,  v5.4s,  #16
        st1             {v6.4s},  [x0], #16
        fcvtzs          v23.4s, v19.4s, #31
        st1             {v7.4s},  [x0], #16
        sri             v22.4s, v20.4s, #16
        ld1             {v0.4s},  [x4], #16
        sri             v23.4s, v21.4s, #16
        st1             {v22.4s}, [x0], #16
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v5.4s,  v1.4s,  #31
        ld1             {v2.4s},  [x5], #16
        fcvtzs          v6.4s,  v2.4s,  #31
        ld1             {v3.4s},  [x5], #16
        fcvtzs          v7.4s,  v3.4s,  #31
        st1             {v23.4s}, [x0], #16
        b.ne            1b
        ands            w2,  w2,  #15                   // anything left beyond v4-v7?
        b.eq            3f
        // Tail: the 8 pending sample pairs in v4-v7 plus 8 more loaded here.
2:      sri             v6.4s,  v4.4s,  #16
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v1.4s,  v1.4s,  #31
        ld1             {v2.4s},  [x5], #16
        fcvtzs          v2.4s,  v2.4s,  #31
        sri             v7.4s,  v5.4s,  #16
        ld1             {v3.4s},  [x5], #16
        fcvtzs          v3.4s,  v3.4s,  #31
        sri             v2.4s,  v0.4s,  #16
        st1             {v6.4s,v7.4s},  [x0], #32
        sri             v3.4s,  v1.4s,  #16
        st1             {v2.4s,v3.4s},  [x0], #32
        ret
        // Tail: only the 8 pending sample pairs in v4-v7.
3:      sri             v6.4s,  v4.4s,  #16
        sri             v7.4s,  v5.4s,  #16
        st1             {v6.4s,v7.4s},  [x0]
        ret
endfunc
131 | ||
// ff_conv_fltp_to_s16_neon: convert planar float channels to interleaved
// signed 16-bit for an arbitrary channel count.
//
// In:    x0  = destination (interleaved 16-bit samples)
//        x1  = pointer to the array of per-channel source pointers
//        w2  = number of samples per channel
//        w3  = number of channels
// Clobb: x0, x1, w3, x4-x9, x12, v0-v7, v16-v19, flags
//
// Dispatch:
//   channels == 2 : tail-call ff_conv_fltp_to_s16_2ch_neon
//   channels <  2 : load the first source pointer into x1 and tail-call
//                   ff_conv_flt_to_s16_neon
//   otherwise     : handle the channels in groups of 4, then 2, then 1.
//                   Every group walks all samples and writes its columns
//                   with a stride of x12 = 2 * channels bytes; x0 is then
//                   advanced past the columns just written.
//
// On the paths implemented here each sample is converted with fcvtzs #31
// (float -> saturated fixed point with 31 fractional bits, rounding toward
// zero) and the top 16 bits are stored (via sri #16 or by storing the odd
// halfword lanes), i.e. without rounding.
//
// The loops consume 8 or 16 samples per step and always process at least
// 8, so the sample count is presumably a non-zero multiple of 8 -- confirm
// against the callers.
function ff_conv_fltp_to_s16_neon, export=1
        cmp             w3,  #2
        b.eq            X(ff_conv_fltp_to_s16_2ch_neon)
        b.gt            1f
        ldr             x1,  [x1]
        b               X(ff_conv_flt_to_s16_neon)
1:
        cmp             w3,  #4
        // Output stride in bytes. Computed from w3 because the channel count
        // is a 32-bit value everywhere else in this function; writing w12
        // zeroes the upper half of x12. lsl leaves the flags of the cmp
        // above intact for the branch below.
        lsl             w12, w3,  #1
        b.lt            4f

5:      // 4 channels
        ldp             x4,  x5,  [x1], #16
        ldp             x6,  x7,  [x1], #16
        mov             w9,  w2                         // w9 = samples left for this group
        mov             x8,  x0                         // x8 = write cursor for this group
        ld1             {v4.4s},  [x4], #16
        fcvtzs          v4.4s,  v4.4s,  #31
        ld1             {v5.4s},  [x5], #16
        fcvtzs          v5.4s,  v5.4s,  #31
        ld1             {v6.4s},  [x6], #16
        fcvtzs          v6.4s,  v6.4s,  #31
        ld1             {v7.4s},  [x7], #16
        fcvtzs          v7.4s,  v7.4s,  #31
6:
        // 8 samples per iteration: v4-v7 hold 4 pending samples of each
        // channel; sri pairs channels 0/1 and 2/3, zip1/zip2 gather the four
        // channels of one sample into a 64-bit lane.
        subs            w9,  w9,  #8
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        sri             v5.4s,  v4.4s,  #16
        ld1             {v1.4s},  [x5], #16
        fcvtzs          v1.4s,  v1.4s,  #31
        sri             v7.4s,  v6.4s,  #16
        ld1             {v2.4s},  [x6], #16
        fcvtzs          v2.4s,  v2.4s,  #31
        zip1            v16.4s, v5.4s,  v7.4s
        ld1             {v3.4s},  [x7], #16
        fcvtzs          v3.4s,  v3.4s,  #31
        zip2            v17.4s, v5.4s,  v7.4s
        st1             {v16.d}[0], [x8], x12
        sri             v1.4s,  v0.4s,  #16
        st1             {v16.d}[1], [x8], x12
        sri             v3.4s,  v2.4s,  #16
        st1             {v17.d}[0], [x8], x12
        zip1            v18.4s, v1.4s,  v3.4s
        st1             {v17.d}[1], [x8], x12
        zip2            v19.4s, v1.4s,  v3.4s
        b.eq            7f                              // last 8 samples: skip the preload
        ld1             {v4.4s},  [x4], #16
        fcvtzs          v4.4s,  v4.4s,  #31
        st1             {v18.d}[0], [x8], x12
        ld1             {v5.4s},  [x5], #16
        fcvtzs          v5.4s,  v5.4s,  #31
        st1             {v18.d}[1], [x8], x12
        ld1             {v6.4s},  [x6], #16
        fcvtzs          v6.4s,  v6.4s,  #31
        st1             {v19.d}[0], [x8], x12
        ld1             {v7.4s},  [x7], #16
        fcvtzs          v7.4s,  v7.4s,  #31
        st1             {v19.d}[1], [x8], x12
        b               6b
7:
        st1             {v18.d}[0], [x8], x12
        st1             {v18.d}[1], [x8], x12
        st1             {v19.d}[0], [x8], x12
        st1             {v19.d}[1], [x8], x12
        subs            w3,  w3,  #4                    // 4 channels done
        b.eq            9f
        cmp             w3,  #4
        add             x0,  x0,  #8                    // skip the 4 columns just written
        b.ge            5b

4:      // 2 channels
        cmp             w3,  #2
        b.lt            4f
        ldp             x4,  x5,  [x1], #16
        mov             w9,  w2                         // w9 = samples left for this group
        mov             x8,  x0                         // x8 = write cursor for this group
        tst             w9,  #8                         // consumed by the b.eq after the loads
        ld1             {v4.4s},  [x4], #16
        fcvtzs          v4.4s,  v4.4s,  #31
        ld1             {v5.4s},  [x5], #16
        fcvtzs          v5.4s,  v5.4s,  #31
        ld1             {v6.4s},  [x4], #16
        fcvtzs          v6.4s,  v6.4s,  #31
        ld1             {v7.4s},  [x5], #16
        fcvtzs          v7.4s,  v7.4s,  #31
        b.eq            6f                              // bit 3 of the count clear: go to the 16-sample loop
        // Bit 3 of the count is set: peel off 8 samples first.
        subs            w9,  w9,  #8
        b.eq            7f                              // exactly 8 samples
        sri             v5.4s,  v4.4s,  #16
        ld1             {v4.4s},  [x4], #16
        fcvtzs          v4.4s,  v4.4s,  #31
        st1             {v5.s}[0],  [x8], x12
        sri             v7.4s,  v6.4s,  #16
        st1             {v5.s}[1],  [x8], x12
        ld1             {v6.4s},  [x4], #16
        fcvtzs          v6.4s,  v6.4s,  #31
        st1             {v5.s}[2],  [x8], x12
        st1             {v5.s}[3],  [x8], x12
        st1             {v7.s}[0],  [x8], x12
        st1             {v7.s}[1],  [x8], x12
        ld1             {v5.4s},  [x5], #16
        fcvtzs          v5.4s,  v5.4s,  #31
        st1             {v7.s}[2],  [x8], x12
        st1             {v7.s}[3],  [x8], x12
        ld1             {v7.4s},  [x5], #16
        fcvtzs          v7.4s,  v7.4s,  #31
6:
        // 16 samples per iteration: v4/v6 (first channel) and v5/v7 (second
        // channel) hold 8 pending samples on entry.
        subs            w9,  w9,  #16
        ld1             {v0.4s},  [x4], #16
        sri             v5.4s,  v4.4s,  #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x5], #16
        sri             v7.4s,  v6.4s,  #16
        st1             {v5.s}[0],  [x8], x12
        st1             {v5.s}[1],  [x8], x12
        fcvtzs          v1.4s,  v1.4s,  #31
        st1             {v5.s}[2],  [x8], x12
        st1             {v5.s}[3],  [x8], x12
        ld1             {v2.4s},  [x4], #16
        st1             {v7.s}[0],  [x8], x12
        fcvtzs          v2.4s,  v2.4s,  #31
        st1             {v7.s}[1],  [x8], x12
        ld1             {v3.4s},  [x5], #16
        st1             {v7.s}[2],  [x8], x12
        fcvtzs          v3.4s,  v3.4s,  #31
        st1             {v7.s}[3],  [x8], x12
        sri             v1.4s,  v0.4s,  #16
        sri             v3.4s,  v2.4s,  #16
        b.eq            6f                              // last 16 samples: skip the preload
        ld1             {v4.4s},  [x4], #16
        st1             {v1.s}[0],  [x8], x12
        fcvtzs          v4.4s,  v4.4s,  #31
        st1             {v1.s}[1],  [x8], x12
        ld1             {v5.4s},  [x5], #16
        st1             {v1.s}[2],  [x8], x12
        fcvtzs          v5.4s,  v5.4s,  #31
        st1             {v1.s}[3],  [x8], x12
        ld1             {v6.4s},  [x4], #16
        st1             {v3.s}[0],  [x8], x12
        fcvtzs          v6.4s,  v6.4s,  #31
        st1             {v3.s}[1],  [x8], x12
        ld1             {v7.4s},  [x5], #16
        st1             {v3.s}[2],  [x8], x12
        fcvtzs          v7.4s,  v7.4s,  #31
        st1             {v3.s}[3],  [x8], x12
        b.gt            6b
6:
        st1             {v1.s}[0],  [x8], x12
        st1             {v1.s}[1],  [x8], x12
        st1             {v1.s}[2],  [x8], x12
        st1             {v1.s}[3],  [x8], x12
        st1             {v3.s}[0],  [x8], x12
        st1             {v3.s}[1],  [x8], x12
        st1             {v3.s}[2],  [x8], x12
        st1             {v3.s}[3],  [x8], x12
        b               8f
7:
        sri             v5.4s,  v4.4s,  #16
        sri             v7.4s,  v6.4s,  #16
        st1             {v5.s}[0],  [x8], x12
        st1             {v5.s}[1],  [x8], x12
        st1             {v5.s}[2],  [x8], x12
        st1             {v5.s}[3],  [x8], x12
        st1             {v7.s}[0],  [x8], x12
        st1             {v7.s}[1],  [x8], x12
        st1             {v7.s}[2],  [x8], x12
        st1             {v7.s}[3],  [x8], x12
8:
        subs            w3,  w3,  #2                    // 2 channels done
        add             x0,  x0,  #4                    // skip the 2 columns just written
        b.eq            9f

4:      // 1 channel
        ldr             x4,  [x1]
        tst             w2,  #8                         // consumed by the b.ne after the loads
        mov             w9,  w2                         // w9 = samples left
        mov             x5,  x0                         // x5 = write cursor
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v1.4s,  v1.4s,  #31
        b.ne            8f                              // bit 3 of the count set: peel off 8 samples first
6:
        // 16 samples per iteration. Halfword lanes 1, 3, 5 and 7 are the top
        // 16 bits of the four 32-bit results in each vector.
        subs            w9,  w9,  #16
        ld1             {v2.4s},  [x4], #16
        fcvtzs          v2.4s,  v2.4s,  #31
        ld1             {v3.4s},  [x4], #16
        fcvtzs          v3.4s,  v3.4s,  #31
        st1             {v0.h}[1],  [x5], x12
        st1             {v0.h}[3],  [x5], x12
        st1             {v0.h}[5],  [x5], x12
        st1             {v0.h}[7],  [x5], x12
        st1             {v1.h}[1],  [x5], x12
        st1             {v1.h}[3],  [x5], x12
        st1             {v1.h}[5],  [x5], x12
        st1             {v1.h}[7],  [x5], x12
        b.eq            7f                              // last 16 samples: skip the preload
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v1.4s,  v1.4s,  #31
7:
        st1             {v2.h}[1],  [x5], x12
        st1             {v2.h}[3],  [x5], x12
        st1             {v2.h}[5],  [x5], x12
        st1             {v2.h}[7],  [x5], x12
        st1             {v3.h}[1],  [x5], x12
        st1             {v3.h}[3],  [x5], x12
        st1             {v3.h}[5],  [x5], x12
        st1             {v3.h}[7],  [x5], x12
        b.gt            6b
        ret
8:
        subs            w9,  w9,  #8
        st1             {v0.h}[1],  [x5], x12
        st1             {v0.h}[3],  [x5], x12
        st1             {v0.h}[5],  [x5], x12
        st1             {v0.h}[7],  [x5], x12
        st1             {v1.h}[1],  [x5], x12
        st1             {v1.h}[3],  [x5], x12
        st1             {v1.h}[5],  [x5], x12
        st1             {v1.h}[7],  [x5], x12
        b.eq            9f                              // exactly 8 samples
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v1.4s,  v1.4s,  #31
        b               6b
9:      // common exit
        ret
endfunc