Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libswresample / aarch64 / audio_convert_neon.S
CommitLineData
2ba45a60
DM
1/*
2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22#include "config.h"
23#include "libavutil/aarch64/asm.S"
24
/*
 * swri_oldapi_conv_flt_to_s16_neon
 *
 * Converts a contiguous run of 32-bit floats to signed 16-bit integers.
 * Every float is first turned into Q31 fixed point (fcvtzs ..., #31) and
 * then narrowed to 16 bits with a rounding, saturating shift right by 16
 * (sqrshrn/sqrshrn2).
 *
 * In:   x0 = destination pointer (16-bit samples are stored here)
 *       x1 = source pointer (32-bit floats are loaded from here)
 *       w2 = sample count. Only the low 32 bits of x2 are used, matching
 *            the w2 usage in swri_oldapi_conv_fltp_to_s16_nch_neon, so
 *            stale upper bits in x2 cannot disturb the loop count.
 * Clobbers: x0, x1, x2, x12, v0-v7, condition flags.
 *
 * The first 8 samples are loaded unconditionally and the count is only
 * ever consumed in steps of 8 or 16, so the count has to be a non-zero
 * multiple of 8.
 */
function swri_oldapi_conv_flt_to_s16_neon, export=1
        subs            w2,  w2,  #8                    // w2 = samples left after the 8 preloaded below
        ld1             {v0.4s},  [x1], #16
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x1], #16
        fcvtzs          v5.4s,  v1.4s,  #31
        b.eq            3f                              // exactly 8 samples: just flush v4/v5
        ands            w12, w2,  #0xfffffff0           // w12 = remaining count rounded down to 16 (w2 & ~15)
        b.eq            2f                              // fewer than 16 left: go straight to the tail
        // Main loop: 16 samples per iteration. On entry v4/v5 hold 8 converted
        // (Q31) samples that have not been stored yet; the same is true on exit.
1:      subs            w12, w12, #16
        sqrshrn         v4.4h,  v4.4s,  #16
        ld1             {v2.4s},  [x1], #16
        fcvtzs          v6.4s,  v2.4s,  #31
        sqrshrn2        v4.8h,  v5.4s,  #16
        ld1             {v3.4s},  [x1], #16
        fcvtzs          v7.4s,  v3.4s,  #31
        sqrshrn         v6.4h,  v6.4s,  #16
        st1             {v4.8h},  [x0], #16
        sqrshrn2        v6.8h,  v7.4s,  #16
        ld1             {v0.4s},  [x1], #16
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x1], #16
        fcvtzs          v5.4s,  v1.4s,  #31
        st1             {v6.8h},  [x0], #16
        b.ne            1b                              // flags still come from the subs at 1:
        ands            w2,  w2,  #15                   // 8 input samples still unread?
        b.eq            3f
        // Tail with 8 unread input samples: load them, then store the 8
        // pending samples followed by the 8 new ones.
2:      ld1             {v2.4s},  [x1], #16
        sqrshrn         v4.4h,  v4.4s,  #16
        fcvtzs          v6.4s,  v2.4s,  #31
        ld1             {v3.4s},  [x1], #16
        sqrshrn2        v4.8h,  v5.4s,  #16
        fcvtzs          v7.4s,  v3.4s,  #31
        sqrshrn         v6.4h,  v6.4s,  #16
        st1             {v4.8h},  [x0], #16
        sqrshrn2        v6.8h,  v7.4s,  #16
        st1             {v6.8h},  [x0]
        ret
        // Tail with no unread input: store the 8 pending samples.
3:      sqrshrn         v4.4h,  v4.4s,  #16
        sqrshrn2        v4.8h,  v5.4s,  #16
        st1             {v4.8h},  [x0]
        ret
endfunc
68
/*
 * swri_oldapi_conv_fltp_to_s16_2ch_neon
 *
 * Converts two planes of 32-bit floats to interleaved signed 16-bit
 * samples. Every float is turned into Q31 fixed point (fcvtzs ..., #31);
 * "sri second, first, #16" then puts the top 16 bits of the first plane's
 * value into the low half of each 32-bit lane while the top 16 bits of the
 * second plane's value stay in the high half. The 16-bit result is
 * therefore the truncated upper half of the Q31 value (no rounding step).
 *
 * In:   x0 = destination pointer (interleaved 16-bit samples)
 *       x1 = pointer to two consecutive plane pointers (loaded with ldp)
 *       w2 = samples per plane. Only the low 32 bits of x2 are used,
 *            matching the w2 usage in swri_oldapi_conv_fltp_to_s16_nch_neon,
 *            so stale upper bits in x2 cannot disturb the loop count.
 * Clobbers: x0, x2, x4, x5, x12, v0-v7, v16-v23, condition flags.
 *
 * The first 8 samples of each plane are loaded unconditionally and the
 * count is only ever consumed in steps of 8 or 16, so the count has to be
 * a non-zero multiple of 8.
 */
function swri_oldapi_conv_fltp_to_s16_2ch_neon, export=1
        ldp             x4,  x5,  [x1]                  // x4 = first plane, x5 = second plane
        subs            w2,  w2,  #8                    // w2 = samples left after the 8 preloaded below
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v5.4s,  v1.4s,  #31
        ld1             {v2.4s},  [x5], #16
        fcvtzs          v6.4s,  v2.4s,  #31
        ld1             {v3.4s},  [x5], #16
        fcvtzs          v7.4s,  v3.4s,  #31
        b.eq            3f                              // exactly 8 samples: just flush v4-v7
        ands            w12, w2,  #0xfffffff0           // w12 = remaining count rounded down to 16 (w2 & ~15)
        b.eq            2f                              // fewer than 16 left: go straight to the tail
        // Main loop: 16 samples per plane per iteration. On entry v4/v5
        // (first plane) and v6/v7 (second plane) hold 8 converted samples
        // that have not been stored yet; the same is true on exit.
1:      subs            w12, w12, #16
        ld1             {v16.4s}, [x4], #16
        fcvtzs          v20.4s, v16.4s, #31
        sri             v6.4s,  v4.4s,  #16
        ld1             {v17.4s}, [x4], #16
        fcvtzs          v21.4s, v17.4s, #31
        ld1             {v18.4s}, [x5], #16
        fcvtzs          v22.4s, v18.4s, #31
        ld1             {v19.4s}, [x5], #16
        sri             v7.4s,  v5.4s,  #16
        st1             {v6.4s},  [x0], #16
        fcvtzs          v23.4s, v19.4s, #31
        st1             {v7.4s},  [x0], #16
        sri             v22.4s, v20.4s, #16
        ld1             {v0.4s},  [x4], #16
        sri             v23.4s, v21.4s, #16
        st1             {v22.4s}, [x0], #16
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v5.4s,  v1.4s,  #31
        ld1             {v2.4s},  [x5], #16
        fcvtzs          v6.4s,  v2.4s,  #31
        ld1             {v3.4s},  [x5], #16
        fcvtzs          v7.4s,  v3.4s,  #31
        st1             {v23.4s}, [x0], #16
        b.ne            1b                              // flags still come from the subs at 1:
        ands            w2,  w2,  #15                   // 8 input samples per plane still unread?
        b.eq            3f
        // Tail with 8 unread samples per plane: load them, then store the 8
        // pending sample pairs followed by the 8 new ones.
2:      sri             v6.4s,  v4.4s,  #16
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v1.4s,  v1.4s,  #31
        ld1             {v2.4s},  [x5], #16
        fcvtzs          v2.4s,  v2.4s,  #31
        sri             v7.4s,  v5.4s,  #16
        ld1             {v3.4s},  [x5], #16
        fcvtzs          v3.4s,  v3.4s,  #31
        sri             v2.4s,  v0.4s,  #16
        st1             {v6.4s,v7.4s}, [x0], #32
        sri             v3.4s,  v1.4s,  #16
        st1             {v2.4s,v3.4s}, [x0], #32
        ret
        // Tail with no unread input: store the 8 pending sample pairs.
3:      sri             v6.4s,  v4.4s,  #16
        sri             v7.4s,  v5.4s,  #16
        st1             {v6.4s,v7.4s}, [x0]
        ret
endfunc
131
/*
 * swri_oldapi_conv_fltp_to_s16_nch_neon
 *
 * Converts planar 32-bit float input to interleaved signed 16-bit output
 * for an arbitrary channel count.
 *
 * In:   x0 = destination pointer (interleaved 16-bit samples)
 *       x1 = pointer to an array of per-channel plane pointers
 *       w2 = samples per channel
 *       w3 = channel count
 * Clobbers: x0, x1, x3-x9, x12, v0-v7, v16-v19, condition flags (plus
 *           whatever the tail-called routines clobber).
 *
 * Dispatch:
 *   - channel count == 2: tail-call swri_oldapi_conv_fltp_to_s16_2ch_neon
 *   - channel count  < 2: load the first plane pointer and tail-call
 *                         swri_oldapi_conv_flt_to_s16_neon
 *   - otherwise the channels are consumed in groups of 4, then 2, then 1.
 *     Each group walks all samples and writes its slice of every output
 *     frame; x12 = 2 * channel count is the byte distance between frames.
 *
 * In the generic path every float becomes Q31 fixed point (fcvtzs #31) and
 * the stored 16-bit value is the upper half of that Q31 value (picked with
 * sri or by storing the odd .h lanes), i.e. truncated rather than rounded.
 *
 * The 4-channel loop consumes 8 samples per iteration; the 2- and
 * 1-channel paths test bit 3 of the count and otherwise step by 16. Data
 * is always preloaded, so the count has to be a non-zero multiple of 8.
 */
function swri_oldapi_conv_fltp_to_s16_nch_neon, export=1
        cmp             w3,  #2
        b.eq            X(swri_oldapi_conv_fltp_to_s16_2ch_neon)
        b.gt            1f
        ldr             x1,  [x1]                       // fewer than 2 channels: x1 = first plane
        b               X(swri_oldapi_conv_flt_to_s16_neon)
1:
        cmp             w3,  #4
        lsl             w12, w3,  #1                    // x12 = output frame stride in bytes; the
                                                        // 32-bit write zero-extends, so stale upper
                                                        // bits of x3 never reach the stride
        b.lt            4f

5:      // 4 channels
        ldp             x4,  x5,  [x1], #16
        ldp             x6,  x7,  [x1], #16
        mov             w9,  w2                         // w9 = samples left for this channel group
        mov             x8,  x0                         // x8 = running output pointer
        ld1             {v4.4s},  [x4], #16
        fcvtzs          v4.4s,  v4.4s,  #31
        ld1             {v5.4s},  [x5], #16
        fcvtzs          v5.4s,  v5.4s,  #31
        ld1             {v6.4s},  [x6], #16
        fcvtzs          v6.4s,  v6.4s,  #31
        ld1             {v7.4s},  [x7], #16
        fcvtzs          v7.4s,  v7.4s,  #31
        // 8 samples per iteration: v4-v7 hold samples n..n+3 of the four
        // planes, v0-v3 receive samples n+4..n+7.
6:
        subs            w9,  w9,  #8
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        sri             v5.4s,  v4.4s,  #16             // v5 lanes = planes 0|1 packed as 2 x s16
        ld1             {v1.4s},  [x5], #16
        fcvtzs          v1.4s,  v1.4s,  #31
        sri             v7.4s,  v6.4s,  #16             // v7 lanes = planes 2|3 packed as 2 x s16
        ld1             {v2.4s},  [x6], #16
        fcvtzs          v2.4s,  v2.4s,  #31
        zip1            v16.4s, v5.4s,  v7.4s           // each .d lane = one 4-channel frame
        ld1             {v3.4s},  [x7], #16
        fcvtzs          v3.4s,  v3.4s,  #31
        zip2            v17.4s, v5.4s,  v7.4s
        st1             {v16.d}[0], [x8], x12
        sri             v1.4s,  v0.4s,  #16
        st1             {v16.d}[1], [x8], x12
        sri             v3.4s,  v2.4s,  #16
        st1             {v17.d}[0], [x8], x12
        zip1            v18.4s, v1.4s,  v3.4s
        st1             {v17.d}[1], [x8], x12
        zip2            v19.4s, v1.4s,  v3.4s
        b.eq            7f                              // no more input: flush v18/v19 only
        ld1             {v4.4s},  [x4], #16
        fcvtzs          v4.4s,  v4.4s,  #31
        st1             {v18.d}[0], [x8], x12
        ld1             {v5.4s},  [x5], #16
        fcvtzs          v5.4s,  v5.4s,  #31
        st1             {v18.d}[1], [x8], x12
        ld1             {v6.4s},  [x6], #16
        fcvtzs          v6.4s,  v6.4s,  #31
        st1             {v19.d}[0], [x8], x12
        ld1             {v7.4s},  [x7], #16
        fcvtzs          v7.4s,  v7.4s,  #31
        st1             {v19.d}[1], [x8], x12
        b               6b
7:
        st1             {v18.d}[0], [x8], x12
        st1             {v18.d}[1], [x8], x12
        st1             {v19.d}[0], [x8], x12
        st1             {v19.d}[1], [x8], x12
        subs            w3,  w3,  #4                    // four channels done
        b.eq            9f
        cmp             w3,  #4
        add             x0,  x0,  #8                    // step past the 4 x s16 just written per frame
        b.ge            5b

4:      // 2 channels
        cmp             w3,  #2
        b.lt            4f
        ldp             x4,  x5,  [x1], #16
        mov             w9,  w2                         // w9 = samples left for this channel group
        mov             x8,  x0                         // x8 = running output pointer
        tst             w9,  #8                         // odd number of 8-sample groups?
        ld1             {v4.4s},  [x4], #16
        fcvtzs          v4.4s,  v4.4s,  #31
        ld1             {v5.4s},  [x5], #16
        fcvtzs          v5.4s,  v5.4s,  #31
        ld1             {v6.4s},  [x4], #16
        fcvtzs          v6.4s,  v6.4s,  #31
        ld1             {v7.4s},  [x5], #16
        fcvtzs          v7.4s,  v7.4s,  #31
        b.eq            6f                              // count is a multiple of 16
        subs            w9,  w9,  #8
        b.eq            7f                              // only these 8 samples exist
        // Store the odd group of 8 frames and preload the next 8 samples,
        // so the loop below always starts with a multiple of 16 left.
        sri             v5.4s,  v4.4s,  #16
        ld1             {v4.4s},  [x4], #16
        fcvtzs          v4.4s,  v4.4s,  #31
        st1             {v5.s}[0],  [x8], x12
        sri             v7.4s,  v6.4s,  #16
        st1             {v5.s}[1],  [x8], x12
        ld1             {v6.4s},  [x4], #16
        fcvtzs          v6.4s,  v6.4s,  #31
        st1             {v5.s}[2],  [x8], x12
        st1             {v5.s}[3],  [x8], x12
        st1             {v7.s}[0],  [x8], x12
        st1             {v7.s}[1],  [x8], x12
        ld1             {v5.4s},  [x5], #16
        fcvtzs          v5.4s,  v5.4s,  #31
        st1             {v7.s}[2],  [x8], x12
        st1             {v7.s}[3],  [x8], x12
        ld1             {v7.4s},  [x5], #16
        fcvtzs          v7.4s,  v7.4s,  #31
        // 16 samples per iteration: store the 8 pending frames (v4-v7),
        // convert the next 8 into v0-v3, then refill v4-v7 if input remains.
6:
        subs            w9,  w9,  #16
        ld1             {v0.4s},  [x4], #16
        sri             v5.4s,  v4.4s,  #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x5], #16
        sri             v7.4s,  v6.4s,  #16
        st1             {v5.s}[0],  [x8], x12
        st1             {v5.s}[1],  [x8], x12
        fcvtzs          v1.4s,  v1.4s,  #31
        st1             {v5.s}[2],  [x8], x12
        st1             {v5.s}[3],  [x8], x12
        ld1             {v2.4s},  [x4], #16
        st1             {v7.s}[0],  [x8], x12
        fcvtzs          v2.4s,  v2.4s,  #31
        st1             {v7.s}[1],  [x8], x12
        ld1             {v3.4s},  [x5], #16
        st1             {v7.s}[2],  [x8], x12
        fcvtzs          v3.4s,  v3.4s,  #31
        st1             {v7.s}[3],  [x8], x12
        sri             v1.4s,  v0.4s,  #16
        sri             v3.4s,  v2.4s,  #16
        b.eq            6f                              // no more input: flush v1/v3 only
        ld1             {v4.4s},  [x4], #16
        st1             {v1.s}[0],  [x8], x12
        fcvtzs          v4.4s,  v4.4s,  #31
        st1             {v1.s}[1],  [x8], x12
        ld1             {v5.4s},  [x5], #16
        st1             {v1.s}[2],  [x8], x12
        fcvtzs          v5.4s,  v5.4s,  #31
        st1             {v1.s}[3],  [x8], x12
        ld1             {v6.4s},  [x4], #16
        st1             {v3.s}[0],  [x8], x12
        fcvtzs          v6.4s,  v6.4s,  #31
        st1             {v3.s}[1],  [x8], x12
        ld1             {v7.4s},  [x5], #16
        st1             {v3.s}[2],  [x8], x12
        fcvtzs          v7.4s,  v7.4s,  #31
        st1             {v3.s}[3],  [x8], x12
        b.gt            6b                              // flags still come from the subs at 6:
6:
        st1             {v1.s}[0],  [x8], x12
        st1             {v1.s}[1],  [x8], x12
        st1             {v1.s}[2],  [x8], x12
        st1             {v1.s}[3],  [x8], x12
        st1             {v3.s}[0],  [x8], x12
        st1             {v3.s}[1],  [x8], x12
        st1             {v3.s}[2],  [x8], x12
        st1             {v3.s}[3],  [x8], x12
        b               8f
7:
        sri             v5.4s,  v4.4s,  #16
        sri             v7.4s,  v6.4s,  #16
        st1             {v5.s}[0],  [x8], x12
        st1             {v5.s}[1],  [x8], x12
        st1             {v5.s}[2],  [x8], x12
        st1             {v5.s}[3],  [x8], x12
        st1             {v7.s}[0],  [x8], x12
        st1             {v7.s}[1],  [x8], x12
        st1             {v7.s}[2],  [x8], x12
        st1             {v7.s}[3],  [x8], x12
8:
        subs            w3,  w3,  #2                    // two channels done
        add             x0,  x0,  #4                    // step past the 2 x s16 just written per frame
        b.eq            9f

4:      // 1 channel
        ldr             x4,  [x1]
        tst             w2,  #8                         // odd number of 8-sample groups?
        mov             w9,  w2                         // w9 = samples left
        mov             x5,  x0                         // x5 = running output pointer
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v1.4s,  v1.4s,  #31
        b.ne            8f
        // 16 samples per iteration. Storing the odd .h lanes writes the
        // upper 16 bits of every 32-bit Q31 value.
6:
        subs            w9,  w9,  #16
        ld1             {v2.4s},  [x4], #16
        fcvtzs          v2.4s,  v2.4s,  #31
        ld1             {v3.4s},  [x4], #16
        fcvtzs          v3.4s,  v3.4s,  #31
        st1             {v0.h}[1],  [x5], x12
        st1             {v0.h}[3],  [x5], x12
        st1             {v0.h}[5],  [x5], x12
        st1             {v0.h}[7],  [x5], x12
        st1             {v1.h}[1],  [x5], x12
        st1             {v1.h}[3],  [x5], x12
        st1             {v1.h}[5],  [x5], x12
        st1             {v1.h}[7],  [x5], x12
        b.eq            7f                              // no more input: skip the preload
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v1.4s,  v1.4s,  #31
7:
        st1             {v2.h}[1],  [x5], x12
        st1             {v2.h}[3],  [x5], x12
        st1             {v2.h}[5],  [x5], x12
        st1             {v2.h}[7],  [x5], x12
        st1             {v3.h}[1],  [x5], x12
        st1             {v3.h}[3],  [x5], x12
        st1             {v3.h}[5],  [x5], x12
        st1             {v3.h}[7],  [x5], x12
        b.gt            6b                              // flags still come from the subs at 6:
        ret
        // Odd group of 8: store it first, then continue with the 16-sample loop.
8:
        subs            w9,  w9,  #8
        st1             {v0.h}[1],  [x5], x12
        st1             {v0.h}[3],  [x5], x12
        st1             {v0.h}[5],  [x5], x12
        st1             {v0.h}[7],  [x5], x12
        st1             {v1.h}[1],  [x5], x12
        st1             {v1.h}[3],  [x5], x12
        st1             {v1.h}[5],  [x5], x12
        st1             {v1.h}[7],  [x5], x12
        b.eq            9f
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v1.4s,  v1.4s,  #31
        b               6b
9:      // common exit; a numeric local label keeps a symbol named "end" out of the object file
        ret
endfunc