Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | |
3 | * | |
4 | * This file is part of FFmpeg | |
5 | * | |
6 | * FFmpeg is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU Lesser General Public | |
8 | * License as published by the Free Software Foundation; either | |
9 | * version 2.1 of the License, or (at your option) any later version. | |
10 | * | |
11 | * FFmpeg is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | * Lesser General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU Lesser General Public | |
17 | * License along with FFmpeg; if not, write to the Free Software | |
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
19 | */ | |
20 | ||
21 | #include "config.h" | |
22 | #include "libavutil/arm/asm.S" | |
23 | ||
/*
 * ff_conv_flt_to_s16_neon: convert a run of 32-bit floats to signed 16-bit
 * integers.
 *
 * Register use, as seen in the code below:
 *   r0  - output pointer; written 8 x int16 at a time via vst1.16 [r0,:128]!
 *   r1  - input pointer; read 4 x float at a time via vld1.32 [r1,:128]!
 *   r2  - number of samples to convert
 *   r12 - main loop counter, the part of (r2 - 8) that is a multiple of 16
 *
 * Each float is converted to fixed point with 31 fractional bits
 * (vcvt.s32.f32 #31) and then narrowed to 16 bits with a rounding,
 * saturating right shift by 16 (vqrshrn.s32 #16).
 *
 * The ":128" qualifiers require both pointers to be 16-byte aligned. The
 * control flow only covers counts of 8, 16, 24, ...; a count of zero or one
 * that is not a multiple of 8 is not handled.
 * NOTE(review): presumably called from C as
 * (int16_t *dst, const float *src, int len) - confirm against the prototype.
 */
24 | function ff_conv_flt_to_s16_neon, export=1 | |
/* Prologue: load and convert the first 8 samples into q8/q9. The flags set
 * by subs are consumed by the beq below; the NEON instructions in between
 * leave them untouched. */
25 | subs r2, r2, #8 | |
26 | vld1.32 {q0}, [r1,:128]! | |
27 | vcvt.s32.f32 q8, q0, #31 | |
28 | vld1.32 {q1}, [r1,:128]! | |
29 | vcvt.s32.f32 q9, q1, #31 | |
/* Exactly 8 samples in total: only the pending q8/q9 need storing. */
30 | beq 3f | |
/* r12 = (count - 8) with the low four bits cleared; zero skips the loop. */
31 | bics r12, r2, #15 | |
32 | beq 2f | |
/* Main loop: every pass narrows and stores 16 results (the 8 pending ones
 * plus 8 new ones) and leaves the following 8 samples converted in q8/q9. */
33 | 1: subs r12, r12, #16 | |
34 | vqrshrn.s32 d4, q8, #16 | |
35 | vld1.32 {q0}, [r1,:128]! | |
36 | vcvt.s32.f32 q0, q0, #31 | |
37 | vqrshrn.s32 d5, q9, #16 | |
38 | vld1.32 {q1}, [r1,:128]! | |
39 | vcvt.s32.f32 q1, q1, #31 | |
40 | vqrshrn.s32 d6, q0, #16 | |
41 | vst1.16 {q2}, [r0,:128]! | |
42 | vqrshrn.s32 d7, q1, #16 | |
43 | vld1.32 {q8}, [r1,:128]! | |
44 | vcvt.s32.f32 q8, q8, #31 | |
45 | vld1.32 {q9}, [r1,:128]! | |
46 | vcvt.s32.f32 q9, q9, #31 | |
47 | vst1.16 {q3}, [r0,:128]! | |
48 | bne 1b | |
/* Leftover below 16: zero means only the pending 8 remain (3:); anything
 * else is treated as 8 further samples still to be read (2:). */
49 | ands r2, r2, #15 | |
50 | beq 3f | |
/* Tail for 16 samples: the 8 pending in q8/q9 plus 8 freshly loaded. */
51 | 2: vld1.32 {q0}, [r1,:128]! | |
52 | vqrshrn.s32 d4, q8, #16 | |
53 | vcvt.s32.f32 q0, q0, #31 | |
54 | vld1.32 {q1}, [r1,:128]! | |
55 | vqrshrn.s32 d5, q9, #16 | |
56 | vcvt.s32.f32 q1, q1, #31 | |
57 | vqrshrn.s32 d6, q0, #16 | |
58 | vst1.16 {q2}, [r0,:128]! | |
59 | vqrshrn.s32 d7, q1, #16 | |
60 | vst1.16 {q3}, [r0,:128]! | |
61 | bx lr | |
/* Tail for the 8 pending samples only; nothing more is read. */
62 | 3: vqrshrn.s32 d4, q8, #16 | |
63 | vqrshrn.s32 d5, q9, #16 | |
64 | vst1.16 {q2}, [r0,:128]! | |
65 | bx lr | |
66 | endfunc | |
67 | ||
/*
 * ff_conv_fltp_to_s16_2ch_neon: convert two separate float channels into one
 * interleaved signed 16-bit stream.
 *
 * Register use, as seen in the code below:
 *   r0  - output pointer; written via vst1.16 [r0,:128]!
 *   r1  - on entry, the address of two consecutive channel pointers; the ldm
 *         replaces it with the first pointer and places the second in r3
 *   r2  - number of samples per channel
 *   r12 - main loop counter, the part of (r2 - 8) that is a multiple of 16
 *
 * Both channels are converted to fixed point with 31 fractional bits
 * (vcvt.s32.f32 #31). vsri.32 #16 then shifts the first channel's value
 * right by 16 and inserts it below the second channel's top 16 bits, so each
 * 32-bit lane ends up holding one output frame: first channel in the low
 * halfword, second channel in the high halfword. The top 16 bits are taken
 * as they are; no rounding step is applied on this path.
 *
 * The ":128" qualifiers require all three pointers to be 16-byte aligned.
 * The control flow only covers per-channel counts of 8, 16, 24, ...
 * NOTE(review): presumably called from C as
 * (int16_t *dst, float *const *src, int len) - confirm against the prototype.
 */
68 | function ff_conv_fltp_to_s16_2ch_neon, export=1 | |
69 | ldm r1, {r1, r3} | |
/* Prologue: convert the first 8 samples of each channel. q8/q9 hold the
 * first channel, q10/q11 the second. The subs flags are used by the beq. */
70 | subs r2, r2, #8 | |
71 | vld1.32 {q0}, [r1,:128]! | |
72 | vcvt.s32.f32 q8, q0, #31 | |
73 | vld1.32 {q1}, [r1,:128]! | |
74 | vcvt.s32.f32 q9, q1, #31 | |
75 | vld1.32 {q10}, [r3,:128]! | |
76 | vcvt.s32.f32 q10, q10, #31 | |
77 | vld1.32 {q11}, [r3,:128]! | |
78 | vcvt.s32.f32 q11, q11, #31 | |
/* Exactly 8 samples per channel: only the pending frames need storing. */
79 | beq 3f | |
/* r12 = (count - 8) with the low four bits cleared; zero skips the loop. */
80 | bics r12, r2, #15 | |
81 | beq 2f | |
/* Main loop: every pass merges and stores 16 frames (64 bytes) and leaves
 * the following 8 samples of each channel converted in q8-q11. */
82 | 1: subs r12, r12, #16 | |
83 | vld1.32 {q0}, [r1,:128]! | |
84 | vcvt.s32.f32 q0, q0, #31 | |
85 | vsri.32 q10, q8, #16 | |
86 | vld1.32 {q1}, [r1,:128]! | |
87 | vcvt.s32.f32 q1, q1, #31 | |
88 | vld1.32 {q12}, [r3,:128]! | |
89 | vcvt.s32.f32 q12, q12, #31 | |
90 | vld1.32 {q13}, [r3,:128]! | |
91 | vsri.32 q11, q9, #16 | |
92 | vst1.16 {q10}, [r0,:128]! | |
93 | vcvt.s32.f32 q13, q13, #31 | |
94 | vst1.16 {q11}, [r0,:128]! | |
95 | vsri.32 q12, q0, #16 | |
96 | vld1.32 {q8}, [r1,:128]! | |
97 | vsri.32 q13, q1, #16 | |
98 | vst1.16 {q12}, [r0,:128]! | |
99 | vcvt.s32.f32 q8, q8, #31 | |
100 | vld1.32 {q9}, [r1,:128]! | |
101 | vcvt.s32.f32 q9, q9, #31 | |
102 | vld1.32 {q10}, [r3,:128]! | |
103 | vcvt.s32.f32 q10, q10, #31 | |
104 | vld1.32 {q11}, [r3,:128]! | |
105 | vcvt.s32.f32 q11, q11, #31 | |
106 | vst1.16 {q13}, [r0,:128]! | |
107 | bne 1b | |
/* Leftover below 16: zero means only the pending 8 frames remain (3:);
 * anything else is treated as 8 further samples per channel (2:). */
108 | ands r2, r2, #15 | |
109 | beq 3f | |
/* Tail for 16 frames: the 8 pending in q8-q11 plus 8 freshly loaded. */
110 | 2: vsri.32 q10, q8, #16 | |
111 | vld1.32 {q0}, [r1,:128]! | |
112 | vcvt.s32.f32 q0, q0, #31 | |
113 | vld1.32 {q1}, [r1,:128]! | |
114 | vcvt.s32.f32 q1, q1, #31 | |
115 | vld1.32 {q12}, [r3,:128]! | |
116 | vcvt.s32.f32 q12, q12, #31 | |
117 | vsri.32 q11, q9, #16 | |
118 | vld1.32 {q13}, [r3,:128]! | |
119 | vcvt.s32.f32 q13, q13, #31 | |
120 | vst1.16 {q10}, [r0,:128]! | |
121 | vsri.32 q12, q0, #16 | |
122 | vst1.16 {q11}, [r0,:128]! | |
123 | vsri.32 q13, q1, #16 | |
124 | vst1.16 {q12-q13},[r0,:128]! | |
125 | bx lr | |
/* Tail for the 8 pending frames only; nothing more is read. */
126 | 3: vsri.32 q10, q8, #16 | |
127 | vsri.32 q11, q9, #16 | |
128 | vst1.16 {q10-q11},[r0,:128]! | |
129 | bx lr | |
130 | endfunc | |
131 | ||
/*
 * ff_conv_fltp_to_s16_neon: convert r3 separate float channels into one
 * interleaved signed 16-bit stream.
 *
 * Register use on entry, as seen in the code below:
 *   r0 - output pointer
 *   r1 - address of the array of per-channel input pointers
 *   r2 - number of samples per channel
 *   r3 - channel count
 *
 * Dispatch:
 *   - fewer than 2 channels: r1 is replaced by the first channel pointer and
 *     control is handed to ff_conv_flt_to_s16_neon;
 *   - exactly 2 channels: control is handed to ff_conv_fltp_to_s16_2ch_neon;
 *   - otherwise the channels are consumed in groups of 4, then 2, then 1.
 *     Each group walks the whole sample count and writes only its own
 *     columns of the output, stepping by r12 = 2 * channel count bytes per
 *     frame; r0 is then advanced past those columns for the next group.
 *
 * In the grouped paths each float is converted to fixed point with 31
 * fractional bits (vcvt.s32.f32 #31) and its top 16 bits are kept, either by
 * merging with vsri.32 #16 or by storing the upper halfword lane; no
 * rounding step is applied there.
 *
 * Input loads use ":128", so every channel pointer must be 16-byte aligned.
 * The loops only cover per-channel counts of 8, 16, 24, ...
 * Numeric labels are reused: "Nf"/"Nb" always mean the nearest label N after
 * or before the branch.
 * NOTE(review): presumably called from C as
 * (int16_t *dst, float *const *src, int len, int channels) - confirm against
 * the prototype. X() is provided by the included asm.S and is not visible
 * here.
 */
132 | function ff_conv_fltp_to_s16_neon, export=1 | |
133 | cmp r3, #2 | |
134 | itt lt | |
135 | ldrlt r1, [r1] | |
136 | blt X(ff_conv_flt_to_s16_neon) | |
137 | beq X(ff_conv_fltp_to_s16_2ch_neon) | |
138 | ||
/* General path. lr is saved because it is used as the sample counter below;
 * returning is done by popping the saved lr into pc. */
139 | push {r4-r8, lr} | |
140 | cmp r3, #4 | |
/* r12 = output stride in bytes per frame. lsl has no "s" suffix, so the
 * flags of the cmp above still decide the blt. */
141 | lsl r12, r3, #1 | |
/* Fewer than 4 channels left (i.e. 3): go to the 2-channel section. */
142 | blt 4f | |
143 | ||
144 | @ 4 channels | |
/* Fetch the next four channel pointers (r1 advances), restart the sample
 * counter in lr and the column write pointer in r8, then convert the first
 * 4 samples of each channel into q8-q11. */
145 | 5: ldm r1!, {r4-r7} | |
146 | mov lr, r2 | |
147 | mov r8, r0 | |
148 | vld1.32 {q8}, [r4,:128]! | |
149 | vcvt.s32.f32 q8, q8, #31 | |
150 | vld1.32 {q9}, [r5,:128]! | |
151 | vcvt.s32.f32 q9, q9, #31 | |
152 | vld1.32 {q10}, [r6,:128]! | |
153 | vcvt.s32.f32 q10, q10, #31 | |
154 | vld1.32 {q11}, [r7,:128]! | |
155 | vcvt.s32.f32 q11, q11, #31 | |
/* Loop: 8 frames per pass. vsri pairs channels 0/1 (into q9, q1) and 2/3
 * (into q11, q3); vzip.32 then brings the two pairs of one frame together so
 * that each d register holds one 4-channel frame (8 bytes), stored with
 * stride r12. The subs flags are consumed by the beq further down. */
156 | 6: subs lr, lr, #8 | |
157 | vld1.32 {q0}, [r4,:128]! | |
158 | vcvt.s32.f32 q0, q0, #31 | |
159 | vsri.32 q9, q8, #16 | |
160 | vld1.32 {q1}, [r5,:128]! | |
161 | vcvt.s32.f32 q1, q1, #31 | |
162 | vsri.32 q11, q10, #16 | |
163 | vld1.32 {q2}, [r6,:128]! | |
164 | vcvt.s32.f32 q2, q2, #31 | |
165 | vzip.32 d18, d22 | |
166 | vld1.32 {q3}, [r7,:128]! | |
167 | vcvt.s32.f32 q3, q3, #31 | |
168 | vzip.32 d19, d23 | |
169 | vst1.16 {d18}, [r8], r12 | |
170 | vsri.32 q1, q0, #16 | |
171 | vst1.16 {d22}, [r8], r12 | |
172 | vsri.32 q3, q2, #16 | |
173 | vst1.16 {d19}, [r8], r12 | |
174 | vzip.32 d2, d6 | |
175 | vst1.16 {d23}, [r8], r12 | |
176 | vzip.32 d3, d7 | |
/* Counter reached zero: store the last four frames at 7: without reading
 * any further input. */
177 | beq 7f | |
/* More to come: preload the next 4 samples per channel while storing the
 * second half of this pass. */
178 | vld1.32 {q8}, [r4,:128]! | |
179 | vcvt.s32.f32 q8, q8, #31 | |
180 | vst1.16 {d2}, [r8], r12 | |
181 | vld1.32 {q9}, [r5,:128]! | |
182 | vcvt.s32.f32 q9, q9, #31 | |
183 | vst1.16 {d6}, [r8], r12 | |
184 | vld1.32 {q10}, [r6,:128]! | |
185 | vcvt.s32.f32 q10, q10, #31 | |
186 | vst1.16 {d3}, [r8], r12 | |
187 | vld1.32 {q11}, [r7,:128]! | |
188 | vcvt.s32.f32 q11, q11, #31 | |
189 | vst1.16 {d7}, [r8], r12 | |
190 | b 6b | |
191 | 7: vst1.16 {d2}, [r8], r12 | |
192 | vst1.16 {d6}, [r8], r12 | |
193 | vst1.16 {d3}, [r8], r12 | |
194 | vst1.16 {d7}, [r8], r12 | |
/* Four channels done: return if none are left, otherwise move r0 past the
 * four columns just written (8 bytes) and repeat while at least 4 remain. */
195 | subs r3, r3, #4 | |
196 | it eq | |
197 | popeq {r4-r8, pc} | |
198 | cmp r3, #4 | |
199 | add r0, r0, #8 | |
200 | bge 5b | |
201 | ||
202 | @ 2 channels | |
/* Only one channel left: skip to the 1-channel section (the later 4:). */
203 | 4: cmp r3, #2 | |
204 | blt 4f | |
/* Fetch two channel pointers and convert the first 8 samples of each:
 * q8/q10 come from r4, q9/q11 from r5. tst checks bit 3 of the count; its
 * flags are consumed by the beq after the loads. */
205 | ldm r1!, {r4-r5} | |
206 | mov lr, r2 | |
207 | mov r8, r0 | |
208 | tst lr, #8 | |
209 | vld1.32 {q8}, [r4,:128]! | |
210 | vcvt.s32.f32 q8, q8, #31 | |
211 | vld1.32 {q9}, [r5,:128]! | |
212 | vcvt.s32.f32 q9, q9, #31 | |
213 | vld1.32 {q10}, [r4,:128]! | |
214 | vcvt.s32.f32 q10, q10, #31 | |
215 | vld1.32 {q11}, [r5,:128]! | |
216 | vcvt.s32.f32 q11, q11, #31 | |
/* Bit 3 clear: go straight to the 16-frames-per-pass loop. */
217 | beq 6f | |
/* Bit 3 set: handle one odd block of 8 frames first. If that was all
 * (count == 8) finish at 7:, else store it while preloading the next 8. */
218 | subs lr, lr, #8 | |
219 | beq 7f | |
/* vsri puts the r4 channel in the low halfword and keeps the r5 channel in
 * the high halfword; each 32-bit lane is one 2-channel frame, stored lane by
 * lane with stride r12. */
220 | vsri.32 d18, d16, #16 | |
221 | vsri.32 d19, d17, #16 | |
222 | vld1.32 {q8}, [r4,:128]! | |
223 | vcvt.s32.f32 q8, q8, #31 | |
224 | vst1.32 {d18[0]}, [r8], r12 | |
225 | vsri.32 d22, d20, #16 | |
226 | vst1.32 {d18[1]}, [r8], r12 | |
227 | vsri.32 d23, d21, #16 | |
228 | vst1.32 {d19[0]}, [r8], r12 | |
229 | vst1.32 {d19[1]}, [r8], r12 | |
230 | vld1.32 {q9}, [r5,:128]! | |
231 | vcvt.s32.f32 q9, q9, #31 | |
232 | vst1.32 {d22[0]}, [r8], r12 | |
233 | vst1.32 {d22[1]}, [r8], r12 | |
234 | vld1.32 {q10}, [r4,:128]! | |
235 | vcvt.s32.f32 q10, q10, #31 | |
236 | vst1.32 {d23[0]}, [r8], r12 | |
237 | vst1.32 {d23[1]}, [r8], r12 | |
238 | vld1.32 {q11}, [r5,:128]! | |
239 | vcvt.s32.f32 q11, q11, #31 | |
/* Loop: 16 frames per pass. The 8 pending frames (q8-q11) are merged and
 * stored while 8 more samples per channel are loaded into q0-q3 and merged
 * into d2, d3, d6, d7. The subs flags drive both the beq and the bgt. */
240 | 6: subs lr, lr, #16 | |
241 | vld1.32 {q0}, [r4,:128]! | |
242 | vcvt.s32.f32 q0, q0, #31 | |
243 | vsri.32 d18, d16, #16 | |
244 | vld1.32 {q1}, [r5,:128]! | |
245 | vcvt.s32.f32 q1, q1, #31 | |
246 | vsri.32 d19, d17, #16 | |
247 | vld1.32 {q2}, [r4,:128]! | |
248 | vcvt.s32.f32 q2, q2, #31 | |
249 | vld1.32 {q3}, [r5,:128]! | |
250 | vcvt.s32.f32 q3, q3, #31 | |
251 | vst1.32 {d18[0]}, [r8], r12 | |
252 | vsri.32 d22, d20, #16 | |
253 | vst1.32 {d18[1]}, [r8], r12 | |
254 | vsri.32 d23, d21, #16 | |
255 | vst1.32 {d19[0]}, [r8], r12 | |
256 | vsri.32 d2, d0, #16 | |
257 | vst1.32 {d19[1]}, [r8], r12 | |
258 | vsri.32 d3, d1, #16 | |
259 | vst1.32 {d22[0]}, [r8], r12 | |
260 | vsri.32 d6, d4, #16 | |
261 | vst1.32 {d22[1]}, [r8], r12 | |
262 | vsri.32 d7, d5, #16 | |
263 | vst1.32 {d23[0]}, [r8], r12 | |
264 | vst1.32 {d23[1]}, [r8], r12 | |
/* Counter reached zero: store the last 8 frames at the following 6: without
 * reading any further input. */
265 | beq 6f | |
/* More to come: preload the next 8 samples per channel while storing the
 * second half of this pass. */
266 | vld1.32 {q8}, [r4,:128]! | |
267 | vcvt.s32.f32 q8, q8, #31 | |
268 | vst1.32 {d2[0]}, [r8], r12 | |
269 | vst1.32 {d2[1]}, [r8], r12 | |
270 | vld1.32 {q9}, [r5,:128]! | |
271 | vcvt.s32.f32 q9, q9, #31 | |
272 | vst1.32 {d3[0]}, [r8], r12 | |
273 | vst1.32 {d3[1]}, [r8], r12 | |
274 | vld1.32 {q10}, [r4,:128]! | |
275 | vcvt.s32.f32 q10, q10, #31 | |
276 | vst1.32 {d6[0]}, [r8], r12 | |
277 | vst1.32 {d6[1]}, [r8], r12 | |
278 | vld1.32 {q11}, [r5,:128]! | |
279 | vcvt.s32.f32 q11, q11, #31 | |
280 | vst1.32 {d7[0]}, [r8], r12 | |
281 | vst1.32 {d7[1]}, [r8], r12 | |
/* "6b" is the loop head above, not the label on the next line. */
282 | bgt 6b | |
/* Final 8 frames of the 16-per-pass loop. */
283 | 6: vst1.32 {d2[0]}, [r8], r12 | |
284 | vst1.32 {d2[1]}, [r8], r12 | |
285 | vst1.32 {d3[0]}, [r8], r12 | |
286 | vst1.32 {d3[1]}, [r8], r12 | |
287 | vst1.32 {d6[0]}, [r8], r12 | |
288 | vst1.32 {d6[1]}, [r8], r12 | |
289 | vst1.32 {d7[0]}, [r8], r12 | |
290 | vst1.32 {d7[1]}, [r8], r12 | |
291 | b 8f | |
/* Count was exactly 8: merge and store the 8 preloaded frames. */
292 | 7: vsri.32 d18, d16, #16 | |
293 | vsri.32 d19, d17, #16 | |
294 | vst1.32 {d18[0]}, [r8], r12 | |
295 | vsri.32 d22, d20, #16 | |
296 | vst1.32 {d18[1]}, [r8], r12 | |
297 | vsri.32 d23, d21, #16 | |
298 | vst1.32 {d19[0]}, [r8], r12 | |
299 | vst1.32 {d19[1]}, [r8], r12 | |
300 | vst1.32 {d22[0]}, [r8], r12 | |
301 | vst1.32 {d22[1]}, [r8], r12 | |
302 | vst1.32 {d23[0]}, [r8], r12 | |
303 | vst1.32 {d23[1]}, [r8], r12 | |
/* Two channels done: move r0 past the two columns (4 bytes) and return if
 * no channel is left. add has no "s" suffix, so popeq still sees the subs
 * flags. Otherwise fall through to the last single channel. */
304 | 8: subs r3, r3, #2 | |
305 | add r0, r0, #4 | |
306 | it eq | |
307 | popeq {r4-r8, pc} | |
308 | ||
309 | @ 1 channel | |
/* Last channel: r4 = input pointer, r5 = column write pointer, lr = sample
 * counter. The first 8 samples are converted into q0/q1. tst checks bit 3 of
 * the count; its flags are consumed by the bne after the loads. */
310 | 4: ldr r4, [r1] | |
311 | tst r2, #8 | |
312 | mov lr, r2 | |
313 | mov r5, r0 | |
314 | vld1.32 {q0}, [r4,:128]! | |
315 | vcvt.s32.f32 q0, q0, #31 | |
316 | vld1.32 {q1}, [r4,:128]! | |
317 | vcvt.s32.f32 q1, q1, #31 | |
/* Bit 3 set: handle one odd block of 8 samples first at 8:. */
318 | bne 8f | |
/* Loop: 16 samples per pass. Lanes [1] and [3] of each d register are the
 * upper halfwords of the 32-bit results; one is stored per frame with
 * stride r12. */
319 | 6: subs lr, lr, #16 | |
320 | vld1.32 {q2}, [r4,:128]! | |
321 | vcvt.s32.f32 q2, q2, #31 | |
322 | vld1.32 {q3}, [r4,:128]! | |
323 | vcvt.s32.f32 q3, q3, #31 | |
324 | vst1.16 {d0[1]}, [r5,:16], r12 | |
325 | vst1.16 {d0[3]}, [r5,:16], r12 | |
326 | vst1.16 {d1[1]}, [r5,:16], r12 | |
327 | vst1.16 {d1[3]}, [r5,:16], r12 | |
328 | vst1.16 {d2[1]}, [r5,:16], r12 | |
329 | vst1.16 {d2[3]}, [r5,:16], r12 | |
330 | vst1.16 {d3[1]}, [r5,:16], r12 | |
331 | vst1.16 {d3[3]}, [r5,:16], r12 | |
/* Counter reached zero: skip the preload so no input past the end is read. */
332 | beq 7f | |
333 | vld1.32 {q0}, [r4,:128]! | |
334 | vcvt.s32.f32 q0, q0, #31 | |
335 | vld1.32 {q1}, [r4,:128]! | |
336 | vcvt.s32.f32 q1, q1, #31 | |
/* Second 8 samples of the pass; bgt still uses the subs flags from 6:. */
337 | 7: vst1.16 {d4[1]}, [r5,:16], r12 | |
338 | vst1.16 {d4[3]}, [r5,:16], r12 | |
339 | vst1.16 {d5[1]}, [r5,:16], r12 | |
340 | vst1.16 {d5[3]}, [r5,:16], r12 | |
341 | vst1.16 {d6[1]}, [r5,:16], r12 | |
342 | vst1.16 {d6[3]}, [r5,:16], r12 | |
343 | vst1.16 {d7[1]}, [r5,:16], r12 | |
344 | vst1.16 {d7[3]}, [r5,:16], r12 | |
345 | bgt 6b | |
346 | pop {r4-r8, pc} | |
/* Odd block of 8: store the preloaded samples, return if that was all,
 * otherwise preload 8 more and join the 16-per-pass loop. */
347 | 8: subs lr, lr, #8 | |
348 | vst1.16 {d0[1]}, [r5,:16], r12 | |
349 | vst1.16 {d0[3]}, [r5,:16], r12 | |
350 | vst1.16 {d1[1]}, [r5,:16], r12 | |
351 | vst1.16 {d1[3]}, [r5,:16], r12 | |
352 | vst1.16 {d2[1]}, [r5,:16], r12 | |
353 | vst1.16 {d2[3]}, [r5,:16], r12 | |
354 | vst1.16 {d3[1]}, [r5,:16], r12 | |
355 | vst1.16 {d3[3]}, [r5,:16], r12 | |
356 | it eq | |
357 | popeq {r4-r8, pc} | |
358 | vld1.32 {q0}, [r4,:128]! | |
359 | vcvt.s32.f32 q0, q0, #31 | |
360 | vld1.32 {q1}, [r4,:128]! | |
361 | vcvt.s32.f32 q1, q1, #31 | |
362 | b 6b | |
363 | endfunc |