/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

.macro  pixels16        rnd=1, avg=0
  .if \avg
        mov             r12, r0
  .endif
1:      vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
        vld1.8          {q2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {q3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {q8},     [r12,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.8          {q9},     [r12,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.8          {q10},    [r12,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.8          {q11},    [r12,:128], r2
        vrhadd.u8       q3,  q3,  q11
  .endif
        subs            r3,  r3,  #4
        vst1.64         {q0},     [r0,:128], r2
        vst1.64         {q1},     [r0,:128], r2
        vst1.64         {q2},     [r0,:128], r2
        vst1.64         {q3},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

.macro  pixels16_x2     rnd=1, avg=0
1:      vld1.8          {d0-d2},  [r1], r2
        vld1.8          {d4-d6},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        avg             q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        avg             q2,  q2,  q3
  .if \avg
        vld1.8          {q1},     [r0,:128], r2
        vld1.8          {q3},     [r0,:128]
        vrhadd.u8       q0,  q0,  q1
        vrhadd.u8       q2,  q2,  q3
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q0},     [r0,:128], r2
        vst1.8          {q2},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

.macro  pixels16_y2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             q2,  q0,  q1
        vld1.8          {q0},     [r1], r2
        avg             q3,  q0,  q1
        vld1.8          {q1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},     [r0,:128], r2
        vst1.8          {q3},     [r0,:128], r2
        bne             1b

        avg             q2,  q0,  q1
        vld1.8          {q0},     [r1], r2
        avg             q3,  q0,  q1
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},     [r0,:128], r2
        vst1.8          {q3},     [r0,:128], r2

        bx              lr
.endm

.macro  pixels16_xy2    rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0-d2},  [r1], r2
        vld1.8          {d4-d6},  [r1], r2
NRND    vmov.i16        q13, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.8          {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vld1.8          {d2-d4},  [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
NRND    vadd.u16        q12, q12, q13
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.8          {q15},    [r0,:128], r2
        bgt             1b

        vld1.8          {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vst1.8          {q15},    [r0,:128], r2

        bx              lr
.endm

.macro  pixels8         rnd=1, avg=0
1:      vld1.8          {d0},     [r1], r2
        vld1.8          {d1},     [r1], r2
        vld1.8          {d2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {d3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.8          {d5},     [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.8          {d6},     [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.8          {d7},     [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2
  .endif
        subs            r3,  r3,  #4
        vst1.8          {d0},     [r0,:64], r2
        vst1.8          {d1},     [r0,:64], r2
        vst1.8          {d2},     [r0,:64], r2
        vst1.8          {d3},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_x2      rnd=1, avg=0
1:      vld1.8          {q0},     [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.8          {q1},     [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        avg             q0,  q0,  q1
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       q0,  q0,  q2
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d0},     [r0,:64], r2
        vst1.8          {d1},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_y2      rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0},     [r1], r2
        vld1.8          {d1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             d4,  d0,  d1
        vld1.8          {d0},     [r1], r2
        avg             d5,  d0,  d1
        vld1.8          {d1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d5},     [r0,:64], r2
        bne             1b

        avg             d4,  d0,  d1
        vld1.8          {d0},     [r1], r2
        avg             d5,  d0,  d1
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d5},     [r0,:64], r2

        bx              lr
.endm

.macro  pixels8_xy2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
NRND    vmov.i16        q11, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vld1.8          {q1},     [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.8          {d7},     [r0,:64], r2
        bgt             1b

        vld1.8          {q0},     [r1], r2
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vadd.u16        q10, q8,  q9
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vst1.8          {d7},     [r0,:64], r2

        bx              lr
.endm

.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg  rd, rn, rm
        vrhadd.u8       \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vrshrn.u16      \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
    .endm
  .else
    .macro avg  rd, rn, rm
        vhadd.u8        \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vshrn.u16       \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
        \insn
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
        .purgem         avg
        .purgem         shrn
        .purgem         NRND
.endm

.macro  pixfunc2        pfx, name, avg=0
        pixfunc         \pfx, \name,          rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm

function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         avg_, pixels8,     avg=1
        pixfunc         avg_, pixels8_x2,  avg=1
        pixfunc         avg_, pixels8_y2,  avg=1
        pixfunc         avg_, pixels8_xy2, avg=1