Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * ARM NEON optimised DSP functions | |
3 | * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | |
4 | * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net> | |
5 | * | |
6 | * This file is part of FFmpeg. | |
7 | * | |
8 | * FFmpeg is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License as published by the Free Software Foundation; either | |
11 | * version 2.1 of the License, or (at your option) any later version. | |
12 | * | |
13 | * FFmpeg is distributed in the hope that it will be useful, | |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 | * Lesser General Public License for more details. | |
17 | * | |
18 | * You should have received a copy of the GNU Lesser General Public | |
19 | * License along with FFmpeg; if not, write to the Free Software | |
20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 | */ | |
22 | ||
23 | #include "libavutil/aarch64/asm.S" | |
24 | ||
25 | .macro pixels16 rnd=1, avg=0 /* Function body for a 16-byte-wide row copy: x1 = source, x0 = destination, x2 = stride added after each row, w3 = rows left. The rnd argument is not referenced in this body. */ | |
26 | .if \avg /* avg=1 variant: blend with the bytes already stored at the destination */ | |
27 | mov x12, x0 /* x12 walks the destination for reads so that x0 stays free for the stores */ | |
28 | .endif | |
29 | 1: ld1 {v0.16B}, [x1], x2 /* four source rows are fetched per iteration */ | |
30 | ld1 {v1.16B}, [x1], x2 | |
31 | ld1 {v2.16B}, [x1], x2 | |
32 | ld1 {v3.16B}, [x1], x2 | |
33 | .if \avg | |
34 | ld1 {v4.16B}, [x12], x2 /* matching destination row */ | |
35 | urhadd v0.16B, v0.16B, v4.16B /* per-byte rounding average of source and destination */ | |
36 | ld1 {v5.16B}, [x12], x2 | |
37 | urhadd v1.16B, v1.16B, v5.16B | |
38 | ld1 {v6.16B}, [x12], x2 | |
39 | urhadd v2.16B, v2.16B, v6.16B | |
40 | ld1 {v7.16B}, [x12], x2 | |
41 | urhadd v3.16B, v3.16B, v7.16B | |
42 | .endif | |
43 | subs w3, w3, #4 /* four rows consumed, the flags set here are tested by b.ne below */ | |
44 | st1 {v0.16B}, [x0], x2 | |
45 | st1 {v1.16B}, [x0], x2 | |
46 | st1 {v2.16B}, [x0], x2 | |
47 | st1 {v3.16B}, [x0], x2 | |
48 | b.ne 1b /* the loop ends only when w3 hits exactly 0, so the row count is expected to be a positive multiple of 4 */ | |
49 | ret | |
50 | .endm | |
51 | ||
52 | .macro pixels16_x2 rnd=1, avg=0 /* Function body for 16-wide horizontal interpolation: each output byte is the average of a source byte and its right-hand neighbour. x1 = source, x0 = destination, x2 = stride, w3 = rows left. */ | |
53 | 1: ld1 {v0.16B, v1.16B}, [x1], x2 /* 32 bytes are loaded per row although only bytes 0..16 contribute to the result */ | |
54 | ld1 {v2.16B, v3.16B}, [x1], x2 | |
55 | subs w3, w3, #2 /* two rows per iteration, nothing below changes the flags before b.ne */ | |
56 | ext v1.16B, v0.16B, v1.16B, #1 /* v1 = the same row shifted left by one byte */ | |
57 | avg v0.16B, v0.16B, v1.16B /* avg is the helper macro set up by pixfunc: urhadd when rnd=1, uhadd when rnd=0 */ | |
58 | ext v3.16B, v2.16B, v3.16B, #1 | |
59 | avg v2.16B, v2.16B, v3.16B | |
60 | .if \avg /* here avg is the macro argument, not the helper macro used above */ | |
61 | ld1 {v1.16B}, [x0], x2 /* first destination row, x0 advances to the second */ | |
62 | ld1 {v3.16B}, [x0] /* second destination row, no post-increment */ | |
63 | urhadd v0.16B, v0.16B, v1.16B /* blending with the destination always rounds, whatever rnd is */ | |
64 | urhadd v2.16B, v2.16B, v3.16B | |
65 | sub x0, x0, x2 /* rewind x0 to the first row before storing */ | |
66 | .endif | |
67 | st1 {v0.16B}, [x0], x2 | |
68 | st1 {v2.16B}, [x0], x2 | |
69 | b.ne 1b /* the row count is expected to be a positive multiple of 2 */ | |
70 | ret | |
71 | .endm | |
72 | ||
73 | .macro pixels16_y2 rnd=1, avg=0 /* Function body for 16-wide vertical interpolation: each output row is the average of two consecutive source rows. x1 = source, x0 = destination, x2 = stride, w3 = rows to produce. */ | |
74 | sub w3, w3, #2 /* the last two output rows are produced by the tail after the loop */ | |
75 | ld1 {v0.16B}, [x1], x2 /* prime v0 and v1 with the first two source rows */ | |
76 | ld1 {v1.16B}, [x1], x2 | |
77 | 1: subs w3, w3, #2 /* two output rows per iteration, flags are tested by b.ne at the bottom */ | |
78 | avg v2.16B, v0.16B, v1.16B /* helper macro from pixfunc: urhadd when rnd=1, uhadd when rnd=0 */ | |
79 | ld1 {v0.16B}, [x1], x2 /* v0 now holds the row after v1 */ | |
80 | avg v3.16B, v0.16B, v1.16B | |
81 | ld1 {v1.16B}, [x1], x2 /* v1 now holds the row after v0 */ | |
82 | .if \avg /* avg=1 variant: blend both results with the existing destination rows */ | |
83 | ld1 {v4.16B}, [x0], x2 | |
84 | ld1 {v5.16B}, [x0] | |
85 | urhadd v2.16B, v2.16B, v4.16B | |
86 | urhadd v3.16B, v3.16B, v5.16B | |
87 | sub x0, x0, x2 /* rewind x0 to the first of the two rows */ | |
88 | .endif | |
89 | st1 {v2.16B}, [x0], x2 | |
90 | st1 {v3.16B}, [x0], x2 | |
91 | b.ne 1b /* the loop runs at least once and ends only at exactly 0, so the initial row count is expected to be even and at least 4 */ | |
92 | ||
93 | avg v2.16B, v0.16B, v1.16B /* tail: final two output rows */ | |
94 | ld1 {v0.16B}, [x1], x2 /* only one further source row is read here, one more than the number of output rows in total */ | |
95 | avg v3.16B, v0.16B, v1.16B | |
96 | .if \avg | |
97 | ld1 {v4.16B}, [x0], x2 | |
98 | ld1 {v5.16B}, [x0] | |
99 | urhadd v2.16B, v2.16B, v4.16B | |
100 | urhadd v3.16B, v3.16B, v5.16B | |
101 | sub x0, x0, x2 | |
102 | .endif | |
103 | st1 {v2.16B}, [x0], x2 | |
104 | st1 {v3.16B}, [x0], x2 | |
105 | ||
106 | ret | |
107 | .endm | |
108 | ||
109 | .macro pixels16_xy2 rnd=1, avg=0 /* Function body for 16-wide diagonal interpolation: each output byte is derived from the sum of a 2x2 source neighbourhood shifted right by 2. x1 = source, x0 = destination, x2 = stride, w3 = rows to produce. */ | |
110 | sub w3, w3, #2 /* the last two output rows are produced by the tail after the loop */ | |
111 | ld1 {v0.16B, v1.16B}, [x1], x2 /* first source row, 32 bytes loaded */ | |
112 | ld1 {v4.16B, v5.16B}, [x1], x2 /* second source row */ | |
113 | NRND movi v26.8H, #1 /* emitted only in the rnd=0 build: bias of 1 added before the truncating shift */ | |
114 | ext v1.16B, v0.16B, v1.16B, #1 /* each row shifted left by one byte */ | |
115 | ext v5.16B, v4.16B, v5.16B, #1 | |
116 | uaddl v16.8H, v0.8B, v1.8B /* v16 and v20: 16-bit horizontal pair sums of the first row, low and high half */ | |
117 | uaddl2 v20.8H, v0.16B, v1.16B | |
118 | uaddl v18.8H, v4.8B, v5.8B /* v18 and v22: the same for the second row */ | |
119 | uaddl2 v22.8H, v4.16B, v5.16B | |
120 | 1: subs w3, w3, #2 /* two output rows per iteration, flags are tested by b.gt at the bottom */ | |
121 | ld1 {v0.16B, v1.16B}, [x1], x2 /* next source row */ | |
122 | add v24.8H, v16.8H, v18.8H /* 2x2 sums, low half */ | |
123 | NRND add v24.8H, v24.8H, v26.8H | |
124 | ext v30.16B, v0.16B, v1.16B, #1 /* shifted copy of the new row, v1 is free afterwards */ | |
125 | add v1.8H, v20.8H, v22.8H /* 2x2 sums, high half, v1 reused as scratch */ | |
126 | mshrn v28.8B, v24.8H, #2 /* rshrn when rnd=1, which rounds, or shrn when rnd=0 */ | |
127 | NRND add v1.8H, v1.8H, v26.8H | |
128 | mshrn2 v28.16B, v1.8H, #2 | |
129 | .if \avg /* avg=1 variant: blend with the existing destination row */ | |
130 | ld1 {v16.16B}, [x0] /* v16 is free as a temporary here, it is recomputed just below */ | |
131 | urhadd v28.16B, v28.16B, v16.16B | |
132 | .endif | |
133 | uaddl v16.8H, v0.8B, v30.8B /* v16 and v20 now hold the pair sums of the newly loaded row */ | |
134 | ld1 {v2.16B, v3.16B}, [x1], x2 /* source row for the second half of the iteration */ | |
135 | uaddl2 v20.8H, v0.16B, v30.16B | |
136 | st1 {v28.16B}, [x0], x2 | |
137 | add v24.8H, v16.8H, v18.8H /* second output row: same steps with the row roles swapped */ | |
138 | NRND add v24.8H, v24.8H, v26.8H | |
139 | ext v3.16B, v2.16B, v3.16B, #1 | |
140 | add v0.8H, v20.8H, v22.8H /* v0 reused as scratch for the high half */ | |
141 | mshrn v30.8B, v24.8H, #2 | |
142 | NRND add v0.8H, v0.8H, v26.8H | |
143 | mshrn2 v30.16B, v0.8H, #2 | |
144 | .if \avg | |
145 | ld1 {v18.16B}, [x0] /* v18 is free as a temporary here, it is recomputed just below */ | |
146 | urhadd v30.16B, v30.16B, v18.16B | |
147 | .endif | |
148 | uaddl v18.8H, v2.8B, v3.8B /* v18 and v22 now hold the pair sums of the latest row */ | |
149 | uaddl2 v22.8H, v2.16B, v3.16B | |
150 | st1 {v30.16B}, [x0], x2 | |
151 | b.gt 1b /* the loop body always runs once and the tail adds two rows, so at least 4 output rows are written */ | |
152 | ||
153 | ld1 {v0.16B, v1.16B}, [x1], x2 /* tail: the only further source row that is read */ | |
154 | add v24.8H, v16.8H, v18.8H | |
155 | NRND add v24.8H, v24.8H, v26.8H | |
156 | ext v30.16B, v0.16B, v1.16B, #1 | |
157 | add v1.8H, v20.8H, v22.8H | |
158 | mshrn v28.8B, v24.8H, #2 | |
159 | NRND add v1.8H, v1.8H, v26.8H | |
160 | mshrn2 v28.16B, v1.8H, #2 | |
161 | .if \avg | |
162 | ld1 {v16.16B}, [x0] | |
163 | urhadd v28.16B, v28.16B, v16.16B | |
164 | .endif | |
165 | uaddl v16.8H, v0.8B, v30.8B | |
166 | uaddl2 v20.8H, v0.16B, v30.16B | |
167 | st1 {v28.16B}, [x0], x2 | |
168 | add v24.8H, v16.8H, v18.8H /* last output row, built from the two most recent rows */ | |
169 | NRND add v24.8H, v24.8H, v26.8H | |
170 | add v0.8H, v20.8H, v22.8H | |
171 | mshrn v30.8B, v24.8H, #2 | |
172 | NRND add v0.8H, v0.8H, v26.8H | |
173 | mshrn2 v30.16B, v0.8H, #2 | |
174 | .if \avg | |
175 | ld1 {v18.16B}, [x0] | |
176 | urhadd v30.16B, v30.16B, v18.16B | |
177 | .endif | |
178 | st1 {v30.16B}, [x0], x2 | |
179 | ||
180 | ret | |
181 | .endm | |
182 | ||
183 | .macro pixels8 rnd=1, avg=0 /* Function body for an 8-byte-wide row copy: x1 = source, x0 = destination, x2 = stride added after each row, w3 = rows left. The rnd argument is not referenced in this body. */ | |
184 | 1: ld1 {v0.8B}, [x1], x2 /* four source rows are fetched per iteration */ | |
185 | ld1 {v1.8B}, [x1], x2 | |
186 | ld1 {v2.8B}, [x1], x2 | |
187 | ld1 {v3.8B}, [x1], x2 | |
188 | .if \avg /* avg=1 variant: blend with the bytes already stored at the destination */ | |
189 | ld1 {v4.8B}, [x0], x2 /* the destination rows are read through x0 itself */ | |
190 | urhadd v0.8B, v0.8B, v4.8B /* per-byte rounding average of source and destination */ | |
191 | ld1 {v5.8B}, [x0], x2 | |
192 | urhadd v1.8B, v1.8B, v5.8B | |
193 | ld1 {v6.8B}, [x0], x2 | |
194 | urhadd v2.8B, v2.8B, v6.8B | |
195 | ld1 {v7.8B}, [x0], x2 | |
196 | urhadd v3.8B, v3.8B, v7.8B | |
197 | sub x0, x0, x2, lsl #2 /* rewind x0 by four rows so the stores hit the rows just read */ | |
198 | .endif | |
199 | subs w3, w3, #4 /* four rows consumed, the flags set here are tested by b.ne below */ | |
200 | st1 {v0.8B}, [x0], x2 | |
201 | st1 {v1.8B}, [x0], x2 | |
202 | st1 {v2.8B}, [x0], x2 | |
203 | st1 {v3.8B}, [x0], x2 | |
204 | b.ne 1b /* the loop ends only when w3 hits exactly 0, so the row count is expected to be a positive multiple of 4 */ | |
205 | ret | |
206 | .endm | |
207 | ||
208 | .macro pixels8_x2 rnd=1, avg=0 /* Function body for 8-wide horizontal interpolation: each output byte is the average of a source byte and its right-hand neighbour. x1 = source, x0 = destination, x2 = stride, w3 = rows left. */ | |
209 | 1: ld1 {v0.8B, v1.8B}, [x1], x2 /* 16 bytes are loaded per row although only bytes 0..8 contribute to the result */ | |
210 | ext v1.8B, v0.8B, v1.8B, #1 /* v1 = the same row shifted left by one byte */ | |
211 | ld1 {v2.8B, v3.8B}, [x1], x2 | |
212 | ext v3.8B, v2.8B, v3.8B, #1 | |
213 | subs w3, w3, #2 /* two rows per iteration, nothing below changes the flags before b.ne */ | |
214 | avg v0.8B, v0.8B, v1.8B /* avg is the helper macro set up by pixfunc: urhadd when rnd=1, uhadd when rnd=0 */ | |
215 | avg v2.8B, v2.8B, v3.8B | |
216 | .if \avg /* here avg is the macro argument, not the helper macro used above */ | |
217 | ld1 {v4.8B}, [x0], x2 /* first destination row, x0 advances to the second */ | |
218 | ld1 {v5.8B}, [x0] /* second destination row, no post-increment */ | |
219 | urhadd v0.8B, v0.8B, v4.8B /* blending with the destination always rounds, whatever rnd is */ | |
220 | urhadd v2.8B, v2.8B, v5.8B | |
221 | sub x0, x0, x2 /* rewind x0 to the first row before storing */ | |
222 | .endif | |
223 | st1 {v0.8B}, [x0], x2 | |
224 | st1 {v2.8B}, [x0], x2 | |
225 | b.ne 1b /* the row count is expected to be a positive multiple of 2 */ | |
226 | ret | |
227 | .endm | |
228 | ||
229 | .macro pixels8_y2 rnd=1, avg=0 /* Function body for 8-wide vertical interpolation: each output row is the average of two consecutive source rows. x1 = source, x0 = destination, x2 = stride, w3 = rows to produce. */ | |
230 | sub w3, w3, #2 /* the last two output rows are produced by the tail after the loop */ | |
231 | ld1 {v0.8B}, [x1], x2 /* prime v0 and v1 with the first two source rows */ | |
232 | ld1 {v1.8B}, [x1], x2 | |
233 | 1: subs w3, w3, #2 /* two output rows per iteration, flags are tested by b.ne at the bottom */ | |
234 | avg v4.8B, v0.8B, v1.8B /* helper macro from pixfunc: urhadd when rnd=1, uhadd when rnd=0 */ | |
235 | ld1 {v0.8B}, [x1], x2 /* v0 now holds the row after v1 */ | |
236 | avg v5.8B, v0.8B, v1.8B | |
237 | ld1 {v1.8B}, [x1], x2 /* v1 now holds the row after v0 */ | |
238 | .if \avg /* avg=1 variant: blend both results with the existing destination rows */ | |
239 | ld1 {v2.8B}, [x0], x2 | |
240 | ld1 {v3.8B}, [x0] | |
241 | urhadd v4.8B, v4.8B, v2.8B | |
242 | urhadd v5.8B, v5.8B, v3.8B | |
243 | sub x0, x0, x2 /* rewind x0 to the first of the two rows */ | |
244 | .endif | |
245 | st1 {v4.8B}, [x0], x2 | |
246 | st1 {v5.8B}, [x0], x2 | |
247 | b.ne 1b /* the loop runs at least once and ends only at exactly 0, so the initial row count is expected to be even and at least 4 */ | |
248 | ||
249 | avg v4.8B, v0.8B, v1.8B /* tail: final two output rows */ | |
250 | ld1 {v0.8B}, [x1], x2 /* only one further source row is read here, one more than the number of output rows in total */ | |
251 | avg v5.8B, v0.8B, v1.8B | |
252 | .if \avg | |
253 | ld1 {v2.8B}, [x0], x2 | |
254 | ld1 {v3.8B}, [x0] | |
255 | urhadd v4.8B, v4.8B, v2.8B | |
256 | urhadd v5.8B, v5.8B, v3.8B | |
257 | sub x0, x0, x2 | |
258 | .endif | |
259 | st1 {v4.8B}, [x0], x2 | |
260 | st1 {v5.8B}, [x0], x2 | |
261 | ||
262 | ret | |
263 | .endm | |
264 | ||
265 | .macro pixels8_xy2 rnd=1, avg=0 /* Function body for 8-wide diagonal interpolation: each output byte is derived from the sum of a 2x2 source neighbourhood shifted right by 2. x1 = source, x0 = destination, x2 = stride, w3 = rows to produce. */ | |
266 | sub w3, w3, #2 /* the last two output rows are produced by the tail after the loop */ | |
267 | ld1 {v0.16B}, [x1], x2 /* 16 bytes are loaded per row although only bytes 0..8 are used */ | |
268 | ld1 {v1.16B}, [x1], x2 | |
269 | NRND movi v19.8H, #1 /* emitted only in the rnd=0 build: bias of 1 added before the truncating shift */ | |
270 | ext v4.16B, v0.16B, v4.16B, #1 /* row shifted left by one byte. v4 is never written before this point, but it only supplies the top byte of the result and just the low 8 bytes are read below */ | |
271 | ext v6.16B, v1.16B, v6.16B, #1 /* same remark for v6 */ | |
272 | uaddl v16.8H, v0.8B, v4.8B /* v16: 16-bit horizontal pair sums of the first row */ | |
273 | uaddl v17.8H, v1.8B, v6.8B /* v17: the same for the second row */ | |
274 | 1: subs w3, w3, #2 /* two output rows per iteration, flags are tested by b.gt at the bottom */ | |
275 | ld1 {v0.16B}, [x1], x2 /* next source row */ | |
276 | add v18.8H, v16.8H, v17.8H /* 2x2 sums for the first output row */ | |
277 | ext v4.16B, v0.16B, v4.16B, #1 | |
278 | NRND add v18.8H, v18.8H, v19.8H | |
279 | uaddl v16.8H, v0.8B, v4.8B /* v16 now holds the pair sums of the newly loaded row */ | |
280 | mshrn v5.8B, v18.8H, #2 /* rshrn when rnd=1, which rounds, or shrn when rnd=0 */ | |
281 | ld1 {v1.16B}, [x1], x2 /* source row for the second half of the iteration */ | |
282 | add v18.8H, v16.8H, v17.8H /* 2x2 sums for the second output row */ | |
283 | .if \avg /* avg=1 variant: blend with the existing destination row */ | |
284 | ld1 {v7.8B}, [x0] | |
285 | urhadd v5.8B, v5.8B, v7.8B | |
286 | .endif | |
287 | NRND add v18.8H, v18.8H, v19.8H | |
288 | st1 {v5.8B}, [x0], x2 | |
289 | mshrn v7.8B, v18.8H, #2 | |
290 | .if \avg | |
291 | ld1 {v5.8B}, [x0] /* v5 was stored above and is free as a temporary */ | |
292 | urhadd v7.8B, v7.8B, v5.8B | |
293 | .endif | |
294 | ext v6.16B, v1.16B, v6.16B, #1 | |
295 | uaddl v17.8H, v1.8B, v6.8B /* v17 now holds the pair sums of the latest row */ | |
296 | st1 {v7.8B}, [x0], x2 | |
297 | b.gt 1b /* the loop body always runs once and the tail adds two rows, so at least 4 output rows are written */ | |
298 | ||
299 | ld1 {v0.16B}, [x1], x2 /* tail: the only further source row that is read */ | |
300 | add v18.8H, v16.8H, v17.8H | |
301 | ext v4.16B, v0.16B, v4.16B, #1 | |
302 | NRND add v18.8H, v18.8H, v19.8H | |
303 | uaddl v16.8H, v0.8B, v4.8B | |
304 | mshrn v5.8B, v18.8H, #2 | |
305 | add v18.8H, v16.8H, v17.8H /* last output row, built from the two most recent rows */ | |
306 | .if \avg | |
307 | ld1 {v7.8B}, [x0] | |
308 | urhadd v5.8B, v5.8B, v7.8B | |
309 | .endif | |
310 | NRND add v18.8H, v18.8H, v19.8H | |
311 | st1 {v5.8B}, [x0], x2 | |
312 | mshrn v7.8B, v18.8H, #2 | |
313 | .if \avg | |
314 | ld1 {v5.8B}, [x0] | |
315 | urhadd v7.8B, v7.8B, v5.8B | |
316 | .endif | |
317 | st1 {v7.8B}, [x0], x2 | |
318 | ||
319 | ret | |
320 | .endm | |
321 | ||
322 | .macro pixfunc pfx, name, suf, rnd=1, avg=0 /* Emits one exported function whose body is the pixel macro called name. pfx and suf are pasted into the symbol, rnd selects the rounding helpers, avg is forwarded to the body macro. */ | |
323 | .if \rnd /* rounding build */ | |
324 | .macro avg rd, rn, rm /* avg: rounding halving add */ | |
325 | urhadd \rd, \rn, \rm | |
326 | .endm | |
327 | .macro mshrn rd, rn, rm /* mshrn: rounding narrowing right shift */ | |
328 | rshrn \rd, \rn, \rm | |
329 | .endm | |
330 | .macro mshrn2 rd, rn, rm /* mshrn2: the same, writing the upper half of the destination */ | |
331 | rshrn2 \rd, \rn, \rm | |
332 | .endm | |
333 | .macro NRND insn:vararg /* NRND: expands to nothing here, so lines prefixed with it vanish */ | |
334 | .endm | |
335 | .else /* no-rounding build */ | |
336 | .macro avg rd, rn, rm /* avg: truncating halving add */ | |
337 | uhadd \rd, \rn, \rm | |
338 | .endm | |
339 | .macro mshrn rd, rn, rm /* mshrn: truncating narrowing right shift */ | |
340 | shrn \rd, \rn, \rm | |
341 | .endm | |
342 | .macro mshrn2 rd, rn, rm /* mshrn2: the same, writing the upper half of the destination */ | |
343 | shrn2 \rd, \rn, \rm | |
344 | .endm | |
345 | .macro NRND insn:vararg /* NRND: passes the prefixed instruction through unchanged */ | |
346 | \insn | |
347 | .endm | |
348 | .endif | |
349 | function ff_\pfx\name\suf\()_neon, export=1 /* symbol is ff_ + pfx + name + suf + _neon. function and endfunc are not defined in this file, presumably they come from the included asm.S */ | |
350 | \name \rnd, \avg /* expand the selected body macro, which ends with its own ret */ | |
351 | endfunc | |
352 | .purgem avg /* drop the helpers so that the next pixfunc invocation can define them afresh */ | |
353 | .purgem mshrn | |
354 | .purgem mshrn2 | |
355 | .purgem NRND | |
356 | .endm | |
357 | ||
358 | .macro pixfunc2 pfx, name, avg=0 /* Emits both rounding flavours of one body macro by calling pixfunc twice */ | |
359 | pixfunc \pfx, \name, rnd=1, avg=\avg /* rounding version, suf left empty so the symbol is ff_ + pfx + name + _neon */ | |
360 | pixfunc \pfx, \name, _no_rnd, rnd=0, avg=\avg /* truncating version, symbol gets the _no_rnd suffix before _neon */ | |
361 | .endm | |
362 | ||
363 | function ff_put_h264_qpel16_mc00_neon, export=1 /* Entry point that fixes the row count at 16 and has no ret of its own */ | |
364 | mov w3, #16 /* w3 is the row counter consumed by the pixels16 body */ | |
365 | endfunc /* NOTE(review): nothing returns or branches above, so execution presumably falls through into ff_put_pixels16_neon emitted just below. Confirm that function and endfunc in asm.S place nothing in between that would break this. */ | |
366 | ||
367 | pixfunc put_, pixels16, avg=0 /* emits ff_put_pixels16_neon, must stay directly after the stub above */ | |
368 | pixfunc2 put_, pixels16_x2, avg=0 /* emits ff_put_pixels16_x2_neon and ff_put_pixels16_x2_no_rnd_neon */ | |
369 | pixfunc2 put_, pixels16_y2, avg=0 /* emits ff_put_pixels16_y2_neon and ff_put_pixels16_y2_no_rnd_neon */ | |
370 | pixfunc2 put_, pixels16_xy2, avg=0 /* emits ff_put_pixels16_xy2_neon and ff_put_pixels16_xy2_no_rnd_neon */ | |
371 | ||
372 | function ff_avg_h264_qpel16_mc00_neon, export=1 /* Entry point that fixes the row count at 16 and has no ret of its own */ | |
373 | mov w3, #16 /* w3 is the row counter consumed by the pixels16 body */ | |
374 | endfunc /* NOTE(review): nothing returns or branches above, so execution presumably falls through into ff_avg_pixels16_neon emitted just below. Confirm that function and endfunc in asm.S place nothing in between that would break this. */ | |
375 | ||
376 | pixfunc avg_, pixels16, avg=1 /* emits ff_avg_pixels16_neon, must stay directly after the stub above */ | |
377 | pixfunc2 avg_, pixels16_x2, avg=1 /* emits ff_avg_pixels16_x2_neon and ff_avg_pixels16_x2_no_rnd_neon */ | |
378 | pixfunc2 avg_, pixels16_y2, avg=1 /* emits ff_avg_pixels16_y2_neon and ff_avg_pixels16_y2_no_rnd_neon */ | |
379 | pixfunc2 avg_, pixels16_xy2, avg=1 /* emits ff_avg_pixels16_xy2_neon and ff_avg_pixels16_xy2_no_rnd_neon */ | |
380 | ||
381 | function ff_put_h264_qpel8_mc00_neon, export=1 /* Entry point that fixes the row count at 8 and has no ret of its own */ | |
382 | mov w3, #8 /* w3 is the row counter consumed by the pixels8 body */ | |
383 | endfunc /* NOTE(review): nothing returns or branches above, so execution presumably falls through into ff_put_pixels8_neon emitted just below. Confirm that function and endfunc in asm.S place nothing in between that would break this. */ | |
384 | ||
385 | pixfunc put_, pixels8, avg=0 /* emits ff_put_pixels8_neon, must stay directly after the stub above */ | |
386 | pixfunc2 put_, pixels8_x2, avg=0 /* emits ff_put_pixels8_x2_neon and ff_put_pixels8_x2_no_rnd_neon */ | |
387 | pixfunc2 put_, pixels8_y2, avg=0 /* emits ff_put_pixels8_y2_neon and ff_put_pixels8_y2_no_rnd_neon */ | |
388 | pixfunc2 put_, pixels8_xy2, avg=0 /* emits ff_put_pixels8_xy2_neon and ff_put_pixels8_xy2_no_rnd_neon */ | |
389 | ||
390 | function ff_avg_h264_qpel8_mc00_neon, export=1 /* Entry point that fixes the row count at 8 and has no ret of its own */ | |
391 | mov w3, #8 /* w3 is the row counter consumed by the pixels8 body */ | |
392 | endfunc /* NOTE(review): nothing returns or branches above, so execution presumably falls through into ff_avg_pixels8_neon emitted just below. Confirm that function and endfunc in asm.S place nothing in between that would break this. */ | |
393 | ||
394 | pixfunc avg_, pixels8, avg=1 /* emits ff_avg_pixels8_neon, must stay directly after the stub above */ | |
395 | pixfunc avg_, pixels8_x2, avg=1 /* pixfunc rather than pixfunc2: only the rounding ff_avg_pixels8_x2_neon is emitted */ | |
396 | pixfunc avg_, pixels8_y2, avg=1 /* only the rounding ff_avg_pixels8_y2_neon is emitted */ | |
397 | pixfunc avg_, pixels8_xy2, avg=1 /* only the rounding ff_avg_pixels8_xy2_neon is emitted */ | |