/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
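
// pixels16: copy a 16-pixel-wide block from src (x1, line stride x2) to
// dst (x0), four rows per iteration; w3 holds the row count.  With avg=1
// the result is averaged with the pixels already present in dst.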
.macro  pixels16        rnd=1, avg=0
  .if \avg
        mov             x12, x0
  .endif
1:      ld1             {v0.16B}, [x1], x2
        ld1             {v1.16B}, [x1], x2
        ld1             {v2.16B}, [x1], x2
        ld1             {v3.16B}, [x1], x2
  .if \avg
        ld1             {v4.16B}, [x12], x2
        urhadd          v0.16B, v0.16B, v4.16B
        ld1             {v5.16B}, [x12], x2
        urhadd          v1.16B, v1.16B, v5.16B
        ld1             {v6.16B}, [x12], x2
        urhadd          v2.16B, v2.16B, v6.16B
        ld1             {v7.16B}, [x12], x2
        urhadd          v3.16B, v3.16B, v7.16B
  .endif
        subs            w3, w3, #4
        st1             {v0.16B}, [x0], x2
        st1             {v1.16B}, [x0], x2
        st1             {v2.16B}, [x0], x2
        st1             {v3.16B}, [x0], x2
        b.ne            1b
        ret
.endm
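
// pixels16_x2: horizontal half-pel interpolation over a 16-pixel-wide block,
// averaging each pixel with its right neighbour.  "avg" expands to urhadd
// (rounding) or uhadd (truncating) depending on the rnd parameter.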
.macro  pixels16_x2     rnd=1, avg=0
1:      ld1             {v0.16B, v1.16B}, [x1], x2
        ld1             {v2.16B, v3.16B}, [x1], x2
        subs            w3, w3, #2
        ext             v1.16B, v0.16B, v1.16B, #1
        avg             v0.16B, v0.16B, v1.16B
        ext             v3.16B, v2.16B, v3.16B, #1
        avg             v2.16B, v2.16B, v3.16B
  .if \avg
        ld1             {v1.16B}, [x0], x2
        ld1             {v3.16B}, [x0]
        urhadd          v0.16B, v0.16B, v1.16B
        urhadd          v2.16B, v2.16B, v3.16B
        sub             x0, x0, x2
  .endif
        st1             {v0.16B}, [x0], x2
        st1             {v2.16B}, [x0], x2
        b.ne            1b
        ret
.endm
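
// pixels16_y2: vertical half-pel interpolation over a 16-pixel-wide block,
// averaging each source row with the row below it.  Two output rows are
// produced per loop iteration; the final pair is emitted after the loop.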
.macro  pixels16_y2     rnd=1, avg=0
        sub             w3, w3, #2
        ld1             {v0.16B}, [x1], x2
        ld1             {v1.16B}, [x1], x2
1:      subs            w3, w3, #2
        avg             v2.16B, v0.16B, v1.16B
        ld1             {v0.16B}, [x1], x2
        avg             v3.16B, v0.16B, v1.16B
        ld1             {v1.16B}, [x1], x2
  .if \avg
        ld1             {v4.16B}, [x0], x2
        ld1             {v5.16B}, [x0]
        urhadd          v2.16B, v2.16B, v4.16B
        urhadd          v3.16B, v3.16B, v5.16B
        sub             x0, x0, x2
  .endif
        st1             {v2.16B}, [x0], x2
        st1             {v3.16B}, [x0], x2
        b.ne            1b

        avg             v2.16B, v0.16B, v1.16B
        ld1             {v0.16B}, [x1], x2
        avg             v3.16B, v0.16B, v1.16B
  .if \avg
        ld1             {v4.16B}, [x0], x2
        ld1             {v5.16B}, [x0]
        urhadd          v2.16B, v2.16B, v4.16B
        urhadd          v3.16B, v3.16B, v5.16B
        sub             x0, x0, x2
  .endif
        st1             {v2.16B}, [x0], x2
        st1             {v3.16B}, [x0], x2

        ret
.endm
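
// pixels16_xy2: half-pel interpolation in both directions.  Sums of two
// horizontally adjacent pixels are widened to 16 bits and kept across rows
// (v16/v18/v20/v22), the sums of two consecutive rows are added, and the
// result is narrowed back with a shift by 2 via mshrn/mshrn2.  NRND-prefixed
// instructions are emitted only in the _no_rnd variants, where they add the
// +1 bias before the truncating shift.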
.macro  pixels16_xy2    rnd=1, avg=0
        sub             w3, w3, #2
        ld1             {v0.16B, v1.16B}, [x1], x2
        ld1             {v4.16B, v5.16B}, [x1], x2
NRND    movi            v26.8H, #1
        ext             v1.16B, v0.16B, v1.16B, #1
        ext             v5.16B, v4.16B, v5.16B, #1
        uaddl           v16.8H, v0.8B, v1.8B
        uaddl2          v20.8H, v0.16B, v1.16B
        uaddl           v18.8H, v4.8B, v5.8B
        uaddl2          v22.8H, v4.16B, v5.16B
1:      subs            w3, w3, #2
        ld1             {v0.16B, v1.16B}, [x1], x2
        add             v24.8H, v16.8H, v18.8H
NRND    add             v24.8H, v24.8H, v26.8H
        ext             v30.16B, v0.16B, v1.16B, #1
        add             v1.8H, v20.8H, v22.8H
        mshrn           v28.8B, v24.8H, #2
NRND    add             v1.8H, v1.8H, v26.8H
        mshrn2          v28.16B, v1.8H, #2
  .if \avg
        ld1             {v16.16B}, [x0]
        urhadd          v28.16B, v28.16B, v16.16B
  .endif
        uaddl           v16.8H, v0.8B, v30.8B
        ld1             {v2.16B, v3.16B}, [x1], x2
        uaddl2          v20.8H, v0.16B, v30.16B
        st1             {v28.16B}, [x0], x2
        add             v24.8H, v16.8H, v18.8H
NRND    add             v24.8H, v24.8H, v26.8H
        ext             v3.16B, v2.16B, v3.16B, #1
        add             v0.8H, v20.8H, v22.8H
        mshrn           v30.8B, v24.8H, #2
NRND    add             v0.8H, v0.8H, v26.8H
        mshrn2          v30.16B, v0.8H, #2
  .if \avg
        ld1             {v18.16B}, [x0]
        urhadd          v30.16B, v30.16B, v18.16B
  .endif
        uaddl           v18.8H, v2.8B, v3.8B
        uaddl2          v22.8H, v2.16B, v3.16B
        st1             {v30.16B}, [x0], x2
        b.gt            1b

        ld1             {v0.16B, v1.16B}, [x1], x2
        add             v24.8H, v16.8H, v18.8H
NRND    add             v24.8H, v24.8H, v26.8H
        ext             v30.16B, v0.16B, v1.16B, #1
        add             v1.8H, v20.8H, v22.8H
        mshrn           v28.8B, v24.8H, #2
NRND    add             v1.8H, v1.8H, v26.8H
        mshrn2          v28.16B, v1.8H, #2
  .if \avg
        ld1             {v16.16B}, [x0]
        urhadd          v28.16B, v28.16B, v16.16B
  .endif
        uaddl           v16.8H, v0.8B, v30.8B
        uaddl2          v20.8H, v0.16B, v30.16B
        st1             {v28.16B}, [x0], x2
        add             v24.8H, v16.8H, v18.8H
NRND    add             v24.8H, v24.8H, v26.8H
        add             v0.8H, v20.8H, v22.8H
        mshrn           v30.8B, v24.8H, #2
NRND    add             v0.8H, v0.8H, v26.8H
        mshrn2          v30.16B, v0.8H, #2
  .if \avg
        ld1             {v18.16B}, [x0]
        urhadd          v30.16B, v30.16B, v18.16B
  .endif
        st1             {v30.16B}, [x0], x2

        ret
.endm
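
// pixels8: 8-pixel-wide variant of pixels16 above; with avg=1 the destination
// rows are loaded, averaged with the source rows and stored back.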
.macro  pixels8         rnd=1, avg=0
1:      ld1             {v0.8B}, [x1], x2
        ld1             {v1.8B}, [x1], x2
        ld1             {v2.8B}, [x1], x2
        ld1             {v3.8B}, [x1], x2
  .if \avg
        ld1             {v4.8B}, [x0], x2
        urhadd          v0.8B, v0.8B, v4.8B
        ld1             {v5.8B}, [x0], x2
        urhadd          v1.8B, v1.8B, v5.8B
        ld1             {v6.8B}, [x0], x2
        urhadd          v2.8B, v2.8B, v6.8B
        ld1             {v7.8B}, [x0], x2
        urhadd          v3.8B, v3.8B, v7.8B
        sub             x0, x0, x2, lsl #2
  .endif
        subs            w3, w3, #4
        st1             {v0.8B}, [x0], x2
        st1             {v1.8B}, [x0], x2
        st1             {v2.8B}, [x0], x2
        st1             {v3.8B}, [x0], x2
        b.ne            1b
        ret
.endm
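
// pixels8_x2: 8-pixel-wide horizontal half-pel interpolation.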
.macro  pixels8_x2      rnd=1, avg=0
1:      ld1             {v0.8B, v1.8B}, [x1], x2
        ext             v1.8B, v0.8B, v1.8B, #1
        ld1             {v2.8B, v3.8B}, [x1], x2
        ext             v3.8B, v2.8B, v3.8B, #1
        subs            w3, w3, #2
        avg             v0.8B, v0.8B, v1.8B
        avg             v2.8B, v2.8B, v3.8B
  .if \avg
        ld1             {v4.8B}, [x0], x2
        ld1             {v5.8B}, [x0]
        urhadd          v0.8B, v0.8B, v4.8B
        urhadd          v2.8B, v2.8B, v5.8B
        sub             x0, x0, x2
  .endif
        st1             {v0.8B}, [x0], x2
        st1             {v2.8B}, [x0], x2
        b.ne            1b
        ret
.endm
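
// pixels8_y2: 8-pixel-wide vertical half-pel interpolation; two rows per
// loop iteration, with the final pair handled after the loop.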
.macro  pixels8_y2      rnd=1, avg=0
        sub             w3, w3, #2
        ld1             {v0.8B}, [x1], x2
        ld1             {v1.8B}, [x1], x2
1:      subs            w3, w3, #2
        avg             v4.8B, v0.8B, v1.8B
        ld1             {v0.8B}, [x1], x2
        avg             v5.8B, v0.8B, v1.8B
        ld1             {v1.8B}, [x1], x2
  .if \avg
        ld1             {v2.8B}, [x0], x2
        ld1             {v3.8B}, [x0]
        urhadd          v4.8B, v4.8B, v2.8B
        urhadd          v5.8B, v5.8B, v3.8B
        sub             x0, x0, x2
  .endif
        st1             {v4.8B}, [x0], x2
        st1             {v5.8B}, [x0], x2
        b.ne            1b

        avg             v4.8B, v0.8B, v1.8B
        ld1             {v0.8B}, [x1], x2
        avg             v5.8B, v0.8B, v1.8B
  .if \avg
        ld1             {v2.8B}, [x0], x2
        ld1             {v3.8B}, [x0]
        urhadd          v4.8B, v4.8B, v2.8B
        urhadd          v5.8B, v5.8B, v3.8B
        sub             x0, x0, x2
  .endif
        st1             {v4.8B}, [x0], x2
        st1             {v5.8B}, [x0], x2

        ret
.endm
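
// pixels8_xy2: 8-pixel-wide half-pel interpolation in both directions,
// using the same widen/add/narrow scheme as pixels16_xy2.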
.macro  pixels8_xy2     rnd=1, avg=0
        sub             w3, w3, #2
        ld1             {v0.16B}, [x1], x2
        ld1             {v1.16B}, [x1], x2
NRND    movi            v19.8H, #1
        ext             v4.16B, v0.16B, v4.16B, #1
        ext             v6.16B, v1.16B, v6.16B, #1
        uaddl           v16.8H, v0.8B, v4.8B
        uaddl           v17.8H, v1.8B, v6.8B
1:      subs            w3, w3, #2
        ld1             {v0.16B}, [x1], x2
        add             v18.8H, v16.8H, v17.8H
        ext             v4.16B, v0.16B, v4.16B, #1
NRND    add             v18.8H, v18.8H, v19.8H
        uaddl           v16.8H, v0.8B, v4.8B
        mshrn           v5.8B, v18.8H, #2
        ld1             {v1.16B}, [x1], x2
        add             v18.8H, v16.8H, v17.8H
  .if \avg
        ld1             {v7.8B}, [x0]
        urhadd          v5.8B, v5.8B, v7.8B
  .endif
NRND    add             v18.8H, v18.8H, v19.8H
        st1             {v5.8B}, [x0], x2
        mshrn           v7.8B, v18.8H, #2
  .if \avg
        ld1             {v5.8B}, [x0]
        urhadd          v7.8B, v7.8B, v5.8B
  .endif
        ext             v6.16B, v1.16B, v6.16B, #1
        uaddl           v17.8H, v1.8B, v6.8B
        st1             {v7.8B}, [x0], x2
        b.gt            1b

        ld1             {v0.16B}, [x1], x2
        add             v18.8H, v16.8H, v17.8H
        ext             v4.16B, v0.16B, v4.16B, #1
NRND    add             v18.8H, v18.8H, v19.8H
        uaddl           v16.8H, v0.8B, v4.8B
        mshrn           v5.8B, v18.8H, #2
        add             v18.8H, v16.8H, v17.8H
  .if \avg
        ld1             {v7.8B}, [x0]
        urhadd          v5.8B, v5.8B, v7.8B
  .endif
NRND    add             v18.8H, v18.8H, v19.8H
        st1             {v5.8B}, [x0], x2
        mshrn           v7.8B, v18.8H, #2
  .if \avg
        ld1             {v5.8B}, [x0]
        urhadd          v7.8B, v7.8B, v5.8B
  .endif
        st1             {v7.8B}, [x0], x2

        ret
.endm
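
// pixfunc: instantiate one of the block macros above as an exported function.
// Depending on rnd it defines the avg, mshrn, mshrn2 and NRND helper macros
// in their rounding or non-rounding forms, expands the body, then purges the
// helpers again.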
.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg rd, rn, rm
        urhadd          \rd, \rn, \rm
    .endm
    .macro mshrn rd, rn, rm
        rshrn           \rd, \rn, \rm
    .endm
    .macro mshrn2 rd, rn, rm
        rshrn2          \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
    .endm
  .else
    .macro avg rd, rn, rm
        uhadd           \rd, \rn, \rm
    .endm
    .macro mshrn rd, rn, rm
        shrn            \rd, \rn, \rm
    .endm
    .macro mshrn2 rd, rn, rm
        shrn2           \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
        \insn
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
        .purgem         avg
        .purgem         mshrn
        .purgem         mshrn2
        .purgem         NRND
.endm
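
// pixfunc2: emit both the rounding and the _no_rnd variant of a function.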
.macro  pixfunc2        pfx, name, avg=0
        pixfunc         \pfx, \name, rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm
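
// The h264 qpel16/qpel8 mc00 cases are plain block copies (or averages), so
// they only set the row count and fall through into the put/avg pixels
// functions instantiated directly below them.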
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             w3, #16
endfunc

        pixfunc         put_, pixels16, avg=0
        pixfunc2        put_, pixels16_x2, avg=0
        pixfunc2        put_, pixels16_y2, avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             w3, #16
endfunc

        pixfunc         avg_, pixels16, avg=1
        pixfunc2        avg_, pixels16_x2, avg=1
        pixfunc2        avg_, pixels16_y2, avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             w3, #8
endfunc

        pixfunc         put_, pixels8, avg=0
        pixfunc2        put_, pixels8_x2, avg=0
        pixfunc2        put_, pixels8_y2, avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             w3, #8
endfunc

        pixfunc         avg_, pixels8, avg=1
        pixfunc         avg_, pixels8_x2, avg=1
        pixfunc         avg_, pixels8_y2, avg=1
        pixfunc         avg_, pixels8_xy2, avg=1