Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libavcodec / aarch64 / h264dsp_neon.S
CommitLineData
2ba45a60
DM
1/*
2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22#include "libavutil/aarch64/asm.S"
23#include "neon.S"
24
// h264_loop_filter_start: common prologue expanded at the top of each
// loop-filter function in this file.
//
// Reads:   w2 and w3 (labelled alpha and beta by the filter macros below),
//          x4 (pointer; one 32-bit word, i.e. four bytes, is loaded from it).
// Writes:  w6, lane S[0] of v24 (the loaded word), NZCV.
//
// The enclosing function returns immediately (label 1) when
//   - w2 != 0 and w3 == 0, or
//   - bit 7 is set in every one of the four bytes loaded from [x4].
// Otherwise execution continues after label 2.
//
// NOTE(review): when w2 == 0 the ccmp condition (ne) is false, so NZCV is
// loaded with the immediate #0 (Z clear) and the b.eq below is NOT taken.
// A zero alpha therefore does not take the early exit; confirm whether an
// NZCV immediate of #4 was intended.
25.macro h264_loop_filter_start
26 cmp w2, #0 // Z set iff w2 == 0
27 ldr w6, [x4] // w6 = the four bytes at [x4]
28 ccmp w3, #0, #0, ne // w2 != 0: flags = cmp(w3, 0); w2 == 0: NZCV = 0000
29 mov v24.S[0], w6 // keep the loaded word for the filter macros
30 and w6, w6, w6, lsl #16 // plain and: the ccmp flags stay live for b.eq
31 b.eq 1f // taken when w2 != 0 and w3 == 0
32 ands w6, w6, w6, lsl #8 // bit 31 = AND of bit 7 of all four loaded bytes
33 b.ge 2f // N clear: at least one byte has bit 7 clear, go on
341:
35 ret // early return from the function that expanded this macro
362:
37.endm
38
// h264_loop_filter_luma: filters 16 pixel positions across one edge.
//
// Inputs (one byte lane per position):
//   v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2
//   v24.S[0] = four bytes (placed there by h264_loop_filter_start); each byte
//              is replicated over four consecutive lanes and used as the
//              per-lane clipping value, called tc0 in the comments below
//   w2 = alpha, w3 = beta (low byte of each is broadcast)
// Outputs:
//   v17 = filtered p1, v16 = filtered p0, v0 = filtered q0, v19 = filtered q1
// Clobbers: v4, v20, v21, v22, v23, v24, v28, v30.
//
// The instruction order interleaves independent computations; several
// registers are reused as soon as their previous value is dead, so the
// order must not be changed casually.
39.macro h264_loop_filter_luma
40 dup v22.16B, w2 // alpha
41 uxtl v24.8H, v24.8B // tc0 bytes -> halfwords
42 uabd v21.16B, v16.16B, v0.16B // abs(p0 - q0)
43 uxtl v24.4S, v24.4H // tc0 halfwords -> words
44 uabd v28.16B, v18.16B, v16.16B // abs(p1 - p0)
45 sli v24.8H, v24.8H, #8 // copy low byte of each halfword into its high byte
46 uabd v30.16B, v2.16B, v0.16B // abs(q1 - q0)
47 sli v24.4S, v24.4S, #16 // each word now holds its tc0 byte four times
48 cmhi v21.16B, v22.16B, v21.16B // < alpha
49 dup v22.16B, w3 // beta
50 cmlt v23.16B, v24.16B, #0 // lanes whose tc0 is negative
51 cmhi v28.16B, v22.16B, v28.16B // < beta
52 cmhi v30.16B, v22.16B, v30.16B // < beta
53 bic v21.16B, v21.16B, v23.16B // drop lanes with negative tc0 from the mask
54 uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0)
55 and v21.16B, v21.16B, v28.16B
56 uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0)
57 cmhi v17.16B, v22.16B, v17.16B // < beta
58 and v21.16B, v21.16B, v30.16B // v21 = lanes that get filtered at all
59 cmhi v19.16B, v22.16B, v19.16B // < beta
60 and v17.16B, v17.16B, v21.16B // v17 = lanes where p1 is also modified
61 and v19.16B, v19.16B, v21.16B // v19 = lanes where q1 is also modified
62 and v24.16B, v24.16B, v21.16B // tc0 forced to 0 in unfiltered lanes
63 urhadd v28.16B, v16.16B, v0.16B // (p0 + q0 + 1) >> 1
64 sub v21.16B, v24.16B, v17.16B // mask lanes are -1, so this adds 1 per set lane
65 uqadd v23.16B, v18.16B, v24.16B // p1 + tc0 (saturating)
66 uhadd v20.16B, v20.16B, v28.16B // (p2 + avg) >> 1
67 sub v21.16B, v21.16B, v19.16B // v21 = tc0 + (p1 lane set) + (q1 lane set)
68 uhadd v28.16B, v4.16B, v28.16B // (q2 + avg) >> 1
69 umin v23.16B, v23.16B, v20.16B
70 uqsub v22.16B, v18.16B, v24.16B // p1 - tc0 (saturating)
71 uqadd v4.16B, v2.16B, v24.16B // q1 + tc0 (saturating); q2 in v4 is dead here
72 umax v23.16B, v23.16B, v22.16B // v23 = p1 candidate, within [p1 - tc0, p1 + tc0]
73 uqsub v22.16B, v2.16B, v24.16B // q1 - tc0 (saturating)
74 umin v28.16B, v4.16B, v28.16B
75 uxtl v4.8H, v0.8B // start of delta: widen q0 (low half)
76 umax v28.16B, v28.16B, v22.16B // v28 = q1 candidate, within [q1 - tc0, q1 + tc0]
77 uxtl2 v20.8H, v0.16B // widen q0 (high half)
78 usubw v4.8H, v4.8H, v16.8B // q0 - p0
79 usubw2 v20.8H, v20.8H, v16.16B
80 shl v4.8H, v4.8H, #2 // (q0 - p0) * 4
81 shl v20.8H, v20.8H, #2
82 uaddw v4.8H, v4.8H, v18.8B // + p1
83 uaddw2 v20.8H, v20.8H, v18.16B
84 usubw v4.8H, v4.8H, v2.8B // - q1
85 usubw2 v20.8H, v20.8H, v2.16B
86 rshrn v4.8B, v4.8H, #3 // rounding >> 3, narrowed to bytes
87 rshrn2 v4.16B, v20.8H, #3
88 bsl v17.16B, v23.16B, v18.16B // v17 = mask ? p1 candidate : original p1
89 bsl v19.16B, v28.16B, v2.16B // v19 = mask ? q1 candidate : original q1
90 neg v23.16B, v21.16B // lower clamp bound
91 uxtl v28.8H, v16.8B // widen p0 (low half)
92 smin v4.16B, v4.16B, v21.16B // clamp delta from above (last use of v21 as bound)
93 uxtl2 v21.8H, v16.16B // widen p0 (high half); v21 is reused here
94 smax v4.16B, v4.16B, v23.16B // clamp delta from below
95 uxtl v22.8H, v0.8B // widen q0 (low half)
96 uxtl2 v24.8H, v0.16B // widen q0 (high half)
97 saddw v28.8H, v28.8H, v4.8B // p0 + delta
98 saddw2 v21.8H, v21.8H, v4.16B
99 ssubw v22.8H, v22.8H, v4.8B // q0 - delta
100 ssubw2 v24.8H, v24.8H, v4.16B
101 sqxtun v16.8B, v28.8H // saturate to unsigned bytes: new p0
102 sqxtun2 v16.16B, v21.8H
103 sqxtun v0.8B, v22.8H // saturate to unsigned bytes: new q0
104 sqxtun2 v0.16B, v24.8H
105.endm
106
/*
 * ff_h264_v_loop_filter_luma_neon
 *
 * Filters across a horizontal edge, 16 bytes wide. Three rows starting at x0
 * and the three rows before it are loaded; the two rows on either side of
 * the edge are written back.
 *
 *   x0  pointer to the first row on the q side of the edge
 *   w1  row stride in bytes (sign-extended to 64 bits here)
 *   w2  alpha, w3 beta (consumed by the macros)
 *   x4  pointer to the four bytes read by h264_loop_filter_start
 */
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start                  // may return early
        sxtw            x1,  w1                 // widen the 32-bit stride

        // Rows at and after the edge: q0, q1, q2.
        ld1             {v0.16b},  [x0], x1
        ld1             {v2.16b},  [x0], x1
        ld1             {v4.16b},  [x0], x1
        // x0 is three rows past the edge; step back six rows in total to p2.
        sub             x0,  x0,  x1,  lsl #1
        sub             x0,  x0,  x1,  lsl #2
        // Rows before the edge: p2, p1, p0.
        ld1             {v20.16b}, [x0], x1
        ld1             {v18.16b}, [x0], x1
        ld1             {v16.16b}, [x0], x1

        h264_loop_filter_luma

        // x0 is back on the q0 row; rewind two rows and store p1, p0, q0, q1.
        sub             x0,  x0,  x1,  lsl #1
        st1             {v17.16b}, [x0], x1
        st1             {v16.16b}, [x0], x1
        st1             {v0.16b},  [x0], x1
        st1             {v19.16b}, [x0]

        ret
endfunc
130
/*
 * ff_h264_h_loop_filter_luma_neon
 *
 * Filters across a vertical edge over 16 rows. Eight bytes per row are loaded
 * starting four bytes before x0, transposed so that each vector holds one
 * pixel column, filtered, transposed back, and the four middle bytes of every
 * row are written back.
 *
 *   x0  pointer to the first byte on the q side of the edge, first row
 *   w1  row stride in bytes
 *   w2  alpha, w3 beta (consumed by the macros)
 *   x4  pointer to the four bytes read by h264_loop_filter_start
 *
 * The stride is sign-extended before x1 is used as a 64-bit post-index, the
 * same way ff_h264_v_loop_filter_luma_neon does it. For a 32-bit argument the
 * upper half of the register is not guaranteed to be defined, so using x1
 * directly could address the wrong rows.
 * The transpose macros come from neon.S (not part of this file).
 */
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start                  // may return early
        sxtw            x1,  w1                 // widen the 32-bit stride

        sub             x0,  x0,  #4            // start four pixels before the edge
        // Rows 0-7 go into the low halves ...
        ld1             {v6.8B},  [x0], x1
        ld1             {v20.8B}, [x0], x1
        ld1             {v18.8B}, [x0], x1
        ld1             {v16.8B}, [x0], x1
        ld1             {v0.8B},  [x0], x1
        ld1             {v2.8B},  [x0], x1
        ld1             {v4.8B},  [x0], x1
        ld1             {v26.8B}, [x0], x1
        // ... rows 8-15 into the high halves of the same registers.
        ld1             {v6.D}[1],  [x0], x1
        ld1             {v20.D}[1], [x0], x1
        ld1             {v18.D}[1], [x0], x1
        ld1             {v16.D}[1], [x0], x1
        ld1             {v0.D}[1],  [x0], x1
        ld1             {v2.D}[1],  [x0], x1
        ld1             {v4.D}[1],  [x0], x1
        ld1             {v26.D}[1], [x0], x1

        // Rows -> columns; v20/v18/v16/v0/v2/v4 are what the filter macro reads.
        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

        h264_loop_filter_luma

        // Filtered p1, p0, q0, q1 columns -> four bytes per row.
        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27

        sub             x0,  x0,  x1,  lsl #4   // back to row 0 (16 rows were loaded)
        add             x0,  x0,  #2            // -4 + 2: two pixels before the edge
        st1             {v17.S}[0], [x0], x1
        st1             {v16.S}[0], [x0], x1
        st1             {v0.S}[0],  [x0], x1
        st1             {v19.S}[0], [x0], x1
        st1             {v17.S}[1], [x0], x1
        st1             {v16.S}[1], [x0], x1
        st1             {v0.S}[1],  [x0], x1
        st1             {v19.S}[1], [x0], x1
        st1             {v17.S}[2], [x0], x1
        st1             {v16.S}[2], [x0], x1
        st1             {v0.S}[2],  [x0], x1
        st1             {v19.S}[2], [x0], x1
        st1             {v17.S}[3], [x0], x1
        st1             {v16.S}[3], [x0], x1
        st1             {v0.S}[3],  [x0], x1
        st1             {v19.S}[3], [x0], x1

        ret
endfunc
179
// h264_loop_filter_chroma: filters 8 pixel positions across one edge.
//
// Inputs (one byte lane per position, low 64 bits of each register):
//   v18 = p1, v16 = p0, v0 = q0, v2 = q1
//   v24.S[0] = four bytes (placed there by h264_loop_filter_start); each byte
//              is replicated over two consecutive lanes and used as the
//              clamp bound for the delta, called tc0 in the comments below
//   w2 = alpha, w3 beta (low byte of each is broadcast)
// Outputs:
//   v16 = filtered p0, v0 = filtered q0 (p1 and q1 are only read)
// Clobbers: v4, v22, v24, v25, v26, v28, v30.
//
// Mask computation and delta computation are interleaved line by line.
180.macro h264_loop_filter_chroma
181 dup v22.8B, w2 // alpha
182 uxtl v24.8H, v24.8B // tc0 bytes -> halfwords
183 uabd v26.8B, v16.8B, v0.8B // abs(p0 - q0)
184 uxtl v4.8H, v0.8B // start of delta: widen q0
185 uabd v28.8B, v18.8B, v16.8B // abs(p1 - p0)
186 usubw v4.8H, v4.8H, v16.8B // q0 - p0
187 sli v24.8H, v24.8H, #8 // each halfword now holds its tc0 byte twice
188 shl v4.8H, v4.8H, #2 // (q0 - p0) * 4
189 uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0)
190 uaddw v4.8H, v4.8H, v18.8B // + p1
191 cmhi v26.8B, v22.8B, v26.8B // < alpha
192 usubw v4.8H, v4.8H, v2.8B // - q1
193 dup v22.8B, w3 // beta
194 rshrn v4.8B, v4.8H, #3 // rounding >> 3, narrowed to bytes
195 cmhi v28.8B, v22.8B, v28.8B // < beta
196 cmhi v30.8B, v22.8B, v30.8B // < beta
197 smin v4.8B, v4.8B, v24.8B // clamp delta from above with tc0
198 neg v25.8B, v24.8B // -tc0
199 and v26.8B, v26.8B, v28.8B
200 smax v4.8B, v4.8B, v25.8B // clamp delta from below with -tc0
201 and v26.8B, v26.8B, v30.8B // v26 = lanes passing all three threshold tests
202 uxtl v22.8H, v0.8B // widen q0
203 and v4.8B, v4.8B, v26.8B // delta = 0 in lanes that fail the tests
204 uxtl v28.8H, v16.8B // widen p0
205 saddw v28.8H, v28.8H, v4.8B // p0 + delta
206 ssubw v22.8H, v22.8H, v4.8B // q0 - delta
207 sqxtun v16.8B, v28.8H // saturate to unsigned bytes: new p0
208 sqxtun v0.8B, v22.8H // saturate to unsigned bytes: new q0
209.endm
210
/*
 * ff_h264_v_loop_filter_chroma_neon
 *
 * Filters across a horizontal edge, 8 bytes wide. The two rows before x0 and
 * the two rows starting at x0 are loaded; only the two rows adjacent to the
 * edge are written back.
 *
 *   x0  pointer to the first row on the q side of the edge
 *   w1  row stride in bytes
 *   w2  alpha, w3 beta (consumed by the macros)
 *   x4  pointer to the four bytes read by h264_loop_filter_start
 *
 * The stride is sign-extended before x1 is used in 64-bit address arithmetic,
 * the same way ff_h264_v_loop_filter_luma_neon does it. For a 32-bit argument
 * the upper half of the register is not guaranteed to be defined.
 */
function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start                  // may return early
        sxtw            x1,  w1                 // widen the 32-bit stride

        sub             x0,  x0,  x1,  lsl #1   // two rows before the edge: p1
        ld1             {v18.8B}, [x0], x1      // p1
        ld1             {v16.8B}, [x0], x1      // p0
        ld1             {v0.8B},  [x0], x1      // q0
        ld1             {v2.8B},  [x0]          // q1 (no post-increment)

        h264_loop_filter_chroma

        // x0 sits on the q1 row; go back two rows to p0.
        sub             x0,  x0,  x1,  lsl #1
        st1             {v16.8B}, [x0], x1      // new p0
        st1             {v0.8B},  [x0], x1      // new q0

        ret
endfunc
228
/*
 * ff_h264_h_loop_filter_chroma_neon
 *
 * Filters across a vertical edge over 8 rows. Four bytes per row are loaded
 * starting two bytes before x0, transposed so that each vector holds one
 * pixel column, filtered, transposed back and stored to the same locations.
 *
 *   x0  pointer to the first byte on the q side of the edge, first row
 *   w1  row stride in bytes
 *   w2  alpha, w3 beta (consumed by the macros)
 *   x4  pointer to the four bytes read by h264_loop_filter_start
 *
 * The stride is sign-extended before x1 is used as a 64-bit post-index, the
 * same way ff_h264_v_loop_filter_luma_neon does it. For a 32-bit argument the
 * upper half of the register is not guaranteed to be defined.
 * transpose_4x8B comes from neon.S (not part of this file).
 */
function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start                  // may return early
        sxtw            x1,  w1                 // widen the 32-bit stride

        sub             x0,  x0,  #2            // start two pixels before the edge
        // Rows 0-3 into lane S[0], rows 4-7 into lane S[1].
        ld1             {v18.S}[0], [x0], x1
        ld1             {v16.S}[0], [x0], x1
        ld1             {v0.S}[0],  [x0], x1
        ld1             {v2.S}[0],  [x0], x1
        ld1             {v18.S}[1], [x0], x1
        ld1             {v16.S}[1], [x0], x1
        ld1             {v0.S}[1],  [x0], x1
        ld1             {v2.S}[1],  [x0], x1

        // Rows -> columns: v18 = p1, v16 = p0, v0 = q0, v2 = q1.
        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma

        // Columns -> rows again (p1 and q1 pass through unchanged).
        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        sub             x0,  x0,  x1,  lsl #3   // back to row 0 (8 rows were loaded)
        st1             {v18.S}[0], [x0], x1
        st1             {v16.S}[0], [x0], x1
        st1             {v0.S}[0],  [x0], x1
        st1             {v2.S}[0],  [x0], x1
        st1             {v18.S}[1], [x0], x1
        st1             {v16.S}[1], [x0], x1
        st1             {v0.S}[1],  [x0], x1
        st1             {v2.S}[1],  [x0], x1

        ret
endfunc
260
// biweight_16 macs, macd: bi-directional weighting loop, 16 bytes per row,
// two rows per iteration. Ends in ret, so it never falls through.
//
// Macro arguments: the widening multiply-accumulate mnemonic applied to the
// x1 rows with the w6 factor (macs) and to the x0 rows with the w5 factor
// (macd); the expanding code passes umlal or umlsl.
//
// Expects from the expanding code:
//   x0, x1  the two source row pointers, x7 = destination pointer
//   x2      row stride (64-bit), used for all three pointers
//   w3      row count; the loop ends only when it reaches exactly 0, so it is
//           expected to be a positive multiple of 2
//   w5, w6  factors; only the low byte of each is broadcast
//   v16     per-lane 16-bit accumulator start value
//   v18     per-lane 16-bit shift count for sshl (negative = shift right)
// Clobbers: v0, v1, v4, v6, v20, v22, v24, v26, v28, v30, w3, x0, x1, x7.
261.macro biweight_16 macs, macd
262 dup v0.16B, w5
263 dup v1.16B, w6
264 mov v4.16B, v16.16B // accumulators for the first row start at v16
265 mov v6.16B, v16.16B
2661: subs w3, w3, #2 // flags stay live until the b.ne at the bottom
267 ld1 {v20.16B}, [x0], x2
268 \macd v4.8H, v0.8B, v20.8B
269 \macd\()2 v6.8H, v0.16B, v20.16B
270 ld1 {v22.16B}, [x1], x2
271 \macs v4.8H, v1.8B, v22.8B
272 \macs\()2 v6.8H, v1.16B, v22.16B
273 mov v24.16B, v16.16B // accumulators for the second row
274 ld1 {v28.16B}, [x0], x2
275 mov v26.16B, v16.16B
276 \macd v24.8H, v0.8B, v28.8B
277 \macd\()2 v26.8H, v0.16B, v28.16B
278 ld1 {v30.16B}, [x1], x2
279 \macs v24.8H, v1.8B, v30.8B
280 \macs\()2 v26.8H, v1.16B, v30.16B
281 sshl v4.8H, v4.8H, v18.8H
282 sshl v6.8H, v6.8H, v18.8H
283 sqxtun v4.8B, v4.8H // saturate to unsigned bytes
284 sqxtun2 v4.16B, v6.8H
285 sshl v24.8H, v24.8H, v18.8H
286 sshl v26.8H, v26.8H, v18.8H
287 sqxtun v24.8B, v24.8H
288 sqxtun2 v24.16B, v26.8H
289 mov v6.16B, v16.16B // re-arm the first-row accumulators for the next pass
290 st1 {v4.16B}, [x7], x2
291 mov v4.16B, v16.16B
292 st1 {v24.16B}, [x7], x2
293 b.ne 1b
294 ret
295.endm
296
// biweight_8 macs, macd: bi-directional weighting loop, 8 bytes per row,
// two rows per iteration. Ends in ret, so it never falls through.
//
// Macro arguments: the widening multiply-accumulate mnemonic applied to the
// x1 rows with the w6 factor (macs) and to the x0 rows with the w5 factor
// (macd); the expanding code passes umlal or umlsl.
//
// Expects from the expanding code:
//   x0, x1  the two source row pointers, x7 = destination pointer
//   x2      row stride (64-bit), used for all three pointers
//   w3      row count; the loop ends only when it reaches exactly 0, so it is
//           expected to be a positive multiple of 2
//   w5, w6  factors; only the low byte of each is broadcast
//   v16     per-lane 16-bit accumulator start value
//   v18     per-lane 16-bit shift count for sshl (negative = shift right)
// Clobbers: v0, v1, v2, v4, v5, v6, v7, v20, w3, x0, x1, x7.
297.macro biweight_8 macs, macd
298 dup v0.8B, w5
299 dup v1.8B, w6
300 mov v2.16B, v16.16B // accumulator for the first row
301 mov v20.16B, v16.16B // accumulator for the second row
3021: subs w3, w3, #2 // flags stay live until the b.ne at the bottom
303 ld1 {v4.8B}, [x0], x2
304 \macd v2.8H, v0.8B, v4.8B
305 ld1 {v5.8B}, [x1], x2
306 \macs v2.8H, v1.8B, v5.8B
307 ld1 {v6.8B}, [x0], x2
308 \macd v20.8H, v0.8B, v6.8B
309 ld1 {v7.8B}, [x1], x2
310 \macs v20.8H, v1.8B, v7.8B
311 sshl v2.8H, v2.8H, v18.8H
312 sqxtun v2.8B, v2.8H // saturate to unsigned bytes
313 sshl v20.8H, v20.8H, v18.8H
314 sqxtun v4.8B, v20.8H // second row result goes to v4
315 mov v20.16B, v16.16B // re-arm the accumulators for the next pass
316 st1 {v2.8B}, [x7], x2
317 mov v2.16B, v16.16B
318 st1 {v4.8B}, [x7], x2
319 b.ne 1b
320 ret
321.endm
322
// biweight_4 macs, macd: bi-directional weighting loop, 4 bytes per row.
// Two rows are packed into one 64-bit vector, so a full iteration handles
// four rows. Both exits end in ret; the macro never falls through.
//
// Macro arguments: the widening multiply-accumulate mnemonic applied to the
// x1 rows with the w6 factor (macs) and to the x0 rows with the w5 factor
// (macd); the expanding code passes umlal or umlsl.
//
// Expects from the expanding code:
//   x0, x1  the two source row pointers, x7 = destination pointer
//   x2      row stride (64-bit), used for all three pointers
//   w3      row count; decremented by 4 per pass. If it goes negative the
//           first row pair is finished at label 2 and the macro returns;
//           otherwise the loop repeats until it reaches exactly 0
//   w5, w6  factors; only the low byte of each is broadcast
//   v16     per-lane 16-bit accumulator start value
//   v18     per-lane 16-bit shift count for sshl (negative = shift right)
// Clobbers: v0, v1, v2, v4, v5, v6, v7, v20, w3, x0, x1, x7.
323.macro biweight_4 macs, macd
324 dup v0.8B, w5
325 dup v1.8B, w6
326 mov v2.16B, v16.16B // accumulator for rows 0-1
327 mov v20.16B,v16.16B // accumulator for rows 2-3
3281: subs w3, w3, #4 // flags are consumed by both b.lt and b.ne below
329 ld1 {v4.S}[0], [x0], x2
330 ld1 {v4.S}[1], [x0], x2
331 \macd v2.8H, v0.8B, v4.8B
332 ld1 {v5.S}[0], [x1], x2
333 ld1 {v5.S}[1], [x1], x2
334 \macs v2.8H, v1.8B, v5.8B
335 b.lt 2f // fewer than four rows were left: finish rows 0-1 only
336 ld1 {v6.S}[0], [x0], x2
337 ld1 {v6.S}[1], [x0], x2
338 \macd v20.8H, v0.8B, v6.8B
339 ld1 {v7.S}[0], [x1], x2
340 ld1 {v7.S}[1], [x1], x2
341 \macs v20.8H, v1.8B, v7.8B
342 sshl v2.8H, v2.8H, v18.8H
343 sqxtun v2.8B, v2.8H // saturate to unsigned bytes
344 sshl v20.8H, v20.8H, v18.8H
345 sqxtun v4.8B, v20.8H
346 mov v20.16B, v16.16B // re-arm the accumulators for the next pass
347 st1 {v2.S}[0], [x7], x2
348 st1 {v2.S}[1], [x7], x2
349 mov v2.16B, v16.16B
350 st1 {v4.S}[0], [x7], x2
351 st1 {v4.S}[1], [x7], x2
352 b.ne 1b
353 ret
3542: sshl v2.8H, v2.8H, v18.8H // tail: only the first row pair
355 sqxtun v2.8B, v2.8H
356 st1 {v2.S}[0], [x7], x2
357 st1 {v2.S}[1], [x7], x2
358 ret
359.endm
360
// biweight_func w: defines the exported function
// ff_biweight_h264_pixels_<w>_neon and expands biweight_<w> four times, once
// per sign combination of the two factors in w5 and w6. Every expansion ends
// in ret, so labels 10/20/30/40 never fall through into each other.
//
// Register use visible here:
//   x0  first source pointer, copied to x7 as the store pointer
//   x1  second source pointer (used by biweight_<w>)
//   w2  row stride, sign-extended to x2
//   w3  row count (used by biweight_<w>)
//   w4  shift amount: v18 = ~w4 per 16-bit lane, i.e. -(w4 + 1)
//   w5, w6  factors; made non-negative below before being used as unsigned
//           byte multipliers, with the sign moved into the umlal/umlsl choice
//   w7  offset term: v16 = ((w7 + 1) | 1) << w4 per 16-bit lane
// NOTE(review): the mapping of these registers to C parameter names is not
// visible in this file; confirm against the prototype.
//
// Selector: w8 = (w5 >> 31) ^ (w6 >> 30), logical shifts. Assuming
// |w6| < 2^30 this gives 0: both >= 0, 1: only w5 < 0, 2: both < 0,
// 3: only w6 < 0.
361.macro biweight_func w
362function ff_biweight_h264_pixels_\w\()_neon, export=1
363 sxtw x2, w2
364 lsr w8, w5, #31 // sign bit of w5
365 add w7, w7, #1
366 eor w8, w8, w6, lsr #30 // fold in the top two bits of w6
367 orr w7, w7, #1
368 dup v18.8H, w4
369 lsl w7, w7, w4
370 not v18.16B, v18.16B // ~w4 = -(w4 + 1): sshl then shifts right by w4 + 1
371 dup v16.8H, w7
372 mov x7, x0 // separate store pointer; x0 keeps advancing as a load pointer
373 cbz w8, 10f
374 subs w8, w8, #1
375 b.eq 20f
376 subs w8, w8, #1
377 b.eq 30f
378 b 40f
37910: biweight_\w umlal, umlal
38020: neg w5, w5
381 biweight_\w umlal, umlsl
38230: neg w5, w5
383 neg w6, w6
384 biweight_\w umlsl, umlsl
38540: neg w6, w6
386 biweight_\w umlsl, umlal
387endfunc
388.endm
389
// Instantiate the function for the three block widths.
390 biweight_func 16
391 biweight_func 8
392 biweight_func 4
393
// weight_16 add: single-source weighting loop, 16 bytes per row, two rows
// per iteration. Ends in ret, so it never falls through.
//
// Macro argument: the mnemonic that combines v16 with the product; the
// expanding code passes add, sub, shadd or shsub. The operand order is
// (v16, product), so the sub forms compute v16 minus the product.
//
// Expects from the expanding code:
//   x0   load pointer, x5 = store pointer
//   x1   row stride (64-bit), used for both pointers
//   w2   row count; the loop ends only when it reaches exactly 0, so it is
//        expected to be a positive multiple of 2
//   w4   factor; only the low byte is broadcast and used as an unsigned
//        multiplier
//   v16  per-lane 16-bit offset term
//   v18  per-lane 16-bit shift count for srshl (negative = rounding right
//        shift)
// Clobbers: v0, v4, v6, v20, v24, v26, v28, w2, x0, x5.
394.macro weight_16 add
395 dup v0.16B, w4
3961: subs w2, w2, #2 // flags stay live until the b.ne at the bottom
397 ld1 {v20.16B}, [x0], x1
398 umull v4.8H, v0.8B, v20.8B
399 umull2 v6.8H, v0.16B, v20.16B
400 ld1 {v28.16B}, [x0], x1
401 umull v24.8H, v0.8B, v28.8B
402 umull2 v26.8H, v0.16B, v28.16B
403 \add v4.8H, v16.8H, v4.8H
404 srshl v4.8H, v4.8H, v18.8H
405 \add v6.8H, v16.8H, v6.8H
406 srshl v6.8H, v6.8H, v18.8H
407 sqxtun v4.8B, v4.8H // saturate to unsigned bytes
408 sqxtun2 v4.16B, v6.8H
409 \add v24.8H, v16.8H, v24.8H
410 srshl v24.8H, v24.8H, v18.8H
411 \add v26.8H, v16.8H, v26.8H
412 srshl v26.8H, v26.8H, v18.8H
413 sqxtun v24.8B, v24.8H
414 sqxtun2 v24.16B, v26.8H
415 st1 {v4.16B}, [x5], x1
416 st1 {v24.16B}, [x5], x1
417 b.ne 1b
418 ret
419.endm
420
// weight_8 add: single-source weighting loop, 8 bytes per row, two rows per
// iteration. Ends in ret, so it never falls through.
//
// Macro argument: the mnemonic that combines v16 with the product; the
// expanding code passes add, sub, shadd or shsub. The operand order is
// (v16, product), so the sub forms compute v16 minus the product.
//
// Expects from the expanding code:
//   x0   load pointer, x5 = store pointer
//   x1   row stride (64-bit), used for both pointers
//   w2   row count; the loop ends only when it reaches exactly 0, so it is
//        expected to be a positive multiple of 2
//   w4   factor; only the low byte is broadcast and used as an unsigned
//        multiplier
//   v16  per-lane 16-bit offset term
//   v18  per-lane 16-bit shift count for srshl (negative = rounding right
//        shift)
// Clobbers: v0, v2, v4, v6, v20, w2, x0, x5.
421.macro weight_8 add
422 dup v0.8B, w4
4231: subs w2, w2, #2 // flags stay live until the b.ne at the bottom
424 ld1 {v4.8B}, [x0], x1
425 umull v2.8H, v0.8B, v4.8B
426 ld1 {v6.8B}, [x0], x1
427 umull v20.8H, v0.8B, v6.8B
428 \add v2.8H, v16.8H, v2.8H
429 srshl v2.8H, v2.8H, v18.8H
430 sqxtun v2.8B, v2.8H // saturate to unsigned bytes
431 \add v20.8H, v16.8H, v20.8H
432 srshl v20.8H, v20.8H, v18.8H
433 sqxtun v4.8B, v20.8H // second row result goes to v4
434 st1 {v2.8B}, [x5], x1
435 st1 {v4.8B}, [x5], x1
436 b.ne 1b
437 ret
438.endm
439
// weight_4 add: single-source weighting loop, 4 bytes per row. Two rows are
// packed into one 64-bit vector, so a full iteration handles four rows.
// Both exits end in ret; the macro never falls through.
//
// Macro argument: the mnemonic that combines v16 with the product; the
// expanding code passes add, sub, shadd or shsub. The operand order is
// (v16, product), so the sub forms compute v16 minus the product.
//
// Expects from the expanding code:
//   x0   load pointer, x5 = store pointer
//   x1   row stride (64-bit), used for both pointers
//   w2   row count; decremented by 4 per pass. If it goes negative the first
//        row pair is finished at label 2 and the macro returns; otherwise the
//        loop repeats until it reaches exactly 0
//   w4   factor; only the low byte is broadcast and used as an unsigned
//        multiplier
//   v16  per-lane 16-bit offset term
//   v18  per-lane 16-bit shift count for srshl (negative = rounding right
//        shift)
// Clobbers: v0, v2, v4, v6, v20, w2, x0, x5.
440.macro weight_4 add
441 dup v0.8B, w4
4421: subs w2, w2, #4 // flags are consumed by both b.lt and b.ne below
443 ld1 {v4.S}[0], [x0], x1
444 ld1 {v4.S}[1], [x0], x1
445 umull v2.8H, v0.8B, v4.8B
446 b.lt 2f // fewer than four rows were left: finish rows 0-1 only
447 ld1 {v6.S}[0], [x0], x1
448 ld1 {v6.S}[1], [x0], x1
449 umull v20.8H, v0.8B, v6.8B
450 \add v2.8H, v16.8H, v2.8H
451 srshl v2.8H, v2.8H, v18.8H
452 sqxtun v2.8B, v2.8H // saturate to unsigned bytes
453 \add v20.8H, v16.8H, v20.8H
454 srshl v20.8H, v20.8h, v18.8H
455 sqxtun v4.8B, v20.8H
456 st1 {v2.S}[0], [x5], x1
457 st1 {v2.S}[1], [x5], x1
458 st1 {v4.S}[0], [x5], x1
459 st1 {v4.S}[1], [x5], x1
460 b.ne 1b
461 ret
4622: \add v2.8H, v16.8H, v2.8H // tail: only the first row pair
463 srshl v2.8H, v2.8H, v18.8H
464 sqxtun v2.8B, v2.8H
465 st1 {v2.S}[0], [x5], x1
466 st1 {v2.S}[1], [x5], x1
467 ret
468.endm
469
// weight_func w: defines the exported function ff_weight_h264_pixels_<w>_neon
// and expands weight_<w> four times. Every expansion ends in ret, so the
// labelled sections never fall through into each other. The two labels named
// 10 are numeric local labels; each b.lt 10f binds to the next one after it.
//
// Register use visible here:
//   x0  pixel pointer, copied to x5 as the store pointer (in-place update)
//   w1  row stride, sign-extended to x1
//   w2  row count (used by weight_<w>)
//   w3  shift amount
//   w4  factor; negated when negative so weight_<w> can use it as an unsigned
//       byte multiplier together with the subtracting mnemonic
//   w5  offset term: v16 = w5 << w3 per 16-bit lane
// NOTE(review): the mapping of these registers to C parameter names is not
// visible in this file; confirm against the prototype.
//
// Two shift strategies:
//   w3 >  1: v18 = 1 - w3 and the halving mnemonics (shadd / shsub) are used,
//            so the halving step supplies one bit of the total shift
//   w3 <= 1: v18 = -w3 and the plain add / sub mnemonics are used
470.macro weight_func w
471function ff_weight_h264_pixels_\w\()_neon, export=1
472 sxtw x1, w1
473 cmp w3, #1 // flags are consumed by the b.le five lines down
474 mov w6, #1
475 lsl w5, w5, w3
476 dup v16.8H, w5
477 mov x5, x0 // separate store pointer; x0 keeps advancing as the load pointer
478 b.le 20f // w3 <= 1 (signed)
479 sub w6, w6, w3 // 1 - w3
480 dup v18.8H, w6
481 cmp w4, #0
482 b.lt 10f
483 weight_\w shadd
48410: neg w4, w4
485 weight_\w shsub
48620: neg w6, w3 // -w3
487 dup v18.8H, w6
488 cmp w4, #0
489 b.lt 10f
490 weight_\w add
49110: neg w4, w4
492 weight_\w sub
493endfunc
494.endm
495
// Instantiate the function for the three block widths.
496 weight_func 16
497 weight_func 8
498 weight_func 4