/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
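/*
 * The functions below implement the standard H.264 bilinear chroma
 * interpolation
 *
 *     dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1]
 *               + 32) >> 6
 *
 * with A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy, so A+B+C+D = 64.
 * The rv40 and vc1 variants add a codec-specific rounding bias (kept in
 * v22) and use a non-rounding shift instead of the rounding one.
 *
 * A minimal scalar C sketch of one 8-wide output row, for reference only
 * (the variable names are illustrative, not an FFmpeg API):
 *
 *     for (int i = 0; i < 8; i++)
 *         dst[i] = (A * src[i]          + B * src[i + 1] +
 *                   C * src[i + stride] + D * src[i + stride + 1] + 32) >> 6;
 */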
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        sxtw            x2, w2
  .ifc \type,avg
        mov             x8, x0
  .endif
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
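        /* RV40 replaces the constant rounding term with a per-position
           bias: the (y>>1)*8 + (x>>1)*2 byte offset selects
           rv40bias[y>>1][x>>1] (4 halfwords per table row), which is
           broadcast into all lanes of v22. */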
  .ifc \codec,rv40
        movrel          x6, rv40bias
        lsr             w9, w5, #1
        lsr             w10, w4, #1
        lsl             w9, w9, #3
        lsl             w10, w10, #1
        add             w9, w9, w10
        add             x6, x6, w9, UXTW
        ld1r            {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi            v22.8H, #28
  .endif
        mul             w7, w4, w5
        lsl             w14, w5, #3
        lsl             w13, w4, #3
        cmp             w7, #0
        sub             w6, w14, w7
        sub             w12, w13, w7
        sub             w4, w7, w13
        sub             w4, w4, w14
        add             w4, w4, #64
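        /* Now w4 = A = (8-x)(8-y), w12 = B = x(8-y), w6 = C = (8-x)y and
           w7 = D = xy.  The flags still hold "cmp w7, #0", so the branch
           below takes the cheaper 1-D/copy paths whenever D == 0, i.e.
           when x == 0 or y == 0. */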
        b.eq            2f

        dup             v0.8B, w4
        dup             v1.8B, w12
        ld1             {v4.8B, v5.8B}, [x1], x2
        dup             v2.8B, w6
        dup             v3.8B, w7
        ext             v5.8B, v4.8B, v5.8B, #1
1:      ld1             {v6.8B, v7.8B}, [x1], x2
        umull           v16.8H, v4.8B, v0.8B
        umlal           v16.8H, v5.8B, v1.8B
        ext             v7.8B, v6.8B, v7.8B, #1
        ld1             {v4.8B, v5.8B}, [x1], x2
        umlal           v16.8H, v6.8B, v2.8B
        prfm            pldl1strm, [x1]
        ext             v5.8B, v4.8B, v5.8B, #1
        umlal           v16.8H, v7.8B, v3.8B
        umull           v17.8H, v6.8B, v0.8B
        subs            w3, w3, #2
        umlal           v17.8H, v7.8B, v1.8B
        umlal           v17.8H, v4.8B, v2.8B
        umlal           v17.8H, v5.8B, v3.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            1b
        ret

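        /* D == 0: only 1-D filtering (or a plain copy) is needed.
           w12 = B + C picks the path: both zero -> copy (5:), C == 0
           (y == 0) -> horizontal filter (4:), otherwise vertical (3:). */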
2:      adds            w12, w12, w6
        dup             v0.8B, w4
        b.eq            5f
        tst             w6, w6
        dup             v1.8B, w12
        b.eq            4f

        ld1             {v4.8B}, [x1], x2
3:      ld1             {v6.8B}, [x1], x2
        umull           v16.8H, v4.8B, v0.8B
        umlal           v16.8H, v6.8B, v1.8B
        ld1             {v4.8B}, [x1], x2
        umull           v17.8H, v6.8B, v0.8B
        umlal           v17.8H, v4.8B, v1.8B
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        subs            w3, w3, #2
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            3b
        ret

4:      ld1             {v4.8B, v5.8B}, [x1], x2
        ld1             {v6.8B, v7.8B}, [x1], x2
        ext             v5.8B, v4.8B, v5.8B, #1
        ext             v7.8B, v6.8B, v7.8B, #1
        prfm            pldl1strm, [x1]
        subs            w3, w3, #2
        umull           v16.8H, v4.8B, v0.8B
        umlal           v16.8H, v5.8B, v1.8B
        umull           v17.8H, v6.8B, v0.8B
        umlal           v17.8H, v7.8B, v1.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            4b
        ret

5:      ld1             {v4.8B}, [x1], x2
        ld1             {v5.8B}, [x1], x2
        prfm            pldl1strm, [x1]
        subs            w3, w3, #2
        umull           v16.8H, v4.8B, v0.8B
        umull           v17.8H, v5.8B, v0.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            5b
        ret
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
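/* Same bilinear filter as mc8, but two 4-pixel rows are packed into one
   8-byte vector per multiply: trn1 on .2S lanes pairs each row with its
   one-pixel-shifted copy, and the trn1/trn2 on .2D lanes recombine the
   two half-accumulators before the final shift. */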
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        sxtw            x2, w2
  .ifc \type,avg
        mov             x8, x0
  .endif
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,rv40
        movrel          x6, rv40bias
        lsr             w9, w5, #1
        lsr             w10, w4, #1
        lsl             w9, w9, #3
        lsl             w10, w10, #1
        add             w9, w9, w10
        add             x6, x6, w9, UXTW
        ld1r            {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi            v22.8H, #28
  .endif
        mul             w7, w4, w5
        lsl             w14, w5, #3
        lsl             w13, w4, #3
        cmp             w7, #0
        sub             w6, w14, w7
        sub             w12, w13, w7
        sub             w4, w7, w13
        sub             w4, w4, w14
        add             w4, w4, #64
        b.eq            2f

        dup             v24.8B, w4
        dup             v25.8B, w12
        ld1             {v4.8B}, [x1], x2
        dup             v26.8B, w6
        dup             v27.8B, w7
        ext             v5.8B, v4.8B, v5.8B, #1
        trn1            v0.2S, v24.2S, v25.2S
        trn1            v2.2S, v26.2S, v27.2S
        trn1            v4.2S, v4.2S, v5.2S
1:      ld1             {v6.8B}, [x1], x2
        ext             v7.8B, v6.8B, v7.8B, #1
        trn1            v6.2S, v6.2S, v7.2S
        umull           v18.8H, v4.8B, v0.8B
        umlal           v18.8H, v6.8B, v2.8B
        ld1             {v4.8B}, [x1], x2
        ext             v5.8B, v4.8B, v5.8B, #1
        trn1            v4.2S, v4.2S, v5.2S
        prfm            pldl1strm, [x1]
        umull           v19.8H, v6.8B, v0.8B
        umlal           v19.8H, v4.8B, v2.8B
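        /* v18/v19 each hold the {A*a+C*c | B*b+D*d} halves of one output
           row; swapping the high .2D lanes and adding yields both
           complete rows at once. */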
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
        subs            w3, w3, #2
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            1b
        ret

2:      adds            w12, w12, w6
        dup             v30.8B, w4
        b.eq            5f
        tst             w6, w6
        dup             v31.8B, w12
        trn1            v0.2S, v30.2S, v31.2S
        trn2            v1.2S, v30.2S, v31.2S
        b.eq            4f

        ext             v1.8B, v0.8B, v1.8B, #4
        ld1             {v4.S}[0], [x1], x2
3:      ld1             {v4.S}[1], [x1], x2
        umull           v18.8H, v4.8B, v0.8B
        ld1             {v4.S}[0], [x1], x2
        umull           v19.8H, v4.8B, v1.8B
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        subs            w3, w3, #2
        prfm            pldl1strm, [x1, x2]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            3b
        ret

4:      ld1             {v4.8B}, [x1], x2
        ld1             {v6.8B}, [x1], x2
        ext             v5.8B, v4.8B, v5.8B, #1
        ext             v7.8B, v6.8B, v7.8B, #1
        trn1            v4.2S, v4.2S, v5.2S
        trn1            v6.2S, v6.2S, v7.2S
        umull           v18.8H, v4.8B, v0.8B
        umull           v19.8H, v6.8B, v0.8B
        subs            w3, w3, #2
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        prfm            pldl1strm, [x1]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            4b
        ret

5:      ld1             {v4.S}[0], [x1], x2
        ld1             {v4.S}[1], [x1], x2
        umull           v18.8H, v4.8B, v30.8B
        subs            w3, w3, #2
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        prfm            pldl1strm, [x1]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            5b
        ret
endfunc
.endm

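/* chroma_mc2(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
   H.264 only.  Two 2-pixel rows per iteration: the weights are packed as
   {A,A,B,B} / {C,C,D,D} halfword pairs, and rev64 + add folds the partial
   products into the final 2-pixel sums. */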
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        sxtw            x2, w2
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
        orr             w7, w4, w5
        cbz             w7, 2f

        mul             w7, w4, w5
        lsl             w14, w5, #3
        lsl             w13, w4, #3
        sub             w6, w14, w7
        sub             w12, w13, w7
        sub             w4, w7, w13
        sub             w4, w4, w14
        add             w4, w4, #64
        dup             v0.8B, w4
        dup             v2.8B, w12
        dup             v1.8B, w6
        dup             v3.8B, w7
        trn1            v0.4H, v0.4H, v2.4H
        trn1            v1.4H, v1.4H, v3.4H
1:
        ld1             {v4.S}[0], [x1], x2
        ld1             {v4.S}[1], [x1], x2
        rev64           v5.2S, v4.2S
        ld1             {v5.S}[1], [x1]
        ext             v6.8B, v4.8B, v5.8B, #1
        ext             v7.8B, v5.8B, v4.8B, #1
        trn1            v4.4H, v4.4H, v6.4H
        trn1            v5.4H, v5.4H, v7.4H
        umull           v16.8H, v4.8B, v0.8B
        umlal           v16.8H, v5.8B, v1.8B
  .ifc \type,avg
        ld1             {v18.H}[0], [x0], x2
        ld1             {v18.H}[2], [x0]
        sub             x0, x0, x2
  .endif
        rev64           v17.4S, v16.4S
        add             v16.8H, v16.8H, v17.8H
        rshrn           v16.8B, v16.8H, #6
  .ifc \type,avg
        urhadd          v16.8B, v16.8B, v18.8B
  .endif
        st1             {v16.H}[0], [x0], x2
        st1             {v16.H}[2], [x0], x2
        subs            w3, w3, #2
        b.gt            1b
        ret

2:
        ld1             {v16.H}[0], [x1], x2
        ld1             {v16.H}[1], [x1], x2
  .ifc \type,avg
        ld1             {v18.H}[0], [x0], x2
        ld1             {v18.H}[1], [x0]
        sub             x0, x0, x2
        urhadd          v16.8B, v16.8B, v18.8B
  .endif
        st1             {v16.H}[0], [x0], x2
        st1             {v16.H}[1], [x0], x2
        subs            w3, w3, #2
        b.gt            2b
        ret
endfunc
.endm

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg

#if CONFIG_RV40_DECODER
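/* Rounding bias for RV40 chroma MC, indexed as rv40bias[y>>1][x>>1]
   (4 halfwords per row; see the lookup at the top of the macros). */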
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif

#if CONFIG_VC1_DECODER
        h264_chroma_mc8 put, vc1
        h264_chroma_mc8 avg, vc1
        h264_chroma_mc4 put, vc1
        h264_chroma_mc4 avg, vc1
#endif