Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libavcodec / aarch64 / h264idct_neon.S
CommitLineData
2ba45a60
DM
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21
22#include "libavutil/aarch64/asm.S"
23#include "neon.S"
24
// ff_h264_idct_add_neon
//   x0  destination: four rows of four bytes, consecutive rows x2 bytes apart
//   x1  sixteen 16-bit coefficients (32 bytes); zeroed here, pointer restored
//   w2  row stride, sign-extended to 64 bits before it is used
// Two butterfly passes with a 4x4 transpose in between, a rounding shift
// right by 6, then the result is added to the destination bytes and
// saturated back to unsigned 8 bit.
// On return x0 points four rows past its entry value.
// Writes v0-v7, v16-v19 and v30 (transpose_4x4H is defined elsewhere; it is
// handed v4-v7 as its last four arguments, presumably as scratch).
function ff_h264_idct_add_neon, export=1
        ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x1]
        sxtw            x2,  w2
        movi            v30.8h, #0

        // first pass; the two st1 clear the 32 coefficient bytes
        add             v4.4h,  v0.4h,  v2.4h
        sshr            v16.4h, v1.4h,  #1
        st1             {v30.8h}, [x1], #16
        sshr            v17.4h, v3.4h,  #1
        st1             {v30.8h}, [x1], #16
        sub             v5.4h,  v0.4h,  v2.4h
        add             v6.4h,  v1.4h,  v17.4h
        sub             v7.4h,  v16.4h, v3.4h
        add             v0.4h,  v4.4h,  v6.4h
        add             v1.4h,  v5.4h,  v7.4h
        sub             v2.4h,  v4.4h,  v6.4h
        sub             v3.4h,  v5.4h,  v7.4h

        transpose_4x4H  v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7

        // second pass, interleaved with loading the destination rows:
        // v18 = { row 0, row 2 }, v19 = { row 3, row 1 }
        add             v4.4h,  v0.4h,  v3.4h
        ld1             {v18.s}[0], [x0], x2
        sshr            v16.4h, v2.4h,  #1
        sshr            v17.4h, v1.4h,  #1
        ld1             {v19.s}[1], [x0], x2
        sub             v5.4h,  v0.4h,  v3.4h
        ld1             {v18.s}[1], [x0], x2
        add             v6.4h,  v16.4h, v1.4h
        ins             v4.d[1], v5.d[0]
        sub             v7.4h,  v2.4h,  v17.4h
        ld1             {v19.s}[0], [x0], x2
        ins             v6.d[1], v7.d[0]
        sub             x0,  x0,  x2,  lsl #2   // rewind to row 0
        add             v0.8h,  v4.8h,  v6.8h
        sub             v1.8h,  v4.8h,  v6.8h

        srshr           v0.8h,  v0.8h,  #6
        srshr           v1.8h,  v1.8h,  #6

        uaddw           v0.8h,  v0.8h,  v18.8b
        uaddw           v1.8h,  v1.8h,  v19.8b

        sqxtun          v0.8b,  v0.8h
        sqxtun          v1.8b,  v1.8h

        // stores use the same lane order as the loads above
        st1             {v0.s}[0], [x0], x2
        st1             {v1.s}[1], [x0], x2
        st1             {v0.s}[1], [x0], x2
        st1             {v1.s}[0], [x0], x2

        sub             x1,  x1,  #32           // undo the two post-increments
        ret
endfunc
78
// ff_h264_idct_dc_add_neon
//   x0  destination: four rows of four bytes, consecutive rows x2 bytes apart
//   x1  coefficients; only the first 16-bit value is read, then cleared
//   w2  row stride, sign-extended to 64 bits before it is used
// Adds (coefficient + 32) >> 6 to each of the 16 destination bytes with
// unsigned saturation.
// Writes w3 and v0-v4; x0 ends four rows past its entry value.
function ff_h264_idct_dc_add_neon, export=1
        sxtw            x2,  w2
        mov             w3,  #0
        ld1r            {v2.8h}, [x1]           // first coefficient in every lane
        strh            w3,  [x1]               // clear it in memory
        srshr           v2.8h,  v2.8h,  #6
        ld1             {v0.s}[0], [x0], x2
        ld1             {v0.s}[1], [x0], x2
        uaddw           v3.8h,  v2.8h,  v0.8b   // rows 0 and 1
        ld1             {v1.s}[0], [x0], x2
        ld1             {v1.s}[1], [x0], x2
        uaddw           v4.8h,  v2.8h,  v1.8b   // rows 2 and 3
        sqxtun          v0.8b,  v3.8h
        sqxtun          v1.8b,  v4.8h
        sub             x0,  x0,  x2,  lsl #2   // rewind to row 0
        st1             {v0.s}[0], [x0], x2
        st1             {v0.s}[1], [x0], x2
        st1             {v1.s}[0], [x0], x2
        st1             {v1.s}[1], [x0], x2
        ret
endfunc
100
// ff_h264_idct_add16_neon
//   x0  dest base pointer
//   x1  block_offset: signed 32-bit offsets into dest, one per block
//   x2  block: coefficients, 32 bytes per block
//   w3  stride
//   x4  byte table indexed by the scan8 entries (presumably the non-zero
//       coefficient counts; confirm against the C caller)
// Walks 16 blocks. A block whose table byte is 0 is skipped. Otherwise the
// DC-only routine is called when the byte is 1 and the first coefficient is
// non-zero, and the full 4x4 routine in every other case.
function ff_h264_idct_add16_neon, export=1
        mov             x12, x30                // blr below overwrites x30
        mov             x6,  x0                 // dest
        mov             x5,  x1                 // block_offset
        mov             x1,  x2                 // block
        mov             w9,  w3                 // stride
        movrel          x7,  scan8
        mov             x10, #16                // blocks remaining
        movrel          x13, X(ff_h264_idct_dc_add_neon)
        movrel          x14, X(ff_h264_idct_add_neon)
1:      mov             w2,  w9                 // stride argument for the callee
        ldrb            w3,  [x7], #1           // next scan8 entry
        ldrsw           x0,  [x5], #4           // next block offset
        ldrb            w3,  [x4, w3, uxtw]     // table byte for this block
        subs            w3,  w3,  #1
        b.lt            2f                      // byte was 0: skip the block
        ldrsh           w3,  [x1]               // first coefficient
        add             x0,  x0,  x6
        ccmp            w3,  #0,  #4,  eq       // byte was 1: test coefficient, else force Z
        csel            x15, x13, x14, ne       // ne only if byte == 1 and coefficient != 0
        blr             x15
2:      subs            x10, x10, #1
        add             x1,  x1,  #32           // next block of coefficients
        b.ne            1b
        ret             x12
endfunc
127
// ff_h264_idct_add16intra_neon
//   x0  dest base pointer
//   x1  block_offset: signed 32-bit offsets into dest, one per block
//   x2  block: coefficients, 32 bytes per block
//   w3  stride
//   x4  byte table indexed by the scan8 entries (presumably the non-zero
//       coefficient counts; confirm against the C caller)
// Walks 16 blocks. A non-zero table byte selects the full 4x4 routine.
// A zero byte selects the DC-only routine, and the call is skipped when the
// first coefficient is zero as well.
function ff_h264_idct_add16intra_neon, export=1
        mov             x12, x30                // blr below overwrites x30
        mov             x6,  x0                 // dest
        mov             x5,  x1                 // block_offset
        mov             x1,  x2                 // block
        mov             w9,  w3                 // stride
        movrel          x7,  scan8
        mov             x10, #16                // blocks remaining
        movrel          x13, X(ff_h264_idct_dc_add_neon)
        movrel          x14, X(ff_h264_idct_add_neon)
1:      mov             w2,  w9                 // stride argument for the callee
        ldrb            w3,  [x7], #1           // next scan8 entry
        ldrsw           x0,  [x5], #4           // next block offset
        ldrb            w3,  [x4, w3, uxtw]     // table byte for this block
        add             x0,  x0,  x6
        cmp             w3,  #0
        ldrsh           w3,  [x1]               // first coefficient
        csel            x15, x13, x14, eq       // byte == 0: DC-only routine
        ccmp            w3,  #0,  #0,  eq       // byte == 0: test coefficient, else clear Z
        b.eq            2f                      // byte == 0 and coefficient == 0
        blr             x15
2:      subs            x10, x10, #1
        add             x1,  x1,  #32           // next block of coefficients
        b.ne            1b
        ret             x12
endfunc
154
// ff_h264_idct_add8_neon
//   x0  pointer to two destination pointers, dest[0] and dest[1]
//   x1  block_offset: signed 32-bit offsets; entries from index 16 are used
//   x2  block: coefficients, 32 bytes per block; blocks from index 16 are used
//   w3  stride
//   x4  byte table indexed by the scan8 entries (nnzc)
// The loop index runs 0..3 with dest[0], then jumps to 16..19 with dest[1],
// all relative to entry 16 of scan8, block_offset and block.
// Per block: a non-zero table byte selects the full 4x4 routine, a zero
// byte selects the DC-only routine, and the call is skipped when the first
// coefficient is zero as well.
// x19/x20 are saved in a 64-byte stack frame and restored on exit.
function ff_h264_idct_add8_neon, export=1
        stp             x19, x20, [sp, #-0x40]!
        mov             x12, x30                // blr below overwrites x30
        ldp             x6,  x15, [x0]          // dest[0], dest[1]
        add             x5,  x1,  #16*4         // block_offset
        add             x9,  x2,  #16*32        // block
        mov             w19, w3                 // stride
        movrel          x13, X(ff_h264_idct_dc_add_neon)
        movrel          x14, X(ff_h264_idct_add_neon)
        movrel          x7,  scan8+16
        mov             x10, #0                 // loop index i
        mov             x11, #16                // value i jumps to after 3
1:      mov             w2,  w19                // stride argument for the callee
        ldrb            w3,  [x7, x10]          // scan8[i]
        ldrsw           x0,  [x5, x10, lsl #2]  // block_offset[i]
        ldrb            w3,  [x4, w3, uxtw]     // nnzc[ scan8[i] ]
        add             x0,  x0,  x6            // block_offset[i] + current dest
        add             x1,  x9,  x10, lsl #5   // block + i * 16 coefficients
        cmp             w3,  #0
        ldrsh           w3,  [x1]               // block[i*16]
        csel            x20, x13, x14, eq       // nnzc == 0: DC-only routine
        ccmp            w3,  #0,  #0,  eq       // nnzc == 0: test coefficient, else clear Z
        b.eq            2f                      // nothing to add
        blr             x20
2:      add             x10, x10, #1
        cmp             x10, #4
        csel            x10, x11, x10, eq       // after i == 3 continue at i = 16
        csel            x6,  x15, x6,  eq       // and switch to dest[1]
        cmp             x10, #20
        b.lt            1b
        ldp             x19, x20, [sp], #0x40
        ret             x12
endfunc
190
// idct8x8_cols pass
// One pass of the 8x8 transform over eight vectors of eight 16-bit lanes.
//   pass == 0: inputs are v24-v29 plus v30/v31, which are loaded here from
//              [x1]; those 32 bytes are then overwritten with v19, so v19
//              must hold zero on entry, and x1 advances by 32.
//              Results land in v24-v29, v18 and v31.
//   pass != 0: inputs are v24-v29, v18 and v31; results land in v24-v31.
// v16, v17 and v19 are scratch in both passes. The aliases va and vb name
// the two registers whose roles swap between the passes.
.macro  idct8x8_cols    pass
  .if \pass == 0
        va      .req    v18
        vb      .req    v30
        sshr            v18.8h, v26.8h, #1
        add             v16.8h, v24.8h, v28.8h
        ld1             {v30.8h, v31.8h}, [x1]
        st1             {v19.8h}, [x1], #16
        st1             {v19.8h}, [x1], #16
        sub             v17.8h, v24.8h, v28.8h
        sshr            v19.8h, v30.8h, #1
        sub             v18.8h, v18.8h, v30.8h
        add             v19.8h, v19.8h, v26.8h
  .else
        va      .req    v30
        vb      .req    v18
        sshr            v30.8h, v26.8h, #1
        sshr            v19.8h, v18.8h, #1
        add             v16.8h, v24.8h, v28.8h
        sub             v17.8h, v24.8h, v28.8h
        sub             v30.8h, v30.8h, v18.8h
        add             v19.8h, v19.8h, v26.8h
  .endif
        // even-indexed inputs
        add             v26.8h, v17.8h, va.8h
        sub             v28.8h, v17.8h, va.8h
        add             v24.8h, v16.8h, v19.8h
        sub             vb.8h,  v16.8h, v19.8h
        // odd-indexed inputs
        sub             v16.8h, v29.8h, v27.8h
        add             v17.8h, v31.8h, v25.8h
        sub             va.8h,  v31.8h, v25.8h
        add             v19.8h, v29.8h, v27.8h
        sub             v16.8h, v16.8h, v31.8h
        sub             v17.8h, v17.8h, v27.8h
        add             va.8h,  va.8h,  v29.8h
        add             v19.8h, v19.8h, v25.8h
        sshr            v25.8h, v25.8h, #1
        sshr            v27.8h, v27.8h, #1
        sshr            v29.8h, v29.8h, #1
        sshr            v31.8h, v31.8h, #1
        sub             v16.8h, v16.8h, v31.8h
        sub             v17.8h, v17.8h, v27.8h
        add             va.8h,  va.8h,  v29.8h
        add             v19.8h, v19.8h, v25.8h
        sshr            v25.8h, v16.8h, #2
        sshr            v27.8h, v17.8h, #2
        sshr            v29.8h, va.8h,  #2
        sshr            v31.8h, v19.8h, #2
        sub             v19.8h, v19.8h, v25.8h
        sub             va.8h,  v27.8h, va.8h
        add             v17.8h, v17.8h, v29.8h
        add             v16.8h, v16.8h, v31.8h
        // combine the two halves into the eight outputs
  .if \pass == 0
        sub             v31.8h, v24.8h, v19.8h
        add             v24.8h, v24.8h, v19.8h
        add             v25.8h, v26.8h, v18.8h
        sub             v18.8h, v26.8h, v18.8h
        add             v26.8h, v28.8h, v17.8h
        add             v27.8h, v30.8h, v16.8h
        sub             v29.8h, v28.8h, v17.8h
        sub             v28.8h, v30.8h, v16.8h
  .else
        sub             v31.8h, v24.8h, v19.8h
        add             v24.8h, v24.8h, v19.8h
        add             v25.8h, v26.8h, v30.8h
        sub             v30.8h, v26.8h, v30.8h
        add             v26.8h, v28.8h, v17.8h
        sub             v29.8h, v28.8h, v17.8h
        add             v27.8h, v18.8h, v16.8h
        sub             v28.8h, v18.8h, v16.8h
  .endif
        .unreq          va
        .unreq          vb
.endm
264
// ff_h264_idct8_add_neon
//   x0  destination: eight rows of eight bytes, consecutive rows x2 bytes apart
//   x1  sixty-four 16-bit coefficients (128 bytes); zeroed here, pointer restored
//   w2  row stride, sign-extended to 64 bits before it is used
// Two idct8x8_cols passes with an 8x8 transpose in between, a rounding shift
// right by 6, then the result is added to the destination bytes and
// saturated back to unsigned 8 bit.
// Writes x3, v0-v7, v16-v19 and v24-v31 (transpose_8x8H is defined
// elsewhere; it is handed v6/v7 as its last two arguments, presumably as
// scratch). x0 ends eight rows past its entry value.
function ff_h264_idct8_add_neon, export=1
        movi            v19.8h, #0
        // The stride is a 32-bit argument, so bits 63:32 of x2 cannot be
        // relied on. ff_h264_idct8_add4_neon also sets it with a w-register
        // move, which zero-extends. Without this, a negative stride gives
        // wrong row addresses in the post-indexed loads and stores below.
        sxtw            x2,  w2
        ld1             {v24.8h, v25.8h}, [x1]  // rows 0-1
        st1             {v19.8h}, [x1], #16     // clear what was just loaded
        st1             {v19.8h}, [x1], #16
        ld1             {v26.8h, v27.8h}, [x1]  // rows 2-3
        st1             {v19.8h}, [x1], #16
        st1             {v19.8h}, [x1], #16
        ld1             {v28.8h, v29.8h}, [x1]  // rows 4-5
        st1             {v19.8h}, [x1], #16
        st1             {v19.8h}, [x1], #16

        idct8x8_cols    0                       // also loads and clears rows 6-7
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v18, v31, v6,  v7
        idct8x8_cols    1

        mov             x3,  x0                 // x0 walks the loads, x3 the stores
        srshr           v24.8h, v24.8h, #6
        ld1             {v0.8b}, [x0], x2
        srshr           v25.8h, v25.8h, #6
        ld1             {v1.8b}, [x0], x2
        srshr           v26.8h, v26.8h, #6
        ld1             {v2.8b}, [x0], x2
        srshr           v27.8h, v27.8h, #6
        ld1             {v3.8b}, [x0], x2
        srshr           v28.8h, v28.8h, #6
        ld1             {v4.8b}, [x0], x2
        srshr           v29.8h, v29.8h, #6
        ld1             {v5.8b}, [x0], x2
        srshr           v30.8h, v30.8h, #6
        ld1             {v6.8b}, [x0], x2
        srshr           v31.8h, v31.8h, #6
        ld1             {v7.8b}, [x0], x2
        uaddw           v24.8h, v24.8h, v0.8b
        uaddw           v25.8h, v25.8h, v1.8b
        uaddw           v26.8h, v26.8h, v2.8b
        sqxtun          v0.8b,  v24.8h
        uaddw           v27.8h, v27.8h, v3.8b
        sqxtun          v1.8b,  v25.8h
        uaddw           v28.8h, v28.8h, v4.8b
        sqxtun          v2.8b,  v26.8h
        st1             {v0.8b}, [x3], x2
        uaddw           v29.8h, v29.8h, v5.8b
        sqxtun          v3.8b,  v27.8h
        st1             {v1.8b}, [x3], x2
        uaddw           v30.8h, v30.8h, v6.8b
        sqxtun          v4.8b,  v28.8h
        st1             {v2.8b}, [x3], x2
        uaddw           v31.8h, v31.8h, v7.8b
        sqxtun          v5.8b,  v29.8h
        st1             {v3.8b}, [x3], x2
        sqxtun          v6.8b,  v30.8h
        sqxtun          v7.8b,  v31.8h
        st1             {v4.8b}, [x3], x2
        st1             {v5.8b}, [x3], x2
        st1             {v6.8b}, [x3], x2
        st1             {v7.8b}, [x3], x2

        sub             x1,  x1,  #128          // undo the eight post-increments
        ret
endfunc
326
// ff_h264_idct8_dc_add_neon
//   x0  destination: eight rows of eight bytes, consecutive rows x2 bytes apart
//   x1  coefficients; only the first 16-bit value is read, then cleared
//   w2  row stride, sign-extended to 64 bits before it is used
// Adds (coefficient + 32) >> 6 to each of the 64 destination bytes with
// unsigned saturation.
// Writes w3, v0-v7 and v24-v31; x0 ends eight rows past its entry value.
function ff_h264_idct8_dc_add_neon, export=1
        mov             w3,  #0
        sxtw            x2,  w2
        ld1r            {v31.8h}, [x1]          // first coefficient in every lane
        strh            w3,  [x1]               // clear it in memory
        ld1             {v0.8b}, [x0], x2
        srshr           v31.8h, v31.8h, #6
        ld1             {v1.8b}, [x0], x2
        ld1             {v2.8b}, [x0], x2
        uaddw           v24.8h, v31.8h, v0.8b
        ld1             {v3.8b}, [x0], x2
        uaddw           v25.8h, v31.8h, v1.8b
        ld1             {v4.8b}, [x0], x2
        uaddw           v26.8h, v31.8h, v2.8b
        ld1             {v5.8b}, [x0], x2
        uaddw           v27.8h, v31.8h, v3.8b
        ld1             {v6.8b}, [x0], x2
        uaddw           v28.8h, v31.8h, v4.8b
        ld1             {v7.8b}, [x0], x2
        uaddw           v29.8h, v31.8h, v5.8b
        uaddw           v30.8h, v31.8h, v6.8b
        uaddw           v31.8h, v31.8h, v7.8b   // last use of the DC value
        sqxtun          v0.8b,  v24.8h
        sqxtun          v1.8b,  v25.8h
        sqxtun          v2.8b,  v26.8h
        sqxtun          v3.8b,  v27.8h
        sub             x0,  x0,  x2,  lsl #3   // rewind to row 0
        st1             {v0.8b}, [x0], x2
        sqxtun          v4.8b,  v28.8h
        st1             {v1.8b}, [x0], x2
        sqxtun          v5.8b,  v29.8h
        st1             {v2.8b}, [x0], x2
        sqxtun          v6.8b,  v30.8h
        st1             {v3.8b}, [x0], x2
        sqxtun          v7.8b,  v31.8h
        st1             {v4.8b}, [x0], x2
        st1             {v5.8b}, [x0], x2
        st1             {v6.8b}, [x0], x2
        st1             {v7.8b}, [x0], x2
        ret
endfunc
368
// ff_h264_idct8_add4_neon
//   x0  dest base pointer
//   x1  table of signed 32-bit offsets into dest; every fourth entry is used
//   x2  coefficients, 128 bytes per 8x8 block
//   w3  stride
//   x4  byte table indexed by the scan8 entries (presumably the non-zero
//       coefficient counts; confirm against the C caller)
// Walks four 8x8 blocks, reading every fourth scan8 entry. A block whose
// table byte is 0 is skipped. Otherwise the DC-only routine is called when
// the byte is 1 and the first coefficient is non-zero, and the full 8x8
// routine in every other case.
function ff_h264_idct8_add4_neon, export=1
        mov             x12, x30                // blr below overwrites x30
        mov             x6,  x0                 // dest base
        mov             x5,  x1                 // offset table
        mov             x1,  x2                 // coefficients
        mov             w2,  w3                 // stride, set once for all calls
        movrel          x7,  scan8
        mov             w10, #16                // counts down in steps of 4
        movrel          x13, X(ff_h264_idct8_dc_add_neon)
        movrel          x14, X(ff_h264_idct8_add_neon)
1:      ldrb            w9,  [x7], #4           // scan8 entry, step of four
        ldrsw           x0,  [x5], #16          // offset, step of four entries
        ldrb            w9,  [x4, w9, uxtw]     // table byte for this block
        subs            w9,  w9,  #1
        b.lt            2f                      // byte was 0: skip the block
        ldrsh           w11, [x1]               // first coefficient
        add             x0,  x6,  x0
        ccmp            w11, #0,  #4,  eq       // byte was 1: test coefficient, else force Z
        csel            x15, x13, x14, ne       // ne only if byte == 1 and coefficient != 0
        blr             x15
2:      subs            w10, w10, #4
        add             x1,  x1,  #128          // next 8x8 block of coefficients
        b.ne            1b
        ret             x12
endfunc
394
// scan8: 48-byte lookup table. The block loops in this file read an entry
// and use it as a byte index into the table passed in x4. Each entry is
// written as column + row*8.
const   scan8
        // entries 0-15
        .byte            4 +  1*8,  5 +  1*8,  4 +  2*8,  5 +  2*8
        .byte            6 +  1*8,  7 +  1*8,  6 +  2*8,  7 +  2*8
        .byte            4 +  3*8,  5 +  3*8,  4 +  4*8,  5 +  4*8
        .byte            6 +  3*8,  7 +  3*8,  6 +  4*8,  7 +  4*8
        // entries 16-31
        .byte            4 +  6*8,  5 +  6*8,  4 +  7*8,  5 +  7*8
        .byte            6 +  6*8,  7 +  6*8,  6 +  7*8,  7 +  7*8
        .byte            4 +  8*8,  5 +  8*8,  4 +  9*8,  5 +  9*8
        .byte            6 +  8*8,  7 +  8*8,  6 +  9*8,  7 +  9*8
        // entries 32-47
        .byte            4 + 11*8,  5 + 11*8,  4 + 12*8,  5 + 12*8
        .byte            6 + 11*8,  7 + 11*8,  6 + 12*8,  7 + 12*8
        .byte            4 + 13*8,  5 + 13*8,  4 + 14*8,  5 + 14*8
        .byte            6 + 13*8,  7 + 13*8,  6 + 14*8,  7 + 14*8
endconst