/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"
25 | function ff_h264_idct_add_neon, export=1 | |
26 | ld1 {v0.4H, v1.4H, v2.4H, v3.4H}, [x1] | |
27 | sxtw x2, w2 | |
28 | movi v30.8H, #0 | |
29 | ||
30 | add v4.4H, v0.4H, v2.4H | |
31 | sshr v16.4H, v1.4H, #1 | |
32 | st1 {v30.8H}, [x1], #16 | |
33 | sshr v17.4H, v3.4H, #1 | |
34 | st1 {v30.8H}, [x1], #16 | |
35 | sub v5.4H, v0.4H, v2.4H | |
36 | add v6.4H, v1.4H, v17.4H | |
37 | sub v7.4H, v16.4H, v3.4H | |
38 | add v0.4H, v4.4H, v6.4H | |
39 | add v1.4H, v5.4H, v7.4H | |
40 | sub v2.4H, v4.4H, v6.4H | |
41 | sub v3.4H, v5.4H, v7.4H | |
42 | ||
43 | transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7 | |
44 | ||
45 | add v4.4H, v0.4H, v3.4H | |
46 | ld1 {v18.S}[0], [x0], x2 | |
47 | sshr v16.4H, v2.4H, #1 | |
48 | sshr v17.4H, v1.4H, #1 | |
49 | ld1 {v19.S}[1], [x0], x2 | |
50 | sub v5.4H, v0.4H, v3.4H | |
51 | ld1 {v18.S}[1], [x0], x2 | |
52 | add v6.4H, v16.4H, v1.4H | |
53 | ins v4.D[1], v5.D[0] | |
54 | sub v7.4H, v2.4H, v17.4H | |
55 | ld1 {v19.S}[0], [x0], x2 | |
56 | ins v6.D[1], v7.D[0] | |
57 | sub x0, x0, x2, lsl #2 | |
58 | add v0.8H, v4.8H, v6.8H | |
59 | sub v1.8H, v4.8H, v6.8H | |
60 | ||
61 | srshr v0.8H, v0.8H, #6 | |
62 | srshr v1.8H, v1.8H, #6 | |
63 | ||
64 | uaddw v0.8H, v0.8H, v18.8B | |
65 | uaddw v1.8H, v1.8H, v19.8B | |
66 | ||
67 | sqxtun v0.8B, v0.8H | |
68 | sqxtun v1.8B, v1.8H | |
69 | ||
70 | st1 {v0.S}[0], [x0], x2 | |
71 | st1 {v1.S}[1], [x0], x2 | |
72 | st1 {v0.S}[1], [x0], x2 | |
73 | st1 {v1.S}[0], [x0], x2 | |
74 | ||
75 | sub x1, x1, #32 | |
76 | ret | |
77 | endfunc | |
78 | ||
79 | function ff_h264_idct_dc_add_neon, export=1 | |
80 | sxtw x2, w2 | |
81 | mov w3, #0 | |
82 | ld1r {v2.8H}, [x1] | |
83 | strh w3, [x1] | |
84 | srshr v2.8H, v2.8H, #6 | |
85 | ld1 {v0.S}[0], [x0], x2 | |
86 | ld1 {v0.S}[1], [x0], x2 | |
87 | uaddw v3.8H, v2.8H, v0.8B | |
88 | ld1 {v1.S}[0], [x0], x2 | |
89 | ld1 {v1.S}[1], [x0], x2 | |
90 | uaddw v4.8H, v2.8H, v1.8B | |
91 | sqxtun v0.8B, v3.8H | |
92 | sqxtun v1.8B, v4.8H | |
93 | sub x0, x0, x2, lsl #2 | |
94 | st1 {v0.S}[0], [x0], x2 | |
95 | st1 {v0.S}[1], [x0], x2 | |
96 | st1 {v1.S}[0], [x0], x2 | |
97 | st1 {v1.S}[1], [x0], x2 | |
98 | ret | |
99 | endfunc | |
100 | ||
101 | function ff_h264_idct_add16_neon, export=1 | |
102 | mov x12, x30 | |
103 | mov x6, x0 // dest | |
104 | mov x5, x1 // block_offset | |
105 | mov x1, x2 // block | |
106 | mov w9, w3 // stride | |
107 | movrel x7, scan8 | |
108 | mov x10, #16 | |
109 | movrel x13, X(ff_h264_idct_dc_add_neon) | |
110 | movrel x14, X(ff_h264_idct_add_neon) | |
111 | 1: mov w2, w9 | |
112 | ldrb w3, [x7], #1 | |
113 | ldrsw x0, [x5], #4 | |
114 | ldrb w3, [x4, w3, uxtw] | |
115 | subs w3, w3, #1 | |
116 | b.lt 2f | |
117 | ldrsh w3, [x1] | |
118 | add x0, x0, x6 | |
119 | ccmp w3, #0, #4, eq | |
120 | csel x15, x13, x14, ne | |
121 | blr x15 | |
122 | 2: subs x10, x10, #1 | |
123 | add x1, x1, #32 | |
124 | b.ne 1b | |
125 | ret x12 | |
126 | endfunc | |
127 | ||
128 | function ff_h264_idct_add16intra_neon, export=1 | |
129 | mov x12, x30 | |
130 | mov x6, x0 // dest | |
131 | mov x5, x1 // block_offset | |
132 | mov x1, x2 // block | |
133 | mov w9, w3 // stride | |
134 | movrel x7, scan8 | |
135 | mov x10, #16 | |
136 | movrel x13, X(ff_h264_idct_dc_add_neon) | |
137 | movrel x14, X(ff_h264_idct_add_neon) | |
138 | 1: mov w2, w9 | |
139 | ldrb w3, [x7], #1 | |
140 | ldrsw x0, [x5], #4 | |
141 | ldrb w3, [x4, w3, uxtw] | |
142 | add x0, x0, x6 | |
143 | cmp w3, #0 | |
144 | ldrsh w3, [x1] | |
145 | csel x15, x13, x14, eq | |
146 | ccmp w3, #0, #0, eq | |
147 | b.eq 2f | |
148 | blr x15 | |
149 | 2: subs x10, x10, #1 | |
150 | add x1, x1, #32 | |
151 | b.ne 1b | |
152 | ret x12 | |
153 | endfunc | |
154 | ||
155 | function ff_h264_idct_add8_neon, export=1 | |
156 | sub sp, sp, #0x40 | |
157 | stp x19, x20, [sp] | |
158 | mov x12, x30 | |
159 | ldp x6, x15, [x0] // dest[0], dest[1] | |
160 | add x5, x1, #16*4 // block_offset | |
161 | add x9, x2, #16*32 // block | |
162 | mov w19, w3 // stride | |
163 | movrel x13, X(ff_h264_idct_dc_add_neon) | |
164 | movrel x14, X(ff_h264_idct_add_neon) | |
165 | movrel x7, scan8+16 | |
166 | mov x10, #0 | |
167 | mov x11, #16 | |
168 | 1: mov w2, w19 | |
169 | ldrb w3, [x7, x10] // scan8[i] | |
170 | ldrsw x0, [x5, x10, lsl #2] // block_offset[i] | |
171 | ldrb w3, [x4, w3, uxtw] // nnzc[ scan8[i] ] | |
172 | add x0, x0, x6 // block_offset[i] + dst[j-1] | |
173 | add x1, x9, x10, lsl #5 // block + i * 16 | |
174 | cmp w3, #0 | |
175 | ldrsh w3, [x1] // block[i*16] | |
176 | csel x20, x13, x14, eq | |
177 | ccmp w3, #0, #0, eq | |
178 | b.eq 2f | |
179 | blr x20 | |
180 | 2: add x10, x10, #1 | |
181 | cmp x10, #4 | |
182 | csel x10, x11, x10, eq // mov x10, #16 | |
183 | csel x6, x15, x6, eq | |
184 | cmp x10, #20 | |
185 | b.lt 1b | |
186 | ldp x19, x20, [sp] | |
187 | add sp, sp, #0x40 | |
188 | ret x12 | |
189 | endfunc | |
190 | ||
191 | .macro idct8x8_cols pass | |
192 | .if \pass == 0 | |
193 | va .req v18 | |
194 | vb .req v30 | |
195 | sshr v18.8H, v26.8H, #1 | |
196 | add v16.8H, v24.8H, v28.8H | |
197 | ld1 {v30.8H, v31.8H}, [x1] | |
198 | st1 {v19.8H}, [x1], #16 | |
199 | st1 {v19.8H}, [x1], #16 | |
200 | sub v17.8H, v24.8H, v28.8H | |
201 | sshr v19.8H, v30.8H, #1 | |
202 | sub v18.8H, v18.8H, v30.8H | |
203 | add v19.8H, v19.8H, v26.8H | |
204 | .else | |
205 | va .req v30 | |
206 | vb .req v18 | |
207 | sshr v30.8H, v26.8H, #1 | |
208 | sshr v19.8H, v18.8H, #1 | |
209 | add v16.8H, v24.8H, v28.8H | |
210 | sub v17.8H, v24.8H, v28.8H | |
211 | sub v30.8H, v30.8H, v18.8H | |
212 | add v19.8H, v19.8H, v26.8H | |
213 | .endif | |
214 | add v26.8H, v17.8H, va.8H | |
215 | sub v28.8H, v17.8H, va.8H | |
216 | add v24.8H, v16.8H, v19.8H | |
217 | sub vb.8H, v16.8H, v19.8H | |
218 | sub v16.8H, v29.8H, v27.8H | |
219 | add v17.8H, v31.8H, v25.8H | |
220 | sub va.8H, v31.8H, v25.8H | |
221 | add v19.8H, v29.8H, v27.8H | |
222 | sub v16.8H, v16.8H, v31.8H | |
223 | sub v17.8H, v17.8H, v27.8H | |
224 | add va.8H, va.8H, v29.8H | |
225 | add v19.8H, v19.8H, v25.8H | |
226 | sshr v25.8H, v25.8H, #1 | |
227 | sshr v27.8H, v27.8H, #1 | |
228 | sshr v29.8H, v29.8H, #1 | |
229 | sshr v31.8H, v31.8H, #1 | |
230 | sub v16.8H, v16.8H, v31.8H | |
231 | sub v17.8H, v17.8H, v27.8H | |
232 | add va.8H, va.8H, v29.8H | |
233 | add v19.8H, v19.8H, v25.8H | |
234 | sshr v25.8H, v16.8H, #2 | |
235 | sshr v27.8H, v17.8H, #2 | |
236 | sshr v29.8H, va.8H, #2 | |
237 | sshr v31.8H, v19.8H, #2 | |
238 | sub v19.8H, v19.8H, v25.8H | |
239 | sub va.8H, v27.8H, va.8H | |
240 | add v17.8H, v17.8H, v29.8H | |
241 | add v16.8H, v16.8H, v31.8H | |
242 | .if \pass == 0 | |
243 | sub v31.8H, v24.8H, v19.8H | |
244 | add v24.8H, v24.8H, v19.8H | |
245 | add v25.8H, v26.8H, v18.8H | |
246 | sub v18.8H, v26.8H, v18.8H | |
247 | add v26.8H, v28.8H, v17.8H | |
248 | add v27.8H, v30.8H, v16.8H | |
249 | sub v29.8H, v28.8H, v17.8H | |
250 | sub v28.8H, v30.8H, v16.8H | |
251 | .else | |
252 | sub v31.8H, v24.8H, v19.8H | |
253 | add v24.8H, v24.8H, v19.8H | |
254 | add v25.8H, v26.8H, v30.8H | |
255 | sub v30.8H, v26.8H, v30.8H | |
256 | add v26.8H, v28.8H, v17.8H | |
257 | sub v29.8H, v28.8H, v17.8H | |
258 | add v27.8H, v18.8H, v16.8H | |
259 | sub v28.8H, v18.8H, v16.8H | |
260 | .endif | |
261 | .unreq va | |
262 | .unreq vb | |
263 | .endm | |
264 | ||
265 | function ff_h264_idct8_add_neon, export=1 | |
266 | movi v19.8H, #0 | |
267 | ld1 {v24.8H, v25.8H}, [x1] | |
268 | st1 {v19.8H}, [x1], #16 | |
269 | st1 {v19.8H}, [x1], #16 | |
270 | ld1 {v26.8H, v27.8H}, [x1] | |
271 | st1 {v19.8H}, [x1], #16 | |
272 | st1 {v19.8H}, [x1], #16 | |
273 | ld1 {v28.8H, v29.8H}, [x1] | |
274 | st1 {v19.8H}, [x1], #16 | |
275 | st1 {v19.8H}, [x1], #16 | |
276 | ||
277 | idct8x8_cols 0 | |
278 | transpose_8x8H v24, v25, v26, v27, v28, v29, v18, v31, v6, v7 | |
279 | idct8x8_cols 1 | |
280 | ||
281 | mov x3, x0 | |
282 | srshr v24.8H, v24.8H, #6 | |
283 | ld1 {v0.8B}, [x0], x2 | |
284 | srshr v25.8H, v25.8H, #6 | |
285 | ld1 {v1.8B}, [x0], x2 | |
286 | srshr v26.8H, v26.8H, #6 | |
287 | ld1 {v2.8B}, [x0], x2 | |
288 | srshr v27.8H, v27.8H, #6 | |
289 | ld1 {v3.8B}, [x0], x2 | |
290 | srshr v28.8H, v28.8H, #6 | |
291 | ld1 {v4.8B}, [x0], x2 | |
292 | srshr v29.8H, v29.8H, #6 | |
293 | ld1 {v5.8B}, [x0], x2 | |
294 | srshr v30.8H, v30.8H, #6 | |
295 | ld1 {v6.8B}, [x0], x2 | |
296 | srshr v31.8H, v31.8H, #6 | |
297 | ld1 {v7.8B}, [x0], x2 | |
298 | uaddw v24.8H, v24.8H, v0.8B | |
299 | uaddw v25.8H, v25.8H, v1.8B | |
300 | uaddw v26.8H, v26.8H, v2.8B | |
301 | sqxtun v0.8B, v24.8H | |
302 | uaddw v27.8H, v27.8H, v3.8B | |
303 | sqxtun v1.8B, v25.8H | |
304 | uaddw v28.8H, v28.8H, v4.8B | |
305 | sqxtun v2.8B, v26.8H | |
306 | st1 {v0.8B}, [x3], x2 | |
307 | uaddw v29.8H, v29.8H, v5.8B | |
308 | sqxtun v3.8B, v27.8H | |
309 | st1 {v1.8B}, [x3], x2 | |
310 | uaddw v30.8H, v30.8H, v6.8B | |
311 | sqxtun v4.8B, v28.8H | |
312 | st1 {v2.8B}, [x3], x2 | |
313 | uaddw v31.8H, v31.8H, v7.8B | |
314 | sqxtun v5.8B, v29.8H | |
315 | st1 {v3.8B}, [x3], x2 | |
316 | sqxtun v6.8B, v30.8H | |
317 | sqxtun v7.8B, v31.8H | |
318 | st1 {v4.8B}, [x3], x2 | |
319 | st1 {v5.8B}, [x3], x2 | |
320 | st1 {v6.8B}, [x3], x2 | |
321 | st1 {v7.8B}, [x3], x2 | |
322 | ||
323 | sub x1, x1, #128 | |
324 | ret | |
325 | endfunc | |
326 | ||
327 | function ff_h264_idct8_dc_add_neon, export=1 | |
328 | mov w3, #0 | |
329 | sxtw x2, w2 | |
330 | ld1r {v31.8H}, [x1] | |
331 | strh w3, [x1] | |
332 | ld1 {v0.8B}, [x0], x2 | |
333 | srshr v31.8H, v31.8H, #6 | |
334 | ld1 {v1.8B}, [x0], x2 | |
335 | ld1 {v2.8B}, [x0], x2 | |
336 | uaddw v24.8H, v31.8H, v0.8B | |
337 | ld1 {v3.8B}, [x0], x2 | |
338 | uaddw v25.8H, v31.8H, v1.8B | |
339 | ld1 {v4.8B}, [x0], x2 | |
340 | uaddw v26.8H, v31.8H, v2.8B | |
341 | ld1 {v5.8B}, [x0], x2 | |
342 | uaddw v27.8H, v31.8H, v3.8B | |
343 | ld1 {v6.8B}, [x0], x2 | |
344 | uaddw v28.8H, v31.8H, v4.8B | |
345 | ld1 {v7.8B}, [x0], x2 | |
346 | uaddw v29.8H, v31.8H, v5.8B | |
347 | uaddw v30.8H, v31.8H, v6.8B | |
348 | uaddw v31.8H, v31.8H, v7.8B | |
349 | sqxtun v0.8B, v24.8H | |
350 | sqxtun v1.8B, v25.8H | |
351 | sqxtun v2.8B, v26.8H | |
352 | sqxtun v3.8B, v27.8H | |
353 | sub x0, x0, x2, lsl #3 | |
354 | st1 {v0.8B}, [x0], x2 | |
355 | sqxtun v4.8B, v28.8H | |
356 | st1 {v1.8B}, [x0], x2 | |
357 | sqxtun v5.8B, v29.8H | |
358 | st1 {v2.8B}, [x0], x2 | |
359 | sqxtun v6.8B, v30.8H | |
360 | st1 {v3.8B}, [x0], x2 | |
361 | sqxtun v7.8B, v31.8H | |
362 | st1 {v4.8B}, [x0], x2 | |
363 | st1 {v5.8B}, [x0], x2 | |
364 | st1 {v6.8B}, [x0], x2 | |
365 | st1 {v7.8B}, [x0], x2 | |
366 | ret | |
367 | endfunc | |
368 | ||
369 | function ff_h264_idct8_add4_neon, export=1 | |
370 | mov x12, x30 | |
371 | mov x6, x0 | |
372 | mov x5, x1 | |
373 | mov x1, x2 | |
374 | mov w2, w3 | |
375 | movrel x7, scan8 | |
376 | mov w10, #16 | |
377 | movrel x13, X(ff_h264_idct8_dc_add_neon) | |
378 | movrel x14, X(ff_h264_idct8_add_neon) | |
379 | 1: ldrb w9, [x7], #4 | |
380 | ldrsw x0, [x5], #16 | |
381 | ldrb w9, [x4, w9, UXTW] | |
382 | subs w9, w9, #1 | |
383 | b.lt 2f | |
384 | ldrsh w11, [x1] | |
385 | add x0, x6, x0 | |
386 | ccmp w11, #0, #4, eq | |
387 | csel x15, x13, x14, ne | |
388 | blr x15 | |
389 | 2: subs w10, w10, #4 | |
390 | add x1, x1, #128 | |
391 | b.ne 1b | |
392 | ret x12 | |
393 | endfunc | |
394 | ||
395 | const scan8 | |
396 | .byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8 | |
397 | .byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8 | |
398 | .byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8 | |
399 | .byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8 | |
400 | .byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8 | |
401 | .byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8 | |
402 | .byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8 | |
403 | .byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8 | |
404 | .byte 4+11*8, 5+11*8, 4+12*8, 5+12*8 | |
405 | .byte 6+11*8, 7+11*8, 6+12*8, 7+12*8 | |
406 | .byte 4+13*8, 5+13*8, 4+14*8, 5+14*8 | |
407 | .byte 6+13*8, 7+13*8, 6+14*8, 7+14*8 | |
408 | endconst |