Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | |
3 | * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net> | |
4 | * | |
5 | * This file is part of FFmpeg. | |
6 | * | |
7 | * FFmpeg is free software; you can redistribute it and/or | |
8 | * modify it under the terms of the GNU Lesser General Public | |
9 | * License as published by the Free Software Foundation; either | |
10 | * version 2.1 of the License, or (at your option) any later version. | |
11 | * | |
12 | * FFmpeg is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | * Lesser General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU Lesser General Public | |
18 | * License along with FFmpeg; if not, write to the Free Software | |
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | */ | |
21 | ||
22 | #include "libavutil/aarch64/asm.S" | |
23 | #include "neon.S" | |
24 | ||
25 | /* H.264 qpel MC */ | |
26 | ||
27 | .macro lowpass_const r /* loads the two filter multipliers into v6 (H[0] = 5, H[1] = 20); the general-purpose register passed in is clobbered */ | |
28 | movz \r, #20, lsl #16 /* upper halfword = 20, lower halfword = 0 */ | |
29 | movk \r, #5 /* lower halfword = 5, giving 0x00140005 */ | |
30 | mov v6.S[0], \r /* low halfword lands in v6.H[0] (5), high halfword in v6.H[1] (20) */ | |
31 | .endm | |
32 | ||
33 | // lowpass_8: 6-tap sum p0 - 5*p1 + 20*p2 + 20*p3 - 5*p4 + p5 for two rows of 8 pixels; expects v6 as set by lowpass_const; trashes v0-v5 | |
34 | .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1 /* r0:r1 and r2:r3 are the two rows, each as two 8-byte halves; d0/d1 receive the results; narrow=1 also rounds, shifts and saturates to bytes */ | |
35 | ext v2.8B, \r0\().8B, \r1\().8B, #2 /* bytes 2..9 of the first row */ | |
36 | ext v3.8B, \r0\().8B, \r1\().8B, #3 /* bytes 3..10 */ | |
37 | uaddl v2.8H, v2.8B, v3.8B /* v2 = p2 + p3, widened to 16 bits */ | |
38 | ext v4.8B, \r0\().8B, \r1\().8B, #1 | |
39 | ext v5.8B, \r0\().8B, \r1\().8B, #4 | |
40 | uaddl v4.8H, v4.8B, v5.8B /* v4 = p1 + p4 */ | |
41 | ext v1.8B, \r0\().8B, \r1\().8B, #5 /* last read of the first row's registers, so d0 may alias r0 or r1 */ | |
42 | uaddl \d0\().8H, \r0\().8B, v1.8B /* d0 = p0 + p5 */ | |
43 | ext v0.8B, \r2\().8B, \r3\().8B, #2 /* second row starts here, interleaved with the first row's arithmetic */ | |
44 | mla \d0\().8H, v2.8H, v6.H[1] /* d0 += 20 * (p2 + p3) */ | |
45 | ext v1.8B, \r2\().8B, \r3\().8B, #3 | |
46 | uaddl v0.8H, v0.8B, v1.8B /* v0 = q2 + q3 */ | |
47 | ext v1.8B, \r2\().8B, \r3\().8B, #1 | |
48 | mls \d0\().8H, v4.8H, v6.H[0] /* d0 -= 5 * (p1 + p4) */ | |
49 | ext v3.8B, \r2\().8B, \r3\().8B, #4 | |
50 | uaddl v1.8H, v1.8B, v3.8B /* v1 = q1 + q4 */ | |
51 | ext v2.8B, \r2\().8B, \r3\().8B, #5 | |
52 | uaddl \d1\().8H, \r2\().8B, v2.8B /* d1 = q0 + q5 */ | |
53 | mla \d1\().8H, v0.8H, v6.H[1] /* d1 += 20 * (q2 + q3) */ | |
54 | mls \d1\().8H, v1.8H, v6.H[0] /* d1 -= 5 * (q1 + q4) */ | |
55 | .if \narrow | |
56 | sqrshrun \d0\().8B, \d0\().8H, #5 /* rounding shift right by 5, saturated to unsigned 8 bits */ | |
57 | sqrshrun \d1\().8B, \d1\().8H, #5 | |
58 | .endif | |
59 | .endm | |
60 | ||
61 | // lowpass_8H: same 6-tap sums as lowpass_8, but each row is one 16-byte register and the 16-bit result overwrites it with no rounding or narrowing; trashes v0-v5, v7, v30-v31 | |
62 | .macro lowpass_8H r0, r1 | |
63 | ext v0.16B, \r0\().16B, \r0\().16B, #2 /* rotate the row by 2 bytes; only the low 8 bytes are consumed below */ | |
64 | ext v1.16B, \r0\().16B, \r0\().16B, #3 | |
65 | uaddl v0.8H, v0.8B, v1.8B /* v0 = p2 + p3 */ | |
66 | ext v2.16B, \r0\().16B, \r0\().16B, #1 | |
67 | ext v3.16B, \r0\().16B, \r0\().16B, #4 | |
68 | uaddl v2.8H, v2.8B, v3.8B /* v2 = p1 + p4 */ | |
69 | ext v30.16B, \r0\().16B, \r0\().16B, #5 | |
70 | uaddl \r0\().8H, \r0\().8B, v30.8B /* r0 = p0 + p5; the source bytes are replaced from here on */ | |
71 | ext v4.16B, \r1\().16B, \r1\().16B, #2 /* second row starts, interleaved with the first */ | |
72 | mla \r0\().8H, v0.8H, v6.H[1] /* r0 += 20 * (p2 + p3) */ | |
73 | ext v5.16B, \r1\().16B, \r1\().16B, #3 | |
74 | uaddl v4.8H, v4.8B, v5.8B /* v4 = q2 + q3 */ | |
75 | ext v7.16B, \r1\().16B, \r1\().16B, #1 | |
76 | mls \r0\().8H, v2.8H, v6.H[0] /* r0 -= 5 * (p1 + p4) */ | |
77 | ext v0.16B, \r1\().16B, \r1\().16B, #4 | |
78 | uaddl v7.8H, v7.8B, v0.8B /* v7 = q1 + q4 */ | |
79 | ext v31.16B, \r1\().16B, \r1\().16B, #5 | |
80 | uaddl \r1\().8H, \r1\().8B, v31.8B /* r1 = q0 + q5 */ | |
81 | mla \r1\().8H, v4.8H, v6.H[1] /* r1 += 20 * (q2 + q3) */ | |
82 | mls \r1\().8H, v7.8H, v6.H[0] /* r1 -= 5 * (q1 + q4) */ | |
83 | .endm | |
84 | ||
85 | // lowpass_8_1: single-row form of lowpass_8 (row in r0:r1, result in d0); not invoked in the visible part of this file; trashes v2-v5, v30 | |
86 | .macro lowpass_8_1 r0, r1, d0, narrow=1 | |
87 | ext v2.8B, \r0\().8B, \r1\().8B, #2 | |
88 | ext v3.8B, \r0\().8B, \r1\().8B, #3 | |
89 | uaddl v2.8H, v2.8B, v3.8B /* v2 = p2 + p3 */ | |
90 | ext v4.8B, \r0\().8B, \r1\().8B, #1 | |
91 | ext v5.8B, \r0\().8B, \r1\().8B, #4 | |
92 | uaddl v4.8H, v4.8B, v5.8B /* v4 = p1 + p4 */ | |
93 | ext v30.8B, \r0\().8B, \r1\().8B, #5 | |
94 | uaddl \d0\().8H, \r0\().8B, v30.8B /* d0 = p0 + p5 */ | |
95 | mla \d0\().8H, v2.8H, v6.H[1] /* d0 += v6.H[1] * (p2 + p3); 20 when v6 comes from lowpass_const */ | |
96 | mls \d0\().8H, v4.8H, v6.H[0] /* d0 -= v6.H[0] * (p1 + p4); 5 when v6 comes from lowpass_const */ | |
97 | .if \narrow | |
98 | sqrshrun \d0\().8B, \d0\().8H, #5 /* rounding shift right by 5, saturated to unsigned 8 bits */ | |
99 | .endif | |
100 | .endm | |
101 | ||
102 | // lowpass_8.16: 6-tap filter over signed 16-bit inputs with 32-bit intermediates, 8 outputs narrowed to bytes in r2; trashes v0-v7 (including the multipliers in v6) and overwrites r1 | |
103 | .macro lowpass_8.16 r0, r1, r2 /* r0 holds elements 0..7, r1 the elements that follow; r2 may be the same register as r0 */ | |
104 | ext v1.16B, \r0\().16B, \r1\().16B, #4 /* byte offset 4 = elements 2..9 */ | |
105 | ext v0.16B, \r0\().16B, \r1\().16B, #6 /* elements 3..10 */ | |
106 | saddl v5.4S, v1.4H, v0.4H /* v5 = e2 + e3, low four lanes */ | |
107 | ext v2.16B, \r0\().16B, \r1\().16B, #2 /* elements 1..8 */ | |
108 | saddl2 v1.4S, v1.8H, v0.8H /* v1 = e2 + e3, high four lanes */ | |
109 | ext v3.16B, \r0\().16B, \r1\().16B, #8 /* elements 4..11 */ | |
110 | saddl v6.4S, v2.4H, v3.4H /* v6 = e1 + e4, low lanes; the filter constants in v6 are lost here */ | |
111 | ext \r1\().16B, \r0\().16B, \r1\().16B, #10 /* elements 5..12, written over r1 */ | |
112 | saddl2 v2.4S, v2.8H, v3.8H /* v2 = e1 + e4, high lanes */ | |
113 | saddl v0.4S, \r0\().4H, \r1\().4H /* v0 = e0 + e5, low lanes */ | |
114 | saddl2 v4.4S, \r0\().8H, \r1\().8H /* v4 = e0 + e5, high lanes */ | |
115 | ||
116 | shl v3.4S, v5.4S, #4 /* 16 * (e2 + e3) */ | |
117 | shl v5.4S, v5.4S, #2 /* 4 * (e2 + e3) */ | |
118 | shl v7.4S, v6.4S, #2 /* 4 * (e1 + e4) */ | |
119 | add v5.4S, v5.4S, v3.4S /* v5 = 20 * (e2 + e3) */ | |
120 | add v6.4S, v6.4S, v7.4S /* v6 = 5 * (e1 + e4) */ | |
121 | ||
122 | shl v3.4S, v1.4S, #4 /* same scaling for the high lanes */ | |
123 | shl v1.4S, v1.4S, #2 | |
124 | shl v7.4S, v2.4S, #2 | |
125 | add v1.4S, v1.4S, v3.4S /* v1 = 20 * (e2 + e3) */ | |
126 | add v2.4S, v2.4S, v7.4S /* v2 = 5 * (e1 + e4) */ | |
127 | ||
128 | add v5.4S, v5.4S, v0.4S /* low lanes: 20*(e2+e3) + (e0+e5) ... */ | |
129 | sub v5.4S, v5.4S, v6.4S /* ... - 5*(e1+e4) */ | |
130 | ||
131 | add v1.4S, v1.4S, v4.4S /* high lanes, same combination */ | |
132 | sub v1.4S, v1.4S, v2.4S | |
133 | ||
134 | rshrn v5.4H, v5.4S, #10 /* rounding shift right by 10, narrowed to 16 bits */ | |
135 | rshrn2 v5.8H, v1.4S, #10 | |
136 | ||
137 | sqxtun \r2\().8B, v5.8H /* saturate the signed 16-bit values to unsigned bytes */ | |
138 | .endm | |
139 | ||
140 | function put_h264_qpel16_h_lowpass_neon_packed /* horizontal filter of a 16-wide, 16-row block as two 8-wide halves; both halves are written back to back through x0 with an 8-byte row pitch. In: x0 dst, x1 src, x2 src stride. Clobbers x3, x4, x12 */ | |
141 | mov x4, x30 /* the bl below overwrites x30, so keep the return address in x4 */ | |
142 | mov x12, #16 /* row counter consumed by the 8-wide routine */ | |
143 | mov x3, #8 /* destination stride: 8 bytes per row */ | |
144 | bl put_h264_qpel8_h_lowpass_neon /* left half */ | |
145 | sub x1, x1, x2, lsl #4 /* rewind the source by the 16 rows just read ... */ | |
146 | add x1, x1, #8 /* ... and step 8 pixels to the right; x0 is left where the first pass ended */ | |
147 | mov x12, #16 | |
148 | mov x30, x4 /* restore the return address so the tail call returns to our caller */ | |
149 | b put_h264_qpel8_h_lowpass_neon /* right half, tail call */ | |
150 | endfunc | |
151 | ||
152 | .macro h264_qpel_h_lowpass type /* emits the 16-wide and 8-wide horizontal filter routines; instantiated below for put and avg */ | |
153 | function \type\()_h264_qpel16_h_lowpass_neon /* in: x0 dst, x1 src, x2 src stride, x3 dst stride; v6 must hold the lowpass_const values. Clobbers x12, x13 */ | |
154 | mov x13, x30 /* save the return address across the bl */ | |
155 | mov x12, #16 /* 16 rows */ | |
156 | bl \type\()_h264_qpel8_h_lowpass_neon /* left 8 columns */ | |
157 | sub x0, x0, x3, lsl #4 /* rewind dst by 16 rows */ | |
158 | sub x1, x1, x2, lsl #4 /* rewind src by 16 rows */ | |
159 | add x0, x0, #8 /* move both pointers 8 pixels to the right */ | |
160 | add x1, x1, #8 | |
161 | mov x12, #16 | |
162 | mov x30, x13 /* no branch follows: execution continues into the 8-wide routine below. NOTE(review): relies on endfunc/function from asm.S emitting no instructions in between - confirm */ | |
163 | endfunc | |
164 | ||
165 | function \type\()_h264_qpel8_h_lowpass_neon /* in: x0 dst, x1 src, x2 src stride, x3 dst stride, x12 row count; two rows per iteration, so x12 is expected to be even and non-zero */ | |
166 | 1: ld1 {v28.8B, v29.8B}, [x1], x2 /* 16 source bytes of one row, then advance by the stride */ | |
167 | ld1 {v16.8B, v17.8B}, [x1], x2 /* next row */ | |
168 | subs x12, x12, #2 /* sets the flags tested by b.ne at the bottom; nothing in between changes them */ | |
169 | lowpass_8 v28, v29, v16, v17, v28, v16 /* filtered bytes in v28 (first row) and v16 (second row) */ | |
170 | .ifc \type,avg | |
171 | ld1 {v2.8B}, [x0], x3 /* current dst row 0 */ | |
172 | urhadd v28.8B, v28.8B, v2.8B /* rounding average with the existing destination */ | |
173 | ld1 {v3.8B}, [x0] /* current dst row 1, pointer not advanced */ | |
174 | urhadd v16.8B, v16.8B, v3.8B | |
175 | sub x0, x0, x3 /* back to row 0 for the stores */ | |
176 | .endif | |
177 | st1 {v28.8B}, [x0], x3 | |
178 | st1 {v16.8B}, [x0], x3 | |
179 | b.ne 1b /* loop until the row counter reaches zero */ | |
180 | ret | |
181 | endfunc | |
182 | .endm | |
183 | ||
184 | h264_qpel_h_lowpass put | |
185 | h264_qpel_h_lowpass avg | |
186 | ||
187 | .macro h264_qpel_h_lowpass_l2 type /* horizontal filter whose result is averaged with a second source; instantiated below for put and avg */ | |
188 | function \type\()_h264_qpel16_h_lowpass_l2_neon /* in: x0 dst, x1 src, x3 second source, x2 stride shared by all three; v6 must hold the lowpass_const values. Clobbers x12, x13 */ | |
189 | mov x13, x30 /* save the return address across the bl */ | |
190 | mov x12, #16 /* 16 rows */ | |
191 | bl \type\()_h264_qpel8_h_lowpass_l2_neon /* left 8 columns */ | |
192 | sub x0, x0, x2, lsl #4 /* rewind all three pointers by 16 rows ... */ | |
193 | sub x1, x1, x2, lsl #4 | |
194 | sub x3, x3, x2, lsl #4 | |
195 | add x0, x0, #8 /* ... and move them 8 pixels to the right */ | |
196 | add x1, x1, #8 | |
197 | add x3, x3, #8 | |
198 | mov x12, #16 | |
199 | mov x30, x13 /* no branch follows: execution continues into the 8-wide routine below. NOTE(review): relies on endfunc/function from asm.S emitting no instructions in between - confirm */ | |
200 | endfunc | |
201 | ||
202 | function \type\()_h264_qpel8_h_lowpass_l2_neon /* in: x0 dst, x1 src, x3 second source, x2 stride, x12 row count (even, non-zero) */ | |
203 | 1: ld1 {v26.8B, v27.8B}, [x1], x2 /* 16 source bytes of one row */ | |
204 | ld1 {v16.8B, v17.8B}, [x1], x2 /* next row */ | |
205 | ld1 {v28.8B}, [x3], x2 /* matching two rows of the second source */ | |
206 | ld1 {v29.8B}, [x3], x2 | |
207 | subs x12, x12, #2 /* sets the flags tested by b.ne at the bottom */ | |
208 | lowpass_8 v26, v27, v16, v17, v26, v27 /* filtered bytes in v26 and v27 */ | |
209 | urhadd v26.8B, v26.8B, v28.8B /* rounding average with the second source */ | |
210 | urhadd v27.8B, v27.8B, v29.8B | |
211 | .ifc \type,avg | |
212 | ld1 {v2.8B}, [x0], x2 /* current dst row 0 */ | |
213 | urhadd v26.8B, v26.8B, v2.8B /* second average, with the existing destination */ | |
214 | ld1 {v3.8B}, [x0] /* current dst row 1, pointer not advanced */ | |
215 | urhadd v27.8B, v27.8B, v3.8B | |
216 | sub x0, x0, x2 /* back to row 0 for the stores */ | |
217 | .endif | |
218 | st1 {v26.8B}, [x0], x2 | |
219 | st1 {v27.8B}, [x0], x2 | |
220 | b.ne 1b /* loop until the row counter reaches zero */ | |
221 | ret | |
222 | endfunc | |
223 | .endm | |
224 | ||
225 | h264_qpel_h_lowpass_l2 put | |
226 | h264_qpel_h_lowpass_l2 avg | |
227 | ||
228 | function put_h264_qpel16_v_lowpass_neon_packed /* vertical filter of a 16x16 block as four 8x8 calls; x0 is never rewound, so the four tiles are stored back to back with an 8-byte pitch. In: x0 dst, x1 src, x3 src stride. Clobbers x2, x4 */ | |
229 | mov x4, x30 /* keep the return address; the bl calls overwrite x30 */ | |
230 | mov x2, #8 /* destination stride: 8 bytes per row */ | |
231 | bl put_h264_qpel8_v_lowpass_neon /* rows 0-7, left half */ | |
232 | sub x1, x1, x3, lsl #2 /* the 8x8 routine advances x1 by 12 rows; step back 4 to start 8 rows below the previous tile */ | |
233 | bl put_h264_qpel8_v_lowpass_neon /* rows 8-15, left half */ | |
234 | sub x1, x1, x3, lsl #4 /* x1 is now 20 rows past the start: rewind 16 ... */ | |
235 | sub x1, x1, x3, lsl #2 /* ... plus 4 rows ... */ | |
236 | add x1, x1, #8 /* ... and move 8 pixels to the right */ | |
237 | bl put_h264_qpel8_v_lowpass_neon /* rows 0-7, right half */ | |
238 | sub x1, x1, x3, lsl #2 | |
239 | mov x30, x4 /* restore the return address for the tail call */ | |
240 | b put_h264_qpel8_v_lowpass_neon /* rows 8-15, right half */ | |
241 | endfunc | |
242 | ||
243 | .macro h264_qpel_v_lowpass type /* emits the 16x16 and 8x8 vertical filter routines; instantiated below for put and avg */ | |
244 | function \type\()_h264_qpel16_v_lowpass_neon /* in: x0 dst, x1 first of the source rows to read, x2 dst stride, x3 src stride; v6 must hold the lowpass_const values. Clobbers x4 */ | |
245 | mov x4, x30 /* keep the return address; the bl calls overwrite x30 */ | |
246 | bl \type\()_h264_qpel8_v_lowpass_neon /* rows 0-7, left half */ | |
247 | sub x1, x1, x3, lsl #2 /* the 8x8 routine advances x1 by 12 rows; step back 4 */ | |
248 | bl \type\()_h264_qpel8_v_lowpass_neon /* rows 8-15, left half */ | |
249 | sub x0, x0, x2, lsl #4 /* dst back to the top ... */ | |
250 | add x0, x0, #8 /* ... and 8 pixels to the right */ | |
251 | sub x1, x1, x3, lsl #4 /* src: rewind 16 + 4 rows ... */ | |
252 | sub x1, x1, x3, lsl #2 | |
253 | add x1, x1, #8 /* ... and 8 pixels to the right */ | |
254 | bl \type\()_h264_qpel8_v_lowpass_neon /* rows 0-7, right half */ | |
255 | sub x1, x1, x3, lsl #2 | |
256 | mov x30, x4 /* no branch follows: execution continues into the 8x8 routine below for the last tile. NOTE(review): relies on endfunc/function from asm.S emitting no instructions in between - confirm */ | |
257 | endfunc | |
258 | ||
259 | function \type\()_h264_qpel8_v_lowpass_neon /* in: x0 dst, x1 src, x2 dst stride, x3 src stride; reads 13 rows of 8 bytes and writes 8 rows of 8 bytes */ | |
260 | ld1 {v16.8B}, [x1], x3 /* rows 0..7 go to the even registers v16..v30 */ | |
261 | ld1 {v18.8B}, [x1], x3 | |
262 | ld1 {v20.8B}, [x1], x3 | |
263 | ld1 {v22.8B}, [x1], x3 | |
264 | ld1 {v24.8B}, [x1], x3 | |
265 | ld1 {v26.8B}, [x1], x3 | |
266 | ld1 {v28.8B}, [x1], x3 | |
267 | ld1 {v30.8B}, [x1], x3 | |
268 | ld1 {v17.8B}, [x1], x3 /* rows 8..12 go to the odd registers v17..v25 */ | |
269 | ld1 {v19.8B}, [x1], x3 | |
270 | ld1 {v21.8B}, [x1], x3 | |
271 | ld1 {v23.8B}, [x1], x3 | |
272 | ld1 {v25.8B}, [x1] /* last row, no post-increment: x1 ends 12 rows past its entry value */ | |
273 | ||
274 | transpose_8x8B v16, v18, v20, v22, v24, v26, v28, v30, v0, v1 /* transpose_8x8B is defined outside this file (neon.S); presumably an 8x8 byte transpose with v0/v1 as temporaries - confirm */ | |
275 | transpose_8x8B v17, v19, v21, v23, v25, v27, v29, v31, v0, v1 /* v27, v29 and v31 were not loaded; NOTE(review): their contents should only reach bytes the filter never reads - confirm */ | |
276 | lowpass_8 v16, v17, v18, v19, v16, v17 /* after the transpose each register pair is one column, so the row filter works vertically */ | |
277 | lowpass_8 v20, v21, v22, v23, v18, v19 | |
278 | lowpass_8 v24, v25, v26, v27, v20, v21 | |
279 | lowpass_8 v28, v29, v30, v31, v22, v23 | |
280 | transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 /* back to row order in v16..v23 */ | |
281 | ||
282 | .ifc \type,avg | |
283 | ld1 {v24.8B}, [x0], x2 /* load the 8 existing dst rows and take the rounding average with each result row */ | |
284 | urhadd v16.8B, v16.8B, v24.8B | |
285 | ld1 {v25.8B}, [x0], x2 | |
286 | urhadd v17.8B, v17.8B, v25.8B | |
287 | ld1 {v26.8B}, [x0], x2 | |
288 | urhadd v18.8B, v18.8B, v26.8B | |
289 | ld1 {v27.8B}, [x0], x2 | |
290 | urhadd v19.8B, v19.8B, v27.8B | |
291 | ld1 {v28.8B}, [x0], x2 | |
292 | urhadd v20.8B, v20.8B, v28.8B | |
293 | ld1 {v29.8B}, [x0], x2 | |
294 | urhadd v21.8B, v21.8B, v29.8B | |
295 | ld1 {v30.8B}, [x0], x2 | |
296 | urhadd v22.8B, v22.8B, v30.8B | |
297 | ld1 {v31.8B}, [x0], x2 | |
298 | urhadd v23.8B, v23.8B, v31.8B | |
299 | sub x0, x0, x2, lsl #3 /* rewind dst by the 8 rows just read */ | |
300 | .endif | |
301 | ||
302 | st1 {v16.8B}, [x0], x2 /* store 8 rows; x0 ends 8 rows below its entry value */ | |
303 | st1 {v17.8B}, [x0], x2 | |
304 | st1 {v18.8B}, [x0], x2 | |
305 | st1 {v19.8B}, [x0], x2 | |
306 | st1 {v20.8B}, [x0], x2 | |
307 | st1 {v21.8B}, [x0], x2 | |
308 | st1 {v22.8B}, [x0], x2 | |
309 | st1 {v23.8B}, [x0], x2 | |
310 | ||
311 | ret | |
312 | endfunc | |
313 | .endm | |
314 | ||
315 | h264_qpel_v_lowpass put | |
316 | h264_qpel_v_lowpass avg | |
317 | ||
318 | .macro h264_qpel_v_lowpass_l2 type /* vertical filter whose result is averaged with a second source; instantiated below for put and avg */ | |
319 | function \type\()_h264_qpel16_v_lowpass_l2_neon /* in: x0 dst, x1 src, x3 stride of both src and dst, x12 second source, x2 stride of the second source; v6 must hold the lowpass_const values. Clobbers x4 */ | |
320 | mov x4, x30 /* keep the return address; the bl calls overwrite x30 */ | |
321 | bl \type\()_h264_qpel8_v_lowpass_l2_neon /* rows 0-7, left half */ | |
322 | sub x1, x1, x3, lsl #2 /* the 8x8 routine advances x1 by 12 rows; step back 4 */ | |
323 | bl \type\()_h264_qpel8_v_lowpass_l2_neon /* rows 8-15, left half */ | |
324 | sub x0, x0, x3, lsl #4 /* dst back to the top (dst is stepped by x3 here) */ | |
325 | sub x12, x12, x2, lsl #4 /* second source back to the top (stepped by x2) */ | |
326 | add x0, x0, #8 /* both 8 pixels to the right */ | |
327 | add x12, x12, #8 | |
328 | sub x1, x1, x3, lsl #4 /* src: rewind 16 + 4 rows ... */ | |
329 | sub x1, x1, x3, lsl #2 | |
330 | add x1, x1, #8 /* ... and 8 pixels to the right */ | |
331 | bl \type\()_h264_qpel8_v_lowpass_l2_neon /* rows 0-7, right half */ | |
332 | sub x1, x1, x3, lsl #2 | |
333 | mov x30, x4 /* no branch follows: execution continues into the 8x8 routine below for the last tile. NOTE(review): relies on endfunc/function from asm.S emitting no instructions in between - confirm */ | |
334 | endfunc | |
335 | ||
336 | function \type\()_h264_qpel8_v_lowpass_l2_neon /* in: x0 dst, x1 src, x3 src and dst stride, x12 second source, x2 its stride; reads 13 source rows, writes 8 rows */ | |
337 | ld1 {v16.8B}, [x1], x3 /* rows 0..7 go to the even registers v16..v30 */ | |
338 | ld1 {v18.8B}, [x1], x3 | |
339 | ld1 {v20.8B}, [x1], x3 | |
340 | ld1 {v22.8B}, [x1], x3 | |
341 | ld1 {v24.8B}, [x1], x3 | |
342 | ld1 {v26.8B}, [x1], x3 | |
343 | ld1 {v28.8B}, [x1], x3 | |
344 | ld1 {v30.8B}, [x1], x3 | |
345 | ld1 {v17.8B}, [x1], x3 /* rows 8..12 go to the odd registers v17..v25 */ | |
346 | ld1 {v19.8B}, [x1], x3 | |
347 | ld1 {v21.8B}, [x1], x3 | |
348 | ld1 {v23.8B}, [x1], x3 | |
349 | ld1 {v25.8B}, [x1] /* last row, no post-increment: x1 ends 12 rows past its entry value */ | |
350 | ||
351 | transpose_8x8B v16, v18, v20, v22, v24, v26, v28, v30, v0, v1 /* transpose_8x8B is defined outside this file (neon.S); presumably an 8x8 byte transpose with v0/v1 as temporaries - confirm */ | |
352 | transpose_8x8B v17, v19, v21, v23, v25, v27, v29, v31, v0, v1 /* v27, v29 and v31 were not loaded; NOTE(review): their contents should only reach bytes the filter never reads - confirm */ | |
353 | lowpass_8 v16, v17, v18, v19, v16, v17 /* after the transpose each register pair is one column, so the row filter works vertically */ | |
354 | lowpass_8 v20, v21, v22, v23, v18, v19 | |
355 | lowpass_8 v24, v25, v26, v27, v20, v21 | |
356 | lowpass_8 v28, v29, v30, v31, v22, v23 | |
357 | transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 /* back to row order in v16..v23 */ | |
358 | ||
359 | ld1 {v24.8B}, [x12], x2 /* 8 rows of the second source, loads interleaved with the averaging */ | |
360 | ld1 {v25.8B}, [x12], x2 | |
361 | ld1 {v26.8B}, [x12], x2 | |
362 | ld1 {v27.8B}, [x12], x2 | |
363 | ld1 {v28.8B}, [x12], x2 | |
364 | urhadd v16.8B, v24.8B, v16.8B /* rounding average of filter output and second source */ | |
365 | urhadd v17.8B, v25.8B, v17.8B | |
366 | ld1 {v29.8B}, [x12], x2 | |
367 | urhadd v18.8B, v26.8B, v18.8B | |
368 | urhadd v19.8B, v27.8B, v19.8B | |
369 | ld1 {v30.8B}, [x12], x2 | |
370 | urhadd v20.8B, v28.8B, v20.8B | |
371 | urhadd v21.8B, v29.8B, v21.8B | |
372 | ld1 {v31.8B}, [x12], x2 | |
373 | urhadd v22.8B, v30.8B, v22.8B | |
374 | urhadd v23.8B, v31.8B, v23.8B | |
375 | ||
376 | .ifc \type,avg | |
377 | ld1 {v24.8B}, [x0], x3 /* load the 8 existing dst rows and average once more */ | |
378 | urhadd v16.8B, v16.8B, v24.8B | |
379 | ld1 {v25.8B}, [x0], x3 | |
380 | urhadd v17.8B, v17.8B, v25.8B | |
381 | ld1 {v26.8B}, [x0], x3 | |
382 | urhadd v18.8B, v18.8B, v26.8B | |
383 | ld1 {v27.8B}, [x0], x3 | |
384 | urhadd v19.8B, v19.8B, v27.8B | |
385 | ld1 {v28.8B}, [x0], x3 | |
386 | urhadd v20.8B, v20.8B, v28.8B | |
387 | ld1 {v29.8B}, [x0], x3 | |
388 | urhadd v21.8B, v21.8B, v29.8B | |
389 | ld1 {v30.8B}, [x0], x3 | |
390 | urhadd v22.8B, v22.8B, v30.8B | |
391 | ld1 {v31.8B}, [x0], x3 | |
392 | urhadd v23.8B, v23.8B, v31.8B | |
393 | sub x0, x0, x3, lsl #3 /* rewind dst by the 8 rows just read */ | |
394 | .endif | |
395 | ||
396 | st1 {v16.8B}, [x0], x3 /* store 8 rows; x0 ends 8 rows below its entry value */ | |
397 | st1 {v17.8B}, [x0], x3 | |
398 | st1 {v18.8B}, [x0], x3 | |
399 | st1 {v19.8B}, [x0], x3 | |
400 | st1 {v20.8B}, [x0], x3 | |
401 | st1 {v21.8B}, [x0], x3 | |
402 | st1 {v22.8B}, [x0], x3 | |
403 | st1 {v23.8B}, [x0], x3 | |
404 | ||
405 | ret | |
406 | endfunc | |
407 | .endm | |
408 | ||
409 | h264_qpel_v_lowpass_l2 put | |
410 | h264_qpel_v_lowpass_l2 avg | |
411 | ||
412 | function put_h264_qpel8_hv_lowpass_neon_top /* shared core of the 8x8 two-dimensional filter: horizontal pass kept at 16 bits, then vertical pass; result rows are left in v16..v23 as bytes, nothing is stored. In: x1 src, x3 src stride. Clobbers x12 and v0-v7, v16-v31 */ | |
413 | lowpass_const w12 /* v6 is reloaded on every call because lowpass_8.16 below overwrites it */ | |
414 | ld1 {v16.8H}, [x1], x3 /* 13 rows of 16 bytes each into v16..v28 */ | |
415 | ld1 {v17.8H}, [x1], x3 | |
416 | ld1 {v18.8H}, [x1], x3 | |
417 | ld1 {v19.8H}, [x1], x3 | |
418 | ld1 {v20.8H}, [x1], x3 | |
419 | ld1 {v21.8H}, [x1], x3 | |
420 | ld1 {v22.8H}, [x1], x3 | |
421 | ld1 {v23.8H}, [x1], x3 | |
422 | ld1 {v24.8H}, [x1], x3 | |
423 | ld1 {v25.8H}, [x1], x3 | |
424 | ld1 {v26.8H}, [x1], x3 | |
425 | ld1 {v27.8H}, [x1], x3 | |
426 | ld1 {v28.8H}, [x1] /* last row, no post-increment: x1 ends 12 rows past its entry value */ | |
427 | lowpass_8H v16, v17 /* horizontal pass, two rows at a time, 16-bit results in place */ | |
428 | lowpass_8H v18, v19 | |
429 | lowpass_8H v20, v21 | |
430 | lowpass_8H v22, v23 | |
431 | lowpass_8H v24, v25 | |
432 | lowpass_8H v26, v27 | |
433 | lowpass_8H v28, v29 /* v29 was never loaded; it is only the macro's second-row operand here */ | |
434 | ||
435 | transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 /* transpose_8x8H is defined outside this file (neon.S); presumably an 8x8 transpose of 16-bit elements with v0/v1 as temporaries - confirm */ | |
436 | transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v0, v1 /* rows 8..12 plus three registers without row data; NOTE(review): those should only reach elements beyond index 12, which the vertical pass does not read - confirm */ | |
437 | ||
438 | lowpass_8.16 v16, v24, v16 /* vertical pass per column: first operand = entries 0..7, second = the entries that follow, byte result over the first */ | |
439 | lowpass_8.16 v17, v25, v17 | |
440 | ||
441 | lowpass_8.16 v18, v26, v18 | |
442 | lowpass_8.16 v19, v27, v19 | |
443 | ||
444 | lowpass_8.16 v20, v28, v20 | |
445 | lowpass_8.16 v21, v29, v21 | |
446 | ||
447 | lowpass_8.16 v22, v30, v22 | |
448 | lowpass_8.16 v23, v31, v23 | |
449 | ||
450 | transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 /* back to row order: v16..v23 hold the 8 output rows */ | |
451 | ||
452 | ret /* returns through x30; the callers in this file keep their own return address in x10 */ | |
453 | endfunc | |
454 | ||
455 | .macro h264_qpel8_hv_lowpass type /* 8x8 two-dimensional filter with store; instantiated below for put and avg */ | |
456 | function \type\()_h264_qpel8_hv_lowpass_neon /* in: x0 dst, x1 src, x2 dst stride, x3 src stride. Clobbers x10, x12 */ | |
457 | mov x10, x30 /* keep the return address; the bl below overwrites x30 */ | |
458 | bl put_h264_qpel8_hv_lowpass_neon_top /* filtered rows arrive in v16..v23 */ | |
459 | .ifc \type,avg | |
460 | ld1 {v0.8B}, [x0], x2 /* load the 8 existing dst rows and take the rounding average with each result row */ | |
461 | urhadd v16.8B, v16.8B, v0.8B | |
462 | ld1 {v1.8B}, [x0], x2 | |
463 | urhadd v17.8B, v17.8B, v1.8B | |
464 | ld1 {v2.8B}, [x0], x2 | |
465 | urhadd v18.8B, v18.8B, v2.8B | |
466 | ld1 {v3.8B}, [x0], x2 | |
467 | urhadd v19.8B, v19.8B, v3.8B | |
468 | ld1 {v4.8B}, [x0], x2 | |
469 | urhadd v20.8B, v20.8B, v4.8B | |
470 | ld1 {v5.8B}, [x0], x2 | |
471 | urhadd v21.8B, v21.8B, v5.8B | |
472 | ld1 {v6.8B}, [x0], x2 | |
473 | urhadd v22.8B, v22.8B, v6.8B | |
474 | ld1 {v7.8B}, [x0], x2 | |
475 | urhadd v23.8B, v23.8B, v7.8B | |
476 | sub x0, x0, x2, lsl #3 /* rewind dst by the 8 rows just read */ | |
477 | .endif | |
478 | ||
479 | st1 {v16.8B}, [x0], x2 /* store 8 rows; x0 ends 8 rows below its entry value */ | |
480 | st1 {v17.8B}, [x0], x2 | |
481 | st1 {v18.8B}, [x0], x2 | |
482 | st1 {v19.8B}, [x0], x2 | |
483 | st1 {v20.8B}, [x0], x2 | |
484 | st1 {v21.8B}, [x0], x2 | |
485 | st1 {v22.8B}, [x0], x2 | |
486 | st1 {v23.8B}, [x0], x2 | |
487 | ||
488 | ret x10 /* return through the address saved on entry */ | |
489 | endfunc | |
490 | .endm | |
491 | ||
492 | h264_qpel8_hv_lowpass put | |
493 | h264_qpel8_hv_lowpass avg | |
494 | ||
495 | .macro h264_qpel8_hv_lowpass_l2 type /* 8x8 two-dimensional filter averaged with a packed second source; instantiated below for put and avg */ | |
496 | function \type\()_h264_qpel8_hv_lowpass_l2_neon /* in: x0 dst, x1 src, x3 stride of both src and dst, x2 second source stored as 8 consecutive rows of 8 bytes. Clobbers x10, x12; x2 advances by 64 */ | |
497 | mov x10, x30 /* keep the return address; the bl below overwrites x30 */ | |
498 | bl put_h264_qpel8_hv_lowpass_neon_top /* filtered rows arrive in v16..v23 */ | |
499 | ||
500 | ld1 {v0.8B, v1.8B}, [x2], #16 /* two packed rows of the second source per load */ | |
501 | ld1 {v2.8B, v3.8B}, [x2], #16 | |
502 | urhadd v0.8B, v0.8B, v16.8B /* rounding average of second source and filter output; results now live in v0..v7 */ | |
503 | urhadd v1.8B, v1.8B, v17.8B | |
504 | ld1 {v4.8B, v5.8B}, [x2], #16 | |
505 | urhadd v2.8B, v2.8B, v18.8B | |
506 | urhadd v3.8B, v3.8B, v19.8B | |
507 | ld1 {v6.8B, v7.8B}, [x2], #16 | |
508 | urhadd v4.8B, v4.8B, v20.8B | |
509 | urhadd v5.8B, v5.8B, v21.8B | |
510 | urhadd v6.8B, v6.8B, v22.8B | |
511 | urhadd v7.8B, v7.8B, v23.8B | |
512 | .ifc \type,avg | |
513 | ld1 {v16.8B}, [x0], x3 /* v16..v23 are free again: load the 8 existing dst rows and average once more */ | |
514 | urhadd v0.8B, v0.8B, v16.8B | |
515 | ld1 {v17.8B}, [x0], x3 | |
516 | urhadd v1.8B, v1.8B, v17.8B | |
517 | ld1 {v18.8B}, [x0], x3 | |
518 | urhadd v2.8B, v2.8B, v18.8B | |
519 | ld1 {v19.8B}, [x0], x3 | |
520 | urhadd v3.8B, v3.8B, v19.8B | |
521 | ld1 {v20.8B}, [x0], x3 | |
522 | urhadd v4.8B, v4.8B, v20.8B | |
523 | ld1 {v21.8B}, [x0], x3 | |
524 | urhadd v5.8B, v5.8B, v21.8B | |
525 | ld1 {v22.8B}, [x0], x3 | |
526 | urhadd v6.8B, v6.8B, v22.8B | |
527 | ld1 {v23.8B}, [x0], x3 | |
528 | urhadd v7.8B, v7.8B, v23.8B | |
529 | sub x0, x0, x3, lsl #3 /* rewind dst by the 8 rows just read */ | |
530 | .endif | |
531 | st1 {v0.8B}, [x0], x3 /* store 8 rows; x0 ends 8 rows below its entry value */ | |
532 | st1 {v1.8B}, [x0], x3 | |
533 | st1 {v2.8B}, [x0], x3 | |
534 | st1 {v3.8B}, [x0], x3 | |
535 | st1 {v4.8B}, [x0], x3 | |
536 | st1 {v5.8B}, [x0], x3 | |
537 | st1 {v6.8B}, [x0], x3 | |
538 | st1 {v7.8B}, [x0], x3 | |
539 | ||
540 | ret x10 /* return through the address saved on entry */ | |
541 | endfunc | |
542 | .endm | |
543 | ||
544 | h264_qpel8_hv_lowpass_l2 put | |
545 | h264_qpel8_hv_lowpass_l2 avg | |
546 | ||
547 | .macro h264_qpel16_hv type /* 16x16 two-dimensional filters built from four 8x8 calls; instantiated below for put and avg */ | |
548 | function \type\()_h264_qpel16_hv_lowpass_neon /* in: x0 dst, x1 src, x2 dst stride, x3 src stride. Clobbers x10, x12, x13 */ | |
549 | mov x13, x30 /* keep the return address; the 8x8 routine uses x10 for its own */ | |
550 | bl \type\()_h264_qpel8_hv_lowpass_neon /* rows 0-7, left half */ | |
551 | sub x1, x1, x3, lsl #2 /* the 8x8 routine advances x1 by 12 rows; step back 4 */ | |
552 | bl \type\()_h264_qpel8_hv_lowpass_neon /* rows 8-15, left half; x0 already points there */ | |
553 | sub x1, x1, x3, lsl #4 /* src: rewind 16 + 4 rows ... */ | |
554 | sub x1, x1, x3, lsl #2 | |
555 | add x1, x1, #8 /* ... and 8 pixels to the right */ | |
556 | sub x0, x0, x2, lsl #4 /* dst: back to the top, 8 pixels to the right */ | |
557 | add x0, x0, #8 | |
558 | bl \type\()_h264_qpel8_hv_lowpass_neon /* rows 0-7, right half */ | |
559 | sub x1, x1, x3, lsl #2 | |
560 | mov x30, x13 /* restore the return address for the tail call */ | |
561 | b \type\()_h264_qpel8_hv_lowpass_neon /* rows 8-15, right half */ | |
562 | endfunc | |
563 | ||
564 | function \type\()_h264_qpel16_hv_lowpass_l2_neon /* in: x0 dst, x1 src, x3 stride of both src and dst, x4 address just past a 256-byte packed second source. Clobbers x2, x10, x12, x13 */ | |
565 | mov x13, x30 /* keep the return address; the 8x8 routine uses x10 for its own */ | |
566 | sub x2, x4, #256 /* x2 = start of the packed second source; any value passed in x2 is discarded */ | |
567 | bl \type\()_h264_qpel8_hv_lowpass_l2_neon /* rows 0-7, left half; consumes 64 bytes from x2 */ | |
568 | sub x1, x1, x3, lsl #2 /* the 8x8 routine advances x1 by 12 rows; step back 4 */ | |
569 | bl \type\()_h264_qpel8_hv_lowpass_l2_neon /* rows 8-15, left half */ | |
570 | sub x1, x1, x3, lsl #4 /* src: rewind 16 + 4 rows ... */ | |
571 | sub x1, x1, x3, lsl #2 | |
572 | add x1, x1, #8 /* ... and 8 pixels to the right */ | |
573 | sub x0, x0, x3, lsl #4 /* dst: back to the top, 8 pixels to the right */ | |
574 | add x0, x0, #8 | |
575 | bl \type\()_h264_qpel8_hv_lowpass_l2_neon /* rows 0-7, right half; x2 keeps advancing through the packed buffer */ | |
576 | sub x1, x1, x3, lsl #2 | |
577 | mov x30, x13 /* restore the return address for the tail call */ | |
578 | b \type\()_h264_qpel8_hv_lowpass_l2_neon /* rows 8-15, right half */ | |
579 | endfunc | |
580 | .endm | |
581 | ||
582 | h264_qpel16_hv put | |
583 | h264_qpel16_hv avg | |
584 | ||
585 | .macro h264_qpel8 type /* emits the exported 8x8 entry points ff_<type>_h264_qpel8_mcXY_neon; each is entered with x0 = dst, x1 = src, x2 = stride, as consumed by the helpers it calls */ | |
586 | function ff_\type\()_h264_qpel8_mc10_neon, export=1 /* horizontal filter averaged with the unfiltered source */ | |
587 | lowpass_const w3 /* w3 is only scratch here; x3 is set on the next line */ | |
588 | mov x3, x1 /* second source for the average = src itself */ | |
589 | sub x1, x1, #2 /* the filter reads from 2 pixels left of the output position */ | |
590 | mov x12, #8 /* 8 rows */ | |
591 | b \type\()_h264_qpel8_h_lowpass_l2_neon /* tail call; returns straight to our caller */ | |
592 | endfunc | |
593 | ||
594 | function ff_\type\()_h264_qpel8_mc20_neon, export=1 /* horizontal filter only */ | |
595 | lowpass_const w3 | |
596 | sub x1, x1, #2 | |
597 | mov x3, x2 /* dst stride = src stride */ | |
598 | mov x12, #8 | |
599 | b \type\()_h264_qpel8_h_lowpass_neon | |
600 | endfunc | |
601 | ||
602 | function ff_\type\()_h264_qpel8_mc30_neon, export=1 /* horizontal filter averaged with the source one pixel to the right */ | |
603 | lowpass_const w3 | |
604 | add x3, x1, #1 /* second source = src + 1 */ | |
605 | sub x1, x1, #2 | |
606 | mov x12, #8 | |
607 | b \type\()_h264_qpel8_h_lowpass_l2_neon | |
608 | endfunc | |
609 | ||
610 | function ff_\type\()_h264_qpel8_mc01_neon, export=1 /* vertical filter averaged with the unfiltered source */ | |
611 | mov x14, x30 /* return address kept in x14; the helper is reached with bl */ | |
612 | mov x12, x1 /* second source = src */ | |
613 | \type\()_h264_qpel8_mc01: /* shared tail, also entered from mc03 with x12 and x14 already set */ | |
614 | lowpass_const w3 | |
615 | mov x3, x2 /* src/dst stride for the helper */ | |
616 | sub x1, x1, x2, lsl #1 /* the filter reads from 2 rows above the output position */ | |
617 | bl \type\()_h264_qpel8_v_lowpass_l2_neon | |
618 | ret x14 | |
619 | endfunc | |
620 | ||
621 | function ff_\type\()_h264_qpel8_mc11_neon, export=1 /* average of a horizontal and a vertical filter result */ | |
622 | mov x14, x30 | |
623 | mov x8, x0 /* x8 = dst, kept while x0 points at the scratch buffer */ | |
624 | mov x9, x1 /* x9 = base for the vertical pass, x1 = base for the horizontal pass */ | |
625 | \type\()_h264_qpel8_mc11: /* shared tail, entered from mc31, mc13 and mc33 with x8, x9, x14 and x1 already set */ | |
626 | lowpass_const w3 | |
627 | mov x11, sp /* remember sp so the scratch buffer can be released below */ | |
628 | sub sp, sp, #64 /* 8x8 byte scratch buffer */ | |
629 | mov x0, sp | |
630 | sub x1, x1, #2 | |
631 | mov x3, #8 /* scratch rows are 8 bytes apart */ | |
632 | mov x12, #8 /* 8 rows */ | |
633 | bl put_h264_qpel8_h_lowpass_neon /* horizontal result into the scratch buffer */ | |
634 | mov x0, x8 /* real destination */ | |
635 | mov x3, x2 /* src/dst stride */ | |
636 | mov x12, sp /* second source = scratch buffer ... */ | |
637 | sub x1, x9, x2, lsl #1 /* vertical pass starts 2 rows above x9 */ | |
638 | mov x2, #8 /* ... read with an 8-byte stride */ | |
639 | bl \type\()_h264_qpel8_v_lowpass_l2_neon | |
640 | mov sp, x11 /* release the scratch buffer */ | |
641 | ret x14 | |
642 | endfunc | |
643 | ||
644 | function ff_\type\()_h264_qpel8_mc21_neon, export=1 /* two-dimensional filter averaged with a horizontal filter result */ | |
645 | mov x14, x30 | |
646 | mov x8, x0 /* x8 = dst */ | |
647 | mov x9, x1 /* x9 = base for the two-dimensional pass, x1 = base for the horizontal pass */ | |
648 | \type\()_h264_qpel8_mc21: /* shared tail, entered from mc23 with x8, x9, x14 and x1 already set */ | |
649 | lowpass_const w3 | |
650 | mov x11, sp | |
651 | sub sp, sp, #(8*8+16*12) /* scratch area; the code below writes only its first 8*8 bytes */ | |
652 | sub x1, x1, #2 | |
653 | mov x3, #8 | |
654 | mov x0, sp | |
655 | mov x12, #8 | |
656 | bl put_h264_qpel8_h_lowpass_neon /* horizontal result, packed 8 bytes per row */ | |
657 | mov x4, x0 /* x0 now points just past the 64 bytes written */ | |
658 | mov x0, x8 | |
659 | sub x1, x9, x2, lsl #1 /* two-dimensional pass starts 2 rows up ... */ | |
660 | sub x1, x1, #2 /* ... and 2 pixels left of x9 */ | |
661 | mov x3, x2 /* src/dst stride */ | |
662 | sub x2, x4, #64 /* second source = start of the packed scratch data */ | |
663 | bl \type\()_h264_qpel8_hv_lowpass_l2_neon | |
664 | mov sp, x11 | |
665 | ret x14 | |
666 | endfunc | |
667 | ||
668 | function ff_\type\()_h264_qpel8_mc31_neon, export=1 /* as mc11, with the vertical pass taken one pixel to the right */ | |
670 | mov x14, x30 | |
671 | mov x8, x0 | |
672 | add x9, x1, #1 /* vertical base = src + 1; x1 stays at src for the horizontal pass */ | |
674 | b \type\()_h264_qpel8_mc11 | |
675 | endfunc | |
676 | ||
677 | function ff_\type\()_h264_qpel8_mc02_neon, export=1 /* vertical filter only */ | |
678 | mov x14, x30 | |
679 | lowpass_const w3 | |
680 | sub x1, x1, x2, lsl #1 /* start 2 rows above the output position */ | |
681 | mov x3, x2 /* src stride; x2 stays as dst stride */ | |
682 | bl \type\()_h264_qpel8_v_lowpass_neon | |
683 | ret x14 | |
684 | endfunc | |
685 | ||
686 | function ff_\type\()_h264_qpel8_mc12_neon, export=1 /* two-dimensional filter averaged with a vertical filter result */ | |
687 | mov x14, x30 | |
688 | mov x8, x0 /* x8 = dst */ | |
689 | mov x9, x1 /* x9 = base for the two-dimensional pass, x1 = base for the vertical pass */ | |
690 | \type\()_h264_qpel8_mc12: /* shared tail, entered from mc32 with x8, x9, x14 and x1 already set */ | |
691 | lowpass_const w3 | |
692 | mov x11, sp | |
693 | sub sp, sp, #(8*8+16*12) /* scratch area; the code below writes only its first 8*8 bytes */ | |
694 | sub x1, x1, x2, lsl #1 | |
695 | mov x3, x2 /* src stride; x3 keeps the stride for the rest of the function */ | |
696 | mov x2, #8 /* scratch rows are 8 bytes apart */ | |
697 | mov x0, sp | |
698 | bl put_h264_qpel8_v_lowpass_neon /* vertical result, packed 8 bytes per row */ | |
699 | mov x4, x0 /* x0 now points just past the 64 bytes written */ | |
700 | mov x0, x8 | |
701 | sub x1, x9, x3, lsl #1 /* two-dimensional pass starts 2 rows up ... */ | |
702 | sub x1, x1, #2 /* ... and 2 pixels left of x9 */ | |
703 | sub x2, x4, #64 /* second source = start of the packed scratch data */ | |
704 | bl \type\()_h264_qpel8_hv_lowpass_l2_neon | |
705 | mov sp, x11 | |
706 | ret x14 | |
707 | endfunc | |
708 | ||
709 | function ff_\type\()_h264_qpel8_mc22_neon, export=1 /* two-dimensional filter only; no stack scratch is used, so sp is not touched */ | |
710 | mov x14, x30 | |
712 | sub x1, x1, x2, lsl #1 /* start 2 rows up ... */ | |
713 | sub x1, x1, #2 /* ... and 2 pixels left of the output position */ | |
714 | mov x3, x2 /* src stride; x2 stays as dst stride */ | |
715 | bl \type\()_h264_qpel8_hv_lowpass_neon /* loads the filter constants itself via the shared top routine */ | |
717 | ret x14 | |
718 | endfunc | |
719 | ||
720 | function ff_\type\()_h264_qpel8_mc32_neon, export=1 /* as mc12, with the vertical pass taken one pixel to the right */ | |
721 | mov x14, x30 | |
722 | mov x8, x0 | |
723 | mov x9, x1 | |
724 | add x1, x1, #1 | |
725 | b \type\()_h264_qpel8_mc12 | |
726 | endfunc | |
727 | ||
728 | function ff_\type\()_h264_qpel8_mc03_neon, export=1 /* as mc01, averaged with the source one row below */ | |
729 | mov x14, x30 | |
730 | add x12, x1, x2 /* second source = src + stride */ | |
731 | b \type\()_h264_qpel8_mc01 | |
732 | endfunc | |
733 | ||
734 | function ff_\type\()_h264_qpel8_mc13_neon, export=1 /* as mc11, with the horizontal pass taken one row below */ | |
735 | mov x14, x30 | |
736 | mov x8, x0 | |
737 | mov x9, x1 | |
738 | add x1, x1, x2 | |
739 | b \type\()_h264_qpel8_mc11 | |
740 | endfunc | |
741 | ||
742 | function ff_\type\()_h264_qpel8_mc23_neon, export=1 /* as mc21, with the horizontal pass taken one row below */ | |
743 | mov x14, x30 | |
744 | mov x8, x0 | |
745 | mov x9, x1 | |
746 | add x1, x1, x2 | |
747 | b \type\()_h264_qpel8_mc21 | |
748 | endfunc | |
749 | ||
750 | function ff_\type\()_h264_qpel8_mc33_neon, export=1 /* as mc11, with the vertical pass one pixel right and the horizontal pass one row below */ | |
752 | mov x14, x30 | |
753 | mov x8, x0 | |
754 | add x9, x1, #1 /* vertical base = src + 1 */ | |
755 | add x1, x1, x2 /* horizontal base = src + stride */ | |
757 | b \type\()_h264_qpel8_mc11 | |
758 | endfunc | |
759 | .endm | |
760 | ||
761 | h264_qpel8 put | |
762 | h264_qpel8 avg | |
763 | ||
764 | .macro h264_qpel16 type /* emits the exported 16x16 entry points ff_<type>_h264_qpel16_mcXY_neon; each is entered with x0 = dst, x1 = src, x2 = stride, as consumed by the helpers it calls */ | |
765 | function ff_\type\()_h264_qpel16_mc10_neon, export=1 /* horizontal filter averaged with the unfiltered source */ | |
766 | lowpass_const w3 /* w3 is only scratch here; x3 is set on the next line */ | |
767 | mov x3, x1 /* second source for the average = src itself */ | |
768 | sub x1, x1, #2 /* the filter reads from 2 pixels left of the output position */ | |
769 | b \type\()_h264_qpel16_h_lowpass_l2_neon /* tail call; the helper sets its own row counter */ | |
770 | endfunc | |
771 | ||
772 | function ff_\type\()_h264_qpel16_mc20_neon, export=1 /* horizontal filter only */ | |
773 | lowpass_const w3 | |
774 | sub x1, x1, #2 | |
775 | mov x3, x2 /* dst stride = src stride */ | |
776 | b \type\()_h264_qpel16_h_lowpass_neon | |
777 | endfunc | |
778 | ||
779 | function ff_\type\()_h264_qpel16_mc30_neon, export=1 /* horizontal filter averaged with the source one pixel to the right */ | |
780 | lowpass_const w3 | |
781 | add x3, x1, #1 /* second source = src + 1 */ | |
782 | sub x1, x1, #2 | |
783 | b \type\()_h264_qpel16_h_lowpass_l2_neon | |
784 | endfunc | |
785 | ||
786 | function ff_\type\()_h264_qpel16_mc01_neon, export=1 /* vertical filter averaged with the unfiltered source */ | |
787 | mov x14, x30 /* return address kept in x14; the helper is reached with bl */ | |
788 | mov x12, x1 /* second source = src */ | |
789 | \type\()_h264_qpel16_mc01: /* shared tail, also entered from mc03 with x12 and x14 already set */ | |
790 | lowpass_const w3 | |
791 | mov x3, x2 /* src/dst stride for the helper */ | |
792 | sub x1, x1, x2, lsl #1 /* the filter reads from 2 rows above the output position */ | |
793 | bl \type\()_h264_qpel16_v_lowpass_l2_neon | |
794 | ret x14 | |
795 | endfunc | |
796 | ||
797 | function ff_\type\()_h264_qpel16_mc11_neon, export=1 /* average of a horizontal and a vertical filter result */ | |
798 | mov x14, x30 | |
799 | mov x8, x0 /* x8 = dst, kept while x0 points at the scratch buffer */ | |
800 | mov x9, x1 /* x9 = base for the vertical pass, x1 = base for the horizontal pass */ | |
801 | \type\()_h264_qpel16_mc11: /* shared tail, entered from mc31, mc13 and mc33 with x8, x9, x14 and x1 already set */ | |
802 | lowpass_const w3 | |
803 | mov x11, sp /* remember sp so the scratch buffer can be released below */ | |
804 | sub sp, sp, #256 /* 16x16 byte scratch buffer */ | |
805 | mov x0, sp | |
806 | sub x1, x1, #2 | |
807 | mov x3, #16 /* scratch rows are 16 bytes apart */ | |
808 | bl put_h264_qpel16_h_lowpass_neon /* horizontal result into the scratch buffer */ | |
809 | mov x0, x8 /* real destination */ | |
810 | mov x3, x2 /* src/dst stride */ | |
811 | mov x12, sp /* second source = scratch buffer ... */ | |
812 | sub x1, x9, x2, lsl #1 /* vertical pass starts 2 rows above x9 */ | |
813 | mov x2, #16 /* ... read with a 16-byte stride */ | |
814 | bl \type\()_h264_qpel16_v_lowpass_l2_neon | |
815 | mov sp, x11 /* release the scratch buffer */ | |
816 | ret x14 | |
817 | endfunc | |
818 | ||
819 | function ff_\type\()_h264_qpel16_mc21_neon, export=1 /* two-dimensional filter averaged with a horizontal filter result */ | |
820 | mov x14, x30 | |
821 | mov x8, x0 /* x8 = dst */ | |
822 | mov x9, x1 /* x9 = base for the two-dimensional pass, x1 = base for the horizontal pass */ | |
823 | \type\()_h264_qpel16_mc21: /* shared tail, entered from mc23 with x8, x9, x14 and x1 already set */ | |
824 | lowpass_const w3 | |
825 | mov x11, sp | |
826 | sub sp, sp, #(16*16+16*12) /* scratch area; the packed helper below writes its first 16*16 bytes */ | |
827 | sub x1, x1, #2 | |
828 | mov x0, sp | |
829 | bl put_h264_qpel16_h_lowpass_neon_packed /* horizontal result as two 8-wide halves stored back to back */ | |
830 | mov x4, x0 /* x4 = address just past the packed data, as the hv_l2 helper expects */ | |
831 | mov x0, x8 | |
832 | sub x1, x9, x2, lsl #1 /* two-dimensional pass starts 2 rows up ... */ | |
833 | sub x1, x1, #2 /* ... and 2 pixels left of x9 */ | |
834 | mov x3, x2 /* src/dst stride */ | |
835 | bl \type\()_h264_qpel16_hv_lowpass_l2_neon | |
836 | mov sp, x11 | |
837 | ret x14 | |
838 | endfunc | |
839 | ||
840 | function ff_\type\()_h264_qpel16_mc31_neon, export=1 /* as mc11, with the vertical pass taken one pixel to the right */ | |
842 | mov x14, x30 | |
843 | mov x8, x0 | |
844 | add x9, x1, #1 /* vertical base = src + 1; x1 stays at src for the horizontal pass */ | |
846 | b \type\()_h264_qpel16_mc11 | |
847 | endfunc | |
848 | ||
849 | function ff_\type\()_h264_qpel16_mc02_neon, export=1 /* vertical filter only */ | |
850 | mov x14, x30 | |
851 | lowpass_const w3 | |
852 | sub x1, x1, x2, lsl #1 /* start 2 rows above the output position */ | |
853 | mov x3, x2 /* src stride; x2 stays as dst stride */ | |
854 | bl \type\()_h264_qpel16_v_lowpass_neon | |
855 | ret x14 | |
856 | endfunc | |
857 | ||
858 | function ff_\type\()_h264_qpel16_mc12_neon, export=1 /* two-dimensional filter averaged with a vertical filter result */ | |
859 | mov x14, x30 | |
860 | mov x8, x0 /* x8 = dst */ | |
861 | mov x9, x1 /* x9 = base for the two-dimensional pass, x1 = base for the vertical pass */ | |
862 | \type\()_h264_qpel16_mc12: /* shared tail, entered from mc32 with x8, x9, x14 and x1 already set */ | |
863 | lowpass_const w3 | |
864 | mov x11, sp | |
865 | sub sp, sp, #(16*16+16*12) /* scratch area; the packed helper below writes its first 16*16 bytes */ | |
866 | sub x1, x1, x2, lsl #1 | |
867 | mov x0, sp | |
868 | mov x3, x2 /* src stride; the packed helper overwrites x2 with 8 */ | |
869 | bl put_h264_qpel16_v_lowpass_neon_packed /* vertical result as four 8x8 tiles stored back to back */ | |
870 | mov x4, x0 /* x4 = address just past the packed data, as the hv_l2 helper expects */ | |
871 | mov x0, x8 | |
872 | sub x1, x9, x3, lsl #1 /* two-dimensional pass starts 2 rows up ... */ | |
873 | sub x1, x1, #2 /* ... and 2 pixels left of x9 */ | |
874 | mov x2, x3 /* puts the stride back in x2; the hv_l2 helper replaces x2 with the scratch pointer on entry */ | |
875 | bl \type\()_h264_qpel16_hv_lowpass_l2_neon | |
876 | mov sp, x11 | |
877 | ret x14 | |
878 | endfunc | |
879 | ||
880 | function ff_\type\()_h264_qpel16_mc22_neon, export=1 /* two-dimensional filter only; no stack scratch is used, so sp is not touched */ | |
881 | mov x14, x30 | |
884 | sub x1, x1, x2, lsl #1 /* start 2 rows up ... */ | |
885 | sub x1, x1, #2 /* ... and 2 pixels left of the output position */ | |
886 | mov x3, x2 /* src stride; x2 stays as dst stride */ | |
887 | bl \type\()_h264_qpel16_hv_lowpass_neon /* every 8x8 step loads the filter constants itself via the shared top routine */ | |
889 | ret x14 | |
890 | endfunc | |
891 | ||
892 | function ff_\type\()_h264_qpel16_mc32_neon, export=1 /* as mc12, with the vertical pass taken one pixel to the right */ | |
893 | mov x14, x30 | |
894 | mov x8, x0 | |
895 | mov x9, x1 | |
896 | add x1, x1, #1 | |
897 | b \type\()_h264_qpel16_mc12 | |
898 | endfunc | |
899 | ||
900 | function ff_\type\()_h264_qpel16_mc03_neon, export=1 /* as mc01, averaged with the source one row below */ | |
901 | mov x14, x30 | |
902 | add x12, x1, x2 /* second source = src + stride */ | |
903 | b \type\()_h264_qpel16_mc01 | |
904 | endfunc | |
905 | ||
906 | function ff_\type\()_h264_qpel16_mc13_neon, export=1 /* as mc11, with the horizontal pass taken one row below */ | |
907 | mov x14, x30 | |
908 | mov x8, x0 | |
909 | mov x9, x1 | |
910 | add x1, x1, x2 | |
911 | b \type\()_h264_qpel16_mc11 | |
912 | endfunc | |
913 | ||
914 | function ff_\type\()_h264_qpel16_mc23_neon, export=1 /* as mc21, with the horizontal pass taken one row below */ | |
915 | mov x14, x30 | |
916 | mov x8, x0 | |
917 | mov x9, x1 | |
918 | add x1, x1, x2 | |
919 | b \type\()_h264_qpel16_mc21 | |
920 | endfunc | |
921 | ||
922 | function ff_\type\()_h264_qpel16_mc33_neon, export=1 /* as mc11, with the vertical pass one pixel right and the horizontal pass one row below */ | |
924 | mov x14, x30 | |
925 | mov x8, x0 | |
926 | add x9, x1, #1 /* vertical base = src + 1 */ | |
927 | add x1, x1, x2 /* horizontal base = src + stride */ | |
929 | b \type\()_h264_qpel16_mc11 | |
930 | endfunc | |
931 | .endm | |
932 | ||
933 | h264_qpel16 put | |
934 | h264_qpel16 avg | |