Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | |
3 | * | |
4 | * This file is part of FFmpeg. | |
5 | * | |
6 | * FFmpeg is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU Lesser General Public | |
8 | * License as published by the Free Software Foundation; either | |
9 | * version 2.1 of the License, or (at your option) any later version. | |
10 | * | |
11 | * FFmpeg is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | * Lesser General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU Lesser General Public | |
17 | * License along with FFmpeg; if not, write to the Free Software | |
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
19 | */ | |
20 | ||
21 | #include "libavutil/arm/asm.S" | |
22 | #include "neon.S" | |
23 | ||
24 | /* H.264 qpel MC */ | |
25 | ||
/*
 * lowpass_const r: load the filter multipliers used by the lowpass macros.
 *
 * Builds the 32-bit value 0x00140005 in core register r (low halfword 5,
 * high halfword 20) and copies it to d6[0] as a 32-bit lane, so that the
 * 16-bit lanes read d6[0] = 5 and d6[1] = 20.
 * Clobbers: r and the low 32 bits of d6.
 */
26 | .macro lowpass_const r | |
27 | movw \r, #5 | |
28 | movt \r, #20 | |
29 | vmov.32 d6[0], \r | |
30 | .endm | |
31 | ||
/*
 * lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
 *
 * Applies the 6-tap filter (1, -5, 20, 20, -5, 1) horizontally to two rows
 * of unsigned bytes, producing 8 results per row:
 *   out[i] = (x[i] + x[i+5]) + 20 * (x[i+2] + x[i+3]) - 5 * (x[i+1] + x[i+4])
 *
 *   r0:r1  first row, 16 consecutive bytes (r0 = bytes 0-7, r1 = bytes 8-15)
 *   r2:r3  second row, same layout
 *   narrow=1: d0/d1 are d registers; the 16-bit sums are built in q0 and q8,
 *             then rounded, shifted right by 5 and saturated to unsigned
 *             bytes into d0 (row one) and d1 (row two).
 *   narrow=0: d0/d1 are q registers that receive the unscaled 16-bit sums.
 *
 * The multipliers are taken from d6[0] (subtracted term) and d6[1] (added
 * term), i.e. the values lowpass_const loads (5 and 20).
 * Clobbers: q1, q2, q9, q10, d30, d31, and q0/q8 when narrow=1.
 * The two rows are interleaved instruction by instruction.
 */
32 | .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1 | |
33 | .if \narrow | |
34 | t0 .req q0 | |
35 | t1 .req q8 | |
36 | .else | |
37 | t0 .req \d0 | |
38 | t1 .req \d1 | |
39 | .endif | |
/* Row one: q1 = x[i+2] + x[i+3], q2 = x[i+1] + x[i+4], t0 = x[i] + x[i+5]. */
40 | vext.8 d2, \r0, \r1, #2 | |
41 | vext.8 d3, \r0, \r1, #3 | |
42 | vaddl.u8 q1, d2, d3 | |
43 | vext.8 d4, \r0, \r1, #1 | |
44 | vext.8 d5, \r0, \r1, #4 | |
45 | vaddl.u8 q2, d4, d5 | |
46 | vext.8 d30, \r0, \r1, #5 | |
47 | vaddl.u8 t0, \r0, d30 | |
/* Row two uses q9/q10/t1 the same way, interleaved with the row-one multiplies. */
48 | vext.8 d18, \r2, \r3, #2 | |
49 | vmla.i16 t0, q1, d6[1] | |
50 | vext.8 d19, \r2, \r3, #3 | |
51 | vaddl.u8 q9, d18, d19 | |
52 | vext.8 d20, \r2, \r3, #1 | |
53 | vmls.i16 t0, q2, d6[0] | |
54 | vext.8 d21, \r2, \r3, #4 | |
55 | vaddl.u8 q10, d20, d21 | |
56 | vext.8 d31, \r2, \r3, #5 | |
57 | vaddl.u8 t1, \r2, d31 | |
58 | vmla.i16 t1, q9, d6[1] | |
59 | vmls.i16 t1, q10, d6[0] | |
/* Narrowing path: rounding shift right by 5 with saturation to unsigned 8 bit. */
60 | .if \narrow | |
61 | vqrshrun.s16 \d0, t0, #5 | |
62 | vqrshrun.s16 \d1, t1, #5 | |
63 | .endif | |
/* Release the aliases so the next expansion can define them again. */
64 | .unreq t0 | |
65 | .unreq t1 | |
66 | .endm | |
67 | ||
/*
 * lowpass_8_1 r0, r1, d0, narrow=1
 *
 * Single-row form of lowpass_8: filters the 16 bytes in r0:r1 with the
 * 6-tap kernel (1, -5, 20, 20, -5, 1) and produces 8 results.
 *   narrow=1: d0 is a d register receiving the rounded, >>5, saturated bytes
 *             (the 16-bit sums are built in q0).
 *   narrow=0: d0 is a q register receiving the unscaled 16-bit sums.
 * Reads the multipliers from d6[0] and d6[1] (see lowpass_const).
 * Clobbers: q1, q2, d30, and q0 when narrow=1.
 */
68 | .macro lowpass_8_1 r0, r1, d0, narrow=1 | |
69 | .if \narrow | |
70 | t0 .req q0 | |
71 | .else | |
72 | t0 .req \d0 | |
73 | .endif | |
/* q1 = x[i+2] + x[i+3], q2 = x[i+1] + x[i+4], t0 = x[i] + x[i+5]. */
74 | vext.8 d2, \r0, \r1, #2 | |
75 | vext.8 d3, \r0, \r1, #3 | |
76 | vaddl.u8 q1, d2, d3 | |
77 | vext.8 d4, \r0, \r1, #1 | |
78 | vext.8 d5, \r0, \r1, #4 | |
79 | vaddl.u8 q2, d4, d5 | |
80 | vext.8 d30, \r0, \r1, #5 | |
81 | vaddl.u8 t0, \r0, d30 | |
/* t0 += q1 * d6[1]; t0 -= q2 * d6[0]. */
82 | vmla.i16 t0, q1, d6[1] | |
83 | vmls.i16 t0, q2, d6[0] | |
84 | .if \narrow | |
85 | vqrshrun.s16 \d0, t0, #5 | |
86 | .endif | |
87 | .unreq t0 | |
88 | .endm | |
89 | ||
/*
 * lowpass_8.16 r0, r1, l0, h0, l1, h1, d
 *
 * Second filter pass on signed 16-bit intermediates. r0:r1 are two q
 * registers holding 16 consecutive halfwords x[0..15]; the macro computes
 * eight outputs in 32-bit precision
 *   (x[i] + x[i+5]) + 20 * (x[i+2] + x[i+3]) - 5 * (x[i+1] + x[i+4])
 * then applies a rounding shift right by 10, and saturates to unsigned
 * bytes written to d register d.
 *
 * l0/h0 and l1/h1 are added as the x[i] and x[i+5] terms after r1 has been
 * replaced by x[5..12]; every call site in this file passes the low/high d
 * halves of r0 and r1 for them.
 * The multiplications by 20 and 5 are done with shifts and adds
 * (20*v = (v << 4) + (v << 2), 5*v = v + (v << 2)), so d6 is not read.
 * Clobbers: q0-q3, q8-q10, q15 and r1.
 */
90 | .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d | |
/* q9/q1 = x[i+2] + x[i+3] (outputs 0-3 / 4-7), q10/q2 = x[i+1] + x[i+4]. */
91 | vext.16 q1, \r0, \r1, #2 | |
92 | vext.16 q0, \r0, \r1, #3 | |
93 | vaddl.s16 q9, d2, d0 | |
94 | vext.16 q2, \r0, \r1, #1 | |
95 | vaddl.s16 q1, d3, d1 | |
96 | vext.16 q3, \r0, \r1, #4 | |
97 | vaddl.s16 q10, d4, d6 | |
98 | vext.16 \r1, \r0, \r1, #5 | |
99 | vaddl.s16 q2, d5, d7 | |
/* q8/q0 = x[i] + x[i+5] for outputs 0-3 / 4-7. */
100 | vaddl.s16 q0, \h0, \h1 | |
101 | vaddl.s16 q8, \l0, \l1 | |
102 | ||
/* Outputs 0-3: q9 = 20 * (x[i+2] + x[i+3]), q10 = 5 * (x[i+1] + x[i+4]). */
103 | vshl.i32 q3, q9, #4 | |
104 | vshl.i32 q9, q9, #2 | |
105 | vshl.i32 q15, q10, #2 | |
106 | vadd.i32 q9, q9, q3 | |
107 | vadd.i32 q10, q10, q15 | |
108 | ||
/* Outputs 4-7: the same scaling on q1 and q2. */
109 | vshl.i32 q3, q1, #4 | |
110 | vshl.i32 q1, q1, #2 | |
111 | vshl.i32 q15, q2, #2 | |
112 | vadd.i32 q1, q1, q3 | |
113 | vadd.i32 q2, q2, q15 | |
114 | ||
/* Combine the three terms. */
115 | vadd.i32 q9, q9, q8 | |
116 | vsub.i32 q9, q9, q10 | |
117 | ||
118 | vadd.i32 q1, q1, q0 | |
119 | vsub.i32 q1, q1, q2 | |
120 | ||
/* Rounding shift right by 10, narrowing to 16 bit; both halves land in q9. */
121 | vrshrn.s32 d18, q9, #10 | |
122 | vrshrn.s32 d19, q1, #10 | |
123 | ||
/* Saturate to unsigned 8 bit. */
124 | vqmovun.s16 \d, q9 | |
125 | .endm | |
126 | ||
/*
 * put_h264_qpel16_h_lowpass_neon_packed
 *
 * Horizontal lowpass of a 16-pixel-wide, 16-row area written as two
 * consecutive 8-byte-wide column blocks (dst stride forced to 8): first the
 * left 8 columns for 16 rows, then the right 8 columns for 16 rows. r0 is
 * not rewound between the two passes, so the second block follows the first.
 *
 * In:  r0 = dst, r1 = src, r2 = src stride.
 * Clobbers: r3, r4 (holds the return address), r12, plus whatever
 *           put_h264_qpel8_h_lowpass_neon clobbers.
 */
127 | function put_h264_qpel16_h_lowpass_neon_packed | |
128 | mov r4, lr | |
129 | mov r12, #16 | |
130 | mov r3, #8 | |
131 | bl put_h264_qpel8_h_lowpass_neon | |
/* Rewind src by 16 rows and step 8 pixels to the right. */
132 | sub r1, r1, r2, lsl #4 | |
133 | add r1, r1, #8 | |
134 | mov r12, #16 | |
/* Tail call: the 8-wide function returns straight to our caller. */
135 | mov lr, r4 | |
136 | b put_h264_qpel8_h_lowpass_neon | |
137 | endfunc | |
138 | ||
/*
 * h264_qpel_h_lowpass type
 *
 * Emits <type>_h264_qpel16_h_lowpass_neon and
 * <type>_h264_qpel8_h_lowpass_neon; it is instantiated below for type = put
 * and type = avg (avg additionally averages with the bytes already in dst).
 *
 * Register use visible in the bodies:
 *   r0 = dst, r1 = src (already moved to the leftmost filter tap by the
 *   caller), r2 = src stride, r3 = dst stride,
 *   r12 = row count for the 8-wide function (consumed two rows at a time).
 * Expects d6 to hold the multipliers set up by lowpass_const.
 */
139 | .macro h264_qpel_h_lowpass type | |
/*
 * 16-wide: run the 8-wide function on the left half for 16 rows, rewind
 * dst and src by 16 rows and move 8 bytes right.
 * NOTE(review): there is no return or branch after the pop; execution
 * appears to fall through into the 8-wide function emitted right after.
 * This relies on function/endfunc (asm.S, not shown here) emitting no
 * code in between - confirm.
 */
140 | function \type\()_h264_qpel16_h_lowpass_neon | |
141 | push {lr} | |
142 | mov r12, #16 | |
143 | bl \type\()_h264_qpel8_h_lowpass_neon | |
144 | sub r0, r0, r3, lsl #4 | |
145 | sub r1, r1, r2, lsl #4 | |
146 | add r0, r0, #8 | |
147 | add r1, r1, #8 | |
148 | mov r12, #16 | |
149 | pop {lr} | |
150 | endfunc | |
151 | ||
/*
 * 8-wide: each iteration loads 16 bytes from two source rows, filters them
 * with lowpass_8 and stores 8 bytes per row (dst accessed with a 64-bit
 * alignment hint). The flags set by subs survive until bne because the NEON
 * instructions in between do not modify them.
 */
152 | function \type\()_h264_qpel8_h_lowpass_neon | |
153 | 1: vld1.8 {d0, d1}, [r1], r2 | |
154 | vld1.8 {d16,d17}, [r1], r2 | |
155 | subs r12, r12, #2 | |
156 | lowpass_8 d0, d1, d16, d17, d0, d16 | |
/* avg only: rounding average with the two dst rows, then step r0 back one row. */
157 | .ifc \type,avg | |
158 | vld1.8 {d2}, [r0,:64], r3 | |
159 | vrhadd.u8 d0, d0, d2 | |
160 | vld1.8 {d3}, [r0,:64] | |
161 | vrhadd.u8 d16, d16, d3 | |
162 | sub r0, r0, r3 | |
163 | .endif | |
164 | vst1.8 {d0}, [r0,:64], r3 | |
165 | vst1.8 {d16}, [r0,:64], r3 | |
166 | bne 1b | |
167 | bx lr | |
168 | endfunc | |
169 | .endm | |
170 | ||
/* Instantiate the put and avg variants. */
171 | h264_qpel_h_lowpass put | |
172 | h264_qpel_h_lowpass avg | |
173 | ||
/*
 * h264_qpel_h_lowpass_l2 type
 *
 * Emits <type>_h264_qpel16_h_lowpass_l2_neon and
 * <type>_h264_qpel8_h_lowpass_l2_neon (type = put or avg). These filter
 * horizontally like the plain h_lowpass functions and then take the rounding
 * average of the result with a second 8-byte-per-row source.
 *
 * Register use visible in the bodies:
 *   r0 = dst, r1 = src to filter, r3 = second source,
 *   r2 = stride used for all three pointers,
 *   r12 = row count for the 8-wide function (consumed two rows at a time).
 * Expects d6 to hold the multipliers set up by lowpass_const.
 */
174 | .macro h264_qpel_h_lowpass_l2 type | |
/*
 * 16-wide: left half for 16 rows, then rewind all three pointers by 16 rows
 * and move 8 bytes right.
 * NOTE(review): no return after the pop; execution appears to fall through
 * into the 8-wide function emitted right after, which relies on
 * function/endfunc (asm.S, not shown here) emitting no code in between -
 * confirm.
 */
175 | function \type\()_h264_qpel16_h_lowpass_l2_neon | |
176 | push {lr} | |
177 | mov r12, #16 | |
178 | bl \type\()_h264_qpel8_h_lowpass_l2_neon | |
179 | sub r0, r0, r2, lsl #4 | |
180 | sub r1, r1, r2, lsl #4 | |
181 | sub r3, r3, r2, lsl #4 | |
182 | add r0, r0, #8 | |
183 | add r1, r1, #8 | |
184 | add r3, r3, #8 | |
185 | mov r12, #16 | |
186 | pop {lr} | |
187 | endfunc | |
188 | ||
/*
 * 8-wide: two rows per iteration. d28/d29 (= q14) receive the second
 * source rows; the filtered rows are produced in d0/d1 (= q0) so a single
 * q-sized vrhadd averages both rows at once.
 */
189 | function \type\()_h264_qpel8_h_lowpass_l2_neon | |
190 | 1: vld1.8 {d0, d1}, [r1], r2 | |
191 | vld1.8 {d16,d17}, [r1], r2 | |
192 | vld1.8 {d28}, [r3], r2 | |
193 | vld1.8 {d29}, [r3], r2 | |
194 | subs r12, r12, #2 | |
195 | lowpass_8 d0, d1, d16, d17, d0, d1 | |
196 | vrhadd.u8 q0, q0, q14 | |
/* avg only: rounding average with the two dst rows, then step r0 back one row. */
197 | .ifc \type,avg | |
198 | vld1.8 {d2}, [r0,:64], r2 | |
199 | vrhadd.u8 d0, d0, d2 | |
200 | vld1.8 {d3}, [r0,:64] | |
201 | vrhadd.u8 d1, d1, d3 | |
202 | sub r0, r0, r2 | |
203 | .endif | |
204 | vst1.8 {d0}, [r0,:64], r2 | |
205 | vst1.8 {d1}, [r0,:64], r2 | |
206 | bne 1b | |
207 | bx lr | |
208 | endfunc | |
209 | .endm | |
210 | ||
/* Instantiate the put and avg variants. */
211 | h264_qpel_h_lowpass_l2 put | |
212 | h264_qpel_h_lowpass_l2 avg | |
213 | ||
/*
 * put_h264_qpel16_v_lowpass_neon_packed
 *
 * Vertical lowpass of a 16x16 area written as four consecutive 8x8 blocks
 * with dst stride 8: left column rows 0-7, left column rows 8-15, right
 * column rows 0-7, right column rows 8-15. r0 is never rewound, so the
 * blocks are stored back to back.
 *
 * In:  r0 = dst, r1 = src (first row read by the filter), r3 = src stride.
 * Clobbers: r2 (set to 8), r4 (holds the return address), plus whatever
 *           put_h264_qpel8_v_lowpass_neon clobbers.
 */
214 | function put_h264_qpel16_v_lowpass_neon_packed | |
215 | mov r4, lr | |
216 | mov r2, #8 | |
217 | bl put_h264_qpel8_v_lowpass_neon | |
/* The 8x8 function leaves r1 12 rows down; back up 4 rows for the next 8 output rows. */
218 | sub r1, r1, r3, lsl #2 | |
219 | bl put_h264_qpel8_v_lowpass_neon | |
/* r1 is now 20 rows below the start: rewind 16 + 4 rows and move 8 bytes right. */
220 | sub r1, r1, r3, lsl #4 | |
221 | sub r1, r1, r3, lsl #2 | |
222 | add r1, r1, #8 | |
223 | bl put_h264_qpel8_v_lowpass_neon | |
224 | sub r1, r1, r3, lsl #2 | |
/* Tail call for the last block. */
225 | mov lr, r4 | |
226 | b put_h264_qpel8_v_lowpass_neon | |
227 | endfunc | |
228 | ||
/*
 * h264_qpel_v_lowpass type
 *
 * Emits <type>_h264_qpel16_v_lowpass_neon and
 * <type>_h264_qpel8_v_lowpass_neon (type = put or avg).
 *
 * Register use visible in the bodies:
 *   r0 = dst, r1 = src (first of the 13 rows read per 8x8 block),
 *   r2 = dst stride, r3 = src stride.
 * Uses d8-d15 without saving them; the exported callers in this file wrap
 * the call in vpush/vpop {d8-d15}.
 * Expects d6 to hold the multipliers set up by lowpass_const.
 */
229 | .macro h264_qpel_v_lowpass type | |
/*
 * 16x16: four 8x8 blocks in the order top-left, bottom-left, top-right,
 * bottom-right. r4 holds the return address across the calls.
 * NOTE(review): after restoring lr there is no branch; execution appears
 * to fall through into the 8x8 function emitted right after for the fourth
 * block, which relies on function/endfunc (asm.S, not shown here) emitting
 * no code in between - confirm.
 */
230 | function \type\()_h264_qpel16_v_lowpass_neon | |
231 | mov r4, lr | |
232 | bl \type\()_h264_qpel8_v_lowpass_neon | |
/* r1 ended 12 rows down; back up 4 so the next block starts 8 rows below the first. */
233 | sub r1, r1, r3, lsl #2 | |
234 | bl \type\()_h264_qpel8_v_lowpass_neon | |
/* dst: up 16 rows, right 8 bytes. src: up 20 rows, right 8 bytes. */
235 | sub r0, r0, r2, lsl #4 | |
236 | add r0, r0, #8 | |
237 | sub r1, r1, r3, lsl #4 | |
238 | sub r1, r1, r3, lsl #2 | |
239 | add r1, r1, #8 | |
240 | bl \type\()_h264_qpel8_v_lowpass_neon | |
241 | sub r1, r1, r3, lsl #2 | |
242 | mov lr, r4 | |
243 | endfunc | |
244 | ||
/*
 * 8x8: loads 13 rows of 8 bytes (8 output rows plus 5 extra for the 6-tap
 * filter). Rows 0-7 go to the even-numbered d registers and rows 8-12 to
 * the odd-numbered partner of the first five; d25, d27 and d29 are not
 * loaded. r1 is left 12 rows below its entry value.
 */
245 | function \type\()_h264_qpel8_v_lowpass_neon | |
246 | vld1.8 {d8}, [r1], r3 | |
247 | vld1.8 {d10}, [r1], r3 | |
248 | vld1.8 {d12}, [r1], r3 | |
249 | vld1.8 {d14}, [r1], r3 | |
250 | vld1.8 {d22}, [r1], r3 | |
251 | vld1.8 {d24}, [r1], r3 | |
252 | vld1.8 {d26}, [r1], r3 | |
253 | vld1.8 {d28}, [r1], r3 | |
254 | vld1.8 {d9}, [r1], r3 | |
255 | vld1.8 {d11}, [r1], r3 | |
256 | vld1.8 {d13}, [r1], r3 | |
257 | vld1.8 {d15}, [r1], r3 | |
258 | vld1.8 {d23}, [r1] | |
259 | ||
/*
 * transpose_8x8 comes from neon.S (not shown here). Presumably it turns
 * columns into register rows so the horizontal lowpass_8 macro performs the
 * vertical filtering, and the second call transposes the 8x8 result back -
 * confirm against neon.S.
 */
260 | transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 | |
261 | lowpass_8 d8, d9, d10, d11, d8, d10 | |
262 | lowpass_8 d12, d13, d14, d15, d12, d14 | |
263 | lowpass_8 d22, d23, d24, d25, d22, d24 | |
264 | lowpass_8 d26, d27, d28, d29, d26, d28 | |
265 | transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28 | |
266 | ||
/* avg only: rounding average with the 8 existing dst rows, then rewind r0 by 8 rows. */
267 | .ifc \type,avg | |
268 | vld1.8 {d9}, [r0,:64], r2 | |
269 | vrhadd.u8 d8, d8, d9 | |
270 | vld1.8 {d11}, [r0,:64], r2 | |
271 | vrhadd.u8 d10, d10, d11 | |
272 | vld1.8 {d13}, [r0,:64], r2 | |
273 | vrhadd.u8 d12, d12, d13 | |
274 | vld1.8 {d15}, [r0,:64], r2 | |
275 | vrhadd.u8 d14, d14, d15 | |
276 | vld1.8 {d23}, [r0,:64], r2 | |
277 | vrhadd.u8 d22, d22, d23 | |
278 | vld1.8 {d25}, [r0,:64], r2 | |
279 | vrhadd.u8 d24, d24, d25 | |
280 | vld1.8 {d27}, [r0,:64], r2 | |
281 | vrhadd.u8 d26, d26, d27 | |
282 | vld1.8 {d29}, [r0,:64], r2 | |
283 | vrhadd.u8 d28, d28, d29 | |
284 | sub r0, r0, r2, lsl #3 | |
285 | .endif | |
286 | ||
/* Store the 8 output rows; r0 ends 8 rows below its entry value. */
287 | vst1.8 {d8}, [r0,:64], r2 | |
288 | vst1.8 {d10}, [r0,:64], r2 | |
289 | vst1.8 {d12}, [r0,:64], r2 | |
290 | vst1.8 {d14}, [r0,:64], r2 | |
291 | vst1.8 {d22}, [r0,:64], r2 | |
292 | vst1.8 {d24}, [r0,:64], r2 | |
293 | vst1.8 {d26}, [r0,:64], r2 | |
294 | vst1.8 {d28}, [r0,:64], r2 | |
295 | ||
296 | bx lr | |
297 | endfunc | |
298 | .endm | |
299 | ||
/* Instantiate the put and avg variants. */
300 | h264_qpel_v_lowpass put | |
301 | h264_qpel_v_lowpass avg | |
302 | ||
/*
 * h264_qpel_v_lowpass_l2 type
 *
 * Emits <type>_h264_qpel16_v_lowpass_l2_neon and
 * <type>_h264_qpel8_v_lowpass_l2_neon (type = put or avg). These filter
 * vertically and then take the rounding average of the result with a
 * second source.
 *
 * Register use visible in the bodies:
 *   r0 = dst, r1 = src to filter (first of the 13 rows read),
 *   r3 = stride of src and of dst,
 *   r12 = second source, r2 = stride of the second source.
 * Uses d8-d15 without saving them; the exported callers in this file wrap
 * the call in vpush/vpop {d8-d15}.
 * Expects d6 to hold the multipliers set up by lowpass_const.
 */
303 | .macro h264_qpel_v_lowpass_l2 type | |
/*
 * 16x16: four 8x8 blocks in the order top-left, bottom-left, top-right,
 * bottom-right. r4 holds the return address across the calls.
 * NOTE(review): after restoring lr there is no branch; execution appears
 * to fall through into the 8x8 function emitted right after for the fourth
 * block, which relies on function/endfunc (asm.S, not shown here) emitting
 * no code in between - confirm.
 */
304 | function \type\()_h264_qpel16_v_lowpass_l2_neon | |
305 | mov r4, lr | |
306 | bl \type\()_h264_qpel8_v_lowpass_l2_neon | |
/* r1 ended 12 rows down; back up 4 so the next block starts 8 rows below the first. */
307 | sub r1, r1, r3, lsl #2 | |
308 | bl \type\()_h264_qpel8_v_lowpass_l2_neon | |
/* dst and second source: up 16 rows, right 8 bytes. src: up 20 rows, right 8 bytes. */
309 | sub r0, r0, r3, lsl #4 | |
310 | sub r12, r12, r2, lsl #4 | |
311 | add r0, r0, #8 | |
312 | add r12, r12, #8 | |
313 | sub r1, r1, r3, lsl #4 | |
314 | sub r1, r1, r3, lsl #2 | |
315 | add r1, r1, #8 | |
316 | bl \type\()_h264_qpel8_v_lowpass_l2_neon | |
317 | sub r1, r1, r3, lsl #2 | |
318 | mov lr, r4 | |
319 | endfunc | |
320 | ||
/*
 * 8x8: loads 13 rows of 8 bytes (8 output rows plus 5 extra for the 6-tap
 * filter). Rows 0-7 go to the even-numbered d registers and rows 8-12 to
 * the odd-numbered partner of the first five; d25, d27 and d29 are not
 * loaded. r1 is left 12 rows below its entry value.
 */
321 | function \type\()_h264_qpel8_v_lowpass_l2_neon | |
322 | vld1.8 {d8}, [r1], r3 | |
323 | vld1.8 {d10}, [r1], r3 | |
324 | vld1.8 {d12}, [r1], r3 | |
325 | vld1.8 {d14}, [r1], r3 | |
326 | vld1.8 {d22}, [r1], r3 | |
327 | vld1.8 {d24}, [r1], r3 | |
328 | vld1.8 {d26}, [r1], r3 | |
329 | vld1.8 {d28}, [r1], r3 | |
330 | vld1.8 {d9}, [r1], r3 | |
331 | vld1.8 {d11}, [r1], r3 | |
332 | vld1.8 {d13}, [r1], r3 | |
333 | vld1.8 {d15}, [r1], r3 | |
334 | vld1.8 {d23}, [r1] | |
335 | ||
/*
 * transpose_8x8 comes from neon.S (not shown here). Presumably it turns
 * columns into register rows so the horizontal lowpass_8 macro performs the
 * vertical filtering, and the second call transposes the result back -
 * confirm against neon.S. Unlike the plain v_lowpass, the results are
 * placed in adjacent d registers (d8/d9, d12/d13, d22/d23, d26/d27) so
 * they can be averaged two rows at a time as q4, q6, q11 and q13.
 */
336 | transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 | |
337 | lowpass_8 d8, d9, d10, d11, d8, d9 | |
338 | lowpass_8 d12, d13, d14, d15, d12, d13 | |
339 | lowpass_8 d22, d23, d24, d25, d22, d23 | |
340 | lowpass_8 d26, d27, d28, d29, d26, d27 | |
341 | transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27 | |
342 | ||
/* Load 8 rows of the second source and average them with the filtered rows. */
343 | vld1.8 {d0}, [r12], r2 | |
344 | vld1.8 {d1}, [r12], r2 | |
345 | vld1.8 {d2}, [r12], r2 | |
346 | vld1.8 {d3}, [r12], r2 | |
347 | vld1.8 {d4}, [r12], r2 | |
348 | vrhadd.u8 q0, q0, q4 | |
349 | vld1.8 {d5}, [r12], r2 | |
350 | vrhadd.u8 q1, q1, q6 | |
351 | vld1.8 {d10}, [r12], r2 | |
352 | vrhadd.u8 q2, q2, q11 | |
353 | vld1.8 {d11}, [r12], r2 | |
354 | vrhadd.u8 q5, q5, q13 | |
355 | ||
/* avg only: rounding average with the 8 existing dst rows, then rewind r0 by 8 rows. */
356 | .ifc \type,avg | |
357 | vld1.8 {d16}, [r0,:64], r3 | |
358 | vrhadd.u8 d0, d0, d16 | |
359 | vld1.8 {d17}, [r0,:64], r3 | |
360 | vrhadd.u8 d1, d1, d17 | |
361 | vld1.8 {d16}, [r0,:64], r3 | |
362 | vrhadd.u8 d2, d2, d16 | |
363 | vld1.8 {d17}, [r0,:64], r3 | |
364 | vrhadd.u8 d3, d3, d17 | |
365 | vld1.8 {d16}, [r0,:64], r3 | |
366 | vrhadd.u8 d4, d4, d16 | |
367 | vld1.8 {d17}, [r0,:64], r3 | |
368 | vrhadd.u8 d5, d5, d17 | |
369 | vld1.8 {d16}, [r0,:64], r3 | |
370 | vrhadd.u8 d10, d10, d16 | |
371 | vld1.8 {d17}, [r0,:64], r3 | |
372 | vrhadd.u8 d11, d11, d17 | |
373 | sub r0, r0, r3, lsl #3 | |
374 | .endif | |
375 | ||
/* Store the 8 output rows; r0 ends 8 rows below its entry value. */
376 | vst1.8 {d0}, [r0,:64], r3 | |
377 | vst1.8 {d1}, [r0,:64], r3 | |
378 | vst1.8 {d2}, [r0,:64], r3 | |
379 | vst1.8 {d3}, [r0,:64], r3 | |
380 | vst1.8 {d4}, [r0,:64], r3 | |
381 | vst1.8 {d5}, [r0,:64], r3 | |
382 | vst1.8 {d10}, [r0,:64], r3 | |
383 | vst1.8 {d11}, [r0,:64], r3 | |
384 | ||
385 | bx lr | |
386 | endfunc | |
387 | .endm | |
388 | ||
/* Instantiate the put and avg variants. */
389 | h264_qpel_v_lowpass_l2 put | |
390 | h264_qpel_v_lowpass_l2 avg | |
391 | ||
/*
 * put_h264_qpel8_hv_lowpass_neon_top
 *
 * Shared core of the 8x8 two-pass (horizontal, then second-direction)
 * lowpass. It produces the 8x8 result in registers and does not write dst.
 *
 * In:  r1 = src (first of 13 rows read), r3 = src stride,
 *      r4 = scratch buffer, accessed with 128-bit alignment hints;
 *           12 * 16 bytes of it are written.
 * Out: eight rows of 8 bytes in d12, d13, d14, d15, d8, d9, d10, d11 (the
 *      order in which the callers in this file store them); r4 points at
 *      the start of the scratch buffer again; r1 is left 12 rows below its
 *      entry value.
 * Clobbers: r12, d6 (reloaded via lowpass_const, later reused as data) and
 *           all of q0-q15.
 */
392 | function put_h264_qpel8_hv_lowpass_neon_top | |
/* r12 is only a temporary for the constant load before becoming the row counter. */
393 | lowpass_const r12 | |
394 | mov r12, #12 | |
/*
 * First pass: filter 12 rows horizontally without narrowing and store the
 * 16-bit sums, two rows (32 bytes) per iteration.
 */
395 | 1: vld1.8 {d0, d1}, [r1], r3 | |
396 | vld1.8 {d16,d17}, [r1], r3 | |
397 | subs r12, r12, #2 | |
398 | lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0 | |
399 | vst1.8 {d22-d25}, [r4,:128]! | |
400 | bne 1b | |
401 | ||
/* The 13th row is filtered into q12 and kept in registers. */
402 | vld1.8 {d0, d1}, [r1] | |
403 | lowpass_8_1 d0, d1, q12, narrow=0 | |
404 | ||
/* Walk the scratch buffer backwards: q15 = row 11, q10 = row 10, ... q0 = row 0. */
405 | mov r12, #-16 | |
406 | add r4, r4, r12 | |
407 | vld1.8 {d30,d31}, [r4,:128], r12 | |
408 | vld1.8 {d20,d21}, [r4,:128], r12 | |
409 | vld1.8 {d18,d19}, [r4,:128], r12 | |
410 | vld1.8 {d16,d17}, [r4,:128], r12 | |
411 | vld1.8 {d14,d15}, [r4,:128], r12 | |
412 | vld1.8 {d12,d13}, [r4,:128], r12 | |
413 | vld1.8 {d10,d11}, [r4,:128], r12 | |
414 | vld1.8 {d8, d9}, [r4,:128], r12 | |
415 | vld1.8 {d6, d7}, [r4,:128], r12 | |
416 | vld1.8 {d4, d5}, [r4,:128], r12 | |
417 | vld1.8 {d2, d3}, [r4,:128], r12 | |
418 | vld1.8 {d0, d1}, [r4,:128] | |
419 | ||
/*
 * swap4 and transpose16_4x4 come from neon.S (not shown here). Presumably
 * together they transpose the 16-bit intermediates so that each register
 * pair passed to lowpass_8.16 below holds one column - confirm against
 * neon.S.
 */
420 | swap4 d1, d3, d5, d7, d8, d10, d12, d14 | |
421 | transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7 | |
422 | ||
423 | swap4 d17, d19, d21, d31, d24, d26, d28, d22 | |
424 | transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11 | |
425 | ||
/* Spill eight registers (128 bytes) to the scratch buffer; lowpass_8.16 clobbers them. */
426 | vst1.8 {d30,d31}, [r4,:128]! | |
427 | vst1.8 {d6, d7}, [r4,:128]! | |
428 | vst1.8 {d20,d21}, [r4,:128]! | |
429 | vst1.8 {d4, d5}, [r4,:128]! | |
430 | vst1.8 {d18,d19}, [r4,:128]! | |
431 | vst1.8 {d2, d3}, [r4,:128]! | |
432 | vst1.8 {d16,d17}, [r4,:128]! | |
433 | vst1.8 {d0, d1}, [r4,:128] | |
434 | ||
/* Second pass on the data still in registers: four results into d8-d11. */
435 | lowpass_8.16 q4, q12, d8, d9, d24, d25, d8 | |
436 | lowpass_8.16 q5, q13, d10, d11, d26, d27, d9 | |
437 | lowpass_8.16 q6, q14, d12, d13, d28, d29, d10 | |
438 | lowpass_8.16 q7, q11, d14, d15, d22, d23, d11 | |
439 | ||
/* Reload the spilled pairs in reverse store order and filter them into d12-d15. */
440 | vld1.8 {d16,d17}, [r4,:128], r12 | |
441 | vld1.8 {d30,d31}, [r4,:128], r12 | |
442 | lowpass_8.16 q8, q15, d16, d17, d30, d31, d12 | |
443 | vld1.8 {d16,d17}, [r4,:128], r12 | |
444 | vld1.8 {d30,d31}, [r4,:128], r12 | |
445 | lowpass_8.16 q8, q15, d16, d17, d30, d31, d13 | |
446 | vld1.8 {d16,d17}, [r4,:128], r12 | |
447 | vld1.8 {d30,d31}, [r4,:128], r12 | |
448 | lowpass_8.16 q8, q15, d16, d17, d30, d31, d14 | |
449 | vld1.8 {d16,d17}, [r4,:128], r12 | |
/* Last reload has no writeback, leaving r4 at the start of the scratch buffer. */
450 | vld1.8 {d30,d31}, [r4,:128] | |
451 | lowpass_8.16 q8, q15, d16, d17, d30, d31, d15 | |
452 | ||
/* Presumably converts the eight filtered columns back into rows - confirm against neon.S. */
453 | transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11 | |
454 | ||
455 | bx lr | |
456 | endfunc | |
457 | ||
/*
 * h264_qpel8_hv_lowpass type
 *
 * Emits <type>_h264_qpel8_hv_lowpass_neon (type = put or avg): runs
 * put_h264_qpel8_hv_lowpass_neon_top and writes its 8x8 result to dst.
 *
 * In:  r0 = dst, r2 = dst stride; r1, r3 and r4 are passed through to
 *      put_h264_qpel8_hv_lowpass_neon_top (src, src stride, scratch).
 * Out: r0 is left 8 rows below its entry value.
 * Clobbers: r10 (holds the return address) plus everything the core
 *           function clobbers, including d8-d15.
 */
458 | .macro h264_qpel8_hv_lowpass type | |
459 | function \type\()_h264_qpel8_hv_lowpass_neon | |
460 | mov r10, lr | |
461 | bl put_h264_qpel8_hv_lowpass_neon_top | |
/* avg only: rounding average with the 8 existing dst rows, then rewind r0 by 8 rows. */
462 | .ifc \type,avg | |
463 | vld1.8 {d0}, [r0,:64], r2 | |
464 | vrhadd.u8 d12, d12, d0 | |
465 | vld1.8 {d1}, [r0,:64], r2 | |
466 | vrhadd.u8 d13, d13, d1 | |
467 | vld1.8 {d2}, [r0,:64], r2 | |
468 | vrhadd.u8 d14, d14, d2 | |
469 | vld1.8 {d3}, [r0,:64], r2 | |
470 | vrhadd.u8 d15, d15, d3 | |
471 | vld1.8 {d4}, [r0,:64], r2 | |
472 | vrhadd.u8 d8, d8, d4 | |
473 | vld1.8 {d5}, [r0,:64], r2 | |
474 | vrhadd.u8 d9, d9, d5 | |
475 | vld1.8 {d6}, [r0,:64], r2 | |
476 | vrhadd.u8 d10, d10, d6 | |
477 | vld1.8 {d7}, [r0,:64], r2 | |
478 | vrhadd.u8 d11, d11, d7 | |
479 | sub r0, r0, r2, lsl #3 | |
480 | .endif | |
481 | ||
/* Rows come out of the core function in the order d12-d15, then d8-d11. */
482 | vst1.8 {d12}, [r0,:64], r2 | |
483 | vst1.8 {d13}, [r0,:64], r2 | |
484 | vst1.8 {d14}, [r0,:64], r2 | |
485 | vst1.8 {d15}, [r0,:64], r2 | |
486 | vst1.8 {d8}, [r0,:64], r2 | |
487 | vst1.8 {d9}, [r0,:64], r2 | |
488 | vst1.8 {d10}, [r0,:64], r2 | |
489 | vst1.8 {d11}, [r0,:64], r2 | |
490 | ||
491 | mov lr, r10 | |
492 | bx lr | |
493 | endfunc | |
494 | .endm | |
495 | ||
/* Instantiate the put and avg variants. */
496 | h264_qpel8_hv_lowpass put | |
497 | h264_qpel8_hv_lowpass avg | |
498 | ||
/*
 * h264_qpel8_hv_lowpass_l2 type
 *
 * Emits <type>_h264_qpel8_hv_lowpass_l2_neon (type = put or avg): runs
 * put_h264_qpel8_hv_lowpass_neon_top, takes the rounding average of its 8x8
 * result with a second source and writes that to dst.
 *
 * In:  r0 = dst, r3 = dst stride (also the src stride used by the core),
 *      r2 = second source: 64 contiguous bytes (8 rows of 8), read with
 *           128-bit alignment hints and advanced by 64 on return,
 *      r1 and r4 are passed through to the core function (src, scratch).
 * Out: r0 is left 8 rows below its entry value.
 * Clobbers: r10 (holds the return address) plus everything the core
 *           function clobbers, including d8-d15.
 */
499 | .macro h264_qpel8_hv_lowpass_l2 type | |
500 | function \type\()_h264_qpel8_hv_lowpass_l2_neon | |
501 | mov r10, lr | |
502 | bl put_h264_qpel8_hv_lowpass_neon_top | |
503 | ||
/*
 * The core returns rows in the order d12-d15, d8-d11, so second-source rows
 * 0-3 pair with q6/q7 and rows 4-7 with q4/q5.
 */
504 | vld1.8 {d0, d1}, [r2,:128]! | |
505 | vld1.8 {d2, d3}, [r2,:128]! | |
506 | vrhadd.u8 q0, q0, q6 | |
507 | vld1.8 {d4, d5}, [r2,:128]! | |
508 | vrhadd.u8 q1, q1, q7 | |
509 | vld1.8 {d6, d7}, [r2,:128]! | |
510 | vrhadd.u8 q2, q2, q4 | |
511 | vrhadd.u8 q3, q3, q5 | |
/* avg only: rounding average with the 8 existing dst rows, then rewind r0 by 8 rows. */
512 | .ifc \type,avg | |
513 | vld1.8 {d16}, [r0,:64], r3 | |
514 | vrhadd.u8 d0, d0, d16 | |
515 | vld1.8 {d17}, [r0,:64], r3 | |
516 | vrhadd.u8 d1, d1, d17 | |
517 | vld1.8 {d18}, [r0,:64], r3 | |
518 | vrhadd.u8 d2, d2, d18 | |
519 | vld1.8 {d19}, [r0,:64], r3 | |
520 | vrhadd.u8 d3, d3, d19 | |
521 | vld1.8 {d20}, [r0,:64], r3 | |
522 | vrhadd.u8 d4, d4, d20 | |
523 | vld1.8 {d21}, [r0,:64], r3 | |
524 | vrhadd.u8 d5, d5, d21 | |
525 | vld1.8 {d22}, [r0,:64], r3 | |
526 | vrhadd.u8 d6, d6, d22 | |
527 | vld1.8 {d23}, [r0,:64], r3 | |
528 | vrhadd.u8 d7, d7, d23 | |
529 | sub r0, r0, r3, lsl #3 | |
530 | .endif | |
531 | vst1.8 {d0}, [r0,:64], r3 | |
532 | vst1.8 {d1}, [r0,:64], r3 | |
533 | vst1.8 {d2}, [r0,:64], r3 | |
534 | vst1.8 {d3}, [r0,:64], r3 | |
535 | vst1.8 {d4}, [r0,:64], r3 | |
536 | vst1.8 {d5}, [r0,:64], r3 | |
537 | vst1.8 {d6}, [r0,:64], r3 | |
538 | vst1.8 {d7}, [r0,:64], r3 | |
539 | ||
540 | mov lr, r10 | |
541 | bx lr | |
542 | endfunc | |
543 | .endm | |
544 | ||
/* Instantiate the put and avg variants. */
545 | h264_qpel8_hv_lowpass_l2 put | |
546 | h264_qpel8_hv_lowpass_l2 avg | |
547 | ||
/*
 * h264_qpel16_hv type
 *
 * Emits the 16x16 two-pass lowpass functions (type = put or avg), each
 * built from four calls to the matching 8x8 function in the order top-left,
 * bottom-left, top-right, bottom-right; the last call is a tail call.
 * r9 holds the return address (the 8x8 functions use r10 for theirs).
 */
548 | .macro h264_qpel16_hv type | |
/*
 * <type>_h264_qpel16_hv_lowpass_neon
 * In: r0 = dst, r1 = src, r2 = dst stride, r3 = src stride, r4 = scratch
 *     buffer for the 8x8 core.
 */
549 | function \type\()_h264_qpel16_hv_lowpass_neon | |
550 | mov r9, lr | |
551 | bl \type\()_h264_qpel8_hv_lowpass_neon | |
/* r1 ended 12 rows down; back up 4 so the next block starts 8 rows below the first. */
552 | sub r1, r1, r3, lsl #2 | |
553 | bl \type\()_h264_qpel8_hv_lowpass_neon | |
/* src: up 20 rows, right 8 bytes. dst: up 16 rows, right 8 bytes. */
554 | sub r1, r1, r3, lsl #4 | |
555 | sub r1, r1, r3, lsl #2 | |
556 | add r1, r1, #8 | |
557 | sub r0, r0, r2, lsl #4 | |
558 | add r0, r0, #8 | |
559 | bl \type\()_h264_qpel8_hv_lowpass_neon | |
560 | sub r1, r1, r3, lsl #2 | |
561 | mov lr, r9 | |
562 | b \type\()_h264_qpel8_hv_lowpass_neon | |
563 | endfunc | |
564 | ||
/*
 * <type>_h264_qpel16_hv_lowpass_l2_neon
 * In: r0 = dst, r1 = src, r3 = stride of src and dst, r4 = scratch buffer.
 * The second source is taken to be the 256 bytes immediately below r4
 * (r2 = r4 - 256); each 8x8 call consumes 64 bytes of it and advances r2,
 * so it must be laid out as four consecutive 8x8 blocks in call order.
 */
565 | function \type\()_h264_qpel16_hv_lowpass_l2_neon | |
566 | mov r9, lr | |
567 | sub r2, r4, #256 | |
568 | bl \type\()_h264_qpel8_hv_lowpass_l2_neon | |
569 | sub r1, r1, r3, lsl #2 | |
570 | bl \type\()_h264_qpel8_hv_lowpass_l2_neon | |
/* src: up 20 rows, right 8 bytes. dst: up 16 rows, right 8 bytes. */
571 | sub r1, r1, r3, lsl #4 | |
572 | sub r1, r1, r3, lsl #2 | |
573 | add r1, r1, #8 | |
574 | sub r0, r0, r3, lsl #4 | |
575 | add r0, r0, #8 | |
576 | bl \type\()_h264_qpel8_hv_lowpass_l2_neon | |
577 | sub r1, r1, r3, lsl #2 | |
578 | mov lr, r9 | |
579 | b \type\()_h264_qpel8_hv_lowpass_l2_neon | |
580 | endfunc | |
581 | .endm | |
582 | ||
/* Instantiate the put and avg variants. */
583 | h264_qpel16_hv put | |
584 | h264_qpel16_hv avg | |
585 | ||
/*
 * h264_qpel8 type
 *
 * Emits the exported 8x8 entry points ff_<type>_h264_qpel8_mcXY_neon for
 * type = put or avg.
 *
 * Each function uses r0 = dst, r1 = src and r2 = stride on entry.
 * NOTE(review): presumably this matches a C prototype (dst, src, stride) -
 * confirm against the C declarations.
 * Judging by the code, X picks the horizontal and Y the vertical
 * quarter-sample position: 2 uses the filtered value alone, while 1 and 3
 * average a filtered plane with a second plane through the *_l2 helpers.
 *
 * Conventions used throughout:
 *  - lowpass_const r3 uses r3 only as a temporary; r3 is set to its real
 *    value afterwards.
 *  - d8-d15 are saved with vpush/vpop around helpers that use them.
 *  - The local labels <type>_h264_qpel8_mcNN are shared bodies: sibling
 *    entry points push the same register list, adjust the pointers and
 *    branch there.
 *  - Functions needing a temporary buffer keep the old sp in r11, align sp
 *    down to 16 bytes and reserve the buffer below it.
 *  - The A and T line prefixes come from asm.S (not shown here); presumably
 *    they select the ARM-mode or Thumb-mode variant of the instruction -
 *    confirm.
 */
586 | .macro h264_qpel8 type | |
/* mc10: horizontal filter averaged with the source pixels at src (r3 = src). */
587 | function ff_\type\()_h264_qpel8_mc10_neon, export=1 | |
588 | lowpass_const r3 | |
589 | mov r3, r1 | |
590 | sub r1, r1, #2 | |
591 | mov r12, #8 | |
592 | b \type\()_h264_qpel8_h_lowpass_l2_neon | |
593 | endfunc | |
594 | ||
/* mc20: horizontal filter only; filter input starts 2 pixels left of src. */
595 | function ff_\type\()_h264_qpel8_mc20_neon, export=1 | |
596 | lowpass_const r3 | |
597 | sub r1, r1, #2 | |
598 | mov r3, r2 | |
599 | mov r12, #8 | |
600 | b \type\()_h264_qpel8_h_lowpass_neon | |
601 | endfunc | |
602 | ||
/* mc30: as mc10, but averaged with the source pixels at src + 1. */
603 | function ff_\type\()_h264_qpel8_mc30_neon, export=1 | |
604 | lowpass_const r3 | |
605 | add r3, r1, #1 | |
606 | sub r1, r1, #2 | |
607 | mov r12, #8 | |
608 | b \type\()_h264_qpel8_h_lowpass_l2_neon | |
609 | endfunc | |
610 | ||
/*
 * mc01: vertical filter averaged with the source pixels at src (r12 = src).
 * The shared body expects lr pushed and r12 set to the second source.
 */
611 | function ff_\type\()_h264_qpel8_mc01_neon, export=1 | |
612 | push {lr} | |
613 | mov r12, r1 | |
614 | \type\()_h264_qpel8_mc01: | |
615 | lowpass_const r3 | |
616 | mov r3, r2 | |
/* Filter input starts 2 rows above src. */
617 | sub r1, r1, r2, lsl #1 | |
618 | vpush {d8-d15} | |
619 | bl \type\()_h264_qpel8_v_lowpass_l2_neon | |
620 | vpop {d8-d15} | |
621 | pop {pc} | |
622 | endfunc | |
623 | ||
/*
 * mc11: horizontal filter into a 64-byte stack buffer, then vertical filter
 * averaged with that buffer.
 * The shared body expects {r0, r1, r11, lr} pushed: the saved r0/r1 are the
 * dst and the src used for the vertical pass, while the live r1 is the src
 * used for the horizontal pass.
 */
624 | function ff_\type\()_h264_qpel8_mc11_neon, export=1 | |
625 | push {r0, r1, r11, lr} | |
626 | \type\()_h264_qpel8_mc11: | |
627 | lowpass_const r3 | |
628 | mov r11, sp | |
629 | A bic sp, sp, #15 | |
630 | T bic r0, r11, #15 | |
631 | T mov sp, r0 | |
/* 8 rows of 8 bytes for the horizontally filtered plane. */
632 | sub sp, sp, #64 | |
633 | mov r0, sp | |
634 | sub r1, r1, #2 | |
635 | mov r3, #8 | |
636 | mov r12, #8 | |
637 | vpush {d8-d15} | |
638 | bl put_h264_qpel8_h_lowpass_neon | |
/* Reload the saved dst/src; the post-increment moves r11 past them for the final pop. */
639 | ldrd r0, r1, [r11], #8 | |
640 | mov r3, r2 | |
/* The buffer sits above the 64 bytes that vpush {d8-d15} placed on the stack. */
641 | add r12, sp, #64 | |
642 | sub r1, r1, r2, lsl #1 | |
/* r2 becomes the stride of the packed buffer. */
643 | mov r2, #8 | |
644 | bl \type\()_h264_qpel8_v_lowpass_l2_neon | |
645 | vpop {d8-d15} | |
646 | mov sp, r11 | |
647 | pop {r11, pc} | |
648 | endfunc | |
649 | ||
/*
 * mc21: horizontal filter into the first 64 bytes of a stack buffer, then
 * the two-pass filter averaged with it. The remaining 16*12 bytes are the
 * scratch area for the two-pass core.
 * The shared body expects {r0, r1, r4, r10, r11, lr} pushed: the saved
 * r0/r1 are the dst and the src for the two-pass filter, the live r1 is the
 * src for the horizontal pass.
 */
650 | function ff_\type\()_h264_qpel8_mc21_neon, export=1 | |
651 | push {r0, r1, r4, r10, r11, lr} | |
652 | \type\()_h264_qpel8_mc21: | |
653 | lowpass_const r3 | |
654 | mov r11, sp | |
655 | A bic sp, sp, #15 | |
656 | T bic r0, r11, #15 | |
657 | T mov sp, r0 | |
658 | sub sp, sp, #(8*8+16*12) | |
659 | sub r1, r1, #2 | |
660 | mov r3, #8 | |
661 | mov r0, sp | |
662 | mov r12, #8 | |
663 | vpush {d8-d15} | |
664 | bl put_h264_qpel8_h_lowpass_neon | |
/* r0 now points just past the 64 filtered bytes: that is the scratch area. */
665 | mov r4, r0 | |
666 | ldrd r0, r1, [r11], #8 | |
667 | sub r1, r1, r2, lsl #1 | |
668 | sub r1, r1, #2 | |
669 | mov r3, r2 | |
/* Second source = start of the horizontally filtered plane. */
670 | sub r2, r4, #64 | |
671 | bl \type\()_h264_qpel8_hv_lowpass_l2_neon | |
672 | vpop {d8-d15} | |
673 | mov sp, r11 | |
674 | pop {r4, r10, r11, pc} | |
675 | endfunc | |
676 | ||
/* mc31: mc11 body with src + 1 saved for the vertical pass and src for the horizontal pass. */
677 | function ff_\type\()_h264_qpel8_mc31_neon, export=1 | |
678 | add r1, r1, #1 | |
679 | push {r0, r1, r11, lr} | |
680 | sub r1, r1, #1 | |
681 | b \type\()_h264_qpel8_mc11 | |
682 | endfunc | |
683 | ||
/* mc02: vertical filter only; filter input starts 2 rows above src. */
684 | function ff_\type\()_h264_qpel8_mc02_neon, export=1 | |
685 | push {lr} | |
686 | lowpass_const r3 | |
687 | sub r1, r1, r2, lsl #1 | |
688 | mov r3, r2 | |
689 | vpush {d8-d15} | |
690 | bl \type\()_h264_qpel8_v_lowpass_neon | |
691 | vpop {d8-d15} | |
692 | pop {pc} | |
693 | endfunc | |
694 | ||
/*
 * mc12: vertical filter into the first 64 bytes of a stack buffer, then the
 * two-pass filter averaged with it.
 * The shared body expects {r0, r1, r4, r10, r11, lr} pushed: the saved
 * r0/r1 are the dst and the src for the two-pass filter, the live r1 is the
 * src for the vertical pass.
 */
695 | function ff_\type\()_h264_qpel8_mc12_neon, export=1 | |
696 | push {r0, r1, r4, r10, r11, lr} | |
697 | \type\()_h264_qpel8_mc12: | |
698 | lowpass_const r3 | |
699 | mov r11, sp | |
700 | A bic sp, sp, #15 | |
701 | T bic r0, r11, #15 | |
702 | T mov sp, r0 | |
703 | sub sp, sp, #(8*8+16*12) | |
704 | sub r1, r1, r2, lsl #1 | |
/* r3 keeps the picture stride from here on; r2 becomes the buffer stride. */
705 | mov r3, r2 | |
706 | mov r2, #8 | |
707 | mov r0, sp | |
708 | vpush {d8-d15} | |
709 | bl put_h264_qpel8_v_lowpass_neon | |
/* r0 now points just past the 64 filtered bytes: that is the scratch area. */
710 | mov r4, r0 | |
711 | ldrd r0, r1, [r11], #8 | |
712 | sub r1, r1, r3, lsl #1 | |
713 | sub r1, r1, #2 | |
/* Second source = start of the vertically filtered plane. */
714 | sub r2, r4, #64 | |
715 | bl \type\()_h264_qpel8_hv_lowpass_l2_neon | |
716 | vpop {d8-d15} | |
717 | mov sp, r11 | |
718 | pop {r4, r10, r11, pc} | |
719 | endfunc | |
720 | ||
/*
 * mc22: two-pass filter only, with a 16*12-byte aligned scratch buffer on
 * the stack. No lowpass_const here: the two-pass core loads the constants
 * itself.
 */
721 | function ff_\type\()_h264_qpel8_mc22_neon, export=1 | |
722 | push {r4, r10, r11, lr} | |
723 | mov r11, sp | |
724 | A bic sp, sp, #15 | |
725 | T bic r4, r11, #15 | |
726 | T mov sp, r4 | |
/* Filter input starts 2 rows above and 2 pixels left of src. */
727 | sub r1, r1, r2, lsl #1 | |
728 | sub r1, r1, #2 | |
729 | mov r3, r2 | |
730 | sub sp, sp, #(16*12) | |
731 | mov r4, sp | |
732 | vpush {d8-d15} | |
733 | bl \type\()_h264_qpel8_hv_lowpass_neon | |
734 | vpop {d8-d15} | |
735 | mov sp, r11 | |
736 | pop {r4, r10, r11, pc} | |
737 | endfunc | |
738 | ||
/* mc32: mc12 body with the vertical pass taken one pixel to the right. */
739 | function ff_\type\()_h264_qpel8_mc32_neon, export=1 | |
740 | push {r0, r1, r4, r10, r11, lr} | |
741 | add r1, r1, #1 | |
742 | b \type\()_h264_qpel8_mc12 | |
743 | endfunc | |
744 | ||
/* mc03: mc01 body averaged with the source pixels one row down (r12 = src + stride). */
745 | function ff_\type\()_h264_qpel8_mc03_neon, export=1 | |
746 | push {lr} | |
747 | add r12, r1, r2 | |
748 | b \type\()_h264_qpel8_mc01 | |
749 | endfunc | |
750 | ||
/* mc13: mc11 body with the horizontal pass taken one row down. */
751 | function ff_\type\()_h264_qpel8_mc13_neon, export=1 | |
752 | push {r0, r1, r11, lr} | |
753 | add r1, r1, r2 | |
754 | b \type\()_h264_qpel8_mc11 | |
755 | endfunc | |
756 | ||
/* mc23: mc21 body with the horizontal pass taken one row down. */
757 | function ff_\type\()_h264_qpel8_mc23_neon, export=1 | |
758 | push {r0, r1, r4, r10, r11, lr} | |
759 | add r1, r1, r2 | |
760 | b \type\()_h264_qpel8_mc21 | |
761 | endfunc | |
762 | ||
/*
 * mc33: mc11 body with src + 1 saved for the vertical pass and
 * src + stride used for the horizontal pass.
 */
763 | function ff_\type\()_h264_qpel8_mc33_neon, export=1 | |
764 | add r1, r1, #1 | |
765 | push {r0, r1, r11, lr} | |
766 | add r1, r1, r2 | |
767 | sub r1, r1, #1 | |
768 | b \type\()_h264_qpel8_mc11 | |
769 | endfunc | |
770 | .endm | |
771 | ||
/* Instantiate the put and avg variants. */
772 | h264_qpel8 put | |
773 | h264_qpel8 avg | |
774 | ||
/*
 * h264_qpel16 type
 *
 * Emits the exported 16x16 entry points ff_<type>_h264_qpel16_mcXY_neon for
 * type = put or avg.
 *
 * Each function uses r0 = dst, r1 = src and r2 = stride on entry.
 * NOTE(review): presumably this matches a C prototype (dst, src, stride) -
 * confirm against the C declarations.
 * Judging by the code, X picks the horizontal and Y the vertical
 * quarter-sample position: 2 uses the filtered value alone, while 1 and 3
 * average a filtered plane with a second plane through the *_l2 helpers.
 *
 * Conventions used throughout:
 *  - lowpass_const r3 uses r3 only as a temporary; r3 is set to its real
 *    value afterwards.
 *  - r4 is pushed wherever a helper uses it to hold a return address or a
 *    scratch pointer; r9/r10 likewise for the two-pass helpers.
 *  - d8-d15 are saved with vpush/vpop around helpers that use them.
 *  - The local labels <type>_h264_qpel16_mcNN are shared bodies: sibling
 *    entry points push the same register list, adjust the pointers and
 *    branch there.
 *  - Functions needing a temporary buffer keep the old sp in r11, align sp
 *    down to 16 bytes and reserve the buffer below it.
 *  - The A and T line prefixes come from asm.S (not shown here); presumably
 *    they select the ARM-mode or Thumb-mode variant of the instruction -
 *    confirm.
 */
775 | .macro h264_qpel16 type | |
/* mc10: horizontal filter averaged with the source pixels at src (r3 = src). */
776 | function ff_\type\()_h264_qpel16_mc10_neon, export=1 | |
777 | lowpass_const r3 | |
778 | mov r3, r1 | |
779 | sub r1, r1, #2 | |
780 | b \type\()_h264_qpel16_h_lowpass_l2_neon | |
781 | endfunc | |
782 | ||
/* mc20: horizontal filter only; filter input starts 2 pixels left of src. */
783 | function ff_\type\()_h264_qpel16_mc20_neon, export=1 | |
784 | lowpass_const r3 | |
785 | sub r1, r1, #2 | |
786 | mov r3, r2 | |
787 | b \type\()_h264_qpel16_h_lowpass_neon | |
788 | endfunc | |
789 | ||
/* mc30: as mc10, but averaged with the source pixels at src + 1. */
790 | function ff_\type\()_h264_qpel16_mc30_neon, export=1 | |
791 | lowpass_const r3 | |
792 | add r3, r1, #1 | |
793 | sub r1, r1, #2 | |
794 | b \type\()_h264_qpel16_h_lowpass_l2_neon | |
795 | endfunc | |
796 | ||
/*
 * mc01: vertical filter averaged with the source pixels at src (r12 = src).
 * The shared body expects {r4, lr} pushed and r12 set to the second source.
 */
797 | function ff_\type\()_h264_qpel16_mc01_neon, export=1 | |
798 | push {r4, lr} | |
799 | mov r12, r1 | |
800 | \type\()_h264_qpel16_mc01: | |
801 | lowpass_const r3 | |
802 | mov r3, r2 | |
/* Filter input starts 2 rows above src. */
803 | sub r1, r1, r2, lsl #1 | |
804 | vpush {d8-d15} | |
805 | bl \type\()_h264_qpel16_v_lowpass_l2_neon | |
806 | vpop {d8-d15} | |
807 | pop {r4, pc} | |
808 | endfunc | |
809 | ||
/*
 * mc11: horizontal filter into a 256-byte stack buffer (16 rows, stride
 * 16), then vertical filter averaged with that buffer.
 * The shared body expects {r0, r1, r4, r11, lr} pushed: the saved r0/r1 are
 * the dst and the src used for the vertical pass, while the live r1 is the
 * src used for the horizontal pass.
 */
810 | function ff_\type\()_h264_qpel16_mc11_neon, export=1 | |
811 | push {r0, r1, r4, r11, lr} | |
812 | \type\()_h264_qpel16_mc11: | |
813 | lowpass_const r3 | |
814 | mov r11, sp | |
815 | A bic sp, sp, #15 | |
816 | T bic r0, r11, #15 | |
817 | T mov sp, r0 | |
818 | sub sp, sp, #256 | |
819 | mov r0, sp | |
820 | sub r1, r1, #2 | |
821 | mov r3, #16 | |
822 | vpush {d8-d15} | |
823 | bl put_h264_qpel16_h_lowpass_neon | |
/* Reload the saved dst/src; the post-increment moves r11 past them for the final pop. */
824 | ldrd r0, r1, [r11], #8 | |
825 | mov r3, r2 | |
/* The buffer sits above the 64 bytes that vpush {d8-d15} placed on the stack. */
826 | add r12, sp, #64 | |
827 | sub r1, r1, r2, lsl #1 | |
/* r2 becomes the stride of the temporary buffer. */
828 | mov r2, #16 | |
829 | bl \type\()_h264_qpel16_v_lowpass_l2_neon | |
830 | vpop {d8-d15} | |
831 | mov sp, r11 | |
832 | pop {r4, r11, pc} | |
833 | endfunc | |
834 | ||
/*
 * mc21: packed horizontal filter into the first 16*16 bytes of a stack
 * buffer, then the two-pass filter averaged with it. The remaining 16*12
 * bytes are the scratch area for the two-pass core.
 * The shared body expects {r0, r1, r4-r5, r9-r11, lr} pushed: the saved
 * r0/r1 are the dst and the src for the two-pass filter, the live r1 is the
 * src for the horizontal pass.
 */
835 | function ff_\type\()_h264_qpel16_mc21_neon, export=1 | |
836 | push {r0, r1, r4-r5, r9-r11, lr} | |
837 | \type\()_h264_qpel16_mc21: | |
838 | lowpass_const r3 | |
839 | mov r11, sp | |
840 | A bic sp, sp, #15 | |
841 | T bic r0, r11, #15 | |
842 | T mov sp, r0 | |
843 | sub sp, sp, #(16*16+16*12) | |
844 | sub r1, r1, #2 | |
845 | mov r0, sp | |
846 | vpush {d8-d15} | |
847 | bl put_h264_qpel16_h_lowpass_neon_packed | |
/*
 * r0 now points just past the 256 packed bytes: that is the scratch area.
 * The 16x16 l2 helper derives the second source as r4 - 256.
 */
848 | mov r4, r0 | |
849 | ldrd r0, r1, [r11], #8 | |
850 | sub r1, r1, r2, lsl #1 | |
851 | sub r1, r1, #2 | |
852 | mov r3, r2 | |
853 | bl \type\()_h264_qpel16_hv_lowpass_l2_neon | |
854 | vpop {d8-d15} | |
855 | mov sp, r11 | |
856 | pop {r4-r5, r9-r11, pc} | |
857 | endfunc | |
858 | ||
/* mc31: mc11 body with src + 1 saved for the vertical pass and src for the horizontal pass. */
859 | function ff_\type\()_h264_qpel16_mc31_neon, export=1 | |
860 | add r1, r1, #1 | |
861 | push {r0, r1, r4, r11, lr} | |
862 | sub r1, r1, #1 | |
863 | b \type\()_h264_qpel16_mc11 | |
864 | endfunc | |
865 | ||
/* mc02: vertical filter only; filter input starts 2 rows above src. */
866 | function ff_\type\()_h264_qpel16_mc02_neon, export=1 | |
867 | push {r4, lr} | |
868 | lowpass_const r3 | |
869 | sub r1, r1, r2, lsl #1 | |
870 | mov r3, r2 | |
871 | vpush {d8-d15} | |
872 | bl \type\()_h264_qpel16_v_lowpass_neon | |
873 | vpop {d8-d15} | |
874 | pop {r4, pc} | |
875 | endfunc | |
876 | ||
/*
 * mc12: packed vertical filter into the first 16*16 bytes of a stack
 * buffer, then the two-pass filter averaged with it.
 * The shared body expects {r0, r1, r4-r5, r9-r11, lr} pushed: the saved
 * r0/r1 are the dst and the src for the two-pass filter, the live r1 is the
 * src for the vertical pass.
 */
877 | function ff_\type\()_h264_qpel16_mc12_neon, export=1 | |
878 | push {r0, r1, r4-r5, r9-r11, lr} | |
879 | \type\()_h264_qpel16_mc12: | |
880 | lowpass_const r3 | |
881 | mov r11, sp | |
882 | A bic sp, sp, #15 | |
883 | T bic r0, r11, #15 | |
884 | T mov sp, r0 | |
885 | sub sp, sp, #(16*16+16*12) | |
886 | sub r1, r1, r2, lsl #1 | |
887 | mov r0, sp | |
/* r3 keeps the picture stride; the packed helper overwrites r2 with 8. */
888 | mov r3, r2 | |
889 | vpush {d8-d15} | |
890 | bl put_h264_qpel16_v_lowpass_neon_packed | |
/* r0 now points just past the 256 packed bytes: that is the scratch area. */
891 | mov r4, r0 | |
892 | ldrd r0, r1, [r11], #8 | |
893 | sub r1, r1, r3, lsl #1 | |
894 | sub r1, r1, #2 | |
895 | mov r2, r3 | |
896 | bl \type\()_h264_qpel16_hv_lowpass_l2_neon | |
897 | vpop {d8-d15} | |
898 | mov sp, r11 | |
899 | pop {r4-r5, r9-r11, pc} | |
900 | endfunc | |
901 | ||
/*
 * mc22: two-pass filter only, with a 16*12-byte aligned scratch buffer on
 * the stack. The two-pass core reloads the filter constants itself.
 */
902 | function ff_\type\()_h264_qpel16_mc22_neon, export=1 | |
903 | push {r4, r9-r11, lr} | |
904 | lowpass_const r3 | |
905 | mov r11, sp | |
906 | A bic sp, sp, #15 | |
907 | T bic r4, r11, #15 | |
908 | T mov sp, r4 | |
/* Filter input starts 2 rows above and 2 pixels left of src. */
909 | sub r1, r1, r2, lsl #1 | |
910 | sub r1, r1, #2 | |
911 | mov r3, r2 | |
912 | sub sp, sp, #(16*12) | |
913 | mov r4, sp | |
914 | vpush {d8-d15} | |
915 | bl \type\()_h264_qpel16_hv_lowpass_neon | |
916 | vpop {d8-d15} | |
917 | mov sp, r11 | |
918 | pop {r4, r9-r11, pc} | |
919 | endfunc | |
920 | ||
/* mc32: mc12 body with the vertical pass taken one pixel to the right. */
921 | function ff_\type\()_h264_qpel16_mc32_neon, export=1 | |
922 | push {r0, r1, r4-r5, r9-r11, lr} | |
923 | add r1, r1, #1 | |
924 | b \type\()_h264_qpel16_mc12 | |
925 | endfunc | |
926 | ||
/* mc03: mc01 body averaged with the source pixels one row down (r12 = src + stride). */
927 | function ff_\type\()_h264_qpel16_mc03_neon, export=1 | |
928 | push {r4, lr} | |
929 | add r12, r1, r2 | |
930 | b \type\()_h264_qpel16_mc01 | |
931 | endfunc | |
932 | ||
/* mc13: mc11 body with the horizontal pass taken one row down. */
933 | function ff_\type\()_h264_qpel16_mc13_neon, export=1 | |
934 | push {r0, r1, r4, r11, lr} | |
935 | add r1, r1, r2 | |
936 | b \type\()_h264_qpel16_mc11 | |
937 | endfunc | |
938 | ||
/* mc23: mc21 body with the horizontal pass taken one row down. */
939 | function ff_\type\()_h264_qpel16_mc23_neon, export=1 | |
940 | push {r0, r1, r4-r5, r9-r11, lr} | |
941 | add r1, r1, r2 | |
942 | b \type\()_h264_qpel16_mc21 | |
943 | endfunc | |
944 | ||
/*
 * mc33: mc11 body with src + 1 saved for the vertical pass and
 * src + stride used for the horizontal pass.
 */
945 | function ff_\type\()_h264_qpel16_mc33_neon, export=1 | |
946 | add r1, r1, #1 | |
947 | push {r0, r1, r4, r11, lr} | |
948 | add r1, r1, r2 | |
949 | sub r1, r1, #1 | |
950 | b \type\()_h264_qpel16_mc11 | |
951 | endfunc | |
952 | .endm | |
953 | ||
/* Instantiate the put and avg variants. */
954 | h264_qpel16 put | |
955 | h264_qpel16 avg | |