Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libavcodec / arm / h264qpel_neon.S
CommitLineData
2ba45a60
DM
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

21#include "libavutil/arm/asm.S"
22#include "neon.S"

        /* H.264 qpel MC */

/*
 * lowpass_const r
 * Load the two multipliers used by the 6-tap filter macros into d6 as
 * 16-bit lanes: d6[0] = 5 and d6[1] = 20.
 *   r  core register used as scratch; its previous value is lost.
 * Has to be issued again after any code that overwrites d6.
 */
.macro  lowpass_const   r
        movw            \r,  #5                 /* low halfword  = 5  */
        movt            \r,  #20                /* high halfword = 20 */
        vmov.32         d6[0], \r
.endm
/*
 * lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
 * Run the 6-tap filter along two rows of bytes at once.
 *   r0, r1  D registers holding 16 consecutive bytes of the first row
 *   r2, r3  D registers holding 16 consecutive bytes of the second row
 * For each of the 8 output positions i the 16-bit sum is
 *   p[i] + p[i+5] + 20 * (p[i+2] + p[i+3]) - 5 * (p[i+1] + p[i+4])
 * d6 must hold the constants set up by lowpass_const.
 *   narrow=1  d0 and d1 are D registers; each receives the sums of one row
 *             after a rounding shift right by 5 with unsigned saturation.
 *   narrow=0  d0 and d1 are Q registers; each receives the raw 16-bit sums.
 * Clobbers q1, q2, q9, q10, d30, d31 and, when narrow=1, q0 and q8.
 * The instructions of the two rows are interleaved; keep the order as is.
 */
.macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
  .if \narrow
        t0 .req q0
        t1 .req q8
  .else
        t0 .req \d0
        t1 .req \d1
  .endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3            /* row 0: p[i+2] + p[i+3] */
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5            /* row 0: p[i+1] + p[i+4] */
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30           /* row 0: p[i] + p[i+5] */
        vext.8          d18, \r2, \r3, #2
        vmla.i16        t0,  q1,  d6[1]         /* row 0: += 20 * centre pair */
        vext.8          d19, \r2, \r3, #3
        vaddl.u8        q9,  d18, d19           /* row 1: p[i+2] + p[i+3] */
        vext.8          d20, \r2, \r3, #1
        vmls.i16        t0,  q2,  d6[0]         /* row 0: -= 5 * inner pair */
        vext.8          d21, \r2, \r3, #4
        vaddl.u8        q10, d20, d21           /* row 1: p[i+1] + p[i+4] */
        vext.8          d31, \r2, \r3, #5
        vaddl.u8        t1,  \r2, d31           /* row 1: p[i] + p[i+5] */
        vmla.i16        t1,  q9,  d6[1]
        vmls.i16        t1,  q10, d6[0]
  .if \narrow
        vqrshrun.s16    \d0, t0,  #5
        vqrshrun.s16    \d1, t1,  #5
  .endif
        .unreq  t0
        .unreq  t1
.endm
/*
 * lowpass_8_1 r0, r1, d0, narrow=1
 * Single-row variant of lowpass_8.
 *   r0, r1  D registers holding 16 consecutive bytes of the row
 * For each of the 8 output positions i the 16-bit sum is
 *   p[i] + p[i+5] + 20 * (p[i+2] + p[i+3]) - 5 * (p[i+1] + p[i+4])
 * d6 must hold the constants set up by lowpass_const.
 *   narrow=1  d0 is a D register receiving the sums after a rounding shift
 *             right by 5 with unsigned saturation.
 *   narrow=0  d0 is a Q register receiving the raw 16-bit sums.
 * Clobbers q1, q2, d30 and, when narrow=1, q0.
 */
.macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
  .if \narrow
        t0 .req q0
  .else
        t0 .req \d0
  .endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3            /* p[i+2] + p[i+3] */
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5            /* p[i+1] + p[i+4] */
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30           /* p[i] + p[i+5] */
        vmla.i16        t0,  q1,  d6[1]         /* += 20 * centre pair */
        vmls.i16        t0,  q2,  d6[0]         /* -= 5 * inner pair */
  .if \narrow
        vqrshrun.s16    \d0, t0,  #5
  .endif
        .unreq  t0
.endm
/*
 * lowpass_8.16 r0, r1, l0, h0, l1, h1, d
 * Second pass of the two-dimensional filter: run the 6-tap filter over
 * 16 consecutive signed 16-bit values and narrow the result to bytes.
 *   r0, r1  Q registers holding elements 0-7 and 8-15
 *   l0, h0  low and high D halves of r0
 *   l1, h1  low and high D halves of r1
 *   d       D register receiving the 8 output bytes
 * For each output i the 32-bit sum is
 *   p[i] + p[i+5] + 20 * (p[i+2] + p[i+3]) - 5 * (p[i+1] + p[i+4])
 * with the multiplications done by shifts and adds.  The sum gets a
 * rounding shift right by 10 and is then saturated to an unsigned byte.
 * Clobbers q0-q3 (so d6 no longer holds the lowpass_const values),
 * q8-q10, q15 and r1.
 */
.macro  lowpass_8.16    r0,  r1,  l0,  h0,  l1,  h1,  d
        vext.16         q1,  \r0, \r1, #2
        vext.16         q0,  \r0, \r1, #3
        vaddl.s16       q9,  d2,  d0            /* lo: p[i+2] + p[i+3] */
        vext.16         q2,  \r0, \r1, #1
        vaddl.s16       q1,  d3,  d1            /* hi: p[i+2] + p[i+3] */
        vext.16         q3,  \r0, \r1, #4
        vaddl.s16       q10, d4,  d6            /* lo: p[i+1] + p[i+4] */
        vext.16         \r1, \r0, \r1, #5       /* r1 now starts at p[5] */
        vaddl.s16       q2,  d5,  d7            /* hi: p[i+1] + p[i+4] */
        vaddl.s16       q0,  \h0, \h1           /* hi: p[i] + p[i+5] */
        vaddl.s16       q8,  \l0, \l1           /* lo: p[i] + p[i+5] */

        /* lo half: q9 = 16 * q9 + 4 * q9, q10 = q10 + 4 * q10 */
        vshl.i32        q3,  q9,  #4
        vshl.i32        q9,  q9,  #2
        vshl.i32        q15, q10, #2
        vadd.i32        q9,  q9,  q3
        vadd.i32        q10, q10, q15

        /* hi half: q1 = 16 * q1 + 4 * q1, q2 = q2 + 4 * q2 */
        vshl.i32        q3,  q1,  #4
        vshl.i32        q1,  q1,  #2
        vshl.i32        q15, q2,  #2
        vadd.i32        q1,  q1,  q3
        vadd.i32        q2,  q2,  q15

        vadd.i32        q9,  q9,  q8
        vsub.i32        q9,  q9,  q10

        vadd.i32        q1,  q1,  q0
        vsub.i32        q1,  q1,  q2

        vrshrn.s32      d18, q9,  #10
        vrshrn.s32      d19, q1,  #10

        vqmovun.s16     \d,  q9
.endm
/*
 * put_h264_qpel16_h_lowpass_neon_packed
 * Horizontal 6-tap filter of a 16 pixel wide, 16 row block.  The output is
 * written with a stride of 8 bytes: the 16 rows of the left 8 columns
 * first, immediately followed by the 16 rows of the right 8 columns.
 *   r0  destination; keeps advancing across both halves
 *   r1  first filter tap of the first row (callers in this file pass
 *       src - 2)
 *   r2  source stride
 *   d6  constants from lowpass_const
 * Clobbers r3 (set to 8), r4 (holds the return address), r12 and whatever
 * put_h264_qpel8_h_lowpass_neon clobbers.
 */
function put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  lr
        mov             r12, #16                /* row count */
        mov             r3,  #8                 /* packed destination stride */
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r1,  r1,  r2, lsl #4    /* rewind 16 source rows */
        add             r1,  r1,  #8            /* right half */
        mov             r12, #16
        mov             lr,  r4
        b               put_h264_qpel8_h_lowpass_neon   /* tail call */
endfunc
/*
 * h264_qpel_h_lowpass type
 * Emit the horizontal half-sample filters for type = put or avg.
 * With type = avg the filtered rows are rounding-averaged with the bytes
 * already present at the destination before being stored.
 *
 * Register contract of both emitted functions:
 *   r0   destination, 8-byte aligned (accessed with :64)
 *   r1   first filter tap of the first row (callers pass src - 2)
 *   r2   source stride
 *   r3   destination stride
 *   d6   constants from lowpass_const
 * Both clobber r12, q0-q2, q8-q10 and q15.
 */
.macro  h264_qpel_h_lowpass type
/*
 * 16x16: filters the left 8 columns through a call, rewinds both pointers
 * by 16 rows, moves them 8 bytes right and then FALLS THROUGH into the
 * 8 pixel wide function below, which returns to the original caller.
 */
function \type\()_h264_qpel16_h_lowpass_neon
        push            {lr}
        mov             r12, #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             r0,  r0,  r3, lsl #4
        sub             r1,  r1,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        mov             r12, #16
        pop             {lr}
endfunc

/*
 * 8 pixels wide, r12 rows (r12 must be even and non-zero; two rows are
 * produced per loop iteration).  The flags tested by bne come from the
 * subs; nothing between the two writes the flags.
 */
function \type\()_h264_qpel8_h_lowpass_neon
1:      vld1.8          {d0, d1},  [r1], r2
        vld1.8          {d16,d17}, [r1], r2
        subs            r12, r12, #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d16
  .ifc \type,avg
        vld1.8          {d2},     [r0,:64], r3
        vrhadd.u8       d0,  d0,  d2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       d16, d16, d3
        sub             r0,  r0,  r3            /* back to the first row */
  .endif
        vst1.8          {d0},     [r0,:64], r3
        vst1.8          {d16},    [r0,:64], r3
        bne             1b
        bx              lr
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg
/*
 * h264_qpel_h_lowpass_l2 type
 * Emit the horizontal filters whose result is rounding-averaged with a
 * second 8-bit source, for type = put or avg.  With type = avg the value
 * is additionally rounding-averaged with the bytes already present at the
 * destination.
 *
 * Register contract of both emitted functions:
 *   r0   destination, 8-byte aligned (accessed with :64)
 *   r1   first filter tap of the first row (callers pass src - 2)
 *   r2   stride shared by destination, source and second source
 *   r3   second source, 8 bytes are read per row
 *   d6   constants from lowpass_const
 * Both clobber r12, q0-q2, q8-q10, q14 and q15.
 */
.macro  h264_qpel_h_lowpass_l2 type
/*
 * 16x16: filters the left 8 columns through a call, rewinds all three
 * pointers by 16 rows, moves them 8 bytes right and then FALLS THROUGH
 * into the 8 pixel wide function below.
 */
function \type\()_h264_qpel16_h_lowpass_l2_neon
        push            {lr}
        mov             r12, #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             r0,  r0,  r2, lsl #4
        sub             r1,  r1,  r2, lsl #4
        sub             r3,  r3,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        add             r3,  r3,  #8
        mov             r12, #16
        pop             {lr}
endfunc

/*
 * 8 pixels wide, r12 rows (r12 must be even and non-zero; two rows are
 * produced per loop iteration).
 */
function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      vld1.8          {d0, d1},  [r1], r2
        vld1.8          {d16,d17}, [r1], r2
        vld1.8          {d28},     [r3], r2     /* second source, row 0 */
        vld1.8          {d29},     [r3], r2     /* second source, row 1 */
        subs            r12, r12, #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d1
        vrhadd.u8       q0,  q0,  q14           /* average both rows at once */
  .ifc \type,avg
        vld1.8          {d2},      [r0,:64], r2
        vrhadd.u8       d0,  d0,  d2
        vld1.8          {d3},      [r0,:64]
        vrhadd.u8       d1,  d1,  d3
        sub             r0,  r0,  r2            /* back to the first row */
  .endif
        vst1.8          {d0},      [r0,:64], r2
        vst1.8          {d1},      [r0,:64], r2
        bne             1b
        bx              lr
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg
/*
 * put_h264_qpel16_v_lowpass_neon_packed
 * Vertical 6-tap filter of a 16x16 block built from four 8x8 calls.  The
 * output is written with a stride of 8 bytes in the order top-left,
 * bottom-left, top-right, bottom-right.
 *   r0  destination; keeps advancing across all four calls
 *   r1  first filter tap of the first column (callers in this file pass
 *       src - 2 * stride)
 *   r3  source stride
 *   d6  constants from lowpass_const
 * Each 8x8 call leaves r1 advanced by 12 rows, hence the rewinds by 4 rows
 * (net 8 down) and by 20 rows (back to the top).
 * Clobbers r2 (set to 8), r4 (holds the return address) and whatever
 * put_h264_qpel8_v_lowpass_neon clobbers, including d8-d15.
 */
function put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  lr
        mov             r2,  #8                 /* packed destination stride */
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8            /* right half */
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        b               put_h264_qpel8_v_lowpass_neon   /* tail call */
endfunc
/*
 * h264_qpel_v_lowpass type
 * Emit the vertical half-sample filters for type = put or avg.
 * With type = avg the filtered rows are rounding-averaged with the bytes
 * already present at the destination before being stored.
 *
 * Register contract of both emitted functions:
 *   r0   destination, 8-byte aligned (accessed with :64)
 *   r1   first filter tap of the first column (callers pass
 *        src - 2 * stride)
 *   r2   destination stride
 *   r3   source stride
 *   d6   constants from lowpass_const
 * d8-d15 are overwritten without being saved; the callers in this file
 * wrap the call in vpush/vpop {d8-d15}.
 */
.macro  h264_qpel_v_lowpass type
/*
 * 16x16 as four 8x8 quadrants: top-left, bottom-left, top-right and then
 * bottom-right by FALLING THROUGH into the 8x8 function below.  The
 * return address is parked in r4, which is therefore clobbered.
 */
function \type\()_h264_qpel16_v_lowpass_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2    /* 12 rows read, net 8 down */
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r0,  r0,  r2, lsl #4    /* destination back to top */
        add             r0,  r0,  #8
        sub             r1,  r1,  r3, lsl #4    /* source back to top ... */
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8            /* ... and 8 columns right */
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
endfunc

/*
 * 8x8: 13 source rows of 8 bytes are loaded, rows 0-7 into the low halves
 * and rows 8-12 into the high halves of q4-q7 and q11-q14 (d25, d27 and
 * d29 are not loaded).  The block is transposed with transpose_8x8 from
 * neon.S, filtered with the row filter lowpass_8 and transposed back.
 * r1 is left 12 rows below its entry value, r0 8 rows below.
 */
function \type\()_h264_qpel8_v_lowpass_neon
        vld1.8          {d8},  [r1], r3
        vld1.8          {d10}, [r1], r3
        vld1.8          {d12}, [r1], r3
        vld1.8          {d14}, [r1], r3
        vld1.8          {d22}, [r1], r3
        vld1.8          {d24}, [r1], r3
        vld1.8          {d26}, [r1], r3
        vld1.8          {d28}, [r1], r3
        vld1.8          {d9},  [r1], r3
        vld1.8          {d11}, [r1], r3
        vld1.8          {d13}, [r1], r3
        vld1.8          {d15}, [r1], r3
        vld1.8          {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d10
        lowpass_8       d12, d13, d14, d15, d12, d14
        lowpass_8       d22, d23, d24, d25, d22, d24
        lowpass_8       d26, d27, d28, d29, d26, d28
        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28

  .ifc \type,avg
        vld1.8          {d9},  [r0,:64], r2
        vrhadd.u8       d8,  d8,  d9
        vld1.8          {d11}, [r0,:64], r2
        vrhadd.u8       d10, d10, d11
        vld1.8          {d13}, [r0,:64], r2
        vrhadd.u8       d12, d12, d13
        vld1.8          {d15}, [r0,:64], r2
        vrhadd.u8       d14, d14, d15
        vld1.8          {d23}, [r0,:64], r2
        vrhadd.u8       d22, d22, d23
        vld1.8          {d25}, [r0,:64], r2
        vrhadd.u8       d24, d24, d25
        vld1.8          {d27}, [r0,:64], r2
        vrhadd.u8       d26, d26, d27
        vld1.8          {d29}, [r0,:64], r2
        vrhadd.u8       d28, d28, d29
        sub             r0,  r0,  r2, lsl #3    /* back to the first row */
  .endif

        vst1.8          {d8},  [r0,:64], r2
        vst1.8          {d10}, [r0,:64], r2
        vst1.8          {d12}, [r0,:64], r2
        vst1.8          {d14}, [r0,:64], r2
        vst1.8          {d22}, [r0,:64], r2
        vst1.8          {d24}, [r0,:64], r2
        vst1.8          {d26}, [r0,:64], r2
        vst1.8          {d28}, [r0,:64], r2

        bx              lr
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg
/*
 * h264_qpel_v_lowpass_l2 type
 * Emit the vertical filters whose result is rounding-averaged with a
 * second 8-bit source, for type = put or avg.  With type = avg the value
 * is additionally rounding-averaged with the bytes already present at the
 * destination.
 *
 * Register contract of both emitted functions:
 *   r0   destination, 8-byte aligned (accessed with :64)
 *   r1   first filter tap of the first column (callers pass
 *        src - 2 * stride)
 *   r2   stride of the second source
 *   r3   stride of both the source and the destination
 *   r12  second source, 8 bytes are read per row
 *   d6   constants from lowpass_const
 * d8-d15 are overwritten without being saved; the callers in this file
 * wrap the call in vpush/vpop {d8-d15}.
 */
.macro  h264_qpel_v_lowpass_l2 type
/*
 * 16x16 as four 8x8 quadrants: top-left, bottom-left, top-right and then
 * bottom-right by FALLING THROUGH into the 8x8 function below.  The
 * return address is parked in r4, which is therefore clobbered.
 */
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2    /* 12 rows read, net 8 down */
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r0,  r0,  r3, lsl #4    /* destination back to top */
        sub             r12, r12, r2, lsl #4    /* second source back to top */
        add             r0,  r0,  #8
        add             r12, r12, #8
        sub             r1,  r1,  r3, lsl #4    /* source back to top ... */
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8            /* ... and 8 columns right */
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
endfunc

/*
 * 8x8: same load, transpose and filter scheme as the plain vertical
 * filter, except that the results are placed in d8, d9, d12, d13, d22,
 * d23, d26, d27 so that they can be averaged two rows at a time as
 * q4, q6, q11 and q13.
 * r1 is left 12 rows below its entry value, r0 and r12 8 rows below.
 */
function \type\()_h264_qpel8_v_lowpass_l2_neon
        vld1.8          {d8},  [r1], r3
        vld1.8          {d10}, [r1], r3
        vld1.8          {d12}, [r1], r3
        vld1.8          {d14}, [r1], r3
        vld1.8          {d22}, [r1], r3
        vld1.8          {d24}, [r1], r3
        vld1.8          {d26}, [r1], r3
        vld1.8          {d28}, [r1], r3
        vld1.8          {d9},  [r1], r3
        vld1.8          {d11}, [r1], r3
        vld1.8          {d13}, [r1], r3
        vld1.8          {d15}, [r1], r3
        vld1.8          {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d9
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27
        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27

        /* average with the second source, loads interleaved with the math */
        vld1.8          {d0},  [r12], r2
        vld1.8          {d1},  [r12], r2
        vld1.8          {d2},  [r12], r2
        vld1.8          {d3},  [r12], r2
        vld1.8          {d4},  [r12], r2
        vrhadd.u8       q0,  q0,  q4
        vld1.8          {d5},  [r12], r2
        vrhadd.u8       q1,  q1,  q6
        vld1.8          {d10}, [r12], r2
        vrhadd.u8       q2,  q2,  q11
        vld1.8          {d11}, [r12], r2
        vrhadd.u8       q5,  q5,  q13

  .ifc \type,avg
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d0,  d0,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d1,  d1,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d2,  d2,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d3,  d3,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d4,  d4,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d5,  d5,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d10, d10, d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d11, d11, d17
        sub             r0,  r0,  r3, lsl #3    /* back to the first row */
  .endif

        vst1.8          {d0},  [r0,:64], r3
        vst1.8          {d1},  [r0,:64], r3
        vst1.8          {d2},  [r0,:64], r3
        vst1.8          {d3},  [r0,:64], r3
        vst1.8          {d4},  [r0,:64], r3
        vst1.8          {d5},  [r0,:64], r3
        vst1.8          {d10}, [r0,:64], r3
        vst1.8          {d11}, [r0,:64], r3

        bx              lr
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg
/*
 * put_h264_qpel8_hv_lowpass_neon_top
 * Shared core of the 8x8 two-dimensional (horizontal then vertical)
 * half-sample filter.  Nothing is stored to the destination here.
 *   r1  first filter tap (callers pass src - 2 * stride - 2)
 *   r3  source stride
 *   r4  16-byte aligned scratch buffer, 16 * 12 bytes are used
 * Result: the 8 output rows as bytes in d12, d13, d14, d15, d8, d9, d10,
 * d11, in the order in which the callers store them.
 * On return r4 points at the start of the scratch buffer again, r1 is
 * 12 rows below its entry value and r12 holds -16.
 * Clobbers q0-q15 (callers save d8-d15) and the flags.  d6 is loaded here
 * and later overwritten, so the constants are not valid afterwards.
 *
 * Steps:
 *  1. 13 rows are filtered horizontally without narrowing; rows 0-11 go
 *     to the scratch buffer as 16-bit values, row 12 stays in q12.
 *  2. Rows 11 down to 0 are read back, walking r4 backwards, into
 *     q15, q10, q9, q8, q7, q6, q5, q4, q3, q2, q1, q0.
 *  3. swap4 and transpose16_4x4 (both from neon.S) rearrange the 16-bit
 *     data; presumably this is the transpose that turns columns into
 *     registers - confirm against neon.S.
 *  4. Eight Q registers are spilled to the scratch buffer because
 *     lowpass_8.16 overwrites q0-q3, q8-q10 and q15.
 *  5. lowpass_8.16 filters the pairs still in registers into d8-d11, then
 *     the spilled pairs are reloaded and filtered into d12-d15.
 *  6. transpose_8x8 brings the bytes back into row order.
 */
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   r12
        mov             r12, #12                /* rows handled by the loop */
1:      vld1.8          {d0, d1},  [r1], r3
        vld1.8          {d16,d17}, [r1], r3
        subs            r12, r12, #2
        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
        vst1.8          {d22-d25}, [r4,:128]!
        bne             1b

        /* row 12: filtered, kept in q12 */
        vld1.8          {d0, d1},  [r1]
        lowpass_8_1     d0,  d1,  q12, narrow=0

        mov             r12, #-16
        add             r4,  r4,  r12           /* last stored row */
        vld1.8          {d30,d31}, [r4,:128], r12
        vld1.8          {d20,d21}, [r4,:128], r12
        vld1.8          {d18,d19}, [r4,:128], r12
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d14,d15}, [r4,:128], r12
        vld1.8          {d12,d13}, [r4,:128], r12
        vld1.8          {d10,d11}, [r4,:128], r12
        vld1.8          {d8, d9},  [r4,:128], r12
        vld1.8          {d6, d7},  [r4,:128], r12
        vld1.8          {d4, d5},  [r4,:128], r12
        vld1.8          {d2, d3},  [r4,:128], r12
        vld1.8          {d0, d1},  [r4,:128]

        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11

        /* spill; read back below in reverse order, two registers at a time */
        vst1.8          {d30,d31}, [r4,:128]!
        vst1.8          {d6, d7},  [r4,:128]!
        vst1.8          {d20,d21}, [r4,:128]!
        vst1.8          {d4, d5},  [r4,:128]!
        vst1.8          {d18,d19}, [r4,:128]!
        vst1.8          {d2, d3},  [r4,:128]!
        vst1.8          {d16,d17}, [r4,:128]!
        vst1.8          {d0, d1},  [r4,:128]

        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11

        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128], r12
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128], r12
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128], r12
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128]
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15

        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11

        bx              lr
endfunc
/*
 * h264_qpel8_hv_lowpass type
 * Emit the 8x8 two-dimensional half-sample filter for type = put or avg.
 * The filtering is done by put_h264_qpel8_hv_lowpass_neon_top; this
 * wrapper stores its 8 result rows.  With type = avg each row is first
 * rounding-averaged with the bytes already present at the destination.
 *   r0  destination, 8-byte aligned (accessed with :64)
 *   r1  first filter tap (callers pass src - 2 * stride - 2)
 *   r2  destination stride
 *   r3  source stride
 *   r4  16-byte aligned scratch buffer of 16 * 12 bytes
 * The return address is parked in r10, which is therefore clobbered, as
 * are r12 and q0-q15 (callers save d8-d15).  r0 is left 8 rows below its
 * entry value, r1 12 rows below.
 */
.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
        vld1.8          {d0},      [r0,:64], r2
        vrhadd.u8       d12, d12, d0
        vld1.8          {d1},      [r0,:64], r2
        vrhadd.u8       d13, d13, d1
        vld1.8          {d2},      [r0,:64], r2
        vrhadd.u8       d14, d14, d2
        vld1.8          {d3},      [r0,:64], r2
        vrhadd.u8       d15, d15, d3
        vld1.8          {d4},      [r0,:64], r2
        vrhadd.u8       d8,  d8,  d4
        vld1.8          {d5},      [r0,:64], r2
        vrhadd.u8       d9,  d9,  d5
        vld1.8          {d6},      [r0,:64], r2
        vrhadd.u8       d10, d10, d6
        vld1.8          {d7},      [r0,:64], r2
        vrhadd.u8       d11, d11, d7
        sub             r0,  r0,  r2, lsl #3    /* back to the first row */
  .endif

        vst1.8          {d12},     [r0,:64], r2
        vst1.8          {d13},     [r0,:64], r2
        vst1.8          {d14},     [r0,:64], r2
        vst1.8          {d15},     [r0,:64], r2
        vst1.8          {d8},      [r0,:64], r2
        vst1.8          {d9},      [r0,:64], r2
        vst1.8          {d10},     [r0,:64], r2
        vst1.8          {d11},     [r0,:64], r2

        mov             lr,  r10
        bx              lr
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg
/*
 * h264_qpel8_hv_lowpass_l2 type
 * Emit the 8x8 two-dimensional filter whose result is rounding-averaged
 * with a second 8-bit source, for type = put or avg.  With type = avg the
 * value is additionally rounding-averaged with the bytes already present
 * at the destination.
 *   r0  destination, 8-byte aligned (accessed with :64)
 *   r1  first filter tap (callers pass src - 2 * stride - 2)
 *   r2  second source: 8 rows of 8 bytes stored back to back, 16-byte
 *       aligned; advanced by 64 bytes
 *   r3  stride of both the source and the destination
 *   r4  16-byte aligned scratch buffer of 16 * 12 bytes
 * The return address is parked in r10, which is therefore clobbered, as
 * are r12 and q0-q15 (callers save d8-d15).  r0 is left 8 rows below its
 * entry value, r1 12 rows below.
 */
.macro  h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top

        /* result rows are in d12-d15, d8-d11, i.e. q6, q7, q4, q5 */
        vld1.8          {d0, d1},  [r2,:128]!
        vld1.8          {d2, d3},  [r2,:128]!
        vrhadd.u8       q0,  q0,  q6
        vld1.8          {d4, d5},  [r2,:128]!
        vrhadd.u8       q1,  q1,  q7
        vld1.8          {d6, d7},  [r2,:128]!
        vrhadd.u8       q2,  q2,  q4
        vrhadd.u8       q3,  q3,  q5
  .ifc \type,avg
        vld1.8          {d16},     [r0,:64], r3
        vrhadd.u8       d0,  d0,  d16
        vld1.8          {d17},     [r0,:64], r3
        vrhadd.u8       d1,  d1,  d17
        vld1.8          {d18},     [r0,:64], r3
        vrhadd.u8       d2,  d2,  d18
        vld1.8          {d19},     [r0,:64], r3
        vrhadd.u8       d3,  d3,  d19
        vld1.8          {d20},     [r0,:64], r3
        vrhadd.u8       d4,  d4,  d20
        vld1.8          {d21},     [r0,:64], r3
        vrhadd.u8       d5,  d5,  d21
        vld1.8          {d22},     [r0,:64], r3
        vrhadd.u8       d6,  d6,  d22
        vld1.8          {d23},     [r0,:64], r3
        vrhadd.u8       d7,  d7,  d23
        sub             r0,  r0,  r3, lsl #3    /* back to the first row */
  .endif
        vst1.8          {d0},      [r0,:64], r3
        vst1.8          {d1},      [r0,:64], r3
        vst1.8          {d2},      [r0,:64], r3
        vst1.8          {d3},      [r0,:64], r3
        vst1.8          {d4},      [r0,:64], r3
        vst1.8          {d5},      [r0,:64], r3
        vst1.8          {d6},      [r0,:64], r3
        vst1.8          {d7},      [r0,:64], r3

        mov             lr,  r10
        bx              lr
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg
/*
 * h264_qpel16_hv type
 * Emit the 16x16 two-dimensional filters for type = put or avg.  Both
 * process the quadrants in the order top-left, bottom-left, top-right,
 * bottom-right through the matching 8x8 function and tail-call the last
 * one.  The return address is parked in r9, which is therefore clobbered
 * in addition to everything the 8x8 functions clobber (r10, r12, q0-q15).
 * Each 8x8 call leaves r1 advanced by 12 rows and r0 by 8 rows, hence the
 * rewinds by 4 rows (net 8 down) and by 20 rows (back to the top).
 */
.macro  h264_qpel16_hv  type
/*
 *   r0  destination, 8-byte aligned
 *   r1  first filter tap (callers pass src - 2 * stride - 2)
 *   r2  destination stride
 *   r3  source stride
 *   r4  16-byte aligned scratch buffer of 16 * 12 bytes
 */
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             r9,  lr
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               \type\()_h264_qpel8_hv_lowpass_neon
endfunc

/*
 *   r0  destination, 8-byte aligned
 *   r1  first filter tap (callers pass src - 2 * stride - 2)
 *   r3  stride of both the source and the destination
 *   r4  16-byte aligned scratch buffer of 16 * 12 bytes; the 256 bytes
 *       directly below it must hold the second source in the layout
 *       written by the put_h264_qpel16_*_lowpass_neon_packed functions
 * r2 is set up here and advances by 64 bytes per quadrant.
 */
function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             r9,  lr
        sub             r2,  r4,  #256          /* start of the packed data */
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r3, lsl #4
        add             r0,  r0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv  put
        h264_qpel16_hv  avg
/*
 * h264_qpel8 type
 * Emit the 15 exported 8x8 entry points ff_<type>_h264_qpel8_mcXY_neon for
 * type = put or avg.  All of them take
 *   r0  destination, 8-byte aligned (stores use :64)
 *   r1  source
 *   r2  stride, used for both source and destination
 * What each one computes, as visible from the helpers it calls:
 *   mc20 / mc02 / mc22  horizontal / vertical / two-dimensional filter
 *   mc10, mc30          average of the horizontal filter and src / src + 1
 *   mc01, mc03          average of the vertical filter and src / src + stride
 *   mc11, mc31, mc13, mc33
 *                       average of a horizontal and a vertical filter, taken
 *                       at src or shifted by one column and/or one row
 *   mc21, mc23          average of the two-dimensional filter and the
 *                       horizontal filter at src / src + stride
 *   mc12, mc32          average of the two-dimensional filter and the
 *                       vertical filter at src / src + 1
 * Several entry points only adjust registers and branch to a local label
 * inside a sibling function; the register list they push must therefore
 * match the pop at the end of that sibling.
 *
 * A and T prefix instructions that exist in two variants; the prefixes are
 * defined in asm.S, presumably selecting the ARM or the Thumb encoding
 * (the T variant avoids using sp as a bic operand) - confirm against asm.S.
 */
.macro  h264_qpel8      type
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1                 /* second source = src */
        sub             r1,  r1,  #2            /* first filter tap */
        mov             r12, #8                 /* row count */
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2                 /* destination stride */
        mov             r12, #8
        b               \type\()_h264_qpel8_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1            /* second source = src + 1 */
        sub             r1,  r1,  #2
        mov             r12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc01_neon, export=1
        push            {lr}
        mov             r12, r1                 /* second source = src */
/* shared with mc03, which arrives with lr pushed and r12 set */
\type\()_h264_qpel8_mc01:
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1    /* first filter tap */
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {pc}
endfunc

function ff_\type\()_h264_qpel8_mc11_neon, export=1
        push            {r0, r1, r11, lr}
/*
 * shared with mc31, mc13 and mc33.  On arrival the stack holds dst and the
 * source for the vertical pass, r1 is the source for the horizontal pass.
 */
\type\()_h264_qpel8_mc11:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #64           /* 8x8 temporary */
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #8                 /* temporary stride */
        mov             r12, #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        ldrd            r0,  r1,  [r11], #8     /* reload dst and src */
        mov             r3,  r2
        add             r12, sp,  #64           /* temporary, above d8-d15 */
        sub             r1,  r1,  r2, lsl #1
        mov             r2,  #8                 /* stride of the temporary */
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc21_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
/* shared with mc23, which moves r1 one row down for the horizontal pass */
\type\()_h264_qpel8_mc21:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(8*8+16*12)  /* temporary + hv scratch */
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             r0,  sp
        mov             r12, #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        mov             r4,  r0                 /* scratch follows temporary */
        ldrd            r0,  r1,  [r11], #8     /* reload dst and src */
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             r2,  r4,  #64           /* second source = temporary */
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r11, lr}       /* vertical pass at src + 1 */
        sub             r1,  r1,  #1            /* horizontal pass at src */
        b               \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc02_neon, export=1
        push            {lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {pc}
endfunc

function ff_\type\()_h264_qpel8_mc12_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
/* shared with mc32, which moves r1 one column right for the vertical pass */
\type\()_h264_qpel8_mc12:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(8*8+16*12)  /* temporary + hv scratch */
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        mov             r2,  #8                 /* temporary stride */
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        mov             r4,  r0                 /* scratch follows temporary */
        ldrd            r0,  r1,  [r11], #8     /* reload dst and src */
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        sub             r2,  r4,  #64           /* second source = temporary */
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
endfunc

/* no lowpass_const here: the hv core loads the constants itself */
function ff_\type\()_h264_qpel8_mc22_neon, export=1
        push            {r4, r10, r11, lr}
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r4,  r11, #15
T       mov             sp,  r4
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)      /* hv scratch */
        mov             r4,  sp
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc32_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  #1
        b               \type\()_h264_qpel8_mc12
endfunc

function ff_\type\()_h264_qpel8_mc03_neon, export=1
        push            {lr}
        add             r12, r1,  r2            /* second source = next row */
        b               \type\()_h264_qpel8_mc01
endfunc

function ff_\type\()_h264_qpel8_mc13_neon, export=1
        push            {r0, r1, r11, lr}
        add             r1,  r1,  r2            /* horizontal pass one row down */
        b               \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc23_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel8_mc21
endfunc

function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r11, lr}       /* vertical pass at src + 1 */
        add             r1,  r1,  r2
        sub             r1,  r1,  #1            /* horizontal: src + stride */
        b               \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8 put
        h264_qpel8 avg
/*
 * h264_qpel16 type
 * Emit the 15 exported 16x16 entry points ff_<type>_h264_qpel16_mcXY_neon
 * for type = put or avg.  All of them take
 *   r0  destination, 8-byte aligned (stores use :64)
 *   r1  source
 *   r2  stride, used for both source and destination
 * What each one computes, as visible from the helpers it calls:
 *   mc20 / mc02 / mc22  horizontal / vertical / two-dimensional filter
 *   mc10, mc30          average of the horizontal filter and src / src + 1
 *   mc01, mc03          average of the vertical filter and src / src + stride
 *   mc11, mc31, mc13, mc33
 *                       average of a horizontal and a vertical filter, taken
 *                       at src or shifted by one column and/or one row
 *   mc21, mc23          average of the two-dimensional filter and the
 *                       horizontal filter at src / src + stride
 *   mc12, mc32          average of the two-dimensional filter and the
 *                       vertical filter at src / src + 1
 * r4 and r9 are saved where the 16x16 helpers use them to hold return
 * addresses.  Several entry points only adjust registers and branch to a
 * local label inside a sibling function; the register list they push must
 * therefore match the pop at the end of that sibling.
 *
 * A and T prefix instructions that exist in two variants; the prefixes are
 * defined in asm.S, presumably selecting the ARM or the Thumb encoding -
 * confirm against asm.S.
 */
.macro  h264_qpel16     type
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1                 /* second source = src */
        sub             r1,  r1,  #2            /* first filter tap */
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2                 /* destination stride */
        b               \type\()_h264_qpel16_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1            /* second source = src + 1 */
        sub             r1,  r1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel16_mc01_neon, export=1
        push            {r4, lr}
        mov             r12, r1                 /* second source = src */
/* shared with mc03, which arrives with r4, lr pushed and r12 set */
\type\()_h264_qpel16_mc01:
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1    /* first filter tap */
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_h264_qpel16_mc11_neon, export=1
        push            {r0, r1, r4, r11, lr}
/*
 * shared with mc31, mc13 and mc33.  On arrival the stack holds dst and the
 * source for the vertical pass, r1 is the source for the horizontal pass.
 */
\type\()_h264_qpel16_mc11:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #256          /* 16x16 temporary */
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #16                /* temporary stride */
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon
        ldrd            r0,  r1,  [r11], #8     /* reload dst and src */
        mov             r3,  r2
        add             r12, sp,  #64           /* temporary, above d8-d15 */
        sub             r1,  r1,  r2, lsl #1
        mov             r2,  #16                /* stride of the temporary */
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc21_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
/* shared with mc23, which moves r1 one row down for the horizontal pass */
\type\()_h264_qpel16_mc21:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(16*16+16*12) /* packed temp + hv scratch */
        sub             r1,  r1,  #2
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  r0                 /* scratch follows temporary */
        ldrd            r0,  r1,  [r11], #8     /* reload dst and src */
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4-r5, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, r11, lr}   /* vertical pass at src + 1 */
        sub             r1,  r1,  #1            /* horizontal pass at src */
        b               \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc02_neon, export=1
        push            {r4, lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_h264_qpel16_mc12_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
/* shared with mc32, which moves r1 one column right for the vertical pass */
\type\()_h264_qpel16_mc12:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(16*16+16*12) /* packed temp + hv scratch */
        sub             r1,  r1,  r2, lsl #1
        mov             r0,  sp
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  r0                 /* scratch follows temporary */
        ldrd            r0,  r1,  [r11], #8     /* reload dst and src */
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        mov             r2,  r3
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4-r5, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc22_neon, export=1
        push            {r4, r9-r11, lr}
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r4,  r11, #15
T       mov             sp,  r4
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)      /* hv scratch */
        mov             r4,  sp
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc32_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc12
endfunc

function ff_\type\()_h264_qpel16_mc03_neon, export=1
        push            {r4, lr}
        add             r12, r1,  r2            /* second source = next row */
        b               \type\()_h264_qpel16_mc01
endfunc

function ff_\type\()_h264_qpel16_mc13_neon, export=1
        push            {r0, r1, r4, r11, lr}
        add             r1,  r1,  r2            /* horizontal pass one row down */
        b               \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc23_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel16_mc21
endfunc

function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, r11, lr}   /* vertical pass at src + 1 */
        add             r1,  r1,  r2
        sub             r1,  r1,  #1            /* horizontal: src + stride */
        b               \type\()_h264_qpel16_mc11
endfunc
.endm

        h264_qpel16 put
        h264_qpel16 avg