/*
 * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

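/* One row of the RV40 6-tap quarter-pel lowpass filter, 8 pixels at a time:
 *   dst[i] = clip8((rc1*src[i] + rc2*src[i+1] - 5*(src[i-1] + src[i+2])
 *                   + src[i-2] + src[i+3] + (1 << (shift-1))) >> shift)
 * \r0:\r1 hold 16 consecutive source bytes starting at src[i-2]; the callers
 * set \rc1/\rc2 to 20 or 52 and \shift to 5 or 6 per subpel position. */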
.macro  qpel_lowpass    r0,  r1,  rc1, rc2, shift
        vext.8          d25, \r0, \r1, #1       @ src[-1]
        vext.8          d26, \r0, \r1, #4       @ src[ 2]
        vext.8          d24, \r0, \r1, #5       @ src[ 3]
        vaddl.u8        q9,  d25, d26
        vaddl.u8        q8,  \r0, d24
        vext.8          d27, \r0, \r1, #2       @ src[ 0]
        vshl.s16        q12, q9,  #2
        vsub.s16        q8,  q8,  q9
        vext.8          d28, \r0, \r1, #3       @ src[ 1]
        vsub.s16        q8,  q8,  q12
        vmlal.u8        q8,  d27, \rc1
        vmlal.u8        q8,  d28, \rc2
        vqrshrun.s16    \r0, q8,  #\shift
.endm

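/* As qpel_lowpass, but filtering two independent rows (\r0:\r1 and \r2:\r3)
 * with the arithmetic interleaved to hide instruction latencies. */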
.macro  qpel_lowpass_x2 r0,  r1,  r2,  r3,  rc1, rc2, shift
        vext.8          d25, \r0, \r1, #1       @ src[-1]
        vext.8          d26, \r0, \r1, #4       @ src[ 2]
        vext.8          d24, \r0, \r1, #5       @ src[ 3]
        vaddl.u8        q9,  d25, d26
        vaddl.u8        q8,  \r0, d24
        vext.8          d29, \r0, \r1, #2       @ src[ 0]
        vext.8          d28, \r0, \r1, #3       @ src[ 1]
        vshl.s16        q10, q9,  #2
        vext.8          \r1, \r2, \r3, #1       @ src[-1]
        vsub.s16        q8,  q8,  q9
        vext.8          d22, \r2, \r3, #4       @ src[ 2]
        vext.8          \r0, \r2, \r3, #5       @ src[ 3]
        vaddl.u8        q13, \r1, d22
        vaddl.u8        q12, \r2, \r0
        vsub.s16        q8,  q8,  q10
        vshl.s16        q9,  q13, #2
        vsub.s16        q12, q12, q13
        vmlal.u8        q8,  d29, \rc1
        vmlal.u8        q8,  d28, \rc2
        vsub.s16        q12, q12, q9
        vext.8          d26, \r2, \r3, #2       @ src[ 0]
        vext.8          d27, \r2, \r3, #3       @ src[ 1]
        vmlal.u8        q12, d26, \rc1
        vmlal.u8        q12, d27, \rc2
        vqrshrun.s16    \r0, q8,  #\shift
        vqrshrun.s16    \r2, q12, #\shift
.endm

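/* Horizontal pass for the two-pass positions: filters r3+1 rows of 8 pixels
 * from r1 (stride r2) and stores them packed, 8 bytes per row, into the
 * 8-byte-aligned scratch buffer addressed by r12. */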
.macro  rv40_qpel8_h    shift
function put_rv40_qpel8_h_lp_packed_s\shift\()_neon
1:
        vld1.8          {q2},     [r1], r2
        vld1.8          {q3},     [r1], r2
        qpel_lowpass_x2 d4,  d5,  d6,  d7,  d0,  d1,  \shift
        vst1.8          {d4},     [r12,:64]!
        vst1.8          {d6},     [r12,:64]!
        subs            r3,  r3,  #2
        bgt             1b
        vld1.8          {q2},     [r1]
        qpel_lowpass    d4,  d5,  d0,  d1,  \shift
        vst1.8          {d4},     [r12,:64]!
        bx              lr
endfunc
.endm

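/* Vertical pass for the two-pass positions: reads 13 packed rows of 8 bytes
 * from the scratch buffer at r1, transposes them so the lowpass can run
 * along the columns, transposes back and writes (or, for the avg variant,
 * averages into) an 8x8 block at r0 with stride r2. */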
.macro  rv40_qpel8_v    shift, type
function \type\()_rv40_qpel8_v_lp_packed_s\shift\()_neon
        vld1.64         {d2},     [r1,:64]!
        vld1.64         {d3},     [r1,:64]!
        vld1.64         {d4},     [r1,:64]!
        vld1.64         {d5},     [r1,:64]!
        vld1.64         {d6},     [r1,:64]!
        vld1.64         {d7},     [r1,:64]!
        vld1.64         {d8},     [r1,:64]!
        vld1.64         {d9},     [r1,:64]!
        vld1.64         {d10},    [r1,:64]!
        vld1.64         {d11},    [r1,:64]!
        vld1.64         {d12},    [r1,:64]!
        vld1.64         {d13},    [r1,:64]!
        vld1.64         {d14},    [r1,:64]!
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
        transpose_8x8   d10, d11, d12, d13, d14, d15, d30, d31
        qpel_lowpass_x2 d2,  d10, d3,  d11, d0,  d1,  \shift
        qpel_lowpass_x2 d4,  d12, d5,  d13, d0,  d1,  \shift
        qpel_lowpass_x2 d6,  d14, d7,  d15, d0,  d1,  \shift
        qpel_lowpass_x2 d8,  d30, d9,  d31, d0,  d1,  \shift
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
  .ifc \type,avg
        vld1.64         d12,      [r0,:64], r2
        vld1.64         d13,      [r0,:64], r2
        vld1.64         d14,      [r0,:64], r2
        vld1.64         d15,      [r0,:64], r2
        vld1.64         d16,      [r0,:64], r2
        vld1.64         d17,      [r0,:64], r2
        vld1.64         d18,      [r0,:64], r2
        vld1.64         d19,      [r0,:64], r2
        sub             r0,  r0,  r2,  lsl #3
        vrhadd.u8       q1,  q1,  q6
        vrhadd.u8       q2,  q2,  q7
        vrhadd.u8       q3,  q3,  q8
        vrhadd.u8       q4,  q4,  q9
  .endif
        vst1.64         d2,       [r0,:64], r2
        vst1.64         d3,       [r0,:64], r2
        vst1.64         d4,       [r0,:64], r2
        vst1.64         d5,       [r0,:64], r2
        vst1.64         d6,       [r0,:64], r2
        vst1.64         d7,       [r0,:64], r2
        vst1.64         d8,       [r0,:64], r2
        vst1.64         d9,       [r0,:64], r2
        bx              lr
endfunc
.endm

        rv40_qpel8_h    5
        rv40_qpel8_h    6

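/* Expands to the full set of exported put_/avg_ motion compensation entry
 * points. d0/d1 carry the centre filter coefficients for the current subpel
 * position (52/20, 20/52 or 20/20); purely horizontal or vertical positions
 * call a single lowpass helper, while the mixed positions go through the
 * packed scratch-buffer pair above. */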
.macro  rv40_qpel       type
function \type\()_rv40_qpel8_h_lowpass_neon
  .ifc \type,avg
        mov             r12, r0
  .endif
1:
        vld1.8          {q2},     [r1], r2
        vld1.8          {q3},     [r1], r2
        qpel_lowpass_x2 d4,  d5,  d6,  d7,  d0,  d1,  6
  .ifc \type,avg
        vld1.8          {d3},     [r12,:64], r2
        vld1.8          {d16},    [r12,:64], r2
        vrhadd.u8       d4,  d4,  d3
        vrhadd.u8       d6,  d6,  d16
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d6},     [r0,:64], r2
        subs            r3,  r3,  #2
        bgt             1b
        bx              lr
endfunc

function \type\()_rv40_qpel8_v_lowpass_neon
        vld1.64         {d2},     [r1], r2
        vld1.64         {d3},     [r1], r2
        vld1.64         {d4},     [r1], r2
        vld1.64         {d5},     [r1], r2
        vld1.64         {d6},     [r1], r2
        vld1.64         {d7},     [r1], r2
        vld1.64         {d8},     [r1], r2
        vld1.64         {d9},     [r1], r2
        vld1.64         {d10},    [r1], r2
        vld1.64         {d11},    [r1], r2
        vld1.64         {d12},    [r1], r2
        vld1.64         {d13},    [r1], r2
        vld1.64         {d14},    [r1]
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
        transpose_8x8   d10, d11, d12, d13, d14, d15, d30, d31
        qpel_lowpass_x2 d2,  d10, d3,  d11, d0,  d1,  6
        qpel_lowpass_x2 d4,  d12, d5,  d13, d0,  d1,  6
        qpel_lowpass_x2 d6,  d14, d7,  d15, d0,  d1,  6
        qpel_lowpass_x2 d8,  d30, d9,  d31, d0,  d1,  6
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
  .ifc \type,avg
        vld1.64         d12,      [r0,:64], r2
        vld1.64         d13,      [r0,:64], r2
        vld1.64         d14,      [r0,:64], r2
        vld1.64         d15,      [r0,:64], r2
        vld1.64         d16,      [r0,:64], r2
        vld1.64         d17,      [r0,:64], r2
        vld1.64         d18,      [r0,:64], r2
        vld1.64         d19,      [r0,:64], r2
        sub             r0,  r0,  r2,  lsl #3
        vrhadd.u8       q1,  q1,  q6
        vrhadd.u8       q2,  q2,  q7
        vrhadd.u8       q3,  q3,  q8
        vrhadd.u8       q4,  q4,  q9
  .endif
        vst1.64         d2,       [r0,:64], r2
        vst1.64         d3,       [r0,:64], r2
        vst1.64         d4,       [r0,:64], r2
        vst1.64         d5,       [r0,:64], r2
        vst1.64         d6,       [r0,:64], r2
        vst1.64         d7,       [r0,:64], r2
        vst1.64         d8,       [r0,:64], r2
        vst1.64         d9,       [r0,:64], r2
        bx              lr
endfunc

        rv40_qpel8_v    5, \type
        rv40_qpel8_v    6, \type

function ff_\type\()_rv40_qpel8_mc10_neon, export=1
        sub             r1,  r1,  #2
        mov             r3,  #8
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        b               \type\()_rv40_qpel8_h_lowpass_neon
endfunc

function ff_\type\()_rv40_qpel8_mc30_neon, export=1
        sub             r1,  r1,  #2
        mov             r3,  #8
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        b               \type\()_rv40_qpel8_h_lowpass_neon
endfunc

function ff_\type\()_rv40_qpel8_mc01_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             r1,  r1,  r2,  lsl #1
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

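/* Mixed (hv) 8x8 positions: the horizontal pass filters 13 rows (r3 = 12,
 * plus the trailing row) into a 14*8-byte scratch buffer aligned to 8 bytes
 * on the stack, then the matching vertical pass reads it back and produces
 * the final 8x8 block. */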
function ff_\type\()_rv40_qpel8_mc11_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc21_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vmov.i8         d0,  #52
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc31_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vswp            d0,  d1
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc12_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vmov.i8         d0,  #20
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc22_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc32_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vmov.i8         d1,  #20
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc03_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             r1,  r1,  r2,  lsl #1
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc33_neon, export=1
        mov             r3,  #8
        b               X(ff_\type\()_pixels8_xy2_neon)
endfunc

function ff_\type\()_rv40_qpel8_mc13_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vswp            d0,  d1
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_rv40_qpel8_mc23_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vmov.i8         d1,  #52
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

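/* The 16x16 positions reuse the 8-pixel-wide helpers: pure-horizontal cases
 * run two 8-wide column passes of 16 rows, the others assemble the block
 * from four 8x8 calls. The hv cases first filter two columns of 21 rows
 * into a 44*8-byte aligned scratch buffer on the stack. */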
function ff_\type\()_rv40_qpel16_mc10_neon, export=1
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
.L\type\()_rv40_qpel16_h:
        push            {r1, lr}
        sub             r1,  r1,  #2
        mov             r3,  #16
        bl              \type\()_rv40_qpel8_h_lowpass_neon
        pop             {r1, lr}
        sub             r0,  r0,  r2,  lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #6
        mov             r3,  #16
        b               \type\()_rv40_qpel8_h_lowpass_neon
endfunc

function ff_\type\()_rv40_qpel16_mc30_neon, export=1
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        b               .L\type\()_rv40_qpel16_h
endfunc

function ff_\type\()_rv40_qpel16_mc01_neon, export=1
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
.L\type\()_rv40_qpel16_v:
        sub             r1,  r1,  r2,  lsl #1
        push            {r1, lr}
        vpush           {d8-d15}
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        sub             r1,  r1,  r2,  lsl #2
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        ldr             r1,  [sp, #64]
        sub             r0,  r0,  r2,  lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        sub             r1,  r1,  r2,  lsl #2
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r1, pc}
endfunc

function ff_\type\()_rv40_qpel16_mc11_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
.L\type\()_rv40_qpel16_v_s6:
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        sub             r1,  r1,  #40
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        sub             r0,  r0,  r2,  lsl #4
        add             r0,  r0,  #8
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        sub             r1,  r1,  #40
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #44*8
        vpop            {d8-d15}
        pop             {r1, pc}
endfunc

function ff_\type\()_rv40_qpel16_mc21_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        vmov.i8         d0,  #52
        b               .L\type\()_rv40_qpel16_v_s6
endfunc

function ff_\type\()_rv40_qpel16_mc31_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        vswp            d0,  d1
        b               .L\type\()_rv40_qpel16_v_s6
endfunc

function ff_\type\()_rv40_qpel16_mc12_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        vmov.i8         d0,  #20
.L\type\()_rv40_qpel16_v_s5:
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        sub             r1,  r1,  #40
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        sub             r0,  r0,  r2,  lsl #4
        add             r0,  r0,  #8
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        sub             r1,  r1,  #40
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        add             sp,  sp,  #44*8
        vpop            {d8-d15}
        pop             {r1, pc}
endfunc

function ff_\type\()_rv40_qpel16_mc22_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        b               .L\type\()_rv40_qpel16_v_s5
endfunc

function ff_\type\()_rv40_qpel16_mc32_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        vmov.i8         d1,  #20
        b               .L\type\()_rv40_qpel16_v_s5
endfunc

function ff_\type\()_rv40_qpel16_mc03_neon, export=1
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        b               .L\type\()_rv40_qpel16_v
endfunc

function ff_\type\()_rv40_qpel16_mc13_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        vswp            d0,  d1
        b               .L\type\()_rv40_qpel16_v_s6
endfunc

function ff_\type\()_rv40_qpel16_mc23_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        vmov.i8         d1,  #52
        b               .L\type\()_rv40_qpel16_v_s6
endfunc

function ff_\type\()_rv40_qpel16_mc33_neon, export=1
        mov             r3,  #16
        b               X(ff_\type\()_pixels16_xy2_neon)
endfunc
.endm

        rv40_qpel       put
        rv40_qpel       avg

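/* RV40 weighted prediction for 16 pixels held in q1 (from src1) and q2
 * (from src2), with the 16-bit weights in d0[0] (w1) and d0[2] (w2):
 *   dst[i] = ((w2*src1[i] >> 9) + (w1*src2[i] >> 9) + 16) >> 5
 * The result is left in q1. */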
.macro  rv40_weight
        vmovl.u8        q8,  d2
        vmovl.u8        q9,  d3
        vmovl.u8        q10, d4
        vmovl.u8        q11, d5
        vmull.u16       q2,  d16, d0[2]
        vmull.u16       q3,  d17, d0[2]
        vmull.u16       q8,  d18, d0[2]
        vmull.u16       q9,  d19, d0[2]
        vmull.u16       q12, d20, d0[0]
        vmull.u16       q13, d21, d0[0]
        vmull.u16       q14, d22, d0[0]
        vmull.u16       q15, d23, d0[0]
        vshrn.i32       d4,  q2,  #9
        vshrn.i32       d5,  q3,  #9
        vshrn.i32       d6,  q8,  #9
        vshrn.i32       d7,  q9,  #9
        vshrn.i32       d16, q12, #9
        vshrn.i32       d17, q13, #9
        vshrn.i32       d18, q14, #9
        vshrn.i32       d19, q15, #9
        vadd.u16        q2,  q2,  q8
        vadd.u16        q3,  q3,  q9
        vrshrn.i16      d2,  q2,  #5
        vrshrn.i16      d3,  q3,  #5
.endm

/* void ff_rv40_weight_func_16_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                    int w1, int w2, int stride) */
function ff_rv40_weight_func_16_neon, export=1
        ldr             r12, [sp]
        vmov            d0,  r3,  r12
        ldr             r12, [sp, #4]
        mov             r3,  #16
1:
        vld1.8          {q1},     [r1,:128], r12
        vld1.8          {q2},     [r2,:128], r12
        rv40_weight
        vst1.8          {q1},     [r0,:128], r12
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

/* void ff_rv40_weight_func_8_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                   int w1, int w2, int stride) */
function ff_rv40_weight_func_8_neon, export=1
        ldr             r12, [sp]
        vmov            d0,  r3,  r12
        ldr             r12, [sp, #4]
        mov             r3,  #8
1:
        vld1.8          {d2},     [r1,:64], r12
        vld1.8          {d3},     [r1,:64], r12
        vld1.8          {d4},     [r2,:64], r12
        vld1.8          {d5},     [r2,:64], r12
        rv40_weight
        vst1.8          {d2},     [r0,:64], r12
        vst1.8          {d3},     [r0,:64], r12
        subs            r3,  r3,  #2
        bne             1b
        bx              lr
endfunc

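/* Loop-filter strength decision for a horizontal edge: r0 = src, r1 = stride,
 * r2 = beta, r3 = beta2; the stack carries an edge flag and two int pointers
 * that receive the p1/q1 filter decisions. Sums of 4 pixels per row on both
 * sides of the edge are compared against the beta thresholds, and when the
 * edge flag is set the combined strong-filter decision is returned in r0. */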
function ff_rv40_h_loop_filter_strength_neon, export=1
        pkhbt           r2,  r3,  r2,  lsl #18

        ldr             r3,  [r0]
        ldr_dpre        r12, r0,  r1
        teq             r3,  r12
        beq             1f

        sub             r0,  r0,  r1,  lsl #1

        vld1.32         {d4[]},   [r0,:32], r1    @ -3
        vld1.32         {d0[]},   [r0,:32], r1    @ -2
        vld1.32         {d4[1]},  [r0,:32], r1    @ -1
        vld1.32         {d5[]},   [r0,:32], r1    @  0
        vld1.32         {d1[]},   [r0,:32], r1    @  1
        vld1.32         {d5[0]},  [r0,:32], r1    @  2

        vpaddl.u8       q8,  q0                   @ -2, -2, -2, -2,  1,  1,  1,  1
        vpaddl.u8       q9,  q2                   @ -3, -3, -1, -1,  2,  2,  0,  0
        vdup.32         d30, r2                   @ beta2, beta << 2
        vpadd.u16       d16, d16, d17             @ -2, -2,  1,  1
        vpadd.u16       d18, d18, d19             @ -3, -1,  2,  0
        vabd.u16        d16, d18, d16
        vclt.u16        d16, d16, d30

        ldrd            r2,  r3,  [sp, #4]
        vmovl.u16       q12, d16
        vtrn.16         d16, d17
        vshr.u32        q12, q12, #15
        ldr             r0,  [sp]
        vst1.32         {d24[1]}, [r2,:32]
        vst1.32         {d25[1]}, [r3,:32]

        cmp             r0,  #0
        it              eq
        bxeq            lr

        vand            d18, d16, d17
        vtrn.32         d18, d19
        vand            d18, d18, d19
        vmov.u16        r0,  d18[0]
        bx              lr
1:
        ldrd            r2,  r3,  [sp, #4]
        mov             r0,  #0
        str             r0,  [r2]
        str             r0,  [r3]
        bx              lr
endfunc

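/* Same decision for a vertical edge: four 8-pixel rows straddling the edge
 * are loaded and the sums are taken down each column instead of across
 * each row. */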
function ff_rv40_v_loop_filter_strength_neon, export=1
        sub             r0,  r0,  #3
        pkhbt           r2,  r3,  r2,  lsl #18

        vld1.8          {d0},     [r0], r1
        vld1.8          {d1},     [r0], r1
        vld1.8          {d2},     [r0], r1
        vld1.8          {d3},     [r0], r1

        vaddl.u8        q0,  d0,  d1
        vaddl.u8        q1,  d2,  d3
        vdup.32         q15, r2
        vadd.u16        q0,  q0,  q1              @ -3, -2, -1,  0,  1,  2
        vext.16         q1,  q0,  q0,  #1         @ -2, -1,  0,  1,  2
        vabd.u16        q0,  q1,  q0
        vclt.u16        q0,  q0,  q15

        ldrd            r2,  r3,  [sp, #4]
        vmovl.u16       q1,  d0
        vext.16         d1,  d0,  d1,  #3
        vshr.u32        q1,  q1,  #15
        ldr             r0,  [sp]
        vst1.32         {d2[1]},  [r2,:32]
        vst1.32         {d3[1]},  [r3,:32]

        cmp             r0,  #0
        it              eq
        bxeq            lr

        vand            d0,  d0,  d1
        vtrn.16         d0,  d1
        vand            d0,  d0,  d1
        vmov.u16        r0,  d0[0]
        bx              lr
endfunc

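/* RV40 weak deblocking filter for a 4-pixel edge segment. The six lines
 * straddling the edge (-3..2, 4 pixels each) arrive packed in d0/d1/d4/d5,
 * r2/r3 hold filter_p1/filter_q1, and alpha, beta and the lim_* clipping
 * limits come from the stack (see the register comments below). The
 * filtered lines -2..1 are returned in d4/d5. */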
.macro  rv40_weak_loop_filter
        vdup.16         d30, r2                   @ filter_p1
        vdup.16         d31, r3                   @ filter_q1
        ldrd            r2,  r3,  [sp]
        vdup.16         d28, r2                   @ alpha
        vdup.16         d29, r3                   @ beta
        ldr             r12, [sp, #8]
        vdup.16         d25, r12                  @ lim_p0q0
        ldrd            r2,  r3,  [sp, #12]
        vsubl.u8        q9,  d5,  d4              @ x, t
        vabdl.u8        q8,  d5,  d4              @ x, abs(t)
        vneg.s16        q15, q15
        vceq.i16        d16, d19, #0              @ !t
        vshl.s16        d19, d19, #2              @ t << 2
        vmul.u16        d18, d17, d28             @ alpha * abs(t)
        vand            d24, d30, d31             @ filter_p1 & filter_q1
        vsubl.u8        q1,  d0,  d4              @ p1p2, p1p0
        vsubl.u8        q3,  d1,  d5              @ q1q2, q1q0
        vmov.i16        d22, #3
        vshr.u16        d18, d18, #7
        vadd.i16        d22, d22, d24             @ 3 - (filter_p1 & filter_q1)
        vsubl.u8        q10, d0,  d1              @ src[-2] - src[1]
        vcle.u16        d18, d18, d22
        vand            d20, d20, d24
        vneg.s16        d23, d25                  @ -lim_p0q0
        vadd.s16        d19, d19, d20
        vbic            d16, d18, d16             @ t && u <= 3 - (fp1 & fq1)
        vtrn.32         d4,  d5                   @ -3,  2, -1,  0
        vrshr.s16       d19, d19, #3
        vmov            d28, d29                  @ beta
        vswp            d3,  d6                   @ q1q2, p1p0
        vmin.s16        d19, d19, d25
        vand            d30, d30, d16
        vand            d31, d31, d16
        vadd.s16        q10, q1,  q3              @ p1p2 + p1p0, q1q2 + q1q0
        vmax.s16        d19, d19, d23             @ diff
        vabs.s16        q1,  q1                   @ abs(p1p2), abs(q1q2)
        vand            d18, d19, d16             @ diff
        vcle.u16        q1,  q1,  q14
        vneg.s16        d19, d18                  @ -diff
        vdup.16         d26, r3                   @ lim_p1
        vaddw.u8        q2,  q9,  d5              @ src[-1]+diff, src[0]-diff
        vhsub.s16       q11, q10, q9
        vand            q1,  q1,  q15
        vqmovun.s16     d4,  q2                   @ -1,  0
        vand            q9,  q11, q1
        vdup.16         d27, r2                   @ lim_q1
        vneg.s16        q9,  q9
        vneg.s16        q14, q13
        vmin.s16        q9,  q9,  q13
        vtrn.32         d0,  d1                   @ -2,  1, -2,  1
        vmax.s16        q9,  q9,  q14
        vaddw.u8        q3,  q9,  d0
        vqmovun.s16     d5,  q3                   @ -2,  1
.endm

function ff_rv40_h_weak_loop_filter_neon, export=1
        sub             r0,  r0,  r1,  lsl #1
        sub             r0,  r0,  r1

        vld1.32         {d4[]},   [r0,:32], r1
        vld1.32         {d0[]},   [r0,:32], r1
        vld1.32         {d4[1]},  [r0,:32], r1
        vld1.32         {d5[]},   [r0,:32], r1
        vld1.32         {d1[]},   [r0,:32], r1
        vld1.32         {d5[0]},  [r0,:32]

        sub             r0,  r0,  r1,  lsl #2

        rv40_weak_loop_filter

        vst1.32         {d5[0]},  [r0,:32], r1
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        vst1.32         {d5[1]},  [r0,:32], r1

        bx              lr
endfunc

function ff_rv40_v_weak_loop_filter_neon, export=1
        sub             r12, r0,  #3
        sub             r0,  r0,  #2

        vld1.8          {d4},     [r12], r1
        vld1.8          {d5},     [r12], r1
        vld1.8          {d2},     [r12], r1
        vld1.8          {d3},     [r12], r1

        vtrn.16         q2,  q1
        vtrn.8          d4,  d5
        vtrn.8          d2,  d3

        vrev64.32       d5,  d5
        vtrn.32         q2,  q1
        vdup.32         d0,  d3[0]
        vdup.32         d1,  d2[0]

        rv40_weak_loop_filter

        vtrn.32         q2,  q3
        vswp            d4,  d5

        vst4.8          {d4[0],d5[0],d6[0],d7[0]}, [r0], r1
        vst4.8          {d4[1],d5[1],d6[1],d7[1]}, [r0], r1
        vst4.8          {d4[2],d5[2],d6[2],d7[2]}, [r0], r1
        vst4.8          {d4[3],d5[3],d6[3],d7[3]}, [r0], r1

        bx              lr
endfunc