@ Imported Debian version 2.4.3~trusty1
@ [deb_ffmpeg.git] / ffmpeg / libavcodec / arm / vp8dsp_neon.S
@ (git web-view export metadata: CommitLineData 2ba45a60 DM)
/*
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"
26
@ Inverse 4x4 Walsh-Hadamard transform of the luma DC coefficients.
@ In:   r0 = block  (output: one s16 written per 16-coeff sub-block,
@                    hence the 32-byte store stride)
@       r1 = dc     (4x4 input DC block; cleared to zero here)
@ Clobbers: r3, q0-q3, q8, q15
function ff_vp8_luma_dc_wht_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        vmov.i16        q15, #0

        @ first pass: butterflies on the rows (q0/q1 hold rows 0-3)
        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vst1.16         {q15},    [r1,:128]!    @ clear the input block
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vst1.16         {q15},    [r1,:128]
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vmov.i16        q8,  #3                 @ rounding bias for the final >>3

        @ transpose so the second pass operates on columns
        vtrn.32         d0,  d2
        vtrn.32         d1,  d3
        vtrn.16         d0,  d1
        vtrn.16         d2,  d3

        vadd.i16        d0,  d0,  d16           @ fold the +3 rounding into row 0

        @ second pass: same butterflies on the columns
        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vshr.s16        q0,  q0,  #3            @ (x + 3) >> 3
        vshr.s16        q1,  q1,  #3

        @ scatter the 16 results, one per sub-block (16 coeffs * 2 bytes apart)
        mov             r3,  #32
        vst1.16         {d0[0]},  [r0,:16], r3
        vst1.16         {d1[0]},  [r0,:16], r3
        vst1.16         {d2[0]},  [r0,:16], r3
        vst1.16         {d3[0]},  [r0,:16], r3
        vst1.16         {d0[1]},  [r0,:16], r3
        vst1.16         {d1[1]},  [r0,:16], r3
        vst1.16         {d2[1]},  [r0,:16], r3
        vst1.16         {d3[1]},  [r0,:16], r3
        vst1.16         {d0[2]},  [r0,:16], r3
        vst1.16         {d1[2]},  [r0,:16], r3
        vst1.16         {d2[2]},  [r0,:16], r3
        vst1.16         {d3[2]},  [r0,:16], r3
        vst1.16         {d0[3]},  [r0,:16], r3
        vst1.16         {d1[3]},  [r0,:16], r3
        vst1.16         {d2[3]},  [r0,:16], r3
        vst1.16         {d3[3]},  [r0,:16], r3

        bx              lr
endfunc
79
@ 4x4 inverse transform; the result is added to dst with unsigned saturation.
@ In:   r0 = dst, r1 = block (16 s16 coeffs, cleared after use), r2 = stride
@ Clobbers: r3, q0-q2, q8-q13, q15
function ff_vp8_idct_add_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        movw            r3,  #20091             @ VP8 idct multiplier 20091/65536
        movt            r3,  #35468/2           @ 35468/65536, halved: vqdmulh doubles
        vdup.32         d4,  r3

        @ first pass: d1/d3 are the odd input rows
        vmull.s16       q12, d1,  d4[0]
        vmull.s16       q13, d3,  d4[0]
        vqdmulh.s16     d20, d1,  d4[1]
        vqdmulh.s16     d23, d3,  d4[1]
        vshrn.s32       d21, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.s16        d21, d21, d1            @ x + x*20091/65536
        vadd.s16        d22, d22, d3

        vadd.s16        d16, d0,  d2
        vsub.s16        d17, d0,  d2
        vadd.s16        d18, d21, d23
        vsub.s16        d19, d20, d22
        vadd.s16        q0,  q8,  q9
        vsub.s16        q1,  q8,  q9

        @ transpose for the column pass
        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        @ second pass, interleaved with clearing the coefficient block
        vmov.i16        q15, #0
        vmull.s16       q12, d1,  d4[0]
        vst1.16         {q15},    [r1,:128]!
        vmull.s16       q13, d2,  d4[0]
        vst1.16         {q15},    [r1,:128]
        vqdmulh.s16     d21, d1,  d4[1]
        vqdmulh.s16     d23, d2,  d4[1]
        vshrn.s32       d20, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.i16        d20, d20, d1
        vadd.i16        d22, d22, d2

        vadd.i16        d16, d0,  d3
        vsub.i16        d17, d0,  d3
        vadd.i16        d18, d20, d23
        vld1.32         {d20[]},  [r0,:32], r2  @ interleave dst loads
        vsub.i16        d19, d21, d22
        vld1.32         {d22[]},  [r0,:32], r2
        vadd.s16        q0,  q8,  q9
        vld1.32         {d23[]},  [r0,:32], r2
        vsub.s16        q1,  q8,  q9
        vld1.32         {d21[]},  [r0,:32], r2
        vrshr.s16       q0,  q0,  #3            @ rounding (x + 4) >> 3
        vtrn.32         q10, q11
        vrshr.s16       q1,  q1,  #3

        sub             r0,  r0,  r2,  lsl #2   @ rewind dst 4 rows

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        @ add to the prediction and saturate to u8
        vaddw.u8        q0,  q0,  d20
        vaddw.u8        q1,  q1,  d21
        vqmovun.s16     d0,  q0
        vqmovun.s16     d1,  q1

        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2  @ row order follows the transpose
        vst1.32         {d1[0]},  [r0,:32], r2

        bx              lr
endfunc
152
@ DC-only inverse transform: dc = (block[0] + 4) >> 3, added to the 4x4 dst.
@ In:   r0 = dst, r1 = block (block[0] read and cleared), r2 = stride
@ Clobbers: r3, r12, q0-q3
function ff_vp8_idct_dc_add_neon, export=1
        mov             r3,  #0
        ldrsh           r12, [r1]               @ dc coefficient
        strh            r3,  [r1]               @ clear it
        vdup.16         q1,  r12
        vrshr.s16       q1,  q1,  #3            @ (dc + 4) >> 3
        @ load the 4 dst rows (2 per d register)
        vld1.32         {d0[]},   [r0,:32], r2
        vld1.32         {d1[]},   [r0,:32], r2
        vld1.32         {d0[1]},  [r0,:32], r2
        vld1.32         {d1[1]},  [r0,:32], r2
        vaddw.u8        q2,  q1,  d0            @ add dc, widening to s16
        vaddw.u8        q3,  q1,  d1
        sub             r0,  r0,  r2,  lsl #2   @ rewind dst 4 rows
        vqmovun.s16     d0,  q2                 @ saturate back to u8
        vqmovun.s16     d1,  q3
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        bx              lr
endfunc
174
@ Four DC-only inverse transforms over an 8x8 chroma area (2x2 grid of
@ 4x4 blocks): q8 = dc of the top block pair, q9 = dc of the bottom pair.
@ In:   r0 = dst, r1 = block (4 x 16 coeffs; each dc read and cleared),
@       r2 = stride
@ Clobbers: r3, q0-q3, q8-q13
function ff_vp8_idct_dc_add4uv_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32                @ 16 coeffs * 2 bytes per block
        vld1.16         {d16[]},  [r1,:16]      @ dc of block 0
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]      @ dc of block 1
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]      @ dc of block 2
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]      @ dc of block 3
        vst1.16         {d0[0]},  [r1,:16], r3
        mov             r3,  r0                 @ keep dst for the stores
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {d0},  [r0,:64], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {d1},  [r0,:64], r2
        vaddw.u8        q10, q8,  d0            @ rows 0-3 use the top dc pair
        vld1.8          {d2},  [r0,:64], r2
        vaddw.u8        q0,  q8,  d1
        vld1.8          {d3},  [r0,:64], r2
        vaddw.u8        q11, q8,  d2
        vld1.8          {d4},  [r0,:64], r2
        vaddw.u8        q1,  q8,  d3
        vld1.8          {d5},  [r0,:64], r2
        vaddw.u8        q12, q9,  d4            @ rows 4-7 use the bottom dc pair
        vld1.8          {d6},  [r0,:64], r2
        vaddw.u8        q2,  q9,  d5
        vld1.8          {d7},  [r0,:64], r2
        vaddw.u8        q13, q9,  d6
        vqmovun.s16     d20, q10                @ saturate and store, interleaved
        vaddw.u8        q3,  q9,  d7
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vst1.8          {d20}, [r3,:64], r2
        vqmovun.s16     d23, q1
        vst1.8          {d21}, [r3,:64], r2
        vqmovun.s16     d24, q12
        vst1.8          {d22}, [r3,:64], r2
        vqmovun.s16     d25, q2
        vst1.8          {d23}, [r3,:64], r2
        vqmovun.s16     d26, q13
        vst1.8          {d24}, [r3,:64], r2
        vqmovun.s16     d27, q3
        vst1.8          {d25}, [r3,:64], r2
        vst1.8          {d26}, [r3,:64], r2
        vst1.8          {d27}, [r3,:64], r2

        bx              lr
endfunc
224
@ Four DC-only inverse transforms over a 16x4 luma area (4 blocks side
@ by side): q8 holds dc of blocks 0/1, q9 of blocks 2/3.
@ In:   r0 = dst, r1 = block (4 x 16 coeffs; each dc read and cleared),
@       r2 = stride
@ Clobbers: r3, q0-q3, q8-q13
function ff_vp8_idct_dc_add4y_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32                @ 16 coeffs * 2 bytes per block
        vld1.16         {d16[]},  [r1,:16]      @ dc of block 0
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]      @ dc of block 1
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]      @ dc of block 2
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]      @ dc of block 3
        vst1.16         {d0[0]},  [r1,:16], r3
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {q0},  [r0,:128], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {q1},  [r0,:128], r2
        vaddw.u8        q10, q8,  d0            @ left 8 pixels + dc0/dc1
        vld1.8          {q2},  [r0,:128], r2
        vaddw.u8        q0,  q9,  d1            @ right 8 pixels + dc2/dc3
        vld1.8          {q3},  [r0,:128], r2
        vaddw.u8        q11, q8,  d2
        vaddw.u8        q1,  q9,  d3
        vaddw.u8        q12, q8,  d4
        vaddw.u8        q2,  q9,  d5
        vaddw.u8        q13, q8,  d6
        vaddw.u8        q3,  q9,  d7
        sub             r0,  r0,  r2,  lsl #2   @ rewind dst 4 rows
        vqmovun.s16     d20, q10                @ saturate to u8 and store
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vqmovun.s16     d23, q1
        vqmovun.s16     d24, q12
        vst1.8          {q10}, [r0,:128], r2
        vqmovun.s16     d25, q2
        vst1.8          {q11}, [r0,:128], r2
        vqmovun.s16     d26, q13
        vst1.8          {q12}, [r0,:128], r2
        vqmovun.s16     d27, q3
        vst1.8          {q13}, [r0,:128], r2

        bx              lr
endfunc
266
@ Core VP8 loop filter, operating on one edge (16 lanes) held in registers.
@
@ Register layout:
@   P3..Q3 -> q0..q7
@   flim_E -> q14
@   flim_I -> q15
@   hev_thresh -> r12
@
@ Variants: simple=1  -> simple filter (only P0/Q0 modified)
@           inner=1   -> inner-edge filter (P1..Q1 modified)
@           default   -> macroblock-edge filter (P2..Q2 modified)
@ Clobbers q8-q15 (and the pixel registers it filters).
.macro  vp8_loop_filter, inner=0, simple=0
    .if \simple
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vmov.i8         q13, #0x80
        vcle.u8         q8,  q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    .else
        @ calculate hev and normal_limit:
        vabd.u8         q12, q2,  q3            @ abs(P1-P0)
        vabd.u8         q13, q5,  q4            @ abs(Q1-Q0)
        vabd.u8         q10, q0,  q1            @ abs(P3-P2)
        vabd.u8         q11, q1,  q2            @ abs(P2-P1)
        vcle.u8         q8,  q12, q15           @ abs(P1-P0) <= flim_I
        vcle.u8         q9,  q13, q15           @ abs(Q1-Q0) <= flim_I
        vcle.u8         q10, q10, q15           @ abs(P3-P2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(P2-P1) <= flim_I
        vand            q8,  q8,  q9
        vabd.u8         q9,  q7,  q6            @ abs(Q3-Q2)
        vand            q8,  q8,  q11
        vabd.u8         q11, q6,  q5            @ abs(Q2-Q1)
        vand            q8,  q8,  q10
        vcle.u8         q10, q9,  q15           @ abs(Q3-Q2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(Q2-Q1) <= flim_I
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vand            q8,  q8,  q10
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vand            q8,  q8,  q11
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vdup.8          q15, r12                @ hev_thresh
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcgt.u8         q12, q12, q15           @ abs(P1-P0) > hev_thresh
        vcle.u8         q11, q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        vcgt.u8         q14, q13, q15           @ abs(Q1-Q0) > hev_thresh
        vand            q8,  q8,  q11
        vmov.i8         q13, #0x80
        vorr            q9,  q12, q14
    .endif

        @ at this point:
        @   q8: normal_limit
        @   q9: hev

        @ convert to signed value:
        veor            q3,  q3,  q13           @ PS0 = P0 ^ 0x80
        veor            q4,  q4,  q13           @ QS0 = Q0 ^ 0x80

        vmov.i16        q12, #3
        vsubl.s8        q10, d8,  d6            @ QS0 - PS0
        vsubl.s8        q11, d9,  d7            @   (widened to 16bit)
        veor            q2,  q2,  q13           @ PS1 = P1 ^ 0x80
        veor            q5,  q5,  q13           @ QS1 = Q1 ^ 0x80
        vmul.i16        q10, q10, q12           @ w = 3 * (QS0 - PS0)
        vmul.i16        q11, q11, q12

        vqsub.s8        q12, q2,  q5            @ clamp(PS1-QS1)
        vmov.i8         q14, #4
        vmov.i8         q15, #3
    .if \inner
        vand            q12, q12, q9            @ if(hev) w += clamp(PS1-QS1)
    .endif
        vaddw.s8        q10, q10, d24           @ w += clamp(PS1-QS1)
        vaddw.s8        q11, q11, d25
        vqmovn.s16      d20, q10                @ narrow result back into q10
        vqmovn.s16      d21, q11
    .if !\inner && !\simple
        veor            q1,  q1,  q13           @ PS2 = P2 ^ 0x80
        veor            q6,  q6,  q13           @ QS2 = Q2 ^ 0x80
    .endif
        vand            q10, q10, q8            @ w &= normal_limit

        @ registers used at this point..
        @   q0 -> P3  (don't corrupt)
        @   q1-q6 -> PS2-QS2
        @   q7 -> Q3  (don't corrupt)
        @   q9 -> hev
        @   q10 -> w
        @   q13 -> #0x80
        @   q14 -> #4
        @   q15 -> #3
        @   q8, q11, q12 -> unused

        @ filter_common:   is4tap==1
        @   c1 = clamp(w + 4) >> 3;
        @   c2 = clamp(w + 3) >> 3;
        @   Q0 = s2u(QS0 - c1);
        @   P0 = s2u(PS0 + c2);

    .if \simple
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    .elseif \inner
        @ the !is4tap case of filter_common, only used for inner blocks
        @   c3 = ((c1&~hev) + 1) >> 1;
        @   Q1 = s2u(QS1 - c3);
        @   P1 = s2u(PS1 + c3);
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        vbic            q11, q11, q9            @ c1 & ~hev
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        vrshr.s8        q11, q11, #1            @ c3 >>= 1
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-c3)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+c3)
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    .else
        vand            q12, q10, q9            @ w & hev
        vqadd.s8        q11, q12, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q12, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vbic            q10, q10, q9            @ w &= ~hev
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)

        @ filter_mbedge:
        @   a = clamp((27*w + 63) >> 7);
        @   Q0 = s2u(QS0 - a);
        @   P0 = s2u(PS0 + a);
        @   a = clamp((18*w + 63) >> 7);
        @   Q1 = s2u(QS1 - a);
        @   P1 = s2u(PS1 + a);
        @   a = clamp((9*w + 63) >> 7);
        @   Q2 = s2u(QS2 - a);
        @   P2 = s2u(PS2 + a);
        vmov.i16        q9,  #63
        vshll.s8        q14, d20, #3            @ 8*w
        vshll.s8        q15, d21, #3
        vaddw.s8        q14, q14, d20           @ 9*w
        vaddw.s8        q15, q15, d21
        vadd.s16        q8,  q9,  q14
        vadd.s16        q9,  q9,  q15           @  9*w + 63
        vadd.s16        q11, q8,  q14
        vadd.s16        q12, q9,  q15           @ 18*w + 63
        vadd.s16        q14, q11, q14
        vadd.s16        q15, q12, q15           @ 27*w + 63
        vqshrn.s16      d16, q8,  #7
        vqshrn.s16      d17, q9,  #7            @ clamp(( 9*w + 63)>>7)
        vqshrn.s16      d22, q11, #7
        vqshrn.s16      d23, q12, #7            @ clamp((18*w + 63)>>7)
        vqshrn.s16      d28, q14, #7
        vqshrn.s16      d29, q15, #7            @ clamp((27*w + 63)>>7)
        vqadd.s8        q1,  q1,  q8            @ PS2 = clamp(PS2+a)
        vqsub.s8        q6,  q6,  q8            @ QS2 = clamp(QS2-a)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+a)
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-a)
        vqadd.s8        q3,  q3,  q14           @ PS0 = clamp(PS0+a)
        vqsub.s8        q4,  q4,  q14           @ QS0 = clamp(QS0-a)
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q1,  q1,  q13           @ P2 = PS2 ^ 0x80
        veor            q6,  q6,  q13           @ Q2 = QS2 ^ 0x80
    .endif
.endm
445
@ Vertical loop filter (horizontal edge), 16 pixels wide.
@ In:   r0 = dst (first row below the edge), r1 = stride,
@       r2 = flim_E, r3 = flim_I, [sp] = hev_thresh
@ The simple variant only takes (dst, stride, flim_E).
.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r1,  lsl #1+!\simple  @ back up 2 (simple) or 4 rows

        @ Load pixels:
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh (after 64-byte vpush)
        vld1.8          {q0},  [r0,:128], r1    @ P3
        vld1.8          {q1},  [r0,:128], r1    @ P2
    .endif
        vld1.8          {q2},  [r0,:128], r1    @ P1
        vld1.8          {q3},  [r0,:128], r1    @ P0
        vld1.8          {q4},  [r0,:128], r1    @ Q0
        vld1.8          {q5},  [r0,:128], r1    @ Q1
    .if !\simple
        vld1.8          {q6},  [r0,:128], r1    @ Q2
        vld1.8          {q7},  [r0,:128]        @ Q3
        vdup.8          q15, r3                 @ flim_I
    .endif
        vdup.8          q14, r2                 @ flim_E

        vp8_loop_filter inner=\inner, simple=\simple

        @ back up to P2:  dst -= stride * 6
        sub             r0,  r0,  r1,  lsl #2
    .if !\simple
        sub             r0,  r0,  r1,  lsl #1

        @ Store pixels:
        vst1.8          {q1},  [r0,:128], r1    @ P2
    .endif
        vst1.8          {q2},  [r0,:128], r1    @ P1
        vst1.8          {q3},  [r0,:128], r1    @ P0
        vst1.8          {q4},  [r0,:128], r1    @ Q0
        vst1.8          {q5},  [r0,:128], r1    @ Q1
    .if !\simple
        vst1.8          {q6},  [r0,:128]        @ Q2
    .endif

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

@ instantiate the three variants
vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1
494
@ Vertical loop filter (horizontal edge) for chroma: u and v filtered
@ together, 8 pixels each (u in the low d register, v in the high one).
@ In:   r0 = u dst, r1 = v dst, r2 = stride, r3 = flim_E,
@       [sp] = flim_I, [sp+4] = hev_thresh
.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r2,  lsl #2   @ back up 4 rows to P3
        sub             r1,  r1,  r2,  lsl #2
        ldr             r12, [sp, #64]          @ flim_I (after 64-byte vpush)

        @ Load pixels:
        vld1.8          {d0},  [r0,:64], r2     @ P3
        vld1.8          {d1},  [r1,:64], r2     @ P3
        vld1.8          {d2},  [r0,:64], r2     @ P2
        vld1.8          {d3},  [r1,:64], r2     @ P2
        vld1.8          {d4},  [r0,:64], r2     @ P1
        vld1.8          {d5},  [r1,:64], r2     @ P1
        vld1.8          {d6},  [r0,:64], r2     @ P0
        vld1.8          {d7},  [r1,:64], r2     @ P0
        vld1.8          {d8},  [r0,:64], r2     @ Q0
        vld1.8          {d9},  [r1,:64], r2     @ Q0
        vld1.8          {d10}, [r0,:64], r2     @ Q1
        vld1.8          {d11}, [r1,:64], r2     @ Q1
        vld1.8          {d12}, [r0,:64], r2     @ Q2
        vld1.8          {d13}, [r1,:64], r2     @ Q2
        vld1.8          {d14}, [r0,:64]         @ Q3
        vld1.8          {d15}, [r1,:64]         @ Q3

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        @ back up to P2:  u,v -= stride * 6
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        sub             r0,  r0,  r2,  lsl #1
        sub             r1,  r1,  r2,  lsl #1

        @ Store pixels:
        vst1.8          {d2},  [r0,:64], r2     @ P2
        vst1.8          {d3},  [r1,:64], r2     @ P2
        vst1.8          {d4},  [r0,:64], r2     @ P1
        vst1.8          {d5},  [r1,:64], r2     @ P1
        vst1.8          {d6},  [r0,:64], r2     @ P0
        vst1.8          {d7},  [r1,:64], r2     @ P0
        vst1.8          {d8},  [r0,:64], r2     @ Q0
        vst1.8          {d9},  [r1,:64], r2     @ Q0
        vst1.8          {d10}, [r0,:64], r2     @ Q1
        vst1.8          {d11}, [r1,:64], r2     @ Q1
        vst1.8          {d12}, [r0,:64]         @ Q2
        vst1.8          {d13}, [r1,:64]         @ Q2

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

@ instantiate edge and inner variants
vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1
553
@ Horizontal loop filter (vertical edge), 16 pixels high: load a 8x16
@ strip around the edge, transpose so columns become rows, run the
@ common filter, transpose back and store.
@ In:   r0 = dst, r1 = stride, r2 = flim_E, r3 = flim_I,
@       [sp] = hev_thresh (full/inner variants only)
.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4            @ back up 4 columns to P3
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh (after 64-byte vpush)
    .endif

        @ Load pixels:
        vld1.8          {d0},  [r0], r1         @ load first 8-line src data
        vld1.8          {d2},  [r0], r1
        vld1.8          {d4},  [r0], r1
        vld1.8          {d6},  [r0], r1
        vld1.8          {d8},  [r0], r1
        vld1.8          {d10}, [r0], r1
        vld1.8          {d12}, [r0], r1
        vld1.8          {d14}, [r0], r1
        vld1.8          {d1},  [r0], r1         @ load second 8-line src data
        vld1.8          {d3},  [r0], r1
        vld1.8          {d5},  [r0], r1
        vld1.8          {d7},  [r0], r1
        vld1.8          {d9},  [r0], r1
        vld1.8          {d11}, [r0], r1
        vld1.8          {d13}, [r0], r1
        vld1.8          {d15}, [r0], r1

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        vdup.8          q14, r2                 @ flim_E
    .if !\simple
        vdup.8          q15, r3                 @ flim_I
    .endif

        vp8_loop_filter inner=\inner, simple=\simple

        sub             r0,  r0,  r1,  lsl #4   @ backup 16 rows

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        @ Store pixels:
        vst1.8          {d0},  [r0], r1
        vst1.8          {d2},  [r0], r1
        vst1.8          {d4},  [r0], r1
        vst1.8          {d6},  [r0], r1
        vst1.8          {d8},  [r0], r1
        vst1.8          {d10}, [r0], r1
        vst1.8          {d12}, [r0], r1
        vst1.8          {d14}, [r0], r1
        vst1.8          {d1},  [r0], r1
        vst1.8          {d3},  [r0], r1
        vst1.8          {d5},  [r0], r1
        vst1.8          {d7},  [r0], r1
        vst1.8          {d9},  [r0], r1
        vst1.8          {d11}, [r0], r1
        vst1.8          {d13}, [r0], r1
        vst1.8          {d15}, [r0]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

@ instantiate the three variants
vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1
619
@ Horizontal loop filter (vertical edge) for chroma: u and v filtered
@ together via transpose (u rows in the low d registers, v in the high).
@ In:   r0 = u dst, r1 = v dst, r2 = stride, r3 = flim_E,
@       [sp] = flim_I, [sp+4] = hev_thresh
.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4            @ back up 4 columns to P3
        sub             r1,  r1,  #4
        ldr             r12, [sp, #64]          @ flim_I (after 64-byte vpush)

        @ Load pixels:
        vld1.8          {d0},  [r0], r2         @ load u
        vld1.8          {d1},  [r1], r2         @ load v
        vld1.8          {d2},  [r0], r2
        vld1.8          {d3},  [r1], r2
        vld1.8          {d4},  [r0], r2
        vld1.8          {d5},  [r1], r2
        vld1.8          {d6},  [r0], r2
        vld1.8          {d7},  [r1], r2
        vld1.8          {d8},  [r0], r2
        vld1.8          {d9},  [r1], r2
        vld1.8          {d10}, [r0], r2
        vld1.8          {d11}, [r1], r2
        vld1.8          {d12}, [r0], r2
        vld1.8          {d13}, [r1], r2
        vld1.8          {d14}, [r0], r2
        vld1.8          {d15}, [r1], r2

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        sub             r0,  r0,  r2,  lsl #3   @ backup u 8 rows
        sub             r1,  r1,  r2,  lsl #3   @ backup v 8 rows

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        @ Store pixels:
        vst1.8          {d0},  [r0], r2
        vst1.8          {d1},  [r1], r2
        vst1.8          {d2},  [r0], r2
        vst1.8          {d3},  [r1], r2
        vst1.8          {d4},  [r0], r2
        vst1.8          {d5},  [r1], r2
        vst1.8          {d6},  [r0], r2
        vst1.8          {d7},  [r1], r2
        vst1.8          {d8},  [r0], r2
        vst1.8          {d9},  [r1], r2
        vst1.8          {d10}, [r0], r2
        vst1.8          {d11}, [r1], r2
        vst1.8          {d12}, [r0], r2
        vst1.8          {d13}, [r1], r2
        vst1.8          {d14}, [r0]
        vst1.8          {d15}, [r1]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

@ instantiate edge and inner variants
vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1
683
@ Copy a 16-wide block: dst = src, h rows, 4 rows per iteration
@ (h is assumed to be a multiple of 4).
@ In:   r0 = dst, r1 = dst stride, r2 = src (may be unaligned),
@       r3 = src stride, [sp] = h
function ff_put_vp8_pixels16_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {q0},     [r2], r3
        vld1.8          {q1},     [r2], r3
        vld1.8          {q2},     [r2], r3
        vld1.8          {q3},     [r2], r3
        vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q1},     [r0,:128], r1
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b
        bx              lr
endfunc
699
@ Copy an 8-wide block: dst = src, h rows, 4 rows per iteration
@ (h is assumed to be a multiple of 4).
@ In:   r0 = dst, r1 = dst stride, r2 = src (may be unaligned),
@       r3 = src stride, [sp] = h
function ff_put_vp8_pixels8_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {d0},     [r2], r3
        vld1.8          {d1},     [r2], r3
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d1},     [r0,:64], r1
        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        bgt             1b
        bx              lr
endfunc
715
/* 4/6-tap 8th-pel MC */

@ 6-tap horizontal filter producing 8 pixels.
@ \a,\b: 16 consecutive source bytes; filter taps in d0[0..3]/d1[0..1]
@ (sign pattern +,-,+,+,-,+; from the subpel_filters table).
@ \d: rounded, unsigned-saturated result, (sum + 64) >> 7.
@ Clobbers q8-q13, d27-d31.
.macro  vp8_epel8_h6    d,   a,   b
        vext.8          d27, \a,  \b,  #1       @ src+1 .. src+5 windows
        vmovl.u8        q8,  \a
        vext.8          d28, \a,  \b,  #2
        vmovl.u8        q9,  d27
        vext.8          d29, \a,  \b,  #3
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #4
        vmovl.u8        q11, d29
        vext.8          d31, \a,  \b,  #5
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]         @ +c2 * src[2]
        vmovl.u8        q13, d31
        vmul.u16        q11, q11, d0[3]         @ +c3 * src[3]
        vmls.u16        q10, q9,  d0[1]         @ -c1 * src[1]
        vmls.u16        q11, q12, d1[0]         @ -c4 * src[4]
        vmla.u16        q10, q8,  d0[0]         @ +c0 * src[0]
        vmla.u16        q11, q13, d1[1]         @ +c5 * src[5]
        vqadd.s16       q11, q10, q11           @ combine halves, saturating
        vqrshrun.s16    \d,  q11, #7            @ (sum + 64) >> 7, clamp to u8
.endm
739
@ 6-tap horizontal filter producing 16 pixels (two 8-pixel halves).
@ \q0,\q1: 24+ consecutive source bytes; \s0,\s1 = the unshifted source
@ d registers; filter taps in d0[0..3]/d1[0..1] as in vp8_epel8_h6.
@ \d0,\d1: rounded, unsigned-saturated results for the two halves.
@ Clobbers q1-q3, q8-q15.
.macro  vp8_epel16_h6   d0,  d1,  s0,  s1,  s2, q0,  q1
        vext.8          q14, \q0, \q1, #3       @ shifted source windows
        vext.8          q15, \q0, \q1, #4
        vmovl.u8        q11, d28
        vmovl.u8        q14, d29
        vext.8          q3,  \q0, \q1, #2
        vmovl.u8        q12, d30
        vmovl.u8        q15, d31
        vext.8          q8,  \q0, \q1, #1
        vmovl.u8        q10, d6
        vmovl.u8        q3,  d7
        vext.8          q2,  \q0, \q1, #5
        vmovl.u8        q13, d4
        vmovl.u8        q2,  d5
        vmovl.u8        q9,  d16
        vmovl.u8        q8,  d17
        vmul.u16        q11, q11, d0[3]         @ +c3 * src[3]
        vmul.u16        q10, q10, d0[2]         @ +c2 * src[2]
        vmul.u16        q3,  q3,  d0[2]
        vmul.u16        q14, q14, d0[3]
        vmls.u16        q11, q12, d1[0]         @ -c4 * src[4]
        vmovl.u8        q12, \s0
        vmovl.u8        q1,  \s1
        vmls.u16        q10, q9,  d0[1]         @ -c1 * src[1]
        vmls.u16        q3,  q8,  d0[1]
        vmls.u16        q14, q15, d1[0]
        vmla.u16        q10, q12, d0[0]         @ +c0 * src[0]
        vmla.u16        q11, q13, d1[1]         @ +c5 * src[5]
        vmla.u16        q3,  q1,  d0[0]
        vmla.u16        q14, q2,  d1[1]
        vqadd.s16       q11, q10, q11           @ combine halves, saturating
        vqadd.s16       q14, q3,  q14
        vqrshrun.s16    \d0, q11, #7            @ (sum + 64) >> 7, clamp to u8
        vqrshrun.s16    \d1, q14, #7
.endm
775
@ 6-tap vertical filter producing 8 pixels from 6 source rows \s0..\s5.
@ Filter taps in d0[0..3]/d1[0..1] (signs +,-,+,+,-,+).
@ \d0: rounded, unsigned-saturated result, (sum + 64) >> 7.
@ Clobbers q8-q13.
.macro  vp8_epel8_v6    d0,  s0,  s1,  s2,  s3, s4,  s5
        vmovl.u8        q10, \s2
        vmovl.u8        q11, \s3
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s0
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[2]         @ +c2 * row2
        vmul.u16        q11, q11, d0[3]         @ +c3 * row3
        vmls.u16        q10, q9,  d0[1]         @ -c1 * row1
        vmls.u16        q11, q12, d1[0]         @ -c4 * row4
        vmla.u16        q10, q8,  d0[0]         @ +c0 * row0
        vmla.u16        q11, q13, d1[1]         @ +c5 * row5
        vqadd.s16       q11, q10, q11           @ combine halves, saturating
        vqrshrun.s16    \d0, q11, #7            @ (sum + 64) >> 7, clamp to u8
.endm
792
@ 6-tap vertical filter producing two adjacent output rows at once
@ from 7 source rows \s0..\s6 (\d0 from rows 0-5, \d1 from rows 1-6).
@ Filter taps in d0[0..3]/d1[0..1] (signs +,-,+,+,-,+).
@ Clobbers q8-q15.
.macro  vp8_epel8_v6_y2 d0,  d1,  s0,  s1,  s2, s3,  s4,  s5,  s6
        vmovl.u8        q10, \s0
        vmovl.u8        q11, \s3
        vmovl.u8        q14, \s6
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s2
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[0]         @ row 0: +c0 * row0
        vmul.u16        q15, q11, d0[3]         @ row 1: +c3 * row3
        vmul.u16        q11, q11, d0[2]         @ row 0: +c2 * row3
        vmul.u16        q14, q14, d1[1]         @ row 1: +c5 * row6
        vmls.u16        q10, q9,  d0[1]         @ row 0: -c1 * row1
        vmls.u16        q15, q12, d1[0]         @ row 1: -c4 * row4
        vmls.u16        q11, q8,  d0[1]         @ row 1: -c1 * row2
        vmls.u16        q14, q13, d1[0]         @ row 0: -c4 * row5
        vmla.u16        q10, q8,  d0[2]         @ row 0: +c2 * row2
        vmla.u16        q15, q13, d1[1]         @ row 0: +c5 * row5
        vmla.u16        q11, q9,  d0[0]         @ row 1: +c0 * row1
        vmla.u16        q14, q12, d0[3]         @ row 1: +c3 * row4
        vqadd.s16       q15, q10, q15           @ combine halves, saturating
        vqadd.s16       q14, q11, q14
        vqrshrun.s16    \d0, q15, #7            @ (sum + 64) >> 7, clamp to u8
        vqrshrun.s16    \d1, q14, #7
.endm
818
@ 4-tap horizontal filter producing 8 pixels.
@ \a,\b: 16 consecutive source bytes; the 4 taps are the middle filter
@ coefficients d0[1..3]/d1[0] (signs -,+,+,-).
@ \d: rounded, unsigned-saturated result, (sum + 64) >> 7.
@ Clobbers q9-q12, d28-d30.
.macro  vp8_epel8_h4    d,   a,   b
        vext.8          d28, \a,  \b,  #1       @ src+1 .. src+3 windows
        vmovl.u8        q9,  \a
        vext.8          d29, \a,  \b,  #2
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #3
        vmovl.u8        q11, d29
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]         @ +c2 * src[1]
        vmul.u16        q11, q11, d0[3]         @ +c3 * src[2]
        vmls.u16        q10, q9,  d0[1]         @ -c1 * src[0]
        vmls.u16        q11, q12, d1[0]         @ -c4 * src[3]
        vqadd.s16       q11, q10, q11           @ combine halves, saturating
        vqrshrun.s16    \d,  q11, #7            @ (sum + 64) >> 7, clamp to u8
.endm
834
@ 4-tap vertical filter producing two adjacent output rows at once
@ from 5 source rows \s0..\s4 (\d0 from rows 0-3, \d1 from rows 1-4).
@ Taps are the middle filter coefficients d0[1..3]/d1[0] (signs -,+,+,-).
@ Clobbers q8-q15.
.macro  vp8_epel8_v4_y2 d0,  d1,  s0,  s1,  s2, s3,  s4
        vmovl.u8        q9,  \s0
        vmovl.u8        q10, \s1
        vmovl.u8        q11, \s2
        vmovl.u8        q12, \s3
        vmovl.u8        q13, \s4
        vmul.u16        q8,  q10, d0[2]         @ row 0: +c2 * row1
        vmul.u16        q14, q11, d0[3]         @ row 0: +c3 * row2
        vmul.u16        q11, q11, d0[2]         @ row 1: +c2 * row2
        vmul.u16        q15, q12, d0[3]         @ row 1: +c3 * row3
        vmls.u16        q8,  q9,  d0[1]         @ row 0: -c1 * row0
        vmls.u16        q14, q12, d1[0]         @ row 0: -c4 * row3
        vmls.u16        q11, q10, d0[1]         @ row 1: -c1 * row1
        vmls.u16        q15, q13, d1[0]         @ row 1: -c4 * row4
        vqadd.s16       q8,  q8,  q14           @ combine halves, saturating
        vqadd.s16       q11, q11, q15
        vqrshrun.s16    \d0, q8,  #7            @ (sum + 64) >> 7, clamp to u8
        vqrshrun.s16    \d1, q11, #7
.endm
854
@ 16-wide 6-tap vertical subpel MC.
@ In:   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
@       stack: h, mx, my (after push+vpush: h at sp+72, my at sp+80)
@ Filter row selected from subpel_filters by my (16 bytes per entry).
function ff_put_vp8_epel16_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ start 2 rows above
        push            {r4,lr}
        vpush           {d8-d15}

        ldr             r4,  [sp, #80]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #72]          @ h
        add             r4,  lr,  r4, lsl #4    @ &subpel_filters[my-1]
        vld1.16         {q0},     [r4,:128]     @ filter coefficients
1:
        @ load 7 source rows, produce 2 output rows per iteration
        vld1.8          {d2-d3},  [r2], r3
        vld1.8          {d4-d5},  [r2], r3
        vld1.8          {d6-d7},  [r2], r3
        vld1.8          {d8-d9},  [r2], r3
        vld1.8          {d10-d11},[r2], r3
        vld1.8          {d12-d13},[r2], r3
        vld1.8          {d14-d15},[r2]
        sub             r2,  r2,  r3,  lsl #2   @ net advance: 2 rows

        vp8_epel8_v6_y2 d2,  d4,  d2,  d4,  d6, d8,  d10, d12, d14
        vp8_epel8_v6_y2 d3,  d5,  d3,  d5,  d7, d9,  d11, d13, d15

        vst1.8          {d2-d3},  [r0,:128], r1
        vst1.8          {d4-d5},  [r0,:128], r1
        subs            r12, r12, #2
        bne             1b

        vpop            {d8-d15}
        pop             {r4,pc}
endfunc
886
@ 16-wide 6-tap horizontal subpel MC.
@ In:   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
@       stack: h (sp+8 after push), mx (sp+12)
@ Filter row selected from subpel_filters by mx (16 bytes per entry).
function ff_put_vp8_epel16_h6_neon, export=1
        sub             r2,  r2,  #2            @ start 2 pixels left
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4    @ &subpel_filters[mx-1]
        vld1.16         {q0},     [r4,:128]     @ filter coefficients
1:
        vld1.8          {d2-d4},  [r2], r3      @ 24 source bytes

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4, q1,  q2

        vst1.8          {d2-d3},  [r0,:128], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
907
@ 16-wide 6-tap horizontal + 6-tap vertical subpel MC.
@ In:   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
@       stack: h, mx, my (offsets below account for push/vpush/sub sp)
@ Two passes through a 16-byte-aligned stack buffer of h+5 filtered
@ rows (16 bytes each; 336 = 16*21 covers h up to 16).
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ start 2 rows above
        sub             r2,  r2,  #2            @ and 2 pixels left
        push            {r4,lr}
        vpush           {d8-d9}

        @ first pass (horizontal):
        ldr             r4,  [sp, #28]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #24]          @ h
        add             r4,  lr,  r4, lsl #4    @ &subpel_filters[mx-1]
        sub             sp,  sp,  #336+16       @ temp buffer + alignment slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ h+5 intermediate rows
        bic             lr,  lr,  #15           @ 16-byte align the buffer
1:
        vld1.8          {d2,d3,d4}, [r2], r3

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4, q1,  q2

        vst1.8          {d2-d3},  [lr,:128]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #336+16+32]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #336+16+24]   @ h
        add             r4,  lr,  r4, lsl #4    @ &subpel_filters[my-1]
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!    @ 6 intermediate rows
        vld1.8          {d6-d9},  [lr,:128]!
        vld1.8          {d28-d31},[lr,:128]
        sub             lr,  lr,  #48           @ net advance: 1 row

        vp8_epel8_v6    d2,  d2,  d4,  d6,  d8, d28, d30
        vp8_epel8_v6    d3,  d3,  d5,  d7,  d9, d29, d31

        vst1.8          {d2-d3},  [r0,:128], r1
        subs            r12, r12, #1
        bne             2b

        add             sp,  sp,  #336+16
        vpop            {d8-d9}
        pop             {r4,pc}
endfunc
958
@ 8-wide 6-tap vertical subpel MC.
@ In:   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
@       stack: h (sp+8 after push), my (sp+16)
function ff_put_vp8_epel8_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ start 2 rows above
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4    @ &subpel_filters[my-1]
        vld1.16         {q0},     [r4,:128]     @ filter coefficients
1:
        @ load 7 source rows, produce 2 output rows per iteration
        vld1.8          {d2},  [r2], r3
        vld1.8          {d3},  [r2], r3
        vld1.8          {d4},  [r2], r3
        vld1.8          {d5},  [r2], r3
        vld1.8          {d6},  [r2], r3
        vld1.8          {d7},  [r2], r3
        vld1.8          {d28}, [r2]

        sub             r2,  r2,  r3,  lsl #2   @ net advance: 2 rows

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4, d5,  d6,  d7,  d28

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc
988
@ 8-wide 6-tap horizontal subpel MC.
@ In:   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
@       stack: h (sp+8 after push), mx (sp+12)
function ff_put_vp8_epel8_h6_neon, export=1
        sub             r2,  r2,  #2            @ start 2 pixels left
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4    @ &subpel_filters[mx-1]
        vld1.16         {q0},     [r4,:128]     @ filter coefficients
1:
        vld1.8          {d2,d3},  [r2], r3      @ 16 source bytes

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
1009
@ 8-wide 6-tap horizontal + 6-tap vertical subpel MC.
@ In:   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
@       stack: h, mx, my
@ Two passes through a 16-byte-aligned stack buffer of h+5 filtered
@ rows (8 bytes each; 168 = 8*21 covers h up to 16).
function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ start 2 rows above
        sub             r2,  r2,  #2            @ and 2 pixels left
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4    @ &subpel_filters[mx-1]
        sub             sp,  sp,  #168+16       @ temp buffer + alignment slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ h+5 intermediate rows
        bic             lr,  lr,  #15           @ 16-byte align the buffer
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4    @ &subpel_filters[my-1]
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!    @ 7 intermediate rows
        vld1.8          {d6-d7},  [lr,:128]!
        vld1.8          {d30},    [lr,:64]
        sub             lr,  lr,  #32           @ net advance: 2 rows

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4, d5,  d6,  d7,  d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
1058
@ 8-wide 4-tap vertical subpel MC.
@ In:   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
@       stack: h (sp+8 after push), my (sp+16)
function ff_put_vp8_epel8_v4_neon, export=1
        sub             r2,  r2,  r3            @ start 1 row above
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4    @ &subpel_filters[my-1]
        vld1.16         {q0},     [r4,:128]     @ filter coefficients
1:
        @ load 5 source rows, produce 2 output rows per iteration
        vld1.8          {d2}, [r2], r3
        vld1.8          {d3}, [r2], r3
        vld1.8          {d4}, [r2], r3
        vld1.8          {d5}, [r2], r3
        vld1.8          {d6}, [r2]
        sub             r2,  r2,  r3,  lsl #1   @ net advance: 2 rows

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4, d5,  d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc
1085
@ 8-wide 4-tap horizontal subpel MC.
@ In:   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
@       stack: h (sp+8 after push), mx (sp+12)
function ff_put_vp8_epel8_h4_neon, export=1
        sub             r2,  r2,  #1            @ start 1 pixel left
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4    @ &subpel_filters[mx-1]
        vld1.16         {q0},     [r4,:128]     @ filter coefficients
1:
        vld1.8          {d2,d3},  [r2], r3      @ 16 source bytes

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
1106
@ 8-wide 4-tap horizontal + 4-tap vertical subpel MC.
@ In:   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
@       stack: h, mx, my
@ Two passes through a 16-byte-aligned stack buffer of h+3 filtered
@ rows (8 bytes each).
function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             r2,  r2,  r3            @ start 1 row above
        sub             r2,  r2,  #1            @ and 1 pixel left
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4    @ &subpel_filters[mx-1]
        sub             sp,  sp,  #168+16       @ temp buffer + alignment slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ h+3 intermediate rows
        bic             lr,  lr,  #15           @ 16-byte align the buffer
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4    @ &subpel_filters[my-1]
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!    @ 5 intermediate rows
        vld1.8          {d6},     [lr,:64]
        sub             lr,  lr,  #16           @ net advance: 2 rows

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4, d5,  d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
1154
@ 8-wide 6-tap horizontal + 4-tap vertical subpel MC.
@ In:   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
@       stack: h, mx, my
@ Two passes through a 16-byte-aligned stack buffer of h+3 filtered
@ rows (8 bytes each).
function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             r2,  r2,  r3            @ start 1 row above
        sub             r2,  r2,  #2            @ and 2 pixels left (6-tap)
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4    @ &subpel_filters[mx-1]
        sub             sp,  sp,  #168+16       @ temp buffer + alignment slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ h+3 intermediate rows
        bic             lr,  lr,  #15           @ 16-byte align the buffer
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4    @ &subpel_filters[my-1]
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!    @ 5 intermediate rows
        vld1.8          {d6},     [lr,:64]
        sub             lr,  lr,  #16           @ net advance: 2 rows

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4, d5,  d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
1202
@ 8-wide 4-tap horizontal + 6-tap vertical subpel MC.
@ In:   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
@       stack: h, mx, my
@ Two passes through a 16-byte-aligned stack buffer of h+5 filtered
@ rows (8 bytes each).
function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ start 2 rows above (6-tap)
        sub             r2,  r2,  #1            @ and 1 pixel left (4-tap)
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4    @ &subpel_filters[mx-1]
        sub             sp,  sp,  #168+16       @ temp buffer + alignment slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ h+5 intermediate rows
        bic             lr,  lr,  #15           @ 16-byte align the buffer
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4    @ &subpel_filters[my-1]
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!    @ 7 intermediate rows
        vld1.8          {d6-d7},  [lr,:128]!
        vld1.8          {d30},    [lr,:64]
        sub             lr,  lr,  #32           @ net advance: 2 rows

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4, d5,  d6,  d7,  d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
1251
1252.ltorg
1253
@ 4-wide 6-tap vertical subpel MC: two 4-wide column groups are packed
@ into the two halves of each d register, producing 4 output rows per
@ iteration.
@ In:   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride,
@       stack: h (sp+8 after push), my (sp+16)
function ff_put_vp8_epel4_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ start 2 rows above
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4    @ &subpel_filters[my-1]
        vld1.16         {q0},     [r4,:128]     @ filter coefficients
1:
        @ rows for outputs 0/1 go in lane 0 of d2..d28
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2], r3
        vld1.32         {d7[]},   [r2], r3
        vld1.32         {d28[]},  [r2]
        sub             r2,  r2,  r3,  lsl #2
        @ rows for outputs 2/3 go in lane 1
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2], r3
        vld1.32         {d7[1]},  [r2], r3
        vld1.32         {d28[1]}, [r2]
        sub             r2,  r2,  r3,  lsl #2   @ net advance: 4 rows

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4, d5,  d6,  d7,  d28

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc
1292
@ put_vp8_epel4_h6: 4-pixel-wide 6-tap horizontal-only subpel MC.
@ Args: r0=dst, r1=dststride, r2=src, r3=srcstride; h and mx on the stack.
1293function ff_put_vp8_epel4_h6_neon, export=1
1294 sub r2, r2, #2 @ back up 2 pixels for the 6-tap filter
1295 push {r4,lr}
1296
1297 ldr r4, [sp, #12] @ mx
1298 movrel lr, subpel_filters-16 @ -16: mx is 1..7, 16 bytes per entry
1299 ldr r12, [sp, #8] @ h
1300 add r4, lr, r4, lsl #4
1301 vld1.16 {q0}, [r4,:128] @ q0 = filter coefficients
13021:
1303 vld1.8 {q1}, [r2], r3 @ load 16 pixels (6-tap on 4 needs 9)
1304 vp8_epel8_h6 d2, d2, d3
1305 vst1.32 {d2[0]}, [r0,:32], r1 @ only the low 4 filtered pixels are kept
1306 subs r12, r12, #1
1307 bne 1b
1308
1309 pop {r4,pc}
1310endfunc
1311
@ put_vp8_epel4_h6v6: 4-pixel-wide 6-tap horizontal + 6-tap vertical subpel MC.
@ Args: r0=dst, r1=dststride, r2=src, r3=srcstride; h, mx, my on the stack.
@ Pass 1 writes h+5 filtered 4-byte rows to a stack buffer; pass 2 loads
@ overlapping row groups and vtrn's them so lane 0 computes output rows
@ n,n+1 and lane 1 rows n+2,n+3 in one y2 macro call.
1312function ff_put_vp8_epel4_h6v6_neon, export=1
1313 sub r2, r2, r3, lsl #1 @ back up 2 rows for the 6-tap vertical filter
1314 sub r2, r2, #2 @ back up 2 pixels for the 6-tap horizontal filter
1315 push {r4,lr}
1316
1317 ldr r4, [sp, #12] @ mx
1318 movrel lr, subpel_filters-16 @ -16: index is 1..7, 16 bytes per entry
1319 ldr r12, [sp, #8] @ h
1320 add r4, lr, r4, lsl #4
1321 sub sp, sp, #52+16 @ temp buffer (4 bytes/row; presumably h<=8, so (8+5)*4) + 16 for alignment
1322 vld1.16 {q0}, [r4,:128] @ q0 = horizontal filter coefficients
1323 add lr, sp, #15
1324 add r12, r12, #5 @ vertical 6-tap needs 5 extra rows
1325 bic lr, lr, #15 @ lr = 16-byte-aligned temp buffer
13261:
1327 vld1.8 {q1}, [r2], r3
1328 vp8_epel8_h6 d2, d2, d3
1329 vst1.32 {d2[0]}, [lr,:32]! @ append filtered 4-pixel row
1330 subs r12, r12, #1
1331 bne 1b
1332
 @ second pass (vertical):
1333 ldr r4, [sp, #52+16+16] @ my
1334 movrel lr, subpel_filters-16
1335 ldr r12, [sp, #52+16+8] @ h
1336 add r4, lr, r4, lsl #4
1337 add lr, sp, #15
1338 vld1.16 {q0}, [r4,:128] @ q0 = vertical filter coefficients
1339 bic lr, lr, #15 @ rewind to start of temp buffer
13402:
1341 vld1.8 {d2-d3}, [lr,:128]! @ temp rows n..n+3
1342 vld1.8 {d6}, [lr,:64]! @ temp rows n+4..n+5
1343 vld1.32 {d28[]}, [lr,:32] @ temp row n+6
1344 sub lr, lr, #16
1345 vld1.8 {d4-d5}, [lr]! @ temp rows n+2..n+5 (overlapping reload)
1346 vld1.8 {d7}, [lr,:64]! @ temp rows n+6..n+7
1347 vld1.32 {d28[1]}, [lr,:32] @ temp row n+8
1348 sub lr, lr, #16 @ net advance +16 bytes = 4 rows per iteration
1349 vtrn.32 q1, q2 @ interleave so each d-reg pairs rows 2 apart
1350 vtrn.32 d6, d7
1351 vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
1352 vst1.32 {d2[0]}, [r0,:32], r1
1353 vst1.32 {d3[0]}, [r0,:32], r1
1354 vst1.32 {d2[1]}, [r0,:32], r1
1355 vst1.32 {d3[1]}, [r0,:32], r1
1356 subs r12, r12, #4 @ 4 output rows per iteration
1357 bne 2b
1358
1359 add sp, sp, #52+16 @ release temp buffer
1360 pop {r4,pc}
1361endfunc
1362
@ put_vp8_epel4_h4v6: 4-pixel-wide 4-tap horizontal + 6-tap vertical subpel MC.
@ Args: r0=dst, r1=dststride, r2=src, r3=srcstride; h, mx, my on the stack.
@ Same two-pass scheme as h6v6 above, but the horizontal pass only needs
@ 8 source pixels (4-tap), so a single d-register load suffices.
1363function ff_put_vp8_epel4_h4v6_neon, export=1
1364 sub r2, r2, r3, lsl #1 @ back up 2 rows for the 6-tap vertical filter
1365 sub r2, r2, #1 @ back up 1 pixel for the 4-tap horizontal filter
1366 push {r4,lr}
1367
1368 ldr r4, [sp, #12] @ mx
1369 movrel lr, subpel_filters-16 @ -16: index is 1..7, 16 bytes per entry
1370 ldr r12, [sp, #8] @ h
1371 add r4, lr, r4, lsl #4
1372 sub sp, sp, #52+16 @ temp buffer (4 bytes/row) + 16 for alignment
1373 vld1.16 {q0}, [r4,:128] @ q0 = horizontal filter coefficients
1374 add lr, sp, #15
1375 add r12, r12, #5 @ vertical 6-tap needs 5 extra rows
1376 bic lr, lr, #15 @ lr = 16-byte-aligned temp buffer
13771:
1378 vld1.8 {d2}, [r2], r3
1379 vp8_epel8_h4 d2, d2, d2
1380 vst1.32 {d2[0]}, [lr,:32]! @ append filtered 4-pixel row
1381 subs r12, r12, #1
1382 bne 1b
1383
 @ second pass (vertical):
1384 ldr r4, [sp, #52+16+16] @ my
1385 movrel lr, subpel_filters-16
1386 ldr r12, [sp, #52+16+8] @ h
1387 add r4, lr, r4, lsl #4
1388 add lr, sp, #15
1389 vld1.16 {q0}, [r4,:128] @ q0 = vertical filter coefficients
1390 bic lr, lr, #15 @ rewind to start of temp buffer
13912:
1392 vld1.8 {d2-d3}, [lr,:128]! @ temp rows n..n+3
1393 vld1.8 {d6}, [lr,:64]! @ temp rows n+4..n+5
1394 vld1.32 {d28[]}, [lr,:32] @ temp row n+6
1395 sub lr, lr, #16
1396 vld1.8 {d4-d5}, [lr]! @ temp rows n+2..n+5 (overlapping reload)
1397 vld1.8 {d7}, [lr,:64]! @ temp rows n+6..n+7
1398 vld1.32 {d28[1]}, [lr,:32] @ temp row n+8
1399 sub lr, lr, #16 @ net advance +16 bytes = 4 rows per iteration
1400 vtrn.32 q1, q2 @ interleave so each d-reg pairs rows 2 apart
1401 vtrn.32 d6, d7
1402 vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
1403 vst1.32 {d2[0]}, [r0,:32], r1
1404 vst1.32 {d3[0]}, [r0,:32], r1
1405 vst1.32 {d2[1]}, [r0,:32], r1
1406 vst1.32 {d3[1]}, [r0,:32], r1
1407 subs r12, r12, #4 @ 4 output rows per iteration
1408 bne 2b
1409
1410 add sp, sp, #52+16 @ release temp buffer
1411 pop {r4,pc}
1412endfunc
1413
@ put_vp8_epel4_h6v4: 4-pixel-wide 6-tap horizontal + 4-tap vertical subpel MC.
@ Args: r0=dst, r1=dststride, r2=src, r3=srcstride; h, mx, my on the stack.
@ The 4-tap vertical pass only needs h+3 intermediate rows and one row of
@ vertical context, hence the smaller temp buffer and single-row backup.
1414function ff_put_vp8_epel4_h6v4_neon, export=1
1415 sub r2, r2, r3 @ back up 1 row for the 4-tap vertical filter
1416 sub r2, r2, #2 @ back up 2 pixels for the 6-tap horizontal filter
1417 push {r4,lr}
1418
1419 ldr r4, [sp, #12] @ mx
1420 movrel lr, subpel_filters-16 @ -16: index is 1..7, 16 bytes per entry
1421 ldr r12, [sp, #8] @ h
1422 add r4, lr, r4, lsl #4
1423 sub sp, sp, #44+16 @ temp buffer (4 bytes/row; presumably h<=8, so (8+3)*4) + 16 for alignment
1424 vld1.16 {q0}, [r4,:128] @ q0 = horizontal filter coefficients
1425 add lr, sp, #15
1426 add r12, r12, #3 @ vertical 4-tap needs 3 extra rows
1427 bic lr, lr, #15 @ lr = 16-byte-aligned temp buffer
14281:
1429 vld1.8 {q1}, [r2], r3
1430 vp8_epel8_h6 d2, d2, d3
1431 vst1.32 {d2[0]}, [lr,:32]! @ append filtered 4-pixel row
1432 subs r12, r12, #1
1433 bne 1b
1434
 @ second pass (vertical):
1435 ldr r4, [sp, #44+16+16] @ my
1436 movrel lr, subpel_filters-16
1437 ldr r12, [sp, #44+16+8] @ h
1438 add r4, lr, r4, lsl #4
1439 add lr, sp, #15
1440 vld1.16 {q0}, [r4,:128] @ q0 = vertical filter coefficients
1441 bic lr, lr, #15 @ rewind to start of temp buffer
14422:
1443 vld1.8 {d2-d3}, [lr,:128]! @ temp rows n..n+3
1444 vld1.32 {d6[]}, [lr,:32] @ temp row n+4
1445 sub lr, lr, #8
1446 vld1.8 {d4-d5}, [lr]! @ temp rows n+2..n+5 (overlapping reload)
1447 vld1.32 {d6[1]}, [lr,:32] @ temp row n+6
1448 sub lr, lr, #8 @ net advance +16 bytes = 4 rows per iteration
1449 vtrn.32 q1, q2 @ interleave so each d-reg pairs rows 2 apart
1450 vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
1451 vst1.32 {d2[0]}, [r0,:32], r1
1452 vst1.32 {d3[0]}, [r0,:32], r1
1453 vst1.32 {d2[1]}, [r0,:32], r1
1454 vst1.32 {d3[1]}, [r0,:32], r1
1455 subs r12, r12, #4 @ 4 output rows per iteration
1456 bne 2b
1457
1458 add sp, sp, #44+16 @ release temp buffer
1459 pop {r4,pc}
1460endfunc
1461
@ put_vp8_epel4_h4: 4-pixel-wide 4-tap horizontal-only subpel MC.
@ Args: r0=dst, r1=dststride, r2=src, r3=srcstride; h and mx on the stack.
1462function ff_put_vp8_epel4_h4_neon, export=1
1463 sub r2, r2, #1 @ back up 1 pixel for the 4-tap filter
1464 push {r4,lr}
1465
1466 ldr r4, [sp, #12] @ mx
1467 movrel lr, subpel_filters-16 @ -16: mx is 1..7, 16 bytes per entry
1468 ldr r12, [sp, #8] @ h
1469 add r4, lr, r4, lsl #4
1470 vld1.16 {q0}, [r4,:128] @ q0 = filter coefficients
14711:
1472 vld1.8 {d2}, [r2], r3 @ 8 pixels (4-tap on 4 needs 7)
1473 vp8_epel8_h4 d2, d2, d2
1474 vst1.32 {d2[0]}, [r0,:32], r1 @ only the low 4 filtered pixels are kept
1475 subs r12, r12, #1
1476 bne 1b
1477
1478 pop {r4,pc}
1479endfunc
1480
@ put_vp8_epel4_v4: 4-pixel-wide 4-tap vertical-only subpel MC.
@ Args: r0=dst, r1=dststride, r2=src, r3=srcstride; h and my on the stack.
@ Two 4-pixel columns are packed per d-register (lane 0 / lane 1), giving
@ 4 output rows per loop iteration.
1481function ff_put_vp8_epel4_v4_neon, export=1
1482 sub r2, r2, r3 @ back up 1 row for the 4-tap filter
1483 push {r4,lr}
1484
1485 ldr r4, [sp, #16] @ my
1486 movrel lr, subpel_filters-16 @ -16: my is 1..7, 16 bytes per entry
1487 ldr r12, [sp, #8] @ h
1488 add r4, lr, r4, lsl #4
1489 vld1.16 {q0}, [r4,:128] @ q0 = filter coefficients
14901:
 @ lane 0: source rows n..n+4 (yields output rows n, n+1)
1491 vld1.32 {d2[]}, [r2], r3
1492 vld1.32 {d3[]}, [r2], r3
1493 vld1.32 {d4[]}, [r2], r3
1494 vld1.32 {d5[]}, [r2], r3
1495 vld1.32 {d6[]}, [r2]
1496 sub r2, r2, r3, lsl #1 @ rewind 2 rows
 @ lane 1: source rows n+2..n+6 (yields output rows n+2, n+3)
1497 vld1.32 {d2[1]}, [r2], r3
1498 vld1.32 {d3[1]}, [r2], r3
1499 vld1.32 {d4[1]}, [r2], r3
1500 vld1.32 {d5[1]}, [r2], r3
1501 vld1.32 {d6[1]}, [r2]
1502 sub r2, r2, r3, lsl #1 @ net advance: 4 source rows per iteration
1503
1504 vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
1505
1506 vst1.32 {d2[0]}, [r0,:32], r1
1507 vst1.32 {d3[0]}, [r0,:32], r1
1508 vst1.32 {d2[1]}, [r0,:32], r1
1509 vst1.32 {d3[1]}, [r0,:32], r1
1510 subs r12, r12, #4 @ 4 output rows per iteration
1511 bne 1b
1512
1513 pop {r4,pc}
1514endfunc
1515
@ put_vp8_epel4_h4v4: 4-pixel-wide 4-tap horizontal + 4-tap vertical subpel MC.
@ Args: r0=dst, r1=dststride, r2=src, r3=srcstride; h, mx, my on the stack.
@ Pass 1 writes h+3 filtered 4-byte rows to a stack buffer; pass 2 loads
@ overlapping row groups and vtrn's them so lane 0 computes output rows
@ n,n+1 and lane 1 rows n+2,n+3 per y2 macro call.
1516function ff_put_vp8_epel4_h4v4_neon, export=1
1517 sub r2, r2, r3 @ back up 1 row for the 4-tap vertical filter
1518 sub r2, r2, #1 @ back up 1 pixel for the 4-tap horizontal filter
1519 push {r4,lr}
1520
1521 ldr r4, [sp, #12] @ mx
1522 movrel lr, subpel_filters-16 @ -16: index is 1..7, 16 bytes per entry
1523 ldr r12, [sp, #8] @ h
1524 add r4, lr, r4, lsl #4
1525 sub sp, sp, #44+16 @ temp buffer (4 bytes/row) + 16 for alignment
1526 vld1.16 {q0}, [r4,:128] @ q0 = horizontal filter coefficients
1527 add lr, sp, #15
1528 add r12, r12, #3 @ vertical 4-tap needs 3 extra rows
1529 bic lr, lr, #15 @ lr = 16-byte-aligned temp buffer
15301:
1531 vld1.8 {d2}, [r2], r3
1532 vp8_epel8_h4 d2, d2, d3
1533 vst1.32 {d2[0]}, [lr,:32]! @ append filtered 4-pixel row
1534 subs r12, r12, #1
1535 bne 1b
1536
 @ second pass (vertical):
1537 ldr r4, [sp, #44+16+16] @ my
1538 movrel lr, subpel_filters-16
1539 ldr r12, [sp, #44+16+8] @ h
1540 add r4, lr, r4, lsl #4
1541 add lr, sp, #15
1542 vld1.16 {q0}, [r4,:128] @ q0 = vertical filter coefficients
1543 bic lr, lr, #15 @ rewind to start of temp buffer
15442:
1545 vld1.8 {d2-d3}, [lr,:128]! @ temp rows n..n+3
1546 vld1.32 {d6[]}, [lr,:32] @ temp row n+4
1547 sub lr, lr, #8
1548 vld1.8 {d4-d5}, [lr]! @ temp rows n+2..n+5 (overlapping reload)
1549 vld1.32 {d6[1]}, [lr,:32] @ temp row n+6
1550 sub lr, lr, #8 @ net advance +16 bytes = 4 rows per iteration
1551 vtrn.32 q1, q2 @ interleave so each d-reg pairs rows 2 apart
1552 vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
1553 vst1.32 {d2[0]}, [r0,:32], r1
1554 vst1.32 {d3[0]}, [r0,:32], r1
1555 vst1.32 {d2[1]}, [r0,:32], r1
1556 vst1.32 {d3[1]}, [r0,:32], r1
1557 subs r12, r12, #4 @ 4 output rows per iteration
1558 bne 2b
1559
1560 add sp, sp, #44+16 @ release temp buffer
1561 pop {r4,pc}
1562endfunc
1563
1564@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
1565@ arithmetic can be used to apply filters
@ VP8 six-tap subpel filter coefficients, one 16-byte (8 x int16) entry per
@ subpel position 1..7; coefficients are stored as magnitudes
@ (NOTE(review): tap signs appear to be applied by the vp8_epel macros
@ defined earlier in this file -- confirm against their definitions).
1566const subpel_filters, align=4
1567 .short 0, 6, 123, 12, 1, 0, 0, 0
1568 .short 2, 11, 108, 36, 8, 1, 0, 0
1569 .short 0, 9, 93, 50, 6, 0, 0, 0
1570 .short 3, 16, 77, 77, 16, 3, 0, 0
1571 .short 0, 6, 50, 93, 9, 0, 0, 0
1572 .short 1, 8, 36, 108, 11, 2, 0, 0
1573 .short 0, 1, 12, 123, 6, 0, 0, 0
1574endconst
1575
1576/* Bilinear MC */
1577
@ put_vp8_bilin16_h: 16-pixel-wide horizontal bilinear MC.
@ Args: r0=dst, r1=dststride, r2=src, r3=srcstride; h and mx on the stack.
@ dst[x] = (src[x]*(8-mx) + src[x+1]*mx + 4) >> 3; two rows per iteration.
1578function ff_put_vp8_bilin16_h_neon, export=1
1579 ldr r12, [sp, #4] @ mx
1580 vdup.8 d0, r12 @ d0 = mx
1581 rsb r12, r12, #8
1582 vdup.8 d1, r12 @ d1 = 8 - mx
1583 ldr r12, [sp] @ h
15841:
1585 subs r12, r12, #2 @ two rows per iteration
1586 vld1.8 {d2-d4}, [r2], r3 @ row 0: 17+ pixels needed for the +1 tap
1587 vext.8 q2, q1, q2, #1 @ q2 = row 0 shifted left one pixel
1588 vmull.u8 q8, d2, d1
1589 vmlal.u8 q8, d4, d0
1590 vld1.8 {d18-d20},[r2], r3 @ row 1
1591 vmull.u8 q3, d3, d1
1592 vmlal.u8 q3, d5, d0
1593 vext.8 q10, q9, q10, #1 @ q10 = row 1 shifted left one pixel
1594 vmull.u8 q11, d18, d1
1595 vmlal.u8 q11, d20, d0
1596 vmull.u8 q12, d19, d1
1597 vmlal.u8 q12, d21, d0
1598 vrshrn.u16 d4, q8, #3 @ (sum + 4) >> 3
1599 vrshrn.u16 d5, q3, #3
1600 vrshrn.u16 d6, q11, #3
1601 vrshrn.u16 d7, q12, #3
1602 vst1.8 {q2}, [r0,:128], r1
1603 vst1.8 {q3}, [r0,:128], r1
1604 bgt 1b
1605
1606 bx lr
1607endfunc
1608
@ put_vp8_bilin16_v: 16-pixel-wide vertical bilinear MC.
@ Args: r0=dst, r1=dststride, r2=src, r3=srcstride; h and my on the stack.
@ dst = (row[n]*(8-my) + row[n+1]*my + 4) >> 3; q1 carries the previous
@ source row across iterations; two output rows per iteration.
1609function ff_put_vp8_bilin16_v_neon, export=1
1610 ldr r12, [sp, #8] @ my
1611 vdup.8 d0, r12 @ d0 = my
1612 rsb r12, r12, #8
1613 vdup.8 d1, r12 @ d1 = 8 - my
1614 ldr r12, [sp] @ h
1615 vld1.8 {q1}, [r2], r3 @ prime q1 with source row 0
16161:
1617 subs r12, r12, #2 @ two output rows per iteration
1618 vld1.8 {q2}, [r2], r3 @ next source row
1619 vmull.u8 q3, d2, d1
1620 vmlal.u8 q3, d4, d0
1621 vmull.u8 q8, d3, d1
1622 vmlal.u8 q8, d5, d0
1623 vld1.8 {q1}, [r2], r3 @ row after that (also next iteration's q1)
1624 vmull.u8 q9, d4, d1
1625 vmlal.u8 q9, d2, d0
1626 vmull.u8 q10, d5, d1
1627 vmlal.u8 q10, d3, d0
1628 vrshrn.u16 d4, q3, #3 @ (sum + 4) >> 3
1629 vrshrn.u16 d5, q8, #3
1630 vrshrn.u16 d6, q9, #3
1631 vrshrn.u16 d7, q10, #3
1632 vst1.8 {q2}, [r0,:128], r1
1633 vst1.8 {q3}, [r0,:128], r1
1634 bgt 1b
1635
1636 bx lr
1637endfunc
1638
@ put_vp8_bilin16_hv: 16-pixel-wide horizontal+vertical bilinear MC.
@ Args: r0=dst, r1=dststride, r2=src, r3=srcstride; h, mx, my on the stack.
@ Each source row is first filtered horizontally with mx/(8-mx); the
@ vertical pass then blends consecutive filtered rows with my/(8-my).
@ q2 (d4/d5) carries the previous horizontally-filtered row across
@ iterations; two output rows per iteration.
1639function ff_put_vp8_bilin16_hv_neon, export=1
1640 ldr r12, [sp, #4] @ mx
1641 vdup.8 d0, r12 @ d0 = mx
1642 rsb r12, r12, #8
1643 vdup.8 d1, r12 @ d1 = 8 - mx
1644 ldr r12, [sp, #8] @ my
1645 vdup.8 d2, r12 @ d2 = my
1646 rsb r12, r12, #8
1647 vdup.8 d3, r12 @ d3 = 8 - my
1648 ldr r12, [sp] @ h
1649
 @ prime q2 with the horizontally-filtered first source row
1650 vld1.8 {d4-d6}, [r2], r3
1651 vext.8 q3, q2, q3, #1 @ row shifted left one pixel
1652 vmull.u8 q8, d4, d1
1653 vmlal.u8 q8, d6, d0
1654 vmull.u8 q9, d5, d1
1655 vmlal.u8 q9, d7, d0
1656 vrshrn.u16 d4, q8, #3
1657 vrshrn.u16 d5, q9, #3
16581:
1659 subs r12, r12, #2 @ two output rows per iteration
1660 vld1.8 {d18-d20},[r2], r3 @ next source row
1661 vext.8 q10, q9, q10, #1
1662 vmull.u8 q11, d18, d1
1663 vmlal.u8 q11, d20, d0
1664 vld1.8 {d26-d28},[r2], r3 @ row after that
1665 vmull.u8 q12, d19, d1
1666 vmlal.u8 q12, d21, d0
1667 vext.8 q14, q13, q14, #1
1668 vmull.u8 q8, d26, d1
1669 vmlal.u8 q8, d28, d0
1670 vmull.u8 q9, d27, d1
1671 vmlal.u8 q9, d29, d0
1672 vrshrn.u16 d6, q11, #3 @ q3 = filtered row n+1
1673 vrshrn.u16 d7, q12, #3
 @ vertical blend: prev filtered row (q2) with q3
1674 vmull.u8 q12, d4, d3
1675 vmlal.u8 q12, d6, d2
1676 vmull.u8 q15, d5, d3
1677 vmlal.u8 q15, d7, d2
1678 vrshrn.u16 d4, q8, #3 @ q2 = filtered row n+2 (carried to next iter)
1679 vrshrn.u16 d5, q9, #3
1680 vmull.u8 q10, d6, d3
1681 vmlal.u8 q10, d4, d2
1682 vmull.u8 q11, d7, d3
1683 vmlal.u8 q11, d5, d2
1684 vrshrn.u16 d24, q12, #3
1685 vrshrn.u16 d25, q15, #3
1686 vst1.8 {q12}, [r0,:128], r1
1687 vrshrn.u16 d20, q10, #3
1688 vrshrn.u16 d21, q11, #3
1689 vst1.8 {q10}, [r0,:128], r1
1690 bgt 1b
1691
1692 bx lr
1693endfunc
1694
@ put_vp8_bilin8_h: 8-pixel-wide horizontal bilinear MC.
@ Args: r0=dst, r1=dststride, r2=src, r3=srcstride; h and mx on the stack.
@ dst[x] = (src[x]*(8-mx) + src[x+1]*mx + 4) >> 3; two rows per iteration.
1695function ff_put_vp8_bilin8_h_neon, export=1
1696 ldr r12, [sp, #4] @ mx
1697 vdup.8 d0, r12 @ d0 = mx
1698 rsb r12, r12, #8
1699 vdup.8 d1, r12 @ d1 = 8 - mx
1700 ldr r12, [sp] @ h
17011:
1702 subs r12, r12, #2 @ two rows per iteration
1703 vld1.8 {q1}, [r2], r3 @ row 0: 9+ pixels needed for the +1 tap
1704 vext.8 d3, d2, d3, #1 @ d3 = row 0 shifted left one pixel
1705 vmull.u8 q2, d2, d1
1706 vmlal.u8 q2, d3, d0
1707 vld1.8 {q3}, [r2], r3 @ row 1
1708 vext.8 d7, d6, d7, #1
1709 vmull.u8 q8, d6, d1
1710 vmlal.u8 q8, d7, d0
1711 vrshrn.u16 d4, q2, #3 @ (sum + 4) >> 3
1712 vrshrn.u16 d16, q8, #3
1713 vst1.8 {d4}, [r0,:64], r1
1714 vst1.8 {d16}, [r0,:64], r1
1715 bgt 1b
1716
1717 bx lr
1718endfunc
1719
@ put_vp8_bilin8_v: 8-pixel-wide vertical bilinear MC.
@ Args: r0=dst, r1=dststride, r2=src, r3=srcstride; h and my on the stack.
@ d2 carries the previous source row; two output rows per iteration.
1720function ff_put_vp8_bilin8_v_neon, export=1
1721 ldr r12, [sp, #8] @ my
1722 vdup.8 d0, r12 @ d0 = my
1723 rsb r12, r12, #8
1724 vdup.8 d1, r12 @ d1 = 8 - my
1725 ldr r12, [sp] @ h
1726 vld1.8 {d2}, [r2], r3 @ prime d2 with source row 0
17271:
1728 subs r12, r12, #2 @ two output rows per iteration
1729 vld1.8 {d3}, [r2], r3 @ next source row
1730 vmull.u8 q2, d2, d1
1731 vmlal.u8 q2, d3, d0
1732 vld1.8 {d2}, [r2], r3 @ row after that (also next iteration's d2)
1733 vmull.u8 q3, d3, d1
1734 vmlal.u8 q3, d2, d0
1735 vrshrn.u16 d4, q2, #3 @ (sum + 4) >> 3
1736 vrshrn.u16 d6, q3, #3
1737 vst1.8 {d4}, [r0,:64], r1
1738 vst1.8 {d6}, [r0,:64], r1
1739 bgt 1b
1740
1741 bx lr
1742endfunc
1743
@ put_vp8_bilin8_hv: 8-pixel-wide horizontal+vertical bilinear MC.
@ Args: r0=dst, r1=dststride, r2=src, r3=srcstride; h, mx, my on the stack.
@ Each row is filtered horizontally with mx/(8-mx), then consecutive
@ filtered rows are blended with my/(8-my). d22 carries the previous
@ horizontally-filtered row; two output rows per iteration.
1744function ff_put_vp8_bilin8_hv_neon, export=1
1745 ldr r12, [sp, #4] @ mx
1746 vdup.8 d0, r12 @ d0 = mx
1747 rsb r12, r12, #8
1748 vdup.8 d1, r12 @ d1 = 8 - mx
1749 ldr r12, [sp, #8] @ my
1750 vdup.8 d2, r12 @ d2 = my
1751 rsb r12, r12, #8
1752 vdup.8 d3, r12 @ d3 = 8 - my
1753 ldr r12, [sp] @ h
1754
 @ prime d22 with the horizontally-filtered first source row
1755 vld1.8 {q2}, [r2], r3
1756 vext.8 d5, d4, d5, #1 @ row shifted left one pixel
1757 vmull.u8 q9, d4, d1
1758 vmlal.u8 q9, d5, d0
1759 vrshrn.u16 d22, q9, #3
17601:
1761 subs r12, r12, #2 @ two output rows per iteration
1762 vld1.8 {q3}, [r2], r3 @ next source row
1763 vext.8 d7, d6, d7, #1
1764 vmull.u8 q8, d6, d1
1765 vmlal.u8 q8, d7, d0
1766 vld1.8 {q2}, [r2], r3 @ row after that
1767 vext.8 d5, d4, d5, #1
1768 vmull.u8 q9, d4, d1
1769 vmlal.u8 q9, d5, d0
1770 vrshrn.u16 d16, q8, #3 @ d16 = filtered row n+1
1771 vmull.u8 q10, d22, d3 @ blend prev filtered row with d16
1772 vmlal.u8 q10, d16, d2
1773 vrshrn.u16 d22, q9, #3 @ d22 = filtered row n+2 (carried to next iter)
1774 vmull.u8 q12, d16, d3
1775 vmlal.u8 q12, d22, d2
1776 vrshrn.u16 d20, q10, #3
1777 vst1.8 {d20}, [r0,:64], r1
1778 vrshrn.u16 d23, q12, #3
1779 vst1.8 {d23}, [r0,:64], r1
1780 bgt 1b
1781
1782 bx lr
1783endfunc
1784
@ put_vp8_bilin4_h: 4-pixel-wide horizontal bilinear MC.
@ Args: r0=dst, r1=dststride, r2=src, r3=srcstride; h and mx on the stack.
@ Two rows are packed into one q register via vtrn so a single
@ multiply/narrow sequence produces both output rows.
1785function ff_put_vp8_bilin4_h_neon, export=1
1786 ldr r12, [sp, #4] @ mx
1787 vdup.8 d0, r12 @ d0 = mx
1788 rsb r12, r12, #8
1789 vdup.8 d1, r12 @ d1 = 8 - mx
1790 ldr r12, [sp] @ h
17911:
1792 subs r12, r12, #2 @ two rows per iteration
1793 vld1.8 {d2}, [r2], r3 @ row 0 (4-wide needs 5 pixels)
1794 vext.8 d3, d2, d3, #1 @ row 0 shifted left one pixel
1795 vld1.8 {d6}, [r2], r3 @ row 1
1796 vext.8 d7, d6, d7, #1
1797 vtrn.32 q1, q3 @ pack both rows into d2/d3 lanes
1798 vmull.u8 q2, d2, d1
1799 vmlal.u8 q2, d3, d0
1800 vrshrn.u16 d4, q2, #3 @ (sum + 4) >> 3
1801 vst1.32 {d4[0]}, [r0,:32], r1
1802 vst1.32 {d4[1]}, [r0,:32], r1
1803 bgt 1b
1804
1805 bx lr
1806endfunc
1807
@ put_vp8_bilin4_v: 4-pixel-wide vertical bilinear MC.
@ Args: r0=dst, r1=dststride, r2=src, r3=srcstride; h and my on the stack.
@ d2 holds rows {n, n+1} and d3 rows {n+1, n+2} so one multiply/narrow
@ sequence produces two output rows; the vtrn carries the newest row
@ into d2[0] for the next iteration.
1808function ff_put_vp8_bilin4_v_neon, export=1
1809 ldr r12, [sp, #8] @ my
1810 vdup.8 d0, r12 @ d0 = my
1811 rsb r12, r12, #8
1812 vdup.8 d1, r12 @ d1 = 8 - my
1813 ldr r12, [sp] @ h
1814 vld1.32 {d2[]}, [r2], r3 @ prime d2 with source row 0
18151:
1816 vld1.32 {d3[]}, [r2] @ row n+1 (no advance)
1817 vld1.32 {d2[1]}, [r2], r3 @ row n+1 again, into d2 lane 1
1818 vld1.32 {d3[1]}, [r2], r3 @ row n+2
1819 vmull.u8 q2, d2, d1
1820 vmlal.u8 q2, d3, d0
1821 vtrn.32 d3, d2 @ carry newest row to d2[0] for next iteration
1822 vrshrn.u16 d4, q2, #3 @ (sum + 4) >> 3
1823 vst1.32 {d4[0]}, [r0,:32], r1
1824 vst1.32 {d4[1]}, [r0,:32], r1
1825 subs r12, r12, #2 @ two output rows per iteration
1826 bgt 1b
1827
1828 bx lr
1829endfunc
1830
@ put_vp8_bilin4_hv: 4-pixel-wide horizontal+vertical bilinear MC.
@ Args: r0=dst, r1=dststride, r2=src, r3=srcstride; h, mx, my on the stack.
@ Rows are filtered horizontally with mx/(8-mx) two at a time (packed via
@ vtrn), then blended vertically with my/(8-my). d22 carries the previous
@ horizontally-filtered row across iterations; two output rows per loop.
1831function ff_put_vp8_bilin4_hv_neon, export=1
1832 ldr r12, [sp, #4] @ mx
1833 vdup.8 d0, r12 @ d0 = mx
1834 rsb r12, r12, #8
1835 vdup.8 d1, r12 @ d1 = 8 - mx
1836 ldr r12, [sp, #8] @ my
1837 vdup.8 d2, r12 @ d2 = my
1838 rsb r12, r12, #8
1839 vdup.8 d3, r12 @ d3 = 8 - my
1840 ldr r12, [sp] @ h
1841
 @ prime d22 with the horizontally-filtered first source row
1842 vld1.8 {d4}, [r2], r3
1843 vext.8 d5, d4, d4, #1 @ row shifted left one pixel
1844 vmull.u8 q9, d4, d1
1845 vmlal.u8 q9, d5, d0
1846 vrshrn.u16 d22, q9, #3
18471:
1848 subs r12, r12, #2 @ two output rows per iteration
1849 vld1.8 {d6}, [r2], r3 @ next source row
1850 vext.8 d7, d6, d6, #1
1851 vld1.8 {d4}, [r2], r3 @ row after that
1852 vext.8 d5, d4, d4, #1
1853 vtrn.32 q3, q2 @ pack both rows into d6/d7 lanes
1854 vmull.u8 q8, d6, d1
1855 vmlal.u8 q8, d7, d0
1856 vrshrn.u16 d16, q8, #3 @ d16 = filtered rows {n+1, n+2}
1857 vmull.u8 q10, d16, d2 @ vertical blend: my * newer rows
1858 vtrn.32 d22, d16 @ d22 = filtered rows {n, n+1}
1859 vmlal.u8 q10, d22, d3 @ + (8-my) * older rows
1860 vrev64.32 d22, d16 @ carry filtered row n+2 to d22[0] for next iter
1861 vrshrn.u16 d20, q10, #3 @ (sum + 4) >> 3
1862 vst1.32 {d20[0]}, [r0,:32], r1
1863 vst1.32 {d20[1]}, [r0,:32], r1
1864 bgt 1b
1865
1866 bx lr
1867endfunc