/*
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

function ff_vp8_luma_dc_wht_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        vmov.i16        q15, #0

        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vst1.16         {q15},    [r1,:128]!
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vst1.16         {q15},    [r1,:128]
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vmov.i16        q8,  #3

        vtrn.32         d0,  d2
        vtrn.32         d1,  d3
        vtrn.16         d0,  d1
        vtrn.16         d2,  d3

        vadd.i16        d0,  d0,  d16

        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vshr.s16        q0,  q0,  #3
        vshr.s16        q1,  q1,  #3

        mov             r3,  #32
        vst1.16         {d0[0]},  [r0,:16], r3
        vst1.16         {d1[0]},  [r0,:16], r3
        vst1.16         {d2[0]},  [r0,:16], r3
        vst1.16         {d3[0]},  [r0,:16], r3
        vst1.16         {d0[1]},  [r0,:16], r3
        vst1.16         {d1[1]},  [r0,:16], r3
        vst1.16         {d2[1]},  [r0,:16], r3
        vst1.16         {d3[1]},  [r0,:16], r3
        vst1.16         {d0[2]},  [r0,:16], r3
        vst1.16         {d1[2]},  [r0,:16], r3
        vst1.16         {d2[2]},  [r0,:16], r3
        vst1.16         {d3[2]},  [r0,:16], r3
        vst1.16         {d0[3]},  [r0,:16], r3
        vst1.16         {d1[3]},  [r0,:16], r3
        vst1.16         {d2[3]},  [r0,:16], r3
        vst1.16         {d3[3]},  [r0,:16], r3

        bx              lr
endfunc
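
@ For reference, a plain-C sketch of the inverse Walsh-Hadamard transform
@ implemented above (a hedged reconstruction from the code, not the exact
@ FFmpeg reference): two butterfly passes around a transpose, with the +3
@ rounding bias folded into the second pass, the dc[] buffer cleared as it
@ is consumed, and each output DC written into a block 32 bytes (16
@ int16_t) apart -- hence the #32 store stride:
@
@     static void luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
@     {
@         int i, t0, t1, t2, t3;
@         for (i = 0; i < 4; i++) {                 // pass 1: columns
@             t0 = dc[0*4 + i] + dc[3*4 + i];
@             t1 = dc[1*4 + i] + dc[2*4 + i];
@             t2 = dc[1*4 + i] - dc[2*4 + i];
@             t3 = dc[0*4 + i] - dc[3*4 + i];
@             dc[0*4 + i] = t0 + t1;
@             dc[1*4 + i] = t3 + t2;
@             dc[2*4 + i] = t0 - t1;
@             dc[3*4 + i] = t3 - t2;
@         }
@         for (i = 0; i < 4; i++) {                 // pass 2: rows
@             t0 = dc[i*4 + 0] + dc[i*4 + 3] + 3;   // +3 = rounding bias
@             t1 = dc[i*4 + 1] + dc[i*4 + 2];
@             t2 = dc[i*4 + 1] - dc[i*4 + 2];
@             t3 = dc[i*4 + 0] - dc[i*4 + 3] + 3;
@             block[i][0][0] = (t0 + t1) >> 3;
@             block[i][1][0] = (t3 + t2) >> 3;
@             block[i][2][0] = (t0 - t1) >> 3;
@             block[i][3][0] = (t3 - t2) >> 3;
@         }
@     }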

function ff_vp8_idct_add_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        movw            r3,  #20091
        movt            r3,  #35468/2
        vdup.32         d4,  r3

        vmull.s16       q12, d1,  d4[0]
        vmull.s16       q13, d3,  d4[0]
        vqdmulh.s16     d20, d1,  d4[1]
        vqdmulh.s16     d23, d3,  d4[1]
        vshrn.s32       d21, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.s16        d21, d21, d1
        vadd.s16        d22, d22, d3

        vadd.s16        d16, d0,  d2
        vsub.s16        d17, d0,  d2
        vadd.s16        d18, d21, d23
        vsub.s16        d19, d20, d22
        vadd.s16        q0,  q8,  q9
        vsub.s16        q1,  q8,  q9

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        vmov.i16        q15, #0
        vmull.s16       q12, d1,  d4[0]
        vst1.16         {q15},    [r1,:128]!
        vmull.s16       q13, d2,  d4[0]
        vst1.16         {q15},    [r1,:128]
        vqdmulh.s16     d21, d1,  d4[1]
        vqdmulh.s16     d23, d2,  d4[1]
        vshrn.s32       d20, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.i16        d20, d20, d1
        vadd.i16        d22, d22, d2

        vadd.i16        d16, d0,  d3
        vsub.i16        d17, d0,  d3
        vadd.i16        d18, d20, d23
        vld1.32         {d20[]},  [r0,:32], r2
        vsub.i16        d19, d21, d22
        vld1.32         {d22[]},  [r0,:32], r2
        vadd.s16        q0,  q8,  q9
        vld1.32         {d23[]},  [r0,:32], r2
        vsub.s16        q1,  q8,  q9
        vld1.32         {d21[]},  [r0,:32], r2
        vrshr.s16       q0,  q0,  #3
        vtrn.32         q10, q11
        vrshr.s16       q1,  q1,  #3

        sub             r0,  r0,  r2,  lsl #2

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        vaddw.u8        q0,  q0,  d20
        vaddw.u8        q1,  q1,  d21
        vqmovun.s16     d0,  q0
        vqmovun.s16     d1,  q1

        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2

        bx              lr
endfunc
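
@ About the constants above: 20091 and 35468 are the usual VP8 inverse
@ transform multipliers, i.e. (sqrt(2)*cos(pi/8) - 1) * 65536 and
@ sqrt(2)*sin(pi/8) * 65536 rounded to integers.  A scalar sketch of the
@ two products the NEON code forms (hedged, mirroring the vmull/vshrn/vadd
@ and vqdmulh sequences):
@
@     int mul1(int a) { return ((a * 20091) >> 16) + a; } // ~ a*sqrt(2)*cos(pi/8)
@     int mul2(int a) { return  (a * 35468) >> 16; }      // ~ a*sqrt(2)*sin(pi/8)
@
@ The vqdmulh path loads 35468/2 because vqdmulh doubles its product
@ before taking the high 16 bits.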

function ff_vp8_idct_dc_add_neon, export=1
        mov             r3,  #0
        ldrsh           r12, [r1]
        strh            r3,  [r1]
        vdup.16         q1,  r12
        vrshr.s16       q1,  q1,  #3
        vld1.32         {d0[]},   [r0,:32], r2
        vld1.32         {d1[]},   [r0,:32], r2
        vld1.32         {d0[1]},  [r0,:32], r2
        vld1.32         {d1[1]},  [r0,:32], r2
        vaddw.u8        q2,  q1,  d0
        vaddw.u8        q3,  q1,  d1
        sub             r0,  r0,  r2,  lsl #2
        vqmovun.s16     d0,  q2
        vqmovun.s16     d1,  q3
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        bx              lr
endfunc
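
@ Per pixel the function above reduces to (hedged sketch):
@
@     dc = (block[0] + 4) >> 3;                 // vrshr.s16 #3
@     dst[x] = av_clip_uint8(dst[x] + dc);      // vaddw.u8 + vqmovun.s16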

function ff_vp8_idct_dc_add4uv_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        mov             r3,  r0
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {d0},     [r0,:64], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {d1},     [r0,:64], r2
        vaddw.u8        q10, q8,  d0
        vld1.8          {d2},     [r0,:64], r2
        vaddw.u8        q0,  q8,  d1
        vld1.8          {d3},     [r0,:64], r2
        vaddw.u8        q11, q8,  d2
        vld1.8          {d4},     [r0,:64], r2
        vaddw.u8        q1,  q8,  d3
        vld1.8          {d5},     [r0,:64], r2
        vaddw.u8        q12, q9,  d4
        vld1.8          {d6},     [r0,:64], r2
        vaddw.u8        q2,  q9,  d5
        vld1.8          {d7},     [r0,:64], r2
        vaddw.u8        q13, q9,  d6
        vqmovun.s16     d20, q10
        vaddw.u8        q3,  q9,  d7
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vst1.8          {d20},    [r3,:64], r2
        vqmovun.s16     d23, q1
        vst1.8          {d21},    [r3,:64], r2
        vqmovun.s16     d24, q12
        vst1.8          {d22},    [r3,:64], r2
        vqmovun.s16     d25, q2
        vst1.8          {d23},    [r3,:64], r2
        vqmovun.s16     d26, q13
        vst1.8          {d24},    [r3,:64], r2
        vqmovun.s16     d27, q3
        vst1.8          {d25},    [r3,:64], r2
        vst1.8          {d26},    [r3,:64], r2
        vst1.8          {d27},    [r3,:64], r2

        bx              lr
endfunc

function ff_vp8_idct_dc_add4y_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {q0},     [r0,:128], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {q1},     [r0,:128], r2
        vaddw.u8        q10, q8,  d0
        vld1.8          {q2},     [r0,:128], r2
        vaddw.u8        q0,  q9,  d1
        vld1.8          {q3},     [r0,:128], r2
        vaddw.u8        q11, q8,  d2
        vaddw.u8        q1,  q9,  d3
        vaddw.u8        q12, q8,  d4
        vaddw.u8        q2,  q9,  d5
        vaddw.u8        q13, q8,  d6
        vaddw.u8        q3,  q9,  d7
        sub             r0,  r0,  r2,  lsl #2
        vqmovun.s16     d20, q10
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vqmovun.s16     d23, q1
        vqmovun.s16     d24, q12
        vst1.8          {q10},    [r0,:128], r2
        vqmovun.s16     d25, q2
        vst1.8          {q11},    [r0,:128], r2
        vqmovun.s16     d26, q13
        vst1.8          {q12},    [r0,:128], r2
        vqmovun.s16     d27, q3
        vst1.8          {q13},    [r0,:128], r2

        bx              lr
endfunc

@ Register layout:
@   P3..Q3     -> q0..q7
@   flim_E     -> q14
@   flim_I     -> q15
@   hev_thresh -> r12
@
.macro  vp8_loop_filter, inner=0, simple=0
    .if \simple
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vmov.i8         q13, #0x80
        vcle.u8         q8,  q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    .else
        @ calculate hev and normal_limit:
        vabd.u8         q12, q2,  q3            @ abs(P1-P0)
        vabd.u8         q13, q5,  q4            @ abs(Q1-Q0)
        vabd.u8         q10, q0,  q1            @ abs(P3-P2)
        vabd.u8         q11, q1,  q2            @ abs(P2-P1)
        vcle.u8         q8,  q12, q15           @ abs(P1-P0) <= flim_I
        vcle.u8         q9,  q13, q15           @ abs(Q1-Q0) <= flim_I
        vcle.u8         q10, q10, q15           @ abs(P3-P2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(P2-P1) <= flim_I
        vand            q8,  q8,  q9
        vabd.u8         q9,  q7,  q6            @ abs(Q3-Q2)
        vand            q8,  q8,  q11
        vabd.u8         q11, q6,  q5            @ abs(Q2-Q1)
        vand            q8,  q8,  q10
        vcle.u8         q10, q9,  q15           @ abs(Q3-Q2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(Q2-Q1) <= flim_I
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vand            q8,  q8,  q10
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vand            q8,  q8,  q11
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vdup.8          q15, r12                @ hev_thresh
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcgt.u8         q12, q12, q15           @ abs(P1-P0) > hev_thresh
        vcle.u8         q11, q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        vcgt.u8         q14, q13, q15           @ abs(Q1-Q0) > hev_thresh
        vand            q8,  q8,  q11
        vmov.i8         q13, #0x80
        vorr            q9,  q12, q14
    .endif

        @ at this point:
        @   q8: normal_limit
        @   q9: hev

        @ convert to signed value:
        veor            q3,  q3,  q13           @ PS0 = P0 ^ 0x80
        veor            q4,  q4,  q13           @ QS0 = Q0 ^ 0x80

        vmov.i16        q12, #3
        vsubl.s8        q10, d8,  d6            @ QS0 - PS0
        vsubl.s8        q11, d9,  d7            @   (widened to 16bit)
        veor            q2,  q2,  q13           @ PS1 = P1 ^ 0x80
        veor            q5,  q5,  q13           @ QS1 = Q1 ^ 0x80
        vmul.i16        q10, q10, q12           @ w = 3 * (QS0 - PS0)
        vmul.i16        q11, q11, q12

        vqsub.s8        q12, q2,  q5            @ clamp(PS1-QS1)
        vmov.i8         q14, #4
        vmov.i8         q15, #3
    .if \inner
        vand            q12, q12, q9            @ if(hev) w += clamp(PS1-QS1)
    .endif
        vaddw.s8        q10, q10, d24           @ w += clamp(PS1-QS1)
        vaddw.s8        q11, q11, d25
        vqmovn.s16      d20, q10                @ narrow result back into q10
        vqmovn.s16      d21, q11
    .if !\inner && !\simple
        veor            q1,  q1,  q13           @ PS2 = P2 ^ 0x80
        veor            q6,  q6,  q13           @ QS2 = Q2 ^ 0x80
    .endif
        vand            q10, q10, q8            @ w &= normal_limit

        @ registers used at this point..
        @   q0 -> P3  (don't corrupt)
        @   q1-q6 -> PS2-QS2
        @   q7 -> Q3  (don't corrupt)
        @   q9 -> hev
        @   q10 -> w
        @   q13 -> #0x80
        @   q14 -> #4
        @   q15 -> #3
        @   q8, q11, q12 -> unused

        @ filter_common:   is4tap==1
        @   c1 = clamp(w + 4) >> 3;
        @   c2 = clamp(w + 3) >> 3;
        @   Q0 = s2u(QS0 - c1);
        @   P0 = s2u(PS0 + c2);

    .if \simple
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    .elseif \inner
        @ the !is4tap case of filter_common, only used for inner blocks
        @   c3 = ((c1&~hev) + 1) >> 1;
        @   Q1 = s2u(QS1 - c3);
        @   P1 = s2u(PS1 + c3);
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        vbic            q11, q11, q9            @ c1 & ~hev
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        vrshr.s8        q11, q11, #1            @ c3 >>= 1
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-c3)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+c3)
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    .else
        vand            q12, q10, q9            @ w & hev
        vqadd.s8        q11, q12, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q12, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vbic            q10, q10, q9            @ w &= ~hev
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)

        @ filter_mbedge:
        @   a = clamp((27*w + 63) >> 7);
        @   Q0 = s2u(QS0 - a);
        @   P0 = s2u(PS0 + a);
        @   a = clamp((18*w + 63) >> 7);
        @   Q1 = s2u(QS1 - a);
        @   P1 = s2u(PS1 + a);
        @   a = clamp((9*w + 63) >> 7);
        @   Q2 = s2u(QS2 - a);
        @   P2 = s2u(PS2 + a);
        vmov.i16        q9,  #63
        vshll.s8        q14, d20, #3
        vshll.s8        q15, d21, #3
        vaddw.s8        q14, q14, d20
        vaddw.s8        q15, q15, d21
        vadd.s16        q8,  q9,  q14
        vadd.s16        q9,  q9,  q15           @  9*w + 63
        vadd.s16        q11, q8,  q14
        vadd.s16        q12, q9,  q15           @ 18*w + 63
        vadd.s16        q14, q11, q14
        vadd.s16        q15, q12, q15           @ 27*w + 63
        vqshrn.s16      d16, q8,  #7
        vqshrn.s16      d17, q9,  #7            @ clamp(( 9*w + 63)>>7)
        vqshrn.s16      d22, q11, #7
        vqshrn.s16      d23, q12, #7            @ clamp((18*w + 63)>>7)
        vqshrn.s16      d28, q14, #7
        vqshrn.s16      d29, q15, #7            @ clamp((27*w + 63)>>7)
        vqadd.s8        q1,  q1,  q8            @ PS2 = clamp(PS2+a)
        vqsub.s8        q6,  q6,  q8            @ QS2 = clamp(QS2-a)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+a)
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-a)
        vqadd.s8        q3,  q3,  q14           @ PS0 = clamp(PS0+a)
        vqsub.s8        q4,  q4,  q14           @ QS0 = clamp(QS0-a)
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q1,  q1,  q13           @ P2 = PS2 ^ 0x80
        veor            q6,  q6,  q13           @ Q2 = QS2 ^ 0x80
    .endif
.endm
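
@ A plain-C sketch of the common filter step implemented by the macro
@ (hedged; clamp() saturates to int8 and s2u(x) = x ^ 0x80 maps back to
@ unsigned, as in the scalar VP8 filter):
@
@     w  = clamp(PS1 - QS1);              // inner: kept only where hev
@     w  = clamp(w + 3 * (QS0 - PS0));
@     w &= normal_limit;                  // zero where filtering is off
@     c1 = clamp(w + 4) >> 3;
@     c2 = clamp(w + 3) >> 3;
@     Q0 = s2u(QS0 - c1);
@     P0 = s2u(PS0 + c2);
@     // inner blocks additionally do:
@     c3 = ((c1 & ~hev) + 1) >> 1;
@     Q1 = s2u(QS1 - c3);
@     P1 = s2u(PS1 + c3);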

.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r1,  lsl #1+!\simple

        @ Load pixels:
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
        vld1.8          {q0},     [r0,:128], r1 @ P3
        vld1.8          {q1},     [r0,:128], r1 @ P2
    .endif
        vld1.8          {q2},     [r0,:128], r1 @ P1
        vld1.8          {q3},     [r0,:128], r1 @ P0
        vld1.8          {q4},     [r0,:128], r1 @ Q0
        vld1.8          {q5},     [r0,:128], r1 @ Q1
    .if !\simple
        vld1.8          {q6},     [r0,:128], r1 @ Q2
        vld1.8          {q7},     [r0,:128]     @ Q3
        vdup.8          q15, r3                 @ flim_I
    .endif
        vdup.8          q14, r2                 @ flim_E

        vp8_loop_filter inner=\inner, simple=\simple

        @ back up to P2:  dst -= stride * 6
        sub             r0,  r0,  r1,  lsl #2
    .if !\simple
        sub             r0,  r0,  r1,  lsl #1

        @ Store pixels:
        vst1.8          {q1},     [r0,:128], r1 @ P2
    .endif
        vst1.8          {q2},     [r0,:128], r1 @ P1
        vst1.8          {q3},     [r0,:128], r1 @ P0
        vst1.8          {q4},     [r0,:128], r1 @ Q0
        vst1.8          {q5},     [r0,:128], r1 @ Q1
    .if !\simple
        vst1.8          {q6},     [r0,:128]     @ Q2
    .endif

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1

.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0},     [r0,:64], r2  @ P3
        vld1.8          {d1},     [r1,:64], r2  @ P3
        vld1.8          {d2},     [r0,:64], r2  @ P2
        vld1.8          {d3},     [r1,:64], r2  @ P2
        vld1.8          {d4},     [r0,:64], r2  @ P1
        vld1.8          {d5},     [r1,:64], r2  @ P1
        vld1.8          {d6},     [r0,:64], r2  @ P0
        vld1.8          {d7},     [r1,:64], r2  @ P0
        vld1.8          {d8},     [r0,:64], r2  @ Q0
        vld1.8          {d9},     [r1,:64], r2  @ Q0
        vld1.8          {d10},    [r0,:64], r2  @ Q1
        vld1.8          {d11},    [r1,:64], r2  @ Q1
        vld1.8          {d12},    [r0,:64], r2  @ Q2
        vld1.8          {d13},    [r1,:64], r2  @ Q2
        vld1.8          {d14},    [r0,:64]      @ Q3
        vld1.8          {d15},    [r1,:64]      @ Q3

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        @ back up to P2:  u,v -= stride * 6
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        sub             r0,  r0,  r2,  lsl #1
        sub             r1,  r1,  r2,  lsl #1

        @ Store pixels:
        vst1.8          {d2},     [r0,:64], r2  @ P2
        vst1.8          {d3},     [r1,:64], r2  @ P2
        vst1.8          {d4},     [r0,:64], r2  @ P1
        vst1.8          {d5},     [r1,:64], r2  @ P1
        vst1.8          {d6},     [r0,:64], r2  @ P0
        vst1.8          {d7},     [r1,:64], r2  @ P0
        vst1.8          {d8},     [r0,:64], r2  @ Q0
        vst1.8          {d9},     [r1,:64], r2  @ Q0
        vst1.8          {d10},    [r0,:64], r2  @ Q1
        vst1.8          {d11},    [r1,:64], r2  @ Q1
        vst1.8          {d12},    [r0,:64]      @ Q2
        vst1.8          {d13},    [r1,:64]      @ Q2

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1

.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
    .endif

        @ Load pixels:
        vld1.8          {d0},     [r0], r1      @ load first 8-line src data
        vld1.8          {d2},     [r0], r1
        vld1.8          {d4},     [r0], r1
        vld1.8          {d6},     [r0], r1
        vld1.8          {d8},     [r0], r1
        vld1.8          {d10},    [r0], r1
        vld1.8          {d12},    [r0], r1
        vld1.8          {d14},    [r0], r1
        vld1.8          {d1},     [r0], r1      @ load second 8-line src data
        vld1.8          {d3},     [r0], r1
        vld1.8          {d5},     [r0], r1
        vld1.8          {d7},     [r0], r1
        vld1.8          {d9},     [r0], r1
        vld1.8          {d11},    [r0], r1
        vld1.8          {d13},    [r0], r1
        vld1.8          {d15},    [r0], r1

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        vdup.8          q14, r2                 @ flim_E
    .if !\simple
        vdup.8          q15, r3                 @ flim_I
    .endif

        vp8_loop_filter inner=\inner, simple=\simple

        sub             r0,  r0,  r1,  lsl #4   @ back up 16 rows

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        @ Store pixels:
        vst1.8          {d0},     [r0], r1
        vst1.8          {d2},     [r0], r1
        vst1.8          {d4},     [r0], r1
        vst1.8          {d6},     [r0], r1
        vst1.8          {d8},     [r0], r1
        vst1.8          {d10},    [r0], r1
        vst1.8          {d12},    [r0], r1
        vst1.8          {d14},    [r0], r1
        vst1.8          {d1},     [r0], r1
        vst1.8          {d3},     [r0], r1
        vst1.8          {d5},     [r0], r1
        vst1.8          {d7},     [r0], r1
        vst1.8          {d9},     [r0], r1
        vst1.8          {d11},    [r0], r1
        vst1.8          {d13},    [r0], r1
        vst1.8          {d15},    [r0]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1

.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4
        sub             r1,  r1,  #4
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0},     [r0], r2      @ load u
        vld1.8          {d1},     [r1], r2      @ load v
        vld1.8          {d2},     [r0], r2
        vld1.8          {d3},     [r1], r2
        vld1.8          {d4},     [r0], r2
        vld1.8          {d5},     [r1], r2
        vld1.8          {d6},     [r0], r2
        vld1.8          {d7},     [r1], r2
        vld1.8          {d8},     [r0], r2
        vld1.8          {d9},     [r1], r2
        vld1.8          {d10},    [r0], r2
        vld1.8          {d11},    [r1], r2
        vld1.8          {d12},    [r0], r2
        vld1.8          {d13},    [r1], r2
        vld1.8          {d14},    [r0], r2
        vld1.8          {d15},    [r1], r2

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        sub             r0,  r0,  r2,  lsl #3   @ back up u 8 rows
        sub             r1,  r1,  r2,  lsl #3   @ back up v 8 rows

        transpose_8x8   q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        @ Store pixels:
        vst1.8          {d0},     [r0], r2
        vst1.8          {d1},     [r1], r2
        vst1.8          {d2},     [r0], r2
        vst1.8          {d3},     [r1], r2
        vst1.8          {d4},     [r0], r2
        vst1.8          {d5},     [r1], r2
        vst1.8          {d6},     [r0], r2
        vst1.8          {d7},     [r1], r2
        vst1.8          {d8},     [r0], r2
        vst1.8          {d9},     [r1], r2
        vst1.8          {d10},    [r0], r2
        vst1.8          {d11},    [r1], r2
        vst1.8          {d12},    [r0], r2
        vst1.8          {d13},    [r1], r2
        vst1.8          {d14},    [r0]
        vst1.8          {d15},    [r1]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1

function ff_put_vp8_pixels16_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {q0},     [r2], r3
        vld1.8          {q1},     [r2], r3
        vld1.8          {q2},     [r2], r3
        vld1.8          {q3},     [r2], r3
        vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q1},     [r0,:128], r1
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b
        bx              lr
endfunc

function ff_put_vp8_pixels8_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {d0},     [r2], r3
        vld1.8          {d1},     [r2], r3
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d1},     [r0,:64], r1
        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        bgt             1b
        bx              lr
endfunc

/* 4/6-tap 8th-pel MC */

.macro  vp8_epel8_h6 d, a, b
        vext.8          d27, \a,  \b,  #1
        vmovl.u8        q8,  \a
        vext.8          d28, \a,  \b,  #2
        vmovl.u8        q9,  d27
        vext.8          d29, \a,  \b,  #3
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #4
        vmovl.u8        q11, d29
        vext.8          d31, \a,  \b,  #5
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmovl.u8        q13, d31
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7
.endm

.macro  vp8_epel16_h6 d0, d1, s0, s1, s2, q0, q1
        vext.8          q14, \q0, \q1, #3
        vext.8          q15, \q0, \q1, #4
        vmovl.u8        q11, d28
        vmovl.u8        q14, d29
        vext.8          q3,  \q0, \q1, #2
        vmovl.u8        q12, d30
        vmovl.u8        q15, d31
        vext.8          q8,  \q0, \q1, #1
        vmovl.u8        q10, d6
        vmovl.u8        q3,  d7
        vext.8          q2,  \q0, \q1, #5
        vmovl.u8        q13, d4
        vmovl.u8        q2,  d5
        vmovl.u8        q9,  d16
        vmovl.u8        q8,  d17
        vmul.u16        q11, q11, d0[3]
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q3,  q3,  d0[2]
        vmul.u16        q14, q14, d0[3]
        vmls.u16        q11, q12, d1[0]
        vmovl.u8        q12, \s0
        vmovl.u8        q1,  \s1
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q3,  q8,  d0[1]
        vmls.u16        q14, q15, d1[0]
        vmla.u16        q10, q12, d0[0]
        vmla.u16        q11, q13, d1[1]
        vmla.u16        q3,  q1,  d0[0]
        vmla.u16        q14, q2,  d1[1]
        vqadd.s16       q11, q10, q11
        vqadd.s16       q14, q3,  q14
        vqrshrun.s16    \d0, q11, #7
        vqrshrun.s16    \d1, q14, #7
.endm

.macro  vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5
        vmovl.u8        q10, \s2
        vmovl.u8        q11, \s3
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s0
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d0, q11, #7
.endm

.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        vmovl.u8        q10, \s0
        vmovl.u8        q11, \s3
        vmovl.u8        q14, \s6
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s2
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[0]
        vmul.u16        q15, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q14, q14, d1[1]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q15, q12, d1[0]
        vmls.u16        q11, q8,  d0[1]
        vmls.u16        q14, q13, d1[0]
        vmla.u16        q10, q8,  d0[2]
        vmla.u16        q15, q13, d1[1]
        vmla.u16        q11, q9,  d0[0]
        vmla.u16        q14, q12, d0[3]
        vqadd.s16       q15, q10, q15
        vqadd.s16       q14, q11, q14
        vqrshrun.s16    \d0, q15, #7
        vqrshrun.s16    \d1, q14, #7
.endm

.macro  vp8_epel8_h4 d, a, b
        vext.8          d28, \a,  \b,  #1
        vmovl.u8        q9,  \a
        vext.8          d29, \a,  \b,  #2
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #3
        vmovl.u8        q11, d29
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7
.endm

.macro  vp8_epel8_v4_y2 d0, d1, s0, s1, s2, s3, s4
        vmovl.u8        q9,  \s0
        vmovl.u8        q10, \s1
        vmovl.u8        q11, \s2
        vmovl.u8        q12, \s3
        vmovl.u8        q13, \s4
        vmul.u16        q8,  q10, d0[2]
        vmul.u16        q14, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q15, q12, d0[3]
        vmls.u16        q8,  q9,  d0[1]
        vmls.u16        q14, q12, d1[0]
        vmls.u16        q11, q10, d0[1]
        vmls.u16        q15, q13, d1[0]
        vqadd.s16       q8,  q8,  q14
        vqadd.s16       q11, q11, q15
        vqrshrun.s16    \d0, q8,  #7
        vqrshrun.s16    \d1, q11, #7
.endm
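
@ Note on the 4-tap variants above: taps 0 and 5 are zero in the rows of
@ subpel_filters used for those positions, so only the four middle
@ coefficients (d0[1]..d1[0], i.e. taps 1-4) need to be applied.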

function ff_put_vp8_epel16_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        push            {r4,lr}
        vpush           {d8-d15}

        ldr             r4,  [sp, #80]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #72]          @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2-d3},  [r2], r3
        vld1.8          {d4-d5},  [r2], r3
        vld1.8          {d6-d7},  [r2], r3
        vld1.8          {d8-d9},  [r2], r3
        vld1.8          {d10-d11},[r2], r3
        vld1.8          {d12-d13},[r2], r3
        vld1.8          {d14-d15},[r2]
        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2,  d4,  d2,  d4,  d6,  d8,  d10, d12, d14
        vp8_epel8_v6_y2 d3,  d5,  d3,  d5,  d7,  d9,  d11, d13, d15

        vst1.8          {d2-d3},  [r0,:128], r1
        vst1.8          {d4-d5},  [r0,:128], r1
        subs            r12, r12, #2
        bne             1b

        vpop            {d8-d15}
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel16_h6_neon, export=1
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2-d4},  [r2], r3

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3},  [r0,:128], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #2
        push            {r4,lr}
        vpush           {d8-d9}

        @ first pass (horizontal):
        ldr             r4,  [sp, #28]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #24]          @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #336+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3,d4}, [r2], r3

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3},  [lr,:128]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #336+16+32]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #336+16+24]   @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d9},  [lr,:128]!
        vld1.8          {d28-d31},[lr,:128]
        sub             lr,  lr,  #48

        vp8_epel8_v6    d2,  d2,  d4,  d6,  d8,  d28, d30
        vp8_epel8_v6    d3,  d3,  d5,  d7,  d9,  d29, d31

        vst1.8          {d2-d3},  [r0,:128], r1
        subs            r12, r12, #1
        bne             2b

        add             sp,  sp,  #336+16
        vpop            {d8-d9}
        pop             {r4,pc}
endfunc
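
@ The scratch buffer above is sized for the worst case of this path:
@ 16 output rows plus the 5 extra rows the 6-tap vertical filter needs,
@ i.e. 21 rows of 16 bytes = 336, plus 16 so the pointer can be aligned
@ down to a 16-byte boundary (the add #15 / bic #15 pair).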

function ff_put_vp8_epel8_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vld1.8          {d4},     [r2], r3
        vld1.8          {d5},     [r2], r3
        vld1.8          {d6},     [r2], r3
        vld1.8          {d7},     [r2], r3
        vld1.8          {d28},    [r2]

        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6_neon, export=1
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2},     [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #2
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2},     [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d7},  [lr,:128]!
        vld1.8          {d30},    [lr,:64]
        sub             lr,  lr,  #32

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_v4_neon, export=1
        sub             r2,  r2,  r3
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vld1.8          {d4},     [r2], r3
        vld1.8          {d5},     [r2], r3
        vld1.8          {d6},     [r2]
        sub             r2,  r2,  r3,  lsl #1

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4_neon, export=1
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2},     [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #1
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2},     [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]
        sub             lr,  lr,  #16

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #2
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2},     [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]
        sub             lr,  lr,  #16

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #1
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2},     [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d7},  [lr,:128]!
        vld1.8          {d30},    [lr,:64]
        sub             lr,  lr,  #32

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

        .ltorg

function ff_put_vp8_epel4_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2], r3
        vld1.32         {d7[]},   [r2], r3
        vld1.32         {d28[]},  [r2]
        sub             r2,  r2,  r3,  lsl #2
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2], r3
        vld1.32         {d7[1]},  [r2], r3
        vld1.32         {d28[1]}, [r2]
        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h6_neon, export=1
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #52+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]!
        vld1.32         {d28[]},  [lr,:32]
        sub             lr,  lr,  #16
        vld1.8          {d4-d5},  [lr]!
        vld1.8          {d7},     [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr,  lr,  #16
        vtrn.32         q1,  q2
        vtrn.32         d6,  d7
        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #52+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]!
        vld1.32         {d28[]},  [lr,:32]
        sub             lr,  lr,  #16
        vld1.8          {d4-d5},  [lr]!
        vld1.8          {d7},     [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr,  lr,  #16
        vtrn.32         q1,  q2
        vtrn.32         d6,  d7
        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h6v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #44+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.32         {d6[]},   [lr,:32]
        sub             lr,  lr,  #8
        vld1.8          {d4-d5},  [lr]!
        vld1.32         {d6[1]},  [lr,:32]
        sub             lr,  lr,  #8
        vtrn.32         q1,  q2
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4_neon, export=1
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_v4_neon, export=1
        sub             r2,  r2,  r3
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2]
        sub             r2,  r2,  r3,  lsl #1
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2]
        sub             r2,  r2,  r3,  lsl #1

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #44+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.32         {d6[]},   [lr,:32]
        sub             lr,  lr,  #8
        vld1.8          {d4-d5},  [lr]!
        vld1.32         {d6[1]},  [lr,:32]
        sub             lr,  lr,  #8
        vtrn.32         q1,  q2
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc

@ note: the worst-case sum of all 6-tap filter values * 255 is 0x7f80, so
@ 16-bit arithmetic can be used to apply the filters
const   subpel_filters, align=4
        .short      0,   6, 123,  12,   1,   0,   0,   0
        .short      2,  11, 108,  36,   8,   1,   0,   0
        .short      0,   9,  93,  50,   6,   0,   0,   0
        .short      3,  16,  77,  77,  16,   3,   0,   0
        .short      0,   6,  50,  93,   9,   0,   0,   0
        .short      1,   8,  36, 108,  11,   2,   0,   0
        .short      0,   1,  12, 123,   6,   0,   0,   0
endconst
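
@ The table stores tap magnitudes only; the signs are hard-coded in the
@ epel macros (vmla for taps 0 and 5, vmul for taps 2 and 3, vmls for
@ taps 1 and 4), so each 6-tap filtered pixel is in effect (hedged
@ sketch, with f[] one row of the table):
@
@     dst[x] = av_clip_uint8((  f[0] * src[x-2] - f[1] * src[x-1]
@                             + f[2] * src[x]   + f[3] * src[x+1]
@                             - f[4] * src[x+2] + f[5] * src[x+3]
@                             + 64) >> 7);      // vqrshrun.s16 #7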

/* Bilinear MC */

function ff_put_vp8_bilin16_h_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2-d4},  [r2], r3
        vext.8          q2,  q1,  q2,  #1
        vmull.u8        q8,  d2,  d1
        vmlal.u8        q8,  d4,  d0
        vld1.8          {d18-d20},[r2], r3
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d5,  d0
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q3,  #3
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b

        bx              lr
endfunc
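
@ All the bilin functions compute the same two-tap average; per output
@ pixel the h variant above is (hedged sketch, matching the
@ vmull/vmlal/vrshrn #3 sequence):
@
@     dst[x] = (src[x] * (8 - mx) + src[x + 1] * mx + 4) >> 3;
@
@ with src[x + stride] in place of src[x + 1] for the v variants, and the
@ hv variants chaining a horizontal pass into a vertical one.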

function ff_put_vp8_bilin16_v_neon, export=1
        ldr             r12, [sp, #8]           @ my
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.8          {q1},     [r2], r3
1:
        subs            r12, r12, #2
        vld1.8          {q2},     [r2], r3
        vmull.u8        q3,  d2,  d1
        vmlal.u8        q3,  d4,  d0
        vmull.u8        q8,  d3,  d1
        vmlal.u8        q8,  d5,  d0
        vld1.8          {q1},     [r2], r3
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d2,  d0
        vmull.u8        q10, d5,  d1
        vmlal.u8        q10, d3,  d0
        vrshrn.u16      d4,  q3,  #3
        vrshrn.u16      d5,  q8,  #3
        vrshrn.u16      d6,  q9,  #3
        vrshrn.u16      d7,  q10, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin16_hv_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp, #8]           @ my
        vdup.8          d2,  r12
        rsb             r12, r12, #8
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {d4-d6},  [r2], r3
        vext.8          q3,  q2,  q3,  #1
        vmull.u8        q8,  d4,  d1
        vmlal.u8        q8,  d6,  d0
        vmull.u8        q9,  d5,  d1
        vmlal.u8        q9,  d7,  d0
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {d18-d20},[r2], r3
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vld1.8          {d26-d28},[r2], r3
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vext.8          q14, q13, q14, #1
        vmull.u8        q8,  d26, d1
        vmlal.u8        q8,  d28, d0
        vmull.u8        q9,  d27, d1
        vmlal.u8        q9,  d29, d0
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vmull.u8        q12, d4,  d3
        vmlal.u8        q12, d6,  d2
        vmull.u8        q15, d5,  d3
        vmlal.u8        q15, d7,  d2
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
        vmull.u8        q10, d6,  d3
        vmlal.u8        q10, d4,  d2
        vmull.u8        q11, d7,  d3
        vmlal.u8        q11, d5,  d2
        vrshrn.u16      d24, q12, #3
        vrshrn.u16      d25, q15, #3
        vst1.8          {q12},    [r0,:128], r1
        vrshrn.u16      d20, q10, #3
        vrshrn.u16      d21, q11, #3
        vst1.8          {q10},    [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_h_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {q1},     [r2], r3
        vext.8          d3,  d2,  d3,  #1
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {q3},     [r2], r3
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d16, q8,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d16},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_v_neon, export=1
        ldr             r12, [sp, #8]           @ my
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.8          {d2},     [r2], r3
1:
        subs            r12, r12, #2
        vld1.8          {d3},     [r2], r3
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {d2},     [r2], r3
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d2,  d0
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d6,  q3,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d6},     [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_hv_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp, #8]           @ my
        vdup.8          d2,  r12
        rsb             r12, r12, #8
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {q2},     [r2], r3
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {q3},     [r2], r3
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vld1.8          {q2},     [r2], r3
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d16, q8,  #3
        vmull.u8        q10, d22, d3
        vmlal.u8        q10, d16, d2
        vrshrn.u16      d22, q9,  #3
        vmull.u8        q12, d16, d3
        vmlal.u8        q12, d22, d2
        vrshrn.u16      d20, q10, #3
        vst1.8          {d20},    [r0,:64], r1
        vrshrn.u16      d23, q12, #3
        vst1.8          {d23},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin4_h_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2},     [r2], r3
        vext.8          d3,  d2,  d3,  #1
        vld1.8          {d6},     [r2], r3
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         q1,  q3
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin4_v_neon, export=1
        ldr             r12, [sp, #8]           @ my
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.32         {d2[]},   [r2], r3
1:
        vld1.32         {d3[]},   [r2]
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vtrn.32         d3,  d2
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        subs            r12, r12, #2
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin4_hv_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12
        rsb             r12, r12, #8
        vdup.8          d1,  r12
        ldr             r12, [sp, #8]           @ my
        vdup.8          d2,  r12
        rsb             r12, r12, #8
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {d4},     [r2], r3
        vext.8          d5,  d4,  d4,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {d6},     [r2], r3
        vext.8          d7,  d6,  d6,  #1
        vld1.8          {d4},     [r2], r3
        vext.8          d5,  d4,  d4,  #1
        vtrn.32         q3,  q2
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d16, q8,  #3
        vmull.u8        q10, d16, d2
        vtrn.32         d22, d16
        vmlal.u8        q10, d22, d3
        vrev64.32       d22, d16
        vrshrn.u16      d20, q10, #3
        vst1.32         {d20[0]}, [r0,:32], r1
        vst1.32         {d20[1]}, [r0,:32], r1
        bgt             1b

        bx              lr
endfunc