/*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
@ ldcol.8: load \n bytes of a vertical pixel column into lanes of \rd.
@ \rs = address of the first byte, post-incremented by \rt (the row
@ stride) after every load.  With \n == 8 all eight lanes are filled;
@ with \n == 4, \hi selects the destination lanes: 0-3 when \hi == 0,
@ 4-7 when \hi == 1.  On exit \rs has advanced by \n rows.
.macro  ldcol.8         rd,  rs,  rt,  n=8,  hi=0
.if \n == 8 || \hi == 0
        vld1.8          {\rd[0]}, [\rs], \rt
        vld1.8          {\rd[1]}, [\rs], \rt
        vld1.8          {\rd[2]}, [\rs], \rt
        vld1.8          {\rd[3]}, [\rs], \rt
.endif
.if \n == 8 || \hi == 1
        vld1.8          {\rd[4]}, [\rs], \rt
        vld1.8          {\rd[5]}, [\rs], \rt
        vld1.8          {\rd[6]}, [\rs], \rt
        vld1.8          {\rd[7]}, [\rs], \rt
.endif
.endm
37 | ||
@ add16x8: horizontal sum of the 16 bytes held in \rl and \rh.
@ \dq must alias the register pair \dl:\dh (e.g. q0 = d0:d1), which is
@ used as scratch.  On exit every 16-bit lane of \dl holds the total
@ sum of the 16 input bytes.
.macro  add16x8         dq,  dl,  dh,  rl,  rh
        vaddl.u8        \dq, \rl, \rh       @ widening add: 8 x u16 partial sums
        vadd.u16        \dl, \dl, \dh       @ fold high half onto low: 4 sums
        vpadd.u16       \dl, \dl, \dl       @ pairwise: 2 sums (duplicated)
        vpadd.u16       \dl, \dl, \dl       @ pairwise: total in every lane
.endm
44 | ||
@ 16x16 DC prediction, no neighbours available: fill the block with 128.
@ In: r0 = block pointer, r1 = row stride in bytes
@     (presumed C signature (uint8_t *src, ptrdiff_t stride) — confirm
@     against the caller).
function ff_pred16x16_128_dc_neon, export=1
        vmov.i8         q0,  #128            @ one full 16-pixel row of 128s
        b               .L_pred16x16_dc_end  @ shared 16-row store loop
endfunc
49 | ||
@ 16x16 DC prediction from the top neighbours only.
@ In: r0 = block pointer, r1 = row stride in bytes.
function ff_pred16x16_top_dc_neon, export=1
        sub             r2,  r0,  r1          @ r2 -> row above the block
        vld1.8          {q0},     [r2,:128]   @ 16 top neighbour pixels
        add16x8         q0,  d0,  d1,  d0,  d1 @ sum of all 16 in each lane of d0
        vrshrn.u16      d0,  q0,  #4          @ dc = (sum + 8) >> 4
        vdup.8          q0,  d0[0]            @ broadcast dc across the row
        b               .L_pred16x16_dc_end   @ shared 16-row store loop
endfunc
58 | ||
@ 16x16 DC prediction from the left neighbours only.
@ In: r0 = block pointer, r1 = row stride in bytes.
function ff_pred16x16_left_dc_neon, export=1
        sub             r2,  r0,  #1          @ r2 -> column left of the block
        ldcol.8         d0,  r2,  r1          @ left neighbours, rows 0-7
        ldcol.8         d1,  r2,  r1          @ left neighbours, rows 8-15
        add16x8         q0,  d0,  d1,  d0,  d1 @ sum of all 16 in each lane of d0
        vrshrn.u16      d0,  q0,  #4          @ dc = (sum + 8) >> 4
        vdup.8          q0,  d0[0]            @ broadcast dc across the row
        b               .L_pred16x16_dc_end   @ shared 16-row store loop
endfunc
68 | ||
@ 16x16 DC prediction from both top and left neighbours.
@ In: r0 = block pointer, r1 = row stride in bytes.
@ Also owns .L_pred16x16_dc_end, the store loop shared by the other
@ 16x16 DC entry points (they branch here with the row value in q0).
function ff_pred16x16_dc_neon, export=1
        sub             r2,  r0,  r1          @ r2 -> row above the block
        vld1.8          {q0},     [r2,:128]   @ 16 top neighbours
        sub             r2,  r0,  #1          @ r2 -> column left of the block
        ldcol.8         d2,  r2,  r1          @ left neighbours, rows 0-7
        ldcol.8         d3,  r2,  r1          @ left neighbours, rows 8-15
        vaddl.u8        q0,  d0,  d1          @ widen+sum top pairs
        vaddl.u8        q1,  d2,  d3          @ widen+sum left pairs
        vadd.u16        q0,  q0,  q1          @ combine top and left
        vadd.u16        d0,  d0,  d1          @ reduce 8 -> 4 partial sums
        vpadd.u16       d0,  d0,  d0          @ 4 -> 2
        vpadd.u16       d0,  d0,  d0          @ 2 -> total of all 32 neighbours
        vrshrn.u16      d0,  q0,  #5          @ dc = (sum + 16) >> 5
        vdup.8          q0,  d0[0]            @ broadcast dc across the row
.L_pred16x16_dc_end:
        mov             r3,  #8               @ 8 iterations x 2 rows = 16 rows
6:      vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             6b
        bx              lr
endfunc
91 | ||
@ 16x16 horizontal prediction: each row is filled with its left neighbour.
@ In: r0 = block pointer, r1 = row stride in bytes.
function ff_pred16x16_hor_neon, export=1
        sub             r2,  r0,  #1          @ r2 -> column left of the block
        mov             r3,  #16              @ 16 rows
1:      vld1.8          {d0[],d1[]},[r2], r1  @ load left pixel, replicate to q0
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
101 | ||
@ 16x16 vertical prediction: copy the row above into all 16 rows.
@ In: r0 = block pointer, r1 = row stride in bytes.
function ff_pred16x16_vert_neon, export=1
        sub             r0,  r0,  r1          @ step back to the row above
        vld1.8          {q0},     [r0,:128], r1 @ load top row; r0 -> block again
        mov             r3,  #8               @ 8 iterations x 2 rows = 16 rows
1:      vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
112 | ||
@ 16x16 plane prediction (H.264 spec 8.3.3.4): pixels follow a linear
@ ramp a + b*(x-7) + c*(y-7), clipped to [0,255], where b and c are
@ derived from weighted differences of the top and left border samples.
@ In: r0 = block pointer, r1 = row stride in bytes.
@ NOTE(review): the exact lane bookkeeping below mirrors the reference C
@ implementation; comments describe intent, verify lanes against it.
function ff_pred16x16_plane_neon, export=1
        sub             r3,  r0,  r1          @ r3 -> row above
        add             r2,  r3,  #8          @ r2 -> right half of top row
        sub             r3,  r3,  #1          @ r3 -> top-left corner sample
        vld1.8          {d0},     [r3]        @ top[-1..6]
        vld1.8          {d2},     [r2,:64], r1 @ top[8..15]
        ldcol.8         d1,  r3,  r1          @ left column, upper 8 samples
        add             r3,  r3,  r1
        ldcol.8         d3,  r3,  r1          @ left column, lower 8 samples
        vrev64.8        q0,  q0               @ reverse so diffs pair far-near
        vaddl.u8        q8,  d2,  d3          @ keeps top[15]/left[15] terms for 'a'
        vsubl.u8        q2,  d2,  d0          @ horizontal sample differences
        vsubl.u8        q3,  d3,  d1          @ vertical sample differences
        movrel          r3,  p16weight
        vld1.8          {q0},     [r3,:128]   @ weights {1..8}
        vmul.s16        q2,  q2,  q0          @ weight the H differences
        vmul.s16        q3,  q3,  q0          @ weight the V differences
        vadd.i16        d4,  d4,  d5          @ reduce H sum ...
        vadd.i16        d5,  d6,  d7          @ ... and V sum side by side
        vpadd.i16       d4,  d4,  d5
        vpadd.i16       d4,  d4,  d4          @ d4 = {H, V, H, V}
        vshll.s16       q3,  d4,  #2          @ 4*H, 4*V
        vaddw.s16       q2,  q3,  d4          @ 5*H, 5*V
        vrshrn.s32      d4,  q2,  #6          @ b = (5H+32)>>6, c = (5V+32)>>6
        mov             r3,  #0
        vtrn.16         d4,  d5               @ separate b (d4) and c (d5)
        vadd.i16        d2,  d4,  d5          @ b + c
        vshl.i16        d3,  d2,  #3          @ 8*(b+c)
        vrev64.16       d16, d17              @ bring corner sums into place
        vsub.i16        d3,  d3,  d2          @ 7*(b+c)
        vadd.i16        d16, d16, d0
        vshl.i16        d2,  d16, #4          @ 'a' term: 16*(src[-1,15]+src[15,-1]+1)
        vsub.i16        d2,  d2,  d3          @ value at pixel (0,0): a - 7*(b+c)
        vshl.i16        d3,  d4,  #4          @ 16*b
        vext.16         q0,  q0,  q0,  #7     @ rotate weights to {8,1..7}
        vsub.i16        d6,  d5,  d3          @ row step: c - 16*b
        vmov.16         d0[0], r3             @ weight lane 0 := 0 -> ramp {0,1..7}
        vmul.i16        q0,  q0,  d4[0]       @ b * {0..7}: horizontal ramp (2 halves)
        vdup.16         q1,  d2[0]            @ broadcast row base value
        vdup.16         q2,  d4[0]            @ broadcast b
        vdup.16         q3,  d6[0]            @ broadcast row step
        vshl.i16        q2,  q2,  #3          @ 8*b: step between 8-pixel halves
        vadd.i16        q1,  q1,  q0          @ first row, pre-clip 16-bit values
        vadd.i16        q3,  q3,  q2          @ adjust row step for half-stepping
        mov             r3,  #16              @ 16 rows
1:
        vqshrun.s16     d0,  q1,  #5          @ left half: >>5 with u8 saturation
        vadd.i16        q1,  q1,  q2          @ advance 8 pixels right
        vqshrun.s16     d1,  q1,  #5          @ right half
        vadd.i16        q1,  q1,  q3          @ advance to next row
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
168 | ||
@ 16-bit weight table {1..8} used by the plane-prediction multiplies.
const   p16weight, align=4
        .short          1, 2, 3, 4, 5, 6, 7, 8
endconst
172 | ||
@ 8x8 horizontal prediction: each row is filled with its left neighbour.
@ In: r0 = block pointer, r1 = row stride in bytes.
function ff_pred8x8_hor_neon, export=1
        sub             r2,  r0,  #1          @ r2 -> column left of the block
        mov             r3,  #8               @ 8 rows
1:      vld1.8          {d0[]},   [r2], r1    @ load left pixel, replicate to d0
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
182 | ||
@ 8x8 vertical prediction: copy the row above into all 8 rows.
@ In: r0 = block pointer, r1 = row stride in bytes.
function ff_pred8x8_vert_neon, export=1
        sub             r0,  r0,  r1          @ step back to the row above
        vld1.8          {d0},     [r0,:64], r1 @ load top row; r0 -> block again
        mov             r3,  #4               @ 4 iterations x 2 rows = 8 rows
1:      vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
193 | ||
@ 8x8 plane prediction (H.264 chroma plane mode): pixels follow a linear
@ ramp a + b*(x-3) + c*(y-3), clipped to [0,255], with b and c built
@ from weighted border-sample differences.  Same structure as the 16x16
@ version but with 4-sample half-borders and 8-wide rows.
@ In: r0 = block pointer, r1 = row stride in bytes.
@ NOTE(review): lane bookkeeping mirrors the reference C code; comments
@ describe intent, verify lane indices against it.
function ff_pred8x8_plane_neon, export=1
        sub             r3,  r0,  r1          @ r3 -> row above
        add             r2,  r3,  #4          @ r2 -> right half of top row
        sub             r3,  r3,  #1          @ r3 -> top-left corner sample
        vld1.32         {d0[0]},  [r3]        @ top[-1..2]
        vld1.32         {d2[0]},  [r2,:32], r1 @ top[4..7]
        ldcol.8         d0,  r3,  r1,  4,  hi=1 @ left column upper 4 -> d0[4..7]
        add             r3,  r3,  r1
        ldcol.8         d3,  r3,  r1,  4      @ left column lower 4 -> d3[0..3]
        vaddl.u8        q8,  d2,  d3          @ keeps corner terms for 'a'
        vrev32.8        d0,  d0               @ reverse so diffs pair far-near
        vtrn.32         d2,  d3               @ pack top-right/left-low together
        vsubl.u8        q2,  d2,  d0          @ H and V sample differences
        movrel          r3,  p16weight
        vld1.16         {q0},     [r3,:128]   @ weights {1..8} (only {1..4} used)
        vmul.s16        d4,  d4,  d0          @ weight the H differences
        vmul.s16        d5,  d5,  d0          @ weight the V differences
        vpadd.i16       d4,  d4,  d5          @ partial H/V reductions
        vpaddl.s16      d4,  d4               @ d4 = {H, V} as 32-bit
        vshl.i32        d5,  d4,  #4          @ 16*H, 16*V
        vadd.s32        d4,  d4,  d5          @ 17*H, 17*V
        vrshrn.s32      d4,  q2,  #5          @ b = (17H+16)>>5, c = (17V+16)>>5
        mov             r3,  #0
        vtrn.16         d4,  d5               @ separate b (d4) and c (d5)
        vadd.i16        d2,  d4,  d5          @ b + c
        vshl.i16        d3,  d2,  #2          @ 4*(b+c)
        vrev64.16       d16, d16              @ bring corner sums into place
        vsub.i16        d3,  d3,  d2          @ 3*(b+c)
        vadd.i16        d16, d16, d0
        vshl.i16        d2,  d16, #4          @ 'a' term: 16*(src[-1,7]+src[7,-1]+1)
        vsub.i16        d2,  d2,  d3          @ value at pixel (0,0): a - 3*(b+c)
        vshl.i16        d3,  d4,  #3          @ 8*b
        vext.16         q0,  q0,  q0,  #7     @ rotate weights to {8,1..7}
        vsub.i16        d6,  d5,  d3          @ row step: c - 8*b
        vmov.16         d0[0], r3             @ weight lane 0 := 0 -> ramp {0,1..7}
        vmul.i16        q0,  q0,  d4[0]       @ b * {0..7}: horizontal ramp
        vdup.16         q1,  d2[0]            @ broadcast row base value
        vdup.16         q2,  d4[0]            @ broadcast b (unused ramp source)
        vdup.16         q3,  d6[0]            @ broadcast row step
        vshl.i16        q2,  q2,  #3          @ 8*b (kept consistent with 16x16 path)
        vadd.i16        q1,  q1,  q0          @ first row, pre-clip 16-bit values
        vadd.i16        q3,  q3,  q2          @ full per-row increment
        mov             r3,  #8               @ 8 rows
1:
        vqshrun.s16     d0,  q1,  #5          @ >>5 with unsigned-8 saturation
        vadd.i16        q1,  q1,  q3          @ advance to next row
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
245 | ||
@ 8x8 DC prediction, no neighbours available: fill the block with 128.
@ In: r0 = block pointer, r1 = row stride in bytes.
function ff_pred8x8_128_dc_neon, export=1
        vmov.i8         q0,  #128             @ d0 = top-half rows, d1 = bottom-half
        b               .L_pred8x8_dc_end     @ shared 8-row store loop
endfunc
250 | ||
@ 8x8 DC prediction from the top neighbours only.  Left and right 4x4
@ halves get separate DCs from top[0..3] and top[4..7].
@ In: r0 = block pointer, r1 = row stride in bytes.
function ff_pred8x8_top_dc_neon, export=1
        sub             r2,  r0,  r1          @ r2 -> row above the block
        vld1.8          {d0},     [r2,:64]    @ 8 top neighbours
        vpaddl.u8       d0,  d0               @ pairwise sums: 4 x u16
        vpadd.u16       d0,  d0,  d0          @ lane0 = sum t[0..3], lane1 = sum t[4..7]
        vrshrn.u16      d0,  q0,  #2          @ per-half dc = (sum + 2) >> 2
        vdup.8          d1,  d0[1]            @ right-half dc
        vdup.8          d0,  d0[0]            @ left-half dc
        vtrn.32         d0,  d1               @ d0 = d1 = {left dc x4, right dc x4}
        b               .L_pred8x8_dc_end     @ shared 8-row store loop
endfunc
262 | ||
@ 8x8 DC prediction from the left neighbours only.  Upper and lower 4x4
@ halves get separate DCs from left[0..3] and left[4..7].
@ In: r0 = block pointer, r1 = row stride in bytes.
function ff_pred8x8_left_dc_neon, export=1
        sub             r2,  r0,  #1          @ r2 -> column left of the block
        ldcol.8         d0,  r2,  r1          @ 8 left neighbours
        vpaddl.u8       d0,  d0               @ pairwise sums: 4 x u16
        vpadd.u16       d0,  d0,  d0          @ lane0 = sum l[0..3], lane1 = sum l[4..7]
        vrshrn.u16      d0,  q0,  #2          @ per-half dc = (sum + 2) >> 2
        vdup.8          d1,  d0[1]            @ lower-half dc (rows 4-7)
        vdup.8          d0,  d0[0]            @ upper-half dc (rows 0-3)
        b               .L_pred8x8_dc_end     @ shared 8-row store loop
endfunc
273 | ||
@ 8x8 DC prediction from both top and left neighbours.  Per the H.264
@ chroma DC rule the block splits into four 4x4 quadrants: top-left
@ averages top[0..3]+left[0..3] (>>3); top-right averages top[4..7]
@ and bottom-left averages left[4..7] (>>2); bottom-right averages
@ top[4..7]+left[4..7] (>>3).
@ In: r0 = block pointer, r1 = row stride in bytes.
@ Also owns .L_pred8x8_dc_end, the store loop shared by all 8x8 DC
@ entry points: d0 = pattern for rows 0-3, d1 = pattern for rows 4-7.
function ff_pred8x8_dc_neon, export=1
        sub             r2,  r0,  r1          @ r2 -> row above the block
        vld1.8          {d0},     [r2,:64]    @ 8 top neighbours
        sub             r2,  r0,  #1          @ r2 -> column left of the block
        ldcol.8         d1,  r2,  r1          @ 8 left neighbours
        vtrn.32         d0,  d1               @ d0 = {t0-3,l0-3}, d1 = {t4-7,l4-7}
        vpaddl.u8       q0,  q0               @ pairwise sums, 8 x u16
        vpadd.u16       d0,  d0,  d1          @ {sum t0-3, sum l0-3, sum t4-7, sum l4-7}
        vpadd.u16       d1,  d0,  d0          @ {t0-3+l0-3, t4-7+l4-7, ...}
        vrshrn.u16      d2,  q0,  #3          @ (x+4)>>3: combined-quadrant DCs
        vrshrn.u16      d3,  q0,  #2          @ (x+2)>>2: single-border DCs
        vdup.8          d0,  d2[4]            @ top-left quadrant dc
        vdup.8          d1,  d3[3]            @ bottom-left quadrant dc
        vdup.8          d4,  d3[2]            @ top-right quadrant dc
        vdup.8          d5,  d2[5]            @ bottom-right quadrant dc
        vtrn.32         q0,  q2               @ d0 = top row pair, d1 = bottom pair
.L_pred8x8_dc_end:
        mov             r3,  #4               @ 4 rows per half
        add             r2,  r0,  r1,  lsl #2 @ r2 -> row 4 (bottom half)
6:      vst1.8          {d0},     [r0,:64], r1 @ rows 0-3 get d0
        vst1.8          {d1},     [r2,:64], r1 @ rows 4-7 get d1
        subs            r3,  r3,  #1
        bne             6b
        bx              lr
endfunc
299 | ||
@ 8x8 DC prediction variant: top row fully available, left column only
@ in its upper half (rows 0-3).  Quadrant DCs are built from top[0..7]
@ and left[0..3]; the unavailable lower-left falls back to a
@ top-derived DC (see lane selection below).
@ In: r0 = block pointer, r1 = row stride in bytes.
function ff_pred8x8_l0t_dc_neon, export=1
        sub             r2,  r0,  r1          @ r2 -> row above the block
        vld1.8          {d0},     [r2,:64]    @ 8 top neighbours
        sub             r2,  r0,  #1          @ r2 -> column left of the block
        ldcol.8         d1,  r2,  r1,  4      @ left neighbours rows 0-3 only
        vtrn.32         d0,  d1               @ d0 = {t0-3,l0-3}, d1 = {t4-7, -}
        vpaddl.u8       q0,  q0               @ pairwise sums
        vpadd.u16       d0,  d0,  d1          @ group sums per 4-sample border run
        vpadd.u16       d1,  d0,  d0          @ combined t+l sums
        vrshrn.u16      d2,  q0,  #3          @ (x+4)>>3 DCs
        vrshrn.u16      d3,  q0,  #2          @ (x+2)>>2 DCs
        vdup.8          d0,  d2[4]            @ top-left: (t0-3 + l0-3 + 4) >> 3
        vdup.8          d1,  d3[0]            @ bottom-left fallback: (t0-3 + 2) >> 2
        vdup.8          q2,  d3[2]            @ right half: (t4-7 + 2) >> 2
        vtrn.32         q0,  q2               @ d0 = top row pair, d1 = bottom pair
        b               .L_pred8x8_dc_end     @ shared 8-row store loop
endfunc
317 | ||
@ 8x8 DC prediction variant: only the upper-left border (left rows 0-3)
@ is available.  Rows 0-3 get its DC; rows 4-7 are filled with 128.
@ In: r0 = block pointer, r1 = row stride in bytes.
function ff_pred8x8_l00_dc_neon, export=1
        sub             r2,  r0,  #1          @ r2 -> column left of the block
        ldcol.8         d0,  r2,  r1,  4      @ left neighbours rows 0-3
        vpaddl.u8       d0,  d0               @ pairwise sums
        vpadd.u16       d0,  d0,  d0          @ total of the 4 samples
        vrshrn.u16      d0,  q0,  #2          @ dc = (sum + 2) >> 2
        vmov.i8         d1,  #128             @ bottom half: no neighbours -> 128
        vdup.8          d0,  d0[0]            @ top half: broadcast dc
        b               .L_pred8x8_dc_end     @ shared 8-row store loop
endfunc
328 | ||
@ 8x8 DC prediction variant: top row fully available, left column only
@ in its lower half (rows 4-7).  Quadrant DCs are built from top[0..7]
@ and left[4..7]; only the bottom-right quadrant combines both borders.
@ In: r0 = block pointer, r1 = row stride in bytes.
function ff_pred8x8_0lt_dc_neon, export=1
        sub             r2,  r0,  r1          @ r2 -> row above the block
        vld1.8          {d0},     [r2,:64]    @ 8 top neighbours
        add             r2,  r0,  r1,  lsl #2 @ r2 -> row 4 ...
        sub             r2,  r2,  #1          @ ... left column, lower half
        ldcol.8         d1,  r2,  r1,  4,  hi=1 @ left rows 4-7 -> d1[4..7]
        vtrn.32         d0,  d1               @ d0 = {t0-3, -}, d1 = {t4-7, l4-7}
        vpaddl.u8       q0,  q0               @ pairwise sums
        vpadd.u16       d0,  d0,  d1          @ group sums per 4-sample border run
        vpadd.u16       d1,  d0,  d0          @ combined sums
        vrshrn.u16      d3,  q0,  #2          @ (x+2)>>2 DCs
        vrshrn.u16      d2,  q0,  #3          @ (x+4)>>3 DCs
        vdup.8          d0,  d3[0]            @ top-left: (t0-3 + 2) >> 2
        vdup.8          d1,  d3[3]            @ bottom-left: (l4-7 + 2) >> 2
        vdup.8          d4,  d3[2]            @ top-right: (t4-7 + 2) >> 2
        vdup.8          d5,  d2[5]            @ bottom-right: (t4-7 + l4-7 + 4) >> 3
        vtrn.32         q0,  q2               @ d0 = top row pair, d1 = bottom pair
        b               .L_pred8x8_dc_end     @ shared 8-row store loop
endfunc
348 | ||
@ 8x8 DC prediction variant: only the lower-left border (left rows 4-7)
@ is available.  Rows 4-7 get its DC; rows 0-3 are filled with 128.
@ In: r0 = block pointer, r1 = row stride in bytes.
function ff_pred8x8_0l0_dc_neon, export=1
        add             r2,  r0,  r1,  lsl #2 @ r2 -> row 4 ...
        sub             r2,  r2,  #1          @ ... left column, lower half
        ldcol.8         d1,  r2,  r1,  4      @ left neighbours rows 4-7
        vpaddl.u8       d2,  d1               @ pairwise sums
        vpadd.u16       d2,  d2,  d2          @ total of the 4 samples
        vrshrn.u16      d1,  q1,  #2          @ dc = (sum + 2) >> 2
        vmov.i8         d0,  #128             @ top half: no neighbours -> 128
        vdup.8          d1,  d1[0]            @ bottom half: broadcast dc
        b               .L_pred8x8_dc_end     @ shared 8-row store loop
endfunc