Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libavcodec / arm / h264pred_neon.S
CommitLineData
2ba45a60
DM
1/*
2 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavutil/arm/asm.S"
22
@ ldcol.8: load a vertical column of \n bytes (default 8) into the lanes of
@ NEON d-register \rd, one byte at a time.  \rs holds the address of the
@ first byte and is post-incremented by the stride \rt after every load.
@ With n=4, \hi selects which half of \rd is filled (0: lanes 0-3,
@ 1: lanes 4-7); with n=8 all eight lanes are filled regardless of \hi.
23 .macro ldcol.8 rd, rs, rt, n=8, hi=0
24.if \n == 8 || \hi == 0
25 vld1.8 {\rd[0]}, [\rs], \rt
26 vld1.8 {\rd[1]}, [\rs], \rt
27 vld1.8 {\rd[2]}, [\rs], \rt
28 vld1.8 {\rd[3]}, [\rs], \rt
29.endif
30.if \n == 8 || \hi == 1
31 vld1.8 {\rd[4]}, [\rs], \rt
32 vld1.8 {\rd[5]}, [\rs], \rt
33 vld1.8 {\rd[6]}, [\rs], \rt
34 vld1.8 {\rd[7]}, [\rs], \rt
35.endif
36 .endm
37
@ add16x8: sum the 16 bytes held in the pair \rl,\rh.  The bytes are
@ widened to u16 (\dq = \dl:\dh), the two halves are added, then two
@ pairwise adds fold the four partial sums so every lane of \dl holds
@ the total.  Clobbers \dq (i.e. \dl and \dh).
38 .macro add16x8 dq, dl, dh, rl, rh
39 vaddl.u8 \dq, \rl, \rh          @ widen bytes to 8x u16 in \dl + 8x in \dh
40 vadd.u16 \dl, \dl, \dh          @ fold high half onto low half
41 vpadd.u16 \dl, \dl, \dl         @ 4 partial sums -> 2
42 vpadd.u16 \dl, \dl, \dl         @ 2 partial sums -> total in every lane
43 .endm
44
@ 16x16 DC prediction when no neighbours are available:
@ fill the whole block with the mid value 128.
@ In: r0 = dst, r1 = line stride (bytes).
45function ff_pred16x16_128_dc_neon, export=1
46 vmov.i8 q0, #128                @ q0 = 16 copies of 128
47 b .L_pred16x16_dc_end           @ shared 16-row store loop below
48endfunc
49
@ 16x16 top-DC prediction: DC = (sum of the 16 pixels above + 8) >> 4,
@ broadcast over the whole block.
@ In: r0 = dst, r1 = line stride (bytes).
50function ff_pred16x16_top_dc_neon, export=1
51 sub r2, r0, r1                  @ r2 -> row above the block
52 vld1.8 {q0}, [r2,:128]         @ load the 16 top neighbours
53 add16x8 q0, d0, d1, d0, d1     @ every u16 lane of d0 = their sum
54 vrshrn.u16 d0, q0, #4          @ (sum + 8) >> 4
55 vdup.8 q0, d0[0]               @ broadcast the DC byte to all 16 lanes
56 b .L_pred16x16_dc_end          @ shared 16-row store loop below
57endfunc
58
@ 16x16 left-DC prediction: DC = (sum of the 16 pixels to the left + 8) >> 4,
@ broadcast over the whole block.
@ In: r0 = dst, r1 = line stride (bytes).
59function ff_pred16x16_left_dc_neon, export=1
60 sub r2, r0, #1                  @ r2 -> left neighbour of the first row
61 ldcol.8 d0, r2, r1              @ left column rows 0-7 (r2 advances)
62 ldcol.8 d1, r2, r1              @ left column rows 8-15
63 add16x8 q0, d0, d1, d0, d1     @ every u16 lane of d0 = their sum
64 vrshrn.u16 d0, q0, #4          @ (sum + 8) >> 4
65 vdup.8 q0, d0[0]               @ broadcast the DC byte
66 b .L_pred16x16_dc_end          @ shared 16-row store loop below
67endfunc
68
@ 16x16 DC prediction from both neighbours:
@ DC = (sum of 16 top + 16 left pixels + 16) >> 5, broadcast over the block.
@ Also provides .L_pred16x16_dc_end, the 16-row store loop shared by the
@ other 16x16 DC variants above (they arrive with q0 = predicted row).
@ In: r0 = dst, r1 = line stride (bytes).
69function ff_pred16x16_dc_neon, export=1
70 sub r2, r0, r1                  @ r2 -> row above the block
71 vld1.8 {q0}, [r2,:128]         @ d0:d1 = 16 top neighbours
72 sub r2, r0, #1                  @ r2 -> left neighbour of first row
73 ldcol.8 d2, r2, r1              @ d2 = left column rows 0-7
74 ldcol.8 d3, r2, r1              @ d3 = left column rows 8-15
75 vaddl.u8 q0, d0, d1             @ widen+sum top halves
76 vaddl.u8 q1, d2, d3             @ widen+sum left halves
77 vadd.u16 q0, q0, q1             @ combine top and left partials
78 vadd.u16 d0, d0, d1             @ fold 8 lanes -> 4
79 vpadd.u16 d0, d0, d0            @ 4 -> 2
80 vpadd.u16 d0, d0, d0            @ 2 -> total in every lane
81 vrshrn.u16 d0, q0, #5          @ (sum + 16) >> 5
82 vdup.8 q0, d0[0]               @ broadcast the DC byte
83.L_pred16x16_dc_end:             @ shared: store q0 to all 16 rows
84 mov r3, #8                      @ 8 iterations x 2 rows each
856: vst1.8 {q0}, [r0,:128], r1
86 vst1.8 {q0}, [r0,:128], r1
87 subs r3, r3, #1
88 bne 6b
89 bx lr
90endfunc
91
@ 16x16 horizontal prediction: each row is filled with the pixel
@ immediately to its left.
@ In: r0 = dst, r1 = line stride (bytes).
92function ff_pred16x16_hor_neon, export=1
93 sub r2, r0, #1                  @ r2 -> left neighbour column
94 mov r3, #16                     @ 16 rows
951: vld1.8 {d0[],d1[]},[r2], r1   @ broadcast left pixel across q0
96 vst1.8 {q0}, [r0,:128], r1     @ write one 16-byte row
97 subs r3, r3, #1
98 bne 1b
99 bx lr
100endfunc
101
@ 16x16 vertical prediction: every row is a copy of the row above the block.
@ In: r0 = dst, r1 = line stride (bytes).
102function ff_pred16x16_vert_neon, export=1
103 sub r0, r0, r1                 @ step back to the row above
104 vld1.8 {q0}, [r0,:128], r1    @ load top row, r0 returns to dst row 0
105 mov r3, #8                     @ 8 iterations x 2 rows each
1061: vst1.8 {q0}, [r0,:128], r1
107 vst1.8 {q0}, [r0,:128], r1
108 subs r3, r3, #1
109 bne 1b
110 bx lr
111endfunc
112
@ 16x16 plane (planar) prediction: fit a linear ramp a + b*x + c*y to the
@ top and left neighbour pixels and fill the block with it, clamping each
@ output to [0,255] (vqshrun).  The gradients use the 1..8 weights from
@ p16weight; b and c are scaled by 5 and rounded >>6 as in the H.264 spec
@ (8.3.3.4).  NOTE(review): the lane shuffles below (vtrn/vrev/vext) encode
@ the exact spec arithmetic compactly — verify against h264pred.c before
@ touching any of them.
@ In: r0 = dst, r1 = line stride (bytes).
113function ff_pred16x16_plane_neon, export=1
114 sub r3, r0, r1                 @ r3 -> row above
115 add r2, r3, #8                 @ r2 -> right half of top row
116 sub r3, r3, #1                 @ r3 -> top-left corner pixel
117 vld1.8 {d0}, [r3]             @ d0 = top-left 8 neighbours
118 vld1.8 {d2}, [r2,:64], r1     @ d2 = top-right 8 neighbours
119 ldcol.8 d1, r3, r1             @ d1 = upper part of left column
120 add r3, r3, r1                 @ skip one row (weights are symmetric)
121 ldcol.8 d3, r3, r1             @ d3 = lower part of left column
122 vrev64.8 q0, q0                @ reverse so lanes pair up with weights
123 vaddl.u8 q8, d2, d3            @ keep corner sums for the offset term a
124 vsubl.u8 q2, d2, d0            @ horizontal differences (widened)
125 vsubl.u8 q3, d3, d1            @ vertical differences (widened)
126 movrel r3, p16weight
127 vld1.8 {q0}, [r3,:128]        @ q0 = weights {1..8}
128 vmul.s16 q2, q2, q0            @ weight the horizontal diffs
129 vmul.s16 q3, q3, q0            @ weight the vertical diffs
130 vadd.i16 d4, d4, d5            @ reduce to H ...
131 vadd.i16 d5, d6, d7            @ ... and V
132 vpadd.i16 d4, d4, d5
133 vpadd.i16 d4, d4, d4           @ d4 holds {H, V} pairs
134 vshll.s16 q3, d4, #2           @ 4*{H,V}
135 vaddw.s16 q2, q3, d4           @ 5*{H,V}
136 vrshrn.s32 d4, q2, #6          @ b = (5*H+32)>>6, c = (5*V+32)>>6
137 mov r3, #0
138 vtrn.16 d4, d5
139 vadd.i16 d2, d4, d5            @ b + c
140 vshl.i16 d3, d2, #3            @ 8*(b+c)
141 vrev64.16 d16, d17
142 vsub.i16 d3, d3, d2            @ 7*(b+c)
143 vadd.i16 d16, d16, d0
144 vshl.i16 d2, d16, #4           @ a = 16*(corner sum + 1) term
145 vsub.i16 d2, d2, d3            @ start value: a - 7*(b+c)
146 vshl.i16 d3, d4, #4            @ 16*b (per-row horizontal span)
147 vext.16 q0, q0, q0, #7         @ rotate weights to {0..7} positions
148 vsub.i16 d6, d5, d3            @ row step: c - 16*b
149 vmov.16 d0[0], r3              @ weight lane 0 := 0
150 vmul.i16 q0, q0, d4[0]         @ q0 = b * {0,1,..,15} (x offsets)
151 vdup.16 q1, d2[0]              @ q1 = row accumulator start
152 vdup.16 q2, d4[0]              @ q2 = b
153 vdup.16 q3, d6[0]              @ q3 = row step
154 vshl.i16 q2, q2, #3            @ q2 = 8*b (step between half-rows)
155 vadd.i16 q1, q1, q0            @ q1 = first half-row values
156 vadd.i16 q3, q3, q2            @ advance step to next full row
157 mov r3, #16                    @ 16 rows
1581:
159 vqshrun.s16 d0, q1, #5         @ clip16((v+16)>>5), left 8 pixels
160 vadd.i16 q1, q1, q2            @ + 8*b -> right half of the row
161 vqshrun.s16 d1, q1, #5         @ right 8 pixels
162 vadd.i16 q1, q1, q3            @ advance to the next row
163 vst1.8 {q0}, [r0,:128], r1
164 subs r3, r3, #1
165 bne 1b
166 bx lr
167endfunc
168
@ Weight table 1..8 used by both plane-prediction functions to weight the
@ neighbour-pixel differences when computing the H and V gradients.
169const p16weight, align=4
170 .short 1,2,3,4,5,6,7,8
171endconst
172
@ 8x8 horizontal prediction: each row is filled with the pixel to its left.
@ In: r0 = dst, r1 = line stride (bytes).
173function ff_pred8x8_hor_neon, export=1
174 sub r2, r0, #1                 @ r2 -> left neighbour column
175 mov r3, #8                     @ 8 rows
1761: vld1.8 {d0[]}, [r2], r1      @ broadcast left pixel across d0
177 vst1.8 {d0}, [r0,:64], r1     @ write one 8-byte row
178 subs r3, r3, #1
179 bne 1b
180 bx lr
181endfunc
182
@ 8x8 vertical prediction: every row is a copy of the row above the block.
@ In: r0 = dst, r1 = line stride (bytes).
183function ff_pred8x8_vert_neon, export=1
184 sub r0, r0, r1                 @ step back to the row above
185 vld1.8 {d0}, [r0,:64], r1     @ load top row, r0 returns to dst row 0
186 mov r3, #4                     @ 4 iterations x 2 rows each
1871: vst1.8 {d0}, [r0,:64], r1
188 vst1.8 {d0}, [r0,:64], r1
189 subs r3, r3, #1
190 bne 1b
191 bx lr
192endfunc
193
@ 8x8 plane (planar) prediction, the chroma analogue of the 16x16 version:
@ fit a + b*x + c*y to the neighbours (weights 1..4 from p16weight),
@ b,c scaled by 17 and rounded >>5, outputs clamped via vqshrun.
@ NOTE(review): lane shuffles mirror the 16x16 routine with half-width
@ operands — verify against h264pred.c before modifying.
@ In: r0 = dst, r1 = line stride (bytes).
194function ff_pred8x8_plane_neon, export=1
195 sub r3, r0, r1                 @ r3 -> row above
196 add r2, r3, #4                 @ r2 -> right half of top row
197 sub r3, r3, #1                 @ r3 -> top-left corner pixel
198 vld1.32 {d0[0]}, [r3]         @ top-left 4 neighbours
199 vld1.32 {d2[0]}, [r2,:32], r1 @ top-right 4 neighbours
200 ldcol.8 d0, r3, r1, 4, hi=1    @ upper left-column into d0 lanes 4-7
201 add r3, r3, r1                 @ skip one row (weights are symmetric)
202 ldcol.8 d3, r3, r1, 4          @ lower left-column
203 vaddl.u8 q8, d2, d3            @ corner sums for the offset term a
204 vrev32.8 d0, d0                @ reverse so lanes pair up with weights
205 vtrn.32 d2, d3
206 vsubl.u8 q2, d2, d0            @ horizontal/vertical diffs (widened)
207 movrel r3, p16weight
208 vld1.16 {q0}, [r3,:128]       @ d0 = weights {1..4}
209 vmul.s16 d4, d4, d0            @ weight the diffs ...
210 vmul.s16 d5, d5, d0
211 vpadd.i16 d4, d4, d5           @ ... and reduce
212 vpaddl.s16 d4, d4              @ d4 = {H, V} as s32
213 vshl.i32 d5, d4, #4            @ 16*{H,V}
214 vadd.s32 d4, d4, d5            @ 17*{H,V}
215 vrshrn.s32 d4, q2, #5          @ b = (17*H+16)>>5, c = (17*V+16)>>5
216 mov r3, #0
217 vtrn.16 d4, d5
218 vadd.i16 d2, d4, d5            @ b + c
219 vshl.i16 d3, d2, #2            @ 4*(b+c)
220 vrev64.16 d16, d16
221 vsub.i16 d3, d3, d2            @ 3*(b+c)
222 vadd.i16 d16, d16, d0
223 vshl.i16 d2, d16, #4           @ a = 16*(corner sum + 1) term
224 vsub.i16 d2, d2, d3            @ start value: a - 3*(b+c)
225 vshl.i16 d3, d4, #3            @ 8*b (per-row horizontal span)
226 vext.16 q0, q0, q0, #7         @ rotate weights to {0..7} positions
227 vsub.i16 d6, d5, d3            @ row step: c - 8*b
228 vmov.16 d0[0], r3              @ weight lane 0 := 0
229 vmul.i16 q0, q0, d4[0]         @ q0 = b * {0,1,..,7} (x offsets)
230 vdup.16 q1, d2[0]              @ q1 = row accumulator start
231 vdup.16 q2, d4[0]              @ q2 = b
232 vdup.16 q3, d6[0]              @ q3 = row step
233 vshl.i16 q2, q2, #3            @ q2 = 8*b
234 vadd.i16 q1, q1, q0            @ q1 = first row values
235 vadd.i16 q3, q3, q2            @ combined advance to next row
236 mov r3, #8                     @ 8 rows
2371:
238 vqshrun.s16 d0, q1, #5         @ clip16((v+16)>>5), one 8-pixel row
239 vadd.i16 q1, q1, q3            @ advance to the next row
240 vst1.8 {d0}, [r0,:64], r1
241 subs r3, r3, #1
242 bne 1b
243 bx lr
244endfunc
245
@ 8x8 DC prediction when no neighbours are available: fill with 128.
@ In: r0 = dst, r1 = line stride (bytes).
246function ff_pred8x8_128_dc_neon, export=1
247 vmov.i8 q0, #128               @ d0 = top-half rows, d1 = bottom-half rows
248 b .L_pred8x8_dc_end            @ shared 8-row store loop below
249endfunc
250
@ 8x8 top-DC prediction: left and right 4x8 halves each get the DC of
@ the 4 top neighbours above them, (sum+2)>>2.
@ In: r0 = dst, r1 = line stride (bytes).
251function ff_pred8x8_top_dc_neon, export=1
252 sub r2, r0, r1                 @ r2 -> row above
253 vld1.8 {d0}, [r2,:64]         @ 8 top neighbours
254 vpaddl.u8 d0, d0               @ pairwise sums (4x u16)
255 vpadd.u16 d0, d0, d0           @ lane0 = left-4 sum, lane1 = right-4 sum
256 vrshrn.u16 d0, q0, #2         @ (sum+2)>>2 per half
257 vdup.8 d1, d0[1]              @ right-half DC
258 vdup.8 d0, d0[0]              @ left-half DC
259 vtrn.32 d0, d1                @ interleave into the d0/d1 row layout
260 b .L_pred8x8_dc_end           @ shared 8-row store loop below
261endfunc
262
@ 8x8 left-DC prediction: top and bottom 8x4 halves each get the DC of
@ the 4 left neighbours beside them, (sum+2)>>2.
@ In: r0 = dst, r1 = line stride (bytes).
263function ff_pred8x8_left_dc_neon, export=1
264 sub r2, r0, #1                 @ r2 -> left neighbour column
265 ldcol.8 d0, r2, r1             @ 8 left neighbours
266 vpaddl.u8 d0, d0               @ pairwise sums (4x u16)
267 vpadd.u16 d0, d0, d0           @ lane0 = top-4 sum, lane1 = bottom-4 sum
268 vrshrn.u16 d0, q0, #2         @ (sum+2)>>2 per half
269 vdup.8 d1, d0[1]              @ d1 = bottom-half rows
270 vdup.8 d0, d0[0]              @ d0 = top-half rows
271 b .L_pred8x8_dc_end           @ shared 8-row store loop below
272endfunc
273
@ 8x8 DC prediction with both neighbours: four independent 4x4 quadrant
@ DCs per the H.264 chroma rules — corner quadrants average 8 pixels
@ ((sum+4)>>3, the #3 narrowing), edge quadrants average 4 ((sum+2)>>2,
@ the #2 narrowing).  Also provides .L_pred8x8_dc_end, the store loop
@ shared by all 8x8 DC variants (arrive with d0 = top-half row pattern,
@ d1 = bottom-half row pattern).
@ In: r0 = dst, r1 = line stride (bytes).
274function ff_pred8x8_dc_neon, export=1
275 sub r2, r0, r1                 @ r2 -> row above
276 vld1.8 {d0}, [r2,:64]         @ d0 = 8 top neighbours
277 sub r2, r0, #1                 @ r2 -> left neighbour column
278 ldcol.8 d1, r2, r1             @ d1 = 8 left neighbours
279 vtrn.32 d0, d1                 @ group {top0-3,left0-3} / {top4-7,left4-7}
280 vpaddl.u8 q0, q0               @ pairwise widening sums
281 vpadd.u16 d0, d0, d1           @ 4-pixel group sums
282 vpadd.u16 d1, d0, d0           @ 8-pixel (top+left) sums
283 vrshrn.u16 d2, q0, #3         @ (sum+4)>>3 candidates (corner DCs)
284 vrshrn.u16 d3, q0, #2         @ (sum+2)>>2 candidates (edge DCs)
285 vdup.8 d0, d2[4]              @ top-left quadrant DC
286 vdup.8 d1, d3[3]              @ bottom-left quadrant DC
287 vdup.8 d4, d3[2]              @ top-right quadrant DC
288 vdup.8 d5, d2[5]              @ bottom-right quadrant DC
289 vtrn.32 q0, q2                @ d0 = top rows, d1 = bottom rows
290.L_pred8x8_dc_end:              @ shared: d0 -> rows 0-3, d1 -> rows 4-7
291 mov r3, #4
292 add r2, r0, r1, lsl #2        @ r2 -> row 4
2936: vst1.8 {d0}, [r0,:64], r1
294 vst1.8 {d1}, [r2,:64], r1
295 subs r3, r3, #1
296 bne 6b
297 bx lr
298endfunc
299
@ 8x8 DC prediction variant for partially available neighbours: the full
@ top row plus only the upper 4 left pixels are used (presumably the
@ "left-upper + top" availability case — verify against h264pred.c).
@ Same quadrant-DC scheme as ff_pred8x8_dc_neon: #3 narrowing = 8-pixel
@ averages, #2 narrowing = 4-pixel averages.
@ In: r0 = dst, r1 = line stride (bytes).
300function ff_pred8x8_l0t_dc_neon, export=1
301 sub r2, r0, r1                 @ r2 -> row above
302 vld1.8 {d0}, [r2,:64]         @ 8 top neighbours
303 sub r2, r0, #1                 @ r2 -> left neighbour column
304 ldcol.8 d1, r2, r1, 4          @ only the upper 4 left neighbours
305 vtrn.32 d0, d1
306 vpaddl.u8 q0, q0
307 vpadd.u16 d0, d0, d1
308 vpadd.u16 d1, d0, d0
309 vrshrn.u16 d2, q0, #3         @ (sum+4)>>3 candidates
310 vrshrn.u16 d3, q0, #2         @ (sum+2)>>2 candidates
311 vdup.8 d0, d2[4]              @ top-left quadrant DC (top+left average)
312 vdup.8 d1, d3[0]              @ bottom-left quadrant DC
313 vdup.8 q2, d3[2]              @ right quadrants DC (top-only average)
314 vtrn.32 q0, q2                @ d0 = top rows, d1 = bottom rows
315 b .L_pred8x8_dc_end           @ shared 8-row store loop
316endfunc
317
@ 8x8 DC prediction variant with only the upper 4 left pixels available:
@ top half gets their average (sum+2)>>2, bottom half falls back to 128.
@ In: r0 = dst, r1 = line stride (bytes).
318function ff_pred8x8_l00_dc_neon, export=1
319 sub r2, r0, #1                 @ r2 -> left neighbour column
320 ldcol.8 d0, r2, r1, 4          @ upper 4 left neighbours
321 vpaddl.u8 d0, d0
322 vpadd.u16 d0, d0, d0           @ lane0 = their sum
323 vrshrn.u16 d0, q0, #2         @ (sum+2)>>2
324 vmov.i8 d1, #128              @ bottom rows: no neighbours -> 128
325 vdup.8 d0, d0[0]              @ top rows: left DC
326 b .L_pred8x8_dc_end           @ shared 8-row store loop
327endfunc
328
@ 8x8 DC prediction variant for partially available neighbours: the full
@ top row plus only the lower 4 left pixels are used (presumably the
@ "left-lower + top" availability case — verify against h264pred.c).
@ Quadrant-DC scheme as in ff_pred8x8_dc_neon.
@ In: r0 = dst, r1 = line stride (bytes).
329function ff_pred8x8_0lt_dc_neon, export=1
330 sub r2, r0, r1                 @ r2 -> row above
331 vld1.8 {d0}, [r2,:64]         @ 8 top neighbours
332 add r2, r0, r1, lsl #2        @ r2 -> row 4 ...
333 sub r2, r2, #1                @ ... left neighbour of row 4
334 ldcol.8 d1, r2, r1, 4, hi=1   @ lower 4 left neighbours into lanes 4-7
335 vtrn.32 d0, d1
336 vpaddl.u8 q0, q0
337 vpadd.u16 d0, d0, d1
338 vpadd.u16 d1, d0, d0
339 vrshrn.u16 d3, q0, #2         @ (sum+2)>>2 candidates
340 vrshrn.u16 d2, q0, #3         @ (sum+4)>>3 candidates
341 vdup.8 d0, d3[0]              @ top-left quadrant DC (top-only average)
342 vdup.8 d1, d3[3]              @ bottom-left quadrant DC
343 vdup.8 d4, d3[2]              @ top-right quadrant DC
344 vdup.8 d5, d2[5]              @ bottom-right quadrant DC (top+left avg)
345 vtrn.32 q0, q2                @ d0 = top rows, d1 = bottom rows
346 b .L_pred8x8_dc_end           @ shared 8-row store loop
347endfunc
348
@ 8x8 DC prediction variant with only the lower 4 left pixels available:
@ bottom half gets their average (sum+2)>>2, top half falls back to 128.
@ In: r0 = dst, r1 = line stride (bytes).
349function ff_pred8x8_0l0_dc_neon, export=1
350 add r2, r0, r1, lsl #2        @ r2 -> row 4 ...
351 sub r2, r2, #1                @ ... left neighbour of row 4
352 ldcol.8 d1, r2, r1, 4         @ lower 4 left neighbours
353 vpaddl.u8 d2, d1
354 vpadd.u16 d2, d2, d2          @ lane0 = their sum
355 vrshrn.u16 d1, q1, #2        @ (sum+2)>>2
356 vmov.i8 d0, #128             @ top rows: no neighbours -> 128
357 vdup.8 d1, d1[0]             @ bottom rows: left DC
358 b .L_pred8x8_dc_end          @ shared 8-row store loop
359endfunc