/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
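/*
 * All functions below compute the bilinear chroma interpolation for the
 * eighth-pel position (x, y), 0 <= x, y <= 7.  As a reference for what
 * the NEON code computes, a minimal C sketch of the put case
 * (illustrative only; the names here are ours, not FFmpeg API):
 *
 *     const int A = (8 - x) * (8 - y);   // weights, A + B + C + D == 64
 *     const int B = x * (8 - y);
 *     const int C = (8 - x) * y;
 *     const int D = x * y;
 *     for (int i = 0; i < h; i++) {
 *         for (int j = 0; j < 8; j++)    // 8/4/2 for mc8/mc4/mc2
 *             dst[j] = (A * src[j]          + B * src[j + 1] +
 *                       C * src[j + stride] + D * src[j + stride + 1] +
 *                       32) >> 6;         // rv40/vc1 use other biases
 *         dst += stride;
 *         src += stride;
 *     }
 */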
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]

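        /* rv40 rounds with a position-dependent bias instead of the fixed
           +32: the rv40bias table is indexed by (y >> 1, x >> 1); vc1
           uses a constant bias of 28 */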
  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif
  .ifc \codec,vc1
        vmov.u16        q11, #28
  .endif

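        /* bilinear weights: r7 = x*y (D), r12 = 8x - x*y (B),
           r6 = 8y - x*y (C), r4 = 64 - 8x - 8y + x*y (A) */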
A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vld1.8          {d4, d5}, [r1], r2
        vdup.8          d2,  r6
        vdup.8          d3,  r7
        vext.8          d5,  d4,  d5,  #1

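        /* full 2-D filter: two output rows per iteration */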
1:      vld1.8          {d6, d7}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vext.8          d7,  d6,  d7,  #1
        vld1.8          {d4, d5}, [r1], r2
        vmlal.u8        q8,  d6,  d2
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

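        /* x*y == 0: at most one of x, y is nonzero, so the filter is
           1-D (or a plain copy); r12 = 8x + 8y is its second tap */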
2:      adds            r12, r12, r6
        vdup.8          d0,  r4
        beq             5f
        tst             r6,  r6
        vdup.8          d1,  r12

        beq             4f

        vld1.8          {d4}, [r1], r2

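        /* x == 0: vertical-only filter */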
3:      vld1.8          {d6}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.8          {d4}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
        pld             [r1, r2]
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        subs            r3,  r3,  #2
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

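        /* y == 0: horizontal-only filter */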
4:      vld1.8          {d4, d5}, [r1], r2
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             4b

        pop             {r4-r7, pc}

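        /* x == y == 0: plain copy (d0 == 64, so multiply+shift is the
           identity) */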
5:      vld1.8          {d4}, [r1], r2
        vld1.8          {d5}, [r1], r2
        pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d5,  d0
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
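/* same structure as mc8, but two 4-byte halves are packed per d register
   (vtrn.32) so each 8-byte multiply covers both horizontal taps, folded
   afterwards with vadd.i16 */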
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  r5,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0
  .endif
        pld             [r1]
        pld             [r1, r2]

  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif
  .ifc \codec,vc1
        vmov.u16        q11, #28
  .endif

A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vld1.8          {d4}, [r1], r2
        vdup.8          d2,  r6
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5

        vtrn.32         d0,  d1
        vtrn.32         d2,  d3

1:      vld1.8          {d6}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.8          {d4}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        pld             [r1]
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
        subs            r3,  r3,  #2
        pld             [r1, r2]
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      adds            r12, r12, r6
        vdup.8          d0,  r4
        beq             5f
        tst             r6,  r6
        vdup.8          d1,  r12
        vtrn.32         d0,  d1

        beq             4f

        vext.32         d1,  d0,  d1,  #1
        vld1.32         {d4[0]}, [r1], r2

3:      vld1.32         {d4[1]}, [r1], r2
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]}, [r1], r2
        vmull.u8        q9,  d4,  d1
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        subs            r3,  r3,  #2
        pld             [r1, r2]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.8          {d4}, [r1], r2
        vld1.8          {d6}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7
        vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             4b

        pop             {r4-r7, pc}

5:      vld1.32         {d4[0]}, [r1], r2
        vld1.32         {d4[1]}, [r1], r2
        vmull.u8        q8,  d4,  d0
        subs            r3,  r3,  #2
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm

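/* mc2 packs all four weights into q0 (vtrn.16) and always runs the full
   2-D filter; only the x == y == 0 case takes the copy path at 2: */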
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push            {r4-r6, lr}
        ldr             r4,  [sp, #16]
        ldr             lr,  [sp, #20]
        pld             [r1]
        pld             [r1, r2]
        orrs            r5,  r4,  lr
        beq             2f

        mul             r5,  r4,  lr
        rsb             r6,  r5,  lr,  lsl #3
        rsb             r12, r5,  r4,  lsl #3
        sub             r4,  r5,  r4,  lsl #3
        sub             r4,  r4,  lr,  lsl #3
        add             r4,  r4,  #64
        vdup.8          d0,  r4
        vdup.8          d2,  r12
        vdup.8          d1,  r6
        vdup.8          d3,  r5
        vtrn.16         q0,  q1
1:
        vld1.32         {d4[0]}, [r1], r2
        vld1.32         {d4[1]}, [r1], r2
        vrev64.32       d5,  d4
        vld1.32         {d5[1]}, [r1]
        vext.8          q3,  q2,  q2,  #1
        vtrn.16         q2,  q3
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
  .ifc \type,avg
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
  .endif
        vtrn.32         d16, d17
        vadd.i16        d16, d16, d17
        vrshrn.u16      d16, q8,  #6
  .ifc \type,avg
        vrhadd.u8       d16, d16, d18
  .endif
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
        subs            r3,  r3,  #2
        bgt             1b
        pop             {r4-r6, pc}
2:
  .ifc \type,put
        ldrh_post       r5,  r1,  r2
        strh_post       r5,  r0,  r2
        ldrh_post       r6,  r1,  r2
        strh_post       r6,  r0,  r2
  .else
        vld1.16         {d16[0]}, [r1], r2
        vld1.16         {d16[1]}, [r1], r2
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
        vrhadd.u8       d16, d16, d18
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
  .endif
        subs            r3,  r3,  #2
        bgt             2b
        pop             {r4-r6, pc}
endfunc
.endm

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg

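/* rounding biases for rv40, indexed by (y >> 1) * 4 + (x >> 1)
   (one .short per entry, matching the lsl #3 / lsl #1 address math above) */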
#if CONFIG_RV40_DECODER
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif

#if CONFIG_VC1_DECODER
        h264_chroma_mc8 put, vc1
        h264_chroma_mc8 avg, vc1
        h264_chroma_mc4 put, vc1
        h264_chroma_mc4 avg, vc1
#endif