/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
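/*
 * The macros below implement the usual bilinear chroma interpolation:
 * with fractional offsets x, y in 0..7 the weights are A = (8-x)*(8-y),
 * B = x*(8-y), C = (8-x)*y and D = x*y, and each output pixel is
 * (A*s[0] + B*s[1] + C*s[stride] + D*s[stride+1] + bias) >> 6.
 * "put" stores the result, "avg" averages it with the destination.  The
 * codec argument only selects the rounding: H.264 rounds with vrshrn,
 * VC-1 adds a constant bias of 28, and RV40 adds a bias looked up from
 * the rv40bias table at the end of this file.
 */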
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4, r5, [sp, #20]
  .ifc \type,avg
        mov             lr, r0
  .endif
        pld             [r1]
        pld             [r1, r2]

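@ codec-specific rounding: RV40 loads a bias for this (x, y) pair from the
@ rv40bias table into q11, VC-1 uses a constant bias of 28; H.264 needs no
@ bias register since it uses the rounding narrowing shift (vrshrn) below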
  .ifc \codec,rv40
        movrel          r6, rv40bias
        lsr             r7, r5, #1
        add             r6, r6, r7, lsl #3
        lsr             r7, r4, #1
        add             r6, r6, r7, lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif
  .ifc \codec,vc1
        vmov.u16        q11, #28
  .endif

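@ bilinear weights: r7 = D = x*y, r6 = C = (8-x)*y, r12 = B = x*(8-y),
@ r4 = A = (8-x)*(8-y); the muls/cmp also sets the flags so that x*y == 0
@ takes the simpler one-dimensional paths at label 2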
A       muls            r7, r4, r5
T       mul             r7, r4, r5
T       cmp             r7, #0
        rsb             r6, r7, r5, lsl #3
        rsb             r12, r7, r4, lsl #3
        sub             r4, r7, r4, lsl #3
        sub             r4, r4, r5, lsl #3
        add             r4, r4, #64

        beq             2f

        vdup.8          d0, r4
        vdup.8          d1, r12
        vld1.8          {d4, d5}, [r1], r2
        vdup.8          d2, r6
        vdup.8          d3, r7
        vext.8          d5, d4, d5, #1

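@ both offsets non-zero: full 2-D filter, two rows of 8 pixels per iteration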
1:      vld1.8          {d6, d7}, [r1], r2
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d5, d1
        vext.8          d7, d6, d7, #1
        vld1.8          {d4, d5}, [r1], r2
        vmlal.u8        q8, d6, d2
        pld             [r1]
        vext.8          d5, d4, d5, #1
        vmlal.u8        q8, d7, d3
        vmull.u8        q9, d6, d0
        subs            r3, r3, #2
        vmlal.u8        q9, d7, d1
        vmlal.u8        q9, d4, d2
        vmlal.u8        q9, d5, d3
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8, #6
        vrshrn.u16      d17, q9, #6
  .else
        vadd.u16        q8, q8, q11
        vadd.u16        q9, q9, q11
        vshrn.u16       d16, q8, #6
        vshrn.u16       d17, q9, #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8, q8, q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

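@ x*y == 0: choose vertical-only (3:), horizontal-only (4:) or unfiltered
@ copy (5:) depending on which offset is zero; the two remaining taps
@ (summing to 64) end up in d0/d1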
2:      adds            r12, r12, r6
        vdup.8          d0, r4
        beq             5f
        tst             r6, r6
        vdup.8          d1, r12

        beq             4f

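@ vertical-only filter (x == 0)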
        vld1.8          {d4}, [r1], r2

3:      vld1.8          {d6}, [r1], r2
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d6, d1
        vld1.8          {d4}, [r1], r2
        vmull.u8        q9, d6, d0
        vmlal.u8        q9, d4, d1
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8, #6
        vrshrn.u16      d17, q9, #6
  .else
        vadd.u16        q8, q8, q11
        vadd.u16        q9, q9, q11
        vshrn.u16       d16, q8, #6
        vshrn.u16       d17, q9, #6
  .endif
        pld             [r1, r2]
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8, q8, q10
  .endif
        subs            r3, r3, #2
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

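@ horizontal-only filter (y == 0)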
4:      vld1.8          {d4, d5}, [r1], r2
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d5, d4, d5, #1
        vext.8          d7, d6, d7, #1
        pld             [r1]
        subs            r3, r3, #2
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d5, d1
        vmull.u8        q9, d6, d0
        vmlal.u8        q9, d7, d1
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8, #6
        vrshrn.u16      d17, q9, #6
  .else
        vadd.u16        q8, q8, q11
        vadd.u16        q9, q9, q11
        vshrn.u16       d16, q8, #6
        vshrn.u16       d17, q9, #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8, q8, q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             4b

        pop             {r4-r7, pc}

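@ x == 0 and y == 0: plain copy (or average for avg) with weight 64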
5:      vld1.8          {d4}, [r1], r2
        vld1.8          {d5}, [r1], r2
        pld             [r1]
        subs            r3, r3, #2
        vmull.u8        q8, d4, d0
        vmull.u8        q9, d5, d0
        pld             [r1, r2]
  .ifc \codec,h264
        vrshrn.u16      d16, q8, #6
        vrshrn.u16      d17, q9, #6
  .else
        vadd.u16        q8, q8, q11
        vadd.u16        q9, q9, q11
        vshrn.u16       d16, q8, #6
        vshrn.u16       d17, q9, #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8, q8, q10
  .endif
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
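/*
 * mc4 is the 4-pixel-wide variant of the same filter: each row and its
 * one-pixel-shifted copy are packed into a single d register with vtrn.32,
 * the A/B (and C/D) weights are packed the same way, and the two halves of
 * the widening multiply-accumulate are folded back together with vadd.i16,
 * so one q register still produces one output row.
 */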
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4, r5, [sp, #20]
  .ifc \type,avg
        mov             lr, r0
  .endif
        pld             [r1]
        pld             [r1, r2]

  .ifc \codec,rv40
        movrel          r6, rv40bias
        lsr             r7, r5, #1
        add             r6, r6, r7, lsl #3
        lsr             r7, r4, #1
        add             r6, r6, r7, lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif
  .ifc \codec,vc1
        vmov.u16        q11, #28
  .endif

A       muls            r7, r4, r5
T       mul             r7, r4, r5
T       cmp             r7, #0
        rsb             r6, r7, r5, lsl #3
        rsb             r12, r7, r4, lsl #3
        sub             r4, r7, r4, lsl #3
        sub             r4, r4, r5, lsl #3
        add             r4, r4, #64

        beq             2f

        vdup.8          d0, r4
        vdup.8          d1, r12
        vld1.8          {d4}, [r1], r2
        vdup.8          d2, r6
        vdup.8          d3, r7

        vext.8          d5, d4, d5, #1
        vtrn.32         d4, d5

        vtrn.32         d0, d1
        vtrn.32         d2, d3

1:      vld1.8          {d6}, [r1], r2
        vext.8          d7, d6, d7, #1
        vtrn.32         d6, d7
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d6, d2
        vld1.8          {d4}, [r1], r2
        vext.8          d5, d4, d5, #1
        vtrn.32         d4, d5
        pld             [r1]
        vmull.u8        q9, d6, d0
        vmlal.u8        q9, d4, d2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
  .ifc \codec,h264
        vrshrn.u16      d16, q8, #6
  .else
        vadd.u16        q8, q8, q11
        vshrn.u16       d16, q8, #6
  .endif
        subs            r3, r3, #2
        pld             [r1, r2]
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

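@ x*y == 0: same dispatch as in mc8 (vertical-only at 3:, horizontal-only
@ at 4:, plain copy at 5:), with the two remaining taps packed into d0/d1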
2:      adds            r12, r12, r6
        vdup.8          d0, r4
        beq             5f
        tst             r6, r6
        vdup.8          d1, r12
        vtrn.32         d0, d1

        beq             4f

        vext.32         d1, d0, d1, #1
        vld1.32         {d4[0]}, [r1], r2

3:      vld1.32         {d4[1]}, [r1], r2
        vmull.u8        q8, d4, d0
        vld1.32         {d4[0]}, [r1], r2
        vmull.u8        q9, d4, d1
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8, #6
  .else
        vadd.u16        q8, q8, q11
        vshrn.u16       d16, q8, #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        subs            r3, r3, #2
        pld             [r1, r2]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.8          {d4}, [r1], r2
        vld1.8          {d6}, [r1], r2
        vext.8          d5, d4, d5, #1
        vext.8          d7, d6, d7, #1
        vtrn.32         d4, d5
        vtrn.32         d6, d7
        vmull.u8        q8, d4, d0
        vmull.u8        q9, d6, d0
        subs            r3, r3, #2
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8, #6
  .else
        vadd.u16        q8, q8, q11
        vshrn.u16       d16, q8, #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             4b

        pop             {r4-r7, pc}

5:      vld1.32         {d4[0]}, [r1], r2
        vld1.32         {d4[1]}, [r1], r2
        vmull.u8        q8, d4, d0
        subs            r3, r3, #2
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8, #6
  .else
        vadd.u16        q8, q8, q11
        vshrn.u16       d16, q8, #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm

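/*
 * mc2 handles 2-pixel-wide blocks (H.264 only).  Two source rows plus the
 * row below are gathered with lane loads and shuffled so that a single
 * vmull/vmlal pair covers both output rows; the fully-aligned case
 * (x == 0 and y == 0) at label 2 reduces to a 16-bit copy (put) or
 * average (avg).
 */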
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push            {r4-r6, lr}
        ldr             r4, [sp, #16]
        ldr             lr, [sp, #20]
        pld             [r1]
        pld             [r1, r2]
        orrs            r5, r4, lr
        beq             2f

        mul             r5, r4, lr
        rsb             r6, r5, lr, lsl #3
        rsb             r12, r5, r4, lsl #3
        sub             r4, r5, r4, lsl #3
        sub             r4, r4, lr, lsl #3
        add             r4, r4, #64
        vdup.8          d0, r4
        vdup.8          d2, r12
        vdup.8          d1, r6
        vdup.8          d3, r5
        vtrn.16         q0, q1
1:
        vld1.32         {d4[0]}, [r1], r2
        vld1.32         {d4[1]}, [r1], r2
        vrev64.32       d5, d4
        vld1.32         {d5[1]}, [r1]
        vext.8          q3, q2, q2, #1
        vtrn.16         q2, q3
        vmull.u8        q8, d4, d0
        vmlal.u8        q8, d5, d1
  .ifc \type,avg
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0, r0, r2
  .endif
        vtrn.32         d16, d17
        vadd.i16        d16, d16, d17
        vrshrn.u16      d16, q8, #6
  .ifc \type,avg
        vrhadd.u8       d16, d16, d18
  .endif
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
        subs            r3, r3, #2
        bgt             1b
        pop             {r4-r6, pc}
2:
  .ifc \type,put
        ldrh_post       r5, r1, r2
        strh_post       r5, r0, r2
        ldrh_post       r6, r1, r2
        strh_post       r6, r0, r2
  .else
        vld1.16         {d16[0]}, [r1], r2
        vld1.16         {d16[1]}, [r1], r2
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0, r0, r2
        vrhadd.u8       d16, d16, d18
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
  .endif
        subs            r3, r3, #2
        bgt             2b
        pop             {r4-r6, pc}
endfunc
.endm

h264_chroma_mc8 put
h264_chroma_mc8 avg
h264_chroma_mc4 put
h264_chroma_mc4 avg
h264_chroma_mc2 put
h264_chroma_mc2 avg

#if CONFIG_RV40_DECODER
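@ RV40 rounding bias, indexed as rv40bias[y>>1][x>>1] (four rows of four
@ 16-bit entries); the prologues above compute r6 = rv40bias + (y>>1)*8 + (x>>1)*2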
const   rv40bias
        .short          0, 16, 32, 16
        .short          32, 28, 32, 28
        .short          0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

h264_chroma_mc8 put, rv40
h264_chroma_mc8 avg, rv40
h264_chroma_mc4 put, rv40
h264_chroma_mc4 avg, rv40
#endif

#if CONFIG_VC1_DECODER
h264_chroma_mc8 put, vc1
h264_chroma_mc8 avg, vc1
h264_chroma_mc4 put, vc1
h264_chroma_mc4 avg, vc1
#endif