/*
 * VP8 ARMv6 optimisations
 *
 * Copyright (c) 2010 Google Inc.
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * This code was partially ported from libvpx, which uses this license:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *
 *   * Neither the name of Google nor the names of its contributors may
 *     be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "libavutil/arm/asm.S"

@ idct

@ void vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
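@ A scalar sketch of the inverse 4x4 Walsh-Hadamard transform done here
@ (names follow the t*/dc* comments below; the "+3, >>3" rounding is
@ folded into the second pass via r1):
@     columns:  t0 = dc[i] + dc[i+12]    t3 = dc[i] - dc[i+12]
@               t1 = dc[i+4] + dc[i+8]   t2 = dc[i+4] - dc[i+8]
@               col[i] = { t0+t1, t3+t2, t0-t1, t3-t2 }
@     rows:     the same butterfly on each row, then (x + 3) >> 3
@ Each result is the DC coefficient of one 4x4 subblock, which is why
@ the stores below step by 32 bytes (16 int16_t) through block[].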
function ff_vp8_luma_dc_wht_armv6, export=1
        push     {r4-r10, lr}

        ldm      r1, {r2-r9}
        mov      r10, #0
        mov      lr, #0
        uadd16   r12, r2, r8            @ t0[0,1]
        usub16   r2, r2, r8             @ t3[0,1]
        stm      r1!, {r10, lr}
        uadd16   r8, r4, r6             @ t1[0,1]
        usub16   r4, r4, r6             @ t2[0,1]
        stm      r1!, {r10, lr}
        uadd16   r6, r12, r8            @ dc0[0,1]
        usub16   r12, r12, r8           @ dc2[0,1]
        stm      r1!, {r10, lr}
        uadd16   r8, r2, r4             @ dc1[0,1]
        usub16   r2, r2, r4             @ dc3[0,1]
        stm      r1!, {r10, lr}

        uadd16   lr, r3, r9             @ t0[2,3]
        usub16   r3, r3, r9             @ t3[2,3]
        uadd16   r9, r5, r7             @ t1[2,3]
        usub16   r5, r5, r7             @ t2[2,3]

        uadd16   r7, lr, r9             @ dc0[2,3]
        usub16   lr, lr, r9             @ dc2[2,3]
        uadd16   r9, r3, r5             @ dc1[2,3]
        usub16   r3, r3, r5             @ dc3[2,3]

        mov      r1, #3
        orr      r1, r1, #0x30000       @ 3 | 3 (round)

        pkhbt    r4, r6, r8, lsl #16    @ dc{0,1}[0]
        pkhtb    r6, r8, r6, asr #16    @ dc{0,1}[1]
        pkhbt    r5, r12, r2, lsl #16   @ dc{2,3}[0]
        pkhtb    r12, r2, r12, asr #16  @ dc{2,3}[1]
        pkhbt    r8, r7, r9, lsl #16    @ dc{0,1}[2]
        uadd16   r4, r4, r1
        uadd16   r5, r5, r1
        pkhtb    r7, r9, r7, asr #16    @ dc{0,1}[3]
        pkhbt    r2, lr, r3, lsl #16    @ dc{2,3}[2]
        pkhtb    lr, r3, lr, asr #16    @ dc{2,3}[3]

        uadd16   r9, r4, r7             @ t0[0,1]
        uadd16   r3, r5, lr             @ t0[2,3]
        usub16   r4, r4, r7             @ t3[0,1]
        usub16   r5, r5, lr             @ t3[2,3]
        uadd16   r7, r6, r8             @ t1[0,1]
        uadd16   lr, r12, r2            @ t1[2,3]
        usub16   r6, r6, r8             @ t2[0,1]
        usub16   r12, r12, r2           @ t2[2,3]

        uadd16   r8, r9, r7             @ block[0,1][0]
        uadd16   r2, r3, lr             @ block[2,3][0]
        usub16   r9, r9, r7             @ block[0,1][2]
        usub16   r3, r3, lr             @ block[2,3][2]
        uadd16   r7, r4, r6             @ block[0,1][1]
        uadd16   lr, r5, r12            @ block[2,3][1]
        usub16   r4, r4, r6             @ block[0,1][3]
        usub16   r5, r5, r12            @ block[2,3][3]

#if HAVE_ARMV6T2_EXTERNAL
        sbfx     r6, r8, #3, #13
        sbfx     r12, r7, #3, #13
        sbfx     r1, r9, #3, #13
        sbfx     r10, r4, #3, #13
#else
        sxth     r6, r8
        sxth     r12, r7
        sxth     r1, r9
        sxth     r10, r4
        asr      r6, #3                 @ block[0][0]
        asr      r12, #3                @ block[0][1]
        asr      r1, #3                 @ block[0][2]
        asr      r10, #3                @ block[0][3]
#endif

        strh     r6, [r0], #32
        asr      r8, r8, #19            @ block[1][0]
        strh     r12, [r0], #32
        asr      r7, r7, #19            @ block[1][1]
        strh     r1, [r0], #32
        asr      r9, r9, #19            @ block[1][2]
        strh     r10, [r0], #32
        asr      r4, r4, #19            @ block[1][3]
        strh     r8, [r0], #32
        asr      r6, r2, #19            @ block[3][0]
        strh     r7, [r0], #32
        asr      r12, lr, #19           @ block[3][1]
        strh     r9, [r0], #32
        asr      r1, r3, #19            @ block[3][2]
        strh     r4, [r0], #32
        asr      r10, r5, #19           @ block[3][3]

#if HAVE_ARMV6T2_EXTERNAL
        sbfx     r2, r2, #3, #13
        sbfx     lr, lr, #3, #13
        sbfx     r3, r3, #3, #13
        sbfx     r5, r5, #3, #13
#else
        sxth     r2, r2
        sxth     lr, lr
        sxth     r3, r3
        sxth     r5, r5
        asr      r2, #3                 @ block[2][0]
        asr      lr, #3                 @ block[2][1]
        asr      r3, #3                 @ block[2][2]
        asr      r5, #3                 @ block[2][3]
#endif

        strh     r2, [r0], #32
        strh     lr, [r0], #32
        strh     r3, [r0], #32
        strh     r5, [r0], #32
        strh     r6, [r0], #32
        strh     r12, [r0], #32
        strh     r1, [r0], #32
        strh     r10, [r0], #32

        pop      {r4-r10, pc}
endfunc

@ void vp8_luma_dc_wht_dc(int16_t block[4][4][16], int16_t dc[16])
function ff_vp8_luma_dc_wht_dc_armv6, export=1
        ldrsh    r2, [r1]
        mov      r3, #0
        add      r2, r2, #3
        strh     r3, [r1]
        asr      r2, r2, #3
        .rept    16
        strh     r2, [r0], #32
        .endr
        bx       lr
endfunc

@ void vp8_idct_add(uint8_t *dst, int16_t block[16], int stride)
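@ The two multipliers below are the standard VP8 Q16 fixed-point
@ constants:
@     20091 / 65536 ~= sqrt(2) * cos(pi / 8) - 1
@     35468 / 65536 ~= sqrt(2) * sin(pi / 8)
@ smulw[bt] returns (a * b) >> 16, so multiplying by the "minus 1"
@ constant and adding the input back in (the uadd16 below) yields the
@ full sqrt(2) * cos(pi / 8) product without overflowing 16 bits.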
function ff_vp8_idct_add_armv6, export=1
        push     {r4-r12, lr}
        sub      sp, sp, #32

        movw     r3, #20091             @ cospi8sqrt2minus1
        movw     r4, #35468             @ sinpi8sqrt2
        mov      r5, sp
1:
        ldr      r6, [r1, #8]           @ i5 | i4 = block1[1] | block1[0]
        ldr      lr, [r1, #16]          @ i9 | i8 = block2[1] | block2[0]
        ldr      r12, [r1, #24]         @ i13 | i12 = block3[1] | block3[0]

        smulwt   r9, r3, r6             @ ip[5] * cospi8sqrt2minus1
        smulwb   r7, r3, r6             @ ip[4] * cospi8sqrt2minus1
        smulwt   r10, r4, r6            @ ip[5] * sinpi8sqrt2
        smulwb   r8, r4, r6             @ ip[4] * sinpi8sqrt2
        pkhbt    r7, r7, r9, lsl #16    @ 5c | 4c
        smulwt   r11, r3, r12           @ ip[13] * cospi8sqrt2minus1
        pkhbt    r8, r8, r10, lsl #16   @ 5s | 4s = t2 first half
        uadd16   r6, r6, r7             @ 5c+5 | 4c+4 = t3 first half
        smulwb   r9, r3, r12            @ ip[12] * cospi8sqrt2minus1
        smulwt   r7, r4, r12            @ ip[13] * sinpi8sqrt2
        smulwb   r10, r4, r12           @ ip[12] * sinpi8sqrt2

        pkhbt    r9, r9, r11, lsl #16   @ 13c | 12c
        ldr      r11, [r1]              @ i1 | i0
        pkhbt    r10, r10, r7, lsl #16  @ 13s | 12s = t3 second half
        uadd16   r7, r12, r9            @ 13c+13 | 12c+12 = t2 2nd half
        uadd16   r6, r6, r10            @ d = t3
        uadd16   r10, r11, lr           @ a = t0
        usub16   r7, r8, r7             @ c = t2
        usub16   r8, r11, lr            @ b = t1
        uadd16   r9, r10, r6            @ a+d = tmp{0,1}[0]
        usub16   r10, r10, r6           @ a-d = tmp{0,1}[3]
        uadd16   r6, r8, r7             @ b+c = tmp{0,1}[1]
        usub16   r7, r8, r7             @ b-c = tmp{0,1}[2]
        mov      r8, #0
        cmp      sp, r5
        str      r6, [r5, #8]           @ o5 | o4
        str      r7, [r5, #16]          @ o9 | o8
        str      r10, [r5, #24]         @ o13 | o12
        str      r9, [r5], #4           @ o1 | o0
        str      r8, [r1, #8]
        str      r8, [r1, #16]
        str      r8, [r1, #24]
        str      r8, [r1], #4
        beq      1b

        mov      r5, #2
2:
        pop      {r1, r6, r12, lr}
        smulwt   r9, r3, r12            @ ip[5] * cospi8sqrt2minus1
        smulwt   r7, r3, r1             @ ip[1] * cospi8sqrt2minus1
        smulwt   r10, r4, r12           @ ip[5] * sinpi8sqrt2
        smulwt   r8, r4, r1             @ ip[1] * sinpi8sqrt2
        pkhbt    r11, r1, r12, lsl #16  @ i4 | i0 = t0/t1 first half
        pkhtb    r1, r12, r1, asr #16   @ i5 | i1
        pkhbt    r7, r7, r9, lsl #16    @ 5c | 1c
        pkhbt    r8, r8, r10, lsl #16   @ 5s | 1s = t2 first half
        pkhbt    r9, r6, lr, lsl #16    @ i6 | i2 = t0/t1 second half
        pkhtb    r12, lr, r6, asr #16   @ i7 | i3
        uadd16   r1, r7, r1             @ 5c+5 | 1c+1 = t3 first half
        uadd16   r10, r11, r9           @ a = t0
        usub16   r9, r11, r9            @ b = t1
        smulwt   r7, r3, r12            @ ip[7] * cospi8sqrt2minus1
        smulwb   lr, r3, r12            @ ip[3] * cospi8sqrt2minus1
        smulwt   r11, r4, r12           @ ip[7] * sinpi8sqrt2
        smulwb   r6, r4, r12            @ ip[3] * sinpi8sqrt2
        subs     r5, r5, #1
        pkhbt    r7, lr, r7, lsl #16    @ 7c | 3c
        pkhbt    r11, r6, r11, lsl #16  @ 7s | 3s = t3 second half
        mov      r6, #0x4
        orr      r6, r6, #0x40000
        uadd16   r12, r7, r12           @ 7c+7 | 3c+3 = t2 second half
        uadd16   r10, r10, r6           @ t0 + 4
        uadd16   r9, r9, r6             @ t1 + 4
        usub16   lr, r8, r12            @ c (o5 | o1) = t2
        uadd16   r12, r11, r1           @ d (o7 | o3) = t3
        usub16   r1, r9, lr             @ b-c = dst{0,1}[2]
        uadd16   r7, r10, r12           @ a+d = dst{0,1}[0]
        usub16   r12, r10, r12          @ a-d = dst{0,1}[3]
        uadd16   r10, r9, lr            @ b+c = dst{0,1}[1]

        asr      lr, r1, #3             @ o[1][2]
        asr      r9, r12, #3            @ o[1][3]
        pkhtb    r8, lr, r7, asr #19    @ o[1][0,2]
        pkhtb    r11, r9, r10, asr #19  @ o[1][1,3]
        ldr      lr, [r0]
        sxth     r12, r12
        ldr      r9, [r0, r2]
        sxth     r1, r1
#if HAVE_ARMV6T2_EXTERNAL
        sbfx     r7, r7, #3, #13
        sbfx     r10, r10, #3, #13
#else
        sxth     r7, r7
        sxth     r10, r10
        asr      r7, #3                 @ o[0][0]
        asr      r10, #3                @ o[0][1]
#endif
        pkhbt    r7, r7, r1, lsl #13    @ o[0][0,2]
        pkhbt    r10, r10, r12, lsl #13 @ o[0][1,3]

        uxtab16  r7, r7, lr
        uxtab16  r10, r10, lr, ror #8
        uxtab16  r8, r8, r9
        uxtab16  r11, r11, r9, ror #8
        usat16   r7, #8, r7
        usat16   r10, #8, r10
        usat16   r8, #8, r8
        usat16   r11, #8, r11
        orr      r7, r7, r10, lsl #8
        orr      r8, r8, r11, lsl #8
        str      r8, [r0, r2]
        str_post r7, r0, r2, lsl #1

        bne      2b

        pop      {r4-r12, pc}
endfunc

@ void vp8_idct_dc_add(uint8_t *dst, int16_t block[16], int stride)
function ff_vp8_idct_dc_add_armv6, export=1
        push     {r4-r6, lr}
        add      r6, r0, r2, lsl #1
        ldrsh    r3, [r1]
        mov      r4, #0
        add      r3, r3, #4
        strh     r4, [r1], #32
        asr      r3, #3
        ldr      r5, [r0]
        ldr      r4, [r0, r2]
        pkhbt    r3, r3, r3, lsl #16
        uxtab16  lr, r3, r5             @ a1+2 | a1+0
        uxtab16  r5, r3, r5, ror #8     @ a1+3 | a1+1
        uxtab16  r12, r3, r4
        uxtab16  r4, r3, r4, ror #8
        usat16   lr, #8, lr
        usat16   r5, #8, r5
        usat16   r12, #8, r12
        usat16   r4, #8, r4
        orr      lr, lr, r5, lsl #8
        ldr      r5, [r6]
        orr      r12, r12, r4, lsl #8
        ldr      r4, [r6, r2]
        str      lr, [r0]
        uxtab16  lr, r3, r5
        str      r12, [r0, r2]
        uxtab16  r5, r3, r5, ror #8
        uxtab16  r12, r3, r4
        uxtab16  r4, r3, r4, ror #8
        usat16   lr, #8, lr
        usat16   r5, #8, r5
        usat16   r12, #8, r12
        usat16   r4, #8, r4
        orr      lr, lr, r5, lsl #8
        orr      r12, r12, r4, lsl #8
        str      lr, [r6]
        str      r12, [r6, r2]
        pop      {r4-r6, pc}
endfunc

@ void vp8_idct_dc_add4uv(uint8_t *dst, int16_t block[4][16], int stride)
function ff_vp8_idct_dc_add4uv_armv6, export=1
        push     {r4, lr}

        bl       X(ff_vp8_idct_dc_add_armv6)
        add      r0, r0, #4
        bl       X(ff_vp8_idct_dc_add_armv6)
        add      r0, r0, r2, lsl #2
        sub      r0, r0, #4
        bl       X(ff_vp8_idct_dc_add_armv6)
        add      r0, r0, #4
        bl       X(ff_vp8_idct_dc_add_armv6)

        pop      {r4, pc}
endfunc

@ void vp8_idct_dc_add4y(uint8_t *dst, int16_t block[4][16], int stride)
function ff_vp8_idct_dc_add4y_armv6, export=1
        push     {r4, lr}

        bl       X(ff_vp8_idct_dc_add_armv6)
        add      r0, r0, #4
        bl       X(ff_vp8_idct_dc_add_armv6)
        add      r0, r0, #4
        bl       X(ff_vp8_idct_dc_add_armv6)
        add      r0, r0, #4
        bl       X(ff_vp8_idct_dc_add_armv6)

        pop      {r4, pc}
endfunc

@ loopfilter

.macro transpose o3, o2, o1, o0, i0, i1, i2, i3
        uxtb16   \o1, \i1               @ xx 12 xx 10
        uxtb16   \o0, \i0               @ xx 02 xx 00
        uxtb16   \o3, \i3               @ xx 32 xx 30
        uxtb16   \o2, \i2               @ xx 22 xx 20
        orr      \o1, \o0, \o1, lsl #8  @ 12 02 10 00
        orr      \o3, \o2, \o3, lsl #8  @ 32 22 30 20

        uxtb16   \i1, \i1, ror #8       @ xx 13 xx 11
        uxtb16   \i3, \i3, ror #8       @ xx 33 xx 31
        uxtb16   \i0, \i0, ror #8       @ xx 03 xx 01
        uxtb16   \i2, \i2, ror #8       @ xx 23 xx 21
        orr      \i0, \i0, \i1, lsl #8  @ 13 03 11 01
        orr      \i2, \i2, \i3, lsl #8  @ 33 23 31 21

        pkhtb    \o2, \o3, \o1, asr #16 @ 32 22 12 02
        pkhbt    \o0, \o1, \o3, lsl #16 @ 30 20 10 00
        pkhtb    \o3, \i2, \i0, asr #16 @ 33 23 13 03
        pkhbt    \o1, \i0, \i2, lsl #16 @ 31 21 11 01
.endm

.macro simple_filter
        uqsub8   r7, r3, r6             @ p1 - q1
        uqsub8   r8, r6, r3             @ q1 - p1
        uqsub8   r10, r4, r5            @ p0 - q0
        uqsub8   r9, r5, r4             @ q0 - p0
        orr      r7, r7, r8             @ abs(p1 - q1)
        orr      r9, r9, r10            @ abs(p0 - q0)
        uhadd8   r7, r7, lr             @ abs(p1 - q1) >> 1
        uqadd8   r9, r9, r9             @ abs(p0 - q0) * 2
        uqadd8   r7, r7, r9             @ abs(p0 - q0)*2 + abs(p1-q1)/2
        mvn      r8, #0
        usub8    r10, r12, r7           @ compare to flimit
        sel      r10, r8, lr            @ filter mask: F or 0
        cmp      r10, #0
        beq      2f

        eor      r3, r3, r2             @ ps1
        eor      r6, r6, r2             @ qs1
        eor      r4, r4, r2             @ ps0
        eor      r5, r5, r2             @ qs0

        qsub8    r3, r3, r6             @ vp8_filter = p1 - q1
        qsub8    r6, r5, r4             @ q0 - p0
        qadd8    r3, r3, r6             @ += q0 - p0
        lsr      r7, r2, #5             @ 0x04040404
        qadd8    r3, r3, r6             @ += q0 - p0
        sub      r9, r7, r2, lsr #7     @ 0x03030303
        qadd8    r3, r3, r6             @ vp8_filter = p1-q1 + 3*(q0-p0)
        and      r3, r3, r10            @ vp8_filter &= mask

        qadd8    r9, r3, r9             @ Filter2 = vp8_filter + 3
        qadd8    r3, r3, r7             @ Filter1 = vp8_filter + 4

        shadd8   r9, r9, lr
        shadd8   r3, r3, lr
        shadd8   r9, r9, lr
        shadd8   r3, r3, lr
        shadd8   r9, r9, lr             @ Filter2 >>= 3
        shadd8   r3, r3, lr             @ Filter1 >>= 3

        qadd8    r4, r4, r9             @ u = p0 + Filter2
        qsub8    r5, r5, r3             @ u = q0 - Filter1
        eor      r4, r4, r2             @ *op0 = u ^ 0x80
        eor      r5, r5, r2             @ *oq0 = u ^ 0x80
.endm
457 | ||
458 | @ void vp8_v_loop_filter16_simple(uint8_t *dst, int stride, int flim) | |
459 | function ff_vp8_v_loop_filter16_simple_armv6, export=1 | |
460 | push {r4-r11, lr} | |
461 | ||
462 | orr r2, r2, r2, lsl #16 | |
463 | mov r11, #4 | |
464 | mov lr, #0 | |
465 | orr r12, r2, r2, lsl #8 | |
466 | mov32 r2, 0x80808080 | |
467 | 1: | |
468 | ldr_nreg r3, r0, r1, lsl #1 @ p1 | |
469 | ldr_nreg r4, r0, r1 @ p0 | |
470 | ldr r5, [r0] @ q0 | |
471 | ldr r6, [r0, r1] @ q1 | |
472 | simple_filter | |
473 | T sub r7, r0, r1 | |
474 | str r5, [r0] @ oq0 | |
475 | A str r4, [r0, -r1] @ op0 | |
476 | T str r4, [r7] | |
477 | 2: | |
478 | subs r11, r11, #1 | |
479 | add r0, r0, #4 | |
480 | bne 1b | |
481 | ||
482 | pop {r4-r11, pc} | |
483 | endfunc | |
484 | ||
.macro filter_mask_p
        uqsub8   r6, r9, r10            @ p3 - p2
        uqsub8   r7, r10, r9            @ p2 - p3
        uqsub8   r8, r10, r11           @ p2 - p1
        uqsub8   r10, r11, r10          @ p1 - p2
        orr      r6, r6, r7             @ abs(p3-p2)
        orr      r8, r8, r10            @ abs(p2-p1)
        uqsub8   lr, r6, r2             @ compare to limit
        uqsub8   r8, r8, r2             @ compare to limit
        uqsub8   r6, r11, r12           @ p1 - p0
        orr      lr, lr, r8
        uqsub8   r7, r12, r11           @ p0 - p1
        orr      r6, r6, r7             @ abs(p1-p0)
        uqsub8   r7, r6, r2             @ compare to limit
        uqsub8   r8, r6, r3             @ compare to thresh
        orr      lr, lr, r7
.endm

.macro filter_mask_pq
        uqsub8   r6, r11, r10           @ p1 - q1
        uqsub8   r7, r10, r11           @ q1 - p1
        uqsub8   r11, r12, r9           @ p0 - q0
        uqsub8   r12, r9, r12           @ q0 - p0
        orr      r6, r6, r7             @ abs(p1-q1)
        orr      r12, r11, r12          @ abs(p0-q0)
        mov32    r7, 0x7f7f7f7f
        uqadd8   r12, r12, r12          @ abs(p0-q0) * 2
        and      r6, r7, r6, lsr #1     @ abs(p1-q1) / 2
        uqadd8   r12, r12, r6           @ abs(p0-q0) * 2 + abs(p1-q1)/2
.endm

.macro filter_mask_v
        filter_mask_p

        ldr      r10, [r0, r1]          @ q1
        ldr_post r9, r0, r1, lsl #1     @ q0

        filter_mask_pq

        ldr      r11, [r0]              @ q2

        uqsub8   r7, r9, r10            @ q0 - q1
        uqsub8   r6, r10, r9            @ q1 - q0
        uqsub8   r12, r12, r4           @ compare to flimit
        uqsub8   r9, r11, r10           @ q2 - q1
        uqsub8   r10, r10, r11          @ q1 - q2
        orr      lr, lr, r12
        ldr      r12, [r0, r1]          @ q3
        orr      r6, r7, r6             @ abs(q1-q0)
        orr      r10, r9, r10           @ abs(q2-q1)
        uqsub8   r9, r12, r11           @ q3 - q2
        uqsub8   r11, r11, r12          @ q2 - q3
        uqsub8   r7, r6, r2             @ compare to limit
        uqsub8   r10, r10, r2           @ compare to limit
        uqsub8   r6, r6, r3             @ compare to thresh
        orr      r9, r9, r11            @ abs(q3-q2)
        orr      lr, lr, r7
        orr      lr, lr, r10
        uqsub8   r9, r9, r2             @ compare to limit
        orr      lr, lr, r9

        mov      r12, #0
        usub8    lr, r12, lr
        mvn      r11, #0
        sel      lr, r11, r12           @ filter mask
        sub      r0, r0, r1, lsl #1
.endm

.macro filter_mask_h
        transpose r12, r11, r10, r9, r6, r7, r8, lr

        filter_mask_p

        stm      sp, {r8, r11, r12, lr} @ spill hev partial, p1, p0, mask
        sub      r0, r0, r1, lsl #2
        add      r0, r0, #4

        ldr      r7, [r0, r1]
        ldr_post r6, r0, r1, lsl #1
        ldr      lr, [r0, r1]
        ldr      r8, [r0]

        transpose r12, r11, r10, r9, r6, r7, r8, lr

        uqsub8   r8, r12, r11           @ q3 - q2
        uqsub8   lr, r11, r12           @ q2 - q3
        uqsub8   r7, r9, r10            @ q0 - q1
        uqsub8   r6, r10, r9            @ q1 - q0
        uqsub8   r12, r11, r10          @ q2 - q1
        uqsub8   r11, r10, r11          @ q1 - q2
        orr      r8, r8, lr             @ abs(q3-q2)
        orr      r6, r7, r6             @ abs(q1-q0)
        orr      r11, r12, r11          @ abs(q2-q1)
        ldr      lr, [sp, #12]          @ load back the limit accumulator
        uqsub8   r8, r8, r2             @ compare to limit
        uqsub8   r7, r6, r2             @ compare to limit
        uqsub8   r11, r11, r2           @ compare to limit
        orr      lr, lr, r8
        uqsub8   r8, r6, r3             @ compare to thresh
        orr      lr, lr, r7
        ldr      r12, [sp, #8]          @ p0
        orr      lr, lr, r11

        ldr      r11, [sp, #4]          @ p1

        filter_mask_pq

        mov      r10, #0
        uqsub8   r12, r12, r4           @ compare to flimit
        mvn      r11, #0
        orr      lr, lr, r12
        usub8    lr, r10, lr
        sel      lr, r11, r10           @ filter mask
.endm

.macro filter inner
        mov32    r12, 0x80808080
        eor      r11, r7, r12           @ ps1
        eor      r8, r8, r12            @ ps0
        eor      r9, r9, r12            @ qs0
        eor      r10, r10, r12          @ qs1

        stm      sp, {r8-r11}

        qsub8    r7, r11, r10           @ vp8_signed_char_clamp(ps1-qs1)
        qsub8    r8, r9, r8             @ vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0))
        .if \inner
        and      r7, r7, r6             @ vp8_filter &= hev
        .endif
        qadd8    r7, r7, r8
        lsr      r10, r12, #5           @ 0x04040404
        qadd8    r7, r7, r8
        sub      r9, r10, r12, lsr #7   @ 0x03030303
        qadd8    r7, r7, r8

        and      r7, r7, lr             @ vp8_filter &= mask
        .if !\inner
        mov      r12, r7                @ Filter2
        and      r7, r7, r6             @ Filter2 &= hev
        .endif
        qadd8    lr, r7, r9             @ Filter2 = vp8_signed_char_clamp(vp8_filter+3)
        qadd8    r7, r7, r10            @ Filter1 = vp8_signed_char_clamp(vp8_filter+4)

        mov      r9, #0
        shadd8   lr, lr, r9             @ Filter2 >>= 3
        shadd8   r7, r7, r9             @ Filter1 >>= 3
        shadd8   lr, lr, r9
        shadd8   r7, r7, r9
        shadd8   lr, lr, r9             @ Filter2
        shadd8   r7, r7, r9             @ Filter1
.endm
636 | ||
637 | .macro filter_v inner | |
638 | orr r10, r6, r8 @ calculate vp8_hevmask | |
639 | ldr_nreg r7, r0, r1, lsl #1 @ p1 | |
640 | usub8 r10, r12, r10 | |
641 | ldr_nreg r8, r0, r1 @ p0 | |
642 | sel r6, r12, r11 @ obtain vp8_hevmask | |
643 | ldr r9, [r0] @ q0 | |
644 | ldr r10, [r0, r1] @ q1 | |
645 | filter \inner | |
646 | .endm | |
647 | ||
648 | .macro filter_h inner | |
649 | orr r9, r6, r8 | |
650 | usub8 r9, r12, r9 | |
651 | sel r6, r12, r11 @ hev mask | |
652 | ||
653 | stm sp, {r6, lr} | |
654 | ||
655 | ldr_nreg r12, r0, r1, lsl #1 | |
656 | ldr_nreg r11, r0, r1 | |
657 | ldr r6, [r0] | |
658 | ldr lr, [r0, r1] | |
659 | ||
660 | transpose r10, r9, r8, r7, r12, r11, r6, lr | |
661 | ||
662 | ldm sp, {r6, lr} | |
663 | filter \inner | |
664 | .endm | |
665 | ||
666 | .macro filter_inner | |
667 | ldm sp, {r8, r9} | |
668 | lsr r10, r10, #2 @ 0x01010101 | |
669 | qadd8 r8, r8, lr @ u = vp8_signed_char_clamp(ps0 + Filter2) | |
670 | mov lr, #0 | |
671 | qsub8 r9, r9, r7 @ u = vp8_signed_char_clamp(qs0 - Filter1) | |
672 | sadd8 r7, r7, r10 @ vp8_filter += 1 | |
673 | ldr r10, [sp, #8] @ qs1 | |
674 | shadd8 r7, r7, lr @ vp8_filter >>= 1 | |
675 | eor r8, r8, r12 @ *op0 = u ^ 0x80 | |
676 | bic r7, r7, r6 @ vp8_filter &= ~hev | |
677 | qadd8 r11, r11, r7 @ u = vp8_signed_char_clamp(ps1 + vp8_filter) | |
678 | eor r9, r9, r12 @ *oq0 = u ^ 0x80 | |
679 | qsub8 r10, r10, r7 @ u = vp8_signed_char_clamp(qs1 - vp8_filter) | |
680 | eor r11, r11, r12 @ *op1 = u ^ 0x80 | |
681 | eor r10, r10, r12 @ *oq1 = u ^ 0x80 | |
682 | .endm | |
683 | ||
684 | .macro filter_x c0 | |
685 | mov lr, \c0 | |
686 | mov r7, #63 | |
687 | ||
688 | sxtb16 r6, r12 | |
689 | sxtb16 r10, r12, ror #8 | |
690 | smlabb r8, r6, lr, r7 | |
691 | smlatb r6, r6, lr, r7 | |
692 | smlabb r7, r10, lr, r7 | |
693 | smultb r10, r10, lr | |
694 | ssat r8, #8, r8, asr #7 | |
695 | ssat r6, #8, r6, asr #7 | |
696 | add r10, r10, #63 | |
697 | ssat r7, #8, r7, asr #7 | |
698 | ssat r10, #8, r10, asr #7 | |
699 | ||
700 | pkhbt r6, r8, r6, lsl #16 | |
701 | pkhbt r10, r7, r10, lsl #16 | |
702 | uxtb16 r6, r6 | |
703 | uxtb16 r10, r10 | |
704 | ||
705 | mov32 lr, 0x80808080 | |
706 | ||
707 | orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) | |
708 | qsub8 r8, r9, r10 @ s = vp8_signed_char_clamp(qs0 - u) | |
709 | qadd8 r10, r11, r10 @ s = vp8_signed_char_clamp(ps0 + u) | |
710 | eor r8, r8, lr @ *oq0 = s ^ 0x80 | |
711 | eor r10, r10, lr @ *op0 = s ^ 0x80 | |
712 | .endm | |
713 | ||
714 | .macro filter_1 | |
715 | ldm sp, {r8, r9} | |
716 | qadd8 r11, r8, lr | |
717 | qsub8 r9, r9, r7 | |
718 | bic r12, r12, r6 @ vp8_filter &= ~hev | |
719 | filter_x #27 | |
720 | .endm | |
721 | ||
722 | .macro filter_2 | |
723 | ldr r9, [sp, #8] @ qs1 | |
724 | ldr r11, [sp, #12] @ ps1 | |
725 | filter_x #18 | |
726 | .endm | |
727 | ||
728 | .macro filter_3 | |
729 | eor r9, r9, lr | |
730 | eor r11, r11, lr | |
731 | filter_x #9 | |
732 | .endm | |
733 | ||
function vp8_v_loop_filter_inner_armv6
        mov      r5, #4
        sub      sp, sp, #16

        orr      r2, r2, r2, lsl #16
        orr      r3, r3, r3, lsl #16
        orr      r6, r6, r6, lsl #16
        orr      r4, r2, r2, lsl #8     @ flimE
        orr      r2, r3, r3, lsl #8     @ flimI
        orr      r3, r6, r6, lsl #8     @ thresh
1:
        sub      r0, r0, r1, lsl #2
        ldr      r10, [r0, r1]          @ p2
        ldr_post r9, r0, r1, lsl #1     @ p3
        ldr      r12, [r0, r1]          @ p0
        ldr_post r11, r0, r1, lsl #1    @ p1

        filter_mask_v
        cmp      lr, #0
        beq      2f
        filter_v inner=1
        filter_inner

A       str      r11, [r0, -r1, lsl #1] @ op1
A       str      r8, [r0, -r1]          @ op0
T       sub      r0, r0, r1, lsl #1
T       str      r8, [r0, r1]
T       str_post r11, r0, r1, lsl #1
        str      r9, [r0]               @ oq0
        str      r10, [r0, r1]          @ oq1
2:
        add      r0, r0, #4
        cmp      r5, #3
        it       eq
        ldreq    r0, [sp, #16]
        subs     r5, r5, #1
        bne      1b

        add      sp, sp, #16
        pop      {r0, r4-r11, pc}
endfunc

function ff_vp8_v_loop_filter16_inner_armv6, export=1
        push     {r4-r11, lr}
        add      r12, r0, #8
        push     {r12}
        ldr      r6, [sp, #40]
        orr      r2, r2, r2, lsl #16
        b        vp8_v_loop_filter_inner_armv6
endfunc

function ff_vp8_v_loop_filter8uv_inner_armv6, export=1
        push     {r1, r4-r11, lr}
        mov      r1, r2
        orr      r2, r3, r3, lsl #16
        ldr      r3, [sp, #40]
        ldr      r6, [sp, #44]
        b        vp8_v_loop_filter_inner_armv6
endfunc

function vp8_v_loop_filter_armv6
        mov      r5, #4
        sub      sp, sp, #16

        orr      r3, r3, r3, lsl #16
        orr      r6, r6, r6, lsl #16
        orr      r4, r2, r2, lsl #8     @ flimE
        orr      r2, r3, r3, lsl #8     @ flimI
        orr      r3, r6, r6, lsl #8     @ thresh
1:
        sub      r0, r0, r1, lsl #2
        ldr      r10, [r0, r1]          @ p2
        ldr_post r9, r0, r1, lsl #1     @ p3
        ldr      r12, [r0, r1]          @ p0
        ldr_post r11, r0, r1, lsl #1    @ p1

        filter_mask_v
        cmp      lr, #0
        beq      2f

        filter_v inner=0
        filter_1

        str      r8, [r0]               @ *oq0
A       str      r10, [r0, -r1]         @ *op0
T       sub      r0, r0, r1, lsl #1
T       str      r10, [r0, r1]

        filter_2

A       str      r10, [r0, -r1, lsl #1] @ *op1
T       str_post r10, r0, r1, lsl #1
        str      r8, [r0, r1]           @ *oq1

        ldr      r9, [r0, r1, lsl #1]   @ q2
        add      r0, r0, r1
A       ldr      r11, [r0, -r1, lsl #2] @ p2
T       ldr_dpre r11, r0, r1, lsl #2

        filter_3

A       str      r10, [r0, -r1, lsl #2] @ *op2
T       str_post r10, r0, r1, lsl #2
        str      r8, [r0, r1]           @ *oq2
        sub      r0, r0, r1
2:
        add      r0, r0, #4
        cmp      r5, #3
        it       eq
        ldreq    r0, [sp, #16]
        subs     r5, r5, #1
        bne      1b

        add      sp, sp, #16
        pop      {r0, r4-r11, pc}
endfunc

function ff_vp8_v_loop_filter16_armv6, export=1
        push     {r4-r11, lr}
        add      r12, r0, #8
        push     {r12}
        ldr      r6, [sp, #40]
        orr      r2, r2, r2, lsl #16
        b        vp8_v_loop_filter_armv6
endfunc

function ff_vp8_v_loop_filter8uv_armv6, export=1
        push     {r1, r4-r11, lr}
        mov      r1, r2
        orr      r2, r3, r3, lsl #16
        ldr      r3, [sp, #40]
        ldr      r6, [sp, #44]
        b        vp8_v_loop_filter_armv6
endfunc

@ void vp8_h_loop_filter16_simple(uint8_t *dst, int stride, int flim)
function ff_vp8_h_loop_filter16_simple_armv6, export=1
        push     {r4-r11, lr}
        orr      r12, r2, r2, lsl #16
        mov32    r2, 0x80808080
        orr      r12, r12, r12, lsl #8

        mov      lr, #0
        mov      r11, #4
1:
        sub      r0, r0, #2
        ldr      r8, [r0, r1]
        ldr_post r7, r0, r1, lsl #1
        ldr      r10, [r0, r1]
        ldr_post r9, r0, r1, lsl #1
        add      r0, r0, #2
        transpose r6, r5, r4, r3, r7, r8, r9, r10
        simple_filter
        sub      r0, r0, r1, lsl #2
        sub      r0, r0, #1

        uxtb16   r6, r4
        uxtb16   r8, r5
        uxtb16   r7, r4, ror #8
        uxtb16   r9, r5, ror #8
        orr      r6, r6, r8, lsl #8
        orr      r7, r7, r9, lsl #8
        lsr      r4, r6, #16
        lsr      r5, r7, #16

        strh_post r6, r0, r1
        strh_post r7, r0, r1
        strh_post r4, r0, r1
        strh_post r5, r0, r1
        add      r0, r0, #1
2:
        subs     r11, r11, #1
        bne      1b

        pop      {r4-r11, pc}
endfunc

function vp8_h_loop_filter_inner_armv6
        mov      r5, #4
        sub      sp, sp, #16

        orr      r3, r3, r3, lsl #16
        orr      r9, r9, r9, lsl #16
        orr      r4, r2, r2, lsl #8     @ flimE
        orr      r2, r3, r3, lsl #8     @ flimI
        orr      r3, r9, r9, lsl #8     @ thresh
        sub      r0, r0, #4
1:
        ldr      r7, [r0, r1]
        ldr_post r6, r0, r1, lsl #1
        ldr      lr, [r0, r1]
        ldr_post r8, r0, r1, lsl #1

        filter_mask_h

        cmp      lr, #0
        sub      r0, r0, #2
        beq      2f

        ldr      r6, [sp]

        filter_h inner=1
        filter_inner

        transpose lr, r12, r7, r6, r11, r8, r9, r10

A       str      r6, [r0, -r1, lsl #1]
A       str      r7, [r0, -r1]
T       sub      r0, r0, r1, lsl #1
T       str      r7, [r0, r1]
T       str_post r6, r0, r1, lsl #1
        str      r12, [r0]
        str      lr, [r0, r1]
2:
        sub      r0, r0, #2
        add      r0, r0, r1, lsl #1
        cmp      r5, #3
        it       eq
        ldreq    r0, [sp, #16]
        subs     r5, r5, #1
        bne      1b

        add      sp, sp, #16
        pop      {r0, r4-r11, pc}
endfunc

function ff_vp8_h_loop_filter16_inner_armv6, export=1
        push     {r4-r11, lr}
        add      r12, r0, r1, lsl #3
        sub      r12, r12, #4
        push     {r12}
        ldr      r9, [sp, #40]
        orr      r2, r2, r2, lsl #16
        b        vp8_h_loop_filter_inner_armv6
endfunc

function ff_vp8_h_loop_filter8uv_inner_armv6, export=1
        sub      r1, r1, #4
        push     {r1, r4-r11, lr}
        mov      r1, r2
        orr      r2, r3, r3, lsl #16
        ldr      r3, [sp, #40]
        ldr      r9, [sp, #44]
        b        vp8_h_loop_filter_inner_armv6
endfunc

function vp8_h_loop_filter_armv6
        mov      r5, #4
        sub      sp, sp, #16

        orr      r3, r3, r3, lsl #16
        orr      r9, r9, r9, lsl #16
        orr      r4, r2, r2, lsl #8     @ flimE
        orr      r2, r3, r3, lsl #8     @ flimI
        orr      r3, r9, r9, lsl #8     @ thresh
1:
        sub      r0, r0, #4
        ldr      r7, [r0, r1]
        ldr_post r6, r0, r1, lsl #1
        ldr      lr, [r0, r1]
        ldr_post r8, r0, r1, lsl #1

        filter_mask_h
        cmp      lr, #0
        it       eq
        addeq    r0, r0, r1, lsl #1
        beq      2f

        ldr      r6, [sp]
        sub      r0, r0, #2

        filter_h inner=0
        filter_1

        sub      r0, r0, r1, lsl #1
        uxtb16   r6, r10
        uxtb16   r7, r8
        uxtb16   r10, r10, ror #8
        uxtb16   r8, r8, ror #8
        orr      r6, r6, r7, lsl #8
        orr      r10, r10, r8, lsl #8
        lsr      r7, r6, #16
        lsr      r8, r10, #16

        add      r0, r0, #1
        strh_post r6, r0, r1
        strh_post r10, r0, r1
        strh_post r7, r0, r1
        strh_post r8, r0, r1

        filter_2

        sub      r0, r0, r1, lsl #2
        add      r0, r0, #3

        ldrb     r11, [r0, #-5]         @ p2 for 1/7th difference
        strb     r10, [r0, #-4]         @ op1
        strb     r8, [r0, #-1]          @ oq1
        ldrb_post r9, r0, r1            @ q2 for 1/7th difference

        lsr      r10, r10, #8
        lsr      r8, r8, #8

        ldrb     r6, [r0, #-5]
        strb     r10, [r0, #-4]
        strb     r8, [r0, #-1]
        ldrb_post r7, r0, r1

        lsr      r10, r10, #8
        lsr      r8, r8, #8
        orr      r11, r11, r6, lsl #8
        orr      r9, r9, r7, lsl #8

        ldrb     r6, [r0, #-5]
        strb     r10, [r0, #-4]
        strb     r8, [r0, #-1]
        ldrb_post r7, r0, r1

        lsr      r10, r10, #8
        lsr      r8, r8, #8
        orr      r11, r11, r6, lsl #16
        orr      r9, r9, r7, lsl #16

        ldrb     r6, [r0, #-5]
        strb     r10, [r0, #-4]
        strb     r8, [r0, #-1]
        ldrb_post r7, r0, r1
        orr      r11, r11, r6, lsl #24
        orr      r9, r9, r7, lsl #24

        filter_3

        sub      r0, r0, r1, lsl #2
        strb     r10, [r0, #-5]
        strb_post r8, r0, r1
        lsr      r10, r10, #8
        lsr      r8, r8, #8
        strb     r10, [r0, #-5]
        strb_post r8, r0, r1
        lsr      r10, r10, #8
        lsr      r8, r8, #8
        strb     r10, [r0, #-5]
        strb_post r8, r0, r1
        lsr      r10, r10, #8
        lsr      r8, r8, #8
        strb     r10, [r0, #-5]
        strb_post r8, r0, r1

        sub      r0, r0, #2
2:
        cmp      r5, #3
        it       eq
        ldreq    r0, [sp, #16]
        subs     r5, r5, #1
        bne      1b

        add      sp, sp, #16
        pop      {r0, r4-r11, pc}
endfunc

function ff_vp8_h_loop_filter16_armv6, export=1
        push     {r4-r11, lr}
        add      r12, r0, r1, lsl #3
        push     {r12}
        ldr      r9, [sp, #40]
        orr      r2, r2, r2, lsl #16
        b        vp8_h_loop_filter_armv6
endfunc

function ff_vp8_h_loop_filter8uv_armv6, export=1
        push     {r1, r4-r11, lr}
        mov      r1, r2
        orr      r2, r3, r3, lsl #16
        ldr      r3, [sp, #40]
        ldr      r9, [sp, #44]
        b        vp8_h_loop_filter_armv6
endfunc

        .ltorg

@ MC

@ void put_vp8_pixels16(uint8_t *dst, int dststride, uint8_t *src,
@                       int srcstride, int h, int mx, int my)
function ff_put_vp8_pixels16_armv6, export=1
        push     {r4-r11}
        ldr      r12, [sp, #32]         @ h
1:
        subs     r12, r12, #2
        ldr      r5, [r2, #4]
        ldr      r6, [r2, #8]
        ldr      r7, [r2, #12]
        ldr_post r4, r2, r3
        ldr      r9, [r2, #4]
        ldr      r10, [r2, #8]
        ldr      r11, [r2, #12]
        ldr_post r8, r2, r3
        strd     r6, r7, [r0, #8]
        strd_post r4, r5, r0, r1
        strd     r10, r11, [r0, #8]
        strd_post r8, r9, r0, r1
        bgt      1b
        pop      {r4-r11}
        bx       lr
endfunc

@ void put_vp8_pixels8(uint8_t *dst, int dststride, uint8_t *src,
@                      int srcstride, int h, int mx, int my)
function ff_put_vp8_pixels8_armv6, export=1
        push     {r4-r11}
        ldr      r12, [sp, #32]         @ h
1:
        subs     r12, r12, #4
        ldr      r5, [r2, #4]
        ldr_post r4, r2, r3
        ldr      r7, [r2, #4]
        ldr_post r6, r2, r3
        ldr      r9, [r2, #4]
        ldr_post r8, r2, r3
        ldr      r11, [r2, #4]
        ldr_post r10, r2, r3
        strd_post r4, r5, r0, r1
        strd_post r6, r7, r0, r1
        strd_post r8, r9, r0, r1
        strd_post r10, r11, r0, r1
        bgt      1b
        pop      {r4-r11}
        bx       lr
endfunc

@ void put_vp8_pixels4(uint8_t *dst, int dststride, uint8_t *src,
@                      int srcstride, int h, int mx, int my)
function ff_put_vp8_pixels4_armv6, export=1
        ldr      r12, [sp, #0]          @ h
        push     {r4-r6, lr}
1:
        subs     r12, r12, #4
        ldr_post r4, r2, r3
        ldr_post r5, r2, r3
        ldr_post r6, r2, r3
        ldr_post lr, r2, r3
        str_post r4, r0, r1
        str_post r5, r0, r1
        str_post r6, r0, r1
        str_post lr, r0, r1
        bgt      1b
        pop      {r4-r6, pc}
endfunc

@ note: the worst-case sum of all 6-tap filter values is 128 (each
@ filter's coefficients sum to 128), and 128 * 255 = 0x7f80, so 16-bit
@ arithmetic can be used to apply the filters
const sixtap_filters_13245600, align=4
        .short   2, 108, -11, 36, -8, 1, 0, 0
        .short   3, 77, -16, 77, -16, 3, 0, 0
        .short   1, 36, -8, 108, -11, 2, 0, 0
endconst

const fourtap_filters_1324, align=4
        .short   -6, 12, 123, -1
        .short   -9, 50, 93, -6
        .short   -6, 93, 50, -9
        .short   -1, 123, 12, -6
endconst
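
@ The tables above hold VP8's subpel filters with the taps reordered as
@ the names say: 1,3,2,4,5,6(,0,0) and 1,3,2,4.  Each 32-bit word then
@ holds the two taps that one smlad applies to a packed pixel pair,
@ e.g. the filter {2, -11, 108, 36, -8, 1} is stored as (2,108)
@ (-11,36) (-8,1) for the pairs (src[0],src[2]), (src[1],src[3]),
@ (src[4],src[5]).  Even subpel positions (2, 4, 6) use the six-tap
@ table, odd positions (1, 3, 5, 7) the four-tap one (their outer taps
@ are zero); the -16/-4 bias on movrel below plus index*8 or index*4
@ selects the row.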
1197 | ||
1198 | .macro vp8_mc_1 name, size, hv | |
1199 | function ff_put_vp8_\name\size\()_\hv\()_armv6, export=1 | |
1200 | sub r1, r1, #\size | |
1201 | mov r12, sp | |
1202 | push {r1, r4-r11, lr} | |
1203 | ldm r12, {r5-r7} | |
1204 | mov r4, #\size | |
1205 | stm r12, {r4, r5} | |
1206 | orr r12, r6, r7 | |
1207 | b bl_put_\name\()_\hv\()_armv6 | |
1208 | endfunc | |
1209 | .endm | |
1210 | ||
1211 | vp8_mc_1 epel, 16, h6 | |
1212 | vp8_mc_1 epel, 16, v6 | |
1213 | vp8_mc_1 epel, 8, h6 | |
1214 | vp8_mc_1 epel, 8, v6 | |
1215 | vp8_mc_1 epel, 8, h4 | |
1216 | vp8_mc_1 epel, 8, v4 | |
1217 | vp8_mc_1 epel, 4, h6 | |
1218 | vp8_mc_1 epel, 4, v6 | |
1219 | vp8_mc_1 epel, 4, h4 | |
1220 | vp8_mc_1 epel, 4, v4 | |
1221 | ||
1222 | vp8_mc_1 bilin, 16, h | |
1223 | vp8_mc_1 bilin, 16, v | |
1224 | vp8_mc_1 bilin, 8, h | |
1225 | vp8_mc_1 bilin, 8, v | |
1226 | vp8_mc_1 bilin, 4, h | |
1227 | vp8_mc_1 bilin, 4, v | |
1228 | ||
1229 | /* True relational expressions have the value -1 in the GNU assembler, | |
1230 | +1 in Apple's. */ | |
1231 | #ifdef __APPLE__ | |
1232 | # define TMPSIZE \size * (8 + 8*(\size > 4) + \ytaps - 1) | |
1233 | #else | |
1234 | # define TMPSIZE \size * (8 - 8*(\size > 4) + \ytaps - 1) | |
1235 | #endif | |
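
@ That is, TMPSIZE = size * (rows + ytaps - 1) bytes, where rows is the
@ tallest block handled for that width (16 for widths 8 and 16, 8 for
@ width 4): room for the horizontal pass output plus the ytaps-1 extra
@ rows the vertical pass reads.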
1236 | ||
1237 | .macro vp8_mc_hv name, size, h, v, ytaps | |
1238 | function ff_put_vp8_\name\size\()_\h\v\()_armv6, export=1 | |
1239 | push {r0, r1, r4, lr} | |
1240 | add r0, sp, #16 | |
1241 | sub sp, sp, #TMPSIZE+16 | |
1242 | ldm r0, {r0, r12} | |
1243 | mov r4, #\size | |
1244 | add lr, r0, #\ytaps-1 | |
1245 | .if \ytaps > 2 | |
1246 | sub r2, r2, r3, lsl #\ytaps >> 1 & 1 | |
1247 | .endif | |
1248 | stm sp, {r4, lr} | |
1249 | add r0, sp, #16 | |
1250 | mov r1, #0 | |
1251 | bl vp8_put_\name\()_\h\()_armv6 | |
1252 | add r0, sp, #TMPSIZE+16 | |
1253 | ldr lr, [sp, #TMPSIZE+16+16] | |
1254 | ldm r0, {r0, r1} | |
1255 | mov r3, #\size | |
1256 | ldr r12, [sp, #TMPSIZE+16+16+8] | |
1257 | str lr, [sp, #4] | |
1258 | add r2, sp, #16 + \size * (\ytaps / 2 - 1) | |
1259 | sub r1, r1, #\size | |
1260 | bl vp8_put_\name\()_\v\()_armv6 | |
1261 | add sp, sp, #TMPSIZE+16+8 | |
1262 | pop {r4, pc} | |
1263 | endfunc | |
1264 | .endm | |
1265 | ||
1266 | vp8_mc_hv epel, 16, h6, v6, 6 | |
1267 | vp8_mc_hv epel, 8, h6, v6, 6 | |
1268 | vp8_mc_hv epel, 8, h4, v6, 6 | |
1269 | vp8_mc_hv epel, 8, h6, v4, 4 | |
1270 | vp8_mc_hv epel, 8, h4, v4, 4 | |
1271 | vp8_mc_hv epel, 4, h6, v6, 6 | |
1272 | vp8_mc_hv epel, 4, h4, v6, 6 | |
1273 | vp8_mc_hv epel, 4, h6, v4, 4 | |
1274 | vp8_mc_hv epel, 4, h4, v4, 4 | |
1275 | ||
1276 | vp8_mc_hv bilin, 16, h, v, 2 | |
1277 | vp8_mc_hv bilin, 8, h, v, 2 | |
1278 | vp8_mc_hv bilin, 4, h, v, 2 | |
1279 | ||
.macro sat4 r0, r1, r2, r3
        asr      \r0, \r0, #7           @ >> 7 (the +64 bias was the smlad seed)
        asr      \r1, \r1, #7
        pkhbt    \r0, \r0, \r2, lsl #9  @ \r2 >> 7 into the top halfword
        pkhbt    \r1, \r1, \r3, lsl #9
        usat16   \r0, #8, \r0           @ clamp both halfwords to 0-255
        usat16   \r1, #8, \r1
        orr      \r0, \r0, \r1, lsl #8  @ pack the four result bytes
.endm
1289 | ||
1290 | @ Calling convention for the inner MC functions: | |
1291 | @ r0 dst | |
1292 | @ r1 dst_stride - block_width | |
1293 | @ r2 src | |
1294 | @ r3 src_stride | |
1295 | @ r4 block_width | |
1296 | @ r12 filter_index | |
1297 | @ [sp] block_width | |
1298 | @ [sp+4] height | |
1299 | @ [sp+8] scratch | |
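@ The stack offsets are relative to sp on entry; after the push
@ {r1, r4-r11, lr} below they are read back at [sp, #40..48], with the
@ scratch slot used to stash the (possibly adjusted) source stride.
@ vp8_mc_1 reaches these loops by doing the same push itself and
@ branching to the bl_put_* label just past it, while vp8_mc_hv calls
@ the function entry with bl.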
1300 | ||
1301 | function vp8_put_epel_h6_armv6 | |
1302 | push {r1, r4-r11, lr} | |
1303 | bl_put_epel_h6_armv6: | |
1304 | sub r2, r2, #2 | |
1305 | movrel lr, sixtap_filters_13245600 - 16 | |
1306 | add lr, lr, r12, lsl #3 | |
1307 | sub r3, r3, r4 | |
1308 | str r3, [sp, #48] | |
1309 | ldm lr, {r1, r3, lr} | |
1310 | 1: | |
1311 | ldr r7, [r2, #5] @ src[5-8] | |
1312 | ldr r6, [r2, #2] @ src[2-5] | |
1313 | ldr r5, [r2], #4 @ src[0-3] | |
1314 | ||
1315 | pkhtb r7, r7, r7, asr #8 @ src[8,7,7,6] | |
1316 | uxtb16 r9, r6, ror #8 @ src[5] | src[3] | |
1317 | uxtb16 r6, r6 @ src[4] | src[2] | |
1318 | uxtb16 r8, r5, ror #8 @ src[3] | src[1] | |
1319 | uxtb16 r11, r7, ror #8 @ src[8] | src[7] | |
1320 | uxtb16 r7, r7 @ src[7] | src[6] | |
1321 | uxtb16 r5, r5 @ src[2] | src[0] | |
1322 | ||
1323 | mov r10, #0x40 | |
1324 | smlad r5, r5, r1, r10 @ filter[0][0] | |
1325 | smlad r11, r11, lr, r10 @ filter[3][2] | |
1326 | smlad r12, r7, lr, r10 @ filter[2][2] | |
1327 | smlad r10, r8, r1, r10 @ filter[1][0] | |
1328 | smlad r5, r8, r3, r5 @ filter[0][1] | |
1329 | smlad r11, r9, r1, r11 @ filter[3][0] | |
1330 | smlad r12, r9, r3, r12 @ filter[2][1] | |
1331 | pkhtb r9, r9, r6, asr #16 @ src[5] | src[4] | |
1332 | smlad r10, r6, r3, r10 @ filter[1][1] | |
1333 | pkhbt r7, r9, r7, lsl #16 @ src[6] | src[4] | |
1334 | smlad r5, r9, lr, r5 @ filter[0][2] | |
1335 | pkhtb r8, r7, r9, asr #16 @ src[6] | src[5] | |
1336 | smlad r11, r7, r3, r11 @ filter[3][1] | |
1337 | smlad r9, r8, lr, r10 @ filter[1][2] | |
1338 | smlad r7, r6, r1, r12 @ filter[2][0] | |
1339 | ||
1340 | subs r4, r4, #4 | |
1341 | ||
1342 | sat4 r5, r9, r7, r11 | |
1343 | str r5, [r0], #4 | |
1344 | ||
1345 | bne 1b | |
1346 | ||
1347 | add r4, sp, #40 | |
1348 | ldm r4, {r4, r5, r12} | |
1349 | ldr r6, [sp] | |
1350 | subs r5, r5, #1 | |
1351 | add r2, r2, r12 | |
1352 | str r5, [sp, #44] | |
1353 | add r0, r0, r6 | |
1354 | ||
1355 | bne 1b | |
1356 | ||
1357 | pop {r1, r4-r11, pc} | |
1358 | endfunc | |
1359 | ||
1360 | function vp8_put_epel_v6_armv6 | |
1361 | push {r1, r4-r11, lr} | |
1362 | bl_put_epel_v6_armv6: | |
1363 | movrel lr, sixtap_filters_13245600 - 16 | |
1364 | add lr, lr, r12, lsl #3 | |
1365 | str r3, [sp, #48] | |
1366 | 1: | |
1367 | add r1, r3, r3, lsl #1 @ stride * 3 | |
1368 | ldr_nreg r5, r2, r3 @ src[0,1,2,3 + stride * 1] | |
1369 | ldr r6, [r2, r3] @ src[0,1,2,3 + stride * 3] | |
1370 | ldr r7, [r2, r3, lsl #1] @ src[0,1,2,3 + stride * 4] | |
1371 | ldr r8, [r2, r1] @ src[0,1,2,3 + stride * 5] | |
1372 | ||
1373 | uxtb16 r9, r5, ror #8 @ src[3 + s*1] | src[1 + s*1] | |
1374 | uxtb16 r10, r6, ror #8 @ src[3 + s*3] | src[1 + s*3] | |
1375 | uxtb16 r11, r7, ror #8 @ src[3 + s*4] | src[1 + s*4] | |
1376 | uxtb16 r12, r8, ror #8 @ src[3 + s*5] | src[1 + s*5] | |
1377 | uxtb16 r5, r5 @ src[2 + s*1] | src[0 + s*1] | |
1378 | uxtb16 r6, r6 @ src[2 + s*3] | src[0 + s*3] | |
1379 | uxtb16 r7, r7 @ src[2 + s*4] | src[0 + s*4] | |
1380 | uxtb16 r8, r8 @ src[2 + s*5] | src[0 + s*5] | |
1381 | pkhbt r1, r9, r10, lsl #16 @ src[1 + s*3] | src[1 + s*1] | |
1382 | pkhtb r9, r10, r9, asr #16 @ src[3 + s*3] | src[3 + s*1] | |
1383 | pkhbt r10, r11, r12, lsl #16 @ src[1 + s*5] | src[1 + s*4] | |
1384 | pkhtb r11, r12, r11, asr #16 @ src[3 + s*5] | src[3 + s*4] | |
1385 | pkhbt r12, r5, r6, lsl #16 @ src[0 + s*3] | src[0 + s*1] | |
1386 | pkhtb r5, r6, r5, asr #16 @ src[2 + s*3] | src[2 + s*1] | |
1387 | pkhbt r6, r7, r8, lsl #16 @ src[0 + s*5] | src[0 + s*4] | |
1388 | pkhtb r7, r8, r7, asr #16 @ src[2 + s*5] | src[2 + s*4] | |
1389 | ||
1390 | ldr r8, [lr, #4] | |
1391 | mov r3, #0x40 | |
1392 | smlad r12, r12, r8, r3 @ filter[0][1] | |
1393 | smlad r1, r1, r8, r3 @ filter[1][1] | |
1394 | smlad r5, r5, r8, r3 @ filter[2][1] | |
1395 | smlad r9, r9, r8, r3 @ filter[3][1] | |
1396 | ldr r8, [lr, #8] | |
1397 | ldr r3, [sp, #48] | |
1398 | smlad r12, r6, r8, r12 @ filter[0][2] | |
1399 | smlad r1, r10, r8, r1 @ filter[1][2] | |
1400 | ldr_nreg r6, r2, r3, lsl #1 @ src[0,1,2,3 + stride * 0] | |
1401 | ldr r10, [r2], #4 @ src[0,1,2,3 + stride * 2] | |
1402 | smlad r5, r7, r8, r5 @ filter[2][2] | |
1403 | smlad r9, r11, r8, r9 @ filter[3][2] | |
1404 | ||
1405 | uxtb16 r7, r6, ror #8 @ src[3 + s*0] | src[1 + s*0] | |
1406 | uxtb16 r11, r10, ror #8 @ src[3 + s*2] | src[1 + s*2] | |
1407 | uxtb16 r6, r6 @ src[2 + s*0] | src[0 + s*0] | |
1408 | uxtb16 r10, r10 @ src[2 + s*2] | src[0 + s*2] | |
1409 | ||
1410 | pkhbt r8, r7, r11, lsl #16 @ src[1 + s*2] | src[1 + s*0] | |
1411 | pkhtb r7, r11, r7, asr #16 @ src[3 + s*2] | src[3 + s*0] | |
1412 | pkhbt r11, r6, r10, lsl #16 @ src[0 + s*2] | src[0 + s*0] | |
1413 | pkhtb r6, r10, r6, asr #16 @ src[2 + s*2] | src[2 + s*0] | |
1414 | ||
1415 | ldr r10, [lr] | |
1416 | subs r4, r4, #4 | |
1417 | smlad r12, r11, r10, r12 @ filter[0][0] | |
1418 | smlad r1, r8, r10, r1 @ filter[1][0] | |
1419 | smlad r5, r6, r10, r5 @ filter[2][0] | |
1420 | smlad r9, r7, r10, r9 @ filter[3][0] | |
1421 | ||
1422 | sat4 r12, r1, r5, r9 | |
1423 | str r12, [r0], #4 | |
1424 | ||
1425 | bne 1b | |
1426 | ||
1427 | ldrd r4, r5, [sp, #40] | |
1428 | ldr r6, [sp] | |
1429 | subs r5, r5, #1 | |
1430 | sub r2, r2, r4 | |
1431 | str r5, [sp, #44] | |
1432 | add r0, r0, r6 | |
1433 | add r2, r2, r3 | |
1434 | ||
1435 | bne 1b | |
1436 | ||
1437 | pop {r1, r4-r11, pc} | |
1438 | endfunc | |
1439 | ||
1440 | function vp8_put_epel_h4_armv6 | |
1441 | push {r1, r4-r11, lr} | |
1442 | bl_put_epel_h4_armv6: | |
1443 | subs r2, r2, #1 | |
1444 | movrel lr, fourtap_filters_1324 - 4 | |
1445 | add lr, lr, r12, lsl #2 | |
1446 | sub r3, r3, r4 | |
1447 | ldm lr, {r5, r6} | |
1448 | ldr lr, [sp, #44] | |
1449 | 1: | |
1450 | ldr r9, [r2, #3] | |
1451 | ldr r8, [r2, #2] | |
1452 | ldr r7, [r2], #4 | |
1453 | ||
1454 | uxtb16 r9, r9, ror #8 @ src[6] | src[4] | |
1455 | uxtb16 r10, r8, ror #8 @ src[5] | src[3] | |
1456 | uxtb16 r8, r8 @ src[4] | src[2] | |
1457 | uxtb16 r11, r7, ror #8 @ src[3] | src[1] | |
1458 | uxtb16 r7, r7 @ src[2] | src[0] | |
1459 | ||
1460 | mov r12, #0x40 | |
1461 | smlad r9, r9, r6, r12 @ filter[3][1] | |
1462 | smlad r7, r7, r5, r12 @ filter[0][0] | |
1463 | smlad r9, r10, r5, r9 @ filter[3][0] | |
1464 | smlad r10, r10, r6, r12 @ filter[2][1] | |
1465 | smlad r12, r11, r5, r12 @ filter[1][0] | |
1466 | smlad r7, r11, r6, r7 @ filter[0][1] | |
1467 | smlad r10, r8, r5, r10 @ filter[2][0] | |
1468 | smlad r12, r8, r6, r12 @ filter[1][1] | |
1469 | ||
1470 | subs r4, r4, #4 | |
1471 | ||
1472 | sat4 r7, r12, r10, r9 | |
1473 | str r7, [r0], #4 | |
1474 | ||
1475 | bne 1b | |
1476 | ||
1477 | subs lr, lr, #1 | |
1478 | ldr r4, [sp, #40] | |
1479 | add r2, r2, r3 | |
1480 | add r0, r0, r1 | |
1481 | ||
1482 | bne 1b | |
1483 | ||
1484 | pop {r1, r4-r11, pc} | |
1485 | endfunc | |
1486 | ||
1487 | function vp8_put_epel_v4_armv6 | |
1488 | push {r1, r4-r11, lr} | |
1489 | bl_put_epel_v4_armv6: | |
1490 | movrel lr, fourtap_filters_1324 - 4 | |
1491 | add lr, lr, r12, lsl #2 | |
1492 | ldm lr, {r5, r6} | |
1493 | str r3, [sp, #48] | |
1494 | 1: | |
1495 | ldr lr, [r2, r3, lsl #1] | |
1496 | ldr r12, [r2, r3] | |
1497 | ldr_nreg r7, r2, r3 | |
1498 | ldr r11, [r2], #4 | |
1499 | ||
1500 | uxtb16 r8, lr, ror #8 @ src[3 + s*3] | src[1 + s*3] | |
1501 | uxtb16 r9, r12, ror #8 @ src[3 + s*2] | src[1 + s*2] | |
1502 | uxtb16 r3, r7, ror #8 @ src[3 + s*0] | src[1 + s*0] | |
1503 | uxtb16 r1, r11, ror #8 @ src[3 + s*1] | src[1 + s*1] | |
1504 | uxtb16 lr, lr @ src[2 + s*3] | src[0 + s*3] | |
1505 | uxtb16 r12, r12 @ src[2 + s*2] | src[0 + s*2] | |
1506 | uxtb16 r7, r7 @ src[2 + s*0] | src[0 + s*0] | |
1507 | uxtb16 r11, r11 @ src[2 + s*1] | src[0 + s*1] | |
1508 | pkhbt r10, r1, r8, lsl #16 @ src[1 + s*3] | src[1 + s*1] | |
1509 | pkhtb r1, r8, r1, asr #16 @ src[3 + s*3] | src[3 + s*1] | |
1510 | pkhbt r8, r3, r9, lsl #16 @ src[1 + s*2] | src[1 + s*0] | |
1511 | pkhtb r3, r9, r3, asr #16 @ src[3 + s*2] | src[3 + s*0] | |
1512 | pkhbt r9, r11, lr, lsl #16 @ src[0 + s*3] | src[0 + s*1] | |
1513 | pkhtb r11, lr, r11, asr #16 @ src[2 + s*3] | src[2 + s*1] | |
1514 | pkhbt lr, r7, r12, lsl #16 @ src[0 + s*2] | src[0 + s*0] | |
1515 | pkhtb r7, r12, r7, asr #16 @ src[2 + s*2] | src[2 + s*0] | |
1516 | ||
1517 | mov r12, #0x40 | |
1518 | smlad r9, r9, r6, r12 @ filter[0][1] | |
1519 | smlad r10, r10, r6, r12 @ filter[1][1] | |
1520 | smlad r11, r11, r6, r12 @ filter[2][1] | |
1521 | smlad r1, r1, r6, r12 @ filter[3][1] | |
1522 | smlad r9, lr, r5, r9 @ filter[0][0] | |
1523 | smlad r10, r8, r5, r10 @ filter[1][0] | |
1524 | smlad r11, r7, r5, r11 @ filter[2][0] | |
1525 | smlad r1, r3, r5, r1 @ filter[3][0] | |
1526 | ||
1527 | subs r4, r4, #4 | |
1528 | ldr r3, [sp, #48] | |
1529 | ||
1530 | sat4 r9, r10, r11, r1 | |
1531 | str r9, [r0], #4 | |
1532 | ||
1533 | bne 1b | |
1534 | ||
1535 | ldr r4, [sp, #40] | |
1536 | ldr r12, [sp, #44] | |
1537 | add r2, r2, r3 | |
1538 | ldr r9, [sp, #0] | |
1539 | subs r12, r12, #1 | |
1540 | sub r2, r2, r4 | |
1541 | str r12, [sp, #44] | |
1542 | add r0, r0, r9 | |
1543 | ||
1544 | bne 1b | |
1545 | ||
1546 | pop {r1, r4-r11, pc} | |
1547 | endfunc | |
1548 | ||
1549 | function vp8_put_bilin_h_armv6 | |
1550 | push {r1, r4-r11, lr} | |
1551 | bl_put_bilin_h_armv6: | |
1552 | rsb r5, r12, r12, lsl #16 | |
1553 | ldr r12, [sp, #44] | |
1554 | sub r3, r3, r4 | |
1555 | add r5, r5, #8 | |
1556 | 1: | |
1557 | ldrb r6, [r2], #1 | |
1558 | ldrb r7, [r2], #1 | |
1559 | ldrb r8, [r2], #1 | |
1560 | ldrb r9, [r2], #1 | |
1561 | ldrb lr, [r2] | |
1562 | ||
1563 | pkhbt r6, r6, r7, lsl #16 @ src[1] | src[0] | |
1564 | pkhbt r7, r7, r8, lsl #16 @ src[2] | src[1] | |
1565 | pkhbt r8, r8, r9, lsl #16 @ src[3] | src[2] | |
1566 | pkhbt r9, r9, lr, lsl #16 @ src[4] | src[3] | |
1567 | ||
1568 | mov r10, #4 | |
1569 | smlad r6, r6, r5, r10 | |
1570 | smlad r7, r7, r5, r10 | |
1571 | smlad r8, r8, r5, r10 | |
1572 | smlad r9, r9, r5, r10 | |
1573 | ||
1574 | subs r4, r4, #4 | |
1575 | ||
1576 | asr r6, #3 | |
1577 | asr r7, #3 | |
1578 | pkhbt r6, r6, r8, lsl #13 | |
1579 | pkhbt r7, r7, r9, lsl #13 | |
1580 | orr r6, r6, r7, lsl #8 | |
1581 | str r6, [r0], #4 | |
1582 | ||
1583 | bne 1b | |
1584 | ||
1585 | ldr r4, [sp, #40] | |
1586 | subs r12, r12, #1 | |
1587 | add r2, r2, r3 | |
1588 | add r0, r0, r1 | |
1589 | ||
1590 | bne 1b | |
1591 | ||
1592 | pop {r1, r4-r11, pc} | |
1593 | endfunc | |
1594 | ||
1595 | function vp8_put_bilin_v_armv6 | |
1596 | push {r1, r4-r11, lr} | |
1597 | bl_put_bilin_v_armv6: | |
1598 | rsb r5, r12, r12, lsl #16 | |
1599 | ldr r12, [sp, #44] | |
1600 | add r5, r5, #8 | |
1601 | 1: | |
1602 | ldrb r10, [r2, r3] | |
1603 | ldrb r6, [r2], #1 | |
1604 | ldrb r11, [r2, r3] | |
1605 | ldrb r7, [r2], #1 | |
1606 | ldrb lr, [r2, r3] | |
1607 | ldrb r8, [r2], #1 | |
1608 | ldrb r9, [r2, r3] | |
1609 | pkhbt r6, r6, r10, lsl #16 | |
1610 | ldrb r10, [r2], #1 | |
1611 | pkhbt r7, r7, r11, lsl #16 | |
1612 | pkhbt r8, r8, lr, lsl #16 | |
1613 | pkhbt r9, r10, r9, lsl #16 | |
1614 | ||
1615 | mov r10, #4 | |
1616 | smlad r6, r6, r5, r10 | |
1617 | smlad r7, r7, r5, r10 | |
1618 | smlad r8, r8, r5, r10 | |
1619 | smlad r9, r9, r5, r10 | |
1620 | ||
1621 | subs r4, r4, #4 | |
1622 | ||
1623 | asr r6, #3 | |
1624 | asr r7, #3 | |
1625 | pkhbt r6, r6, r8, lsl #13 | |
1626 | pkhbt r7, r7, r9, lsl #13 | |
1627 | orr r6, r6, r7, lsl #8 | |
1628 | str r6, [r0], #4 | |
1629 | ||
1630 | bne 1b | |
1631 | ||
1632 | ldr r4, [sp, #40] | |
1633 | subs r12, r12, #1 | |
1634 | add r2, r2, r3 | |
1635 | add r0, r0, r1 | |
1636 | sub r2, r2, r4 | |
1637 | ||
1638 | bne 1b | |
1639 | pop {r1, r4-r11, pc} | |
1640 | endfunc |