4 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
5 * Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
7 * This file is part of FFmpeg.
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "libavutil/arm/asm.S"
/*
 * Fixed-point IDCT coefficients. For each Wi, the i in the formula is the
 * digit in the macro name. W1-W3 and W5-W7 equal the formula truncated to
 * an integer; W4 is 16383, one less than the 16384 the formula yields.
 */
26 #define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
27 #define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
28 #define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
29 #define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
30 #define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
31 #define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
32 #define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
/*
 * Coefficient pairs packed into one 32-bit word: the first-named coefficient
 * sits in the low halfword, the second-named one in the high halfword. A
 * register holding such a pair feeds the halfword-select multiplies
 * (smulbb picks the low half, smultb the high half; same for smla variants).
 */
36 #define W13 (W1 | (W3 << 16))
37 #define W26 (W2 | (W6 << 16))
38 #define W57 (W5 | (W7 << 16))
/*
 * idct_row_armv5te: row pass over the eight 16-bit coefficients at [a1].
 * The coefficients are read as four packed pairs with two ldrd loads,
 * partial sums are accumulated in v1-v4, and results are stored back into
 * the row with strd.
 */
40 function idct_row_armv5te
44 ldrd a3, a4, [a1] /* a3 = row[1:0], a4 = row[3:2] */
/* Seed the accumulator with half of 1<<ROW_SHIFT (the bias term). */
51 mov v1, #(1<<(ROW_SHIFT-1))
/*
 * 16383 cannot be encoded as an ARM data-processing immediate, so W4 is
 * formed by subtracting 1.
 * NOTE(review): this relies on ip == 16384 at this point - confirm.
 */
53 sub ip, ip, #1 /* ip = W4 */
54 smlabb v1, ip, a3, v1 /* v1 = W4*row[0]+(1<<(RS-1)) */
55 ldr ip, =W26 /* ip = W2 | (W6 << 16) */
63 ldr ip, =W13 /* ip = W1 | (W3 << 16) */
64 ldr lr, =W57 /* lr = W5 | (W7 << 16) */
75 ldrd a3, a4, [a1, #8] /* a3=row[5:4] a4=row[7:6] */
91 ldr ip, =W26 /* ip = W2 | (W6 << 16) */
/* NOTE(review): as above, a2 must hold 16384 before this - confirm. */
93 sub a2, a2, #1 /* a2 = W4 */
94 smulbb a2, a2, a3 /* a2 = W4*row[4] */
95 smultb lr, ip, a4 /* lr = W6*row[6] */
96 add v1, v1, a2 /* v1 += W4*row[4] */
97 add v1, v1, lr /* v1 += W6*row[6] */
98 add v4, v4, a2 /* v4 += W4*row[4] */
99 sub v4, v4, lr /* v4 -= W6*row[6] */
100 smulbb lr, ip, a4 /* lr = W2*row[6] */
101 sub v2, v2, a2 /* v2 -= W4*row[4] */
102 sub v2, v2, lr /* v2 -= W2*row[6] */
103 sub v3, v3, a2 /* v3 -= W4*row[4] */
104 add v3, v3, lr /* v3 += W2*row[6] */
/*
 * Results are packed two per register: bits 16-20 of the first value are
 * cleared, then the second value is added in, shifted up by 16.
 * NOTE(review): assumes the preceding shifts leave nothing above bit 20 in
 * the first value - confirm.
 */
108 bic a3, a3, #0x1f0000
111 add a3, a3, a2, lsl #16
114 bic a4, a4, #0x1f0000
117 add a4, a4, a2, lsl #16
122 bic a3, a3, #0x1f0000
125 add a3, a3, a2, lsl #16
128 bic a4, a4, #0x1f0000
131 add a4, a4, a2, lsl #16
/* Store the packed pairs for row[5:4] and row[7:6]. */
132 strd a3, a4, [a1, #8]
/*
 * ORs the low halfword of a3 into its high halfword.
 * NOTE(review): looks like a separate path that fills the whole row from a
 * single value - confirm.
 */
137 orr a3, a3, a3, lsl #16
142 strd a3, a4, [a1, #8]
/*
 * Column-pass arithmetic. Each 32-bit load from [a1 + 16*k] fetches row k of
 * two neighbouring columns (col[0] in the low halfword, col[1] in the high
 * halfword), so two columns are processed at once.
 */
148 ldr a4, [a1] /* a4 = col[1:0] */
150 sub ip, ip, #1 /* ip = W4 */
/*
 * NOTE(review): the sequence starting at "mov v1, #(1<<(COL_SHIFT-1))" and
 * the one starting at "mov v1, #((1<<(COL_SHIFT-1))/W4)" both produce
 * W4*col + bias in v1/v2; presumably only one of them is meant to be
 * assembled - confirm.
 */
152 mov v1, #(1<<(COL_SHIFT-1))
153 smlabt v2, ip, a4, v1 /* v2 = W4*col[1] + (1<<(COL_SHIFT-1)) */
154 smlabb v1, ip, a4, v1 /* v1 = W4*col[0] + (1<<(COL_SHIFT-1)) */
155 ldr a4, [a1, #(16*4)]
/*
 * Variant that adds the pre-divided bias to the coefficient first and then
 * multiplies by W4: "rsb x, x, x, lsl #14" computes x*(16384 - 1) = x*W4.
 */
157 mov v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */
/* a4, asr #16 is the sign-extended high halfword, i.e. col[1]. */
158 add v2, v1, a4, asr #16
159 rsb v2, v2, v2, lsl #14
/* NOTE(review): asr #16 yields col[0] here only if a4 was shifted left by
 * 16 beforehand - confirm. */
161 add v1, v1, a4, asr #16
162 ldr a4, [a1, #(16*4)]
163 rsb v1, v1, v1, lsl #14
/* Row 2 of the column pair. */
176 ldr a4, [a1, #(16*2)]
/* Row 6 of the column pair. */
190 ldr a4, [a1, #(16*6)]
/* Push v1-v7 and fp (eight words) onto the stack. */
206 stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp}
/*
 * Rows 1, 3, 5 and 7 follow. In the multiply-accumulates the first letter
 * after "smla" selects the half of the coefficient register (ip or lr) and
 * the second letter selects the half of the data register (a4 or a3):
 * the "b" data forms accumulate into v1, v3, v5, v7 and the "t" data forms
 * into v2, v4, v6, fp.
 */
209 ldr a4, [a1, #(16*1)]
220 ldr a4, [a1, #(16*3)]
223 smlatb v1, ip, a4, v1
224 smlatb v3, lr, a4, v3
229 smlatt v2, ip, a4, v2
230 smlatt v4, lr, a4, v4
234 ldr a4, [a1, #(16*5)]
237 smlabb v1, lr, a4, v1
238 smlabb v3, ip, a4, v3
239 smlatb v5, lr, a4, v5
240 smlatb v7, ip, a4, v7
241 smlabt v2, lr, a4, v2
242 smlabt v4, ip, a4, v4
243 smlatt v6, lr, a4, v6
/* Row 7 is loaded into a3 while a4 (row 5) is still used by the next
 * multiply-accumulate. */
244 ldr a3, [a1, #(16*7)]
245 smlatt fp, ip, a4, fp
247 smlatb v1, lr, a3, v1
248 smlabb v3, lr, a3, v3
249 smlatb v5, ip, a3, v5
251 smlatt v2, lr, a3, v2
253 smlabt v4, lr, a3, v4
255 smlatt v6, ip, a3, v6
/*
 * idct_col_armv5te: column pass that writes its results back into the
 * coefficient block at a1, two 16-bit values per 32-bit store. The stores
 * below go to rows 7, 1, 6, 2, 5, 3 and 4, in that order.
 *
 * Every result pair is built the same way: when the N flag is set, orrmi
 * sets bits 12-15 of a2; the second value (ip or a4) is then OR-ed into the
 * top halfword.
 * NOTE(review): presumably bits 12-15 complete the sign extension of a value
 * that was shifted right logically by 20 - confirm.
 */
259 function idct_col_armv5te
268 orrmi a2, a2, #0xf000
271 orr a2, a2, ip, lsl #16
276 orrmi a2, a2, #0xf000
279 orr a2, a2, a4, lsl #16
281 str a2, [a1, #(16*7)]
286 orrmi a2, a2, #0xf000
289 orr a2, a2, ip, lsl #16
290 str a2, [a1, #(16*1)]
294 orrmi a2, a2, #0xf000
297 orr a2, a2, a4, lsl #16
299 str a2, [a1, #(16*6)]
304 orrmi a2, a2, #0xf000
307 orr a2, a2, ip, lsl #16
308 str a2, [a1, #(16*2)]
312 orrmi a2, a2, #0xf000
315 orr a2, a2, a4, lsl #16
317 str a2, [a1, #(16*5)]
322 orrmi a2, a2, #0xf000
325 orr a2, a2, ip, lsl #16
326 str a2, [a1, #(16*3)]
330 orrmi a2, a2, #0xf000
333 orr a2, a2, a4, lsl #16
334 str a2, [a1, #(16*4)]
339 .macro clip dst, src:vararg
348 .macro aclip dst, src:vararg
/*
 * idct_col_put_armv5te: column-pass variant called by
 * ff_simple_idct_put_armv5te. Each "orr ..., lsl #8" below combines two
 * values into one halfword, the second one in bits 8-15.
 * NOTE(review): presumably the two values are clipped 8-bit pixels of
 * neighbouring columns - confirm against the clip macro.
 */
357 function idct_col_put_armv5te
368 orr a2, a2, ip, lsl #8
377 orr a2, a3, a4, lsl #8
/* v2 = 8*lr - lr = 7*lr.
 * NOTE(review): lr presumably holds the line stride here - confirm. */
378 rsb v2, lr, lr, lsl #3
386 orr a2, a2, ip, lsl #8
392 orr a2, a2, a4, lsl #8
400 orr a2, a2, ip, lsl #8
406 orr a2, a2, a4, lsl #8
414 orr a2, a2, ip, lsl #8
420 orr a2, a2, a4, lsl #8
/*
 * idct_col_add_armv5te: column-pass variant called by
 * ff_simple_idct_add_armv5te. Results are produced in pairs: the first aclip
 * of a pair takes a register shifted "asr #20", the second takes
 * "ip, lsr #8", and the two outputs are merged with "orr ..., lsl #8" so the
 * second lands in bits 8-15.
 * NOTE(review): presumably aclip adds an existing pixel to the IDCT output
 * and clips to 8 bits, with ip holding two packed pixels - confirm against
 * the aclip macro.
 */
426 function idct_col_add_armv5te
438 aclip a2, v1, a2, asr #20
441 aclip v1, v1, ip, lsr #8
442 orr a2, a2, v1, lsl #8
/* v2 = 8*v1 - v1 = 7*v1.
 * NOTE(review): v1 presumably holds the line stride here - confirm. */
445 rsb v2, v1, v1, lsl #3
449 aclip a3, a2, a3, asr #20
451 aclip a4, a4, ip, lsr #8
454 orr a2, a3, a4, lsl #8
462 aclip a2, v3, a2, asr #20
465 aclip v3, v3, ip, lsr #8
466 orr a2, a2, v3, lsl #8
471 aclip a3, a2, a3, asr #20
473 aclip a4, a4, ip, lsr #8
474 orr a2, a3, a4, lsl #8
482 aclip a2, v3, a2, asr #20
485 aclip v3, v3, ip, lsr #8
486 orr a2, a2, v3, lsl #8
491 aclip a3, a2, a3, asr #20
493 aclip a4, a4, ip, lsr #8
494 orr a2, a3, a4, lsl #8
502 aclip a2, v3, a2, asr #20
505 aclip v3, v3, ip, lsr #8
506 orr a2, a2, v3, lsl #8
511 aclip a3, a2, a3, asr #20
513 aclip a4, a4, ip, lsr #8
514 orr a2, a3, a4, lsl #8
/*
 * ff_simple_idct_armv5te: exported entry point (export=1).
 * Saves v1-v7, fp and lr on entry; the closing ldmfd restores v1-v7 and fp
 * and loads the saved lr into pc, which returns to the caller.
 */
520 function ff_simple_idct_armv5te, export=1
521 stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr}
549 ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
/*
 * ff_simple_idct_add_armv5te: exported entry point (export=1).
 * Pushes the incoming a1 and a2 together with v1-v7, fp and lr, then calls
 * idct_col_add_armv5te four times. The closing ldmfd restores v1-v7 and fp
 * and returns by loading the saved lr into pc.
 * NOTE(review): a1 and a2 are pushed but not popped by the closing ldmfd, so
 * those 8 bytes must be released from the stack before it - confirm.
 * NOTE(review): four calls presumably cover eight columns, two per call -
 * confirm.
 */
552 function ff_simple_idct_add_armv5te, export=1
553 stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
575 bl idct_col_add_armv5te
577 bl idct_col_add_armv5te
579 bl idct_col_add_armv5te
581 bl idct_col_add_armv5te
584 ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
/*
 * ff_simple_idct_put_armv5te: exported entry point (export=1).
 * Pushes the incoming a1 and a2 together with v1-v7, fp and lr, then calls
 * idct_col_put_armv5te four times. The closing ldmfd restores v1-v7 and fp
 * and returns by loading the saved lr into pc.
 * NOTE(review): a1 and a2 are pushed but not popped by the closing ldmfd, so
 * those 8 bytes must be released from the stack before it - confirm.
 * NOTE(review): four calls presumably cover eight columns, two per call -
 * confirm.
 */
587 function ff_simple_idct_put_armv5te, export=1
588 stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
610 bl idct_col_put_armv5te
612 bl idct_col_put_armv5te
614 bl idct_col_put_armv5te
616 bl idct_col_put_armv5te
619 ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}