/*
 * Simple IDCT
 *
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 * Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

#define W1 22725  /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W2 21407  /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W3 19266  /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W4 16383  /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W5 12873  /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W6 8867   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W7 4520   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define ROW_SHIFT 11
#define COL_SHIFT 20

#define W13 (W1 | (W3 << 16))
#define W26 (W2 | (W6 << 16))
#define W57 (W5 | (W7 << 16))
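
/*
 * The W constants are the IDCT cosine weights in Q14 fixed point, i.e.
 * round(cos(i*M_PI/16) * sqrt(2) * (1 << 14)); e.g. for i = 1,
 * cos(M_PI/16) * sqrt(2) * 16384 = 22725.26, giving W1 = 22725.  W4 is
 * kept one below the exact 16384, which the column pass exploits by
 * forming W4*x as (x << 14) - x.  W13, W26 and W57 pack two weights per
 * 32-bit word so the ARMv5TE halfword multiplies (smulxy/smlaxy) can
 * select either weight directly.
 */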
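/*
 * idct_row_armv5te: in-place IDCT of one row of eight 16-bit
 * coefficients at a1.  If coefficients 1..7 are all zero the row is
 * just row[0] << 3 replicated (row_dc_only); otherwise the even and
 * odd halves are combined and each output is scaled down by ROW_SHIFT
 * before being stored back as packed halfword pairs.
 */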
function idct_row_armv5te
        str     lr, [sp, #-4]!

        ldrd    v1, v2, [a1, #8]
        ldrd    a3, a4, [a1]        /* a3 = row[1:0], a4 = row[3:2] */
        orrs    v1, v1, v2
        itt     eq
        cmpeq   v1, a4
        cmpeq   v1, a3, lsr #16
        beq     row_dc_only

        mov     v1, #(1<<(ROW_SHIFT-1))
        mov     ip, #16384
        sub     ip, ip, #1          /* ip = W4 */
        smlabb  v1, ip, a3, v1      /* v1 = W4*row[0]+(1<<(RS-1)) */
        ldr     ip, =W26            /* ip = W2 | (W6 << 16) */
        smultb  a2, ip, a4
        smulbb  lr, ip, a4
        add     v2, v1, a2
        sub     v3, v1, a2
        sub     v4, v1, lr
        add     v1, v1, lr

        ldr     ip, =W13            /* ip = W1 | (W3 << 16) */
        ldr     lr, =W57            /* lr = W5 | (W7 << 16) */
        smulbt  v5, ip, a3
        smultt  v6, lr, a4
        smlatt  v5, ip, a4, v5
        smultt  a2, ip, a3
        smulbt  v7, lr, a3
        sub     v6, v6, a2
        smulbt  a2, ip, a4
        smultt  fp, lr, a3
        sub     v7, v7, a2
        smulbt  a2, lr, a4
        ldrd    a3, a4, [a1, #8]    /* a3=row[5:4] a4=row[7:6] */
        sub     fp, fp, a2

        orrs    a2, a3, a4
        beq     1f

        smlabt  v5, lr, a3, v5
        smlabt  v6, ip, a3, v6
        smlatt  v5, lr, a4, v5
        smlabt  v6, lr, a4, v6
        smlatt  v7, lr, a3, v7
        smlatt  fp, ip, a3, fp
        smulbt  a2, ip, a4
        smlatt  v7, ip, a4, v7
        sub     fp, fp, a2

        ldr     ip, =W26            /* ip = W2 | (W6 << 16) */
        mov     a2, #16384
        sub     a2, a2, #1          /* a2 = W4 */
        smulbb  a2, a2, a3          /* a2 = W4*row[4] */
        smultb  lr, ip, a4          /* lr = W6*row[6] */
        add     v1, v1, a2          /* v1 += W4*row[4] */
        add     v1, v1, lr          /* v1 += W6*row[6] */
        add     v4, v4, a2          /* v4 += W4*row[4] */
        sub     v4, v4, lr          /* v4 -= W6*row[6] */
        smulbb  lr, ip, a4          /* lr = W2*row[6] */
        sub     v2, v2, a2          /* v2 -= W4*row[4] */
        sub     v2, v2, lr          /* v2 -= W2*row[6] */
        sub     v3, v3, a2          /* v3 -= W4*row[4] */
        add     v3, v3, lr          /* v3 += W2*row[6] */

1:      add     a2, v1, v5
        mov     a3, a2, lsr #11
        bic     a3, a3, #0x1f0000
        sub     a2, v2, v6
        mov     a2, a2, lsr #11
        add     a3, a3, a2, lsl #16
        add     a2, v3, v7
        mov     a4, a2, lsr #11
        bic     a4, a4, #0x1f0000
        add     a2, v4, fp
        mov     a2, a2, lsr #11
        add     a4, a4, a2, lsl #16
        strd    a3, a4, [a1]

        sub     a2, v4, fp
        mov     a3, a2, lsr #11
        bic     a3, a3, #0x1f0000
        sub     a2, v3, v7
        mov     a2, a2, lsr #11
        add     a3, a3, a2, lsl #16
        add     a2, v2, v6
        mov     a4, a2, lsr #11
        bic     a4, a4, #0x1f0000
        sub     a2, v1, v5
        mov     a2, a2, lsr #11
        add     a4, a4, a2, lsl #16
        strd    a3, a4, [a1, #8]

        ldr     pc, [sp], #4

row_dc_only:
        orr     a3, a3, a3, lsl #16
        bic     a3, a3, #0xe000
        mov     a3, a3, lsl #3
        mov     a4, a3
        strd    a3, a4, [a1]
        strd    a3, a4, [a1, #8]

        ldr     pc, [sp], #4
endfunc

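/*
 * idct_col: column pass for two adjacent columns at a1 (consecutive
 * rows are 16 bytes apart, so each word load picks up one 16-bit value
 * from each column).  The even part (inputs 0, 2, 4, 6) is computed
 * first and parked on the stack; the odd part (inputs 1, 3, 5, 7) is
 * left in v1-fp for the caller to combine.  The W4 products are formed
 * as (x << 14) - x, and the rounding bias is pre-divided by W4 so the
 * result matches the C version.
 */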
.macro idct_col
        ldr     a4, [a1]            /* a4 = col[1:0] */
        mov     ip, #16384
        sub     ip, ip, #1          /* ip = W4 */
#if 0
        mov     v1, #(1<<(COL_SHIFT-1))
        smlabt  v2, ip, a4, v1      /* v2 = W4*col[1] + (1<<(COL_SHIFT-1)) */
        smlabb  v1, ip, a4, v1      /* v1 = W4*col[0] + (1<<(COL_SHIFT-1)) */
        ldr     a4, [a1, #(16*4)]
#else
        mov     v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */
        add     v2, v1, a4, asr #16
        rsb     v2, v2, v2, lsl #14
        mov     a4, a4, lsl #16
        add     v1, v1, a4, asr #16
        ldr     a4, [a1, #(16*4)]
        rsb     v1, v1, v1, lsl #14
#endif

        smulbb  lr, ip, a4
        smulbt  a3, ip, a4
        sub     v3, v1, lr
        sub     v5, v1, lr
        add     v7, v1, lr
        add     v1, v1, lr
        sub     v4, v2, a3
        sub     v6, v2, a3
        add     fp, v2, a3
        ldr     ip, =W26
        ldr     a4, [a1, #(16*2)]
        add     v2, v2, a3

        smulbb  lr, ip, a4
        smultb  a3, ip, a4
        add     v1, v1, lr
        sub     v7, v7, lr
        add     v3, v3, a3
        sub     v5, v5, a3
        smulbt  lr, ip, a4
        smultt  a3, ip, a4
        add     v2, v2, lr
        sub     fp, fp, lr
        add     v4, v4, a3
        ldr     a4, [a1, #(16*6)]
        sub     v6, v6, a3

        smultb  lr, ip, a4
        smulbb  a3, ip, a4
        add     v1, v1, lr
        sub     v7, v7, lr
        sub     v3, v3, a3
        add     v5, v5, a3
        smultt  lr, ip, a4
        smulbt  a3, ip, a4
        add     v2, v2, lr
        sub     fp, fp, lr
        sub     v4, v4, a3
        add     v6, v6, a3

        stmfd   sp!, {v1, v2, v3, v4, v5, v6, v7, fp}

        ldr     ip, =W13
        ldr     a4, [a1, #(16*1)]
        ldr     lr, =W57
        smulbb  v1, ip, a4
        smultb  v3, ip, a4
        smulbb  v5, lr, a4
        smultb  v7, lr, a4
        smulbt  v2, ip, a4
        smultt  v4, ip, a4
        smulbt  v6, lr, a4
        smultt  fp, lr, a4
        rsb     v4, v4, #0
        ldr     a4, [a1, #(16*3)]
        rsb     v3, v3, #0

        smlatb  v1, ip, a4, v1
        smlatb  v3, lr, a4, v3
        smulbb  a3, ip, a4
        smulbb  a2, lr, a4
        sub     v5, v5, a3
        sub     v7, v7, a2
        smlatt  v2, ip, a4, v2
        smlatt  v4, lr, a4, v4
        smulbt  a3, ip, a4
        smulbt  a2, lr, a4
        sub     v6, v6, a3
        ldr     a4, [a1, #(16*5)]
        sub     fp, fp, a2

        smlabb  v1, lr, a4, v1
        smlabb  v3, ip, a4, v3
        smlatb  v5, lr, a4, v5
        smlatb  v7, ip, a4, v7
        smlabt  v2, lr, a4, v2
        smlabt  v4, ip, a4, v4
        smlatt  v6, lr, a4, v6
        ldr     a3, [a1, #(16*7)]
        smlatt  fp, ip, a4, fp

        smlatb  v1, lr, a3, v1
        smlabb  v3, lr, a3, v3
        smlatb  v5, ip, a3, v5
        smulbb  a4, ip, a3
        smlatt  v2, lr, a3, v2
        sub     v7, v7, a4
        smlabt  v4, lr, a3, v4
        smulbt  a4, ip, a3
        smlatt  v6, ip, a3, v6
        sub     fp, fp, a4
.endm

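/*
 * idct_col_armv5te: plain column IDCT.  The even-part sums are popped
 * two at a time and combined with the odd terms; each result is scaled
 * down by COL_SHIFT and the two columns are stored as packed halfword
 * pairs.  The lsr #20 / orrmi #0xf000 sequences emulate an arithmetic
 * shift for the low halfword, which is all that is kept.
 */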
function idct_col_armv5te
        str     lr, [sp, #-4]!

        idct_col

        ldmfd   sp!, {a3, a4}
        adds    a2, a3, v1
        mov     a2, a2, lsr #20
        it      mi
        orrmi   a2, a2, #0xf000
        add     ip, a4, v2
        mov     ip, ip, asr #20
        orr     a2, a2, ip, lsl #16
        str     a2, [a1]
        subs    a3, a3, v1
        mov     a2, a3, lsr #20
        it      mi
        orrmi   a2, a2, #0xf000
        sub     a4, a4, v2
        mov     a4, a4, asr #20
        orr     a2, a2, a4, lsl #16
        ldmfd   sp!, {a3, a4}
        str     a2, [a1, #(16*7)]

        subs    a2, a3, v3
        mov     a2, a2, lsr #20
        it      mi
        orrmi   a2, a2, #0xf000
        sub     ip, a4, v4
        mov     ip, ip, asr #20
        orr     a2, a2, ip, lsl #16
        str     a2, [a1, #(16*1)]
        adds    a3, a3, v3
        mov     a2, a3, lsr #20
        it      mi
        orrmi   a2, a2, #0xf000
        add     a4, a4, v4
        mov     a4, a4, asr #20
        orr     a2, a2, a4, lsl #16
        ldmfd   sp!, {a3, a4}
        str     a2, [a1, #(16*6)]

        adds    a2, a3, v5
        mov     a2, a2, lsr #20
        it      mi
        orrmi   a2, a2, #0xf000
        add     ip, a4, v6
        mov     ip, ip, asr #20
        orr     a2, a2, ip, lsl #16
        str     a2, [a1, #(16*2)]
        subs    a3, a3, v5
        mov     a2, a3, lsr #20
        it      mi
        orrmi   a2, a2, #0xf000
        sub     a4, a4, v6
        mov     a4, a4, asr #20
        orr     a2, a2, a4, lsl #16
        ldmfd   sp!, {a3, a4}
        str     a2, [a1, #(16*5)]

        adds    a2, a3, v7
        mov     a2, a2, lsr #20
        it      mi
        orrmi   a2, a2, #0xf000
        add     ip, a4, fp
        mov     ip, ip, asr #20
        orr     a2, a2, ip, lsl #16
        str     a2, [a1, #(16*3)]
        subs    a3, a3, v7
        mov     a2, a3, lsr #20
        it      mi
        orrmi   a2, a2, #0xf000
        sub     a4, a4, fp
        mov     a4, a4, asr #20
        orr     a2, a2, a4, lsl #16
        str     a2, [a1, #(16*4)]

        ldr     pc, [sp], #4
endfunc

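/*
 * Clamp helpers for the put/add column passes:
 *   clip  dst, ...   dst = operand value clamped to [0, 255]
 *   aclip dst, ...   dst = sum of the operands clamped to [0, 255]
 */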
.macro clip dst, src:vararg
        movs    \dst, \src
        it      mi
        movmi   \dst, #0
        cmp     \dst, #255
        it      gt
        movgt   \dst, #255
.endm

.macro aclip dst, src:vararg
        adds    \dst, \src
        it      mi
        movmi   \dst, #0
        cmp     \dst, #255
        it      gt
        movgt   \dst, #255
.endm

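/*
 * idct_col_put_armv5te: column IDCT with the results clamped to
 * [0, 255] and written out as pixels.  The destination pointer and
 * line size saved by the caller are read from the stack ([sp, #28] and
 * [sp, #32] after the first pop); rows 0-3 are stored forwards from
 * dest and rows 7-4 backwards from dest + 7*line_size, two pixels per
 * strh, and the saved dest is advanced by 2 for the next column pair.
 */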
function idct_col_put_armv5te
        str     lr, [sp, #-4]!

        idct_col

        ldmfd   sp!, {a3, a4}
        ldr     lr, [sp, #32]
        add     a2, a3, v1
        clip    a2, a2, asr #20
        add     ip, a4, v2
        clip    ip, ip, asr #20
        orr     a2, a2, ip, lsl #8
        sub     a3, a3, v1
        clip    a3, a3, asr #20
        sub     a4, a4, v2
        clip    a4, a4, asr #20
        ldr     v1, [sp, #28]
        strh    a2, [v1]
        add     a2, v1, #2
        str     a2, [sp, #28]
        orr     a2, a3, a4, lsl #8
        rsb     v2, lr, lr, lsl #3
        ldmfd   sp!, {a3, a4}
        strh_pre a2, v2, v1

        sub     a2, a3, v3
        clip    a2, a2, asr #20
        sub     ip, a4, v4
        clip    ip, ip, asr #20
        orr     a2, a2, ip, lsl #8
        strh_pre a2, v1, lr
        add     a3, a3, v3
        clip    a2, a3, asr #20
        add     a4, a4, v4
        clip    a4, a4, asr #20
        orr     a2, a2, a4, lsl #8
        ldmfd   sp!, {a3, a4}
        strh_dpre a2, v2, lr

        add     a2, a3, v5
        clip    a2, a2, asr #20
        add     ip, a4, v6
        clip    ip, ip, asr #20
        orr     a2, a2, ip, lsl #8
        strh_pre a2, v1, lr
        sub     a3, a3, v5
        clip    a2, a3, asr #20
        sub     a4, a4, v6
        clip    a4, a4, asr #20
        orr     a2, a2, a4, lsl #8
        ldmfd   sp!, {a3, a4}
        strh_dpre a2, v2, lr

        add     a2, a3, v7
        clip    a2, a2, asr #20
        add     ip, a4, fp
        clip    ip, ip, asr #20
        orr     a2, a2, ip, lsl #8
        strh    a2, [v1, lr]
        sub     a3, a3, v7
        clip    a2, a3, asr #20
        sub     a4, a4, fp
        clip    a4, a4, asr #20
        orr     a2, a2, a4, lsl #8
        strh_dpre a2, v2, lr

        ldr     pc, [sp], #4
endfunc

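/*
 * idct_col_add_armv5te: like idct_col_put_armv5te, but the existing
 * pixels are loaded with ldrh, the IDCT results are added to them
 * (aclip) and the clamped sums stored back.  The destination pointer
 * is fetched from [sp, #36] before the even-part pop, the line size
 * from [sp, #32] after it, and the saved dest is again advanced by 2.
 */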
function idct_col_add_armv5te
        str     lr, [sp, #-4]!

        idct_col

        ldr     lr, [sp, #36]

        ldmfd   sp!, {a3, a4}
        ldrh    ip, [lr]
        add     a2, a3, v1
        sub     a3, a3, v1
        and     v1, ip, #255
        aclip   a2, v1, a2, asr #20
        add     v1, a4, v2
        mov     v1, v1, asr #20
        aclip   v1, v1, ip, lsr #8
        orr     a2, a2, v1, lsl #8
        ldr     v1, [sp, #32]
        sub     a4, a4, v2
        rsb     v2, v1, v1, lsl #3
        ldrh_pre ip, v2, lr
        strh    a2, [lr]
        and     a2, ip, #255
        aclip   a3, a2, a3, asr #20
        mov     a4, a4, asr #20
        aclip   a4, a4, ip, lsr #8
        add     a2, lr, #2
        str     a2, [sp, #28]
        orr     a2, a3, a4, lsl #8
        strh    a2, [v2]

        ldmfd   sp!, {a3, a4}
        ldrh_pre ip, lr, v1
        sub     a2, a3, v3
        add     a3, a3, v3
        and     v3, ip, #255
        aclip   a2, v3, a2, asr #20
        sub     v3, a4, v4
        mov     v3, v3, asr #20
        aclip   v3, v3, ip, lsr #8
        orr     a2, a2, v3, lsl #8
        add     a4, a4, v4
        ldrh_dpre ip, v2, v1
        strh    a2, [lr]
        and     a2, ip, #255
        aclip   a3, a2, a3, asr #20
        mov     a4, a4, asr #20
        aclip   a4, a4, ip, lsr #8
        orr     a2, a3, a4, lsl #8
        strh    a2, [v2]

        ldmfd   sp!, {a3, a4}
        ldrh_pre ip, lr, v1
        add     a2, a3, v5
        sub     a3, a3, v5
        and     v3, ip, #255
        aclip   a2, v3, a2, asr #20
        add     v3, a4, v6
        mov     v3, v3, asr #20
        aclip   v3, v3, ip, lsr #8
        orr     a2, a2, v3, lsl #8
        sub     a4, a4, v6
        ldrh_dpre ip, v2, v1
        strh    a2, [lr]
        and     a2, ip, #255
        aclip   a3, a2, a3, asr #20
        mov     a4, a4, asr #20
        aclip   a4, a4, ip, lsr #8
        orr     a2, a3, a4, lsl #8
        strh    a2, [v2]

        ldmfd   sp!, {a3, a4}
        ldrh_pre ip, lr, v1
        add     a2, a3, v7
        sub     a3, a3, v7
        and     v3, ip, #255
        aclip   a2, v3, a2, asr #20
        add     v3, a4, fp
        mov     v3, v3, asr #20
        aclip   v3, v3, ip, lsr #8
        orr     a2, a2, v3, lsl #8
        sub     a4, a4, fp
        ldrh_dpre ip, v2, v1
        strh    a2, [lr]
        and     a2, ip, #255
        aclip   a3, a2, a3, asr #20
        mov     a4, a4, asr #20
        aclip   a4, a4, ip, lsr #8
        orr     a2, a3, a4, lsl #8
        strh    a2, [v2]

        ldr     pc, [sp], #4
endfunc

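/*
 * ff_simple_idct_armv5te: in-place 2-D IDCT of an 8x8 block of 16-bit
 * coefficients (a1 = block).  Eight row passes, 16 bytes apart, are
 * followed by four column passes, each handling two columns.
 */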
function ff_simple_idct_armv5te, export=1
        stmfd   sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr}

        bl      idct_row_armv5te
        add     a1, a1, #16
        bl      idct_row_armv5te
        add     a1, a1, #16
        bl      idct_row_armv5te
        add     a1, a1, #16
        bl      idct_row_armv5te
        add     a1, a1, #16
        bl      idct_row_armv5te
        add     a1, a1, #16
        bl      idct_row_armv5te
        add     a1, a1, #16
        bl      idct_row_armv5te
        add     a1, a1, #16
        bl      idct_row_armv5te

        sub     a1, a1, #(16*7)

        bl      idct_col_armv5te
        add     a1, a1, #4
        bl      idct_col_armv5te
        add     a1, a1, #4
        bl      idct_col_armv5te
        add     a1, a1, #4
        bl      idct_col_armv5te

        ldmfd   sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
endfunc

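/*
 * ff_simple_idct_add_armv5te: a1 = destination pixels, a2 = line size,
 * a3 = 8x8 coefficient block.  dest and line size are kept on the
 * stack for the column passes; the block is transformed row by row and
 * then added to the destination two columns at a time.
 */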
function ff_simple_idct_add_armv5te, export=1
        stmfd   sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}

        mov     a1, a3

        bl      idct_row_armv5te
        add     a1, a1, #16
        bl      idct_row_armv5te
        add     a1, a1, #16
        bl      idct_row_armv5te
        add     a1, a1, #16
        bl      idct_row_armv5te
        add     a1, a1, #16
        bl      idct_row_armv5te
        add     a1, a1, #16
        bl      idct_row_armv5te
        add     a1, a1, #16
        bl      idct_row_armv5te
        add     a1, a1, #16
        bl      idct_row_armv5te

        sub     a1, a1, #(16*7)

        bl      idct_col_add_armv5te
        add     a1, a1, #4
        bl      idct_col_add_armv5te
        add     a1, a1, #4
        bl      idct_col_add_armv5te
        add     a1, a1, #4
        bl      idct_col_add_armv5te

        add     sp, sp, #8
        ldmfd   sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
endfunc

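/*
 * ff_simple_idct_put_armv5te: same argument layout as the add variant
 * (a1 = dest, a2 = line size, a3 = coefficient block), but the clamped
 * IDCT output overwrites the destination instead of being added to it.
 */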
function ff_simple_idct_put_armv5te, export=1
        stmfd   sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}

        mov     a1, a3

        bl      idct_row_armv5te
        add     a1, a1, #16
        bl      idct_row_armv5te
        add     a1, a1, #16
        bl      idct_row_armv5te
        add     a1, a1, #16
        bl      idct_row_armv5te
        add     a1, a1, #16
        bl      idct_row_armv5te
        add     a1, a1, #16
        bl      idct_row_armv5te
        add     a1, a1, #16
        bl      idct_row_armv5te
        add     a1, a1, #16
        bl      idct_row_armv5te

        sub     a1, a1, #(16*7)

        bl      idct_col_put_armv5te
        add     a1, a1, #4
        bl      idct_col_put_armv5te
        add     a1, a1, #4
        bl      idct_col_put_armv5te
        add     a1, a1, #4
        bl      idct_col_put_armv5te

        add     sp, sp, #8
        ldmfd   sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
endfunc