f545df9e4f2617d608abbebb1a51a05aba736a83
[deb_ffmpeg.git] / idctdsp_alpha_asm.S
1 /*
2 * Alpha optimized IDCT-related routines
3 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22 /*
23 * These functions are scheduled for pca56. They should work
24 * reasonably on ev6, though.
25 */
26
27 #include "regdef.h"
28
29 .set noat # NOTE(review): presumably stops the assembler from using the at register on its own - confirm
30 .set noreorder # ask the assembler to keep the instruction order exactly as written
31 .arch pca56 # target CPU, matching the scheduling note above
32 .text
33
34 /************************************************************************
35 * void put_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
36 * ptrdiff_t line_size)
 *
 * Clamps every signed 16-bit value read from block to the range 0..255
 * and stores the results as bytes at pixels.
 *
 * Registers as used below: a0 = block, a1 = pixels, a2 = line_size.
 * Each loop iteration reads 32 bytes (16 coefficients) from block and
 * writes two rows of 8 bytes: the first at pixels, the second line_size
 * bytes further on. The counter starts at 8 and drops by 2 per
 * iteration, so 8 rows are written in total.
 *
 * Per 64-bit register holding four 16-bit lanes:
 *   maxsw4 with zero          -> negative lanes become 0
 *   minsw4 with 0x00ff per lane -> lanes above 255 become 255
 *   pkwb                      -> the four lanes are packed into four bytes
 *   stl                       -> those four bytes are stored
 *
 * The instructions are laid out in groups of four with independent work
 * interleaved; keep the order when editing (see the scheduling note at
 * the top of the file).
 *
 * NOTE(review): ta is not defined in this file; presumably it is an extra
 * temporary-register alias supplied by regdef.h - confirm.
37 */
38 .align 6 # 2^6 = 64-byte alignment for the entry point
39 .globl put_pixels_clamped_mvi_asm
40 .ent put_pixels_clamped_mvi_asm
41 put_pixels_clamped_mvi_asm:
42 .frame sp, 0, ra # frame size 0, return address in ra
43 .prologue 0
44
45 lda t8, -1 # t8 = all ones
46 lda t9, 8 # loop counter: rows still to write
47 zap t8, 0xaa, t8 # 00ff00ff00ff00ff (255 in every 16-bit lane, upper clamp)
48
49 .align 4 # 2^4 = 16-byte alignment for the loop head
50 1: ldq t0, 0(a0) # first row, coefficients 0-3
51 ldq t1, 8(a0) # first row, coefficients 4-7
52 ldq t2, 16(a0) # second row, coefficients 0-3
53 ldq t3, 24(a0) # second row, coefficients 4-7
54
55 maxsw4 t0, zero, t0 # clamp below: lane = max(lane, 0)
56 subq t9, 2, t9 # two rows are handled per iteration
57 maxsw4 t1, zero, t1
58 lda a0, 32(a0) # block += 16 coefficients (32 bytes)
59
60 maxsw4 t2, zero, t2
61 addq a1, a2, ta # ta = pixels + line_size (second row)
62 maxsw4 t3, zero, t3
63 minsw4 t0, t8, t0 # clamp above: lane = min(lane, 255)
64
65 minsw4 t1, t8, t1
66 minsw4 t2, t8, t2
67 minsw4 t3, t8, t3
68 pkwb t0, t0 # pack four 16-bit lanes into four bytes
69
70 pkwb t1, t1
71 pkwb t2, t2
72 pkwb t3, t3
73 stl t0, 0(a1) # first row, bytes 0-3
74
75 stl t1, 4(a1) # first row, bytes 4-7
76 addq ta, a2, a1 # pixels = ta + line_size (start of the next row pair)
77 stl t2, 0(ta) # second row, bytes 0-3
78 stl t3, 4(ta) # second row, bytes 4-7
79
80 bne t9, 1b # loop until the row counter reaches 0
81 ret
82 .end put_pixels_clamped_mvi_asm
83
84 /************************************************************************
85 * void add_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
86 * ptrdiff_t line_size)
 *
 * Adds the signed 16-bit values from block to the bytes already stored at
 * pixels, clamps every sum to 0..255 and writes the bytes back in place.
 *
 * Registers as used below: a0 = block, a1 = pixels, a2 = line_size.
 * Each loop iteration handles two rows of 8 pixels (the row at pixels and
 * the row line_size bytes further on) and 32 bytes of block. The counter
 * th starts at 8 and drops by 2 per iteration, so 8 rows are processed.
 *
 * Constants built before the loop:
 *   tg = 0x8000800080008000  bit 15 of every 16-bit lane
 *   tf = 0x00ff00ff00ff00ff  255 in every 16-bit lane (upper clamp)
 *
 * The work is split into four quarters of 4 pixels each:
 *   quarter 0/1 = bytes 0-3 / 4-7 of the first row,
 *   quarter 2/3 = bytes 0-3 / 4-7 of the second row.
 * The trailing comments carry "quarter step" tags. Steps:
 *   0 unpkbw  spread the 4 pixel bytes into four 16-bit lanes
 *   1 and     remember bit 15 of each coefficient lane
 *   2 bic     clear bit 15 of each coefficient lane
 *   3 addq    one 64-bit add; with bit 15 cleared and pixels <= 255 no
 *             carry can leave a lane
 *   4 xor     put the remembered bit 15 back, giving the lane-wise sum
 *             modulo 2^16
 *   5 maxsw4  clamp below at 0
 *   6 minsw4  clamp above at 255
 *   7 pkwb    pack the four lanes into four bytes
 *   8 stl     store the four bytes
 * The four chains are interleaved on purpose; keep the instruction order
 * when editing (see the scheduling note at the top of the file).
 *
 * NOTE(review): ta, tb, te, tf, tg and th are not defined in this file;
 * presumably they are extra register aliases supplied by regdef.h.
 * Confirm which machine registers they map to before reusing any of them.
87 */
88 .align 6 # 2^6 = 64-byte alignment for the entry point
89 .globl add_pixels_clamped_mvi_asm
90 .ent add_pixels_clamped_mvi_asm
91 add_pixels_clamped_mvi_asm:
92 .frame sp, 0, ra # frame size 0, return address in ra
93 .prologue 0
94
95 lda t1, -1 # t1 = all ones
96 lda th, 8 # loop counter: rows still to process
97 zap t1, 0x33, tg # tg = 0xffff0000ffff0000
98 nop # NOTE(review): presumably padding for instruction grouping - confirm
99
100 srl tg, 1, t0 # t0 = 0x7fff80007fff8000
101 xor tg, t0, tg # 0x8000800080008000 (bit 15 of every lane)
102 zap t1, 0xaa, tf # 0x00ff00ff00ff00ff (upper clamp)
103
104 .align 4 # 2^4 = 16-byte alignment for the loop head
105 1: ldl t1, 0(a1) # pix0: first row, bytes 0-3 (try to hit cache line soon)
106 ldl t4, 4(a1) # pix1: first row, bytes 4-7
107 addq a1, a2, te # te = pixels + line_size (second row)
108 ldq t0, 0(a0) # shorts0: coefficients for pix0
109
110 ldl t7, 0(te) # pix2: second row, bytes 0-3 (try to hit cache line soon)
111 ldl ta, 4(te) # pix3: second row, bytes 4-7
112 ldq t3, 8(a0) # shorts1: coefficients for pix1
113 ldq t6, 16(a0) # shorts2: coefficients for pix2
114
115 ldq t9, 24(a0) # shorts3: coefficients for pix3
116 unpkbw t1, t1 # 0 0 (quarter/op no.) bytes -> 16-bit lanes
117 and t0, tg, t2 # 0 1 save bit 15 of each lane
118 unpkbw t4, t4 # 1 0
119
120 bic t0, tg, t0 # 0 2 clear bit 15 of each lane
121 unpkbw t7, t7 # 2 0
122 and t3, tg, t5 # 1 1
123 addq t0, t1, t0 # 0 3 add pixels, no carry between lanes
124
125 xor t0, t2, t0 # 0 4 restore bit 15
126 unpkbw ta, ta # 3 0
127 and t6, tg, t8 # 2 1
128 maxsw4 t0, zero, t0 # 0 5 clamp below at 0
129
130 bic t3, tg, t3 # 1 2
131 bic t6, tg, t6 # 2 2
132 minsw4 t0, tf, t0 # 0 6 clamp above at 255
133 addq t3, t4, t3 # 1 3
134
135 pkwb t0, t0 # 0 7 pack lanes into bytes
136 xor t3, t5, t3 # 1 4
137 maxsw4 t3, zero, t3 # 1 5
138 addq t6, t7, t6 # 2 3
139
140 xor t6, t8, t6 # 2 4
141 and t9, tg, tb # 3 1
142 minsw4 t3, tf, t3 # 1 6
143 bic t9, tg, t9 # 3 2
144
145 maxsw4 t6, zero, t6 # 2 5
146 addq t9, ta, t9 # 3 3
147 stl t0, 0(a1) # 0 8 store first row, bytes 0-3
148 minsw4 t6, tf, t6 # 2 6
149
150 xor t9, tb, t9 # 3 4
151 maxsw4 t9, zero, t9 # 3 5
152 lda a0, 32(a0) # block += 16 coefficients (32 bytes)
153 pkwb t3, t3 # 1 7
154
155 minsw4 t9, tf, t9 # 3 6
156 subq th, 2, th # two rows are handled per iteration
157 pkwb t6, t6 # 2 7
158 pkwb t9, t9 # 3 7
159
160 stl t3, 4(a1) # 1 8 store first row, bytes 4-7
161 addq te, a2, a1 # pixels = te + line_size (start of the next row pair)
162 stl t6, 0(te) # 2 8 store second row, bytes 0-3
163 stl t9, 4(te) # 3 8 store second row, bytes 4-7
164
165 bne th, 1b # loop until the row counter reaches 0
166 ret
167 .end add_pixels_clamped_mvi_asm