Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * Simple IDCT | |
3 | * | |
4 | * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> | |
5 | * Copyright (c) 2007 Mans Rullgard <mans@mansr.com> | |
6 | * | |
7 | * This file is part of FFmpeg. | |
8 | * | |
9 | * FFmpeg is free software; you can redistribute it and/or | |
10 | * modify it under the terms of the GNU Lesser General Public | |
11 | * License as published by the Free Software Foundation; either | |
12 | * version 2.1 of the License, or (at your option) any later version. | |
13 | * | |
14 | * FFmpeg is distributed in the hope that it will be useful, | |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 | * Lesser General Public License for more details. | |
18 | * | |
19 | * You should have received a copy of the GNU Lesser General Public | |
20 | * License along with FFmpeg; if not, write to the Free Software | |
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
22 | */ | |
23 | ||
24 | #include "libavutil/arm/asm.S" | |
25 | ||
26 | #define W1 22725 /* cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ | |
27 | #define W2 21407 /* cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ | |
28 | #define W3 19266 /* cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ | |
29 | #define W4 16383 /* cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5 evaluates to 16384, this constant is one less. NOTE(review): presumably deliberate, confirm before changing */ | |
30 | #define W5 12873 /* cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ | |
31 | #define W6 8867 /* cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ | |
32 | #define W7 4520 /* cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ | |
33 | #define ROW_SHIFT 11 /* right shift applied to the row pass results, the rounding bias is 1 << (ROW_SHIFT-1) */ | |
34 | #define COL_SHIFT 20 /* right shift applied to the column pass results, the rounding bias is 1 << (COL_SHIFT-1) */ | |
35 | ||
36 | #define W13 (W1 | (W3 << 16)) /* packed pair for the dual 16-bit multiplies: W1 in the low halfword, W3 in the high halfword */ | |
37 | #define W26 (W2 | (W6 << 16)) /* W2 low, W6 high (not referenced in the visible part of this file) */ | |
38 | #define W42 (W4 | (W2 << 16)) /* W4 low, W2 high */ | |
39 | #define W42n (-W4&0xffff | (-W2 << 16)) /* negated W42: -W4 masked to 16 bits in the low halfword, -W2 in the high halfword */ | |
40 | #define W46 (W4 | (W6 << 16)) /* W4 low, W6 high */ | |
41 | #define W57 (W5 | (W7 << 16)) /* W5 low, W7 high */ | |
42 | ||
43 | /* |
44 | Compute partial IDCT of single row: the even sums A0..A3 and the odd sums B0, -B1, B2, B3, without the final butterfly. | |
45 | shift = amount of the right shift applied afterwards by the idct_finish step. It is used here only to form the rounding bias 1 << (shift-1) | |
46 | r0 = source address (row[6,4] is read from offset 4 and row[7,5] from offset 12, r0 itself is not modified) | |
47 | r2 = row[2,0] (row[2] in the high halfword, row[0] in the low halfword) <= 2 cycles | |
48 | r3 = row[3,1] | |
49 | ip = w42 <= 2 cycles | |
50 | ||
51 | Output in registers r4--r11: r4..r7 = A0..A3 (bias included), r8 = B0, r9 = -B1, r10 = B2, r11 = B3. Overwrites r1, r2, r3, ip and lr. | |
52 | */ | |
53 | .macro idct_row shift | |
54 | ldr lr, =W46 /* lr = W4 | (W6 << 16) */ | |
55 | mov r1, #(1<<(\shift-1)) /* r1 = rounding bias */ | |
56 | smlad r4, r2, ip, r1 /* r4 = A0 = W4*row[0] + W2*row[2] + bias */ | |
57 | smlsd r7, r2, ip, r1 /* r7 = A3 = W4*row[0] - W2*row[2] + bias */ | |
58 | ldr ip, =W13 /* ip = W1 | (W3 << 16) */ | |
59 | ldr r10,=W57 /* r10 = W5 | (W7 << 16) */ | |
60 | smlad r5, r2, lr, r1 /* r5 = A1 = W4*row[0] + W6*row[2] + bias */ | |
61 | smlsd r6, r2, lr, r1 /* r6 = A2 = W4*row[0] - W6*row[2] + bias */ | |
62 | ||
63 | smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */ | |
64 | smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */ | |
65 | ldr lr, [r0, #12] /* lr = row[7,5] */ | |
66 | pkhtb r2, ip, r10,asr #16 /* r2 = W7 | (W3 << 16), row[2,0] in r2 is no longer needed */ | |
67 | pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16), the bias in r1 is no longer needed */ | |
68 | smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */ | |
69 | smlad r8, lr, r10,r8 /* B0 += W5*row[5] + W7*row[7] */ | |
70 | smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */ | |
71 | ||
72 | ldr r3, =W42n /* r3 = -W4 | (-W2 << 16), row[3,1] in r3 is no longer needed */ | |
73 | smlad r10,lr, r2, r10 /* B2 += W7*row[5] + W3*row[7] */ | |
74 | ldr r2, [r0, #4] /* r2 = row[6,4] */ | |
75 | smlsdx r11,lr, ip, r11 /* B3 += W3*row[5] - W1*row[7] */ | |
76 | ldr ip, =W46 /* ip = W4 | (W6 << 16) */ | |
77 | smlad r9, lr, r1, r9 /* B1 -= W1*row[5] + W5*row[7] (r9 holds -B1, so the sum is added) */ | |
78 | ||
79 | smlad r5, r2, r3, r5 /* A1 += -W4*row[4] - W2*row[6] */ | |
80 | smlsd r6, r2, r3, r6 /* A2 += -W4*row[4] + W2*row[6] */ | |
81 | smlad r4, r2, ip, r4 /* A0 += W4*row[4] + W6*row[6] */ | |
82 | smlsd r7, r2, ip, r7 /* A3 += W4*row[4] - W6*row[6] */ | |
83 | .endm | |
84 | ||
85 | /* |
86 | Compute partial IDCT of half row: same sums as idct_row but built from row[0..3] only, row[4..7] are not read (the caller uses this when they are all zero). | |
87 | shift = amount of the right shift applied afterwards by the idct_finish step. It is used here only to form the rounding bias 1 << (shift-1) | |
88 | r2 = row[2,0] | |
89 | r3 = row[3,1] | |
90 | ip = w42 | |
91 | ||
92 | Output in registers r4--r11: r4..r7 = A0..A3 (bias included), r8 = B0, r9 = -B1, r10 = B2, r11 = B3. Overwrites r1, r2, ip and lr. | |
93 | */ | |
94 | .macro idct_row4 shift | |
95 | ldr lr, =W46 /* lr = W4 | (W6 << 16) */ | |
96 | ldr r10,=W57 /* r10 = W5 | (W7 << 16) */ | |
97 | mov r1, #(1<<(\shift-1)) /* r1 = rounding bias */ | |
98 | smlad r4, r2, ip, r1 /* r4 = A0 = W4*row[0] + W2*row[2] + bias */ | |
99 | smlsd r7, r2, ip, r1 /* r7 = A3 = W4*row[0] - W2*row[2] + bias */ | |
100 | ldr ip, =W13 /* ip = W1 | (W3 << 16) */ | |
101 | smlad r5, r2, lr, r1 /* r5 = A1 = W4*row[0] + W6*row[2] + bias */ | |
102 | smlsd r6, r2, lr, r1 /* r6 = A2 = W4*row[0] - W6*row[2] + bias */ | |
103 | smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */ | |
104 | smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */ | |
105 | pkhtb r2, ip, r10,asr #16 /* r2 = W7 | (W3 << 16) */ | |
106 | pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */ | |
107 | smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */ | |
108 | smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */ | |
109 | .endm | |
110 | ||
111 | /* |
112 | Compute final part of IDCT single row without shift: the sums and differences of the A and B terms. | |
113 | Input in registers r4--r11 (r4..r7 = A0..A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3) | |
114 | Output in registers ip, r4--r6, lr, r8--r10 (r7 and r11 are left unchanged) | |
115 | */ | |
116 | .macro idct_finish | |
117 | add ip, r4, r8 /* ip = A0 + B0 */ | |
118 | sub lr, r4, r8 /* lr = A0 - B0 */ | |
119 | sub r4, r5, r9 /* r4 = A1 + B1 (r9 holds -B1) */ | |
120 | add r8, r5, r9 /* r8 = A1 - B1 */ | |
121 | add r5, r6, r10 /* r5 = A2 + B2 */ | |
122 | sub r9, r6, r10 /* r9 = A2 - B2 */ | |
123 | add r6, r7, r11 /* r6 = A3 + B3 */ | |
124 | sub r10,r7, r11 /* r10 = A3 - B3 */ | |
125 | .endm | |
126 | ||
127 | /* |
128 | Compute final part of IDCT single row: sums and differences of the A and B terms, each arithmetically shifted right. | |
129 | shift = right-shift amount | |
130 | Input/output in registers r4--r11. In: r4..r7 = A0..A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3. Out: r4..r7 = (Ak + Bk) >> shift, r8..r11 = (Ak - Bk) >> shift for k = 0..3. Overwrites r2 and r3. | |
131 | */ | |
132 | .macro idct_finish_shift shift | |
133 | add r3, r4, r8 /* r3 = A0 + B0 */ | |
134 | sub r2, r4, r8 /* r2 = A0 - B0 */ | |
135 | mov r4, r3, asr #\shift /* r4 = (A0 + B0) >> shift */ | |
136 | mov r8, r2, asr #\shift /* r8 = (A0 - B0) >> shift */ | |
137 | ||
138 | sub r3, r5, r9 /* r3 = A1 + B1 (r9 holds -B1) */ | |
139 | add r2, r5, r9 /* r2 = A1 - B1 */ | |
140 | mov r5, r3, asr #\shift /* r5 = (A1 + B1) >> shift */ | |
141 | mov r9, r2, asr #\shift /* r9 = (A1 - B1) >> shift */ | |
142 | ||
143 | add r3, r6, r10 /* r3 = A2 + B2 */ | |
144 | sub r2, r6, r10 /* r2 = A2 - B2 */ | |
145 | mov r6, r3, asr #\shift /* r6 = (A2 + B2) >> shift */ | |
146 | mov r10,r2, asr #\shift /* r10 = (A2 - B2) >> shift */ | |
147 | ||
148 | add r3, r7, r11 /* r3 = A3 + B3 */ | |
149 | sub r2, r7, r11 /* r2 = A3 - B3 */ | |
150 | mov r7, r3, asr #\shift /* r7 = (A3 + B3) >> shift */ | |
151 | mov r11,r2, asr #\shift /* r11 = (A3 - B3) >> shift */ | |
152 | .endm | |
153 | ||
154 | /* |
155 | Compute final part of IDCT single row, saturating results at 8 bits (unsigned, via usat). | |
156 | shift = right-shift amount, applied before the saturation | |
157 | Input/output in registers r4--r11. In: r4..r7 = A0..A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3. Out: r4..r7 = sat8((Ak + Bk) >> shift), r8..r11 = sat8((Ak - Bk) >> shift) for k = 0..3. Overwrites r3 and ip. | |
158 | */ | |
159 | .macro idct_finish_shift_sat shift | |
160 | add r3, r4, r8 /* r3 = A0 + B0 */ | |
161 | sub ip, r4, r8 /* ip = A0 - B0 */ | |
162 | usat r4, #8, r3, asr #\shift /* r4 = sat8((A0 + B0) >> shift) */ | |
163 | usat r8, #8, ip, asr #\shift /* r8 = sat8((A0 - B0) >> shift) */ | |
164 | ||
165 | sub r3, r5, r9 /* r3 = A1 + B1 (r9 holds -B1) */ | |
166 | add ip, r5, r9 /* ip = A1 - B1 */ | |
167 | usat r5, #8, r3, asr #\shift /* r5 = sat8((A1 + B1) >> shift) */ | |
168 | usat r9, #8, ip, asr #\shift /* r9 = sat8((A1 - B1) >> shift) */ | |
169 | ||
170 | add r3, r6, r10 /* r3 = A2 + B2 */ | |
171 | sub ip, r6, r10 /* ip = A2 - B2 */ | |
172 | usat r6, #8, r3, asr #\shift /* r6 = sat8((A2 + B2) >> shift) */ | |
173 | usat r10,#8, ip, asr #\shift /* r10 = sat8((A2 - B2) >> shift) */ | |
174 | ||
175 | add r3, r7, r11 /* r3 = A3 + B3 */ | |
176 | sub ip, r7, r11 /* ip = A3 - B3 */ | |
177 | usat r7, #8, r3, asr #\shift /* r7 = sat8((A3 + B3) >> shift) */ | |
178 | usat r11,#8, ip, asr #\shift /* r11 = sat8((A3 - B3) >> shift) */ | |
179 | .endm | |
180 | ||
181 | /* |
182 | Compute IDCT of single row, storing as column: the eight results are written as halfwords 16 bytes apart. | |
183 | r0 = source (four packed words row[2,0], row[6,4], row[3,1], row[7,5] at offsets 0, 4, 8, 12), not modified | |
184 | r1 = dest, same value on return. May overwrite r2--r11 and ip, returns through the saved lr. | |
185 | */ | |
186 | function idct_row_armv6 | |
187 | push {lr} /* lr is used as a scratch register below */ | |
188 | ||
189 | ldr lr, [r0, #12] /* lr = row[7,5] */ | |
190 | ldr ip, [r0, #4] /* ip = row[6,4] */ | |
191 | ldr r3, [r0, #8] /* r3 = row[3,1] */ | |
192 | ldr r2, [r0] /* r2 = row[2,0] */ | |
193 | orrs lr, lr, ip /* lr = row[7,5] | row[6,4], Z set when row[4..7] are all zero */ | |
194 | itt eq /* the two compares below run only while everything tested so far was zero */ | |
195 | cmpeq lr, r3 /* lr is 0 here, so Z stays set only if row[1] and row[3] are zero */ | |
196 | cmpeq lr, r2, lsr #16 /* Z stays set only if row[2] is zero as well */ | |
197 | beq 1f /* only row[0] can be non-zero: take the DC shortcut */ | |
198 | push {r1} /* the idct_row macros overwrite r1 */ | |
199 | ldr ip, =W42 /* ip = W4 | (W2 << 16) */ | |
200 | cmp lr, #0 /* lr still holds row[7,5] | row[6,4] */ | |
201 | beq 2f /* row[4..7] all zero: use the shorter idct_row4 */ | |
202 | ||
203 | idct_row ROW_SHIFT | |
204 | b 3f | |
205 | ||
206 | 2: idct_row4 ROW_SHIFT | |
207 | ||
208 | 3: pop {r1} /* restore dest */ | |
209 | idct_finish_shift ROW_SHIFT /* r4..r7 = (Ak + Bk) >> ROW_SHIFT, r8..r11 = (Ak - Bk) >> ROW_SHIFT */ | |
210 | ||
211 | strh r4, [r1] /* results r4, r5, r6, r7 go to offsets 16*0, 16*2, 16*4, 16*6 */ | |
212 | strh r5, [r1, #(16*2)] | |
213 | strh r6, [r1, #(16*4)] | |
214 | strh r7, [r1, #(16*6)] | |
215 | strh r11,[r1, #(16*1)] /* results r11, r10, r9, r8 go to offsets 16*1, 16*3, 16*5, 16*7 */ | |
216 | strh r10,[r1, #(16*3)] | |
217 | strh r9, [r1, #(16*5)] | |
218 | strh r8, [r1, #(16*7)] | |
219 | ||
220 | pop {pc} | |
221 | ||
222 | 1: mov r2, r2, lsl #3 /* DC shortcut: every output is row[0] << 3, which stands in for (W4*row[0] + bias) >> ROW_SHIFT since W4 is close to 1 << 14 */ | |
223 | strh r2, [r1] /* strh stores the low halfword only, the shifted row[2] bits in the top half are zero anyway on this path */ | |
224 | strh r2, [r1, #(16*2)] | |
225 | strh r2, [r1, #(16*4)] | |
226 | strh r2, [r1, #(16*6)] | |
227 | strh r2, [r1, #(16*1)] | |
228 | strh r2, [r1, #(16*3)] | |
229 | strh r2, [r1, #(16*5)] | |
230 | strh r2, [r1, #(16*7)] | |
231 | pop {pc} | |
232 | endfunc | |
233 | ||
234 | /* |
235 | Compute IDCT of single column, read as row: always runs the full idct_row, then stores the eight results as halfwords 16 bytes apart in natural order. | |
236 | r0 = source (packed words as expected by idct_row), not modified | |
237 | r1 = dest, same value on return. Overwrites r2--r11 and ip, returns through the saved lr. | |
238 | */ | |
239 | function idct_col_armv6 | |
240 | push {r1, lr} /* idct_row overwrites both r1 and lr */ | |
241 | ||
242 | ldr r2, [r0] /* r2 = row[2,0] */ | |
243 | ldr ip, =W42 /* ip = W4 | (W2 << 16) */ | |
244 | ldr r3, [r0, #8] /* r3 = row[3,1] */ | |
245 | idct_row COL_SHIFT | |
246 | pop {r1} /* restore dest, the return address stays on the stack */ | |
247 | idct_finish_shift COL_SHIFT /* r4..r7 = (Ak + Bk) >> COL_SHIFT, r8..r11 = (Ak - Bk) >> COL_SHIFT */ | |
248 | ||
249 | strh r4, [r1] /* (A0 + B0) >> COL_SHIFT */ | |
250 | strh r5, [r1, #(16*1)] /* (A1 + B1) >> COL_SHIFT */ | |
251 | strh r6, [r1, #(16*2)] /* (A2 + B2) >> COL_SHIFT */ | |
252 | strh r7, [r1, #(16*3)] /* (A3 + B3) >> COL_SHIFT */ | |
253 | strh r11,[r1, #(16*4)] /* (A3 - B3) >> COL_SHIFT */ | |
254 | strh r10,[r1, #(16*5)] /* (A2 - B2) >> COL_SHIFT */ | |
255 | strh r9, [r1, #(16*6)] /* (A1 - B1) >> COL_SHIFT */ | |
256 | strh r8, [r1, #(16*7)] /* (A0 - B0) >> COL_SHIFT */ | |
257 | ||
258 | pop {pc} | |
259 | endfunc | |
260 | ||
261 | /* |
262 | Compute IDCT of single column, read as row, store saturated 8-bit. | |
263 | r0 = source (packed words as expected by idct_row), not modified | |
264 | r1 = dest, one byte is written per line, same value on return | |
265 | r2 = line size. NOTE(review): strb_post is defined outside this file, it is assumed to store a byte at r1 and then add r2 to r1, confirm. Overwrites r3--r11 and ip. | |
266 | */ | |
267 | function idct_col_put_armv6 | |
268 | push {r1, r2, lr} /* idct_row overwrites r1, r2 and lr */ | |
269 | ||
270 | ldr r2, [r0] /* r2 = row[2,0] */ | |
271 | ldr ip, =W42 /* ip = W4 | (W2 << 16) */ | |
272 | ldr r3, [r0, #8] /* r3 = row[3,1] */ | |
273 | idct_row COL_SHIFT | |
274 | pop {r1, r2} /* restore dest and line size, the return address stays on the stack */ | |
275 | idct_finish_shift_sat COL_SHIFT /* r4..r11 = shifted results saturated to unsigned 8 bits */ | |
276 | ||
277 | strb_post r4, r1, r2 /* sat8((A0 + B0) >> COL_SHIFT) */ | |
278 | strb_post r5, r1, r2 /* sat8((A1 + B1) >> COL_SHIFT) */ | |
279 | strb_post r6, r1, r2 /* sat8((A2 + B2) >> COL_SHIFT) */ | |
280 | strb_post r7, r1, r2 /* sat8((A3 + B3) >> COL_SHIFT) */ | |
281 | strb_post r11,r1, r2 /* sat8((A3 - B3) >> COL_SHIFT) */ | |
282 | strb_post r10,r1, r2 /* sat8((A2 - B2) >> COL_SHIFT) */ | |
283 | strb_post r9, r1, r2 /* sat8((A1 - B1) >> COL_SHIFT) */ | |
284 | strb_post r8, r1, r2 /* sat8((A0 - B0) >> COL_SHIFT) */ | |
285 | ||
286 | sub r1, r1, r2, lsl #3 /* undo the eight line-size steps so r1 is the caller's dest again */ | |
287 | ||
288 | pop {pc} | |
289 | endfunc | |
290 | ||
291 | /* |
292 | Compute IDCT of single column, read as row, add/store saturated 8-bit: each shifted result is added to the byte already at dest, saturated to unsigned 8 bits and stored back. | |
293 | r0 = source (packed words as expected by idct_row), not modified | |
294 | r1 = dest, one byte is read and written per line, same value on return | |
295 | r2 = line size. NOTE(review): strb_post is defined outside this file, it is assumed to store a byte at r1 and then add r2 to r1, confirm. The "line k" notes below rely on that. Overwrites r3--r11 and ip. | |
296 | */ | |
297 | function idct_col_add_armv6 | |
298 | push {r1, r2, lr} /* idct_row overwrites r1, r2 and lr */ | |
299 | ||
300 | ldr r2, [r0] /* r2 = row[2,0] */ | |
301 | ldr ip, =W42 /* ip = W4 | (W2 << 16) */ | |
302 | ldr r3, [r0, #8] /* r3 = row[3,1] */ | |
303 | idct_row COL_SHIFT | |
304 | pop {r1, r2} /* restore dest and line size, the return address stays on the stack */ | |
305 | idct_finish /* unshifted results for lines 0..7 are now in ip, r4, r5, r6, r10, r9, r8, lr */ | |
306 | ||
307 | ldrb r3, [r1] /* r3 = dest byte of line 0 */ | |
308 | ldrb r7, [r1, r2] /* r7 = dest byte of line 1 */ | |
309 | ldrb r11,[r1, r2, lsl #2] /* r11 = dest byte of line 4. NOTE(review): r11 is reloaded below before this value is used, so the load looks redundant unless it is meant to touch that line early, confirm */ | |
310 | add ip, r3, ip, asr #COL_SHIFT /* line 0: dest + ((A0 + B0) >> COL_SHIFT) */ | |
311 | usat ip, #8, ip | |
312 | add r4, r7, r4, asr #COL_SHIFT /* line 1: dest + ((A1 + B1) >> COL_SHIFT) */ | |
313 | strb_post ip, r1, r2 /* store line 0, r1 moves to line 1 */ | |
314 | ldrb ip, [r1, r2] /* ip = dest byte of line 2 */ | |
315 | usat r4, #8, r4 | |
316 | ldrb r11,[r1, r2, lsl #2] /* r11 = dest byte of line 5 */ | |
317 | add r5, ip, r5, asr #COL_SHIFT /* line 2: dest + ((A2 + B2) >> COL_SHIFT) */ | |
318 | usat r5, #8, r5 | |
319 | strb_post r4, r1, r2 /* store line 1, r1 moves to line 2 */ | |
320 | ldrb r3, [r1, r2] /* r3 = dest byte of line 3 */ | |
321 | ldrb ip, [r1, r2, lsl #2] /* ip = dest byte of line 6 */ | |
322 | strb_post r5, r1, r2 /* store line 2, r1 moves to line 3 */ | |
323 | ldrb r7, [r1, r2] /* r7 = dest byte of line 4 */ | |
324 | ldrb r4, [r1, r2, lsl #2] /* r4 = dest byte of line 7 */ | |
325 | add r6, r3, r6, asr #COL_SHIFT /* line 3: dest + ((A3 + B3) >> COL_SHIFT) */ | |
326 | usat r6, #8, r6 | |
327 | add r10,r7, r10,asr #COL_SHIFT /* line 4: dest + ((A3 - B3) >> COL_SHIFT) */ | |
328 | usat r10,#8, r10 | |
329 | add r9, r11,r9, asr #COL_SHIFT /* line 5: dest + ((A2 - B2) >> COL_SHIFT) */ | |
330 | usat r9, #8, r9 | |
331 | add r8, ip, r8, asr #COL_SHIFT /* line 6: dest + ((A1 - B1) >> COL_SHIFT) */ | |
332 | usat r8, #8, r8 | |
333 | add lr, r4, lr, asr #COL_SHIFT /* line 7: dest + ((A0 - B0) >> COL_SHIFT) */ | |
334 | usat lr, #8, lr | |
335 | strb_post r6, r1, r2 /* store lines 3..7 */ | |
336 | strb_post r10,r1, r2 | |
337 | strb_post r9, r1, r2 | |
338 | strb_post r8, r1, r2 | |
339 | strb_post lr, r1, r2 | |
340 | ||
341 | sub r1, r1, r2, lsl #3 /* undo the eight line-size steps so r1 is the caller's dest again */ | |
342 | ||
343 | pop {pc} | |
344 | endfunc | |
345 | ||
346 | /* |
347 | Compute 8 IDCT row transforms by calling func eight times. Source rows are visited at byte offsets 0, 32, 64, 96, 16, 48, 80, 112 from the initial r0. | |
348 | func = IDCT row->col function, called with r0 = source row and r1 = dest. It must return with r0 and r1 unchanged. | |
349 | width = width of columns in bytes, added to r1 after each of the first seven calls. On exit r0 is back at its initial value and r1 is 7*width past its initial value. lr is overwritten by bl. | |
350 | */ | |
351 | .macro idct_rows func width | |
352 | bl \func /* source offset 0 */ | |
353 | add r0, r0, #(16*2) | |
354 | add r1, r1, #\width | |
355 | bl \func /* source offset 32 */ | |
356 | add r0, r0, #(16*2) | |
357 | add r1, r1, #\width | |
358 | bl \func /* source offset 64 */ | |
359 | add r0, r0, #(16*2) | |
360 | add r1, r1, #\width | |
361 | bl \func /* source offset 96 */ | |
362 | sub r0, r0, #(16*5) /* step back from offset 96 to offset 16 for the second group of rows */ | |
363 | add r1, r1, #\width | |
364 | bl \func /* source offset 16 */ | |
365 | add r0, r0, #(16*2) | |
366 | add r1, r1, #\width | |
367 | bl \func /* source offset 48 */ | |
368 | add r0, r0, #(16*2) | |
369 | add r1, r1, #\width | |
370 | bl \func /* source offset 80 */ | |
371 | add r0, r0, #(16*2) | |
372 | add r1, r1, #\width | |
373 | bl \func /* source offset 112 */ | |
374 | ||
375 | sub r0, r0, #(16*7) /* restore r0 to the start of the source block */ | |
376 | .endm | |
377 | ||
378 | /* void ff_simple_idct_armv6(int16_t *data); In-place transform of the 128-byte block at data (r0): a row pass into a stack buffer, then a column pass back into data. */ | |
379 | function ff_simple_idct_armv6, export=1 | |
380 | push {r4-r11, lr} /* the helpers overwrite r4--r11 */ | |
381 | sub sp, sp, #128 /* 128-byte temporary buffer for the row pass output */ | |
382 | ||
383 | mov r1, sp /* row pass: r0 = data, r1 = temporary buffer */ | |
384 | idct_rows idct_row_armv6, 2 | |
385 | mov r1, r0 /* column pass: dest = data (idct_rows left r0 at its initial value) */ | |
386 | mov r0, sp /* column pass: source = temporary buffer */ | |
387 | idct_rows idct_col_armv6, 2 | |
388 | ||
389 | add sp, sp, #128 /* release the temporary buffer */ | |
390 | pop {r4-r11, pc} | |
391 | endfunc | |
392 | ||
393 | /* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, int16_t *data); Row pass from data (r2) into a stack buffer, then a column pass that adds the results to the bytes at dest (r0) with saturation, lines being line_size (r1) bytes apart. */ | |
394 | function ff_simple_idct_add_armv6, export=1 | |
395 | push {r0, r1, r4-r11, lr} /* dest and line_size are kept on the stack for the column pass */ | |
396 | sub sp, sp, #128 /* 128-byte temporary buffer for the row pass output */ | |
397 | ||
398 | mov r0, r2 /* row pass: source = data */ | |
399 | mov r1, sp /* row pass: dest = temporary buffer */ | |
400 | idct_rows idct_row_armv6, 2 | |
401 | mov r0, sp /* column pass: source = temporary buffer */ | |
402 | ldr r1, [sp, #128] /* r1 = saved dest */ | |
403 | ldr r2, [sp, #(128+4)] /* r2 = saved line_size */ | |
404 | idct_rows idct_col_add_armv6, 1 | |
405 | ||
406 | add sp, sp, #(128+8) /* release the buffer and drop the saved r0 and r1 */ | |
407 | pop {r4-r11, pc} | |
408 | endfunc | |
409 | ||
410 | /* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, int16_t *data); Row pass from data (r2) into a stack buffer, then a column pass that stores the saturated 8-bit results at dest (r0), lines being line_size (r1) bytes apart. */ | |
411 | function ff_simple_idct_put_armv6, export=1 | |
412 | push {r0, r1, r4-r11, lr} /* dest and line_size are kept on the stack for the column pass */ | |
413 | sub sp, sp, #128 /* 128-byte temporary buffer for the row pass output */ | |
414 | ||
415 | mov r0, r2 /* row pass: source = data */ | |
416 | mov r1, sp /* row pass: dest = temporary buffer */ | |
417 | idct_rows idct_row_armv6, 2 | |
418 | mov r0, sp /* column pass: source = temporary buffer */ | |
419 | ldr r1, [sp, #128] /* r1 = saved dest */ | |
420 | ldr r2, [sp, #(128+4)] /* r2 = saved line_size */ | |
421 | idct_rows idct_col_put_armv6, 1 | |
422 | ||
423 | add sp, sp, #(128+8) /* release the buffer and drop the saved r0 and r1 */ | |
424 | pop {r4-r11, pc} | |
425 | endfunc | |