Commit | Line | Data |
---|---|---|
2ba45a60 DM |
/*
C-like prototype :
        void ff_j_rev_dct_arm(DCTBLOCK data)

With DCTBLOCK being a pointer to an array of 64 'signed shorts'

Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

*/
27 | ||
28 | #include "libavutil/arm/asm.S" | |
29 | ||
/*
 * Fixed-point multipliers.  Each value is the real constant spelled out in
 * the macro name multiplied by 2^13 (8192) and rounded to an integer, e.g.
 * 0.541196100 * 8192 ~= 4433.  A "FIX_M_" prefix marks a negated constant.
 * FIX_0xFFFF is a plain 16-bit mask, kept in the same table.
 */
#define FIX_0_298631336          2446
#define FIX_0_541196100          4433
#define FIX_0_765366865          6270
#define FIX_1_175875602          9633
#define FIX_1_501321110          12299
#define FIX_2_053119869          16819
#define FIX_3_072711026          25172
#define FIX_M_0_390180644        -3196
#define FIX_M_0_899976223        -7373
#define FIX_M_1_847759065        -15137
#define FIX_M_1_961570560        -16069
#define FIX_M_2_562915447        -20995
#define FIX_0xFFFF               0xFFFF

/*
 * Byte offset of each constant inside const_array (one 32-bit word per
 * entry, so offset = table index * 4).  The order here must stay in step
 * with the order of the .word entries of const_array.
 */
#define FIX_0_298631336_ID       ( 0 * 4)
#define FIX_0_541196100_ID       ( 1 * 4)
#define FIX_0_765366865_ID       ( 2 * 4)
#define FIX_1_175875602_ID       ( 3 * 4)
#define FIX_1_501321110_ID       ( 4 * 4)
#define FIX_2_053119869_ID       ( 5 * 4)
#define FIX_3_072711026_ID       ( 6 * 4)
#define FIX_M_0_390180644_ID     ( 7 * 4)
#define FIX_M_0_899976223_ID     ( 8 * 4)
#define FIX_M_1_847759065_ID     ( 9 * 4)
#define FIX_M_1_961570560_ID     (10 * 4)
#define FIX_M_2_562915447_ID     (11 * 4)
#define FIX_0xFFFF_ID            (12 * 4)
57 | ||
/*
 * Helper macros for ff_j_rev_dct_arm.  They only name instruction sequences
 * that the function would otherwise spell out many times; every expansion
 * is the same instruction sequence as the hand-written original.
 */

/*
 * Odd part of the 1-D transform, shared by the row and the column pass.
 *   in:       r1 = d1, r3 = d3, r5 = d5, r7 = d7, r11 = const_array
 *   out:      r1 = tmp3, r3 = tmp2, r5 = tmp1, r7 = tmp0
 *   clobbers: r0, r2, r4, r6, r8, r9, r10
 * Loads of the next constant are interleaved with the multiplies on purpose;
 * the instruction order is kept exactly as originally scheduled.
 */
.macro  jrevdct_odd_part
        add     r0, r3, r5                      @ r0 = d3 + d5 (becomes z2)
        add     r2, r1, r7                      @ r2 = d1 + d7 (becomes z1)
        add     r4, r3, r7                      @ r4 = d3 + d7 (becomes z3)
        add     r6, r1, r5                      @ r6 = d1 + d5 (becomes z4)
        ldr     r9, [r11, #FIX_1_175875602_ID]
        add     r8, r4, r6                      @ r8 = z3 + z4
        ldr     r10, [r11, #FIX_M_0_899976223_ID]
        mul     r8, r9, r8                      @ r8 = z5
        ldr     r9, [r11, #FIX_M_2_562915447_ID]
        mul     r2, r10, r2                     @ r2 = z1
        ldr     r10, [r11, #FIX_M_1_961570560_ID]
        mul     r0, r9, r0                      @ r0 = z2
        ldr     r9, [r11, #FIX_M_0_390180644_ID]
        mla     r4, r10, r4, r8                 @ r4 = z3
        ldr     r10, [r11, #FIX_0_298631336_ID]
        mla     r6, r9, r6, r8                  @ r6 = z4
        ldr     r9, [r11, #FIX_2_053119869_ID]
        mla     r7, r10, r7, r2                 @ r7 = tmp0 + z1
        ldr     r10, [r11, #FIX_3_072711026_ID]
        mla     r5, r9, r5, r0                  @ r5 = tmp1 + z2
        ldr     r9, [r11, #FIX_1_501321110_ID]
        mla     r3, r10, r3, r0                 @ r3 = tmp2 + z2
        add     r7, r7, r4                      @ r7 = tmp0
        mla     r1, r9, r1, r2                  @ r1 = tmp3 + z1
        add     r5, r5, r6                      @ r5 = tmp1
        add     r3, r3, r4                      @ r3 = tmp2
        add     r1, r1, r6                      @ r1 = tmp3
.endm

/*
 * Row pass output: r8 = (a op b), rounded and shifted right by 11 bits
 * (the original comments call this DESCALE(x, CONST_BITS-PASS1_BITS)),
 * stored as a halfword to element number idx of the row at lr.
 *   op = add or sub; clobbers r8.
 */
.macro  jrevdct_row_store op, a, b, idx
        \op     r8, \a, \b
        add     r8, r8, #(1<<10)
        mov     r8, r8, asr #11
        strh    r8, [lr, #(\idx * 2)]
.endm

/*
 * Column pass output: r8 = (a op b), rounded and shifted right by 18 bits
 * (DESCALE(x, CONST_BITS+PASS1_BITS+3) in the original comments), stored as
 * a halfword to row number row of the column at lr (rows are 16 bytes apart).
 *   op = add or sub; clobbers r8.
 */
.macro  jrevdct_col_store op, a, b, row
        \op     r8, \a, \b
        add     r8, r8, #(1<<17)
        mov     r8, r8, asr #18
        strh    r8, [lr, #(\row * 16)]
.endm

/*
 * Column pass output when the odd part is zero: tmpN + 0 and tmpN - 0 are
 * the same value, so reg is descaled once (in place) and stored to both rows.
 */
.macro  jrevdct_col_store_pair reg, row_a, row_b
        add     \reg, \reg, #(1<<17)
        mov     \reg, \reg, asr #18
        strh    \reg, [lr, #(\row_a * 16)]
        strh    \reg, [lr, #(\row_b * 16)]
.endm

/*
 * ff_j_rev_dct_arm: reverse (inverse) DCT of one 8x8 block, done in place.
 *
 * In:       r0 = pointer to the block of 64 signed 16-bit values.  The row
 *                pass reads every row with 32-bit loads, so the block is
 *                expected to be 4-byte aligned.
 * Out:      the block is overwritten; no value is returned in a register.
 * Clobbers: r0-r3, r12, flags.  r4-r11 are saved on entry and restored.
 * Stack:    10 words of saved registers, plus 4 words of scratch while a
 *           row or column is being processed.
 *
 * Register roles inside both passes:
 *   lr  = pointer to the current row / column
 *   r12 = rows / columns still to do (counts down from 8)
 *   r11 = base address of const_array
 *
 * Coefficient names (d0..d7, tmpN, zN) follow the original comments.  In
 * the row pass the halfwords at byte offsets 0, 2, 4, 6 are used as
 * d0, d2, d4, d6 and those at 8, 10, 12, 14 as d1, d3, d5, d7; results are
 * written back in natural order.  In the column pass dN is taken from row N.
 */
function ff_j_rev_dct_arm, export=1
        @ r0 is pushed as well: it sits at the lowest address and is
        @ popped back into lr when the column pass starts.
        push    {r0, r4 - r11, lr}

        mov     lr, r0                          @ lr  = pointer to the current row
        mov     r12, #8                         @ r12 = row counter
        movrel  r11, const_array                @ r11 = base pointer to the constants array

        @ ---------------------------------------------------------------
        @ Pass 1: one iteration per row (16 bytes each).
        @ ---------------------------------------------------------------
row_loop:
        ldrsh   r0, [lr, # 0]                   @ r0 = d0
        ldrsh   r2, [lr, # 2]                   @ r2 = d2

        @ Shortcut for rows where everything except the first value is 0.
        @ The row is read as four words (relies on 4-byte alignment).
        ldr     r5, [lr, # 0]
        ldr     r6, [lr, # 4]
        ldr     r3, [lr, # 8]
        ldr     r4, [lr, #12]
        orr     r3, r3, r4
        orr     r3, r3, r6                      @ r3 = OR of the halfwords at offsets 4..14
        orrs    r5, r3, r5
        beq     end_of_row_loop                 @ all eight values are 0: nothing to do
        orrs    r3, r3, r2
        beq     empty_row                       @ only the first value is non-zero

        ldrsh   r1, [lr, # 8]                   @ r1 = d1
        ldrsh   r4, [lr, # 4]                   @ r4 = d4
        ldrsh   r6, [lr, # 6]                   @ r6 = d6

        @ Even part
        ldr     r3, [r11, #FIX_0_541196100_ID]
        add     r7, r2, r6
        ldr     r5, [r11, #FIX_M_1_847759065_ID]
        mul     r7, r3, r7                      @ r7 = z1
        ldr     r3, [r11, #FIX_0_765366865_ID]
        mla     r6, r5, r6, r7                  @ r6 = tmp2
        add     r5, r0, r4                      @ r5 = tmp0
        mla     r2, r3, r2, r7                  @ r2 = tmp3
        sub     r3, r0, r4                      @ r3 = tmp1

        add     r0, r2, r5, lsl #13             @ r0 = tmp10
        rsb     r2, r2, r5, lsl #13             @ r2 = tmp13
        add     r4, r6, r3, lsl #13             @ r4 = tmp11
        rsb     r3, r6, r3, lsl #13             @ r3 = tmp12

        @ Stored in register-number order: tmp10, tmp13, tmp12, tmp11
        push    {r0, r2, r3, r4}

        @ Odd part
        ldrsh   r3, [lr, #10]                   @ r3 = d3
        ldrsh   r5, [lr, #12]                   @ r5 = d5
        ldrsh   r7, [lr, #14]                   @ r7 = d7

        jrevdct_odd_part

        @ Reloaded into different registers than they were pushed from:
        @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
        @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0
        pop     {r0, r2, r4, r6}

        @ tmp10 +/- tmp3 -> elements 0 and 7
        jrevdct_row_store add, r0, r1, 0
        jrevdct_row_store sub, r0, r1, 7
        @ tmp11 +/- tmp2 -> elements 1 and 6
        jrevdct_row_store add, r6, r3, 1
        jrevdct_row_store sub, r6, r3, 6
        @ tmp12 +/- tmp1 -> elements 2 and 5
        jrevdct_row_store add, r4, r5, 2
        jrevdct_row_store sub, r4, r5, 5
        @ tmp13 +/- tmp0 -> elements 3 and 4
        jrevdct_row_store add, r2, r7, 3
        jrevdct_row_store sub, r2, r7, 4

        @ End of row loop (full path)
        add     lr, lr, #16
        subs    r12, r12, #1
        bne     row_loop
        b       start_column_loop               @ Z is set here, so this was always taken as beq

empty_row:
        @ Only d0 is non-zero: every output of the row is d0 << 2, which is
        @ what the full path yields ((d0 << 13) + (1 << 10)) >> 11.
        @ The value is masked to 16 bits, duplicated into both halves of a
        @ word and written with four word stores.
        ldr     r1, [r11, #FIX_0xFFFF_ID]
        mov     r0, r0, lsl #2
        and     r0, r0, r1
        add     r0, r0, r0, lsl #16
        str     r0, [lr, # 0]
        str     r0, [lr, # 4]
        str     r0, [lr, # 8]
        str     r0, [lr, #12]

end_of_row_loop:
        @ End of row loop (shortcut paths); falls through when done
        add     lr, lr, #16
        subs    r12, r12, #1
        bne     row_loop

        @ ---------------------------------------------------------------
        @ Pass 2: one iteration per column (2 bytes apart, rows 16 apart).
        @ ---------------------------------------------------------------
start_column_loop:
        pop     {lr}                            @ lr = block pointer (the r0 saved on entry)
        mov     r12, #8                         @ r12 = column counter
column_loop:
        ldrsh   r0, [lr, #(0*16)]               @ r0 = d0
        ldrsh   r2, [lr, #(2*16)]               @ r2 = d2
        ldrsh   r4, [lr, #(4*16)]               @ r4 = d4
        ldrsh   r6, [lr, #(6*16)]               @ r6 = d6

        @ Even part
        ldr     r3, [r11, #FIX_0_541196100_ID]
        add     r1, r2, r6
        ldr     r5, [r11, #FIX_M_1_847759065_ID]
        mul     r1, r3, r1                      @ r1 = z1
        ldr     r3, [r11, #FIX_0_765366865_ID]
        mla     r6, r5, r6, r1                  @ r6 = tmp2
        add     r5, r0, r4                      @ r5 = tmp0
        mla     r2, r3, r2, r1                  @ r2 = tmp3
        sub     r3, r0, r4                      @ r3 = tmp1

        add     r0, r2, r5, lsl #13             @ r0 = tmp10
        rsb     r2, r2, r5, lsl #13             @ r2 = tmp13
        add     r4, r6, r3, lsl #13             @ r4 = tmp11
        rsb     r6, r6, r3, lsl #13             @ r6 = tmp12

        ldrsh   r1, [lr, #(1*16)]               @ r1 = d1
        ldrsh   r3, [lr, #(3*16)]               @ r3 = d3
        ldrsh   r5, [lr, #(5*16)]               @ r5 = d5
        ldrsh   r7, [lr, #(7*16)]               @ r7 = d7

        @ Check for empty odd column (happens about 20 to 25 % of the time
        @ according to the original author)
        orr     r9, r1, r3
        orr     r10, r5, r7
        orrs    r10, r9, r10
        beq     empty_odd_column

        @ Saved and restored with the same register list, so the roles do
        @ not move: tmp10, tmp13, tmp11, tmp12
        push    {r0, r2, r4, r6}

        jrevdct_odd_part

        @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
        @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0
        pop     {r0, r2, r4, r6}

        @ tmp10 +/- tmp3 -> rows 0 and 7
        jrevdct_col_store add, r0, r1, 0
        jrevdct_col_store sub, r0, r1, 7
        @ tmp11 +/- tmp2 -> rows 1 and 6
        jrevdct_col_store add, r4, r3, 1
        jrevdct_col_store sub, r4, r3, 6
        @ tmp12 +/- tmp1 -> rows 2 and 5
        jrevdct_col_store add, r6, r5, 2
        jrevdct_col_store sub, r6, r5, 5
        @ tmp13 +/- tmp0 -> rows 3 and 4
        jrevdct_col_store add, r2, r7, 3
        jrevdct_col_store sub, r2, r7, 4

        @ End of column loop (full path)
        add     lr, lr, #2
        subs    r12, r12, #1
        bne     column_loop
        b       the_end                         @ Z is set here, so this was always taken as beq

empty_odd_column:
        @ d1 = d3 = d5 = d7 = 0, hence tmp0..tmp3 = 0: each even-part value
        @ is descaled once and written to its two mirrored rows.
        jrevdct_col_store_pair r0, 0, 7         @ tmp10 -> rows 0 and 7
        jrevdct_col_store_pair r4, 1, 6         @ tmp11 -> rows 1 and 6
        jrevdct_col_store_pair r6, 2, 5         @ tmp12 -> rows 2 and 5
        jrevdct_col_store_pair r2, 3, 4         @ tmp13 -> rows 3 and 4

        @ End of column loop (empty odd part); falls through when done
        add     lr, lr, #2
        subs    r12, r12, #1
        bne     column_loop

the_end:
        @ Restore the callee-saved registers and return (saved lr -> pc)
        pop     {r4 - r11, pc}
endfunc
368 | ||
/*
 * Constants table addressed through r11 by ff_j_rev_dct_arm.
 * One 32-bit word per entry; the position of each entry must match the
 * corresponding *_ID byte offset defined near the top of this file.
 */
const const_array
        .word   FIX_0_298631336                 @ byte offset  0
        .word   FIX_0_541196100                 @ byte offset  4
        .word   FIX_0_765366865                 @ byte offset  8
        .word   FIX_1_175875602                 @ byte offset 12
        .word   FIX_1_501321110                 @ byte offset 16
        .word   FIX_2_053119869                 @ byte offset 20
        .word   FIX_3_072711026                 @ byte offset 24
        .word   FIX_M_0_390180644               @ byte offset 28
        .word   FIX_M_0_899976223               @ byte offset 32
        .word   FIX_M_1_847759065               @ byte offset 36
        .word   FIX_M_1_961570560               @ byte offset 40
        .word   FIX_M_2_562915447               @ byte offset 44
        .word   FIX_0xFFFF                      @ byte offset 48
endconst