/*
 * ARM NEON IDCT
 *
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * Based on Simple IDCT
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
25 | ||
26 | #include "libavutil/arm/asm.S" | |
27 | ||
28 | #define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
29 | #define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
30 | #define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
31 | #define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
32 | #define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
33 | #define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
34 | #define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
35 | #define W4c ((1<<(COL_SHIFT-1))/W4) | |
36 | #define ROW_SHIFT 11 | |
37 | #define COL_SHIFT 20 | |
38 | ||
39 | #define w1 d0[0] | |
40 | #define w2 d0[1] | |
41 | #define w3 d0[2] | |
42 | #define w4 d0[3] | |
43 | #define w5 d1[0] | |
44 | #define w6 d1[1] | |
45 | #define w7 d1[2] | |
46 | #define w4c d1[3] | |
47 | ||
48 | .macro idct_col4_top | |
49 | vmull.s16 q7, d6, w2 /* q9 = W2 * col[2] */ | |
50 | vmull.s16 q8, d6, w6 /* q10 = W6 * col[2] */ | |
51 | vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */ | |
52 | vadd.i32 q11, q15, q7 | |
53 | vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */ | |
54 | vadd.i32 q12, q15, q8 | |
55 | vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */ | |
56 | vsub.i32 q13, q15, q8 | |
57 | vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */ | |
58 | vsub.i32 q14, q15, q7 | |
59 | ||
60 | vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */ | |
61 | vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */ | |
62 | vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */ | |
63 | vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */ | |
64 | .endm | |
65 | ||
66 | .text | |
67 | .align 6 | |
68 | ||
69 | function idct_row4_pld_neon | |
70 | pld [r0] | |
71 | add r3, r0, r1, lsl #2 | |
72 | pld [r0, r1] | |
73 | pld [r0, r1, lsl #1] | |
74 | A pld [r3, -r1] | |
75 | pld [r3] | |
76 | pld [r3, r1] | |
77 | add r3, r3, r1, lsl #1 | |
78 | pld [r3] | |
79 | pld [r3, r1] | |
80 | endfunc | |
81 | ||
82 | function idct_row4_neon | |
83 | vmov.i32 q15, #(1<<(ROW_SHIFT-1)) | |
84 | vld1.64 {d2-d5}, [r2,:128]! | |
85 | vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */ | |
86 | vld1.64 {d6,d7}, [r2,:128]! | |
87 | vorr d10, d3, d5 | |
88 | vld1.64 {d8,d9}, [r2,:128]! | |
89 | add r2, r2, #-64 | |
90 | ||
91 | vorr d11, d7, d9 | |
92 | vorr d10, d10, d11 | |
93 | vmov r3, r4, d10 | |
94 | ||
95 | idct_col4_top | |
96 | ||
97 | orrs r3, r3, r4 | |
98 | beq 1f | |
99 | ||
100 | vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ | |
101 | vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ | |
102 | vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ | |
103 | vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ | |
104 | vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ | |
105 | vadd.i32 q11, q11, q7 | |
106 | vsub.i32 q12, q12, q7 | |
107 | vsub.i32 q13, q13, q7 | |
108 | vadd.i32 q14, q14, q7 | |
109 | vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ | |
110 | vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ | |
111 | vmlal.s16 q9, d9, w7 | |
112 | vmlsl.s16 q10, d9, w5 | |
113 | vmlal.s16 q5, d9, w3 | |
114 | vmlsl.s16 q6, d9, w1 | |
115 | vadd.i32 q11, q11, q7 | |
116 | vsub.i32 q12, q12, q8 | |
117 | vadd.i32 q13, q13, q8 | |
118 | vsub.i32 q14, q14, q7 | |
119 | ||
120 | 1: vadd.i32 q3, q11, q9 | |
121 | vadd.i32 q4, q12, q10 | |
122 | vshrn.i32 d2, q3, #ROW_SHIFT | |
123 | vshrn.i32 d4, q4, #ROW_SHIFT | |
124 | vadd.i32 q7, q13, q5 | |
125 | vadd.i32 q8, q14, q6 | |
126 | vtrn.16 d2, d4 | |
127 | vshrn.i32 d6, q7, #ROW_SHIFT | |
128 | vshrn.i32 d8, q8, #ROW_SHIFT | |
129 | vsub.i32 q14, q14, q6 | |
130 | vsub.i32 q11, q11, q9 | |
131 | vtrn.16 d6, d8 | |
132 | vsub.i32 q13, q13, q5 | |
133 | vshrn.i32 d3, q14, #ROW_SHIFT | |
134 | vtrn.32 d2, d6 | |
135 | vsub.i32 q12, q12, q10 | |
136 | vtrn.32 d4, d8 | |
137 | vshrn.i32 d5, q13, #ROW_SHIFT | |
138 | vshrn.i32 d7, q12, #ROW_SHIFT | |
139 | vshrn.i32 d9, q11, #ROW_SHIFT | |
140 | ||
141 | vtrn.16 d3, d5 | |
142 | vtrn.16 d7, d9 | |
143 | vtrn.32 d3, d7 | |
144 | vtrn.32 d5, d9 | |
145 | ||
146 | vst1.64 {d2-d5}, [r2,:128]! | |
147 | vst1.64 {d6-d9}, [r2,:128]! | |
148 | ||
149 | bx lr | |
150 | endfunc | |
151 | ||
152 | function idct_col4_neon | |
153 | mov ip, #16 | |
154 | vld1.64 {d2}, [r2,:64], ip /* d2 = col[0] */ | |
155 | vdup.16 d30, w4c | |
156 | vld1.64 {d4}, [r2,:64], ip /* d3 = col[1] */ | |
157 | vadd.i16 d30, d30, d2 | |
158 | vld1.64 {d6}, [r2,:64], ip /* d4 = col[2] */ | |
159 | vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1<<COL_SHIFT-1)/W4)*/ | |
160 | vld1.64 {d8}, [r2,:64], ip /* d5 = col[3] */ | |
161 | ||
162 | ldrd r4, r5, [r2] | |
163 | ldrd r6, r7, [r2, #16] | |
164 | orrs r4, r4, r5 | |
165 | ||
166 | idct_col4_top | |
167 | it eq | |
168 | addeq r2, r2, #16 | |
169 | beq 1f | |
170 | ||
171 | vld1.64 {d3}, [r2,:64], ip /* d6 = col[4] */ | |
172 | vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ | |
173 | vadd.i32 q11, q11, q7 | |
174 | vsub.i32 q12, q12, q7 | |
175 | vsub.i32 q13, q13, q7 | |
176 | vadd.i32 q14, q14, q7 | |
177 | ||
178 | 1: orrs r6, r6, r7 | |
179 | ldrd r4, r5, [r2, #16] | |
180 | it eq | |
181 | addeq r2, r2, #16 | |
182 | beq 2f | |
183 | ||
184 | vld1.64 {d5}, [r2,:64], ip /* d7 = col[5] */ | |
185 | vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ | |
186 | vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ | |
187 | vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ | |
188 | vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ | |
189 | ||
190 | 2: orrs r4, r4, r5 | |
191 | ldrd r4, r5, [r2, #16] | |
192 | it eq | |
193 | addeq r2, r2, #16 | |
194 | beq 3f | |
195 | ||
196 | vld1.64 {d7}, [r2,:64], ip /* d8 = col[6] */ | |
197 | vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ | |
198 | vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ | |
199 | vadd.i32 q11, q11, q7 | |
200 | vsub.i32 q14, q14, q7 | |
201 | vsub.i32 q12, q12, q8 | |
202 | vadd.i32 q13, q13, q8 | |
203 | ||
204 | 3: orrs r4, r4, r5 | |
205 | it eq | |
206 | addeq r2, r2, #16 | |
207 | beq 4f | |
208 | ||
209 | vld1.64 {d9}, [r2,:64], ip /* d9 = col[7] */ | |
210 | vmlal.s16 q9, d9, w7 | |
211 | vmlsl.s16 q10, d9, w5 | |
212 | vmlal.s16 q5, d9, w3 | |
213 | vmlsl.s16 q6, d9, w1 | |
214 | ||
215 | 4: vaddhn.i32 d2, q11, q9 | |
216 | vaddhn.i32 d3, q12, q10 | |
217 | vaddhn.i32 d4, q13, q5 | |
218 | vaddhn.i32 d5, q14, q6 | |
219 | vsubhn.i32 d9, q11, q9 | |
220 | vsubhn.i32 d8, q12, q10 | |
221 | vsubhn.i32 d7, q13, q5 | |
222 | vsubhn.i32 d6, q14, q6 | |
223 | ||
224 | bx lr | |
225 | endfunc | |
226 | ||
227 | .align 6 | |
228 | ||
229 | function idct_col4_st8_neon | |
230 | vqshrun.s16 d2, q1, #COL_SHIFT-16 | |
231 | vqshrun.s16 d3, q2, #COL_SHIFT-16 | |
232 | vqshrun.s16 d4, q3, #COL_SHIFT-16 | |
233 | vqshrun.s16 d5, q4, #COL_SHIFT-16 | |
234 | vst1.32 {d2[0]}, [r0,:32], r1 | |
235 | vst1.32 {d2[1]}, [r0,:32], r1 | |
236 | vst1.32 {d3[0]}, [r0,:32], r1 | |
237 | vst1.32 {d3[1]}, [r0,:32], r1 | |
238 | vst1.32 {d4[0]}, [r0,:32], r1 | |
239 | vst1.32 {d4[1]}, [r0,:32], r1 | |
240 | vst1.32 {d5[0]}, [r0,:32], r1 | |
241 | vst1.32 {d5[1]}, [r0,:32], r1 | |
242 | ||
243 | bx lr | |
244 | endfunc | |
245 | ||
246 | const idct_coeff_neon, align=4 | |
247 | .short W1, W2, W3, W4, W5, W6, W7, W4c | |
248 | endconst | |
249 | ||
250 | .macro idct_start data | |
251 | push {r4-r7, lr} | |
252 | pld [\data] | |
253 | pld [\data, #64] | |
254 | vpush {d8-d15} | |
255 | movrel r3, idct_coeff_neon | |
256 | vld1.64 {d0,d1}, [r3,:128] | |
257 | .endm | |
258 | ||
259 | .macro idct_end | |
260 | vpop {d8-d15} | |
261 | pop {r4-r7, pc} | |
262 | .endm | |
263 | ||
264 | /* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, int16_t *data); */ | |
265 | function ff_simple_idct_put_neon, export=1 | |
266 | idct_start r2 | |
267 | ||
268 | bl idct_row4_pld_neon | |
269 | bl idct_row4_neon | |
270 | add r2, r2, #-128 | |
271 | bl idct_col4_neon | |
272 | bl idct_col4_st8_neon | |
273 | sub r0, r0, r1, lsl #3 | |
274 | add r0, r0, #4 | |
275 | add r2, r2, #-120 | |
276 | bl idct_col4_neon | |
277 | bl idct_col4_st8_neon | |
278 | ||
279 | idct_end | |
280 | endfunc | |
281 | ||
282 | .align 6 | |
283 | ||
284 | function idct_col4_add8_neon | |
285 | mov ip, r0 | |
286 | ||
287 | vld1.32 {d10[0]}, [r0,:32], r1 | |
288 | vshr.s16 q1, q1, #COL_SHIFT-16 | |
289 | vld1.32 {d10[1]}, [r0,:32], r1 | |
290 | vshr.s16 q2, q2, #COL_SHIFT-16 | |
291 | vld1.32 {d11[0]}, [r0,:32], r1 | |
292 | vshr.s16 q3, q3, #COL_SHIFT-16 | |
293 | vld1.32 {d11[1]}, [r0,:32], r1 | |
294 | vshr.s16 q4, q4, #COL_SHIFT-16 | |
295 | vld1.32 {d12[0]}, [r0,:32], r1 | |
296 | vaddw.u8 q1, q1, d10 | |
297 | vld1.32 {d12[1]}, [r0,:32], r1 | |
298 | vaddw.u8 q2, q2, d11 | |
299 | vld1.32 {d13[0]}, [r0,:32], r1 | |
300 | vqmovun.s16 d2, q1 | |
301 | vld1.32 {d13[1]}, [r0,:32], r1 | |
302 | vaddw.u8 q3, q3, d12 | |
303 | vst1.32 {d2[0]}, [ip,:32], r1 | |
304 | vqmovun.s16 d3, q2 | |
305 | vst1.32 {d2[1]}, [ip,:32], r1 | |
306 | vaddw.u8 q4, q4, d13 | |
307 | vst1.32 {d3[0]}, [ip,:32], r1 | |
308 | vqmovun.s16 d4, q3 | |
309 | vst1.32 {d3[1]}, [ip,:32], r1 | |
310 | vqmovun.s16 d5, q4 | |
311 | vst1.32 {d4[0]}, [ip,:32], r1 | |
312 | vst1.32 {d4[1]}, [ip,:32], r1 | |
313 | vst1.32 {d5[0]}, [ip,:32], r1 | |
314 | vst1.32 {d5[1]}, [ip,:32], r1 | |
315 | ||
316 | bx lr | |
317 | endfunc | |
318 | ||
319 | /* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, int16_t *data); */ | |
320 | function ff_simple_idct_add_neon, export=1 | |
321 | idct_start r2 | |
322 | ||
323 | bl idct_row4_pld_neon | |
324 | bl idct_row4_neon | |
325 | add r2, r2, #-128 | |
326 | bl idct_col4_neon | |
327 | bl idct_col4_add8_neon | |
328 | sub r0, r0, r1, lsl #3 | |
329 | add r0, r0, #4 | |
330 | add r2, r2, #-120 | |
331 | bl idct_col4_neon | |
332 | bl idct_col4_add8_neon | |
333 | ||
334 | idct_end | |
335 | endfunc | |
336 | ||
337 | .align 6 | |
338 | ||
339 | function idct_col4_st16_neon | |
340 | mov ip, #16 | |
341 | ||
342 | vshr.s16 q1, q1, #COL_SHIFT-16 | |
343 | vshr.s16 q2, q2, #COL_SHIFT-16 | |
344 | vst1.64 {d2}, [r2,:64], ip | |
345 | vshr.s16 q3, q3, #COL_SHIFT-16 | |
346 | vst1.64 {d3}, [r2,:64], ip | |
347 | vshr.s16 q4, q4, #COL_SHIFT-16 | |
348 | vst1.64 {d4}, [r2,:64], ip | |
349 | vst1.64 {d5}, [r2,:64], ip | |
350 | vst1.64 {d6}, [r2,:64], ip | |
351 | vst1.64 {d7}, [r2,:64], ip | |
352 | vst1.64 {d8}, [r2,:64], ip | |
353 | vst1.64 {d9}, [r2,:64], ip | |
354 | ||
355 | bx lr | |
356 | endfunc | |
357 | ||
358 | /* void ff_simple_idct_neon(int16_t *data); */ | |
359 | function ff_simple_idct_neon, export=1 | |
360 | idct_start r0 | |
361 | ||
362 | mov r2, r0 | |
363 | bl idct_row4_neon | |
364 | bl idct_row4_neon | |
365 | add r2, r2, #-128 | |
366 | bl idct_col4_neon | |
367 | add r2, r2, #-128 | |
368 | bl idct_col4_st16_neon | |
369 | add r2, r2, #-120 | |
370 | bl idct_col4_neon | |
371 | add r2, r2, #-128 | |
372 | bl idct_col4_st16_neon | |
373 | ||
374 | idct_end | |
375 | endfunc |