Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libavcodec / arm / mdct_fixed_neon.S
CommitLineData
2ba45a60
DM
1/*
2 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavutil/arm/asm.S"
22
@ prerot dst, rt
@
@ Pre-rotation loop: reads 16-bit input samples from four positions, folds
@ and scales them with halving subtracts, multiplies each resulting pair by
@ a (cos, sin) twiddle pair, and scatters the 16-bit results as 4-byte pairs
@ to dst, at positions taken from the revtab index table.
@
@ Parameters:
@   dst  register holding the output base address (read only, never written)
@   rt   scratch register; used as a second revtab pointer that walks down
@
@ Registers expected on entry (set up by the code that invokes the macro):
@   r2 = input sample pointer ("in")
@   r4 = revtab pointer (halfword entries)
@   r6 = n, used as the loop counter (32 is subtracted per iteration)
@   r7 = tcos pointer (interleaved 16-bit pairs)
@
@ Written by the macro: r2, r3, r4, r6, r7, r8-r12, lr, rt, d0-d7, d16-d27,
@ and the condition flags.
23.macro prerot dst, rt
24 lsr r3, r6, #2 @ n4 = n >> 2
25 add \rt, r4, r6, lsr #1 @ revtab + n4 (halfword entries, so n/2 bytes)
26 add r9, r3, r3, lsl #1 @ n3 = 3 * n4
27 add r8, r7, r6 @ tcos + n4 (4-byte pairs, so n bytes)
28 add r3, r2, r6, lsr #1 @ in + n4 (16-bit samples, so n/2 bytes)
29 add r9, r2, r9, lsl #1 @ in + n3 (16-bit samples, so 2*n3 bytes)
30 sub r8, r8, #16 @ last 16-byte group below tcos + n4
31 sub r10, r3, #16 @ last 16-byte group below in + n4
32 sub r11, r9, #16 @ last 16-byte group below in + n3
33 mov r12, #-16 @ post-index step for the pointers that walk down
@ Each pass consumes 16 bytes from each of the six pointers and emits
@ 8 output pairs (4 via r4 going up, 4 via rt going down).
341:
35 vld2.16 {d0,d1}, [r9, :128]! @ in + n3, ascending; even lanes to d0, odd to d1
36 vld2.16 {d2,d3}, [r11,:128], r12 @ below in + n3, descending
37 vld2.16 {d4,d5}, [r3, :128]! @ in + n4, ascending
38 vld2.16 {d6,d7}, [r10,:128], r12 @ below in + n4, descending
39 vld2.16 {d16,d17},[r7, :128]! @ cos, sin (ascending from tcos)
40 vld2.16 {d18,d19},[r8, :128], r12 @ second twiddle group, descending from tcos + n4
41 vrev64.16 q1, q1 @ reverse lane order of the descending loads
42 vrev64.16 q3, q3
43 vrev64.16 q9, q9
44 vneg.s16 d0, d0
45 vneg.s16 d2, d2
46 vneg.s16 d16, d16
47 vneg.s16 d18, d18
48 vhsub.s16 d0, d0, d3 @ re = (d0 - d3) >> 1
49 vhsub.s16 d4, d7, d4 @ im = (d7 - d4) >> 1
50 vhsub.s16 d6, d6, d5 @ second pair, first term = (d6 - d5) >> 1
51 vhsub.s16 d2, d2, d1 @ second pair, second term = (d2 - d1) >> 1
52 vmull.s16 q10, d0, d16 @ q10 = d0*d16 - d4*d17
53 vmlsl.s16 q10, d4, d17
54 vmull.s16 q11, d0, d17 @ q11 = d0*d17 + d4*d16
55 vmlal.s16 q11, d4, d16
56 vmull.s16 q12, d6, d18 @ q12 = d6*d18 - d2*d19
57 vmlsl.s16 q12, d2, d19
58 vmull.s16 q13, d6, d19 @ q13 = d6*d19 + d2*d18
59 vmlal.s16 q13, d2, d18
60 vshrn.s32 d0, q10, #15 @ narrow the 32-bit products back to 16 bits
61 vshrn.s32 d1, q11, #15 @ (shift by 15: presumably Q15 twiddles - confirm)
62 vshrn.s32 d2, q12, #15
63 vshrn.s32 d3, q13, #15
64 vzip.16 d0, d1 @ interleave so every 32-bit lane holds one (q10, q11) result pair
65 vzip.16 d2, d3 @ same for the (q12, q13) results
66 ldrh lr, [r4], #2 @ next index from revtab, walking up
67 ldrh r2, [\rt, #-2]! @ next index from the rt pointer, walking down (r2 is scratch from here on)
68 add lr, \dst, lr, lsl #2 @ dst + index * 4 bytes
69 add r2, \dst, r2, lsl #2
70 vst1.32 {d0[0]}, [lr,:32] @ pair 0 of each group
71 vst1.32 {d2[0]}, [r2,:32]
72 ldrh lr, [r4], #2
73 ldrh r2, [\rt, #-2]!
74 add lr, \dst, lr, lsl #2
75 add r2, \dst, r2, lsl #2
76 vst1.32 {d0[1]}, [lr,:32] @ pair 1
77 vst1.32 {d2[1]}, [r2,:32]
78 ldrh lr, [r4], #2
79 ldrh r2, [\rt, #-2]!
80 add lr, \dst, lr, lsl #2
81 add r2, \dst, r2, lsl #2
82 vst1.32 {d1[0]}, [lr,:32] @ pair 2
83 vst1.32 {d3[0]}, [r2,:32]
84 ldrh lr, [r4], #2
85 ldrh r2, [\rt, #-2]!
86 add lr, \dst, lr, lsl #2
87 add r2, \dst, r2, lsl #2
88 vst1.32 {d1[1]}, [lr,:32] @ pair 3
89 vst1.32 {d3[1]}, [r2,:32]
90 subs r6, r6, #32 @ counter drops by 32 per pass
91 bgt 1b @ repeat while the counter is still positive
92.endm
93
@ ff_mdct_fixed_calc_neon
@
@ Runs three stages: the prerot macro (input to output buffer, reordered
@ through revtab), a call to ff_fft_fixed_calc_neon, and an in-place
@ twiddle multiplication over the output buffer with 16-bit results.
@
@ In:  r0 = context pointer (revtab at +8, mdct_size at +16, tcos at +24)
@      r1 = output buffer (written by prerot, then rewritten in place)
@      r2 = input samples (consumed by prerot)
@ NOTE(review): the C prototype is not visible here; the argument roles above
@ are read off the register usage below - confirm against the caller.
94function ff_mdct_fixed_calc_neon, export=1
95 push {r1,r4-r11,lr} @ r1 is stored at the lowest address so it can be popped on its own later
96
97 ldr r4, [r0, #8] @ revtab
98 ldr r6, [r0, #16] @ mdct_size; n
99 ldr r7, [r0, #24] @ tcos
100
101 prerot r1, r5 @ dst = r1 (output buffer), r5 is the scratch revtab pointer
102
103 mov r4, r0 @ keep the context pointer in r4 (relies on the callee preserving r4)
104 bl X(ff_fft_fixed_calc_neon) @ r0 and r1 are unchanged by prerot: context and output buffer
105
106 pop {r5} @ r5 = the r1 value saved on entry (output buffer)
107 mov r12, #-16 @ post-index step for the pointer that walks down
108 ldr r6, [r4, #16] @ mdct_size; n
109 ldr r7, [r4, #24] @ tcos
110 add r5, r5, r6, lsr #1 @ r5 = output + n/2 bytes, walks up
111 add r7, r7, r6, lsr #1 @ r7 = tcos + n/2 bytes, walks up
112 sub r1, r5, #16 @ r1 = 16 bytes below r5, walks down
113 sub r2, r7, #16 @ r2 = 16 bytes below r7, walks down
@ Each pass rewrites 16 bytes on either side of the starting point in place,
@ moving outwards; r6 drops by 32 per pass.
1141:
115 vld2.16 {d4,d5}, [r7,:128]! @ twiddles, ascending; even lanes to d4, odd to d5
116 vld2.16 {d6,d7}, [r2,:128], r12 @ twiddles, descending
117 vld2.16 {d0,d1}, [r5,:128] @ data at r5 (no writeback, stored back below)
118 vld2.16 {d2,d3}, [r1,:128] @ data at r1 (no writeback, stored back below)
119 vrev64.16 q3, q3 @ reverse lane order of the descending loads
120 vrev64.16 q1, q1
121 vneg.s16 q3, q3 @ negate both twiddle groups
122 vneg.s16 q2, q2
123 vmull.s16 q11, d2, d6 @ q11 = d2*d6 + d3*d7
124 vmlal.s16 q11, d3, d7
125 vmull.s16 q8, d0, d5 @ q8 = d0*d5 - d1*d4
126 vmlsl.s16 q8, d1, d4
127 vmull.s16 q9, d0, d4 @ q9 = d0*d4 + d1*d5
128 vmlal.s16 q9, d1, d5
129 vmull.s16 q10, d2, d7 @ q10 = d2*d7 - d3*d6
130 vmlsl.s16 q10, d3, d6
131 vshrn.s32 d0, q11, #15 @ narrow the 32-bit products back to 16 bits
132 vshrn.s32 d1, q8, #15
133 vshrn.s32 d2, q9, #15
134 vshrn.s32 d3, q10, #15
135 vrev64.16 q0, q0 @ reverse lanes of the results that go to the descending side
136 vst2.16 {d2,d3}, [r5,:128]! @ interleaved store, ascending side
137 vst2.16 {d0,d1}, [r1,:128], r12 @ interleaved store, descending side
138 subs r6, r6, #32
139 bgt 1b @ repeat while the counter is still positive
140
141 pop {r4-r11,pc} @ restore saved registers and return
142endfunc
143
@ ff_mdct_fixed_calcw_neon
@
@ Same three stages as ff_mdct_fixed_calc_neon, but the intermediate data
@ lives in the context tmp_buf and the final twiddle products are stored to
@ the output buffer as full 32-bit values (there is no narrowing shift).
@
@ In:  r0 = context pointer (revtab at +8, tmp_buf at +12, mdct_size at +16,
@           tcos at +24)
@      r1 = output buffer (32-bit results)
@      r2 = input samples (consumed by prerot)
@ NOTE(review): the C prototype is not visible here; the argument roles above
@ are read off the register usage below - confirm against the caller.
144function ff_mdct_fixed_calcw_neon, export=1
145 push {r1,r4-r11,lr} @ r1 is stored at the lowest address so it can be popped on its own later
146
147 ldrd r4, r5, [r0, #8] @ revtab, tmp_buf
148 ldr r6, [r0, #16] @ mdct_size; n
149 ldr r7, [r0, #24] @ tcos
150
151 prerot r5, r1 @ dst = r5 (tmp_buf); r1 is free as scratch because it was saved above
152
153 mov r4, r0 @ keep the context pointer in r4 (relies on the callee preserving r4)
154 mov r1, r5 @ second argument of the call: tmp_buf
155 bl X(ff_fft_fixed_calc_neon)
156
157 pop {r7} @ r7 = the r1 value saved on entry (output buffer)
158 mov r12, #-16 @ post-index step for the pointers that walk down
159 ldr r6, [r4, #16] @ mdct_size; n
160 ldr r9, [r4, #24] @ tcos
161 add r5, r5, r6, lsr #1 @ r5 = tmp_buf + n/2 bytes, walks up (relies on r5 surviving the call)
162 add r7, r7, r6 @ r7 = output + n bytes, walks up (output elements are twice as wide)
163 add r9, r9, r6, lsr #1 @ r9 = tcos + n/2 bytes, walks up
164 sub r3, r5, #16 @ r3 = 16 bytes below r5, walks down
165 sub r1, r7, #16 @ r1 = 16 bytes below r7, walks down
166 sub r2, r9, #16 @ r2 = 16 bytes below r9, walks down
@ Each pass reads 16 bytes from tmp_buf on either side of its starting point
@ and writes 32 bytes to the output on either side; r6 drops by 32 per pass.
1671:
168 vld2.16 {d4,d5}, [r9,:128]! @ twiddles, ascending; even lanes to d4, odd to d5
169 vld2.16 {d6,d7}, [r2,:128], r12 @ twiddles, descending
170 vld2.16 {d0,d1}, [r5,:128]! @ tmp_buf data, ascending
171 vld2.16 {d2,d3}, [r3,:128], r12 @ tmp_buf data, descending
172 vrev64.16 q3, q3 @ reverse lane order of the descending loads
173 vrev64.16 q1, q1
174 vneg.s16 q3, q3 @ negate both twiddle groups
175 vneg.s16 q2, q2
176 vmull.s16 q8, d2, d6 @ q8 = d2*d6 + d3*d7
177 vmlal.s16 q8, d3, d7
178 vmull.s16 q9, d0, d5 @ q9 = d0*d5 - d1*d4
179 vmlsl.s16 q9, d1, d4
180 vmull.s16 q10, d0, d4 @ q10 = d0*d4 + d1*d5
181 vmlal.s16 q10, d1, d5
182 vmull.s16 q11, d2, d7 @ q11 = d2*d7 - d3*d6
183 vmlsl.s16 q11, d3, d6
184 vrev64.32 q8, q8 @ swap the two 32-bit lanes in each half for the descending stores
185 vrev64.32 q9, q9
186 vst2.32 {q10,q11},[r7,:128]! @ interleaved 32-bit store, ascending side
187 vst2.32 {d16,d18},[r1,:128], r12 @ low halves of q8/q9 go to the higher 16 bytes
188 vst2.32 {d17,d19},[r1,:128], r12 @ high halves of q8/q9 go to the next 16 bytes down
189 subs r6, r6, #32
190 bgt 1b @ repeat while the counter is still positive
191
192 pop {r4-r11,pc} @ restore saved registers and return
193endfunc