// NOTE(review): blame-listing header stripped during cleanup; last commit 2ba45a60 (DM).
/*
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "asm-offsets.h"

24 | .macro resample_one fmt, es=2 | |
25 | .ifnc \fmt, dbl | |
26 | .macro M_MUL2 x:vararg | |
27 | .endm | |
28 | .macro M_MLA2 x:vararg | |
29 | .endm | |
30 | .endif | |
31 | function ff_resample_one_\fmt\()_neon, export=1 | |
32 | sxtw x2, w2 | |
33 | ldr x9, [x0, #FILTER_BANK] | |
34 | ldr w6, [x0, #FILTER_LENGTH] | |
35 | ldp w7, w8, [x0, #PHASE_SHIFT] // and phase_mask | |
36 | lsr x10, x4, x7 // sample_index | |
37 | and x4, x4, x8 | |
38 | lsl x11, x6, #\es // filter_length * elem_size | |
39 | add x3, x3, x10, lsl #\es // src[sample_index] | |
40 | madd x9, x11, x4, x9 // filter | |
41 | cmp w6, #16 | |
42 | b.lt 5f | |
43 | 8: // remaining filter_length at least 16 | |
44 | subs w6, w6, #16 | |
45 | LOAD8 v4, v5, v6, v7, x3 | |
46 | LOAD8 v16, v17, v18, v19, x9 | |
47 | M_MUL v0, v4, v16, v1 | |
48 | M_MUL2 v1, v6, v18 | |
49 | 7: | |
50 | LOAD8 v20, v21, v22, v23, x3 | |
51 | M_MLA v0, v5, v17, v1 | |
52 | M_MLA2 v1, v7, v19 | |
53 | LOAD8 v24, v25, v26, v27, x9 | |
54 | M_MLA v0, v20, v24, v1 | |
55 | M_MLA2 v1, v22, v26 | |
56 | b.eq 6f | |
57 | cmp w6, #16 | |
58 | M_MLA v0, v21, v25, v1 | |
59 | M_MLA2 v1, v23, v27 | |
60 | b.lt 4f | |
61 | subs w6, w6, #16 | |
62 | LOAD8 v4, v5, v6, v7, x3 | |
63 | LOAD8 v16, v17, v18, v19, x9 | |
64 | M_MLA v0, v4, v16, v1 | |
65 | M_MLA2 v1, v6, v18 | |
66 | b 7b | |
67 | 6: | |
68 | M_MLA v0, v21, v25, v1 | |
69 | M_MLA2 v1, v23, v27 | |
70 | STORE_ONE 0, x1, x2, v1 | |
71 | ret | |
72 | 5: | |
73 | movi v0.16b, #0 | |
74 | movi v1.16b, #0 | |
75 | 4: // remaining filter_length 1-15 | |
76 | cmp w6, #4 | |
77 | b.lt 2f | |
78 | subs w6, w6, #4 | |
79 | LOAD4 v4, v5, x3 | |
80 | LOAD4 v6, v7, x9 | |
81 | M_MLA v0, v4, v6, v1 | |
82 | M_MLA2 v1, v5, v7 | |
83 | b.eq 0f | |
84 | b 4b | |
85 | 2: // remaining filter_length 1-3 | |
86 | cmp w6, #2 | |
87 | b.lt 1f | |
88 | LOAD2 2, x3 | |
89 | LOAD2 3, x9 | |
90 | subs w6, w6, #2 | |
91 | M_MLA v0, v2, v3 | |
92 | b.eq 0f | |
93 | 1: // remaining filter_length 1 | |
94 | LOAD1 6, x3 | |
95 | LOAD1 7, x9 | |
96 | M_MLA v0, v6, v7 | |
97 | 0: | |
98 | STORE_ONE 0, x1, x2, v1 | |
99 | ret | |
100 | endfunc | |
101 | ||
102 | .purgem LOAD1 | |
103 | .purgem LOAD2 | |
104 | .purgem LOAD4 | |
105 | .purgem LOAD8 | |
106 | .purgem M_MLA | |
107 | .purgem M_MLA2 | |
108 | .purgem M_MUL | |
109 | .purgem M_MUL2 | |
110 | .purgem STORE_ONE | |
111 | .endm | |
112 | ||
113 | ||
114 | .macro LOAD1 d1, addr | |
115 | ldr d\d1, [\addr], #8 | |
116 | .endm | |
117 | .macro LOAD2 d1, addr | |
118 | ld1 {v\d1\().2d}, [\addr], #16 | |
119 | .endm | |
120 | .macro LOAD4 d1, d2, addr | |
121 | ld1 {\d1\().2d,\d2\().2d}, [\addr], #32 | |
122 | .endm | |
123 | .macro LOAD8 d1, d2, d3, d4, addr | |
124 | ld1 {\d1\().2d,\d2\().2d,\d3\().2d,\d4\().2d}, [\addr], #64 | |
125 | .endm | |
126 | .macro M_MLA d, r0, r1, d2:vararg | |
127 | fmla \d\().2d, \r0\().2d, \r1\().2d | |
128 | .endm | |
129 | .macro M_MLA2 second:vararg | |
130 | M_MLA \second | |
131 | .endm | |
132 | .macro M_MUL d, r0, r1, d2:vararg | |
133 | fmul \d\().2d, \r0\().2d, \r1\().2d | |
134 | .endm | |
135 | .macro M_MUL2 second:vararg | |
136 | M_MUL \second | |
137 | .endm | |
138 | .macro STORE_ONE rn, addr, idx, d2 | |
139 | fadd v\rn\().2d, v\rn\().2d, \d2\().2d | |
140 | faddp d\rn\(), v\rn\().2d | |
141 | str d\rn\(), [\addr, \idx, lsl #3] | |
142 | .endm | |
143 | ||
144 | resample_one dbl, 3 | |
145 | ||
146 | ||
147 | .macro LOAD1 d1, addr | |
148 | ldr s\d1, [\addr], #4 | |
149 | .endm | |
150 | .macro LOAD2 d1, addr | |
151 | ld1 {v\d1\().2s}, [\addr], #8 | |
152 | .endm | |
153 | .macro LOAD4 d1, d2, addr | |
154 | ld1 {\d1\().4s}, [\addr], #16 | |
155 | .endm | |
156 | .macro LOAD8 d1, d2, d3, d4, addr | |
157 | ld1 {\d1\().4s,\d2\().4s}, [\addr], #32 | |
158 | .endm | |
159 | .macro M_MLA d, r0, r1, d2:vararg | |
160 | fmla \d\().4s, \r0\().4s, \r1\().4s | |
161 | .endm | |
162 | .macro M_MUL d, r0, r1, d2:vararg | |
163 | fmul \d\().4s, \r0\().4s, \r1\().4s | |
164 | .endm | |
165 | .macro STORE_ONE rn, addr, idx, d2 | |
166 | faddp v\rn\().4s, v\rn\().4s, v\rn\().4s | |
167 | faddp s\rn\(), v\rn\().2s | |
168 | str s\rn\(), [\addr, \idx, lsl #2] | |
169 | .endm | |
170 | ||
171 | resample_one flt | |
172 | ||
173 | ||
174 | .macro LOAD1 d1, addr | |
175 | ldr h\d1, [\addr], #2 | |
176 | .endm | |
177 | .macro LOAD2 d1, addr | |
178 | ldr s\d1, [\addr], #4 | |
179 | .endm | |
180 | .macro LOAD4 d1, d2, addr | |
181 | ld1 {\d1\().4h}, [\addr], #8 | |
182 | .endm | |
183 | .macro LOAD8 d1, d2, d3, d4, addr | |
184 | ld1 {\d1\().4h,\d2\().4h}, [\addr], #16 | |
185 | .endm | |
186 | .macro M_MLA d, r0, r1, d2:vararg | |
187 | smlal \d\().4s, \r0\().4h, \r1\().4h | |
188 | .endm | |
189 | .macro M_MUL d, r0, r1, d2:vararg | |
190 | smull \d\().4s, \r0\().4h, \r1\().4h | |
191 | .endm | |
192 | .macro STORE_ONE rn, addr, idx, d2 | |
193 | addp v\rn\().4s, v\rn\().4s, v\rn\().4s | |
194 | addp v\rn\().4s, v\rn\().4s, v\rn\().4s | |
195 | sqrshrn v\rn\().4h, v\rn\().4s, #15 | |
196 | str h\rn\(), [\addr, \idx, lsl #1] | |
197 | .endm | |
198 | ||
199 | resample_one s16, 1 | |
200 | ||
201 | ||
202 | .macro LOAD1 d1, addr | |
203 | ldr s\d1, [\addr], #4 | |
204 | .endm | |
205 | .macro LOAD2 d1, addr | |
206 | ld1 {v\d1\().2s}, [\addr], #8 | |
207 | .endm | |
208 | .macro LOAD4 d1, d2, addr | |
209 | ld1 {\d1\().4s}, [\addr], #16 | |
210 | .endm | |
211 | .macro LOAD8 d1, d2, d3, d4, addr | |
212 | ld1 {\d1\().4s,\d2\().4s}, [\addr], #32 | |
213 | .endm | |
214 | .macro M_MLA d1, r0, r1, d2:vararg | |
215 | smlal \d1\().2d, \r0\().2s, \r1\().2s | |
216 | .ifnb \d2 | |
217 | smlal2 \d2\().2d, \r0\().4s, \r1\().4s | |
218 | .endif | |
219 | .endm | |
220 | .macro M_MUL d1, r0, r1, d2:vararg | |
221 | smull \d1\().2d, \r0\().2s, \r1\().2s | |
222 | .ifnb \d2 | |
223 | smull2 \d2\().2d, \r0\().4s, \r1\().4s | |
224 | .endif | |
225 | .endm | |
226 | .macro STORE_ONE rn, addr, idx, d2 | |
227 | add v\rn\().2d, v\rn\().2d, \d2\().2d | |
228 | addp d\rn\(), v\rn\().2d | |
229 | sqrshrn v\rn\().2s, v\rn\().2d, #30 | |
230 | str s\rn\(), [\addr, \idx, lsl #2] | |
231 | .endm | |
232 | ||
233 | resample_one s32 |