Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | |
3 | * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net> | |
4 | * | |
5 | * This file is part of FFmpeg. | |
6 | * | |
7 | * FFmpeg is free software; you can redistribute it and/or | |
8 | * modify it under the terms of the GNU Lesser General Public | |
9 | * License as published by the Free Software Foundation; either | |
10 | * version 2.1 of the License, or (at your option) any later version. | |
11 | * | |
12 | * FFmpeg is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | * Lesser General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU Lesser General Public | |
18 | * License along with FFmpeg; if not, write to the Free Software | |
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | */ | |
21 | ||
22 | #include "libavutil/aarch64/asm.S" | |
23 | #include "neon.S" | |
24 | ||
// Common prologue shared by the H.264 loop-filter entry points.
//
// Reads:    w2 = alpha, w3 = beta, x4 = pointer to four signed bytes
//           (the per-group clipping values, referred to as tc0 below)
// Writes:   v24.s[0] = those four bytes, loaded as one 32-bit word
// Clobbers: w6, condition flags
//
// Executes `ret` on behalf of the enclosing function (skipping the filter)
// when alpha == 0, when beta == 0, or when all four tc0 bytes are negative.
// Otherwise execution continues after the macro, at label 2.
.macro  h264_loop_filter_start
        cmp             w2,  #0                 // Z = (alpha == 0)
        ldr             w6,  [x4]               // w6 = tc0[3]:tc0[2]:tc0[1]:tc0[0]
        // alpha != 0: flags come from comparing beta with 0.
        // alpha == 0: flags are forced to NZCV = 0b0100, so Z stays set and the
        //             b.eq below takes the early exit for alpha == 0 as well.
        ccmp            w3,  #0,  #4,  ne
        mov             v24.s[0], w6
        and             w6,  w6,  w6,  lsl #16  // bit 31 = sign(tc0[3]) & sign(tc0[1])
        b.eq            1f                      // alpha == 0 || beta == 0
        ands            w6,  w6,  w6,  lsl #8   // N = AND of all four tc0 sign bits
        b.ge            2f                      // N clear: at least one tc0 >= 0
1:
        ret
2:
.endm
38 | ||
// Core of the tc0-clipped H.264 luma loop filter, 16 pixels along the edge.
//
// In:       v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2
//           w2  = alpha, w3 = beta
//           v24.s[0] = four tc0 bytes (one per group of four pixels)
// Out:      v17 = new p1, v16 = new p0, v0 = new q0, v19 = new q1
// Clobbers: v4, v20, v21, v22, v23, v24, v28, v30
//
// The instructions are interleaved for scheduling; the order is significant
// because several registers are reused for unrelated values.
.macro  h264_loop_filter_luma
        dup             v22.16b, w2                     // alpha
        uxtl            v24.8h,  v24.8b                 // widen tc0 bytes ...
        uabd            v21.16b, v16.16b, v0.16b        // abs(p0 - q0)
        uxtl            v24.4s,  v24.4h                 // ... to one per 32-bit lane
        uabd            v28.16b, v18.16b, v16.16b       // abs(p1 - p0)
        sli             v24.8h,  v24.8h,  #8            // replicate each tc0 ...
        uabd            v30.16b, v2.16b,  v0.16b        // abs(q1 - q0)
        sli             v24.4s,  v24.4s,  #16           // ... into 4 adjacent bytes
        cmhi            v21.16b, v22.16b, v21.16b       // < alpha
        dup             v22.16b, w3                     // beta
        cmlt            v23.16b, v24.16b, #0            // lanes with tc0 < 0
        cmhi            v28.16b, v22.16b, v28.16b       // < beta
        cmhi            v30.16b, v22.16b, v30.16b       // < beta
        bic             v21.16b, v21.16b, v23.16b       // no filtering where tc0 < 0
        uabd            v17.16b, v20.16b, v16.16b       // abs(p2 - p0)
        and             v21.16b, v21.16b, v28.16b
        uabd            v19.16b, v4.16b,  v0.16b        // abs(q2 - q0)
        cmhi            v17.16b, v22.16b, v17.16b       // < beta
        and             v21.16b, v21.16b, v30.16b       // v21 = filter-enable mask
        cmhi            v19.16b, v22.16b, v19.16b       // < beta
        and             v17.16b, v17.16b, v21.16b       // mask: update p1
        and             v19.16b, v19.16b, v21.16b       // mask: update q1
        and             v24.16b, v24.16b, v21.16b       // tc0, zero where disabled
        urhadd          v28.16b, v16.16b, v0.16b        // (p0 + q0 + 1) >> 1
        sub             v21.16b, v24.16b, v17.16b       // tc = tc0 + 1 if p1 mask set
        uqadd           v23.16b, v18.16b, v24.16b       // p1 + tc0 (saturating)
        uhadd           v20.16b, v20.16b, v28.16b       // (p2 + avg) >> 1
        sub             v21.16b, v21.16b, v19.16b       // tc += 1 if q1 mask set
        uhadd           v28.16b, v4.16b,  v28.16b       // (q2 + avg) >> 1
        umin            v23.16b, v23.16b, v20.16b
        uqsub           v22.16b, v18.16b, v24.16b       // p1 - tc0 (saturating)
        uqadd           v4.16b,  v2.16b,  v24.16b       // q1 + tc0 (saturating)
        umax            v23.16b, v23.16b, v22.16b       // candidate p1, within p1 +/- tc0
        uqsub           v22.16b, v2.16b,  v24.16b       // q1 - tc0 (saturating)
        umin            v28.16b, v4.16b,  v28.16b
        uxtl            v4.8h,   v0.8b                  // start of delta: q0 ...
        umax            v28.16b, v28.16b, v22.16b       // candidate q1, within q1 +/- tc0
        uxtl2           v20.8h,  v0.16b
        usubw           v4.8h,   v4.8h,   v16.8b        // ... - p0
        usubw2          v20.8h,  v20.8h,  v16.16b
        shl             v4.8h,   v4.8h,   #2            // ... * 4
        shl             v20.8h,  v20.8h,  #2
        uaddw           v4.8h,   v4.8h,   v18.8b        // ... + p1
        uaddw2          v20.8h,  v20.8h,  v18.16b
        usubw           v4.8h,   v4.8h,   v2.8b         // ... - q1
        usubw2          v20.8h,  v20.8h,  v2.16b
        rshrn           v4.8b,   v4.8h,   #3            // rounded >> 3, back to bytes
        rshrn2          v4.16b,  v20.8h,  #3
        bsl             v17.16b, v23.16b, v18.16b       // new p1 where masked, else p1
        bsl             v19.16b, v28.16b, v2.16b        // new q1 where masked, else q1
        neg             v23.16b, v21.16b                // -tc
        uxtl            v28.8h,  v16.8b
        smin            v4.16b,  v4.16b,  v21.16b       // delta = min(delta, tc)
        uxtl2           v21.8h,  v16.16b
        smax            v4.16b,  v4.16b,  v23.16b       // delta = max(delta, -tc)
        uxtl            v22.8h,  v0.8b
        uxtl2           v24.8h,  v0.16b
        saddw           v28.8h,  v28.8h,  v4.8b         // p0 + delta
        saddw2          v21.8h,  v21.8h,  v4.16b
        ssubw           v22.8h,  v22.8h,  v4.8b         // q0 - delta
        ssubw2          v24.8h,  v24.8h,  v4.16b
        sqxtun          v16.8b,  v28.8h                 // saturate to unsigned bytes
        sqxtun2         v16.16b, v21.8h
        sqxtun          v0.8b,   v22.8h
        sqxtun2         v0.16b,  v24.8h
.endm
106 | ||
// Luma loop filter across an edge that lies between two pixel rows,
// processing 16 pixels of each row.
//
// x0 = address of the first row on the q side of the edge (q0)
// w1 = row stride in bytes, sign-extended to 64 bits here
// w2 = alpha, w3 = beta, x4 = tc0 pointer (see h264_loop_filter_start, which
//      may return early before anything is loaded)
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1

        ld1             {v0.16b},  [x0], x1     // q0
        ld1             {v2.16b},  [x0], x1     // q1
        ld1             {v4.16b},  [x0], x1     // q2
        sub             x0,  x0,  x1, lsl #2    // step back 4 + 2 = 6 rows:
        sub             x0,  x0,  x1, lsl #1    // x0 now addresses p2
        ld1             {v20.16b}, [x0], x1     // p2
        ld1             {v18.16b}, [x0], x1     // p1
        ld1             {v16.16b}, [x0], x1     // p0

        h264_loop_filter_luma

        sub             x0,  x0,  x1, lsl #1    // back to the p1 row
        st1             {v17.16b}, [x0], x1     // p1
        st1             {v16.16b}, [x0], x1     // p0
        st1             {v0.16b},  [x0], x1     // q0
        st1             {v19.16b}, [x0]         // q1

        ret
endfunc
130 | ||
// Luma loop filter across an edge that lies between two pixel columns,
// processing 16 rows.
//
// x0 = address of the first pixel on the q side of the edge in the first row
// w1 = row stride in bytes (32-bit signed)
// w2 = alpha, w3 = beta, x4 = tc0 pointer (see h264_loop_filter_start, which
//      may return early before anything is loaded)
//
// Eight bytes straddling the edge are loaded from each of the 16 rows,
// transposed so that each vector holds one pixel position for all rows,
// filtered, and the four modified positions are transposed back and stored.
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        // x1 is used below as a 64-bit post-index and in 64-bit address
        // arithmetic, so the 32-bit stride has to be sign-extended first,
        // exactly as ff_h264_v_loop_filter_luma_neon does.
        sxtw            x1,  w1

        sub             x0,  x0,  #4            // 4 pixels left of the edge
        ld1             {v6.8b},   [x0], x1     // rows 0-7 -> low halves
        ld1             {v20.8b},  [x0], x1
        ld1             {v18.8b},  [x0], x1
        ld1             {v16.8b},  [x0], x1
        ld1             {v0.8b},   [x0], x1
        ld1             {v2.8b},   [x0], x1
        ld1             {v4.8b},   [x0], x1
        ld1             {v26.8b},  [x0], x1
        ld1             {v6.d}[1],  [x0], x1    // rows 8-15 -> high halves
        ld1             {v20.d}[1], [x0], x1
        ld1             {v18.d}[1], [x0], x1
        ld1             {v16.d}[1], [x0], x1
        ld1             {v0.d}[1],  [x0], x1
        ld1             {v2.d}[1],  [x0], x1
        ld1             {v4.d}[1],  [x0], x1
        ld1             {v26.d}[1], [x0], x1

        // transpose_8x16B / transpose_4x16B are defined outside this section;
        // v21/v23 (and v25/v27 below) are passed as their scratch registers.
        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

        h264_loop_filter_luma

        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27

        sub             x0,  x0,  x1, lsl #4    // rewind the 16 rows just read
        add             x0,  x0,  #2            // column of p1 (edge - 2)
        st1             {v17.s}[0], [x0], x1    // 4 bytes (p1 p0 q0 q1) per row
        st1             {v16.s}[0], [x0], x1
        st1             {v0.s}[0],  [x0], x1
        st1             {v19.s}[0], [x0], x1
        st1             {v17.s}[1], [x0], x1
        st1             {v16.s}[1], [x0], x1
        st1             {v0.s}[1],  [x0], x1
        st1             {v19.s}[1], [x0], x1
        st1             {v17.s}[2], [x0], x1
        st1             {v16.s}[2], [x0], x1
        st1             {v0.s}[2],  [x0], x1
        st1             {v19.s}[2], [x0], x1
        st1             {v17.s}[3], [x0], x1
        st1             {v16.s}[3], [x0], x1
        st1             {v0.s}[3],  [x0], x1
        st1             {v19.s}[3], [x0], x1

        ret
endfunc
179 | ||
// Core of the tc0-clipped H.264 chroma loop filter, 8 pixels along the edge.
// Only p0 and q0 are modified.
//
// In:       v18 = p1, v16 = p0, v0 = q0, v2 = q1 (low 8 bytes of each)
//           w2  = alpha, w3 = beta
//           v24.s[0] = four tc0 bytes (one per pair of pixels)
// Out:      v16 = new p0, v0 = new q0
// Clobbers: v4, v22, v24, v25, v26, v28, v30
//
// Mask computation and delta computation are interleaved; keep the order.
.macro  h264_loop_filter_chroma
        dup             v22.8b,  w2                     // alpha
        uxtl            v24.8h,  v24.8b                 // one tc0 per 16-bit lane
        uabd            v26.8b,  v16.8b,  v0.8b         // abs(p0 - q0)
        uxtl            v4.8h,   v0.8b                  // start of delta: q0 ...
        uabd            v28.8b,  v18.8b,  v16.8b        // abs(p1 - p0)
        usubw           v4.8h,   v4.8h,   v16.8b        // ... - p0
        sli             v24.8h,  v24.8h,  #8            // replicate tc0 into both bytes
        shl             v4.8h,   v4.8h,   #2            // ... * 4
        uabd            v30.8b,  v2.8b,   v0.8b         // abs(q1 - q0)
        uaddw           v4.8h,   v4.8h,   v18.8b        // ... + p1
        cmhi            v26.8b,  v22.8b,  v26.8b        // < alpha
        usubw           v4.8h,   v4.8h,   v2.8b         // ... - q1
        dup             v22.8b,  w3                     // beta
        rshrn           v4.8b,   v4.8h,   #3            // rounded >> 3, back to bytes
        cmhi            v28.8b,  v22.8b,  v28.8b        // < beta
        cmhi            v30.8b,  v22.8b,  v30.8b        // < beta
        smin            v4.8b,   v4.8b,   v24.8b        // delta = min(delta, tc0)
        neg             v25.8b,  v24.8b                 // -tc0
        and             v26.8b,  v26.8b,  v28.8b
        smax            v4.8b,   v4.8b,   v25.8b        // delta = max(delta, -tc0)
        and             v26.8b,  v26.8b,  v30.8b        // v26 = filter-enable mask
        uxtl            v22.8h,  v0.8b
        and             v4.8b,   v4.8b,   v26.8b        // delta = 0 where disabled
        uxtl            v28.8h,  v16.8b
        saddw           v28.8h,  v28.8h,  v4.8b         // p0 + delta
        ssubw           v22.8h,  v22.8h,  v4.8b         // q0 - delta
        sqxtun          v16.8b,  v28.8h                 // saturate to unsigned bytes
        sqxtun          v0.8b,   v22.8h
.endm
210 | ||
// Chroma loop filter across an edge that lies between two pixel rows,
// processing 8 pixels of each row.
//
// x0 = address of the first row on the q side of the edge (q0)
// w1 = row stride in bytes (32-bit signed)
// w2 = alpha, w3 = beta, x4 = tc0 pointer (see h264_loop_filter_start, which
//      may return early before anything is loaded)
function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        // x1 is used below in 64-bit address arithmetic and as a 64-bit
        // post-index, so sign-extend the 32-bit stride first, as
        // ff_h264_v_loop_filter_luma_neon does.
        sxtw            x1,  w1

        sub             x0,  x0,  x1, lsl #1    // two rows up: the p1 row
        ld1             {v18.8b}, [x0], x1      // p1
        ld1             {v16.8b}, [x0], x1      // p0
        ld1             {v0.8b},  [x0], x1      // q0
        ld1             {v2.8b},  [x0]          // q1 (x0 stays on the q1 row)

        h264_loop_filter_chroma

        sub             x0,  x0,  x1, lsl #1    // from the q1 row back to p0
        st1             {v16.8b}, [x0], x1      // p0
        st1             {v0.8b},  [x0], x1      // q0

        ret
endfunc
228 | ||
// Chroma loop filter across an edge that lies between two pixel columns,
// processing 8 rows.
//
// x0 = address of the first pixel on the q side of the edge in the first row
// w1 = row stride in bytes (32-bit signed)
// w2 = alpha, w3 = beta, x4 = tc0 pointer (see h264_loop_filter_start, which
//      may return early before anything is loaded)
//
// Four bytes (p1 p0 q0 q1) are loaded from each of the 8 rows, transposed,
// filtered, transposed back and written to the same locations.
function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        // x1 is used below as a 64-bit post-index and in 64-bit address
        // arithmetic, so sign-extend the 32-bit stride first, as
        // ff_h264_v_loop_filter_luma_neon does.
        sxtw            x1,  w1

        sub             x0,  x0,  #2            // 2 pixels left of the edge
        ld1             {v18.s}[0], [x0], x1
        ld1             {v16.s}[0], [x0], x1
        ld1             {v0.s}[0],  [x0], x1
        ld1             {v2.s}[0],  [x0], x1
        ld1             {v18.s}[1], [x0], x1
        ld1             {v16.s}[1], [x0], x1
        ld1             {v0.s}[1],  [x0], x1
        ld1             {v2.s}[1],  [x0], x1

        // transpose_4x8B is defined outside this section; v28-v31 are passed
        // as its scratch registers.
        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        sub             x0,  x0,  x1, lsl #3    // rewind the 8 rows just read
        st1             {v18.s}[0], [x0], x1
        st1             {v16.s}[0], [x0], x1
        st1             {v0.s}[0],  [x0], x1
        st1             {v2.s}[0],  [x0], x1
        st1             {v18.s}[1], [x0], x1
        st1             {v16.s}[1], [x0], x1
        st1             {v0.s}[1],  [x0], x1
        st1             {v2.s}[1],  [x0], x1

        ret
endfunc
260 | ||
// Bi-directional weighted prediction body for 16-pixel-wide rows.
// Two rows are produced per loop iteration; the macro ends with `ret`.
//
// \macs / \macd: widening multiply-accumulate mnemonics (umlal or umlsl)
//                applied to the [x1] and [x0] inputs respectively; the "2"
//                variant of each is used for the upper eight bytes.
// In:   x0 = first input, x1 = second input, x7 = output, x2 = stride
//       w3 = number of rows (decremented by 2 per iteration until zero)
//       w5 = byte weight for [x0], w6 = byte weight for [x1]
//       v16.8h = initial accumulator value, v18.8h = sshl shift counts
// Clobbers: v0, v1, v4, v6, v20, v22, v24, v26, v28, v30, flags
.macro  biweight_16     macs, macd
        dup             v0.16b,  w5
        dup             v1.16b,  w6
        mov             v4.16b,  v16.16b                // seed accumulators
        mov             v6.16b,  v16.16b
1:      subs            w3,  w3,  #2                    // flags consumed by b.ne below
        ld1             {v20.16b}, [x0], x2
        \macd           v4.8h,   v0.8b,   v20.8b
        \macd\()2       v6.8h,   v0.16b,  v20.16b
        ld1             {v22.16b}, [x1], x2
        \macs           v4.8h,   v1.8b,   v22.8b
        \macs\()2       v6.8h,   v1.16b,  v22.16b
        mov             v24.16b, v16.16b                // accumulators for row 2
        ld1             {v28.16b}, [x0], x2
        mov             v26.16b, v16.16b
        \macd           v24.8h,  v0.8b,   v28.8b
        \macd\()2       v26.8h,  v0.16b,  v28.16b
        ld1             {v30.16b}, [x1], x2
        \macs           v24.8h,  v1.8b,   v30.8b
        \macs\()2       v26.8h,  v1.16b,  v30.16b
        sshl            v4.8h,   v4.8h,   v18.8h        // scale row 1
        sshl            v6.8h,   v6.8h,   v18.8h
        sqxtun          v4.8b,   v4.8h                  // saturate to unsigned bytes
        sqxtun2         v4.16b,  v6.8h
        sshl            v24.8h,  v24.8h,  v18.8h        // scale row 2
        sshl            v26.8h,  v26.8h,  v18.8h
        sqxtun          v24.8b,  v24.8h
        sqxtun2         v24.16b, v26.8h
        mov             v6.16b,  v16.16b                // re-seed for next iteration
        st1             {v4.16b},  [x7], x2
        mov             v4.16b,  v16.16b
        st1             {v24.16b}, [x7], x2
        b.ne            1b
        ret
.endm
296 | ||
// Bi-directional weighted prediction body for 8-pixel-wide rows.
// Two rows are produced per loop iteration; the macro ends with `ret`.
//
// \macs / \macd: widening multiply-accumulate mnemonics (umlal or umlsl)
//                applied to the [x1] and [x0] inputs respectively.
// In:   x0 = first input, x1 = second input, x7 = output, x2 = stride
//       w3 = number of rows (decremented by 2 per iteration until zero)
//       w5 = byte weight for [x0], w6 = byte weight for [x1]
//       v16.8h = initial accumulator value, v18.8h = sshl shift counts
// Clobbers: v0, v1, v2, v4, v5, v6, v7, v20, flags
.macro  biweight_8      macs, macd
        dup             v0.8b,   w5
        dup             v1.8b,   w6
        mov             v2.16b,  v16.16b                // seed accumulators
        mov             v20.16b, v16.16b
1:      subs            w3,  w3,  #2                    // flags consumed by b.ne below
        ld1             {v4.8b}, [x0], x2
        \macd           v2.8h,   v0.8b,   v4.8b
        ld1             {v5.8b}, [x1], x2
        \macs           v2.8h,   v1.8b,   v5.8b
        ld1             {v6.8b}, [x0], x2
        \macd           v20.8h,  v0.8b,   v6.8b
        ld1             {v7.8b}, [x1], x2
        \macs           v20.8h,  v1.8b,   v7.8b
        sshl            v2.8h,   v2.8h,   v18.8h        // scale row 1
        sqxtun          v2.8b,   v2.8h                  // saturate to unsigned bytes
        sshl            v20.8h,  v20.8h,  v18.8h        // scale row 2
        sqxtun          v4.8b,   v20.8h
        mov             v20.16b, v16.16b                // re-seed for next iteration
        st1             {v2.8b}, [x7], x2
        mov             v2.16b,  v16.16b
        st1             {v4.8b}, [x7], x2
        b.ne            1b
        ret
.endm
322 | ||
// Bi-directional weighted prediction body for 4-pixel-wide rows.
// Two rows are packed into each 8-byte vector, so a full loop iteration
// handles four rows.  If the row counter goes negative after the first pair
// (fewer than four rows were left), only that pair is finished, at label 2.
// Both exits end with `ret`.
//
// \macs / \macd: widening multiply-accumulate mnemonics (umlal or umlsl)
//                applied to the [x1] and [x0] inputs respectively.
// In:   x0 = first input, x1 = second input, x7 = output, x2 = stride
//       w3 = number of rows (decremented by 4 per iteration)
//       w5 = byte weight for [x0], w6 = byte weight for [x1]
//       v16.8h = initial accumulator value, v18.8h = sshl shift counts
// Clobbers: v0, v1, v2, v4, v5, v6, v7, v20, flags
.macro  biweight_4      macs, macd
        dup             v0.8b,   w5
        dup             v1.8b,   w6
        mov             v2.16b,  v16.16b                // seed accumulators
        mov             v20.16b, v16.16b
1:      subs            w3,  w3,  #4                    // flags used by b.lt and b.ne
        ld1             {v4.s}[0], [x0], x2             // rows 0 and 1
        ld1             {v4.s}[1], [x0], x2
        \macd           v2.8h,   v0.8b,   v4.8b
        ld1             {v5.s}[0], [x1], x2
        ld1             {v5.s}[1], [x1], x2
        \macs           v2.8h,   v1.8b,   v5.8b
        b.lt            2f                              // only two rows were left
        ld1             {v6.s}[0], [x0], x2             // rows 2 and 3
        ld1             {v6.s}[1], [x0], x2
        \macd           v20.8h,  v0.8b,   v6.8b
        ld1             {v7.s}[0], [x1], x2
        ld1             {v7.s}[1], [x1], x2
        \macs           v20.8h,  v1.8b,   v7.8b
        sshl            v2.8h,   v2.8h,   v18.8h
        sqxtun          v2.8b,   v2.8h                  // saturate to unsigned bytes
        sshl            v20.8h,  v20.8h,  v18.8h
        sqxtun          v4.8b,   v20.8h
        mov             v20.16b, v16.16b                // re-seed for next iteration
        st1             {v2.s}[0], [x7], x2
        st1             {v2.s}[1], [x7], x2
        mov             v2.16b,  v16.16b
        st1             {v4.s}[0], [x7], x2
        st1             {v4.s}[1], [x7], x2
        b.ne            1b
        ret
2:      sshl            v2.8h,   v2.8h,   v18.8h        // tail: final two rows
        sqxtun          v2.8b,   v2.8h
        st1             {v2.s}[0], [x7], x2
        st1             {v2.s}[1], [x7], x2
        ret
.endm
360 | ||
// Defines ff_biweight_h264_pixels_<w>_neon for block width \w.
//
// Register use on entry, as read by the code below:
//   x0 = first input and output (copied to x7), x1 = second input
//   w2 = stride (32-bit signed), w3 = row count
//   w4 = shift amount, w5 = weight applied to [x0], w6 = weight applied to [x1]
//   w7 = offset
//
// Set-up:
//   v16.8h = ((w7 + 1) | 1) << w4    initial accumulator value
//   v18.8h = ~w4 = -(w4 + 1)         sshl count, i.e. shift right by w4 + 1
//   w8     = (w5 >> 31) ^ (w6 >> 30) sign selector: 0 both >= 0, 1 only w5 < 0,
//                                    2 both < 0, 3 only w6 < 0
// Negative weights are negated and the matching multiply-accumulate becomes
// umlsl.  Every biweight_\w expansion ends in `ret`, so the four variants
// never fall through into each other.
.macro  biweight_func   w
function ff_biweight_h264_pixels_\w\()_neon, export=1
        sxtw            x2,  w2
        lsr             w8,  w5,  #31
        add             w7,  w7,  #1
        eor             w8,  w8,  w6,  lsr #30
        orr             w7,  w7,  #1
        dup             v18.8h,  w4
        lsl             w7,  w7,  w4
        not             v18.16b, v18.16b
        dup             v16.8h,  w7
        mov             x7,  x0                 // x7 = output pointer from here on
        cbz             w8,  10f
        subs            w8,  w8,  #1
        b.eq            20f
        subs            w8,  w8,  #1
        b.eq            30f
        b               40f
10:     biweight_\w     umlal, umlal
20:     neg             w5,  w5
        biweight_\w     umlal, umlsl
30:     neg             w5,  w5
        neg             w6,  w6
        biweight_\w     umlsl, umlsl
40:     neg             w6,  w6
        biweight_\w     umlsl, umlal
endfunc
.endm

        biweight_func   16
        biweight_func   8
        biweight_func   4
393 | ||
// Single-source weighted prediction body for 16-pixel-wide rows.
// Two rows are produced per loop iteration; the macro ends with `ret`.
//
// \add: mnemonic that combines v16 with the weighted pixels
//       (the invocations in this file pass add, sub, shadd or shsub).
// In:   x0 = input, x5 = output, x1 = stride
//       w2 = number of rows (decremented by 2 per iteration until zero)
//       w4 = byte weight
//       v16.8h = offset term, v18.8h = srshl shift counts
// Clobbers: v0, v4, v6, v20, v24, v26, v28, flags
.macro  weight_16       add
        dup             v0.16b,  w4
1:      subs            w2,  w2,  #2                    // flags consumed by b.ne below
        ld1             {v20.16b}, [x0], x1
        umull           v4.8h,   v0.8b,   v20.8b
        umull2          v6.8h,   v0.16b,  v20.16b
        ld1             {v28.16b}, [x0], x1
        umull           v24.8h,  v0.8b,   v28.8b
        umull2          v26.8h,  v0.16b,  v28.16b
        \add            v4.8h,   v16.8h,  v4.8h
        srshl           v4.8h,   v4.8h,   v18.8h        // rounding shift
        \add            v6.8h,   v16.8h,  v6.8h
        srshl           v6.8h,   v6.8h,   v18.8h
        sqxtun          v4.8b,   v4.8h                  // saturate to unsigned bytes
        sqxtun2         v4.16b,  v6.8h
        \add            v24.8h,  v16.8h,  v24.8h
        srshl           v24.8h,  v24.8h,  v18.8h
        \add            v26.8h,  v16.8h,  v26.8h
        srshl           v26.8h,  v26.8h,  v18.8h
        sqxtun          v24.8b,  v24.8h
        sqxtun2         v24.16b, v26.8h
        st1             {v4.16b},  [x5], x1
        st1             {v24.16b}, [x5], x1
        b.ne            1b
        ret
.endm
420 | ||
// Single-source weighted prediction body for 8-pixel-wide rows.
// Two rows are produced per loop iteration; the macro ends with `ret`.
//
// \add: mnemonic that combines v16 with the weighted pixels
//       (the invocations in this file pass add, sub, shadd or shsub).
// In:   x0 = input, x5 = output, x1 = stride
//       w2 = number of rows (decremented by 2 per iteration until zero)
//       w4 = byte weight
//       v16.8h = offset term, v18.8h = srshl shift counts
// Clobbers: v0, v2, v4, v6, v20, flags
.macro  weight_8        add
        dup             v0.8b,   w4
1:      subs            w2,  w2,  #2                    // flags consumed by b.ne below
        ld1             {v4.8b}, [x0], x1
        umull           v2.8h,   v0.8b,   v4.8b
        ld1             {v6.8b}, [x0], x1
        umull           v20.8h,  v0.8b,   v6.8b
        \add            v2.8h,   v16.8h,  v2.8h
        srshl           v2.8h,   v2.8h,   v18.8h        // rounding shift
        sqxtun          v2.8b,   v2.8h                  // saturate to unsigned bytes
        \add            v20.8h,  v16.8h,  v20.8h
        srshl           v20.8h,  v20.8h,  v18.8h
        sqxtun          v4.8b,   v20.8h
        st1             {v2.8b}, [x5], x1
        st1             {v4.8b}, [x5], x1
        b.ne            1b
        ret
.endm
439 | ||
// Single-source weighted prediction body for 4-pixel-wide rows.
// Two rows are packed into each 8-byte vector, so a full loop iteration
// handles four rows.  If the row counter goes negative after the first pair
// (fewer than four rows were left), only that pair is finished, at label 2.
// Both exits end with `ret`.
//
// \add: mnemonic that combines v16 with the weighted pixels
//       (the invocations in this file pass add, sub, shadd or shsub).
// In:   x0 = input, x5 = output, x1 = stride
//       w2 = number of rows (decremented by 4 per iteration)
//       w4 = byte weight
//       v16.8h = offset term, v18.8h = srshl shift counts
// Clobbers: v0, v2, v4, v6, v20, flags
.macro  weight_4        add
        dup             v0.8b,   w4
1:      subs            w2,  w2,  #4                    // flags used by b.lt and b.ne
        ld1             {v4.s}[0], [x0], x1             // rows 0 and 1
        ld1             {v4.s}[1], [x0], x1
        umull           v2.8h,   v0.8b,   v4.8b
        b.lt            2f                              // only two rows were left
        ld1             {v6.s}[0], [x0], x1             // rows 2 and 3
        ld1             {v6.s}[1], [x0], x1
        umull           v20.8h,  v0.8b,   v6.8b
        \add            v2.8h,   v16.8h,  v2.8h
        srshl           v2.8h,   v2.8h,   v18.8h        // rounding shift
        sqxtun          v2.8b,   v2.8h                  // saturate to unsigned bytes
        \add            v20.8h,  v16.8h,  v20.8h
        srshl           v20.8h,  v20.8h,  v18.8h
        sqxtun          v4.8b,   v20.8h
        st1             {v2.s}[0], [x5], x1
        st1             {v2.s}[1], [x5], x1
        st1             {v4.s}[0], [x5], x1
        st1             {v4.s}[1], [x5], x1
        b.ne            1b
        ret
2:      \add            v2.8h,   v16.8h,  v2.8h         // tail: final two rows
        srshl           v2.8h,   v2.8h,   v18.8h
        sqxtun          v2.8b,   v2.8h
        st1             {v2.s}[0], [x5], x1
        st1             {v2.s}[1], [x5], x1
        ret
.endm
469 | ||
// Defines ff_weight_h264_pixels_<w>_neon for block width \w.
//
// Register use on entry, as read by the code below:
//   x0 = pixel block, filtered in place (copied to x5 as the output pointer)
//   w1 = stride (32-bit signed), w2 = row count
//   w3 = shift amount, w4 = weight, w5 = offset
//
// Set-up:
//   v16.8h = w5 << w3                                  offset term
//   w3 >  1: v18.8h = 1 - w3, combined with shadd/shsub (halving add/sub
//            followed by a rounding right shift by w3 - 1)
//   w3 <= 1: v18.8h = -w3, combined with plain add/sub (followed by a rounding
//            right shift by w3)
// A negative weight is negated and the subtracting variant is used instead.
// Every weight_\w expansion ends in `ret`, so the variants never fall through
// into each other.  Label 10 is defined twice; each `b.lt 10f` resolves to
// the next definition after it.
.macro  weight_func     w
function ff_weight_h264_pixels_\w\()_neon, export=1
        sxtw            x1,  w1
        cmp             w3,  #1                 // flags survive until b.le below
        mov             w6,  #1
        lsl             w5,  w5,  w3
        dup             v16.8h,  w5
        mov             x5,  x0                 // x5 = output pointer from here on
        b.le            20f
        sub             w6,  w6,  w3
        dup             v18.8h,  w6
        cmp             w4,  #0
        b.lt            10f
        weight_\w       shadd
10:     neg             w4,  w4
        weight_\w       shsub
20:     neg             w6,  w3
        dup             v18.8h,  w6
        cmp             w4,  #0
        b.lt            10f
        weight_\w       add
10:     neg             w4,  w4
        weight_\w       sub
endfunc
.endm

        weight_func     16
        weight_func     8
        weight_func     4