Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | |
3 | * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net> | |
4 | * | |
5 | * This file is part of FFmpeg. | |
6 | * | |
7 | * FFmpeg is free software; you can redistribute it and/or | |
8 | * modify it under the terms of the GNU Lesser General Public | |
9 | * License as published by the Free Software Foundation; either | |
10 | * version 2.1 of the License, or (at your option) any later version. | |
11 | * | |
12 | * FFmpeg is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | * Lesser General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU Lesser General Public | |
18 | * License along with FFmpeg; if not, write to the Free Software | |
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | */ | |
21 | ||
22 | #include "libavutil/aarch64/asm.S" | |
23 | ||
24 | /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ | |
25 | .macro h264_chroma_mc8 type, codec=h264 | |
26 | function ff_\type\()_\codec\()_chroma_mc8_neon, export=1 /* 8-pixel-wide bilinear chroma interpolation, two output rows per loop pass; x0 = dst, x1 = src, w2 = stride, w3 = h, w4 = x, w5 = y as in the prototype comment above */ | |
27 | sxtw x2, w2 /* widen the 32-bit stride to 64 bits for address arithmetic */ | |
28 | .ifc \type,avg | |
29 | mov x8, x0 /* avg only: x8 walks dst for reads while x0 is used for the stores */ | |
30 | .endif | |
31 | prfm pldl1strm, [x1] /* prefetch hint for the first source row */ | |
32 | prfm pldl1strm, [x1, x2] /* and for the row after it */ | |
33 | .ifc \codec,rv40 | |
34 | movrel x6, rv40bias /* rv40 only: pick the rounding bias out of the rv40bias table */ | |
35 | lsr w9, w5, #1 /* w9 = y >> 1 */ | |
36 | lsr w10, w4, #1 /* w10 = x >> 1 */ | |
37 | lsl w9, w9, #3 /* 8 bytes per table row (four 16-bit entries) */ | |
38 | lsl w10, w10, #1 /* 2 bytes per entry */ | |
39 | add w9, w9, w10 | |
40 | add x6, x6, w9, UXTW /* x6 = address of rv40bias[y >> 1][x >> 1] */ | |
41 | ld1r {v22.8H}, [x6] /* broadcast that bias to all eight 16-bit lanes of v22 */ | |
42 | .endif | |
43 | .ifc \codec,vc1 | |
44 | movi v22.8H, #28 /* vc1 only: constant bias of 28 in every 16-bit lane */ | |
45 | .endif | |
46 | mul w7, w4, w5 /* w7 = D = x * y */ | |
47 | lsl w14, w5, #3 /* w14 = 8 * y */ | |
48 | lsl w13, w4, #3 /* w13 = 8 * x */ | |
49 | cmp w7, #0 /* flags are consumed by the b.eq below; the sub/add in between do not set flags */ | |
50 | sub w6, w14, w7 /* w6 = C = 8*y - x*y = (8 - x) * y */ | |
51 | sub w12, w13, w7 /* w12 = B = 8*x - x*y = x * (8 - y) */ | |
52 | sub w4, w7, w13 | |
53 | sub w4, w4, w14 | |
54 | add w4, w4, #64 /* w4 = A = 64 - 8*x - 8*y + x*y = (8 - x) * (8 - y) */ | |
55 | b.eq 2f /* x * y == 0: use one of the cheaper paths starting at 2: */ | |
56 | ||
57 | dup v0.8B, w4 /* v0 = A in every byte; NOTE(review): presumably x and y are in 0..7 so each weight fits in a byte - confirm against callers */ | |
58 | dup v1.8B, w12 /* v1 = B */ | |
59 | ld1 {v4.8B, v5.8B}, [x1], x2 /* 16 bytes of the first row; x1 advances by one stride */ | |
60 | dup v2.8B, w6 /* v2 = C */ | |
61 | dup v3.8B, w7 /* v3 = D */ | |
62 | ext v5.8B, v4.8B, v5.8B, #1 /* v5 = the same row shifted by one pixel (bytes 1..8) */ | |
63 | 1: ld1 {v6.8B, v7.8B}, [x1], x2 /* 2-D loop: load the next row */ | |
64 | umull v16.8H, v4.8B, v0.8B /* out0 = A * row0[i] */ | |
65 | umlal v16.8H, v5.8B, v1.8B /* out0 += B * row0[i+1] */ | |
66 | ext v7.8B, v6.8B, v7.8B, #1 /* v7 = row1 shifted by one pixel */ | |
67 | ld1 {v4.8B, v5.8B}, [x1], x2 /* row2; it stays in v4/v5 as row0 of the next pass */ | |
68 | umlal v16.8H, v6.8B, v2.8B /* out0 += C * row1[i] */ | |
69 | prfm pldl1strm, [x1] | |
70 | ext v5.8B, v4.8B, v5.8B, #1 /* v5 = row2 shifted by one pixel */ | |
71 | umlal v16.8H, v7.8B, v3.8B /* out0 += D * row1[i+1] */ | |
72 | umull v17.8H, v6.8B, v0.8B /* out1 = A * row1[i] */ | |
73 | subs w3, w3, #2 /* two rows per pass; flags are consumed by the b.gt at the end of the loop */ | |
74 | umlal v17.8H, v7.8B, v1.8B /* out1 += B * row1[i+1] */ | |
75 | umlal v17.8H, v4.8B, v2.8B /* out1 += C * row2[i] */ | |
76 | umlal v17.8H, v5.8B, v3.8B /* out1 += D * row2[i+1] */ | |
77 | prfm pldl1strm, [x1, x2] | |
78 | .ifc \codec,h264 | |
79 | rshrn v16.8B, v16.8H, #6 /* h264: rounding shift, (sum + 32) >> 6, narrowed to bytes */ | |
80 | rshrn v17.8B, v17.8H, #6 | |
81 | .else | |
82 | add v16.8H, v16.8H, v22.8H /* rv40/vc1: add the codec bias held in v22 */ | |
83 | add v17.8H, v17.8H, v22.8H | |
84 | shrn v16.8B, v16.8H, #6 /* then a truncating >> 6, narrowed to bytes */ | |
85 | shrn v17.8B, v17.8H, #6 | |
86 | .endif | |
87 | .ifc \type,avg | |
88 | ld1 {v20.8B}, [x8], x2 /* avg only: fetch the two existing dst rows through x8 */ | |
89 | ld1 {v21.8B}, [x8], x2 | |
90 | urhadd v16.8B, v16.8B, v20.8B /* rounding average with the existing pixels, (a + b + 1) >> 1 */ | |
91 | urhadd v17.8B, v17.8B, v21.8B | |
92 | .endif | |
93 | st1 {v16.8B}, [x0], x2 /* store both output rows */ | |
94 | st1 {v17.8B}, [x0], x2 | |
95 | b.gt 1b /* loop while w3 is still positive after the subs above */ | |
96 | ret | |
97 | ||
98 | 2: adds w12, w12, w6 /* x * y == 0 here; w12 = B + C = 8*x + 8*y, flags set for the b.eq below */ | |
99 | dup v0.8B, w4 /* v0 = A */ | |
100 | b.eq 5f /* B + C == 0: no fractional offset on either axis */ | |
101 | tst w6, w6 /* C == 0 means y == 0 */ | |
102 | dup v1.8B, w12 /* v1 = B + C, i.e. whichever of the two weights is non-zero */ | |
103 | b.eq 4f /* y == 0: horizontal-only filter */ | |
104 | ||
105 | ld1 {v4.8B}, [x1], x2 /* vertical-only filter: preload row0 */ | |
106 | 3: ld1 {v6.8B}, [x1], x2 /* row1 */ | |
107 | umull v16.8H, v4.8B, v0.8B /* out0 = A * row0 */ | |
108 | umlal v16.8H, v6.8B, v1.8B /* out0 += C * row1 */ | |
109 | ld1 {v4.8B}, [x1], x2 /* row2; kept as row0 of the next pass */ | |
110 | umull v17.8H, v6.8B, v0.8B /* out1 = A * row1 */ | |
111 | umlal v17.8H, v4.8B, v1.8B /* out1 += C * row2 */ | |
112 | prfm pldl1strm, [x1] | |
113 | .ifc \codec,h264 | |
114 | rshrn v16.8B, v16.8H, #6 /* h264: (sum + 32) >> 6 */ | |
115 | rshrn v17.8B, v17.8H, #6 | |
116 | .else | |
117 | add v16.8H, v16.8H, v22.8H /* rv40/vc1: add bias, then truncating >> 6 */ | |
118 | add v17.8H, v17.8H, v22.8H | |
119 | shrn v16.8B, v16.8H, #6 | |
120 | shrn v17.8B, v17.8H, #6 | |
121 | .endif | |
122 | prfm pldl1strm, [x1, x2] | |
123 | .ifc \type,avg | |
124 | ld1 {v20.8B}, [x8], x2 /* avg only: average with the existing dst rows */ | |
125 | ld1 {v21.8B}, [x8], x2 | |
126 | urhadd v16.8B, v16.8B, v20.8B | |
127 | urhadd v17.8B, v17.8B, v21.8B | |
128 | .endif | |
129 | subs w3, w3, #2 /* two rows done */ | |
130 | st1 {v16.8B}, [x0], x2 | |
131 | st1 {v17.8B}, [x0], x2 | |
132 | b.gt 3b | |
133 | ret | |
134 | ||
135 | 4: ld1 {v4.8B, v5.8B}, [x1], x2 /* horizontal-only filter: two independent rows per pass */ | |
136 | ld1 {v6.8B, v7.8B}, [x1], x2 | |
137 | ext v5.8B, v4.8B, v5.8B, #1 /* each row shifted by one pixel */ | |
138 | ext v7.8B, v6.8B, v7.8B, #1 | |
139 | prfm pldl1strm, [x1] | |
140 | subs w3, w3, #2 /* flags are consumed by the b.gt at the end of the loop */ | |
141 | umull v16.8H, v4.8B, v0.8B /* out0 = A * row0[i] */ | |
142 | umlal v16.8H, v5.8B, v1.8B /* out0 += B * row0[i+1] */ | |
143 | umull v17.8H, v6.8B, v0.8B /* out1 = A * row1[i] */ | |
144 | umlal v17.8H, v7.8B, v1.8B /* out1 += B * row1[i+1] */ | |
145 | prfm pldl1strm, [x1, x2] | |
146 | .ifc \codec,h264 | |
147 | rshrn v16.8B, v16.8H, #6 /* h264: (sum + 32) >> 6 */ | |
148 | rshrn v17.8B, v17.8H, #6 | |
149 | .else | |
150 | add v16.8H, v16.8H, v22.8H /* rv40/vc1: add bias, then truncating >> 6 */ | |
151 | add v17.8H, v17.8H, v22.8H | |
152 | shrn v16.8B, v16.8H, #6 | |
153 | shrn v17.8B, v17.8H, #6 | |
154 | .endif | |
155 | .ifc \type,avg | |
156 | ld1 {v20.8B}, [x8], x2 /* avg only: average with the existing dst rows */ | |
157 | ld1 {v21.8B}, [x8], x2 | |
158 | urhadd v16.8B, v16.8B, v20.8B | |
159 | urhadd v17.8B, v17.8B, v21.8B | |
160 | .endif | |
161 | st1 {v16.8B}, [x0], x2 | |
162 | st1 {v17.8B}, [x0], x2 | |
163 | b.gt 4b | |
164 | ret | |
165 | ||
166 | 5: ld1 {v4.8B}, [x1], x2 /* x == 0 and y == 0: A is 64, so scaling by A and shifting right by 6 yields the source pixel */ | |
167 | ld1 {v5.8B}, [x1], x2 | |
168 | prfm pldl1strm, [x1] | |
169 | subs w3, w3, #2 /* flags are consumed by the b.gt at the end of the loop */ | |
170 | umull v16.8H, v4.8B, v0.8B /* out0 = A * row0 */ | |
171 | umull v17.8H, v5.8B, v0.8B /* out1 = A * row1 */ | |
172 | prfm pldl1strm, [x1, x2] | |
173 | .ifc \codec,h264 | |
174 | rshrn v16.8B, v16.8H, #6 /* h264: (sum + 32) >> 6 */ | |
175 | rshrn v17.8B, v17.8H, #6 | |
176 | .else | |
177 | add v16.8H, v16.8H, v22.8H /* rv40/vc1: add bias, then truncating >> 6 */ | |
178 | add v17.8H, v17.8H, v22.8H | |
179 | shrn v16.8B, v16.8H, #6 | |
180 | shrn v17.8B, v17.8H, #6 | |
181 | .endif | |
182 | .ifc \type,avg | |
183 | ld1 {v20.8B}, [x8], x2 /* avg only: average with the existing dst rows */ | |
184 | ld1 {v21.8B}, [x8], x2 | |
185 | urhadd v16.8B, v16.8B, v20.8B | |
186 | urhadd v17.8B, v17.8B, v21.8B | |
187 | .endif | |
188 | st1 {v16.8B}, [x0], x2 | |
189 | st1 {v17.8B}, [x0], x2 | |
190 | b.gt 5b | |
191 | ret | |
192 | endfunc | |
193 | .endm | |
194 | ||
195 | /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ | |
196 | .macro h264_chroma_mc4 type, codec=h264 | |
197 | function ff_\type\()_\codec\()_chroma_mc4_neon, export=1 /* 4-pixel-wide bilinear chroma interpolation; two output rows are packed into one 8-byte vector per loop pass; x0 = dst, x1 = src, w2 = stride, w3 = h, w4 = x, w5 = y as in the prototype comment above */ | |
198 | sxtw x2, w2 /* widen the 32-bit stride to 64 bits for address arithmetic */ | |
199 | .ifc \type,avg | |
200 | mov x8, x0 /* avg only: x8 walks dst for reads while x0 is used for the stores */ | |
201 | .endif | |
202 | prfm pldl1strm, [x1] /* prefetch hint for the first source row */ | |
203 | prfm pldl1strm, [x1, x2] /* and for the row after it */ | |
204 | .ifc \codec,rv40 | |
205 | movrel x6, rv40bias /* rv40 only: pick the rounding bias out of the rv40bias table */ | |
206 | lsr w9, w5, #1 /* w9 = y >> 1 */ | |
207 | lsr w10, w4, #1 /* w10 = x >> 1 */ | |
208 | lsl w9, w9, #3 /* 8 bytes per table row (four 16-bit entries) */ | |
209 | lsl w10, w10, #1 /* 2 bytes per entry */ | |
210 | add w9, w9, w10 | |
211 | add x6, x6, w9, UXTW /* x6 = address of rv40bias[y >> 1][x >> 1] */ | |
212 | ld1r {v22.8H}, [x6] /* broadcast that bias to all eight 16-bit lanes of v22 */ | |
213 | .endif | |
214 | .ifc \codec,vc1 | |
215 | movi v22.8H, #28 /* vc1 only: constant bias of 28 in every 16-bit lane */ | |
216 | .endif | |
217 | mul w7, w4, w5 /* w7 = D = x * y */ | |
218 | lsl w14, w5, #3 /* w14 = 8 * y */ | |
219 | lsl w13, w4, #3 /* w13 = 8 * x */ | |
220 | cmp w7, #0 /* flags are consumed by the b.eq below; the sub/add in between do not set flags */ | |
221 | sub w6, w14, w7 /* w6 = C = 8*y - x*y = (8 - x) * y */ | |
222 | sub w12, w13, w7 /* w12 = B = 8*x - x*y = x * (8 - y) */ | |
223 | sub w4, w7, w13 | |
224 | sub w4, w4, w14 | |
225 | add w4, w4, #64 /* w4 = A = 64 - 8*x - 8*y + x*y = (8 - x) * (8 - y) */ | |
226 | b.eq 2f /* x * y == 0: use one of the cheaper paths starting at 2: */ | |
227 | ||
228 | dup v24.8B, w4 /* v24 = A in every byte; NOTE(review): presumably x and y are in 0..7 so each weight fits in a byte - confirm against callers */ | |
229 | dup v25.8B, w12 /* v25 = B */ | |
230 | ld1 {v4.8B}, [x1], x2 /* 8 bytes of the first row; x1 advances by one stride */ | |
231 | dup v26.8B, w6 /* v26 = C */ | |
232 | dup v27.8B, w7 /* v27 = D */ | |
233 | ext v5.8B, v4.8B, v5.8B, #1 /* v5 = row shifted by one pixel; the byte taken from the old v5 lands in byte 7, which the trn1 below discards */ | |
234 | trn1 v0.2S, v24.2S, v25.2S /* v0 = A A A A B B B B */ | |
235 | trn1 v2.2S, v26.2S, v27.2S /* v2 = C C C C D D D D */ | |
236 | trn1 v4.2S, v4.2S, v5.2S /* v4 = row0[0..3], row0[1..4] */ | |
237 | 1: ld1 {v6.8B}, [x1], x2 /* 2-D loop: load the next row */ | |
238 | ext v7.8B, v6.8B, v7.8B, #1 | |
239 | trn1 v6.2S, v6.2S, v7.2S /* v6 = row1[0..3], row1[1..4] */ | |
240 | umull v18.8H, v4.8B, v0.8B /* low half: A * row0[i], high half: B * row0[i+1] */ | |
241 | umlal v18.8H, v6.8B, v2.8B /* low half += C * row1[i], high half += D * row1[i+1] */ | |
242 | ld1 {v4.8B}, [x1], x2 /* row2; it stays in v4 as row0 of the next pass */ | |
243 | ext v5.8B, v4.8B, v5.8B, #1 | |
244 | trn1 v4.2S, v4.2S, v5.2S /* v4 = row2[0..3], row2[1..4] */ | |
245 | prfm pldl1strm, [x1] | |
246 | umull v19.8H, v6.8B, v0.8B /* same partial sums for the second output row */ | |
247 | umlal v19.8H, v4.8B, v2.8B | |
248 | trn1 v30.2D, v18.2D, v19.2D /* v30 = low halves of v18 and v19 */ | |
249 | trn2 v31.2D, v18.2D, v19.2D /* v31 = high halves of v18 and v19 */ | |
250 | add v18.8H, v30.8H, v31.8H /* v18 = four sums of output row 0, then four sums of output row 1 */ | |
251 | .ifc \codec,h264 | |
252 | rshrn v16.8B, v18.8H, #6 /* h264: rounding shift, (sum + 32) >> 6, narrowed to bytes */ | |
253 | .else | |
254 | add v18.8H, v18.8H, v22.8H /* rv40/vc1: add the codec bias held in v22 */ | |
255 | shrn v16.8B, v18.8H, #6 /* then a truncating >> 6, narrowed to bytes */ | |
256 | .endif | |
257 | subs w3, w3, #2 /* two rows per pass; flags are consumed by the b.gt at the end of the loop */ | |
258 | prfm pldl1strm, [x1, x2] | |
259 | .ifc \type,avg | |
260 | ld1 {v20.S}[0], [x8], x2 /* avg only: fetch the two existing 4-pixel dst rows through x8 */ | |
261 | ld1 {v20.S}[1], [x8], x2 | |
262 | urhadd v16.8B, v16.8B, v20.8B /* rounding average with the existing pixels, (a + b + 1) >> 1 */ | |
263 | .endif | |
264 | st1 {v16.S}[0], [x0], x2 /* low 4 bytes = output row 0 */ | |
265 | st1 {v16.S}[1], [x0], x2 /* high 4 bytes = output row 1 */ | |
266 | b.gt 1b | |
267 | ret | |
268 | ||
269 | 2: adds w12, w12, w6 /* x * y == 0 here; w12 = B + C = 8*x + 8*y, flags set for the b.eq below */ | |
270 | dup v30.8B, w4 /* v30 = A */ | |
271 | b.eq 5f /* B + C == 0: no fractional offset on either axis */ | |
272 | tst w6, w6 /* C == 0 means y == 0 */ | |
273 | dup v31.8B, w12 /* v31 = B + C, i.e. whichever of the two weights is non-zero (W below) */ | |
274 | trn1 v0.2S, v30.2S, v31.2S /* v0 = A A A A W W W W */ | |
275 | trn2 v1.2S, v30.2S, v31.2S /* v1 = same pattern; only the vertical path below uses it */ | |
276 | b.eq 4f /* y == 0: horizontal-only filter */ | |
277 | ||
278 | ext v1.8B, v0.8B, v1.8B, #4 /* vertical-only filter: v1 = W W W W A A A A */ | |
279 | ld1 {v4.S}[0], [x1], x2 /* preload row0 into the low half of v4 */ | |
280 | 3: ld1 {v4.S}[1], [x1], x2 /* v4 = row0, row1 */ | |
281 | umull v18.8H, v4.8B, v0.8B /* v18 = A * row0, W * row1 */ | |
282 | ld1 {v4.S}[0], [x1], x2 /* v4 = row2, row1; row2 stays as row0 of the next pass */ | |
283 | umull v19.8H, v4.8B, v1.8B /* v19 = W * row2, A * row1 */ | |
284 | trn1 v30.2D, v18.2D, v19.2D /* v30 = A * row0, W * row2 */ | |
285 | trn2 v31.2D, v18.2D, v19.2D /* v31 = W * row1, A * row1 */ | |
286 | add v18.8H, v30.8H, v31.8H /* v18 = out0 sums, out1 sums */ | |
287 | prfm pldl1strm, [x1] | |
288 | .ifc \codec,h264 | |
289 | rshrn v16.8B, v18.8H, #6 /* h264: (sum + 32) >> 6 */ | |
290 | .else | |
291 | add v18.8H, v18.8H, v22.8H /* rv40/vc1: add bias, then truncating >> 6 */ | |
292 | shrn v16.8B, v18.8H, #6 | |
293 | .endif | |
294 | .ifc \type,avg | |
295 | ld1 {v20.S}[0], [x8], x2 /* avg only: average with the existing dst rows */ | |
296 | ld1 {v20.S}[1], [x8], x2 | |
297 | urhadd v16.8B, v16.8B, v20.8B | |
298 | .endif | |
299 | subs w3, w3, #2 /* two rows done */ | |
300 | prfm pldl1strm, [x1, x2] | |
301 | st1 {v16.S}[0], [x0], x2 | |
302 | st1 {v16.S}[1], [x0], x2 | |
303 | b.gt 3b | |
304 | ret | |
305 | ||
306 | 4: ld1 {v4.8B}, [x1], x2 /* horizontal-only filter: two independent rows per pass */ | |
307 | ld1 {v6.8B}, [x1], x2 | |
308 | ext v5.8B, v4.8B, v5.8B, #1 /* shifted copy; the byte taken from the old v5 is discarded by the trn1 below */ | |
309 | ext v7.8B, v6.8B, v7.8B, #1 | |
310 | trn1 v4.2S, v4.2S, v5.2S /* v4 = row0[0..3], row0[1..4] */ | |
311 | trn1 v6.2S, v6.2S, v7.2S /* v6 = row1[0..3], row1[1..4] */ | |
312 | umull v18.8H, v4.8B, v0.8B /* v18 = A * row0[i], W * row0[i+1] */ | |
313 | umull v19.8H, v6.8B, v0.8B /* v19 = A * row1[i], W * row1[i+1] */ | |
314 | subs w3, w3, #2 /* flags are consumed by the b.gt at the end of the loop */ | |
315 | trn1 v30.2D, v18.2D, v19.2D | |
316 | trn2 v31.2D, v18.2D, v19.2D | |
317 | add v18.8H, v30.8H, v31.8H /* v18 = out0 sums, out1 sums */ | |
318 | prfm pldl1strm, [x1] | |
319 | .ifc \codec,h264 | |
320 | rshrn v16.8B, v18.8H, #6 /* h264: (sum + 32) >> 6 */ | |
321 | .else | |
322 | add v18.8H, v18.8H, v22.8H /* rv40/vc1: add bias, then truncating >> 6 */ | |
323 | shrn v16.8B, v18.8H, #6 | |
324 | .endif | |
325 | .ifc \type,avg | |
326 | ld1 {v20.S}[0], [x8], x2 /* avg only: average with the existing dst rows */ | |
327 | ld1 {v20.S}[1], [x8], x2 | |
328 | urhadd v16.8B, v16.8B, v20.8B | |
329 | .endif | |
330 | prfm pldl1strm, [x1] /* NOTE(review): same address as the prfm earlier in this loop, while loops 1 and 3 use [x1, x2] for their second hint - confirm intent */ | |
331 | st1 {v16.S}[0], [x0], x2 | |
332 | st1 {v16.S}[1], [x0], x2 | |
333 | b.gt 4b | |
334 | ret | |
335 | ||
336 | 5: ld1 {v4.S}[0], [x1], x2 /* x == 0 and y == 0: A is 64, so scaling by A and shifting right by 6 yields the source pixel */ | |
337 | ld1 {v4.S}[1], [x1], x2 /* v4 = row0, row1 */ | |
338 | umull v18.8H, v4.8B, v30.8B /* both rows scaled by A */ | |
339 | subs w3, w3, #2 /* flags are consumed by the b.gt at the end of the loop */ | |
340 | prfm pldl1strm, [x1] | |
341 | .ifc \codec,h264 | |
342 | rshrn v16.8B, v18.8H, #6 /* h264: (sum + 32) >> 6 */ | |
343 | .else | |
344 | add v18.8H, v18.8H, v22.8H /* rv40/vc1: add bias, then truncating >> 6 */ | |
345 | shrn v16.8B, v18.8H, #6 | |
346 | .endif | |
347 | .ifc \type,avg | |
348 | ld1 {v20.S}[0], [x8], x2 /* avg only: average with the existing dst rows */ | |
349 | ld1 {v20.S}[1], [x8], x2 | |
350 | urhadd v16.8B, v16.8B, v20.8B | |
351 | .endif | |
352 | prfm pldl1strm, [x1] /* NOTE(review): same address as the prfm earlier in this loop, while loops 1 and 3 use [x1, x2] for their second hint - confirm intent */ | |
353 | st1 {v16.S}[0], [x0], x2 | |
354 | st1 {v16.S}[1], [x0], x2 | |
355 | b.gt 5b | |
356 | ret | |
357 | endfunc | |
358 | .endm | |
359 | ||
360 | .macro h264_chroma_mc2 type | |
361 | function ff_\type\()_h264_chroma_mc2_neon, export=1 /* 2-pixel-wide bilinear chroma interpolation, H.264 rounding only, two output rows per loop pass; registers as used below: x0 = dst, x1 = src, w2 = stride, w3 = h, w4 = x, w5 = y */ | |
362 | sxtw x2, w2 /* widen the 32-bit stride to 64 bits for address arithmetic */ | |
363 | prfm pldl1strm, [x1] /* prefetch hint for the first source row */ | |
364 | prfm pldl1strm, [x1, x2] /* and for the row after it */ | |
365 | orr w7, w4, w5 | |
366 | cbz w7, 2f /* x == 0 and y == 0: plain copy (or average) at 2: */ | |
367 | ||
368 | mul w7, w4, w5 /* w7 = D = x * y */ | |
369 | lsl w14, w5, #3 /* w14 = 8 * y */ | |
370 | lsl w13, w4, #3 /* w13 = 8 * x */ | |
371 | sub w6, w14, w7 /* w6 = C = 8*y - x*y = (8 - x) * y */ | |
372 | sub w12, w13, w7 /* w12 = B = 8*x - x*y = x * (8 - y) */ | |
373 | sub w4, w7, w13 | |
374 | sub w4, w4, w14 | |
375 | add w4, w4, #64 /* w4 = A = 64 - 8*x - 8*y + x*y = (8 - x) * (8 - y) */ | |
376 | dup v0.8B, w4 /* A in every byte; NOTE(review): presumably x and y are in 0..7 so each weight fits in a byte - confirm against callers */ | |
377 | dup v2.8B, w12 /* B */ | |
378 | dup v1.8B, w6 /* C */ | |
379 | dup v3.8B, w7 /* D */ | |
380 | trn1 v0.4H, v0.4H, v2.4H /* v0 bytes = A A B B A A B B */ | |
381 | trn1 v1.4H, v1.4H, v3.4H /* v1 bytes = C C D D C C D D */ | |
382 | 1: | |
383 | ld1 {v4.S}[0], [x1], x2 /* 4 bytes of row0 */ | |
384 | ld1 {v4.S}[1], [x1], x2 /* 4 bytes of row1 */ | |
385 | rev64 v5.2S, v4.2S /* v5 = row1, row0 */ | |
386 | ld1 {v5.S}[1], [x1] /* v5 = row1, row2; x1 is not advanced, so row2 is loaded again as row0 of the next pass */ | |
387 | ext v6.8B, v4.8B, v5.8B, #1 /* v4 shifted by one byte: row0[1..3] in bytes 0..2, row1[1..3] in bytes 4..6 */ | |
388 | ext v7.8B, v5.8B, v4.8B, #1 /* v5 shifted by one byte: row1[1..3] in bytes 0..2, row2[1..3] in bytes 4..6 */ | |
389 | trn1 v4.4H, v4.4H, v6.4H /* v4 = row0[0,1], row0[1,2], row1[0,1], row1[1,2] */ | |
390 | trn1 v5.4H, v5.4H, v7.4H /* v5 = row1[0,1], row1[1,2], row2[0,1], row2[1,2] */ | |
391 | umull v16.8H, v4.8B, v0.8B /* A and B taps on the upper row of each pair */ | |
392 | umlal v16.8H, v5.8B, v1.8B /* C and D taps on the lower row of each pair */ | |
393 | .ifc \type,avg | |
394 | ld1 {v18.H}[0], [x0], x2 /* avg only: existing dst row 0 into 16-bit lane 0 */ | |
395 | ld1 {v18.H}[2], [x0] /* existing dst row 1 into 16-bit lane 2, matching the result layout */ | |
396 | sub x0, x0, x2 /* rewind x0 to row 0 for the stores below */ | |
397 | .endif | |
398 | rev64 v17.4S, v16.4S /* swap adjacent 32-bit words so the B/D partial sums line up with the A/C ones */ | |
399 | add v16.8H, v16.8H, v17.8H /* 16-bit lanes 0,1 and 4,5 now hold the full four-tap sums of output rows 0 and 1 */ | |
400 | rshrn v16.8B, v16.8H, #6 /* rounding shift, (sum + 32) >> 6; results sit in bytes 0,1 and 4,5 */ | |
401 | .ifc \type,avg | |
402 | urhadd v16.8B, v16.8B, v18.8B /* rounding average with the existing pixels, (a + b + 1) >> 1 */ | |
403 | .endif | |
404 | st1 {v16.H}[0], [x0], x2 /* two pixels of output row 0 */ | |
405 | st1 {v16.H}[2], [x0], x2 /* two pixels of output row 1 */ | |
406 | subs w3, w3, #2 /* two rows done */ | |
407 | b.gt 1b /* loop while w3 is still positive */ | |
408 | ret | |
409 | ||
410 | 2: | |
411 | ld1 {v16.H}[0], [x1], x2 /* no interpolation: two source pixels of row 0 */ | |
412 | ld1 {v16.H}[1], [x1], x2 /* two source pixels of row 1 */ | |
413 | .ifc \type,avg | |
414 | ld1 {v18.H}[0], [x0], x2 /* avg only: existing dst rows in the same lane layout */ | |
415 | ld1 {v18.H}[1], [x0] | |
416 | sub x0, x0, x2 /* rewind x0 to row 0 for the stores below */ | |
417 | urhadd v16.8B, v16.8B, v18.8B /* rounding average with the existing pixels */ | |
418 | .endif | |
419 | st1 {v16.H}[0], [x0], x2 | |
420 | st1 {v16.H}[1], [x0], x2 | |
421 | subs w3, w3, #2 /* two rows done */ | |
422 | b.gt 2b | |
423 | ret | |
424 | endfunc | |
425 | .endm | |
426 | ||
427 | h264_chroma_mc8 put /* H.264 variants (codec defaults to h264): put and avg functions for block widths 8, 4 and 2 */ | |
428 | h264_chroma_mc8 avg | |
429 | h264_chroma_mc4 put | |
430 | h264_chroma_mc4 avg | |
431 | h264_chroma_mc2 put | |
432 | h264_chroma_mc2 avg | |
433 | ||
434 | #if CONFIG_RV40_DECODER /* bias table and RV40 variants are assembled only when CONFIG_RV40_DECODER is non-zero */ | |
435 | const rv40bias /* 4 x 4 table of 16-bit rounding biases; the rv40 macro code indexes it as [y >> 1][x >> 1] */ | |
436 | .short 0, 16, 32, 16 | |
437 | .short 32, 28, 32, 28 | |
438 | .short 0, 32, 16, 32 | |
439 | .short 32, 28, 32, 28 | |
440 | endconst | |
441 | ||
442 | h264_chroma_mc8 put, rv40 /* emits ff_put_rv40_chroma_mc8_neon; the lines below emit the avg and 4-wide counterparts */ | |
443 | h264_chroma_mc8 avg, rv40 | |
444 | h264_chroma_mc4 put, rv40 | |
445 | h264_chroma_mc4 avg, rv40 | |
446 | #endif | |
447 | ||
448 | #if CONFIG_VC1_DECODER /* VC-1 variants are assembled only when CONFIG_VC1_DECODER is non-zero; they use the constant bias of 28 set in the macros */ | |
449 | h264_chroma_mc8 put, vc1 /* emits ff_put_vc1_chroma_mc8_neon; the lines below emit the avg and 4-wide counterparts */ | |
450 | h264_chroma_mc8 avg, vc1 | |
451 | h264_chroma_mc4 put, vc1 | |
452 | h264_chroma_mc4 avg, vc1 | |
453 | #endif | |