/*
 * VC1 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

#include "config.h"

@ Transpose rows into columns of a matrix of 16-bit elements. For 4x4, pass
@ double-word registers; for 8x4, pass quad-word registers.
.macro transpose16 r0, r1, r2, r3
        @ At this point:
        @   row[0]  r0
        @   row[1]  r1
        @   row[2]  r2
        @   row[3]  r3

        vtrn.16 \r0, \r1 @ first and second row
        vtrn.16 \r2, \r3 @ third and fourth row
        vtrn.32 \r0, \r2 @ first and third row
        vtrn.32 \r1, \r3 @ second and fourth row

        @ At this point, if registers are quad-word:
        @   column[0]  d0
        @   column[1]  d2
        @   column[2]  d4
        @   column[3]  d6
        @   column[4]  d1
        @   column[5]  d3
        @   column[6]  d5
        @   column[7]  d7

        @ At this point, if registers are double-word:
        @   column[0]  d0
        @   column[1]  d1
        @   column[2]  d2
        @   column[3]  d3
.endm
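
@ Worked 4x4 example (illustrative only): if \r0..\r3 start out holding the
@ rows {a0 a1 a2 a3}, {b0 b1 b2 b3}, {c0 c1 c2 c3}, {d0 d1 d2 d3}, the two
@ vtrn.16 steps swap the odd/even 16-bit lanes between row pairs, and the two
@ vtrn.32 steps swap the 32-bit halves, leaving the registers holding the
@ columns {a0 b0 c0 d0}, {a1 b1 c1 d1}, {a2 b2 c2 d2}, {a3 b3 c3 d3}.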

@ ff_vc1_inv_trans_{4,8}x{4,8}_neon and overflow: The input values in the file
@ are supposed to be in a specific range so as to allow for 16-bit math without
@ causing overflows, but sometimes the input values are just big enough to
@ barely cause overflow in vadd instructions like:
@
@   vadd.i16  q0, q8, q10
@   vshr.s16  q0, q0, #\rshift
@
@ To prevent these borderline cases from overflowing, we just need one more
@ bit of precision, which is accomplished by replacing the sequence above with:
@
@   vhadd.s16 q0, q8, q10
@   vshr.s16  q0, q0, #(\rshift - 1)
@
@ This works because vhadd is a single instruction that adds, then shifts to
@ the right once, all before writing the result to the destination register.
@
@ Even with this workaround, there were still some files that caused overflows
@ in ff_vc1_inv_trans_8x8_neon. See the comments in ff_vc1_inv_trans_8x8_neon
@ for the additional workaround.
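@
@ The identity behind the trick (illustrative; both shifts are arithmetic):
@   (a + b) >> N  ==  ((a + b) >> 1) >> (N - 1)
@ vhadd.s16 computes (a + b) >> 1 with an internal 17-bit sum, so the addition
@ itself can no longer wrap in 16 bits.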

@ Takes 4 columns of 8 values each and operates on them. Modeled after the first
@ for loop in vc1_inv_trans_4x8_c.
@ Input columns:  q0 q1 q2 q3
@ Output columns: q0 q1 q2 q3
@ Trashes: r12 q8 q9 q10 q11 q12 q13
.macro vc1_inv_trans_4x8_helper add rshift
        @ Compute temp1, temp2 and set up the scalars #17, #22, #10
        vadd.i16 q12, q0, q2 @ temp1 = src[0] + src[2]
        movw r12, #17
        vsub.i16 q13, q0, q2 @ temp2 = src[0] - src[2]
        movt r12, #22
        vmov.32 d0[0], r12
        movw r12, #10
        vmov.16 d1[0], r12

        vmov.i16 q8, #\add @ t1 will accumulate here
        vmov.i16 q9, #\add @ t2 will accumulate here

        vmul.i16 q10, q1, d0[1] @ t3 = 22 * (src[1])
        vmul.i16 q11, q3, d0[1] @ t4 = 22 * (src[3])

        vmla.i16 q8, q12, d0[0] @ t1 = 17 * (temp1) + 4
        vmla.i16 q9, q13, d0[0] @ t2 = 17 * (temp2) + 4

        vmla.i16 q10, q3, d1[0] @ t3 += 10 * src[3]
        vmls.i16 q11, q1, d1[0] @ t4 -= 10 * src[1]

        vhadd.s16 q0, q8, q10 @ dst[0] = (t1 + t3) >> 1
        vhsub.s16 q3, q8, q10 @ dst[3] = (t1 - t3) >> 1
        vhsub.s16 q1, q9, q11 @ dst[1] = (t2 - t4) >> 1
        vhadd.s16 q2, q9, q11 @ dst[2] = (t2 + t4) >> 1

        @ Halving add/sub above already did one shift
        vshr.s16 q0, q0, #(\rshift - 1) @ dst[0] >>= (rshift - 1)
        vshr.s16 q3, q3, #(\rshift - 1) @ dst[3] >>= (rshift - 1)
        vshr.s16 q1, q1, #(\rshift - 1) @ dst[1] >>= (rshift - 1)
        vshr.s16 q2, q2, #(\rshift - 1) @ dst[2] >>= (rshift - 1)
.endm
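
@ For reference, this corresponds to the scalar 4-point transform (a sketch,
@ restating the comments above):
@   t1 = 17 * (src[0] + src[2]) + add     t3 = 22 * src[1] + 10 * src[3]
@   t2 = 17 * (src[0] - src[2]) + add     t4 = 22 * src[3] - 10 * src[1]
@   dst[0] = (t1 + t3) >> rshift          dst[2] = (t2 + t4) >> rshift
@   dst[1] = (t2 - t4) >> rshift          dst[3] = (t1 - t3) >> rshift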

@ Takes 8 columns of 4 values each and operates on them. Modeled after the second
@ for loop in vc1_inv_trans_4x8_c.
@ Input columns:  d0 d2 d4 d6 d1 d3 d5 d7
@ Output columns: d16 d17 d18 d19 d21 d20 d23 d22
@ Trashes all NEON registers (and r12) except for: q4 q5 q6 q7
.macro vc1_inv_trans_8x4_helper add add1beforeshift rshift
        @ At this point:
        @   src[0]   d0 overwritten later
        @   src[8]   d2
        @   src[16]  d4 overwritten later
        @   src[24]  d6
        @   src[32]  d1 overwritten later
        @   src[40]  d3
        @   src[48]  d5 overwritten later
        @   src[56]  d7

        movw r12, #12
        vmov.i16 q14, #\add @ t1|t2 will accumulate here
        movt r12, #6

        vadd.i16 d20, d0, d1 @ temp1 = src[0] + src[32]
        vsub.i16 d21, d0, d1 @ temp2 = src[0] - src[32]
        vmov.i32 d0[0], r12 @ 16-bit: d0[0] = #12, d0[1] = #6

        vshl.i16 q15, q2, #4 @ t3|t4 = 16 * (src[16]|src[48])
        vswp d4, d5 @ q2 = src[48]|src[16]
        vmla.i16 q14, q10, d0[0] @ t1|t2 = 12 * (temp1|temp2) + 64
        movw r12, #15
        movt r12, #9
        vmov.i32 d0[1], r12 @ 16-bit: d0[2] = #15, d0[3] = #9
        vneg.s16 d31, d31 @ t4 = -t4
        vmla.i16 q15, q2, d0[1] @ t3|t4 += 6 * (src[48]|src[16])

        @ At this point:
        @   d0[2]  #15
        @   d0[3]  #9
        @   q1     src[8]|src[40]
        @   q3     src[24]|src[56]
        @   q14    old t1|t2
        @   q15    old t3|t4

        vshl.i16 q8, q1, #4 @ t1|t2 = 16 * (src[8]|src[40])
        vswp d2, d3 @ q1 = src[40]|src[8]
        vshl.i16 q12, q3, #4 @ temp3a|temp4a = 16 * src[24]|src[56]
        vswp d6, d7 @ q3 = src[56]|src[24]
        vshl.i16 q13, q1, #2 @ temp3b|temp4b = 4 * (src[40]|src[8])
        vshl.i16 q2, q3, #2 @ temp1|temp2 = 4 * (src[56]|src[24])
        vswp d3, d6 @ q1 = src[40]|src[56], q3 = src[8]|src[24]
        vsub.i16 q9, q13, q12 @ t3|t4 = - (temp3a|temp4a) + (temp3b|temp4b)
        vadd.i16 q8, q8, q2 @ t1|t2 += temp1|temp2
        vmul.i16 q12, q3, d0[3] @ temp3|temp4 = 9 * src[8]|src[24]
        vmla.i16 q8, q1, d0[3] @ t1|t2 += 9 * (src[40]|src[56])
        vswp d6, d7 @ q3 = src[24]|src[8]
        vswp d2, d3 @ q1 = src[56]|src[40]

        vsub.i16 q11, q14, q15 @ t8|t7 = old t1|t2 - old t3|t4
        vadd.i16 q10, q14, q15 @ t5|t6 = old t1|t2 + old t3|t4
    .if \add1beforeshift
        vmov.i16 q15, #1
    .endif

        vadd.i16 d18, d18, d24 @ t3 += temp3
        vsub.i16 d19, d19, d25 @ t4 -= temp4

        vswp d22, d23 @ q11 = t7|t8

        vneg.s16 d17, d17 @ t2 = -t2
        vmla.i16 q9, q1, d0[2] @ t3|t4 += 15 * src[56]|src[40]
        vmla.i16 q8, q3, d0[2] @ t1|t2 += 15 * src[24]|src[8]

        @ At this point:
        @   t1  d16
        @   t2  d17
        @   t3  d18
        @   t4  d19
        @   t5  d20
        @   t6  d21
        @   t7  d22
        @   t8  d23
        @   #1  q15

    .if \add1beforeshift
        vadd.i16 q3, q15, q10 @ line[7,6] = t5|t6 + 1
        vadd.i16 q2, q15, q11 @ line[5,4] = t7|t8 + 1
    .endif

        @ Sometimes this overflows, so to get one additional bit of precision, use
        @ a single instruction that both adds and shifts right (halving).
        vhadd.s16 q1, q9, q11 @ line[2,3] = (t3|t4 + t7|t8) >> 1
        vhadd.s16 q0, q8, q10 @ line[0,1] = (t1|t2 + t5|t6) >> 1
    .if \add1beforeshift
        vhsub.s16 q2, q2, q9 @ line[5,4] = (t7|t8 - t3|t4 + 1) >> 1
        vhsub.s16 q3, q3, q8 @ line[7,6] = (t5|t6 - t1|t2 + 1) >> 1
    .else
        vhsub.s16 q2, q11, q9 @ line[5,4] = (t7|t8 - t3|t4) >> 1
        vhsub.s16 q3, q10, q8 @ line[7,6] = (t5|t6 - t1|t2) >> 1
    .endif

        vshr.s16 q9, q1, #(\rshift - 1) @ one shift is already done by vhadd/vhsub above
        vshr.s16 q8, q0, #(\rshift - 1)
        vshr.s16 q10, q2, #(\rshift - 1)
        vshr.s16 q11, q3, #(\rshift - 1)

        @ At this point:
        @   dst[0]  d16
        @   dst[1]  d17
        @   dst[2]  d18
        @   dst[3]  d19
        @   dst[4]  d21
        @   dst[5]  d20
        @   dst[6]  d23
        @   dst[7]  d22
.endm
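
@ For reference, the scalar arithmetic per column (a sketch derived from the
@ comments above; the "+ 1" terms are only present when add1beforeshift is set):
@   t1 = 12 * (src[0] + src[32]) + add    t3 = 16 * src[16] +  6 * src[48]
@   t2 = 12 * (src[0] - src[32]) + add    t4 =  6 * src[16] - 16 * src[48]
@   t5 = t1 + t3    t6 = t2 + t4    t7 = t2 - t4    t8 = t1 - t3
@   t1 = 16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
@   t2 = 15 * src[8] -  4 * src[24] - 16 * src[40] -  9 * src[56]
@   t3 =  9 * src[8] - 16 * src[24] +  4 * src[40] + 15 * src[56]
@   t4 =  4 * src[8] -  9 * src[24] + 15 * src[40] - 16 * src[56]
@   dst[0] = (t5 + t1) >> rshift          dst[7] = (t5 - t1 + 1) >> rshift
@   dst[1] = (t6 + t2) >> rshift          dst[6] = (t6 - t2 + 1) >> rshift
@   dst[2] = (t7 + t3) >> rshift          dst[5] = (t7 - t3 + 1) >> rshift
@   dst[3] = (t8 + t4) >> rshift          dst[4] = (t8 - t4 + 1) >> rshift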

@ This is modeled after the first and second for loops in vc1_inv_trans_8x8_c.
@ Input columns:  q8, q9, q10, q11, q12, q13, q14, q15
@ Output columns: q8, q9, q10, q11, q12, q13, q14, q15
@ Trashes all NEON registers (and r12) except for: q4 q5 q6 q7
.macro vc1_inv_trans_8x8_helper add add1beforeshift rshift
        @ This actually computes half of t1, t2, t3, t4, as explained below
        @ near `tNhalf`.
        vmov.i16 q0, #(6 / 2) @ q0 = #6/2
        vshl.i16 q1, q10, #3 @ t3 = 16/2 * src[16]
        vshl.i16 q3, q14, #3 @ temp4 = 16/2 * src[48]
        vmul.i16 q2, q10, q0 @ t4 = 6/2 * src[16]
        vmla.i16 q1, q14, q0 @ t3 += 6/2 * src[48]
        @ unused: q0, q10, q14
        vmov.i16 q0, #(12 / 2) @ q0 = #12/2
        vadd.i16 q10, q8, q12 @ temp1 = src[0] + src[32]
        vsub.i16 q14, q8, q12 @ temp2 = src[0] - src[32]
        @ unused: q8, q12
        vmov.i16 q8, #(\add / 2) @ t1 will accumulate here
        vmov.i16 q12, #(\add / 2) @ t2 will accumulate here
        movw r12, #15
        vsub.i16 q2, q2, q3 @ t4 = 6/2 * src[16] - 16/2 * src[48]
        movt r12, #9
        @ unused: q3
        vmla.i16 q8, q10, q0 @ t1 = 12/2 * temp1 + add
        vmla.i16 q12, q14, q0 @ t2 = 12/2 * temp2 + add
        vmov.i32 d0[0], r12
        @ unused: q3, q10, q14

        @ At this point:
        @   q0   d0=#15|#9
        @   q1   old t3
        @   q2   old t4
        @   q3
        @   q8   old t1
        @   q9   src[8]
        @   q10
        @   q11  src[24]
        @   q12  old t2
        @   q13  src[40]
        @   q14
        @   q15  src[56]

        @ unused: q3, q10, q14
        movw r12, #16
        vshl.i16 q3, q9, #4 @ t1 = 16 * src[8]
        movt r12, #4
        vshl.i16 q10, q9, #2 @ t4 = 4 * src[8]
        vmov.i32 d1[0], r12
        vmul.i16 q14, q9, d0[0] @ t2 = 15 * src[8]
        vmul.i16 q9, q9, d0[1] @ t3 = 9 * src[8]
        @ unused: none
        vmla.i16 q3, q11, d0[0] @ t1 += 15 * src[24]
        vmls.i16 q10, q11, d0[1] @ t4 -= 9 * src[24]
        vmls.i16 q14, q11, d1[1] @ t2 -= 4 * src[24]
        vmls.i16 q9, q11, d1[0] @ t3 -= 16 * src[24]
        @ unused: q11
        vmla.i16 q3, q13, d0[1] @ t1 += 9 * src[40]
        vmla.i16 q10, q13, d0[0] @ t4 += 15 * src[40]
        vmls.i16 q14, q13, d1[0] @ t2 -= 16 * src[40]
        vmla.i16 q9, q13, d1[1] @ t3 += 4 * src[40]
        @ unused: q11, q13

        @ Compute t5, t6, t7, t8 from old t1, t2, t3, t4. Actually, it computes
        @ half of t5, t6, t7, t8 since t1, t2, t3, t4 are halved.
        vadd.i16 q11, q8, q1 @ t5 = t1 + t3
        vsub.i16 q1, q8, q1 @ t8 = t1 - t3
        vadd.i16 q13, q12, q2 @ t6 = t2 + t4
        vsub.i16 q2, q12, q2 @ t7 = t2 - t4
        @ unused: q8, q12

    .if \add1beforeshift
        vmov.i16 q12, #1
    .endif

        @ unused: q8
        vmla.i16 q3, q15, d1[1] @ t1 += 4 * src[56]
        vmls.i16 q14, q15, d0[1] @ t2 -= 9 * src[56]
        vmla.i16 q9, q15, d0[0] @ t3 += 15 * src[56]
        vmls.i16 q10, q15, d1[0] @ t4 -= 16 * src[56]
        @ unused: q0, q8, q15

        @ At this point:
        @   t1      q3
        @   t2      q14
        @   t3      q9
        @   t4      q10
        @   t5half  q11
        @   t6half  q13
        @   t7half  q2
        @   t8half  q1
        @   #1      q12
        @
        @ tNhalf is half of the value of tN (as described in vc1_inv_trans_8x8_c).
        @ This is done because sometimes files have input that causes tN + tM to
        @ overflow. To avoid this overflow, we compute tNhalf, then compute
        @ tNhalf + tM (which doesn't overflow), and then we use vhadd to compute
        @ (tNhalf + (tNhalf + tM)) >> 1 which does not overflow because it is
        @ one instruction.

        @ For each pair of tN and tM, do:
        @   lineA = t5half + t1
        @   if add1beforeshift: t1 -= 1
        @   lineA = (t5half + lineA) >> 1
        @   lineB = t5half - t1
        @   lineB = (t5half + lineB) >> 1
        @   lineA >>= rshift - 1
        @   lineB >>= rshift - 1

        vadd.i16 q8, q11, q3 @ q8 = t5half + t1
    .if \add1beforeshift
        vsub.i16 q3, q3, q12 @ q3 = t1 - 1
    .endif

        vadd.i16 q0, q13, q14 @ q0 = t6half + t2
    .if \add1beforeshift
        vsub.i16 q14, q14, q12 @ q14 = t2 - 1
    .endif

        vadd.i16 q15, q2, q9 @ q15 = t7half + t3
    .if \add1beforeshift
        vsub.i16 q9, q9, q12 @ q9 = t3 - 1
    .endif
        @ unused: none

        vhadd.s16 q8, q11, q8 @ q8 = (t5half + t5half + t1) >> 1
        vsub.i16 q3, q11, q3 @ q3 = t5half - t1 + 1

        vhadd.s16 q0, q13, q0 @ q0 = (t6half + t6half + t2) >> 1
        vsub.i16 q14, q13, q14 @ q14 = t6half - t2 + 1

        vhadd.s16 q15, q2, q15 @ q15 = (t7half + t7half + t3) >> 1
        vsub.i16 q9, q2, q9 @ q9 = t7half - t3 + 1

        vhadd.s16 q3, q11, q3 @ q3 = (t5half + t5half - t1 + 1) >> 1
        @ unused: q11

        vadd.i16 q11, q1, q10 @ q11 = t8half + t4
    .if \add1beforeshift
        vsub.i16 q10, q10, q12 @ q10 = t4 - 1
    .endif
        @ unused: q12

        vhadd.s16 q14, q13, q14 @ q14 = (t6half + t6half - t2 + 1) >> 1
        @ unused: q12, q13
        vhadd.s16 q13, q2, q9 @ q9 = (t7half + t7half - t3 + 1) >> 1
        @ unused: q12, q2, q9

        vsub.i16 q10, q1, q10 @ q10 = t8half - t4 + 1
        vhadd.s16 q11, q1, q11 @ q11 = (t8half + t8half + t4) >> 1

        vshr.s16 q8, q8, #(\rshift - 1) @ q8 = line[0]
        vhadd.s16 q12, q1, q10 @ q12 = (t8half + t8half - t4 + 1) >> 1
        vshr.s16 q9, q0, #(\rshift - 1) @ q9 = line[1]
        vshr.s16 q10, q15, #(\rshift - 1) @ q10 = line[2]
        vshr.s16 q11, q11, #(\rshift - 1) @ q11 = line[3]
        vshr.s16 q12, q12, #(\rshift - 1) @ q12 = line[4]
        vshr.s16 q13, q13, #(\rshift - 1) @ q13 = line[5]
        vshr.s16 q14, q14, #(\rshift - 1) @ q14 = line[6]
        vshr.s16 q15, q3, #(\rshift - 1) @ q15 = line[7]
.endm

@ (int16_t *block [r0])
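@ In the C layer this corresponds to a vc1_inv_trans_8x8-style hook that takes
@ the 8x8 coefficient block and transforms it in place (a sketch of the flow
@ below): a first pass with add=4/rshift=3, a full 8x8 transpose, a second pass
@ with add=64/rshift=7, then the results are stored back over the input.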
function ff_vc1_inv_trans_8x8_neon, export=1
        vld1.64 {q8-q9}, [r0,:128]!
        vld1.64 {q10-q11}, [r0,:128]!
        vld1.64 {q12-q13}, [r0,:128]!
        vld1.64 {q14-q15}, [r0,:128]
        sub r0, r0, #(16 * 2 * 3) @ restore r0

        @ At this point:
        @   src[0]   q8
        @   src[8]   q9
        @   src[16]  q10
        @   src[24]  q11
        @   src[32]  q12
        @   src[40]  q13
        @   src[48]  q14
        @   src[56]  q15

        vc1_inv_trans_8x8_helper add=4 add1beforeshift=0 rshift=3

        @ Transpose the 8x8 result matrix
        swap4 d17, d19, d21, d23, d24, d26, d28, d30
        transpose16_4x4 q8, q9, q10, q11, q12, q13, q14, q15

        vc1_inv_trans_8x8_helper add=64 add1beforeshift=1 rshift=7

        vst1.64 {q8-q9}, [r0,:128]!
        vst1.64 {q10-q11}, [r0,:128]!
        vst1.64 {q12-q13}, [r0,:128]!
        vst1.64 {q14-q15}, [r0,:128]

        bx lr
endfunc

@ (uint8_t *dest [r0], int linesize [r1], int16_t *block [r2])
function ff_vc1_inv_trans_8x4_neon, export=1
        vld1.64 {q0-q1}, [r2,:128]! @ load 8 * 4 * 2 = 64 bytes / 16 bytes per quad = 4 quad registers
        vld1.64 {q2-q3}, [r2,:128]

        transpose16 q0 q1 q2 q3 @ transpose rows to columns

        @ At this point:
        @   src[0]  d0
        @   src[1]  d2
        @   src[2]  d4
        @   src[3]  d6
        @   src[4]  d1
        @   src[5]  d3
        @   src[6]  d5
        @   src[7]  d7

        vc1_inv_trans_8x4_helper add=4 add1beforeshift=0 rshift=3

        @ Move output to more standardized registers
        vmov d0, d16
        vmov d2, d17
        vmov d4, d18
        vmov d6, d19
        vmov d1, d21
        vmov d3, d20
        vmov d5, d23
        vmov d7, d22

        @ At this point:
        @   dst[0]  d0
        @   dst[1]  d2
        @   dst[2]  d4
        @   dst[3]  d6
        @   dst[4]  d1
        @   dst[5]  d3
        @   dst[6]  d5
        @   dst[7]  d7

        transpose16 q0 q1 q2 q3 @ turn columns into rows

        @ At this point:
        @   row[0]  q0
        @   row[1]  q1
        @   row[2]  q2
        @   row[3]  q3

        vc1_inv_trans_4x8_helper add=64 rshift=7

        @ At this point:
        @   line[0].l  d0
        @   line[0].h  d1
        @   line[1].l  d2
        @   line[1].h  d3
        @   line[2].l  d4
        @   line[2].h  d5
        @   line[3].l  d6
        @   line[3].h  d7

        @ unused registers: q12, q13, q14, q15

        vld1.64 {d28}, [r0,:64], r1 @ read dest
        vld1.64 {d29}, [r0,:64], r1
        vld1.64 {d30}, [r0,:64], r1
        vld1.64 {d31}, [r0,:64], r1
        sub r0, r0, r1, lsl #2 @ restore original r0 value

        vaddw.u8 q0, q0, d28 @ line[0] += dest[0]
        vaddw.u8 q1, q1, d29 @ line[1] += dest[1]
        vaddw.u8 q2, q2, d30 @ line[2] += dest[2]
        vaddw.u8 q3, q3, d31 @ line[3] += dest[3]

        vqmovun.s16 d0, q0 @ line[0]
        vqmovun.s16 d1, q1 @ line[1]
        vqmovun.s16 d2, q2 @ line[2]
        vqmovun.s16 d3, q3 @ line[3]

        vst1.64 {d0}, [r0,:64], r1 @ write dest
        vst1.64 {d1}, [r0,:64], r1
        vst1.64 {d2}, [r0,:64], r1
        vst1.64 {d3}, [r0,:64]

        bx lr
endfunc

@ (uint8_t *dest [r0], int linesize [r1], int16_t *block [r2])
function ff_vc1_inv_trans_4x8_neon, export=1
        mov r12, #(8 * 2) @ 8 elements per line, each element 2 bytes
        vld4.16 {d0[], d2[], d4[], d6[]}, [r2,:64], r12 @ read each column into a q register
        vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r2,:64], r12
        vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r2,:64], r12
        vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r2,:64], r12
        vld4.16 {d1[], d3[], d5[], d7[]}, [r2,:64], r12
        vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r2,:64], r12
        vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r2,:64], r12
        vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r2,:64]

        vc1_inv_trans_4x8_helper add=4 rshift=3

        @ At this point:
        @   dst[0] = q0
        @   dst[1] = q1
        @   dst[2] = q2
        @   dst[3] = q3

        transpose16 q0 q1 q2 q3 @ Transpose rows (registers) into columns

        vc1_inv_trans_8x4_helper add=64 add1beforeshift=1 rshift=7

        vld1.32 {d28[]}, [r0,:32], r1 @ read dest
        vld1.32 {d28[1]}, [r0,:32], r1
        vld1.32 {d29[]}, [r0,:32], r1
        vld1.32 {d29[1]}, [r0,:32], r1

        vld1.32 {d30[]}, [r0,:32], r1
        vld1.32 {d30[0]}, [r0,:32], r1
        vld1.32 {d31[]}, [r0,:32], r1
        vld1.32 {d31[0]}, [r0,:32], r1
        sub r0, r0, r1, lsl #3 @ restore original r0 value

        vaddw.u8 q8, q8, d28 @ line[0,1] += dest[0,1]
        vaddw.u8 q9, q9, d29 @ line[2,3] += dest[2,3]
        vaddw.u8 q10, q10, d30 @ line[5,4] += dest[5,4]
        vaddw.u8 q11, q11, d31 @ line[7,6] += dest[7,6]

        vqmovun.s16 d16, q8 @ clip(line[0,1])
        vqmovun.s16 d18, q9 @ clip(line[2,3])
        vqmovun.s16 d20, q10 @ clip(line[5,4])
        vqmovun.s16 d22, q11 @ clip(line[7,6])

        vst1.32 {d16[0]}, [r0,:32], r1 @ write dest
        vst1.32 {d16[1]}, [r0,:32], r1
        vst1.32 {d18[0]}, [r0,:32], r1
        vst1.32 {d18[1]}, [r0,:32], r1

        vst1.32 {d20[1]}, [r0,:32], r1
        vst1.32 {d20[0]}, [r0,:32], r1
        vst1.32 {d22[1]}, [r0,:32], r1
        vst1.32 {d22[0]}, [r0,:32]

        bx lr
endfunc

@ Set up constants in registers which are used by vc1_inv_trans_4x4_helper
.macro vc1_inv_trans_4x4_helper_setup
        vmov.i16 q13, #17
        vmov.i16 q14, #22
        vmov.i16 d30, #10 @ only need a double-word, not a quad-word
.endm

@ This is modeled after the first for loop in vc1_inv_trans_4x4_c.
.macro vc1_inv_trans_4x4_helper add rshift
        vmov.i16 q2, #\add @ t1|t2 will accumulate here

        vadd.i16 d16, d0, d1 @ temp1 = src[0] + src[2]
        vsub.i16 d17, d0, d1 @ temp2 = src[0] - src[2]
        vmul.i16 q3, q14, q1 @ t3|t4 = 22 * (src[1]|src[3])
        vmla.i16 q2, q13, q8 @ t1|t2 = 17 * (temp1|temp2) + add
        vmla.i16 d6, d30, d3 @ t3 += 10 * src[3]
        vmls.i16 d7, d30, d2 @ t4 -= 10 * src[1]

        vadd.i16 q0, q2, q3 @ dst[0,2] = (t1|t2 + t3|t4)
        vsub.i16 q1, q2, q3 @ dst[3,1] = (t1|t2 - t3|t4)
        vshr.s16 q0, q0, #\rshift @ dst[0,2] >>= rshift
        vshr.s16 q1, q1, #\rshift @ dst[3,1] >>= rshift
.endm

@ (uint8_t *dest [r0], int linesize [r1], int16_t *block [r2])
function ff_vc1_inv_trans_4x4_neon, export=1
        mov r12, #(8 * 2) @ 8 elements per line, each element 2 bytes
        vld4.16 {d0[], d1[], d2[], d3[]}, [r2,:64], r12 @ read each column into a register
        vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r2,:64], r12
        vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r2,:64], r12
        vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r2,:64]

        vswp d1, d2 @ so that we can later access column 1 and column 3 as a single q1 register

        vc1_inv_trans_4x4_helper_setup

        @ At this point:
        @   src[0] = d0
        @   src[1] = d2
        @   src[2] = d1
        @   src[3] = d3

        vc1_inv_trans_4x4_helper add=4 rshift=3 @ compute t1, t2, t3, t4 and combine them into dst[0-3]

        @ At this point:
        @   dst[0] = d0
        @   dst[1] = d3
        @   dst[2] = d1
        @   dst[3] = d2

        transpose16 d0 d3 d1 d2 @ Transpose rows (registers) into columns

        @ At this point:
        @   src[0]  = d0
        @   src[8]  = d3
        @   src[16] = d1
        @   src[24] = d2

        vswp d2, d3 @ so that we can later access column 1 and column 3 in order as a single q1 register

        @ At this point:
        @   src[0]  = d0
        @   src[8]  = d2
        @   src[16] = d1
        @   src[24] = d3

        vc1_inv_trans_4x4_helper add=64 rshift=7 @ compute t1, t2, t3, t4 and combine them into dst[0-3]

        @ At this point:
        @   line[0] = d0
        @   line[1] = d3
        @   line[2] = d1
        @   line[3] = d2

        vld1.32 {d18[]}, [r0,:32], r1 @ read dest
        vld1.32 {d19[]}, [r0,:32], r1
        vld1.32 {d18[1]}, [r0,:32], r1
        vld1.32 {d19[0]}, [r0,:32], r1
        sub r0, r0, r1, lsl #2 @ restore original r0 value

        vaddw.u8 q0, q0, d18 @ line[0,2] += dest[0,2]
        vaddw.u8 q1, q1, d19 @ line[3,1] += dest[3,1]

        vqmovun.s16 d0, q0 @ clip(line[0,2])
        vqmovun.s16 d1, q1 @ clip(line[3,1])

        vst1.32 {d0[0]}, [r0,:32], r1 @ write dest
        vst1.32 {d1[1]}, [r0,:32], r1
        vst1.32 {d0[1]}, [r0,:32], r1
        vst1.32 {d1[0]}, [r0,:32]

        bx lr
endfunc

#if HAVE_AS_DN_DIRECTIVE
@ The absolute values of the multiplication constants from vc1_mspel_filter and
@ vc1_mspel_{ver,hor}_filter_16bits. The signs are embedded in the code below
@ that carries out the multiplication (mspel_filter{,.16}).
#define MSPEL_MODE_1_MUL_CONSTANTS 4 53 18 3
#define MSPEL_MODE_2_MUL_CONSTANTS 1 9 9 1
#define MSPEL_MODE_3_MUL_CONSTANTS 3 18 53 4

@ These constants come from reading the source code of vc1_mspel_mc and determining
@ the value that is added to `rnd` to produce the variable `r`, and the value of
@ the variable `shift`.
#define MSPEL_MODES_11_ADDSHIFT_CONSTANTS 15 5
#define MSPEL_MODES_12_ADDSHIFT_CONSTANTS 3 3
#define MSPEL_MODES_13_ADDSHIFT_CONSTANTS 15 5
#define MSPEL_MODES_21_ADDSHIFT_CONSTANTS MSPEL_MODES_12_ADDSHIFT_CONSTANTS
#define MSPEL_MODES_22_ADDSHIFT_CONSTANTS 0 1
#define MSPEL_MODES_23_ADDSHIFT_CONSTANTS 3 3
#define MSPEL_MODES_31_ADDSHIFT_CONSTANTS MSPEL_MODES_13_ADDSHIFT_CONSTANTS
#define MSPEL_MODES_32_ADDSHIFT_CONSTANTS MSPEL_MODES_23_ADDSHIFT_CONSTANTS
#define MSPEL_MODES_33_ADDSHIFT_CONSTANTS 15 5

@ The addition and shift constants from vc1_mspel_filter.
#define MSPEL_MODE_1_ADDSHIFT_CONSTANTS 32 6
#define MSPEL_MODE_2_ADDSHIFT_CONSTANTS 8 4
#define MSPEL_MODE_3_ADDSHIFT_CONSTANTS 32 6

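@ For reference, a sketch of the scalar filter these constants describe (with
@ r derived from rnd as encoded in the ADDSHIFT constants above, and s[]
@ indexed along the filtering direction):
@   mode 1:  (-4*s[-1] + 53*s[0] + 18*s[1] - 3*s[2] + 32 - r) >> 6
@   mode 2:  (-1*s[-1] +  9*s[0] +  9*s[1] - 1*s[2] +  8 - r) >> 4
@   mode 3:  (-3*s[-1] + 18*s[0] + 53*s[1] - 4*s[2] + 32 - r) >> 6
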
@ Set up constants in registers for a subsequent use of mspel_filter{,.16}.
.macro mspel_constants typesize reg_a reg_b reg_c reg_d filter_a filter_b filter_c filter_d reg_add filter_add_register
        @ Define double-word register aliases. Typesize should be i8 or i16.
        ra .dn \reg_a\().\typesize
        rb .dn \reg_b\().\typesize
        rc .dn \reg_c\().\typesize
        rd .dn \reg_d\().\typesize

        @ Only set a register if its value is not 1 and not a duplicate of another
    .if \filter_a != 1
        vmov ra, #\filter_a @ ra = filter_a
    .endif
        vmov rb, #\filter_b @ rb = filter_b
    .if \filter_b != \filter_c
        vmov rc, #\filter_c @ rc = filter_c
    .endif
    .if \filter_d != 1
        vmov rd, #\filter_d @ rd = filter_d
    .endif
        @ vdup to double the size of typesize
    .ifc \typesize,i8
        vdup.16 \reg_add, \filter_add_register @ reg_add = filter_add_register
    .else
        vdup.32 \reg_add, \filter_add_register @ reg_add = filter_add_register
    .endif

        .unreq ra
        .unreq rb
        .unreq rc
        .unreq rd
.endm

@ After mspel_constants has been used, do the filtering.
.macro mspel_filter acc dest src0 src1 src2 src3 filter_a filter_b filter_c filter_d reg_a reg_b reg_c reg_d reg_add filter_shift narrow=1
    .if \filter_a != 1
        @ If filter_a != 1, then we need a move and subtract instruction
        vmov \acc, \reg_add @ acc = reg_add
        vmlsl.u8 \acc, \reg_a, \src0 @ acc -= filter_a * src[-stride]
    .else
        @ If filter_a is 1, then just subtract without an extra move
        vsubw.u8 \acc, \reg_add, \src0 @ acc = reg_add - src[-stride] @ since filter_a == 1
    .endif
        vmlal.u8 \acc, \reg_b, \src1 @ acc += filter_b * src[0]
    .if \filter_b != \filter_c
        vmlal.u8 \acc, \reg_c, \src2 @ acc += filter_c * src[stride]
    .else
        @ If filter_b is the same as filter_c, use the same reg_b register
        vmlal.u8 \acc, \reg_b, \src2 @ acc += filter_c * src[stride] @ where filter_c == filter_b
    .endif
    .if \filter_d != 1
        @ If filter_d != 1, then do a multiply accumulate
        vmlsl.u8 \acc, \reg_d, \src3 @ acc -= filter_d * src[stride * 2]
    .else
        @ If filter_d is 1, then just do a subtract
        vsubw.u8 \acc, \acc, \src3 @ acc -= src[stride * 2] @ since filter_d == 1
    .endif
    .if \narrow
        vqshrun.s16 \dest, \acc, #\filter_shift @ dest = clip_uint8(acc >> filter_shift)
    .else
        vshr.s16 \dest, \acc, #\filter_shift @ dest = acc >> filter_shift
    .endif
.endm
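
@ Illustrative expansion (a sketch): for mode 2, i.e. filter constants 1 9 9 1,
@ the macro reduces to
@   vsubw.u8  acc, reg_add, src0          @ acc = reg_add - src[-stride]
@   vmlal.u8  acc, reg_b, src1            @ acc += 9 * src[0]
@   vmlal.u8  acc, reg_b, src2            @ acc += 9 * src[stride]
@   vsubw.u8  acc, acc, src3              @ acc -= src[stride * 2]
@   vqshrun.s16 dest, acc, #filter_shift  @ dest = clip_uint8(acc >> filter_shift)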

@ This is similar to mspel_filter, but the input is 16-bit instead of 8-bit and narrow=0 is not supported.
.macro mspel_filter.16 acc0 acc1 acc0_0 acc0_1 dest src0 src1 src2 src3 src4 src5 src6 src7 filter_a filter_b filter_c filter_d reg_a reg_b reg_c reg_d reg_add filter_shift
    .if \filter_a != 1
        vmov \acc0, \reg_add
        vmov \acc1, \reg_add
        vmlsl.s16 \acc0, \reg_a, \src0
        vmlsl.s16 \acc1, \reg_a, \src1
    .else
        vsubw.s16 \acc0, \reg_add, \src0
        vsubw.s16 \acc1, \reg_add, \src1
    .endif
        vmlal.s16 \acc0, \reg_b, \src2
        vmlal.s16 \acc1, \reg_b, \src3
    .if \filter_b != \filter_c
        vmlal.s16 \acc0, \reg_c, \src4
        vmlal.s16 \acc1, \reg_c, \src5
    .else
        vmlal.s16 \acc0, \reg_b, \src4
        vmlal.s16 \acc1, \reg_b, \src5
    .endif
    .if \filter_d != 1
        vmlsl.s16 \acc0, \reg_d, \src6
        vmlsl.s16 \acc1, \reg_d, \src7
    .else
        vsubw.s16 \acc0, \acc0, \src6
        vsubw.s16 \acc1, \acc1, \src7
    .endif
        @ Use acc0_0 and acc0_1 as temp space
        vqshrun.s32 \acc0_0, \acc0, #\filter_shift @ Shift and narrow with saturation from s32 to u16
        vqshrun.s32 \acc0_1, \acc1, #\filter_shift
        vqmovn.u16 \dest, \acc0 @ Narrow with saturation from u16 to u8
.endm

@ Register usage for the put_vc1_mspel_mc functions. Registers marked 'hv' are only used in put_vc1_mspel_mc_hv.
@
@   r0        adjusted dst
@   r1        adjusted src
@   r2        stride
@   r3        adjusted rnd
@   r4   [hv] tmp
@   r11  [hv] sp saved
@   r12       loop counter
@   d0        src[-stride]
@   d1        src[0]
@   d2        src[stride]
@   d3        src[stride * 2]
@   q0   [hv] src[-stride]
@   q1   [hv] src[0]
@   q2   [hv] src[stride]
@   q3   [hv] src[stride * 2]
@   d21       often the result from mspel_filter
@   q11       accumulator 0
@   q12  [hv] accumulator 1
@   q13       accumulator initial value
@   d28       filter_a
@   d29       filter_b
@   d30       filter_c
@   d31       filter_d

@ (uint8_t *dst [r0], const uint8_t *src [r1], ptrdiff_t stride [r2], int rnd [r3])
.macro put_vc1_mspel_mc_hv hmode vmode filter_h_a filter_h_b filter_h_c filter_h_d filter_v_a filter_v_b filter_v_c filter_v_d filter_add filter_shift
function ff_put_vc1_mspel_mc\hmode\()\vmode\()_neon, export=1
        push {r4, r11, lr}
        mov r11, sp @ r11 = stack pointer before realignment
A       bic sp, sp, #15 @ sp = round down to multiple of 16 bytes
T       bic r4, r11, #15
T       mov sp, r4
        sub sp, sp, #(8*2*16) @ make space for 8 rows * 2 bytes per element * 16 elements per row (to fit 11 actual elements per row)
        mov r4, sp @ r4 = int16_t tmp[8 * 16]

        sub r1, r1, #1 @ src -= 1
    .if \filter_add != 0
        add r3, r3, #\filter_add @ r3 = filter_add + rnd
    .endif
        mov r12, #8 @ loop counter
        sub r1, r1, r2 @ r1 = &src[-stride] @ slide back

        @ Do vertical filtering from src into tmp
        mspel_constants i8 d28 d29 d30 d31 \filter_v_a \filter_v_b \filter_v_c \filter_v_d q13 r3

        vld1.64 {d0,d1}, [r1], r2
        vld1.64 {d2,d3}, [r1], r2
        vld1.64 {d4,d5}, [r1], r2

1:
        subs r12, r12, #4

        vld1.64 {d6,d7}, [r1], r2
        mspel_filter q11 q11 d0 d2 d4 d6 \filter_v_a \filter_v_b \filter_v_c \filter_v_d d28 d29 d30 d31 q13 \filter_shift narrow=0
        mspel_filter q12 q12 d1 d3 d5 d7 \filter_v_a \filter_v_b \filter_v_c \filter_v_d d28 d29 d30 d31 q13 \filter_shift narrow=0
        vst1.64 {q11,q12}, [r4,:128]! @ store and increment

        vld1.64 {d0,d1}, [r1], r2
        mspel_filter q11 q11 d2 d4 d6 d0 \filter_v_a \filter_v_b \filter_v_c \filter_v_d d28 d29 d30 d31 q13 \filter_shift narrow=0
        mspel_filter q12 q12 d3 d5 d7 d1 \filter_v_a \filter_v_b \filter_v_c \filter_v_d d28 d29 d30 d31 q13 \filter_shift narrow=0
        vst1.64 {q11,q12}, [r4,:128]! @ store and increment

        vld1.64 {d2,d3}, [r1], r2
        mspel_filter q11 q11 d4 d6 d0 d2 \filter_v_a \filter_v_b \filter_v_c \filter_v_d d28 d29 d30 d31 q13 \filter_shift narrow=0
        mspel_filter q12 q12 d5 d7 d1 d3 \filter_v_a \filter_v_b \filter_v_c \filter_v_d d28 d29 d30 d31 q13 \filter_shift narrow=0
        vst1.64 {q11,q12}, [r4,:128]! @ store and increment

        vld1.64 {d4,d5}, [r1], r2
        mspel_filter q11 q11 d6 d0 d2 d4 \filter_v_a \filter_v_b \filter_v_c \filter_v_d d28 d29 d30 d31 q13 \filter_shift narrow=0
        mspel_filter q12 q12 d7 d1 d3 d5 \filter_v_a \filter_v_b \filter_v_c \filter_v_d d28 d29 d30 d31 q13 \filter_shift narrow=0
        vst1.64 {q11,q12}, [r4,:128]! @ store and increment

        bne 1b

        rsb r3, r3, #(64 + \filter_add) @ r3 = (64 + filter_add) - r3
        mov r12, #8 @ loop counter
        mov r4, sp @ r4 = tmp

        @ Do horizontal filtering from tmp to dst
        mspel_constants i16 d28 d29 d30 d31 \filter_h_a \filter_h_b \filter_h_c \filter_h_d q13 r3

2:
        subs r12, r12, #1

        vld1.64 {q0,q1}, [r4,:128]! @ read one line of tmp
        vext.16 q2, q0, q1, #2
        vext.16 q3, q0, q1, #3
        vext.16 q1, q0, q1, #1 @ do this last because it writes to q1, which is read by the other vext instructions

        mspel_filter.16 q11 q12 d22 d23 d21 d0 d1 d2 d3 d4 d5 d6 d7 \filter_h_a \filter_h_b \filter_h_c \filter_h_d d28 d29 d30 d31 q13 7

        vst1.64 {d21}, [r0,:64], r2 @ store and increment dst

        bne 2b

        mov sp, r11
        pop {r4, r11, pc}
endfunc
.endm

@ Use C preprocessor and assembler macros to expand to functions for combined horizontal and vertical filtering.
#define PUT_VC1_MSPEL_MC_HV(hmode, vmode) \
    put_vc1_mspel_mc_hv hmode vmode \
        MSPEL_MODE_ ## hmode ## _MUL_CONSTANTS \
        MSPEL_MODE_ ## vmode ## _MUL_CONSTANTS \
        MSPEL_MODES_ ## hmode ## vmode ## _ADDSHIFT_CONSTANTS

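@ For example (illustrative), PUT_VC1_MSPEL_MC_HV(1, 2) expands to
@   put_vc1_mspel_mc_hv 1 2  4 53 18 3  1 9 9 1  3 3
@ which assembles ff_put_vc1_mspel_mc12_neon with mode-1 horizontal taps,
@ mode-2 vertical taps, filter_add=3 and filter_shift=3.
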
PUT_VC1_MSPEL_MC_HV(1, 1)
PUT_VC1_MSPEL_MC_HV(1, 2)
PUT_VC1_MSPEL_MC_HV(1, 3)
PUT_VC1_MSPEL_MC_HV(2, 1)
PUT_VC1_MSPEL_MC_HV(2, 2)
PUT_VC1_MSPEL_MC_HV(2, 3)
PUT_VC1_MSPEL_MC_HV(3, 1)
PUT_VC1_MSPEL_MC_HV(3, 2)
PUT_VC1_MSPEL_MC_HV(3, 3)

#undef PUT_VC1_MSPEL_MC_HV

.macro put_vc1_mspel_mc_h_only hmode filter_a filter_b filter_c filter_d filter_add filter_shift
function ff_put_vc1_mspel_mc\hmode\()0_neon, export=1
        rsb r3, r3, #\filter_add @ r3 = filter_add - r = filter_add - rnd
        mov r12, #8 @ loop counter
        sub r1, r1, #1 @ slide back, using immediate

        mspel_constants i8 d28 d29 d30 d31 \filter_a \filter_b \filter_c \filter_d q13 r3

1:
        subs r12, r12, #1

        vld1.64 {d0,d1}, [r1], r2 @ read 16 bytes even though we only need 11, also src += stride
        vext.8 d2, d0, d1, #2
        vext.8 d3, d0, d1, #3
        vext.8 d1, d0, d1, #1 @ do this last because it writes to d1, which is read by the other vext instructions

        mspel_filter q11 d21 d0 d1 d2 d3 \filter_a \filter_b \filter_c \filter_d d28 d29 d30 d31 q13 \filter_shift

        vst1.64 {d21}, [r0,:64], r2 @ store and increment dst

        bne 1b

        bx lr
endfunc
.endm

@ Use C preprocessor and assembler macros to expand to functions for horizontal-only filtering.
#define PUT_VC1_MSPEL_MC_H_ONLY(hmode) \
    put_vc1_mspel_mc_h_only hmode MSPEL_MODE_ ## hmode ## _MUL_CONSTANTS MSPEL_MODE_ ## hmode ## _ADDSHIFT_CONSTANTS

PUT_VC1_MSPEL_MC_H_ONLY(1)
PUT_VC1_MSPEL_MC_H_ONLY(2)
PUT_VC1_MSPEL_MC_H_ONLY(3)

#undef PUT_VC1_MSPEL_MC_H_ONLY

@ (uint8_t *dst [r0], const uint8_t *src [r1], ptrdiff_t stride [r2], int rnd [r3])
.macro put_vc1_mspel_mc_v_only vmode filter_a filter_b filter_c filter_d filter_add filter_shift
function ff_put_vc1_mspel_mc0\vmode\()_neon, export=1
        add r3, r3, #\filter_add - 1 @ r3 = filter_add - r = filter_add - (1 - rnd) = filter_add - 1 + rnd
        mov r12, #8 @ loop counter
        sub r1, r1, r2 @ r1 = &src[-stride] @ slide back

        mspel_constants i8 d28 d29 d30 d31 \filter_a \filter_b \filter_c \filter_d q13 r3

        vld1.64 {d0}, [r1], r2 @ d0 = src[-stride]
        vld1.64 {d1}, [r1], r2 @ d1 = src[0]
        vld1.64 {d2}, [r1], r2 @ d2 = src[stride]

1:
        subs r12, r12, #4

        vld1.64 {d3}, [r1], r2 @ d3 = src[stride * 2]
        mspel_filter q11 d21 d0 d1 d2 d3 \filter_a \filter_b \filter_c \filter_d d28 d29 d30 d31 q13 \filter_shift
        vst1.64 {d21}, [r0,:64], r2 @ store and increment dst

        vld1.64 {d0}, [r1], r2 @ d0 = next line
        mspel_filter q11 d21 d1 d2 d3 d0 \filter_a \filter_b \filter_c \filter_d d28 d29 d30 d31 q13 \filter_shift
        vst1.64 {d21}, [r0,:64], r2 @ store and increment dst

        vld1.64 {d1}, [r1], r2 @ d1 = next line
        mspel_filter q11 d21 d2 d3 d0 d1 \filter_a \filter_b \filter_c \filter_d d28 d29 d30 d31 q13 \filter_shift
        vst1.64 {d21}, [r0,:64], r2 @ store and increment dst

        vld1.64 {d2}, [r1], r2 @ d2 = next line
        mspel_filter q11 d21 d3 d0 d1 d2 \filter_a \filter_b \filter_c \filter_d d28 d29 d30 d31 q13 \filter_shift
        vst1.64 {d21}, [r0,:64], r2 @ store and increment dst

        bne 1b

        bx lr
endfunc
.endm

@ Use C preprocessor and assembler macros to expand to functions for vertical-only filtering.
#define PUT_VC1_MSPEL_MC_V_ONLY(vmode) \
    put_vc1_mspel_mc_v_only vmode MSPEL_MODE_ ## vmode ## _MUL_CONSTANTS MSPEL_MODE_ ## vmode ## _ADDSHIFT_CONSTANTS

PUT_VC1_MSPEL_MC_V_ONLY(1)
PUT_VC1_MSPEL_MC_V_ONLY(2)
PUT_VC1_MSPEL_MC_V_ONLY(3)

#undef PUT_VC1_MSPEL_MC_V_ONLY
#endif

function ff_put_pixels8x8_neon, export=1
        vld1.64 {d0}, [r1], r2
        vld1.64 {d1}, [r1], r2
        vld1.64 {d2}, [r1], r2
        vld1.64 {d3}, [r1], r2
        vld1.64 {d4}, [r1], r2
        vld1.64 {d5}, [r1], r2
        vld1.64 {d6}, [r1], r2
        vld1.64 {d7}, [r1]
        vst1.64 {d0}, [r0,:64], r2
        vst1.64 {d1}, [r0,:64], r2
        vst1.64 {d2}, [r0,:64], r2
        vst1.64 {d3}, [r0,:64], r2
        vst1.64 {d4}, [r0,:64], r2
        vst1.64 {d5}, [r0,:64], r2
        vst1.64 {d6}, [r0,:64], r2
        vst1.64 {d7}, [r0,:64]
        bx lr
endfunc

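@ The _dc functions below handle blocks whose only nonzero coefficient is the
@ DC term. Illustrative scalar equivalent for the 8x8 case (a sketch that
@ restates the inline comments): dc = (3 * block[0] + 1) >> 1; dc = (3 * dc + 16) >> 5;
@ then every one of the 64 destination pixels becomes clip_uint8(dest[x] + dc).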
function ff_vc1_inv_trans_8x8_dc_neon, export=1
        ldrsh r2, [r2] @ int dc = block[0];

        vld1.64 {d0}, [r0,:64], r1
        vld1.64 {d1}, [r0,:64], r1
        vld1.64 {d4}, [r0,:64], r1
        vld1.64 {d5}, [r0,:64], r1

        add r2, r2, r2, lsl #1 @ dc = (3 * dc + 1) >> 1;
        vld1.64 {d6}, [r0,:64], r1
        add r2, r2, #1
        vld1.64 {d7}, [r0,:64], r1
        vld1.64 {d16}, [r0,:64], r1
        vld1.64 {d17}, [r0,:64], r1
        asr r2, r2, #1

        sub r0, r0, r1, lsl #3 @ restore r0 to original value

        add r2, r2, r2, lsl #1 @ dc = (3 * dc + 16) >> 5;
        add r2, r2, #16
        asr r2, r2, #5

        vdup.16 q1, r2 @ dc

        vaddw.u8 q9, q1, d0
        vaddw.u8 q10, q1, d1
        vaddw.u8 q11, q1, d4
        vaddw.u8 q12, q1, d5
        vqmovun.s16 d0, q9
        vqmovun.s16 d1, q10
        vqmovun.s16 d4, q11
        vst1.64 {d0}, [r0,:64], r1
        vqmovun.s16 d5, q12
        vst1.64 {d1}, [r0,:64], r1
        vaddw.u8 q13, q1, d6
        vst1.64 {d4}, [r0,:64], r1
        vaddw.u8 q14, q1, d7
        vst1.64 {d5}, [r0,:64], r1
        vaddw.u8 q15, q1, d16
        vaddw.u8 q1, q1, d17 @ this destroys q1
        vqmovun.s16 d6, q13
        vqmovun.s16 d7, q14
        vqmovun.s16 d16, q15
        vqmovun.s16 d17, q1
        vst1.64 {d6}, [r0,:64], r1
        vst1.64 {d7}, [r0,:64], r1
        vst1.64 {d16}, [r0,:64], r1
        vst1.64 {d17}, [r0,:64]
        bx lr
endfunc

function ff_vc1_inv_trans_8x4_dc_neon, export=1
        ldrsh r2, [r2] @ int dc = block[0];

        vld1.64 {d0}, [r0,:64], r1
        vld1.64 {d1}, [r0,:64], r1
        vld1.64 {d4}, [r0,:64], r1
        vld1.64 {d5}, [r0,:64], r1

        add r2, r2, r2, lsl #1 @ dc = (3 * dc + 1) >> 1;

        sub r0, r0, r1, lsl #2 @ restore r0 to original value

        add r2, r2, #1
        asr r2, r2, #1

        add r2, r2, r2, lsl #4 @ dc = (17 * dc + 64) >> 7;
        add r2, r2, #64
        asr r2, r2, #7

        vdup.16 q1, r2 @ dc

        vaddw.u8 q3, q1, d0
        vaddw.u8 q8, q1, d1
        vaddw.u8 q9, q1, d4
        vaddw.u8 q10, q1, d5
        vqmovun.s16 d0, q3
        vqmovun.s16 d1, q8
        vqmovun.s16 d4, q9
        vst1.64 {d0}, [r0,:64], r1
        vqmovun.s16 d5, q10
        vst1.64 {d1}, [r0,:64], r1
        vst1.64 {d4}, [r0,:64], r1
        vst1.64 {d5}, [r0,:64]
        bx lr
endfunc

function ff_vc1_inv_trans_4x8_dc_neon, export=1
        ldrsh r2, [r2] @ int dc = block[0];

        vld1.32 {d0[]}, [r0,:32], r1
        vld1.32 {d1[]}, [r0,:32], r1
        vld1.32 {d0[1]}, [r0,:32], r1
        vld1.32 {d1[1]}, [r0,:32], r1

        add r2, r2, r2, lsl #4 @ dc = (17 * dc + 4) >> 3;
        vld1.32 {d4[]}, [r0,:32], r1
        add r2, r2, #4
        vld1.32 {d5[]}, [r0,:32], r1
        vld1.32 {d4[1]}, [r0,:32], r1
        asr r2, r2, #3
        vld1.32 {d5[1]}, [r0,:32], r1

        add r2, r2, r2, lsl #1 @ dc = (12 * dc + 64) >> 7;

        sub r0, r0, r1, lsl #3 @ restore r0 to original value

        lsl r2, r2, #2
        add r2, r2, #64
        asr r2, r2, #7

        vdup.16 q1, r2 @ dc

        vaddw.u8 q3, q1, d0
        vaddw.u8 q8, q1, d1
        vaddw.u8 q9, q1, d4
        vaddw.u8 q10, q1, d5
        vqmovun.s16 d0, q3
        vst1.32 {d0[0]}, [r0,:32], r1
        vqmovun.s16 d1, q8
        vst1.32 {d1[0]}, [r0,:32], r1
        vqmovun.s16 d4, q9
        vst1.32 {d0[1]}, [r0,:32], r1
        vqmovun.s16 d5, q10
        vst1.32 {d1[1]}, [r0,:32], r1
        vst1.32 {d4[0]}, [r0,:32], r1
        vst1.32 {d5[0]}, [r0,:32], r1
        vst1.32 {d4[1]}, [r0,:32], r1
        vst1.32 {d5[1]}, [r0,:32]
        bx lr
endfunc

function ff_vc1_inv_trans_4x4_dc_neon, export=1
        ldrsh r2, [r2] @ int dc = block[0];

        vld1.32 {d0[]}, [r0,:32], r1
        vld1.32 {d1[]}, [r0,:32], r1
        vld1.32 {d0[1]}, [r0,:32], r1
        vld1.32 {d1[1]}, [r0,:32], r1

        add r2, r2, r2, lsl #4 @ dc = (17 * dc + 4) >> 3;

        sub r0, r0, r1, lsl #2 @ restore r0 to original value

        add r2, r2, #4
        asr r2, r2, #3

        add r2, r2, r2, lsl #4 @ dc = (17 * dc + 64) >> 7;
        add r2, r2, #64
        asr r2, r2, #7

        vdup.16 q1, r2 @ dc

        vaddw.u8 q2, q1, d0
        vaddw.u8 q3, q1, d1
        vqmovun.s16 d0, q2
        vst1.32 {d0[0]}, [r0,:32], r1
        vqmovun.s16 d1, q3
        vst1.32 {d1[0]}, [r0,:32], r1
        vst1.32 {d0[1]}, [r0,:32], r1
        vst1.32 {d1[1]}, [r0,:32]
        bx lr
endfunc