Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libavutil / aarch64 / float_dsp_neon.S
CommitLineData
2ba45a60
DM
1/*
2 * ARM NEON optimised Float DSP functions
3 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
4 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23#include "config.h"
24#include "asm.S"
25
/*
 * ff_vector_fmul_neon: element-wise product of two single-precision arrays.
 *
 * Register use, as read from the code below:
 *   x0  store pointer for the products, advanced 64 bytes per pass
 *   x1  first load pointer,  advanced 64 bytes per pass
 *   x2  second load pointer, advanced 64 bytes per pass
 *   w3  element count, reduced by 16 per pass
 * Clobbers v0-v7, v16-v19 and the condition flags.
 *
 * Every pass handles exactly 16 floats, so the count is presumably required
 * to be a positive multiple of 16 -- TODO confirm against the C prototype.
 * The loop condition is a signed "greater than": a count that never lands
 * exactly on zero still terminates (after rounding up to a whole pass)
 * instead of wrapping the counter.
 */
function ff_vector_fmul_neon, export=1
1:      subs            w3,  w3,  #16                   // flags consumed by b.gt below
        ld1             {v0.4s,  v1.4s},  [x1], #32
        ld1             {v2.4s,  v3.4s},  [x1], #32
        ld1             {v4.4s,  v5.4s},  [x2], #32
        ld1             {v6.4s,  v7.4s},  [x2], #32
        fmul            v16.4s, v0.4s,  v4.4s
        fmul            v17.4s, v1.4s,  v5.4s
        fmul            v18.4s, v2.4s,  v6.4s
        fmul            v19.4s, v3.4s,  v7.4s
        st1             {v16.4s, v17.4s}, [x0], #32
        st1             {v18.4s, v19.4s}, [x0], #32
        b.gt            1b                              // loads, fmul and stores leave the flags alone
        ret
endfunc
41
/*
 * ff_vector_fmac_scalar_neon: multiply-accumulate a single-precision array,
 * scaled by one scalar, into another array in place.
 *
 * Register use, as read from the code below:
 *   x0     pointer to the accumulator array (loaded, updated, stored back)
 *   x1     pointer to the array being scaled, advanced 64 bytes per pass
 *   v0[0]  scalar multiplier (lane 0 of v0, single precision)
 *   w2     element count, reduced by 16 per pass
 * Clobbers v4-v7, v16-v19 and the condition flags.
 *
 * Every pass handles exactly 16 floats, so the count is presumably required
 * to be a positive multiple of 16 -- TODO confirm against the C prototype.
 * The loop condition is a signed "greater than": a count that never lands
 * exactly on zero still terminates (after rounding up to a whole pass)
 * instead of wrapping the counter.
 */
function ff_vector_fmac_scalar_neon, export=1
1:      subs            w2,  w2,  #16                   // flags consumed by b.gt below
        // Fetch 16 accumulators without writeback: x0 must still point at
        // them when the results are stored.
        ld1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x0]
        ld1             {v4.4s,  v5.4s},  [x1], #32
        ld1             {v6.4s,  v7.4s},  [x1], #32
        fmla            v16.4s, v4.4s,  v0.s[0]         // acc += src * scalar
        fmla            v17.4s, v5.4s,  v0.s[0]
        fmla            v18.4s, v6.4s,  v0.s[0]
        fmla            v19.4s, v7.4s,  v0.s[0]
        st1             {v16.4s, v17.4s}, [x0], #32
        st1             {v18.4s, v19.4s}, [x0], #32
        b.gt            1b
        ret
endfunc
58
/*
 * ff_vector_fmul_scalar_neon: scale a single-precision array by one scalar.
 *
 * Register use, as read from the code below:
 *   x0     store pointer for the scaled values
 *   x1     load pointer
 *   v0[0]  scalar multiplier on entry (v0 is reused as data afterwards)
 *   w2     element count
 *   w3     number of elements covered by whole 16-float passes
 *
 * Layout: a software-pipelined main loop handles 16 floats per pass; any
 * remainder is handled 4 floats at a time at label 3.  The remainder loop
 * runs at least once whenever it is entered.
 */
function ff_vector_fmul_scalar_neon, export=1
        ands            w3,  w2,  #0xfffffff0           // w3 = count with the low four bits cleared
        dup             v16.4s, v0.s[0]                 // broadcast the scalar before v0 is overwritten
        b.eq            3f                              // no whole 16-float pass: go to the 4-float loop
        ld1             {v0.4s, v1.4s}, [x1], #32       // prime the pipeline
1:      subs            w3,  w3,  #16
        fmul            v0.4s,  v0.4s,  v16.4s
        ld1             {v2.4s, v3.4s}, [x1], #32
        fmul            v1.4s,  v1.4s,  v16.4s
        fmul            v2.4s,  v2.4s,  v16.4s
        st1             {v0.4s, v1.4s}, [x0], #32
        fmul            v3.4s,  v3.4s,  v16.4s
        b.eq            2f                              // last whole pass: do not fetch further input
        ld1             {v0.4s, v1.4s}, [x1], #32       // first half of the next pass
        st1             {v2.4s, v3.4s}, [x0], #32
        b               1b
2:      ands            w2,  w2,  #15                   // elements left over after the 16-float passes
        st1             {v2.4s, v3.4s}, [x0], #32       // flush the second half of the final pass
        b.eq            4f
3:      ld1             {v0.4s}, [x1], #16
        fmul            v0.4s,  v0.4s,  v16.4s
        st1             {v0.4s}, [x0], #16
        subs            w2,  w2,  #4
        b.gt            3b
4:      ret
endfunc
86
/*
 * ff_vector_dmul_scalar_neon: scale a double-precision array by one scalar.
 *
 * Register use, as read from the code below:
 *   x0     store pointer for the scaled values
 *   x1     load pointer
 *   v0[0]  scalar multiplier on entry (lane 0 of v0, double precision;
 *          v0 is reused as data afterwards)
 *   w2     element count, reduced by 8 per pass
 * Clobbers v0-v3, v16 and the condition flags.
 *
 * Every pass handles exactly 8 doubles, so the count is presumably required
 * to be a positive multiple of 8 -- TODO confirm against the C prototype.
 *
 * The loop is software pipelined: the first half of the next pass is fetched
 * while the second half of the current one is still being stored.  That
 * fetch is skipped on the final pass, so no bytes beyond the last input
 * element are read.
 */
function ff_vector_dmul_scalar_neon, export=1
        dup             v16.2d, v0.d[0]                 // broadcast the scalar before v0 is overwritten
        ld1             {v0.2d, v1.2d}, [x1], #32       // prime the pipeline
1:      subs            w2,  w2,  #8
        fmul            v0.2d,  v0.2d,  v16.2d
        ld1             {v2.2d, v3.2d}, [x1], #32
        fmul            v1.2d,  v1.2d,  v16.2d
        fmul            v2.2d,  v2.2d,  v16.2d
        st1             {v0.2d, v1.2d}, [x0], #32
        fmul            v3.2d,  v3.2d,  v16.2d
        b.le            2f                              // final pass: nothing left to prefetch
        ld1             {v0.2d, v1.2d}, [x1], #32       // first half of the next pass
        st1             {v2.2d, v3.2d}, [x0], #32
        b               1b
2:      st1             {v2.2d, v3.2d}, [x0], #32       // flush the second half of the final pass
        ret
endfunc
102
/*
 * ff_vector_fmul_window_neon: windowed overlap of two single-precision
 * arrays, producing 2 * len outputs.
 *
 * Register use, as read from the code below:
 *   x0  ascending store pointer (lower half of the output)
 *   x5  descending store pointer, starts at x0 + 8 * (len - 2)
 *   x1  ascending load pointer                          -> s0
 *   x2  descending load pointer, starts 16 bytes before
 *       the end of a len-float array                    -> s1
 *   x3  ascending window pointer                        -> wi
 *   x6  descending window pointer, starts at
 *       x3 + 8 * (len - 2)                              -> wj
 *   w4  len (sign-extended to x4), reduced by 4 per pass
 *   x7  constant -16, the post-index step for the descending pointers
 *
 * Per pass, with _r meaning "four lanes in reversed order":
 *   ascending store  = s0 * wj_r - s1_r * wi
 *   descending store = (s0 * wi)_r + s1 * wj
 *
 * The loop exits only when the count reaches exactly zero, so len is
 * presumably a positive multiple of 4 -- TODO confirm against the C prototype.
 */
function ff_vector_fmul_window_neon, export=1
        sxtw            x4,  w4                         // len
        sub             x5,  x4,  #2                    // len - 2
        sub             x2,  x2,  #8
        add             x2,  x2,  x5,  lsl #2           // src1 + 4 * (len - 4)
        add             x6,  x3,  x5,  lsl #3           // win  + 8 * (len - 2)
        add             x5,  x0,  x5,  lsl #3           // dst  + 8 * (len - 2)
        mov             x7,  #-16
        ld1             {v0.4s},  [x1], #16             // s0
        ld1             {v2.4s},  [x3], #16             // wi
        ld1             {v1.4s},  [x2], x7              // s1
1:      ld1             {v3.4s},  [x6], x7              // wj
        subs            x4,  x4,  #4                    // flags consumed by b.eq below
        fmul            v17.4s, v0.4s,  v2.4s           // s0 * wi
        // rev64 swaps lanes inside each 64-bit half, ext #8 swaps the halves:
        // together they reverse all four lanes.
        rev64           v4.4s,  v1.4s
        rev64           v5.4s,  v3.4s
        rev64           v17.4s, v17.4s
        ext             v4.16b,  v4.16b,  v4.16b,  #8   // s1_r
        ext             v5.16b,  v5.16b,  v5.16b,  #8   // wj_r
        ext             v17.16b, v17.16b, v17.16b, #8   // (s0 * wi)_r
        fmul            v16.4s, v0.4s,  v5.4s           // s0 * wj_r
        fmla            v17.4s, v1.4s,  v3.4s           // (s0 * wi)_r + s1 * wj
        fmls            v16.4s, v4.4s,  v2.4s           // s0 * wj_r - s1_r * wi
        b.eq            2f                              // final pass: store only, no further loads
        ld1             {v0.4s},  [x1], #16
        st1             {v17.4s}, [x5], x7
        ld1             {v2.4s},  [x3], #16
        ld1             {v1.4s},  [x2], x7
        st1             {v16.4s}, [x0], #16
        b               1b
2:      st1             {v17.4s}, [x5], x7
        st1             {v16.4s}, [x0], #16
        ret
endfunc
139
/*
 * ff_vector_fmul_add_neon: element-wise multiply of two single-precision
 * arrays, added to a third.
 *
 * Register use, as read from the code below:
 *   x0  store pointer for the results
 *   x1  first factor load pointer
 *   x2  second factor load pointer
 *   x3  addend load pointer
 *   w4  element count, reduced by 8 per pass
 * Each stored value is addend + factor1 * factor2.
 *
 * The loop exits only when the count reaches exactly zero, so the count is
 * presumably a positive multiple of 8 -- TODO confirm against the C prototype.
 */
function ff_vector_fmul_add_neon, export=1
        // Prime the pipeline with the first eight elements of every input.
        ld1             {v0.4s, v1.4s}, [x1], #32
        ld1             {v2.4s, v3.4s}, [x2], #32
        ld1             {v4.4s, v5.4s}, [x3], #32
1:      fmla            v4.4s,  v0.4s,  v2.4s           // addend += factor1 * factor2
        fmla            v5.4s,  v1.4s,  v3.4s
        subs            w4,  w4,  #8
        b.eq            2f                              // final pass: store only, no further loads
        ld1             {v0.4s, v1.4s}, [x1], #32
        ld1             {v2.4s, v3.4s}, [x2], #32
        st1             {v4.4s, v5.4s}, [x0], #32
        ld1             {v4.4s, v5.4s}, [x3], #32
        b               1b
2:      st1             {v4.4s, v5.4s}, [x0], #32
        ret
endfunc
156
/*
 * ff_vector_fmul_reverse_neon: element-wise product of one single-precision
 * array with another array read back to front.
 *
 * Register use, as read from the code below:
 *   x0  store pointer for the products
 *   x1  ascending load pointer
 *   x2  base of the array that is read in reverse; repositioned to its
 *       last 32 bytes and then walked downwards
 *   w3  element count (sign-extended to x3), reduced by 8 per pass
 *   x4  constant -32, the post-index step for the descending pointer
 *
 * The loop exits only when the count reaches exactly zero, so the count is
 * presumably a positive multiple of 8 -- TODO confirm against the C prototype.
 */
function ff_vector_fmul_reverse_neon, export=1
        sxtw            x3,  w3
        add             x2,  x2,  x3,  lsl #2           // one past the last float
        sub             x2,  x2,  #32                   // back up to the last eight floats
        mov             x4,  #-32
        ld1             {v2.4s, v3.4s}, [x2], x4
        ld1             {v0.4s, v1.4s}, [x1], #32
1:      subs            x3,  x3,  #8                    // flags consumed by b.eq below
        // rev64 followed by ext #8 reverses all four lanes of a vector.
        rev64           v3.4s,  v3.4s
        ext             v3.16b, v3.16b, v3.16b, #8
        rev64           v2.4s,  v2.4s
        ext             v2.16b, v2.16b, v2.16b, #8
        fmul            v16.4s, v0.4s,  v3.4s           // the highest-addressed lanes pair with v0
        fmul            v17.4s, v1.4s,  v2.4s
        b.eq            2f                              // final pass: store only, no further loads
        ld1             {v2.4s, v3.4s}, [x2], x4
        ld1             {v0.4s, v1.4s}, [x1], #32
        st1             {v16.4s, v17.4s}, [x0], #32
        b               1b
2:      st1             {v16.4s, v17.4s}, [x0], #32
        ret
endfunc
179
/*
 * ff_butterflies_float_neon: in-place sum/difference of two single-precision
 * arrays.
 *
 * Register use, as read from the code below:
 *   x0  first array;  each element is replaced by (first + second)
 *   x1  second array; each element is replaced by (first - second)
 *   w2  element count, reduced by 4 per pass
 * Clobbers v0-v3 and the condition flags.
 *
 * Four floats are processed per pass and the loop runs while the remaining
 * count is greater than zero.
 */
function ff_butterflies_float_neon, export=1
1:      ld1             {v0.4s}, [x0]                   // no writeback: the same addresses are stored to below
        ld1             {v1.4s}, [x1]
        subs            w2,  w2,  #4
        fadd            v3.4s,  v0.4s,  v1.4s           // sum
        fsub            v2.4s,  v0.4s,  v1.4s           // difference
        st1             {v2.4s}, [x1], #16
        st1             {v3.4s}, [x0], #16
        b.gt            1b
        ret
endfunc
191
/*
 * ff_scalarproduct_float_neon: dot product of two single-precision arrays.
 *
 * Register use, as read from the code below:
 *   x0  first load pointer
 *   x1  second load pointer
 *   w2  element count, reduced by 4 per pass
 *   s0  result on return
 * Clobbers v0-v2 and the condition flags.
 *
 * Four partial sums are kept, one per lane of v2, and are combined only
 * after the loop.  Four floats are consumed per pass and the loop runs while
 * the remaining count is greater than zero.
 */
function ff_scalarproduct_float_neon, export=1
        movi            v2.4s,  #0                      // clear the four partial sums
1:      ld1             {v0.4s}, [x0], #16
        ld1             {v1.4s}, [x1], #16
        subs            w2,  w2,  #4
        fmla            v2.4s,  v0.4s,  v1.4s           // partial sums += products, lane by lane
        b.gt            1b
        // Horizontal reduction: pairwise add the four lanes down to two, then to one.
        faddp           v0.4s,  v2.4s,  v2.4s
        faddp           s0,  v0.2s
        ret
endfunc