| 1 | /* |
| 2 | * ARM NEON optimised Float DSP functions |
| 3 | * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> |
| 4 | * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net> |
| 5 | * |
| 6 | * This file is part of FFmpeg. |
| 7 | * |
| 8 | * FFmpeg is free software; you can redistribute it and/or |
| 9 | * modify it under the terms of the GNU Lesser General Public |
| 10 | * License as published by the Free Software Foundation; either |
| 11 | * version 2.1 of the License, or (at your option) any later version. |
| 12 | * |
| 13 | * FFmpeg is distributed in the hope that it will be useful, |
| 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 16 | * Lesser General Public License for more details. |
| 17 | * |
| 18 | * You should have received a copy of the GNU Lesser General Public |
| 19 | * License along with FFmpeg; if not, write to the Free Software |
| 20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 21 | */ |
| 22 | |
| 23 | #include "config.h" |
| 24 | #include "asm.S" |
| 25 | |
// ff_vector_fmul_neon: element-wise product of two float arrays.
//   x0 = output pointer (written)
//   x1 = first input pointer, x2 = second input pointer (read)
//   w3 = element count
// Every pass consumes 16 floats from each input and emits 16 products.
// The loop is left only when the count hits exactly zero, so the count
// has to be a positive multiple of 16.
function ff_vector_fmul_neon, export=1
1:      ld1             {v0.4s, v1.4s}, [x1], #32
        ld1             {v4.4s, v5.4s}, [x2], #32
        ld1             {v2.4s, v3.4s}, [x1], #32
        ld1             {v6.4s, v7.4s}, [x2], #32
        subs            w3,  w3,  #16               // flags survive until b.ne below
        fmul            v16.4s, v0.4s,  v4.4s
        fmul            v17.4s, v1.4s,  v5.4s
        fmul            v18.4s, v2.4s,  v6.4s
        fmul            v19.4s, v3.4s,  v7.4s
        st1             {v16.4s, v17.4s}, [x0], #32
        st1             {v18.4s, v19.4s}, [x0], #32
        b.ne            1b
        ret
endfunc
| 41 | |
// ff_vector_fmac_scalar_neon: multiply-accumulate with a scalar, in place:
//   x0[i] += x1[i] * s0
//   x0 = accumulator array (read, then overwritten)
//   x1 = input array (read)
//   s0 = scalar factor, taken from lane 0 of v0
//   w2 = element count
// 16 floats are processed per pass.  The loop is left only when the count
// hits exactly zero, so the count has to be a positive multiple of 16.
function ff_vector_fmac_scalar_neon, export=1
        mov             x3,  #-32                   // step that rewinds x0 after the 2nd load
1:      ld1             {v16.4s, v17.4s}, [x0], #32
        ld1             {v4.4s,  v5.4s},  [x1], #32
        ld1             {v18.4s, v19.4s}, [x0], x3  // x0 points at the block start again
        ld1             {v6.4s,  v7.4s},  [x1], #32
        subs            w2,  w2,  #16               // flags survive until b.ne below
        fmla            v16.4s, v4.4s,  v0.s[0]
        fmla            v17.4s, v5.4s,  v0.s[0]
        fmla            v18.4s, v6.4s,  v0.s[0]
        fmla            v19.4s, v7.4s,  v0.s[0]
        st1             {v16.4s, v17.4s}, [x0], #32 // write back over the values just read
        st1             {v18.4s, v19.4s}, [x0], #32
        b.ne            1b
        ret
endfunc
| 58 | |
// ff_vector_fmul_scalar_neon: multiply a float array by a scalar.
//   x0 = output pointer (written)
//   x1 = input pointer (read)
//   s0 = scalar factor, taken from lane 0 of v0
//   w2 = element count
// The part of the count that is a multiple of 16 goes through a software
// pipelined main loop (16 floats per pass); whatever is left is handled
// four floats at a time by the tail loop.
function ff_vector_fmul_scalar_neon, export=1
        dup             v16.4s, v0.s[0]             // broadcast the factor before v0 is reused
        mov             w4,  #15
        bics            w3,  w2,  w4                // w3 = count rounded down to a multiple of 16
        b.eq            3f                          // fewer than 16 elements: tail loop only
        ld1             {v0.4s, v1.4s}, [x1], #32   // prime the pipeline
1:      subs            w3,  w3,  #16
        fmul            v0.4s,  v0.4s,  v16.4s
        ld1             {v2.4s, v3.4s}, [x1], #32
        fmul            v1.4s,  v1.4s,  v16.4s
        fmul            v2.4s,  v2.4s,  v16.4s
        st1             {v0.4s, v1.4s}, [x0], #32
        fmul            v3.4s,  v3.4s,  v16.4s
        b.eq            2f                          // last full block: skip the look-ahead load
        ld1             {v0.4s, v1.4s}, [x1], #32   // first half of the next block
        st1             {v2.4s, v3.4s}, [x0], #32
        b               1b
2:      st1             {v2.4s, v3.4s}, [x0], #32   // flush the second half of the last block
        ands            w2,  w2,  #15               // w2 = elements left for the tail
        b.eq            4f
3:      ld1             {v0.4s}, [x1], #16          // tail: four floats per pass
        fmul            v0.4s,  v0.4s,  v16.4s
        st1             {v0.4s}, [x0], #16
        subs            w2,  w2,  #4
        b.gt            3b
4:      ret
endfunc
| 86 | |
// ff_vector_dmul_scalar_neon: multiply an array of doubles by a scalar.
//   x0 = output pointer (written)
//   x1 = input pointer (read)
//   d0 = scalar factor, taken from lane 0 of v0
//   w2 = element count
// Eight doubles are processed per pass.  The first half of the following
// block is fetched ahead of time, but only when another pass is actually
// going to run, so nothing beyond the counted input is ever read.
// At least one pass always runs and the count is effectively rounded up
// to a multiple of 8; callers are presumably expected to pass a positive
// multiple of 8.
function ff_vector_dmul_scalar_neon, export=1
        dup             v16.2d, v0.d[0]             // broadcast the factor before v0 is reused
        ld1             {v0.2d, v1.2d}, [x1], #32   // prime the pipeline
1:      subs            w2,  w2,  #8
        fmul            v0.2d,  v0.2d,  v16.2d
        ld1             {v2.2d, v3.2d}, [x1], #32
        fmul            v1.2d,  v1.2d,  v16.2d
        fmul            v2.2d,  v2.2d,  v16.2d
        st1             {v0.2d, v1.2d}, [x0], #32
        fmul            v3.2d,  v3.2d,  v16.2d
        b.le            2f                          // final block: no look-ahead load
        ld1             {v0.2d, v1.2d}, [x1], #32   // first half of the next block
        st1             {v2.2d, v3.2d}, [x0], #32
        b               1b
2:      st1             {v2.2d, v3.2d}, [x0], #32   // flush the second half of the last block
        ret
endfunc
| 102 | |
// ff_vector_fmul_window_neon: windowed combination of two float arrays.
//   x0 = dst, x1 = src0, x2 = src1, x3 = win, w4 = len
// src0 and the first half of win are walked forwards, src1 and the second
// half of win backwards.  Each pass stores four floats at the front of dst
// and four at the back, so dst and win span 2 * len floats while src0 and
// src1 span len floats each.
// The loop is left only when the remaining length hits exactly zero, so
// len has to be a positive multiple of 4.
function ff_vector_fmul_window_neon, export=1
        mov             x7,  #-16                   // backwards step: one 4-float vector
        sxtw            x4,  w4                     // len
        sub             x5,  x4,  #2
        sub             x2,  x2,  #8
        add             x2,  x2,  x5,  lsl #2       // src1 + 4 * (len - 4)
        add             x6,  x3,  x5,  lsl #3       // win  + 8 * (len - 2)
        add             x5,  x0,  x5,  lsl #3       // dst  + 8 * (len - 2)
        ld1             {v0.4s},  [x1], #16         // s0
        ld1             {v2.4s},  [x3], #16         // wi
        ld1             {v1.4s},  [x2], x7          // s1
1:      ld1             {v3.4s},  [x6], x7          // wj
        subs            x4,  x4,  #4                // flags survive until b.eq below
        fmul            v17.4s, v0.4s,  v2.4s       // s0 * wi
        rev64           v4.4s,  v1.4s
        rev64           v5.4s,  v3.4s
        rev64           v17.4s, v17.4s
        ext             v4.16b,  v4.16b,  v4.16b,  #8 // s1_r: s1 with its lanes reversed
        ext             v5.16b,  v5.16b,  v5.16b,  #8 // wj_r: wj with its lanes reversed
        ext             v17.16b, v17.16b, v17.16b, #8 // (s0 * wi)_rev
        fmul            v16.4s, v0.4s,  v5.4s       // s0 * wj_r
        fmla            v17.4s, v1.4s,  v3.4s       // (s0 * wi)_rev + s1 * wj
        b.eq            2f                          // last pass: nothing more to fetch
        ld1             {v0.4s},  [x1], #16         // next s0
        fmls            v16.4s, v4.4s,  v2.4s       // s0 * wj_r - s1_r * wi
        st1             {v17.4s}, [x5], x7          // back half of dst, moving towards the middle
        ld1             {v2.4s},  [x3], #16         // next wi
        ld1             {v1.4s},  [x2], x7          // next s1
        st1             {v16.4s}, [x0], #16         // front half of dst
        b               1b
2:
        fmls            v16.4s, v4.4s,  v2.4s       // s0 * wj_r - s1_r * wi
        st1             {v17.4s}, [x5], x7
        st1             {v16.4s}, [x0], #16
        ret
endfunc
| 139 | |
// ff_vector_fmul_add_neon: element-wise multiply-add of float arrays:
//   x0[i] = x1[i] * x2[i] + x3[i]
//   x0 = output pointer (written)
//   x1, x2 = factor arrays, x3 = addend array (read)
//   w4 = element count
// Eight floats are processed per pass; the operands of the next pass are
// fetched only after the exit test.  The loop is left only when the count
// hits exactly zero, so the count has to be a positive multiple of 8.
function ff_vector_fmul_add_neon, export=1
        ld1             {v4.4s, v5.4s}, [x3], #32   // addends
        ld1             {v0.4s, v1.4s}, [x1], #32   // first factors
        ld1             {v2.4s, v3.4s}, [x2], #32   // second factors
1:      subs            w4,  w4,  #8
        fmla            v4.4s,  v0.4s,  v2.4s       // addend += factor0 * factor1
        fmla            v5.4s,  v1.4s,  v3.4s
        b.eq            2f                          // last pass: nothing more to fetch
        ld1             {v0.4s, v1.4s}, [x1], #32
        ld1             {v2.4s, v3.4s}, [x2], #32
        st1             {v4.4s, v5.4s}, [x0], #32   // store before v4/v5 are reloaded
        ld1             {v4.4s, v5.4s}, [x3], #32
        b               1b
2:      st1             {v4.4s, v5.4s}, [x0], #32
        ret
endfunc
| 156 | |
// ff_vector_fmul_reverse_neon: multiply one float array by another that is
// read back to front.
//   x0 = output pointer (written)
//   x1 = first input, walked forwards
//   x2 = second input, walked backwards starting from its last 8 floats
//   w3 = element count
// Eight floats are processed per pass.  The loop is left only when the
// count hits exactly zero, so the count has to be a positive multiple of 8.
function ff_vector_fmul_reverse_neon, export=1
        mov             x4,  #-32                   // backwards step: eight floats
        sxtw            x3,  w3
        add             x2,  x2,  x3,  lsl #2       // one past the end of the second input
        sub             x2,  x2,  #32               // its last eight floats
        ld1             {v2.4s, v3.4s}, [x2], x4
        ld1             {v0.4s, v1.4s}, [x1], #32
1:      subs            x3,  x3,  #8                // flags survive until b.eq below
        rev64           v2.4s,  v2.4s
        rev64           v3.4s,  v3.4s
        ext             v2.16b, v2.16b, v2.16b, #8  // rev64 + ext: all four lanes reversed
        ext             v3.16b, v3.16b, v3.16b, #8
        fmul            v16.4s, v0.4s,  v3.4s       // upper vector pairs with the first four floats
        fmul            v17.4s, v1.4s,  v2.4s
        b.eq            2f                          // last pass: nothing more to fetch
        ld1             {v2.4s, v3.4s}, [x2], x4
        ld1             {v0.4s, v1.4s}, [x1], #32
        st1             {v16.4s, v17.4s}, [x0], #32
        b               1b
2:      st1             {v16.4s, v17.4s}, [x0], #32
        ret
endfunc
| 179 | |
// ff_butterflies_float_neon: in-place sum/difference of two float arrays.
//   x0 = first array:  receives  first[i] + second[i]
//   x1 = second array: receives  first[i] - second[i]
//   w2 = element count
// Four floats are processed per pass and the loop runs while the remaining
// count is positive, so at least one pass is always executed.
function ff_butterflies_float_neon, export=1
1:      ld1             {v0.4s}, [x0]               // no post-increment: stored back below
        ld1             {v1.4s}, [x1]
        fadd            v3.4s,  v0.4s,  v1.4s       // sum
        fsub            v2.4s,  v0.4s,  v1.4s       // difference
        subs            w2,  w2,  #4
        st1             {v2.4s}, [x1], #16
        st1             {v3.4s}, [x0], #16
        b.gt            1b
        ret
endfunc
| 191 | |
// ff_scalarproduct_float_neon: dot product of two float arrays.
//   x0, x1 = input pointers (read)
//   w2 = element count
//   result is returned in s0
// Four partial sums are kept in v2 and folded together after the loop.
// Four floats are consumed per pass and the loop runs while the remaining
// count is positive, so at least one pass is always executed.
function ff_scalarproduct_float_neon, export=1
        movi            v2.4s,  #0                  // clear the four partial sums
1:      ld1             {v0.4s}, [x0], #16
        ld1             {v1.4s}, [x1], #16
        fmla            v2.4s,  v0.4s,  v1.4s
        subs            w2,  w2,  #4
        b.gt            1b
        faddp           v0.4s,  v2.4s,  v2.4s       // four partial sums -> two
        faddp           s0,  v0.2s                  // two -> final scalar
        ret
endfunc