| 1 | ;****************************************************************************** |
| 2 | ;* SIMD-optimized MLP DSP functions |
| 3 | ;* Copyright (c) 2014 James Almer <jamrial@gmail.com> |
| 4 | ;* |
| 5 | ;* This file is part of FFmpeg. |
| 6 | ;* |
| 7 | ;* FFmpeg is free software; you can redistribute it and/or |
| 8 | ;* modify it under the terms of the GNU Lesser General Public |
| 9 | ;* License as published by the Free Software Foundation; either |
| 10 | ;* version 2.1 of the License, or (at your option) any later version. |
| 11 | ;* |
| 12 | ;* FFmpeg is distributed in the hope that it will be useful, |
| 13 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 15 | ;* Lesser General Public License for more details. |
| 16 | ;* |
| 17 | ;* You should have received a copy of the GNU Lesser General Public |
| 18 | ;* License along with FFmpeg; if not, write to the Free Software |
| 19 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 20 | ;****************************************************************************** |
| 21 | |
| 22 | %include "libavutil/x86/x86util.asm" |
| 23 | |
| 24 | SECTION_TEXT |
| 25 | |
| 26 | %if ARCH_X86_64 |
| 27 | |
;------------------------------------------------------------------------------
; SHLX reg, count
;   In-place left shift of %1 by a variable count.
;     %1  register to shift, written with its size suffix (e.g. noiseq)
;     %2  name of the count register WITHOUT a size suffix; this macro appends
;         "b" (legacy path) or "q" (bmi2 path) itself
;   Without bmi2 the legacy shl is emitted, whose variable count must be in
;   cl, so the caller has to make sure %2 names rcx in that configuration.
;------------------------------------------------------------------------------
%macro SHLX 2
%if notcpuflag(bmi2)
    shl       %1, %2b
%else
    shlx      %1, %1, %2q
%endif
%endmacro
| 35 | |
;------------------------------------------------------------------------------
; REMATRIX
;   Multiply eight consecutive int32 at [samplesq] with the eight int32 at
;   [coeffsq] (signed 32x32 -> 64 bit) and reduce the products to two 64-bit
;   partial sums left in the two qwords of xm0. The caller still has to add
;   those two qwords together.
;   In:    samplesq, coeffsq (loaded with movdqa)
;   Out:   xm0 = two 64-bit partial sums
;   Clobb: m1-m3, plus m4 when avx2 is not enabled
;------------------------------------------------------------------------------
%macro REMATRIX 0
    ; pmuldq only multiplies the even dword lanes, so a copy with adjacent
    ; dwords swapped (q2301) is multiplied as well to cover the odd lanes.
    movdqa        m1, [coeffsq ]
    movdqa        m0, [samplesq]
    pshufd        m3, m1, q2301
    pshufd        m2, m0, q2301
    pmuldq        m3, m2                            ; odd-lane products
    pmuldq        m0, m1                            ; even-lane products
    paddq         m0, m3
%if cpuflag(avx2)
    ; One full-width load already covered all 32 bytes: fold the upper
    ; 128-bit half onto the lower one.
    vextracti128 xm1, m0, 1
    paddq        xm0, xm1
%else
    ; The first load only covered 16 bytes: repeat for the next 16.
    movdqa        m2, [coeffsq  + 16]
    movdqa        m1, [samplesq + 16]
    pshufd        m4, m2, q2301
    pshufd        m3, m1, q2301
    pmuldq        m4, m3
    pmuldq        m1, m2
    paddq         m0, m4
    paddq         m0, m1
%endif
%endmacro
| 58 | |
;------------------------------------------------------------------------------
; LOOP_END
;   Tail of one iteration when no noise is added:
;       accum             = low qword of xm0 + high qword of xm0
;       samples[dest_ch]  = ((accum >> 14) & mask) + *bypassed_lsbs
;       bypassed_lsbs    += 8 bytes, samples += 32 bytes
;   In:    xm0 (two partial sums), dest_chq (byte offset), maskd, cntq
;   Out:   flags from "cmp blsbs_ptrq, cntq"; the caller branches on them
;   Clobb: xm0, xm1, accumq, blsbsd
;------------------------------------------------------------------------------
%macro LOOP_END 0
    movzx     blsbsd, byte [blsbs_ptrq]             ; fetch *bypassed_lsbs
    add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS
    pshufd       xm1, xm0, q0032                    ; bring the high qword down
    paddq        xm0, xm1                           ; low qword = full sum
    movq      accumq, xm0
    sar       accumq, 14                            ; arithmetic shift: accum >>= 14
    and       accumd, maskd                         ; accum &= mask
    add       accumd, blsbsd                        ; accum += *bypassed_lsbs
    mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
    add     samplesq, 32                            ; samples += MAX_CHANNELS
    cmp   blsbs_ptrq, cntq                          ; reached the end pointer?
%endmacro
| 72 | |
;------------------------------------------------------------------------------
; LOOP_SHIFT_END
;   Tail of one iteration when noise is added:
;       index            &= ausp           (caller stored access_unit_size_pow2 - 1 there)
;       noise             = (int64) noise_buffer[index] << mns
;                                          (caller stored matrix_noise_shift + 7 there)
;       index            += index2
;       accum             = low qword of xm0 + high qword of xm0 + noise
;       samples[dest_ch]  = ((accum >> 14) & mask) + *bypassed_lsbs
;       bypassed_lsbs    += 8 bytes, samples += 32 bytes
;   Out:   flags from "cmp blsbs_ptrq, cntq"; the caller branches on them
;   Clobb: xm0, xm1, accumq, noiseq, indexd
;------------------------------------------------------------------------------
%macro LOOP_SHIFT_END 0
    ; Noise term first; it does not depend on the vector sum.
    and       indexd, auspd                         ; wrap index with the mask
    movsx     noiseq, byte [noise_bufferq + indexq] ; sign-extended noise_buffer[index]
    add       indexd, index2d                       ; index += index2
    SHLX      noiseq, mns                           ; scale the noise sample

    ; Horizontal add of the two partial sums, then mix in the noise.
    pshufd       xm1, xm0, q0032
    paddq        xm0, xm1
    movq      accumq, xm0
    add       accumq, noiseq                        ; accum += scaled noise

    movzx     noised, byte [blsbs_ptrq]             ; noise reg is free again: holds *bypassed_lsbs
    sar       accumq, 14                            ; arithmetic shift: accum >>= 14
    and       accumd, maskd                         ; accum &= mask
    add       accumd, noised                        ; accum += *bypassed_lsbs
    mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
    add     samplesq, 32                            ; samples += MAX_CHANNELS
    add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS
    cmp   blsbs_ptrq, cntq                          ; reached the end pointer?
%endmacro
| 91 | |
;------------------------------------------------------------------------------
; void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,
;                              const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,
;                              int index, unsigned int dest_ch, uint16_t blockpos,
;                              unsigned int maxchan, int matrix_noise_shift,
;                              int access_unit_size_pow2, int32_t mask)
;
; For each of the blockpos rows (a row is 32 bytes of samples and 8 bytes of
; bypassed_lsbs):
;     accum = sum(samples[ch] * coeffs[ch])               ; 64-bit products
;     if (matrix_noise_shift) {
;         index &= access_unit_size_pow2 - 1;
;         accum += noise_buffer[index] << (matrix_noise_shift + 7);
;         index += index2;                                ; index2 = 2 * index + 1 at entry
;     }
;     samples[dest_ch] = ((accum >> 14) & mask) + *bypassed_lsbs;
;
; The sum runs over 4 entries when maxchan < 4 and over 8 entries otherwise.
; NOTE(review): this presumably relies on the caller zero-padding coeffs past
; maxchan - confirm against the caller.
; samples and coeffs are read with movdqa, so they must be aligned to the
; vector width in use.
;
; Registers: the cglobal line requests 0 auto-loaded arguments, 13 GPRs and
; 5 vector registers. Arguments that the ABI passes on the stack are loaded
; by hand through their <name>m / rNm aliases. The register of the 4th
; argument (noise_buffer) is called blsbs on the no-noise path, where it is
; only used as scratch for *bypassed_lsbs.
;------------------------------------------------------------------------------
%macro MLP_REMATRIX_CHANNEL 0
cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \
                                        index, dest_ch, blockpos, maxchan, mns, \
                                        accum, mask, cnt
    movzx  blockposq, word blockposm                ; load and zero extend blockpos (16bit)
    ; The loops below are do/while loops that stop when the running pointer
    ; EQUALS the end pointer. With zero rows they would process one row and
    ; then never meet the end pointer, so bail out before touching memory.
    test   blockposd, blockposd
    jz .done
    mov         mnsd, mnsm                          ; load matrix_noise_shift
    mov     maxchand, maxchanm                      ; load maxchan
    mov        maskd, maskm                         ; load mask
%if WIN64
    mov     dest_chd, dest_chm                      ; load dest_ch (already in a register on UNIX64)
%endif
    shl     dest_chd, 2                             ; dest_ch -> byte offset (int32 entries)
    lea         cntq, [blsbs_ptrq + blockposq*8]    ; end pointer = bypassed_lsbs + blockpos rows
    test        mnsd, mnsd                          ; is matrix_noise_shift != 0?
    jne .shift                                      ; jump if true
    cmp     maxchand, 4                             ; is maxchan < 4?
    jb .loop4                                       ; unsigned compare: maxchan is unsigned int

align 16
.loop8:
    ; Process 5 or more channels
    REMATRIX
    LOOP_END
    jne .loop8
    RET

align 16
.loop4:
    ; Process up to 4 channels: a single 128-bit multiply-accumulate
    movdqa       xm0, [samplesq]
    movdqa       xm1, [coeffsq ]
    pshufd       xm2, xm0, q2301
    pshufd       xm3, xm1, q2301
    pmuldq       xm0, xm1
    pmuldq       xm3, xm2
    paddq        xm0, xm3
    LOOP_END
    jne .loop4
    RET

.shift:
%if WIN64
    mov       indexd, indexm                        ; load index (already in a register on UNIX64)
%endif
    mov          r9d, r9m                           ; load access_unit_size_pow2
%if cpuflag(bmi2)
    ; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place.
    DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \
                index, dest_ch, accum, index2, mns, \
                ausp, mask, cnt, noise
    add         mnsd, 7                             ; matrix_noise_shift += 7
%else ; sse4
    mov           r6, rcx                           ; move rcx elsewhere so we can use cl for matrix_noise_shift
%if WIN64
    ; r0 = rcx, i.e. samples now lives in r6
    DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \
                index2, accum, ausp, mask, cnt, noise
%else ; UNIX64
    ; r3 = rcx, i.e. noise_buffer now lives in r6
    DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \
                index2, accum, ausp, mask, cnt, noise
%endif
    lea         mnsd, [r8 + 7]                      ; rcx = matrix_noise_shift + 7 (r8 still holds the old mns)
%endif ; cpuflag
    sub        auspd, 1                             ; access_unit_size_pow2 -= 1 (used as index mask)
    cmp          r7d, 4                             ; is maxchan < 4? (r7 still holds maxchan here)
    lea      index2q, [indexq*2 + 1]                ; index2 = 2 * index + 1; lea keeps the cmp flags
    jb .loop4_shift                                 ; jump if maxchan < 4 (unsigned)

align 16
.loop8_shift:
    ; Process 5 or more channels
    REMATRIX
    LOOP_SHIFT_END
    jne .loop8_shift
    RET

align 16
.loop4_shift:
    ; Process up to 4 channels: a single 128-bit multiply-accumulate
    movdqa       xm0, [samplesq]
    movdqa       xm1, [coeffsq ]
    pshufd       xm2, xm0, q2301
    pshufd       xm3, xm1, q2301
    pmuldq       xm0, xm1
    pmuldq       xm3, xm2
    paddq        xm0, xm3
    LOOP_SHIFT_END
    jne .loop4_shift
.done:
    RET
%endmacro
| 188 | |
; Emit the function once per instruction-set configuration.

; 128-bit vector registers, sse4 cpuflag set.
INIT_XMM sse4
MLP_REMATRIX_CHANNEL

; 256-bit vector registers with the avx2 and bmi2 cpuflags set; only
; assembled when the build configuration defines HAVE_AVX2_EXTERNAL as true.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2, bmi2
MLP_REMATRIX_CHANNEL
%endif
| 195 | |
| 196 | %endif ; ARCH_X86_64 |