; Imported Debian version 2.5.0~trusty1.1
; [deb_ffmpeg.git] / ffmpeg / libavcodec / x86 / mlpdsp.asm
1 ;******************************************************************************
2 ;* SIMD-optimized MLP DSP functions
3 ;* Copyright (c) 2014 James Almer <jamrial@gmail.com>
4 ;*
5 ;* This file is part of FFmpeg.
6 ;*
7 ;* FFmpeg is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
11 ;*
12 ;* FFmpeg is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
16 ;*
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with FFmpeg; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
21
22 %include "libavutil/x86/x86util.asm"
23
24 SECTION_TEXT
25
26 %if ARCH_X86_64
27
; SHLX dst, count_reg
; Left-shift %1 by the count held in register %2.
; With BMI2, shlx takes its count from any GPR, so registers can stay put.
; Without BMI2, legacy shl only accepts a register count in cl, so callers
; must have arranged (via DEFINE_ARGS) for %2 to be rcx before using this.
%macro SHLX 2
%if cpuflag(bmi2)
    shlx          %1, %1, %2q
%else
    shl           %1, %2b
%endif
%endmacro
35
; Multiply-accumulate samples[] * coeffs[] into the 64-bit lanes of m0.
; pmuldq only multiplies the even dword lanes (0 and 2) of its inputs, so
; each vector is also processed with adjacent dwords swapped (pshufd q2301)
; to cover the odd lanes; the two qword products are then summed.
%macro REMATRIX 0
    movdqa        m0, [samplesq]
    movdqa        m1, [coeffsq ]
    pshufd        m2, m0, q2301
    pshufd        m3, m1, q2301
    pmuldq        m0, m1
    pmuldq        m3, m2
    paddq         m0, m3
%if notcpuflag(avx2)
    ; SSE4 path: the upper 4 channels live in a second 16-byte load,
    ; processed with the same swap-and-multiply scheme and accumulated.
    movdqa        m1, [samplesq + 16]
    movdqa        m2, [coeffsq + 16]
    pshufd        m3, m1, q2301
    pshufd        m4, m2, q2301
    pmuldq        m1, m2
    pmuldq        m4, m3
    paddq         m0, m1
    paddq         m0, m4
%else
    ; AVX2 path: all 8 channels fit in one ymm; fold the high 128 bits
    ; into the low half so the rest of the loop works on xmm only.
    vextracti128  xm1, m0, 1
    paddq         xm0, xm1
%endif
%endmacro
58
; Horizontally sum the two qwords in xm0 into accum, finish one output
; sample (>>14, mask, add bypassed lsb), store it and advance the pointers.
; The final cmp leaves flags set for the caller's loop branch (jne).
%macro LOOP_END 0
    pshufd        xm1, xm0, q0032               ; move high qword down
    paddq         xm0, xm1                      ; accum = lane0 + lane1
    movq          accumq, xm0
    movzx         blsbsd, byte [blsbs_ptrq]     ; load *bypassed_lsbs
    sar           accumq, 14                    ; accum >>= 14
    and           accumd, maskd                 ; accum &= mask
    add           accumd, blsbsd                ; accum += *bypassed_lsbs
    mov           [samplesq + dest_chq], accumd ; samples[dest_ch] = accum
    add           blsbs_ptrq, 8                 ; bypassed_lsbs += MAX_CHANNELS;
    add           samplesq, 32                  ; samples += MAX_CHANNELS;
    cmp           blsbs_ptrq, cntq              ; flags consumed by caller's jne
%endmacro
72
; Same as LOOP_END, but first adds a dithering sample from noise_buffer,
; shifted left by matrix_noise_shift + 7 (the +7 was folded into mns by
; the caller before the loop). Flags from the final cmp feed the caller's
; jne, exactly as in LOOP_END.
%macro LOOP_SHIFT_END 0
    pshufd        xm1, xm0, q0032               ; move high qword down
    paddq         xm0, xm1                      ; accum = lane0 + lane1
    movq          accumq, xm0
    and           indexd, auspd                 ; index &= access_unit_size_pow2;
    movsx         noiseq, byte [noise_bufferq + indexq] ; load noise_buffer[index]
    add           indexd, index2d               ; index += index2
    SHLX          noiseq, mns                   ; noise <<= matrix_noise_shift + 7
    add           accumq, noiseq                ; accum += shifted noise
    movzx         noised, byte [blsbs_ptrq]     ; load *bypassed_lsbs (reuse tmp noise register)
    sar           accumq, 14                    ; accum >>= 14
    and           accumd, maskd                 ; accum &= mask
    add           accumd, noised                ; accum += *bypassed_lsbs
    mov           [samplesq + dest_chq], accumd ; samples[dest_ch] = accum
    add           blsbs_ptrq, 8                 ; bypassed_lsbs += MAX_CHANNELS;
    add           samplesq, 32                  ; samples += MAX_CHANNELS;
    cmp           blsbs_ptrq, cntq              ; flags consumed by caller's jne
%endmacro
91
92 ;void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,
93 ; const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,
94 ; int index, unsigned int dest_ch, uint16_t blockpos,
95 ; unsigned int maxchan, int matrix_noise_shift,
96 ; int access_unit_size_pow2, int32_t mask)
; Emits one flavor (per INIT_XMM/INIT_YMM) of ff_mlp_rematrix_channel.
; Four loop variants: {<=4 channels, >4 channels} x {no noise, noise shift}.
%macro MLP_REMATRIX_CHANNEL 0
cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \
                                        index, dest_ch, blockpos, maxchan, mns, \
                                        accum, mask, cnt
    mov           mnsd, mnsm                    ; load matrix_noise_shift
    movzx         blockposq, word blockposm     ; load and zero extend blockpos (16bit)
    mov           maxchand, maxchanm            ; load maxchan
    mov           maskd, maskm                  ; load mask
%if WIN64
    mov           dest_chd, dest_chm            ; load dest_chd (not needed on UNIX64)
%endif
    shl           dest_chd, 2                   ; dest_ch becomes a byte offset into int32_t samples[]
    lea           cntq, [blsbs_ptrq + blockposq*8] ; loop end pointer: blockpos iterations, 8 bytes each
    test          mnsd, mnsd                    ; is matrix_noise_shift != 0?
    jne           .shift                        ; jump if true
    cmp           maxchand, 4                   ; is maxchan < 4?
    jl            .loop4                        ; jump if true

align 16
.loop8:
    ; Process 5 or more channels
    REMATRIX
    LOOP_END
    jne           .loop8
    RET

align 16
.loop4:
    ; Process up to 4 channels: one 16-byte load covers them, so inline
    ; the first half of REMATRIX only (see REMATRIX for lane details).
    movdqa        xm0, [samplesq]
    movdqa        xm1, [coeffsq ]
    pshufd        xm2, xm0, q2301
    pshufd        xm3, xm1, q2301
    pmuldq        xm0, xm1
    pmuldq        xm3, xm2
    paddq         xm0, xm3
    LOOP_END
    jne           .loop4
    RET

.shift:
    ; Noise-shift path: the remaining stack args are loaded and the
    ; registers are renamed so the loop macros can use them by role.
%if WIN64
    mov           indexd, indexm                ; load index (not needed on UNIX64)
%endif
    mov           r9d, r9m                      ; load access_unit_size_pow2
%if cpuflag(bmi2)
    ; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place.
    DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \
                index, dest_ch, accum, index2, mns, \
                ausp, mask, cnt, noise
    add           mnsd, 7                       ; matrix_noise_shift += 7
%else ; sse4
    mov           r6, rcx                       ; move rcx elsewhere so we can use cl for matrix_noise_shift
%if WIN64
    ; r0 = rcx
    DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \
                index2, accum, ausp, mask, cnt, noise
%else ; UNIX64
    ; r3 = rcx
    DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \
                index2, accum, ausp, mask, cnt, noise
%endif
    lea           mnsd, [r8 + 7]                ; rcx = matrix_noise_shift + 7
%endif ; cpuflag
    sub           auspd, 1                      ; access_unit_size_pow2 -= 1 (power of two -> index wrap mask)
    cmp           r7d, 4                        ; is maxchan < 4? (r7 still holds maxchan from above)
    lea           index2q, [indexq*2 + 1]       ; index2 = 2 * index + 1;
    jl            .loop4_shift                  ; jump if maxchan < 4

align 16
.loop8_shift:
    ; Process 5 or more channels
    REMATRIX
    LOOP_SHIFT_END
    jne           .loop8_shift
    RET

align 16
.loop4_shift:
    ; Process up to 4 channels (inline single-load REMATRIX, as in .loop4)
    movdqa        xm0, [samplesq]
    movdqa        xm1, [coeffsq ]
    pshufd        xm2, xm0, q2301
    pshufd        xm3, xm1, q2301
    pmuldq        xm0, xm1
    pmuldq        xm3, xm2
    paddq         xm0, xm3
    LOOP_SHIFT_END
    jne           .loop4_shift
    RET
%endmacro
188
; Instantiate the two flavors: a baseline SSE4.1 (xmm) version, and an
; AVX2+BMI2 (ymm) version when the assembler supports AVX2.
INIT_XMM sse4
MLP_REMATRIX_CHANNEL
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2, bmi2
MLP_REMATRIX_CHANNEL
%endif
195
196 %endif ; ARCH_X86_64