ffmpeg/libavcodec/x86/vorbisdsp.asm

   1 ;******************************************************************************
   2 ;* Vorbis x86 optimizations
   3 ;* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
   4 ;*
   5 ;* This file is part of FFmpeg.
   6 ;*
   7 ;* FFmpeg is free software; you can redistribute it and/or
   8 ;* modify it under the terms of the GNU Lesser General Public
   9 ;* License as published by the Free Software Foundation; either
  10 ;* version 2.1 of the License, or (at your option) any later version.
  11 ;*
  12 ;* FFmpeg is distributed in the hope that it will be useful,
  13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 ;* Lesser General Public License for more details.
  16 ;*
  17 ;* You should have received a copy of the GNU Lesser General Public
  18 ;* License along with FFmpeg; if not, write to the Free Software
  19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20 ;******************************************************************************
  21
  22 %include "libavutil/x86/x86util.asm"
  23
  24 SECTION_RODATA
  25
   26 pdw_80000000: times 4 dd 0x80000000 ; four dwords with only bit 31 set: per-lane float sign-bit mask, loaded into m5 by the SSE routine
  27
  28 SECTION .text
  29
   30 %if ARCH_X86_32
      ;------------------------------------------------------------------------------
      ; vorbis_inverse_coupling(mag, ang, block_size) -- 3DNow! version, only
      ; assembled when ARCH_X86_32 is set.
      ;
      ; Rewrites mag[] and ang[] in place, two 32-bit float lanes per iteration.
      ; Per element, with m = mag[i] and a = ang[i] as loaded:
      ;     a'     = (m >= 0.0) ? a with its sign bit flipped : a
      ;     ang[i] = m + ((a >= 0.0) ? a' : 0)
      ;     mag[i] = m - ((a >= 0.0) ? 0  : a')
      ;
      ; In:  mag, ang    pointers to the two arrays
      ;                  (presumably float* -- confirm against the C prototype)
      ;      block_size  element count, used as a pointer-sized signed integer
      ; The loop body runs before the counter is tested, so block_size == 0 would
      ; still process one 2-element vector; callers presumably pass a positive
      ; multiple of 2 -- TODO confirm.
      ; NOTE(review): m7 is used although cglobal declares 6 vector registers;
      ; presumably harmless for MMX registers -- confirm against x86inc.
      ;------------------------------------------------------------------------------
   31 INIT_MMX 3dnow
   32 cglobal vorbis_inverse_coupling, 3, 3, 6, mag, ang, block_size
   33     pxor                     m7, m7     ; m7 = 0.0 in every lane (comparison reference)
   34     lea                    magq, [magq+block_sizeq*4] ; magq -> one past the last element
   35     lea                    angq, [angq+block_sizeq*4] ; angq -> one past the last element
   36     neg             block_sizeq         ; index now runs from -block_size up towards 0
   37 .loop:
   38     mova                     m0, [magq+block_sizeq*4] ; m0 = m
   39     mova                     m1, [angq+block_sizeq*4] ; m1 = a
   40     mova                     m2, m0
   41     mova                     m3, m1
   42     pfcmpge                  m2, m7     ; m2 = all-ones where m >= 0.0, else 0
   43     pfcmpge                  m3, m7     ; m3 = all-ones where a >= 0.0, else 0
   44     pslld                    m2, 31     ; reduce the all-ones mask to just the sign bit
   45     pxor                     m1, m2     ; m1 = a' (sign of a flipped where m >= 0.0)
   46     mova                     m4, m3     ; second copy of the (a >= 0.0) mask
   47     pand                     m3, m1     ; m3 = a' where a >= 0.0, else 0
   48     pandn                    m4, m1     ; m4 = a' where not (a >= 0.0), else 0
   49     pfadd                    m3, m0     ; new ang = m + ((a >= 0.0) ? a' : 0)
   50     pfsub                    m0, m4     ; new mag = m - ((a >= 0.0) ? 0 : a')
   51     mova   [angq+block_sizeq*4], m3
   52     mova   [magq+block_sizeq*4], m0
   53     add             block_sizeq, 2      ; advance by the two elements just processed
   54     jl .loop                            ; flags come from the add: loop while the index is still negative
   55     femms                               ; leave MMX/3DNow! state before returning
   56     RET
   57 %endif
  58
   59 INIT_XMM sse
      ;------------------------------------------------------------------------------
      ; vorbis_inverse_coupling(mag, ang, block_size) -- SSE version.
      ;
      ; Rewrites mag[] and ang[] in place, four 32-bit float lanes per iteration.
      ; Per element, with m = mag[i] and a = ang[i] as loaded:
      ;     a'     = (0.0 <= m) ? a with its sign bit flipped : a
      ;     ang[i] = m + ((0.0 <= a) ? a' : 0)
      ;     mag[i] = m - ((0.0 <= a) ? 0  : a')
      ;
      ; In:  mag, ang    pointers to the two arrays
      ;                  (presumably float* -- confirm against the C prototype)
      ;      block_size  element count, compared as a signed pointer-sized integer
      ; cntr is a scratch register holding the current element index.
      ; Loads and stores use mova, so both arrays are presumably required to be
      ; 16-byte aligned -- TODO confirm.
      ; The loop body runs before the counter is tested, so block_size <= 0 would
      ; still process one 4-element vector; callers presumably pass a positive
      ; multiple of 4 -- TODO confirm.
      ;------------------------------------------------------------------------------
   60 cglobal vorbis_inverse_coupling, 3, 4, 6, mag, ang, block_size, cntr
   61     mova                     m5, [pdw_80000000] ; m5 = sign-bit mask in every lane
   62     xor                   cntrq, cntrq  ; element index starts at 0
   63 align 16
   64 .loop:
   65     mova                     m0, [magq+cntrq*4] ; m0 = m
   66     mova                     m1, [angq+cntrq*4] ; m1 = a
   67     xorps                    m2, m2     ; m2 = 0.0
   68     xorps                    m3, m3     ; m3 = 0.0
   69     cmpleps                  m2, m0     ; m2 = all-ones where 0.0 <= m, else 0
   70     cmpleps                  m3, m1     ; m3 = all-ones where 0.0 <= a, else 0
   71     andps                    m2, m5     ; keep only the sign bit
   72     xorps                    m1, m2     ; m1 = a' (sign of a flipped where 0.0 <= m)
   73     mova                     m4, m3     ; second copy of the (0.0 <= a) mask
   74     andps                    m3, m1     ; m3 = a' where 0.0 <= a, else 0
   75     andnps                   m4, m1     ; m4 = a' where not (0.0 <= a), else 0
   76     addps                    m3, m0     ; new ang = m + ((0.0 <= a) ? a' : 0)
   77     subps                    m0, m4     ; new mag = m - ((0.0 <= a) ? 0 : a')
   78     mova         [angq+cntrq*4], m3
   79     mova         [magq+cntrq*4], m0
   80     add                   cntrq, 4      ; advance by the four elements just processed
   81     cmp                   cntrq, block_sizeq
   82     jl .loop                            ; signed compare: continue while cntr < block_size
   83     RET