[deb_ffmpeg.git] / ffmpeg / libavcodec / mips / aacsbr_mips.c

/*
 * Copyright (c) 2012
 *      MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Authors:  Djordje Pesut   (djordje@mips.com)
 *           Mirjana Vulin   (mvulin@mips.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * Reference: libavcodec/aacsbr.c
 */

#include "libavcodec/aac.h"
#include "libavcodec/aacsbr.h"

#define ENVELOPE_ADJUSTMENT_OFFSET 2

#if HAVE_INLINE_ASM
/**
 * MIPS inline-assembly implementation of the SBR low-frequency generation
 * step (C counterpart: see the reference named in the file header).
 *
 * The function clears X_low and then fills it from the two W buffers:
 *  - for every k < sbr->kx[1]:
 *        X_low[k][8 .. 39][0..1] = W[buf_idx][0 .. 31][k][0..1]
 *  - for every k < sbr->kx[0]:
 *        X_low[k][0 .. 7][0..1]  = W[1 - buf_idx][24 .. 31][k][0..1]
 *
 * All byte offsets used inside the asm assume 4-byte floats and the array
 * shapes declared in the parameter list (X_low[32][40][2], W[2][32][32][2]).
 * Values are moved with integer lw/sw, i.e. as raw 32-bit words.
 *
 * @param ac       not used by this function
 * @param sbr      supplies kx[0] and kx[1], the loop bounds for k
 * @param X_low    destination array, fully overwritten
 * @param W        source array
 * @param buf_idx  index of the W buffer copied into X_low[k][8..39];
 *                 the other buffer (1 - buf_idx) feeds X_low[k][0..7]
 * @return always 0
 */
static int sbr_lf_gen_mips(AACContext *ac, SpectralBandReplication *sbr,
                      float X_low[32][40][2], const float W[2][32][32][2],
                      int buf_idx)
{
    int i, k;
    /* scratch registers holding raw 32-bit words in transit */
    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    /* write cursor for the first copy pass, starts at X_low[0][8] */
    float *p_x_low = &X_low[0][8][0];
    /* read cursor for the first copy pass, starts at W[buf_idx][0][0] */
    float *p_w = (float*)&W[buf_idx][0][0][0];
    /* cursor used first for zeroing, then as write cursor of the second pass */
    float *p_x1_low = &X_low[0][0][0];
    /* read cursor for the second copy pass, starts at W[1-buf_idx][24][0] */
    float *p_w1 = (float*)&W[1-buf_idx][24][0][0];

    /* one past the last float of X_low: 32 * 40 * 2 = 2560 floats */
    float *loop_end=p_x1_low + 2560;

    /*
     * Zero the whole of X_low, 8 words (32 bytes) per iteration.
     * The trailing addiu rewinds p_x1_low by 10240 bytes (2560 floats) so it
     * points at X_low[0][0][0] again for the second copy pass.
     * NOTE(review): this relies on the assembler NOT scheduling that addiu
     * into the delay slot of the bne (presumably the default ".set reorder"
     * mode is in effect) -- confirm.
     */
    /* loop unrolled 8 times */
    __asm__ volatile (
    "1:                                                 \n\t"
        "sw     $0,            0(%[p_x1_low])           \n\t"
        "sw     $0,            4(%[p_x1_low])           \n\t"
        "sw     $0,            8(%[p_x1_low])           \n\t"
        "sw     $0,            12(%[p_x1_low])          \n\t"
        "sw     $0,            16(%[p_x1_low])          \n\t"
        "sw     $0,            20(%[p_x1_low])          \n\t"
        "sw     $0,            24(%[p_x1_low])          \n\t"
        "sw     $0,            28(%[p_x1_low])          \n\t"
        "addiu  %[p_x1_low],   %[p_x1_low],      32     \n\t"
        "bne    %[p_x1_low],   %[loop_end],      1b     \n\t"
        "addiu  %[p_x1_low],   %[p_x1_low],      -10240 \n\t"

        : [p_x1_low]"+r"(p_x1_low)
        : [loop_end]"r"(loop_end)
        : "memory"
    );

    /*
     * First copy pass: X_low[k][8 + i] = W[buf_idx][i][k] for i = 0..31.
     * Consecutive i are 256 bytes apart in W (32 * 2 floats) and 8 bytes
     * apart in X_low, so each asm block gathers four pairs at offsets
     * 0/256/512/768 and stores them contiguously.
     */
    for (k = 0; k < sbr->kx[1]; k++) {
        for (i = 0; i < 32; i+=4) {
            /* loop unrolled 4 times */
            __asm__ volatile (
                "lw     %[temp0],   0(%[p_w])               \n\t"
                "lw     %[temp1],   4(%[p_w])               \n\t"
                "lw     %[temp2],   256(%[p_w])             \n\t"
                "lw     %[temp3],   260(%[p_w])             \n\t"
                "lw     %[temp4],   512(%[p_w])             \n\t"
                "lw     %[temp5],   516(%[p_w])             \n\t"
                "lw     %[temp6],   768(%[p_w])             \n\t"
                "lw     %[temp7],   772(%[p_w])             \n\t"
                "sw     %[temp0],   0(%[p_x_low])           \n\t"
                "sw     %[temp1],   4(%[p_x_low])           \n\t"
                "sw     %[temp2],   8(%[p_x_low])           \n\t"
                "sw     %[temp3],   12(%[p_x_low])          \n\t"
                "sw     %[temp4],   16(%[p_x_low])          \n\t"
                "sw     %[temp5],   20(%[p_x_low])          \n\t"
                "sw     %[temp6],   24(%[p_x_low])          \n\t"
                "sw     %[temp7],   28(%[p_x_low])          \n\t"
                "addiu  %[p_x_low], %[p_x_low],     32      \n\t"
                "addiu  %[p_w],     %[p_w],         1024    \n\t"

                : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
                  [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
                  [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
                  [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
                  [p_w]"+r"(p_w), [p_x_low]"+r"(p_x_low)
                :
                : "memory"
            );
        }
        /* 64 floats were written; +16 more reaches X_low[k + 1][8] (row = 80 floats) */
        p_x_low += 16;
        /* 2048 floats were traversed; -2046 leaves a net +2, i.e. W[buf_idx][0][k + 1] */
        p_w -= 2046;
    }

    /*
     * Second copy pass: X_low[k][i] = W[1 - buf_idx][24 + i][k] for i = 0..7,
     * done as two asm blocks of four pairs each.
     */
    for (k = 0; k < sbr->kx[0]; k++) {
        for (i = 0; i < 2; i++) {

            /* loop unrolled 4 times */
            __asm__ volatile (
                "lw     %[temp0],    0(%[p_w1])             \n\t"
                "lw     %[temp1],    4(%[p_w1])             \n\t"
                "lw     %[temp2],    256(%[p_w1])           \n\t"
                "lw     %[temp3],    260(%[p_w1])           \n\t"
                "lw     %[temp4],    512(%[p_w1])           \n\t"
                "lw     %[temp5],    516(%[p_w1])           \n\t"
                "lw     %[temp6],    768(%[p_w1])           \n\t"
                "lw     %[temp7],    772(%[p_w1])           \n\t"
                "sw     %[temp0],    0(%[p_x1_low])         \n\t"
                "sw     %[temp1],    4(%[p_x1_low])         \n\t"
                "sw     %[temp2],    8(%[p_x1_low])         \n\t"
                "sw     %[temp3],    12(%[p_x1_low])        \n\t"
                "sw     %[temp4],    16(%[p_x1_low])        \n\t"
                "sw     %[temp5],    20(%[p_x1_low])        \n\t"
                "sw     %[temp6],    24(%[p_x1_low])        \n\t"
                "sw     %[temp7],    28(%[p_x1_low])        \n\t"
                "addiu  %[p_x1_low], %[p_x1_low],   32      \n\t"
                "addiu  %[p_w1],     %[p_w1],       1024    \n\t"

                : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
                  [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
                  [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
                  [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
                  [p_w1]"+r"(p_w1), [p_x1_low]"+r"(p_x1_low)
                :
                : "memory"
            );
        }
        /* 16 floats were written; +64 more reaches X_low[k + 1][0] */
        p_x1_low += 64;
        /* 512 floats were traversed; -510 leaves a net +2, i.e. W[1-buf_idx][24][k + 1] */
        p_w1 -= 510;
    }
    return 0;
}

/**
 * MIPS inline-assembly implementation of the SBR X-matrix generation step
 * (C counterpart: see the reference named in the file header).
 *
 * X is zeroed first. Afterwards component [0] of each source pair is written
 * to X[0][i][k] and component [1] to X[1][i][k]:
 *  - only when i_Temp != 0, for i in [0, i_Temp):
 *        k in [0, kx[0])              : source is X_low[k][i + 2]
 *        k in [kx[0], kx[0] + m[0])   : source is Y0[i + 32][k]
 *  - for i in [i_Temp, 38), k in [0, kx[1])            : source is X_low[k][i + 2]
 *  - for i in [i_Temp, 32), k in [kx[1], kx[1] + m[1]) : source is Y1[i][k]
 *
 * The "+ 2" matches ENVELOPE_ADJUSTMENT_OFFSET as defined in this file.
 * Byte offsets in the asm assume 4-byte floats and the declared array
 * shapes: 256 bytes = one X[..][i] row (64 floats), 9728 bytes = distance
 * from X[0][i][k] to X[1][i][k] (38 * 64 floats), 512 bytes = one Y[i] row.
 *
 * NOTE(review): every inner asm loop is bottom-tested with an equality
 * branch, so it runs at least once and only stops when the counter hits the
 * bound exactly. The first two loops are protected by the i_Temp != 0 test;
 * the last two would overrun if i_Temp were >= 38 resp. >= 32 -- confirm
 * that callers guarantee a smaller i_Temp.
 *
 * @param sbr    supplies kx[], m[] and data[ch].t_env_num_env_old
 * @param X      destination, fully overwritten
 * @param Y0     source for the second k range of the i < i_Temp part
 * @param Y1     source for the second k range of the i >= i_Temp part
 * @param X_low  source for the first k range of both parts
 * @param ch     index into sbr->data[]
 * @return always 0
 */
static int sbr_x_gen_mips(SpectralBandReplication *sbr, float X[2][38][64],
                     const float Y0[38][64][2], const float Y1[38][64][2],
                     const float X_low[32][40][2], int ch)
{
    int k, i;
    const int i_f = 32;
    int temp0, temp1, temp2, temp3;
    const float *X_low1, *Y01, *Y11;
    float *x1=&X[0][0][0];
    /* one past the last float of X: 2 * 38 * 64 = 4864 floats */
    float *j=x1+4864;
    const int i_Temp = FFMAX(2*sbr->data[ch].t_env_num_env_old - i_f, 0);

    /*
     * Zero all of X, 8 words per iteration, then rewind x1 by 19456 bytes
     * (4864 floats) back to X[0][0][0].
     * NOTE(review): relies on the assembler not moving the final addiu into
     * the bne delay slot (presumably ".set reorder" is active) -- confirm.
     */
    /* loop unrolled 8 times */
    __asm__ volatile (
    "1:                                       \n\t"
        "sw     $0,      0(%[x1])             \n\t"
        "sw     $0,      4(%[x1])             \n\t"
        "sw     $0,      8(%[x1])             \n\t"
        "sw     $0,      12(%[x1])            \n\t"
        "sw     $0,      16(%[x1])            \n\t"
        "sw     $0,      20(%[x1])            \n\t"
        "sw     $0,      24(%[x1])            \n\t"
        "sw     $0,      28(%[x1])            \n\t"
        "addiu  %[x1],   %[x1],      32       \n\t"
        "bne    %[x1],   %[j],       1b       \n\t"
        "addiu  %[x1],   %[x1],      -19456   \n\t"

        : [x1]"+r"(x1)
        : [j]"r"(j)
        : "memory"
    );

    /* guard needed because the asm loops below always execute at least once */
    if (i_Temp != 0) {

        X_low1=&X_low[0][2][0];

        /* X[0|1][i][k] = X_low[k][i + 2][0|1] for i in [0, i_Temp) */
        for (k = 0; k < sbr->kx[0]; k++) {

            __asm__ volatile (
                "move    %[i],        $zero                  \n\t"
            "2:                                              \n\t"
                "lw      %[temp0],    0(%[X_low1])           \n\t"
                "lw      %[temp1],    4(%[X_low1])           \n\t"
                "sw      %[temp0],    0(%[x1])               \n\t"
                "sw      %[temp1],    9728(%[x1])            \n\t"
                "addiu   %[x1],       %[x1],         256     \n\t"
                "addiu   %[X_low1],   %[X_low1],     8       \n\t"
                "addiu   %[i],        %[i],          1       \n\t"
                "bne     %[i],        %[i_Temp],     2b      \n\t"

                : [x1]"+r"(x1), [X_low1]"+r"(X_low1), [i]"=&r"(i),
                  [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
                : [i_Temp]"r"(i_Temp)
                : "memory"
            );
            /* undo the i_Temp row steps (64 floats each) and move to column k + 1 */
            x1-=(i_Temp<<6)-1;
            /* undo the i_Temp pair steps and move to the next X_low row (80 floats) */
            X_low1-=(i_Temp<<1)-80;
        }

        x1=&X[0][0][k];
        Y01=(float*)&Y0[32][k][0];

        /* X[0|1][i][k] = Y0[i + 32][k][0|1] for i in [0, i_Temp) */
        for (; k < sbr->kx[0] + sbr->m[0]; k++) {
            __asm__ volatile (
                "move    %[i],       $zero               \n\t"
            "3:                                          \n\t"
                "lw      %[temp0],   0(%[Y01])           \n\t"
                "lw      %[temp1],   4(%[Y01])           \n\t"
                "sw      %[temp0],   0(%[x1])            \n\t"
                "sw      %[temp1],   9728(%[x1])         \n\t"
                "addiu   %[x1],      %[x1],      256     \n\t"
                "addiu   %[Y01],     %[Y01],     512     \n\t"
                "addiu   %[i],       %[i],       1       \n\t"
                "bne     %[i],       %[i_Temp],  3b      \n\t"

                : [x1]"+r"(x1), [Y01]"+r"(Y01), [i]"=&r"(i),
                  [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
                : [i_Temp]"r"(i_Temp)
                : "memory"
            );
            /* rewind i_Temp rows of X and step to column k + 1 */
            x1 -=(i_Temp<<6)-1;
            /* rewind i_Temp rows of Y0 (128 floats each) and step to pair k + 1 */
            Y01 -=(i_Temp<<7)-2;
        }
    }

    x1=&X[0][i_Temp][0];
    X_low1=&X_low[0][i_Temp+2][0];
    /* upper bound (exclusive) of i for the X_low part below */
    temp3=38;

    /* X[0|1][i][k] = X_low[k][i + 2][0|1] for i in [i_Temp, 38) */
    for (k = 0; k < sbr->kx[1]; k++) {

        /*
         * NOTE(review): temp2 is listed as an output operand but the
         * template never writes it; it is reassigned before its next use.
         */
        __asm__ volatile (
            "move    %[i],       %[i_Temp]              \n\t"
        "4:                                             \n\t"
            "lw      %[temp0],   0(%[X_low1])           \n\t"
            "lw      %[temp1],   4(%[X_low1])           \n\t"
            "sw      %[temp0],   0(%[x1])               \n\t"
            "sw      %[temp1],   9728(%[x1])            \n\t"
            "addiu   %[x1],      %[x1],         256     \n\t"
            "addiu   %[X_low1],  %[X_low1],     8       \n\t"
            "addiu   %[i],       %[i],          1       \n\t"
            "bne     %[i],       %[temp3],      4b      \n\t"

            : [x1]"+r"(x1), [X_low1]"+r"(X_low1), [i]"=&r"(i),
              [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
              [temp2]"=&r"(temp2)
            : [i_Temp]"r"(i_Temp), [temp3]"r"(temp3)
            : "memory"
        );
        /* rewind the (38 - i_Temp) row steps and move to column k + 1 */
        x1 -= ((38-i_Temp)<<6)-1;
        /* rewind the (38 - i_Temp) pair steps and move to the next X_low row */
        X_low1 -= ((38-i_Temp)<<1)- 80;
    }

    x1=&X[0][i_Temp][k];
    Y11=&Y1[i_Temp][k][0];
    /* upper bound (exclusive) of i for the Y1 part below */
    temp2=32;

    /* X[0|1][i][k] = Y1[i][k][0|1] for i in [i_Temp, 32) */
    for (; k < sbr->kx[1] + sbr->m[1]; k++) {

        /* NOTE(review): the temp3 input operand is not referenced by the template. */
        __asm__ volatile (
           "move    %[i],       %[i_Temp]               \n\t"
        "5:                                             \n\t"
           "lw      %[temp0],   0(%[Y11])               \n\t"
           "lw      %[temp1],   4(%[Y11])               \n\t"
           "sw      %[temp0],   0(%[x1])                \n\t"
           "sw      %[temp1],   9728(%[x1])             \n\t"
           "addiu   %[x1],      %[x1],          256     \n\t"
           "addiu   %[Y11],     %[Y11],         512     \n\t"
           "addiu   %[i],       %[i],           1       \n\t"
           "bne     %[i],       %[temp2],       5b      \n\t"

           : [x1]"+r"(x1), [Y11]"+r"(Y11), [i]"=&r"(i),
             [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
           : [i_Temp]"r"(i_Temp), [temp3]"r"(temp3),
             [temp2]"r"(temp2)
           : "memory"
        );

        /* rewind the (32 - i_Temp) row steps of X and move to column k + 1 */
        x1 -= ((32-i_Temp)<<6)-1;
        /* rewind the (32 - i_Temp) row steps of Y1 and move to pair k + 1 */
        Y11 -= ((32-i_Temp)<<7)-2;
   }
      return 0;
}

#if HAVE_MIPSFPU
/**
 * MIPS inline-assembly implementation of the SBR high-frequency assembly
 * step (C counterpart: see the reference named in the file header).
 *
 * Steps performed:
 *  1. Prime the gain / noise history rows (g_temp, q_temp) either from
 *     sbr->gain[0] / sbr->q_m[0] (when sbr->reset is set) or from the rows
 *     at 2 * t_env_num_env_old (when smoothing is enabled).
 *  2. For every envelope e and every i in [2*t_env[e], 2*t_env[e+1]), copy
 *     sbr->gain[e] and sbr->q_m[e] (m_max words each) into row h_SL + i.
 *  3. For every such i, optionally smooth the history with h_smooth, apply
 *     the gains through sbr->dsp.hf_g_filt, and then either call
 *     sbr->dsp.hf_apply_noise[indexsine] or add the sinusoid terms from
 *     sbr->s_m[e] directly when e equals e_a[0] or e_a[1].
 *  4. Store the updated noise / sine indices back into ch_data.
 *
 * @param Y1       output array, updated in place
 * @param X_high   input passed on to hf_g_filt
 * @param sbr      supplies bs_smoothing_mode, kx[1], m[1], reset, gain, q_m,
 *                 s_m and the dsp callbacks
 * @param ch_data  supplies/receives g_temp, q_temp, f_indexnoise,
 *                 f_indexsine; supplies t_env, t_env_num_env_old, bs_num_env
 * @param e_a      two envelope indices that select the sinusoid-only path
 */
static void sbr_hf_assemble_mips(float Y1[38][64][2],
                            const float X_high[64][40][2],
                            SpectralBandReplication *sbr, SBRData *ch_data,
                            const int e_a[2])
{
    int e, i, j, m;
    /* 4 when smoothing is enabled (bs_smoothing_mode == 0), otherwise 0 */
    const int h_SL = 4 * !sbr->bs_smoothing_mode;
    const int kx = sbr->kx[1];
    const int m_max = sbr->m[1];
    static const float h_smooth[5] = {
        0.33333333333333,
        0.30150283239582,
        0.21816949906249,
        0.11516383427084,
        0.03183050093751,
    };

    float (*g_temp)[48] = ch_data->g_temp, (*q_temp)[48] = ch_data->q_temp;
    int indexnoise = ch_data->f_indexnoise;
    int indexsine  = ch_data->f_indexsine;
    float *g_temp1, *q_temp1, *pok, *pok1;
    /* scratch for the word-copy asm below (moved through GPRs as raw words) */
    float temp1, temp2, temp3, temp4;
    int size = m_max;

    if (sbr->reset) {
        for (i = 0; i < h_SL; i++) {
            memcpy(g_temp[i + 2*ch_data->t_env[0]], sbr->gain[0], m_max * sizeof(sbr->gain[0][0]));
            memcpy(q_temp[i + 2*ch_data->t_env[0]], sbr->q_m[0],  m_max * sizeof(sbr->q_m[0][0]));
        }
    } else if (h_SL) {
        memcpy(g_temp[2*ch_data->t_env[0]], g_temp[2*ch_data->t_env_num_env_old], 4*sizeof(g_temp[0]));
        memcpy(q_temp[2*ch_data->t_env[0]], q_temp[2*ch_data->t_env_num_env_old], 4*sizeof(q_temp[0]));
    }

    /* copy gain[e] -> g_temp[h_SL + i] and q_m[e] -> q_temp[h_SL + i] */
    for (e = 0; e < ch_data->bs_num_env; e++) {
        for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
            g_temp1 = g_temp[h_SL + i];
            pok = sbr->gain[e];
            q_temp1 = q_temp[h_SL + i];
            pok1 = sbr->q_m[e];

            /* loop unrolled 4 times */
            for (j=0; j<(size>>2); j++) {
                __asm__ volatile (
                    "lw      %[temp1],   0(%[pok])               \n\t"
                    "lw      %[temp2],   4(%[pok])               \n\t"
                    "lw      %[temp3],   8(%[pok])               \n\t"
                    "lw      %[temp4],   12(%[pok])              \n\t"
                    "sw      %[temp1],   0(%[g_temp1])           \n\t"
                    "sw      %[temp2],   4(%[g_temp1])           \n\t"
                    "sw      %[temp3],   8(%[g_temp1])           \n\t"
                    "sw      %[temp4],   12(%[g_temp1])          \n\t"
                    "lw      %[temp1],   0(%[pok1])              \n\t"
                    "lw      %[temp2],   4(%[pok1])              \n\t"
                    "lw      %[temp3],   8(%[pok1])              \n\t"
                    "lw      %[temp4],   12(%[pok1])             \n\t"
                    "sw      %[temp1],   0(%[q_temp1])           \n\t"
                    "sw      %[temp2],   4(%[q_temp1])           \n\t"
                    "sw      %[temp3],   8(%[q_temp1])           \n\t"
                    "sw      %[temp4],   12(%[q_temp1])          \n\t"
                    "addiu   %[pok],     %[pok],           16    \n\t"
                    "addiu   %[g_temp1], %[g_temp1],       16    \n\t"
                    "addiu   %[pok1],    %[pok1],          16    \n\t"
                    "addiu   %[q_temp1], %[q_temp1],       16    \n\t"

                    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
                      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
                      [pok]"+r"(pok), [g_temp1]"+r"(g_temp1),
                      [pok1]"+r"(pok1), [q_temp1]"+r"(q_temp1)
                    :
                    : "memory"
                );
            }

            /* remaining 0..3 words, one per iteration */
            for (j=0; j<(size&3); j++) {
                __asm__ volatile (
                    "lw      %[temp1],   0(%[pok])              \n\t"
                    "lw      %[temp2],   0(%[pok1])             \n\t"
                    "sw      %[temp1],   0(%[g_temp1])          \n\t"
                    "sw      %[temp2],   0(%[q_temp1])          \n\t"
                    "addiu   %[pok],     %[pok],          4     \n\t"
                    "addiu   %[g_temp1], %[g_temp1],      4     \n\t"
                    "addiu   %[pok1],    %[pok1],         4     \n\t"
                    "addiu   %[q_temp1], %[q_temp1],      4     \n\t"

                    /* only temp1/temp2 are written by this template */
                    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
                      [pok]"+r"(pok), [g_temp1]"+r"(g_temp1),
                      [pok1]"+r"(pok1), [q_temp1]"+r"(q_temp1)
                    :
                    : "memory"
                );
            }
        }
    }

    for (e = 0; e < ch_data->bs_num_env; e++) {
        for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
            LOCAL_ALIGNED_16(float, g_filt_tab, [48]);
            LOCAL_ALIGNED_16(float, q_filt_tab, [48]);
            float *g_filt, *q_filt;

            if (h_SL && e != e_a[0] && e != e_a[1]) {
                /* smooth over the h_SL + 1 most recent history rows */
                g_filt = g_filt_tab;
                q_filt = q_filt_tab;

                for (m = 0; m < m_max; m++) {
                    const int idx1 = i + h_SL;
                    g_filt[m] = 0.0f;
                    q_filt[m] = 0.0f;

                    for (j = 0; j <= h_SL; j++) {
                        g_filt[m] += g_temp[idx1 - j][m] * h_smooth[j];
                        q_filt[m] += q_temp[idx1 - j][m] * h_smooth[j];
                    }
                }
            } else {
                g_filt = g_temp[i + h_SL];
                q_filt = q_temp[i];
            }

            sbr->dsp.hf_g_filt(Y1[i] + kx, X_high + kx, g_filt, m_max,
                               i + ENVELOPE_ADJUSTMENT_OFFSET);

            if (e != e_a[0] && e != e_a[1]) {
                sbr->dsp.hf_apply_noise[indexsine](Y1[i] + kx, sbr->s_m[e],
                                                   q_filt, indexnoise,
                                                   kx, m_max);
            } else {
                /* sinusoid-only path: add s_m[e][m] * (A or B) to component idx */
                int idx = indexsine&1;
                int A = (1-((indexsine+(kx & 1))&2));
                int B = (A^(-idx)) + idx;
                float *out = &Y1[i][kx][idx];
                float *in  = sbr->s_m[e];
                float temp0, temp1, temp2, temp3, temp4, temp5;
                float A_f = (float)A;
                float B_f = (float)B;

                /*
                 * Two bands per iteration:
                 *   out[0] += in[0] * A;  out[2] += in[1] * B;
                 * The asm itself advances `in` by 2 floats and `out` by
                 * 4 floats, so both pointers always address the current band.
                 */
                for (m = 0; m+1 < m_max; m+=2) {

                    temp2 = out[0];
                    temp3 = out[2];

                    __asm__ volatile(
                        "lwc1    %[temp0],  0(%[in])                     \n\t"
                        "lwc1    %[temp1],  4(%[in])                     \n\t"
                        "madd.s  %[temp4],  %[temp2],  %[temp0], %[A_f]  \n\t"
                        "madd.s  %[temp5],  %[temp3],  %[temp1], %[B_f]  \n\t"
                        "swc1    %[temp4],  0(%[out])                    \n\t"
                        "swc1    %[temp5],  8(%[out])                    \n\t"
                        "addiu   %[in],     %[in],     8                 \n\t"
                        "addiu   %[out],    %[out],    16                \n\t"

                        : [temp0]"=&f" (temp0), [temp1]"=&f"(temp1),
                          [temp4]"=&f" (temp4), [temp5]"=&f"(temp5),
                          [in]"+r"(in), [out]"+r"(out)
                        : [A_f]"f"(A_f), [B_f]"f"(B_f), [temp2]"f"(temp2),
                          [temp3]"f"(temp3)
                        : "memory"
                    );
                }
                /*
                 * Odd m_max: handle the last band. `in` and `out` were
                 * already advanced by the asm above, so they must be used
                 * without an additional m-based index; indexing with m again
                 * would address twice the intended offset.
                 */
                if(m_max&1)
                    out[0] += in[0] * A;
            }
            indexnoise = (indexnoise + m_max) & 0x1ff;
            indexsine = (indexsine + 1) & 3;
        }
    }
    ch_data->f_indexnoise = indexnoise;
    ch_data->f_indexsine  = indexsine;
}

/**
 * MIPS FPU inline-assembly implementation of the SBR inverse-filter
 * coefficient computation (C counterpart: see the reference named in the
 * file header).
 *
 * For each k in [0, k0) the function calls dsp->autocorrelate(X_low[k], phi)
 * and derives alpha1[k] and alpha0[k] from phi. With p[n] denoting the n-th
 * float of phi (p = &phi[0][0][0]):
 *
 *   dk        = p[10]*p[4] - (p[6]^2 + p[7]^2) / 1.000001
 *   alpha1[k] = 0                                        if dk == 0, else
 *     [0] = (p[0]*p[6] - p[1]*p[7] - p[2]*p[4]) / dk
 *     [1] = (p[0]*p[7] + p[1]*p[6] - p[3]*p[4]) / dk
 *   alpha0[k] = 0                                        if p[4] == 0, else
 *     [0] = -(p[0] + alpha1[k][0]*p[6] + alpha1[k][1]*p[7]) / p[4]
 *     [1] = -(p[1] + alpha1[k][1]*p[6] - alpha1[k][0]*p[7]) / p[4]
 *
 * If the squared magnitude of either coefficient is >= 16, both are zeroed.
 *
 * The formulas rely on the MIPS definitions madd.s fd,fr,fs,ft = fr + fs*ft
 * and nmsub.s fd,fr,fs,ft = fr - fs*ft.
 * NOTE(review): phi is presumably float[3][2][2] (declared through
 * LOCAL_ALIGNED_16), which would make p[10] = phi[2][1][0], p[4] =
 * phi[1][0][0], p[6] = phi[1][1][0], p[7] = phi[1][1][1] -- confirm against
 * the macro definition.
 *
 * @param dsp     provides the autocorrelate callback
 * @param alpha0  output, entries [0, k0) are written
 * @param alpha1  output, entries [0, k0) are written
 * @param X_low   input; row k is passed to autocorrelate
 * @param k0      number of rows to process
 */
static void sbr_hf_inverse_filter_mips(SBRDSPContext *dsp,
                                  float (*alpha0)[2], float (*alpha1)[2],
                                  const float X_low[32][40][2], int k0)
{
    int k;
    float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, c;
    float *phi1, *alpha_1, *alpha_0, res1, res2, temp_real, temp_im;

    /* divisor applied to the squared term of dk */
    c = 1.000001f;

    for (k = 0; k < k0; k++) {
        LOCAL_ALIGNED_16(float, phi, [3], [2][2]);
        float dk;
        phi1 = &phi[0][0][0];
        alpha_1 = &alpha1[k][0];
        alpha_0 = &alpha0[k][0];
        dsp->autocorrelate(X_low[k], phi);

        /*
         * Loads (byte offset -> float index of phi1):
         *   temp0 = p[10], temp1 = p[4], temp2 = p[6], temp3 = p[7],
         *   temp4 = p[0],  temp5 = p[1], temp6 = p[2]
         * Computes dk = temp0*temp1 - (temp2^2 + temp3^2)/c and finally
         * reloads temp0 = p[3]; temp0..temp6 stay live for the blocks below.
         */
        __asm__ volatile (
            "lwc1    %[temp0],  40(%[phi1])                       \n\t"
            "lwc1    %[temp1],  16(%[phi1])                       \n\t"
            "lwc1    %[temp2],  24(%[phi1])                       \n\t"
            "lwc1    %[temp3],  28(%[phi1])                       \n\t"
            "mul.s   %[dk],     %[temp0],    %[temp1]             \n\t"
            "lwc1    %[temp4],  0(%[phi1])                        \n\t"
            "mul.s   %[res2],   %[temp2],    %[temp2]             \n\t"
            "lwc1    %[temp5],  4(%[phi1])                        \n\t"
            "madd.s  %[res2],   %[res2],     %[temp3],  %[temp3]  \n\t"
            "lwc1    %[temp6],  8(%[phi1])                        \n\t"
            "div.s   %[res2],   %[res2],     %[c]                 \n\t"
            "lwc1    %[temp0],  12(%[phi1])                       \n\t"
            "sub.s   %[dk],     %[dk],       %[res2]              \n\t"

            : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
              [temp6]"=&f"(temp6), [res2]"=&f"(res2), [dk]"=&f"(dk)
            : [phi1]"r"(phi1), [c]"f"(c)
            : "memory"
        );

        if (!dk) {
            alpha_1[0] = 0;
            alpha_1[1] = 0;
        } else {
            /*
             * alpha_1[0] = (temp4*temp2 - temp5*temp3 - temp6*temp1) / dk
             * alpha_1[1] = (temp4*temp3 + temp5*temp2 - temp0*temp1) / dk
             */
            __asm__ volatile (
                "mul.s   %[temp_real], %[temp4],     %[temp2]            \n\t"
                "nmsub.s %[temp_real], %[temp_real], %[temp5], %[temp3]  \n\t"
                "nmsub.s %[temp_real], %[temp_real], %[temp6], %[temp1]  \n\t"
                "mul.s   %[temp_im],   %[temp4],     %[temp3]            \n\t"
                "madd.s  %[temp_im],   %[temp_im],   %[temp5], %[temp2]  \n\t"
                "nmsub.s %[temp_im],   %[temp_im],   %[temp0], %[temp1]  \n\t"
                "div.s   %[temp_real], %[temp_real], %[dk]               \n\t"
                "div.s   %[temp_im],   %[temp_im],   %[dk]               \n\t"
                "swc1    %[temp_real], 0(%[alpha_1])                     \n\t"
                "swc1    %[temp_im],   4(%[alpha_1])                     \n\t"

                : [temp_real]"=&f" (temp_real), [temp_im]"=&f"(temp_im)
                : [phi1]"r"(phi1), [temp0]"f"(temp0), [temp1]"f"(temp1),
                  [temp2]"f"(temp2), [temp3]"f"(temp3), [temp4]"f"(temp4),
                  [temp5]"f"(temp5), [temp6]"f"(temp6),
                  [alpha_1]"r"(alpha_1), [dk]"f"(dk)
                : "memory"
            );
        }

        /* phi1[4] is the same value that was loaded into temp1 above */
        if (!phi1[4]) {
            alpha_0[0] = 0;
            alpha_0[1] = 0;
        } else {
            /*
             * Reloads alpha_1 into temp6/temp7 (temp6's earlier value is no
             * longer needed), then:
             * alpha_0[0] = -(temp6*temp2 + temp4 + temp7*temp3) / temp1
             * alpha_0[1] = -(temp7*temp2 + temp5 - temp6*temp3) / temp1
             * res1/res2 are listed as outputs but not written here; they are
             * recomputed by the next asm block.
             */
            __asm__ volatile (
                "lwc1    %[temp6],     0(%[alpha_1])                     \n\t"
                "lwc1    %[temp7],     4(%[alpha_1])                     \n\t"
                "mul.s   %[temp_real], %[temp6],     %[temp2]            \n\t"
                "add.s   %[temp_real], %[temp_real], %[temp4]            \n\t"
                "madd.s  %[temp_real], %[temp_real], %[temp7], %[temp3]  \n\t"
                "mul.s   %[temp_im],   %[temp7],     %[temp2]            \n\t"
                "add.s   %[temp_im],   %[temp_im],   %[temp5]            \n\t"
                "nmsub.s %[temp_im],   %[temp_im],   %[temp6], %[temp3]  \n\t"
                "div.s   %[temp_real], %[temp_real], %[temp1]            \n\t"
                "div.s   %[temp_im],   %[temp_im],   %[temp1]            \n\t"
                "neg.s   %[temp_real], %[temp_real]                      \n\t"
                "neg.s   %[temp_im],   %[temp_im]                        \n\t"
                "swc1    %[temp_real], 0(%[alpha_0])                     \n\t"
                "swc1    %[temp_im],   4(%[alpha_0])                     \n\t"

                : [temp_real]"=&f"(temp_real), [temp_im]"=&f"(temp_im),
                  [temp6]"=&f"(temp6), [temp7]"=&f"(temp7),
                  [res1]"=&f"(res1), [res2]"=&f"(res2)
                : [alpha_1]"r"(alpha_1), [alpha_0]"r"(alpha_0),
                  [temp0]"f"(temp0), [temp1]"f"(temp1), [temp2]"f"(temp2),
                  [temp3]"f"(temp3), [temp4]"f"(temp4), [temp5]"f"(temp5)
                : "memory"
            );
        }

        /*
         * res1 = alpha_1[0]^2 + alpha_1[1]^2
         * res2 = alpha_0[0]^2 + alpha_0[1]^2
         * (both coefficients are re-read from memory)
         */
        __asm__ volatile (
            "lwc1    %[temp1],      0(%[alpha_1])                           \n\t"
            "lwc1    %[temp2],      4(%[alpha_1])                           \n\t"
            "lwc1    %[temp_real],  0(%[alpha_0])                           \n\t"
            "lwc1    %[temp_im],    4(%[alpha_0])                           \n\t"
            "mul.s   %[res1],       %[temp1],      %[temp1]                 \n\t"
            "madd.s  %[res1],       %[res1],       %[temp2],    %[temp2]    \n\t"
            "mul.s   %[res2],       %[temp_real],  %[temp_real]             \n\t"
            "madd.s  %[res2],       %[res2],       %[temp_im],  %[temp_im]  \n\t"

            : [temp_real]"=&f"(temp_real), [temp_im]"=&f"(temp_im),
              [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
              [res1]"=&f"(res1), [res2]"=&f"(res2)
            : [alpha_1]"r"(alpha_1), [alpha_0]"r"(alpha_0)
            : "memory"
        );

        /* discard both coefficients when either squared magnitude reaches 16 */
        if (res1 >= 16.0f || res2 >= 16.0f) {
            alpha_1[0] = 0;
            alpha_1[1] = 0;
            alpha_0[0] = 0;
            alpha_0[1] = 0;
        }
    }
}
#endif /* HAVE_MIPSFPU */
#endif /* HAVE_INLINE_ASM */

/**
 * Install the MIPS-specific SBR routines of this file into an AACSBRContext.
 *
 * The integer-only routines are installed whenever inline assembly is
 * available; the routines that use FPU instructions are installed only when
 * HAVE_MIPSFPU is set as well. Without HAVE_INLINE_ASM the context is left
 * untouched.
 *
 * @param c context whose function pointers are overridden
 */
void ff_aacsbr_func_ptr_init_mips(AACSBRContext *c)
{
#if HAVE_INLINE_ASM
    /* integer load/store based routines */
    c->sbr_x_gen  = sbr_x_gen_mips;
    c->sbr_lf_gen = sbr_lf_gen_mips;
#if HAVE_MIPSFPU
    /* routines relying on MIPS floating-point instructions */
    c->sbr_hf_assemble       = sbr_hf_assemble_mips;
    c->sbr_hf_inverse_filter = sbr_hf_inverse_filter_mips;
#endif /* HAVE_MIPSFPU */
#endif /* HAVE_INLINE_ASM */
}
Commit	Line	Data
	1	/*
	2	* Copyright (c) 2012
	3	* MIPS Technologies, Inc., California.
	4	*
	5	* Redistribution and use in source and binary forms, with or without
	6	* modification, are permitted provided that the following conditions
	7	* are met:
	8	* 1. Redistributions of source code must retain the above copyright
	9	* notice, this list of conditions and the following disclaimer.
	10	* 2. Redistributions in binary form must reproduce the above copyright
	11	* notice, this list of conditions and the following disclaimer in the
	12	* documentation and/or other materials provided with the distribution.
	13	* 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
	14	* contributors may be used to endorse or promote products derived from
	15	* this software without specific prior written permission.
	16	*
	17	* THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
	18	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	19	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	20	* ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
	21	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	22	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	23	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	24	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	25	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	26	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	27	* SUCH DAMAGE.
	28	*
	29	* Authors: Djordje Pesut (djordje@mips.com)
	30	* Mirjana Vulin (mvulin@mips.com)
	31	*
	32	* This file is part of FFmpeg.
	33	*
	34	* FFmpeg is free software; you can redistribute it and/or
	35	* modify it under the terms of the GNU Lesser General Public
	36	* License as published by the Free Software Foundation; either
	37	* version 2.1 of the License, or (at your option) any later version.
	38	*
	39	* FFmpeg is distributed in the hope that it will be useful,
	40	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	41	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	42	* Lesser General Public License for more details.
	43	*
	44	* You should have received a copy of the GNU Lesser General Public
	45	* License along with FFmpeg; if not, write to the Free Software
	46	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	47	*/
	48
	49	/**
	50	* @file
	51	* Reference: libavcodec/aacsbr.c
	52	*/
	53
	54	#include "libavcodec/aac.h"
	55	#include "libavcodec/aacsbr.h"
	56
	57	#define ENVELOPE_ADJUSTMENT_OFFSET 2
	58
	59	#if HAVE_INLINE_ASM
	60	static int sbr_lf_gen_mips(AACContext *ac, SpectralBandReplication *sbr,
	61	float X_low[32][40][2], const float W[2][32][32][2],
	62	int buf_idx)
	63	{
	64	int i, k;
	65	int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
	66	float *p_x_low = &X_low[0][8][0];
	67	float *p_w = (float*)&W[buf_idx][0][0][0];
	68	float *p_x1_low = &X_low[0][0][0];
	69	float *p_w1 = (float*)&W[1-buf_idx][24][0][0];
	70
	71	float *loop_end=p_x1_low + 2560;
	72
	73	/* loop unrolled 8 times */
	74	__asm__ volatile (
	75	"1: \n\t"
	76	"sw $0, 0(%[p_x1_low]) \n\t"
	77	"sw $0, 4(%[p_x1_low]) \n\t"
	78	"sw $0, 8(%[p_x1_low]) \n\t"
	79	"sw $0, 12(%[p_x1_low]) \n\t"
	80	"sw $0, 16(%[p_x1_low]) \n\t"
	81	"sw $0, 20(%[p_x1_low]) \n\t"
	82	"sw $0, 24(%[p_x1_low]) \n\t"
	83	"sw $0, 28(%[p_x1_low]) \n\t"
	84	"addiu %[p_x1_low], %[p_x1_low], 32 \n\t"
	85	"bne %[p_x1_low], %[loop_end], 1b \n\t"
	86	"addiu %[p_x1_low], %[p_x1_low], -10240 \n\t"
	87
	88	: [p_x1_low]"+r"(p_x1_low)
	89	: [loop_end]"r"(loop_end)
	90	: "memory"
	91	);
	92
	93	for (k = 0; k < sbr->kx[1]; k++) {
	94	for (i = 0; i < 32; i+=4) {
	95	/* loop unrolled 4 times */
	96	__asm__ volatile (
	97	"lw %[temp0], 0(%[p_w]) \n\t"
	98	"lw %[temp1], 4(%[p_w]) \n\t"
	99	"lw %[temp2], 256(%[p_w]) \n\t"
	100	"lw %[temp3], 260(%[p_w]) \n\t"
	101	"lw %[temp4], 512(%[p_w]) \n\t"
	102	"lw %[temp5], 516(%[p_w]) \n\t"
	103	"lw %[temp6], 768(%[p_w]) \n\t"
	104	"lw %[temp7], 772(%[p_w]) \n\t"
	105	"sw %[temp0], 0(%[p_x_low]) \n\t"
	106	"sw %[temp1], 4(%[p_x_low]) \n\t"
	107	"sw %[temp2], 8(%[p_x_low]) \n\t"
	108	"sw %[temp3], 12(%[p_x_low]) \n\t"
	109	"sw %[temp4], 16(%[p_x_low]) \n\t"
	110	"sw %[temp5], 20(%[p_x_low]) \n\t"
	111	"sw %[temp6], 24(%[p_x_low]) \n\t"
	112	"sw %[temp7], 28(%[p_x_low]) \n\t"
	113	"addiu %[p_x_low], %[p_x_low], 32 \n\t"
	114	"addiu %[p_w], %[p_w], 1024 \n\t"
	115
	116	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
	117	[temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
	118	[temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
	119	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
	120	[p_w]"+r"(p_w), [p_x_low]"+r"(p_x_low)
	121	:
	122	: "memory"
	123	);
	124	}
	125	p_x_low += 16;
	126	p_w -= 2046;
	127	}
	128
	129	for (k = 0; k < sbr->kx[0]; k++) {
	130	for (i = 0; i < 2; i++) {
	131
	132	/* loop unrolled 4 times */
	133	__asm__ volatile (
	134	"lw %[temp0], 0(%[p_w1]) \n\t"
	135	"lw %[temp1], 4(%[p_w1]) \n\t"
	136	"lw %[temp2], 256(%[p_w1]) \n\t"
	137	"lw %[temp3], 260(%[p_w1]) \n\t"
	138	"lw %[temp4], 512(%[p_w1]) \n\t"
	139	"lw %[temp5], 516(%[p_w1]) \n\t"
	140	"lw %[temp6], 768(%[p_w1]) \n\t"
	141	"lw %[temp7], 772(%[p_w1]) \n\t"
	142	"sw %[temp0], 0(%[p_x1_low]) \n\t"
	143	"sw %[temp1], 4(%[p_x1_low]) \n\t"
	144	"sw %[temp2], 8(%[p_x1_low]) \n\t"
	145	"sw %[temp3], 12(%[p_x1_low]) \n\t"
	146	"sw %[temp4], 16(%[p_x1_low]) \n\t"
	147	"sw %[temp5], 20(%[p_x1_low]) \n\t"
	148	"sw %[temp6], 24(%[p_x1_low]) \n\t"
	149	"sw %[temp7], 28(%[p_x1_low]) \n\t"
	150	"addiu %[p_x1_low], %[p_x1_low], 32 \n\t"
	151	"addiu %[p_w1], %[p_w1], 1024 \n\t"
	152
	153	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
	154	[temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
	155	[temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
	156	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
	157	[p_w1]"+r"(p_w1), [p_x1_low]"+r"(p_x1_low)
	158	:
	159	: "memory"
	160	);
	161	}
	162	p_x1_low += 64;
	163	p_w1 -= 510;
	164	}
	165	return 0;
	166	}
	167
	168	static int sbr_x_gen_mips(SpectralBandReplication *sbr, float X[2][38][64],
	169	const float Y0[38][64][2], const float Y1[38][64][2],
	170	const float X_low[32][40][2], int ch)
	171	{
	172	int k, i;
	173	const int i_f = 32;
	174	int temp0, temp1, temp2, temp3;
	175	const float *X_low1, *Y01, *Y11;
	176	float *x1=&X[0][0][0];
	177	float *j=x1+4864;
	178	const int i_Temp = FFMAX(2*sbr->data[ch].t_env_num_env_old - i_f, 0);
	179
	180	/* loop unrolled 8 times */
	181	__asm__ volatile (
	182	"1: \n\t"
	183	"sw $0, 0(%[x1]) \n\t"
	184	"sw $0, 4(%[x1]) \n\t"
	185	"sw $0, 8(%[x1]) \n\t"
	186	"sw $0, 12(%[x1]) \n\t"
	187	"sw $0, 16(%[x1]) \n\t"
	188	"sw $0, 20(%[x1]) \n\t"
	189	"sw $0, 24(%[x1]) \n\t"
	190	"sw $0, 28(%[x1]) \n\t"
	191	"addiu %[x1], %[x1], 32 \n\t"
	192	"bne %[x1], %[j], 1b \n\t"
	193	"addiu %[x1], %[x1], -19456 \n\t"
	194
	195	: [x1]"+r"(x1)
	196	: [j]"r"(j)
	197	: "memory"
	198	);
	199
	200	if (i_Temp != 0) {
	201
	202	X_low1=&X_low[0][2][0];
	203
	204	for (k = 0; k < sbr->kx[0]; k++) {
	205
	206	__asm__ volatile (
	207	"move %[i], $zero \n\t"
	208	"2: \n\t"
	209	"lw %[temp0], 0(%[X_low1]) \n\t"
	210	"lw %[temp1], 4(%[X_low1]) \n\t"
	211	"sw %[temp0], 0(%[x1]) \n\t"
	212	"sw %[temp1], 9728(%[x1]) \n\t"
	213	"addiu %[x1], %[x1], 256 \n\t"
	214	"addiu %[X_low1], %[X_low1], 8 \n\t"
	215	"addiu %[i], %[i], 1 \n\t"
	216	"bne %[i], %[i_Temp], 2b \n\t"
	217
	218	: [x1]"+r"(x1), [X_low1]"+r"(X_low1), [i]"=&r"(i),
	219	[temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
	220	: [i_Temp]"r"(i_Temp)
	221	: "memory"
	222	);
	223	x1-=(i_Temp<<6)-1;
	224	X_low1-=(i_Temp<<1)-80;
	225	}
	226
	227	x1=&X[0][0][k];
	228	Y01=(float*)&Y0[32][k][0];
	229
	230	for (; k < sbr->kx[0] + sbr->m[0]; k++) {
	231	__asm__ volatile (
	232	"move %[i], $zero \n\t"
	233	"3: \n\t"
	234	"lw %[temp0], 0(%[Y01]) \n\t"
	235	"lw %[temp1], 4(%[Y01]) \n\t"
	236	"sw %[temp0], 0(%[x1]) \n\t"
	237	"sw %[temp1], 9728(%[x1]) \n\t"
	238	"addiu %[x1], %[x1], 256 \n\t"
	239	"addiu %[Y01], %[Y01], 512 \n\t"
	240	"addiu %[i], %[i], 1 \n\t"
	241	"bne %[i], %[i_Temp], 3b \n\t"
	242
	243	: [x1]"+r"(x1), [Y01]"+r"(Y01), [i]"=&r"(i),
	244	[temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
	245	: [i_Temp]"r"(i_Temp)
	246	: "memory"
	247	);
	248	x1 -=(i_Temp<<6)-1;
	249	Y01 -=(i_Temp<<7)-2;
	250	}
	251	}
	252
	253	x1=&X[0][i_Temp][0];
	254	X_low1=&X_low[0][i_Temp+2][0];
	255	temp3=38;
	256
	257	for (k = 0; k < sbr->kx[1]; k++) {
	258
	259	__asm__ volatile (
	260	"move %[i], %[i_Temp] \n\t"
	261	"4: \n\t"
	262	"lw %[temp0], 0(%[X_low1]) \n\t"
	263	"lw %[temp1], 4(%[X_low1]) \n\t"
	264	"sw %[temp0], 0(%[x1]) \n\t"
	265	"sw %[temp1], 9728(%[x1]) \n\t"
	266	"addiu %[x1], %[x1], 256 \n\t"
	267	"addiu %[X_low1], %[X_low1], 8 \n\t"
	268	"addiu %[i], %[i], 1 \n\t"
	269	"bne %[i], %[temp3], 4b \n\t"
	270
	271	: [x1]"+r"(x1), [X_low1]"+r"(X_low1), [i]"=&r"(i),
	272	[temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
	273	[temp2]"=&r"(temp2)
	274	: [i_Temp]"r"(i_Temp), [temp3]"r"(temp3)
	275	: "memory"
	276	);
	277	x1 -= ((38-i_Temp)<<6)-1;
	278	X_low1 -= ((38-i_Temp)<<1)- 80;
	279	}
	280
	281	x1=&X[0][i_Temp][k];
	282	Y11=&Y1[i_Temp][k][0];
	283	temp2=32;
	284
	285	for (; k < sbr->kx[1] + sbr->m[1]; k++) {
	286
	287	__asm__ volatile (
	288	"move %[i], %[i_Temp] \n\t"
	289	"5: \n\t"
	290	"lw %[temp0], 0(%[Y11]) \n\t"
	291	"lw %[temp1], 4(%[Y11]) \n\t"
	292	"sw %[temp0], 0(%[x1]) \n\t"
	293	"sw %[temp1], 9728(%[x1]) \n\t"
	294	"addiu %[x1], %[x1], 256 \n\t"
	295	"addiu %[Y11], %[Y11], 512 \n\t"
	296	"addiu %[i], %[i], 1 \n\t"
	297	"bne %[i], %[temp2], 5b \n\t"
	298
	299	: [x1]"+r"(x1), [Y11]"+r"(Y11), [i]"=&r"(i),
	300	[temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
	301	: [i_Temp]"r"(i_Temp), [temp3]"r"(temp3),
	302	[temp2]"r"(temp2)
	303	: "memory"
	304	);
	305
	306	x1 -= ((32-i_Temp)<<6)-1;
	307	Y11 -= ((32-i_Temp)<<7)-2;
	308	}
	309	return 0;
	310	}
	311
	312	#if HAVE_MIPSFPU
	313	static void sbr_hf_assemble_mips(float Y1[38][64][2],
	314	const float X_high[64][40][2],
	315	SpectralBandReplication *sbr, SBRData *ch_data,
	316	const int e_a[2])
	317	{
	318	int e, i, j, m;
	319	const int h_SL = 4 * !sbr->bs_smoothing_mode;
	320	const int kx = sbr->kx[1];
	321	const int m_max = sbr->m[1];
	322	static const float h_smooth[5] = {
	323	0.33333333333333,
	324	0.30150283239582,
	325	0.21816949906249,
	326	0.11516383427084,
	327	0.03183050093751,
	328	};
	329
	330	float (*g_temp)[48] = ch_data->g_temp, (*q_temp)[48] = ch_data->q_temp;
	331	int indexnoise = ch_data->f_indexnoise;
	332	int indexsine = ch_data->f_indexsine;
	333	float *g_temp1, *q_temp1, *pok, *pok1;
	334	float temp1, temp2, temp3, temp4;
	335	int size = m_max;
	336
	337	if (sbr->reset) {
	338	for (i = 0; i < h_SL; i++) {
	339	memcpy(g_temp[i + 2*ch_data->t_env[0]], sbr->gain[0], m_max * sizeof(sbr->gain[0][0]));
	340	memcpy(q_temp[i + 2*ch_data->t_env[0]], sbr->q_m[0], m_max * sizeof(sbr->q_m[0][0]));
	341	}
	342	} else if (h_SL) {
	343	memcpy(g_temp[2*ch_data->t_env[0]], g_temp[2*ch_data->t_env_num_env_old], 4*sizeof(g_temp[0]));
	344	memcpy(q_temp[2*ch_data->t_env[0]], q_temp[2*ch_data->t_env_num_env_old], 4*sizeof(q_temp[0]));
	345	}
	346
	347	for (e = 0; e < ch_data->bs_num_env; e++) {
	348	for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
	349	g_temp1 = g_temp[h_SL + i];
	350	pok = sbr->gain[e];
	351	q_temp1 = q_temp[h_SL + i];
	352	pok1 = sbr->q_m[e];
	353
	354	/* loop unrolled 4 times */
	355	for (j=0; j<(size>>2); j++) {
	356	__asm__ volatile (
	357	"lw %[temp1], 0(%[pok]) \n\t"
	358	"lw %[temp2], 4(%[pok]) \n\t"
	359	"lw %[temp3], 8(%[pok]) \n\t"
	360	"lw %[temp4], 12(%[pok]) \n\t"
	361	"sw %[temp1], 0(%[g_temp1]) \n\t"
	362	"sw %[temp2], 4(%[g_temp1]) \n\t"
	363	"sw %[temp3], 8(%[g_temp1]) \n\t"
	364	"sw %[temp4], 12(%[g_temp1]) \n\t"
	365	"lw %[temp1], 0(%[pok1]) \n\t"
	366	"lw %[temp2], 4(%[pok1]) \n\t"
	367	"lw %[temp3], 8(%[pok1]) \n\t"
	368	"lw %[temp4], 12(%[pok1]) \n\t"
	369	"sw %[temp1], 0(%[q_temp1]) \n\t"
	370	"sw %[temp2], 4(%[q_temp1]) \n\t"
	371	"sw %[temp3], 8(%[q_temp1]) \n\t"
	372	"sw %[temp4], 12(%[q_temp1]) \n\t"
	373	"addiu %[pok], %[pok], 16 \n\t"
	374	"addiu %[g_temp1], %[g_temp1], 16 \n\t"
	375	"addiu %[pok1], %[pok1], 16 \n\t"
	376	"addiu %[q_temp1], %[q_temp1], 16 \n\t"
	377
	378	: [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
	379	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
	380	[pok]"+r"(pok), [g_temp1]"+r"(g_temp1),
	381	[pok1]"+r"(pok1), [q_temp1]"+r"(q_temp1)
	382	:
	383	: "memory"
	384	);
	385	}
	386
	387	for (j=0; j<(size&3); j++) {
	388	__asm__ volatile (
	389	"lw %[temp1], 0(%[pok]) \n\t"
	390	"lw %[temp2], 0(%[pok1]) \n\t"
	391	"sw %[temp1], 0(%[g_temp1]) \n\t"
	392	"sw %[temp2], 0(%[q_temp1]) \n\t"
	393	"addiu %[pok], %[pok], 4 \n\t"
	394	"addiu %[g_temp1], %[g_temp1], 4 \n\t"
	395	"addiu %[pok1], %[pok1], 4 \n\t"
	396	"addiu %[q_temp1], %[q_temp1], 4 \n\t"
	397
	398	: [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
	399	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
	400	[pok]"+r"(pok), [g_temp1]"+r"(g_temp1),
	401	[pok1]"+r"(pok1), [q_temp1]"+r"(q_temp1)
	402	:
	403	: "memory"
	404	);
	405	}
	406	}
	407	}
	408
	409	for (e = 0; e < ch_data->bs_num_env; e++) {
	410	for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
	411	LOCAL_ALIGNED_16(float, g_filt_tab, [48]);
	412	LOCAL_ALIGNED_16(float, q_filt_tab, [48]);
	413	float *g_filt, *q_filt;
	414
	415	if (h_SL && e != e_a[0] && e != e_a[1]) {
	416	g_filt = g_filt_tab;
	417	q_filt = q_filt_tab;
	418
	419	for (m = 0; m < m_max; m++) {
	420	const int idx1 = i + h_SL;
	421	g_filt[m] = 0.0f;
	422	q_filt[m] = 0.0f;
	423
	424	for (j = 0; j <= h_SL; j++) {
	425	g_filt[m] += g_temp[idx1 - j][m] * h_smooth[j];
	426	q_filt[m] += q_temp[idx1 - j][m] * h_smooth[j];
	427	}
	428	}
	429	} else {
	430	g_filt = g_temp[i + h_SL];
	431	q_filt = q_temp[i];
	432	}
	433
	434	sbr->dsp.hf_g_filt(Y1[i] + kx, X_high + kx, g_filt, m_max,
	435	i + ENVELOPE_ADJUSTMENT_OFFSET);
	436
	437	if (e != e_a[0] && e != e_a[1]) {
	438	sbr->dsp.hf_apply_noise[indexsine](Y1[i] + kx, sbr->s_m[e],
	439	q_filt, indexnoise,
	440	kx, m_max);
	441	} else {
	442	int idx = indexsine&1;
	443	int A = (1-((indexsine+(kx & 1))&2));
	444	int B = (A^(-idx)) + idx;
	445	float *out = &Y1[i][kx][idx];
	446	float *in = sbr->s_m[e];
	447	float temp0, temp1, temp2, temp3, temp4, temp5;
	448	float A_f = (float)A;
	449	float B_f = (float)B;
	450
	451	for (m = 0; m+1 < m_max; m+=2) {
	452
	453	temp2 = out[0];
	454	temp3 = out[2];
	455
	456	__asm__ volatile(
	457	"lwc1 %[temp0], 0(%[in]) \n\t"
	458	"lwc1 %[temp1], 4(%[in]) \n\t"
	459	"madd.s %[temp4], %[temp2], %[temp0], %[A_f] \n\t"
	460	"madd.s %[temp5], %[temp3], %[temp1], %[B_f] \n\t"
	461	"swc1 %[temp4], 0(%[out]) \n\t"
	462	"swc1 %[temp5], 8(%[out]) \n\t"
	463	"addiu %[in], %[in], 8 \n\t"
	464	"addiu %[out], %[out], 16 \n\t"
	465
	466	: [temp0]"=&f" (temp0), [temp1]"=&f"(temp1),
	467	[temp4]"=&f" (temp4), [temp5]"=&f"(temp5),
	468	[in]"+r"(in), [out]"+r"(out)
	469	: [A_f]"f"(A_f), [B_f]"f"(B_f), [temp2]"f"(temp2),
	470	[temp3]"f"(temp3)
	471	: "memory"
	472	);
	473	}
	474	if(m_max&1)
	475	out[2*m ] += in[m ] * A;
	476	}
	477	indexnoise = (indexnoise + m_max) & 0x1ff;
	478	indexsine = (indexsine + 1) & 3;
	479	}
	480	}
	481	ch_data->f_indexnoise = indexnoise;
	482	ch_data->f_indexsine = indexsine;
	483	}
	484
	485	static void sbr_hf_inverse_filter_mips(SBRDSPContext *dsp,
	486	float (*alpha0)[2], float (*alpha1)[2],
	487	const float X_low[32][40][2], int k0)
	488	{
	489	int k;
	490	float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, c;
	491	float *phi1, *alpha_1, *alpha_0, res1, res2, temp_real, temp_im;
	492
	493	c = 1.000001f;
	494
	495	for (k = 0; k < k0; k++) {
	496	LOCAL_ALIGNED_16(float, phi, [3], [2][2]);
	497	float dk;
	498	phi1 = &phi[0][0][0];
	499	alpha_1 = &alpha1[k][0];
	500	alpha_0 = &alpha0[k][0];
	501	dsp->autocorrelate(X_low[k], phi);
	502
	503	__asm__ volatile (
	504	"lwc1 %[temp0], 40(%[phi1]) \n\t"
	505	"lwc1 %[temp1], 16(%[phi1]) \n\t"
	506	"lwc1 %[temp2], 24(%[phi1]) \n\t"
	507	"lwc1 %[temp3], 28(%[phi1]) \n\t"
	508	"mul.s %[dk], %[temp0], %[temp1] \n\t"
	509	"lwc1 %[temp4], 0(%[phi1]) \n\t"
	510	"mul.s %[res2], %[temp2], %[temp2] \n\t"
	511	"lwc1 %[temp5], 4(%[phi1]) \n\t"
	512	"madd.s %[res2], %[res2], %[temp3], %[temp3] \n\t"
	513	"lwc1 %[temp6], 8(%[phi1]) \n\t"
	514	"div.s %[res2], %[res2], %[c] \n\t"
	515	"lwc1 %[temp0], 12(%[phi1]) \n\t"
	516	"sub.s %[dk], %[dk], %[res2] \n\t"
	517
	518	: [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
	519	[temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
	520	[temp6]"=&f"(temp6), [res2]"=&f"(res2), [dk]"=&f"(dk)
	521	: [phi1]"r"(phi1), [c]"f"(c)
	522	: "memory"
	523	);
	524
	525	if (!dk) {
	526	alpha_1[0] = 0;
	527	alpha_1[1] = 0;
	528	} else {
	529	__asm__ volatile (
	530	"mul.s %[temp_real], %[temp4], %[temp2] \n\t"
	531	"nmsub.s %[temp_real], %[temp_real], %[temp5], %[temp3] \n\t"
	532	"nmsub.s %[temp_real], %[temp_real], %[temp6], %[temp1] \n\t"
	533	"mul.s %[temp_im], %[temp4], %[temp3] \n\t"
	534	"madd.s %[temp_im], %[temp_im], %[temp5], %[temp2] \n\t"
	535	"nmsub.s %[temp_im], %[temp_im], %[temp0], %[temp1] \n\t"
	536	"div.s %[temp_real], %[temp_real], %[dk] \n\t"
	537	"div.s %[temp_im], %[temp_im], %[dk] \n\t"
	538	"swc1 %[temp_real], 0(%[alpha_1]) \n\t"
	539	"swc1 %[temp_im], 4(%[alpha_1]) \n\t"
	540
	541	: [temp_real]"=&f" (temp_real), [temp_im]"=&f"(temp_im)
	542	: [phi1]"r"(phi1), [temp0]"f"(temp0), [temp1]"f"(temp1),
	543	[temp2]"f"(temp2), [temp3]"f"(temp3), [temp4]"f"(temp4),
	544	[temp5]"f"(temp5), [temp6]"f"(temp6),
	545	[alpha_1]"r"(alpha_1), [dk]"f"(dk)
	546	: "memory"
	547	);
	548	}
	549
	550	if (!phi1[4]) {
	551	alpha_0[0] = 0;
	552	alpha_0[1] = 0;
	553	} else {
	554	__asm__ volatile (
	555	"lwc1 %[temp6], 0(%[alpha_1]) \n\t"
	556	"lwc1 %[temp7], 4(%[alpha_1]) \n\t"
	557	"mul.s %[temp_real], %[temp6], %[temp2] \n\t"
	558	"add.s %[temp_real], %[temp_real], %[temp4] \n\t"
	559	"madd.s %[temp_real], %[temp_real], %[temp7], %[temp3] \n\t"
	560	"mul.s %[temp_im], %[temp7], %[temp2] \n\t"
	561	"add.s %[temp_im], %[temp_im], %[temp5] \n\t"
	562	"nmsub.s %[temp_im], %[temp_im], %[temp6], %[temp3] \n\t"
	563	"div.s %[temp_real], %[temp_real], %[temp1] \n\t"
	564	"div.s %[temp_im], %[temp_im], %[temp1] \n\t"
	565	"neg.s %[temp_real], %[temp_real] \n\t"
	566	"neg.s %[temp_im], %[temp_im] \n\t"
	567	"swc1 %[temp_real], 0(%[alpha_0]) \n\t"
	568	"swc1 %[temp_im], 4(%[alpha_0]) \n\t"
	569
	570	: [temp_real]"=&f"(temp_real), [temp_im]"=&f"(temp_im),
	571	[temp6]"=&f"(temp6), [temp7]"=&f"(temp7),
	572	[res1]"=&f"(res1), [res2]"=&f"(res2)
	573	: [alpha_1]"r"(alpha_1), [alpha_0]"r"(alpha_0),
	574	[temp0]"f"(temp0), [temp1]"f"(temp1), [temp2]"f"(temp2),
	575	[temp3]"f"(temp3), [temp4]"f"(temp4), [temp5]"f"(temp5)
	576	: "memory"
	577	);
	578	}
	579
	580	__asm__ volatile (
	581	"lwc1 %[temp1], 0(%[alpha_1]) \n\t"
	582	"lwc1 %[temp2], 4(%[alpha_1]) \n\t"
	583	"lwc1 %[temp_real], 0(%[alpha_0]) \n\t"
	584	"lwc1 %[temp_im], 4(%[alpha_0]) \n\t"
	585	"mul.s %[res1], %[temp1], %[temp1] \n\t"
	586	"madd.s %[res1], %[res1], %[temp2], %[temp2] \n\t"
	587	"mul.s %[res2], %[temp_real], %[temp_real] \n\t"
	588	"madd.s %[res2], %[res2], %[temp_im], %[temp_im] \n\t"
	589
	590	: [temp_real]"=&f"(temp_real), [temp_im]"=&f"(temp_im),
	591	[temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
	592	[res1]"=&f"(res1), [res2]"=&f"(res2)
	593	: [alpha_1]"r"(alpha_1), [alpha_0]"r"(alpha_0)
	594	: "memory"
	595	);
	596
	597	if (res1 >= 16.0f || res2 >= 16.0f) {
	598	alpha_1[0] = 0;
	599	alpha_1[1] = 0;
	600	alpha_0[0] = 0;
	601	alpha_0[1] = 0;
	602	}
	603	}
	604	}
	605	#endif /* HAVE_MIPSFPU */
	606	#endif /* HAVE_INLINE_ASM */
	607
	608	void ff_aacsbr_func_ptr_init_mips(AACSBRContext *c)
	609	{
	610	#if HAVE_INLINE_ASM
	611	c->sbr_lf_gen = sbr_lf_gen_mips;
	612	c->sbr_x_gen = sbr_x_gen_mips;
	613	#if HAVE_MIPSFPU
	614	c->sbr_hf_inverse_filter = sbr_hf_inverse_filter_mips;
	615	c->sbr_hf_assemble = sbr_hf_assemble_mips;
	616	#endif /* HAVE_MIPSFPU */
	617	#endif /* HAVE_INLINE_ASM */
	618	}