[deb_ffmpeg.git] / ffmpeg / libavcodec / mips / aacsbr_mips.c

/*
 * Copyright (c) 2012
 *      MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Authors:  Djordje Pesut   (djordje@mips.com)
 *           Mirjana Vulin   (mvulin@mips.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * Reference: libavcodec/aacsbr.c
 */

#include "libavcodec/aac.h"
#include "libavcodec/aacsbr.h"

#define ENVELOPE_ADJUSTMENT_OFFSET 2

#if HAVE_INLINE_ASM
/**
 * Build the low-band matrix X_low from the filterbank output W
 * (MIPS inline-assembly version; see libavcodec/aacsbr.c for the reference).
 *
 * The function
 *  - clears 2560 floats starting at X_low[0][0][0] (32 * 40 * 2, i.e. the
 *    whole array as declared in the parameter list),
 *  - for every k < sbr->kx[1] copies the 32 complex samples
 *    W[buf_idx][0..31][k] to X_low[k][8..39],
 *  - for every k < sbr->kx[0] copies the 8 complex samples
 *    W[1 - buf_idx][24..31][k] to X_low[k][0..7].
 *
 * @param ac       not referenced by this implementation
 * @param sbr      supplies the band counts kx[0] and kx[1]
 * @param X_low    destination array
 * @param W        source array
 * @param buf_idx  selects which half of W feeds X_low[k][8..39]; the other
 *                 half (1 - buf_idx) feeds X_low[k][0..7]
 * @return always 0
 */
static int sbr_lf_gen_mips(AACContext *ac, SpectralBandReplication *sbr,
                      float X_low[32][40][2], const float W[2][32][32][2],
                      int buf_idx)
{
    int i, k;
    /* Float bit patterns are moved through integer registers with lw/sw,
     * hence the int temporaries. */
    int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    /* Write cursor for the first copy loop: starts at time slot 8 of band 0. */
    float *p_x_low = &X_low[0][8][0];
    /* Read cursor for the first copy loop (const is cast away; the asm only
     * loads through it). */
    float *p_w = (float*)&W[buf_idx][0][0][0];
    /* Cursor used first for clearing X_low, then as the write cursor of the
     * second copy loop (time slots 0..7). */
    float *p_x1_low = &X_low[0][0][0];
    /* Read cursor for the second copy loop: time slot 24 of the other half. */
    float *p_w1 = (float*)&W[1-buf_idx][24][0][0];

    /* One past the last float of X_low: 32 * 40 * 2 = 2560 floats. */
    float *loop_end=p_x1_low + 2560;

    /* loop unrolled 8 times */
    /* Stores eight zero words (32 bytes) per iteration until p_x1_low reaches
     * loop_end. The final addiu subtracts 10240 bytes (= 2560 floats) so the
     * pointer is back at X_low[0][0][0] for the second copy loop below.
     * NOTE(review): this relies on the assembler NOT placing that addiu in
     * the bne delay slot (i.e. the asm being assembled in reorder mode) --
     * confirm for the toolchain in use. */
    __asm__ volatile (
    "1:                                                 \n\t"
        "sw     $0,            0(%[p_x1_low])           \n\t"
        "sw     $0,            4(%[p_x1_low])           \n\t"
        "sw     $0,            8(%[p_x1_low])           \n\t"
        "sw     $0,            12(%[p_x1_low])          \n\t"
        "sw     $0,            16(%[p_x1_low])          \n\t"
        "sw     $0,            20(%[p_x1_low])          \n\t"
        "sw     $0,            24(%[p_x1_low])          \n\t"
        "sw     $0,            28(%[p_x1_low])          \n\t"
        "addiu  %[p_x1_low],   %[p_x1_low],      32     \n\t"
        "bne    %[p_x1_low],   %[loop_end],      1b     \n\t"
        "addiu  %[p_x1_low],   %[p_x1_low],      -10240 \n\t"

        : [p_x1_low]"+r"(p_x1_low)
        : [loop_end]"r"(loop_end)
        : "memory"
    );

    /* X_low[k][8 + i] = W[buf_idx][i][k] for i = 0..31. */
    for (k = 0; k < sbr->kx[1]; k++) {
        for (i = 0; i < 32; i+=4) {
            /* loop unrolled 4 times */
            /* Consecutive time slots of W are 256 bytes apart (32 bands * 2
             * floats), so offsets 0/256/512/768 fetch four successive slots
             * of band k; they are stored contiguously into X_low. Each pass
             * advances p_x_low by 32 bytes and p_w by 1024 bytes (4 slots). */
            __asm__ volatile (
                "lw     %[temp0],   0(%[p_w])               \n\t"
                "lw     %[temp1],   4(%[p_w])               \n\t"
                "lw     %[temp2],   256(%[p_w])             \n\t"
                "lw     %[temp3],   260(%[p_w])             \n\t"
                "lw     %[temp4],   512(%[p_w])             \n\t"
                "lw     %[temp5],   516(%[p_w])             \n\t"
                "lw     %[temp6],   768(%[p_w])             \n\t"
                "lw     %[temp7],   772(%[p_w])             \n\t"
                "sw     %[temp0],   0(%[p_x_low])           \n\t"
                "sw     %[temp1],   4(%[p_x_low])           \n\t"
                "sw     %[temp2],   8(%[p_x_low])           \n\t"
                "sw     %[temp3],   12(%[p_x_low])          \n\t"
                "sw     %[temp4],   16(%[p_x_low])          \n\t"
                "sw     %[temp5],   20(%[p_x_low])          \n\t"
                "sw     %[temp6],   24(%[p_x_low])          \n\t"
                "sw     %[temp7],   28(%[p_x_low])          \n\t"
                "addiu  %[p_x_low], %[p_x_low],     32      \n\t"
                "addiu  %[p_w],     %[p_w],         1024    \n\t"

                : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
                  [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
                  [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
                  [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
                  [p_w]"+r"(p_w), [p_x_low]"+r"(p_x_low)
                :
                : "memory"
            );
        }
        /* 64 floats were written; +16 makes 80 = one X_low band (40 * 2),
         * landing on slot 8 of the next band. */
        p_x_low += 16;
        /* 2048 floats were consumed; -2046 leaves a net +2, i.e. slot 0 of
         * the next band in W. */
        p_w -= 2046;
    }

    /* X_low[k][i] = W[1 - buf_idx][24 + i][k] for i = 0..7. */
    for (k = 0; k < sbr->kx[0]; k++) {
        for (i = 0; i < 2; i++) {

            /* loop unrolled 4 times */
            /* Same access pattern as above, two passes of four slots each. */
            __asm__ volatile (
                "lw     %[temp0],    0(%[p_w1])             \n\t"
                "lw     %[temp1],    4(%[p_w1])             \n\t"
                "lw     %[temp2],    256(%[p_w1])           \n\t"
                "lw     %[temp3],    260(%[p_w1])           \n\t"
                "lw     %[temp4],    512(%[p_w1])           \n\t"
                "lw     %[temp5],    516(%[p_w1])           \n\t"
                "lw     %[temp6],    768(%[p_w1])           \n\t"
                "lw     %[temp7],    772(%[p_w1])           \n\t"
                "sw     %[temp0],    0(%[p_x1_low])         \n\t"
                "sw     %[temp1],    4(%[p_x1_low])         \n\t"
                "sw     %[temp2],    8(%[p_x1_low])         \n\t"
                "sw     %[temp3],    12(%[p_x1_low])        \n\t"
                "sw     %[temp4],    16(%[p_x1_low])        \n\t"
                "sw     %[temp5],    20(%[p_x1_low])        \n\t"
                "sw     %[temp6],    24(%[p_x1_low])        \n\t"
                "sw     %[temp7],    28(%[p_x1_low])        \n\t"
                "addiu  %[p_x1_low], %[p_x1_low],   32      \n\t"
                "addiu  %[p_w1],     %[p_w1],       1024    \n\t"

                : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
                  [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
                  [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
                  [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
                  [p_w1]"+r"(p_w1), [p_x1_low]"+r"(p_x1_low)
                :
                : "memory"
            );
        }
        /* 16 floats were written; +64 makes 80 = next band, slot 0. */
        p_x1_low += 64;
        /* 512 floats were consumed; -510 leaves a net +2 = next band in W. */
        p_w1 -= 510;
    }
    return 0;
}

/**
 * Assemble the matrix X from X_low, Y0 and Y1
 * (MIPS inline-assembly version; see libavcodec/aacsbr.c for the reference).
 *
 * After clearing 4864 floats of X (2 * 38 * 64, the whole array as declared),
 * the function fills, with i_Temp = max(2 * t_env_num_env_old - 32, 0):
 *  - only when i_Temp != 0, for i in [0, i_Temp):
 *      k in [0, kx[0])               X[0/1][i][k] = X_low[k][i + 2][0/1]
 *      k in [kx[0], kx[0] + m[0])    X[0/1][i][k] = Y0[i + 32][k][0/1]
 *  - for i in [i_Temp, 38):
 *      k in [0, kx[1])               X[0/1][i][k] = X_low[k][i + 2][0/1]
 *  - for i in [i_Temp, 32):
 *      k in [kx[1], kx[1] + m[1])    X[0/1][i][k] = Y1[i][k][0/1]
 *
 * In every asm loop, byte offset 9728 (= 38 * 64 floats) addresses X[1][i][k]
 * relative to X[0][i][k], and a step of 256 bytes (64 floats) moves to the
 * next i. Each asm loop tests its exit condition with bne after the body, so
 * the body runs at least once and the loop ends only when the counter is
 * exactly equal to the bound.
 *
 * @param sbr    supplies kx[], m[] and data[ch].t_env_num_env_old
 * @param X      destination array
 * @param Y0     source for the first i_Temp rows of the upper bands
 * @param Y1     source for rows i_Temp..31 of the upper bands
 * @param X_low  source for the lower bands
 * @param ch     index into sbr->data[]
 * @return always 0
 */
static int sbr_x_gen_mips(SpectralBandReplication *sbr, float X[2][38][64],
                     const float Y0[38][64][2], const float Y1[38][64][2],
                     const float X_low[32][40][2], int ch)
{
    int k, i;
    const int i_f = 32;
    /* Float bit patterns are moved through integer registers with lw/sw. */
    int temp0, temp1, temp2, temp3;
    const float *X_low1, *Y01, *Y11;
    float *x1=&X[0][0][0];
    /* One past the last float of X: 2 * 38 * 64 = 4864 floats. */
    float *j=x1+4864;
    const int i_Temp = FFMAX(2*sbr->data[ch].t_env_num_env_old - i_f, 0);

    /* loop unrolled 8 times */
    /* Zero X, 32 bytes per iteration. The final addiu subtracts 19456 bytes
     * (= 4864 floats) to bring x1 back to &X[0][0][0].
     * NOTE(review): relies on the assembler not placing that addiu in the
     * bne delay slot (reorder mode) -- confirm for the toolchain in use. */
    __asm__ volatile (
    "1:                                       \n\t"
        "sw     $0,      0(%[x1])             \n\t"
        "sw     $0,      4(%[x1])             \n\t"
        "sw     $0,      8(%[x1])             \n\t"
        "sw     $0,      12(%[x1])            \n\t"
        "sw     $0,      16(%[x1])            \n\t"
        "sw     $0,      20(%[x1])            \n\t"
        "sw     $0,      24(%[x1])            \n\t"
        "sw     $0,      28(%[x1])            \n\t"
        "addiu  %[x1],   %[x1],      32       \n\t"
        "bne    %[x1],   %[j],       1b       \n\t"
        "addiu  %[x1],   %[x1],      -19456   \n\t"

        : [x1]"+r"(x1)
        : [j]"r"(j)
        : "memory"
    );

    /* The guard is required because the asm loops below always execute their
     * body once before comparing i with i_Temp. */
    if (i_Temp != 0) {

        /* Source starts at time slot 2 of band 0. */
        X_low1=&X_low[0][2][0];

        for (k = 0; k < sbr->kx[0]; k++) {

            /* i = 0 .. i_Temp-1: copy re/im of X_low[k][i + 2] to
             * X[0][i][k] / X[1][i][k]. */
            __asm__ volatile (
                "move    %[i],        $zero                  \n\t"
            "2:                                              \n\t"
                "lw      %[temp0],    0(%[X_low1])           \n\t"
                "lw      %[temp1],    4(%[X_low1])           \n\t"
                "sw      %[temp0],    0(%[x1])               \n\t"
                "sw      %[temp1],    9728(%[x1])            \n\t"
                "addiu   %[x1],       %[x1],         256     \n\t"
                "addiu   %[X_low1],   %[X_low1],     8       \n\t"
                "addiu   %[i],        %[i],          1       \n\t"
                "bne     %[i],        %[i_Temp],     2b      \n\t"

                : [x1]"+r"(x1), [X_low1]"+r"(X_low1), [i]"=&r"(i),
                  [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
                : [i_Temp]"r"(i_Temp)
                : "memory"
            );
            /* Undo the i_Temp row steps (64 floats each) and move one column
             * to the right. */
            x1-=(i_Temp<<6)-1;
            /* Undo the i_Temp slot steps (2 floats each) and move to the next
             * band of X_low (40 * 2 = 80 floats). */
            X_low1-=(i_Temp<<1)-80;
        }

        /* k now equals sbr->kx[0]; continue with the bands taken from Y0,
         * starting at row 32 of Y0. */
        x1=&X[0][0][k];
        Y01=(float*)&Y0[32][k][0];

        for (; k < sbr->kx[0] + sbr->m[0]; k++) {
            /* i = 0 .. i_Temp-1: copy re/im of Y0[32 + i][k]. Rows of Y0 are
             * 512 bytes apart (64 * 2 floats). */
            __asm__ volatile (
                "move    %[i],       $zero               \n\t"
            "3:                                          \n\t"
                "lw      %[temp0],   0(%[Y01])           \n\t"
                "lw      %[temp1],   4(%[Y01])           \n\t"
                "sw      %[temp0],   0(%[x1])            \n\t"
                "sw      %[temp1],   9728(%[x1])         \n\t"
                "addiu   %[x1],      %[x1],      256     \n\t"
                "addiu   %[Y01],     %[Y01],     512     \n\t"
                "addiu   %[i],       %[i],       1       \n\t"
                "bne     %[i],       %[i_Temp],  3b      \n\t"

                : [x1]"+r"(x1), [Y01]"+r"(Y01), [i]"=&r"(i),
                  [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
                : [i_Temp]"r"(i_Temp)
                : "memory"
            );
            /* Rewind the rows and step to the next column / next band. */
            x1 -=(i_Temp<<6)-1;
            Y01 -=(i_Temp<<7)-2;
        }
    }

    /* Rows i_Temp..37 of the lower bands come from X_low. */
    x1=&X[0][i_Temp][0];
    X_low1=&X_low[0][i_Temp+2][0];
    /* Loop bound for the asm loop below, passed in a register. */
    temp3=38;

    for (k = 0; k < sbr->kx[1]; k++) {

        /* i = i_Temp .. 37: copy re/im of X_low[k][i + 2].
         * temp2 is listed as an output operand but the asm never writes it.
         * NOTE(review): the loop needs i_Temp < 38 to terminate correctly --
         * presumably guaranteed by the range of t_env_num_env_old; confirm. */
        __asm__ volatile (
            "move    %[i],       %[i_Temp]              \n\t"
        "4:                                             \n\t"
            "lw      %[temp0],   0(%[X_low1])           \n\t"
            "lw      %[temp1],   4(%[X_low1])           \n\t"
            "sw      %[temp0],   0(%[x1])               \n\t"
            "sw      %[temp1],   9728(%[x1])            \n\t"
            "addiu   %[x1],      %[x1],         256     \n\t"
            "addiu   %[X_low1],  %[X_low1],     8       \n\t"
            "addiu   %[i],       %[i],          1       \n\t"
            "bne     %[i],       %[temp3],      4b      \n\t"

            : [x1]"+r"(x1), [X_low1]"+r"(X_low1), [i]"=&r"(i),
              [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
              [temp2]"=&r"(temp2)
            : [i_Temp]"r"(i_Temp), [temp3]"r"(temp3)
            : "memory"
        );
        /* Rewind the (38 - i_Temp) rows and step to the next column / band. */
        x1 -= ((38-i_Temp)<<6)-1;
        X_low1 -= ((38-i_Temp)<<1)- 80;
    }

    /* k now equals sbr->kx[1]; rows i_Temp..31 of the upper bands come from
     * Y1. */
    x1=&X[0][i_Temp][k];
    Y11=&Y1[i_Temp][k][0];
    /* Loop bound for the asm loop below, passed in a register. */
    temp2=32;

    for (; k < sbr->kx[1] + sbr->m[1]; k++) {

        /* i = i_Temp .. 31: copy re/im of Y1[i][k].
         * temp3 is passed as an input operand but is not used by this asm.
         * NOTE(review): the loop needs i_Temp < 32 to terminate correctly --
         * presumably guaranteed by the range of t_env_num_env_old; confirm. */
        __asm__ volatile (
           "move    %[i],       %[i_Temp]               \n\t"
        "5:                                             \n\t"
           "lw      %[temp0],   0(%[Y11])               \n\t"
           "lw      %[temp1],   4(%[Y11])               \n\t"
           "sw      %[temp0],   0(%[x1])                \n\t"
           "sw      %[temp1],   9728(%[x1])             \n\t"
           "addiu   %[x1],      %[x1],          256     \n\t"
           "addiu   %[Y11],     %[Y11],         512     \n\t"
           "addiu   %[i],       %[i],           1       \n\t"
           "bne     %[i],       %[temp2],       5b      \n\t"

           : [x1]"+r"(x1), [Y11]"+r"(Y11), [i]"=&r"(i),
             [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
           : [i_Temp]"r"(i_Temp), [temp3]"r"(temp3),
             [temp2]"r"(temp2)
           : "memory"
        );

        /* Rewind the (32 - i_Temp) rows and step to the next column. */
        x1 -= ((32-i_Temp)<<6)-1;
        Y11 -= ((32-i_Temp)<<7)-2;
   }
      return 0;
}

#if HAVE_MIPSFPU
/**
 * Produce Y1 from X_high using the per-envelope gain, noise and sinusoid
 * tables held in sbr (MIPS inline-assembly version; see libavcodec/aacsbr.c
 * for the reference).
 *
 * Steps, as implemented below:
 *  1. Seed or carry over the first rows of the g_temp / q_temp history,
 *     depending on sbr->reset and on h_SL (4 when sbr->bs_smoothing_mode is
 *     0, otherwise 0).
 *  2. For every envelope e and every row i in
 *     [2 * t_env[e], 2 * t_env[e + 1]) copy sbr->gain[e] and sbr->q_m[e]
 *     (m_max floats each) into g_temp[h_SL + i] and q_temp[h_SL + i].
 *  3. For every such row, optionally smooth gains and noise levels with the
 *     5-tap h_smooth window, call sbr->dsp.hf_g_filt, and then either call
 *     sbr->dsp.hf_apply_noise[indexsine] or, for the envelopes listed in
 *     e_a[], add sbr->s_m[e][m] scaled by +1/-1 to one component of
 *     Y1[i][kx + m].
 *  4. Store the running indexnoise / indexsine back into ch_data.
 *
 * @param Y1       output array
 * @param X_high   input array handed to hf_g_filt
 * @param sbr      SBR state (gain, q_m, s_m, kx, m, dsp, reset, ...)
 * @param ch_data  per-channel state (g_temp, q_temp, t_env, indices)
 * @param e_a      two envelope indices that take the sinusoid-only path
 */
static void sbr_hf_assemble_mips(float Y1[38][64][2],
                            const float X_high[64][40][2],
                            SpectralBandReplication *sbr, SBRData *ch_data,
                            const int e_a[2])
{
    int e, i, j, m;
    const int h_SL = 4 * !sbr->bs_smoothing_mode;
    const int kx = sbr->kx[1];
    const int m_max = sbr->m[1];
    static const float h_smooth[5] = {
        0.33333333333333,
        0.30150283239582,
        0.21816949906249,
        0.11516383427084,
        0.03183050093751,
    };

    float (*g_temp)[48] = ch_data->g_temp, (*q_temp)[48] = ch_data->q_temp;
    int indexnoise = ch_data->f_indexnoise;
    int indexsine  = ch_data->f_indexsine;
    float *g_temp1, *q_temp1, *pok, *pok1;
    /* Scratch for the word copies below; the values only pass through
     * general-purpose registers (lw/sw), no arithmetic is done on them. */
    float temp1, temp2, temp3, temp4;
    int size = m_max;

    if (sbr->reset) {
        for (i = 0; i < h_SL; i++) {
            memcpy(g_temp[i + 2*ch_data->t_env[0]], sbr->gain[0], m_max * sizeof(sbr->gain[0][0]));
            memcpy(q_temp[i + 2*ch_data->t_env[0]], sbr->q_m[0],  m_max * sizeof(sbr->q_m[0][0]));
        }
    } else if (h_SL) {
        memcpy(g_temp[2*ch_data->t_env[0]], g_temp[2*ch_data->t_env_num_env_old], 4*sizeof(g_temp[0]));
        memcpy(q_temp[2*ch_data->t_env[0]], q_temp[2*ch_data->t_env_num_env_old], 4*sizeof(q_temp[0]));
    }

    for (e = 0; e < ch_data->bs_num_env; e++) {
        for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
            g_temp1 = g_temp[h_SL + i];
            pok = sbr->gain[e];
            q_temp1 = q_temp[h_SL + i];
            pok1 = sbr->q_m[e];

            /* loop unrolled 4 times */
            /* Copies four floats of gain[e] and four floats of q_m[e] per
             * pass and advances all four pointers by 16 bytes. */
            for (j=0; j<(size>>2); j++) {
                __asm__ volatile (
                    "lw      %[temp1],   0(%[pok])               \n\t"
                    "lw      %[temp2],   4(%[pok])               \n\t"
                    "lw      %[temp3],   8(%[pok])               \n\t"
                    "lw      %[temp4],   12(%[pok])              \n\t"
                    "sw      %[temp1],   0(%[g_temp1])           \n\t"
                    "sw      %[temp2],   4(%[g_temp1])           \n\t"
                    "sw      %[temp3],   8(%[g_temp1])           \n\t"
                    "sw      %[temp4],   12(%[g_temp1])          \n\t"
                    "lw      %[temp1],   0(%[pok1])              \n\t"
                    "lw      %[temp2],   4(%[pok1])              \n\t"
                    "lw      %[temp3],   8(%[pok1])              \n\t"
                    "lw      %[temp4],   12(%[pok1])             \n\t"
                    "sw      %[temp1],   0(%[q_temp1])           \n\t"
                    "sw      %[temp2],   4(%[q_temp1])           \n\t"
                    "sw      %[temp3],   8(%[q_temp1])           \n\t"
                    "sw      %[temp4],   12(%[q_temp1])          \n\t"
                    "addiu   %[pok],     %[pok],           16    \n\t"
                    "addiu   %[g_temp1], %[g_temp1],       16    \n\t"
                    "addiu   %[pok1],    %[pok1],          16    \n\t"
                    "addiu   %[q_temp1], %[q_temp1],       16    \n\t"

                    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
                      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
                      [pok]"+r"(pok), [g_temp1]"+r"(g_temp1),
                      [pok1]"+r"(pok1), [q_temp1]"+r"(q_temp1)
                    :
                    : "memory"
                );
            }

            /* Remaining (size & 3) elements, one float of each table per
             * pass. temp3/temp4 are declared as outputs but not written. */
            for (j=0; j<(size&3); j++) {
                __asm__ volatile (
                    "lw      %[temp1],   0(%[pok])              \n\t"
                    "lw      %[temp2],   0(%[pok1])             \n\t"
                    "sw      %[temp1],   0(%[g_temp1])          \n\t"
                    "sw      %[temp2],   0(%[q_temp1])          \n\t"
                    "addiu   %[pok],     %[pok],          4     \n\t"
                    "addiu   %[g_temp1], %[g_temp1],      4     \n\t"
                    "addiu   %[pok1],    %[pok1],         4     \n\t"
                    "addiu   %[q_temp1], %[q_temp1],      4     \n\t"

                    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
                      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
                      [pok]"+r"(pok), [g_temp1]"+r"(g_temp1),
                      [pok1]"+r"(pok1), [q_temp1]"+r"(q_temp1)
                    :
                    : "memory"
                );
            }
        }
    }

    for (e = 0; e < ch_data->bs_num_env; e++) {
        for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
            LOCAL_ALIGNED_16(float, g_filt_tab, [48]);
            LOCAL_ALIGNED_16(float, q_filt_tab, [48]);
            float *g_filt, *q_filt;

            if (h_SL && e != e_a[0] && e != e_a[1]) {
                /* Smooth over the current row and the h_SL preceding rows. */
                g_filt = g_filt_tab;
                q_filt = q_filt_tab;

                for (m = 0; m < m_max; m++) {
                    const int idx1 = i + h_SL;
                    g_filt[m] = 0.0f;
                    q_filt[m] = 0.0f;

                    for (j = 0; j <= h_SL; j++) {
                        g_filt[m] += g_temp[idx1 - j][m] * h_smooth[j];
                        q_filt[m] += q_temp[idx1 - j][m] * h_smooth[j];
                    }
                }
            } else {
                /* No smoothing: use the stored rows directly (gains from row
                 * i + h_SL, noise levels from row i). */
                g_filt = g_temp[i + h_SL];
                q_filt = q_temp[i];
            }

            sbr->dsp.hf_g_filt(Y1[i] + kx, X_high + kx, g_filt, m_max,
                               i + ENVELOPE_ADJUSTMENT_OFFSET);

            if (e != e_a[0] && e != e_a[1]) {
                sbr->dsp.hf_apply_noise[indexsine](Y1[i] + kx, sbr->s_m[e],
                                                   q_filt, indexnoise,
                                                   kx, m_max);
            } else {
                /* idx picks the component of Y1[i][kx + m] that is updated;
                 * A is +1 or -1; B equals A when idx is 0 and -A when idx
                 * is 1. */
                int idx = indexsine&1;
                int A = (1-((indexsine+(kx & 1))&2));
                int B = (A^(-idx)) + idx;
                float *out = &Y1[i][kx][idx];
                float *in  = sbr->s_m[e];
                float temp0, temp1, temp2, temp3, temp4, temp5;
                float A_f = (float)A;
                float B_f = (float)B;

                /* Two bands per pass:
                 *   out[0] += in[0] * A;  out[2] += in[1] * B;
                 * (madd.s fd, fr, fs, ft yields fs * ft + fr), then
                 * in advances by 2 floats and out by 4 floats. */
                for (m = 0; m+1 < m_max; m+=2) {

                    temp2 = out[0];
                    temp3 = out[2];

                    __asm__ volatile(
                        "lwc1    %[temp0],  0(%[in])                     \n\t"
                        "lwc1    %[temp1],  4(%[in])                     \n\t"
                        "madd.s  %[temp4],  %[temp2],  %[temp0], %[A_f]  \n\t"
                        "madd.s  %[temp5],  %[temp3],  %[temp1], %[B_f]  \n\t"
                        "swc1    %[temp4],  0(%[out])                    \n\t"
                        "swc1    %[temp5],  8(%[out])                    \n\t"
                        "addiu   %[in],     %[in],     8                 \n\t"
                        "addiu   %[out],    %[out],    16                \n\t"

                        : [temp0]"=&f" (temp0), [temp1]"=&f"(temp1),
                          [temp4]"=&f" (temp4), [temp5]"=&f"(temp5),
                          [in]"+r"(in), [out]"+r"(out)
                        : [A_f]"f"(A_f), [B_f]"f"(B_f), [temp2]"f"(temp2),
                          [temp3]"f"(temp3)
                        : "memory"
                    );
                }
                /* Odd m_max: one band is left. The asm above has already
                 * advanced in and out past the m bands processed so far, so
                 * the leftover element sits at offset 0 of both pointers.
                 * Indexing with m again (out[2*m], in[m]) would skip ahead a
                 * second time and touch the wrong, possibly out-of-range,
                 * elements. */
                if (m_max & 1)
                    out[0] += in[0] * A;
            }
            indexnoise = (indexnoise + m_max) & 0x1ff;
            indexsine = (indexsine + 1) & 3;
        }
    }
    ch_data->f_indexnoise = indexnoise;
    ch_data->f_indexsine  = indexsine;
}

/**
 * Compute the coefficient pairs alpha0[k] and alpha1[k] for k in [0, k0)
 * from the autocorrelation of X_low[k] (MIPS FPU inline-assembly version;
 * see libavcodec/aacsbr.c for the reference).
 *
 * For each k the function calls dsp->autocorrelate(X_low[k], phi) and then,
 * reading phi through the flat float pointer phi1 (the byte offsets below
 * assume phi is laid out as float[3][2][2]):
 *   dk        = phi1[10] * phi1[4]
 *               - (phi1[6]^2 + phi1[7]^2) / 1.000001f
 *   alpha1[k] = 0 if dk == 0, otherwise
 *               re = (phi1[0]*phi1[6] - phi1[1]*phi1[7] - phi1[2]*phi1[4]) / dk
 *               im = (phi1[0]*phi1[7] + phi1[1]*phi1[6] - phi1[3]*phi1[4]) / dk
 *   alpha0[k] = 0 if phi1[4] == 0, otherwise
 *               re = -(phi1[0] + alpha1.re*phi1[6] + alpha1.im*phi1[7]) / phi1[4]
 *               im = -(phi1[1] + alpha1.im*phi1[6] - alpha1.re*phi1[7]) / phi1[4]
 * Finally both pairs are zeroed when |alpha1[k]|^2 >= 16 or
 * |alpha0[k]|^2 >= 16.
 *
 * Instruction semantics used below: madd.s fd, fr, fs, ft gives fs*ft + fr;
 * nmsub.s fd, fr, fs, ft gives fr - fs*ft.
 *
 * @param dsp     provides the autocorrelate() callback
 * @param alpha0  output, one re/im pair per k
 * @param alpha1  output, one re/im pair per k
 * @param X_low   input handed to autocorrelate() one band at a time
 * @param k0      number of bands to process
 */
static void sbr_hf_inverse_filter_mips(SBRDSPContext *dsp,
                                  float (*alpha0)[2], float (*alpha1)[2],
                                  const float X_low[32][40][2], int k0)
{
    int k;
    float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, c;
    float *phi1, *alpha_1, *alpha_0, res1, res2, temp_real, temp_im;

    /* Divisor applied to the squared-magnitude term of dk. */
    c = 1.000001f;

    for (k = 0; k < k0; k++) {
        LOCAL_ALIGNED_16(float, phi, [3], [2][2]);
        float dk;
        phi1 = &phi[0][0][0];
        alpha_1 = &alpha1[k][0];
        alpha_0 = &alpha0[k][0];
        dsp->autocorrelate(X_low[k], phi);

        /* Load the phi entries and compute dk. Register contents afterwards:
         *   temp1 = phi1[4],  temp2 = phi1[6],  temp3 = phi1[7],
         *   temp4 = phi1[0],  temp5 = phi1[1],  temp6 = phi1[2],
         *   temp0 = phi1[3]   (temp0 first holds phi1[10] for the dk product
         *                      and is reloaded once that product is issued). */
        __asm__ volatile (
            "lwc1    %[temp0],  40(%[phi1])                       \n\t"
            "lwc1    %[temp1],  16(%[phi1])                       \n\t"
            "lwc1    %[temp2],  24(%[phi1])                       \n\t"
            "lwc1    %[temp3],  28(%[phi1])                       \n\t"
            "mul.s   %[dk],     %[temp0],    %[temp1]             \n\t"
            "lwc1    %[temp4],  0(%[phi1])                        \n\t"
            "mul.s   %[res2],   %[temp2],    %[temp2]             \n\t"
            "lwc1    %[temp5],  4(%[phi1])                        \n\t"
            "madd.s  %[res2],   %[res2],     %[temp3],  %[temp3]  \n\t"
            "lwc1    %[temp6],  8(%[phi1])                        \n\t"
            "div.s   %[res2],   %[res2],     %[c]                 \n\t"
            "lwc1    %[temp0],  12(%[phi1])                       \n\t"
            "sub.s   %[dk],     %[dk],       %[res2]              \n\t"

            : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
              [temp6]"=&f"(temp6), [res2]"=&f"(res2), [dk]"=&f"(dk)
            : [phi1]"r"(phi1), [c]"f"(c)
            : "memory"
        );

        if (!dk) {
            alpha_1[0] = 0;
            alpha_1[1] = 0;
        } else {
            /* alpha1[k] = (temp_real, temp_im) / dk, stored straight to
             * memory through alpha_1. */
            __asm__ volatile (
                "mul.s   %[temp_real], %[temp4],     %[temp2]            \n\t"
                "nmsub.s %[temp_real], %[temp_real], %[temp5], %[temp3]  \n\t"
                "nmsub.s %[temp_real], %[temp_real], %[temp6], %[temp1]  \n\t"
                "mul.s   %[temp_im],   %[temp4],     %[temp3]            \n\t"
                "madd.s  %[temp_im],   %[temp_im],   %[temp5], %[temp2]  \n\t"
                "nmsub.s %[temp_im],   %[temp_im],   %[temp0], %[temp1]  \n\t"
                "div.s   %[temp_real], %[temp_real], %[dk]               \n\t"
                "div.s   %[temp_im],   %[temp_im],   %[dk]               \n\t"
                "swc1    %[temp_real], 0(%[alpha_1])                     \n\t"
                "swc1    %[temp_im],   4(%[alpha_1])                     \n\t"

                : [temp_real]"=&f" (temp_real), [temp_im]"=&f"(temp_im)
                : [phi1]"r"(phi1), [temp0]"f"(temp0), [temp1]"f"(temp1),
                  [temp2]"f"(temp2), [temp3]"f"(temp3), [temp4]"f"(temp4),
                  [temp5]"f"(temp5), [temp6]"f"(temp6),
                  [alpha_1]"r"(alpha_1), [dk]"f"(dk)
                : "memory"
            );
        }

        /* phi1[4] is the divisor of alpha0; skip the division when it is 0. */
        if (!phi1[4]) {
            alpha_0[0] = 0;
            alpha_0[1] = 0;
        } else {
            /* Reload alpha1[k] from memory into temp6/temp7 (temp6's previous
             * content, phi1[2], is no longer needed), then compute and store
             * alpha0[k]. res1/res2 are listed as outputs but not written
             * here. */
            __asm__ volatile (
                "lwc1    %[temp6],     0(%[alpha_1])                     \n\t"
                "lwc1    %[temp7],     4(%[alpha_1])                     \n\t"
                "mul.s   %[temp_real], %[temp6],     %[temp2]            \n\t"
                "add.s   %[temp_real], %[temp_real], %[temp4]            \n\t"
                "madd.s  %[temp_real], %[temp_real], %[temp7], %[temp3]  \n\t"
                "mul.s   %[temp_im],   %[temp7],     %[temp2]            \n\t"
                "add.s   %[temp_im],   %[temp_im],   %[temp5]            \n\t"
                "nmsub.s %[temp_im],   %[temp_im],   %[temp6], %[temp3]  \n\t"
                "div.s   %[temp_real], %[temp_real], %[temp1]            \n\t"
                "div.s   %[temp_im],   %[temp_im],   %[temp1]            \n\t"
                "neg.s   %[temp_real], %[temp_real]                      \n\t"
                "neg.s   %[temp_im],   %[temp_im]                        \n\t"
                "swc1    %[temp_real], 0(%[alpha_0])                     \n\t"
                "swc1    %[temp_im],   4(%[alpha_0])                     \n\t"

                : [temp_real]"=&f"(temp_real), [temp_im]"=&f"(temp_im),
                  [temp6]"=&f"(temp6), [temp7]"=&f"(temp7),
                  [res1]"=&f"(res1), [res2]"=&f"(res2)
                : [alpha_1]"r"(alpha_1), [alpha_0]"r"(alpha_0),
                  [temp0]"f"(temp0), [temp1]"f"(temp1), [temp2]"f"(temp2),
                  [temp3]"f"(temp3), [temp4]"f"(temp4), [temp5]"f"(temp5)
                : "memory"
            );
        }

        /* res1 = |alpha1[k]|^2 and res2 = |alpha0[k]|^2, both re-read from
         * memory so the zeroed cases above are covered as well. */
        __asm__ volatile (
            "lwc1    %[temp1],      0(%[alpha_1])                           \n\t"
            "lwc1    %[temp2],      4(%[alpha_1])                           \n\t"
            "lwc1    %[temp_real],  0(%[alpha_0])                           \n\t"
            "lwc1    %[temp_im],    4(%[alpha_0])                           \n\t"
            "mul.s   %[res1],       %[temp1],      %[temp1]                 \n\t"
            "madd.s  %[res1],       %[res1],       %[temp2],    %[temp2]    \n\t"
            "mul.s   %[res2],       %[temp_real],  %[temp_real]             \n\t"
            "madd.s  %[res2],       %[res2],       %[temp_im],  %[temp_im]  \n\t"

            : [temp_real]"=&f"(temp_real), [temp_im]"=&f"(temp_im),
              [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
              [res1]"=&f"(res1), [res2]"=&f"(res2)
            : [alpha_1]"r"(alpha_1), [alpha_0]"r"(alpha_0)
            : "memory"
        );

        /* Discard both coefficient pairs when either squared magnitude
         * reaches 16. */
        if (res1 >= 16.0f || res2 >= 16.0f) {
            alpha_1[0] = 0;
            alpha_1[1] = 0;
            alpha_0[0] = 0;
            alpha_0[1] = 0;
        }
    }
}
#endif /* HAVE_MIPSFPU */
#endif /* HAVE_INLINE_ASM */

/**
 * Install the MIPS-specific SBR routines of this file into the function
 * pointer table of @p c.
 *
 * The integer-register copy routines are installed whenever inline assembly
 * is available; the two routines that use floating-point instructions are
 * installed only when HAVE_MIPSFPU is set as well. With HAVE_INLINE_ASM
 * unset the context is left untouched.
 *
 * @param c  context whose function pointers are overridden
 */
void ff_aacsbr_func_ptr_init_mips(AACSBRContext *c)
{
#if HAVE_INLINE_ASM
#if HAVE_MIPSFPU
    /* Routines that need the FPU. */
    c->sbr_hf_assemble       = sbr_hf_assemble_mips;
    c->sbr_hf_inverse_filter = sbr_hf_inverse_filter_mips;
#endif /* HAVE_MIPSFPU */
    /* Routines that only use integer loads and stores. */
    c->sbr_x_gen             = sbr_x_gen_mips;
    c->sbr_lf_gen            = sbr_lf_gen_mips;
#endif /* HAVE_INLINE_ASM */
}
Commit	Line	Data
2ba45a60 DM	1	/*
	2	* Copyright (c) 2012
	3	* MIPS Technologies, Inc., California.
	4	*
	5	* Redistribution and use in source and binary forms, with or without
	6	* modification, are permitted provided that the following conditions
	7	* are met:
	8	* 1. Redistributions of source code must retain the above copyright
	9	* notice, this list of conditions and the following disclaimer.
	10	* 2. Redistributions in binary form must reproduce the above copyright
	11	* notice, this list of conditions and the following disclaimer in the
	12	* documentation and/or other materials provided with the distribution.
	13	* 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
	14	* contributors may be used to endorse or promote products derived from
	15	* this software without specific prior written permission.
	16	*
	17	* THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
	18	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	19	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	20	* ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
	21	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	22	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	23	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	24	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	25	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	26	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	27	* SUCH DAMAGE.
	28	*
	29	* Authors: Djordje Pesut (djordje@mips.com)
	30	* Mirjana Vulin (mvulin@mips.com)
	31	*
	32	* This file is part of FFmpeg.
	33	*
	34	* FFmpeg is free software; you can redistribute it and/or
	35	* modify it under the terms of the GNU Lesser General Public
	36	* License as published by the Free Software Foundation; either
	37	* version 2.1 of the License, or (at your option) any later version.
	38	*
	39	* FFmpeg is distributed in the hope that it will be useful,
	40	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	41	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	42	* Lesser General Public License for more details.
	43	*
	44	* You should have received a copy of the GNU Lesser General Public
	45	* License along with FFmpeg; if not, write to the Free Software
	46	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	47	*/
	48
	49	/**
	50	* @file
	51	* Reference: libavcodec/aacsbr.c
	52	*/
	53
	54	#include "libavcodec/aac.h"
	55	#include "libavcodec/aacsbr.h"
	56
	57	#define ENVELOPE_ADJUSTMENT_OFFSET 2
	58
	59	#if HAVE_INLINE_ASM
	60	static int sbr_lf_gen_mips(AACContext *ac, SpectralBandReplication *sbr,
	61	float X_low[32][40][2], const float W[2][32][32][2],
	62	int buf_idx)
	63	{
	64	int i, k;
65	int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
66	float *p_x_low = &X_low[0][8][0];
67	float *p_w = (float*)&W[buf_idx][0][0][0];
68	float *p_x1_low = &X_low[0][0][0];
69	float *p_w1 = (float*)&W[1-buf_idx][24][0][0];
70
71	float *loop_end=p_x1_low + 2560;
72
73	/* loop unrolled 8 times */
74	__asm__ volatile (
75	"1: \n\t"
76	"sw $0, 0(%[p_x1_low]) \n\t"
77	"sw $0, 4(%[p_x1_low]) \n\t"
78	"sw $0, 8(%[p_x1_low]) \n\t"
79	"sw $0, 12(%[p_x1_low]) \n\t"
80	"sw $0, 16(%[p_x1_low]) \n\t"
81	"sw $0, 20(%[p_x1_low]) \n\t"
82	"sw $0, 24(%[p_x1_low]) \n\t"
83	"sw $0, 28(%[p_x1_low]) \n\t"
84	"addiu %[p_x1_low], %[p_x1_low], 32 \n\t"
85	"bne %[p_x1_low], %[loop_end], 1b \n\t"
86	"addiu %[p_x1_low], %[p_x1_low], -10240 \n\t"
87
88	: [p_x1_low]"+r"(p_x1_low)
89	: [loop_end]"r"(loop_end)
90	: "memory"
91	);
92
93	for (k = 0; k < sbr->kx[1]; k++) {
94	for (i = 0; i < 32; i+=4) {
95	/* loop unrolled 4 times */
96	__asm__ volatile (
97	"lw %[temp0], 0(%[p_w]) \n\t"
98	"lw %[temp1], 4(%[p_w]) \n\t"
99	"lw %[temp2], 256(%[p_w]) \n\t"
100	"lw %[temp3], 260(%[p_w]) \n\t"
101	"lw %[temp4], 512(%[p_w]) \n\t"
102	"lw %[temp5], 516(%[p_w]) \n\t"
103	"lw %[temp6], 768(%[p_w]) \n\t"
104	"lw %[temp7], 772(%[p_w]) \n\t"
105	"sw %[temp0], 0(%[p_x_low]) \n\t"
106	"sw %[temp1], 4(%[p_x_low]) \n\t"
107	"sw %[temp2], 8(%[p_x_low]) \n\t"
108	"sw %[temp3], 12(%[p_x_low]) \n\t"
109	"sw %[temp4], 16(%[p_x_low]) \n\t"
110	"sw %[temp5], 20(%[p_x_low]) \n\t"
111	"sw %[temp6], 24(%[p_x_low]) \n\t"
112	"sw %[temp7], 28(%[p_x_low]) \n\t"
113	"addiu %[p_x_low], %[p_x_low], 32 \n\t"
114	"addiu %[p_w], %[p_w], 1024 \n\t"
115
116	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
117	[temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
118	[temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
119	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
120	[p_w]"+r"(p_w), [p_x_low]"+r"(p_x_low)
121	:
122	: "memory"
123	);
124	}
125	p_x_low += 16;
126	p_w -= 2046;
127	}
128
129	for (k = 0; k < sbr->kx[0]; k++) {
130	for (i = 0; i < 2; i++) {
131
132	/* loop unrolled 4 times */
133	__asm__ volatile (
134	"lw %[temp0], 0(%[p_w1]) \n\t"
135	"lw %[temp1], 4(%[p_w1]) \n\t"
136	"lw %[temp2], 256(%[p_w1]) \n\t"
137	"lw %[temp3], 260(%[p_w1]) \n\t"
138	"lw %[temp4], 512(%[p_w1]) \n\t"
139	"lw %[temp5], 516(%[p_w1]) \n\t"
140	"lw %[temp6], 768(%[p_w1]) \n\t"
141	"lw %[temp7], 772(%[p_w1]) \n\t"
142	"sw %[temp0], 0(%[p_x1_low]) \n\t"
143	"sw %[temp1], 4(%[p_x1_low]) \n\t"
144	"sw %[temp2], 8(%[p_x1_low]) \n\t"
145	"sw %[temp3], 12(%[p_x1_low]) \n\t"
146	"sw %[temp4], 16(%[p_x1_low]) \n\t"
147	"sw %[temp5], 20(%[p_x1_low]) \n\t"
148	"sw %[temp6], 24(%[p_x1_low]) \n\t"
149	"sw %[temp7], 28(%[p_x1_low]) \n\t"
150	"addiu %[p_x1_low], %[p_x1_low], 32 \n\t"
151	"addiu %[p_w1], %[p_w1], 1024 \n\t"
152
153	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
154	[temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
155	[temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
156	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
157	[p_w1]"+r"(p_w1), [p_x1_low]"+r"(p_x1_low)
158	:
159	: "memory"
160	);
161	}
162	p_x1_low += 64;
163	p_w1 -= 510;
164	}
165	return 0;
166	}
167
168	static int sbr_x_gen_mips(SpectralBandReplication *sbr, float X[2][38][64],
169	const float Y0[38][64][2], const float Y1[38][64][2],
170	const float X_low[32][40][2], int ch)
171	{
172	int k, i;
173	const int i_f = 32;
174	int temp0, temp1, temp2, temp3;
	175	const float *X_low1, *Y01, *Y11;
176	float *x1=&X[0][0][0];
177	float *j=x1+4864;
178	const int i_Temp = FFMAX(2*sbr->data[ch].t_env_num_env_old - i_f, 0);
179
180	/* loop unrolled 8 times */
181	__asm__ volatile (
182	"1: \n\t"
183	"sw $0, 0(%[x1]) \n\t"
184	"sw $0, 4(%[x1]) \n\t"
185	"sw $0, 8(%[x1]) \n\t"
186	"sw $0, 12(%[x1]) \n\t"
187	"sw $0, 16(%[x1]) \n\t"
188	"sw $0, 20(%[x1]) \n\t"
189	"sw $0, 24(%[x1]) \n\t"
190	"sw $0, 28(%[x1]) \n\t"
191	"addiu %[x1], %[x1], 32 \n\t"
192	"bne %[x1], %[j], 1b \n\t"
193	"addiu %[x1], %[x1], -19456 \n\t"
194
195	: [x1]"+r"(x1)
196	: [j]"r"(j)
197	: "memory"
198	);
199
200	if (i_Temp != 0) {
201
202	X_low1=&X_low[0][2][0];
203
204	for (k = 0; k < sbr->kx[0]; k++) {
205
206	__asm__ volatile (
207	"move %[i], $zero \n\t"
208	"2: \n\t"
209	"lw %[temp0], 0(%[X_low1]) \n\t"
210	"lw %[temp1], 4(%[X_low1]) \n\t"
211	"sw %[temp0], 0(%[x1]) \n\t"
212	"sw %[temp1], 9728(%[x1]) \n\t"
213	"addiu %[x1], %[x1], 256 \n\t"
214	"addiu %[X_low1], %[X_low1], 8 \n\t"
215	"addiu %[i], %[i], 1 \n\t"
216	"bne %[i], %[i_Temp], 2b \n\t"
217
218	: [x1]"+r"(x1), [X_low1]"+r"(X_low1), [i]"=&r"(i),
219	[temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
220	: [i_Temp]"r"(i_Temp)
221	: "memory"
222	);
223	x1-=(i_Temp<<6)-1;
224	X_low1-=(i_Temp<<1)-80;
225	}
226
227	x1=&X[0][0][k];
228	Y01=(float*)&Y0[32][k][0];
229
230	for (; k < sbr->kx[0] + sbr->m[0]; k++) {
231	__asm__ volatile (
232	"move %[i], $zero \n\t"
233	"3: \n\t"
234	"lw %[temp0], 0(%[Y01]) \n\t"
235	"lw %[temp1], 4(%[Y01]) \n\t"
236	"sw %[temp0], 0(%[x1]) \n\t"
237	"sw %[temp1], 9728(%[x1]) \n\t"
238	"addiu %[x1], %[x1], 256 \n\t"
239	"addiu %[Y01], %[Y01], 512 \n\t"
240	"addiu %[i], %[i], 1 \n\t"
241	"bne %[i], %[i_Temp], 3b \n\t"
242
243	: [x1]"+r"(x1), [Y01]"+r"(Y01), [i]"=&r"(i),
244	[temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
245	: [i_Temp]"r"(i_Temp)
246	: "memory"
247	);
248	x1 -=(i_Temp<<6)-1;
249	Y01 -=(i_Temp<<7)-2;
250	}
251	}
252
253	x1=&X[0][i_Temp][0];
254	X_low1=&X_low[0][i_Temp+2][0];
255	temp3=38;
256
257	for (k = 0; k < sbr->kx[1]; k++) {
258
259	__asm__ volatile (
260	"move %[i], %[i_Temp] \n\t"
261	"4: \n\t"
262	"lw %[temp0], 0(%[X_low1]) \n\t"
263	"lw %[temp1], 4(%[X_low1]) \n\t"
264	"sw %[temp0], 0(%[x1]) \n\t"
265	"sw %[temp1], 9728(%[x1]) \n\t"
266	"addiu %[x1], %[x1], 256 \n\t"
267	"addiu %[X_low1], %[X_low1], 8 \n\t"
268	"addiu %[i], %[i], 1 \n\t"
269	"bne %[i], %[temp3], 4b \n\t"
270
271	: [x1]"+r"(x1), [X_low1]"+r"(X_low1), [i]"=&r"(i),
272	[temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
273	[temp2]"=&r"(temp2)
274	: [i_Temp]"r"(i_Temp), [temp3]"r"(temp3)
275	: "memory"
276	);
277	x1 -= ((38-i_Temp)<<6)-1;
278	X_low1 -= ((38-i_Temp)<<1)- 80;
279	}
280
281	x1=&X[0][i_Temp][k];
282	Y11=&Y1[i_Temp][k][0];
283	temp2=32;
284
285	for (; k < sbr->kx[1] + sbr->m[1]; k++) {
286
287	__asm__ volatile (
288	"move %[i], %[i_Temp] \n\t"
289	"5: \n\t"
290	"lw %[temp0], 0(%[Y11]) \n\t"
291	"lw %[temp1], 4(%[Y11]) \n\t"
292	"sw %[temp0], 0(%[x1]) \n\t"
293	"sw %[temp1], 9728(%[x1]) \n\t"
294	"addiu %[x1], %[x1], 256 \n\t"
295	"addiu %[Y11], %[Y11], 512 \n\t"
296	"addiu %[i], %[i], 1 \n\t"
297	"bne %[i], %[temp2], 5b \n\t"
298
299	: [x1]"+r"(x1), [Y11]"+r"(Y11), [i]"=&r"(i),
300	[temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
301	: [i_Temp]"r"(i_Temp), [temp3]"r"(temp3),
302	[temp2]"r"(temp2)
303	: "memory"
304	);
305
306	x1 -= ((32-i_Temp)<<6)-1;
307	Y11 -= ((32-i_Temp)<<7)-2;
308	}
309	return 0;
310	}
311
312	#if HAVE_MIPSFPU
313	static void sbr_hf_assemble_mips(float Y1[38][64][2],
314	const float X_high[64][40][2],
	315	SpectralBandReplication *sbr, SBRData *ch_data,
316	const int e_a[2])
317	{
318	int e, i, j, m;
319	const int h_SL = 4 * !sbr->bs_smoothing_mode;
320	const int kx = sbr->kx[1];
321	const int m_max = sbr->m[1];
322	static const float h_smooth[5] = {
323	0.33333333333333,
324	0.30150283239582,
325	0.21816949906249,
326	0.11516383427084,
327	0.03183050093751,
328	};
329
	330	float (*g_temp)[48] = ch_data->g_temp, (*q_temp)[48] = ch_data->q_temp;
331	int indexnoise = ch_data->f_indexnoise;
332	int indexsine = ch_data->f_indexsine;
	333	float *g_temp1, *q_temp1, *pok, *pok1;
334	float temp1, temp2, temp3, temp4;
335	int size = m_max;
336
337	if (sbr->reset) {
338	for (i = 0; i < h_SL; i++) {
	339	memcpy(g_temp[i + 2*ch_data->t_env[0]], sbr->gain[0], m_max * sizeof(sbr->gain[0][0]));
	340	memcpy(q_temp[i + 2*ch_data->t_env[0]], sbr->q_m[0], m_max * sizeof(sbr->q_m[0][0]));
341	}
342	} else if (h_SL) {
	343	memcpy(g_temp[2*ch_data->t_env[0]], g_temp[2*ch_data->t_env_num_env_old], 4*sizeof(g_temp[0]));
	344	memcpy(q_temp[2*ch_data->t_env[0]], q_temp[2*ch_data->t_env_num_env_old], 4*sizeof(q_temp[0]));
345	}
346
347	for (e = 0; e < ch_data->bs_num_env; e++) {
348	for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
349	g_temp1 = g_temp[h_SL + i];
350	pok = sbr->gain[e];
351	q_temp1 = q_temp[h_SL + i];
352	pok1 = sbr->q_m[e];
353
354	/* loop unrolled 4 times */
355	for (j=0; j<(size>>2); j++) {
356	__asm__ volatile (
357	"lw %[temp1], 0(%[pok]) \n\t"
358	"lw %[temp2], 4(%[pok]) \n\t"
359	"lw %[temp3], 8(%[pok]) \n\t"
360	"lw %[temp4], 12(%[pok]) \n\t"
361	"sw %[temp1], 0(%[g_temp1]) \n\t"
362	"sw %[temp2], 4(%[g_temp1]) \n\t"
363	"sw %[temp3], 8(%[g_temp1]) \n\t"
364	"sw %[temp4], 12(%[g_temp1]) \n\t"
365	"lw %[temp1], 0(%[pok1]) \n\t"
366	"lw %[temp2], 4(%[pok1]) \n\t"
367	"lw %[temp3], 8(%[pok1]) \n\t"
368	"lw %[temp4], 12(%[pok1]) \n\t"
369	"sw %[temp1], 0(%[q_temp1]) \n\t"
370	"sw %[temp2], 4(%[q_temp1]) \n\t"
371	"sw %[temp3], 8(%[q_temp1]) \n\t"
372	"sw %[temp4], 12(%[q_temp1]) \n\t"
373	"addiu %[pok], %[pok], 16 \n\t"
374	"addiu %[g_temp1], %[g_temp1], 16 \n\t"
375	"addiu %[pok1], %[pok1], 16 \n\t"
376	"addiu %[q_temp1], %[q_temp1], 16 \n\t"
377
378	: [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
379	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
380	[pok]"+r"(pok), [g_temp1]"+r"(g_temp1),
381	[pok1]"+r"(pok1), [q_temp1]"+r"(q_temp1)
382	:
383	: "memory"
384	);
385	}
386
387	for (j=0; j<(size&3); j++) {
388	__asm__ volatile (
389	"lw %[temp1], 0(%[pok]) \n\t"
390	"lw %[temp2], 0(%[pok1]) \n\t"
391	"sw %[temp1], 0(%[g_temp1]) \n\t"
392	"sw %[temp2], 0(%[q_temp1]) \n\t"
393	"addiu %[pok], %[pok], 4 \n\t"
394	"addiu %[g_temp1], %[g_temp1], 4 \n\t"
395	"addiu %[pok1], %[pok1], 4 \n\t"
396	"addiu %[q_temp1], %[q_temp1], 4 \n\t"
397
398	: [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
399	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
400	[pok]"+r"(pok), [g_temp1]"+r"(g_temp1),
401	[pok1]"+r"(pok1), [q_temp1]"+r"(q_temp1)
402	:
403	: "memory"
404	);
405	}
406	}
407	}
408
409	for (e = 0; e < ch_data->bs_num_env; e++) {
410	for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
411	LOCAL_ALIGNED_16(float, g_filt_tab, [48]);
412	LOCAL_ALIGNED_16(float, q_filt_tab, [48]);
	413	float *g_filt, *q_filt;
414
415	if (h_SL && e != e_a[0] && e != e_a[1]) {
416	g_filt = g_filt_tab;
417	q_filt = q_filt_tab;
418
419	for (m = 0; m < m_max; m++) {
420	const int idx1 = i + h_SL;
421	g_filt[m] = 0.0f;
422	q_filt[m] = 0.0f;
423
424	for (j = 0; j <= h_SL; j++) {
425	g_filt[m] += g_temp[idx1 - j][m] * h_smooth[j];
426	q_filt[m] += q_temp[idx1 - j][m] * h_smooth[j];
427	}
428	}
429	} else {
430	g_filt = g_temp[i + h_SL];
431	q_filt = q_temp[i];
432	}
433
434	sbr->dsp.hf_g_filt(Y1[i] + kx, X_high + kx, g_filt, m_max,
435	i + ENVELOPE_ADJUSTMENT_OFFSET);
436
437	if (e != e_a[0] && e != e_a[1]) {
438	sbr->dsp.hf_apply_noise[indexsine](Y1[i] + kx, sbr->s_m[e],
439	q_filt, indexnoise,
440	kx, m_max);
441	} else {
442	int idx = indexsine&1;
443	int A = (1-((indexsine+(kx & 1))&2));
444	int B = (A^(-idx)) + idx;
445	float *out = &Y1[i][kx][idx];
446	float *in = sbr->s_m[e];
447	float temp0, temp1, temp2, temp3, temp4, temp5;
448	float A_f = (float)A;
449	float B_f = (float)B;
450
451	for (m = 0; m+1 < m_max; m+=2) {
452
453	temp2 = out[0];
454	temp3 = out[2];
455
456	__asm__ volatile(
457	"lwc1 %[temp0], 0(%[in]) \n\t"
458	"lwc1 %[temp1], 4(%[in]) \n\t"
459	"madd.s %[temp4], %[temp2], %[temp0], %[A_f] \n\t"
460	"madd.s %[temp5], %[temp3], %[temp1], %[B_f] \n\t"
461	"swc1 %[temp4], 0(%[out]) \n\t"
462	"swc1 %[temp5], 8(%[out]) \n\t"
463	"addiu %[in], %[in], 8 \n\t"
464	"addiu %[out], %[out], 16 \n\t"
465
466	: [temp0]"=&f" (temp0), [temp1]"=&f"(temp1),
467	[temp4]"=&f" (temp4), [temp5]"=&f"(temp5),
468	[in]"+r"(in), [out]"+r"(out)
469	: [A_f]"f"(A_f), [B_f]"f"(B_f), [temp2]"f"(temp2),
470	[temp3]"f"(temp3)
471	: "memory"
472	);
473	}
474	if(m_max&1)
	475	out[2*m] += in[m] * A;
476	}
477	indexnoise = (indexnoise + m_max) & 0x1ff;
478	indexsine = (indexsine + 1) & 3;
479	}
480	}
481	ch_data->f_indexnoise = indexnoise;
482	ch_data->f_indexsine = indexsine;
483	}
484
485	static void sbr_hf_inverse_filter_mips(SBRDSPContext *dsp,
	486	float (*alpha0)[2], float (*alpha1)[2],
487	const float X_low[32][40][2], int k0)
488	{
489	int k;
490	float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, c;
	491	float *phi1, *alpha_1, *alpha_0, res1, res2, temp_real, temp_im;
492
493	c = 1.000001f;
494
495	for (k = 0; k < k0; k++) {
496	LOCAL_ALIGNED_16(float, phi, [3], [2][2]);
497	float dk;
498	phi1 = &phi[0][0][0];
499	alpha_1 = &alpha1[k][0];
500	alpha_0 = &alpha0[k][0];
501	dsp->autocorrelate(X_low[k], phi);
502
503	__asm__ volatile (
504	"lwc1 %[temp0], 40(%[phi1]) \n\t"
505	"lwc1 %[temp1], 16(%[phi1]) \n\t"
506	"lwc1 %[temp2], 24(%[phi1]) \n\t"
507	"lwc1 %[temp3], 28(%[phi1]) \n\t"
508	"mul.s %[dk], %[temp0], %[temp1] \n\t"
509	"lwc1 %[temp4], 0(%[phi1]) \n\t"
510	"mul.s %[res2], %[temp2], %[temp2] \n\t"
511	"lwc1 %[temp5], 4(%[phi1]) \n\t"
512	"madd.s %[res2], %[res2], %[temp3], %[temp3] \n\t"
513	"lwc1 %[temp6], 8(%[phi1]) \n\t"
514	"div.s %[res2], %[res2], %[c] \n\t"
515	"lwc1 %[temp0], 12(%[phi1]) \n\t"
516	"sub.s %[dk], %[dk], %[res2] \n\t"
517
518	: [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
519	[temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
520	[temp6]"=&f"(temp6), [res2]"=&f"(res2), [dk]"=&f"(dk)
521	: [phi1]"r"(phi1), [c]"f"(c)
522	: "memory"
523	);
524
525	if (!dk) {
526	alpha_1[0] = 0;
527	alpha_1[1] = 0;
528	} else {
529	__asm__ volatile (
530	"mul.s %[temp_real], %[temp4], %[temp2] \n\t"
531	"nmsub.s %[temp_real], %[temp_real], %[temp5], %[temp3] \n\t"
532	"nmsub.s %[temp_real], %[temp_real], %[temp6], %[temp1] \n\t"
533	"mul.s %[temp_im], %[temp4], %[temp3] \n\t"
534	"madd.s %[temp_im], %[temp_im], %[temp5], %[temp2] \n\t"
535	"nmsub.s %[temp_im], %[temp_im], %[temp0], %[temp1] \n\t"
536	"div.s %[temp_real], %[temp_real], %[dk] \n\t"
537	"div.s %[temp_im], %[temp_im], %[dk] \n\t"
538	"swc1 %[temp_real], 0(%[alpha_1]) \n\t"
539	"swc1 %[temp_im], 4(%[alpha_1]) \n\t"
540
541	: [temp_real]"=&f" (temp_real), [temp_im]"=&f"(temp_im)
542	: [phi1]"r"(phi1), [temp0]"f"(temp0), [temp1]"f"(temp1),
543	[temp2]"f"(temp2), [temp3]"f"(temp3), [temp4]"f"(temp4),
544	[temp5]"f"(temp5), [temp6]"f"(temp6),
545	[alpha_1]"r"(alpha_1), [dk]"f"(dk)
546	: "memory"
547	);
548	}
549
550	if (!phi1[4]) {
551	alpha_0[0] = 0;
552	alpha_0[1] = 0;
553	} else {
554	__asm__ volatile (
555	"lwc1 %[temp6], 0(%[alpha_1]) \n\t"
556	"lwc1 %[temp7], 4(%[alpha_1]) \n\t"
557	"mul.s %[temp_real], %[temp6], %[temp2] \n\t"
558	"add.s %[temp_real], %[temp_real], %[temp4] \n\t"
559	"madd.s %[temp_real], %[temp_real], %[temp7], %[temp3] \n\t"
560	"mul.s %[temp_im], %[temp7], %[temp2] \n\t"
561	"add.s %[temp_im], %[temp_im], %[temp5] \n\t"
562	"nmsub.s %[temp_im], %[temp_im], %[temp6], %[temp3] \n\t"
563	"div.s %[temp_real], %[temp_real], %[temp1] \n\t"
564	"div.s %[temp_im], %[temp_im], %[temp1] \n\t"
565	"neg.s %[temp_real], %[temp_real] \n\t"
566	"neg.s %[temp_im], %[temp_im] \n\t"
567	"swc1 %[temp_real], 0(%[alpha_0]) \n\t"
568	"swc1 %[temp_im], 4(%[alpha_0]) \n\t"
569
570	: [temp_real]"=&f"(temp_real), [temp_im]"=&f"(temp_im),
571	[temp6]"=&f"(temp6), [temp7]"=&f"(temp7),
572	[res1]"=&f"(res1), [res2]"=&f"(res2)
573	: [alpha_1]"r"(alpha_1), [alpha_0]"r"(alpha_0),
574	[temp0]"f"(temp0), [temp1]"f"(temp1), [temp2]"f"(temp2),
575	[temp3]"f"(temp3), [temp4]"f"(temp4), [temp5]"f"(temp5)
576	: "memory"
577	);
578	}
579
580	__asm__ volatile (
581	"lwc1 %[temp1], 0(%[alpha_1]) \n\t"
582	"lwc1 %[temp2], 4(%[alpha_1]) \n\t"
583	"lwc1 %[temp_real], 0(%[alpha_0]) \n\t"
584	"lwc1 %[temp_im], 4(%[alpha_0]) \n\t"
585	"mul.s %[res1], %[temp1], %[temp1] \n\t"
586	"madd.s %[res1], %[res1], %[temp2], %[temp2] \n\t"
587	"mul.s %[res2], %[temp_real], %[temp_real] \n\t"
588	"madd.s %[res2], %[res2], %[temp_im], %[temp_im] \n\t"
589
590	: [temp_real]"=&f"(temp_real), [temp_im]"=&f"(temp_im),
591	[temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
592	[res1]"=&f"(res1), [res2]"=&f"(res2)
593	: [alpha_1]"r"(alpha_1), [alpha_0]"r"(alpha_0)
594	: "memory"
595	);
596
	597	if (res1 >= 16.0f || res2 >= 16.0f) {
598	alpha_1[0] = 0;
599	alpha_1[1] = 0;
600	alpha_0[0] = 0;
601	alpha_0[1] = 0;
602	}
603	}
604	}
605	#endif /* HAVE_MIPSFPU */
606	#endif /* HAVE_INLINE_ASM */
607
608	void ff_aacsbr_func_ptr_init_mips(AACSBRContext *c)
609	{
610	#if HAVE_INLINE_ASM
611	c->sbr_lf_gen = sbr_lf_gen_mips;
612	c->sbr_x_gen = sbr_x_gen_mips;
613	#if HAVE_MIPSFPU
614	c->sbr_hf_inverse_filter = sbr_hf_inverse_filter_mips;
615	c->sbr_hf_assemble = sbr_hf_assemble_mips;
616	#endif /* HAVE_MIPSFPU */
617	#endif /* HAVE_INLINE_ASM */
618	}