ffmpeg/libavcodec/mips/aacsbr_mips.c

   1 /*
   2  * Copyright (c) 2012
   3  *      MIPS Technologies, Inc., California.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
  14  *    contributors may be used to endorse or promote products derived from
  15  *    this software without specific prior written permission.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
  18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
  21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27  * SUCH DAMAGE.
  28  *
  29  * Authors:  Djordje Pesut   (djordje@mips.com)
  30  *           Mirjana Vulin   (mvulin@mips.com)
  31  *
  32  * This file is part of FFmpeg.
  33  *
  34  * FFmpeg is free software; you can redistribute it and/or
  35  * modify it under the terms of the GNU Lesser General Public
  36  * License as published by the Free Software Foundation; either
  37  * version 2.1 of the License, or (at your option) any later version.
  38  *
  39  * FFmpeg is distributed in the hope that it will be useful,
  40  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  41  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  42  * Lesser General Public License for more details.
  43  *
  44  * You should have received a copy of the GNU Lesser General Public
  45  * License along with FFmpeg; if not, write to the Free Software
  46  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  47  */
  48
  49 /**
  50  * @file
  51  * Reference: libavcodec/aacsbr.c
  52  */
  53
  54 #include "libavcodec/aac.h"
  55 #include "libavcodec/aacsbr.h"
  56
  57 #define ENVELOPE_ADJUSTMENT_OFFSET 2
  58
  59 #if HAVE_INLINE_ASM
  60 static int sbr_lf_gen_mips(AACContext *ac, SpectralBandReplication *sbr,
  61                       float X_low[32][40][2], const float W[2][32][32][2],
  62                       int buf_idx)
  63 {
  64     int i, k;
  65     int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  66     float *p_x_low = &X_low[0][8][0];
  67     float *p_w = (float*)&W[buf_idx][0][0][0];
  68     float *p_x1_low = &X_low[0][0][0];
  69     float *p_w1 = (float*)&W[1-buf_idx][24][0][0];
  70
  71     float *loop_end=p_x1_low + 2560;
  72
  73     /* loop unrolled 8 times */
  74     __asm__ volatile (
  75     "1:                                                 \n\t"
  76         "sw     $0,            0(%[p_x1_low])           \n\t"
  77         "sw     $0,            4(%[p_x1_low])           \n\t"
  78         "sw     $0,            8(%[p_x1_low])           \n\t"
  79         "sw     $0,            12(%[p_x1_low])          \n\t"
  80         "sw     $0,            16(%[p_x1_low])          \n\t"
  81         "sw     $0,            20(%[p_x1_low])          \n\t"
  82         "sw     $0,            24(%[p_x1_low])          \n\t"
  83         "sw     $0,            28(%[p_x1_low])          \n\t"
  84         "addiu  %[p_x1_low],   %[p_x1_low],      32     \n\t"
  85         "bne    %[p_x1_low],   %[loop_end],      1b     \n\t"
  86         "addiu  %[p_x1_low],   %[p_x1_low],      -10240 \n\t"
  87
  88         : [p_x1_low]"+r"(p_x1_low)
  89         : [loop_end]"r"(loop_end)
  90         : "memory"
  91     );
  92
  93     for (k = 0; k < sbr->kx[1]; k++) {
  94         for (i = 0; i < 32; i+=4) {
  95             /* loop unrolled 4 times */
  96             __asm__ volatile (
  97                 "lw     %[temp0],   0(%[p_w])               \n\t"
  98                 "lw     %[temp1],   4(%[p_w])               \n\t"
  99                 "lw     %[temp2],   256(%[p_w])             \n\t"
 100                 "lw     %[temp3],   260(%[p_w])             \n\t"
 101                 "lw     %[temp4],   512(%[p_w])             \n\t"
 102                 "lw     %[temp5],   516(%[p_w])             \n\t"
 103                 "lw     %[temp6],   768(%[p_w])             \n\t"
 104                 "lw     %[temp7],   772(%[p_w])             \n\t"
 105                 "sw     %[temp0],   0(%[p_x_low])           \n\t"
 106                 "sw     %[temp1],   4(%[p_x_low])           \n\t"
 107                 "sw     %[temp2],   8(%[p_x_low])           \n\t"
 108                 "sw     %[temp3],   12(%[p_x_low])          \n\t"
 109                 "sw     %[temp4],   16(%[p_x_low])          \n\t"
 110                 "sw     %[temp5],   20(%[p_x_low])          \n\t"
 111                 "sw     %[temp6],   24(%[p_x_low])          \n\t"
 112                 "sw     %[temp7],   28(%[p_x_low])          \n\t"
 113                 "addiu  %[p_x_low], %[p_x_low],     32      \n\t"
 114                 "addiu  %[p_w],     %[p_w],         1024    \n\t"
 115
 116                 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
 117                   [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
 118                   [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
 119                   [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
 120                   [p_w]"+r"(p_w), [p_x_low]"+r"(p_x_low)
 121                 :
 122                 : "memory"
 123             );
 124         }
 125         p_x_low += 16;
 126         p_w -= 2046;
 127     }
 128
 129     for (k = 0; k < sbr->kx[0]; k++) {
 130         for (i = 0; i < 2; i++) {
 131
 132             /* loop unrolled 4 times */
 133             __asm__ volatile (
 134                 "lw     %[temp0],    0(%[p_w1])             \n\t"
 135                 "lw     %[temp1],    4(%[p_w1])             \n\t"
 136                 "lw     %[temp2],    256(%[p_w1])           \n\t"
 137                 "lw     %[temp3],    260(%[p_w1])           \n\t"
 138                 "lw     %[temp4],    512(%[p_w1])           \n\t"
 139                 "lw     %[temp5],    516(%[p_w1])           \n\t"
 140                 "lw     %[temp6],    768(%[p_w1])           \n\t"
 141                 "lw     %[temp7],    772(%[p_w1])           \n\t"
 142                 "sw     %[temp0],    0(%[p_x1_low])         \n\t"
 143                 "sw     %[temp1],    4(%[p_x1_low])         \n\t"
 144                 "sw     %[temp2],    8(%[p_x1_low])         \n\t"
 145                 "sw     %[temp3],    12(%[p_x1_low])        \n\t"
 146                 "sw     %[temp4],    16(%[p_x1_low])        \n\t"
 147                 "sw     %[temp5],    20(%[p_x1_low])        \n\t"
 148                 "sw     %[temp6],    24(%[p_x1_low])        \n\t"
 149                 "sw     %[temp7],    28(%[p_x1_low])        \n\t"
 150                 "addiu  %[p_x1_low], %[p_x1_low],   32      \n\t"
 151                 "addiu  %[p_w1],     %[p_w1],       1024    \n\t"
 152
 153                 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
 154                   [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
 155                   [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
 156                   [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),
 157                   [p_w1]"+r"(p_w1), [p_x1_low]"+r"(p_x1_low)
 158                 :
 159                 : "memory"
 160             );
 161         }
 162         p_x1_low += 64;
 163         p_w1 -= 510;
 164     }
 165     return 0;
 166 }
 167
 168 static int sbr_x_gen_mips(SpectralBandReplication *sbr, float X[2][38][64],
 169                      const float Y0[38][64][2], const float Y1[38][64][2],
 170                      const float X_low[32][40][2], int ch)
 171 {
 172     int k, i;
 173     const int i_f = 32;
 174     int temp0, temp1, temp2, temp3;
 175     const float *X_low1, *Y01, *Y11;
 176     float *x1=&X[0][0][0];
 177     float *j=x1+4864;
 178     const int i_Temp = FFMAX(2*sbr->data[ch].t_env_num_env_old - i_f, 0);
 179
 180     /* loop unrolled 8 times */
 181     __asm__ volatile (
 182     "1:                                       \n\t"
 183         "sw     $0,      0(%[x1])             \n\t"
 184         "sw     $0,      4(%[x1])             \n\t"
 185         "sw     $0,      8(%[x1])             \n\t"
 186         "sw     $0,      12(%[x1])            \n\t"
 187         "sw     $0,      16(%[x1])            \n\t"
 188         "sw     $0,      20(%[x1])            \n\t"
 189         "sw     $0,      24(%[x1])            \n\t"
 190         "sw     $0,      28(%[x1])            \n\t"
 191         "addiu  %[x1],   %[x1],      32       \n\t"
 192         "bne    %[x1],   %[j],       1b       \n\t"
 193         "addiu  %[x1],   %[x1],      -19456   \n\t"
 194
 195         : [x1]"+r"(x1)
 196         : [j]"r"(j)
 197         : "memory"
 198     );
 199
 200     if (i_Temp != 0) {
 201
 202         X_low1=&X_low[0][2][0];
 203
 204         for (k = 0; k < sbr->kx[0]; k++) {
 205
 206             __asm__ volatile (
 207                 "move    %[i],        $zero                  \n\t"
 208             "2:                                              \n\t"
 209                 "lw      %[temp0],    0(%[X_low1])           \n\t"
 210                 "lw      %[temp1],    4(%[X_low1])           \n\t"
 211                 "sw      %[temp0],    0(%[x1])               \n\t"
 212                 "sw      %[temp1],    9728(%[x1])            \n\t"
 213                 "addiu   %[x1],       %[x1],         256     \n\t"
 214                 "addiu   %[X_low1],   %[X_low1],     8       \n\t"
 215                 "addiu   %[i],        %[i],          1       \n\t"
 216                 "bne     %[i],        %[i_Temp],     2b      \n\t"
 217
 218                 : [x1]"+r"(x1), [X_low1]"+r"(X_low1), [i]"=&r"(i),
 219                   [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
 220                 : [i_Temp]"r"(i_Temp)
 221                 : "memory"
 222             );
 223             x1-=(i_Temp<<6)-1;
 224             X_low1-=(i_Temp<<1)-80;
 225         }
 226
 227         x1=&X[0][0][k];
 228         Y01=(float*)&Y0[32][k][0];
 229
 230         for (; k < sbr->kx[0] + sbr->m[0]; k++) {
 231             __asm__ volatile (
 232                 "move    %[i],       $zero               \n\t"
 233             "3:                                          \n\t"
 234                 "lw      %[temp0],   0(%[Y01])           \n\t"
 235                 "lw      %[temp1],   4(%[Y01])           \n\t"
 236                 "sw      %[temp0],   0(%[x1])            \n\t"
 237                 "sw      %[temp1],   9728(%[x1])         \n\t"
 238                 "addiu   %[x1],      %[x1],      256     \n\t"
 239                 "addiu   %[Y01],     %[Y01],     512     \n\t"
 240                 "addiu   %[i],       %[i],       1       \n\t"
 241                 "bne     %[i],       %[i_Temp],  3b      \n\t"
 242
 243                 : [x1]"+r"(x1), [Y01]"+r"(Y01), [i]"=&r"(i),
 244                   [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
 245                 : [i_Temp]"r"(i_Temp)
 246                 : "memory"
 247             );
 248             x1 -=(i_Temp<<6)-1;
 249             Y01 -=(i_Temp<<7)-2;
 250         }
 251     }
 252
 253     x1=&X[0][i_Temp][0];
 254     X_low1=&X_low[0][i_Temp+2][0];
 255     temp3=38;
 256
 257     for (k = 0; k < sbr->kx[1]; k++) {
 258
 259         __asm__ volatile (
 260             "move    %[i],       %[i_Temp]              \n\t"
 261         "4:                                             \n\t"
 262             "lw      %[temp0],   0(%[X_low1])           \n\t"
 263             "lw      %[temp1],   4(%[X_low1])           \n\t"
 264             "sw      %[temp0],   0(%[x1])               \n\t"
 265             "sw      %[temp1],   9728(%[x1])            \n\t"
 266             "addiu   %[x1],      %[x1],         256     \n\t"
 267             "addiu   %[X_low1],  %[X_low1],     8       \n\t"
 268             "addiu   %[i],       %[i],          1       \n\t"
 269             "bne     %[i],       %[temp3],      4b      \n\t"
 270
 271             : [x1]"+r"(x1), [X_low1]"+r"(X_low1), [i]"=&r"(i),
 272               [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
 273               [temp2]"=&r"(temp2)
 274             : [i_Temp]"r"(i_Temp), [temp3]"r"(temp3)
 275             : "memory"
 276         );
 277         x1 -= ((38-i_Temp)<<6)-1;
 278         X_low1 -= ((38-i_Temp)<<1)- 80;
 279     }
 280
 281     x1=&X[0][i_Temp][k];
 282     Y11=&Y1[i_Temp][k][0];
 283     temp2=32;
 284
 285     for (; k < sbr->kx[1] + sbr->m[1]; k++) {
 286
 287         __asm__ volatile (
 288            "move    %[i],       %[i_Temp]               \n\t"
 289         "5:                                             \n\t"
 290            "lw      %[temp0],   0(%[Y11])               \n\t"
 291            "lw      %[temp1],   4(%[Y11])               \n\t"
 292            "sw      %[temp0],   0(%[x1])                \n\t"
 293            "sw      %[temp1],   9728(%[x1])             \n\t"
 294            "addiu   %[x1],      %[x1],          256     \n\t"
 295            "addiu   %[Y11],     %[Y11],         512     \n\t"
 296            "addiu   %[i],       %[i],           1       \n\t"
 297            "bne     %[i],       %[temp2],       5b      \n\t"
 298
 299            : [x1]"+r"(x1), [Y11]"+r"(Y11), [i]"=&r"(i),
 300              [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
 301            : [i_Temp]"r"(i_Temp), [temp3]"r"(temp3),
 302              [temp2]"r"(temp2)
 303            : "memory"
 304         );
 305
 306         x1 -= ((32-i_Temp)<<6)-1;
 307         Y11 -= ((32-i_Temp)<<7)-2;
 308    }
 309       return 0;
 310 }
 311
 312 #if HAVE_MIPSFPU
 313 static void sbr_hf_assemble_mips(float Y1[38][64][2],
 314                             const float X_high[64][40][2],
 315                             SpectralBandReplication *sbr, SBRData *ch_data,
 316                             const int e_a[2])
 317 {
 318     int e, i, j, m;
 319     const int h_SL = 4 * !sbr->bs_smoothing_mode;
 320     const int kx = sbr->kx[1];
 321     const int m_max = sbr->m[1];
 322     static const float h_smooth[5] = {
 323         0.33333333333333,
 324         0.30150283239582,
 325         0.21816949906249,
 326         0.11516383427084,
 327         0.03183050093751,
 328     };
 329
 330     float (*g_temp)[48] = ch_data->g_temp, (*q_temp)[48] = ch_data->q_temp;
 331     int indexnoise = ch_data->f_indexnoise;
 332     int indexsine  = ch_data->f_indexsine;
 333     float *g_temp1, *q_temp1, *pok, *pok1;
 334     float temp1, temp2, temp3, temp4;
 335     int size = m_max;
 336
 337     if (sbr->reset) {
 338         for (i = 0; i < h_SL; i++) {
 339             memcpy(g_temp[i + 2*ch_data->t_env[0]], sbr->gain[0], m_max * sizeof(sbr->gain[0][0]));
 340             memcpy(q_temp[i + 2*ch_data->t_env[0]], sbr->q_m[0],  m_max * sizeof(sbr->q_m[0][0]));
 341         }
 342     } else if (h_SL) {
 343         memcpy(g_temp[2*ch_data->t_env[0]], g_temp[2*ch_data->t_env_num_env_old], 4*sizeof(g_temp[0]));
 344         memcpy(q_temp[2*ch_data->t_env[0]], q_temp[2*ch_data->t_env_num_env_old], 4*sizeof(q_temp[0]));
 345     }
 346
 347     for (e = 0; e < ch_data->bs_num_env; e++) {
 348         for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
 349             g_temp1 = g_temp[h_SL + i];
 350             pok = sbr->gain[e];
 351             q_temp1 = q_temp[h_SL + i];
 352             pok1 = sbr->q_m[e];
 353
 354             /* loop unrolled 4 times */
 355             for (j=0; j<(size>>2); j++) {
 356                 __asm__ volatile (
 357                     "lw      %[temp1],   0(%[pok])               \n\t"
 358                     "lw      %[temp2],   4(%[pok])               \n\t"
 359                     "lw      %[temp3],   8(%[pok])               \n\t"
 360                     "lw      %[temp4],   12(%[pok])              \n\t"
 361                     "sw      %[temp1],   0(%[g_temp1])           \n\t"
 362                     "sw      %[temp2],   4(%[g_temp1])           \n\t"
 363                     "sw      %[temp3],   8(%[g_temp1])           \n\t"
 364                     "sw      %[temp4],   12(%[g_temp1])          \n\t"
 365                     "lw      %[temp1],   0(%[pok1])              \n\t"
 366                     "lw      %[temp2],   4(%[pok1])              \n\t"
 367                     "lw      %[temp3],   8(%[pok1])              \n\t"
 368                     "lw      %[temp4],   12(%[pok1])             \n\t"
 369                     "sw      %[temp1],   0(%[q_temp1])           \n\t"
 370                     "sw      %[temp2],   4(%[q_temp1])           \n\t"
 371                     "sw      %[temp3],   8(%[q_temp1])           \n\t"
 372                     "sw      %[temp4],   12(%[q_temp1])          \n\t"
 373                     "addiu   %[pok],     %[pok],           16    \n\t"
 374                     "addiu   %[g_temp1], %[g_temp1],       16    \n\t"
 375                     "addiu   %[pok1],    %[pok1],          16    \n\t"
 376                     "addiu   %[q_temp1], %[q_temp1],       16    \n\t"
 377
 378                     : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
 379                       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
 380                       [pok]"+r"(pok), [g_temp1]"+r"(g_temp1),
 381                       [pok1]"+r"(pok1), [q_temp1]"+r"(q_temp1)
 382                     :
 383                     : "memory"
 384                 );
 385             }
 386
 387             for (j=0; j<(size&3); j++) {
 388                 __asm__ volatile (
 389                     "lw      %[temp1],   0(%[pok])              \n\t"
 390                     "lw      %[temp2],   0(%[pok1])             \n\t"
 391                     "sw      %[temp1],   0(%[g_temp1])          \n\t"
 392                     "sw      %[temp2],   0(%[q_temp1])          \n\t"
 393                     "addiu   %[pok],     %[pok],          4     \n\t"
 394                     "addiu   %[g_temp1], %[g_temp1],      4     \n\t"
 395                     "addiu   %[pok1],    %[pok1],         4     \n\t"
 396                     "addiu   %[q_temp1], %[q_temp1],      4     \n\t"
 397
 398                     : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
 399                       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
 400                       [pok]"+r"(pok), [g_temp1]"+r"(g_temp1),
 401                       [pok1]"+r"(pok1), [q_temp1]"+r"(q_temp1)
 402                     :
 403                     : "memory"
 404                 );
 405             }
 406         }
 407     }
 408
 409     for (e = 0; e < ch_data->bs_num_env; e++) {
 410         for (i = 2 * ch_data->t_env[e]; i < 2 * ch_data->t_env[e + 1]; i++) {
 411             LOCAL_ALIGNED_16(float, g_filt_tab, [48]);
 412             LOCAL_ALIGNED_16(float, q_filt_tab, [48]);
 413             float *g_filt, *q_filt;
 414
 415             if (h_SL && e != e_a[0] && e != e_a[1]) {
 416                 g_filt = g_filt_tab;
 417                 q_filt = q_filt_tab;
 418
 419                 for (m = 0; m < m_max; m++) {
 420                     const int idx1 = i + h_SL;
 421                     g_filt[m] = 0.0f;
 422                     q_filt[m] = 0.0f;
 423
 424                     for (j = 0; j <= h_SL; j++) {
 425                         g_filt[m] += g_temp[idx1 - j][m] * h_smooth[j];
 426                         q_filt[m] += q_temp[idx1 - j][m] * h_smooth[j];
 427                     }
 428                 }
 429             } else {
 430                 g_filt = g_temp[i + h_SL];
 431                 q_filt = q_temp[i];
 432             }
 433
 434             sbr->dsp.hf_g_filt(Y1[i] + kx, X_high + kx, g_filt, m_max,
 435                                i + ENVELOPE_ADJUSTMENT_OFFSET);
 436
 437             if (e != e_a[0] && e != e_a[1]) {
 438                 sbr->dsp.hf_apply_noise[indexsine](Y1[i] + kx, sbr->s_m[e],
 439                                                    q_filt, indexnoise,
 440                                                    kx, m_max);
 441             } else {
 442                 int idx = indexsine&1;
 443                 int A = (1-((indexsine+(kx & 1))&2));
 444                 int B = (A^(-idx)) + idx;
 445                 float *out = &Y1[i][kx][idx];
 446                 float *in  = sbr->s_m[e];
 447                 float temp0, temp1, temp2, temp3, temp4, temp5;
 448                 float A_f = (float)A;
 449                 float B_f = (float)B;
 450
 451                 for (m = 0; m+1 < m_max; m+=2) {
 452
 453                     temp2 = out[0];
 454                     temp3 = out[2];
 455
 456                     __asm__ volatile(
 457                         "lwc1    %[temp0],  0(%[in])                     \n\t"
 458                         "lwc1    %[temp1],  4(%[in])                     \n\t"
 459                         "madd.s  %[temp4],  %[temp2],  %[temp0], %[A_f]  \n\t"
 460                         "madd.s  %[temp5],  %[temp3],  %[temp1], %[B_f]  \n\t"
 461                         "swc1    %[temp4],  0(%[out])                    \n\t"
 462                         "swc1    %[temp5],  8(%[out])                    \n\t"
 463                         "addiu   %[in],     %[in],     8                 \n\t"
 464                         "addiu   %[out],    %[out],    16                \n\t"
 465
 466                         : [temp0]"=&f" (temp0), [temp1]"=&f"(temp1),
 467                           [temp4]"=&f" (temp4), [temp5]"=&f"(temp5),
 468                           [in]"+r"(in), [out]"+r"(out)
 469                         : [A_f]"f"(A_f), [B_f]"f"(B_f), [temp2]"f"(temp2),
 470                           [temp3]"f"(temp3)
 471                         : "memory"
 472                     );
 473                 }
 474                 if(m_max&1)
 475                     out[2*m  ] += in[m  ] * A;
 476             }
 477             indexnoise = (indexnoise + m_max) & 0x1ff;
 478             indexsine = (indexsine + 1) & 3;
 479         }
 480     }
 481     ch_data->f_indexnoise = indexnoise;
 482     ch_data->f_indexsine  = indexsine;
 483 }
 484
 485 static void sbr_hf_inverse_filter_mips(SBRDSPContext *dsp,
 486                                   float (*alpha0)[2], float (*alpha1)[2],
 487                                   const float X_low[32][40][2], int k0)
 488 {
 489     int k;
 490     float temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, c;
 491     float *phi1, *alpha_1, *alpha_0, res1, res2, temp_real, temp_im;
 492
 493     c = 1.000001f;
 494
 495     for (k = 0; k < k0; k++) {
 496         LOCAL_ALIGNED_16(float, phi, [3], [2][2]);
 497         float dk;
 498         phi1 = &phi[0][0][0];
 499         alpha_1 = &alpha1[k][0];
 500         alpha_0 = &alpha0[k][0];
 501         dsp->autocorrelate(X_low[k], phi);
 502
 503         __asm__ volatile (
 504             "lwc1    %[temp0],  40(%[phi1])                       \n\t"
 505             "lwc1    %[temp1],  16(%[phi1])                       \n\t"
 506             "lwc1    %[temp2],  24(%[phi1])                       \n\t"
 507             "lwc1    %[temp3],  28(%[phi1])                       \n\t"
 508             "mul.s   %[dk],     %[temp0],    %[temp1]             \n\t"
 509             "lwc1    %[temp4],  0(%[phi1])                        \n\t"
 510             "mul.s   %[res2],   %[temp2],    %[temp2]             \n\t"
 511             "lwc1    %[temp5],  4(%[phi1])                        \n\t"
 512             "madd.s  %[res2],   %[res2],     %[temp3],  %[temp3]  \n\t"
 513             "lwc1    %[temp6],  8(%[phi1])                        \n\t"
 514             "div.s   %[res2],   %[res2],     %[c]                 \n\t"
 515             "lwc1    %[temp0],  12(%[phi1])                       \n\t"
 516             "sub.s   %[dk],     %[dk],       %[res2]              \n\t"
 517
 518             : [temp0]"=&f"(temp0), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
 519               [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
 520               [temp6]"=&f"(temp6), [res2]"=&f"(res2), [dk]"=&f"(dk)
 521             : [phi1]"r"(phi1), [c]"f"(c)
 522             : "memory"
 523         );
 524
 525         if (!dk) {
 526             alpha_1[0] = 0;
 527             alpha_1[1] = 0;
 528         } else {
 529             __asm__ volatile (
 530                 "mul.s   %[temp_real], %[temp4],     %[temp2]            \n\t"
 531                 "nmsub.s %[temp_real], %[temp_real], %[temp5], %[temp3]  \n\t"
 532                 "nmsub.s %[temp_real], %[temp_real], %[temp6], %[temp1]  \n\t"
 533                 "mul.s   %[temp_im],   %[temp4],     %[temp3]            \n\t"
 534                 "madd.s  %[temp_im],   %[temp_im],   %[temp5], %[temp2]  \n\t"
 535                 "nmsub.s %[temp_im],   %[temp_im],   %[temp0], %[temp1]  \n\t"
 536                 "div.s   %[temp_real], %[temp_real], %[dk]               \n\t"
 537                 "div.s   %[temp_im],   %[temp_im],   %[dk]               \n\t"
 538                 "swc1    %[temp_real], 0(%[alpha_1])                     \n\t"
 539                 "swc1    %[temp_im],   4(%[alpha_1])                     \n\t"
 540
 541                 : [temp_real]"=&f" (temp_real), [temp_im]"=&f"(temp_im)
 542                 : [phi1]"r"(phi1), [temp0]"f"(temp0), [temp1]"f"(temp1),
 543                   [temp2]"f"(temp2), [temp3]"f"(temp3), [temp4]"f"(temp4),
 544                   [temp5]"f"(temp5), [temp6]"f"(temp6),
 545                   [alpha_1]"r"(alpha_1), [dk]"f"(dk)
 546                 : "memory"
 547             );
 548         }
 549
 550         if (!phi1[4]) {
 551             alpha_0[0] = 0;
 552             alpha_0[1] = 0;
 553         } else {
 554             __asm__ volatile (
 555                 "lwc1    %[temp6],     0(%[alpha_1])                     \n\t"
 556                 "lwc1    %[temp7],     4(%[alpha_1])                     \n\t"
 557                 "mul.s   %[temp_real], %[temp6],     %[temp2]            \n\t"
 558                 "add.s   %[temp_real], %[temp_real], %[temp4]            \n\t"
 559                 "madd.s  %[temp_real], %[temp_real], %[temp7], %[temp3]  \n\t"
 560                 "mul.s   %[temp_im],   %[temp7],     %[temp2]            \n\t"
 561                 "add.s   %[temp_im],   %[temp_im],   %[temp5]            \n\t"
 562                 "nmsub.s %[temp_im],   %[temp_im],   %[temp6], %[temp3]  \n\t"
 563                 "div.s   %[temp_real], %[temp_real], %[temp1]            \n\t"
 564                 "div.s   %[temp_im],   %[temp_im],   %[temp1]            \n\t"
 565                 "neg.s   %[temp_real], %[temp_real]                      \n\t"
 566                 "neg.s   %[temp_im],   %[temp_im]                        \n\t"
 567                 "swc1    %[temp_real], 0(%[alpha_0])                     \n\t"
 568                 "swc1    %[temp_im],   4(%[alpha_0])                     \n\t"
 569
 570                 : [temp_real]"=&f"(temp_real), [temp_im]"=&f"(temp_im),
 571                   [temp6]"=&f"(temp6), [temp7]"=&f"(temp7),
 572                   [res1]"=&f"(res1), [res2]"=&f"(res2)
 573                 : [alpha_1]"r"(alpha_1), [alpha_0]"r"(alpha_0),
 574                   [temp0]"f"(temp0), [temp1]"f"(temp1), [temp2]"f"(temp2),
 575                   [temp3]"f"(temp3), [temp4]"f"(temp4), [temp5]"f"(temp5)
 576                 : "memory"
 577             );
 578         }
 579
 580         __asm__ volatile (
 581             "lwc1    %[temp1],      0(%[alpha_1])                           \n\t"
 582             "lwc1    %[temp2],      4(%[alpha_1])                           \n\t"
 583             "lwc1    %[temp_real],  0(%[alpha_0])                           \n\t"
 584             "lwc1    %[temp_im],    4(%[alpha_0])                           \n\t"
 585             "mul.s   %[res1],       %[temp1],      %[temp1]                 \n\t"
 586             "madd.s  %[res1],       %[res1],       %[temp2],    %[temp2]    \n\t"
 587             "mul.s   %[res2],       %[temp_real],  %[temp_real]             \n\t"
 588             "madd.s  %[res2],       %[res2],       %[temp_im],  %[temp_im]  \n\t"
 589
 590             : [temp_real]"=&f"(temp_real), [temp_im]"=&f"(temp_im),
 591               [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
 592               [res1]"=&f"(res1), [res2]"=&f"(res2)
 593             : [alpha_1]"r"(alpha_1), [alpha_0]"r"(alpha_0)
 594             : "memory"
 595         );
 596
 597         if (res1 >= 16.0f || res2 >= 16.0f) {
 598             alpha_1[0] = 0;
 599             alpha_1[1] = 0;
 600             alpha_0[0] = 0;
 601             alpha_0[1] = 0;
 602         }
 603     }
 604 }
 605 #endif /* HAVE_MIPSFPU */
 606 #endif /* HAVE_INLINE_ASM */
 607
 608 void ff_aacsbr_func_ptr_init_mips(AACSBRContext *c)
 609 {
 610 #if HAVE_INLINE_ASM
 611     c->sbr_lf_gen            = sbr_lf_gen_mips;
 612     c->sbr_x_gen             = sbr_x_gen_mips;
 613 #if HAVE_MIPSFPU
 614     c->sbr_hf_inverse_filter = sbr_hf_inverse_filter_mips;
 615     c->sbr_hf_assemble       = sbr_hf_assemble_mips;
 616 #endif /* HAVE_MIPSFPU */
 617 #endif /* HAVE_INLINE_ASM */
 618 }