Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libavcodec / mips / aaccoder_mips.c
CommitLineData
2ba45a60
DM
1/*
2 * Copyright (c) 2012
3 * MIPS Technologies, Inc., California.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
14 * contributors may be used to endorse or promote products derived from
15 * this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * Author: Stanislav Ocovaj (socovaj@mips.com)
30 * Szabolcs Pal (sabolc@mips.com)
31 *
32 * AAC coefficients encoder optimized for MIPS floating-point architecture
33 *
34 * This file is part of FFmpeg.
35 *
36 * FFmpeg is free software; you can redistribute it and/or
37 * modify it under the terms of the GNU Lesser General Public
38 * License as published by the Free Software Foundation; either
39 * version 2.1 of the License, or (at your option) any later version.
40 *
41 * FFmpeg is distributed in the hope that it will be useful,
42 * but WITHOUT ANY WARRANTY; without even the implied warranty of
43 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
44 * Lesser General Public License for more details.
45 *
46 * You should have received a copy of the GNU Lesser General Public
47 * License along with FFmpeg; if not, write to the Free Software
48 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
49 */
50
51/**
52 * @file
53 * Reference: libavcodec/aaccoder.c
54 */
55
56#include "libavutil/libm.h"
57
58#include <float.h>
59#include "libavutil/mathematics.h"
60#include "libavcodec/avcodec.h"
61#include "libavcodec/put_bits.h"
62#include "libavcodec/aac.h"
63#include "libavcodec/aacenc.h"
64#include "libavcodec/aactab.h"
65
66#if HAVE_INLINE_ASM
/**
 * One node of a band-coding path.
 *
 * NOTE(review): no user of this type is visible in this part of the file;
 * the per-field notes below are inferred from the field names only and
 * should be confirmed against the code that fills the structure in.
 */
typedef struct BandCodingPath {
    int   prev_idx; /* presumably the index of the preceding node on the path */
    float cost;     /* presumably the accumulated cost up to this node */
    int   run;      /* presumably the current run length */
} BandCodingPath;
72
/**
 * Bit counts for the long table: entries 0..30 hold 5, entries 31..62
 * hold 10 and entry 63 holds 15.
 * NOTE(review): the users of this table are outside the visible part of
 * the file; it is presumably indexed by section run length - confirm.
 */
static const uint8_t run_value_bits_long[64] = {
     5,  5,  5,  5,  5,  5,  5,  5,     /*  0.. 7 */
     5,  5,  5,  5,  5,  5,  5,  5,     /*  8..15 */
     5,  5,  5,  5,  5,  5,  5,  5,     /* 16..23 */
     5,  5,  5,  5,  5,  5,  5, 10,     /* 24..31 */
    10, 10, 10, 10, 10, 10, 10, 10,     /* 32..39 */
    10, 10, 10, 10, 10, 10, 10, 10,     /* 40..47 */
    10, 10, 10, 10, 10, 10, 10, 10,     /* 48..55 */
    10, 10, 10, 10, 10, 10, 10, 15      /* 56..63 */
};

/**
 * Bit counts for the short table: entries 0..6 hold 3, entries 7..14
 * hold 6 and entry 15 holds 9.
 */
static const uint8_t run_value_bits_short[16] = {
    3, 3, 3, 3, 3, 3, 3, 6,             /*  0.. 7 */
    6, 6, 6, 6, 6, 6, 6, 9              /*  8..15 */
};

/** Selector: element 0 is the long table, element 1 the short table. */
static const uint8_t * const run_value_bits[2] = {
    run_value_bits_long,
    run_value_bits_short
};
87
/**
 * Number of sign bits for an unsigned four-tuple, indexed by
 * ((q1 * 3 + q2) * 3 + q3) * 3 + q4 with every q in 0..2.
 * Each entry equals the count of non-zero values among q1..q4.
 *
 * A row of nine entries covers all (q3, q4) pairs for one (q1, q2) pair;
 * the row argument is the number of non-zero values among q1 and q2.
 */
#define UQUAD_SIGN_ROW(nz)                                  \
    (nz),     (nz) + 1, (nz) + 1,                           \
    (nz) + 1, (nz) + 2, (nz) + 2,                           \
    (nz) + 1, (nz) + 2, (nz) + 2

static const uint8_t uquad_sign_bits[81] = {
    UQUAD_SIGN_ROW(0),      /* q1 = 0, q2 = 0 */
    UQUAD_SIGN_ROW(1),      /* q1 = 0, q2 = 1 */
    UQUAD_SIGN_ROW(1),      /* q1 = 0, q2 = 2 */
    UQUAD_SIGN_ROW(1),      /* q1 = 1, q2 = 0 */
    UQUAD_SIGN_ROW(2),      /* q1 = 1, q2 = 1 */
    UQUAD_SIGN_ROW(2),      /* q1 = 1, q2 = 2 */
    UQUAD_SIGN_ROW(1),      /* q1 = 2, q2 = 0 */
    UQUAD_SIGN_ROW(2),      /* q1 = 2, q2 = 1 */
    UQUAD_SIGN_ROW(2)       /* q1 = 2, q2 = 2 */
};

#undef UQUAD_SIGN_ROW
99
/**
 * Number of sign bits for an unsigned pair with values 0..7, indexed by
 * 8 * q1 + q2. Each entry equals the count of non-zero values among q1, q2.
 *
 * One row holds the eight q2 entries for a fixed q1; the row argument is
 * 1 when q1 is non-zero and 0 otherwise.
 */
#define UPAIR7_SIGN_ROW(nz)                                 \
    (nz),     (nz) + 1, (nz) + 1, (nz) + 1,                 \
    (nz) + 1, (nz) + 1, (nz) + 1, (nz) + 1

static const uint8_t upair7_sign_bits[64] = {
    UPAIR7_SIGN_ROW(0),     /* q1 = 0 */
    UPAIR7_SIGN_ROW(1),     /* q1 = 1 */
    UPAIR7_SIGN_ROW(1),     /* q1 = 2 */
    UPAIR7_SIGN_ROW(1),     /* q1 = 3 */
    UPAIR7_SIGN_ROW(1),     /* q1 = 4 */
    UPAIR7_SIGN_ROW(1),     /* q1 = 5 */
    UPAIR7_SIGN_ROW(1),     /* q1 = 6 */
    UPAIR7_SIGN_ROW(1)      /* q1 = 7 */
};

#undef UPAIR7_SIGN_ROW
110
/**
 * Number of sign bits for an unsigned pair with values 0..12, indexed by
 * 13 * q1 + q2. Each entry equals the count of non-zero values among q1, q2.
 *
 * One row holds the thirteen q2 entries for a fixed q1; the row argument
 * is 1 when q1 is non-zero and 0 otherwise.
 */
#define UPAIR12_SIGN_ROW(nz)                                \
    (nz),                                                   \
    (nz) + 1, (nz) + 1, (nz) + 1, (nz) + 1,                 \
    (nz) + 1, (nz) + 1, (nz) + 1, (nz) + 1,                 \
    (nz) + 1, (nz) + 1, (nz) + 1, (nz) + 1

static const uint8_t upair12_sign_bits[169] = {
    UPAIR12_SIGN_ROW(0),    /* q1 =  0 */
    UPAIR12_SIGN_ROW(1),    /* q1 =  1 */
    UPAIR12_SIGN_ROW(1),    /* q1 =  2 */
    UPAIR12_SIGN_ROW(1),    /* q1 =  3 */
    UPAIR12_SIGN_ROW(1),    /* q1 =  4 */
    UPAIR12_SIGN_ROW(1),    /* q1 =  5 */
    UPAIR12_SIGN_ROW(1),    /* q1 =  6 */
    UPAIR12_SIGN_ROW(1),    /* q1 =  7 */
    UPAIR12_SIGN_ROW(1),    /* q1 =  8 */
    UPAIR12_SIGN_ROW(1),    /* q1 =  9 */
    UPAIR12_SIGN_ROW(1),    /* q1 = 10 */
    UPAIR12_SIGN_ROW(1),    /* q1 = 11 */
    UPAIR12_SIGN_ROW(1)     /* q1 = 12 */
};

#undef UPAIR12_SIGN_ROW
126
/**
 * Number of sign bits for an unsigned pair with values 0..16, indexed by
 * 17 * q1 + q2. Each entry equals the count of non-zero values among q1, q2.
 *
 * One row holds the seventeen q2 entries for a fixed q1; the row argument
 * is 1 when q1 is non-zero and 0 otherwise.
 */
#define ESC_SIGN_ROW(nz)                                    \
    (nz),                                                   \
    (nz) + 1, (nz) + 1, (nz) + 1, (nz) + 1,                 \
    (nz) + 1, (nz) + 1, (nz) + 1, (nz) + 1,                 \
    (nz) + 1, (nz) + 1, (nz) + 1, (nz) + 1,                 \
    (nz) + 1, (nz) + 1, (nz) + 1, (nz) + 1

static const uint8_t esc_sign_bits[289] = {
    ESC_SIGN_ROW(0),        /* q1 =  0 */
    ESC_SIGN_ROW(1),        /* q1 =  1 */
    ESC_SIGN_ROW(1),        /* q1 =  2 */
    ESC_SIGN_ROW(1),        /* q1 =  3 */
    ESC_SIGN_ROW(1),        /* q1 =  4 */
    ESC_SIGN_ROW(1),        /* q1 =  5 */
    ESC_SIGN_ROW(1),        /* q1 =  6 */
    ESC_SIGN_ROW(1),        /* q1 =  7 */
    ESC_SIGN_ROW(1),        /* q1 =  8 */
    ESC_SIGN_ROW(1),        /* q1 =  9 */
    ESC_SIGN_ROW(1),        /* q1 = 10 */
    ESC_SIGN_ROW(1),        /* q1 = 11 */
    ESC_SIGN_ROW(1),        /* q1 = 12 */
    ESC_SIGN_ROW(1),        /* q1 = 13 */
    ESC_SIGN_ROW(1),        /* q1 = 14 */
    ESC_SIGN_ROW(1),        /* q1 = 15 */
    ESC_SIGN_ROW(1)         /* q1 = 16 */
};

#undef ESC_SIGN_ROW
146
147static void abs_pow34_v(float *out, const float *in, const int size) {
148#ifndef USE_REALLY_FULL_SEARCH
149 int i;
150 float a, b, c, d;
151 float ax, bx, cx, dx;
152
153 for (i = 0; i < size; i += 4) {
154 a = fabsf(in[i ]);
155 b = fabsf(in[i+1]);
156 c = fabsf(in[i+2]);
157 d = fabsf(in[i+3]);
158
159 ax = sqrtf(a);
160 bx = sqrtf(b);
161 cx = sqrtf(c);
162 dx = sqrtf(d);
163
164 a = a * ax;
165 b = b * bx;
166 c = c * cx;
167 d = d * dx;
168
169 out[i ] = sqrtf(a);
170 out[i+1] = sqrtf(b);
171 out[i+2] = sqrtf(c);
172 out[i+3] = sqrtf(d);
173 }
174#endif /* USE_REALLY_FULL_SEARCH */
175}
176
/**
 * Return the largest value found in the first swb_size entries of each of
 * group_len consecutive windows, where window w starts at scaled[w * 128].
 * The search starts from 0.0f, so the result is never below zero for
 * ordinary (non-NaN) input.
 *
 * @param group_len number of windows to scan
 * @param swb_size  number of entries scanned in every window
 * @param scaled    values to scan
 * @return the maximum found, or 0.0f when nothing larger was seen
 */
static float find_max_val(int group_len, int swb_size, const float *scaled) {
    float maxval = 0.0f;
    int win, k;

    for (win = 0; win < group_len; win++) {
        const int base = win * 128;

        for (k = 0; k < swb_size; k++) {
            const float cand = scaled[base + k];

            /* keep maxval only while it is strictly greater */
            if (!(maxval > cand))
                maxval = cand;
        }
    }
    return maxval;
}
187
188static int find_min_book(float maxval, int sf) {
189 float Q = ff_aac_pow2sf_tab[POW_SF2_ZERO - sf + SCALE_ONE_POS - SCALE_DIV_512];
190 float Q34 = sqrtf(Q * sqrtf(Q));
191 int qmaxval, cb;
192 qmaxval = maxval * Q34 + 0.4054f;
193 if (qmaxval == 0) cb = 0;
194 else if (qmaxval == 1) cb = 1;
195 else if (qmaxval == 2) cb = 3;
196 else if (qmaxval <= 4) cb = 5;
197 else if (qmaxval <= 7) cb = 7;
198 else if (qmaxval <= 12) cb = 9;
199 else cb = 11;
200 return cb;
201}
202
203/**
204 * Functions developed from template function and optimized for quantizing and encoding band
205 */
206static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s,
207 PutBitContext *pb, const float *in,
208 const float *scaled, int size, int scale_idx,
209 int cb, const float lambda, const float uplim,
210 int *bits)
211{
212 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
213 int i;
214 int qc1, qc2, qc3, qc4;
215
216 uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
217 uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
218
219 abs_pow34_v(s->scoefs, in, size);
220 scaled = s->scoefs;
221 for (i = 0; i < size; i += 4) {
222 int curidx;
223 int *in_int = (int *)&in[i];
224
225 qc1 = scaled[i ] * Q34 + 0.4054f;
226 qc2 = scaled[i+1] * Q34 + 0.4054f;
227 qc3 = scaled[i+2] * Q34 + 0.4054f;
228 qc4 = scaled[i+3] * Q34 + 0.4054f;
229
230 __asm__ volatile (
231 ".set push \n\t"
232 ".set noreorder \n\t"
233
234 "slt %[qc1], $zero, %[qc1] \n\t"
235 "slt %[qc2], $zero, %[qc2] \n\t"
236 "slt %[qc3], $zero, %[qc3] \n\t"
237 "slt %[qc4], $zero, %[qc4] \n\t"
238 "lw $t0, 0(%[in_int]) \n\t"
239 "lw $t1, 4(%[in_int]) \n\t"
240 "lw $t2, 8(%[in_int]) \n\t"
241 "lw $t3, 12(%[in_int]) \n\t"
242 "srl $t0, $t0, 31 \n\t"
243 "srl $t1, $t1, 31 \n\t"
244 "srl $t2, $t2, 31 \n\t"
245 "srl $t3, $t3, 31 \n\t"
246 "subu $t4, $zero, %[qc1] \n\t"
247 "subu $t5, $zero, %[qc2] \n\t"
248 "subu $t6, $zero, %[qc3] \n\t"
249 "subu $t7, $zero, %[qc4] \n\t"
250 "movn %[qc1], $t4, $t0 \n\t"
251 "movn %[qc2], $t5, $t1 \n\t"
252 "movn %[qc3], $t6, $t2 \n\t"
253 "movn %[qc4], $t7, $t3 \n\t"
254
255 ".set pop \n\t"
256
257 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
258 [qc3]"+r"(qc3), [qc4]"+r"(qc4)
259 : [in_int]"r"(in_int)
260 : "t0", "t1", "t2", "t3",
261 "t4", "t5", "t6", "t7",
262 "memory"
263 );
264
265 curidx = qc1;
266 curidx *= 3;
267 curidx += qc2;
268 curidx *= 3;
269 curidx += qc3;
270 curidx *= 3;
271 curidx += qc4;
272 curidx += 40;
273
274 put_bits(pb, p_bits[curidx], p_codes[curidx]);
275 }
276}
277
278static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
279 PutBitContext *pb, const float *in,
280 const float *scaled, int size, int scale_idx,
281 int cb, const float lambda, const float uplim,
282 int *bits)
283{
284 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
285 int i;
286 int qc1, qc2, qc3, qc4;
287
288 uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
289 uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
290
291 abs_pow34_v(s->scoefs, in, size);
292 scaled = s->scoefs;
293 for (i = 0; i < size; i += 4) {
294 int curidx, sign, count;
295 int *in_int = (int *)&in[i];
296 uint8_t v_bits;
297 unsigned int v_codes;
298
299 qc1 = scaled[i ] * Q34 + 0.4054f;
300 qc2 = scaled[i+1] * Q34 + 0.4054f;
301 qc3 = scaled[i+2] * Q34 + 0.4054f;
302 qc4 = scaled[i+3] * Q34 + 0.4054f;
303
304 __asm__ volatile (
305 ".set push \n\t"
306 ".set noreorder \n\t"
307
308 "ori $t4, $zero, 2 \n\t"
309 "ori %[sign], $zero, 0 \n\t"
310 "slt $t0, $t4, %[qc1] \n\t"
311 "slt $t1, $t4, %[qc2] \n\t"
312 "slt $t2, $t4, %[qc3] \n\t"
313 "slt $t3, $t4, %[qc4] \n\t"
314 "movn %[qc1], $t4, $t0 \n\t"
315 "movn %[qc2], $t4, $t1 \n\t"
316 "movn %[qc3], $t4, $t2 \n\t"
317 "movn %[qc4], $t4, $t3 \n\t"
318 "lw $t0, 0(%[in_int]) \n\t"
319 "lw $t1, 4(%[in_int]) \n\t"
320 "lw $t2, 8(%[in_int]) \n\t"
321 "lw $t3, 12(%[in_int]) \n\t"
322 "slt $t0, $t0, $zero \n\t"
323 "movn %[sign], $t0, %[qc1] \n\t"
324 "slt $t1, $t1, $zero \n\t"
325 "slt $t2, $t2, $zero \n\t"
326 "slt $t3, $t3, $zero \n\t"
327 "sll $t0, %[sign], 1 \n\t"
328 "or $t0, $t0, $t1 \n\t"
329 "movn %[sign], $t0, %[qc2] \n\t"
330 "slt $t4, $zero, %[qc1] \n\t"
331 "slt $t1, $zero, %[qc2] \n\t"
332 "slt %[count], $zero, %[qc3] \n\t"
333 "sll $t0, %[sign], 1 \n\t"
334 "or $t0, $t0, $t2 \n\t"
335 "movn %[sign], $t0, %[qc3] \n\t"
336 "slt $t2, $zero, %[qc4] \n\t"
337 "addu %[count], %[count], $t4 \n\t"
338 "addu %[count], %[count], $t1 \n\t"
339 "sll $t0, %[sign], 1 \n\t"
340 "or $t0, $t0, $t3 \n\t"
341 "movn %[sign], $t0, %[qc4] \n\t"
342 "addu %[count], %[count], $t2 \n\t"
343
344 ".set pop \n\t"
345
346 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
347 [qc3]"+r"(qc3), [qc4]"+r"(qc4),
348 [sign]"=&r"(sign), [count]"=&r"(count)
349 : [in_int]"r"(in_int)
350 : "t0", "t1", "t2", "t3", "t4",
351 "memory"
352 );
353
354 curidx = qc1;
355 curidx *= 3;
356 curidx += qc2;
357 curidx *= 3;
358 curidx += qc3;
359 curidx *= 3;
360 curidx += qc4;
361
362 v_codes = (p_codes[curidx] << count) | (sign & ((1 << count) - 1));
363 v_bits = p_bits[curidx] + count;
364 put_bits(pb, v_bits, v_codes);
365 }
366}
367
368static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
369 PutBitContext *pb, const float *in,
370 const float *scaled, int size, int scale_idx,
371 int cb, const float lambda, const float uplim,
372 int *bits)
373{
374 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
375 int i;
376 int qc1, qc2, qc3, qc4;
377
378 uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
379 uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
380
381 abs_pow34_v(s->scoefs, in, size);
382 scaled = s->scoefs;
383 for (i = 0; i < size; i += 4) {
384 int curidx, curidx2;
385 int *in_int = (int *)&in[i];
386 uint8_t v_bits;
387 unsigned int v_codes;
388
389 qc1 = scaled[i ] * Q34 + 0.4054f;
390 qc2 = scaled[i+1] * Q34 + 0.4054f;
391 qc3 = scaled[i+2] * Q34 + 0.4054f;
392 qc4 = scaled[i+3] * Q34 + 0.4054f;
393
394 __asm__ volatile (
395 ".set push \n\t"
396 ".set noreorder \n\t"
397
398 "ori $t4, $zero, 4 \n\t"
399 "slt $t0, $t4, %[qc1] \n\t"
400 "slt $t1, $t4, %[qc2] \n\t"
401 "slt $t2, $t4, %[qc3] \n\t"
402 "slt $t3, $t4, %[qc4] \n\t"
403 "movn %[qc1], $t4, $t0 \n\t"
404 "movn %[qc2], $t4, $t1 \n\t"
405 "movn %[qc3], $t4, $t2 \n\t"
406 "movn %[qc4], $t4, $t3 \n\t"
407 "lw $t0, 0(%[in_int]) \n\t"
408 "lw $t1, 4(%[in_int]) \n\t"
409 "lw $t2, 8(%[in_int]) \n\t"
410 "lw $t3, 12(%[in_int]) \n\t"
411 "srl $t0, $t0, 31 \n\t"
412 "srl $t1, $t1, 31 \n\t"
413 "srl $t2, $t2, 31 \n\t"
414 "srl $t3, $t3, 31 \n\t"
415 "subu $t4, $zero, %[qc1] \n\t"
416 "subu $t5, $zero, %[qc2] \n\t"
417 "subu $t6, $zero, %[qc3] \n\t"
418 "subu $t7, $zero, %[qc4] \n\t"
419 "movn %[qc1], $t4, $t0 \n\t"
420 "movn %[qc2], $t5, $t1 \n\t"
421 "movn %[qc3], $t6, $t2 \n\t"
422 "movn %[qc4], $t7, $t3 \n\t"
423
424 ".set pop \n\t"
425
426 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
427 [qc3]"+r"(qc3), [qc4]"+r"(qc4)
428 : [in_int]"r"(in_int)
429 : "t0", "t1", "t2", "t3",
430 "t4", "t5", "t6", "t7",
431 "memory"
432 );
433
434 curidx = 9 * qc1;
435 curidx += qc2 + 40;
436
437 curidx2 = 9 * qc3;
438 curidx2 += qc4 + 40;
439
440 v_codes = (p_codes[curidx] << p_bits[curidx2]) | (p_codes[curidx2]);
441 v_bits = p_bits[curidx] + p_bits[curidx2];
442 put_bits(pb, v_bits, v_codes);
443 }
444}
445
446static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s,
447 PutBitContext *pb, const float *in,
448 const float *scaled, int size, int scale_idx,
449 int cb, const float lambda, const float uplim,
450 int *bits)
451{
452 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
453 int i;
454 int qc1, qc2, qc3, qc4;
455
456 uint8_t *p_bits = (uint8_t*) ff_aac_spectral_bits[cb-1];
457 uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
458
459 abs_pow34_v(s->scoefs, in, size);
460 scaled = s->scoefs;
461 for (i = 0; i < size; i += 4) {
462 int curidx, sign1, count1, sign2, count2;
463 int *in_int = (int *)&in[i];
464 uint8_t v_bits;
465 unsigned int v_codes;
466
467 qc1 = scaled[i ] * Q34 + 0.4054f;
468 qc2 = scaled[i+1] * Q34 + 0.4054f;
469 qc3 = scaled[i+2] * Q34 + 0.4054f;
470 qc4 = scaled[i+3] * Q34 + 0.4054f;
471
472 __asm__ volatile (
473 ".set push \n\t"
474 ".set noreorder \n\t"
475
476 "ori $t4, $zero, 7 \n\t"
477 "ori %[sign1], $zero, 0 \n\t"
478 "ori %[sign2], $zero, 0 \n\t"
479 "slt $t0, $t4, %[qc1] \n\t"
480 "slt $t1, $t4, %[qc2] \n\t"
481 "slt $t2, $t4, %[qc3] \n\t"
482 "slt $t3, $t4, %[qc4] \n\t"
483 "movn %[qc1], $t4, $t0 \n\t"
484 "movn %[qc2], $t4, $t1 \n\t"
485 "movn %[qc3], $t4, $t2 \n\t"
486 "movn %[qc4], $t4, $t3 \n\t"
487 "lw $t0, 0(%[in_int]) \n\t"
488 "lw $t1, 4(%[in_int]) \n\t"
489 "lw $t2, 8(%[in_int]) \n\t"
490 "lw $t3, 12(%[in_int]) \n\t"
491 "slt $t0, $t0, $zero \n\t"
492 "movn %[sign1], $t0, %[qc1] \n\t"
493 "slt $t2, $t2, $zero \n\t"
494 "movn %[sign2], $t2, %[qc3] \n\t"
495 "slt $t1, $t1, $zero \n\t"
496 "sll $t0, %[sign1], 1 \n\t"
497 "or $t0, $t0, $t1 \n\t"
498 "movn %[sign1], $t0, %[qc2] \n\t"
499 "slt $t3, $t3, $zero \n\t"
500 "sll $t0, %[sign2], 1 \n\t"
501 "or $t0, $t0, $t3 \n\t"
502 "movn %[sign2], $t0, %[qc4] \n\t"
503 "slt %[count1], $zero, %[qc1] \n\t"
504 "slt $t1, $zero, %[qc2] \n\t"
505 "slt %[count2], $zero, %[qc3] \n\t"
506 "slt $t2, $zero, %[qc4] \n\t"
507 "addu %[count1], %[count1], $t1 \n\t"
508 "addu %[count2], %[count2], $t2 \n\t"
509
510 ".set pop \n\t"
511
512 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
513 [qc3]"+r"(qc3), [qc4]"+r"(qc4),
514 [sign1]"=&r"(sign1), [count1]"=&r"(count1),
515 [sign2]"=&r"(sign2), [count2]"=&r"(count2)
516 : [in_int]"r"(in_int)
517 : "t0", "t1", "t2", "t3", "t4",
518 "memory"
519 );
520
521 curidx = 8 * qc1;
522 curidx += qc2;
523
524 v_codes = (p_codes[curidx] << count1) | sign1;
525 v_bits = p_bits[curidx] + count1;
526 put_bits(pb, v_bits, v_codes);
527
528 curidx = 8 * qc3;
529 curidx += qc4;
530
531 v_codes = (p_codes[curidx] << count2) | sign2;
532 v_bits = p_bits[curidx] + count2;
533 put_bits(pb, v_bits, v_codes);
534 }
535}
536
537static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
538 PutBitContext *pb, const float *in,
539 const float *scaled, int size, int scale_idx,
540 int cb, const float lambda, const float uplim,
541 int *bits)
542{
543 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
544 int i;
545 int qc1, qc2, qc3, qc4;
546
547 uint8_t *p_bits = (uint8_t*) ff_aac_spectral_bits[cb-1];
548 uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
549
550 abs_pow34_v(s->scoefs, in, size);
551 scaled = s->scoefs;
552 for (i = 0; i < size; i += 4) {
553 int curidx, sign1, count1, sign2, count2;
554 int *in_int = (int *)&in[i];
555 uint8_t v_bits;
556 unsigned int v_codes;
557
558 qc1 = scaled[i ] * Q34 + 0.4054f;
559 qc2 = scaled[i+1] * Q34 + 0.4054f;
560 qc3 = scaled[i+2] * Q34 + 0.4054f;
561 qc4 = scaled[i+3] * Q34 + 0.4054f;
562
563 __asm__ volatile (
564 ".set push \n\t"
565 ".set noreorder \n\t"
566
567 "ori $t4, $zero, 12 \n\t"
568 "ori %[sign1], $zero, 0 \n\t"
569 "ori %[sign2], $zero, 0 \n\t"
570 "slt $t0, $t4, %[qc1] \n\t"
571 "slt $t1, $t4, %[qc2] \n\t"
572 "slt $t2, $t4, %[qc3] \n\t"
573 "slt $t3, $t4, %[qc4] \n\t"
574 "movn %[qc1], $t4, $t0 \n\t"
575 "movn %[qc2], $t4, $t1 \n\t"
576 "movn %[qc3], $t4, $t2 \n\t"
577 "movn %[qc4], $t4, $t3 \n\t"
578 "lw $t0, 0(%[in_int]) \n\t"
579 "lw $t1, 4(%[in_int]) \n\t"
580 "lw $t2, 8(%[in_int]) \n\t"
581 "lw $t3, 12(%[in_int]) \n\t"
582 "slt $t0, $t0, $zero \n\t"
583 "movn %[sign1], $t0, %[qc1] \n\t"
584 "slt $t2, $t2, $zero \n\t"
585 "movn %[sign2], $t2, %[qc3] \n\t"
586 "slt $t1, $t1, $zero \n\t"
587 "sll $t0, %[sign1], 1 \n\t"
588 "or $t0, $t0, $t1 \n\t"
589 "movn %[sign1], $t0, %[qc2] \n\t"
590 "slt $t3, $t3, $zero \n\t"
591 "sll $t0, %[sign2], 1 \n\t"
592 "or $t0, $t0, $t3 \n\t"
593 "movn %[sign2], $t0, %[qc4] \n\t"
594 "slt %[count1], $zero, %[qc1] \n\t"
595 "slt $t1, $zero, %[qc2] \n\t"
596 "slt %[count2], $zero, %[qc3] \n\t"
597 "slt $t2, $zero, %[qc4] \n\t"
598 "addu %[count1], %[count1], $t1 \n\t"
599 "addu %[count2], %[count2], $t2 \n\t"
600
601 ".set pop \n\t"
602
603 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
604 [qc3]"+r"(qc3), [qc4]"+r"(qc4),
605 [sign1]"=&r"(sign1), [count1]"=&r"(count1),
606 [sign2]"=&r"(sign2), [count2]"=&r"(count2)
607 : [in_int]"r"(in_int)
608 : "t0", "t1", "t2", "t3", "t4",
609 "memory"
610 );
611
612 curidx = 13 * qc1;
613 curidx += qc2;
614
615 v_codes = (p_codes[curidx] << count1) | sign1;
616 v_bits = p_bits[curidx] + count1;
617 put_bits(pb, v_bits, v_codes);
618
619 curidx = 13 * qc3;
620 curidx += qc4;
621
622 v_codes = (p_codes[curidx] << count2) | sign2;
623 v_bits = p_bits[curidx] + count2;
624 put_bits(pb, v_bits, v_codes);
625 }
626}
627
628static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
629 PutBitContext *pb, const float *in,
630 const float *scaled, int size, int scale_idx,
631 int cb, const float lambda, const float uplim,
632 int *bits)
633{
634 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
635 int i;
636 int qc1, qc2, qc3, qc4;
637
638 uint8_t *p_bits = (uint8_t* )ff_aac_spectral_bits[cb-1];
639 uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
640 float *p_vectors = (float* )ff_aac_codebook_vectors[cb-1];
641
642 abs_pow34_v(s->scoefs, in, size);
643 scaled = s->scoefs;
644
645 if (cb < 11) {
646 for (i = 0; i < size; i += 4) {
647 int curidx, curidx2, sign1, count1, sign2, count2;
648 int *in_int = (int *)&in[i];
649 uint8_t v_bits;
650 unsigned int v_codes;
651
652 qc1 = scaled[i ] * Q34 + 0.4054f;
653 qc2 = scaled[i+1] * Q34 + 0.4054f;
654 qc3 = scaled[i+2] * Q34 + 0.4054f;
655 qc4 = scaled[i+3] * Q34 + 0.4054f;
656
657 __asm__ volatile (
658 ".set push \n\t"
659 ".set noreorder \n\t"
660
661 "ori $t4, $zero, 16 \n\t"
662 "ori %[sign1], $zero, 0 \n\t"
663 "ori %[sign2], $zero, 0 \n\t"
664 "slt $t0, $t4, %[qc1] \n\t"
665 "slt $t1, $t4, %[qc2] \n\t"
666 "slt $t2, $t4, %[qc3] \n\t"
667 "slt $t3, $t4, %[qc4] \n\t"
668 "movn %[qc1], $t4, $t0 \n\t"
669 "movn %[qc2], $t4, $t1 \n\t"
670 "movn %[qc3], $t4, $t2 \n\t"
671 "movn %[qc4], $t4, $t3 \n\t"
672 "lw $t0, 0(%[in_int]) \n\t"
673 "lw $t1, 4(%[in_int]) \n\t"
674 "lw $t2, 8(%[in_int]) \n\t"
675 "lw $t3, 12(%[in_int]) \n\t"
676 "slt $t0, $t0, $zero \n\t"
677 "movn %[sign1], $t0, %[qc1] \n\t"
678 "slt $t2, $t2, $zero \n\t"
679 "movn %[sign2], $t2, %[qc3] \n\t"
680 "slt $t1, $t1, $zero \n\t"
681 "sll $t0, %[sign1], 1 \n\t"
682 "or $t0, $t0, $t1 \n\t"
683 "movn %[sign1], $t0, %[qc2] \n\t"
684 "slt $t3, $t3, $zero \n\t"
685 "sll $t0, %[sign2], 1 \n\t"
686 "or $t0, $t0, $t3 \n\t"
687 "movn %[sign2], $t0, %[qc4] \n\t"
688 "slt %[count1], $zero, %[qc1] \n\t"
689 "slt $t1, $zero, %[qc2] \n\t"
690 "slt %[count2], $zero, %[qc3] \n\t"
691 "slt $t2, $zero, %[qc4] \n\t"
692 "addu %[count1], %[count1], $t1 \n\t"
693 "addu %[count2], %[count2], $t2 \n\t"
694
695 ".set pop \n\t"
696
697 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
698 [qc3]"+r"(qc3), [qc4]"+r"(qc4),
699 [sign1]"=&r"(sign1), [count1]"=&r"(count1),
700 [sign2]"=&r"(sign2), [count2]"=&r"(count2)
701 : [in_int]"r"(in_int)
702 : "t0", "t1", "t2", "t3", "t4",
703 "memory"
704 );
705
706 curidx = 17 * qc1;
707 curidx += qc2;
708 curidx2 = 17 * qc3;
709 curidx2 += qc4;
710
711 v_codes = (p_codes[curidx] << count1) | sign1;
712 v_bits = p_bits[curidx] + count1;
713 put_bits(pb, v_bits, v_codes);
714
715 v_codes = (p_codes[curidx2] << count2) | sign2;
716 v_bits = p_bits[curidx2] + count2;
717 put_bits(pb, v_bits, v_codes);
718 }
719 } else {
720 for (i = 0; i < size; i += 4) {
721 int curidx, curidx2, sign1, count1, sign2, count2;
722 int *in_int = (int *)&in[i];
723 uint8_t v_bits;
724 unsigned int v_codes;
725 int c1, c2, c3, c4;
726
727 qc1 = scaled[i ] * Q34 + 0.4054f;
728 qc2 = scaled[i+1] * Q34 + 0.4054f;
729 qc3 = scaled[i+2] * Q34 + 0.4054f;
730 qc4 = scaled[i+3] * Q34 + 0.4054f;
731
732 __asm__ volatile (
733 ".set push \n\t"
734 ".set noreorder \n\t"
735
736 "ori $t4, $zero, 16 \n\t"
737 "ori %[sign1], $zero, 0 \n\t"
738 "ori %[sign2], $zero, 0 \n\t"
739 "shll_s.w %[c1], %[qc1], 18 \n\t"
740 "shll_s.w %[c2], %[qc2], 18 \n\t"
741 "shll_s.w %[c3], %[qc3], 18 \n\t"
742 "shll_s.w %[c4], %[qc4], 18 \n\t"
743 "srl %[c1], %[c1], 18 \n\t"
744 "srl %[c2], %[c2], 18 \n\t"
745 "srl %[c3], %[c3], 18 \n\t"
746 "srl %[c4], %[c4], 18 \n\t"
747 "slt $t0, $t4, %[qc1] \n\t"
748 "slt $t1, $t4, %[qc2] \n\t"
749 "slt $t2, $t4, %[qc3] \n\t"
750 "slt $t3, $t4, %[qc4] \n\t"
751 "movn %[qc1], $t4, $t0 \n\t"
752 "movn %[qc2], $t4, $t1 \n\t"
753 "movn %[qc3], $t4, $t2 \n\t"
754 "movn %[qc4], $t4, $t3 \n\t"
755 "lw $t0, 0(%[in_int]) \n\t"
756 "lw $t1, 4(%[in_int]) \n\t"
757 "lw $t2, 8(%[in_int]) \n\t"
758 "lw $t3, 12(%[in_int]) \n\t"
759 "slt $t0, $t0, $zero \n\t"
760 "movn %[sign1], $t0, %[qc1] \n\t"
761 "slt $t2, $t2, $zero \n\t"
762 "movn %[sign2], $t2, %[qc3] \n\t"
763 "slt $t1, $t1, $zero \n\t"
764 "sll $t0, %[sign1], 1 \n\t"
765 "or $t0, $t0, $t1 \n\t"
766 "movn %[sign1], $t0, %[qc2] \n\t"
767 "slt $t3, $t3, $zero \n\t"
768 "sll $t0, %[sign2], 1 \n\t"
769 "or $t0, $t0, $t3 \n\t"
770 "movn %[sign2], $t0, %[qc4] \n\t"
771 "slt %[count1], $zero, %[qc1] \n\t"
772 "slt $t1, $zero, %[qc2] \n\t"
773 "slt %[count2], $zero, %[qc3] \n\t"
774 "slt $t2, $zero, %[qc4] \n\t"
775 "addu %[count1], %[count1], $t1 \n\t"
776 "addu %[count2], %[count2], $t2 \n\t"
777
778 ".set pop \n\t"
779
780 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
781 [qc3]"+r"(qc3), [qc4]"+r"(qc4),
782 [sign1]"=&r"(sign1), [count1]"=&r"(count1),
783 [sign2]"=&r"(sign2), [count2]"=&r"(count2),
784 [c1]"=&r"(c1), [c2]"=&r"(c2),
785 [c3]"=&r"(c3), [c4]"=&r"(c4)
786 : [in_int]"r"(in_int)
787 : "t0", "t1", "t2", "t3", "t4",
788 "memory"
789 );
790
791 curidx = 17 * qc1;
792 curidx += qc2;
793
794 curidx2 = 17 * qc3;
795 curidx2 += qc4;
796
797 v_codes = (p_codes[curidx] << count1) | sign1;
798 v_bits = p_bits[curidx] + count1;
799 put_bits(pb, v_bits, v_codes);
800
801 if (p_vectors[curidx*2 ] == 64.0f) {
802 int len = av_log2(c1);
803 v_codes = (((1 << (len - 3)) - 2) << len) | (c1 & ((1 << len) - 1));
804 put_bits(pb, len * 2 - 3, v_codes);
805 }
806 if (p_vectors[curidx*2+1] == 64.0f) {
807 int len = av_log2(c2);
808 v_codes = (((1 << (len - 3)) - 2) << len) | (c2 & ((1 << len) - 1));
809 put_bits(pb, len*2-3, v_codes);
810 }
811
812 v_codes = (p_codes[curidx2] << count2) | sign2;
813 v_bits = p_bits[curidx2] + count2;
814 put_bits(pb, v_bits, v_codes);
815
816 if (p_vectors[curidx2*2 ] == 64.0f) {
817 int len = av_log2(c3);
818 v_codes = (((1 << (len - 3)) - 2) << len) | (c3 & ((1 << len) - 1));
819 put_bits(pb, len* 2 - 3, v_codes);
820 }
821 if (p_vectors[curidx2*2+1] == 64.0f) {
822 int len = av_log2(c4);
823 v_codes = (((1 << (len - 3)) - 2) << len) | (c4 & ((1 << len) - 1));
824 put_bits(pb, len * 2 - 3, v_codes);
825 }
826 }
827 }
828}
829
830static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s,
831 PutBitContext *pb, const float *in,
832 const float *scaled, int size, int scale_idx,
833 int cb, const float lambda, const float uplim,
834 int *bits) = {
835 NULL,
836 quantize_and_encode_band_cost_SQUAD_mips,
837 quantize_and_encode_band_cost_SQUAD_mips,
838 quantize_and_encode_band_cost_UQUAD_mips,
839 quantize_and_encode_band_cost_UQUAD_mips,
840 quantize_and_encode_band_cost_SPAIR_mips,
841 quantize_and_encode_band_cost_SPAIR_mips,
842 quantize_and_encode_band_cost_UPAIR7_mips,
843 quantize_and_encode_band_cost_UPAIR7_mips,
844 quantize_and_encode_band_cost_UPAIR12_mips,
845 quantize_and_encode_band_cost_UPAIR12_mips,
846 quantize_and_encode_band_cost_ESC_mips,
847};
848
849#define quantize_and_encode_band_cost( \
850 s, pb, in, scaled, size, scale_idx, cb, \
851 lambda, uplim, bits) \
852 quantize_and_encode_band_cost_arr[cb]( \
853 s, pb, in, scaled, size, scale_idx, cb, \
854 lambda, uplim, bits)
855
856static void quantize_and_encode_band_mips(struct AACEncContext *s, PutBitContext *pb,
857 const float *in, int size, int scale_idx,
858 int cb, const float lambda)
859{
860 quantize_and_encode_band_cost(s, pb, in, NULL, size, scale_idx, cb, lambda,
861 INFINITY, NULL);
862}
863
864/**
865 * Functions developed from template function and optimized for getting the number of bits
866 */
867static float get_band_numbits_ZERO_mips(struct AACEncContext *s,
868 PutBitContext *pb, const float *in,
869 const float *scaled, int size, int scale_idx,
870 int cb, const float lambda, const float uplim,
871 int *bits)
872{
873 return 0;
874}
875
876static float get_band_numbits_SQUAD_mips(struct AACEncContext *s,
877 PutBitContext *pb, const float *in,
878 const float *scaled, int size, int scale_idx,
879 int cb, const float lambda, const float uplim,
880 int *bits)
881{
882 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
883 int i;
884 int qc1, qc2, qc3, qc4;
885 int curbits = 0;
886
887 uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
888
889 for (i = 0; i < size; i += 4) {
890 int curidx;
891 int *in_int = (int *)&in[i];
892
893 qc1 = scaled[i ] * Q34 + 0.4054f;
894 qc2 = scaled[i+1] * Q34 + 0.4054f;
895 qc3 = scaled[i+2] * Q34 + 0.4054f;
896 qc4 = scaled[i+3] * Q34 + 0.4054f;
897
898 __asm__ volatile (
899 ".set push \n\t"
900 ".set noreorder \n\t"
901
902 "slt %[qc1], $zero, %[qc1] \n\t"
903 "slt %[qc2], $zero, %[qc2] \n\t"
904 "slt %[qc3], $zero, %[qc3] \n\t"
905 "slt %[qc4], $zero, %[qc4] \n\t"
906 "lw $t0, 0(%[in_int]) \n\t"
907 "lw $t1, 4(%[in_int]) \n\t"
908 "lw $t2, 8(%[in_int]) \n\t"
909 "lw $t3, 12(%[in_int]) \n\t"
910 "srl $t0, $t0, 31 \n\t"
911 "srl $t1, $t1, 31 \n\t"
912 "srl $t2, $t2, 31 \n\t"
913 "srl $t3, $t3, 31 \n\t"
914 "subu $t4, $zero, %[qc1] \n\t"
915 "subu $t5, $zero, %[qc2] \n\t"
916 "subu $t6, $zero, %[qc3] \n\t"
917 "subu $t7, $zero, %[qc4] \n\t"
918 "movn %[qc1], $t4, $t0 \n\t"
919 "movn %[qc2], $t5, $t1 \n\t"
920 "movn %[qc3], $t6, $t2 \n\t"
921 "movn %[qc4], $t7, $t3 \n\t"
922
923 ".set pop \n\t"
924
925 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
926 [qc3]"+r"(qc3), [qc4]"+r"(qc4)
927 : [in_int]"r"(in_int)
928 : "t0", "t1", "t2", "t3",
929 "t4", "t5", "t6", "t7",
930 "memory"
931 );
932
933 curidx = qc1;
934 curidx *= 3;
935 curidx += qc2;
936 curidx *= 3;
937 curidx += qc3;
938 curidx *= 3;
939 curidx += qc4;
940 curidx += 40;
941
942 curbits += p_bits[curidx];
943 }
944 return curbits;
945}
946
947static float get_band_numbits_UQUAD_mips(struct AACEncContext *s,
948 PutBitContext *pb, const float *in,
949 const float *scaled, int size, int scale_idx,
950 int cb, const float lambda, const float uplim,
951 int *bits)
952{
953 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
954 int i;
955 int curbits = 0;
956 int qc1, qc2, qc3, qc4;
957
958 uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
959
960 for (i = 0; i < size; i += 4) {
961 int curidx;
962
963 qc1 = scaled[i ] * Q34 + 0.4054f;
964 qc2 = scaled[i+1] * Q34 + 0.4054f;
965 qc3 = scaled[i+2] * Q34 + 0.4054f;
966 qc4 = scaled[i+3] * Q34 + 0.4054f;
967
968 __asm__ volatile (
969 ".set push \n\t"
970 ".set noreorder \n\t"
971
972 "ori $t4, $zero, 2 \n\t"
973 "slt $t0, $t4, %[qc1] \n\t"
974 "slt $t1, $t4, %[qc2] \n\t"
975 "slt $t2, $t4, %[qc3] \n\t"
976 "slt $t3, $t4, %[qc4] \n\t"
977 "movn %[qc1], $t4, $t0 \n\t"
978 "movn %[qc2], $t4, $t1 \n\t"
979 "movn %[qc3], $t4, $t2 \n\t"
980 "movn %[qc4], $t4, $t3 \n\t"
981
982 ".set pop \n\t"
983
984 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
985 [qc3]"+r"(qc3), [qc4]"+r"(qc4)
986 :
987 : "t0", "t1", "t2", "t3", "t4"
988 );
989
990 curidx = qc1;
991 curidx *= 3;
992 curidx += qc2;
993 curidx *= 3;
994 curidx += qc3;
995 curidx *= 3;
996 curidx += qc4;
997
998 curbits += p_bits[curidx];
999 curbits += uquad_sign_bits[curidx];
1000 }
1001 return curbits;
1002}
1003
1004static float get_band_numbits_SPAIR_mips(struct AACEncContext *s,
1005 PutBitContext *pb, const float *in,
1006 const float *scaled, int size, int scale_idx,
1007 int cb, const float lambda, const float uplim,
1008 int *bits)
1009{
1010 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1011 int i;
1012 int qc1, qc2, qc3, qc4;
1013 int curbits = 0;
1014
1015 uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1016
1017 for (i = 0; i < size; i += 4) {
1018 int curidx, curidx2;
1019 int *in_int = (int *)&in[i];
1020
1021 qc1 = scaled[i ] * Q34 + 0.4054f;
1022 qc2 = scaled[i+1] * Q34 + 0.4054f;
1023 qc3 = scaled[i+2] * Q34 + 0.4054f;
1024 qc4 = scaled[i+3] * Q34 + 0.4054f;
1025
1026 __asm__ volatile (
1027 ".set push \n\t"
1028 ".set noreorder \n\t"
1029
1030 "ori $t4, $zero, 4 \n\t"
1031 "slt $t0, $t4, %[qc1] \n\t"
1032 "slt $t1, $t4, %[qc2] \n\t"
1033 "slt $t2, $t4, %[qc3] \n\t"
1034 "slt $t3, $t4, %[qc4] \n\t"
1035 "movn %[qc1], $t4, $t0 \n\t"
1036 "movn %[qc2], $t4, $t1 \n\t"
1037 "movn %[qc3], $t4, $t2 \n\t"
1038 "movn %[qc4], $t4, $t3 \n\t"
1039 "lw $t0, 0(%[in_int]) \n\t"
1040 "lw $t1, 4(%[in_int]) \n\t"
1041 "lw $t2, 8(%[in_int]) \n\t"
1042 "lw $t3, 12(%[in_int]) \n\t"
1043 "srl $t0, $t0, 31 \n\t"
1044 "srl $t1, $t1, 31 \n\t"
1045 "srl $t2, $t2, 31 \n\t"
1046 "srl $t3, $t3, 31 \n\t"
1047 "subu $t4, $zero, %[qc1] \n\t"
1048 "subu $t5, $zero, %[qc2] \n\t"
1049 "subu $t6, $zero, %[qc3] \n\t"
1050 "subu $t7, $zero, %[qc4] \n\t"
1051 "movn %[qc1], $t4, $t0 \n\t"
1052 "movn %[qc2], $t5, $t1 \n\t"
1053 "movn %[qc3], $t6, $t2 \n\t"
1054 "movn %[qc4], $t7, $t3 \n\t"
1055
1056 ".set pop \n\t"
1057
1058 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1059 [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1060 : [in_int]"r"(in_int)
1061 : "t0", "t1", "t2", "t3",
1062 "t4", "t5", "t6", "t7",
1063 "memory"
1064 );
1065
1066 curidx = 9 * qc1;
1067 curidx += qc2 + 40;
1068
1069 curidx2 = 9 * qc3;
1070 curidx2 += qc4 + 40;
1071
1072 curbits += p_bits[curidx] + p_bits[curidx2];
1073 }
1074 return curbits;
1075}
1076
1077static float get_band_numbits_UPAIR7_mips(struct AACEncContext *s,
1078 PutBitContext *pb, const float *in,
1079 const float *scaled, int size, int scale_idx,
1080 int cb, const float lambda, const float uplim,
1081 int *bits)
1082{
1083 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1084 int i;
1085 int qc1, qc2, qc3, qc4;
1086 int curbits = 0;
1087
1088 uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1089
1090 for (i = 0; i < size; i += 4) {
1091 int curidx, curidx2;
1092
1093 qc1 = scaled[i ] * Q34 + 0.4054f;
1094 qc2 = scaled[i+1] * Q34 + 0.4054f;
1095 qc3 = scaled[i+2] * Q34 + 0.4054f;
1096 qc4 = scaled[i+3] * Q34 + 0.4054f;
1097
1098 __asm__ volatile (
1099 ".set push \n\t"
1100 ".set noreorder \n\t"
1101
1102 "ori $t4, $zero, 7 \n\t"
1103 "slt $t0, $t4, %[qc1] \n\t"
1104 "slt $t1, $t4, %[qc2] \n\t"
1105 "slt $t2, $t4, %[qc3] \n\t"
1106 "slt $t3, $t4, %[qc4] \n\t"
1107 "movn %[qc1], $t4, $t0 \n\t"
1108 "movn %[qc2], $t4, $t1 \n\t"
1109 "movn %[qc3], $t4, $t2 \n\t"
1110 "movn %[qc4], $t4, $t3 \n\t"
1111
1112 ".set pop \n\t"
1113
1114 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1115 [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1116 :
1117 : "t0", "t1", "t2", "t3", "t4"
1118 );
1119
1120 curidx = 8 * qc1;
1121 curidx += qc2;
1122
1123 curidx2 = 8 * qc3;
1124 curidx2 += qc4;
1125
1126 curbits += p_bits[curidx] +
1127 upair7_sign_bits[curidx] +
1128 p_bits[curidx2] +
1129 upair7_sign_bits[curidx2];
1130 }
1131 return curbits;
1132}
1133
1134static float get_band_numbits_UPAIR12_mips(struct AACEncContext *s,
1135 PutBitContext *pb, const float *in,
1136 const float *scaled, int size, int scale_idx,
1137 int cb, const float lambda, const float uplim,
1138 int *bits)
1139{
1140 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1141 int i;
1142 int qc1, qc2, qc3, qc4;
1143 int curbits = 0;
1144
1145 uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1146
1147 for (i = 0; i < size; i += 4) {
1148 int curidx, curidx2;
1149
1150 qc1 = scaled[i ] * Q34 + 0.4054f;
1151 qc2 = scaled[i+1] * Q34 + 0.4054f;
1152 qc3 = scaled[i+2] * Q34 + 0.4054f;
1153 qc4 = scaled[i+3] * Q34 + 0.4054f;
1154
1155 __asm__ volatile (
1156 ".set push \n\t"
1157 ".set noreorder \n\t"
1158
1159 "ori $t4, $zero, 12 \n\t"
1160 "slt $t0, $t4, %[qc1] \n\t"
1161 "slt $t1, $t4, %[qc2] \n\t"
1162 "slt $t2, $t4, %[qc3] \n\t"
1163 "slt $t3, $t4, %[qc4] \n\t"
1164 "movn %[qc1], $t4, $t0 \n\t"
1165 "movn %[qc2], $t4, $t1 \n\t"
1166 "movn %[qc3], $t4, $t2 \n\t"
1167 "movn %[qc4], $t4, $t3 \n\t"
1168
1169 ".set pop \n\t"
1170
1171 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1172 [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1173 :
1174 : "t0", "t1", "t2", "t3", "t4"
1175 );
1176
1177 curidx = 13 * qc1;
1178 curidx += qc2;
1179
1180 curidx2 = 13 * qc3;
1181 curidx2 += qc4;
1182
1183 curbits += p_bits[curidx] +
1184 p_bits[curidx2] +
1185 upair12_sign_bits[curidx] +
1186 upair12_sign_bits[curidx2];
1187 }
1188 return curbits;
1189}
1190
1191static float get_band_numbits_ESC_mips(struct AACEncContext *s,
1192 PutBitContext *pb, const float *in,
1193 const float *scaled, int size, int scale_idx,
1194 int cb, const float lambda, const float uplim,
1195 int *bits)
1196{
1197 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1198 int i;
1199 int qc1, qc2, qc3, qc4;
1200 int curbits = 0;
1201
1202 uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1203
1204 for (i = 0; i < size; i += 4) {
1205 int curidx, curidx2;
1206 int cond0, cond1, cond2, cond3;
1207 int c1, c2, c3, c4;
1208
1209 qc1 = scaled[i ] * Q34 + 0.4054f;
1210 qc2 = scaled[i+1] * Q34 + 0.4054f;
1211 qc3 = scaled[i+2] * Q34 + 0.4054f;
1212 qc4 = scaled[i+3] * Q34 + 0.4054f;
1213
1214 __asm__ volatile (
1215 ".set push \n\t"
1216 ".set noreorder \n\t"
1217
1218 "ori $t4, $zero, 15 \n\t"
1219 "ori $t5, $zero, 16 \n\t"
1220 "shll_s.w %[c1], %[qc1], 18 \n\t"
1221 "shll_s.w %[c2], %[qc2], 18 \n\t"
1222 "shll_s.w %[c3], %[qc3], 18 \n\t"
1223 "shll_s.w %[c4], %[qc4], 18 \n\t"
1224 "srl %[c1], %[c1], 18 \n\t"
1225 "srl %[c2], %[c2], 18 \n\t"
1226 "srl %[c3], %[c3], 18 \n\t"
1227 "srl %[c4], %[c4], 18 \n\t"
1228 "slt %[cond0], $t4, %[qc1] \n\t"
1229 "slt %[cond1], $t4, %[qc2] \n\t"
1230 "slt %[cond2], $t4, %[qc3] \n\t"
1231 "slt %[cond3], $t4, %[qc4] \n\t"
1232 "movn %[qc1], $t5, %[cond0] \n\t"
1233 "movn %[qc2], $t5, %[cond1] \n\t"
1234 "movn %[qc3], $t5, %[cond2] \n\t"
1235 "movn %[qc4], $t5, %[cond3] \n\t"
1236 "ori $t5, $zero, 31 \n\t"
1237 "clz %[c1], %[c1] \n\t"
1238 "clz %[c2], %[c2] \n\t"
1239 "clz %[c3], %[c3] \n\t"
1240 "clz %[c4], %[c4] \n\t"
1241 "subu %[c1], $t5, %[c1] \n\t"
1242 "subu %[c2], $t5, %[c2] \n\t"
1243 "subu %[c3], $t5, %[c3] \n\t"
1244 "subu %[c4], $t5, %[c4] \n\t"
1245 "sll %[c1], %[c1], 1 \n\t"
1246 "sll %[c2], %[c2], 1 \n\t"
1247 "sll %[c3], %[c3], 1 \n\t"
1248 "sll %[c4], %[c4], 1 \n\t"
1249 "addiu %[c1], %[c1], -3 \n\t"
1250 "addiu %[c2], %[c2], -3 \n\t"
1251 "addiu %[c3], %[c3], -3 \n\t"
1252 "addiu %[c4], %[c4], -3 \n\t"
1253 "subu %[cond0], $zero, %[cond0] \n\t"
1254 "subu %[cond1], $zero, %[cond1] \n\t"
1255 "subu %[cond2], $zero, %[cond2] \n\t"
1256 "subu %[cond3], $zero, %[cond3] \n\t"
1257 "and %[c1], %[c1], %[cond0] \n\t"
1258 "and %[c2], %[c2], %[cond1] \n\t"
1259 "and %[c3], %[c3], %[cond2] \n\t"
1260 "and %[c4], %[c4], %[cond3] \n\t"
1261
1262 ".set pop \n\t"
1263
1264 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1265 [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1266 [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
1267 [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
1268 [c1]"=&r"(c1), [c2]"=&r"(c2),
1269 [c3]"=&r"(c3), [c4]"=&r"(c4)
1270 :
1271 : "t4", "t5"
1272 );
1273
1274 curidx = 17 * qc1;
1275 curidx += qc2;
1276
1277 curidx2 = 17 * qc3;
1278 curidx2 += qc4;
1279
1280 curbits += p_bits[curidx];
1281 curbits += esc_sign_bits[curidx];
1282 curbits += p_bits[curidx2];
1283 curbits += esc_sign_bits[curidx2];
1284
1285 curbits += c1;
1286 curbits += c2;
1287 curbits += c3;
1288 curbits += c4;
1289 }
1290 return curbits;
1291}
1292
1293static float (*const get_band_numbits_arr[])(struct AACEncContext *s,
1294 PutBitContext *pb, const float *in,
1295 const float *scaled, int size, int scale_idx,
1296 int cb, const float lambda, const float uplim,
1297 int *bits) = {
1298 get_band_numbits_ZERO_mips,
1299 get_band_numbits_SQUAD_mips,
1300 get_band_numbits_SQUAD_mips,
1301 get_band_numbits_UQUAD_mips,
1302 get_band_numbits_UQUAD_mips,
1303 get_band_numbits_SPAIR_mips,
1304 get_band_numbits_SPAIR_mips,
1305 get_band_numbits_UPAIR7_mips,
1306 get_band_numbits_UPAIR7_mips,
1307 get_band_numbits_UPAIR12_mips,
1308 get_band_numbits_UPAIR12_mips,
1309 get_band_numbits_ESC_mips,
1310};
1311
/* Call the bit-counting routine that get_band_numbits_arr holds for cb. */
#define get_band_numbits(s, pb, in, scaled, size, scale_idx, cb,              \
                         lambda, uplim, bits)                                 \
    get_band_numbits_arr[cb](s, pb, in, scaled, size, scale_idx, cb,          \
                             lambda, uplim, bits)
1318
1319static float quantize_band_cost_bits(struct AACEncContext *s, const float *in,
1320 const float *scaled, int size, int scale_idx,
1321 int cb, const float lambda, const float uplim,
1322 int *bits)
1323{
1324 return get_band_numbits(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
1325}
1326
1327/**
1328 * Functions developed from template function and optimized for getting the band cost
1329 */
1330#if HAVE_MIPSFPU
1331static float get_band_cost_ZERO_mips(struct AACEncContext *s,
1332 PutBitContext *pb, const float *in,
1333 const float *scaled, int size, int scale_idx,
1334 int cb, const float lambda, const float uplim,
1335 int *bits)
1336{
1337 int i;
1338 float cost = 0;
1339
1340 for (i = 0; i < size; i += 4) {
1341 cost += in[i ] * in[i ];
1342 cost += in[i+1] * in[i+1];
1343 cost += in[i+2] * in[i+2];
1344 cost += in[i+3] * in[i+3];
1345 }
1346 if (bits)
1347 *bits = 0;
1348 return cost * lambda;
1349}
1350
1351static float get_band_cost_SQUAD_mips(struct AACEncContext *s,
1352 PutBitContext *pb, const float *in,
1353 const float *scaled, int size, int scale_idx,
1354 int cb, const float lambda, const float uplim,
1355 int *bits)
1356{
1357 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1358 const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1359 int i;
1360 float cost = 0;
1361 int qc1, qc2, qc3, qc4;
1362 int curbits = 0;
1363
1364 uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1365 float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
1366
1367 for (i = 0; i < size; i += 4) {
1368 const float *vec;
1369 int curidx;
1370 int *in_int = (int *)&in[i];
1371 float *in_pos = (float *)&in[i];
1372 float di0, di1, di2, di3;
1373
1374 qc1 = scaled[i ] * Q34 + 0.4054f;
1375 qc2 = scaled[i+1] * Q34 + 0.4054f;
1376 qc3 = scaled[i+2] * Q34 + 0.4054f;
1377 qc4 = scaled[i+3] * Q34 + 0.4054f;
1378
1379 __asm__ volatile (
1380 ".set push \n\t"
1381 ".set noreorder \n\t"
1382
1383 "slt %[qc1], $zero, %[qc1] \n\t"
1384 "slt %[qc2], $zero, %[qc2] \n\t"
1385 "slt %[qc3], $zero, %[qc3] \n\t"
1386 "slt %[qc4], $zero, %[qc4] \n\t"
1387 "lw $t0, 0(%[in_int]) \n\t"
1388 "lw $t1, 4(%[in_int]) \n\t"
1389 "lw $t2, 8(%[in_int]) \n\t"
1390 "lw $t3, 12(%[in_int]) \n\t"
1391 "srl $t0, $t0, 31 \n\t"
1392 "srl $t1, $t1, 31 \n\t"
1393 "srl $t2, $t2, 31 \n\t"
1394 "srl $t3, $t3, 31 \n\t"
1395 "subu $t4, $zero, %[qc1] \n\t"
1396 "subu $t5, $zero, %[qc2] \n\t"
1397 "subu $t6, $zero, %[qc3] \n\t"
1398 "subu $t7, $zero, %[qc4] \n\t"
1399 "movn %[qc1], $t4, $t0 \n\t"
1400 "movn %[qc2], $t5, $t1 \n\t"
1401 "movn %[qc3], $t6, $t2 \n\t"
1402 "movn %[qc4], $t7, $t3 \n\t"
1403
1404 ".set pop \n\t"
1405
1406 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1407 [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1408 : [in_int]"r"(in_int)
1409 : "t0", "t1", "t2", "t3",
1410 "t4", "t5", "t6", "t7",
1411 "memory"
1412 );
1413
1414 curidx = qc1;
1415 curidx *= 3;
1416 curidx += qc2;
1417 curidx *= 3;
1418 curidx += qc3;
1419 curidx *= 3;
1420 curidx += qc4;
1421 curidx += 40;
1422
1423 curbits += p_bits[curidx];
1424 vec = &p_codes[curidx*4];
1425
1426 __asm__ volatile (
1427 ".set push \n\t"
1428 ".set noreorder \n\t"
1429
1430 "lwc1 $f0, 0(%[in_pos]) \n\t"
1431 "lwc1 $f1, 0(%[vec]) \n\t"
1432 "lwc1 $f2, 4(%[in_pos]) \n\t"
1433 "lwc1 $f3, 4(%[vec]) \n\t"
1434 "lwc1 $f4, 8(%[in_pos]) \n\t"
1435 "lwc1 $f5, 8(%[vec]) \n\t"
1436 "lwc1 $f6, 12(%[in_pos]) \n\t"
1437 "lwc1 $f7, 12(%[vec]) \n\t"
1438 "nmsub.s %[di0], $f0, $f1, %[IQ] \n\t"
1439 "nmsub.s %[di1], $f2, $f3, %[IQ] \n\t"
1440 "nmsub.s %[di2], $f4, $f5, %[IQ] \n\t"
1441 "nmsub.s %[di3], $f6, $f7, %[IQ] \n\t"
1442
1443 ".set pop \n\t"
1444
1445 : [di0]"=&f"(di0), [di1]"=&f"(di1),
1446 [di2]"=&f"(di2), [di3]"=&f"(di3)
1447 : [in_pos]"r"(in_pos), [vec]"r"(vec),
1448 [IQ]"f"(IQ)
1449 : "$f0", "$f1", "$f2", "$f3",
1450 "$f4", "$f5", "$f6", "$f7",
1451 "memory"
1452 );
1453
1454 cost += di0 * di0 + di1 * di1
1455 + di2 * di2 + di3 * di3;
1456 }
1457
1458 if (bits)
1459 *bits = curbits;
1460 return cost * lambda + curbits;
1461}
1462
1463static float get_band_cost_UQUAD_mips(struct AACEncContext *s,
1464 PutBitContext *pb, const float *in,
1465 const float *scaled, int size, int scale_idx,
1466 int cb, const float lambda, const float uplim,
1467 int *bits)
1468{
1469 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1470 const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1471 int i;
1472 float cost = 0;
1473 int curbits = 0;
1474 int qc1, qc2, qc3, qc4;
1475
1476 uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1477 float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
1478
1479 for (i = 0; i < size; i += 4) {
1480 const float *vec;
1481 int curidx;
1482 float *in_pos = (float *)&in[i];
1483 float di0, di1, di2, di3;
1484
1485 qc1 = scaled[i ] * Q34 + 0.4054f;
1486 qc2 = scaled[i+1] * Q34 + 0.4054f;
1487 qc3 = scaled[i+2] * Q34 + 0.4054f;
1488 qc4 = scaled[i+3] * Q34 + 0.4054f;
1489
1490 __asm__ volatile (
1491 ".set push \n\t"
1492 ".set noreorder \n\t"
1493
1494 "ori $t4, $zero, 2 \n\t"
1495 "slt $t0, $t4, %[qc1] \n\t"
1496 "slt $t1, $t4, %[qc2] \n\t"
1497 "slt $t2, $t4, %[qc3] \n\t"
1498 "slt $t3, $t4, %[qc4] \n\t"
1499 "movn %[qc1], $t4, $t0 \n\t"
1500 "movn %[qc2], $t4, $t1 \n\t"
1501 "movn %[qc3], $t4, $t2 \n\t"
1502 "movn %[qc4], $t4, $t3 \n\t"
1503
1504 ".set pop \n\t"
1505
1506 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1507 [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1508 :
1509 : "t0", "t1", "t2", "t3", "t4"
1510 );
1511
1512 curidx = qc1;
1513 curidx *= 3;
1514 curidx += qc2;
1515 curidx *= 3;
1516 curidx += qc3;
1517 curidx *= 3;
1518 curidx += qc4;
1519
1520 curbits += p_bits[curidx];
1521 curbits += uquad_sign_bits[curidx];
1522 vec = &p_codes[curidx*4];
1523
1524 __asm__ volatile (
1525 ".set push \n\t"
1526 ".set noreorder \n\t"
1527
1528 "lwc1 %[di0], 0(%[in_pos]) \n\t"
1529 "lwc1 %[di1], 4(%[in_pos]) \n\t"
1530 "lwc1 %[di2], 8(%[in_pos]) \n\t"
1531 "lwc1 %[di3], 12(%[in_pos]) \n\t"
1532 "abs.s %[di0], %[di0] \n\t"
1533 "abs.s %[di1], %[di1] \n\t"
1534 "abs.s %[di2], %[di2] \n\t"
1535 "abs.s %[di3], %[di3] \n\t"
1536 "lwc1 $f0, 0(%[vec]) \n\t"
1537 "lwc1 $f1, 4(%[vec]) \n\t"
1538 "lwc1 $f2, 8(%[vec]) \n\t"
1539 "lwc1 $f3, 12(%[vec]) \n\t"
1540 "nmsub.s %[di0], %[di0], $f0, %[IQ] \n\t"
1541 "nmsub.s %[di1], %[di1], $f1, %[IQ] \n\t"
1542 "nmsub.s %[di2], %[di2], $f2, %[IQ] \n\t"
1543 "nmsub.s %[di3], %[di3], $f3, %[IQ] \n\t"
1544
1545 ".set pop \n\t"
1546
1547 : [di0]"=&f"(di0), [di1]"=&f"(di1),
1548 [di2]"=&f"(di2), [di3]"=&f"(di3)
1549 : [in_pos]"r"(in_pos), [vec]"r"(vec),
1550 [IQ]"f"(IQ)
1551 : "$f0", "$f1", "$f2", "$f3",
1552 "memory"
1553 );
1554
1555 cost += di0 * di0 + di1 * di1
1556 + di2 * di2 + di3 * di3;
1557 }
1558
1559 if (bits)
1560 *bits = curbits;
1561 return cost * lambda + curbits;
1562}
1563
1564static float get_band_cost_SPAIR_mips(struct AACEncContext *s,
1565 PutBitContext *pb, const float *in,
1566 const float *scaled, int size, int scale_idx,
1567 int cb, const float lambda, const float uplim,
1568 int *bits)
1569{
1570 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1571 const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1572 int i;
1573 float cost = 0;
1574 int qc1, qc2, qc3, qc4;
1575 int curbits = 0;
1576
1577 uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1578 float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
1579
1580 for (i = 0; i < size; i += 4) {
1581 const float *vec, *vec2;
1582 int curidx, curidx2;
1583 int *in_int = (int *)&in[i];
1584 float *in_pos = (float *)&in[i];
1585 float di0, di1, di2, di3;
1586
1587 qc1 = scaled[i ] * Q34 + 0.4054f;
1588 qc2 = scaled[i+1] * Q34 + 0.4054f;
1589 qc3 = scaled[i+2] * Q34 + 0.4054f;
1590 qc4 = scaled[i+3] * Q34 + 0.4054f;
1591
1592 __asm__ volatile (
1593 ".set push \n\t"
1594 ".set noreorder \n\t"
1595
1596 "ori $t4, $zero, 4 \n\t"
1597 "slt $t0, $t4, %[qc1] \n\t"
1598 "slt $t1, $t4, %[qc2] \n\t"
1599 "slt $t2, $t4, %[qc3] \n\t"
1600 "slt $t3, $t4, %[qc4] \n\t"
1601 "movn %[qc1], $t4, $t0 \n\t"
1602 "movn %[qc2], $t4, $t1 \n\t"
1603 "movn %[qc3], $t4, $t2 \n\t"
1604 "movn %[qc4], $t4, $t3 \n\t"
1605 "lw $t0, 0(%[in_int]) \n\t"
1606 "lw $t1, 4(%[in_int]) \n\t"
1607 "lw $t2, 8(%[in_int]) \n\t"
1608 "lw $t3, 12(%[in_int]) \n\t"
1609 "srl $t0, $t0, 31 \n\t"
1610 "srl $t1, $t1, 31 \n\t"
1611 "srl $t2, $t2, 31 \n\t"
1612 "srl $t3, $t3, 31 \n\t"
1613 "subu $t4, $zero, %[qc1] \n\t"
1614 "subu $t5, $zero, %[qc2] \n\t"
1615 "subu $t6, $zero, %[qc3] \n\t"
1616 "subu $t7, $zero, %[qc4] \n\t"
1617 "movn %[qc1], $t4, $t0 \n\t"
1618 "movn %[qc2], $t5, $t1 \n\t"
1619 "movn %[qc3], $t6, $t2 \n\t"
1620 "movn %[qc4], $t7, $t3 \n\t"
1621
1622 ".set pop \n\t"
1623
1624 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1625 [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1626 : [in_int]"r"(in_int)
1627 : "t0", "t1", "t2", "t3",
1628 "t4", "t5", "t6", "t7",
1629 "memory"
1630 );
1631
1632 curidx = 9 * qc1;
1633 curidx += qc2 + 40;
1634
1635 curidx2 = 9 * qc3;
1636 curidx2 += qc4 + 40;
1637
1638 curbits += p_bits[curidx];
1639 curbits += p_bits[curidx2];
1640
1641 vec = &p_codes[curidx*2];
1642 vec2 = &p_codes[curidx2*2];
1643
1644 __asm__ volatile (
1645 ".set push \n\t"
1646 ".set noreorder \n\t"
1647
1648 "lwc1 $f0, 0(%[in_pos]) \n\t"
1649 "lwc1 $f1, 0(%[vec]) \n\t"
1650 "lwc1 $f2, 4(%[in_pos]) \n\t"
1651 "lwc1 $f3, 4(%[vec]) \n\t"
1652 "lwc1 $f4, 8(%[in_pos]) \n\t"
1653 "lwc1 $f5, 0(%[vec2]) \n\t"
1654 "lwc1 $f6, 12(%[in_pos]) \n\t"
1655 "lwc1 $f7, 4(%[vec2]) \n\t"
1656 "nmsub.s %[di0], $f0, $f1, %[IQ] \n\t"
1657 "nmsub.s %[di1], $f2, $f3, %[IQ] \n\t"
1658 "nmsub.s %[di2], $f4, $f5, %[IQ] \n\t"
1659 "nmsub.s %[di3], $f6, $f7, %[IQ] \n\t"
1660
1661 ".set pop \n\t"
1662
1663 : [di0]"=&f"(di0), [di1]"=&f"(di1),
1664 [di2]"=&f"(di2), [di3]"=&f"(di3)
1665 : [in_pos]"r"(in_pos), [vec]"r"(vec),
1666 [vec2]"r"(vec2), [IQ]"f"(IQ)
1667 : "$f0", "$f1", "$f2", "$f3",
1668 "$f4", "$f5", "$f6", "$f7",
1669 "memory"
1670 );
1671
1672 cost += di0 * di0 + di1 * di1
1673 + di2 * di2 + di3 * di3;
1674 }
1675
1676 if (bits)
1677 *bits = curbits;
1678 return cost * lambda + curbits;
1679}
1680
1681static float get_band_cost_UPAIR7_mips(struct AACEncContext *s,
1682 PutBitContext *pb, const float *in,
1683 const float *scaled, int size, int scale_idx,
1684 int cb, const float lambda, const float uplim,
1685 int *bits)
1686{
1687 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1688 const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1689 int i;
1690 float cost = 0;
1691 int qc1, qc2, qc3, qc4;
1692 int curbits = 0;
1693
1694 uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1695 float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
1696
1697 for (i = 0; i < size; i += 4) {
1698 const float *vec, *vec2;
1699 int curidx, curidx2, sign1, count1, sign2, count2;
1700 int *in_int = (int *)&in[i];
1701 float *in_pos = (float *)&in[i];
1702 float di0, di1, di2, di3;
1703
1704 qc1 = scaled[i ] * Q34 + 0.4054f;
1705 qc2 = scaled[i+1] * Q34 + 0.4054f;
1706 qc3 = scaled[i+2] * Q34 + 0.4054f;
1707 qc4 = scaled[i+3] * Q34 + 0.4054f;
1708
1709 __asm__ volatile (
1710 ".set push \n\t"
1711 ".set noreorder \n\t"
1712
1713 "ori $t4, $zero, 7 \n\t"
1714 "ori %[sign1], $zero, 0 \n\t"
1715 "ori %[sign2], $zero, 0 \n\t"
1716 "slt $t0, $t4, %[qc1] \n\t"
1717 "slt $t1, $t4, %[qc2] \n\t"
1718 "slt $t2, $t4, %[qc3] \n\t"
1719 "slt $t3, $t4, %[qc4] \n\t"
1720 "movn %[qc1], $t4, $t0 \n\t"
1721 "movn %[qc2], $t4, $t1 \n\t"
1722 "movn %[qc3], $t4, $t2 \n\t"
1723 "movn %[qc4], $t4, $t3 \n\t"
1724 "lw $t0, 0(%[in_int]) \n\t"
1725 "lw $t1, 4(%[in_int]) \n\t"
1726 "lw $t2, 8(%[in_int]) \n\t"
1727 "lw $t3, 12(%[in_int]) \n\t"
1728 "slt $t0, $t0, $zero \n\t"
1729 "movn %[sign1], $t0, %[qc1] \n\t"
1730 "slt $t2, $t2, $zero \n\t"
1731 "movn %[sign2], $t2, %[qc3] \n\t"
1732 "slt $t1, $t1, $zero \n\t"
1733 "sll $t0, %[sign1], 1 \n\t"
1734 "or $t0, $t0, $t1 \n\t"
1735 "movn %[sign1], $t0, %[qc2] \n\t"
1736 "slt $t3, $t3, $zero \n\t"
1737 "sll $t0, %[sign2], 1 \n\t"
1738 "or $t0, $t0, $t3 \n\t"
1739 "movn %[sign2], $t0, %[qc4] \n\t"
1740 "slt %[count1], $zero, %[qc1] \n\t"
1741 "slt $t1, $zero, %[qc2] \n\t"
1742 "slt %[count2], $zero, %[qc3] \n\t"
1743 "slt $t2, $zero, %[qc4] \n\t"
1744 "addu %[count1], %[count1], $t1 \n\t"
1745 "addu %[count2], %[count2], $t2 \n\t"
1746
1747 ".set pop \n\t"
1748
1749 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1750 [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1751 [sign1]"=&r"(sign1), [count1]"=&r"(count1),
1752 [sign2]"=&r"(sign2), [count2]"=&r"(count2)
1753 : [in_int]"r"(in_int)
1754 : "t0", "t1", "t2", "t3", "t4",
1755 "memory"
1756 );
1757
1758 curidx = 8 * qc1;
1759 curidx += qc2;
1760
1761 curidx2 = 8 * qc3;
1762 curidx2 += qc4;
1763
1764 curbits += p_bits[curidx];
1765 curbits += upair7_sign_bits[curidx];
1766 vec = &p_codes[curidx*2];
1767
1768 curbits += p_bits[curidx2];
1769 curbits += upair7_sign_bits[curidx2];
1770 vec2 = &p_codes[curidx2*2];
1771
1772 __asm__ volatile (
1773 ".set push \n\t"
1774 ".set noreorder \n\t"
1775
1776 "lwc1 %[di0], 0(%[in_pos]) \n\t"
1777 "lwc1 %[di1], 4(%[in_pos]) \n\t"
1778 "lwc1 %[di2], 8(%[in_pos]) \n\t"
1779 "lwc1 %[di3], 12(%[in_pos]) \n\t"
1780 "abs.s %[di0], %[di0] \n\t"
1781 "abs.s %[di1], %[di1] \n\t"
1782 "abs.s %[di2], %[di2] \n\t"
1783 "abs.s %[di3], %[di3] \n\t"
1784 "lwc1 $f0, 0(%[vec]) \n\t"
1785 "lwc1 $f1, 4(%[vec]) \n\t"
1786 "lwc1 $f2, 0(%[vec2]) \n\t"
1787 "lwc1 $f3, 4(%[vec2]) \n\t"
1788 "nmsub.s %[di0], %[di0], $f0, %[IQ] \n\t"
1789 "nmsub.s %[di1], %[di1], $f1, %[IQ] \n\t"
1790 "nmsub.s %[di2], %[di2], $f2, %[IQ] \n\t"
1791 "nmsub.s %[di3], %[di3], $f3, %[IQ] \n\t"
1792
1793 ".set pop \n\t"
1794
1795 : [di0]"=&f"(di0), [di1]"=&f"(di1),
1796 [di2]"=&f"(di2), [di3]"=&f"(di3)
1797 : [in_pos]"r"(in_pos), [vec]"r"(vec),
1798 [vec2]"r"(vec2), [IQ]"f"(IQ)
1799 : "$f0", "$f1", "$f2", "$f3",
1800 "memory"
1801 );
1802
1803 cost += di0 * di0 + di1 * di1
1804 + di2 * di2 + di3 * di3;
1805 }
1806
1807 if (bits)
1808 *bits = curbits;
1809 return cost * lambda + curbits;
1810}
1811
1812static float get_band_cost_UPAIR12_mips(struct AACEncContext *s,
1813 PutBitContext *pb, const float *in,
1814 const float *scaled, int size, int scale_idx,
1815 int cb, const float lambda, const float uplim,
1816 int *bits)
1817{
1818 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1819 const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1820 int i;
1821 float cost = 0;
1822 int qc1, qc2, qc3, qc4;
1823 int curbits = 0;
1824
1825 uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1826 float *p_codes = (float *)ff_aac_codebook_vectors[cb-1];
1827
1828 for (i = 0; i < size; i += 4) {
1829 const float *vec, *vec2;
1830 int curidx, curidx2;
1831 int sign1, count1, sign2, count2;
1832 int *in_int = (int *)&in[i];
1833 float *in_pos = (float *)&in[i];
1834 float di0, di1, di2, di3;
1835
1836 qc1 = scaled[i ] * Q34 + 0.4054f;
1837 qc2 = scaled[i+1] * Q34 + 0.4054f;
1838 qc3 = scaled[i+2] * Q34 + 0.4054f;
1839 qc4 = scaled[i+3] * Q34 + 0.4054f;
1840
1841 __asm__ volatile (
1842 ".set push \n\t"
1843 ".set noreorder \n\t"
1844
1845 "ori $t4, $zero, 12 \n\t"
1846 "ori %[sign1], $zero, 0 \n\t"
1847 "ori %[sign2], $zero, 0 \n\t"
1848 "slt $t0, $t4, %[qc1] \n\t"
1849 "slt $t1, $t4, %[qc2] \n\t"
1850 "slt $t2, $t4, %[qc3] \n\t"
1851 "slt $t3, $t4, %[qc4] \n\t"
1852 "movn %[qc1], $t4, $t0 \n\t"
1853 "movn %[qc2], $t4, $t1 \n\t"
1854 "movn %[qc3], $t4, $t2 \n\t"
1855 "movn %[qc4], $t4, $t3 \n\t"
1856 "lw $t0, 0(%[in_int]) \n\t"
1857 "lw $t1, 4(%[in_int]) \n\t"
1858 "lw $t2, 8(%[in_int]) \n\t"
1859 "lw $t3, 12(%[in_int]) \n\t"
1860 "slt $t0, $t0, $zero \n\t"
1861 "movn %[sign1], $t0, %[qc1] \n\t"
1862 "slt $t2, $t2, $zero \n\t"
1863 "movn %[sign2], $t2, %[qc3] \n\t"
1864 "slt $t1, $t1, $zero \n\t"
1865 "sll $t0, %[sign1], 1 \n\t"
1866 "or $t0, $t0, $t1 \n\t"
1867 "movn %[sign1], $t0, %[qc2] \n\t"
1868 "slt $t3, $t3, $zero \n\t"
1869 "sll $t0, %[sign2], 1 \n\t"
1870 "or $t0, $t0, $t3 \n\t"
1871 "movn %[sign2], $t0, %[qc4] \n\t"
1872 "slt %[count1], $zero, %[qc1] \n\t"
1873 "slt $t1, $zero, %[qc2] \n\t"
1874 "slt %[count2], $zero, %[qc3] \n\t"
1875 "slt $t2, $zero, %[qc4] \n\t"
1876 "addu %[count1], %[count1], $t1 \n\t"
1877 "addu %[count2], %[count2], $t2 \n\t"
1878
1879 ".set pop \n\t"
1880
1881 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1882 [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1883 [sign1]"=&r"(sign1), [count1]"=&r"(count1),
1884 [sign2]"=&r"(sign2), [count2]"=&r"(count2)
1885 : [in_int]"r"(in_int)
1886 : "t0", "t1", "t2", "t3", "t4",
1887 "memory"
1888 );
1889
1890 curidx = 13 * qc1;
1891 curidx += qc2;
1892
1893 curidx2 = 13 * qc3;
1894 curidx2 += qc4;
1895
1896 curbits += p_bits[curidx];
1897 curbits += p_bits[curidx2];
1898 curbits += upair12_sign_bits[curidx];
1899 curbits += upair12_sign_bits[curidx2];
1900 vec = &p_codes[curidx*2];
1901 vec2 = &p_codes[curidx2*2];
1902
1903 __asm__ volatile (
1904 ".set push \n\t"
1905 ".set noreorder \n\t"
1906
1907 "lwc1 %[di0], 0(%[in_pos]) \n\t"
1908 "lwc1 %[di1], 4(%[in_pos]) \n\t"
1909 "lwc1 %[di2], 8(%[in_pos]) \n\t"
1910 "lwc1 %[di3], 12(%[in_pos]) \n\t"
1911 "abs.s %[di0], %[di0] \n\t"
1912 "abs.s %[di1], %[di1] \n\t"
1913 "abs.s %[di2], %[di2] \n\t"
1914 "abs.s %[di3], %[di3] \n\t"
1915 "lwc1 $f0, 0(%[vec]) \n\t"
1916 "lwc1 $f1, 4(%[vec]) \n\t"
1917 "lwc1 $f2, 0(%[vec2]) \n\t"
1918 "lwc1 $f3, 4(%[vec2]) \n\t"
1919 "nmsub.s %[di0], %[di0], $f0, %[IQ] \n\t"
1920 "nmsub.s %[di1], %[di1], $f1, %[IQ] \n\t"
1921 "nmsub.s %[di2], %[di2], $f2, %[IQ] \n\t"
1922 "nmsub.s %[di3], %[di3], $f3, %[IQ] \n\t"
1923
1924 ".set pop \n\t"
1925
1926 : [di0]"=&f"(di0), [di1]"=&f"(di1),
1927 [di2]"=&f"(di2), [di3]"=&f"(di3)
1928 : [in_pos]"r"(in_pos), [vec]"r"(vec),
1929 [vec2]"r"(vec2), [IQ]"f"(IQ)
1930 : "$f0", "$f1", "$f2", "$f3",
1931 "memory"
1932 );
1933
1934 cost += di0 * di0 + di1 * di1
1935 + di2 * di2 + di3 * di3;
1936 }
1937
1938 if (bits)
1939 *bits = curbits;
1940 return cost * lambda + curbits;
1941}
1942
1943static float get_band_cost_ESC_mips(struct AACEncContext *s,
1944 PutBitContext *pb, const float *in,
1945 const float *scaled, int size, int scale_idx,
1946 int cb, const float lambda, const float uplim,
1947 int *bits)
1948{
1949 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1950 const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1951 const float CLIPPED_ESCAPE = 165140.0f * IQ;
1952 int i;
1953 float cost = 0;
1954 int qc1, qc2, qc3, qc4;
1955 int curbits = 0;
1956
1957 uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1958 float *p_codes = (float* )ff_aac_codebook_vectors[cb-1];
1959
1960 for (i = 0; i < size; i += 4) {
1961 const float *vec, *vec2;
1962 int curidx, curidx2;
1963 float t1, t2, t3, t4;
1964 float di1, di2, di3, di4;
1965 int cond0, cond1, cond2, cond3;
1966 int c1, c2, c3, c4;
1967
1968 qc1 = scaled[i ] * Q34 + 0.4054f;
1969 qc2 = scaled[i+1] * Q34 + 0.4054f;
1970 qc3 = scaled[i+2] * Q34 + 0.4054f;
1971 qc4 = scaled[i+3] * Q34 + 0.4054f;
1972
1973 __asm__ volatile (
1974 ".set push \n\t"
1975 ".set noreorder \n\t"
1976
1977 "ori $t4, $zero, 15 \n\t"
1978 "ori $t5, $zero, 16 \n\t"
1979 "shll_s.w %[c1], %[qc1], 18 \n\t"
1980 "shll_s.w %[c2], %[qc2], 18 \n\t"
1981 "shll_s.w %[c3], %[qc3], 18 \n\t"
1982 "shll_s.w %[c4], %[qc4], 18 \n\t"
1983 "srl %[c1], %[c1], 18 \n\t"
1984 "srl %[c2], %[c2], 18 \n\t"
1985 "srl %[c3], %[c3], 18 \n\t"
1986 "srl %[c4], %[c4], 18 \n\t"
1987 "slt %[cond0], $t4, %[qc1] \n\t"
1988 "slt %[cond1], $t4, %[qc2] \n\t"
1989 "slt %[cond2], $t4, %[qc3] \n\t"
1990 "slt %[cond3], $t4, %[qc4] \n\t"
1991 "movn %[qc1], $t5, %[cond0] \n\t"
1992 "movn %[qc2], $t5, %[cond1] \n\t"
1993 "movn %[qc3], $t5, %[cond2] \n\t"
1994 "movn %[qc4], $t5, %[cond3] \n\t"
1995
1996 ".set pop \n\t"
1997
1998 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1999 [qc3]"+r"(qc3), [qc4]"+r"(qc4),
2000 [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
2001 [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
2002 [c1]"=&r"(c1), [c2]"=&r"(c2),
2003 [c3]"=&r"(c3), [c4]"=&r"(c4)
2004 :
2005 : "t4", "t5"
2006 );
2007
2008 curidx = 17 * qc1;
2009 curidx += qc2;
2010
2011 curidx2 = 17 * qc3;
2012 curidx2 += qc4;
2013
2014 curbits += p_bits[curidx];
2015 curbits += esc_sign_bits[curidx];
2016 vec = &p_codes[curidx*2];
2017
2018 curbits += p_bits[curidx2];
2019 curbits += esc_sign_bits[curidx2];
2020 vec2 = &p_codes[curidx2*2];
2021
2022 curbits += (av_log2(c1) * 2 - 3) & (-cond0);
2023 curbits += (av_log2(c2) * 2 - 3) & (-cond1);
2024 curbits += (av_log2(c3) * 2 - 3) & (-cond2);
2025 curbits += (av_log2(c4) * 2 - 3) & (-cond3);
2026
2027 t1 = fabsf(in[i ]);
2028 t2 = fabsf(in[i+1]);
2029 t3 = fabsf(in[i+2]);
2030 t4 = fabsf(in[i+3]);
2031
2032 if (cond0) {
2033 if (t1 >= CLIPPED_ESCAPE) {
2034 di1 = t1 - CLIPPED_ESCAPE;
2035 } else {
2036 di1 = t1 - c1 * cbrtf(c1) * IQ;
2037 }
2038 } else
2039 di1 = t1 - vec[0] * IQ;
2040
2041 if (cond1) {
2042 if (t2 >= CLIPPED_ESCAPE) {
2043 di2 = t2 - CLIPPED_ESCAPE;
2044 } else {
2045 di2 = t2 - c2 * cbrtf(c2) * IQ;
2046 }
2047 } else
2048 di2 = t2 - vec[1] * IQ;
2049
2050 if (cond2) {
2051 if (t3 >= CLIPPED_ESCAPE) {
2052 di3 = t3 - CLIPPED_ESCAPE;
2053 } else {
2054 di3 = t3 - c3 * cbrtf(c3) * IQ;
2055 }
2056 } else
2057 di3 = t3 - vec2[0] * IQ;
2058
2059 if (cond3) {
2060 if (t4 >= CLIPPED_ESCAPE) {
2061 di4 = t4 - CLIPPED_ESCAPE;
2062 } else {
2063 di4 = t4 - c4 * cbrtf(c4) * IQ;
2064 }
2065 } else
2066 di4 = t4 - vec2[1]*IQ;
2067
2068 cost += di1 * di1 + di2 * di2
2069 + di3 * di3 + di4 * di4;
2070 }
2071
2072 if (bits)
2073 *bits = curbits;
2074 return cost * lambda + curbits;
2075}
2076
2077static float (*const get_band_cost_arr[])(struct AACEncContext *s,
2078 PutBitContext *pb, const float *in,
2079 const float *scaled, int size, int scale_idx,
2080 int cb, const float lambda, const float uplim,
2081 int *bits) = {
2082 get_band_cost_ZERO_mips,
2083 get_band_cost_SQUAD_mips,
2084 get_band_cost_SQUAD_mips,
2085 get_band_cost_UQUAD_mips,
2086 get_band_cost_UQUAD_mips,
2087 get_band_cost_SPAIR_mips,
2088 get_band_cost_SPAIR_mips,
2089 get_band_cost_UPAIR7_mips,
2090 get_band_cost_UPAIR7_mips,
2091 get_band_cost_UPAIR12_mips,
2092 get_band_cost_UPAIR12_mips,
2093 get_band_cost_ESC_mips,
2094};
2095
/* Call the cost routine that get_band_cost_arr holds for cb. */
#define get_band_cost(s, pb, in, scaled, size, scale_idx, cb,                 \
                      lambda, uplim, bits)                                    \
    get_band_cost_arr[cb](s, pb, in, scaled, size, scale_idx, cb,             \
                          lambda, uplim, bits)
2102
2103static float quantize_band_cost(struct AACEncContext *s, const float *in,
2104 const float *scaled, int size, int scale_idx,
2105 int cb, const float lambda, const float uplim,
2106 int *bits)
2107{
2108 return get_band_cost(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
2109}
2110
2111static void search_for_quantizers_twoloop_mips(AVCodecContext *avctx,
2112 AACEncContext *s,
2113 SingleChannelElement *sce,
2114 const float lambda)
2115{
2116 int start = 0, i, w, w2, g;
2117 int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate / avctx->channels;
2118 float dists[128] = { 0 }, uplims[128];
2119 float maxvals[128];
2120 int fflag, minscaler;
2121 int its = 0;
2122 int allz = 0;
2123 float minthr = INFINITY;
2124
2125 destbits = FFMIN(destbits, 5800);
2126 for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2127 for (g = 0; g < sce->ics.num_swb; g++) {
2128 int nz = 0;
2129 float uplim = 0.0f;
2130 for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
2131 FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
2132 uplim += band->threshold;
2133 if (band->energy <= band->threshold || band->threshold == 0.0f) {
2134 sce->zeroes[(w+w2)*16+g] = 1;
2135 continue;
2136 }
2137 nz = 1;
2138 }
2139 uplims[w*16+g] = uplim *512;
2140 sce->zeroes[w*16+g] = !nz;
2141 if (nz)
2142 minthr = FFMIN(minthr, uplim);
2143 allz |= nz;
2144 }
2145 }
2146 for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2147 for (g = 0; g < sce->ics.num_swb; g++) {
2148 if (sce->zeroes[w*16+g]) {
2149 sce->sf_idx[w*16+g] = SCALE_ONE_POS;
2150 continue;
2151 }
2152 sce->sf_idx[w*16+g] = SCALE_ONE_POS + FFMIN(log2f(uplims[w*16+g]/minthr)*4,59);
2153 }
2154 }
2155
2156 if (!allz)
2157 return;
2158 abs_pow34_v(s->scoefs, sce->coeffs, 1024);
2159
2160 for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2161 start = w*128;
2162 for (g = 0; g < sce->ics.num_swb; g++) {
2163 const float *scaled = s->scoefs + start;
2164 maxvals[w*16+g] = find_max_val(sce->ics.group_len[w], sce->ics.swb_sizes[g], scaled);
2165 start += sce->ics.swb_sizes[g];
2166 }
2167 }
2168
2169 do {
2170 int tbits, qstep;
2171 minscaler = sce->sf_idx[0];
2172 qstep = its ? 1 : 32;
2173 do {
2174 int prev = -1;
2175 tbits = 0;
2176 fflag = 0;
2177
2178 if (qstep > 1) {
2179 for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2180 start = w*128;
2181 for (g = 0; g < sce->ics.num_swb; g++) {
2182 const float *coefs = sce->coeffs + start;
2183 const float *scaled = s->scoefs + start;
2184 int bits = 0;
2185 int cb;
2186
2187 if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
2188 start += sce->ics.swb_sizes[g];
2189 continue;
2190 }
2191 minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
2192 cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
2193 for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
2194 int b;
2195 bits += quantize_band_cost_bits(s, coefs + w2*128,
2196 scaled + w2*128,
2197 sce->ics.swb_sizes[g],
2198 sce->sf_idx[w*16+g],
2199 cb,
2200 1.0f,
2201 INFINITY,
2202 &b);
2203 }
2204 if (prev != -1) {
2205 bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
2206 }
2207 tbits += bits;
2208 start += sce->ics.swb_sizes[g];
2209 prev = sce->sf_idx[w*16+g];
2210 }
2211 }
2212 }
2213 else {
2214 for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2215 start = w*128;
2216 for (g = 0; g < sce->ics.num_swb; g++) {
2217 const float *coefs = sce->coeffs + start;
2218 const float *scaled = s->scoefs + start;
2219 int bits = 0;
2220 int cb;
2221 float dist = 0.0f;
2222
2223 if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
2224 start += sce->ics.swb_sizes[g];
2225 continue;
2226 }
2227 minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
2228 cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
2229 for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
2230 int b;
2231 dist += quantize_band_cost(s, coefs + w2*128,
2232 scaled + w2*128,
2233 sce->ics.swb_sizes[g],
2234 sce->sf_idx[w*16+g],
2235 cb,
2236 1.0f,
2237 INFINITY,
2238 &b);
2239 bits += b;
2240 }
2241 dists[w*16+g] = dist - bits;
2242 if (prev != -1) {
2243 bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
2244 }
2245 tbits += bits;
2246 start += sce->ics.swb_sizes[g];
2247 prev = sce->sf_idx[w*16+g];
2248 }
2249 }
2250 }
2251 if (tbits > destbits) {
2252 for (i = 0; i < 128; i++)
2253 if (sce->sf_idx[i] < 218 - qstep)
2254 sce->sf_idx[i] += qstep;
2255 } else {
2256 for (i = 0; i < 128; i++)
2257 if (sce->sf_idx[i] > 60 - qstep)
2258 sce->sf_idx[i] -= qstep;
2259 }
2260 qstep >>= 1;
2261 if (!qstep && tbits > destbits*1.02 && sce->sf_idx[0] < 217)
2262 qstep = 1;
2263 } while (qstep);
2264
2265 fflag = 0;
2266 minscaler = av_clip(minscaler, 60, 255 - SCALE_MAX_DIFF);
2267 for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2268 for (g = 0; g < sce->ics.num_swb; g++) {
2269 int prevsc = sce->sf_idx[w*16+g];
2270 if (dists[w*16+g] > uplims[w*16+g] && sce->sf_idx[w*16+g] > 60) {
2271 if (find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1))
2272 sce->sf_idx[w*16+g]--;
2273 else
2274 sce->sf_idx[w*16+g]-=2;
2275 }
2276 sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF);
2277 sce->sf_idx[w*16+g] = FFMIN(sce->sf_idx[w*16+g], 219);
2278 if (sce->sf_idx[w*16+g] != prevsc)
2279 fflag = 1;
2280 sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
2281 }
2282 }
2283 its++;
2284 } while (fflag && its < 10);
2285}
2286
2287static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe,
2288 const float lambda)
2289{
2290 int start = 0, i, w, w2, g;
2291 float M[128], S[128];
2292 float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3;
2293 SingleChannelElement *sce0 = &cpe->ch[0];
2294 SingleChannelElement *sce1 = &cpe->ch[1];
2295 if (!cpe->common_window)
2296 return;
2297 for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
2298 for (g = 0; g < sce0->ics.num_swb; g++) {
2299 if (!cpe->ch[0].zeroes[w*16+g] && !cpe->ch[1].zeroes[w*16+g]) {
2300 float dist1 = 0.0f, dist2 = 0.0f;
2301 for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
2302 FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
2303 FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
2304 float minthr = FFMIN(band0->threshold, band1->threshold);
2305 float maxthr = FFMAX(band0->threshold, band1->threshold);
2306 for (i = 0; i < sce0->ics.swb_sizes[g]; i+=4) {
2307 M[i ] = (sce0->coeffs[start+w2*128+i ]
2308 + sce1->coeffs[start+w2*128+i ]) * 0.5;
2309 M[i+1] = (sce0->coeffs[start+w2*128+i+1]
2310 + sce1->coeffs[start+w2*128+i+1]) * 0.5;
2311 M[i+2] = (sce0->coeffs[start+w2*128+i+2]
2312 + sce1->coeffs[start+w2*128+i+2]) * 0.5;
2313 M[i+3] = (sce0->coeffs[start+w2*128+i+3]
2314 + sce1->coeffs[start+w2*128+i+3]) * 0.5;
2315
2316 S[i ] = M[i ]
2317 - sce1->coeffs[start+w2*128+i ];
2318 S[i+1] = M[i+1]
2319 - sce1->coeffs[start+w2*128+i+1];
2320 S[i+2] = M[i+2]
2321 - sce1->coeffs[start+w2*128+i+2];
2322 S[i+3] = M[i+3]
2323 - sce1->coeffs[start+w2*128+i+3];
2324 }
2325 abs_pow34_v(L34, sce0->coeffs+start+w2*128, sce0->ics.swb_sizes[g]);
2326 abs_pow34_v(R34, sce1->coeffs+start+w2*128, sce0->ics.swb_sizes[g]);
2327 abs_pow34_v(M34, M, sce0->ics.swb_sizes[g]);
2328 abs_pow34_v(S34, S, sce0->ics.swb_sizes[g]);
2329 dist1 += quantize_band_cost(s, sce0->coeffs + start + w2*128,
2330 L34,
2331 sce0->ics.swb_sizes[g],
2332 sce0->sf_idx[(w+w2)*16+g],
2333 sce0->band_type[(w+w2)*16+g],
2334 lambda / band0->threshold, INFINITY, NULL);
2335 dist1 += quantize_band_cost(s, sce1->coeffs + start + w2*128,
2336 R34,
2337 sce1->ics.swb_sizes[g],
2338 sce1->sf_idx[(w+w2)*16+g],
2339 sce1->band_type[(w+w2)*16+g],
2340 lambda / band1->threshold, INFINITY, NULL);
2341 dist2 += quantize_band_cost(s, M,
2342 M34,
2343 sce0->ics.swb_sizes[g],
2344 sce0->sf_idx[(w+w2)*16+g],
2345 sce0->band_type[(w+w2)*16+g],
2346 lambda / maxthr, INFINITY, NULL);
2347 dist2 += quantize_band_cost(s, S,
2348 S34,
2349 sce1->ics.swb_sizes[g],
2350 sce1->sf_idx[(w+w2)*16+g],
2351 sce1->band_type[(w+w2)*16+g],
2352 lambda / minthr, INFINITY, NULL);
2353 }
2354 cpe->ms_mask[w*16+g] = dist2 < dist1;
2355 }
2356 start += sce0->ics.swb_sizes[g];
2357 }
2358 }
2359}
2360#endif /*HAVE_MIPSFPU */
2361
2362static void codebook_trellis_rate_mips(AACEncContext *s, SingleChannelElement *sce,
2363 int win, int group_len, const float lambda)
2364{
2365 BandCodingPath path[120][12];
2366 int w, swb, cb, start, size;
2367 int i, j;
2368 const int max_sfb = sce->ics.max_sfb;
2369 const int run_bits = sce->ics.num_windows == 1 ? 5 : 3;
2370 const int run_esc = (1 << run_bits) - 1;
2371 int idx, ppos, count;
2372 int stackrun[120], stackcb[120], stack_len;
2373 float next_minbits = INFINITY;
2374 int next_mincb = 0;
2375
2376 abs_pow34_v(s->scoefs, sce->coeffs, 1024);
2377 start = win*128;
2378 for (cb = 0; cb < 12; cb++) {
2379 path[0][cb].cost = run_bits+4;
2380 path[0][cb].prev_idx = -1;
2381 path[0][cb].run = 0;
2382 }
2383 for (swb = 0; swb < max_sfb; swb++) {
2384 size = sce->ics.swb_sizes[swb];
2385 if (sce->zeroes[win*16 + swb]) {
2386 float cost_stay_here = path[swb][0].cost;
2387 float cost_get_here = next_minbits + run_bits + 4;
2388 if ( run_value_bits[sce->ics.num_windows == 8][path[swb][0].run]
2389 != run_value_bits[sce->ics.num_windows == 8][path[swb][0].run+1])
2390 cost_stay_here += run_bits;
2391 if (cost_get_here < cost_stay_here) {
2392 path[swb+1][0].prev_idx = next_mincb;
2393 path[swb+1][0].cost = cost_get_here;
2394 path[swb+1][0].run = 1;
2395 } else {
2396 path[swb+1][0].prev_idx = 0;
2397 path[swb+1][0].cost = cost_stay_here;
2398 path[swb+1][0].run = path[swb][0].run + 1;
2399 }
2400 next_minbits = path[swb+1][0].cost;
2401 next_mincb = 0;
2402 for (cb = 1; cb < 12; cb++) {
2403 path[swb+1][cb].cost = 61450;
2404 path[swb+1][cb].prev_idx = -1;
2405 path[swb+1][cb].run = 0;
2406 }
2407 } else {
2408 float minbits = next_minbits;
2409 int mincb = next_mincb;
2410 int startcb = sce->band_type[win*16+swb];
2411 next_minbits = INFINITY;
2412 next_mincb = 0;
2413 for (cb = 0; cb < startcb; cb++) {
2414 path[swb+1][cb].cost = 61450;
2415 path[swb+1][cb].prev_idx = -1;
2416 path[swb+1][cb].run = 0;
2417 }
2418 for (cb = startcb; cb < 12; cb++) {
2419 float cost_stay_here, cost_get_here;
2420 float bits = 0.0f;
2421 for (w = 0; w < group_len; w++) {
2422 bits += quantize_band_cost_bits(s, sce->coeffs + start + w*128,
2423 s->scoefs + start + w*128, size,
2424 sce->sf_idx[(win+w)*16+swb], cb,
2425 0, INFINITY, NULL);
2426 }
2427 cost_stay_here = path[swb][cb].cost + bits;
2428 cost_get_here = minbits + bits + run_bits + 4;
2429 if ( run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run]
2430 != run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run+1])
2431 cost_stay_here += run_bits;
2432 if (cost_get_here < cost_stay_here) {
2433 path[swb+1][cb].prev_idx = mincb;
2434 path[swb+1][cb].cost = cost_get_here;
2435 path[swb+1][cb].run = 1;
2436 } else {
2437 path[swb+1][cb].prev_idx = cb;
2438 path[swb+1][cb].cost = cost_stay_here;
2439 path[swb+1][cb].run = path[swb][cb].run + 1;
2440 }
2441 if (path[swb+1][cb].cost < next_minbits) {
2442 next_minbits = path[swb+1][cb].cost;
2443 next_mincb = cb;
2444 }
2445 }
2446 }
2447 start += sce->ics.swb_sizes[swb];
2448 }
2449
2450 stack_len = 0;
2451 idx = 0;
2452 for (cb = 1; cb < 12; cb++)
2453 if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
2454 idx = cb;
2455 ppos = max_sfb;
2456 while (ppos > 0) {
2457 av_assert1(idx >= 0);
2458 cb = idx;
2459 stackrun[stack_len] = path[ppos][cb].run;
2460 stackcb [stack_len] = cb;
2461 idx = path[ppos-path[ppos][cb].run+1][cb].prev_idx;
2462 ppos -= path[ppos][cb].run;
2463 stack_len++;
2464 }
2465
2466 start = 0;
2467 for (i = stack_len - 1; i >= 0; i--) {
2468 put_bits(&s->pb, 4, stackcb[i]);
2469 count = stackrun[i];
2470 memset(sce->zeroes + win*16 + start, !stackcb[i], count);
2471 for (j = 0; j < count; j++) {
2472 sce->band_type[win*16 + start] = stackcb[i];
2473 start++;
2474 }
2475 while (count >= run_esc) {
2476 put_bits(&s->pb, run_bits, run_esc);
2477 count -= run_esc;
2478 }
2479 put_bits(&s->pb, run_bits, count);
2480 }
2481}
2482#endif /* HAVE_INLINE_ASM */
2483
2484void ff_aac_coder_init_mips(AACEncContext *c) {
2485#if HAVE_INLINE_ASM
2486 AACCoefficientsEncoder *e = c->coder;
2487 int option = c->options.aac_coder;
2488
2489 if (option == 2) {
2490 e->quantize_and_encode_band = quantize_and_encode_band_mips;
2491 e->encode_window_bands_info = codebook_trellis_rate_mips;
2492#if HAVE_MIPSFPU
2493 e->search_for_quantizers = search_for_quantizers_twoloop_mips;
2494 e->search_for_ms = search_for_ms_mips;
2495#endif /* HAVE_MIPSFPU */
2496 }
2497#endif /* HAVE_INLINE_ASM */
2498}