Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * Copyright (c) 2012 | |
3 | * MIPS Technologies, Inc., California. | |
4 | * | |
5 | * Redistribution and use in source and binary forms, with or without | |
6 | * modification, are permitted provided that the following conditions | |
7 | * are met: | |
8 | * 1. Redistributions of source code must retain the above copyright | |
9 | * notice, this list of conditions and the following disclaimer. | |
10 | * 2. Redistributions in binary form must reproduce the above copyright | |
11 | * notice, this list of conditions and the following disclaimer in the | |
12 | * documentation and/or other materials provided with the distribution. | |
13 | * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its | |
14 | * contributors may be used to endorse or promote products derived from | |
15 | * this software without specific prior written permission. | |
16 | * | |
17 | * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND | |
18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
19 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
20 | * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE | |
21 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
22 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
23 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
24 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
25 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
26 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
27 | * SUCH DAMAGE. | |
28 | * | |
29 | * Author: Bojan Zivkovic (bojan@mips.com) | |
30 | * | |
31 | * MPEG Audio decoder optimized for MIPS fixed-point architecture | |
32 | * | |
33 | * This file is part of FFmpeg. | |
34 | * | |
35 | * FFmpeg is free software; you can redistribute it and/or | |
36 | * modify it under the terms of the GNU Lesser General Public | |
37 | * License as published by the Free Software Foundation; either | |
38 | * version 2.1 of the License, or (at your option) any later version. | |
39 | * | |
40 | * FFmpeg is distributed in the hope that it will be useful, | |
41 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
42 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
43 | * Lesser General Public License for more details. | |
44 | * | |
45 | * You should have received a copy of the GNU Lesser General Public | |
46 | * License along with FFmpeg; if not, write to the Free Software | |
47 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
48 | */ | |
49 | ||
50 | /** | |
51 | * @file | |
52 | * Reference: libavcodec/mpegaudiodsp_template.c | |
53 | */ | |
54 | ||
55 | #include <string.h> | |
56 | ||
57 | #include "libavcodec/mpegaudiodsp.h" | |
58 | ||
59 | static void ff_mpadsp_apply_window_mips_fixed(int32_t *synth_buf, int32_t *window, | |
60 | int *dither_state, int16_t *samples, int incr) | |
61 | { | |
62 | register const int32_t *w, *w2, *p; | |
63 | int j; | |
64 | int16_t *samples2; | |
65 | int w_asm, p_asm, w_asm1, p_asm1, w_asm2, p_asm2; | |
66 | int w2_asm, w2_asm1, *p_temp1, *p_temp2; | |
67 | int sum1 = 0; | |
68 | int const min_asm = -32768, max_asm = 32767; | |
69 | int temp1, temp2 = 0, temp3 = 0; | |
70 | int64_t sum; | |
71 | ||
72 | /* copy to avoid wrap */ | |
73 | memcpy(synth_buf + 512, synth_buf, 32 * sizeof(*synth_buf)); | |
74 | samples2 = samples + 31 * incr; | |
75 | w = window; | |
76 | w2 = window + 31; | |
77 | sum = *dither_state; | |
78 | p = synth_buf + 16; | |
79 | p_temp1 = synth_buf + 16; | |
80 | p_temp2 = synth_buf + 48; | |
81 | temp1 = sum; | |
82 | ||
83 | /** | |
84 | * use of round_sample function from the original code is eliminated, | |
85 | * changed with appropriate assembly instructions. | |
86 | */ | |
87 | __asm__ volatile ( | |
88 | "mthi $zero \n\t" | |
89 | "mtlo %[temp1] \n\t" | |
90 | "lw %[w_asm], 0(%[w]) \n\t" | |
91 | "lw %[p_asm], 0(%[p]) \n\t" | |
92 | "lw %[w_asm1], 64*4(%[w]) \n\t" | |
93 | "lw %[p_asm1], 64*4(%[p]) \n\t" | |
94 | "lw %[w_asm2], 128*4(%[w]) \n\t" | |
95 | "lw %[p_asm2], 128*4(%[p]) \n\t" | |
96 | "madd %[w_asm], %[p_asm] \n\t" | |
97 | "madd %[w_asm1], %[p_asm1] \n\t" | |
98 | "madd %[w_asm2], %[p_asm2] \n\t" | |
99 | "lw %[w_asm], 192*4(%[w]) \n\t" | |
100 | "lw %[p_asm], 192*4(%[p]) \n\t" | |
101 | "lw %[w_asm1], 256*4(%[w]) \n\t" | |
102 | "lw %[p_asm1], 256*4(%[p]) \n\t" | |
103 | "lw %[w_asm2], 320*4(%[w]) \n\t" | |
104 | "lw %[p_asm2], 320*4(%[p]) \n\t" | |
105 | "madd %[w_asm], %[p_asm] \n\t" | |
106 | "madd %[w_asm1], %[p_asm1] \n\t" | |
107 | "madd %[w_asm2], %[p_asm2] \n\t" | |
108 | "lw %[w_asm], 384*4(%[w]) \n\t" | |
109 | "lw %[p_asm], 384*4(%[p]) \n\t" | |
110 | "lw %[w_asm1], 448*4(%[w]) \n\t" | |
111 | "lw %[p_asm1], 448*4(%[p]) \n\t" | |
112 | "lw %[w_asm2], 32*4(%[w]) \n\t" | |
113 | "lw %[p_asm2], 32*4(%[p]) \n\t" | |
114 | "madd %[w_asm], %[p_asm] \n\t" | |
115 | "madd %[w_asm1], %[p_asm1] \n\t" | |
116 | "msub %[w_asm2], %[p_asm2] \n\t" | |
117 | "lw %[w_asm], 96*4(%[w]) \n\t" | |
118 | "lw %[p_asm], 96*4(%[p]) \n\t" | |
119 | "lw %[w_asm1], 160*4(%[w]) \n\t" | |
120 | "lw %[p_asm1], 160*4(%[p]) \n\t" | |
121 | "lw %[w_asm2], 224*4(%[w]) \n\t" | |
122 | "lw %[p_asm2], 224*4(%[p]) \n\t" | |
123 | "msub %[w_asm], %[p_asm] \n\t" | |
124 | "msub %[w_asm1], %[p_asm1] \n\t" | |
125 | "msub %[w_asm2], %[p_asm2] \n\t" | |
126 | "lw %[w_asm], 288*4(%[w]) \n\t" | |
127 | "lw %[p_asm], 288*4(%[p]) \n\t" | |
128 | "lw %[w_asm1], 352*4(%[w]) \n\t" | |
129 | "lw %[p_asm1], 352*4(%[p]) \n\t" | |
130 | "msub %[w_asm], %[p_asm] \n\t" | |
131 | "lw %[w_asm], 480*4(%[w]) \n\t" | |
132 | "lw %[p_asm], 480*4(%[p]) \n\t" | |
133 | "lw %[w_asm2], 416*4(%[w]) \n\t" | |
134 | "lw %[p_asm2], 416*4(%[p]) \n\t" | |
135 | "msub %[w_asm], %[p_asm] \n\t" | |
136 | "msub %[w_asm1], %[p_asm1] \n\t" | |
137 | "msub %[w_asm2], %[p_asm2] \n\t" | |
138 | ||
139 | /*round_sample function from the original code is eliminated, | |
140 | * changed with appropriate assembly instructions | |
141 | * code example: | |
142 | ||
143 | "extr.w %[sum1],$ac0,24 \n\t" | |
144 | "mflo %[temp3], $ac0 \n\t" | |
145 | "and %[temp1], %[temp3], 0x00ffffff \n\t" | |
146 | "slt %[temp2], %[sum1], %[min_asm] \n\t" | |
147 | "movn %[sum1], %[min_asm],%[temp2] \n\t" | |
148 | "slt %[temp2], %[max_asm],%[sum1] \n\t" | |
149 | "movn %[sum1], %[max_asm],%[temp2] \n\t" | |
150 | "sh %[sum1], 0(%[samples]) \n\t" | |
151 | */ | |
152 | ||
153 | "extr.w %[sum1], $ac0, 24 \n\t" | |
154 | "mflo %[temp3] \n\t" | |
155 | "addi %[w], %[w], 4 \n\t" | |
156 | "and %[temp1], %[temp3], 0x00ffffff \n\t" | |
157 | "slt %[temp2], %[sum1], %[min_asm] \n\t" | |
158 | "movn %[sum1], %[min_asm], %[temp2] \n\t" | |
159 | "slt %[temp2], %[max_asm], %[sum1] \n\t" | |
160 | "movn %[sum1], %[max_asm], %[temp2] \n\t" | |
161 | "sh %[sum1], 0(%[samples]) \n\t" | |
162 | ||
163 | : [w_asm] "=&r" (w_asm), [p_asm] "=&r" (p_asm), [w_asm1] "=&r" (w_asm1), | |
164 | [p_asm1] "=&r" (p_asm1), [temp1] "+r" (temp1), [temp2] "+r" (temp2), | |
165 | [w_asm2] "=&r" (w_asm2), [p_asm2] "=&r" (p_asm2), | |
166 | [sum1] "+r" (sum1), [w] "+r" (w), [temp3] "+r" (temp3) | |
167 | : [p] "r" (p), [samples] "r" (samples), [min_asm] "r" (min_asm), | |
168 | [max_asm] "r" (max_asm) | |
169 | : "memory", "hi","lo" | |
170 | ); | |
171 | ||
172 | samples += incr; | |
173 | ||
174 | /* we calculate two samples at the same time to avoid one memory | |
175 | access per two sample */ | |
176 | ||
177 | for(j = 1; j < 16; j++) { | |
178 | __asm__ volatile ( | |
179 | "mthi $0, $ac1 \n\t" | |
180 | "mtlo $0, $ac1 \n\t" | |
181 | "mthi $0 \n\t" | |
182 | "mtlo %[temp1] \n\t" | |
183 | "addi %[p_temp1], %[p_temp1], 4 \n\t" | |
184 | "lw %[w_asm], 0(%[w]) \n\t" | |
185 | "lw %[p_asm], 0(%[p_temp1]) \n\t" | |
186 | "lw %[w2_asm], 0(%[w2]) \n\t" | |
187 | "lw %[w_asm1], 64*4(%[w]) \n\t" | |
188 | "lw %[p_asm1], 64*4(%[p_temp1]) \n\t" | |
189 | "lw %[w2_asm1], 64*4(%[w2]) \n\t" | |
190 | "madd %[w_asm], %[p_asm] \n\t" | |
191 | "msub $ac1, %[w2_asm], %[p_asm] \n\t" | |
192 | "madd %[w_asm1], %[p_asm1] \n\t" | |
193 | "msub $ac1, %[w2_asm1], %[p_asm1] \n\t" | |
194 | "lw %[w_asm], 128*4(%[w]) \n\t" | |
195 | "lw %[p_asm], 128*4(%[p_temp1]) \n\t" | |
196 | "lw %[w2_asm], 128*4(%[w2]) \n\t" | |
197 | "lw %[w_asm1], 192*4(%[w]) \n\t" | |
198 | "lw %[p_asm1], 192*4(%[p_temp1]) \n\t" | |
199 | "lw %[w2_asm1], 192*4(%[w2]) \n\t" | |
200 | "madd %[w_asm], %[p_asm] \n\t" | |
201 | "msub $ac1, %[w2_asm], %[p_asm] \n\t" | |
202 | "madd %[w_asm1], %[p_asm1] \n\t" | |
203 | "msub $ac1, %[w2_asm1], %[p_asm1] \n\t" | |
204 | "lw %[w_asm], 256*4(%[w]) \n\t" | |
205 | "lw %[p_asm], 256*4(%[p_temp1]) \n\t" | |
206 | "lw %[w2_asm], 256*4(%[w2]) \n\t" | |
207 | "lw %[w_asm1], 320*4(%[w]) \n\t" | |
208 | "lw %[p_asm1], 320*4(%[p_temp1]) \n\t" | |
209 | "lw %[w2_asm1], 320*4(%[w2]) \n\t" | |
210 | "madd %[w_asm], %[p_asm] \n\t" | |
211 | "msub $ac1, %[w2_asm], %[p_asm] \n\t" | |
212 | "madd %[w_asm1], %[p_asm1] \n\t" | |
213 | "msub $ac1, %[w2_asm1], %[p_asm1] \n\t" | |
214 | "lw %[w_asm], 384*4(%[w]) \n\t" | |
215 | "lw %[p_asm], 384*4(%[p_temp1]) \n\t" | |
216 | "lw %[w2_asm], 384*4(%[w2]) \n\t" | |
217 | "lw %[w_asm1], 448*4(%[w]) \n\t" | |
218 | "lw %[p_asm1], 448*4(%[p_temp1]) \n\t" | |
219 | "lw %[w2_asm1], 448*4(%[w2]) \n\t" | |
220 | "madd %[w_asm], %[p_asm] \n\t" | |
221 | "msub $ac1, %[w2_asm], %[p_asm] \n\t" | |
222 | "madd %[w_asm1], %[p_asm1] \n\t" | |
223 | "msub $ac1, %[w2_asm1], %[p_asm1] \n\t" | |
224 | "addi %[p_temp2], %[p_temp2], -4 \n\t" | |
225 | "lw %[w_asm], 32*4(%[w]) \n\t" | |
226 | "lw %[p_asm], 0(%[p_temp2]) \n\t" | |
227 | "lw %[w2_asm], 32*4(%[w2]) \n\t" | |
228 | "lw %[w_asm1], 96*4(%[w]) \n\t" | |
229 | "lw %[p_asm1], 64*4(%[p_temp2]) \n\t" | |
230 | "lw %[w2_asm1], 96*4(%[w2]) \n\t" | |
231 | "msub %[w_asm], %[p_asm] \n\t" | |
232 | "msub $ac1, %[w2_asm], %[p_asm] \n\t" | |
233 | "msub %[w_asm1], %[p_asm1] \n\t" | |
234 | "msub $ac1, %[w2_asm1], %[p_asm1] \n\t" | |
235 | "lw %[w_asm], 160*4(%[w]) \n\t" | |
236 | "lw %[p_asm], 128*4(%[p_temp2]) \n\t" | |
237 | "lw %[w2_asm], 160*4(%[w2]) \n\t" | |
238 | "lw %[w_asm1], 224*4(%[w]) \n\t" | |
239 | "lw %[p_asm1], 192*4(%[p_temp2]) \n\t" | |
240 | "lw %[w2_asm1], 224*4(%[w2]) \n\t" | |
241 | "msub %[w_asm], %[p_asm] \n\t" | |
242 | "msub $ac1, %[w2_asm], %[p_asm] \n\t" | |
243 | "msub %[w_asm1], %[p_asm1] \n\t" | |
244 | "msub $ac1, %[w2_asm1], %[p_asm1] \n\t" | |
245 | "lw %[w_asm], 288*4(%[w]) \n\t" | |
246 | "lw %[p_asm], 256*4(%[p_temp2]) \n\t" | |
247 | "lw %[w2_asm], 288*4(%[w2]) \n\t" | |
248 | "lw %[w_asm1], 352*4(%[w]) \n\t" | |
249 | "lw %[p_asm1], 320*4(%[p_temp2]) \n\t" | |
250 | "lw %[w2_asm1], 352*4(%[w2]) \n\t" | |
251 | "msub %[w_asm], %[p_asm] \n\t" | |
252 | "msub $ac1, %[w2_asm], %[p_asm] \n\t" | |
253 | "msub %[w_asm1], %[p_asm1] \n\t" | |
254 | "msub $ac1, %[w2_asm1], %[p_asm1] \n\t" | |
255 | "lw %[w_asm], 416*4(%[w]) \n\t" | |
256 | "lw %[p_asm], 384*4(%[p_temp2]) \n\t" | |
257 | "lw %[w2_asm], 416*4(%[w2]) \n\t" | |
258 | "lw %[w_asm1], 480*4(%[w]) \n\t" | |
259 | "lw %[p_asm1], 448*4(%[p_temp2]) \n\t" | |
260 | "lw %[w2_asm1], 480*4(%[w2]) \n\t" | |
261 | "msub %[w_asm], %[p_asm] \n\t" | |
262 | "msub %[w_asm1], %[p_asm1] \n\t" | |
263 | "msub $ac1, %[w2_asm], %[p_asm] \n\t" | |
264 | "msub $ac1, %[w2_asm1], %[p_asm1] \n\t" | |
265 | "addi %[w], %[w], 4 \n\t" | |
266 | "addi %[w2], %[w2], -4 \n\t" | |
267 | "mflo %[temp2] \n\t" | |
268 | "extr.w %[sum1], $ac0, 24 \n\t" | |
269 | "li %[temp3], 1 \n\t" | |
270 | "and %[temp1], %[temp2], 0x00ffffff \n\t" | |
271 | "madd $ac1, %[temp1], %[temp3] \n\t" | |
272 | "slt %[temp2], %[sum1], %[min_asm] \n\t" | |
273 | "movn %[sum1], %[min_asm], %[temp2] \n\t" | |
274 | "slt %[temp2], %[max_asm], %[sum1] \n\t" | |
275 | "movn %[sum1], %[max_asm], %[temp2] \n\t" | |
276 | "sh %[sum1], 0(%[samples]) \n\t" | |
277 | "mflo %[temp3], $ac1 \n\t" | |
278 | "extr.w %[sum1], $ac1, 24 \n\t" | |
279 | "and %[temp1], %[temp3], 0x00ffffff \n\t" | |
280 | "slt %[temp2], %[sum1], %[min_asm] \n\t" | |
281 | "movn %[sum1], %[min_asm], %[temp2] \n\t" | |
282 | "slt %[temp2], %[max_asm], %[sum1] \n\t" | |
283 | "movn %[sum1], %[max_asm], %[temp2] \n\t" | |
284 | "sh %[sum1], 0(%[samples2]) \n\t" | |
285 | ||
286 | : [w_asm] "=&r" (w_asm), [p_asm] "=&r" (p_asm), [w_asm1] "=&r" (w_asm1), | |
287 | [p_asm1] "=&r" (p_asm1), [w2_asm1] "=&r" (w2_asm1), | |
288 | [w2_asm] "=&r" (w2_asm), [temp1] "+r" (temp1), [temp2] "+r" (temp2), | |
289 | [p_temp1] "+r" (p_temp1), [p_temp2] "+r" (p_temp2), [sum1] "+r" (sum1), | |
290 | [w] "+r" (w), [w2] "+r" (w2), [samples] "+r" (samples), | |
291 | [samples2] "+r" (samples2), [temp3] "+r" (temp3) | |
292 | : [min_asm] "r" (min_asm), [max_asm] "r" (max_asm) | |
293 | : "memory", "hi", "lo", "$ac1hi", "$ac1lo" | |
294 | ); | |
295 | ||
296 | samples += incr; | |
297 | samples2 -= incr; | |
298 | } | |
299 | ||
300 | p = synth_buf + 32; | |
301 | ||
302 | __asm__ volatile ( | |
303 | "mthi $0 \n\t" | |
304 | "mtlo %[temp1] \n\t" | |
305 | "lw %[w_asm], 32*4(%[w]) \n\t" | |
306 | "lw %[p_asm], 0(%[p]) \n\t" | |
307 | "lw %[w_asm1], 96*4(%[w]) \n\t" | |
308 | "lw %[p_asm1], 64*4(%[p]) \n\t" | |
309 | "lw %[w_asm2], 160*4(%[w]) \n\t" | |
310 | "lw %[p_asm2], 128*4(%[p]) \n\t" | |
311 | "msub %[w_asm], %[p_asm] \n\t" | |
312 | "msub %[w_asm1], %[p_asm1] \n\t" | |
313 | "msub %[w_asm2], %[p_asm2] \n\t" | |
314 | "lw %[w_asm], 224*4(%[w]) \n\t" | |
315 | "lw %[p_asm], 192*4(%[p]) \n\t" | |
316 | "lw %[w_asm1], 288*4(%[w]) \n\t" | |
317 | "lw %[p_asm1], 256*4(%[p]) \n\t" | |
318 | "lw %[w_asm2], 352*4(%[w]) \n\t" | |
319 | "lw %[p_asm2], 320*4(%[p]) \n\t" | |
320 | "msub %[w_asm], %[p_asm] \n\t" | |
321 | "msub %[w_asm1], %[p_asm1] \n\t" | |
322 | "msub %[w_asm2], %[p_asm2] \n\t" | |
323 | "lw %[w_asm], 416*4(%[w]) \n\t" | |
324 | "lw %[p_asm], 384*4(%[p]) \n\t" | |
325 | "lw %[w_asm1], 480*4(%[w]) \n\t" | |
326 | "lw %[p_asm1], 448*4(%[p]) \n\t" | |
327 | "msub %[w_asm], %[p_asm] \n\t" | |
328 | "msub %[w_asm1], %[p_asm1] \n\t" | |
329 | "extr.w %[sum1], $ac0, 24 \n\t" | |
330 | "mflo %[temp2] \n\t" | |
331 | "and %[temp1], %[temp2], 0x00ffffff \n\t" | |
332 | "slt %[temp2], %[sum1], %[min_asm] \n\t" | |
333 | "movn %[sum1], %[min_asm], %[temp2] \n\t" | |
334 | "slt %[temp2], %[max_asm], %[sum1] \n\t" | |
335 | "movn %[sum1], %[max_asm], %[temp2] \n\t" | |
336 | "sh %[sum1], 0(%[samples]) \n\t" | |
337 | ||
338 | : [w_asm] "=&r" (w_asm), [p_asm] "=&r" (p_asm), [w_asm1] "=&r" (w_asm1), | |
339 | [p_asm1] "=&r" (p_asm1), [temp1] "+r" (temp1), [temp2] "+r" (temp2), | |
340 | [w_asm2] "=&r" (w_asm2), [p_asm2] "=&r" (p_asm2), [sum1] "+r" (sum1) | |
341 | : [w] "r" (w), [p] "r" (p), [samples] "r" (samples), [min_asm] "r" (min_asm), | |
342 | [max_asm] "r" (max_asm) | |
343 | : "memory", "hi", "lo", "$ac1hi", "$ac1lo" | |
344 | ); | |
345 | ||
346 | *dither_state= temp1; | |
347 | } | |
348 | ||
349 | static void imdct36_mips_fixed(int *out, int *buf, int *in, int *win) | |
350 | { | |
351 | int j; | |
352 | int t0, t1, t2, t3, s0, s1, s2, s3; | |
353 | int tmp[18], *tmp1, *in1; | |
354 | /* temporary variables */ | |
355 | int temp_reg1, temp_reg2, temp_reg3, temp_reg4, temp_reg5, temp_reg6; | |
356 | int t4, t5, t6, t8, t7; | |
357 | ||
358 | /* values defined in macros and tables are | |
359 | * eliminated - they are directly loaded in appropriate variables | |
360 | */ | |
361 | int const C_1 = 4229717092; /* cos(pi*1/18)*2 */ | |
362 | int const C_2 = 4035949074; /* cos(pi*2/18)*2 */ | |
363 | int const C_3 = 575416510; /* -cos(pi*3/18)*2 */ | |
364 | int const C_3A = 3719550786; /* cos(pi*3/18)*2 */ | |
365 | int const C_4 = 1004831466; /* -cos(pi*4/18)*2 */ | |
366 | int const C_5 = 1534215534; /* -cos(pi*5/18)*2 */ | |
367 | int const C_7 = -1468965330; /* -cos(pi*7/18)*2 */ | |
368 | int const C_8 = -745813244; /* -cos(pi*8/18)*2 */ | |
369 | ||
370 | /* | |
371 | * instructions of the first two loops are reorganized and loops are unrolled, | |
372 | * in order to eliminate unnecessary readings and writings in array | |
373 | */ | |
374 | ||
375 | __asm__ volatile ( | |
376 | "lw %[t1], 17*4(%[in]) \n\t" | |
377 | "lw %[t2], 16*4(%[in]) \n\t" | |
378 | "lw %[t3], 15*4(%[in]) \n\t" | |
379 | "lw %[t4], 14*4(%[in]) \n\t" | |
380 | "addu %[t1], %[t1], %[t2] \n\t" | |
381 | "addu %[t2], %[t2], %[t3] \n\t" | |
382 | "addu %[t3], %[t3], %[t4] \n\t" | |
383 | "lw %[t5], 13*4(%[in]) \n\t" | |
384 | "addu %[t1], %[t1], %[t3] \n\t" | |
385 | "sw %[t2], 16*4(%[in]) \n\t" | |
386 | "lw %[t6], 12*4(%[in]) \n\t" | |
387 | "sw %[t1], 17*4(%[in]) \n\t" | |
388 | "addu %[t4], %[t4], %[t5] \n\t" | |
389 | "addu %[t5], %[t5], %[t6] \n\t" | |
390 | "lw %[t7], 11*4(%[in]) \n\t" | |
391 | "addu %[t3], %[t3], %[t5] \n\t" | |
392 | "sw %[t4], 14*4(%[in]) \n\t" | |
393 | "lw %[t8], 10*4(%[in]) \n\t" | |
394 | "sw %[t3], 15*4(%[in]) \n\t" | |
395 | "addu %[t6], %[t6], %[t7] \n\t" | |
396 | "addu %[t7], %[t7], %[t8] \n\t" | |
397 | "sw %[t6], 12*4(%[in]) \n\t" | |
398 | "addu %[t5], %[t5], %[t7] \n\t" | |
399 | "lw %[t1], 9*4(%[in]) \n\t" | |
400 | "lw %[t2], 8*4(%[in]) \n\t" | |
401 | "sw %[t5], 13*4(%[in]) \n\t" | |
402 | "addu %[t8], %[t8], %[t1] \n\t" | |
403 | "addu %[t1], %[t1], %[t2] \n\t" | |
404 | "sw %[t8], 10*4(%[in]) \n\t" | |
405 | "addu %[t7], %[t7], %[t1] \n\t" | |
406 | "lw %[t3], 7*4(%[in]) \n\t" | |
407 | "lw %[t4], 6*4(%[in]) \n\t" | |
408 | "sw %[t7], 11*4(%[in]) \n\t" | |
409 | "addu %[t2], %[t2], %[t3] \n\t" | |
410 | "addu %[t3], %[t3], %[t4] \n\t" | |
411 | "sw %[t2], 8*4(%[in]) \n\t" | |
412 | "addu %[t1], %[t1], %[t3] \n\t" | |
413 | "lw %[t5], 5*4(%[in]) \n\t" | |
414 | "lw %[t6], 4*4(%[in]) \n\t" | |
415 | "sw %[t1], 9*4(%[in]) \n\t" | |
416 | "addu %[t4], %[t4], %[t5] \n\t" | |
417 | "addu %[t5], %[t5], %[t6] \n\t" | |
418 | "sw %[t4], 6*4(%[in]) \n\t" | |
419 | "addu %[t3], %[t3], %[t5] \n\t" | |
420 | "lw %[t7], 3*4(%[in]) \n\t" | |
421 | "lw %[t8], 2*4(%[in]) \n\t" | |
422 | "sw %[t3], 7*4(%[in]) \n\t" | |
423 | "addu %[t6], %[t6], %[t7] \n\t" | |
424 | "addu %[t7], %[t7], %[t8] \n\t" | |
425 | "sw %[t6], 4*4(%[in]) \n\t" | |
426 | "addu %[t5], %[t5], %[t7] \n\t" | |
427 | "lw %[t1], 1*4(%[in]) \n\t" | |
428 | "lw %[t2], 0*4(%[in]) \n\t" | |
429 | "sw %[t5], 5*4(%[in]) \n\t" | |
430 | "addu %[t8], %[t8], %[t1] \n\t" | |
431 | "addu %[t1], %[t1], %[t2] \n\t" | |
432 | "sw %[t8], 2*4(%[in]) \n\t" | |
433 | "addu %[t7], %[t7], %[t1] \n\t" | |
434 | "sw %[t7], 3*4(%[in]) \n\t" | |
435 | "sw %[t1], 1*4(%[in]) \n\t" | |
436 | ||
437 | : [in] "+r" (in), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), | |
438 | [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), | |
439 | [t7] "=&r" (t7), [t8] "=&r" (t8) | |
440 | : | |
441 | : "memory" | |
442 | ); | |
443 | ||
444 | for(j = 0; j < 2; j++) { | |
445 | ||
446 | tmp1 = tmp + j; | |
447 | in1 = in + j; | |
448 | ||
449 | /** | |
450 | * Original constants are multiplied by two in advance | |
451 | * for assembly optimization (e.g. C_2 = 2 * C2). | |
452 | * That can lead to overflow in operations where they are used. | |
453 | * | |
454 | * Example of the solution: | |
455 | * | |
456 | * in original code: | |
457 | * t0 = ((int64_t)(in1[2*2] + in1[2*4]) * (int64_t)(2*C2))>>32 | |
458 | * | |
459 | * in assembly: | |
460 | * C_2 = 2 * C2; | |
461 | * . | |
462 | * . | |
463 | * "lw %[t7], 4*4(%[in1]) \n\t" | |
464 | * "lw %[t8], 8*4(%[in1]) \n\t" | |
465 | * "addu %[temp_reg2],%[t7], %[t8] \n\t" | |
466 | * "multu %[C_2], %[temp_reg2] \n\t" | |
467 | * "mfhi %[temp_reg1] \n\t" | |
468 | * "sra %[temp_reg2],%[temp_reg2],31 \n\t" | |
469 | * "move %[t0], $0 \n\t" | |
470 | * "movn %[t0], %[C_2], %[temp_reg2] \n\t" | |
471 | * "sub %[t0], %[temp_reg1],%[t0] \n\t" | |
472 | */ | |
473 | ||
474 | __asm__ volatile ( | |
475 | "lw %[t7], 4*4(%[in1]) \n\t" | |
476 | "lw %[t8], 8*4(%[in1]) \n\t" | |
477 | "lw %[t6], 16*4(%[in1]) \n\t" | |
478 | "lw %[t4], 0*4(%[in1]) \n\t" | |
479 | "addu %[temp_reg2], %[t7], %[t8] \n\t" | |
480 | "addu %[t2], %[t6], %[t8] \n\t" | |
481 | "multu %[C_2], %[temp_reg2] \n\t" | |
482 | "lw %[t5], 12*4(%[in1]) \n\t" | |
483 | "sub %[t2], %[t2], %[t7] \n\t" | |
484 | "sub %[t1], %[t4], %[t5] \n\t" | |
485 | "sra %[t3], %[t5], 1 \n\t" | |
486 | "sra %[temp_reg1], %[t2], 1 \n\t" | |
487 | "addu %[t3], %[t3], %[t4] \n\t" | |
488 | "sub %[temp_reg1], %[t1], %[temp_reg1] \n\t" | |
489 | "sra %[temp_reg2], %[temp_reg2], 31 \n\t" | |
490 | "sw %[temp_reg1], 6*4(%[tmp1]) \n\t" | |
491 | "move %[t0], $0 \n\t" | |
492 | "movn %[t0], %[C_2], %[temp_reg2] \n\t" | |
493 | "mfhi %[temp_reg1] \n\t" | |
494 | "addu %[t1], %[t1], %[t2] \n\t" | |
495 | "sw %[t1], 16*4(%[tmp1]) \n\t" | |
496 | "sub %[temp_reg4], %[t8], %[t6] \n\t" | |
497 | "add %[temp_reg2], %[t7], %[t6] \n\t" | |
498 | "mult $ac1, %[C_8], %[temp_reg4] \n\t" | |
499 | "multu $ac2, %[C_4], %[temp_reg2] \n\t" | |
500 | "sub %[t0], %[temp_reg1], %[t0] \n\t" | |
501 | "sra %[temp_reg1], %[temp_reg2], 31 \n\t" | |
502 | "move %[t2], $0 \n\t" | |
503 | "movn %[t2], %[C_4], %[temp_reg1] \n\t" | |
504 | "mfhi %[t1], $ac1 \n\t" | |
505 | "mfhi %[temp_reg1], $ac2 \n\t" | |
506 | "lw %[t6], 10*4(%[in1]) \n\t" | |
507 | "lw %[t8], 14*4(%[in1]) \n\t" | |
508 | "lw %[t7], 2*4(%[in1]) \n\t" | |
509 | "lw %[t4], 6*4(%[in1]) \n\t" | |
510 | "sub %[temp_reg3], %[t3], %[t0] \n\t" | |
511 | "add %[temp_reg4], %[t3], %[t0] \n\t" | |
512 | "sub %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t" | |
513 | "add %[temp_reg4], %[temp_reg4], %[t1] \n\t" | |
514 | "sub %[t2], %[temp_reg1], %[t2] \n\t" | |
515 | "sw %[temp_reg4], 2*4(%[tmp1]) \n\t" | |
516 | "sub %[temp_reg3], %[temp_reg3], %[t2] \n\t" | |
517 | "add %[temp_reg1], %[t3], %[t2] \n\t" | |
518 | "sw %[temp_reg3], 10*4(%[tmp1]) \n\t" | |
519 | "sub %[temp_reg1], %[temp_reg1], %[t1] \n\t" | |
520 | "addu %[temp_reg2], %[t6], %[t8] \n\t" | |
521 | "sw %[temp_reg1], 14*4(%[tmp1]) \n\t" | |
522 | "sub %[temp_reg2], %[temp_reg2], %[t7] \n\t" | |
523 | "addu %[temp_reg3], %[t7], %[t6] \n\t" | |
524 | "multu $ac3, %[C_3], %[temp_reg2] \n\t" | |
525 | "multu %[C_1], %[temp_reg3] \n\t" | |
526 | "sra %[temp_reg1], %[temp_reg2], 31 \n\t" | |
527 | "move %[t1], $0 \n\t" | |
528 | "sra %[temp_reg3], %[temp_reg3], 31 \n\t" | |
529 | "movn %[t1], %[C_3], %[temp_reg1] \n\t" | |
530 | "mfhi %[temp_reg1], $ac3 \n\t" | |
531 | "mfhi %[temp_reg4] \n\t" | |
532 | "move %[t2], $0 \n\t" | |
533 | "movn %[t2], %[C_1], %[temp_reg3] \n\t" | |
534 | "sub %[temp_reg3], %[t6], %[t8] \n\t" | |
535 | "sub %[t2], %[temp_reg4], %[t2] \n\t" | |
536 | "multu $ac1, %[C_7], %[temp_reg3] \n\t" | |
537 | "sub %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t" | |
538 | "sra %[temp_reg4], %[temp_reg3], 31 \n\t" | |
539 | "sub %[t1], %[temp_reg1], %[t1] \n\t" | |
540 | "move %[t3], $0 \n\t" | |
541 | "sw %[t1], 4*4(%[tmp1]) \n\t" | |
542 | "movn %[t3], %[C_7], %[temp_reg4] \n\t" | |
543 | "multu $ac2, %[C_3A], %[t4] \n\t" | |
544 | "add %[temp_reg2], %[t7], %[t8] \n\t" | |
545 | "move %[t1], $0 \n\t" | |
546 | "mfhi %[temp_reg4], $ac1 \n\t" | |
547 | "multu $ac3,%[C_5], %[temp_reg2] \n\t" | |
548 | "move %[t0], $0 \n\t" | |
549 | "sra %[temp_reg1], %[temp_reg2], 31 \n\t" | |
550 | "movn %[t1],%[C_5], %[temp_reg1] \n\t" | |
551 | "sub %[temp_reg4], %[temp_reg4], %[temp_reg3] \n\t" | |
552 | "mfhi %[temp_reg1], $ac3 \n\t" | |
553 | "sra %[temp_reg3], %[t4], 31 \n\t" | |
554 | "movn %[t0], %[C_3A], %[temp_reg3] \n\t" | |
555 | "mfhi %[temp_reg3], $ac2 \n\t" | |
556 | "sub %[t3], %[temp_reg4], %[t3] \n\t" | |
557 | "add %[temp_reg4], %[t3], %[t2] \n\t" | |
558 | "sub %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t" | |
559 | "sub %[t1], %[temp_reg1], %[t1] \n\t" | |
560 | "sub %[t0], %[temp_reg3], %[t0] \n\t" | |
561 | "add %[temp_reg1], %[t2], %[t1] \n\t" | |
562 | "add %[temp_reg4], %[temp_reg4], %[t0] \n\t" | |
563 | "sub %[temp_reg2], %[t3], %[t1] \n\t" | |
564 | "sw %[temp_reg4], 0*4(%[tmp1]) \n\t" | |
565 | "sub %[temp_reg1], %[temp_reg1], %[t0] \n\t" | |
566 | "sub %[temp_reg2], %[temp_reg2], %[t0] \n\t" | |
567 | "sw %[temp_reg1], 12*4(%[tmp1]) \n\t" | |
568 | "sw %[temp_reg2], 8*4(%[tmp1]) \n\t" | |
569 | ||
570 | : [t7] "=&r" (t7), [temp_reg1] "=&r" (temp_reg1), | |
571 | [temp_reg2] "=&r" (temp_reg2), [temp_reg4] "=&r" (temp_reg4), | |
572 | [temp_reg3] "=&r" (temp_reg3), [t8] "=&r" (t8), [t0] "=&r" (t0), | |
573 | [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r"(t6), [t2] "=&r" (t2), | |
574 | [t3] "=&r" (t3), [t1] "=&r" (t1) | |
575 | : [C_2] "r" (C_2), [in1] "r" (in1), [tmp1] "r" (tmp1), [C_8] "r" (C_8), | |
576 | [C_4] "r" (C_4), [C_3] "r" (C_3), [C_1] "r" (C_1), [C_7] "r" (C_7), | |
577 | [C_3A] "r" (C_3A), [C_5] "r" (C_5) | |
578 | : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo", | |
579 | "$ac3hi", "$ac3lo" | |
580 | ); | |
581 | } | |
582 | ||
583 | /** | |
584 | * loop is unrolled four times | |
585 | * | |
586 | * values defined in tables(icos36[] and icos36h[]) are not loaded from | |
587 | * these tables - they are directly loaded in appropriate registers | |
588 | * | |
589 | */ | |
590 | ||
591 | __asm__ volatile ( | |
592 | "lw %[t2], 1*4(%[tmp]) \n\t" | |
593 | "lw %[t3], 3*4(%[tmp]) \n\t" | |
594 | "lw %[t0], 0*4(%[tmp]) \n\t" | |
595 | "lw %[t1], 2*4(%[tmp]) \n\t" | |
596 | "addu %[temp_reg1], %[t3], %[t2] \n\t" | |
597 | "li %[temp_reg2], 0x807D2B1E \n\t" | |
598 | "move %[s1], $0 \n\t" | |
599 | "multu %[temp_reg2], %[temp_reg1] \n\t" | |
600 | "sra %[temp_reg1], %[temp_reg1], 31 \n\t" | |
601 | "movn %[s1], %[temp_reg2], %[temp_reg1] \n\t" | |
602 | "sub %[temp_reg3], %[t3], %[t2] \n\t" | |
603 | "li %[temp_reg4], 0x2de5151 \n\t" | |
604 | "mfhi %[temp_reg2] \n\t" | |
605 | "addu %[s0], %[t1], %[t0] \n\t" | |
606 | "lw %[temp_reg5], 9*4(%[win]) \n\t" | |
607 | "mult $ac1, %[temp_reg4], %[temp_reg3] \n\t" | |
608 | "lw %[temp_reg6], 4*9*4(%[buf]) \n\t" | |
609 | "sub %[s2], %[t1], %[t0] \n\t" | |
610 | "lw %[temp_reg3], 29*4(%[win]) \n\t" | |
611 | "subu %[s1], %[temp_reg2], %[s1] \n\t" | |
612 | "lw %[temp_reg4], 28*4(%[win]) \n\t" | |
613 | "add %[t0], %[s0], %[s1] \n\t" | |
614 | "extr.w %[s3], $ac1,23 \n\t" | |
615 | "mult $ac2, %[t0], %[temp_reg3] \n\t" | |
616 | "sub %[t1], %[s0], %[s1] \n\t" | |
617 | "lw %[temp_reg1], 4*8*4(%[buf]) \n\t" | |
618 | "mult %[t1], %[temp_reg5] \n\t" | |
619 | "lw %[temp_reg2], 8*4(%[win]) \n\t" | |
620 | "mfhi %[temp_reg3], $ac2 \n\t" | |
621 | "mult $ac3, %[t0], %[temp_reg4] \n\t" | |
622 | "add %[t0], %[s2], %[s3] \n\t" | |
623 | "mfhi %[temp_reg5] \n\t" | |
624 | "mult $ac1, %[t1], %[temp_reg2] \n\t" | |
625 | "sub %[t1], %[s2], %[s3] \n\t" | |
626 | "sw %[temp_reg3], 4*9*4(%[buf]) \n\t" | |
627 | "mfhi %[temp_reg4], $ac3 \n\t" | |
628 | "lw %[temp_reg3], 37*4(%[win]) \n\t" | |
629 | "mfhi %[temp_reg2], $ac1 \n\t" | |
630 | "add %[temp_reg5], %[temp_reg5], %[temp_reg6] \n\t" | |
631 | "lw %[temp_reg6], 17*4(%[win]) \n\t" | |
632 | "sw %[temp_reg5], 32*9*4(%[out]) \n\t" | |
633 | "sw %[temp_reg4], 4*8*4(%[buf]) \n\t" | |
634 | "mult %[t1], %[temp_reg6] \n\t" | |
635 | "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t" | |
636 | "lw %[temp_reg2], 0*4(%[win]) \n\t" | |
637 | "lw %[temp_reg5], 4*17*4(%[buf]) \n\t" | |
638 | "sw %[temp_reg1], 8*32*4(%[out]) \n\t" | |
639 | "mfhi %[temp_reg6] \n\t" | |
640 | "mult $ac1, %[t1], %[temp_reg2] \n\t" | |
641 | "lw %[temp_reg4], 20*4(%[win]) \n\t" | |
642 | "lw %[temp_reg1], 0(%[buf]) \n\t" | |
643 | "mult $ac2, %[t0], %[temp_reg3] \n\t" | |
644 | "mult %[t0], %[temp_reg4] \n\t" | |
645 | "mfhi %[temp_reg2], $ac1 \n\t" | |
646 | "lw %[t0], 4*4(%[tmp]) \n\t" | |
647 | "add %[temp_reg5], %[temp_reg5], %[temp_reg6] \n\t" | |
648 | "mfhi %[temp_reg3], $ac2 \n\t" | |
649 | "mfhi %[temp_reg4] \n\t" | |
650 | "sw %[temp_reg5], 17*32*4(%[out]) \n\t" | |
651 | "lw %[t1], 6*4(%[tmp]) \n\t" | |
652 | "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t" | |
653 | "lw %[t2], 5*4(%[tmp]) \n\t" | |
654 | "sw %[temp_reg1], 0*32*4(%[out]) \n\t" | |
655 | "addu %[s0], %[t1], %[t0] \n\t" | |
656 | "sw %[temp_reg3], 4*17*4(%[buf]) \n\t" | |
657 | "lw %[t3], 7*4(%[tmp]) \n\t" | |
658 | "sub %[s2], %[t1], %[t0] \n\t" | |
659 | "sw %[temp_reg4], 0(%[buf]) \n\t" | |
660 | "addu %[temp_reg5], %[t3], %[t2] \n\t" | |
661 | "li %[temp_reg6], 0x8483EE0C \n\t" | |
662 | "move %[s1], $0 \n\t" | |
663 | "multu %[temp_reg6], %[temp_reg5] \n\t" | |
664 | "sub %[temp_reg1], %[t3], %[t2] \n\t" | |
665 | "li %[temp_reg2], 0xf746ea \n\t" | |
666 | "sra %[temp_reg5], %[temp_reg5], 31 \n\t" | |
667 | "mult $ac1, %[temp_reg2], %[temp_reg1] \n\t" | |
668 | "movn %[s1], %[temp_reg6], %[temp_reg5] \n\t" | |
669 | "mfhi %[temp_reg5] \n\t" | |
670 | "lw %[temp_reg3], 10*4(%[win]) \n\t" | |
671 | "lw %[temp_reg4], 4*10*4(%[buf]) \n\t" | |
672 | "extr.w %[s3], $ac1, 23 \n\t" | |
673 | "lw %[temp_reg1], 4*7*4(%[buf]) \n\t" | |
674 | "lw %[temp_reg2], 7*4(%[win]) \n\t" | |
675 | "lw %[temp_reg6], 30*4(%[win]) \n\t" | |
676 | "subu %[s1], %[temp_reg5], %[s1] \n\t" | |
677 | "sub %[t1], %[s0], %[s1] \n\t" | |
678 | "add %[t0], %[s0], %[s1] \n\t" | |
679 | "mult $ac2, %[t1], %[temp_reg3] \n\t" | |
680 | "mult $ac3, %[t1], %[temp_reg2] \n\t" | |
681 | "mult %[t0], %[temp_reg6] \n\t" | |
682 | "lw %[temp_reg5], 27*4(%[win]) \n\t" | |
683 | "mult $ac1, %[t0], %[temp_reg5] \n\t" | |
684 | "mfhi %[temp_reg3], $ac2 \n\t" | |
685 | "mfhi %[temp_reg2], $ac3 \n\t" | |
686 | "mfhi %[temp_reg6] \n\t" | |
687 | "add %[t0], %[s2], %[s3] \n\t" | |
688 | "sub %[t1], %[s2], %[s3] \n\t" | |
689 | "add %[temp_reg3], %[temp_reg3], %[temp_reg4] \n\t" | |
690 | "lw %[temp_reg4], 16*4(%[win]) \n\t" | |
691 | "mfhi %[temp_reg5], $ac1 \n\t" | |
692 | "sw %[temp_reg3], 32*10*4(%[out]) \n\t" | |
693 | "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t" | |
694 | "lw %[temp_reg3], 4*16*4(%[buf]) \n\t" | |
695 | "sw %[temp_reg6], 4*10*4(%[buf]) \n\t" | |
696 | "sw %[temp_reg1], 7*32*4(%[out]) \n\t" | |
697 | "mult $ac2, %[t1], %[temp_reg4] \n\t" | |
698 | "sw %[temp_reg5], 4*7*4(%[buf]) \n\t" | |
699 | "lw %[temp_reg6], 1*4(%[win]) \n\t" | |
700 | "lw %[temp_reg5], 4*1*4(%[buf]) \n\t" | |
701 | "lw %[temp_reg1], 36*4(%[win]) \n\t" | |
702 | "mult $ac3, %[t1], %[temp_reg6] \n\t" | |
703 | "lw %[temp_reg2], 21*4(%[win]) \n\t" | |
704 | "mfhi %[temp_reg4], $ac2 \n\t" | |
705 | "mult %[t0], %[temp_reg1] \n\t" | |
706 | "mult $ac1, %[t0],%[temp_reg2] \n\t" | |
707 | "lw %[t0], 8*4(%[tmp]) \n\t" | |
708 | "mfhi %[temp_reg6], $ac3 \n\t" | |
709 | "lw %[t1], 10*4(%[tmp]) \n\t" | |
710 | "lw %[t3], 11*4(%[tmp]) \n\t" | |
711 | "mfhi %[temp_reg1] \n\t" | |
712 | "add %[temp_reg3], %[temp_reg3], %[temp_reg4] \n\t" | |
713 | "lw %[t2], 9*4(%[tmp]) \n\t" | |
714 | "mfhi %[temp_reg2], $ac1 \n\t" | |
715 | "add %[temp_reg5], %[temp_reg5], %[temp_reg6] \n\t" | |
716 | "sw %[temp_reg3], 16*32*4(%[out]) \n\t" | |
717 | "sw %[temp_reg5], 1*32*4(%[out]) \n\t" | |
718 | "sw %[temp_reg1], 4*16*4(%[buf]) \n\t" | |
719 | "addu %[temp_reg3], %[t3], %[t2] \n\t" | |
720 | "li %[temp_reg4], 0x8D3B7CD6 \n\t" | |
721 | "sw %[temp_reg2], 4*1*4(%[buf]) \n\t" | |
722 | "multu %[temp_reg4],%[temp_reg3] \n\t" | |
723 | "sra %[temp_reg3], %[temp_reg3], 31 \n\t" | |
724 | "move %[s1], $0 \n\t" | |
725 | "movn %[s1], %[temp_reg4], %[temp_reg3] \n\t" | |
726 | "addu %[s0], %[t1], %[t0] \n\t" | |
727 | "mfhi %[temp_reg3] \n\t" | |
728 | "sub %[s2], %[t1], %[t0] \n\t" | |
729 | "sub %[temp_reg5], %[t3], %[t2] \n\t" | |
730 | "li %[temp_reg6], 0x976fd9 \n\t" | |
731 | "lw %[temp_reg2], 11*4(%[win]) \n\t" | |
732 | "lw %[temp_reg1], 4*11*4(%[buf]) \n\t" | |
733 | "mult $ac1, %[temp_reg6], %[temp_reg5] \n\t" | |
734 | "subu %[s1], %[temp_reg3], %[s1] \n\t" | |
735 | "lw %[temp_reg5], 31*4(%[win]) \n\t" | |
736 | "sub %[t1], %[s0], %[s1] \n\t" | |
737 | "add %[t0], %[s0], %[s1] \n\t" | |
738 | "mult $ac2, %[t1], %[temp_reg2] \n\t" | |
739 | "mult %[t0], %[temp_reg5] \n\t" | |
740 | "lw %[temp_reg4], 6*4(%[win]) \n\t" | |
741 | "extr.w %[s3], $ac1, 23 \n\t" | |
742 | "lw %[temp_reg3], 4*6*4(%[buf]) \n\t" | |
743 | "mfhi %[temp_reg2], $ac2 \n\t" | |
744 | "lw %[temp_reg6], 26*4(%[win]) \n\t" | |
745 | "mfhi %[temp_reg5] \n\t" | |
746 | "mult $ac3, %[t1], %[temp_reg4] \n\t" | |
747 | "mult $ac1, %[t0], %[temp_reg6] \n\t" | |
748 | "add %[t0], %[s2], %[s3] \n\t" | |
749 | "sub %[t1], %[s2], %[s3] \n\t" | |
750 | "add %[temp_reg2], %[temp_reg2], %[temp_reg1] \n\t" | |
751 | "mfhi %[temp_reg4], $ac3 \n\t" | |
752 | "mfhi %[temp_reg6], $ac1 \n\t" | |
753 | "sw %[temp_reg5], 4*11*4(%[buf]) \n\t" | |
754 | "sw %[temp_reg2], 32*11*4(%[out]) \n\t" | |
755 | "lw %[temp_reg1], 4*15*4(%[buf]) \n\t" | |
756 | "add %[temp_reg3], %[temp_reg3], %[temp_reg4] \n\t" | |
757 | "lw %[temp_reg2], 15*4(%[win]) \n\t" | |
758 | "sw %[temp_reg3], 6*32*4(%[out]) \n\t" | |
759 | "sw %[temp_reg6], 4*6*4(%[buf]) \n\t" | |
760 | "mult %[t1], %[temp_reg2] \n\t" | |
761 | "lw %[temp_reg3], 2*4(%[win]) \n\t" | |
762 | "lw %[temp_reg4], 4*2*4(%[buf]) \n\t" | |
763 | "lw %[temp_reg5], 35*4(%[win]) \n\t" | |
764 | "mult $ac1, %[t1], %[temp_reg3] \n\t" | |
765 | "mfhi %[temp_reg2] \n\t" | |
766 | "lw %[temp_reg6], 22*4(%[win]) \n\t" | |
767 | "mult $ac2, %[t0], %[temp_reg5] \n\t" | |
768 | "lw %[t1], 14*4(%[tmp]) \n\t" | |
769 | "mult $ac3, %[t0], %[temp_reg6] \n\t" | |
770 | "lw %[t0], 12*4(%[tmp]) \n\t" | |
771 | "mfhi %[temp_reg3], $ac1 \n\t" | |
772 | "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t" | |
773 | "mfhi %[temp_reg5], $ac2 \n\t" | |
774 | "sw %[temp_reg1], 15*32*4(%[out]) \n\t" | |
775 | "mfhi %[temp_reg6], $ac3 \n\t" | |
776 | "lw %[t2], 13*4(%[tmp]) \n\t" | |
777 | "lw %[t3], 15*4(%[tmp]) \n\t" | |
778 | "add %[temp_reg4], %[temp_reg4], %[temp_reg3] \n\t" | |
779 | "sw %[temp_reg5], 4*15*4(%[buf]) \n\t" | |
780 | "addu %[temp_reg1], %[t3], %[t2] \n\t" | |
781 | "li %[temp_reg2], 0x9C42577C \n\t" | |
782 | "move %[s1], $0 \n\t" | |
783 | "multu %[temp_reg2], %[temp_reg1] \n\t" | |
784 | "sw %[temp_reg4], 2*32*4(%[out]) \n\t" | |
785 | "sra %[temp_reg1], %[temp_reg1], 31 \n\t" | |
786 | "movn %[s1], %[temp_reg2], %[temp_reg1] \n\t" | |
787 | "sub %[temp_reg3], %[t3], %[t2] \n\t" | |
788 | "li %[temp_reg4], 0x6f94a2 \n\t" | |
789 | "mfhi %[temp_reg1] \n\t" | |
790 | "addu %[s0], %[t1], %[t0] \n\t" | |
791 | "sw %[temp_reg6], 4*2*4(%[buf]) \n\t" | |
792 | "mult $ac1, %[temp_reg4], %[temp_reg3] \n\t" | |
793 | "sub %[s2], %[t1], %[t0] \n\t" | |
794 | "lw %[temp_reg5], 12*4(%[win]) \n\t" | |
795 | "lw %[temp_reg6], 4*12*4(%[buf]) \n\t" | |
796 | "subu %[s1], %[temp_reg1], %[s1] \n\t" | |
797 | "sub %[t1], %[s0], %[s1] \n\t" | |
798 | "lw %[temp_reg3], 32*4(%[win]) \n\t" | |
799 | "mult $ac2, %[t1], %[temp_reg5] \n\t" | |
800 | "add %[t0], %[s0], %[s1] \n\t" | |
801 | "extr.w %[s3], $ac1, 23 \n\t" | |
802 | "lw %[temp_reg2], 5*4(%[win]) \n\t" | |
803 | "mult %[t0], %[temp_reg3] \n\t" | |
804 | "mfhi %[temp_reg5], $ac2 \n\t" | |
805 | "lw %[temp_reg4], 25*4(%[win]) \n\t" | |
806 | "lw %[temp_reg1], 4*5*4(%[buf]) \n\t" | |
807 | "mult $ac3, %[t1], %[temp_reg2] \n\t" | |
808 | "mult $ac1, %[t0], %[temp_reg4] \n\t" | |
809 | "mfhi %[temp_reg3] \n\t" | |
810 | "add %[t0], %[s2], %[s3] \n\t" | |
811 | "add %[temp_reg5], %[temp_reg5], %[temp_reg6] \n\t" | |
812 | "mfhi %[temp_reg2], $ac3 \n\t" | |
813 | "mfhi %[temp_reg4], $ac1 \n\t" | |
814 | "sub %[t1], %[s2], %[s3] \n\t" | |
815 | "sw %[temp_reg5], 32*12*4(%[out]) \n\t" | |
816 | "sw %[temp_reg3], 4*12*4(%[buf]) \n\t" | |
817 | "lw %[temp_reg6], 14*4(%[win]) \n\t" | |
818 | "lw %[temp_reg5], 4*14*4(%[buf]) \n\t" | |
819 | "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t" | |
820 | "sw %[temp_reg4], 4*5*4(%[buf]) \n\t" | |
821 | "sw %[temp_reg1], 5*32*4(%[out]) \n\t" | |
822 | "mult %[t1], %[temp_reg6] \n\t" | |
823 | "lw %[temp_reg4], 34*4(%[win]) \n\t" | |
824 | "lw %[temp_reg2], 3*4(%[win]) \n\t" | |
825 | "lw %[temp_reg1], 4*3*4(%[buf]) \n\t" | |
826 | "mult $ac2, %[t0], %[temp_reg4] \n\t" | |
827 | "mfhi %[temp_reg6] \n\t" | |
828 | "mult $ac1, %[t1], %[temp_reg2] \n\t" | |
829 | "lw %[temp_reg3], 23*4(%[win]) \n\t" | |
830 | "lw %[s0], 16*4(%[tmp]) \n\t" | |
831 | "mfhi %[temp_reg4], $ac2 \n\t" | |
832 | "lw %[t1], 17*4(%[tmp]) \n\t" | |
833 | "mult $ac3, %[t0], %[temp_reg3] \n\t" | |
834 | "move %[s1], $0 \n\t" | |
835 | "add %[temp_reg5], %[temp_reg5], %[temp_reg6] \n\t" | |
836 | "mfhi %[temp_reg2], $ac1 \n\t" | |
837 | "sw %[temp_reg5], 14*32*4(%[out]) \n\t" | |
838 | "sw %[temp_reg4], 4*14*4(%[buf]) \n\t" | |
839 | "mfhi %[temp_reg3], $ac3 \n\t" | |
840 | "li %[temp_reg5], 0xB504F334 \n\t" | |
841 | "add %[temp_reg1], %[temp_reg1], %[temp_reg2] \n\t" | |
842 | "multu %[temp_reg5], %[t1] \n\t" | |
843 | "lw %[temp_reg2], 4*13*4(%[buf]) \n\t" | |
844 | "sw %[temp_reg1], 3*32*4(%[out]) \n\t" | |
845 | "sra %[t1], %[t1], 31 \n\t" | |
846 | "mfhi %[temp_reg6] \n\t" | |
847 | "movn %[s1], %[temp_reg5], %[t1] \n\t" | |
848 | "sw %[temp_reg3], 4*3*4(%[buf]) \n\t" | |
849 | "lw %[temp_reg1], 13*4(%[win]) \n\t" | |
850 | "lw %[temp_reg4], 4*4*4(%[buf]) \n\t" | |
851 | "lw %[temp_reg3], 4*4(%[win]) \n\t" | |
852 | "lw %[temp_reg5], 33*4(%[win]) \n\t" | |
853 | "subu %[s1], %[temp_reg6], %[s1] \n\t" | |
854 | "lw %[temp_reg6], 24*4(%[win]) \n\t" | |
855 | "sub %[t1], %[s0], %[s1] \n\t" | |
856 | "add %[t0], %[s0], %[s1] \n\t" | |
857 | "mult $ac1, %[t1], %[temp_reg1] \n\t" | |
858 | "mult $ac2, %[t1], %[temp_reg3] \n\t" | |
859 | "mult $ac3, %[t0], %[temp_reg5] \n\t" | |
860 | "mult %[t0], %[temp_reg6] \n\t" | |
861 | "mfhi %[temp_reg1], $ac1 \n\t" | |
862 | "mfhi %[temp_reg3], $ac2 \n\t" | |
863 | "mfhi %[temp_reg5], $ac3 \n\t" | |
864 | "mfhi %[temp_reg6] \n\t" | |
865 | "add %[temp_reg2], %[temp_reg2], %[temp_reg1] \n\t" | |
866 | "add %[temp_reg4], %[temp_reg4], %[temp_reg3] \n\t" | |
867 | "sw %[temp_reg2], 13*32*4(%[out]) \n\t" | |
868 | "sw %[temp_reg4], 4*32*4(%[out]) \n\t" | |
869 | "sw %[temp_reg5], 4*13*4(%[buf]) \n\t" | |
870 | "sw %[temp_reg6], 4*4*4(%[buf]) \n\t" | |
871 | ||
872 | : [t0] "=&r" (t0), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), | |
873 | [s0] "=&r" (s0), [s2] "=&r" (s2), [temp_reg1] "=&r" (temp_reg1), | |
874 | [temp_reg2] "=&r" (temp_reg2), [s1] "=&r" (s1), [s3] "=&r" (s3), | |
875 | [temp_reg3] "=&r" (temp_reg3), [temp_reg4] "=&r" (temp_reg4), | |
876 | [temp_reg5] "=&r" (temp_reg5), [temp_reg6] "=&r" (temp_reg6), | |
877 | [out] "+r" (out) | |
878 | : [tmp] "r" (tmp), [win] "r" (win), [buf] "r" (buf) | |
879 | : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo", | |
880 | "$ac3hi", "$ac3lo" | |
881 | ); | |
882 | } | |
883 | ||
884 | static void ff_imdct36_blocks_mips_fixed(int *out, int *buf, int *in, | |
885 | int count, int switch_point, int block_type) | |
886 | { | |
887 | int j; | |
888 | for (j=0 ; j < count; j++) { | |
889 | /* apply window & overlap with previous buffer */ | |
890 | ||
891 | /* select window */ | |
892 | int win_idx = (switch_point && j < 2) ? 0 : block_type; | |
893 | int *win = ff_mdct_win_fixed[win_idx + (4 & -(j & 1))]; | |
894 | ||
895 | imdct36_mips_fixed(out, buf, in, win); | |
896 | ||
897 | in += 18; | |
898 | buf += ((j&3) != 3 ? 1 : (72-3)); | |
899 | out++; | |
900 | } | |
901 | } | |
902 | ||
903 | void ff_mpadsp_init_mipsdspr1(MPADSPContext *s) | |
904 | { | |
905 | s->apply_window_fixed = ff_mpadsp_apply_window_mips_fixed; | |
906 | s->imdct36_blocks_fixed = ff_imdct36_blocks_mips_fixed; | |
907 | } |