Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | /* |
2 | * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru> | |
3 | * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at> | |
4 | * | |
5 | * This file is part of FFmpeg. | |
6 | * | |
7 | * FFmpeg is free software; you can redistribute it and/or | |
8 | * modify it under the terms of the GNU Lesser General Public | |
9 | * License as published by the Free Software Foundation; either | |
10 | * version 2.1 of the License, or (at your option) any later version. | |
11 | * | |
12 | * FFmpeg is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | * Lesser General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU Lesser General Public | |
18 | * License along with FFmpeg; if not, write to the Free Software | |
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 | */ | |
21 | ||
22 | #include "libavutil/attributes.h" | |
23 | #include "libavutil/cpu.h" | |
24 | #include "libavutil/x86/asm.h" | |
25 | #include "libavutil/x86/cpu.h" | |
26 | #include "libavcodec/avcodec.h" | |
27 | #include "libavcodec/mpegvideo.h" | |
28 | ||
29 | #if HAVE_MMX_INLINE | |
30 | ||
31 | static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, | |
32 | int16_t *block, int n, int qscale) | |
33 | { | |
34 | x86_reg level, qmul, qadd, nCoeffs; | |
35 | ||
36 | qmul = qscale << 1; | |
37 | ||
38 | av_assert2(s->block_last_index[n]>=0 || s->h263_aic); | |
39 | ||
40 | if (!s->h263_aic) { | |
41 | if (n < 4) | |
42 | level = block[0] * s->y_dc_scale; | |
43 | else | |
44 | level = block[0] * s->c_dc_scale; | |
45 | qadd = (qscale - 1) | 1; | |
46 | }else{ | |
47 | qadd = 0; | |
48 | level= block[0]; | |
49 | } | |
50 | if(s->ac_pred) | |
51 | nCoeffs=63; | |
52 | else | |
53 | nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; | |
54 | ||
55 | __asm__ volatile( | |
56 | "movd %1, %%mm6 \n\t" //qmul | |
57 | "packssdw %%mm6, %%mm6 \n\t" | |
58 | "packssdw %%mm6, %%mm6 \n\t" | |
59 | "movd %2, %%mm5 \n\t" //qadd | |
60 | "pxor %%mm7, %%mm7 \n\t" | |
61 | "packssdw %%mm5, %%mm5 \n\t" | |
62 | "packssdw %%mm5, %%mm5 \n\t" | |
63 | "psubw %%mm5, %%mm7 \n\t" | |
64 | "pxor %%mm4, %%mm4 \n\t" | |
65 | ".p2align 4 \n\t" | |
66 | "1: \n\t" | |
67 | "movq (%0, %3), %%mm0 \n\t" | |
68 | "movq 8(%0, %3), %%mm1 \n\t" | |
69 | ||
70 | "pmullw %%mm6, %%mm0 \n\t" | |
71 | "pmullw %%mm6, %%mm1 \n\t" | |
72 | ||
73 | "movq (%0, %3), %%mm2 \n\t" | |
74 | "movq 8(%0, %3), %%mm3 \n\t" | |
75 | ||
76 | "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
77 | "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
78 | ||
79 | "pxor %%mm2, %%mm0 \n\t" | |
80 | "pxor %%mm3, %%mm1 \n\t" | |
81 | ||
82 | "paddw %%mm7, %%mm0 \n\t" | |
83 | "paddw %%mm7, %%mm1 \n\t" | |
84 | ||
85 | "pxor %%mm0, %%mm2 \n\t" | |
86 | "pxor %%mm1, %%mm3 \n\t" | |
87 | ||
88 | "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 | |
89 | "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 | |
90 | ||
91 | "pandn %%mm2, %%mm0 \n\t" | |
92 | "pandn %%mm3, %%mm1 \n\t" | |
93 | ||
94 | "movq %%mm0, (%0, %3) \n\t" | |
95 | "movq %%mm1, 8(%0, %3) \n\t" | |
96 | ||
97 | "add $16, %3 \n\t" | |
98 | "jng 1b \n\t" | |
99 | ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) | |
100 | : "memory" | |
101 | ); | |
102 | block[0]= level; | |
103 | } | |
104 | ||
105 | ||
106 | static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, | |
107 | int16_t *block, int n, int qscale) | |
108 | { | |
109 | x86_reg qmul, qadd, nCoeffs; | |
110 | ||
111 | qmul = qscale << 1; | |
112 | qadd = (qscale - 1) | 1; | |
113 | ||
114 | av_assert2(s->block_last_index[n]>=0 || s->h263_aic); | |
115 | ||
116 | nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; | |
117 | ||
118 | __asm__ volatile( | |
119 | "movd %1, %%mm6 \n\t" //qmul | |
120 | "packssdw %%mm6, %%mm6 \n\t" | |
121 | "packssdw %%mm6, %%mm6 \n\t" | |
122 | "movd %2, %%mm5 \n\t" //qadd | |
123 | "pxor %%mm7, %%mm7 \n\t" | |
124 | "packssdw %%mm5, %%mm5 \n\t" | |
125 | "packssdw %%mm5, %%mm5 \n\t" | |
126 | "psubw %%mm5, %%mm7 \n\t" | |
127 | "pxor %%mm4, %%mm4 \n\t" | |
128 | ".p2align 4 \n\t" | |
129 | "1: \n\t" | |
130 | "movq (%0, %3), %%mm0 \n\t" | |
131 | "movq 8(%0, %3), %%mm1 \n\t" | |
132 | ||
133 | "pmullw %%mm6, %%mm0 \n\t" | |
134 | "pmullw %%mm6, %%mm1 \n\t" | |
135 | ||
136 | "movq (%0, %3), %%mm2 \n\t" | |
137 | "movq 8(%0, %3), %%mm3 \n\t" | |
138 | ||
139 | "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
140 | "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
141 | ||
142 | "pxor %%mm2, %%mm0 \n\t" | |
143 | "pxor %%mm3, %%mm1 \n\t" | |
144 | ||
145 | "paddw %%mm7, %%mm0 \n\t" | |
146 | "paddw %%mm7, %%mm1 \n\t" | |
147 | ||
148 | "pxor %%mm0, %%mm2 \n\t" | |
149 | "pxor %%mm1, %%mm3 \n\t" | |
150 | ||
151 | "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 | |
152 | "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 | |
153 | ||
154 | "pandn %%mm2, %%mm0 \n\t" | |
155 | "pandn %%mm3, %%mm1 \n\t" | |
156 | ||
157 | "movq %%mm0, (%0, %3) \n\t" | |
158 | "movq %%mm1, 8(%0, %3) \n\t" | |
159 | ||
160 | "add $16, %3 \n\t" | |
161 | "jng 1b \n\t" | |
162 | ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) | |
163 | : "memory" | |
164 | ); | |
165 | } | |
166 | ||
167 | static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, | |
168 | int16_t *block, int n, int qscale) | |
169 | { | |
170 | x86_reg nCoeffs; | |
171 | const uint16_t *quant_matrix; | |
172 | int block0; | |
173 | ||
174 | av_assert2(s->block_last_index[n]>=0); | |
175 | ||
176 | nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; | |
177 | ||
178 | if (n < 4) | |
179 | block0 = block[0] * s->y_dc_scale; | |
180 | else | |
181 | block0 = block[0] * s->c_dc_scale; | |
182 | /* XXX: only mpeg1 */ | |
183 | quant_matrix = s->intra_matrix; | |
184 | __asm__ volatile( | |
185 | "pcmpeqw %%mm7, %%mm7 \n\t" | |
186 | "psrlw $15, %%mm7 \n\t" | |
187 | "movd %2, %%mm6 \n\t" | |
188 | "packssdw %%mm6, %%mm6 \n\t" | |
189 | "packssdw %%mm6, %%mm6 \n\t" | |
190 | "mov %3, %%"REG_a" \n\t" | |
191 | ".p2align 4 \n\t" | |
192 | "1: \n\t" | |
193 | "movq (%0, %%"REG_a"), %%mm0 \n\t" | |
194 | "movq 8(%0, %%"REG_a"), %%mm1 \n\t" | |
195 | "movq (%1, %%"REG_a"), %%mm4 \n\t" | |
196 | "movq 8(%1, %%"REG_a"), %%mm5 \n\t" | |
197 | "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |
198 | "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |
199 | "pxor %%mm2, %%mm2 \n\t" | |
200 | "pxor %%mm3, %%mm3 \n\t" | |
201 | "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
202 | "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
203 | "pxor %%mm2, %%mm0 \n\t" | |
204 | "pxor %%mm3, %%mm1 \n\t" | |
205 | "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | |
206 | "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | |
207 | "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q | |
208 | "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q | |
209 | "pxor %%mm4, %%mm4 \n\t" | |
210 | "pxor %%mm5, %%mm5 \n\t" // FIXME slow | |
211 | "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |
212 | "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |
213 | "psraw $3, %%mm0 \n\t" | |
214 | "psraw $3, %%mm1 \n\t" | |
215 | "psubw %%mm7, %%mm0 \n\t" | |
216 | "psubw %%mm7, %%mm1 \n\t" | |
217 | "por %%mm7, %%mm0 \n\t" | |
218 | "por %%mm7, %%mm1 \n\t" | |
219 | "pxor %%mm2, %%mm0 \n\t" | |
220 | "pxor %%mm3, %%mm1 \n\t" | |
221 | "psubw %%mm2, %%mm0 \n\t" | |
222 | "psubw %%mm3, %%mm1 \n\t" | |
223 | "pandn %%mm0, %%mm4 \n\t" | |
224 | "pandn %%mm1, %%mm5 \n\t" | |
225 | "movq %%mm4, (%0, %%"REG_a") \n\t" | |
226 | "movq %%mm5, 8(%0, %%"REG_a") \n\t" | |
227 | ||
228 | "add $16, %%"REG_a" \n\t" | |
229 | "js 1b \n\t" | |
230 | ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) | |
231 | : "%"REG_a, "memory" | |
232 | ); | |
233 | block[0]= block0; | |
234 | } | |
235 | ||
236 | static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, | |
237 | int16_t *block, int n, int qscale) | |
238 | { | |
239 | x86_reg nCoeffs; | |
240 | const uint16_t *quant_matrix; | |
241 | ||
242 | av_assert2(s->block_last_index[n]>=0); | |
243 | ||
244 | nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; | |
245 | ||
246 | quant_matrix = s->inter_matrix; | |
247 | __asm__ volatile( | |
248 | "pcmpeqw %%mm7, %%mm7 \n\t" | |
249 | "psrlw $15, %%mm7 \n\t" | |
250 | "movd %2, %%mm6 \n\t" | |
251 | "packssdw %%mm6, %%mm6 \n\t" | |
252 | "packssdw %%mm6, %%mm6 \n\t" | |
253 | "mov %3, %%"REG_a" \n\t" | |
254 | ".p2align 4 \n\t" | |
255 | "1: \n\t" | |
256 | "movq (%0, %%"REG_a"), %%mm0 \n\t" | |
257 | "movq 8(%0, %%"REG_a"), %%mm1 \n\t" | |
258 | "movq (%1, %%"REG_a"), %%mm4 \n\t" | |
259 | "movq 8(%1, %%"REG_a"), %%mm5 \n\t" | |
260 | "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |
261 | "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |
262 | "pxor %%mm2, %%mm2 \n\t" | |
263 | "pxor %%mm3, %%mm3 \n\t" | |
264 | "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
265 | "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
266 | "pxor %%mm2, %%mm0 \n\t" | |
267 | "pxor %%mm3, %%mm1 \n\t" | |
268 | "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | |
269 | "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | |
270 | "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 | |
271 | "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 | |
272 | "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1 | |
273 | "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 | |
274 | "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q | |
275 | "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q | |
276 | "pxor %%mm4, %%mm4 \n\t" | |
277 | "pxor %%mm5, %%mm5 \n\t" // FIXME slow | |
278 | "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |
279 | "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |
280 | "psraw $4, %%mm0 \n\t" | |
281 | "psraw $4, %%mm1 \n\t" | |
282 | "psubw %%mm7, %%mm0 \n\t" | |
283 | "psubw %%mm7, %%mm1 \n\t" | |
284 | "por %%mm7, %%mm0 \n\t" | |
285 | "por %%mm7, %%mm1 \n\t" | |
286 | "pxor %%mm2, %%mm0 \n\t" | |
287 | "pxor %%mm3, %%mm1 \n\t" | |
288 | "psubw %%mm2, %%mm0 \n\t" | |
289 | "psubw %%mm3, %%mm1 \n\t" | |
290 | "pandn %%mm0, %%mm4 \n\t" | |
291 | "pandn %%mm1, %%mm5 \n\t" | |
292 | "movq %%mm4, (%0, %%"REG_a") \n\t" | |
293 | "movq %%mm5, 8(%0, %%"REG_a") \n\t" | |
294 | ||
295 | "add $16, %%"REG_a" \n\t" | |
296 | "js 1b \n\t" | |
297 | ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) | |
298 | : "%"REG_a, "memory" | |
299 | ); | |
300 | } | |
301 | ||
302 | static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, | |
303 | int16_t *block, int n, int qscale) | |
304 | { | |
305 | x86_reg nCoeffs; | |
306 | const uint16_t *quant_matrix; | |
307 | int block0; | |
308 | ||
309 | av_assert2(s->block_last_index[n]>=0); | |
310 | ||
311 | if(s->alternate_scan) nCoeffs= 63; //FIXME | |
312 | else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; | |
313 | ||
314 | if (n < 4) | |
315 | block0 = block[0] * s->y_dc_scale; | |
316 | else | |
317 | block0 = block[0] * s->c_dc_scale; | |
318 | quant_matrix = s->intra_matrix; | |
319 | __asm__ volatile( | |
320 | "pcmpeqw %%mm7, %%mm7 \n\t" | |
321 | "psrlw $15, %%mm7 \n\t" | |
322 | "movd %2, %%mm6 \n\t" | |
323 | "packssdw %%mm6, %%mm6 \n\t" | |
324 | "packssdw %%mm6, %%mm6 \n\t" | |
325 | "mov %3, %%"REG_a" \n\t" | |
326 | ".p2align 4 \n\t" | |
327 | "1: \n\t" | |
328 | "movq (%0, %%"REG_a"), %%mm0 \n\t" | |
329 | "movq 8(%0, %%"REG_a"), %%mm1 \n\t" | |
330 | "movq (%1, %%"REG_a"), %%mm4 \n\t" | |
331 | "movq 8(%1, %%"REG_a"), %%mm5 \n\t" | |
332 | "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |
333 | "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |
334 | "pxor %%mm2, %%mm2 \n\t" | |
335 | "pxor %%mm3, %%mm3 \n\t" | |
336 | "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
337 | "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
338 | "pxor %%mm2, %%mm0 \n\t" | |
339 | "pxor %%mm3, %%mm1 \n\t" | |
340 | "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | |
341 | "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | |
342 | "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q | |
343 | "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q | |
344 | "pxor %%mm4, %%mm4 \n\t" | |
345 | "pxor %%mm5, %%mm5 \n\t" // FIXME slow | |
346 | "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |
347 | "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |
348 | "psraw $3, %%mm0 \n\t" | |
349 | "psraw $3, %%mm1 \n\t" | |
350 | "pxor %%mm2, %%mm0 \n\t" | |
351 | "pxor %%mm3, %%mm1 \n\t" | |
352 | "psubw %%mm2, %%mm0 \n\t" | |
353 | "psubw %%mm3, %%mm1 \n\t" | |
354 | "pandn %%mm0, %%mm4 \n\t" | |
355 | "pandn %%mm1, %%mm5 \n\t" | |
356 | "movq %%mm4, (%0, %%"REG_a") \n\t" | |
357 | "movq %%mm5, 8(%0, %%"REG_a") \n\t" | |
358 | ||
359 | "add $16, %%"REG_a" \n\t" | |
360 | "jng 1b \n\t" | |
361 | ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) | |
362 | : "%"REG_a, "memory" | |
363 | ); | |
364 | block[0]= block0; | |
365 | //Note, we do not do mismatch control for intra as errors cannot accumulate | |
366 | } | |
367 | ||
368 | static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, | |
369 | int16_t *block, int n, int qscale) | |
370 | { | |
371 | x86_reg nCoeffs; | |
372 | const uint16_t *quant_matrix; | |
373 | ||
374 | av_assert2(s->block_last_index[n]>=0); | |
375 | ||
376 | if(s->alternate_scan) nCoeffs= 63; //FIXME | |
377 | else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; | |
378 | ||
379 | quant_matrix = s->inter_matrix; | |
380 | __asm__ volatile( | |
381 | "pcmpeqw %%mm7, %%mm7 \n\t" | |
382 | "psrlq $48, %%mm7 \n\t" | |
383 | "movd %2, %%mm6 \n\t" | |
384 | "packssdw %%mm6, %%mm6 \n\t" | |
385 | "packssdw %%mm6, %%mm6 \n\t" | |
386 | "mov %3, %%"REG_a" \n\t" | |
387 | ".p2align 4 \n\t" | |
388 | "1: \n\t" | |
389 | "movq (%0, %%"REG_a"), %%mm0 \n\t" | |
390 | "movq 8(%0, %%"REG_a"), %%mm1 \n\t" | |
391 | "movq (%1, %%"REG_a"), %%mm4 \n\t" | |
392 | "movq 8(%1, %%"REG_a"), %%mm5 \n\t" | |
393 | "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |
394 | "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |
395 | "pxor %%mm2, %%mm2 \n\t" | |
396 | "pxor %%mm3, %%mm3 \n\t" | |
397 | "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 | |
398 | "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 | |
399 | "pxor %%mm2, %%mm0 \n\t" | |
400 | "pxor %%mm3, %%mm1 \n\t" | |
401 | "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) | |
402 | "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) | |
403 | "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 | |
404 | "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 | |
405 | "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q | |
406 | "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q | |
407 | "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q | |
408 | "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q | |
409 | "pxor %%mm4, %%mm4 \n\t" | |
410 | "pxor %%mm5, %%mm5 \n\t" // FIXME slow | |
411 | "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |
412 | "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |
413 | "psrlw $4, %%mm0 \n\t" | |
414 | "psrlw $4, %%mm1 \n\t" | |
415 | "pxor %%mm2, %%mm0 \n\t" | |
416 | "pxor %%mm3, %%mm1 \n\t" | |
417 | "psubw %%mm2, %%mm0 \n\t" | |
418 | "psubw %%mm3, %%mm1 \n\t" | |
419 | "pandn %%mm0, %%mm4 \n\t" | |
420 | "pandn %%mm1, %%mm5 \n\t" | |
421 | "pxor %%mm4, %%mm7 \n\t" | |
422 | "pxor %%mm5, %%mm7 \n\t" | |
423 | "movq %%mm4, (%0, %%"REG_a") \n\t" | |
424 | "movq %%mm5, 8(%0, %%"REG_a") \n\t" | |
425 | ||
426 | "add $16, %%"REG_a" \n\t" | |
427 | "jng 1b \n\t" | |
428 | "movd 124(%0, %3), %%mm0 \n\t" | |
429 | "movq %%mm7, %%mm6 \n\t" | |
430 | "psrlq $32, %%mm7 \n\t" | |
431 | "pxor %%mm6, %%mm7 \n\t" | |
432 | "movq %%mm7, %%mm6 \n\t" | |
433 | "psrlq $16, %%mm7 \n\t" | |
434 | "pxor %%mm6, %%mm7 \n\t" | |
435 | "pslld $31, %%mm7 \n\t" | |
436 | "psrlq $15, %%mm7 \n\t" | |
437 | "pxor %%mm7, %%mm0 \n\t" | |
438 | "movd %%mm0, 124(%0, %3) \n\t" | |
439 | ||
440 | ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs) | |
441 | : "%"REG_a, "memory" | |
442 | ); | |
443 | } | |
444 | ||
445 | #endif /* HAVE_MMX_INLINE */ | |
446 | ||
447 | av_cold void ff_mpv_common_init_x86(MpegEncContext *s) | |
448 | { | |
449 | #if HAVE_MMX_INLINE | |
450 | int cpu_flags = av_get_cpu_flags(); | |
451 | ||
452 | if (INLINE_MMX(cpu_flags)) { | |
453 | s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; | |
454 | s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx; | |
455 | s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx; | |
456 | s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx; | |
457 | if(!(s->flags & CODEC_FLAG_BITEXACT)) | |
458 | s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; | |
459 | s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; | |
460 | } | |
461 | #endif /* HAVE_MMX_INLINE */ | |
462 | } |