2 * MPEG video MMX templates
4 * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 #include "libavutil/internal.h"
26 #include "libavutil/x86/asm.h"
27 #include "libavcodec/mpegvideo.h"
/*
 * Instruction-set dependent building blocks for the inline asm of the
 * quantizer further down. All operands use AT&T order (source, destination).
 * NOTE(review): several "#define" heads and the #else/#endif lines of this
 * region are not visible in this view; the comments below describe only the
 * lines that are visible.
 */
39 #if COMPILE_TEMPLATE_SSE2
/* Asm immediate string; the quantizer loops add it to the byte index each iteration. */
40 #define MMREG_WIDTH "16"
44 "pshuflw $0, "a", "a" \n\t"\
45 "punpcklwd "a", "a" \n\t"
/* Packed signed 16-bit maximum of a and b, result left in b. */
46 #define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
48 "movhlps "a", "b" \n\t"\
50 "pshuflw $0x0E, "a", "b" \n\t"\
52 "pshuflw $0x01, "a", "b" \n\t"\
55 #define MMREG_WIDTH "8"
58 #if COMPILE_TEMPLATE_MMXEXT
/* Replicates the lowest 16-bit word of a into all four words of a. */
59 #define SPREADW(a) "pshufw $0, "a", "a" \n\t"
/* Packed signed 16-bit maximum of a and b, result left in b. */
60 #define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
62 "pshufw $0x0E, "a", "b" \n\t"\
64 "pshufw $0x01, "a", "b" \n\t"\
68 "punpcklwd "a", "a" \n\t"\
69 "punpcklwd "a", "a" \n\t"
71 "psubusw "a", "b" \n\t"\
75 "psrlq $32, "a" \n\t"\
78 "psrlq $16, "a" \n\t"\
84 #if COMPILE_TEMPLATE_SSSE3
/*
 * SSSE3 variant: SAVE_SIGN starts by copying b into a; RESTORE_SIGN then
 * applies the per-word sign of a to b with psignw.
 * NOTE(review): the remainder of SAVE_SIGN is not visible here.
 */
85 #define SAVE_SIGN(a,b) \
86 "movdqa "b", "a" \n\t"\
88 #define RESTORE_SIGN(a,b) \
89 "psignw "a", "b" \n\t"
/*
 * Variant without psignw: a receives a per-word compare mask and b is turned
 * into ABS(b); RESTORE_SIGN undoes this using the mask kept in a.
 * NOTE(review): the preprocessor line separating this from the SSSE3
 * definitions, and some instruction lines of both macros, are not visible.
 */
91 #define SAVE_SIGN(a,b) \
93 "pcmpgtw "b", "a" \n\t" /* block[i] <= 0 ? 0xFF : 0x00 */\
95 "psubw "a", "b" \n\t" /* ABS(block[i]) */
96 #define RESTORE_SIGN(a,b) \
98 "psubw "a", "b" \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
/**
 * Quantize one block of 64 coefficients; instantiated once per instruction
 * set through RENAME().
 *
 * Steps visible in this function:
 *  - RENAME_FDCT(ff_fdct) is called on block, and s->denoise_dct(s, block)
 *    is called (NOTE(review): any condition guarding these calls is not
 *    visible in this view).
 *  - bias/qmat are taken from rows [1]/[0] of s->q_intra_matrix16,
 *    s->q_chroma_intra_matrix16 or s->q_inter_matrix16, indexed by qscale.
 *  - An inline-asm loop reads block, stores the quantized values into
 *    temp_block and overwrites the processed part of block with zeros.
 *  - A value derived from the quantized magnitudes and s->max_qcoeff is
 *    written through the asm operand commented as *overflow.
 *  - temp_block entries are copied back into block; the destination index
 *    depends on s->idsp.perm_type.
 *
 * @param s        encoder context supplying the matrices, max_qcoeff,
 *                 out_format, mpeg_quant, mb_intra and idsp.perm_type
 * @param block    coefficients, modified in place
 * @param n        NOTE(review): its use is not visible in this view
 * @param qscale   first index into the q_*_matrix16 tables
 * @param overflow output flag, see above
 * @return last_non_zero_p1 - 1
 */
101 static int RENAME(dct_quantize
)(MpegEncContext
*s
,
102 int16_t *block
, int n
,
103 int qscale
, int *overflow
)
105 x86_reg last_non_zero_p1
;
106 int level
=0, q
; //=0 is because gcc says uninitialized ...
107 const uint16_t *qmat
, *bias
;
108 LOCAL_ALIGNED_16(int16_t, temp_block
, [64]);
/* NOTE(review): the mask 7 only verifies 8-byte alignment, although the
 * buffer above is declared with LOCAL_ALIGNED_16. */
110 av_assert2((7&(int)(&temp_block
[0])) == 0); //did gcc align it correctly?
113 RENAME_FDCT(ff_fdct
)(block
); // cannot be anything else ...
116 s
->denoise_dct(s
, block
);
/* Intra tables: row [1] is used as bias, row [0] as multiplier.
 * NOTE(review): the conditions choosing between q_intra_matrix16 and
 * q_chroma_intra_matrix16 are not visible in this view. */
122 bias
= s
->q_intra_matrix16
[qscale
][1];
123 qmat
= s
->q_intra_matrix16
[qscale
][0];
126 bias
= s
->q_chroma_intra_matrix16
[qscale
][1];
127 qmat
= s
->q_chroma_intra_matrix16
[qscale
][0];
129 /* note: block[0] is assumed to be positive */
133 : "=d" (level
), "=a"(dummy
)
134 : "a" ((block
[0]>>2) + q
), "c" (ff_inverse
[q
<<1])
137 /* For AIC we skip quant/dequant of INTRADC */
138 level
= (block
[0] + 4)>>3;
140 block
[0]=0; //avoid fake overflow
141 // temp_block[0] = (block[0] + (q >> 1)) / q;
142 last_non_zero_p1
= 1;
144 last_non_zero_p1
= 0;
145 bias
= s
->q_inter_matrix16
[qscale
][1];
146 qmat
= s
->q_inter_matrix16
[qscale
][0];
149 if((s
->out_format
== FMT_H263
|| s
->out_format
== FMT_H261
) && s
->mpeg_quant
==0){
/*
 * First loop: one qmat vector and one bias vector are loaded once, before
 * the loop, and reused for every group of coefficients.
 * Operands: %0 last_non_zero_p1 (in/out, register a), %1 block+64, %2 qmat,
 * %3 bias, %4 inv_zigzag_direct16+64, %5 temp_block+64.
 * The byte index in REG_a starts at -128 and is advanced by MMREG_WIDTH, so
 * the pointers passed in point one past the end of each 64-entry array.
 */
152 "movd %%"REG_a
", "MM
"3 \n\t" // last_non_zero_p1
154 "pxor "MM
"7, "MM
"7 \n\t" // 0
155 "pxor "MM
"4, "MM
"4 \n\t" // 0
156 MOVQ
" (%2), "MM
"5 \n\t" // qmat[0]
157 "pxor "MM
"6, "MM
"6 \n\t"
158 "psubw (%3), "MM
"6 \n\t" // -bias[0]
159 "mov $-128, %%"REG_a
" \n\t"
162 MOVQ
" (%1, %%"REG_a
"), "MM
"0 \n\t" // block[i]
163 SAVE_SIGN(MM
"1", MM
"0") // ABS(block[i])
164 "psubusw "MM
"6, "MM
"0 \n\t" // ABS(block[i]) - (-bias[0]), unsigned saturating
165 "pmulhw "MM
"5, "MM
"0 \n\t" // ((ABS(block[i]) - (-bias[0])) * qmat[0])>>16
166 "por "MM
"0, "MM
"4 \n\t"
167 RESTORE_SIGN(MM
"1", MM
"0") // out=(((ABS(block[i]) - (-bias[0])) * qmat[0])>>16)*sign(block[i])
168 MOVQ
" "MM
"0, (%5, %%"REG_a
") \n\t"
169 "pcmpeqw "MM
"7, "MM
"0 \n\t" // out==0 ? 0xFF : 0x00
170 MOVQ
" (%4, %%"REG_a
"), "MM
"1 \n\t"
171 MOVQ
" "MM
"7, (%1, %%"REG_a
") \n\t" // 0
172 "pandn "MM
"1, "MM
"0 \n\t"
174 "add $"MMREG_WIDTH
", %%"REG_a
" \n\t"
177 "movd "MM
"3, %%"REG_a
" \n\t"
178 "movzbl %%al, %%eax \n\t" // last_non_zero_p1
179 : "+a" (last_non_zero_p1
)
180 : "r" (block
+64), "r" (qmat
), "r" (bias
),
181 "r" (inv_zigzag_direct16
+ 64), "r" (temp_block
+ 64)
182 XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
183 "%xmm4", "%xmm5", "%xmm6", "%xmm7")
/*
 * Second loop: same operand layout as the first one, but qmat and bias are
 * passed as one-past-the-end pointers and reloaded at the running index on
 * every iteration, and the bias is added instead of subtracted.
 */
187 "movd %%"REG_a
", "MM
"3 \n\t" // last_non_zero_p1
189 "pxor "MM
"7, "MM
"7 \n\t" // 0
190 "pxor "MM
"4, "MM
"4 \n\t" // 0
191 "mov $-128, %%"REG_a
" \n\t"
194 MOVQ
" (%1, %%"REG_a
"), "MM
"0 \n\t" // block[i]
195 SAVE_SIGN(MM
"1", MM
"0") // ABS(block[i])
196 MOVQ
" (%3, %%"REG_a
"), "MM
"6 \n\t" // bias[i]
197 "paddusw "MM
"6, "MM
"0 \n\t" // ABS(block[i]) + bias[i], unsigned saturating
198 MOVQ
" (%2, %%"REG_a
"), "MM
"5 \n\t" // qmat[i]
199 "pmulhw "MM
"5, "MM
"0 \n\t" // ((ABS(block[i]) + bias[i]) * qmat[i])>>16
200 "por "MM
"0, "MM
"4 \n\t"
201 RESTORE_SIGN(MM
"1", MM
"0") // out=(((ABS(block[i]) + bias[i]) * qmat[i])>>16)*sign(block[i])
202 MOVQ
" "MM
"0, (%5, %%"REG_a
") \n\t"
203 "pcmpeqw "MM
"7, "MM
"0 \n\t" // out==0 ? 0xFF : 0x00
204 MOVQ
" (%4, %%"REG_a
"), "MM
"1 \n\t"
205 MOVQ
" "MM
"7, (%1, %%"REG_a
") \n\t" // 0
206 "pandn "MM
"1, "MM
"0 \n\t"
208 "add $"MMREG_WIDTH
", %%"REG_a
" \n\t"
211 "movd "MM
"3, %%"REG_a
" \n\t"
212 "movzbl %%al, %%eax \n\t" // last_non_zero_p1
213 : "+a" (last_non_zero_p1
)
214 : "r" (block
+64), "r" (qmat
+64), "r" (bias
+64),
215 "r" (inv_zigzag_direct16
+ 64), "r" (temp_block
+ 64)
216 XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
217 "%xmm4", "%xmm5", "%xmm6", "%xmm7")
/*
 * Overflow check: MM4 carries the OR of the unsigned quantized magnitudes
 * collected by "por" in the loop above; max_qcoeff is subtracted from it
 * with unsigned saturation and the packed result is stored to operand %0.
 * NOTE(review): some instruction lines of this asm statement are not
 * visible in this view.
 */
221 "movd %1, "MM
"1 \n\t" // max_qcoeff
223 "psubusw "MM
"1, "MM
"4 \n\t"
224 "packuswb "MM
"4, "MM
"4 \n\t"
225 #if COMPILE_TEMPLATE_SSE2
226 "packsswb "MM
"4, "MM
"4 \n\t"
228 "movd "MM
"4, %0 \n\t" // *overflow
230 : "g" (s
->max_qcoeff
)
/* The asm loop zeroed block, so entry 0 is restored explicitly here. */
233 if(s
->mb_intra
) block
[0]= level
;
234 else block
[0]= temp_block
[0];
/*
 * Copy the quantized values from temp_block back into block. The copy is
 * done in groups, and each "goto end" stops it as soon as last_non_zero_p1
 * does not exceed the threshold of the next group; the entries that are not
 * copied keep the zeros written by the asm loop.
 */
236 if (s
->idsp
.perm_type
== FF_IDCT_PERM_SIMPLE
) {
237 if(last_non_zero_p1
<= 1) goto end
;
238 block
[0x08] = temp_block
[0x01]; block
[0x10] = temp_block
[0x08];
239 block
[0x20] = temp_block
[0x10];
240 if(last_non_zero_p1
<= 4) goto end
;
241 block
[0x18] = temp_block
[0x09]; block
[0x04] = temp_block
[0x02];
242 block
[0x09] = temp_block
[0x03];
243 if(last_non_zero_p1
<= 7) goto end
;
244 block
[0x14] = temp_block
[0x0A]; block
[0x28] = temp_block
[0x11];
245 block
[0x12] = temp_block
[0x18]; block
[0x02] = temp_block
[0x20];
246 if(last_non_zero_p1
<= 11) goto end
;
247 block
[0x1A] = temp_block
[0x19]; block
[0x24] = temp_block
[0x12];
248 block
[0x19] = temp_block
[0x0B]; block
[0x01] = temp_block
[0x04];
249 block
[0x0C] = temp_block
[0x05];
250 if(last_non_zero_p1
<= 16) goto end
;
251 block
[0x11] = temp_block
[0x0C]; block
[0x29] = temp_block
[0x13];
252 block
[0x16] = temp_block
[0x1A]; block
[0x0A] = temp_block
[0x21];
253 block
[0x30] = temp_block
[0x28]; block
[0x22] = temp_block
[0x30];
254 block
[0x38] = temp_block
[0x29]; block
[0x06] = temp_block
[0x22];
255 if(last_non_zero_p1
<= 24) goto end
;
256 block
[0x1B] = temp_block
[0x1B]; block
[0x21] = temp_block
[0x14];
257 block
[0x1C] = temp_block
[0x0D]; block
[0x05] = temp_block
[0x06];
258 block
[0x0D] = temp_block
[0x07]; block
[0x15] = temp_block
[0x0E];
259 block
[0x2C] = temp_block
[0x15]; block
[0x13] = temp_block
[0x1C];
260 if(last_non_zero_p1
<= 32) goto end
;
261 block
[0x0B] = temp_block
[0x23]; block
[0x34] = temp_block
[0x2A];
262 block
[0x2A] = temp_block
[0x31]; block
[0x32] = temp_block
[0x38];
263 block
[0x3A] = temp_block
[0x39]; block
[0x26] = temp_block
[0x32];
264 block
[0x39] = temp_block
[0x2B]; block
[0x03] = temp_block
[0x24];
265 if(last_non_zero_p1
<= 40) goto end
;
266 block
[0x1E] = temp_block
[0x1D]; block
[0x25] = temp_block
[0x16];
267 block
[0x1D] = temp_block
[0x0F]; block
[0x2D] = temp_block
[0x17];
268 block
[0x17] = temp_block
[0x1E]; block
[0x0E] = temp_block
[0x25];
269 block
[0x31] = temp_block
[0x2C]; block
[0x2B] = temp_block
[0x33];
270 if(last_non_zero_p1
<= 48) goto end
;
271 block
[0x36] = temp_block
[0x3A]; block
[0x3B] = temp_block
[0x3B];
272 block
[0x23] = temp_block
[0x34]; block
[0x3C] = temp_block
[0x2D];
273 block
[0x07] = temp_block
[0x26]; block
[0x1F] = temp_block
[0x1F];
274 block
[0x0F] = temp_block
[0x27]; block
[0x35] = temp_block
[0x2E];
275 if(last_non_zero_p1
<= 56) goto end
;
276 block
[0x2E] = temp_block
[0x35]; block
[0x33] = temp_block
[0x3C];
277 block
[0x3E] = temp_block
[0x3D]; block
[0x27] = temp_block
[0x36];
278 block
[0x3D] = temp_block
[0x2F]; block
[0x2F] = temp_block
[0x37];
279 block
[0x37] = temp_block
[0x3E]; block
[0x3F] = temp_block
[0x3F];
280 }else if(s
->idsp
.perm_type
== FF_IDCT_PERM_LIBMPEG2
){
/* Same grouped copy-back with the destination indices used for
 * FF_IDCT_PERM_LIBMPEG2. */
281 if(last_non_zero_p1
<= 1) goto end
;
282 block
[0x04] = temp_block
[0x01];
283 block
[0x08] = temp_block
[0x08]; block
[0x10] = temp_block
[0x10];
284 if(last_non_zero_p1
<= 4) goto end
;
285 block
[0x0C] = temp_block
[0x09]; block
[0x01] = temp_block
[0x02];
286 block
[0x05] = temp_block
[0x03];
287 if(last_non_zero_p1
<= 7) goto end
;
288 block
[0x09] = temp_block
[0x0A]; block
[0x14] = temp_block
[0x11];
289 block
[0x18] = temp_block
[0x18]; block
[0x20] = temp_block
[0x20];
290 if(last_non_zero_p1
<= 11) goto end
;
291 block
[0x1C] = temp_block
[0x19];
292 block
[0x11] = temp_block
[0x12]; block
[0x0D] = temp_block
[0x0B];
293 block
[0x02] = temp_block
[0x04]; block
[0x06] = temp_block
[0x05];
294 if(last_non_zero_p1
<= 16) goto end
;
295 block
[0x0A] = temp_block
[0x0C]; block
[0x15] = temp_block
[0x13];
296 block
[0x19] = temp_block
[0x1A]; block
[0x24] = temp_block
[0x21];
297 block
[0x28] = temp_block
[0x28]; block
[0x30] = temp_block
[0x30];
298 block
[0x2C] = temp_block
[0x29]; block
[0x21] = temp_block
[0x22];
299 if(last_non_zero_p1
<= 24) goto end
;
300 block
[0x1D] = temp_block
[0x1B]; block
[0x12] = temp_block
[0x14];
301 block
[0x0E] = temp_block
[0x0D]; block
[0x03] = temp_block
[0x06];
302 block
[0x07] = temp_block
[0x07]; block
[0x0B] = temp_block
[0x0E];
303 block
[0x16] = temp_block
[0x15]; block
[0x1A] = temp_block
[0x1C];
304 if(last_non_zero_p1
<= 32) goto end
;
305 block
[0x25] = temp_block
[0x23]; block
[0x29] = temp_block
[0x2A];
306 block
[0x34] = temp_block
[0x31]; block
[0x38] = temp_block
[0x38];
307 block
[0x3C] = temp_block
[0x39]; block
[0x31] = temp_block
[0x32];
308 block
[0x2D] = temp_block
[0x2B]; block
[0x22] = temp_block
[0x24];
309 if(last_non_zero_p1
<= 40) goto end
;
310 block
[0x1E] = temp_block
[0x1D]; block
[0x13] = temp_block
[0x16];
311 block
[0x0F] = temp_block
[0x0F]; block
[0x17] = temp_block
[0x17];
312 block
[0x1B] = temp_block
[0x1E]; block
[0x26] = temp_block
[0x25];
313 block
[0x2A] = temp_block
[0x2C]; block
[0x35] = temp_block
[0x33];
314 if(last_non_zero_p1
<= 48) goto end
;
315 block
[0x39] = temp_block
[0x3A]; block
[0x3D] = temp_block
[0x3B];
316 block
[0x32] = temp_block
[0x34]; block
[0x2E] = temp_block
[0x2D];
317 block
[0x23] = temp_block
[0x26]; block
[0x1F] = temp_block
[0x1F];
318 block
[0x27] = temp_block
[0x27]; block
[0x2B] = temp_block
[0x2E];
319 if(last_non_zero_p1
<= 56) goto end
;
320 block
[0x36] = temp_block
[0x35]; block
[0x3A] = temp_block
[0x3C];
321 block
[0x3E] = temp_block
[0x3D]; block
[0x33] = temp_block
[0x36];
322 block
[0x2F] = temp_block
[0x2F]; block
[0x37] = temp_block
[0x37];
323 block
[0x3B] = temp_block
[0x3E]; block
[0x3F] = temp_block
[0x3F];
/* Last group of assignments: source and destination index are identical in
 * every statement, i.e. values are copied back without reordering.
 * NOTE(review): the branch line introducing this case is not visible. */
325 if(last_non_zero_p1
<= 1) goto end
;
326 block
[0x01] = temp_block
[0x01];
327 block
[0x08] = temp_block
[0x08]; block
[0x10] = temp_block
[0x10];
328 if(last_non_zero_p1
<= 4) goto end
;
329 block
[0x09] = temp_block
[0x09]; block
[0x02] = temp_block
[0x02];
330 block
[0x03] = temp_block
[0x03];
331 if(last_non_zero_p1
<= 7) goto end
;
332 block
[0x0A] = temp_block
[0x0A]; block
[0x11] = temp_block
[0x11];
333 block
[0x18] = temp_block
[0x18]; block
[0x20] = temp_block
[0x20];
334 if(last_non_zero_p1
<= 11) goto end
;
335 block
[0x19] = temp_block
[0x19];
336 block
[0x12] = temp_block
[0x12]; block
[0x0B] = temp_block
[0x0B];
337 block
[0x04] = temp_block
[0x04]; block
[0x05] = temp_block
[0x05];
338 if(last_non_zero_p1
<= 16) goto end
;
339 block
[0x0C] = temp_block
[0x0C]; block
[0x13] = temp_block
[0x13];
340 block
[0x1A] = temp_block
[0x1A]; block
[0x21] = temp_block
[0x21];
341 block
[0x28] = temp_block
[0x28]; block
[0x30] = temp_block
[0x30];
342 block
[0x29] = temp_block
[0x29]; block
[0x22] = temp_block
[0x22];
343 if(last_non_zero_p1
<= 24) goto end
;
344 block
[0x1B] = temp_block
[0x1B]; block
[0x14] = temp_block
[0x14];
345 block
[0x0D] = temp_block
[0x0D]; block
[0x06] = temp_block
[0x06];
346 block
[0x07] = temp_block
[0x07]; block
[0x0E] = temp_block
[0x0E];
347 block
[0x15] = temp_block
[0x15]; block
[0x1C] = temp_block
[0x1C];
348 if(last_non_zero_p1
<= 32) goto end
;
349 block
[0x23] = temp_block
[0x23]; block
[0x2A] = temp_block
[0x2A];
350 block
[0x31] = temp_block
[0x31]; block
[0x38] = temp_block
[0x38];
351 block
[0x39] = temp_block
[0x39]; block
[0x32] = temp_block
[0x32];
352 block
[0x2B] = temp_block
[0x2B]; block
[0x24] = temp_block
[0x24];
353 if(last_non_zero_p1
<= 40) goto end
;
354 block
[0x1D] = temp_block
[0x1D]; block
[0x16] = temp_block
[0x16];
355 block
[0x0F] = temp_block
[0x0F]; block
[0x17] = temp_block
[0x17];
356 block
[0x1E] = temp_block
[0x1E]; block
[0x25] = temp_block
[0x25];
357 block
[0x2C] = temp_block
[0x2C]; block
[0x33] = temp_block
[0x33];
358 if(last_non_zero_p1
<= 48) goto end
;
359 block
[0x3A] = temp_block
[0x3A]; block
[0x3B] = temp_block
[0x3B];
360 block
[0x34] = temp_block
[0x34]; block
[0x2D] = temp_block
[0x2D];
361 block
[0x26] = temp_block
[0x26]; block
[0x1F] = temp_block
[0x1F];
362 block
[0x27] = temp_block
[0x27]; block
[0x2E] = temp_block
[0x2E];
363 if(last_non_zero_p1
<= 56) goto end
;
364 block
[0x35] = temp_block
[0x35]; block
[0x3C] = temp_block
[0x3C];
365 block
[0x3D] = temp_block
[0x3D]; block
[0x36] = temp_block
[0x36];
366 block
[0x2F] = temp_block
[0x2F]; block
[0x37] = temp_block
[0x37];
367 block
[0x3E] = temp_block
[0x3E]; block
[0x3F] = temp_block
[0x3F];
/* last_non_zero_p1 is a "plus one" count, so the returned index is one less. */
370 return last_non_zero_p1
- 1;