;* (reconstructed from a git-blame listing; commit 2ba45a60, author DM)
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
24 | ||
25 | %include "libavutil/x86/x86util.asm" | |
26 | ||
27 | SECTION_RODATA | |
28 | ||
29 | pw_pixel_max: times 8 dw ((1 << 10)-1) | |
30 | pd_32: times 4 dd 32 | |
31 | ||
32 | SECTION .text | |
33 | ||
;-----------------------------------------------------------------------------
; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
37 | %macro STORE_DIFFx2 6 | |
38 | psrad %1, 6 | |
39 | psrad %2, 6 | |
40 | packssdw %1, %2 | |
41 | movq %3, [%5] | |
42 | movhps %3, [%5+%6] | |
43 | paddsw %1, %3 | |
44 | CLIPW %1, %4, [pw_pixel_max] | |
45 | movq [%5], %1 | |
46 | movhps [%5+%6], %1 | |
47 | %endmacro | |
48 | ||
49 | %macro STORE_DIFF16 5 | |
50 | psrad %1, 6 | |
51 | psrad %2, 6 | |
52 | packssdw %1, %2 | |
53 | paddsw %1, [%5] | |
54 | CLIPW %1, %3, %4 | |
55 | mova [%5], %1 | |
56 | %endmacro | |
57 | ||
58 | ;dst, in, stride | |
59 | %macro IDCT4_ADD_10 3 | |
60 | mova m0, [%2+ 0] | |
61 | mova m1, [%2+16] | |
62 | mova m2, [%2+32] | |
63 | mova m3, [%2+48] | |
64 | IDCT4_1D d,0,1,2,3,4,5 | |
65 | TRANSPOSE4x4D 0,1,2,3,4 | |
66 | paddd m0, [pd_32] | |
67 | IDCT4_1D d,0,1,2,3,4,5 | |
68 | pxor m5, m5 | |
69 | mova [%2+ 0], m5 | |
70 | mova [%2+16], m5 | |
71 | mova [%2+32], m5 | |
72 | mova [%2+48], m5 | |
73 | STORE_DIFFx2 m0, m1, m4, m5, %1, %3 | |
74 | lea %1, [%1+%3*2] | |
75 | STORE_DIFFx2 m2, m3, m4, m5, %1, %3 | |
76 | %endmacro | |
77 | ||
78 | %macro IDCT_ADD_10 0 | |
79 | cglobal h264_idct_add_10, 3,3 | |
80 | IDCT4_ADD_10 r0, r1, r2 | |
81 | RET | |
82 | %endmacro | |
83 | ||
84 | INIT_XMM sse2 | |
85 | IDCT_ADD_10 | |
86 | %if HAVE_AVX_EXTERNAL | |
87 | INIT_XMM avx | |
88 | IDCT_ADD_10 | |
89 | %endif | |
90 | ||
;-----------------------------------------------------------------------------
; void ff_h264_idct_add16_10(pixel *dst, const int *block_offset,
;                            int16_t *block, int stride,
;                            const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
97 | %macro ADD4x4IDCT 0 | |
98 | add4x4_idct %+ SUFFIX: | |
99 | add r5, r0 | |
100 | mova m0, [r2+ 0] | |
101 | mova m1, [r2+16] | |
102 | mova m2, [r2+32] | |
103 | mova m3, [r2+48] | |
104 | IDCT4_1D d,0,1,2,3,4,5 | |
105 | TRANSPOSE4x4D 0,1,2,3,4 | |
106 | paddd m0, [pd_32] | |
107 | IDCT4_1D d,0,1,2,3,4,5 | |
108 | pxor m5, m5 | |
109 | mova [r2+ 0], m5 | |
110 | mova [r2+16], m5 | |
111 | mova [r2+32], m5 | |
112 | mova [r2+48], m5 | |
113 | STORE_DIFFx2 m0, m1, m4, m5, r5, r3 | |
114 | lea r5, [r5+r3*2] | |
115 | STORE_DIFFx2 m2, m3, m4, m5, r5, r3 | |
116 | ret | |
117 | %endmacro | |
118 | ||
119 | INIT_XMM sse2 | |
120 | ALIGN 16 | |
121 | ADD4x4IDCT | |
122 | %if HAVE_AVX_EXTERNAL | |
123 | INIT_XMM avx | |
124 | ALIGN 16 | |
125 | ADD4x4IDCT | |
126 | %endif | |
127 | ||
128 | %macro ADD16_OP 2 | |
129 | cmp byte [r4+%2], 0 | |
130 | jz .skipblock%1 | |
131 | mov r5d, [r1+%1*4] | |
132 | call add4x4_idct %+ SUFFIX | |
133 | .skipblock%1: | |
134 | %if %1<15 | |
135 | add r2, 64 | |
136 | %endif | |
137 | %endmacro | |
138 | ||
139 | %macro IDCT_ADD16_10 0 | |
140 | cglobal h264_idct_add16_10, 5,6 | |
141 | ADD16_OP 0, 4+1*8 | |
142 | ADD16_OP 1, 5+1*8 | |
143 | ADD16_OP 2, 4+2*8 | |
144 | ADD16_OP 3, 5+2*8 | |
145 | ADD16_OP 4, 6+1*8 | |
146 | ADD16_OP 5, 7+1*8 | |
147 | ADD16_OP 6, 6+2*8 | |
148 | ADD16_OP 7, 7+2*8 | |
149 | ADD16_OP 8, 4+3*8 | |
150 | ADD16_OP 9, 5+3*8 | |
151 | ADD16_OP 10, 4+4*8 | |
152 | ADD16_OP 11, 5+4*8 | |
153 | ADD16_OP 12, 6+3*8 | |
154 | ADD16_OP 13, 7+3*8 | |
155 | ADD16_OP 14, 6+4*8 | |
156 | ADD16_OP 15, 7+4*8 | |
157 | REP_RET | |
158 | %endmacro | |
159 | ||
160 | INIT_XMM sse2 | |
161 | IDCT_ADD16_10 | |
162 | %if HAVE_AVX_EXTERNAL | |
163 | INIT_XMM avx | |
164 | IDCT_ADD16_10 | |
165 | %endif | |
166 | ||
;-----------------------------------------------------------------------------
; void ff_h264_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
170 | %macro IDCT_DC_ADD_OP_10 3 | |
171 | pxor m5, m5 | |
172 | %if avx_enabled | |
173 | paddw m1, m0, [%1+0 ] | |
174 | paddw m2, m0, [%1+%2 ] | |
175 | paddw m3, m0, [%1+%2*2] | |
176 | paddw m4, m0, [%1+%3 ] | |
177 | %else | |
178 | mova m1, [%1+0 ] | |
179 | mova m2, [%1+%2 ] | |
180 | mova m3, [%1+%2*2] | |
181 | mova m4, [%1+%3 ] | |
182 | paddw m1, m0 | |
183 | paddw m2, m0 | |
184 | paddw m3, m0 | |
185 | paddw m4, m0 | |
186 | %endif | |
187 | CLIPW m1, m5, m6 | |
188 | CLIPW m2, m5, m6 | |
189 | CLIPW m3, m5, m6 | |
190 | CLIPW m4, m5, m6 | |
191 | mova [%1+0 ], m1 | |
192 | mova [%1+%2 ], m2 | |
193 | mova [%1+%2*2], m3 | |
194 | mova [%1+%3 ], m4 | |
195 | %endmacro | |
196 | ||
197 | INIT_MMX mmxext | |
198 | cglobal h264_idct_dc_add_10,3,3 | |
199 | movd m0, [r1] | |
200 | mov dword [r1], 0 | |
201 | paddd m0, [pd_32] | |
202 | psrad m0, 6 | |
203 | lea r1, [r2*3] | |
204 | pshufw m0, m0, 0 | |
205 | mova m6, [pw_pixel_max] | |
206 | IDCT_DC_ADD_OP_10 r0, r2, r1 | |
207 | RET | |
208 | ||
;-----------------------------------------------------------------------------
; void ff_h264_idct8_dc_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
212 | %macro IDCT8_DC_ADD 0 | |
213 | cglobal h264_idct8_dc_add_10,3,4,7 | |
214 | movd m0, [r1] | |
215 | mov dword[r1], 0 | |
216 | paddd m0, [pd_32] | |
217 | psrad m0, 6 | |
218 | lea r1, [r2*3] | |
219 | SPLATW m0, m0, 0 | |
220 | mova m6, [pw_pixel_max] | |
221 | IDCT_DC_ADD_OP_10 r0, r2, r1 | |
222 | lea r0, [r0+r2*4] | |
223 | IDCT_DC_ADD_OP_10 r0, r2, r1 | |
224 | RET | |
225 | %endmacro | |
226 | ||
227 | INIT_XMM sse2 | |
228 | IDCT8_DC_ADD | |
229 | %if HAVE_AVX_EXTERNAL | |
230 | INIT_XMM avx | |
231 | IDCT8_DC_ADD | |
232 | %endif | |
233 | ||
;-----------------------------------------------------------------------------
; void ff_h264_idct_add16intra_10(pixel *dst, const int *block_offset,
;                                 int16_t *block, int stride,
;                                 const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
239 | %macro AC 1 | |
240 | .ac%1: | |
241 | mov r5d, [r1+(%1+0)*4] | |
242 | call add4x4_idct %+ SUFFIX | |
243 | mov r5d, [r1+(%1+1)*4] | |
244 | add r2, 64 | |
245 | call add4x4_idct %+ SUFFIX | |
246 | add r2, 64 | |
247 | jmp .skipadd%1 | |
248 | %endmacro | |
249 | ||
250 | %assign last_block 16 | |
251 | %macro ADD16_OP_INTRA 2 | |
252 | cmp word [r4+%2], 0 | |
253 | jnz .ac%1 | |
254 | mov r5d, [r2+ 0] | |
255 | or r5d, [r2+64] | |
256 | jz .skipblock%1 | |
257 | mov r5d, [r1+(%1+0)*4] | |
258 | call idct_dc_add %+ SUFFIX | |
259 | .skipblock%1: | |
260 | %if %1<last_block-2 | |
261 | add r2, 128 | |
262 | %endif | |
263 | .skipadd%1: | |
264 | %endmacro | |
265 | ||
266 | %macro IDCT_ADD16INTRA_10 0 | |
267 | idct_dc_add %+ SUFFIX: | |
268 | add r5, r0 | |
269 | movq m0, [r2+ 0] | |
270 | movhps m0, [r2+64] | |
271 | mov dword [r2+ 0], 0 | |
272 | mov dword [r2+64], 0 | |
273 | paddd m0, [pd_32] | |
274 | psrad m0, 6 | |
275 | pshufhw m0, m0, 0 | |
276 | pshuflw m0, m0, 0 | |
277 | lea r6, [r3*3] | |
278 | mova m6, [pw_pixel_max] | |
279 | IDCT_DC_ADD_OP_10 r5, r3, r6 | |
280 | ret | |
281 | ||
282 | cglobal h264_idct_add16intra_10,5,7,8 | |
283 | ADD16_OP_INTRA 0, 4+1*8 | |
284 | ADD16_OP_INTRA 2, 4+2*8 | |
285 | ADD16_OP_INTRA 4, 6+1*8 | |
286 | ADD16_OP_INTRA 6, 6+2*8 | |
287 | ADD16_OP_INTRA 8, 4+3*8 | |
288 | ADD16_OP_INTRA 10, 4+4*8 | |
289 | ADD16_OP_INTRA 12, 6+3*8 | |
290 | ADD16_OP_INTRA 14, 6+4*8 | |
291 | REP_RET | |
292 | AC 8 | |
293 | AC 10 | |
294 | AC 12 | |
295 | AC 14 | |
296 | AC 0 | |
297 | AC 2 | |
298 | AC 4 | |
299 | AC 6 | |
300 | %endmacro | |
301 | ||
302 | INIT_XMM sse2 | |
303 | IDCT_ADD16INTRA_10 | |
304 | %if HAVE_AVX_EXTERNAL | |
305 | INIT_XMM avx | |
306 | IDCT_ADD16INTRA_10 | |
307 | %endif | |
308 | ||
309 | %assign last_block 36 | |
310 | ;----------------------------------------------------------------------------- | |
311 | ; void ff_h264_idct_add8_10(pixel **dst, const int *block_offset, | |
312 | ; int16_t *block, int stride, | |
313 | ; const uint8_t nnzc[6*8]) | |
314 | ;----------------------------------------------------------------------------- | |
315 | %macro IDCT_ADD8 0 | |
316 | cglobal h264_idct_add8_10,5,8,7 | |
317 | %if ARCH_X86_64 | |
318 | mov r7, r0 | |
319 | %endif | |
320 | add r2, 1024 | |
321 | mov r0, [r0] | |
322 | ADD16_OP_INTRA 16, 4+ 6*8 | |
323 | ADD16_OP_INTRA 18, 4+ 7*8 | |
324 | add r2, 1024-128*2 | |
325 | %if ARCH_X86_64 | |
326 | mov r0, [r7+gprsize] | |
327 | %else | |
328 | mov r0, r0m | |
329 | mov r0, [r0+gprsize] | |
330 | %endif | |
331 | ADD16_OP_INTRA 32, 4+11*8 | |
332 | ADD16_OP_INTRA 34, 4+12*8 | |
333 | REP_RET | |
334 | AC 16 | |
335 | AC 18 | |
336 | AC 32 | |
337 | AC 34 | |
338 | ||
339 | %endmacro ; IDCT_ADD8 | |
340 | ||
341 | INIT_XMM sse2 | |
342 | IDCT_ADD8 | |
343 | %if HAVE_AVX_EXTERNAL | |
344 | INIT_XMM avx | |
345 | IDCT_ADD8 | |
346 | %endif | |
347 | ||
;-----------------------------------------------------------------------------
; void ff_h264_idct8_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
351 | %macro IDCT8_1D 2 | |
352 | SWAP 0, 1 | |
353 | psrad m4, m5, 1 | |
354 | psrad m1, m0, 1 | |
355 | paddd m4, m5 | |
356 | paddd m1, m0 | |
357 | paddd m4, m7 | |
358 | paddd m1, m5 | |
359 | psubd m4, m0 | |
360 | paddd m1, m3 | |
361 | ||
362 | psubd m0, m3 | |
363 | psubd m5, m3 | |
364 | paddd m0, m7 | |
365 | psubd m5, m7 | |
366 | psrad m3, 1 | |
367 | psrad m7, 1 | |
368 | psubd m0, m3 | |
369 | psubd m5, m7 | |
370 | ||
371 | SWAP 1, 7 | |
372 | psrad m1, m7, 2 | |
373 | psrad m3, m4, 2 | |
374 | paddd m3, m0 | |
375 | psrad m0, 2 | |
376 | paddd m1, m5 | |
377 | psrad m5, 2 | |
378 | psubd m0, m4 | |
379 | psubd m7, m5 | |
380 | ||
381 | SWAP 5, 6 | |
382 | psrad m4, m2, 1 | |
383 | psrad m6, m5, 1 | |
384 | psubd m4, m5 | |
385 | paddd m6, m2 | |
386 | ||
387 | mova m2, %1 | |
388 | mova m5, %2 | |
389 | SUMSUB_BA d, 5, 2 | |
390 | SUMSUB_BA d, 6, 5 | |
391 | SUMSUB_BA d, 4, 2 | |
392 | SUMSUB_BA d, 7, 6 | |
393 | SUMSUB_BA d, 0, 4 | |
394 | SUMSUB_BA d, 3, 2 | |
395 | SUMSUB_BA d, 1, 5 | |
396 | SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567 | |
397 | %endmacro | |
398 | ||
399 | %macro IDCT8_1D_FULL 1 | |
400 | mova m7, [%1+112*2] | |
401 | mova m6, [%1+ 96*2] | |
402 | mova m5, [%1+ 80*2] | |
403 | mova m3, [%1+ 48*2] | |
404 | mova m2, [%1+ 32*2] | |
405 | mova m1, [%1+ 16*2] | |
406 | IDCT8_1D [%1], [%1+ 64*2] | |
407 | %endmacro | |
408 | ||
409 | ; %1=int16_t *block, %2=int16_t *dstblock | |
410 | %macro IDCT8_ADD_SSE_START 2 | |
411 | IDCT8_1D_FULL %1 | |
412 | %if ARCH_X86_64 | |
413 | TRANSPOSE4x4D 0,1,2,3,8 | |
414 | mova [%2 ], m0 | |
415 | TRANSPOSE4x4D 4,5,6,7,8 | |
416 | mova [%2+8*2], m4 | |
417 | %else | |
418 | mova [%1], m7 | |
419 | TRANSPOSE4x4D 0,1,2,3,7 | |
420 | mova m7, [%1] | |
421 | mova [%2 ], m0 | |
422 | mova [%2+16*2], m1 | |
423 | mova [%2+32*2], m2 | |
424 | mova [%2+48*2], m3 | |
425 | TRANSPOSE4x4D 4,5,6,7,3 | |
426 | mova [%2+ 8*2], m4 | |
427 | mova [%2+24*2], m5 | |
428 | mova [%2+40*2], m6 | |
429 | mova [%2+56*2], m7 | |
430 | %endif | |
431 | %endmacro | |
432 | ||
433 | ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride | |
434 | %macro IDCT8_ADD_SSE_END 3 | |
435 | IDCT8_1D_FULL %2 | |
436 | mova [%2 ], m6 | |
437 | mova [%2+16*2], m7 | |
438 | ||
439 | pxor m7, m7 | |
440 | STORE_DIFFx2 m0, m1, m6, m7, %1, %3 | |
441 | lea %1, [%1+%3*2] | |
442 | STORE_DIFFx2 m2, m3, m6, m7, %1, %3 | |
443 | mova m0, [%2 ] | |
444 | mova m1, [%2+16*2] | |
445 | lea %1, [%1+%3*2] | |
446 | STORE_DIFFx2 m4, m5, m6, m7, %1, %3 | |
447 | lea %1, [%1+%3*2] | |
448 | STORE_DIFFx2 m0, m1, m6, m7, %1, %3 | |
449 | %endmacro | |
450 | ||
451 | %macro IDCT8_ADD 0 | |
452 | cglobal h264_idct8_add_10, 3,4,16 | |
453 | %if UNIX64 == 0 | |
454 | %assign pad 16-gprsize-(stack_offset&15) | |
455 | sub rsp, pad | |
456 | call h264_idct8_add1_10 %+ SUFFIX | |
457 | add rsp, pad | |
458 | RET | |
459 | %endif | |
460 | ||
461 | ALIGN 16 | |
462 | ; TODO: does not need to use stack | |
463 | h264_idct8_add1_10 %+ SUFFIX: | |
464 | %assign pad 256+16-gprsize | |
465 | sub rsp, pad | |
466 | add dword [r1], 32 | |
467 | ||
468 | %if ARCH_X86_64 | |
469 | IDCT8_ADD_SSE_START r1, rsp | |
470 | SWAP 1, 9 | |
471 | SWAP 2, 10 | |
472 | SWAP 3, 11 | |
473 | SWAP 5, 13 | |
474 | SWAP 6, 14 | |
475 | SWAP 7, 15 | |
476 | IDCT8_ADD_SSE_START r1+16, rsp+128 | |
477 | PERMUTE 1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7 | |
478 | IDCT8_1D [rsp], [rsp+128] | |
479 | SWAP 0, 8 | |
480 | SWAP 1, 9 | |
481 | SWAP 2, 10 | |
482 | SWAP 3, 11 | |
483 | SWAP 4, 12 | |
484 | SWAP 5, 13 | |
485 | SWAP 6, 14 | |
486 | SWAP 7, 15 | |
487 | IDCT8_1D [rsp+16], [rsp+144] | |
488 | psrad m8, 6 | |
489 | psrad m0, 6 | |
490 | packssdw m8, m0 | |
491 | paddsw m8, [r0] | |
492 | pxor m0, m0 | |
493 | mova [r1+ 0], m0 | |
494 | mova [r1+ 16], m0 | |
495 | mova [r1+ 32], m0 | |
496 | mova [r1+ 48], m0 | |
497 | mova [r1+ 64], m0 | |
498 | mova [r1+ 80], m0 | |
499 | mova [r1+ 96], m0 | |
500 | mova [r1+112], m0 | |
501 | mova [r1+128], m0 | |
502 | mova [r1+144], m0 | |
503 | mova [r1+160], m0 | |
504 | mova [r1+176], m0 | |
505 | mova [r1+192], m0 | |
506 | mova [r1+208], m0 | |
507 | mova [r1+224], m0 | |
508 | mova [r1+240], m0 | |
509 | CLIPW m8, m0, [pw_pixel_max] | |
510 | mova [r0], m8 | |
511 | mova m8, [pw_pixel_max] | |
512 | STORE_DIFF16 m9, m1, m0, m8, r0+r2 | |
513 | lea r0, [r0+r2*2] | |
514 | STORE_DIFF16 m10, m2, m0, m8, r0 | |
515 | STORE_DIFF16 m11, m3, m0, m8, r0+r2 | |
516 | lea r0, [r0+r2*2] | |
517 | STORE_DIFF16 m12, m4, m0, m8, r0 | |
518 | STORE_DIFF16 m13, m5, m0, m8, r0+r2 | |
519 | lea r0, [r0+r2*2] | |
520 | STORE_DIFF16 m14, m6, m0, m8, r0 | |
521 | STORE_DIFF16 m15, m7, m0, m8, r0+r2 | |
522 | %else | |
523 | IDCT8_ADD_SSE_START r1, rsp | |
524 | IDCT8_ADD_SSE_START r1+16, rsp+128 | |
525 | lea r3, [r0+8] | |
526 | IDCT8_ADD_SSE_END r0, rsp, r2 | |
527 | IDCT8_ADD_SSE_END r3, rsp+16, r2 | |
528 | mova [r1+ 0], m7 | |
529 | mova [r1+ 16], m7 | |
530 | mova [r1+ 32], m7 | |
531 | mova [r1+ 48], m7 | |
532 | mova [r1+ 64], m7 | |
533 | mova [r1+ 80], m7 | |
534 | mova [r1+ 96], m7 | |
535 | mova [r1+112], m7 | |
536 | mova [r1+128], m7 | |
537 | mova [r1+144], m7 | |
538 | mova [r1+160], m7 | |
539 | mova [r1+176], m7 | |
540 | mova [r1+192], m7 | |
541 | mova [r1+208], m7 | |
542 | mova [r1+224], m7 | |
543 | mova [r1+240], m7 | |
544 | %endif ; ARCH_X86_64 | |
545 | ||
546 | add rsp, pad | |
547 | ret | |
548 | %endmacro | |
549 | ||
550 | INIT_XMM sse2 | |
551 | IDCT8_ADD | |
552 | %if HAVE_AVX_EXTERNAL | |
553 | INIT_XMM avx | |
554 | IDCT8_ADD | |
555 | %endif | |
556 | ||
;-----------------------------------------------------------------------------
; void ff_h264_idct8_add4_10(pixel **dst, const int *block_offset,
;                            int16_t *block, int stride,
;                            const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
563 | %macro IDCT8_ADD4_OP 2 | |
564 | cmp byte [r4+%2], 0 | |
565 | jz .skipblock%1 | |
566 | mov r0d, [r6+%1*4] | |
567 | add r0, r5 | |
568 | call h264_idct8_add1_10 %+ SUFFIX | |
569 | .skipblock%1: | |
570 | %if %1<12 | |
571 | add r1, 256 | |
572 | %endif | |
573 | %endmacro | |
574 | ||
575 | %macro IDCT8_ADD4 0 | |
576 | cglobal h264_idct8_add4_10, 0,7,16 | |
577 | %assign pad 16-gprsize-(stack_offset&15) | |
578 | SUB rsp, pad | |
579 | mov r5, r0mp | |
580 | mov r6, r1mp | |
581 | mov r1, r2mp | |
582 | mov r2d, r3m | |
583 | movifnidn r4, r4mp | |
584 | IDCT8_ADD4_OP 0, 4+1*8 | |
585 | IDCT8_ADD4_OP 4, 6+1*8 | |
586 | IDCT8_ADD4_OP 8, 4+3*8 | |
587 | IDCT8_ADD4_OP 12, 6+3*8 | |
588 | ADD rsp, pad | |
589 | RET | |
590 | %endmacro ; IDCT8_ADD4 | |
591 | ||
592 | INIT_XMM sse2 | |
593 | IDCT8_ADD4 | |
594 | %if HAVE_AVX_EXTERNAL | |
595 | INIT_XMM avx | |
596 | IDCT8_ADD4 | |
597 | %endif |