Commit | Line | Data |
---|---|---|
2ba45a60 DM |
1 | ;***************************************************************************** |
2 | ;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code | |
3 | ;***************************************************************************** | |
4 | ;* Copyright (C) 2011 x264 project | |
5 | ;* | |
6 | ;* Authors: Daniel Kang <daniel.d.kang@gmail.com> | |
7 | ;* | |
8 | ;* This file is part of FFmpeg. | |
9 | ;* | |
10 | ;* FFmpeg is free software; you can redistribute it and/or | |
11 | ;* modify it under the terms of the GNU Lesser General Public | |
12 | ;* License as published by the Free Software Foundation; either | |
13 | ;* version 2.1 of the License, or (at your option) any later version. | |
14 | ;* | |
15 | ;* FFmpeg is distributed in the hope that it will be useful, | |
16 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
18 | ;* Lesser General Public License for more details. | |
19 | ;* | |
20 | ;* You should have received a copy of the GNU Lesser General Public | |
21 | ;* License along with FFmpeg; if not, write to the Free Software | |
22 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
23 | ;****************************************************************************** | |
24 | ||
25 | %include "libavutil/x86/x86util.asm" | |
26 | ||
SECTION_RODATA 32

; 10-bit pixels: legal sample values are 0..1023 = (1<<10)-1.
cextern pw_16
cextern pw_1
pb_0: times 32 db 0 ; we do not use cextern here as old llvm-gcc fails to align it correctly

pw_pixel_max: times 8 dw ((1 << 10)-1) ; upper clip bound for CLIPW

; Bias constants for the two-pass (hv) filters: put_hv*/put_h* subtract
; pad20 from the unrounded intermediates, and depad/depad2/unpad undo that
; bias after the second pass (see h*_loop_op and the mc12 body below).
pad10: times 8 dw 10*1023
pad20: times 8 dw 20*1023
pad30: times 8 dw 30*1023
depad: times 4 dd 32*20*1023 + 512
depad2: times 8 dw 20*1023 + 16*1022 + 16
unpad: times 8 dw 16*1022/32 ; needs to be mod 16

; pmaddwd tap pairs for the 6-tap (1,-5,20,20,-5,1) half-pel filter
tap1: times 4 dw 1, -5
tap2: times 4 dw 20, 20
tap3: times 4 dw -5, 1
pd_0f: times 4 dd 0xffff ; per-dword mask keeping the low 16 bits
46 | ||
47 | SECTION .text | |
48 | ||
49 | ||
; AVG_MOV dst, src
; Store-with-average: used in place of mova by the avg_* functions.
; Averages src with the current destination contents and writes it back.
%macro AVG_MOV 2
    pavgw   %2, %1
    mova    %1, %2
%endmacro
54 | ||
; ADDW dst, src(mem), tmp
; dst += src. MMX adds straight from memory; for XMM the operand may be
; unaligned, so it is first loaded into the temporary register %3 with movu.
%macro ADDW 3
%if mmsize == 8
    paddw   %1, %2
%else
    movu    %3, %2
    paddw   %1, %3
%endif
%endmacro
63 | ||
; FILT_H a, b, c, rnd
; Shift/add evaluation of the 6-tap combine: %1 = (a + rnd - 5*b + 20*c)/16,
; where a = sum of the outer taps, b = middle taps, c = inner taps and %4 is
; the rounding constant (pw_16 at every call site).
%macro FILT_H 4
    paddw   %1, %4
    psubw   %1, %2  ; a-b
    psraw   %1, 2   ; (a-b)/4
    psubw   %1, %2  ; (a-b)/4-b
    paddw   %1, %3  ; (a-b)/4-b+c
    psraw   %1, 2   ; ((a-b)/4-b+c)/4
    paddw   %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
%endmacro
73 | ||
; Preload the first five source rows (src-2*stride .. src+2*stride) into
; m0-m4 for the vertical 6-tap filter. Leaves r1 = src+3*stride, i.e. the
; next row FILT_V will fetch. Clobbers r3 (set to stride*3).
%macro PRELOAD_V 0
    lea     r3, [r2*3]
    sub     r1, r3
    movu    m0, [r1+r2]
    movu    m1, [r1+r2*2]
    add     r1, r3
    movu    m2, [r1]
    movu    m3, [r1+r2]
    movu    m4, [r1+r2*2]
    add     r1, r3
%endmacro
85 | ||
; FILT_V a, b, c, d, e, new, t1, t2
; One output row of the vertical 6-tap filter: fetches the sixth row from
; [r1] into %6, forms the tap pair sums (a+new), (b+e), (c+d), runs FILT_H,
; halves and clips the result to [0, pixel_max]. %7/%8 are temporaries.
%macro FILT_V 8
    movu    %6, [r1]
    paddw   %1, %6
    mova    %7, %2
    paddw   %7, %5
    mova    %8, %3
    paddw   %8, %4
    FILT_H  %1, %7, %8, [pw_16]
    psraw   %1, 1
    CLIPW   %1, [pb_0], [pw_pixel_max]
%endmacro
97 | ||
; MC name
; Instantiate an mc macro for every put/avg x width x ISA combination:
; 4-wide with MMX (mmxext) and 8-wide with SSE2. OP_MOV selects a plain
; store for put and an average-with-destination (AVG_MOV) for avg.
%macro MC 1
%define OP_MOV mova
INIT_MMX mmxext
%1 put, 4
INIT_XMM sse2
%1 put, 8

%define OP_MOV AVG_MOV
INIT_MMX mmxext
%1 avg, 4
INIT_XMM sse2
%1 avg, 8
%endmacro
111 | ||
; MCAxA_OP put/avg, mc??, N, 2N, nargs, nregs, nxmm
; Build the 2Nx2N function out of four calls to the NxN stub, one per
; quadrant (offsets in bytes: pixels are 2 bytes wide, rows are r2 apart).
; x86_32 reloads dst/src from their argument stack slots between calls;
; x86_64 parks them in two extra registers (r<nregs> and r<nregs+1>).
%macro MCAxA_OP 7
%if ARCH_X86_32
cglobal %1_h264_qpel%4_%2_10, %5,%6,%7
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    mov  r0, r0m
    mov  r1, r1m
    add  r0, %3*2                       ; top-right quadrant
    add  r1, %3*2
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    mov  r0, r0m
    mov  r1, r1m
    lea  r0, [r0+r2*%3]                 ; bottom-left quadrant
    lea  r1, [r1+r2*%3]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    mov  r0, r0m
    mov  r1, r1m
    lea  r0, [r0+r2*%3+%3*2]            ; bottom-right quadrant
    lea  r1, [r1+r2*%3+%3*2]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    RET
%else ; ARCH_X86_64
cglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7
    mov  r%6, r0                        ; save dst in the first extra reg
%assign p1 %6+1
    mov  r %+ p1, r1                    ; save src in the second extra reg
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    lea  r0, [r%6+%3*2]
    lea  r1, [r %+ p1+%3*2]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    lea  r0, [r%6+r2*%3]
    lea  r1, [r %+ p1+r2*%3]
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    lea  r0, [r%6+r2*%3+%3*2]
    lea  r1, [r %+ p1+r2*%3+%3*2]
%if UNIX64 == 0 ; fall through to function
    ; on UNIX64 the stub follows immediately and needs no prologue, so the
    ; last quadrant is handled by falling through instead of call+RET
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    RET
%endif
%endif
%endmacro
152 | ||
153 | ;cpu, put/avg, mc, 4/8, ... | |
;cpu, put/avg, mc, 4/8, ...
; cglobal_mc put/avg, mc??, size, nargs, nregs, nxmm
; Declare the public qpel entry point for one mc mode, plus (via MCAxA_OP)
; the doubled-size wrapper built from four calls to it. The real body is
; emitted at stub_*: the public symbol either calls it (and RETs), or on
; UNIX64 — where no prologue/epilogue is needed — falls straight through.
%macro cglobal_mc 6
%assign i %3*2
%if ARCH_X86_32 || cpuflag(sse2)
MCAxA_OP %1, %2, %3, i, %4,%5,%6
%endif

cglobal %1_h264_qpel%3_%2_10, %4,%5,%6
%if UNIX64 == 0 ; no prologue or epilogue for UNIX64
    call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    RET
%endif

stub_%1_h264_qpel%3_%2_10 %+ SUFFIX:
%endmacro
168 | ||
169 | ;----------------------------------------------------------------------------- | |
170 | ; void ff_h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride) | |
171 | ;----------------------------------------------------------------------------- | |
; Copy (or average, depending on OP_MOV) four rows of mmsize bytes from
; src (r1) to dst (r0). r3 must already hold stride*3.
%macro COPY4 0
    movu m0, [r1     ]
    OP_MOV [r0     ], m0
    movu m0, [r1+r2  ]
    OP_MOV [r0+r2  ], m0
    movu m0, [r1+r2*2]
    OP_MOV [r0+r2*2], m0
    movu m0, [r1+r3  ]
    OP_MOV [r0+r3  ], m0
%endmacro
182 | ||
; MC00 put/avg
; Full-pel copy (no filtering): 4-wide via the cglobal_mc machinery (which
; also builds the 8-wide wrapper), 8-wide as two COPY4 groups, and 16-wide
; (32 bytes per row) as a dedicated 8x2-row loop.
%macro MC00 1
INIT_MMX mmxext
cglobal_mc %1, mc00, 4, 3,4,0
    lea  r3, [r2*3]
    COPY4
    ret

INIT_XMM sse2
cglobal %1_h264_qpel8_mc00_10, 3,4
    lea  r3, [r2*3]
    COPY4
    lea  r0, [r0+r2*4]
    lea  r1, [r1+r2*4]
    COPY4
    RET

cglobal %1_h264_qpel16_mc00_10, 3,4
    mov r3d, 8                  ; 8 iterations x 2 rows
.loop:
    movu m0, [r1      ]
    movu m1, [r1   +16]
    OP_MOV [r0      ], m0
    OP_MOV [r0   +16], m1
    movu m0, [r1+r2   ]
    movu m1, [r1+r2+16]
    OP_MOV [r0+r2   ], m0
    OP_MOV [r0+r2+16], m1
    lea  r0, [r0+r2*2]
    lea  r1, [r1+r2*2]
    dec r3d
    jg .loop
    REP_RET
%endmacro
216 | ||
; Instantiate mc00 for both operations: put = plain store, avg = pavgw
; with the existing destination.
%define OP_MOV mova
MC00 put

%define OP_MOV AVG_MOV
MC00 avg
222 | ||
223 | ;----------------------------------------------------------------------------- | |
224 | ; void ff_h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride) | |
225 | ;----------------------------------------------------------------------------- | |
; MC_CACHE name
; Like MC, but for the horizontal filters, which additionally get cache64
; variants (sse2+cache64, ssse3+cache64) that avoid unaligned loads
; crossing cache lines; a plain sse2 variant is emitted as well.
%macro MC_CACHE 1
%define OP_MOV mova
INIT_MMX mmxext
%1 put, 4
INIT_XMM sse2, cache64
%1 put, 8
INIT_XMM ssse3, cache64
%1 put, 8
INIT_XMM sse2
%1 put, 8

%define OP_MOV AVG_MOV
INIT_MMX mmxext
%1 avg, 4
INIT_XMM sse2, cache64
%1 avg, 8
INIT_XMM ssse3, cache64
%1 avg, 8
INIT_XMM sse2
%1 avg, 8
%endmacro
247 | ||
; MC20 put/avg, size
; Horizontal half-pel (2,0): 6-tap filter across each row, rounded, halved
; and clipped to [0, pixel_max]. m1 caches the clip bound; the rounding
; constant lives in m8 when enough registers are available.
%macro MC20 2
cglobal_mc %1, mc20, %2, 3,4,9
    mov     r3d, %2               ; row counter
    mova    m1, [pw_pixel_max]
%if num_mmregs > 8
    mova    m8, [pw_16]
    %define p16 m8
%else
    %define p16 [pw_16]
%endif
.nextrow:
; BUGFIX: this used to read "%if %0 == 4", a leftover from the pre-cpuflags
; version where fast-movu variants were tagged with an extra macro argument;
; with the 2-argument macro it was always false, leaving the movu path dead.
; The cache64 flag now distinguishes the variants, as the %else comment says.
%if notcpuflag(cache64)
    movu    m2, [r1-4]
    movu    m3, [r1-2]
    movu    m4, [r1+0]
    ADDW    m2, [r1+6], m5
    ADDW    m3, [r1+4], m5
    ADDW    m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
    movu    m2, [r1-4]
    movu    m0, [r1+6]
    mova    m6, m0
    psrldq  m0, 6

    paddw   m6, m2
    PALIGNR m3, m0, m2, 2, m5
    PALIGNR m7, m0, m2, 8, m5
    paddw   m3, m7
    PALIGNR m4, m0, m2, 4, m5
    PALIGNR m7, m0, m2, 6, m5
    paddw   m4, m7
    SWAP    2, 6
%else
    movu    m2, [r1-4]
    movu    m6, [r1+4]
    PALIGNR m3, m6, m2, 2, m5
    paddw   m3, m6
    PALIGNR m4, m6, m2, 4, m5
    PALIGNR m7, m6, m2, 6, m5
    paddw   m4, m7
    paddw   m2, [r1+6]
%endif
%endif

    FILT_H  m2, m3, m4, p16
    psraw   m2, 1
    pxor    m0, m0
    CLIPW   m2, m0, m1
    OP_MOV  [r0], m2
    add     r0, r2
    add     r1, r2
    dec     r3d
    jg .nextrow
    rep ret
%endmacro

MC_CACHE MC20
306 | ||
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; (3,0): same filter as mc10 but averaged with the pixel one to the right
; (10-bit pixels are 2 bytes), so point r4 there and reuse the mc10 body.
%macro MC30 2
cglobal_mc %1, mc30, %2, 3,5,9
    lea r4, [r1+2]
    jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body
%endmacro

MC_CACHE MC30
317 | ||
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; (1,0): horizontal half-pel filter averaged with the source row at r4
; (r4 = src here; mc30 enters at .body with r4 = src + one pixel).
%macro MC10 2
cglobal_mc %1, mc10, %2, 3,5,9
    mov      r4, r1
.body:
    mov     r3d, %2               ; row counter
    mova     m1, [pw_pixel_max]
%if num_mmregs > 8
    mova     m8, [pw_16]
    %define p16 m8
%else
    %define p16 [pw_16]
%endif
.nextrow:
; BUGFIX: was "%if %0 == 4" — always false with the 2-argument macro (a
; leftover from the pre-cpuflags variant tagging), so the fast movu path
; was dead code. Select on the cache64 flag instead, matching the %else
; comment.
%if notcpuflag(cache64)
    movu     m2, [r1-4]
    movu     m3, [r1-2]
    movu     m4, [r1+0]
    ADDW     m2, [r1+6], m5
    ADDW     m3, [r1+4], m5
    ADDW     m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
    movu     m2, [r1-4]
    movu     m0, [r1+6]
    mova     m6, m0
    psrldq   m0, 6

    paddw    m6, m2
    PALIGNR  m3, m0, m2, 2, m5
    PALIGNR  m7, m0, m2, 8, m5
    paddw    m3, m7
    PALIGNR  m4, m0, m2, 4, m5
    PALIGNR  m7, m0, m2, 6, m5
    paddw    m4, m7
    SWAP     2, 6
%else
    movu     m2, [r1-4]
    movu     m6, [r1+4]
    PALIGNR  m3, m6, m2, 2, m5
    paddw    m3, m6
    PALIGNR  m4, m6, m2, 4, m5
    PALIGNR  m7, m6, m2, 6, m5
    paddw    m4, m7
    paddw    m2, [r1+6]
%endif
%endif

    FILT_H   m2, m3, m4, p16
    psraw    m2, 1
    pxor     m0, m0
    CLIPW    m2, m0, m1
    movu     m3, [r4]            ; quarter-pel: average with the r4 row
    pavgw    m2, m3
    OP_MOV   [r0], m2
    add      r0, r2
    add      r1, r2
    add      r4, r2
    dec     r3d
    jg .nextrow
    rep ret
%endmacro

MC_CACHE MC10
384 | ||
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; V_FILT m0..m7, width, index
; Emit one vertical-filter subroutine v_filt<width>_<index>_10. The
; .no_addr4 entry skips the r4 bump for callers (mc02) that track no second
; pointer. Advances src (r1) and dst (r0) by one row; result lands in m0.
%macro V_FILT 10
v_filt%9_%10_10:  ; colon added: every other emitted label in this file has one
    add    r4, r2
.no_addr4:
    FILT_V m0, m1, m2, m3, m4, m5, m6, m7
    add    r1, r2
    add    r0, r2
    ret
%endmacro
397 | ||
; Instantiate the vertical-filter subroutines (4 for MMX, 6 for SSE2).
; SWAP rotates the register names between instances so the 5-row sliding
; window stays in registers; callers perform the same SWAP after each call.
INIT_MMX mmxext
RESET_MM_PERMUTATION
%assign i 0
%rep 4
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

INIT_XMM sse2
RESET_MM_PERMUTATION
%assign i 0
%rep 6
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
415 | ||
; MC02 put/avg, size
; Vertical half-pel (0,2): preload five rows, then one v_filt call per
; output row (the .no_addr4 entry — no second pointer needed), cycling
; through the register permutation with SWAP in lockstep with the stubs.
%macro MC02 2
cglobal_mc %1, mc02, %2, 3,4,8
    PRELOAD_V

    sub  r0, r2             ; v_filt advances r0 before the store
%assign j 0
%rep %2
    %assign i (j % 6)
    call v_filt%2_ %+ i %+ _10.no_addr4
    OP_MOV [r0], m0
    SWAP 0,1,2,3,4,5
%assign j j+1
%endrep
    ret
%endmacro

MC MC02
433 | ||
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; (0,1): vertical half-pel filter averaged with the source row tracked by
; r4 (r4 = src here; mc03 enters at .body with r4 = src+stride).
%macro MC01 2
cglobal_mc %1, mc01, %2, 3,5,8
    mov  r4, r1
.body:
    PRELOAD_V

    sub  r4, r2             ; v_filt advances r4 before use
    sub  r0, r2             ; and r0 before the store
%assign j 0
%rep %2
    %assign i (j % 6)
    call v_filt%2_ %+ i %+ _10
    movu m7, [r4]
    pavgw m0, m7            ; average filter result with the r4 row
    OP_MOV [r0], m0
    SWAP 0,1,2,3,4,5
%assign j j+1
%endrep
    ret
%endmacro

MC MC01
459 | ||
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; (0,3): same as mc01 but averaged with the row below; reuse the mc01 body.
%macro MC03 2
cglobal_mc %1, mc03, %2, 3,5,8
    lea  r4, [r1+r2]
    jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body
%endmacro

MC MC03
470 | ||
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; H_FILT_AVG width, index [, no_tail_load]
; Emit h_filt<width>_<index>_10: horizontal 6-tap filter on the row at r4,
; averaged (pavgw) into the vertical-filter result already in m0.
;FILT_H with fewer registers and averaged with the FILT_V result
;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used next in the next iteration
;unfortunately I need three registers, so m5 will have to be re-read from memory
%macro H_FILT_AVG 2-3
h_filt%1_%2_10:
    movu    m5, [r4-4]
    ADDW    m5, [r4+6], m7
    movu    m6, [r4-2]
    ADDW    m6, [r4+4], m7
    paddw   m5, [pw_16]
    psubw   m5, m6  ; a-b
    psraw   m5, 2   ; (a-b)/4
    psubw   m5, m6  ; (a-b)/4-b
    movu    m6, [r4+0]
    ADDW    m6, [r4+2], m7
    paddw   m5, m6  ; (a-b)/4-b+c
    psraw   m5, 2   ; ((a-b)/4-b+c)/4
    paddw   m5, m6  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
    psraw   m5, 1
    CLIPW   m5, [pb_0], [pw_pixel_max]
;avg FILT_V, FILT_H
    pavgw   m0, m5
; BUGFIX: this macro takes 2-3 arguments, so the old "%if %0!=4" was always
; true and the optional third argument could never suppress this load —
; contradicting both the ", 0" instantiations below and mc11's own
; compensating m5 load for the sse2 index-1 instance. Test the intended
; 3-argument form.
%if %0!=3
    movu    m5, [r1+r5]  ; preload for the next iteration (r5 = -stride in the mc11 body)
%endif
    ret
%endmacro
501 | ||
; Generate the horizontal-averaging stubs used by mc11/mc31/mc13/mc33,
; rotating register names with SWAP exactly as the caller does. Instances
; created with a third argument are the ones whose trailing m5 preload
; should be omitted: the last MMX instance, and SSE2 instance 1 (where the
; mc11 body performs that load itself).
INIT_MMX mmxext
RESET_MM_PERMUTATION
%assign i 0
%rep 3
H_FILT_AVG 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
H_FILT_AVG 4, i, 0

INIT_XMM sse2
RESET_MM_PERMUTATION
%assign i 0
%rep 6
%if i==1
H_FILT_AVG 8, i, 0
%else
H_FILT_AVG 8, i
%endif
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
524 | ||
; MC11 put/avg, size
; (1,1): average of the vertical and horizontal half-pel filters. Per row:
; v_filt computes the vertical result into m0, h_filt filters the row at r4
; and averages it in. r5 = -stride (used by h_filt's [r1+r5] preload).
%macro MC11 2
; this REALLY needs x86_64
cglobal_mc %1, mc11, %2, 3,6,8
    mov  r4, r1
.body:
    PRELOAD_V

    sub  r0, r2             ; v_filt advances r0/r4 before use
    sub  r4, r2
    mov  r5, r2
    neg  r5                 ; r5 = -stride
%assign j 0
%rep %2
    %assign i (j % 6)
    call v_filt%2_ %+ i %+ _10
    call h_filt%2_ %+ i %+ _10
%if %2==8 && i==1
    movu m5, [r1+r5]        ; load done here for the instance that skips it
%endif
    OP_MOV [r0], m0
    SWAP 0,1,2,3,4,5
%assign j j+1
%endrep
    ret
%endmacro

MC MC11
552 | ||
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; (3,1): as mc11 but with the two filter inputs offset by one pixel
; (2 bytes) horizontally; reuse the mc11 body.
%macro MC31 2
cglobal_mc %1, mc31, %2, 3,6,8
    mov  r4, r1
    add  r1, 2
    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro

MC MC31
564 | ||
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; (1,3): as mc11 but the horizontal filter runs on the row below.
; NOTE(review): declares 7 GPRs / 12 xmm while the mc11 body it jumps into
; declares 3,6,8 — presumably deliberate over-reservation; confirm intent.
%macro MC13 2
cglobal_mc %1, mc13, %2, 3,7,12
    lea  r4, [r1+r2]
    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro

MC MC13
575 | ||
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; (3,3): as mc11 with the horizontal row one line down and the vertical
; source one pixel right; reuse the mc11 body.
%macro MC33 2
cglobal_mc %1, mc33, %2, 3,6,8
    lea  r4, [r1+r2]
    add  r1, 2
    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro

MC MC33
587 | ||
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; FILT_H2 a, b, c
; Unrounded, unscaled 6-tap combine: %1 = a - 5*b + 20*c. Unlike FILT_H the
; full-precision intermediate is kept (for the two-pass hv filters).
; Clobbers %2 and %3.
%macro FILT_H2 3
    psubw   %1, %2  ; a-b
    psubw   %2, %3  ; b-c
    psllw   %2, 2
    psubw   %1, %2  ; a-5*b+4*c
    psllw   %3, 4
    paddw   %1, %3  ; a-5*b+20*c
%endmacro
599 | ||
; FILT_VNRD a, b, c, d, e, new, t1, t2
; Vertical 6-tap like FILT_V, but No Rounding: loads the next row from [r1]
; and produces the raw a-5*b+20*c intermediate via FILT_H2 — no shift, no
; clip — for consumption by the second (horizontal) pass.
%macro FILT_VNRD 8
    movu    %6, [r1]
    paddw   %1, %6
    mova    %7, %2
    paddw   %7, %5
    mova    %8, %3
    paddw   %8, %4
    FILT_H2 %1, %7, %8
%endmacro
609 | ||
; HV width
; Emit put_hv<width>_10: the vertical pass of the center (hv) filters over a
; strip wider than the block, written as pad20-debiased 16-bit intermediates
; into the caller's stack buffer (rows stored with a stride of mmsize*3).
; The strip is processed in COUNT column chunks of mmsize bytes; PAD is the
; per-chunk horizontal adjustment. r2 (stride) is negated throughout so row
; stepping is "sub r1, r2"; it is restored before returning.
%macro HV 1
%if mmsize==16
%define PAD 12
%define COUNT 2
%else
%define PAD 4
%define COUNT 3
%endif
put_hv%1_10:
    neg      r2           ; This actually saves instructions
    lea      r1, [r1+r2*2-mmsize+PAD]  ; start two rows above, left of the block
    lea      r4, [rsp+PAD+gprsize]     ; output row pointer into the scratch buffer
    mov     r3d, COUNT
.v_loop:
    ; load the first five rows of this column chunk
    movu     m0, [r1]
    sub      r1, r2
    movu     m1, [r1]
    sub      r1, r2
    movu     m2, [r1]
    sub      r1, r2
    movu     m3, [r1]
    sub      r1, r2
    movu     m4, [r1]
    sub      r1, r2
%assign i 0
%rep %1-1
    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
    psubw    m0, [pad20]               ; de-bias the intermediate
    movu     [r4+i*mmsize*3], m0
    sub      r1, r2
    SWAP 0,1,2,3,4,5                   ; slide the 5-row window
%assign i i+1
%endrep
    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
    psubw    m0, [pad20]
    movu     [r4+i*mmsize*3], m0
    add      r4, mmsize                ; next column chunk in the buffer
    lea      r1, [r1+r2*8+mmsize]      ; rewind rows, step one chunk right
%if %1==8
    lea      r1, [r1+r2*4]
%endif
    dec      r3d
    jg .v_loop
    neg      r2                        ; restore the stride sign
    ret
%endmacro

INIT_MMX mmxext
HV 4
INIT_XMM sse2
HV 8
661 | ||
; H_LOOP width
; Emit h<width>_loop_op: one output row of the horizontal pass of the hv
; filters. r1 walks the biased 16-bit intermediates written by put_hv*;
; pmaddwd with the (1,-5)/(20,20)/(-5,1) tap pairs yields 32-bit sums for
; even/odd output pixels, which are de-biased (d1 = depad, includes the
; rounding term), shifted down by 10, repacked into 16-bit words and clipped
; against m7 (pixel_max, loaded by the caller). Result in m1.
%macro H_LOOP 1
%if num_mmregs > 8
%define s1 m8
%define s2 m9
%define s3 m10
%define d1 m11
%else
%define s1 [tap1]
%define s2 [tap2]
%define s3 [tap3]
%define d1 [depad]
%endif
h%1_loop_op:
    movu       m1, [r1+mmsize-4]
    movu       m2, [r1+mmsize-2]
    mova       m3, [r1+mmsize+0]
    movu       m4, [r1+mmsize+2]
    movu       m5, [r1+mmsize+4]
    movu       m6, [r1+mmsize+6]
%if num_mmregs > 8
    pmaddwd    m1, s1
    pmaddwd    m2, s1
    pmaddwd    m3, s2
    pmaddwd    m4, s2
    pmaddwd    m5, s3
    pmaddwd    m6, s3
    paddd      m1, d1
    paddd      m2, d1
%else
    ; not enough registers: reload each constant through m0
    mova       m0, s1
    pmaddwd    m1, m0
    pmaddwd    m2, m0
    mova       m0, s2
    pmaddwd    m3, m0
    pmaddwd    m4, m0
    mova       m0, s3
    pmaddwd    m5, m0
    pmaddwd    m6, m0
    mova       m0, d1
    paddd      m1, m0
    paddd      m2, m0
%endif
    paddd      m3, m5
    paddd      m4, m6
    paddd      m1, m3
    paddd      m2, m4
    psrad      m1, 10
    psrad      m2, 10
    pslld      m2, 16       ; odd results into the high word of each dword
    pand       m1, [pd_0f]  ; even results kept in the low word
    por        m1, m2       ; interleave back into packed words
%if num_mmregs <= 8
    pxor       m0, m0       ; m0 (zero) was clobbered above; caller keeps it otherwise
%endif
    CLIPW      m1, m0, m7
    add        r1, mmsize*3 ; advance to the next buffered row
    ret
%endmacro

INIT_MMX mmxext
H_LOOP 4
INIT_XMM sse2
H_LOOP 8
725 | ||
; MC22 put/avg, size
; Center half-pel (2,2): vertical pass into an aligned stack buffer
; (put_hv*), then one h*_loop_op call per output row over that buffer.
%macro MC22 2
cglobal_mc %1, mc22, %2, 3,7,12
%define PAD mmsize*8*4*2      ; SIZE*16*4*sizeof(pixel)
    mov      r6, rsp          ; backup stack pointer
    and     rsp, ~(mmsize-1)  ; align stack
    sub     rsp, PAD

    call put_hv%2_10

    mov       r3d, %2
    mova       m7, [pw_pixel_max]
%if num_mmregs > 8
    ; keep the constants h_loop_op needs in registers
    pxor       m0, m0
    mova       m8, [tap1]
    mova       m9, [tap2]
    mova      m10, [tap3]
    mova      m11, [depad]
%endif
    mov       r1, rsp         ; r1 walks the intermediate buffer
.h_loop:
    call h%2_loop_op

    OP_MOV   [r0], m1
    add       r0, r2
    dec      r3d
    jg .h_loop

    mov      rsp, r6          ; restore stack pointer
    ret
%endmacro

MC MC22
758 | ||
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; (1,2): the hv result averaged with a de-biased column of intermediates.
; r4 is a byte offset relative to the current hv buffer row: 0 here; mc32
; enters at .body with r4 = 2 (one pixel right), mc21 with r4 pointing into
; its separate H-filter buffer.
%macro MC12 2
cglobal_mc %1, mc12, %2, 3,7,12
%define PAD mmsize*8*4*2        ; SIZE*16*4*sizeof(pixel)
    mov        r6, rsp          ; backup stack pointer
    and       rsp, ~(mmsize-1)  ; align stack
    sub       rsp, PAD

    call put_hv%2_10

    xor       r4d, r4d
.body:
    mov       r3d, %2
    pxor       m0, m0
    mova       m7, [pw_pixel_max]
%if num_mmregs > 8
    mova       m8, [tap1]
    mova       m9, [tap2]
    mova      m10, [tap3]
    mova      m11, [depad]
%endif
    mov        r1, rsp          ; r1 walks the hv intermediate buffer
.h_loop:
    call h%2_loop_op

    movu       m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
    paddw      m3, [depad2]         ; undo the pad20 bias, with rounding
    psrlw      m3, 5
    psubw      m3, [unpad]
    CLIPW      m3, m0, m7
    pavgw      m1, m3               ; average with the hv result

    OP_MOV   [r0], m1
    add        r0, r2
    dec       r3d
    jg .h_loop

    mov       rsp, r6          ; restore stack pointer
    ret
%endmacro

MC MC12
803 | ||
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; (3,2): like mc12 but averaged with the column one pixel to the right,
; selected by entering the mc12 body with r4 = sizeof(pixel).
%macro MC32 2
cglobal_mc %1, mc32, %2, 3,7,12
%define PAD mmsize*8*3*2  ; SIZE*16*4*sizeof(pixel)
    mov  r6, rsp          ; backup stack pointer
    and rsp, ~(mmsize-1)  ; align stack
    sub rsp, PAD

    call put_hv%2_10

    mov r4d, 2            ; sizeof(pixel)
    jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
%endmacro

MC MC32
821 | ||
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; H_NRD width
; Emit put_h<width>_10: unrounded horizontal 6-tap pass over the rows at
; r5, storing pad20-debiased 16-bit intermediates into the caller's stack
; buffer (same mmsize*3 row stride as put_hv*).
%macro H_NRD 1
put_h%1_10:
    add       rsp, gprsize ; step over the return address: [rsp+r4] then addresses the buffer the caller allocated just before the call
    mov       r3d, %1
    xor       r4d, r4d     ; r4 = byte offset of the current output row
    mova       m6, [pad20]
.nextrow:
    movu       m2, [r5-4]
    movu       m3, [r5-2]
    movu       m4, [r5+0]
    ADDW       m2, [r5+6], m5
    ADDW       m3, [r5+4], m5
    ADDW       m4, [r5+2], m5

    FILT_H2    m2, m3, m4
    psubw      m2, m6      ; de-bias
    mova [rsp+r4], m2
    add       r4d, mmsize*3
    add        r5, r2
    dec       r3d
    jg .nextrow
    sub       rsp, gprsize ; undo the adjustment so ret works
    ret
%endmacro

INIT_MMX mmxext
H_NRD 4
INIT_XMM sse2
H_NRD 8
854 | ||
; MC21 put/avg, size
; (2,1): horizontal pass (put_h) into one stack buffer, hv pass (put_hv)
; into a second below it, then reuse the mc12 tail with r4 = PAD-mmsize so
; the averaging reads from the H buffer instead of the hv buffer.
%macro MC21 2
cglobal_mc %1, mc21, %2, 3,7,12
    mov   r5, r1           ; r5 = source row for the H pass
.body:
%define PAD mmsize*8*3*2   ; SIZE*16*4*sizeof(pixel)
    mov   r6, rsp          ; backup stack pointer
    and  rsp, ~(mmsize-1)  ; align stack

    sub  rsp, PAD
    call put_h%2_10

    sub  rsp, PAD
    call put_hv%2_10

    mov  r4d, PAD-mmsize   ; H buffer
    jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
%endmacro

MC MC21
874 | ||
;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; (2,3): same as mc21 but the H pass runs on the row below; reuse .body.
%macro MC23 2
cglobal_mc %1, mc23, %2, 3,7,12
    lea   r5, [r1+r2]
    jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body
%endmacro

MC MC23