1;*****************************************************************************
2;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
3;*****************************************************************************
4;* Copyright (C) 2011 x264 project
5;*
6;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
7;*
8;* This file is part of FFmpeg.
9;*
10;* FFmpeg is free software; you can redistribute it and/or
11;* modify it under the terms of the GNU Lesser General Public
12;* License as published by the Free Software Foundation; either
13;* version 2.1 of the License, or (at your option) any later version.
14;*
15;* FFmpeg is distributed in the hope that it will be useful,
16;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18;* Lesser General Public License for more details.
19;*
20;* You should have received a copy of the GNU Lesser General Public
21;* License along with FFmpeg; if not, write to the Free Software
22;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23;******************************************************************************
24
25%include "libavutil/x86/x86util.asm"
26
27SECTION_RODATA 32
28
29cextern pw_16
30cextern pw_1
31pb_0: times 32 db 0 ; we do not use cextern here as old llvm-gcc fails to align it correctly
32
33pw_pixel_max: times 8 dw ((1 << 10)-1)
34
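; bias constants for the two-pass (hv) filters below: the 6-tap kernel
; (1,-5,20,20,-5,1) sums to 32, so the first pass stores its rows minus pad20
; and the second pass adds depad (32*20*1023, plus 512 for rounding) back
; before the final >>10; depad2/unpad undo the same bias when the shared mc12
; loop recovers a clipped one-pass result again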
pad10:  times 8 dw    10*1023
pad20:  times 8 dw    20*1023
pad30:  times 8 dw    30*1023
depad:  times 4 dd 32*20*1023 + 512
depad2: times 8 dw 20*1023 + 16*1022 + 16
unpad:  times 8 dw 16*1022/32 ; needs to be mod 16

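; 6-tap coefficients split into word pairs for pmaddwd in the second
; (horizontal) pass of the hv filter; pd_0f masks each dword result to its
; low word when two result vectors are merged back into one row of pixels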
tap1:  times 4 dw  1, -5
tap2:  times 4 dw 20, 20
tap3:  times 4 dw -5,  1
pd_0f: times 4 dd 0xffff

SECTION .text


%macro AVG_MOV 2
    pavgw   %2, %1
    mova    %1, %2
%endmacro

%macro ADDW 3
%if mmsize == 8
    paddw   %1, %2
%else
    movu    %3, %2
    paddw   %1, %3
%endif
%endmacro

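; filter the three pair sums a=%1, b=%2, c=%3 (with the rounding bias %4 added
; to a), giving (a-5*b+20*c)/16 using only shifts and adds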
%macro FILT_H 4
    paddw   %1, %4
    psubw   %1, %2  ; a-b
    psraw   %1, 2   ; (a-b)/4
    psubw   %1, %2  ; (a-b)/4-b
    paddw   %1, %3  ; (a-b)/4-b+c
    psraw   %1, 2   ; ((a-b)/4-b+c)/4
    paddw   %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
%endmacro

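; load the first five source rows (two above, current, two below) into m0-m4
; and leave r1 pointing at the next row to be read by FILT_V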
%macro PRELOAD_V 0
    lea     r3, [r2*3]
    sub     r1, r3
    movu    m0, [r1+r2]
    movu    m1, [r1+r2*2]
    add     r1, r3
    movu    m2, [r1]
    movu    m3, [r1+r2]
    movu    m4, [r1+r2*2]
    add     r1, r3
%endmacro

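; one vertical 6-tap step: read the sixth row of the window from [r1], form
; the three pair sums, run FILT_H with pw_16 rounding, then >>1 and clip to
; [0, pixel_max]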
%macro FILT_V 8
    movu    %6, [r1]
    paddw   %1, %6
    mova    %7, %2
    paddw   %7, %5
    mova    %8, %3
    paddw   %8, %4
    FILT_H  %1, %7, %8, [pw_16]
    psraw   %1, 1
    CLIPW   %1, [pb_0], [pw_pixel_max]
%endmacro

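; instantiate put_ and avg_ versions of a function macro for mmxext (4-wide)
; and sse2 (8-wide)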
%macro MC 1
%define OP_MOV mova
INIT_MMX mmxext
%1 put, 4
INIT_XMM sse2
%1 put, 8

%define OP_MOV AVG_MOV
INIT_MMX mmxext
%1 avg, 4
INIT_XMM sse2
%1 avg, 8
%endmacro

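; build the %4-sized (2Nx2N) function from four calls to the NxN stub, one per
; quadrant; on UNIX64 the last call is omitted and execution falls through
; into the stub emitted right after by cglobal_mc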
%macro MCAxA_OP 7
%if ARCH_X86_32
cglobal %1_h264_qpel%4_%2_10, %5,%6,%7
    call    stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    mov     r0, r0m
    mov     r1, r1m
    add     r0, %3*2
    add     r1, %3*2
    call    stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    mov     r0, r0m
    mov     r1, r1m
    lea     r0, [r0+r2*%3]
    lea     r1, [r1+r2*%3]
    call    stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    mov     r0, r0m
    mov     r1, r1m
    lea     r0, [r0+r2*%3+%3*2]
    lea     r1, [r1+r2*%3+%3*2]
    call    stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    RET
%else ; ARCH_X86_64
cglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7
    mov     r%6, r0
%assign p1 %6+1
    mov     r %+ p1, r1
    call    stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    lea     r0, [r%6+%3*2]
    lea     r1, [r %+ p1+%3*2]
    call    stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    lea     r0, [r%6+r2*%3]
    lea     r1, [r %+ p1+r2*%3]
    call    stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    lea     r0, [r%6+r2*%3+%3*2]
    lea     r1, [r %+ p1+r2*%3+%3*2]
%if UNIX64 == 0 ; fall through to function
    call    stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    RET
%endif
%endif
%endmacro

; put/avg, mc, 4/8, then cglobal's arg/gpr/xmm counts
%macro cglobal_mc 6
%assign i %3*2
%if ARCH_X86_32 || cpuflag(sse2)
MCAxA_OP %1, %2, %3, i, %4,%5,%6
%endif

cglobal %1_h264_qpel%3_%2_10, %4,%5,%6
%if UNIX64 == 0 ; no prologue or epilogue for UNIX64
    call    stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
    RET
%endif

stub_%1_h264_qpel%3_%2_10 %+ SUFFIX:
%endmacro

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro COPY4 0
    movu    m0, [r1]
    OP_MOV  [r0], m0
    movu    m0, [r1+r2]
    OP_MOV  [r0+r2], m0
    movu    m0, [r1+r2*2]
    OP_MOV  [r0+r2*2], m0
    movu    m0, [r1+r3]
    OP_MOV  [r0+r3], m0
%endmacro

%macro MC00 1
INIT_MMX mmxext
cglobal_mc %1, mc00, 4, 3,4,0
    lea     r3, [r2*3]
    COPY4
    ret

INIT_XMM sse2
cglobal %1_h264_qpel8_mc00_10, 3,4
    lea     r3, [r2*3]
    COPY4
    lea     r0, [r0+r2*4]
    lea     r1, [r1+r2*4]
    COPY4
    RET

cglobal %1_h264_qpel16_mc00_10, 3,4
    mov     r3d, 8
.loop:
    movu    m0, [r1]
    movu    m1, [r1+16]
    OP_MOV  [r0], m0
    OP_MOV  [r0+16], m1
    movu    m0, [r1+r2]
    movu    m1, [r1+r2+16]
    OP_MOV  [r0+r2], m0
    OP_MOV  [r0+r2+16], m1
    lea     r0, [r0+r2*2]
    lea     r1, [r1+r2*2]
    dec     r3d
    jg .loop
    REP_RET
%endmacro

%define OP_MOV mova
MC00 put

%define OP_MOV AVG_MOV
MC00 avg

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
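; like MC above, but also instantiates cache64-tuned sse2/ssse3 versions for
; the horizontal filters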
%macro MC_CACHE 1
%define OP_MOV mova
INIT_MMX mmxext
%1 put, 4
INIT_XMM sse2, cache64
%1 put, 8
INIT_XMM ssse3, cache64
%1 put, 8
INIT_XMM sse2
%1 put, 8

%define OP_MOV AVG_MOV
INIT_MMX mmxext
%1 avg, 4
INIT_XMM sse2, cache64
%1 avg, 8
INIT_XMM ssse3, cache64
%1 avg, 8
INIT_XMM sse2
%1 avg, 8
%endmacro

%macro MC20 2
cglobal_mc %1, mc20, %2, 3,4,9
    mov     r3d, %2
    mova    m1, [pw_pixel_max]
%if num_mmregs > 8
    mova    m8, [pw_16]
    %define p16 m8
%else
    %define p16 [pw_16]
%endif
.nextrow:
%if %0 == 4
    movu    m2, [r1-4]
    movu    m3, [r1-2]
    movu    m4, [r1+0]
    ADDW    m2, [r1+6], m5
    ADDW    m3, [r1+4], m5
    ADDW    m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
    movu    m2, [r1-4]
    movu    m0, [r1+6]
    mova    m6, m0
    psrldq  m0, 6

    paddw   m6, m2
    PALIGNR m3, m0, m2, 2, m5
    PALIGNR m7, m0, m2, 8, m5
    paddw   m3, m7
    PALIGNR m4, m0, m2, 4, m5
    PALIGNR m7, m0, m2, 6, m5
    paddw   m4, m7
    SWAP    2, 6
%else
    movu    m2, [r1-4]
    movu    m6, [r1+4]
    PALIGNR m3, m6, m2, 2, m5
    paddw   m3, m6
    PALIGNR m4, m6, m2, 4, m5
    PALIGNR m7, m6, m2, 6, m5
    paddw   m4, m7
    paddw   m2, [r1+6]
%endif
%endif

    FILT_H  m2, m3, m4, p16
    psraw   m2, 1
    pxor    m0, m0
    CLIPW   m2, m0, m1
    OP_MOV  [r0], m2
    add     r0, r2
    add     r1, r2
    dec     r3d
    jg .nextrow
    rep ret
%endmacro

MC_CACHE MC20

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC30 2
cglobal_mc %1, mc30, %2, 3,5,9
    lea     r4, [r1+2]
    jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body
%endmacro

MC_CACHE MC30

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC10 2
cglobal_mc %1, mc10, %2, 3,5,9
    mov     r4, r1
.body:
    mov     r3d, %2
    mova    m1, [pw_pixel_max]
%if num_mmregs > 8
    mova    m8, [pw_16]
    %define p16 m8
%else
    %define p16 [pw_16]
%endif
.nextrow:
%if %0 == 4
    movu    m2, [r1-4]
    movu    m3, [r1-2]
    movu    m4, [r1+0]
    ADDW    m2, [r1+6], m5
    ADDW    m3, [r1+4], m5
    ADDW    m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
    movu    m2, [r1-4]
    movu    m0, [r1+6]
    mova    m6, m0
    psrldq  m0, 6

    paddw   m6, m2
    PALIGNR m3, m0, m2, 2, m5
    PALIGNR m7, m0, m2, 8, m5
    paddw   m3, m7
    PALIGNR m4, m0, m2, 4, m5
    PALIGNR m7, m0, m2, 6, m5
    paddw   m4, m7
    SWAP    2, 6
%else
    movu    m2, [r1-4]
    movu    m6, [r1+4]
    PALIGNR m3, m6, m2, 2, m5
    paddw   m3, m6
    PALIGNR m4, m6, m2, 4, m5
    PALIGNR m7, m6, m2, 6, m5
    paddw   m4, m7
    paddw   m2, [r1+6]
%endif
%endif

    FILT_H  m2, m3, m4, p16
    psraw   m2, 1
    pxor    m0, m0
    CLIPW   m2, m0, m1
    movu    m3, [r4]
    pavgw   m2, m3
    OP_MOV  [r0], m2
    add     r0, r2
    add     r1, r2
    add     r4, r2
    dec     r3d
    jg .nextrow
    rep ret
%endmacro

MC_CACHE MC10

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro V_FILT 10
v_filt%9_%10_10:
    add     r4, r2
.no_addr4:
    FILT_V  m0, m1, m2, m3, m4, m5, m6, m7
    add     r1, r2
    add     r0, r2
    ret
%endmacro

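; emit one v_filt<size>_<i>_10 entry point per register rotation; callers
; cycle through them with SWAP so the sliding window of source rows never has
; to be copied between registers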
INIT_MMX mmxext
RESET_MM_PERMUTATION
%assign i 0
%rep 4
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

INIT_XMM sse2
RESET_MM_PERMUTATION
%assign i 0
%rep 6
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

%macro MC02 2
cglobal_mc %1, mc02, %2, 3,4,8
    PRELOAD_V

    sub     r0, r2
%assign j 0
%rep %2
    %assign i (j % 6)
    call    v_filt%2_ %+ i %+ _10.no_addr4
    OP_MOV  [r0], m0
    SWAP 0,1,2,3,4,5
    %assign j j+1
%endrep
    ret
%endmacro

MC MC02

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC01 2
cglobal_mc %1, mc01, %2, 3,5,8
    mov     r4, r1
.body:
    PRELOAD_V

    sub     r4, r2
    sub     r0, r2
%assign j 0
%rep %2
    %assign i (j % 6)
    call    v_filt%2_ %+ i %+ _10
    movu    m7, [r4]
    pavgw   m0, m7
    OP_MOV  [r0], m0
    SWAP 0,1,2,3,4,5
    %assign j j+1
%endrep
    ret
%endmacro

MC MC01

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC03 2
cglobal_mc %1, mc03, %2, 3,5,8
    lea     r4, [r1+r2]
    jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body
%endmacro

MC MC03

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro H_FILT_AVG 2-3
h_filt%1_%2_10:
;FILT_H with fewer registers, averaged with the FILT_V result
;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used in the next iteration
;unfortunately three registers are needed, so m5 has to be re-read from memory
    movu    m5, [r4-4]
    ADDW    m5, [r4+6], m7
    movu    m6, [r4-2]
    ADDW    m6, [r4+4], m7
    paddw   m5, [pw_16]
    psubw   m5, m6  ; a-b
    psraw   m5, 2   ; (a-b)/4
    psubw   m5, m6  ; (a-b)/4-b
    movu    m6, [r4+0]
    ADDW    m6, [r4+2], m7
    paddw   m5, m6  ; (a-b)/4-b+c
    psraw   m5, 2   ; ((a-b)/4-b+c)/4
    paddw   m5, m6  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
    psraw   m5, 1
    CLIPW   m5, [pb_0], [pw_pixel_max]
;avg FILT_V, FILT_H
    pavgw   m0, m5
%if %0!=4
    movu    m5, [r1+r5]
%endif
    ret
%endmacro

INIT_MMX mmxext
RESET_MM_PERMUTATION
%assign i 0
%rep 3
H_FILT_AVG 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
H_FILT_AVG 4, i, 0

INIT_XMM sse2
RESET_MM_PERMUTATION
%assign i 0
%rep 6
%if i==1
H_FILT_AVG 8, i, 0
%else
H_FILT_AVG 8, i
%endif
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

%macro MC11 2
; this REALLY needs x86_64
cglobal_mc %1, mc11, %2, 3,6,8
    mov     r4, r1
.body:
    PRELOAD_V

    sub     r0, r2
    sub     r4, r2
    mov     r5, r2
    neg     r5
%assign j 0
%rep %2
    %assign i (j % 6)
    call    v_filt%2_ %+ i %+ _10
    call    h_filt%2_ %+ i %+ _10
%if %2==8 && i==1
    movu    m5, [r1+r5]
%endif
    OP_MOV  [r0], m0
    SWAP 0,1,2,3,4,5
    %assign j j+1
%endrep
    ret
%endmacro

MC MC11

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC31 2
cglobal_mc %1, mc31, %2, 3,6,8
    mov     r4, r1
    add     r1, 2
    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro

MC MC31

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC13 2
cglobal_mc %1, mc13, %2, 3,7,12
    lea     r4, [r1+r2]
    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro

MC MC13

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC33 2
cglobal_mc %1, mc33, %2, 3,6,8
    lea     r4, [r1+r2]
    add     r1, 2
    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro

MC MC33

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro FILT_H2 3
    psubw   %1, %2  ; a-b
    psubw   %2, %3  ; b-c
    psllw   %2, 2
    psubw   %1, %2  ; a-5*b+4*c
    psllw   %3, 4
    paddw   %1, %3  ; a-5*b+20*c
%endmacro

%macro FILT_VNRD 8
    movu    %6, [r1]
    paddw   %1, %6
    mova    %7, %2
    paddw   %7, %5
    mova    %8, %3
    paddw   %8, %4
    FILT_H2 %1, %7, %8
%endmacro

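; first (vertical) pass for the center (hv) positions: run the 6-tap filter
; down COUNT column chunks of mmsize bytes (covering the SIZE+5 input columns
; the horizontal pass needs) and store each un-normalized row, biased by
; -pad20, into the scratch buffer at r4 with a row stride of 3*mmsize bytes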
%macro HV 1
%if mmsize==16
%define PAD 12
%define COUNT 2
%else
%define PAD 4
%define COUNT 3
%endif
put_hv%1_10:
    neg     r2  ; This actually saves instructions
    lea     r1, [r1+r2*2-mmsize+PAD]
    lea     r4, [rsp+PAD+gprsize]
    mov     r3d, COUNT
.v_loop:
    movu    m0, [r1]
    sub     r1, r2
    movu    m1, [r1]
    sub     r1, r2
    movu    m2, [r1]
    sub     r1, r2
    movu    m3, [r1]
    sub     r1, r2
    movu    m4, [r1]
    sub     r1, r2
%assign i 0
%rep %1-1
    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
    psubw   m0, [pad20]
    movu    [r4+i*mmsize*3], m0
    sub     r1, r2
    SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
    psubw   m0, [pad20]
    movu    [r4+i*mmsize*3], m0
    add     r4, mmsize
    lea     r1, [r1+r2*8+mmsize]
%if %1==8
    lea     r1, [r1+r2*4]
%endif
    dec     r3d
    jg .v_loop
    neg     r2
    ret
%endmacro

INIT_MMX mmxext
HV 4
INIT_XMM sse2
HV 8

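; second (horizontal) pass for the hv positions: each call filters one row of
; the scratch buffer with pmaddwd and the interleaved tap constants, adds
; depad (bias removal plus rounding), shifts by 10, repacks the dword results
; into words and clips them to [0, pixel_max], leaving the row in m1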
%macro H_LOOP 1
%if num_mmregs > 8
    %define s1 m8
    %define s2 m9
    %define s3 m10
    %define d1 m11
%else
    %define s1 [tap1]
    %define s2 [tap2]
    %define s3 [tap3]
    %define d1 [depad]
%endif
h%1_loop_op:
    movu    m1, [r1+mmsize-4]
    movu    m2, [r1+mmsize-2]
    mova    m3, [r1+mmsize+0]
    movu    m4, [r1+mmsize+2]
    movu    m5, [r1+mmsize+4]
    movu    m6, [r1+mmsize+6]
%if num_mmregs > 8
    pmaddwd m1, s1
    pmaddwd m2, s1
    pmaddwd m3, s2
    pmaddwd m4, s2
    pmaddwd m5, s3
    pmaddwd m6, s3
    paddd   m1, d1
    paddd   m2, d1
%else
    mova    m0, s1
    pmaddwd m1, m0
    pmaddwd m2, m0
    mova    m0, s2
    pmaddwd m3, m0
    pmaddwd m4, m0
    mova    m0, s3
    pmaddwd m5, m0
    pmaddwd m6, m0
    mova    m0, d1
    paddd   m1, m0
    paddd   m2, m0
%endif
    paddd   m3, m5
    paddd   m4, m6
    paddd   m1, m3
    paddd   m2, m4
    psrad   m1, 10
    psrad   m2, 10
    pslld   m2, 16
    pand    m1, [pd_0f]
    por     m1, m2
%if num_mmregs <= 8
    pxor    m0, m0
%endif
    CLIPW   m1, m0, m7
    add     r1, mmsize*3
    ret
%endmacro

INIT_MMX mmxext
H_LOOP 4
INIT_XMM sse2
H_LOOP 8

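; mc22 (center position): vertical pass into an aligned stack buffer, then one
; horizontal pass per output row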
%macro MC22 2
cglobal_mc %1, mc22, %2, 3,7,12
%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
    mov     r6, rsp          ; backup stack pointer
    and     rsp, ~(mmsize-1) ; align stack
    sub     rsp, PAD

    call    put_hv%2_10

    mov     r3d, %2
    mova    m7, [pw_pixel_max]
%if num_mmregs > 8
    pxor    m0, m0
    mova    m8, [tap1]
    mova    m9, [tap2]
    mova    m10, [tap3]
    mova    m11, [depad]
%endif
    mov     r1, rsp
.h_loop:
    call    h%2_loop_op

    OP_MOV  [r0], m1
    add     r0, r2
    dec     r3d
    jg .h_loop

    mov     rsp, r6          ; restore stack pointer
    ret
%endmacro

MC MC22

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC12 2
cglobal_mc %1, mc12, %2, 3,7,12
%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
    mov     r6, rsp          ; backup stack pointer
    and     rsp, ~(mmsize-1) ; align stack
    sub     rsp, PAD

    call    put_hv%2_10

    xor     r4d, r4d
.body:
    mov     r3d, %2
    pxor    m0, m0
    mova    m7, [pw_pixel_max]
%if num_mmregs > 8
    mova    m8, [tap1]
    mova    m9, [tap2]
    mova    m10, [tap3]
    mova    m11, [depad]
%endif
    mov     r1, rsp
.h_loop:
    call    h%2_loop_op

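; undo the bias on the stored one-pass intermediate (depad2/unpad, then >>5),
; clip it and average it with the hv result in m1; r4 selects the
; intermediate: the vertical rows for mc12/mc32, the separate horizontal
; buffer for mc21/mc23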
    movu    m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
    paddw   m3, [depad2]
    psrlw   m3, 5
    psubw   m3, [unpad]
    CLIPW   m3, m0, m7
    pavgw   m1, m3

    OP_MOV  [r0], m1
    add     r0, r2
    dec     r3d
    jg .h_loop

    mov     rsp, r6          ; restore stack pointer
    ret
%endmacro

MC MC12

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC32 2
cglobal_mc %1, mc32, %2, 3,7,12
%define PAD mmsize*8*3*2 ; SIZE*16*3*sizeof(pixel)
    mov     r6, rsp          ; backup stack pointer
    and     rsp, ~(mmsize-1) ; align stack
    sub     rsp, PAD

    call    put_hv%2_10

    mov     r4d, 2           ; sizeof(pixel)
    jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
%endmacro

MC MC32

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
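; horizontal-only first pass used by mc21/mc23: filter each source row (read
; via r5) with the un-normalized 6-tap filter and store it, biased by -pad20,
; into the stack buffer for later averaging with the hv result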
%macro H_NRD 1
put_h%1_10:
    add     rsp, gprsize
    mov     r3d, %1
    xor     r4d, r4d
    mova    m6, [pad20]
.nextrow:
    movu    m2, [r5-4]
    movu    m3, [r5-2]
    movu    m4, [r5+0]
    ADDW    m2, [r5+6], m5
    ADDW    m3, [r5+4], m5
    ADDW    m4, [r5+2], m5

    FILT_H2 m2, m3, m4
    psubw   m2, m6
    mova    [rsp+r4], m2
    add     r4d, mmsize*3
    add     r5, r2
    dec     r3d
    jg .nextrow
    sub     rsp, gprsize
    ret
%endmacro

INIT_MMX mmxext
H_NRD 4
INIT_XMM sse2
H_NRD 8

%macro MC21 2
cglobal_mc %1, mc21, %2, 3,7,12
    mov     r5, r1
.body:
%define PAD mmsize*8*3*2 ; SIZE*16*3*sizeof(pixel)
    mov     r6, rsp          ; backup stack pointer
    and     rsp, ~(mmsize-1) ; align stack

    sub     rsp, PAD
    call    put_h%2_10

    sub     rsp, PAD
    call    put_hv%2_10

    mov     r4d, PAD-mmsize  ; H buffer
    jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
%endmacro

MC MC21

;-----------------------------------------------------------------------------
; void ff_h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC23 2
cglobal_mc %1, mc23, %2, 3,7,12
    lea     r5, [r1+r2]
    jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body
%endmacro

MC MC23