2 ; * Provide SSE luma and chroma mc functions for HEVC decoding
3 ; * Copyright (c) 2013 Pierre-Edouard LEPERE
5 ; * This file is part of FFmpeg.
7 ; * FFmpeg is free software; you can redistribute it and/or
8 ; * modify it under the terms of the GNU Lesser General Public
9 ; * License as published by the Free Software Foundation; either
10 ; * version 2.1 of the License, or (at your option) any later version.
12 ; * FFmpeg is distributed in the hope that it will be useful,
13 ; * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ; * Lesser General Public License for more details.
17 ; * You should have received a copy of the GNU Lesser General Public
18 ; * License along with FFmpeg; if not, write to the Free Software
19 ; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 %include "libavutil/x86/x86util.asm"
; Rounding offsets for the output stages, one table per bit depth:
; pw_N    = 1 << (14 - N + 1)  (uni path, includes the +1 of the final shift)
; pw_bi_N = 1 << (14 - N)      (bi path)
24 pw_8: times 8 dw (1 << 9)
25 pw_10: times 8 dw (1 << 11)
26 pw_12: times 8 dw (1 << 13)
27 pw_bi_8: times 8 dw (1 << 8)
28 pw_bi_10: times 8 dw (1 << 10)
29 pw_bi_12: times 8 dw (1 << 12)
; Saturation limits ((1 << bitd) - 1) used by pminsw when clipping >8-bit output.
30 max_pixels_10: times 8 dw ((1 << 10)-1)
31 max_pixels_12: times 8 dw ((1 << 12)-1)
; Dword 1: rounding bias used by the weighting functions (see WEIGHTING_FUNCS).
33 one_per_32: times 4 dd 1
; EPEL (4-tap chroma) filter tables; args: bitdepth, repeat count, element
; size (b/w), isa suffix.  NOTE(review): only the first two coefficients of
; the first row are visible in this elided view of the file.
37 hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
55 EPEL_TABLE 8, 8, b, sse4
56 EPEL_TABLE 10, 4, w, sse4
57 EPEL_TABLE 12, 4, w, sse4
; QPEL (8-tap luma) filter tables, same parameter scheme as EPEL_TABLE.
60 hevc_qpel_filters_%4_%1 times %2 d%3 -1, 4
74 QPEL_TABLE 8, 8, b, sse4
75 QPEL_TABLE 10, 4, w, sse4
76 QPEL_TABLE 12, 4, w, sse4
; Intermediate (pre-uni/bi) samples are int16_t rows with a fixed stride of
; MAX_PB_SIZE elements, i.e. 2*MAX_PB_SIZE bytes per row.
78 %define MAX_PB_SIZE 64
; 14-bit intermediates reuse the 10-bit (word) filter coefficients.
80 %define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
; Load one row of int16_t intermediate data from the second (bi) source.
; Picks movq/movdqa(+second register) by width; the %if/%elif/%endif lines
; selecting the branches are elided from this view.
84 %macro SIMPLE_BILOAD 4 ;width, tab, r1, r2
86 movq %3, [%2] ; load data from source2
88 movdqa %3, [%2] ; load data from source2
90 movdqa %3, [%2] ; load data from source2
91 movq %4, [%2+16] ; load data from source2
93 movdqa %3, [%2] ; load data from source2
94 movdqa %4, [%2+16] ; load data from source2
; Load one row of source pixels, sized by block width and bit depth
; (8-bit input is 1 byte/pixel, 10/12-bit is 2 bytes/pixel).
98 %macro SIMPLE_LOAD 4 ;width, bitd, tab, r1
99 %if %1 == 2 || (%2 == 8 && %1 <= 4)
100 movd %4, [%3] ; load data from source
101 %elif %1 == 4 || (%2 == 8 && %1 <= 8)
102 movq %4, [%3] ; load data from source
104 movdqu %4, [%3] ; load data from source
; Like SIMPLE_BILOAD but with aligned loads sized by width/bitdepth;
; wide blocks spill into a second register (%5).
108 %macro SIMPLE_8LOAD 5 ;width, bitd, tab, r1, r2
109 %if %1 == 2 || (%2 == 8 && %1 <= 4)
110 movq %4, [%3] ; load data from source2
111 %elif %1 == 4 || (%2 == 8 && %1 <= 8)
112 movdqa %4, [%3] ; load data from source2
114 movdqa %4, [%3] ; load data from source2
115 movq %5, [%3+16] ; load data from source2
117 movdqa %4, [%3] ; load data from source2
118 movdqa %5, [%3+16] ; load data from source2
; Load the 4-tap EPEL filter for index %2 (mx or my) into m14/m15, or into
; %3/%4 when the optional register arguments are given.  Each filter row
; occupies 32 bytes (two 16-byte halves: taps 1-2 and taps 3-4).
; The lea/%define pair is the PIC vs non-PIC table-address setup.
122 %macro EPEL_FILTER 2-4 ; bit depth, filter index
124 lea rfilterq, [hevc_epel_filters_sse4_%1]
126 %define rfilterq hevc_epel_filters_sse4_%1
129 shl %2q, 5 ; multiply by 32
131 movdqa m14, [rfilterq + %2q] ; taps 1-2 of the filter
132 movdqa m15, [rfilterq + %2q+16] ; taps 3-4 of the filter
134 movdqa %3, [rfilterq + %2q] ; taps 1-2 of the filter
135 movdqa %4, [rfilterq + %2q+16] ; taps 3-4 of the filter
; Load both EPEL filters for the hv case: horizontal (mx) taps into m14/m15,
; vertical (my) taps into m12/m13.  The vertical pass always reads the
; 10-bit (word) table because it filters 14-bit intermediates.
139 %macro EPEL_HV_FILTER 1
141 lea rfilterq, [hevc_epel_filters_sse4_%1]
143 %define rfilterq hevc_epel_filters_sse4_%1
147 shl mxq, 5 ; multiply by 32
148 shl myq, 5 ; multiply by 32
149 movdqa m14, [rfilterq + mxq] ; horizontal taps 1-2
150 movdqa m15, [rfilterq + mxq+16] ; horizontal taps 3-4
151 lea r3srcq, [srcstrideq*3]
154 lea rfilterq, [hevc_epel_filters_sse4_10]
156 %define rfilterq hevc_epel_filters_sse4_10
158 movdqa m12, [rfilterq + myq] ; vertical taps 1-2
159 movdqa m13, [rfilterq + myq+16] ; vertical taps 3-4
; QPEL_FILTER interior: load the four 16-byte halves of the 8-tap filter row
; for index %2 into m12..m15 (pairs consumed by pmaddubsw/pmaddwd).
164 lea rfilterq, [hevc_qpel_filters_sse4_%1]
166 %define rfilterq hevc_qpel_filters_sse4_%1
169 movdqa m12, [rfilterq + %2q*8] ; filter taps 1-2
170 movdqa m13, [rfilterq + %2q*8 + 16] ; filter taps 3-4
171 movdqa m14, [rfilterq + %2q*8 + 32] ; filter taps 5-6
172 movdqa m15, [rfilterq + %2q*8 + 48] ; filter taps 7-8
; EPEL_LOAD interior: load 4 taps' worth of source rows/columns at stride %3
; and interleave them (SBUTTERFLY) into the pair layout pmadd expects.
; NOTE(review): here rfilterq is reused to hold the source address — the
; elided macro header sets it up from the source operand; confirm upstream.
181 %if (%1 == 8 && %4 <= 4)
183 %elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
186 %define %%load movdqu
189 %%load m0, [rfilterq ]
; Immediate-stride variant (horizontal filtering, %3 is a constant) ...
191 %%load m1, [rfilterq+ %3]
192 %%load m2, [rfilterq+2*%3]
193 %%load m3, [rfilterq+3*%3]
; ... register-stride variant (vertical filtering, %3 is srcstride).
195 %%load m1, [rfilterq+ %3q]
196 %%load m2, [rfilterq+2*%3q]
197 %%load m3, [rfilterq+r3srcq]
; 8-bit input: interleave bytes of adjacent taps.
202 SBUTTERFLY bw, 0, 1, 10
203 SBUTTERFLY bw, 2, 3, 10
; >8-bit input: interleave words of adjacent taps.
210 SBUTTERFLY wd, 0, 1, 10
211 SBUTTERFLY wd, 2, 3, 10
; QPEL_H_LOAD interior: load the 8 horizontally-offset samples around x
; (x-3 .. x+4, in units of the per-pixel byte stride) into m0..m7, then
; interleave into pmadd pair layout.  %%stride = bytes per pixel.
221 %assign %%stride (%1+7)/8
228 %define %%load movdqu
236 %define %%load movdqu
239 %%load m0, [%2-3*%%stride] ;load data from source
240 %%load m1, [%2-2*%%stride]
241 %%load m2, [%2-%%stride ]
243 %%load m4, [%2+%%stride ]
244 %%load m5, [%2+2*%%stride]
245 %%load m6, [%2+3*%%stride]
246 %%load m7, [%2+4*%%stride]
; Word interleave of adjacent-tap rows ...
250 SBUTTERFLY wd, 0, 1, %4
251 SBUTTERFLY wd, 2, 3, %4
252 SBUTTERFLY wd, 4, 5, %4
253 SBUTTERFLY wd, 6, 7, %4
; ... then dword interleave for the 16-bit-input pmaddwd path.
262 SBUTTERFLY dq, 0, 1, %4
263 SBUTTERFLY dq, 2, 3, %4
264 SBUTTERFLY dq, 4, 5, %4
265 SBUTTERFLY dq, 6, 7, %4
; QPEL_V_LOAD interior: load 8 vertically-offset rows (x-3*stride .. x+4*stride).
; NOTE(review): %5q holds the precomputed x-3*srcstride base; its setup is in
; elided lines — confirm upstream.
278 movdqu m0, [%5q ] ;load x- 3*srcstride
279 movdqu m1, [%5q+ %3q ] ;load x- 2*srcstride
280 movdqu m2, [%5q+ 2*%3q ] ;load x-srcstride
281 movdqu m3, [%2 ] ;load x
282 movdqu m4, [%2+ %3q] ;load x+stride
283 movdqu m5, [%2+ 2*%3q] ;load x+2*stride
284 movdqu m6, [%2+r3srcq] ;load x+3*stride
285 movdqu m7, [%2+ 4*%3q] ;load x+4*stride
; Byte interleave for the 8-bit-input pmaddubsw path.
288 SBUTTERFLY bw, 0, 1, 8
289 SBUTTERFLY bw, 2, 3, 8
290 SBUTTERFLY bw, 4, 5, 8
291 SBUTTERFLY bw, 6, 7, 8
; Word interleave for the >8-bit-input pmaddwd path.
300 SBUTTERFLY wd, 0, 1, 8
301 SBUTTERFLY wd, 2, 3, 8
302 SBUTTERFLY wd, 4, 5, 8
303 SBUTTERFLY wd, 6, 7, 8
; PEL_{12,10,8}STOREw: store w output pixels at %1 from registers %2/%3.
; The 12/10 variants store 16-bit samples, the 8 variant stores packed bytes;
; the 16-wide variants reuse the 8-wide macro for the first half.
; (Store instruction bodies are elided from this view.)
313 %macro PEL_12STORE2 3
316 %macro PEL_12STORE4 3
319 %macro PEL_12STORE6 3
324 %macro PEL_12STORE8 3
327 %macro PEL_12STORE12 3
331 %macro PEL_12STORE16 3
332 PEL_12STORE8 %1, %2, %3
336 %macro PEL_10STORE2 3
339 %macro PEL_10STORE4 3
342 %macro PEL_10STORE6 3
347 %macro PEL_10STORE8 3
350 %macro PEL_10STORE12 3
354 %macro PEL_10STORE16 3
355 PEL_10STORE8 %1, %2, %3
372 %macro PEL_8STORE12 3
377 %macro PEL_8STORE16 3
; LOOP_END interior (macro header elided): advance dst by one int16_t row
; (fixed 2*MAX_PB_SIZE-byte stride), advance src by srcstride, loop on height.
382 add %1q, 2*MAX_PB_SIZE ; dst += 2*MAX_PB_SIZE (fixed int16_t row stride)
383 add %2q, %3q ; src += srcstride
384 dec heightd ; height--
385 jnz .loop ; height loop
; Shift/widen raw pixels into the 14-bit intermediate format (body elided).
389 %macro MC_PIXEL_COMPUTE 2 ;width, bitdepth
; 4-tap multiply-accumulate: each register holds interleaved pairs, so one
; pmaddubsw per register computes two taps at once.
401 %macro EPEL_COMPUTE 4 ; bitdepth, width, filter1, filter2
403 pmaddubsw m0, %3 ;x1*c1+x2*c2
404 pmaddubsw m2, %4 ;x3*c3+x4*c4
; 8-tap multiply-accumulate with coefficients read straight from the table.
; %4 completes a 'p…' pack mnemonic for the final re-pack (e.g. 'ackssdw'
; -> packssdw); the line using it is elided from this view — confirm upstream.
428 %macro QPEL_HV_COMPUTE 4 ; width, bitdepth, filter idx, pack-op suffix
430 lea rfilterq, [hevc_qpel_filters_sse4_%2]
432 %define rfilterq hevc_qpel_filters_sse4_%2
; 8-bit input: byte pairs via pmaddubsw.
436 pmaddubsw m0, [rfilterq + %3q*8 ] ;x1*c1+x2*c2
437 pmaddubsw m2, [rfilterq + %3q*8+16] ;x3*c3+x4*c4
438 pmaddubsw m4, [rfilterq + %3q*8+32] ;x5*c5+x6*c6
439 pmaddubsw m6, [rfilterq + %3q*8+48] ;x7*c7+x8*c8
; >8-bit input: word pairs via pmaddwd (low halves m0..m6, high halves m1..m7).
444 pmaddwd m0, [rfilterq + %3q*8 ]
445 pmaddwd m2, [rfilterq + %3q*8+16]
446 pmaddwd m4, [rfilterq + %3q*8+32]
447 pmaddwd m6, [rfilterq + %3q*8+48]
455 pmaddwd m1, [rfilterq + %3q*8 ]
456 pmaddwd m3, [rfilterq + %3q*8+16]
457 pmaddwd m5, [rfilterq + %3q*8+32]
458 pmaddwd m7, [rfilterq + %3q*8+48]
; 8-tap multiply-accumulate with coefficients preloaded in m12..m15
; (see QPEL_FILTER).
470 %macro QPEL_COMPUTE 2 ; width, bitdepth
472 pmaddubsw m0, m12 ;x1*c1+x2*c2
473 pmaddubsw m2, m13 ;x3*c3+x4*c4
474 pmaddubsw m4, m14 ;x5*c5+x6*c6
475 pmaddubsw m6, m15 ;x7*c7+x8*c8
; Average the filtered result with the second (bi) source, then finish with
; the uni rounding/pack path.
514 %macro BI_COMPUTE 7 ; width, bitd, src1l, src1h, src2l, src2h, pw
519 UNI_COMPUTE %1, %2, %3, %4, %7
; UNI_COMPUTE interior: clip >8-bit output to the bit-depth maximum.
524 %if %1 > 8 || (%2 > 8 && %1 > 4)
530 pminsw %3, [max_pixels_%2]
533 pminsw %4, [max_pixels_%2]
539 INIT_XMM sse4 ; adds ff_ and _sse4 to function name
540 ; ******************************
541 ; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
542 ; uint8_t *_src, ptrdiff_t _srcstride,
543 ; int height, int mx, int my)
544 ; ******************************
; Copy-only MC (full-pel): put writes 14-bit intermediates, uni writes
; pixels directly, bi averages with the int16_t src2 plane.
; %1 = block width, %2 = bit depth.
546 %macro HEVC_PUT_HEVC_PEL_PIXELS 2
547 cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height
550 SIMPLE_LOAD %1, %2, srcq, m0
551 MC_PIXEL_COMPUTE %1, %2
552 PEL_10STORE%1 dstq, m0, m1
553 LOOP_END dst, src, srcstride
556 cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height
558 SIMPLE_LOAD %1, %2, srcq, m0
559 PEL_%2STORE%1 dstq, m0, m1
560 add dstq, dststrideq ; dst += dststride
561 add srcq, srcstrideq ; src += srcstride
562 dec heightd ; height--
563 jnz .loop ; height loop
566 cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
568 movdqa m5, [pw_bi_%2]
570 SIMPLE_LOAD %1, %2, srcq, m0
571 SIMPLE_BILOAD %1, src2q, m3, m4
572 MC_PIXEL_COMPUTE %1, %2
573 BI_COMPUTE %1, %2, m0, m1, m3, m4, m5
574 PEL_%2STORE%1 dstq, m0, m1
575 add dstq, dststrideq ; dst += dststride
576 add srcq, srcstrideq ; src += srcstride
577 add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE (int16_t row stride)
578 dec heightd ; height--
579 jnz .loop ; height loop
585 ; ******************************
586 ; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
587 ; uint8_t *_src, ptrdiff_t _srcstride,
588 ; int width, int height, int mx, int my,
590 ; ******************************
; 4-tap horizontal chroma filtering; mx selects the filter.
; %1 = block width, %2 = bit depth.  %%stride = bytes per pixel.
593 %macro HEVC_PUT_HEVC_EPEL 2
594 cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, 6, dst, src, srcstride, height, mx, rfilter
595 %assign %%stride ((%2 + 7)/8)
596 EPEL_FILTER %2, mx, m4, m5
598 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
599 EPEL_COMPUTE %2, %1, m4, m5
600 PEL_10STORE%1 dstq, m0, m1
601 LOOP_END dst, src, srcstride
; uni variant: filter + round/clip straight to pixels.
604 cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, 7, dst, dststride, src, srcstride, height, mx, rfilter
605 %assign %%stride ((%2 + 7)/8)
607 EPEL_FILTER %2, mx, m4, m5
609 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
610 EPEL_COMPUTE %2, %1, m4, m5
611 UNI_COMPUTE %1, %2, m0, m1, m6
612 PEL_%2STORE%1 dstq, m0, m1
613 add dstq, dststrideq ; dst += dststride
614 add srcq, srcstrideq ; src += srcstride
615 dec heightd ; height--
616 jnz .loop ; height loop
; bi variant: filter, then average with the int16_t src2 plane.
619 cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, 7, dst, dststride, src, srcstride, src2, height, mx, rfilter
620 movdqa m6, [pw_bi_%2]
621 EPEL_FILTER %2, mx, m4, m5
623 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
624 EPEL_COMPUTE %2, %1, m4, m5
625 SIMPLE_BILOAD %1, src2q, m2, m3
626 BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
627 PEL_%2STORE%1 dstq, m0, m1
628 add dstq, dststrideq ; dst += dststride
629 add srcq, srcstrideq ; src += srcstride
630 add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE (int16_t row stride)
631 dec heightd ; height--
632 jnz .loop ; height loop
635 ; ******************************
636 ; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
637 ; uint8_t *_src, ptrdiff_t _srcstride,
638 ; int width, int height, int mx, int my,
640 ; ******************************
; 4-tap vertical chroma filtering; my selects the filter; r3src caches
; 3*srcstride for the 4-row loads.
642 cglobal hevc_put_hevc_epel_v%1_%2, 6, 7, 6, dst, src, srcstride, height, r3src, my, rfilter
643 lea r3srcq, [srcstrideq*3]
645 EPEL_FILTER %2, my, m4, m5
647 EPEL_LOAD %2, srcq, srcstride, %1
648 EPEL_COMPUTE %2, %1, m4, m5
649 PEL_10STORE%1 dstq, m0, m1
650 LOOP_END dst, src, srcstride
; uni variant.
653 cglobal hevc_put_hevc_uni_epel_v%1_%2, 7, 8, 7, dst, dststride, src, srcstride, height, r3src, my, rfilter
654 lea r3srcq, [srcstrideq*3]
657 EPEL_FILTER %2, my, m4, m5
659 EPEL_LOAD %2, srcq, srcstride, %1
660 EPEL_COMPUTE %2, %1, m4, m5
661 UNI_COMPUTE %1, %2, m0, m1, m6
662 PEL_%2STORE%1 dstq, m0, m1
663 add dstq, dststrideq ; dst += dststride
664 add srcq, srcstrideq ; src += srcstride
665 dec heightd ; height--
666 jnz .loop ; height loop
; bi variant.
670 cglobal hevc_put_hevc_bi_epel_v%1_%2, 8, 9, 7, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
671 lea r3srcq, [srcstrideq*3]
672 movdqa m6, [pw_bi_%2]
674 EPEL_FILTER %2, my, m4, m5
676 EPEL_LOAD %2, srcq, srcstride, %1
677 EPEL_COMPUTE %2, %1, m4, m5
678 SIMPLE_BILOAD %1, src2q, m2, m3
679 BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
680 PEL_%2STORE%1 dstq, m0, m1
681 add dstq, dststrideq ; dst += dststride
682 add srcq, srcstrideq ; src += srcstride
683 add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE (int16_t row stride)
684 dec heightd ; height--
685 jnz .loop ; height loop
690 ; ******************************
691 ; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
692 ; uint8_t *_src, ptrdiff_t _srcstride,
693 ; int width, int height, int mx, int my)
694 ; ******************************
; Separable 2D chroma filtering: horizontal 4-tap pass (m14/m15) on four
; rows, then vertical 4-tap pass (m12/m13) on the 14-bit intermediates.
; %1 = block width, %2 = bit depth.
696 %macro HEVC_PUT_HEVC_EPEL_HV 2
697 cglobal hevc_put_hevc_epel_hv%1_%2, 6, 8, 12 , dst, src, srcstride, height, mx, my, r3src, rfilter
698 %assign %%stride ((%2 + 7)/8)
701 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
702 EPEL_COMPUTE %2, %1, m14, m15
705 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
706 EPEL_COMPUTE %2, %1, m14, m15
709 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
710 EPEL_COMPUTE %2, %1, m14, m15
714 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
715 EPEL_COMPUTE %2, %1, m14, m15
; Vertical pass over the four horizontally-filtered rows (14-bit domain).
723 EPEL_COMPUTE 14, %1, m12, m13
724 PEL_10STORE%1 dstq, m0, m1
728 LOOP_END dst, src, srcstride
; uni variant: same two passes, then round/clip to pixels.
731 cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
732 %assign %%stride ((%2 + 7)/8)
735 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
736 EPEL_COMPUTE %2, %1, m14, m15
739 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
740 EPEL_COMPUTE %2, %1, m14, m15
743 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
744 EPEL_COMPUTE %2, %1, m14, m15
748 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
749 EPEL_COMPUTE %2, %1, m14, m15
757 EPEL_COMPUTE 14, %1, m12, m13
758 UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
759 PEL_%2STORE%1 dstq, m0, m1
763 add dstq, dststrideq ; dst += dststride
764 add srcq, srcstrideq ; src += srcstride
765 dec heightd ; height--
766 jnz .loop ; height loop
; bi variant: two passes, then average with src2 (needs all 16 xmm regs).
770 cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
771 %assign %%stride ((%2 + 7)/8)
774 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
775 EPEL_COMPUTE %2, %1, m14, m15
778 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
779 EPEL_COMPUTE %2, %1, m14, m15
782 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
783 EPEL_COMPUTE %2, %1, m14, m15
787 EPEL_LOAD %2, srcq-%%stride, %%stride, %1
788 EPEL_COMPUTE %2, %1, m14, m15
796 EPEL_COMPUTE 14, %1, m12, m13
797 SIMPLE_BILOAD %1, src2q, m8, m9
798 BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
799 PEL_%2STORE%1 dstq, m0, m1
803 add dstq, dststrideq ; dst += dststride
804 add srcq, srcstrideq ; src += srcstride
805 add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE (int16_t row stride)
806 dec heightd ; height--
807 jnz .loop ; height loop
811 ; ******************************
812 ; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride,
813 ; uint8_t *_src, ptrdiff_t _srcstride,
814 ; int width, int height, int mx, int my)
815 ; ******************************
; 8-tap horizontal luma filtering; mx selects the filter.
; %1 = block width, %2 = bit depth.
817 %macro HEVC_PUT_HEVC_QPEL 2
818 cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 15, dst, src, srcstride, height, mx, rfilter
821 QPEL_H_LOAD %2, srcq, %1, 10
826 PEL_10STORE%1 dstq, m0, m1
827 LOOP_END dst, src, srcstride
; uni variant.
830 cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 15 , dst, dststride, src, srcstride, height, mx, rfilter
834 QPEL_H_LOAD %2, srcq, %1, 10
839 UNI_COMPUTE %1, %2, m0, m1, m9
840 PEL_%2STORE%1 dstq, m0, m1
841 add dstq, dststrideq ; dst += dststride
842 add srcq, srcstrideq ; src += srcstride
843 dec heightd ; height--
844 jnz .loop ; height loop
; bi variant.
847 cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, src2, height, mx, rfilter
848 movdqa m9, [pw_bi_%2]
851 QPEL_H_LOAD %2, srcq, %1, 10
856 SIMPLE_BILOAD %1, src2q, m10, m11
857 BI_COMPUTE %1, %2, m0, m1, m10, m11, m9
858 PEL_%2STORE%1 dstq, m0, m1
859 add dstq, dststrideq ; dst += dststride
860 add srcq, srcstrideq ; src += srcstride
861 add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE (int16_t row stride)
862 dec heightd ; height--
863 jnz .loop ; height loop
867 ; ******************************
868 ; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride,
869 ; uint8_t *_src, ptrdiff_t _srcstride,
870 ; int width, int height, int mx, int my)
871 ; ******************************
; 8-tap vertical luma filtering; my selects the filter; r3src caches
; 3*srcstride.  The last QPEL_V_LOAD arg is a scratch GPR (highest free reg
; of each variant: r7/r8/r9).
873 cglobal hevc_put_hevc_qpel_v%1_%2, 6, 8, 15, dst, src, srcstride, height, r3src, my, rfilter
874 lea r3srcq, [srcstrideq*3]
877 QPEL_V_LOAD %2, srcq, srcstride, %1, r7
882 PEL_10STORE%1 dstq, m0, m1
883 LOOP_END dst, src, srcstride
; uni variant.
886 cglobal hevc_put_hevc_uni_qpel_v%1_%2, 7, 9, 15, dst, dststride, src, srcstride, height, r3src, my, rfilter
888 lea r3srcq, [srcstrideq*3]
891 QPEL_V_LOAD %2, srcq, srcstride, %1, r8
896 UNI_COMPUTE %1, %2, m0, m1, m9
897 PEL_%2STORE%1 dstq, m0, m1
898 add dstq, dststrideq ; dst += dststride
899 add srcq, srcstrideq ; src += srcstride
900 dec heightd ; height--
901 jnz .loop ; height loop
; bi variant.
904 cglobal hevc_put_hevc_bi_qpel_v%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
905 movdqa m9, [pw_bi_%2]
906 lea r3srcq, [srcstrideq*3]
909 SIMPLE_BILOAD %1, src2q, m10, m11
910 QPEL_V_LOAD %2, srcq, srcstride, %1, r9
915 BI_COMPUTE %1, %2, m0, m1, m10, m11, m9
916 PEL_%2STORE%1 dstq, m0, m1
917 add dstq, dststrideq ; dst += dststride
918 add srcq, srcstrideq ; src += srcstride
919 add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE (int16_t row stride)
920 dec heightd ; height--
921 jnz .loop ; height loop
926 ; ******************************
927 ; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride,
928 ; uint8_t *_src, ptrdiff_t _srcstride,
929 ; int height, int mx, int my)
930 ; ******************************
; Separable 2D luma filtering: an 8-deep pipeline of horizontal 8-tap
; results (one per source row), re-interleaved by the punpck block, then a
; vertical 8-tap pass in the 14-bit domain (signed pack: ackssdw -> packssdw).
931 %macro HEVC_PUT_HEVC_QPEL_HV 2
932 cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 12, dst, src, srcstride, height, mx, my, r3src, rfilter
935 lea r3srcq, [srcstrideq*3]
; Prime the pipeline: horizontally filter 8 consecutive rows.
937 QPEL_H_LOAD %2, srcq, %1, 15
938 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
941 QPEL_H_LOAD %2, srcq, %1, 15
942 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
945 QPEL_H_LOAD %2, srcq, %1, 15
946 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
949 QPEL_H_LOAD %2, srcq, %1, 15
950 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
953 QPEL_H_LOAD %2, srcq, %1, 15
954 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
957 QPEL_H_LOAD %2, srcq, %1, 15
958 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
961 QPEL_H_LOAD %2, srcq, %1, 15
962 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
966 QPEL_H_LOAD %2, srcq, %1, 15
967 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
; Interleave the 8 row results pairwise for the vertical pmadd pass.
970 punpcklwd m2, m10, m11
971 punpcklwd m4, m12, m13
972 punpcklwd m6, m14, m15
975 punpckhwd m3, m10, m11
976 punpckhwd m5, m12, m13
977 punpckhwd m7, m14, m15
; Vertical 8-tap pass on 14-bit intermediates, signed saturating pack.
979 QPEL_HV_COMPUTE %1, 14, my, ackssdw
980 PEL_10STORE%1 dstq, m0, m1
998 LOOP_END dst, src, srcstride
; uni variant of the separable 2D luma filter: 8 horizontal 8-tap passes
; feed a vertical 8-tap pass in the 14-bit domain, then UNI_COMPUTE
; rounds/shifts/clips to output pixels.  %1 = block width, %2 = bit depth.
1001 cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
1004 lea r3srcq, [srcstrideq*3]
; Prime the pipeline: horizontally filter 8 consecutive source rows.
1006 QPEL_H_LOAD %2, srcq, %1, 15
1007 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1009 add srcq, srcstrideq
1010 QPEL_H_LOAD %2, srcq, %1, 15
1011 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1013 add srcq, srcstrideq
1014 QPEL_H_LOAD %2, srcq, %1, 15
1015 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1017 add srcq, srcstrideq
1018 QPEL_H_LOAD %2, srcq, %1, 15
1019 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1021 add srcq, srcstrideq
1022 QPEL_H_LOAD %2, srcq, %1, 15
1023 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1025 add srcq, srcstrideq
1026 QPEL_H_LOAD %2, srcq, %1, 15
1027 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1029 add srcq, srcstrideq
1030 QPEL_H_LOAD %2, srcq, %1, 15
1031 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1033 add srcq, srcstrideq
1035 QPEL_H_LOAD %2, srcq, %1, 15
1036 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
; Interleave the 8 row results pairwise for the vertical pmadd pass.
1038 punpcklwd m0, m8, m9
1039 punpcklwd m2, m10, m11
1040 punpcklwd m4, m12, m13
1041 punpcklwd m6, m14, m15
1043 punpckhwd m1, m8, m9
1044 punpckhwd m3, m10, m11
1045 punpckhwd m5, m12, m13
1046 punpckhwd m7, m14, m15
; Vertical 8-tap pass on the 14-bit intermediates.  Must use the SIGNED
; saturating pack (ackssdw -> packssdw): intermediates can be negative
; (the QPEL filters have negative taps), and packusdw would clamp them to
; zero before UNI_COMPUTE adds the rounding offset — matching the put
; (line 979) and bi (line 1122) variants, which both use ackssdw.
1048 QPEL_HV_COMPUTE %1, 14, my, ackssdw
1049 UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
1050 PEL_%2STORE%1 dstq, m0, m1
1069 add dstq, dststrideq ; dst += dststride
1070 add srcq, srcstrideq ; src += srcstride
1071 dec heightd ; height--
1072 jnz .loop ; height loop
; bi variant of the separable 2D luma filter: 8 horizontal passes, vertical
; pass in the 14-bit domain, then average with the int16_t src2 plane.
1075 cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
1078 lea r3srcq, [srcstrideq*3]
; Prime the pipeline: horizontally filter 8 consecutive source rows.
1080 QPEL_H_LOAD %2, srcq, %1, 15
1081 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1083 add srcq, srcstrideq
1084 QPEL_H_LOAD %2, srcq, %1, 15
1085 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1087 add srcq, srcstrideq
1088 QPEL_H_LOAD %2, srcq, %1, 15
1089 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1091 add srcq, srcstrideq
1092 QPEL_H_LOAD %2, srcq, %1, 15
1093 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1095 add srcq, srcstrideq
1096 QPEL_H_LOAD %2, srcq, %1, 15
1097 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1099 add srcq, srcstrideq
1100 QPEL_H_LOAD %2, srcq, %1, 15
1101 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1103 add srcq, srcstrideq
1104 QPEL_H_LOAD %2, srcq, %1, 15
1105 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
1107 add srcq, srcstrideq
1109 QPEL_H_LOAD %2, srcq, %1, 15
1110 QPEL_HV_COMPUTE %1, %2, mx, ackssdw
; Interleave the 8 row results pairwise for the vertical pmadd pass.
1112 punpcklwd m0, m8, m9
1113 punpcklwd m2, m10, m11
1114 punpcklwd m4, m12, m13
1115 punpcklwd m6, m14, m15
1117 punpckhwd m1, m8, m9
1118 punpckhwd m3, m10, m11
1119 punpckhwd m5, m12, m13
1120 punpckhwd m7, m14, m15
; Vertical 8-tap pass (14-bit domain, signed saturating pack).
1122 QPEL_HV_COMPUTE %1, 14, my, ackssdw
1123 SIMPLE_BILOAD %1, src2q, m8, m9 ;m9 not used in this case
1124 BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
1125 PEL_%2STORE%1 dstq, m0, m1
1144 add dstq, dststrideq ; dst += dststride
1145 add srcq, srcstrideq ; src += srcstride
1146 add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE (int16_t row stride)
1147 dec heightd ; height--
1148 jnz .loop ; height loop
; Weighted prediction from int16_t intermediates (stride MAX_PB_SIZE).
; uni: dst = clip(((src * wx) >> shift) + ox); bi adds a second source and
; weight pair.  %1 = block width, %2 = bit depth.
; NOTE(review): several setup lines (weight/offset broadcasts) are elided
; from this view — the comments below describe only what is visible.
1152 %macro WEIGHTING_FUNCS 2
; Fewer register args on WIN64/x86_32 ABIs: remaining args come from memory.
1153 %if WIN64 || ARCH_X86_32
1154 cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, srcstride, height, denom, wx, ox
1158 cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, srcstride, height, denom, wx, ox
1159 %define SHIFT denomd
1161 lea SHIFT, [SHIFT+14-%2] ; shift = 14 - bitd + denom
1166 movd m4, SHIFT ; shift
1173 movdqu m5, [one_per_32]
1179 shl SHIFT, %2-8 ; ox << (bitd - 8)
1183 %if WIN64 || ARCH_X86_32
; Intermediates are always 16-bit, so load with the 10-bit width rules.
1187 SIMPLE_LOAD %1, 10, srcq, m0
1197 punpckhwd m1, m0, m6
; Clip >8-bit output to the bit-depth maximum.
1210 pminsw m0, [max_pixels_%2]
1212 PEL_%2STORE%1 dstq, m0, m1
1213 add dstq, dststrideq ; dst += dststride
1214 add srcq, 2*MAX_PB_SIZE ; src += 2*MAX_PB_SIZE (int16_t row stride)
1215 dec heightd ; height--
1216 jnz .loop ; height loop
; Bidirectional weighted prediction: two sources, two weights, two offsets.
1219 cglobal hevc_put_hevc_bi_w%1_%2, 5, 7, 10, dst, dststride, src, srcstride, src2, height, denom, wx0, wx1, ox0, ox1
1225 lea r6d, [r6d+14-%2] ; shift = 14 - bitd + denom
1227 movd m0, r6d ; shift
1236 movd m5, r6d ; shift+1
1242 shl r6d, %2-8 ; ox << (bitd - 8)
1245 movd m4, r6d ; offset
1251 SIMPLE_LOAD %1, 10, srcq, m0
1252 SIMPLE_LOAD %1, 10, src2q, m8
1266 punpckhwd m1, m0, m6
1268 punpckhwd m9, m8, m7
1281 pminsw m0, [max_pixels_%2]
1283 PEL_%2STORE%1 dstq, m0, m1
1284 add dstq, dststrideq ; dst += dststride
1285 add srcq, 2*MAX_PB_SIZE ; src += 2*MAX_PB_SIZE (int16_t row stride)
1286 add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE (int16_t row stride)
1287 dec r6d ; height-- (height counter lives in r6d here)
1288 jnz .loop ; height loop
; ---- Instantiate all exported functions: (width, bitdepth) pairs. ----
; Weighted prediction.
1292 WEIGHTING_FUNCS 2, 8
1293 WEIGHTING_FUNCS 4, 8
1294 WEIGHTING_FUNCS 6, 8
1295 WEIGHTING_FUNCS 8, 8
1297 WEIGHTING_FUNCS 2, 10
1298 WEIGHTING_FUNCS 4, 10
1299 WEIGHTING_FUNCS 6, 10
1300 WEIGHTING_FUNCS 8, 10
1302 WEIGHTING_FUNCS 2, 12
1303 WEIGHTING_FUNCS 4, 12
1304 WEIGHTING_FUNCS 6, 12
1305 WEIGHTING_FUNCS 8, 12
; Full-pel copy.
1307 HEVC_PUT_HEVC_PEL_PIXELS 2, 8
1308 HEVC_PUT_HEVC_PEL_PIXELS 4, 8
1309 HEVC_PUT_HEVC_PEL_PIXELS 6, 8
1310 HEVC_PUT_HEVC_PEL_PIXELS 8, 8
1311 HEVC_PUT_HEVC_PEL_PIXELS 12, 8
1312 HEVC_PUT_HEVC_PEL_PIXELS 16, 8
1314 HEVC_PUT_HEVC_PEL_PIXELS 2, 10
1315 HEVC_PUT_HEVC_PEL_PIXELS 4, 10
1316 HEVC_PUT_HEVC_PEL_PIXELS 6, 10
1317 HEVC_PUT_HEVC_PEL_PIXELS 8, 10
1319 HEVC_PUT_HEVC_PEL_PIXELS 2, 12
1320 HEVC_PUT_HEVC_PEL_PIXELS 4, 12
1321 HEVC_PUT_HEVC_PEL_PIXELS 6, 12
1322 HEVC_PUT_HEVC_PEL_PIXELS 8, 12
; Chroma h/v (4-tap).
1324 HEVC_PUT_HEVC_EPEL 2, 8
1325 HEVC_PUT_HEVC_EPEL 4, 8
1326 HEVC_PUT_HEVC_EPEL 6, 8
1327 HEVC_PUT_HEVC_EPEL 8, 8
1328 HEVC_PUT_HEVC_EPEL 12, 8
1329 HEVC_PUT_HEVC_EPEL 16, 8
1332 HEVC_PUT_HEVC_EPEL 2, 10
1333 HEVC_PUT_HEVC_EPEL 4, 10
1334 HEVC_PUT_HEVC_EPEL 6, 10
1335 HEVC_PUT_HEVC_EPEL 8, 10
1337 HEVC_PUT_HEVC_EPEL 2, 12
1338 HEVC_PUT_HEVC_EPEL 4, 12
1339 HEVC_PUT_HEVC_EPEL 6, 12
1340 HEVC_PUT_HEVC_EPEL 8, 12
; Chroma hv (separable 2D 4-tap).
1342 HEVC_PUT_HEVC_EPEL_HV 2, 8
1343 HEVC_PUT_HEVC_EPEL_HV 4, 8
1344 HEVC_PUT_HEVC_EPEL_HV 6, 8
1345 HEVC_PUT_HEVC_EPEL_HV 8, 8
1347 HEVC_PUT_HEVC_EPEL_HV 2, 10
1348 HEVC_PUT_HEVC_EPEL_HV 4, 10
1349 HEVC_PUT_HEVC_EPEL_HV 6, 10
1350 HEVC_PUT_HEVC_EPEL_HV 8, 10
1352 HEVC_PUT_HEVC_EPEL_HV 2, 12
1353 HEVC_PUT_HEVC_EPEL_HV 4, 12
1354 HEVC_PUT_HEVC_EPEL_HV 6, 12
1355 HEVC_PUT_HEVC_EPEL_HV 8, 12
; Luma h/v (8-tap).
1357 HEVC_PUT_HEVC_QPEL 4, 8
1358 HEVC_PUT_HEVC_QPEL 8, 8
1359 HEVC_PUT_HEVC_QPEL 12, 8
1360 HEVC_PUT_HEVC_QPEL 16, 8
1362 HEVC_PUT_HEVC_QPEL 4, 10
1363 HEVC_PUT_HEVC_QPEL 8, 10
1365 HEVC_PUT_HEVC_QPEL 4, 12
1366 HEVC_PUT_HEVC_QPEL 8, 12
; Luma hv (separable 2D 8-tap).
1368 HEVC_PUT_HEVC_QPEL_HV 2, 8
1369 HEVC_PUT_HEVC_QPEL_HV 4, 8
1370 HEVC_PUT_HEVC_QPEL_HV 6, 8
1371 HEVC_PUT_HEVC_QPEL_HV 8, 8
1373 HEVC_PUT_HEVC_QPEL_HV 2, 10
1374 HEVC_PUT_HEVC_QPEL_HV 4, 10
1375 HEVC_PUT_HEVC_QPEL_HV 6, 10
1376 HEVC_PUT_HEVC_QPEL_HV 8, 10
1378 HEVC_PUT_HEVC_QPEL_HV 2, 12
1379 HEVC_PUT_HEVC_QPEL_HV 4, 12
1380 HEVC_PUT_HEVC_QPEL_HV 6, 12
1381 HEVC_PUT_HEVC_QPEL_HV 8, 12
1383 %endif ; ARCH_X86_64