1 ;******************************************************************************
2 ;* VP9 Intra prediction SIMD optimizations
4 ;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
7 ;* H.264 intra prediction asm optimizations
8 ;* Copyright (c) 2010 Fiona Glaser
9 ;* Copyright (c) 2010 Holger Lubitz
10 ;* Copyright (c) 2010 Loren Merritt
11 ;* Copyright (c) 2010 Ronald S. Bultje
13 ;* This file is part of FFmpeg.
15 ;* FFmpeg is free software; you can redistribute it and/or
16 ;* modify it under the terms of the GNU Lesser General Public
17 ;* License as published by the Free Software Foundation; either
18 ;* version 2.1 of the License, or (at your option) any later version.
20 ;* FFmpeg is distributed in the hope that it will be useful,
21 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
22 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 ;* Lesser General Public License for more details.
25 ;* You should have received a copy of the GNU Lesser General Public
26 ;* License along with FFmpeg; if not, write to the Free Software
27 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
28 ;******************************************************************************
30 %include "libavutil/x86/x86util.asm"
; Read-only constants. pw_* are 16-bit word vectors; pb_* are byte vectors
; whose names spell out the intended byte pattern (for example "1toE_2xF" is
; 1..14 followed by 15 twice, and "6xm1" is six -1 bytes).
; NOTE(review): several labels below show fewer bytes than their name
; describes, and no section directive is visible; the missing rows are
; presumably outside this excerpt -- confirm against the full file.
34 pw_m256: times 16 dw -256 ; 0xFF00 words: as a pshufb mask, source byte 0 -> low byte, high byte zeroed
35 pw_m255: times 16 dw -255 ; 0xFF01 words: as a pshufb mask, source byte 1 -> low byte, high byte zeroed
36 pw_4096: times 8 dw 4096 ; pmulhrsw by 4096 gives (x + 4) >> 3
38 pb_4x3_4x2_4x1_4x0: times 4 db 3
42 pb_8x1_8x0: times 8 db 1
44 pb_8x3_8x2: times 8 db 3
46 pb_0to5_2x7: db 0, 1, 2, 3, 4, 5, 7, 7
48 pb_0to6_9x7: db 0, 1, 2, 3, 4, 5, 6
50 pb_1to6_10x7: db 1, 2, 3, 4, 5, 6
53 pb_2to6_11x7: db 2, 3, 4, 5, 6
55 pb_1toE_2xF: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
56 pb_2toE_3xF: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
57 pb_13456_3xm1: db 1, 3, 4, 5, 6
59 pb_6012_4xm1: db 6, 0, 1, 2
; A pshufb mask byte of -1 has bit 7 set, which zeroes that destination byte.
61 pb_6xm1_246_8toE: times 6 db -1
62 db 2, 4, 6, 8, 9, 10, 11, 12, 13, 14
63 pb_6xm1_BDF_0to6: times 6 db -1
64 db 11, 13, 15, 0, 1, 2, 3, 4, 5, 6
; Even-indexed bytes first, then odd-indexed bytes (de-interleave mask).
65 pb_02468ACE_13579BDF: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
66 pb_7to1_9x0: db 7, 6, 5, 4
67 pb_3to1_5x0: db 3, 2, 1
; Full byte reversal of a 16-byte vector.
69 pb_Fto0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
83 ; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
; vp9_ipred_dc_4x4(dst, stride, l, a): the dc_NxN predictor for N = 4.
; cglobal (x86inc) declares 4 arguments, 4 general-purpose registers and
; 0 XMM registers. The edge-pixel loads/summation, the byte broadcast and
; the return are not visible in this excerpt.
86 cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a
91 pmulhrsw m0, [pw_4096] ; (x*4096 + 0x4000) >> 15 == (x + 4) >> 3: rounded divide by 8
; Write the low 4 bytes of m0 to each of the four rows.
93 movd [dstq+strideq*0], m0
94 movd [dstq+strideq*1], m0
95 lea dstq, [dstq+strideq*2] ; advance dst by two rows
96 movd [dstq+strideq*0], m0
97 movd [dstq+strideq*1], m0
; vp9_ipred_dc_8x8(dst, stride, l, a): the dc_NxN predictor for N = 8.
; The edge-pixel loads/summation, the broadcast and the return are not
; visible in this excerpt.
101 cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a
; From here on the third argument register is reused as stride3.
104 DEFINE_ARGS dst, stride, stride3
105 lea stride3q, [strideq*3] ; stride3 = 3 * stride, addresses row 3 directly
; pw_2048 is defined outside this excerpt; if it holds words of 2048 this is
; (x + 8) >> 4 -- TODO confirm.
110 pmulhrsw m0, [pw_2048]
; Write the low 8 bytes of m0 to rows 0-3 ...
112 movq [dstq+strideq*0], m0
113 movq [dstq+strideq*1], m0
114 movq [dstq+strideq*2], m0
115 movq [dstq+stride3q ], m0
116 lea dstq, [dstq+strideq*4] ; advance dst by four rows
; ... and to rows 4-7.
117 movq [dstq+strideq*0], m0
118 movq [dstq+strideq*1], m0
119 movq [dstq+strideq*2], m0
120 movq [dstq+stride3q ], m0
; vp9_ipred_dc_16x16(dst, stride, l, a): the dc_NxN predictor for N = 16.
; Declares 3 XMM registers. The edge-pixel loads/summation, the broadcast,
; the loop control that uses cnt and the return are not visible here.
124 cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a
; The l and a argument registers are reused as stride3 and cnt.
127 DEFINE_ARGS dst, stride, stride3, cnt
128 lea stride3q, [strideq*3] ; stride3 = 3 * stride
; pw_1024 is defined outside this excerpt; if it holds words of 1024 this is
; (x + 16) >> 5 -- TODO confirm.
135 pmulhrsw m0, [pw_1024]
; Store m0 to four consecutive rows, then step dst down four rows
; (presumably the body of a loop counted by cnt).
139 mova [dstq+strideq*0], m0
140 mova [dstq+strideq*1], m0
141 mova [dstq+strideq*2], m0
142 mova [dstq+stride3q ], m0
143 lea dstq, [dstq+strideq*4]
; vp9_ipred_dc_32x32(dst, stride, l, a): the dc_NxN predictor for N = 32.
; Declares 5 XMM registers. The edge-pixel loads/summation, the broadcast,
; the loop control that uses cnt and the return are not visible here.
149 cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
; The l and a argument registers are reused as stride3 and cnt.
154 DEFINE_ARGS dst, stride, stride3, cnt
155 lea stride3q, [strideq*3] ; stride3 = 3 * stride
; pw_512 is defined outside this excerpt; if it holds words of 512 this is
; (x + 32) >> 6 -- TODO confirm.
166 pmulhrsw m0, [pw_512]
; Each row is written as two stores at byte offsets 0 and 16; four rows per
; pass, then dst steps down four rows.
170 mova [dstq+strideq*0+ 0], m0
171 mova [dstq+strideq*0+16], m0
172 mova [dstq+strideq*1+ 0], m0
173 mova [dstq+strideq*1+16], m0
174 mova [dstq+strideq*2+ 0], m0
175 mova [dstq+strideq*2+16], m0
176 mova [dstq+stride3q + 0], m0
177 mova [dstq+stride3q +16], m0
178 lea dstq, [dstq+strideq*4]
; AVX2 variant of vp9_ipred_dc_32x32, assembled only when
; HAVE_AVX2_EXTERNAL is set (the matching %endif is not visible here).
; The edge-pixel loads/summation, the broadcast, the loop control that uses
; cnt and the return are not visible in this excerpt.
183 %if HAVE_AVX2_EXTERNAL
185 cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
188 DEFINE_ARGS dst, stride, stride3, cnt
189 lea stride3q, [strideq*3] ; stride3 = 3 * stride
194 vextracti128 xm1, m0, 1 ; xm1 = upper 128-bit lane of m0
; Scaling is done on the 128-bit xm0 view. pw_512 is defined outside this
; excerpt; if it holds words of 512 this is (x + 32) >> 6 -- TODO confirm.
198 pmulhrsw xm0, [pw_512]
; One store per row (presumably a full 32-byte register -- confirm the INIT
; mode); eight rows per pass in two groups of four.
202 mova [dstq+strideq*0], m0
203 mova [dstq+strideq*1], m0
204 mova [dstq+strideq*2], m0
205 mova [dstq+stride3q ], m0
206 lea dstq, [dstq+strideq*4]
207 mova [dstq+strideq*0], m0
208 mova [dstq+strideq*1], m0
209 mova [dstq+strideq*2], m0
210 mova [dstq+stride3q ], m0
211 lea dstq, [dstq+strideq*4]
217 ; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
; DC_1D_FUNCS: macro that emits the one-sided dc predictors. The first macro
; argument is the direction used in the function names (top or left), the
; second the edge it reads (a or l). The matching endmacro is not visible in
; this excerpt.
219 %macro DC_1D_FUNCS 2 ; dir (top or left), arg (a or l)
; 4x4 one-sided dc predictor; the edge load/summation, the broadcast and the
; return are not visible here.
221 cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a
; pw_8192 is defined outside this excerpt; if it holds words of 8192 this is
; (x + 2) >> 2 -- TODO confirm.
225 pmulhrsw m0, [pw_8192]
; Write the low 4 bytes of m0 to each of the four rows.
227 movd [dstq+strideq*0], m0
228 movd [dstq+strideq*1], m0
229 lea dstq, [dstq+strideq*2] ; advance dst by two rows
230 movd [dstq+strideq*0], m0
231 movd [dstq+strideq*1], m0
; 8x8 one-sided dc predictor (direction comes from the first macro
; argument). The edge load/summation, the broadcast and the return are not
; visible in this excerpt.
235 cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a
; From here on the third argument register is reused as stride3.
237 DEFINE_ARGS dst, stride, stride3
238 lea stride3q, [strideq*3] ; stride3 = 3 * stride
241 pmulhrsw m0, [pw_4096] ; (x*4096 + 0x4000) >> 15 == (x + 4) >> 3: rounded divide by 8
; Write the low 8 bytes of m0 to rows 0-3, step four rows, then rows 4-7.
243 movq [dstq+strideq*0], m0
244 movq [dstq+strideq*1], m0
245 movq [dstq+strideq*2], m0
246 movq [dstq+stride3q ], m0
247 lea dstq, [dstq+strideq*4]
248 movq [dstq+strideq*0], m0
249 movq [dstq+strideq*1], m0
250 movq [dstq+strideq*2], m0
251 movq [dstq+stride3q ], m0
; 16x16 one-sided dc predictor. Declares 3 XMM registers. The edge
; load/summation, the broadcast, the loop control that uses cnt and the
; return are not visible in this excerpt.
255 cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a
257 DEFINE_ARGS dst, stride, stride3, cnt
258 lea stride3q, [strideq*3] ; stride3 = 3 * stride
; pw_2048 is defined outside this excerpt; if it holds words of 2048 this is
; (x + 8) >> 4 -- TODO confirm.
263 pmulhrsw m0, [pw_2048]
; Store m0 to four consecutive rows, then step dst down four rows
; (presumably the body of a loop counted by cnt).
267 mova [dstq+strideq*0], m0
268 mova [dstq+strideq*1], m0
269 mova [dstq+strideq*2], m0
270 mova [dstq+stride3q ], m0
271 lea dstq, [dstq+strideq*4]
; 32x32 one-sided dc predictor. Declares 3 XMM registers. The edge
; load/summation, the broadcast, the loop control that uses cnt and the
; return are not visible in this excerpt.
277 cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
280 DEFINE_ARGS dst, stride, stride3, cnt
281 lea stride3q, [strideq*3] ; stride3 = 3 * stride
; pw_1024 is defined outside this excerpt; if it holds words of 1024 this is
; (x + 16) >> 5 -- TODO confirm.
288 pmulhrsw m0, [pw_1024]
; Each row is written as two stores at byte offsets 0 and 16; four rows per
; pass, then dst steps down four rows.
292 mova [dstq+strideq*0+ 0], m0
293 mova [dstq+strideq*0+16], m0
294 mova [dstq+strideq*1+ 0], m0
295 mova [dstq+strideq*1+16], m0
296 mova [dstq+strideq*2+ 0], m0
297 mova [dstq+strideq*2+16], m0
298 mova [dstq+stride3q + 0], m0
299 mova [dstq+stride3q +16], m0
300 lea dstq, [dstq+strideq*4]
; AVX2 variant of the 32x32 one-sided dc predictor, assembled only when
; HAVE_AVX2_EXTERNAL is set (the matching endif is not visible here).
; The edge load/summation, the broadcast, the loop control that uses cnt and
; the return are not visible in this excerpt.
305 %if HAVE_AVX2_EXTERNAL
307 cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
309 DEFINE_ARGS dst, stride, stride3, cnt
310 lea stride3q, [strideq*3] ; stride3 = 3 * stride
313 vextracti128 xm1, m0, 1 ; xm1 = upper 128-bit lane of m0
; Scaling is done on the 128-bit xm0 view. pw_1024 is defined outside this
; excerpt; if it holds words of 1024 this is (x + 16) >> 5 -- TODO confirm.
317 pmulhrsw xm0, [pw_1024]
; One store per row (presumably a full 32-byte register -- confirm the INIT
; mode); eight rows per pass in two groups of four.
321 mova [dstq+strideq*0], m0
322 mova [dstq+strideq*1], m0
323 mova [dstq+strideq*2], m0
324 mova [dstq+stride3q ], m0
325 lea dstq, [dstq+strideq*4]
326 mova [dstq+strideq*0], m0
327 mova [dstq+strideq*1], m0
328 mova [dstq+strideq*2], m0
329 mova [dstq+stride3q ], m0
330 lea dstq, [dstq+strideq*4]
; vp9_ipred_v_8x8(dst, stride, l, a): writes the same 8 bytes (low half of
; m0) to all eight rows of dst. The load that fills m0 and the return are
; not visible in this excerpt (presumably m0 holds the row above the block).
343 cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a
; From here on the third argument register is reused as stride3.
345 DEFINE_ARGS dst, stride, stride3
346 lea stride3q, [strideq*3] ; stride3 = 3 * stride
; Rows 0-3.
347 movq [dstq+strideq*0], m0
348 movq [dstq+strideq*1], m0
349 movq [dstq+strideq*2], m0
350 movq [dstq+stride3q ], m0
351 lea dstq, [dstq+strideq*4] ; advance dst by four rows
; Rows 4-7.
352 movq [dstq+strideq*0], m0
353 movq [dstq+strideq*1], m0
354 movq [dstq+strideq*2], m0
355 movq [dstq+stride3q ], m0
; vp9_ipred_v_16x16(dst, stride, l, a): replicates m0 down the block, four
; rows per pass. Declares 1 XMM register. The load that fills m0, the loop
; control that uses cnt and the return are not visible in this excerpt.
359 cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a
; The l and a argument registers are reused as stride3 and cnt.
361 DEFINE_ARGS dst, stride, stride3, cnt
362 lea stride3q, [strideq*3] ; stride3 = 3 * stride
365 mova [dstq+strideq*0], m0
366 mova [dstq+strideq*1], m0
367 mova [dstq+strideq*2], m0
368 mova [dstq+stride3q ], m0
369 lea dstq, [dstq+strideq*4] ; advance dst by four rows
; vp9_ipred_v_32x32(dst, stride, l, a): replicates the pair m0 (bytes 0-15
; of each row) and m1 (bytes 16-31) down the block, four rows per pass.
; Declares 2 XMM registers. The loads that fill m0/m1, the loop control that
; uses cnt and the return are not visible in this excerpt.
375 cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
378 DEFINE_ARGS dst, stride, stride3, cnt
379 lea stride3q, [strideq*3] ; stride3 = 3 * stride
382 mova [dstq+strideq*0+ 0], m0
383 mova [dstq+strideq*0+16], m1
384 mova [dstq+strideq*1+ 0], m0
385 mova [dstq+strideq*1+16], m1
386 mova [dstq+strideq*2+ 0], m0
387 mova [dstq+strideq*2+16], m1
388 mova [dstq+stride3q + 0], m0
389 mova [dstq+stride3q +16], m1
390 lea dstq, [dstq+strideq*4] ; advance dst by four rows
; AVX2 variant of vp9_ipred_v_32x32, assembled only when
; HAVE_AVX2_EXTERNAL is set (the matching %endif is not visible here).
; A single register is stored per row (presumably a 32-byte register --
; confirm the INIT mode), eight rows per pass. The load that fills m0, the
; loop control that uses cnt and the return are not visible in this excerpt.
395 %if HAVE_AVX2_EXTERNAL
397 cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
399 DEFINE_ARGS dst, stride, stride3, cnt
400 lea stride3q, [strideq*3] ; stride3 = 3 * stride
403 mova [dstq+strideq*0], m0
404 mova [dstq+strideq*1], m0
405 mova [dstq+strideq*2], m0
406 mova [dstq+stride3q ], m0
407 lea dstq, [dstq+strideq*4]
408 mova [dstq+strideq*0], m0
409 mova [dstq+strideq*1], m0
410 mova [dstq+strideq*2], m0
411 mova [dstq+stride3q ], m0
412 lea dstq, [dstq+strideq*4]
; vp9_ipred_h_4x4(dst, stride, l): 3 arguments are loaded; a fourth
; general-purpose register is named stride3. Declares 1 XMM register.
; The load that fills m0, any register shifts between the row stores and the
; return are not visible in this excerpt.
421 cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3
; Rearrange the bytes of m0 with the pb_4x3_4x2_4x1_4x0 shuffle mask (by its
; name: four copies each of bytes 3, 2, 1, 0).
423 pshufb m0, [pb_4x3_4x2_4x1_4x0]
424 lea stride3q, [strideq*3] ; stride3 = 3 * stride
; Each store writes the low 4 bytes of m0 to one row.
425 movd [dstq+strideq*0], m0
427 movd [dstq+strideq*1], m0
429 movd [dstq+strideq*2], m0
431 movd [dstq+stride3q ], m0
; vp9_ipred_h_8x8(dst, stride, l): 3 arguments loaded, 5 general-purpose
; registers (stride3 and cnt are scratch), 4 XMM registers. The left-pixel
; load, the shuffles that fill m0/m1, the loop control that uses cnt and the
; return are not visible in this excerpt.
436 cglobal vp9_ipred_h_8x8, 3, 5, 4, dst, stride, l, stride3, cnt
; Keep both shuffle masks in registers.
437 mova m2, [pb_8x1_8x0]
438 mova m3, [pb_8x3_8x2]
439 lea stride3q, [strideq*3] ; stride3 = 3 * stride
; Each register carries two 8-byte rows: low half via movq, high half via
; movhps. m1 supplies rows 0-1, m0 supplies rows 2-3.
445 movq [dstq+strideq*0], m1
446 movhps [dstq+strideq*1], m1
447 movq [dstq+strideq*2], m0
448 movhps [dstq+stride3q ], m0
449 lea dstq, [dstq+strideq*4] ; advance dst by four rows
; vp9_ipred_h_16x16(dst, stride, l): 3 arguments loaded, 5 general-purpose
; registers, 8 XMM registers. The left-pixel load, the shuffles that fill
; m0-m3, the loop control that uses cnt and the return are not visible in
; this excerpt.
455 cglobal vp9_ipred_h_16x16, 3, 5, 8, dst, stride, l, stride3, cnt
460 lea stride3q, [strideq*3] ; stride3 = 3 * stride
; One register per row: m0..m3 go to rows 0..3.
466 mova [dstq+strideq*0], m0
467 mova [dstq+strideq*1], m1
470 mova [dstq+strideq*2], m2
471 mova [dstq+stride3q ], m3
472 lea dstq, [dstq+strideq*4] ; advance dst by four rows
; vp9_ipred_h_32x32(dst, stride, l): 3 arguments loaded, 5 general-purpose
; registers, 8 XMM registers. The left-pixel load, the shuffles that fill
; m0-m3, the loop control that uses cnt and the return are not visible in
; this excerpt.
478 cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
483 lea stride3q, [strideq*3] ; stride3 = 3 * stride
; One register per row, stored twice (byte offsets 0 and 16) to cover the
; 32-byte row: m0..m3 go to rows 0..3.
489 mova [dstq+strideq*0+ 0], m0
490 mova [dstq+strideq*0+16], m0
491 mova [dstq+strideq*1+ 0], m1
492 mova [dstq+strideq*1+16], m1
495 mova [dstq+strideq*2+ 0], m2
496 mova [dstq+strideq*2+16], m2
497 mova [dstq+stride3q + 0], m3
498 mova [dstq+stride3q +16], m3
499 lea dstq, [dstq+strideq*4] ; advance dst by four rows
; AVX2 variant of vp9_ipred_h_32x32, assembled only when
; HAVE_AVX2_EXTERNAL is set (the matching %endif is not visible here).
; The shuffles that derive m0-m2 (and the final m3), the loop control that
; uses cnt and the return are not visible in this excerpt.
508 %if HAVE_AVX2_EXTERNAL
510 cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
515 lea stride3q, [strideq*3] ; stride3 = 3 * stride
518 movd xm3, [lq+cntq*4] ; load 4 bytes from l at byte offset cnt*4
519 vinserti128 m3, m3, xm3, 1 ; duplicate the low 128-bit lane into the upper lane
; One register per row: m0..m3 go to rows 0..3.
522 mova [dstq+strideq*0], m0
523 mova [dstq+strideq*1], m1
526 mova [dstq+strideq*2], m2
527 mova [dstq+stride3q ], m3
528 lea dstq, [dstq+strideq*4] ; advance dst by four rows
; vp9_ipred_tm_4x4(dst, stride, l, a): 4 arguments, 4 general-purpose
; registers, 0 XMM registers declared. The top-row setup, the per-row
; arithmetic, the loop control that uses cnt and the return are not visible
; in this excerpt.
537 cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a
; The fourth argument register (a) is reused as cnt from here on.
541 DEFINE_ARGS dst, stride, l, cnt
549 pinsrw m2, [lq+cntq*2], 0 ; word 0 of m2 = two bytes of l at byte offset cnt*2
; Two rows per pass: low 4 bytes of m1 to row 0 and of m2 to row 1.
556 movd [dstq+strideq*0], m1
557 movd [dstq+strideq*1], m2
558 lea dstq, [dstq+strideq*2] ; advance dst by two rows
; TM_XMM_FUNCS: one-argument macro wrapping the XMM tm predictors below
; (presumably the argument selects the instruction set -- TODO confirm; the
; matching endmacro is not visible in this excerpt).
563 %macro TM_XMM_FUNCS 1
; 8x8 tm predictor: 4 arguments, 4 general-purpose registers, 5 XMM
; registers. The top-row setup, the per-row arithmetic, the loop control
; that uses cnt and the return are not visible here.
565 cglobal vp9_ipred_tm_8x8, 4, 4, 5, dst, stride, l, a
; The fourth argument register (a) is reused as cnt from here on.
569 DEFINE_ARGS dst, stride, l, cnt
577 pinsrw m2, [lq+cntq*2], 0 ; word 0 of m2 = two bytes of l at byte offset cnt*2
; m1 carries two 8-byte rows: low half to row 0, high half to row 1.
583 movh [dstq+strideq*0], m1
584 movhps [dstq+strideq*1], m1
585 lea dstq, [dstq+strideq*2] ; advance dst by two rows
; 16x16 tm predictor: 4 arguments, 4 general-purpose registers, 8 XMM
; registers. The top-row setup, the per-row arithmetic, the loop control
; that uses cnt and the return are not visible in this excerpt.
591 cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a
; The fourth argument register (a) is reused as cnt from here on.
595 DEFINE_ARGS dst, stride, l, cnt
605 pinsrw m7, [lq+cntq*2], 0 ; word 0 of m7 = two bytes of l at byte offset cnt*2
; Two rows per pass: m2 to row 0, m6 to row 1.
614 mova [dstq+strideq*0], m2
615 mova [dstq+strideq*1], m6
616 lea dstq, [dstq+strideq*2] ; advance dst by two rows
; 32x32 tm predictor: 4 arguments, 4 general-purpose registers, 14 XMM
; registers. The top-row setup, the per-row arithmetic, the loop control
; that uses cnt and the return are not visible in this excerpt.
623 cglobal vp9_ipred_tm_32x32, 4, 4, 14, dst, stride, l, a
; The fourth argument register (a) is reused as cnt from here on.
628 DEFINE_ARGS dst, stride, l, cnt
642 pinsrw m13, [lq+cntq*2], 0 ; word 0 of m13 = two bytes of l at byte offset cnt*2
; Two 32-byte rows per pass, each written as two 16-byte halves:
; row 0 = m4 | m6, row 1 = m10 | m12.
657 mova [dstq+strideq*0+ 0], m4
658 mova [dstq+strideq*0+16], m6
659 mova [dstq+strideq*1+ 0], m10
660 mova [dstq+strideq*1+16], m12
661 lea dstq, [dstq+strideq*2] ; advance dst by two rows
; AVX2 variant of vp9_ipred_tm_32x32, assembled only when
; HAVE_AVX2_EXTERNAL is set (the matching %endif is not visible here).
; The remaining top-row setup, the per-row arithmetic, the loop control that
; uses cnt and the return are not visible in this excerpt.
671 %if HAVE_AVX2_EXTERNAL
673 cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
675 pinsrw xm2, [aq-1], 0 ; word 0 of xm2 = the two bytes at a-1 and a+0
676 vinserti128 m2, m2, xm2, 1 ; duplicate the low 128-bit lane into the upper lane
; The fourth argument register (a) is reused as cnt from here on.
678 DEFINE_ARGS dst, stride, l, cnt
688 pinsrw xm7, [lq+cntq*2], 0 ; word 0 of xm7 = two bytes of l at byte offset cnt*2
689 vinserti128 m7, m7, xm7, 1 ; duplicate the low 128-bit lane into the upper lane
; Two rows per pass: m2 to row 0, m6 to row 1.
698 mova [dstq+strideq*0], m2
699 mova [dstq+strideq*1], m6
700 lea dstq, [dstq+strideq*2] ; advance dst by two rows
; LOWPASS: helper macro taking four register numbers. Per the inline note
; the first is the left input and also receives the result, followed by the
; center input, the right input and a scratch register. The macro body is
; not visible in this excerpt.
708 %macro LOWPASS 4 ; left [dst], center, right, tmp
; vp9_ipred_dl_4x4(dst, stride, l, a): 4 arguments, 4 general-purpose
; registers, 0 XMM registers declared. The load that fills m1, the filter
; step, the shifts/dst adjustment between the two store pairs and the return
; are not visible in this excerpt.
717 cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a
; Three-operand pshufb (x86inc form): the destination receives the bytes of
; m1 rearranged by the mask, m1 itself is left unchanged.
719 pshufb m0, m1, [pb_0to5_2x7]
; NOTE(review): pb_2to6_3x7 is not defined in the visible constant table;
; presumably it is declared outside this excerpt -- confirm.
720 pshufb m2, m1, [pb_2to6_3x7]
; First pair: low 4 bytes of m0 to row 0, of m1 to row 2.
725 movd [dstq+strideq*0], m0
726 movd [dstq+strideq*2], m1
; Second pair uses the same addressing; the instructions that update
; m0/m1 and dst in between are not visible here.
730 movd [dstq+strideq*0], m0
731 movd [dstq+strideq*2], m1
; DL_XMM_FUNCS: one-argument macro wrapping the XMM dl predictors below
; (presumably the argument selects the instruction set -- TODO confirm; the
; matching endmacro is not visible in this excerpt).
734 %macro DL_XMM_FUNCS 1
; 8x8 dl predictor. The third argument register is named stride5 directly
; (the left-edge pointer is not given a name here). The load that fills m0,
; the filter step, the register updates between stores and the return are
; not visible in this excerpt.
736 cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a
738 lea stride5q, [strideq*5] ; stride5 = 5 * stride, addresses row 5 directly
739 pshufb m1, m0, [pb_1to6_10x7] ; m1 = bytes of m0 rearranged by the mask
; Rows are written in pairs four rows apart: (0,4), (1,5) ...
745 movq [dstq+strideq*0], m0
746 movq [dstq+strideq*4], m1
749 movq [dstq+strideq*1], m0
750 movq [dstq+stride5q ], m1
751 lea dstq, [dstq+strideq*2] ; advance dst by two rows
; ... then (2,6), (3,7) relative to the original dst.
754 movq [dstq+strideq*0], m0
755 movq [dstq+strideq*4], m1
758 movq [dstq+strideq*1], m0
759 movq [dstq+stride5q ], m1
; 16x16 dl predictor: 4 arguments, 4 general-purpose registers, 6 XMM
; registers. The load that fills m0, the filter step, the per-row register
; updates, the loop control that uses cnt and the return are not visible in
; this excerpt.
763 cglobal vp9_ipred_dl_16x16, 4, 4, 6, dst, stride, l, a
764 mova m5, [pb_1toE_2xF] ; shuffle mask: shift left by one byte, repeating the last byte
; NOTE(review): pb_15 is not defined in the visible constant table;
; presumably it is declared outside this excerpt -- confirm.
768 pshufb m4, m0, [pb_15]
; The l and a argument registers are reused as cnt and stride9.
770 DEFINE_ARGS dst, stride, cnt, stride9
; stride9 = 9 * stride, built as (3 * stride) * 3.
771 lea stride9q, [strideq*3]
773 lea stride9q, [stride9q*3]
; Rows are written in pairs eight rows apart: m0 to rows 0 and 1, m4 to
; rows 8 and 9; updates to m0/m4 between the stores are not visible here.
777 mova [dstq+strideq*0], m0
779 mova [dstq+strideq*8], m4
781 mova [dstq+strideq*1], m0
783 mova [dstq+stride9q ], m4
784 lea dstq, [dstq+strideq*2] ; advance dst by two rows
; 32x32 dl predictor: 4 arguments loaded, 5 general-purpose registers
; (the third is named cnt, the fifth dst16), 8 XMM registers. The loads that
; fill m0/m1, the filter steps, the loop control that uses cnt, the
; conditional around the two palignr variants and the return are not
; visible in this excerpt.
790 cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16
791 mova m5, [pb_1toE_2xF] ; shuffle mask: shift left by one byte, repeating the last byte
; Bytes 1..16 and 2..17 of the 32-byte sequence m1:m0.
794 palignr m2, m1, m0, 1
795 palignr m3, m1, m0, 2
; NOTE(review): pb_15 is not defined in the visible constant table;
; presumably it is declared outside this excerpt -- confirm.
799 pshufb m6, m1, [pb_15]
; dst16 = dst + 16 * stride, built in two steps of 8 rows.
802 lea dst16q, [dstq +strideq*8]
804 lea dst16q, [dst16q+strideq*8]
; Row 0: m0 | m1.
807 mova [dstq +strideq*0+ 0], m0
808 mova [dstq +strideq*0+16], m1
; Row 8: high half of m0, low half of m1, then m7.
809 movhps [dstq+strideq*8+ 0], m0
810 movq [dstq +strideq*8+ 8], m1
811 mova [dstq +strideq*8+16], m7
; Row 16: m1 | m6.
812 mova [dst16q+strideq*0+ 0], m1
813 mova [dst16q+strideq*0+16], m6
; Row 24: m7 | m6.
814 mova [dst16q+strideq*8+ 0], m7
815 mova [dst16q+strideq*8+16], m6
; Two ways to shift the m1:m0 sequence by one byte; the selecting
; conditional is not visible in this excerpt.
817 vpalignr m0, m1, m0, 1
820 palignr m2, m1, m0, 1
; vp9_ipred_dr_4x4(dst, stride, l, a): 4 arguments, 4 general-purpose
; registers, 0 XMM registers declared. The edge loads, the filter step, the
; shifts between the row stores and the return are not visible here.
837 cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a
; From here on the third argument register is reused as stride3.
841 DEFINE_ARGS dst, stride, stride3
842 lea stride3q, [strideq*3] ; stride3 = 3 * stride
; Rows are written bottom-up (3, 2, 1, 0), 4 bytes each from the low end of
; m0.
847 movd [dstq+stride3q ], m0
849 movd [dstq+strideq*2], m0
851 movd [dstq+strideq*1], m0
853 movd [dstq+strideq*0], m0
; DR_XMM_FUNCS: one-argument macro wrapping the XMM dr predictors below
; (presumably the argument selects the instruction set -- TODO confirm; the
; matching endmacro is not visible in this excerpt).
856 %macro DR_XMM_FUNCS 1
; 8x8 dr predictor: 4 arguments, 4 general-purpose registers, 4 XMM
; registers. The edge loads, the filter step, the shifts between the row
; stores and the return are not visible in this excerpt.
858 cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a
; From here on the third argument register is reused as stride3.
862 DEFINE_ARGS dst, stride, stride3
863 lea stride3q, [strideq*3] ; stride3 = 3 * stride
; Every row takes the high 8 bytes of m0. Rows 0-3 ...
868 movhps [dstq+strideq*0], m0
870 movhps [dstq+strideq*1], m0
872 movhps [dstq+strideq*2], m0
874 movhps [dstq+stride3q ], m0
876 lea dstq, [dstq+strideq*4] ; advance dst by four rows
; ... then rows 4-7.
877 movhps [dstq+strideq*0], m0
879 movhps [dstq+strideq*1], m0
881 movhps [dstq+strideq*2], m0
883 movhps [dstq+stride3q ], m0
; 16x16 dr predictor: 4 arguments, 4 general-purpose registers, 6 XMM
; registers. The edge loads, the filter steps, the per-row register
; updates, the loop control that uses cnt and the return are not visible in
; this excerpt.
887 cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a
; The l and a argument registers are reused as stride9 and cnt.
891 DEFINE_ARGS dst, stride, stride9, cnt
; stride9 = 9 * stride, built as (3 * stride) * 3.
892 lea stride9q, [strideq *3]
894 lea stride9q, [stride9q*3]
; m3 = last byte of m1 followed by the first 15 bytes of m2.
896 palignr m3, m2, m1, 15
; Rows are written in pairs eight rows apart. The upper row takes m3; the
; lower row takes the high half of m0 followed by the low half of m3.
903 mova [dstq+strideq*0 ], m3
904 movhps [dstq+strideq*8+0], m0
905 movq [dstq+strideq*8+8], m3
908 mova [dstq+strideq*1 ], m3
909 movhps [dstq+stride9q +0], m0
910 movq [dstq+stride9q +8], m3
913 lea dstq, [dstq+strideq*2] ; advance dst by two rows
; 32x32 dr predictor: 4 arguments, 4 general-purpose registers, 8 XMM
; registers. The edge loads, the filter steps that consume m6, the remaining
; row stores, the loop control that uses cnt and the return are not visible
; in this excerpt.
919 cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a
; The l and a argument registers are reused as stride8 and cnt.
925 DEFINE_ARGS dst, stride, stride8, cnt
926 lea stride8q, [strideq*8] ; stride8 = 8 * stride
; Each palignr builds the right-hand register shifted down by one byte, with
; the last byte of the left-hand neighbour shifted in at position 0.
928 palignr m6, m4, m3, 15
931 palignr m6, m3, m2, 15
934 palignr m6, m2, m1, 15
; Row 0 = m4 | m5; row 16 (stride8*2) = m3 | m4.
943 mova [dstq+stride8q*0+ 0], m4
944 mova [dstq+stride8q*0+16], m5
945 mova [dstq+stride8q*2+ 0], m3
946 mova [dstq+stride8q*2+16], m4
; vp9_ipred_vl_4x4(dst, stride, l, a): 4 arguments, 4 general-purpose
; registers, 0 XMM registers declared. The edge load, the averaging/filter
; steps, the shifts between the store pairs and the return are not visible
; in this excerpt.
963 cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a
; Rows alternate between m1 and m2, 4 bytes each.
969 movd [dstq+strideq*0], m1
970 movd [dstq+strideq*1], m2
971 lea dstq, [dstq+strideq*2] ; advance dst by two rows
974 movd [dstq+strideq*0], m1
975 movd [dstq+strideq*1], m2
; VL_XMM_FUNCS: one-argument macro wrapping the XMM vl predictors below
; (presumably the argument selects the instruction set -- TODO confirm; the
; matching endmacro is not visible in this excerpt).
978 %macro VL_XMM_FUNCS 1
; 8x8 vl predictor: 4 arguments, 4 general-purpose registers, 4 XMM
; registers. The edge load, the averaging/filter steps, the shifts between
; the store pairs and the return are not visible in this excerpt.
980 cglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a
982 pshufb m0, [pb_0to6_9x7] ; rearrange m0 in place with the mask
; From here on the third argument register is reused as stride3.
983 DEFINE_ARGS dst, stride, stride3
984 lea stride3q, [strideq*3] ; stride3 = 3 * stride
; Rows alternate between m1 (even rows) and m2 (odd rows), 8 bytes each.
990 movq [dstq+strideq*0], m1
991 movq [dstq+strideq*1], m2
994 movq [dstq+strideq*2], m1
995 movq [dstq+stride3q ], m2
996 lea dstq, [dstq+strideq*4] ; advance dst by four rows
999 movq [dstq+strideq*0], m1
1000 movq [dstq+strideq*1], m2
1003 movq [dstq+strideq*2], m1
1004 movq [dstq+stride3q ], m2
; 16x16 vl predictor: 4 arguments, 4 general-purpose registers, 5 XMM
; registers. The edge load, the averaging/filter steps, the per-row register
; updates, the loop control that uses cnt and the return are not visible in
; this excerpt.
1008 cglobal vp9_ipred_vl_16x16, 4, 4, 5, dst, stride, l, a
1010 mova m4, [pb_1toE_2xF] ; shuffle mask: shift left by one byte, repeating the last byte
; The l and a argument registers are reused as stride3 and cnt.
1011 DEFINE_ARGS dst, stride, stride3, cnt
1012 lea stride3q, [strideq*3] ; stride3 = 3 * stride
; Rows alternate between m1 (even rows) and m2 (odd rows).
1019 mova [dstq+strideq*0], m1
1020 mova [dstq+strideq*1], m2
1023 mova [dstq+strideq*2], m1
1024 mova [dstq+stride3q ], m2
1027 lea dstq, [dstq+strideq*4] ; advance dst by four rows
; 32x32 vl predictor: 4 arguments, 4 general-purpose registers, 7 XMM
; registers. The edge loads, the averaging/filter steps, the definition and
; invocations of the inner row-writing macro, the loop control that uses cnt
; and the return are not visible in this excerpt.
1033 cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a
1036 mova m4, [pb_1toE_2xF] ; shuffle mask: shift left by one byte, repeating the last byte
; The l and a argument registers are reused as dst16 and cnt.
1037 DEFINE_ARGS dst, stride, dst16, cnt
; Bytes 1..16 and 2..17 of the 32-byte sequence m5:m0.
1038 palignr m2, m5, m0, 1
1039 palignr m3, m5, m0, 2
; dst16 = dst + 16 * stride, built in two steps of 8 rows.
1040 lea dst16q, [dstq +strideq*8]
1045 lea dst16q, [dst16q+strideq*8]
; The next lines use macro parameters, so they presumably belong to an inner
; helper macro whose header is not visible -- TODO confirm. They write one
; row at dst (two 16-byte halves) and one row at dst16 (8 + 16 + 8 bytes,
; the last 8 taken from m5), then shift the register pair by one byte.
1053 mova [dstq+stride%1+ 0], %2
1054 mova [dstq+stride%1+16], %3
1055 movhps [dst16q+stride%1 ], %2
1056 movu [dst16q+stride%1+ 8], %3
1057 movq [dst16q+stride%1+24], m5
1059 palignr %2, %3, %2, 1
1062 palignr m6, %3, %2, 1
; Advance both row pointers by two rows.
1070 lea dstq, [dstq +strideq*2]
1071 lea dst16q, [dst16q+strideq*2]
; vp9_ipred_vr_4x4(dst, stride, l, a): 4 arguments, 4 general-purpose
; registers, 0 XMM registers declared. The edge loads, the averaging/filter
; steps, one shift between the stores and the return are not visible in this
; excerpt; part of the layout diagram below is also missing.
1083 cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a
; From here on the third argument register is reused as stride3.
1087 DEFINE_ARGS dst, stride, stride3
1088 lea stride3q, [strideq*3] ; stride3 = 3 * stride
1095 ; ABCD <- for the following predictor:
1097 ; IABC | m0 contains ABCDxxxx
1098 ; JEFG | m2 contains xJIEFGHx
; Mask bytes of -1 (the "xm1" part of the mask names) zero the corresponding
; destination bytes.
1101 pshufb m2, [pb_13456_3xm1]
1102 movd [dstq+strideq*0], m0
1103 pshufb m0, [pb_6012_4xm1]
1104 movd [dstq+stride3q ], m2
1106 movd [dstq+strideq*2], m0
1107 movd [dstq+strideq*1], m2
; VR_XMM_FUNCS: one-argument macro wrapping the XMM vr predictors below
; (presumably the argument selects the instruction set -- TODO confirm; the
; matching endmacro is not visible in this excerpt).
1110 %macro VR_XMM_FUNCS 1
; 8x8 vr predictor: 4 arguments, 4 general-purpose registers, 5 XMM
; registers. The edge loads, the averaging/filter steps, the shifts between
; the later stores and the return are not visible in this excerpt; part of
; the layout diagram below is also missing.
1112 cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a
; From here on the third argument register is reused as stride3.
1116 DEFINE_ARGS dst, stride, stride3
1117 lea stride3q, [strideq*3] ; stride3 = 3 * stride
1124 ; ABCDEFGH <- for the following predictor:
1126 ; QABCDEFG | m0 contains ABCDEFGHxxxxxxxx
1127 ; RIJKLMNO | m1 contains xxVUTSRQIJKLMNOP
1133 punpcklqdq m0, m1 ; ABCDEFGHxxVUTSRQ
1134 movq [dstq+strideq*0], m0
1135 pshufb m0, [pb_6xm1_BDF_0to6] ; xxxxxxUSQABCDEFG
1136 movhps [dstq+strideq*1], m1
1137 pshufb m1, [pb_6xm1_246_8toE] ; xxxxxxVTRIJKLMNO
; From here every row takes the high 8 bytes of m0 (even rows) or m1 (odd
; rows).
1138 movhps [dstq+strideq*2], m0
1140 movhps [dstq+stride3q ], m1
1141 lea dstq, [dstq+strideq*4] ; advance dst by four rows
1143 movhps [dstq+strideq*0], m0
1145 movhps [dstq+strideq*1], m1
1147 movhps [dstq+strideq*2], m0
1148 movhps [dstq+stride3q ], m1
; 16x16 vr predictor: 4 arguments, 4 general-purpose registers, 6 XMM
; registers. The edge loads, the averaging/filter steps, the per-pass
; register updates, the loop control that uses cnt and the return are not
; visible in this excerpt.
1152 cglobal vp9_ipred_vr_16x16, 4, 4, 6, dst, stride, l, a
; The l and a argument registers are reused as stride3 and cnt.
1156 DEFINE_ARGS dst, stride, stride3, cnt
1157 lea stride3q, [strideq*3] ; stride3 = 3 * stride
; m3 = last byte of m2 followed by the first 15 bytes of m1.
1158 palignr m3, m1, m2, 15
; De-interleave m1: even-indexed bytes to the low half, odd-indexed bytes to
; the high half.
1164 pshufb m1, [pb_02468ACE_13579BDF]
; Rows 0 and 1 take m0 and m3 as they are; rows 2 and 3 take the same data
; shifted down one byte, with the last byte of m1 / m2 shifted in.
1169 mova [dstq+strideq*0], m0
1170 mova [dstq+strideq*1], m3
1171 palignr m4, m0, m1, 15
1172 palignr m5, m3, m2, 15
1173 mova [dstq+strideq*2], m4
1174 mova [dstq+stride3q ], m5
1175 lea dstq, [dstq+strideq*4] ; advance dst by four rows
; 32x32 vr predictor: 4 arguments, 4 general-purpose registers, 9 XMM
; registers. The edge loads, the averaging/filter steps, the header of the
; inner row-writing macro, the per-pass register updates, the loop control
; that uses cnt and the return are not visible in this excerpt.
1185 cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a
; palignr with count 15 / 14 yields the second source's last one / two
; bytes followed by the leading bytes of the first source.
1189 palignr m3, m2, m0, 15
1190 palignr m4, m2, m0, 14
1194 palignr m5, m1, m2, 15
1199 palignr m7, m2, m6, 15
; De-interleave m1 and m2: even-indexed bytes to the low half, odd-indexed
; bytes to the high half.
1204 pshufb m1, [pb_02468ACE_13579BDF]
1205 pshufb m2, [pb_02468ACE_13579BDF]
; The l and a argument registers are reused as dst16 and cnt.
1206 DEFINE_ARGS dst, stride, dst16, cnt
; dst16 = dst + 16 * stride, built in two steps of 8 rows.
1207 lea dst16q, [dstq +strideq*8]
1208 lea dst16q, [dst16q+strideq*8]
; SBUTTERFLY (x86util.asm) with qdq: m2 receives the low qwords of m2 and
; m1, m1 receives their high qwords; m6 is scratch.
1209 SBUTTERFLY qdq, 2, 1, 6
1213 ; even lines (0, 2, 4, ...): m1 | m0, m3
1214 ; odd lines (1, 3, 5, ...): m2 | m5, m4
; Body of the inner row-writing macro (its header is not visible here).
; Per the invocations below, the first parameter is the stride suffix and
; the remaining three are registers. It writes one row at dst (two 16-byte
; halves) and one row at dst16 (8 + 16 + 8 bytes).
1216 mova [dstq+stride%1+ 0], %3
1217 mova [dstq+stride%1+16], %4
1218 movhps [dst16q+stride%1 ], %2
1219 movu [dst16q+stride%1+ 8], %3
1220 movq [dst16q+stride%1+24], %4
; Even row from m1 | m0, m3; odd row from m2 | m5, m4.
1226 %%write q*0, m1, m0, m3
1227 %%write q*1, m2, m5, m4
; Advance both row pointers by two rows.
1228 lea dstq, [dstq +strideq*2]
1229 lea dst16q, [dst16q+strideq*2]
; vp9_ipred_hd_4x4(dst, stride, l, a): 4 arguments, 4 general-purpose
; registers, 0 XMM registers declared. The left-edge load, the
; averaging/filter steps, the shifts between the store pairs and the return
; are not visible in this excerpt; part of the layout diagram is missing too.
1242 cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a
; Interleave the low dword of m0 with the 4 bytes starting at a-1.
1244 punpckldq m0, [aq-1]
; From here on the third argument register is reused as stride3.
1245 DEFINE_ARGS dst, stride, stride3
1246 lea stride3q, [strideq*3] ; stride3 = 3 * stride
1252 ; DHIJ <- for the following predictor:
1254 ; BFCG | m1 contains ABCDxxxx
1255 ; AEBF | m2 contains EFGHIJxx
; Three-operand punpckhdq (x86inc form): result goes to m0, m1 is preserved.
1258 punpckhdq m0, m1, m2
1260 ; m1 contains AEBFCGDH
1261 ; m0 contains CGDHIJxx
; Rows are written bottom-up: 3 and 1 first, then 2 and 0.
1263 movd [dstq+stride3q ], m1
1264 movd [dstq+strideq*1], m0
1267 movd [dstq+strideq*2], m1
1268 movd [dstq+strideq*0], m0
; HD_XMM_FUNCS: one-argument macro wrapping the XMM hd predictors below
; (presumably the argument selects the instruction set -- TODO confirm; the
; matching endmacro is not visible in this excerpt).
1271 %macro HD_XMM_FUNCS 1
; 8x8 hd predictor: 4 arguments, 4 general-purpose registers, 4 XMM
; registers. The edge loads, the averaging/filter steps, the last shift
; before the row-0 stores and the return are not visible in this excerpt;
; part of the layout diagram is missing too.
1273 cglobal vp9_ipred_hd_8x8, 4, 4, 4, dst, stride, l, a
; The l and a argument registers are reused as stride3 and dst4.
1276 DEFINE_ARGS dst, stride, stride3, dst4
1277 lea stride3q, [strideq*3] ; stride3 = 3 * stride
1278 lea dst4q, [dstq+strideq*4] ; dst4 = start of row 4
1284 ; HPQRSTUV <- for the following predictor
1286 ; FNGOHPQR | m1 contains ABCDEFGHxxxxxxxx
1287 ; EMFNGOHP | m2 contains IJKLMNOPQRSTUVxx
1296 ; m1 contains AIBJCKDLEMFNGOHP
1297 ; m2 contains QRSTUVxxxxxxxxxx
; Each register feeds two rows four apart: the high half goes to the upper
; row (via dst), the low half to the lower row (via dst4). Rows are written
; bottom-up, shifting the m2:m1 sequence by two more bytes per step.
1299 movhps [dstq +stride3q ], m1
1300 movq [dst4q+stride3q ], m1
1301 palignr m3, m2, m1, 2
1302 movhps [dstq +strideq*2], m3
1303 movq [dst4q+strideq*2], m3
1304 palignr m3, m2, m1, 4
1305 movhps [dstq +strideq*1], m3
1306 movq [dst4q+strideq*1], m3
1308 movhps [dstq +strideq*0], m2
1309 movq [dst4q+strideq*0], m2
; 16x16 hd predictor: 4 arguments loaded, 6 general-purpose registers,
; 7 XMM registers. The edge loads, the averaging/filter steps, the loop
; control, the conditional around the two palignr pairs and the return are
; not visible in this excerpt.
1313 cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a
; The l and a argument registers are reused as stride4 and dst4; two more
; registers hold dst8 and dst12.
1316 DEFINE_ARGS dst, stride, stride4, dst4, dst8, dst12
; Row-group pointers, four rows apart.
1317 lea stride4q, [strideq*4]
1318 lea dst4q, [dstq +stride4q]
1319 lea dst8q, [dst4q+stride4q]
1320 lea dst12q, [dst8q+stride4q]
; Bytes 1..16 and 2..17 of the 32-byte sequence m3:m0.
1324 palignr m1, m3, m0, 1
1325 palignr m2, m3, m0, 2
; SBUTTERFLY (x86util.asm) with bw: byte-interleave m1 and m2; the low
; interleave ends up in m1, the high interleave in m2; m6 is scratch.
1328 SBUTTERFLY bw, 1, 2, 6
1330 ; I PROBABLY INVERTED L0 and L16 here
; stride4 now holds 3 * stride, so the stores below hit the last row of
; each four-row group.
1333 sub stride4q, strideq
1334 movhps [dstq +stride4q +0], m2
1335 movq [dstq +stride4q +8], m5
1336 mova [dst4q+stride4q ], m2
1337 movhps [dst8q+stride4q +0], m1
1338 movq [dst8q+stride4q +8], m2
1339 mova [dst12q+stride4q ], m1
; Two alternative ways of shifting the m5:m2:m1 sequence by two bytes; the
; selecting conditional is not visible in this excerpt.
1341 palignr m1, m2, m1, 2
1342 palignr m2, m5, m2, 2
1344 palignr m3, m2, m1, 2
1345 palignr m0, m5, m2, 2
; 32x32 hd predictor: 4 arguments loaded, 6 general-purpose registers,
; 8 XMM registers. The edge loads, the averaging/filter steps, the loop
; control and the return are not visible in this excerpt.
1354 cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a
; The l and a argument registers are reused as stride8 and dst8; two more
; registers hold dst16 and dst24.
1359 DEFINE_ARGS dst, stride, stride8, dst8, dst16, dst24
; Row-group pointers, eight rows apart.
1360 lea stride8q, [strideq*8]
1361 lea dst8q, [dstq +stride8q]
1362 lea dst16q, [dst8q +stride8q]
1363 lea dst24q, [dst16q+stride8q]
; Byte-shifted views of adjacent register pairs (inputs to filter steps that
; are not visible here).
1367 palignr m4, m3, m2, 2
1370 palignr m3, m2, m1, 2
1374 palignr m6, m1, m0, 1
; SBUTTERFLY (x86util.asm) with bw: byte-interleave each register pair; the
; low interleave stays in the first register, the high one in the second;
; m6 is scratch.
1378 SBUTTERFLY bw, 2, 3, 6
1379 SBUTTERFLY bw, 0, 1, 6
1381 ; m0, m1, m2, m3, m4, m5
; stride8 now holds 7 * stride, so the stores below hit the last row of each
; eight-row group.
1383 sub stride8q, strideq
1384 mova [dstq +stride8q+ 0], m3
1385 mova [dstq +stride8q+16], m4
1386 mova [dst8q +stride8q+ 0], m2
1387 mova [dst8q +stride8q+16], m3
1388 mova [dst16q+stride8q+ 0], m1
1389 mova [dst16q+stride8q+16], m2
1390 mova [dst24q+stride8q+ 0], m0
1391 mova [dst24q+stride8q+16], m1
; Shift the whole m5:m4:m3:m2:m1:m0 sequence down by two bytes.
1393 palignr m0, m1, m0, 2
1394 palignr m1, m2, m1, 2
1395 palignr m2, m3, m2, 2
1396 palignr m3, m4, m3, 2
1397 palignr m4, m5, m4, 2
; vp9_ipred_hu_4x4(dst, stride, l): 3 arguments, 3 general-purpose
; registers, 0 XMM registers declared. The left-edge load, the
; averaging/filter steps, the shifts between the store pairs and the return
; are not visible in this excerpt.
1421 cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l
1423 pshufb m0, [pb_3to1_5x0] ; rearrange m0 in place with the mask
; From here on the third argument register is reused as stride3.
1428 DEFINE_ARGS dst, stride, stride3
1429 lea stride3q, [strideq*3] ; stride3 = 3 * stride
; SBUTTERFLY (x86util.asm) with bw: byte-interleave m1 and m2; the low
; interleave ends up in m1, the high interleave in m2; m0 is scratch.
1430 SBUTTERFLY bw, 1, 2, 0
; Rows 0 and 2 take the low 4 bytes of m1, rows 1 and 3 those of m2.
1432 movd [dstq+strideq*0], m1
1433 movd [dstq+strideq*1], m2
1436 movd [dstq+strideq*2], m1
1437 movd [dstq+stride3q ], m2
; HU_XMM_FUNCS: one-argument macro wrapping the XMM hu predictors below
; (presumably the argument selects the instruction set -- TODO confirm; the
; matching endmacro is not visible in this excerpt).
1440 %macro HU_XMM_FUNCS 1
; 8x8 hu predictor: 3 arguments loaded, 4 general-purpose registers, 4 XMM
; registers. The left-edge load, the averaging/filter steps, the last shift
; before the row-3 stores and the return are not visible in this excerpt.
1442 cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l
1444 pshufb m0, [pb_7to1_9x0] ; rearrange m0 in place with the mask
; The l argument register is reused as stride3; a fourth register holds dst4.
1449 DEFINE_ARGS dst, stride, stride3, dst4
1450 lea stride3q, [strideq*3] ; stride3 = 3 * stride
1451 lea dst4q, [dstq+strideq*4] ; dst4 = start of row 4
; SBUTTERFLY (x86util.asm) with bw: byte-interleave m1 and m2; the low
; interleave ends up in m1, the high interleave in m2; m0 is scratch.
1452 SBUTTERFLY bw, 1, 2, 0
; Each register feeds two rows four apart: low half to the upper row (via
; dst), high half to the lower row (via dst4). The m2:m1 sequence is shifted
; by two more bytes per row.
1453 movq [dstq +strideq*0], m1
1454 movhps [dst4q+strideq*0], m1
1455 palignr m0, m2, m1, 2
1456 movq [dstq +strideq*1], m0
1457 movhps [dst4q+strideq*1], m0
1458 palignr m0, m2, m1, 4
1459 movq [dstq +strideq*2], m0
1460 movhps [dst4q+strideq*2], m0
1462 movq [dstq +stride3q ], m2
1463 movhps [dst4q+stride3q ], m2
; 16x16 hu predictor: 3 arguments loaded, 4 general-purpose registers,
; 5 XMM registers. The left-edge load, the averaging/filter steps, the
; shuffles of m2 between stores, the loop control that uses cnt and the
; return are not visible in this excerpt.
1467 cglobal vp9_ipred_hu_16x16, 3, 4, 5, dst, stride, l
1469 pshufb m0, [pb_Fto0] ; reverse the 16 bytes of m0
1470 mova m3, [pb_2toE_3xF] ; shuffle mask: shift left by two bytes, repeating the last byte
1471 pshufb m1, m0, [pb_1toE_2xF] ; m1 = m0 shifted left by one byte, last byte repeated
; The l argument register is reused as stride9; a fourth register holds cnt.
1475 DEFINE_ARGS dst, stride, stride9, cnt
; stride9 = 9 * stride, built as (3 * stride) * 3.
1476 lea stride9q, [strideq *3]
1478 lea stride9q, [stride9q*3]
; SBUTTERFLY (x86util.asm) with bw: byte-interleave m1 and m2; the low
; interleave ends up in m1, the high interleave in m2; m0 is scratch.
1479 SBUTTERFLY bw, 1, 2, 0
; Rows are written in pairs eight rows apart: rows 0 and 8, then 1 and 9.
1482 mova [dstq+strideq*0], m1
1483 mova [dstq+strideq*8], m2
1484 palignr m0, m2, m1, 2 ; m2:m1 sequence shifted down by two bytes
1486 mova [dstq+strideq*1], m0
1487 mova [dstq+stride9q ], m2
1488 palignr m1, m2, m0, 2 ; shift by two more bytes for the next pass
1490 lea dstq, [dstq+strideq*2] ; advance dst by two rows
; 32x32 hu predictor: 3 arguments loaded, 7 general-purpose registers,
; 7 XMM registers. The left-edge loads, the averaging/filter steps, the
; remaining per-row register updates, the loop control that uses cnt and the
; return are not visible in this excerpt.
1496 cglobal vp9_ipred_hu_32x32, 3, 7, 7, dst, stride, l
1500 mova m4, [pb_2toE_3xF] ; shuffle mask: shift left by two bytes, repeating the last byte
; Bytes 1..16 and 2..17 of the 32-byte sequence m0:m1.
1503 palignr m2, m0, m1, 1
1504 palignr m3, m0, m1, 2
1508 pshufb m5, m0, [pb_1toE_2xF] ; m5 = m0 shifted left by one byte, last byte repeated
; The l argument register is reused as cnt; four more registers hold the
; running row offset stride0 and the row-group pointers dst8/dst16/dst24.
1511 DEFINE_ARGS dst, stride, cnt, stride0, dst8, dst16, dst24
1513 xor stride0q, stride0q ; row offset starts at 0
1514 lea dst8q, [dstq +strideq*8]
1515 lea dst16q, [dst8q +strideq*8]
1516 lea dst24q, [dst16q+strideq*8]
; SBUTTERFLY (x86util.asm) with bw: byte-interleave each register pair; the
; low interleave stays in the first register, the high one in the second;
; m5 is scratch.
1517 SBUTTERFLY bw, 0, 1, 5
1518 SBUTTERFLY bw, 2, 3, 5
; NOTE(review): pb_15 is not defined in the visible constant table;
; presumably it is declared outside this excerpt -- confirm.
1519 pshufb m6, m1, [pb_15]
; One row in each eight-row group at the current row offset:
; m2|m3, m3|m0, m0|m1, m1|m6.
1522 mova [dstq +stride0q+ 0], m2
1523 mova [dstq +stride0q+16], m3
1524 mova [dst8q +stride0q+ 0], m3
1525 mova [dst8q +stride0q+16], m0
1526 mova [dst16q+stride0q+ 0], m0
1527 mova [dst16q+stride0q+16], m1
1528 mova [dst24q+stride0q+ 0], m1
1529 mova [dst24q+stride0q+16], m6
; Shift the m1:m0:m3:m2 sequence down by two bytes (the update of m1 is not
; visible in this excerpt).
1531 palignr m2, m3, m2, 2
1532 palignr m3, m0, m3, 2
1533 palignr m0, m1, m0, 2
1545 add stride0q, strideq ; move the row offset to the next row
1554 ; FIXME 127, 128, 129 ?