1 ;*****************************************************************************
2 ;* Copyright (C) 2013 x265 project
4 ;* Authors: Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
5 ;* Yuvaraj Venkatesh <yuvaraj@multicorewareinc.com>
6 ;* Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
8 ;* This program is free software; you can redistribute it and/or modify
9 ;* it under the terms of the GNU General Public License as published by
10 ;* the Free Software Foundation; either version 2 of the License, or
11 ;* (at your option) any later version.
13 ;* This program is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ;* GNU General Public License for more details.
18 ;* You should have received a copy of the GNU General Public License
19 ;* along with this program; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 ;* This program is also available under a commercial proprietary license.
23 ;* For more information, contact us at license @ x265.com.
24 ;*****************************************************************************/
27 %include "x86util.asm"
; Byte-index tables, 16 bytes each, emitted through the `const` macro.
; Non-filler entries come in (2k, 2k+1) pairs, so each pair addresses one
; 16-bit word of a source register; `0, 0` pairs look like don't-care filler.
; NOTE(review): the names suggest pshufb masks for particular angular modes,
; but their users are outside this view -- confirm before relying on that.
; shuf_mode32_18 and pw_swap16 hold identical bytes (a full word reversal), and
; c_mode32_10_0 and pw_unpackwdq are both eight repetitions of bytes 0,1.
38 const shuf_mode_13_23, db 0, 0, 14, 15, 6, 7, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
39 const shuf_mode_14_22, db 14, 15, 10, 11, 4, 5, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
40 const shuf_mode_15_21, db 12, 13, 8, 9, 4, 5, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
41 const shuf_mode_16_20, db 2, 3, 0, 1, 14, 15, 12, 13, 8, 9, 6, 7, 2, 3, 0, 1
42 const shuf_mode_17_19, db 0, 1, 14, 15, 12, 13, 10, 11, 6, 7, 4, 5, 2, 3, 0, 1
43 const shuf_mode32_18, db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
44 const pw_punpcklwd, db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
45 const c_mode32_10_0, db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
47 const pw_unpackwdq, times 8 db 0,1
48 const pw_ang8_12, db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 0, 1
49 const pw_ang8_13, db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 8, 9, 0, 1
50 const pw_ang8_14, db 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 10, 11, 4, 5, 0, 1
51 const pw_ang8_15, db 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 8, 9, 4, 5, 0, 1
52 const pw_ang8_16, db 0, 0, 0, 0, 0, 0, 12, 13, 10, 11, 6, 7, 4, 5, 0, 1
53 const pw_ang8_17, db 0, 0, 14, 15, 12, 13, 10, 11, 8, 9, 4, 5, 2, 3, 0, 1
54 const pw_swap16, db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
56 const pw_ang16_13, db 14, 15, 8, 9, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
57 const pw_ang16_16, db 0, 0, 0, 0, 0, 0, 10, 11, 8, 9, 6, 7, 2, 3, 0, 1
74 ;-------------------------------------------------------------------------------------------------------
75 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
76 ;-------------------------------------------------------------------------------------------------------
; DC prediction entry point for the 4-sample block size (per its name).
; cglobal operands are in x86inc order: 4 args, 6 GPRs, 2 vector registers.
; Register roles follow the prototype comment above: r0 = dst, r1 = dstStride,
; r2 = left, r3 = above. Several lines of this routine are not visible in this
; view, so the notes below only describe what is shown.
78 cglobal intra_pred_dc4, 4,6,2
83 movh m0, [r3] ; sumAbove
84 movh m1, [r2] ; sumLeft
89 phaddw m0, m0 ; m0 = sum
93 pmulhrsw m0, [pw_4096] ; m0 = (sum + 4) / 8
94 movd r4d, m0 ; r4d = dc_val
96 pshuflw m0, m0, 0 ; m0 = word [dc_val ...]
; Fill rows with the broadcast DC value; the stores shown reach byte offsets
; r1*2, r1*4 and r1*6 from dst.
100 movh [r0 + r1 * 2], m0
101 movh [r0 + r1 * 4], m0
102 lea r5, [r0 + r1 * 4]
103 movh [r5 + r1 * 2], m0
; Edge-filter arithmetic: build DC * 3 + 2 in r4d from the scalar DC value.
107 lea r5d, [r4d * 2 + 2] ; r5d = DC * 2 + 2
108 add r4d, r5d ; r4d = DC * 3 + 2
110 pshuflw m0, m0, 0 ; m0 = pixDCx3
116 movh [r0], m1 ; overwrite top-left pixel, we will update it later
127 lea r0, [r0 + r1 * 2]
; Single-word stores of filtered values: one from r3w, one from word 2 of m1.
134 mov [r0 + r1 * 2], r3w
135 pextrw [r0 + r1 * 4], m1, 2
143 ;-------------------------------------------------------------------------------------------------------
144 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
145 ;-------------------------------------------------------------------------------------------------------
; DC prediction entry point for the 8-sample block size (per its name).
; cglobal: 4 args, 7 GPRs, 2 vector registers. Lines that load and sum the
; neighbours are not visible in this view.
147 cglobal intra_pred_dc8, 4, 7, 2
163 shr r5d, 4 ; sum = sum / 16
165 pshuflw m1, m1, 0 ; m1 = word [dc_val ...]
; Fill: each visible store writes m1 at r0 + r1*2, then r0 advances by r1*2.
174 movu [r0 + r1 * 2], m1
175 lea r0, [r0 + r1 * 2]
177 movu [r0 + r1 * 2], m1
178 lea r0, [r0 + r1 * 2]
180 movu [r0 + r1 * 2], m1
181 lea r0, [r0 + r1 * 2]
; Edge-filter arithmetic: build DC * 3 + 2 in r5d.
186 lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2
187 add r5d, r4d ; r5d = DC * 3 + 2
189 pshuflw m1, m1, 0 ; m1 = pixDCx3
; Scatter words 1..6 of m0 as single 16-bit stores, walking r6 forward by
; r1*2 after every second store.
212 pextrw [r6 + r1], m0, 1
213 pextrw [r6 + r1 * 2], m0, 2
214 lea r6, [r6 + r1 * 2]
215 pextrw [r6 + r1], m0, 3
216 pextrw [r6 + r1 * 2], m0, 4
217 lea r6, [r6 + r1 * 2]
218 pextrw [r6 + r1], m0, 5
219 pextrw [r6 + r1 * 2], m0, 6
225 ;-------------------------------------------------------------------------------------------------------
226 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
227 ;-------------------------------------------------------------------------------------------------------
; DC prediction entry point for the 16-sample block size (per its name).
; cglobal: 4 args, 7 GPRs, 4 vector registers. Lines that load and sum the
; neighbours are not visible in this view.
229 cglobal intra_pred_dc16, 4, 7, 4
249 shr r5d, 5 ; sum = sum / 32 (right shift by 5)
251 pshuflw m1, m1, 0 ; m1 = word [dc_val ...]
; Fill: the visible stores write m1 at byte offset 16 of the row at r0 + r1;
; r0 then advances by r1*2. The stores for the other row parts are not shown.
261 movu [r0 + 16 + r1], m1
262 lea r0, [r0 + r1 * 2]
266 movu [r0 + 16 + r1], m1
267 lea r0, [r0 + r1 * 2]
271 movu [r0 + 16 + r1], m1
272 lea r0, [r0 + r1 * 2]
276 movu [r0 + 16 + r1], m1
277 lea r0, [r0 + r1 * 2]
281 movu [r0 + 16 + r1], m1
282 lea r0, [r0 + r1 * 2]
286 movu [r0 + 16 + r1], m1
287 lea r0, [r0 + r1 * 2]
291 movu [r0 + 16 + r1], m1
292 lea r0, [r0 + r1 * 2]
296 movu [r0 + 16 + r1], m1
; Edge-filter arithmetic: build DC * 3 + 2 in r5d.
300 lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2
301 add r5d, r4d ; r5d = DC * 3 + 2
303 pshuflw m1, m1, 0 ; m1 = pixDCx3
; Scatter the odd-numbered words of m2, then of m3, as single 16-bit stores at
; r6 + r1, advancing r6 by r1*2 between them.
331 pextrw [r6 + r1], m2, 1
332 lea r6, [r6 + r1 * 2]
334 pextrw [r6 + r1], m2, 3
335 lea r6, [r6 + r1 * 2]
337 pextrw [r6 + r1], m2, 5
338 lea r6, [r6 + r1 * 2]
340 pextrw [r6 + r1], m2, 7
342 lea r6, [r6 + r1 * 2]
348 pextrw [r6 + r1], m3, 1
349 lea r6, [r6 + r1 * 2]
351 pextrw [r6 + r1], m3, 3
352 lea r6, [r6 + r1 * 2]
354 pextrw [r6 + r1], m3, 5
355 lea r6, [r6 + r1 * 2]
362 ;-------------------------------------------------------------------------------------------
363 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
364 ;-------------------------------------------------------------------------------------------
; DC prediction entry point for the 32-sample block size (per its name).
; cglobal: 4 args, 5 GPRs, 6 vector registers. The neighbour loads and the
; broadcast of the result are not visible in this view.
; Stores go through r0 with r1 as the row step, i.e. r0 is the destination.
366 cglobal intra_pred_dc32, 4, 5, 6
391 paddd m0, [pd_32] ; sum = sum + 32
392 psrld m0, 6 ; sum = sum / 64
; Fill: every row receives four stores of m0 at byte offsets 0/16/32/48.
; Rows shown are r0 + r1, r0 + r1*2 and r0 + r2.
; NOTE(review): r2 is presumably 3 * r1, set up in a line not shown -- confirm.
404 movu [r0 + r1 + 0], m0
405 movu [r0 + r1 + 16], m0
406 movu [r0 + r1 + 32], m0
407 movu [r0 + r1 + 48], m0
408 movu [r0 + r1 * 2 + 0], m0
409 movu [r0 + r1 * 2 + 16], m0
410 movu [r0 + r1 * 2 + 32], m0
411 movu [r0 + r1 * 2 + 48], m0
412 movu [r0 + r2 + 0], m0
413 movu [r0 + r2 + 16], m0
414 movu [r0 + r2 + 32], m0
415 movu [r0 + r2 + 48], m0
416 lea r0, [r0 + r1 * 4]
; Same three-row pattern again after moving r0 forward by r1*4.
421 movu [r0 + r1 + 0], m0
422 movu [r0 + r1 + 16], m0
423 movu [r0 + r1 + 32], m0
424 movu [r0 + r1 + 48], m0
425 movu [r0 + r1 * 2 + 0], m0
426 movu [r0 + r1 * 2 + 16], m0
427 movu [r0 + r1 * 2 + 32], m0
428 movu [r0 + r1 * 2 + 48], m0
429 movu [r0 + r2 + 0], m0
430 movu [r0 + r2 + 16], m0
431 movu [r0 + r2 + 32], m0
432 movu [r0 + r2 + 48], m0
433 lea r0, [r0 + r1 * 4]
438 ;-----------------------------------------------------------------------------------------------------------
439 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
440 ;-----------------------------------------------------------------------------------------------------------
; Planar prediction entry point for the 4-sample block size (per its name).
; cglobal: 4 args, 7 GPRs, 5 vector registers. r2 = left, r3 = above per the
; prototype comment above. Parts of the routine are not visible in this view.
442 cglobal intra_pred_planar4, 4,7,5
446 movh m0, [r3] ; topRow[i] = above[i];
450 movd m2, [r2 + 8] ; bottomLeft = left[4]
451 movzx r6d, word [r3 + 8] ; topRight = above[4];
455 psubw m2, m0 ; bottomRow[i] = bottomLeft - topRow[i]
457 punpcklqdq m3, m2, m1
; COMP_PRED_PLANAR_2ROW %1: %1 is a byte offset into the left column (r2).
; Each expansion reads the two 16-bit left samples at [r2 + %1] and
; [r2 + %1 + 2] and ends by advancing r0 by 2 * r1, i.e. it handles two rows.
461 %macro COMP_PRED_PLANAR_2ROW 1
462 movzx r4d, word [r2 + %1]
463 lea r4d, [r4d * 4 + 4] ; left * 4 + 4
467 movzx r4d, word [r2 + %1 + 2]
468 lea r4d, [r4d * 4 + 4] ; next left * 4 + 4
471 punpcklqdq m3, m4 ; horPred
473 movzx r4d, word [r2 + %1]
479 movzx r4d, word [r2 + %1 + 2]
484 punpcklqdq m4, m1 ; rightColumnN
486 pmullw m4, [multi_2Row]
495 lea r0, [r0 + 2 * r1]
; Two expansions: left samples 0-1 (offset 0) and 2-3 (offset 4).
498 COMP_PRED_PLANAR_2ROW 0
499 COMP_PRED_PLANAR_2ROW 4
; NOTE(review): in NASM, %undef removes single-line macros; a multi-line
; %macro is removed with %unmacro, so this line may have no effect -- confirm.
500 %undef COMP_PRED_PLANAR_2ROW
503 ;-----------------------------------------------------------------------------------------------------------
504 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
505 ;-----------------------------------------------------------------------------------------------------------
; Planar prediction entry point for the 8-sample block size (per its name).
; cglobal: 4 args, 4 GPRs, 7 vector registers. r2 = left, r3 = above per the
; prototype comment above. Parts of the routine are not visible in this view.
507 cglobal intra_pred_planar8, 4,4,7
511 movu m1, [r3] ; v_topRow
512 movu m2, [r2] ; v_leftColumn
514 movd m3, [r3 + 16] ; topRight = above[8];
515 movd m4, [r2 + 16] ; bottomLeft = left[8];
522 psubw m4, m1 ; v_bottomRow
523 psubw m3, m2 ; v_rightColumn
525 psllw m1, 3 ; v_topRow
526 psllw m2, 3 ; v_leftColumn
; PRED_PLANAR_ROW8 %1: %1 is the row index. 0x55 * k repeats the 2-bit value k
; in all four selector fields, so pshuflw replicates low word %1 (rows 0-3)
; and pshufhw replicates high word %1 - 4 (rows 4-7).
; NOTE(review): the conditional choosing between the two forms is not visible.
530 %macro PRED_PLANAR_ROW8 1
532 pshuflw m5, m6, 0x55 * %1
534 pshuflw m2, m3, 0x55 * %1
537 pshufhw m5, m6, 0x55 * (%1 - 4)
539 pshufhw m2, m3, 0x55 * (%1 - 4)
; NOTE(review): in NASM, %undef removes single-line macros; a multi-line
; %macro is removed with %unmacro, so this line may have no effect -- confirm.
563 %undef PRED_PLANAR_ROW8
567 ;-----------------------------------------------------------------------------------------------------------
568 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
569 ;-----------------------------------------------------------------------------------------------------------
; Planar prediction entry point for the 16-sample block size (per its name).
; Two implementations are selected by BIT_DEPTH: a 12-bit path that widens to
; dwords (pmovzxwd / pshufd) and a word-based path for the other case.
; Several lines, including %else / %endif directives, are not visible here.
571 %if (BIT_DEPTH == 12)
; 64-bit build: 8+3 vector registers, bottomRow0..3 live in m7..m10.
573 %if (ARCH_X86_64 == 1)
574 cglobal intra_pred_planar16, 4,7,8+3
575 %define bottomRow0 m7
576 %define bottomRow1 m8
577 %define bottomRow2 m9
578 %define bottomRow3 m10
; Alternative declaration (its %else is not visible): 8 vector registers plus
; 3 * mmsize bytes of stack; bottomRow0..2 live on the stack, bottomRow3 in m7.
580 cglobal intra_pred_planar16, 4,7,8, 0-3*mmsize
581 %define bottomRow0 [rsp + 0*mmsize]
582 %define bottomRow1 [rsp + 1*mmsize]
583 %define bottomRow2 [rsp + 2*mmsize]
584 %define bottomRow3 m7
594 movzx r4d, word [r2 + 16*2] ; 16-bit sample at index 16 of r2 (left)
596 pshufd m1, m1, 0 ; m1 = bottomLeft
; Zero-extend the 16 top samples to dwords, four per register.
613 pmovzxwd m0, [r3 + 0*8]
615 pmovzxwd m1, [r3 + 1*8]
617 pmovzxwd m2, [r3 + 2*8]
619 pmovzxwd m3, [r3 + 3*8]
624 movzx r4d, word [r2 + r6*2] ; left sample indexed by r6
625 movzx r5d, word [r3 + 16*2] ; r5 = topRight
630 pmovsxwd m5, m5 ; m5 = rightCol
632 lea r4d, [r4d * 8 + 16]
634 pshufd m4, m4, 0 ; m4 = horPred
636 pshufd m6, m5, 0xFF ; m6 = [4 4 4 4]
675 %else ; BIT_DEPTH == 10
; Word-based path: 4 args, 6 GPRs, 7 vector registers.
677 cglobal intra_pred_planar16, 4,6,7
682 movu m1, [r3] ; topRow[0-7]
683 movu m2, [r3 + 16] ; topRow[8-15]
688 movzx r4d, word [r3 + 32] ; topRight = above[16]
690 psubw m4, m3, m1 ; v_bottomRow[0]
691 psubw m3, m2 ; v_bottomRow[1]
; PRED_PLANAR_ROW16 %1: %1 is the row index; the left sample for the row is
; read from [r2 + %1 * 2].
696 %macro PRED_PLANAR_ROW16 1
697 movzx r5d, word [r2 + %1 * 2]
699 lea r5d, [r5d * 8 + 16]
702 pshufd m5, m5, 0 ; horPred
704 movzx r5d, word [r2 + %1 * 2]
711 pmullw m6, m0, [multiL]
717 pmullw m0, m0, [multiH]
; NOTE(review): in NASM, %undef removes single-line macros; a multi-line
; %macro is removed with %unmacro, so this line may have no effect -- confirm.
744 %undef PRED_PLANAR_ROW16
748 ;-----------------------------------------------------------------------------------------------------------
749 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
750 ;-----------------------------------------------------------------------------------------------------------
; Planar prediction entry point for the 32-sample block size (per its name).
; Works on dwords (pmovzxwd / pshufd). Several lines, including the %else /
; %endif directives, are not visible in this view.
; 64-bit build: 8+8 vector registers and 4 * mmsize bytes of stack;
; bottomRow0..7 live in m8..m15 and tmp0..3 on the stack.
752 %if (ARCH_X86_64 == 1)
753 cglobal intra_pred_planar32, 4,7,8+8, 0-4*mmsize
754 %define bottomRow0 m8
755 %define bottomRow1 m9
756 %define bottomRow2 m10
757 %define bottomRow3 m11
758 %define bottomRow4 m12
759 %define bottomRow5 m13
760 %define bottomRow6 m14
761 %define bottomRow7 m15
762 %define tmp0 [rsp + 0*mmsize]
763 %define tmp1 [rsp + 1*mmsize]
764 %define tmp2 [rsp + 2*mmsize]
765 %define tmp3 [rsp + 3*mmsize]
; Alternative declaration (its %else is not visible): 8 vector registers and
; 12 * mmsize bytes of stack holding bottomRow0..7 followed by tmp0..3.
767 cglobal intra_pred_planar32, 4,7,8, 0-12*mmsize
768 %define bottomRow0 [rsp + 0*mmsize]
769 %define bottomRow1 [rsp + 1*mmsize]
770 %define bottomRow2 [rsp + 2*mmsize]
771 %define bottomRow3 [rsp + 3*mmsize]
772 %define bottomRow4 [rsp + 4*mmsize]
773 %define bottomRow5 [rsp + 5*mmsize]
774 %define bottomRow6 [rsp + 6*mmsize]
775 %define bottomRow7 [rsp + 7*mmsize]
776 %define tmp0 [rsp + 8*mmsize]
777 %define tmp1 [rsp + 9*mmsize]
778 %define tmp2 [rsp + 10*mmsize]
779 %define tmp3 [rsp + 11*mmsize]
789 movzx r4d, word [r2 + 32*2] ; 16-bit sample at index 32 of r2 (left)
791 pshufd m1, m1, 0 ; m1 = bottomLeft
; Zero-extend the 32 top samples to dwords, four at a time; m4 is reused for
; groups 4..7 (the instructions consuming it in between are not shown).
823 pmovzxwd m0, [r3 + 0*8]
825 pmovzxwd m1, [r3 + 1*8]
827 pmovzxwd m2, [r3 + 2*8]
829 pmovzxwd m3, [r3 + 3*8]
832 pmovzxwd m4, [r3 + 4*8]
835 pmovzxwd m4, [r3 + 5*8]
838 pmovzxwd m4, [r3 + 6*8]
841 pmovzxwd m4, [r3 + 7*8]
847 movzx r4d, word [r2 + r6*2] ; left sample indexed by r6
848 movzx r5d, word [r3 + 32*2] ; r5 = topRight
853 pmovsxwd m5, m5 ; m5 = rightCol
857 pshufd m4, m4, 0 ; m4 = horPred
859 pshufd m6, m5, 0xFF ; m6 = [4 4 4 4]
939 ;-----------------------------------------------------------------------------
940 ; void intraPredAng(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
941 ;-----------------------------------------------------------------------------
; Angular prediction entry point, 4-sample block, variant "2" (per its name).
; cglobal: 3 args, 3 GPRs, 4 vector registers. Only one store of this routine
; is visible in this view.
943 cglobal intra_pred_ang4_2, 3,3,4
952 movh [r0 + r1 * 2], m2 ; 8-byte store of the low half of m2 at r0 + r1*2
; Angular prediction entry point, 4-sample block, variant "3" (per its name).
; cglobal: 3 args, 4 GPRs, 8 vector registers.
; Builds interleaved neighbour pairs in m2..m5 from the reference at r2 + 2 and
; loads ang_table rows relative to r3 = ang_table + 20 * 16; the bracketed
; number on each mova is the resulting row index.
; Other intra_pred_ang4_N routines in this file tail-jump to this routine's
; .do_filter4x4 label (the label line itself is not visible in this view).
959 cglobal intra_pred_ang4_3, 3,4,8
962 lea r3, [ang_table + 20 * 16]
963 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
964 palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
965 punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
966 palignr m5, m0, 4 ; [x x 8 7 6 5 4 3]
967 punpcklwd m3, m1, m5 ; [6 5 5 4 4 3 3 2]
968 palignr m1, m0, 6 ; [x x x 8 7 6 5 4]
969 punpcklwd m4, m5 ,m1 ; [7 6 6 5 5 4 4 3]
970 movhlps m0, m0 ; [x x x x 8 7 6 5]
971 punpcklwd m5, m1, m0 ; [8 7 7 6 6 5 5 4]
973 mova m0, [r3 + 6 * 16] ; [26]
975 mova m6, [r3 - 6 * 16] ; [14]
976 mova m7, [r3 - 12 * 16] ; [ 8]
; Interleave results and store 8-byte halves to the destination rows.
1002 punpckhwd m0, m2, m4
1004 punpckhwd m4, m2, m0
1010 movhps [r0 + r1], m2
1011 movh [r0 + r1 * 2], m4
1013 movhps [r0 + r1], m4
; Angular prediction entry point, 4-sample block, variant "4" (per its name).
; Prepares interleaved neighbour pairs and four ang_table rows in m0, m1, m6,
; m7 (indices relative to r3 = ang_table + 18 * 16), then tail-jumps into
; intra_pred_ang4_3's .do_filter4x4.
1016 cglobal intra_pred_ang4_4, 3,4,8
1019 lea r3, [ang_table + 18 * 16]
1020 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
1021 palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
1022 punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
1023 palignr m6, m0, 4 ; [x x 8 7 6 5 4 3]
1024 punpcklwd m3, m1, m6 ; [6 5 5 4 4 3 3 2]
1026 palignr m7, m0, 6 ; [x x x 8 7 6 5 4]
1027 punpcklwd m5, m6, m7 ; [7 6 6 5 5 4 4 3]
1029 mova m0, [r3 + 3 * 16] ; [21]
1030 mova m1, [r3 - 8 * 16] ; [10]
1031 mova m6, [r3 + 13 * 16] ; [31]
1032 mova m7, [r3 + 2 * 16] ; [20]
1033 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; Angular prediction entry point, 4-sample block, variant "5" (per its name).
; Prepares interleaved neighbour pairs and four ang_table rows in m0, m1, m6,
; m7 (indices relative to r3 = ang_table + 10 * 16), then tail-jumps into
; intra_pred_ang4_3's .do_filter4x4.
1035 cglobal intra_pred_ang4_5, 3,4,8
1038 lea r3, [ang_table + 10 * 16]
1039 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
1040 palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
1041 punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
1042 palignr m6, m0, 4 ; [x x 8 7 6 5 4 3]
1043 punpcklwd m3, m1, m6 ; [6 5 5 4 4 3 3 2]
1045 palignr m7, m0, 6 ; [x x x 8 7 6 5 4]
1046 punpcklwd m5, m6, m7 ; [7 6 6 5 5 4 4 3]
1048 mova m0, [r3 + 7 * 16] ; [17]
1049 mova m1, [r3 - 8 * 16] ; [ 2]
1050 mova m6, [r3 + 9 * 16] ; [19]
1051 mova m7, [r3 - 6 * 16] ; [ 4]
1052 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; Angular prediction entry point, 4-sample block, variant "6" (per its name).
; Prepares interleaved neighbour pairs and four ang_table rows in m0, m1, m6,
; m7 (indices relative to r3 = ang_table + 19 * 16), then tail-jumps into
; intra_pred_ang4_3's .do_filter4x4.
1054 cglobal intra_pred_ang4_6, 3,4,8
1057 lea r3, [ang_table + 19 * 16]
1058 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
1059 palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
1060 punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
1062 palignr m6, m0, 4 ; [x x 8 7 6 5 4 3]
1063 punpcklwd m4, m1, m6 ; [6 5 5 4 4 3 3 2]
1066 mova m0, [r3 - 6 * 16] ; [13]
1067 mova m1, [r3 + 7 * 16] ; [26]
1068 mova m6, [r3 - 12 * 16] ; [ 7]
1069 mova m7, [r3 + 1 * 16] ; [20]
1070 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; Angular prediction entry point, 4-sample block, variant "7" (per its name).
; Prepares interleaved neighbour pairs and four ang_table rows in m0, m1, m6,
; m7 (indices relative to r3 = ang_table + 20 * 16), then tail-jumps into
; intra_pred_ang4_3's .do_filter4x4.
1072 cglobal intra_pred_ang4_7, 3,4,8
1075 lea r3, [ang_table + 20 * 16]
1076 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
1077 palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
1078 punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
1081 palignr m6, m0, 4 ; [x x 8 7 6 5 4 3]
1082 punpcklwd m5, m1, m6 ; [6 5 5 4 4 3 3 2]
1084 mova m0, [r3 - 11 * 16] ; [ 9]
1085 mova m1, [r3 - 2 * 16] ; [18]
1086 mova m6, [r3 + 7 * 16] ; [27]
1087 mova m7, [r3 - 16 * 16] ; [ 4]
1088 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; Angular prediction entry point, 4-sample block, variant "8" (per its name).
; Prepares an interleaved neighbour pair register and four ang_table rows in
; m0, m1, m6, m7 (indices relative to r3 = ang_table + 13 * 16), then
; tail-jumps into intra_pred_ang4_3's .do_filter4x4.
1090 cglobal intra_pred_ang4_8, 3,4,8
1093 lea r3, [ang_table + 13 * 16]
1094 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
1095 palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
1096 punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
1101 mova m0, [r3 - 8 * 16] ; [ 5]
1102 mova m1, [r3 - 3 * 16] ; [10]
1103 mova m6, [r3 + 2 * 16] ; [15]
1104 mova m7, [r3 + 7 * 16] ; [20]
1105 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; Angular prediction entry point, 4-sample block, variant "9" (per its name).
; Prepares an interleaved neighbour pair register and four ang_table rows in
; m0, m1, m6, m7 (indices relative to r3 = ang_table + 4 * 16), then
; tail-jumps into intra_pred_ang4_3's .do_filter4x4.
1108 cglobal intra_pred_ang4_9, 3,4,8
1111 lea r3, [ang_table + 4 * 16]
1112 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
1113 palignr m1, m0, 2 ; [x 8 7 6 5 4 3 2]
1114 punpcklwd m2, m0, m1 ; [5 4 4 3 3 2 2 1]
1119 mova m0, [r3 - 2 * 16] ; [ 2]
1120 mova m1, [r3 - 0 * 16] ; [ 4]
1121 mova m6, [r3 + 2 * 16] ; [ 6]
1122 mova m7, [r3 + 4 * 16] ; [ 8]
1123 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; Angular prediction entry point, 4-sample block, variant "10" (per its name).
; cglobal: 3 args, 3 GPRs, 4 vector registers.
; Each of the four reference samples at r2 + 2 is replicated across four word
; lanes. The branch that separates the plain store path from the filtered path
; further down is not visible in this view.
1125 cglobal intra_pred_ang4_10, 3,3,4
1126 movh m0, [r2 + 2] ; [4 3 2 1]
1127 pshufb m2, m0, [pb_unpackwq2] ; [4 4 4 4 3 3 3 3]
1128 pshufb m0, [pb_unpackwq1] ; [2 2 2 2 1 1 1 1]
1130 movhlps m1, m0 ; [2 2 2 2]
1131 movhlps m3, m2 ; [4 4 4 4]
1133 movh [r0 + r1 * 2], m2
; Filtered path: reload the reference including the sample at index 0.
1142 movu m1, [r2] ; [7 6 5 4 3 2 1 0]
1143 pshufb m2, m1, [pb_unpackwq1] ; [0 0 0 0]
1144 palignr m1, m1, 2 ; [4 3 2 1]
1150 pminsw m0, [pw_1023] ; clamp to the constant's value (1023, going by its name)
; Angular prediction entry point, 4-sample block, variant "26" (per its name).
; cglobal: 4 args, 4 GPRs, 3 vector registers. Reads the reference through r3
; (refAbove per the prototype comment). Several lines, including the branch to
; the filtered path, are not visible in this view.
1156 cglobal intra_pred_ang4_26, 4,4,3
1157 movh m0, [r3 + 2] ; [4 3 2 1] (movh loads four words)
1162 movh [r0 + r1 * 2], m0
1170 pshufb m0, [pb_unpackwq1] ; [2 2 2 2 1 1 1 1]
1171 movu m1, [r2] ; [7 6 5 4 3 2 1 0]
1172 pshufb m2, m1, [pb_unpackwq1] ; [0 0 0 0]
1173 palignr m1, m1, 2 ; [4 3 2 1]
1179 pminsw m0, [pw_1023] ; clamp to the constant's value (1023, going by its name)
; Write words 1..3 of m0 as single 16-bit stores, one per row.
; NOTE(review): r3 is used as a row offset here, presumably 3 * r1 set up in a
; line not shown -- confirm.
1182 pextrw [r0 + r1], m0, 1
1183 pextrw [r0 + r1 * 2], m0, 2
1184 pextrw [r0 + r3], m0, 3
; Angular prediction entry point, 4-sample block, variant "11" (per its name).
; Prepares an interleaved neighbour pair register and four ang_table rows in
; m0, m1, m6, m7, then tail-jumps into intra_pred_ang4_3's .do_filter4x4.
; With r3 = ang_table + 24 * 16 the offsets +6/+4/+2/+0 select rows
; 30/28/26/24, which is what the bracketed numbers below state.
1189 cglobal intra_pred_ang4_11, 3,4,8
1192 lea r3, [ang_table + 24 * 16]
1193 movu m2, [r2] ; [x x x 4 3 2 1 0]
1194 palignr m1, m2, 2 ; [x x x x 4 3 2 1]
1195 punpcklwd m2, m1 ; [4 3 3 2 2 1 1 0]
1200 mova m0, [r3 + 6 * 16] ; [30]
1201 mova m1, [r3 + 4 * 16] ; [28]
1202 mova m6, [r3 + 2 * 16] ; [26]
1203 mova m7, [r3 + 0 * 16] ; [24]
1204 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; Angular prediction entry point, 4-sample block, variant "12" (per its name).
; Prepares an interleaved neighbour pair register and four ang_table rows in
; m0, m1, m6, m7 (indices relative to r3 = ang_table + 20 * 16), then
; tail-jumps into intra_pred_ang4_3's .do_filter4x4.
1207 cglobal intra_pred_ang4_12, 3,4,8
1210 lea r3, [ang_table + 20 * 16]
1211 movu m2, [r2] ; [x x x 4 3 2 1 0]
1212 palignr m1, m2, 2 ; [x x x x 4 3 2 1]
1213 punpcklwd m2, m1 ; [4 3 3 2 2 1 1 0]
1218 mova m0, [r3 + 7 * 16] ; [27]
1219 mova m1, [r3 + 2 * 16] ; [22]
1220 mova m6, [r3 - 3 * 16] ; [17]
1221 mova m7, [r3 - 8 * 16] ; [12]
1222 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; Angular prediction entry point, 4-sample block, variant "13" (per its name).
; cglobal: 4 args. r3 first supplies one extra 16-bit sample (refAbove per the
; prototype comment), then is reused as the ang_table base.
; Ends with a tail-jump into intra_pred_ang4_3's .do_filter4x4.
1225 cglobal intra_pred_ang4_13, 4,4,8
1230 movu m5, [r2 - 2] ; [x x 4 3 2 1 0 x]
1231 palignr m2, m5, 2 ; [x x x 4 3 2 1 0]
1232 palignr m0, m5, 4 ; [x x x x 4 3 2 1]
1233 pinsrw m5, [r3 + 8], 0 ; lane 0 <- word at byte offset 8 of r3
1234 punpcklwd m5, m2 ; [3 2 2 1 1 0 0 x]
1235 punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0]
; r3 is repurposed from here on: coefficient rows relative to ang_table row 21.
1239 lea r3, [ang_table + 21 * 16]
1240 mova m0, [r3 + 2 * 16] ; [23]
1241 mova m1, [r3 - 7 * 16] ; [14]
1242 mova m6, [r3 - 16 * 16] ; [ 5]
1243 mova m7, [r3 + 7 * 16] ; [28]
1244 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; Angular prediction entry point, 4-sample block, variant "14" (per its name).
; cglobal: 4 args. r3 first supplies one extra 16-bit sample (refAbove per the
; prototype comment), then is reused as the ang_table base.
; Ends with a tail-jump into intra_pred_ang4_3's .do_filter4x4.
1246 cglobal intra_pred_ang4_14, 4,4,8
1251 movu m5, [r2 - 2] ; [x x 4 3 2 1 0 x]
1252 palignr m2, m5, 2 ; [x x x 4 3 2 1 0]
1253 palignr m0, m5, 4 ; [x x x x 4 3 2 1]
1254 pinsrw m5, [r3 + 4], 0 ; lane 0 <- word at byte offset 4 of r3
1255 punpcklwd m5, m2 ; [3 2 2 1 1 0 0 x]
1256 punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0]
; r3 is repurposed from here on: coefficient rows relative to ang_table row 19.
1260 lea r3, [ang_table + 19 * 16]
1261 mova m0, [r3 + 0 * 16] ; [19]
1262 mova m1, [r3 - 13 * 16] ; [ 6]
1263 mova m6, [r3 + 6 * 16] ; [25]
1264 mova m7, [r3 - 7 * 16] ; [12]
1265 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; Angular prediction entry point, 4-sample block, variant "15" (per its name).
; cglobal: 4 args. r3 first supplies two extra 16-bit samples (refAbove per the
; prototype comment), then is reused as the ang_table base.
; Ends with a tail-jump into intra_pred_ang4_3's .do_filter4x4.
1268 cglobal intra_pred_ang4_15, 4,4,8
1273 movu m3, [r2 - 2] ; [x x 4 3 2 1 0 x]
1274 palignr m2, m3, 2 ; [x x x 4 3 2 1 0]
1275 palignr m0, m3, 4 ; [x x x x 4 3 2 1]
1276 pinsrw m3, [r3 + 4], 0 ; lane 0 <- word at byte offset 4 of r3 ("x")
1277 pslldq m5, m3, 2 ; [x 4 3 2 1 0 x y]
1278 pinsrw m5, [r3 + 8], 0 ; lane 0 <- word at byte offset 8 of r3 ("y")
1279 punpcklwd m5, m3 ; [2 1 1 0 0 x x y]
1280 punpcklwd m3, m2 ; [3 2 2 1 1 0 0 x]
1281 punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0]
; r3 is repurposed from here on: coefficient rows relative to ang_table row 23.
1284 lea r3, [ang_table + 23 * 16]
1285 mova m0, [r3 - 8 * 16] ; [15]
1286 mova m1, [r3 + 7 * 16] ; [30]
1287 mova m6, [r3 - 10 * 16] ; [13]
1288 mova m7, [r3 + 5 * 16] ; [28]
1289 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; Angular prediction entry point, 4-sample block, variant "16" (per its name).
; cglobal: 4 args. r3 first supplies two extra 16-bit samples (refAbove per the
; prototype comment), then is reused as the ang_table base.
; Ends with a tail-jump into intra_pred_ang4_3's .do_filter4x4.
1292 cglobal intra_pred_ang4_16, 4,4,8
1297 movu m3, [r2 - 2] ; [x x 4 3 2 1 0 x]
1298 palignr m2, m3, 2 ; [x x x 4 3 2 1 0]
1299 palignr m0, m3, 4 ; [x x x x 4 3 2 1]
1300 pinsrw m3, [r3 + 4], 0 ; lane 0 <- word at byte offset 4 of r3 ("x")
1301 pslldq m5, m3, 2 ; [x 4 3 2 1 0 x y]
1302 pinsrw m5, [r3 + 6], 0 ; lane 0 <- word at byte offset 6 of r3 ("y")
1303 punpcklwd m5, m3 ; [2 1 1 0 0 x x y]
1304 punpcklwd m3, m2 ; [3 2 2 1 1 0 0 x]
1305 punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0]
; r3 is repurposed from here on: coefficient rows relative to ang_table row 19.
1308 lea r3, [ang_table + 19 * 16]
1309 mova m0, [r3 - 8 * 16] ; [11]
1310 mova m1, [r3 + 3 * 16] ; [22]
1311 mova m6, [r3 - 18 * 16] ; [ 1]
1312 mova m7, [r3 - 7 * 16] ; [12]
1313 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; Angular prediction entry point, 4-sample block, variant "17" (per its name).
; cglobal: 4 args. r3 first supplies three extra 16-bit samples (refAbove per
; the prototype comment), then is reused as the ang_table base.
; Ends with a tail-jump into intra_pred_ang4_3's .do_filter4x4.
1315 cglobal intra_pred_ang4_17, 4,4,8
1320 movu m6, [r2 - 2] ; [- - 4 3 2 1 0 x]
1321 palignr m2, m6, 2 ; [- - - 4 3 2 1 0]
1322 palignr m1, m6, 4 ; [- - - - 4 3 2 1]
1324 punpcklwd m2, m1 ; [4 3 3 2 2 1 1 0]
1326 pinsrw m6, [r3 + 2], 0 ; lane 0 <- word at byte offset 2 of r3 ("x")
1327 punpcklwd m3, m6, m4 ; [3 2 2 1 1 0 0 x]
1329 pslldq m4, m6, 2 ; [- 4 3 2 1 0 x y]
1330 pinsrw m4, [r3 + 4], 0 ; lane 0 <- word at byte offset 4 of r3 ("y")
1331 pslldq m5, m4, 2 ; [4 3 2 1 0 x y z]
1332 pinsrw m5, [r3 + 8], 0 ; lane 0 <- word at byte offset 8 of r3 ("z")
1333 punpcklwd m5, m4 ; [1 0 0 x x y y z]
1334 punpcklwd m4, m6 ; [2 1 1 0 0 x x y]
; r3 is repurposed from here on: coefficient rows relative to ang_table row 14.
1336 lea r3, [ang_table + 14 * 16]
1337 mova m0, [r3 - 8 * 16] ; [ 6]
1338 mova m1, [r3 - 2 * 16] ; [12]
1339 mova m6, [r3 + 4 * 16] ; [18]
1340 mova m7, [r3 + 10 * 16] ; [24]
1341 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; Angular prediction entry point, 4-sample block, variant "18" (per its name).
; cglobal: 4 args, 4 GPRs, 1 vector register. Most of the routine is not
; visible in this view.
1344 cglobal intra_pred_ang4_18, 4,4,1
1346 pshufb m0, [pw_swap] ; byte shuffle of m0 by the pw_swap table (defined elsewhere)
1352 movh [r0 + r1 * 2], m0 ; 8-byte store of the low half of m0 at r0 + r1*2
1359 ;-----------------------------------------------------------------------------
1360 ; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
1361 ;-----------------------------------------------------------------------------
; Angular prediction entry point, 8-sample block, variant "2" (per its name).
; cglobal: 3 args, 4 GPRs, 3 vector registers.
; Each row is the m1:m0 pair shifted right by a further 2 bytes (one 16-bit
; sample), from 2 up to 12. Loads and some stores are not visible in this view.
1363 cglobal intra_pred_ang8_2, 3,4,3
1371 palignr m2, m1, m0, 2
1373 palignr m2, m1, m0, 4
1374 movu [r0 + r1 * 2], m2
1375 palignr m2, m1, m0, 6
1377 lea r0, [r0 + r1 * 4]
1378 palignr m2, m1, m0, 8
1380 palignr m2, m1, m0, 10
1382 palignr m2, m1, m0, 12
1383 movu [r0 + r1 * 2], m2
; Angular prediction entry point, 8-sample block, variant "3" (per its name).
; cglobal: 3 args, 5 GPRs, 8 vector registers.
; Interleaved neighbour pairs are multiplied (pmaddwd) by ang_table rows that
; are addressed relative to r3 = ang_table + 14 * 16; the bracketed number on
; the first pmaddwd of each pair is the resulting row index. Rounding, shift
; and pack steps are not visible in this view.
1389 cglobal intra_pred_ang8_3, 3,5,8
1390 lea r3, [ang_table + 14 * 16]
1393 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
1394 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
1395 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
1396 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
1398 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
1399 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
1400 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
1401 punpckhwd m1, m4 ; [x 16 16 15 15 14 14 13]
1404 pmaddwd m4, [r3 + 12 * 16] ; [26]
1408 pmaddwd m2, [r3 + 12 * 16]
1413 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
1414 pmaddwd m2, [r3 + 6 * 16] ; [20]
1417 palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
1418 pmaddwd m6, [r3 + 6 * 16]
1423 palignr m6, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
1424 pmaddwd m6, [r3] ; [14]
1427 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
1433 palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
1434 pmaddwd m7, [r3 - 6 * 16] ; [ 8]
1437 palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
1438 pmaddwd m3, [r3 - 6 * 16]
; Interleave the four results and store 8-byte pieces at byte offset 0 of the
; destination rows; r2 is reused as the pointer to the row at r0 + r1*4.
; NOTE(review): r4 is presumably 3 * r1, set up in a line not shown -- confirm.
1443 punpckhwd m3, m4, m2
1445 punpckhwd m2, m6, m7
1448 punpckldq m7, m4, m6
1450 punpckldq m6, m3, m2
1455 movhps [r0 + r1], m7
1456 movh [r0 + r1 * 2], m4
1457 movhps [r0 + r4], m4
1458 lea r2, [r0 + r1 * 4]
1460 movhps [r2 + r1], m6
1461 movh [r2 + r1 * 2], m3
1462 movhps [r2 + r4], m3
; Second group of four coefficient rows.
1465 pmaddwd m4, [r3 - 12 * 16] ; [ 2]
1469 pmaddwd m2, [r3 - 12 * 16]
1475 pmaddwd m2, [r3 + 14 * 16] ; [28]
1479 pmaddwd m6, [r3 + 14 * 16]
1484 palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
1485 pmaddwd m6, [r3 + 8 * 16] ; [22]
1488 palignr m7, m1, m5, 4 ; [14 13 13 12 12 11 11 10]
1489 pmaddwd m7, [r3 + 8 * 16]
1494 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
1495 pmaddwd m7, [r3 + 2 * 16] ; [16]
1498 palignr m1, m5, 8 ; [15 14 14 13 13 12 12 11]
1499 pmaddwd m1, [r3 + 2 * 16]
; Same interleave; stores land at byte offset 8 of the same rows.
1504 punpckhwd m3, m4, m2
1506 punpckhwd m2, m6, m7
1509 punpckldq m7, m4, m6
1511 punpckldq m6, m3, m2
1515 movhps [r0 + r1 + 8], m7
1516 movh [r0 + r1 * 2 + 8], m4
1517 movhps [r0 + r4 + 8], m4
1518 lea r0, [r0 + r1 * 4]
1520 movhps [r0 + r1 + 8], m6
1521 movh [r0 + r1 * 2 + 8], m3
1522 movhps [r0 + r4 + 8], m3
; Angular prediction entry point, 8-sample block, variant "4" (per its name).
; cglobal: 3 args, 6 GPRs, 8 vector registers.
; Interleaved neighbour pairs are multiplied (pmaddwd) by ang_table rows that
; are addressed relative to r3 = ang_table + 19 * 16; the bracketed number on
; the first pmaddwd of each pair is the resulting row index. Rounding, shift
; and pack steps are not visible in this view.
1526 cglobal intra_pred_ang8_4, 3,6,8
1527 lea r3, [ang_table + 19 * 16]
1530 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
1531 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
1532 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
1533 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
1535 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
1536 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
1537 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
1540 pmaddwd m4, [r3 + 2 * 16] ; [21]
1544 pmaddwd m2, [r3 + 2 * 16]
1549 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
1551 pmaddwd m2, [r3 - 9 * 16] ; [10]
1554 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
1556 pmaddwd m1, [r3 - 9 * 16]
1561 pmaddwd m6, [r3 + 12 * 16] ; [31]
1564 pmaddwd m7, [r3 + 12 * 16]
1569 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
1570 pmaddwd m7, [r3 + 1 * 16] ; [20]
1573 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
1574 pmaddwd m1, [r3 + 1 * 16]
; Interleave the four results and store 8-byte pieces at byte offset 0 of the
; destination rows; r5 points at the row at r0 + r1*4 (r2 is still needed as
; the reference pointer further down).
; NOTE(review): r4 is presumably 3 * r1, set up in a line not shown -- confirm.
1579 punpckhwd m1, m4, m2
1581 punpckhwd m2, m6, m7
1584 punpckldq m7, m4, m6
1586 punpckldq m6, m1, m2
1591 movhps [r0 + r1], m7
1592 movh [r0 + r1 * 2], m4
1593 movhps [r0 + r4], m4
1594 lea r5, [r0 + r1 * 4]
1596 movhps [r5 + r1], m6
1597 movh [r5 + r1 * 2], m1
1598 movhps [r5 + r4], m1
; Second group of four coefficient rows.
1600 palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
1602 pmaddwd m4, [r3 - 10 * 16] ; [ 9]
1605 palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
1607 pmaddwd m3, [r3 - 10 * 16]
1612 pmaddwd m2, [r3 + 11 * 16] ; [30]
1615 pmaddwd m6, [r3 + 11 * 16]
1621 pmaddwd m6, [r3] ; [19]
1630 movh m1, [r2 + 26] ; [16 15 14 13]
1631 palignr m7, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
1632 pmaddwd m7, [r3 - 11 * 16] ; [8]
1635 palignr m1, m5, 4 ; [14 13 13 12 12 11 11 10]
1636 pmaddwd m1, [r3 - 11 * 16]
; Same interleave; stores land at byte offset 8 of the same rows.
1641 punpckhwd m3, m4, m2
1643 punpckhwd m2, m6, m7
1646 punpckldq m7, m4, m6
1648 punpckldq m6, m3, m2
1652 movhps [r0 + r1 + 8], m7
1653 movh [r0 + r1 * 2 + 8], m4
1654 movhps [r0 + r4 + 8], m4
1655 lea r0, [r0 + r1 * 4]
1657 movhps [r0 + r1 + 8], m6
1658 movh [r0 + r1 * 2 + 8], m3
1659 movhps [r0 + r4 + 8], m3
; Angular prediction entry point, 8-sample block, variant "5" (per its name).
; cglobal: 3 args, 5 GPRs, 8 vector registers.
; Interleaved neighbour pairs are multiplied (pmaddwd) by ang_table rows that
; are addressed relative to r3 = ang_table + 13 * 16; the bracketed number on
; the first pmaddwd of each pair is the resulting row index. Rounding, shift
; and pack steps are not visible in this view.
1663 cglobal intra_pred_ang8_5, 3,5,8
1664 lea r3, [ang_table + 13 * 16]
1667 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
1668 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
1669 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
1670 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
1672 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
1673 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
1674 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
1677 pmaddwd m4, [r3 + 4 * 16] ; [17]
1681 pmaddwd m2, [r3 + 4 * 16]
1686 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
1688 pmaddwd m2, [r3 - 11 * 16] ; [2]
1691 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
1693 pmaddwd m1, [r3 - 11 * 16]
1698 pmaddwd m6, [r3 + 6 * 16] ; [19]
1701 pmaddwd m7, [r3 + 6 * 16]
1706 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
1707 pmaddwd m7, [r3 - 9 * 16] ; [4]
1710 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
1711 pmaddwd m1, [r3 - 9 * 16]
; Interleave the four results and store 8-byte pieces at byte offset 0 of the
; destination rows; r2 is reused as the pointer to the row at r0 + r1*4.
; NOTE(review): r4 is presumably 3 * r1, set up in a line not shown -- confirm.
1716 punpckhwd m1, m4, m2
1718 punpckhwd m2, m6, m7
1721 punpckldq m7, m4, m6
1723 punpckldq m6, m1, m2
1728 movhps [r0 + r1], m7
1729 movh [r0 + r1 * 2], m4
1730 movhps [r0 + r4], m4
1731 lea r2, [r0 + r1 * 4]
1733 movhps [r2 + r1], m6
1734 movh [r2 + r1 * 2], m1
1735 movhps [r2 + r4], m1
; Second group of four coefficient rows.
1737 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
1738 pmaddwd m4, [r3 + 8 * 16] ; [21]
1741 palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
1742 pmaddwd m2, [r3 + 8 * 16]
1747 palignr m2, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
1749 pmaddwd m2, [r3 - 7 * 16] ; [6]
1752 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
1754 pmaddwd m1, [r3 - 7 * 16]
1759 pmaddwd m6, [r3 + 10 * 16] ; [23]
1762 pmaddwd m7, [r3 + 10 * 16]
1768 pmaddwd m7, [r3 - 5 * 16] ; [8]
1772 pmaddwd m1, [r3 - 5 * 16]
; Same interleave; stores land at byte offset 8 of the same rows.
1777 punpckhwd m3, m4, m2
1779 punpckhwd m2, m6, m7
1782 punpckldq m7, m4, m6
1784 punpckldq m6, m3, m2
1788 movhps [r0 + r1 + 8], m7
1789 movh [r0 + r1 * 2 + 8], m4
1790 movhps [r0 + r4 + 8], m4
1791 lea r0, [r0 + r1 * 4]
1793 movhps [r0 + r1 + 8], m6
1794 movh [r0 + r1 * 2 + 8], m3
1795 movhps [r0 + r4 + 8], m3
; Angular prediction entry point, 8-sample block, variant "6" (per its name).
; cglobal: 3 args, 5 GPRs, 8 vector registers.
; Interleaved neighbour pairs are multiplied (pmaddwd) by ang_table rows that
; are addressed relative to r3 = ang_table + 14 * 16; the bracketed number on
; the first pmaddwd of each pair is the resulting row index. Rounding, shift
; and pack steps are not visible in this view.
1799 cglobal intra_pred_ang8_6, 3,5,8
1800 lea r3, [ang_table + 14 * 16]
1803 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
1804 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
1805 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
1806 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
1808 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
1809 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
1810 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
1813 pmaddwd m4, [r3 - 1 * 16] ; [13]
1817 pmaddwd m2, [r3 - 1 * 16]
1823 pmaddwd m2, [r3 + 12 * 16] ; [26]
1827 pmaddwd m1, [r3 + 12 * 16]
1832 palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
1834 pmaddwd m6, [r3 - 7 * 16] ; [7]
1837 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
1838 pmaddwd m1, [r3 - 7 * 16]
1843 pmaddwd m7, [r3 + 6 * 16] ; [20]
1846 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
1847 pmaddwd m1, [r3 + 6 * 16]
; Interleave the four results and store 8-byte pieces at byte offset 0 of the
; destination rows; r2 is reused as the pointer to the row at r0 + r1*4.
; NOTE(review): r4 is presumably 3 * r1, set up in a line not shown -- confirm.
1852 punpckhwd m1, m4, m2
1854 punpckhwd m2, m6, m7
1857 punpckldq m7, m4, m6
1859 punpckldq m6, m1, m2
1864 movhps [r0 + r1], m7
1865 movh [r0 + r1 * 2], m4
1866 movhps [r0 + r4], m4
1867 lea r2, [r0 + r1 * 4]
1869 movhps [r2 + r1], m6
1870 movh [r2 + r1 * 2], m1
1871 movhps [r2 + r4], m1
; Second group of four coefficient rows.
1873 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
1875 pmaddwd m4, [r3 - 13 * 16] ; [1]
1878 palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
1880 pmaddwd m2, [r3 - 13 * 16]
1885 pmaddwd m2, m6, [r3] ; [14]
1888 pmaddwd m1, m7, [r3]
1893 pmaddwd m6, [r3 + 13 * 16] ; [27]
1896 pmaddwd m7, [r3 + 13 * 16]
1901 palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
1902 pmaddwd m7, [r3 - 6 * 16] ; [8]
1905 palignr m5, m0, 12 ; [12 11 11 10 10 9 9 8]
1906 pmaddwd m5, [r3 - 6 * 16]
; Same interleave; stores land at byte offset 8 of the same rows.
1911 punpckhwd m3, m4, m2
1913 punpckhwd m2, m6, m7
1916 punpckldq m7, m4, m6
1918 punpckldq m6, m3, m2
1922 movhps [r0 + r1 + 8], m7
1923 movh [r0 + r1 * 2 + 8], m4
1924 movhps [r0 + r4 + 8], m4
1925 lea r0, [r0 + r1 * 4]
1927 movhps [r0 + r1 + 8], m6
1928 movh [r0 + r1 * 2 + 8], m3
1929 movhps [r0 + r4 + 8], m3
; Angular prediction entry point, 8-sample block, variant "7" (per its name).
; cglobal: 3 args, 5 GPRs, 8 vector registers.
; Interleaved neighbour pairs are multiplied (pmaddwd) by ang_table rows that
; are addressed relative to r3 = ang_table + 18 * 16; the bracketed number on
; the first pmaddwd of each pair is the resulting row index. Rounding, shift
; and pack steps are not visible in this view.
1933 cglobal intra_pred_ang8_7, 3,5,8
1934 lea r3, [ang_table + 18 * 16]
1937 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
1938 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
1939 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
1940 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
1942 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
1943 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
1944 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
1947 pmaddwd m4, [r3 - 9 * 16] ; [9]
1951 pmaddwd m2, [r3 - 9 * 16]
1957 pmaddwd m2, [r3] ; [18]
1967 pmaddwd m6, [r3 + 9 * 16] ; [27]
1971 pmaddwd m1, [r3 + 9 * 16]
1976 palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
1977 pmaddwd m7, [r3 - 14 * 16] ; [4]
1980 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
1981 pmaddwd m1, [r3 - 14 * 16]
; Interleave the four results and store 8-byte pieces at byte offset 0 of the
; destination rows; r2 is reused as the pointer to the row at r0 + r1*4.
; NOTE(review): r4 is presumably 3 * r1, set up in a line not shown -- confirm.
1986 punpckhwd m1, m4, m2
1988 punpckhwd m2, m6, m7
1991 punpckldq m7, m4, m6
1993 punpckldq m6, m1, m2
1998 movhps [r0 + r1], m7
1999 movh [r0 + r1 * 2], m4
2000 movhps [r0 + r4], m4
2001 lea r2, [r0 + r1 * 4]
2003 movhps [r2 + r1], m6
2004 movh [r2 + r1 * 2], m1
2005 movhps [r2 + r4], m1
; Second group of four coefficient rows.
2007 palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
2009 pmaddwd m4, [r3 - 5 * 16] ; [13]
2012 palignr m2, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
2014 pmaddwd m2, [r3 - 5 * 16]
2019 pmaddwd m2, m6, [r3 + 4 * 16] ; [22]
2022 pmaddwd m1, m7, [r3 + 4 * 16]
2027 pmaddwd m6, [r3 + 13 * 16] ; [31]
2030 pmaddwd m7, [r3 + 13 * 16]
2035 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
2036 pmaddwd m7, [r3 - 10 * 16] ; [8]
2039 palignr m5, m0, 8 ; [11 10 10 9 9 8 8 7]
2040 pmaddwd m5, [r3 - 10 * 16]
; Same interleave; stores land at byte offset 8 of the same rows.
2045 punpckhwd m3, m4, m2
2047 punpckhwd m2, m6, m7
2050 punpckldq m7, m4, m6
2052 punpckldq m6, m3, m2
2056 movhps [r0 + r1 + 8], m7
2057 movh [r0 + r1 * 2 + 8], m4
2058 movhps [r0 + r4 + 8], m4
2059 lea r0, [r0 + r1 * 4]
2061 movhps [r0 + r1 + 8], m6
2062 movh [r0 + r1 * 2 + 8], m3
2063 movhps [r0 + r4 + 8], m3
; Angular prediction entry point, 8-sample block, variant "8" (per its name).
; cglobal: 3 args, 6 GPRs, 7 vector registers.
; Interleaved neighbour pairs are multiplied (pmaddwd) by ang_table rows that
; are addressed relative to r3 = ang_table + 17 * 16; the bracketed number on
; the first pmaddwd of each pair is the resulting row index. Rounding, shift
; and pack steps are not visible in this view.
2067 cglobal intra_pred_ang8_8, 3,6,7
2068 lea r3, [ang_table + 17 * 16]
2071 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
2072 movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2]
2074 punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1]
2075 punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5]
2078 pmaddwd m4, [r3 - 12 * 16] ; [5]
2082 pmaddwd m2, [r3 - 12 * 16]
2088 pmaddwd m2, [r3 - 7 * 16] ; [10]
2092 pmaddwd m1, [r3 - 7 * 16]
2098 pmaddwd m6, [r3 - 2 * 16] ; [15]
2102 pmaddwd m1, [r3 - 2 * 16]
2108 pmaddwd m5, [r3 + 3 * 16] ; [20]
2112 pmaddwd m1, [r3 + 3 * 16]
; Interleave the four results and store 8-byte pieces at byte offset 0 of the
; destination rows; r5 points at the row at r0 + r1*4 (r2 is still needed as
; the reference pointer further down).
; NOTE(review): r4 is presumably 3 * r1, set up in a line not shown -- confirm.
2117 punpckhwd m1, m4, m2
2119 punpckhwd m2, m6, m5
2122 punpckldq m5, m4, m6
2124 punpckldq m6, m1, m2
2129 movhps [r0 + r1], m5
2130 movh [r0 + r1 * 2], m4
2131 movhps [r0 + r4], m4
2132 lea r5, [r0 + r1 * 4]
2134 movhps [r5 + r1], m6
2135 movh [r5 + r1 * 2], m1
2136 movhps [r5 + r4], m1
; Second group of four coefficient rows.
2139 pmaddwd m4, [r3 + 8 * 16] ; [25]
2143 pmaddwd m2, [r3 + 8 * 16]
2149 pmaddwd m2, [r3 + 13 * 16] ; [30]
2153 pmaddwd m1, [r3 + 13 * 16]
2158 movh m1, [r2 + 18] ; [12 11 10 9]
2160 palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
2162 pmaddwd m6, [r3 - 14 * 16] ; [3]
2165 palignr m1, m0, 4 ; [10 9 9 8 8 7 7 6]
2167 pmaddwd m1, [r3 - 14 * 16]
2172 pmaddwd m5, [r3 - 9 * 16] ; [8]
2175 pmaddwd m3, [r3 - 9 * 16]
; Same interleave; stores land at byte offset 8 of the same rows.
2180 punpckhwd m3, m4, m2
2182 punpckhwd m2, m6, m5
2185 punpckldq m5, m4, m6
2187 punpckldq m6, m3, m2
2191 movhps [r0 + r1 + 8], m5
2192 movh [r0 + r1 * 2 + 8], m4
2193 movhps [r0 + r4 + 8], m4
2194 lea r0, [r0 + r1 * 4]
2196 movhps [r0 + r1 + 8], m6
2197 movh [r0 + r1 * 2 + 8], m3
2198 movhps [r0 + r4 + 8], m3
; intra_pred_ang8_9: 8x8 angular intra prediction, mode 9 (per the symbol name).
; NOTE(review): original line numbers are not contiguous; set-up of r1/r4 and
; the instructions between the pmaddwd pairs are not visible in this excerpt.
; r3 is anchored at row 9 of ang_table; the bracketed number after the first
; pmaddwd of each pair is the absolute row index (9 + k for [r3 + k * 16]).
; The visible rows are the even ones from 2 to 16.
2202 cglobal intra_pred_ang8_9, 3,5,7
2203 lea r3, [ang_table + 9 * 16]
; Interleave adjacent 16-bit samples so each dword lane is a sample pair.
2206 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
2207 movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2]
2209 punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1]
2210 punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5]
2213 pmaddwd m4, [r3 - 7 * 16] ; [2]
2217 pmaddwd m2, [r3 - 7 * 16]
2223 pmaddwd m2, [r3 - 5 * 16] ; [4]
2227 pmaddwd m1, [r3 - 5 * 16]
2233 pmaddwd m6, [r3 - 3 * 16] ; [6]
2237 pmaddwd m1, [r3 - 3 * 16]
2243 pmaddwd m5, [r3 - 1 * 16] ; [8]
2247 pmaddwd m1, [r3 - 1 * 16]
2252 punpckhwd m1, m4, m2
2254 punpckhwd m2, m6, m5
2257 punpckldq m5, m4, m6
2259 punpckldq m6, m1, m2
; 8-byte stores at byte offset 0; r2 is reused here as the address of the rows
; four strides below r0. r4 presumably holds 3 * stride - TODO confirm.
2264 movhps [r0 + r1], m5
2265 movh [r0 + r1 * 2], m4
2266 movhps [r0 + r4], m4
2267 lea r2, [r0 + r1 * 4]
2269 movhps [r2 + r1], m6
2270 movh [r2 + r1 * 2], m1
2271 movhps [r2 + r4], m1
2274 pmaddwd m4, [r3 + 1 * 16] ; [10]
2278 pmaddwd m2, [r3 + 1 * 16]
2284 pmaddwd m2, [r3 + 3 * 16] ; [12]
2288 pmaddwd m1, [r3 + 3 * 16]
2294 pmaddwd m6, [r3 + 5 * 16] ; [14]
2298 pmaddwd m5, [r3 + 5 * 16]
2303 pmaddwd m3, [r3 + 7 * 16] ; [16]
2306 pmaddwd m0, [r3 + 7 * 16]
2311 punpckhwd m5, m4, m2
2313 punpckhwd m2, m6, m3
2316 punpckldq m3, m4, m6
2318 punpckldq m6, m5, m2
; Second group of 8-byte stores, at byte offset 8 of each row.
2322 movhps [r0 + r1 + 8], m3
2323 movh [r0 + r1 * 2 + 8], m4
2324 movhps [r0 + r4 + 8], m4
2325 lea r0, [r0 + r1 * 4]
2327 movhps [r0 + r1 + 8], m6
2328 movh [r0 + r1 * 2 + 8], m5
2329 movhps [r0 + r4 + 8], m5
; intra_pred_ang8_10: 8x8 intra prediction, mode 10 (per the symbol name).
; Each output row is one reference sample replicated across all eight lanes:
; pw_unpackwdq (times 8 db 0,1) broadcasts word 0 of the source register.
; NOTE(review): original line numbers are not contiguous; the instructions that
; advance m1 to the next sample between broadcasts, most row stores, and the
; condition guarding the final r3-based adjustment are not visible here.
2333 cglobal intra_pred_ang8_10, 4,5,3
2334 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
2335 pshufb m0, m1, [pw_unpackwdq] ; [1 1 1 1 1 1 1 1]
2340 pshufb m2, m1, [pw_unpackwdq] ; [2 2 2 2 2 2 2 2]
2343 pshufb m2, m1, [pw_unpackwdq] ; [3 3 3 3 3 3 3 3]
2344 movu [r0 + r1 * 2], m2
2346 pshufb m2, m1, [pw_unpackwdq] ; [4 4 4 4 4 4 4 4]
; r2 is reused as the address of the rows four strides below r0.
2349 lea r2, [r0 + r1 *4]
2351 pshufb m2, m1, [pw_unpackwdq] ; [5 5 5 5 5 5 5 5]
2354 pshufb m2, m1, [pw_unpackwdq] ; [6 6 6 6 6 6 6 6]
2357 pshufb m2, m1, [pw_unpackwdq] ; [7 7 7 7 7 7 7 7]
2358 movu [r2 + r1 * 2], m2
2360 pshufb m2, m1, [pw_unpackwdq] ; [8 8 8 8 8 8 8 8]
; Adjustment of m0 using the second reference array (r3): sample 0 is broadcast
; into m2 and samples 1..8 are loaded into m1; the arithmetic combining them
; with m0 is not visible here.
2368 movh m1, [r3] ; [3 2 1 0]
2369 pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0]
2370 movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
; Upper clamp against pw_1023 (constant defined outside this excerpt;
; presumably the 10-bit sample maximum - TODO confirm).
2376 pminsw m0, [pw_1023]
; intra_pred_ang8_11: 8x8 angular intra prediction, mode 11 (per the symbol name).
; NOTE(review): original line numbers are not contiguous; set-up of r1/r4 and
; the instructions between the pmaddwd pairs are not visible in this excerpt.
; r3 is anchored at row 23 of ang_table; the bracketed number after the first
; pmaddwd of each pair is the absolute row index (23 + k for [r3 + k * 16]).
; The visible rows step down by 2 from 30 to 16.
2382 cglobal intra_pred_ang8_11, 3,5,7
2383 lea r3, [ang_table + 23 * 16]
; Unlike modes 8/9, the sample run starts at index 0 (offset 0 from r2).
2386 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
2387 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
2389 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
2390 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
2393 pmaddwd m4, [r3 + 7 * 16] ; [30]
2397 pmaddwd m2, [r3 + 7 * 16]
2403 pmaddwd m2, [r3 + 5 * 16] ; [28]
2407 pmaddwd m1, [r3 + 5 * 16]
2413 pmaddwd m6, [r3 + 3 * 16] ; [26]
2417 pmaddwd m1, [r3 + 3 * 16]
2423 pmaddwd m5, [r3 + 1 * 16] ; [24]
2427 pmaddwd m1, [r3 + 1 * 16]
2432 punpckhwd m1, m4, m2
2434 punpckhwd m2, m6, m5
2437 punpckldq m5, m4, m6
2439 punpckldq m6, m1, m2
; 8-byte stores at byte offset 0; r2 is reused as the address of the rows four
; strides below r0. r4 presumably holds 3 * stride - TODO confirm.
2444 movhps [r0 + r1], m5
2445 movh [r0 + r1 * 2], m4
2446 movhps [r0 + r4], m4
2447 lea r2, [r0 + r1 * 4]
2449 movhps [r2 + r1], m6
2450 movh [r2 + r1 * 2], m1
2451 movhps [r2 + r4], m1
2454 pmaddwd m4, [r3 - 1 * 16] ; [22]
2458 pmaddwd m2, [r3 - 1 * 16]
2464 pmaddwd m2, [r3 - 3 * 16] ; [20]
2468 pmaddwd m1, [r3 - 3 * 16]
2474 pmaddwd m6, [r3 - 5 * 16] ; [18]
2478 pmaddwd m5, [r3 - 5 * 16]
2483 pmaddwd m3, [r3 - 7 * 16] ; [16]
2486 pmaddwd m0, [r3 - 7 * 16]
2491 punpckhwd m5, m4, m2
2493 punpckhwd m2, m6, m3
2496 punpckldq m3, m4, m6
2498 punpckldq m6, m5, m2
; Second group of 8-byte stores, at byte offset 8 of each row.
2502 movhps [r0 + r1 + 8], m3
2503 movh [r0 + r1 * 2 + 8], m4
2504 movhps [r0 + r4 + 8], m4
2505 lea r0, [r0 + r1 * 4]
2507 movhps [r0 + r1 + 8], m6
2508 movh [r0 + r1 * 2 + 8], m5
2509 movhps [r0 + r4 + 8], m5
; intra_pred_ang8_12: 8x8 angular intra prediction, mode 12 (per the symbol name).
; NOTE(review): original line numbers are not contiguous; set-up of r1/r4 and
; the instructions between the pmaddwd pairs are not visible in this excerpt.
; r5 is anchored at row 16 of ang_table; the bracketed number after the first
; pmaddwd of each pair is the absolute row index (16 + k for [r5 + k * 16]).
2513 cglobal intra_pred_ang8_12, 4,6,7
2514 lea r5, [ang_table + 16 * 16]
2517 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
2518 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
2520 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
2521 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
2524 pmaddwd m4, [r5 + 11 * 16] ; [27]
2528 pmaddwd m2, [r5 + 11 * 16]
2534 pmaddwd m2, [r5 + 6 * 16] ; [22]
2538 pmaddwd m1, [r5 + 6 * 16]
2544 pmaddwd m6, [r5 + 1 * 16] ; [17]
2548 pmaddwd m1, [r5 + 1 * 16]
2554 pmaddwd m5, [r5 - 4 * 16] ; [12]
2558 pmaddwd m1, [r5 - 4 * 16]
2563 punpckhwd m1, m4, m2
2565 punpckhwd m2, m6, m5
2568 punpckldq m5, m4, m6
2570 punpckldq m6, m1, m2
; 8-byte stores at byte offset 0; r2 is reused as the address of the rows four
; strides below r0. r4 presumably holds 3 * stride - TODO confirm.
2575 movhps [r0 + r1], m5
2576 movh [r0 + r1 * 2], m4
2577 movhps [r0 + r4], m4
2578 lea r2, [r0 + r1 * 4]
2580 movhps [r2 + r1], m6
2581 movh [r2 + r1 * 2], m1
2582 movhps [r2 + r4], m1
2585 pmaddwd m4, [r5 - 9 * 16] ; [7]
2589 pmaddwd m2, [r5 - 9 * 16]
2595 pmaddwd m2, [r5 - 14 * 16] ; [2]
2599 pmaddwd m1, [r5 - 14 * 16]
; pw_ang8_12 keeps only two words of m1: source bytes 12-13 go to word 6 and
; source bytes 0-1 to word 7; the other lanes receive word 0's low byte.
; The load that fills m1 beforehand is not visible here.
2606 pshufb m1, [pw_ang8_12]
2610 pmaddwd m6, [r5 + 13 * 16] ; [29]
2614 pmaddwd m5, [r5 + 13 * 16]
2619 pmaddwd m3, [r5 + 8 * 16] ; [24]
2622 pmaddwd m0, [r5 + 8 * 16]
2627 punpckhwd m5, m4, m2
2629 punpckhwd m2, m6, m3
2632 punpckldq m3, m4, m6
2634 punpckldq m6, m5, m2
; Second group of 8-byte stores, at byte offset 8 of each row.
2638 movhps [r0 + r1 + 8], m3
2639 movh [r0 + r1 * 2 + 8], m4
2640 movhps [r0 + r4 + 8], m4
2641 lea r0, [r0 + r1 * 4]
2643 movhps [r0 + r1 + 8], m6
2644 movh [r0 + r1 * 2 + 8], m5
2645 movhps [r0 + r4 + 8], m5
; intra_pred_ang8_13: 8x8 angular intra prediction, mode 13 (per the symbol name).
; NOTE(review): original line numbers are not contiguous; set-up of r1/r4 and
; the instructions between the pmaddwd pairs are not visible in this excerpt.
; r5 is anchored at row 14 of ang_table; the bracketed number after the first
; pmaddwd of each pair is the absolute row index (14 + k for [r5 + k * 16]).
2649 cglobal intra_pred_ang8_13, 4,6,8
2650 lea r5, [ang_table + 14 * 16]
2653 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
2654 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
2656 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
2657 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
2660 pmaddwd m4, [r5 + 9 * 16] ; [23]
2664 pmaddwd m2, [r5 + 9 * 16]
2670 pmaddwd m2, [r5] ; [14]
2680 pmaddwd m6, [r5 - 9 * 16] ; [5]
2684 pmaddwd m1, [r5 - 9 * 16]
; Byte shuffle of m1 with the pw_ang8_13 mask (defined outside this excerpt);
; the load that fills m1 beforehand is not visible here.
2691 pshufb m1, [pw_ang8_13]
2695 pmaddwd m5, [r5 + 14 * 16] ; [28]
2699 pmaddwd m7, [r5 + 14 * 16]
2704 punpckhwd m7, m4, m2
2706 punpckhwd m2, m6, m5
2709 punpckldq m5, m4, m6
2711 punpckldq m6, m7, m2
; 8-byte stores at byte offset 0; r2 is reused as the address of the rows four
; strides below r0. r4 presumably holds 3 * stride - TODO confirm.
2716 movhps [r0 + r1], m5
2717 movh [r0 + r1 * 2], m4
2718 movhps [r0 + r4], m4
2719 lea r2, [r0 + r1 * 4]
2721 movhps [r2 + r1], m6
2722 movh [r2 + r1 * 2], m7
2723 movhps [r2 + r4], m7
2726 pmaddwd m4, [r5 + 5 * 16] ; [19]
2730 pmaddwd m2, [r5 + 5 * 16]
2736 pmaddwd m2, [r5 - 4 * 16] ; [10]
2740 pmaddwd m5, [r5 - 4 * 16]
2746 pmaddwd m6, [r5 - 13 * 16] ; [1]
2750 pmaddwd m5, [r5 - 13 * 16]
2759 pmaddwd m3, [r5 + 10 * 16] ; [24]
2762 pmaddwd m0, [r5 + 10 * 16]
2767 punpckhwd m5, m4, m2
2769 punpckhwd m2, m6, m3
2772 punpckldq m3, m4, m6
2774 punpckldq m6, m5, m2
; Second group of 8-byte stores, at byte offset 8 of each row.
2778 movhps [r0 + r1 + 8], m3
2779 movh [r0 + r1 * 2 + 8], m4
2780 movhps [r0 + r4 + 8], m4
2781 lea r0, [r0 + r1 * 4]
2783 movhps [r0 + r1 + 8], m6
2784 movh [r0 + r1 * 2 + 8], m5
2785 movhps [r0 + r4 + 8], m5
; intra_pred_ang8_14: 8x8 angular intra prediction, mode 14 (per the symbol name).
; NOTE(review): original line numbers are not contiguous; set-up of r1/r4 and
; the instructions between the pmaddwd pairs are not visible in this excerpt.
; r5 is anchored at row 18 of ang_table; the bracketed number after the first
; pmaddwd of each pair is the absolute row index (18 + k for [r5 + k * 16]).
2789 cglobal intra_pred_ang8_14, 4,6,8
2790 lea r5, [ang_table + 18 * 16]
2793 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
2794 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
2796 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
2797 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
2800 pmaddwd m4, [r5 + 1 * 16] ; [19]
2804 pmaddwd m2, [r5 + 1 * 16]
2810 pmaddwd m2, [r5 - 12 * 16] ; [6]
2814 pmaddwd m1, [r5 - 12 * 16]
; Byte shuffle of m1 with the pw_ang8_14 mask (defined outside this excerpt);
; the load that fills m1 beforehand is not visible here.
2821 pshufb m1, [pw_ang8_14]
2825 pmaddwd m6, [r5 + 7 * 16] ; [25]
2829 pmaddwd m5, [r5 + 7 * 16]
2835 pmaddwd m5, [r5 - 6 * 16] ; [12]
2839 pmaddwd m7, [r5 - 6 * 16]
2844 punpckhwd m7, m4, m2
2846 punpckhwd m2, m6, m5
2849 punpckldq m5, m4, m6
2851 punpckldq m6, m7, m2
; 8-byte stores at byte offset 0; r2 is reused as the address of the rows four
; strides below r0. r4 presumably holds 3 * stride - TODO confirm.
2856 movhps [r0 + r1], m5
2857 movh [r0 + r1 * 2], m4
2858 movhps [r0 + r4], m4
2859 lea r2, [r0 + r1 * 4]
2861 movhps [r2 + r1], m6
2862 movh [r2 + r1 * 2], m7
2863 movhps [r2 + r4], m7
2870 pmaddwd m4, [r5 + 13 * 16] ; [31]
2874 pmaddwd m2, [r5 + 13 * 16]
2880 pmaddwd m2, [r5] ; [18]
2890 pmaddwd m6, [r5 - 13 * 16] ; [5]
2894 pmaddwd m5, [r5 - 13 * 16]
2903 pmaddwd m3, [r5 + 6 * 16] ; [24]
2906 pmaddwd m0, [r5 + 6 * 16]
2911 punpckhwd m5, m4, m2
2913 punpckhwd m2, m6, m3
2916 punpckldq m3, m4, m6
2918 punpckldq m6, m5, m2
; Second group of 8-byte stores, at byte offset 8 of each row.
2922 movhps [r0 + r1 + 8], m3
2923 movh [r0 + r1 * 2 + 8], m4
2924 movhps [r0 + r4 + 8], m4
2925 lea r0, [r0 + r1 * 4]
2927 movhps [r0 + r1 + 8], m6
2928 movh [r0 + r1 * 2 + 8], m5
2929 movhps [r0 + r4 + 8], m5
; intra_pred_ang8_15: 8x8 angular intra prediction, mode 15 (per the symbol name).
; NOTE(review): original line numbers are not contiguous; set-up of r1/r4 and
; the instructions between the pmaddwd pairs are not visible in this excerpt.
; r5 is anchored at row 20 of ang_table; the bracketed number after the first
; pmaddwd of each pair is the absolute row index (20 + k for [r5 + k * 16]).
2933 cglobal intra_pred_ang8_15, 4,6,8
2934 lea r5, [ang_table + 20 * 16]
2937 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
2938 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
2940 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
2941 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
2944 pmaddwd m4, [r5 - 5 * 16] ; [15]
2948 pmaddwd m2, [r5 - 5 * 16]
; Byte shuffle of m1 with the pw_ang8_15 mask (defined outside this excerpt);
; the load that fills m1 beforehand is not visible here.
2955 pshufb m1, [pw_ang8_15]
2959 pmaddwd m2, [r5 + 10 * 16] ; [30]
2963 pmaddwd m5, [r5 + 10 * 16]
2969 pmaddwd m6, [r5 - 7 * 16] ; [13]
2973 pmaddwd m5, [r5 - 7 * 16]
2983 pmaddwd m5, [r5 + 8 * 16] ; [28]
2987 pmaddwd m7, [r5 + 8 * 16]
2992 punpckhwd m7, m4, m2
2994 punpckhwd m2, m6, m5
2997 punpckldq m5, m4, m6
2999 punpckldq m6, m7, m2
; 8-byte stores at byte offset 0; r2 is reused as the address of the rows four
; strides below r0. r4 presumably holds 3 * stride - TODO confirm.
3004 movhps [r0 + r1], m5
3005 movh [r0 + r1 * 2], m4
3006 movhps [r0 + r4], m4
3007 lea r2, [r0 + r1 * 4]
3009 movhps [r2 + r1], m6
3010 movh [r2 + r1 * 2], m7
3011 movhps [r2 + r4], m7
3014 pmaddwd m4, [r5 - 9 * 16] ; [11]
3018 pmaddwd m2, [r5 - 9 * 16]
3028 pmaddwd m2, [r5 + 6 * 16] ; [26]
3032 pmaddwd m5, [r5 + 6 * 16]
3038 pmaddwd m6, [r5 - 11 * 16] ; [9]
3042 pmaddwd m5, [r5 - 11 * 16]
; Replace word 0 of m3 with the 16-bit sample at byte offset 16 of the r3 array.
3050 pinsrw m3, [r3 + 16], 0
3052 pmaddwd m3, [r5 + 4 * 16] ; [24]
3055 pmaddwd m0, [r5 + 4 * 16]
3060 punpckhwd m5, m4, m2
3062 punpckhwd m2, m6, m3
3065 punpckldq m3, m4, m6
3067 punpckldq m6, m5, m2
; Second group of 8-byte stores, at byte offset 8 of each row.
3071 movhps [r0 + r1 + 8], m3
3072 movh [r0 + r1 * 2 + 8], m4
3073 movhps [r0 + r4 + 8], m4
3074 lea r0, [r0 + r1 * 4]
3076 movhps [r0 + r1 + 8], m6
3077 movh [r0 + r1 * 2 + 8], m5
3078 movhps [r0 + r4 + 8], m5
; intra_pred_ang8_16: 8x8 angular intra prediction, mode 16 (per the symbol name).
; NOTE(review): original line numbers are not contiguous; set-up of r1/r4 and
; the instructions between the pmaddwd pairs are not visible in this excerpt.
; r5 is anchored at row 13 of ang_table; the bracketed number after the first
; pmaddwd of each pair is the absolute row index (13 + k for [r5 + k * 16]).
3082 cglobal intra_pred_ang8_16, 4,6,8
3083 lea r5, [ang_table + 13 * 16]
3086 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
3087 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
3089 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
3090 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
3093 pmaddwd m4, [r5 - 2 * 16] ; [11]
3097 pmaddwd m2, [r5 - 2 * 16]
; Byte shuffle of m1 with the pw_ang8_16 mask (defined outside this excerpt);
; the load that fills m1 beforehand is not visible here.
3104 pshufb m1, [pw_ang8_16]
3108 pmaddwd m2, [r5 + 9 * 16] ; [22]
3112 pmaddwd m5, [r5 + 9 * 16]
3118 pmaddwd m6, [r5 - 12 * 16] ; [1]
3122 pmaddwd m5, [r5 - 12 * 16]
3132 pmaddwd m5, [r5 - 1 * 16] ; [12]
3136 pmaddwd m7, [r5 - 1 * 16]
3141 punpckhwd m7, m4, m2
3143 punpckhwd m2, m6, m5
3146 punpckldq m5, m4, m6
3148 punpckldq m6, m7, m2
; 8-byte stores at byte offset 0; r2 is reused as the address of the rows four
; strides below r0. r4 presumably holds 3 * stride - TODO confirm.
3153 movhps [r0 + r1], m5
3154 movh [r0 + r1 * 2], m4
3155 movhps [r0 + r4], m4
3156 lea r2, [r0 + r1 * 4]
3158 movhps [r2 + r1], m6
3159 movh [r2 + r1 * 2], m7
3160 movhps [r2 + r4], m7
3167 pmaddwd m4, [r5 + 10 * 16] ; [23]
3171 pmaddwd m2, [r5 + 10 * 16]
3177 pmaddwd m2, [r5 - 11 * 16] ; [2]
3181 pmaddwd m5, [r5 - 11 * 16]
3191 pmaddwd m6, [r5] ; [13]
; Replace word 0 of m3 with the 16-bit sample at byte offset 16 of the r3 array.
3203 pinsrw m3, [r3 + 16], 0
3205 pmaddwd m3, [r5 + 11 * 16] ; [24]
3208 pmaddwd m0, [r5 + 11 * 16]
3213 punpckhwd m5, m4, m2
3215 punpckhwd m2, m6, m3
3218 punpckldq m3, m4, m6
3220 punpckldq m6, m5, m2
; Second group of 8-byte stores, at byte offset 8 of each row.
3224 movhps [r0 + r1 + 8], m3
3225 movh [r0 + r1 * 2 + 8], m4
3226 movhps [r0 + r4 + 8], m4
3227 lea r0, [r0 + r1 * 4]
3229 movhps [r0 + r1 + 8], m6
3230 movh [r0 + r1 * 2 + 8], m5
3231 movhps [r0 + r4 + 8], m5
; intra_pred_ang8_17: 8x8 angular intra prediction, mode 17 (per the symbol name).
; NOTE(review): original line numbers are not contiguous; set-up of r1/r4 and
; the instructions between the pmaddwd pairs are not visible in this excerpt.
; r5 is anchored at row 17 of ang_table; the bracketed number after the first
; pmaddwd of each pair is the absolute row index (17 + k for [r5 + k * 16]).
3235 cglobal intra_pred_ang8_17, 4,6,8
3236 lea r5, [ang_table + 17 * 16]
3239 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
3240 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
3242 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
3243 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
3246 pmaddwd m4, [r5 - 11 * 16] ; [6]
3250 pmaddwd m2, [r5 - 11 * 16]
; Byte shuffle of m1 with the pw_ang8_17 mask (defined outside this excerpt);
; the load that fills m1 beforehand is not visible here.
3257 pshufb m1, [pw_ang8_17]
3261 pmaddwd m2, [r5 - 5 * 16] ; [12]
3265 pmaddwd m5, [r5 - 5 * 16]
3275 pmaddwd m6, [r5 + 1 * 16] ; [18]
3279 pmaddwd m5, [r5 + 1 * 16]
3289 pmaddwd m5, [r5 + 7 * 16] ; [24]
3293 pmaddwd m7, [r5 + 7 * 16]
3298 punpckhwd m7, m4, m2
3300 punpckhwd m2, m6, m5
3303 punpckldq m5, m4, m6
3305 punpckldq m6, m7, m2
; 8-byte stores at byte offset 0; r2 is reused as the address of the rows four
; strides below r0. r4 presumably holds 3 * stride - TODO confirm.
3310 movhps [r0 + r1], m5
3311 movh [r0 + r1 * 2], m4
3312 movhps [r0 + r4], m4
3313 lea r2, [r0 + r1 * 4]
3315 movhps [r2 + r1], m6
3316 movh [r2 + r1 * 2], m7
3317 movhps [r2 + r4], m7
3324 pmaddwd m4, [r5 + 13 * 16] ; [30]
3328 pmaddwd m2, [r5 + 13 * 16]
3334 pmaddwd m2, [r5 - 13 * 16] ; [4]
3338 pmaddwd m5, [r5 - 13 * 16]
3348 pmaddwd m6, [r5 - 7 * 16] ; [10]
3352 pmaddwd m5, [r5 - 7 * 16]
3361 pmaddwd m3, [r5 - 1 * 16] ; [16]
3364 pmaddwd m0, [r5 - 1 * 16]
3369 punpckhwd m5, m4, m2
3371 punpckhwd m2, m6, m3
3374 punpckldq m3, m4, m6
3376 punpckldq m6, m5, m2
; Second group of 8-byte stores, at byte offset 8 of each row.
3380 movhps [r0 + r1 + 8], m3
3381 movh [r0 + r1 * 2 + 8], m4
3382 movhps [r0 + r4 + 8], m4
3383 lea r0, [r0 + r1 * 4]
3385 movhps [r0 + r1 + 8], m6
3386 movh [r0 + r1 * 2 + 8], m5
3387 movhps [r0 + r4 + 8], m5
; intra_pred_ang8_18: 8x8 intra prediction, mode 18 (per the symbol name).
; No table multiplies are visible: each row is a byte-window taken from the
; register pair m1:m0 with palignr, the window moving down by 2 bytes (one
; 16-bit sample) per row, from 14 to 4 in the visible lines.
; NOTE(review): original line numbers are not contiguous; the loads of m0/m1
; and most row stores are not visible in this excerpt.
3391 cglobal intra_pred_ang8_18, 4,5,3
; Reorder m0 with the pw_swap16 mask (defined outside this excerpt; presumably
; reverses the word order - TODO confirm).
3396 pshufb m0, [pw_swap16]
3398 palignr m2, m1, m0, 14
3400 palignr m2, m1, m0, 12
3401 movu [r0 + r1 * 2], m2
3402 palignr m2, m1, m0, 10
; Advance the destination pointer by four rows.
3404 lea r0, [r0 + r1 * 4]
3405 palignr m2, m1, m0, 8
3407 palignr m2, m1, m0, 6
3409 palignr m2, m1, m0, 4
3410 movu [r0 + r1 * 2], m2
; intra_pred_ang8_19: 8x8 angular intra prediction, mode 19 (per the symbol name).
; Uses the same table anchor (row 17), the same coefficient rows and the same
; pw_ang8_17 shuffle mask as intra_pred_ang8_17, but loads its main samples
; from r3 instead of r2 and writes whole 16-byte rows with movu.
; NOTE(review): original line numbers are not contiguous; most stores and the
; instructions between the pmaddwd pairs are not visible in this excerpt.
; Bracketed number after a pmaddwd = absolute ang_table row (17 + k).
3415 cglobal intra_pred_ang8_19, 4,6,8
3416 lea r5, [ang_table + 17 * 16]
3419 movu m0, [r3] ; [7 6 5 4 3 2 1 0]
3420 movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
3422 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
3423 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
3426 pmaddwd m4, [r5 - 11 * 16] ; [6]
3430 pmaddwd m2, [r5 - 11 * 16]
3437 pshufb m1, [pw_ang8_17]
3441 pmaddwd m2, [r5 - 5 * 16] ; [12]
3445 pmaddwd m5, [r5 - 5 * 16]
3455 pmaddwd m6, [r5 + 1 * 16] ; [18]
3459 pmaddwd m5, [r5 + 1 * 16]
3469 pmaddwd m5, [r5 + 7 * 16] ; [24]
3473 pmaddwd m7, [r5 + 7 * 16]
3481 movu [r0 + r1 * 2], m6
3489 pmaddwd m4, [r5 + 13 * 16] ; [30]
3493 pmaddwd m2, [r5 + 13 * 16]
3499 pmaddwd m2, [r5 - 13 * 16] ; [4]
3503 pmaddwd m5, [r5 - 13 * 16]
3513 pmaddwd m6, [r5 - 7 * 16] ; [10]
3517 pmaddwd m5, [r5 - 7 * 16]
3526 pmaddwd m3, [r5 - 1 * 16] ; [16]
3529 pmaddwd m0, [r5 - 1 * 16]
; Advance the destination pointer by four rows for the second half.
3534 lea r0, [r0 + r1 * 4]
3537 movu [r0 + r1 * 2], m6
; intra_pred_ang8_20: 8x8 angular intra prediction, mode 20 (per the symbol name).
; Uses the same table anchor (row 13), coefficient rows and pw_ang8_16 shuffle
; mask as intra_pred_ang8_16, with the roles of r2 and r3 exchanged, and writes
; whole 16-byte rows with movu.
; NOTE(review): original line numbers are not contiguous; most stores and the
; instructions between the pmaddwd pairs are not visible in this excerpt.
; Bracketed number after a pmaddwd = absolute ang_table row (13 + k).
3542 cglobal intra_pred_ang8_20, 4,6,8
3543 lea r5, [ang_table + 13 * 16]
3546 movu m0, [r3] ; [7 6 5 4 3 2 1 0]
3547 movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
3549 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
3550 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
3553 pmaddwd m4, [r5 - 2 * 16] ; [11]
3557 pmaddwd m2, [r5 - 2 * 16]
3564 pshufb m1, [pw_ang8_16]
3568 pmaddwd m2, [r5 + 9 * 16] ; [22]
3572 pmaddwd m5, [r5 + 9 * 16]
3578 pmaddwd m6, [r5 - 12 * 16] ; [1]
3582 pmaddwd m5, [r5 - 12 * 16]
3592 pmaddwd m5, [r5 - 1 * 16] ; [12]
3596 pmaddwd m7, [r5 - 1 * 16]
3604 movu [r0 + r1 * 2], m6
3612 pmaddwd m4, [r5 + 10 * 16] ; [23]
3616 pmaddwd m2, [r5 + 10 * 16]
3622 pmaddwd m2, [r5 - 11 * 16] ; [2]
3626 pmaddwd m5, [r5 - 11 * 16]
3636 pmaddwd m6, [r5] ; [13]
; Replace word 0 of m3 with the 16-bit sample at byte offset 16 of the r2 array.
3648 pinsrw m3, [r2 + 16], 0
3650 pmaddwd m3, [r5 + 11 * 16] ; [24]
3653 pmaddwd m0, [r5 + 11 * 16]
; Advance the destination pointer by four rows for the second half.
3658 lea r0, [r0 + r1 * 4]
3661 movu [r0 + r1 * 2], m6
; intra_pred_ang8_21: 8x8 angular intra prediction, mode 21 (per the symbol name).
; Uses the same table anchor (row 20), coefficient rows and pw_ang8_15 shuffle
; mask as intra_pred_ang8_15, with the roles of r2 and r3 exchanged, and writes
; whole 16-byte rows with movu.
; NOTE(review): original line numbers are not contiguous; most stores and the
; instructions between the pmaddwd pairs are not visible in this excerpt.
; Bracketed number after a pmaddwd = absolute ang_table row (20 + k).
3666 cglobal intra_pred_ang8_21, 4,6,8
3667 lea r5, [ang_table + 20 * 16]
3670 movu m0, [r3] ; [7 6 5 4 3 2 1 0]
3671 movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
3673 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
3674 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
3677 pmaddwd m4, [r5 - 5 * 16] ; [15]
3681 pmaddwd m2, [r5 - 5 * 16]
3688 pshufb m1, [pw_ang8_15]
3692 pmaddwd m2, [r5 + 10 * 16] ; [30]
3696 pmaddwd m5, [r5 + 10 * 16]
3702 pmaddwd m6, [r5 - 7 * 16] ; [13]
3706 pmaddwd m5, [r5 - 7 * 16]
3716 pmaddwd m5, [r5 + 8 * 16] ; [28]
3720 pmaddwd m7, [r5 + 8 * 16]
3728 movu [r0 + r1 * 2], m6
3732 pmaddwd m4, [r5 - 9 * 16] ; [11]
3736 pmaddwd m2, [r5 - 9 * 16]
3746 pmaddwd m2, [r5 + 6 * 16] ; [26]
3750 pmaddwd m5, [r5 + 6 * 16]
3756 pmaddwd m6, [r5 - 11 * 16] ; [9]
3760 pmaddwd m5, [r5 - 11 * 16]
; Replace word 0 of m3 with the 16-bit sample at byte offset 16 of the r2 array.
3768 pinsrw m3, [r2 + 16], 0
3770 pmaddwd m3, [r5 + 4 * 16] ; [24]
3773 pmaddwd m0, [r5 + 4 * 16]
; Advance the destination pointer by four rows for the second half.
3778 lea r0, [r0 + r1 * 4]
3781 movu [r0 + r1 * 2], m6
; intra_pred_ang8_22: 8x8 angular intra prediction, mode 22 (per the symbol name).
; Uses the same table anchor (row 18), coefficient rows and pw_ang8_14 shuffle
; mask as intra_pred_ang8_14, but loads its main samples from r3 and writes
; whole 16-byte rows with movu.
; NOTE(review): original line numbers are not contiguous; most stores and the
; instructions between the pmaddwd pairs are not visible in this excerpt.
; Bracketed number after a pmaddwd = absolute ang_table row (18 + k).
3786 cglobal intra_pred_ang8_22, 4,6,8
3787 lea r5, [ang_table + 18 * 16]
3790 movu m0, [r3] ; [7 6 5 4 3 2 1 0]
3791 movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
3793 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
3794 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
3797 pmaddwd m4, [r5 + 1 * 16] ; [19]
3801 pmaddwd m2, [r5 + 1 * 16]
3807 pmaddwd m2, [r5 - 12 * 16] ; [6]
3811 pmaddwd m1, [r5 - 12 * 16]
3818 pshufb m1, [pw_ang8_14]
3822 pmaddwd m6, [r5 + 7 * 16] ; [25]
3826 pmaddwd m5, [r5 + 7 * 16]
3832 pmaddwd m5, [r5 - 6 * 16] ; [12]
3836 pmaddwd m7, [r5 - 6 * 16]
3844 movu [r0 + r1 * 2], m6
3852 pmaddwd m4, [r5 + 13 * 16] ; [31]
3856 pmaddwd m2, [r5 + 13 * 16]
3862 pmaddwd m2, [r5] ; [18]
3872 pmaddwd m6, [r5 - 13 * 16] ; [5]
3876 pmaddwd m5, [r5 - 13 * 16]
3885 pmaddwd m3, [r5 + 6 * 16] ; [24]
3888 pmaddwd m0, [r5 + 6 * 16]
; Advance the destination pointer by four rows for the second half.
3893 lea r0, [r0 + r1 * 4]
3896 movu [r0 + r1 * 2], m6
; intra_pred_ang8_23: 8x8 angular intra prediction, mode 23 (per the symbol name).
; Uses the same table anchor (row 14), coefficient rows and pw_ang8_13 shuffle
; mask as intra_pred_ang8_13, but loads its main samples from r3 and writes
; whole 16-byte rows with movu.
; NOTE(review): original line numbers are not contiguous; most stores and the
; instructions between the pmaddwd pairs are not visible in this excerpt.
; Bracketed number after a pmaddwd = absolute ang_table row (14 + k).
3901 cglobal intra_pred_ang8_23, 4,6,8
3902 lea r5, [ang_table + 14 * 16]
3905 movu m0, [r3] ; [7 6 5 4 3 2 1 0]
3906 movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
3908 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
3909 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
3912 pmaddwd m4, [r5 + 9 * 16] ; [23]
3916 pmaddwd m2, [r5 + 9 * 16]
3922 pmaddwd m2, [r5] ; [14]
3932 pmaddwd m6, [r5 - 9 * 16] ; [5]
3936 pmaddwd m1, [r5 - 9 * 16]
3943 pshufb m1, [pw_ang8_13]
3947 pmaddwd m5, [r5 + 14 * 16] ; [28]
3951 pmaddwd m7, [r5 + 14 * 16]
3959 movu [r0 + r1 * 2], m6
3963 pmaddwd m4, [r5 + 5 * 16] ; [19]
3967 pmaddwd m2, [r5 + 5 * 16]
3973 pmaddwd m2, [r5 - 4 * 16] ; [10]
3977 pmaddwd m5, [r5 - 4 * 16]
3983 pmaddwd m6, [r5 - 13 * 16] ; [1]
3987 pmaddwd m5, [r5 - 13 * 16]
3996 pmaddwd m3, [r5 + 10 * 16] ; [24]
3999 pmaddwd m0, [r5 + 10 * 16]
; Advance the destination pointer by four rows for the second half.
4004 lea r0, [r0 + r1 * 4]
4007 movu [r0 + r1 * 2], m6
; intra_pred_ang8_24: 8x8 angular intra prediction, mode 24 (per the symbol name).
; Uses the same table anchor (row 16), coefficient rows and pw_ang8_12 shuffle
; mask as intra_pred_ang8_12, but loads its main samples from r3 and writes
; whole 16-byte rows with movu.
; NOTE(review): original line numbers are not contiguous; most stores and the
; instructions between the pmaddwd pairs are not visible in this excerpt.
; Bracketed number after a pmaddwd = absolute ang_table row (16 + k).
4012 cglobal intra_pred_ang8_24, 4,6,7
4013 lea r5, [ang_table + 16 * 16]
4016 movu m0, [r3] ; [7 6 5 4 3 2 1 0]
4017 movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
4019 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
4020 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
4023 pmaddwd m4, [r5 + 11 * 16] ; [27]
4027 pmaddwd m2, [r5 + 11 * 16]
4033 pmaddwd m2, [r5 + 6 * 16] ; [22]
4037 pmaddwd m1, [r5 + 6 * 16]
4043 pmaddwd m6, [r5 + 1 * 16] ; [17]
4047 pmaddwd m1, [r5 + 1 * 16]
4053 pmaddwd m5, [r5 - 4 * 16] ; [12]
4057 pmaddwd m1, [r5 - 4 * 16]
4065 movu [r0 + r1 * 2], m6
4069 pmaddwd m4, [r5 - 9 * 16] ; [7]
4073 pmaddwd m2, [r5 - 9 * 16]
4079 pmaddwd m2, [r5 - 14 * 16] ; [2]
4083 pmaddwd m1, [r5 - 14 * 16]
; pw_ang8_12 keeps only two words of m1: source bytes 12-13 go to word 6 and
; source bytes 0-1 to word 7.
4090 pshufb m1, [pw_ang8_12]
4094 pmaddwd m6, [r5 + 13 * 16] ; [29]
4098 pmaddwd m5, [r5 + 13 * 16]
4103 pmaddwd m3, [r5 + 8 * 16] ; [24]
4106 pmaddwd m0, [r5 + 8 * 16]
; Advance the destination pointer by four rows for the second half.
4111 lea r0, [r0 + r1 * 4]
4114 movu [r0 + r1 * 2], m6
; intra_pred_ang8_25: 8x8 angular intra prediction, mode 25 (per the symbol name).
; Uses the same table anchor (row 23) and coefficient rows (30 down to 16 in
; steps of 2) as intra_pred_ang8_11, but writes whole 16-byte rows with movu.
; NOTE(review): original line numbers are not contiguous (e.g. the line between
; the cglobal and the lea is missing), so how r2 is set up, most stores, and
; the instructions between the pmaddwd pairs are not visible in this excerpt.
; Bracketed number after a pmaddwd = absolute ang_table row (23 + k).
4119 cglobal intra_pred_ang8_25, 3,5,7
4121 lea r3, [ang_table + 23 * 16]
4124 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
4125 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
4127 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
4128 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
4131 pmaddwd m4, [r3 + 7 * 16] ; [30]
4135 pmaddwd m2, [r3 + 7 * 16]
4141 pmaddwd m2, [r3 + 5 * 16] ; [28]
4145 pmaddwd m1, [r3 + 5 * 16]
4151 pmaddwd m6, [r3 + 3 * 16] ; [26]
4155 pmaddwd m1, [r3 + 3 * 16]
4161 pmaddwd m5, [r3 + 1 * 16] ; [24]
4165 pmaddwd m1, [r3 + 1 * 16]
4173 movu [r0 + r1 * 2], m6
4177 pmaddwd m4, [r3 - 1 * 16] ; [22]
4181 pmaddwd m2, [r3 - 1 * 16]
4187 pmaddwd m2, [r3 - 3 * 16] ; [20]
4191 pmaddwd m1, [r3 - 3 * 16]
4197 pmaddwd m6, [r3 - 5 * 16] ; [18]
4201 pmaddwd m5, [r3 - 5 * 16]
4206 pmaddwd m3, [r3 - 7 * 16] ; [16]
4209 pmaddwd m0, [r3 - 7 * 16]
; Advance the destination pointer by four rows for the second half.
4214 lea r0, [r0 + r1 * 4]
4217 movu [r0 + r1 * 2], m6
; intra_pred_ang8_26: 8x8 intra prediction, mode 26 (per the symbol name).
; The eight samples loaded from r3 are stored unchanged to the destination rows
; (same m0 for every visible row store). A later stage overwrites the first
; 16-bit sample of individual rows with words extracted from m0.
; NOTE(review): original line numbers are not contiguous; several row stores,
; the condition guarding the second stage and its arithmetic are not visible.
4222 cglobal intra_pred_ang8_26, 4,5,3
4223 movu m0, [r3 + 2] ; [8 7 6 5 4 3 2 1]
4229 movu [r0 + r1 * 2], m0
; r3 is reused as the address of the rows four strides below r0.
4232 lea r3, [r0 + r1 *4]
4235 movu [r3 + r1 * 2], m0
; Broadcast word 0 of m0 (pw_unpackwdq = times 8 db 0,1), then fetch sample 0
; (broadcast into m2) and samples 1..8 (m1) of the r2 array.
4243 pshufb m0, [pw_unpackwdq]
4244 movh m1, [r2] ; [3 2 1 0]
4245 pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0]
4246 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
; Upper clamp against pw_1023 (constant defined outside this excerpt;
; presumably the 10-bit sample maximum - TODO confirm).
4252 pminsw m0, [pw_1023]
; Word i of m0 becomes the first sample of row i (rows 1-3 via r0, rows 5-7
; via r3); r4 presumably holds 3 * stride - TODO confirm.
4254 pextrw [r0 + r1], m0, 1
4255 pextrw [r0 + r1 * 2], m0, 2
4256 pextrw [r0 + r4], m0, 3
4258 pextrw [r3 + r1], m0, 5
4259 pextrw [r3 + r1 * 2], m0, 6
4260 pextrw [r3 + r4], m0, 7
; intra_pred_ang8_27: 8x8 angular intra prediction, mode 27 (per the symbol name).
; Uses the same table anchor (row 9) and coefficient rows (even rows 2..16) as
; intra_pred_ang8_9, but writes whole 16-byte rows with movu.
; NOTE(review): original line numbers are not contiguous (the line between the
; cglobal and the lea is missing), so how r2 is set up, most stores, and the
; instructions between the pmaddwd pairs are not visible in this excerpt.
; Bracketed number after a pmaddwd = absolute ang_table row (9 + k).
4265 cglobal intra_pred_ang8_27, 3,5,7
4267 lea r3, [ang_table + 9 * 16]
4270 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
4271 movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2]
4273 punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1]
4274 punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5]
4277 pmaddwd m4, [r3 - 7 * 16] ; [2]
4281 pmaddwd m2, [r3 - 7 * 16]
4287 pmaddwd m2, [r3 - 5 * 16] ; [4]
4291 pmaddwd m1, [r3 - 5 * 16]
4297 pmaddwd m6, [r3 - 3 * 16] ; [6]
4301 pmaddwd m1, [r3 - 3 * 16]
4307 pmaddwd m5, [r3 - 1 * 16] ; [8]
4311 pmaddwd m1, [r3 - 1 * 16]
4319 movu [r0 + r1 * 2], m6
4323 pmaddwd m4, [r3 + 1 * 16] ; [10]
4327 pmaddwd m2, [r3 + 1 * 16]
4333 pmaddwd m2, [r3 + 3 * 16] ; [12]
4337 pmaddwd m1, [r3 + 3 * 16]
4343 pmaddwd m6, [r3 + 5 * 16] ; [14]
4347 pmaddwd m5, [r3 + 5 * 16]
4352 pmaddwd m3, [r3 + 7 * 16] ; [16]
4355 pmaddwd m0, [r3 + 7 * 16]
; Advance the destination pointer by four rows for the second half.
4360 lea r0, [r0 + r1 * 4]
4363 movu [r0 + r1 * 2], m6
; intra_pred_ang8_28: 8x8 angular intra prediction, mode 28 (per the symbol name).
; Uses the same table anchor (row 17) and coefficient rows as
; intra_pred_ang8_8, but writes whole 16-byte rows with movu.
; NOTE(review): original line numbers are not contiguous (the line between the
; cglobal and the lea is missing), so how r2 is set up, most stores, and the
; instructions between the pmaddwd pairs are not visible in this excerpt.
; Bracketed number after a pmaddwd = absolute ang_table row (17 + k).
4368 cglobal intra_pred_ang8_28, 3,5,7
4370 lea r3, [ang_table + 17 * 16]
4373 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
4374 movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2]
4376 punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1]
4377 punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5]
4380 pmaddwd m4, [r3 - 12 * 16] ; [5]
4384 pmaddwd m2, [r3 - 12 * 16]
4390 pmaddwd m2, [r3 - 7 * 16] ; [10]
4394 pmaddwd m1, [r3 - 7 * 16]
4400 pmaddwd m6, [r3 - 2 * 16] ; [15]
4404 pmaddwd m1, [r3 - 2 * 16]
4410 pmaddwd m5, [r3 + 3 * 16] ; [20]
4414 pmaddwd m1, [r3 + 3 * 16]
4422 movu [r0 + r1 * 2], m6
4426 pmaddwd m4, [r3 + 8 * 16] ; [25]
4430 pmaddwd m2, [r3 + 8 * 16]
4436 pmaddwd m2, [r3 + 13 * 16] ; [30]
4440 pmaddwd m1, [r3 + 13 * 16]
; movh loads only 8 bytes, i.e. four 16-bit samples (9..12), matching the
; identical load in intra_pred_ang8_8.
4445 movh m1, [r2 + 18] ; [12 11 10 9]
4447 palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
4449 pmaddwd m6, [r3 - 14 * 16] ; [3]
4452 palignr m1, m0, 4 ; [10 9 9 8 8 7 7 6]
4454 pmaddwd m1, [r3 - 14 * 16]
4459 pmaddwd m5, [r3 - 9 * 16] ; [8]
4462 pmaddwd m3, [r3 - 9 * 16]
; Advance the destination pointer by four rows for the second half.
4467 lea r0, [r0 + r1 * 4]
4470 movu [r0 + r1 * 2], m6
; intra_pred_ang8_29: 8x8 angular intra prediction, mode 29 (per the symbol name).
; NOTE(review): original line numbers are not contiguous (the line between the
; cglobal and the lea is missing), so how r2 is set up, most stores, and the
; instructions between the pmaddwd pairs are not visible in this excerpt.
; r3 is anchored at row 18 of ang_table; the bracketed number after the first
; pmaddwd of each pair is the absolute row index (18 + k for [r3 + k * 16]).
4475 cglobal intra_pred_ang8_29, 3,5,8
4477 lea r3, [ang_table + 18 * 16]
; Load samples 1..16 and build three vectors of adjacent-sample pairs
; (m3: 1..5, m0: 5..9, m5: 9..13); palignr later slides across them.
4480 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
4481 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
4482 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
4483 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
4485 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
4486 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
4487 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
4490 pmaddwd m4, [r3 - 9 * 16] ; [9]
4494 pmaddwd m2, [r3 - 9 * 16]
4500 pmaddwd m2, [r3] ; [18]
4510 pmaddwd m6, [r3 + 9 * 16] ; [27]
4514 pmaddwd m1, [r3 + 9 * 16]
; A 4-byte palignr shift advances the pair window by one sample.
4519 palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
4520 pmaddwd m7, [r3 - 14 * 16] ; [4]
4523 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
4524 pmaddwd m1, [r3 - 14 * 16]
4532 movu [r0 + r1 * 2], m6
4535 palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
4537 pmaddwd m4, [r3 - 5 * 16] ; [13]
4540 palignr m2, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
4542 pmaddwd m2, [r3 - 5 * 16]
; Three-operand pmaddwd forms keep m6/m7 intact for reuse with row 31 below.
4547 pmaddwd m2, m6, [r3 + 4 * 16] ; [22]
4550 pmaddwd m1, m7, [r3 + 4 * 16]
4555 pmaddwd m6, [r3 + 13 * 16] ; [31]
4558 pmaddwd m7, [r3 + 13 * 16]
4563 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
4564 pmaddwd m7, [r3 - 10 * 16] ; [8]
4567 palignr m5, m0, 8 ; [11 10 10 9 9 8 8 7]
4568 pmaddwd m5, [r3 - 10 * 16]
; Advance the destination pointer by four rows for the second half.
4573 lea r0, [r0 + r1 * 4]
4576 movu [r0 + r1 * 2], m6
; intra_pred_ang8_30: 8x8 angular intra prediction, mode 30 (per the symbol name).
; NOTE(review): original line numbers are not contiguous (the line between the
; cglobal and the lea is missing), so how r2 is set up, most stores, and the
; instructions between the pmaddwd pairs are not visible in this excerpt.
; r3 is anchored at row 14 of ang_table; the bracketed number after the first
; pmaddwd of each pair is the absolute row index (14 + k for [r3 + k * 16]).
4581 cglobal intra_pred_ang8_30, 3,5,8
4583 lea r3, [ang_table + 14 * 16]
; Load samples 1..16 and build three vectors of adjacent-sample pairs
; (m3: 1..5, m0: 5..9, m5: 9..13); palignr later slides across them.
4586 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
4587 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
4588 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
4589 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
4591 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
4592 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
4593 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
4596 pmaddwd m4, [r3 - 1 * 16] ; [13]
4600 pmaddwd m2, [r3 - 1 * 16]
4606 pmaddwd m2, [r3 + 12 * 16] ; [26]
4610 pmaddwd m1, [r3 + 12 * 16]
; A 4-byte palignr shift advances the pair window by one sample.
4615 palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
4617 pmaddwd m6, [r3 - 7 * 16] ; [7]
4620 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
4621 pmaddwd m1, [r3 - 7 * 16]
4626 pmaddwd m7, [r3 + 6 * 16] ; [20]
4629 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
4630 pmaddwd m1, [r3 + 6 * 16]
4638 movu [r0 + r1 * 2], m6
4641 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
4643 pmaddwd m4, [r3 - 13 * 16] ; [1]
4646 palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
4648 pmaddwd m2, [r3 - 13 * 16]
; Three-operand pmaddwd forms keep m6/m7 intact for reuse with row 27 below.
4653 pmaddwd m2, m6, [r3] ; [14]
4656 pmaddwd m1, m7, [r3]
4661 pmaddwd m6, [r3 + 13 * 16] ; [27]
4664 pmaddwd m7, [r3 + 13 * 16]
4669 palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
4670 pmaddwd m7, [r3 - 6 * 16] ; [8]
4673 palignr m5, m0, 12 ; [12 11 11 10 10 9 9 8]
4674 pmaddwd m5, [r3 - 6 * 16]
; Advance the destination pointer by four rows for the second half.
4679 lea r0, [r0 + r1 * 4]
4682 movu [r0 + r1 * 2], m6
; intra_pred_ang8_31: 8x8 angular intra prediction, mode 31 (per the symbol name).
; NOTE(review): original line numbers are not contiguous (the line between the
; cglobal and the lea is missing), so how r2 is set up, most stores, and the
; instructions between the pmaddwd pairs are not visible in this excerpt.
; r3 is anchored at row 13 of ang_table; the bracketed number after the first
; pmaddwd of each pair is the absolute row index (13 + k for [r3 + k * 16]).
4687 cglobal intra_pred_ang8_31, 3,5,8
4689 lea r3, [ang_table + 13 * 16]
; Load samples 1..16 and build three vectors of adjacent-sample pairs
; (m3: 1..5, m0: 5..9, m5: 9..13); palignr later slides across them in
; 4-byte (one sample) steps.
4692 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
4693 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
4694 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
4695 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
4697 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
4698 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
4699 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
4702 pmaddwd m4, [r3 + 4 * 16] ; [17]
4706 pmaddwd m2, [r3 + 4 * 16]
4711 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
4713 pmaddwd m2, [r3 - 11 * 16] ; [2]
4716 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
4718 pmaddwd m1, [r3 - 11 * 16]
4723 pmaddwd m6, [r3 + 6 * 16] ; [19]
4726 pmaddwd m7, [r3 + 6 * 16]
4731 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
4732 pmaddwd m7, [r3 - 9 * 16] ; [4]
4735 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
4736 pmaddwd m1, [r3 - 9 * 16]
4744 movu [r0 + r1 * 2], m6
4747 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
4748 pmaddwd m4, [r3 + 8 * 16] ; [21]
4751 palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
4752 pmaddwd m2, [r3 + 8 * 16]
4757 palignr m2, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
4759 pmaddwd m2, [r3 - 7 * 16] ; [6]
4762 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
4764 pmaddwd m1, [r3 - 7 * 16]
4769 pmaddwd m6, [r3 + 10 * 16] ; [23]
4772 pmaddwd m7, [r3 + 10 * 16]
4778 pmaddwd m7, [r3 - 5 * 16] ; [8]
4782 pmaddwd m1, [r3 - 5 * 16]
; Advance the destination pointer by four rows for the second half.
4787 lea r0, [r0 + r1 * 4]
4790 movu [r0 + r1 * 2], m6
; intra_pred_ang8_32: 8x8 angular intra prediction, mode 32 (per the symbol name).
; NOTE(review): original line numbers are not contiguous (the line between the
; cglobal and the lea is missing), so how r2 is set up, most stores, and the
; instructions between the pmaddwd pairs are not visible in this excerpt.
; r3 is anchored at row 19 of ang_table; the bracketed number after the first
; pmaddwd of each pair is the absolute row index (19 + k for [r3 + k * 16]).
4795 cglobal intra_pred_ang8_32, 3,6,8
4797 lea r3, [ang_table + 19 * 16]
; Load samples 1..16 and build three vectors of adjacent-sample pairs
; (m3: 1..5, m0: 5..9, m5: 9..13); palignr later slides across them in
; 4-byte (one sample) steps.
4800 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
4801 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
4802 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
4803 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
4805 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
4806 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
4807 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
4810 pmaddwd m4, [r3 + 2 * 16] ; [21]
4814 pmaddwd m2, [r3 + 2 * 16]
4819 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
4821 pmaddwd m2, [r3 - 9 * 16] ; [10]
4824 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
4826 pmaddwd m1, [r3 - 9 * 16]
4831 pmaddwd m6, [r3 + 12 * 16] ; [31]
4834 pmaddwd m7, [r3 + 12 * 16]
4839 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
4840 pmaddwd m7, [r3 + 1 * 16] ; [20]
4843 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
4844 pmaddwd m1, [r3 + 1 * 16]
4852 movu [r0 + r1 * 2], m6
4855 palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
4857 pmaddwd m4, [r3 - 10 * 16] ; [ 9]
4860 palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
4862 pmaddwd m3, [r3 - 10 * 16]
4867 pmaddwd m2, [r3 + 11 * 16] ; [30]
4870 pmaddwd m6, [r3 + 11 * 16]
4876 pmaddwd m6, [r3] ; [19]
; movh loads 8 bytes: four more 16-bit samples (13..16) to extend the window.
4885 movh m1, [r2 + 26] ; [16 15 14 13]
4886 palignr m7, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
4887 pmaddwd m7, [r3 - 11 * 16] ; [8]
4890 palignr m1, m5, 4 ; [14 13 13 12 12 11 11 10]
4891 pmaddwd m1, [r3 - 11 * 16]
; Advance the destination pointer by four rows for the second half.
4896 lea r0, [r0 + r1 * 4]
4899 movu [r0 + r1 * 2], m6
; intra_pred_ang8_33: 8x8 angular intra prediction, mode 33 (per the symbol name).
; NOTE(review): original line numbers are not contiguous (the line between the
; cglobal and the lea is missing), so how r2 is set up, most stores, and the
; instructions between the pmaddwd pairs are not visible in this excerpt.
; r3 is anchored at row 14 of ang_table; the bracketed number after the first
; pmaddwd of each pair is the absolute row index (14 + k for [r3 + k * 16]).
4904 cglobal intra_pred_ang8_33, 3,5,8
4906 lea r3, [ang_table + 14 * 16]
; Load samples 1..16 and build four vectors of adjacent-sample pairs
; (m3: 1..5, m0: 5..9, m5: 9..13, m1: 13..16); palignr later slides across
; them in 4-byte (one sample) steps.
4909 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
4910 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
4911 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
4912 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
4914 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
4915 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
4916 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
4917 punpckhwd m1, m4 ; [x 16 16 15 15 14 14 13]
4920 pmaddwd m4, [r3 + 12 * 16] ; [26]
4924 pmaddwd m2, [r3 + 12 * 16]
4929 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
4930 pmaddwd m2, [r3 + 6 * 16] ; [20]
4933 palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
4934 pmaddwd m6, [r3 + 6 * 16]
4939 palignr m6, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
4940 pmaddwd m6, [r3] ; [14]
4943 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
4949 palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
4950 pmaddwd m7, [r3 - 6 * 16] ; [ 8]
4953 palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
4954 pmaddwd m3, [r3 - 6 * 16]
4962 movu [r0 + r1 * 2], m6
4966 pmaddwd m4, [r3 - 12 * 16] ; [ 2]
4970 pmaddwd m2, [r3 - 12 * 16]
4976 pmaddwd m2, [r3 + 14 * 16] ; [28]
4980 pmaddwd m6, [r3 + 14 * 16]
4985 palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
4986 pmaddwd m6, [r3 + 8 * 16] ; [22]
4989 palignr m7, m1, m5, 4 ; [14 13 13 12 12 11 11 10]
4990 pmaddwd m7, [r3 + 8 * 16]
4995 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
4996 pmaddwd m7, [r3 + 2 * 16] ; [16]
4999 palignr m1, m5, 8 ; [15 14 14 13 13 12 12 11]
5000 pmaddwd m1, [r3 + 2 * 16]
; Advance the destination pointer by four rows for the second half.
5005 lea r0, [r0 + r1 * 4]
5008 movu [r0 + r1 * 2], m6
5013 ;-----------------------------------------------------------------------------
5014 ; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
5015 ;-----------------------------------------------------------------------------
; intra_pred_ang16_2: 16x16 intra prediction, mode 2 (per the symbol name).
; No multiplies are visible: every row is a pair of 16-byte windows cut with
; palignr from three consecutive source registers, the window advancing by
; 2 bytes (one 16-bit sample) per row. Rows 1..7 of each 8-row group use the
; register pairs m1:m0 / m2:m1; the second group uses m2:m1 / m0:m2.
; NOTE(review): original line numbers are not contiguous; the loads of m0-m2
; (and the reload of m0 before the second group), the set-up of r1/r3 and
; several stores are not visible in this excerpt. r3 is presumably 3 * stride
; - TODO confirm.
5017 cglobal intra_pred_ang16_2, 3,4,5
5028 palignr m3, m1, m0, 2
5029 palignr m4, m2, m1, 2
5031 movu [r0 + r1 + 16], m4
5032 palignr m3, m1, m0, 4
5033 palignr m4, m2, m1, 4
5034 movu [r0 + r1 * 2], m3
5035 movu [r0 + r1 * 2 + 16], m4
5036 palignr m3, m1, m0, 6
5037 palignr m4, m2, m1, 6
5039 movu [r0 + r3 + 16], m4
; Next four rows.
5041 lea r0, [r0 + r1 * 4]
5042 palignr m3, m1, m0, 8
5043 palignr m4, m2, m1, 8
5046 palignr m3, m1, m0, 10
5047 palignr m4, m2, m1, 10
5049 movu [r0 + r1 + 16], m4
5050 palignr m3, m1, m0, 12
5051 palignr m4, m2, m1, 12
5052 movu [r0 + r1 * 2], m3
5053 movu [r0 + r1 * 2 + 16], m4
5054 palignr m3, m1, m0, 14
5055 palignr m4, m2, m1, 14
5057 movu [r0 + r3 + 16], m4
; Second group of eight rows: the source window moves on by one register.
5060 lea r0, [r0 + r1 * 4]
5063 palignr m3, m2, m1, 2
5064 palignr m4, m0, m2, 2
5066 movu [r0 + r1 + 16], m4
5067 palignr m3, m2, m1, 4
5068 palignr m4, m0, m2, 4
5069 movu [r0 + r1 * 2], m3
5070 movu [r0 + r1 * 2 + 16], m4
5071 palignr m3, m2, m1, 6
5072 palignr m4, m0, m2, 6
5074 movu [r0 + r3 + 16], m4
; Last four rows.
5076 lea r0, [r0 + r1 * 4]
5077 palignr m3, m2, m1, 8
5078 palignr m4, m0, m2, 8
5081 palignr m3, m2, m1, 10
5082 palignr m4, m0, m2, 10
5084 movu [r0 + r1 + 16], m4
5085 palignr m3, m2, m1, 12
5086 palignr m4, m0, m2, 12
5087 movu [r0 + r1 * 2], m3
5088 movu [r0 + r1 * 2 + 16], m4
5089 palignr m3, m2, m1, 14
5090 palignr m4, m0, m2, 14
5092 movu [r0 + r3 + 16], m4
; TRANSPOSE_STORE %1-%4 = four result registers, %5 = scratch register,
; %6 = byte offset added to every store address.
; The visible path regroups %1..%4 with word/dword unpacks and writes 8-byte
; pieces (movh/movhps) to rows addressed from r0 and from r5 = r0 + 4 * r1.
; Clobbers %2, %3, %4, %5 and r5.
; NOTE(review): original line numbers are not contiguous; some unpacks/stores,
; any conditional directives and the %endmacro are not visible here. The final
; full-width movu store does not use %6 and looks like part of an alternative
; (non-transposing) path - TODO confirm against the complete macro.
; r4 is presumably 3 * stride, set by the caller - TODO confirm.
5096 %macro TRANSPOSE_STORE 6
5098 punpckhwd %5, %1, %2
5100 punpckhwd %2, %3, %4
5103 punpckldq %4, %1, %3
5105 punpckldq %3, %5, %2
5109 movhps [r0 + r1 + %6], %4
5110 movh [r0 + r1 * 2 + %6], %1
5111 movhps [r0 + r4 + %6], %1
5112 lea r5, [r0 + r1 * 4]
5114 movhps [r5 + r1 + %6], %3
5115 movh [r5 + r1 * 2 + %6], %5
5116 movhps [r5 + r4 + %6], %5
5122 movu [r5 + r1 * 2], %3
; ang16_mode_3_33: kernel shared by intra_pred_ang16_3 and intra_pred_ang16_33.
; Loads reference words from r2, interleaves neighbouring samples, multiplies
; the pairs by coefficient rows addressed relative to r3 (pmaddwd) and passes
; each group of results to TRANSPOSE_STORE with byte offsets 0, 8, 16 and 24.
; Both callers set r3 = ang_table + 16 * 16, so [r3 + k * 16] is table row
; 16 + k; the bracketed number after a pmaddwd is that row index.
; Sample-index comments list lanes from high to low.
5128 cglobal ang16_mode_3_33
5130 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
5131 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
5132 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
5133 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
5135 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
5136 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
5137 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
5138 punpckhwd m1, m4 ; [x 16 16 15 15 14 14 13]
5141 pmaddwd m4, [r3 + 10 * 16] ; [26]
5145 pmaddwd m2, [r3 + 10 * 16]
5150 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
5151 pmaddwd m2, [r3 + 4 * 16] ; [20]
5154 palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
5155 pmaddwd m6, [r3 + 4 * 16]
5160 palignr m6, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
5161 pmaddwd m6, [r3 - 2 * 16] ; [14]
5164 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
5165 pmaddwd m7, [r3 - 2 * 16]
5170 palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
5171 pmaddwd m7, [r3 - 8 * 16] ; [ 8]
5174 palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
5175 pmaddwd m3, [r3 - 8 * 16]
5182 TRANSPOSE_STORE m4, m2, m6, m7, m3, 0
5185 pmaddwd m4, [r3 - 14 * 16] ; [ 2]
5189 pmaddwd m2, [r3 - 14 * 16]
5195 pmaddwd m2, [r3 + 12 * 16] ; [28]
5199 pmaddwd m6, [r3 + 12 * 16]
5204 palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
5205 pmaddwd m6, [r3 + 6 * 16] ; [22]
5208 palignr m7, m1, m5, 4 ; [14 13 13 12 12 11 11 10]
5209 pmaddwd m7, [r3 + 6 * 16]
5214 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
5215 pmaddwd m7, [r3] ; [16]
5218 palignr m1, m5, 8 ; [15 14 14 13 13 12 12 11]
5224 lea r5, [r0 + r1 * 4]
5226 TRANSPOSE_STORE m4, m2, m6, m7, m3, 8
5228 movu m1, [r2 + 26] ; [20 19 18 17 16 15 14 13]
5229 psrldq m4, m1, 2 ; [x 20 19 18 17 16 15 14]
5231 punpcklwd m3, m1, m4 ; [17 16 16 15 15 14 14 13]
5232 punpckhwd m1, m4 ; [x 20 20 19 19 18 18 17]
5234 palignr m4, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
5235 pmaddwd m4, [r3 - 6 * 16] ; [10]
5238 palignr m2, m3, m5, 12 ; [16 15 15 14 14 13 13 12]
5239 pmaddwd m2, [r3 - 6 * 16]
5245 pmaddwd m2, [r3 - 12 * 16] ; [4]
5249 pmaddwd m6, [r3 - 12 * 16]
5255 pmaddwd m6, [r3 + 14 * 16] ; [30]
5259 pmaddwd m7, [r3 + 14 * 16]
5264 palignr m7, m3, m5, 4 ; [14 13 13 12 12 11 11 10]
5265 pmaddwd m7, [r3 + 8 * 16] ; [24]
5268 palignr m0, m1, m3, 4 ; [18 17 17 16 16 15 15 14]
5269 pmaddwd m0, [r3 + 8 * 16]
5274 lea r5, [r5 + r1 * 4]
5276 TRANSPOSE_STORE m4, m2, m6, m7, m0, 16
5278 palignr m4, m3, m5, 8 ; [15 14 14 13 13 12 12 11]
5279 pmaddwd m4, [r3 + 2 * 16] ; [18]
5282 palignr m2, m1, m3, 8 ; [19 18 18 17 17 16 16 15]
5283 pmaddwd m2, [r3 + 2 * 16]
5288 palignr m2, m3, m5, 12 ; [16 15 15 14 14 13 13 12]
5289 pmaddwd m2, [r3 - 4 * 16] ; [12]
5292 palignr m6, m1, m3, 12 ; [20 19 19 18 18 17 17 16]
5293 pmaddwd m6, [r3 - 4 * 16]
5298 pinsrw m1, [r2 + 42], 7 ; replace the top word lane of m1 with the word at r2 + 42
5299 pmaddwd m3, [r3 - 10 * 16] ; [6]
5302 pmaddwd m1, [r3 - 10 * 16]
5309 lea r5, [r5 + r1 * 4]
5311 TRANSPOSE_STORE m4, m2, m3, m7, m0, 24
; intra_pred_ang16_3: entry point that sets r3 = ang_table + 16 * 16 and runs
; the shared ang16_mode_3_33 kernel twice, moving r0 forward by 8 * r1
; between the two calls.
; NOTE(review): "3,7,8" presumably follows the x86inc cglobal order
; (arguments, GPRs, XMM registers) -- confirm.
5315 cglobal intra_pred_ang16_3, 3,7,8
5317 lea r3, [ang_table + 16 * 16]
5321 call ang16_mode_3_33
5324 lea r0, [r0 + r1 * 8]
5326 call ang16_mode_3_33
; intra_pred_ang16_33: entry point that sets r3 = ang_table + 16 * 16 and
; runs the shared ang16_mode_3_33 kernel twice.
5330 cglobal intra_pred_ang16_33, 4,7,8
5334 lea r3, [ang_table + 16 * 16]
5338 call ang16_mode_3_33
5343 call ang16_mode_3_33
; ang16_mode_4_32: kernel shared by intra_pred_ang16_4 and intra_pred_ang16_32.
; Loads reference words from r2, interleaves neighbouring samples, multiplies
; the pairs by coefficient rows addressed relative to r3 (pmaddwd) and passes
; each group of results to TRANSPOSE_STORE with byte offsets 0, 8, 16 and 24.
; Both callers set r3 = ang_table + 18 * 16, so [r3 + k * 16] is table row
; 18 + k; the bracketed number after a pmaddwd is that row index.
; Sample-index comments list lanes from high to low.
5347 cglobal ang16_mode_4_32
5349 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
5350 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
5351 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
5352 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
5354 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
5355 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
5356 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
5359 pmaddwd m4, [r3 + 3 * 16] ; [21]
5363 pmaddwd m2, [r3 + 3 * 16]
5368 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
5370 pmaddwd m2, [r3 - 8 * 16] ; [10]
5373 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
5375 pmaddwd m1, [r3 - 8 * 16]
5380 pmaddwd m6, [r3 + 13 * 16] ; [31]
5383 pmaddwd m7, [r3 + 13 * 16]
5388 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
5389 pmaddwd m7, [r3 + 2 * 16] ; [20]
5392 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
5393 pmaddwd m1, [r3 + 2 * 16]
5400 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
5402 palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
5404 pmaddwd m4, [r3 - 9 * 16] ; [9]
5407 palignr m7, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
5409 pmaddwd m7, [r3 - 9 * 16]
5414 pmaddwd m2, [r3 + 12 * 16] ; [30]
5417 pmaddwd m6, [r3 + 12 * 16]
5423 pmaddwd m6, [r3 + 1 * 16] ; [19]
5427 pmaddwd m7, [r3 + 1 * 16]
5432 movu m1, [r2 + 26] ; [20 19 18 17 16 15 14 13]
5434 palignr m7, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
5435 pmaddwd m7, [r3 - 10 * 16] ; [8]
5438 palignr m3, m1, m5, 4 ; [14 13 13 12 12 11 11 10]
5439 pmaddwd m3, [r3 - 10 * 16]
5444 lea r5, [r0 + r1 * 4]
5446 TRANSPOSE_STORE m4, m2, m6, m7, m3, 8
5448 psrldq m4, m1, 2 ; [x 20 19 18 17 16 15 14]
5450 punpcklwd m3, m1, m4 ; [17 16 16 15 15 14 14 13]
5451 punpckhwd m1, m4 ; [x 20 20 19 19 18 18 17]
5453 palignr m4, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
5454 pmaddwd m4, [r3 + 11 * 16] ; [29]
5457 palignr m2, m3, m5, 4 ; [14 13 13 12 12 11 11 10]
5458 pmaddwd m2, [r3 + 11 * 16]
5463 palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
5464 pmaddwd m2, [r3] ; [18]
5467 palignr m6, m3, m5, 8 ; [15 14 14 13 13 12 12 11]
5473 palignr m6, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
5475 pmaddwd m6, [r3 - 11 * 16] ; [7]
5478 palignr m0, m3, m5, 12 ; [16 15 15 14 14 13 13 12]
5479 pmaddwd m0, [r3 - 11 * 16]
5484 pmaddwd m7, [r3 + 10 * 16] ; [28]
5487 palignr m0, m3, m5, 12 ; [16 15 15 14 14 13 13 12]
5488 pmaddwd m0, [r3 + 10 * 16]
5493 lea r5, [r5 + r1 * 4]
5495 TRANSPOSE_STORE m4, m2, m6, m7, m0, 16
5498 pmaddwd m4, [r3 - 1 * 16] ; [17]
5502 pmaddwd m2, [r3 - 1 * 16]
5507 palignr m2, m3, m5, 4 ; [14 13 13 12 12 11 11 10]
5509 pmaddwd m2, [r3 - 12 * 16] ; [6]
5512 palignr m6, m1, m3, 4 ; [18 17 17 16 16 15 15 14]
5514 pmaddwd m6, [r3 - 12 * 16]
5519 pmaddwd m7, [r3 + 9 * 16] ; [27]
5522 pmaddwd m0, [r3 + 9 * 16]
5527 palignr m0, m3, m5, 8 ; [15 14 14 13 13 12 12 11]
5528 pmaddwd m0, [r3 - 2 * 16] ; [16]
5531 palignr m1, m3, 8 ; [19 18 18 17 17 16 16 15]
5532 pmaddwd m1, [r3 - 2 * 16]
5537 lea r5, [r5 + r1 * 4]
5539 TRANSPOSE_STORE m4, m2, m7, m0, m3, 24
; intra_pred_ang16_4: entry point that sets r3 = ang_table + 18 * 16 and runs
; the shared ang16_mode_4_32 kernel twice, moving r0 forward by 8 * r1
; between the two calls.
5543 cglobal intra_pred_ang16_4, 3,7,8
5545 lea r3, [ang_table + 18 * 16]
5549 call ang16_mode_4_32
5552 lea r0, [r0 + r1 * 8]
5554 call ang16_mode_4_32
; intra_pred_ang16_32: entry point that sets r3 = ang_table + 18 * 16 and
; runs the shared ang16_mode_4_32 kernel twice.
5558 cglobal intra_pred_ang16_32, 4,7,8
5562 lea r3, [ang_table + 18 * 16]
5566 call ang16_mode_4_32
5571 call ang16_mode_4_32
; ang16_mode_5_31: kernel shared by intra_pred_ang16_5 and intra_pred_ang16_31.
; Loads reference words from r2, interleaves neighbouring samples, multiplies
; the pairs by coefficient rows addressed relative to r3 (pmaddwd) and passes
; each group of results to TRANSPOSE_STORE with byte offsets 0, 8, 16 and 24.
; Both callers set r3 = ang_table + 16 * 16, so [r3 + k * 16] is table row
; 16 + k; the bracketed number after a pmaddwd is that row index.
; Sample-index comments list lanes from high to low.
5575 cglobal ang16_mode_5_31
5577 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
5578 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
5579 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
5580 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
5582 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
5583 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
5584 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
5587 pmaddwd m4, [r3 + 1 * 16] ; [17]
5591 pmaddwd m2, [r3 + 1 * 16]
5596 palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
5598 pmaddwd m2, [r3 - 14 * 16] ; [2]
5601 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
5603 pmaddwd m1, [r3 - 14 * 16]
5608 pmaddwd m6, [r3 + 3 * 16] ; [19]
5611 pmaddwd m7, [r3 + 3 * 16]
5616 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
5617 pmaddwd m7, [r3 - 12 * 16] ; [4]
5620 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
5621 pmaddwd m1, [r3 - 12 * 16]
5628 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
5630 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
5631 pmaddwd m4, [r3 + 5 * 16] ; [21]
5634 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
5635 pmaddwd m7, [r3 + 5 * 16]
5640 palignr m2, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
5642 pmaddwd m2, [r3 - 10 * 16] ; [6]
5645 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
5647 pmaddwd m1, [r3 - 10 * 16]
5652 pmaddwd m6, [r3 + 7 * 16] ; [23]
5655 pmaddwd m7, [r3 + 7 * 16]
5661 pmaddwd m7, [r3 - 8 * 16] ; [8]
5665 pmaddwd m3, [r3 - 8 * 16]
5670 lea r5, [r0 + r1 * 4]
5672 TRANSPOSE_STORE m4, m2, m6, m7, m3, 8
5674 movu m1, [r2 + 26] ; [20 19 18 17 16 15 14 13]
5675 psrldq m4, m1, 2 ; [x 20 19 18 17 16 15 14]
5677 punpcklwd m3, m1, m4 ; [17 16 16 15 15 14 14 13]
5680 pmaddwd m4, [r3 + 9 * 16] ; [25]
5684 pmaddwd m2, [r3 + 9 * 16]
5689 palignr m2, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
5691 pmaddwd m2, [r3 - 6 * 16] ; [10]
5694 palignr m7, m3, m5, 4 ; [14 13 13 12 12 11 11 10]
5696 pmaddwd m7, [r3 - 6 * 16]
5701 pmaddwd m6, [r3 + 11 * 16] ; [27]
5704 pmaddwd m1, [r3 + 11 * 16]
5709 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
5710 pmaddwd m7, [r3 - 4 * 16] ; [12]
5713 palignr m1, m3, m5, 8 ; [15 14 14 13 13 12 12 11]
5714 pmaddwd m1, [r3 - 4 * 16]
5719 lea r5, [r5 + r1 * 4]
5721 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
5723 palignr m4, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
5724 pmaddwd m4, [r3 + 13 * 16] ; [29]
5727 palignr m2, m3, m5, 8 ; [15 14 14 13 13 12 12 11]
5728 pmaddwd m2, [r3 + 13 * 16]
5733 palignr m2, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
5735 pmaddwd m2, [r3 - 2 * 16] ; [14]
5738 palignr m6, m3, m5, 12 ; [16 15 15 14 14 13 13 12]
5740 pmaddwd m6, [r3 - 2 * 16]
5745 pmaddwd m7, [r3 + 15 * 16] ; [31]
5748 pmaddwd m0, [r3 + 15 * 16]
5753 pmaddwd m5, [r3] ; [16]
5761 lea r5, [r5 + r1 * 4]
5763 TRANSPOSE_STORE m4, m2, m7, m5, m3, 24
; intra_pred_ang16_5: entry point that sets r3 = ang_table + 16 * 16 and runs
; the shared ang16_mode_5_31 kernel twice, moving r0 forward by 8 * r1
; between the two calls.
5767 cglobal intra_pred_ang16_5, 3,7,8
5769 lea r3, [ang_table + 16 * 16]
5773 call ang16_mode_5_31
5776 lea r0, [r0 + r1 * 8]
5778 call ang16_mode_5_31
; intra_pred_ang16_31: entry point that sets r3 = ang_table + 16 * 16 and
; runs the shared ang16_mode_5_31 kernel twice.
5782 cglobal intra_pred_ang16_31, 4,7,8
5786 lea r3, [ang_table + 16 * 16]
5790 call ang16_mode_5_31
5795 call ang16_mode_5_31
; ang16_mode_6_30: kernel shared by intra_pred_ang16_6 and intra_pred_ang16_30.
; Loads reference words from r2, interleaves neighbouring samples, multiplies
; the pairs by coefficient rows addressed relative to r3 (pmaddwd) and passes
; each group of results to TRANSPOSE_STORE with byte offsets 0, 8, 16 and 24.
; Both callers set r3 = ang_table + 15 * 16, so [r3 + k * 16] is table row
; 15 + k; the bracketed number after a pmaddwd is that row index.
; Sample-index comments list lanes from high to low.
5799 cglobal ang16_mode_6_30
5801 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
5802 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
5803 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
5804 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
5806 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
5807 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
5808 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
5811 pmaddwd m4, [r3 - 2 * 16] ; [13]
5815 pmaddwd m2, [r3 - 2 * 16]
5821 pmaddwd m2, [r3 + 11 * 16] ; [26]
5825 pmaddwd m1, [r3 + 11 * 16]
5830 palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
5832 pmaddwd m6, [r3 - 8 * 16] ; [7]
5835 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
5836 pmaddwd m1, [r3 - 8 * 16]
5841 pmaddwd m7, [r3 + 5 * 16] ; [20]
5844 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
5845 pmaddwd m1, [r3 + 5 * 16]
5852 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
5854 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
5856 pmaddwd m4, [r3 - 14 * 16] ; [1]
5859 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
5861 pmaddwd m1, [r3 - 14 * 16]
5867 pmaddwd m2, [r3 - 1 * 16] ; [14]
5871 pmaddwd m1, [r3 - 1 * 16]
5876 pmaddwd m6, [r3 + 12 * 16] ; [27]
5879 pmaddwd m7, [r3 + 12 * 16]
5884 palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
5885 pmaddwd m7, [r3 - 7 * 16] ; [8]
5888 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
5889 pmaddwd m1, [r3 - 7 * 16]
5894 lea r5, [r0 + r1 * 4]
5896 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
5898 palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
5899 pmaddwd m4, [r3 + 6 * 16] ; [21]
5902 palignr m2, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
5903 pmaddwd m2, [r3 + 6 * 16]
5909 pmaddwd m2, [r3 - 13 * 16] ; [2]
5913 pmaddwd m7, [r3 - 13 * 16]
5919 pmaddwd m6, [r3] ; [15]
5929 pmaddwd m7, [r3 + 13 * 16] ; [28]
5933 pmaddwd m1, [r3 + 13 * 16]
5938 lea r5, [r5 + r1 * 4]
5940 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
5942 movh m3, [r2 + 26] ; [16 15 14 13]
5944 palignr m4, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
5946 pmaddwd m4, [r3 - 6 * 16] ; [9]
5949 palignr m1, m3, m5, 4 ; [14 13 13 12 12 11 11 10]
5951 pmaddwd m1, [r3 - 6 * 16]
5956 pmaddwd m2, [r3 + 7 * 16] ; [22]
5960 pmaddwd m1, [r3 + 7 * 16]
5966 palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
5968 pmaddwd m7, [r3 - 12 * 16] ; [3]
5971 palignr m3, m6, 4 ; [15 14 14 13 13 12 12 11]
5973 pmaddwd m3, [r3 - 12 * 16]
5978 pmaddwd m5, [r3 + 1 * 16] ; [16]
5981 pmaddwd m1, [r3 + 1 * 16]
5986 lea r5, [r5 + r1 * 4]
5988 TRANSPOSE_STORE m4, m2, m7, m5, m3, 24
; intra_pred_ang16_6: entry point that sets r3 = ang_table + 15 * 16 and runs
; the shared ang16_mode_6_30 kernel twice, moving r0 forward by 8 * r1
; between the two calls.
5992 cglobal intra_pred_ang16_6, 3,7,8
5994 lea r3, [ang_table + 15 * 16]
5998 call ang16_mode_6_30
6001 lea r0, [r0 + r1 * 8]
6003 call ang16_mode_6_30
; intra_pred_ang16_30: entry point that sets r3 = ang_table + 15 * 16 and
; runs the shared ang16_mode_6_30 kernel twice.
6007 cglobal intra_pred_ang16_30, 4,7,8
6011 lea r3, [ang_table + 15 * 16]
6015 call ang16_mode_6_30
6020 call ang16_mode_6_30
; ang16_mode_7_29: kernel shared by intra_pred_ang16_7 and intra_pred_ang16_29.
; Loads reference words from r2, interleaves neighbouring samples, multiplies
; the pairs by coefficient rows addressed relative to r3 (pmaddwd) and passes
; each group of results to TRANSPOSE_STORE with byte offsets 0, 8, 16 and 24.
; Both callers set r3 = ang_table + 17 * 16, so [r3 + k * 16] is table row
; 17 + k; the bracketed number after a pmaddwd is that row index.
; Sample-index comments list lanes from high to low.
6024 cglobal ang16_mode_7_29
6026 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
6027 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
6028 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
6029 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
6031 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
6032 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
6033 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
6036 pmaddwd m4, [r3 - 8 * 16] ; [9]
6040 pmaddwd m2, [r3 - 8 * 16]
6046 pmaddwd m2, [r3 + 1 * 16] ; [18]
6050 pmaddwd m1, [r3 + 1 * 16]
6056 pmaddwd m6, [r3 + 10 * 16] ; [27]
6060 pmaddwd m1, [r3 + 10 * 16]
6065 palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
6066 pmaddwd m7, [r3 - 13 * 16] ; [4]
6069 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6070 pmaddwd m1, [r3 - 13 * 16]
6077 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
6079 palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
6081 pmaddwd m4, [r3 - 4 * 16] ; [13]
6084 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6086 pmaddwd m1, [r3 - 4 * 16]
6092 pmaddwd m2, [r3 + 5 * 16] ; [22]
6096 pmaddwd m1, [r3 + 5 * 16]
6101 pmaddwd m6, [r3 + 14 * 16] ; [31]
6104 pmaddwd m7, [r3 + 14 * 16]
6109 palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
6110 pmaddwd m7, [r3 - 9 * 16] ; [8]
6113 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
6114 pmaddwd m1, [r3 - 9 * 16]
6119 lea r5, [r0 + r1 * 4]
6121 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
6123 palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
6125 pmaddwd m4, [r3] ; [17]
6128 palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
6135 pmaddwd m2, [r3 + 9 * 16] ; [26]
6138 pmaddwd m7, [r3 + 9 * 16]
6143 palignr m6, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
6144 pmaddwd m6, [r3 - 14 * 16] ; [3]
6147 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
6148 pmaddwd m1, [r3 - 14 * 16]
6153 palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
6154 pmaddwd m7, [r3 - 5 * 16] ; [12]
6157 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
6158 pmaddwd m1, [r3 - 5 * 16]
6163 lea r5, [r5 + r1 * 4]
6165 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
6167 palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
6169 pmaddwd m4, [r3 + 4 * 16] ; [21]
6172 palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
6174 pmaddwd m1, [r3 + 4 * 16]
6179 pmaddwd m2, [r3 + 13 * 16] ; [30]
6182 pmaddwd m3, [r3 + 13 * 16]
6188 pmaddwd m7, [r3 - 10 * 16] ; [7]
6192 pmaddwd m3, [r3 - 10 * 16]
6197 pmaddwd m0, [r3 - 1 * 16] ; [16]
6200 pmaddwd m5, [r3 - 1 * 16]
6205 lea r5, [r5 + r1 * 4]
6207 TRANSPOSE_STORE m4, m2, m7, m0, m3, 24
; intra_pred_ang16_7: entry point that sets r3 = ang_table + 17 * 16 and runs
; the shared ang16_mode_7_29 kernel twice, moving r0 forward by 8 * r1
; between the two calls.
6211 cglobal intra_pred_ang16_7, 3,7,8
6213 lea r3, [ang_table + 17 * 16]
6217 call ang16_mode_7_29
6220 lea r0, [r0 + r1 * 8]
6222 call ang16_mode_7_29
; intra_pred_ang16_29: entry point that sets r3 = ang_table + 17 * 16 and
; runs the shared ang16_mode_7_29 kernel twice.
6226 cglobal intra_pred_ang16_29, 4,7,8
6230 lea r3, [ang_table + 17 * 16]
6234 call ang16_mode_7_29
6239 call ang16_mode_7_29
; ang16_mode_8_28: kernel shared by intra_pred_ang16_8 and intra_pred_ang16_28.
; Loads reference words from r2, interleaves neighbouring samples, multiplies
; the pairs by coefficient rows addressed relative to r3 (pmaddwd) and passes
; each group of results to TRANSPOSE_STORE with byte offsets 0, 8, 16 and 24.
; Both callers set r3 = ang_table + 15 * 16, so [r3 + k * 16] is table row
; 15 + k; the bracketed number after a pmaddwd is that row index.
; Sample-index comments list lanes from high to low.
6243 cglobal ang16_mode_8_28
6245 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
6246 movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
6247 palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
6248 psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
6250 punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
6251 punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
6252 punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
6255 pmaddwd m4, [r3 - 10 * 16] ; [5]
6259 pmaddwd m2, [r3 - 10 * 16]
6265 pmaddwd m2, [r3 - 5 * 16] ; [10]
6269 pmaddwd m1, [r3 - 5 * 16]
6275 pmaddwd m6, [r3] ; [15]
6285 pmaddwd m7, [r3 + 5 * 16] ; [20]
6289 pmaddwd m1, [r3 + 5 * 16]
6296 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
6299 pmaddwd m4, [r3 + 10 * 16] ; [25]
6303 pmaddwd m1, [r3 + 10 * 16]
6309 pmaddwd m2, [r3 + 15 * 16] ; [30]
6313 pmaddwd m1, [r3 + 15 * 16]
6318 palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
6319 pmaddwd m6, [r3 - 12 * 16] ; [3]
6322 palignr m7, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6323 pmaddwd m7, [r3 - 12 * 16]
6328 palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
6329 pmaddwd m7, [r3 - 7 * 16] ; [8]
6332 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6333 pmaddwd m1, [r3 - 7 * 16]
6338 lea r5, [r0 + r1 * 4]
6340 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
6342 palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
6344 pmaddwd m4, [r3 - 2 *16] ; [13]
6347 palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6349 pmaddwd m6, [r3 - 2 * 16]
6355 pmaddwd m2, [r3 + 3 * 16] ; [18]
6359 pmaddwd m6, [r3 + 3 * 16]
6365 pmaddwd m6, [r3 + 8 * 16] ; [23]
6368 pmaddwd m1, [r3 + 8 * 16]
6373 pmaddwd m7, [r3 + 13 * 16] ; [28]
6376 palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
6377 pmaddwd m1, [r3 + 13 * 16]
6382 lea r5, [r5 + r1 * 4]
6384 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
6386 palignr m1, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
6388 pmaddwd m4, [r3 - 14 * 16] ; [1]
6391 palignr m5, m0, 8 ; [11 10 10 9 9 8 8 7]
6393 pmaddwd m0, [r3 - 14 * 16]
6399 pmaddwd m2, [r3 - 9 * 16] ; [6]
6403 pmaddwd m3, [r3 - 9 * 16]
6409 pmaddwd m7, [r3 - 4 * 16] ; [11]
6413 pmaddwd m3, [r3 - 4 * 16]
6418 pmaddwd m1, [r3 + 1 * 16] ; [16]
6421 pmaddwd m5, [r3 + 1 * 16]
6426 lea r5, [r5 + r1 * 4]
6428 TRANSPOSE_STORE m4, m2, m7, m1, m3, 24
; intra_pred_ang16_8: entry point that sets r3 = ang_table + 15 * 16 and runs
; the shared ang16_mode_8_28 kernel twice, moving r0 forward by 8 * r1
; between the two calls.
6432 cglobal intra_pred_ang16_8, 3,7,8
6434 lea r3, [ang_table + 15 * 16]
6438 call ang16_mode_8_28
6441 lea r0, [r0 + r1 * 8]
6443 call ang16_mode_8_28
; intra_pred_ang16_28: entry point that sets r3 = ang_table + 15 * 16 and
; runs the shared ang16_mode_8_28 kernel twice.
6447 cglobal intra_pred_ang16_28, 4,7,8
6451 lea r3, [ang_table + 15 * 16]
6455 call ang16_mode_8_28
6460 call ang16_mode_8_28
; ang16_mode_9_27: kernel shared by intra_pred_ang16_9 and intra_pred_ang16_27.
; Loads two overlapping vectors from r2 + 2 and r2 + 4, interleaves them into
; neighbouring-sample pairs, multiplies by coefficient rows addressed relative
; to r3 (pmaddwd) and passes each group of results to TRANSPOSE_STORE with
; byte offsets 0, 8, 16 and 24.
; Both callers set r3 = ang_table + 16 * 16, so [r3 + k * 16] is table row
; 16 + k; the bracketed number after a pmaddwd is that row index.
6464 cglobal ang16_mode_9_27
6466 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
6467 movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2]
6469 punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1]
6470 punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5]
6473 pmaddwd m4, [r3 - 14 * 16] ; [2]
6477 pmaddwd m2, [r3 - 14 * 16]
6483 pmaddwd m2, [r3 - 12 * 16] ; [4]
6487 pmaddwd m1, [r3 - 12 * 16]
6493 pmaddwd m6, [r3 - 10 *16] ; [6]
6497 pmaddwd m1, [r3 - 10 * 16]
6503 pmaddwd m7, [r3 - 8 * 16] ; [8]
6507 pmaddwd m1, [r3 - 8 * 16]
6514 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
6517 pmaddwd m4, [r3 - 6 * 16] ; [10]
6521 pmaddwd m1, [r3 - 6 * 16]
6527 pmaddwd m2, [r3 - 4 * 16] ; [12]
6531 pmaddwd m1, [r3 - 4 * 16]
6537 pmaddwd m6, [r3 - 2 * 16] ; [14]
6541 pmaddwd m7, [r3 - 2 * 16]
6547 pmaddwd m7, [r3] ; [16]
6556 lea r5, [r0 + r1 * 4]
6558 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
6561 pmaddwd m4, [r3 + 2 *16] ; [18]
6565 pmaddwd m6, [r3 + 2 * 16]
6571 pmaddwd m2, [r3 + 4 * 16] ; [20]
6575 pmaddwd m6, [r3 + 4 * 16]
6581 pmaddwd m6, [r3 + 6 * 16] ; [22]
6585 pmaddwd m1, [r3 + 6 * 16]
6591 pmaddwd m7, [r3 + 8 * 16] ; [24]
6595 pmaddwd m1, [r3 + 8 * 16]
6600 lea r5, [r5 + r1 * 4]
6602 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
6605 pmaddwd m4, [r3 + 10 * 16] ; [26]
6609 pmaddwd m1, [r3 + 10 * 16]
6615 pmaddwd m2, [r3 + 12 * 16] ; [28]
6619 pmaddwd m1, [r3 + 12 * 16]
6624 pmaddwd m3, [r3 + 14 * 16] ; [30]
6627 pmaddwd m0, [r3 + 14 * 16]
6634 lea r5, [r5 + r1 * 4]
6636 TRANSPOSE_STORE m4, m2, m3, m7, m1, 24
; intra_pred_ang16_9: entry point that sets r3 = ang_table + 16 * 16 and runs
; the shared ang16_mode_9_27 kernel twice, moving r0 forward by 8 * r1
; between the two calls.
6640 cglobal intra_pred_ang16_9, 3,7,8
6642 lea r3, [ang_table + 16 * 16]
6646 call ang16_mode_9_27
6649 lea r0, [r0 + r1 * 8]
6651 call ang16_mode_9_27
; intra_pred_ang16_27: entry point that sets r3 = ang_table + 16 * 16 and
; runs the shared ang16_mode_9_27 kernel twice.
6655 cglobal intra_pred_ang16_27, 4,7,8
6659 lea r3, [ang_table + 16 * 16]
6663 call ang16_mode_9_27
6668 call ang16_mode_9_27
; ang16_mode_11_25: kernel shared by intra_pred_ang16_11 and
; intra_pred_ang16_25. Loads two overlapping vectors from r2 and r2 + 2,
; interleaves them into neighbouring-sample pairs, multiplies by coefficient
; rows addressed relative to r3 (pmaddwd) and passes each group of results to
; TRANSPOSE_STORE with byte offsets 0, 8, 16 and 24.
; Both callers set r3 = ang_table + 16 * 16, so [r3 + k * 16] is table row
; 16 + k; the bracketed number after a pmaddwd is that row index.
6672 cglobal ang16_mode_11_25
6674 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
6675 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
6677 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
6678 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
6681 pmaddwd m4, [r3 + 14 * 16] ; [30]
6685 pmaddwd m2, [r3 + 14 * 16]
6691 pmaddwd m2, [r3 + 12 * 16] ; [28]
6695 pmaddwd m1, [r3 + 12 * 16]
6701 pmaddwd m6, [r3 + 10 *16] ; [26]
6705 pmaddwd m1, [r3 + 10 * 16]
6711 pmaddwd m7, [r3 + 8 * 16] ; [24]
6715 pmaddwd m1, [r3 + 8 * 16]
6722 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
6725 pmaddwd m4, [r3 + 6 * 16] ; [22]
6729 pmaddwd m1, [r3 + 6 * 16]
6735 pmaddwd m2, [r3 + 4 * 16] ; [20]
6739 pmaddwd m1, [r3 + 4 * 16]
6745 pmaddwd m6, [r3 + 2 * 16] ; [18]
6749 pmaddwd m7, [r3 + 2 * 16]
6755 pmaddwd m7, [r3] ; [16]
6764 lea r5, [r0 + r1 * 4]
6766 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
6769 pmaddwd m4, [r3 - 2 *16] ; [14]
6773 pmaddwd m6, [r3 - 2 * 16]
6779 pmaddwd m2, [r3 - 4 * 16] ; [12]
6783 pmaddwd m6, [r3 - 4 * 16]
6789 pmaddwd m6, [r3 - 6 * 16] ; [10]
6793 pmaddwd m1, [r3 - 6 * 16]
6799 pmaddwd m7, [r3 - 8 * 16] ; [8]
6803 pmaddwd m1, [r3 - 8 * 16]
6808 lea r5, [r5 + r1 * 4]
6810 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
6813 pmaddwd m4, [r3 - 10 * 16] ; [6]
6817 pmaddwd m1, [r3 - 10 * 16]
6823 pmaddwd m2, [r3 - 12 * 16] ; [4]
6827 pmaddwd m1, [r3 - 12 * 16]
6833 pmaddwd m7, [r3 - 14 * 16] ; [2]
6837 pmaddwd m1, [r3 - 14 * 16]
6844 lea r5, [r5 + r1 * 4]
6846 TRANSPOSE_STORE m4, m2, m7, m3, m1, 24
; intra_pred_ang16_11: entry point that sets r3 = ang_table + 16 * 16 and
; runs the shared ang16_mode_11_25 kernel twice, moving r0 forward by 8 * r1
; between the two calls.
6850 cglobal intra_pred_ang16_11, 3,7,8
6852 lea r3, [ang_table + 16 * 16]
6856 call ang16_mode_11_25
6859 lea r0, [r0 + r1 * 8]
6861 call ang16_mode_11_25
; intra_pred_ang16_25: entry point that sets r3 = ang_table + 16 * 16 and
; runs the shared ang16_mode_11_25 kernel twice.
6865 cglobal intra_pred_ang16_25, 4,7,8
6869 lea r3, [ang_table + 16 * 16]
6873 call ang16_mode_11_25
6878 call ang16_mode_11_25
; ang16_mode_12_24: kernel shared by intra_pred_ang16_12 and
; intra_pred_ang16_24. Loads two overlapping vectors from r2 and r2 + 2,
; interleaves them into neighbouring-sample pairs, multiplies by coefficient
; rows addressed relative to r6 (pmaddwd) and passes each group of results to
; TRANSPOSE_STORE with byte offsets 0, 8, 16 and 24.
; Both callers set r6 = ang_table + 16 * 16, so [r6 + k * 16] is table row
; 16 + k; the bracketed number after a pmaddwd is that row index.
6882 cglobal ang16_mode_12_24
6884 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
6885 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
6887 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
6888 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
6891 pmaddwd m4, [r6 + 11 * 16] ; [27]
6895 pmaddwd m2, [r6 + 11 * 16]
6901 pmaddwd m2, [r6 + 6 * 16] ; [22]
6905 pmaddwd m1, [r6 + 6 * 16]
6911 pmaddwd m6, [r6 + 1 *16] ; [17]
6915 pmaddwd m1, [r6 + 1 * 16]
6921 pmaddwd m7, [r6 - 4 * 16] ; [12]
6925 pmaddwd m1, [r6 - 4 * 16]
6932 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
6935 pmaddwd m4, [r6 - 9 * 16] ; [7]
6939 pmaddwd m1, [r6 - 9 * 16]
6945 pmaddwd m2, [r6 - 14 * 16] ; [2]
6949 pmaddwd m1, [r6 - 14 * 16]
6958 pmaddwd m6, [r6 + 13 * 16] ; [29]
6962 pmaddwd m7, [r6 + 13 * 16]
6968 pmaddwd m7, [r6 + 8 * 16] ; [24]
6972 pmaddwd m1, [r6 + 8 * 16]
6977 lea r5, [r0 + r1 * 4]
6979 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
6982 pmaddwd m4, [r6 + 3 *16] ; [19]
6986 pmaddwd m6, [r6 + 3 * 16]
6992 pmaddwd m2, [r6 - 2 * 16] ; [14]
6996 pmaddwd m6, [r6 - 2 * 16]
7002 pmaddwd m6, [r6 - 7 * 16] ; [9]
7006 pmaddwd m1, [r6 - 7 * 16]
7012 pmaddwd m7, [r6 - 12 * 16] ; [4]
7016 pmaddwd m1, [r6 - 12 * 16]
7021 lea r5, [r5 + r1 * 4]
7023 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
7030 pmaddwd m4, [r6 + 15 * 16] ; [31]
7034 pmaddwd m1, [r6 + 15 * 16]
7040 pmaddwd m2, [r6 + 10 * 16] ; [26]
7044 pmaddwd m1, [r6 + 10 * 16]
7050 pmaddwd m7, [r6 + 5 * 16] ; [21]
7054 pmaddwd m1, [r6 + 5 * 16]
7059 pmaddwd m3, [r6] ; [16]
7067 lea r5, [r5 + r1 * 4]
7069 TRANSPOSE_STORE m4, m2, m7, m3, m1, 24
; intra_pred_ang16_12: entry point that sets r6 = ang_table + 16 * 16,
; rearranges m5 with the pw_ang8_12 shuffle mask, inserts the word at r3 + 26
; into word lane 5 of m5, and runs the shared ang16_mode_12_24 kernel twice,
; moving r0 forward by 8 * r1 between the two calls.
; NOTE(review): m5 is presumably consumed inside the kernel -- confirm.
7073 cglobal intra_pred_ang16_12, 4,7,8
7076 lea r6, [ang_table + 16 * 16]
7078 pshufb m5, [pw_ang8_12]
7079 pinsrw m5, [r3 + 26], 5
7082 call ang16_mode_12_24
7084 lea r0, [r0 + r1 * 8]
7088 call ang16_mode_12_24
; intra_pred_ang16_24: entry point that sets r6 = ang_table + 16 * 16,
; rearranges m5 with the pw_ang8_12 shuffle mask, inserts the word at r3 + 26
; into word lane 5 of m5, and runs the shared ang16_mode_12_24 kernel twice.
; NOTE(review): m5 is presumably consumed inside the kernel -- confirm.
7092 cglobal intra_pred_ang16_24, 4,7,8
7096 lea r6, [ang_table + 16 * 16]
7098 pshufb m5, [pw_ang8_12]
7099 pinsrw m5, [r3 + 26], 5
7103 call ang16_mode_12_24
7109 call ang16_mode_12_24
; ang16_mode_13_23: kernel shared by intra_pred_ang16_13 and
; intra_pred_ang16_23. Loads two overlapping vectors from r2 and r2 + 2,
; interleaves them into neighbouring-sample pairs, multiplies by coefficient
; rows addressed relative to r6 (pmaddwd) and passes each group of results to
; TRANSPOSE_STORE with byte offsets 0, 8, 16 and 24.
; Both callers set r6 = ang_table + 15 * 16, so [r6 + k * 16] is table row
; 15 + k; the bracketed number after a pmaddwd is that row index.
7113 cglobal ang16_mode_13_23
7115 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
7116 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
7118 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
7119 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
7122 pmaddwd m4, [r6 + 8 * 16] ; [23]
7126 pmaddwd m2, [r6 + 8 * 16]
7132 pmaddwd m2, [r6 - 1 * 16] ; [14]
7136 pmaddwd m1, [r6 - 1 * 16]
7142 pmaddwd m6, [r6 - 10 *16] ; [5]
7146 pmaddwd m1, [r6 - 10 * 16]
7155 pmaddwd m7, [r6 + 13 * 16] ; [28]
7159 pmaddwd m1, [r6 + 13 * 16]
7166 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
7169 pmaddwd m4, [r6 + 4 * 16] ; [19]
7173 pmaddwd m1, [r6 + 4 * 16]
7179 pmaddwd m2, [r6 - 5 * 16] ; [10]
7183 pmaddwd m1, [r6 - 5 * 16]
7189 pmaddwd m6, [r6 - 14 * 16] ; [1]
7193 pmaddwd m7, [r6 - 14 * 16]
7203 pmaddwd m7, [r6 + 9 * 16] ; [24]
7207 pmaddwd m1, [r6 + 9 * 16]
7212 lea r5, [r0 + r1 * 4]
7214 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
7217 pmaddwd m4, [r6] ; [15]
7227 pmaddwd m2, [r6 - 9 * 16] ; [6]
7231 pmaddwd m6, [r6 - 9 * 16]
7241 pmaddwd m6, [r6 + 14 * 16] ; [29]
7245 pmaddwd m1, [r6 + 14 * 16]
7251 pmaddwd m7, [r6 + 5 * 16] ; [20]
7255 pmaddwd m1, [r6 + 5 * 16]
7260 lea r5, [r5 + r1 * 4]
7262 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
7265 pmaddwd m4, [r6 - 4 * 16] ; [11]
7269 pmaddwd m1, [r6 - 4 * 16]
7275 pmaddwd m2, [r6 - 13 * 16] ; [2]
7279 pmaddwd m1, [r6 - 13 * 16]
7289 pmaddwd m7, [r6 + 10 * 16] ; [25]
7293 pmaddwd m1, [r6 + 10 * 16]
7298 pmaddwd m3, [r6 + 1 * 16] ; [16]
7301 pmaddwd m0, [r6 + 1 *16]
7306 lea r5, [r5 + r1 * 4]
7308 TRANSPOSE_STORE m4, m2, m7, m3, m1, 24
; intra_pred_ang16_13: entry point that sets r6 = ang_table + 15 * 16,
; rearranges m5 with the pw_ang16_13 mask and m6 with the pw_ang8_13 mask,
; then runs the shared ang16_mode_13_23 kernel twice, moving r0 forward by
; 8 * r1 between the two calls.
; NOTE(review): m5/m6 are presumably consumed inside the kernel -- confirm.
7312 cglobal intra_pred_ang16_13, 4,7,8
7315 lea r6, [ang_table + 15 * 16]
7317 pshufb m5, [pw_ang16_13]
7319 pshufb m6, [pw_ang8_13]
7324 call ang16_mode_13_23
7326 lea r0, [r0 + r1 * 8]
7330 call ang16_mode_13_23
; intra_pred_ang16_23: entry point that sets r6 = ang_table + 15 * 16,
; rearranges m5 with the pw_ang16_13 mask and m6 with the pw_ang8_13 mask,
; then runs the shared ang16_mode_13_23 kernel twice.
; NOTE(review): m5/m6 are presumably consumed inside the kernel -- confirm.
7334 cglobal intra_pred_ang16_23, 4,7,8
7338 lea r6, [ang_table + 15 * 16]
7340 pshufb m5, [pw_ang16_13]
7342 pshufb m6, [pw_ang8_13]
7348 call ang16_mode_13_23
7354 call ang16_mode_13_23
; ang16_mode_14_22: kernel shared by intra_pred_ang16_14 and
; intra_pred_ang16_22. Loads two overlapping vectors from r2 and r2 + 2,
; interleaves them into neighbouring-sample pairs, multiplies by coefficient
; rows addressed relative to r6 (pmaddwd) and passes each group of results to
; TRANSPOSE_STORE with byte offsets 0, 8, 16 and 24.
; Both callers set r6 = ang_table + 18 * 16, so [r6 + k * 16] is table row
; 18 + k; the bracketed number after a pmaddwd is that row index.
7358 cglobal ang16_mode_14_22
7360 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
7361 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
7363 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
7364 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
7367 pmaddwd m4, [r6 + 1 * 16] ; [19]
7371 pmaddwd m2, [r6 + 1 * 16]
7377 pmaddwd m2, [r6 - 12 * 16] ; [6]
7381 pmaddwd m1, [r6 - 12 * 16]
7390 pmaddwd m6, [r6 + 7 * 16] ; [25]
7394 pmaddwd m1, [r6 + 7 * 16]
7400 pmaddwd m7, [r6 - 6 * 16] ; [12]
7404 pmaddwd m1, [r6 - 6 * 16]
7411 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
7418 pmaddwd m4, [r6 + 13 * 16] ; [31]
7422 pmaddwd m1, [r6 + 13 * 16]
7428 pmaddwd m2, [r6] ; [18]
7438 pmaddwd m6, [r6 - 13 * 16] ; [5]
7442 pmaddwd m7, [r6 - 13 * 16]
7452 pmaddwd m7, [r6 + 6 * 16] ; [24]
7456 pmaddwd m1, [r6 + 6 * 16]
7461 lea r5, [r0 + r1 * 4]
7463 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
7466 pmaddwd m4, [r6 - 7 * 16] ; [11]
7470 pmaddwd m6, [r6 - 7 * 16]
7480 pmaddwd m2, [r6 + 12 * 16] ; [30]
7484 pmaddwd m6, [r6 + 12 * 16]
7490 pmaddwd m6, [r6 - 1 * 16] ; [17]
7494 pmaddwd m1, [r6 - 1 * 16]
7500 pmaddwd m7, [r6 - 14 * 16] ; [4]
7504 pmaddwd m1, [r6 - 14 * 16]
7509 lea r5, [r5 + r1 * 4]
7511 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
7518 pmaddwd m4, [r6 + 5 * 16] ; [23]
7522 pmaddwd m1, [r6 + 5 * 16]
7528 pmaddwd m2, [r6 - 8 * 16] ; [10]
7532 pmaddwd m1, [r6 - 8 * 16]
7542 pmaddwd m7, [r6 + 11 * 16] ; [29]
7546 pmaddwd m1, [r6 + 11 * 16]
7551 pmaddwd m3, [r6 - 2 * 16] ; [16]
7554 pmaddwd m0, [r6 - 2 *16]
7559 lea r5, [r5 + r1 * 4]
7561 TRANSPOSE_STORE m4, m2, m7, m3, m1, 24
; intra_pred_ang16_14: entry point that sets r6 = ang_table + 18 * 16,
; rearranges m6 and m5 with the pw_ang8_14 shuffle mask, then runs the shared
; ang16_mode_14_22 kernel twice, moving r0 forward by 8 * r1 between the two
; calls.
; NOTE(review): m5/m6 are presumably consumed inside the kernel -- confirm.
7565 cglobal intra_pred_ang16_14, 4,7,8
7568 lea r6, [ang_table + 18 * 16]
7570 pshufb m6, [pw_ang8_14]
7572 pshufb m5, [pw_ang8_14]
7576 call ang16_mode_14_22
7578 lea r0, [r0 + r1 * 8]
7582 call ang16_mode_14_22
; intra_pred_ang16_22: entry point that sets r6 = ang_table + 18 * 16,
; rearranges m6 and m5 with the pw_ang8_14 shuffle mask, then runs the shared
; ang16_mode_14_22 kernel twice.
; NOTE(review): m5/m6 are presumably consumed inside the kernel -- confirm.
7586 cglobal intra_pred_ang16_22, 4,7,8
7590 lea r6, [ang_table + 18 * 16]
7592 pshufb m6, [pw_ang8_14]
7594 pshufb m5, [pw_ang8_14]
7599 call ang16_mode_14_22
7605 call ang16_mode_14_22
; ang16_mode_15_21: kernel shared by intra_pred_ang16_15 and
; intra_pred_ang16_21. Loads two overlapping vectors from r2 and r2 + 2,
; interleaves them into neighbouring-sample pairs, multiplies by coefficient
; rows addressed relative to r6 (pmaddwd) and passes each group of results to
; TRANSPOSE_STORE with byte offsets 0, 8, 16 and 24.
; m5 is read on entry (palignr below); both callers prepare it with a pshufb.
; Both callers set r6 = ang_table + 15 * 16, so [r6 + k * 16] is table row
; 15 + k; the bracketed number after a pmaddwd is that row index.
7609 cglobal ang16_mode_15_21
7611 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
7612 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
7614 palignr m6, m0, m5, 2 ; m0:m5 shifted right by one word
7616 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
7617 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
7620 pmaddwd m4, [r6] ; [15]
7633 pmaddwd m2, [r6 + 15 * 16] ; [30]
7637 pmaddwd m1, [r6 + 15 * 16]
7643 pmaddwd m6, [r6 - 2 * 16] ; [13]
7647 pmaddwd m1, [r6 - 2 * 16]
7656 pmaddwd m7, [r6 + 13 * 16] ; [28]
7660 pmaddwd m1, [r6 + 13 * 16]
7667 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
7670 pmaddwd m4, [r6 - 4 * 16] ; [11]
7674 pmaddwd m1, [r6 - 4 * 16]
7684 pmaddwd m2, [r6 + 11 * 16] ; [26]
7688 pmaddwd m1, [r6 + 11 * 16]
7694 pmaddwd m6, [r6 - 6 * 16] ; [9]
7698 pmaddwd m7, [r6 - 6 * 16]
7708 pmaddwd m7, [r6 + 9 * 16] ; [24]
7712 pmaddwd m1, [r6 + 9 * 16]
7717 lea r5, [r0 + r1 * 4]
7719 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
7722 pmaddwd m4, [r6 - 8 * 16] ; [7]
7726 pmaddwd m6, [r6 - 8 * 16]
7736 pmaddwd m2, [r6 + 7 * 16] ; [22]
7740 pmaddwd m6, [r6 + 7 * 16]
7746 pmaddwd m6, [r6 - 10 * 16] ; [5]
7750 pmaddwd m1, [r6 - 10 * 16]
7760 pmaddwd m7, [r6 + 5 * 16] ; [20]
7764 pmaddwd m1, [r6 + 5 * 16]
7769 lea r5, [r5 + r1 * 4]
7771 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
7774 pmaddwd m4, [r6 - 12 * 16] ; [3]
7778 pmaddwd m1, [r6 - 12 * 16]
7788 pmaddwd m2, [r6 + 3 * 16] ; [18]
7792 pmaddwd m1, [r6 + 3 * 16]
7798 pmaddwd m7, [r6 - 14 * 16] ; [1]
7802 pmaddwd m1, [r6 - 14 * 16]
7811 pmaddwd m3, [r6 + 1 * 16] ; [16]
7814 pmaddwd m0, [r6 + 1 * 16]
7819 lea r5, [r5 + r1 * 4]
7821 TRANSPOSE_STORE m4, m2, m7, m3, m1, 24
; intra_pred_ang16_15: entry point that sets r6 = ang_table + 15 * 16,
; rearranges m6 and m5 with the pw_ang8_15 shuffle mask, then runs the shared
; ang16_mode_15_21 kernel twice, moving r0 forward by 8 * r1 between the two
; calls. The kernel reads m5 via palignr.
7825 cglobal intra_pred_ang16_15, 4,7,8
7828 lea r6, [ang_table + 15 * 16]
7830 pshufb m6, [pw_ang8_15]
7832 pshufb m5, [pw_ang8_15]
7836 call ang16_mode_15_21
7838 lea r0, [r0 + r1 * 8]
7842 call ang16_mode_15_21
; intra_pred_ang16_21: entry point that sets r6 = ang_table + 15 * 16,
; rearranges m6 and m5 with the pw_ang8_15 shuffle mask, then runs the shared
; ang16_mode_15_21 kernel twice. The kernel reads m5 via palignr.
7846 cglobal intra_pred_ang16_21, 4,7,8
7850 lea r6, [ang_table + 15 * 16]
7852 pshufb m6, [pw_ang8_15]
7854 pshufb m5, [pw_ang8_15]
7859 call ang16_mode_15_21
7865 call ang16_mode_15_21
; ang16_mode_16_20: kernel shared by intra_pred_ang16_16 and
; intra_pred_ang16_20. Loads two overlapping vectors from r2 and r2 + 2,
; interleaves them into neighbouring-sample pairs, multiplies by coefficient
; rows addressed relative to r6 (pmaddwd) and passes each group of results to
; TRANSPOSE_STORE with byte offsets 0, 8, 16 and 24.
; m5 is read on entry (palignr below) and is reshuffled later with pw_ang8_16.
; Both callers set r6 = ang_table + 13 * 16, so [r6 + k * 16] is table row
; 13 + k; the bracketed number after a pmaddwd is that row index.
7869 cglobal ang16_mode_16_20
7872 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
7873 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
7875 palignr m6, m0, m5, 2 ; m0:m5 shifted right by one word
7877 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
7878 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
7881 pmaddwd m4, [r6 - 2 * 16] ; [11]
7885 pmaddwd m2, [r6 - 2 * 16]
7894 pmaddwd m2, [r6 + 9 * 16] ; [22]
7898 pmaddwd m1, [r6 + 9 * 16]
7904 pmaddwd m6, [r6 - 12 * 16] ; [1]
7908 pmaddwd m1, [r6 - 12 * 16]
7917 pmaddwd m7, [r6 - 1 * 16] ; [12]
7921 pmaddwd m1, [r6 - 1 * 16]
7928 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
7935 pmaddwd m4, [r6 + 10 * 16] ; [23]
7939 pmaddwd m1, [r6 + 10 * 16]
7945 pmaddwd m2, [r6 - 11 * 16] ; [2]
7949 pmaddwd m1, [r6 - 11 * 16]
7959 pmaddwd m6, [r6] ; [13]
7973 pmaddwd m7, [r6 + 11 * 16] ; [24]
7977 pmaddwd m1, [r6 + 11 * 16]
7982 lea r5, [r0 + r1 * 4]
7984 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
7987 pmaddwd m4, [r6 - 10 * 16] ; [3]
7991 pmaddwd m6, [r6 - 10 * 16]
8001 pmaddwd m2, [r6 + 1 * 16] ; [14]
8005 pmaddwd m6, [r6 + 1 * 16]
8015 pmaddwd m6, [r6 + 12 * 16] ; [25]
8019 pmaddwd m1, [r6 + 12 * 16]
8025 pmaddwd m7, [r6 - 9 * 16] ; [4]
8029 pmaddwd m1, [r6 - 9 * 16]
8034 lea r5, [r5 + r1 * 4]
8036 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
8043 pmaddwd m4, [r6 + 2 * 16] ; [15]
8047 pmaddwd m1, [r6 + 2 * 16]
8053 pshufb m5, [pw_ang8_16]
8059 pmaddwd m2, [r6 + 13 * 16] ; [26]
8063 pmaddwd m1, [r6 + 13 * 16]
8069 pmaddwd m7, [r6 - 8 * 16] ; [5]
8073 pmaddwd m1, [r6 - 8 * 16]
8082 pmaddwd m3, [r6 + 3 * 16] ; [16]
8085 pmaddwd m0, [r6 + 3 * 16]
8090 lea r5, [r5 + r1 * 4]
8092 TRANSPOSE_STORE m4, m2, m7, m3, m1, 24
; intra_pred_ang16_16: entry point that sets r6 = ang_table + 13 * 16,
; rearranges m6 and m5 with the pw_ang16_16 shuffle mask, then runs the shared
; ang16_mode_16_20 kernel twice, moving r0 forward by 8 * r1 between the two
; calls. The kernel reads m5 via palignr.
; NOTE(review): the trailing "0-(1*mmsize)" presumably asks x86inc for one
; mmsize-byte stack slot -- confirm against x86inc's cglobal definition.
8096 cglobal intra_pred_ang16_16, 4,7,8,0-(1*mmsize)
8098 lea r6, [ang_table + 13 * 16]
8100 pshufb m6, [pw_ang16_16]
8102 pshufb m5, [pw_ang16_16]
8108 call ang16_mode_16_20
8110 lea r0, [r0 + r1 * 8]
8116 call ang16_mode_16_20
; intra_pred_ang16_20: entry point that sets r6 = ang_table + 13 * 16,
; rearranges m6 and m5 with the pw_ang16_16 shuffle mask, then runs the shared
; ang16_mode_16_20 kernel twice. The kernel reads m5 via palignr.
; NOTE(review): the trailing "0-(1*mmsize)" presumably asks x86inc for one
; mmsize-byte stack slot -- confirm against x86inc's cglobal definition.
8120 cglobal intra_pred_ang16_20, 4,7,8,0-(1*mmsize)
8123 lea r6, [ang_table + 13 * 16]
8125 pshufb m6, [pw_ang16_16]
8127 pshufb m5, [pw_ang16_16]
8134 call ang16_mode_16_20
8143 call ang16_mode_16_20
; ang16_mode_17_19: kernel shared by intra_pred_ang16_17 and
; intra_pred_ang16_19. Loads two overlapping vectors from r2 and r2 + 2,
; interleaves them into neighbouring-sample pairs, multiplies by coefficient
; rows addressed relative to r6 (pmaddwd) and passes each group of results to
; TRANSPOSE_STORE with byte offsets 0, 8, 16 and 24.
; m5 is read on entry (palignr below) and is reshuffled later with pw_ang8_17.
; Both callers set r6 = ang_table + 16 * 16, so [r6 + k * 16] is table row
; 16 + k; the bracketed number after a pmaddwd is that row index.
8147 cglobal ang16_mode_17_19
8150 movu m0, [r2] ; [7 6 5 4 3 2 1 0]
8151 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
8153 palignr m6, m0, m5, 2 ; m0:m5 shifted right by one word
8155 punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
8156 punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
8159 pmaddwd m4, [r6 - 10 * 16] ; [6]
8163 pmaddwd m2, [r6 - 10 * 16]
8172 pmaddwd m2, [r6 - 4 * 16] ; [12]
8176 pmaddwd m1, [r6 - 4 * 16]
8185 pmaddwd m6, [r6 + 2 * 16] ; [18]
8189 pmaddwd m1, [r6 + 2 * 16]
8199 pmaddwd m7, [r6 + 8 * 16] ; [24]
8203 pmaddwd m1, [r6 + 8 * 16]
8210 TRANSPOSE_STORE m4, m2, m6, m7, m1, 0
8217 pmaddwd m4, [r6 + 14 * 16] ; [30]
8221 pmaddwd m1, [r6 + 14 * 16]
8227 pmaddwd m2, [r6 - 12 * 16] ; [4]
8231 pmaddwd m1, [r6 - 12 * 16]
8241 pmaddwd m6, [r6 - 6 * 16] ; [10]
8245 pmaddwd m7, [r6 - 6 * 16]
8255 pmaddwd m7, [r6] ; [16]
8264 lea r5, [r0 + r1 * 4]
8266 TRANSPOSE_STORE m4, m2, m6, m7, m1, 8
8273 pmaddwd m4, [r6 + 6 * 16] ; [22]
8277 pmaddwd m6, [r6 + 6 * 16]
8287 pmaddwd m2, [r6 + 12 * 16] ; [28]
8291 pmaddwd m6, [r6 + 12 * 16]
8297 pmaddwd m6, [r6 - 14 * 16] ; [2]
8301 pmaddwd m1, [r6 - 14 * 16]
8307 pshufb m5, [pw_ang8_17]
8313 pmaddwd m7, [r6 - 8 * 16] ; [8]
8317 pmaddwd m1, [r6 - 8 * 16]
8322 lea r5, [r5 + r1 * 4]
8324 TRANSPOSE_STORE m4, m2, m6, m7, m1, 16
8331 pmaddwd m4, [r6 - 2 * 16] ; [14]
8335 pmaddwd m1, [r6 - 2 * 16]
8345 pmaddwd m2, [r6 + 4 * 16] ; [20]
8349 pmaddwd m1, [r6 + 4 * 16]
8359 pmaddwd m7, [r6 + 10 * 16] ; [26]
8363 pmaddwd m1, [r6 + 10 * 16]
8368 pmaddwd m3, [r6 - 16 * 16] ; [0]
8371 pmaddwd m0, [r6 - 16 * 16]
8376 lea r5, [r5 + r1 * 4]
8378 TRANSPOSE_STORE m4, m2, m7, m3, m1, 24
; intra_pred_ang16_17: entry point that sets r6 = ang_table + 16 * 16,
; rearranges m6 and m5 with the pw_ang16_16 shuffle mask, then runs the shared
; ang16_mode_17_19 kernel twice, moving r0 forward by 8 * r1 between the two
; calls. The kernel reads m5 via palignr.
; NOTE(review): the trailing "0-(1*mmsize)" presumably asks x86inc for one
; mmsize-byte stack slot -- confirm against x86inc's cglobal definition.
8382 cglobal intra_pred_ang16_17, 4,7,8,0-(1*mmsize)
8384 lea r6, [ang_table + 16 * 16]
8386 pshufb m6, [pw_ang16_16]
8388 pshufb m5, [pw_ang16_16]
8394 call ang16_mode_17_19
8396 lea r0, [r0 + r1 * 8]
8402 call ang16_mode_17_19
; intra_pred_ang16_19: entry point that sets r6 = ang_table + 16 * 16,
; rearranges m6 and m5 with the pw_ang16_16 shuffle mask, then runs the shared
; ang16_mode_17_19 kernel twice. The kernel reads m5 via palignr.
; NOTE(review): the trailing "0-(1*mmsize)" presumably asks x86inc for one
; mmsize-byte stack slot -- confirm against x86inc's cglobal definition.
8406 cglobal intra_pred_ang16_19, 4,7,8,0-(1*mmsize)
8409 lea r6, [ang_table + 16 * 16]
8411 pshufb m6, [pw_ang16_16]
8413 pshufb m5, [pw_ang16_16]
8420 call ang16_mode_17_19
8429 call ang16_mode_17_19
; intra_pred_ang16_18: writes output rows to r0 as pairs of 16-byte stores
; (offsets 0 and +16). Each row is formed with palignr across neighbouring
; registers, with the byte shift dropping by 2 (one word) from row to row
; (14, 12, ... 2). Source registers are rearranged with the pw_swap16 shuffle
; mask, and r0 is advanced by 4 * r1 after every group of four rows.
; NOTE(review): r4 is presumably 3 * r1 (fourth row of each group) -- confirm.
8433 cglobal intra_pred_ang16_18, 4,5,4
8439 pshufb m0, [pw_swap16]
8442 palignr m2, m1, m0, 14
8444 palignr m2, m3, m1, 14
8445 movu [r0 + r1 + 16], m2
8446 palignr m2, m1, m0, 12
8447 movu [r0 + r1 * 2], m2
8448 palignr m2, m3, m1, 12
8449 movu [r0 + r1 * 2 + 16], m2
8450 palignr m2, m1, m0, 10
8452 palignr m2, m3, m1, 10
8453 movu [r0 + r4 + 16], m2
8455 lea r0, [r0 + r1 * 4]
8456 palignr m2, m1, m0, 8
8458 palignr m2, m3, m1, 8
8460 palignr m2, m1, m0, 6
8462 palignr m2, m3, m1, 6
8463 movu [r0 + r1 + 16], m2
8464 palignr m2, m1, m0, 4
8465 movu [r0 + r1 * 2], m2
8466 palignr m2, m3, m1, 4
8467 movu [r0 + r1 * 2 + 16], m2
8468 palignr m2, m1, m0, 2
8471 movu [r0 + r4 + 16], m3
8473 lea r0, [r0 + r1 * 4]
8477 pshufb m3, [pw_swap16]
8478 palignr m2, m0, m3, 14
8480 palignr m2, m1, m0, 14
8481 movu [r0 + r1 + 16], m2
8482 palignr m2, m0, m3, 12
8483 movu [r0 + r1 * 2], m2
8484 palignr m2, m1, m0, 12
8485 movu [r0 + r1 * 2 + 16], m2
8486 palignr m2, m0, m3, 10
8488 palignr m2, m1, m0, 10
8489 movu [r0 + r4 + 16], m2
8491 lea r0, [r0 + r1 * 4]
8492 palignr m2, m0, m3, 8
8494 palignr m2, m1, m0, 8
8496 palignr m2, m0, m3, 6
8498 palignr m2, m1, m0, 6
8499 movu [r0 + r1 + 16], m2
8500 palignr m2, m0, m3, 4
8501 movu [r0 + r1 * 2], m2
8502 palignr m2, m1, m0, 4
8503 movu [r0 + r1 * 2 + 16], m2
8504 palignr m2, m0, m3, 2
8507 movu [r0 + r4 + 16], m1
; intra_pred_ang16_10: entry point declared through cglobal (x86inc argument
; order: 4 arguments, 5 general-purpose registers, 4 vector registers).
; Sixteen reference samples are loaded from [r2 + 2]; for each output row one
; sample is broadcast to all eight word lanes with the pw_unpackwdq mask
; (bytes 0,1 repeated) and the same register is stored to both 16-byte halves
; of the row, so every row is filled with a single sample value.
; NOTE(review): the shifts that move the next sample into lane 0 between
; broadcasts, some stores, and the branch around the trailing block are not
; shown in this excerpt; r4 presumably holds 3 * r1.
8511 cglobal intra_pred_ang16_10, 4,5,4
8512 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
8513 movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
8514 pshufb m0, m1, [pw_unpackwdq] ; [1 1 1 1 1 1 1 1]
8519 pshufb m2, m1, [pw_unpackwdq] ; [2 2 2 2 2 2 2 2]
8521 movu [r0 + r1 + 16], m2
8523 pshufb m2, m1, [pw_unpackwdq] ; [3 3 3 3 3 3 3 3]
8524 movu [r0 + r1 * 2], m2
8525 movu [r0 + r1 * 2 + 16], m2
8527 pshufb m2, m1, [pw_unpackwdq] ; [4 4 4 4 4 4 4 4]
8529 movu [r0 + r4 + 16], m2
8531 lea r2, [r0 + r1 *4] ; r2 is reused from here on as the pointer to the next group of four rows
8533 pshufb m2, m1, [pw_unpackwdq] ; [5 5 5 5 5 5 5 5]
8537 pshufb m2, m1, [pw_unpackwdq] ; [6 6 6 6 6 6 6 6]
8539 movu [r2 + r1 + 16], m2
8541 pshufb m2, m1, [pw_unpackwdq] ; [7 7 7 7 7 7 7 7]
8542 movu [r2 + r1 * 2], m2
8543 movu [r2 + r1 * 2 + 16], m2
8545 pshufb m2, m1, [pw_unpackwdq] ; [8 8 8 8 8 8 8 8]
8547 movu [r2 + r4 + 16], m2
8549 lea r2, [r2 + r1 *4] ; next group of four rows
8550 pshufb m2, m3, [pw_unpackwdq] ; [9 9 9 9 9 9 9 9]
8554 pshufb m2, m3, [pw_unpackwdq] ; [10 10 10 10 10 10 10 10]
8556 movu [r2 + r1 + 16], m2
8558 pshufb m2, m3, [pw_unpackwdq] ; [11 11 11 11 11 11 11 11]
8559 movu [r2 + r1 * 2], m2
8560 movu [r2 + r1 * 2 + 16], m2
8562 pshufb m2, m3, [pw_unpackwdq] ; [12 12 12 12 12 12 12 12]
8564 movu [r2 + r4 + 16], m2
8566 lea r2, [r2 + r1 *4] ; next group of four rows
8568 pshufb m2, m3, [pw_unpackwdq] ; [13 13 13 13 13 13 13 13]
8572 pshufb m2, m3, [pw_unpackwdq] ; [14 14 14 14 14 14 14 14]
8574 movu [r2 + r1 + 16], m2
8576 pshufb m2, m3, [pw_unpackwdq] ; [15 15 15 15 15 15 15 15]
8577 movu [r2 + r1 * 2], m2
8578 movu [r2 + r1 * 2 + 16], m2
8580 pshufb m2, m3, [pw_unpackwdq] ; [16 16 16 16 16 16 16 16]
8582 movu [r2 + r4 + 16], m2
; Trailing block: works on the other reference array (r3). NOTE(review): this
; looks like the optional edge-filter path; the condition guarding it and the
; arithmetic between the loads and the clamps are not visible here.
8590 movh m1, [r3] ; [3 2 1 0]
8591 pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0]
8592 movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
8593 movu m3, [r3 + 18] ; [16 15 14 13 12 11 10 9]
8602 pminsw m0, [pw_1023] ; signed per-word minimum against pw_1023 (presumably 1023 in every lane - upper clamp)
8604 pminsw m3, [pw_1023] ; same clamp for the second half
; intra_pred_ang16_26: entry point declared through cglobal (x86inc argument
; order: 4 arguments, 5 general-purpose registers, 4 vector registers).
; Sixteen reference samples are loaded from [r3 + 2] into m0/m3 and the same
; pair is stored to every row, so each output row is a copy of those samples.
; The trailing block reads the other reference array (r2), clamps m0/m3 and
; writes one word per row with pextrw, i.e. it rewrites a single column.
; NOTE(review): several stores, the guard for the trailing block and the
; arithmetic feeding the clamps are not shown in this excerpt; r4 presumably
; holds 3 * r1.
8611 cglobal intra_pred_ang16_26, 4,5,4
8612 movu m0, [r3 + 2] ; [8 7 6 5 4 3 2 1]
8613 movu m3, [r3 + 18] ; [16 15 14 13 12 11 10 9]
8620 movu [r0 + r1 + 16], m3
8621 movu [r0 + r1 * 2], m0
8622 movu [r0 + r1 * 2 + 16], m3
8624 movu [r0 + r4 + 16], m3
8626 lea r3, [r0 + r1 *4] ; r3 is reused from here on as the pointer to the next group of four rows
8630 movu [r3 + r1 + 16], m3
8631 movu [r3 + r1 * 2], m0
8632 movu [r3 + r1 * 2 + 16], m3
8634 movu [r3 + r4 + 16], m3
8636 lea r3, [r3 + r1 *4] ; next group of four rows
8640 movu [r3 + r1 + 16], m3
8641 movu [r3 + r1 * 2], m0
8642 movu [r3 + r1 * 2 + 16], m3
8644 movu [r3 + r4 + 16], m3
8646 lea r3, [r3 + r1 *4] ; next group of four rows; r3 is not advanced again in the visible lines
8650 movu [r3 + r1 + 16], m3
8651 movu [r3 + r1 * 2], m0
8652 movu [r3 + r1 * 2 + 16], m3
8654 movu [r3 + r4 + 16], m3
8661 pshufb m0, [pw_unpackwdq] ; broadcast the low word of m0 to all lanes (mask is bytes 0,1 repeated)
8662 movh m1, [r2] ; [3 2 1 0]
8663 pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0]
8664 movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
8665 movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
8674 pminsw m0, [pw_1023] ; signed per-word minimum against pw_1023 (presumably 1023 in every lane - upper clamp)
8676 pminsw m3, [pw_1023] ; same clamp for the second half
8678 pextrw [r0 + r1], m0, 1 ; store word lane 1 of m0 as the first sample of this row
8679 pextrw [r0 + r1 * 2], m0, 2
8680 pextrw [r0 + r4], m0, 3
8681 lea r0, [r0 + r1 * 4]
8683 pextrw [r0 + r1], m0, 5
8684 pextrw [r0 + r1 * 2], m0, 6
8685 pextrw [r0 + r4], m0, 7
8686 lea r0, [r0 + r1 * 4]
8688 pextrw [r0 + r1], m3, 1
8689 pextrw [r0 + r1 * 2], m3, 2
8690 pextrw [r0 + r4], m3, 3
8692 pextrw [r3 + r1], m3, 5 ; the last rows are addressed through r3, the row pointer left by the copy loop above
8693 pextrw [r3 + r1 * 2], m3, 6
8694 pextrw [r3 + r4], m3, 7
; NOTE(review): the line that opens this definition is not visible in this
; excerpt; by position it is presumably the macro body shared by the
; 32x32 mode 2 / mode 34 entry points - confirm against the full file.
; Visible behaviour: each row is written as four 16-byte stores at byte
; offsets +0/+16/+32/+48. Every 16-byte piece is a palignr window over two
; neighbouring registers of the chain m4:m3:m2:m1:m0, and the window start
; advances by 2 bytes (one 16-bit word) per row. After eight rows the chain is
; used rotated by one register (m0:m4:m3:m2:m1), so m0 must have been reloaded
; in a line that is not shown. Rows are addressed as r0, r0 + r1, r0 + r3 and
; r0 + r4, so r3 and r4 presumably hold 2 * r1 and 3 * r1.
8709 palignr m5, m1, m0, 2
8711 palignr m5, m2, m1, 2
8712 movu [r0 + r1 + 16], m5
8713 palignr m5, m3, m2, 2
8714 movu [r0 + r1 + 32], m5
8715 palignr m5, m4, m3, 2
8716 movu [r0 + r1 + 48], m5
8717 palignr m5, m1, m0, 4
8719 palignr m5, m2, m1, 4
8720 movu [r0 + r3 + 16], m5
8721 palignr m5, m3, m2, 4
8722 movu [r0 + r3 + 32], m5
8723 palignr m5, m4, m3, 4
8724 movu [r0 + r3 + 48], m5
8725 palignr m5, m1, m0, 6
8727 palignr m5, m2, m1, 6
8728 movu [r0 + r4 + 16], m5
8729 palignr m5, m3, m2, 6
8730 movu [r0 + r4 + 32], m5
8731 palignr m5, m4, m3, 6
8732 movu [r0 + r4 + 48], m5
8733 lea r0, [r0 + r1 * 4] ; advance r0 by 4 * r1
8734 palignr m5, m1, m0, 8
8736 palignr m5, m2, m1, 8
8738 palignr m5, m3, m2, 8
8740 palignr m5, m4, m3, 8
8742 palignr m5, m1, m0, 10
8744 palignr m5, m2, m1, 10
8745 movu [r0 + r1 + 16], m5
8746 palignr m5, m3, m2, 10
8747 movu [r0 + r1 + 32], m5
8748 palignr m5, m4, m3, 10
8749 movu [r0 + r1 + 48], m5
8750 palignr m5, m1, m0, 12
8752 palignr m5, m2, m1, 12
8753 movu [r0 + r3 + 16], m5
8754 palignr m5, m3, m2, 12
8755 movu [r0 + r3 + 32], m5
8756 palignr m5, m4, m3, 12
8757 movu [r0 + r3 + 48], m5
8758 palignr m5, m1, m0, 14
8760 palignr m5, m2, m1, 14
8761 movu [r0 + r4 + 16], m5
8762 palignr m5, m3, m2, 14
8763 movu [r0 + r4 + 32], m5
8764 palignr m5, m4, m3, 14
8765 movu [r0 + r4 + 48], m5
8766 lea r0, [r0 + r1 * 4] ; advance r0 by 4 * r1
; From here the window chain is rotated by one register: m1 is the lowest
; piece and m0 the highest.
8772 palignr m5, m2, m1, 2
8774 palignr m5, m3, m2, 2
8775 movu [r0 + r1 + 16], m5
8776 palignr m5, m4, m3, 2
8777 movu [r0 + r1 + 32], m5
8778 palignr m5, m0, m4, 2
8779 movu [r0 + r1 + 48], m5
8780 palignr m5, m2, m1, 4
8782 palignr m5, m3, m2, 4
8783 movu [r0 + r3 + 16], m5
8784 palignr m5, m4, m3, 4
8785 movu [r0 + r3 + 32], m5
8786 palignr m5, m0, m4, 4
8787 movu [r0 + r3 + 48], m5
8788 palignr m5, m2, m1, 6
8790 palignr m5, m3, m2, 6
8791 movu [r0 + r4 + 16], m5
8792 palignr m5, m4, m3, 6
8793 movu [r0 + r4 + 32], m5
8794 palignr m5, m0, m4, 6
8795 movu [r0 + r4 + 48], m5
8796 lea r0, [r0 + r1 * 4] ; advance r0 by 4 * r1
8797 palignr m5, m2, m1, 8
8799 palignr m5, m3, m2, 8
8801 palignr m5, m4, m3, 8
8803 palignr m5, m0, m4, 8
8805 palignr m5, m2, m1, 10
8807 palignr m5, m3, m2, 10
8808 movu [r0 + r1 + 16], m5
8809 palignr m5, m4, m3, 10
8810 movu [r0 + r1 + 32], m5
8811 palignr m5, m0, m4, 10
8812 movu [r0 + r1 + 48], m5
8813 palignr m5, m2, m1, 12
8815 palignr m5, m3, m2, 12
8816 movu [r0 + r3 + 16], m5
8817 palignr m5, m4, m3, 12
8818 movu [r0 + r3 + 32], m5
8819 palignr m5, m0, m4, 12
8820 movu [r0 + r3 + 48], m5
8821 palignr m5, m2, m1, 14
8823 palignr m5, m3, m2, 14
8824 movu [r0 + r4 + 16], m5
8825 palignr m5, m4, m3, 14
8826 movu [r0 + r4 + 32], m5
8827 palignr m5, m0, m4, 14
8828 movu [r0 + r4 + 48], m5
8829 lea r0, [r0 + r1 * 4] ; advance r0 by 4 * r1
8831 ;--------------------------------------------------------------------------------------------------------------------
8832 ; void intraPredAng32_2_34(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8833 ;--------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_2: entry point declared through cglobal (x86inc argument
; order: 3 arguments, 6 general-purpose registers, 6 vector registers).
; NOTE(review): the body of this function is not visible in this excerpt.
8835 cglobal intra_pred_ang32_2, 3,6,6
; TRANSPOSE_STORE_8x8 - store helper taking six macro arguments:
;   %1      byte offset added to the store addresses of the transposing path
;   %2      not referenced in the visible lines; callers forward their own %1
;           here. NOTE(review): presumably selects between the two store paths
;           below - the conditional directives are not shown in this excerpt.
;   %3..%6  the four vector registers holding the data to store
; Transposing path: interleaves words and then quadwords of %3..%6 (using m0 as
; scratch, so m0 is clobbered) and writes full 16-byte rows with movu.
; Direct path: writes 8-byte halves of %3..%6 with movh/movhps and advances r0
; by 4 * r1 after every four rows.
; r5 is used as a row offset next to r1 and r1 * 2, so it presumably holds
; 3 * r1 (set by the caller).
8851 %macro TRANSPOSE_STORE_8x8 6
8853 ; transpose 4x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32
8854 punpckhwd m0, %3, %4
8856 punpckhwd %4, %3, m0
8859 punpckhwd m0, %5, %6
8861 punpckhwd %6, %5, m0
8864 punpckhqdq m0, %3, %5
8866 punpcklqdq %5, %4, %6
8870 movu [r0 + r1 + %1], m0
8871 movu [r0 + r1 * 2 + %1], %5
8872 movu [r0 + r5 + %1], %4
8874 ; store 8x4, used by angle BLOCK_16x16 and BLOCK_32x32
8876 movhps [r0 + r1], %3 ; high 8 bytes of %3
8877 movh [r0 + r1 * 2], %4 ; low 8 bytes of %4
8878 movhps [r0 + r5], %4 ; high 8 bytes of %4
8879 lea r0, [r0 + r1 * 4] ; advance r0 by 4 * r1
8881 movhps [r0 + r1], %5
8882 movh [r0 + r1 * 2], %6
8883 movhps [r0 + r5], %6
8884 lea r0, [r0 + r1 * 4] ; advance r0 by 4 * r1
; NOTE(review): the %macro line opening this body is not visible in this
; excerpt; %1 is used below, so this is a macro body, presumably the one
; invoked by intra_pred_ang32_3 - confirm against the full file.
; Conventions used in the comments of this body:
;   - lane lists such as [8 7 6 5 4 3 2 1] give the 16-bit sample index
;     relative to r2 (byte offset / 2), highest lane first;
;   - a bracketed number after pmaddwd, e.g. [26], is the ang_table entry
;     selected by the displacement, assuming r3 = ang_table + 16 * 16 as the
;     visible entry points set it (entry = 16 + displacement / 16).
; The table entry advances by 26 (mod 32) from one row to the next. The
; rounding, shifting and packing between the pmaddwd lines are not shown.
8889 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
8890 movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
8893 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
8894 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5] xmm2
8895 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1] xmm0
8897 palignr m1, m2, m0, 4 ; [6 5 5 4 4 3 3 2] xmm1
8898 pmaddwd m4, m0, [r3 + 10 * 16] ; [26]
8902 pmaddwd m5, m1, [r3 + 4 * 16] ; [20]
8907 palignr m5, m2, m0, 8 ; [7 6 6 5 5 4 4 3]
8908 pmaddwd m5, [r3 - 2 * 16] ; [14]
8912 palignr m6, m2, m0, 12 ; [8 7 7 6 6 5 5 4]
8913 pmaddwd m6, [r3 - 8 * 16] ; [ 8]
8918 pmaddwd m6, m2, [r3 - 14 * 16] ; [ 2]
8922 pmaddwd m1, m2, [r3 + 12 * 16] ; [28]
8927 palignr m0, m3, m2, 4 ; [10 9 9 8 8 7 7 6]
8928 pmaddwd m1, m0, [r3 + 6 * 16] ; [22]
8932 psrldq m2, m3, 2 ; [x 16 15 14 13 12 11 10]
8933 palignr m2, m0, 4 ;[11 10 10 9 9 8 8 7]
8935 pmaddwd m2, [r3] ; [16]
8940 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
8942 palignr m0, m3, m7, 14 ; [15 14 13 12 11 10 9 8]
8943 movu m3, [r2 + 32] ; [23 22 21 20 19 18 17 16]
8944 palignr m1, m3, m0, 2 ; [16 15 14 13 12 11 10 9]
8945 punpckhwd m7, m0, m1 ; [16 15 15 14 14 13 13 12]
8946 punpcklwd m0, m1 ; [12 11 11 10 10 9 9 8]
8948 palignr m5, m7, m0, 4 ; [13 12 12 11 11 10 10 9]
8949 pmaddwd m4, m0, [r3 - 6 * 16] ; [10]
8953 pmaddwd m1, m5, [r3 - 12 * 16] ; [04]
8958 pmaddwd m5, [r3 + 14 * 16] ; [30]
8962 palignr m6, m7, m0, 8 ; [14 13 13 12 12 11 11 10]
8963 pmaddwd m6, [r3 + 8 * 16] ; [24]
8968 palignr m1, m7, m0, 12 ; [15 14 14 13 13 12 12 11]
8969 pmaddwd m6, m1, [r3 + 2 * 16] ; [18]
8973 pmaddwd m1, m7, [r3 - 4 * 16] ; [12]
8978 palignr m2, m3, m7, 4 ; [17 16 16 15 15 14 14 13]
8979 pmaddwd m1, m2, [r3 - 10 * 16] ; [6]
8984 movhps m1, [r2 + 28] ; [00]
8986 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
8988 movu m0, [r2 + 28] ; [21 20 19 18 17 16 15 14] (byte offset 28 = sample 14)
8989 palignr m1, m0, 2 ; [ x 21 20 19 18 17 16 15] (top lane = previous low word of m1)
8990 punpckhwd m2, m0, m1 ; [ x 21 21 20 20 19 19 18]
8991 punpcklwd m0, m1 ; [18 17 17 16 16 15 15 14]
8993 pmaddwd m4, m0, [r3 + 10 * 16] ; [26]
8997 palignr m1, m2, m0, 4 ; [19 18 18 17 17 16 16 15]
8998 pmaddwd m1, [r3 + 4 * 16] ; [20]
9003 palignr m5, m2, m0, 8 ; [20 19 19 18 18 17 17 16]
9004 pmaddwd m5, [r3 - 2 * 16] ; [14]
9008 palignr m6, m2, m0, 12 ; [21 20 20 19 19 18 18 17]
9009 pmaddwd m6, [r3 - 8 * 16] ; [ 8]
9014 pinsrw m2, [r2 + 44], 7 ; [22 21 21 20 20 19 19 18] (sample 22 replaces the undefined top lane)
9015 pmaddwd m6, m2, [r3 - 14 * 16] ; [ 2]
9019 pmaddwd m2, [r3 + 12 * 16] ; [28]
9024 movu m3, [r2 + 38] ; [26 25 24 23 22 21 20 19] (byte offset 38 = sample 19)
9025 palignr m1, m3, 2 ; [ x 26 25 24 23 22 21 20] (top lane = previous low word of m1)
9026 punpckhwd m2, m3, m1 ; [ x 26 26 25 25 24 24 23]
9027 punpcklwd m3, m1 ; [23 22 22 21 21 20 20 19]
9029 pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
9033 palignr m0, m2, m3, 4 ; [24 23 23 22 22 21 21 20]
9034 pmaddwd m0, [r3] ; [16]
9039 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
9041 palignr m5, m2, m3, 8 ; [25 24 24 23 23 22 22 21]
9042 pmaddwd m4, m5, [r3 - 6 * 16] ; [10]
9046 palignr m5, m2, m3, 12 ; [26 25 25 24 24 23 23 22]
9047 pmaddwd m1, m5, [r3 - 12 * 16] ; [04]
9052 pmaddwd m5, [r3 + 14 * 16] ; [30]
9058 punpckhwd m2, m3, m1 ; m3/m1 are reloaded in lines that are not visible here
9061 pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
9066 palignr m6, m2, m3, 4
9067 pmaddwd m6, [r3 + 2 * 16] ; [18]
9071 palignr m1, m2, m3, 8
9072 pmaddwd m1, [r3 - 4 * 16] ; [12]
9077 palignr m1, m2, m3, 12
9078 pmaddwd m1, [r3 - 10 * 16] ; [06]
9083 movhps m1, [r2 + 54] ; [00]
9085 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
9087 ;------------------------------------------------------------------------------------------------------------------
9088 ; void intraPredAng32_3(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
9089 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_3: entry point declared through cglobal (x86inc argument
; order: 3 arguments, 6 general-purpose registers, 8 vector registers).
; NOTE(review): only two instructions of the body are visible in this excerpt.
9091 cglobal intra_pred_ang32_3, 3,6,8
9092 lea r3, [ang_table + 16 * 16] ; r3 = ang_table + 256, so [r3 + k * 16] reaches entry 16 + k for signed k
9099 lea r0, [r0 + r1 * 4 ] ; advance r0 by 4 * r1
; NOTE(review): the %macro line opening this body is not visible in this
; excerpt; %1 is used below, so this is a macro body, presumably the one
; invoked by intra_pred_ang32_4 - confirm against the full file.
; Lane lists give 16-bit sample indices relative to r2 (byte offset / 2),
; highest lane first. A bracketed number after pmaddwd is the ang_table entry
; selected by the displacement, assuming r3 = ang_table + 16 * 16. The entry
; advances by 21 (mod 32) per row. Rounding, shifting and packing between the
; pmaddwd lines are not shown.
9106 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
9107 movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
9108 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
9109 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
9110 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
9112 pmaddwd m4, m0, [r3 + 5 * 16] ; [21]
9116 palignr m5, m2, m0, 4 ; [6 5 5 4 4 3 3 2]
9117 pmaddwd m1, m5, [r3 - 6 * 16] ; [10]
9122 pmaddwd m5, [r3 + 15 * 16] ; [31]
9126 palignr m6, m2, m0, 8 ; [7 6 6 5 5 4 4 3]
9127 pmaddwd m6, [r3 + 4 * 16] ; [ 20]
9132 palignr m1, m2, m0, 12 ; [8 7 7 6 6 5 5 4]
9133 pmaddwd m6, m1, [r3 - 7 * 16] ; [ 9]
9137 pmaddwd m1, [r3 + 14 * 16] ; [30]
9142 pmaddwd m1, m2, [r3 + 3 * 16] ; [19]
9146 palignr m7, m3, m2, 4 ; [10 9 9 8 8 7 7 6]
9147 pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
9152 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
9154 pmaddwd m4, m7, [r3 + 13 * 16] ; [29]
9158 movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17]
9160 palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10]
9161 palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11]
9162 punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14]
9163 punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10]
9165 palignr m1, m2, m7, 4 ; [11 10 10 9 9 8 8 7]
9166 pmaddwd m1, [r3 + 2 * 16] ; [18]
9171 palignr m5, m2, m7, 8 ; [12 11 11 10 10 9 9 8]
9173 pmaddwd m5, [r3 - 9 * 16] ; [07]
9177 pmaddwd m6, [r3 + 12 * 16] ; [28]
9182 palignr m6, m2, m7, 12 ; [13 12 12 11 11 10 10 9]
9183 pmaddwd m6, [r3 + 16] ; [17]
9187 pmaddwd m1, m2, [r3 - 10 * 16] ; [06]
9192 pmaddwd m1, m2, [r3 + 11 * 16] ; [27]
9196 palignr m7, m3, m2, 4 ; [15 14 14 13 13 12 12 11]
9197 pmaddwd m7, [r3] ; [16]
9203 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
9205 palignr m0, m3, m2, 8 ; [16 15 15 14 14 13 13 12]
9206 pmaddwd m4, m0, [r3 - 11 * 16] ; [5]
9210 pmaddwd m1, m0, [r3 + 10 * 16] ; [26]
9215 palignr m5, m3, m2, 12 ; [17 16 16 15 15 14 14 13]
9216 pmaddwd m5, [r3 - 16] ; [15]
9220 pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
9225 pmaddwd m6, m3, [r3 + 9 * 16] ; [25]
9229 movu m0, [r2 + 50] ; [32 31 30 29 28 27 26 25]
9230 palignr m2, m0, m7, 2 ; [25 24 23 22 21 20 19 18]
9231 palignr m1, m0, m7, 4 ; [26 25 24 23 22 21 20 19]
9232 punpckhwd m7, m2, m1 ; [26 25 25 24 24 23 23 22]
9233 punpcklwd m2, m1 ; [22 21 21 20 20 19 19 18]
9235 palignr m1, m2, m3, 4
9236 pmaddwd m1, [r3 - 2 * 16] ; [14]
9241 palignr m1, m2, m3, 8
9243 pmaddwd m1, [r3 - 13 * 16] ; [3]
9247 pmaddwd m0, [r3 + 8 * 16] ; [24]
9252 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
9254 palignr m4, m2, m3, 12
9255 pmaddwd m4, [r3 - 3 * 16] ; [13]
9259 pmaddwd m1, m2, [r3 - 14 * 16] ; [2]
9264 pmaddwd m5, m2, [r3 + 7 * 16] ; [23]
9268 palignr m6, m7, m2, 4
9269 pmaddwd m6, [r3 - 4 * 16] ; [12]
9274 palignr m1, m7, m2, 8
9275 pmaddwd m6, m1, [r3 - 15 * 16] ; [1]
9279 pmaddwd m1, [r3 + 6 * 16] ; [22]
9284 palignr m1, m7, m2, 12
9285 pmaddwd m1, [r3 - 5 * 16] ; [11]
9289 movhps m1, [r2 + 44] ; [00]
9291 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
9293 ;------------------------------------------------------------------------------------------------------------------
9294 ; void intraPredAng32_4(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
9295 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_4: entry point declared through cglobal (x86inc argument
; order: 3 arguments, 6 general-purpose registers, 8 vector registers).
; NOTE(review): only two instructions of the body are visible in this excerpt.
9297 cglobal intra_pred_ang32_4, 3,6,8
9298 lea r3, [ang_table + 16 * 16] ; r3 = ang_table + 256, so [r3 + k * 16] reaches entry 16 + k for signed k
9305 lea r0, [r0 + r1 * 4 ] ; advance r0 by 4 * r1
; NOTE(review): the %macro line opening this body is not visible in this
; excerpt; %1 is used below, so this is a macro body, presumably the one
; invoked by intra_pred_ang32_5 - confirm against the full file.
; Lane lists give 16-bit sample indices relative to r2 (byte offset / 2),
; highest lane first. A bracketed number after pmaddwd is the ang_table entry
; selected by the displacement, assuming r3 = ang_table + 16 * 16. The entry
; advances by 17 (mod 32) per row. Rounding, shifting, packing and some
; register copies between the visible lines are not shown.
9312 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
9313 movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
9314 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
9315 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
9316 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
9318 pmaddwd m4, m0, [r3 + 16] ; [17]
9322 palignr m1, m2, m0, 4 ; [6 5 5 4 4 3 3 2]
9324 pmaddwd m1, [r3 - 14 * 16] ; [2]
9329 pmaddwd m5, [r3 + 3 * 16] ; [19]
9333 palignr m6, m2, m0, 8 ; [7 6 6 5 5 4 4 3]
9335 pmaddwd m6, [r3 - 12 * 16] ; [4]
9340 pmaddwd m6, m1, [r3 + 5 * 16] ; [21]
9344 palignr m1, m2, m0, 12 ; [8 7 7 6 6 5 5 4]
9346 pmaddwd m7, [r3 - 10 * 16] ; [6]
9351 pmaddwd m1, [r3 + 7 * 16] ; [23]
9355 pmaddwd m7, m2, [r3 - 8 * 16] ; [8]
9360 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
9362 pmaddwd m4, m2, [r3 + 9 * 16] ; [25]
9366 palignr m7, m3, m2, 4 ; [10 9 9 8 8 7 7 6]
9367 pmaddwd m1, m7, [r3 - 6 * 16] ; [10]
9372 pmaddwd m5, m7, [r3 + 11 * 16] ; [27]
9376 movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17]
9377 palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10]
9378 palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11]
9379 punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14]
9380 punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10]
9382 palignr m6, m2, m7, 4
9383 pmaddwd m1, m6, [r3 - 4 * 16] ; [12]
9388 pmaddwd m6, [r3 + 13 * 16] ; [29]
9392 palignr m1, m2, m7, 8
9394 pmaddwd m1, [r3 - 2 * 16] ; [14]
9399 pmaddwd m1, m0, [r3 + 15 * 16] ; [31]
9403 palignr m0, m2, m7, 12
9404 pmaddwd m0, [r3] ; [16]
9409 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
9411 pmaddwd m4, m2, [r3 - 15 * 16] ; [1]
9415 pmaddwd m1, m2, [r3 + 2 * 16] ; [18]
9420 palignr m1, m3, m2, 4
9421 pmaddwd m5, m1, [r3 - 13 * 16] ; [3]
9425 pmaddwd m1, [r3 + 4 * 16] ; [20]
9430 palignr m1, m3, m2, 8
9431 pmaddwd m6, m1, [r3 - 11 * 16] ; [5]
9435 pmaddwd m1, [r3 + 6 * 16] ; [22]
9440 palignr m7, m3, m2, 12
9441 pmaddwd m1, m7, [r3 - 9 * 16] ; [7]
9445 pmaddwd m7, [r3 + 8 * 16] ; [24]
9450 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
9452 pmaddwd m4, m3, [r3 - 7 * 16] ; [9]
9456 pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
9461 movu m0, [r2 + 36] ; [25 24 23 22 21 20 19 18]
9462 palignr m1, m0, 2 ; [x 25 24 23 22 21 20 19]
9463 punpcklwd m0, m1 ; [22 21 21 20 20 19 19 18]
9465 palignr m1, m0, m3, 4
9466 pmaddwd m5, m1, [r3 - 5 * 16] ; [11]
9470 pmaddwd m1, [r3 + 12 * 16] ; [28]
9475 palignr m1, m0, m3, 8
9476 pmaddwd m6, m1, [r3 - 3 * 16] ; [13]
9480 pmaddwd m1, [r3 + 14 * 16] ; [30]
9485 palignr m1, m0, m3, 12
9486 pmaddwd m1, [r3 - 16] ; [15]
9490 movhps m1, [r2 + 36] ; [00]
9492 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
9494 ;------------------------------------------------------------------------------------------------------------------
9495 ; void intraPredAng32_5(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
9496 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_5: entry point declared through cglobal (x86inc argument
; order: 3 arguments, 6 general-purpose registers, 8 vector registers).
; NOTE(review): only two instructions of the body are visible in this excerpt.
9498 cglobal intra_pred_ang32_5, 3,6,8
9499 lea r3, [ang_table + 16 * 16] ; r3 = ang_table + 256, so [r3 + k * 16] reaches entry 16 + k for signed k
9506 lea r0, [r0 + r1 * 4 ] ; advance r0 by 4 * r1
; NOTE(review): the %macro line opening this body is not visible in this
; excerpt; %1 is used below, so this is a macro body, presumably the one
; invoked by intra_pred_ang32_6 - confirm against the full file.
; Lane lists give 16-bit sample indices relative to r2 (byte offset / 2),
; highest lane first. A bracketed number after pmaddwd is the ang_table entry
; selected by the displacement, assuming r3 = ang_table + 16 * 16. The entry
; advances by 13 (mod 32) per row. Rounding, shifting and packing between the
; pmaddwd lines are not shown.
9513 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
9514 movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
9515 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
9516 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
9517 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
9519 pmaddwd m4, m0, [r3 - 3 * 16] ; [13]
9523 pmaddwd m1, m0, [r3 + 10 * 16] ; [26]
9528 palignr m1, m2, m0, 4 ; [6 5 5 4 4 3 3 2]
9529 pmaddwd m5, m1, [r3 - 9 * 16] ; [7]
9533 pmaddwd m1, [r3 + 4 * 16] ; [20]
9538 palignr m1, m2, m0, 8 ; [7 6 6 5 5 4 4 3]
9539 pmaddwd m6, m1, [r3 - 15 * 16] ; [1]
9543 pmaddwd m7, m1, [r3 - 2 * 16] ; [14]
9548 pmaddwd m1, [r3 + 11 * 16] ; [27]
9552 palignr m7, m2, m0, 12 ; [8 7 7 6 6 5 5 4]
9553 pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
9558 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
9560 pmaddwd m4, m7, [r3 + 5 * 16] ; [21]
9564 pmaddwd m1, m2, [r3 - 14 * 16] ; [2]
9569 pmaddwd m5, m2, [r3 - 16] ; [15]
9573 pmaddwd m6, m2, [r3 + 12 * 16] ; [28]
9578 palignr m7, m3, m2, 4 ; [10 9 9 8 8 7 7 6]
9579 pmaddwd m6, m7, [r3 - 7 * 16] ; [9]
9583 pmaddwd m1, m7, [r3 + 6 * 16] ; [22]
9588 movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17]
9589 palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10]
9590 palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11]
9591 punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14]
9592 punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10]
9594 palignr m0, m2, m7, 4
9595 pmaddwd m1, m0, [r3 - 13 * 16] ; [3]
9599 pmaddwd m0, [r3] ; [16]
9604 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
9606 palignr m4, m2, m7, 4
9607 pmaddwd m4, [r3 + 13 * 16] ; [29]
9611 palignr m5, m2, m7, 8
9612 pmaddwd m1, m5, [r3 - 6 * 16] ; [10]
9617 pmaddwd m5, [r3 + 7 * 16] ; [23]
9621 palignr m1, m2, m7, 12
9622 pmaddwd m6, m1, [r3 - 12 * 16] ; [4]
9627 pmaddwd m6, m1, [r3 + 16] ; [17]
9631 pmaddwd m1, [r3 + 14 * 16] ; [30]
9636 pmaddwd m1, m2, [r3 - 5 * 16] ; [11]
9640 pmaddwd m0, m2, [r3 + 8 * 16] ; [24]
9645 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
9647 palignr m5, m3, m2, 4
9648 pmaddwd m4, m5, [r3 - 11 * 16] ; [5]
9652 pmaddwd m1, m5, [r3 + 2 * 16] ; [18]
9657 pmaddwd m5, [r3 + 15 * 16] ; [31]
9661 palignr m6, m3, m2, 8
9662 pmaddwd m1, m6, [r3 - 4 * 16] ; [12]
9667 pmaddwd m6, [r3 + 9 * 16] ; [25]
9671 palignr m1, m3, m2, 12
9672 pmaddwd m0, m1, [r3 - 10 * 16] ; [6]
9677 pmaddwd m1, [r3 + 3 * 16] ; [19]
9681 movhps m1, [r2 + 28] ; [00]
9683 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
9685 ;------------------------------------------------------------------------------------------------------------------
9686 ; void intraPredAng32_6(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
9687 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_6: entry point declared through cglobal (x86inc argument
; order: 3 arguments, 6 general-purpose registers, 8 vector registers).
; NOTE(review): only two instructions of the body are visible in this excerpt.
9689 cglobal intra_pred_ang32_6, 3,6,8
9690 lea r3, [ang_table + 16 * 16] ; r3 = ang_table + 256, so [r3 + k * 16] reaches entry 16 + k for signed k
9697 lea r0, [r0 + r1 * 4 ] ; advance r0 by 4 * r1
; NOTE(review): the %macro line opening this body is not visible in this
; excerpt; %1 is used below, so this is a macro body, presumably the one
; invoked by intra_pred_ang32_7 - confirm against the full file.
; Lane lists give 16-bit sample indices relative to r2 (byte offset / 2),
; highest lane first. A bracketed number after pmaddwd is the ang_table entry
; selected by the displacement, assuming r3 = ang_table + 16 * 16. The entry
; advances by 9 (mod 32) per row. Rounding, shifting, packing and some
; register copies between the visible lines are not shown; in particular m3 is
; used later as a palignr source and is presumably overwritten in between.
9704 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
9705 movd m3, [r2 + 18] ; [0 0 0 0 0 0 10 9] - movd loads only two words; just sample 9 is consumed below
9706 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
9707 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
9708 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
9710 pmaddwd m4, m0, [r3 - 7 * 16] ; [9]
9714 pmaddwd m1, m0, [r3 + 2 * 16] ; [18]
9719 pmaddwd m5, m0, [r3 + 11 * 16] ; [27]
9723 palignr m1, m2, m0, 4 ; [6 5 5 4 4 3 3 2]
9724 pmaddwd m6, m1, [r3 - 12 * 16] ; [4]
9729 pmaddwd m6, m1, [r3 - 3 * 16] ; [13]
9733 pmaddwd m7, m1, [r3 + 6 * 16] ; [22]
9738 pmaddwd m1, [r3 + 15 * 16] ; [31]
9743 palignr m7, m2, m0, 8 ; [7 6 6 5 5 4 4 3]
9744 pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
9749 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
9751 pmaddwd m4, m7, [r3 + 16] ; [17]
9755 pmaddwd m1, m7, [r3 + 10 * 16] ; [26]
9760 palignr m1, m2, m3, 12
9761 pmaddwd m5, m1, [r3 - 13 * 16] ; [3]
9765 pmaddwd m6, m1, [r3 - 4 * 16] ; [12]
9770 pmaddwd m6, m1, [r3 + 5 * 16] ; [21]
9774 pmaddwd m1, [r3 + 14 * 16] ; [30]
9779 pmaddwd m1, m2, [r3 - 9 * 16] ; [7]
9783 pmaddwd m0, m2, [r3] ; [16]
9788 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
9790 pmaddwd m4, m2, [r3 + 9 * 16] ; [25]
9794 movu m7, [r2 + 18] ; [16 15 14 13 12 11 10 9]
9795 palignr m1, m7, 2 ; [x 16 15 14 13 12 11 10]
9796 punpcklwd m7, m1 ; [13 12 12 11 11 10 10 9]
9798 palignr m6, m7, m2, 4 ; [10 9 9 8 8 7 7 6]
9799 pmaddwd m1, m6, [r3 - 14 * 16] ; [2]
9804 pmaddwd m5, m6, [r3 - 5 * 16] ; [11]
9808 pmaddwd m0, m6, [r3 + 4 * 16] ; [20]
9813 pmaddwd m6, [r3 + 13 * 16] ; [29]
9817 palignr m0, m7, m2, 8
9818 pmaddwd m1, m0, [r3 - 10 * 16] ; [6]
9823 pmaddwd m1, m0, [r3 - 16] ; [15]
9827 pmaddwd m0, [r3 + 8 * 16] ; [24]
9832 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
9834 palignr m0, m7, m2, 12
9835 pmaddwd m4, m0, [r3 - 15 * 16] ; [1]
9839 pmaddwd m1, m0, [r3 - 6 * 16] ; [10]
9844 pmaddwd m5, m0, [r3 + 3 * 16] ; [19]
9848 pmaddwd m0, [r3 + 12 * 16] ; [28]
9853 pmaddwd m6, m7, [r3 - 11 * 16] ; [5]
9857 pmaddwd m0, m7, [r3 - 2 * 16] ; [14]
9862 pmaddwd m1, m7, [r3 + 7 * 16] ; [23]
9866 movhps m1, [r2 + 20] ; [00]
9868 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
9870 ;------------------------------------------------------------------------------------------------------------------
9871 ; void intraPredAng32_7(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
9872 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_7: entry point declared through cglobal (x86inc argument
; order: 3 arguments, 6 general-purpose registers, 8 vector registers).
; NOTE(review): only two instructions of the body are visible in this excerpt.
9874 cglobal intra_pred_ang32_7, 3,6,8
9875 lea r3, [ang_table + 16 * 16] ; r3 = ang_table + 256, so [r3 + k * 16] reaches entry 16 + k for signed k
9882 lea r0, [r0 + r1 * 4 ] ; advance r0 by 4 * r1
; NOTE(review): the %macro line opening this body is not visible in this
; excerpt; %1 is used below, so this is a macro body, presumably the one
; invoked by intra_pred_ang32_8 - confirm against the full file.
; Lane lists give 16-bit sample indices relative to r2 (byte offset / 2),
; highest lane first. A bracketed number after pmaddwd is the ang_table entry
; selected by the displacement, assuming r3 = ang_table + 16 * 16. The entry
; advances by 5 (mod 32) per row. Rounding, shifting, packing and some
; register copies between the visible lines are not shown; in particular m3 is
; used later as a palignr source and is presumably overwritten in between.
9889 movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
9890 movd m3, [r2 + 18] ; [0 0 0 0 0 0 10 9] - movd loads only two words; just sample 9 is consumed below
9891 palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
9892 punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
9893 punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
9895 pmaddwd m4, m0, [r3 - 11 * 16] ; [5]
9899 pmaddwd m1, m0, [r3 - 6 * 16] ; [10]
9904 pmaddwd m5, m0, [r3 - 16] ; [15]
9908 pmaddwd m6, m0, [r3 + 4 * 16] ; [20]
9913 pmaddwd m6, m0, [r3 + 9 * 16] ; [25]
9917 pmaddwd m1, m0, [r3 + 14 * 16] ; [30]
9922 palignr m7, m2, m0, 4 ; [6 5 5 4 4 3 3 2]
9923 pmaddwd m1, m7, [r3 - 13 * 16] ; [3]
9928 pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
9933 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
9935 pmaddwd m4, m7, [r3 - 3 * 16] ; [13]
9939 pmaddwd m1, m7, [r3 + 2 * 16] ; [18]
9944 pmaddwd m5, m7, [r3 + 7 * 16] ; [23]
9948 pmaddwd m6, m7, [r3 + 12 * 16] ; [28]
9953 palignr m7, m2, m3, 8
9954 pmaddwd m6, m7, [r3 - 15 * 16] ; [1]
9958 pmaddwd m1, m7, [r3 - 10 * 16] ; [6]
9963 pmaddwd m1, m7, [r3 - 5 * 16] ; [11]
9967 pmaddwd m0, m7, [r3] ; [16]
9972 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
9974 pmaddwd m4, m7, [r3 + 5 * 16] ; [21]
9978 pmaddwd m1, m7, [r3 + 10 * 16] ; [26]
9983 pmaddwd m5, m7, [r3 + 15 * 16] ; [31]
9987 palignr m7, m2, m3, 12
9988 pmaddwd m0, m7, [r3 - 12 * 16] ; [4]
9993 pmaddwd m6, m7, [r3 - 7 * 16] ; [9]
9997 pmaddwd m1, m7, [r3 - 2 * 16] ; [14]
10002 pmaddwd m1, m7, [r3 + 3 * 16] ; [19]
10006 pmaddwd m0, m7, [r3 + 8 * 16] ; [24]
10011 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
10013 pmaddwd m4, m7, [r3 + 13 * 16] ; [29]
10017 pmaddwd m1, m2, [r3 - 14 * 16] ; [2]
10022 pmaddwd m5, m2, [r3 - 9 * 16] ; [7]
10026 pmaddwd m0, m2, [r3 - 4 * 16] ; [12]
10031 pmaddwd m6, m2, [r3 + 16] ; [17]
10035 pmaddwd m0, m2, [r3 + 6 * 16] ; [22]
10040 pmaddwd m1, m2, [r3 + 11 * 16] ; [27]
10044 movhps m1, [r2 + 12] ; [00]
10046 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
10048 ;------------------------------------------------------------------------------------------------------------------
10049 ; void intraPredAng32_8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
10050 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_8: entry point declared through cglobal (x86inc argument
; order: 3 arguments, 6 general-purpose registers, 8 vector registers).
; NOTE(review): only two instructions of the body are visible in this excerpt.
10052 cglobal intra_pred_ang32_8, 3,6,8
10053 lea r3, [ang_table + 16 * 16] ; r3 = ang_table + 256, so [r3 + k * 16] reaches entry 16 + k for signed k
10060 lea r0, [r0 + r1 * 4 ] ; advance r0 by 4 * r1
; NOTE(review): the %macro line opening this body is not visible in this
; excerpt; %1 is used below, so this is a macro body, presumably the one
; invoked by intra_pred_ang32_9 - confirm against the full file.
; Lane lists give 16-bit sample indices relative to r2 (byte offset / 2),
; highest lane first. A bracketed number after pmaddwd is the ang_table entry
; selected by the displacement, assuming r3 = ang_table + 16 * 16. The entry
; advances by 2 per row, so one interleaved register serves 15 rows and the
; row with entry 0 is loaded directly with movhps. Rounding, shifting and
; packing between the pmaddwd lines are not shown.
10067 movu m3, [r2 + 2] ; [8 7 6 5 4 3 2 1]
10068 palignr m1, m3, 2 ; [x 8 7 6 5 4 3 2] - top lane is the previous low word of m1 (9 only if m1 was preloaded; not visible here)
10069 punpckhwd m2, m3, m1 ; [x 8 8 7 7 6 6 5] - top lane inherits the uncertainty noted above
10070 punpcklwd m3, m1 ; [5 4 4 3 3 2 2 1]
10072 pmaddwd m4, m3, [r3 - 14 * 16] ; [2]
10076 pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
10081 pmaddwd m5, m3, [r3 - 10 * 16] ; [6]
10085 pmaddwd m6, m3, [r3 - 8 * 16] ; [8]
10090 pmaddwd m6, m3, [r3 - 6 * 16] ; [10]
10094 pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
10099 pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
10103 pmaddwd m0, m3, [r3] ; [16]
10108 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
10110 pmaddwd m4, m3, [r3 + 2 * 16] ; [18]
10114 pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
10119 pmaddwd m5, m3, [r3 + 6 * 16] ; [22]
10123 pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
10128 pmaddwd m6, m3, [r3 + 10 * 16] ; [26]
10132 pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
10137 pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
10142 movhps m1, [r2 + 4] ; [00]
10144 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
10146 palignr m7, m2, m3, 4 ; [6 5 5 4 4 3 3 2]
10147 pmaddwd m4, m7, [r3 - 14 * 16] ; [2]
10151 pmaddwd m1, m7, [r3 - 12 * 16] ; [4]
10156 pmaddwd m5, m7, [r3 - 10 * 16] ; [6]
10160 pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
10165 pmaddwd m6, m7, [r3 - 6 * 16] ; [10]
10169 pmaddwd m1, m7, [r3 - 4 * 16] ; [12]
10174 pmaddwd m1, m7, [r3 - 2 * 16] ; [14]
10178 pmaddwd m0, m7, [r3] ; [16]
10183 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
10185 pmaddwd m4, m7, [r3 + 2 * 16] ; [18]
10189 pmaddwd m1, m7, [r3 + 4 * 16] ; [20]
10194 pmaddwd m5, m7, [r3 + 6 * 16] ; [22]
10198 pmaddwd m0, m7, [r3 + 8 * 16] ; [24]
10203 pmaddwd m6, m7, [r3 + 10 * 16] ; [26]
10207 pmaddwd m0, m7, [r3 + 12 * 16] ; [28]
10212 pmaddwd m7, [r3 + 14 * 16] ; [30]
10216 movhps m7, [r2 + 6] ; [00]
10218 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m7
10220 ;------------------------------------------------------------------------------------------------------------------
10221 ; void intraPredAng32_9(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
10222 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_9: entry point declared through cglobal (x86inc argument
; order: 3 arguments, 6 general-purpose registers, 8 vector registers).
; NOTE(review): only two instructions of the body are visible in this excerpt.
10224 cglobal intra_pred_ang32_9, 3,6,8
10225 lea r3, [ang_table + 16 * 16] ; r3 = ang_table + 256, so [r3 + k * 16] reaches entry 16 + k for signed k
10232 lea r0, [r0 + r1 * 4 ] ; advance r0 by 4 * r1
10238 ;------------------------------------------------------------------------------------------------------------------
10239 ; void intraPredAng32_10(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
10240 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_10: entry point declared through cglobal (x86inc argument
; order: 4 arguments, 7 general-purpose registers, 8 vector registers).
; The visible lines load the c_mode32_10_0 constant (bytes 0,1 repeated, i.e.
; a byte-shuffle pattern that replicates word 0 into every lane) into m7 and
; store m1 to the +16/+32/+48 pieces of rows addressed through r1, r4 and r5,
; so each of those rows is filled with copies of one register.
; NOTE(review): the loads, the shuffles that refresh m1 for each row, the
; +0 stores, the row-pointer updates and any loop control are not shown in
; this excerpt; r4 and r5 presumably hold 2 * r1 and 3 * r1.
10242 cglobal intra_pred_ang32_10, 4,7,8
10248 mova m7, [c_mode32_10_0] ; m7 = word-0 broadcast pattern
10261 movu [r0 + r1 + 16], m1
10262 movu [r0 + r1 + 32], m1
10263 movu [r0 + r1 + 48], m1
10268 movu [r0 + r4 + 16], m1
10269 movu [r0 + r4 + 32], m1
10270 movu [r0 + r4 + 48], m1
10275 movu [r0 + r5 + 16], m1
10276 movu [r0 + r5 + 32], m1
10277 movu [r0 + r5 + 48], m1
10291 movu [r0 + r1 + 16], m1
10292 movu [r0 + r1 + 32], m1
10293 movu [r0 + r1 + 48], m1
10298 movu [r0 + r4 + 16], m1
10299 movu [r0 + r4 + 32], m1
10300 movu [r0 + r4 + 48], m1
10305 movu [r0 + r5 + 16], m1
10306 movu [r0 + r5 + 32], m1
10307 movu [r0 + r5 + 48], m1
; MODE_11_25 - macro with one argument, which is forwarded unchanged as the
; second argument of every TRANSPOSE_STORE_8x8 invocation.
; Reference samples are read from [r2 + 2] for the first 16 result rows and
; from [r2] for the last 16; pw_punpcklwd rearranges them into overlapping
; word pairs (w0 w1, w1 w2, w2 w3, w3 w4) for pmaddwd. A bracketed number
; after pmaddwd is the ang_table entry selected by the displacement, assuming
; r3 = ang_table + 16 * 16; the entry decreases by 2 per row (30, 28, ..., 2)
; and the row with entry 0 is loaded directly with movhps.
; NOTE(review): rounding, shifting and packing between the pmaddwd lines and
; the closing %endmacro are not shown in this excerpt. Lane comments in this
; macro number the samples from 0 at the load address.
10315 %macro MODE_11_25 1
10316 movu m3, [r2 + 2] ; [7 6 5 4 3 2 1 0]
10317 pshufb m3, [pw_punpcklwd] ; [4 3 3 2 2 1 1 0]
10319 pmaddwd m4, m3, [r3 + 14 * 16] ; [30]
10323 pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
10328 pmaddwd m5, m3, [r3 + 10 * 16] ; [26]
10332 pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
10337 pmaddwd m6, m3, [r3 + 6 * 16] ; [22]
10341 pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
10346 pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
10350 pmaddwd m0, m3, [r3] ; [16]
10355 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
10357 pmaddwd m4, m3, [r3 - 2 * 16] ; [14]
10361 pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
10366 pmaddwd m5, m3, [r3 - 6 * 16] ; [10]
10370 pmaddwd m6, m3, [r3 - 8 * 16] ; [8]
10375 pmaddwd m6, m3, [r3 - 10 * 16] ; [6]
10379 pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
10384 pmaddwd m1, m3, [r3 - 14 * 16] ; [2]
10389 movhps m1, [r2 + 2] ; [00]
10391 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
10393 movu m3, [r2] ; [6 5 4 3 2 1 0 16]
10394 pshufb m3, [pw_punpcklwd] ; [3 2 2 1 1 0 0 16]
10396 pmaddwd m4, m3, [r3 + 14 * 16] ; [30]
10400 pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
10405 pmaddwd m5, m3, [r3 + 10 * 16] ; [26]
10409 pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
10414 pmaddwd m6, m3, [r3 + 6 * 16] ; [22]
10418 pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
10423 pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
10427 pmaddwd m0, m3, [r3] ; [16]
10432 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
10434 pmaddwd m4, m3, [r3 - 2 * 16] ; [14]
10438 pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
10443 pmaddwd m5, m3, [r3 - 6 * 16] ; [10]
10447 pmaddwd m6, m3, [r3 - 8 * 16] ; [8]
10452 pmaddwd m6, m3, [r3 - 10 * 16] ; [6]
10456 pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
10461 pmaddwd m1, m3, [r3 - 14 * 16] ; [2]
10466 movhps m1, [r2] ; [00]
10468 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
10470 ;------------------------------------------------------------------------------------------------------------------
10471 ; void intraPredAng32_11(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
10472 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_11: entry point declared through cglobal (x86inc argument
; order: 4 arguments, 6 general-purpose registers, 7 vector registers and
; 4*mmsize+4 bytes of stack).
; The visible prologue copies 4*mmsize bytes of reference samples from [r2]
; to the stack buffer at byte offset 2, which leaves one 16-bit slot free at
; [rsp]. NOTE(review): the code that fills that slot, repoints r2 and invokes
; the prediction macro is not shown in this excerpt.
10474 cglobal intra_pred_ang32_11, 4,6,7,0-(4*mmsize+4)
10475 movu m0, [r2 + 0*mmsize]
10476 movu m1, [r2 + 1*mmsize]
10477 movu m2, [r2 + 2*mmsize]
10478 movu m3, [r2 + 3*mmsize]
10479 movu [rsp + 0*mmsize + 2], m0 ; stack copy starts 2 bytes in
10480 movu [rsp + 1*mmsize + 2], m1
10481 movu [rsp + 2*mmsize + 2], m2
10482 movu [rsp + 3*mmsize + 2], m3
10488 lea r3, [ang_table + 16 * 16] ; r3 = ang_table + 256, so [r3 + k * 16] reaches entry 16 + k for signed k
10496 lea r0, [r0 + r1 * 4 ] ; advance r0 by 4 * r1
; MODE_12_24 - macro with one argument, which is forwarded unchanged as the
; second argument of every TRANSPOSE_STORE_8x8 invocation.
; Samples are loaded from [r2 + 8] and rearranged with the shuffle mask the
; caller keeps in m2 (intra_pred_ang32_12 loads pw_punpcklwd there). A
; bracketed number after pmaddwd is the ang_table entry selected by the
; displacement, assuming r3 = ang_table + 16 * 16; the entry decreases by 5
; (mod 32) per row: 27, 22, 17, 12, 7, 2, 29, ...
; NOTE(review): whenever the entry wraps around, m3 has to be rebuilt from a
; different sample position; those reloads, the rounding/shifting/packing
; steps and the closing %endmacro are not shown in this excerpt.
10502 %macro MODE_12_24 1
10503 movu m3, [r2 + 8] ; [7 6 5 4 3 2 1 0]
10504 pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
10506 pmaddwd m4, m3, [r3 + 11 * 16] ; [27]
10510 pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
10515 pmaddwd m5, m3, [r3 + 16] ; [17]
10519 pmaddwd m6, m3, [r3 - 4 * 16] ; [12]
10524 pmaddwd m6, m3, [r3 - 9 * 16] ; [7]
10528 pmaddwd m1, m3, [r3 - 14 * 16] ; [2]
10536 pmaddwd m1, m3, [r3 + 13 * 16] ; [29]
10540 pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
10545 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
10547 pmaddwd m4, m3, [r3 + 3 * 16] ; [19]
10551 pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
10556 pmaddwd m5, m3, [r3 - 7 * 16] ; [9]
10560 pmaddwd m6, m3, [r3 - 12 * 16] ; [4]
10568 pmaddwd m6, m3, [r3 + 15 * 16] ; [31]
10572 pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
10577 pmaddwd m1, m3, [r3 + 5 * 16] ; [21]
10581 pmaddwd m0, m3, [r3] ; [16]
10586 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
10588 pmaddwd m4, m3, [r3 - 5 * 16] ; [11]
10592 pmaddwd m1, m3, [r3 - 10 * 16] ; [6]
10597 pmaddwd m5, m3, [r3 - 15 * 16] ; [1]
10604 pmaddwd m0, m3, [r3 + 12 * 16] ; [28]
10609 pmaddwd m6, m3, [r3 + 7 * 16] ; [23]
10613 pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
10618 pmaddwd m1, m3, [r3 - 3 * 16] ; [13]
10622 pmaddwd m0, m3, [r3 - 8 * 16] ; [8]
10627 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
10629 pmaddwd m4, m3, [r3 - 13 * 16] ; [3]
10636 pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
10641 pmaddwd m5, m3, [r3 + 9 * 16] ; [25]
10645 pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
10650 pmaddwd m6, m3, [r3 - 16] ; [15]
10654 pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
10659 pmaddwd m1, m3, [r3 - 11 * 16] ; [5]
10664 movhps m1, [r2] ; [00]
10666 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
10668 ;------------------------------------------------------------------------------------------------------------------
10669 ; void intraPredAng32_12(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
10670 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_12: 4 register arguments, 6 GPRs, 7 vector registers,
; stack request of 4*mmsize+10 bytes (written as a negative size; see x86inc's
; cglobal documentation for what the sign selects).
; Per the prototype above: r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
10672 cglobal intra_pred_ang32_12, 4,6,7,0-(4*mmsize+10)
; Read 4*mmsize bytes of reference samples starting at r2 ...
10673 movu m0, [r2 + 0*mmsize]
10674 movu m1, [r2 + 1*mmsize]
10675 movu m2, [r2 + 2*mmsize]
10676 movu m3, [r2 + 3*mmsize]
; ... and copy them to the stack at byte offset 8, leaving the 8 bytes below
; them unwritten by these stores.
10677 movu [rsp + 0*mmsize + 8], m0
10678 movu [rsp + 1*mmsize + 8], m1
10679 movu [rsp + 2*mmsize + 8], m2
10680 movu [rsp + 3*mmsize + 8], m3
; r3 now addresses row 16 of ang_table (16 bytes per row).
10693 lea r3, [ang_table + 16 * 16]
; m2 = shuffle mask that turns words 0..4 into the pairs (0,1)(1,2)(2,3)(3,4).
10698 mova m2, [pw_punpcklwd]
; Advance dst by four strides.
10702 lea r0, [r0 + r1 * 4 ]
; MODE_13_23 <arg>: one macro parameter, forwarded unchanged as the second
; argument of every TRANSPOSE_STORE_8x8 invocation below.
; Register expectations visible from the code:
;   r2 - base of the reference samples; the +16 byte offset equals the
;        1*mmsize offset at which intra_pred_ang32_13 / intra_pred_ang32_23
;        copy the reference onto the stack when mmsize is 16
;        (presumably r2 points at that buffer -- TODO confirm)
;   r3 - ang_table + 16*16, so [r3 + k*16] is table row 16+k; the bracketed
;        number after each pmaddwd is that row index
;   m2 - word-pair shuffle mask (both functions load pw_punpcklwd into m2)
; NOTE(review): the name suggests the body is shared by modes 13 and 23.
10708 %macro MODE_13_23 1
10709 movu m3, [r2 + 16] ; [7 6 5 4 3 2 1 0]
; Pair each word with its upper neighbour so pmaddwd can weight both at once.
10710 pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
10712 pmaddwd m4, m3, [r3 + 7 * 16] ; [23]
10716 pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
10721 pmaddwd m5, m3, [r3 - 11 * 16] ; [05]
10728 pmaddwd m6, m3, [r3 + 12 * 16] ; [28]
10733 pmaddwd m6, m3, [r3 + 3 * 16] ; [19]
10737 pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
10742 pmaddwd m1, m3, [r3 - 15 * 16] ; [01]
10749 pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
; First of four transpose/store steps (first argument 0, 16, 32, 48).
10754 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
10756 pmaddwd m4, m3, [r3 - 16] ; [15]
10760 pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
10768 pmaddwd m5, m3, [r3 + 13 * 16] ; [29]
10772 pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
10777 pmaddwd m6, m3, [r3 - 5 * 16] ; [11]
10781 pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
10789 pmaddwd m1, m3, [r3 + 9 * 16] ; [25]
10793 pmaddwd m0, m3, [r3] ; [16]
10798 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
10800 pmaddwd m4, m3, [r3 - 9 * 16] ; [07]
10807 pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
10812 pmaddwd m5, m3, [r3 + 5 * 16] ; [21]
10816 pmaddwd m0, m3, [r3 - 4 * 16] ; [12]
10821 pmaddwd m6, m3, [r3 - 13 * 16] ; [03]
10828 pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
10833 pmaddwd m1, m3, [r3 + 16] ; [17]
10837 pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
10842 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
10847 pmaddwd m4, m3, [r3 + 15 * 16] ; [31]
10851 pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
10856 pmaddwd m5, m3, [r3 - 3 * 16] ; [13]
10860 pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
10868 pmaddwd m6, m3, [r3 + 11 * 16] ; [27]
10872 pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
10877 pmaddwd m1, m3, [r3 - 7 * 16] ; [09]
; Last line of the block is taken straight from [r2] into the high half of m1
; (tag [00]: presumably weight 0, i.e. no interpolation needed -- confirm).
10882 movhps m1, [r2] ; [00]
10884 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
10886 ;------------------------------------------------------------------------------------------------------------------
10887 ; void intraPredAng32_13(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
10888 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_13: 4 register arguments, 6 GPRs, 7 vector registers,
; stack request of 5*mmsize+2 bytes (written as a negative size; see x86inc's
; cglobal documentation for what the sign selects).
; Per the prototype above: r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
10890 cglobal intra_pred_ang32_13, 4,6,7,0-(5*mmsize+2)
; Read 4*mmsize bytes of reference samples starting at r2 ...
10891 movu m0, [r2 + 0*mmsize]
10892 movu m1, [r2 + 1*mmsize]
10893 movu m2, [r2 + 2*mmsize]
10894 movu m3, [r2 + 3*mmsize]
; ... and copy them to the stack one full vector above rsp; the first mmsize
; bytes of the buffer are not written by these stores.
10895 movu [rsp + 1*mmsize], m0
10896 movu [rsp + 2*mmsize], m1
10897 movu [rsp + 3*mmsize], m2
10898 movu [rsp + 4*mmsize], m3
; shuf_mode_13_23 moves source words 7, 3 and 0 into word lanes 1, 2 and 3;
; every other mask byte is 0.
10904 pshufb m0, [shuf_mode_13_23]
10905 pshufb m1, [shuf_mode_13_23]
; r3 now addresses row 16 of ang_table (16 bytes per row).
10913 lea r3, [ang_table + 16 * 16]
; m2 = shuffle mask that turns words 0..4 into the pairs (0,1)(1,2)(2,3)(3,4).
10918 mova m2, [pw_punpcklwd]
; Advance dst by four strides.
10922 lea r0, [r0 + r1 * 4 ]
; MODE_14_22 <arg>: one macro parameter, forwarded unchanged as the second
; argument of every TRANSPOSE_STORE_8x8 invocation below.
; Register expectations visible from the code:
;   r2 - base of the reference samples; the +24 byte offset equals the
;        1*mmsize+8 offset at which intra_pred_ang32_14 / intra_pred_ang32_22
;        copy the reference onto the stack when mmsize is 16
;        (presumably r2 points at that buffer -- TODO confirm)
;   r3 - ang_table + 16*16, so [r3 + k*16] is table row 16+k; the bracketed
;        number after each pmaddwd is that row index
;   m2 - word-pair shuffle mask (both functions load pw_punpcklwd into m2)
; NOTE(review): the name suggests the body is shared by modes 14 and 22.
10928 %macro MODE_14_22 1
10929 movu m3, [r2 + 24] ; [7 6 5 4 3 2 1 0]
; Pair each word with its upper neighbour so pmaddwd can weight both at once.
10930 pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
10932 pmaddwd m4, m3, [r3 + 3 * 16] ; [19]
10936 pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
10944 pmaddwd m5, m3, [r3 + 9 * 16] ; [25]
10948 pmaddwd m6, m3, [r3 - 4 * 16] ; [12]
10956 pmaddwd m6, m3, [r3 + 15 * 16] ; [31]
10960 pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
10965 pmaddwd m1, m3, [r3 - 11 * 16] ; [05]
10972 pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
; First of four transpose/store steps (first argument 0, 16, 32, 48).
10977 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
10979 pmaddwd m4, m3, [r3 - 5 * 16] ; [11]
10986 pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
10991 pmaddwd m5, m3, [r3 + 16] ; [17]
10995 pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
11003 pmaddwd m6, m3, [r3 + 7 * 16] ; [23]
11007 pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
11015 pmaddwd m1, m3, [r3 + 13 * 16] ; [29]
11019 pmaddwd m0, m3, [r3] ; [16]
11024 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
11026 pmaddwd m4, m3, [r3 - 13 * 16] ; [03]
11033 pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
11038 pmaddwd m5, m3, [r3 - 7 * 16] ; [09]
11045 pmaddwd m0, m3, [r3 + 12 * 16] ; [28]
11050 pmaddwd m6, m3, [r3 - 16] ; [15]
11054 pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
11062 pmaddwd m1, m3, [r3 + 5 * 16] ; [21]
11066 pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
11071 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
11076 pmaddwd m4, m3, [r3 + 11 * 16] ; [27]
11080 pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
11085 pmaddwd m5, m3, [r3 - 15 * 16] ; [01]
11092 pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
11097 pmaddwd m6, m3, [r3 - 9 * 16] ; [07]
11104 pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
11109 pmaddwd m1, m3, [r3 - 3 * 16] ; [13]
; Last line of the block is taken straight from [r2] into the high half of m1
; (tag [00]: presumably weight 0, i.e. no interpolation needed -- confirm).
11114 movhps m1, [r2] ; [00]
11116 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
11118 ;------------------------------------------------------------------------------------------------------------------
11119 ; void intraPredAng32_14(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
11120 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_14: 4 register arguments, 6 GPRs, 7 vector registers,
; stack request of 5*mmsize+10 bytes (written as a negative size; see x86inc's
; cglobal documentation for what the sign selects).
; Per the prototype above: r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
11122 cglobal intra_pred_ang32_14, 4,6,7,0-(5*mmsize+10)
; Read 4*mmsize bytes of reference samples starting at r2 ...
11123 movu m0, [r2 + 0*mmsize]
11124 movu m1, [r2 + 1*mmsize]
11125 movu m2, [r2 + 2*mmsize]
11126 movu m3, [r2 + 3*mmsize]
; ... and copy them to the stack at byte offset 1*mmsize+8.
11127 movu [rsp + 1*mmsize + 8], m0
11128 movu [rsp + 2*mmsize + 8], m1
11129 movu [rsp + 3*mmsize + 8], m2
11130 movu [rsp + 4*mmsize + 8], m3
; Write one 16-bit value from r4w at offset 88, which is the first word after
; the copied block when mmsize is 16 (24 + 64 = 88).
11133 mov [rsp + 88], r4w
; shuf_mode_14_22 moves source words 7, 5, 2 and 0 into word lanes 0..3.
11139 pshufb m0, [shuf_mode_14_22]
11140 pshufb m1, [shuf_mode_14_22]
11141 pshufb m2, [shuf_mode_14_22]
; Store the low 8 bytes (those four words) of m0 at offset 14, below the copy.
11142 movh [rsp + 14], m0
; r3 now addresses row 16 of ang_table (16 bytes per row).
11146 lea r3, [ang_table + 16 * 16]
; m2 = shuffle mask that turns words 0..4 into the pairs (0,1)(1,2)(2,3)(3,4).
11151 mova m2, [pw_punpcklwd]
; Advance dst by four strides.
11155 lea r0, [r0 + r1 * 4 ]
; MODE_15_21 <arg>: one macro parameter, forwarded unchanged as the second
; argument of every TRANSPOSE_STORE_8x8 invocation below.
; Register expectations visible from the code:
;   r2 - base of the reference samples; the +32 byte offset equals the
;        2*mmsize offset at which intra_pred_ang32_15 / intra_pred_ang32_21
;        copy the reference onto the stack when mmsize is 16
;        (presumably r2 points at that buffer -- TODO confirm)
;   r3 - ang_table + 16*16, so [r3 + k*16] is table row 16+k; the bracketed
;        number after each pmaddwd is that row index
;   m2 - word-pair shuffle mask (both functions load pw_punpcklwd into m2)
; NOTE(review): the name suggests the body is shared by modes 15 and 21.
11161 %macro MODE_15_21 1
11162 movu m3, [r2 + 32] ; [7 6 5 4 3 2 1 0]
; Pair each word with its upper neighbour so pmaddwd can weight both at once.
11163 pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
11165 pmaddwd m4, m3, [r3 - 16] ; [15]
11172 pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
11177 pmaddwd m5, m3, [r3 - 3 * 16] ; [13]
11184 pmaddwd m6, m3, [r3 + 12 * 16] ; [28]
11189 pmaddwd m6, m3, [r3 - 5 * 16] ; [11]
11196 pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
11201 pmaddwd m1, m3, [r3 - 7 * 16] ; [09]
11208 pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
; First of four transpose/store steps (first argument 0, 16, 32, 48).
11213 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
11215 pmaddwd m4, m3, [r3 - 9 * 16] ; [07]
11222 pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
11227 pmaddwd m5, m3, [r3 - 11 * 16] ; [05]
11234 pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
11239 pmaddwd m6, m3, [r3 - 13 * 16] ; [03]
11246 pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
11251 pmaddwd m1, m3, [r3 - 15 * 16] ; [01]
11258 pmaddwd m0, m3, [r3] ; [16]
11263 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
11268 pmaddwd m4, m3, [r3 + 15 * 16] ; [31]
11272 pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
11280 pmaddwd m5, m3, [r3 + 13 * 16] ; [29]
11284 pmaddwd m0, m3, [r3 - 4 * 16] ; [12]
11292 pmaddwd m6, m3, [r3 + 11 * 16] ; [27]
11296 pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
11304 pmaddwd m1, m3, [r3 + 9 * 16] ; [25]
11308 pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
11313 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
11318 pmaddwd m4, m3, [r3 + 7 * 16] ; [23]
11322 pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
11330 pmaddwd m5, m3, [r3 + 5 * 16] ; [21]
11334 pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
11342 pmaddwd m6, m3, [r3 + 3 * 16] ; [19]
11346 pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
11354 pmaddwd m1, m3, [r3 + 16] ; [17]
; Last line of the block is taken straight from [r2] into the high half of m1
; (tag [00]: presumably weight 0, i.e. no interpolation needed -- confirm).
11359 movhps m1, [r2] ; [00]
11361 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
11363 ;------------------------------------------------------------------------------------------------------------------
11364 ; void intraPredAng32_15(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
11365 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_15: 4 register arguments, 6 GPRs, 7 vector registers,
; stack request of 6*mmsize+2 bytes (written as a negative size; see x86inc's
; cglobal documentation for what the sign selects).
; Per the prototype above: r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
11367 cglobal intra_pred_ang32_15, 4,6,7,0-(6*mmsize+2)
; Read 4*mmsize bytes of reference samples starting at r2 ...
11368 movu m0, [r2 + 0*mmsize]
11369 movu m1, [r2 + 1*mmsize]
11370 movu m2, [r2 + 2*mmsize]
11371 movu m3, [r2 + 3*mmsize]
; ... and copy them to the stack two full vectors above rsp.
11372 movu [rsp + 2*mmsize], m0
11373 movu [rsp + 3*mmsize], m1
11374 movu [rsp + 4*mmsize], m2
11375 movu [rsp + 5*mmsize], m3
; Write one 16-bit value from r4w at offset 96, which is the first word after
; the copied block when mmsize is 16 (32 + 64 = 96).
11378 mov [rsp + 96], r4w
; shuf_mode_15_21 moves source words 6, 4, 2 and 0 into word lanes 0..3.
11383 pshufb m0, [shuf_mode_15_21]
11384 pshufb m1, [shuf_mode_15_21]
11385 pshufb m2, [shuf_mode_15_21]
11386 pshufb m3, [shuf_mode_15_21]
; Store the low 8 bytes of m0 and m1 back-to-back at offsets 24 and 16,
; i.e. directly below the copied block when mmsize is 16.
11387 movh [rsp + 24], m0
11388 movh [rsp + 16], m1
; r3 now addresses row 16 of ang_table (16 bytes per row).
11392 lea r3, [ang_table + 16 * 16]
; m2 = shuffle mask that turns words 0..4 into the pairs (0,1)(1,2)(2,3)(3,4).
11397 mova m2, [pw_punpcklwd]
; Advance dst by four strides.
11401 lea r0, [r0 + r1 * 4 ]
; MODE_16_20 <arg>: one macro parameter, forwarded unchanged as the second
; argument of every TRANSPOSE_STORE_8x8 invocation below.
; Register expectations visible from the code:
;   r2 - base of the reference samples; the +40 byte offset equals the
;        2*mmsize+8 offset at which intra_pred_ang32_16 / intra_pred_ang32_20
;        copy the reference onto the stack when mmsize is 16
;        (presumably r2 points at that buffer -- TODO confirm)
;   r3 - ang_table + 16*16, so [r3 + k*16] is table row 16+k; the bracketed
;        number after each pmaddwd is that row index
;   m2 - word-pair shuffle mask (both functions load pw_punpcklwd into m2)
; NOTE(review): the name suggests the body is shared by modes 16 and 20.
11407 %macro MODE_16_20 1
11408 movu m3, [r2 + 40] ; [7 6 5 4 3 2 1 0]
; Pair each word with its upper neighbour so pmaddwd can weight both at once.
11409 pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
11411 pmaddwd m4, m3, [r3 - 5 * 16] ; [11]
11418 pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
11423 pmaddwd m5, m3, [r3 - 15 * 16] ; [01]
11430 pmaddwd m6, m3, [r3 - 4 * 16] ; [12]
11438 pmaddwd m6, m3, [r3 + 7 * 16] ; [23]
11442 pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
11450 pmaddwd m1, m3, [r3 - 3 * 16] ; [13]
11457 pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
; First of four transpose/store steps (first argument 0, 16, 32, 48).
11462 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
11464 pmaddwd m4, m3, [r3 - 13 * 16] ; [03]
11471 pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
11479 pmaddwd m5, m3, [r3 + 9 * 16] ; [25]
11483 pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
11491 pmaddwd m6, m3, [r3 - 16] ; [15]
11498 pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
11503 pmaddwd m1, m3, [r3 - 11 * 16] ; [05]
11510 pmaddwd m0, m3, [r3] ; [16]
11515 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
11520 pmaddwd m4, m3, [r3 + 11 * 16] ; [27]
11524 pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
11532 pmaddwd m5, m3, [r3 + 16] ; [17]
11539 pmaddwd m0, m3, [r3 + 12 * 16] ; [28]
11544 pmaddwd m6, m3, [r3 - 9 * 16] ; [07]
11551 pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
11559 pmaddwd m1, m3, [r3 + 13 * 16] ; [29]
11563 pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
11568 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
11573 pmaddwd m4, m3, [r3 + 3 * 16] ; [19]
11580 pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
11585 pmaddwd m5, m3, [r3 - 7 * 16] ; [09]
11592 pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
11600 pmaddwd m6, m3, [r3 + 15 * 16] ; [31]
11604 pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
11612 pmaddwd m1, m3, [r3 + 5 * 16] ; [21]
; Last line of the block is taken straight from [r2] into the high half of m1
; (tag [00]: presumably weight 0, i.e. no interpolation needed -- confirm).
11617 movhps m1, [r2] ; [00]
11619 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
11621 ;------------------------------------------------------------------------------------------------------------------
11622 ; void intraPredAng32_16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
11623 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_16: 4 register arguments, 6 GPRs, 7 vector registers,
; stack request of 6*mmsize+10 bytes (written as a negative size; see x86inc's
; cglobal documentation for what the sign selects).
; Per the prototype above: r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
11625 cglobal intra_pred_ang32_16, 4,6,7,0-(6*mmsize+10)
; Read 4*mmsize bytes of reference samples starting at r2 ...
11626 movu m0, [r2 + 0*mmsize]
11627 movu m1, [r2 + 1*mmsize]
11628 movu m2, [r2 + 2*mmsize]
11629 movu m3, [r2 + 3*mmsize]
; ... and copy them to the stack at byte offset 2*mmsize+8.
11630 movu [rsp + 2*mmsize + 8], m0
11631 movu [rsp + 3*mmsize + 8], m1
11632 movu [rsp + 4*mmsize + 8], m2
11633 movu [rsp + 5*mmsize + 8], m3
; Write one 16-bit value from r4w at offset 104, which is the first word after
; the copied block when mmsize is 16 (40 + 64 = 104).
11636 mov [rsp + 104], r4w
; shuf_mode_16_20 produces source words 1, 0, 7, 6, 4, 3, 1, 0 in lanes 0..7.
11641 pshufb m0, [shuf_mode_16_20]
11642 pshufb m1, [shuf_mode_16_20]
11643 pshufb m2, [shuf_mode_16_20]
11644 pshufb m3, [shuf_mode_16_20]
; The two 16-byte stores overlap in bytes 24..27; the m1 store is issued last
; and therefore determines those four bytes.
11645 movu [rsp + 24], m0
11646 movu [rsp + 12], m1
; r3 now addresses row 16 of ang_table (16 bytes per row).
11650 lea r3, [ang_table + 16 * 16]
; m2 = shuffle mask that turns words 0..4 into the pairs (0,1)(1,2)(2,3)(3,4).
11655 mova m2, [pw_punpcklwd]
; Advance dst by four strides.
11659 lea r0, [r0 + r1 * 4 ]
; MODE_17_19 <arg>: one macro parameter, forwarded unchanged as the second
; argument of every TRANSPOSE_STORE_8x8 invocation below.
; Register expectations visible from the code:
;   r2 - base of the reference samples; the +50 byte offset equals the
;        3*mmsize+2 offset at which intra_pred_ang32_17 / intra_pred_ang32_19
;        copy the reference onto the stack when mmsize is 16
;        (presumably r2 points at that buffer -- TODO confirm)
;   r3 - ang_table + 16*16, so [r3 + k*16] is table row 16+k; the bracketed
;        number after each pmaddwd is that row index
;   m2 - word-pair shuffle mask (both functions load pw_punpcklwd into m2)
; The row sequence 6,12,18,24,30,4,10,16,22,28,2,8,14,20,26,(00) is used
; twice, so the second half of the macro repeats the table rows of the first.
; NOTE(review): the name suggests the body is shared by modes 17 and 19.
11665 %macro MODE_17_19 1
11666 movu m3, [r2 + 50] ; [7 6 5 4 3 2 1 0]
; Pair each word with its upper neighbour so pmaddwd can weight both at once.
11667 pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
11669 pmaddwd m4, m3, [r3 - 10 * 16] ; [06]
11676 pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
11684 pmaddwd m5, m3, [r3 + 2 * 16] ; [18]
11691 pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
11699 pmaddwd m6, m3, [r3 + 14 * 16] ; [30]
11703 pmaddwd m1, m3, [r3 - 12 * 16] ; [04]
11711 pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
11718 pmaddwd m0, m3, [r3] ; [16]
; First of four transpose/store steps (first argument 0, 16, 32, 48).
11723 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
11728 pmaddwd m4, m3, [r3 + 6 * 16] ; [22]
11735 pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
11740 pmaddwd m5, m3, [r3 - 14 * 16] ; [02]
11747 pmaddwd m6, m3, [r3 - 8 * 16] ; [08]
11755 pmaddwd m6, m3, [r3 - 2 * 16] ; [14]
11762 pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
11770 pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
; Mid-macro [00] line: 8 bytes taken directly from [r2 + 26] into the high
; half of m1 (presumably weight 0, i.e. no interpolation -- confirm).
11775 movhps m1, [r2 + 26] ; [00]
11777 TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
11782 pmaddwd m4, m3, [r3 - 10 * 16] ; [06]
11789 pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
11797 pmaddwd m5, m3, [r3 + 2 * 16] ; [18]
11804 pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
11812 pmaddwd m6, m3, [r3 + 14 * 16] ; [30]
11816 pmaddwd m1, m3, [r3 - 12 * 16] ; [04]
11824 pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
11831 pmaddwd m0, m3, [r3] ; [16]
11836 TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
11841 pmaddwd m4, m3, [r3 + 6 * 16] ; [22]
11848 pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
11853 pmaddwd m5, m3, [r3 - 14 * 16] ; [02]
11860 pmaddwd m6, m3, [r3 - 8 * 16] ; [08]
11868 pmaddwd m6, m3, [r3 - 2 * 16] ; [14]
11875 pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
11883 pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
; Final [00] line: 8 bytes taken directly from [r2] into the high half of m1.
11888 movhps m1, [r2] ; [00]
11890 TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
11892 ;------------------------------------------------------------------------------------------------------------------
11893 ; void intraPredAng32_17(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
11894 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_17: 4 register arguments, 6 GPRs, 7 vector registers,
; stack request of 7*mmsize+4 bytes (written as a negative size; see x86inc's
; cglobal documentation for what the sign selects).
; Per the prototype above: r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
11896 cglobal intra_pred_ang32_17, 4,6,7,0-(7*mmsize+4)
; Read 4*mmsize bytes of reference samples starting at r2 ...
11897 movu m0, [r2 + 0*mmsize]
11898 movu m1, [r2 + 1*mmsize]
11899 movu m2, [r2 + 2*mmsize]
11900 movu m3, [r2 + 3*mmsize]
; ... and copy them to the stack at byte offset 3*mmsize+2.
11901 movu [rsp + 3*mmsize + 2], m0
11902 movu [rsp + 4*mmsize + 2], m1
11903 movu [rsp + 5*mmsize + 2], m2
11904 movu [rsp + 6*mmsize + 2], m3
; Write one 16-bit value from r4w at offset 114, which is the first word after
; the copied block when mmsize is 16 (50 + 64 = 114).
11907 mov [rsp + 114], r4w
; shuf_mode_17_19 produces source words 0, 7, 6, 5, 3, 2, 1, 0 in lanes 0..7.
11912 pshufb m0, [shuf_mode_17_19]
11913 pshufb m1, [shuf_mode_17_19]
11914 pshufb m2, [shuf_mode_17_19]
; m3 alone uses the mode 16/20 mask (words 1, 0, 7, 6, 4, 3, 1, 0); only its
; low 4 bytes are stored below.
; NOTE(review): mask differs from m0-m2 -- looks intentional, confirm.
11915 pshufb m3, [shuf_mode_16_20]
11916 movd [rsp + 46], m3
; 16-byte stores covering bytes 30..45 and 12..27 of the stack buffer.
11917 movu [rsp + 30], m0
11918 movu [rsp + 12], m1
; Individual 16-bit patches from r4w: offsets 30 and 12 overwrite the first
; word of the two vector stores above; offset 28 fills the 2-byte gap between
; them.
11921 mov [rsp + 30], r4w
11923 mov [rsp + 28], r4w
11925 mov [rsp + 12], r4w
; r3 now addresses row 16 of ang_table (16 bytes per row).
11927 lea r3, [ang_table + 16 * 16]
; m2 = shuffle mask that turns words 0..4 into the pairs (0,1)(1,2)(2,3)(3,4).
11932 mova m2, [pw_punpcklwd]
; Advance dst by four strides.
11936 lea r0, [r0 + r1 * 4 ]
11942 ;-------------------------------------------------------------------------------------------------------------------
11943 ; void intraPredAng32_18(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
11944 ;-------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_18: 4 register arguments, 7 GPRs, 8 vector registers, no
; stack buffer. Per the prototype above: r0 = dst, r1 = dstStride,
; r2 = refLeft, r3 = refAbove.
; No multiplies are used: every output vector is a byte-shifted window
; (palignr) over neighbouring reference vectors, and the shift shrinks by
; 2 bytes (one 16-bit sample) per row.
; Row addressing uses r0 + r1, r0 + r6 and r0 + r3.
; NOTE(review): r6 and r3 presumably hold 2*stride and 3*stride by the time
; they are used as row offsets (r3 is free once its loads are done) -- confirm.
11946 cglobal intra_pred_ang32_18, 4,7,8
; m0..m3 = 32 consecutive words from r3 (word indices given per line).
11947 movu m0, [r3] ; [7 6 5 4 3 2 1 0]
11948 movu m1, [r3 + 16] ; [15 14 13 12 11 10 9 8]
11949 movu m2, [r3 + 32] ; [23 22 21 20 19 18 17 16]
11950 movu m3, [r3 + 48] ; [31 30 29 28 27 26 25 24]
; m4, m5 = words 1..16 from r2.
11951 movu m4, [r2 + 2] ; [8 7 6 5 4 3 2 1]
11952 movu m5, [r2 + 18] ; [16 15 14 13 12 11 10 9]
; shuf_mode32_18 reverses the eight words of a vector, so the r2 samples can
; be chained in front of the r3 samples.
11964 pshufb m4, [shuf_mode32_18] ; [1 2 3 4 5 6 7 8]
11965 pshufb m5, [shuf_mode32_18] ; [9 10 11 12 13 14 15 16]
; ---- Section 1: windows over the chain m4 | m0 | m1 | m2 | m3 ----
11967 palignr m6, m0, m4, 14
11969 palignr m6, m1, m0, 14
11970 movu [r0 + r1 + 16], m6
11971 palignr m6, m2, m1, 14
11972 movu [r0 + r1 + 32], m6
11973 palignr m6, m3, m2, 14
11974 movu [r0 + r1 + 48], m6
11976 palignr m6, m0, m4, 12
11978 palignr m6, m1, m0, 12
11979 movu [r0 + r6 + 16], m6
11980 palignr m6, m2, m1, 12
11981 movu [r0 + r6 + 32], m6
11982 palignr m6, m3, m2, 12
11983 movu [r0 + r6 + 48], m6
11985 palignr m6, m0, m4, 10
11987 palignr m6, m1, m0, 10
11988 movu [r0 + r3 + 16], m6
11989 palignr m6, m2, m1, 10
11990 movu [r0 + r3 + 32], m6
11991 palignr m6, m3, m2, 10
11992 movu [r0 + r3 + 48], m6
11996 palignr m6, m0, m4, 8
11998 palignr m6, m1, m0, 8
12000 palignr m6, m2, m1, 8
12002 palignr m6, m3, m2, 8
12005 palignr m6, m0, m4, 6
12007 palignr m6, m1, m0, 6
12008 movu [r0 + r1 + 16], m6
12009 palignr m6, m2, m1, 6
12010 movu [r0 + r1 + 32], m6
12011 palignr m6, m3, m2, 6
12012 movu [r0 + r1 + 48], m6
12014 palignr m6, m0, m4, 4
12016 palignr m6, m1, m0, 4
12017 movu [r0 + r6 + 16], m6
12018 palignr m6, m2, m1, 4
12019 movu [r0 + r6 + 32], m6
12020 palignr m6, m3, m2, 4
12021 movu [r0 + r6 + 48], m6
12023 palignr m6, m0, m4, 2
12025 palignr m6, m1, m0, 2
12026 movu [r0 + r3 + 16], m6
12027 palignr m6, m2, m1, 2
12028 movu [r0 + r3 + 32], m6
12029 palignr m6, m3, m2, 2
12030 movu [r0 + r3 + 48], m6
; ---- Section 2: chain shifted by one vector: m5 | m4 | m0 | m1 | m2 ----
12039 palignr m6, m4, m5, 14
12041 palignr m6, m0, m4, 14
12042 movu [r0 + r1 + 16], m6
12043 palignr m6, m1, m0, 14
12044 movu [r0 + r1 + 32], m6
12045 palignr m6, m2, m1, 14
12046 movu [r0 + r1 + 48], m6
12048 palignr m6, m4, m5, 12
12050 palignr m6, m0, m4, 12
12051 movu [r0 + r6 + 16], m6
12052 palignr m6, m1, m0, 12
12053 movu [r0 + r6 + 32], m6
12054 palignr m6, m2, m1, 12
12055 movu [r0 + r6 + 48], m6
12057 palignr m6, m4, m5, 10
12059 palignr m6, m0, m4, 10
12060 movu [r0 + r3 + 16], m6
12061 palignr m6, m1, m0, 10
12062 movu [r0 + r3 + 32], m6
12063 palignr m6, m2, m1, 10
12064 movu [r0 + r3 + 48], m6
12068 palignr m6, m4, m5, 8
12070 palignr m6, m0, m4, 8
12072 palignr m6, m1, m0, 8
12074 palignr m6, m2, m1, 8
12077 palignr m6, m4, m5, 6
12079 palignr m6, m0, m4, 6
12080 movu [r0 + r1 + 16], m6
12081 palignr m6, m1, m0, 6
12082 movu [r0 + r1 + 32], m6
12083 palignr m6, m2, m1, 6
12084 movu [r0 + r1 + 48], m6
12086 palignr m6, m4, m5, 4
12088 palignr m6, m0, m4, 4
12089 movu [r0 + r6 + 16], m6
12090 palignr m6, m1, m0, 4
12091 movu [r0 + r6 + 32], m6
12092 palignr m6, m2, m1, 4
12093 movu [r0 + r6 + 48], m6
12095 palignr m6, m4, m5, 2
12097 palignr m6, m0, m4, 2
12098 movu [r0 + r3 + 16], m6
12099 palignr m6, m1, m0, 2
12100 movu [r0 + r3 + 32], m6
12101 palignr m6, m2, m1, 2
12102 movu [r0 + r3 + 48], m6
; m2 and m3 are word-reversed here and from now on sit at the front of the
; chain (they are no longer read as its tail).
; NOTE(review): presumably m2/m3 are refilled with further refLeft samples
; just before this point -- confirm.
12108 pshufb m2, [shuf_mode32_18]
12109 pshufb m3, [shuf_mode32_18]
; ---- Section 3: chain m2 | m5 | m4 | m0 | m1 ----
12116 palignr m6, m5, m2, 14
12118 palignr m6, m4, m5, 14
12119 movu [r0 + r1 + 16], m6
12120 palignr m6, m0, m4, 14
12121 movu [r0 + r1 + 32], m6
12122 palignr m6, m1, m0, 14
12123 movu [r0 + r1 + 48], m6
12125 palignr m6, m5, m2, 12
12127 palignr m6, m4, m5, 12
12128 movu [r0 + r6 + 16], m6
12129 palignr m6, m0, m4, 12
12130 movu [r0 + r6 + 32], m6
12131 palignr m6, m1, m0, 12
12132 movu [r0 + r6 + 48], m6
12134 palignr m6, m5, m2, 10
12136 palignr m6, m4, m5, 10
12137 movu [r0 + r3 + 16], m6
12138 palignr m6, m0, m4, 10
12139 movu [r0 + r3 + 32], m6
12140 palignr m6, m1, m0, 10
12141 movu [r0 + r3 + 48], m6
12145 palignr m6, m5, m2, 8
12147 palignr m6, m4, m5, 8
12149 palignr m6, m0, m4, 8
12151 palignr m6, m1, m0, 8
12154 palignr m6, m5, m2, 6
12156 palignr m6, m4, m5, 6
12157 movu [r0 + r1 + 16], m6
12158 palignr m6, m0, m4, 6
12159 movu [r0 + r1 + 32], m6
12160 palignr m6, m1, m0, 6
12161 movu [r0 + r1 + 48], m6
12163 palignr m6, m5, m2, 4
12165 palignr m6, m4, m5, 4
12166 movu [r0 + r6 + 16], m6
12167 palignr m6, m0, m4, 4
12168 movu [r0 + r6 + 32], m6
12169 palignr m6, m1, m0, 4
12170 movu [r0 + r6 + 48], m6
12172 palignr m6, m5, m2, 2
12174 palignr m6, m4, m5, 2
12175 movu [r0 + r3 + 16], m6
12176 palignr m6, m0, m4, 2
12177 movu [r0 + r3 + 32], m6
12178 palignr m6, m1, m0, 2
12179 movu [r0 + r3 + 48], m6
; ---- Section 4: chain m3 | m2 | m5 | m4 | m0 ----
12188 palignr m6, m2, m3, 14
12190 palignr m6, m5, m2, 14
12191 movu [r0 + r1 + 16], m6
12192 palignr m6, m4, m5, 14
12193 movu [r0 + r1 + 32], m6
12194 palignr m6, m0, m4, 14
12195 movu [r0 + r1 + 48], m6
12197 palignr m6, m2, m3, 12
12199 palignr m6, m5, m2, 12
12200 movu [r0 + r6 + 16], m6
12201 palignr m6, m4, m5, 12
12202 movu [r0 + r6 + 32], m6
12203 palignr m6, m0, m4, 12
12204 movu [r0 + r6 + 48], m6
12206 palignr m6, m2, m3, 10
12208 palignr m6, m5, m2, 10
12209 movu [r0 + r3 + 16], m6
12210 palignr m6, m4, m5, 10
12211 movu [r0 + r3 + 32], m6
12212 palignr m6, m0, m4, 10
12213 movu [r0 + r3 + 48], m6
12217 palignr m6, m2, m3, 8
12219 palignr m6, m5, m2, 8
12221 palignr m6, m4, m5, 8
12223 palignr m6, m0, m4, 8
12226 palignr m6, m2, m3, 6
12228 palignr m6, m5, m2, 6
12229 movu [r0 + r1 + 16], m6
12230 palignr m6, m4, m5, 6
12231 movu [r0 + r1 + 32], m6
12232 palignr m6, m0, m4, 6
12233 movu [r0 + r1 + 48], m6
12235 palignr m6, m2, m3, 4
12237 palignr m6, m5, m2, 4
12238 movu [r0 + r6 + 16], m6
12239 palignr m6, m4, m5, 4
12240 movu [r0 + r6 + 32], m6
12241 palignr m6, m0, m4, 4
12242 movu [r0 + r6 + 48], m6
12244 palignr m6, m2, m3, 2
12246 palignr m6, m5, m2, 2
12247 movu [r0 + r3 + 16], m6
12248 palignr m6, m4, m5, 2
12249 movu [r0 + r3 + 32], m6
12250 palignr m6, m0, m4, 2
12251 movu [r0 + r3 + 48], m6
12254 ;------------------------------------------------------------------------------------------------------------------
12255 ; void intraPredAng32_19(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
12256 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_19: same stack layout and reference preparation as
; intra_pred_ang32_17, but declared with 7 GPRs instead of 6.
; 4 register arguments, 7 vector registers, stack request of 7*mmsize+4 bytes
; (written as a negative size; see x86inc's cglobal documentation).
; Per the prototype above: r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
12258 cglobal intra_pred_ang32_19, 4,7,7,0-(7*mmsize+4)
; Read 4*mmsize bytes of reference samples starting at r2 ...
12260 movu m0, [r2 + 0*mmsize]
12261 movu m1, [r2 + 1*mmsize]
12262 movu m2, [r2 + 2*mmsize]
12263 movu m3, [r2 + 3*mmsize]
; ... and copy them to the stack at byte offset 3*mmsize+2.
12264 movu [rsp + 3*mmsize + 2], m0
12265 movu [rsp + 4*mmsize + 2], m1
12266 movu [rsp + 5*mmsize + 2], m2
12267 movu [rsp + 6*mmsize + 2], m3
; Write one 16-bit value from r4w at offset 114, which is the first word after
; the copied block when mmsize is 16 (50 + 64 = 114).
12270 mov [rsp + 114], r4w
; shuf_mode_17_19 produces source words 0, 7, 6, 5, 3, 2, 1, 0 in lanes 0..7.
12275 pshufb m0, [shuf_mode_17_19]
12276 pshufb m1, [shuf_mode_17_19]
12277 pshufb m2, [shuf_mode_17_19]
; m3 alone uses the mode 16/20 mask; only its low 4 bytes are stored below.
; NOTE(review): mask differs from m0-m2 -- looks intentional, confirm.
12278 pshufb m3, [shuf_mode_16_20]
12279 movd [rsp + 46], m3
; 16-byte stores covering bytes 30..45 and 12..27 of the stack buffer.
12280 movu [rsp + 30], m0
12281 movu [rsp + 12], m1
; Individual 16-bit patches from r4w: offsets 30 and 12 overwrite the first
; word of the two vector stores above; offset 28 fills the gap between them.
12284 mov [rsp + 30], r4w
12286 mov [rsp + 28], r4w
12288 mov [rsp + 12], r4w
; r3 now addresses row 16 of ang_table (16 bytes per row).
12290 lea r3, [ang_table + 16 * 16]
; m2 = shuffle mask that turns words 0..4 into the pairs (0,1)(1,2)(2,3)(3,4).
12295 mova m2, [pw_punpcklwd]
12307 ;------------------------------------------------------------------------------------------------------------------
12308 ; void intraPredAng32_20(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
12309 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_20: same stack layout and reference preparation as
; intra_pred_ang32_16, but declared with 7 GPRs instead of 6.
; 4 register arguments, 7 vector registers, stack request of 6*mmsize+10 bytes
; (written as a negative size; see x86inc's cglobal documentation).
; Per the prototype above: r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
12311 cglobal intra_pred_ang32_20, 4,7,7,0-(6*mmsize+10)
; Read 4*mmsize bytes of reference samples starting at r2 ...
12313 movu m0, [r2 + 0*mmsize]
12314 movu m1, [r2 + 1*mmsize]
12315 movu m2, [r2 + 2*mmsize]
12316 movu m3, [r2 + 3*mmsize]
; ... and copy them to the stack at byte offset 2*mmsize+8.
12317 movu [rsp + 2*mmsize + 8], m0
12318 movu [rsp + 3*mmsize + 8], m1
12319 movu [rsp + 4*mmsize + 8], m2
12320 movu [rsp + 5*mmsize + 8], m3
; Write one 16-bit value from r4w at offset 104, which is the first word after
; the copied block when mmsize is 16 (40 + 64 = 104).
12323 mov [rsp + 104], r4w
; shuf_mode_16_20 produces source words 1, 0, 7, 6, 4, 3, 1, 0 in lanes 0..7.
12328 pshufb m0, [shuf_mode_16_20]
12329 pshufb m1, [shuf_mode_16_20]
12330 pshufb m2, [shuf_mode_16_20]
12331 pshufb m3, [shuf_mode_16_20]
; The two 16-byte stores overlap in bytes 24..27; the m1 store is issued last
; and therefore determines those four bytes.
12332 movu [rsp + 24], m0
12333 movu [rsp + 12], m1
; r3 now addresses row 16 of ang_table (16 bytes per row).
12337 lea r3, [ang_table + 16 * 16]
; m2 = shuffle mask that turns words 0..4 into the pairs (0,1)(1,2)(2,3)(3,4).
12342 mova m2, [pw_punpcklwd]
12354 ;------------------------------------------------------------------------------------------------------------------
12355 ; void intraPredAng32_21(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
12356 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_21: same stack layout and reference preparation as
; intra_pred_ang32_15, but declared with 7 GPRs instead of 6.
; 4 register arguments, 7 vector registers, stack request of 6*mmsize+2 bytes
; (written as a negative size; see x86inc's cglobal documentation).
; Per the prototype above: r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
12358 cglobal intra_pred_ang32_21, 4,7,7,0-(6*mmsize+2)
; Read 4*mmsize bytes of reference samples starting at r2 ...
12360 movu m0, [r2 + 0*mmsize]
12361 movu m1, [r2 + 1*mmsize]
12362 movu m2, [r2 + 2*mmsize]
12363 movu m3, [r2 + 3*mmsize]
; ... and copy them to the stack two full vectors above rsp.
12364 movu [rsp + 2*mmsize], m0
12365 movu [rsp + 3*mmsize], m1
12366 movu [rsp + 4*mmsize], m2
12367 movu [rsp + 5*mmsize], m3
; Write one 16-bit value from r4w at offset 96, which is the first word after
; the copied block when mmsize is 16 (32 + 64 = 96).
12370 mov [rsp + 96], r4w
; shuf_mode_15_21 moves source words 6, 4, 2 and 0 into word lanes 0..3.
12375 pshufb m0, [shuf_mode_15_21]
12376 pshufb m1, [shuf_mode_15_21]
12377 pshufb m2, [shuf_mode_15_21]
12378 pshufb m3, [shuf_mode_15_21]
; Store the low 8 bytes of m0 and m1 back-to-back at offsets 24 and 16,
; i.e. directly below the copied block when mmsize is 16.
12379 movh [rsp + 24], m0
12380 movh [rsp + 16], m1
; r3 now addresses row 16 of ang_table (16 bytes per row).
12384 lea r3, [ang_table + 16 * 16]
; m2 = shuffle mask that turns words 0..4 into the pairs (0,1)(1,2)(2,3)(3,4).
12389 mova m2, [pw_punpcklwd]
12401 ;------------------------------------------------------------------------------------------------------------------
12402 ; void intraPredAng32_22(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
12403 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_22: same stack layout and reference preparation as
; intra_pred_ang32_14, but declared with 7 GPRs instead of 6.
; 4 register arguments, 7 vector registers, stack request of 5*mmsize+10 bytes
; (written as a negative size; see x86inc's cglobal documentation).
; Per the prototype above: r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
12405 cglobal intra_pred_ang32_22, 4,7,7,0-(5*mmsize+10)
; Read 4*mmsize bytes of reference samples starting at r2 ...
12407 movu m0, [r2 + 0*mmsize]
12408 movu m1, [r2 + 1*mmsize]
12409 movu m2, [r2 + 2*mmsize]
12410 movu m3, [r2 + 3*mmsize]
; ... and copy them to the stack at byte offset 1*mmsize+8.
12411 movu [rsp + 1*mmsize + 8], m0
12412 movu [rsp + 2*mmsize + 8], m1
12413 movu [rsp + 3*mmsize + 8], m2
12414 movu [rsp + 4*mmsize + 8], m3
; Write one 16-bit value from r4w at offset 88, which is the first word after
; the copied block when mmsize is 16 (24 + 64 = 88).
12417 mov [rsp + 88], r4w
; shuf_mode_14_22 moves source words 7, 5, 2 and 0 into word lanes 0..3.
12423 pshufb m0, [shuf_mode_14_22]
12424 pshufb m1, [shuf_mode_14_22]
12425 pshufb m2, [shuf_mode_14_22]
; Store the low 8 bytes (those four words) of m0 at offset 14, below the copy.
12426 movh [rsp + 14], m0
; r3 now addresses row 16 of ang_table (16 bytes per row).
12430 lea r3, [ang_table + 16 * 16]
; m2 = shuffle mask that turns words 0..4 into the pairs (0,1)(1,2)(2,3)(3,4).
12435 mova m2, [pw_punpcklwd]
12447 ;------------------------------------------------------------------------------------------------------------------
12448 ; void intraPredAng32_23(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
12449 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_23: same stack layout and reference preparation as
; intra_pred_ang32_13, but declared with 7 GPRs instead of 6.
; 4 register arguments, 7 vector registers, stack request of 5*mmsize+2 bytes
; (written as a negative size; see x86inc's cglobal documentation).
; Per the prototype above: r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
12451 cglobal intra_pred_ang32_23, 4,7,7,0-(5*mmsize+2)
; Read 4*mmsize bytes of reference samples starting at r2 ...
12453 movu m0, [r2 + 0*mmsize]
12454 movu m1, [r2 + 1*mmsize]
12455 movu m2, [r2 + 2*mmsize]
12456 movu m3, [r2 + 3*mmsize]
; ... and copy them to the stack one full vector above rsp.
12457 movu [rsp + 1*mmsize], m0
12458 movu [rsp + 2*mmsize], m1
12459 movu [rsp + 3*mmsize], m2
12460 movu [rsp + 4*mmsize], m3
; shuf_mode_13_23 moves source words 7, 3 and 0 into word lanes 1, 2 and 3;
; every other mask byte is 0.
12466 pshufb m0, [shuf_mode_13_23]
12467 pshufb m1, [shuf_mode_13_23]
; r3 now addresses row 16 of ang_table (16 bytes per row).
12475 lea r3, [ang_table + 16 * 16]
; m2 = shuffle mask that turns words 0..4 into the pairs (0,1)(1,2)(2,3)(3,4).
12480 mova m2, [pw_punpcklwd]
12492 ;------------------------------------------------------------------------------------------------------------------
12493 ; void intraPredAng32_24(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
12494 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_24: same stack layout as intra_pred_ang32_12, but declared
; with 7 GPRs instead of 6.
; 4 register arguments, 7 vector registers, stack request of 4*mmsize+10 bytes
; (written as a negative size; see x86inc's cglobal documentation).
; Per the prototype above: r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
12496 cglobal intra_pred_ang32_24, 4,7,7,0-(4*mmsize+10)
; Read 4*mmsize bytes of reference samples starting at r2 ...
12498 movu m0, [r2 + 0*mmsize]
12499 movu m1, [r2 + 1*mmsize]
12500 movu m2, [r2 + 2*mmsize]
12501 movu m3, [r2 + 3*mmsize]
; ... and copy them to the stack at byte offset 8, leaving the 8 bytes below
; them unwritten by these stores.
12503 movu [rsp + 0*mmsize + 8], m0
12504 movu [rsp + 1*mmsize + 8], m1
12505 movu [rsp + 2*mmsize + 8], m2
12506 movu [rsp + 3*mmsize + 8], m3
; r3 now addresses row 16 of ang_table (16 bytes per row).
12519 lea r3, [ang_table + 16 * 16]
; m2 = shuffle mask that turns words 0..4 into the pairs (0,1)(1,2)(2,3)(3,4).
12525 mova m2, [pw_punpcklwd]
12536 ;------------------------------------------------------------------------------------------------------------------
12537 ; void intraPredAng32_25(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
12538 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_25: same stack layout as intra_pred_ang32_11, but declared
; with 7 GPRs instead of 6.
; 4 register arguments, 7 vector registers, stack request of 4*mmsize+4 bytes
; (written as a negative size; see x86inc's cglobal documentation).
; Per the prototype above: r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
12540 cglobal intra_pred_ang32_25, 4,7,7,0-(4*mmsize+4)
; Read 4*mmsize bytes of reference samples starting at r2 ...
12542 movu m0, [r2 + 0*mmsize]
12543 movu m1, [r2 + 1*mmsize]
12544 movu m2, [r2 + 2*mmsize]
12545 movu m3, [r2 + 3*mmsize]
; ... and copy them into the stack buffer 2 bytes above rsp, so the first
; 2 bytes of the buffer are not written by these stores.
12546 movu [rsp + 0*mmsize + 2], m0
12547 movu [rsp + 1*mmsize + 2], m1
12548 movu [rsp + 2*mmsize + 2], m2
12549 movu [rsp + 3*mmsize + 2], m3
; r3 now addresses row 16 of ang_table (16 bytes per row).
12555 lea r3, [ang_table + 16 * 16]
12571 ;------------------------------------------------------------------------------------------------------------------
12572 ; void intraPredAng32_26(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
12573 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_26: 4 register arguments, 7 GPRs, 5 vector registers, no
; stack buffer. Per the prototype above: r0 = dst, r1 = dstStride,
; r2 = refLeft, r3 = refAbove.
; The same three vectors m1, m2, m3 are written at byte offsets 16, 32 and 48
; of the rows addressed by r0 + r1, r0 + r2 and r0 + r4.
; NOTE(review): r2 and r4 are used as row offsets here, so they presumably
; hold multiples of the stride by this point -- confirm.
12575 cglobal intra_pred_ang32_26, 4,7,5
; c_mode32_10_0 is a shuffle mask that replicates word 0 into all eight lanes.
12581 mova m4, [c_mode32_10_0]
; First group of three rows.
12595 movu [r0 + r1 + 16], m1
12596 movu [r0 + r1 + 32], m2
12597 movu [r0 + r1 + 48], m3
12600 movu [r0 + r2 + 16], m1
12601 movu [r0 + r2 + 32], m2
12602 movu [r0 + r2 + 48], m3
12605 movu [r0 + r4 + 16], m1
12606 movu [r0 + r4 + 32], m2
12607 movu [r0 + r4 + 48], m3
; Second group: identical store pattern.
12617 movu [r0 + r1 + 16], m1
12618 movu [r0 + r1 + 32], m2
12619 movu [r0 + r1 + 48], m3
12622 movu [r0 + r2 + 16], m1
12623 movu [r0 + r2 + 32], m2
12624 movu [r0 + r2 + 48], m3
12627 movu [r0 + r4 + 16], m1
12628 movu [r0 + r4 + 32], m2
12629 movu [r0 + r4 + 48], m3
12636 ;------------------------------------------------------------------------------------------------------------------
12637 ; void intraPredAng32_27(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
12638 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_27: 4 register arguments, 7 GPRs, 8 vector registers, no
; stack buffer requested.
12640 cglobal intra_pred_ang32_27, 4,7,8
; r3 addresses row 16 of ang_table (16 bytes per row), replacing the fourth
; argument it held on entry.
12642 lea r3, [ang_table + 16 * 16]
12657 ;------------------------------------------------------------------------------------------------------------------
12658 ; void intraPredAng32_28(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
12659 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_28: 4 register arguments, 7 GPRs, 8 vector registers, no
; stack buffer requested.
12661 cglobal intra_pred_ang32_28, 4,7,8
; r3 addresses row 16 of ang_table (16 bytes per row), replacing the fourth
; argument it held on entry.
12663 lea r3, [ang_table + 16 * 16]
12678 ;------------------------------------------------------------------------------------------------------------------
12679 ; void intraPredAng32_29(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
12680 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_29: 4 register arguments, 7 GPRs, 8 vector registers, no
; stack buffer requested.
12682 cglobal intra_pred_ang32_29, 4,7,8
; r3 addresses row 16 of ang_table (16 bytes per row), replacing the fourth
; argument it held on entry.
12684 lea r3, [ang_table + 16 * 16]
12699 ;------------------------------------------------------------------------------------------------------------------
12700 ; void intraPredAng32_30(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
12701 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_30: 4 register arguments, 7 GPRs, 8 vector registers, no
; stack buffer requested.
12703 cglobal intra_pred_ang32_30, 4,7,8
; r3 addresses row 16 of ang_table (16 bytes per row), replacing the fourth
; argument it held on entry.
12705 lea r3, [ang_table + 16 * 16]
12720 ;------------------------------------------------------------------------------------------------------------------
12721 ; void intraPredAng32_31(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
12722 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_31: 4 register arguments, 7 GPRs, 8 vector registers, no
; stack buffer requested.
12724 cglobal intra_pred_ang32_31, 4,7,8
; r3 addresses row 16 of ang_table (16 bytes per row), replacing the fourth
; argument it held on entry.
12726 lea r3, [ang_table + 16 * 16]
12741 ;------------------------------------------------------------------------------------------------------------------
12742 ; void intraPredAng32_32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
12743 ;------------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_32: 4 register arguments, 7 GPRs, 8 vector registers, no
; stack buffer requested.
12745 cglobal intra_pred_ang32_32, 4,7,8
; r3 addresses row 16 of ang_table (16 bytes per row), replacing the fourth
; argument it held on entry.
12747 lea r3, [ang_table + 16 * 16]
12762 ;------------------------------------------------------------------------------------------------------------------
12763 ; void intraPredAng32_33(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
12764 ;------------------------------------------------------------------------------------------------------------------
12766 cglobal intra_pred_ang32_33, 4,7,8
12768 lea r3, [ang_table + 16 * 16]