1 ;*****************************************************************************
2 ;* Copyright (C) 2013 x265 project
4 ;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
6 ;* This program is free software; you can redistribute it and/or modify
7 ;* it under the terms of the GNU General Public License as published by
8 ;* the Free Software Foundation; either version 2 of the License, or
9 ;* (at your option) any later version.
11 ;* This program is distributed in the hope that it will be useful,
12 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ;* GNU General Public License for more details.
16 ;* You should have received a copy of the GNU General Public License
17 ;* along with this program; if not, write to the Free Software
18 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20 ;* This program is also available under a commercial proprietary license.
21 ;* For more information, contact us at license @ x265.com.
22 ;*****************************************************************************/
25 %include "x86util.asm"
; Constant byte tables. Several are referenced further down as pshufb masks
; (e.g. pb_0_8, pb_unpackbw1, c_trans_4x4, pb_unpackbq, c_mode16_15/16/17).
; NOTE(review): the leading numbers jump from 25 to 29, so a section directive
; is presumably missing from this excerpt - confirm.
; NOTE(review): pb_0_8, pb_unpackbw1 and c_trans_4x4 are declared without a
; trailing colon, unlike the remaining labels.
29 pb_0_8 times 8 db 0, 8
30 pb_unpackbw1 times 2 db 1, 8, 2, 8, 3, 8, 4, 8
31 pb_swap8: times 2 db 7, 6, 5, 4, 3, 2, 1, 0
32 c_trans_4x4 db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
33 tab_Si: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
34 pb_fact0: db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
35 c_mode32_12_0: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 7, 0
36 c_mode32_13_0: db 3, 6, 10, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
37 c_mode32_13_shuf: db 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0
38 c_mode32_14_shuf: db 15, 14, 13, 0, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15
39 c_mode32_14_0: db 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
40 c_mode32_15_0: db 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0
41 c_mode32_16_0: db 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0
42 c_mode32_17_0: db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0
43 c_mode32_18_0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
44 c_shuf8_0: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
45 c_deinterval8: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
46 tab_S1: db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0 ; same 16 bytes as c_mode32_17_0
47 pb_unpackbq: db 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
48 c_mode16_12: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6
49 c_mode16_13: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4
50 c_mode16_14: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2
51 c_mode16_15: db 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2
52 c_mode16_16: db 8, 6, 5, 3, 2, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2
53 c_mode16_17: db 4, 2, 1, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1
54 c_mode16_18: db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
55 tab_S2: db 0, 1, 3, 5, 7, 9, 11, 13, 0, 0, 0, 0, 0, 0, 0, 0
75 ;-----------------------------------------------------------------------------
76 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
77 ;-----------------------------------------------------------------------------
; intra_pred_dc4: 4x4 DC prediction. Visible steps: sum the neighbours
; (psadbw), round to dc_val = (sum + 4) / 8, broadcast it as bytes and store
; rows; then form DC * 3 + 2 and write single bytes back with pextrb.
; NOTE(review): the leading numbers skip (79 -> 87 ...), so several
; instructions, including the branch structure, are not visible in this excerpt.
79 cglobal intra_pred_dc4, 4,6,3
87 psadbw m1, m0 ; m1 = sum
93 pmulhrsw m1, m2 ; m1 = (sum + 4) / 8
94 movd r4d, m1 ; r4d = dc_val
95 pshufb m1, m0 ; m1 = byte [dc_val ...]
101 movd [r0 + r1 * 2], m1
106 lea r5d, [r4d * 2 + 2] ; r5d = DC * 2 + 2
107 add r4d, r5d ; r4d = DC * 3 + 2
109 pshuflw m1, m1, 0 ; m1 = pixDCx3
116 movd [r0], m2 ; overwrite top-left pixel, we will update it later
128 pmovzxbw m2, [r2 + 1] ; widen bytes at r2 + 1 to words
133 pextrb [r0 + r1], m2, 1 ; one byte to the start of row 1
134 pextrb [r0 + r1 * 2], m2, 2 ; one byte to the start of row 2
140 ;-------------------------------------------------------------------------------------------
141 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
142 ;-------------------------------------------------------------------------------------------
; intra_pred_dc8: 8x8 DC prediction. Visible steps: sum / 16, byte broadcast,
; row pointer advanced two rows at a time, then DC * 3 + 2 and single-byte
; pextrb stores through r6.
; NOTE(review): the leading numbers skip, so stores between the lea lines and
; the setup of r6 are not visible in this excerpt.
144 cglobal intra_pred_dc8, 4, 7, 3
158 shr r5d, 4 ; sum = sum / 16
160 pshufb m1, m0 ; m1 = byte [dc_val ...]
168 lea r0, [r0 + r1 * 2] ; advance two rows
171 lea r0, [r0 + r1 * 2]
174 lea r0, [r0 + r1 * 2]
180 lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2
181 add r5d, r4d ; r5d = DC * 3 + 2
183 pshuflw m1, m1, 0 ; m1 = pixDCx3
203 pmovzxbw m2, [r2 + 1] ; widen bytes at r2 + 1 to words
208 pextrb [r6 + r1], m2, 1
209 pextrb [r6 + 2 * r1], m2, 2
210 lea r6, [r6 + r1 * 2]
211 pextrb [r6 + r1], m2, 3
212 pextrb [r6 + r1 * 2], m2, 4
213 pextrb [r6 + r1 * 4], m2, 6 ; element 6 is stored before element 5
215 pextrb [r6 + r1], m2, 5 ; NOTE(review): an r6 update between 213 and 215 is presumably not visible here
220 ;-------------------------------------------------------------------------------------------
221 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
222 ;-------------------------------------------------------------------------------------------
; intra_pred_dc16: 16x16 DC prediction. Visible steps: sum / 32, byte
; broadcast, row pointer advanced two rows at a time, DC * 3 + 2, then
; single-byte pextrb stores through r6 from m2 (bytes at r2 + 1) and m3
; (bytes at r2 + 9).
; NOTE(review): the leading numbers skip, so the row stores and the setup of
; r6 are not visible in this excerpt.
224 cglobal intra_pred_dc16, 5, 7, 4
239 shr r5d, 5 ; sum = sum / 32
241 pshufb m1, m0 ; m1 = byte [dc_val ...]
249 lea r0, [r0 + r1 * 2] ; advance two rows
252 lea r0, [r0 + r1 * 2]
255 lea r0, [r0 + r1 * 2]
258 lea r0, [r0 + r1 * 2]
261 lea r0, [r0 + r1 * 2]
264 lea r0, [r0 + r1 * 2]
267 lea r0, [r0 + r1 * 2]
273 lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2
274 add r5d, r4d ; r5d = DC * 3 + 2
276 pshuflw m1, m1, 0 ; m1 = pixDCx3
285 pmovzxbw m3, [r3 + 8]
301 pmovzxbw m2, [r2 + 1] ; first 8 bytes at r2 + 1, widened to words
306 pextrb [r6 + r1], m2, 1
307 pextrb [r6 + r1 * 2], m2, 2
308 lea r6, [r6 + r1 * 2]
309 pextrb [r6 + r1], m2, 3
310 pextrb [r6 + r1 * 2], m2, 4
311 lea r6, [r6 + r1 * 2]
312 pextrb [r6 + r1], m2, 5
313 pextrb [r6 + r1 * 2], m2, 6
314 lea r6, [r6 + r1 * 2]
315 pextrb [r6 + r1], m2, 7
317 pmovzxbw m3, [r2 + 9] ; next 8 bytes at r2 + 9, widened to words
321 pextrb [r6 + r1 * 2], m3, 0
322 lea r6, [r6 + r1 * 2]
323 pextrb [r6 + r1], m3, 1
324 pextrb [r6 + r1 * 2], m3, 2
325 lea r6, [r6 + r1 * 2]
326 pextrb [r6 + r1], m3, 3
327 pextrb [r6 + r1 * 2], m3, 4
328 lea r6, [r6 + r1 * 2]
329 pextrb [r6 + r1], m3, 5
330 pextrb [r6 + r1 * 2], m3, 6
335 ;-------------------------------------------------------------------------------------------
336 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
337 ;-------------------------------------------------------------------------------------------
; intra_pred_dc32: 32x32 DC prediction. Visible steps: sum / 64, byte
; broadcast into m1, then repeated 16-byte stores of m1 at column offset 16 of
; the odd row followed by a two-row pointer advance.
; NOTE(review): the leading numbers skip, so the other stores of each row pair
; and any loop structure are not visible in this excerpt.
339 cglobal intra_pred_dc32, 4, 5, 5
359 shr r4d, 6 ; sum = sum / 64
361 pshufb m1, m0 ; m1 = byte [dc_val ...]
368 movu [r0 + r1 + 16],m1 ; second 16 bytes of the row at r0 + r1
369 lea r0, [r0 + 2 * r1] ; advance two rows
373 movu [r0 + r1 + 16],m1
374 lea r0, [r0 + 2 * r1]
378 movu [r0 + r1 + 16],m1
379 lea r0, [r0 + 2 * r1]
383 movu [r0 + r1 + 16],m1
384 lea r0, [r0 + 2 * r1]
388 movu [r0 + r1 + 16],m1
389 lea r0, [r0 + 2 * r1]
393 movu [r0 + r1 + 16],m1
394 lea r0, [r0 + 2 * r1]
398 movu [r0 + r1 + 16],m1
399 lea r0, [r0 + 2 * r1]
403 movu [r0 + r1 + 16],m1
404 lea r0, [r0 + 2 * r1]
409 ;-----------------------------------------------------------------------------------------------------------
410 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
411 ;-----------------------------------------------------------------------------------------------------------
; intra_pred_planar4: 4x4 planar prediction. Sets up topRow / bottomRow words,
; then the COMP_PRED_PLANAR_2ROW macro is expanded twice (arguments 0 and 2),
; each expansion reading left[%1] and left[%1 + 1] and advancing r0 two rows.
; NOTE(review): the leading numbers skip, so parts of the macro body (and its
; %endmacro) are not visible in this excerpt.
413 cglobal intra_pred_planar4, 4,7,5
416 pmovzxbw m0, [r3] ; topRow[i] = above[i];
420 movd m2, [r2 + 4] ; bottomLeft = left[4]
421 movzx r6d, byte [r3 + 4] ; topRight = above[4];
424 psubw m2, m0 ; bottomRow[i] = bottomLeft - topRow[i]
426 punpcklqdq m3, m2, m1
430 %macro COMP_PRED_PLANAR_2ROW 1
431 movzx r4d, byte [r2 + %1] ; left[%1]
432 lea r4d, [r4d * 4 + 4] ; left * 4 + 4
436 movzx r4d, byte [r2 + %1 + 1] ; left[%1 + 1]
437 lea r4d, [r4d * 4 + 4]
440 punpcklqdq m3, m4 ; horPred
442 movzx r4d, byte [r2 + %1]
448 movzx r4d, byte [r2 + %1 + 1]
453 punpcklqdq m4, m1 ; rightColumnN
455 pmullw m4, [multi_2Row]
465 lea r0, [r0 + 2 * r1] ; advance two rows
468 COMP_PRED_PLANAR_2ROW 0
469 COMP_PRED_PLANAR_2ROW 2
473 ;-----------------------------------------------------------------------------------------------------------
474 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
475 ;-----------------------------------------------------------------------------------------------------------
; intra_pred_planar8: 8x8 planar prediction. Widens above/left to words,
; derives bottomRow and rightColumn differences, scales topRow/leftColumn by 8,
; then PRED_PLANAR_ROW8 broadcasts word %1 (pshuflw for the low half, pshufhw
; with %1 - 4 for the high half).
; NOTE(review): the leading numbers skip, so the conditional structure inside
; the macro and the row stores are not visible in this excerpt.
477 cglobal intra_pred_planar8, 4,4,7
481 pmovzxbw m1, [r3] ; v_topRow
482 pmovzxbw m2, [r2] ; v_leftColumn
484 movd m3, [r3 + 8] ; topRight = above[8];
485 movd m4, [r2 + 8] ; bottomLeft = left[8];
489 punpcklbw m3, m0 ; v_topRight
490 punpcklbw m4, m0 ; v_bottomLeft
492 psubw m4, m1 ; v_bottomRow
493 psubw m3, m2 ; v_rightColumn
495 psllw m1, 3 ; v_topRow
496 psllw m2, 3 ; v_leftColumn
500 %macro PRED_PLANAR_ROW8 1
502 pshuflw m5, m6, 0x55 * %1 ; 0x55 * k selects word k in all four low lanes
504 pshuflw m2, m3, 0x55 * %1
507 pshufhw m5, m6, 0x55 * (%1 - 4) ; same selection within the high four words
509 pshufhw m2, m3, 0x55 * (%1 - 4)
537 ;-----------------------------------------------------------------------------------------------------------
538 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
539 ;-----------------------------------------------------------------------------------------------------------
; intra_pred_planar16: 16x16 planar prediction. Widens the 16 above bytes into
; two word vectors, forms bottomRow[0..1] = bottomLeft - topRow, then
; PRED_PLANAR_ROW16 processes the row whose left pixel is at [r2 + %1].
; NOTE(review): the leading numbers skip, so parts of the macro body and the
; row stores are not visible in this excerpt.
541 cglobal intra_pred_planar16, 4,6,8
545 pmovzxbw m1, [r3] ; topRow[0-7]
546 pmovzxbw m2, [r3 + 8] ; topRow[8-15]
550 punpcklbw m3, m0 ; v_bottomLeft = left[16]
551 movzx r4d, byte [r3 + 16] ; topRight = above[16]
553 psubw m4, m3, m1 ; v_bottomRow[0]
554 psubw m5, m3, m2 ; v_bottomRow[1]
559 %macro PRED_PLANAR_ROW16 1
560 movzx r5d, byte [r2 + %1] ; left[%1]
562 lea r5d, [r5d * 8 + 16]
565 pshufd m3, m3, 0 ; horPred
567 movzx r5d, byte [r2 + %1]
574 pmullw m7, m6, [multiL]
580 pmullw m6, m6, [multiH]
611 ;-----------------------------------------------------------------------------------------------------------
612 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
613 ;-----------------------------------------------------------------------------------------------------------
; intra_pred_planar32: 32x32 planar prediction. Two cglobal variants are
; visible: one keeps bottomRow0..3 in m8..m11 (12 vector registers), the other
; reserves 4 * mmsize bytes of stack and keeps them at [rsp + N * mmsize].
; NOTE(review): presumably these are the two arms of an %if/%else on the
; target architecture; the conditional lines are not visible here - confirm.
; NOTE(review): the leading numbers skip, so most of COMP_PRED_PLANAR_ROW and
; the stores are not visible in this excerpt.
616 cglobal intra_pred_planar32, 4,7,12
617 %define bottomRow0 m8
618 %define bottomRow1 m9
619 %define bottomRow2 m10
620 %define bottomRow3 m11
622 cglobal intra_pred_planar32, 4,7,8,0-(4*mmsize)
623 %define bottomRow0 [rsp + 0 * mmsize]
624 %define bottomRow1 [rsp + 1 * mmsize]
625 %define bottomRow2 [rsp + 2 * mmsize]
626 %define bottomRow3 [rsp + 3 * mmsize]
633 punpcklbw m0, m3 ; v_bottomLeft = left[32]
634 movzx r4d, byte [r3 + 32] ; topRight = above[32]
636 pmovzxbw m1, [r3 + 0] ; topRow[0]
637 pmovzxbw m2, [r3 + 8] ; topRow[1]
638 pmovzxbw m3, [r3 +16] ; topRow[2]
639 pmovzxbw m4, [r3 +24] ; topRow[3]
641 psubw m5, m0, m1 ; v_bottomRow[0]
642 psubw m6, m0, m2 ; v_bottomRow[1]
643 psubw m7, m0, m3 ; v_bottomRow[2]
644 psubw m0, m4 ; v_bottomRow[3]
656 %macro COMP_PRED_PLANAR_ROW 1
662 pshufd m5, m5, 0 ; horPred
672 pmullw m7, m6, [multiL]
674 pmullw m7, m6, [multiH2]
708 COMP_PRED_PLANAR_ROW 0
709 COMP_PRED_PLANAR_ROW 16
715 %undef COMP_PRED_PLANAR_ROW
719 ;-----------------------------------------------------------------------------
720 ; void intraPredAng(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
721 ;-----------------------------------------------------------------------------
; intra_pred_ang4_2: 4x4 angular prediction, mode 2. Only the entry line and
; one 4-byte row store are visible.
; NOTE(review): the leading numbers skip (723 -> 731), so the loads and the
; other row stores are not visible in this excerpt.
723 cglobal intra_pred_ang4_2, 3,3,4
731 movd [r0 + r1 * 2], m2 ; 4 bytes to row 2
; intra_pred_ang4_3: 4x4 angular prediction, mode 3. Builds interleaved
; neighbour pairs from the bytes at [r2 + 1], loads ang_table entries
; 26/20/14/8 into m3/m4, then runs the filter/store path that the other 4x4
; angular routines jump into (they target a .do_filter4x4 label here).
; NOTE(review): the leading numbers skip, so the .do_filter4x4 label itself and
; the multiply/pack instructions are not visible in this excerpt.
739 cglobal intra_pred_ang4_3, 3,4,5
742 lea r3, [ang_table + 20 * 16] ; r3 = &ang_table[20], entries are 16 bytes apart
743 movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
744 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
745 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
746 palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
747 palignr m2, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3]
748 palignr m3, m0, 6 ; [x x x x x x x x 8 7 7 6 6 5 5 4]
752 movh m3, [r3 + 6 * 16] ; [26]
753 movhps m3, [r3] ; [20]
754 movh m4, [r3 - 6 * 16] ; [14]
755 movhps m4, [r3 - 12 * 16] ; [ 8]
758 ; NOTE: share path, input is m0=[1 0], m2=[3 2], m3,m4=coef, flag_z=no_transpose
769 ; NOTE: mode 33 doesn't reorder. UNSAFE: this relies on no instruction before this point modifying EFLAGS
773 pshufb m0, [c_trans_4x4] ; c_trans_4x4 = 0,4,8,12,1,5,9,13,... : transposes a 4x4 byte block
776 ; TODO: use pextrd here after intrinsic ssse3 removed
; NOTE(review): the stores below already use pextrd, so the TODO above may be stale - confirm.
778 pextrd [r0 + r1], m0, 1
779 pextrd [r0 + r1 * 2], m0, 2
781 pextrd [r0 + r1], m0, 3
; intra_pred_ang4_4: 4x4 angular prediction, mode 4. Builds interleaved
; neighbour pairs from [r2 + 1], loads ang_table entries 21/10/31/20 into
; m3/m4 and tail-jumps to the .do_filter4x4 path inside intra_pred_ang4_3.
; NOTE(review): the leading numbers skip, so some instructions are not visible
; in this excerpt.
785 cglobal intra_pred_ang4_4, 3,4,5
788 lea r3, [ang_table + 18 * 16] ; r3 = &ang_table[18]
789 movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
790 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
791 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
792 palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
793 palignr m3, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3]
795 punpcklqdq m2, m1, m3
797 movh m3, [r3 + 3 * 16] ; [21]
798 movhps m3, [r3 - 8 * 16] ; [10]
799 movh m4, [r3 + 13 * 16] ; [31]
800 movhps m4, [r3 + 2 * 16] ; [20]
801 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; intra_pred_ang4_5: 4x4 angular prediction, mode 5. Builds interleaved
; neighbour pairs from [r2 + 1], loads ang_table entries 17/2/19/4 into m3/m4
; and tail-jumps to the .do_filter4x4 path inside intra_pred_ang4_3.
; NOTE(review): the leading numbers skip, so some instructions are not visible
; in this excerpt.
804 cglobal intra_pred_ang4_5, 3,4,5
807 lea r3, [ang_table + 10 * 16] ; r3 = &ang_table[10]
808 movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
809 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
810 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
811 palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
812 palignr m3, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3]
814 punpcklqdq m2, m1, m3
816 movh m3, [r3 + 7 * 16] ; [17]
817 movhps m3, [r3 - 8 * 16] ; [ 2]
818 movh m4, [r3 + 9 * 16] ; [19]
819 movhps m4, [r3 - 6 * 16] ; [ 4]
820 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; intra_pred_ang4_6: 4x4 angular prediction, mode 6. Builds interleaved
; neighbour pairs from [r2 + 1], loads ang_table entries 13/26/7/20 into m3/m4
; and tail-jumps to the .do_filter4x4 path inside intra_pred_ang4_3.
; NOTE(review): the leading numbers skip, so some instructions are not visible
; in this excerpt.
823 cglobal intra_pred_ang4_6, 3,4,5
826 lea r3, [ang_table + 19 * 16] ; r3 = &ang_table[19]
827 movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
828 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
829 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
830 palignr m2, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
834 movh m3, [r3 - 6 * 16] ; [13]
835 movhps m3, [r3 + 7 * 16] ; [26]
836 movh m4, [r3 - 12 * 16] ; [ 7]
837 movhps m4, [r3 + 1 * 16] ; [20]
838 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; intra_pred_ang4_7: 4x4 angular prediction, mode 7. Builds interleaved
; neighbour pairs from [r2 + 1], loads ang_table entries 9/18/27/4 into m3/m4
; and tail-jumps to the .do_filter4x4 path inside intra_pred_ang4_3.
; NOTE(review): the leading numbers skip, so some instructions are not visible
; in this excerpt.
841 cglobal intra_pred_ang4_7, 3,4,5
844 lea r3, [ang_table + 20 * 16] ; r3 = &ang_table[20]
845 movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
846 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
847 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
848 palignr m3, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2]
849 punpcklqdq m2, m0, m3
852 movh m3, [r3 - 11 * 16] ; [ 9]
853 movhps m3, [r3 - 2 * 16] ; [18]
854 movh m4, [r3 + 7 * 16] ; [27]
855 movhps m4, [r3 - 16 * 16] ; [ 4]
856 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; intra_pred_ang4_8: 4x4 angular prediction, mode 8. Builds interleaved
; neighbour pairs from [r2 + 1], loads ang_table entries 5/10/15/20 into m3/m4
; and tail-jumps to the .do_filter4x4 path inside intra_pred_ang4_3.
; NOTE(review): the leading numbers skip, so some instructions are not visible
; in this excerpt.
859 cglobal intra_pred_ang4_8, 3,4,5
862 lea r3, [ang_table + 13 * 16] ; r3 = &ang_table[13]
863 movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
864 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
865 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
869 movh m3, [r3 - 8 * 16] ; [ 5]
870 movhps m3, [r3 - 3 * 16] ; [10]
871 movh m4, [r3 + 2 * 16] ; [15]
872 movhps m4, [r3 + 7 * 16] ; [20]
873 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; intra_pred_ang4_9: 4x4 angular prediction, mode 9. Builds interleaved
; neighbour pairs from [r2 + 1], loads ang_table entries 2/4/6/8 into m3/m4
; and tail-jumps to the .do_filter4x4 path inside intra_pred_ang4_3.
; NOTE(review): the leading numbers skip, so some instructions are not visible
; in this excerpt.
876 cglobal intra_pred_ang4_9, 3,4,5
879 lea r3, [ang_table + 4 * 16] ; r3 = &ang_table[4]
880 movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1]
881 palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2]
882 punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
886 movh m3, [r3 - 2 * 16] ; [ 2]
887 movhps m3, [r3 - 0 * 16] ; [ 4]
888 movh m4, [r3 + 2 * 16] ; [ 6]
889 movhps m4, [r3 + 4 * 16] ; [ 8]
890 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; intra_pred_ang4_10: 4x4 angular prediction, mode 10. Loads four bytes from
; [r2 + 1], expands them with the pb_unpackbd1 shuffle and stores rows; the
; later visible lines prepare word vectors from [r2] via pb_0_8 / pb_unpackbw1.
; NOTE(review): the leading numbers skip, so the remaining stores and the
; branch into the second part are not visible in this excerpt.
893 cglobal intra_pred_ang4_10, 3,3,4
894 movd m0, [r2 + 1] ; [x x x x 4 3 2 1] (movd loads only 4 bytes)
895 pshufb m0, [pb_unpackbd1]
901 movd [r0 + r1 * 2], m2 ; 4 bytes to row 2
910 pmovzxbw m0, m0 ; [-1 -1 -1 -1]
911 movh m1, [r2] ; [4 3 2 1 0]
912 pshufb m2, m1, [pb_0_8] ; [0 0 0 0]
913 pshufb m1, [pb_unpackbw1] ; [4 3 2 1]
; intra_pred_ang4_26: 4x4 angular prediction, mode 26. Loads four bytes from
; [r3 + 1] and stores them as rows; the later visible lines build word vectors
; from [r2] and write single bytes to the start of rows 1..3 with pextrb.
; NOTE(review): the leading numbers skip, so some instructions are not visible
; in this excerpt. In particular r3 is a pointer at 926 but a row offset at
; 951, so it is presumably recomputed in between - confirm.
925 cglobal intra_pred_ang4_26, 4,4,3
926 movd m0, [r3 + 1] ; [x x x x 4 3 2 1] (movd loads only 4 bytes)
931 movd [r0 + r1 * 2], m0 ; 4 bytes to row 2
939 pshufb m0, [pb_0_8] ; [ 1 1 1 1]
940 movh m1, [r2] ; [-4 -3 -2 -1 0]
941 pshufb m2, m1, [pb_0_8] ; [0 0 0 0]
942 pshufb m1, [pb_unpackbw1] ; [-4 -3 -2 -1]
949 pextrb [r0 + r1], m0, 1
950 pextrb [r0 + r1 * 2], m0, 2
951 pextrb [r0 + r3], m0, 3
; intra_pred_ang4_11: 4x4 angular prediction, mode 11. Builds interleaved
; neighbour pairs from [r2], loads ang_table entries 30/28/26/24 into m3/m4
; (base is entry 24, offsets +6/+4/+2/+0) and tail-jumps to the .do_filter4x4
; path inside intra_pred_ang4_3.
; NOTE(review): the leading numbers skip, so some instructions are not visible
; in this excerpt.
957 cglobal intra_pred_ang4_11, 3,4,5
960 lea r3, [ang_table + 24 * 16] ; r3 = &ang_table[24]
961 movh m0, [r2] ; [x x x 4 3 2 1 0]
962 palignr m1, m0, 1 ; [x x x x 4 3 2 1]
963 punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0]
967 movh m3, [r3 + 6 * 16] ; [30]
968 movhps m3, [r3 + 4 * 16] ; [28]
969 movh m4, [r3 + 2 * 16] ; [26]
970 movhps m4, [r3 + 0 * 16] ; [24]
971 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; intra_pred_ang4_12: 4x4 angular prediction, mode 12. Builds interleaved
; neighbour pairs from [r2], loads ang_table entries 27/22/17/12 into m3/m4
; and tail-jumps to the .do_filter4x4 path inside intra_pred_ang4_3.
; NOTE(review): the leading numbers skip, so some instructions are not visible
; in this excerpt.
974 cglobal intra_pred_ang4_12, 3,4,5
977 lea r3, [ang_table + 20 * 16] ; r3 = &ang_table[20]
978 movh m0, [r2] ; [x x x 4 3 2 1 0]
979 palignr m1, m0, 1 ; [x x x x 4 3 2 1]
980 punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0]
984 movh m3, [r3 + 7 * 16] ; [27]
985 movhps m3, [r3 + 2 * 16] ; [22]
986 movh m4, [r3 - 3 * 16] ; [17]
987 movhps m4, [r3 - 8 * 16] ; [12]
988 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; intra_pred_ang4_13: 4x4 angular prediction, mode 13. Loads bytes from
; [r2 - 1], patches byte 0 with the byte at [r3 + 4], interleaves neighbours,
; loads ang_table entries 23/14/5/28 into m3/m4 and tail-jumps to the
; .do_filter4x4 path inside intra_pred_ang4_3.
; r3 is used twice: first as the pinsrb source pointer, then it is overwritten
; with the ang_table base.
; NOTE(review): the leading numbers skip, so some instructions are not visible
; in this excerpt.
991 cglobal intra_pred_ang4_13, 4,4,5
996 movh m1, [r2 - 1] ; [x x 4 3 2 1 0 x]
997 palignr m0, m1, 1 ; [x x x 4 3 2 1 0]
998 palignr m2, m1, 2 ; [x x x x 4 3 2 1]
999 pinsrb m1, [r3 + 4], 0 ; replace the placeholder byte 0 with [r3 + 4]
1000 punpcklbw m1, m0 ; [3 2 2 1 1 0 0 x]
1001 punpcklbw m0, m2 ; [4 3 3 2 2 1 1 0]
1002 punpcklqdq m2, m0, m1
1005 lea r3, [ang_table + 21 * 16] ; r3 = &ang_table[21]
1006 movh m3, [r3 + 2 * 16] ; [23]
1007 movhps m3, [r3 - 7 * 16] ; [14]
1008 movh m4, [r3 - 16 * 16] ; [ 5]
1009 movhps m4, [r3 + 7 * 16] ; [28]
1010 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; intra_pred_ang4_14: 4x4 angular prediction, mode 14. Loads bytes from
; [r2 - 1], patches byte 0 with the byte at [r3 + 2], interleaves neighbours,
; loads ang_table entries 19/6/25/12 into m3/m4 and tail-jumps to the
; .do_filter4x4 path inside intra_pred_ang4_3.
; r3 is used twice: first as the pinsrb source pointer, then it is overwritten
; with the ang_table base.
; NOTE(review): the leading numbers skip, so some instructions are not visible
; in this excerpt.
1013 cglobal intra_pred_ang4_14, 4,4,5
1018 movh m2, [r2 - 1] ; [x x 4 3 2 1 0 x]
1019 palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
1020 palignr m1, m2, 2 ; [x x x x 4 3 2 1]
1021 pinsrb m2, [r3 + 2], 0 ; replace the placeholder byte 0 with [r3 + 2]
1022 punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
1023 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
1027 lea r3, [ang_table + 19 * 16] ; r3 = &ang_table[19]
1028 movh m3, [r3 + 0 * 16] ; [19]
1029 movhps m3, [r3 - 13 * 16] ; [ 6]
1030 movh m4, [r3 + 6 * 16] ; [25]
1031 movhps m4, [r3 - 7 * 16] ; [12]
1032 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; intra_pred_ang4_15: 4x4 angular prediction, mode 15. Loads bytes from
; [r2 - 1], prepends two extra bytes taken from [r3 + 2] and [r3 + 4],
; interleaves neighbours, loads ang_table entries 15/30/13/28 into m3/m4 and
; tail-jumps to the .do_filter4x4 path inside intra_pred_ang4_3.
; r3 is used twice: first as the pinsrb source pointer, then it is overwritten
; with the ang_table base.
; NOTE(review): the leading numbers skip, so some instructions are not visible
; in this excerpt.
1035 cglobal intra_pred_ang4_15, 4,4,5
1040 movh m2, [r2 - 1] ; [x x 4 3 2 1 0 x]
1041 palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
1042 palignr m1, m2, 2 ; [x x x x 4 3 2 1]
1043 pinsrb m2, [r3 + 2], 0 ; x = byte at [r3 + 2]
1044 pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y]
1045 pinsrb m3, [r3 + 4], 0 ; y = byte at [r3 + 4]
1046 punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y]
1047 punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
1048 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
1052 lea r3, [ang_table + 23 * 16] ; r3 = &ang_table[23]
1053 movh m3, [r3 - 8 * 16] ; [15]
1054 movhps m3, [r3 + 7 * 16] ; [30]
1055 movh m4, [r3 - 10 * 16] ; [13]
1056 movhps m4, [r3 + 5 * 16] ; [28]
1057 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; intra_pred_ang4_16: 4x4 angular prediction, mode 16. Loads bytes from
; [r2 - 1], prepends two extra bytes taken from [r3 + 2] and [r3 + 3],
; interleaves neighbours, loads ang_table entries 11/22/1/12 into m3/m4 and
; tail-jumps to the .do_filter4x4 path inside intra_pred_ang4_3.
; r3 is used twice: first as the pinsrb source pointer, then it is overwritten
; with the ang_table base.
; NOTE(review): the leading numbers skip, so some instructions are not visible
; in this excerpt.
1060 cglobal intra_pred_ang4_16, 4,4,5
1065 movh m2, [r2 - 1] ; [x x 4 3 2 1 0 x]
1066 palignr m0, m2, 1 ; [x x x 4 3 2 1 0]
1067 palignr m1, m2, 2 ; [x x x x 4 3 2 1]
1068 pinsrb m2, [r3 + 2], 0 ; x = byte at [r3 + 2]
1069 pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y]
1070 pinsrb m3, [r3 + 3], 0 ; y = byte at [r3 + 3]
1071 punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y]
1072 punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x]
1073 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
1077 lea r3, [ang_table + 19 * 16] ; r3 = &ang_table[19]
1078 movh m3, [r3 - 8 * 16] ; [11]
1079 movhps m3, [r3 + 3 * 16] ; [22]
1080 movh m4, [r3 - 18 * 16] ; [ 1]
1081 movhps m4, [r3 - 7 * 16] ; [12]
1082 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; intra_pred_ang4_17: 4x4 angular prediction, mode 17. Loads bytes from
; [r2 - 1], prepends three extra bytes taken from [r3 + 1], [r3 + 2] and
; [r3 + 4], interleaves neighbours, loads ang_table entries 6/12/18/24 into
; m3/m4 and tail-jumps to the .do_filter4x4 path inside intra_pred_ang4_3.
; r3 is used twice: first as the pinsrb source pointer, then it is overwritten
; with the ang_table base.
; NOTE(review): the leading numbers skip, so some instructions are not visible
; in this excerpt (e.g. whatever sets m4 before 1097).
1085 cglobal intra_pred_ang4_17, 4,4,5
1090 movh m3, [r2 - 1] ; [- - 4 3 2 1 0 x]
1091 palignr m0, m3, 1 ; [- - - 4 3 2 1 0]
1092 palignr m1, m3, 2 ; [- - - - 4 3 2 1]
1094 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0]
1096 pinsrb m3, [r3 + 1], 0 ; x = byte at [r3 + 1]
1097 punpcklbw m1, m3, m4 ; [3 2 2 1 1 0 0 x]
1100 pslldq m2, m3, 1 ; [- 4 3 2 1 0 x y]
1101 pinsrb m2, [r3 + 2], 0 ; y = byte at [r3 + 2]
1102 pslldq m1, m2, 1 ; [4 3 2 1 0 x y z]
1103 pinsrb m1, [r3 + 4], 0 ; z = byte at [r3 + 4]
1104 punpcklbw m1, m2 ; [1 0 0 x x y y z]
1105 punpcklbw m2, m3 ; [2 1 1 0 0 x x y]
1108 lea r3, [ang_table + 14 * 16] ; r3 = &ang_table[14]
1109 movh m3, [r3 - 8 * 16] ; [ 6]
1110 movhps m3, [r3 - 2 * 16] ; [12]
1111 movh m4, [r3 + 4 * 16] ; [18]
1112 movhps m4, [r3 + 10 * 16] ; [24]
1113 jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
; intra_pred_ang4_18: 4x4 angular prediction, mode 18. Visible lines insert
; the dword at [r3 + 1] into lane 1 of m0 and store 4 bytes to row 2.
; NOTE(review): the leading numbers skip, so the other loads, shifts and row
; stores are not visible in this excerpt.
1116 cglobal intra_pred_ang4_18, 4,4,1
1120 pinsrd m0, [r3 + 1], 1 ; [- 3 2 1 0 -1 -2 -3]
1124 movd [r0 + r1 * 2], m0 ; 4 bytes to row 2
1130 ;-----------------------------------------------------------------------------
1131 ; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
1132 ;-----------------------------------------------------------------------------
; intra_pred_ang8_2: 8x8 angular prediction, mode 2. Visible lines store
; 8 bytes to row 2, advance r0 by four rows and store row 2 of the lower half.
; NOTE(review): the leading numbers skip, so the loads, shifts and remaining
; row stores are not visible in this excerpt.
1134 cglobal intra_pred_ang8_2, 3,5,2
1144 movh [r0 + r1 * 2], m1 ; 8 bytes to row 2
1148 lea r0, [r0 + r1 * 4] ; advance four rows
1153 movh [r0 + r1 * 2], m1
; intra_pred_ang8_3: 8x8 angular prediction, mode 3. Builds interleaved
; neighbour pairs from [r2 + 1], multiplies shifted copies by ang_table
; entries 26/20/14/8/2/28/22/16 with pmaddubsw, then interleaves the results
; (punpck*) and stores eight 8-byte rows. Other 8x8 angular routines jump to a
; .transpose8x8 label in this function.
; r3 and r4 are two table bases (entries 22 and 8) so every offset used stays
; within a small signed displacement.
; NOTE(review): the leading numbers skip, so the rounding/packing instructions
; and the .transpose8x8 label itself are not visible in this excerpt.
1159 cglobal intra_pred_ang8_3, 3,5,8
1162 lea r3, [ang_table + 22 * 16] ; r3 = &ang_table[22]
1163 lea r4, [ang_table + 8 * 16] ; r4 = &ang_table[8]
1166 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1167 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1169 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
1170 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
1171 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
1173 pmaddubsw m4, m0, [r3 + 4 * 16] ; [26]
1175 pmaddubsw m1, [r3 - 2 * 16] ; [20]
1179 palignr m5, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
1181 pmaddubsw m5, [r3 - 8 * 16] ; [14]
1184 palignr m6, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
1186 pmaddubsw m6, [r4] ; [ 8]
1190 palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
1192 pmaddubsw m6, m1, [r4 - 6 * 16] ; [ 2]
1195 pmaddubsw m1, [r3 + 6 * 16] ; [28]
1199 palignr m1, m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]
1201 pmaddubsw m1, [r3] ; [22]
1204 palignr m2, m0, 12 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7]
1206 pmaddubsw m2, [r3 - 6 * 16] ; [16]
1216 punpckhbw m0, m4, m5
1218 punpckhbw m2, m4, m0
1221 punpckhbw m0, m6, m1
1223 punpckhbw m1, m6, m0
1226 punpckhdq m5, m4, m6
1228 punpckldq m6, m2, m1
1235 movhps [r0 + r1], m4 ; high 8 bytes of m4 to row 1
1236 movh [r0 + r1 * 2], m5 ; low 8 bytes of m5 to row 2
1237 movhps [r0 + r4], m5 ; NOTE(review): r4 is a row offset here, presumably recomputed in a line not visible - confirm
1240 movhps [r0 + r1 * 2], m6
1242 movhps [r0 + r1 * 4], m1
; intra_pred_ang8_4: 8x8 angular prediction, mode 4. Builds interleaved
; neighbour pairs from [r2 + 1], multiplies shifted copies by ang_table
; entries 21/10/31/20/9/30/19/8 and tail-jumps to the .transpose8x8 path
; inside intra_pred_ang8_3.
; NOTE(review): the leading numbers skip, so the rounding/packing instructions
; are not visible in this excerpt.
1245 cglobal intra_pred_ang8_4, 3,5,8
1248 lea r3, [ang_table + 24 * 16] ; r3 = &ang_table[24]
1249 lea r4, [ang_table + 10 * 16] ; r4 = &ang_table[10]
1252 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1253 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1255 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
1256 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
1257 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
1260 pmaddubsw m4, m0, [r3 - 3 * 16] ; [21]
1262 pmaddubsw m1, [r4] ; [10]
1266 pmaddubsw m5, [r3 + 7 * 16] ; [31]
1269 palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
1271 pmaddubsw m6, [r3 - 4 * 16] ; [ 20]
1275 palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
1277 pmaddubsw m6, m1, [r4 - 1 * 16] ; [ 9]
1280 pmaddubsw m1, [r3 + 6 * 16] ; [30]
1284 palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
1286 pmaddubsw m1, [r3 - 5 * 16] ; [19]
1289 palignr m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]
1291 pmaddubsw m2, [r4 - 2 * 16] ; [8]
1294 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
; intra_pred_ang8_5: 8x8 angular prediction, mode 5. Builds interleaved
; neighbour pairs from [r2 + 1], multiplies shifted copies by ang_table
; entries 17/2/19/4/21/6/23/8 and tail-jumps to the .transpose8x8 path inside
; intra_pred_ang8_3.
; NOTE(review): the leading numbers skip, so the rounding/packing instructions
; and some register copies are not visible in this excerpt.
1296 cglobal intra_pred_ang8_5, 3,5,8
1299 lea r3, [ang_table + 17 * 16] ; r3 = &ang_table[17]
1300 lea r4, [ang_table + 2 * 16] ; r4 = &ang_table[2]
1303 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1304 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1306 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
1307 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
1308 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
1311 pmaddubsw m4, m0, [r3] ; [17]
1313 pmaddubsw m1, [r4] ; [2]
1317 pmaddubsw m5, [r3 + 2 * 16] ; [19]
1320 palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
1323 pmaddubsw m1, [r4 + 2 * 16] ; [4]
1327 pmaddubsw m6, [r3 + 4 * 16] ; [21]
1330 palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
1333 pmaddubsw m7, [r4 + 4 * 16] ; [6]
1337 pmaddubsw m1, [r3 + 6 * 16] ; [23]
1340 palignr m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
1342 pmaddubsw m2, [r4 + 6 * 16] ; [8]
1345 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
; intra_pred_ang8_6: 8x8 angular prediction, mode 6. Builds interleaved
; neighbour pairs from [r2 + 1], multiplies shifted copies by ang_table
; entries 13/26/7/20/1/14/27/8 and tail-jumps to the .transpose8x8 path inside
; intra_pred_ang8_3.
; NOTE(review): the leading numbers skip, so the rounding/packing instructions
; and some register copies are not visible in this excerpt.
1347 cglobal intra_pred_ang8_6, 3,5,8
1350 lea r3, [ang_table + 20 * 16] ; r3 = &ang_table[20]
1351 lea r4, [ang_table + 8 * 16] ; r4 = &ang_table[8]
1354 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1355 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1357 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
1358 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
1361 pmaddubsw m4, m0, [r3 - 7 * 16] ; [13]
1363 pmaddubsw m1, [r3 + 6 * 16] ; [26]
1367 palignr m6, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
1369 pmaddubsw m5, m6, [r4 - 1 * 16] ; [7]
1372 pmaddubsw m6, [r3] ; [20]
1376 palignr m1, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
1378 pmaddubsw m6, m1, [r4 - 7 * 16] ; [1]
1382 pmaddubsw m3, [r3 - 6 * 16] ; [14]
1386 pmaddubsw m1, [r3 + 7 * 16] ; [27]
1389 palignr m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
1391 pmaddubsw m2, [r4] ; [8]
1394 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
; intra_pred_ang8_7: 8x8 angular prediction, mode 7. Builds interleaved
; neighbour pairs from [r2 + 1], multiplies shifted copies by ang_table
; entries 9/18/27/4/13/22/31/8 and tail-jumps to the .transpose8x8 path inside
; intra_pred_ang8_3.
; NOTE(review): the leading numbers skip, so the rounding/packing instructions
; and some register copies are not visible in this excerpt.
1396 cglobal intra_pred_ang8_7, 3,5,8
1399 lea r3, [ang_table + 24 * 16] ; r3 = &ang_table[24]
1400 lea r4, [ang_table + 6 * 16] ; r4 = &ang_table[6]
1403 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1404 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1406 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
1407 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
1409 pmaddubsw m4, m0, [r4 + 3 * 16] ; [9]
1411 pmaddubsw m3, m0, [r3 - 6 * 16] ; [18]
1415 pmaddubsw m5, m0, [r3 + 3 * 16] ; [27]
1418 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
1420 pmaddubsw m6, m1, [r4 - 2 * 16] ; [4]
1424 pmaddubsw m6, m1, [r4 + 7 * 16] ; [13]
1428 pmaddubsw m3, [r3 - 2 * 16] ; [22]
1432 pmaddubsw m1, [r3 + 7 * 16] ; [31]
1435 palignr m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
1437 pmaddubsw m2, [r4 + 2 * 16] ; [8]
1440 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
; intra_pred_ang8_8: 8x8 angular prediction, mode 8. Builds interleaved
; neighbour pairs from [r2 + 1], multiplies them by ang_table entries
; 5/10/15/20/25/30 (unshifted) and 3/8 (shifted by one pair), then tail-jumps
; to the .transpose8x8 path inside intra_pred_ang8_3.
; NOTE(review): the leading numbers skip, so the rounding/packing instructions
; are not visible in this excerpt.
1442 cglobal intra_pred_ang8_8, 3,5,8
1445 lea r3, [ang_table + 23 * 16] ; r3 = &ang_table[23]
1446 lea r4, [ang_table + 8 * 16] ; r4 = &ang_table[8]
1449 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1450 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1452 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
1453 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
1454 palignr m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
1456 pmaddubsw m4, m0, [r4 - 3 * 16] ; [5]
1458 pmaddubsw m3, m0, [r4 + 2 * 16] ; [10]
1462 pmaddubsw m5, m0, [r3 - 8 * 16] ; [15]
1465 pmaddubsw m6, m0, [r3 - 3 * 16] ; [20]
1469 pmaddubsw m6, m0, [r3 + 2 * 16] ; [25]
1472 pmaddubsw m0, [r3 + 7 * 16] ; [30]
1476 pmaddubsw m1, m2, [r4 - 5 * 16] ; [3]
1479 pmaddubsw m2, [r4] ; [8]
1482 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
; intra_pred_ang8_9: 8x8 angular prediction, mode 9. Builds one interleaved
; neighbour-pair vector from [r2 + 1], multiplies it by ang_table entries
; 2/4/6/8/10/12/14/16 and tail-jumps to the .transpose8x8 path inside
; intra_pred_ang8_3.
; NOTE(review): the leading numbers skip, so the rounding/packing instructions
; are not visible in this excerpt.
1484 cglobal intra_pred_ang8_9, 3,5,8
1487 lea r3, [ang_table + 10 * 16] ; r3 = &ang_table[10]
1490 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1491 palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1493 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
1495 pmaddubsw m4, m0, [r3 - 8 * 16] ; [2]
1497 pmaddubsw m3, m0, [r3 - 6 * 16] ; [4]
1501 pmaddubsw m5, m0, [r3 - 4 * 16] ; [6]
1504 pmaddubsw m6, m0, [r3 - 2 * 16] ; [8]
1508 pmaddubsw m6, m0, [r3] ; [10]
1511 pmaddubsw m2, m0, [r3 + 2 * 16] ; [12]
1515 pmaddubsw m1, m0, [r3 + 4 * 16] ; [14]
1518 pmaddubsw m0, [r3 + 6 * 16] ; [16]
1521 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
; intra_pred_ang8_10: 8x8 angular prediction, mode 10. Visible lines load the
; pb_unpackbq mask (eight 0s then eight 1s) and store 8-byte rows from the low
; and high halves of m0..m3, using r2 as the pointer to the lower four rows.
; NOTE(review): the leading numbers skip, so the loads, shuffles and the
; computation of r4 are not visible in this excerpt.
1523 cglobal intra_pred_ang8_10, 4,5,5
1525 mova m4, [pb_unpackbq]
1535 movhps [r0 + r1], m0 ; row 1
1536 movh [r0 + r1 * 2], m1 ; row 2
1537 movhps [r0 + r4], m1
1538 lea r2, [r0 + r1 * 4] ; r2 = start of row 4
1540 movhps [r2 + r1], m2
1541 movh [r2 + r1 * 2], m3
1542 movhps [r2 + r4], m3
; intra_pred_ang8_26: 8x8 angular prediction, mode 26. Visible lines store the
; same 8 bytes (m0) to rows of the upper and lower halves, then shuffle with
; pb_unpackbq and write single bytes to the start of rows with pextrb.
; NOTE(review): the leading numbers skip, so the loads, arithmetic and the
; computation of r4 are not visible in this excerpt.
1563 cglobal intra_pred_ang8_26, 4,5,3
1569 movh [r0 + r1 * 2], m0 ; 8 bytes to row 2
1571 lea r3, [r0 + r1 * 4] ; r3 = start of row 4
1574 movh [r3 + r1 * 2], m0
1581 pshufb m0, [pb_unpackbq]
1585 pshufb m1, [pb_unpackbq]
1593 pextrb [r0 + r1], m0, 1
1594 pextrb [r0 + r1 * 2], m0, 2
1595 pextrb [r0 + r4], m0, 3
1597 pextrb [r3 + r1], m0, 5
1598 pextrb [r3 + r1 * 2], m0, 6
1599 pextrb [r3 + r4], m0, 7
; intra_pred_ang8_11: 8x8 angular prediction, mode 11. Builds one interleaved
; neighbour-pair vector from [r2], multiplies it by ang_table entries
; 30/28/26/24/22/20/18/16 and tail-jumps to the .transpose8x8 path inside
; intra_pred_ang8_3.
; NOTE(review): the leading numbers skip, so the rounding/packing instructions
; are not visible in this excerpt.
1604 cglobal intra_pred_ang8_11, 3,5,8
1607 lea r3, [ang_table + 23 * 16] ; r3 = &ang_table[23]
1610 movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
1611 palignr m1, m0, 1 ; [x 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1613 punpcklbw m0, m1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
1615 pmaddubsw m4, m0, [r3 + 7 * 16] ; [30]
1617 pmaddubsw m3, m0, [r3 + 5 * 16] ; [28]
1621 pmaddubsw m5, m0, [r3 + 3 * 16] ; [26]
1624 pmaddubsw m6, m0, [r3 + 1 * 16] ; [24]
1628 pmaddubsw m6, m0, [r3 - 1 * 16] ; [22]
1631 pmaddubsw m2, m0, [r3 - 3 * 16] ; [20]
1635 pmaddubsw m1, m0, [r3 - 5 * 16] ; [18]
1638 pmaddubsw m0, [r3 - 7 * 16] ; [16]
1641 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
; intra_pred_ang8_12: 8x8 angular prediction, mode 12. Loads 16 bytes from
; [r2], prepends one extra byte taken from [r3 + 6], interleaves neighbours,
; multiplies by ang_table entries 27/22/29/24/17/12/7/2 and tail-jumps to the
; .transpose8x8 path inside intra_pred_ang8_3.
; r4 is re-pointed from entry 22 to entry 7 part-way through.
; NOTE(review): the leading numbers skip, so the rounding/packing instructions
; are not visible in this excerpt.
1643 cglobal intra_pred_ang8_12, 4,5,8
1649 lea r4, [ang_table + 22 * 16] ; r4 = &ang_table[22]
1652 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
1653 pslldq m0, m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
1654 pinsrb m0, [r3 + 6], 0 ; a = byte at [r3 + 6]
1655 punpckhbw m2, m0, m1 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7]
1656 punpcklbw m0, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
1657 palignr m2, m0, 2 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
1659 pmaddubsw m4, m2, [r4 + 5 * 16] ; [27]
1661 pmaddubsw m3, m2, [r4] ; [22]
1665 pmaddubsw m1, m0, [r4 + 7 * 16] ; [29]
1668 pmaddubsw m0, [r4 + 2 * 16] ; [24]
1672 pmaddubsw m5, m2, [r4 - 5 * 16] ; [17]
1675 lea r4, [ang_table + 7 * 16] ; r4 = &ang_table[7]
1676 pmaddubsw m6, m2, [r4 + 5 * 16] ; [12]
1680 pmaddubsw m6, m2, [r4] ; [7]
1683 pmaddubsw m2, [r4 - 5 * 16] ; [2]
1686 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
; intra_pred_ang8_13: 8x8 angular prediction, mode 13. Loads 16 bytes from
; [r2], prepends two extra bytes taken from [r3 + 4] and [r3 + 7], interleaves
; neighbours, multiplies by ang_table entries 23/28/24/14/5/19/10/1 and
; tail-jumps to the .transpose8x8 path inside intra_pred_ang8_3.
; r4 is re-pointed from entry 24 to entry 13 part-way through.
; NOTE(review): the leading numbers skip, so the rounding/packing instructions
; are not visible in this excerpt.
1688 cglobal intra_pred_ang8_13, 4,5,8
1694 lea r4, [ang_table + 24 * 16] ; r4 = &ang_table[24]
1697 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
1698 pslldq m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
1699 pinsrb m1, [r3 + 4], 0 ; a = byte at [r3 + 4]
1700 pslldq m0, m1, 1 ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b]
1701 pinsrb m0, [r3 + 7], 0 ; b = byte at [r3 + 7]
1702 punpckhbw m5, m0, m1 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]
1703 punpcklbw m0, m1 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
1704 palignr m1, m5, m0, 2 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
1705 palignr m5, m0, 4 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
1707 pmaddubsw m4, m5, [r4 - 1 * 16] ; [23]
1710 pmaddubsw m6, m1, [r4 + 4 * 16] ; [28]
1713 pmaddubsw m0, [r4] ; [24]
1716 lea r4, [ang_table + 13 * 16] ; r4 = &ang_table[13]
1717 pmaddubsw m3, m5, [r4 + 1 * 16] ; [14]
1721 pmaddubsw m5, [r4 - 8 * 16] ; [5]
1725 pmaddubsw m6, m1, [r4 + 6 * 16] ; [19]
1728 pmaddubsw m2, m1, [r4 - 3 * 16] ; [10]
1732 pmaddubsw m1, [r4 - 12 * 16] ; [1]
1735 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
; intra_pred_ang8_14: 8x8 angular prediction, mode 14. Loads 16 bytes from
; [r2 - 2], overwrites the two lowest bytes with [r3 + 5] and [r3 + 2],
; prepends a third byte from [r3 + 7], interleaves neighbours, multiplies by
; ang_table entries 19/24/25/12/31/6/18/5 and tail-jumps to the .transpose8x8
; path inside intra_pred_ang8_3.
; r4 is re-pointed from entry 24 to entry 12 part-way through.
; NOTE(review): the leading numbers skip, so the rounding/packing instructions
; and some register copies are not visible in this excerpt.
1737 cglobal intra_pred_ang8_14, 4,5,8
1743 lea r4, [ang_table + 24 * 16] ; r4 = &ang_table[24]
1746 movu m1, [r2 - 2] ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b]
1747 pinsrb m1, [r3 + 2], 1 ; a = byte at [r3 + 2]
1748 pinsrb m1, [r3 + 5], 0 ; b = byte at [r3 + 5]
1749 pslldq m0, m1, 1 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c]
1750 pinsrb m0, [r3 + 7], 0 ; c = byte at [r3 + 7]
1751 punpckhbw m2, m0, m1 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
1752 punpcklbw m0, m1 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
1753 palignr m1, m2, m0, 2 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
1754 palignr m6, m2, m0, 4 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
1755 palignr m2, m0, 6 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
1757 pmaddubsw m4, m2, [r4 - 5 * 16] ; [19]
1760 pmaddubsw m0, [r4] ; [24]
1763 pmaddubsw m5, m6, [r4 + 1 * 16] ; [25]
1766 lea r4, [ang_table + 12 * 16] ; r4 = &ang_table[12]
1767 pmaddubsw m6, [r4] ; [12]
1771 pmaddubsw m6, m1, [r4 + 19 * 16] ; [31]
1774 pmaddubsw m2, [r4 - 6 * 16] ; [6]
1778 pmaddubsw m2, m1, [r4 + 6 * 16] ; [18]
1782 pmaddubsw m1, [r4 - 7 * 16] ; [5]
1785 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
; intra_pred_ang8_15: 8x8 angular prediction, mode 15. Loads 16 bytes from
; [r2], gathers extra bytes through the c_mode16_15 shuffle, joins them with
; palignr, prepends one more byte from [r3 + 8], interleaves neighbours,
; multiplies by ang_table entries 15/30/13/28/26/24/11/9 and tail-jumps to
; the .transpose8x8 path inside intra_pred_ang8_3.
; r4 is re-pointed from entry 23 to entry 11 part-way through.
; NOTE(review): the leading numbers skip, so the load of m2 before the pshufb
; and the rounding/packing instructions are not visible in this excerpt.
1787 cglobal intra_pred_ang8_15, 4,5,8
1793 lea r4, [ang_table + 23 * 16] ; r4 = &ang_table[23]
1796 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
1798 pshufb m2, [c_mode16_15]
1799 palignr m1, m2, 13 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c]
1800 pslldq m0, m1, 1 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d]
1801 pinsrb m0, [r3 + 8], 0 ; d = byte at [r3 + 8]
1802 punpckhbw m4, m0, m1 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
1803 punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
1804 palignr m1, m4, m0, 2 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
1805 palignr m6, m4, m0, 4 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
1806 palignr m5, m4, m0, 6 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
1807 palignr m4, m0, 8 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
1809 pmaddubsw m4, [r4 - 8 * 16] ; [15]
1812 pmaddubsw m2, m5, [r4 + 7 * 16] ; [30]
1816 pmaddubsw m5, [r4 - 10 * 16] ; [13]
1819 pmaddubsw m2, m6, [r4 + 5 * 16] ; [28]
1823 pmaddubsw m2, m1, [r4 + 3 * 16] ; [26]
1826 pmaddubsw m0, [r4 + 1 * 16] ; [24]
1829 lea r4, [ang_table + 11 * 16] ; r4 = &ang_table[11]
1830 pmaddubsw m6, [r4] ; [11]
1834 pmaddubsw m1, [r4 - 2 * 16] ; [9]
1837 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
; intra_pred_ang8_16: 8x8 angular intra prediction, mode 16.
; x86inc prologue: 4 arguments loaded, 5 GPRs, 8 XMM registers.
; Reference pixels are read through r2 and r3 (NOTE(review): presumably
; refLeft / refAbove as in the 16x16 prototype -- confirm); r4 indexes ang_table.
; The bracketed number after each pmaddubsw is the ang_table row it selects
; (row of the current r4 base plus the offset).
1839 cglobal intra_pred_ang8_16, 4,5,8
1845 lea r4, [ang_table + 22 * 16]
1848 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
; m2 is permuted with the c_mode16_16 mask; palignr by 12 then places its top
; four bytes below the r2 pixels (lanes a..d in the lane comments).
1850 pshufb m2, [c_mode16_16]
1851 palignr m1, m2, 12 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d]
; Shifted copy with lane e filled from r3 + 8.
1852 pslldq m0, m1, 1 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e]
1853 pinsrb m0, [r3 + 8], 0
; Interleave into adjacent-pixel pairs and slide 2-byte windows over them.
1854 punpckhbw m4, m0, m1 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
1855 punpcklbw m0, m1 ; [3 2 2 1 1 0 0 a a b b c c d d e]
1856 palignr m1, m4, m0, 2 ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
1857 palignr m6, m4, m0, 4 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
1858 palignr m2, m4, m0, 6 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
1859 palignr m5, m4, m0, 8 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
1860 palignr m4, m0, 10 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
; Weight each pixel pair with the selected ang_table row.
1862 pmaddubsw m3, m5, [r4] ; [22]
1865 pmaddubsw m0, [r4 + 2 * 16] ; [24]
; Re-base r4 to row 9 of ang_table for the remaining rows.
1868 lea r4, [ang_table + 9 * 16]
1870 pmaddubsw m4, [r4 + 2 * 16] ; [11]
1874 pmaddubsw m2, [r4 + 3 * 16] ; [12]
1877 pmaddubsw m5, [r4 - 8 * 16] ; [1]
1882 pmaddubsw m6, [r4 + 14 * 16] ; [23]
1885 pmaddubsw m2, [r4 - 7 * 16] ; [2]
1889 pmaddubsw m1, [r4 + 4 * 16] ; [13]
; Continue at the .transpose8x8 label inside intra_pred_ang8_3, reusing its tail.
1892 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
; intra_pred_ang8_17: 8x8 angular intra prediction, mode 17.
; x86inc prologue: 4 arguments loaded, 5 GPRs, 8 XMM registers.
; Reference pixels are read through r2 and r3 (NOTE(review): presumably
; refLeft / refAbove as in the 16x16 prototype -- confirm); r4 stays at row 17
; of ang_table, and the bracketed number after each pmaddubsw is the table row
; it selects (17 + offset).
1894 cglobal intra_pred_ang8_17, 4,5,8
1900 lea r4, [ang_table + 17 * 16]
1903 movu m2, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
; m1 is permuted with the c_mode16_17 mask; palignr by 11 then places its top
; five bytes below the r2 pixels (lanes a..e in the lane comments).
1905 pshufb m1, [c_mode16_17]
1906 palignr m2, m1, 11 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e]
; Shifted copy with lane f filled from r3 + 7.
1907 pslldq m0, m2, 1 ; [9 8 7 6 5 4 3 2 1 0 a b c d e f]
1908 pinsrb m0, [r3 + 7], 0
; Interleave into adjacent-pixel pairs (m1 = high half, m0 = low half).
1909 punpckhbw m1, m0, m2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
1910 punpcklbw m0, m2 ; [2 1 1 0 0 a a b b c c d d e e f]
; Each output needs its own 2-byte-stepped window of m1:m0; windows are
; extracted with palignr as they are needed and weighted with pmaddubsw.
1912 palignr m5, m1, m0, 8 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
1913 palignr m2, m1, m0, 10 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
1914 palignr m4, m1, m0, 12 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
1917 pmaddubsw m2, [r4 - 5 * 16] ; [12]
1920 pmaddubsw m4, [r4 - 11 * 16] ; [6]
1924 pmaddubsw m5, [r4 + 1 * 16] ; [18]
1927 palignr m2, m1, m0, 6 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
1928 pmaddubsw m2, [r4 + 7 * 16] ; [24]
1932 palignr m6, m1, m0, 4 ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
1934 pmaddubsw m6, [r4 + 13 * 16] ; [30]
1937 pmaddubsw m2, [r4 - 13 * 16] ; [4]
1941 palignr m1, m0, 2 ; [3 2 2 1 1 0 0 a a b b c c d d e]
1942 pmaddubsw m1, [r4 - 7 * 16] ; [10]
1945 pmaddubsw m0, [r4 - 1 * 16] ; [16]
; Continue at the .transpose8x8 label inside intra_pred_ang8_3, reusing its tail.
1948 jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
; intra_pred_ang8_18: 8x8 angular intra prediction, mode 18.
; x86inc prologue: 4 arguments loaded, 4 GPRs, 1 XMM register.
; r0 is used as the destination base and r1 as the row stride.
1950 cglobal intra_pred_ang8_18, 4,4,1
; pb_swap8 reverses the byte order inside each 8-byte half of m0.
1952 pshufb m0, [pb_swap8]
; r2 = destination + 4 rows, so the two visible stores write the low 8 bytes
; of m0 to rows 6 and 2.
1954 lea r2, [r0 + r1 * 4]
1958 movh [r2 + r1 * 2], m0
1966 movh [r0 + r1 * 2], m0
1974 ;-----------------------------------------------------------------------------
1975 ; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
1976 ;-----------------------------------------------------------------------------
; intra_pred_ang16_2: 16x16 angular intra prediction, mode 2.
; x86inc prologue: 3 arguments loaded, 3 GPRs, 3 XMM registers.
; r0 = dst, r1 = dstStride per the intraPredAng16 prototype comment.
; Each palignr puts into m2 the 16-byte window that starts k bytes into the
; m1:m0 concatenation (m0 is the low half), for k = 1 .. 15; r0 is advanced
; two rows after every second window.
; NOTE(review): presumably each window is one output row -- confirm against
; the stores that accompany these instructions.
1978 cglobal intra_pred_ang16_2, 3,3,3
1984 palignr m2, m1, m0, 1
1986 lea r0, [r0 + r1 * 2]
1987 palignr m2, m1, m0, 2
1989 palignr m2, m1, m0, 3
1991 lea r0, [r0 + r1 * 2]
1992 palignr m2, m1, m0, 4
1994 palignr m2, m1, m0, 5
1996 lea r0, [r0 + r1 * 2]
1997 palignr m2, m1, m0, 6
1999 palignr m2, m1, m0, 7
2001 lea r0, [r0 + r1 * 2]
2002 palignr m2, m1, m0, 8
2004 palignr m2, m1, m0, 9
2006 lea r0, [r0 + r1 * 2]
2007 palignr m2, m1, m0, 10
2009 palignr m2, m1, m0, 11
2011 lea r0, [r0 + r1 * 2]
2012 palignr m2, m1, m0, 12
2014 palignr m2, m1, m0, 13
2016 lea r0, [r0 + r1 * 2]
2017 palignr m2, m1, m0, 14
2019 palignr m2, m1, m0, 15
; TRANSPOSE_STORE_8x8 %1, %2, %3, %4, %5, %6
;   %1      tile index; the transposing path stores at byte column %1 * 8
;   %2      NOTE(review): appears to select the transposing path (call sites
;           pass 1 together with an r6 set-up) versus the plain store path
;           (call sites pass 0) -- confirm against the conditional in the macro
;   %3..%6  four XMM registers holding the 8x8 tile, 16 bytes each
; Register contract taken from the addressing below: r0 = destination row
; pointer, r1 = stride, r5 = 3 * stride, r6 = r0 + 4 * stride (transposing path
; only). m0 and m1 are used as scratch by the transposing path.
2023 %macro TRANSPOSE_STORE_8x8 6
2025 ; transpose 8x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32
; Byte-, then dword-level unpacks turn the register-per-row layout into a
; register-per-column-pair layout.
2026 punpckhbw m0, %3, %4
2028 punpckhbw %4, %3, m0
2031 punpckhbw m0, %5, m1
2033 punpckhbw %6, %5, m0
2036 punpckhdq m0, %3, %5
2038 punpckldq %5, %4, %6
; Eight 8-byte rows: rows 0-3 relative to r0, rows 4-7 relative to r6.
; NOTE(review): the doubled '+' in the first address parses as a unary plus;
; it is harmless but could be tidied.
2041 movh [r0 + + %1 * 8], %3
2042 movhps [r0 + r1 + %1 * 8], %3
2043 movh [r0 + r1*2 + %1 * 8], m0
2044 movhps [r0 + r5 + %1 * 8], m0
2045 movh [r6 + %1 * 8], %5
2046 movhps [r6 + r1 + %1 * 8], %5
2047 movh [r6 + r1*2 + %1 * 8], %4
2048 movhps [r6 + r5 + %1 * 8], %4
2050 ; store 8x8, used by angle BLOCK_16x16 and BLOCK_32x32
; No transpose: each register supplies two consecutive 8-byte rows (low half,
; then high half) and r0 is advanced by four rows after every four rows.
2052 movhps [r0 + r1 ], %3
2053 movh [r0 + r1 * 2], %4
2054 movhps [r0 + r5 ], %4
2055 lea r0, [r0 + r1 * 4]
2057 movhps [r0 + r1 ], %5
2058 movh [r0 + r1 * 2], %6
2059 movhps [r0 + r5 ], %6
2060 lea r0, [r0 + r1 * 4]
; intra_pred_ang16_3: 16x16 angular intra prediction, mode 3.
; x86inc prologue: 3 arguments loaded, 7 GPRs, 8 XMM registers.
; r0 = dst, r1 = dstStride per the intraPredAng16 prototype comment; reference
; pixels are read through r2.
; r3 stays at row 16 of ang_table; the bracketed number after each pmaddubsw
; is the table row it selects (16 + offset).
2065 cglobal intra_pred_ang16_3, 3,7,8
2067 lea r3, [ang_table + 16 * 16]
; r5 and r6 are the row offset / row pointer TRANSPOSE_STORE_8x8 addresses with.
2069 lea r5, [r1 * 3] ; r5 -> 3 * stride
2070 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
; punpckhbw interleaves the bytes of m0 and m1; palignr then slides a 16-byte
; window over the m2:m0 pair in 2-byte steps, and pmaddubsw weights each window
; with one table row.
2077 punpckhbw m2, m0, m1
2079 palignr m1, m2, m0, 2
2081 pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
2083 pmaddubsw m1, [r3 + 4 * 16] ; [20]
2087 palignr m5, m2, m0, 4
2089 pmaddubsw m5, [r3 - 2 * 16] ; [14]
2092 palignr m6, m2, m0, 6
2094 pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
2098 palignr m1, m2, m0, 8
2100 pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
2103 pmaddubsw m1, [r3 + 12 * 16] ; [28]
2107 palignr m1, m2, m0, 10
2109 pmaddubsw m1, [r3 + 6 * 16] ; [22]
2114 pmaddubsw m2, [r3] ; [16]
; First 8x8 tile.
2118 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
2123 punpckhbw m2, m0, m1
2125 palignr m5, m2, m0, 2
2127 pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
2129 pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
2133 pmaddubsw m5, [r3 + 14 * 16] ; [30]
2136 palignr m6, m2, m0, 4
2138 pmaddubsw m6, [r3 + 8 * 16] ; [24]
2142 palignr m1, m2, m0, 6
2144 pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
2147 palignr m1, m2, m0, 8
2149 pmaddubsw m1, [r3 - 4 * 16] ; [12]
2153 palignr m1, m2, m0, 10
2155 pmaddubsw m1, [r3 - 10 * 16] ; [06]
; Table row 0: eight pixels are loaded straight from r2 + 14 into the high
; half of m1 instead of going through pmaddubsw.
2159 movhps m1, [r2 + 14] ; [00]
; Second 8x8 tile.
2161 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
; Move both row pointers down by eight rows (r0 = old r6 + 4 rows).
2163 lea r0, [r6 + r1 * 4]
2164 lea r6, [r6 + r1 * 8]
; intra_pred_ang16_33: 16x16 angular intra prediction, mode 33.
; x86inc prologue: 3 arguments loaded, 7 GPRs, 8 XMM registers.
; r0 = dst, r1 = dstStride per the intraPredAng16 prototype comment; reference
; pixels are read through r2.
; r3 stays at row 16 of ang_table; the bracketed number after each pmaddubsw
; is the table row it selects (16 + offset). The row sequence matches
; intra_pred_ang16_3, but the tiles are stored without the transpose.
2172 cglobal intra_pred_ang16_33, 3,7,8
2174 lea r3, [ang_table + 16 * 16]
; punpckhbw interleaves the bytes of m0 and m1; palignr then slides a 16-byte
; window over the m2:m0 pair in 2-byte steps, and pmaddubsw weights each window
; with one table row.
2184 punpckhbw m2, m0, m1
2186 palignr m1, m2, m0, 2
2188 pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
2190 pmaddubsw m1, [r3 + 4 * 16] ; [20]
2194 palignr m5, m2, m0, 4
2196 pmaddubsw m5, [r3 - 2 * 16] ; [14]
2199 palignr m6, m2, m0, 6
2201 pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
2205 palignr m1, m2, m0, 8
2207 pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
2210 pmaddubsw m1, [r3 + 12 * 16] ; [28]
2214 palignr m1, m2, m0, 10
2216 pmaddubsw m1, [r3 + 6 * 16] ; [22]
2221 pmaddubsw m2, [r3] ; [16]
; First 8x8 tile.
2225 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
2230 punpckhbw m2, m0, m1
2232 palignr m5, m2, m0, 2
2234 pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
2236 pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
2240 pmaddubsw m5, [r3 + 14 * 16] ; [30]
2243 palignr m6, m2, m0, 4
2245 pmaddubsw m6, [r3 + 8 * 16] ; [24]
2249 palignr m1, m2, m0, 6
2251 pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
2254 palignr m1, m2, m0, 8
2256 pmaddubsw m1, [r3 - 4 * 16] ; [12]
2260 palignr m1, m2, m0, 10
2262 pmaddubsw m1, [r3 - 10 * 16] ; [06]
; Table row 0: eight pixels are loaded straight from r2 + 14 instead of going
; through pmaddubsw.
2266 movh m2, [r2 + 14] ; [00]
; The second tile is stored by hand: 8-byte halves of m4, m5, m6, m1 go to
; successive rows, with r0 advanced by four rows in between.
2269 movhps [r0 + r1 ], m4
2270 movh [r0 + r1 * 2], m5
2271 movhps [r0 + r5 ], m5
2272 lea r0, [r0 + r1 * 4]
2274 movhps [r0 + r1 ], m6
2275 movh [r0 + r1 * 2], m1
; intra_pred_ang16_4: 16x16 angular intra prediction, mode 4.
; x86inc prologue: 3 arguments loaded, 7 GPRs, 8 XMM registers.
; r0 = dst, r1 = dstStride per the intraPredAng16 prototype comment.
; r3 stays at row 16 of ang_table; the bracketed number after each pmaddubsw
; is the table row it selects (16 + offset).
2286 cglobal intra_pred_ang16_4, 3,7,8
2288 lea r3, [ang_table + 16 * 16]
; r5 and r6 are the row offset / row pointer TRANSPOSE_STORE_8x8 addresses with.
2290 lea r5, [r1 * 3] ; r5 -> 3 * stride
2291 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
; punpckhbw interleaves the bytes of m0 and m1; palignr then slides a 16-byte
; window over the m2:m0 pair in 2-byte steps, and pmaddubsw weights each window
; with one table row.
2298 punpckhbw m2, m0, m1
2300 palignr m1, m2, m0, 2
2303 pmaddubsw m4, m0, [r3 + 5 * 16] ; [21]
2305 pmaddubsw m1, [r3 - 6 * 16] ; [10]
2309 pmaddubsw m5, [r3 + 15 * 16] ; [31]
2312 palignr m6, m2, m0, 4
2314 pmaddubsw m6, [r3 + 4 * 16] ; [ 20]
2318 palignr m1, m2, m0, 6
2320 pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9]
2323 pmaddubsw m1, [r3 + 14 * 16] ; [30]
2327 palignr m1, m2, m0, 8
2329 pmaddubsw m1, [r3 + 3 * 16] ; [19]
2334 pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
; First 8x8 tile.
2338 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
2340 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
2346 punpckhbw m2, m0, m1
2348 palignr m1, m2, m0, 2
2350 pmaddubsw m1, [r3 + 2 * 16] ; [18]
2354 palignr m5, m2, m0, 4
2357 pmaddubsw m5, [r3 - 9 * 16] ; [07]
2360 pmaddubsw m6, [r3 + 12 * 16] ; [28]
2364 palignr m6, m2, m0, 6
2366 pmaddubsw m6, [r3 + 16] ; [17]
2369 palignr m1, m2, m0, 8
2372 pmaddubsw m3, m1, [r3 - 10 * 16] ; [06]
2376 pmaddubsw m1, [r3 + 11 * 16] ; [27]
2379 pmaddubsw m2, [r3] ; [16]
; Second 8x8 tile.
2383 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
; Move both row pointers down by eight rows (r0 = old r6 + 4 rows).
2385 lea r0, [r6 + r1 * 4]
2386 lea r6, [r6 + r1 * 8]
; intra_pred_ang16_32: 16x16 angular intra prediction, mode 32.
; x86inc prologue: 3 arguments loaded, 7 GPRs, 8 XMM registers.
; r0 = dst, r1 = dstStride per the intraPredAng16 prototype comment.
; r3 stays at row 16 of ang_table; the bracketed number after each pmaddubsw
; is the table row it selects (16 + offset). The row sequence matches
; intra_pred_ang16_4, but TRANSPOSE_STORE_8x8 is invoked with 0 as its second
; argument.
2394 cglobal intra_pred_ang16_32, 3,7,8
2396 lea r3, [ang_table + 16 * 16]
2398 lea r5, [r1 * 3] ; r5 -> 3 * stride
; punpckhbw interleaves the bytes of m0 and m1; palignr then slides a 16-byte
; window over the m2:m0 pair in 2-byte steps, and pmaddubsw weights each window
; with one table row.
2406 punpckhbw m2, m0, m1
2408 palignr m1, m2, m0, 2
2412 pmaddubsw m4, m0, [r3 + 5 * 16] ; [21]
2414 pmaddubsw m1, [r3 - 6 * 16] ; [10]
2418 pmaddubsw m5, [r3 + 15 * 16] ; [31]
2421 palignr m6, m2, m0, 4
2423 pmaddubsw m6, [r3 + 4 * 16] ; [ 20]
2427 palignr m1, m2, m0, 6
2429 pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9]
2432 pmaddubsw m1, [r3 + 14 * 16] ; [30]
2436 palignr m1, m2, m0, 8
2438 pmaddubsw m1, [r3 + 3 * 16] ; [19]
2443 pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
; First 8x8 tile.
2447 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
2449 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
2455 punpckhbw m2, m0, m1
2457 palignr m1, m2, m0, 2
2459 pmaddubsw m1, [r3 + 2 * 16] ; [18]
2463 palignr m5, m2, m0, 4
2466 pmaddubsw m5, [r3 - 9 * 16] ; [07]
2469 pmaddubsw m6, [r3 + 12 * 16] ; [28]
2473 palignr m6, m2, m0, 6
2475 pmaddubsw m6, [r3 + 16] ; [17]
2478 palignr m1, m2, m0, 8
2481 pmaddubsw m3, m1, [r3 - 10 * 16] ; [06]
2485 pmaddubsw m1, [r3 + 11 * 16] ; [27]
2488 pmaddubsw m2, [r3] ; [16]
; Second 8x8 tile.
2492 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
; intra_pred_ang16_5: 16x16 angular intra prediction, mode 5.
; x86inc prologue: 3 arguments loaded, 7 GPRs, 8 XMM registers.
; r0 = dst, r1 = dstStride per the intraPredAng16 prototype comment; reference
; pixels are read through r2.
; r3 stays at row 16 of ang_table; the bracketed number after each pmaddubsw
; is the table row it selects (16 + offset).
2502 cglobal intra_pred_ang16_5, 3,7,8
2504 lea r3, [ang_table + 16 * 16]
; r5 and r6 are the row offset / row pointer TRANSPOSE_STORE_8x8 addresses with.
2506 lea r5, [r1 * 3] ; r5 -> 3 * stride
2507 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
; Two loads one byte apart are interleaved so that every 16-bit lane of m2:m3
; holds a pair of adjacent reference pixels.
2511 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
2512 movu m1, [r2 + 2] ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
2513 punpckhbw m2, m3, m1 ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
2514 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
; palignr slides a 16-byte window over m2:m3 in 2-byte (one pair) steps;
; pmaddubsw weights each window with one table row.
2516 palignr m5, m2, m3, 2
2518 pmaddubsw m4, m3, [r3 + 16] ; [17]
2520 pmaddubsw m1, m5, [r3 - 14 * 16] ; [2]
2524 palignr m6, m2, m3, 4
2526 pmaddubsw m5, [r3 + 3 * 16] ; [19]
2528 pmaddubsw m1, m6, [r3 - 12 * 16] ; [4]
2532 palignr m1, m2, m3, 6
2534 pmaddubsw m6, [r3 + 5 * 16] ; [21]
2536 pmaddubsw m0, m1, [r3 - 10 * 16] ; [6]
2540 palignr m0, m2, m3, 8
2542 pmaddubsw m1, [r3 + 7 * 16] ; [23]
2544 pmaddubsw m0, [r3 - 8 * 16] ; [8]
; First 8x8 tile.
2548 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
2550 palignr m4, m2, m3, 8
2551 palignr m5, m2, m3, 10
2553 pmaddubsw m4, [r3 + 9 * 16] ; [25]
2555 pmaddubsw m1, m5, [r3 - 6 * 16] ; [10]
2559 palignr m6, m2, m3, 12
2561 pmaddubsw m5, [r3 + 11 * 16] ; [27]
2563 pmaddubsw m1, m6, [r3 - 4 * 16] ; [12]
2567 palignr m1, m2, m3, 14
2569 pmaddubsw m6, [r3 + 13 * 16] ; [29]
2571 pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
2575 pmaddubsw m1, [r3 + 15 * 16] ; [31]
2577 pmaddubsw m2, [r3] ; [16]
; Second 8x8 tile.
2581 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
; Move both row pointers down by eight rows (r0 = old r6 + 4 rows).
2583 lea r0, [r6 + r1 * 4]
2584 lea r6, [r6 + r1 * 8]
; intra_pred_ang16_31: 16x16 angular intra prediction, mode 31.
; x86inc prologue: 3 arguments loaded, 7 GPRs, 8 XMM registers.
; r0 = dst, r1 = dstStride per the intraPredAng16 prototype comment; reference
; pixels are read through r2.
; r3 stays at row 16 of ang_table; the bracketed number after each pmaddubsw
; is the table row it selects (16 + offset). The row sequence matches
; intra_pred_ang16_5, but TRANSPOSE_STORE_8x8 is invoked with 0 as its second
; argument.
2592 cglobal intra_pred_ang16_31, 3,7,8
2594 lea r3, [ang_table + 16 * 16]
2596 lea r5, [r1 * 3] ; r5 -> 3 * stride
; Two loads one byte apart are interleaved so that every 16-bit lane of m2:m3
; holds a pair of adjacent reference pixels.
2601 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
2602 movu m1, [r2 + 2] ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
2603 punpckhbw m2, m3, m1 ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
2604 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
; palignr slides a 16-byte window over m2:m3 in 2-byte (one pair) steps;
; pmaddubsw weights each window with one table row.
2606 palignr m5, m2, m3, 2
2608 pmaddubsw m4, m3, [r3 + 16] ; [17]
2610 pmaddubsw m1, m5, [r3 - 14 * 16] ; [2]
2614 palignr m6, m2, m3, 4
2616 pmaddubsw m5, [r3 + 3 * 16] ; [19]
2618 pmaddubsw m1, m6, [r3 - 12 * 16] ; [4]
2622 palignr m1, m2, m3, 6
2624 pmaddubsw m6, [r3 + 5 * 16] ; [21]
2626 pmaddubsw m0, m1, [r3 - 10 * 16] ; [6]
2630 palignr m0, m2, m3, 8
2632 pmaddubsw m1, [r3 + 7 * 16] ; [23]
2634 pmaddubsw m0, [r3 - 8 * 16] ; [8]
; First 8x8 tile.
2638 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
2640 palignr m4, m2, m3, 8
2641 palignr m5, m2, m3, 10
2643 pmaddubsw m4, [r3 + 9 * 16] ; [25]
2645 pmaddubsw m1, m5, [r3 - 6 * 16] ; [10]
2649 palignr m6, m2, m3, 12
2651 pmaddubsw m5, [r3 + 11 * 16] ; [27]
2653 pmaddubsw m1, m6, [r3 - 4 * 16] ; [12]
2657 palignr m1, m2, m3, 14
2659 pmaddubsw m6, [r3 + 13 * 16] ; [29]
2661 pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
2665 pmaddubsw m1, [r3 + 15 * 16] ; [31]
2667 pmaddubsw m2, [r3] ; [16]
; Second 8x8 tile.
2671 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
; intra_pred_ang16_6: 16x16 angular intra prediction, mode 6.
; x86inc prologue: 3 arguments loaded, 7 GPRs, 8 XMM registers.
; r0 = dst, r1 = dstStride per the intraPredAng16 prototype comment; reference
; pixels are read through r2.
; r3 stays at row 16 of ang_table; the bracketed number after each pmaddubsw
; is the table row it selects (16 + offset).
2681 cglobal intra_pred_ang16_6, 3,7,8
2683 lea r3, [ang_table + 16 * 16]
; r5 and r6 are the row offset / row pointer TRANSPOSE_STORE_8x8 addresses with.
2685 lea r5, [r1 * 3] ; r5 -> 3 * stride
2686 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
; m1 becomes m3 shifted down by one byte (its top byte is whatever m1 held
; before, the 'x' lane); interleaving m3 with m1 gives adjacent-pixel pairs.
2690 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
2691 palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
2692 punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
2693 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
; One pixel window can serve several table rows before palignr moves on to
; the next 2-byte offset.
2695 pmaddubsw m4, m3, [r3 - 3 * 16] ; [13]
2697 pmaddubsw m1, m3, [r3 + 10 * 16] ; [26]
2701 palignr m6, m2, m3, 2
2703 pmaddubsw m5, m6, [r3 - 9 * 16] ; [7]
2705 pmaddubsw m6, [r3 + 4 * 16] ; [20]
2709 palignr m1, m2, m3, 4
2711 pmaddubsw m6, m1, [r3 - 15 * 16] ; [1]
2713 pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
2717 palignr m0, m2, m3, 6
2719 pmaddubsw m1, [r3 + 11 * 16] ; [27]
2721 pmaddubsw m0, [r3 - 8 * 16] ; [8]
; First 8x8 tile.
2725 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
2727 palignr m4, m2, m3, 6
2728 palignr m6, m2, m3, 8
2730 pmaddubsw m4, [r3 + 5 * 16] ; [21]
2732 pmaddubsw m1, m6, [r3 - 14 * 16] ; [2]
2736 pmaddubsw m5, m6, [r3 - 16] ; [15]
2738 pmaddubsw m6, [r3 + 12 * 16] ; [28]
2742 palignr m0, m2, m3, 10
2744 pmaddubsw m6, m0, [r3 - 7 * 16] ; [9]
2746 pmaddubsw m0, [r3 + 6 * 16] ; [22]
2752 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
2754 pmaddubsw m2, [r3] ; [16]
; Second 8x8 tile.
2758 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
; Move both row pointers down by eight rows (r0 = old r6 + 4 rows).
2760 lea r0, [r6 + r1 * 4]
2761 lea r6, [r6 + r1 * 8]
; intra_pred_ang16_30: 16x16 angular intra prediction, mode 30.
; x86inc prologue: 3 arguments loaded, 7 GPRs, 8 XMM registers.
; r0 = dst, r1 = dstStride per the intraPredAng16 prototype comment; reference
; pixels are read through r2.
; r3 stays at row 16 of ang_table; the bracketed number after each pmaddubsw
; is the table row it selects (16 + offset). The row sequence matches
; intra_pred_ang16_6, but TRANSPOSE_STORE_8x8 is invoked with 0 as its second
; argument.
2769 cglobal intra_pred_ang16_30, 3,7,8
2771 lea r3, [ang_table + 16 * 16]
2773 lea r5, [r1 * 3] ; r5 -> 3 * stride
; m1 becomes m3 shifted down by one byte (its top byte is whatever m1 held
; before, the 'x' lane); interleaving m3 with m1 gives adjacent-pixel pairs.
2778 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
2779 palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
2780 punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
2781 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
; One pixel window can serve several table rows before palignr moves on to
; the next 2-byte offset.
2783 pmaddubsw m4, m3, [r3 - 3 * 16] ; [13]
2785 pmaddubsw m1, m3, [r3 + 10 * 16] ; [26]
2789 palignr m6, m2, m3, 2
2791 pmaddubsw m5, m6, [r3 - 9 * 16] ; [7]
2793 pmaddubsw m6, [r3 + 4 * 16] ; [20]
2797 palignr m1, m2, m3, 4
2799 pmaddubsw m6, m1, [r3 - 15 * 16] ; [1]
2801 pmaddubsw m0, m1, [r3 - 2 * 16] ; [14]
2805 palignr m0, m2, m3, 6
2807 pmaddubsw m1, [r3 + 11 * 16] ; [27]
2809 pmaddubsw m0, [r3 - 8 * 16] ; [8]
; First 8x8 tile.
2813 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
2815 palignr m4, m2, m3, 6
2816 palignr m6, m2, m3, 8
2818 pmaddubsw m4, [r3 + 5 * 16] ; [21]
2820 pmaddubsw m1, m6, [r3 - 14 * 16] ; [2]
2824 pmaddubsw m5, m6, [r3 - 16] ; [15]
2826 pmaddubsw m6, [r3 + 12 * 16] ; [28]
2830 palignr m0, m2, m3, 10
2832 pmaddubsw m6, m0, [r3 - 7 * 16] ; [9]
2834 pmaddubsw m0, [r3 + 6 * 16] ; [22]
2840 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
2842 pmaddubsw m2, [r3] ; [16]
; Second 8x8 tile.
2846 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
; intra_pred_ang16_7: 16x16 angular intra prediction, mode 7.
; x86inc prologue: 3 arguments loaded, 7 GPRs, 8 XMM registers.
; r0 = dst, r1 = dstStride per the intraPredAng16 prototype comment; reference
; pixels are read through r2.
; r3 stays at row 16 of ang_table; the bracketed number after each pmaddubsw
; is the table row it selects (16 + offset).
2856 cglobal intra_pred_ang16_7, 3,7,8
2858 lea r3, [ang_table + 16 * 16]
; r5 and r6 are the row offset / row pointer TRANSPOSE_STORE_8x8 addresses with.
2860 lea r5, [r1 * 3] ; r5 -> 3 * stride
2861 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
; m1 becomes m3 shifted down by one byte (its top byte is whatever m1 held
; before, the 'x' lane); interleaving m3 with m1 gives adjacent-pixel pairs.
2865 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
2866 palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
2867 punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
2868 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
; One pixel window serves several table rows before palignr moves on to the
; next 2-byte offset.
2870 pmaddubsw m4, m3, [r3 - 7 * 16] ; [9]
2872 pmaddubsw m0, m3, [r3 + 2 * 16] ; [18]
2876 palignr m1, m2, m3, 2
2878 pmaddubsw m5, m3, [r3 + 11 * 16] ; [27]
2880 pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
2884 pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
2886 pmaddubsw m0, m1, [r3 + 6 * 16] ; [22]
2890 palignr m0, m2, m3, 4
2892 pmaddubsw m1, [r3 + 15 * 16] ; [31]
2894 pmaddubsw m0, [r3 - 8 * 16] ; [8]
; First 8x8 tile.
2898 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
2900 palignr m1, m2, m3, 4
2902 pmaddubsw m4, m1, [r3 + 16] ; [17]
2904 pmaddubsw m1, [r3 + 10 * 16] ; [26]
2908 palignr m0, m2, m3, 6
2910 pmaddubsw m5, m0, [r3 - 13 * 16] ; [03]
2912 pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
2916 pmaddubsw m6, m0, [r3 + 5 * 16] ; [21]
2918 pmaddubsw m0, [r3 + 14 * 16] ; [30]
2924 pmaddubsw m1, m2, [r3 - 9 * 16] ; [07]
2926 pmaddubsw m2, [r3] ; [16]
; Second 8x8 tile.
2930 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
; Move both row pointers down by eight rows (r0 = old r6 + 4 rows).
2932 lea r0, [r6 + r1 * 4]
2933 lea r6, [r6 + r1 * 8]
; intra_pred_ang16_29: 16x16 angular intra prediction, mode 29.
; x86inc prologue: 3 arguments loaded, 7 GPRs, 8 XMM registers.
; r0 = dst, r1 = dstStride per the intraPredAng16 prototype comment; reference
; pixels are read through r2.
; r3 stays at row 16 of ang_table; the bracketed number after each pmaddubsw
; is the table row it selects (16 + offset). The row sequence matches
; intra_pred_ang16_7, but TRANSPOSE_STORE_8x8 is invoked with 0 as its second
; argument.
2941 cglobal intra_pred_ang16_29, 3,7,8
2943 lea r3, [ang_table + 16 * 16]
2945 lea r5, [r1 * 3] ; r5 -> 3 * stride
; m1 becomes m3 shifted down by one byte (its top byte is whatever m1 held
; before, the 'x' lane); interleaving m3 with m1 gives adjacent-pixel pairs.
2950 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
2951 palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
2952 punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
2953 punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
; One pixel window serves several table rows before palignr moves on to the
; next 2-byte offset.
2955 pmaddubsw m4, m3, [r3 - 7 * 16] ; [9]
2957 pmaddubsw m0, m3, [r3 + 2 * 16] ; [18]
2961 palignr m1, m2, m3, 2
2963 pmaddubsw m5, m3, [r3 + 11 * 16] ; [27]
2965 pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
2969 pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
2971 pmaddubsw m0, m1, [r3 + 6 * 16] ; [22]
2975 palignr m0, m2, m3, 4
2977 pmaddubsw m1, [r3 + 15 * 16] ; [31]
2979 pmaddubsw m0, [r3 - 8 * 16] ; [8]
; First 8x8 tile.
2983 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
2985 palignr m1, m2, m3, 4
2987 pmaddubsw m4, m1, [r3 + 16] ; [17]
2989 pmaddubsw m1, [r3 + 10 * 16] ; [26]
2993 palignr m0, m2, m3, 6
2995 pmaddubsw m5, m0, [r3 - 13 * 16] ; [03]
2997 pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
3001 pmaddubsw m6, m0, [r3 + 5 * 16] ; [21]
3003 pmaddubsw m0, [r3 + 14 * 16] ; [30]
3009 pmaddubsw m1, m2, [r3 - 9 * 16] ; [07]
3011 pmaddubsw m2, [r3] ; [16]
; Second 8x8 tile.
3015 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
; intra_pred_ang16_8: 16x16 angular intra prediction, mode 8.
; x86inc prologue: 3 arguments loaded, 7 GPRs, 8 XMM registers.
; r0 = dst, r1 = dstStride per the intraPredAng16 prototype comment; reference
; pixels are read through r2.
; r3 stays at row 16 of ang_table; the bracketed number after each pmaddubsw
; is the table row it selects (16 + offset).
3025 cglobal intra_pred_ang16_8, 3,7,8
3027 lea r3, [ang_table + 16 * 16]
; r5 and r6 are the row offset / row pointer TRANSPOSE_STORE_8x8 addresses with.
3029 lea r5, [r1 * 3] ; r5 -> 3 * stride
3030 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
; m3 becomes m1 shifted down by one byte (its top byte is whatever m3 held
; before, the 'x' lane); interleaving m1 with m3 gives adjacent-pixel pairs.
3034 movu m1, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3035 palignr m3, m1, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
3036 punpckhbw m0, m1, m3 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
3037 punpcklbw m1, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
; The unshifted window in m1 is weighted with six table rows in a row.
3039 pmaddubsw m4, m1, [r3 - 11 * 16] ; [5]
3041 pmaddubsw m2, m1, [r3 - 6 * 16] ; [10]
3045 pmaddubsw m5, m1, [r3 - 1 * 16] ; [15]
3047 pmaddubsw m6, m1, [r3 + 4 * 16] ; [20]
3051 pmaddubsw m6, m1, [r3 + 9 * 16] ; [25]
3053 pmaddubsw m2, m1, [r3 + 14 * 16] ; [30]
; m2 and m3 become the windows at 2- and 4-byte offsets into m0:m1.
3057 palignr m2, m0, m1, 2
3058 palignr m3, m0, m1, 4
3060 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
3062 pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
; First 8x8 tile.
3066 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
3068 pmaddubsw m4, m2, [r3 - 3 * 16] ; [13]
3070 pmaddubsw m5, m2, [r3 + 2 * 16] ; [18]
3074 pmaddubsw m5, m2, [r3 + 7 * 16] ; [23]
3076 pmaddubsw m2, [r3 + 12 * 16] ; [28]
3080 pmaddubsw m6, m3, [r3 - 15 * 16] ; [01]
3082 pmaddubsw m1, m3, [r3 - 10 * 16] ; [06]
3086 pmaddubsw m1, m3, [r3 - 5 * 16] ; [11]
3088 pmaddubsw m3, [r3] ; [16]
; Second 8x8 tile.
3092 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
; Move both row pointers down by eight rows (r0 = old r6 + 4 rows).
3094 lea r0, [r6 + r1 * 4]
3095 lea r6, [r6 + r1 * 8]
; intra_pred_ang16_28: 16x16 angular intra prediction, mode 28.
; x86inc prologue: 3 arguments loaded, 7 GPRs, 8 XMM registers.
; r0 = dst, r1 = dstStride per the intraPredAng16 prototype comment; reference
; pixels are read through r2.
; r3 stays at row 16 of ang_table; the bracketed number after each pmaddubsw
; is the table row it selects (16 + offset). The row sequence matches
; intra_pred_ang16_8, but TRANSPOSE_STORE_8x8 is invoked with 0 as its second
; argument.
3103 cglobal intra_pred_ang16_28, 3,7,8
3105 lea r3, [ang_table + 16 * 16]
3107 lea r5, [r1 * 3] ; r5 -> 3 * stride
; m3 becomes m1 shifted down by one byte (its top byte is whatever m3 held
; before, the 'x' lane); interleaving m1 with m3 gives adjacent-pixel pairs.
3112 movu m1, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3113 palignr m3, m1, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
3114 punpckhbw m0, m1, m3 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
3115 punpcklbw m1, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
; The unshifted window in m1 is weighted with six table rows in a row.
3117 pmaddubsw m4, m1, [r3 - 11 * 16] ; [5]
3119 pmaddubsw m2, m1, [r3 - 6 * 16] ; [10]
3123 pmaddubsw m5, m1, [r3 - 1 * 16] ; [15]
3125 pmaddubsw m6, m1, [r3 + 4 * 16] ; [20]
3129 pmaddubsw m6, m1, [r3 + 9 * 16] ; [25]
3131 pmaddubsw m2, m1, [r3 + 14 * 16] ; [30]
; m2 and m3 become the windows at 2- and 4-byte offsets into m0:m1.
3135 palignr m2, m0, m1, 2
3136 palignr m3, m0, m1, 4
3138 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
3140 pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
; First 8x8 tile.
3144 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
3146 pmaddubsw m4, m2, [r3 - 3 * 16] ; [13]
3148 pmaddubsw m5, m2, [r3 + 2 * 16] ; [18]
3152 pmaddubsw m5, m2, [r3 + 7 * 16] ; [23]
3154 pmaddubsw m2, [r3 + 12 * 16] ; [28]
3158 pmaddubsw m6, m3, [r3 - 15 * 16] ; [01]
3160 pmaddubsw m1, m3, [r3 - 10 * 16] ; [06]
3164 pmaddubsw m1, m3, [r3 - 5 * 16] ; [11]
3166 pmaddubsw m3, [r3] ; [16]
; Second 8x8 tile.
3170 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
; intra_pred_ang16_9: 16x16 angular intra prediction, mode 9.
; x86inc prologue: 3 arguments loaded, 7 GPRs, 8 XMM registers.
; r0 = dst, r1 = dstStride per the intraPredAng16 prototype comment; reference
; pixels are read through r2.
; r3 stays at row 16 of ang_table; the bracketed number after each pmaddubsw
; is the table row it selects (16 + offset).
3180 cglobal intra_pred_ang16_9, 3,7,8
3182 lea r3, [ang_table + 16 * 16]
; r5 and r6 are the row offset / row pointer TRANSPOSE_STORE_8x8 addresses with.
3184 lea r5, [r1 * 3] ; r5 -> 3 * stride
3185 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
; m3 becomes m2 shifted down by one byte; the low halves are interleaved into
; adjacent-pixel pairs, and that single window in m2 feeds every table row.
3189 movu m2, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3190 palignr m3, m2, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
3191 punpcklbw m2, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3193 pmaddubsw m4, m2, [r3 - 14 * 16] ; [2]
3195 pmaddubsw m0, m2, [r3 - 12 * 16] ; [4]
3199 pmaddubsw m5, m2, [r3 - 10 * 16] ; [6]
3201 pmaddubsw m6, m2, [r3 - 8 * 16] ; [8]
3205 pmaddubsw m6, m2, [r3 - 6 * 16] ; [10]
3207 pmaddubsw m0, m2, [r3 - 4 * 16] ; [12]
3211 pmaddubsw m1, m2, [r3 - 2 * 16] ; [14]
3213 pmaddubsw m0, m2, [r3] ; [16]
; First 8x8 tile.
3217 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
3219 pmaddubsw m4, m2, [r3 + 2 * 16] ; [18]
3221 pmaddubsw m5, m2, [r3 + 4 * 16] ; [20]
3225 pmaddubsw m5, m2, [r3 + 6 * 16] ; [22]
3227 pmaddubsw m6, m2, [r3 + 8 * 16] ; [24]
3231 pmaddubsw m6, m2, [r3 + 10 * 16] ; [26]
3233 pmaddubsw m1, m2, [r3 + 12 * 16] ; [28]
3237 pmaddubsw m1, m2, [r3 + 14 * 16] ; [30]
; Table row 0: the low 8 bytes of m3 (the shifted pixels) become the high half
; of m1 without being weighted.
3241 punpcklqdq m1, m3 ; [00]
; Second 8x8 tile.
3243 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
; Move both row pointers down by eight rows (r0 = old r6 + 4 rows).
3245 lea r0, [r6 + r1 * 4]
3246 lea r6, [r6 + r1 * 8]
; intra_pred_ang16_27: 16x16 angular intra prediction, mode 27.
; x86inc prologue: 3 arguments loaded, 7 GPRs, 8 XMM registers.
; r0 = dst, r1 = dstStride per the intraPredAng16 prototype comment; reference
; pixels are read through r2.
; r3 stays at row 16 of ang_table; the bracketed number after each pmaddubsw
; is the table row it selects (16 + offset). The row sequence matches
; intra_pred_ang16_9, but the tiles are stored without the transpose.
3254 cglobal intra_pred_ang16_27, 3,7,8
3256 lea r3, [ang_table + 16 * 16]
3258 lea r5, [r1 * 3] ; r5 -> 3 * stride
; m2 becomes m3 shifted down by one byte; the low halves are interleaved into
; adjacent-pixel pairs, and that single window in m3 feeds every table row.
3263 movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3264 palignr m2, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
3265 punpcklbw m3, m2 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
3267 pmaddubsw m4, m3, [r3 - 14 * 16] ; [2]
3269 pmaddubsw m0, m3, [r3 - 12 * 16] ; [4]
3273 pmaddubsw m5, m3, [r3 - 10 * 16] ; [6]
3275 pmaddubsw m6, m3, [r3 - 8 * 16] ; [8]
3279 pmaddubsw m6, m3, [r3 - 6 * 16] ; [10]
3281 pmaddubsw m0, m3, [r3 - 4 * 16] ; [12]
3285 pmaddubsw m1, m3, [r3 - 2 * 16] ; [14]
3287 pmaddubsw m0, m3, [r3] ; [16]
; First 8x8 tile.
3291 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
3293 pmaddubsw m4, m3, [r3 + 2 * 16] ; [18]
3295 pmaddubsw m5, m3, [r3 + 4 * 16] ; [20]
3299 pmaddubsw m5, m3, [r3 + 6 * 16] ; [22]
3301 pmaddubsw m6, m3, [r3 + 8 * 16] ; [24]
3305 pmaddubsw m6, m3, [r3 + 10 * 16] ; [26]
3307 pmaddubsw m1, m3, [r3 + 12 * 16] ; [28]
3311 pmaddubsw m1, m3, [r3 + 14 * 16] ; [30]
; The second tile is stored by hand: 8-byte halves of m4, m5, m6, m1 go to
; successive rows, with r0 advanced by four rows in between.
3316 movhps [r0 + r1 ], m4
3317 movh [r0 + r1 * 2], m5
3318 movhps [r0 + r5 ], m5
3319 lea r0, [r0 + r1 * 4]
3321 movhps [r0 + r1 ], m6
3322 movh [r0 + r1 * 2], m1
; intra_pred_ang16_10: 16x16 angular intra prediction, mode 10.
; x86inc prologue: 6 arguments loaded, 6 GPRs, 8 XMM registers.
; r0 = dst, r1 = dstStride per the intraPredAng16 prototype comment.
; r2 is reused as a destination row pointer that steps four rows at a time, so
; the visible 16-byte stores land on rows 2, 6, 10 and 14.
3333 cglobal intra_pred_ang16_10, 6,6,8
3352 movu [r0 + r1 * 2], m2
3354 lea r2, [r0 + r1 * 4]
3357 movu [r2 + r1 * 2], m6
3373 lea r2, [r2 + r1 * 4]
3376 movu [r2 + r1 * 2], m4
3378 lea r2, [r2 + r1 * 4]
3390 movu [r2 + r1 * 2], m2
; intra_pred_ang16_26: 16x16 angular intra prediction, mode 26.
; Two alternative x86inc prologues are declared: the ARCH_X86_64 one loads 4
; arguments and uses 8 GPRs / 5 XMM registers; the other loads 6 arguments,
; uses 7 GPRs / 5 XMM registers and passes "0 - 4" as the stack-size argument,
; with bfilter naming the dword at [rsp].
; NOTE(review): the two cglobal lines are expected to sit in the two arms of
; the %if -- confirm the %else / %endif placement.
; r0 = dst, r1 = dstStride per the intraPredAng16 prototype comment.
3420 %if ARCH_X86_64 == 1
3421 cglobal intra_pred_ang16_26, 4,8,5
3425 cglobal intra_pred_ang16_26, 6,7,5,0 - 4
3426 %define bfilter dword[rsp]
; r3, r5, r6 = dst + 4, 8, 12 rows.
3432 lea r3, [r0 + r1 * 4]
3433 lea r5, [r3 + r1 * 4]
3434 lea r6, [r5 + r1 * 4]
; The same 16-byte vector m0 is written to rows 2, 6, 10 and 14.
3438 movu [r0 + r1 * 2], m0
3442 movu [r3 + r1 * 2], m0
3446 movu [r5 + r1 * 2], m0
3451 movu [r6 + r1 * 2], m0
; Single bytes of m0 are then written to the first byte of individual rows:
; byte k goes to row k (NOTE(review): assumes r4 = 3 * stride -- confirm).
3479 pextrb [r0 + r1], m0, 1
3480 pextrb [r0 + r1 * 2], m0, 2
3481 pextrb [r0 + r4], m0, 3
3483 pextrb [r3 + r1], m0, 5
3484 pextrb [r3 + r1 * 2], m0, 6
3485 pextrb [r3 + r4], m0, 7
3487 pextrb [r5 + r1], m0, 9
3488 pextrb [r5 + r1 * 2], m0, 10
3489 pextrb [r5 + r4], m0, 11
3491 pextrb [r6 + r1], m0, 13
3492 pextrb [r6 + r1 * 2], m0, 14
3493 pextrb [r6 + r4], m0, 15
; intra_pred_ang16_11: 16x16 angular intra prediction, mode 11.
; x86inc prologue: 3 arguments loaded, 7 GPRs, 8 XMM registers.
; r0 = dst, r1 = dstStride per the intraPredAng16 prototype comment; reference
; pixels are read through r2.
; r3 stays at row 16 of ang_table; the bracketed number after each pmaddubsw
; is the table row it selects (16 + offset).
3499 cglobal intra_pred_ang16_11, 3,7,8
3501 lea r3, [ang_table + 16 * 16]
; r5 and r6 are the row offset / row pointer TRANSPOSE_STORE_8x8 addresses with.
3503 lea r5, [r1 * 3] ; r5 -> 3 * stride
3504 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
; m1 becomes m3 shifted down by one byte; the low halves are interleaved into
; adjacent-pixel pairs, and that single window in m3 feeds every table row.
3508 movu m3, [r2] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
3510 palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3511 punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
3513 pmaddubsw m4, m3, [r3 + 14 * 16] ; [30]
3515 pmaddubsw m0, m3, [r3 + 12 * 16] ; [28]
3519 pmaddubsw m5, m3, [r3 + 10 * 16] ; [26]
3521 pmaddubsw m6, m3, [r3 + 8 * 16] ; [24]
3525 pmaddubsw m6, m3, [r3 + 6 * 16] ; [22]
3527 pmaddubsw m0, m3, [r3 + 4 * 16] ; [20]
3531 pmaddubsw m1, m3, [r3 + 2 * 16] ; [18]
3533 pmaddubsw m0, m3, [r3] ; [16]
; First 8x8 tile.
3537 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
3539 pmaddubsw m4, m3, [r3 - 2 * 16] ; [14]
3541 pmaddubsw m5, m3, [r3 - 4 * 16] ; [12]
3545 pmaddubsw m5, m3, [r3 - 6 * 16] ; [10]
3547 pmaddubsw m6, m3, [r3 - 8 * 16] ; [08]
3551 pmaddubsw m6, m3, [r3 - 10 * 16] ; [06]
3553 pmaddubsw m1, m3, [r3 - 12 * 16] ; [04]
3557 pmaddubsw m1, m3, [r3 - 14 * 16] ; [02]
; Table row 0: the low 8 bytes of m2 become the high half of m1 without being
; weighted.
3560 punpcklqdq m1, m2 ;[00]
; Second 8x8 tile.
3562 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
; Move both row pointers down by eight rows (r0 = old r6 + 4 rows).
3564 lea r0, [r6 + r1 * 4]
3565 lea r6, [r6 + r1 * 8]
; intra_pred_ang16_25: 16x16 angular intra prediction, mode 25.
; x86inc prologue: 3 arguments loaded, 7 GPRs, 8 XMM registers.
; r0 = dst, r1 = dstStride per the intraPredAng16 prototype comment; reference
; pixels are read through r2.
; r3 stays at row 16 of ang_table; the bracketed number after each pmaddubsw
; is the table row it selects (16 + offset). The row sequence matches
; intra_pred_ang16_11, but the tiles are stored without the transpose.
3573 cglobal intra_pred_ang16_25, 3,7,8
3575 lea r3, [ang_table + 16 * 16]
3577 lea r5, [r1 * 3] ; r5 -> 3 * stride
; m1 becomes m3 shifted down by one byte; the low halves are interleaved into
; adjacent-pixel pairs, and that single window in m3 feeds every table row.
3582 movu m3, [r2] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
3584 palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3585 punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
3587 pmaddubsw m4, m3, [r3 + 14 * 16] ; [30]
3589 pmaddubsw m0, m3, [r3 + 12 * 16] ; [28]
3593 pmaddubsw m5, m3, [r3 + 10 * 16] ; [26]
3595 pmaddubsw m6, m3, [r3 + 8 * 16] ; [24]
3599 pmaddubsw m6, m3, [r3 + 6 * 16] ; [22]
3601 pmaddubsw m0, m3, [r3 + 4 * 16] ; [20]
3605 pmaddubsw m1, m3, [r3 + 2 * 16] ; [18]
3607 pmaddubsw m0, m3, [r3] ; [16]
; First 8x8 tile.
3611 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
3613 pmaddubsw m4, m3, [r3 - 2 * 16] ; [14]
3615 pmaddubsw m5, m3, [r3 - 4 * 16] ; [12]
3619 pmaddubsw m5, m3, [r3 - 6 * 16] ; [10]
3621 pmaddubsw m6, m3, [r3 - 8 * 16] ; [08]
3625 pmaddubsw m6, m3, [r3 - 10 * 16] ; [06]
3627 pmaddubsw m1, m3, [r3 - 12 * 16] ; [04]
3631 pmaddubsw m1, m3, [r3 - 14 * 16] ; [02]
; The second tile is stored by hand: 8-byte halves of m4, m5, m6, m1 go to
; successive rows, with r0 advanced by four rows in between.
3636 movhps [r0 + r1 ], m4
3637 movh [r0 + r1 * 2], m5
3638 movhps [r0 + r5 ], m5
3639 lea r0, [r0 + r1 * 4]
3641 movhps [r0 + r1 ], m6
3642 movh [r0 + r1 * 2], m1
; intra_pred_ang16_12: 16x16 angular intra prediction, mode 12.
; x86inc prologue: 4 arguments loaded, 7 GPRs, 8 XMM registers.
; r0 = dst, r1 = dstStride per the intraPredAng16 prototype comment; reference
; pixels are read through r2.
; r4 stays at row 16 of ang_table; the bracketed number after each pmaddubsw
; is the table row it selects (16 + offset).
; The function is written as two passes of two 8x8 tiles each; both passes use
; the same table-row sequence.
3653 cglobal intra_pred_ang16_12, 4,7,8
3655 lea r4, [ang_table + 16 * 16]
; r5 and r6 are the row offset / row pointer TRANSPOSE_STORE_8x8 addresses with.
3656 lea r5, [r1 * 3] ; r5 -> 3 * stride
3657 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
; First pass: every pixel is duplicated by unpacking m3 with itself, and a
; one-byte palignr turns the duplicates into adjacent-pixel pairs.
3660 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
3661 punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
3662 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
; m2 is permuted with the c_mode16_12 mask.
3664 pshufb m2, [c_mode16_12]
3666 palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
3668 pmaddubsw m4, m0, [r4 + 11 * 16] ; [27]
3670 pmaddubsw m1, m0, [r4 + 6 * 16] ; [22]
3674 pmaddubsw m5, m0, [r4 + 1 * 16] ; [17]
3676 pmaddubsw m6, m0, [r4 - 4 * 16] ; [12]
3680 pmaddubsw m6, m0, [r4 - 9 * 16] ; [7]
3682 pmaddubsw m0, [r4 - 14 * 16] ; [2]
3688 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
3690 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
; First pass, first 8x8 tile.
3694 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
3696 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
3698 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
3702 pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
3704 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
3710 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
3712 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
3716 pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
3718 pmaddubsw m3, [r4] ; [16]
; First pass, second 8x8 tile.
3722 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
; Move both row pointers down by eight rows (r0 = old r6 + 4 rows).
3724 lea r0, [r6 + r1 * 4]
3725 lea r6, [r6 + r1 * 8]
; Second pass: pixel pairs are built from the upper half of the r2 + 1 load,
; and the lower 8 pixels of that load are copied into the high half of m2.
3727 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3728 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
3729 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
3730 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x]
3732 pmaddubsw m4, m3, [r4 + 11 * 16] ; [27]
3734 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
3738 pmaddubsw m5, m3, [r4 + 1 * 16] ; [17]
3740 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
3744 pmaddubsw m6, m3, [r4 - 9 * 16] ; [7]
3746 pmaddubsw m0, m3, [r4 - 14 * 16] ; [2]
3752 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
3754 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
; Second pass, first 8x8 tile.
3758 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
3760 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
3762 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
3766 pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
3768 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
3775 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
3777 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
3781 pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
3783 pmaddubsw m3, [r4] ; [16]
; Second pass, second 8x8 tile.
3787 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
; intra_pred_ang16_24: 16x16 angular intra prediction, mode 24.
; x86inc prologue: 4 arguments loaded, 7 GPRs, 8 XMM registers.
; r0 = dst, r1 = dstStride, r3 = refAbove per the intraPredAng16 prototype
; comment; the visible reference loads all go through r3.
; r4 stays at row 16 of ang_table; the bracketed number after each pmaddubsw
; is the table row it selects (16 + offset). The row sequence and the
; c_mode16_12 shuffle mask match intra_pred_ang16_12, but TRANSPOSE_STORE_8x8
; is invoked with 0 as its second argument.
; The function is written as two passes of two 8x8 tiles each.
3792 cglobal intra_pred_ang16_24, 4,7,8
3794 lea r4, [ang_table + 16 * 16]
3795 lea r5, [r1 * 3] ; r5 -> 3 * stride
; First pass: every pixel is duplicated by unpacking m3 with itself, and a
; one-byte palignr turns the duplicates into adjacent-pixel pairs.
3799 movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
3800 punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
3801 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
; m2 is permuted with the c_mode16_12 mask.
3803 pshufb m2, [c_mode16_12]
3805 palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
3807 pmaddubsw m4, m0, [r4 + 11 * 16] ; [27]
3809 pmaddubsw m1, m0, [r4 + 6 * 16] ; [22]
3813 pmaddubsw m5, m0, [r4 + 1 * 16] ; [17]
3815 pmaddubsw m6, m0, [r4 - 4 * 16] ; [12]
3819 pmaddubsw m6, m0, [r4 - 9 * 16] ; [7]
3821 pmaddubsw m0, [r4 - 14 * 16] ; [2]
3827 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
3829 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
; First pass, first 8x8 tile.
3833 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
3835 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
3837 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
3841 pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
3843 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
3849 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
3851 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
3855 pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
3857 pmaddubsw m3, [r4] ; [16]
; First pass, second 8x8 tile.
3861 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
; Second pass: pixel pairs are built from the upper half of the r3 + 1 load,
; and the lower 8 pixels of that load are copied into the high half of m2.
3865 movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
3866 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
3867 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
3868 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x]
3870 pmaddubsw m4, m3, [r4 + 11 * 16] ; [27]
3872 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
3876 pmaddubsw m5, m3, [r4 + 1 * 16] ; [17]
3878 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
3882 pmaddubsw m6, m3, [r4 - 9 * 16] ; [7]
3884 pmaddubsw m0, m3, [r4 - 14 * 16] ; [2]
3890 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
3892 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
; Second pass, first 8x8 tile.
3896 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
3898 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
3900 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
3904 pmaddubsw m5, m3, [r4 - 7 * 16] ; [09]
3906 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
3913 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
3915 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
3919 pmaddubsw m1, m3, [r4 + 5 * 16] ; [21]
3921 pmaddubsw m3, [r4] ; [16]
; Second pass, second 8x8 tile.
3925 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
; intra_pred_ang16_13: 16x16 angular intra prediction, mode 13.
; x86inc prologue: 4 arguments loaded, 7 GPRs, 8 XMM registers.
; r0 = dst, r1 = dstStride per the intraPredAng16 prototype comment; reference
; pixels are read through r2.
; r4 stays at row 16 of ang_table; the bracketed number after each pmaddubsw
; is the table row it selects (16 + offset).
; The function is written as two passes of two 8x8 tiles each; both passes use
; the same table-row sequence.
3930 cglobal intra_pred_ang16_13, 4,7,8
3932 lea r4, [ang_table + 16 * 16]
; r5 and r6 are the row offset / row pointer TRANSPOSE_STORE_8x8 addresses with.
3933 lea r5, [r1 * 3] ; r5 -> 3 * stride
3934 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
; First pass: every pixel is duplicated by unpacking m3 with itself, and a
; one-byte palignr turns the duplicates into adjacent-pixel pairs.
3937 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
3938 punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
3939 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
; m2 is permuted with the c_mode16_13 mask.
3941 pshufb m2, [c_mode16_13]
3943 palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
3945 pmaddubsw m4, m5, [r4 + 7 * 16] ; [23]
3947 pmaddubsw m0, m5, [r4 - 2 * 16] ; [14]
3951 pmaddubsw m5, [r4 - 11 * 16] ; [05]
3956 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
3960 pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
3962 pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
3966 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
3971 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
; First pass, first 8x8 tile.
3975 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
3977 pmaddubsw m4, m3, [r4 - 16] ; [15]
3979 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
3986 pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
3988 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
3992 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
3994 pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
4001 pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
4003 pmaddubsw m3, [r4] ; [16]
; First pass, second 8x8 tile.
4007 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
; Move both row pointers down by eight rows (r0 = old r6 + 4 rows).
4009 lea r0, [r6 + r1 * 4]
4010 lea r6, [r6 + r1 * 8]
; Second pass: pixel pairs are built from the upper half of the r2 + 1 load,
; and the lower 8 pixels of that load are copied into the high half of m2.
4012 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
4013 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
4014 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
4015 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x]
4017 pmaddubsw m4, m3, [r4 + 7 * 16] ; [23]
4019 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
4023 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
4028 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
4032 pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
4034 pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
4038 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
4044 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
; Second pass, first 8x8 tile.
4048 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
4050 pmaddubsw m4, m3, [r4 - 16] ; [15]
4052 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
4059 pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
4061 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
4065 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
4067 pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
4074 pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
4076 pmaddubsw m3, [r4] ; [16]
; Second pass, second 8x8 tile.
4080 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
;-----------------------------------------------------------------------------
; intra_pred_ang16_23: angular intra predictor (16x16 block, mode 23, going by
; the symbol name). It uses the same ang_table entries, in the same order, as
; intra_pred_ang16_13, but reads its reference samples through r3 instead of
; r2 and passes 0 as the second TRANSPOSE_STORE_8x8 argument.
;   r1 - stride (r5 = 3 * r1).
;   r3 - reference samples read as the main input ([r3] and [r3 + 1]).
;   r4 - ang_table + 16 * 16, so [r4 + k * 16] is table entry 16 + k; the
;        bracketed number ending each pmaddubsw line is that entry index.
; NOTE(review): the second TRANSPOSE_STORE_8x8 argument is presumably a
; transpose flag (0 = store without transposing) - confirm against the macro.
;-----------------------------------------------------------------------------
4085 cglobal intra_pred_ang16_23, 4,7,8
4087 lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16
4088 lea r5, [r1 * 3] ; r5 -> 3 * stride
; ---- first pass: byte pairs built from reference bytes 0..8 ----
4092 movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4093 punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
4094 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
4096 pshufb m2, [c_mode16_13]
4098 palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4100 pmaddubsw m4, m5, [r4 + 7 * 16] ; [23]
4102 pmaddubsw m0, m5, [r4 - 2 * 16] ; [14]
4106 pmaddubsw m5, [r4 - 11 * 16] ; [05]
4111 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
4115 pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
4117 pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
4121 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
4126 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4130 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
4132 pmaddubsw m4, m3, [r4 - 16] ; [15]
4134 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
4141 pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
4143 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
4147 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
4149 pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
4156 pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
4158 pmaddubsw m3, [r4] ; [16]
4162 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
; ---- second pass: byte pairs built from reference bytes 8..16 ----
4166 movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
4167 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
4168 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
4169 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x]
4171 pmaddubsw m4, m3, [r4 + 7 * 16] ; [23]
4173 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
4177 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
4182 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
4186 pmaddubsw m6, m3, [r4 + 3 * 16] ; [19]
4188 pmaddubsw m0, m3, [r4 - 6 * 16] ; [10]
4192 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
4198 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4202 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
4204 pmaddubsw m4, m3, [r4 - 16] ; [15]
4206 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
4213 pmaddubsw m5, m3, [r4 + 13 * 16] ; [29]
4215 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
4219 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
4221 pmaddubsw m1, m3, [r4 - 14 * 16] ; [02]
4228 pmaddubsw m1, m3, [r4 + 9 * 16] ; [25]
4230 pmaddubsw m3, [r4] ; [16]
4234 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
;-----------------------------------------------------------------------------
; intra_pred_ang16_14: angular intra predictor (16x16 block, mode 14, going by
; the symbol name).
;   r0/r1 - presumably dst and dstStride, as in the intraPredAng32 prototype
;           comment later in this file (TODO confirm); r5 = 3 * r1 and
;           r6 = r0 + 4 * r1 are derived from them.
;   r2    - reference samples read as the main input ([r2] and [r2 + 1]).
;   r4    - ang_table + 16 * 16, so [r4 + k * 16] is table entry 16 + k; the
;           bracketed number ending each pmaddubsw line is that entry index.
; Two passes are made: the first over reference bytes 0..8, the second over
; bytes 8..16 after r0 and r6 have each moved on by 8 * r1.
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this block; its second
; argument is 1 here and 0 in intra_pred_ang16_22 - presumably a transpose
; flag, confirm against the macro.
;-----------------------------------------------------------------------------
4239 cglobal intra_pred_ang16_14, 4,7,8
4241 lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16
4242 lea r5, [r1 * 3] ; r5 -> 3 * stride
4243 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
; ---- first pass: byte pairs built from reference bytes 0..8 ----
4246 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4247 punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
4248 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
4250 pshufb m2, [c_mode16_14]
4252 palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4254 pmaddubsw m4, m5, [r4 + 3 * 16] ; [19]
4256 pmaddubsw m5, [r4 - 10 * 16] ; [06]
4262 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
4264 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
4270 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
4272 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
4276 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
4282 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4286 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
4288 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
4294 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4298 pmaddubsw m5, m3, [r4 + 16] ; [17]
4300 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
4307 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
4309 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
4316 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
4318 pmaddubsw m3, [r4] ; [16]
4322 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
; ---- second pass: r0 and r6 each advance by 8 * r1, reference bytes 8..16 ----
4324 lea r0, [r6 + r1 * 4]
4325 lea r6, [r6 + r1 * 8]
4327 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
4328 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
4329 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
4330 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x]
4332 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
4334 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
4340 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
4342 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
4349 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
4351 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
4355 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
4361 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4365 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
4367 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
4373 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4377 pmaddubsw m5, m3, [r4 + 16] ; [17]
4379 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
4386 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
4388 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
4395 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
4397 pmaddubsw m3, [r4] ; [16]
4401 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
;-----------------------------------------------------------------------------
; intra_pred_ang16_22: angular intra predictor (16x16 block, mode 22, going by
; the symbol name). It uses the same ang_table entries, in the same order, as
; intra_pred_ang16_14, but reads its reference samples through r3 instead of
; r2 and passes 0 as the second TRANSPOSE_STORE_8x8 argument.
;   r1 - stride (r5 = 3 * r1).
;   r3 - reference samples read as the main input ([r3] and [r3 + 1]).
;   r4 - ang_table + 16 * 16, so [r4 + k * 16] is table entry 16 + k; the
;        bracketed number ending each pmaddubsw line is that entry index.
; NOTE(review): the second TRANSPOSE_STORE_8x8 argument is presumably a
; transpose flag (0 = store without transposing) - confirm against the macro.
;-----------------------------------------------------------------------------
4406 cglobal intra_pred_ang16_22, 4,7,8
4408 lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16
4409 lea r5, [r1 * 3] ; r5 -> 3 * stride
; ---- first pass: byte pairs built from reference bytes 0..8 ----
4413 movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4414 punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
4415 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
4417 pshufb m2, [c_mode16_14]
4419 palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4421 pmaddubsw m4, m5, [r4 + 3 * 16] ; [19]
4423 pmaddubsw m5, [r4 - 10 * 16] ; [06]
4429 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
4431 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
4437 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
4439 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
4443 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
4449 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4453 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
4455 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
4461 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4465 pmaddubsw m5, m3, [r4 + 16] ; [17]
4467 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
4474 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
4476 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
4483 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
4485 pmaddubsw m3, [r4] ; [16]
4489 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
; ---- second pass: byte pairs built from reference bytes 8..16 ----
4493 movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
4494 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
4495 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
4496 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x]
4498 pmaddubsw m4, m3, [r4 + 3 * 16] ; [19]
4500 pmaddubsw m5, m3, [r4 - 10 * 16] ; [06]
4506 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
4508 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
4515 pmaddubsw m6, m3, [r4 + 15 * 16] ; [31]
4517 pmaddubsw m0, m3, [r4 + 2 * 16] ; [18]
4521 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
4527 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4531 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
4533 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
4539 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4543 pmaddubsw m5, m3, [r4 + 16] ; [17]
4545 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
4552 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
4554 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
4561 pmaddubsw m1, m3, [r4 + 13 * 16] ; [29]
4563 pmaddubsw m3, [r4] ; [16]
4567 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
;-----------------------------------------------------------------------------
; intra_pred_ang16_15: angular intra predictor (16x16 block, mode 15, going by
; the symbol name).
;   r0/r1 - presumably dst and dstStride, as in the intraPredAng32 prototype
;           comment later in this file (TODO confirm); r5 = 3 * r1 and
;           r6 = r0 + 4 * r1 are derived from them.
;   r2    - reference samples read as the main input ([r2] and [r2 + 1]).
;   r4    - ang_table + 16 * 16, so [r4 + k * 16] is table entry 16 + k; the
;           bracketed number ending each pmaddubsw line is that entry index.
; Two passes are made: the first over reference bytes 0..8, the second over
; bytes 8..16 after r0 and r6 have each moved on by 8 * r1.
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this block; its second
; argument is 1 here and 0 in intra_pred_ang16_21 - presumably a transpose
; flag, confirm against the macro.
;-----------------------------------------------------------------------------
4572 cglobal intra_pred_ang16_15, 4,7,8
4574 lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16
4575 lea r5, [r1 * 3] ; r5 -> 3 * stride
4576 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
; ---- first pass: byte pairs built from reference bytes 0..8 ----
4579 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4580 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
4581 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
4583 pshufb m2, [c_mode16_15]
4585 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4587 pmaddubsw m4, [r4 - 16] ; [15]
4592 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4596 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
4601 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
4605 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
4611 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
4615 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
4621 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4625 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
4627 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
4633 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
4637 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
4643 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
4647 pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
4653 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
4657 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
4663 pmaddubsw m3, [r4] ; [16]
4667 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
; ---- second pass: r0 and r6 each advance by 8 * r1, reference bytes 8..16 ----
4669 lea r0, [r6 + r1 * 4]
4670 lea r6, [r6 + r1 * 8]
4672 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
4673 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
4674 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
4675 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L]
4677 pmaddubsw m4, m3, [r4 - 16] ; [15]
4682 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4686 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
4692 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
4696 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
4702 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
4706 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
4712 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4716 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
4718 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
4724 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
4728 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
4734 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
4738 pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
4744 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
4748 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
4754 pmaddubsw m3, [r4] ; [16]
4758 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
;-----------------------------------------------------------------------------
; intra_pred_ang16_21: angular intra predictor (16x16 block, mode 21, going by
; the symbol name). It uses the same ang_table entries, in the same order, as
; intra_pred_ang16_15, but reads its reference samples through r3 instead of
; r2 and passes 0 as the second TRANSPOSE_STORE_8x8 argument.
;   r1 - stride (r5 = 3 * r1).
;   r3 - reference samples read as the main input ([r3] and [r3 + 1]).
;   r4 - ang_table + 16 * 16, so [r4 + k * 16] is table entry 16 + k; the
;        bracketed number ending each pmaddubsw line is that entry index.
; NOTE(review): the second TRANSPOSE_STORE_8x8 argument is presumably a
; transpose flag (0 = store without transposing) - confirm against the macro.
;-----------------------------------------------------------------------------
4763 cglobal intra_pred_ang16_21, 4,7,8
4765 lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16
4766 lea r5, [r1 * 3] ; r5 -> 3 * stride
; ---- first pass: byte pairs built from reference bytes 0..8 ----
4770 movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4771 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
4772 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
4774 pshufb m2, [c_mode16_15]
4776 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4778 pmaddubsw m4, [r4 - 16] ; [15]
4783 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4787 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
4792 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
4796 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
4802 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
4806 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
4812 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4816 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
4818 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
4824 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
4828 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
4834 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
4838 pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
4844 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
4848 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
4854 pmaddubsw m3, [r4] ; [16]
4858 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
; ---- second pass: byte pairs built from reference bytes 8..16 ----
4862 movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
4863 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
4864 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
4865 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L]
4867 pmaddubsw m4, m3, [r4 - 16] ; [15]
4872 pmaddubsw m5, m3, [r4 + 14 * 16] ; [30]
4876 pmaddubsw m5, m3, [r4 - 3 * 16] ; [13]
4882 pmaddubsw m6, m3, [r4 + 12 * 16] ; [28]
4886 pmaddubsw m6, m3, [r4 - 5 * 16] ; [11]
4892 pmaddubsw m0, m3, [r4 + 10 * 16] ; [26]
4896 pmaddubsw m1, m3, [r4 - 7 * 16] ; [09]
4902 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
4906 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
4908 pmaddubsw m4, m3, [r4 - 9 * 16] ; [07]
4914 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
4918 pmaddubsw m5, m3, [r4 - 11 * 16] ; [05]
4924 pmaddubsw m6, m3, [r4 + 4 * 16] ; [20]
4928 pmaddubsw m6, m3, [r4 - 13 * 16] ; [03]
4934 pmaddubsw m1, m3, [r4 + 2 * 16] ; [18]
4938 pmaddubsw m1, m3, [r4 - 15 * 16] ; [01]
4944 pmaddubsw m3, [r4] ; [16]
4948 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
;-----------------------------------------------------------------------------
; intra_pred_ang16_16: angular intra predictor (16x16 block, mode 16, going by
; the symbol name).
;   r0/r1 - presumably dst and dstStride, as in the intraPredAng32 prototype
;           comment later in this file (TODO confirm); r5 = 3 * r1 and
;           r6 = r0 + 4 * r1 are derived from them.
;   r2    - reference samples read as the main input ([r2] and [r2 + 1]).
;   r4    - ang_table + 16 * 16, so [r4 + k * 16] is table entry 16 + k; the
;           bracketed number ending each pmaddubsw line is that entry index.
;   m2    - shuffled through c_mode16_16, then shifted left one byte at a
;           time (pslldq) as the first pass proceeds; before the second pass
;           it is rotated by 6 bytes and its upper 8 bytes are replaced with
;           the low 8 bytes of [r2 + 1].
; Two passes are made: the first over reference bytes 0..8, the second over
; bytes 8..16 after r0 and r6 have each moved on by 8 * r1.
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this block; its second
; argument is 1 here and 0 in intra_pred_ang16_20 - presumably a transpose
; flag, confirm against the macro.
;-----------------------------------------------------------------------------
4953 cglobal intra_pred_ang16_16, 4,7,8
4955 lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16
4956 lea r5, [r1 * 3] ; r5 -> 3 * stride
4957 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
; ---- first pass: byte pairs built from reference bytes 0..8 ----
4960 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
4961 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
4962 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
4964 pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8]
4965 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
4967 pmaddubsw m4, [r4 - 5 * 16] ; [11]
4972 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
4976 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
4981 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
4985 pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x]
4988 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
4990 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
4994 pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
4997 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
5000 pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x]
5003 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
5007 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
5009 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
5012 pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x]
5015 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
5019 pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x]
5022 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
5024 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
5028 pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x]
5031 pmaddubsw m6, m3, [r4 - 16] ; [15]
5034 pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x]
5037 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5041 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
5044 pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x]
5047 pmaddubsw m3, [r4] ; [16]
5051 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
; ---- second pass: r0 and r6 each advance by 8 * r1, reference bytes 8..16 ----
5053 lea r0, [r6 + r1 * 4]
5054 lea r6, [r6 + r1 * 8]
5056 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
5057 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
5058 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
5059 palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
5060 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x]
5062 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
5067 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
5071 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
5077 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
5084 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
5087 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
5094 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
5100 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
5104 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
5106 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
5112 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
5119 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
5121 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
5128 pmaddubsw m6, m3, [r4 - 16] ; [15]
5134 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5138 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
5144 pmaddubsw m3, [r4] ; [16]
5148 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
;-----------------------------------------------------------------------------
; intra_pred_ang16_20: angular intra predictor (16x16 block, mode 20, going by
; the symbol name). It uses the same ang_table entries, in the same order, as
; intra_pred_ang16_16, but reads its reference samples through r3 instead of
; r2 and passes 0 as the second TRANSPOSE_STORE_8x8 argument.
;   r1 - stride (r5 = 3 * r1).
;   r3 - reference samples read as the main input ([r3] and [r3 + 1]).
;   r4 - ang_table + 16 * 16, so [r4 + k * 16] is table entry 16 + k; the
;        bracketed number ending each pmaddubsw line is that entry index.
;   m2 - shuffled through c_mode16_16, then shifted left one byte at a time
;        (pslldq) as the first pass proceeds; before the second pass it is
;        rotated by 6 bytes and its upper 8 bytes are replaced with the low
;        8 bytes of [r3 + 1].
; NOTE(review): the second TRANSPOSE_STORE_8x8 argument is presumably a
; transpose flag (0 = store without transposing) - confirm against the macro.
;-----------------------------------------------------------------------------
5153 cglobal intra_pred_ang16_20, 4,7,8
5155 lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16
5156 lea r5, [r1 * 3] ; r5 -> 3 * stride
; ---- first pass: byte pairs built from reference bytes 0..8 ----
5160 movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
5161 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
5162 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
5164 pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8]
5165 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
5167 pmaddubsw m4, [r4 - 5 * 16] ; [11]
5172 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
5176 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
5181 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
5185 pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x]
5188 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
5190 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
5194 pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
5197 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
5200 pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x]
5203 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
5207 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
5209 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
5212 pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x]
5215 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
5219 pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x]
5222 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
5224 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
5228 pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x]
5231 pmaddubsw m6, m3, [r4 - 16] ; [15]
5234 pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x]
5237 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5241 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
5244 pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x]
5247 pmaddubsw m3, [r4] ; [16]
5251 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
; ---- second pass: byte pairs built from reference bytes 8..16 ----
5255 movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
5256 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
5257 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
5258 palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x]
5259 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x]
5261 pmaddubsw m4, m3, [r4 - 5 * 16] ; [11]
5266 pmaddubsw m5, m3, [r4 + 6 * 16] ; [22]
5270 pmaddubsw m5, m3, [r4 - 15 * 16] ; [01]
5276 pmaddubsw m6, m3, [r4 - 4 * 16] ; [12]
5283 pmaddubsw m6, m3, [r4 + 7 * 16] ; [23]
5286 pmaddubsw m0, m3, [r4 - 14 * 16] ; [02]
5293 pmaddubsw m1, m3, [r4 - 3 * 16] ; [13]
5299 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24]
5303 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
5305 pmaddubsw m4, m3, [r4 - 13 * 16] ; [03]
5311 pmaddubsw m5, m3, [r4 - 2 * 16] ; [14]
5318 pmaddubsw m5, m3, [r4 + 9 * 16] ; [25]
5320 pmaddubsw m6, m3, [r4 - 12 * 16] ; [04]
5327 pmaddubsw m6, m3, [r4 - 16] ; [15]
5333 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5337 pmaddubsw m1, m3, [r4 - 11 * 16] ; [05]
5343 pmaddubsw m3, [r4] ; [16]
5347 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
;-----------------------------------------------------------------------------
; intra_pred_ang16_17: angular intra predictor (16x16 block, mode 17, going by
; the symbol name).
;   r0/r1 - presumably dst and dstStride, as in the intraPredAng32 prototype
;           comment later in this file (TODO confirm); r5 = 3 * r1 and
;           r6 = r0 + 4 * r1 are derived from them.
;   r2    - reference samples read as the main input ([r2] and [r2 + 1]).
;   r3    - second reference array; a single byte at [r3 + 5] is inserted
;           into lane 0 of m2 by pinsrb.
;   r4    - ang_table + 16 * 16, so [r4 + k * 16] is table entry 16 + k; the
;           bracketed number ending each pmaddubsw line is that entry index.
;   m2    - shuffled through c_mode16_17, then shifted left one byte at a
;           time (pslldq) as the first pass proceeds; before the second pass
;           it is rotated by 6 bytes and its upper 8 bytes are replaced with
;           the low 8 bytes of [r2 + 1].
; Two passes are made: the first over reference bytes 0..8, the second over
; bytes 8..16 after r0 and r6 have each moved on by 8 * r1.
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this block; its second
; argument is 1 here and 0 in intra_pred_ang16_19 - presumably a transpose
; flag, confirm against the macro.
;-----------------------------------------------------------------------------
5352 cglobal intra_pred_ang16_17, 4,7,8
5354 lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16
5355 lea r5, [r1 * 3] ; r5 -> 3 * stride
5356 lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
; ---- first pass: byte pairs built from reference bytes 0..8 ----
5359 movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
5360 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
5361 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
5363 pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4]
5364 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
5366 pmaddubsw m4, [r4 - 10 * 16] ; [06]
5371 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
5377 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
5380 pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x]
5381 pinsrb m2, [r3 + 5], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5]
5384 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
5388 pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x]
5391 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
5393 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
5397 pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x]
5400 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
5403 pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x]
5406 pmaddubsw m0, m3, [r4] ; [16]
5410 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
5412 pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x]
5415 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
5418 pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x]
5421 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
5425 pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
5428 pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x]
5431 pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
5435 pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x]
5438 pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
5441 pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x]
5444 pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
5448 pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x]
5451 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5453 pmaddubsw m3, [r4 - 16 * 16] ; [00]
5457 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
; ---- second pass: r0 and r6 each advance by 8 * r1, reference bytes 8..16 ----
5459 lea r0, [r6 + r1 * 4]
5460 lea r6, [r6 + r1 * 8]
5462 movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
5463 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
5464 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
5465 palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x]
5466 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x]
5468 pmaddubsw m4, m3, [r4 - 10 * 16] ; [06]
5473 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
5480 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
5486 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
5493 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
5495 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
5502 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
5508 pmaddubsw m0, m3, [r4] ; [16]
5512 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
5517 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
5523 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
5527 pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
5533 pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
5540 pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
5546 pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
5553 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5555 pmaddubsw m3, [r4 - 16 * 16] ; [00]
5559 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
;-----------------------------------------------------------------------------
; intra_pred_ang16_19: angular intra predictor (16x16 block, mode 19, going by
; the symbol name). It uses the same ang_table entries, in the same order, as
; intra_pred_ang16_17, with the roles of r2 and r3 swapped, and passes 0 as
; the second TRANSPOSE_STORE_8x8 argument.
;   r1 - stride (r5 = 3 * r1).
;   r3 - reference samples read as the main input ([r3] and [r3 + 1]).
;   r2 - second reference array; a single byte at [r2 + 5] is inserted into
;        lane 0 of m2 by pinsrb.
;   r4 - ang_table + 16 * 16, so [r4 + k * 16] is table entry 16 + k; the
;        bracketed number ending each pmaddubsw line is that entry index.
;   m2 - shuffled through c_mode16_17, then shifted left one byte at a time
;        (pslldq) as the first pass proceeds; before the second pass it is
;        rotated by 6 bytes and its upper 8 bytes are replaced with the low
;        8 bytes of [r3 + 1].
; NOTE(review): the second TRANSPOSE_STORE_8x8 argument is presumably a
; transpose flag (0 = store without transposing) - confirm against the macro.
;-----------------------------------------------------------------------------
5564 cglobal intra_pred_ang16_19, 4,7,8
5566 lea r4, [ang_table + 16 * 16] ; r4 -> ang_table entry 16
5567 lea r5, [r1 * 3] ; r5 -> 3 * stride
; ---- first pass: byte pairs built from reference bytes 0..8 ----
5571 movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
5572 punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8]
5573 punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
5575 pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4]
5576 palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]
5578 pmaddubsw m4, [r4 - 10 * 16] ; [06]
5583 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
5589 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
5592 pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x]
5593 pinsrb m2, [r2 + 5], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5]
5596 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
5600 pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x]
5603 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
5605 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
5609 pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x]
5612 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
5615 pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x]
5618 pmaddubsw m0, m3, [r4] ; [16]
5622 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
5624 pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x]
5627 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
5630 pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x]
5633 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
5637 pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
5640 pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x]
5643 pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
5647 pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x]
5650 pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
5653 pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x]
5656 pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
5660 pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x]
5663 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5665 pmaddubsw m3, [r4 - 16 * 16] ; [00]
5669 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
; ---- second pass: byte pairs built from reference bytes 8..16 ----
5673 movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
5674 pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x]
5675 punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8]
5676 palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x]
5677 movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x]
5679 pmaddubsw m4, m3, [r4 - 10 * 16] ; [06]
5684 pmaddubsw m5, m3, [r4 - 4 * 16] ; [12]
5691 pmaddubsw m5, m3, [r4 + 2 * 16] ; [18]
5697 pmaddubsw m6, m3, [r4 + 8 * 16] ; [24]
5704 pmaddubsw m6, m3, [r4 + 14 * 16] ; [30]
5706 pmaddubsw m0, m3, [r4 - 12 * 16] ; [04]
5713 pmaddubsw m1, m3, [r4 - 6 * 16] ; [10]
5719 pmaddubsw m0, m3, [r4] ; [16]
5723 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
5728 pmaddubsw m4, m3, [r4 + 6 * 16] ; [22]
5734 pmaddubsw m5, m3, [r4 + 12 * 16] ; [28]
5738 pmaddubsw m5, m3, [r4 - 14 * 16] ; [02]
5744 pmaddubsw m6, m3, [r4 - 8 * 16] ; [08]
5751 pmaddubsw m6, m3, [r4 - 2 * 16] ; [14]
5757 pmaddubsw m1, m3, [r4 + 4 * 16] ; [20]
5764 pmaddubsw m1, m3, [r4 + 10 * 16] ; [26]
5766 pmaddubsw m3, [r4 - 16 * 16] ; [00]
5770 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
;-----------------------------------------------------------------------------
; intra_pred_ang16_18: angular intra predictor (16x16 block, mode 18, going by
; the symbol name). No ang_table lookups or pmaddubsw appear in this routine;
; each palignr below takes a 16-byte window out of the 32-byte m0:m1
; concatenation (m0 in the upper half), with the byte offset stepping down by
; one from 15 to 2.
;-----------------------------------------------------------------------------
5775 cglobal intra_pred_ang16_18, 4,5,3
5779 mova m2, [c_mode16_18] ; aligned load of the mode-18 constant
5786 palignr m2, m0, m1, 15 ; window at byte offset 15 of m0:m1
5788 palignr m2, m0, m1, 14
5790 palignr m2, m0, m1, 13
5793 palignr m2, m0, m1, 12
5795 palignr m2, m0, m1, 11
5797 palignr m2, m0, m1, 10
5799 palignr m2, m0, m1, 9
5802 palignr m2, m0, m1, 8 ; offset 8: low half of m0 over high half of m1
5804 palignr m2, m0, m1, 7
5806 palignr m2, m0, m1, 6
5808 palignr m2, m0, m1, 5
5811 palignr m2, m0, m1, 4
5813 palignr m2, m0, m1, 3
5815 palignr m2, m0, m1, 2
5821 ;---------------------------------------------------------------------------------------------------------------
5822 ; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
5823 ;---------------------------------------------------------------------------------------------------------------
; intra_pred_ang32_2: angular intra predictor (32x32 block, mode 2, going by
; the symbol name); see the intraPredAng32 prototype comment directly above.
; Each palignr takes a 16-byte window at byte offset N (1..15) from a pair of
; adjacent source registers: m1:m0 and m3:m1 in the first run, m3:m1 and
; m0:m3 in the second. Windows are written with movu as the two 16-byte
; halves of an output row ([row] and [row + 16]).
;   r0 - output row pointer, advanced by 4 * r1 after each group of rows
;   r1 - row stride
;   r3 - offset of the last row in each group (presumably 3 * r1 - TODO
;        confirm where it is set up)
5825 cglobal intra_pred_ang32_2, 3,4,4
; ---- first run: windows over m1:m0 (left half) and m3:m1 (right half) ----
5836 palignr m2, m1, m0, 1
5838 palignr m2, m3, m1, 1
5839 movu [r0 + r1 + 16], m2
5840 palignr m2, m1, m0, 2
5841 movu [r0 + r1 * 2], m2
5842 palignr m2, m3, m1, 2
5843 movu [r0 + r1 * 2 + 16], m2
5844 palignr m2, m1, m0, 3
5846 palignr m2, m3, m1, 3
5847 movu [r0 + r3 + 16], m2
5849 lea r0, [r0 + r1 * 4] ; advance the output pointer by four rows
5851 palignr m2, m1, m0, 4
5853 palignr m2, m3, m1, 4
5855 palignr m2, m1, m0, 5
5857 palignr m2, m3, m1, 5
5858 movu [r0 + r1 + 16], m2
5859 palignr m2, m1, m0, 6
5860 movu [r0 + r1 * 2], m2
5861 palignr m2, m3, m1, 6
5862 movu [r0 + r1 * 2 + 16], m2
5863 palignr m2, m1, m0, 7
5865 palignr m2, m3, m1, 7
5866 movu [r0 + r3 + 16], m2
5868 lea r0, [r0 + r1 * 4]
5870 palignr m2, m1, m0, 8
5872 palignr m2, m3, m1, 8
5874 palignr m2, m1, m0, 9
5876 palignr m2, m3, m1, 9
5877 movu [r0 + r1 + 16], m2
5878 palignr m2, m1, m0, 10
5879 movu [r0 + r1 * 2], m2
5880 palignr m2, m3, m1, 10
5881 movu [r0 + r1 * 2 + 16], m2
5882 palignr m2, m1, m0, 11
5884 palignr m2, m3, m1, 11
5885 movu [r0 + r3 + 16], m2
5887 lea r0, [r0 + r1 * 4]
5889 palignr m2, m1, m0, 12
5891 palignr m2, m3, m1, 12
5893 palignr m2, m1, m0, 13
5895 palignr m2, m3, m1, 13
5896 movu [r0 + r1 + 16], m2
5897 palignr m2, m1, m0, 14
5898 movu [r0 + r1 * 2], m2
5899 palignr m2, m3, m1, 14
5900 movu [r0 + r1 * 2 + 16], m2
5901 palignr m2, m1, m0, 15
5903 palignr m2, m3, m1, 15
5904 movu [r0 + r3 + 16], m2
5906 lea r0, [r0 + r1 * 4]
; ---- second run: windows over m3:m1 (left half) and m0:m3 (right half) ----
5911 palignr m2, m3, m1, 1
5913 palignr m2, m0, m3, 1
5914 movu [r0 + r1 + 16], m2
5915 palignr m2, m3, m1, 2
5916 movu [r0 + r1 * 2], m2
5917 palignr m2, m0, m3, 2
5918 movu [r0 + r1 * 2 + 16], m2
5919 palignr m2, m3, m1, 3
5921 palignr m2, m0, m3, 3
5922 movu [r0 + r3 + 16], m2
5924 lea r0, [r0 + r1 * 4]
5926 palignr m2, m3, m1, 4
5928 palignr m2, m0, m3, 4
5930 palignr m2, m3, m1, 5
5932 palignr m2, m0, m3, 5
5933 movu [r0 + r1 + 16], m2
5934 palignr m2, m3, m1, 6
5935 movu [r0 + r1 * 2], m2
5936 palignr m2, m0, m3, 6
5937 movu [r0 + r1 * 2 + 16], m2
5938 palignr m2, m3, m1, 7
5940 palignr m2, m0, m3, 7
5941 movu [r0 + r3 + 16], m2
5943 lea r0, [r0 + r1 * 4]
5945 palignr m2, m3, m1, 8
5947 palignr m2, m0, m3, 8
5949 palignr m2, m3, m1, 9
5951 palignr m2, m0, m3, 9
5952 movu [r0 + r1 + 16], m2
5953 palignr m2, m3, m1, 10
5954 movu [r0 + r1 * 2], m2
5955 palignr m2, m0, m3, 10
5956 movu [r0 + r1 * 2 + 16], m2
5957 palignr m2, m3, m1, 11
5959 palignr m2, m0, m3, 11
5960 movu [r0 + r3 + 16], m2
5962 lea r0, [r0 + r1 * 4]
5964 palignr m2, m3, m1, 12
5966 palignr m2, m0, m3, 12
5968 palignr m2, m3, m1, 13
5970 palignr m2, m0, m3, 13
5971 movu [r0 + r1 + 16], m2
5972 palignr m2, m3, m1, 14
5973 movu [r0 + r1 * 2], m2
5974 palignr m2, m0, m3, 14
5975 movu [r0 + r1 * 2 + 16], m2
5976 palignr m2, m3, m1, 15
5978 palignr m2, m0, m3, 15
5979 movu [r0 + r3 + 16], m2
5982 ; Process Intra32x32, input 8x8 in [m0, m1, m2, m3, m4, m5, m6, m7], output 8x8
; Macro parameters, as used in the body below:
;   %1       column index; the first group of stores writes at byte offset
;            %1 * 8 within each output row
;   %2       transpose selector (0/1) per the parameter list on the %macro line
;   %3..%10  signed ang_table row offsets relative to r4, applied to m0..m7 in
;            that order through [r4 + %n * 16]
; Registers expected on entry: r4 = table base, r0/r6 = output row pointers,
; r1 = stride, r5 = offset of the fourth row (presumably 3 * r1 - TODO confirm).
5983 %macro PROC32_8x8 10 ; col4, transpose[0/1] c0, c1, c2, c3, c4, c5, c6, c7
; Weighted sums: one pmaddubsw per input register against its table row.
; NOTE(review): pmulhrsw by pw_1024 is a rounding >> 5 if pw_1024 holds words
; of 1024, as the name suggests - confirm against the constant's definition.
5987 pmaddubsw m0, [r4 + %3 * 16]
5988 pmulhrsw m0, [pw_1024]
5994 pmaddubsw m1, [r4 + %4 * 16]
5995 pmulhrsw m1, [pw_1024]
6007 pmaddubsw m2, [r4 + %5 * 16]
6014 pmaddubsw m3, [r4 + %6 * 16]
6026 pmaddubsw m4, [r4 + %7 * 16]
6033 pmaddubsw m5, [r4 + %8 * 16]
6045 pmaddubsw m6, [r4 + %9 * 16]
6052 pmaddubsw m7, [r4 + %10 * 16]
; Byte and dword interleaves that rearrange the results before storing.
6064 punpckhbw m1, m0, m2
6066 punpckhbw m3, m0, m1
6069 punpckhbw m1, m4, m6
6071 punpckhbw m6, m4, m1
6074 punpckhdq m2, m0, m4
6076 punpckldq m4, m3, m6
; Stores with a column offset: 8 bytes per row at column %1 * 8, through r0
; and r6 (movh writes the low 8 bytes, movhps the high 8 bytes).
6079 movh [r0 + + %1 * 8], m0
6080 movhps [r0 + r1 + %1 * 8], m0
6081 movh [r0 + r1*2 + %1 * 8], m2
6082 movhps [r0 + r5 + %1 * 8], m2
6083 movh [r6 + %1 * 8], m4
6084 movhps [r6 + r1 + %1 * 8], m4
6085 movh [r6 + r1*2 + %1 * 8], m3
6086 movhps [r6 + r5 + %1 * 8], m3
; Stores without a column offset: rows written through r0, which is advanced
; by four rows in between.
6089 movhps [r0 + r1 ], m0
6090 movh [r0 + r1 * 2], m2
6091 movhps [r0 + r5 ], m2
6092 lea r0, [r0 + r1 * 4]
6094 movhps [r0 + r1 ], m4
6095 movh [r0 + r1 * 2], m6
6096 movhps [r0 + r5 ], m6
; Macro body notes (the code uses %1, forwarded as the second argument of
; every TRANSPOSE_STORE_8x8 invocation):
;   r2 - reference samples, read from [r2 + 1] onwards
;   r3 - angle table base; [r3] is entry 16 and [r3 + k * 16] is entry 16 + k,
;        matching the bracketed number ending each pmaddubsw line
; m0/m2 hold interleaved neighbouring reference bytes; each palignr m?, m2, m0, N
; picks the window starting N bytes (N / 2 byte pairs) further along.
; Four TRANSPOSE_STORE_8x8 invocations follow, with first argument 0..3.
; NOTE(review): the two "[00]" lines load reference bytes directly with movhps
; instead of multiplying - presumably because table entry 0 needs no blending;
; confirm.
6101 movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
6102 palignr m1, m0, 1 ; [ x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
6103 punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
6104 punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
6105 palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
6106 pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
6108 pmaddubsw m1, [r3 + 4 * 16] ; [20]
6111 palignr m5, m2, m0, 4
6112 pmaddubsw m5, [r3 - 2 * 16] ; [14]
6114 palignr m6, m2, m0, 6
6115 pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
6118 palignr m1, m2, m0, 8
6119 pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
6121 pmaddubsw m1, [r3 + 12 * 16] ; [28]
6124 palignr m1, m2, m0, 10
6125 pmaddubsw m1, [r3 + 6 * 16] ; [22]
6128 pmaddubsw m2, [r3] ; [16]
6132 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
6136 punpckhbw m2, m0, m1
6138 palignr m5, m2, m0, 2
6139 pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
6141 pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
6144 pmaddubsw m5, [r3 + 14 * 16] ; [30]
6146 palignr m6, m2, m0, 4
6147 pmaddubsw m6, [r3 + 8 * 16] ; [24]
6150 palignr m1, m2, m0, 6
6151 pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
6153 palignr m1, m2, m0, 8
6154 pmaddubsw m1, [r3 - 4 * 16] ; [12]
6157 palignr m1, m2, m0, 10
6158 pmaddubsw m1, [r3 - 10 * 16] ; [06]
6161 movhps m1, [r2 + 14] ; [00]
6163 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
6167 punpckhbw m2, m0, m1
6169 palignr m1, m2, m0, 2
6170 pmaddubsw m4, m0, [r3 + 10 * 16] ; [26]
6172 pmaddubsw m1, [r3 + 4 * 16] ; [20]
6175 palignr m5, m2, m0, 4
6176 pmaddubsw m5, [r3 - 2 * 16] ; [14]
6178 palignr m6, m2, m0, 6
6179 pmaddubsw m6, [r3 - 8 * 16] ; [ 8]
6182 palignr m1, m2, m0, 8
6183 pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2]
6185 pmaddubsw m1, [r3 + 12 * 16] ; [28]
6188 palignr m1, m2, m0, 10
6189 pmaddubsw m1, [r3 + 6 * 16] ; [22]
6192 pmaddubsw m2, [r3] ; [16]
6196 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
6200 punpckhbw m2, m0, m1
6202 palignr m5, m2, m0, 2
6203 pmaddubsw m4, m0, [r3 - 6 * 16] ; [10]
6205 pmaddubsw m1, m5, [r3 - 12 * 16] ; [04]
6208 pmaddubsw m5, [r3 + 14 * 16] ; [30]
6210 palignr m6, m2, m0, 4
6211 pmaddubsw m6, [r3 + 8 * 16] ; [24]
6214 palignr m1, m2, m0, 6
6215 pmaddubsw m6, m1, [r3 + 2 * 16] ; [18]
6217 palignr m1, m2, m0, 8
6218 pmaddubsw m1, [r3 - 4 * 16] ; [12]
6221 palignr m1, m2, m0, 10
6222 pmaddubsw m1, [r3 - 10 * 16] ; [06]
6225 movhps m1, [r2 + 27] ; [00]
6227 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
6229 ;------------------------------------------------------------------------------------------------------------------
6230 ; void intraPredAng32_3(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
6231 ;------------------------------------------------------------------------------------------------------------------
; Entry stub of the angular intra predictor named for mode 3, block size 32.
; "3,7,8" follows the x86inc cglobal convention: 3 arguments loaded into r0-r2
; (dst, dstStride, refLeft in the prototype above), 7 general-purpose registers
; and 8 vector registers in use.
; NOTE(review): the embedded source numbering skips between the lines below, so the
; instructions between the register setup and the pointer advance are not visible here.
6233 cglobal intra_pred_ang32_3, 3,7,8
; r3 -> ang_table + 16 * 16 bytes, so [r3 + k * 16] is the 16 bytes at table offset (16 + k) * 16
6234 lea r3, [ang_table + 16 * 16]
6236 lea r5, [r1 * 3] ; r5 -> 3 * stride
6237 lea r6, [r0 + r1 * 4] ; r6 -> r0 + 4 * stride
; pointer advance: r0 = r6 + 4 * stride, then r6 += 8 * stride
6241 lea r0, [r6 + r1 * 4]
6242 lea r6, [r6 + r1 * 8]
; Body of a macro (these lines use the macro parameter %1); its %macro header is not
; visible in this listing and the embedded numbering shows that further lines are missing.
; How to read the visible instructions (x86inc multi-operand forms, destination first):
;   punpckhbw d, a, b      - d = high 8 bytes of a interleaved with the high 8 bytes of b
;   palignr   d, hi, lo, n - d = 16 bytes taken n bytes into the 32-byte value hi:lo
;   pmaddubsw d, s, [r3 + k * 16] - multiply the bytes of s by the 16 table bytes and add
;                            each adjacent pair of products into a 16-bit word
;   pmaddubsw d, [r3 + k * 16]    - same, with d as both source and destination
; The trailing [nn] on each pmaddubsw equals 16 + k, i.e. the table entry index when
; r3 = ang_table + 16 * 16 (as set by the intra_pred_ang32_* routines in this file).
; NOTE(review): TRANSPOSE_STORE_8x8 is defined elsewhere; its first argument steps 0..3
; over the four groups below and %1 is forwarded to it unchanged.
6251 punpckhbw m2, m0, m1
6253 palignr m1, m2, m0, 2
6255 pmaddubsw m4, m0, [r3 + 5 * 16] ; [21]
6257 pmaddubsw m1, [r3 - 6 * 16] ; [10]
6260 pmaddubsw m5, [r3 + 15 * 16] ; [31]
6262 palignr m6, m2, m0, 4
6263 pmaddubsw m6, [r3 + 4 * 16] ; [ 20]
6266 palignr m1, m2, m0, 6
6267 pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9]
6269 pmaddubsw m1, [r3 + 14 * 16] ; [30]
6272 palignr m1, m2, m0, 8
6273 pmaddubsw m1, [r3 + 3 * 16] ; [19]
6276 pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
6280 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
6282 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
6286 punpckhbw m2, m0, m1
6288 palignr m1, m2, m0, 2
6289 pmaddubsw m1, [r3 + 2 * 16] ; [18]
6292 palignr m5, m2, m0, 4
6294 pmaddubsw m5, [r3 - 9 * 16] ; [07]
6296 pmaddubsw m6, [r3 + 12 * 16] ; [28]
6299 palignr m6, m2, m0, 6
6300 pmaddubsw m6, [r3 + 16] ; [17]
6302 palignr m1, m2, m0, 8
6303 pmaddubsw m3, m1, [r3 - 10 * 16] ; [06]
6306 pmaddubsw m1, [r3 + 11 * 16] ; [27]
6309 pmaddubsw m2, [r3] ; [16]
6313 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
6317 punpckhbw m2, m0, m1
6320 pmaddubsw m4, m0, [r3 - 11 * 16] ; [5]
6322 pmaddubsw m1, [r3 + 10 * 16] ; [26]
6325 palignr m5, m2, m0, 2
6326 pmaddubsw m5, [r3 - 16] ; [15]
6328 palignr m6, m2, m0, 4
6330 pmaddubsw m1, [r3 - 12 * 16] ; [4]
6333 pmaddubsw m6, [r3 + 9 * 16] ; [25]
6335 palignr m1, m2, m0, 6
6336 pmaddubsw m1, [r3 - 2 * 16] ; [14]
6339 palignr m1, m2, m0, 8
6341 pmaddubsw m1, [r3 - 13 * 16] ; [3]
6343 pmaddubsw m2, [r3 + 8 * 16] ; [24]
6347 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
6351 punpckhbw m2, m0, m1
6353 pmaddubsw m4, m0, [r3 - 3 * 16] ; [13]
6355 palignr m5, m2, m0, 2
6356 pmaddubsw m1, m5, [r3 - 14 * 16] ; [2]
6359 pmaddubsw m5, [r3 + 7 * 16] ; [23]
6361 palignr m6, m2, m0, 4
6362 pmaddubsw m6, [r3 - 4 * 16] ; [12]
6365 palignr m6, m2, m0, 6
6367 pmaddubsw m6, [r3 - 15 * 16] ; [1]
6369 pmaddubsw m1, [r3 + 6 * 16] ; [22]
6372 palignr m1, m2, m0, 8
6373 pmaddubsw m1, [r3 - 5 * 16] ; [11]
; entry tagged [00]: 8 bytes are loaded straight from r2 + 22 into the high half of m1
6376 movhps m1, [r2 + 22] ; [00]
6378 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
6380 ;-----------------------------------------------------------------------------------------------------------------
6381 ; void intraPredAng32_4(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
6382 ;-----------------------------------------------------------------------------------------------------------------
; Entry stub of the angular intra predictor named for mode 4, block size 32.
; "3,7,8" follows the x86inc cglobal convention: 3 arguments loaded into r0-r2
; (dst, dstStride, refLeft in the prototype above), 7 general-purpose registers
; and 8 vector registers in use.
; NOTE(review): the embedded source numbering skips between the lines below, so the
; instructions between the register setup and the pointer advance are not visible here.
6384 cglobal intra_pred_ang32_4, 3,7,8
; r3 -> ang_table + 16 * 16 bytes, so [r3 + k * 16] is the 16 bytes at table offset (16 + k) * 16
6385 lea r3, [ang_table + 16 * 16]
6387 lea r5, [r1 * 3] ; r5 -> 3 * stride
6388 lea r6, [r0 + r1 * 4] ; r6 -> r0 + 4 * stride
; pointer advance: r0 = r6 + 4 * stride, then r6 += 8 * stride
6392 lea r0, [r6 + r1 * 4]
6393 lea r6, [r6 + r1 * 8]
; Body of a macro (these lines use the macro parameter %1); its %macro header is not
; visible in this listing and the embedded numbering shows that further lines are missing.
; How to read the visible instructions (x86inc multi-operand forms, destination first):
;   punpckhbw d, a, b      - d = high 8 bytes of a interleaved with the high 8 bytes of b
;   palignr   d, hi, lo, n - d = 16 bytes taken n bytes into the 32-byte value hi:lo
;   pmaddubsw d, s, [r3 + k * 16] - multiply the bytes of s by the 16 table bytes and add
;                            each adjacent pair of products into a 16-bit word
;   pmaddubsw d, [r3 + k * 16]    - same, with d as both source and destination
; The trailing [nn] on each pmaddubsw equals 16 + k, i.e. the table entry index when
; r3 = ang_table + 16 * 16 (as set by the intra_pred_ang32_* routines in this file).
; NOTE(review): TRANSPOSE_STORE_8x8 is defined elsewhere; its first argument steps 0..3
; over the four groups below and %1 is forwarded to it unchanged.
6402 punpckhbw m2, m0, m1
6404 palignr m1, m2, m0, 2
6406 pmaddubsw m4, m0, [r3 + 16] ; [17]
6408 pmaddubsw m1, [r3 - 14 * 16] ; [2]
6411 pmaddubsw m5, [r3 + 3 * 16] ; [19]
6413 palignr m6, m2, m0, 4
6415 pmaddubsw m6, [r3 - 12 * 16] ; [4]
6418 pmaddubsw m6, m1, [r3 + 5 * 16] ; [21]
6420 palignr m1, m2, m0, 6
6422 pmaddubsw m3, [r3 - 10 * 16] ; [6]
6425 pmaddubsw m1, [r3 + 7 * 16] ; [23]
6428 pmaddubsw m2, [r3 - 8 * 16] ; [8]
6432 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
6436 punpckhbw m2, m0, m1
6438 palignr m1, m2, m0, 2
6440 pmaddubsw m4, m0, [r3 + 9 * 16] ; [25]
6442 pmaddubsw m1, [r3 - 6 * 16] ; [10]
6445 pmaddubsw m5, [r3 + 11 * 16] ; [27]
6447 palignr m6, m2, m0, 4
6449 pmaddubsw m6, [r3 - 4 * 16] ; [12]
6452 pmaddubsw m6, m1, [r3 + 13 * 16] ; [29]
6454 palignr m1, m2, m0, 6
6456 pmaddubsw m3, [r3 - 2 * 16] ; [14]
6459 pmaddubsw m1, [r3 + 15 * 16] ; [31]
6462 pmaddubsw m2, [r3] ; [16]
6466 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
6470 punpckhbw m2, m0, m1
6473 pmaddubsw m4, m0, [r3 - 15 * 16] ; [1]
6475 pmaddubsw m1, [r3 + 2 * 16] ; [18]
6478 palignr m5, m2, m0, 2
6480 pmaddubsw m5, [r3 - 13 * 16] ; [3]
6482 pmaddubsw m1, [r3 + 4 * 16] ; [20]
6485 palignr m1, m2, m0, 4
6486 pmaddubsw m6, m1, [r3 - 11 * 16] ; [5]
6488 pmaddubsw m1, [r3 + 6 * 16] ; [22]
6492 pmaddubsw m1, m2, [r3 - 9 * 16] ; [7]
6494 pmaddubsw m2, [r3 + 8 * 16] ; [24]
6498 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
6502 punpckhbw m2, m0, m1
6505 pmaddubsw m4, m0, [r3 - 7 * 16] ; [9]
6507 pmaddubsw m1, [r3 + 10 * 16] ; [26]
6510 palignr m5, m2, m0, 2
6512 pmaddubsw m5, [r3 - 5 * 16] ; [11]
6514 pmaddubsw m1, [r3 + 12 * 16] ; [28]
6517 palignr m1, m2, m0, 4
6518 pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
6520 pmaddubsw m1, [r3 + 14 * 16] ; [30]
6524 pmaddubsw m1, m2, [r3 - 16] ; [15]
; entry tagged [00]: 8 bytes are loaded straight from r2 + 18 into the high half of m1
6527 movhps m1, [r2 + 18] ; [00]
6529 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
6531 ;------------------------------------------------------------------------------------------------------------------
6532 ; void intraPredAng32_5(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
6533 ;------------------------------------------------------------------------------------------------------------------
; Entry stub of the angular intra predictor named for mode 5, block size 32.
; "3,7,8" follows the x86inc cglobal convention: 3 arguments loaded into r0-r2
; (dst, dstStride, refLeft in the prototype above), 7 general-purpose registers
; and 8 vector registers in use.
; NOTE(review): the embedded source numbering skips between the lines below, so the
; instructions between the register setup and the pointer advance are not visible here.
6535 cglobal intra_pred_ang32_5, 3,7,8
; r3 -> ang_table + 16 * 16 bytes, so [r3 + k * 16] is the 16 bytes at table offset (16 + k) * 16
6536 lea r3, [ang_table + 16 * 16]
6538 lea r5, [r1 * 3] ; r5 -> 3 * stride
6539 lea r6, [r0 + r1 * 4] ; r6 -> r0 + 4 * stride
; pointer advance: r0 = r6 + 4 * stride, then r6 += 8 * stride
6543 lea r0, [r6 + r1 * 4]
6544 lea r6, [r6 + r1 * 8]
; Body of a macro (these lines use the macro parameter %1); its %macro header is not
; visible in this listing and the embedded numbering shows that further lines are missing.
; How to read the visible instructions (x86inc multi-operand forms, destination first):
;   punpckhbw d, a, b      - d = high 8 bytes of a interleaved with the high 8 bytes of b
;   palignr   d, hi, lo, n - d = 16 bytes taken n bytes into the 32-byte value hi:lo
;   pmaddubsw d, s, [r3 + k * 16] - multiply the bytes of s by the 16 table bytes and add
;                            each adjacent pair of products into a 16-bit word
;   pmaddubsw d, [r3 + k * 16]    - same, with d as both source and destination
; The trailing [nn] on each pmaddubsw equals 16 + k, i.e. the table entry index when
; r3 = ang_table + 16 * 16 (as set by the intra_pred_ang32_* routines in this file).
; NOTE(review): TRANSPOSE_STORE_8x8 is defined elsewhere; its first argument steps 0..3
; over the four groups below and %1 is forwarded to it unchanged.
6553 punpckhbw m2, m0, m1
6556 pmaddubsw m4, m0, [r3 - 3 * 16] ; [13]
6558 pmaddubsw m1, [r3 + 10 * 16] ; [26]
6561 palignr m6, m2, m0, 2
6562 pmaddubsw m5, m6, [r3 - 9 * 16] ; [7]
6564 pmaddubsw m6, [r3 + 4 * 16] ; [20]
6567 palignr m1, m2, m0, 4
6568 pmaddubsw m6, m1, [r3 - 15 * 16] ; [1]
6570 pmaddubsw m3, m1, [r3 - 2 * 16] ; [14]
6573 pmaddubsw m1, [r3 + 11 * 16] ; [27]
6576 pmaddubsw m3, m2, [r3 - 8 * 16] ; [8]
6580 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
6582 pmaddubsw m4, m2, [r3 + 5 * 16] ; [21]
6586 punpckhbw m2, m0, m1
6589 pmaddubsw m1, m6, [r3 - 14 * 16] ; [2]
6592 pmaddubsw m5, m6, [r3 - 16] ; [15]
6594 pmaddubsw m6, [r3 + 12 * 16] ; [28]
6597 palignr m3, m2, m0, 2
6598 pmaddubsw m6, m3, [r3 - 7 * 16] ; [9]
6600 pmaddubsw m3, [r3 + 6 * 16] ; [22]
6604 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
6606 pmaddubsw m3, m2, [r3] ; [16]
6610 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
6612 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
6616 punpckhbw m2, m0, m1
6618 palignr m5, m2, m0, 2
6619 pmaddubsw m1, m5, [r3 - 6 * 16] ; [10]
6622 pmaddubsw m5, [r3 + 7 * 16] ; [23]
6624 palignr m1, m2, m0, 4
6625 pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
6628 pmaddubsw m6, m1, [r3 + 16] ; [17]
6630 pmaddubsw m1, [r3 + 14 * 16] ; [30]
6633 palignr m2, m2, m0, 6
6634 pmaddubsw m1, m2, [r3 - 5 * 16] ; [11]
6636 pmaddubsw m2, m2, [r3 + 8 * 16] ; [24]
6640 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
6644 punpckhbw m2, m0, m1
6647 pmaddubsw m4, m0, [r3 - 11 * 16] ; [5]
6649 pmaddubsw m3, m5, [r3 + 2 * 16] ; [18]
6652 pmaddubsw m5, [r3 + 15 * 16] ; [31]
6654 palignr m6, m2, m0, 2
6655 pmaddubsw m1, m6, [r3 - 4 * 16] ; [12]
6658 pmaddubsw m6, [r3 + 9 * 16] ; [25]
6660 palignr m1, m2, m0, 4
6661 pmaddubsw m2, m1, [r3 - 10 * 16] ; [6]
6664 pmaddubsw m1, [r3 + 3 * 16] ; [19]
; entry tagged [00]: 8 bytes are loaded straight from r2 + 14 into the high half of m1
6667 movhps m1, [r2 + 14] ; [00]
6669 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
6671 ;------------------------------------------------------------------------------------------------------------------
6672 ; void intraPredAng32_6(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
6673 ;------------------------------------------------------------------------------------------------------------------
; Entry stub of the angular intra predictor named for mode 6, block size 32.
; "3,7,8" follows the x86inc cglobal convention: 3 arguments loaded into r0-r2
; (dst, dstStride, refLeft in the prototype above), 7 general-purpose registers
; and 8 vector registers in use.
; NOTE(review): the embedded source numbering skips between the lines below, so the
; instructions between the register setup and the pointer advance are not visible here.
6675 cglobal intra_pred_ang32_6, 3,7,8
; r3 -> ang_table + 16 * 16 bytes, so [r3 + k * 16] is the 16 bytes at table offset (16 + k) * 16
6676 lea r3, [ang_table + 16 * 16]
6678 lea r5, [r1 * 3] ; r5 -> 3 * stride
6679 lea r6, [r0 + r1 * 4] ; r6 -> r0 + 4 * stride
; pointer advance: r0 = r6 + 4 * stride, then r6 += 8 * stride
6683 lea r0, [r6 + r1 * 4]
6684 lea r6, [r6 + r1 * 8]
; Body of a macro (these lines use the macro parameter %1); its %macro header is not
; visible in this listing and the embedded numbering shows that further lines are missing.
; How to read the visible instructions (x86inc multi-operand forms, destination first):
;   punpckhbw d, a, b      - d = high 8 bytes of a interleaved with the high 8 bytes of b
;   palignr   d, hi, lo, n - d = 16 bytes taken n bytes into the 32-byte value hi:lo
;   pmaddubsw d, s, [r3 + k * 16] - multiply the bytes of s by the 16 table bytes and add
;                            each adjacent pair of products into a 16-bit word
;   pmaddubsw d, [r3 + k * 16]    - same, with d as both source and destination
; The trailing [nn] on each pmaddubsw equals 16 + k, i.e. the table entry index when
; r3 = ang_table + 16 * 16 (as set by the intra_pred_ang32_* routines in this file).
; NOTE(review): TRANSPOSE_STORE_8x8 is defined elsewhere; its first argument steps 0..3
; over the four groups below and %1 is forwarded to it unchanged.
6693 punpckhbw m2, m0, m1
6696 pmaddubsw m4, m0, [r3 - 7 * 16] ; [9]
6698 pmaddubsw m3, m5, [r3 + 2 * 16] ; [18]
6701 pmaddubsw m5, [r3 + 11 * 16] ; [27]
6703 palignr m1, m2, m0, 2
6705 pmaddubsw m6, m1, [r3 - 12 * 16] ; [4]
6708 pmaddubsw m6, m1, [r3 - 3 * 16] ; [13]
6710 pmaddubsw m0, m1, [r3 + 6 * 16] ; [22]
6713 pmaddubsw m1, [r3 + 15 * 16] ; [31]
6715 pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
6719 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
6721 pmaddubsw m4, m2, [r3 + 16] ; [17]
6723 pmaddubsw m2, [r3 + 10 * 16] ; [26]
6728 punpckhbw m2, m0, m1
6731 pmaddubsw m5, m0, [r3 - 13 * 16] ; [03]
6733 pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
6736 pmaddubsw m6, m0, [r3 + 5 * 16] ; [21]
6738 pmaddubsw m0, [r3 + 14 * 16] ; [30]
6741 pmaddubsw m1, m2, [r3 - 9 * 16] ; [07]
6743 pmaddubsw m3, m2, [r3] ; [16]
6747 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
6749 pmaddubsw m4, m2, [r3 + 9 * 16] ; [25]
6753 punpckhbw m2, m0, m1
6756 pmaddubsw m1, m0, [r3 - 14 * 16] ; [2]
6759 pmaddubsw m5, m0, [r3 - 5 * 16] ; [11]
6761 pmaddubsw m6, m0, [r3 + 4 * 16] ; [20]
6764 pmaddubsw m6, m0, [r3 + 13 * 16] ; [29]
6766 pmaddubsw m1, m2, [r3 - 10 * 16] ; [6]
6769 pmaddubsw m1, m2, [r3 - 16] ; [15]
6771 pmaddubsw m2, m2, [r3 + 8 * 16] ; [24]
6775 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
6779 punpckhbw m2, m0, m1
6781 pmaddubsw m4, m0, [r3 - 15 * 16] ; [1]
6783 pmaddubsw m3, m0, [r3 - 6 * 16] ; [10]
6786 pmaddubsw m5, m0, [r3 + 3 * 16] ; [19]
6788 pmaddubsw m6, m0, [r3 + 12 * 16] ; [28]
6792 pmaddubsw m6, m2, [r3 - 11 * 16] ; [5]
6794 pmaddubsw m0, m2, [r3 - 2 * 16] ; [14]
6797 pmaddubsw m1, m2, [r3 + 7 * 16] ; [23]
; entry tagged [0]: 8 bytes are loaded straight from r2 + 10 into the high half of m1
6800 movhps m1, [r2 + 10] ; [0]
6802 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
6804 ;------------------------------------------------------------------------------------------------------------------
6805 ; void intraPredAng32_7(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
6806 ;------------------------------------------------------------------------------------------------------------------
; Entry stub of the angular intra predictor named for mode 7, block size 32.
; "3,7,8" follows the x86inc cglobal convention: 3 arguments loaded into r0-r2
; (dst, dstStride, refLeft in the prototype above), 7 general-purpose registers
; and 8 vector registers in use.
; NOTE(review): the embedded source numbering skips between the lines below, so the
; instructions between the register setup and the pointer advance are not visible here.
6808 cglobal intra_pred_ang32_7, 3,7,8
; r3 -> ang_table + 16 * 16 bytes, so [r3 + k * 16] is the 16 bytes at table offset (16 + k) * 16
6809 lea r3, [ang_table + 16 * 16]
6811 lea r5, [r1 * 3] ; r5 -> 3 * stride
6812 lea r6, [r0 + r1 * 4] ; r6 -> r0 + 4 * stride
; pointer advance: r0 = r6 + 4 * stride, then r6 += 8 * stride
6816 lea r0, [r6 + r1 * 4]
6817 lea r6, [r6 + r1 * 8]
; Body of a macro (these lines use the macro parameter %1); its %macro header is not
; visible in this listing and the embedded numbering shows that further lines are missing.
; How to read the visible instructions (x86inc multi-operand forms, destination first):
;   punpckhbw d, a, b      - d = high 8 bytes of a interleaved with the high 8 bytes of b
;   pmaddubsw d, s, [r3 + k * 16] - multiply the bytes of s by the 16 table bytes and add
;                            each adjacent pair of products into a 16-bit word
;   pmaddubsw d, [r3 + k * 16]    - same, with d as both source and destination
; The trailing [nn] on each pmaddubsw equals 16 + k, i.e. the table entry index when
; r3 = ang_table + 16 * 16 (as set by the intra_pred_ang32_* routines in this file).
; NOTE(review): TRANSPOSE_STORE_8x8 is defined elsewhere; its first argument steps 0..3
; over the four groups below and %1 is forwarded to it unchanged.
6826 punpckhbw m2, m0, m1
6829 pmaddubsw m4, m0, [r3 - 11 * 16] ; [5]
6831 pmaddubsw m3, m0, [r3 - 6 * 16] ; [10]
6834 pmaddubsw m5, m0, [r3 - 1 * 16] ; [15]
6836 pmaddubsw m6, m0, [r3 + 4 * 16] ; [20]
6839 pmaddubsw m6, m0, [r3 + 9 * 16] ; [25]
6841 pmaddubsw m0, [r3 + 14 * 16] ; [30]
6844 pmaddubsw m1, m2, [r3 - 13 * 16] ; [3]
6846 pmaddubsw m0, m2, [r3 - 8 * 16] ; [8]
6850 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
6852 pmaddubsw m4, m2, [r3 - 3 * 16] ; [13]
6854 pmaddubsw m5, m2, [r3 + 2 * 16] ; [18]
6857 pmaddubsw m5, m2, [r3 + 7 * 16] ; [23]
6859 pmaddubsw m2, [r3 + 12 * 16] ; [28]
6864 punpckhbw m2, m0, m1
6866 pmaddubsw m6, m0, [r3 - 15 * 16] ; [01]
6868 pmaddubsw m1, m0, [r3 - 10 * 16] ; [06]
6871 pmaddubsw m1, m0, [r3 - 5 * 16] ; [11]
6874 pmaddubsw m0, [r3] ; [16]
6878 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
6880 pmaddubsw m4, m2, [r3 + 5 * 16] ; [21]
6882 pmaddubsw m5, m2, [r3 + 10 * 16] ; [26]
6885 pmaddubsw m5, m2, [r3 + 15 * 16] ; [31]
6889 punpckhbw m2, m0, m1
6891 pmaddubsw m2, m0, [r3 - 12 * 16] ; [4]
6894 pmaddubsw m6, m0, [r3 - 7 * 16] ; [9]
6896 pmaddubsw m1, m0, [r3 - 2 * 16] ; [14]
6899 pmaddubsw m1, m0, [r3 + 3 * 16] ; [19]
6902 pmaddubsw m0, [r3 + 8 * 16] ; [24]
6906 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
6908 pmaddubsw m4, m2, [r3 + 13 * 16] ; [29]
6912 punpckhbw m2, m0, m1
6914 pmaddubsw m1, m0, [r3 - 14 * 16] ; [2]
6917 pmaddubsw m5, m0, [r3 - 9 * 16] ; [7]
6919 pmaddubsw m6, m0, [r3 - 4 * 16] ; [12]
6922 pmaddubsw m6, m0, [r3 + 16] ; [17]
6924 pmaddubsw m1, m0, [r3 + 6 * 16] ; [22]
6927 pmaddubsw m1, m0, [r3 + 11 * 16] ; [27]
; entry tagged [00]: 8 bytes are loaded straight from r2 + 6 into the high half of m1
6930 movhps m1, [r2 + 6] ; [00]
6932 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
6934 ;------------------------------------------------------------------------------------------------------------------
6935 ; void intraPredAng32_8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
6936 ;------------------------------------------------------------------------------------------------------------------
; Entry stub of the angular intra predictor named for mode 8, block size 32.
; "3,7,8" follows the x86inc cglobal convention: 3 arguments loaded into r0-r2
; (dst, dstStride, refLeft in the prototype above), 7 general-purpose registers
; and 8 vector registers in use.
; NOTE(review): the embedded source numbering skips between the lines below, so the
; instructions between the register setup and the pointer advance are not visible here.
6938 cglobal intra_pred_ang32_8, 3,7,8
; r3 -> ang_table + 16 * 16 bytes, so [r3 + k * 16] is the 16 bytes at table offset (16 + k) * 16
6939 lea r3, [ang_table + 16 * 16]
6941 lea r5, [r1 * 3] ; r5 -> 3 * stride
6942 lea r6, [r0 + r1 * 4] ; r6 -> r0 + 4 * stride
; pointer advance: r0 = r6 + 4 * stride, then r6 += 8 * stride
6946 lea r0, [r6 + r1 * 4]
6947 lea r6, [r6 + r1 * 8]
; Body of a macro (these lines use the macro parameter %1); its %macro header is not
; visible in this listing and the embedded numbering shows that further lines are missing.
; How to read the visible instructions (x86inc multi-operand forms, destination first):
;   punpckhbw d, a, b      - d = high 8 bytes of a interleaved with the high 8 bytes of b
;   pmaddubsw d, s, [r3 + k * 16] - multiply the bytes of s by the 16 table bytes and add
;                            each adjacent pair of products into a 16-bit word
; The trailing [nn] on each pmaddubsw equals 16 + k, i.e. the table entry index when
; r3 = ang_table + 16 * 16 (as set by the intra_pred_ang32_* routines in this file).
; The table entries step by 2 ([2] .. [30]) and the same sequence is used twice:
; groups 0/1 and groups 2/3 reference identical offsets.
; NOTE(review): TRANSPOSE_STORE_8x8 is defined elsewhere; its first argument steps 0..3
; over the four groups below and %1 is forwarded to it unchanged.
6956 punpckhbw m0, m2, m1
6958 pmaddubsw m4, m2, [r3 - 14 * 16] ; [2]
6960 pmaddubsw m3, m2, [r3 - 12 * 16] ; [4]
6963 pmaddubsw m5, m2, [r3 - 10 * 16] ; [6]
6965 pmaddubsw m6, m2, [r3 - 8 * 16] ; [8]
6968 pmaddubsw m6, m2, [r3 - 6 * 16] ; [10]
6970 pmaddubsw m3, m2, [r3 - 4 * 16] ; [12]
6973 pmaddubsw m1, m2, [r3 - 2 * 16] ; [14]
6975 pmaddubsw m0, m2, [r3] ; [16]
6979 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
6981 pmaddubsw m4, m2, [r3 + 2 * 16] ; [18]
6983 pmaddubsw m5, m2, [r3 + 4 * 16] ; [20]
6986 pmaddubsw m5, m2, [r3 + 6 * 16] ; [22]
6988 pmaddubsw m6, m2, [r3 + 8 * 16] ; [24]
6991 pmaddubsw m6, m2, [r3 + 10 * 16] ; [26]
6993 pmaddubsw m1, m2, [r3 + 12 * 16] ; [28]
6996 pmaddubsw m1, m2, [r3 + 14 * 16] ; [30]
; entry tagged [00]: 8 bytes are loaded straight from r2 + 2 into the high half of m1
6999 movhps m1, [r2 + 2] ; [00]
7001 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
7006 pmaddubsw m4, m2, [r3 - 14 * 16] ; [2]
7008 pmaddubsw m3, m2, [r3 - 12 * 16] ; [4]
7011 pmaddubsw m5, m2, [r3 - 10 * 16] ; [6]
7013 pmaddubsw m6, m2, [r3 - 8 * 16] ; [8]
7016 pmaddubsw m6, m2, [r3 - 6 * 16] ; [10]
7018 pmaddubsw m0, m2, [r3 - 4 * 16] ; [12]
7021 pmaddubsw m1, m2, [r3 - 2 * 16] ; [14]
7023 pmaddubsw m0, m2, [r3] ; [16]
7027 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
7032 pmaddubsw m4, m2, [r3 + 2 * 16] ; [18]
7034 pmaddubsw m5, m2, [r3 + 4 * 16] ; [20]
7037 pmaddubsw m5, m2, [r3 + 6 * 16] ; [22]
7039 pmaddubsw m6, m2, [r3 + 8 * 16] ; [24]
7042 pmaddubsw m6, m2, [r3 + 10 * 16] ; [26]
7044 pmaddubsw m1, m2, [r3 + 12 * 16] ; [28]
7047 pmaddubsw m1, m2, [r3 + 14 * 16] ; [30]
; entry tagged [00]: 8 bytes are loaded straight from r2 + 3 into the high half of m1
7050 movhps m1, [r2 + 3] ; [00]
7052 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
7054 ;------------------------------------------------------------------------------------------------------------------
7055 ; void intraPredAng32_9(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7056 ;------------------------------------------------------------------------------------------------------------------
; Entry stub of the angular intra predictor named for mode 9, block size 32.
; "3,7,8" follows the x86inc cglobal convention: 3 arguments loaded into r0-r2
; (dst, dstStride, refLeft in the prototype above), 7 general-purpose registers
; and 8 vector registers in use.
; NOTE(review): the embedded source numbering skips between the lines below, so the
; instructions between the register setup and the pointer advance are not visible here.
7058 cglobal intra_pred_ang32_9, 3,7,8
; r3 -> ang_table + 16 * 16 bytes, so [r3 + k * 16] is the 16 bytes at table offset (16 + k) * 16
7059 lea r3, [ang_table + 16 * 16]
7061 lea r5, [r1 * 3] ; r5 -> 3 * stride
7062 lea r6, [r0 + r1 * 4] ; r6 -> r0 + 4 * stride
; pointer advance: r0 = r6 + 4 * stride, then r6 += 8 * stride
7066 lea r0, [r6 + r1 * 4]
7067 lea r6, [r6 + r1 * 8]
7073 ;------------------------------------------------------------------------------------------------------------------
7074 ; void intraPredAng32_10(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7075 ;------------------------------------------------------------------------------------------------------------------
; Entry of the predictor named for mode 10, block size 32.
; "6,7,8,0-(2*mmsize)" follows the x86inc cglobal convention: 6 arguments loaded,
; 7 general-purpose registers, 8 vector registers, and a stack request of 2 * mmsize
; bytes written as a negative value (NOTE(review): in x86inc a negative stack size
; changes how the original stack pointer is kept - confirm against x86inc.asm).
; The visible part is the store pattern: for each output row the same vector register
; is written at byte offset 0 and at byte offset 16, so both 16-byte halves of the
; row receive identical data.
; NOTE(review): the embedded source numbering skips here; the loads that fill m1-m6,
; the stores to the first row of every group of four and the setup of r4 (used as a
; row offset, presumably 3 * stride) are not visible in this listing.
7077 cglobal intra_pred_ang32_10, 6,7,8,0-(2*mmsize)
; m8 / m9 are not registers here: they alias the two mmsize-byte stack slots
7078 %define m8 [rsp + 0 * mmsize]
7079 %define m9 [rsp + 1 * mmsize]
7105 movu [r0 + r1 + 16], m1
7106 movu [r0 + r1 * 2], m2
7107 movu [r0 + r1 * 2 + 16], m2
7109 movu [r0 + r4 + 16], m3
; r5 -> r0 + 4 * stride (start of the next four rows)
7110 lea r5, [r0 + r1 * 4]
7114 movu [r5 + r1 + 16], m5
7115 movu [r5 + r1 * 2], m6
7116 movu [r5 + r1 * 2 + 16], m6
7132 movu [r5 + r4 + 16], m1
; r5 += 4 * stride
7133 lea r5, [r5 + r1 * 4]
7137 movu [r5 + r1 + 16], m3
7138 movu [r5 + r1 * 2], m4
7139 movu [r5 + r1 * 2 + 16], m4
7141 movu [r5 + r4 + 16], m5
; r5 += 4 * stride
7142 lea r5, [r5 + r1 * 4]
7155 movu [r5 + r1 + 16], m1
7156 movu [r5 + r1 * 2], m2
7157 movu [r5 + r1 * 2 + 16], m2
7159 movu [r5 + r4 + 16], m3
; r0 = r5 + 4 * stride
7187 lea r0, [r5 + r1 * 4]
7192 ;-------------------------------------------------------------------------------------------------------------------
7193 ; void intraPredAng32_11(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7194 ;-------------------------------------------------------------------------------------------------------------------
; Entry of the predictor named for mode 11, block size 32.
; "4,7,8" follows the x86inc cglobal convention: 4 arguments loaded into r0-r3,
; 7 general-purpose registers and 8 vector registers in use.
; The visible part copies reference bytes into a scratch area addressed from rsp,
; points r2 at that copy, loads the constants used by PROC32_8x8 and invokes
; PROC32_8x8 four times.
; NOTE(review): the embedded source numbering skips here; the stack alignment code,
; the loads feeding m0-m2 and any loop labels are not visible in this listing.
7196 cglobal intra_pred_ang32_11, 4,7,8
7197 ; NOTE: align the stack to 64 bytes so that all local data lies in the same cache line
7204 ; collect reference pixels
7207 pshufb m0, m1 ; [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
7213 movu [rsp + 1 + 16], m1
7214 movu [rsp + 1 + 32], m2
; byte value 4 stored in the last byte of the 64-byte scratch area
; (presumably an iteration counter; its reader is not visible here)
7215 mov [rsp + 63], byte 4
7218 lea r2, [rsp + 1] ; r2 -> [0]
7219 lea r3, [c_shuf8_0] ; r3 -> shuffle8
7220 lea r4, [ang_table] ; r4 -> ang_table
7221 lea r5, [r1 * 3] ; r5 -> 3 * stride
7222 lea r6, [r0 + r1 * 4] ; r6 -> r0 + 4 * stride
7223 mova m5, [pw_1024] ; m5 -> 1024
7224 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
; PROC32_8x8 is defined elsewhere; its first argument steps 0..3 and the eight trailing
; numbers (presumably ang_table entry indices) are identical for groups 0/2 and 1/3
7236 PROC32_8x8 0, 1, 30,28,26,24,22,20,18,16
7247 PROC32_8x8 1, 1, 14,12,10,8,6,4,2,0
7258 PROC32_8x8 2, 1, 30,28,26,24,22,20,18,16
7269 PROC32_8x8 3, 1, 14,12,10,8,6,4,2,0
; pointer advance: r0 = r6 + 4 * stride, then r6 += 8 * stride
7271 lea r0, [r6 + r1 * 4]
7272 lea r6, [r6 + r1 * 8]
; MODE_12_24_ROW0: macro taking one parameter, which is forwarded unchanged to
; TRANSPOSE_STORE_8x8 (defined elsewhere) for each of the four groups 0..3.
; Register / symbol expectations visible in the body:
;   r2    - base of the bytes loaded with movu
;   r3    - base of the byte inserted with pinsrb
;   r4    - table base; the trailing [nn] on each pmaddubsw equals 16 + k for
;           [r4 + k * 16], i.e. the table entry index when r4 = ang_table + 16 * 16
;   above - memory operand that must be %defined by the user of the macro
;           (intra_pred_ang32_12 in this file defines it as [rsp + 0 * mmsize])
; pmaddubsw d, s, [mem] multiplies the bytes of s by the 16 table bytes and adds each
; adjacent pair of products into a 16-bit word.
; NOTE(review): the embedded source numbering skips inside this body and the closing
; %endmacro is not visible, so some instructions are missing from this listing.
7279 %macro MODE_12_24_ROW0 1
; c_mode32_12_0 selects source byte 13 into lane 13, byte 7 into lane 14 and byte 0
; into every other lane
7281 pshufb m0, [c_mode32_12_0]
; lane 12 of m0 = byte at r3 + 26
7282 pinsrb m0, [r3 + 26], 12
7287 pmaddubsw m4, m2, [r4 + 11 * 16] ; [27]
7289 pmaddubsw m3, m2, [r4 + 6 * 16] ; [22]
7292 pmaddubsw m5, m2, [r4 + 16] ; [17]
7294 pmaddubsw m6, m2, [r4 - 4 * 16] ; [12]
7297 pmaddubsw m6, m2, [r4 - 9 * 16] ; [7]
7299 pmaddubsw m3, m2, [r4 - 14 * 16] ; [2]
7302 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
7303 palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
7304 punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
7305 pmaddubsw m1, m2, [r4 + 13 * 16] ; [29]
7307 pmaddubsw m3, m2, [r4 + 8 * 16] ; [24]
7310 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
7311 pmaddubsw m4, m2, [r4 + 3 * 16] ; [19]
7313 pmaddubsw m5, m2, [r4 - 2 * 16] ; [14]
7316 pmaddubsw m5, m2, [r4 - 7 * 16] ; [09]
7318 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
7321 palignr m2, above, 14 ;[6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
7322 pmaddubsw m6, m2, [r4 + 15 * 16] ; [31]
7324 pmaddubsw m1, m2, [r4 + 10 * 16] ; [26]
7327 pmaddubsw m1, m2, [r4 + 5 * 16] ; [21]
7329 pmaddubsw m3, m2, [r4] ; [16]
7332 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
7333 pmaddubsw m4, m2, [r4 - 5 * 16] ; [11]
7335 pmaddubsw m3, m2, [r4 - 10 * 16] ; [06]
7338 pmaddubsw m5, m2, [r4 - 15 * 16] ; [1]
7342 pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
7345 pmaddubsw m6, m2, [r4 + 7 * 16] ; [23]
7347 pmaddubsw m3, m2, [r4 + 2 * 16] ; [18]
7350 pmaddubsw m1, m2, [r4 - 3 * 16] ; [13]
7352 pmaddubsw m3, m2, [r4 - 8 * 16] ; [8]
7355 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
7356 pmaddubsw m4, m2, [r4 - 13 * 16] ; [3]
7360 pmaddubsw m5, m2, [r4 + 14 * 16] ; [30]
7363 pmaddubsw m5, m2, [r4 + 9 * 16] ; [25]
7365 pmaddubsw m6, m2, [r4 + 4 * 16] ; [20]
7368 pmaddubsw m6, m2, [r4 - 16] ; [15]
7370 pmaddubsw m1, m2, [r4 - 6 * 16] ; [10]
7373 pmaddubsw m1, m2, [r4 - 11 * 16] ; [05]
7379 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
; Body of a macro (these lines use the macro parameter %1).
; NOTE(review): the TRANSPOSE_STORE_8x8 group index restarts at 0 here and the embedded
; source numbering skips (7379 -> 7385), which suggests these lines belong to a separate
; macro from the one preceding them; the delimiting %endmacro / %macro lines and several
; instructions are not visible in this listing.
; How to read the visible instructions (x86inc multi-operand forms, destination first):
;   punpckhbw d, a, b      - d = high 8 bytes of a interleaved with the high 8 bytes of b
;   pmaddubsw d, s, [r4 + k * 16] - multiply the bytes of s by the 16 table bytes and add
;                            each adjacent pair of products into a 16-bit word
; The trailing [nn] on each pmaddubsw equals 16 + k, i.e. the table entry index when
; r4 = ang_table + 16 * 16. The entry sequence is the same as in MODE_12_24_ROW0.
7385 punpckhbw m0, m2, m1
7388 pmaddubsw m4, m0, [r4 + 11 * 16] ; [27]
7390 pmaddubsw m3, m0, [r4 + 6 * 16] ; [22]
7393 pmaddubsw m5, m0, [r4 + 16] ; [17]
7395 pmaddubsw m6, m0, [r4 - 4 * 16] ; [12]
7398 pmaddubsw m6, m0, [r4 - 9 * 16] ; [7]
7400 pmaddubsw m3, m0, [r4 - 14 * 16] ; [2]
7403 pmaddubsw m1, m2, [r4 + 13 * 16] ; [29]
7405 pmaddubsw m3, m2, [r4 + 8 * 16] ; [24]
7408 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
7409 pmaddubsw m4, m2, [r4 + 3 * 16] ; [19]
7411 pmaddubsw m5, m2, [r4 - 2 * 16] ; [14]
7414 pmaddubsw m5, m2, [r4 - 7 * 16] ; [09]
7416 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
7421 punpckhbw m2, m0, m1
7424 pmaddubsw m6, m2, [r4 + 15 * 16] ; [31]
7426 pmaddubsw m1, m2, [r4 + 10 * 16] ; [26]
7429 pmaddubsw m1, m2, [r4 + 5 * 16] ; [21]
7431 pmaddubsw m3, m2, [r4] ; [16]
7434 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
7435 pmaddubsw m4, m2, [r4 - 5 * 16] ; [11]
7437 pmaddubsw m3, m2, [r4 - 10 * 16] ; [06]
7440 pmaddubsw m5, m2, [r4 - 15 * 16] ; [1]
7444 punpckhbw m2, m0, m1
7447 pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
7450 pmaddubsw m6, m2, [r4 + 7 * 16] ; [23]
7452 pmaddubsw m3, m2, [r4 + 2 * 16] ; [18]
7455 pmaddubsw m1, m2, [r4 - 3 * 16] ; [13]
7457 pmaddubsw m3, m2, [r4 - 8 * 16] ; [8]
7460 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
7461 pmaddubsw m4, m2, [r4 - 13 * 16] ; [3]
7465 punpckhbw m0, m2, m1
7468 pmaddubsw m5, m0, [r4 + 14 * 16] ; [30]
7471 pmaddubsw m5, m0, [r4 + 9 * 16] ; [25]
7473 pmaddubsw m6, m0, [r4 + 4 * 16] ; [20]
7476 pmaddubsw m6, m0, [r4 - 16] ; [15]
7478 pmaddubsw m1, m0, [r4 - 6 * 16] ; [10]
7481 pmaddubsw m1, m0, [r4 - 11 * 16] ; [05]
7487 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
7489 ;-----------------------------------------------------------------------------------------------------------------
7490 ; void intraPredAng32_12(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7491 ;-----------------------------------------------------------------------------------------------------------------
; Entry stub of the angular intra predictor named for mode 12, block size 32.
; "4,7,8,0-(1*mmsize)" follows the x86inc cglobal convention: 4 arguments loaded into
; r0-r3, 7 general-purpose registers, 8 vector registers, and a stack request of one
; mmsize-byte slot written as a negative value (NOTE(review): in x86inc a negative stack
; size changes how the original stack pointer is kept - confirm against x86inc.asm).
; NOTE(review): the embedded source numbering skips between the lines below; the macro
; invocations and any loop control between the two pointer advances are not visible here.
7493 cglobal intra_pred_ang32_12, 4,7,8,0-(1*mmsize)
; "above" names the single stack slot; the MODE_12_24_ROW0 macro reads it as a memory operand
7494 %define above [rsp + 0 * mmsize]
; r4 -> ang_table + 16 * 16 bytes, so [r4 + k * 16] is the 16 bytes at table offset (16 + k) * 16
7496 lea r4, [ang_table + 16 * 16]
7497 lea r5, [r1 * 3] ; r5 -> 3 * stride
7498 lea r6, [r0 + r1 * 4] ; r6 -> r0 + 4 * stride
; first pointer advance: r0 = r6 + 4 * stride, then r6 += 8 * stride
7502 lea r0, [r6 + r1 * 4]
7503 lea r6, [r6 + r1 * 8]
; second pointer advance, same arithmetic
7508 lea r0, [r6 + r1 * 4]
7509 lea r6, [r6 + r1 * 8]
; MODE_13_23_ROW0: macro taking one parameter, which is forwarded unchanged to
; TRANSPOSE_STORE_8x8 (defined elsewhere) for each of the four groups 0..3.
; Register / symbol expectations visible in the body:
;   r2    - base of the bytes loaded with movu
;   r4    - table base; the trailing [nn] on each pmaddubsw equals 16 + k for
;           [r4 + k * 16], i.e. the table entry index when r4 = ang_table + 16 * 16
;   above - memory operand that must be %defined by the user of the macro
;           (intra_pred_ang32_13 in this file defines it as [rsp + 0 * mmsize])
; pmaddubsw d, s, [mem] multiplies the bytes of s by the 16 table bytes and adds each
; adjacent pair of products into a 16-bit word.
; NOTE(review): the embedded source numbering skips inside this body and the closing
; %endmacro is not visible, so some instructions are missing from this listing.
7515 %macro MODE_13_23_ROW0 1
; c_mode32_13_0 gathers source bytes 3, 6, 10, 13 into lanes 0..3 (other lanes get byte 0)
7518 pshufb m0, [c_mode32_13_0]
7519 pshufb m1, [c_mode32_13_0]
; c_mode32_13_shuf puts source bytes 7..0 into lanes 8..15 (lanes 0..7 get byte 0)
7521 pshufb m0, [c_mode32_13_shuf]
7526 pmaddubsw m4, m2, [r4 + 7 * 16] ; [23]
7528 pmaddubsw m3, m2, [r4 - 2 * 16] ; [14]
7531 pmaddubsw m5, m2, [r4 - 11 * 16] ; [5]
7533 movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
7534 palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
7535 punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
7536 pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
7539 pmaddubsw m6, m2, [r4 + 3 * 16] ; [19]
7541 pmaddubsw m0, m2, [r4 - 6 * 16] ; [10]
7544 pmaddubsw m1, m2, [r4 - 15 * 16] ; [1]
; shift two more bytes of "above" into the low end of the interleaved window
7546 palignr m2, above, 14
7547 pmaddubsw m3, m2, [r4 + 8 * 16] ; [24]
7550 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
7551 pmaddubsw m4, m2, [r4 - 16] ; [15]
7553 pmaddubsw m5, m2, [r4 - 10 * 16] ; [6]
7558 pmaddubsw m5, m2, [r4 + 13 * 16] ; [29]
7560 pmaddubsw m6, m2, [r4 + 4 * 16] ; [20]
7563 pmaddubsw m6, m2, [r4 - 5 * 16] ; [11]
7565 pmaddubsw m1, m2, [r4 - 14 * 16] ; [2]
7570 pmaddubsw m1, m2, [r4 + 9 * 16] ; [25]
7572 pmaddubsw m0, m2, [r4] ; [16]
7575 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
7576 pmaddubsw m4, m2, [r4 - 9 * 16] ; [7]
7580 pmaddubsw m3, m2, [r4 + 14 * 16] ; [30]
7583 pmaddubsw m5, m2, [r4 + 5 * 16] ; [21]
7585 pmaddubsw m6, m2, [r4 - 4 * 16] ; [12]
7588 pmaddubsw m6, m2, [r4 - 13 * 16] ; [3]
7592 pmaddubsw m0, m2, [r4 + 10 * 16] ; [26]
7595 pmaddubsw m1, m2, [r4 + 16] ; [17]
7597 pmaddubsw m0, m2, [r4 - 8 * 16] ; [8]
7600 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
7603 pmaddubsw m4, m2, [r4 + 15 * 16] ; [31]
7605 pmaddubsw m5, m2, [r4 + 6 * 16] ; [22]
7608 pmaddubsw m5, m2, [r4 - 3 * 16] ; [13]
7610 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
7615 pmaddubsw m6, m2, [r4 + 11 * 16] ; [27]
7617 pmaddubsw m1, m2, [r4 + 2 * 16] ; [18]
7620 pmaddubsw m1, m2, [r4 - 7 * 16] ; [09]
7622 pmaddubsw m3, m2, [r4 - 16 * 16] ; [00]
7625 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
; Body of a macro (these lines use the macro parameter %1).
; NOTE(review): the TRANSPOSE_STORE_8x8 group index restarts at 0 here and the embedded
; source numbering skips (7625 -> 7629), which suggests these lines belong to a separate
; macro from the one preceding them; the delimiting %endmacro / %macro lines and several
; instructions are not visible in this listing.
; Visible pattern: 16 bytes are loaded from r2 at a decreasing offset (0, -2, -4, -6, -7),
; paired with the same data shifted by one byte, and the interleaved pairs are multiplied
; by table entries with pmaddubsw (adjacent products summed into 16-bit words).
; The trailing [nn] on each pmaddubsw equals 16 + k for [r4 + k * 16], i.e. the table
; entry index when r4 = ang_table + 16 * 16.
; NOTE(review): the lane labels after the later loads repeat the labels of the first
; load, so they appear to be relative to each load's own start address - confirm.
7629 movu m2, [r2] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
7630 palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
7631 punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
7632 punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
7633 palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
7634 pmaddubsw m4, m0, [r4 + 7 * 16] ; [23]
7636 pmaddubsw m3, m0, [r4 - 2 * 16] ; [14]
7639 pmaddubsw m5, m0, [r4 - 11 * 16] ; [05]
7641 pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
7644 pmaddubsw m6, m2, [r4 + 3 * 16] ; [19]
7646 pmaddubsw m3, m2, [r4 - 6 * 16] ; [10]
7649 pmaddubsw m1, m2, [r4 - 15 * 16] ; [1]
7651 movu m2, [r2 - 2] ; [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1]
7652 palignr m3, m2, 1 ; [x, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
7653 punpckhbw m0, m2, m3
7656 pmaddubsw m3, m0, [r4 + 8 * 16] ; [24]
7660 TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
7661 pmaddubsw m4, m3, [r4 - 16] ; [15]
7663 pmaddubsw m5, m3, [r4 - 10 * 16] ; [6]
7666 pmaddubsw m5, m2, [r4 + 13 * 16] ; [29]
7668 pmaddubsw m6, m2, [r4 + 4 * 16] ; [20]
7671 pmaddubsw m6, m2, [r4 - 5 * 16] ; [11]
7673 pmaddubsw m1, m2, [r4 - 14 * 16] ; [2]
7676 movu m2, [r2 - 4] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
7677 palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
7678 punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
7679 punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
7680 palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
7681 pmaddubsw m1, m0, [r4 + 9 * 16] ; [25]
7683 pmaddubsw m3, m0, [r4] ; [16]
7687 TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
7688 pmaddubsw m4, m3, [r4 - 9 * 16] ; [7]
7690 pmaddubsw m3, m2, [r4 + 14 * 16] ; [30]
7693 pmaddubsw m5, m2, [r4 + 5 * 16] ; [21]
7695 pmaddubsw m6, m2, [r4 - 4 * 16] ; [12]
7698 pmaddubsw m6, m2, [r4 - 13 * 16] ; [3]
7700 movu m2, [r2 - 6] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
7701 palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
7702 punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
7703 punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
7704 palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
7705 pmaddubsw m3, m0, [r4 + 10 * 16] ; [26]
7708 pmaddubsw m1, m0, [r4 + 16] ; [17]
7710 pmaddubsw m3, m0, [r4 - 8 * 16] ; [8]
7713 TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
7714 pmaddubsw m4, m2, [r4 + 15 * 16] ; [31]
7716 pmaddubsw m5, m2, [r4 + 6 * 16] ; [22]
7719 pmaddubsw m5, m2, [r4 - 3 * 16] ; [13]
7721 pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
7724 movu m2, [r2 - 7] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
7725 palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
7726 punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
7727 pmaddubsw m6, m2, [r4 + 11 * 16] ; [27]
7729 pmaddubsw m1, m2, [r4 + 2 * 16] ; [18]
7732 pmaddubsw m1, m2, [r4 - 7 * 16] ; [09]
7738 TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
7740 ;-----------------------------------------------------------------------------------------------------------------
7741 ; void intraPredAng32_13(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7742 ;-----------------------------------------------------------------------------------------------------------------
; Entry stub of the angular intra predictor named for mode 13, block size 32.
; "4,7,8,0-(1*mmsize)" follows the x86inc cglobal convention: 4 arguments loaded into
; r0-r3, 7 general-purpose registers, 8 vector registers, and a stack request of one
; mmsize-byte slot written as a negative value (NOTE(review): in x86inc a negative stack
; size changes how the original stack pointer is kept - confirm against x86inc.asm).
; NOTE(review): the embedded source numbering skips between the lines below; the macro
; invocations and any loop control between the two pointer advances are not visible here.
7744 cglobal intra_pred_ang32_13, 4,7,8,0-(1*mmsize)
; "above" names the single stack slot; the MODE_13_23_ROW0 macro reads it as a memory operand
7745 %define above [rsp + 0 * mmsize]
; r4 -> ang_table + 16 * 16 bytes, so [r4 + k * 16] is the 16 bytes at table offset (16 + k) * 16
7746 lea r4, [ang_table + 16 * 16]
7747 lea r5, [r1 * 3] ; r5 -> 3 * stride
7748 lea r6, [r0 + r1 * 4] ; r6 -> r0 + 4 * stride
; first pointer advance: r0 = r6 + 4 * stride, then r6 += 8 * stride
7752 lea r0, [r6 + r1 * 4]
7753 lea r6, [r6 + r1 * 8]
; second pointer advance, same arithmetic
7758 lea r0, [r6 + r1 * 4]
7759 lea r6, [r6 + r1 * 8]
7765 ;-------------------------------------------------------------------------------------------------------------------
7766 ; void intraPredAng32_14(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7767 ;-------------------------------------------------------------------------------------------------------------------
; Entry of the predictor named for mode 14, block size 32.
; "4,7,8" follows the x86inc cglobal convention: 4 arguments loaded into r0-r3,
; 7 general-purpose registers and 8 vector registers in use.
; The visible part gathers selected bytes with the c_mode32_14_0 shuffle mask, copies
; reference bytes into a scratch area addressed from rsp, points r2 at offset 13 of that
; area, loads the constants used by PROC32_8x8 and invokes PROC32_8x8 four times.
; NOTE(review): the embedded source numbering skips here; the stack alignment code,
; several loads/stores and any loop labels are not visible in this listing.
7769 cglobal intra_pred_ang32_14, 4,7,8
7770 ; NOTE: align the stack to 64 bytes so that all local data lies in the same cache line
7776 ; collect reference pixels
7779 pshufb m0, [c_mode32_14_0] ; [x x x x x x x x x 0 2 5 7 10 12 15]
7780 pshufb m1, [c_mode32_14_0] ; [x x x x x x x x x 15 17 20 22 25 27 30]
7781 pslldq m1, 10 ; [17 20 22 25 27 30 x x x x x x x x x x]
7782 palignr m0, m1, 10 ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30]
7785 movu m1, [r2 + 1 + 16]
7787 movu [rsp + 13 + 16], m1
; byte value 4 stored in the last byte of the 64-byte scratch area
; (presumably an iteration counter; its reader is not visible here)
7788 mov [rsp + 63], byte 4
7791 lea r2, [rsp + 13] ; r2 -> [0]
7792 lea r3, [c_shuf8_0] ; r3 -> shuffle8
7793 lea r4, [ang_table] ; r4 -> ang_table
7794 lea r5, [r1 * 3] ; r5 -> 3 * stride
7795 lea r6, [r0 + r1 * 4] ; r6 -> r0 + 4 * stride
7796 mova m5, [pw_1024] ; m5 -> 1024
7797 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
; PROC32_8x8 is defined elsewhere; its first argument steps 0..3 and the eight trailing
; numbers are presumably ang_table entry indices for the eight lines of each group
7809 PROC32_8x8 0, 1, 19,6,25,12,31,18,5,24
7820 PROC32_8x8 1, 1, 11,30,17,4,23,10,29,16
7831 PROC32_8x8 2, 1, 3,22,9,28,15,2,21,8
7842 PROC32_8x8 3, 1, 27,14,1,20,7,26,13,0
; pointer advance: r0 = r6 + 4 * stride, then r6 += 8 * stride
7844 lea r0, [r6 + r1 * 4]
7845 lea r6, [r6 + r1 * 8]
7852 ;-------------------------------------------------------------------------------------------------------------------
7853 ; void intraPredAng32_15(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7854 ;-------------------------------------------------------------------------------------------------------------------
; Entry of the predictor named for mode 15, block size 32.
; "4,7,8" follows the x86inc cglobal convention: 4 arguments loaded into r0-r3,
; 7 general-purpose registers and 8 vector registers in use.
; The visible part gathers selected bytes with the c_mode32_15_0 shuffle mask, copies
; reference bytes into a scratch area addressed from rsp, points r2 at offset 17 of that
; area, loads the constants used by PROC32_8x8 and invokes PROC32_8x8 four times.
; NOTE(review): the embedded source numbering skips here; the stack alignment code,
; several loads/stores and any loop labels are not visible in this listing.
7856 cglobal intra_pred_ang32_15, 4,7,8
7857 ; NOTE: align the stack to 64 bytes so that all local data lies in the same cache line
7863 ; collect reference pixels
7866 pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15]
7867 pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 19 21 23 24 26 28 30]
7871 movu m1, [r2 + 1 + 16]
7873 movu [rsp + 17 + 16], m1
; byte value 4 stored in the last byte of the 64-byte scratch area
; (presumably an iteration counter; its reader is not visible here)
7874 mov [rsp + 63], byte 4
7877 lea r2, [rsp + 17] ; r2 -> [0]
7878 lea r3, [c_shuf8_0] ; r3 -> shuffle8
7879 lea r4, [ang_table] ; r4 -> ang_table
7880 lea r5, [r1 * 3] ; r5 -> 3 * stride
7881 lea r6, [r0 + r1 * 4] ; r6 -> r0 + 4 * stride
7882 mova m5, [pw_1024] ; m5 -> 1024
7883 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
; PROC32_8x8 is defined elsewhere; its first argument steps 0..3 and the eight trailing
; numbers are presumably ang_table entry indices for the eight lines of each group
7895 PROC32_8x8 0, 1, 15,30,13,28,11,26,9,24
7906 PROC32_8x8 1, 1, 7,22,5,20,3,18,1,16
7917 PROC32_8x8 2, 1, 31,14,29,12,27,10,25,8
7928 PROC32_8x8 3, 1, 23,6,21,4,19,2,17,0
; pointer advance: r0 = r6 + 4 * stride, then r6 += 8 * stride
7930 lea r0, [r6 + r1 * 4]
7931 lea r6, [r6 + r1 * 8]
7938 ;-------------------------------------------------------------------------------------------------------------------
7939 ; void intraPredAng32_16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
7940 ;-------------------------------------------------------------------------------------------------------------------
; Entry of the predictor named for mode 16, block size 32.
; "4,7,8" follows the x86inc cglobal convention: 4 arguments loaded into r0-r3,
; 7 general-purpose registers and 8 vector registers in use.
; The visible part gathers selected bytes with the c_mode32_16_0 shuffle mask, copies
; reference bytes into a scratch area addressed from rsp, points r2 at offset 21 of that
; area, loads the constants used by PROC32_8x8 and invokes PROC32_8x8 four times.
; NOTE(review): the embedded source numbering skips here; the stack alignment code,
; several loads/stores and any loop labels are not visible in this listing.
7942 cglobal intra_pred_ang32_16, 4,7,8
7943 ; NOTE: align the stack to 64 bytes so that all local data lies in the same cache line
7949 ; collect reference pixels
7952 pshufb m0, [c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15]
7953 pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30]
7957 movu m1, [r2 + 1 + 16]
7959 movu [rsp + 21 + 16], m1
; byte value 4 stored in the last byte of the 64-byte scratch area
; (presumably an iteration counter; its reader is not visible here)
7960 mov [rsp + 63], byte 4
7963 lea r2, [rsp + 21] ; r2 -> [0]
7964 lea r3, [c_shuf8_0] ; r3 -> shuffle8
7965 lea r4, [ang_table] ; r4 -> ang_table
7966 lea r5, [r1 * 3] ; r5 -> 3 * stride
7967 lea r6, [r0 + r1 * 4] ; r6 -> r0 + 4 * stride
7968 mova m5, [pw_1024] ; m5 -> 1024
7969 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
; PROC32_8x8 is defined elsewhere; its first argument steps 0..3 and the eight trailing
; numbers are presumably ang_table entry indices for the eight lines of each group
7981 PROC32_8x8 0, 1, 11,22,1,12,23,2,13,24
7992 PROC32_8x8 1, 1, 3,14,25,4,15,26,5,16
8003 PROC32_8x8 2, 1, 27,6,17,28,7,18,29,8
8014 PROC32_8x8 3, 1, 19,30,9,20,31,10,21,0
; pointer advance: r0 = r6 + 4 * stride, then r6 += 8 * stride
8016 lea r0, [r6 + r1 * 4]
8017 lea r6, [r6 + r1 * 8]
8024 ;------------------------------------------------------------------------------------------------------------------
8025 ; void intraPredAng32_17(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8026 ;------------------------------------------------------------------------------------------------------------------
; Entry of the predictor named for mode 17, block size 32.
; "4,7,8" follows the x86inc cglobal convention: 4 arguments loaded into r0-r3,
; 7 general-purpose registers and 8 vector registers in use.
; The visible part gathers selected bytes with the c_mode32_17_0 shuffle mask, copies
; reference bytes into a scratch area addressed from rsp, points r2 into that area,
; loads the constants used by PROC32_8x8 and invokes PROC32_8x8 four times.
; NOTE(review): the embedded source numbering skips here; the stack alignment code,
; several loads/stores and any loop labels are not visible in this listing.
8028 cglobal intra_pred_ang32_17, 4,7,8
8029 ; NOTE: align the stack to 64 bytes so that all local data lies in the same cache line
8035 ; collect reference pixels
8038 pshufb m0, [c_mode32_17_0]
8039 pshufb m1, [c_mode32_17_0]
8043 movu m1, [r2 + 1 + 16]
8045 movu [rsp + 26 + 16], m1
; byte value 4 stored in the last byte of the 64-byte scratch area
; (presumably an iteration counter; its reader is not visible here)
8046 mov [rsp + 63], byte 4
; NOTE(review): the store above uses offset 26 while r2 is set to rsp + 25; in the
; mode 14/15/16 routines of this file the two offsets are equal (13, 17, 21).
; Confirm that the one-byte difference is intended.
8049 lea r2, [rsp + 25] ; r2 -> [0]
8050 lea r3, [c_shuf8_0] ; r3 -> shuffle8
8051 lea r4, [ang_table] ; r4 -> ang_table
8052 lea r5, [r1 * 3] ; r5 -> 3 * stride
8053 lea r6, [r0 + r1 * 4] ; r6 -> r0 + 4 * stride
8054 mova m5, [pw_1024] ; m5 -> 1024
8055 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
; PROC32_8x8 is defined elsewhere; its first argument steps 0..3 and the eight trailing
; numbers (presumably ang_table entry indices) are identical for groups 0/2 and 1/3
8067 PROC32_8x8 0, 1, 6,12,18,24,30,4,10,16
8078 PROC32_8x8 1, 1, 22,28,2,8,14,20,26,0
8089 PROC32_8x8 2, 1, 6,12,18,24,30,4,10,16
8100 PROC32_8x8 3, 1, 22,28,2,8,14,20,26,0
; pointer advance: r0 = r6 + 4 * stride, then r6 += 8 * stride
8102 lea r0, [r6 + r1 * 4]
8103 lea r6, [r6 + r1 * 8]
8111 ;-------------------------------------------------------------------------------------------------------------------
8112 ; void intraPredAng32_18(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8113 ;-------------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 18, 32x32 block (per the function name).
; cglobal operands (x86inc): 4 arguments loaded into r0-r3, 5 GPRs, 5 vector registers.
; On entry, per the prototype comment above: r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
; The bracketed lane comments list element indices from the highest byte down to the lowest.
; palignr m4, a, b, n (three-operand x86inc form) yields bytes n..n+15 of the
; 32-byte concatenation a:b, with a in the upper half.
; NOTE(review): r2 and r3 are read as pointers at the top but used as offsets from r0
; in the stores below -- presumably they are reloaded with 2 * stride and 3 * stride; confirm.
8115 cglobal intra_pred_ang32_18, 4,5,5
8116 movu m0, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
8117 movu m1, [r3 + 16] ; [31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16]
8118 movu m2, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
8119 movu m3, [r2 + 17] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
; per the lane comments, c_mode32_18_0 reverses the byte order of m2 and m3
8128 pshufb m2, [c_mode32_18_0] ; [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
8129 pshufb m3, [c_mode32_18_0] ; [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]
; Shift counts 15 down to 1 over the pairs m0:m2 and m1:m0.
; Each visible store writes m4 at byte offset 16 of the addressed row.
8131 palignr m4, m0, m2, 15
8133 palignr m4, m1, m0, 15
8134 movu [r0 + r1 + 16], m4
8135 palignr m4, m0, m2, 14
8137 palignr m4, m1, m0, 14
8138 movu [r0 + r2 + 16], m4
8139 palignr m4, m0, m2, 13
8141 palignr m4, m1, m0, 13
8142 movu [r0 + r3 + 16], m4
8146 palignr m4, m0, m2, 12
8148 palignr m4, m1, m0, 12
8150 palignr m4, m0, m2, 11
8152 palignr m4, m1, m0, 11
8153 movu [r0 + r1 + 16], m4
8154 palignr m4, m0, m2, 10
8156 palignr m4, m1, m0, 10
8157 movu [r0 + r2 + 16], m4
8158 palignr m4, m0, m2, 9
8160 palignr m4, m1, m0, 9
8161 movu [r0 + r3 + 16], m4
8165 palignr m4, m0, m2, 8
8167 palignr m4, m1, m0, 8
8169 palignr m4, m0, m2, 7
8171 palignr m4, m1, m0, 7
8172 movu [r0 + r1 + 16], m4
8173 palignr m4, m0, m2, 6
8175 palignr m4, m1, m0, 6
8176 movu [r0 + r2 + 16], m4
8177 palignr m4, m0, m2, 5
8179 palignr m4, m1, m0, 5
8180 movu [r0 + r3 + 16], m4
8184 palignr m4, m0, m2, 4
8186 palignr m4, m1, m0, 4
8188 palignr m4, m0, m2, 3
8190 palignr m4, m1, m0, 3
8191 movu [r0 + r1 + 16], m4
8192 palignr m4, m0, m2, 2
8194 palignr m4, m1, m0, 2
8195 movu [r0 + r2 + 16], m4
8196 palignr m4, m0, m2, 1
8198 palignr m4, m1, m0, 1
8199 movu [r0 + r3 + 16], m4
; From here the register pairs move one register along: m2:m3 and m0:m2,
; again with shift counts 15 down to 1.
8205 palignr m4, m2, m3, 15
8207 palignr m4, m0, m2, 15
8208 movu [r0 + r1 + 16], m4
8209 palignr m4, m2, m3, 14
8211 palignr m4, m0, m2, 14
8212 movu [r0 + r2 + 16], m4
8213 palignr m4, m2, m3, 13
8215 palignr m4, m0, m2, 13
8216 movu [r0 + r3 + 16], m4
8220 palignr m4, m2, m3, 12
8222 palignr m4, m0, m2, 12
8224 palignr m4, m2, m3, 11
8226 palignr m4, m0, m2, 11
8227 movu [r0 + r1 + 16], m4
8228 palignr m4, m2, m3, 10
8230 palignr m4, m0, m2, 10
8231 movu [r0 + r2 + 16], m4
8232 palignr m4, m2, m3, 9
8234 palignr m4, m0, m2, 9
8235 movu [r0 + r3 + 16], m4
8239 palignr m4, m2, m3, 8
8241 palignr m4, m0, m2, 8
8243 palignr m4, m2, m3, 7
8245 palignr m4, m0, m2, 7
8246 movu [r0 + r1 + 16], m4
8247 palignr m4, m2, m3, 6
8249 palignr m4, m0, m2, 6
8250 movu [r0 + r2 + 16], m4
8251 palignr m4, m2, m3, 5
8253 palignr m4, m0, m2, 5
8254 movu [r0 + r3 + 16], m4
8258 palignr m4, m2, m3, 4
8260 palignr m4, m0, m2, 4
8262 palignr m4, m2, m3, 3
8264 palignr m4, m0, m2, 3
8265 movu [r0 + r1 + 16], m4
8266 palignr m4, m2, m3, 2
8268 palignr m4, m0, m2, 2
8269 movu [r0 + r2 + 16], m4
8270 palignr m4, m2, m3, 1
8272 palignr m4, m0, m2, 1
8273 movu [r0 + r3 + 16], m4
8276 ;------------------------------------------------------------------------------------------------------------------
8277 ; void intraPredAng32_19(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8278 ;------------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 19, 32x32 block (per the function name).
; cglobal operands (x86inc): 4 arguments loaded into r0-r3, 7 GPRs, 8 vector registers.
; Uses the same shuffle table (c_mode32_17_0) and the same PROC32_8x8 index lists as
; intra_pred_ang32_17, but passes 0 as the second macro operand and advances r0
; by 4 * stride between expansions.
8280 cglobal intra_pred_ang32_19, 4,7,8
8281 ; NOTE: the stack is aligned to 64 bytes so that all local data lies in the same cache line
8288 ; collect the reference pixels
8291 pshufb m0, [c_mode32_17_0]
8292 pshufb m1, [c_mode32_17_0]
8296 movu m1, [r2 + 1 + 16] ; m1 = 16 bytes loaded from r2 + 17
8298 movu [rsp + 26 + 16], m1 ; stored at stack offset 42
8299 mov [rsp + 63], byte 4 ; byte constant 4 at stack offset 63 -- presumably an iteration counter; TODO confirm
8302 lea r2, [rsp + 25] ; r2 -> [0] (element 0 of the buffer on the stack)
8303 lea r3, [c_shuf8_0] ; r3 -> shuffle8
8304 lea r4, [ang_table] ; r4 -> ang_table
8305 lea r5, [r1 * 3] ; r5 -> 3 * stride
8306 lea r6, [r0] ; r6 -> r0 (copy of the current destination pointer)
8307 mova m5, [pw_1024] ; m5 -> 1024
8308 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
; The eight trailing macro values are presumably per-row ang_table indices -- TODO confirm against PROC32_8x8.
8320 PROC32_8x8 0, 0, 6,12,18,24,30,4,10,16
8331 lea r0, [r0 + r1 * 4] ; r0 += 4 * stride
8332 PROC32_8x8 1, 0, 22,28,2,8,14,20,26,0
8343 lea r0, [r0 + r1 * 4] ; r0 += 4 * stride
8344 PROC32_8x8 2, 0, 6,12,18,24,30,4,10,16
8355 lea r0, [r0 + r1 * 4] ; r0 += 4 * stride
8356 PROC32_8x8 3, 0, 22,28,2,8,14,20,26,0
8366 ;-------------------------------------------------------------------------------------------------------------------
8367 ; void intraPredAng32_20(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8368 ;-------------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 20, 32x32 block (per the function name).
; cglobal operands (x86inc): 4 arguments loaded into r0-r3, 7 GPRs, 8 vector registers.
; The bracketed lane comments list source element indices from the highest byte down; x = don't care.
8370 cglobal intra_pred_ang32_20, 4,7,8
8371 ; NOTE: the stack is aligned to 64 bytes so that all local data lies in the same cache line
8378 ; collect the reference pixels
8381 pshufb m0, [c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15]
8382 pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30]
8386 movu m1, [r2 + 1 + 16] ; m1 = 16 bytes loaded from r2 + 17
8388 movu [rsp + 21 + 16], m1 ; stored at stack offset 37
8389 mov [rsp + 63], byte 4 ; byte constant 4 at stack offset 63 -- presumably an iteration counter; TODO confirm
8392 lea r2, [rsp + 21] ; r2 -> [0] (element 0 of the buffer on the stack)
8393 lea r3, [c_shuf8_0] ; r3 -> shuffle8
8394 lea r4, [ang_table] ; r4 -> ang_table
8395 lea r5, [r1 * 3] ; r5 -> 3 * stride
8396 lea r6, [r0] ; r6 -> r0 (copy of the current destination pointer)
8397 mova m5, [pw_1024] ; m5 -> 1024
8398 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
; The eight trailing macro values are presumably per-row ang_table indices -- TODO confirm against PROC32_8x8.
8410 PROC32_8x8 0, 0, 11,22,1,12,23,2,13,24
8421 lea r0, [r0 + r1 * 4] ; r0 += 4 * stride
8422 PROC32_8x8 1, 0, 3,14,25,4,15,26,5,16
8433 lea r0, [r0 + r1 * 4] ; r0 += 4 * stride
8434 PROC32_8x8 2, 0, 27,6,17,28,7,18,29,8
8445 lea r0, [r0 + r1 * 4] ; r0 += 4 * stride
8446 PROC32_8x8 3, 0, 19,30,9,20,31,10,21,0
8456 ;-------------------------------------------------------------------------------------------------------------------
8457 ; void intraPredAng32_21(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8458 ;-------------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 21, 32x32 block (per the function name).
; cglobal operands (x86inc): 4 arguments loaded into r0-r3, 7 GPRs, 8 vector registers.
; The bracketed lane comments list source element indices from the highest byte down; x = don't care.
8460 cglobal intra_pred_ang32_21, 4,7,8
8461 ; NOTE: the stack is aligned to 64 bytes so that all local data lies in the same cache line
8468 ; collect the reference pixels
8471 pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15]
8472 pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 19 21 23 24 26 28 30]
8476 movu m1, [r2 + 1 + 16] ; m1 = 16 bytes loaded from r2 + 17
8478 movu [rsp + 17 + 16], m1 ; stored at stack offset 33
8479 mov [rsp + 63], byte 4 ; byte constant 4 at stack offset 63 -- presumably an iteration counter; TODO confirm
8482 lea r2, [rsp + 17] ; r2 -> [0] (element 0 of the buffer on the stack)
8483 lea r3, [c_shuf8_0] ; r3 -> shuffle8
8484 lea r4, [ang_table] ; r4 -> ang_table
8485 lea r5, [r1 * 3] ; r5 -> 3 * stride
8486 lea r6, [r0] ; r6 -> r0 (copy of the current destination pointer)
8487 mova m5, [pw_1024] ; m5 -> 1024
8488 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
; The eight trailing macro values are presumably per-row ang_table indices -- TODO confirm against PROC32_8x8.
8500 PROC32_8x8 0, 0, 15,30,13,28,11,26,9,24
8511 lea r0, [r0 + r1 * 4] ; r0 += 4 * stride
8512 PROC32_8x8 1, 0, 7,22,5,20,3,18,1,16
8523 lea r0, [r0 + r1 * 4] ; r0 += 4 * stride
8524 PROC32_8x8 2, 0, 31,14,29,12,27,10,25,8
8535 lea r0, [r0 + r1 * 4] ; r0 += 4 * stride
8536 PROC32_8x8 3, 0, 23,6,21,4,19,2,17,0
8546 ;-------------------------------------------------------------------------------------------------------------------
8547 ; void intraPredAng32_22(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8548 ;-------------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 22, 32x32 block (per the function name).
; cglobal operands (x86inc): 4 arguments loaded into r0-r3, 7 GPRs, 8 vector registers.
; The bracketed lane comments list source element indices from the highest byte down; x = don't care.
8550 cglobal intra_pred_ang32_22, 4,7,8
8551 ; NOTE: the stack is aligned to 64 bytes so that all local data lies in the same cache line
8559 ; collect the reference pixels
8562 pshufb m0, [c_mode32_14_0] ; [x x x x x x x x x 0 2 5 7 10 12 15]
8563 pshufb m1, [c_mode32_14_0] ; [x x x x x x x x x 15 17 20 22 25 27 30]
; move the six highest selected bytes of m1 to the top of the register, then splice them below m0
8564 pslldq m1, 10 ; [17 20 22 25 27 30 x x x x x x x x x x]
8565 palignr m0, m1, 10 ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30]
8568 movu m1, [r2 + 1 + 16] ; m1 = 16 bytes loaded from r2 + 17
8570 movu [rsp + 13 + 16], m1 ; stored at stack offset 29
8571 mov [rsp + 63], byte 4 ; byte constant 4 at stack offset 63 -- presumably an iteration counter; TODO confirm
8574 lea r2, [rsp + 13] ; r2 -> [0] (element 0 of the buffer on the stack)
8575 lea r3, [c_shuf8_0] ; r3 -> shuffle8
8576 lea r4, [ang_table] ; r4 -> ang_table
8577 lea r5, [r1 * 3] ; r5 -> 3 * stride
8578 lea r6, [r0] ; r6 -> r0 (copy of the current destination pointer)
8579 mova m5, [pw_1024] ; m5 -> 1024
8580 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
; The eight trailing macro values are presumably per-row ang_table indices -- TODO confirm against PROC32_8x8.
8592 PROC32_8x8 0, 0, 19,6,25,12,31,18,5,24
8603 lea r0, [r0 + r1 * 4] ; r0 += 4 * stride
8604 PROC32_8x8 1, 0, 11,30,17,4,23,10,29,16
8615 lea r0, [r0 + r1 * 4] ; r0 += 4 * stride
8616 PROC32_8x8 2, 0, 3,22,9,28,15,2,21,8
8627 lea r0, [r0 + r1 * 4] ; r0 += 4 * stride
8628 PROC32_8x8 3, 0, 27,14,1,20,7,26,13,0
8638 ;-----------------------------------------------------------------------------------------------------------------
8639 ; void intraPredAng32_23(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8640 ;-----------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 23, 32x32 block (per the function name).
; cglobal operands (x86inc): 4 arguments loaded into r0-r3, 7 GPRs, 8 vector registers,
; and a stack request of mmsize bytes. The size is written as a negative value, which
; per the x86inc cglobal documentation avoids reserving an extra register for the
; original stack pointer.
8642 cglobal intra_pred_ang32_23, 4,7,8,0-(1*mmsize)
; "above" names the single mmsize-sized stack slot at rsp + 0
8643 %define above [rsp + 0 * mmsize]
8645 lea r4, [ang_table + 16 * 16] ; r4 -> ang_table + 256 bytes (row 16 if rows are 16 bytes wide -- TODO confirm)
8646 lea r5, [r1 * 3] ; r5 -> 3 * stride
8664 ;-----------------------------------------------------------------------------------------------------------------
8665 ; void intraPredAng32_24(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8666 ;-----------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 24, 32x32 block (per the function name).
; cglobal operands (x86inc): 4 arguments loaded into r0-r3, 7 GPRs, 8 vector registers,
; and a stack request of mmsize bytes. The size is written as a negative value, which
; per the x86inc cglobal documentation avoids reserving an extra register for the
; original stack pointer.
8668 cglobal intra_pred_ang32_24, 4,7,8,0-(1*mmsize)
; "above" names the single mmsize-sized stack slot at rsp + 0
8669 %define above [rsp + 0 * mmsize]
8671 lea r4, [ang_table + 16 * 16] ; r4 -> ang_table + 256 bytes (row 16 if rows are 16 bytes wide -- TODO confirm)
8672 lea r5, [r1 * 3] ; r5 -> 3 * stride
8690 ;-------------------------------------------------------------------------------------------------------------------
8691 ; void intraPredAng32_25(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8692 ;-------------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 25, 32x32 block (per the function name).
; cglobal operands (x86inc): 4 arguments loaded into r0-r3, 7 GPRs, 8 vector registers.
8694 cglobal intra_pred_ang32_25, 4,7,8
8695 ; NOTE: the stack is aligned to 64 bytes so that all local data lies in the same cache line
8702 ; collect the reference pixels
8705 pshufb m0, m1 ; [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
8711 movu [rsp + 1 + 16], m1 ; m1 stored at stack offset 17
8712 movu [rsp + 1 + 32], m2 ; m2 stored at stack offset 33
8713 mov [rsp + 63], byte 4 ; byte constant 4 at stack offset 63 -- presumably an iteration counter; TODO confirm
8716 lea r2, [rsp + 1] ; r2 -> [0] (element 0 of the buffer on the stack)
8717 lea r3, [c_shuf8_0] ; r3 -> shuffle8
8718 lea r4, [ang_table] ; r4 -> ang_table
8719 lea r5, [r1 * 3] ; r5 -> 3 * stride
8720 lea r6, [r0] ; r6 -> r0 (copy of the current destination pointer)
8721 mova m5, [pw_1024] ; m5 -> 1024
8722 mova m6, [c_deinterval8] ; m6 -> c_deinterval8
; The eight trailing macro values are presumably per-row ang_table indices -- TODO confirm against PROC32_8x8.
; The index lists of expansions 0 and 2 are identical, as are those of 1 and 3.
8734 PROC32_8x8 0, 0, 30,28,26,24,22,20,18,16
8745 lea r0, [r0 + r1 * 4] ; r0 += 4 * stride
8746 PROC32_8x8 1, 0, 14,12,10,8,6,4,2,0
8757 lea r0, [r0 + r1 * 4] ; r0 += 4 * stride
8758 PROC32_8x8 2, 0, 30,28,26,24,22,20,18,16
8769 lea r0, [r0 + r1 * 4] ; r0 += 4 * stride
8770 PROC32_8x8 3, 0, 14,12,10,8,6,4,2,0
8780 ;------------------------------------------------------------------------------------------------------------------
8781 ; void intraPredAng32_26(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8782 ;------------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 26, 32x32 block (per the function name).
; cglobal operands (x86inc): 6 arguments loaded into r0-r5, 7 GPRs, 7 vector registers,
; and a stack request of 2 * mmsize bytes. The size is written as a negative value, which
; per the x86inc cglobal documentation avoids reserving an extra register for the
; original stack pointer.
8784 cglobal intra_pred_ang32_26, 6,7,7,0-(2*mmsize)
; m8 and m9 are not registers here: they name the two mmsize-sized stack slots
8785 %define m8 [rsp + 0 * mmsize]
8786 %define m9 [rsp + 1 * mmsize]
; r5 is a row cursor advanced by 4 * stride per step; m0 is stored two rows below it
8800 movu [r0 + r1 * 2], m0
8802 lea r5, [r0 + r1 * 4]
8805 movu [r5 + r1 * 2], m0
8807 lea r5, [r5 + r1 * 4]
8810 movu [r5 + r1 * 2], m0
8812 lea r5, [r5 + r1 * 4]
8815 movu [r5 + r1 * 2], m0
8817 lea r5, [r0 + r1 * 4] ; cursor restarts from r0 + 4 * stride
8820 movu [r5 + r1 * 2], m0
8822 lea r5, [r5 + r1 * 4]
8825 movu [r5 + r1 * 2], m0
8827 lea r5, [r5 + r1 * 4]
8830 movu [r5 + r1 * 2], m0
8832 lea r5, [r5 + r1 * 4]
8835 movu [r5 + r1 * 2], m0
8837 lea r5, [r5 + r1 * 4]
8840 movu [r5 + r1 * 2], m0
8842 lea r5, [r5 + r1 * 4]
8845 movu [r5 + r1 * 2], m0
8847 lea r5, [r5 + r1 * 4]
8850 movu [r5 + r1 * 2], m0
; Single bytes of m0 are written one per row: byte k goes to row k, counted from r0.
; r4 addresses bytes 3, 7, 11 and 15, so it presumably holds 3 * stride -- TODO confirm.
8878 pextrb [r0 + r1], m0, 1
8879 pextrb [r0 + r1 * 2], m0, 2
8880 pextrb [r0 + r4], m0, 3
8881 lea r5, [r0 + r1 * 4] ; r5 -> row 4
8883 pextrb [r5 + r1], m0, 5
8884 pextrb [r5 + r1 * 2], m0, 6
8885 pextrb [r5 + r4], m0, 7
8886 lea r5, [r5 + r1 * 4] ; r5 -> row 8
8888 pextrb [r5 + r1], m0, 9
8889 pextrb [r5 + r1 * 2], m0, 10
8890 pextrb [r5 + r4], m0, 11
8891 lea r5, [r5 + r1 * 4] ; r5 -> row 12
8893 pextrb [r5 + r1], m0, 13
8894 pextrb [r5 + r1 * 2], m0, 14
8895 pextrb [r5 + r4], m0, 15
8904 ;------------------------------------------------------------------------------------------------------------------
8905 ; void intraPredAng32_27(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8906 ;------------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 27, 32x32 block (per the function name).
; cglobal operands (x86inc): 3 arguments loaded into r0-r2, 7 GPRs, 8 vector registers.
8908 cglobal intra_pred_ang32_27, 3,7,8
8910 lea r3, [ang_table + 16 * 16] ; r3 -> ang_table + 256 bytes (row 16 if rows are 16 bytes wide -- TODO confirm)
8924 ;------------------------------------------------------------------------------------------------------------------
8925 ; void intraPredAng32_28(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8926 ;------------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 28, 32x32 block (per the function name).
; cglobal operands (x86inc): 3 arguments loaded into r0-r2, 7 GPRs, 8 vector registers.
8928 cglobal intra_pred_ang32_28, 3,7,8
8930 lea r3, [ang_table + 16 * 16] ; r3 -> ang_table + 256 bytes (row 16 if rows are 16 bytes wide -- TODO confirm)
8944 ;------------------------------------------------------------------------------------------------------------------
8945 ; void intraPredAng32_29(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8946 ;------------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 29, 32x32 block (per the function name).
; cglobal operands (x86inc): 3 arguments loaded into r0-r2, 7 GPRs, 8 vector registers.
8948 cglobal intra_pred_ang32_29, 3,7,8
8950 lea r3, [ang_table + 16 * 16] ; r3 -> ang_table + 256 bytes (row 16 if rows are 16 bytes wide -- TODO confirm)
8964 ;------------------------------------------------------------------------------------------------------------------
8965 ; void intraPredAng32_30(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8966 ;------------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 30, 32x32 block (per the function name).
; cglobal operands (x86inc): 3 arguments loaded into r0-r2, 7 GPRs, 8 vector registers.
8968 cglobal intra_pred_ang32_30, 3,7,8
8970 lea r3, [ang_table + 16 * 16] ; r3 -> ang_table + 256 bytes (row 16 if rows are 16 bytes wide -- TODO confirm)
8984 ;------------------------------------------------------------------------------------------------------------------
8985 ; void intraPredAng32_31(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
8986 ;------------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 31, 32x32 block (per the function name).
; cglobal operands (x86inc): 3 arguments loaded into r0-r2, 7 GPRs, 8 vector registers.
8988 cglobal intra_pred_ang32_31, 3,7,8
8990 lea r3, [ang_table + 16 * 16] ; r3 -> ang_table + 256 bytes (row 16 if rows are 16 bytes wide -- TODO confirm)
9004 ;-----------------------------------------------------------------------------------------------------------------
9005 ; void intraPredAng32_32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
9006 ;-----------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 32, 32x32 block (per the function name).
; cglobal operands (x86inc): 3 arguments loaded into r0-r2, 7 GPRs, 8 vector registers.
9008 cglobal intra_pred_ang32_32, 3,7,8
9010 lea r3, [ang_table + 16 * 16] ; r3 -> ang_table + 256 bytes (row 16 if rows are 16 bytes wide -- TODO confirm)
9024 ;------------------------------------------------------------------------------------------------------------------
9025 ; void intraPredAng32_33(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
9026 ;------------------------------------------------------------------------------------------------------------------
; Angular intra prediction, mode 33, 32x32 block (per the function name).
; cglobal operands (x86inc): 3 arguments loaded into r0-r2, 7 GPRs, 8 vector registers.
9028 cglobal intra_pred_ang32_33, 3,7,8
9030 lea r3, [ang_table + 16 * 16] ; r3 -> ang_table + 256 bytes (row 16 if rows are 16 bytes wide -- TODO confirm)
9044 ;-----------------------------------------------------------------------------
9045 ; void all_angs_pred_4x4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
9046 ;-----------------------------------------------------------------------------
; Computes the 4x4 predictions for all angular modes into one output buffer (per the function name).
; cglobal operands (x86inc): 6 arguments loaded into r0-r5, 6 GPRs, 8 vector registers.
; On entry, per the prototype comment above: r0 = dest, r1 = above0, r2 = left0,
; r3 = above1, r4 = left1, r5 = bLuma.
; NOTE(review): r5 is used below as a table base ([r5 + k * 16]); presumably it has been
; re-pointed at ang_table, with k selecting a 16-byte row -- confirm.
; pmaddubsw multiplies the unsigned bytes of its first source by the signed bytes of
; its second source and adds adjacent products into 16-bit words.
9048 cglobal all_angs_pred_4x4, 6, 6, 8
9075 pmaddubsw m5, m1, [r5 + 26 * 16]
9082 mova m7, [r5 + 20 * 16] ; m7 caches table row 20 for the later "pmaddubsw ..., m7" uses
9084 pmaddubsw m6, m2, m7
9091 pmaddubsw m4, m3, [r5 + 14 * 16]
9098 pmaddubsw m4, [r5 + 8 * 16]
9105 pmaddubsw m4, m1, [r5 + 21 * 16]
9110 pmaddubsw m4, m2, [r5 + 10 * 16]
9115 pmaddubsw m4, m2, [r5 + 31 * 16]
9120 pmaddubsw m4, m3, m7
9127 pmaddubsw m4, m1, [r5 + 17 * 16]
9132 pmaddubsw m4, m2, [r5 + 2 * 16]
9137 pmaddubsw m4, m2, [r5 + 19 * 16]
9142 pmaddubsw m3, [r5 + 4 * 16]
9149 pmaddubsw m3, m1, [r5 + 13 * 16]
9156 pmaddubsw m3, m2, [r5 + 7 * 16]
9165 pmaddubsw m3, m1, [r5 + 9 * 16]
9170 pmaddubsw m3, m1, [r5 + 18 * 16]
9175 pmaddubsw m3, m1, [r5 + 27 * 16]
9180 pmaddubsw m2, [r5 + 4 * 16]
9187 pmaddubsw m2, m1, [r5 + 5 * 16]
9192 pmaddubsw m2, m1, [r5 + 10 * 16]
9197 pmaddubsw m2, m1, [r5 + 15 * 16]
9202 pmaddubsw m2, m1, m7
9209 pmaddubsw m2, m1, [r5 + 2 * 16]
9214 pmaddubsw m2, m1, [r5 + 4 * 16]
9219 pmaddubsw m2, m1, [r5 + 6 * 16]
9224 pmaddubsw m1, [r5 + 8 * 16]
; bytes 0..3 of m4 are written to r0 + 128 at a 4-byte pitch (one byte per 4-byte row)
9256 pextrb [r0 + 128], m4, 0
9257 pextrb [r0 + 132], m4, 1
9258 pextrb [r0 + 136], m4, 2
9259 pextrb [r0 + 140], m4, 3
9266 pmaddubsw m2, m1, [r5 + 30 * 16]
9271 pmaddubsw m2, m1, [r5 + 28 * 16]
9276 pmaddubsw m2, m1, [r5 + 26 * 16]
9281 pmaddubsw m2, m1, [r5 + 24 * 16]
9288 pmaddubsw m2, m1, [r5 + 27 * 16]
9293 pmaddubsw m2, m1, [r5 + 22 * 16]
9298 pmaddubsw m2, m1, [r5 + 17 * 16]
9303 pmaddubsw m2, m1, [r5 + 12 * 16]
9310 pmaddubsw m2, m1, [r5 + 23 * 16]
9315 pmaddubsw m2, m1, [r5 + 14 * 16]
9320 pmaddubsw m2, m1, [r5 + 5 * 16]
; From here individual bytes read from [r1 + k] are inserted into lanes 0 and 1
; before the multiply (r1 = above0 on entry, assuming it is unmodified -- TODO confirm).
9326 pinsrb m2, [r1 + 0], 1
9327 pinsrb m2, [r1 + 4], 0
9329 pmaddubsw m3, m2, [r5 + 28 * 16]
9336 pmaddubsw m3, m1, [r5 + 19 * 16]
9341 pmaddubsw m5, m1, [r5 + 6 * 16]
9346 pinsrb m2, [r1 + 2], 0
9348 pmaddubsw m3, m2, [r5 + 25 * 16]
9353 pmaddubsw m3, m2, [r5 + 12 * 16]
9360 pmaddubsw m3, m1, [r5 + 15 * 16]
9365 pmaddubsw m3, m2, [r5 + 30 * 16]
9370 pmaddubsw m3, m2, [r5 + 13 * 16]
9376 pinsrb m3, [r1 + 2], 1
9377 pinsrb m3, [r1 + 4], 0
9379 pmaddubsw m4, m3, [r5 + 28 * 16]
9386 pmaddubsw m4, m1, [r5 + 11 * 16]
9391 pmaddubsw m4, m2, [r5 + 22 * 16]
9396 pmaddubsw m4, m2, [r5 + 1 * 16]
9401 pinsrb m3, [r1 + 3], 0
9403 pmaddubsw m3, [r5 + 12 * 16]
9413 pinsrb m1, [r1 + 1], 0
9414 pinsrb m1, [r1 + 0], 1
9416 pmaddubsw m2, m1, [r5 + 12 * 16]
9422 pinsrb m1, [r1 + 2], 0
9423 pinsrb m1, [r1 + 1], 1
9425 pmaddubsw m2, m1, [r5 + 18 * 16]
9431 pinsrb m1, [r1 + 4], 0
9432 pinsrb m1, [r1 + 2], 1
9434 pmaddubsw m1, [r5 + 24 * 16]
; The byte inserts now read from [r2 + k] instead
; (r2 = left0 on entry, assuming it is unmodified -- TODO confirm).
9445 pinsrb m2, [r2 + 1], 0
9449 pinsrb m3, [r2 + 2], 0
9453 pinsrb m4, [r2 + 3], 0
9461 pmaddubsw m5, m1, [r5 + 6 * 16]
9467 pinsrb m2, [r2 + 1], 0
9470 pmaddubsw m3, m2, [r5 + 12 * 16]
9476 pinsrb m3, [r2 + 1], 1
9477 pinsrb m3, [r2 + 2], 0
9479 pmaddubsw m4, m3, [r5 + 18 * 16]
9485 pinsrb m3, [r2 + 2], 1
9486 pinsrb m3, [r2 + 4], 0
9488 pmaddubsw m3, [r5 + 24 * 16]
9495 pmaddubsw m3, m1, [r5 + 11 * 16]
9500 pinsrb m2, [r2 + 2], 0
9502 pmaddubsw m3, m2, [r5 + 22 * 16]
9507 pmaddubsw m3, m2, [r5 + 1 * 16]
9513 pinsrb m3, [r2 + 2], 1
9514 pinsrb m3, [r2 + 3], 0
9516 pmaddubsw m4, m3, [r5 + 12 * 16]
9523 pmaddubsw m4, m1, [r5 + 15 * 16]
9528 pmaddubsw m4, m2, [r5 + 30 * 16]
9533 pmaddubsw m4, m2, [r5 + 13 * 16]
9538 pinsrb m3, [r2 + 4], 0
9540 pmaddubsw m3, [r5 + 28 * 16]
9547 pmaddubsw m3, m1, [r5 + 19 * 16]
9554 pmaddubsw m3, m2, [r5 + 25 * 16]
9559 pmaddubsw m3, m2, [r5 + 12 * 16]
9566 pmaddubsw m3, m1, [r5 + 23 * 16]
9571 pmaddubsw m3, m1, [r5 + 14 * 16]
9576 pmaddubsw m3, m1, [r5 + 5 * 16]
9581 pinsrb m2, [r2 + 4], 0
9583 pmaddubsw m2, [r5 + 28 * 16]
9590 pmaddubsw m2, m1, [r5 + 27 * 16]
9595 pmaddubsw m2, m1, [r5 + 22 * 16]
9600 pmaddubsw m2, m1, [r5 + 17 * 16]
9605 pmaddubsw m2, m1, [r5 + 12 * 16]
9612 pmaddubsw m2, m1, [r5 + 30 * 16]
9617 pmaddubsw m2, m1, [r5 + 28 * 16]
9622 pmaddubsw m2, m1, [r5 + 26 * 16]
9627 pmaddubsw m2, m1, [r5 + 24 * 16]
; bytes 0..3 of m3 are written to r0 + 384 at a 4-byte pitch (one byte per 4-byte row)
9658 pextrb [r0 + 384], m3, 0
9659 pextrb [r0 + 388], m3, 1
9660 pextrb [r0 + 392], m3, 2
9661 pextrb [r0 + 396], m3, 3
9668 pmaddubsw m2, m1, [r5 + 2 * 16]
9673 pmaddubsw m2, m1, [r5 + 4 * 16]
9678 pmaddubsw m2, m1, [r5 + 6 * 16]
9683 pmaddubsw m2, m1, [r5 + 8 * 16]
9690 pmaddubsw m2, m1, [r5 + 5 * 16]
9695 pmaddubsw m2, m1, [r5 + 10 * 16]
9700 pmaddubsw m2, m1, [r5 + 15 * 16]
9705 pmaddubsw m2, m1, m7
9712 pmaddubsw m2, m1, [r5 + 9 * 16]
9717 pmaddubsw m2, m1, [r5 + 18 * 16]
9722 pmaddubsw m2, m1, [r5 + 27 * 16]
9729 pmaddubsw m3, m2, [r5 + 4 * 16]
9736 pmaddubsw m3, m1, [r5 + 13 * 16]
9741 pmaddubsw m6, m1, [r5 + 26 * 16]
9746 pmaddubsw m3, m2, [r5 + 7 * 16]
9751 pmaddubsw m5, m2, m7
9758 pmaddubsw m3, m1, [r5 + 17 * 16]
9763 pmaddubsw m3, m2, [r5 + 2 * 16]
9768 pmaddubsw m3, m2, [r5 + 19 * 16]
9775 pmaddubsw m4, m3, [r5 + 4 * 16]
9782 pmaddubsw m4, m1, [r5 + 21 * 16]
9787 pmaddubsw m4, m2, [r5 + 10 * 16]
9792 pmaddubsw m4, m2, [r5 + 31 * 16]
9797 pmaddubsw m4, m3, m7
9808 pmaddubsw m4, m3, [r5 + 14 * 16]
9815 pmaddubsw m3, [r5 + 8 * 16]
9836 ;-----------------------------------------------------------------------------
9837 ; void all_angs_pred_8x8(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
9838 ;-----------------------------------------------------------------------------
9840 cglobal all_angs_pred_8x8, 6, 6, 8, dest, above0, left0, above1, left1, bLuma
9847 punpcklqdq m2, m0, m1
9875 punpcklbw m3, m0, m1
9876 pmaddubsw m4, m3, [r5 + 26 * 16]
9880 pmaddubsw m5, m1, [r5 + 20 * 16]
9893 movhps [r0 + 280], m4
9897 pmaddubsw m4, m3, [r5 + 21 * 16]
9900 pmaddubsw m5, m1, [r5 + 10 * 16]
9908 pmaddubsw m4, m3, [r5 + 17 * 16]
9911 pmaddubsw m5, m1, [r5 + 2 * 16]
9919 pmaddubsw m4, m3, [r5 + 13 * 16]
9929 pmaddubsw m4, m3, [r5 + 9 * 16]
9932 pmaddubsw m5, m3, [r5 + 18 * 16]
9940 pmaddubsw m4, m3, [r5 + 5 * 16]
9943 pmaddubsw m5, m3, [r5 + 10 * 16]
9951 pmaddubsw m4, m3, [r5 + 15 * 16]
9954 pmaddubsw m5, m3, [r5 + 20 * 16]
9962 pmaddubsw m4, m3, [r5 + 25 * 16]
9965 pmaddubsw m5, m3, [r5 + 30 * 16]
9973 pmaddubsw m4, m1, [r5 + 3 * 16]
9976 pmaddubsw m5, m1, [r5 + 8 * 16]
9984 pmaddubsw m4, m3, [r5 + 2 * 16]
9987 pmaddubsw m5, m3, [r5 + 4 * 16]
9995 pmaddubsw m4, m3, [r5 + 6 * 16]
9998 pmaddubsw m5, m3, [r5 + 8 * 16]
10002 movu [r0 + 464], m4
10004 ; mode 9 [row 4, 5]
10006 pmaddubsw m4, m3, [r5 + 10 * 16]
10009 pmaddubsw m5, m3, [r5 + 12 * 16]
10013 movu [r0 + 480], m4
10015 ; mode 9 [row 6, 7]
10017 pmaddubsw m4, m3, [r5 + 14 * 16]
10020 pmaddubsw m5, m3, [r5 + 16 * 16]
10024 movu [r0 + 496], m4
10026 ; mode 7 [row 2, 3]
10028 pmaddubsw m4, m3, [r5 + 27 * 16]
10031 pmaddubsw m5, m1, [r5 + 4 * 16]
10035 movu [r0 + 336], m4
10037 ; mode 7 [row 4, 5]
10039 pmaddubsw m4, m1, [r5 + 13 * 16]
10042 pmaddubsw m5, m1, [r5 + 22 * 16]
10046 movu [r0 + 352], m4
10050 pmaddubsw m4, m1, [r5 + 7 * 16]
10056 movh [r0 + 272], m4
10058 ; mode 3 [row 2, 3]
10064 pmaddubsw m5, m2, [r5 + 14 * 16]
10068 pmaddubsw m6, m1, [r5 + 8 * 16]
10076 movhps [r0 + 312], m5
10080 movh [r0 + 296], m5
10082 ; mode 4 [calculate and store row 4, 5]
10084 pmaddubsw m4, m1, [r5 + 9 * 16]
10087 pmaddubsw m5, m1, [r5 + 30 * 16]
10091 movu [r0 + 160], m4
10093 ; mode 5 [row 4, 5]
10095 pmaddubsw m4, m2, [r5 + 21 * 16]
10098 pmaddubsw m5, m1, [r5 + 6 * 16]
10102 movu [r0 + 224], m4
10104 ; mode 6 [row 4, 5]
10106 pmaddubsw m5, m2, [r5 + 1 * 16]
10112 movh [r0 + 288], m5
10114 ; mode 6 [row 6, 7]
10116 pmaddubsw m5, m2, [r5 + 27 * 16]
10122 movh [r0 + 304], m5
10124 ; mode 5 [calculate row 6]
10126 pmaddubsw m6, m1, [r5 + 23 * 16]
10129 ; mode 3 [row 4, 5]
10134 pmaddubsw m4, m3, [r5 + 2 * 16]
10137 pmaddubsw m5, m3, [r5 + 28 * 16]
10143 ; mode 4 [calculate row 7]
10145 pmaddubsw m5, m3, [r5 + 19 * 16]
10148 ; mode 5 [calculate row 6]
10150 pmaddubsw m4, m3, [r5 + 8 * 16]
10154 movu [r0 + 240], m6
10156 ; mode 3 [row 6, 7]
10162 pmaddubsw m4, m1, [r5 + 22 * 16]
10166 pmaddubsw m2, [r5 + 16 * 16]
10170 movu [r0 + 112], m4
10172 ; mode 4 [calculate row 7]
10174 pmaddubsw m2, m1, [r5 + 8 * 16]
10177 ; mode 4 [store row 6 and 7]
10180 movu [r0 + 176], m5
10182 ; mode 4 [row 2, 3]
10189 pmaddubsw m4, m1, [r5 + 31 * 16]
10193 pmaddubsw m5, m2, [r5 + 20 * 16]
10197 movu [r0 + 144], m4
10199 ; mode 5 [row 2, 3]
10201 pmaddubsw m4, m1, [r5 + 19 * 16]
10204 pmaddubsw m5, m2, [r5 + 4 * 16]
10208 movu [r0 + 208], m4
10210 ; mode 7 [row 6, 7]
10212 pmaddubsw m4, m1, [r5 + 31 * 16]
10215 pmaddubsw m5, m2, [r5 + 8 * 16]
10219 movu [r0 + 368], m4
10223 pshufb m1, m0, [tab_Si]
10224 movu [r0 + 512], m1
10225 movu [r0 + 528], m1
10226 movu [r0 + 544], m1
10227 movu [r0 + 560], m1
10256 pextrb [r0 + 512], m4, 0
10257 pextrb [r0 + 520], m4, 1
10258 pextrb [r0 + 528], m4, 2
10259 pextrb [r0 + 536], m4, 3
10260 pextrb [r0 + 544], m4, 4
10261 pextrb [r0 + 552], m4, 5
10262 pextrb [r0 + 560], m4, 6
10263 pextrb [r0 + 568], m4, 7
10265 ; mode 11 [row 0, 1]
10269 punpcklbw m2, m0, m1
10271 pmaddubsw m3, m2, [r5 + 30 * 16]
10274 pmaddubsw m4, m2, [r5 + 28 * 16]
10278 movu [r0 + 576], m3
10280 ; mode 11 [row 2, 3]
10282 pmaddubsw m3, m2, [r5 + 26 * 16]
10285 pmaddubsw m4, m2, [r5 + 24 * 16]
10289 movu [r0 + 592], m3
10291 ; mode 11 [row 4, 5]
10293 pmaddubsw m3, m2, [r5 + 22 * 16]
10296 pmaddubsw m4, m2, [r5 + 20 * 16]
10299 packuswb m5, m3, m4
10300 movu [r0 + 608], m5
10302 ; mode 12 [row 0, 1]
10304 pmaddubsw m4, m2, [r5 + 27 * 16]
10308 movu [r0 + 640], m4
10310 ; mode 11 [row 6, 7]
10312 pmaddubsw m3, m2, [r5 + 18 * 16]
10315 pmaddubsw m4, m2, [r5 + 16 * 16]
10319 movu [r0 + 624], m3
10321 ; mode 12 [row 2, 3]
10323 pmaddubsw m3, m2, [r5 + 17 * 16]
10326 pmaddubsw m4, m2, [r5 + 12 * 16]
10330 movu [r0 + 656], m3
10332 ; mode 12 [row 4, 5]
10334 pmaddubsw m3, m2, [r5 + 7 * 16]
10337 pmaddubsw m4, m2, [r5 + 2 * 16]
10341 movu [r0 + 672], m3
10343 ; mode 12 [row 6, 7]
10346 pinsrb m3, [r1 + 0], 1
10347 pinsrb m3, [r1 + 6], 0
10349 pmaddubsw m4, m3, [r5 + 29 * 16]
10352 pmaddubsw m5, m3, [r5 + 24 * 16]
10356 movu [r0 + 688], m4
10358 ; mode 13 [row 0, 1]
10360 pmaddubsw m4, m2, [r5 + 23 * 16]
10363 pmaddubsw m5, m2, [r5 + 14 * 16]
10367 movu [r0 + 704], m4
10369 ; mode 13 [row 2, 3]
10371 pmaddubsw m4, m2, [r5 + 5 * 16]
10374 pinsrb m3, [r1 + 4], 0
10375 pmaddubsw m5, m3, [r5 + 28 * 16]
10379 movu [r0 + 720], m4
10381 ; mode 13 [row 4, 5]
10383 pmaddubsw m4, m3, [r5 + 19 * 16]
10386 pmaddubsw m5, m3, [r5 + 10 * 16]
10390 movu [r0 + 736], m4
10392 ; mode 13 [row 6, 7]
10394 pmaddubsw m4, m3, [r5 + 1 * 16]
10398 pinsrb m5, [r1 + 4], 1
10399 pinsrb m5, [r1 + 7], 0
10401 pmaddubsw m5, [r5 + 24 * 16]
10405 movu [r0 + 752], m4
10407 ; mode 14 [row 0, 1]
10409 pmaddubsw m4, m2, [r5 + 19 * 16]
10412 pmaddubsw m5, m2, [r5 + 6 * 16]
10416 movu [r0 + 768], m4
10418 ; mode 14 [row 2, 3]
10420 pinsrb m3, [r1 + 2], 0
10422 pmaddubsw m4, m3, [r5 + 25 * 16]
10425 pmaddubsw m5, m3, [r5 + 12 * 16]
10429 movu [r0 + 784], m4
10431 ; mode 14 [row 4, 5]
10434 pinsrb m1, [r1 + 2], 1
10435 pinsrb m1, [r1 + 5], 0
10437 pmaddubsw m4, m1, [r5 + 31 * 16]
10440 pmaddubsw m5, m1, [r5 + 18 * 16]
10444 movu [r0 + 800], m4
10446 ; mode 14 [row 6, 7]
10448 pmaddubsw m4, m1, [r5 + 5 * 16]
10452 pinsrb m1, [r1 + 5], 1
10453 pinsrb m1, [r1 + 7], 0
10455 pmaddubsw m5, m1, [r5 + 24 * 16]
10459 movu [r0 + 816], m4
10461 ; mode 15 [row 0, 1]
10463 pmaddubsw m4, m2, [r5 + 15 * 16]
10466 pmaddubsw m5, m3, [r5 + 30 * 16]
10470 movu [r0 + 832], m4
10472 ; mode 15 [row 2, 3]
10474 pmaddubsw m4, m3, [r5 + 13 * 16]
10478 pinsrb m1, [r1 + 2], 1
10479 pinsrb m1, [r1 + 4], 0
10481 pmaddubsw m5, m1, [r5 + 28 * 16]
10485 movu [r0 + 848], m4
10487 ; mode 15 [row 4, 5]
10489 pmaddubsw m4, m1, [r5 + 11 * 16]
10493 pinsrb m1, [r1 + 4], 1
10494 pinsrb m1, [r1 + 6], 0
10496 pmaddubsw m5, m1, [r5 + 26 * 16]
10500 movu [r0 + 864], m4
10502 ; mode 15 [row 6, 7]
10504 pmaddubsw m4, m1, [r5 + 9 * 16]
10508 pinsrb m1, [r1 + 6], 1
10509 pinsrb m1, [r1 + 8], 0
10511 pmaddubsw m1, [r5 + 24 * 16]
10515 movu [r0 + 880], m4
10517 ; mode 16 [row 0, 1]
10519 pmaddubsw m4, m2, [r5 + 11 * 16]
10522 pmaddubsw m5, m3, [r5 + 22 * 16]
10526 movu [r0 + 896], m4
10528 ; mode 16 [row 2, 3]
10530 pmaddubsw m4, m3, [r5 + 1 * 16]
10534 pinsrb m3, [r1 + 2], 1
10535 pinsrb m3, [r1 + 3], 0
10537 pmaddubsw m5, m3, [r5 + 12 * 16]
10541 movu [r0 + 912], m4
10543 ; mode 16 [row 4, 5]
10546 pinsrb m3, [r1 + 3], 1
10547 pinsrb m3, [r1 + 5], 0
10549 pmaddubsw m4, m3, [r5 + 23 * 16]
10552 pmaddubsw m5, m3, [r5 + 2 * 16]
10556 movu [r0 + 928], m4
10558 ; mode 16 [row 6, 7]
10561 pinsrb m3, [r1 + 5], 1
10562 pinsrb m3, [r1 + 6], 0
10564 pmaddubsw m4, m3, [r5 + 13 * 16]
10568 pinsrb m3, [r1 + 6], 1
10569 pinsrb m3, [r1 + 8], 0
10571 pmaddubsw m3, [r5 + 24 * 16]
10575 movu [r0 + 944], m4
10577 ; mode 17 [row 0, 1]
10579 pmaddubsw m4, m2, [r5 + 6 * 16]
10583 pinsrb m2, [r1 + 0], 1
10584 pinsrb m2, [r1 + 1], 0
10586 pmaddubsw m3, m2, [r5 + 12 * 16]
10590 movu [r0 + 960], m4
10592 ; mode 17 [row 2, 3]
10595 pinsrb m2, [r1 + 1], 1
10596 pinsrb m2, [r1 + 2], 0
10598 pmaddubsw m4, m2, [r5 + 18 * 16]
10602 pinsrb m2, [r1 + 2], 1
10603 pinsrb m2, [r1 + 4], 0
10605 pmaddubsw m3, m2, [r5 + 24 * 16]
10609 movu [r0 + 976], m4
10611 ; mode 17 [row 4, 5]
10614 pinsrb m2, [r1 + 4], 1
10615 pinsrb m2, [r1 + 5], 0
10617 pmaddubsw m4, m2, [r5 + 30 * 16]
10620 pmaddubsw m3, m2, [r5 + 4 * 16]
10624 movu [r0 + 992], m4
10626 ; mode 17 [row 6, 7]
10629 pinsrb m2, [r1 + 5], 1
10630 pinsrb m2, [r1 + 6], 0
10632 pmaddubsw m4, m2, [r5 + 10 * 16]
10636 pinsrb m2, [r1 + 6], 1
10637 pinsrb m2, [r1 + 7], 0
10639 pmaddubsw m3, m2, [r5 + 16 * 16]
10643 movu [r0 + 1008], m4
10645 ; mode 18 [row 0, 1, 2, 3, 4, 5, 6, 7]
10648 movh [r0 + 1024], m1
10651 pinsrb m2, [r4 + 1], 0
10652 movh [r0 + 1032], m2
10655 pinsrb m2, [r4 + 2], 0
10656 movh [r0 + 1040], m2
10659 pinsrb m2, [r4 + 3], 0
10660 movh [r0 + 1048], m2
10663 pinsrb m2, [r4 + 4], 0
10664 movh [r0 + 1056], m2
10667 pinsrb m2, [r4 + 5], 0
10668 movh [r0 + 1064], m2
10671 pinsrb m2, [r4 + 6], 0
10672 movh [r0 + 1072], m2
10675 pinsrb m2, [r4 + 7], 0
10676 movh [r0 + 1080], m2
10678 ; mode 19 [row 0, 1]
10684 pmaddubsw m1, m0, [r5 + 6 * 16]
10688 pinsrb m2, [r2 + 0], 1
10689 pinsrb m2, [r2 + 1], 0
10691 pmaddubsw m3, m2, [r5 + 12 * 16]
10695 movu [r0 + 1088], m1
10697 ; mode 19 [row 2, 3]
10700 pinsrb m2, [r2 + 1], 1
10701 pinsrb m2, [r2 + 2], 0
10703 pmaddubsw m4, m2, [r5 + 18 * 16]
10707 pinsrb m2, [r2 + 2], 1
10708 pinsrb m2, [r2 + 4], 0
10710 pmaddubsw m5, m2, [r5 + 24 * 16]
10714 movu [r0 + 1104], m4
10716 ; mode 19 [row 4, 5]
10719 pinsrb m2, [r2 + 4], 1
10720 pinsrb m2, [r2 + 5], 0
10722 pmaddubsw m4, m2, [r5 + 30 * 16]
10725 pmaddubsw m5, m2, [r5 + 4 * 16]
10729 movu [r0 + 1120], m4
10731 ; mode 19 [row 6, 7]
10734 pinsrb m2, [r2 + 5], 1
10735 pinsrb m2, [r2 + 6], 0
10737 pmaddubsw m4, m2, [r5 + 10 * 16]
10741 pinsrb m2, [r2 + 6], 1
10742 pinsrb m2, [r2 + 7], 0
10744 pmaddubsw m2, [r5 + 16 * 16]
10748 movu [r0 + 1136], m4
10750 ; mode 20 [row 0, 1]
10752 pmaddubsw m3, m0, [r5 + 11 * 16]
10756 pinsrb m1, [r2 + 0], 1
10757 pinsrb m1, [r2 + 2], 0
10759 pmaddubsw m4, m1, [r5 + 22 * 16]
10763 movu [r0 + 1152], m3
10765 ; mode 20 [row 2, 3]
10767 pmaddubsw m3, m1, [r5 + 1 * 16]
10771 pinsrb m2, [r2 + 2], 1
10772 pinsrb m2, [r2 + 3], 0
10774 pmaddubsw m4, m2, [r5 + 12 * 16]
10778 movu [r0 + 1168], m3
10780 ; mode 20 [row 4, 5]
10783 pinsrb m2, [r2 + 3], 1
10784 pinsrb m2, [r2 + 5], 0
10786 pmaddubsw m3, m2, [r5 + 23 * 16]
10789 pmaddubsw m4, m2, [r5 + 2 * 16]
10793 movu [r0 + 1184], m3
10795 ; mode 20 [row 6, 7]
10798 pinsrb m2, [r2 + 5], 1
10799 pinsrb m2, [r2 + 6], 0
10801 pmaddubsw m3, m2, [r5 + 13 * 16]
10805 pinsrb m2, [r2 + 6], 1
10806 pinsrb m2, [r2 + 8], 0
10808 pmaddubsw m4, m2, [r5 + 24 * 16]
10812 movu [r0 + 1200], m3
10814 ; mode 21 [row 0, 1]
10816 pmaddubsw m2, m0, [r5 + 15 * 16]
10819 pmaddubsw m3, m1, [r5 + 30 * 16]
10823 movu [r0 + 1216], m2
10825 ; mode 21 [row 2, 3]
10827 pmaddubsw m2, m1, [r5 + 13 * 16]
10831 pinsrb m3, [r2 + 2], 1
10832 pinsrb m3, [r2 + 4], 0
10834 pmaddubsw m4, m3, [r5 + 28 * 16]
10838 movu [r0 + 1232], m2
10840 ; mode 21 [row 4, 5]
10842 pmaddubsw m2, m3, [r5 + 11 * 16]
10846 pinsrb m3, [r2 + 4], 1
10847 pinsrb m3, [r2 + 6], 0
10849 pmaddubsw m4, m3, [r5 + 26 * 16]
10853 movu [r0 + 1248], m2
10855 ; mode 21 [row 6, 7]
10857 pmaddubsw m2, m3, [r5 + 9 * 16]
10861 pinsrb m3, [r2 + 6], 1
10862 pinsrb m3, [r2 + 8], 0
10864 pmaddubsw m4, m3, [r5 + 24 * 16]
10868 movu [r0 + 1264], m2
10870 ; mode 22 [row 0, 1]
10872 pmaddubsw m2, m0, [r5 + 19 * 16]
10875 pmaddubsw m4, m0, [r5 + 6 * 16]
10879 movu [r0 + 1280], m2
10881 ; mode 22 [row 2, 3]
10883 pmaddubsw m2, m1, [r5 + 25 * 16]
10886 pmaddubsw m3, m1, [r5 + 12 * 16]
10890 movu [r0 + 1296], m2
10892 ; mode 22 [row 4, 5]
10895 pinsrb m1, [r2 + 5], 0
10896 pinsrb m1, [r2 + 2], 1
10898 pmaddubsw m2, m1, [r5 + 31 * 16]
10901 pmaddubsw m3, m1, [r5 + 18 * 16]
10905 movu [r0 + 1312], m2
10907 ; mode 22 [row 6, 7]
10909 pmaddubsw m2, m1, [r5 + 5 * 16]
10913 pinsrb m1, [r2 + 5], 1
10914 pinsrb m1, [r2 + 7], 0
10916 pmaddubsw m1, [r5 + 24 * 16]
10920 movu [r0 + 1328], m2
10922 ; mode 23 [row 0, 1]
10924 pmaddubsw m2, m0, [r5 + 23 * 16]
10927 pmaddubsw m3, m0, [r5 + 14 * 16]
10931 movu [r0 + 1344], m2
10933 ; mode 23 [row 2, 3]
10935 pmaddubsw m2, m0, [r5 + 5 * 16]
10939 pinsrb m1, [r2 + 0], 1
10940 pinsrb m1, [r2 + 4], 0
10942 pmaddubsw m3, m1, [r5 + 28 * 16]
10946 movu [r0 + 1360], m2
10948 ; mode 23 [row 4, 5]
10950 pmaddubsw m2, m1, [r5 + 19 * 16]
10953 pmaddubsw m3, m1, [r5 + 10 * 16]
10957 movu [r0 + 1376], m2
10959 ; mode 23 [row 6, 7]
10961 pmaddubsw m2, m1, [r5 + 1 * 16]
10965 pinsrb m3, [r2 + 4], 1
10966 pinsrb m3, [r2 + 7], 0
10968 pmaddubsw m3, [r5 + 24 * 16]
10972 movu [r0 + 1392], m2
10974 ; mode 24 [row 0, 1]
10976 pmaddubsw m2, m0, [r5 + 27 * 16]
10979 pmaddubsw m5, m0, [r5 + 22 * 16]
10983 movu [r0 + 1408], m2
10985 ; mode 24 [row 2, 3]
10987 pmaddubsw m2, m0, [r5 + 17 * 16]
10990 pmaddubsw m3, m0, [r5 + 12 * 16]
10994 movu [r0 + 1424], m2
10996 ; mode 24 [row 4, 5]
10998 pmaddubsw m2, m0, [r5 + 7 * 16]
11001 pmaddubsw m3, m0, [r5 + 2 * 16]
11005 movu [r0 + 1440], m2
11007 ; mode 24 [row 6, 7]
11009 pinsrb m1, [r2 + 6], 0
11011 pmaddubsw m2, m1, [r5 + 29 * 16]
11014 pmaddubsw m1, [r5 + 24 * 16]
11018 movu [r0 + 1456], m2
11020 ; mode 25 [row 0, 1]
11022 pmaddubsw m2, m0, [r5 + 30 * 16]
11025 pmaddubsw m1, m0, [r5 + 28 * 16]
11029 movu [r0 + 1472], m2
11031 ; mode 25 [row 2, 3]
11033 pmaddubsw m2, m0, [r5 + 26 * 16]
11036 pmaddubsw m1, m0, [r5 + 24 * 16]
11040 movu [r0 + 1488], m2
11042 ; mode 25 [row 4, 5]
11044 pmaddubsw m1, m0, [r5 + 20 * 16]
11048 movu [r0 + 1504], m5
11050 ; mode 25 [row 6, 7]
11052 pmaddubsw m2, m0, [r5 + 18 * 16]
11055 pmaddubsw m1, m0, [r5 + 16 * 16]
11059 movu [r0 + 1520], m2
11065 pshufb m1, m0, [tab_Si]
11066 movu [r0 + 1536], m1
11067 movu [r0 + 1552], m1
11068 movu [r0 + 1568], m1
11069 movu [r0 + 1584], m1
11098 pextrb [r0 + 1536], m4, 0
11099 pextrb [r0 + 1544], m4, 1
11100 pextrb [r0 + 1552], m4, 2
11101 pextrb [r0 + 1560], m4, 3
11102 pextrb [r0 + 1568], m4, 4
11103 pextrb [r0 + 1576], m4, 5
11104 pextrb [r0 + 1584], m4, 6
11105 pextrb [r0 + 1592], m4, 7
11107 ; mode 27 [row 0, 1]
11110 punpcklbw m4, m0, m6
11112 pmaddubsw m1, m4, [r5 + 2 * 16]
11115 pmaddubsw m2, m4, [r5 + 4 * 16]
11119 movu [r0 + 1600], m1
11121 ; mode 27 [row 2, 3]
11123 pmaddubsw m1, m4, [r5 + 6 * 16]
11126 pmaddubsw m2, m4, [r5 + 8 * 16]
11130 movu [r0 + 1616], m1
11132 ; mode 27 [row 4, 5]
11134 pmaddubsw m3, m4, [r5 + 10 * 16]
11137 pmaddubsw m2, m4, [r5 + 12 * 16]
11140 packuswb m1, m3, m2
11141 movu [r0 + 1632], m1
11143 ; mode 27 [row 6, 7]
11145 pmaddubsw m1, m4, [r5 + 14 * 16]
11148 pmaddubsw m2, m4, [r5 + 16 * 16]
11152 movu [r0 + 1648], m1
11154 ; mode 28 [row 0, 1]
11156 pmaddubsw m1, m4, [r5 + 5 * 16]
11160 movu [r0 + 1664], m1
11162 ; mode 28 [row 2, 3]
11164 pmaddubsw m1, m4, [r5 + 15 * 16]
11167 pmaddubsw m2, m4, [r5 + 20 * 16]
11171 movu [r0 + 1680], m1
11173 ; mode 28 [row 4, 5]
11175 pmaddubsw m1, m4, [r5 + 25 * 16]
11178 pmaddubsw m2, m4, [r5 + 30 * 16]
11182 movu [r0 + 1696], m1
11184 ; mode 28 [row 6, 7]
11187 punpcklbw m5, m6, m1
11189 pmaddubsw m2, m5, [r5 + 3 * 16]
11192 pmaddubsw m3, m5, [r5 + 8 * 16]
11196 movu [r0 + 1712], m2
11198 ; mode 29 [row 0, 1]
11200 pmaddubsw m2, m4, [r5 + 9 * 16]
11203 pmaddubsw m3, m4, [r5 + 18 * 16]
11207 movu [r0 + 1728], m2
11209 ; mode 29 [row 2, 3]
11211 pmaddubsw m2, m4, [r5 + 27 * 16]
11214 pmaddubsw m3, m5, [r5 + 4 * 16]
11218 movu [r0 + 1744], m2
11220 ; mode 29 [row 4, 5]
11222 pmaddubsw m2, m5, [r5 + 13 * 16]
11225 pmaddubsw m3, m5, [r5 + 22 * 16]
11229 movu [r0 + 1760], m2
11231 ; mode 29 [row 6, 7]
11233 pmaddubsw m2, m5, [r5 + 31 * 16]
11239 pmaddubsw m3, m1, [r5 + 8 * 16]
11243 movu [r0 + 1776], m2
11247 movh [r0 + 1936], m2
11249 ; mode 30 [row 0, 1]
11251 pmaddubsw m2, m4, [r5 + 13 * 16]
11254 pmaddubsw m3, m4, [r5 + 26 * 16]
11258 movu [r0 + 1792], m2
11260 ; mode 30 [row 2, 3]
11262 pmaddubsw m2, m5, [r5 + 7 * 16]
11265 pmaddubsw m3, m5, [r5 + 20 * 16]
11269 movu [r0 + 1808], m2
11273 movhps [r0 + 1992], m2
11275 ; mode 30 [row 4, 5]
11277 pmaddubsw m2, m1, [r5 + 1 * 16]
11280 pmaddubsw m3, m1, [r5 + 14 * 16]
11284 movu [r0 + 1824], m2
11288 movhps [r0 + 2000], m2
11290 ; mode 30 [row 6, 7]
11292 pmaddubsw m2, m1, [r5 + 27 * 16]
11298 pmaddubsw m3, m6, [r5 + 8 * 16]
11302 movu [r0 + 1840], m2
11306 movhps [r0 + 2008], m2
11308 ; mode 31 [row 0, 1]
11310 pmaddubsw m2, m4, [r5 + 17 * 16]
11313 pmaddubsw m3, m5, [r5 + 2 * 16]
11317 movu [r0 + 1856], m2
11319 ; mode 31 [row 2, 3]
11321 pmaddubsw m2, m5, [r5 + 19 * 16]
11324 pmaddubsw m3, m1, [r5 + 4 * 16]
11328 movu [r0 + 1872], m2
11330 ; mode 31 [row 4, 5]
11332 pmaddubsw m2, m1, [r5 + 21 * 16]
11335 pmaddubsw m3, m6, [r5 + 6 * 16]
11339 movu [r0 + 1888], m2
11341 ; mode 31 [row 6, 7]
11343 pmaddubsw m2, m6, [r5 + 23 * 16]
11349 pmaddubsw m3, m0, [r5 + 8 * 16]
11353 movu [r0 + 1904], m2
11355 ; mode 32 [row 0, 1]
11357 pmaddubsw m2, m4, [r5 + 21 * 16]
11360 pmaddubsw m3, m5, [r5 + 10 * 16]
11364 movu [r0 + 1920], m2
11368 pmaddubsw m2, m1, [r5 + 20 * 16]
11374 movh [r0 + 1944], m2
11376 ; mode 32 [row 4, 5]
11378 pmaddubsw m2, m6, [r5 + 9 * 16]
11381 pmaddubsw m3, m6, [r5 + 30 * 16]
11385 movu [r0 + 1952], m2
11387 ; mode 33 [row 4, 5]
11389 pmaddubsw m2, m0, [r5 + 2 * 16]
11392 pmaddubsw m3, m0, [r5 + 28 * 16]
11396 movu [r0 + 2016], m2
11400 pmaddubsw m2, m0, [r5 + 19 * 16]
11409 pmaddubsw m3, m0, [r5 + 8 * 16]
11413 movu [r0 + 1968], m2
11415 ; mode 33 [row 6, 7]
11417 pmaddubsw m2, m0, [r5 + 22 * 16]
11424 pmaddubsw m3, m0, [r5 + 16 * 16]
11428 movu [r0 + 2032], m2
11432 pmaddubsw m2, m4, [r5 + 26 * 16]
11438 movh [r0 + 1984], m2
11440 ; mode 34 [row 0, 1, 2, 3, 4, 5, 6, 7]
11444 punpcklqdq m2, m0, m1
11445 movu [r0 + 2048], m2
11450 movu [r0 + 2064], m1
11455 movu [r0 + 2080], m1
11460 movu [r0 + 2096], m1
11464 ;-----------------------------------------------------------------------------
11465 ; void all_angs_pred_16x16(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
11466 ;-----------------------------------------------------------------------------
11468 cglobal all_angs_pred_16x16, 6, 6, 8, dest, above0, left0, above1, left1, bLuma
11471 movu [r0 + 0 * 16], m0
11476 palignr m5, m6, m0, 1
11477 movu [r0 + 1 * 16], m5
11481 palignr m5, m6, m0, 2
11482 movu [r0 + 2 * 16], m5
11483 palignr m5, m6, m0, 3
11484 movu [r0 + 3 * 16], m5
11485 palignr m5, m6, m0, 4
11486 movu [r0 + 4 * 16], m5
11487 palignr m5, m6, m0, 5
11488 movu [r0 + 5 * 16], m5
11489 palignr m5, m6, m0, 6
11490 movu [r0 + 6 * 16], m5
11491 palignr m5, m6, m0, 7
11492 movu [r0 + 7 * 16], m5
11496 palignr m5, m6, m0, 8
11497 movu [r0 + 8 * 16], m5
11501 palignr m5, m6, m0, 9
11502 movu [r0 + 9 * 16], m5
11504 palignr m3, m6, m0, 10
11505 movu [r0 + 10 * 16], m3
11506 palignr m3, m6, m0, 11
11507 movu [r0 + 11 * 16], m3
11508 palignr m3, m6, m0, 12
11509 movu [r0 + 12 * 16], m3
11512 movu [r0 + (3-2)*16*16 + 15 * 16], m3
11514 palignr m3, m6, m0, 13
11515 movu [r0 + 13 * 16], m3
11516 palignr m3, m6, m0, 14
11517 movu [r0 + 14 * 16], m3
11518 palignr m3, m6, m0, 15
11519 movu [r0 + 15 * 16], m3
11522 lea r5, [ang_table]
11527 ; mode 17 [row 8 - second half]
11528 pmaddubsw m1, m0, [r5 + 22 * 16]
11531 movh [r0 + 248 * 16 + 8], m1
11532 ; mode 17 [row 8 - second half] end
11534 pmaddubsw m1, m0, [r5 + 26 * 16]
11537 pmaddubsw m2, m7, [r5 + 26 * 16]
11540 movu [r0 + 16 * 16], m1
11543 movu [r0 + 65 * 16], m1
11546 pmaddubsw m1, m0, [r5 + 21 * 16]
11548 pmaddubsw m2, m7, [r5 + 21 * 16]
11551 movu [r0 + 32 * 16], m1
11554 pmaddubsw m1, m0, [r5 + 17 * 16]
11556 pmaddubsw m2, m7, [r5 + 17 * 16]
11559 movu [r0 + 48 * 16], m1
11562 pmaddubsw m1, m0, [r5 + 13 * 16]
11564 pmaddubsw m2, m7, [r5 + 13 * 16]
11567 movu [r0 + 64 * 16], m1
11570 pmaddubsw m1, m0, [r5 + 9 * 16]
11572 pmaddubsw m2, m7, [r5 + 9 * 16]
11575 movu [r0 + 80 * 16], m1
11578 pmaddubsw m1, m0, [r5 + 18 * 16]
11580 pmaddubsw m2, m7, [r5 + 18 * 16]
11583 movu [r0 + 81 * 16], m1
11586 pmaddubsw m1, m0, [r5 + 27 * 16]
11588 pmaddubsw m2, m7, [r5 + 27 * 16]
11591 movu [r0 + 82 * 16], m1
11594 pmaddubsw m1, m0, [r5 + 5 * 16]
11596 pmaddubsw m2, m7, [r5 + 5 * 16]
11599 movu [r0 + 96 * 16], m1
11602 pmaddubsw m1, m0, [r5 + 10 * 16]
11604 pmaddubsw m2, m7, [r5 + 10 * 16]
11607 movu [r0 + 97 * 16], m1
11610 pmaddubsw m1, m0, [r5 + 15 * 16]
11612 pmaddubsw m2, m7, [r5 + 15 * 16]
11615 movu [r0 + 98 * 16], m1
11618 pmaddubsw m1, m0, [r5 + 20 * 16]
11620 pmaddubsw m2, m7, [r5 + 20 * 16]
11623 movu [r0 + 99 * 16], m1
11626 pmaddubsw m1, m0, [r5 + 25 * 16]
11628 pmaddubsw m2, m7, [r5 + 25 * 16]
11631 movu [r0 + 100 * 16], m1
11634 pmaddubsw m1, m0, [r5 + 30 * 16]
11636 pmaddubsw m2, m7, [r5 + 30 * 16]
11639 movu [r0 + 101 * 16], m1
11641 ; mode 15 [row 13 - second half]
11642 pmaddubsw m1, m0, [r5 + 18 * 16]
11645 movh [r0 + 221 * 16 + 8], m1
11646 ; mode 15 [row 13 - second half] end
11648 ; mode 15 [row 14 - second half]
11649 pmaddubsw m1, m0, [r5 + 1 * 16]
11652 movh [r0 + 222 * 16 + 8], m1
11653 ; mode 15 [row 14 - second half] end
11655 ; mode 16 [row 10 - second half]
11656 pmaddubsw m1, m0, [r5 + 25 * 16]
11659 movh [r0 + 234 * 16 + 8], m1
11660 ; mode 16 [row 10 - second half] end
11662 ; mode 16 [row 11 - second half]
11663 pmaddubsw m1, m0, [r5 + 4 * 16]
11666 movh [r0 + 235 * 16 + 8], m1
11667 ; mode 16 [row 11 - second half] end
11670 movu m6, [r5 + 20 * 16]
11674 ; mode 17 [row 7 - second half]
11675 pmaddubsw m1, m0, [r5 + 16 * 16]
11678 movh [r0 + 247 * 16 + 8], m1
11680 ; mode 17 [row 7 - second half] end
11681 pmaddubsw m1, m0, m6
11685 pmaddubsw m4, m2, m6
11688 movu [r0 + 17 * 16], m1
11691 movu [r0 + 67 * 16], m1
11693 ; mode 4 row [row 1]
11694 pmaddubsw m1, m0, [r5 + 10 * 16]
11696 pmaddubsw m4, m2, [r5 + 10 * 16]
11699 movu [r0 + 33 * 16], m1
11701 ; mode 4 row [row 2]
11702 pmaddubsw m1, m0, [r5 + 31 * 16]
11704 pmaddubsw m4, m2, [r5 + 31 * 16]
11707 movu [r0 + 34 * 16], m1
11710 movu [r0 + 86 * 16], m1
11712 ; mode 5 row [row 1]
11713 pmaddubsw m1, m0, [r5 + 2 * 16]
11715 pmaddubsw m4, m2, [r5 + 2 * 16]
11718 movu [r0 + 49 * 16], m1
11720 ; mode 5 row [row 2]
11721 pmaddubsw m1, m0, [r5 + 19 * 16]
11723 pmaddubsw m4, m2, [r5 + 19 * 16]
11726 movu [r0 + 50 * 16], m1
11729 pmaddubsw m1, m0, [r5 + 7 * 16]
11731 pmaddubsw m4, m2, [r5 + 7 * 16]
11734 movu [r0 + 66 * 16], m1
11737 pmaddubsw m1, m0, [r5 + 4 * 16]
11739 pmaddubsw m4, m2, [r5 + 4 * 16]
11742 movu [r0 + 83 * 16], m1
11745 pmaddubsw m1, m0, [r5 + 13 * 16]
11747 pmaddubsw m4, m2, [r5 + 13 * 16]
11750 movu [r0 + 84 * 16], m1
11753 movu [r0 + 104 * 16], m1
11756 pmaddubsw m1, m0, [r5 + 22 * 16]
11758 pmaddubsw m4, m2, [r5 + 22 * 16]
11761 movu [r0 + 85 * 16], m1
11764 pmaddubsw m1, m0, [r5 + 3 * 16]
11766 pmaddubsw m4, m2, [r5 + 3 * 16]
11769 movu [r0 + 102 * 16], m1
11772 pmaddubsw m1, m0, [r5 + 8 * 16]
11774 pmaddubsw m4, m2, [r5 + 8 * 16]
11777 movu [r0 + 103 * 16], m1
11780 pmaddubsw m1, m0, [r5 + 18 * 16]
11782 pmaddubsw m4, m2, [r5 + 18 * 16]
11785 movu [r0 + 105 * 16], m1
11788 pmaddubsw m1, m0, [r5 + 23 * 16]
11790 pmaddubsw m4, m2, [r5 + 23 * 16]
11793 movu [r0 + 106 * 16], m1
11796 pmaddubsw m1, m0, [r5 + 28 * 16]
11798 pmaddubsw m4, m2, [r5 + 28 * 16]
11801 movu [r0 + 107 * 16], m1
11809 ; mode 17 [row 6 - second half]
11810 pmaddubsw m1, m0, [r5 + 10 * 16]
11813 movh [r0 + 246 * 16 + 8], m1
11814 ; mode 17 [row 6 - second half] end
11816 pmaddubsw m1, m0, [r5 + 14 * 16]
11822 pmaddubsw m4, m2, [r5 + 14 * 16]
11825 movu [r0 + 18 * 16], m1
11828 movu [r0 + 69 * 16], m1
11830 ; mode 4 row [row 3]
11831 pmaddubsw m1, m0, [r5 + 20 * 16]
11833 pmaddubsw m4, m2, [r5 + 20 * 16]
11836 movu [r0 + 35 * 16], m1
11838 ; mode 5 row [row 3]
11839 pmaddubsw m1, m0, [r5 + 4 * 16]
11841 pmaddubsw m4, m2, [r5 + 4 * 16]
11844 movu [r0 + 51 * 16], m1
11846 ; mode 5 row [row 4]
11847 pmaddubsw m1, m0, [r5 + 21 * 16]
11849 pmaddubsw m4, m2, [r5 + 21 * 16]
11852 movu [r0 + 52 * 16], m1
11855 pmaddubsw m1, m0, [r5 + 1 * 16]
11857 pmaddubsw m4, m2, [r5 + 1 * 16]
11860 movu [r0 + 68 * 16], m1
11863 pmaddubsw m1, m0, [r5 + 27 * 16]
11865 pmaddubsw m4, m2, [r5 + 27 * 16]
11868 movu [r0 + 70 * 16], m1
11871 pmaddubsw m1, m0, [r5 + 8 * 16]
11873 pmaddubsw m4, m2, [r5 + 8 * 16]
11876 movu [r0 + 87 * 16], m1
11879 pmaddubsw m1, m0, [r5 + 17 * 16]
11881 pmaddubsw m4, m2, [r5 + 17 * 16]
11884 movu [r0 + 88 * 16], m1
11887 pmaddubsw m1, m0, [r5 + 26 * 16]
11889 pmaddubsw m4, m2, [r5 + 26 * 16]
11892 movu [r0 + 89 * 16], m1
11895 pmaddubsw m1, m0, [r5 + 1 * 16]
11897 pmaddubsw m4, m2, [r5 + 1 * 16]
11900 movu [r0 + 108 * 16], m1
11903 pmaddubsw m1, m0, [r5 + 6 * 16]
11905 pmaddubsw m4, m2, [r5 + 6 * 16]
11908 movu [r0 + 109 * 16], m1
11911 pmaddubsw m1, m0, [r5 + 11 * 16]
11913 pmaddubsw m4, m2, [r5 + 11 * 16]
11916 movu [r0 + 110 * 16], m1
11919 pmaddubsw m1, m0, [r5 + 16 * 16]
11921 pmaddubsw m4, m2, [r5 + 16 * 16]
11924 movu [r0 + 111 * 16], m1
11932 ; mode 17 [row 4 - second half]
11933 pmaddubsw m1, m0, [r5 + 30 * 16]
11936 movh [r0 + 244 * 16 + 8], m1
11937 ; mode 17 [row 4 - second half] end
11939 ; mode 17 [row 5 - second half]
11940 pmaddubsw m1, m0, [r5 + 4 * 16]
11943 movh [r0 + 245 * 16 + 8], m1
11944 ; mode 17 [row 5 - second half] end
11946 pmaddubsw m1, m0, [r5 + 8 * 16]
11952 pmaddubsw m4, m2, [r5 + 8 * 16]
11955 movu [r0 + 19 * 16], m1
11958 movu [r0 + 71 * 16], m1
11960 ; mode 4 row [row 4]
11961 pmaddubsw m1, m0, [r5 + 9 * 16]
11963 pmaddubsw m4, m2, [r5 + 9 * 16]
11966 movu [r0 + 36 * 16], m1
11968 ; mode 4 row [row 5]
11969 pmaddubsw m1, m0, [r5 + 30 * 16]
11971 pmaddubsw m4, m2, [r5 + 30 * 16]
11974 movu [r0 + 37 * 16], m1
11976 ; mode 7 row [row 13]
11977 movu [r0 + 93 * 16], m1
11979 ; mode 5 row [row 5]
11980 pmaddubsw m1, m0, [r5 + 6 * 16]
11982 pmaddubsw m4, m2, [r5 + 6 * 16]
11985 movu [r0 + 53 * 16], m1
11987 ; mode 5 row [row 6]
11988 pmaddubsw m1, m0, [r5 + 23 * 16]
11990 pmaddubsw m4, m2, [r5 + 23 * 16]
11993 movu [r0 + 54 * 16], m1
11996 pmaddubsw m1, m0, [r5 + 21 * 16]
11998 pmaddubsw m4, m2, [r5 + 21 * 16]
12001 movu [r0 + 72 * 16], m1
12004 movu [r0 + 92 * 16], m1
12007 pmaddubsw m1, m0, [r5 + 3 * 16]
12009 pmaddubsw m4, m2, [r5 + 3 * 16]
12012 movu [r0 + 90 * 16], m1
12015 pmaddubsw m1, m0, [r5 + 12 * 16]
12017 pmaddubsw m4, m2, [r5 + 12 * 16]
12020 movu [r0 + 91 * 16], m1
12028 ; mode 17 [row 3 - second half]
12029 pmaddubsw m1, m0, [r5 + 24 * 16]
12032 movh [r0 + 243 * 16 + 8], m1
12034 ; mode 17 [row 3 - second half] end
12035 pmaddubsw m1, m0, [r5 + 2 * 16]
12041 pmaddubsw m4, m2, [r5 + 2 * 16]
12044 movu [r0 + 20 * 16], m1
12047 movu [r0 + 73 * 16], m1
12049 ; mode 4 row [row 6]
12050 movu m6, [r5 + 19 * 16]
12051 pmaddubsw m1, m0, m6
12053 pmaddubsw m4, m2, m6
12056 movu [r0 + 38 * 16], m1
12059 pmaddubsw m1, m0, [r5 + 28 * 16]
12061 pmaddubsw m4, m2, [r5 + 28 * 16]
12064 movu [r0 + 21 * 16], m1
12067 movu [r0 + 75 * 16], m1
12069 ; mode 5 row [row 7]
12070 pmaddubsw m1, m0, [r5 + 8 * 16]
12072 pmaddubsw m4, m2, [r5 + 8 * 16]
12075 movu [r0 + 55 * 16], m1
12077 ; mode 5 row [row 8]
12078 pmaddubsw m1, m0, [r5 + 25 * 16]
12080 pmaddubsw m4, m2, [r5 + 25 * 16]
12083 movu [r0 + 56 * 16], m1
12086 pmaddubsw m1, m0, [r5 + 15 * 16]
12088 pmaddubsw m4, m2, [r5 + 15 * 16]
12091 movu [r0 + 74 * 16], m1
12094 pmaddubsw m1, m0, [r5 + 7 * 16]
12096 pmaddubsw m4, m2, [r5 + 7 * 16]
12099 movu [r0 + 94 * 16], m1
12102 pmaddubsw m1, m0, [r5 + 16 * 16]
12104 pmaddubsw m4, m2, [r5 + 16 * 16]
12107 movu [r0 + 95 * 16], m1
12115 ; mode 17 [row 2 - second half]
12116 pmaddubsw m1, m0, [r5 + 18 * 16]
12119 movh [r0 + 242 * 16 + 8], m1
12120 ; mode 17 [row 2 - second half] end
12122 pmaddubsw m1, m0, [r5 + 22 * 16]
12128 pmaddubsw m4, m2, [r5 + 22 * 16]
12131 movu [r0 + 22 * 16], m1
12134 movu [r0 + 77 * 16], m1
12136 ; mode 4 row [row 7]
12137 pmaddubsw m1, m0, [r5 + 8 * 16]
12139 pmaddubsw m4, m2, [r5 + 8 * 16]
12142 movu [r0 + 39 * 16], m1
12144 ; mode 4 row [row 8]
12145 pmaddubsw m1, m0, [r5 + 29 * 16]
12147 pmaddubsw m4, m2, [r5 + 29 * 16]
12150 movu [r0 + 40 * 16], m1
12152 ; mode 5 row [row 9]
12153 pmaddubsw m1, m0, [r5 + 10 * 16]
12155 pmaddubsw m4, m2, [r5 + 10 * 16]
12158 movu [r0 + 57 * 16], m1
12160 ; mode 5 row [row 10]
12161 pmaddubsw m1, m0, [r5 + 27 * 16]
12163 pmaddubsw m4, m2, [r5 + 27 * 16]
12166 movu [r0 + 58 * 16], m1
12169 pmaddubsw m1, m0, [r5 + 9 * 16]
12171 pmaddubsw m4, m2, [r5 + 9 * 16]
12174 movu [r0 + 76 * 16], m1
12182 ; mode 17 [row 1 - second half]
12183 pmaddubsw m1, m0, [r5 + 12 * 16]
12186 movh [r0 + 241 * 16 + 8], m1
12187 ; mode 17 [row 1 - second half] end
12189 pmaddubsw m1, m0, [r5 + 16 * 16]
12195 pmaddubsw m4, m2, [r5 + 16 * 16]
12198 movu [r0 + 23 * 16], m1
12201 movu [r0 + 79 * 16], m1
12203 ; mode 4 row [row 9]
12204 pmaddubsw m1, m0, [r5 + 18 * 16]
12206 pmaddubsw m4, m2, [r5 + 18 * 16]
12209 movu [r0 + 41 * 16], m1
12211 ; mode 5 row [row 11]
12212 pmaddubsw m1, m0, [r5 + 12 * 16]
12214 pmaddubsw m4, m2, [r5 + 12 * 16]
12217 movu [r0 + 59 * 16], m1
12219 ; mode 5 row [row 12]
12220 pmaddubsw m1, m0, [r5 + 29 * 16]
12222 pmaddubsw m4, m2, [r5 + 29 * 16]
12225 movu [r0 + 60 * 16], m1
12228 pmaddubsw m1, m0, [r5 + 3 * 16]
12230 pmaddubsw m4, m2, [r5 + 3 * 16]
12233 movu [r0 + 78 * 16], m1
12240 pmaddubsw m1, m0, [r5 + 10 * 16]
12244 pinsrb m4, [r4 + 32], 15
12246 pmaddubsw m4, m2, [r5 + 10 * 16]
12249 movu [r0 + 24 * 16], m1
12251 ; mode 4 row [row 10]
12252 pmaddubsw m1, m0, [r5 + 7 * 16]
12254 pmaddubsw m4, m2, [r5 + 7 * 16]
12257 movu [r0 + 42 * 16], m1
12259 ; mode 4 row [row 11]
12260 pmaddubsw m1, m0, [r5 + 28 * 16]
12262 pmaddubsw m4, m2, [r5 + 28 * 16]
12265 movu [r0 + 43 * 16], m1
12267 ; mode 5 row [row 13]
12268 pmaddubsw m1, m0, [r5 + 14 * 16]
12270 pmaddubsw m4, m2, [r5 + 14 * 16]
12273 movu [r0 + 61 * 16], m1
12275 ; mode 5 row [row 14]
12276 pmaddubsw m1, m0, [r5 + 31 * 16]
12278 pmaddubsw m4, m2, [r5 + 31 * 16]
12281 movu [r0 + 62 * 16], m1
12288 pmaddubsw m1, m0, [r5 + 4 * 16]
12294 pmaddubsw m4, m2, [r5 + 4 * 16]
12297 movu [r0 + 25 * 16], m1
12299 ; mode 4 row [row 12]
12300 pmaddubsw m1, m0, [r5 + 17 * 16]
12302 pmaddubsw m4, m2, [r5 + 17 * 16]
12305 movu [r0 + 44 * 16], m1
12308 pmaddubsw m1, m0, [r5 + 30 * 16]
12310 pmaddubsw m4, m2, [r5 + 30 * 16]
12313 movu [r0 + 26 * 16], m1
12315 ; mode 5 row [row 15]
12316 pmaddubsw m1, m0, [r5 + 16 * 16]
12318 pmaddubsw m4, m2, [r5 + 16 * 16]
12321 movu [r0 + 63 * 16], m1
12328 pmaddubsw m1, m0, [r5 + 24 * 16]
12334 pmaddubsw m4, m2, [r5 + 24 * 16]
12337 movu [r0 + 27 * 16], m1
12339 ; mode 4 row [row 13]
12340 pmaddubsw m1, m0, [r5 + 6 * 16]
12342 pmaddubsw m4, m2, [r5 + 6 * 16]
12345 movu [r0 + 45 * 16], m1
12347 ; mode 4 row [row 14]
12348 pmaddubsw m1, m0, [r5 + 27 * 16]
12350 pmaddubsw m4, m2, [r5 + 27 * 16]
12353 movu [r0 + 46 * 16], m1
12360 pmaddubsw m1, m0, [r5 + 18 * 16]
12366 pmaddubsw m4, m2, [r5 + 18 * 16]
12369 movu [r0 + 28 * 16], m1
12371 ; mode 4 row [row 15]
12372 pmaddubsw m1, m0, [r5 + 16 * 16]
12374 pmaddubsw m4, m2, [r5 + 16 * 16]
12377 movu [r0 + 47 * 16], m1
12384 pmaddubsw m1, m0, [r5 + 12 * 16]
12390 pmaddubsw m4, m2, [r5 + 12 * 16]
12393 movu [r0 + 29 * 16], m1
12400 pmaddubsw m1, m0, [r5 + 6 * 16]
12406 pmaddubsw m4, m2, [r5 + 6 * 16]
12409 movu [r0 + 30 * 16], m1
12417 movu [r0 + 127 * 16], m1
12421 pmaddubsw m1, m0, [r5 + 2 * 16]
12427 pmaddubsw m2, m7, [r5 + 2 * 16]
12430 movu [r0 + 112 * 16], m1
12433 pmaddubsw m1, m0, [r5 + 4 * 16]
12435 pmaddubsw m2, m7, [r5 + 4 * 16]
12438 movu [r0 + 113 * 16], m1
12441 pmaddubsw m1, m0, [r5 + 6 * 16]
12443 pmaddubsw m2, m7, [r5 + 6 * 16]
12446 movu [r0 + 114 * 16], m1
12449 pmaddubsw m1, m0, [r5 + 8 * 16]
12451 pmaddubsw m2, m7, [r5 + 8 * 16]
12454 movu [r0 + 115 * 16], m1
12457 pmaddubsw m1, m0, [r5 + 10 * 16]
12459 pmaddubsw m2, m7, [r5 + 10 * 16]
12462 movu [r0 + 116 * 16], m1
12465 pmaddubsw m1, m0, [r5 + 12 * 16]
12467 pmaddubsw m2, m7, [r5 + 12 * 16]
12470 movu [r0 + 117 * 16], m1
12473 pmaddubsw m1, m0, [r5 + 14 * 16]
12475 pmaddubsw m2, m7, [r5 + 14 * 16]
12478 movu [r0 + 118 * 16], m1
12481 pmaddubsw m1, m0, [r5 + 16 * 16]
12483 pmaddubsw m2, m7, [r5 + 16 * 16]
12486 movu [r0 + 119 * 16], m1
12489 pmaddubsw m1, m0, [r5 + 18 * 16]
12491 pmaddubsw m2, m7, [r5 + 18 * 16]
12494 movu [r0 + 120 * 16], m1
12497 pmaddubsw m1, m0, [r5 + 20 * 16]
12499 pmaddubsw m2, m7, [r5 + 20 * 16]
12502 movu [r0 + 121 * 16], m1
12505 pmaddubsw m1, m0, [r5 + 22 * 16]
12507 pmaddubsw m2, m7, [r5 + 22 * 16]
12510 movu [r0 + 122 * 16], m1
12513 pmaddubsw m1, m0, [r5 + 24 * 16]
12515 pmaddubsw m2, m7, [r5 + 24 * 16]
12518 movu [r0 + 123 * 16], m1
12521 pmaddubsw m1, m0, [r5 + 26 * 16]
12523 pmaddubsw m2, m7, [r5 + 26 * 16]
12526 movu [r0 + 124 * 16], m1
12529 pmaddubsw m1, m0, [r5 + 28 * 16]
12531 pmaddubsw m2, m7, [r5 + 28 * 16]
12534 movu [r0 + 125 * 16], m1
12537 pmaddubsw m1, m0, [r5 + 30 * 16]
12539 pmaddubsw m2, m7, [r5 + 30 * 16]
12542 movu [r0 + 126 * 16], m1
12546 movu [r0 + 128 * 16], m1
12547 movu [r0 + 129 * 16], m1
12548 movu [r0 + 130 * 16], m1
12549 movu [r0 + 131 * 16], m1
12550 movu [r0 + 132 * 16], m1
12551 movu [r0 + 133 * 16], m1
12552 movu [r0 + 134 * 16], m1
12553 movu [r0 + 135 * 16], m1
12554 movu [r0 + 136 * 16], m1
12555 movu [r0 + 137 * 16], m1
12556 movu [r0 + 138 * 16], m1
12557 movu [r0 + 139 * 16], m1
12558 movu [r0 + 140 * 16], m1
12559 movu [r0 + 141 * 16], m1
12560 movu [r0 + 142 * 16], m1
12561 movu [r0 + 143 * 16], m1
12570 punpcklbw m5, m4, m0
12580 pextrb [r0 + 128 * 16], m5, 0
12581 pextrb [r0 + 129 * 16], m5, 1
12582 pextrb [r0 + 130 * 16], m5, 2
12583 pextrb [r0 + 131 * 16], m5, 3
12584 pextrb [r0 + 132 * 16], m5, 4
12585 pextrb [r0 + 133 * 16], m5, 5
12586 pextrb [r0 + 134 * 16], m5, 6
12587 pextrb [r0 + 135 * 16], m5, 7
12588 pextrb [r0 + 136 * 16], m5, 8
12589 pextrb [r0 + 137 * 16], m5, 9
12590 pextrb [r0 + 138 * 16], m5, 10
12591 pextrb [r0 + 139 * 16], m5, 11
12592 pextrb [r0 + 140 * 16], m5, 12
12593 pextrb [r0 + 141 * 16], m5, 13
12594 pextrb [r0 + 142 * 16], m5, 14
12595 pextrb [r0 + 143 * 16], m5, 15
12601 movu [r0 + 159 * 16], m0
12606 pmaddubsw m1, m0, [r5 + 30 * 16]
12611 pmaddubsw m2, m7, [r5 + 30 * 16]
12614 movu [r0 + 144 * 16], m1
12617 pmaddubsw m1, m0, [r5 + 28 * 16]
12619 pmaddubsw m2, m7, [r5 + 28 * 16]
12622 movu [r0 + 145 * 16], m1
12625 pmaddubsw m1, m0, [r5 + 26 * 16]
12627 pmaddubsw m2, m7, [r5 + 26 * 16]
12630 movu [r0 + 146 * 16], m1
12633 pmaddubsw m1, m0, [r5 + 24 * 16]
12635 pmaddubsw m2, m7, [r5 + 24 * 16]
12638 movu [r0 + 147 * 16], m1
12641 pmaddubsw m1, m0, [r5 + 22 * 16]
12643 pmaddubsw m2, m7, [r5 + 22 * 16]
12646 movu [r0 + 148 * 16], m1
12649 pmaddubsw m1, m0, [r5 + 20 * 16]
12651 pmaddubsw m2, m7, [r5 + 20 * 16]
12654 movu [r0 + 149 * 16], m1
12657 pmaddubsw m1, m0, [r5 + 18 * 16]
12659 pmaddubsw m2, m7, [r5 + 18 * 16]
12662 movu [r0 + 150 * 16], m1
12665 pmaddubsw m1, m0, [r5 + 16 * 16]
12667 pmaddubsw m2, m7, [r5 + 16 * 16]
12670 movu [r0 + 151 * 16], m1
12673 pmaddubsw m1, m0, [r5 + 14 * 16]
12675 pmaddubsw m2, m7, [r5 + 14 * 16]
12678 movu [r0 + 152 * 16], m1
12681 pmaddubsw m1, m0, [r5 + 12 * 16]
12683 pmaddubsw m2, m7, [r5 + 12 * 16]
12686 movu [r0 + 153 * 16], m1
12689 pmaddubsw m1, m0, [r5 + 10 * 16]
12691 pmaddubsw m2, m7, [r5 + 10 * 16]
12694 movu [r0 + 154 * 16], m1
12697 pmaddubsw m1, m0, [r5 + 8 * 16]
12699 pmaddubsw m2, m7, [r5 + 8 * 16]
12702 movu [r0 + 155 * 16], m1
12705 pmaddubsw m1, m0, [r5 + 6 * 16]
12707 pmaddubsw m2, m7, [r5 + 6 * 16]
12710 movu [r0 + 156 * 16], m1
12713 pmaddubsw m1, m0, [r5 + 4 * 16]
12715 pmaddubsw m2, m7, [r5 + 4 * 16]
12718 movu [r0 + 157 * 16], m1
12721 pmaddubsw m1, m0, [r5 + 2 * 16]
12723 pmaddubsw m2, m7, [r5 + 2 * 16]
12726 movu [r0 + 158 * 16], m1
12732 pmaddubsw m1, m0, [r5 + 27 * 16]
12738 pmaddubsw m2, m7, [r5 + 27 * 16]
12741 movu [r0 + 160 * 16], m1
12744 pmaddubsw m1, m0, [r5 + 22 * 16]
12746 pmaddubsw m2, m7, [r5 + 22 * 16]
12749 movu [r0 + 161 * 16], m1
12752 pmaddubsw m1, m0, [r5 + 17 * 16]
12754 pmaddubsw m2, m7, [r5 + 17 * 16]
12757 movu [r0 + 162 * 16], m1
12760 pmaddubsw m1, m0, [r5 + 12 * 16]
12762 pmaddubsw m2, m7, [r5 + 12 * 16]
12765 movu [r0 + 163 * 16], m1
12768 pmaddubsw m1, m0, [r5 + 7 * 16]
12770 pmaddubsw m2, m7, [r5 + 7 * 16]
12773 movu [r0 + 164 * 16], m1
12776 pmaddubsw m1, m0, [r5 + 2 * 16]
12778 pmaddubsw m2, m7, [r5 + 2 * 16]
12781 movu [r0 + 165 * 16], m1
12784 pmaddubsw m1, m0, [r5 + 23 * 16]
12786 pmaddubsw m2, m7, [r5 + 23 * 16]
12789 movu [r0 + 176 * 16], m1
12792 pmaddubsw m1, m0, [r5 + 14 * 16]
12794 pmaddubsw m2, m7, [r5 + 14 * 16]
12797 movu [r0 + 177 * 16], m1
12800 pmaddubsw m1, m0, [r5 + 5 * 16]
12802 pmaddubsw m2, m7, [r5 + 5 * 16]
12805 movu [r0 + 178 * 16], m1
12808 pmaddubsw m1, m0, [r5 + 19 * 16]
12810 pmaddubsw m2, m7, [r5 + 19 * 16]
12813 movu [r0 + 192 * 16], m1
12816 pmaddubsw m1, m0, [r5 + 6 * 16]
12818 pmaddubsw m2, m7, [r5 + 6 * 16]
12821 movu [r0 + 193 * 16], m1
12824 movu [r0 + 240 * 16], m1
12827 pmaddubsw m1, m0, [r5 + 15 * 16]
12829 pmaddubsw m2, m7, [r5 + 15 * 16]
12832 movu [r0 + 208 * 16], m1
12834 ; mode 15 [row 15 - second half]
12835 pmaddubsw m1, m0, [r5 + 16 * 16]
12838 movh [r0 + 223 * 16 + 8], m1
12839 ; mode 15 [row 15 - second half] end
12842 pmaddubsw m1, m0, [r5 + 11 * 16]
12844 pmaddubsw m2, m7, [r5 + 11 * 16]
12847 movu [r0 + 224 * 16], m1
12849 ; mode 17 [row 9 - second half]
12850 pmaddubsw m1, m0, [r5 + 28 * 16]
12853 movh [r0 + 249 * 16 + 8], m1
12854 ; mode 17 [row 9 - second half] end
12856 ; mode 17 [row 10 - second half]
12857 pmaddubsw m1, m0, [r5 + 2 * 16]
12860 movh [r0 + 250 * 16 + 8], m1
12861 ; mode 17 [row 10 - second half] end
12863 ; mode 17 [row 1 - first half]
12865 pinsrb m6, [r3 + 0], 1
12866 pinsrb m6, [r3 + 1], 0
12867 pmaddubsw m1, m6, [r5 + 12 * 16]
12870 movh [r0 + 241 * 16], m1
12872 ; mode 17 [row 11 - second half]
12873 pmaddubsw m1, m6, [r5 + 8 * 16]
12876 movh [r0 + 251 * 16 + 8], m1
12877 ; mode 17 [row 11 - second half] end
12879 ; mode 17 [row 2 - first half]
12881 pinsrb m6, [r3 + 1], 1
12882 pinsrb m6, [r3 + 2], 0
12883 pmaddubsw m1, m6, [r5 + 18 * 16]
12886 movh [r0 + 242 * 16], m1
12888 ; mode 17 [row 12 - second half]
12889 pmaddubsw m1, m6, [r5 + 14 * 16]
12892 movh [r0 + 252 * 16 + 8], m1
12893 ; mode 17 [row 12 - second half] end
12895 ; mode 17 [row 3 - first half]
12897 pinsrb m6, [r3 + 2], 1
12898 pinsrb m6, [r3 + 4], 0
12899 pmaddubsw m1, m6, [r5 + 24 * 16]
12902 movh [r0 + 243 * 16], m1
12904 ; mode 17 [row 13 - second half]
12905 pmaddubsw m1, m6, [r5 + 20 * 16]
12908 movh [r0 + 253 * 16 + 8], m1
12910 ; mode 17 [row 4 - first half]
12912 pinsrb m6, [r3 + 4], 1
12913 pinsrb m6, [r3 + 5], 0
12914 pmaddubsw m1, m6, [r5 + 30 * 16]
12917 movh [r0 + 244 * 16], m1
12919 ; mode 17 [row 5 - first half]
12920 pmaddubsw m1, m6, [r5 + 4 * 16]
12923 movh [r0 + 245 * 16], m1
12925 ; mode 17 [row 14 - second half]
12926 pmaddubsw m1, m6, [r5 + 26 * 16]
12929 movh [r0 + 254 * 16 + 8], m1
12930 ; mode 17 [row 14 - second half] end
12932 ; mode 17 [row 6 - first half]
12934 pinsrb m6, [r3 + 5], 1
12935 pinsrb m6, [r3 + 6], 0
12936 pmaddubsw m1, m6, [r5 + 10 * 16]
12939 movh [r0 + 246 * 16], m1
12941 ; mode 17 [row 7 - first half]
12943 pinsrb m6, [r3 + 6], 1
12944 pinsrb m6, [r3 + 7], 0
12945 pmaddubsw m1, m6, [r5 + 16 * 16]
12948 movh [r0 + 247 * 16], m1
12950 ; mode 17 [row 8 - first half]
12952 pinsrb m6, [r3 + 7], 1
12953 pinsrb m6, [r3 + 9], 0
12954 pmaddubsw m1, m6, [r5 + 22 * 16]
12957 movh [r0 + 248 * 16], m1
12959 ; mode 17 [row 9 - first half]
12961 pinsrb m6, [r3 + 9], 1
12962 pinsrb m6, [r3 + 10], 0
12963 pmaddubsw m1, m6, [r5 + 28 * 16]
12966 movh [r0 + 249 * 16], m1
12968 ; mode 17 [row 10 - first half]
12969 pmaddubsw m1, m6, [r5 + 2 * 16]
12972 movh [r0 + 250 * 16], m1
12974 ; mode 17 [row 11 - first half]
12976 pinsrb m6, [r3 + 10], 1
12977 pinsrb m6, [r3 + 11], 0
12978 pmaddubsw m1, m6, [r5 + 8 * 16]
12981 movh [r0 + 251 * 16], m1
12983 ; mode 17 [row 12 - first half]
12985 pinsrb m6, [r3 + 11], 1
12986 pinsrb m6, [r3 + 12], 0
12987 pmaddubsw m1, m6, [r5 + 14 * 16]
12990 movh [r0 + 252 * 16], m1
12992 ; mode 17 [row 13 - first half]
12994 pinsrb m6, [r3 + 12], 1
12995 pinsrb m6, [r3 + 14], 0
12996 pmaddubsw m1, m6, [r5 + 20 * 16]
12999 movh [r0 + 253 * 16], m1
13001 ; mode 17 [row 14 - first half]
13003 pinsrb m6, [r3 + 14], 1
13004 pinsrb m6, [r3 + 15], 0
13005 pmaddubsw m1, m6, [r5 + 26 * 16]
13008 movh [r0 + 254 * 16], m1
13010 ; mode 16 [row 12 - second half]
13011 pmaddubsw m1, m0, [r5 + 15 * 16]
13014 movh [r0 + 236 * 16 + 8], m1
13015 ; mode 16 [row 12 - second half] end
13019 pinsrb m2, [r3 + 0], 1
13020 pinsrb m2, [r3 + 6], 0
13021 pmaddubsw m1, m2, [r5 + 29 * 16]
13026 pmaddubsw m4, m0, [r5 + 29 * 16]
13029 movu [r0 + 166 * 16], m1
13032 pmaddubsw m1, m2, [r5 + 24 * 16]
13034 pmaddubsw m4, m0, [r5 + 24 * 16]
13037 movu [r0 + 167 * 16], m1
13040 pmaddubsw m1, m2, [r5 + 19 * 16]
13042 pmaddubsw m4, m0, [r5 + 19 * 16]
13045 movu [r0 + 168 * 16], m1
13048 pmaddubsw m1, m2, [r5 + 14 * 16]
13050 pmaddubsw m4, m0, [r5 + 14 * 16]
13053 movu [r0 + 169 * 16], m1
13056 pmaddubsw m1, m2, [r5 + 9 * 16]
13058 pmaddubsw m4, m0, [r5 + 9 * 16]
13061 movu [r0 + 170 * 16], m1
13064 pmaddubsw m1, m2, [r5 + 4 * 16]
13066 pmaddubsw m4, m0, [r5 + 4 * 16]
13069 movu [r0 + 171 * 16], m1
13072 pinsrb m7, m2, [r3 + 4], 0
13073 pmaddubsw m1, m7, [r5 + 28 * 16]
13075 pmaddubsw m4, m0, [r5 + 28 * 16]
13078 movu [r0 + 179 * 16], m1
13081 pmaddubsw m1, m7, [r5 + 19 * 16]
13083 pmaddubsw m4, m0, [r5 + 19 * 16]
13086 movu [r0 + 180 * 16], m1
13089 pmaddubsw m1, m7, [r5 + 10 * 16]
13091 pmaddubsw m4, m0, [r5 + 10 * 16]
13094 movu [r0 + 181 * 16], m1
13097 pmaddubsw m1, m7, [r5 + 1 * 16]
13099 pmaddubsw m4, m0, [r5 + 1 * 16]
13102 movu [r0 + 182 * 16], m1
13105 pinsrb m5, m7, [r3 + 2], 0
13106 pmaddubsw m1, m5, [r5 + 25 * 16]
13108 pmaddubsw m4, m0, [r5 + 25 * 16]
13111 movu [r0 + 194 * 16], m1
13114 pmaddubsw m1, m5, [r5 + 12 * 16]
13116 pmaddubsw m4, m0, [r5 + 12 * 16]
13119 movu [r0 + 195 * 16], m1
13122 pmaddubsw m1, m5, [r5 + 30 * 16]
13124 pmaddubsw m4, m0, [r5 + 30 * 16]
13127 movu [r0 + 209 * 16], m1
13130 pmaddubsw m1, m5, [r5 + 13 * 16]
13132 pmaddubsw m4, m0, [r5 + 13 * 16]
13135 movu [r0 + 210 * 16], m1
13138 pmaddubsw m1, m5, [r5 + 22 * 16]
13140 pmaddubsw m4, m0, [r5 + 22 * 16]
13143 movu [r0 + 225 * 16], m1
13146 pmaddubsw m1, m5, [r5 + 1 * 16]
13148 pmaddubsw m4, m0, [r5 + 1 * 16]
13151 movu [r0 + 226 * 16], m1
13153 ; mode 16 [row 13 - second half]
13154 pmaddubsw m1, m5, [r5 + 26 * 16]
13157 movh [r0 + 237 * 16 + 8], m1
13158 ; mode 16 [row 13 - second half] end
13160 ; mode 16 [row 14 - second half]
13161 pmaddubsw m1, m5, [r5 + 5 * 16]
13164 movh [r0 + 238 * 16 + 8], m1
13165 ; mode 16 [row 14 - second half] end
13169 pinsrb m6, [r3 + 2], 1
13170 pinsrb m6, [r3 + 3], 0
13171 pmaddubsw m1, m6, [r5 + 12 * 16]
13174 movh [r0 + 227 * 16], m1
13176 ; mode 16 [row 15 - second half]
13177 pmaddubsw m1, m6, [r5 + 16 * 16]
13180 movh [r0 + 239 * 16 + 8], m1
13181 ; mode 16 [row 15 - second half] end
13183 ; mode 16 [row 4- first half]
13185 pinsrb m6, [r3 + 3], 1
13186 pinsrb m6, [r3 + 5], 0
13187 pmaddubsw m1, m6, [r5 + 23 * 16]
13190 movh [r0 + 228 * 16], m1
13192 ; mode 16 [row 5- first half]
13193 pmaddubsw m1, m6, [r5 + 2 * 16]
13196 movh [r0 + 229 * 16], m1
13198 ; mode 16 [row 6- first half]
13200 pinsrb m6, [r3 + 5], 1
13201 pinsrb m6, [r3 + 6], 0
13202 pmaddubsw m1, m6, [r5 + 13 * 16]
13205 movh [r0 + 230 * 16], m1
13207 ; mode 16 [row 7- first half]
13209 pinsrb m6, [r3 + 6], 1
13210 pinsrb m6, [r3 + 8], 0
13211 pmaddubsw m1, m6, [r5 + 24 * 16]
13214 movh [r0 + 231 * 16], m1
13216 ; mode 16 [row 8- first half]
13217 pmaddubsw m1, m6, [r5 + 3 * 16]
13220 movh [r0 + 232 * 16], m1
13221 ; mode 16 [row 8 - first half] end
13223 ; mode 16 [row 9- first half]
13225 pinsrb m6, [r3 + 8], 1
13226 pinsrb m6, [r3 + 9], 0
13227 pmaddubsw m1, m6, [r5 + 14 * 16]
13230 movh [r0 + 233 * 16], m1
13232 ; mode 16 [row 10 - first half]
13234 pinsrb m6, [r3 + 9], 1
13235 pinsrb m6, [r3 + 11], 0
13236 pmaddubsw m1, m6, [r5 + 25 * 16]
13239 movh [r0 + 234 * 16], m1
13241 ; mode 16 [row 11 - first half]
13242 pmaddubsw m1, m6, [r5 + 4 * 16]
13245 movh [r0 + 235 * 16], m1
13247 ; mode 16 [row 12 - first half]
13249 pinsrb m6, [r3 + 11], 1
13250 pinsrb m6, [r3 + 12], 0
13251 pmaddubsw m1, m6, [r5 + 15 * 16]
13254 movh [r0 + 236 * 16], m1
13256 ; mode 16 [row 13 - first half]
13258 pinsrb m6, [r3 + 12], 1
13259 pinsrb m6, [r3 + 14], 0
13260 pmaddubsw m1, m6, [r5 + 26 * 16]
13263 movh [r0 + 237 * 16], m1
13265 ; mode 16 [row 14 - first half]
13266 pmaddubsw m1, m6, [r5 + 5 * 16]
13269 movh [r0 + 238 * 16], m1
13271 ; mode 16 [row 15 - first half]
13273 pinsrb m6, [r3 + 14], 1
13274 pinsrb m6, [r3 + 15], 0
13275 pmaddubsw m1, m6, [r5 + 16 * 16]
13278 movh [r0 + 239 * 16], m1
13282 pinsrb m5, [r3 + 2], 1
13283 pinsrb m5, [r3 + 5], 0
13288 ; mode 16 [row 3 - second half]
13289 pmaddubsw m1, m4, [r5 + 12 * 16]
13292 movh [r0 + 227 * 16 + 8], m1
13294 ; mode 16 [row 3 - second half] end
13295 pmaddubsw m1, m5, [r5 + 31 * 16]
13297 pmaddubsw m0, m4, [r5 + 31 * 16]
13300 movu [r0 + 196 * 16], m1
13303 pmaddubsw m1, m5, [r5 + 18 * 16]
13305 pmaddubsw m0, m4, [r5 + 18 * 16]
13308 movu [r0 + 197 * 16], m1
13311 pmaddubsw m1, m5, [r5 + 5 * 16]
13313 pmaddubsw m0, m4, [r5 + 5 * 16]
13316 movu [r0 + 198 * 16], m1
13320 pinsrb m6, [r3 + 4], 0
13321 pmaddubsw m1, m6, [r5 + 28 * 16]
13323 pmaddubsw m0, m4, [r5 + 28 * 16]
13326 movu [r0 + 211 * 16], m1
13329 pmaddubsw m1, m6, [r5 + 11 * 16]
13331 pmaddubsw m0, m4, [r5 + 11 * 16]
13334 movu [r0 + 212 * 16], m1
13336 ; mode 15 [row 5 - first half]
13338 pinsrb m6, [r3 + 4], 1
13339 pinsrb m6, [r3 + 6], 0
13340 pmaddubsw m1, m6, [r5 + 26 * 16]
13343 movh [r0 + 213 * 16], m1
13345 ; mode 15 [row 6 - first half]
13346 pmaddubsw m1, m6, [r5 + 9 * 16]
13349 movh [r0 + 214 * 16], m1
13351 ; mode 15 [row 7 - first half]
13353 pinsrb m6, [r3 + 6], 1
13354 pinsrb m6, [r3 + 8], 0
13355 pmaddubsw m1, m6, [r5 + 24 * 16]
13358 movh [r0 + 215 * 16], m1
13360 ; mode 15 [row 8 - first half]
13361 pmaddubsw m1, m6, [r5 + 7 * 16]
13364 movh [r0 + 216 * 16], m1
13366 ; mode 15 [row 9 - first half]
13368 pinsrb m6, [r3 + 8], 1
13369 pinsrb m6, [r3 + 9], 0
13370 pmaddubsw m1, m6, [r5 + 22 * 16]
13373 movh [r0 + 217 * 16], m1
13375 ; mode 15 [row 10 - first half]
13376 pmaddubsw m1, m6, [r5 + 5 * 16]
13379 movh [r0 + 218 * 16], m1
13381 ; mode 15 [row 11 - first half]
13383 pinsrb m6, [r3 + 9], 1
13384 pinsrb m6, [r3 + 11], 0
13385 pmaddubsw m1, m6, [r5 + 20 * 16]
13388 movh [r0 + 219 * 16], m1
13390 ; mode 15 [row 12 - first half]
13391 pmaddubsw m1, m6, [r5 + 3 * 16]
13394 movh [r0 + 220 * 16], m1
13396 ; mode 15 [row 13 - first half]
13398 pinsrb m6, [r3 + 11], 1
13399 pinsrb m6, [r3 + 13], 0
13400 pmaddubsw m1, m6, [r5 + 18 * 16]
13403 movh [r0 + 221 * 16], m1
13405 ; mode 15 [row 14 - first half]
13406 pmaddubsw m1, m6, [r5 + 1 * 16]
13409 movh [r0 + 222 * 16], m1
13411 ; mode 15 [row 15 - first half]
13413 pinsrb m6, [r3 + 13], 1
13414 pinsrb m6, [r3 + 15], 0
13415 pmaddubsw m1, m6, [r5 + 16 * 16]
13418 movh [r0 + 223 * 16], m1
13422 pinsrb m5, [r3 + 5], 1
13423 pinsrb m5, [r3 + 7], 0
13428 ; mode 15 [row 5 - second half]
13429 pmaddubsw m1, m0, [r5 + 26 * 16]
13432 movh [r0 + 213 * 16 + 8], m1
13433 ; mode 15 [row 5 - second half] end
13435 ; mode 15 [row 6 - second half]
13436 pmaddubsw m1, m0, [r5 + 9 * 16]
13439 movh [r0 + 214 * 16 + 8], m1
13440 ; mode 15 [row 6 - second half] end
13442 ; mode 16 [row 4 - second half]
13443 pmaddubsw m1, m0, [r5 + 23 * 16]
13446 movh [r0 + 228 * 16 + 8], m1
13447 ; mode 16 [row 4 - second half] end
13449 ; mode 16 [row 5 - second half]
13450 pmaddubsw m1, m0, [r5 + 2 * 16]
13453 movh [r0 + 229 * 16 + 8], m1
13455 ; mode 16 [row 5 - second half] end
13456 pmaddubsw m1, m5, [r5 + 24 * 16]
13458 pmaddubsw m6, m0, [r5 + 24 * 16]
13461 movu [r0 + 199 * 16], m1
13464 pmaddubsw m1, m5, [r5 + 11 * 16]
13466 pmaddubsw m6, m0, [r5 + 11 * 16]
13469 movu [r0 + 200 * 16], m1
13473 pinsrb m5, [r3 + 7], 1
13474 pinsrb m5, [r3 + 10], 0
13479 ; mode 15 [row 7 - second half]
13480 pmaddubsw m1, m0, [r5 + 24 * 16]
13483 movh [r0 + 215 * 16 + 8], m1
13484 ; mode 15 [row 7 - second half] end
13486 ; mode 15 [row 8 - second half]
13487 pmaddubsw m1, m0, [r5 + 7 * 16]
13490 movh [r0 + 216 * 16 + 8], m1
13491 ; mode 15 [row 8 - second half] end
13493 ; mode 16 [row 6 - second half]
13494 pmaddubsw m1, m0, [r5 + 13 * 16]
13497 movh [r0 + 230 * 16 + 8], m1
13498 ; mode 16 [row 6 - second half] end
13500 ; mode 15 [row 6 - second half] end
13501 pmaddubsw m1, m5, [r5 + 30 * 16]
13503 pmaddubsw m6, m0, [r5 + 30 * 16]
13506 movu [r0 + 201 * 16], m1
13509 pmaddubsw m1, m5, [r5 + 17 * 16]
13511 pmaddubsw m6, m0, [r5 + 17 * 16]
13514 movu [r0 + 202 * 16], m1
13517 pmaddubsw m1, m5, [r5 + 4 * 16]
13519 pmaddubsw m6, m0, [r5 + 4 * 16]
13522 movu [r0 + 203 * 16], m1
13526 pinsrb m5, [r3 + 10], 1
13527 pinsrb m5, [r3 + 12], 0
13532 ; mode 15 [row 9 - second half]
13533 pmaddubsw m1, m0, [r5 + 22 * 16]
13536 movh [r0 + 217 * 16 + 8], m1
13537 ; mode 15 [row 9 - second half] end
13539 ; mode 15 [row 10 - second half]
13540 pmaddubsw m1, m0, [r5 + 5 * 16]
13543 movh [r0 + 218 * 16 + 8], m1
13544 ; mode 15 [row 10 - second half] end
13546 ; mode 16 [row 7 - second half]
13547 pmaddubsw m1, m0, [r5 + 24 * 16]
13550 movh [r0 + 231 * 16 + 8], m1
13551 ; mode 16 [row 7 - second half] end
13553 ; mode 16 [row 8 - second half]
13554 pmaddubsw m1, m0, [r5 + 3 * 16]
13557 movh [r0 + 232 * 16 + 8], m1
13558 ; mode 16 [row 8 - second half] end
13560 pmaddubsw m1, m5, [r5 + 23 * 16]
13562 pmaddubsw m6, m0, [r5 + 23 * 16]
13565 movu [r0 + 204 * 16], m1
13568 pmaddubsw m1, m5, [r5 + 10 * 16]
13570 pmaddubsw m6, m0, [r5 + 10 * 16]
13573 movu [r0 + 205 * 16], m1
13577 pinsrb m5, [r3 + 12], 1
13578 pinsrb m5, [r3 + 15], 0
13583 ; mode 15 [row 11 - second half]
13584 pmaddubsw m1, m0, [r5 + 20 * 16]
13587 movh [r0 + 219 * 16 + 8], m1
13588 ; mode 15 [row 11 - second half] end
13590 ; mode 15 [row 12 - second half]
13591 pmaddubsw m1, m0, [r5 + 3 * 16]
13594 movh [r0 + 220 * 16 + 8], m1
13595 ; mode 15 [row 12 - second half] end
13597 ; mode 16 [row 9 - second half]
13598 pmaddubsw m1, m0, [r5 + 14 * 16]
13601 movh [r0 + 233 * 16 + 8], m1
13603 ; mode 16 [row 9 - second half] end
13604 pmaddubsw m1, m5, [r5 + 29 * 16]
13606 pmaddubsw m6, m0, [r5 + 29 * 16]
13609 movu [r0 + 206 * 16], m1
13612 pmaddubsw m1, m5, [r5 + 16 * 16]
13614 pmaddubsw m6, m0, [r5 + 16 * 16]
13617 movu [r0 + 207 * 16], m1
13621 pinsrb m0, [r3 + 6], 1
13622 pinsrb m0, [r3 + 13], 0
13623 pmaddubsw m1, m0, [r5 + 31 * 16]
13625 pmaddubsw m5, m4, [r5 + 31 * 16]
13628 movu [r0 + 172 * 16], m1
13631 pmaddubsw m1, m0, [r5 + 26 * 16]
13633 pmaddubsw m5, m4, [r5 + 26 * 16]
13636 movu [r0 + 173 * 16], m1
13639 pmaddubsw m1, m0, [r5 + 21 * 16]
13641 pmaddubsw m5, m4, [r5 + 21 * 16]
13644 movu [r0 + 174 * 16], m1
13647 pmaddubsw m1, m0, [r5 + 16 * 16]
13649 pmaddubsw m5, m4, [r5 + 16 * 16]
13652 movu [r0 + 175 * 16], m1
13656 pinsrb m7, [r3 + 4], 1
13657 pinsrb m7, [r3 + 7], 0
13658 pmaddubsw m1, m7, [r5 + 24 * 16]
13660 pmaddubsw m5, m4, [r5 + 24 * 16]
13663 movu [r0 + 183 * 16], m1
13666 pmaddubsw m1, m7, [r5 + 15 * 16]
13668 pmaddubsw m5, m4, [r5 + 15 * 16]
13671 movu [r0 + 184 * 16], m1
13674 pmaddubsw m1, m7, [r5 + 6 * 16]
13676 pmaddubsw m5, m4, [r5 + 6 * 16]
13679 movu [r0 + 185 * 16], m1
13683 pinsrb m7, [r3 + 7], 1
13684 pinsrb m7, [r3 + 11], 0
13685 pmaddubsw m1, m7, [r5 + 29 * 16]
13690 pmaddubsw m5, m4, [r5 + 29 * 16]
13693 movu [r0 + 186 * 16], m1
13696 pmaddubsw m1, m7, [r5 + 20 * 16]
13698 pmaddubsw m5, m4, [r5 + 20 * 16]
13701 movu [r0 + 187 * 16], m1
13704 pmaddubsw m1, m7, [r5 + 11 * 16]
13706 pmaddubsw m5, m4, [r5 + 11 * 16]
13709 movu [r0 + 188 * 16], m1
13712 pmaddubsw m1, m7, [r5 + 2 * 16]
13714 pmaddubsw m5, m4, [r5 + 2 * 16]
13717 movu [r0 + 189 * 16], m1
13721 pinsrb m7, [r3 + 11], 1
13722 pinsrb m7, [r3 + 14], 0
13723 pmaddubsw m1, m7, [r5 + 25 * 16]
13728 pmaddubsw m5, m4, [r5 + 25 * 16]
13731 movu [r0 + 190 * 16], m1
13734 pmaddubsw m1, m7, [r5 + 16 * 16]
13736 pmaddubsw m5, m4, [r5 + 16 * 16]
13739 movu [r0 + 191 * 16], m1
13743 pshufb m1, m0, [tab_S1]
13744 movu [r0 + 255 * 16], m1
13746 movd [r0 + 255 * 16 + 12], m2
13749 movu [r0 + 256 * 16], m0
13753 pinsrb m4, [r4 + 1], 0
13754 movu [r0 + 257 * 16], m4
13756 pinsrb m4, [r4 + 2], 0
13757 movu [r0 + 258 * 16], m4
13759 pinsrb m4, [r4 + 3], 0
13760 movu [r0 + 259 * 16], m4
13762 pinsrb m4, [r4 + 4], 0
13763 movu [r0 + 260 * 16], m4
13765 pinsrb m4, [r4 + 5], 0
13766 movu [r0 + 261 * 16], m4
13768 pinsrb m4, [r4 + 6], 0
13769 movu [r0 + 262 * 16], m4
13771 pinsrb m4, [r4 + 7], 0
13772 movu [r0 + 263 * 16], m4
13774 pinsrb m4, [r4 + 8], 0
13775 movu [r0 + 264 * 16], m4
13777 pinsrb m4, [r4 + 9], 0
13778 movu [r0 + 265 * 16], m4
13780 pinsrb m4, [r4 + 10], 0
13781 movu [r0 + 266 * 16], m4
13783 pinsrb m4, [r4 + 11], 0
13784 movu [r0 + 267 * 16], m4
13786 pinsrb m4, [r4 + 12], 0
13787 movu [r0 + 268 * 16], m4
13789 pinsrb m4, [r4 + 13], 0
13790 movu [r0 + 269 * 16], m4
13792 pinsrb m4, [r4 + 14], 0
13793 movu [r0 + 270 * 16], m4
13795 pinsrb m4, [r4 + 15], 0
13796 movu [r0 + 271 * 16], m4
13804 pmaddubsw m4, m0, [r5 + 6 * 16]
13806 pmaddubsw m6, m5, [r5 + 6 * 16]
13809 movu [r0 + 272 * 16], m4
13812 pmaddubsw m4, m0, [r5 + 11 * 16]
13814 pmaddubsw m6, m5, [r5 + 11 * 16]
13817 movu [r0 + 288 * 16], m4
13820 pmaddubsw m4, m0, [r5 + 15 * 16]
13822 pmaddubsw m6, m5, [r5 + 15 * 16]
13825 movu [r0 + 304 * 16], m4
13828 pmaddubsw m4, m0, [r5 + 19 * 16]
13830 pmaddubsw m6, m5, [r5 + 19 * 16]
13833 movu [r0 + 320 * 16], m4
13836 pmaddubsw m4, m0, [r5 + 6 * 16]
13838 pmaddubsw m6, m5, [r5 + 6 * 16]
13841 movu [r0 + 321 * 16], m4
13844 pmaddubsw m4, m0, [r5 + 23 * 16]
13846 pmaddubsw m6, m5, [r5 + 23 * 16]
13849 movu [r0 + 336 * 16], m4
13852 pmaddubsw m4, m0, [r5 + 14 * 16]
13854 pmaddubsw m6, m5, [r5 + 14 * 16]
13857 movu [r0 + 337 * 16], m4
13860 pmaddubsw m4, m0, [r5 + 5 * 16]
13862 pmaddubsw m6, m5, [r5 + 5 * 16]
13865 movu [r0 + 338 * 16], m4
13868 pmaddubsw m4, m0, [r5 + 27 * 16]
13870 pmaddubsw m6, m5, [r5 + 27 * 16]
13873 movu [r0 + 352 * 16], m4
13876 pmaddubsw m4, m0, [r5 + 22 * 16]
13878 pmaddubsw m6, m5, [r5 + 22 * 16]
13881 movu [r0 + 353 * 16], m4
13884 pmaddubsw m4, m0, [r5 + 17 * 16]
13886 pmaddubsw m6, m5, [r5 + 17 * 16]
13889 movu [r0 + 354 * 16], m4
13892 pmaddubsw m4, m0, [r5 + 12 * 16]
13894 pmaddubsw m6, m5, [r5 + 12 * 16]
13897 movu [r0 + 355 * 16], m4
13900 pmaddubsw m4, m0, [r5 + 7 * 16]
13902 pmaddubsw m6, m5, [r5 + 7 * 16]
13905 movu [r0 + 356 * 16], m4
13908 pmaddubsw m4, m0, [r5 + 2 * 16]
13910 pmaddubsw m6, m5, [r5 + 2 * 16]
13913 movu [r0 + 357 * 16], m4
13915 ; mode 24 [row 6 - first half]
13917 pinsrb m7, [r4 + 0], 1
13918 pinsrb m7, [r4 + 6], 0
13919 pmaddubsw m4, m7, [r5 + 29 * 16]
13922 movh [r0 + 358 * 16], m4
13924 ; mode 24 [row 7 - first half]
13925 pmaddubsw m4, m7, [r5 + 24 * 16]
13928 movh [r0 + 359 * 16], m4
13930 ; mode 24 [row 8 - first half]
13931 pmaddubsw m4, m7, [r5 + 19 * 16]
13934 movh [r0 + 360 * 16], m4
13936 ; mode 24 [row 9 - first half]
13937 pmaddubsw m4, m7, [r5 + 14 * 16]
13940 movh [r0 + 361 * 16], m4
13942 ; mode 24 [row 10 - first half]
13943 pmaddubsw m4, m7, [r5 + 9 * 16]
13946 movh [r0 + 362 * 16], m4
13948 ; mode 24 [row 11 - first half]
13949 pmaddubsw m4, m7, [r5 + 4 * 16]
13952 movh [r0 + 363 * 16], m4
13954 ; mode 24 [row 12 - first half]
13956 pinsrb m7, [r4 + 6], 1
13957 pinsrb m7, [r4 + 13], 0
13958 pmaddubsw m4, m7, [r5 + 31 * 16]
13961 movh [r0 + 364 * 16], m4
13963 ; mode 24 [row 13 - first half]
13964 pmaddubsw m4, m7, [r5 + 26 * 16]
13967 movh [r0 + 365 * 16], m4
13969 ; mode 24 [row 14 - first half]
13970 pmaddubsw m4, m7, [r5 + 21 * 16]
13973 movh [r0 + 366 * 16], m4
13975 ; mode 24 [row 15 - first half]
13976 pmaddubsw m4, m7, [r5 + 16 * 16]
13979 movh [r0 + 367 * 16], m4
13981 ; mode 23 [row 3 - first half]
13983 pinsrb m7, [r4 + 0], 1
13984 pinsrb m7, [r4 + 4], 0
13985 pmaddubsw m4, m7, [r5 + 28 * 16]
13988 movh [r0 + 339 * 16], m4
13990 ; mode 23 [row 4 - first half]
13991 pmaddubsw m4, m7, [r5 + 19 * 16]
13994 movh [r0 + 340 * 16], m4
13996 ; mode 23 [row 5 - first half]
13997 pmaddubsw m4, m7, [r5 + 10 * 16]
14000 movh [r0 + 341 * 16], m4
14002 ; mode 23 [row 6 - first half]
14003 pmaddubsw m4, m7, [r5 + 1 * 16]
14006 movh [r0 + 342 * 16], m4
14008 ; mode 23 [row 7 - first half]
14010 pinsrb m7, [r4 + 4], 1
14011 pinsrb m7, [r4 + 7], 0
14012 pmaddubsw m4, m7, [r5 + 24 * 16]
14015 movh [r0 + 343 * 16], m4
14017 ; mode 23 [row 8 - first half]
14018 pmaddubsw m4, m7, [r5 + 15 * 16]
14021 movh [r0 + 344 * 16], m4
14023 ; mode 23 [row 9 - first half]
14024 pmaddubsw m4, m7, [r5 + 6 * 16]
14027 movh [r0 + 345 * 16], m4
14029 ; mode 23 [row 10 - first half]
14031 pinsrb m7, [r4 + 7], 1
14032 pinsrb m7, [r4 + 11], 0
14033 pmaddubsw m4, m7, [r5 + 29 * 16]
14036 movh [r0 + 346 * 16], m4
14038 ; mode 23 [row 11 - first half]
14039 pmaddubsw m4, m7, [r5 + 20 * 16]
14042 movh [r0 + 347 * 16], m4
14044 ; mode 23 [row 12 - first half]
14045 pmaddubsw m4, m7, [r5 + 11 * 16]
14048 movh [r0 + 348 * 16], m4
14050 ; mode 23 [row 13 - first half]
14051 pmaddubsw m4, m7, [r5 + 2 * 16]
14054 movh [r0 + 349 * 16], m4
14056 ; mode 23 [row 14 - first half]
14058 pinsrb m7, [r4 + 11], 1
14059 pinsrb m7, [r4 + 14], 0
14060 pmaddubsw m4, m7, [r5 + 25 * 16]
14063 movh [r0 + 350 * 16], m4
14065 ; mode 23 [row 15 - first half]
14066 pmaddubsw m4, m7, [r5 + 16 * 16]
14069 movh [r0 + 351 * 16], m4
14071 ; mode 21 [row 15 - second half]
14072 pmaddubsw m4, m0, [r5 + 16 * 16]
14075 movh [r0 + 319 * 16 + 8], m4
14076 ; mode 21 [row 15 - second half] end
14078 ; mode 20 [row 1 - first half]
14080 pinsrb m7, [r4 + 0], 1
14081 pinsrb m7, [r4 + 2], 0
14082 pmaddubsw m4, m7, [r5 + 22 * 16]
14085 movh [r0 + 289 * 16], m4
14087 ; mode 20 [row 2 - first half]
14088 pmaddubsw m4, m7, [r5 + 1 * 16]
14091 movh [r0 + 290 * 16], m4
14093 ; mode 21 [row 1 - first half]
14094 pmaddubsw m4, m7, [r5 + 30 * 16]
14097 movh [r0 + 305 * 16], m4
14099 ; mode 21 [row 2 - first half]
14100 pmaddubsw m4, m7, [r5 + 13 * 16]
14103 movh [r0 + 306 * 16], m4
14105 ; mode 22 [row 2 - first half]
14106 pmaddubsw m4, m7, [r5 + 25 * 16]
14109 movh [r0 + 322 * 16], m4
14111 ; mode 22 [row 3 - first half]
14112 pmaddubsw m4, m7, [r5 + 12 * 16]
14115 movh [r0 + 323 * 16], m4
14117 ; mode 22 [row 4 - first half]
14119 pinsrb m1, [r4 + 2], 1
14120 pinsrb m1, [r4 + 5], 0
14121 pmaddubsw m4, m1, [r5 + 31 * 16]
14124 movh [r0 + 324 * 16], m4
14126 ; mode 22 [row 5 - first half]
14127 pmaddubsw m4, m1, [r5 + 18 * 16]
14130 movh [r0 + 325 * 16], m4
14132 ; mode 22 [row 6 - first half]
14133 pmaddubsw m4, m1, [r5 + 5 * 16]
14136 movh [r0 + 326 * 16], m4
14138 ; mode 22 [row 7 - first half]
14140 pinsrb m1, [r4 + 5], 1
14141 pinsrb m1, [r4 + 7], 0
14142 pmaddubsw m4, m1, [r5 + 24 * 16]
14145 movh [r0 + 327 * 16], m4
14147 ; mode 22 [row 8 - first half]
14148 pmaddubsw m4, m1, [r5 + 11 * 16]
14151 movh [r0 + 328 * 16], m4
14153 ; mode 22 [row 9 - first half]
14155 pinsrb m1, [r4 + 7], 1
14156 pinsrb m1, [r4 + 10], 0
14157 pmaddubsw m4, m1, [r5 + 30 * 16]
14160 movh [r0 + 329 * 16], m4
14162 ; mode 22 [row 10 - first half]
14163 pmaddubsw m4, m1, [r5 + 17 * 16]
14166 movh [r0 + 330 * 16], m4
14168 ; mode 22 [row 11 - first half]
14169 pmaddubsw m4, m1, [r5 + 4 * 16]
14172 movh [r0 + 331 * 16], m4
14174 ; mode 22 [row 12 - first half]
14176 pinsrb m1, [r4 + 10], 1
14177 pinsrb m1, [r4 + 12], 0
14178 pmaddubsw m4, m1, [r5 + 23 * 16]
14181 movh [r0 + 332 * 16], m4
14183 ; mode 22 [row 13 - first half]
14184 pmaddubsw m4, m1, [r5 + 10 * 16]
14187 movh [r0 + 333 * 16], m4
14189 ; mode 22 [row 14 - first half]
14191 pinsrb m1, [r4 + 12], 1
14192 pinsrb m1, [r4 + 15], 0
14193 pmaddubsw m4, m1, [r5 + 29 * 16]
14196 movh [r0 + 334 * 16], m4
14198 ; mode 22 [row 15 - first half]
14199 pmaddubsw m4, m1, [r5 + 16 * 16]
14202 movh [r0 + 335 * 16], m4
14204 ; mode 21 [row 3 - first half]
14206 pinsrb m6, [r4 + 2], 1
14207 pinsrb m6, [r4 + 4], 0
14208 pmaddubsw m4, m6, [r5 + 28 * 16]
14211 movh [r0 + 307 * 16], m4
14213 ; mode 21 [row 4 - first half]
14214 pmaddubsw m4, m6, [r5 + 11 * 16]
14217 movh [r0 + 308 * 16], m4
14219 ; mode 21 [row 5 - first half]
14221 pinsrb m6, [r4 + 4], 1
14222 pinsrb m6, [r4 + 6], 0
14223 pmaddubsw m4, m6, [r5 + 26 * 16]
14226 movh [r0 + 309 * 16], m4
14228 ; mode 21 [row 6 - first half]
14229 pmaddubsw m4, m6, [r5 + 9 * 16]
14232 movh [r0 + 310 * 16], m4
14234 ; mode 21 [row 7 - first half]
14236 pinsrb m6, [r4 + 6], 1
14237 pinsrb m6, [r4 + 8], 0
14238 pmaddubsw m4, m6, [r5 + 24 * 16]
14241 movh [r0 + 311 * 16], m4
14243 ; mode 21 [row 8 - first half]
14244 pmaddubsw m4, m6, [r5 + 7 * 16]
14247 movh [r0 + 312 * 16], m4
14249 ; mode 21 [row 9 - first half]
14251 pinsrb m6, [r4 + 8], 1
14252 pinsrb m6, [r4 + 9], 0
14253 pmaddubsw m4, m6, [r5 + 22 * 16]
14256 movh [r0 + 313 * 16], m4
14258 ; mode 21 [row 10 - first half]
14259 pmaddubsw m4, m6, [r5 + 5 * 16]
14262 movh [r0 + 314 * 16], m4
14264 ; mode 21 [row 11 - first half]
14266 pinsrb m6, [r4 + 9], 1
14267 pinsrb m6, [r4 + 11], 0
14268 pmaddubsw m4, m6, [r5 + 20 * 16]
14271 movh [r0 + 315 * 16], m4
14273 ; mode 21 [row 12 - first half]
14274 pmaddubsw m4, m6, [r5 + 3 * 16]
14277 movh [r0 + 316 * 16], m4
14279 ; mode 21 [row 13 - first half]
14281 pinsrb m6, [r4 + 11], 1
14282 pinsrb m6, [r4 + 13], 0
14283 pmaddubsw m4, m6, [r5 + 18 * 16]
14286 movh [r0 + 317 * 16], m4
14288 ; mode 21 [row 14 - first half]
14289 pmaddubsw m4, m6, [r5 + 1 * 16]
14292 movh [r0 + 318 * 16], m4
14294 ; mode 21 [row 15 - first half]
14296 pinsrb m6, [r4 + 13], 1
14297 pinsrb m6, [r4 + 15], 0
14298 pmaddubsw m4, m6, [r5 + 16 * 16]
14301 movh [r0 + 319 * 16], m4
14303 ; mode 20 [row 13 - second half]
14304 pmaddubsw m4, m7, [r5 + 26 * 16]
14307 movh [r0 + 301 * 16 + 8], m4
14308 ; mode 20 [row 13 - second half]
14310 ; mode 20 [row 14 - second half]
14311 pmaddubsw m4, m7, [r5 + 5 * 16]
14314 movh [r0 + 302 * 16 + 8], m4
14315 ; mode 20 [row 14 - second half]
14317 ; mode 20 [row 3 - first half]
14319 pinsrb m7, [r4 + 2], 1
14320 pinsrb m7, [r4 + 3], 0
14321 pmaddubsw m4, m7, [r5 + 12 * 16]
14324 movh [r0 + 291 * 16], m4
14326 ; mode 20 [row 15 - second half]
14327 pmaddubsw m4, m7, [r5 + 16 * 16]
14330 movh [r0 + 303 * 16 + 8], m4
14331 ; mode 20 [row 15 - second half]
14333 ; mode 20 [row 4 - first half]
14335 pinsrb m7, [r4 + 3], 1
14336 pinsrb m7, [r4 + 5], 0
14337 pmaddubsw m4, m7, [r5 + 23 * 16]
14340 movh [r0 + 292 * 16], m4
14342 ; mode 20 [row 5 - first half]
14343 pmaddubsw m4, m7, [r5 + 2 * 16]
14346 movh [r0 + 293 * 16], m4
14348 ; mode 20 [row 6 - first half]
14350 pinsrb m7, [r4 + 5], 1
14351 pinsrb m7, [r4 + 6], 0
14352 pmaddubsw m4, m7, [r5 + 13 * 16]
14355 movh [r0 + 294 * 16], m4
14357 ; mode 20 [row 7 - first half]
14359 pinsrb m7, [r4 + 6], 1
14360 pinsrb m7, [r4 + 8], 0
14361 pmaddubsw m4, m7, [r5 + 24 * 16]
14364 movh [r0 + 295 * 16], m4
14366 ; mode 20 [row 8 - first half]
14367 pmaddubsw m4, m7, [r5 + 3 * 16]
14370 movh [r0 + 296 * 16], m4
14372 ; mode 20 [row 9 - first half]
14374 pinsrb m7, [r4 + 8], 1
14375 pinsrb m7, [r4 + 9], 0
14376 pmaddubsw m4, m7, [r5 + 14 * 16]
14379 movh [r0 + 297 * 16], m4
14381 ; mode 20 [row 10 - first half]
14383 pinsrb m7, [r4 + 9], 1
14384 pinsrb m7, [r4 + 11], 0
14385 pmaddubsw m4, m7, [r5 + 25 * 16]
14388 movh [r0 + 298 * 16], m4
14390 ; mode 20 [row 11 - first half]
14391 pmaddubsw m4, m7, [r5 + 4 * 16]
14394 movh [r0 + 299 * 16], m4
14396 ; mode 20 [row 12 - first half]
14397 movu m1, [r5 + 15 * 16]
14399 pinsrb m7, [r4 + 11], 1
14400 pinsrb m7, [r4 + 12], 0
14401 pmaddubsw m4, m7, [r5 + 15 * 16]
14404 movh [r0 + 300 * 16], m4
14406 ; mode 20 [row 13 - first half]
14408 pinsrb m7, [r4 + 12], 1
14409 pinsrb m7, [r4 + 14], 0
14410 pmaddubsw m4, m7, [r5 + 26 * 16]
14413 movh [r0 + 301 * 16], m4
14415 ; mode 20 [row 14 - first half]
14416 pmaddubsw m4, m7, [r5 + 5 * 16]
14419 movh [r0 + 302 * 16], m4
14421 ; mode 20 [row 15 - first half]
14423 pinsrb m7, [r4 + 14], 1
14424 pinsrb m7, [r4 + 15], 0
14425 pmaddubsw m4, m7, [r5 + 16 * 16]
14428 movh [r0 + 303 * 16], m4
14432 pinsrb m0, [r4 + 0], 1
14433 pinsrb m0, [r4 + 1], 0
14435 pinsrb m5, [r3 + 8], 1
14436 pinsrb m5, [r3 + 7], 0
14438 ; mode 20 [row 1 - second half]
14439 pmaddubsw m4, m5, [r5 + 22 * 16]
14442 movh [r0 + 289 * 16 + 8], m4
14443 ; mode 20 [row 1 - second half] end
14445 ; mode 20 [row 2 - second half]
14446 pmaddubsw m4, m5, [r5 + 1 * 16]
14449 movh [r0 + 290 * 16 + 8], m4
14450 ; mode 20 [row 2 - second half] end
14452 ; mode 21 [row 1 - second half]
14453 pmaddubsw m4, m5, [r5 + 30 * 16]
14456 movh [r0 + 305 * 16 + 8], m4
14457 ; mode 21 [row 1 - second half] end
14459 ; mode 21 [row 2 - second half]
14460 pmaddubsw m4, m5, [r5 + 13 * 16]
14463 movh [r0 + 306 * 16 + 8], m4
14464 ; mode 21 [row 2 - second half] end
14466 ; mode 21 [row 4 - second half]
14467 pmaddubsw m4, m5, [r5 + 11 * 16]
14470 movh [r0 + 307 * 16 + 8], m4
14471 ; mode 21 [row 4 - second half] end
14473 ; mode 22 [row 2 - second half]
14474 pmaddubsw m4, m5, [r5 + 25 * 16]
14477 movh [r0 + 322 * 16 + 8], m4
14478 ; mode 22 [row 2 - second half] end
14480 ; mode 22 [row 3 - second half]
14481 pmaddubsw m4, m5, [r5 + 12 * 16]
14484 movh [r0 + 323 * 16 + 8], m4
14485 ; mode 22 [row 3 - second half] end
14487 ; mode 23 [row 3 - second half]
14488 pmaddubsw m4, m5, [r5 + 28 * 16]
14491 movh [r0 + 339 * 16 + 8], m4
14492 ; mode 23 [row 3 - second half] end
14494 ; mode 23 [row 4 - second half]
14495 pmaddubsw m4, m5, [r5 + 19 * 16]
14498 movh [r0 + 340 * 16 + 8], m4
14499 ; mode 23 [row 4 - second half] end
14501 ; mode 23 [row 5 - second half]
14502 pmaddubsw m4, m5, [r5 + 10 * 16]
14505 movh [r0 + 341 * 16 + 8], m4
14506 ; mode 23 [row 5 - second half] end
14508 ; mode 23 [row 6 - second half]
14509 pmaddubsw m4, m5, [r5 + 1 * 16]
14512 movh [r0 + 342 * 16 + 8], m4
14513 ; mode 23 [row 6 - second half] end
14515 ; mode 24 [row 6 - second half]
14516 pmaddubsw m4, m5, [r5 + 29 * 16]
14519 movh [r0 + 358 * 16 + 8], m4
14520 ; mode 24 [row 6 - second half] end
14522 ; mode 24 [row 7 - second half]
14523 pmaddubsw m4, m5, [r5 + 24 * 16]
14526 movh [r0 + 359 * 16 + 8], m4
14527 ; mode 24 [row 7 - second half] end
14529 ; mode 24 [row 8 - second half]
14530 pmaddubsw m4, m5, [r5 + 19 * 16]
14533 movh [r0 + 360 * 16 + 8], m4
14534 ; mode 24 [row 8 - second half] end
14536 ; mode 24 [row 9 - second half]
14537 pmaddubsw m4, m5, [r5 + 14 * 16]
14540 movh [r0 + 361 * 16 + 8], m4
14541 ; mode 24 [row 9 - second half] end
14543 ; mode 24 [row 10 - second half]
14544 pmaddubsw m4, m5, [r5 + 9 * 16]
14547 movh [r0 + 362 * 16 + 8], m4
14548 ; mode 24 [row 10 - second half] end
14550 ; mode 24 [row 11 - second half]
14551 pmaddubsw m4, m5, [r5 + 4 * 16]
14554 movh [r0 + 363 * 16 + 8], m4
14555 ; mode 24 [row 11 - second half] end
14557 pmaddubsw m4, m0, [r5 + 12 * 16]
14559 pmaddubsw m6, m5, [r5 + 12 * 16]
14562 movu [r0 + 273 * 16], m4
14566 pinsrb m0, [r4 + 1], 1
14567 pinsrb m0, [r4 + 2], 0
14569 pinsrb m5, [r3 + 7], 1
14570 pinsrb m5, [r3 + 6], 0
14572 ; mode 20 [row 3 - second half]
14573 pmaddubsw m4, m5, [r5 + 12 * 16]
14576 movh [r0 + 291 * 16 + 8], m4
14577 ; mode 20 [row 3 - second half] end
14579 ; mode 21 [row 3 - second half]
14580 pmaddubsw m4, m5, [r5 + 28 * 16]
14583 movh [r0 + 307 * 16 + 8], m4
14584 ; mode 21 [row 3 - second half] end
14586 ; mode 21 [row 4 - second half]
14587 pmaddubsw m4, m5, [r5 + 11 * 16]
14590 movh [r0 + 308 * 16 + 8], m4
14591 ; mode 21 [row 4 - second half] end
14593 ; mode 22 [row 4 - second half]
14594 pmaddubsw m4, m5, [r5 + 31 * 16]
14597 movh [r0 + 324 * 16 + 8], m4
14598 ; mode 22 [row 4 - second half] end
14600 ; mode 22 [row 5 - second half]
14601 pmaddubsw m4, m5, [r5 + 18 * 16]
14604 movh [r0 + 325 * 16 + 8], m4
14605 ; mode 22 [row 5 - second half] end
14607 ; mode 22 [row 6 - second half]
14608 pmaddubsw m4, m5, [r5 + 5 * 16]
14611 movh [r0 + 326 * 16 + 8], m4
14612 ; mode 22 [row 6 - second half] end
14614 ; mode 23 [row 7 - second half]
14615 pmaddubsw m4, m5, [r5 + 24 * 16]
14618 movh [r0 + 343 * 16 + 8], m4
14619 ; mode 23 [row 7 - second half] end
14621 ; mode 23 [row 8 - second half]
14622 pmaddubsw m4, m5, [r5 + 15 * 16]
14625 movh [r0 + 344 * 16 + 8], m4
14626 ; mode 23 [row 8 - second half] end
14628 ; mode 23 [row 9 - second half]
14629 pmaddubsw m4, m5, [r5 + 6 * 16]
14632 movh [r0 + 345 * 16 + 8], m4
14633 ; mode 23 [row 9 - second half] end
14635 ; mode 24 [row 12 - second half]
14636 pmaddubsw m4, m5, [r5 + 31 * 16]
14639 movh [r0 + 364 * 16 + 8], m4
14640 ; mode 24 [row 12 - second half] end
14642 ; mode 24 [row 13 - second half]
14643 pmaddubsw m4, m5, [r5 + 26 * 16]
14646 movh [r0 + 365 * 16 + 8], m4
14647 ; mode 24 [row 13 - second half] end
14649 ; mode 24 [row 14 - second half]
14650 pmaddubsw m4, m5, [r5 + 21 * 16]
14653 movh [r0 + 366 * 16 + 8], m4
14654 ; mode 24 [row 14 - second half] end
14656 ; mode 24 [row 15 - second half]
14657 pmaddubsw m4, m5, [r5 + 16 * 16]
14660 movh [r0 + 367 * 16 + 8], m4
14661 ; mode 24 [row 15 - second half] end
14663 pmaddubsw m4, m0, [r5 + 18 * 16]
14665 pmaddubsw m6, m5, [r5 + 18 * 16]
14668 movu [r0 + 274 * 16], m4
14672 pinsrb m0, [r4 + 2], 1
14673 pinsrb m0, [r4 + 4], 0
14675 pinsrb m5, [r3 + 6], 1
14676 pinsrb m5, [r3 + 5], 0
14678 ; mode 20 [row 4 - second half]
14679 pmaddubsw m4, m5, [r5 + 23 * 16]
14682 movh [r0 + 292 * 16 + 8], m4
14683 ; mode 20 [row 4 - second half] end
14685 ; mode 20 [row 5 - second half]
14686 pmaddubsw m4, m5, [r5 + 2 * 16]
14689 movh [r0 + 293 * 16 + 8], m4
14690 ; mode 20 [row 5 - second half] end
14692 ; mode 21 [row 5 - second half]
14693 pmaddubsw m4, m5, [r5 + 26 * 16]
14696 movh [r0 + 309 * 16 + 8], m4
14697 ; mode 21 [row 5 - second half] end
14699 ; mode 21 [row 6 - second half]
14700 pmaddubsw m4, m5, [r5 + 9 * 16]
14703 movh [r0 + 310 * 16 + 8], m4
14704 ; mode 21 [row 6 - second half] end
14706 ; mode 22 [row 7 - second half]
14707 pmaddubsw m4, m5, [r5 + 24 * 16]
14710 movh [r0 + 327 * 16 + 8], m4
14711 ; mode 22 [row 7 - second half] end
14713 ; mode 22 [row 8 - second half]
14714 pmaddubsw m4, m5, [r5 + 11 * 16]
14717 movh [r0 + 328 * 16 + 8], m4
14718 ; mode 22 [row 8 - second half] end
14720 ; mode 23 [row 10 - second half]
14721 pmaddubsw m4, m5, [r5 + 29 * 16]
14724 movh [r0 + 346 * 16 + 8], m4
14725 ; mode 23 [row 10 - second half] end
14727 ; mode 23 [row 11 - second half]
14728 pmaddubsw m4, m5, [r5 + 20 * 16]
14731 movh [r0 + 347 * 16 + 8], m4
14732 ; mode 23 [row 11 - second half] end
14734 ; mode 23 [row 12 - second half]
14735 pmaddubsw m4, m5, [r5 + 11 * 16]
14738 movh [r0 + 348 * 16 + 8], m4
14739 ; mode 23 [row 12 - second half] end
14741 ; mode 23 [row 13 - second half]
14742 pmaddubsw m4, m5, [r5 + 2 * 16]
14745 movh [r0 + 349 * 16 + 8], m4
14746 ; mode 23 [row 13 - second half] end
14748 pmaddubsw m4, m0, [r5 + 24 * 16]
14750 pmaddubsw m6, m5, [r5 + 24 * 16]
14753 movu [r0 + 275 * 16], m4
14757 pinsrb m0, [r4 + 4], 1
14758 pinsrb m0, [r4 + 5], 0
14760 pinsrb m5, [r3 + 5], 1
14761 pinsrb m5, [r3 + 4], 0
14763 ; mode 20 [row 6 - second half]
14764 pmaddubsw m4, m5, [r5 + 13 * 16]
14767 movh [r0 + 294 * 16 + 8], m4
14768 ; mode 20 [row 6 - second half] end
14770 ; mode 21 [row 7 - second half]
14771 pmaddubsw m4, m5, [r5 + 24 * 16]
14774 movh [r0 + 311 * 16 + 8], m4
14775 ; mode 21 [row 7 - second half] end
14777 ; mode 21 [row 8 - second half]
14778 pmaddubsw m4, m5, [r5 + 7 * 16]
14781 movh [r0 + 312 * 16 + 8], m4
14782 ; mode 21 [row 8 - second half] end
14784 ; mode 22 [row 9 - second half]
14785 pmaddubsw m4, m5, [r5 + 30 * 16]
14788 movh [r0 + 329 * 16 + 8], m4
14789 ; mode 22 [row 9 - second half] end
14791 ; mode 22 [row 10 - second half]
14792 pmaddubsw m4, m5, [r5 + 17 * 16]
14795 movh [r0 + 330 * 16 + 8], m4
14796 ; mode 22 [row 10 - second half] end
14798 ; mode 22 [row 11 - second half]
14799 pmaddubsw m4, m5, [r5 + 4 * 16]
14802 movh [r0 + 331 * 16 + 8], m4
14803 ; mode 22 [row 11 - second half] end
14805 ; mode 23 [row 14 - second half]
14806 pmaddubsw m4, m5, [r5 + 25 * 16]
14809 movh [r0 + 350 * 16 + 8], m4
14810 ; mode 23 [row 14 - second half] end
14812 ; mode 23 [row 15 - second half]
14813 pmaddubsw m4, m5, [r5 + 16 * 16]
14816 movh [r0 + 351 * 16 + 8], m4
14818 ; mode 23 [row 15 - second half] end
14819 pmaddubsw m4, m0, [r5 + 30 * 16]
14821 pmaddubsw m6, m5, [r5 + 30 * 16]
14824 movu [r0 + 276 * 16], m4
14827 pmaddubsw m4, m0, [r5 + 4 * 16]
14829 pmaddubsw m6, m5, [r5 + 4 * 16]
14832 movu [r0 + 277 * 16], m4
14836 pinsrb m0, [r4 + 5], 1
14837 pinsrb m0, [r4 + 6], 0
14839 pinsrb m5, [r3 + 4], 1
14840 pinsrb m5, [r3 + 3], 0
14842 ; mode 20 [row 7 - second half]
14843 pmaddubsw m4, m5, [r5 + 24 * 16]
14846 movh [r0 + 295 * 16 + 8], m4
14847 ; mode 20 [row 7 - second half] end
14849 ; mode 20 [row 8 - second half]
14850 pmaddubsw m4, m5, [r5 + 3 * 16]
14853 movh [r0 + 296 * 16 + 8], m4
14854 ; mode 20 [row 8 - second half] end
14856 ; mode 21 [row 9 - second half]
14857 pmaddubsw m4, m5, [r5 + 22 * 16]
14860 movh [r0 + 313 * 16 + 8], m4
14861 ; mode 21 [row 9 - second half] end
14863 ; mode 21 [row 10 - second half]
14864 pmaddubsw m4, m5, [r5 + 5 * 16]
14867 movh [r0 + 314 * 16 + 8], m4
14868 ; mode 21 [row 10 - second half] end
14870 ; mode 22 [row 12 - second half]
14871 pmaddubsw m4, m5, [r5 + 23 * 16]
14874 movh [r0 + 332 * 16 + 8], m4
14875 ; mode 22 [row 12 - second half] end
14877 ; mode 22 [row 13 - second half]
14878 pmaddubsw m4, m5, [r5 + 10 * 16]
14881 movh [r0 + 333 * 16 + 8], m4
14882 ; mode 22 [row 13 - second half] end
14884 pmaddubsw m4, m0, [r5 + 10 * 16]
14886 pmaddubsw m6, m5, [r5 + 10 * 16]
14889 movu [r0 + 278 * 16], m4
14893 pinsrb m0, [r4 + 6], 1
14894 pinsrb m0, [r4 + 7], 0
14896 pinsrb m5, [r3 + 3], 1
14897 pinsrb m5, [r3 + 2], 0
14899 ; mode 20 [row 9 - second half]
14900 pmaddubsw m4, m5, [r5 + 14 * 16]
14903 movh [r0 + 297 * 16 + 8], m4
14904 ; mode 20 [row 9 - second half]
14906 ; mode 21 [row 11 - second half]
14907 pmaddubsw m4, m5, [r5 + 20 * 16]
14910 movh [r0 + 315 * 16 + 8], m4
14911 ; mode 21 [row 11 - second half] end
14913 ; mode 21 [row 12 - second half]
14914 pmaddubsw m4, m5, [r5 + 3 * 16]
14917 movh [r0 + 316 * 16 + 8], m4
14918 ; mode 21 [row 12 - second half] end
14920 ; mode 22 [row 14 - second half]
14921 pmaddubsw m4, m5, [r5 + 29 * 16]
14924 movh [r0 + 334 * 16 + 8], m4
14925 ; mode 22 [row 14 - second half] end
14927 ; mode 22 [row 15 - second half]
14928 pmaddubsw m4, m5, [r5 + 16 * 16]
14931 movh [r0 + 335 * 16 + 8], m4
14932 ; mode 22 [row 15 - second half] end
14934 pmaddubsw m4, m0, [r5 + 16 * 16]
14936 pmaddubsw m6, m5, [r5 + 16 * 16]
14939 movu [r0 + 279 * 16], m4
14943 pinsrb m0, [r4 + 7], 1
14944 pinsrb m0, [r4 + 9], 0
14946 pinsrb m5, [r3 + 2], 1
14947 pinsrb m5, [r3 + 1], 0
14949 ; mode 20 [row 10 - second half]
14950 pmaddubsw m4, m5, [r5 + 25 * 16]
14953 movh [r0 + 298 * 16 + 8], m4
14954 ; mode 20 [row 10 - second half] end
14956 ; mode 20 [row 11 - second half]
14957 pmaddubsw m4, m5, [r5 + 4 * 16]
14960 movh [r0 + 299 * 16 + 8], m4
14961 ; mode 20 [row 11 - second half] end
14963 ; mode 21 [row 13 - second half]
14964 pmaddubsw m4, m5, [r5 + 18 * 16]
14967 movh [r0 + 317 * 16 + 8], m4
14968 ; mode 21 [row 13 - second half] end
14970 ; mode 21 [row 14 - second half]
14971 pmaddubsw m4, m5, [r5 + 1 * 16]
14974 movh [r0 + 318 * 16 + 8], m4
14975 ; mode 21 [row 14 - second half] end
14977 pmaddubsw m4, m0, [r5 + 22 * 16]
14979 pmaddubsw m6, m5, [r5 + 22 * 16]
14982 movu [r0 + 280 * 16], m4
14986 pinsrb m0, [r4 + 9], 1
14987 pinsrb m0, [r4 + 10], 0
14989 pinsrb m5, [r3 + 1], 1
14990 pinsrb m5, [r3 + 0], 0
14992 ; mode 20 [row 12 - second half]
14993 pmaddubsw m4, m5, [r5 + 15 * 16]
14996 movh [r0 + 300 * 16 + 8], m4
14998 ; mode 20 [row 12 - second half] end
14999 pmaddubsw m4, m0, [r5 + 28 * 16]
15001 pmaddubsw m6, m5, [r5 + 28 * 16]
15004 movu [r0 + 281 * 16], m4
15007 pmaddubsw m4, m0, [r5 + 2 * 16]
15009 pmaddubsw m6, m5, [r5 + 2 * 16]
15012 movu [r0 + 282 * 16], m4
15016 pinsrb m0, [r4 + 10], 1
15017 pinsrb m0, [r4 + 11], 0
15018 pmaddubsw m4, m0, [r5 + 8 * 16]
15021 pinsrb m5, [r4 + 0], 1
15022 pinsrb m5, [r4 + 1], 0
15023 pmaddubsw m6, m5, [r5 + 8 * 16]
15026 movu [r0 + 283 * 16], m4
15030 pinsrb m0, [r4 + 11], 1
15031 pinsrb m0, [r4 + 12], 0
15033 pinsrb m5, [r4 + 1], 1
15034 pinsrb m5, [r4 + 2], 0
15035 pmaddubsw m4, m0, [r5 + 14 * 16]
15037 pmaddubsw m6, m5, [r5 + 14 * 16]
15040 movu [r0 + 284 * 16], m4
15044 pinsrb m0, [r4 + 12], 1
15045 pinsrb m0, [r4 + 14], 0
15046 pmaddubsw m4, m0, [r5 + 20 * 16]
15049 pinsrb m5, [r4 + 2], 1
15050 pinsrb m5, [r4 + 4], 0
15051 pmaddubsw m6, m5, [r5 + 20 * 16]
15054 movu [r0 + 285 * 16], m4
15058 pinsrb m0, [r4 + 14], 1
15059 pinsrb m0, [r4 + 15], 0
15060 pmaddubsw m4, m0, [r5 + 26 * 16]
15063 pinsrb m5, [r4 + 4], 1
15064 pinsrb m5, [r4 + 5], 0
15065 pmaddubsw m6, m5, [r5 + 26 * 16]
15068 movu [r0 + 286 * 16], m4
15072 pshufb m0, [tab_S1]
15073 movu [r0 + 287 * 16], m0
15075 movd [r0 + 287 * 16 + 12], m1
15080 ; mode 26 [all rows]
15082 pinsrb m6, [r1 + 16], 15
15084 movu [r0 + 384 * 16], m6
15085 movu [r0 + 385 * 16], m6
15086 movu [r0 + 386 * 16], m6
15087 movu [r0 + 387 * 16], m6
15088 movu [r0 + 388 * 16], m6
15089 movu [r0 + 389 * 16], m6
15090 movu [r0 + 390 * 16], m6
15091 movu [r0 + 391 * 16], m6
15092 movu [r0 + 392 * 16], m6
15093 movu [r0 + 393 * 16], m6
15094 movu [r0 + 394 * 16], m6
15095 movu [r0 + 395 * 16], m6
15096 movu [r0 + 396 * 16], m6
15097 movu [r0 + 397 * 16], m6
15098 movu [r0 + 398 * 16], m6
15099 movu [r0 + 399 * 16], m6
15108 punpcklbw m5, m4, m0
15118 pextrb [r0 + 384 * 16], m5, 0
15119 pextrb [r0 + 385 * 16], m5, 1
15120 pextrb [r0 + 386 * 16], m5, 2
15121 pextrb [r0 + 387 * 16], m5, 3
15122 pextrb [r0 + 388 * 16], m5, 4
15123 pextrb [r0 + 389 * 16], m5, 5
15124 pextrb [r0 + 390 * 16], m5, 6
15125 pextrb [r0 + 391 * 16], m5, 7
15126 pextrb [r0 + 392 * 16], m5, 8
15127 pextrb [r0 + 393 * 16], m5, 9
15128 pextrb [r0 + 394 * 16], m5, 10
15129 pextrb [r0 + 395 * 16], m5, 11
15130 pextrb [r0 + 396 * 16], m5, 12
15131 pextrb [r0 + 397 * 16], m5, 13
15132 pextrb [r0 + 398 * 16], m5, 14
15133 pextrb [r0 + 399 * 16], m5, 15
15136 movu [r0 + 383 * 16], m1
15144 pmaddubsw m4, m1, [r5 + 30 * 16]
15146 pmaddubsw m5, m2, [r5 + 30 * 16]
15149 movu [r0 + 368 * 16], m4
15152 pmaddubsw m4, m1, [r5 + 28 * 16]
15154 pmaddubsw m5, m2, [r5 + 28 * 16]
15157 movu [r0 + 369 * 16], m4
15160 pmaddubsw m4, m1, [r5 + 26 * 16]
15162 pmaddubsw m5, m2, [r5 + 26 * 16]
15165 movu [r0 + 370 * 16], m4
15168 pmaddubsw m4, m1, [r5 + 24 * 16]
15170 pmaddubsw m5, m2, [r5 + 24 * 16]
15173 movu [r0 + 371 * 16], m4
15176 pmaddubsw m4, m1, [r5 + 22 * 16]
15178 pmaddubsw m5, m2, [r5 + 22 * 16]
15181 movu [r0 + 372 * 16], m4
15184 pmaddubsw m4, m1, [r5 + 20 * 16]
15186 pmaddubsw m5, m2, [r5 + 20 * 16]
15189 movu [r0 + 373 * 16], m4
15192 pmaddubsw m4, m1, [r5 + 18 * 16]
15194 pmaddubsw m5, m2, [r5 + 18 * 16]
15197 movu [r0 + 374 * 16], m4
15200 pmaddubsw m4, m1, [r5 + 16 * 16]
15202 pmaddubsw m5, m2, [r5 + 16 * 16]
15205 movu [r0 + 375 * 16], m4
15208 pmaddubsw m4, m1, [r5 + 14 * 16]
15210 pmaddubsw m5, m2, [r5 + 14 * 16]
15213 movu [r0 + 376 * 16], m4
15216 pmaddubsw m4, m1, [r5 + 12 * 16]
15218 pmaddubsw m5, m2, [r5 + 12 * 16]
15221 movu [r0 + 377 * 16], m4
15224 pmaddubsw m4, m1, [r5 + 10 * 16]
15226 pmaddubsw m5, m2, [r5 + 10 * 16]
15229 movu [r0 + 378 * 16], m4
15232 pmaddubsw m4, m1, [r5 + 8 * 16]
15234 pmaddubsw m5, m2, [r5 + 8 * 16]
15237 movu [r0 + 379 * 16], m4
15240 pmaddubsw m4, m1, [r5 + 6 * 16]
15242 pmaddubsw m5, m2, [r5 + 6 * 16]
15245 movu [r0 + 380 * 16], m4
15248 pmaddubsw m4, m1, [r5 + 4 * 16]
15250 pmaddubsw m5, m2, [r5 + 4 * 16]
15253 movu [r0 + 381 * 16], m4
15256 pmaddubsw m4, m1, [r5 + 2 * 16]
15258 pmaddubsw m5, m2, [r5 + 2 * 16]
15261 movu [r0 + 382 * 16], m4
15266 pinsrb m6, [r1 + 17], 15
15267 movu [r0 + 415 * 16], m6
15273 pmaddubsw m6, m7, [r5 + 2 * 16]
15275 pmaddubsw m5, m4, [r5 + 2 * 16]
15278 movu [r0 + 400 * 16], m6
15281 pmaddubsw m6, m7, [r5 + 4 * 16]
15283 pmaddubsw m5, m4, [r5 + 4 * 16]
15286 movu [r0 + 401 * 16], m6
15289 pmaddubsw m6, m7, [r5 + 6 * 16]
15291 pmaddubsw m5, m4, [r5 + 6 * 16]
15294 movu [r0 + 402 * 16], m6
15297 pmaddubsw m6, m7, [r5 + 8 * 16]
15299 pmaddubsw m5, m4, [r5 + 8 * 16]
15302 movu [r0 + 403 * 16], m6
15305 pmaddubsw m6, m7, [r5 + 10 * 16]
15307 pmaddubsw m5, m4, [r5 + 10 * 16]
15310 movu [r0 + 404 * 16], m6
15313 pmaddubsw m6, m7, [r5 + 12 * 16]
15315 pmaddubsw m5, m4, [r5 + 12 * 16]
15318 movu [r0 + 405 * 16], m6
15321 pmaddubsw m6, m7, [r5 + 14 * 16]
15323 pmaddubsw m5, m4, [r5 + 14 * 16]
15326 movu [r0 + 406 * 16], m6
15329 pmaddubsw m6, m7, [r5 + 16 * 16]
15331 pmaddubsw m5, m4, [r5 + 16 * 16]
15334 movu [r0 + 407 * 16], m6
15337 pmaddubsw m6, m7, [r5 + 18 * 16]
15339 pmaddubsw m5, m4, [r5 + 18 * 16]
15342 movu [r0 + 408 * 16], m6
15345 pmaddubsw m6, m7, [r5 + 20 * 16]
15347 pmaddubsw m5, m4, [r5 + 20 * 16]
15350 movu [r0 + 409 * 16], m6
15353 pmaddubsw m6, m7, [r5 + 22 * 16]
15355 pmaddubsw m5, m4, [r5 + 22 * 16]
15358 movu [r0 + 410 * 16], m6
15361 pmaddubsw m6, m7, [r5 + 24 * 16]
15363 pmaddubsw m5, m4, [r5 + 24 * 16]
15366 movu [r0 + 411 * 16], m6
15369 pmaddubsw m6, m7, [r5 + 26 * 16]
15371 pmaddubsw m5, m4, [r5 + 26 * 16]
15374 movu [r0 + 412 * 16], m6
15377 pmaddubsw m6, m7, [r5 + 28 * 16]
15379 pmaddubsw m5, m4, [r5 + 28 * 16]
15382 movu [r0 + 413 * 16], m6
15385 pmaddubsw m6, m7, [r5 + 30 * 16]
15387 pmaddubsw m5, m4, [r5 + 30 * 16]
15390 movu [r0 + 414 * 16], m6
15399 pmaddubsw m2, m1, [r5 + 5 * 16]
15401 pmaddubsw m5, m4, [r5 + 5 * 16]
15404 movu [r0 + 416 * 16], m2
15407 pmaddubsw m2, m1, [r5 + 5 * 16]
15409 pmaddubsw m5, m4, [r5 + 5 * 16]
15412 movu [r0 + 416 * 16], m2
15415 pmaddubsw m2, m1, [r5 + 10 * 16]
15417 pmaddubsw m5, m4, [r5 + 10 * 16]
15420 movu [r0 + 417 * 16], m2
15423 pmaddubsw m2, m1, [r5 + 15 * 16]
15425 pmaddubsw m5, m4, [r5 + 15 * 16]
15428 movu [r0 + 418 * 16], m2
15431 pmaddubsw m2, m1, [r5 + 20 * 16]
15433 pmaddubsw m5, m4, [r5 + 20 * 16]
15436 movu [r0 + 419 * 16], m2
15439 pmaddubsw m2, m1, [r5 + 25 * 16]
15441 pmaddubsw m5, m4, [r5 + 25 * 16]
15444 movu [r0 + 420 * 16], m2
15447 pmaddubsw m2, m1, [r5 + 30 * 16]
15449 pmaddubsw m5, m4, [r5 + 30 * 16]
15452 movu [r0 + 421 * 16], m2
15455 pmaddubsw m2, m1, [r5 + 9 * 16]
15457 pmaddubsw m5, m4, [r5 + 9 * 16]
15460 movu [r0 + 432 * 16], m2
15463 pmaddubsw m2, m1, [r5 + 18 * 16]
15465 pmaddubsw m5, m4, [r5 + 18 * 16]
15468 movu [r0 + 433 * 16], m2
15471 pmaddubsw m2, m1, [r5 + 27 * 16]
15473 pmaddubsw m5, m4, [r5 + 27 * 16]
15476 movu [r0 + 434 * 16], m2
15479 pmaddubsw m2, m1, [r5 + 13 * 16]
15481 pmaddubsw m5, m4, [r5 + 13 * 16]
15484 movu [r0 + 448 * 16], m2
15487 pmaddubsw m2, m1, [r5 + 26 * 16]
15489 pmaddubsw m5, m4, [r5 + 26 * 16]
15492 movu [r0 + 449 * 16], m2
15495 movu [r0 + 496 * 16], m2
15498 pmaddubsw m2, m1, [r5 + 17 * 16]
15500 pmaddubsw m5, m4, [r5 + 17 * 16]
15503 movu [r0 + 464 * 16], m2
15506 pmaddubsw m2, m1, [r5 + 21 * 16]
15508 pmaddubsw m5, m4, [r5 + 21 * 16]
15511 movu [r0 + 480 * 16], m2
15516 pmaddubsw m2, m7, [r5 + 3 * 16]
15520 pmaddubsw m5, m6, [r5 + 3 * 16]
15523 movu [r0 + 422 * 16], m2
15526 pmaddubsw m2, m7, [r5 + 8 * 16]
15528 pmaddubsw m5, m6, [r5 + 8 * 16]
15531 movu [r0 + 423 * 16], m2
15534 pmaddubsw m2, m7, [r5 + 13 * 16]
15536 pmaddubsw m5, m6, [r5 + 13 * 16]
15539 movu [r0 + 424 * 16], m2
15542 pmaddubsw m2, m7, [r5 + 18 * 16]
15544 pmaddubsw m5, m6, [r5 + 18 * 16]
15547 movu [r0 + 425 * 16], m2
15550 pmaddubsw m2, m7, [r5 + 23 * 16]
15552 pmaddubsw m5, m6, [r5 + 23 * 16]
15555 movu [r0 + 426 * 16], m2
15558 pmaddubsw m2, m7, [r5 + 4 * 16]
15560 pmaddubsw m5, m6, [r5 + 4 * 16]
15563 movu [r0 + 435 * 16], m2
15566 pmaddubsw m2, m7, [r5 + 13 * 16]
15568 pmaddubsw m5, m6, [r5 + 13 * 16]
15571 movu [r0 + 436 * 16], m2
15574 pmaddubsw m2, m7, [r5 + 22 * 16]
15576 pmaddubsw m5, m6, [r5 + 22 * 16]
15579 movu [r0 + 437 * 16], m2
15582 pmaddubsw m2, m7, [r5 + 31 * 16]
15584 pmaddubsw m5, m6, [r5 + 31 * 16]
15587 movu [r0 + 438 * 16], m2
15590 movu [r0 + 482 * 16], m2
15593 pmaddubsw m2, m7, [r5 + 7 * 16]
15595 pmaddubsw m5, m6, [r5 + 7 * 16]
15598 movu [r0 + 450 * 16], m2
15601 pmaddubsw m2, m7, [r5 + 20 * 16]
15603 pmaddubsw m5, m6, [r5 + 20 * 16]
15606 movu [r0 + 451 * 16], m2
15609 movu [r0 + 497 * 16], m2
15612 pmaddubsw m2, m7, [r5 + 2 * 16]
15614 pmaddubsw m5, m6, [r5 + 2 * 16]
15617 movu [r0 + 465 * 16], m2
15620 pmaddubsw m2, m7, [r5 + 19 * 16]
15622 pmaddubsw m5, m6, [r5 + 19 * 16]
15625 movu [r0 + 466 * 16], m2
15628 pmaddubsw m2, m7, [r5 + 10 * 16]
15630 pmaddubsw m5, m6, [r5 + 10 * 16]
15633 movu [r0 + 481 * 16], m2
15636 pmaddubsw m2, m7, [r5 + 28 * 16]
15638 pmaddubsw m5, m6, [r5 + 28 * 16]
15641 movu [r0 + 427 * 16], m2
15646 pmaddubsw m2, m1, [r5 + 1 * 16]
15650 pmaddubsw m5, m4, [r5 + 1 * 16]
15653 movu [r0 + 428 * 16], m2
15656 movu [r0 + 452 * 16], m2
15659 pmaddubsw m2, m1, [r5 + 6 * 16]
15661 pmaddubsw m5, m4, [r5 + 6 * 16]
15664 movu [r0 + 429 * 16], m2
15667 pmaddubsw m2, m1, [r5 + 11 * 16]
15669 pmaddubsw m5, m4, [r5 + 11 * 16]
15672 movu [r0 + 430 * 16], m2
15675 pmaddubsw m2, m1, [r5 + 16 * 16]
15677 pmaddubsw m5, m4, [r5 + 16 * 16]
15680 movu [r0 + 431 * 16], m2
15683 pmaddubsw m2, m1, [r5 + 8 * 16]
15685 pmaddubsw m5, m4, [r5 + 8 * 16]
15688 movu [r0 + 439 * 16], m2
15691 pmaddubsw m2, m1, [r5 + 17 * 16]
15693 pmaddubsw m5, m4, [r5 + 17 * 16]
15696 movu [r0 + 440 * 16], m2
15699 pmaddubsw m2, m1, [r5 + 26 * 16]
15701 pmaddubsw m5, m4, [r5 + 26 * 16]
15704 movu [r0 + 441 * 16], m2
15707 pmaddubsw m2, m1, [r5 + 14 * 16]
15709 pmaddubsw m5, m4, [r5 + 14 * 16]
15712 movu [r0 + 453 * 16], m2
15715 movu [r0 + 498 * 16], m2
15718 pmaddubsw m2, m1, [r5 + 27 * 16]
15720 pmaddubsw m5, m4, [r5 + 27 * 16]
15723 movu [r0 + 454 * 16], m2
15726 pmaddubsw m2, m1, [r5 + 4 * 16]
15728 pmaddubsw m5, m4, [r5 + 4 * 16]
15731 movu [r0 + 467 * 16], m2
15734 pmaddubsw m2, m1, [r5 + 21 * 16]
15736 pmaddubsw m5, m4, [r5 + 21 * 16]
15739 movu [r0 + 468 * 16], m2
15742 pmaddubsw m2, m1, [r5 + 20 * 16]
15744 pmaddubsw m5, m4, [r5 + 20 * 16]
15747 movu [r0 + 483 * 16], m2
15752 pmaddubsw m2, m7, [r5 + 3 * 16]
15756 pmaddubsw m5, m6, [r5 + 3 * 16]
15759 movu [r0 + 442 * 16], m2
15762 pmaddubsw m2, m7, [r5 + 12 * 16]
15764 pmaddubsw m5, m6, [r5 + 12 * 16]
15767 movu [r0 + 443 * 16], m2
15770 pmaddubsw m2, m7, [r5 + 21 * 16]
15772 pmaddubsw m5, m6, [r5 + 21 * 16]
15775 movu [r0 + 444 * 16], m2
15778 movu [r0 + 456 * 16], m2
15781 pmaddubsw m2, m7, [r5 + 30 * 16]
15783 pmaddubsw m5, m6, [r5 + 30 * 16]
15786 movu [r0 + 445 * 16], m2
15789 movu [r0 + 485 * 16], m2
15792 pmaddubsw m2, m7, [r5 + 8 * 16]
15794 pmaddubsw m5, m6, [r5 + 8 * 16]
15797 movu [r0 + 455 * 16], m2
15800 movu [r0 + 499 * 16], m2
15803 pmaddubsw m2, m7, [r5 + 6 * 16]
15805 pmaddubsw m5, m6, [r5 + 6 * 16]
15808 movu [r0 + 469 * 16], m2
15811 pmaddubsw m2, m7, [r5 + 23 * 16]
15813 pmaddubsw m5, m6, [r5 + 23 * 16]
15816 movu [r0 + 470 * 16], m2
15819 pmaddubsw m2, m7, [r5 + 9 * 16]
15821 pmaddubsw m5, m6, [r5 + 9 * 16]
15824 movu [r0 + 484 * 16], m2
15832 pmaddubsw m2, m1, [r5 + 7 * 16]
15836 pmaddubsw m5, m4, [r5 + 7 * 16]
15839 movu [r0 + 446 * 16], m2
15842 pmaddubsw m2, m1, [r5 + 16 * 16]
15844 pmaddubsw m5, m4, [r5 + 16 * 16]
15847 movu [r0 + 447 * 16], m2
15850 pmaddubsw m2, m1, [r5 + 2 * 16]
15852 pmaddubsw m5, m4, [r5 + 2 * 16]
15855 movu [r0 + 457 * 16], m2
15858 movu [r0 + 500 * 16], m2
15861 pmaddubsw m2, m1, [r5 + 15 * 16]
15863 pmaddubsw m5, m4, [r5 + 15 * 16]
15866 movu [r0 + 458 * 16], m2
15869 pmaddubsw m2, m1, [r5 + 28 * 16]
15871 pmaddubsw m5, m4, [r5 + 28 * 16]
15874 movu [r0 + 459 * 16], m2
15877 movu [r0 + 501 * 16], m2
15880 pmaddubsw m2, m1, [r5 + 8 * 16]
15882 pmaddubsw m5, m4, [r5 + 8 * 16]
15885 movu [r0 + 471 * 16], m2
15888 pmaddubsw m2, m1, [r5 + 25 * 16]
15890 pmaddubsw m5, m4, [r5 + 25 * 16]
15893 movu [r0 + 472 * 16], m2
15896 pmaddubsw m2, m1, [r5 + 19 * 16]
15898 pmaddubsw m5, m4, [r5 + 19 * 16]
15901 movu [r0 + 486 * 16], m2
15906 pmaddubsw m2, m7, [r5 + 9 * 16]
15910 pmaddubsw m5, m6, [r5 + 9 * 16]
15913 movu [r0 + 460 * 16], m2
15916 pmaddubsw m2, m7, [r5 + 22 * 16]
15918 pmaddubsw m5, m6, [r5 + 22 * 16]
15921 movu [r0 + 461 * 16], m2
15924 movu [r0 + 502 * 16], m2
15927 pmaddubsw m2, m7, [r5 + 10 * 16]
15929 pmaddubsw m5, m6, [r5 + 10 * 16]
15932 movu [r0 + 473 * 16], m2
15935 pmaddubsw m2, m7, [r5 + 27 * 16]
15937 pmaddubsw m5, m6, [r5 + 27 * 16]
15940 movu [r0 + 474 * 16], m2
15943 pmaddubsw m2, m7, [r5 + 8 * 16]
15945 pmaddubsw m5, m6, [r5 + 8 * 16]
15948 movu [r0 + 487 * 16], m2
15951 pmaddubsw m2, m7, [r5 + 29 * 16]
15953 pmaddubsw m5, m6, [r5 + 29 * 16]
15956 movu [r0 + 488 * 16], m2
15965 pmaddubsw m2, m1, [r5 + 3 * 16]
15969 pmaddubsw m5, m4, [r5 + 3 * 16]
15972 movu [r0 + 462 * 16], m2
15975 pmaddubsw m2, m1, [r5 + 16 * 16]
15977 pmaddubsw m5, m4, [r5 + 16 * 16]
15980 movu [r0 + 463 * 16], m2
15983 movu [r0 + 503 * 16], m2
15986 pmaddubsw m2, m1, [r5 + 12 * 16]
15988 pmaddubsw m5, m4, [r5 + 12 * 16]
15991 movu [r0 + 475 * 16], m2
15994 pmaddubsw m2, m1, [r5 + 29 * 16]
15996 pmaddubsw m5, m4, [r5 + 29 * 16]
15999 movu [r0 + 476 * 16], m2
16002 pmaddubsw m2, m1, [r5 + 18 * 16]
16004 pmaddubsw m5, m4, [r5 + 18 * 16]
16007 movu [r0 + 489 * 16], m2
16012 pmaddubsw m2, m7, [r5 + 14 * 16]
16016 pmaddubsw m5, m6, [r5 + 14 * 16]
16019 movu [r0 + 477 * 16], m2
16022 pmaddubsw m2, m7, [r5 + 31 * 16]
16024 pmaddubsw m5, m6, [r5 + 31 * 16]
16027 movu [r0 + 478 * 16], m2
16030 pmaddubsw m2, m7, [r5 + 7 * 16]
16032 pmaddubsw m5, m6, [r5 + 7 * 16]
16035 movu [r0 + 490 * 16], m2
16038 pmaddubsw m2, m7, [r5 + 28 * 16]
16040 pmaddubsw m5, m6, [r5 + 28 * 16]
16043 movu [r0 + 491 * 16], m2
16046 pmaddubsw m2, m7, [r5 + 10 * 16]
16048 pmaddubsw m5, m6, [r5 + 10 * 16]
16051 movu [r0 + 504 * 16], m2
16056 pmaddubsw m2, m1, [r5 + 16 * 16]
16060 pmaddubsw m5, m4, [r5 + 16 * 16]
16063 movu [r0 + 479 * 16], m2
16066 pmaddubsw m2, m1, [r5 + 17 * 16]
16068 pmaddubsw m5, m4, [r5 + 17 * 16]
16071 movu [r0 + 492 * 16], m2
16074 pmaddubsw m2, m1, [r5 + 4 * 16]
16076 pmaddubsw m5, m4, [r5 + 4 * 16]
16079 movu [r0 + 505 * 16], m2
16082 pmaddubsw m2, m1, [r5 + 30 * 16]
16084 pmaddubsw m5, m4, [r5 + 30 * 16]
16087 movu [r0 + 506 * 16], m2
16090 pmaddubsw m2, m1, [r5 + 4 * 16]
16092 pmaddubsw m5, m4, [r5 + 4 * 16]
16095 movu [r0 + 505 * 16], m2
16100 pmaddubsw m2, m7, [r5 + 6 * 16]
16105 pmaddubsw m5, m6, [r5 + 6 * 16]
16108 movu [r0 + 493 * 16], m2
16111 pmaddubsw m2, m7, [r5 + 27 * 16]
16113 pmaddubsw m5, m6, [r5 + 27 * 16]
16116 movu [r0 + 494 * 16], m2
16119 pmaddubsw m2, m7, [r5 + 24 * 16]
16121 pmaddubsw m5, m6, [r5 + 24 * 16]
16124 movu [r0 + 507 * 16], m2
16129 pmaddubsw m2, m1, [r5 + 16 * 16]
16132 pinsrb m4, [r3 + 26], 14
16133 pinsrb m4, [r3 + 27], 15
16136 pmaddubsw m5, m4, [r5 + 16 * 16]
16139 movu [r0 + 495 * 16], m2
16142 pmaddubsw m2, m1, [r5 + 18 * 16]
16144 pmaddubsw m5, m4, [r5 + 18 * 16]
16147 movu [r0 + 508 * 16], m2
16152 pmaddubsw m2, m7, [r5 + 12 * 16]
16156 pmaddubsw m5, m6, [r5 + 12 * 16]
16159 movu [r0 + 509 * 16], m2
16164 pmaddubsw m2, m1, [r5 + 6 * 16]
16168 pmaddubsw m5, m4, [r5 + 6 * 16]
16171 movu [r0 + 510 * 16], m2
16175 movu [r0 + 512 * 16], m1
16177 palignr m3, m2, m1, 1
16178 movu [r0 + 513 * 16], m3
16179 palignr m3, m2, m1, 2
16180 movu [r0 + 514 * 16], m3
16181 palignr m3, m2, m1, 3
16182 movu [r0 + 515 * 16], m3
16183 palignr m3, m2, m1, 4
16184 movu [r0 + 516 * 16], m3
16185 palignr m3, m2, m1, 5
16186 movu [r0 + 517 * 16], m3
16187 palignr m3, m2, m1, 6
16188 movu [r0 + 518 * 16], m3
16189 palignr m3, m2, m1, 7
16190 movu [r0 + 519 * 16], m3
16191 palignr m3, m2, m1, 8
16192 movu [r0 + 520 * 16], m3
16193 palignr m3, m2, m1, 9
16194 movu [r0 + 521 * 16], m3
16195 palignr m3, m2, m1, 10
16196 movu [r0 + 522 * 16], m3
16197 palignr m3, m2, m1, 11
16198 movu [r0 + 523 * 16], m3
16199 palignr m3, m2, m1, 12
16200 movu [r0 + 524 * 16], m3
16203 movu [r0 + 511 * 16], m3
16206 palignr m3, m2, m1, 13
16207 movu [r0 + 525 * 16], m3
16208 palignr m3, m2, m1, 14
16209 movu [r0 + 526 * 16], m3
16210 palignr m3, m2, m1, 15
16211 movu [r0 + 527 * 16], m3
16215 ;-----------------------------------------------------------------------------
16216 ; void all_angs_pred_32x32(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
16217 ;-----------------------------------------------------------------------------
16219 cglobal all_angs_pred_32x32, 6, 6, 8, dest, above0, left0, above1, left1, bLuma
16223 movu [r0 + 0 * 16], m0
16225 movu [r0 + 1 * 16], m1
16228 movu [r0 + 478 * 16], m0
16229 movu [r0 + 479 * 16], m1
16233 palignr m3, m1, m0, 1
16234 movu [r0 + 2 * 16], m3
16235 palignr m4, m2, m1, 1
16236 movu [r0 + 3 * 16], m4
16239 movu [r0 + 510 * 16], m3
16240 movu [r0 + 511 * 16], m4
16243 movu [r0 + 34 * 16], m4
16245 movu [r0 + 35 * 16], m5
16248 palignr m3, m1, m0, 2
16249 movu [r0 + 4 * 16], m3
16250 palignr m4, m2, m1, 2
16251 movu [r0 + 5 * 16], m4
16254 movu [r0 + 36 * 16], m4
16256 palignr m7, m6, m5, 1
16257 movu [r0 + 37 * 16], m7
16260 palignr m3, m1, m0, 3
16261 movu [r0 + 6 * 16], m3
16262 palignr m4, m2, m1, 3
16263 movu [r0 + 7 * 16], m4
16266 movu [r0 + 38 * 16], m4
16267 palignr m7, m6, m5, 2
16268 movu [r0 + 39 * 16], m7
16271 palignr m3, m1, m0, 4
16272 movu [r0 + 8 * 16], m3
16273 palignr m4, m2, m1, 4
16274 movu [r0 + 9 * 16], m4
16277 movu [r0 + 446 * 16], m3
16278 movu [r0 + 447 * 16], m4
16281 movu [r0 + 40 * 16], m4
16282 palignr m7, m6, m5, 3
16283 movu [r0 + 41 * 16], m7
16286 movu [r0 + 190 * 16], m4
16287 movu [r0 + 191 * 16], m7
16290 palignr m3, m1, m0, 5
16291 movu [r0 + 10 * 16], m3
16292 palignr m4, m2, m1, 5
16293 movu [r0 + 11 * 16], m4
16296 movu [r0 + 42 * 16], m4
16297 palignr m7, m6, m5, 4
16298 movu [r0 + 43 * 16], m7
16301 palignr m3, m1, m0, 6
16302 movu [r0 + 12 * 16], m3
16303 palignr m4, m2, m1, 6
16304 movu [r0 + 13 * 16], m4
16307 movu [r0 + 44 * 16], m4
16308 palignr m7, m6, m5, 5
16309 movu [r0 + 45 * 16], m7
16312 palignr m3, m1, m0, 7
16313 movu [r0 + 14 * 16], m3
16314 palignr m4, m2, m1, 7
16315 movu [r0 + 15 * 16], m4
16318 movu [r0 + 46 * 16], m4
16319 palignr m7, m6, m5, 6
16320 movu [r0 + 47 * 16], m7
16323 palignr m3, m1, m0, 8
16324 movu [r0 + 16 * 16], m3
16325 palignr m4, m2, m1, 8
16326 movu [r0 + 17 * 16], m4
16329 movu [r0 + 382 * 16], m3
16330 movu [r0 + 383 * 16], m4
16333 movu [r0 + 48 * 16], m4
16334 palignr m7, m6, m5, 7
16335 movu [r0 + 49 * 16], m7
16338 palignr m3, m1, m0, 9
16339 movu [r0 + 18 * 16], m3
16340 palignr m4, m2, m1, 9
16341 movu [r0 + 19 * 16], m4
16344 movu [r0 + 50 * 16], m4
16345 palignr m7, m6, m5, 8
16346 movu [r0 + 51 * 16], m7
16349 movu [r0 + 126 * 16], m4
16350 movu [r0 + 127 * 16], m7
16353 palignr m3, m1, m0, 10
16354 movu [r0 + 20 * 16], m3
16355 palignr m4, m2, m1, 10
16356 movu [r0 + 21 * 16], m4
16359 movu [r0 + 52 * 16], m4
16360 palignr m7, m6, m5, 9
16361 movu [r0 + 53 * 16], m7
16364 palignr m3, m1, m0, 11
16365 movu [r0 + 22 * 16], m3
16366 palignr m4, m2, m1, 11
16367 movu [r0 + 23 * 16], m4
16370 movu [r0 + 54 * 16], m4
16371 palignr m7, m6, m5, 10
16372 movu [r0 + 55 * 16], m7
16375 palignr m3, m1, m0, 12
16376 movu [r0 + 24 * 16], m3
16377 palignr m4, m2, m1, 12
16378 movu [r0 + 25 * 16], m4
16381 movu [r0 + 318 * 16], m3
16382 movu [r0 + 319 * 16], m4
16385 movu [r0 + 94 * 16], m3
16386 movu [r0 + 95 * 16], m4
16389 movu [r0 + 56 * 16], m4
16390 palignr m7, m6, m5, 11
16391 movu [r0 + 57 * 16], m7
16394 palignr m3, m1, m0, 13
16395 movu [r0 + 26 * 16], m3
16396 palignr m4, m2, m1, 13
16397 movu [r0 + 27 * 16], m4
16400 movu [r0 + 58 * 16], m4
16401 palignr m7, m6, m5, 12
16402 movu [r0 + 59 * 16], m7
16405 palignr m3, m1, m0, 14
16406 movu [r0 + 28 * 16], m3
16407 palignr m4, m2, m1, 14
16408 movu [r0 + 29 * 16], m4
16411 movu [r0 + 60 * 16], m4
16412 palignr m7, m6, m5, 13
16413 movu [r0 + 61 * 16], m7
16416 palignr m3, m1, m0, 15
16417 movu [r0 + 30 * 16], m3
16418 palignr m4, m2, m1, 15
16419 movu [r0 + 31 * 16], m4
16422 movu [r0 + 62 * 16], m4
16423 palignr m7, m6, m5, 14
16424 movu [r0 + 63 * 16], m7
16427 movu [r0 + 32 * 16], m1
16428 movu [r0 + 33 * 16], m2
16431 movu [r0 + 254 * 16], m1
16432 movu [r0 + 255 * 16], m2
16435 lea r5, [ang_table]
16436 movu m6, [r5 + 26 * 16]
16437 movu m7, [pw_1024 ]
16440 pmaddubsw m0, m1, m6
16446 pmaddubsw m3, m2, m6
16449 movu [r0 + 64 * 16], m0
16451 ; mode 6 [row 1 - first half]
16452 movu [r0 + 258 * 16], m0
16454 ; mode 9 [row 12 - first half]
16455 movu [r0 + 472 * 16], m0
16461 pmaddubsw m3, m0, m6
16467 pmaddubsw m5, m4, m6
16470 movu [r0 + 65 * 16], m3
16472 ; mode 6 [row 1 - second half]
16473 movu [r0 + 259 * 16], m3
16475 ; mode 9 [row 12 - second half]
16476 movu [r0 + 473 * 16], m3
16479 movu m6, [r5 + 21 * 16]
16480 pmaddubsw m3, m1, m6
16482 pmaddubsw m5, m2, m6
16485 movu [r0 + 128 * 16], m3
16486 pmaddubsw m3, m0, m6
16488 pmaddubsw m5, m4, m6
16491 movu [r0 + 129 * 16], m3
16494 movu m6, [r5 + 17 * 16]
16495 pmaddubsw m3, m1, m6
16497 pmaddubsw m5, m2, m6
16500 movu [r0 + 192 * 16], m3
16501 pmaddubsw m3, m0, m6
16503 pmaddubsw m5, m4, m6
16506 movu [r0 + 193 * 16], m3
16509 movu m6, [r5 + 13 * 16]
16510 pmaddubsw m3, m1, m6
16512 pmaddubsw m5, m2, m6
16515 movu [r0 + 256 * 16], m3
16516 pmaddubsw m3, m0, m6
16518 pmaddubsw m5, m4, m6
16521 movu [r0 + 257 * 16], m3
16524 movu m6, [r5 + 9 * 16]
16525 pmaddubsw m3, m1, m6
16527 pmaddubsw m5, m2, m6
16530 movu [r0 + 320 * 16], m3
16531 pmaddubsw m3, m0, m6
16533 pmaddubsw m5, m4, m6
16536 movu [r0 + 321 * 16], m3
16539 movu m6, [r5 + 18 * 16]
16540 pmaddubsw m3, m1, m6
16542 pmaddubsw m5, m2, m6
16545 movu [r0 + 322 * 16], m3
16547 ; mode 9 [row 8 - first half]
16548 movu [r0 + 464 * 16], m3
16550 pmaddubsw m3, m0, m6
16552 pmaddubsw m5, m4, m6
16555 movu [r0 + 323 * 16], m3
16557 ; mode 9 [row 8 - second half]
16558 movu [r0 + 465 * 16], m3
16561 movu m6, [r5 + 27 * 16]
16562 pmaddubsw m3, m1, m6
16564 pmaddubsw m5, m2, m6
16567 movu [r0 + 324 * 16], m3
16568 pmaddubsw m3, m0, m6
16570 pmaddubsw m5, m4, m6
16573 movu [r0 + 325 * 16], m3
16576 movu m6, [r5 + 5 * 16]
16577 pmaddubsw m3, m1, m6
16579 pmaddubsw m5, m2, m6
16582 movu [r0 + 384 * 16], m3
16583 pmaddubsw m3, m0, m6
16585 pmaddubsw m5, m4, m6
16588 movu [r0 + 385 * 16], m3
16591 movu m6, [r5 + 10 * 16]
16592 pmaddubsw m3, m1, m6
16594 pmaddubsw m5, m2, m6
16597 movu [r0 + 386 * 16], m3
16599 ; mode 9 [row 4 - first half]
16600 movu [r0 + 456 * 16], m3
16602 pmaddubsw m3, m0, m6
16604 pmaddubsw m5, m4, m6
16607 movu [r0 + 387 * 16], m3
16609 ; mode 9 [row 4 - second half]
16610 movu [r0 + 457 * 16], m3
16613 movu m6, [r5 + 15 * 16]
16614 pmaddubsw m3, m1, m6
16616 pmaddubsw m5, m2, m6
16619 movu [r0 + 388 * 16], m3
16620 pmaddubsw m3, m0, m6
16622 pmaddubsw m5, m4, m6
16625 movu [r0 + 389 * 16], m3
16628 movu m6, [r5 + 20 * 16]
16629 pmaddubsw m3, m1, m6
16631 pmaddubsw m5, m2, m6
16634 movu [r0 + 390 * 16], m3
16636 ; mode 9 [row 9 - first half]
16637 movu [r0 + 466 * 16], m3
16639 pmaddubsw m3, m0, m6
16641 pmaddubsw m5, m4, m6
16644 movu [r0 + 391 * 16], m3
16646 ; mode 9 [row 9 - second half]
16647 movu [r0 + 467 * 16], m3
16650 movu m6, [r5 + 25 * 16]
16651 pmaddubsw m3, m1, m6
16653 pmaddubsw m5, m2, m6
16656 movu [r0 + 392 * 16], m3
16657 pmaddubsw m3, m0, m6
16659 pmaddubsw m5, m4, m6
16662 movu [r0 + 393 * 16], m3
16665 movu m6, [r5 + 30 * 16]
16666 pmaddubsw m3, m1, m6
16668 pmaddubsw m5, m2, m6
16671 movu [r0 + 394 * 16], m3
16673 ; mode 9 [row 14 - first half]
16674 movu [r0 + 476 * 16], m3
16676 pmaddubsw m3, m0, m6
16678 pmaddubsw m5, m4, m6
16681 movu [r0 + 395 * 16], m3
16683 ; mode 9 [row 14 - second half]
16684 movu [r0 + 477 * 16], m3
16687 movu m6, [r5 + 2 * 16]
16688 pmaddubsw m3, m1, m6
16690 pmaddubsw m5, m2, m6
16693 movu [r0 + 448 * 16], m3
16694 pmaddubsw m3, m0, m6
16696 pmaddubsw m5, m4, m6
16699 movu [r0 + 449 * 16], m3
16702 movu m6, [r5 + 4 * 16]
16703 pmaddubsw m3, m1, m6
16705 pmaddubsw m5, m2, m6
16708 movu [r0 + 450 * 16], m3
16709 pmaddubsw m3, m0, m6
16711 pmaddubsw m5, m4, m6
16714 movu [r0 + 451 * 16], m3
16717 movu m6, [r5 + 6 * 16]
16718 pmaddubsw m3, m1, m6
16720 pmaddubsw m5, m2, m6
16723 movu [r0 + 452 * 16], m3
16724 pmaddubsw m3, m0, m6
16726 pmaddubsw m5, m4, m6
16729 movu [r0 + 453 * 16], m3
16732 movu m6, [r5 + 8 * 16]
16733 pmaddubsw m3, m1, m6
16735 pmaddubsw m5, m2, m6
16738 movu [r0 + 454 * 16], m3
16739 pmaddubsw m3, m0, m6
16741 pmaddubsw m5, m4, m6
16744 movu [r0 + 455 * 16], m3
16747 movu m6, [r5 + 12 * 16]
16748 pmaddubsw m3, m1, m6
16750 pmaddubsw m5, m2, m6
16753 movu [r0 + 458 * 16], m3
16754 pmaddubsw m3, m0, m6
16756 pmaddubsw m5, m4, m6
16759 movu [r0 + 459 * 16], m3
16762 movu m6, [r5 + 14 * 16]
16763 pmaddubsw m3, m1, m6
16765 pmaddubsw m5, m2, m6
16768 movu [r0 + 460 * 16], m3
16769 pmaddubsw m3, m0, m6
16771 pmaddubsw m5, m4, m6
16774 movu [r0 + 461 * 16], m3
16777 movu m6, [r5 + 16 * 16]
16778 pmaddubsw m3, m1, m6
16780 pmaddubsw m5, m2, m6
16783 movu [r0 + 462 * 16], m3
16784 pmaddubsw m3, m0, m6
16786 pmaddubsw m5, m4, m6
16789 movu [r0 + 463 * 16], m3
16792 movu m6, [r5 + 22 * 16]
16793 pmaddubsw m3, m1, m6
16795 pmaddubsw m5, m2, m6
16798 movu [r0 + 468 * 16], m3
16799 pmaddubsw m3, m0, m6
16801 pmaddubsw m5, m4, m6
16804 movu [r0 + 469 * 16], m3
16807 movu m6, [r5 + 24 * 16]
16808 pmaddubsw m3, m1, m6
16810 pmaddubsw m5, m2, m6
16813 movu [r0 + 470 * 16], m3
16814 pmaddubsw m3, m0, m6
16816 pmaddubsw m5, m4, m6
16819 movu [r0 + 471 * 16], m3
16822 movu m6, [r5 + 28 * 16]
16823 pmaddubsw m3, m1, m6
16825 pmaddubsw m5, m2, m6
16828 movu [r0 + 474 * 16], m3
16829 pmaddubsw m3, m0, m6
16831 pmaddubsw m5, m4, m6
16834 movu [r0 + 475 * 16], m3
16837 movu m6, [r5 + 20 * 16]
16842 pmaddubsw m1, m0, m6
16848 pmaddubsw m3, m2, m6
16851 movu [r0 + 66 * 16], m1
16853 ; mode 6 [row 3 - first half]
16854 movu [r0 + 262 * 16], m1
16856 ; mode 9 [row 25 - first half]
16857 movu [r0 + 498 * 16], m1
16863 pmaddubsw m3, m1, m6
16869 pmaddubsw m5, m4, m6
16872 movu [r0 + 67 * 16], m3
16874 ; mode 6 [row 3 - second half]
16875 movu [r0 + 263 * 16], m3
16877 ; mode 9 [row 25 - second half]
16878 movu [r0 + 499 * 16], m3
16881 movu m6, [r5 + 10 * 16]
16882 pmaddubsw m3, m0, m6
16884 pmaddubsw m5, m2, m6
16887 movu [r0 + 130 * 16], m3
16889 ; mode 9 [row 20 - first half]
16890 movu [r0 + 488 * 16], m3
16892 pmaddubsw m3, m1, m6
16894 pmaddubsw m5, m4, m6
16897 movu [r0 + 131 * 16], m3
16899 ; mode 9 [row 20 - second half]
16900 movu [r0 + 489 * 16], m3
16903 movu m6, [r5 + 31 * 16]
16904 pmaddubsw m3, m0, m6
16906 pmaddubsw m5, m2, m6
16909 movu [r0 + 132 * 16], m3
16911 ; mode 7 [row 6 - first half]
16912 movu [r0 + 332 * 16], m3
16914 pmaddubsw m3, m1, m6
16916 pmaddubsw m5, m4, m6
16919 movu [r0 + 133 * 16], m3
16921 ; mode 7 [row 6 - second half]
16922 movu [r0 + 333 * 16], m3
16925 movu m6, [r5 + 2 * 16]
16926 pmaddubsw m3, m0, m6
16928 pmaddubsw m5, m2, m6
16931 movu [r0 + 194 * 16], m3
16933 ; mode 9 [row 16 - first half]
16934 movu [r0 + 480 * 16], m3
16936 pmaddubsw m3, m1, m6
16938 pmaddubsw m5, m4, m6
16941 movu [r0 + 195 * 16], m3
16943 ; mode 9 [row 16 - second half]
16944 movu [r0 + 481 * 16], m3
16947 movu m6, [r5 + 19 * 16]
16948 pmaddubsw m3, m0, m6
16950 pmaddubsw m5, m2, m6
16953 movu [r0 + 196 * 16], m3
16954 pmaddubsw m3, m1, m6
16956 pmaddubsw m5, m4, m6
16959 movu [r0 + 197 * 16], m3
16962 movu m6, [r5 + 7 * 16]
16963 pmaddubsw m3, m0, m6
16965 pmaddubsw m5, m2, m6
16968 movu [r0 + 260 * 16], m3
16969 pmaddubsw m3, m1, m6
16971 pmaddubsw m5, m4, m6
16974 movu [r0 + 261 * 16], m3
16977 movu m6, [r5 + 4 * 16]
16978 pmaddubsw m3, m0, m6
16980 pmaddubsw m5, m2, m6
16983 movu [r0 + 326 * 16], m3
16985 ; mode 9 [row 17 - first half]
16986 movu [r0 + 482 * 16], m3
16988 pmaddubsw m3, m1, m6
16990 pmaddubsw m5, m4, m6
16993 movu [r0 + 327 * 16], m3
16995 ; mode 9 [row 17 - second half]
16996 movu [r0 + 483 * 16], m3
16999 movu m6, [r5 + 13 * 16]
17000 pmaddubsw m3, m0, m6
17002 pmaddubsw m5, m2, m6
17005 movu [r0 + 328 * 16], m3
17007 ; mode 8 [row 8 - first half]
17008 movu [r0 + 400 * 16], m3
17010 pmaddubsw m3, m1, m6
17012 pmaddubsw m5, m4, m6
17015 movu [r0 + 329 * 16], m3
17017 ; mode 8 [row 8 - second half]
17018 movu [r0 + 401 * 16], m3
17021 movu m6, [r5 + 22 * 16]
17022 pmaddubsw m3, m0, m6
17024 pmaddubsw m5, m2, m6
17027 movu [r0 + 330 * 16], m3
17029 ; mode 9 [row 26 - first half]
17030 movu [r0 + 500 * 16], m3
17032 pmaddubsw m3, m1, m6
17034 pmaddubsw m5, m4, m6
17037 movu [r0 + 331 * 16], m3
17039 ; mode 9 [row 26 - second half]
17040 movu [r0 + 501 * 16], m3
17043 movu m6, [r5 + 3 * 16]
17044 pmaddubsw m3, m0, m6
17046 pmaddubsw m5, m2, m6
17049 movu [r0 + 396 * 16], m3
17050 pmaddubsw m3, m1, m6
17052 pmaddubsw m5, m4, m6
17055 movu [r0 + 397 * 16], m3
17058 movu m6, [r5 + 6 * 16]
17059 pmaddubsw m3, m0, m6
17061 pmaddubsw m5, m2, m6
17064 movu [r0 + 484 * 16], m3
17065 pmaddubsw m3, m1, m6
17067 pmaddubsw m5, m4, m6
17070 movu [r0 + 485 * 16], m3
17073 movu m6, [r5 + 12 * 16]
17074 pmaddubsw m3, m0, m6
17076 pmaddubsw m5, m2, m6
17079 movu [r0 + 490 * 16], m3
17080 pmaddubsw m3, m1, m6
17082 pmaddubsw m5, m4, m6
17085 movu [r0 + 491 * 16], m3
17088 movu m6, [r5 + 14 * 16]
17089 pmaddubsw m3, m0, m6
17091 pmaddubsw m5, m2, m6
17094 movu [r0 + 492 * 16], m3
17095 pmaddubsw m3, m1, m6
17097 pmaddubsw m5, m4, m6
17100 movu [r0 + 493 * 16], m3
17103 movu m6, [r5 + 16 * 16]
17104 pmaddubsw m3, m0, m6
17106 pmaddubsw m5, m2, m6
17109 movu [r0 + 494 * 16], m3
17110 pmaddubsw m3, m1, m6
17112 pmaddubsw m5, m4, m6
17115 movu [r0 + 495 * 16], m3
17118 movu m6, [r5 + 24 * 16]
17119 pmaddubsw m3, m0, m6
17121 pmaddubsw m5, m2, m6
17124 movu [r0 + 502 * 16], m3
17125 pmaddubsw m3, m1, m6
17127 pmaddubsw m5, m4, m6
17130 movu [r0 + 503 * 16], m3
17133 movu m6, [r5 + 26 * 16]
17134 pmaddubsw m3, m0, m6
17136 pmaddubsw m5, m2, m6
17139 movu [r0 + 504 * 16], m3
17140 pmaddubsw m3, m1, m6
17142 pmaddubsw m5, m4, m6
17145 movu [r0 + 505 * 16], m3
17148 movu m6, [r5 + 30 * 16]
17149 pmaddubsw m3, m0, m6
17151 pmaddubsw m5, m2, m6
17154 movu [r0 + 508 * 16], m3
17155 pmaddubsw m3, m1, m6
17157 pmaddubsw m5, m4, m6
17160 movu [r0 + 509 * 16], m3
17163 movu m6, [r5 + 8 * 16]
17164 pmaddubsw m3, m0, m6
17166 pmaddubsw m5, m2, m6
17169 movu [r0 + 398 * 16], m3
17171 ; mode 9 [row 19 - first half]
17172 movu [r0 + 486 * 16], m3
17174 pmaddubsw m3, m1, m6
17176 pmaddubsw m5, m4, m6
17179 movu [r0 + 399 * 16], m3
17181 ; mode 9 [row 19 - second half]
17182 movu [r0 + 487 * 16], m3
17185 movu m6, [r5 + 18 * 16]
17186 pmaddubsw m3, m0, m6
17188 pmaddubsw m5, m2, m6
17191 movu [r0 + 402 * 16], m3
17193 ; mode 9 [row 24 - first half]
17194 movu [r0 + 496 * 16], m3
17196 pmaddubsw m3, m1, m6
17198 pmaddubsw m5, m4, m6
17201 movu [r0 + 403 * 16], m3
17203 ; mode 9 [row 24 - second half]
17204 movu [r0 + 497 * 16], m3
17207 movu m6, [r5 + 23 * 16]
17208 pmaddubsw m3, m0, m6
17210 pmaddubsw m5, m2, m6
17213 movu [r0 + 404 * 16], m3
17214 pmaddubsw m3, m1, m6
17216 pmaddubsw m5, m4, m6
17219 movu [r0 + 405 * 16], m3
17222 movu m6, [r5 + 28 * 16]
17223 pmaddubsw m3, m0, m6
17225 pmaddubsw m5, m2, m6
17228 movu [r0 + 406 * 16], m3
17230 ; mode 9 [row 29 - first half]
17231 movu [r0 + 506 * 16], m3
17233 pmaddubsw m3, m1, m6
17235 pmaddubsw m5, m4, m6
17238 movu [r0 + 407 * 16], m3
17240 ; mode 9 [row 29 - second half]
17241 movu [r0 + 507 * 16], m3
17244 movu m6, [r5 + 14 * 16]
17249 pmaddubsw m1, m0, m6
17255 pmaddubsw m3, m2, m6
17258 movu [r0 + 68 * 16], m1
17260 ; mode 6 [row 5 - first half]
17261 movu [r0 + 266 * 16], m1
17267 pmaddubsw m3, m1, m6
17273 pmaddubsw m5, m4, m6
17276 movu [r0 + 69 * 16], m3
17278 ; mode 6 [row 5 - second half]
17279 movu [r0 + 267 * 16], m3
17282 movu m6, [r5 + 20 * 16]
17283 pmaddubsw m3, m0, m6
17285 pmaddubsw m5, m2, m6
17288 movu [r0 + 134 * 16], m3
17289 pmaddubsw m3, m1, m6
17291 pmaddubsw m5, m4, m6
17294 movu [r0 + 135 * 16], m3
17297 movu m6, [r5 + 4 * 16]
17298 pmaddubsw m3, m0, m6
17300 pmaddubsw m5, m2, m6
17303 movu [r0 + 198 * 16], m3
17304 pmaddubsw m3, m1, m6
17306 pmaddubsw m5, m4, m6
17309 movu [r0 + 199 * 16], m3
17312 movu m6, [r5 + 21 * 16]
17313 pmaddubsw m3, m0, m6
17315 pmaddubsw m5, m2, m6
17318 movu [r0 + 200 * 16], m3
17320 ; mode 8 [row 16 - first half]
17321 movu [r0 + 416 * 16], m3
17323 pmaddubsw m3, m1, m6
17325 pmaddubsw m5, m4, m6
17328 movu [r0 + 201 * 16], m3
17330 ; mode 8 [row 16 - second half]
17331 movu [r0 + 417 * 16], m3
17334 movu m6, [r5 + 1 * 16]
17335 pmaddubsw m3, m0, m6
17337 pmaddubsw m5, m2, m6
17340 movu [r0 + 264 * 16], m3
17342 ; mode 6 [row 4 - first half]
17343 movu [r0 + 408 * 16], m3
17345 pmaddubsw m3, m1, m6
17347 pmaddubsw m5, m4, m6
17350 movu [r0 + 265 * 16], m3
17352 ; mode 6 [row 4 - second half]
17353 movu [r0 + 409 * 16], m3
17356 movu m6, [r5 + 27 * 16]
17357 pmaddubsw m3, m0, m6
17359 pmaddubsw m5, m2, m6
17362 movu [r0 + 268 * 16], m3
17363 pmaddubsw m3, m1, m6
17365 pmaddubsw m5, m4, m6
17368 movu [r0 + 269 * 16], m3
17371 movu m6, [r5 + 8 * 16]
17372 pmaddubsw m3, m0, m6
17374 pmaddubsw m5, m2, m6
17377 movu [r0 + 334 * 16], m3
17378 pmaddubsw m3, m1, m6
17380 pmaddubsw m5, m4, m6
17383 movu [r0 + 335 * 16], m3
17386 movu m6, [r5 + 17 * 16]
17387 pmaddubsw m3, m0, m6
17389 pmaddubsw m5, m2, m6
17392 movu [r0 + 336 * 16], m3
17393 pmaddubsw m3, m1, m6
17395 pmaddubsw m5, m4, m6
17398 movu [r0 + 337 * 16], m3
17401 movu m6, [r5 + 26 * 16]
17402 pmaddubsw m3, m0, m6
17404 pmaddubsw m5, m2, m6
17407 movu [r0 + 338 * 16], m3
17409 ; mode 8 [row 17 - first half]
17410 movu [r0 + 418 * 16], m3
17412 pmaddubsw m3, m1, m6
17414 pmaddubsw m5, m4, m6
17417 movu [r0 + 339 * 16], m3
17419 ; mode 8 [row 17 - second half]
17420 movu [r0 + 419 * 16], m3
17423 movu m6, [r5 + 6 * 16]
17424 pmaddubsw m3, m0, m6
17426 pmaddubsw m5, m2, m6
17429 movu [r0 + 410 * 16], m3
17430 pmaddubsw m3, m1, m6
17432 pmaddubsw m5, m4, m6
17435 movu [r0 + 411 * 16], m3
17438 movu m6, [r5 + 11 * 16]
17439 pmaddubsw m3, m0, m6
17441 pmaddubsw m5, m2, m6
17444 movu [r0 + 412 * 16], m3
17445 pmaddubsw m3, m1, m6
17447 pmaddubsw m5, m4, m6
17450 movu [r0 + 413 * 16], m3
17453 movu m6, [r5 + 16 * 16]
17454 pmaddubsw m3, m0, m6
17456 pmaddubsw m5, m2, m6
17459 movu [r0 + 414 * 16], m3
17460 pmaddubsw m3, m1, m6
17462 pmaddubsw m5, m4, m6
17465 movu [r0 + 415 * 16], m3
17468 movu m6, [r5 + 31 * 16]
17469 pmaddubsw m3, m0, m6
17471 pmaddubsw m5, m2, m6
17474 movu [r0 + 420 * 16], m3
17475 pmaddubsw m3, m1, m6
17477 pmaddubsw m5, m4, m6
17480 movu [r0 + 421 * 16], m3
17483 movu m6, [r5 + 8 * 16]
17488 pmaddubsw m1, m0, m6
17494 pmaddubsw m3, m2, m6
17497 movu [r0 + 70 * 16], m1
17499 ; mode 6 [row 7 - first half]
17500 movu [r0 + 270 * 16], m1
17506 pmaddubsw m3, m1, m6
17512 pmaddubsw m5, m4, m6
17515 movu [r0 + 71 * 16], m3
17517 ; mode 6 [row 7 - second half]
17518 movu [r0 + 271 * 16], m3
17521 movu m6, [r5 + 9 * 16]
17522 pmaddubsw m3, m0, m6
17524 pmaddubsw m5, m2, m6
17527 movu [r0 + 136 * 16], m3
17529 ; mode 4 [row 4 - first half]
17530 movu [r0 + 424 * 16], m3
17532 pmaddubsw m3, m1, m6
17534 pmaddubsw m5, m4, m6
17537 movu [r0 + 137 * 16], m3
17539 ; mode 4 [row 4 - second half]
17540 movu [r0 + 425 * 16], m3
17543 movu m6, [r5 + 30 * 16]
17544 pmaddubsw m3, m0, m6
17546 pmaddubsw m5, m2, m6
17549 movu [r0 + 138 * 16], m3
17551 ; mode 7 [row 13 - first half]
17552 movu [r0 + 346 * 16], m3
17554 pmaddubsw m3, m1, m6
17556 pmaddubsw m5, m4, m6
17559 movu [r0 + 139 * 16], m3
17561 ; mode 7 [row 13 - second half]
17562 movu [r0 + 347 * 16], m3
17565 movu m6, [r5 + 6 * 16]
17566 pmaddubsw m3, m0, m6
17568 pmaddubsw m5, m2, m6
17571 movu [r0 + 202 * 16], m3
17572 pmaddubsw m3, m1, m6
17574 pmaddubsw m5, m4, m6
17577 movu [r0 + 203 * 16], m3
17580 movu m6, [r5 + 23 * 16]
17581 pmaddubsw m3, m0, m6
17583 pmaddubsw m5, m2, m6
17586 movu [r0 + 204 * 16], m3
17587 pmaddubsw m3, m1, m6
17589 pmaddubsw m5, m4, m6
17592 movu [r0 + 205 * 16], m3
17595 movu m6, [r5 + 21 * 16]
17596 pmaddubsw m3, m0, m6
17598 pmaddubsw m5, m2, m6
17601 movu [r0 + 272 * 16], m3
17603 ; mode 7 [row 12 - first half]
17604 movu [r0 + 344 * 16], m3
17606 pmaddubsw m3, m1, m6
17608 pmaddubsw m5, m4, m6
17611 movu [r0 + 273 * 16], m3
17613 ; mode 7 [row 12 - second half]
17614 movu [r0 + 345 * 16], m3
17617 movu m6, [r5 + 3 * 16]
17618 pmaddubsw m3, m0, m6
17620 pmaddubsw m5, m2, m6
17623 movu [r0 + 340 * 16], m3
17624 pmaddubsw m3, m1, m6
17626 pmaddubsw m5, m4, m6
17629 movu [r0 + 341 * 16], m3
17632 movu m6, [r5 + 12 * 16]
17633 pmaddubsw m3, m0, m6
17635 pmaddubsw m5, m2, m6
17638 movu [r0 + 342 * 16], m3
17639 pmaddubsw m3, m1, m6
17641 pmaddubsw m5, m4, m6
17644 movu [r0 + 343 * 16], m3
17647 movu m6, [r5 + 4 * 16]
17648 pmaddubsw m3, m0, m6
17650 pmaddubsw m5, m2, m6
17653 movu [r0 + 422 * 16], m3
17654 pmaddubsw m3, m1, m6
17656 pmaddubsw m5, m4, m6
17659 movu [r0 + 423 * 16], m3
17662 movu m6, [r5 + 14 * 16]
17663 pmaddubsw m3, m0, m6
17665 pmaddubsw m5, m2, m6
17668 movu [r0 + 426 * 16], m3
17669 pmaddubsw m3, m1, m6
17671 pmaddubsw m5, m4, m6
17674 movu [r0 + 427 * 16], m3
17677 movu m6, [r5 + 19 * 16]
17678 pmaddubsw m3, m0, m6
17680 pmaddubsw m5, m2, m6
17683 movu [r0 + 428 * 16], m3
17684 pmaddubsw m3, m1, m6
17686 pmaddubsw m5, m4, m6
17689 movu [r0 + 429 * 16], m3
17692 movu m6, [r5 + 24 * 16]
17693 pmaddubsw m3, m0, m6
17695 pmaddubsw m5, m2, m6
17698 movu [r0 + 430 * 16], m3
17699 pmaddubsw m3, m1, m6
17701 pmaddubsw m5, m4, m6
17704 movu [r0 + 431 * 16], m3
17707 movu m6, [r5 + 29 * 16]
17708 pmaddubsw m3, m0, m6
17710 pmaddubsw m5, m2, m6
17713 movu [r0 + 432 * 16], m3
17714 pmaddubsw m3, m1, m6
17716 pmaddubsw m5, m4, m6
17719 movu [r0 + 433 * 16], m3
17722 movu m6, [r5 + 2 * 16]
17727 pmaddubsw m1, m0, m6
17733 pmaddubsw m3, m2, m6
17736 movu [r0 + 72 * 16], m1
17738 ; mode 3 [row 4 - first half]
17739 movu [r0 + 274 * 16], m1
17741 ; mode 8 [row 25 - first half]
17742 movu [r0 + 434 * 16], m1
17748 pmaddubsw m3, m1, m6
17754 pmaddubsw m5, m4, m6
17757 movu [r0 + 73 * 16], m3
17759 ; mode 3 [row 4 - second half]
17760 movu [r0 + 275 * 16], m3
17762 ; mode 8 [row 25 - second half]
17763 movu [r0 + 435 * 16], m3
17766 movu m6, [r5 + 28 * 16]
17767 pmaddubsw m3, m0, m6
17769 pmaddubsw m5, m2, m6
17772 movu [r0 + 74 * 16], m3
17774 ; mode 3 [row 5 - first half]
17775 movu [r0 + 278 * 16], m3
17777 pmaddubsw m3, m1, m6
17779 pmaddubsw m5, m4, m6
17782 movu [r0 + 75 * 16], m3
17784 ; mode 3 [row 5 - second half]
17785 movu [r0 + 279 * 16], m3
17788 movu m6, [r5 + 19 * 16]
17789 pmaddubsw m3, m0, m6
17791 pmaddubsw m5, m2, m6
17794 movu [r0 + 140 * 16], m3
17795 pmaddubsw m3, m1, m6
17797 pmaddubsw m5, m4, m6
17800 movu [r0 + 141 * 16], m3
17803 movu m6, [r5 + 8 * 16]
17804 pmaddubsw m3, m0, m6
17806 pmaddubsw m5, m2, m6
17809 movu [r0 + 206 * 16], m3
17810 pmaddubsw m3, m1, m6
17812 pmaddubsw m5, m4, m6
17815 movu [r0 + 207 * 16], m3
17818 movu m6, [r5 + 25 * 16]
17819 pmaddubsw m3, m0, m6
17821 pmaddubsw m5, m2, m6
17824 movu [r0 + 208 * 16], m3
17826 ; mode 7 [row 16 - first half]
17827 movu [r0 + 352 * 16], m3
17829 pmaddubsw m3, m1, m6
17831 pmaddubsw m5, m4, m6
17834 movu [r0 + 209 * 16], m3
17836 ; mode 7 [row 16 - second half]
17837 movu [r0 + 353 * 16], m3
17840 movu m6, [r5 + 15 * 16]
17841 pmaddubsw m3, m0, m6
17843 pmaddubsw m5, m2, m6
17846 movu [r0 + 276 * 16], m3
17847 pmaddubsw m3, m1, m6
17849 pmaddubsw m5, m4, m6
17852 movu [r0 + 277 * 16], m3
17855 movu m6, [r5 + 7 * 16]
17856 pmaddubsw m3, m0, m6
17858 pmaddubsw m5, m2, m6
17861 movu [r0 + 348 * 16], m3
17863 ; mode 8 [row 26 - first half]
17864 movu [r0 + 436 * 16], m3
17866 pmaddubsw m3, m1, m6
17868 pmaddubsw m5, m4, m6
17871 movu [r0 + 349 * 16], m3
17873 ; mode 8 [row 26 - second half]
17874 movu [r0 + 437 * 16], m3
17877 movu m6, [r5 + 16 * 16]
17878 pmaddubsw m3, m0, m6
17880 pmaddubsw m5, m2, m6
17883 movu [r0 + 350 * 16], m3
17884 pmaddubsw m3, m1, m6
17886 pmaddubsw m5, m4, m6
17889 movu [r0 + 351 * 16], m3
17892 movu m6, [r5 + 12 * 16]
17893 pmaddubsw m3, m0, m6
17895 pmaddubsw m5, m2, m6
17898 movu [r0 + 438 * 16], m3
17899 pmaddubsw m3, m1, m6
17901 pmaddubsw m5, m4, m6
17904 movu [r0 + 439 * 16], m3
17907 movu m6, [r5 + 17 * 16]
17908 pmaddubsw m3, m0, m6
17910 pmaddubsw m5, m2, m6
17913 movu [r0 + 440 * 16], m3
17914 pmaddubsw m3, m1, m6
17916 pmaddubsw m5, m4, m6
17919 movu [r0 + 441 * 16], m3
17922 movu m6, [r5 + 22 * 16]
17923 pmaddubsw m3, m0, m6
17925 pmaddubsw m5, m2, m6
17928 movu [r0 + 442 * 16], m3
17929 pmaddubsw m3, m1, m6
17931 pmaddubsw m5, m4, m6
17934 movu [r0 + 443 * 16], m3
17937 movu m6, [r5 + 27 * 16]
17938 pmaddubsw m3, m0, m6
17940 pmaddubsw m5, m2, m6
17943 movu [r0 + 444 * 16], m3
17944 pmaddubsw m3, m1, m6
17946 pmaddubsw m5, m4, m6
17949 movu [r0 + 445 * 16], m3
17952 movu m6, [r5 + 22 * 16]
17957 pmaddubsw m1, m0, m6
17963 pmaddubsw m3, m2, m6
17966 movu [r0 + 76 * 16], m1
17968 ; mode 6 [row 13 - first half]
17969 movu [r0 + 282 * 16], m1
17975 pmaddubsw m3, m1, m6
17981 pmaddubsw m5, m4, m6
17984 movu [r0 + 77 * 16], m3
17986 ; mode 6 [row 13 - second half]
17987 movu [r0 + 283 * 16], m3
17990 movu m6, [r5 + 8 * 16]
17991 pmaddubsw m3, m0, m6
17993 pmaddubsw m5, m2, m6
17996 movu [r0 + 142 * 16], m3
17997 pmaddubsw m3, m1, m6
17999 pmaddubsw m5, m4, m6
18002 movu [r0 + 143 * 16], m3
18005 movu m6, [r5 + 29 * 16]
18006 pmaddubsw m3, m0, m6
18008 pmaddubsw m5, m2, m6
18011 movu [r0 + 144 * 16], m3
18013 ; mode 4 [row 8 - first half]
18014 movu [r0 + 360 * 16], m3
18016 pmaddubsw m3, m1, m6
18018 pmaddubsw m5, m4, m6
18021 movu [r0 + 145 * 16], m3
18023 ; mode 4 [row 8 - second half]
18024 movu [r0 + 361 * 16], m3
18027 movu m6, [r5 + 10 * 16]
18028 pmaddubsw m3, m0, m6
18030 pmaddubsw m5, m2, m6
18033 movu [r0 + 210 * 16], m3
18034 pmaddubsw m3, m1, m6
18036 pmaddubsw m5, m4, m6
18039 movu [r0 + 211 * 16], m3
18042 movu m6, [r5 + 27 * 16]
18043 pmaddubsw m3, m0, m6
18045 pmaddubsw m5, m2, m6
18048 movu [r0 + 212 * 16], m3
18049 pmaddubsw m3, m1, m6
18051 pmaddubsw m5, m4, m6
18054 movu [r0 + 213 * 16], m3
18057 movu m6, [r5 + 2 * 16]
18058 pmaddubsw m3, m0, m6
18060 pmaddubsw m5, m2, m6
18063 movu [r0 + 354 * 16], m3
18064 pmaddubsw m3, m1, m6
18066 pmaddubsw m5, m4, m6
18069 movu [r0 + 355 * 16], m3
18072 movu m6, [r5 + 11 * 16]
18073 pmaddubsw m3, m0, m6
18075 pmaddubsw m5, m2, m6
18078 movu [r0 + 356 * 16], m3
18079 pmaddubsw m3, m1, m6
18081 pmaddubsw m5, m4, m6
18084 movu [r0 + 357 * 16], m3
18087 movu m6, [r5 + 20 * 16]
18088 pmaddubsw m3, m0, m6
18090 pmaddubsw m5, m2, m6
18093 movu [r0 + 358 * 16], m3
18094 pmaddubsw m3, m1, m6
18096 pmaddubsw m5, m4, m6
18099 movu [r0 + 359 * 16], m3
18102 movu m6, [r5 + 9 * 16]
18103 pmaddubsw m3, m0, m6
18105 pmaddubsw m5, m2, m6
18108 movu [r0 + 280 * 16], m3
18109 pmaddubsw m3, m1, m6
18111 pmaddubsw m5, m4, m6
18114 movu [r0 + 281 * 16], m3
18117 movu m6, [r5 + 16 * 16]
18122 pmaddubsw m1, m0, m6
18128 pmaddubsw m3, m2, m6
18131 movu [r0 + 78 * 16], m1
18133 ; mode 6 [row 15 - first half]
18134 movu [r0 + 286 * 16], m1
18140 pmaddubsw m3, m1, m6
18146 pmaddubsw m5, m4, m6
18149 movu [r0 + 79 * 16], m3
18151 ; mode 6 [row 15 - second half]
18152 movu [r0 + 287 * 16], m3
18155 movu m6, [r5 + 18 * 16]
18156 pmaddubsw m3, m0, m6
18158 pmaddubsw m5, m2, m6
18161 movu [r0 + 146 * 16], m3
18162 pmaddubsw m3, m1, m6
18164 pmaddubsw m5, m4, m6
18167 movu [r0 + 147 * 16], m3
18170 movu m6, [r5 + 12 * 16]
18171 pmaddubsw m3, m0, m6
18173 pmaddubsw m5, m2, m6
18176 movu [r0 + 214 * 16], m3
18177 pmaddubsw m3, m1, m6
18179 pmaddubsw m5, m4, m6
18182 movu [r0 + 215 * 16], m3
18185 movu m6, [r5 + 29 * 16]
18186 pmaddubsw m3, m0, m6
18188 pmaddubsw m5, m2, m6
18191 movu [r0 + 216 * 16], m3
18193 ; mode 6 [row 16 - first half]
18194 movu [r0 + 288 * 16], m3
18196 pmaddubsw m3, m1, m6
18198 pmaddubsw m5, m4, m6
18201 movu [r0 + 217 * 16], m3
18203 ; mode 6 [row 16 - second half]
18204 movu [r0 + 289 * 16], m3
18207 movu m6, [r5 + 3 * 16]
18208 pmaddubsw m3, m0, m6
18210 pmaddubsw m5, m2, m6
18213 movu [r0 + 284 * 16], m3
18214 pmaddubsw m3, m1, m6
18216 pmaddubsw m5, m4, m6
18219 movu [r0 + 285 * 16], m3
18222 movu m6, [r5 + 6 * 16]
18223 pmaddubsw m3, m0, m6
18225 pmaddubsw m5, m2, m6
18228 movu [r0 + 362 * 16], m3
18229 pmaddubsw m3, m1, m6
18231 pmaddubsw m5, m4, m6
18234 movu [r0 + 363 * 16], m3
18237 movu m6, [r5 + 15 * 16]
18238 pmaddubsw m3, m0, m6
18240 pmaddubsw m5, m2, m6
18243 movu [r0 + 364 * 16], m3
18244 pmaddubsw m3, m1, m6
18246 pmaddubsw m5, m4, m6
18249 movu [r0 + 365 * 16], m3
18252 movu m6, [r5 + 24 * 16]
18253 pmaddubsw m3, m0, m6
18255 pmaddubsw m5, m2, m6
18258 movu [r0 + 366 * 16], m3
18259 pmaddubsw m3, m1, m6
18261 pmaddubsw m5, m4, m6
18264 movu [r0 + 367 * 16], m3
18267 movu m6, [r5 + 10 * 16]
18272 pmaddubsw m1, m0, m6
18278 pmaddubsw m3, m2, m6
18281 movu [r0 + 80 * 16], m1
18283 ; mode 7 [row 25 - first half]
18284 movu [r0 + 290 * 16], m1
18286 ; mode 6 [row 17 - first half]
18287 movu [r0 + 370 * 16], m1
18293 pmaddubsw m3, m1, m6
18299 pmaddubsw m5, m4, m6
18302 movu [r0 + 81 * 16], m3
18304 ; mode 7 [row 25 - second half]
18305 movu [r0 + 291 * 16], m3
18307 ; mode 6 [row 17 - second half]
18308 movu [r0 + 371 * 16], m3
18311 movu m6, [r5 + 7 * 16]
18312 pmaddubsw m3, m0, m6
18314 pmaddubsw m5, m2, m6
18317 movu [r0 + 148 * 16], m3
18318 pmaddubsw m3, m1, m6
18320 pmaddubsw m5, m4, m6
18323 movu [r0 + 149 * 16], m3
18326 movu m6, [r5 + 28 * 16]
18327 pmaddubsw m3, m0, m6
18329 pmaddubsw m5, m2, m6
18332 movu [r0 + 150 * 16], m3
18334 ; mode 7 [row 27 - first half]
18335 movu [r0 + 374 * 16], m3
18337 pmaddubsw m3, m1, m6
18339 pmaddubsw m5, m4, m6
18342 movu [r0 + 151 * 16], m3
18344 ; mode 7 [row 27 - second half]
18345 movu [r0 + 375 * 16], m3
18348 movu m6, [r5 + 14 * 16]
18349 pmaddubsw m3, m0, m6
18351 pmaddubsw m5, m2, m6
18354 movu [r0 + 218 * 16], m3
18355 pmaddubsw m3, m1, m6
18357 pmaddubsw m5, m4, m6
18360 movu [r0 + 219 * 16], m3
18363 movu m6, [r5 + 31 * 16]
18364 pmaddubsw m3, m0, m6
18366 pmaddubsw m5, m2, m6
18369 movu [r0 + 220 * 16], m3
18370 pmaddubsw m3, m1, m6
18372 pmaddubsw m5, m4, m6
18375 movu [r0 + 221 * 16], m3
18378 movu m6, [r5 + 23 * 16]
18379 pmaddubsw m3, m0, m6
18381 pmaddubsw m5, m2, m6
18384 movu [r0 + 292 * 16], m3
18385 pmaddubsw m3, m1, m6
18387 pmaddubsw m5, m4, m6
18390 movu [r0 + 293 * 16], m3
18393 movu m6, [r5 + 1 * 16]
18394 pmaddubsw m3, m0, m6
18396 pmaddubsw m5, m2, m6
18399 movu [r0 + 368 * 16], m3
18400 pmaddubsw m3, m1, m6
18402 pmaddubsw m5, m4, m6
18405 movu [r0 + 369 * 16], m3
18408 movu m6, [r5 + 19 * 16]
18409 pmaddubsw m3, m0, m6
18411 pmaddubsw m5, m2, m6
18414 movu [r0 + 372 * 16], m3
18415 pmaddubsw m3, m1, m6
18417 pmaddubsw m5, m4, m6
18420 movu [r0 + 373 * 16], m3
18423 movu m6, [r5 + 4 * 16]
18428 pmaddubsw m1, m0, m6
18434 pmaddubsw m3, m2, m6
18437 movu [r0 + 82 * 16], m1
18439 ; mode 6 [row 19 - first half]
18440 movu [r0 + 294 * 16], m1
18446 pmaddubsw m3, m1, m6
18452 pmaddubsw m5, m4, m6
18455 movu [r0 + 83 * 16], m3
18457 ; mode 6 [row 19 - second half]
18458 movu [r0 + 295 * 16], m3
18461 movu m6, [r5 + 17 * 16]
18462 pmaddubsw m3, m0, m6
18464 pmaddubsw m5, m2, m6
18467 movu [r0 + 152 * 16], m3
18469 ; mode 4 [row 12 - first half]
18470 movu [r0 + 296 * 16], m3
18472 pmaddubsw m3, m1, m6
18474 pmaddubsw m5, m4, m6
18477 movu [r0 + 153 * 16], m3
18479 ; mode 4 [row 12 - second half]
18480 movu [r0 + 297 * 16], m3
18483 movu m6, [r5 + 30 * 16]
18484 pmaddubsw m3, m0, m6
18486 pmaddubsw m5, m2, m6
18489 movu [r0 + 84 * 16], m3
18491 ; mode 6 [row 21 - first half]
18492 movu [r0 + 298 * 16], m3
18494 pmaddubsw m3, m1, m6
18496 pmaddubsw m5, m4, m6
18499 movu [r0 + 85 * 16], m3
18501 ; mode 6 [row 21 - second half]
18502 movu [r0 + 299 * 16], m3
18505 movu m6, [r5 + 16 * 16]
18506 pmaddubsw m3, m0, m6
18508 pmaddubsw m5, m2, m6
18511 movu [r0 + 222 * 16], m3
18512 pmaddubsw m3, m1, m6
18514 pmaddubsw m5, m4, m6
18517 movu [r0 + 223 * 16], m3
18520 movu m6, [r5 + 5 * 16]
18521 pmaddubsw m3, m0, m6
18523 pmaddubsw m5, m2, m6
18526 movu [r0 + 376 * 16], m3
18527 pmaddubsw m3, m1, m6
18529 pmaddubsw m5, m4, m6
18532 movu [r0 + 377 * 16], m3
18535 movu m6, [r5 + 14 * 16]
18536 pmaddubsw m3, m0, m6
18538 pmaddubsw m5, m2, m6
18541 movu [r0 + 378 * 16], m3
18542 pmaddubsw m3, m1, m6
18544 pmaddubsw m5, m4, m6
18547 movu [r0 + 379 * 16], m3
18550 movu m6, [r5 + 23 * 16]
18551 pmaddubsw m3, m0, m6
18553 pmaddubsw m5, m2, m6
18556 movu [r0 + 380 * 16], m3
18557 pmaddubsw m3, m1, m6
18559 pmaddubsw m5, m4, m6
18562 movu [r0 + 381 * 16], m3
18565 movu m6, [r5 + 24 * 16]
18570 pmaddubsw m1, m0, m6
18576 pmaddubsw m3, m2, m6
18579 movu [r0 + 86 * 16], m1
18581 ; mode 6 [row 23 - first half]
18582 movu [r0 + 302 * 16], m1
18588 pmaddubsw m3, m1, m6
18594 pmaddubsw m5, m4, m6
18597 movu [r0 + 87 * 16], m3
18599 ; mode 6 [row 23 - second half]
18600 movu [r0 + 303 * 16], m3
18603 movu m6, [r5 + 6 * 16]
18604 pmaddubsw m3, m0, m6
18606 pmaddubsw m5, m2, m6
18609 movu [r0 + 154 * 16], m3
18610 pmaddubsw m3, m1, m6
18612 pmaddubsw m5, m4, m6
18615 movu [r0 + 155 * 16], m3
18618 movu m6, [r5 + 27 * 16]
18619 pmaddubsw m3, m0, m6
18621 pmaddubsw m5, m2, m6
18624 movu [r0 + 156 * 16], m3
18625 pmaddubsw m3, m1, m6
18627 pmaddubsw m5, m4, m6
18630 movu [r0 + 157 * 16], m3
18633 movu m6, [r5 + 1 * 16]
18634 pmaddubsw m3, m0, m6
18636 pmaddubsw m5, m2, m6
18639 movu [r0 + 224 * 16], m3
18640 pmaddubsw m3, m1, m6
18642 pmaddubsw m5, m4, m6
18645 movu [r0 + 225 * 16], m3
18648 movu m6, [r5 + 18 * 16]
18649 pmaddubsw m3, m0, m6
18651 pmaddubsw m5, m2, m6
18654 movu [r0 + 226 * 16], m3
18655 pmaddubsw m3, m1, m6
18657 pmaddubsw m5, m4, m6
18660 movu [r0 + 227 * 16], m3
18663 movu m6, [r5 + 11 * 16]
18664 pmaddubsw m3, m0, m6
18666 pmaddubsw m5, m2, m6
18669 movu [r0 + 300 * 16], m3
18670 pmaddubsw m3, m1, m6
18672 pmaddubsw m5, m4, m6
18675 movu [r0 + 301 * 16], m3
18678 movu m6, [r5 + 18 * 16]
18683 pmaddubsw m1, m0, m6
18689 pmaddubsw m3, m2, m6
18692 movu [r0 + 88 * 16], m1
18694 ; mode 6 [row 25 - first half]
18695 movu [r0 + 306 * 16], m1
18701 pmaddubsw m3, m1, m6
18707 pmaddubsw m5, m4, m6
18710 movu [r0 + 89 * 16], m3
18712 ; mode 6 [row 25 - second half]
18713 movu [r0 + 307 * 16], m3
18716 movu m6, [r5 + 16 * 16]
18717 pmaddubsw m3, m0, m6
18719 pmaddubsw m5, m2, m6
18722 movu [r0 + 158 * 16], m3
18723 pmaddubsw m3, m1, m6
18725 pmaddubsw m5, m4, m6
18728 movu [r0 + 159 * 16], m3
18731 movu m6, [r5 + 3 * 16]
18732 pmaddubsw m3, m0, m6
18734 pmaddubsw m5, m2, m6
18737 movu [r0 + 228 * 16], m3
18738 pmaddubsw m3, m1, m6
18740 pmaddubsw m5, m4, m6
18743 movu [r0 + 229 * 16], m3
18746 movu m6, [r5 + 20 * 16]
18747 pmaddubsw m3, m0, m6
18749 pmaddubsw m5, m2, m6
18752 movu [r0 + 230 * 16], m3
18753 pmaddubsw m3, m1, m6
18755 pmaddubsw m5, m4, m6
18758 movu [r0 + 231 * 16], m3
18761 movu m6, [r5 + 5 * 16]
18762 pmaddubsw m3, m0, m6
18764 pmaddubsw m5, m2, m6
18767 movu [r0 + 304 * 16], m3
18768 pmaddubsw m3, m1, m6
18770 pmaddubsw m5, m4, m6
18773 movu [r0 + 305 * 16], m3
18776 movu m6, [r5 + 31 * 16]
18777 pmaddubsw m3, m0, m6
18779 pmaddubsw m5, m2, m6
18782 movu [r0 + 308 * 16], m3
18783 pmaddubsw m3, m1, m6
18785 pmaddubsw m5, m4, m6
18788 movu [r0 + 309 * 16], m3
18791 movu m6, [r5 + 12 * 16]
18796 pmaddubsw m1, m0, m6
18802 pmaddubsw m3, m2, m6
18805 movu [r0 + 90 * 16], m1
18811 pmaddubsw m3, m1, m6
18817 pmaddubsw m5, m4, m6
18820 movu [r0 + 91 * 16], m3
18823 movu m6, [r5 + 5 * 16]
18824 pmaddubsw m3, m0, m6
18826 pmaddubsw m5, m2, m6
18829 movu [r0 + 160 * 16], m3
18831 ; mode 5 [row 20 - first half]
18832 movu [r0 + 232 * 16], m3
18834 pmaddubsw m3, m1, m6
18836 pmaddubsw m5, m4, m6
18839 movu [r0 + 161 * 16], m3
18841 ; mode 5 [row 20 - second half]
18842 movu [r0 + 233 * 16], m3
18845 movu m6, [r5 + 26 * 16]
18846 pmaddubsw m3, m0, m6
18848 pmaddubsw m5, m2, m6
18851 movu [r0 + 162 * 16], m3
18852 pmaddubsw m3, m1, m6
18854 pmaddubsw m5, m4, m6
18857 movu [r0 + 163 * 16], m3
18860 movu m6, [r5 + 22 * 16]
18861 pmaddubsw m3, m0, m6
18863 pmaddubsw m5, m2, m6
18866 movu [r0 + 234 * 16], m3
18867 pmaddubsw m3, m1, m6
18869 pmaddubsw m5, m4, m6
18872 movu [r0 + 235 * 16], m3
18875 movu m6, [r5 + 12 * 16]
18876 pmaddubsw m3, m0, m6
18878 pmaddubsw m5, m2, m6
18881 movu [r0 + 310 * 16], m3
18882 pmaddubsw m3, m1, m6
18884 pmaddubsw m5, m4, m6
18887 movu [r0 + 311 * 16], m3
18890 movu m6, [r5 + 25 * 16]
18891 pmaddubsw m3, m0, m6
18893 pmaddubsw m5, m2, m6
18896 movu [r0 + 312 * 16], m3
18897 pmaddubsw m3, m1, m6
18899 pmaddubsw m5, m4, m6
18902 movu [r0 + 313 * 16], m3
18905 movu m6, [r5 + 6 * 16]
18910 pmaddubsw m1, m0, m6
18916 pmaddubsw m3, m2, m6
18919 movu [r0 + 92 * 16], m1
18921 ; mode 6 [row 29 - first half]
18922 movu [r0 + 314 * 16], m1
18928 pmaddubsw m3, m1, m6
18934 pmaddubsw m5, m4, m6
18937 movu [r0 + 93 * 16], m3
18939 ; mode 6 [row 29 - second half]
18940 movu [r0 + 315 * 16], m3
18943 movu m6, [r5 + 15 * 16]
18944 pmaddubsw m3, m0, m6
18946 pmaddubsw m5, m2, m6
18949 movu [r0 + 164 * 16], m3
18950 pmaddubsw m3, m1, m6
18952 pmaddubsw m5, m4, m6
18955 movu [r0 + 165 * 16], m3
18958 movu m6, [r5 + 7 * 16]
18959 pmaddubsw m3, m0, m6
18961 pmaddubsw m5, m2, m6
18964 movu [r0 + 236 * 16], m3
18965 pmaddubsw m3, m1, m6
18967 pmaddubsw m5, m4, m6
18970 movu [r0 + 237 * 16], m3
18973 movu m6, [r5 + 24 * 16]
18974 pmaddubsw m3, m0, m6
18976 pmaddubsw m5, m2, m6
18979 movu [r0 + 238 * 16], m3
18980 pmaddubsw m3, m1, m6
18982 pmaddubsw m5, m4, m6
18985 movu [r0 + 239 * 16], m3
18988 movu m6, [r5 + 19 * 16]
18989 pmaddubsw m3, m0, m6
18991 pmaddubsw m5, m2, m6
18994 movu [r0 + 316 * 16], m3
18995 pmaddubsw m3, m1, m6
18997 pmaddubsw m5, m4, m6
19000 movu [r0 + 317 * 16], m3
19003 movu m6, [r5 + 26 * 16]
19008 pmaddubsw m1, m0, m6
19014 pmaddubsw m3, m2, m6
19017 movu [r0 + 96 * 16], m1
19019 ; mode 5 [row 25 - first half]
19020 movu [r0 + 242 * 16], m1
19026 pmaddubsw m3, m1, m6
19032 pmaddubsw m5, m4, m6
19035 movu [r0 + 97 * 16], m3
19037 ; mode 5 [row 25 - second half]
19038 movu [r0 + 243 * 16], m3
19041 movu m6, [r5 + 4 * 16]
19042 pmaddubsw m3, m0, m6
19044 pmaddubsw m5, m2, m6
19047 movu [r0 + 166 * 16], m3
19048 pmaddubsw m3, m1, m6
19050 pmaddubsw m5, m4, m6
19053 movu [r0 + 167 * 16], m3
19056 movu m6, [r5 + 25 * 16]
19057 pmaddubsw m3, m0, m6
19059 pmaddubsw m5, m2, m6
19062 movu [r0 + 168 * 16], m3
19063 pmaddubsw m3, m1, m6
19065 pmaddubsw m5, m4, m6
19068 movu [r0 + 169 * 16], m3
19071 movu m6, [r5 + 9 * 16]
19072 pmaddubsw m3, m0, m6
19074 pmaddubsw m5, m2, m6
19077 movu [r0 + 240 * 16], m3
19078 pmaddubsw m3, m1, m6
19080 pmaddubsw m5, m4, m6
19083 movu [r0 + 241 * 16], m3
19086 movu m6, [r5 + 20 * 16]
19091 pmaddubsw m1, m0, m6
19097 pmaddubsw m3, m2, m6
19100 movu [r0 + 98 * 16], m1
19106 pmaddubsw m3, m1, m6
19112 pmaddubsw m5, m4, m6
19115 movu [r0 + 99 * 16], m3
19118 movu m6, [r5 + 14 * 16]
19119 pmaddubsw m3, m0, m6
19121 pmaddubsw m5, m2, m6
19124 movu [r0 + 170 * 16], m3
19125 pmaddubsw m3, m1, m6
19127 pmaddubsw m5, m4, m6
19130 movu [r0 + 171 * 16], m3
19133 movu m6, [r5 + 11 * 16]
19134 pmaddubsw m3, m0, m6
19136 pmaddubsw m5, m2, m6
19139 movu [r0 + 244 * 16], m3
19140 pmaddubsw m3, m1, m6
19142 pmaddubsw m5, m4, m6
19145 movu [r0 + 245 * 16], m3
19148 movu m6, [r5 + 28 * 16]
19149 pmaddubsw m3, m0, m6
19151 pmaddubsw m5, m2, m6
19154 movu [r0 + 246 * 16], m3
19155 pmaddubsw m3, m1, m6
19157 pmaddubsw m5, m4, m6
19160 movu [r0 + 247 * 16], m3
19163 movu m6, [r5 + 14 * 16]
19168 pmaddubsw m1, m0, m6
19174 pmaddubsw m3, m2, m6
19177 movu [r0 + 100 * 16], m1
19183 pmaddubsw m3, m1, m6
19189 pmaddubsw m5, m4, m6
19192 movu [r0 + 101 * 16], m3
19195 movu m6, [r5 + 3 * 16]
19196 pmaddubsw m3, m0, m6
19198 pmaddubsw m5, m2, m6
19201 movu [r0 + 172 * 16], m3
19202 pmaddubsw m3, m1, m6
19204 pmaddubsw m5, m4, m6
19207 movu [r0 + 173 * 16], m3
19210 movu m6, [r5 + 24 * 16]
19211 pmaddubsw m3, m0, m6
19213 pmaddubsw m5, m2, m6
19216 movu [r0 + 174 * 16], m3
19217 pmaddubsw m3, m1, m6
19219 pmaddubsw m5, m4, m6
19222 movu [r0 + 175 * 16], m3
19225 movu m6, [r5 + 13 * 16]
19226 pmaddubsw m3, m0, m6
19228 pmaddubsw m5, m2, m6
19231 movu [r0 + 248 * 16], m3
19232 pmaddubsw m3, m1, m6
19234 pmaddubsw m5, m4, m6
19237 movu [r0 + 249 * 16], m3
19240 movu m6, [r5 + 30 * 16]
19241 pmaddubsw m3, m0, m6
19243 pmaddubsw m5, m2, m6
19246 movu [r0 + 250 * 16], m3
19247 pmaddubsw m3, m1, m6
19249 pmaddubsw m5, m4, m6
19252 movu [r0 + 251 * 16], m3
19255 movu m6, [r5 + 8 * 16]
19260 pmaddubsw m1, m0, m6
19266 pmaddubsw m3, m2, m6
19269 movu [r0 + 102 * 16], m1
19275 pmaddubsw m3, m1, m6
19281 pmaddubsw m5, m4, m6
19284 movu [r0 + 103 * 16], m3
19287 movu m6, [r5 + 13 * 16]
19288 pmaddubsw m3, m0, m6
19290 pmaddubsw m5, m2, m6
19293 movu [r0 + 176 * 16], m3
19294 pmaddubsw m3, m1, m6
19296 pmaddubsw m5, m4, m6
19299 movu [r0 + 177 * 16], m3
19302 movu m6, [r5 + 15 * 16]
19303 pmaddubsw m3, m0, m6
19305 pmaddubsw m5, m2, m6
19308 movu [r0 + 252 * 16], m3
19309 pmaddubsw m3, m1, m6
19311 pmaddubsw m5, m4, m6
19314 movu [r0 + 253 * 16], m3
19317 movu m6, [r5 + 2 * 16]
19322 pmaddubsw m1, m0, m6
19328 pmaddubsw m3, m2, m6
19331 movu [r0 + 104 * 16], m1
19337 pmaddubsw m3, m1, m6
19343 pmaddubsw m5, m4, m6
19346 movu [r0 + 105 * 16], m3
19349 pmaddubsw m3, m0, m6
19351 pmaddubsw m5, m2, m6
19354 movu [r0 + 178 * 16], m3
19355 pmaddubsw m3, m1, m6
19357 pmaddubsw m5, m4, m6
19360 movu [r0 + 179 * 16], m3
19363 movu m6, [r5 + 23 * 16]
19364 pmaddubsw m3, m0, m6
19366 pmaddubsw m5, m2, m6
19369 movu [r0 + 180 * 16], m3
19370 pmaddubsw m3, m1, m6
19372 pmaddubsw m5, m4, m6
19375 movu [r0 + 181 * 16], m3
19378 movu m6, [r5 + 28 * 16]
19379 pmaddubsw m3, m0, m6
19381 pmaddubsw m5, m2, m6
19384 movu [r0 + 106 * 16], m3
19385 pmaddubsw m3, m1, m6
19387 pmaddubsw m5, m4, m6
19390 movu [r0 + 107 * 16], m3
19393 movu m6, [r5 + 22 * 16]
19398 pmaddubsw m1, m0, m6
19404 pmaddubsw m3, m2, m6
19407 movu [r0 + 108 * 16], m1
19413 pmaddubsw m3, m1, m6
19419 pmaddubsw m5, m4, m6
19422 movu [r0 + 109 * 16], m3
19425 movu m6, [r5 + 12 * 16]
19426 pmaddubsw m3, m0, m6
19428 pmaddubsw m5, m2, m6
19431 movu [r0 + 182 * 16], m3
19432 pmaddubsw m3, m1, m6
19434 pmaddubsw m5, m4, m6
19437 movu [r0 + 183 * 16], m3
19440 movu m6, [r5 + 16 * 16]
19445 pmaddubsw m1, m0, m6
19451 pmaddubsw m3, m2, m6
19454 movu [r0 + 110 * 16], m1
19460 pmaddubsw m3, m1, m6
19466 pmaddubsw m5, m4, m6
19469 movu [r0 + 111 * 16], m3
19472 movu m6, [r5 + 1 * 16]
19473 pmaddubsw m3, m0, m6
19475 pmaddubsw m5, m2, m6
19478 movu [r0 + 184 * 16], m3
19479 pmaddubsw m3, m1, m6
19481 pmaddubsw m5, m4, m6
19484 movu [r0 + 185 * 16], m3
19487 movu m6, [r5 + 22 * 16]
19488 pmaddubsw m3, m0, m6
19490 pmaddubsw m5, m2, m6
19493 movu [r0 + 186 * 16], m3
19494 pmaddubsw m3, m1, m6
19496 pmaddubsw m5, m4, m6
19499 movu [r0 + 187 * 16], m3
19502 movu m6, [r5 + 10 * 16]
19507 pmaddubsw m1, m0, m6
19513 pmaddubsw m3, m2, m6
19516 movu [r0 + 112 * 16], m1
19522 pmaddubsw m3, m1, m6
19528 pmaddubsw m5, m4, m6
19531 movu [r0 + 113 * 16], m3
19534 movu m6, [r5 + 11 * 16]
19535 pmaddubsw m3, m0, m6
19537 pmaddubsw m5, m2, m6
19540 movu [r0 + 188 * 16], m3
19541 pmaddubsw m3, m1, m6
19543 pmaddubsw m5, m4, m6
19546 movu [r0 + 189 * 16], m3
19549 movu m6, [r5 + 4 * 16]
19554 pmaddubsw m1, m0, m6
19560 pmaddubsw m3, m2, m6
19563 movu [r0 + 114 * 16], m1
19569 pmaddubsw m3, m1, m6
19575 pmaddubsw m5, m4, m6
19578 movu [r0 + 115 * 16], m3
19581 movu m6, [r5 + 30 * 16]
19582 pmaddubsw m3, m0, m6
19584 pmaddubsw m5, m2, m6
19587 movu [r0 + 116 * 16], m3
19588 pmaddubsw m3, m1, m6
19590 pmaddubsw m5, m4, m6
19593 movu [r0 + 117 * 16], m3
19596 movu m6, [r5 + 24 * 16]
19601 pmaddubsw m1, m0, m6
19607 pmaddubsw m3, m2, m6
19610 movu [r0 + 118 * 16], m1
19616 pmaddubsw m3, m1, m6
19622 pmaddubsw m5, m4, m6
19625 movu [r0 + 119 * 16], m3
19628 movu m6, [r5 + 18 * 16]
19633 pmaddubsw m1, m0, m6
19639 pmaddubsw m3, m2, m6
19642 movu [r0 + 120 * 16], m1
19648 pmaddubsw m3, m1, m6
19654 pmaddubsw m5, m4, m6
19657 movu [r0 + 121 * 16], m3
19660 movu m6, [r5 + 12 * 16]
19665 pmaddubsw m1, m0, m6
19671 pmaddubsw m3, m2, m6
19674 movu [r0 + 122 * 16], m1
19680 pmaddubsw m3, m1, m6
19686 pmaddubsw m5, m4, m6
19689 movu [r0 + 123 * 16], m3
19692 movu m6, [r5 + 6 * 16]
19697 pmaddubsw m1, m0, m6
19703 pmaddubsw m3, m2, m6
19706 movu [r0 + 124 * 16], m1
19712 pmaddubsw m3, m1, m6
19718 pmaddubsw m5, m4, m6
19721 movu [r0 + 125 * 16], m3
19726 movu [r0 + 512 * 16], m1
19727 movu [r0 + 513 * 16], m2
19728 movu [r0 + 514 * 16], m1
19729 movu [r0 + 515 * 16], m2
19730 movu [r0 + 516 * 16], m1
19731 movu [r0 + 517 * 16], m2
19732 movu [r0 + 518 * 16], m1
19733 movu [r0 + 519 * 16], m2
19734 movu [r0 + 520 * 16], m1
19735 movu [r0 + 521 * 16], m2
19736 movu [r0 + 522 * 16], m1
19737 movu [r0 + 523 * 16], m2
19738 movu [r0 + 524 * 16], m1
19739 movu [r0 + 525 * 16], m2
19740 movu [r0 + 526 * 16], m1
19741 movu [r0 + 527 * 16], m2
19743 movu [r0 + 528 * 16], m1
19744 movu [r0 + 529 * 16], m2
19745 movu [r0 + 530 * 16], m1
19746 movu [r0 + 531 * 16], m2
19747 movu [r0 + 532 * 16], m1
19748 movu [r0 + 533 * 16], m2
19749 movu [r0 + 534 * 16], m1
19750 movu [r0 + 535 * 16], m2
19751 movu [r0 + 536 * 16], m1
19752 movu [r0 + 537 * 16], m2
19753 movu [r0 + 538 * 16], m1
19754 movu [r0 + 539 * 16], m2
19755 movu [r0 + 540 * 16], m1
19756 movu [r0 + 541 * 16], m2
19757 movu [r0 + 542 * 16], m1
19758 movu [r0 + 543 * 16], m2
19760 movu [r0 + 544 * 16], m1
19761 movu [r0 + 545 * 16], m2
19762 movu [r0 + 546 * 16], m1
19763 movu [r0 + 547 * 16], m2
19764 movu [r0 + 548 * 16], m1
19765 movu [r0 + 549 * 16], m2
19766 movu [r0 + 550 * 16], m1
19767 movu [r0 + 551 * 16], m2
19768 movu [r0 + 552 * 16], m1
19769 movu [r0 + 553 * 16], m2
19770 movu [r0 + 554 * 16], m1
19771 movu [r0 + 555 * 16], m2
19772 movu [r0 + 556 * 16], m1
19773 movu [r0 + 557 * 16], m2
19774 movu [r0 + 558 * 16], m1
19775 movu [r0 + 559 * 16], m2
19777 movu [r0 + 560 * 16], m1
19778 movu [r0 + 561 * 16], m2
19779 movu [r0 + 562 * 16], m1
19780 movu [r0 + 563 * 16], m2
19781 movu [r0 + 564 * 16], m1
19782 movu [r0 + 565 * 16], m2
19783 movu [r0 + 566 * 16], m1
19784 movu [r0 + 567 * 16], m2
19785 movu [r0 + 568 * 16], m1
19786 movu [r0 + 569 * 16], m2
19787 movu [r0 + 570 * 16], m1
19788 movu [r0 + 571 * 16], m2
19789 movu [r0 + 572 * 16], m1
19790 movu [r0 + 573 * 16], m2
19791 movu [r0 + 574 * 16], m1
19792 movu [r0 + 575 * 16], m2
19797 ; mode 11 [row 15 - first half]
19798 movu [r0 + 606 * 16], m0
19800 movu [r0 + 606 * 16], m0
19804 pinsrb m6, [r3 + 26], 0
19805 pinsrb m6, [r3 + 19], 1
19806 pinsrb m6, [r3 + 13], 2
19807 pinsrb m6, [r3 + 6], 3
19808 movu [r0 + 702 * 16], m6
19810 movu [r0 + 703 * 16], m6
19814 pinsrb m6, [r3 + 16], 0
19815 movu [r0 + 638 * 16], m6
19817 movu [r0 + 639 * 16], m6
19822 pmaddubsw m1, m0, [r5 + 30 * 16]
19828 pmaddubsw m3, m2, [r5 + 30 * 16]
19831 movu [r0 + 576 * 16], m1
19835 ; mode 11 [row 15 - second half]
19836 movu [r0 + 607 * 16], m1
19841 pmaddubsw m3, m1, [r5 + 30 * 16]
19847 pmaddubsw m5, m4, [r5 + 30 * 16]
19850 movu [r0 + 577 * 16], m3
19853 pmaddubsw m3, m0, [r5 + 28 * 16]
19855 pmaddubsw m5, m2, [r5 + 28 * 16]
19858 movu [r0 + 578 * 16], m3
19859 pmaddubsw m3, m1, [r5 + 28 * 16]
19861 pmaddubsw m5, m4, [r5 + 28 * 16]
19864 movu [r0 + 579 * 16], m3
19867 pmaddubsw m3, m0, [r5 + 26 * 16]
19869 pmaddubsw m5, m2, [r5 + 26 * 16]
19872 movu [r0 + 580 * 16], m3
19873 pmaddubsw m3, m1, [r5 + 26 * 16]
19875 pmaddubsw m5, m4, [r5 + 26 * 16]
19878 movu [r0 + 581 * 16], m3
19881 pmaddubsw m3, m0, [r5 + 24 * 16]
19883 pmaddubsw m5, m2, [r5 + 24 * 16]
19886 movu [r0 + 582 * 16], m3
19887 pmaddubsw m3, m1, [r5 + 24 * 16]
19889 pmaddubsw m5, m4, [r5 + 24 * 16]
19892 movu [r0 + 583 * 16], m3
19895 pmaddubsw m3, m0, [r5 + 22 * 16]
19897 pmaddubsw m5, m2, [r5 + 22 * 16]
19900 movu [r0 + 584 * 16], m3
19902 ; mode 12 [row 1 - first half]
19903 movu [r0 + 642 * 16], m3
19905 pmaddubsw m3, m1, [r5 + 22 * 16]
19907 pmaddubsw m5, m4, [r5 + 22 * 16]
19910 movu [r0 + 585 * 16], m3
19912 ; mode 12 [row 1 - second half]
19913 movu [r0 + 643 * 16], m3
19916 pmaddubsw m3, m0, [r5 + 20 * 16]
19918 pmaddubsw m5, m2, [r5 + 20 * 16]
19921 movu [r0 + 586 * 16], m3
19922 pmaddubsw m3, m1, [r5 + 20 * 16]
19924 pmaddubsw m5, m4, [r5 + 20 * 16]
19927 movu [r0 + 587 * 16], m3
19930 pmaddubsw m3, m0, [r5 + 18 * 16]
19932 pmaddubsw m5, m2, [r5 + 18 * 16]
19935 movu [r0 + 588 * 16], m3
19936 pmaddubsw m3, m1, [r5 + 18 * 16]
19938 pmaddubsw m5, m4, [r5 + 18 * 16]
19941 movu [r0 + 589 * 16], m3
19944 pmaddubsw m3, m0, [r5 + 16 * 16]
19946 pmaddubsw m5, m2, [r5 + 16 * 16]
19949 movu [r0 + 590 * 16], m3
19950 pmaddubsw m3, m1, [r5 + 16 * 16]
19952 pmaddubsw m5, m4, [r5 + 16 * 16]
19955 movu [r0 + 591 * 16], m3
19958 pmaddubsw m3, m0, [r5 + 14 * 16]
19960 pmaddubsw m5, m2, [r5 + 14 * 16]
19963 movu [r0 + 592 * 16], m3
19965 ; mode 13 [row 1 - first half]
19966 movu [r0 + 706 * 16], m3
19968 pmaddubsw m3, m1, [r5 + 14 * 16]
19970 pmaddubsw m5, m4, [r5 + 14 * 16]
19973 movu [r0 + 593 * 16], m3
19975 ; mode 13 [row 1 - second half]
19976 movu [r0 + 707 * 16], m3
19979 pmaddubsw m3, m0, [r5 + 12 * 16]
19981 pmaddubsw m5, m2, [r5 + 12 * 16]
19984 movu [r0 + 594 * 16], m3
19986 ; mode 12 [row 3 - first half]
19987 movu [r0 + 646 * 16], m3
19989 pmaddubsw m3, m1, [r5 + 12 * 16]
19991 pmaddubsw m5, m4, [r5 + 12 * 16]
19994 movu [r0 + 595 * 16], m3
19996 ; mode 12 [row 3 - second half]
19997 movu [r0 + 647 * 16], m3
20000 pmaddubsw m3, m0, [r5 + 10 * 16]
20002 pmaddubsw m5, m2, [r5 + 10 * 16]
20005 movu [r0 + 596 * 16], m3
20006 pmaddubsw m3, m1, [r5 + 10 * 16]
20008 pmaddubsw m5, m4, [r5 + 10 * 16]
20011 movu [r0 + 597 * 16], m3
20014 pmaddubsw m3, m0, [r5 + 8 * 16]
20016 pmaddubsw m5, m2, [r5 + 8 * 16]
20019 movu [r0 + 598 * 16], m3
20020 pmaddubsw m3, m1, [r5 + 8 * 16]
20022 pmaddubsw m5, m4, [r5 + 8 * 16]
20025 movu [r0 + 599 * 16], m3
20028 pmaddubsw m3, m0, [r5 + 6 * 16]
20030 pmaddubsw m5, m2, [r5 + 6 * 16]
20033 movu [r0 + 600 * 16], m3
20035 ; mode 14 [row 1 - first half]
20036 movu [r0 + 770 * 16], m3
20038 pmaddubsw m3, m1, [r5 + 6 * 16]
20040 pmaddubsw m5, m4, [r5 + 6 * 16]
20043 movu [r0 + 601 * 16], m3
20045 ; mode 14 [row 1 - second half]
20046 movu [r0 + 771 * 16], m3
20049 pmaddubsw m3, m0, [r5 + 4 * 16]
20051 pmaddubsw m5, m2, [r5 + 4 * 16]
20054 movu [r0 + 602 * 16], m3
20055 pmaddubsw m3, m1, [r5 + 4 * 16]
20057 pmaddubsw m5, m4, [r5 + 4 * 16]
20060 movu [r0 + 603 * 16], m3
20063 pmaddubsw m3, m0, [r5 + 2 * 16]
20065 pmaddubsw m5, m2, [r5 + 2 * 16]
20068 movu [r0 + 604 * 16], m3
20070 ; mode 13 [row 5 - first half]
20071 movu [r0 + 650 * 16], m3
20073 pmaddubsw m3, m1, [r5 + 2 * 16]
20075 pmaddubsw m5, m4, [r5 + 2 * 16]
20078 movu [r0 + 605 * 16], m3
20080 ; mode 13 [row 5 - second half]
20081 movu [r0 + 651 * 16], m3
20084 pmaddubsw m3, m0, [r5 + 27 * 16]
20086 pmaddubsw m5, m2, [r5 + 27 * 16]
20089 movu [r0 + 640 * 16], m3
20090 pmaddubsw m3, m1, [r5 + 27 * 16]
20092 pmaddubsw m5, m4, [r5 + 27 * 16]
20095 movu [r0 + 641 * 16], m3
20098 pmaddubsw m3, m0, [r5 + 17 * 16]
20100 pmaddubsw m5, m2, [r5 + 17 * 16]
20103 movu [r0 + 644 * 16], m3
20104 pmaddubsw m3, m1, [r5 + 17 * 16]
20106 pmaddubsw m5, m4, [r5 + 17 * 16]
20109 movu [r0 + 645 * 16], m3
20112 pmaddubsw m3, m0, [r5 + 7 * 16]
20114 pmaddubsw m5, m2, [r5 + 7 * 16]
20117 movu [r0 + 648 * 16], m3
20118 pmaddubsw m3, m1, [r5 + 7 * 16]
20120 pmaddubsw m5, m4, [r5 + 7 * 16]
20123 movu [r0 + 649 * 16], m3
20126 pmaddubsw m3, m0, [r5 + 23 * 16]
20128 pmaddubsw m5, m2, [r5 + 23 * 16]
20131 movu [r0 + 704 * 16], m3
20132 pmaddubsw m3, m1, [r5 + 23 * 16]
20134 pmaddubsw m5, m4, [r5 + 23 * 16]
20137 movu [r0 + 705 * 16], m3
20140 pmaddubsw m3, m0, [r5 + 5 * 16]
20142 pmaddubsw m5, m2, [r5 + 5 * 16]
20145 movu [r0 + 708 * 16], m3
20146 pmaddubsw m3, m1, [r5 + 5 * 16]
20148 pmaddubsw m5, m4, [r5 + 5 * 16]
20151 movu [r0 + 709 * 16], m3
20154 pmaddubsw m3, m0, [r5 + 19 * 16]
20156 pmaddubsw m5, m2, [r5 + 19 * 16]
20159 movu [r0 + 768 * 16], m3
20160 pmaddubsw m3, m1, [r5 + 19 * 16]
20162 pmaddubsw m5, m4, [r5 + 19 * 16]
20165 movu [r0 + 769 * 16], m3
20168 pmaddubsw m3, m0, [r5 + 15 * 16]
20170 pmaddubsw m5, m2, [r5 + 15 * 16]
20173 movu [r0 + 832 * 16], m3
20174 pmaddubsw m3, m1, [r5 + 15 * 16]
20176 pmaddubsw m5, m4, [r5 + 15 * 16]
20179 movu [r0 + 833 * 16], m3
20183 pinsrb m0, [r4 + 0], 1
20184 pinsrb m0, [r3 + 16], 0
20185 pmaddubsw m3, m0, [r5 + 30 * 16]
20188 pinsrb m2, [r4 + 8], 1
20189 pinsrb m2, [r4 + 7], 0
20190 pmaddubsw m5, m2, [r5 + 30 * 16]
20193 movu [r0 + 608 * 16], m3
20195 pinsrb m1, [r4 + 16], 1
20196 pinsrb m1, [r4 + 15], 0
20197 pmaddubsw m3, m1, [r5 + 30 * 16]
20200 pinsrb m4, [r4 + 24], 1
20201 pinsrb m4, [r4 + 23], 0
20202 pmaddubsw m5, m4, [r5 + 30 * 16]
20205 movu [r0 + 609 * 16], m3
20208 pmaddubsw m3, m0, [r5 + 28 * 16]
20210 pmaddubsw m5, m2, [r5 + 28 * 16]
20213 movu [r0 + 610 * 16], m3
20214 pmaddubsw m3, m1, [r5 + 28 * 16]
20216 pmaddubsw m5, m4, [r5 + 28 * 16]
20219 movu [r0 + 611 * 16], m3
20222 pmaddubsw m3, m0, [r5 + 26 * 16]
20224 pmaddubsw m5, m2, [r5 + 26 * 16]
20227 movu [r0 + 612 * 16], m3
20228 pmaddubsw m3, m1, [r5 + 26 * 16]
20230 pmaddubsw m5, m4, [r5 + 26 * 16]
20233 movu [r0 + 613 * 16], m3
20236 pmaddubsw m3, m0, [r5 + 24 * 16]
20238 pmaddubsw m5, m2, [r5 + 24 * 16]
20241 movu [r0 + 614 * 16], m3
20242 pmaddubsw m3, m1, [r5 + 24 * 16]
20244 pmaddubsw m5, m4, [r5 + 24 * 16]
20247 movu [r0 + 615 * 16], m3
20250 pmaddubsw m3, m0, [r5 + 22 * 16]
20252 pmaddubsw m5, m2, [r5 + 22 * 16]
20255 movu [r0 + 616 * 16], m3
20256 pmaddubsw m3, m1, [r5 + 22 * 16]
20258 pmaddubsw m5, m4, [r5 + 22 * 16]
20261 movu [r0 + 617 * 16], m3
20264 pmaddubsw m3, m0, [r5 + 20 * 16]
20266 pmaddubsw m5, m2, [r5 + 20 * 16]
20269 movu [r0 + 618 * 16], m3
20270 pmaddubsw m3, m1, [r5 + 20 * 16]
20272 pmaddubsw m5, m4, [r5 + 20 * 16]
20275 movu [r0 + 619 * 16], m3
20278 pmaddubsw m3, m0, [r5 + 18 * 16]
20280 pmaddubsw m5, m2, [r5 + 18 * 16]
20283 movu [r0 + 620 * 16], m3
20284 pmaddubsw m3, m1, [r5 + 18 * 16]
20286 pmaddubsw m5, m4, [r5 + 18 * 16]
20289 movu [r0 + 621 * 16], m3
20292 pmaddubsw m3, m0, [r5 + 16 * 16]
20294 pmaddubsw m5, m2, [r5 + 16 * 16]
20297 movu [r0 + 622 * 16], m3
20298 pmaddubsw m3, m1, [r5 + 16 * 16]
20300 pmaddubsw m5, m4, [r5 + 16 * 16]
20303 movu [r0 + 623 * 16], m3
20306 pmaddubsw m3, m0, [r5 + 14 * 16]
20308 pmaddubsw m5, m2, [r5 + 14 * 16]
20311 movu [r0 + 624 * 16], m3
20312 pmaddubsw m3, m1, [r5 + 14 * 16]
20314 pmaddubsw m5, m4, [r5 + 14 * 16]
20317 movu [r0 + 625 * 16], m3
20320 pmaddubsw m3, m0, [r5 + 12 * 16]
20322 pmaddubsw m5, m2, [r5 + 12 * 16]
20325 movu [r0 + 626 * 16], m3
20326 pmaddubsw m3, m1, [r5 + 12 * 16]
20328 pmaddubsw m5, m4, [r5 + 12 * 16]
20331 movu [r0 + 627 * 16], m3
20334 pmaddubsw m3, m0, [r5 + 10 * 16]
20336 pmaddubsw m5, m2, [r5 + 10 * 16]
20339 movu [r0 + 628 * 16], m3
20340 pmaddubsw m3, m1, [r5 + 10 * 16]
20342 pmaddubsw m5, m4, [r5 + 10 * 16]
20345 movu [r0 + 629 * 16], m3
20348 pmaddubsw m3, m0, [r5 + 8 * 16]
20350 pmaddubsw m5, m2, [r5 + 8 * 16]
20353 movu [r0 + 630 * 16], m3
20354 pmaddubsw m3, m1, [r5 + 8 * 16]
20356 pmaddubsw m5, m4, [r5 + 8 * 16]
20359 movu [r0 + 631 * 16], m3
20362 pmaddubsw m3, m0, [r5 + 6 * 16]
20364 pmaddubsw m5, m2, [r5 + 6 * 16]
20367 movu [r0 + 632 * 16], m3
20368 pmaddubsw m3, m1, [r5 + 6 * 16]
20370 pmaddubsw m5, m4, [r5 + 6 * 16]
20373 movu [r0 + 633 * 16], m3
20376 pmaddubsw m3, m0, [r5 + 4 * 16]
20378 pmaddubsw m5, m2, [r5 + 4 * 16]
20381 movu [r0 + 634 * 16], m3
20382 pmaddubsw m3, m1, [r5 + 4 * 16]
20384 pmaddubsw m5, m4, [r5 + 4 * 16]
20387 movu [r0 + 635 * 16], m3
20390 pmaddubsw m3, m0, [r5 + 2 * 16]
20392 pmaddubsw m5, m2, [r5 + 2 * 16]
20395 movu [r0 + 636 * 16], m3
20396 pmaddubsw m3, m1, [r5 + 2 * 16]
20398 pmaddubsw m5, m4, [r5 + 2 * 16]
20401 movu [r0 + 637 * 16], m3
20404 pinsrb m0, [r3 + 6], 0
20405 pmaddubsw m3, m0, [r5 + 29 * 16]
20407 pmaddubsw m5, m2, [r5 + 29 * 16]
20410 movu [r0 + 652 * 16], m3
20411 pmaddubsw m3, m1, [r5 + 29 * 16]
20413 pmaddubsw m5, m4, [r5 + 29 * 16]
20416 movu [r0 + 653 * 16], m3
20419 pmaddubsw m3, m0, [r5 + 24 * 16]
20421 pmaddubsw m5, m2, [r5 + 24 * 16]
20424 movu [r0 + 654 * 16], m3
20425 pmaddubsw m3, m1, [r5 + 24 * 16]
20427 pmaddubsw m5, m4, [r5 + 24 * 16]
20430 movu [r0 + 655 * 16], m3
20433 pmaddubsw m3, m0, [r5 + 19 * 16]
20435 pmaddubsw m5, m2, [r5 + 19 * 16]
20438 movu [r0 + 656 * 16], m3
20439 pmaddubsw m3, m1, [r5 + 19 * 16]
20441 pmaddubsw m5, m4, [r5 + 19 * 16]
20444 movu [r0 + 657 * 16], m3
20447 pmaddubsw m3, m0, [r5 + 14 * 16]
20449 pmaddubsw m5, m2, [r5 + 14 * 16]
20452 movu [r0 + 658 * 16], m3
20453 pmaddubsw m3, m1, [r5 + 14 * 16]
20455 pmaddubsw m5, m4, [r5 + 14 * 16]
20458 movu [r0 + 659 * 16], m3
20461 pmaddubsw m3, m0, [r5 + 9 * 16]
20463 pmaddubsw m5, m2, [r5 + 9 * 16]
20466 movu [r0 + 660 * 16], m3
20467 pmaddubsw m3, m1, [r5 + 9 * 16]
20469 pmaddubsw m5, m4, [r5 + 9 * 16]
20472 movu [r0 + 661 * 16], m3
20475 pmaddubsw m3, m0, [r5 + 4 * 16]
20477 pmaddubsw m5, m2, [r5 + 4 * 16]
20480 movu [r0 + 662 * 16], m3
20481 pmaddubsw m3, m1, [r5 + 4 * 16]
20483 pmaddubsw m5, m4, [r5 + 4 * 16]
20486 movu [r0 + 663 * 16], m3
20490 pinsrb m6, [r3 + 4], 0
20491 pmaddubsw m3, m6, [r5 + 28 * 16]
20493 pmaddubsw m5, m2, [r5 + 28 * 16]
20496 movu [r0 + 710 * 16], m3
20497 pmaddubsw m3, m1, [r5 + 28 * 16]
20499 pmaddubsw m5, m4, [r5 + 28 * 16]
20502 movu [r0 + 711 * 16], m3
20505 pmaddubsw m3, m6, [r5 + 19 * 16]
20507 pmaddubsw m5, m2, [r5 + 19 * 16]
20510 movu [r0 + 712 * 16], m3
20511 pmaddubsw m3, m1, [r5 + 19 * 16]
20513 pmaddubsw m5, m4, [r5 + 19 * 16]
20516 movu [r0 + 713 * 16], m3
20519 pmaddubsw m3, m6, [r5 + 10 * 16]
20521 pmaddubsw m5, m2, [r5 + 10 * 16]
20524 movu [r0 + 714 * 16], m3
20525 pmaddubsw m3, m1, [r5 + 10 * 16]
20527 pmaddubsw m5, m4, [r5 + 10 * 16]
20530 movu [r0 + 715 * 16], m3
20533 pmaddubsw m3, m6, [r5 + 1 * 16]
20535 pmaddubsw m5, m2, [r5 + 1 * 16]
20538 movu [r0 + 716 * 16], m3
20539 pmaddubsw m3, m1, [r5 + 1 * 16]
20541 pmaddubsw m5, m4, [r5 + 1 * 16]
20544 movu [r0 + 717 * 16], m3
20548 pinsrb m6, [r4 + 0], 1
20549 pinsrb m6, [r3 + 2], 0
20550 pmaddubsw m3, m6, [r5 + 25 * 16]
20552 pmaddubsw m5, m2, [r5 + 25 * 16]
20555 movu [r0 + 772 * 16], m3
20556 pmaddubsw m3, m1, [r5 + 25 * 16]
20558 pmaddubsw m5, m4, [r5 + 25 * 16]
20561 movu [r0 + 773 * 16], m3
20564 pmaddubsw m3, m6, [r5 + 12 * 16]
20566 pmaddubsw m5, m2, [r5 + 12 * 16]
20569 movu [r0 + 774 * 16], m3
20570 pmaddubsw m3, m1, [r5 + 12 * 16]
20572 pmaddubsw m5, m4, [r5 + 12 * 16]
20575 movu [r0 + 775 * 16], m3
20578 pmaddubsw m3, m6, [r5 + 30 * 16]
20580 pmaddubsw m5, m2, [r5 + 30 * 16]
20583 movu [r0 + 834 * 16], m3
20584 pmaddubsw m3, m1, [r5 + 30 * 16]
20586 pmaddubsw m5, m4, [r5 + 30 * 16]
20589 movu [r0 + 835 * 16], m3
20592 pmaddubsw m3, m6, [r5 + 13 * 16]
20594 pmaddubsw m5, m2, [r5 + 13 * 16]
20597 movu [r0 + 836 * 16], m3
20598 pmaddubsw m3, m1, [r5 + 13 * 16]
20600 pmaddubsw m5, m4, [r5 + 13 * 16]
20603 movu [r0 + 837 * 16], m3
20607 pinsrb m6, [r3 + 2], 1
20608 pinsrb m6, [r3 + 4], 0
20609 pmaddubsw m3, m6, [r5 + 28 * 16]
20612 pinsrb m2, [r4 + 7], 1
20613 pinsrb m2, [r4 + 6], 0
20614 pmaddubsw m5, m2, [r5 + 28 * 16]
20617 movu [r0 + 838 * 16], m3
20619 pinsrb m1, [r4 + 15], 1
20620 pinsrb m1, [r4 + 14], 0
20621 pmaddubsw m3, m1, [r5 + 28 * 16]
20624 pinsrb m4, [r4 + 23], 1
20625 pinsrb m4, [r4 + 22], 0
20626 pmaddubsw m5, m4, [r5 + 28 * 16]
20629 movu [r0 + 839 * 16], m3
20632 pmaddubsw m3, m6, [r5 + 11 * 16]
20634 pmaddubsw m5, m2, [r5 + 11 * 16]
20637 movu [r0 + 840 * 16], m3
20638 pmaddubsw m3, m1, [r5 + 11 * 16]
20640 pmaddubsw m5, m4, [r5 + 11 * 16]
20643 movu [r0 + 841 * 16], m3
20645 ; mode 15 [row 5, 0-7]
20647 pinsrb m6, [r3 + 4], 1
20648 pinsrb m6, [r3 + 6], 0
20649 pmaddubsw m3, m6, [r5 + 26 * 16]
20652 movh [r0 + 842 * 16], m3
20654 ; mode 15 [row 6, 0-7]
20655 pmaddubsw m3, m6, [r5 + 9 * 16]
20658 movh [r0 + 844 * 16], m3
20660 ; mode 15 [row 7, 0-7]
20662 pinsrb m6, [r3 + 6], 1
20663 pinsrb m6, [r3 + 8], 0
20664 pmaddubsw m3, m6, [r5 + 24 * 16]
20667 movh [r0 + 846 * 16], m3
20669 ; mode 15 [row 8, 0-7]
20670 pmaddubsw m3, m6, [r5 + 7 * 16]
20673 movh [r0 + 848 * 16], m3
20675 ; mode 15 [row 9, 0-7]
20677 pinsrb m6, [r3 + 8], 1
20678 pinsrb m6, [r3 + 9], 0
20679 pmaddubsw m3, m6, [r5 + 22 * 16]
20682 movh [r0 + 850 * 16], m3
20684 ; mode 15 [row 10, 0-7]
20685 pmaddubsw m3, m6, [r5 + 5 * 16]
20688 movh [r0 + 852 * 16], m3
20690 ; mode 15 [row 11, 0-7]
20692 pinsrb m6, [r3 + 9], 1
20693 pinsrb m6, [r3 + 11], 0
20694 pmaddubsw m3, m6, [r5 + 20 * 16]
20697 movh [r0 + 854 * 16], m3
20699 ; mode 15 [row 12, 0-7]
20700 pmaddubsw m3, m6, [r5 + 3 * 16]
20703 movh [r0 + 856 * 16], m3
20705 ; mode 15 [row 13, 0-7]
20707 pinsrb m6, [r3 + 11], 1
20708 pinsrb m6, [r3 + 13], 0
20709 pmaddubsw m3, m6, [r5 + 18 * 16]
20712 movh [r0 + 858 * 16], m3
20714 ; mode 15 [row 14, 0-7]
20715 pmaddubsw m3, m6, [r5 + 1 * 16]
20718 movh [r0 + 860 * 16], m3
20720 ; mode 15 [row 15, 0-7]
20722 pinsrb m6, [r3 + 13], 1
20723 pinsrb m6, [r3 + 15], 0
20724 pmaddubsw m3, m6, [r5 + 16 * 16]
20727 movh [r0 + 862 * 16], m3
20729 ; mode 15 [row 16, 0-7]
20731 pinsrb m6, [r3 + 15], 1
20732 pinsrb m6, [r3 + 17], 0
20733 pmaddubsw m3, m6, [r5 + 31 * 16]
20736 movh [r0 + 864 * 16], m3
20738 ; mode 15 [row 17, 0-7]
20739 pmaddubsw m3, m6, [r5 + 14 * 16]
20742 movh [r0 + 866 * 16], m3
20744 ; mode 15 [row 18, 0-7]
20746 pinsrb m6, [r3 + 17], 1
20747 pinsrb m6, [r3 + 19], 0
20748 pmaddubsw m3, m6, [r5 + 29 * 16]
20751 movh [r0 + 868 * 16], m3
20753 ; mode 15 [row 19, 0-7]
20754 pmaddubsw m3, m6, [r5 + 12 * 16]
20757 movh [r0 + 870 * 16], m3
20759 ; mode 15 [row 20, 0-7]
20761 pinsrb m6, [r3 + 19], 1
20762 pinsrb m6, [r3 + 21], 0
20763 pmaddubsw m3, m6, [r5 + 27 * 16]
20766 movh [r0 + 872 * 16], m3
20768 ; mode 15 [row 21, 0-7]
20769 pmaddubsw m3, m6, [r5 + 10 * 16]
20772 movh [r0 + 874 * 16], m3
20774 ; mode 15 [row 22, 0-7]
20776 pinsrb m6, [r3 + 21], 1
20777 pinsrb m6, [r3 + 23], 0
20778 pmaddubsw m3, m6, [r5 + 25 * 16]
20781 movh [r0 + 876 * 16], m3
20783 ; mode 15 [row 23, 0-7]
20784 pmaddubsw m3, m6, [r5 + 8 * 16]
20787 movh [r0 + 878 * 16], m3
20789 ; mode 15 [row 24, 0-7]
20791 pinsrb m6, [r3 + 23], 1
20792 pinsrb m6, [r3 + 24], 0
20793 pmaddubsw m3, m6, [r5 + 23 * 16]
20796 movh [r0 + 880 * 16], m3
20798 ; mode 15 [row 25, 0-7]
20799 pmaddubsw m3, m6, [r5 + 6 * 16]
20802 movh [r0 + 882 * 16], m3
20804 ; mode 15 [row 26, 0-7]
20806 pinsrb m6, [r3 + 24], 1
20807 pinsrb m6, [r3 + 26], 0
20808 pmaddubsw m3, m6, [r5 + 21 * 16]
20811 movh [r0 + 884 * 16], m3
20813 ; mode 15 [row 27, 0-7]
20814 pmaddubsw m3, m6, [r5 + 4 * 16]
20817 movh [r0 + 886 * 16], m3
20819 ; mode 15 [row 28, 0-7]
20821 pinsrb m6, [r3 + 26], 1
20822 pinsrb m6, [r3 + 28], 0
20823 pmaddubsw m3, m6, [r5 + 19 * 16]
20826 movh [r0 + 888 * 16], m3
20828 ; mode 15 [row 29, 0-7]
20829 pmaddubsw m3, m6, [r5 + 2 * 16]
20832 movh [r0 + 890 * 16], m3
20834 ; mode 15 [row 30, 0-7]
20836 pinsrb m6, [r3 + 28], 1
20837 pinsrb m6, [r3 + 30], 0
20838 pmaddubsw m3, m6, [r5 + 17 * 16]
20841 movh [r0 + 892 * 16], m3
20843 ; mode 15 [row 31, 0-7]
20844 pshufb m3, m6, [tab_S2]
20845 movh [r0 + 894 * 16], m3
20849 pinsrb m0, [r3 + 6], 1
20850 pinsrb m0, [r3 + 13], 0
20851 pmaddubsw m3, m0, [r5 + 31 * 16]
20853 pmaddubsw m5, m2, [r5 + 31 * 16]
20856 movu [r0 + 664 * 16], m3
20857 pmaddubsw m3, m1, [r5 + 31 * 16]
20859 pmaddubsw m5, m4, [r5 + 31 * 16]
20862 movu [r0 + 665 * 16], m3
20865 pmaddubsw m3, m0, [r5 + 26 * 16]
20867 pmaddubsw m5, m2, [r5 + 26 * 16]
20870 movu [r0 + 666 * 16], m3
20871 pmaddubsw m3, m1, [r5 + 26 * 16]
20873 pmaddubsw m5, m4, [r5 + 26 * 16]
20876 movu [r0 + 667 * 16], m3
20879 pmaddubsw m3, m0, [r5 + 21 * 16]
20881 pmaddubsw m5, m2, [r5 + 21 * 16]
20884 movu [r0 + 668 * 16], m3
20885 pmaddubsw m3, m1, [r5 + 21 * 16]
20887 pmaddubsw m5, m4, [r5 + 21 * 16]
20890 movu [r0 + 669 * 16], m3
20893 pmaddubsw m3, m0, [r5 + 16 * 16]
20895 pmaddubsw m5, m2, [r5 + 16 * 16]
20898 movu [r0 + 670 * 16], m3
20899 pmaddubsw m3, m1, [r5 + 16 * 16]
20901 pmaddubsw m5, m4, [r5 + 16 * 16]
20904 movu [r0 + 671 * 16], m3
20907 pmaddubsw m3, m0, [r5 + 11 * 16]
20909 pmaddubsw m5, m2, [r5 + 11 * 16]
20912 movu [r0 + 672 * 16], m3
20913 pmaddubsw m3, m1, [r5 + 11 * 16]
20915 pmaddubsw m5, m4, [r5 + 11 * 16]
20918 movu [r0 + 673 * 16], m3
20921 pmaddubsw m3, m0, [r5 + 6 * 16]
20923 pmaddubsw m5, m2, [r5 + 6 * 16]
20926 movu [r0 + 674 * 16], m3
20927 pmaddubsw m3, m1, [r5 + 6 * 16]
20929 pmaddubsw m5, m4, [r5 + 6 * 16]
20932 movu [r0 + 675 * 16], m3
20935 pmaddubsw m3, m0, [r5 + 1 * 16]
20937 pmaddubsw m5, m2, [r5 + 1 * 16]
20940 movu [r0 + 676 * 16], m3
20941 pmaddubsw m3, m1, [r5 + 1 * 16]
20943 pmaddubsw m5, m4, [r5 + 1 * 16]
20946 movu [r0 + 677 * 16], m3
20950 pinsrb m6, [r3 + 4], 2
20951 pinsrb m6, [r3 + 4], 1
20952 pinsrb m6, [r3 + 7], 0
20953 pmaddubsw m3, m6, [r5 + 24 * 16]
20955 pmaddubsw m5, m2, [r5 + 24 * 16]
20958 movu [r0 + 718 * 16], m3
20959 pmaddubsw m3, m1, [r5 + 24 * 16]
20961 pmaddubsw m5, m4, [r5 + 24 * 16]
20964 movu [r0 + 719 * 16], m3
20967 pmaddubsw m3, m6, [r5 + 15 * 16]
20969 pmaddubsw m5, m2, [r5 + 15 * 16]
20972 movu [r0 + 720 * 16], m3
20973 pmaddubsw m3, m1, [r5 + 15 * 16]
20975 pmaddubsw m5, m4, [r5 + 15 * 16]
20978 movu [r0 + 721 * 16], m3
20981 pmaddubsw m3, m6, [r5 + 6 * 16]
20983 pmaddubsw m5, m2, [r5 + 6 * 16]
20986 movu [r0 + 722 * 16], m3
20987 pmaddubsw m3, m1, [r5 + 6 * 16]
20989 pmaddubsw m5, m4, [r5 + 6 * 16]
20992 movu [r0 + 723 * 16], m3
20995 pinsrb m6, [r3 + 2], 2
20996 pinsrb m6, [r3 + 2], 1
20997 pinsrb m6, [r3 + 5], 0
20998 pmaddubsw m3, m6, [r5 + 31 * 16]
21000 pmaddubsw m5, m2, [r5 + 31 * 16]
21003 movu [r0 + 776 * 16], m3
21004 pmaddubsw m3, m1, [r5 + 31 * 16]
21006 pmaddubsw m5, m4, [r5 + 31 * 16]
21009 movu [r0 + 777 * 16], m3
21012 pmaddubsw m3, m6, [r5 + 18 * 16]
21014 pmaddubsw m5, m2, [r5 + 18 * 16]
21017 movu [r0 + 778 * 16], m3
21018 pmaddubsw m3, m1, [r5 + 18 * 16]
21020 pmaddubsw m5, m4, [r5 + 18 * 16]
21023 movu [r0 + 779 * 16], m3
21026 pmaddubsw m3, m6, [r5 + 5 * 16]
21028 pmaddubsw m5, m2, [r5 + 5 * 16]
21031 movu [r0 + 780 * 16], m3
21032 pmaddubsw m3, m1, [r5 + 5 * 16]
21034 pmaddubsw m5, m4, [r5 + 5 * 16]
21037 movu [r0 + 781 * 16], m3
21041 pinsrb m6, [r3 + 5], 1
21042 pinsrb m6, [r3 + 7], 0
21043 pmaddubsw m3, m6, [r5 + 24 * 16]
21046 pinsrw m2, [r4 + 5], 0
21047 pmaddubsw m5, m2, [r5 + 24 * 16]
21050 movu [r0 + 782 * 16], m3
21052 pinsrw m1, [r4 + 13], 0
21053 pmaddubsw m3, m1, [r5 + 24 * 16]
21056 pinsrw m4, [r4 + 21], 0
21057 pmaddubsw m5, m4, [r5 + 24 * 16]
21060 movu [r0 + 783 * 16], m3
21063 pmaddubsw m3, m6, [r5 + 11 * 16]
21065 pmaddubsw m5, m2, [r5 + 11 * 16]
21068 movu [r0 + 784 * 16], m3
21069 pmaddubsw m3, m1, [r5 + 11 * 16]
21071 pmaddubsw m5, m4, [r5 + 11 * 16]
21074 movu [r0 + 785 * 16], m3
21076 ; mode 15 [row 5, 8-31]
21077 pmaddubsw m5, m2, [r5 + 26 * 16]
21080 movh [r0 + 842 * 16 + 8], m5
21081 pmaddubsw m3, m1, [r5 + 26 * 16]
21083 pmaddubsw m5, m4, [r5 + 26 * 16]
21086 movu [r0 + 843 * 16], m3
21088 ; mode 15 [row 6, 8-31]
21089 pmaddubsw m5, m2, [r5 + 9 * 16]
21092 movh [r0 + 844 * 16 + 8], m5
21093 pmaddubsw m3, m1, [r5 + 9 * 16]
21095 pmaddubsw m5, m4, [r5 + 9 * 16]
21098 movu [r0 + 845 * 16], m3
21102 pinsrb m0, [r3 + 13], 1
21103 pinsrb m0, [r3 + 19], 0
21104 pmaddubsw m3, m0, [r5 + 28 * 16]
21106 pmaddubsw m5, m2, [r5 + 28 * 16]
21109 movu [r0 + 678 * 16], m3
21110 pmaddubsw m3, m1, [r5 + 28 * 16]
21112 pmaddubsw m5, m4, [r5 + 28 * 16]
21115 movu [r0 + 679 * 16], m3
21118 pmaddubsw m3, m0, [r5 + 23 * 16]
21120 pmaddubsw m5, m2, [r5 + 23 * 16]
21123 movu [r0 + 680 * 16], m3
21124 pmaddubsw m3, m1, [r5 + 23 * 16]
21126 pmaddubsw m5, m4, [r5 + 23 * 16]
21129 movu [r0 + 681 * 16], m3
21132 pmaddubsw m3, m0, [r5 + 18 * 16]
21134 pmaddubsw m5, m2, [r5 + 18 * 16]
21137 movu [r0 + 682 * 16], m3
21138 pmaddubsw m3, m1, [r5 + 18 * 16]
21140 pmaddubsw m5, m4, [r5 + 18 * 16]
21143 movu [r0 + 683 * 16], m3
21146 pmaddubsw m3, m0, [r5 + 13 * 16]
21148 pmaddubsw m5, m2, [r5 + 13 * 16]
21151 movu [r0 + 684 * 16], m3
21152 pmaddubsw m3, m1, [r5 + 13 * 16]
21154 pmaddubsw m5, m4, [r5 + 13 * 16]
21157 movu [r0 + 685 * 16], m3
21160 pmaddubsw m3, m0, [r5 + 8 * 16]
21162 pmaddubsw m5, m2, [r5 + 8 * 16]
21165 movu [r0 + 686 * 16], m3
21166 pmaddubsw m3, m1, [r5 + 8 * 16]
21168 pmaddubsw m5, m4, [r5 + 8 * 16]
21171 movu [r0 + 687 * 16], m3
21174 pmaddubsw m3, m0, [r5 + 3 * 16]
21176 pmaddubsw m5, m2, [r5 + 3 * 16]
21179 movu [r0 + 688 * 16], m3
21180 pmaddubsw m3, m1, [r5 + 3 * 16]
21182 pmaddubsw m5, m4, [r5 + 3 * 16]
21185 movu [r0 + 689 * 16], m3
21190 pinsrb m6, [r3 + 4], 4
21191 pinsrb m6, [r3 + 4], 3
21192 pinsrb m6, [r3 + 7], 2
21193 pinsrb m6, [r3 + 7], 1
21194 pinsrb m6, [r3 + 11], 0
21195 pmaddubsw m3, m6, [r5 + 29 * 16]
21196 pmulhrsw m3, [pw_1024]
21197 pmaddubsw m5, m2, [r5 + 29 * 16]
21198 pmulhrsw m5, [pw_1024]
21200 movu [r0 + 724 * 16], m3
21201 pmaddubsw m3, m1, [r5 + 29 * 16]
21202 pmulhrsw m3, [pw_1024]
21203 pmaddubsw m5, m4, [r5 + 29 * 16]
21204 pmulhrsw m5, [pw_1024]
21206 movu [r0 + 725 * 16], m3
21209 pmaddubsw m3, m6, [r5 + 20 * 16]
21210 pmulhrsw m3, [pw_1024]
21211 pmaddubsw m5, m2, [r5 + 20 * 16]
21212 pmulhrsw m5, [pw_1024]
21214 movu [r0 + 726 * 16], m3
21215 pmaddubsw m3, m1, [r5 + 20 * 16]
21216 pmulhrsw m3, [pw_1024]
21217 pmaddubsw m5, m4, [r5 + 20 * 16]
21218 pmulhrsw m5, [pw_1024]
21220 movu [r0 + 727 * 16], m3
21223 pmaddubsw m3, m6, [r5 + 11 * 16]
21224 pmulhrsw m3, [pw_1024]
21225 pmaddubsw m5, m2, [r5 + 11 * 16]
21226 pmulhrsw m5, [pw_1024]
21228 movu [r0 + 728 * 16], m3
21229 pmaddubsw m3, m1, [r5 + 11 * 16]
21230 pmulhrsw m3, [pw_1024]
21231 pmaddubsw m5, m4, [r5 + 11 * 16]
21232 pmulhrsw m5, [pw_1024]
21234 movu [r0 + 729 * 16], m3
21237 pmaddubsw m3, m6, [r5 + 2 * 16]
21238 pmulhrsw m3, [pw_1024]
21239 pmaddubsw m5, m2, [r5 + 2 * 16]
21240 pmulhrsw m5, [pw_1024]
21242 movu [r0 + 730 * 16], m3
21243 pmaddubsw m3, m1, [r5 + 2 * 16]
21244 pmulhrsw m3, [pw_1024]
21245 pmaddubsw m5, m4, [r5 + 2 * 16]
21246 pmulhrsw m5, [pw_1024]
21248 movu [r0 + 731 * 16], m3
21252 pinsrb m7, [r3 + 7], 1
21253 pinsrb m7, [r3 + 10], 0
21254 pmaddubsw m3, m7, [r5 + 30 * 16]
21255 pmulhrsw m3, [pw_1024]
21257 pinsrw m2, [r4 + 4], 0
21258 pmaddubsw m5, m2, [r5 + 30 * 16]
21259 pmulhrsw m5, [pw_1024]
21261 movu [r0 + 786 * 16], m3
21263 pinsrw m1, [r4 + 12], 0
21264 pmaddubsw m3, m1, [r5 + 30 * 16]
21265 pmulhrsw m3, [pw_1024]
21267 pinsrb m4, [r4 + 21], 1
21268 pinsrb m4, [r4 + 20], 0
21269 pmaddubsw m5, m4, [r5 + 30 * 16]
21270 pmulhrsw m5, [pw_1024]
21272 movu [r0 + 787 * 16], m3
21275 pmaddubsw m3, m7, [r5 + 17 * 16]
21276 pmulhrsw m3, [pw_1024]
21277 pmaddubsw m5, m2, [r5 + 17 * 16]
21278 pmulhrsw m5, [pw_1024]
21280 movu [r0 + 788 * 16], m3
21281 pmaddubsw m3, m1, [r5 + 17 * 16]
21282 pmulhrsw m3, [pw_1024]
21283 pmaddubsw m5, m4, [r5 + 17 * 16]
21284 pmulhrsw m5, [pw_1024]
21286 movu [r0 + 789 * 16], m3
21289 pmaddubsw m3, m7, [r5 + 4 * 16]
21290 pmulhrsw m3, [pw_1024]
21291 pmaddubsw m5, m2, [r5 + 4 * 16]
21292 pmulhrsw m5, [pw_1024]
21294 movu [r0 + 790 * 16], m3
21295 pmaddubsw m3, m1, [r5 + 4 * 16]
21296 pmulhrsw m3, [pw_1024]
21297 pmaddubsw m5, m4, [r5 + 4 * 16]
21298 pmulhrsw m5, [pw_1024]
21300 movu [r0 + 791 * 16], m3
21304 ; mode 15 [row 7, 8-31]
21305 pmaddubsw m5, m2, [r5 + 24 * 16]
21308 movh [r0 + 846 * 16 + 8], m5
21309 pmaddubsw m3, m1, [r5 + 24 * 16]
21311 pmaddubsw m5, m4, [r5 + 24 * 16]
21314 movu [r0 + 847 * 16], m3
21316 ; mode 15 [row 8, 8-31]
21317 pmaddubsw m5, m2, [r5 + 7 * 16]
21320 movh [r0 + 848 * 16 + 8], m5
21321 pmaddubsw m3, m1, [r5 + 7 * 16]
21323 pmaddubsw m5, m4, [r5 + 7 * 16]
21326 movu [r0 + 849 * 16], m3
21330 pinsrb m0, [r3 + 19], 1
21331 pinsrb m0, [r3 + 26], 0
21332 pmaddubsw m3, m0, [r5 + 30 * 16]
21333 pmulhrsw m3, [pw_1024]
21334 pmaddubsw m5, m2, [r5 + 30 * 16]
21335 pmulhrsw m5, [pw_1024]
21337 movu [r0 + 690 * 16], m3
21338 pmaddubsw m3, m1, [r5 + 30 * 16]
21339 pmulhrsw m3, [pw_1024]
21340 pmaddubsw m5, m4, [r5 + 30 * 16]
21341 pmulhrsw m5, [pw_1024]
21343 movu [r0 + 691 * 16], m3
21346 pmaddubsw m3, m0, [r5 + 25 * 16]
21347 pmulhrsw m3, [pw_1024]
21348 pmaddubsw m5, m2, [r5 + 25 * 16]
21349 pmulhrsw m5, [pw_1024]
21351 movu [r0 + 692 * 16], m3
21352 pmaddubsw m3, m1, [r5 + 25 * 16]
21353 pmulhrsw m3, [pw_1024]
21354 pmaddubsw m5, m4, [r5 + 25 * 16]
21355 pmulhrsw m5, [pw_1024]
21357 movu [r0 + 693 * 16], m3
21360 pmaddubsw m3, m0, [r5 + 20 * 16]
21361 pmulhrsw m3, [pw_1024]
21362 pmaddubsw m5, m2, [r5 + 20 * 16]
21363 pmulhrsw m5, [pw_1024]
21365 movu [r0 + 694 * 16], m3
21366 pmaddubsw m3, m1, [r5 + 20 * 16]
21367 pmulhrsw m3, [pw_1024]
21368 pmaddubsw m5, m4, [r5 + 20 * 16]
21369 pmulhrsw m5, [pw_1024]
21371 movu [r0 + 695 * 16], m3
21374 pmaddubsw m3, m0, [r5 + 15 * 16]
21375 pmulhrsw m3, [pw_1024]
21376 pmaddubsw m5, m2, [r5 + 15 * 16]
21377 pmulhrsw m5, [pw_1024]
21379 movu [r0 + 696 * 16], m3
21380 pmaddubsw m3, m1, [r5 + 15 * 16]
21381 pmulhrsw m3, [pw_1024]
21382 pmaddubsw m5, m4, [r5 + 15 * 16]
21383 pmulhrsw m5, [pw_1024]
21385 movu [r0 + 697 * 16], m3
21388 pmaddubsw m3, m0, [r5 + 10 * 16]
21389 pmulhrsw m3, [pw_1024]
21390 pmaddubsw m5, m2, [r5 + 10 * 16]
21391 pmulhrsw m5, [pw_1024]
21393 movu [r0 + 698 * 16], m3
21394 pmaddubsw m3, m1, [r5 + 10 * 16]
21395 pmulhrsw m3, [pw_1024]
21396 pmaddubsw m5, m4, [r5 + 10 * 16]
21397 pmulhrsw m5, [pw_1024]
21399 movu [r0 + 699 * 16], m3
21402 pmaddubsw m3, m0, [r5 + 5 * 16]
21403 pmulhrsw m3, [pw_1024]
21404 pmaddubsw m5, m2, [r5 + 5 * 16]
21405 pmulhrsw m5, [pw_1024]
21407 movu [r0 + 700 * 16], m3
21408 pmaddubsw m3, m1, [r5 + 5 * 16]
21409 pmulhrsw m3, [pw_1024]
21410 pmaddubsw m5, m4, [r5 + 5 * 16]
21411 pmulhrsw m5, [pw_1024]
21413 movu [r0 + 701 * 16], m3
21417 pinsrb m6, [r3 + 4], 6
21418 pinsrb m6, [r3 + 4], 5
21419 pinsrb m6, [r3 + 7], 4
21420 pinsrb m6, [r3 + 7], 3
21421 pinsrb m6, [r3 + 11], 2
21422 pinsrb m6, [r3 + 11], 1
21423 pinsrb m6, [r3 + 14], 0
21424 pmaddubsw m3, m6, [r5 + 25 * 16]
21425 pmulhrsw m3, [pw_1024]
21426 pmaddubsw m5, m2, [r5 + 25 * 16]
21427 pmulhrsw m5, [pw_1024]
21429 movu [r0 + 732 * 16], m3
21430 pmaddubsw m3, m1, [r5 + 25 * 16]
21431 pmulhrsw m3, [pw_1024]
21432 pmaddubsw m5, m4, [r5 + 25 * 16]
21433 pmulhrsw m5, [pw_1024]
21435 movu [r0 + 733 * 16], m3
21438 pmaddubsw m3, m6, [r5 + 16 * 16]
21439 pmulhrsw m3, [pw_1024]
21440 pmaddubsw m5, m2, [r5 + 16 * 16]
21441 pmulhrsw m5, [pw_1024]
21443 movu [r0 + 734 * 16], m3
21444 pmaddubsw m3, m1, [r5 + 16 * 16]
21445 pmulhrsw m3, [pw_1024]
21446 pmaddubsw m5, m4, [r5 + 16 * 16]
21447 pmulhrsw m5, [pw_1024]
21449 movu [r0 + 735 * 16], m3
21452 pmaddubsw m3, m6, [r5 + 7 * 16]
21453 pmulhrsw m3, [pw_1024]
21454 pmaddubsw m5, m2, [r5 + 7 * 16]
21455 pmulhrsw m5, [pw_1024]
21457 movu [r0 + 736 * 16], m3
21458 pmaddubsw m3, m1, [r5 + 7 * 16]
21459 pmulhrsw m3, [pw_1024]
21460 pmaddubsw m5, m4, [r5 + 7 * 16]
21461 pmulhrsw m5, [pw_1024]
21463 movu [r0 + 737 * 16], m3
21467 pinsrb m6, [r3 + 14], 1
21468 pinsrb m6, [r3 + 18], 0
21469 pmaddubsw m3, m6, [r5 + 30 * 16]
21470 pmulhrsw m3, [pw_1024]
21472 pinsrw m2, [r4 + 3], 0
21473 pmaddubsw m5, m2, [r5 + 30 * 16]
21474 pmulhrsw m5, [pw_1024]
21476 movu [r0 + 738 * 16], m3
21478 pinsrw m1, [r4 + 11], 0
21479 pmaddubsw m3, m1, [r5 + 30 * 16]
21480 pmulhrsw m3, [pw_1024]
21482 pinsrw m4, [r4 + 19], 0
21483 pmaddubsw m5, m4, [r5 + 30 * 16]
21484 pmulhrsw m5, [pw_1024]
21486 movu [r0 + 739 * 16], m3
21489 pmaddubsw m3, m6, [r5 + 21 * 16]
21490 pmulhrsw m3, [pw_1024]
21491 pmaddubsw m5, m2, [r5 + 21 * 16]
21492 pmulhrsw m5, [pw_1024]
21494 movu [r0 + 740 * 16], m3
21495 pmaddubsw m3, m1, [r5 + 21 * 16]
21496 pmulhrsw m3, [pw_1024]
21497 pmaddubsw m5, m4, [r5 + 21 * 16]
21498 pmulhrsw m5, [pw_1024]
21500 movu [r0 + 741 * 16], m3
21503 pmaddubsw m3, m6, [r5 + 12 * 16]
21504 pmulhrsw m3, [pw_1024]
21505 pmaddubsw m5, m2, [r5 + 12 * 16]
21506 pmulhrsw m5, [pw_1024]
21508 movu [r0 + 742 * 16], m3
21509 pmaddubsw m3, m1, [r5 + 12 * 16]
21510 pmulhrsw m3, [pw_1024]
21511 pmaddubsw m5, m4, [r5 + 12 * 16]
21512 pmulhrsw m5, [pw_1024]
21514 movu [r0 + 743 * 16], m3
21517 pmaddubsw m3, m6, [r5 + 3 * 16]
21518 pmulhrsw m3, [pw_1024]
21519 pmaddubsw m5, m2, [r5 + 3 * 16]
21520 pmulhrsw m5, [pw_1024]
21522 movu [r0 + 744 * 16], m3
21523 pmaddubsw m3, m1, [r5 + 3 * 16]
21524 pmulhrsw m3, [pw_1024]
21525 pmaddubsw m5, m4, [r5 + 3 * 16]
21526 pmulhrsw m5, [pw_1024]
21528 movu [r0 + 745 * 16], m3
21532 pinsrb m7, [r3 + 10], 1
21533 pinsrb m7, [r3 + 12], 0
21534 pmaddubsw m3, m7, [r5 + 23 * 16]
21535 pmulhrsw m3, [pw_1024]
21536 pmaddubsw m5, m2, [r5 + 23 * 16]
21537 pmulhrsw m5, [pw_1024]
21539 movu [r0 + 792 * 16], m3
21540 pmaddubsw m3, m1, [r5 + 23 * 16]
21541 pmulhrsw m3, [pw_1024]
21542 pmaddubsw m5, m4, [r5 + 23 * 16]
21543 pmulhrsw m5, [pw_1024]
21545 movu [r0 + 793 * 16], m3
21548 pmaddubsw m3, m7, [r5 + 10 * 16]
21549 pmulhrsw m3, [pw_1024]
21550 pmaddubsw m5, m2, [r5 + 10 * 16]
21551 pmulhrsw m5, [pw_1024]
21553 movu [r0 + 794 * 16], m3
21554 pmaddubsw m3, m1, [r5 + 10 * 16]
21555 pmulhrsw m3, [pw_1024]
21556 pmaddubsw m5, m4, [r5 + 10 * 16]
21557 pmulhrsw m5, [pw_1024]
21559 movu [r0 + 795 * 16], m3
21562 pmaddubsw m5, m2, [r5 + 22 * 16]
21563 pmulhrsw m5, [pw_1024]
21565 movu [r0 + 850 * 16 + 8], m5
21566 pmaddubsw m3, m1, [r5 + 22 * 16]
21567 pmulhrsw m3, [pw_1024]
21568 pmaddubsw m5, m4, [r5 + 22 * 16]
21569 pmulhrsw m5, [pw_1024]
21571 movu [r0 + 851 * 16], m3
21574 pmaddubsw m5, m2, [r5 + 5 * 16]
21575 pmulhrsw m5, [pw_1024]
21577 movu [r0 + 852 * 16 + 8], m5
21578 pmaddubsw m3, m1, [r5 + 5 * 16]
21579 pmulhrsw m3, [pw_1024]
21580 pmaddubsw m5, m4, [r5 + 5 * 16]
21581 pmulhrsw m5, [pw_1024]
21583 movu [r0 + 853 * 16], m3
21587 pinsrb m6, [r3 + 18], 1
21588 pinsrb m6, [r3 + 21], 0
21589 pmaddubsw m3, m6, [r5 + 26 * 16]
21590 pmulhrsw m3, [pw_1024]
21592 pinsrw m2, [r4 + 2], 0
21593 pmaddubsw m5, m2, [r5 + 26 * 16]
21594 pmulhrsw m5, [pw_1024]
21596 movu [r0 + 746 * 16], m3
21598 pinsrw m1, [r4 + 10], 0
21599 pmaddubsw m3, m1, [r5 + 26 * 16]
21600 pmulhrsw m3, [pw_1024]
21602 pinsrw m4, [r4 + 18], 0
21603 pmaddubsw m5, m4, [r5 + 26 * 16]
21604 pmulhrsw m5, [pw_1024]
21606 movu [r0 + 747 * 16], m3
21609 pmaddubsw m3, m6, [r5 + 17 * 16]
21610 pmulhrsw m3, [pw_1024]
21611 pmaddubsw m5, m2, [r5 + 17 * 16]
21612 pmulhrsw m5, [pw_1024]
21614 movu [r0 + 748 * 16], m3
21615 pmaddubsw m3, m1, [r5 + 17 * 16]
21616 pmulhrsw m3, [pw_1024]
21617 pmaddubsw m5, m4, [r5 + 17 * 16]
21618 pmulhrsw m5, [pw_1024]
21620 movu [r0 + 749 * 16], m3
21623 pmaddubsw m3, m6, [r5 + 8 * 16]
21624 pmulhrsw m3, [pw_1024]
21625 pmaddubsw m5, m2, [r5 + 8 * 16]
21626 pmulhrsw m5, [pw_1024]
21628 movu [r0 + 750 * 16], m3
21629 pmaddubsw m3, m1, [r5 + 8 * 16]
21630 pmulhrsw m3, [pw_1024]
21631 pmaddubsw m5, m4, [r5 + 8 * 16]
21632 pmulhrsw m5, [pw_1024]
21634 movu [r0 + 751 * 16], m3
21638 pinsrb m7, [r3 + 12], 1
21639 pinsrb m7, [r3 + 15], 0
21640 pmaddubsw m3, m7, [r5 + 29 * 16]
21641 pmulhrsw m3, [pw_1024]
21642 pmaddubsw m5, m2, [r5 + 29 * 16]
21643 pmulhrsw m5, [pw_1024]
21645 movu [r0 + 796 * 16], m3
21646 pmaddubsw m3, m1, [r5 + 29 * 16]
21647 pmulhrsw m3, [pw_1024]
21648 pmaddubsw m5, m4, [r5 + 29 * 16]
21649 pmulhrsw m5, [pw_1024]
21651 movu [r0 + 797 * 16], m3
21654 pmaddubsw m3, m7, [r5 + 16 * 16]
21655 pmulhrsw m3, [pw_1024]
21656 pmaddubsw m5, m2, [r5 + 16 * 16]
21657 pmulhrsw m5, [pw_1024]
21659 movu [r0 + 798 * 16], m3
21660 pmaddubsw m3, m1, [r5 + 16 * 16]
21661 pmulhrsw m3, [pw_1024]
21662 pmaddubsw m5, m4, [r5 + 16 * 16]
21663 pmulhrsw m5, [pw_1024]
21665 movu [r0 + 799 * 16], m3
21668 pmaddubsw m3, m7, [r5 + 3 * 16]
21669 pmulhrsw m3, [pw_1024]
21670 pmaddubsw m5, m2, [r5 + 3 * 16]
21671 pmulhrsw m5, [pw_1024]
21673 movu [r0 + 800 * 16], m3
21674 pmaddubsw m3, m1, [r5 + 3 * 16]
21675 pmulhrsw m3, [pw_1024]
21676 pmaddubsw m5, m4, [r5 + 3 * 16]
21677 pmulhrsw m5, [pw_1024]
21679 movu [r0 + 801 * 16], m3
21682 pmaddubsw m5, m2, [r5 + 20 * 16]
21683 pmulhrsw m5, [pw_1024]
21685 movh [r0 + 854 * 16 + 8], m5
21686 pmaddubsw m3, m1, [r5 + 20 * 16]
21687 pmulhrsw m3, [pw_1024]
21688 pmaddubsw m5, m4, [r5 + 20 * 16]
21689 pmulhrsw m5, [pw_1024]
21691 movu [r0 + 855 * 16], m3
21694 pmaddubsw m5, m2, [r5 + 3 * 16]
21695 pmulhrsw m5, [pw_1024]
21697 movh [r0 + 856 * 16 + 8], m5
21698 pmaddubsw m3, m1, [r5 + 3 * 16]
21699 pmulhrsw m3, [pw_1024]
21700 pmaddubsw m5, m4, [r5 + 3 * 16]
21701 pmulhrsw m5, [pw_1024]
21703 movu [r0 + 857 * 16], m3
21707 pinsrb m6, [r3 + 21], 1
21708 pinsrb m6, [r3 + 25], 0
21709 pmaddubsw m3, m6, [r5 + 31 * 16]
21710 pmulhrsw m3, [pw_1024]
21712 pinsrw m2, [r4 + 1], 0
21713 pmaddubsw m5, m2, [r5 + 31 * 16]
21714 pmulhrsw m5, [pw_1024]
21716 movu [r0 + 752 * 16], m3
21718 pinsrw m1, [r4 + 9], 0
21719 pmaddubsw m3, m1, [r5 + 31 * 16]
21720 pmulhrsw m3, [pw_1024]
21722 pinsrw m4, [r4 + 17], 0
21723 pmaddubsw m5, m4, [r5 + 31 * 16]
21724 pmulhrsw m5, [pw_1024]
21726 movu [r0 + 753 * 16], m3
21729 pmaddubsw m3, m6, [r5 + 22 * 16]
21730 pmulhrsw m3, [pw_1024]
21731 pmaddubsw m5, m2, [r5 + 22 * 16]
21732 pmulhrsw m5, [pw_1024]
21734 movu [r0 + 754 * 16], m3
21735 pmaddubsw m3, m1, [r5 + 22 * 16]
21736 pmulhrsw m3, [pw_1024]
21737 pmaddubsw m5, m4, [r5 + 22 * 16]
21738 pmulhrsw m5, [pw_1024]
21740 movu [r0 + 755 * 16], m3
21743 pmaddubsw m3, m6, [r5 + 13 * 16]
21744 pmulhrsw m3, [pw_1024]
21745 pmaddubsw m5, m2, [r5 + 13 * 16]
21746 pmulhrsw m5, [pw_1024]
21748 movu [r0 + 756 * 16], m3
21749 pmaddubsw m3, m1, [r5 + 13 * 16]
21750 pmulhrsw m3, [pw_1024]
21751 pmaddubsw m5, m4, [r5 + 13 * 16]
21752 pmulhrsw m5, [pw_1024]
21754 movu [r0 + 757 * 16], m3
21757 pmaddubsw m3, m6, [r5 + 4 * 16]
21758 pmulhrsw m3, [pw_1024]
21759 pmaddubsw m5, m2, [r5 + 4 * 16]
21760 pmulhrsw m5, [pw_1024]
21762 movu [r0 + 758 * 16], m3
21763 pmaddubsw m3, m1, [r5 + 4 * 16]
21764 pmulhrsw m3, [pw_1024]
21765 pmaddubsw m5, m4, [r5 + 4 * 16]
21766 pmulhrsw m5, [pw_1024]
21768 movu [r0 + 759 * 16], m3
21772 pinsrb m7, [r3 + 15], 1
21773 pinsrb m7, [r3 + 17], 0
21774 pmaddubsw m3, m7, [r5 + 22 * 16]
21775 pmulhrsw m3, [pw_1024]
21776 pmaddubsw m5, m2, [r5 + 22 * 16]
21777 pmulhrsw m5, [pw_1024]
21779 movu [r0 + 802 * 16], m3
21780 pmaddubsw m3, m1, [r5 + 22 * 16]
21781 pmulhrsw m3, [pw_1024]
21782 pmaddubsw m5, m4, [r5 + 22 * 16]
21783 pmulhrsw m5, [pw_1024]
21785 movu [r0 + 803 * 16], m3
21788 pmaddubsw m3, m7, [r5 + 9 * 16]
21789 pmulhrsw m3, [pw_1024]
21790 pmaddubsw m5, m2, [r5 + 9 * 16]
21791 pmulhrsw m5, [pw_1024]
21793 movu [r0 + 804 * 16], m3
21794 pmaddubsw m3, m1, [r5 + 9 * 16]
21795 pmulhrsw m3, [pw_1024]
21796 pmaddubsw m5, m4, [r5 + 9 * 16]
21797 pmulhrsw m5, [pw_1024]
21799 movu [r0 + 805 * 16], m3
21802 pmaddubsw m5, m2, [r5 + 18 * 16]
21803 pmulhrsw m5, [pw_1024]
21805 movh [r0 + 858 * 16 + 8], m5
21806 pmaddubsw m3, m1, [r5 + 18 * 16]
21807 pmulhrsw m3, [pw_1024]
21808 pmaddubsw m5, m4, [r5 + 18 * 16]
21809 pmulhrsw m5, [pw_1024]
21811 movu [r0 + 859 * 16], m3
21814 pmaddubsw m5, m2, [r5 + 1 * 16]
21815 pmulhrsw m5, [pw_1024]
21817 movh [r0 + 860 * 16 + 8], m5
21818 pmaddubsw m3, m1, [r5 + 1 * 16]
21819 pmulhrsw m3, [pw_1024]
21820 pmaddubsw m5, m4, [r5 + 1 * 16]
21821 pmulhrsw m5, [pw_1024]
21823 movu [r0 + 861 * 16], m3
21827 pinsrb m6, [r3 + 25], 1
21828 pinsrb m6, [r3 + 28], 0
21829 pmaddubsw m3, m6, [r5 + 27 * 16]
21830 pmulhrsw m3, [pw_1024]
21832 pinsrw m2, [r4 + 0], 0
21833 pmaddubsw m5, m2, [r5 + 27 * 16]
21834 pmulhrsw m5, [pw_1024]
21836 movu [r0 + 760 * 16], m3
21838 pinsrw m1, [r4 + 8], 0
21839 pmaddubsw m3, m1, [r5 + 27 * 16]
21840 pmulhrsw m3, [pw_1024]
21842 pinsrw m4, [r4 + 16], 0
21843 pmaddubsw m5, m4, [r5 + 27 * 16]
21844 pmulhrsw m5, [pw_1024]
21846 movu [r0 + 761 * 16], m3
21849 pmaddubsw m3, m6, [r5 + 18 * 16]
21850 pmulhrsw m3, [pw_1024]
21851 pmaddubsw m5, m2, [r5 + 18 * 16]
21852 pmulhrsw m5, [pw_1024]
21854 movu [r0 + 762 * 16], m3
21855 pmaddubsw m3, m1, [r5 + 18 * 16]
21856 pmulhrsw m3, [pw_1024]
21857 pmaddubsw m5, m4, [r5 + 18 * 16]
21858 pmulhrsw m5, [pw_1024]
21860 movu [r0 + 763 * 16], m3
21863 pmaddubsw m3, m6, [r5 + 9 * 16]
21864 pmulhrsw m3, [pw_1024]
21865 pmaddubsw m5, m2, [r5 + 9 * 16]
21866 pmulhrsw m5, [pw_1024]
21868 movu [r0 + 764 * 16], m3
21869 pmaddubsw m3, m1, [r5 + 9 * 16]
21870 pmulhrsw m3, [pw_1024]
21871 pmaddubsw m5, m4, [r5 + 9 * 16]
21872 pmulhrsw m5, [pw_1024]
21874 movu [r0 + 765 * 16], m3
21878 pinsrb m7, [r3 + 17], 1
21879 pinsrb m7, [r3 + 20], 0
21880 pmaddubsw m3, m7, [r5 + 28 * 16]
21881 pmulhrsw m3, [pw_1024]
21882 pmaddubsw m5, m2, [r5 + 28 * 16]
21883 pmulhrsw m5, [pw_1024]
21885 movu [r0 + 806 * 16], m3
21886 pmaddubsw m3, m1, [r5 + 28 * 16]
21887 pmulhrsw m3, [pw_1024]
21888 pmaddubsw m5, m4, [r5 + 28 * 16]
21889 pmulhrsw m5, [pw_1024]
21891 movu [r0 + 807 * 16], m3
21894 pmaddubsw m3, m7, [r5 + 15 * 16]
21895 pmulhrsw m3, [pw_1024]
21896 pmaddubsw m5, m2, [r5 + 15 * 16]
21897 pmulhrsw m5, [pw_1024]
21899 movu [r0 + 808 * 16], m3
21900 pmaddubsw m3, m1, [r5 + 15 * 16]
21901 pmulhrsw m3, [pw_1024]
21902 pmaddubsw m5, m4, [r5 + 15 * 16]
21903 pmulhrsw m5, [pw_1024]
21905 movu [r0 + 809 * 16], m3
21908 pmaddubsw m3, m7, [r5 + 2 * 16]
21909 pmulhrsw m3, [pw_1024]
21910 pmaddubsw m5, m2, [r5 + 2 * 16]
21911 pmulhrsw m5, [pw_1024]
21913 movu [r0 + 810 * 16], m3
21914 pmaddubsw m3, m1, [r5 + 2 * 16]
21915 pmulhrsw m3, [pw_1024]
21916 pmaddubsw m5, m4, [r5 + 2 * 16]
21917 pmulhrsw m5, [pw_1024]
21919 movu [r0 + 811 * 16], m3
21922 pmaddubsw m5, m2, [r5 + 16 * 16]
21923 pmulhrsw m5, [pw_1024]
21925 movh [r0 + 862 * 16 + 8], m5
21926 pmaddubsw m3, m1, [r5 + 16 * 16]
21927 pmulhrsw m3, [pw_1024]
21928 pmaddubsw m5, m4, [r5 + 16 * 16]
21929 pmulhrsw m5, [pw_1024]
21931 movu [r0 + 863 * 16], m3
21935 pinsrb m7, [r3 + 20], 1
21936 pinsrb m7, [r3 + 22], 0
21937 pmaddubsw m3, m7, [r5 + 21 * 16]
21938 pmulhrsw m3, [pw_1024]
21940 pinsrb m2, [r4 + 0], 1
21941 pinsrb m2, [r3 + 2], 0
21942 pmaddubsw m5, m2, [r5 + 21 * 16]
21943 pmulhrsw m5, [pw_1024]
21945 movu [r0 + 812 * 16], m3
21947 pinsrw m1, [r4 + 7], 0
21948 pmaddubsw m3, m1, [r5 + 21 * 16]
21949 pmulhrsw m3, [pw_1024]
21951 pinsrw m4, [r4 + 15], 0
21952 pmaddubsw m5, m4, [r5 + 21 * 16]
21953 pmulhrsw m5, [pw_1024]
21955 movu [r0 + 813 * 16], m3
21958 pmaddubsw m3, m7, [r5 + 8 * 16]
21959 pmulhrsw m3, [pw_1024]
21960 pmaddubsw m5, m2, [r5 + 8 * 16]
21961 pmulhrsw m5, [pw_1024]
21963 movu [r0 + 814 * 16], m3
21964 pmaddubsw m3, m1, [r5 + 8 * 16]
21965 pmulhrsw m3, [pw_1024]
21966 pmaddubsw m5, m4, [r5 + 8 * 16]
21967 pmulhrsw m5, [pw_1024]
21969 movu [r0 + 815 * 16], m3
21972 pmaddubsw m5, m2, [r5 + 31 * 16]
21973 pmulhrsw m5, [pw_1024]
21975 movh [r0 + 864 * 16 + 8], m5
21976 pmaddubsw m3, m1, [r5 + 31 * 16]
21977 pmulhrsw m3, [pw_1024]
21978 pmaddubsw m5, m4, [r5 + 31 * 16]
21979 pmulhrsw m5, [pw_1024]
21981 movu [r0 + 865 * 16], m3
21984 pmaddubsw m5, m2, [r5 + 14 * 16]
21985 pmulhrsw m5, [pw_1024]
21987 movh [r0 + 866 * 16 + 8], m5
21988 pmaddubsw m3, m1, [r5 + 14 * 16]
21989 pmulhrsw m3, [pw_1024]
21990 pmaddubsw m5, m4, [r5 + 14 * 16]
21991 pmulhrsw m5, [pw_1024]
21993 movu [r0 + 867 * 16], m3
21997 pinsrb m7, [r3 + 22], 1
21998 pinsrb m7, [r3 + 25], 0
21999 pmaddubsw m3, m7, [r5 + 27 * 16]
22000 pmulhrsw m3, [pw_1024]
22002 pinsrb m2, [r3 + 2], 1
22003 pinsrb m2, [r3 + 5], 0
22004 pmaddubsw m5, m2, [r5 + 27 * 16]
22005 pmulhrsw m5, [pw_1024]
22007 movu [r0 + 816 * 16], m3
22009 pinsrw m1, [r4 + 6], 0
22010 pmaddubsw m3, m1, [r5 + 27 * 16]
22011 pmulhrsw m3, [pw_1024]
22013 pinsrw m4, [r4 + 14], 0
22014 pmaddubsw m5, m4, [r5 + 27 * 16]
22015 pmulhrsw m5, [pw_1024]
22017 movu [r0 + 817 * 16], m3
22020 pmaddubsw m3, m7, [r5 + 14 * 16]
22021 pmulhrsw m3, [pw_1024]
22022 pmaddubsw m5, m2, [r5 + 14 * 16]
22023 pmulhrsw m5, [pw_1024]
22025 movu [r0 + 818 * 16], m3
22026 pmaddubsw m3, m1, [r5 + 14 * 16]
22027 pmulhrsw m3, [pw_1024]
22028 pmaddubsw m5, m4, [r5 + 14 * 16]
22029 pmulhrsw m5, [pw_1024]
22031 movu [r0 + 819 * 16], m3
22034 pmaddubsw m3, m7, [r5 + 1 * 16]
22035 pmulhrsw m3, [pw_1024]
22036 pmaddubsw m5, m2, [r5 + 1 * 16]
22037 pmulhrsw m5, [pw_1024]
22039 movu [r0 + 820 * 16], m3
22040 pmaddubsw m3, m1, [r5 + 1 * 16]
22041 pmulhrsw m3, [pw_1024]
22042 pmaddubsw m5, m4, [r5 + 1 * 16]
22043 pmulhrsw m5, [pw_1024]
22045 movu [r0 + 821 * 16], m3
22048 pinsrb m2, [r3 + 4], 0
22049 pmaddubsw m5, m2, [r5 + 29 * 16]
22050 pmulhrsw m5, [pw_1024]
22052 movh [r0 + 868 * 16 + 8], m5
22053 pmaddubsw m3, m1, [r5 + 29 * 16]
22054 pmulhrsw m3, [pw_1024]
22055 pmaddubsw m5, m4, [r5 + 29 * 16]
22056 pmulhrsw m5, [pw_1024]
22058 movu [r0 + 869 * 16], m3
22061 pmaddubsw m5, m2, [r5 + 12 * 16]
22062 pmulhrsw m5, [pw_1024]
22064 movh [r0 + 870 * 16 + 8], m5
22065 pmaddubsw m3, m1, [r5 + 12 * 16]
22066 pmulhrsw m3, [pw_1024]
22067 pmaddubsw m5, m4, [r5 + 12 * 16]
22068 pmulhrsw m5, [pw_1024]
22070 movu [r0 + 871 * 16], m3
22072 ; mode 15 [row 20 - 8 to 15]
22074 pinsrb m3, [r3 + 4], 1
22075 pinsrb m3, [r3 + 6], 0
22076 pmaddubsw m5, m3, [r5 + 27 * 16]
22077 pmulhrsw m5, [pw_1024]
22079 movh [r0 + 872 * 16 + 8], m5
22081 ; mode 15 [row 21 - 8 to 15]
22082 pmaddubsw m5, m3, [r5 + 10 * 16]
22083 pmulhrsw m5, [pw_1024]
22085 movh [r0 + 874 * 16 + 8], m5
22087 ; mode 15 [row 22 - 8 to 15]
22089 pinsrb m3, [r3 + 6], 1
22090 pinsrb m3, [r3 + 8], 0
22091 pmaddubsw m5, m3, [r5 + 25 * 16]
22092 pmulhrsw m5, [pw_1024]
22094 movh [r0 + 876 * 16 + 8], m5
22096 ; mode 15 [row 23 - 8 to 15]
22097 pmaddubsw m5, m3, [r5 + 8 * 16]
22098 pmulhrsw m5, [pw_1024]
22100 movh [r0 + 878 * 16 + 8], m5
22102 ; mode 15 [row 24 - 8 to 15]
22104 pinsrb m3, [r3 + 8], 1
22105 pinsrb m3, [r3 + 9], 0
22106 pmaddubsw m5, m3, [r5 + 23 * 16]
22107 pmulhrsw m5, [pw_1024]
22109 movh [r0 + 880 * 16 + 8], m5
22111 ; mode 15 [row 25 - 8 to 15]
22112 pmaddubsw m5, m3, [r5 + 6 * 16]
22113 pmulhrsw m5, [pw_1024]
22115 movh [r0 + 882 * 16 + 8], m5
22117 ; mode 15 [row 26 - 8 to 15]
22119 pinsrb m3, [r3 + 9], 1
22120 pinsrb m3, [r3 + 11], 0
22121 pmaddubsw m5, m3, [r5 + 21 * 16]
22122 pmulhrsw m5, [pw_1024]
22124 movh [r0 + 884 * 16 + 8], m5
22126 ; mode 15 [row 27 - 8 to 15]
22127 pmaddubsw m5, m3, [r5 + 4 * 16]
22128 pmulhrsw m5, [pw_1024]
22130 movh [r0 + 886 * 16 + 8], m5
22132 ; mode 15 [row 28 - 8 to 15]
22134 pinsrb m3, [r3 + 11], 1
22135 pinsrb m3, [r3 + 13], 0
22136 pmaddubsw m5, m3, [r5 + 19 * 16]
22137 pmulhrsw m5, [pw_1024]
22139 movh [r0 + 888 * 16 + 8], m5
22141 ; mode 15 [row 29 - 8 to 15]
22142 pmaddubsw m5, m3, [r5 + 2 * 16]
22143 pmulhrsw m5, [pw_1024]
22145 movh [r0 + 890 * 16 + 8], m5
22147 ; mode 15 [row 30 - 8 to 15]
22149 pinsrb m3, [r3 + 13], 1
22150 pinsrb m3, [r3 + 15], 0
22151 pmaddubsw m5, m3, [r5 + 17 * 16]
22152 pmulhrsw m5, [pw_1024]
22154 movh [r0 + 892 * 16 + 8], m5
22156 ; mode 15 [row 31 - 8 to 15]
22157 pshufb m5, m3, [tab_S2]
22158 movh [r0 + 894 * 16 + 8], m5
22161 pinsrb m2, [r3 + 5], 0
22163 pinsrb m7, [r3 + 25], 1
22164 pinsrb m7, [r3 + 27], 0
22165 pmaddubsw m3, m7, [r5 + 20 * 16]
22166 pmulhrsw m3, [pw_1024]
22168 pinsrb m2, [r3 + 5], 1
22169 pinsrb m2, [r3 + 7], 0
22170 pmaddubsw m5, m2, [r5 + 20 * 16]
22171 pmulhrsw m5, [pw_1024]
22173 movu [r0 + 822 * 16], m3
22175 pinsrw m1, [r4 + 5], 0
22176 pmaddubsw m3, m1, [r5 + 20 * 16]
22177 pmulhrsw m3, [pw_1024]
22179 pinsrw m4, [r4 + 13], 0
22180 pmaddubsw m5, m4, [r5 + 20 * 16]
22181 pmulhrsw m5, [pw_1024]
22183 movu [r0 + 823 * 16], m3
22185 ; mode 15 [row 20 - 16 to 31]
22186 pmaddubsw m3, m1, [r5 + 27 * 16]
22187 pmulhrsw m3, [pw_1024]
22188 pmaddubsw m5, m4, [r5 + 27 * 16]
22189 pmulhrsw m5, [pw_1024]
22191 movu [r0 + 873 * 16], m3
22193 ; mode 15 [row 21 - 16 to 31]
22194 pmaddubsw m3, m1, [r5 + 10 * 16]
22195 pmulhrsw m3, [pw_1024]
22196 pmaddubsw m5, m4, [r5 + 10 * 16]
22197 pmulhrsw m5, [pw_1024]
22199 movu [r0 + 875 * 16], m3
22202 pmaddubsw m3, m7, [r5 + 7 * 16]
22203 pmulhrsw m3, [pw_1024]
22204 pmaddubsw m5, m2, [r5 + 7 * 16]
22205 pmulhrsw m5, [pw_1024]
22207 movu [r0 + 824 * 16], m3
22208 pmaddubsw m3, m1, [r5 + 7 * 16]
22209 pmulhrsw m3, [pw_1024]
22210 pmaddubsw m5, m4, [r5 + 7 * 16]
22211 pmulhrsw m5, [pw_1024]
22213 movu [r0 + 825 * 16], m3
22217 pinsrb m7, [r3 + 27], 1
22218 pinsrb m7, [r3 + 30], 0
22219 pmaddubsw m3, m7, [r5 + 26 * 16]
22220 pmulhrsw m3, [pw_1024]
22222 pinsrb m2, [r3 + 7], 1
22223 pinsrb m2, [r3 + 10], 0
22224 pmaddubsw m5, m2, [r5 + 26 * 16]
22225 pmulhrsw m5, [pw_1024]
22227 movu [r0 + 826 * 16], m3
22229 pinsrw m1, [r4 + 4], 0
22230 pmaddubsw m3, m1, [r5 + 26 * 16]
22231 pmulhrsw m3, [pw_1024]
22233 pinsrw m4, [r4 + 12], 0
22234 pmaddubsw m5, m4, [r5 + 26 * 16]
22235 pmulhrsw m5, [pw_1024]
22237 movu [r0 + 827 * 16], m3
22240 pmaddubsw m3, m7, [r5 + 13 * 16]
22241 pmulhrsw m3, [pw_1024]
22242 pmaddubsw m5, m2, [r5 + 13 * 16]
22243 pmulhrsw m5, [pw_1024]
22245 movu [r0 + 828 * 16], m3
22246 pmaddubsw m3, m1, [r5 + 13 * 16]
22247 pmulhrsw m3, [pw_1024]
22248 pmaddubsw m5, m4, [r5 + 13 * 16]
22249 pmulhrsw m5, [pw_1024]
22251 movu [r0 + 829 * 16], m3
22254 pmaddubsw m3, m1, [r5 + 25 * 16]
22255 pmulhrsw m3, [pw_1024]
22256 pmaddubsw m5, m4, [r5 + 25 * 16]
22257 pmulhrsw m5, [pw_1024]
22259 movu [r0 + 877 * 16], m3
22262 pmaddubsw m3, m1, [r5 + 8 * 16]
22263 pmulhrsw m3, [pw_1024]
22264 pmaddubsw m5, m4, [r5 + 8 * 16]
22265 pmulhrsw m5, [pw_1024]
22267 movu [r0 + 879 * 16], m3
22270 pshufb m3, m7, [tab_S2]
22271 movh [r0 + 830 * 16], m3
22272 pshufb m3, m2, [tab_S2]
22273 movh [r0 + 830 * 16 + 8], m3
22274 pshufb m3, m1, [tab_S2]
22275 movh [r0 + 831 * 16], m3
22276 pshufb m3, m4, [tab_S2]
22277 movh [r0 + 831 * 16 + 8], m3
22280 pshufb m0, m6, [tab_S2]
22281 movh [r0 + 766 * 16], m0
22283 movh [r0 + 766 * 16 + 8], m0
22285 movu [r0 + 767 * 16], m0
22289 pinsrw m1, [r4 + 3], 0
22290 pmaddubsw m3, m1, [r5 + 23 * 16]
22291 pmulhrsw m3, [pw_1024]
22293 pinsrw m4, [r4 + 11], 0
22294 pmaddubsw m5, m4, [r5 + 23 * 16]
22295 pmulhrsw m5, [pw_1024]
22297 movu [r0 + 881 * 16], m3
22300 pmaddubsw m3, m1, [r5 + 6 * 16]
22301 pmulhrsw m3, [pw_1024]
22302 pmaddubsw m5, m4, [r5 + 6 * 16]
22303 pmulhrsw m5, [pw_1024]
22305 movu [r0 + 883 * 16], m3
22309 pinsrw m1, [r4 + 2], 0
22310 pmaddubsw m3, m1, [r5 + 21 * 16]
22311 pmulhrsw m3, [pw_1024]
22313 pinsrw m4, [r4 + 10], 0
22314 pmaddubsw m5, m4, [r5 + 21 * 16]
22315 pmulhrsw m5, [pw_1024]
22317 movu [r0 + 885 * 16], m3
22320 pmaddubsw m3, m1, [r5 + 4 * 16]
22321 pmulhrsw m3, [pw_1024]
22322 pmaddubsw m5, m4, [r5 + 4 * 16]
22323 pmulhrsw m5, [pw_1024]
22325 movu [r0 + 887 * 16], m3
22329 pinsrw m1, [r4 + 1], 0
22330 pmaddubsw m3, m1, [r5 + 19 * 16]
22331 pmulhrsw m3, [pw_1024]
22333 pinsrw m4, [r4 + 9], 0
22334 pmaddubsw m5, m4, [r5 + 19 * 16]
22335 pmulhrsw m5, [pw_1024]
22337 movu [r0 + 889 * 16], m3
22340 pmaddubsw m3, m1, [r5 + 2 * 16]
22341 pmulhrsw m3, [pw_1024]
22342 pmaddubsw m5, m4, [r5 + 2 * 16]
22343 pmulhrsw m5, [pw_1024]
22345 movu [r0 + 891 * 16], m3
22349 pinsrw m1, [r4 + 0], 0
22350 pmaddubsw m3, m1, [r5 + 17 * 16]
22351 pmulhrsw m3, [pw_1024]
22353 pinsrw m4, [r4 + 8], 0
22354 pmaddubsw m5, m4, [r5 + 17 * 16]
22355 pmulhrsw m5, [pw_1024]
22357 movu [r0 + 893 * 16], m3
22360 pshufb m5, m1, [tab_S2]
22361 movh [r0 + 895 * 16], m5
22362 pshufb m5, m4, [tab_S2]
22363 movh [r0 + 895 * 16 + 8], m5
22366 movu m6, [r5 + 11 * 16]
22371 pmaddubsw m1, m0, m6
22376 pmaddubsw m3, m2, m6
22379 movu [r0 + 896 * 16], m1
22384 pmaddubsw m3, m1, m6
22389 pmaddubsw m5, m4, m6
22392 movu [r0 + 897 * 16], m3
22395 movu m6, [r5 + 22 * 16]
22398 pinsrb m0, [r3 + 2], 0
22399 pmaddubsw m3, m0, m6
22402 pinsrw m2, [r4 + 7], 0
22403 pmaddubsw m5, m2, m6
22406 movu [r0 + 898 * 16], m3
22409 pinsrw m1, [r4 + 15], 0
22410 pmaddubsw m3, m1, m6
22413 pinsrw m4, [r4 + 23], 0
22414 pmaddubsw m5, m4, m6
22417 movu [r0 + 899 * 16], m3
22420 movu m6, [r5 + 1 * 16]
22421 pmaddubsw m3, m0, m6
22423 pmaddubsw m5, m2, m6
22426 movu [r0 + 900 * 16], m3
22428 pmaddubsw m3, m1, m6
22430 pmaddubsw m5, m4, m6
22433 movu [r0 + 901 * 16], m3
22436 movu m6, [r5 + 12 * 16]
22438 pinsrb m0, [r3 + 2], 1
22439 pinsrb m0, [r3 + 3], 0
22440 pmaddubsw m3, m0, m6
22443 pinsrw m2, [r4 + 6], 0
22444 pmaddubsw m5, m2, m6
22447 movu [r0 + 902 * 16], m3
22450 pinsrw m1, [r4 + 14], 0
22451 pmaddubsw m3, m1, m6
22454 pinsrw m4, [r4 + 22], 0
22455 pmaddubsw m5, m4, m6
22458 movu [r0 + 903 * 16], m3
22461 movu m6, [r5 + 23 * 16]
22463 pinsrb m0, [r3 + 3], 1
22464 pinsrb m0, [r3 + 5], 0
22465 pmaddubsw m3, m0, m6
22468 pinsrw m2, [r4 + 5], 0
22469 pmaddubsw m5, m2, m6
22472 movu [r0 + 904 * 16], m3
22475 pinsrw m1, [r4 + 13], 0
22476 pmaddubsw m3, m1, m6
22479 pinsrw m4, [r4 + 21], 0
22480 pmaddubsw m5, m4, m6
22483 movu [r0 + 905 * 16], m3
22486 movu m6, [r5 + 2 * 16]
22487 pmaddubsw m3, m0, m6
22489 pmaddubsw m5, m2, m6
22492 movu [r0 + 906 * 16], m3
22494 pmaddubsw m3, m1, m6
22496 pmaddubsw m5, m4, m6
22499 movu [r0 + 907 * 16], m3
22502 movu m6, [r5 + 13 * 16]
22504 pinsrb m0, [r3 + 5], 1
22505 pinsrb m0, [r3 + 6], 0
22506 pmaddubsw m3, m0, m6
22509 pinsrb m2, [r4 + 5], 1
22510 pinsrb m2, [r4 + 4], 0
22511 pmaddubsw m5, m2, m6
22514 movu [r0 + 908 * 16], m3
22516 pinsrw m1, [r4 + 12], 0
22517 pmaddubsw m3, m1, m6
22520 pinsrw m4, [r4 + 20], 0
22521 pmaddubsw m5, m4, m6
22524 movu [r0 + 909 * 16], m3
22527 movu m6, [r5 + 24 * 16]
22529 pinsrb m0, [r3 + 6], 1
22530 pinsrb m0, [r3 + 8], 0
22531 pmaddubsw m3, m0, m6
22534 pinsrw m2, [r4 + 3], 0
22535 pmaddubsw m5, m2, m6
22538 movu [r0 + 910 * 16], m3
22541 pinsrw m1, [r4 + 11], 0
22542 pmaddubsw m3, m1, m6
22545 pinsrw m4, [r4 + 19], 0
22546 pmaddubsw m5, m4, m6
22549 movu [r0 + 911 * 16], m3
22552 movu m6, [r5 + 3 * 16]
22553 pmaddubsw m3, m0, m6
22555 pmaddubsw m5, m2, m6
22558 movu [r0 + 912 * 16], m3
22560 pmaddubsw m3, m1, m6
22562 pmaddubsw m5, m4, m6
22565 movu [r0 + 913 * 16], m3
22568 movu m6, [r5 + 14 * 16]
22570 pinsrb m0, [r3 + 8], 1
22571 pinsrb m0, [r3 + 9], 0
22572 pmaddubsw m3, m0, m6
22575 pinsrw m2, [r4 + 2], 0
22576 pmaddubsw m5, m2, m6
22579 movu [r0 + 914 * 16], m3
22582 pinsrw m1, [r4 + 10], 0
22583 pmaddubsw m3, m1, m6
22586 pinsrw m4, [r4 + 18], 0
22587 pmaddubsw m5, m4, m6
22590 movu [r0 + 915 * 16], m3
22593 movu m6, [r5 + 25 * 16]
22595 pinsrb m0, [r3 + 9], 1
22596 pinsrb m0, [r3 + 11], 0
22597 pmaddubsw m3, m0, m6
22600 pinsrw m2, [r4 + 1], 0
22601 pmaddubsw m5, m2, m6
22604 movu [r0 + 916 * 16], m3
22607 pinsrw m1, [r4 + 9], 0
22608 pmaddubsw m3, m1, m6
22611 pinsrb m4, [r4 + 18], 1
22612 pinsrb m4, [r4 + 17], 0
22613 pmaddubsw m5, m4, m6
22616 movu [r0 + 917 * 16], m3
22619 movu m6, [r5 + 4 * 16]
22620 pmaddubsw m3, m0, m6
22622 pmaddubsw m5, m2, m6
22625 movu [r0 + 918 * 16], m3
22627 pmaddubsw m3, m1, m6
22629 pmaddubsw m5, m4, m6
22632 movu [r0 + 919 * 16], m3
22635 movu m6, [r5 + 15 * 16]
22637 pinsrb m0, [r3 + 11], 1
22638 pinsrb m0, [r3 + 12], 0
22639 pmaddubsw m3, m0, m6
22642 pinsrw m2, [r4 + 0], 0
22643 pmaddubsw m5, m2, m6
22646 movu [r0 + 920 * 16], m3
22649 pinsrw m1, [r4 + 8], 0
22650 pmaddubsw m3, m1, m6
22653 pinsrw m4, [r4 + 16], 0
22654 pmaddubsw m5, m4, m6
22657 movu [r0 + 921 * 16], m3
22660 movu m6, [r5 + 26 * 16]
22662 pinsrb m0, [r3 + 12], 1
22663 pinsrb m0, [r3 + 14], 0
22664 pmaddubsw m3, m0, m6
22667 pinsrb m2, [r4 + 0], 1
22668 pinsrb m2, [r3 + 2], 0
22669 pmaddubsw m5, m2, m6
22672 movu [r0 + 922 * 16], m3
22675 pinsrw m1, [r4 + 7], 0
22676 pmaddubsw m3, m1, m6
22679 pinsrw m4, [r4 + 15], 0
22680 pmaddubsw m5, m4, m6
22683 movu [r0 + 923 * 16], m3
22686 movu m6, [r5 + 5 * 16]
22687 pmaddubsw m3, m0, m6
22689 pmaddubsw m5, m2, m6
22692 movu [r0 + 924 * 16], m3
22694 pmaddubsw m3, m1, m6
22696 pmaddubsw m5, m4, m6
22699 movu [r0 + 925 * 16], m3
22702 movu m6, [r5 + 16 * 16]
22704 pinsrb m0, [r3 + 14], 1
22705 pinsrb m0, [r3 + 15], 0
22706 pmaddubsw m3, m0, m6
22709 pinsrb m2, [r3 + 2], 1
22710 pinsrb m2, [r3 + 3], 0
22711 pmaddubsw m5, m2, m6
22714 movu [r0 + 926 * 16], m3
22717 pinsrw m1, [r4 + 6], 0
22718 pmaddubsw m3, m1, m6
22721 pinsrw m4, [r4 + 14], 0
22722 pmaddubsw m5, m4, m6
22725 movu [r0 + 927 * 16], m3
22728 movu m6, [r5 + 27 * 16]
22730 pinsrb m0, [r3 + 15], 1
22731 pinsrb m0, [r3 + 17], 0
22732 pmaddubsw m3, m0, m6
22735 pinsrb m2, [r3 + 3], 1
22736 pinsrb m2, [r3 + 5], 0
22737 pmaddubsw m5, m2, m6
22740 movu [r0 + 928 * 16], m3
22743 pinsrw m1, [r4 + 5], 0
22744 pmaddubsw m3, m1, m6
22747 pinsrw m4, [r4 + 13], 0
22748 pmaddubsw m5, m4, m6
22751 movu [r0 + 929 * 16], m3
22754 movu m6, [r5 + 6 * 16]
22755 pmaddubsw m3, m0, m6
22757 pmaddubsw m5, m2, m6
22760 movu [r0 + 930 * 16], m3
22762 pmaddubsw m3, m1, m6
22764 pmaddubsw m5, m4, m6
22767 movu [r0 + 931 * 16], m3
22770 movu m6, [r5 + 17 * 16]
22772 pinsrb m0, [r3 + 17], 1
22773 pinsrb m0, [r3 + 18], 0
22774 pmaddubsw m3, m0, m6
22777 pinsrb m2, [r3 + 5], 1
22778 pinsrb m2, [r3 + 6], 0
22779 pmaddubsw m5, m2, m6
22782 movu [r0 + 932 * 16], m3
22785 pinsrw m1, [r4 + 4], 0
22786 pmaddubsw m3, m1, m6
22789 pinsrw m4, [r4 + 12], 0
22790 pmaddubsw m5, m4, m6
22793 movu [r0 + 933 * 16], m3
22796 movu m6, [r5 + 28 * 16]
22798 pinsrb m0, [r3 + 18], 1
22799 pinsrb m0, [r3 + 20], 0
22800 pmaddubsw m3, m0, m6
22803 pinsrb m2, [r3 + 6], 1
22804 pinsrb m2, [r3 + 8], 0
22805 pmaddubsw m5, m2, m6
22808 movu [r0 + 934 * 16], m3
22811 pinsrw m1, [r4 + 3], 0
22812 pmaddubsw m3, m1, m6
22815 pinsrw m4, [r4 + 11], 0
22816 pmaddubsw m5, m4, m6
22819 movu [r0 + 935 * 16], m3
22822 movu m6, [r5 + 7 * 16]
22823 pmaddubsw m3, m0, m6
22825 pmaddubsw m5, m2, m6
22828 movu [r0 + 936 * 16], m3
22830 pmaddubsw m3, m1, m6
22832 pmaddubsw m5, m4, m6
22835 movu [r0 + 937 * 16], m3
22838 movu m6, [r5 + 18 * 16]
22840 pinsrb m0, [r3 + 20], 1
22841 pinsrb m0, [r3 + 21], 0
22842 pmaddubsw m3, m0, m6
22845 pinsrb m2, [r3 + 8], 1
22846 pinsrb m2, [r3 + 9], 0
22847 pmaddubsw m5, m2, m6
22850 movu [r0 + 938 * 16], m3
22853 pinsrw m1, [r4 + 2], 0
22854 pmaddubsw m3, m1, m6
22857 pinsrw m4, [r4 + 10], 0
22858 pmaddubsw m5, m4, m6
22861 movu [r0 + 939 * 16], m3
22864 movu m6, [r5 + 29 * 16]
22866 pinsrb m0, [r3 + 21], 1
22867 pinsrb m0, [r3 + 23], 0
22868 pmaddubsw m3, m0, m6
22871 pinsrb m2, [r3 + 9], 1
22872 pinsrb m2, [r3 + 11], 0
22873 pmaddubsw m5, m2, m6
22876 movu [r0 + 940 * 16], m3
22879 pinsrw m1, [r4 + 1], 0
22880 pmaddubsw m3, m1, m6
22883 pinsrw m4, [r4 + 9], 0
22884 pmaddubsw m5, m4, m6
22887 movu [r0 + 941 * 16], m3
22890 movu m6, [r5 + 8 * 16]
22891 pmaddubsw m3, m0, m6
22893 pmaddubsw m5, m2, m6
22896 movu [r0 + 942 * 16], m3
22898 pmaddubsw m3, m1, m6
22900 pmaddubsw m5, m4, m6
22903 movu [r0 + 943 * 16], m3
22906 movu m6, [r5 + 19 * 16]
22908 pinsrb m0, [r3 + 23], 1
22909 pinsrb m0, [r3 + 24], 0
22910 pmaddubsw m3, m0, m6
22913 pinsrb m2, [r3 + 11], 1
22914 pinsrb m2, [r3 + 12], 0
22915 pmaddubsw m5, m2, m6
22918 movu [r0 + 944 * 16], m3
22921 pinsrw m1, [r4 + 0], 0
22922 pmaddubsw m3, m1, m6
22925 pinsrw m4, [r4 + 8], 0
22926 pmaddubsw m5, m4, m6
22929 movu [r0 + 945 * 16], m3
22932 movu m6, [r5 + 30 * 16]
22934 pinsrb m0, [r3 + 24], 1
22935 pinsrb m0, [r3 + 26], 0
22936 pmaddubsw m3, m0, m6
22939 pinsrb m2, [r3 + 12], 1
22940 pinsrb m2, [r3 + 14], 0
22941 pmaddubsw m5, m2, m6
22944 movu [r0 + 946 * 16], m3
22947 pinsrb m1, [r4 + 0], 1
22948 pinsrb m1, [r3 + 2], 0
22949 pmaddubsw m3, m1, m6
22952 pinsrw m4, [r4 + 7], 0
22953 pmaddubsw m5, m4, m6
22956 movu [r0 + 947 * 16], m3
22959 movu m6, [r5 + 9 * 16]
22960 pmaddubsw m3, m0, m6
22962 pmaddubsw m5, m2, m6
22965 movu [r0 + 948 * 16], m3
22967 pmaddubsw m3, m1, m6
22969 pmaddubsw m5, m4, m6
22972 movu [r0 + 949 * 16], m3
22975 movu m6, [r5 + 20 * 16]
22977 pinsrb m0, [r3 + 26], 1
22978 pinsrb m0, [r3 + 27], 0
22979 pmaddubsw m3, m0, m6
22982 pinsrb m2, [r3 + 14], 1
22983 pinsrb m2, [r3 + 15], 0
22984 pmaddubsw m5, m2, m6
22987 movu [r0 + 950 * 16], m3
22990 pinsrb m1, [r3 + 2], 1
22991 pinsrb m1, [r3 + 3], 0
22992 pmaddubsw m3, m1, m6
22995 pinsrw m4, [r4 + 6], 0
22996 pmaddubsw m5, m4, m6
22999 movu [r0 + 951 * 16], m3
23002 movu m6, [r5 + 31 * 16]
23004 pinsrb m0, [r3 + 27], 1
23005 pinsrb m0, [r3 + 29], 0
23006 pmaddubsw m3, m0, m6
23009 pinsrb m2, [r3 + 15], 1
23010 pinsrb m2, [r3 + 17], 0
23011 pmaddubsw m5, m2, m6
23014 movu [r0 + 952 * 16], m3
23017 pinsrb m1, [r3 + 3], 1
23018 pinsrb m1, [r3 + 5], 0
23019 pmaddubsw m3, m1, m6
23022 pinsrw m4, [r4 + 5], 0
23023 pmaddubsw m5, m4, m6
23026 movu [r0 + 953 * 16], m3
23029 movu m6, [r5 + 10 * 16]
23030 pmaddubsw m3, m0, m6
23032 pmaddubsw m5, m2, m6
23035 movu [r0 + 954 * 16], m3
23037 pmaddubsw m3, m1, m6
23039 pmaddubsw m5, m4, m6
23042 movu [r0 + 955 * 16], m3
23045 movu m6, [r5 + 21 * 16]
23047 pinsrb m0, [r3 + 29], 1
23048 pinsrb m0, [r3 + 30], 0
23049 pmaddubsw m3, m0, m6
23052 pinsrb m2, [r3 + 17], 1
23053 pinsrb m2, [r3 + 18], 0
23054 pmaddubsw m5, m2, m6
23057 movu [r0 + 956 * 16], m3
23060 pinsrb m1, [r3 + 5], 1
23061 pinsrb m1, [r3 + 6], 0
23062 pmaddubsw m3, m1, m6
23065 pinsrw m4, [r4 + 4], 0
23066 pmaddubsw m5, m4, m6
23069 movu [r0 + 957 * 16], m3
23072 pshufb m5, m0, [tab_S2]
23073 movh [r0 + 958 * 16], m5
23074 pshufb m5, m2, [tab_S2]
23075 movh [r0 + 958 * 16 + 8], m5
23076 pshufb m5, m1, [tab_S2]
23077 movh [r0 + 959 * 16], m5
23078 pshufb m5, m4, [tab_S2]
23079 movh [r0 + 959 * 16 + 8], m5
23082 movu m6, [r5 + 6 * 16]
23087 pmaddubsw m1, m0, m6
23092 pmaddubsw m3, m2, m6
23095 movu [r0 + 960 * 16], m1
23100 pmaddubsw m3, m1, m6
23105 pmaddubsw m5, m4, m6
23108 movu [r0 + 961 * 16], m3
23111 movu m6, [r5 + 12 * 16]
23113 pinsrb m0, [r3 + 0], 1
23114 pinsrb m0, [r3 + 1], 0
23115 pmaddubsw m3, m0, m6
23118 pinsrw m2, [r4 + 7], 0
23119 pmaddubsw m5, m2, m6
23122 movu [r0 + 962 * 16], m3
23125 pinsrw m1, [r4 + 15], 0
23126 pmaddubsw m3, m1, m6
23129 pinsrw m4, [r4 + 23], 0
23130 pmaddubsw m5, m4, m6
23133 movu [r0 + 963 * 16], m3
23136 movu m6, [r5 + 18 * 16]
23138 pinsrb m0, [r3 + 1], 1
23139 pinsrb m0, [r3 + 2], 0
23140 pmaddubsw m3, m0, m6
23143 pinsrw m2, [r4 + 6], 0
23144 pmaddubsw m5, m2, m6
23147 movu [r0 + 964 * 16], m3
23150 pinsrw m1, [r4 + 14], 0
23151 pmaddubsw m3, m1, m6
23154 pinsrw m4, [r4 + 22], 0
23155 pmaddubsw m5, m4, m6
23158 movu [r0 + 965 * 16], m3
23161 movu m6, [r5 + 24 * 16]
23163 pinsrb m0, [r3 + 2], 1
23164 pinsrb m0, [r3 + 4], 0
23165 pmaddubsw m3, m0, m6
23168 pinsrw m2, [r4 + 5], 0
23169 pmaddubsw m5, m2, m6
23172 movu [r0 + 966 * 16], m3
23175 pinsrw m1, [r4 + 13], 0
23176 pmaddubsw m3, m1, m6
23179 pinsrw m4, [r4 + 21], 0
23180 pmaddubsw m5, m4, m6
23183 movu [r0 + 967 * 16], m3
23186 movu m6, [r5 + 30 * 16]
23188 pinsrb m0, [r3 + 4], 1
23189 pinsrb m0, [r3 + 5], 0
23190 pmaddubsw m3, m0, m6
23193 pinsrw m2, [r4 + 4], 0
23194 pmaddubsw m5, m2, m6
23197 movu [r0 + 968 * 16], m3
23200 pinsrw m1, [r4 + 12], 0
23201 pmaddubsw m3, m1, m6
23204 pinsrw m4, [r4 + 20], 0
23205 pmaddubsw m5, m4, m6
23208 movu [r0 + 969 * 16], m3
23211 movu m6, [r5 + 4 * 16]
23212 pmaddubsw m3, m0, m6
23214 pmaddubsw m5, m2, m6
23217 movu [r0 + 970 * 16], m3
23219 pmaddubsw m3, m1, m6
23221 pmaddubsw m5, m4, m6
23224 movu [r0 + 971 * 16], m3
23227 movu m6, [r5 + 10 * 16]
23229 pinsrb m0, [r3 + 5], 1
23230 pinsrb m0, [r3 + 6], 0
23231 pmaddubsw m3, m0, m6
23234 pinsrw m2, [r4 + 3], 0
23235 pmaddubsw m5, m2, m6
23238 movu [r0 + 972 * 16], m3
23241 pinsrw m1, [r4 + 11], 0
23242 pmaddubsw m3, m1, m6
23245 pinsrw m4, [r4 + 19], 0
23246 pmaddubsw m5, m4, m6
23249 movu [r0 + 973 * 16], m3
23252 movu m6, [r5 + 16 * 16]
23254 pinsrb m0, [r3 + 6], 1
23255 pinsrb m0, [r3 + 7], 0
23256 pmaddubsw m3, m0, m6
23259 pinsrw m2, [r4 + 2], 0
23260 pmaddubsw m5, m2, m6
23263 movu [r0 + 974 * 16], m3
23266 pinsrw m1, [r4 + 10], 0
23267 pmaddubsw m3, m1, m6
23270 pinsrw m4, [r4 + 18], 0
23271 pmaddubsw m5, m4, m6
23274 movu [r0 + 975 * 16], m3
23277 movu m6, [r5 + 22 * 16]
23279 pinsrb m0, [r3 + 7], 1
23280 pinsrb m0, [r3 + 9], 0
23281 pmaddubsw m3, m0, m6
23284 pinsrw m2, [r4 + 1], 0
23285 pmaddubsw m5, m2, m6
23288 movu [r0 + 976 * 16], m3
23291 pinsrw m1, [r4 + 9], 0
23292 pmaddubsw m3, m1, m6
23295 pinsrw m4, [r4 + 17], 0
23296 pmaddubsw m5, m4, m6
23299 movu [r0 + 977 * 16], m3
23302 movu m6, [r5 + 28 * 16]
23304 pinsrb m0, [r3 + 9], 1
23305 pinsrb m0, [r3 + 10], 0
23306 pmaddubsw m3, m0, m6
23309 pinsrw m2, [r4 + 0], 0
23310 pmaddubsw m5, m2, m6
23313 movu [r0 + 978 * 16], m3
23316 pinsrw m1, [r4 + 8], 0
23317 pmaddubsw m3, m1, m6
23320 pinsrw m4, [r4 + 16], 0
23321 pmaddubsw m5, m4, m6
23324 movu [r0 + 979 * 16], m3
23327 movu m6, [r5 + 2 * 16]
23328 pmaddubsw m3, m0, m6
23330 pmaddubsw m5, m2, m6
23333 movu [r0 + 980 * 16], m3
23335 pmaddubsw m3, m1, m6
23337 pmaddubsw m5, m4, m6
23340 movu [r0 + 981 * 16], m3
23343 movu m6, [r5 + 8 * 16]
23345 pinsrb m0, [r3 + 10], 1
23346 pinsrb m0, [r3 + 11], 0
23347 pmaddubsw m3, m0, m6
23350 pinsrb m2, [r4 + 0], 1
23351 pinsrb m2, [r3 + 1], 0
23352 pmaddubsw m5, m2, m6
23355 movu [r0 + 982 * 16], m3
23358 pinsrw m1, [r4 + 7], 0
23359 pmaddubsw m3, m1, m6
23362 pinsrw m4, [r4 + 15], 0
23363 pmaddubsw m5, m4, m6
23366 movu [r0 + 983 * 16], m3
23369 movu m6, [r5 + 14 * 16]
23371 pinsrb m0, [r3 + 11], 1
23372 pinsrb m0, [r3 + 12], 0
23373 pmaddubsw m3, m0, m6
23376 pinsrb m2, [r3 + 1], 1
23377 pinsrb m2, [r3 + 2], 0
23378 pmaddubsw m5, m2, m6
23381 movu [r0 + 984 * 16], m3
23384 pinsrw m1, [r4 + 6], 0
23385 pmaddubsw m3, m1, m6
23388 pinsrw m4, [r4 + 14], 0
23389 pmaddubsw m5, m4, m6
23392 movu [r0 + 985 * 16], m3
23395 movu m6, [r5 + 20 * 16]
23397 pinsrb m0, [r3 + 12], 1
23398 pinsrb m0, [r3 + 14], 0
23399 pmaddubsw m3, m0, m6
23402 pinsrb m2, [r3 + 2], 1
23403 pinsrb m2, [r3 + 4], 0
23404 pmaddubsw m5, m2, m6
23407 movu [r0 + 986 * 16], m3
23410 pinsrw m1, [r4 + 5], 0
23411 pmaddubsw m3, m1, m6
23414 pinsrw m4, [r4 + 13], 0
23415 pmaddubsw m5, m4, m6
23418 movu [r0 + 987 * 16], m3
23421 movu m6, [r5 + 26 * 16]
23423 pinsrb m0, [r3 + 14], 1
23424 pinsrb m0, [r3 + 15], 0
23425 pmaddubsw m3, m0, m6
23428 pinsrb m2, [r3 + 4], 1
23429 pinsrb m2, [r3 + 5], 0
23430 pmaddubsw m5, m2, m6
23433 movu [r0 + 988 * 16], m3
23436 pinsrw m1, [r4 + 4], 0
23437 pmaddubsw m3, m1, m6
23440 pinsrw m4, [r4 + 12], 0
23441 pmaddubsw m5, m4, m6
23444 movu [r0 + 989 * 16], m3
23447 pshufb m5, m0, [tab_S2]
23448 movh [r0 + 990 * 16], m5
23449 pshufb m5, m2, [tab_S2]
23450 movh [r0 + 990 * 16 + 8], m5
23451 pshufb m5, m1, [tab_S2]
23452 movh [r0 + 991 * 16], m5
23453 pshufb m5, m4, [tab_S2]
23454 movh [r0 + 991 * 16 + 8], m5
23457 movu m6, [r5 + 6 * 16]
23459 pinsrb m0, [r3 + 15], 1
23460 pinsrb m0, [r3 + 16], 0
23461 pmaddubsw m3, m0, m6
23464 pinsrb m2, [r3 + 5], 1
23465 pinsrb m2, [r3 + 6], 0
23466 pmaddubsw m5, m2, m6
23469 movu [r0 + 992 * 16], m3
23472 pinsrw m1, [r4 + 3], 0
23473 pmaddubsw m3, m1, m6
23476 pinsrw m4, [r4 + 11], 0
23477 pmaddubsw m5, m4, m6
23480 movu [r0 + 993 * 16], m3
23483 movu m6, [r5 + 12 * 16]
23485 pinsrb m0, [r3 + 16], 1
23486 pinsrb m0, [r3 + 17], 0
23487 pmaddubsw m3, m0, m6
23490 pinsrb m2, [r3 + 6], 1
23491 pinsrb m2, [r3 + 7], 0
23492 pmaddubsw m5, m2, m6
23495 movu [r0 + 994 * 16], m3
23498 pinsrw m1, [r4 + 2], 0
23499 pmaddubsw m3, m1, m6
23502 pinsrw m4, [r4 + 10], 0
23503 pmaddubsw m5, m4, m6
23506 movu [r0 + 995 * 16], m3
23509 movu m6, [r5 + 18 * 16]
23511 pinsrb m0, [r3 + 17], 1
23512 pinsrb m0, [r3 + 18], 0
23513 pmaddubsw m3, m0, m6
23516 pinsrb m2, [r3 + 7], 1
23517 pinsrb m2, [r3 + 9], 0
23518 pmaddubsw m5, m2, m6
23521 movu [r0 + 996 * 16], m3
23524 pinsrw m1, [r4 + 1], 0
23525 pmaddubsw m3, m1, m6
23528 pinsrw m4, [r4 + 9], 0
23529 pmaddubsw m5, m4, m6
23532 movu [r0 + 997 * 16], m3
23535 movu m6, [r5 + 24 * 16]
23537 pinsrb m0, [r3 + 18], 1
23538 pinsrb m0, [r3 + 20], 0
23539 pmaddubsw m3, m0, m6
23542 pinsrb m2, [r3 + 9], 1
23543 pinsrb m2, [r3 + 10], 0
23544 pmaddubsw m5, m2, m6
23547 movu [r0 + 998 * 16], m3
23550 pinsrw m1, [r4 + 0], 0
23551 pmaddubsw m3, m1, m6
23554 pinsrw m4, [r4 + 8], 0
23555 pmaddubsw m5, m4, m6
23558 movu [r0 + 999 * 16], m3
23561 movu m6, [r5 + 30 * 16]
23563 pinsrb m0, [r3 + 20], 1
23564 pinsrb m0, [r3 + 21], 0
23565 pmaddubsw m3, m0, m6
23568 pinsrb m2, [r3 + 10], 1
23569 pinsrb m2, [r3 + 11], 0
23570 pmaddubsw m5, m2, m6
23573 movu [r0 + 1000 * 16], m3
23576 pinsrb m1, [r4 + 0], 1
23577 pinsrb m1, [r3 + 1], 0
23578 pmaddubsw m3, m1, m6
23581 ;pinsrb m4, [r4 + 8], 1
23582 ;pinsrb m4, [r4 + 7], 0
23583 pinsrw m4, [r4 + 7], 0
23584 pmaddubsw m5, m4, m6
23587 movu [r0 + 1001 * 16], m3
23590 movu m6, [r5 + 4 * 16]
23591 pmaddubsw m3, m0, m6
23593 pmaddubsw m5, m2, m6
23596 movu [r0 + 1002 * 16], m3
23598 pmaddubsw m3, m1, m6
23600 pmaddubsw m5, m4, m6
23603 movu [r0 + 1003 * 16], m3
23606 movu m6, [r5 + 10 * 16]
23608 pinsrb m0, [r3 + 21], 1
23609 pinsrb m0, [r3 + 22], 0
23610 pmaddubsw m3, m0, m6
23613 pinsrb m2, [r3 + 11], 1
23614 pinsrb m2, [r3 + 12], 0
23615 pmaddubsw m5, m2, m6
23618 movu [r0 + 1004 * 16], m3
23621 pinsrb m1, [r3 + 1], 1
23622 pinsrb m1, [r3 + 2], 0
23623 pmaddubsw m3, m1, m6
23626 pinsrw m4, [r4 + 6], 0
23627 pmaddubsw m5, m4, m6
23630 movu [r0 + 1005 * 16], m3
23633 movu m6, [r5 + 16 * 16]
23635 pinsrb m0, [r3 + 22], 1
23636 pinsrb m0, [r3 + 23], 0
23637 pmaddubsw m3, m0, m6
23640 pinsrb m2, [r3 + 12], 1
23641 pinsrb m2, [r3 + 14], 0
23642 pmaddubsw m5, m2, m6
23645 movu [r0 + 1006 * 16], m3
23648 pinsrb m1, [r3 + 2], 1
23649 pinsrb m1, [r3 + 4], 0
23650 pmaddubsw m3, m1, m6
23653 pinsrw m4, [r4 + 5], 0
23654 pmaddubsw m5, m4, m6
23657 movu [r0 + 1007 * 16], m3
23660 movu m6, [r5 + 22 * 16]
23662 pinsrb m0, [r3 + 23], 1
23663 pinsrb m0, [r3 + 25], 0
23664 pmaddubsw m3, m0, m6
23667 pinsrb m2, [r3 + 14], 1
23668 pinsrb m2, [r3 + 15], 0
23669 pmaddubsw m5, m2, m6
23672 movu [r0 + 1008 * 16], m3
23675 pinsrb m1, [r3 + 4], 1
23676 pinsrb m1, [r3 + 5], 0
23677 pmaddubsw m3, m1, m6
23680 pinsrw m4, [r4 + 4], 0
23681 pmaddubsw m5, m4, m6
23684 movu [r0 + 1009 * 16], m3
23687 movu m6, [r5 + 28 * 16]
23689 pinsrb m0, [r3 + 25], 1
23690 pinsrb m0, [r3 + 26], 0
23691 pmaddubsw m3, m0, m6
23694 pinsrb m2, [r3 + 15], 1
23695 pinsrb m2, [r3 + 16], 0
23696 pmaddubsw m5, m2, m6
23699 movu [r0 + 1010 * 16], m3
23702 pinsrb m1, [r3 + 5], 1
23703 pinsrb m1, [r3 + 6], 0
23704 pmaddubsw m3, m1, m6
23707 pinsrw m4, [r4 + 3], 0
23708 pmaddubsw m5, m4, m6
23711 movu [r0 + 1011 * 16], m3
23714 movu m6, [r5 + 2 * 16]
23715 pmaddubsw m3, m0, m6
23717 pmaddubsw m5, m2, m6
23720 movu [r0 + 1012 * 16], m3
23722 pmaddubsw m3, m1, m6
23724 pmaddubsw m5, m4, m6
23727 movu [r0 + 1013 * 16], m3
23730 movu m6, [r5 + 8 * 16]
23732 pinsrb m0, [r3 + 26], 1
23733 pinsrb m0, [r3 + 27], 0
23734 pmaddubsw m3, m0, m6
23737 pinsrb m2, [r3 + 16], 1
23738 pinsrb m2, [r3 + 17], 0
23739 pmaddubsw m5, m2, m6
23742 movu [r0 + 1014 * 16], m3
23745 pinsrb m1, [r3 + 6], 1
23746 pinsrb m1, [r3 + 7], 0
23747 pmaddubsw m3, m1, m6
23750 pinsrw m4, [r4 + 2], 0
23751 pmaddubsw m5, m4, m6
23754 movu [r0 + 1015 * 16], m3
23757 movu m6, [r5 + 14 * 16]
23759 pinsrb m0, [r3 + 27], 1
23760 pinsrb m0, [r3 + 28], 0
23761 pmaddubsw m3, m0, m6
23764 pinsrb m2, [r3 + 17], 1
23765 pinsrb m2, [r3 + 18], 0
23766 pmaddubsw m5, m2, m6
23769 movu [r0 + 1016 * 16], m3
23772 pinsrb m1, [r3 + 7], 1
23773 pinsrb m1, [r3 + 9], 0
23774 pmaddubsw m3, m1, m6
23777 pinsrw m4, [r4 + 1], 0
23778 pmaddubsw m5, m4, m6
23781 movu [r0 + 1017 * 16], m3
23784 movu m6, [r5 + 20 * 16]
23786 pinsrb m0, [r3 + 28], 1
23787 pinsrb m0, [r3 + 30], 0
23788 pmaddubsw m3, m0, m6
23791 pinsrb m2, [r3 + 18], 1
23792 pinsrb m2, [r3 + 20], 0
23793 pmaddubsw m5, m2, m6
23796 movu [r0 + 1018 * 16], m3
23799 pinsrb m1, [r3 + 9], 1
23800 pinsrb m1, [r3 + 10], 0
23801 pmaddubsw m3, m1, m6
23804 pinsrw m4, [r4 + 0], 0
23805 pmaddubsw m5, m4, m6
23808 movu [r0 + 1019 * 16], m3
23811 movu m6, [r5 + 26 * 16]
23813 pinsrb m0, [r3 + 30], 1
23814 pinsrb m0, [r3 + 31], 0
23815 pmaddubsw m3, m0, m6
23818 pinsrb m2, [r3 + 20], 1
23819 pinsrb m2, [r3 + 21], 0
23820 pmaddubsw m5, m2, m6
23823 movu [r0 + 1020 * 16], m3
23826 pinsrb m1, [r3 + 10], 1
23827 pinsrb m1, [r3 + 11], 0
23828 pmaddubsw m3, m1, m6
23831 pinsrb m4, [r4 + 0], 1
23832 pinsrb m4, [r3 + 1], 0
23833 pmaddubsw m5, m4, m6
23836 movu [r0 + 1021 * 16], m3
23839 pshufb m5, m0, [tab_S2]
23840 movh [r0 + 1022 * 16], m5
23841 pshufb m5, m2, [tab_S2]
23842 movh [r0 + 1022 * 16 + 8], m5
23843 pshufb m5, m1, [tab_S2]
23844 movh [r0 + 1023 * 16], m5
23845 pshufb m5, m4, [tab_S2]
23846 movh [r0 + 1023 * 16 + 8], m5
23850 movu [r0 + 1024 * 16], m0
23852 movu [r0 + 1025 * 16], m1
23856 pinsrb m0, [r4 + 1], 0
23857 movu [r0 + 1026 * 16], m0
23859 pinsrb m1, [r3 + 15], 0
23860 movu [r0 + 1027 * 16], m1
23864 pinsrb m0, [r4 + 2], 0
23865 movu [r0 + 1028 * 16], m0
23867 pinsrb m1, [r3 + 14], 0
23868 movu [r0 + 1029 * 16], m1
23872 pinsrb m0, [r4 + 3], 0
23873 movu [r0 + 1030 * 16], m0
23875 pinsrb m1, [r3 + 13], 0
23876 movu [r0 + 1031 * 16], m1
23880 pinsrb m0, [r4 + 4], 0
23881 movu [r0 + 1032 * 16], m0
23883 pinsrb m1, [r3 + 12], 0
23884 movu [r0 + 1033 * 16], m1
23888 pinsrb m0, [r4 + 5], 0
23889 movu [r0 + 1034 * 16], m0
23891 pinsrb m1, [r3 + 11], 0
23892 movu [r0 + 1035 * 16], m1
23896 pinsrb m0, [r4 + 6], 0
23897 movu [r0 + 1036 * 16], m0
23899 pinsrb m1, [r3 + 10], 0
23900 movu [r0 + 1037 * 16], m1
23904 pinsrb m0, [r4 + 7], 0
23905 movu [r0 + 1038 * 16], m0
23907 pinsrb m1, [r3 + 9], 0
23908 movu [r0 + 1039 * 16], m1
23912 pinsrb m0, [r4 + 8], 0
23913 movu [r0 + 1040 * 16], m0
23915 pinsrb m1, [r3 + 8], 0
23916 movu [r0 + 1041 * 16], m1
23920 pinsrb m0, [r4 + 9], 0
23921 movu [r0 + 1042 * 16], m0
23923 pinsrb m1, [r3 + 7], 0
23924 movu [r0 + 1043 * 16], m1
23928 pinsrb m0, [r4 + 10], 0
23929 movu [r0 + 1044 * 16], m0
23931 pinsrb m1, [r3 + 6], 0
23932 movu [r0 + 1045 * 16], m1
23936 pinsrb m0, [r4 + 11], 0
23937 movu [r0 + 1046 * 16], m0
23939 pinsrb m1, [r3 + 5], 0
23940 movu [r0 + 1047 * 16], m1
23944 pinsrb m0, [r4 + 12], 0
23945 movu [r0 + 1048 * 16], m0
23947 pinsrb m1, [r3 + 4], 0
23948 movu [r0 + 1049 * 16], m1
23952 pinsrb m0, [r4 + 13], 0
23953 movu [r0 + 1050 * 16], m0
23955 pinsrb m1, [r3 + 3], 0
23956 movu [r0 + 1051 * 16], m1
23960 pinsrb m0, [r4 + 14], 0
23961 movu [r0 + 1052 * 16], m0
23963 pinsrb m1, [r3 + 2], 0
23964 movu [r0 + 1053 * 16], m1
23968 pinsrb m0, [r4 + 15], 0
23969 movu [r0 + 1054 * 16], m0
23971 pinsrb m1, [r3 + 1], 0
23972 movu [r0 + 1055 * 16], m1
23976 pinsrb m0, [r4 + 16], 0
23977 movu [r0 + 1056 * 16], m0
23979 pinsrb m1, [r3 + 0], 0
23980 movu [r0 + 1057 * 16], m1
23984 pinsrb m0, [r4 + 17], 0
23985 movu [r0 + 1058 * 16], m0
23987 pinsrb m1, [r4 + 1], 0
23988 movu [r0 + 1059 * 16], m1
23992 pinsrb m0, [r4 + 18], 0
23993 movu [r0 + 1060 * 16], m0
23995 pinsrb m1, [r4 + 2], 0
23996 movu [r0 + 1061 * 16], m1
24000 pinsrb m0, [r4 + 19], 0
24001 movu [r0 + 1062 * 16], m0
24003 pinsrb m1, [r4 + 3], 0
24004 movu [r0 + 1063 * 16], m1
24008 pinsrb m0, [r4 + 20], 0
24009 movu [r0 + 1064 * 16], m0
24011 pinsrb m1, [r4 + 4], 0
24012 movu [r0 + 1065 * 16], m1
24016 pinsrb m0, [r4 + 21], 0
24017 movu [r0 + 1066 * 16], m0
24019 pinsrb m1, [r4 + 5], 0
24020 movu [r0 + 1067 * 16], m1
24024 pinsrb m0, [r4 + 22], 0
24025 movu [r0 + 1068 * 16], m0
24027 pinsrb m1, [r4 + 6], 0
24028 movu [r0 + 1069 * 16], m1
24032 pinsrb m0, [r4 + 23], 0
24033 movu [r0 + 1070 * 16], m0
24035 pinsrb m1, [r4 + 7], 0
24036 movu [r0 + 1071 * 16], m1
24040 pinsrb m0, [r4 + 24], 0
24041 movu [r0 + 1072 * 16], m0
24043 pinsrb m1, [r4 + 8], 0
24044 movu [r0 + 1073 * 16], m1
24048 pinsrb m0, [r4 + 25], 0
24049 movu [r0 + 1074 * 16], m0
24051 pinsrb m1, [r4 + 9], 0
24052 movu [r0 + 1075 * 16], m1
24056 pinsrb m0, [r4 + 26], 0
24057 movu [r0 + 1076 * 16], m0
24059 pinsrb m1, [r4 + 10], 0
24060 movu [r0 + 1077 * 16], m1
24064 pinsrb m0, [r4 + 27], 0
24065 movu [r0 + 1078 * 16], m0
24067 pinsrb m1, [r4 + 11], 0
24068 movu [r0 + 1079 * 16], m1
24072 pinsrb m0, [r4 + 28], 0
24073 movu [r0 + 1080 * 16], m0
24075 pinsrb m1, [r4 + 12], 0
24076 movu [r0 + 1081 * 16], m1
24080 pinsrb m0, [r4 + 29], 0
24081 movu [r0 + 1082 * 16], m0
24083 pinsrb m1, [r4 + 13], 0
24084 movu [r0 + 1083 * 16], m1
24088 pinsrb m0, [r4 + 30], 0
24089 movu [r0 + 1084 * 16], m0
24091 pinsrb m1, [r4 + 14], 0
24092 movu [r0 + 1085 * 16], m1
24096 pinsrb m0, [r4 + 31], 0
24097 movu [r0 + 1086 * 16], m0
24099 pinsrb m1, [r4 + 15], 0
24100 movu [r0 + 1087 * 16], m1
24103 movu m6, [r5 + 6 * 16]
24107 pmaddubsw m1, m0, m6
24112 pmaddubsw m3, m2, m6
24115 movu [r0 + 1088 * 16], m1
24120 pmaddubsw m4, m1, m6
24125 pmaddubsw m5, m3, m6
24128 movu [r0 + 1089 * 16], m4
24131 movu m6, [r5 + 12 * 16]
24133 pinsrb m0, [r4 + 0], 1
24134 pinsrb m0, [r4 + 1], 0
24135 pmaddubsw m4, m0, m6
24138 pinsrw m2, [r3 + 7], 0
24139 pmaddubsw m5, m2, m6
24142 movu [r0 + 1090 * 16], m4
24144 pinsrw m1, [r3 + 15], 0
24145 pmaddubsw m4, m1, m6
24148 pinsrw m3, [r3 + 23], 0
24149 pmaddubsw m5, m3, m6
24152 movu [r0 + 1091 * 16], m4
24155 movu m6, [r5 + 18 * 16]
24157 pinsrb m0, [r4 + 1], 1
24158 pinsrb m0, [r4 + 2], 0
24159 pmaddubsw m4, m0, m6
24162 pinsrw m2, [r3 + 6], 0
24163 pmaddubsw m5, m2, m6
24166 movu [r0 + 1092 * 16], m4
24168 pinsrw m1, [r3 + 14], 0
24169 pmaddubsw m4, m1, m6
24172 pinsrw m3, [r3 + 22], 0
24173 pmaddubsw m5, m3, m6
24176 movu [r0 + 1093 * 16], m4
24179 movu m6, [r5 + 24 * 16]
24181 pinsrb m0, [r4 + 2], 1
24182 pinsrb m0, [r4 + 4], 0
24183 pmaddubsw m4, m0, m6
24186 pinsrw m2, [r3 + 5], 0
24187 pmaddubsw m5, m2, m6
24190 movu [r0 + 1094 * 16], m4
24192 pinsrw m1, [r3 + 13], 0
24193 pmaddubsw m4, m1, m6
24196 pinsrw m3, [r3 + 21], 0
24197 pmaddubsw m5, m3, m6
24200 movu [r0 + 1095 * 16], m4
24203 movu m6, [r5 + 30 * 16]
24205 pinsrb m0, [r4 + 4], 1
24206 pinsrb m0, [r4 + 5], 0
24207 pmaddubsw m4, m0, m6
24210 pinsrw m2, [r3 + 4], 0
24211 pmaddubsw m5, m2, m6
24214 movu [r0 + 1096 * 16], m4
24216 pinsrw m1, [r3 + 12], 0
24217 pmaddubsw m4, m1, m6
24220 pinsrw m3, [r3 + 20], 0
24221 pmaddubsw m5, m3, m6
24224 movu [r0 + 1097 * 16], m4
24227 movu m6, [r5 + 4 * 16]
24228 pmaddubsw m4, m0, m6
24230 pmaddubsw m5, m2, m6
24233 movu [r0 + 1098 * 16], m4
24234 pmaddubsw m4, m1, m6
24236 pmaddubsw m5, m3, m6
24239 movu [r0 + 1099 * 16], m4
24242 movu m6, [r5 + 10 * 16]
24244 pinsrb m0, [r4 + 5], 1
24245 pinsrb m0, [r4 + 6], 0
24246 pmaddubsw m4, m0, m6
24249 pinsrw m2, [r3 + 3], 0
24250 pmaddubsw m5, m2, m6
24253 movu [r0 + 1100 * 16], m4
24255 pinsrw m1, [r3 + 11], 0
24256 pmaddubsw m4, m1, m6
24259 pinsrw m3, [r3 + 19], 0
24260 pmaddubsw m5, m3, m6
24263 movu [r0 + 1101 * 16], m4
24266 movu m6, [r5 + 16 * 16]
24268 pinsrb m0, [r4 + 6], 1
24269 pinsrb m0, [r4 + 7], 0
24270 pmaddubsw m4, m0, m6
24273 pinsrw m2, [r3 + 2], 0
24274 pmaddubsw m5, m2, m6
24277 movu [r0 + 1102 * 16], m4
24279 pinsrw m1, [r3 + 10], 0
24280 pmaddubsw m4, m1, m6
24283 pinsrw m3, [r3 + 18], 0
24284 pmaddubsw m5, m3, m6
24287 movu [r0 + 1103 * 16], m4
24290 movu m6, [r5 + 22 * 16]
24292 pinsrb m0, [r4 + 7], 1
24293 pinsrb m0, [r4 + 9], 0
24294 pmaddubsw m4, m0, m6
24297 pinsrw m2, [r3 + 1], 0
24298 pmaddubsw m5, m2, m6
24301 movu [r0 + 1104 * 16], m4
24303 pinsrw m1, [r3 + 9], 0
24304 pmaddubsw m4, m1, m6
24307 pinsrw m3, [r3 + 17], 0
24308 pmaddubsw m5, m3, m6
24311 movu [r0 + 1105 * 16], m4
24314 movu m6, [r5 + 28 * 16]
24316 pinsrb m0, [r4 + 9], 1
24317 pinsrb m0, [r4 + 10], 0
24318 pmaddubsw m4, m0, m6
24321 pinsrw m2, [r3 + 0], 0
24322 pmaddubsw m5, m2, m6
24325 movu [r0 + 1106 * 16], m4
24327 pinsrw m1, [r3 + 8], 0
24328 pmaddubsw m4, m1, m6
24331 pinsrw m3, [r3 + 16], 0
24332 pmaddubsw m5, m3, m6
24335 movu [r0 + 1107 * 16], m4
24338 movu m6, [r5 + 2 * 16]
24339 pmaddubsw m4, m0, m6
24341 pmaddubsw m5, m2, m6
24344 movu [r0 + 1108 * 16], m4
24345 pmaddubsw m4, m1, m6
24347 pmaddubsw m5, m3, m6
24350 movu [r0 + 1109 * 16], m4
24353 movu m6, [r5 + 8 * 16]
24355 pinsrb m0, [r4 + 10], 1
24356 pinsrb m0, [r4 + 11], 0
24357 pmaddubsw m4, m0, m6
24360 pinsrb m2, [r3 + 0], 1
24361 pinsrb m2, [r4 + 1], 0
24362 pmaddubsw m5, m2, m6
24365 movu [r0 + 1110 * 16], m4
24367 pinsrw m1, [r3 + 7], 0
24368 pmaddubsw m4, m1, m6
24371 pinsrw m3, [r3 + 15], 0
24372 pmaddubsw m5, m3, m6
24375 movu [r0 + 1111 * 16], m4
24378 movu m6, [r5 + 14 * 16]
24380 pinsrb m0, [r4 + 11], 1
24381 pinsrb m0, [r4 + 12], 0
24382 pmaddubsw m4, m0, m6
24385 pinsrb m2, [r4 + 1], 1
24386 pinsrb m2, [r4 + 2], 0
24387 pmaddubsw m5, m2, m6
24390 movu [r0 + 1112 * 16], m4
24392 pinsrw m1, [r3 + 6], 0
24393 pmaddubsw m4, m1, m6
24396 pinsrw m3, [r3 + 14], 0
24397 pmaddubsw m5, m3, m6
24400 movu [r0 + 1113 * 16], m4
24403 movu m6, [r5 + 20 * 16]
24405 pinsrb m0, [r4 + 12], 1
24406 pinsrb m0, [r4 + 14], 0
24407 pmaddubsw m4, m0, m6
24410 pinsrb m2, [r4 + 2], 1
24411 pinsrb m2, [r4 + 4], 0
24412 pmaddubsw m5, m2, m6
24415 movu [r0 + 1114 * 16], m4
24417 pinsrw m1, [r3 + 5], 0
24418 pmaddubsw m4, m1, m6
24421 pinsrw m3, [r3 + 13], 0
24422 pmaddubsw m5, m3, m6
24425 movu [r0 + 1115 * 16], m4
24428 movu m6, [r5 + 26 * 16]
24430 pinsrb m0, [r4 + 14], 1
24431 pinsrb m0, [r4 + 15], 0
24432 pmaddubsw m4, m0, m6
24435 pinsrb m2, [r4 + 4], 1
24436 pinsrb m2, [r4 + 5], 0
24437 pmaddubsw m5, m2, m6
24440 movu [r0 + 1116 * 16], m4
24442 pinsrw m1, [r3 + 4], 0
24443 pmaddubsw m4, m1, m6
24446 pinsrw m3, [r3 + 12], 0
24447 pmaddubsw m5, m3, m6
24450 movu [r0 + 1117 * 16], m4
24453 pshufb m5, m0, [tab_S2]
24454 movh [r0 + 1118 * 16], m5
24455 pshufb m5, m2, [tab_S2]
24456 movh [r0 + 1118 * 16 + 8], m5
24457 pshufb m5, m1, [tab_S2]
24458 movh [r0 + 1119 * 16], m5
24459 pshufb m5, m3, [tab_S2]
24460 movh [r0 + 1119 * 16 + 8], m5
24463 movu m6, [r5 + 6 * 16]
24465 pinsrb m0, [r4 + 15], 1
24466 pinsrb m0, [r4 + 16], 0
24467 pmaddubsw m4, m0, m6
24470 pinsrb m2, [r4 + 5], 1
24471 pinsrb m2, [r4 + 6], 0
24472 pmaddubsw m5, m2, m6
24475 movu [r0 + 1120 * 16], m4
24477 pinsrw m1, [r3 + 3], 0
24478 pmaddubsw m4, m1, m6
24481 pinsrw m3, [r3 + 11], 0
24482 pmaddubsw m5, m3, m6
24485 movu [r0 + 1121 * 16], m4
24488 movu m6, [r5 + 12 * 16]
24490 pinsrb m0, [r4 + 16], 1
24491 pinsrb m0, [r4 + 17], 0
24492 pmaddubsw m4, m0, m6
24495 pinsrb m2, [r4 + 6], 1
24496 pinsrb m2, [r4 + 7], 0
24497 pmaddubsw m5, m2, m6
24500 movu [r0 + 1122 * 16], m4
24502 pinsrw m1, [r3 + 2], 0
24503 pmaddubsw m4, m1, m6
24506 pinsrw m3, [r3 + 10], 0
24507 pmaddubsw m5, m3, m6
24510 movu [r0 + 1123 * 16], m4
24513 movu m6, [r5 + 18 * 16]
24515 pinsrb m0, [r4 + 17], 1
24516 pinsrb m0, [r4 + 18], 0
24517 pmaddubsw m4, m0, m6
24520 pinsrb m2, [r4 + 7], 1
24521 pinsrb m2, [r4 + 9], 0
24522 pmaddubsw m5, m2, m6
24525 movu [r0 + 1124 * 16], m4
24527 pinsrw m1, [r3 + 1], 0
24528 pmaddubsw m4, m1, m6
24531 pinsrw m3, [r3 + 9], 0
24532 pmaddubsw m5, m3, m6
24535 movu [r0 + 1125 * 16], m4
24538 movu m6, [r5 + 24 * 16]
24540 pinsrb m0, [r4 + 18], 1
24541 pinsrb m0, [r4 + 20], 0
24542 pmaddubsw m4, m0, m6
24545 pinsrb m2, [r4 + 9], 1
24546 pinsrb m2, [r4 + 10], 0
24547 pmaddubsw m5, m2, m6
24550 movu [r0 + 1126 * 16], m4
24552 pinsrw m1, [r3 + 0], 0
24553 pmaddubsw m4, m1, m6
24556 pinsrw m3, [r3 + 8], 0
24557 pmaddubsw m5, m3, m6
24560 movu [r0 + 1127 * 16], m4
24563 movu m6, [r5 + 30 * 16]
24565 pinsrb m0, [r4 + 20], 1
24566 pinsrb m0, [r4 + 21], 0
24567 pmaddubsw m4, m0, m6
24570 pinsrb m2, [r4 + 10], 1
24571 pinsrb m2, [r4 + 11], 0
24572 pmaddubsw m5, m2, m6
24575 movu [r0 + 1128 * 16], m4
24577 pinsrb m1, [r4 + 0], 1
24578 pinsrb m1, [r4 + 1], 0
24579 pmaddubsw m4, m1, m6
24582 pinsrb m3, [r3 + 8], 1
24583 pinsrb m3, [r3 + 7], 0
24584 pmaddubsw m5, m3, m6
24587 movu [r0 + 1129 * 16], m4
24590 movu m6, [r5 + 4 * 16]
24591 pmaddubsw m4, m0, m6
24593 pmaddubsw m5, m2, m6
24596 movu [r0 + 1130 * 16], m4
24597 pmaddubsw m4, m1, m6
24599 pmaddubsw m5, m3, m6
24602 movu [r0 + 1131 * 16], m4
24605 movu m6, [r5 + 10 * 16]
24607 pinsrb m0, [r4 + 21], 1
24608 pinsrb m0, [r4 + 22], 0
24609 pmaddubsw m4, m0, m6
24612 pinsrb m2, [r4 + 11], 1
24613 pinsrb m2, [r4 + 12], 0
24614 pmaddubsw m5, m2, m6
24617 movu [r0 + 1132 * 16], m4
24619 pinsrb m1, [r4 + 1], 1
24620 pinsrb m1, [r4 + 2], 0
24621 pmaddubsw m4, m1, m6
24624 pinsrw m3, [r3 + 6], 0
24625 pmaddubsw m5, m3, m6
24628 movu [r0 + 1133 * 16], m4
24631 movu m6, [r5 + 16 * 16]
24633 pinsrb m0, [r4 + 22], 1
24634 pinsrb m0, [r4 + 23], 0
24635 pmaddubsw m4, m0, m6
24638 pinsrb m2, [r4 + 12], 1
24639 pinsrb m2, [r4 + 14], 0
24640 pmaddubsw m5, m2, m6
24643 movu [r0 + 1134 * 16], m4
24645 pinsrb m1, [r4 + 2], 1
24646 pinsrb m1, [r4 + 4], 0
24647 pmaddubsw m4, m1, m6
24650 pinsrw m3, [r3 + 5], 0
24651 pmaddubsw m5, m3, m6
24654 movu [r0 + 1135 * 16], m4
24657 movu m6, [r5 + 22 * 16]
24659 pinsrb m0, [r4 + 23], 1
24660 pinsrb m0, [r4 + 25], 0
24661 pmaddubsw m4, m0, m6
24664 pinsrb m2, [r4 + 14], 1
24665 pinsrb m2, [r4 + 15], 0
24666 pmaddubsw m5, m2, m6
24669 movu [r0 + 1136 * 16], m4
24671 pinsrb m1, [r4 + 4], 1
24672 pinsrb m1, [r4 + 5], 0
24673 pmaddubsw m4, m1, m6
24676 pinsrw m3, [r3 + 4], 0
24677 pmaddubsw m5, m3, m6
24680 movu [r0 + 1137 * 16], m4
24683 movu m6, [r5 + 28 * 16]
24685 pinsrb m0, [r4 + 25], 1
24686 pinsrb m0, [r4 + 26], 0
24687 pmaddubsw m4, m0, m6
24690 pinsrb m2, [r4 + 15], 1
24691 pinsrb m2, [r4 + 16], 0
24692 pmaddubsw m5, m2, m6
24695 movu [r0 + 1138 * 16], m4
24697 pinsrb m1, [r4 + 5], 1
24698 pinsrb m1, [r4 + 6], 0
24699 pmaddubsw m4, m1, m6
24702 pinsrw m3, [r3 + 3], 0
24703 pmaddubsw m5, m3, m6
24706 movu [r0 + 1139 * 16], m4
24709 movu m6, [r5 + 2 * 16]
24710 pmaddubsw m4, m0, m6
24712 pmaddubsw m5, m2, m6
24715 movu [r0 + 1140 * 16], m4
24716 pmaddubsw m4, m1, m6
24718 pmaddubsw m5, m3, m6
24721 movu [r0 + 1141 * 16], m4
24724 movu m6, [r5 + 8 * 16]
24726 pinsrb m0, [r4 + 26], 1
24727 pinsrb m0, [r4 + 27], 0
24728 pmaddubsw m4, m0, m6
24731 pinsrb m2, [r4 + 16], 1
24732 pinsrb m2, [r4 + 17], 0
24733 pmaddubsw m5, m2, m6
24736 movu [r0 + 1142 * 16], m4
24738 pinsrb m1, [r4 + 6], 1
24739 pinsrb m1, [r4 + 7], 0
24740 pmaddubsw m4, m1, m6
24743 pinsrw m3, [r3 + 2], 0
24744 pmaddubsw m5, m3, m6
24747 movu [r0 + 1143 * 16], m4
24750 movu m6, [r5 + 14 * 16]
24752 pinsrb m0, [r4 + 27], 1
24753 pinsrb m0, [r4 + 28], 0
24754 pmaddubsw m4, m0, m6
24757 pinsrb m2, [r4 + 17], 1
24758 pinsrb m2, [r4 + 18], 0
24759 pmaddubsw m5, m2, m6
24762 movu [r0 + 1144 * 16], m4
24764 pinsrb m1, [r4 + 7], 1
24765 pinsrb m1, [r4 + 9], 0
24766 pmaddubsw m4, m1, m6
24769 pinsrw m3, [r3 + 1], 0
24770 pmaddubsw m5, m3, m6
24773 movu [r0 + 1145 * 16], m4
24776 movu m6, [r5 + 20 * 16]
24778 pinsrb m0, [r4 + 28], 1
24779 pinsrb m0, [r4 + 30], 0
24780 pmaddubsw m4, m0, m6
24783 pinsrb m2, [r4 + 18], 1
24784 pinsrb m2, [r4 + 20], 0
24785 pmaddubsw m5, m2, m6
24788 movu [r0 + 1146 * 16], m4
24790 pinsrb m1, [r4 + 9], 1
24791 pinsrb m1, [r4 + 10], 0
24792 pmaddubsw m4, m1, m6
24795 pinsrw m3, [r3 + 0], 0
24796 pmaddubsw m5, m3, m6
24799 movu [r0 + 1147 * 16], m4
24802 movu m6, [r5 + 26 * 16]
24804 pinsrb m0, [r4 + 30], 1
24805 pinsrb m0, [r4 + 31], 0
24806 pmaddubsw m4, m0, m6
24809 pinsrb m2, [r4 + 20], 1
24810 pinsrb m2, [r4 + 21], 0
24811 pmaddubsw m5, m2, m6
24814 movu [r0 + 1148 * 16], m4
24816 pinsrb m1, [r4 + 10], 1
24817 pinsrb m1, [r4 + 11], 0
24818 pmaddubsw m4, m1, m6
24821 pinsrb m3, [r4 + 0], 1
24822 pinsrb m3, [r4 + 1], 0
24823 pmaddubsw m5, m3, m6
24826 movu [r0 + 1149 * 16], m4
24829 pshufb m5, m0, [tab_S2]
24830 movh [r0 + 1150 * 16], m5
24831 pshufb m5, m2, [tab_S2]
24832 movh [r0 + 1150 * 16 + 8], m5
24833 pshufb m5, m1, [tab_S2]
24834 movh [r0 + 1151 * 16], m5
24835 pshufb m5, m3, [tab_S2]
24836 movh [r0 + 1151 * 16 + 8], m5
24839 movu m6, [r5 + 11 * 16]
24843 pmaddubsw m1, m0, m6
24848 pmaddubsw m3, m2, m6
24851 movu [r0 + 1152 * 16], m1
24856 pmaddubsw m4, m1, m6
24861 pmaddubsw m5, m3, m6
24864 movu [r0 + 1153 * 16], m4
24867 movu m6, [r5 + 22 * 16]
24869 pinsrb m0, [r4 + 0], 1
24870 pinsrb m0, [r4 + 2], 0
24871 pmaddubsw m4, m0, m6
24874 pinsrw m2, [r3 + 7], 0
24875 pmaddubsw m5, m2, m6
24878 movu [r0 + 1154 * 16], m4
24880 pinsrw m1, [r3 + 15], 0
24881 pmaddubsw m4, m1, m6
24884 pinsrw m3, [r3 + 23], 0
24885 pmaddubsw m5, m3, m6
24888 movu [r0 + 1155 * 16], m4
24891 movu m6, [r5 + 1 * 16]
24892 pmaddubsw m4, m0, m6
24894 pmaddubsw m5, m2, m6
24897 movu [r0 + 1156 * 16], m4
24898 pmaddubsw m4, m1, m6
24900 pmaddubsw m5, m3, m6
24903 movu [r0 + 1157 * 16], m4
24906 movu m6, [r5 + 12 * 16]
24908 pinsrb m0, [r4 + 2], 1
24909 pinsrb m0, [r4 + 3], 0
24910 pmaddubsw m4, m0, m6
24913 pinsrw m2, [r3 + 6], 0
24914 pmaddubsw m5, m2, m6
24917 movu [r0 + 1158 * 16], m4
24919 pinsrw m1, [r3 + 14], 0
24920 pmaddubsw m4, m1, m6
24923 pinsrw m3, [r3 + 22], 0
24924 pmaddubsw m5, m3, m6
24927 movu [r0 + 1159 * 16], m4
24930 movu m6, [r5 + 23 * 16]
24932 pinsrb m0, [r4 + 3], 1
24933 pinsrb m0, [r4 + 5], 0
24934 pmaddubsw m4, m0, m6
24937 pinsrw m2, [r3 + 5], 0
24938 pmaddubsw m5, m2, m6
24941 movu [r0 + 1160 * 16], m4
24943 pinsrw m1, [r3 + 13], 0
24944 pmaddubsw m4, m1, m6
24947 pinsrw m3, [r3 + 21], 0
24948 pmaddubsw m5, m3, m6
24951 movu [r0 + 1161 * 16], m4
24954 movu m6, [r5 + 2 * 16]
24955 pmaddubsw m4, m0, m6
24957 pmaddubsw m5, m2, m6
24960 movu [r0 + 1162 * 16], m4
24961 pmaddubsw m4, m1, m6
24963 pmaddubsw m5, m3, m6
24966 movu [r0 + 1163 * 16], m4
24969 movu m6, [r5 + 13 * 16]
24971 pinsrb m0, [r4 + 5], 1
24972 pinsrb m0, [r4 + 6], 0
24973 pmaddubsw m4, m0, m6
24976 pinsrw m2, [r3 + 4], 0
24977 pmaddubsw m5, m2, m6
24980 movu [r0 + 1164 * 16], m4
24982 pinsrw m1, [r3 + 12], 0
24983 pmaddubsw m4, m1, m6
24986 pinsrw m3, [r3 + 20], 0
24987 pmaddubsw m5, m3, m6
24990 movu [r0 + 1165 * 16], m4
24993 movu m6, [r5 + 24 * 16]
24995 pinsrb m0, [r4 + 6], 1
24996 pinsrb m0, [r4 + 8], 0
24997 pmaddubsw m4, m0, m6
25000 pinsrw m2, [r3 + 3], 0
25001 pmaddubsw m5, m2, m6
25004 movu [r0 + 1166 * 16], m4
25006 pinsrw m1, [r3 + 11], 0
25007 pmaddubsw m4, m1, m6
25010 pinsrw m3, [r3 + 19], 0
25011 pmaddubsw m5, m3, m6
25014 movu [r0 + 1167 * 16], m4
25017 movu m6, [r5 + 3 * 16]
25018 pmaddubsw m4, m0, m6
25020 pmaddubsw m5, m2, m6
25023 movu [r0 + 1168 * 16], m4
25024 pmaddubsw m4, m1, m6
25026 pmaddubsw m5, m3, m6
25029 movu [r0 + 1169 * 16], m4
25032 movu m6, [r5 + 14 * 16]
25034 pinsrb m0, [r4 + 8], 1
25035 pinsrb m0, [r4 + 9], 0
25036 pmaddubsw m4, m0, m6
25039 pinsrb m2, [r3 + 3], 1
25040 pinsrb m2, [r3 + 2], 0
25041 pmaddubsw m5, m2, m6
25044 movu [r0 + 1170 * 16], m4
25046 pinsrw m1, [r3 + 10], 0
25047 pmaddubsw m4, m1, m6
25050 pinsrw m3, [r3 + 18], 0
25051 pmaddubsw m5, m3, m6
25054 movu [r0 + 1171 * 16], m4
25057 movu m6, [r5 + 25 * 16]
25059 pinsrb m0, [r4 + 9], 1
25060 pinsrb m0, [r4 + 11], 0
25061 pmaddubsw m4, m0, m6
25064 pinsrw m2, [r3 + 1], 0
25065 pmaddubsw m5, m2, m6
25068 movu [r0 + 1172 * 16], m4
25070 pinsrw m1, [r3 + 9], 0
25071 pmaddubsw m4, m1, m6
25074 pinsrw m3, [r3 + 17], 0
25075 pmaddubsw m5, m3, m6
25078 movu [r0 + 1173 * 16], m4
25081 movu m6, [r5 + 4 * 16]
25082 pmaddubsw m4, m0, m6
25084 pmaddubsw m5, m2, m6
25087 movu [r0 + 1174 * 16], m4
25088 pmaddubsw m4, m1, m6
25090 pmaddubsw m5, m3, m6
25093 movu [r0 + 1175 * 16], m4
25096 movu m6, [r5 + 15 * 16]
25098 pinsrb m0, [r4 + 11], 1
25099 pinsrb m0, [r4 + 12], 0
25100 pmaddubsw m4, m0, m6
25103 pinsrb m2, [r3 + 1], 1
25104 pinsrb m2, [r3 + 0], 0
25105 pmaddubsw m5, m2, m6
25108 movu [r0 + 1176 * 16], m4
25110 pinsrw m1, [r3 + 8], 0
25111 pmaddubsw m4, m1, m6
25114 pinsrw m3, [r3 + 16], 0
25115 pmaddubsw m5, m3, m6
25118 movu [r0 + 1177 * 16], m4
25121 movu m6, [r5 + 26 * 16]
25123 pinsrb m0, [r4 + 12], 1
25124 pinsrb m0, [r4 + 14], 0
25125 pmaddubsw m4, m0, m6
25128 pinsrb m2, [r4 + 0], 1
25129 pinsrb m2, [r4 + 2], 0
25130 pmaddubsw m5, m2, m6
25133 movu [r0 + 1178 * 16], m4
25135 pinsrw m1, [r3 + 7], 0
25136 pmaddubsw m4, m1, m6
25139 pinsrw m3, [r3 + 15], 0
25140 pmaddubsw m5, m3, m6
25143 movu [r0 + 1179 * 16], m4
25146 movu m6, [r5 + 5 * 16]
25147 pmaddubsw m4, m0, m6
25149 pmaddubsw m5, m2, m6
25152 movu [r0 + 1180 * 16], m4
25153 pmaddubsw m4, m1, m6
25155 pmaddubsw m5, m3, m6
25158 movu [r0 + 1181 * 16], m4
25161 movu m6, [r5 + 16 * 16]
25163 pinsrb m0, [r4 + 14], 1
25164 pinsrb m0, [r4 + 15], 0
25165 pmaddubsw m4, m0, m6
25168 pinsrb m2, [r4 + 2], 1
25169 pinsrb m2, [r4 + 3], 0
25170 pmaddubsw m5, m2, m6
25173 movu [r0 + 1182 * 16], m4
25175 pinsrw m1, [r3 + 6], 0
25176 pmaddubsw m4, m1, m6
25179 pinsrw m3, [r3 + 14], 0
25180 pmaddubsw m5, m3, m6
25183 movu [r0 + 1183 * 16], m4
25186 movu m6, [r5 + 27 * 16]
25188 pinsrb m0, [r4 + 15], 1
25189 pinsrb m0, [r4 + 17], 0
25190 pmaddubsw m4, m0, m6
25193 pinsrb m2, [r4 + 3], 1
25194 pinsrb m2, [r4 + 5], 0
25195 pmaddubsw m5, m2, m6
25198 movu [r0 + 1184 * 16], m4
25200 pinsrw m1, [r3 + 5], 0
25201 pmaddubsw m4, m1, m6
25204 pinsrw m3, [r3 + 13], 0
25205 pmaddubsw m5, m3, m6
25208 movu [r0 + 1185 * 16], m4
25211 movu m6, [r5 + 6 * 16]
25212 pmaddubsw m4, m0, m6
25214 pmaddubsw m5, m2, m6
25217 movu [r0 + 1186 * 16], m4
25218 pmaddubsw m4, m1, m6
25220 pmaddubsw m5, m3, m6
25223 movu [r0 + 1187 * 16], m4
25226 movu m6, [r5 + 17 * 16]
25228 pinsrb m0, [r4 + 17], 1
25229 pinsrb m0, [r4 + 18], 0
25230 pmaddubsw m4, m0, m6
25233 pinsrb m2, [r4 + 5], 1
25234 pinsrb m2, [r4 + 6], 0
25235 pmaddubsw m5, m2, m6
25238 movu [r0 + 1188 * 16], m4
25240 pinsrw m1, [r3 + 4], 0
25241 pmaddubsw m4, m1, m6
25244 pinsrw m3, [r3 + 12], 0
25245 pmaddubsw m5, m3, m6
25248 movu [r0 + 1189 * 16], m4
25251 movu m6, [r5 + 28 * 16]
25253 pinsrb m0, [r4 + 18], 1
25254 pinsrb m0, [r4 + 20], 0
25255 pmaddubsw m4, m0, m6
25258 pinsrb m2, [r4 + 6], 1
25259 pinsrb m2, [r4 + 8], 0
25260 pmaddubsw m5, m2, m6
25263 movu [r0 + 1190 * 16], m4
25265 pinsrw m1, [r3 + 3], 0
25266 pmaddubsw m4, m1, m6
25269 pinsrw m3, [r3 + 11], 0
25270 pmaddubsw m5, m3, m6
25273 movu [r0 + 1191 * 16], m4
25276 movu m6, [r5 + 7 * 16]
25277 pmaddubsw m4, m0, m6
25279 pmaddubsw m5, m2, m6
25282 movu [r0 + 1192 * 16], m4
25283 pmaddubsw m4, m1, m6
25285 pmaddubsw m5, m3, m6
25288 movu [r0 + 1193 * 16], m4
25291 movu m6, [r5 + 18 * 16]
25293 pinsrb m0, [r4 + 20], 1
25294 pinsrb m0, [r4 + 21], 0
25295 pmaddubsw m4, m0, m6
25298 pinsrb m2, [r4 + 8], 1
25299 pinsrb m2, [r4 + 9], 0
25300 pmaddubsw m5, m2, m6
25303 movu [r0 + 1194 * 16], m4
25305 pinsrw m1, [r3 + 2], 0
25306 pmaddubsw m4, m1, m6
25309 pinsrw m3, [r3 + 10], 0
25310 pmaddubsw m5, m3, m6
25313 movu [r0 + 1195 * 16], m4
25316 movu m6, [r5 + 29 * 16]
25318 pinsrb m0, [r4 + 21], 1
25319 pinsrb m0, [r4 + 23], 0
25320 pmaddubsw m4, m0, m6
25323 pinsrb m2, [r4 + 9], 1
25324 pinsrb m2, [r4 + 11], 0
25325 pmaddubsw m5, m2, m6
25328 movu [r0 + 1196 * 16], m4
25330 pinsrw m1, [r3 + 1], 0
25331 pmaddubsw m4, m1, m6
25334 pinsrw m3, [r3 + 9], 0
25335 pmaddubsw m5, m3, m6
25338 movu [r0 + 1197 * 16], m4
25341 movu m6, [r5 + 8 * 16]
25342 pmaddubsw m4, m0, m6
25344 pmaddubsw m5, m2, m6
25347 movu [r0 + 1198 * 16], m4
25348 pmaddubsw m4, m1, m6
25350 pmaddubsw m5, m3, m6
25353 movu [r0 + 1199 * 16], m4
25356 movu m6, [r5 + 19 * 16]
25358 pinsrb m0, [r4 + 23], 1
25359 pinsrb m0, [r4 + 24], 0
25360 pmaddubsw m4, m0, m6
25363 pinsrb m2, [r4 + 11], 1
25364 pinsrb m2, [r4 + 12], 0
25365 pmaddubsw m5, m2, m6
25368 movu [r0 + 1200 * 16], m4
25370 pinsrw m1, [r3 + 0], 0
25371 pmaddubsw m4, m1, m6
25374 pinsrw m3, [r3 + 8], 0
25375 pmaddubsw m5, m3, m6
25378 movu [r0 + 1201 * 16], m4
25381 movu m6, [r5 + 30 * 16]
25383 pinsrb m0, [r4 + 24], 1
25384 pinsrb m0, [r4 + 26], 0
25385 pmaddubsw m4, m0, m6
25388 pinsrb m2, [r4 + 12], 1
25389 pinsrb m2, [r4 + 14], 0
25390 pmaddubsw m5, m2, m6
25393 movu [r0 + 1202 * 16], m4
25395 pinsrb m1, [r4 + 0], 1
25396 pinsrb m1, [r4 + 2], 0
25397 pmaddubsw m4, m1, m6
25400 pinsrw m3, [r3 + 7], 0
25401 pmaddubsw m5, m3, m6
25404 movu [r0 + 1203 * 16], m4
25407 movu m6, [r5 + 9 * 16]
25408 pmaddubsw m4, m0, m6
25410 pmaddubsw m5, m2, m6
25413 movu [r0 + 1204 * 16], m4
25414 pmaddubsw m4, m1, m6
25416 pmaddubsw m5, m3, m6
25419 movu [r0 + 1205 * 16], m4
25422 movu m6, [r5 + 20 * 16]
25424 pinsrb m0, [r4 + 26], 1
25425 pinsrb m0, [r4 + 27], 0
25426 pmaddubsw m4, m0, m6
25429 pinsrb m2, [r4 + 14], 1
25430 pinsrb m2, [r4 + 15], 0
25431 pmaddubsw m5, m2, m6
25434 movu [r0 + 1206 * 16], m4
25436 pinsrb m1, [r4 + 2], 1
25437 pinsrb m1, [r4 + 3], 0
25438 pmaddubsw m4, m1, m6
25441 pinsrw m3, [r3 + 6], 0
25442 pmaddubsw m5, m3, m6
25445 movu [r0 + 1207 * 16], m4
25448 movu m6, [r5 + 31 * 16]
25450 pinsrb m0, [r4 + 27], 1
25451 pinsrb m0, [r4 + 29], 0
25452 pmaddubsw m4, m0, m6
25455 pinsrb m2, [r4 + 15], 1
25456 pinsrb m2, [r4 + 17], 0
25457 pmaddubsw m5, m2, m6
25460 movu [r0 + 1208 * 16], m4
25462 pinsrb m1, [r4 + 3], 1
25463 pinsrb m1, [r4 + 5], 0
25464 pmaddubsw m4, m1, m6
25467 pinsrw m3, [r3 + 5], 0
25468 pmaddubsw m5, m3, m6
25471 movu [r0 + 1209 * 16], m4
25474 movu m6, [r5 + 10 * 16]
25475 pmaddubsw m4, m0, m6
25477 pmaddubsw m5, m2, m6
25480 movu [r0 + 1210 * 16], m4
25481 pmaddubsw m4, m1, m6
25483 pmaddubsw m5, m3, m6
25486 movu [r0 + 1211 * 16], m4
25489 movu m6, [r5 + 21 * 16]
25491 pinsrb m0, [r4 + 29], 1
25492 pinsrb m0, [r4 + 30], 0
25493 pmaddubsw m4, m0, m6
25496 pinsrb m2, [r4 + 17], 1
25497 pinsrb m2, [r4 + 18], 0
25498 pmaddubsw m5, m2, m6
25501 movu [r0 + 1212 * 16], m4
25503 pinsrb m1, [r4 + 5], 1
25504 pinsrb m1, [r4 + 6], 0
25505 pmaddubsw m4, m1, m6
25508 pinsrw m3, [r3 + 4], 0
25509 pmaddubsw m5, m3, m6
25512 movu [r0 + 1213 * 16], m4
25515 pshufb m5, m0, [tab_S2]
25516 movh [r0 + 1214 * 16], m5
25517 pshufb m5, m2, [tab_S2]
25518 movh [r0 + 1214 * 16 + 8], m5
25519 pshufb m5, m1, [tab_S2]
25520 movh [r0 + 1215 * 16], m5
25521 pshufb m5, m3, [tab_S2]
25522 movh [r0 + 1215 * 16 + 8], m5
25525 movu m6, [r5 + 15 * 16]
25529 pmaddubsw m1, m0, m6
25534 pmaddubsw m3, m2, m6
25537 movu [r0 + 1216 * 16], m1
25542 pmaddubsw m4, m1, m6
25547 pmaddubsw m5, m3, m6
25550 movu [r0 + 1217 * 16], m4
25553 movu m6, [r5 + 30 * 16]
25555 pinsrb m0, [r4 + 0], 1
25556 pinsrb m0, [r4 + 2], 0
25557 pmaddubsw m4, m0, m6
25560 pinsrw m2, [r3 + 7], 0
25561 pmaddubsw m5, m2, m6
25564 movu [r0 + 1218 * 16], m4
25566 pinsrw m1, [r3 + 15], 0
25567 pmaddubsw m4, m1, m6
25570 pinsrw m3, [r3 + 23], 0
25571 pmaddubsw m5, m3, m6
25574 movu [r0 + 1219 * 16], m4
25577 movu m6, [r5 + 13 * 16]
25578 pmaddubsw m4, m0, m6
25580 pmaddubsw m5, m2, m6
25583 movu [r0 + 1220 * 16], m4
25584 pmaddubsw m4, m1, m6
25586 pmaddubsw m5, m3, m6
25589 movu [r0 + 1221 * 16], m4
25592 movu m6, [r5 + 28 * 16]
25594 pinsrb m0, [r4 + 2], 1
25595 pinsrb m0, [r4 + 4], 0
25596 pmaddubsw m4, m0, m6
25599 pinsrw m2, [r3 + 6], 0
25600 pmaddubsw m5, m2, m6
25603 movu [r0 + 1222 * 16], m4
25605 pinsrw m1, [r3 + 14], 0
25606 pmaddubsw m4, m1, m6
25609 pinsrw m3, [r3 + 22], 0
25610 pmaddubsw m5, m3, m6
25613 movu [r0 + 1223 * 16], m4
25616 movu m6, [r5 + 11 * 16]
25617 pmaddubsw m4, m0, m6
25619 pmaddubsw m5, m2, m6
25622 movu [r0 + 1224 * 16], m4
25623 pmaddubsw m4, m1, m6
25625 pmaddubsw m5, m3, m6
25628 movu [r0 + 1225 * 16], m4
25631 movu m6, [r5 + 26 * 16]
25633 pinsrb m0, [r4 + 4], 1
25634 pinsrb m0, [r4 + 6], 0
25635 pmaddubsw m4, m0, m6
25638 pinsrw m2, [r3 + 5], 0
25639 pmaddubsw m5, m2, m6
25642 movu [r0 + 1226 * 16], m4
25644 pinsrw m1, [r3 + 13], 0
25645 pmaddubsw m4, m1, m6
25648 pinsrw m3, [r3 + 21], 0
25649 pmaddubsw m5, m3, m6
25652 movu [r0 + 1227 * 16], m4
25655 movu m6, [r5 + 9 * 16]
25656 pmaddubsw m4, m0, m6
25658 pmaddubsw m5, m2, m6
25661 movu [r0 + 1228 * 16], m4
25662 pmaddubsw m4, m1, m6
25664 pmaddubsw m5, m3, m6
25667 movu [r0 + 1229 * 16], m4
25670 movu m6, [r5 + 24 * 16]
25672 pinsrb m0, [r4 + 6], 1
25673 pinsrb m0, [r4 + 8], 0
25674 pmaddubsw m4, m0, m6
25677 pinsrw m2, [r3 + 4], 0
25678 pmaddubsw m5, m2, m6
25681 movu [r0 + 1230 * 16], m4
25683 pinsrw m1, [r3 + 12], 0
25684 pmaddubsw m4, m1, m6
25687 pinsrw m3, [r3 + 20], 0
25688 pmaddubsw m5, m3, m6
25691 movu [r0 + 1231 * 16], m4
25694 movu m6, [r5 + 7 * 16]
25695 pmaddubsw m4, m0, m6
25697 pmaddubsw m5, m2, m6
25700 movu [r0 + 1232 * 16], m4
25701 pmaddubsw m4, m1, m6
25703 pmaddubsw m5, m3, m6
25706 movu [r0 + 1233 * 16], m4
25709 movu m6, [r5 + 22 * 16]
25711 pinsrb m0, [r4 + 8], 1
25712 pinsrb m0, [r4 + 9], 0
25713 pmaddubsw m4, m0, m6
25716 pinsrw m2, [r3 + 3], 0
25717 pmaddubsw m5, m2, m6
25720 movu [r0 + 1234 * 16], m4
25722 pinsrw m1, [r3 + 11], 0
25723 pmaddubsw m4, m1, m6
25726 pinsrw m3, [r3 + 19], 0
25727 pmaddubsw m5, m3, m6
25730 movu [r0 + 1235 * 16], m4
25733 movu m6, [r5 + 5 * 16]
25734 pmaddubsw m4, m0, m6
25736 pmaddubsw m5, m2, m6
25739 movu [r0 + 1236 * 16], m4
25740 pmaddubsw m4, m1, m6
25742 pmaddubsw m5, m3, m6
25745 movu [r0 + 1237 * 16], m4
25748 movu m6, [r5 + 20 * 16]
25750 pinsrb m0, [r4 + 9], 1
25751 pinsrb m0, [r4 + 11], 0
25752 pmaddubsw m4, m0, m6
25755 pinsrw m2, [r3 + 2], 0
25756 pmaddubsw m5, m2, m6
25759 movu [r0 + 1238 * 16], m4
25761 pinsrw m1, [r3 + 10], 0
25762 pmaddubsw m4, m1, m6
25765 pinsrw m3, [r3 + 18], 0
25766 pmaddubsw m5, m3, m6
25769 movu [r0 + 1239 * 16], m4
25772 movu m6, [r5 + 3 * 16]
25773 pmaddubsw m4, m0, m6
25775 pmaddubsw m5, m2, m6
25778 movu [r0 + 1240 * 16], m4
25779 pmaddubsw m4, m1, m6
25781 pmaddubsw m5, m3, m6
25784 movu [r0 + 1241 * 16], m4
25787 movu m6, [r5 + 18 * 16]
25789 pinsrb m0, [r4 + 11], 1
25790 pinsrb m0, [r4 + 13], 0
25791 pmaddubsw m4, m0, m6
25794 pinsrw m2, [r3 + 1], 0
25795 pmaddubsw m5, m2, m6
25798 movu [r0 + 1242 * 16], m4
25800 pinsrw m1, [r3 + 9], 0
25801 pmaddubsw m4, m1, m6
25804 pinsrw m3, [r3 + 17], 0
25805 pmaddubsw m5, m3, m6
25808 movu [r0 + 1243 * 16], m4
25811 movu m6, [r5 + 1 * 16]
25812 pmaddubsw m4, m0, m6
25814 pmaddubsw m5, m2, m6
25817 movu [r0 + 1244 * 16], m4
25818 pmaddubsw m4, m1, m6
25820 pmaddubsw m5, m3, m6
25823 movu [r0 + 1245 * 16], m4
25826 movu m6, [r5 + 16 * 16]
25828 pinsrb m0, [r4 + 13], 1
25829 pinsrb m0, [r4 + 15], 0
25830 pmaddubsw m4, m0, m6
25833 pinsrw m2, [r3 + 0], 0
25834 pmaddubsw m5, m2, m6
25837 movu [r0 + 1246 * 16], m4
25839 pinsrw m1, [r3 + 8], 0
25840 pmaddubsw m4, m1, m6
25843 pinsrw m3, [r3 + 16], 0
25844 pmaddubsw m5, m3, m6
25847 movu [r0 + 1247 * 16], m4
25850 movu m6, [r5 + 31 * 16]
25852 pinsrb m0, [r4 + 15], 1
25853 pinsrb m0, [r4 + 17], 0
25854 pmaddubsw m4, m0, m6
25857 pinsrb m2, [r4 + 0], 1
25858 pinsrb m2, [r4 + 2], 0
25859 pmaddubsw m5, m2, m6
25862 movu [r0 + 1248 * 16], m4
25864 pinsrw m1, [r3 + 7], 0
25865 pmaddubsw m4, m1, m6
25868 pinsrw m3, [r3 + 15], 0
25869 pmaddubsw m5, m3, m6
25872 movu [r0 + 1249 * 16], m4
25875 movu m6, [r5 + 14 * 16]
25876 pmaddubsw m4, m0, m6
25878 pmaddubsw m5, m2, m6
25881 movu [r0 + 1250 * 16], m4
25882 pmaddubsw m4, m1, m6
25884 pmaddubsw m5, m3, m6
25887 movu [r0 + 1251 * 16], m4
25890 movu m6, [r5 + 29 * 16]
25892 pinsrb m0, [r4 + 17], 1
25893 pinsrb m0, [r4 + 19], 0
25894 pmaddubsw m4, m0, m6
25897 pinsrb m2, [r4 + 2], 1
25898 pinsrb m2, [r4 + 4], 0
25899 pmaddubsw m5, m2, m6
25902 movu [r0 + 1252 * 16], m4
25904 pinsrb m1, [r3 + 7], 1
25905 pinsrb m1, [r3 + 6], 0
25906 pmaddubsw m4, m1, m6
25909 pinsrb m3, [r3 + 15], 1
25910 pinsrb m3, [r3 + 14], 0
25911 pmaddubsw m5, m3, m6
25914 movu [r0 + 1253 * 16], m4
25917 movu m6, [r5 + 12 * 16]
25918 pmaddubsw m4, m0, m6
25920 pmaddubsw m5, m2, m6
25923 movu [r0 + 1254 * 16], m4
25924 pmaddubsw m4, m1, m6
25926 pmaddubsw m5, m3, m6
25929 movu [r0 + 1255 * 16], m4
25932 movu m6, [r5 + 27 * 16]
25934 pinsrb m0, [r4 + 19], 1
25935 pinsrb m0, [r4 + 21], 0
25936 pmaddubsw m4, m0, m6
25939 pinsrb m2, [r4 + 4], 1
25940 pinsrb m2, [r4 + 6], 0
25941 pmaddubsw m5, m2, m6
25944 movu [r0 + 1256 * 16], m4
25946 pinsrw m1, [r3 + 5], 0
25947 pmaddubsw m4, m1, m6
25950 pinsrw m3, [r3 + 13], 0
25951 pmaddubsw m5, m3, m6
25954 movu [r0 + 1257 * 16], m4
25957 movu m6, [r5 + 10 * 16]
25958 pmaddubsw m4, m0, m6
25960 pmaddubsw m5, m2, m6
25963 movu [r0 + 1258 * 16], m4
25964 pmaddubsw m4, m1, m6
25966 pmaddubsw m5, m3, m6
25969 movu [r0 + 1259 * 16], m4
25972 movu m6, [r5 + 25 * 16]
25974 pinsrb m0, [r4 + 21], 1
25975 pinsrb m0, [r4 + 23], 0
25976 pmaddubsw m4, m0, m6
25979 pinsrb m2, [r4 + 6], 1
25980 pinsrb m2, [r4 + 8], 0
25981 pmaddubsw m5, m2, m6
25984 movu [r0 + 1260 * 16], m4
25986 pinsrw m1, [r3 + 4], 0
25987 pmaddubsw m4, m1, m6
25990 pinsrw m3, [r3 + 12], 0
25991 pmaddubsw m5, m3, m6
25994 movu [r0 + 1261 * 16], m4
25997 movu m6, [r5 + 8 * 16]
25998 pmaddubsw m4, m0, m6
26000 pmaddubsw m5, m2, m6
26003 movu [r0 + 1262 * 16], m4
26004 pmaddubsw m4, m1, m6
26006 pmaddubsw m5, m3, m6
26009 movu [r0 + 1263 * 16], m4
26012 movu m6, [r5 + 23 * 16]
26014 pinsrb m0, [r4 + 23], 1
26015 pinsrb m0, [r4 + 24], 0
26016 pmaddubsw m4, m0, m6
26019 pinsrb m2, [r4 + 8], 1
26020 pinsrb m2, [r4 + 9], 0
26021 pmaddubsw m5, m2, m6
26024 movu [r0 + 1264 * 16], m4
26026 pinsrw m1, [r3 + 3], 0
26027 pmaddubsw m4, m1, m6
26030 pinsrw m3, [r3 + 11], 0
26031 pmaddubsw m5, m3, m6
26034 movu [r0 + 1265 * 16], m4
26037 movu m6, [r5 + 6 * 16]
26038 pmaddubsw m4, m0, m6
26040 pmaddubsw m5, m2, m6
26043 movu [r0 + 1266 * 16], m4
26044 pmaddubsw m4, m1, m6
26046 pmaddubsw m5, m3, m6
26049 movu [r0 + 1267 * 16], m4
26052 movu m6, [r5 + 21 * 16]
26054 pinsrb m0, [r4 + 24], 1
26055 pinsrb m0, [r4 + 26], 0
26056 pmaddubsw m4, m0, m6
26059 pinsrb m2, [r4 + 9], 1
26060 pinsrb m2, [r4 + 11], 0
26061 pmaddubsw m5, m2, m6
26064 movu [r0 + 1268 * 16], m4
26066 pinsrw m1, [r3 + 2], 0
26067 pmaddubsw m4, m1, m6
26070 pinsrw m3, [r3 + 10], 0
26071 pmaddubsw m5, m3, m6
26074 movu [r0 + 1269 * 16], m4
26077 movu m6, [r5 + 4 * 16]
26078 pmaddubsw m4, m0, m6
26080 pmaddubsw m5, m2, m6
26083 movu [r0 + 1270 * 16], m4
26084 pmaddubsw m4, m1, m6
26086 pmaddubsw m5, m3, m6
26089 movu [r0 + 1271 * 16], m4
26092 movu m6, [r5 + 19 * 16]
26094 pinsrb m0, [r4 + 26], 1
26095 pinsrb m0, [r4 + 28], 0
26096 pmaddubsw m4, m0, m6
26099 pinsrb m2, [r4 + 11], 1
26100 pinsrb m2, [r4 + 13], 0
26101 pmaddubsw m5, m2, m6
26104 movu [r0 + 1272 * 16], m4
26106 pinsrw m1, [r3 + 1], 0
26107 pmaddubsw m4, m1, m6
26110 pinsrw m3, [r3 + 9], 0
26111 pmaddubsw m5, m3, m6
26114 movu [r0 + 1273 * 16], m4
26117 movu m6, [r5 + 2 * 16]
26118 pmaddubsw m4, m0, m6
26120 pmaddubsw m5, m2, m6
26123 movu [r0 + 1274 * 16], m4
26124 pmaddubsw m4, m1, m6
26126 pmaddubsw m5, m3, m6
26129 movu [r0 + 1275 * 16], m4
26132 movu m6, [r5 + 17 * 16]
26134 pinsrb m0, [r4 + 28], 1
26135 pinsrb m0, [r4 + 30], 0
26136 pmaddubsw m4, m0, m6
26139 pinsrb m2, [r4 + 13], 1
26140 pinsrb m2, [r4 + 15], 0
26141 pmaddubsw m5, m2, m6
26144 movu [r0 + 1276 * 16], m4
26146 pinsrw m1, [r3 + 0], 0
26147 pmaddubsw m4, m1, m6
26150 pinsrw m3, [r3 + 8], 0
26151 pmaddubsw m5, m3, m6
26154 movu [r0 + 1277 * 16], m4
26157 pshufb m5, m0, [tab_S2]
26158 movh [r0 + 1278 * 16], m5
26159 pshufb m5, m2, [tab_S2]
26160 movh [r0 + 1278 * 16 + 8], m5
26161 pshufb m5, m1, [tab_S2]
26162 movh [r0 + 1279 * 16], m5
26163 pshufb m5, m3, [tab_S2]
26164 movh [r0 + 1279 * 16 + 8], m5
26167 movu m6, [r5 + 19 * 16]
26171 pmaddubsw m1, m0, m6
26176 pmaddubsw m3, m2, m6
26179 movu [r0 + 1280 * 16], m1
26184 pmaddubsw m4, m1, m6
26189 pmaddubsw m5, m3, m6
26192 movu [r0 + 1281 * 16], m4
26195 movu m6, [r5 + 6 * 16]
26196 pmaddubsw m4, m0, m6
26198 pmaddubsw m5, m2, m6
26201 movu [r0 + 1282 * 16], m4
26202 pmaddubsw m4, m1, m6
26204 pmaddubsw m5, m3, m6
26207 movu [r0 + 1283 * 16], m4
26210 movu m6, [r5 + 25 * 16]
26212 pinsrb m0, [r4 + 0], 1
26213 pinsrb m0, [r4 + 2], 0
26214 pmaddubsw m4, m0, m6
26217 pinsrw m2, [r3 + 7], 0
26218 pmaddubsw m5, m2, m6
26221 movu [r0 + 1284 * 16], m4
26223 pinsrw m1, [r3 + 15], 0
26224 pmaddubsw m4, m1, m6
26227 pinsrw m3, [r3 + 23], 0
26228 pmaddubsw m5, m3, m6
26231 movu [r0 + 1285 * 16], m4
26234 movu m6, [r5 + 12 * 16]
26235 pmaddubsw m4, m0, m6
26237 pmaddubsw m5, m2, m6
26240 movu [r0 + 1286 * 16], m4
26241 pmaddubsw m4, m1, m6
26243 pmaddubsw m5, m3, m6
26246 movu [r0 + 1287 * 16], m4
26249 movu m6, [r5 + 31 * 16]
26251 pinsrb m0, [r4 + 2], 1
26252 pinsrb m0, [r4 + 5], 0
26253 pmaddubsw m4, m0, m6
26256 pinsrw m2, [r3 + 6], 0
26257 pmaddubsw m5, m2, m6
26260 movu [r0 + 1288 * 16], m4
26262 pinsrw m1, [r3 + 14], 0
26263 pmaddubsw m4, m1, m6
26266 pinsrw m3, [r3 + 22], 0
26267 pmaddubsw m5, m3, m6
26270 movu [r0 + 1289 * 16], m4
26273 movu m6, [r5 + 18 * 16]
26274 pmaddubsw m4, m0, m6
26276 pmaddubsw m5, m2, m6
26279 movu [r0 + 1290 * 16], m4
26280 pmaddubsw m4, m1, m6
26282 pmaddubsw m5, m3, m6
26285 movu [r0 + 1291 * 16], m4
26288 movu m6, [r5 + 5 * 16]
26289 pmaddubsw m4, m0, m6
26291 pmaddubsw m5, m2, m6
26294 movu [r0 + 1292 * 16], m4
26295 pmaddubsw m4, m1, m6
26297 pmaddubsw m5, m3, m6
26300 movu [r0 + 1293 * 16], m4
26303 movu m6, [r5 + 24 * 16]
26305 pinsrb m0, [r4 + 5], 1
26306 pinsrb m0, [r4 + 7], 0
26307 pmaddubsw m4, m0, m6
26310 pinsrw m2, [r3 + 5], 0
26311 pmaddubsw m5, m2, m6
26314 movu [r0 + 1294 * 16], m4
26316 pinsrw m1, [r3 + 13], 0
26317 pmaddubsw m4, m1, m6
26320 pinsrw m3, [r3 + 21], 0
26321 pmaddubsw m5, m3, m6
26324 movu [r0 + 1295 * 16], m4
26327 movu m6, [r5 + 11 * 16]
26328 pmaddubsw m4, m0, m6
26330 pmaddubsw m5, m2, m6
26333 movu [r0 + 1296 * 16], m4
26334 pmaddubsw m4, m1, m6
26336 pmaddubsw m5, m3, m6
26339 movu [r0 + 1297 * 16], m4
26342 movu m6, [r5 + 30 * 16]
26344 pinsrb m0, [r4 + 7], 1
26345 pinsrb m0, [r4 + 10], 0
26346 pmaddubsw m4, m0, m6
26349 pinsrw m2, [r3 + 4], 0
26350 pmaddubsw m5, m2, m6
26353 movu [r0 + 1298 * 16], m4
26355 pinsrw m1, [r3 + 12], 0
26356 pmaddubsw m4, m1, m6
26359 pinsrw m3, [r3 + 20], 0
26360 pmaddubsw m5, m3, m6
26363 movu [r0 + 1299 * 16], m4
26366 movu m6, [r5 + 17 * 16]
26367 pmaddubsw m4, m0, m6
26369 pmaddubsw m5, m2, m6
26372 movu [r0 + 1300 * 16], m4
26373 pmaddubsw m4, m1, m6
26375 pmaddubsw m5, m3, m6
26378 movu [r0 + 1301 * 16], m4
26381 movu m6, [r5 + 4 * 16]
26382 pmaddubsw m4, m0, m6
26384 pmaddubsw m5, m2, m6
26387 movu [r0 + 1302 * 16], m4
26388 pmaddubsw m4, m1, m6
26390 pmaddubsw m5, m3, m6
26393 movu [r0 + 1303 * 16], m4
26396 movu m6, [r5 + 23 * 16]
26398 pinsrb m0, [r4 + 10], 1
26399 pinsrb m0, [r4 + 12], 0
26400 pmaddubsw m4, m0, m6
26403 pinsrw m2, [r3 + 3], 0
26404 pmaddubsw m5, m2, m6
26407 movu [r0 + 1304 * 16], m4
26409 pinsrw m1, [r3 + 11], 0
26410 pmaddubsw m4, m1, m6
26413 pinsrw m3, [r3 + 19], 0
26414 pmaddubsw m5, m3, m6
26417 movu [r0 + 1305 * 16], m4
26420 movu m6, [r5 + 10 * 16]
26421 pmaddubsw m4, m0, m6
26423 pmaddubsw m5, m2, m6
26426 movu [r0 + 1306 * 16], m4
26427 pmaddubsw m4, m1, m6
26429 pmaddubsw m5, m3, m6
26432 movu [r0 + 1307 * 16], m4
26435 movu m6, [r5 + 29 * 16]
26437 pinsrb m0, [r4 + 12], 1
26438 pinsrb m0, [r4 + 15], 0
26439 pmaddubsw m4, m0, m6
26442 pinsrw m2, [r3 + 2], 0
26443 pmaddubsw m5, m2, m6
26446 movu [r0 + 1308 * 16], m4
26448 pinsrw m1, [r3 + 10], 0
26449 pmaddubsw m4, m1, m6
26452 pinsrw m3, [r3 + 18], 0
26453 pmaddubsw m5, m3, m6
26456 movu [r0 + 1309 * 16], m4
26459 movu m6, [r5 + 16 * 16]
26460 pmaddubsw m4, m0, m6
26462 pmaddubsw m5, m2, m6
26465 movu [r0 + 1310 * 16], m4
26466 pmaddubsw m4, m1, m6
26468 pmaddubsw m5, m3, m6
26471 movu [r0 + 1311 * 16], m4
26474 movu m6, [r5 + 3 * 16]
26475 pmaddubsw m4, m0, m6
26477 pmaddubsw m5, m2, m6
26480 movu [r0 + 1312 * 16], m4
26481 pmaddubsw m4, m1, m6
26483 pmaddubsw m5, m3, m6
26486 movu [r0 + 1313 * 16], m4
26489 movu m6, [r5 + 22 * 16]
26491 pinsrb m0, [r4 + 15], 1
26492 pinsrb m0, [r4 + 17], 0
26493 pmaddubsw m4, m0, m6
26496 pinsrw m2, [r3 + 1], 0
26497 pmaddubsw m5, m2, m6
26500 movu [r0 + 1314 * 16], m4
26502 pinsrw m1, [r3 + 9], 0
26503 pmaddubsw m4, m1, m6
26506 pinsrw m3, [r3 + 17], 0
26507 pmaddubsw m5, m3, m6
26510 movu [r0 + 1315 * 16], m4
26513 movu m6, [r5 + 9 * 16]
26514 pmaddubsw m4, m0, m6
26516 pmaddubsw m5, m2, m6
26519 movu [r0 + 1316 * 16], m4
26520 pmaddubsw m4, m1, m6
26522 pmaddubsw m5, m3, m6
26525 movu [r0 + 1317 * 16], m4
26528 movu m6, [r5 + 28 * 16]
26530 pinsrb m0, [r4 + 17], 1
26531 pinsrb m0, [r4 + 20], 0
26532 pmaddubsw m4, m0, m6
26535 pinsrw m2, [r3 + 0], 0
26536 pmaddubsw m5, m2, m6
26539 movu [r0 + 1318 * 16], m4
26541 pinsrw m1, [r3 + 8], 0
26542 pmaddubsw m4, m1, m6
26545 pinsrw m3, [r3 + 16], 0
26546 pmaddubsw m5, m3, m6
26549 movu [r0 + 1319 * 16], m4
26552 movu m6, [r5 + 15 * 16]
26553 pmaddubsw m4, m0, m6
26555 pmaddubsw m5, m2, m6
26558 movu [r0 + 1320 * 16], m4
26559 pmaddubsw m4, m1, m6
26561 pmaddubsw m5, m3, m6
26564 movu [r0 + 1321 * 16], m4
26567 movu m6, [r5 + 2 * 16]
26568 pmaddubsw m4, m0, m6
26570 pmaddubsw m5, m2, m6
26573 movu [r0 + 1322 * 16], m4
26574 pmaddubsw m4, m1, m6
26576 pmaddubsw m5, m3, m6
26579 movu [r0 + 1323 * 16], m4
26582 movu m6, [r5 + 21 * 16]
26584 pinsrb m0, [r4 + 20], 1
26585 pinsrb m0, [r4 + 22], 0
26586 pmaddubsw m4, m0, m6
26589 pinsrb m2, [r4 + 0], 1
26590 pinsrb m2, [r4 + 2], 0
26591 pmaddubsw m5, m2, m6
26594 movu [r0 + 1324 * 16], m4
26596 pinsrw m1, [r3 + 7], 0
26597 pmaddubsw m4, m1, m6
26600 pinsrw m3, [r3 + 15], 0
26601 pmaddubsw m5, m3, m6
26604 movu [r0 + 1325 * 16], m4
26607 movu m6, [r5 + 8 * 16]
26608 pmaddubsw m4, m0, m6
26610 pmaddubsw m5, m2, m6
26613 movu [r0 + 1326 * 16], m4
26614 pmaddubsw m4, m1, m6
26616 pmaddubsw m5, m3, m6
26619 movu [r0 + 1327 * 16], m4
26622 movu m6, [r5 + 27 * 16]
26624 pinsrb m0, [r4 + 22], 1
26625 pinsrb m0, [r4 + 25], 0
26626 pmaddubsw m4, m0, m6
26629 pinsrb m2, [r4 + 2], 1
26630 pinsrb m2, [r4 + 5], 0
26631 pmaddubsw m5, m2, m6
26634 movu [r0 + 1328 * 16], m4
26636 pinsrw m1, [r3 + 6], 0
26637 pmaddubsw m4, m1, m6
26640 pinsrw m3, [r3 + 14], 0
26641 pmaddubsw m5, m3, m6
26644 movu [r0 + 1329 * 16], m4
26647 movu m6, [r5 + 14 * 16]
26648 pmaddubsw m4, m0, m6
26650 pmaddubsw m5, m2, m6
26653 movu [r0 + 1330 * 16], m4
26654 pmaddubsw m4, m1, m6
26656 pmaddubsw m5, m3, m6
26659 movu [r0 + 1331 * 16], m4
26662 movu m6, [r5 + 1 * 16]
26663 pmaddubsw m4, m0, m6
26665 pmaddubsw m5, m2, m6
26668 movu [r0 + 1332 * 16], m4
26669 pmaddubsw m4, m1, m6
26671 pmaddubsw m5, m3, m6
26674 movu [r0 + 1333 * 16], m4
26677 movu m6, [r5 + 20 * 16]
26679 pinsrb m0, [r4 + 25], 1
26680 pinsrb m0, [r4 + 27], 0
26681 pmaddubsw m4, m0, m6
26684 pinsrb m2, [r4 + 5], 1
26685 pinsrb m2, [r4 + 7], 0
26686 pmaddubsw m5, m2, m6
26689 movu [r0 + 1334 * 16], m4
26691 pinsrw m1, [r3 + 5], 0
26692 pmaddubsw m4, m1, m6
26695 pinsrw m3, [r3 + 13], 0
26696 pmaddubsw m5, m3, m6
26699 movu [r0 + 1335 * 16], m4
26702 movu m6, [r5 + 7 * 16]
26703 pmaddubsw m4, m0, m6
26705 pmaddubsw m5, m2, m6
26708 movu [r0 + 1336 * 16], m4
26709 pmaddubsw m4, m1, m6
26711 pmaddubsw m5, m3, m6
26714 movu [r0 + 1337 * 16], m4
26717 movu m6, [r5 + 26 * 16]
26719 pinsrb m0, [r4 + 27], 1
26720 pinsrb m0, [r4 + 30], 0
26721 pmaddubsw m4, m0, m6
26724 pinsrb m2, [r4 + 7], 1
26725 pinsrb m2, [r4 + 10], 0
26726 pmaddubsw m5, m2, m6
26729 movu [r0 + 1338 * 16], m4
26731 pinsrw m1, [r3 + 4], 0
26732 pmaddubsw m4, m1, m6
26735 pinsrw m3, [r3 + 12], 0
26736 pmaddubsw m5, m3, m6
26739 movu [r0 + 1339 * 16], m4
26742 movu m6, [r5 + 13 * 16]
26743 pmaddubsw m4, m0, m6
26745 pmaddubsw m5, m2, m6
26748 movu [r0 + 1340 * 16], m4
26749 pmaddubsw m4, m1, m6
26751 pmaddubsw m5, m3, m6
26754 movu [r0 + 1341 * 16], m4
26757 pshufb m5, m0, [tab_S2]
26758 movh [r0 + 1342 * 16], m5
26759 pshufb m5, m2, [tab_S2]
26760 movh [r0 + 1342 * 16 + 8], m5
26761 pshufb m5, m1, [tab_S2]
26762 movh [r0 + 1343 * 16], m5
26763 pshufb m5, m3, [tab_S2]
26764 movh [r0 + 1343 * 16 + 8], m5
26767 movu m6, [r5 + 23 * 16]
26771 pmaddubsw m1, m0, m6
26776 pmaddubsw m3, m2, m6
26779 movu [r0 + 1344 * 16], m1
26784 pmaddubsw m4, m1, m6
26789 pmaddubsw m5, m3, m6
26792 movu [r0 + 1345 * 16], m4
26795 movu m6, [r5 + 14 * 16]
26796 pmaddubsw m4, m0, m6
26798 pmaddubsw m5, m2, m6
26801 movu [r0 + 1346 * 16], m4
26802 pmaddubsw m4, m1, m6
26804 pmaddubsw m5, m3, m6
26807 movu [r0 + 1347 * 16], m4
26810 movu m6, [r5 + 5 * 16]
26811 pmaddubsw m4, m0, m6
26813 pmaddubsw m5, m2, m6
26816 movu [r0 + 1348 * 16], m4
26817 pmaddubsw m4, m1, m6
26819 pmaddubsw m5, m3, m6
26822 movu [r0 + 1349 * 16], m4
26825 movu m6, [r5 + 28 * 16]
26827 pinsrb m0, [r4 + 0], 1
26828 pinsrb m0, [r4 + 4], 0
26829 pmaddubsw m4, m0, m6
26832 pinsrw m2, [r3 + 7], 0
26833 pmaddubsw m5, m2, m6
26836 movu [r0 + 1350 * 16], m4
26838 pinsrw m1, [r3 + 15], 0
26839 pmaddubsw m4, m1, m6
26842 pinsrw m3, [r3 + 23], 0
26843 pmaddubsw m5, m3, m6
26846 movu [r0 + 1351 * 16], m4
26849 movu m6, [r5 + 19 * 16]
26850 pmaddubsw m4, m0, m6
26852 pmaddubsw m5, m2, m6
26855 movu [r0 + 1352 * 16], m4
26856 pmaddubsw m4, m1, m6
26858 pmaddubsw m5, m3, m6
26861 movu [r0 + 1353 * 16], m4
26864 movu m6, [r5 + 10 * 16]
26865 pmaddubsw m4, m0, m6
26867 pmaddubsw m5, m2, m6
26870 movu [r0 + 1354 * 16], m4
26871 pmaddubsw m4, m1, m6
26873 pmaddubsw m5, m3, m6
26876 movu [r0 + 1355 * 16], m4
26879 movu m6, [r5 + 1 * 16]
26880 pmaddubsw m4, m0, m6
26882 pmaddubsw m5, m2, m6
26885 movu [r0 + 1356 * 16], m4
26886 pmaddubsw m4, m1, m6
26888 pmaddubsw m5, m3, m6
26891 movu [r0 + 1357 * 16], m4
26894 movu m6, [r5 + 24 * 16]
26896 pinsrb m0, [r4 + 4], 1
26897 pinsrb m0, [r4 + 7], 0
26898 pmaddubsw m4, m0, m6
26901 pinsrw m2, [r3 + 6], 0
26902 pmaddubsw m5, m2, m6
26905 movu [r0 + 1358 * 16], m4
26907 pinsrw m1, [r3 + 14], 0
26908 pmaddubsw m4, m1, m6
26911 pinsrw m3, [r3 + 22], 0
26912 pmaddubsw m5, m3, m6
26915 movu [r0 + 1359 * 16], m4
26918 movu m6, [r5 + 15 * 16]
26919 pmaddubsw m4, m0, m6
26921 pmaddubsw m5, m2, m6
26924 movu [r0 + 1360 * 16], m4
26925 pmaddubsw m4, m1, m6
26927 pmaddubsw m5, m3, m6
26930 movu [r0 + 1361 * 16], m4
26933 movu m6, [r5 + 6 * 16]
26934 pmaddubsw m4, m0, m6
26936 pmaddubsw m5, m2, m6
26939 movu [r0 + 1362 * 16], m4
26940 pmaddubsw m4, m1, m6
26942 pmaddubsw m5, m3, m6
26945 movu [r0 + 1363 * 16], m4
26948 movu m6, [r5 + 29 * 16]
26950 pinsrb m0, [r4 + 7], 1
26951 pinsrb m0, [r4 + 11], 0
26952 pmaddubsw m4, m0, m6
26955 pinsrw m2, [r3 + 5], 0
26956 pmaddubsw m5, m2, m6
26959 movu [r0 + 1364 * 16], m4
26961 pinsrw m1, [r3 + 13], 0
26962 pmaddubsw m4, m1, m6
26965 pinsrw m3, [r3 + 21], 0
26966 pmaddubsw m5, m3, m6
26969 movu [r0 + 1365 * 16], m4
26972 movu m6, [r5 + 20 * 16]
26973 pmaddubsw m4, m0, m6
26975 pmaddubsw m5, m2, m6
26978 movu [r0 + 1366 * 16], m4
26979 pmaddubsw m4, m1, m6
26981 pmaddubsw m5, m3, m6
26984 movu [r0 + 1367 * 16], m4
26987 movu m6, [r5 + 11 * 16]
26988 pmaddubsw m4, m0, m6
26990 pmaddubsw m5, m2, m6
26993 movu [r0 + 1368 * 16], m4
26994 pmaddubsw m4, m1, m6
26996 pmaddubsw m5, m3, m6
26999 movu [r0 + 1369 * 16], m4
27002 movu m6, [r5 + 2 * 16]
27003 pmaddubsw m4, m0, m6
27005 pmaddubsw m5, m2, m6
27008 movu [r0 + 1370 * 16], m4
27009 pmaddubsw m4, m1, m6
27011 pmaddubsw m5, m3, m6
27014 movu [r0 + 1371 * 16], m4
27017 movu m6, [r5 + 25 * 16]
27019 pinsrb m0, [r4 + 11], 1
27020 pinsrb m0, [r4 + 14], 0
27021 pmaddubsw m4, m0, m6
27024 pinsrw m2, [r3 + 4], 0
27025 pmaddubsw m5, m2, m6
27028 movu [r0 + 1372 * 16], m4
27030 pinsrw m1, [r3 + 12], 0
27031 pmaddubsw m4, m1, m6
27034 pinsrw m3, [r3 + 20], 0
27035 pmaddubsw m5, m3, m6
27038 movu [r0 + 1373 * 16], m4
27041 movu m6, [r5 + 16 * 16]
27042 pmaddubsw m4, m0, m6
27044 pmaddubsw m5, m2, m6
27047 movu [r0 + 1374 * 16], m4
27048 pmaddubsw m4, m1, m6
27050 pmaddubsw m5, m3, m6
27053 movu [r0 + 1375 * 16], m4
27056 movu m6, [r5 + 7 * 16]
27057 pmaddubsw m4, m0, m6
27059 pmaddubsw m5, m2, m6
27062 movu [r0 + 1376 * 16], m4
27063 pmaddubsw m4, m1, m6
27065 pmaddubsw m5, m3, m6
27068 movu [r0 + 1377 * 16], m4
27071 movu m6, [r5 + 30 * 16]
27073 pinsrb m0, [r4 + 14], 1
27074 pinsrb m0, [r4 + 18], 0
27075 pmaddubsw m4, m0, m6
27078 pinsrw m2, [r3 + 3], 0
27079 pmaddubsw m5, m2, m6
27082 movu [r0 + 1378 * 16], m4
27084 pinsrw m1, [r3 + 11], 0
27085 pmaddubsw m4, m1, m6
27088 pinsrw m3, [r3 + 19], 0
27089 pmaddubsw m5, m3, m6
27092 movu [r0 + 1379 * 16], m4
27095 movu m6, [r5 + 21 * 16]
27096 pmaddubsw m4, m0, m6
27098 pmaddubsw m5, m2, m6
27101 movu [r0 + 1380 * 16], m4
27102 pmaddubsw m4, m1, m6
27104 pmaddubsw m5, m3, m6
27107 movu [r0 + 1381 * 16], m4
27110 movu m6, [r5 + 12 * 16]
27111 pmaddubsw m4, m0, m6
27113 pmaddubsw m5, m2, m6
27116 movu [r0 + 1382 * 16], m4
27117 pmaddubsw m4, m1, m6
27119 pmaddubsw m5, m3, m6
27122 movu [r0 + 1383 * 16], m4
27125 movu m6, [r5 + 3 * 16]
27126 pmaddubsw m4, m0, m6
27128 pmaddubsw m5, m2, m6
27131 movu [r0 + 1384 * 16], m4
27132 pmaddubsw m4, m1, m6
27134 pmaddubsw m5, m3, m6
27137 movu [r0 + 1385 * 16], m4
27140 movu m6, [r5 + 26 * 16]
27142 pinsrb m0, [r4 + 18], 1
27143 pinsrb m0, [r4 + 21], 0
27144 pmaddubsw m4, m0, m6
27147 pinsrw m2, [r3 + 2], 0
27148 pmaddubsw m5, m2, m6
27151 movu [r0 + 1386 * 16], m4
27153 pinsrw m1, [r3 + 10], 0
27154 pmaddubsw m4, m1, m6
27157 pinsrw m3, [r3 + 18], 0
27158 pmaddubsw m5, m3, m6
27161 movu [r0 + 1387 * 16], m4
27164 movu m6, [r5 + 17 * 16]
27165 pmaddubsw m4, m0, m6
27167 pmaddubsw m5, m2, m6
27170 movu [r0 + 1388 * 16], m4
27171 pmaddubsw m4, m1, m6
27173 pmaddubsw m5, m3, m6
27176 movu [r0 + 1389 * 16], m4
27179 movu m6, [r5 + 8 * 16]
27180 pmaddubsw m4, m0, m6
27182 pmaddubsw m5, m2, m6
27185 movu [r0 + 1390 * 16], m4
27186 pmaddubsw m4, m1, m6
27188 pmaddubsw m5, m3, m6
27191 movu [r0 + 1391 * 16], m4
27194 movu m6, [r5 + 31 * 16]
27196 pinsrb m0, [r4 + 21], 1
27197 pinsrb m0, [r4 + 25], 0
27198 pmaddubsw m4, m0, m6
27201 pinsrw m2, [r3 + 1], 0
27202 pmaddubsw m5, m2, m6
27205 movu [r0 + 1392 * 16], m4
27207 pinsrw m1, [r3 + 9], 0
27208 pmaddubsw m4, m1, m6
27211 pinsrw m3, [r3 + 17], 0
27212 pmaddubsw m5, m3, m6
27215 movu [r0 + 1393 * 16], m4
27218 movu m6, [r5 + 22 * 16]
27219 pmaddubsw m4, m0, m6
27221 pmaddubsw m5, m2, m6
27224 movu [r0 + 1394 * 16], m4
27225 pmaddubsw m4, m1, m6
27227 pmaddubsw m5, m3, m6
27230 movu [r0 + 1395 * 16], m4
27233 movu m6, [r5 + 13 * 16]
27234 pmaddubsw m4, m0, m6
27236 pmaddubsw m5, m2, m6
27239 movu [r0 + 1396 * 16], m4
27240 pmaddubsw m4, m1, m6
27242 pmaddubsw m5, m3, m6
27245 movu [r0 + 1397 * 16], m4
27248 movu m6, [r5 + 4 * 16]
27249 pmaddubsw m4, m0, m6
27251 pmaddubsw m5, m2, m6
27254 movu [r0 + 1398 * 16], m4
27255 pmaddubsw m4, m1, m6
27257 pmaddubsw m5, m3, m6
27260 movu [r0 + 1399 * 16], m4
27263 movu m6, [r5 + 27 * 16]
27265 pinsrb m0, [r4 + 25], 1
27266 pinsrb m0, [r4 + 28], 0
27267 pmaddubsw m4, m0, m6
27270 pinsrw m2, [r3 + 0], 0
27271 pmaddubsw m5, m2, m6
27274 movu [r0 + 1400 * 16], m4
27276 pinsrw m1, [r3 + 8], 0
27277 pmaddubsw m4, m1, m6
27280 pinsrw m3, [r3 + 16], 0
27281 pmaddubsw m5, m3, m6
27284 movu [r0 + 1401 * 16], m4
27287 movu m6, [r5 + 18 * 16]
27288 pmaddubsw m4, m0, m6
27290 pmaddubsw m5, m2, m6
27293 movu [r0 + 1402 * 16], m4
27294 pmaddubsw m4, m1, m6
27296 pmaddubsw m5, m3, m6
27299 movu [r0 + 1403 * 16], m4
27302 movu m6, [r5 + 9 * 16]
27303 pmaddubsw m4, m0, m6
27305 pmaddubsw m5, m2, m6
27308 movu [r0 + 1404 * 16], m4
27309 pmaddubsw m4, m1, m6
27311 pmaddubsw m5, m3, m6
27314 movu [r0 + 1405 * 16], m4
27317 pshufb m5, m0, [tab_S2]
27318 movh [r0 + 1406 * 16], m5
27319 pshufb m5, m2, [tab_S2]
27320 movh [r0 + 1406 * 16 + 8], m5
27321 pshufb m5, m1, [tab_S2]
27322 movh [r0 + 1407 * 16], m5
27323 pshufb m5, m3, [tab_S2]
27324 movh [r0 + 1407 * 16 + 8], m5
27327 movu m6, [r5 + 27 * 16]
27331 pmaddubsw m4, m0, m6
27336 pmaddubsw m5, m2, m6
27339 movu [r0 + 1408 * 16], m4
27344 pmaddubsw m4, m1, m6
27349 pmaddubsw m5, m3, m6
27352 movu [r0 + 1409 * 16], m4
27355 movu m6, [r5 + 22 * 16]
27356 pmaddubsw m4, m0, m6
27358 pmaddubsw m5, m2, m6
27361 movu [r0 + 1410 * 16], m4
27362 pmaddubsw m4, m1, m6
27364 pmaddubsw m5, m3, m6
27367 movu [r0 + 1411 * 16], m4
27370 movu m6, [r5 + 17 * 16]
27371 pmaddubsw m4, m0, m6
27373 pmaddubsw m5, m2, m6
27376 movu [r0 + 1412 * 16], m4
27377 pmaddubsw m4, m1, m6
27379 pmaddubsw m5, m3, m6
27382 movu [r0 + 1413 * 16], m4
27385 movu m6, [r5 + 12 * 16]
27386 pmaddubsw m4, m0, m6
27388 pmaddubsw m5, m2, m6
27391 movu [r0 + 1414 * 16], m4
27392 pmaddubsw m4, m1, m6
27394 pmaddubsw m5, m3, m6
27397 movu [r0 + 1415 * 16], m4
27400 movu m6, [r5 + 7 * 16]
27401 pmaddubsw m4, m0, m6
27403 pmaddubsw m5, m2, m6
27406 movu [r0 + 1416 * 16], m4
27407 pmaddubsw m4, m1, m6
27409 pmaddubsw m5, m3, m6
27412 movu [r0 + 1417 * 16], m4
27415 movu m6, [r5 + 2 * 16]
27416 pmaddubsw m4, m0, m6
27418 pmaddubsw m5, m2, m6
27421 movu [r0 + 1418 * 16], m4
27422 pmaddubsw m4, m1, m6
27424 pmaddubsw m5, m3, m6
27427 movu [r0 + 1419 * 16], m4
27430 movu m6, [r5 + 29 * 16]
27432 pinsrb m0, [r4 + 0], 1
27433 pinsrb m0, [r4 + 6], 0
27434 pmaddubsw m4, m0, m6
27437 pinsrw m2, [r3 + 7], 0
27438 pmaddubsw m5, m2, m6
27441 movu [r0 + 1420 * 16], m4
27443 pinsrw m1, [r3 + 15], 0
27444 pmaddubsw m4, m1, m6
27447 pinsrw m3, [r3 + 23], 0
27448 pmaddubsw m5, m3, m6
27451 movu [r0 + 1421 * 16], m4
27454 movu m6, [r5 + 24 * 16]
27455 pmaddubsw m4, m0, m6
27457 pmaddubsw m5, m2, m6
27460 movu [r0 + 1422 * 16], m4
27461 pmaddubsw m4, m1, m6
27463 pmaddubsw m5, m3, m6
27466 movu [r0 + 1423 * 16], m4
27469 movu m6, [r5 + 19 * 16]
27470 pmaddubsw m4, m0, m6
27472 pmaddubsw m5, m2, m6
27475 movu [r0 + 1424 * 16], m4
27476 pmaddubsw m4, m1, m6
27478 pmaddubsw m5, m3, m6
27481 movu [r0 + 1425 * 16], m4
27484 movu m6, [r5 + 14 * 16]
27485 pmaddubsw m4, m0, m6
27487 pmaddubsw m5, m2, m6
27490 movu [r0 + 1426 * 16], m4
27491 pmaddubsw m4, m1, m6
27493 pmaddubsw m5, m3, m6
27496 movu [r0 + 1427 * 16], m4
27499 movu m6, [r5 + 9 * 16]
27500 pmaddubsw m4, m0, m6
27502 pmaddubsw m5, m2, m6
27505 movu [r0 + 1428 * 16], m4
27506 pmaddubsw m4, m1, m6
27508 pmaddubsw m5, m3, m6
27511 movu [r0 + 1429 * 16], m4
27514 movu m6, [r5 + 4 * 16]
27515 pmaddubsw m4, m0, m6
27517 pmaddubsw m5, m2, m6
27520 movu [r0 + 1430 * 16], m4
27521 pmaddubsw m4, m1, m6
27523 pmaddubsw m5, m3, m6
27526 movu [r0 + 1431 * 16], m4
27529 movu m6, [r5 + 31 * 16]
27531 pinsrb m0, [r4 + 6], 1
27532 pinsrb m0, [r4 + 13], 0
27533 pmaddubsw m4, m0, m6
27536 pinsrw m2, [r3 + 6], 0
27537 pmaddubsw m5, m2, m6
27540 movu [r0 + 1432 * 16], m4
27542 pinsrw m1, [r3 + 14], 0
27543 pmaddubsw m4, m1, m6
27546 pinsrw m3, [r3 + 22], 0
27547 pmaddubsw m5, m3, m6
27550 movu [r0 + 1433 * 16], m4
27553 movu m6, [r5 + 26 * 16]
27554 pmaddubsw m4, m0, m6
27556 pmaddubsw m5, m2, m6
27559 movu [r0 + 1434 * 16], m4
27560 pmaddubsw m4, m1, m6
27562 pmaddubsw m5, m3, m6
27565 movu [r0 + 1435 * 16], m4
27568 movu m6, [r5 + 21 * 16]
27569 pmaddubsw m4, m0, m6
27571 pmaddubsw m5, m2, m6
27574 movu [r0 + 1436 * 16], m4
27575 pmaddubsw m4, m1, m6
27577 pmaddubsw m5, m3, m6
27580 movu [r0 + 1437 * 16], m4
27583 movu m6, [r5 + 16 * 16]
27584 pmaddubsw m4, m0, m6
27586 pmaddubsw m5, m2, m6
27589 movu [r0 + 1438 * 16], m4
27590 pmaddubsw m4, m1, m6
27592 pmaddubsw m5, m3, m6
27595 movu [r0 + 1439 * 16], m4
27598 movu m6, [r5 + 11 * 16]
27599 pmaddubsw m4, m0, m6
27601 pmaddubsw m5, m2, m6
27604 movu [r0 + 1440 * 16], m4
27605 pmaddubsw m4, m1, m6
27607 pmaddubsw m5, m3, m6
27610 movu [r0 + 1441 * 16], m4
27613 movu m6, [r5 + 6 * 16]
27614 pmaddubsw m4, m0, m6
27616 pmaddubsw m5, m2, m6
27619 movu [r0 + 1442 * 16], m4
27620 pmaddubsw m4, m1, m6
27622 pmaddubsw m5, m3, m6
27625 movu [r0 + 1443 * 16], m4
27628 movu m6, [r5 + 1 * 16]
27629 pmaddubsw m4, m0, m6
27631 pmaddubsw m5, m2, m6
27634 movu [r0 + 1444 * 16], m4
27635 pmaddubsw m4, m1, m6
27637 pmaddubsw m5, m3, m6
27640 movu [r0 + 1445 * 16], m4
27643 movu m6, [r5 + 28 * 16]
27645 pinsrb m0, [r4 + 13], 1
27646 pinsrb m0, [r4 + 19], 0
27647 pmaddubsw m4, m0, m6
27650 pinsrw m2, [r3 + 5], 0
27651 pmaddubsw m5, m2, m6
27654 movu [r0 + 1446 * 16], m4
27656 pinsrw m1, [r3 + 13], 0
27657 pmaddubsw m4, m1, m6
27660 pinsrw m3, [r3 + 21], 0
27661 pmaddubsw m5, m3, m6
27664 movu [r0 + 1447 * 16], m4
27667 movu m6, [r5 + 23 * 16]
27668 pmaddubsw m4, m0, m6
27670 pmaddubsw m5, m2, m6
27673 movu [r0 + 1448 * 16], m4
27674 pmaddubsw m4, m1, m6
27676 pmaddubsw m5, m3, m6
27679 movu [r0 + 1449 * 16], m4
27682 movu m6, [r5 + 18 * 16]
27683 pmaddubsw m4, m0, m6
27685 pmaddubsw m5, m2, m6
27688 movu [r0 + 1450 * 16], m4
27689 pmaddubsw m4, m1, m6
27691 pmaddubsw m5, m3, m6
27694 movu [r0 + 1451 * 16], m4
27697 movu m6, [r5 + 13 * 16]
27698 pmaddubsw m4, m0, m6
27700 pmaddubsw m5, m2, m6
27703 movu [r0 + 1452 * 16], m4
27704 pmaddubsw m4, m1, m6
27706 pmaddubsw m5, m3, m6
27709 movu [r0 + 1453 * 16], m4
27712 movu m6, [r5 + 8 * 16]
27713 pmaddubsw m4, m0, m6
27715 pmaddubsw m5, m2, m6
27718 movu [r0 + 1454 * 16], m4
27719 pmaddubsw m4, m1, m6
27721 pmaddubsw m5, m3, m6
27724 movu [r0 + 1455 * 16], m4
27727 movu m6, [r5 + 3 * 16]
27728 pmaddubsw m4, m0, m6
27730 pmaddubsw m5, m2, m6
27733 movu [r0 + 1456 * 16], m4
27734 pmaddubsw m4, m1, m6
27736 pmaddubsw m5, m3, m6
27739 movu [r0 + 1457 * 16], m4
27742 movu m6, [r5 + 30 * 16]
27744 pinsrb m0, [r4 + 19], 1
27745 pinsrb m0, [r4 + 26], 0
27746 pmaddubsw m4, m0, m6
27749 pinsrw m2, [r3 + 4], 0
27750 pmaddubsw m5, m2, m6
27753 movu [r0 + 1458 * 16], m4
27755 pinsrw m1, [r3 + 12], 0
27756 pmaddubsw m4, m1, m6
27759 pinsrw m3, [r3 + 20], 0
27760 pmaddubsw m5, m3, m6
27763 movu [r0 + 1459 * 16], m4
27766 movu m6, [r5 + 25 * 16]
27767 pmaddubsw m4, m0, m6
27769 pmaddubsw m5, m2, m6
27772 movu [r0 + 1460 * 16], m4
27773 pmaddubsw m4, m1, m6
27775 pmaddubsw m5, m3, m6
27778 movu [r0 + 1461 * 16], m4
27781 movu m6, [r5 + 20 * 16]
27782 pmaddubsw m4, m0, m6
27784 pmaddubsw m5, m2, m6
27787 movu [r0 + 1462 * 16], m4
27788 pmaddubsw m4, m1, m6
27790 pmaddubsw m5, m3, m6
27793 movu [r0 + 1463 * 16], m4
27796 movu m6, [r5 + 15 * 16]
27797 pmaddubsw m4, m0, m6
27799 pmaddubsw m5, m2, m6
27802 movu [r0 + 1464 * 16], m4
27803 pmaddubsw m4, m1, m6
27805 pmaddubsw m5, m3, m6
27808 movu [r0 + 1465 * 16], m4
27811 movu m6, [r5 + 10 * 16]
27812 pmaddubsw m4, m0, m6
27814 pmaddubsw m5, m2, m6
27817 movu [r0 + 1466 * 16], m4
27818 pmaddubsw m4, m1, m6
27820 pmaddubsw m5, m3, m6
27823 movu [r0 + 1467 * 16], m4
27826 movu m6, [r5 + 5 * 16]
27827 pmaddubsw m4, m0, m6
27829 pmaddubsw m5, m2, m6
27832 movu [r0 + 1468 * 16], m4
27833 pmaddubsw m4, m1, m6
27835 pmaddubsw m5, m3, m6
27838 movu [r0 + 1469 * 16], m4
27841 pshufb m5, m0, [tab_S2]
27842 movh [r0 + 1470 * 16], m5
27843 pshufb m5, m2, [tab_S2]
27844 movh [r0 + 1470 * 16 + 8], m5
27845 pshufb m5, m1, [tab_S2]
27846 movh [r0 + 1471 * 16], m5
27847 pshufb m5, m3, [tab_S2]
27848 movh [r0 + 1471 * 16 + 8], m5
27851 movu m6, [r5 + 30 * 16]
27855 pmaddubsw m4, m0, m6
27860 pmaddubsw m5, m2, m6
27863 movu [r0 + 1472 * 16], m4
27868 pmaddubsw m4, m1, m6
27873 pmaddubsw m5, m3, m6
27876 movu [r0 + 1473 * 16], m4
27879 movu m6, [r5 + 28 * 16]
27880 pmaddubsw m4, m0, m6
27882 pmaddubsw m5, m2, m6
27885 movu [r0 + 1474 * 16], m4
27886 pmaddubsw m4, m1, m6
27888 pmaddubsw m5, m3, m6
27891 movu [r0 + 1475 * 16], m4
27894 movu m6, [r5 + 26 * 16]
27895 pmaddubsw m4, m0, m6
27897 pmaddubsw m5, m2, m6
27900 movu [r0 + 1476 * 16], m4
27901 pmaddubsw m4, m1, m6
27903 pmaddubsw m5, m3, m6
27906 movu [r0 + 1477 * 16], m4
27909 movu m6, [r5 + 24 * 16]
27910 pmaddubsw m4, m0, m6
27912 pmaddubsw m5, m2, m6
27915 movu [r0 + 1478 * 16], m4
27916 pmaddubsw m4, m1, m6
27918 pmaddubsw m5, m3, m6
27921 movu [r0 + 1479 * 16], m4
27924 movu m6, [r5 + 22 * 16]
27925 pmaddubsw m4, m0, m6
27927 pmaddubsw m5, m2, m6
27930 movu [r0 + 1480 * 16], m4
27931 pmaddubsw m4, m1, m6
27933 pmaddubsw m5, m3, m6
27936 movu [r0 + 1481 * 16], m4
27939 movu m6, [r5 + 20 * 16]
27940 pmaddubsw m4, m0, m6
27942 pmaddubsw m5, m2, m6
27945 movu [r0 + 1482 * 16], m4
27946 pmaddubsw m4, m1, m6
27948 pmaddubsw m5, m3, m6
27951 movu [r0 + 1483 * 16], m4
27954 movu m6, [r5 + 18 * 16]
27955 pmaddubsw m4, m0, m6
27957 pmaddubsw m5, m2, m6
27960 movu [r0 + 1484 * 16], m4
27961 pmaddubsw m4, m1, m6
27963 pmaddubsw m5, m3, m6
27966 movu [r0 + 1485 * 16], m4
27969 movu m6, [r5 + 16 * 16]
27970 pmaddubsw m4, m0, m6
27972 pmaddubsw m5, m2, m6
27975 movu [r0 + 1486 * 16], m4
27976 pmaddubsw m4, m1, m6
27978 pmaddubsw m5, m3, m6
27981 movu [r0 + 1487 * 16], m4
27984 movu m6, [r5 + 14 * 16]
27985 pmaddubsw m4, m0, m6
27987 pmaddubsw m5, m2, m6
27990 movu [r0 + 1488 * 16], m4
27991 pmaddubsw m4, m1, m6
27993 pmaddubsw m5, m3, m6
27996 movu [r0 + 1489 * 16], m4
27999 movu m6, [r5 + 12 * 16]
28000 pmaddubsw m4, m0, m6
28002 pmaddubsw m5, m2, m6
28005 movu [r0 + 1490 * 16], m4
28006 pmaddubsw m4, m1, m6
28008 pmaddubsw m5, m3, m6
28011 movu [r0 + 1491 * 16], m4
28014 movu m6, [r5 + 10 * 16]
28015 pmaddubsw m4, m0, m6
28017 pmaddubsw m5, m2, m6
28020 movu [r0 + 1492 * 16], m4
28021 pmaddubsw m4, m1, m6
28023 pmaddubsw m5, m3, m6
28026 movu [r0 + 1493 * 16], m4
28029 movu m6, [r5 + 8 * 16]
28030 pmaddubsw m4, m0, m6
28032 pmaddubsw m5, m2, m6
28035 movu [r0 + 1494 * 16], m4
28036 pmaddubsw m4, m1, m6
28038 pmaddubsw m5, m3, m6
28041 movu [r0 + 1495 * 16], m4
28044 movu m6, [r5 + 6 * 16]
28045 pmaddubsw m4, m0, m6
28047 pmaddubsw m5, m2, m6
28050 movu [r0 + 1496 * 16], m4
28051 pmaddubsw m4, m1, m6
28053 pmaddubsw m5, m3, m6
28056 movu [r0 + 1497 * 16], m4
28059 movu m6, [r5 + 4 * 16]
28060 pmaddubsw m4, m0, m6
28062 pmaddubsw m5, m2, m6
28065 movu [r0 + 1498 * 16], m4
28066 pmaddubsw m4, m1, m6
28068 pmaddubsw m5, m3, m6
28071 movu [r0 + 1499 * 16], m4
28074 movu m6, [r5 + 2 * 16]
28075 pmaddubsw m4, m0, m6
28077 pmaddubsw m5, m2, m6
28080 movu [r0 + 1500 * 16], m4
28081 pmaddubsw m4, m1, m6
28083 pmaddubsw m5, m3, m6
28086 movu [r0 + 1501 * 16], m4
28089 pshufb m5, m0, [tab_S2]
28090 movh [r0 + 1502 * 16], m5
28091 pshufb m5, m2, [tab_S2]
28092 movh [r0 + 1502 * 16 + 8], m5
28093 pshufb m5, m1, [tab_S2]
28094 movh [r0 + 1503 * 16], m5
28095 pshufb m5, m3, [tab_S2]
28096 movh [r0 + 1503 * 16 + 8], m5
28099 movu m6, [r5 + 30 * 16]
28101 pinsrb m0, [r4 + 0], 1
28102 pinsrb m0, [r4 + 16], 0
28103 pmaddubsw m4, m0, m6
28106 pinsrw m2, [r3 + 7], 0
28107 pmaddubsw m5, m2, m6
28110 movu [r0 + 1504 * 16], m4
28112 pinsrw m1, [r3 + 15], 0
28113 pmaddubsw m4, m1, m6
28116 pinsrw m3, [r3 + 23], 0
28117 pmaddubsw m5, m3, m6
28120 movu [r0 + 1505 * 16], m4
28123 movu m6, [r5 + 28 * 16]
28124 pmaddubsw m4, m0, m6
28126 pmaddubsw m5, m2, m6
28129 movu [r0 + 1506 * 16], m4
28130 pmaddubsw m4, m1, m6
28132 pmaddubsw m5, m3, m6
28135 movu [r0 + 1507 * 16], m4
28138 movu m6, [r5 + 26 * 16]
28139 pmaddubsw m4, m0, m6
28141 pmaddubsw m5, m2, m6
28144 movu [r0 + 1508 * 16], m4
28145 pmaddubsw m4, m1, m6
28147 pmaddubsw m5, m3, m6
28150 movu [r0 + 1509 * 16], m4
28153 movu m6, [r5 + 24 * 16]
28154 pmaddubsw m4, m0, m6
28156 pmaddubsw m5, m2, m6
28159 movu [r0 + 1510 * 16], m4
28160 pmaddubsw m4, m1, m6
28162 pmaddubsw m5, m3, m6
28165 movu [r0 + 1511 * 16], m4
28168 movu m6, [r5 + 22 * 16]
28169 pmaddubsw m4, m0, m6
28171 pmaddubsw m5, m2, m6
28174 movu [r0 + 1512 * 16], m4
28175 pmaddubsw m4, m1, m6
28177 pmaddubsw m5, m3, m6
28180 movu [r0 + 1513 * 16], m4
28183 movu m6, [r5 + 20 * 16]
28184 pmaddubsw m4, m0, m6
28186 pmaddubsw m5, m2, m6
28189 movu [r0 + 1514 * 16], m4
28190 pmaddubsw m4, m1, m6
28192 pmaddubsw m5, m3, m6
28195 movu [r0 + 1515 * 16], m4
28198 movu m6, [r5 + 18 * 16]
28199 pmaddubsw m4, m0, m6
28201 pmaddubsw m5, m2, m6
28204 movu [r0 + 1516 * 16], m4
28205 pmaddubsw m4, m1, m6
28207 pmaddubsw m5, m3, m6
28210 movu [r0 + 1517 * 16], m4
28213 movu m6, [r5 + 16 * 16]
28214 pmaddubsw m4, m0, m6
28216 pmaddubsw m5, m2, m6
28219 movu [r0 + 1518 * 16], m4
28220 pmaddubsw m4, m1, m6
28222 pmaddubsw m5, m3, m6
28225 movu [r0 + 1519 * 16], m4
28228 movu m6, [r5 + 14 * 16]
28229 pmaddubsw m4, m0, m6
28231 pmaddubsw m5, m2, m6
28234 movu [r0 + 1520 * 16], m4
28235 pmaddubsw m4, m1, m6
28237 pmaddubsw m5, m3, m6
28240 movu [r0 + 1521 * 16], m4
28243 movu m6, [r5 + 12 * 16]
28244 pmaddubsw m4, m0, m6
28246 pmaddubsw m5, m2, m6
28249 movu [r0 + 1522 * 16], m4
28250 pmaddubsw m4, m1, m6
28252 pmaddubsw m5, m3, m6
28255 movu [r0 + 1523 * 16], m4
28258 movu m6, [r5 + 10 * 16]
28259 pmaddubsw m4, m0, m6
28261 pmaddubsw m5, m2, m6
28264 movu [r0 + 1524 * 16], m4
28265 pmaddubsw m4, m1, m6
28267 pmaddubsw m5, m3, m6
28270 movu [r0 + 1525 * 16], m4
28273 movu m6, [r5 + 8 * 16]
28274 pmaddubsw m4, m0, m6
28276 pmaddubsw m5, m2, m6
28279 movu [r0 + 1526 * 16], m4
28280 pmaddubsw m4, m1, m6
28282 pmaddubsw m5, m3, m6
28285 movu [r0 + 1527 * 16], m4
28288 movu m6, [r5 + 6 * 16]
28289 pmaddubsw m4, m0, m6
28291 pmaddubsw m5, m2, m6
28294 movu [r0 + 1528 * 16], m4
28295 pmaddubsw m4, m1, m6
28297 pmaddubsw m5, m3, m6
28300 movu [r0 + 1529 * 16], m4
28303 movu m6, [r5 + 4 * 16]
28304 pmaddubsw m4, m0, m6
28306 pmaddubsw m5, m2, m6
28309 movu [r0 + 1530 * 16], m4
28310 pmaddubsw m4, m1, m6
28312 pmaddubsw m5, m3, m6
28315 movu [r0 + 1531 * 16], m4
28318 movu m6, [r5 + 2 * 16]
28319 pmaddubsw m4, m0, m6
28321 pmaddubsw m5, m2, m6
28324 movu [r0 + 1532 * 16], m4
28325 pmaddubsw m4, m1, m6
28327 pmaddubsw m5, m3, m6
28330 movu [r0 + 1533 * 16], m4
28333 pshufb m5, m0, [tab_S2]
28334 movh [r0 + 1534 * 16], m5
28335 pshufb m5, m2, [tab_S2]
28336 movh [r0 + 1534 * 16 + 8], m5
28337 pshufb m5, m1, [tab_S2]
28338 movh [r0 + 1535 * 16], m5
28339 pshufb m5, m3, [tab_S2]
28340 movh [r0 + 1535 * 16 + 8], m5
28345 movu [r0 + 1536 * 16], m1
28346 movu [r0 + 1537 * 16], m2
28347 movu [r0 + 1538 * 16], m1
28348 movu [r0 + 1539 * 16], m2
28349 movu [r0 + 1540 * 16], m1
28350 movu [r0 + 1541 * 16], m2
28351 movu [r0 + 1542 * 16], m1
28352 movu [r0 + 1543 * 16], m2
28353 movu [r0 + 1544 * 16], m1
28354 movu [r0 + 1545 * 16], m2
28355 movu [r0 + 1546 * 16], m1
28356 movu [r0 + 1547 * 16], m2
28357 movu [r0 + 1548 * 16], m1
28358 movu [r0 + 1549 * 16], m2
28359 movu [r0 + 1550 * 16], m1
28360 movu [r0 + 1551 * 16], m2
28362 movu [r0 + 1552 * 16], m1
28363 movu [r0 + 1553 * 16], m2
28364 movu [r0 + 1554 * 16], m1
28365 movu [r0 + 1555 * 16], m2
28366 movu [r0 + 1556 * 16], m1
28367 movu [r0 + 1557 * 16], m2
28368 movu [r0 + 1558 * 16], m1
28369 movu [r0 + 1559 * 16], m2
28370 movu [r0 + 1560 * 16], m1
28371 movu [r0 + 1561 * 16], m2
28372 movu [r0 + 1562 * 16], m1
28373 movu [r0 + 1563 * 16], m2
28374 movu [r0 + 1564 * 16], m1
28375 movu [r0 + 1565 * 16], m2
28376 movu [r0 + 1566 * 16], m1
28377 movu [r0 + 1567 * 16], m2
28379 movu [r0 + 1568 * 16], m1
28380 movu [r0 + 1569 * 16], m2
28381 movu [r0 + 1570 * 16], m1
28382 movu [r0 + 1571 * 16], m2
28383 movu [r0 + 1572 * 16], m1
28384 movu [r0 + 1573 * 16], m2
28385 movu [r0 + 1574 * 16], m1
28386 movu [r0 + 1575 * 16], m2
28387 movu [r0 + 1576 * 16], m1
28388 movu [r0 + 1577 * 16], m2
28389 movu [r0 + 1578 * 16], m1
28390 movu [r0 + 1579 * 16], m2
28391 movu [r0 + 1580 * 16], m1
28392 movu [r0 + 1581 * 16], m2
28393 movu [r0 + 1582 * 16], m1
28394 movu [r0 + 1583 * 16], m2
28396 movu [r0 + 1584 * 16], m1
28397 movu [r0 + 1585 * 16], m2
28398 movu [r0 + 1586 * 16], m1
28399 movu [r0 + 1587 * 16], m2
28400 movu [r0 + 1588 * 16], m1
28401 movu [r0 + 1589 * 16], m2
28402 movu [r0 + 1590 * 16], m1
28403 movu [r0 + 1591 * 16], m2
28404 movu [r0 + 1592 * 16], m1
28405 movu [r0 + 1593 * 16], m2
28406 movu [r0 + 1594 * 16], m1
28407 movu [r0 + 1595 * 16], m2
28408 movu [r0 + 1596 * 16], m1
28409 movu [r0 + 1597 * 16], m2
28410 movu [r0 + 1598 * 16], m1
28411 movu [r0 + 1599 * 16], m2
28414 movu m6, [r5 + 2 * 16]
28418 pmaddubsw m4, m0, m6
28423 pmaddubsw m5, m2, m6
28426 movu [r0 + 1600 * 16], m4
28431 pmaddubsw m4, m1, m6
28436 pmaddubsw m5, m3, m6
28439 movu [r0 + 1601 * 16], m4
28442 movu m6, [r5 + 4 * 16]
28443 pmaddubsw m4, m0, m6
28445 pmaddubsw m5, m2, m6
28448 movu [r0 + 1602 * 16], m4
28449 pmaddubsw m4, m1, m6
28451 pmaddubsw m5, m3, m6
28454 movu [r0 + 1603 * 16], m4
28457 movu m6, [r5 + 6 * 16]
28458 pmaddubsw m4, m0, m6
28460 pmaddubsw m5, m2, m6
28463 movu [r0 + 1604 * 16], m4
28464 pmaddubsw m4, m1, m6
28466 pmaddubsw m5, m3, m6
28469 movu [r0 + 1605 * 16], m4
28472 movu m6, [r5 + 8 * 16]
28473 pmaddubsw m4, m0, m6
28475 pmaddubsw m5, m2, m6
28478 movu [r0 + 1606 * 16], m4
28479 pmaddubsw m4, m1, m6
28481 pmaddubsw m5, m3, m6
28484 movu [r0 + 1607 * 16], m4
28487 movu m6, [r5 + 10 * 16]
28488 pmaddubsw m4, m0, m6
28490 pmaddubsw m5, m2, m6
28493 movu [r0 + 1608 * 16], m4
28495 ; mode 28 [row 1 - first half]
28496 movu [r0 + 1666 * 16], m4
28498 pmaddubsw m4, m1, m6
28500 pmaddubsw m5, m3, m6
28503 movu [r0 + 1609 * 16], m4
28505 ; mode 28 [row 1 - second half]
28506 movu [r0 + 1667 * 16], m4
28509 movu m6, [r5 + 12 * 16]
28510 pmaddubsw m4, m0, m6
28512 pmaddubsw m5, m2, m6
28515 movu [r0 + 1610 * 16], m4
28517 pmaddubsw m4, m1, m6
28519 pmaddubsw m5, m3, m6
28522 movu [r0 + 1611 * 16], m4
28525 movu m6, [r5 + 14 * 16]
28526 pmaddubsw m4, m0, m6
28528 pmaddubsw m5, m2, m6
28531 movu [r0 + 1612 * 16], m4
28532 pmaddubsw m4, m1, m6
28534 pmaddubsw m5, m3, m6
28537 movu [r0 + 1613 * 16], m4
28540 movu m6, [r5 + 16 * 16]
28541 pmaddubsw m4, m0, m6
28543 pmaddubsw m5, m2, m6
28546 movu [r0 + 1614 * 16], m4
28547 pmaddubsw m4, m1, m6
28549 pmaddubsw m5, m3, m6
28552 movu [r0 + 1615 * 16], m4
28555 movu m6, [r5 + 18 * 16]
28556 pmaddubsw m4, m0, m6
28558 pmaddubsw m5, m2, m6
28561 movu [r0 + 1616 * 16], m4
28563 ; mode 29 [row 1 - first half]
28564 movu [r0 + 1730 * 16], m4
28566 pmaddubsw m4, m1, m6
28568 pmaddubsw m5, m3, m6
28571 movu [r0 + 1617 * 16], m4
28573 ; mode 29 [row 1 - second half]
28574 movu [r0 + 1731 * 16], m4
28577 movu m6, [r5 + 20 * 16]
28578 pmaddubsw m4, m0, m6
28580 pmaddubsw m5, m2, m6
28583 movu [r0 + 1618 * 16], m4
28585 ; mode 28 [row 3 - first half]
28586 movu [r0 + 1670 * 16], m4
28588 pmaddubsw m4, m1, m6
28590 pmaddubsw m5, m3, m6
28593 movu [r0 + 1619 * 16], m4
28595 ; mode 28 [row 3 - second half]
28596 movu [r0 + 1671 * 16], m4
28599 movu m6, [r5 + 22 * 16]
28600 pmaddubsw m4, m0, m6
28602 pmaddubsw m5, m2, m6
28605 movu [r0 + 1620 * 16], m4
28606 pmaddubsw m4, m1, m6
28608 pmaddubsw m5, m3, m6
28611 movu [r0 + 1621 * 16], m4
28614 movu m6, [r5 + 24 * 16]
28615 pmaddubsw m4, m0, m6
28617 pmaddubsw m5, m2, m6
28620 movu [r0 + 1622 * 16], m4
28621 pmaddubsw m4, m1, m6
28623 pmaddubsw m5, m3, m6
28626 movu [r0 + 1623 * 16], m4
28629 movu m6, [r5 + 26 * 16]
28630 pmaddubsw m4, m0, m6
28632 pmaddubsw m5, m2, m6
28635 movu [r0 + 1624 * 16], m4
28637 ; mode 30 [row 1 - first half]
28638 movu [r0 + 1794 * 16], m4
28640 ; mode 33 [row 0 - first half]
28641 movu [r0 + 1984 * 16], m4
28643 pmaddubsw m4, m1, m6
28645 pmaddubsw m5, m3, m6
28648 movu [r0 + 1625 * 16], m4
28650 ; mode 30 [row 1 - second half]
28651 movu [r0 + 1795 * 16], m4
28653 ; mode 33 [row 0 - second half]
28654 movu [r0 + 1985 * 16], m4
28657 movu m6, [r5 + 28 * 16]
28658 pmaddubsw m4, m0, m6
28660 pmaddubsw m5, m2, m6
28663 movu [r0 + 1626 * 16], m4
28664 pmaddubsw m4, m1, m6
28666 pmaddubsw m5, m3, m6
28669 movu [r0 + 1627 * 16], m4
28672 movu m6, [r5 + 30 * 16]
28673 pmaddubsw m4, m0, m6
28675 pmaddubsw m5, m2, m6
28678 movu [r0 + 1628 * 16], m4
28680 ; mode 28 [row 5 - first half]
28681 movu [r0 + 1674 * 16], m4
28683 pmaddubsw m4, m1, m6
28685 pmaddubsw m5, m3, m6
28688 movu [r0 + 1629 * 16], m4
28690 ; mode 28 [row 5 - second half]
28691 movu [r0 + 1675 * 16], m4
28694 movu m6, [r5 + 5 * 16]
28695 pmaddubsw m4, m0, m6
28697 pmaddubsw m5, m2, m6
28700 movu [r0 + 1664 * 16], m4
28701 pmaddubsw m4, m1, m6
28703 pmaddubsw m5, m3, m6
28706 movu [r0 + 1665 * 16], m4
28709 movu m6, [r5 + 15 * 16]
28710 pmaddubsw m4, m0, m6
28712 pmaddubsw m5, m2, m6
28715 movu [r0 + 1668 * 16], m4
28716 pmaddubsw m4, m1, m6
28718 pmaddubsw m5, m3, m6
28721 movu [r0 + 1669 * 16], m4
28724 movu m6, [r5 + 25 * 16]
28725 pmaddubsw m4, m0, m6
28727 pmaddubsw m5, m2, m6
28730 movu [r0 + 1672 * 16], m4
28731 pmaddubsw m4, m1, m6
28733 pmaddubsw m5, m3, m6
28736 movu [r0 + 1673 * 16], m4
28739 movu m6, [r5 + 13 * 16]
28740 pmaddubsw m4, m0, m6
28742 pmaddubsw m5, m2, m6
28745 movu [r0 + 1792 * 16], m4
28746 pmaddubsw m4, m1, m6
28748 pmaddubsw m5, m3, m6
28751 movu [r0 + 1793 * 16], m4
28754 movu m6, [r5 + 9 * 16]
28755 pmaddubsw m4, m0, m6
28757 pmaddubsw m5, m2, m6
28760 movu [r0 + 1728 * 16], m4
28761 pmaddubsw m4, m1, m6
28763 pmaddubsw m5, m3, m6
28766 movu [r0 + 1729 * 16], m4
28769 movu m6, [r5 + 27 * 16]
28770 pmaddubsw m4, m0, m6
28772 pmaddubsw m5, m2, m6
28775 movu [r0 + 1732 * 16], m4
28776 pmaddubsw m4, m1, m6
28778 pmaddubsw m5, m3, m6
28781 movu [r0 + 1733 * 16], m4
28784 movu m6, [r5 + 17 * 16]
28785 pmaddubsw m4, m0, m6
28787 pmaddubsw m5, m2, m6
28790 movu [r0 + 1856 * 16], m4
28791 pmaddubsw m4, m1, m6
28793 pmaddubsw m5, m3, m6
28796 movu [r0 + 1857 * 16], m4
28799 movu m6, [r5 + 21 * 16]
28800 pmaddubsw m4, m0, m6
28802 pmaddubsw m5, m2, m6
28805 movu [r0 + 1920 * 16], m4
28806 pmaddubsw m4, m1, m6
28808 pmaddubsw m5, m3, m6
28811 movu [r0 + 1921 * 16], m4
28831 pshufb m5, m0, [tab_S2]
28832 movh [r0 + 1630 * 16], m5
28833 pshufb m5, m2, [tab_S2]
28834 movh [r0 + 1630 * 16 + 8], m5
28835 pshufb m5, m1, [tab_S2]
28836 movh [r0 + 1631 * 16], m5
28837 pshufb m5, m4, [tab_S2]
28838 movh [r0 + 1631 * 16 + 8], m5
28841 movu m6, [r5 + 2 * 16]
28842 pmaddubsw m3, m0, m6
28844 pmaddubsw m5, m2, m6
28847 movu [r0 + 1632 * 16], m3
28849 ; mode 31 [row 1 - first half]
28850 movu [r0 + 1858 * 16], m3
28852 pmaddubsw m3, m1, m6
28854 pmaddubsw m5, m4, m6
28857 movu [r0 + 1633 * 16], m3
28859 ; mode 31 [row 1 - second half]
28860 movu [r0 + 1859 * 16], m3
28863 movu m6, [r5 + 4 * 16]
28864 pmaddubsw m3, m0, m6
28866 pmaddubsw m5, m2, m6
28869 movu [r0 + 1634 * 16], m3
28871 ; mode 29 [row 3 - first half]
28872 movu [r0 + 1734 * 16], m3
28874 pmaddubsw m3, m1, m6
28876 pmaddubsw m5, m4, m6
28879 movu [r0 + 1635 * 16], m3
28881 ; mode 29 [row 3 - second half]
28882 movu [r0 + 1735 * 16], m3
28885 movu m6, [r5 + 6 * 16]
28886 pmaddubsw m3, m0, m6
28888 pmaddubsw m5, m2, m6
28891 movu [r0 + 1636 * 16], m3
28892 pmaddubsw m3, m1, m6
28894 pmaddubsw m5, m4, m6
28897 movu [r0 + 1637 * 16], m3
28900 movu m6, [r5 + 8 * 16]
28901 pmaddubsw m3, m0, m6
28903 pmaddubsw m5, m2, m6
28906 movu [r0 + 1638 * 16], m3
28908 ; mode 28 [row 7 - first half]
28909 movu [r0 + 1678 * 16], m3
28911 pmaddubsw m3, m1, m6
28913 pmaddubsw m5, m4, m6
28916 movu [r0 + 1639 * 16], m3
28918 ; mode 28 [row 7 - second half]
28919 movu [r0 + 1679 * 16], m3
28922 movu m6, [r5 + 10 * 16]
28923 pmaddubsw m3, m0, m6
28925 pmaddubsw m5, m2, m6
28928 movu [r0 + 1640 * 16], m3
28930 ; mode 32 [row 1 - first half]
28931 movu [r0 + 1922 * 16], m3
28933 pmaddubsw m3, m1, m6
28935 pmaddubsw m5, m4, m6
28938 movu [r0 + 1641 * 16], m3
28940 ; mode 32 [row 1 - second half]
28941 movu [r0 + 1923 * 16], m3
28944 movu m6, [r5 + 12 * 16]
28945 pmaddubsw m3, m0, m6
28947 pmaddubsw m5, m2, m6
28950 movu [r0 + 1642 * 16], m3
28951 pmaddubsw m3, m1, m6
28953 pmaddubsw m5, m4, m6
28956 movu [r0 + 1643 * 16], m3
28959 movu m6, [r5 + 14 * 16]
28960 pmaddubsw m3, m0, m6
28962 pmaddubsw m5, m2, m6
28965 movu [r0 + 1644 * 16], m3
28966 pmaddubsw m3, m1, m6
28968 pmaddubsw m5, m4, m6
28971 movu [r0 + 1645 * 16], m3
28974 movu m6, [r5 + 16 * 16]
28975 pmaddubsw m3, m0, m6
28977 pmaddubsw m5, m2, m6
28980 movu [r0 + 1646 * 16], m3
28981 pmaddubsw m3, m1, m6
28983 pmaddubsw m5, m4, m6
28986 movu [r0 + 1647 * 16], m3
28989 movu m6, [r5 + 18 * 16]
28990 pmaddubsw m3, m0, m6
28992 pmaddubsw m5, m2, m6
28995 movu [r0 + 1648 * 16], m3
28997 ; mode 28 [row 9 - first half]
28998 movu [r0 + 1682 * 16], m3
29000 pmaddubsw m3, m1, m6
29002 pmaddubsw m5, m4, m6
29005 movu [r0 + 1649 * 16], m3
29007 ; mode 28 [row 9 - second half]
29008 movu [r0 + 1683 * 16], m3
29011 movu m6, [r5 + 20 * 16]
29012 pmaddubsw m3, m0, m6
29014 pmaddubsw m5, m2, m6
29017 movu [r0 + 1650 * 16], m3
29019 ; mode 30 [row 3 - first half]
29020 movu [r0 + 1798 * 16], m3
29022 ; mode 33 [row 1 - first half]
29023 movu [r0 + 1986 * 16], m3
29025 pmaddubsw m3, m1, m6
29027 pmaddubsw m5, m4, m6
29030 movu [r0 + 1651 * 16], m3
29032 ; mode 30 [row 3 - second half]
29033 movu [r0 + 1799 * 16], m3
29035 ; mode 33 [row 1 - second half]
29036 movu [r0 + 1987 * 16], m3
29039 movu m6, [r5 + 22 * 16]
29040 pmaddubsw m3, m0, m6
29042 pmaddubsw m5, m2, m6
29045 movu [r0 + 1652 * 16], m3
29047 ; mode 29 [row 5 - first half]
29048 movu [r0 + 1738 * 16], m3
29050 pmaddubsw m3, m1, m6
29052 pmaddubsw m5, m4, m6
29055 movu [r0 + 1653 * 16], m3
29057 ; mode 29 [row 5 - second half]
29058 movu [r0 + 1739 * 16], m3
29061 movu m6, [r5 + 24 * 16]
29062 pmaddubsw m3, m0, m6
29064 pmaddubsw m5, m2, m6
29067 movu [r0 + 1654 * 16], m3
29068 pmaddubsw m3, m1, m6
29070 pmaddubsw m5, m4, m6
29073 movu [r0 + 1655 * 16], m3
29076 movu m6, [r5 + 26 * 16]
29077 pmaddubsw m3, m0, m6
29079 pmaddubsw m5, m2, m6
29082 movu [r0 + 1656 * 16], m3
29083 pmaddubsw m3, m1, m6
29085 pmaddubsw m5, m4, m6
29088 movu [r0 + 1657 * 16], m3
29091 movu m6, [r5 + 28 * 16]
29092 pmaddubsw m3, m0, m6
29094 pmaddubsw m5, m2, m6
29097 movu [r0 + 1658 * 16], m3
29099 ; mode 28 [row 11 - first half]
29100 movu [r0 + 1686 * 16], m3
29102 pmaddubsw m3, m1, m6
29104 pmaddubsw m5, m4, m6
29107 movu [r0 + 1659 * 16], m3
29109 ; mode 28 [row 11 - second half]
29110 movu [r0 + 1687 * 16], m3
29113 movu m6, [r5 + 30 * 16]
29114 pmaddubsw m3, m0, m6
29116 pmaddubsw m5, m2, m6
29119 movu [r0 + 1660 * 16], m3
29120 pmaddubsw m3, m1, m6
29122 pmaddubsw m5, m4, m6
29125 movu [r0 + 1661 * 16], m3
29128 movu m6, [r5 + 3 * 16]
29129 pmaddubsw m3, m0, m6
29131 pmaddubsw m5, m2, m6
29134 movu [r0 + 1676 * 16], m3
29135 pmaddubsw m3, m1, m6
29137 pmaddubsw m5, m4, m6
29140 movu [r0 + 1677 * 16], m3
29143 movu m6, [r5 + 13 * 16]
29144 pmaddubsw m3, m0, m6
29146 pmaddubsw m5, m2, m6
29149 movu [r0 + 1680 * 16], m3
29151 ; mode 29 [row 4 - first half]
29152 movu [r0 + 1736 * 16], m3
29154 pmaddubsw m3, m1, m6
29156 pmaddubsw m5, m4, m6
29159 movu [r0 + 1681 * 16], m3
29161 ; mode 29 [row 4 - second half]
29162 movu [r0 + 1737 * 16], m3
29165 movu m6, [r5 + 23 * 16]
29166 pmaddubsw m3, m0, m6
29168 pmaddubsw m5, m2, m6
29171 movu [r0 + 1684 * 16], m3
29172 pmaddubsw m3, m1, m6
29174 pmaddubsw m5, m4, m6
29177 movu [r0 + 1685 * 16], m3
29180 movu m6, [r5 + 31 * 16]
29181 pmaddubsw m3, m0, m6
29183 pmaddubsw m5, m2, m6
29186 movu [r0 + 1740 * 16], m3
29188 ; mode 32 [row 2 - first half]
29189 movu [r0 + 1924 * 16], m3
29191 pmaddubsw m3, m1, m6
29193 pmaddubsw m5, m4, m6
29196 movu [r0 + 1741 * 16], m3
29198 ; mode 32 [row 2 - second half]
29199 movu [r0 + 1925 * 16], m3
29202 movu m6, [r5 + 7 * 16]
29203 pmaddubsw m3, m0, m6
29205 pmaddubsw m5, m2, m6
29208 movu [r0 + 1796 * 16], m3
29209 pmaddubsw m3, m1, m6
29211 pmaddubsw m5, m4, m6
29214 movu [r0 + 1797 * 16], m3
29217 movu m6, [r5 + 19 * 16]
29218 pmaddubsw m3, m0, m6
29220 pmaddubsw m5, m2, m6
29223 movu [r0 + 1860 * 16], m3
29224 pmaddubsw m3, m1, m6
29226 pmaddubsw m5, m4, m6
29229 movu [r0 + 1861 * 16], m3
29249 pshufb m5, m0, [tab_S2]
29250 movh [r0 + 1662 * 16], m5
29251 pshufb m5, m2, [tab_S2]
29252 movh [r0 + 1662 * 16 + 8], m5
29253 pshufb m5, m1, [tab_S2]
29254 movh [r0 + 1663 * 16], m5
29255 pshufb m5, m4, [tab_S2]
29256 movh [r0 + 1663 * 16 + 8], m5
29259 movu m6, [r5 + 1 * 16]
29260 pmaddubsw m3, m0, m6
29262 pmaddubsw m5, m2, m6
29265 movu [r0 + 1688 * 16], m3
29267 ; mode 30 [row 4 - first half]
29268 movu [r0 + 1800 * 16], m3
29270 pmaddubsw m3, m1, m6
29272 pmaddubsw m5, m4, m6
29275 movu [r0 + 1689 * 16], m3
29277 ; mode 30 [row 4 - second half]
29278 movu [r0 + 1801 * 16], m3
29281 movu m6, [r5 + 6 * 16]
29282 pmaddubsw m3, m0, m6
29284 pmaddubsw m5, m2, m6
29287 movu [r0 + 1690 * 16], m3
29288 pmaddubsw m3, m1, m6
29290 pmaddubsw m5, m4, m6
29293 movu [r0 + 1691 * 16], m3
29296 movu m6, [r5 + 11 * 16]
29297 pmaddubsw m3, m0, m6
29299 pmaddubsw m5, m2, m6
29302 movu [r0 + 1692 * 16], m3
29303 pmaddubsw m3, m1, m6
29305 pmaddubsw m5, m4, m6
29308 movu [r0 + 1693 * 16], m3
29311 movu m6, [r5 + 16 * 16]
29312 pmaddubsw m3, m0, m6
29314 pmaddubsw m5, m2, m6
29317 movu [r0 + 1694 * 16], m3
29318 pmaddubsw m3, m1, m6
29320 pmaddubsw m5, m4, m6
29323 movu [r0 + 1695 * 16], m3
29326 movu m6, [r5 + 21 * 16]
29327 pmaddubsw m3, m0, m6
29329 pmaddubsw m5, m2, m6
29332 movu [r0 + 1696 * 16], m3
29334 ; mode 31 [row 4 - first half]
29335 movu [r0 + 1864 * 16], m3
29337 pmaddubsw m3, m1, m6
29339 pmaddubsw m5, m4, m6
29342 movu [r0 + 1697 * 16], m3
29344 ; mode 31 [row 4 - second half]
29345 movu [r0 + 1865 * 16], m3
29348 movu m6, [r5 + 26 * 16]
29349 pmaddubsw m3, m0, m6
29351 pmaddubsw m5, m2, m6
29354 movu [r0 + 1698 * 16], m3
29356 ; mode 29 [row 9 - first half]
29357 movu [r0 + 1746 * 16], m3
29359 pmaddubsw m3, m1, m6
29361 pmaddubsw m5, m4, m6
29364 movu [r0 + 1699 * 16], m3
29366 ; mode 29 [row 9 - second half]
29367 movu [r0 + 1747 * 16], m3
29370 movu m6, [r5 + 31 * 16]
29371 pmaddubsw m3, m0, m6
29373 pmaddubsw m5, m2, m6
29376 movu [r0 + 1700 * 16], m3
29377 pmaddubsw m3, m1, m6
29379 pmaddubsw m5, m4, m6
29382 movu [r0 + 1701 * 16], m3
29385 movu m6, [r5 + 8 * 16]
29386 pmaddubsw m3, m0, m6
29388 pmaddubsw m5, m2, m6
29391 movu [r0 + 1742 * 16], m3
29392 pmaddubsw m3, m1, m6
29394 pmaddubsw m5, m4, m6
29397 movu [r0 + 1743 * 16], m3
29400 movu m6, [r5 + 17 * 16]
29401 pmaddubsw m3, m0, m6
29403 pmaddubsw m5, m2, m6
29406 movu [r0 + 1744 * 16], m3
29407 pmaddubsw m3, m1, m6
29409 pmaddubsw m5, m4, m6
29412 movu [r0 + 1745 * 16], m3
29415 movu m6, [r5 + 14 * 16]
29416 pmaddubsw m3, m0, m6
29418 pmaddubsw m5, m2, m6
29421 movu [r0 + 1802 * 16], m3
29423 ; mode 33 [row 2 - first half]
29424 movu [r0 + 1988 * 16], m3
29426 pmaddubsw m3, m1, m6
29428 pmaddubsw m5, m4, m6
29431 movu [r0 + 1803 * 16], m3
29433 ; mode 33 [row 2 - second half]
29434 movu [r0 + 1989 * 16], m3
29437 movu m6, [r5 + 27 * 16]
29438 pmaddubsw m3, m0, m6
29440 pmaddubsw m5, m2, m6
29443 movu [r0 + 1804 * 16], m3
29444 pmaddubsw m3, m1, m6
29446 pmaddubsw m5, m4, m6
29449 movu [r0 + 1805 * 16], m3
29452 movu m6, [r5 + 4 * 16]
29453 pmaddubsw m3, m0, m6
29455 pmaddubsw m5, m2, m6
29458 movu [r0 + 1862 * 16], m3
29459 pmaddubsw m3, m1, m6
29461 pmaddubsw m5, m4, m6
29464 movu [r0 + 1863 * 16], m3
29467 movu m6, [r5 + 20 * 16]
29468 pmaddubsw m3, m0, m6
29470 pmaddubsw m5, m2, m6
29473 movu [r0 + 1926 * 16], m3
29474 pmaddubsw m3, m1, m6
29476 pmaddubsw m5, m4, m6
29479 movu [r0 + 1927 * 16], m3
29482 movu m6, [r5 + 4 * 16]
29487 pmaddubsw m3, m0, m6
29493 pmaddubsw m5, m2, m6
29496 movu [r0 + 1702 * 16], m3
29502 pmaddubsw m3, m1, m6
29508 pmaddubsw m5, m4, m6
29511 movu [r0 + 1703 * 16], m3
29514 movu m6, [r5 + 9 * 16]
29515 pmaddubsw m3, m0, m6
29517 pmaddubsw m5, m2, m6
29520 movu [r0 + 1704 * 16], m3
29522 ; mode 32 [row 4 - first half]
29523 movu [r0 + 1928 * 16], m3
29525 pmaddubsw m3, m1, m6
29527 pmaddubsw m5, m4, m6
29530 movu [r0 + 1705 * 16], m3
29532 ; mode 32 [row 4 - second half]
29533 movu [r0 + 1929 * 16], m3
29536 movu m6, [r5 + 14 * 16]
29537 pmaddubsw m3, m0, m6
29539 pmaddubsw m5, m2, m6
29542 movu [r0 + 1706 * 16], m3
29543 pmaddubsw m3, m1, m6
29545 pmaddubsw m5, m4, m6
29548 movu [r0 + 1707 * 16], m3
29551 movu m6, [r5 + 19 * 16]
29552 pmaddubsw m3, m0, m6
29554 pmaddubsw m5, m2, m6
29557 movu [r0 + 1708 * 16], m3
29558 pmaddubsw m3, m1, m6
29560 pmaddubsw m5, m4, m6
29563 movu [r0 + 1709 * 16], m3
29566 movu m6, [r5 + 24 * 16]
29567 pmaddubsw m3, m0, m6
29569 pmaddubsw m5, m2, m6
29572 movu [r0 + 1710 * 16], m3
29573 pmaddubsw m3, m1, m6
29575 pmaddubsw m5, m4, m6
29578 movu [r0 + 1711 * 16], m3
29581 movu m6, [r5 + 29 * 16]
29582 pmaddubsw m3, m0, m6
29584 pmaddubsw m5, m2, m6
29587 movu [r0 + 1712 * 16], m3
29588 pmaddubsw m3, m1, m6
29590 pmaddubsw m5, m4, m6
29593 movu [r0 + 1713 * 16], m3
29596 movu m6, [r5 + 3 * 16]
29597 pmaddubsw m3, m0, m6
29599 pmaddubsw m5, m2, m6
29602 movu [r0 + 1748 * 16], m3
29603 pmaddubsw m3, m1, m6
29605 pmaddubsw m5, m4, m6
29608 movu [r0 + 1749 * 16], m3
29611 movu m6, [r5 + 12 * 16]
29612 pmaddubsw m3, m0, m6
29614 pmaddubsw m5, m2, m6
29617 movu [r0 + 1750 * 16], m3
29618 pmaddubsw m3, m1, m6
29620 pmaddubsw m5, m4, m6
29623 movu [r0 + 1751 * 16], m3
29626 movu m6, [r5 + 21 * 16]
29627 pmaddubsw m3, m0, m6
29629 pmaddubsw m5, m2, m6
29632 movu [r0 + 1752 * 16], m3
29634 ; mode 30 [row 8 - first half]
29635 movu [r0 + 1808 * 16], m3
29637 pmaddubsw m3, m1, m6
29639 pmaddubsw m5, m4, m6
29642 movu [r0 + 1753 * 16], m3
29644 ; mode 30 [row 8 - second half]
29645 movu [r0 + 1809 * 16], m3
29648 movu m6, [r5 + 30 * 16]
29649 pmaddubsw m3, m0, m6
29651 pmaddubsw m5, m2, m6
29654 movu [r0 + 1754 * 16], m3
29656 ; mode 32 [row 5 - first half]
29657 movu [r0 + 1930 * 16], m3
29659 pmaddubsw m3, m1, m6
29661 pmaddubsw m5, m4, m6
29664 movu [r0 + 1755 * 16], m3
29666 ; mode 32 [row 5 - second half]
29667 movu [r0 + 1931 * 16], m3
29670 movu m6, [r5 + 8 * 16]
29671 pmaddubsw m3, m0, m6
29673 pmaddubsw m5, m2, m6
29676 movu [r0 + 1806 * 16], m3
29678 ; mode 33 [row 3 - first half]
29679 movu [r0 + 1990 * 16], m3
29681 pmaddubsw m3, m1, m6
29683 pmaddubsw m5, m4, m6
29686 movu [r0 + 1807 * 16], m3
29688 ; mode 33 [row 3 - second half]
29689 movu [r0 + 1991 * 16], m3
29692 movu m6, [r5 + 6 * 16]
29693 pmaddubsw m3, m0, m6
29695 pmaddubsw m5, m2, m6
29698 movu [r0 + 1866 * 16], m3
29699 pmaddubsw m3, m1, m6
29701 pmaddubsw m5, m4, m6
29704 movu [r0 + 1867 * 16], m3
29707 movu m6, [r5 + 23 * 16]
29708 pmaddubsw m3, m0, m6
29710 pmaddubsw m5, m2, m6
29713 movu [r0 + 1868 * 16], m3
29714 pmaddubsw m3, m1, m6
29716 pmaddubsw m5, m4, m6
29719 movu [r0 + 1869 * 16], m3
29722 movu m6, [r5 + 2 * 16]
29727 pmaddubsw m3, m0, m6
29733 pmaddubsw m5, m2, m6
29736 movu [r0 + 1714 * 16], m3
29742 pmaddubsw m3, m1, m6
29748 pmaddubsw m5, m4, m6
29751 movu [r0 + 1715 * 16], m3
29754 movu m6, [r5 + 7 * 16]
29755 pmaddubsw m3, m0, m6
29757 pmaddubsw m5, m2, m6
29760 movu [r0 + 1716 * 16], m3
29762 ; mode 29 [row 14 - first half]
29763 movu [r0 + 1756 * 16], m3
29765 pmaddubsw m3, m1, m6
29767 pmaddubsw m5, m4, m6
29770 movu [r0 + 1717 * 16], m3
29772 ; mode 29 [row 14 - second half]
29773 movu [r0 + 1757 * 16], m3
29776 movu m6, [r5 + 12 * 16]
29777 pmaddubsw m3, m0, m6
29779 pmaddubsw m5, m2, m6
29782 movu [r0 + 1718 * 16], m3
29783 pmaddubsw m3, m1, m6
29785 pmaddubsw m5, m4, m6
29788 movu [r0 + 1719 * 16], m3
29791 movu m6, [r5 + 17 * 16]
29792 pmaddubsw m3, m0, m6
29794 pmaddubsw m5, m2, m6
29797 movu [r0 + 1720 * 16], m3
29798 pmaddubsw m3, m1, m6
29800 pmaddubsw m5, m4, m6
29803 movu [r0 + 1721 * 16], m3
29806 movu m6, [r5 + 22 * 16]
29807 pmaddubsw m3, m0, m6
29809 pmaddubsw m5, m2, m6
29812 movu [r0 + 1722 * 16], m3
29813 pmaddubsw m3, m1, m6
29815 pmaddubsw m5, m4, m6
29818 movu [r0 + 1723 * 16], m3
29821 movu m6, [r5 + 27 * 16]
29822 pmaddubsw m3, m0, m6
29824 pmaddubsw m5, m2, m6
29827 movu [r0 + 1724 * 16], m3
29828 pmaddubsw m3, m1, m6
29830 pmaddubsw m5, m4, m6
29833 movu [r0 + 1725 * 16], m3
29836 movu m6, [r5 + 16 * 16]
29837 pmaddubsw m3, m0, m6
29839 pmaddubsw m5, m2, m6
29842 movu [r0 + 1758 * 16], m3
29843 pmaddubsw m3, m1, m6
29845 pmaddubsw m5, m4, m6
29848 movu [r0 + 1759 * 16], m3
29851 movu m6, [r5 + 25 * 16]
29852 pmaddubsw m3, m0, m6
29854 pmaddubsw m5, m2, m6
29857 movu [r0 + 1760 * 16], m3
29858 pmaddubsw m3, m1, m6
29860 pmaddubsw m5, m4, m6
29863 movu [r0 + 1761 * 16], m3
29866 movu m6, [r5 + 2 * 16]
29867 pmaddubsw m3, m0, m6
29869 pmaddubsw m5, m2, m6
29872 movu [r0 + 1810 * 16], m3
29874 ; mode 33 [row 4 - first half]
29875 movu [r0 + 1992 * 16], m3
29877 pmaddubsw m3, m1, m6
29879 pmaddubsw m5, m4, m6
29882 movu [r0 + 1811 * 16], m3
29884 ; mode 33 [row 4 - second half]
29885 movu [r0 + 1993 * 16], m3
29888 movu m6, [r5 + 15 * 16]
29889 pmaddubsw m3, m0, m6
29891 pmaddubsw m5, m2, m6
29894 movu [r0 + 1812 * 16], m3
29895 pmaddubsw m3, m1, m6
29897 pmaddubsw m5, m4, m6
29900 movu [r0 + 1813 * 16], m3
29903 movu m6, [r5 + 8 * 16]
29904 pmaddubsw m3, m0, m6
29906 pmaddubsw m5, m2, m6
29909 movu [r0 + 1870 * 16], m3
29910 pmaddubsw m3, m1, m6
29912 pmaddubsw m5, m4, m6
29915 movu [r0 + 1871 * 16], m3
29918 movu m6, [r5 + 25 * 16]
29919 pmaddubsw m3, m0, m6
29921 pmaddubsw m5, m2, m6
29924 movu [r0 + 1872 * 16], m3
29925 pmaddubsw m3, m1, m6
29927 pmaddubsw m5, m4, m6
29930 movu [r0 + 1873 * 16], m3
29933 movu m6, [r5 + 19 * 16]
29934 pmaddubsw m3, m0, m6
29936 pmaddubsw m5, m2, m6
29939 movu [r0 + 1932 * 16], m3
29940 pmaddubsw m3, m1, m6
29942 pmaddubsw m5, m4, m6
29945 movu [r0 + 1933 * 16], m3
29948 movu m6, [r5 + 28 * 16]
29949 pmaddubsw m3, m0, m6
29951 pmaddubsw m5, m2, m6
29954 movu [r0 + 1814 * 16], m3
29956 ; mode 33 [row 5 - first half]
29957 movu [r0 + 1994 * 16], m3
29959 pmaddubsw m3, m1, m6
29961 pmaddubsw m5, m4, m6
29964 movu [r0 + 1815 * 16], m3
29966 ; mode 33 [row 5 - second half]
29967 movu [r0 + 1995 * 16], m3
29987 pshufb m5, m0, [tab_S2]
29988 movh [r0 + 1726 * 16], m5
29989 pshufb m5, m2, [tab_S2]
29990 movh [r0 + 1726 * 16 + 8], m5
29991 pshufb m5, m1, [tab_S2]
29992 movh [r0 + 1727 * 16], m5
29993 pshufb m5, m4, [tab_S2]
29994 movh [r0 + 1727 * 16 + 8], m5
29997 movu m6, [r5 + 2 * 16]
29998 pmaddubsw m3, m0, m6
30000 pmaddubsw m5, m2, m6
30003 movu [r0 + 1762 * 16], m3
30004 pmaddubsw m3, m1, m6
30006 pmaddubsw m5, m4, m6
30009 movu [r0 + 1763 * 16], m3
30012 movu m6, [r5 + 11 * 16]
30013 pmaddubsw m3, m0, m6
30015 pmaddubsw m5, m2, m6
30018 movu [r0 + 1764 * 16], m3
30019 pmaddubsw m3, m1, m6
30021 pmaddubsw m5, m4, m6
30024 movu [r0 + 1765 * 16], m3
30027 movu m6, [r5 + 20 * 16]
30028 pmaddubsw m3, m0, m6
30030 pmaddubsw m5, m2, m6
30033 movu [r0 + 1766 * 16], m3
30034 pmaddubsw m3, m1, m6
30036 pmaddubsw m5, m4, m6
30039 movu [r0 + 1767 * 16], m3
30042 movu m6, [r5 + 29 * 16]
30043 pmaddubsw m3, m0, m6
30045 pmaddubsw m5, m2, m6
30048 movu [r0 + 1768 * 16], m3
30050 ; mode 32 [row 8 - first half]
30051 movu [r0 + 1936 * 16], m3
30053 pmaddubsw m3, m1, m6
30055 pmaddubsw m5, m4, m6
30058 movu [r0 + 1769 * 16], m3
30060 ; mode 32 [row 8 - second half]
30061 movu [r0 + 1937 * 16], m3
30064 movu m6, [r5 + 9 * 16]
30065 pmaddubsw m3, m0, m6
30067 pmaddubsw m5, m2, m6
30070 movu [r0 + 1816 * 16], m3
30071 pmaddubsw m3, m1, m6
30073 pmaddubsw m5, m4, m6
30076 movu [r0 + 1817 * 16], m3
30079 movu m6, [r5 + 22 * 16]
30080 pmaddubsw m3, m0, m6
30082 pmaddubsw m5, m2, m6
30085 movu [r0 + 1818 * 16], m3
30087 ; mode 33 [row 6 - first half]
30088 movu [r0 + 1996 * 16], m3
30090 pmaddubsw m3, m1, m6
30092 pmaddubsw m5, m4, m6
30095 movu [r0 + 1819 * 16], m3
30097 ; mode 33 [row 6 - second half]
30098 movu [r0 + 1997 * 16], m3
30101 movu m6, [r5 + 10 * 16]
30102 pmaddubsw m3, m0, m6
30104 pmaddubsw m5, m2, m6
30107 movu [r0 + 1874 * 16], m3
30108 pmaddubsw m3, m1, m6
30110 pmaddubsw m5, m4, m6
30113 movu [r0 + 1875 * 16], m3
30116 movu m6, [r5 + 27 * 16]
30117 pmaddubsw m3, m0, m6
30119 pmaddubsw m5, m2, m6
30122 movu [r0 + 1876 * 16], m3
30123 pmaddubsw m3, m1, m6
30125 pmaddubsw m5, m4, m6
30128 movu [r0 + 1877 * 16], m3
30131 movu m6, [r5 + 8 * 16]
30132 pmaddubsw m3, m0, m6
30134 pmaddubsw m5, m2, m6
30137 movu [r0 + 1934 * 16], m3
30138 pmaddubsw m3, m1, m6
30140 pmaddubsw m5, m4, m6
30143 movu [r0 + 1935 * 16], m3
30146 movu m6, [r5 + 6 * 16]
30151 pmaddubsw m3, m0, m6
30157 pmaddubsw m5, m2, m6
30160 movu [r0 + 1770 * 16], m3
30166 pmaddubsw m3, m1, m6
30172 pmaddubsw m5, m4, m6
30175 movu [r0 + 1771 * 16], m3
30178 movu m6, [r5 + 15 * 16]
30179 pmaddubsw m3, m0, m6
30181 pmaddubsw m5, m2, m6
30184 movu [r0 + 1772 * 16], m3
30185 pmaddubsw m3, m1, m6
30187 pmaddubsw m5, m4, m6
30190 movu [r0 + 1773 * 16], m3
30193 movu m6, [r5 + 24 * 16]
30194 pmaddubsw m3, m0, m6
30196 pmaddubsw m5, m2, m6
30199 movu [r0 + 1774 * 16], m3
30200 pmaddubsw m3, m1, m6
30202 pmaddubsw m5, m4, m6
30205 movu [r0 + 1775 * 16], m3
30208 movu m6, [r5 + 3 * 16]
30209 pmaddubsw m3, m0, m6
30211 pmaddubsw m5, m2, m6
30214 movu [r0 + 1820 * 16], m3
30215 pmaddubsw m3, m1, m6
30217 pmaddubsw m5, m4, m6
30220 movu [r0 + 1821 * 16], m3
30223 movu m6, [r5 + 16 * 16]
30224 pmaddubsw m3, m0, m6
30226 pmaddubsw m5, m2, m6
30229 movu [r0 + 1822 * 16], m3
30231 ; mode 33 [row 7 - first half]
30232 movu [r0 + 1998 * 16], m3
30234 pmaddubsw m3, m1, m6
30236 pmaddubsw m5, m4, m6
30239 movu [r0 + 1823 * 16], m3
30241 ; mode 33 [row 7 - second half]
30242 movu [r0 + 1999 * 16], m3
30245 movu m6, [r5 + 29 * 16]
30246 pmaddubsw m3, m0, m6
30248 pmaddubsw m5, m2, m6
30251 movu [r0 + 1824 * 16], m3
30253 ; mode 31 [row 12 - first half]
30254 movu [r0 + 1880 * 16], m3
30256 pmaddubsw m3, m1, m6
30258 pmaddubsw m5, m4, m6
30261 movu [r0 + 1825 * 16], m3
30263 ; mode 31 [row 12 - second half]
30264 movu [r0 + 1881 * 16], m3
30267 movu m6, [r5 + 12 * 16]
30268 pmaddubsw m3, m0, m6
30270 pmaddubsw m5, m2, m6
30273 movu [r0 + 1878 * 16], m3
30274 pmaddubsw m3, m1, m6
30276 pmaddubsw m5, m4, m6
30279 movu [r0 + 1879 * 16], m3
30282 movu m6, [r5 + 18 * 16]
30283 pmaddubsw m3, m0, m6
30285 pmaddubsw m5, m2, m6
30288 movu [r0 + 1938 * 16], m3
30289 pmaddubsw m3, m1, m6
30291 pmaddubsw m5, m4, m6
30294 movu [r0 + 1939 * 16], m3
30297 movu m6, [r5 + 1 * 16]
30302 pmaddubsw m3, m0, m6
30308 pmaddubsw m5, m2, m6
30311 movu [r0 + 1776 * 16], m3
30317 pmaddubsw m3, m1, m6
30323 pmaddubsw m5, m4, m6
30326 movu [r0 + 1777 * 16], m3
30329 movu m6, [r5 + 10 * 16]
30330 pmaddubsw m3, m0, m6
30332 pmaddubsw m5, m2, m6
30335 movu [r0 + 1778 * 16], m3
30337 ; mode 30 [row 17 - first half]
30338 movu [r0 + 1826 * 16], m3
30340 ; mode 33 [row 8 - first half]
30341 movu [r0 + 2000 * 16], m3
30343 pmaddubsw m3, m1, m6
30345 pmaddubsw m5, m4, m6
30348 movu [r0 + 1779 * 16], m3
30350 ; mode 30 [row 17 - second half]
30351 movu [r0 + 1827 * 16], m3
30353 ; mode 33 [row 8 - second half]
30354 movu [r0 + 2001 * 16], m3
30357 movu m6, [r5 + 19 * 16]
30358 pmaddubsw m3, m0, m6
30360 pmaddubsw m5, m2, m6
30363 movu [r0 + 1780 * 16], m3
30364 pmaddubsw m3, m1, m6
30366 pmaddubsw m5, m4, m6
30369 movu [r0 + 1781 * 16], m3
30372 movu m6, [r5 + 28 * 16]
30373 pmaddubsw m3, m0, m6
30375 pmaddubsw m5, m2, m6
30378 movu [r0 + 1782 * 16], m3
30380 ; mode 32 [row 11 - first half]
30381 movu [r0 + 1942 * 16], m3
30383 pmaddubsw m3, m1, m6
30385 pmaddubsw m5, m4, m6
30388 movu [r0 + 1783 * 16], m3
30390 ; mode 32 [row 11 - second half]
30391 movu [r0 + 1943 * 16], m3
30394 movu m6, [r5 + 23 * 16]
30395 pmaddubsw m3, m0, m6
30397 pmaddubsw m5, m2, m6
30400 movu [r0 + 1828 * 16], m3
30401 pmaddubsw m3, m1, m6
30403 pmaddubsw m5, m4, m6
30406 movu [r0 + 1829 * 16], m3
30409 movu m6, [r5 + 14 * 16]
30410 pmaddubsw m3, m0, m6
30412 pmaddubsw m5, m2, m6
30415 movu [r0 + 1882 * 16], m3
30416 pmaddubsw m3, m1, m6
30418 pmaddubsw m5, m4, m6
30421 movu [r0 + 1883 * 16], m3
30424 movu m6, [r5 + 31 * 16]
30425 pmaddubsw m3, m0, m6
30427 pmaddubsw m5, m2, m6
30430 movu [r0 + 1884 * 16], m3
30431 pmaddubsw m3, m1, m6
30433 pmaddubsw m5, m4, m6
30436 movu [r0 + 1885 * 16], m3
30439 movu m6, [r5 + 7 * 16]
30440 pmaddubsw m3, m0, m6
30442 pmaddubsw m5, m2, m6
30445 movu [r0 + 1940 * 16], m3
30446 pmaddubsw m3, m1, m6
30448 pmaddubsw m5, m4, m6
30451 movu [r0 + 1941 * 16], m3
30454 movu m6, [r5 + 5 * 16]
30459 pmaddubsw m3, m0, m6
30465 pmaddubsw m5, m2, m6
30468 movu [r0 + 1784 * 16], m3
30474 pmaddubsw m3, m1, m6
30480 pmaddubsw m5, m4, m6
30483 movu [r0 + 1785 * 16], m3
30486 movu m6, [r5 + 14 * 16]
30487 pmaddubsw m3, m0, m6
30489 pmaddubsw m5, m2, m6
30492 movu [r0 + 1786 * 16], m3
30493 pmaddubsw m3, m1, m6
30495 pmaddubsw m5, m4, m6
30498 movu [r0 + 1787 * 16], m3
30501 movu m6, [r5 + 23 * 16]
30502 pmaddubsw m3, m0, m6
30504 pmaddubsw m5, m2, m6
30507 movu [r0 + 1788 * 16], m3
30508 pmaddubsw m3, m1, m6
30510 pmaddubsw m5, m4, m6
30513 movu [r0 + 1789 * 16], m3
30516 movu m6, [r5 + 4 * 16]
30517 pmaddubsw m3, m0, m6
30519 pmaddubsw m5, m2, m6
30522 movu [r0 + 1830 * 16], m3
30524 ; mode 33 [row 9 - first half]
30525 movu [r0 + 2002 * 16], m3
30527 pmaddubsw m3, m1, m6
30529 pmaddubsw m5, m4, m6
30532 movu [r0 + 1831 * 16], m3
30534 ; mode 33 [row 9 - second half]
30535 movu [r0 + 2003 * 16], m3
30538 movu m6, [r5 + 17 * 16]
30539 pmaddubsw m3, m0, m6
30541 pmaddubsw m5, m2, m6
30544 movu [r0 + 1832 * 16], m3
30546 ; mode 32 [row 12 - first half]
30547 movu [r0 + 1944 * 16], m3
30549 pmaddubsw m3, m1, m6
30551 pmaddubsw m5, m4, m6
30554 movu [r0 + 1833 * 16], m3
30556 ; mode 32 [row 12 - second half]
30557 movu [r0 + 1945 * 16], m3
30560 movu m6, [r5 + 30 * 16]
30561 pmaddubsw m3, m0, m6
30563 pmaddubsw m5, m2, m6
30566 movu [r0 + 1834 * 16], m3
30568 ; mode 33 [row 10 - first half]
30569 movu [r0 + 2004 * 16], m3
30571 pmaddubsw m3, m1, m6
30573 pmaddubsw m5, m4, m6
30576 movu [r0 + 1835 * 16], m3
30578 ; mode 33 [row 10 - second half]
30579 movu [r0 + 2005 * 16], m3
30582 movu m6, [r5 + 16 * 16]
30583 pmaddubsw m3, m0, m6
30585 pmaddubsw m5, m2, m6
30588 movu [r0 + 1886 * 16], m3
30589 pmaddubsw m3, m1, m6
30591 pmaddubsw m5, m4, m6
30594 movu [r0 + 1887 * 16], m3
30614 pshufb m5, m0, [tab_S2]
30615 movh [r0 + 1790 * 16], m5
30616 pshufb m5, m2, [tab_S2]
30617 movh [r0 + 1790 * 16 + 8], m5
30618 pshufb m5, m1, [tab_S2]
30619 movh [r0 + 1791 * 16], m5
30620 pshufb m5, m4, [tab_S2]
30621 movh [r0 + 1791 * 16 + 8], m5
30624 movu m6, [r5 + 11 * 16]
30625 pmaddubsw m3, m0, m6
30627 pmaddubsw m5, m2, m6
30630 movu [r0 + 1836 * 16], m3
30631 pmaddubsw m3, m1, m6
30633 pmaddubsw m5, m4, m6
30636 movu [r0 + 1837 * 16], m3
30639 movu m6, [r5 + 24 * 16]
30640 pmaddubsw m3, m0, m6
30642 pmaddubsw m5, m2, m6
30645 movu [r0 + 1838 * 16], m3
30647 ; mode 33 [row 11 - first half]
30648 movu [r0 + 2006 * 16], m3
30650 pmaddubsw m3, m1, m6
30652 pmaddubsw m5, m4, m6
30655 movu [r0 + 1839 * 16], m3
30657 ; mode 33 [row 11 - second half]
30658 movu [r0 + 2007 * 16], m3
30661 movu m6, [r5 + 1 * 16]
30662 pmaddubsw m3, m0, m6
30664 pmaddubsw m5, m2, m6
30667 movu [r0 + 1888 * 16], m3
30668 pmaddubsw m3, m1, m6
30670 pmaddubsw m5, m4, m6
30673 movu [r0 + 1889 * 16], m3
30676 movu m6, [r5 + 18 * 16]
30677 pmaddubsw m3, m0, m6
30679 pmaddubsw m5, m2, m6
30682 movu [r0 + 1890 * 16], m3
30683 pmaddubsw m3, m1, m6
30685 pmaddubsw m5, m4, m6
30688 movu [r0 + 1891 * 16], m3
30691 movu m6, [r5 + 6 * 16]
30692 pmaddubsw m3, m0, m6
30694 pmaddubsw m5, m2, m6
30697 movu [r0 + 1946 * 16], m3
30698 pmaddubsw m3, m1, m6
30700 pmaddubsw m5, m4, m6
30703 movu [r0 + 1947 * 16], m3
30706 movu m6, [r5 + 27 * 16]
30707 pmaddubsw m3, m0, m6
30709 pmaddubsw m5, m2, m6
30712 movu [r0 + 1948 * 16], m3
30713 pmaddubsw m3, m1, m6
30715 pmaddubsw m5, m4, m6
30718 movu [r0 + 1949 * 16], m3
30721 movu m6, [r5 + 5 * 16]
30726 pmaddubsw m3, m0, m6
30732 pmaddubsw m5, m2, m6
30735 movu [r0 + 1840 * 16], m3
30741 pmaddubsw m3, m1, m6
30747 pmaddubsw m5, m4, m6
30750 movu [r0 + 1841 * 16], m3
30753 movu m6, [r5 + 18 * 16]
30754 pmaddubsw m3, m0, m6
30756 pmaddubsw m5, m2, m6
30759 movu [r0 + 1842 * 16], m3
30761 ; mode 33 [row 12 - first half]
30762 movu [r0 + 2008 * 16], m3
30764 pmaddubsw m3, m1, m6
30766 pmaddubsw m5, m4, m6
30769 movu [r0 + 1843 * 16], m3
30771 ; mode 33 [row 12 - second half]
30772 movu [r0 + 2009 * 16], m3
30775 movu m6, [r5 + 31 * 16]
30776 pmaddubsw m3, m0, m6
30778 pmaddubsw m5, m2, m6
30781 movu [r0 + 1844 * 16], m3
30782 pmaddubsw m3, m1, m6
30784 pmaddubsw m5, m4, m6
30787 movu [r0 + 1845 * 16], m3
30790 movu m6, [r5 + 3 * 16]
30791 pmaddubsw m3, m0, m6
30793 pmaddubsw m5, m2, m6
30796 movu [r0 + 1892 * 16], m3
30797 pmaddubsw m3, m1, m6
30799 pmaddubsw m5, m4, m6
30802 movu [r0 + 1893 * 16], m3
30805 movu m6, [r5 + 20 * 16]
30806 pmaddubsw m3, m0, m6
30808 pmaddubsw m5, m2, m6
30811 movu [r0 + 1894 * 16], m3
30812 pmaddubsw m3, m1, m6
30814 pmaddubsw m5, m4, m6
30817 movu [r0 + 1895 * 16], m3
30820 movu m6, [r5 + 16 * 16]
30821 pmaddubsw m3, m0, m6
30823 pmaddubsw m5, m2, m6
30826 movu [r0 + 1950 * 16], m3
30827 pmaddubsw m3, m1, m6
30829 pmaddubsw m5, m4, m6
30832 movu [r0 + 1951 * 16], m3
30835 movu m6, [r5 + 12 * 16]
30840 pmaddubsw m3, m0, m6
30846 pmaddubsw m5, m2, m6
30849 movu [r0 + 1846 * 16], m3
30851 ; mode 33 [row 13 - first half]
30852 movu [r0 + 2010 * 16], m3
30858 pmaddubsw m3, m1, m6
30864 pmaddubsw m5, m4, m6
30867 movu [r0 + 1847 * 16], m3
30869 ; mode 33 [row 13 - second half]
30870 movu [r0 + 2011 * 16], m3
30873 movu m6, [r5 + 25 * 16]
30874 pmaddubsw m3, m0, m6
30876 pmaddubsw m5, m2, m6
30879 movu [r0 + 1848 * 16], m3
30880 pmaddubsw m3, m1, m6
30882 pmaddubsw m5, m4, m6
30885 movu [r0 + 1849 * 16], m3
30888 movu m6, [r5 + 5 * 16]
30889 pmaddubsw m3, m0, m6
30891 pmaddubsw m5, m2, m6
30894 movu [r0 + 1896 * 16], m3
30896 ; mode 32 [row 16 - first half]
30897 movu [r0 + 1952 * 16], m3
30899 pmaddubsw m3, m1, m6
30901 pmaddubsw m5, m4, m6
30904 movu [r0 + 1897 * 16], m3
30906 ; mode 32 [row 16 - second half]
30907 movu [r0 + 1953 * 16], m3
30910 movu m6, [r5 + 22 * 16]
30911 pmaddubsw m3, m0, m6
30913 pmaddubsw m5, m2, m6
30916 movu [r0 + 1898 * 16], m3
30917 pmaddubsw m3, m1, m6
30919 pmaddubsw m5, m4, m6
30922 movu [r0 + 1899 * 16], m3
30925 movu m6, [r5 + 26 * 16]
30926 pmaddubsw m3, m0, m6
30928 pmaddubsw m5, m2, m6
30931 movu [r0 + 1954 * 16], m3
30932 pmaddubsw m3, m1, m6
30934 pmaddubsw m5, m4, m6
30937 movu [r0 + 1955 * 16], m3
30940 movu m6, [r5 + 6 * 16]
30945 pmaddubsw m3, m0, m6
30951 pmaddubsw m5, m2, m6
30954 movu [r0 + 1850 * 16], m3
30956 ; mode 33 [row 14 - first half]
30957 movu [r0 + 2012 * 16], m3
30963 pmaddubsw m3, m1, m6
30969 pmaddubsw m5, m4, m6
30972 movu [r0 + 1851 * 16], m3
30974 ; mode 33 [row 14 - second half]
30975 movu [r0 + 2013 * 16], m3
30978 movu m6, [r5 + 19 * 16]
30979 pmaddubsw m3, m0, m6
30981 pmaddubsw m5, m2, m6
30984 movu [r0 + 1852 * 16], m3
30985 pmaddubsw m3, m1, m6
30987 pmaddubsw m5, m4, m6
30990 movu [r0 + 1853 * 16], m3
30993 movu m6, [r5 + 7 * 16]
30994 pmaddubsw m3, m0, m6
30996 pmaddubsw m5, m2, m6
30999 movu [r0 + 1900 * 16], m3
31000 pmaddubsw m3, m1, m6
31002 pmaddubsw m5, m4, m6
31005 movu [r0 + 1901 * 16], m3
31008 movu m6, [r5 + 24 * 16]
31009 pmaddubsw m3, m0, m6
31011 pmaddubsw m5, m2, m6
31014 movu [r0 + 1902 * 16], m3
31015 pmaddubsw m3, m1, m6
31017 pmaddubsw m5, m4, m6
31020 movu [r0 + 1903 * 16], m3
31023 movu m6, [r5 + 15 * 16]
31024 pmaddubsw m3, m0, m6
31026 pmaddubsw m5, m2, m6
31029 movu [r0 + 1956 * 16], m3
31030 pmaddubsw m3, m1, m6
31032 pmaddubsw m5, m4, m6
31035 movu [r0 + 1957 * 16], m3
31055 pshufb m5, m0, [tab_S2]
31056 movh [r0 + 1854 * 16], m5
31058 ; mode 33 [row 15 - first eight]
31059 movh [r0 + 2014 * 16], m5
31061 pshufb m5, m2, [tab_S2]
31062 movh [r0 + 1854 * 16 + 8], m5
31064 ; mode 33 [row 15 - second eight]
31065 movh [r0 + 2014 * 16 + 8], m5
31067 pshufb m5, m1, [tab_S2]
31068 movh [r0 + 1855 * 16], m5
31070 ; mode 33 [row 15 - third eight]
31071 movh [r0 + 2015 * 16], m5
31073 pshufb m5, m4, [tab_S2]
31074 movh [r0 + 1855 * 16 + 8], m5
31076 ; mode 33 [row 15 - fourth eight]
31077 movh [r0 + 2015 * 16 + 8], m5
31080 movu m6, [r5 + 9 * 16]
31081 pmaddubsw m3, m0, m6
31083 pmaddubsw m5, m2, m6
31086 movu [r0 + 1904 * 16], m3
31087 pmaddubsw m3, m1, m6
31089 pmaddubsw m5, m4, m6
31092 movu [r0 + 1905 * 16], m3
31095 movu m6, [r5 + 26 * 16]
31096 pmaddubsw m3, m0, m6
31098 pmaddubsw m5, m2, m6
31101 movu [r0 + 1906 * 16], m3
31103 ; mode 33 [row 16 - first half]
31104 movu [r0 + 2016 * 16], m3
31106 pmaddubsw m3, m1, m6
31108 pmaddubsw m5, m4, m6
31111 movu [r0 + 1907 * 16], m3
31113 ; mode 33 [row 16 - second half]
31114 movu [r0 + 2017 * 16], m3
31117 movu m6, [r5 + 4 * 16]
31118 pmaddubsw m3, m0, m6
31120 pmaddubsw m5, m2, m6
31123 movu [r0 + 1958 * 16], m3
31124 pmaddubsw m3, m1, m6
31126 pmaddubsw m5, m4, m6
31129 movu [r0 + 1959 * 16], m3
31132 movu m6, [r5 + 25 * 16]
31133 pmaddubsw m3, m0, m6
31135 pmaddubsw m5, m2, m6
31138 movu [r0 + 1960 * 16], m3
31139 pmaddubsw m3, m1, m6
31141 pmaddubsw m5, m4, m6
31144 movu [r0 + 1961 * 16], m3
31147 movu m6, [r5 + 11 * 16]
31152 pmaddubsw m3, m0, m6
31158 pmaddubsw m5, m2, m6
31161 movu [r0 + 1908 * 16], m3
31167 pmaddubsw m3, m1, m6
31173 pmaddubsw m5, m4, m6
31176 movu [r0 + 1909 * 16], m3
31179 movu m6, [r5 + 28 * 16]
31180 pmaddubsw m3, m0, m6
31182 pmaddubsw m5, m2, m6
31185 movu [r0 + 1910 * 16], m3
31186 pmaddubsw m3, m1, m6
31188 pmaddubsw m5, m4, m6
31191 movu [r0 + 1911 * 16], m3
31194 movu m6, [r5 + 14 * 16]
31195 pmaddubsw m3, m0, m6
31197 pmaddubsw m5, m2, m6
31200 movu [r0 + 1962 * 16], m3
31201 pmaddubsw m3, m1, m6
31203 pmaddubsw m5, m4, m6
31206 movu [r0 + 1963 * 16], m3
31209 movu m6, [r5 + 20 * 16]
31210 pmaddubsw m3, m0, m6
31212 pmaddubsw m5, m2, m6
31215 movu [r0 + 2018 * 16], m3
31216 pmaddubsw m3, m1, m6
31218 pmaddubsw m5, m4, m6
31221 movu [r0 + 2019 * 16], m3
31224 movu m6, [r5 + 13 * 16]
31229 pmaddubsw m3, m0, m6
31235 pmaddubsw m5, m2, m6
31238 movu [r0 + 1912 * 16], m3
31244 pmaddubsw m3, m1, m6
31250 pmaddubsw m5, m4, m6
31253 movu [r0 + 1913 * 16], m3
31256 movu m6, [r5 + 30 * 16]
31257 pmaddubsw m3, m0, m6
31259 pmaddubsw m5, m2, m6
31262 movu [r0 + 1914 * 16], m3
31263 pmaddubsw m3, m1, m6
31265 pmaddubsw m5, m4, m6
31268 movu [r0 + 1915 * 16], m3
31271 movu m6, [r5 + 3 * 16]
31272 pmaddubsw m3, m0, m6
31274 pmaddubsw m5, m2, m6
31277 movu [r0 + 1964 * 16], m3
31278 pmaddubsw m3, m1, m6
31280 pmaddubsw m5, m4, m6
31283 movu [r0 + 1965 * 16], m3
31286 movu m6, [r5 + 24 * 16]
31287 pmaddubsw m3, m0, m6
31289 pmaddubsw m5, m2, m6
31292 movu [r0 + 1966 * 16], m3
31293 pmaddubsw m3, m1, m6
31295 pmaddubsw m5, m4, m6
31298 movu [r0 + 1967 * 16], m3
31301 movu m6, [r5 + 14 * 16]
31302 pmaddubsw m3, m0, m6
31304 pmaddubsw m5, m2, m6
31307 movu [r0 + 2020 * 16], m3
31308 pmaddubsw m3, m1, m6
31310 pmaddubsw m5, m4, m6
31313 movu [r0 + 2021 * 16], m3
31316 movu m6, [r5 + 15 * 16]
31321 pmaddubsw m3, m0, m6
31327 pmaddubsw m5, m2, m6
31330 movu [r0 + 1916 * 16], m3
31336 pmaddubsw m3, m1, m6
31342 pmaddubsw m5, m4, m6
31345 movu [r0 + 1917 * 16], m3
31348 movu m6, [r5 + 13 * 16]
31349 pmaddubsw m3, m0, m6
31351 pmaddubsw m5, m2, m6
31354 movu [r0 + 1968 * 16], m3
31355 pmaddubsw m3, m1, m6
31357 pmaddubsw m5, m4, m6
31360 movu [r0 + 1969 * 16], m3
31363 movu m6, [r5 + 8 * 16]
31364 pmaddubsw m3, m0, m6
31366 pmaddubsw m5, m2, m6
31369 movu [r0 + 2022 * 16], m3
31370 pmaddubsw m3, m1, m6
31372 pmaddubsw m5, m4, m6
31375 movu [r0 + 2023 * 16], m3
31395 pshufb m5, m0, [tab_S2]
31396 movh [r0 + 1918 * 16], m5
31397 pshufb m5, m2, [tab_S2]
31398 movh [r0 + 1918 * 16 + 8], m5
31399 pshufb m5, m1, [tab_S2]
31400 movh [r0 + 1919 * 16], m5
31401 pshufb m5, m4, [tab_S2]
31402 movh [r0 + 1919 * 16 + 8], m5
31405 movu m6, [r5 + 2 * 16]
31406 pmaddubsw m3, m0, m6
31408 pmaddubsw m5, m2, m6
31411 movu [r0 + 1970 * 16], m3
31413 ; mode 33 [row 20 - first half]
31414 movu [r0 + 2024 * 16], m3
31416 pmaddubsw m3, m1, m6
31418 pmaddubsw m5, m4, m6
31421 movu [r0 + 1971 * 16], m3
31423 ; mode 33 [row 20 - second half]
31424 movu [r0 + 2025 * 16], m3
31427 movu m6, [r5 + 23 * 16]
31428 pmaddubsw m3, m0, m6
31430 pmaddubsw m5, m2, m6
31433 movu [r0 + 1972 * 16], m3
31434 pmaddubsw m3, m1, m6
31436 pmaddubsw m5, m4, m6
31439 movu [r0 + 1973 * 16], m3
31442 movu m6, [r5 + 28 * 16]
31443 pmaddubsw m3, m0, m6
31445 pmaddubsw m5, m2, m6
31448 movu [r0 + 2026 * 16], m3
31449 pmaddubsw m3, m1, m6
31451 pmaddubsw m5, m4, m6
31454 movu [r0 + 2027 * 16], m3
31457 movu m6, [r5 + 12 * 16]
31462 pmaddubsw m3, m0, m6
31468 pmaddubsw m5, m2, m6
31471 movu [r0 + 1974 * 16], m3
31477 pmaddubsw m3, m1, m6
31483 pmaddubsw m5, m4, m6
31486 movu [r0 + 1975 * 16], m3
31489 movu m6, [r5 + 22 * 16]
31490 pmaddubsw m3, m0, m6
31492 pmaddubsw m5, m2, m6
31495 movu [r0 + 2028 * 16], m3
31496 pmaddubsw m3, m1, m6
31498 pmaddubsw m5, m4, m6
31501 movu [r0 + 2029 * 16], m3
31504 movu m6, [r5 + 1 * 16]
31509 pmaddubsw m3, m0, m6
31515 pmaddubsw m5, m2, m6
31518 movu [r0 + 1976 * 16], m3
31524 pmaddubsw m3, m1, m6
31530 pmaddubsw m5, m4, m6
31533 movu [r0 + 1977 * 16], m3
31536 movu m6, [r5 + 22 * 16]
31537 pmaddubsw m3, m0, m6
31539 pmaddubsw m5, m2, m6
31542 movu [r0 + 1978 * 16], m3
31543 pmaddubsw m3, m1, m6
31545 pmaddubsw m5, m4, m6
31548 movu [r0 + 1979 * 16], m3
31551 movu m6, [r5 + 16 * 16]
31552 pmaddubsw m3, m0, m6
31554 pmaddubsw m5, m2, m6
31557 movu [r0 + 2030 * 16], m3
31558 pmaddubsw m3, m1, m6
31560 pmaddubsw m5, m4, m6
31563 movu [r0 + 2031 * 16], m3
31566 movu m6, [r5 + 11 * 16]
31571 pmaddubsw m3, m0, m6
31577 pmaddubsw m5, m2, m6
31580 movu [r0 + 1980 * 16], m3
31586 pmaddubsw m3, m1, m6
31592 pmaddubsw m5, m4, m6
31595 movu [r0 + 1981 * 16], m3
31598 movu m6, [r5 + 10 * 16]
31599 pmaddubsw m3, m0, m6
31601 pmaddubsw m5, m2, m6
31604 movu [r0 + 2032 * 16], m3
31605 pmaddubsw m3, m1, m6
31607 pmaddubsw m5, m4, m6
31610 movu [r0 + 2033 * 16], m3
31630 pshufb m5, m0, [tab_S2]
31631 movh [r0 + 1982 * 16], m5
31632 pshufb m5, m2, [tab_S2]
31633 movh [r0 + 1982 * 16 + 8], m5
31634 pshufb m5, m1, [tab_S2]
31635 movh [r0 + 1983 * 16], m5
31636 pshufb m5, m4, [tab_S2]
31637 movh [r0 + 1983 * 16 + 8], m5
31640 movu m6, [r5 + 4 * 16]
31641 pmaddubsw m3, m0, m6
31643 pmaddubsw m5, m2, m6
31646 movu [r0 + 2034 * 16], m3
31647 pmaddubsw m3, m1, m6
31649 pmaddubsw m5, m4, m6
31652 movu [r0 + 2035 * 16], m3
31655 movu m6, [r5 + 30 * 16]
31656 pmaddubsw m3, m0, m6
31658 pmaddubsw m5, m2, m6
31661 movu [r0 + 2036 * 16], m3
31662 pmaddubsw m3, m1, m6
31664 pmaddubsw m5, m4, m6
31667 movu [r0 + 2037 * 16], m3
31670 movu m6, [r5 + 24 * 16]
31675 pmaddubsw m3, m0, m6
31681 pmaddubsw m5, m2, m6
31684 movu [r0 + 2038 * 16], m3
31690 pmaddubsw m3, m1, m6
31696 pmaddubsw m5, m4, m6
31699 movu [r0 + 2039 * 16], m3
31702 movu m6, [r5 + 18 * 16]
31707 pmaddubsw m3, m0, m6
31713 pmaddubsw m5, m2, m6
31716 movu [r0 + 2040 * 16], m3
31722 pmaddubsw m3, m1, m6
31728 pmaddubsw m5, m4, m6
31731 movu [r0 + 2041 * 16], m3
31734 movu m6, [r5 + 12 * 16]
31739 pmaddubsw m3, m0, m6
31745 pmaddubsw m5, m2, m6
31748 movu [r0 + 2042 * 16], m3
31754 pmaddubsw m3, m1, m6
31760 pmaddubsw m5, m4, m6
31763 movu [r0 + 2043 * 16], m3
31766 movu m6, [r5 + 6 * 16]
31771 pmaddubsw m3, m0, m6
31777 pmaddubsw m5, m2, m6
31780 movu [r0 + 2044 * 16], m3
31786 pmaddubsw m3, m1, m6
31792 pmaddubsw m5, m4, m6
31795 movu [r0 + 2045 * 16], m3
31799 movu [r0 + 2046 * 16], m5
31801 movu [r0 + 2047 * 16], m5
31805 movu [r0 + 2048 * 16], m0
31807 movu [r0 + 2049 * 16], m1
31811 palignr m3, m1, m0, 1
31812 movu [r0 + 2050 * 16], m3
31813 palignr m4, m2, m1, 1
31814 movu [r0 + 2051 * 16], m4
31817 palignr m3, m1, m0, 2
31818 movu [r0 + 2052 * 16], m3
31819 palignr m4, m2, m1, 2
31820 movu [r0 + 2053 * 16], m4
31823 palignr m3, m1, m0, 3
31824 movu [r0 + 2054 * 16], m3
31825 palignr m4, m2, m1, 3
31826 movu [r0 + 2055 * 16], m4
31829 palignr m3, m1, m0, 4
31830 movu [r0 + 2056 * 16], m3
31831 palignr m4, m2, m1, 4
31832 movu [r0 + 2057 * 16], m4
31835 palignr m3, m1, m0, 5
31836 movu [r0 + 2058 * 16], m3
31837 palignr m4, m2, m1, 5
31838 movu [r0 + 2059 * 16], m4
31841 palignr m3, m1, m0, 6
31842 movu [r0 + 2060 * 16], m3
31843 palignr m4, m2, m1, 6
31844 movu [r0 + 2061 * 16], m4
31847 palignr m3, m1, m0, 7
31848 movu [r0 + 2062 * 16], m3
31849 palignr m4, m2, m1, 7
31850 movu [r0 + 2063 * 16], m4
31853 palignr m3, m1, m0, 8
31854 movu [r0 + 2064 * 16], m3
31855 palignr m4, m2, m1, 8
31856 movu [r0 + 2065 * 16], m4
31859 palignr m3, m1, m0, 9
31860 movu [r0 + 2066 * 16], m3
31861 palignr m4, m2, m1, 9
31862 movu [r0 + 2067 * 16], m4
31865 palignr m3, m1, m0, 10
31866 movu [r0 + 2068 * 16], m3
31867 palignr m4, m2, m1, 10
31868 movu [r0 + 2069 * 16], m4
31871 palignr m3, m1, m0, 11
31872 movu [r0 + 2070 * 16], m3
31873 palignr m4, m2, m1, 11
31874 movu [r0 + 2071 * 16], m4
31877 palignr m3, m1, m0, 12
31878 movu [r0 + 2072 * 16], m3
31879 palignr m4, m2, m1, 12
31880 movu [r0 + 2073 * 16], m4
31883 palignr m3, m1, m0, 13
31884 movu [r0 + 2074 * 16], m3
31885 palignr m4, m2, m1, 13
31886 movu [r0 + 2075 * 16], m4
31889 palignr m3, m1, m0, 14
31890 movu [r0 + 2076 * 16], m3
31891 palignr m4, m2, m1, 14
31892 movu [r0 + 2077 * 16], m4
31895 palignr m3, m1, m0, 15
31896 movu [r0 + 2078 * 16], m3
31897 palignr m4, m2, m1, 15
31898 movu [r0 + 2079 * 16], m4
31901 palignr m3, m1, m0, 16
31902 movu [r0 + 2080 * 16], m3
31903 palignr m4, m2, m1, 16
31904 movu [r0 + 2081 * 16], m4
31908 movu [r0 + 2082 * 16], m0
31910 movu [r0 + 2083 * 16], m1
31914 palignr m3, m1, m0, 1
31915 movu [r0 + 2084 * 16], m3
31916 palignr m4, m2, m1, 1
31917 movu [r0 + 2085 * 16], m4
31920 palignr m3, m1, m0, 2
31921 movu [r0 + 2086 * 16], m3
31922 palignr m4, m2, m1, 2
31923 movu [r0 + 2087 * 16], m4
31926 palignr m3, m1, m0, 3
31927 movu [r0 + 2088 * 16], m3
31928 palignr m4, m2, m1, 3
31929 movu [r0 + 2089 * 16], m4
31932 palignr m3, m1, m0, 4
31933 movu [r0 + 2090 * 16], m3
31934 palignr m4, m2, m1, 4
31935 movu [r0 + 2091 * 16], m4
31938 palignr m3, m1, m0, 5
31939 movu [r0 + 2092 * 16], m3
31940 palignr m4, m2, m1, 5
31941 movu [r0 + 2093 * 16], m4
31944 palignr m3, m1, m0, 6
31945 movu [r0 + 2094 * 16], m3
31946 palignr m4, m2, m1, 6
31947 movu [r0 + 2095 * 16], m4
31950 palignr m3, m1, m0, 7
31951 movu [r0 + 2096 * 16], m3
31952 palignr m4, m2, m1, 7
31953 movu [r0 + 2097 * 16], m4
31956 palignr m3, m1, m0, 8
31957 movu [r0 + 2098 * 16], m3
31958 palignr m4, m2, m1, 8
31959 movu [r0 + 2099 * 16], m4
31962 palignr m3, m1, m0, 9
31963 movu [r0 + 2100 * 16], m3
31964 palignr m4, m2, m1, 9
31965 movu [r0 + 2101 * 16], m4
31968 palignr m3, m1, m0, 10
31969 movu [r0 + 2102 * 16], m3
31970 palignr m4, m2, m1, 10
31971 movu [r0 + 2103 * 16], m4
31974 palignr m3, m1, m0, 11
31975 movu [r0 + 2104 * 16], m3
31976 palignr m4, m2, m1, 11
31977 movu [r0 + 2105 * 16], m4
31980 palignr m3, m1, m0, 12
31981 movu [r0 + 2106 * 16], m3
31982 palignr m4, m2, m1, 12
31983 movu [r0 + 2107 * 16], m4
31986 palignr m3, m1, m0, 13
31987 movu [r0 + 2108 * 16], m3
31988 palignr m4, m2, m1, 13
31989 movu [r0 + 2109 * 16], m4
31992 palignr m3, m1, m0, 14
31993 movu [r0 + 2110 * 16], m3
31994 palignr m4, m2, m1, 14
31995 movu [r0 + 2111 * 16], m4