1 ;*****************************************************************************
2 ;* Copyright (C) 2013 x265 project
4 ;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
5 ;* Nabajit Deka <nabajit@multicorewareinc.com>
7 ;* This program is free software; you can redistribute it and/or modify
8 ;* it under the terms of the GNU General Public License as published by
9 ;* the Free Software Foundation; either version 2 of the License, or
10 ;* (at your option) any later version.
12 ;* This program is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ;* GNU General Public License for more details.
17 ;* You should have received a copy of the GNU General Public License
18 ;* along with this program; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21 ;* This program is also available under a commercial proprietary license.
22 ;* For more information, contact us at license @ x265.com.
23 ;*****************************************************************************/
26 %include "x86util.asm"
; Read-only constant tables used by the routines in this file.
; NOTE(review): ssim_c1/ssim_c2 are defined three times below; their trailing
; comments reference peak values 1023, 511 and 255, so each pair is presumably
; selected by a bit-depth conditional that is not visible in this excerpt - confirm.
31 ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64
32 ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
33 pf_64: times 4 dd 64.0 ; four packed floats, each 64.0
34 pf_128: times 4 dd 128.0 ; four packed floats, each 128.0
36 ssim_c1: times 4 dd 1671 ; .01*.01*511*511*64
37 ssim_c2: times 4 dd 947556 ; .03*.03*511*511*64*63
39 ssim_c1: times 4 dd 416 ; .01*.01*255*255*64
40 ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
42 mask_ff: times 16 db 0xff ; 16 bytes with every bit set
44 deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 ; byte indices: even positions first, then odd
45 deinterleave_word_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 ; byte indices: even 16-bit words first, then odd words
46 hmul_16p: times 16 db 1 ; 16 bytes, each 1
48 hmulw_16p: times 8 dw 1 ; 8 words, each 1
51 trans8_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 ; dword indices interleaving elements 0-3 with 4-7
65 ;-----------------------------------------------------------------------------
66 ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
67 ;-----------------------------------------------------------------------------
70 cglobal getResidual4, 4,4,4
100 cglobal getResidual4, 4,4,5
114 movhps [r2 + r3 * 2], m1
115 lea r0, [r0 + r3 * 2]
116 lea r1, [r1 + r3 * 2]
117 lea r2, [r2 + r3 * 4]
130 movhps [r2 + r3 * 2], m1
137 cglobal getResidual8, 4,4,4
153 lea r0, [r0 + r3 * 2]
154 lea r1, [r1 + r3 * 2]
155 lea r2, [r2 + r3 * 2]
159 cglobal getResidual8, 4,4,5
176 movu [r2 + r3 * 2], m2
179 lea r0, [r0 + r3 * 2]
180 lea r1, [r1 + r3 * 2]
181 lea r2, [r2 + r3 * 4]
189 cglobal getResidual16, 4,5,6
197 movu m3, [r0 + r3 + 16]
203 movu m5, [r1 + r3 + 16]
206 lea r0, [r0 + r3 * 2]
207 lea r1, [r1 + r3 * 2]
212 movu [r2 + r3 + 16], m3
213 lea r2, [r2 + r3 * 2]
219 movu m3, [r0 + r3 + 16]
225 movu m5, [r1 + r3 + 16]
232 movu [r2 + r3 + 16], m3
236 lea r0, [r0 + r3 * 2]
237 lea r1, [r1 + r3 * 2]
238 lea r2, [r2 + r3 * 2]
244 cglobal getResidual16, 4,5,8
268 movu [r2 + r3 * 2], m6
269 movu [r2 + r3 * 2 + 16], m2
271 lea r0, [r0 + r3 * 2]
272 lea r1, [r1 + r3 * 2]
273 lea r2, [r2 + r3 * 4]
295 movu [r2 + r3 * 2], m6
296 movu [r2 + r3 * 2 + 16], m2
300 lea r0, [r0 + r3 * 2]
301 lea r1, [r1 + r3 * 2]
302 lea r2, [r2 + r3 * 4]
311 cglobal getResidual32, 4,5,6
336 movu m1, [r0 + r3 + 16]
337 movu m2, [r0 + r3 + 32]
338 movu m3, [r0 + r3 + 48]
340 movu m5, [r1 + r3 + 16]
343 movu m4, [r1 + r3 + 32]
344 movu m5, [r1 + r3 + 48]
349 movu [r2 + r3 + 16], m1
350 movu [r2 + r3 + 32], m2
351 movu [r2 + r3 + 48], m3
355 lea r0, [r0 + r3 * 2]
356 lea r1, [r1 + r3 * 2]
357 lea r2, [r2 + r3 * 2]
363 cglobal getResidual32, 4,5,7
377 movu [r2 + 0 * 16], m5
378 movu [r2 + 1 * 16], m1
386 movu [r2 + 2 * 16], m5
387 movu [r2 + 3 * 16], m2
390 movu m2, [r0 + r3 + 16]
392 movu m4, [r1 + r3 + 16]
399 movu [r2 + r3 * 2 + 0 * 16], m5
400 movu [r2 + r3 * 2 + 1 * 16], m1
408 movu [r2 + r3 * 2 + 2 * 16], m5
409 movu [r2 + r3 * 2 + 3 * 16], m2
413 lea r0, [r0 + r3 * 2]
414 lea r1, [r1 + r3 * 2]
415 lea r2, [r2 + r3 * 4]
422 ;-----------------------------------------------------------------------------
423 ; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
424 ;-----------------------------------------------------------------------------
; quant, 128-bit path (excerpt: several instructions between the lines below are not shown).
; Per the signature above: r0 = coef, r1 = quantCoeff, r4 = qBits.
428 movd m4, r4d ; m4 = qbits
432 movd m6, r4d ; m6 = qbits8
436 pshufd m5, m5, 0 ; m5 = add
442 pxor m7, m7 ; m7 = numZero
; first group of four coefficients
445 pmovsxwd m0, [r0] ; m0 = level
447 pmulld m1, [r1] ; m1 = tmpLevel1
449 psrad m2, m4 ; m2 = level1
453 psubd m1, m3 ; m1 = deltaU1
; second group of four coefficients (coef + 8 bytes, quantCoeff + 16 bytes)
463 pmovsxwd m0, [r0 + 8] ; m0 = level
465 pmulld m1, [r1 + 16] ; m1 = tmpLevel1
467 psrad m2, m4 ; m2 = level1
470 psubd m1, m3 ; m1 = deltaU1
; quant, ymm path (excerpt: several instructions between the lines below are not shown).
; Per the signature above: r0 = coef, r1 = quantCoeff, r2 = deltaU, r3 = qCoef, r4 = qBits, r5 = add.
; The first half is the ARCH_X86_64 == 1 body; the half after %else is the alternative build.
497 cglobal quant, 5,5,10
499 movd xm4, r4d ; xm4 = qbits
503 movd xm6, r4d ; xm6 = qbits8
506 vpbroadcastd m5, r5m ; m5 = add
508 vpbroadcastw m9, [pw_1] ; m9 = word [1]
512 pxor m7, m7 ; m7 = numZero
; first group of coefficients
515 pmovsxwd m0, [r0] ; m0 = level
517 pmulld m1, [r1] ; m1 = tmpLevel1
519 psrad m2, xm4 ; m2 = level1
523 psubd m1, m3 ; m1 = deltaU1
; second group of coefficients (coef + mmsize/2 bytes, quantCoeff + mmsize bytes)
528 pmovsxwd m0, [r0 + mmsize/2] ; m0 = level
530 pmulld m1, [r1 + mmsize] ; m1 = tmpLevel1
532 psrad m3, xm4 ; m3 = level1
536 psubd m1, m8 ; m1 = deltaU1
537 movu [r2 + mmsize], m1
544 ; count non-zero coeff
545 ; TODO: popcnt is faster, but some CPUs do not support it
560 vextracti128 xm1, m7, 1
567 %else ; ARCH_X86_64 == 0
571 movd xm4, r4d ; xm4 = qbits
575 movd xm6, r4d ; xm6 = qbits8
578 vpbroadcastd m5, r5m ; m5 = add
584 pxor m7, m7 ; m7 = numZero
; first group of coefficients
587 pmovsxwd m0, [r0] ; m0 = level
589 pmulld m1, [r1] ; m1 = tmpLevel1
591 psrad m2, xm4 ; m2 = level1
595 psubd m1, m3 ; m1 = deltaU1
; second group of coefficients (coef + mmsize/2 bytes, quantCoeff + mmsize bytes)
606 pmovsxwd m0, [r0 + mmsize/2] ; m0 = level
608 pmulld m1, [r1 + mmsize] ; m1 = tmpLevel1
610 psrad m2, xm4 ; m2 = level1
614 psubd m1, m3 ; m1 = deltaU1
616 movu [r2 + mmsize], m1
622 movu [r3 + mmsize/2], xm3
634 vextracti128 xm1, m7, 1
640 %endif ; ARCH_X86_64 == 1
644 ;-----------------------------------------------------------------------------
645 ; uint32_t nquant(int16_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
646 ;-----------------------------------------------------------------------------
; nquant, 128-bit path (excerpt: several instructions between the lines below are not shown).
; Per the signature above: r0 = coef, r1 = quantCoeff, r2 = qCoef, arg 3 = qBits.
648 cglobal nquant, 3,5,8
651 pxor m7, m7 ; m7 = numZero
652 movd m5, r3m ; m5 = qbits
653 pshufd m6, m6, 0 ; m6 = add
654 mov r3d, r4d ; r3 = numCoeff (NOTE(review): the load of r4d is not visible in this excerpt)
658 pmovsxwd m0, [r0] ; m0 = level
659 pmovsxwd m1, [r0 + 8] ; m1 = level
662 pmulld m2, [r1] ; m2 = tmpLevel1 * qcoeff
664 psrad m2, m5 ; m2 = level1
668 pmulld m3, [r1 + 16] ; m3 = tmpLevel1 * qcoeff
670 psrad m3, m5 ; m3 = level1
691 sub eax, r4d ; numSig
; nquant, ymm path (excerpt: several instructions between the lines below are not shown).
; Per the signature above: r0 = coef, r1 = quantCoeff, r2 = qCoef, arg 3 = qBits.
696 cglobal nquant, 3,5,7
698 vpbroadcastd m6, [pw_1]
700 pxor m5, m5 ; m5 = numZero
701 movd xm3, r3m ; xm3 = qbits
702 mov r3d, r4d ; r3 = numCoeff (NOTE(review): the load of r4d is not visible in this excerpt)
; first group of coefficients
706 pmovsxwd m0, [r0] ; m0 = level
708 pmulld m1, [r1] ; m1 = tmpLevel1 * qcoeff
710 psrad m1, xm3 ; m1 = level1
; second group of coefficients (coef + mmsize/2 bytes, quantCoeff + mmsize bytes)
713 pmovsxwd m0, [r0 + mmsize/2] ; m0 = level
715 pmulld m2, [r1 + mmsize] ; m2 = tmpLevel1 * qcoeff
717 psrad m2, xm3 ; m2 = level1
736 vextracti128 xm0, m5, 1
744 ;-----------------------------------------------------------------------------
745 ; void dequant_normal(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift)
746 ;-----------------------------------------------------------------------------
748 cglobal dequant_normal, 5,5,5
757 movd m0, r4d ; m0 = shift
761 pshufd m1, m1, 0 ; m1 = dword [add scale]
769 pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add)
785 cglobal dequant_normal, 5,5,7
786 vpbroadcastd m2, [pw_1] ; m2 = word [1]
787 vpbroadcastd m5, [pd_32767] ; m5 = dword [32767]
788 vpbroadcastd m6, [pd_n32768] ; m6 = dword [-32768]
796 movd xm0, r4d ; m0 = shift
799 vpbroadcastd m1, r3d ; m1 = dword [add scale]
809 pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add)
818 mova [r1 + 0 * mmsize/2], xm3
819 vextracti128 [r1 + 1 * mmsize/2], m3, 1
829 ;-----------------------------------------------------------------------------
830 ; int count_nonzero(const int16_t *quantCoeff, int numCoeff);
831 ;-----------------------------------------------------------------------------
833 cglobal count_nonzero, 2,2,3
841 packsswb m2, [r0 + 16]
855 ;-----------------------------------------------------------------------------------------------------------------------------------------------
856 ;void weight_pp(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
857 ;-----------------------------------------------------------------------------------------------------------------------------------------------
; weight_pp, 128-bit path (excerpt: several instructions between the lines below are not shown).
; Per the signature above: r0 = src, r5 = w0.
859 cglobal weight_pp, 6, 7, 6
861 shl r5d, 6 ; r5d = w0<<6
864 or r6d, r5d ; assuming both (w0<<6) and round are using maximum of 16 bits each.
866 pshufd m0, m0, 0 ; m0 = [w0<<6, round]
893 pmovzxbw m4, [r0 + 8] ; zero-extend 8 source bytes at src + 8 to words
; weight_pp, ymm path (excerpt: several instructions between the lines below are not shown).
; Per the signature above: r5 = w0.
923 cglobal weight_pp, 6, 7, 6
925 shl r5d, 6 ; r5d = w0<<6
928 or r6d, r5d ; assuming both (w0<<6) and round are using maximum of 16 bits each.
930 pshufd xm0, xm0, 0 ; xm0 = [w0<<6, round]
931 vinserti128 m0, m0, xm0, 1 ; copy the low lane into the high lane. TODO: the documentation says pshufd + vinserti128 can be replaced by vpbroadcastd m0, xm0, but that caused a build problem; needs investigation
955 vextracti128 xm4, m3, 1 ; xm4 = high 128-bit lane of m3
972 ;-------------------------------------------------------------------------------------------------------------------------------------------------
973 ;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
974 ;-------------------------------------------------------------------------------------------------------------------------------------------------
977 cglobal weight_sp, 6, 7+2, 7
980 %else ; ARCH_X86_64 = 0
981 cglobal weight_sp, 6, 7, 7, 0-(2*4)
982 %define tmp_r0 [(rsp + 0 * 4)]
983 %define tmp_r1 [(rsp + 1 * 4)]
986 movd m0, r6m ; m0 = [w0]
988 movd m1, r7m ; m1 = [round]
990 pshufd m0, m0, 0 ; m0 = [w0 round]
992 movd m1, r8m ; m1 = [shift]
995 pshufd m2, m2, 0 ; m2 =[offset]
1005 ; save old src and dst
1056 ;-----------------------------------------------------------------
1057 ; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
1058 ;-----------------------------------------------------------------
1060 cglobal transpose4, 3, 3, 4, dest, src, stride
1061 %if HIGH_BIT_DEPTH == 1
1065 movh m2, [r1 + 2 * r2]
1066 lea r1, [r1 + 2 * r2]
1070 punpckhdq m1, m0, m2
1074 %else ;HIGH_BIT_DEPTH == 0
1077 movd m2, [r1 + 2 * r2]
1078 lea r1, [r1 + 2 * r2]
1088 ;-----------------------------------------------------------------
1089 ; void transpose_8x8(pixel *dst, pixel *src, intptr_t stride)
1090 ;-----------------------------------------------------------------
1091 %if HIGH_BIT_DEPTH == 1
1092 %if ARCH_X86_64 == 1
1094 cglobal transpose8, 3, 5, 5
1097 lea r4, [r1 + 4 * r2]
1099 vinserti128 m0, m0, [r4], 1
1101 vinserti128 m1, m1, [r4 + r2], 1
1102 movu xm2, [r1 + 2 * r2]
1103 vinserti128 m2, m2, [r4 + 2 * r2], 1
1105 vinserti128 m3, m3, [r4 + r3], 1
1107 punpcklwd m4, m0, m1 ;[1 - 4][row1row2;row5row6]
1108 punpckhwd m0, m1 ;[5 - 8][row1row2;row5row6]
1110 punpcklwd m1, m2, m3 ;[1 - 4][row3row4;row7row8]
1111 punpckhwd m2, m3 ;[5 - 8][row3row4;row7row8]
1113 punpckldq m3, m4, m1 ;[1 - 2][row1row2row3row4;row5row6row7row8]
1114 punpckhdq m4, m1 ;[3 - 4][row1row2row3row4;row5row6row7row8]
1116 punpckldq m1, m0, m2 ;[5 - 6][row1row2row3row4;row5row6row7row8]
1117 punpckhdq m0, m2 ;[7 - 8][row1row2row3row4;row5row6row7row8]
1119 vpermq m3, m3, 0xD8 ;[1 ; 2][row1row2row3row4row5row6row7row8]
1120 vpermq m4, m4, 0xD8 ;[3 ; 4][row1row2row3row4row5row6row7row8]
1121 vpermq m1, m1, 0xD8 ;[5 ; 6][row1row2row3row4row5row6row7row8]
1122 vpermq m0, m0, 0xD8 ;[7 ; 8][row1row2row3row4row5row6row7row8]
1124 movu [r0 + 0 * 32], m3
1125 movu [r0 + 1 * 32], m4
1126 movu [r0 + 2 * 32], m1
1127 movu [r0 + 3 * 32], m0
1132 %macro TRANSPOSE_4x4 1
1135 movh m2, [r1 + 2 * r2]
1136 lea r1, [r1 + 2 * r2]
1140 punpckhdq m1, m0, m2
1143 movhps [r0 + %1], m0
1144 movh [r0 + 2 * %1], m1
1145 lea r0, [r0 + 2 * %1]
1146 movhps [r0 + %1], m1
1148 cglobal transpose8_internal
1150 lea r1, [r1 + 2 * r2]
1153 lea r1, [r1 + 2 * r2]
1155 lea r1, [r1 + r2 * 8 + 8]
1157 lea r0, [r3 + 4 * r5]
1159 lea r1, [r1 + 2 * r2]
1160 lea r0, [r3 + 8 + 4 * r5]
1163 cglobal transpose8, 3, 6, 4, dest, src, stride
1167 call transpose8_internal
1169 %else ;HIGH_BIT_DEPTH == 0
1170 %if ARCH_X86_64 == 1
; transpose8, 8-bit pixels, ymm path (excerpt: the movq loads that pair with the
; movhps lines below are not shown). r1 = src, r2 = stride per the signature above.
; Bracket notation in the comments: [output columns; per 128-bit lane][source rows].
1172 cglobal transpose8, 3, 4, 4
1175 movhps xm0, [r1 + 2 * r2]
1177 movhps xm1, [r1 + r3]
1178 lea r1, [r1 + 4 * r2]
1180 movhps xm2, [r1 + 2 * r2]
1182 movhps xm3, [r1 + r3]
1184 vinserti128 m0, m0, xm2, 1 ;[row1 row3 row5 row7]
1185 vinserti128 m1, m1, xm3, 1 ;[row2 row4 row6 row8]
1187 punpcklbw m2, m0, m1 ;[1 - 8; 1 - 8][row1row2; row5row6]
1188 punpckhbw m0, m1 ;[1 - 8; 1 - 8][row3row4; row7row8]
1190 punpcklwd m1, m2, m0 ;[1 - 4; 1 - 4][row1row2row3row4; row5row6row7row8]
1191 punpckhwd m2, m0 ;[5 - 8; 5 - 8][row1row2row3row4; row5row6row7row8]
1193 mova m0, [trans8_shuf]
; vpermd with trans8_shuf interleaves the dwords of the two lanes, joining rows 1-4 with rows 5-8 per column
1195 vpermd m1, m0, m1 ;[1 - 2; 3 - 4][row1row2row3row4row5row6row7row8]
1196 vpermd m2, m0, m2 ;[5 - 6; 7 - 8][row1row2row3row4row5row6row7row8]
1204 cglobal transpose8, 3, 5, 8, dest, src, stride
1211 movh m4, [r1 + 4 * r2]
1212 lea r1, [r1 + 4 * r2]
1222 punpckhwd m1, m0, m2
1224 punpckhwd m5, m4, m6
1226 punpckhdq m2, m0, m4
1228 punpckhdq m3, m1, m5
1238 %macro TRANSPOSE_8x8 1
1242 movh m2, [r1 + 2 * r2]
1243 lea r1, [r1 + 2 * r2]
1245 movh m4, [r1 + 2 * r2]
1246 lea r1, [r1 + 2 * r2]
1248 movh m6, [r1 + 2 * r2]
1249 lea r1, [r1 + 2 * r2]
1257 punpckhwd m1, m0, m2
1259 punpckhwd m5, m4, m6
1261 punpckhdq m2, m0, m4
1263 punpckhdq m3, m1, m5
1267 movhps [r0 + %1], m0
1268 movh [r0 + 2 * %1], m2
1269 lea r0, [r0 + 2 * %1]
1270 movhps [r0 + %1], m2
1271 movh [r0 + 2 * %1], m1
1272 lea r0, [r0 + 2 * %1]
1273 movhps [r0 + %1], m1
1274 movh [r0 + 2 * %1], m3
1275 lea r0, [r0 + 2 * %1]
1276 movhps [r0 + %1], m3
1281 ;-----------------------------------------------------------------
1282 ; void transpose_16x16(pixel *dst, pixel *src, intptr_t stride)
1283 ;-----------------------------------------------------------------
1284 %if HIGH_BIT_DEPTH == 1
1285 %if ARCH_X86_64 == 1
1287 cglobal transpose16x8_internal
1290 movu m2, [r1 + 2 * r2]
1292 lea r1, [r1 + 4 * r2]
1296 movu m6, [r1 + 2 * r2]
1299 punpcklwd m8, m0, m1 ;[1 - 4; 9 - 12][1 2]
1300 punpckhwd m0, m1 ;[5 - 8; 13 -16][1 2]
1302 punpcklwd m1, m2, m3 ;[1 - 4; 9 - 12][3 4]
1303 punpckhwd m2, m3 ;[5 - 8; 13 -16][3 4]
1305 punpcklwd m3, m4, m5 ;[1 - 4; 9 - 12][5 6]
1306 punpckhwd m4, m5 ;[5 - 8; 13 -16][5 6]
1308 punpcklwd m5, m6, m7 ;[1 - 4; 9 - 12][7 8]
1309 punpckhwd m6, m7 ;[5 - 8; 13 -16][7 8]
1311 punpckldq m7, m8, m1 ;[1 - 2; 9 - 10][1 2 3 4]
1312 punpckhdq m8, m1 ;[3 - 4; 11 - 12][1 2 3 4]
1314 punpckldq m1, m3, m5 ;[1 - 2; 9 - 10][5 6 7 8]
1315 punpckhdq m3, m5 ;[3 - 4; 11 - 12][5 6 7 8]
1317 punpckldq m5, m0, m2 ;[5 - 6; 13 - 14][1 2 3 4]
1318 punpckhdq m0, m2 ;[7 - 8; 15 - 16][1 2 3 4]
1320 punpckldq m2, m4, m6 ;[5 - 6; 13 - 14][5 6 7 8]
1321 punpckhdq m4, m6 ;[7 - 8; 15 - 16][5 6 7 8]
1323 punpcklqdq m6, m7, m1 ;[1 ; 9 ][1 2 3 4 5 6 7 8]
1324 punpckhqdq m7, m1 ;[2 ; 10][1 2 3 4 5 6 7 8]
1326 punpcklqdq m1, m8, m3 ;[3 ; 11][1 2 3 4 5 6 7 8]
1327 punpckhqdq m8, m3 ;[4 ; 12][1 2 3 4 5 6 7 8]
1329 punpcklqdq m3, m5, m2 ;[5 ; 13][1 2 3 4 5 6 7 8]
1330 punpckhqdq m5, m2 ;[6 ; 14][1 2 3 4 5 6 7 8]
1332 punpcklqdq m2, m0, m4 ;[7 ; 15][1 2 3 4 5 6 7 8]
1333 punpckhqdq m0, m4 ;[8 ; 16][1 2 3 4 5 6 7 8]
1335 movu [r0 + 0 * 32], xm6
1336 vextracti128 [r0 + 8 * 32], m6, 1
1337 movu [r0 + 1 * 32], xm7
1338 vextracti128 [r0 + 9 * 32], m7, 1
1339 movu [r0 + 2 * 32], xm1
1340 vextracti128 [r0 + 10 * 32], m1, 1
1341 movu [r0 + 3 * 32], xm8
1342 vextracti128 [r0 + 11 * 32], m8, 1
1343 movu [r0 + 4 * 32], xm3
1344 vextracti128 [r0 + 12 * 32], m3, 1
1345 movu [r0 + 5 * 32], xm5
1346 vextracti128 [r0 + 13 * 32], m5, 1
1347 movu [r0 + 6 * 32], xm2
1348 vextracti128 [r0 + 14 * 32], m2, 1
1349 movu [r0 + 7 * 32], xm0
1350 vextracti128 [r0 + 15 * 32], m0, 1
1353 cglobal transpose16, 3, 4, 9
1356 call transpose16x8_internal
1357 lea r1, [r1 + 4 * r2]
1359 call transpose16x8_internal
1363 cglobal transpose16, 3, 7, 4, dest, src, stride
1369 call transpose8_internal
1370 lea r1, [r1 - 8 + 2 * r2]
1373 call transpose8_internal
1375 lea r0, [r6 + 8 * r5]
1377 call transpose8_internal
1378 lea r1, [r1 - 8 + 2 * r2]
1379 lea r0, [r6 + 8 * r5 + 16]
1381 call transpose8_internal
1383 %else ;HIGH_BIT_DEPTH == 0
1384 %if ARCH_X86_64 == 1
1386 cglobal transpose16, 3, 5, 9
1388 lea r4, [r1 + 8 * r2]
1392 movu xm2, [r1 + 2 * r2]
1394 vinserti128 m0, m0, [r4], 1
1395 vinserti128 m1, m1, [r4 + r2], 1
1396 vinserti128 m2, m2, [r4 + 2 * r2], 1
1397 vinserti128 m3, m3, [r4 + r3], 1
1398 lea r1, [r1 + 4 * r2]
1399 lea r4, [r4 + 4 * r2]
1403 movu xm6, [r1 + 2 * r2]
1405 vinserti128 m4, m4, [r4], 1
1406 vinserti128 m5, m5, [r4 + r2], 1
1407 vinserti128 m6, m6, [r4 + 2 * r2], 1
1408 vinserti128 m7, m7, [r4 + r3], 1
1410 punpcklbw m8, m0, m1 ;[1 - 8 ; 1 - 8 ][1 2 9 10]
1411 punpckhbw m0, m1 ;[9 - 16; 9 - 16][1 2 9 10]
1413 punpcklbw m1, m2, m3 ;[1 - 8 ; 1 - 8 ][3 4 11 12]
1414 punpckhbw m2, m3 ;[9 - 16; 9 - 16][3 4 11 12]
1416 punpcklbw m3, m4, m5 ;[1 - 8 ; 1 - 8 ][5 6 13 14]
1417 punpckhbw m4, m5 ;[9 - 16; 9 - 16][5 6 13 14]
1419 punpcklbw m5, m6, m7 ;[1 - 8 ; 1 - 8 ][7 8 15 16]
1420 punpckhbw m6, m7 ;[9 - 16; 9 - 16][7 8 15 16]
1422 punpcklwd m7, m8, m1 ;[1 - 4 ; 1 - 4][1 2 3 4 9 10 11 12]
1423 punpckhwd m8, m1 ;[5 - 8 ; 5 - 8][1 2 3 4 9 10 11 12]
1425 punpcklwd m1, m3, m5 ;[1 - 4 ; 1 - 4][5 6 7 8 13 14 15 16]
1426 punpckhwd m3, m5 ;[5 - 8 ; 5 - 8][5 6 7 8 13 14 15 16]
1428 punpcklwd m5, m0, m2 ;[9 - 12; 9 - 12][1 2 3 4 9 10 11 12]
1429 punpckhwd m0, m2 ;[13- 16; 13 - 16][1 2 3 4 9 10 11 12]
1431 punpcklwd m2, m4, m6 ;[9 - 12; 9 - 12][5 6 7 8 13 14 15 16]
1432 punpckhwd m4, m6 ;[13- 16; 13 - 16][5 6 7 8 13 14 15 16]
1434 punpckldq m6, m7, m1 ;[1 - 2 ; 1 - 2][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1435 punpckhdq m7, m1 ;[3 - 4 ; 3 - 4][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1437 punpckldq m1, m8, m3 ;[5 - 6 ; 5 - 6][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1438 punpckhdq m8, m3 ;[7 - 8 ; 7 - 8][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1440 punpckldq m3, m5, m2 ;[9 - 10; 9 - 10][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1441 punpckhdq m5, m2 ;[11- 12; 11 - 12][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1443 punpckldq m2, m0, m4 ;[13- 14; 13 - 14][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1444 punpckhdq m0, m4 ;[15- 16; 15 - 16][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1455 movu [r0 + 0 * 16], m6
1456 movu [r0 + 2 * 16], m7
1457 movu [r0 + 4 * 16], m1
1458 movu [r0 + 6 * 16], m8
1459 movu [r0 + 8 * 16], m3
1460 movu [r0 + 10 * 16], m5
1461 movu [r0 + 12 * 16], m2
1462 movu [r0 + 14 * 16], m0
1466 cglobal transpose16, 3, 5, 8, dest, src, stride
1470 lea r1, [r1 + 2 * r2]
1474 lea r0, [r3 + 8 * 16]
1476 lea r1, [r1 + 2 * r2]
1477 lea r0, [r3 + 8 * 16 + 8]
1482 cglobal transpose16_internal
1484 lea r1, [r1 + 2 * r2]
1487 lea r1, [r1 + 2 * r2]
1489 lea r1, [r1 + r2 * 8]
1490 lea r1, [r1 + r2 * 8 + 8]
1492 lea r0, [r5 + 8 * r6]
1494 lea r1, [r1 + 2 * r2]
1495 lea r0, [r5 + 8 * r6 + 8]
1499 ;-----------------------------------------------------------------
1500 ; void transpose_32x32(pixel *dst, pixel *src, intptr_t stride)
1501 ;-----------------------------------------------------------------
1502 %if HIGH_BIT_DEPTH == 1
1503 %if ARCH_X86_64 == 1
1505 cglobal transpose8x32_internal
1509 movu m3, [r1 + r2 + 32]
1510 movu m4, [r1 + 2 * r2]
1511 movu m5, [r1 + 2 * r2 + 32]
1513 movu m7, [r1 + r3 + 32]
1514 lea r1, [r1 + 4 * r2]
1516 punpcklwd m8, m0, m2 ;[1 - 4; 9 - 12][1 2]
1517 punpckhwd m0, m2 ;[5 - 8; 13 - 16][1 2]
1519 punpcklwd m2, m4, m6 ;[1 - 4; 9 - 12][3 4]
1520 punpckhwd m4, m6 ;[5 - 8; 13 - 16][3 4]
1522 punpcklwd m6, m1, m3 ;[17 - 20; 25 - 28][1 2]
1523 punpckhwd m1, m3 ;[21 - 24; 29 - 32][1 2]
1525 punpcklwd m3, m5, m7 ;[17 - 20; 25 - 28][3 4]
1526 punpckhwd m5, m7 ;[21 - 24; 29 - 32][3 4]
1528 punpckldq m7, m8, m2 ;[1 - 2; 9 - 10][1 2 3 4]
1529 punpckhdq m8, m2 ;[3 - 4; 11 - 12][1 2 3 4]
1531 punpckldq m2, m0, m4 ;[5 - 6; 13 - 14][1 2 3 4]
1532 punpckhdq m0, m4 ;[7 - 8; 15 - 16][1 2 3 4]
1534 punpckldq m4, m6, m3 ;[17 - 18; 25 - 26][1 2 3 4]
1535 punpckhdq m6, m3 ;[19 - 20; 27 - 28][1 2 3 4]
1537 punpckldq m3, m1, m5 ;[21 - 22; 29 - 30][1 2 3 4]
1538 punpckhdq m1, m5 ;[23 - 24; 31 - 32][1 2 3 4]
1540 movq [r0 + 0 * 64], xm7
1541 movhps [r0 + 1 * 64], xm7
1542 vextracti128 xm5, m7, 1
1543 movq [r0 + 8 * 64], xm5
1544 movhps [r0 + 9 * 64], xm5
1549 movu m11, [r1 + r2 + 32]
1550 movu m12, [r1 + 2 * r2]
1551 movu m13, [r1 + 2 * r2 + 32]
1553 movu m15, [r1 + r3 + 32]
1555 punpcklwd m5, m7, m10 ;[1 - 4; 9 - 12][5 6]
1556 punpckhwd m7, m10 ;[5 - 8; 13 - 16][5 6]
1558 punpcklwd m10, m12, m14 ;[1 - 4; 9 - 12][7 8]
1559 punpckhwd m12, m14 ;[5 - 8; 13 - 16][7 8]
1561 punpcklwd m14, m9, m11 ;[17 - 20; 25 - 28][5 6]
1562 punpckhwd m9, m11 ;[21 - 24; 29 - 32][5 6]
1564 punpcklwd m11, m13, m15 ;[17 - 20; 25 - 28][7 8]
1565 punpckhwd m13, m15 ;[21 - 24; 29 - 32][7 8]
1567 punpckldq m15, m5, m10 ;[1 - 2; 9 - 10][5 6 7 8]
1568 punpckhdq m5, m10 ;[3 - 4; 11 - 12][5 6 7 8]
1570 punpckldq m10, m7, m12 ;[5 - 6; 13 - 14][5 6 7 8]
1571 punpckhdq m7, m12 ;[7 - 8; 15 - 16][5 6 7 8]
1573 punpckldq m12, m14, m11 ;[17 - 18; 25 - 26][5 6 7 8]
1574 punpckhdq m14, m11 ;[19 - 20; 27 - 28][5 6 7 8]
1576 punpckldq m11, m9, m13 ;[21 - 22; 29 - 30][5 6 7 8]
1577 punpckhdq m9, m13 ;[23 - 24; 31 - 32][5 6 7 8]
1579 movq [r0 + 0 * 64 + 8], xm15
1580 movhps [r0 + 1 * 64 + 8], xm15
1581 vextracti128 xm13, m15, 1
1582 movq [r0 + 8 * 64 + 8], xm13
1583 movhps [r0 + 9 * 64 + 8], xm13
1585 punpcklqdq m13, m8, m5 ;[3 ; 11][1 2 3 4 5 6 7 8]
1586 punpckhqdq m8, m5 ;[4 ; 12][1 2 3 4 5 6 7 8]
1588 punpcklqdq m5, m2, m10 ;[5 ; 13][1 2 3 4 5 6 7 8]
1589 punpckhqdq m2, m10 ;[6 ; 14][1 2 3 4 5 6 7 8]
1591 punpcklqdq m10, m0, m7 ;[7 ; 15][1 2 3 4 5 6 7 8]
1592 punpckhqdq m0, m7 ;[8 ; 16][1 2 3 4 5 6 7 8]
1594 punpcklqdq m7, m4, m12 ;[17 ; 25][1 2 3 4 5 6 7 8]
1595 punpckhqdq m4, m12 ;[18 ; 26][1 2 3 4 5 6 7 8]
1597 punpcklqdq m12, m6, m14 ;[19 ; 27][1 2 3 4 5 6 7 8]
1598 punpckhqdq m6, m14 ;[20 ; 28][1 2 3 4 5 6 7 8]
1600 punpcklqdq m14, m3, m11 ;[21 ; 29][1 2 3 4 5 6 7 8]
1601 punpckhqdq m3, m11 ;[22 ; 30][1 2 3 4 5 6 7 8]
1603 punpcklqdq m11, m1, m9 ;[23 ; 31][1 2 3 4 5 6 7 8]
1604 punpckhqdq m1, m9 ;[24 ; 32][1 2 3 4 5 6 7 8]
1606 movu [r0 + 2 * 64], xm13
1607 vextracti128 [r0 + 10 * 64], m13, 1
1609 movu [r0 + 3 * 64], xm8
1610 vextracti128 [r0 + 11 * 64], m8, 1
1612 movu [r0 + 4 * 64], xm5
1613 vextracti128 [r0 + 12 * 64], m5, 1
1615 movu [r0 + 5 * 64], xm2
1616 vextracti128 [r0 + 13 * 64], m2, 1
1618 movu [r0 + 6 * 64], xm10
1619 vextracti128 [r0 + 14 * 64], m10, 1
1621 movu [r0 + 7 * 64], xm0
1622 vextracti128 [r0 + 15 * 64], m0, 1
1624 movu [r0 + 16 * 64], xm7
1625 vextracti128 [r0 + 24 * 64], m7, 1
1627 movu [r0 + 17 * 64], xm4
1628 vextracti128 [r0 + 25 * 64], m4, 1
1630 movu [r0 + 18 * 64], xm12
1631 vextracti128 [r0 + 26 * 64], m12, 1
1633 movu [r0 + 19 * 64], xm6
1634 vextracti128 [r0 + 27 * 64], m6, 1
1636 movu [r0 + 20 * 64], xm14
1637 vextracti128 [r0 + 28 * 64], m14, 1
1639 movu [r0 + 21 * 64], xm3
1640 vextracti128 [r0 + 29 * 64], m3, 1
1642 movu [r0 + 22 * 64], xm11
1643 vextracti128 [r0 + 30 * 64], m11, 1
1645 movu [r0 + 23 * 64], xm1
1646 vextracti128 [r0 + 31 * 64], m1, 1
1649 cglobal transpose32, 3, 4, 16
1652 call transpose8x32_internal
1654 lea r1, [r1 + 4 * r2]
1655 call transpose8x32_internal
1657 lea r1, [r1 + 4 * r2]
1658 call transpose8x32_internal
1660 lea r1, [r1 + 4 * r2]
1661 call transpose8x32_internal
1665 cglobal transpose32, 3, 7, 4, dest, src, stride
1671 call transpose8_internal
1672 lea r1, [r1 - 8 + 2 * r2]
1675 call transpose8_internal
1676 lea r1, [r1 - 8 + 2 * r2]
1679 call transpose8_internal
1680 lea r1, [r1 - 8 + 2 * r2]
1683 call transpose8_internal
1685 lea r0, [r6 + 8 * 64]
1687 call transpose8_internal
1688 lea r1, [r1 - 8 + 2 * r2]
1689 lea r0, [r6 + 8 * 64 + 16]
1691 call transpose8_internal
1692 lea r1, [r1 - 8 + 2 * r2]
1693 lea r0, [r6 + 8 * 64 + 32]
1695 call transpose8_internal
1696 lea r1, [r1 - 8 + 2 * r2]
1697 lea r0, [r6 + 8 * 64 + 48]
1699 call transpose8_internal
1701 lea r0, [r6 + 16 * 64]
1703 call transpose8_internal
1704 lea r1, [r1 - 8 + 2 * r2]
1705 lea r0, [r6 + 16 * 64 + 16]
1707 call transpose8_internal
1708 lea r1, [r1 - 8 + 2 * r2]
1709 lea r0, [r6 + 16 * 64 + 32]
1711 call transpose8_internal
1712 lea r1, [r1 - 8 + 2 * r2]
1713 lea r0, [r6 + 16 * 64 + 48]
1715 call transpose8_internal
1717 lea r0, [r6 + 24 * 64]
1719 call transpose8_internal
1720 lea r1, [r1 - 8 + 2 * r2]
1721 lea r0, [r6 + 24 * 64 + 16]
1723 call transpose8_internal
1724 lea r1, [r1 - 8 + 2 * r2]
1725 lea r0, [r6 + 24 * 64 + 32]
1727 call transpose8_internal
1728 lea r1, [r1 - 8 + 2 * r2]
1729 lea r0, [r6 + 24 * 64 + 48]
1731 call transpose8_internal
1733 %else ;HIGH_BIT_DEPTH == 0
1735 cglobal transpose32, 3, 7, 8, dest, src, stride
1740 call transpose16_internal
1741 lea r1, [r1 - 8 + 2 * r2]
1744 call transpose16_internal
1746 lea r0, [r3 + 16 * 32]
1748 call transpose16_internal
1749 lea r1, [r1 - 8 + 2 * r2]
1750 lea r0, [r3 + 16 * 32 + 16]
1752 call transpose16_internal
1755 %if ARCH_X86_64 == 1
; transpose32, 8-bit pixels, ymm path (excerpt: many loads and the loop control are not shown).
; r0 = dst, r1 = src, r2 = stride per the signature above; destination rows are 32 bytes apart.
; Bracket notation in the comments: [output columns, low lane ; high lane][source rows].
; Sixteen source rows are processed per pass: rows 1-8 and rows 9-16 are unpacked
; separately, then merged with punpck?qdq so every store writes 16 transposed bytes.
1757 cglobal transpose32, 3, 5, 16
1764 movu m2, [r1 + 2 * r2]
1766 lea r1, [r1 + 4 * r2]
1770 movu m6, [r1 + 2 * r2]
; bytes -> words: interleave adjacent source rows
1773 punpcklbw m8, m0, m1 ;[1 - 8 ; 17 - 24][1 2]
1774 punpckhbw m0, m1 ;[9 - 16; 25 - 32][1 2]
1776 punpcklbw m1, m2, m3 ;[1 - 8 ; 17 - 24][3 4]
1777 punpckhbw m2, m3 ;[9 - 16; 25 - 32][3 4]
1779 punpcklbw m3, m4, m5 ;[1 - 8 ; 17 - 24][5 6]
1780 punpckhbw m4, m5 ;[9 - 16; 25 - 32][5 6]
1782 punpcklbw m5, m6, m7 ;[1 - 8 ; 17 - 24][7 8]
1783 punpckhbw m6, m7 ;[9 - 16; 25 - 32][7 8]
; words -> dwords: four source rows per column
1785 punpcklwd m7, m8, m1 ;[1 - 4 ; 17 - 20][1 2 3 4]
1786 punpckhwd m8, m1 ;[5 - 8 ; 21 - 24][1 2 3 4]
1788 punpcklwd m1, m3, m5 ;[1 - 4 ; 17 - 20][5 6 7 8]
1789 punpckhwd m3, m5 ;[5 - 8 ; 21 - 24][5 6 7 8]
1791 punpcklwd m5, m0, m2 ;[9 - 12; 25 - 28][1 2 3 4]
1792 punpckhwd m0, m2 ;[13- 16; 29 - 32][1 2 3 4]
1794 punpcklwd m2, m4, m6 ;[9 - 12; 25 - 28][5 6 7 8]
1795 punpckhwd m4, m6 ;[13- 16; 29 - 32][5 6 7 8]
; dwords -> qwords: eight source rows per column
1797 punpckldq m6, m7, m1 ;[1 - 2 ; 17 - 18][1 2 3 4 5 6 7 8]
1798 punpckhdq m7, m1 ;[3 - 4 ; 19 - 20][1 2 3 4 5 6 7 8]
1800 punpckldq m1, m8, m3 ;[5 - 6 ; 21 - 22][1 2 3 4 5 6 7 8]
1801 punpckhdq m8, m3 ;[7 - 8 ; 23 - 24][1 2 3 4 5 6 7 8]
1803 punpckldq m3, m5, m2 ;[9 - 10; 25 - 26][1 2 3 4 5 6 7 8]
1804 punpckhdq m5, m2 ;[11- 12; 27 - 28][1 2 3 4 5 6 7 8]
1806 punpckldq m2, m0, m4 ;[13- 14; 29 - 30][1 2 3 4 5 6 7 8]
1807 punpckhdq m0, m4 ;[15- 16; 31 - 32][1 2 3 4 5 6 7 8]
; columns 1-2 / 17-18 are stored now as 8-byte halves, which frees m6 for reuse below
1809 movq [r0 + 0 * 32], xm6
1810 movhps [r0 + 1 * 32], xm6
1811 vextracti128 xm4, m6, 1
1812 movq [r0 + 16 * 32], xm4
1813 movhps [r0 + 17 * 32], xm4
; second group of eight source rows (rows 9-16)
1815 lea r1, [r1 + 4 * r2]
1818 movu m11, [r1 + 2 * r2]
1820 lea r1, [r1 + 4 * r2]
1824 movu m15, [r1 + 2 * r2]
1827 punpcklbw m4, m9, m10 ;[1 - 8 ; 17 - 24][9 10]
1828 punpckhbw m9, m10 ;[9 - 16; 25 - 32][9 10]
1830 punpcklbw m10, m11, m12 ;[1 - 8 ; 17 - 24][11 12]
1831 punpckhbw m11, m12 ;[9 - 16; 25 - 32][11 12]
1833 punpcklbw m12, m13, m14 ;[1 - 8 ; 17 - 24][13 14]
1834 punpckhbw m13, m14 ;[9 - 16; 25 - 32][13 14]
1836 punpcklbw m14, m15, m6 ;[1 - 8 ; 17 - 24][15 16]
1837 punpckhbw m15, m6 ;[9 - 16; 25 - 32][15 16]
1839 punpcklwd m6, m4, m10 ;[1 - 4 ; 17 - 20][9 10 11 12]
1840 punpckhwd m4, m10 ;[5 - 8 ; 21 - 24][9 10 11 12]
1842 punpcklwd m10, m12, m14 ;[1 - 4 ; 17 - 20][13 14 15 16]
1843 punpckhwd m12, m14 ;[5 - 8 ; 21 - 24][13 14 15 16]
1845 punpcklwd m14, m9, m11 ;[9 - 12; 25 - 28][9 10 11 12]
1846 punpckhwd m9, m11 ;[13- 16; 29 - 32][9 10 11 12]
1848 punpcklwd m11, m13, m15 ;[9 - 12; 25 - 28][13 14 15 16]
1849 punpckhwd m13, m15 ;[13- 16; 29 - 32][13 14 15 16]
1851 punpckldq m15, m6, m10 ;[1 - 2 ; 17 - 18][9 10 11 12 13 14 15 16]
1852 punpckhdq m6, m10 ;[3 - 4 ; 19 - 20][9 10 11 12 13 14 15 16]
1854 punpckldq m10, m4, m12 ;[5 - 6 ; 21 - 22][9 10 11 12 13 14 15 16]
1855 punpckhdq m4, m12 ;[7 - 8 ; 23 - 24][9 10 11 12 13 14 15 16]
1857 punpckldq m12, m14, m11 ;[9 - 10; 25 - 26][9 10 11 12 13 14 15 16]
1858 punpckhdq m14, m11 ;[11- 12; 27 - 28][9 10 11 12 13 14 15 16]
1860 punpckldq m11, m9, m13 ;[13- 14; 29 - 30][9 10 11 12 13 14 15 16]
1861 punpckhdq m9, m13 ;[15- 16; 31 - 32][9 10 11 12 13 14 15 16]
; qwords -> dqwords: join rows 1-8 with rows 9-16 for each output column
1864 punpcklqdq m13, m7, m6 ;[3 ; 19][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1865 punpckhqdq m7, m6 ;[4 ; 20][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1867 punpcklqdq m6, m1, m10 ;[5 ; 21][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1868 punpckhqdq m1, m10 ;[6 ; 22][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1870 punpcklqdq m10, m8, m4 ;[7 ; 23][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1871 punpckhqdq m8, m4 ;[8 ; 24][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1873 punpcklqdq m4, m3, m12 ;[9 ; 25][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1874 punpckhqdq m3, m12 ;[10; 26][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1876 punpcklqdq m12, m5, m14 ;[11; 27][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1877 punpckhqdq m5, m14 ;[12; 28][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1879 punpcklqdq m14, m2, m11 ;[13; 29][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1880 punpckhqdq m2, m11 ;[14; 30][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1882 punpcklqdq m11, m0, m9 ;[15; 31][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1883 punpckhqdq m0, m9 ;[16; 32][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
; second 8-byte halves of columns 1-2 / 17-18 (rows 9-16), next to the halves stored earlier
1885 movq [r0 + 0 * 32 + 8], xm15
1886 movhps [r0 + 1 * 32 + 8], xm15
1887 vextracti128 xm9, m15, 1
1888 movq [r0 + 16 * 32 + 8], xm9
1889 movhps [r0 + 17 * 32 + 8], xm9
; remaining columns: the low lane goes to destination row k, the high lane to row k + 16
1891 movu [r0 + 2 * 32], xm13
1892 vextracti128 [r0 + 18 * 32], m13, 1
1894 movu [r0 + 3 * 32], xm7
1895 vextracti128 [r0 + 19 * 32], m7, 1
1897 movu [r0 + 4 * 32], xm6
1898 vextracti128 [r0 + 20 * 32], m6, 1
1900 movu [r0 + 5 * 32], xm1
1901 vextracti128 [r0 + 21 * 32], m1, 1
1903 movu [r0 + 6 * 32], xm10
1904 vextracti128 [r0 + 22 * 32], m10, 1
1906 movu [r0 + 7 * 32], xm8
1907 vextracti128 [r0 + 23 * 32], m8, 1
1909 movu [r0 + 8 * 32], xm4
1910 vextracti128 [r0 + 24 * 32], m4, 1
1912 movu [r0 + 9 * 32], xm3
1913 vextracti128 [r0 + 25 * 32], m3, 1
1915 movu [r0 + 10 * 32], xm12
1916 vextracti128 [r0 + 26 * 32], m12, 1
1918 movu [r0 + 11 * 32], xm5
1919 vextracti128 [r0 + 27 * 32], m5, 1
1921 movu [r0 + 12 * 32], xm14
1922 vextracti128 [r0 + 28 * 32], m14, 1
1924 movu [r0 + 13 * 32], xm2
1925 vextracti128 [r0 + 29 * 32], m2, 1
1927 movu [r0 + 14 * 32], xm11
1928 vextracti128 [r0 + 30 * 32], m11, 1
1930 movu [r0 + 15 * 32], xm0
1931 vextracti128 [r0 + 31 * 32], m0, 1
1934 lea r1, [r1 + 4 * r2]
1941 ;-----------------------------------------------------------------
1942 ; void transpose_64x64(pixel *dst, pixel *src, intptr_t stride)
1943 ;-----------------------------------------------------------------
1944 %if HIGH_BIT_DEPTH == 1
1945 %if ARCH_X86_64 == 1
1947 cglobal transpose8x32_64_internal
1951 movu m3, [r1 + r2 + 32]
1952 movu m4, [r1 + 2 * r2]
1953 movu m5, [r1 + 2 * r2 + 32]
1955 movu m7, [r1 + r3 + 32]
1956 lea r1, [r1 + 4 * r2]
1958 punpcklwd m8, m0, m2 ;[1 - 4; 9 - 12][1 2]
1959 punpckhwd m0, m2 ;[5 - 8; 13 - 16][1 2]
1961 punpcklwd m2, m4, m6 ;[1 - 4; 9 - 12][3 4]
1962 punpckhwd m4, m6 ;[5 - 8; 13 - 16][3 4]
1964 punpcklwd m6, m1, m3 ;[17 - 20; 25 - 28][1 2]
1965 punpckhwd m1, m3 ;[21 - 24; 29 - 32][1 2]
1967 punpcklwd m3, m5, m7 ;[17 - 20; 25 - 28][3 4]
1968 punpckhwd m5, m7 ;[21 - 24; 29 - 32][3 4]
1970 punpckldq m7, m8, m2 ;[1 - 2; 9 - 10][1 2 3 4]
1971 punpckhdq m8, m2 ;[3 - 4; 11 - 12][1 2 3 4]
1973 punpckldq m2, m0, m4 ;[5 - 6; 13 - 14][1 2 3 4]
1974 punpckhdq m0, m4 ;[7 - 8; 15 - 16][1 2 3 4]
1976 punpckldq m4, m6, m3 ;[17 - 18; 25 - 26][1 2 3 4]
1977 punpckhdq m6, m3 ;[19 - 20; 27 - 28][1 2 3 4]
1979 punpckldq m3, m1, m5 ;[21 - 22; 29 - 30][1 2 3 4]
1980 punpckhdq m1, m5 ;[23 - 24; 31 - 32][1 2 3 4]
1982 movq [r0 + 0 * 128], xm7
1983 movhps [r0 + 1 * 128], xm7
1984 vextracti128 xm5, m7, 1
1985 movq [r0 + 8 * 128], xm5
1986 movhps [r0 + 9 * 128], xm5
1991 movu m11, [r1 + r2 + 32]
1992 movu m12, [r1 + 2 * r2]
1993 movu m13, [r1 + 2 * r2 + 32]
1995 movu m15, [r1 + r3 + 32]
1997 punpcklwd m5, m7, m10 ;[1 - 4; 9 - 12][5 6]
1998 punpckhwd m7, m10 ;[5 - 8; 13 - 16][5 6]
2000 punpcklwd m10, m12, m14 ;[1 - 4; 9 - 12][7 8]
2001 punpckhwd m12, m14 ;[5 - 8; 13 - 16][7 8]
2003 punpcklwd m14, m9, m11 ;[17 - 20; 25 - 28][5 6]
2004 punpckhwd m9, m11 ;[21 - 24; 29 - 32][5 6]
2006 punpcklwd m11, m13, m15 ;[17 - 20; 25 - 28][7 8]
2007 punpckhwd m13, m15 ;[21 - 24; 29 - 32][7 8]
2009 punpckldq m15, m5, m10 ;[1 - 2; 9 - 10][5 6 7 8]
2010 punpckhdq m5, m10 ;[3 - 4; 11 - 12][5 6 7 8]
2012 punpckldq m10, m7, m12 ;[5 - 6; 13 - 14][5 6 7 8]
2013 punpckhdq m7, m12 ;[7 - 8; 15 - 16][5 6 7 8]
2015 punpckldq m12, m14, m11 ;[17 - 18; 25 - 26][5 6 7 8]
2016 punpckhdq m14, m11 ;[19 - 20; 27 - 28][5 6 7 8]
2018 punpckldq m11, m9, m13 ;[21 - 22; 29 - 30][5 6 7 8]
2019 punpckhdq m9, m13 ;[23 - 24; 31 - 32][5 6 7 8]
2021 movq [r0 + 0 * 128 + 8], xm15
2022 movhps [r0 + 1 * 128 + 8], xm15
2023 vextracti128 xm13, m15, 1
2024 movq [r0 + 8 * 128 + 8], xm13
2025 movhps [r0 + 9 * 128 + 8], xm13
2027 punpcklqdq m13, m8, m5 ;[3 ; 11][1 2 3 4 5 6 7 8]
2028 punpckhqdq m8, m5 ;[4 ; 12][1 2 3 4 5 6 7 8]
2030 punpcklqdq m5, m2, m10 ;[5 ; 13][1 2 3 4 5 6 7 8]
2031 punpckhqdq m2, m10 ;[6 ; 14][1 2 3 4 5 6 7 8]
2033 punpcklqdq m10, m0, m7 ;[7 ; 15][1 2 3 4 5 6 7 8]
2034 punpckhqdq m0, m7 ;[8 ; 16][1 2 3 4 5 6 7 8]
2036 punpcklqdq m7, m4, m12 ;[17 ; 25][1 2 3 4 5 6 7 8]
2037 punpckhqdq m4, m12 ;[18 ; 26][1 2 3 4 5 6 7 8]
2039 punpcklqdq m12, m6, m14 ;[19 ; 27][1 2 3 4 5 6 7 8]
2040 punpckhqdq m6, m14 ;[20 ; 28][1 2 3 4 5 6 7 8]
2042 punpcklqdq m14, m3, m11 ;[21 ; 29][1 2 3 4 5 6 7 8]
2043 punpckhqdq m3, m11 ;[22 ; 30][1 2 3 4 5 6 7 8]
2045 punpcklqdq m11, m1, m9 ;[23 ; 31][1 2 3 4 5 6 7 8]
2046 punpckhqdq m1, m9 ;[24 ; 32][1 2 3 4 5 6 7 8]
2048 movu [r0 + 2 * 128], xm13
2049 vextracti128 [r0 + 10 * 128], m13, 1
2051 movu [r0 + 3 * 128], xm8
2052 vextracti128 [r0 + 11 * 128], m8, 1
2054 movu [r0 + 4 * 128], xm5
2055 vextracti128 [r0 + 12 * 128], m5, 1
2057 movu [r0 + 5 * 128], xm2
2058 vextracti128 [r0 + 13 * 128], m2, 1
2060 movu [r0 + 6 * 128], xm10
2061 vextracti128 [r0 + 14 * 128], m10, 1
2063 movu [r0 + 7 * 128], xm0
2064 vextracti128 [r0 + 15 * 128], m0, 1
2066 movu [r0 + 16 * 128], xm7
2067 vextracti128 [r0 + 24 * 128], m7, 1
2069 movu [r0 + 17 * 128], xm4
2070 vextracti128 [r0 + 25 * 128], m4, 1
2072 movu [r0 + 18 * 128], xm12
2073 vextracti128 [r0 + 26 * 128], m12, 1
2075 movu [r0 + 19 * 128], xm6
2076 vextracti128 [r0 + 27 * 128], m6, 1
2078 movu [r0 + 20 * 128], xm14
2079 vextracti128 [r0 + 28 * 128], m14, 1
2081 movu [r0 + 21 * 128], xm3
2082 vextracti128 [r0 + 29 * 128], m3, 1
2084 movu [r0 + 22 * 128], xm11
2085 vextracti128 [r0 + 30 * 128], m11, 1
2087 movu [r0 + 23 * 128], xm1
2088 vextracti128 [r0 + 31 * 128], m1, 1
2091 cglobal transpose64, 3, 6, 16
2097 call transpose8x32_64_internal
2099 lea r0, [r0 + 32 * 128]
2100 call transpose8x32_64_internal
2103 lea r4, [r1 + 4 * r2]
2105 call transpose8x32_64_internal
2107 lea r0, [r0 + 32 * 128]
2108 call transpose8x32_64_internal
2111 lea r4, [r1 + 4 * r2]
2113 call transpose8x32_64_internal
2115 lea r0, [r0 + 32 * 128]
2116 call transpose8x32_64_internal
2119 lea r4, [r1 + 4 * r2]
2121 call transpose8x32_64_internal
2123 lea r0, [r0 + 32 * 128]
2124 call transpose8x32_64_internal
2127 lea r4, [r1 + 4 * r2]
2129 call transpose8x32_64_internal
2131 lea r0, [r0 + 32 * 128]
2132 call transpose8x32_64_internal
2135 lea r4, [r1 + 4 * r2]
2137 call transpose8x32_64_internal
2139 lea r0, [r0 + 32 * 128]
2140 call transpose8x32_64_internal
2143 lea r4, [r1 + 4 * r2]
2145 call transpose8x32_64_internal
2147 lea r0, [r0 + 32 * 128]
2148 call transpose8x32_64_internal
2150 lea r4, [r1 + 4 * r2]
2152 call transpose8x32_64_internal
2154 lea r0, [r0 + 32 * 128]
2155 call transpose8x32_64_internal
2159 cglobal transpose64, 3, 7, 4, dest, src, stride
2165 call transpose8_internal
2166 lea r1, [r1 - 8 + 2 * r2]
2169 call transpose8_internal
2170 lea r1, [r1 - 8 + 2 * r2]
2173 call transpose8_internal
2174 lea r1, [r1 - 8 + 2 * r2]
2177 call transpose8_internal
2178 lea r1, [r1 - 8 + 2 * r2]
2181 call transpose8_internal
2182 lea r1, [r1 - 8 + 2 * r2]
2185 call transpose8_internal
2186 lea r1, [r1 - 8 + 2 * r2]
2189 call transpose8_internal
2190 lea r1, [r1 - 8 + 2 * r2]
2193 call transpose8_internal
2196 lea r0, [r6 + 8 * 128]
2198 call transpose8_internal
2199 lea r1, [r1 - 8 + 2 * r2]
2200 lea r0, [r6 + 8 * 128 + 16]
2202 call transpose8_internal
2203 lea r1, [r1 - 8 + 2 * r2]
2204 lea r0, [r6 + 8 * 128 + 32]
2206 call transpose8_internal
2207 lea r1, [r1 - 8 + 2 * r2]
2208 lea r0, [r6 + 8 * 128 + 48]
2210 call transpose8_internal
2211 lea r1, [r1 - 8 + 2 * r2]
2212 lea r0, [r6 + 8 * 128 + 64]
2214 call transpose8_internal
2215 lea r1, [r1 - 8 + 2 * r2]
2216 lea r0, [r6 + 8 * 128 + 80]
2218 call transpose8_internal
2219 lea r1, [r1 - 8 + 2 * r2]
2220 lea r0, [r6 + 8 * 128 + 96]
2222 call transpose8_internal
2223 lea r1, [r1 - 8 + 2 * r2]
2224 lea r0, [r6 + 8 * 128 + 112]
2226 call transpose8_internal
2229 lea r0, [r6 + 16 * 128]
2231 call transpose8_internal
2232 lea r1, [r1 - 8 + 2 * r2]
2233 lea r0, [r6 + 16 * 128 + 16]
2235 call transpose8_internal
2236 lea r1, [r1 - 8 + 2 * r2]
2237 lea r0, [r6 + 16 * 128 + 32]
2239 call transpose8_internal
2240 lea r1, [r1 - 8 + 2 * r2]
2241 lea r0, [r6 + 16 * 128 + 48]
2243 call transpose8_internal
2244 lea r1, [r1 - 8 + 2 * r2]
2245 lea r0, [r6 + 16 * 128 + 64]
2247 call transpose8_internal
2248 lea r1, [r1 - 8 + 2 * r2]
2249 lea r0, [r6 + 16 * 128 + 80]
2251 call transpose8_internal
2252 lea r1, [r1 - 8 + 2 * r2]
2253 lea r0, [r6 + 16 * 128 + 96]
2255 call transpose8_internal
2256 lea r1, [r1 - 8 + 2 * r2]
2257 lea r0, [r6 + 16 * 128 + 112]
2259 call transpose8_internal
2262 lea r0, [r6 + 24 * 128]
2264 call transpose8_internal
2265 lea r1, [r1 - 8 + 2 * r2]
2266 lea r0, [r6 + 24 * 128 + 16]
2268 call transpose8_internal
2269 lea r1, [r1 - 8 + 2 * r2]
2270 lea r0, [r6 + 24 * 128 + 32]
2272 call transpose8_internal
2273 lea r1, [r1 - 8 + 2 * r2]
2274 lea r0, [r6 + 24 * 128 + 48]
2276 call transpose8_internal
2277 lea r1, [r1 - 8 + 2 * r2]
2278 lea r0, [r6 + 24 * 128 + 64]
2280 call transpose8_internal
2281 lea r1, [r1 - 8 + 2 * r2]
2282 lea r0, [r6 + 24 * 128 + 80]
2284 call transpose8_internal
2285 lea r1, [r1 - 8 + 2 * r2]
2286 lea r0, [r6 + 24 * 128 + 96]
2288 call transpose8_internal
2289 lea r1, [r1 - 8 + 2 * r2]
2290 lea r0, [r6 + 24 * 128 + 112]
2292 call transpose8_internal
2295 lea r0, [r6 + 32 * 128]
2297 call transpose8_internal
2298 lea r1, [r1 - 8 + 2 * r2]
2299 lea r0, [r6 + 32 * 128 + 16]
2301 call transpose8_internal
2302 lea r1, [r1 - 8 + 2 * r2]
2303 lea r0, [r6 + 32 * 128 + 32]
2305 call transpose8_internal
2306 lea r1, [r1 - 8 + 2 * r2]
2307 lea r0, [r6 + 32 * 128 + 48]
2309 call transpose8_internal
2310 lea r1, [r1 - 8 + 2 * r2]
2311 lea r0, [r6 + 32 * 128 + 64]
2313 call transpose8_internal
2314 lea r1, [r1 - 8 + 2 * r2]
2315 lea r0, [r6 + 32 * 128 + 80]
2317 call transpose8_internal
2318 lea r1, [r1 - 8 + 2 * r2]
2319 lea r0, [r6 + 32 * 128 + 96]
2321 call transpose8_internal
2322 lea r1, [r1 - 8 + 2 * r2]
2323 lea r0, [r6 + 32 * 128 + 112]
2325 call transpose8_internal
2328 lea r0, [r6 + 40 * 128]
2330 call transpose8_internal
2331 lea r1, [r1 - 8 + 2 * r2]
2332 lea r0, [r6 + 40 * 128 + 16]
2334 call transpose8_internal
2335 lea r1, [r1 - 8 + 2 * r2]
2336 lea r0, [r6 + 40 * 128 + 32]
2338 call transpose8_internal
2339 lea r1, [r1 - 8 + 2 * r2]
2340 lea r0, [r6 + 40 * 128 + 48]
2342 call transpose8_internal
2343 lea r1, [r1 - 8 + 2 * r2]
2344 lea r0, [r6 + 40 * 128 + 64]
2346 call transpose8_internal
2347 lea r1, [r1 - 8 + 2 * r2]
2348 lea r0, [r6 + 40 * 128 + 80]
2350 call transpose8_internal
2351 lea r1, [r1 - 8 + 2 * r2]
2352 lea r0, [r6 + 40 * 128 + 96]
2354 call transpose8_internal
2355 lea r1, [r1 - 8 + 2 * r2]
2356 lea r0, [r6 + 40 * 128 + 112]
2358 call transpose8_internal
2361 lea r0, [r6 + 48 * 128]
2363 call transpose8_internal
2364 lea r1, [r1 - 8 + 2 * r2]
2365 lea r0, [r6 + 48 * 128 + 16]
2367 call transpose8_internal
2368 lea r1, [r1 - 8 + 2 * r2]
2369 lea r0, [r6 + 48 * 128 + 32]
2371 call transpose8_internal
2372 lea r1, [r1 - 8 + 2 * r2]
2373 lea r0, [r6 + 48 * 128 + 48]
2375 call transpose8_internal
2376 lea r1, [r1 - 8 + 2 * r2]
2377 lea r0, [r6 + 48 * 128 + 64]
2379 call transpose8_internal
2380 lea r1, [r1 - 8 + 2 * r2]
2381 lea r0, [r6 + 48 * 128 + 80]
2383 call transpose8_internal
2384 lea r1, [r1 - 8 + 2 * r2]
2385 lea r0, [r6 + 48 * 128 + 96]
2387 call transpose8_internal
2388 lea r1, [r1 - 8 + 2 * r2]
2389 lea r0, [r6 + 48 * 128 + 112]
2391 call transpose8_internal
2394 lea r0, [r6 + 56 * 128]
2396 call transpose8_internal
2397 lea r1, [r1 - 8 + 2 * r2]
2398 lea r0, [r6 + 56 * 128 + 16]
2400 call transpose8_internal
2401 lea r1, [r1 - 8 + 2 * r2]
2402 lea r0, [r6 + 56 * 128 + 32]
2404 call transpose8_internal
2405 lea r1, [r1 - 8 + 2 * r2]
2406 lea r0, [r6 + 56 * 128 + 48]
2408 call transpose8_internal
2409 lea r1, [r1 - 8 + 2 * r2]
2410 lea r0, [r6 + 56 * 128 + 64]
2412 call transpose8_internal
2413 lea r1, [r1 - 8 + 2 * r2]
2414 lea r0, [r6 + 56 * 128 + 80]
2416 call transpose8_internal
2417 lea r1, [r1 - 8 + 2 * r2]
2418 lea r0, [r6 + 56 * 128 + 96]
2420 call transpose8_internal
2421 lea r1, [r1 - 8 + 2 * r2]
2422 lea r0, [r6 + 56 * 128 + 112]
2424 call transpose8_internal
2426 %else ;HIGH_BIT_DEPTH == 0
2427 %if ARCH_X86_64 == 1
2430 cglobal transpose16x32_avx2
2433 movu m2, [r1 + 2 * r2]
2435 lea r1, [r1 + 4 * r2]
2439 movu m6, [r1 + 2 * r2]
2442 punpcklbw m8, m0, m1 ;[1 - 8 ; 17 - 24][1 2]
2443 punpckhbw m0, m1 ;[9 - 16; 25 - 32][1 2]
2445 punpcklbw m1, m2, m3 ;[1 - 8 ; 17 - 24][3 4]
2446 punpckhbw m2, m3 ;[9 - 16; 25 - 32][3 4]
2448 punpcklbw m3, m4, m5 ;[1 - 8 ; 17 - 24][5 6]
2449 punpckhbw m4, m5 ;[9 - 16; 25 - 32][5 6]
2451 punpcklbw m5, m6, m7 ;[1 - 8 ; 17 - 24][7 8]
2452 punpckhbw m6, m7 ;[9 - 16; 25 - 32][7 8]
2454 punpcklwd m7, m8, m1 ;[1 - 4 ; 17 - 20][1 2 3 4]
2455 punpckhwd m8, m1 ;[5 - 8 ; 20 - 24][1 2 3 4]
2457 punpcklwd m1, m3, m5 ;[1 - 4 ; 17 - 20][5 6 7 8]
2458 punpckhwd m3, m5 ;[5 - 8 ; 20 - 24][5 6 7 8]
2460 punpcklwd m5, m0, m2 ;[9 - 12; 25 - 28][1 2 3 4]
2461 punpckhwd m0, m2 ;[12- 15; 29 - 32][1 2 3 4]
2463 punpcklwd m2, m4, m6 ;[9 - 12; 25 - 28][5 6 7 8]
2464 punpckhwd m4, m6 ;[12- 15; 29 - 32][5 6 7 8]
2466 punpckldq m6, m7, m1 ;[1 - 2 ; 17 - 18][1 2 3 4 5 6 7 8]
2467 punpckhdq m7, m1 ;[3 - 4 ; 19 - 20][1 2 3 4 5 6 7 8]
2469 punpckldq m1, m8, m3 ;[5 - 6 ; 21 - 22][1 2 3 4 5 6 7 8]
2470 punpckhdq m8, m3 ;[7 - 8 ; 23 - 24][1 2 3 4 5 6 7 8]
2472 punpckldq m3, m5, m2 ;[9 - 10; 25 - 26][1 2 3 4 5 6 7 8]
2473 punpckhdq m5, m2 ;[11- 12; 27 - 28][1 2 3 4 5 6 7 8]
2475 punpckldq m2, m0, m4 ;[13- 14; 29 - 30][1 2 3 4 5 6 7 8]
2476 punpckhdq m0, m4 ;[15- 16; 31 - 32][1 2 3 4 5 6 7 8]
2478 movq [r0 + 0 * 64], xm6
2479 movhps [r0 + 1 * 64], xm6
2480 vextracti128 xm4, m6, 1
2481 movq [r0 + 16 * 64], xm4
2482 movhps [r0 + 17 * 64], xm4
2484 lea r1, [r1 + 4 * r2]
2487 movu m11, [r1 + 2 * r2]
2489 lea r1, [r1 + 4 * r2]
2493 movu m15, [r1 + 2 * r2]
2496 punpcklbw m4, m9, m10 ;[1 - 8 ; 17 - 24][9 10]
2497 punpckhbw m9, m10 ;[9 - 16; 25 - 32][9 10]
2499 punpcklbw m10, m11, m12 ;[1 - 8 ; 17 - 24][11 12]
2500 punpckhbw m11, m12 ;[9 - 16; 25 - 32][11 12]
2502 punpcklbw m12, m13, m14 ;[1 - 8 ; 17 - 24][13 14]
2503 punpckhbw m13, m14 ;[9 - 16; 25 - 32][13 14]
2505 punpcklbw m14, m15, m6 ;[1 - 8 ; 17 - 24][15 16]
2506 punpckhbw m15, m6 ;[9 - 16; 25 - 32][15 16]
2508 punpcklwd m6, m4, m10 ;[1 - 4 ; 17 - 20][9 10 11 12]
2509 punpckhwd m4, m10 ;[5 - 8 ; 20 - 24][9 10 11 12]
2511 punpcklwd m10, m12, m14 ;[1 - 4 ; 17 - 20][13 14 15 16]
2512 punpckhwd m12, m14 ;[5 - 8 ; 20 - 24][13 14 15 16]
2514 punpcklwd m14, m9, m11 ;[9 - 12; 25 - 28][9 10 11 12]
2515 punpckhwd m9, m11 ;[13- 16; 29 - 32][9 10 11 12]
2517 punpcklwd m11, m13, m15 ;[9 - 12; 25 - 28][13 14 15 16]
2518 punpckhwd m13, m15 ;[13- 16; 29 - 32][13 14 15 16]
2520 punpckldq m15, m6, m10 ;[1 - 2 ; 17 - 18][9 10 11 12 13 14 15 16]
2521 punpckhdq m6, m10 ;[3 - 4 ; 19 - 20][9 10 11 12 13 14 15 16]
2523 punpckldq m10, m4, m12 ;[5 - 6 ; 21 - 22][9 10 11 12 13 14 15 16]
2524 punpckhdq m4, m12 ;[7 - 8 ; 23 - 24][9 10 11 12 13 14 15 16]
2526 punpckldq m12, m14, m11 ;[9 - 10; 25 - 26][9 10 11 12 13 14 15 16]
2527 punpckhdq m14, m11 ;[11- 12; 27 - 28][9 10 11 12 13 14 15 16]
2529 punpckldq m11, m9, m13 ;[13- 14; 29 - 30][9 10 11 12 13 14 15 16]
2530 punpckhdq m9, m13 ;[15- 16; 31 - 32][9 10 11 12 13 14 15 16]
2533 punpcklqdq m13, m7, m6 ;[3 ; 19][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2534 punpckhqdq m7, m6 ;[4 ; 20][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2536 punpcklqdq m6, m1, m10 ;[5 ; 21][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2537 punpckhqdq m1, m10 ;[6 ; 22][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2539 punpcklqdq m10, m8, m4 ;[7 ; 23][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2540 punpckhqdq m8, m4 ;[8 ; 24][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2542 punpcklqdq m4, m3, m12 ;[9 ; 25][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2543 punpckhqdq m3, m12 ;[10; 26][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2545 punpcklqdq m12, m5, m14 ;[11; 27][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2546 punpckhqdq m5, m14 ;[12; 28][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2548 punpcklqdq m14, m2, m11 ;[13; 29][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2549 punpckhqdq m2, m11 ;[14; 30][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2551 punpcklqdq m11, m0, m9 ;[15; 31][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2552 punpckhqdq m0, m9 ;[16; 32][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2554 movq [r0 + 0 * 64 + 8], xm15
2555 movhps [r0 + 1 * 64 + 8], xm15
2556 vextracti128 xm9, m15, 1
2557 movq [r0 + 16 * 64 + 8], xm9
2558 movhps [r0 + 17 * 64 + 8], xm9
2560 movu [r0 + 2 * 64], xm13
2561 vextracti128 [r0 + 18 * 64], m13, 1
2563 movu [r0 + 3 * 64], xm7
2564 vextracti128 [r0 + 19 * 64], m7, 1
2566 movu [r0 + 4 * 64], xm6
2567 vextracti128 [r0 + 20 * 64], m6, 1
2569 movu [r0 + 5 * 64], xm1
2570 vextracti128 [r0 + 21 * 64], m1, 1
2572 movu [r0 + 6 * 64], xm10
2573 vextracti128 [r0 + 22 * 64], m10, 1
2575 movu [r0 + 7 * 64], xm8
2576 vextracti128 [r0 + 23 * 64], m8, 1
2578 movu [r0 + 8 * 64], xm4
2579 vextracti128 [r0 + 24 * 64], m4, 1
2581 movu [r0 + 9 * 64], xm3
2582 vextracti128 [r0 + 25 * 64], m3, 1
2584 movu [r0 + 10 * 64], xm12
2585 vextracti128 [r0 + 26 * 64], m12, 1
2587 movu [r0 + 11 * 64], xm5
2588 vextracti128 [r0 + 27 * 64], m5, 1
2590 movu [r0 + 12 * 64], xm14
2591 vextracti128 [r0 + 28 * 64], m14, 1
2593 movu [r0 + 13 * 64], xm2
2594 vextracti128 [r0 + 29 * 64], m2, 1
2596 movu [r0 + 14 * 64], xm11
2597 vextracti128 [r0 + 30 * 64], m11, 1
2599 movu [r0 + 15 * 64], xm0
2600 vextracti128 [r0 + 31 * 64], m0, 1
2603 cglobal transpose64, 3, 6, 16
2609 call transpose16x32_avx2
2610 lea r0, [r0 + 32 * 64]
2612 call transpose16x32_avx2
2615 lea r5, [r1 + 4 * r2]
2618 call transpose16x32_avx2
2619 lea r0, [r0 + 32 * 64]
2621 call transpose16x32_avx2
2624 lea r5, [r1 + 4 * r2]
2627 call transpose16x32_avx2
2628 lea r0, [r0 + 32 * 64]
2630 call transpose16x32_avx2
2632 lea r5, [r1 + 4 * r2]
2636 call transpose16x32_avx2
2637 lea r0, [r0 + 32 * 64]
2639 call transpose16x32_avx2
2644 cglobal transpose64, 3, 7, 8, dest, src, stride
2649 call transpose16_internal
2650 lea r1, [r1 - 8 + 2 * r2]
2653 call transpose16_internal
2654 lea r1, [r1 - 8 + 2 * r2]
2657 call transpose16_internal
2658 lea r1, [r1 - 8 + 2 * r2]
2661 call transpose16_internal
2664 lea r0, [r3 + 16 * 64]
2666 call transpose16_internal
2667 lea r1, [r1 - 8 + 2 * r2]
2668 lea r0, [r3 + 16 * 64 + 16]
2670 call transpose16_internal
2671 lea r1, [r1 - 8 + 2 * r2]
2672 lea r0, [r3 + 16 * 64 + 32]
2674 call transpose16_internal
2675 lea r1, [r1 - 8 + 2 * r2]
2676 lea r0, [r3 + 16 * 64 + 48]
2678 call transpose16_internal
2681 lea r0, [r3 + 32 * 64]
2683 call transpose16_internal
2684 lea r1, [r1 - 8 + 2 * r2]
2685 lea r0, [r3 + 32 * 64 + 16]
2687 call transpose16_internal
2688 lea r1, [r1 - 8 + 2 * r2]
2689 lea r0, [r3 + 32 * 64 + 32]
2691 call transpose16_internal
2692 lea r1, [r1 - 8 + 2 * r2]
2693 lea r0, [r3 + 32 * 64 + 48]
2695 call transpose16_internal
2698 lea r0, [r3 + 48 * 64]
2700 call transpose16_internal
2701 lea r1, [r1 - 8 + 2 * r2]
2702 lea r0, [r3 + 48 * 64 + 16]
2704 call transpose16_internal
2705 lea r1, [r1 - 8 + 2 * r2]
2706 lea r0, [r3 + 48 * 64 + 32]
2708 call transpose16_internal
2709 lea r1, [r1 - 8 + 2 * r2]
2710 lea r0, [r3 + 48 * 64 + 48]
2712 call transpose16_internal
2717 ;=============================================================================
2719 ;=============================================================================
2721 ;-----------------------------------------------------------------------------
2722 ; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
2723 ; const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
2724 ;-----------------------------------------------------------------------------
2727 movdqu m5, [r0+(%1&1)*r1]
2728 movdqu m6, [r2+(%1&1)*r3]
2730 movq m5, [r0+(%1&1)*r1]
2731 movq m6, [r2+(%1&1)*r3]
2749 ACCUM paddd, 3, 5, %1
2750 ACCUM paddd, 4, 7, %1
2755 cglobal pixel_ssim_4x4x2_core, 4,4,8
2765 pshufd m5, m3, q2301
2768 pshufd m6, m4, q2301
2771 pshufd m1, m1, q3120
2774 punpckhdq m5, m3, m4
2790 ;-----------------------------------------------------------------------------
2791 ; float pixel_ssim_end4( int sum0[5][4], int sum1[5][4], int width )
2792 ;-----------------------------------------------------------------------------
2793 cglobal pixel_ssim_end4, 2,3
2809 TRANSPOSE4x4D 0, 1, 2, 3, 4
2811 ; s1=m0, s2=m1, ss=m2, s12=m3
2817 mulps m4, m0, m1 ; s1*s2
2818 mulps m0, m0 ; s1*s1
2819 mulps m1, m1 ; s2*s2
2820 mulps m2, [pf_64] ; ss*64
2821 mulps m3, [pf_128] ; s12*128
2822 addps m4, m4 ; s1*s2*2
2823 addps m0, m1 ; s1*s1 + s2*s2
2825 subps m3, m4 ; covar*2
2826 movaps m1, [ssim_c1]
2827 addps m4, m1 ; s1*s2*2 + ssim_c1
2828 addps m0, m1 ; s1*s1 + s2*s2 + ssim_c1
2829 movaps m1, [ssim_c2]
2830 addps m2, m1 ; vars + ssim_c2
2831 addps m3, m1 ; covar*2 + ssim_c2
2833 pmaddwd m4, m1, m0 ; s1*s2
2836 pmaddwd m0, m0 ; s1*s1 + s2*s2
2840 psubd m3, m4 ; covar*2
2848 cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
2849 cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1)
2850 cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2)
2851 cvtdq2ps m2, m2 ; (float)(vars + ssim_c2)
2858 je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
2862 lea r3, [mask_ff + 16]
2865 %xdefine %%mask mask_ff + 16
2868 andps m4, [%%mask + r2*4]
2870 movups m0, [%%mask + r2*4]
2880 pshuflw m4, m0, q0032
2883 %if ARCH_X86_64 == 0
2895 ;-----------------------------------------------------------------
2896 ; void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/)
2897 ;-----------------------------------------------------------------
2899 cglobal scale1D_128to64, 2, 2, 8, dest, src1, stride
2901 mova m7, [deinterleave_word_shuf]
2992 mova m7, [deinterleave_shuf]
3048 %if HIGH_BIT_DEPTH == 1
3050 cglobal scale1D_128to64, 2, 2, 3
3081 %else ; HIGH_BIT_DEPTH == 0
3083 cglobal scale1D_128to64, 2, 2, 4
3088 pmaddubsw m0, m0, m3
3091 pmaddubsw m1, m1, m3
3098 pmaddubsw m0, m0, m3
3101 pmaddubsw m1, m1, m3
3109 ;-----------------------------------------------------------------
3110 ; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
3111 ;-----------------------------------------------------------------
3114 cglobal scale2D_64to32, 3, 4, 8, dest, src, stride
3116 mova m7, [deinterleave_word_shuf]
3121 movu m2, [r1 + r2] ;k
3131 pavgw m0, m2 ;(s+t+1)/2
3133 pand m4, m5 ;(ij|kl)&st
3134 pand m4, [hmulw_16p]
3135 psubw m0, m4 ;Result
3136 movu m1, [r1 + 16] ;i
3138 movu m3, [r1 + r2 + 16] ;k
3148 pavgw m1, m3 ;(s+t+1)/2
3150 pand m5, m6 ;(ij|kl)&st
3151 pand m5, [hmulw_16p]
3152 psubw m1, m5 ;Result
3159 movu m0, [r1 + 32] ;i
3161 movu m2, [r1 + r2 + 32] ;k
3171 pavgw m0, m2 ;(s+t+1)/2
3173 pand m4, m5 ;(ij|kl)&st
3174 pand m4, [hmulw_16p]
3175 psubw m0, m4 ;Result
3176 movu m1, [r1 + 48] ;i
3178 movu m3, [r1 + r2 + 48] ;k
3188 pavgw m1, m3 ;(s+t+1)/2
3190 pand m5, m6 ;(ij|kl)&st
3191 pand m5, [hmulw_16p]
3192 psubw m1, m5 ;Result
3199 movu m0, [r1 + 64] ;i
3201 movu m2, [r1 + r2 + 64] ;k
3211 pavgw m0, m2 ;(s+t+1)/2
3213 pand m4, m5 ;(ij|kl)&st
3214 pand m4, [hmulw_16p]
3215 psubw m0, m4 ;Result
3216 movu m1, [r1 + 80] ;i
3218 movu m3, [r1 + r2 + 80] ;k
3228 pavgw m1, m3 ;(s+t+1)/2
3230 pand m5, m6 ;(ij|kl)&st
3231 pand m5, [hmulw_16p]
3232 psubw m1, m5 ;Result
3239 movu m0, [r1 + 96] ;i
3241 movu m2, [r1 + r2 + 96] ;k
3251 pavgw m0, m2 ;(s+t+1)/2
3253 pand m4, m5 ;(ij|kl)&st
3254 pand m4, [hmulw_16p]
3255 psubw m0, m4 ;Result
3256 movu m1, [r1 + 112] ;i
3258 movu m3, [r1 + r2 + 112] ;k
3268 pavgw m1, m3 ;(s+t+1)/2
3270 pand m5, m6 ;(ij|kl)&st
3271 pand m5, [hmulw_16p]
3272 psubw m1, m5 ;Result
3279 lea r1, [r1 + 2 * r2]
3286 cglobal scale2D_64to32, 3, 4, 8, dest, src, stride
3288 mova m7, [deinterleave_shuf]
3293 movu m2, [r1 + r2] ;k
3305 pavgb m0, m2 ;(s+t+1)/2
3307 pand m4, m5 ;(ij|kl)&st
3309 psubb m0, m4 ;Result
3311 movu m1, [r1 + 16] ;i
3313 movu m3, [r1 + r2 + 16] ;k
3325 pavgb m1, m3 ;(s+t+1)/2
3327 pand m5, m6 ;(ij|kl)&st
3329 psubb m1, m5 ;Result
3337 movu m0, [r1 + 32] ;i
3339 movu m2, [r1 + r2 + 32] ;k
3351 pavgb m0, m2 ;(s+t+1)/2
3353 pand m4, m5 ;(ij|kl)&st
3355 psubb m0, m4 ;Result
3357 movu m1, [r1 + 48] ;i
3359 movu m3, [r1 + r2 + 48] ;k
3371 pavgb m1, m3 ;(s+t+1)/2
3373 pand m5, m6 ;(ij|kl)&st
3375 psubb m1, m5 ;Result
3384 lea r1, [r1 + 2 * r2]
3391 ;-----------------------------------------------------------------------------
3392 ; void pixel_sub_ps_4x4(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
3393 ;-----------------------------------------------------------------------------
3396 cglobal pixel_sub_ps_4x4, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3404 lea r2, [r2 + r4 * 2]
3405 lea r3, [r3 + r5 * 2]
3418 lea r0, [r0 + r1 * 2]
3425 cglobal pixel_sub_ps_4x4, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3431 lea r2, [r2 + r4 * 2]
3432 lea r3, [r3 + r5 * 2]
3450 movhps [r0 + r1], m0
3451 movh [r0 + r1 * 2], m4
3452 lea r0, [r0 + r1 * 2]
3453 movhps [r0 + r1], m4
3459 ;-----------------------------------------------------------------------------
3460 ; void pixel_sub_ps_4x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
3461 ;-----------------------------------------------------------------------------
3462 %macro PIXELSUB_PS_W4_H4 2
3464 cglobal pixel_sub_ps_4x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3474 lea r2, [r2 + r4 * 2]
3475 lea r3, [r3 + r5 * 2]
3481 lea r2, [r2 + r4 * 2]
3482 lea r3, [r3 + r5 * 2]
3491 movh [r0 + r1 * 2], m4
3492 lea r0, [r0 + r1 * 2]
3494 lea r0, [r0 + r1 * 2]
3499 cglobal pixel_sub_ps_4x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3507 lea r2, [r2 + r4 * 2]
3508 lea r3, [r3 + r5 * 2]
3514 lea r2, [r2 + r4 * 2]
3515 lea r3, [r3 + r5 * 2]
3529 movhps [r0 + r1], m0
3530 movh [r0 + r1 * 2], m4
3531 lea r0, [r0 + r1 * 2]
3532 movhps [r0 + r1], m4
3533 lea r0, [r0 + r1 * 2]
3542 PIXELSUB_PS_W4_H4 4, 8
3545 PIXELSUB_PS_W4_H4 4, 8
3549 ;-----------------------------------------------------------------------------
3550 ; void pixel_sub_ps_8x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
3551 ;-----------------------------------------------------------------------------
3552 %macro PIXELSUB_PS_W8_H4 2
3554 cglobal pixel_sub_ps_8x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3564 lea r2, [r2 + r4 * 2]
3565 lea r3, [r3 + r5 * 2]
3571 lea r2, [r2 + r4 * 2]
3572 lea r3, [r3 + r5 * 2]
3581 movu [r0 + r1 * 2], m4
3582 lea r0, [r0 + r1 * 2]
3584 lea r0, [r0 + r1 * 2]
3589 cglobal pixel_sub_ps_8x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3597 lea r2, [r2 + r4 * 2]
3598 lea r3, [r3 + r5 * 2]
3604 lea r2, [r2 + r4 * 2]
3605 lea r3, [r3 + r5 * 2]
3622 movu [r0 + r1 * 2], m4
3623 lea r0, [r0 + r1 * 2]
3625 lea r0, [r0 + r1 * 2]
3634 PIXELSUB_PS_W8_H4 8, 8
3635 PIXELSUB_PS_W8_H4 8, 16
3638 PIXELSUB_PS_W8_H4 8, 8
3639 PIXELSUB_PS_W8_H4 8, 16
3643 ;-----------------------------------------------------------------------------
3644 ; void pixel_sub_ps_16x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
3645 ;-----------------------------------------------------------------------------
3646 %macro PIXELSUB_PS_W16_H4 2
3648 cglobal pixel_sub_ps_16x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3659 movu m6, [r2 + r4 + 16]
3661 movu m7, [r3 + r5 + 16]
3663 lea r2, [r2 + r4 * 2]
3664 lea r3, [r3 + r5 * 2]
3674 movu [r0 + r1 + 16], m6
3682 movu m6, [r2 + r4 + 16]
3683 movu m7, [r3 + r5 + 16]
3684 lea r0, [r0 + r1 * 2]
3685 lea r2, [r2 + r4 * 2]
3686 lea r3, [r3 + r5 * 2]
3696 movu [r0 + r1 + 16], m6
3697 lea r0, [r0 + r1 * 2]
3702 cglobal pixel_sub_ps_16x%2, 6, 7, 7, dest, deststride, src0, src1, srcstride0, srcstride1
3719 lea r2, [r2 + r4 * 2]
3720 lea r3, [r3 + r5 * 2]
3732 movu [r0 + r1 + 16], m5
3747 lea r2, [r2 + r4 * 2]
3748 lea r3, [r3 + r5 * 2]
3749 lea r0, [r0 + r1 * 2]
3761 movu [r0 + r1 + 16], m5
3762 lea r0, [r0 + r1 * 2]
3771 PIXELSUB_PS_W16_H4 16, 16
3772 PIXELSUB_PS_W16_H4 16, 32
3775 PIXELSUB_PS_W16_H4 16, 16
3776 PIXELSUB_PS_W16_H4 16, 32
3780 ;-----------------------------------------------------------------------------
3781 ; void pixel_sub_ps_32x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
3782 ;-----------------------------------------------------------------------------
3783 %macro PIXELSUB_PS_W32_H2 2
3785 cglobal pixel_sub_ps_32x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3812 movu m2, [r2 + r4 + 16]
3813 movu m4, [r2 + r4 + 32]
3814 movu m6, [r2 + r4 + 48]
3816 movu m3, [r3 + r5 + 16]
3817 movu m5, [r3 + r5 + 32]
3818 movu m7, [r3 + r5 + 48]
3819 lea r2, [r2 + r4 * 2]
3820 lea r3, [r3 + r5 * 2]
3828 movu [r0 + r1 + 16], m2
3829 movu [r0 + r1 + 32], m4
3830 movu [r0 + r1 + 48], m6
3831 lea r0, [r0 + r1 * 2]
3836 cglobal pixel_sub_ps_32x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3869 movh m1, [r2 + r4 + 8]
3870 movh m2, [r2 + r4 + 16]
3871 movh m6, [r2 + r4 + 24]
3873 movh m4, [r3 + r5 + 8]
3874 movh m5, [r3 + r5 + 16]
3875 movh m7, [r3 + r5 + 24]
3876 lea r2, [r2 + r4 * 2]
3877 lea r3, [r3 + r5 * 2]
3893 movu [r0 + r1 + 16], m1
3894 movu [r0 + r1 + 32], m2
3895 movu [r0 + r1 + 48], m6
3896 lea r0, [r0 + r1 * 2]
3905 PIXELSUB_PS_W32_H2 32, 32
3906 PIXELSUB_PS_W32_H2 32, 64
3909 PIXELSUB_PS_W32_H2 32, 32
3910 PIXELSUB_PS_W32_H2 32, 64
3914 ;-----------------------------------------------------------------------------
3915 ; void pixel_sub_ps_64x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
3916 ;-----------------------------------------------------------------------------
3917 %macro PIXELSUB_PS_W64_H2 2
3919 cglobal pixel_sub_ps_64x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3964 movu m2, [r2 + r4 + 16]
3965 movu m4, [r2 + r4 + 32]
3966 movu m6, [r2 + r4 + 48]
3968 movu m3, [r3 + r5 + 16]
3969 movu m5, [r3 + r5 + 32]
3970 movu m7, [r3 + r5 + 48]
3978 movu [r0 + r1 + 16], m2
3979 movu [r0 + r1 + 32], m4
3980 movu [r0 + r1 + 48], m6
3982 movu m0, [r2 + r4 + 64]
3983 movu m2, [r2 + r4 + 80]
3984 movu m4, [r2 + r4 + 96]
3985 movu m6, [r2 + r4 + 112]
3986 movu m1, [r3 + r5 + 64]
3987 movu m3, [r3 + r5 + 80]
3988 movu m5, [r3 + r5 + 96]
3989 movu m7, [r3 + r5 + 112]
3991 lea r2, [r2 + r4 * 2]
3992 lea r3, [r3 + r5 * 2]
3999 movu [r0 + r1 + 64], m0
4000 movu [r0 + r1 + 80], m2
4001 movu [r0 + r1 + 96], m4
4002 movu [r0 + r1 + 112], m6
4003 lea r0, [r0 + r1 * 2]
4008 cglobal pixel_sub_ps_64x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
4068 movu m2, [r2 + r4 + 16]
4069 movu m5, [r3 + r5 + 16]
4079 movu [r0 + r1 + 16], m7
4081 movu m0, [r2 + r4 + 32]
4082 movu m3, [r3 + r5 + 32]
4092 movu [r0 + r1 + 32], m1
4093 movu [r0 + r1 + 48], m2
4095 movu m4, [r2 + r4 + 48]
4096 movu m5, [r3 + r5 + 48]
4097 lea r2, [r2 + r4 * 2]
4098 lea r3, [r3 + r5 * 2]
4107 movu [r0 + r1 + 64], m7
4108 movu [r0 + r1 + 80], m0
4115 movu [r0 + r1 + 96], m2
4116 movu [r0 + r1 + 112], m4
4117 lea r0, [r0 + r1 * 2]
4126 PIXELSUB_PS_W64_H2 64, 64
4129 PIXELSUB_PS_W64_H2 64, 64
4133 ;=============================================================================
4135 ;=============================================================================
4139 pxor m6, m6 ; sum squared
4140 %if HIGH_BIT_DEPTH == 0
4146 %endif ; !HIGH_BIT_DEPTH
4151 %if mmsize == 8 && %1*%2 == 256
4162 %else ; !HIGH_BIT_DEPTH
4170 %endif ; HIGH_BIT_DEPTH
4202 movu m1, [r0+mmsize]
4204 movu m4, [r0+%1+mmsize]
4205 %else ; !HIGH_BIT_DEPTH
4207 punpckhbw m1, m0, m7
4211 %endif ; HIGH_BIT_DEPTH
4217 %if HIGH_BIT_DEPTH == 0
4220 %endif ; !HIGH_BIT_DEPTH
4226 ;-----------------------------------------------------------------------------
4227 ; int pixel_var_wxh( uint8_t *, intptr_t )
4228 ;-----------------------------------------------------------------------------
4230 cglobal pixel_var_16x16, 2,3
4233 VAR_2ROW 8*SIZEOF_PIXEL, 16, 1
4236 cglobal pixel_var_8x8, 2,3
4244 cglobal pixel_var_16x16, 2,3,8
4250 cglobal pixel_var_8x8, 2,3,8
4266 cglobal pixel_var_32x32, 2,6,8
4288 cglobal pixel_var_64x64, 2,6,8
4379 %endif ; HIGH_BIT_DEPTH
4381 %if HIGH_BIT_DEPTH == 0
4383 cglobal pixel_var_8x8, 2,3,8
4388 movhps m0, [r0 + r1 * 2]
4389 movhps m3, [r0 + r2]
4390 DEINTB 1, 0, 4, 3, 7
4391 lea r0, [r0 + r1 * 4]
4395 movhps m0, [r0 + r1 * 2]
4396 movhps m3, [r0 + r2]
4397 DEINTB 1, 0, 4, 3, 7
4401 cglobal pixel_var_16x16_internal
4404 DEINTB 1, 0, 4, 3, 7
4406 movu m0, [r0 + 2 * r1]
4408 DEINTB 1, 0, 4, 3, 7
4409 lea r0, [r0 + r1 * 4]
4413 DEINTB 1, 0, 4, 3, 7
4415 movu m0, [r0 + 2 * r1]
4417 DEINTB 1, 0, 4, 3, 7
4418 lea r0, [r0 + r1 * 4]
4422 DEINTB 1, 0, 4, 3, 7
4424 movu m0, [r0 + 2 * r1]
4426 DEINTB 1, 0, 4, 3, 7
4427 lea r0, [r0 + r1 * 4]
4431 DEINTB 1, 0, 4, 3, 7
4433 movu m0, [r0 + 2 * r1]
4435 DEINTB 1, 0, 4, 3, 7
4439 cglobal pixel_var_16x16, 2,3,8
4442 call pixel_var_16x16_internal
4445 cglobal pixel_var_32x32, 2,4,8
4449 call pixel_var_16x16_internal
4450 lea r0, [r0 + r1 * 4]
4451 call pixel_var_16x16_internal
4453 call pixel_var_16x16_internal
4454 lea r0, [r0 + r1 * 4]
4455 call pixel_var_16x16_internal
4458 cglobal pixel_var_64x64, 2,6,8
4462 call pixel_var_16x16_internal
4463 lea r0, [r0 + r1 * 4]
4464 call pixel_var_16x16_internal
4465 lea r0, [r0 + r1 * 4]
4466 call pixel_var_16x16_internal
4467 lea r0, [r0 + r1 * 4]
4468 call pixel_var_16x16_internal
4473 call pixel_var_16x16_internal
4474 lea r0, [r0 + r1 * 4]
4475 call pixel_var_16x16_internal
4476 lea r0, [r0 + r1 * 4]
4477 call pixel_var_16x16_internal
4478 lea r0, [r0 + r1 * 4]
4479 call pixel_var_16x16_internal
4485 call pixel_var_16x16_internal
4486 lea r0, [r0 + r1 * 4]
4487 call pixel_var_16x16_internal
4488 lea r0, [r0 + r1 * 4]
4489 call pixel_var_16x16_internal
4490 lea r0, [r0 + r1 * 4]
4491 call pixel_var_16x16_internal
4497 call pixel_var_16x16_internal
4498 lea r0, [r0 + r1 * 4]
4499 call pixel_var_16x16_internal
4500 lea r0, [r0 + r1 * 4]
4501 call pixel_var_16x16_internal
4502 lea r0, [r0 + r1 * 4]
4503 call pixel_var_16x16_internal
4515 cglobal pixel_var_16x16, 2,4,7
4521 pmovzxbw m3, [r0+r1]
4522 pmovzxbw m1, [r0+r1*2]
4523 pmovzxbw m4, [r0+r3]
4528 vextracti128 xm0, m5, 1
4529 vextracti128 xm1, m6, 1
4542 %endif ; !HIGH_BIT_DEPTH
4552 sub eax, r1d ; sqr - (sum * sum >> shift)