1 ;*****************************************************************************
2 ;* Copyright (C) 2013 x265 project
4 ;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
5 ;* Nabajit Deka <nabajit@multicorewareinc.com>
7 ;* This program is free software; you can redistribute it and/or modify
8 ;* it under the terms of the GNU General Public License as published by
9 ;* the Free Software Foundation; either version 2 of the License, or
10 ;* (at your option) any later version.
12 ;* This program is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ;* GNU General Public License for more details.
17 ;* You should have received a copy of the GNU General Public License
18 ;* along with this program; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21 ;* This program is also available under a commercial proprietary license.
22 ;* For more information, contact us at license @ x265.com.
23 ;*****************************************************************************/
26 %include "x86util.asm"
31 ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64
32 ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
33 pf_64: times 4 dd 64.0
34 pf_128: times 4 dd 128.0
36 ssim_c1: times 4 dd 1671 ; .01*.01*511*511*64
37 ssim_c2: times 4 dd 947556 ; .03*.03*511*511*64*63
39 ssim_c1: times 4 dd 416 ; .01*.01*255*255*64
40 ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
42 mask_ff: times 16 db 0xff
44 deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
45 deinterleave_word_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
46 hmul_16p: times 16 db 1
48 hmulw_16p: times 8 dw 1
51 trans8_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7
64 ;-----------------------------------------------------------------------------
65 ; void calcrecon(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
66 ;-----------------------------------------------------------------------------
70 cglobal calcRecons4, 5,8,4
73 cglobal calcRecons4, 5,7,4,0-1
74 %define t7b byte [rsp]
84 mova m5, [pw_pixel_max]
104 lea r0, [r0 + r4 * 2]
105 lea r1, [r1 + r4 * 2]
106 lea r2, [r2 + r5 * 2]
107 lea r3, [r3 + r6 * 2]
112 %else ;HIGH_BIT_DEPTH
115 cglobal calcRecons4, 5,8,4
118 cglobal calcRecons4, 5,7,4,0-1
119 %define t7b byte [rsp]
134 movh m3, [r1 + r4 * 2]
139 ; store recon[] and recipred[]
149 lea r0, [r0 + r4 * 2]
150 lea r1, [r1 + r4 * 4]
151 lea r2, [r2 + r5 * 2]
152 lea r3, [r3 + r6 * 2]
157 %endif ;HIGH_BIT_DEPTH
162 cglobal calcRecons8, 5,8,4
165 cglobal calcRecons8, 5,7,4,0-1
166 %define t7b byte [rsp]
178 mova m5, [pw_pixel_max]
187 CLIPW2 m0, m1, m4, m5
197 lea r0, [r0 + r4 * 2]
198 lea r1, [r1 + r4 * 2]
199 lea r2, [r2 + r5 * 2]
200 lea r3, [r3 + r6 * 2]
205 %else ;HIGH_BIT_DEPTH
220 movu m4, [r1 + r4 * 2]
225 ; store recon[] and recipred[]
235 lea r0, [r0 + r4 * 2]
236 lea r1, [r1 + r4 * 4]
237 lea r2, [r2 + r5 * 2]
238 lea r3, [r3 + r6 * 2]
243 %endif ;HIGH_BIT_DEPTH
250 cglobal calcRecons16, 5,8,4
253 cglobal calcRecons16, 5,7,4,0-1
254 %define t7b byte [rsp]
265 mova m5, [pw_pixel_max]
274 CLIPW2 m0, m1, m4, m5
285 movu m1, [r0 + r4 + 16]
287 movu m3, [r1 + r4 + 16]
290 CLIPW2 m0, m1, m4, m5
292 ; store recon[] and recipred[]
294 movu [r3 + r6 + 16], m1
298 movu [r2 + r5 + 16], m1
300 lea r0, [r0 + r4 * 2]
301 lea r1, [r1 + r4 * 2]
302 lea r2, [r2 + r5 * 2]
303 lea r3, [r3 + r6 * 2]
308 %else ;HIGH_BIT_DEPTH
312 cglobal calcRecons16, 5,8,4
315 cglobal calcRecons16, 5,7,4,0-1
316 %define t7b byte [rsp]
334 ; store recon[] and recipred[]
346 lea r1, [r1 + r4 * 2]
351 %endif ;HIGH_BIT_DEPTH
356 cglobal calcRecons32, 5,8,4
359 cglobal calcRecons32, 5,7,4,0-1
360 %define t7b byte [rsp]
371 mova m5, [pw_pixel_max]
381 CLIPW2 m0, m1, m4, m5
397 CLIPW2 m0, m1, m4, m5
399 ; store recon[] and recipred[]
409 movu m1, [r0 + r4 + 16]
411 movu m3, [r1 + r4 + 16]
414 CLIPW2 m0, m1, m4, m5
416 ; store recon[] and recipred[]
418 movu [r3 + r6 + 16], m1
424 movu m0, [r0 + r4 + 32]
425 movu m1, [r0 + r4 + 48]
426 movu m2, [r1 + r4 + 32]
427 movu m3, [r1 + r4 + 48]
430 CLIPW2 m0, m1, m4, m5
432 ; store recon[] and recipred[]
433 movu [r3 + r6 + 32], m0
434 movu [r3 + r6 + 48], m1
435 lea r3, [r3 + r6 * 2]
442 lea r0, [r0 + r4 * 2]
443 lea r1, [r1 + r4 * 2]
448 %else ;HIGH_BIT_DEPTH
451 cglobal calcRecons32, 5,8,4
454 cglobal calcRecons32, 5,7,4,0-1
455 %define t7b byte [rsp]
473 paddw m1, [r1 + 0 * 16]
474 paddw m2, [r1 + 1 * 16]
477 paddw m3, [r1 + 2 * 16]
478 paddw m4, [r1 + 3 * 16]
481 ; store recon[] and recipred[]
488 movu [r2 + 0 * 16], m2
489 movu [r2 + 1 * 16], m1
492 movu [r2 + 2 * 16], m4
493 movu [r2 + 3 * 16], m3
498 lea r1, [r1 + r4 * 2]
503 %endif ;HIGH_BIT_DEPTH
506 ;-----------------------------------------------------------------------------
507 ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
508 ;-----------------------------------------------------------------------------
511 cglobal getResidual4, 4,4,4
525 lea r0, [r0 + r3 * 2]
526 lea r1, [r1 + r3 * 2]
527 lea r2, [r2 + r3 * 2]
541 cglobal getResidual4, 4,4,5
555 movhps [r2 + r3 * 2], m1
556 lea r0, [r0 + r3 * 2]
557 lea r1, [r1 + r3 * 2]
558 lea r2, [r2 + r3 * 4]
571 movhps [r2 + r3 * 2], m1
578 cglobal getResidual8, 4,4,4
594 lea r0, [r0 + r3 * 2]
595 lea r1, [r1 + r3 * 2]
596 lea r2, [r2 + r3 * 2]
600 cglobal getResidual8, 4,4,5
617 movu [r2 + r3 * 2], m2
620 lea r0, [r0 + r3 * 2]
621 lea r1, [r1 + r3 * 2]
622 lea r2, [r2 + r3 * 4]
630 cglobal getResidual16, 4,5,6
638 movu m3, [r0 + r3 + 16]
644 movu m5, [r1 + r3 + 16]
647 lea r0, [r0 + r3 * 2]
648 lea r1, [r1 + r3 * 2]
653 movu [r2 + r3 + 16], m3
654 lea r2, [r2 + r3 * 2]
660 movu m3, [r0 + r3 + 16]
666 movu m5, [r1 + r3 + 16]
673 movu [r2 + r3 + 16], m3
677 lea r0, [r0 + r3 * 2]
678 lea r1, [r1 + r3 * 2]
679 lea r2, [r2 + r3 * 2]
685 cglobal getResidual16, 4,5,8
709 movu [r2 + r3 * 2], m6
710 movu [r2 + r3 * 2 + 16], m2
712 lea r0, [r0 + r3 * 2]
713 lea r1, [r1 + r3 * 2]
714 lea r2, [r2 + r3 * 4]
736 movu [r2 + r3 * 2], m6
737 movu [r2 + r3 * 2 + 16], m2
741 lea r0, [r0 + r3 * 2]
742 lea r1, [r1 + r3 * 2]
743 lea r2, [r2 + r3 * 4]
752 cglobal getResidual32, 4,5,6
777 movu m1, [r0 + r3 + 16]
778 movu m2, [r0 + r3 + 32]
779 movu m3, [r0 + r3 + 48]
781 movu m5, [r1 + r3 + 16]
784 movu m4, [r1 + r3 + 32]
785 movu m5, [r1 + r3 + 48]
790 movu [r2 + r3 + 16], m1
791 movu [r2 + r3 + 32], m2
792 movu [r2 + r3 + 48], m3
796 lea r0, [r0 + r3 * 2]
797 lea r1, [r1 + r3 * 2]
798 lea r2, [r2 + r3 * 2]
804 cglobal getResidual32, 4,5,7
818 movu [r2 + 0 * 16], m5
819 movu [r2 + 1 * 16], m1
827 movu [r2 + 2 * 16], m5
828 movu [r2 + 3 * 16], m2
831 movu m2, [r0 + r3 + 16]
833 movu m4, [r1 + r3 + 16]
840 movu [r2 + r3 * 2 + 0 * 16], m5
841 movu [r2 + r3 * 2 + 1 * 16], m1
849 movu [r2 + r3 * 2 + 2 * 16], m5
850 movu [r2 + r3 * 2 + 3 * 16], m2
854 lea r0, [r0 + r3 * 2]
855 lea r1, [r1 + r3 * 2]
856 lea r2, [r2 + r3 * 4]
863 ;-----------------------------------------------------------------------------
864 ; uint32_t quant(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
865 ;-----------------------------------------------------------------------------
; quant (fragment): the visible steps handle [r0]/[r1] and then [r0 + 16]/[r1 + 16].
; r0 = coef and r1 = quantCoeff per the prototype above (x86inc argument numbering).
869 movd m4, r4d ; m4 = qbits
873 movd m6, r4d ; m6 = qbits8
877 pshufd m5, m5, 0 ; m5 = add (low dword replicated to every dword lane)
883 pxor m7, m7 ; m7 = numZero (cleared)
886 movu m0, [r0] ; m0 = level
888 pmulld m1, [r1] ; m1 = tmpLevel1 (m1 * quantCoeff)
890 psrad m2, m4 ; m2 = level1
894 psubd m1, m3 ; m1 = deltaU1
904 movu m0, [r0 + 16] ; m0 = level (next 16 bytes)
906 pmulld m1, [r1 + 16] ; m1 = tmpLevel1 (m1 * quantCoeff)
908 psrad m2, m4 ; m2 = level1
911 psubd m1, m3 ; m1 = deltaU1
; quant, ARCH_X86_64 == 1 side of the conditional (uses vector registers up to m9).
; Each visible step handles one mmsize-wide group at [r0]/[r1], then the group at + mmsize.
; r0 = coef, r1 = quantCoeff, r2 = deltaU per the prototype above.
938 cglobal quant, 5,5,10
940 movd xm4, r4d ; xm4 = qbits (shift count for psrad)
944 movd xm6, r4d ; xm6 = qbits8
947 vpbroadcastd m5, r5m ; m5 = add
949 vpbroadcastw m9, [pw_1] ; m9 = word [1]
953 pxor m7, m7 ; m7 = numZero (cleared)
956 movu m0, [r0] ; m0 = level
958 pmulld m1, [r1] ; m1 = tmpLevel1 (m1 * quantCoeff)
960 psrad m2, xm4 ; m2 = level1
964 psubd m1, m3 ; m1 = deltaU1
969 movu m0, [r0 + mmsize] ; m0 = level (second group)
971 pmulld m1, [r1 + mmsize] ; m1 = tmpLevel1 (m1 * quantCoeff)
973 psrad m3, xm4 ; m3 = level1
977 psubd m1, m8 ; m1 = deltaU1
978 movu [r2 + mmsize], m1 ; store deltaU1 of the second group to deltaU
985 ; count non-zero coefficients
986 ; TODO: popcnt is faster, but some CPUs do not support it
1001 vextracti128 xm1, m7, 1 ; xm1 = upper 128 bits of the m7 accumulator
1008 %else ; ARCH_X86_64 == 1
; quant, %else side of the ARCH_X86_64 conditional (only m0-m7 are used in the visible lines).
; r0 = coef, r1 = quantCoeff, r2 = deltaU, r3 = qCoef per the prototype above.
1010 cglobal quant, 5,6,8
1012 movd xm4, r4d ; xm4 = qbits (shift count for psrad)
1016 movd xm6, r4d ; xm6 = qbits8
1019 vpbroadcastd m5, r5m ; m5 = add
1025 pxor m7, m7 ; m7 = numZero (cleared)
1028 movu m0, [r0] ; m0 = level
1030 pmulld m1, [r1] ; m1 = tmpLevel1 (m1 * quantCoeff)
1032 psrad m2, xm4 ; m2 = level1
1036 psubd m1, m3 ; m1 = deltaU1
1043 vpermq m3, m3, q0020 ; NOTE(review): with x86inc q#### constants this puts qwords 0 and 2 in the low 128 bits - confirm
1047 movu m0, [r0 + mmsize] ; m0 = level (second group)
1049 pmulld m1, [r1 + mmsize] ; m1 = tmpLevel1 (m1 * quantCoeff)
1051 psrad m2, xm4 ; m2 = level1
1055 psubd m1, m3 ; m1 = deltaU1
1057 movu [r2 + mmsize], m1 ; store deltaU1 of the second group to deltaU
1062 vpermq m3, m3, q0020 ; same qword gather as above
1063 movu [r3 + mmsize/2], xm3 ; store the low 128 bits of m3 to qCoef + mmsize/2
1075 vextracti128 xm1, m7, 1 ; xm1 = upper 128 bits of the m7 accumulator
1081 %endif ; ARCH_X86_64 == 1
1085 ;-----------------------------------------------------------------------------
1086 ; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
1087 ;-----------------------------------------------------------------------------
; nquant: visible steps quantize two 16-byte groups, [r0]/[r1] and [r0 + 16]/[r1 + 16].
; r0 = coef, r1 = quantCoeff per the prototype above.
1089 cglobal nquant, 3,5,8
1092 pxor m7, m7 ; m7 = numZero (cleared)
1093 movd m5, r3m ; m5 = qbits
1094 pshufd m6, m6, 0 ; m6 = add (low dword replicated to every dword lane)
1095 mov r3d, r4d ; r3 = numCoeff
1099 movu m0, [r0] ; m0 = level
1100 movu m1, [r0 + 16] ; m1 = level
1103 pmulld m2, [r1] ; m2 = tmpLevel1 * qcoeff (for the level in m0)
1105 psrad m2, m5 ; m2 = level1 (for the level in m0)
1109 pmulld m3, [r1 + 16] ; m3 = tmpLevel1 * qcoeff (for the level in m1)
1111 psrad m3, m5 ; m3 = level1 (for the level in m1)
1132 sub eax, r4d ; numSig
; nquant, ymm-width variant: visible steps quantize [r0]/[r1] and [r0 + mmsize]/[r1 + mmsize].
; r0 = coef, r1 = quantCoeff per the prototype above.
1137 cglobal nquant, 3,5,7
1138 vpbroadcastd m4, r4m ; m4 = add (fifth argument, broadcast to every dword)
1139 vpbroadcastd m6, [pw_1] ; m6 = first dword of pw_1, broadcast to every dword
1141 pxor m5, m5 ; m5 = numZero (cleared)
1142 movd xm3, r3m ; xm3 = qbits (shift count for psrad)
1143 mov r3d, r4d ; r3 = numCoeff
1147 movu m0, [r0] ; m0 = level
1149 pmulld m1, [r1] ; m1 = tmpLevel1 * qcoeff
1151 psrad m1, xm3 ; m1 = level1
1154 movu m0, [r0 + mmsize] ; m0 = level (second group)
1156 pmulld m2, [r1 + mmsize] ; m2 = tmpLevel1 * qcoeff
1158 psrad m2, xm3 ; m2 = level1
1162 vpermq m2, m1, q3120 ; NOTE(review): with x86inc q#### constants this is qword order 0,2,1,3 of m1 - confirm
1177 vextracti128 xm0, m5, 1 ; xm0 = upper 128 bits of the m5 accumulator
1185 ;-----------------------------------------------------------------------------
1186 ; void dequant_normal(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift)
1187 ;-----------------------------------------------------------------------------
1189 cglobal dequant_normal, 5,5,5
1198 movd m0, r4d ; m0 = shift
1202 pshufd m1, m1, 0 ; m1 = dword [add scale]
1208 punpckhwd m4, m3, m2
1210 pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add)
1214 packssdw m3, m3 ; OPT_ME: store must be 32 bits
1230 cglobal dequant_normal, 5,5,7
1231 vpbroadcastd m2, [pw_1] ; m2 = word [1]
1232 vpbroadcastd m5, [pd_32767] ; m5 = dword [32767]
1233 vpbroadcastd m6, [pd_n32768] ; m6 = dword [-32768]
1241 movd xm0, r4d ; m0 = shift
1244 vpbroadcastd m1, r3d ; m1 = dword [add scale]
1252 punpckhwd m4, m3, m2
1254 pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add)
1262 mova [r1 + 0 * mmsize/2], xm3
1263 mova [r1 + 1 * mmsize/2], xm4
1264 vextracti128 [r1 + 2 * mmsize/2], m3, 1
1265 vextracti128 [r1 + 3 * mmsize/2], m4, 1
1275 ;-----------------------------------------------------------------------------
1276 ; int count_nonzero(const int16_t *quantCoeff, int numCoeff);
1277 ;-----------------------------------------------------------------------------
1279 cglobal count_nonzero, 2,2,3
1287 packsswb m2, [r0 + 16]
1301 ;-----------------------------------------------------------------------------------------------------------------------------------------------
1302 ;void weight_pp(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
1303 ;-----------------------------------------------------------------------------------------------------------------------------------------------
; weight_pp: r0 = src, r5 = w0, r6 = round per the prototype above.
; w0 << 6 and round are merged into one GPR before being broadcast into m0.
1305 cglobal weight_pp, 6, 7, 6
1307 shl r5d, 6 ; r5d = w0 << 6
1310 or r6d, r5d ; assuming both (w0<<6) and round are using maximum of 16 bits each.
1312 pshufd m0, m0, 0 ; m0 = [w0<<6, round]
1325 punpcklwd m3, m4, m5 ; m3 = low words of m4 and m5, interleaved
1339 pmovzxbw m4, [r0 + 8] ; m4 = bytes at src + 8, zero-extended to words
1340 punpcklwd m3, m4, m5 ; m3 = low words of m4 and m5, interleaved
; weight_pp, ymm-width variant: r5 = w0, r6 = round, r8m = offset per the prototype above.
; w0 << 6 and round are merged into one GPR before being broadcast into m0.
1369 cglobal weight_pp, 6, 7, 6
1371 shl r5d, 6 ; r5d = w0 << 6
1374 or r6d, r5d ; assuming both (w0<<6) and round are using maximum of 16 bits each.
1376 pshufd xm0, xm0, 0 ; m0 = [w0<<6, round]
1377 vinserti128 m0, m0, xm0, 1 ; copy xm0 into the upper lane; the documentation says (pshufd + vinserti128) can be replaced with vpbroadcastd m0, xm0, but that caused a build problem - needs investigation
1380 vpbroadcastd m2, r8m ; m2 = offset (ninth argument, broadcast to every dword)
1390 punpcklwd m3, m4, m5 ; m3 = low words (per 128-bit lane) of m4 and m5, interleaved
1401 vextracti128 xm4, m3, 1 ; xm4 = upper 128 bits of m3
1418 ;-------------------------------------------------------------------------------------------------------------------------------------------------
1419 ;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
1420 ;-------------------------------------------------------------------------------------------------------------------------------------------------
1423 cglobal weight_sp, 6, 7+2, 7
1426 %else ; ARCH_X86_64 = 0
1427 cglobal weight_sp, 6, 7, 7, 0-(2*4)
1428 %define tmp_r0 [(rsp + 0 * 4)]
1429 %define tmp_r1 [(rsp + 1 * 4)]
1430 %endif ; ARCH_X86_64
1432 movd m0, r6m ; m0 = [w0]
1434 movd m1, r7m ; m1 = [round]
1436 pshufd m0, m0, 0 ; m0 = [w0 round]
1438 movd m1, r8m ; m1 = [shift]
1441 pshufd m2, m2, 0 ; m2 =[offset]
1451 ; save old src and dst
1502 ;-----------------------------------------------------------------
1503 ; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
1504 ;-----------------------------------------------------------------
1506 cglobal transpose4, 3, 3, 4, dest, src, stride
1507 %if HIGH_BIT_DEPTH == 1
1511 movh m2, [r1 + 2 * r2]
1512 lea r1, [r1 + 2 * r2]
1516 punpckhdq m1, m0, m2
1520 %else ;HIGH_BIT_DEPTH == 0
1523 movd m2, [r1 + 2 * r2]
1524 lea r1, [r1 + 2 * r2]
1534 ;-----------------------------------------------------------------
1535 ; void transpose_8x8(pixel *dst, pixel *src, intptr_t stride)
1536 ;-----------------------------------------------------------------
1537 %if HIGH_BIT_DEPTH == 1
1538 %if ARCH_X86_64 == 1
1540 cglobal transpose8, 3, 5, 5
1543 lea r4, [r1 + 4 * r2]
1545 vinserti128 m0, m0, [r4], 1
1547 vinserti128 m1, m1, [r4 + r2], 1
1548 movu xm2, [r1 + 2 * r2]
1549 vinserti128 m2, m2, [r4 + 2 * r2], 1
1551 vinserti128 m3, m3, [r4 + r3], 1
1553 punpcklwd m4, m0, m1 ;[1 - 4][row1row2;row5row6]
1554 punpckhwd m0, m1 ;[5 - 8][row1row2;row5row6]
1556 punpcklwd m1, m2, m3 ;[1 - 4][row3row4;row7row8]
1557 punpckhwd m2, m3 ;[5 - 8][row3row4;row7row8]
1559 punpckldq m3, m4, m1 ;[1 - 2][row1row2row3row4;row5row6row7row8]
1560 punpckhdq m4, m1 ;[3 - 4][row1row2row3row4;row5row6row7row8]
1562 punpckldq m1, m0, m2 ;[5 - 6][row1row2row3row4;row5row6row7row8]
1563 punpckhdq m0, m2 ;[7 - 8][row1row2row3row4;row5row6row7row8]
1565 vpermq m3, m3, 0xD8 ;[1 ; 2][row1row2row3row4row5row6row7row8]
1566 vpermq m4, m4, 0xD8 ;[3 ; 4][row1row2row3row4row5row6row7row8]
1567 vpermq m1, m1, 0xD8 ;[5 ; 6][row1row2row3row4row5row6row7row8]
1568 vpermq m0, m0, 0xD8 ;[7 ; 8][row1row2row3row4row5row6row7row8]
1570 movu [r0 + 0 * 32], m3
1571 movu [r0 + 1 * 32], m4
1572 movu [r0 + 2 * 32], m1
1573 movu [r0 + 3 * 32], m0
1578 %macro TRANSPOSE_4x4 1
1581 movh m2, [r1 + 2 * r2]
1582 lea r1, [r1 + 2 * r2]
1586 punpckhdq m1, m0, m2
1589 movhps [r0 + %1], m0
1590 movh [r0 + 2 * %1], m1
1591 lea r0, [r0 + 2 * %1]
1592 movhps [r0 + %1], m1
1594 cglobal transpose8_internal
1596 lea r1, [r1 + 2 * r2]
1599 lea r1, [r1 + 2 * r2]
1601 lea r1, [r1 + r2 * 8 + 8]
1603 lea r0, [r3 + 4 * r5]
1605 lea r1, [r1 + 2 * r2]
1606 lea r0, [r3 + 8 + 4 * r5]
1609 cglobal transpose8, 3, 6, 4, dest, src, stride
1613 call transpose8_internal
1615 %else ;HIGH_BIT_DEPTH == 0
1616 %if ARCH_X86_64 == 1
; transpose8 (HIGH_BIT_DEPTH == 0, ARCH_X86_64 == 1): r1 = src, r2 = stride per the prototype above.
; Rows are paired into 128-bit lanes, interleaved byte-wise and then word-wise, and finally
; regrouped with the dword index table trans8_shuf (0, 4, 1, 5, 2, 6, 3, 7).
1618 cglobal transpose8, 3, 4, 4
1621 movhps xm0, [r1 + 2 * r2] ; high 8 bytes of xm0 = row at src + 2 * stride
1623 movhps xm1, [r1 + r3] ; high 8 bytes of xm1 = row at src + r3
1624 lea r1, [r1 + 4 * r2] ; advance src by four rows
1626 movhps xm2, [r1 + 2 * r2] ; high 8 bytes of xm2 = row at src + 2 * stride
1628 movhps xm3, [r1 + r3] ; high 8 bytes of xm3 = row at src + r3
1630 vinserti128 m0, m0, xm2, 1 ;[row1 row3 row5 row7]
1631 vinserti128 m1, m1, xm3, 1 ;[row2 row4 row6 row8]
1633 punpcklbw m2, m0, m1 ;[1 - 8; 1 - 8][row1row2; row5row6]
1634 punpckhbw m0, m1 ;[1 - 8; 1 - 8][row3row4; row7row8]
1636 punpcklwd m1, m2, m0 ;[1 - 4; 1 - 4][row1row2row3row4; row5row6row7row8]
1637 punpckhwd m2, m0 ;[5 - 8; 5 - 8][row1row2row3row4; row5row6row7row8]
1639 mova m0, [trans8_shuf] ; m0 = dword permutation indices
1641 vpermd m1, m0, m1 ;[1 - 2; 3 - 4][row1row2row3row4row5row6row7row8]
1642 vpermd m2, m0, m2 ;[5 - 6; 7 - 8][row1row2row3row4row5row6row7row8]
1650 cglobal transpose8, 3, 5, 8, dest, src, stride
1657 movh m4, [r1 + 4 * r2]
1658 lea r1, [r1 + 4 * r2]
1668 punpckhwd m1, m0, m2
1670 punpckhwd m5, m4, m6
1672 punpckhdq m2, m0, m4
1674 punpckhdq m3, m1, m5
1684 %macro TRANSPOSE_8x8 1
1688 movh m2, [r1 + 2 * r2]
1689 lea r1, [r1 + 2 * r2]
1691 movh m4, [r1 + 2 * r2]
1692 lea r1, [r1 + 2 * r2]
1694 movh m6, [r1 + 2 * r2]
1695 lea r1, [r1 + 2 * r2]
1703 punpckhwd m1, m0, m2
1705 punpckhwd m5, m4, m6
1707 punpckhdq m2, m0, m4
1709 punpckhdq m3, m1, m5
1713 movhps [r0 + %1], m0
1714 movh [r0 + 2 * %1], m2
1715 lea r0, [r0 + 2 * %1]
1716 movhps [r0 + %1], m2
1717 movh [r0 + 2 * %1], m1
1718 lea r0, [r0 + 2 * %1]
1719 movhps [r0 + %1], m1
1720 movh [r0 + 2 * %1], m3
1721 lea r0, [r0 + 2 * %1]
1722 movhps [r0 + %1], m3
1727 ;-----------------------------------------------------------------
1728 ; void transpose_16x16(pixel *dst, pixel *src, intptr_t stride)
1729 ;-----------------------------------------------------------------
1730 %if HIGH_BIT_DEPTH == 1
1731 %if ARCH_X86_64 == 1
1733 cglobal transpose16x8_internal
1736 movu m2, [r1 + 2 * r2]
1738 lea r1, [r1 + 4 * r2]
1742 movu m6, [r1 + 2 * r2]
1745 punpcklwd m8, m0, m1 ;[1 - 4; 9 - 12][1 2]
1746 punpckhwd m0, m1 ;[5 - 8; 13 -16][1 2]
1748 punpcklwd m1, m2, m3 ;[1 - 4; 9 - 12][3 4]
1749 punpckhwd m2, m3 ;[5 - 8; 13 -16][3 4]
1751 punpcklwd m3, m4, m5 ;[1 - 4; 9 - 12][5 6]
1752 punpckhwd m4, m5 ;[5 - 8; 13 -16][5 6]
1754 punpcklwd m5, m6, m7 ;[1 - 4; 9 - 12][7 8]
1755 punpckhwd m6, m7 ;[5 - 8; 13 -16][7 8]
1757 punpckldq m7, m8, m1 ;[1 - 2; 9 - 10][1 2 3 4]
1758 punpckhdq m8, m1 ;[3 - 4; 11 - 12][1 2 3 4]
1760 punpckldq m1, m3, m5 ;[1 - 2; 9 - 10][5 6 7 8]
1761 punpckhdq m3, m5 ;[3 - 4; 11 - 12][5 6 7 8]
1763 punpckldq m5, m0, m2 ;[5 - 6; 13 - 14][1 2 3 4]
1764 punpckhdq m0, m2 ;[7 - 8; 15 - 16][1 2 3 4]
1766 punpckldq m2, m4, m6 ;[5 - 6; 13 - 14][5 6 7 8]
1767 punpckhdq m4, m6 ;[7 - 8; 15 - 16][5 6 7 8]
1769 punpcklqdq m6, m7, m1 ;[1 ; 9 ][1 2 3 4 5 6 7 8]
1770 punpckhqdq m7, m1 ;[2 ; 10][1 2 3 4 5 6 7 8]
1772 punpcklqdq m1, m8, m3 ;[3 ; 11][1 2 3 4 5 6 7 8]
1773 punpckhqdq m8, m3 ;[4 ; 12][1 2 3 4 5 6 7 8]
1775 punpcklqdq m3, m5, m2 ;[5 ; 13][1 2 3 4 5 6 7 8]
1776 punpckhqdq m5, m2 ;[6 ; 14][1 2 3 4 5 6 7 8]
1778 punpcklqdq m2, m0, m4 ;[7 ; 15][1 2 3 4 5 6 7 8]
1779 punpckhqdq m0, m4 ;[8 ; 16][1 2 3 4 5 6 7 8]
1781 movu [r0 + 0 * 32], xm6
1782 vextracti128 [r0 + 8 * 32], m6, 1
1783 movu [r0 + 1 * 32], xm7
1784 vextracti128 [r0 + 9 * 32], m7, 1
1785 movu [r0 + 2 * 32], xm1
1786 vextracti128 [r0 + 10 * 32], m1, 1
1787 movu [r0 + 3 * 32], xm8
1788 vextracti128 [r0 + 11 * 32], m8, 1
1789 movu [r0 + 4 * 32], xm3
1790 vextracti128 [r0 + 12 * 32], m3, 1
1791 movu [r0 + 5 * 32], xm5
1792 vextracti128 [r0 + 13 * 32], m5, 1
1793 movu [r0 + 6 * 32], xm2
1794 vextracti128 [r0 + 14 * 32], m2, 1
1795 movu [r0 + 7 * 32], xm0
1796 vextracti128 [r0 + 15 * 32], m0, 1
1799 cglobal transpose16, 3, 4, 9
1802 call transpose16x8_internal
1803 lea r1, [r1 + 4 * r2]
1805 call transpose16x8_internal
1809 cglobal transpose16, 3, 7, 4, dest, src, stride
1815 call transpose8_internal
1816 lea r1, [r1 - 8 + 2 * r2]
1819 call transpose8_internal
1821 lea r0, [r6 + 8 * r5]
1823 call transpose8_internal
1824 lea r1, [r1 - 8 + 2 * r2]
1825 lea r0, [r6 + 8 * r5 + 16]
1827 call transpose8_internal
1829 %else ;HIGH_BIT_DEPTH == 0
1830 %if ARCH_X86_64 == 1
1832 cglobal transpose16, 3, 5, 9
1834 lea r4, [r1 + 8 * r2]
1838 movu xm2, [r1 + 2 * r2]
1840 vinserti128 m0, m0, [r4], 1
1841 vinserti128 m1, m1, [r4 + r2], 1
1842 vinserti128 m2, m2, [r4 + 2 * r2], 1
1843 vinserti128 m3, m3, [r4 + r3], 1
1844 lea r1, [r1 + 4 * r2]
1845 lea r4, [r4 + 4 * r2]
1849 movu xm6, [r1 + 2 * r2]
1851 vinserti128 m4, m4, [r4], 1
1852 vinserti128 m5, m5, [r4 + r2], 1
1853 vinserti128 m6, m6, [r4 + 2 * r2], 1
1854 vinserti128 m7, m7, [r4 + r3], 1
1856 punpcklbw m8, m0, m1 ;[1 - 8 ; 1 - 8 ][1 2 9 10]
1857 punpckhbw m0, m1 ;[9 - 16; 9 - 16][1 2 9 10]
1859 punpcklbw m1, m2, m3 ;[1 - 8 ; 1 - 8 ][3 4 11 12]
1860 punpckhbw m2, m3 ;[9 - 16; 9 - 16][3 4 11 12]
1862 punpcklbw m3, m4, m5 ;[1 - 8 ; 1 - 8 ][5 6 13 14]
1863 punpckhbw m4, m5 ;[9 - 16; 9 - 16][5 6 13 14]
1865 punpcklbw m5, m6, m7 ;[1 - 8 ; 1 - 8 ][7 8 15 16]
1866 punpckhbw m6, m7 ;[9 - 16; 9 - 16][7 8 15 16]
1868 punpcklwd m7, m8, m1 ;[1 - 4 ; 1 - 4][1 2 3 4 9 10 11 12]
1869 punpckhwd m8, m1 ;[5 - 8 ; 5 - 8][1 2 3 4 9 10 11 12]
1871 punpcklwd m1, m3, m5 ;[1 - 4 ; 1 - 4][5 6 7 8 13 14 15 16]
1872 punpckhwd m3, m5 ;[5 - 8 ; 5 - 8][5 6 7 8 13 14 15 16]
1874 punpcklwd m5, m0, m2 ;[9 - 12; 9 - 12][1 2 3 4 9 10 11 12]
1875 punpckhwd m0, m2 ;[13- 16; 13 - 16][1 2 3 4 9 10 11 12]
1877 punpcklwd m2, m4, m6 ;[9 - 12; 9 - 12][5 6 7 8 13 14 15 16]
1878 punpckhwd m4, m6 ;[13- 16; 13 - 16][5 6 7 8 13 14 15 16]
1880 punpckldq m6, m7, m1 ;[1 - 2 ; 1 - 2][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1881 punpckhdq m7, m1 ;[3 - 4 ; 3 - 4][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1883 punpckldq m1, m8, m3 ;[5 - 6 ; 5 - 6][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1884 punpckhdq m8, m3 ;[7 - 8 ; 7 - 8][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1886 punpckldq m3, m5, m2 ;[9 - 10; 9 - 10][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1887 punpckhdq m5, m2 ;[11- 12; 11 - 12][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1889 punpckldq m2, m0, m4 ;[13- 14; 13 - 14][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1890 punpckhdq m0, m4 ;[15- 16; 15 - 16][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1901 movu [r0 + 0 * 16], m6
1902 movu [r0 + 2 * 16], m7
1903 movu [r0 + 4 * 16], m1
1904 movu [r0 + 6 * 16], m8
1905 movu [r0 + 8 * 16], m3
1906 movu [r0 + 10 * 16], m5
1907 movu [r0 + 12 * 16], m2
1908 movu [r0 + 14 * 16], m0
1912 cglobal transpose16, 3, 5, 8, dest, src, stride
1916 lea r1, [r1 + 2 * r2]
1920 lea r0, [r3 + 8 * 16]
1922 lea r1, [r1 + 2 * r2]
1923 lea r0, [r3 + 8 * 16 + 8]
1928 cglobal transpose16_internal
1930 lea r1, [r1 + 2 * r2]
1933 lea r1, [r1 + 2 * r2]
1935 lea r1, [r1 + r2 * 8]
1936 lea r1, [r1 + r2 * 8 + 8]
1938 lea r0, [r5 + 8 * r6]
1940 lea r1, [r1 + 2 * r2]
1941 lea r0, [r5 + 8 * r6 + 8]
1945 ;-----------------------------------------------------------------
1946 ; void transpose_32x32(pixel *dst, pixel *src, intptr_t stride)
1947 ;-----------------------------------------------------------------
1948 %if HIGH_BIT_DEPTH == 1
1949 %if ARCH_X86_64 == 1
1951 cglobal transpose8x32_internal
1955 movu m3, [r1 + r2 + 32]
1956 movu m4, [r1 + 2 * r2]
1957 movu m5, [r1 + 2 * r2 + 32]
1959 movu m7, [r1 + r3 + 32]
1960 lea r1, [r1 + 4 * r2]
1962 punpcklwd m8, m0, m2 ;[1 - 4; 9 - 12][1 2]
1963 punpckhwd m0, m2 ;[5 - 8; 13 - 16][1 2]
1965 punpcklwd m2, m4, m6 ;[1 - 4; 9 - 12][3 4]
1966 punpckhwd m4, m6 ;[5 - 8; 13 - 16][3 4]
1968 punpcklwd m6, m1, m3 ;[17 - 20; 25 - 28][1 2]
1969 punpckhwd m1, m3 ;[21 - 24; 29 - 32][1 2]
1971 punpcklwd m3, m5, m7 ;[17 - 20; 25 - 28][3 4]
1972 punpckhwd m5, m7 ;[21 - 24; 29 - 32][3 4]
1974 punpckldq m7, m8, m2 ;[1 - 2; 9 - 10][1 2 3 4]
1975 punpckhdq m8, m2 ;[3 - 4; 11 - 12][1 2 3 4]
1977 punpckldq m2, m0, m4 ;[5 - 6; 13 - 14][1 2 3 4]
1978 punpckhdq m0, m4 ;[7 - 8; 15 - 16][1 2 3 4]
1980 punpckldq m4, m6, m3 ;[17 - 18; 25 - 26][1 2 3 4]
1981 punpckhdq m6, m3 ;[19 - 20; 27 - 28][1 2 3 4]
1983 punpckldq m3, m1, m5 ;[21 - 22; 29 - 30][1 2 3 4]
1984 punpckhdq m1, m5 ;[23 - 24; 31 - 32][1 2 3 4]
1986 movq [r0 + 0 * 64], xm7
1987 movhps [r0 + 1 * 64], xm7
1988 vextracti128 xm5, m7, 1
1989 movq [r0 + 8 * 64], xm5
1990 movhps [r0 + 9 * 64], xm5
1995 movu m11, [r1 + r2 + 32]
1996 movu m12, [r1 + 2 * r2]
1997 movu m13, [r1 + 2 * r2 + 32]
1999 movu m15, [r1 + r3 + 32]
2001 punpcklwd m5, m7, m10 ;[1 - 4; 9 - 12][5 6]
2002 punpckhwd m7, m10 ;[5 - 8; 13 - 16][5 6]
2004 punpcklwd m10, m12, m14 ;[1 - 4; 9 - 12][7 8]
2005 punpckhwd m12, m14 ;[5 - 8; 13 - 16][7 8]
2007 punpcklwd m14, m9, m11 ;[17 - 20; 25 - 28][5 6]
2008 punpckhwd m9, m11 ;[21 - 24; 29 - 32][5 6]
2010 punpcklwd m11, m13, m15 ;[17 - 20; 25 - 28][7 8]
2011 punpckhwd m13, m15 ;[21 - 24; 29 - 32][7 8]
2013 punpckldq m15, m5, m10 ;[1 - 2; 9 - 10][5 6 7 8]
2014 punpckhdq m5, m10 ;[3 - 4; 11 - 12][5 6 7 8]
2016 punpckldq m10, m7, m12 ;[5 - 6; 13 - 14][5 6 7 8]
2017 punpckhdq m7, m12 ;[7 - 8; 15 - 16][5 6 7 8]
2019 punpckldq m12, m14, m11 ;[17 - 18; 25 - 26][5 6 7 8]
2020 punpckhdq m14, m11 ;[19 - 20; 27 - 28][5 6 7 8]
2022 punpckldq m11, m9, m13 ;[21 - 22; 29 - 30][5 6 7 8]
2023 punpckhdq m9, m13 ;[23 - 24; 31 - 32][5 6 7 8]
2025 movq [r0 + 0 * 64 + 8], xm15
2026 movhps [r0 + 1 * 64 + 8], xm15
2027 vextracti128 xm13, m15, 1
2028 movq [r0 + 8 * 64 + 8], xm13
2029 movhps [r0 + 9 * 64 + 8], xm13
2031 punpcklqdq m13, m8, m5 ;[3 ; 11][1 2 3 4 5 6 7 8]
2032 punpckhqdq m8, m5 ;[4 ; 12][1 2 3 4 5 6 7 8]
2034 punpcklqdq m5, m2, m10 ;[5 ; 13][1 2 3 4 5 6 7 8]
2035 punpckhqdq m2, m10 ;[6 ; 14][1 2 3 4 5 6 7 8]
2037 punpcklqdq m10, m0, m7 ;[7 ; 15][1 2 3 4 5 6 7 8]
2038 punpckhqdq m0, m7 ;[8 ; 16][1 2 3 4 5 6 7 8]
2040 punpcklqdq m7, m4, m12 ;[17 ; 25][1 2 3 4 5 6 7 8]
2041 punpckhqdq m4, m12 ;[18 ; 26][1 2 3 4 5 6 7 8]
2043 punpcklqdq m12, m6, m14 ;[19 ; 27][1 2 3 4 5 6 7 8]
2044 punpckhqdq m6, m14 ;[20 ; 28][1 2 3 4 5 6 7 8]
2046 punpcklqdq m14, m3, m11 ;[21 ; 29][1 2 3 4 5 6 7 8]
2047 punpckhqdq m3, m11 ;[22 ; 30][1 2 3 4 5 6 7 8]
2049 punpcklqdq m11, m1, m9 ;[23 ; 31][1 2 3 4 5 6 7 8]
2050 punpckhqdq m1, m9 ;[24 ; 32][1 2 3 4 5 6 7 8]
2052 movu [r0 + 2 * 64], xm13
2053 vextracti128 [r0 + 10 * 64], m13, 1
2055 movu [r0 + 3 * 64], xm8
2056 vextracti128 [r0 + 11 * 64], m8, 1
2058 movu [r0 + 4 * 64], xm5
2059 vextracti128 [r0 + 12 * 64], m5, 1
2061 movu [r0 + 5 * 64], xm2
2062 vextracti128 [r0 + 13 * 64], m2, 1
2064 movu [r0 + 6 * 64], xm10
2065 vextracti128 [r0 + 14 * 64], m10, 1
2067 movu [r0 + 7 * 64], xm0
2068 vextracti128 [r0 + 15 * 64], m0, 1
2070 movu [r0 + 16 * 64], xm7
2071 vextracti128 [r0 + 24 * 64], m7, 1
2073 movu [r0 + 17 * 64], xm4
2074 vextracti128 [r0 + 25 * 64], m4, 1
2076 movu [r0 + 18 * 64], xm12
2077 vextracti128 [r0 + 26 * 64], m12, 1
2079 movu [r0 + 19 * 64], xm6
2080 vextracti128 [r0 + 27 * 64], m6, 1
2082 movu [r0 + 20 * 64], xm14
2083 vextracti128 [r0 + 28 * 64], m14, 1
2085 movu [r0 + 21 * 64], xm3
2086 vextracti128 [r0 + 29 * 64], m3, 1
2088 movu [r0 + 22 * 64], xm11
2089 vextracti128 [r0 + 30 * 64], m11, 1
2091 movu [r0 + 23 * 64], xm1
2092 vextracti128 [r0 + 31 * 64], m1, 1
2095 cglobal transpose32, 3, 4, 16
2098 call transpose8x32_internal
2100 lea r1, [r1 + 4 * r2]
2101 call transpose8x32_internal
2103 lea r1, [r1 + 4 * r2]
2104 call transpose8x32_internal
2106 lea r1, [r1 + 4 * r2]
2107 call transpose8x32_internal
2111 cglobal transpose32, 3, 7, 4, dest, src, stride
2117 call transpose8_internal
2118 lea r1, [r1 - 8 + 2 * r2]
2121 call transpose8_internal
2122 lea r1, [r1 - 8 + 2 * r2]
2125 call transpose8_internal
2126 lea r1, [r1 - 8 + 2 * r2]
2129 call transpose8_internal
2131 lea r0, [r6 + 8 * 64]
2133 call transpose8_internal
2134 lea r1, [r1 - 8 + 2 * r2]
2135 lea r0, [r6 + 8 * 64 + 16]
2137 call transpose8_internal
2138 lea r1, [r1 - 8 + 2 * r2]
2139 lea r0, [r6 + 8 * 64 + 32]
2141 call transpose8_internal
2142 lea r1, [r1 - 8 + 2 * r2]
2143 lea r0, [r6 + 8 * 64 + 48]
2145 call transpose8_internal
2147 lea r0, [r6 + 16 * 64]
2149 call transpose8_internal
2150 lea r1, [r1 - 8 + 2 * r2]
2151 lea r0, [r6 + 16 * 64 + 16]
2153 call transpose8_internal
2154 lea r1, [r1 - 8 + 2 * r2]
2155 lea r0, [r6 + 16 * 64 + 32]
2157 call transpose8_internal
2158 lea r1, [r1 - 8 + 2 * r2]
2159 lea r0, [r6 + 16 * 64 + 48]
2161 call transpose8_internal
2163 lea r0, [r6 + 24 * 64]
2165 call transpose8_internal
2166 lea r1, [r1 - 8 + 2 * r2]
2167 lea r0, [r6 + 24 * 64 + 16]
2169 call transpose8_internal
2170 lea r1, [r1 - 8 + 2 * r2]
2171 lea r0, [r6 + 24 * 64 + 32]
2173 call transpose8_internal
2174 lea r1, [r1 - 8 + 2 * r2]
2175 lea r0, [r6 + 24 * 64 + 48]
2177 call transpose8_internal
2179 %else ;HIGH_BIT_DEPTH == 0
2181 cglobal transpose32, 3, 7, 8, dest, src, stride
2186 call transpose16_internal
2187 lea r1, [r1 - 8 + 2 * r2]
2190 call transpose16_internal
2192 lea r0, [r3 + 16 * 32]
2194 call transpose16_internal
2195 lea r1, [r1 - 8 + 2 * r2]
2196 lea r0, [r3 + 16 * 32 + 16]
2198 call transpose16_internal
2201 %if ARCH_X86_64 == 1
2203 cglobal transpose32, 3, 5, 16
2210 movu m2, [r1 + 2 * r2]
2212 lea r1, [r1 + 4 * r2]
2216 movu m6, [r1 + 2 * r2]
2219 punpcklbw m8, m0, m1 ;[1 - 8 ; 17 - 24][1 2]
2220 punpckhbw m0, m1 ;[9 - 16; 25 - 32][1 2]
2222 punpcklbw m1, m2, m3 ;[1 - 8 ; 17 - 24][3 4]
2223 punpckhbw m2, m3 ;[9 - 16; 25 - 32][3 4]
2225 punpcklbw m3, m4, m5 ;[1 - 8 ; 17 - 24][5 6]
2226 punpckhbw m4, m5 ;[9 - 16; 25 - 32][5 6]
2228 punpcklbw m5, m6, m7 ;[1 - 8 ; 17 - 24][7 8]
2229 punpckhbw m6, m7 ;[9 - 16; 25 - 32][7 8]
2231 punpcklwd m7, m8, m1 ;[1 - 4 ; 17 - 20][1 2 3 4]
2232 punpckhwd m8, m1 ;[5 - 8 ; 20 - 24][1 2 3 4]
2234 punpcklwd m1, m3, m5 ;[1 - 4 ; 17 - 20][5 6 7 8]
2235 punpckhwd m3, m5 ;[5 - 8 ; 20 - 24][5 6 7 8]
2237 punpcklwd m5, m0, m2 ;[9 - 12; 25 - 28][1 2 3 4]
2238 punpckhwd m0, m2 ;[13- 15; 29 - 32][1 2 3 4]
2240 punpcklwd m2, m4, m6 ;[9 - 12; 25 - 28][5 6 7 8]
2241 punpckhwd m4, m6 ;[13- 15; 29 - 32][5 6 7 8]
2243 punpckldq m6, m7, m1 ;[1 - 2 ; 17 - 18][1 2 3 4 5 6 7 8]
2244 punpckhdq m7, m1 ;[3 - 4 ; 19 - 20][1 2 3 4 5 6 7 8]
2246 punpckldq m1, m8, m3 ;[5 - 6 ; 21 - 22][1 2 3 4 5 6 7 8]
2247 punpckhdq m8, m3 ;[7 - 8 ; 23 - 24][1 2 3 4 5 6 7 8]
2249 punpckldq m3, m5, m2 ;[9 - 10; 25 - 26][1 2 3 4 5 6 7 8]
2250 punpckhdq m5, m2 ;[11- 12; 27 - 28][1 2 3 4 5 6 7 8]
2252 punpckldq m2, m0, m4 ;[13- 14; 29 - 30][1 2 3 4 5 6 7 8]
2253 punpckhdq m0, m4 ;[15- 16; 31 - 32][1 2 3 4 5 6 7 8]
2255 movq [r0 + 0 * 32], xm6
2256 movhps [r0 + 1 * 32], xm6
2257 vextracti128 xm4, m6, 1
2258 movq [r0 + 16 * 32], xm4
2259 movhps [r0 + 17 * 32], xm4
2261 lea r1, [r1 + 4 * r2]
2264 movu m11, [r1 + 2 * r2]
2266 lea r1, [r1 + 4 * r2]
2270 movu m15, [r1 + 2 * r2]
2273 punpcklbw m4, m9, m10 ;[1 - 8 ; 17 - 24][9 10]
2274 punpckhbw m9, m10 ;[9 - 16; 25 - 32][9 10]
2276 punpcklbw m10, m11, m12 ;[1 - 8 ; 17 - 24][11 12]
2277 punpckhbw m11, m12 ;[9 - 16; 25 - 32][11 12]
2279 punpcklbw m12, m13, m14 ;[1 - 8 ; 17 - 24][13 14]
2280 punpckhbw m13, m14 ;[9 - 16; 25 - 32][13 14]
2282 punpcklbw m14, m15, m6 ;[1 - 8 ; 17 - 24][15 16]
2283 punpckhbw m15, m6 ;[9 - 16; 25 - 32][15 16]
2285 punpcklwd m6, m4, m10 ;[1 - 4 ; 17 - 20][9 10 11 12]
2286 punpckhwd m4, m10 ;[5 - 8 ; 20 - 24][9 10 11 12]
2288 punpcklwd m10, m12, m14 ;[1 - 4 ; 17 - 20][13 14 15 16]
2289 punpckhwd m12, m14 ;[5 - 8 ; 20 - 24][13 14 15 16]
2291 punpcklwd m14, m9, m11 ;[9 - 12; 25 - 28][9 10 11 12]
2292 punpckhwd m9, m11 ;[13- 16; 29 - 32][9 10 11 12]
2294 punpcklwd m11, m13, m15 ;[9 - 12; 25 - 28][13 14 15 16]
2295 punpckhwd m13, m15 ;[13- 16; 29 - 32][13 14 15 16]
2297 punpckldq m15, m6, m10 ;[1 - 2 ; 17 - 18][9 10 11 12 13 14 15 16]
2298 punpckhdq m6, m10 ;[3 - 4 ; 19 - 20][9 10 11 12 13 14 15 16]
2300 punpckldq m10, m4, m12 ;[5 - 6 ; 21 - 22][9 10 11 12 13 14 15 16]
2301 punpckhdq m4, m12 ;[7 - 8 ; 23 - 24][9 10 11 12 13 14 15 16]
2303 punpckldq m12, m14, m11 ;[9 - 10; 25 - 26][9 10 11 12 13 14 15 16]
2304 punpckhdq m14, m11 ;[11- 12; 27 - 28][9 10 11 12 13 14 15 16]
2306 punpckldq m11, m9, m13 ;[13- 14; 29 - 30][9 10 11 12 13 14 15 16]
2307 punpckhdq m9, m13 ;[15- 16; 31 - 32][9 10 11 12 13 14 15 16]
2310 punpcklqdq m13, m7, m6 ;[3 ; 19][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2311 punpckhqdq m7, m6 ;[4 ; 20][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2313 punpcklqdq m6, m1, m10 ;[5 ; 21][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2314 punpckhqdq m1, m10 ;[6 ; 22][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2316 punpcklqdq m10, m8, m4 ;[7 ; 23][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2317 punpckhqdq m8, m4 ;[8 ; 24][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2319 punpcklqdq m4, m3, m12 ;[9 ; 25][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2320 punpckhqdq m3, m12 ;[10; 26][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2322 punpcklqdq m12, m5, m14 ;[11; 27][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2323 punpckhqdq m5, m14 ;[12; 28][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2325 punpcklqdq m14, m2, m11 ;[13; 29][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2326 punpckhqdq m2, m11 ;[14; 30][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2328 punpcklqdq m11, m0, m9 ;[15; 31][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2329 punpckhqdq m0, m9 ;[16; 32][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2331 movq [r0 + 0 * 32 + 8], xm15
2332 movhps [r0 + 1 * 32 + 8], xm15
2333 vextracti128 xm9, m15, 1
2334 movq [r0 + 16 * 32 + 8], xm9
2335 movhps [r0 + 17 * 32 + 8], xm9
2337 movu [r0 + 2 * 32], xm13
2338 vextracti128 [r0 + 18 * 32], m13, 1
2340 movu [r0 + 3 * 32], xm7
2341 vextracti128 [r0 + 19 * 32], m7, 1
2343 movu [r0 + 4 * 32], xm6
2344 vextracti128 [r0 + 20 * 32], m6, 1
2346 movu [r0 + 5 * 32], xm1
2347 vextracti128 [r0 + 21 * 32], m1, 1
2349 movu [r0 + 6 * 32], xm10
2350 vextracti128 [r0 + 22 * 32], m10, 1
2352 movu [r0 + 7 * 32], xm8
2353 vextracti128 [r0 + 23 * 32], m8, 1
2355 movu [r0 + 8 * 32], xm4
2356 vextracti128 [r0 + 24 * 32], m4, 1
2358 movu [r0 + 9 * 32], xm3
2359 vextracti128 [r0 + 25 * 32], m3, 1
2361 movu [r0 + 10 * 32], xm12
2362 vextracti128 [r0 + 26 * 32], m12, 1
2364 movu [r0 + 11 * 32], xm5
2365 vextracti128 [r0 + 27 * 32], m5, 1
2367 movu [r0 + 12 * 32], xm14
2368 vextracti128 [r0 + 28 * 32], m14, 1
2370 movu [r0 + 13 * 32], xm2
2371 vextracti128 [r0 + 29 * 32], m2, 1
2373 movu [r0 + 14 * 32], xm11
2374 vextracti128 [r0 + 30 * 32], m11, 1
2376 movu [r0 + 15 * 32], xm0
2377 vextracti128 [r0 + 31 * 32], m0, 1
2380 lea r1, [r1 + 4 * r2]
2387 ;-----------------------------------------------------------------
2388 ; void transpose_64x64(pixel *dst, pixel *src, intptr_t stride)
2389 ;-----------------------------------------------------------------
2390 %if HIGH_BIT_DEPTH == 1
2391 %if ARCH_X86_64 == 1
; transpose8x32_64_internal -- helper reached via `call` from the AVX2
; transpose64 driver of the HIGH_BIT_DEPTH branch.
;
; Going by the [dst rows][src rows] lane annotations below, one invocation
; interleaves eight source rows at word granularity (wd -> dq -> qdq) and
; writes 32 destination rows through r0 with a 128-byte destination pitch.
;   r0 : destination base for this tile
;   r1 : source row pointer; advanced by 4 * r2 after the first four rows
;   r2 : source stride as used by the addressing below
;   r3 : offset of the fourth row of each group -- presumably 3 * r2, TODO confirm
; NOTE(review): several registers (e.g. m0, m1, m2, m6, m9, m10, m14) are
; consumed without a visible load, r3 is never set and no return is visible;
; parts of this routine do not appear in this listing.
2393 cglobal transpose8x32_64_internal
; source rows 1-4: the visible loads read each row as two halves (+0 / +32)
2397 movu m3, [r1 + r2 + 32]
2398 movu m4, [r1 + 2 * r2]
2399 movu m5, [r1 + 2 * r2 + 32]
2401 movu m7, [r1 + r3 + 32]
2402 lea r1, [r1 + 4 * r2]
; word interleave of row pairs, then dword interleave -> 4 source words per dst row
2404 punpcklwd m8, m0, m2 ;[1 - 4; 9 - 12][1 2]
2405 punpckhwd m0, m2 ;[5 - 8; 13 - 16][1 2]
2407 punpcklwd m2, m4, m6 ;[1 - 4; 9 - 12][3 4]
2408 punpckhwd m4, m6 ;[5 - 8; 13 - 16][3 4]
2410 punpcklwd m6, m1, m3 ;[17 - 20; 25 - 28][1 2]
2411 punpckhwd m1, m3 ;[21 - 24; 29 - 32][1 2]
2413 punpcklwd m3, m5, m7 ;[17 - 20; 25 - 28][3 4]
2414 punpckhwd m5, m7 ;[21 - 24; 29 - 32][3 4]
2416 punpckldq m7, m8, m2 ;[1 - 2; 9 - 10][1 2 3 4]
2417 punpckhdq m8, m2 ;[3 - 4; 11 - 12][1 2 3 4]
2419 punpckldq m2, m0, m4 ;[5 - 6; 13 - 14][1 2 3 4]
2420 punpckhdq m0, m4 ;[7 - 8; 15 - 16][1 2 3 4]
2422 punpckldq m4, m6, m3 ;[17 - 18; 25 - 26][1 2 3 4]
2423 punpckhdq m6, m3 ;[19 - 20; 27 - 28][1 2 3 4]
2425 punpckldq m3, m1, m5 ;[21 - 22; 29 - 30][1 2 3 4]
2426 punpckhdq m1, m5 ;[23 - 24; 31 - 32][1 2 3 4]
; m7 is flushed early: first 8 bytes of dst rows 0/1 (low lane) and 8/9 (high lane)
2428 movq [r0 + 0 * 128], xm7
2429 movhps [r0 + 1 * 128], xm7
2430 vextracti128 xm5, m7, 1
2431 movq [r0 + 8 * 128], xm5
2432 movhps [r0 + 9 * 128], xm5
; source rows 5-8 (r1 was already advanced by 4 * r2 above)
2437 movu m11, [r1 + r2 + 32]
2438 movu m12, [r1 + 2 * r2]
2439 movu m13, [r1 + 2 * r2 + 32]
2441 movu m15, [r1 + r3 + 32]
2443 punpcklwd m5, m7, m10 ;[1 - 4; 9 - 12][5 6]
2444 punpckhwd m7, m10 ;[5 - 8; 13 - 16][5 6]
2446 punpcklwd m10, m12, m14 ;[1 - 4; 9 - 12][7 8]
2447 punpckhwd m12, m14 ;[5 - 8; 13 - 16][7 8]
2449 punpcklwd m14, m9, m11 ;[17 - 20; 25 - 28][5 6]
2450 punpckhwd m9, m11 ;[21 - 24; 29 - 32][5 6]
2452 punpcklwd m11, m13, m15 ;[17 - 20; 25 - 28][7 8]
2453 punpckhwd m13, m15 ;[21 - 24; 29 - 32][7 8]
2455 punpckldq m15, m5, m10 ;[1 - 2; 9 - 10][5 6 7 8]
2456 punpckhdq m5, m10 ;[3 - 4; 11 - 12][5 6 7 8]
2458 punpckldq m10, m7, m12 ;[5 - 6; 13 - 14][5 6 7 8]
2459 punpckhdq m7, m12 ;[7 - 8; 15 - 16][5 6 7 8]
2461 punpckldq m12, m14, m11 ;[17 - 18; 25 - 26][5 6 7 8]
2462 punpckhdq m14, m11 ;[19 - 20; 27 - 28][5 6 7 8]
2464 punpckldq m11, m9, m13 ;[21 - 22; 29 - 30][5 6 7 8]
2465 punpckhdq m9, m13 ;[23 - 24; 31 - 32][5 6 7 8]
; second 8 bytes (+8) of dst rows 0/1 and 8/9, completing their 16-byte segment
2467 movq [r0 + 0 * 128 + 8], xm15
2468 movhps [r0 + 1 * 128 + 8], xm15
2469 vextracti128 xm13, m15, 1
2470 movq [r0 + 8 * 128 + 8], xm13
2471 movhps [r0 + 9 * 128 + 8], xm13
; qword interleave joins the rows 1-4 and rows 5-8 halves into full 16-byte segments
2473 punpcklqdq m13, m8, m5 ;[3 ; 11][1 2 3 4 5 6 7 8]
2474 punpckhqdq m8, m5 ;[4 ; 12][1 2 3 4 5 6 7 8]
2476 punpcklqdq m5, m2, m10 ;[5 ; 13][1 2 3 4 5 6 7 8]
2477 punpckhqdq m2, m10 ;[6 ; 14][1 2 3 4 5 6 7 8]
2479 punpcklqdq m10, m0, m7 ;[7 ; 15][1 2 3 4 5 6 7 8]
2480 punpckhqdq m0, m7 ;[8 ; 16][1 2 3 4 5 6 7 8]
2482 punpcklqdq m7, m4, m12 ;[17 ; 25][1 2 3 4 5 6 7 8]
2483 punpckhqdq m4, m12 ;[18 ; 26][1 2 3 4 5 6 7 8]
2485 punpcklqdq m12, m6, m14 ;[19 ; 27][1 2 3 4 5 6 7 8]
2486 punpckhqdq m6, m14 ;[20 ; 28][1 2 3 4 5 6 7 8]
2488 punpcklqdq m14, m3, m11 ;[21 ; 29][1 2 3 4 5 6 7 8]
2489 punpckhqdq m3, m11 ;[22 ; 30][1 2 3 4 5 6 7 8]
2491 punpcklqdq m11, m1, m9 ;[23 ; 31][1 2 3 4 5 6 7 8]
2492 punpckhqdq m1, m9 ;[24 ; 32][1 2 3 4 5 6 7 8]
; dst rows 2-7 take the low 128-bit lane, rows 10-15 the high lane
2494 movu [r0 + 2 * 128], xm13
2495 vextracti128 [r0 + 10 * 128], m13, 1
2497 movu [r0 + 3 * 128], xm8
2498 vextracti128 [r0 + 11 * 128], m8, 1
2500 movu [r0 + 4 * 128], xm5
2501 vextracti128 [r0 + 12 * 128], m5, 1
2503 movu [r0 + 5 * 128], xm2
2504 vextracti128 [r0 + 13 * 128], m2, 1
2506 movu [r0 + 6 * 128], xm10
2507 vextracti128 [r0 + 14 * 128], m10, 1
2509 movu [r0 + 7 * 128], xm0
2510 vextracti128 [r0 + 15 * 128], m0, 1
; dst rows 16-23 take the low lane, rows 24-31 the high lane
2512 movu [r0 + 16 * 128], xm7
2513 vextracti128 [r0 + 24 * 128], m7, 1
2515 movu [r0 + 17 * 128], xm4
2516 vextracti128 [r0 + 25 * 128], m4, 1
2518 movu [r0 + 18 * 128], xm12
2519 vextracti128 [r0 + 26 * 128], m12, 1
2521 movu [r0 + 19 * 128], xm6
2522 vextracti128 [r0 + 27 * 128], m6, 1
2524 movu [r0 + 20 * 128], xm14
2525 vextracti128 [r0 + 28 * 128], m14, 1
2527 movu [r0 + 21 * 128], xm3
2528 vextracti128 [r0 + 29 * 128], m3, 1
2530 movu [r0 + 22 * 128], xm11
2531 vextracti128 [r0 + 30 * 128], m11, 1
2533 movu [r0 + 23 * 128], xm1
2534 vextracti128 [r0 + 31 * 128], m1, 1
; transpose64 (HIGH_BIT_DEPTH, x86-64 branch): driver that issues sixteen
; calls to transpose8x32_64_internal, arranged as eight pairs.
;   - between the two calls of a pair, r0 is advanced by 32 * 128 bytes
;   - between pairs, r4 is set to r1 + 4 * r2
; Per the header comment for this section the arguments are
; (pixel *dst, pixel *src, intptr_t stride); the "3, 6, 16" operands are
; presumably the x86inc arg / GPR / vector-register counts -- TODO confirm.
; NOTE(review): the instructions that set r1/r0 for each call from r4 (or
; other saved values), and the final return, are not visible in this listing.
2537 cglobal transpose64, 3, 6, 16
; pair 1
2543 call transpose8x32_64_internal
2545 lea r0, [r0 + 32 * 128]
2546 call transpose8x32_64_internal
; pair 2
2549 lea r4, [r1 + 4 * r2]
2551 call transpose8x32_64_internal
2553 lea r0, [r0 + 32 * 128]
2554 call transpose8x32_64_internal
; pair 3
2557 lea r4, [r1 + 4 * r2]
2559 call transpose8x32_64_internal
2561 lea r0, [r0 + 32 * 128]
2562 call transpose8x32_64_internal
; pair 4
2565 lea r4, [r1 + 4 * r2]
2567 call transpose8x32_64_internal
2569 lea r0, [r0 + 32 * 128]
2570 call transpose8x32_64_internal
; pair 5
2573 lea r4, [r1 + 4 * r2]
2575 call transpose8x32_64_internal
2577 lea r0, [r0 + 32 * 128]
2578 call transpose8x32_64_internal
; pair 6
2581 lea r4, [r1 + 4 * r2]
2583 call transpose8x32_64_internal
2585 lea r0, [r0 + 32 * 128]
2586 call transpose8x32_64_internal
; pair 7
2589 lea r4, [r1 + 4 * r2]
2591 call transpose8x32_64_internal
2593 lea r0, [r0 + 32 * 128]
2594 call transpose8x32_64_internal
; pair 8
2596 lea r4, [r1 + 4 * r2]
2598 call transpose8x32_64_internal
2600 lea r0, [r0 + 32 * 128]
2601 call transpose8x32_64_internal
; transpose64 (HIGH_BIT_DEPTH branch, built from transpose8_internal):
; 64 calls to transpose8_internal, in eight groups of eight.
;   - groups 2..8 start by pointing r0 at r6 + k * 128 (k = 8, 16, ..., 56)
;   - within those groups, r0 moves on by 16 bytes per call
;     (+16, +32, ..., +112)
;   - between calls of a group, r1 is re-based with r1 = r1 - 8 + 2 * r2
; NOTE(review): r6 presumably holds the original dest pointer and r1 is
; presumably re-seeded from src at the start of each group; neither
; instruction, nor any r0 update inside the first group, nor the return, is
; visible in this listing. The meaning of the "- 8 + 2 * r2" step depends on
; how transpose8_internal leaves r1, which is also not visible here.
2605 cglobal transpose64, 3, 7, 4, dest, src, stride
; group 1: no explicit r0 offset is visible
2611 call transpose8_internal
2612 lea r1, [r1 - 8 + 2 * r2]
2615 call transpose8_internal
2616 lea r1, [r1 - 8 + 2 * r2]
2619 call transpose8_internal
2620 lea r1, [r1 - 8 + 2 * r2]
2623 call transpose8_internal
2624 lea r1, [r1 - 8 + 2 * r2]
2627 call transpose8_internal
2628 lea r1, [r1 - 8 + 2 * r2]
2631 call transpose8_internal
2632 lea r1, [r1 - 8 + 2 * r2]
2635 call transpose8_internal
2636 lea r1, [r1 - 8 + 2 * r2]
2639 call transpose8_internal
; group 2: destination offset 8 * 128
2642 lea r0, [r6 + 8 * 128]
2644 call transpose8_internal
2645 lea r1, [r1 - 8 + 2 * r2]
2646 lea r0, [r6 + 8 * 128 + 16]
2648 call transpose8_internal
2649 lea r1, [r1 - 8 + 2 * r2]
2650 lea r0, [r6 + 8 * 128 + 32]
2652 call transpose8_internal
2653 lea r1, [r1 - 8 + 2 * r2]
2654 lea r0, [r6 + 8 * 128 + 48]
2656 call transpose8_internal
2657 lea r1, [r1 - 8 + 2 * r2]
2658 lea r0, [r6 + 8 * 128 + 64]
2660 call transpose8_internal
2661 lea r1, [r1 - 8 + 2 * r2]
2662 lea r0, [r6 + 8 * 128 + 80]
2664 call transpose8_internal
2665 lea r1, [r1 - 8 + 2 * r2]
2666 lea r0, [r6 + 8 * 128 + 96]
2668 call transpose8_internal
2669 lea r1, [r1 - 8 + 2 * r2]
2670 lea r0, [r6 + 8 * 128 + 112]
2672 call transpose8_internal
; group 3: destination offset 16 * 128
2675 lea r0, [r6 + 16 * 128]
2677 call transpose8_internal
2678 lea r1, [r1 - 8 + 2 * r2]
2679 lea r0, [r6 + 16 * 128 + 16]
2681 call transpose8_internal
2682 lea r1, [r1 - 8 + 2 * r2]
2683 lea r0, [r6 + 16 * 128 + 32]
2685 call transpose8_internal
2686 lea r1, [r1 - 8 + 2 * r2]
2687 lea r0, [r6 + 16 * 128 + 48]
2689 call transpose8_internal
2690 lea r1, [r1 - 8 + 2 * r2]
2691 lea r0, [r6 + 16 * 128 + 64]
2693 call transpose8_internal
2694 lea r1, [r1 - 8 + 2 * r2]
2695 lea r0, [r6 + 16 * 128 + 80]
2697 call transpose8_internal
2698 lea r1, [r1 - 8 + 2 * r2]
2699 lea r0, [r6 + 16 * 128 + 96]
2701 call transpose8_internal
2702 lea r1, [r1 - 8 + 2 * r2]
2703 lea r0, [r6 + 16 * 128 + 112]
2705 call transpose8_internal
; group 4: destination offset 24 * 128
2708 lea r0, [r6 + 24 * 128]
2710 call transpose8_internal
2711 lea r1, [r1 - 8 + 2 * r2]
2712 lea r0, [r6 + 24 * 128 + 16]
2714 call transpose8_internal
2715 lea r1, [r1 - 8 + 2 * r2]
2716 lea r0, [r6 + 24 * 128 + 32]
2718 call transpose8_internal
2719 lea r1, [r1 - 8 + 2 * r2]
2720 lea r0, [r6 + 24 * 128 + 48]
2722 call transpose8_internal
2723 lea r1, [r1 - 8 + 2 * r2]
2724 lea r0, [r6 + 24 * 128 + 64]
2726 call transpose8_internal
2727 lea r1, [r1 - 8 + 2 * r2]
2728 lea r0, [r6 + 24 * 128 + 80]
2730 call transpose8_internal
2731 lea r1, [r1 - 8 + 2 * r2]
2732 lea r0, [r6 + 24 * 128 + 96]
2734 call transpose8_internal
2735 lea r1, [r1 - 8 + 2 * r2]
2736 lea r0, [r6 + 24 * 128 + 112]
2738 call transpose8_internal
; group 5: destination offset 32 * 128
2741 lea r0, [r6 + 32 * 128]
2743 call transpose8_internal
2744 lea r1, [r1 - 8 + 2 * r2]
2745 lea r0, [r6 + 32 * 128 + 16]
2747 call transpose8_internal
2748 lea r1, [r1 - 8 + 2 * r2]
2749 lea r0, [r6 + 32 * 128 + 32]
2751 call transpose8_internal
2752 lea r1, [r1 - 8 + 2 * r2]
2753 lea r0, [r6 + 32 * 128 + 48]
2755 call transpose8_internal
2756 lea r1, [r1 - 8 + 2 * r2]
2757 lea r0, [r6 + 32 * 128 + 64]
2759 call transpose8_internal
2760 lea r1, [r1 - 8 + 2 * r2]
2761 lea r0, [r6 + 32 * 128 + 80]
2763 call transpose8_internal
2764 lea r1, [r1 - 8 + 2 * r2]
2765 lea r0, [r6 + 32 * 128 + 96]
2767 call transpose8_internal
2768 lea r1, [r1 - 8 + 2 * r2]
2769 lea r0, [r6 + 32 * 128 + 112]
2771 call transpose8_internal
; group 6: destination offset 40 * 128
2774 lea r0, [r6 + 40 * 128]
2776 call transpose8_internal
2777 lea r1, [r1 - 8 + 2 * r2]
2778 lea r0, [r6 + 40 * 128 + 16]
2780 call transpose8_internal
2781 lea r1, [r1 - 8 + 2 * r2]
2782 lea r0, [r6 + 40 * 128 + 32]
2784 call transpose8_internal
2785 lea r1, [r1 - 8 + 2 * r2]
2786 lea r0, [r6 + 40 * 128 + 48]
2788 call transpose8_internal
2789 lea r1, [r1 - 8 + 2 * r2]
2790 lea r0, [r6 + 40 * 128 + 64]
2792 call transpose8_internal
2793 lea r1, [r1 - 8 + 2 * r2]
2794 lea r0, [r6 + 40 * 128 + 80]
2796 call transpose8_internal
2797 lea r1, [r1 - 8 + 2 * r2]
2798 lea r0, [r6 + 40 * 128 + 96]
2800 call transpose8_internal
2801 lea r1, [r1 - 8 + 2 * r2]
2802 lea r0, [r6 + 40 * 128 + 112]
2804 call transpose8_internal
; group 7: destination offset 48 * 128
2807 lea r0, [r6 + 48 * 128]
2809 call transpose8_internal
2810 lea r1, [r1 - 8 + 2 * r2]
2811 lea r0, [r6 + 48 * 128 + 16]
2813 call transpose8_internal
2814 lea r1, [r1 - 8 + 2 * r2]
2815 lea r0, [r6 + 48 * 128 + 32]
2817 call transpose8_internal
2818 lea r1, [r1 - 8 + 2 * r2]
2819 lea r0, [r6 + 48 * 128 + 48]
2821 call transpose8_internal
2822 lea r1, [r1 - 8 + 2 * r2]
2823 lea r0, [r6 + 48 * 128 + 64]
2825 call transpose8_internal
2826 lea r1, [r1 - 8 + 2 * r2]
2827 lea r0, [r6 + 48 * 128 + 80]
2829 call transpose8_internal
2830 lea r1, [r1 - 8 + 2 * r2]
2831 lea r0, [r6 + 48 * 128 + 96]
2833 call transpose8_internal
2834 lea r1, [r1 - 8 + 2 * r2]
2835 lea r0, [r6 + 48 * 128 + 112]
2837 call transpose8_internal
; group 8: destination offset 56 * 128
2840 lea r0, [r6 + 56 * 128]
2842 call transpose8_internal
2843 lea r1, [r1 - 8 + 2 * r2]
2844 lea r0, [r6 + 56 * 128 + 16]
2846 call transpose8_internal
2847 lea r1, [r1 - 8 + 2 * r2]
2848 lea r0, [r6 + 56 * 128 + 32]
2850 call transpose8_internal
2851 lea r1, [r1 - 8 + 2 * r2]
2852 lea r0, [r6 + 56 * 128 + 48]
2854 call transpose8_internal
2855 lea r1, [r1 - 8 + 2 * r2]
2856 lea r0, [r6 + 56 * 128 + 64]
2858 call transpose8_internal
2859 lea r1, [r1 - 8 + 2 * r2]
2860 lea r0, [r6 + 56 * 128 + 80]
2862 call transpose8_internal
2863 lea r1, [r1 - 8 + 2 * r2]
2864 lea r0, [r6 + 56 * 128 + 96]
2866 call transpose8_internal
2867 lea r1, [r1 - 8 + 2 * r2]
2868 lea r0, [r6 + 56 * 128 + 112]
2870 call transpose8_internal
2872 %else ;HIGH_BIT_DEPTH == 0
2873 %if ARCH_X86_64 == 1
; transpose16x32_avx2 -- helper reached via `call` from the AVX2 transpose64
; driver of the 8-bit (HIGH_BIT_DEPTH == 0) branch.
;
; Going by the [dst rows][src rows] lane annotations below, one invocation
; interleaves sixteen source rows at byte granularity (bw -> wd -> dq -> qdq)
; and writes 32 destination rows of 16 bytes each through r0, using a
; 64-byte destination pitch.
;   r0 : destination base for this tile
;   r1 : source row pointer; advanced by 4 * r2 between groups of rows
;   r2 : source stride as used by the addressing below
; NOTE(review): most row loads (only m2, m6, m11 and m15 are visibly loaded),
; the reload of m6 before its use as source row 16, and the return are not
; visible in this listing.
2876 cglobal transpose16x32_avx2
; source rows 1-8 (visible loads: row 3 into m2, row 7 into m6)
2879 movu m2, [r1 + 2 * r2]
2881 lea r1, [r1 + 4 * r2]
2885 movu m6, [r1 + 2 * r2]
2888 punpcklbw m8, m0, m1 ;[1 - 8 ; 17 - 24][1 2]
2889 punpckhbw m0, m1 ;[9 - 16; 25 - 32][1 2]
2891 punpcklbw m1, m2, m3 ;[1 - 8 ; 17 - 24][3 4]
2892 punpckhbw m2, m3 ;[9 - 16; 25 - 32][3 4]
2894 punpcklbw m3, m4, m5 ;[1 - 8 ; 17 - 24][5 6]
2895 punpckhbw m4, m5 ;[9 - 16; 25 - 32][5 6]
2897 punpcklbw m5, m6, m7 ;[1 - 8 ; 17 - 24][7 8]
2898 punpckhbw m6, m7 ;[9 - 16; 25 - 32][7 8]
2900 punpcklwd m7, m8, m1 ;[1 - 4 ; 17 - 20][1 2 3 4]
2901 punpckhwd m8, m1 ;[5 - 8 ; 21 - 24][1 2 3 4]
2903 punpcklwd m1, m3, m5 ;[1 - 4 ; 17 - 20][5 6 7 8]
2904 punpckhwd m3, m5 ;[5 - 8 ; 21 - 24][5 6 7 8]
2906 punpcklwd m5, m0, m2 ;[9 - 12; 25 - 28][1 2 3 4]
2907 punpckhwd m0, m2 ;[13- 16; 29 - 32][1 2 3 4]
2909 punpcklwd m2, m4, m6 ;[9 - 12; 25 - 28][5 6 7 8]
2910 punpckhwd m4, m6 ;[13- 16; 29 - 32][5 6 7 8]
2912 punpckldq m6, m7, m1 ;[1 - 2 ; 17 - 18][1 2 3 4 5 6 7 8]
2913 punpckhdq m7, m1 ;[3 - 4 ; 19 - 20][1 2 3 4 5 6 7 8]
2915 punpckldq m1, m8, m3 ;[5 - 6 ; 21 - 22][1 2 3 4 5 6 7 8]
2916 punpckhdq m8, m3 ;[7 - 8 ; 23 - 24][1 2 3 4 5 6 7 8]
2918 punpckldq m3, m5, m2 ;[9 - 10; 25 - 26][1 2 3 4 5 6 7 8]
2919 punpckhdq m5, m2 ;[11- 12; 27 - 28][1 2 3 4 5 6 7 8]
2921 punpckldq m2, m0, m4 ;[13- 14; 29 - 30][1 2 3 4 5 6 7 8]
2922 punpckhdq m0, m4 ;[15- 16; 31 - 32][1 2 3 4 5 6 7 8]
; m6 is flushed early: first 8 bytes of dst rows 0/1 (low lane) and 16/17 (high lane)
2924 movq [r0 + 0 * 64], xm6
2925 movhps [r0 + 1 * 64], xm6
2926 vextracti128 xm4, m6, 1
2927 movq [r0 + 16 * 64], xm4
2928 movhps [r0 + 17 * 64], xm4
; source rows 9-16 (visible loads: row 11 into m11, row 15 into m15)
2930 lea r1, [r1 + 4 * r2]
2933 movu m11, [r1 + 2 * r2]
2935 lea r1, [r1 + 4 * r2]
2939 movu m15, [r1 + 2 * r2]
2942 punpcklbw m4, m9, m10 ;[1 - 8 ; 17 - 24][9 10]
2943 punpckhbw m9, m10 ;[9 - 16; 25 - 32][9 10]
2945 punpcklbw m10, m11, m12 ;[1 - 8 ; 17 - 24][11 12]
2946 punpckhbw m11, m12 ;[9 - 16; 25 - 32][11 12]
2948 punpcklbw m12, m13, m14 ;[1 - 8 ; 17 - 24][13 14]
2949 punpckhbw m13, m14 ;[9 - 16; 25 - 32][13 14]
2951 punpcklbw m14, m15, m6 ;[1 - 8 ; 17 - 24][15 16]
2952 punpckhbw m15, m6 ;[9 - 16; 25 - 32][15 16]
2954 punpcklwd m6, m4, m10 ;[1 - 4 ; 17 - 20][9 10 11 12]
2955 punpckhwd m4, m10 ;[5 - 8 ; 21 - 24][9 10 11 12]
2957 punpcklwd m10, m12, m14 ;[1 - 4 ; 17 - 20][13 14 15 16]
2958 punpckhwd m12, m14 ;[5 - 8 ; 21 - 24][13 14 15 16]
2960 punpcklwd m14, m9, m11 ;[9 - 12; 25 - 28][9 10 11 12]
2961 punpckhwd m9, m11 ;[13- 16; 29 - 32][9 10 11 12]
2963 punpcklwd m11, m13, m15 ;[9 - 12; 25 - 28][13 14 15 16]
2964 punpckhwd m13, m15 ;[13- 16; 29 - 32][13 14 15 16]
2966 punpckldq m15, m6, m10 ;[1 - 2 ; 17 - 18][9 10 11 12 13 14 15 16]
2967 punpckhdq m6, m10 ;[3 - 4 ; 19 - 20][9 10 11 12 13 14 15 16]
2969 punpckldq m10, m4, m12 ;[5 - 6 ; 21 - 22][9 10 11 12 13 14 15 16]
2970 punpckhdq m4, m12 ;[7 - 8 ; 23 - 24][9 10 11 12 13 14 15 16]
2972 punpckldq m12, m14, m11 ;[9 - 10; 25 - 26][9 10 11 12 13 14 15 16]
2973 punpckhdq m14, m11 ;[11- 12; 27 - 28][9 10 11 12 13 14 15 16]
2975 punpckldq m11, m9, m13 ;[13- 14; 29 - 30][9 10 11 12 13 14 15 16]
2976 punpckhdq m9, m13 ;[15- 16; 31 - 32][9 10 11 12 13 14 15 16]
; qword interleave joins the rows 1-8 and rows 9-16 halves into full 16-byte dst rows
2979 punpcklqdq m13, m7, m6 ;[3 ; 19][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2980 punpckhqdq m7, m6 ;[4 ; 20][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2982 punpcklqdq m6, m1, m10 ;[5 ; 21][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2983 punpckhqdq m1, m10 ;[6 ; 22][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2985 punpcklqdq m10, m8, m4 ;[7 ; 23][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2986 punpckhqdq m8, m4 ;[8 ; 24][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2988 punpcklqdq m4, m3, m12 ;[9 ; 25][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2989 punpckhqdq m3, m12 ;[10; 26][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2991 punpcklqdq m12, m5, m14 ;[11; 27][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2992 punpckhqdq m5, m14 ;[12; 28][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2994 punpcklqdq m14, m2, m11 ;[13; 29][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2995 punpckhqdq m2, m11 ;[14; 30][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2997 punpcklqdq m11, m0, m9 ;[15; 31][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2998 punpckhqdq m0, m9 ;[16; 32][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
; second 8 bytes (+8) of dst rows 0/1 and 16/17
3000 movq [r0 + 0 * 64 + 8], xm15
3001 movhps [r0 + 1 * 64 + 8], xm15
3002 vextracti128 xm9, m15, 1
3003 movq [r0 + 16 * 64 + 8], xm9
3004 movhps [r0 + 17 * 64 + 8], xm9
; dst rows 2-15 take the low 128-bit lane, rows 18-31 the high lane
3006 movu [r0 + 2 * 64], xm13
3007 vextracti128 [r0 + 18 * 64], m13, 1
3009 movu [r0 + 3 * 64], xm7
3010 vextracti128 [r0 + 19 * 64], m7, 1
3012 movu [r0 + 4 * 64], xm6
3013 vextracti128 [r0 + 20 * 64], m6, 1
3015 movu [r0 + 5 * 64], xm1
3016 vextracti128 [r0 + 21 * 64], m1, 1
3018 movu [r0 + 6 * 64], xm10
3019 vextracti128 [r0 + 22 * 64], m10, 1
3021 movu [r0 + 7 * 64], xm8
3022 vextracti128 [r0 + 23 * 64], m8, 1
3024 movu [r0 + 8 * 64], xm4
3025 vextracti128 [r0 + 24 * 64], m4, 1
3027 movu [r0 + 9 * 64], xm3
3028 vextracti128 [r0 + 25 * 64], m3, 1
3030 movu [r0 + 10 * 64], xm12
3031 vextracti128 [r0 + 26 * 64], m12, 1
3033 movu [r0 + 11 * 64], xm5
3034 vextracti128 [r0 + 27 * 64], m5, 1
3036 movu [r0 + 12 * 64], xm14
3037 vextracti128 [r0 + 28 * 64], m14, 1
3039 movu [r0 + 13 * 64], xm2
3040 vextracti128 [r0 + 29 * 64], m2, 1
3042 movu [r0 + 14 * 64], xm11
3043 vextracti128 [r0 + 30 * 64], m11, 1
3045 movu [r0 + 15 * 64], xm0
3046 vextracti128 [r0 + 31 * 64], m0, 1
; transpose64 (8-bit branch, AVX2): driver that issues eight calls to
; transpose16x32_avx2, arranged as four pairs.
;   - between the two calls of a pair, r0 is advanced by 32 * 64 bytes
;   - between pairs, r5 is set to r1 + 4 * r2
; NOTE(review): the instructions that derive r0/r1 for each call from r5 (or
; other saved values), and the return, are not visible in this listing.
3049 cglobal transpose64, 3, 6, 16
; pair 1
3055 call transpose16x32_avx2
3056 lea r0, [r0 + 32 * 64]
3058 call transpose16x32_avx2
; pair 2
3061 lea r5, [r1 + 4 * r2]
3064 call transpose16x32_avx2
3065 lea r0, [r0 + 32 * 64]
3067 call transpose16x32_avx2
; pair 3
3070 lea r5, [r1 + 4 * r2]
3073 call transpose16x32_avx2
3074 lea r0, [r0 + 32 * 64]
3076 call transpose16x32_avx2
; pair 4
3078 lea r5, [r1 + 4 * r2]
3082 call transpose16x32_avx2
3083 lea r0, [r0 + 32 * 64]
3085 call transpose16x32_avx2
; transpose64 (8-bit branch, built from transpose16_internal): sixteen calls
; to transpose16_internal, in four groups of four.
;   - groups 2..4 start by pointing r0 at r3 + k * 64 (k = 16, 32, 48)
;   - within those groups, r0 moves on by 16 bytes per call (+16, +32, +48)
;   - between calls of a group, r1 is re-based with r1 = r1 - 8 + 2 * r2
; NOTE(review): r3 presumably holds the original dest pointer and r1 is
; presumably re-seeded from src at the start of each group; those
; instructions, any r0 update inside the first group, and the return are not
; visible in this listing.
3090 cglobal transpose64, 3, 7, 8, dest, src, stride
; group 1: no explicit r0 offset is visible
3095 call transpose16_internal
3096 lea r1, [r1 - 8 + 2 * r2]
3099 call transpose16_internal
3100 lea r1, [r1 - 8 + 2 * r2]
3103 call transpose16_internal
3104 lea r1, [r1 - 8 + 2 * r2]
3107 call transpose16_internal
; group 2: destination offset 16 * 64
3110 lea r0, [r3 + 16 * 64]
3112 call transpose16_internal
3113 lea r1, [r1 - 8 + 2 * r2]
3114 lea r0, [r3 + 16 * 64 + 16]
3116 call transpose16_internal
3117 lea r1, [r1 - 8 + 2 * r2]
3118 lea r0, [r3 + 16 * 64 + 32]
3120 call transpose16_internal
3121 lea r1, [r1 - 8 + 2 * r2]
3122 lea r0, [r3 + 16 * 64 + 48]
3124 call transpose16_internal
; group 3: destination offset 32 * 64
3127 lea r0, [r3 + 32 * 64]
3129 call transpose16_internal
3130 lea r1, [r1 - 8 + 2 * r2]
3131 lea r0, [r3 + 32 * 64 + 16]
3133 call transpose16_internal
3134 lea r1, [r1 - 8 + 2 * r2]
3135 lea r0, [r3 + 32 * 64 + 32]
3137 call transpose16_internal
3138 lea r1, [r1 - 8 + 2 * r2]
3139 lea r0, [r3 + 32 * 64 + 48]
3141 call transpose16_internal
; group 4: destination offset 48 * 64
3144 lea r0, [r3 + 48 * 64]
3146 call transpose16_internal
3147 lea r1, [r1 - 8 + 2 * r2]
3148 lea r0, [r3 + 48 * 64 + 16]
3150 call transpose16_internal
3151 lea r1, [r1 - 8 + 2 * r2]
3152 lea r0, [r3 + 48 * 64 + 32]
3154 call transpose16_internal
3155 lea r1, [r1 - 8 + 2 * r2]
3156 lea r0, [r3 + 48 * 64 + 48]
3158 call transpose16_internal
3163 ;=============================================================================
3165 ;=============================================================================
3167 ;-----------------------------------------------------------------------------
3168 ; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
3169 ; const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
3170 ;-----------------------------------------------------------------------------
; The next lines reference %1, so they belong to a macro whose %macro line
; is not visible in this listing (presumably a per-row step, with %1 the
; row index -- TODO confirm).
; (%1&1) selects the row offset: 0 for an even %1, one stride (r1 / r3) for an
; odd %1. r0/r1 and r2/r3 presumably map to pix1/stride1 and pix2/stride2.
; Two load widths are visible (movdqu: 16 bytes, movq: 8 bytes); presumably
; they are alternatives chosen by a conditional that is not visible here.
3173 movdqu m5, [r0+(%1&1)*r1]
3174 movdqu m6, [r2+(%1&1)*r3]
3176 movq m5, [r0+(%1&1)*r1]
3177 movq m6, [r2+(%1&1)*r3]
; ACCUM is defined elsewhere; presumably it folds m5 into m3 and m7 into m4
; with paddd, treating the first row specially -- TODO confirm
3195 ACCUM paddd, 3, 5, %1
3196 ACCUM paddd, 4, 7, %1
; Entry point: 4 arguments loaded into r0-r3 (the fifth, sums, is presumably
; fetched separately), 4 GPRs, 8 vector registers -- per the usual x86inc
; operand order, TODO confirm.
3201 cglobal pixel_ssim_4x4x2_core, 4,4,8
; visible part of the final shuffle of the accumulators before they are stored;
; the accumulation loop, the stores and the return are not visible here
3211 pshufd m5, m3, q2301
3214 pshufd m6, m4, q2301
3217 pshufd m1, m1, q3120
3220 punpckhdq m5, m3, m4
3236 ;-----------------------------------------------------------------------------
3237 ; float pixel_ssim_end4( int sum0[5][4], int sum1[5][4], int width )
3238 ;-----------------------------------------------------------------------------
; Combines per-block sums into SSIM terms for up to four blocks at once.
; NOTE(review): the loads/adds of sum0/sum1, the conditionals separating the
; float and integer variants below, the division and the return are not
; visible in this listing.
3239 cglobal pixel_ssim_end4, 2,3
; after the transpose each register holds one statistic for four blocks
3255 TRANSPOSE4x4D 0, 1, 2, 3, 4
3257 ; s1=m0, s2=m1, ss=m2, s12=m3
; --- float variant (uses pf_64 / pf_128 and loads ssim_c1 / ssim_c2 as floats);
; presumably selected for the bit depth whose constants are floats -- TODO confirm
3263 mulps m4, m0, m1 ; s1*s2
3264 mulps m0, m0 ; s1*s1
3265 mulps m1, m1 ; s2*s2
3266 mulps m2, [pf_64] ; ss*64
3267 mulps m3, [pf_128] ; s12*128
3268 addps m4, m4 ; s1*s2*2
3269 addps m0, m1 ; s1*s1 + s2*s2
3271 subps m3, m4 ; covar*2
3272 movaps m1, [ssim_c1]
3273 addps m4, m1 ; s1*s2*2 + ssim_c1
3274 addps m0, m1 ; s1*s1 + s2*s2 + ssim_c1
3275 movaps m1, [ssim_c2]
3276 addps m2, m1 ; vars + ssim_c2
3277 addps m3, m1 ; covar*2 + ssim_c2
; --- integer variant: products via pmaddwd, converted to float afterwards
3279 pmaddwd m4, m1, m0 ; s1*s2
3282 pmaddwd m0, m0 ; s1*s1 + s2*s2
3286 psubd m3, m4 ; covar*2
3294 cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
3295 cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1)
3296 cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2)
3297 cvtdq2ps m2, m2 ; (float)(vars + ssim_c2)
; --- partial-width handling: .skip bypasses the masking below (the compare
; that feeds this branch is not visible; presumably width == 4)
3304 je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
; the mask is read at mask_ff + 16 + r2*4; two ways of forming that address are
; visible (via r3, or as a direct symbol), presumably PIC vs non-PIC builds
3308 lea r3, [mask_ff + 16]
3311 %xdefine %%mask mask_ff + 16
3314 andps m4, [%%mask + r2*4]
3316 movups m0, [%%mask + r2*4]
; part of the horizontal reduction of the four results into one value
3326 pshuflw m4, m0, q0032
; 32-bit only tail; presumably moves the result to where the 32-bit ABI expects
; a float return -- the body is not visible
3329 %if ARCH_X86_64 == 0
3341 ;-----------------------------------------------------------------
3342 ; void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/)
3343 ;-----------------------------------------------------------------
; Only two arguments are loaded (dest, src1); the third name, stride, is
; declared but not loaded, matching the commented-out parameter above.
; NOTE(review): the averaging body and the return are not visible here.
3345 cglobal scale1D_128to64, 2, 2, 8, dest, src1, stride
; m7 holds the de-interleave shuffle. Two loads are visible: the word
; table (even words first, then odd words) and the byte table (even bytes
; first, then odd bytes); presumably one per bit-depth branch of a
; conditional that is not visible here.
3347 mova m7, [deinterleave_word_shuf]
3438 mova m7, [deinterleave_shuf]
; Second pair of scale1D_128to64 entry points, one per bit depth
; (presumably built for a different instruction set than the variant
; declared earlier -- the INIT line is not visible here).
3494 %if HIGH_BIT_DEPTH == 1
; high-bit-depth variant: 2 arguments, 2 GPRs, 3 vector registers; body not visible
3496 cglobal scale1D_128to64, 2, 2, 3
3527 %else ; HIGH_BIT_DEPTH == 0
; 8-bit variant: 2 arguments, 2 GPRs, 4 vector registers
3529 cglobal scale1D_128to64, 2, 2, 4
; pmaddubsw multiplies unsigned bytes of the first source by signed bytes of
; m3 and adds adjacent products into words. m3 is presumably all ones (e.g.
; hmul_16p), making this a sum of horizontal byte pairs -- TODO confirm;
; its load is not visible here.
3534 pmaddubsw m0, m0, m3
3537 pmaddubsw m1, m1, m3
3544 pmaddubsw m0, m0, m3
3547 pmaddubsw m1, m1, m3
3555 ;-----------------------------------------------------------------
3556 ; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
3557 ;-----------------------------------------------------------------
; Word (16-bit sample) variant. Each step below handles one 16-byte column
; block of two vertically adjacent source rows ([r1 + off] = i and
; [r1 + r2 + off] = k in the inline comments), at byte offsets 0..112.
; Per the inline comments, s and t are intermediate averages; pavgw forms
; (s+t+1)/2 and a one-bit correction term is then subtracted.
; NOTE(review): the instructions that build s, t and the (ij|kl)&st term, the
; shuffles using m7, the stores, the loop control and the return are not
; visible in this listing.
3560 cglobal scale2D_64to32, 3, 4, 8, dest, src, stride
3562 mova m7, [deinterleave_word_shuf]
; byte offset 0
3567 movu m2, [r1 + r2] ;k
3577 pavgw m0, m2 ;(s+t+1)/2
3579 pand m4, m5 ;(ij|kl)&st
; hmulw_16p is eight words of 1, so this keeps only bit 0 of each word
3580 pand m4, [hmulw_16p]
3581 psubw m0, m4 ;Result
; byte offset 16
3582 movu m1, [r1 + 16] ;i
3584 movu m3, [r1 + r2 + 16] ;k
3594 pavgw m1, m3 ;(s+t+1)/2
3596 pand m5, m6 ;(ij|kl)&st
3597 pand m5, [hmulw_16p]
3598 psubw m1, m5 ;Result
; byte offset 32
3605 movu m0, [r1 + 32] ;i
3607 movu m2, [r1 + r2 + 32] ;k
3617 pavgw m0, m2 ;(s+t+1)/2
3619 pand m4, m5 ;(ij|kl)&st
3620 pand m4, [hmulw_16p]
3621 psubw m0, m4 ;Result
; byte offset 48
3622 movu m1, [r1 + 48] ;i
3624 movu m3, [r1 + r2 + 48] ;k
3634 pavgw m1, m3 ;(s+t+1)/2
3636 pand m5, m6 ;(ij|kl)&st
3637 pand m5, [hmulw_16p]
3638 psubw m1, m5 ;Result
; byte offset 64
3645 movu m0, [r1 + 64] ;i
3647 movu m2, [r1 + r2 + 64] ;k
3657 pavgw m0, m2 ;(s+t+1)/2
3659 pand m4, m5 ;(ij|kl)&st
3660 pand m4, [hmulw_16p]
3661 psubw m0, m4 ;Result
; byte offset 80
3662 movu m1, [r1 + 80] ;i
3664 movu m3, [r1 + r2 + 80] ;k
3674 pavgw m1, m3 ;(s+t+1)/2
3676 pand m5, m6 ;(ij|kl)&st
3677 pand m5, [hmulw_16p]
3678 psubw m1, m5 ;Result
; byte offset 96
3685 movu m0, [r1 + 96] ;i
3687 movu m2, [r1 + r2 + 96] ;k
3697 pavgw m0, m2 ;(s+t+1)/2
3699 pand m4, m5 ;(ij|kl)&st
3700 pand m4, [hmulw_16p]
3701 psubw m0, m4 ;Result
; byte offset 112
3702 movu m1, [r1 + 112] ;i
3704 movu m3, [r1 + r2 + 112] ;k
3714 pavgw m1, m3 ;(s+t+1)/2
3716 pand m5, m6 ;(ij|kl)&st
3717 pand m5, [hmulw_16p]
3718 psubw m1, m5 ;Result
; move the source pointer on by 2 * r2: two source rows are consumed per pass
3725 lea r1, [r1 + 2 * r2]
; scale2D_64to32, byte (8-bit sample) variant: same scheme as the word
; variant but with pavgb / psubb and the byte de-interleave table. Each step
; handles one 16-byte column block of two vertically adjacent rows
; ([r1 + off] = i, [r1 + r2 + off] = k), at byte offsets 0, 16, 32 and 48.
; NOTE(review): the instructions that build s, t and the correction term
; (including the step that presumably reduces it to one bit, as the word
; variant does with hmulw_16p), the stores, the loop control and the return
; are not visible in this listing.
3732 cglobal scale2D_64to32, 3, 4, 8, dest, src, stride
3734 mova m7, [deinterleave_shuf]
; byte offset 0
3739 movu m2, [r1 + r2] ;k
3751 pavgb m0, m2 ;(s+t+1)/2
3753 pand m4, m5 ;(ij|kl)&st
3755 psubb m0, m4 ;Result
; byte offset 16
3757 movu m1, [r1 + 16] ;i
3759 movu m3, [r1 + r2 + 16] ;k
3771 pavgb m1, m3 ;(s+t+1)/2
3773 pand m5, m6 ;(ij|kl)&st
3775 psubb m1, m5 ;Result
; byte offset 32
3783 movu m0, [r1 + 32] ;i
3785 movu m2, [r1 + r2 + 32] ;k
3797 pavgb m0, m2 ;(s+t+1)/2
3799 pand m4, m5 ;(ij|kl)&st
3801 psubb m0, m4 ;Result
; byte offset 48
3803 movu m1, [r1 + 48] ;i
3805 movu m3, [r1 + r2 + 48] ;k
3817 pavgb m1, m3 ;(s+t+1)/2
3819 pand m5, m6 ;(ij|kl)&st
3821 psubb m1, m5 ;Result
; move the source pointer on by 2 * r2: two source rows are consumed per pass
3830 lea r1, [r1 + 2 * r2]
3837 ;-----------------------------------------------------------------------------
3838 ; void pixel_sub_ps_4x4(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
3839 ;-----------------------------------------------------------------------------
; First of two pixel_sub_ps_4x4 definitions (presumably the alternatives of
; a bit-depth conditional that is not visible here).
; Register roles, taken from the argument order of the cglobal line:
;   r0 = dest, r1 = deststride, r2 = src0, r3 = src1,
;   r4 = srcstride0, r5 = srcstride1
; NOTE(review): the loads, the subtraction, the stores and the return are not
; visible in this listing; only the pointer stepping is.
3842 cglobal pixel_sub_ps_4x4, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1
; step both sources by twice their stride register
3850 lea r2, [r2 + r4 * 2]
3851 lea r3, [r3 + r5 * 2]
; step the destination by twice its stride register
3864 lea r0, [r0 + r1 * 2]
; Second pixel_sub_ps_4x4 definition (presumably the other bit-depth
; alternative; the conditional is not visible here).
; Register roles, taken from the argument order of the cglobal line:
;   r0 = dest, r1 = deststride, r2 = src0, r3 = src1,
;   r4 = srcstride0, r5 = srcstride1
; NOTE(review): the loads, the widening/subtraction, the first store and the
; return are not visible in this listing.
3871 cglobal pixel_sub_ps_4x4, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1
; step both sources by twice their stride register
3877 lea r2, [r2 + r4 * 2]
3878 lea r3, [r3 + r5 * 2]
; each 16-byte result register carries two 8-byte output rows:
; high half of m0 -> dest + r1, low half of m4 -> dest + 2*r1
3896 movhps [r0 + r1], m0
3897 movh [r0 + r1 * 2], m4
3898 lea r0, [r0 + r1 * 2]
; after the step above this lands at the original dest + 3*r1
3899 movhps [r0 + r1], m4
3905 ;-----------------------------------------------------------------------------
3906 ; void pixel_sub_ps_4x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
3907 ;-----------------------------------------------------------------------------
; PIXELSUB_PS_W4_H4 width, height
; Emits pixel_sub_ps_4x<height>. Only %2 is used in the visible lines.
; Two definitions of the same name are visible; presumably they are the
; alternatives of a bit-depth conditional that is not visible here.
; Register roles (from the cglobal argument order):
;   r0 = dest, r1 = deststride, r2 = src0, r3 = src1,
;   r4 = srcstride0, r5 = srcstride1
; NOTE(review): loads, subtraction, loop control, return and %endmacro are not
; visible in this listing.
3908 %macro PIXELSUB_PS_W4_H4 2
; --- first variant
3910 cglobal pixel_sub_ps_4x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
; sources are stepped by 2 * stride twice per pass
3920 lea r2, [r2 + r4 * 2]
3921 lea r3, [r3 + r5 * 2]
3927 lea r2, [r2 + r4 * 2]
3928 lea r3, [r3 + r5 * 2]
; destination is likewise stepped by 2 * deststride twice per pass
3937 movh [r0 + r1 * 2], m4
3938 lea r0, [r0 + r1 * 2]
3940 lea r0, [r0 + r1 * 2]
; --- second variant
3945 cglobal pixel_sub_ps_4x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3953 lea r2, [r2 + r4 * 2]
3954 lea r3, [r3 + r5 * 2]
3960 lea r2, [r2 + r4 * 2]
3961 lea r3, [r3 + r5 * 2]
; each 16-byte result register carries two 8-byte output rows (low / high half)
3975 movhps [r0 + r1], m0
3976 movh [r0 + r1 * 2], m4
3977 lea r0, [r0 + r1 * 2]
3978 movhps [r0 + r1], m4
3979 lea r0, [r0 + r1 * 2]
; instantiations: the same arguments appear twice, presumably once per
; bit-depth / instruction-set selection (the surrounding lines are not visible)
3988 PIXELSUB_PS_W4_H4 4, 8
3991 PIXELSUB_PS_W4_H4 4, 8
3995 ;-----------------------------------------------------------------------------
3996 ; void pixel_sub_ps_8x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
3997 ;-----------------------------------------------------------------------------
; PIXELSUB_PS_W8_H4 width, height
; Emits pixel_sub_ps_8x<height>. Only %2 is used in the visible lines.
; Two definitions of the same name are visible; presumably they are the
; alternatives of a bit-depth conditional that is not visible here.
; Register roles (from the cglobal argument order):
;   r0 = dest, r1 = deststride, r2 = src0, r3 = src1,
;   r4 = srcstride0, r5 = srcstride1
; NOTE(review): loads, subtraction, loop control, return and %endmacro are not
; visible in this listing.
3998 %macro PIXELSUB_PS_W8_H4 2
; --- first variant
4000 cglobal pixel_sub_ps_8x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
; sources are stepped by 2 * stride twice per pass
4010 lea r2, [r2 + r4 * 2]
4011 lea r3, [r3 + r5 * 2]
4017 lea r2, [r2 + r4 * 2]
4018 lea r3, [r3 + r5 * 2]
; one 16-byte output row per store; destination stepped by 2 * deststride twice
4027 movu [r0 + r1 * 2], m4
4028 lea r0, [r0 + r1 * 2]
4030 lea r0, [r0 + r1 * 2]
; --- second variant
4035 cglobal pixel_sub_ps_8x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
4043 lea r2, [r2 + r4 * 2]
4044 lea r3, [r3 + r5 * 2]
4050 lea r2, [r2 + r4 * 2]
4051 lea r3, [r3 + r5 * 2]
4068 movu [r0 + r1 * 2], m4
4069 lea r0, [r0 + r1 * 2]
4071 lea r0, [r0 + r1 * 2]
; instantiations: each (width, height) pair appears twice, presumably once
; per bit-depth / instruction-set selection (surrounding lines not visible)
4080 PIXELSUB_PS_W8_H4 8, 8
4081 PIXELSUB_PS_W8_H4 8, 16
4084 PIXELSUB_PS_W8_H4 8, 8
4085 PIXELSUB_PS_W8_H4 8, 16
4089 ;-----------------------------------------------------------------------------
4090 ; void pixel_sub_ps_16x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
4091 ;-----------------------------------------------------------------------------
; PIXELSUB_PS_W16_H4 width, height
; Emits pixel_sub_ps_16x<height>. Only %2 is used in the visible lines.
; Two definitions of the same name are visible; presumably they are the
; alternatives of a bit-depth conditional that is not visible here. Note
; that the second one requests 7 vector registers, the first 8.
; Register roles (from the cglobal argument order):
;   r0 = dest, r1 = deststride, r2 = src0, r3 = src1,
;   r4 = srcstride0, r5 = srcstride1
; NOTE(review): most loads/stores, the subtraction, loop control, return and
; %endmacro are not visible in this listing.
4092 %macro PIXELSUB_PS_W16_H4 2
; --- first variant: the second 16-byte half of the odd row is read at +16
; from both sources and written at +16 in dest
4094 cglobal pixel_sub_ps_16x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
4105 movu m6, [r2 + r4 + 16]
4107 movu m7, [r3 + r5 + 16]
4109 lea r2, [r2 + r4 * 2]
4110 lea r3, [r3 + r5 * 2]
4120 movu [r0 + r1 + 16], m6
; second pair of rows of the pass
4128 movu m6, [r2 + r4 + 16]
4129 movu m7, [r3 + r5 + 16]
4130 lea r0, [r0 + r1 * 2]
4131 lea r2, [r2 + r4 * 2]
4132 lea r3, [r3 + r5 * 2]
4142 movu [r0 + r1 + 16], m6
4143 lea r0, [r0 + r1 * 2]
; --- second variant
4148 cglobal pixel_sub_ps_16x%2, 6, 7, 7, dest, deststride, src0, src1, srcstride0, srcstride1
4165 lea r2, [r2 + r4 * 2]
4166 lea r3, [r3 + r5 * 2]
4178 movu [r0 + r1 + 16], m5
4193 lea r2, [r2 + r4 * 2]
4194 lea r3, [r3 + r5 * 2]
4195 lea r0, [r0 + r1 * 2]
4207 movu [r0 + r1 + 16], m5
4208 lea r0, [r0 + r1 * 2]
; instantiations: each (width, height) pair appears twice, presumably once
; per bit-depth / instruction-set selection (surrounding lines not visible)
4217 PIXELSUB_PS_W16_H4 16, 16
4218 PIXELSUB_PS_W16_H4 16, 32
4221 PIXELSUB_PS_W16_H4 16, 16
4222 PIXELSUB_PS_W16_H4 16, 32
4226 ;-----------------------------------------------------------------------------
4227 ; void pixel_sub_ps_32x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
4228 ;-----------------------------------------------------------------------------
; PIXELSUB_PS_W32_H2 width, height
; Emits pixel_sub_ps_32x<height>; two rows are handled per pass (all three
; pointers are stepped by 2 * stride once). Only %2 is used in the visible
; lines. Two definitions of the same name are visible; presumably they are
; the alternatives of a bit-depth conditional that is not visible here.
; Register roles (from the cglobal argument order):
;   r0 = dest, r1 = deststride, r2 = src0, r3 = src1,
;   r4 = srcstride0, r5 = srcstride1
; NOTE(review): the even-row accesses, the first segment of the odd row, the
; subtraction, loop control, return and %endmacro are not visible here.
4229 %macro PIXELSUB_PS_W32_H2 2
; --- first variant: odd row read as 16-byte segments at +16/+32/+48 from both
; sources and written at the same offsets in dest
4231 cglobal pixel_sub_ps_32x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
4258 movu m2, [r2 + r4 + 16]
4259 movu m4, [r2 + r4 + 32]
4260 movu m6, [r2 + r4 + 48]
4262 movu m3, [r3 + r5 + 16]
4263 movu m5, [r3 + r5 + 32]
4264 movu m7, [r3 + r5 + 48]
4265 lea r2, [r2 + r4 * 2]
4266 lea r3, [r3 + r5 * 2]
4274 movu [r0 + r1 + 16], m2
4275 movu [r0 + r1 + 32], m4
4276 movu [r0 + r1 + 48], m6
4277 lea r0, [r0 + r1 * 2]
; --- second variant: 8-byte source loads at +8/+16/+24 pair with 16-byte
; stores at +16/+32/+48, i.e. the output is twice as wide as the input
; (presumably bytes widened to int16_t -- the widening is not visible here)
4282 cglobal pixel_sub_ps_32x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
4315 movh m1, [r2 + r4 + 8]
4316 movh m2, [r2 + r4 + 16]
4317 movh m6, [r2 + r4 + 24]
4319 movh m4, [r3 + r5 + 8]
4320 movh m5, [r3 + r5 + 16]
4321 movh m7, [r3 + r5 + 24]
4322 lea r2, [r2 + r4 * 2]
4323 lea r3, [r3 + r5 * 2]
4339 movu [r0 + r1 + 16], m1
4340 movu [r0 + r1 + 32], m2
4341 movu [r0 + r1 + 48], m6
4342 lea r0, [r0 + r1 * 2]
; instantiations: each (width, height) pair appears twice, presumably once
; per bit-depth / instruction-set selection (surrounding lines not visible)
4351 PIXELSUB_PS_W32_H2 32, 32
4352 PIXELSUB_PS_W32_H2 32, 64
4355 PIXELSUB_PS_W32_H2 32, 32
4356 PIXELSUB_PS_W32_H2 32, 64
4360 ;-----------------------------------------------------------------------------
4361 ; void pixel_sub_ps_64x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
4362 ;-----------------------------------------------------------------------------
4363 %macro PIXELSUB_PS_W64_H2 2
4365 cglobal pixel_sub_ps_64x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
4410 movu m2, [r2 + r4 + 16]
4411 movu m4, [r2 + r4 + 32]
4412 movu m6, [r2 + r4 + 48]
4414 movu m3, [r3 + r5 + 16]
4415 movu m5, [r3 + r5 + 32]
4416 movu m7, [r3 + r5 + 48]
4424 movu [r0 + r1 + 16], m2
4425 movu [r0 + r1 + 32], m4
4426 movu [r0 + r1 + 48], m6
4428 movu m0, [r2 + r4 + 64]
4429 movu m2, [r2 + r4 + 80]
4430 movu m4, [r2 + r4 + 96]
4431 movu m6, [r2 + r4 + 112]
4432 movu m1, [r3 + r5 + 64]
4433 movu m3, [r3 + r5 + 80]
4434 movu m5, [r3 + r5 + 96]
4435 movu m7, [r3 + r5 + 112]
4437 lea r2, [r2 + r4 * 2]
4438 lea r3, [r3 + r5 * 2]
4445 movu [r0 + r1 + 64], m0
4446 movu [r0 + r1 + 80], m2
4447 movu [r0 + r1 + 96], m4
4448 movu [r0 + r1 + 112], m6
4449 lea r0, [r0 + r1 * 2]
; Second definition of pixel_sub_ps_64x%2 with the same cglobal signature.
; NOTE(review): presumably selected instead of the preceding definition by a
; conditional-assembly directive that is not visible here -- confirm.
; Here source byte offsets run 16..48 while destination offsets run 16..112,
; which is consistent with destination elements twice as wide as the source
; elements (widening instructions are not visible -- confirm).
4454 cglobal pixel_sub_ps_64x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
; Bytes 16.. of src0 / src1, one stride past the current row.
4514 movu m2, [r2 + r4 + 16]
4515 movu m5, [r3 + r5 + 16]
; Store m7 to dest (one stride past the current row) at byte offset 16.
4525 movu [r0 + r1 + 16], m7
; Bytes 32.. of src0 / src1.
4527 movu m0, [r2 + r4 + 32]
4528 movu m3, [r3 + r5 + 32]
; Store m1 and m2 to dest at byte offsets 32 and 48.
4538 movu [r0 + r1 + 32], m1
4539 movu [r0 + r1 + 48], m2
; Bytes 48.. of src0 / src1.
4541 movu m4, [r2 + r4 + 48]
4542 movu m5, [r3 + r5 + 48]
; Advance both source pointers by two strides.
4543 lea r2, [r2 + r4 * 2]
4544 lea r3, [r3 + r5 * 2]
; Store to dest at byte offsets 64, 80, 96, 112.
4553 movu [r0 + r1 + 64], m7
4554 movu [r0 + r1 + 80], m0
4561 movu [r0 + r1 + 96], m2
4562 movu [r0 + r1 + 112], m4
; Advance dest by two strides.
4563 lea r0, [r0 + r1 * 2]
; Instantiate PIXELSUB_PS_W64_H2; the second argument is pasted into the
; function name, so each invocation emits pixel_sub_ps_64x64.
; NOTE(review): the two identical invocations are presumably separated by a
; conditional-assembly switch that is not visible here -- confirm.
4572 PIXELSUB_PS_W64_H2 64, 64
4575 PIXELSUB_PS_W64_H2 64, 64
4579 ;=============================================================================
4581 ;=============================================================================
; NOTE(review): the lines below appear to belong to helper code used by the
; pixel_var routines that follow; the definitions that own them are not
; visible here, so the comments describe individual lines only -- confirm
; against the full helpers.
; Clear m6, the register annotated as the sum-of-squares accumulator.
4585 pxor m6, m6 ; sum squared
; Section assembled only when HIGH_BIT_DEPTH is 0.
4586 %if HIGH_BIT_DEPTH == 0
4592 %endif ; !HIGH_BIT_DEPTH
; Special case: 8-byte vector registers (mmsize == 8) with %1*%2 == 256.
4597 %if mmsize == 8 && %1*%2 == 256
4608 %else ; !HIGH_BIT_DEPTH
4616 %endif ; HIGH_BIT_DEPTH
; Loads one vector width (mmsize bytes) past r0, and the same position at a
; further byte offset of %1.
4648 movu m1, [r0+mmsize]
4650 movu m4, [r0+%1+mmsize]
4651 %else ; !HIGH_BIT_DEPTH
; Interleave the high bytes of m0 with m7 into m1.
; NOTE(review): presumably m7 is zero so this widens bytes to words -- confirm.
4653 punpckhbw m1, m0, m7
4657 %endif ; HIGH_BIT_DEPTH
; Second section assembled only when HIGH_BIT_DEPTH is 0.
4663 %if HIGH_BIT_DEPTH == 0
4666 %endif ; !HIGH_BIT_DEPTH
4672 ;-----------------------------------------------------------------------------
4673 ; int pixel_var_wxh( uint8_t *, intptr_t )
4674 ;-----------------------------------------------------------------------------
; pixel_var_16x16: declared with counts "2,3" (presumably 2 arguments and
; 3 GPRs in x86inc's cglobal convention); no vector-register count is given.
4676 cglobal pixel_var_16x16, 2,3
; Invoke VAR_2ROW with arguments 8*SIZEOF_PIXEL, 16 and 1.
; NOTE(review): VAR_2ROW's definition is not fully visible; presumably these
; are a byte offset, an iteration count and a label suffix -- confirm.
4679 VAR_2ROW 8*SIZEOF_PIXEL, 16, 1
; pixel_var_8x8: declared with counts "2,3" (presumably 2 arguments and
; 3 GPRs in x86inc's cglobal convention); no vector-register count is given.
4682 cglobal pixel_var_8x8, 2,3
; pixel_var_16x16: declared with counts "2,3,8" (presumably 2 arguments,
; 3 GPRs and 8 vector registers in x86inc's cglobal convention).
4690 cglobal pixel_var_16x16, 2,3,8
; pixel_var_8x8: declared with counts "2,3,8" (presumably 2 arguments,
; 3 GPRs and 8 vector registers in x86inc's cglobal convention).
4696 cglobal pixel_var_8x8, 2,3,8
; pixel_var_32x32: declared with counts "2,6,8" (presumably 2 arguments,
; 6 GPRs and 8 vector registers in x86inc's cglobal convention).
4712 cglobal pixel_var_32x32, 2,6,8
; pixel_var_64x64: declared with counts "2,6,8" (presumably 2 arguments,
; 6 GPRs and 8 vector registers in x86inc's cglobal convention).
4734 cglobal pixel_var_64x64, 2,6,8
4825 %endif ; HIGH_BIT_DEPTH
4827 %if HIGH_BIT_DEPTH == 0
; pixel_var_8x8 (inside the HIGH_BIT_DEPTH == 0 section): declared with counts
; "2,3,8". The addressing uses r0 as the pixel pointer and r1 as the stride.
4829 cglobal pixel_var_8x8, 2,3,8
; Fill the high halves of m0 and m3 from the rows at r0 + 2*r1 and r0 + r2.
; NOTE(review): r2 presumably holds 3*r1; its setup is not visible -- confirm.
4834 movhps m0, [r0 + r1 * 2]
4835 movhps m3, [r0 + r2]
; DEINTB is defined outside this view; presumably it splits the bytes of
; m0/m3 into word lanes in m1/m0 and m4/m3 using m7 as the mask -- confirm.
4836 DEINTB 1, 0, 4, 3, 7
; Advance the pixel pointer by four rows.
4837 lea r0, [r0 + r1 * 4]
; Same high-half loads for the next group of rows.
4841 movhps m0, [r0 + r1 * 2]
4842 movhps m3, [r0 + r2]
4843 DEINTB 1, 0, 4, 3, 7
; pixel_var_16x16_internal: helper declared without argument or register
; counts; it is the target of the `call` instructions in pixel_var_16x16,
; pixel_var_32x32 and pixel_var_64x64.
; The visible portion works on r0 (pixel pointer) and r1 (stride): it contains
; eight DEINTB invocations and advances r0 by 4*r1 three times.
; NOTE(review): DEINTB is defined outside this view; presumably it splits
; bytes into word lanes using m7 as the mask. The remaining loads and the
; accumulation between the visible lines are not shown -- confirm.
4847 cglobal pixel_var_16x16_internal
4850 DEINTB 1, 0, 4, 3, 7
; Load the row at r0 + 2*r1.
4852 movu m0, [r0 + 2 * r1]
4854 DEINTB 1, 0, 4, 3, 7
; Advance by four rows.
4855 lea r0, [r0 + r1 * 4]
4859 DEINTB 1, 0, 4, 3, 7
4861 movu m0, [r0 + 2 * r1]
4863 DEINTB 1, 0, 4, 3, 7
; Advance by four rows.
4864 lea r0, [r0 + r1 * 4]
4868 DEINTB 1, 0, 4, 3, 7
4870 movu m0, [r0 + 2 * r1]
4872 DEINTB 1, 0, 4, 3, 7
; Advance by four rows.
4873 lea r0, [r0 + r1 * 4]
4877 DEINTB 1, 0, 4, 3, 7
4879 movu m0, [r0 + 2 * r1]
4881 DEINTB 1, 0, 4, 3, 7
; pixel_var_16x16: declared with counts "2,3,8"; the visible body delegates
; to pixel_var_16x16_internal once.
; NOTE(review): setup before the call and the final reduction are not visible
; here -- confirm.
4885 cglobal pixel_var_16x16, 2,3,8
4888 call pixel_var_16x16_internal
; pixel_var_32x32: declared with counts "2,4,8"; the visible body calls
; pixel_var_16x16_internal four times, in two pairs.
; Within each pair r0 is advanced by 4*r1 between the calls.
; NOTE(review): how r0 is repositioned between the two pairs, and the final
; reduction, are not visible here -- confirm.
4891 cglobal pixel_var_32x32, 2,4,8
; First pair of 16x16 blocks.
4895 call pixel_var_16x16_internal
4896 lea r0, [r0 + r1 * 4]
4897 call pixel_var_16x16_internal
; Second pair of 16x16 blocks.
4899 call pixel_var_16x16_internal
4900 lea r0, [r0 + r1 * 4]
4901 call pixel_var_16x16_internal
; pixel_var_64x64: declared with counts "2,6,8"; the visible body calls
; pixel_var_16x16_internal sixteen times, in four groups of four.
; Within each group r0 is advanced by 4*r1 between consecutive calls.
; NOTE(review): how r0 is repositioned between groups, and the final
; reduction, are not visible here -- confirm.
4904 cglobal pixel_var_64x64, 2,6,8
; Group 1: four 16x16 blocks.
4908 call pixel_var_16x16_internal
4909 lea r0, [r0 + r1 * 4]
4910 call pixel_var_16x16_internal
4911 lea r0, [r0 + r1 * 4]
4912 call pixel_var_16x16_internal
4913 lea r0, [r0 + r1 * 4]
4914 call pixel_var_16x16_internal
; Group 2: four 16x16 blocks.
4919 call pixel_var_16x16_internal
4920 lea r0, [r0 + r1 * 4]
4921 call pixel_var_16x16_internal
4922 lea r0, [r0 + r1 * 4]
4923 call pixel_var_16x16_internal
4924 lea r0, [r0 + r1 * 4]
4925 call pixel_var_16x16_internal
; Group 3: four 16x16 blocks.
4931 call pixel_var_16x16_internal
4932 lea r0, [r0 + r1 * 4]
4933 call pixel_var_16x16_internal
4934 lea r0, [r0 + r1 * 4]
4935 call pixel_var_16x16_internal
4936 lea r0, [r0 + r1 * 4]
4937 call pixel_var_16x16_internal
; Group 4: four 16x16 blocks.
4943 call pixel_var_16x16_internal
4944 lea r0, [r0 + r1 * 4]
4945 call pixel_var_16x16_internal
4946 lea r0, [r0 + r1 * 4]
4947 call pixel_var_16x16_internal
4948 lea r0, [r0 + r1 * 4]
4949 call pixel_var_16x16_internal
; pixel_var_16x16 variant declared with counts "2,4,7"; it uses pmovzxbw
; loads and vextracti128, so it is presumably the AVX2 build -- confirm.
4961 cglobal pixel_var_16x16, 2,4,7
; Load rows at r0 + r1, r0 + 2*r1 and r0 + r3, zero-extending each byte to a
; word. NOTE(review): r3 presumably holds 3*r1; its setup is not visible.
4967 pmovzxbw m3, [r0+r1]
4968 pmovzxbw m1, [r0+r1*2]
4969 pmovzxbw m4, [r0+r3]
; Extract the upper 128-bit lanes of m5 and m6 into xm0 and xm1.
; NOTE(review): presumably to fold the two lanes of the accumulators together;
; the following adds are not visible -- confirm.
4974 vextracti128 xm0, m5, 1
4975 vextracti128 xm1, m6, 1
4988 %endif ; !HIGH_BIT_DEPTH
4998 sub eax, r1d ; sqr - (sum * sum >> shift)