| 1 | ;***************************************************************************** |
| 2 | ;* Copyright (C) 2013 x265 project |
| 3 | ;* |
| 4 | ;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com> |
| 5 | ;* |
| 6 | ;* This program is free software; you can redistribute it and/or modify |
| 7 | ;* it under the terms of the GNU General Public License as published by |
| 8 | ;* the Free Software Foundation; either version 2 of the License, or |
| 9 | ;* (at your option) any later version. |
| 10 | ;* |
| 11 | ;* This program is distributed in the hope that it will be useful, |
| 12 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 14 | ;* GNU General Public License for more details. |
| 15 | ;* |
| 16 | ;* You should have received a copy of the GNU General Public License |
| 17 | ;* along with this program; if not, write to the Free Software |
| 18 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
| 19 | ;* |
| 20 | ;* This program is also available under a commercial proprietary license. |
| 21 | ;* For more information, contact us at license @ x265.com. |
| 22 | ;*****************************************************************************/ |
| 23 | |
| 24 | %include "x86inc.asm" |
| 25 | %include "x86util.asm" |
| 26 | |
SECTION_RODATA 32

; ---------------------------------------------------------------------------
; Constant byte tables.  Each entry below is exactly 16 bytes long.
; ---------------------------------------------------------------------------

; pshufb masks: index 8 selects a byte that is zero after a 64-bit load, so
; these two masks widen bytes to words while shuffling.
pb_0_8:             times 8 db 0, 8                 ; byte 0 -> every word
pb_unpackbw1:       times 2 db 1, 8, 2, 8, 3, 8, 4, 8   ; bytes 1..4 -> words
pb_swap8:           times 2 db 7, 6, 5, 4, 3, 2, 1, 0

; pshufb mask that transposes a 4x4 byte matrix held in one register
c_trans_4x4:        db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15

tab_Si:             times 2 db 0, 1, 2, 3, 4, 5, 6, 7
pb_fact0:           db 0, 2, 4, 6, 8, 10, 12, 14
                    times 8 db 0

; index tables named after the 32x32 angular modes they belong to
c_mode32_12_0:      db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 7, 0
c_mode32_13_0:      db 3, 6, 10, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
c_mode32_13_shuf:   times 8 db 0
                    db 7, 6, 5, 4, 3, 2, 1, 0
c_mode32_14_shuf:   db 15, 14, 13, 0, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15
c_mode32_14_0:      db 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
c_mode32_15_0:      db 15, 13, 11, 9, 8, 6, 4, 2
                    times 8 db 0
c_mode32_16_0:      db 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0
c_mode32_17_0:      db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0
c_mode32_18_0:      db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

c_shuf8_0:          db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
c_deinterval8:      db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
tab_S1:             db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0
pb_unpackbq:        times 8 db 0
                    times 8 db 1

; index tables named after the 16x16 angular modes they belong to
c_mode16_12:        db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6
c_mode16_13:        db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4
c_mode16_14:        db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2
c_mode16_15:        times 8 db 0
                    db 15, 13, 11, 9, 8, 6, 4, 2
c_mode16_16:        db 8, 6, 5, 3, 2, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2
c_mode16_17:        db 4, 2, 1, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1
c_mode16_18:        db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
tab_S2:             db 0, 1, 3, 5, 7, 9, 11, 13
                    times 8 db 0

; ang_table: 32 rows of 16 bytes.  Row x is the byte pair (32 - x, x) repeated
; eight times, i.e. the two interpolation weights for fraction x laid out for
; pmaddubsw.  Row x therefore lives at ang_table + x * 16.
const ang_table
%assign x 0
%rep 32
    times 8 db (32-x), x
%assign x x+1
%endrep
| 63 | |
| 64 | SECTION .text |
| 65 | |
| 66 | cextern pw_8 |
| 67 | cextern pw_1024 |
| 68 | cextern pb_unpackbd1 |
| 69 | cextern multiL |
| 70 | cextern multiH |
| 71 | cextern multiH2 |
| 72 | cextern multiH3 |
| 73 | cextern multi_2Row |
| 74 | |
;-----------------------------------------------------------------------------
; void intra_pred_dc4(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
;
; 4x4 DC prediction.  Both reference pointers are advanced by one byte first,
; so [r2 + i] / [r3 + i] below are the i-th left / above neighbour.
;   dc = (sum of 4 left + 4 above + 4) >> 3, written to the whole block.
; If 'filter' (6th argument) is non-zero the first row, the first column and
; the corner pixel are afterwards blended with their neighbours.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc4, 4,6,3
    mov         r4d, r5m                    ; r4d = filter flag
    inc         r2
    inc         r3
    movd        m1, [r2]                    ; 4 left pixels
    movd        m2, [r3]                    ; 4 above pixels
    pxor        m0, m0
    punpckldq   m1, m2
    psadbw      m1, m0                      ; low word = sum of the 8 pixels

    mov         r5d, 4096
    movd        m2, r5d
    pmulhrsw    m1, m2                      ; (sum * 4096 + 0x4000) >> 15 == (sum + 4) >> 3
    movd        r5d, m1                     ; r5d = dc
    pshufb      m1, m0                      ; all-zero mask: replicate the dc byte

    ; plain DC fill of the 4 rows
    movd        [r0], m1
    movd        [r0 + r1], m1
    movd        [r0 + r1 * 2], m1
    test        r4d, r4d                    ; lea/movd below leave EFLAGS alone
    lea         r4, [r1 * 3]
    movd        [r0 + r4], m1
    jz          .done                       ; filter == 0

    ; edge filter constants
    lea         r4d, [r5d * 2 + 2]          ; r4d = 2 * dc + 2
    add         r5d, r4d                    ; r5d = 3 * dc + 2
    movd        m1, r5d
    pshuflw     m1, m1, 0                   ; 3 * dc + 2 in the four low words

    ; first row: (above[x] + 3 * dc + 2) >> 2
    pmovzxbw    m2, [r3]
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    movd        [r0], m2                    ; pixel 0 is rewritten just below

    ; corner: (above[0] + left[0] + 2 * dc + 2) >> 2
    movzx       r3d, byte [r3]
    movzx       r5d, byte [r2]
    add         r3d, r4d
    add         r3d, r5d
    shr         r3d, 2
    mov         [r0], r3b

    ; first column, rows 1..3: (left[y] + 3 * dc + 2) >> 2
    pmovzxbw    m2, [r2 + 1]
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    add         r0, r1                      ; r0 -> row 1
    pextrb      [r0], m2, 0
    pextrb      [r0 + r1], m2, 1
    pextrb      [r0 + r1 * 2], m2, 2

.done:
    RET
| 138 | |
| 139 | |
;-------------------------------------------------------------------------------------------
; void intra_pred_dc8(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
;
; 8x8 DC prediction: dc = (sum of 8 left + 8 above + 8) >> 4 fills the block;
; a non-zero 'filter' additionally smooths the first row / column / corner.
; Reference pointers are advanced by one byte before use.
;-------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc8, 4, 7, 3
    mov         r4d, r5m                    ; r4d = filter flag
    inc         r2
    inc         r3
    pxor        m0, m0
    movh        m1, [r2]                    ; left  -> low  qword
    movhps      m1, [r3]                    ; above -> high qword
    psadbw      m1, m0                      ; one partial sum per qword
    pshufd      m2, m1, 2
    paddw       m1, m2                      ; low word = total

    movd        r5d, m1
    add         r5d, 8
    shr         r5d, 4                      ; r5d = dc
    movd        m1, r5d
    pshufb      m1, m0                      ; replicate the dc byte

    ; plain DC fill, two rows per step
    mov         r6, r0                      ; keep the block origin for the filter
%rep 3
    movh        [r0], m1
    movh        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
%endrep
    movh        [r0], m1
    movh        [r0 + r1], m1

    test        r4d, r4d
    jz          .done                       ; filter == 0

    ; edge filter constants
    lea         r4d, [r5d * 2 + 2]          ; r4d = 2 * dc + 2
    add         r5d, r4d                    ; r5d = 3 * dc + 2
    movd        m1, r5d
    pshuflw     m1, m1, 0
    punpcklqdq  m1, m1                      ; 3 * dc + 2 in all eight words

    ; first row
    pmovzxbw    m2, [r3]
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    movh        [r6], m2

    ; corner: (above[0] + left[0] + 2 * dc + 2) >> 2
    movzx       r3d, byte [r3]
    movzx       r5d, byte [r2]
    add         r3d, r4d
    add         r3d, r5d
    shr         r3d, 2
    mov         [r6], r3b

    ; first column, rows 1..7 (byte k of m2 belongs to row k + 1)
    pmovzxbw    m2, [r2 + 1]
    paddw       m2, m1
    psraw       m2, 2
    packuswb    m2, m2
    lea         r4, [r1 * 3]                ; r4 = 3 * stride
    pextrb      [r6 + r1], m2, 0
    pextrb      [r6 + r1 * 2], m2, 1
    pextrb      [r6 + r4], m2, 2
    pextrb      [r6 + r1 * 4], m2, 3
    lea         r6, [r6 + r1 * 4]           ; r6 -> row 4
    pextrb      [r6 + r1], m2, 4
    pextrb      [r6 + r1 * 2], m2, 5
    pextrb      [r6 + r4], m2, 6

.done:
    RET
| 219 | |
;-------------------------------------------------------------------------------------------
; void intra_pred_dc16(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
;
; 16x16 DC prediction: dc = (sum of 16 left + 16 above + 16) >> 5 fills the
; block; a non-zero 'filter' additionally smooths the first row / column /
; corner.  Reference pointers are advanced by one byte before use.
;
; Only four arguments are loaded by the prologue: dirMode is never read and
; r4 is used for the filter flag (fetched through r5m), matching the
; dc4 / dc8 variants.
;-------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc16, 4, 7, 4
    mov         r4d, r5m                    ; r4d = filter flag
    inc         r2
    inc         r3
    pxor        m0, m0
    movu        m1, [r2]                    ; 16 left pixels
    movu        m2, [r3]                    ; 16 above pixels
    psadbw      m1, m0
    psadbw      m2, m0
    paddw       m1, m2                      ; one partial sum per qword
    pshufd      m2, m1, 2
    paddw       m1, m2                      ; low word = total

    movd        r5d, m1
    add         r5d, 16
    shr         r5d, 5                      ; r5d = dc
    movd        m1, r5d
    pshufb      m1, m0                      ; replicate the dc byte

    ; plain DC fill, two rows per step
    mov         r6, r0                      ; keep the block origin for the filter
%rep 7
    movu        [r0], m1
    movu        [r0 + r1], m1
    lea         r0, [r0 + r1 * 2]
%endrep
    movu        [r0], m1
    movu        [r0 + r1], m1

    test        r4d, r4d
    jz          .done                       ; filter == 0

    ; edge filter constants
    lea         r4d, [r5d * 2 + 2]          ; r4d = 2 * dc + 2
    add         r5d, r4d                    ; r5d = 3 * dc + 2
    movd        m1, r5d
    pshuflw     m1, m1, 0
    punpcklqdq  m1, m1                      ; 3 * dc + 2 in all eight words

    ; first row: (above[x] + 3 * dc + 2) >> 2, eight pixels at a time
    pmovzxbw    m2, [r3]
    pmovzxbw    m3, [r3 + 8]
    paddw       m2, m1
    paddw       m3, m1
    psraw       m2, 2
    psraw       m3, 2
    packuswb    m2, m3
    movu        [r6], m2

    ; corner: (above[0] + left[0] + 2 * dc + 2) >> 2
    movzx       r3d, byte [r3]
    movzx       r5d, byte [r2]
    add         r3d, r4d
    add         r3d, r5d
    shr         r3d, 2
    mov         [r6], r3b

    ; first column, rows 1..15: (left[y] + 3 * dc + 2) >> 2
    pmovzxbw    m2, [r2 + 1]                ; left[1..8]  -> rows 1..8
    pmovzxbw    m3, [r2 + 9]                ; left[9..16] -> rows 9..15 (last byte unused)
    paddw       m2, m1
    paddw       m3, m1
    psraw       m2, 2
    psraw       m3, 2
    packuswb    m2, m2
    packuswb    m3, m3
    lea         r4, [r1 * 3]                ; r4 = 3 * stride

    pextrb      [r6 + r1], m2, 0
    pextrb      [r6 + r1 * 2], m2, 1
    pextrb      [r6 + r4], m2, 2
    pextrb      [r6 + r1 * 4], m2, 3
    lea         r6, [r6 + r1 * 4]           ; r6 -> row 4
    pextrb      [r6 + r1], m2, 4
    pextrb      [r6 + r1 * 2], m2, 5
    pextrb      [r6 + r4], m2, 6
    pextrb      [r6 + r1 * 4], m2, 7
    lea         r6, [r6 + r1 * 4]           ; r6 -> row 8
    pextrb      [r6 + r1], m3, 0
    pextrb      [r6 + r1 * 2], m3, 1
    pextrb      [r6 + r4], m3, 2
    pextrb      [r6 + r1 * 4], m3, 3
    lea         r6, [r6 + r1 * 4]           ; r6 -> row 12
    pextrb      [r6 + r1], m3, 4
    pextrb      [r6 + r1 * 2], m3, 5
    pextrb      [r6 + r4], m3, 6

.done:
    RET
| 334 | |
;-------------------------------------------------------------------------------------------
; void intra_pred_dc32(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
;
; 32x32 DC prediction: dc = (sum of 32 left + 32 above + 32) >> 6 fills the
; block.  dirMode and filter are not read; no edge filtering is done here.
; Reference pointers are advanced by one byte before use.
;-------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_dc32, 4, 5, 5
    inc         r2
    inc         r3
    pxor        m0, m0

    ; sum the 64 neighbours with four psadbw passes
    movu        m1, [r2]
    movu        m2, [r2 + 16]
    movu        m3, [r3]
    movu        m4, [r3 + 16]
    psadbw      m1, m0
    psadbw      m2, m0
    psadbw      m3, m0
    psadbw      m4, m0
    paddw       m1, m2
    paddw       m3, m4
    paddw       m1, m3                      ; one partial sum per qword
    pshufd      m2, m1, 2
    paddw       m1, m2                      ; low word = total

    movd        r4d, m1
    add         r4d, 32
    shr         r4d, 6                      ; r4d = dc
    movd        m1, r4d
    pshufb      m1, m0                      ; replicate the dc byte

    ; fill 32 rows of 32 bytes, two rows per step
%rep 16
    movu        [r0], m1
    movu        [r0 + 16], m1
    movu        [r0 + r1], m1
    movu        [r0 + r1 + 16], m1
    lea         r0, [r0 + 2 * r1]
%endrep

    RET
| 408 | |
;-----------------------------------------------------------------------------------------------------------
; void intra_pred_planar4(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
;
; 4x4 planar prediction, two rows per macro expansion.  Reference pointers are
; advanced by one byte first.  Register roles while the rows are produced:
;   r6d = topRight (above[4])
;   m0  = vertical accumulator for two rows (low qword: first row of the pair)
;   m2  = 2 * (bottomLeft - top[x]), the accumulator step per row pair
;-----------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_planar4, 4,7,5
    inc         r2
    inc         r3
    movzx       r6d, byte [r3 + 4]          ; topRight   = above[4]
    movzx       r4d, byte [r2 + 4]          ; bottomLeft = left[4]
    pmovzxbw    m0, [r3]
    punpcklqdq  m0, m0                      ; top[0..3] in both halves

    movd        m2, r4d
    pshuflw     m2, m2, 0
    punpcklqdq  m2, m2                      ; bottomLeft in all eight words
    psubw       m2, m0                      ; bottomRow[x] = bottomLeft - top[x]

    psllw       m0, 2                       ; 4 * top[x]
    movq        m3, m2                      ; [ 0 | bottomRow ]
    psubw       m0, m3                      ; bias so that the first "+= 2 * bottomRow"
    paddw       m2, m2                      ; yields 1x / 2x bottomRow for rows 0 / 1

; %1 = index of the first row of the pair; clobbers r4, r5, m1, m3, m4
%macro COMP_PRED_PLANAR_2ROW 1
    ; first row of the pair
    movzx       r4d, byte [r2 + %1]
    mov         r5d, r6d
    sub         r5d, r4d                    ; topRight - left[y]
    lea         r4d, [r4d * 4 + 4]          ; 4 * left[y] + 4
    movd        m3, r4d
    movd        m4, r5d
    pshuflw     m3, m3, 0
    pshuflw     m4, m4, 0

    ; second row of the pair
    movzx       r4d, byte [r2 + %1 + 1]
    mov         r5d, r6d
    sub         r5d, r4d
    lea         r4d, [r4d * 4 + 4]
    movd        m1, r4d
    pshuflw     m1, m1, 0
    punpcklqdq  m3, m1                      ; horPred for both rows
    movd        m1, r5d
    pshuflw     m1, m1, 0
    punpcklqdq  m4, m1                      ; rightColumn for both rows

    pmullw      m4, [multi_2Row]            ; per-column weights, defined elsewhere
    paddw       m0, m2                      ; advance the vertical accumulator
    paddw       m3, m4
    paddw       m3, m0
    psraw       m3, 3
    packuswb    m3, m3

    movd        [r0], m3
    pextrd      [r0 + r1], m3, 1
    lea         r0, [r0 + 2 * r1]
%endmacro

    COMP_PRED_PLANAR_2ROW 0
    COMP_PRED_PLANAR_2ROW 2

    RET
| 472 | |
;-----------------------------------------------------------------------------------------------------------
; void intra_pred_planar8(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
;
; 8x8 planar prediction, one row per macro expansion.  Reference pointers are
; advanced by one byte first.  Register roles while the rows are produced:
;   m1 = vertical accumulator (starts at 8 * top[x], grows by m4 per row)
;   m3 = topRight - left[y]        (one word per row y)
;   m4 = bottomLeft - top[x]
;   m6 = 8 * left[y] + 8           (one word per row y)
;-----------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_planar8, 4,4,7
    inc         r2
    inc         r3
    pxor        m0, m0

    movd        m3, [r3 + 8]                ; topRight   = above[8]
    movd        m4, [r2 + 8]                ; bottomLeft = left[8]
    pshufb      m3, m0
    pshufb      m4, m0
    punpcklbw   m3, m0                      ; topRight   in eight words
    punpcklbw   m4, m0                      ; bottomLeft in eight words

    pmovzxbw    m1, [r3]                    ; top[0..7]
    pmovzxbw    m2, [r2]                    ; left[0..7]
    psubw       m4, m1                      ; bottomRow
    psubw       m3, m2                      ; rightColumn

    psllw       m1, 3
    psllw       m2, 3
    mova        m6, [pw_8]
    paddw       m6, m2                      ; 8 * left[y] + 8

; %1 = row index (0..7); clobbers m2, m5
%macro PRED_PLANAR_ROW8 1
  %if (%1 < 4)
    pshuflw     m5, m6, 0x55 * %1           ; pick word %1 of the low half ...
    pshuflw     m2, m3, 0x55 * %1
    punpcklqdq  m5, m5                      ; ... and spread it over the register
    punpcklqdq  m2, m2
  %else
    pshufhw     m5, m6, 0x55 * (%1 - 4)     ; same for a word of the high half
    pshufhw     m2, m3, 0x55 * (%1 - 4)
    punpckhqdq  m5, m5
    punpckhqdq  m2, m2
  %endif

    pmullw      m2, [multiL]                ; per-column weights, defined elsewhere
    paddw       m1, m4                      ; advance the vertical accumulator
    paddw       m5, m2
    paddw       m5, m1
    psraw       m5, 4
    packuswb    m5, m5

    movh        [r0], m5
    add         r0, r1
%endmacro

%assign planar_row 0
%rep 8
    PRED_PLANAR_ROW8 planar_row
%assign planar_row planar_row+1
%endrep
%undef planar_row

    RET
| 535 | |
| 536 | |
;-----------------------------------------------------------------------------------------------------------
; void intra_pred_planar16(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
;
; 16x16 planar prediction, one row per macro expansion.  Reference pointers
; are advanced by one byte first.  Register roles while the rows are produced:
;   r4d    = topRight (above[16])
;   m1, m2 = vertical accumulators for columns 0-7 / 8-15
;   m4, m5 = bottomLeft - top[x] for columns 0-7 / 8-15
; r3 is recycled as a scratch register once topRight has been fetched.
;-----------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_planar16, 4,6,8
    inc         r2
    inc         r3
    pxor        m0, m0
    movzx       r4d, byte [r3 + 16]         ; topRight = above[16]
    movd        m3, [r2 + 16]
    pshufb      m3, m0
    punpcklbw   m3, m0                      ; bottomLeft = left[16] in eight words

    pmovzxbw    m1, [r3]                    ; top[0..7]
    pmovzxbw    m2, [r3 + 8]                ; top[8..15]
    psubw       m4, m3, m1                  ; bottomRow, columns 0-7
    psubw       m5, m3, m2                  ; bottomRow, columns 8-15
    psllw       m1, 4
    psllw       m2, 4

; %1 = row index (0..15); clobbers r3, r5, m3, m6, m7
%macro PRED_PLANAR_ROW16 1
    movzx       r5d, byte [r2 + %1]
    mov         r3d, r4d
    sub         r3d, r5d                    ; topRight - left[y]
    shl         r5d, 4
    add         r5d, 16                     ; 16 * left[y] + 16
    movd        m3, r5d
    movd        m6, r3d
    pshuflw     m3, m3, 0
    pshuflw     m6, m6, 0
    punpcklqdq  m3, m3                      ; horPred in eight words
    punpcklqdq  m6, m6                      ; rightColumn in eight words

    mova        m7, m6
    pmullw      m7, [multiL]                ; weights for columns 0-7  (defined elsewhere)
    pmullw      m6, [multiH]                ; weights for columns 8-15 (defined elsewhere)
    paddw       m1, m4                      ; advance both vertical accumulators
    paddw       m2, m5
    paddw       m7, m3
    paddw       m6, m3
    paddw       m7, m1
    paddw       m6, m2
    psraw       m7, 5
    psraw       m6, 5

    packuswb    m7, m6
    movu        [r0], m7
    add         r0, r1
%endmacro

%assign planar_row 0
%rep 16
    PRED_PLANAR_ROW16 planar_row
%assign planar_row planar_row+1
%endrep
%undef planar_row

    RET
| 609 | |
| 610 | |
;-----------------------------------------------------------------------------------------------------------
; void intra_pred_planar32(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
;
; 32x32 planar prediction, one row per loop iteration (two 16-pixel halves).
; The four "bottomLeft - top[x]" vectors live in m8-m11 on x86-64 and in a
; stack area otherwise.  Register roles inside the loop:
;   r2     = pointer to the current left pixel
;   r3     = remaining row count (recycled after the above pointer is done)
;   r4d    = topRight (above[32])
;   m1..m4 = vertical accumulators for columns 0-7, 8-15, 16-23, 24-31
;-----------------------------------------------------------------------------------------------------------
INIT_XMM sse4
%if ARCH_X86_64 == 1
cglobal intra_pred_planar32, 4,7,12
  %define bottomRow0  m8
  %define bottomRow1  m9
  %define bottomRow2  m10
  %define bottomRow3  m11
%else
cglobal intra_pred_planar32, 4,7,8,0-(4*mmsize)
  %define bottomRow0  [rsp + 0 * mmsize]
  %define bottomRow1  [rsp + 1 * mmsize]
  %define bottomRow2  [rsp + 2 * mmsize]
  %define bottomRow3  [rsp + 3 * mmsize]
%endif
    inc         r2
    inc         r3
    pxor        m3, m3
    movd        m0, [r2 + 32]
    pshufb      m0, m3
    punpcklbw   m0, m3                      ; bottomLeft = left[32] in eight words
    movzx       r4d, byte [r3 + 32]         ; topRight   = above[32]

    ; per 8-column group: remember bottomLeft - top, start accumulator at 32 * top
    pmovzxbw    m1, [r3 + 0]
    psubw       m5, m0, m1
    mova        bottomRow0, m5
    psllw       m1, 5

    pmovzxbw    m2, [r3 + 8]
    psubw       m5, m0, m2
    mova        bottomRow1, m5
    psllw       m2, 5

    pmovzxbw    m3, [r3 + 16]
    psubw       m5, m0, m3
    mova        bottomRow2, m5
    psllw       m3, 5

    pmovzxbw    m4, [r3 + 24]
    psubw       m0, m4
    mova        bottomRow3, m0
    psllw       m4, 5

; %1 = byte offset of the 16-pixel half within the row (0 or 16)
; clobbers r5, r6, m5, m6, m7
%macro COMP_PRED_PLANAR_ROW 1
    movzx       r5d, byte [r2]
    mov         r6d, r4d
    sub         r6d, r5d                    ; topRight - left[y]
    shl         r5d, 5
    add         r5d, 32                     ; 32 * left[y] + 32
    movd        m5, r5d
    movd        m6, r6d
    pshuflw     m5, m5, 0
    pshuflw     m6, m6, 0
    punpcklqdq  m5, m5                      ; horPred in eight words
    punpcklqdq  m6, m6                      ; rightColumn in eight words
    mova        m7, m6

  %if (%1 == 0)
    pmullw      m7, [multiL]                ; column weights are defined elsewhere
    pmullw      m6, [multiH]
    paddw       m1, bottomRow0              ; advance the vertical accumulators
    paddw       m2, bottomRow1
    paddw       m7, m1
    paddw       m6, m2
  %else
    pmullw      m7, [multiH2]
    pmullw      m6, [multiH3]
    paddw       m3, bottomRow2
    paddw       m4, bottomRow3
    paddw       m7, m3
    paddw       m6, m4
  %endif

    paddw       m7, m5
    paddw       m6, m5
    psraw       m7, 6
    psraw       m6, 6

    packuswb    m7, m6
    movu        [r0 + %1], m7
%endmacro

    mov         r3d, 32                     ; rows left
.next_row:
    COMP_PRED_PLANAR_ROW 0
    COMP_PRED_PLANAR_ROW 16
    inc         r2
    add         r0, r1

    dec         r3d
    jnz         .next_row
; NOTE(review): %undef removes single-line macros only; presumably %unmacro was
; intended for this multi-line macro -- confirm assembler support before changing.
%undef COMP_PRED_PLANAR_ROW

    RET
| 718 | |
;-----------------------------------------------------------------------------
; void intraPredAng(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; intra_pred_ang4_2: 4x4 pure-diagonal prediction.  Handles dirMode 34 by
; switching the reference to refAbove; any other value keeps refLeft.
; Output row y is the four reference bytes starting at ref[2 + y].
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal intra_pred_ang4_2, 3,3,4
    cmp         r4m, byte 34
    cmove       r2, r3mp                    ; dirMode == 34: read from refAbove
    movh        m0, [r2 + 2]                ; ref[2..9]

    movd        [r0], m0                    ; row 0 = ref[2..5]
    psrldq      m0, 1
    movd        [r0 + r1], m0               ; row 1 = ref[3..6]
    psrldq      m0, 1
    movd        [r0 + r1 * 2], m0           ; row 2 = ref[4..7]
    psrldq      m0, 1
    lea         r1, [r1 * 3]
    movd        [r0 + r1], m0               ; row 3 = ref[5..8]
    RET
| 736 | |
| 737 | |
; intra_pred_ang4_3: 4x4 angular prediction; dirMode 33 is served by the same
; code reading refAbove.  The tail starting at .do_filter4x4 is shared: the
; other 4x4 angular entry points in this file jump into it.
INIT_XMM sse4
cglobal intra_pred_ang4_3, 3,4,5
    cmp         r4m, byte 33                ; ZF is consumed much later, see below
    cmove       r2, r3mp                    ; dirMode == 33: read from refAbove
    lea         r3, [ang_table + 20 * 16]

    movh        m0, [r2 + 1]                ; ref[1..8]
    palignr     m1, m0, 1                   ; ref[2..8] (top byte is don't-care)
    punpcklbw   m0, m1                      ; pairs (1,2) (2,3) (3,4) ... (7,8)
    palignr     m1, m0, 2                   ; pairs starting at (2,3)
    palignr     m2, m0, 4                   ; pairs starting at (3,4)
    palignr     m3, m0, 6                   ; pairs starting at (4,5)
    punpcklqdq  m0, m1                      ; m0 = [ line 1 | line 0 ]
    punpcklqdq  m2, m3                      ; m2 = [ line 3 | line 2 ]

    movh        m3, [r3 + 6 * 16]           ; line 0 weights: ang_table[26]
    movhps      m3, [r3]                    ; line 1 weights: ang_table[20]
    movh        m4, [r3 - 6 * 16]           ; line 2 weights: ang_table[14]
    movhps      m4, [r3 - 12 * 16]          ; line 3 weights: ang_table[ 8]
    jmp         .do_filter4x4               ; hop over the alignment padding

; Shared tail.  Inputs:
;   m0 = reference byte pairs for [ line 1 | line 0 ]
;   m2 = reference byte pairs for [ line 3 | line 2 ]
;   m3, m4 = matching weight pairs taken from ang_table
;   ZF = 1 -> store the lines as rows, ZF = 0 -> transpose first
ALIGN 16
.do_filter4x4:
    pmaddubsw   m0, m3
    pmaddubsw   m2, m4
    pmulhrsw    m0, [pw_1024]               ; (x * 1024 + 0x4000) >> 15 == (x + 16) >> 5
    pmulhrsw    m2, [pw_1024]
    packuswb    m0, m2

    ; NOTE: ZF still comes from the caller's "cmp r4m, ..." -- fragile, but none
    ; of the instructions executed in between modifies EFLAGS.
    jz          .write

    pshufb      m0, [c_trans_4x4]           ; transpose the 4x4 block

.write:
    movd        [r0], m0
    pextrd      [r0 + r1], m0, 1
    pextrd      [r0 + r1 * 2], m0, 2
    lea         r1, [r1 * 3]
    pextrd      [r0 + r1], m0, 3
    RET
| 783 | |
| 784 | |
; intra_pred_ang4_4: 4x4 angular prediction; dirMode 32 reads refAbove instead
; of refLeft.  Builds the operands and finishes in the shared ang4_3 tail.
cglobal intra_pred_ang4_4, 3,4,5
    cmp         r4m, byte 32                ; ZF is consumed by the shared tail
    cmove       r2, r3mp
    lea         r3, [ang_table + 18 * 16]

    movh        m0, [r2 + 1]                ; ref[1..8]
    palignr     m1, m0, 1                   ; ref[2..8]
    punpcklbw   m0, m1                      ; pairs (1,2) (2,3) (3,4) ...
    palignr     m1, m0, 2                   ; pairs starting at (2,3)
    palignr     m3, m0, 4                   ; pairs starting at (3,4)
    punpcklqdq  m2, m1, m3                  ; m2 = [ (3,4).. | (2,3).. ]
    punpcklqdq  m0, m1                      ; m0 = [ (2,3).. | (1,2).. ]

    movh        m3, [r3 + 3 * 16]           ; line 0 weights: ang_table[21]
    movhps      m3, [r3 - 8 * 16]           ; line 1 weights: ang_table[10]
    movh        m4, [r3 + 13 * 16]          ; line 2 weights: ang_table[31]
    movhps      m4, [r3 + 2 * 16]           ; line 3 weights: ang_table[20]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
| 802 | |
| 803 | |
; intra_pred_ang4_5: 4x4 angular prediction; dirMode 31 reads refAbove instead
; of refLeft.  Builds the operands and finishes in the shared ang4_3 tail.
cglobal intra_pred_ang4_5, 3,4,5
    cmp         r4m, byte 31                ; ZF is consumed by the shared tail
    cmove       r2, r3mp
    lea         r3, [ang_table + 10 * 16]

    movh        m0, [r2 + 1]                ; ref[1..8]
    palignr     m1, m0, 1                   ; ref[2..8]
    punpcklbw   m0, m1                      ; pairs (1,2) (2,3) (3,4) ...
    palignr     m1, m0, 2                   ; pairs starting at (2,3)
    palignr     m3, m0, 4                   ; pairs starting at (3,4)
    punpcklqdq  m2, m1, m3                  ; m2 = [ (3,4).. | (2,3).. ]
    punpcklqdq  m0, m1                      ; m0 = [ (2,3).. | (1,2).. ]

    movh        m3, [r3 + 7 * 16]           ; line 0 weights: ang_table[17]
    movhps      m3, [r3 - 8 * 16]           ; line 1 weights: ang_table[ 2]
    movh        m4, [r3 + 9 * 16]           ; line 2 weights: ang_table[19]
    movhps      m4, [r3 - 6 * 16]           ; line 3 weights: ang_table[ 4]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
| 821 | |
| 822 | |
; intra_pred_ang4_6: 4x4 angular prediction; dirMode 30 reads refAbove instead
; of refLeft.  Builds the operands and finishes in the shared ang4_3 tail.
cglobal intra_pred_ang4_6, 3,4,5
    cmp         r4m, byte 30                ; ZF is consumed by the shared tail
    cmove       r2, r3mp
    lea         r3, [ang_table + 19 * 16]

    ; weights first: they do not depend on the reference pixels
    movh        m3, [r3 - 6 * 16]           ; line 0 weights: ang_table[13]
    movhps      m3, [r3 + 7 * 16]           ; line 1 weights: ang_table[26]
    movh        m4, [r3 - 12 * 16]          ; line 2 weights: ang_table[ 7]
    movhps      m4, [r3 + 1 * 16]           ; line 3 weights: ang_table[20]

    movh        m0, [r2 + 1]                ; ref[1..8]
    palignr     m1, m0, 1                   ; ref[2..8]
    punpcklbw   m0, m1                      ; pairs (1,2) (2,3) (3,4) ...
    palignr     m2, m0, 2                   ; pairs starting at (2,3)
    punpcklqdq  m0, m0                      ; lines 0 and 1 share the (1,2).. pairs
    punpcklqdq  m2, m2                      ; lines 2 and 3 share the (2,3).. pairs
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
| 839 | |
| 840 | |
; intra_pred_ang4_7: 4x4 angular prediction; dirMode 29 reads refAbove instead
; of refLeft.  Builds the operands and finishes in the shared ang4_3 tail.
cglobal intra_pred_ang4_7, 3,4,5
    cmp         r4m, byte 29                ; ZF is consumed by the shared tail
    cmove       r2, r3mp
    lea         r3, [ang_table + 20 * 16]

    movh        m0, [r2 + 1]                ; ref[1..8]
    palignr     m1, m0, 1                   ; ref[2..8]
    punpcklbw   m0, m1                      ; pairs (1,2) (2,3) (3,4) ...
    palignr     m3, m0, 2                   ; pairs starting at (2,3)
    punpcklqdq  m2, m0, m3                  ; m2 = [ (2,3).. | (1,2).. ]
    punpcklqdq  m0, m0                      ; lines 0 and 1 share the (1,2).. pairs

    movh        m3, [r3 - 11 * 16]          ; line 0 weights: ang_table[ 9]
    movhps      m3, [r3 - 2 * 16]           ; line 1 weights: ang_table[18]
    movh        m4, [r3 + 7 * 16]           ; line 2 weights: ang_table[27]
    movhps      m4, [r3 - 16 * 16]          ; line 3 weights: ang_table[ 4]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
| 857 | |
| 858 | |
; intra_pred_ang4_8: 4x4 angular prediction; dirMode 28 reads refAbove instead
; of refLeft.  All four lines use the same reference pairs, only the weights
; differ.  Finishes in the shared ang4_3 tail.
cglobal intra_pred_ang4_8, 3,4,5
    cmp         r4m, byte 28                ; ZF is consumed by the shared tail
    cmove       r2, r3mp
    lea         r3, [ang_table + 13 * 16]

    ; weights first: they do not depend on the reference pixels
    movh        m3, [r3 - 8 * 16]           ; line 0 weights: ang_table[ 5]
    movhps      m3, [r3 - 3 * 16]           ; line 1 weights: ang_table[10]
    movh        m4, [r3 + 2 * 16]           ; line 2 weights: ang_table[15]
    movhps      m4, [r3 + 7 * 16]           ; line 3 weights: ang_table[20]

    movh        m0, [r2 + 1]                ; ref[1..8]
    palignr     m1, m0, 1                   ; ref[2..8]
    punpcklbw   m0, m1                      ; pairs (1,2) (2,3) (3,4) ...
    punpcklqdq  m0, m0
    mova        m2, m0                      ; same pairs for lines 2 and 3
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
| 874 | |
| 875 | |
; intra_pred_ang4_9: 4x4 angular prediction; dirMode 27 reads refAbove instead
; of refLeft.  All four lines use the same reference pairs, only the weights
; differ.  Finishes in the shared ang4_3 tail.
cglobal intra_pred_ang4_9, 3,4,5
    cmp         r4m, byte 27                ; ZF is consumed by the shared tail
    cmove       r2, r3mp
    lea         r3, [ang_table + 4 * 16]

    ; weights first: they do not depend on the reference pixels
    movh        m3, [r3 - 2 * 16]           ; line 0 weights: ang_table[2]
    movhps      m3, [r3]                    ; line 1 weights: ang_table[4]
    movh        m4, [r3 + 2 * 16]           ; line 2 weights: ang_table[6]
    movhps      m4, [r3 + 4 * 16]           ; line 3 weights: ang_table[8]

    movh        m0, [r2 + 1]                ; ref[1..8]
    palignr     m1, m0, 1                   ; ref[2..8]
    punpcklbw   m0, m1                      ; pairs (1,2) (2,3) (3,4) ...
    punpcklqdq  m0, m0
    mova        m2, m0                      ; same pairs for lines 2 and 3
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
| 891 | |
| 892 | |
; intra_pred_ang4_10: 4x4 prediction where output row y is built from the
; single pixel refLeft[1 + y].  When bFilter (6th argument) is non-zero, row 0
; is additionally corrected with half the gradient along refAbove.
cglobal intra_pred_ang4_10, 3,3,4
    movd        m0, [r2 + 1]                ; refLeft[1..4]
    pshufb      m0, [pb_unpackbd1]          ; presumably spreads byte y over dword y
                                            ; (mask is defined elsewhere -- confirm)

    ; rows 1..3 are final; row 0 is written last because it may be filtered
    pextrd      [r0 + r1], m0, 1
    pextrd      [r0 + r1 * 2], m0, 2
    lea         r1, [r1 * 3]
    pextrd      [r0 + r1], m0, 3

    cmp         r5m, byte 0
    jz          .row0

    ; row0[x] = clip(row0[x] + ((refAbove[1 + x] - refAbove[0]) >> 1))
    mov         r2, r3mp
    movh        m1, [r2]                    ; refAbove[0..7]
    pmovzxbw    m0, m0                      ; low four words = row 0 pixels
    pshufb      m2, m1, [pb_0_8]            ; refAbove[0] as words
    pshufb      m1, [pb_unpackbw1]          ; refAbove[1..4] as words
    psubw       m1, m2
    psraw       m1, 1
    paddw       m0, m1
    packuswb    m0, m0

.row0:
    movd        [r0], m0
    RET
| 922 | |
| 923 | |
; intra_pred_ang4_26: 4x4 prediction that copies refAbove[1..4] into every
; row.  When bFilter (6th argument) is non-zero, column 0 is then corrected
; with half the gradient along refLeft.
INIT_XMM sse4
cglobal intra_pred_ang4_26, 4,4,3
    movd        m0, [r3 + 1]                ; refAbove[1..4]
    lea         r3, [r1 * 3]                ; r3 = 3 * stride from here on

    movd        [r0], m0
    movd        [r0 + r1], m0
    movd        [r0 + r1 * 2], m0
    movd        [r0 + r3], m0

    cmp         r5m, byte 0
    jz          .done

    ; col0[y] = clip(refAbove[1] + ((refLeft[1 + y] - refLeft[0]) >> 1))
    movh        m1, [r2]                    ; refLeft[0..7]
    pshufb      m0, [pb_0_8]                ; refAbove[1] as words
    pshufb      m2, m1, [pb_0_8]            ; refLeft[0] as words
    pshufb      m1, [pb_unpackbw1]          ; refLeft[1..4] as words
    psubw       m1, m2
    psraw       m1, 1
    paddw       m0, m1
    packuswb    m0, m0

    pextrb      [r0], m0, 0
    pextrb      [r0 + r1], m0, 1
    pextrb      [r0 + r1 * 2], m0, 2
    pextrb      [r0 + r3], m0, 3

.done:
    RET
| 955 | |
| 956 | |
; intra_pred_ang4_11: 4x4 angular prediction; dirMode 25 reads refAbove instead
; of refLeft.  All four lines interpolate between the same reference pairs
; (ref[x], ref[x + 1]); only the weights differ.  Finishes in the shared
; ang4_3 tail.
;
; The weight rows are addressed relative to ang_table + 24 * 16, so the
; displacements +6, +4, +2, +0 select rows 30, 28, 26 and 24 in that order.
cglobal intra_pred_ang4_11, 3,4,5
    cmp         r4m, byte 25                ; ZF is consumed by the shared tail
    cmove       r2, r3mp
    lea         r3, [ang_table + 24 * 16]

    movh        m0, [r2]                    ; ref[0..7]
    palignr     m1, m0, 1                   ; ref[1..7]
    punpcklbw   m0, m1                      ; pairs (0,1) (1,2) (2,3) (3,4) ...
    punpcklqdq  m0, m0
    mova        m2, m0                      ; same pairs for lines 2 and 3

    movh        m3, [r3 + 6 * 16]           ; line 0 weights: ang_table[30]
    movhps      m3, [r3 + 4 * 16]           ; line 1 weights: ang_table[28]
    movh        m4, [r3 + 2 * 16]           ; line 2 weights: ang_table[26]
    movhps      m4, [r3 + 0 * 16]           ; line 3 weights: ang_table[24]
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
| 972 | |
| 973 | |
; intra_pred_ang4_12: 4x4 angular prediction; dirMode 24 reads refAbove instead
; of refLeft.  All four lines interpolate between the same reference pairs
; (ref[x], ref[x + 1]); only the weights differ.  Finishes in the shared
; ang4_3 tail.
cglobal intra_pred_ang4_12, 3,4,5
    cmp         r4m, byte 24                ; ZF is consumed by the shared tail
    cmove       r2, r3mp
    lea         r3, [ang_table + 20 * 16]

    ; weights first: they do not depend on the reference pixels
    movh        m3, [r3 + 7 * 16]           ; line 0 weights: ang_table[27]
    movhps      m3, [r3 + 2 * 16]           ; line 1 weights: ang_table[22]
    movh        m4, [r3 - 3 * 16]           ; line 2 weights: ang_table[17]
    movhps      m4, [r3 - 8 * 16]           ; line 3 weights: ang_table[12]

    movh        m0, [r2]                    ; ref[0..7]
    palignr     m1, m0, 1                   ; ref[1..7]
    punpcklbw   m0, m1                      ; pairs (0,1) (1,2) (2,3) (3,4) ...
    punpcklqdq  m0, m0
    mova        m2, m0                      ; same pairs for lines 2 and 3
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
| 989 | |
| 990 | |
; intra_pred_ang4_13: 4x4 angular prediction set-up for dirMode 13, or for
; dirMode 23 with the two reference pointers (r2, r3) exchanged.
; This angle needs one extra sample taken from the other reference array:
; byte [r3 + 4] is placed as element "x" in front of sample 0.
;   m0 = pairs starting at sample 0 in both qwords
;   m2 = low qword: pairs starting at sample 0, high qword: pairs starting at x
;   m3/m4 = ang_table rows 23, 14, 5, 28
; NOTE(review): .do_filter4x4 is defined outside this chunk; its register
; contract is inferred from this caller - confirm.
| 991 | cglobal intra_pred_ang4_13, 4,4,5 |
| 992 | cmp r4m, byte 23 |
| 993 | jnz .load |
| 994 | xchg r2, r3 |
| 995 | .load: |
; load from one byte before r2; that byte is only a placeholder for x
| 996 | movh m1, [r2 - 1] ; [x x 4 3 2 1 0 x] |
| 997 | palignr m0, m1, 1 ; [x x x 4 3 2 1 0] |
| 998 | palignr m2, m1, 2 ; [x x x x 4 3 2 1] |
; m0 and m2 were extracted above, so this only changes byte 0 of m1
| 999 | pinsrb m1, [r3 + 4], 0 |
| 1000 | punpcklbw m1, m0 ; [3 2 2 1 1 0 0 x] |
| 1001 | punpcklbw m0, m2 ; [4 3 3 2 2 1 1 0] |
| 1002 | punpcklqdq m2, m0, m1 |
| 1003 | punpcklqdq m0, m0 |
| 1004 | |
; r3 is free again and is reused as the weight-table base
| 1005 | lea r3, [ang_table + 21 * 16] |
| 1006 | movh m3, [r3 + 2 * 16] ; [23] |
| 1007 | movhps m3, [r3 - 7 * 16] ; [14] |
| 1008 | movh m4, [r3 - 16 * 16] ; [ 5] |
| 1009 | movhps m4, [r3 + 7 * 16] ; [28] |
| 1010 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) |
| 1011 | |
| 1012 | |
; intra_pred_ang4_14: 4x4 angular prediction set-up for dirMode 14, or for
; dirMode 22 with the two reference pointers (r2, r3) exchanged.
; One sample from the other reference array, byte [r3 + 2], is placed as
; element "x" in front of sample 0.
;   m0 = pairs starting at sample 0 in both qwords
;   m2 = pairs starting at x in both qwords
;   m3/m4 = ang_table rows 19, 6, 25, 12
; NOTE(review): .do_filter4x4 is defined outside this chunk; its register
; contract is inferred from this caller - confirm.
| 1013 | cglobal intra_pred_ang4_14, 4,4,5 |
| 1014 | cmp r4m, byte 22 |
| 1015 | jnz .load |
| 1016 | xchg r2, r3 |
| 1017 | .load: |
| 1018 | movh m2, [r2 - 1] ; [x x 4 3 2 1 0 x] |
| 1019 | palignr m0, m2, 1 ; [x x x 4 3 2 1 0] |
| 1020 | palignr m1, m2, 2 ; [x x x x 4 3 2 1] |
; m0 and m1 were extracted above, so this only changes byte 0 of m2
| 1021 | pinsrb m2, [r3 + 2], 0 |
| 1022 | punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x] |
| 1023 | punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] |
| 1024 | punpcklqdq m0, m0 |
| 1025 | punpcklqdq m2, m2 |
| 1026 | |
| 1027 | lea r3, [ang_table + 19 * 16] |
| 1028 | movh m3, [r3 + 0 * 16] ; [19] |
| 1029 | movhps m3, [r3 - 13 * 16] ; [ 6] |
| 1030 | movh m4, [r3 + 6 * 16] ; [25] |
| 1031 | movhps m4, [r3 - 7 * 16] ; [12] |
| 1032 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) |
| 1033 | |
| 1034 | |
; intra_pred_ang4_15: 4x4 angular prediction set-up for dirMode 15, or for
; dirMode 21 with the two reference pointers (r2, r3) exchanged.
; Two samples from the other reference array extend the main reference to the
; left: x = byte [r3 + 2], y = byte [r3 + 4].
;   m0 = low qword: pairs starting at sample 0, high qword: pairs starting at x
;   m2 = low qword: pairs starting at x,        high qword: pairs starting at y
;   m3/m4 = ang_table rows 15, 30, 13, 28 (m3/m4 serve as temporaries first)
; NOTE(review): .do_filter4x4 is defined outside this chunk; its register
; contract is inferred from this caller - confirm.
| 1035 | cglobal intra_pred_ang4_15, 4,4,5 |
| 1036 | cmp r4m, byte 21 |
| 1037 | jnz .load |
| 1038 | xchg r2, r3 |
| 1039 | .load: |
| 1040 | movh m2, [r2 - 1] ; [x x 4 3 2 1 0 x] |
| 1041 | palignr m0, m2, 1 ; [x x x 4 3 2 1 0] |
| 1042 | palignr m1, m2, 2 ; [x x x x 4 3 2 1] |
| 1043 | pinsrb m2, [r3 + 2], 0 |
| 1044 | pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y] |
| 1045 | pinsrb m3, [r3 + 4], 0 |
| 1046 | punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y] |
| 1047 | punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x] |
| 1048 | punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] |
| 1049 | punpcklqdq m0, m2 |
| 1050 | punpcklqdq m2, m4 |
| 1051 | |
; the side samples are consumed; r3, m3 and m4 are reused for the weights
| 1052 | lea r3, [ang_table + 23 * 16] |
| 1053 | movh m3, [r3 - 8 * 16] ; [15] |
| 1054 | movhps m3, [r3 + 7 * 16] ; [30] |
| 1055 | movh m4, [r3 - 10 * 16] ; [13] |
| 1056 | movhps m4, [r3 + 5 * 16] ; [28] |
| 1057 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) |
| 1058 | |
| 1059 | |
; intra_pred_ang4_16: 4x4 angular prediction set-up for dirMode 16, or for
; dirMode 20 with the two reference pointers (r2, r3) exchanged.
; Two samples from the other reference array extend the main reference to the
; left: x = byte [r3 + 2], y = byte [r3 + 3].
;   m0 = low qword: pairs starting at sample 0, high qword: pairs starting at x
;   m2 = low qword: pairs starting at x,        high qword: pairs starting at y
;   m3/m4 = ang_table rows 11, 22, 1, 12 (m3/m4 serve as temporaries first)
; NOTE(review): .do_filter4x4 is defined outside this chunk; its register
; contract is inferred from this caller - confirm.
| 1060 | cglobal intra_pred_ang4_16, 4,4,5 |
| 1061 | cmp r4m, byte 20 |
| 1062 | jnz .load |
| 1063 | xchg r2, r3 |
| 1064 | .load: |
| 1065 | movh m2, [r2 - 1] ; [x x 4 3 2 1 0 x] |
| 1066 | palignr m0, m2, 1 ; [x x x 4 3 2 1 0] |
| 1067 | palignr m1, m2, 2 ; [x x x x 4 3 2 1] |
| 1068 | pinsrb m2, [r3 + 2], 0 |
| 1069 | pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y] |
| 1070 | pinsrb m3, [r3 + 3], 0 |
| 1071 | punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y] |
| 1072 | punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x] |
| 1073 | punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] |
| 1074 | punpcklqdq m0, m2 |
| 1075 | punpcklqdq m2, m4 |
| 1076 | |
; the side samples are consumed; r3, m3 and m4 are reused for the weights
| 1077 | lea r3, [ang_table + 19 * 16] |
| 1078 | movh m3, [r3 - 8 * 16] ; [11] |
| 1079 | movhps m3, [r3 + 3 * 16] ; [22] |
| 1080 | movh m4, [r3 - 18 * 16] ; [ 1] |
| 1081 | movhps m4, [r3 - 7 * 16] ; [12] |
| 1082 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) |
| 1083 | |
| 1084 | |
; intra_pred_ang4_17: 4x4 angular prediction set-up for dirMode 17, or for
; dirMode 19 with the two reference pointers (r2, r3) exchanged.
; Three samples from the other reference array extend the main reference to
; the left: x = byte [r3 + 1], y = byte [r3 + 2], z = byte [r3 + 4].
;   m0 = low qword: pairs starting at sample 0, high qword: pairs starting at x
;   m2 = low qword: pairs starting at y,        high qword: pairs starting at z
;   m3/m4 = ang_table rows 6, 12, 18, 24 (m3/m4 serve as temporaries first)
; NOTE(review): .do_filter4x4 is defined outside this chunk; its register
; contract is inferred from this caller - confirm.
| 1085 | cglobal intra_pred_ang4_17, 4,4,5 |
| 1086 | cmp r4m, byte 19 |
| 1087 | jnz .load |
| 1088 | xchg r2, r3 |
| 1089 | .load: |
| 1090 | movh m3, [r2 - 1] ; [- - 4 3 2 1 0 x] |
| 1091 | palignr m0, m3, 1 ; [- - - 4 3 2 1 0] |
| 1092 | palignr m1, m3, 2 ; [- - - - 4 3 2 1] |
; keep an un-interleaved copy of [.. 4 3 2 1 0] for the x-based pairs below
| 1093 | mova m4, m0 |
| 1094 | punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] |
| 1095 | |
| 1096 | pinsrb m3, [r3 + 1], 0 |
| 1097 | punpcklbw m1, m3, m4 ; [3 2 2 1 1 0 0 x] |
| 1098 | punpcklqdq m0, m1 |
| 1099 | |
; each pslldq/pinsrb pair prepends one more side sample
| 1100 | pslldq m2, m3, 1 ; [- 4 3 2 1 0 x y] |
| 1101 | pinsrb m2, [r3 + 2], 0 |
| 1102 | pslldq m1, m2, 1 ; [4 3 2 1 0 x y z] |
| 1103 | pinsrb m1, [r3 + 4], 0 |
| 1104 | punpcklbw m1, m2 ; [1 0 0 x x y y z] |
| 1105 | punpcklbw m2, m3 ; [2 1 1 0 0 x x y] |
| 1106 | punpcklqdq m2, m1 |
| 1107 | |
| 1108 | lea r3, [ang_table + 14 * 16] |
| 1109 | movh m3, [r3 - 8 * 16] ; [ 6] |
| 1110 | movhps m3, [r3 - 2 * 16] ; [12] |
| 1111 | movh m4, [r3 + 4 * 16] ; [18] |
| 1112 | movhps m4, [r3 + 10 * 16] ; [24] |
| 1113 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) |
| 1114 | |
| 1115 | |
; intra_pred_ang4_18: 4x4 prediction that only copies reference samples, no
; weighting. Builds one 8-byte vector
;   [ r2[3] r2[2] r2[1] r2[0] r3[1] r3[2] r3[3] r3[4] ]   (byte 0 first)
; and stores a 4-byte window of it per row: row 3 starts at byte 0, row 2 at
; byte 1, row 1 at byte 2 and row 0 at byte 3, so each row is the row below
; shifted by one sample.
; r0 = dst, r1 = dst stride (both used as such by the stores below).
; NOTE(review): r2 / r3 presumably are refLeft / refAbove as in the
; intraPredAng8 prototype comment further down this file - confirm.
| 1116 | cglobal intra_pred_ang4_18, 4,4,1 |
; r2 is reused as data: the four bytes at [r2], byte-reversed
| 1117 | mov r2d, [r2] |
| 1118 | bswap r2d |
| 1119 | movd m0, r2d |
| 1120 | pinsrd m0, [r3 + 1], 1 ; [- 3 2 1 0 -1 -2 -3] |
; r2 now becomes 3 * stride for addressing row 3
| 1121 | lea r2, [r1 * 3] |
| 1122 | movd [r0 + r2], m0 |
| 1123 | psrldq m0, 1 |
| 1124 | movd [r0 + r1 * 2], m0 |
| 1125 | psrldq m0, 1 |
| 1126 | movd [r0 + r1], m0 |
| 1127 | psrldq m0, 1 |
| 1128 | movd [r0], m0 |
| 1129 | RET |
| 1130 | ;----------------------------------------------------------------------------- |
| 1131 | ; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) |
| 1132 | ;----------------------------------------------------------------------------- |
| 1133 | INIT_XMM ssse3 |
; intra_pred_ang8_2: 8x8 prediction that only copies reference samples.
; Serves dirMode 2 and, when dirMode == 34, the same copy taken from the
; r3mp reference pointer instead of r2.
; Loads 16 bytes starting at reference offset 2; row y of the block is the
; 8-byte window starting at byte y of that vector.
; r0 = dst, r1 = dst stride, r4 = 3 * stride.
| 1134 | cglobal intra_pred_ang8_2, 3,5,2 |
| 1135 | cmp r4m, byte 34 |
| 1136 | cmove r2, r3mp |
| 1137 | movu m0, [r2 + 2] |
| 1138 | lea r4, [r1 * 3] |
| 1139 | |
; two-operand palignr: the low 8 bytes of m1 become m0 bytes k..k+7 (k <= 7),
; so whatever m1 held before never reaches the stored half
| 1140 | movh [r0], m0 |
| 1141 | palignr m1, m0, 1 |
| 1142 | movh [r0 + r1], m1 |
| 1143 | palignr m1, m0, 2 |
| 1144 | movh [r0 + r1 * 2], m1 |
| 1145 | palignr m1, m0, 3 |
| 1146 | movh [r0 + r4], m1 |
| 1147 | palignr m1, m0, 4 |
; advance dst by four rows for the lower half of the block
| 1148 | lea r0, [r0 + r1 * 4] |
| 1149 | movh [r0], m1 |
| 1150 | palignr m1, m0, 5 |
| 1151 | movh [r0 + r1], m1 |
| 1152 | palignr m1, m0, 6 |
| 1153 | movh [r0 + r1 * 2], m1 |
| 1154 | palignr m1, m0, 7 |
| 1155 | movh [r0 + r4], m1 |
| 1156 | RET |
| 1157 | |
| 1158 | INIT_XMM sse4 |
; intra_pred_ang8_3: 8x8 angular prediction for dirMode 3 and, when
; dirMode == 33, the same computation fed from the r3mp reference pointer.
; Also hosts the shared tail (.transpose8x8 / .store) that the other
; intra_pred_ang8_N routines in this file jump into.
;
; Method: interleave neighbouring reference bytes into pairs, pmaddubsw each
; shifted window with one 16-byte row of ang_table, then pmulhrsw with
; pw_1024 and pack to bytes.
; NOTE(review): ang_table and pw_1024 are defined outside this chunk.
; Presumably row f holds the byte weights (32 - f, f) and pw_1024 holds words
; of 1024, which would make pmulhrsw compute (x + 16) >> 5 - confirm.
;
; Tail contract on entry to .transpose8x8:
;   m4 = outputs 0|1, m5 = 2|3, m6 = 4|5, m1 = 6|7 (low qword | high qword)
;   ZF = result of the initial "cmp r4m, byte N". None of the instructions in
;        between (cmov, mov, lea, SSE ops, jmp) writes EFLAGS.
;   ZF set   -> the vectors already are the rows and are stored directly.
;   ZF clear -> the 8x8 byte matrix is transposed before the store.
;   r0 = dst, r1 = dst stride; r4 is overwritten with 3 * stride.
; NOTE(review): every routine that jumps here returns through this RET, so
; presumably its cglobal register counts must match this one (5 GPRs, 8 XMM)
; for the epilogue to be valid - confirm against x86inc.asm.
| 1159 | cglobal intra_pred_ang8_3, 3,5,8 |
| 1160 | cmp r4m, byte 33 |
| 1161 | cmove r2, r3mp |
| 1162 | lea r3, [ang_table + 22 * 16] |
| 1163 | lea r4, [ang_table + 8 * 16] |
| 1164 | mova m3, [pw_1024] |
| 1165 | |
| 1166 | movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] |
| 1167 | palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] |
| 1168 | |
| 1169 | punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] |
| 1170 | punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] |
| 1171 | palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] |
| 1172 | |
; outputs 0|1 -> m4
| 1173 | pmaddubsw m4, m0, [r3 + 4 * 16] ; [26] |
| 1174 | pmulhrsw m4, m3 |
| 1175 | pmaddubsw m1, [r3 - 2 * 16] ; [20] |
| 1176 | pmulhrsw m1, m3 |
| 1177 | packuswb m4, m1 |
| 1178 | |
; outputs 2|3 -> m5
| 1179 | palignr m5, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] |
| 1180 | |
| 1181 | pmaddubsw m5, [r3 - 8 * 16] ; [14] |
| 1182 | pmulhrsw m5, m3 |
| 1183 | |
| 1184 | palignr m6, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] |
| 1185 | |
| 1186 | pmaddubsw m6, [r4] ; [ 8] |
| 1187 | pmulhrsw m6, m3 |
| 1188 | packuswb m5, m6 |
| 1189 | |
; outputs 4|5 -> m6 (both use the same sample window, different weights)
| 1190 | palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5] |
| 1191 | |
| 1192 | pmaddubsw m6, m1, [r4 - 6 * 16] ; [ 2] |
| 1193 | pmulhrsw m6, m3 |
| 1194 | |
| 1195 | pmaddubsw m1, [r3 + 6 * 16] ; [28] |
| 1196 | pmulhrsw m1, m3 |
| 1197 | packuswb m6, m1 |
| 1198 | |
; outputs 6|7 -> m1
| 1199 | palignr m1, m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6] |
| 1200 | |
| 1201 | pmaddubsw m1, [r3] ; [22] |
| 1202 | pmulhrsw m1, m3 |
| 1203 | |
| 1204 | palignr m2, m0, 12 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7] |
| 1205 | |
| 1206 | pmaddubsw m2, [r3 - 6 * 16] ; [16] |
| 1207 | pmulhrsw m2, m3 |
| 1208 | packuswb m1, m2 |
; jump over whatever padding the ALIGN below emits
| 1209 | jmp .transpose8x8 |
| 1210 | |
| 1211 | ALIGN 16 |
; shared tail: see the contract in the header above
| 1212 | .transpose8x8: |
; ZF still comes from the caller's "cmp r4m, byte N"
| 1213 | jz .store |
| 1214 | |
| 1215 | ; transpose 8x8 |
| 1216 | punpckhbw m0, m4, m5 |
| 1217 | punpcklbw m4, m5 |
| 1218 | punpckhbw m2, m4, m0 |
| 1219 | punpcklbw m4, m0 |
| 1220 | |
| 1221 | punpckhbw m0, m6, m1 |
| 1222 | punpcklbw m6, m1 |
| 1223 | punpckhbw m1, m6, m0 |
| 1224 | punpcklbw m6, m0 |
| 1225 | |
| 1226 | punpckhdq m5, m4, m6 |
| 1227 | punpckldq m4, m6 |
| 1228 | punpckldq m6, m2, m1 |
| 1229 | punpckhdq m2, m1 |
; put the last row pair back in m1, where .store expects it
| 1230 | mova m1, m2 |
| 1231 | |
| 1232 | .store: |
| 1233 | lea r4, [r1 * 3] |
| 1234 | movh [r0], m4 |
| 1235 | movhps [r0 + r1], m4 |
| 1236 | movh [r0 + r1 * 2], m5 |
| 1237 | movhps [r0 + r4], m5 |
; r0 += 3 * stride, so the offsets r1, 2*r1, r4, 4*r1 below address rows 4..7
| 1238 | add r0, r4 |
| 1239 | movh [r0 + r1], m6 |
| 1240 | movhps [r0 + r1 * 2], m6 |
| 1241 | movh [r0 + r4], m1 |
| 1242 | movhps [r0 + r1 * 4], m1 |
| 1243 | RET |
| 1244 | |
; intra_pred_ang8_4: 8x8 angular prediction for dirMode 4 and, when
; dirMode == 32, the same computation fed from the r3mp reference pointer.
; Produces m4 = outputs 0|1 (weights 21|10), m5 = 2|3 (31|20),
; m6 = 4|5 (9|30), m1 = 6|7 (19|8), then jumps to the shared tail
; intra_pred_ang8_3.transpose8x8. That tail tests the ZF left by the cmp
; below, which no instruction in between modifies.
; NOTE(review): ang_table / pw_1024 are defined outside this chunk; the
; bracketed numbers are the ang_table row indices actually addressed.
; NOTE(review): the cglobal register counts match intra_pred_ang8_3, whose
; RET is reached via the jump; presumably they must stay identical - confirm.
| 1245 | cglobal intra_pred_ang8_4, 3,5,8 |
| 1246 | cmp r4m, byte 32 |
| 1247 | cmove r2, r3mp |
| 1248 | lea r3, [ang_table + 24 * 16] |
| 1249 | lea r4, [ang_table + 10 * 16] |
| 1250 | mova m3, [pw_1024] |
| 1251 | |
| 1252 | movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] |
| 1253 | palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] |
| 1254 | |
| 1255 | punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] |
| 1256 | punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] |
| 1257 | palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] |
; the window starting at sample 2 is used twice (weights 10 and 31)
| 1258 | mova m5, m1 |
| 1259 | |
| 1260 | pmaddubsw m4, m0, [r3 - 3 * 16] ; [21] |
| 1261 | pmulhrsw m4, m3 |
| 1262 | pmaddubsw m1, [r4] ; [10] |
| 1263 | pmulhrsw m1, m3 |
| 1264 | packuswb m4, m1 |
| 1265 | |
| 1266 | pmaddubsw m5, [r3 + 7 * 16] ; [31] |
| 1267 | pmulhrsw m5, m3 |
| 1268 | |
| 1269 | palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] |
| 1270 | |
| 1271 | pmaddubsw m6, [r3 - 4 * 16] ; [ 20] |
| 1272 | pmulhrsw m6, m3 |
| 1273 | packuswb m5, m6 |
| 1274 | |
| 1275 | palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] |
| 1276 | |
| 1277 | pmaddubsw m6, m1, [r4 - 1 * 16] ; [ 9] |
| 1278 | pmulhrsw m6, m3 |
| 1279 | |
| 1280 | pmaddubsw m1, [r3 + 6 * 16] ; [30] |
| 1281 | pmulhrsw m1, m3 |
| 1282 | packuswb m6, m1 |
| 1283 | |
| 1284 | palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5] |
| 1285 | |
| 1286 | pmaddubsw m1, [r3 - 5 * 16] ; [19] |
| 1287 | pmulhrsw m1, m3 |
| 1288 | |
| 1289 | palignr m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6] |
| 1290 | |
| 1291 | pmaddubsw m2, [r4 - 2 * 16] ; [8] |
| 1292 | pmulhrsw m2, m3 |
| 1293 | packuswb m1, m2 |
| 1294 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) |
| 1295 | |
; intra_pred_ang8_5: 8x8 angular prediction for dirMode 5 and, when
; dirMode == 31, the same computation fed from the r3mp reference pointer.
; Produces m4 = outputs 0|1 (weights 17|2), m5 = 2|3 (19|4),
; m6 = 4|5 (21|6), m1 = 6|7 (23|8), then jumps to the shared tail
; intra_pred_ang8_3.transpose8x8. That tail tests the ZF left by the cmp
; below, which no instruction in between modifies.
; NOTE(review): ang_table / pw_1024 are defined outside this chunk; the
; bracketed numbers are the ang_table row indices actually addressed.
; NOTE(review): the cglobal register counts match intra_pred_ang8_3, whose
; RET is reached via the jump; presumably they must stay identical - confirm.
| 1296 | cglobal intra_pred_ang8_5, 3,5,8 |
| 1297 | cmp r4m, byte 31 |
| 1298 | cmove r2, r3mp |
| 1299 | lea r3, [ang_table + 17 * 16] |
| 1300 | lea r4, [ang_table + 2 * 16] |
| 1301 | mova m3, [pw_1024] |
| 1302 | |
| 1303 | movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] |
| 1304 | palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] |
| 1305 | |
| 1306 | punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] |
| 1307 | punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] |
| 1308 | palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] |
; each shifted window is used twice here, hence the register copies
| 1309 | mova m5, m1 |
| 1310 | |
| 1311 | pmaddubsw m4, m0, [r3] ; [17] |
| 1312 | pmulhrsw m4, m3 |
| 1313 | pmaddubsw m1, [r4] ; [2] |
| 1314 | pmulhrsw m1, m3 |
| 1315 | packuswb m4, m1 |
| 1316 | |
| 1317 | pmaddubsw m5, [r3 + 2 * 16] ; [19] |
| 1318 | pmulhrsw m5, m3 |
| 1319 | |
| 1320 | palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] |
| 1321 | mova m1, m6 |
| 1322 | |
| 1323 | pmaddubsw m1, [r4 + 2 * 16] ; [4] |
| 1324 | pmulhrsw m1, m3 |
| 1325 | packuswb m5, m1 |
| 1326 | |
| 1327 | pmaddubsw m6, [r3 + 4 * 16] ; [21] |
| 1328 | pmulhrsw m6, m3 |
| 1329 | |
| 1330 | palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] |
| 1331 | |
| 1332 | mova m7, m1 |
| 1333 | pmaddubsw m7, [r4 + 4 * 16] ; [6] |
| 1334 | pmulhrsw m7, m3 |
| 1335 | packuswb m6, m7 |
| 1336 | |
| 1337 | pmaddubsw m1, [r3 + 6 * 16] ; [23] |
| 1338 | pmulhrsw m1, m3 |
| 1339 | |
| 1340 | palignr m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5] |
| 1341 | |
| 1342 | pmaddubsw m2, [r4 + 6 * 16] ; [8] |
| 1343 | pmulhrsw m2, m3 |
| 1344 | packuswb m1, m2 |
| 1345 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) |
| 1346 | |
; intra_pred_ang8_6: 8x8 angular prediction for dirMode 6 and, when
; dirMode == 30, the same computation fed from the r3mp reference pointer.
; Produces m4 = outputs 0|1 (weights 13|26), m5 = 2|3 (7|20),
; m6 = 4|5 (1|14), m1 = 6|7 (27|8), then jumps to the shared tail
; intra_pred_ang8_3.transpose8x8. That tail tests the ZF left by the cmp
; below, which no instruction in between modifies.
; Here m7 (not m3) holds the pw_1024 rounding constant; m3 is a temporary.
; NOTE(review): ang_table / pw_1024 are defined outside this chunk; the
; bracketed numbers are the ang_table row indices actually addressed.
; NOTE(review): the cglobal register counts match intra_pred_ang8_3, whose
; RET is reached via the jump; presumably they must stay identical - confirm.
| 1347 | cglobal intra_pred_ang8_6, 3,5,8 |
| 1348 | cmp r4m, byte 30 |
| 1349 | cmove r2, r3mp |
| 1350 | lea r3, [ang_table + 20 * 16] |
| 1351 | lea r4, [ang_table + 8 * 16] |
| 1352 | mova m7, [pw_1024] |
| 1353 | |
| 1354 | movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] |
| 1355 | palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] |
| 1356 | |
| 1357 | punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] |
| 1358 | punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] |
; outputs 0 and 1 both use the unshifted window
| 1359 | mova m1, m0 |
| 1360 | |
| 1361 | pmaddubsw m4, m0, [r3 - 7 * 16] ; [13] |
| 1362 | pmulhrsw m4, m7 |
| 1363 | pmaddubsw m1, [r3 + 6 * 16] ; [26] |
| 1364 | pmulhrsw m1, m7 |
| 1365 | packuswb m4, m1 |
| 1366 | |
| 1367 | palignr m6, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] |
| 1368 | |
| 1369 | pmaddubsw m5, m6, [r4 - 1 * 16] ; [7] |
| 1370 | pmulhrsw m5, m7 |
| 1371 | |
| 1372 | pmaddubsw m6, [r3] ; [20] |
| 1373 | pmulhrsw m6, m7 |
| 1374 | packuswb m5, m6 |
| 1375 | |
; the window starting at sample 3 feeds outputs 4, 5 and 6
| 1376 | palignr m1, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] |
| 1377 | |
| 1378 | pmaddubsw m6, m1, [r4 - 7 * 16] ; [1] |
| 1379 | pmulhrsw m6, m7 |
| 1380 | |
| 1381 | mova m3, m1 |
| 1382 | pmaddubsw m3, [r3 - 6 * 16] ; [14] |
| 1383 | pmulhrsw m3, m7 |
| 1384 | packuswb m6, m3 |
| 1385 | |
| 1386 | pmaddubsw m1, [r3 + 7 * 16] ; [27] |
| 1387 | pmulhrsw m1, m7 |
| 1388 | |
| 1389 | palignr m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] |
| 1390 | |
| 1391 | pmaddubsw m2, [r4] ; [8] |
| 1392 | pmulhrsw m2, m7 |
| 1393 | packuswb m1, m2 |
| 1394 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) |
| 1395 | |
; intra_pred_ang8_7: 8x8 angular prediction for dirMode 7 and, when
; dirMode == 29, the same computation fed from the r3mp reference pointer.
; Produces m4 = outputs 0|1 (weights 9|18), m5 = 2|3 (27|4),
; m6 = 4|5 (13|22), m1 = 6|7 (31|8), then jumps to the shared tail
; intra_pred_ang8_3.transpose8x8. That tail tests the ZF left by the cmp
; below, which no instruction in between modifies.
; m7 holds the pw_1024 rounding constant; m3 is a temporary.
; NOTE(review): ang_table / pw_1024 are defined outside this chunk; the
; bracketed numbers are the ang_table row indices actually addressed.
; NOTE(review): the cglobal register counts match intra_pred_ang8_3, whose
; RET is reached via the jump; presumably they must stay identical - confirm.
| 1396 | cglobal intra_pred_ang8_7, 3,5,8 |
| 1397 | cmp r4m, byte 29 |
| 1398 | cmove r2, r3mp |
| 1399 | lea r3, [ang_table + 24 * 16] |
| 1400 | lea r4, [ang_table + 6 * 16] |
| 1401 | mova m7, [pw_1024] |
| 1402 | |
| 1403 | movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] |
| 1404 | palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] |
| 1405 | |
| 1406 | punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] |
| 1407 | punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] |
| 1408 | |
; outputs 0..2 all use the unshifted window m0
| 1409 | pmaddubsw m4, m0, [r4 + 3 * 16] ; [9] |
| 1410 | pmulhrsw m4, m7 |
| 1411 | pmaddubsw m3, m0, [r3 - 6 * 16] ; [18] |
| 1412 | pmulhrsw m3, m7 |
| 1413 | packuswb m4, m3 |
| 1414 | |
| 1415 | pmaddubsw m5, m0, [r3 + 3 * 16] ; [27] |
| 1416 | pmulhrsw m5, m7 |
| 1417 | |
; outputs 3..6 use the window starting at sample 2
| 1418 | palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] |
| 1419 | |
| 1420 | pmaddubsw m6, m1, [r4 - 2 * 16] ; [4] |
| 1421 | pmulhrsw m6, m7 |
| 1422 | packuswb m5, m6 |
| 1423 | |
| 1424 | pmaddubsw m6, m1, [r4 + 7 * 16] ; [13] |
| 1425 | pmulhrsw m6, m7 |
| 1426 | |
| 1427 | mova m3, m1 |
| 1428 | pmaddubsw m3, [r3 - 2 * 16] ; [22] |
| 1429 | pmulhrsw m3, m7 |
| 1430 | packuswb m6, m3 |
| 1431 | |
| 1432 | pmaddubsw m1, [r3 + 7 * 16] ; [31] |
| 1433 | pmulhrsw m1, m7 |
| 1434 | |
| 1435 | palignr m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] |
| 1436 | |
| 1437 | pmaddubsw m2, [r4 + 2 * 16] ; [8] |
| 1438 | pmulhrsw m2, m7 |
| 1439 | packuswb m1, m2 |
| 1440 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) |
| 1441 | |
; intra_pred_ang8_8: 8x8 angular prediction for dirMode 8 and, when
; dirMode == 28, the same computation fed from the r3mp reference pointer.
; Produces m4 = outputs 0|1 (weights 5|10), m5 = 2|3 (15|20),
; m6 = 4|5 (25|30), m1 = 6|7 (3|8), then jumps to the shared tail
; intra_pred_ang8_3.transpose8x8. That tail tests the ZF left by the cmp
; below, which no instruction in between modifies.
; m7 holds the pw_1024 rounding constant; m3 is a temporary.
; NOTE(review): ang_table / pw_1024 are defined outside this chunk; the
; bracketed numbers are the ang_table row indices actually addressed.
; NOTE(review): the cglobal register counts match intra_pred_ang8_3, whose
; RET is reached via the jump; presumably they must stay identical - confirm.
| 1442 | cglobal intra_pred_ang8_8, 3,5,8 |
| 1443 | cmp r4m, byte 28 |
| 1444 | cmove r2, r3mp |
| 1445 | lea r3, [ang_table + 23 * 16] |
| 1446 | lea r4, [ang_table + 8 * 16] |
| 1447 | mova m7, [pw_1024] |
| 1448 | |
| 1449 | movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] |
| 1450 | palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] |
| 1451 | |
| 1452 | punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] |
| 1453 | punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] |
| 1454 | palignr m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] |
| 1455 | |
; outputs 0..5 use the unshifted window m0, outputs 6 and 7 use m2
| 1456 | pmaddubsw m4, m0, [r4 - 3 * 16] ; [5] |
| 1457 | pmulhrsw m4, m7 |
| 1458 | pmaddubsw m3, m0, [r4 + 2 * 16] ; [10] |
| 1459 | pmulhrsw m3, m7 |
| 1460 | packuswb m4, m3 |
| 1461 | |
| 1462 | pmaddubsw m5, m0, [r3 - 8 * 16] ; [15] |
| 1463 | pmulhrsw m5, m7 |
| 1464 | |
| 1465 | pmaddubsw m6, m0, [r3 - 3 * 16] ; [20] |
| 1466 | pmulhrsw m6, m7 |
| 1467 | packuswb m5, m6 |
| 1468 | |
| 1469 | pmaddubsw m6, m0, [r3 + 2 * 16] ; [25] |
| 1470 | pmulhrsw m6, m7 |
| 1471 | |
| 1472 | pmaddubsw m0, [r3 + 7 * 16] ; [30] |
| 1473 | pmulhrsw m0, m7 |
| 1474 | packuswb m6, m0 |
| 1475 | |
| 1476 | pmaddubsw m1, m2, [r4 - 5 * 16] ; [3] |
| 1477 | pmulhrsw m1, m7 |
| 1478 | |
| 1479 | pmaddubsw m2, [r4] ; [8] |
| 1480 | pmulhrsw m2, m7 |
| 1481 | packuswb m1, m2 |
| 1482 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) |
| 1483 | |
; intra_pred_ang8_9: 8x8 angular prediction for dirMode 9 and, when
; dirMode == 27, the same computation fed from the r3mp reference pointer.
; All eight outputs use the same unshifted sample window (m0) with the
; ang_table rows 2, 4, ..., 16:
;   m4 = outputs 0|1, m5 = 2|3, m6 = 4|5, m1 = 6|7
; then jumps to the shared tail intra_pred_ang8_3.transpose8x8, which tests
; the ZF left by the cmp below (not modified by any instruction in between).
; r4 is not used in this body; the shared tail overwrites it.
; NOTE(review): ang_table / pw_1024 are defined outside this chunk.
; NOTE(review): the cglobal register counts match intra_pred_ang8_3, whose
; RET is reached via the jump; presumably they must stay identical - confirm.
| 1484 | cglobal intra_pred_ang8_9, 3,5,8 |
| 1485 | cmp r4m, byte 27 |
| 1486 | cmove r2, r3mp |
| 1487 | lea r3, [ang_table + 10 * 16] |
| 1488 | mova m7, [pw_1024] |
| 1489 | |
| 1490 | movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] |
| 1491 | palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] |
| 1492 | |
| 1493 | punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] |
| 1494 | |
| 1495 | pmaddubsw m4, m0, [r3 - 8 * 16] ; [2] |
| 1496 | pmulhrsw m4, m7 |
| 1497 | pmaddubsw m3, m0, [r3 - 6 * 16] ; [4] |
| 1498 | pmulhrsw m3, m7 |
| 1499 | packuswb m4, m3 |
| 1500 | |
| 1501 | pmaddubsw m5, m0, [r3 - 4 * 16] ; [6] |
| 1502 | pmulhrsw m5, m7 |
| 1503 | |
| 1504 | pmaddubsw m6, m0, [r3 - 2 * 16] ; [8] |
| 1505 | pmulhrsw m6, m7 |
| 1506 | packuswb m5, m6 |
| 1507 | |
| 1508 | pmaddubsw m6, m0, [r3] ; [10] |
| 1509 | pmulhrsw m6, m7 |
| 1510 | |
| 1511 | pmaddubsw m2, m0, [r3 + 2 * 16] ; [12] |
| 1512 | pmulhrsw m2, m7 |
| 1513 | packuswb m6, m2 |
| 1514 | |
| 1515 | pmaddubsw m1, m0, [r3 + 4 * 16] ; [14] |
| 1516 | pmulhrsw m1, m7 |
| 1517 | |
| 1518 | pmaddubsw m0, [r3 + 6 * 16] ; [16] |
| 1519 | pmulhrsw m0, m7 |
| 1520 | packuswb m1, m0 |
| 1521 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) |
| 1522 | |
; intra_pred_ang8_10: 8x8 prediction that replicates one sample from [r2 + 1 ..]
; across each row, with an optional filter on row 0.
; r0 = dst, r1 = dst stride, r2 / r3 = the two reference arrays (refLeft /
; refAbove per the intraPredAng8 prototype comment), r5m = bFilter.
; Rows 1..7 are stored first; row 0 is stored last at .quit, either as the
; plain replicated sample or, when bFilter != 0, as
;   clip8( r2[1] + ((r3[1 + x] - r3[0]) >> 1) )   for x = 0..7.
; NOTE(review): pb_unpackbq is defined outside this chunk; presumably it is a
; pshufb mask that fills the low qword with byte 0 and the high qword with
; byte 1 - confirm.
| 1523 | cglobal intra_pred_ang8_10, 4,5,5 |
| 1524 | movh m0, [r2 + 1] |
| 1525 | mova m4, [pb_unpackbq] |
; each vector holds two rows (low qword | high qword): m0 = rows 0|1,
; m1 = rows 2|3, m2 = rows 4|5, m3 = rows 6|7
| 1526 | palignr m1, m0, 2 |
| 1527 | pshufb m1, m4 |
| 1528 | palignr m2, m0, 4 |
| 1529 | pshufb m2, m4 |
| 1530 | palignr m3, m0, 6 |
| 1531 | pshufb m3, m4 |
| 1532 | pshufb m0, m4 |
| 1533 | |
| 1534 | lea r4, [r1 * 3] |
| 1535 | movhps [r0 + r1], m0 |
| 1536 | movh [r0 + r1 * 2], m1 |
| 1537 | movhps [r0 + r4], m1 |
; r2 is no longer needed as a reference pointer; reuse it for rows 4..7
| 1538 | lea r2, [r0 + r1 * 4] |
| 1539 | movh [r2], m2 |
| 1540 | movhps [r2 + r1], m2 |
| 1541 | movh [r2 + r1 * 2], m3 |
| 1542 | movhps [r2 + r4], m3 |
| 1543 | |
| 1544 | ; filter |
| 1545 | cmp r5m, byte 0 |
| 1546 | jz .quit |
| 1547 | |
; widen row 0 to words, add half the gradient of the other reference array
| 1548 | pmovzxbw m0, m0 |
| 1549 | movu m1, [r3] |
; low 8 bytes of m2 = r3[1..8]; only that half is widened below
| 1550 | palignr m2, m1, 1 |
| 1551 | pshufb m1, m4 |
| 1552 | pmovzxbw m1, m1 |
| 1553 | pmovzxbw m2, m2 |
| 1554 | psubw m2, m1 |
| 1555 | psraw m2, 1 |
| 1556 | paddw m0, m2 |
; packuswb saturates the result to 0..255
| 1557 | packuswb m0, m0 |
| 1558 | |
| 1559 | .quit: |
| 1560 | movh [r0], m0 |
| 1561 | RET |
| 1562 | |
; intra_pred_ang8_26: 8x8 prediction that copies the eight bytes at
; [r3 + 1 ..] into every row, with an optional filter on column 0.
; r0 = dst, r1 = dst stride, r2 / r3 = the two reference arrays (refLeft /
; refAbove per the intraPredAng8 prototype comment), r5m = bFilter.
; When bFilter != 0 the first byte of each row y is rewritten as
;   clip8( r3[1] + ((r2[1 + y] - r2[0]) >> 1) )   for y = 0..7.
; NOTE(review): pb_unpackbq is defined outside this chunk; presumably it is a
; pshufb mask that fills the low qword with byte 0 - confirm.
| 1563 | cglobal intra_pred_ang8_26, 4,5,3 |
| 1564 | movh m0, [r3 + 1] |
| 1565 | |
| 1566 | lea r4, [r1 * 3] |
| 1567 | movh [r0], m0 |
| 1568 | movh [r0 + r1], m0 |
| 1569 | movh [r0 + r1 * 2], m0 |
| 1570 | movh [r0 + r4], m0 |
; r3 is no longer needed as a reference pointer; reuse it for rows 4..7
| 1571 | lea r3, [r0 + r1 * 4] |
| 1572 | movh [r3], m0 |
| 1573 | movh [r3 + r1], m0 |
| 1574 | movh [r3 + r1 * 2], m0 |
| 1575 | movh [r3 + r4], m0 |
| 1576 | |
| 1577 | ; filter |
| 1578 | cmp r5m, byte 0 |
| 1579 | jz .quit |
| 1580 | |
; replicate the first copied sample and widen it to words
| 1581 | pshufb m0, [pb_unpackbq] |
| 1582 | pmovzxbw m0, m0 |
| 1583 | movu m1, [r2] |
; low 8 bytes of m2 = r2[1..8]; only that half is widened below
| 1584 | palignr m2, m1, 1 |
| 1585 | pshufb m1, [pb_unpackbq] |
| 1586 | pmovzxbw m1, m1 |
| 1587 | pmovzxbw m2, m2 |
| 1588 | psubw m2, m1 |
| 1589 | psraw m2, 1 |
| 1590 | paddw m0, m2 |
; packuswb saturates the result to 0..255
| 1591 | packuswb m0, m0 |
; byte y of m0 goes to column 0 of row y
| 1592 | pextrb [r0], m0, 0 |
| 1593 | pextrb [r0 + r1], m0, 1 |
| 1594 | pextrb [r0 + r1 * 2], m0, 2 |
| 1595 | pextrb [r0 + r4], m0, 3 |
| 1596 | pextrb [r3], m0, 4 |
| 1597 | pextrb [r3 + r1], m0, 5 |
| 1598 | pextrb [r3 + r1 * 2], m0, 6 |
| 1599 | pextrb [r3 + r4], m0, 7 |
| 1600 | |
| 1601 | .quit: |
| 1602 | RET |
| 1603 | |
; intra_pred_ang8_11: 8x8 angular prediction for dirMode 11 and, when
; dirMode == 25, the same computation fed from the r3mp reference pointer.
; The reference is read from offset 0 (not 1 as in modes 3..9). All eight
; outputs use the same sample window (m0) with ang_table rows 30, 28, ..., 16:
;   m4 = outputs 0|1, m5 = 2|3, m6 = 4|5, m1 = 6|7
; then jumps to the shared tail intra_pred_ang8_3.transpose8x8, which tests
; the ZF left by the cmp below (not modified by any instruction in between).
; r4 is not used in this body; the shared tail overwrites it.
; NOTE(review): ang_table / pw_1024 are defined outside this chunk.
; NOTE(review): the cglobal register counts match intra_pred_ang8_3, whose
; RET is reached via the jump; presumably they must stay identical - confirm.
| 1604 | cglobal intra_pred_ang8_11, 3,5,8 |
| 1605 | cmp r4m, byte 25 |
| 1606 | cmove r2, r3mp |
| 1607 | lea r3, [ang_table + 23 * 16] |
| 1608 | mova m7, [pw_1024] |
| 1609 | |
| 1610 | movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] |
| 1611 | palignr m1, m0, 1 ; [x 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] |
| 1612 | |
| 1613 | punpcklbw m0, m1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] |
| 1614 | |
| 1615 | pmaddubsw m4, m0, [r3 + 7 * 16] ; [30] |
| 1616 | pmulhrsw m4, m7 |
| 1617 | pmaddubsw m3, m0, [r3 + 5 * 16] ; [28] |
| 1618 | pmulhrsw m3, m7 |
| 1619 | packuswb m4, m3 |
| 1620 | |
| 1621 | pmaddubsw m5, m0, [r3 + 3 * 16] ; [26] |
| 1622 | pmulhrsw m5, m7 |
| 1623 | |
| 1624 | pmaddubsw m6, m0, [r3 + 1 * 16] ; [24] |
| 1625 | pmulhrsw m6, m7 |
| 1626 | packuswb m5, m6 |
| 1627 | |
| 1628 | pmaddubsw m6, m0, [r3 - 1 * 16] ; [22] |
| 1629 | pmulhrsw m6, m7 |
| 1630 | |
| 1631 | pmaddubsw m2, m0, [r3 - 3 * 16] ; [20] |
| 1632 | pmulhrsw m2, m7 |
| 1633 | packuswb m6, m2 |
| 1634 | |
| 1635 | pmaddubsw m1, m0, [r3 - 5 * 16] ; [18] |
| 1636 | pmulhrsw m1, m7 |
| 1637 | |
| 1638 | pmaddubsw m0, [r3 - 7 * 16] ; [16] |
| 1639 | pmulhrsw m0, m7 |
| 1640 | packuswb m1, m0 |
| 1641 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) |
| 1642 | |
; intra_pred_ang8_12: 8x8 angular prediction for dirMode 12, or for
; dirMode 24 with the two reference pointers (r2, r3) exchanged.
; One sample from the other reference array, a = byte [r3 + 6], is prepended
; to the main reference.
;   m2 = pairs starting at sample 0, m0 = pairs starting at a
;   m4 = outputs 0|1 (weights 27|22), m5 = 2|3 (17|12),
;   m6 = 4|5 (7|2),                   m1 = 6|7 (29|24)
; then jumps to the shared tail intra_pred_ang8_3.transpose8x8, which tests
; the ZF left by the cmp below (mov, cmov, lea and the SSE ops do not write
; EFLAGS).
; NOTE(review): ang_table / pw_1024 are defined outside this chunk.
; NOTE(review): the GPR/XMM counts match intra_pred_ang8_3, whose RET is
; reached via the jump; presumably they must stay identical - confirm.
| 1643 | cglobal intra_pred_ang8_12, 4,5,8 |
| 1644 | cmp r4m, byte 24 |
; swap r2 and r3 when dirMode == 24 (r4 is the temporary)
| 1645 | mov r4, r2 |
| 1646 | cmovz r2, r3 |
| 1647 | cmovz r3, r4 |
| 1648 | |
| 1649 | lea r4, [ang_table + 22 * 16] |
| 1650 | mova m7, [pw_1024] |
| 1651 | |
| 1652 | movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] |
| 1653 | pslldq m0, m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a] |
| 1654 | pinsrb m0, [r3 + 6], 0 |
| 1655 | punpckhbw m2, m0, m1 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7] |
| 1656 | punpcklbw m0, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] |
| 1657 | palignr m2, m0, 2 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] |
| 1658 | |
| 1659 | pmaddubsw m4, m2, [r4 + 5 * 16] ; [27] |
| 1660 | pmulhrsw m4, m7 |
| 1661 | pmaddubsw m3, m2, [r4] ; [22] |
| 1662 | pmulhrsw m3, m7 |
| 1663 | packuswb m4, m3 |
| 1664 | |
; outputs 6|7 are computed early, while r4 still points at row 22
| 1665 | pmaddubsw m1, m0, [r4 + 7 * 16] ; [29] |
| 1666 | pmulhrsw m1, m7 |
| 1667 | |
| 1668 | pmaddubsw m0, [r4 + 2 * 16] ; [24] |
| 1669 | pmulhrsw m0, m7 |
| 1670 | packuswb m1, m0 |
| 1671 | |
| 1672 | pmaddubsw m5, m2, [r4 - 5 * 16] ; [17] |
| 1673 | pmulhrsw m5, m7 |
| 1674 | |
; re-base r4 for the remaining, lower-numbered weight rows
| 1675 | lea r4, [ang_table + 7 * 16] |
| 1676 | pmaddubsw m6, m2, [r4 + 5 * 16] ; [12] |
| 1677 | pmulhrsw m6, m7 |
| 1678 | packuswb m5, m6 |
| 1679 | |
| 1680 | pmaddubsw m6, m2, [r4] ; [7] |
| 1681 | pmulhrsw m6, m7 |
| 1682 | |
| 1683 | pmaddubsw m2, [r4 - 5 * 16] ; [2] |
| 1684 | pmulhrsw m2, m7 |
| 1685 | packuswb m6, m2 |
| 1686 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) |
| 1687 | |
; intra_pred_ang8_13: 8x8 angular prediction for dirMode 13, or for
; dirMode 23 with the two reference pointers (r2, r3) exchanged.
; Two samples from the other reference array are prepended to the main
; reference: a = byte [r3 + 4], b = byte [r3 + 7].
;   m5 = pairs starting at sample 0, m1 = pairs starting at a,
;   m0 = pairs starting at b
;   m4 = outputs 0|1 (weights 23|14), m5 = 2|3 (5|28),
;   m6 = 4|5 (19|10),                 m1 = 6|7 (1|24)
; then jumps to the shared tail intra_pred_ang8_3.transpose8x8, which tests
; the ZF left by the cmp below (mov, cmov, lea and the SSE ops do not write
; EFLAGS).
; NOTE(review): ang_table / pw_1024 are defined outside this chunk.
; NOTE(review): the GPR/XMM counts match intra_pred_ang8_3, whose RET is
; reached via the jump; presumably they must stay identical - confirm.
| 1688 | cglobal intra_pred_ang8_13, 4,5,8 |
| 1689 | cmp r4m, byte 23 |
; swap r2 and r3 when dirMode == 23 (r4 is the temporary)
| 1690 | mov r4, r2 |
| 1691 | cmovz r2, r3 |
| 1692 | cmovz r3, r4 |
| 1693 | |
| 1694 | lea r4, [ang_table + 24 * 16] |
| 1695 | mova m7, [pw_1024] |
| 1696 | |
| 1697 | movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] |
| 1698 | pslldq m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a] |
| 1699 | pinsrb m1, [r3 + 4], 0 |
| 1700 | pslldq m0, m1, 1 ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b] |
| 1701 | pinsrb m0, [r3 + 7], 0 |
| 1702 | punpckhbw m5, m0, m1 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6] |
| 1703 | punpcklbw m0, m1 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] |
| 1704 | palignr m1, m5, m0, 2 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] |
| 1705 | palignr m5, m0, 4 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] |
| 1706 | |
; the products are computed in table order rather than output order; the
; packuswb instructions below pair them up
| 1707 | pmaddubsw m4, m5, [r4 - 1 * 16] ; [23] |
| 1708 | pmulhrsw m4, m7 |
| 1709 | |
| 1710 | pmaddubsw m6, m1, [r4 + 4 * 16] ; [28] |
| 1711 | pmulhrsw m6, m7 |
| 1712 | |
| 1713 | pmaddubsw m0, [r4] ; [24] |
| 1714 | pmulhrsw m0, m7 |
| 1715 | |
; re-base r4 for the lower-numbered weight rows
| 1716 | lea r4, [ang_table + 13 * 16] |
| 1717 | pmaddubsw m3, m5, [r4 + 1 * 16] ; [14] |
| 1718 | pmulhrsw m3, m7 |
| 1719 | packuswb m4, m3 |
| 1720 | |
| 1721 | pmaddubsw m5, [r4 - 8 * 16] ; [5] |
| 1722 | pmulhrsw m5, m7 |
| 1723 | packuswb m5, m6 |
| 1724 | |
| 1725 | pmaddubsw m6, m1, [r4 + 6 * 16] ; [19] |
| 1726 | pmulhrsw m6, m7 |
| 1727 | |
| 1728 | pmaddubsw m2, m1, [r4 - 3 * 16] ; [10] |
| 1729 | pmulhrsw m2, m7 |
| 1730 | packuswb m6, m2 |
| 1731 | |
| 1732 | pmaddubsw m1, [r4 - 12 * 16] ; [1] |
| 1733 | pmulhrsw m1, m7 |
| 1734 | packuswb m1, m0 |
| 1735 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) |
| 1736 | |
; intra_pred_ang8_14: 8x8 angular prediction for dirMode 14, or for
; dirMode 22 with the two reference pointers (r2, r3) exchanged.
; Three samples from the other reference array are prepended to the main
; reference: a = byte [r3 + 2], b = byte [r3 + 5], c = byte [r3 + 7].
;   m2 = pairs starting at sample 0, m6 = pairs starting at a,
;   m1 = pairs starting at b,        m0 = pairs starting at c
;   m4 = outputs 0|1 (weights 19|6), m5 = 2|3 (25|12),
;   m6 = 4|5 (31|18),                m1 = 6|7 (5|24)
; then jumps to the shared tail intra_pred_ang8_3.transpose8x8, which tests
; the ZF left by the cmp below (mov, cmov, lea and the SSE ops do not write
; EFLAGS). m3 holds the pw_1024 rounding constant in this routine.
; NOTE(review): ang_table / pw_1024 are defined outside this chunk.
; NOTE(review): the GPR/XMM counts match intra_pred_ang8_3, whose RET is
; reached via the jump; presumably they must stay identical - confirm.
| 1737 | cglobal intra_pred_ang8_14, 4,5,8 |
| 1738 | cmp r4m, byte 22 |
; swap r2 and r3 when dirMode == 22 (r4 is the temporary)
| 1739 | mov r4, r2 |
| 1740 | cmovz r2, r3 |
| 1741 | cmovz r3, r4 |
| 1742 | |
| 1743 | lea r4, [ang_table + 24 * 16] |
| 1744 | mova m3, [pw_1024] |
| 1745 | |
; load from two bytes before r2; bytes 0 and 1 are placeholders replaced below
| 1746 | movu m1, [r2 - 2] ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b] |
| 1747 | pinsrb m1, [r3 + 2], 1 |
| 1748 | pinsrb m1, [r3 + 5], 0 |
| 1749 | pslldq m0, m1, 1 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c] |
| 1750 | pinsrb m0, [r3 + 7], 0 |
| 1751 | punpckhbw m2, m0, m1 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5] |
| 1752 | punpcklbw m0, m1 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] |
| 1753 | palignr m1, m2, m0, 2 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] |
| 1754 | palignr m6, m2, m0, 4 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] |
| 1755 | palignr m2, m0, 6 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] |
| 1756 | |
; the products are computed in table order rather than output order; the
; packuswb instructions below pair them up
| 1757 | pmaddubsw m4, m2, [r4 - 5 * 16] ; [19] |
| 1758 | pmulhrsw m4, m3 |
| 1759 | |
| 1760 | pmaddubsw m0, [r4] ; [24] |
| 1761 | pmulhrsw m0, m3 |
| 1762 | |
| 1763 | pmaddubsw m5, m6, [r4 + 1 * 16] ; [25] |
| 1764 | pmulhrsw m5, m3 |
| 1765 | |
; re-base r4 at row 12
| 1766 | lea r4, [ang_table + 12 * 16] |
| 1767 | pmaddubsw m6, [r4] ; [12] |
| 1768 | pmulhrsw m6, m3 |
| 1769 | packuswb m5, m6 |
| 1770 | |
| 1771 | pmaddubsw m6, m1, [r4 + 19 * 16] ; [31] |
| 1772 | pmulhrsw m6, m3 |
| 1773 | |
| 1774 | pmaddubsw m2, [r4 - 6 * 16] ; [6] |
| 1775 | pmulhrsw m2, m3 |
| 1776 | packuswb m4, m2 |
| 1777 | |
| 1778 | pmaddubsw m2, m1, [r4 + 6 * 16] ; [18] |
| 1779 | pmulhrsw m2, m3 |
| 1780 | packuswb m6, m2 |
| 1781 | |
| 1782 | pmaddubsw m1, [r4 - 7 * 16] ; [5] |
| 1783 | pmulhrsw m1, m3 |
| 1784 | packuswb m1, m0 |
| 1785 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) |
| 1786 | |
; intra_pred_ang8_15: 8x8 angular prediction for dirMode 15, or for
; dirMode 21 with the two reference pointers (r2, r3) exchanged.
; Four samples from the other reference array (a, b, c, d) are prepended to
; the main reference: a..c come from a pshufb of [r3] with c_mode16_15,
; d = byte [r3 + 8].
;   m4 = pairs starting at sample 0, m5 at a, m6 at b, m1 at c, m0 at d
;   m4 = outputs 0|1 (weights 15|30), m5 = 2|3 (13|28),
;   m6 = 4|5 (11|26),                 m1 = 6|7 (9|24)
; then jumps to the shared tail intra_pred_ang8_3.transpose8x8, which tests
; the ZF left by the cmp below (mov, cmov, lea and the SSE ops do not write
; EFLAGS). m3 holds the pw_1024 rounding constant in this routine.
; NOTE(review): ang_table, pw_1024 and c_mode16_15 are defined outside this
; chunk; presumably the mask moves the needed side samples into the top three
; bytes of m2 - confirm.
; NOTE(review): the GPR/XMM counts match intra_pred_ang8_3, whose RET is
; reached via the jump; presumably they must stay identical - confirm.
| 1787 | cglobal intra_pred_ang8_15, 4,5,8 |
| 1788 | cmp r4m, byte 21 |
; swap r2 and r3 when dirMode == 21 (r4 is the temporary)
| 1789 | mov r4, r2 |
| 1790 | cmovz r2, r3 |
| 1791 | cmovz r3, r4 |
| 1792 | |
| 1793 | lea r4, [ang_table + 23 * 16] |
| 1794 | mova m3, [pw_1024] |
| 1795 | |
| 1796 | movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] |
| 1797 | movu m2, [r3] |
| 1798 | pshufb m2, [c_mode16_15] |
; keep the top 3 bytes of the shuffled side samples below main samples 0..12
| 1799 | palignr m1, m2, 13 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c] |
| 1800 | pslldq m0, m1, 1 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d] |
| 1801 | pinsrb m0, [r3 + 8], 0 |
| 1802 | punpckhbw m4, m0, m1 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] |
| 1803 | punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0 0 a a b b c c d] |
| 1804 | palignr m1, m4, m0, 2 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] |
| 1805 | palignr m6, m4, m0, 4 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] |
| 1806 | palignr m5, m4, m0, 6 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] |
| 1807 | palignr m4, m0, 8 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] |
| 1808 | |
| 1809 | pmaddubsw m4, [r4 - 8 * 16] ; [15] |
| 1810 | pmulhrsw m4, m3 |
| 1811 | |
| 1812 | pmaddubsw m2, m5, [r4 + 7 * 16] ; [30] |
| 1813 | pmulhrsw m2, m3 |
| 1814 | packuswb m4, m2 |
| 1815 | |
| 1816 | pmaddubsw m5, [r4 - 10 * 16] ; [13] |
| 1817 | pmulhrsw m5, m3 |
| 1818 | |
| 1819 | pmaddubsw m2, m6, [r4 + 5 * 16] ; [28] |
| 1820 | pmulhrsw m2, m3 |
| 1821 | packuswb m5, m2 |
| 1822 | |
; outputs 5 and 7 are computed before r4 is re-based; they are packed later
| 1823 | pmaddubsw m2, m1, [r4 + 3 * 16] ; [26] |
| 1824 | pmulhrsw m2, m3 |
| 1825 | |
| 1826 | pmaddubsw m0, [r4 + 1 * 16] ; [24] |
| 1827 | pmulhrsw m0, m3 |
| 1828 | |
| 1829 | lea r4, [ang_table + 11 * 16] |
| 1830 | pmaddubsw m6, [r4] ; [11] |
| 1831 | pmulhrsw m6, m3 |
| 1832 | packuswb m6, m2 |
| 1833 | |
| 1834 | pmaddubsw m1, [r4 - 2 * 16] ; [9] |
| 1835 | pmulhrsw m1, m3 |
| 1836 | packuswb m1, m0 |
| 1837 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) |
| 1838 | |
; intra_pred_ang8_16: 8x8 angular prediction for dirMode 16, or for
; dirMode 20 with the two reference pointers (r2, r3) exchanged.
; Five samples from the other reference array (a..e) are prepended to the
; main reference: a..d come from a pshufb of [r3] with c_mode16_16,
; e = byte [r3 + 8].
;   m4 = pairs starting at sample 0, m5 at a, m2 at b, m6 at c, m1 at d,
;   m0 at e
;   m4 = outputs 0|1 (weights 11|22), m5 = 2|3 (1|12),
;   m6 = 4|5 (23|2),                  m1 = 6|7 (13|24)
; then jumps to the shared tail intra_pred_ang8_3.transpose8x8, which tests
; the ZF left by the cmp below (mov, cmov, lea and the SSE ops do not write
; EFLAGS). m7 holds the pw_1024 rounding constant; m3 is a temporary.
; NOTE(review): ang_table, pw_1024 and c_mode16_16 are defined outside this
; chunk; presumably the mask moves the needed side samples into the top four
; bytes of m2 - confirm.
; NOTE(review): the GPR/XMM counts match intra_pred_ang8_3, whose RET is
; reached via the jump; presumably they must stay identical - confirm.
| 1839 | cglobal intra_pred_ang8_16, 4,5,8 |
| 1840 | cmp r4m, byte 20 |
; swap r2 and r3 when dirMode == 20 (r4 is the temporary)
| 1841 | mov r4, r2 |
| 1842 | cmovz r2, r3 |
| 1843 | cmovz r3, r4 |
| 1844 | |
| 1845 | lea r4, [ang_table + 22 * 16] |
| 1846 | mova m7, [pw_1024] |
| 1847 | |
| 1848 | movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] |
| 1849 | movu m2, [r3] |
| 1850 | pshufb m2, [c_mode16_16] |
; keep the top 4 bytes of the shuffled side samples below main samples 0..11
| 1851 | palignr m1, m2, 12 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d] |
| 1852 | pslldq m0, m1, 1 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e] |
| 1853 | pinsrb m0, [r3 + 8], 0 |
| 1854 | punpckhbw m4, m0, m1 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] |
| 1855 | punpcklbw m0, m1 ; [3 2 2 1 1 0 0 a a b b c c d d e] |
| 1856 | palignr m1, m4, m0, 2 ; [4 3 3 2 2 1 1 0 0 a a b b c c d] |
| 1857 | palignr m6, m4, m0, 4 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] |
| 1858 | palignr m2, m4, m0, 6 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] |
| 1859 | palignr m5, m4, m0, 8 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] |
| 1860 | palignr m4, m0, 10 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] |
| 1861 | |
; outputs 1 and 7 are computed while r4 still points at row 22
| 1862 | pmaddubsw m3, m5, [r4] ; [22] |
| 1863 | pmulhrsw m3, m7 |
| 1864 | |
| 1865 | pmaddubsw m0, [r4 + 2 * 16] ; [24] |
| 1866 | pmulhrsw m0, m7 |
| 1867 | |
| 1868 | lea r4, [ang_table + 9 * 16] |
| 1869 | |
| 1870 | pmaddubsw m4, [r4 + 2 * 16] ; [11] |
| 1871 | pmulhrsw m4, m7 |
| 1872 | packuswb m4, m3 |
| 1873 | |
| 1874 | pmaddubsw m2, [r4 + 3 * 16] ; [12] |
| 1875 | pmulhrsw m2, m7 |
| 1876 | |
| 1877 | pmaddubsw m5, [r4 - 8 * 16] ; [1] |
| 1878 | pmulhrsw m5, m7 |
| 1879 | packuswb m5, m2 |
| 1880 | |
; the window starting at c feeds outputs 4 and 5
| 1881 | mova m2, m6 |
| 1882 | pmaddubsw m6, [r4 + 14 * 16] ; [23] |
| 1883 | pmulhrsw m6, m7 |
| 1884 | |
| 1885 | pmaddubsw m2, [r4 - 7 * 16] ; [2] |
| 1886 | pmulhrsw m2, m7 |
| 1887 | packuswb m6, m2 |
| 1888 | |
| 1889 | pmaddubsw m1, [r4 + 4 * 16] ; [13] |
| 1890 | pmulhrsw m1, m7 |
| 1891 | packuswb m1, m0 |
| 1892 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) |
| 1893 | |
; intra_pred_ang8_17: 8x8 angular prediction for dirMode 17, or for
; dirMode 19 with the two reference pointers (r2, r3) exchanged.
; Six samples from the other reference array (a..f) are prepended to the
; main reference: a..e come from a pshufb of [r3] with c_mode16_17,
; f = byte [r3 + 7].
; Every output uses its own sample window, taken from the m1:m0 pair at byte
; shifts 12, 10, 8, 6, 4, 4, 2 and 0 for outputs 0..7:
;   m4 = outputs 0|1 (weights 6|12), m5 = 2|3 (18|24),
;   m6 = 4|5 (30|4),                 m1 = 6|7 (10|16)
; then jumps to the shared tail intra_pred_ang8_3.transpose8x8, which tests
; the ZF left by the cmp below (mov, cmov, lea and the SSE ops do not write
; EFLAGS). m3 holds the pw_1024 rounding constant in this routine.
; NOTE(review): ang_table, pw_1024 and c_mode16_17 are defined outside this
; chunk; presumably the mask moves the needed side samples into the top five
; bytes of m1 - confirm.
; NOTE(review): the GPR/XMM counts match intra_pred_ang8_3, whose RET is
; reached via the jump; presumably they must stay identical - confirm.
| 1894 | cglobal intra_pred_ang8_17, 4,5,8 |
| 1895 | cmp r4m, byte 19 |
; swap r2 and r3 when dirMode == 19 (r4 is the temporary)
| 1896 | mov r4, r2 |
| 1897 | cmovz r2, r3 |
| 1898 | cmovz r3, r4 |
| 1899 | |
| 1900 | lea r4, [ang_table + 17 * 16] |
| 1901 | mova m3, [pw_1024] |
| 1902 | |
| 1903 | movu m2, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] |
| 1904 | movu m1, [r3] |
| 1905 | pshufb m1, [c_mode16_17] |
; keep the top 5 bytes of the shuffled side samples below main samples 0..10
| 1906 | palignr m2, m1, 11 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e] |
| 1907 | pslldq m0, m2, 1 ; [9 8 7 6 5 4 3 2 1 0 a b c d e f] |
| 1908 | pinsrb m0, [r3 + 7], 0 |
| 1909 | punpckhbw m1, m0, m2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] |
| 1910 | punpcklbw m0, m2 ; [2 1 1 0 0 a a b b c c d d e e f] |
| 1911 | |
| 1912 | palignr m5, m1, m0, 8 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] |
| 1913 | palignr m2, m1, m0, 10 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] |
| 1914 | palignr m4, m1, m0, 12 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] |
| 1915 | |
| 1916 | |
| 1917 | pmaddubsw m2, [r4 - 5 * 16] ; [12] |
| 1918 | pmulhrsw m2, m3 |
| 1919 | |
| 1920 | pmaddubsw m4, [r4 - 11 * 16] ; [6] |
| 1921 | pmulhrsw m4, m3 |
| 1922 | packuswb m4, m2 |
| 1923 | |
| 1924 | pmaddubsw m5, [r4 + 1 * 16] ; [18] |
| 1925 | pmulhrsw m5, m3 |
| 1926 | |
| 1927 | palignr m2, m1, m0, 6 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] |
| 1928 | pmaddubsw m2, [r4 + 7 * 16] ; [24] |
| 1929 | pmulhrsw m2, m3 |
| 1930 | packuswb m5, m2 |
| 1931 | |
; the window starting at d feeds outputs 4 and 5
| 1932 | palignr m6, m1, m0, 4 ; [4 3 3 2 2 1 1 0 0 a a b b c c d] |
| 1933 | mova m2, m6 |
| 1934 | pmaddubsw m6, [r4 + 13 * 16] ; [30] |
| 1935 | pmulhrsw m6, m3 |
| 1936 | |
| 1937 | pmaddubsw m2, [r4 - 13 * 16] ; [4] |
| 1938 | pmulhrsw m2, m3 |
| 1939 | packuswb m6, m2 |
| 1940 | |
| 1941 | palignr m1, m0, 2 ; [3 2 2 1 1 0 0 a a b b c c d d e] |
| 1942 | pmaddubsw m1, [r4 - 7 * 16] ; [10] |
| 1943 | pmulhrsw m1, m3 |
| 1944 | |
| 1945 | pmaddubsw m0, [r4 - 1 * 16] ; [16] |
| 1946 | pmulhrsw m0, m3 |
| 1947 | packuswb m1, m0 |
| 1948 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) |
| 1949 | |
;-----------------------------------------------------------------------------
; intra_pred_ang8_18: 8x8 block in which every row is the row below it
; shifted down by one byte of a single 16-byte vector.
;
; r0 = dst, r1 = dstStride, r2 / r3 = reference pointers.
; The vector is: low qword = bytes 7..0 of [r2] (reversed through pb_swap8),
; high qword = 8 bytes starting at [r3 + 1].  Row 7 is written first.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_18, 4,4,1
    movu        m0, [r2]
    pshufb      m0, [pb_swap8]
    movhps      m0, [r3 + 1]

    ; both pointers have been consumed; reuse the registers for addressing
    lea         r3, [r1 * 3]            ; r3 = 3 * stride
    lea         r2, [r0 + r1 * 4]       ; r2 -> row 4

    movh        [r2 + r3], m0           ; row 7
    psrldq      m0, 1
    movh        [r2 + r1 * 2], m0       ; row 6
    psrldq      m0, 1
    movh        [r2 + r1], m0           ; row 5
    psrldq      m0, 1
    movh        [r2], m0                ; row 4
    psrldq      m0, 1
    movh        [r0 + r3], m0           ; row 3
    psrldq      m0, 1
    movh        [r0 + r1 * 2], m0       ; row 2
    psrldq      m0, 1
    movh        [r0 + r1], m0           ; row 1
    psrldq      m0, 1
    movh        [r0], m0                ; row 0
    RET
| 1972 | |
| 1973 | |
| 1974 | ;----------------------------------------------------------------------------- |
| 1975 | ; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) |
| 1976 | ;----------------------------------------------------------------------------- |
;-----------------------------------------------------------------------------
; intra_pred_ang16_2: 16x16 block where row k is the 16 bytes found at
; [ref + 2 + k].  ref is r2 (refLeft), or the 4th argument (refAbove) when
; dirMode == 34.
;
; r0 = dst, r1 = dstStride.  m0:m1 hold the 32 bytes at [ref + 2]; each row is
; extracted with palignr.  The 14 rows after the first two are generated by
; %rep and expand to the same instruction sequence as a hand-unrolled version.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal intra_pred_ang16_2, 3,3,3
    cmp         r4m, byte 34
    cmove       r2, r3mp                ; dirMode 34: use the 4th argument instead

    movu        m0, [r2 + 2]
    movu        m1, [r2 + 18]

    movu        [r0], m0                ; row 0
    palignr     m2, m1, m0, 1
    movu        [r0 + r1], m2           ; row 1

%assign ANG16_2_SHIFT 2
%rep 7
    lea         r0, [r0 + r1 * 2]       ; advance two rows
    palignr     m2, m1, m0, ANG16_2_SHIFT
    movu        [r0], m2                ; even row
    palignr     m2, m1, m0, ANG16_2_SHIFT + 1
    movu        [r0 + r1], m2           ; odd row
  %assign ANG16_2_SHIFT ANG16_2_SHIFT + 2
%endrep
%undef ANG16_2_SHIFT
    RET
| 2022 | |
;-----------------------------------------------------------------------------
; TRANSPOSE_STORE_8x8 col, transpose, a, b, c, d
;
; Stores an 8x8 byte tile held in four xmm registers; used by the angular
; BLOCK_16x16 and BLOCK_32x32 predictors.  Each of a..d (%3..%6) carries two
; 8-byte vectors, one in the low qword and one in the high qword.
;
;   %1  index of the 8-byte column the tile is written to (only used when
;       %2 == 1; ignored otherwise)
;   %2  1: transpose the tile and store it at byte offset %1 * 8 of the
;          8 rows starting at r0
;       0: store the eight vectors as consecutive rows starting at r0,
;          then advance r0 by 8 rows
;
; Requires r0 = dst, r1 = stride, r5 = 3 * stride.  The transposing form also
; requires r6 = r0 + 4 * stride and leaves r0 / r6 unchanged.
; Clobbers in the transposing form: m0 and %3..%6, so m0 must not be passed
; as one of the operands.
;-----------------------------------------------------------------------------
%macro TRANSPOSE_STORE_8x8 6
  %if %2 == 1
    ; two byte-interleave passes over (a, b) ...
    punpckhbw   m0, %3, %4
    punpcklbw   %3, %4
    punpckhbw   %4, %3, m0
    punpcklbw   %3, m0

    ; ... and over (c, d); uses %6 so the macro does not rely on the caller
    ; happening to pass m1 as the last operand
    punpckhbw   m0, %5, %6
    punpcklbw   %5, %6
    punpckhbw   %6, %5, m0
    punpcklbw   %5, m0

    ; dword interleave of the two halves completes the transpose
    punpckhdq   m0, %3, %5
    punpckldq   %3, %5
    punpckldq   %5, %4, %6
    punpckhdq   %4, %6

    movh        [r0 +          %1 * 8], %3
    movhps      [r0 + r1     + %1 * 8], %3
    movh        [r0 + r1 * 2 + %1 * 8], m0
    movhps      [r0 + r5     + %1 * 8], m0
    movh        [r6 +          %1 * 8], %5
    movhps      [r6 + r1     + %1 * 8], %5
    movh        [r6 + r1 * 2 + %1 * 8], %4
    movhps      [r6 + r5     + %1 * 8], %4
  %else
    ; straight store of eight rows, r0 ends up 8 rows further down
    movh        [r0         ], %3
    movhps      [r0 + r1    ], %3
    movh        [r0 + r1 * 2], %4
    movhps      [r0 + r5    ], %4
    lea         r0, [r0 + r1 * 4]
    movh        [r0         ], %5
    movhps      [r0 + r1    ], %5
    movh        [r0 + r1 * 2], %6
    movhps      [r0 + r5    ], %6
    lea         r0, [r0 + r1 * 4]
  %endif
%endmacro
| 2063 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_3: 16x16 angular prediction from refLeft, stored transposed.
;
; r0 = dst, r1 = dstStride, r2 = refLeft
; r3 -> ang_table entry 16 ("[n]" below = entry n that is applied)
; r4 = pass counter (2), r5 = 3 * stride, r6 = r0 + 4 * stride
; m7 = pw_1024 (presumably words of 1024, making pmulhrsw a rounded >> 5)
;
; Each pass computes sixteen 8-byte vectors and writes them, through two
; transposed 8x8 stores, as the 16 columns of an 8-row band.  The second pass
; handles the next 8 rows with the reference advanced by 8 bytes.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_3, 3,7,8
    mova        m7, [pw_1024]
    lea         r3, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]
    lea         r6, [r0 + r1 * 4]
    mov         r4d, 2

.next_band:
    ; ---- vectors 0..7 -------------------------------------------------------
    movu        m0, [r2 + 1]
    palignr     m1, m0, 1               ; m0 shifted by one byte; the top byte is
                                        ; stale m1 data and is never consumed
    punpckhbw   m2, m0, m1              ; m2:m0 = interleaved neighbour pairs
    punpcklbw   m0, m1
    palignr     m1, m2, m0, 2

    pmaddubsw   m4, m0, [r3 + 10 * 16]  ; [26]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 + 4 * 16]       ; [20]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m5, m2, m0, 4
    pmaddubsw   m5, [r3 - 2 * 16]       ; [14]
    pmulhrsw    m5, m7
    palignr     m6, m2, m0, 6
    pmaddubsw   m6, [r3 - 8 * 16]       ; [8]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    palignr     m1, m2, m0, 8
    pmaddubsw   m6, m1, [r3 - 14 * 16]  ; [2]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 12 * 16]      ; [28]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m1, m2, m0, 10
    pmaddubsw   m1, [r3 + 6 * 16]       ; [22]
    pmulhrsw    m1, m7
    palignr     m2, m0, 12
    pmaddubsw   m2, [r3]                ; [16]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    ; ---- vectors 8..15 ------------------------------------------------------
    movu        m0, [r2 + 8]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    palignr     m5, m2, m0, 2

    pmaddubsw   m4, m0, [r3 - 6 * 16]   ; [10]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m5, [r3 - 12 * 16]  ; [4]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    pmaddubsw   m5, [r3 + 14 * 16]      ; [30]
    pmulhrsw    m5, m7
    palignr     m6, m2, m0, 4
    pmaddubsw   m6, [r3 + 8 * 16]       ; [24]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    palignr     m1, m2, m0, 6
    pmaddubsw   m6, m1, [r3 + 2 * 16]   ; [18]
    pmulhrsw    m6, m7
    palignr     m1, m2, m0, 8
    pmaddubsw   m1, [r3 - 4 * 16]       ; [12]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m1, m2, m0, 10
    pmaddubsw   m1, [r3 - 10 * 16]      ; [6]
    pmulhrsw    m1, m7
    packuswb    m1, m1
    movhps      m1, [r2 + 14]           ; [0]: last vector is a plain copy

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    ; step down 8 rows and forward 8 reference bytes
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         r4
    jnz         .next_band

    RET
| 2170 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_33: 16x16 angular prediction from refAbove, stored row-wise.
;
; r0 = dst, r1 = dstStride, r2 = refAbove (4th argument)
; r3 -> ang_table entry 16 ("[n]" below = entry n that is applied)
; r4 = pass counter (2), r5 = 3 * stride, r6 = original dst
; m7 = pw_1024 (presumably words of 1024, making pmulhrsw a rounded >> 5)
;
; Each pass writes sixteen rows of 8 bytes; the second pass handles the next
; 8 columns (dst + 8) with the reference advanced by 8 bytes.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_33, 3,7,8
    mov         r2, r3mp                ; fetch the 4th argument before r3 is reused
    mov         r6, r0
    lea         r5, [r1 * 3]
    lea         r3, [ang_table + 16 * 16]
    mova        m7, [pw_1024]
    mov         r4d, 2

.next_cols:
    ; ---- rows 0..7 ----------------------------------------------------------
    movu        m0, [r2 + 1]
    palignr     m1, m0, 1               ; m0 shifted by one byte; the top byte is
                                        ; stale m1 data and is never consumed
    punpckhbw   m2, m0, m1              ; m2:m0 = interleaved neighbour pairs
    punpcklbw   m0, m1
    palignr     m1, m2, m0, 2

    pmaddubsw   m4, m0, [r3 + 10 * 16]  ; [26]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 + 4 * 16]       ; [20]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m5, m2, m0, 4
    pmaddubsw   m5, [r3 - 2 * 16]       ; [14]
    pmulhrsw    m5, m7
    palignr     m6, m2, m0, 6
    pmaddubsw   m6, [r3 - 8 * 16]       ; [8]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    palignr     m1, m2, m0, 8
    pmaddubsw   m6, m1, [r3 - 14 * 16]  ; [2]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 12 * 16]      ; [28]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m1, m2, m0, 10
    pmaddubsw   m1, [r3 + 6 * 16]       ; [22]
    pmulhrsw    m1, m7
    palignr     m2, m0, 12
    pmaddubsw   m2, [r3]                ; [16]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1    ; also moves r0 down 8 rows

    ; ---- rows 8..15 ---------------------------------------------------------
    movu        m0, [r2 + 8]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    palignr     m5, m2, m0, 2

    pmaddubsw   m4, m0, [r3 - 6 * 16]   ; [10]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m5, [r3 - 12 * 16]  ; [4]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    pmaddubsw   m5, [r3 + 14 * 16]      ; [30]
    pmulhrsw    m5, m7
    palignr     m6, m2, m0, 4
    pmaddubsw   m6, [r3 + 8 * 16]       ; [24]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    palignr     m1, m2, m0, 6
    pmaddubsw   m6, m1, [r3 + 2 * 16]   ; [18]
    pmulhrsw    m6, m7
    palignr     m1, m2, m0, 8
    pmaddubsw   m1, [r3 - 4 * 16]       ; [12]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m1, m2, m0, 10
    pmaddubsw   m1, [r3 - 10 * 16]      ; [6]
    pmulhrsw    m1, m7
    packuswb    m1, m1

    movh        m2, [r2 + 14]           ; [0]: last row is a plain copy

    ; rows 8..15 are stored by hand because the last row lives in m2
    movh        [r0         ], m4
    movhps      [r0 + r1    ], m4
    movh        [r0 + r1 * 2], m5
    movhps      [r0 + r5    ], m5
    lea         r0, [r0 + r1 * 4]
    movh        [r0         ], m6
    movhps      [r0 + r1    ], m6
    movh        [r0 + r1 * 2], m1
    movh        [r0 + r5    ], m2

    ; back to the top row, 8 columns to the right
    lea         r0, [r6 + 8]
    add         r2, 8
    dec         r4
    jnz         .next_cols

    RET
| 2284 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_4: 16x16 angular prediction from refLeft, stored transposed.
;
; r0 = dst, r1 = dstStride, r2 = refLeft
; r3 -> ang_table entry 16 ("[n]" below = entry n that is applied)
; r4 = pass counter (2), r5 = 3 * stride, r6 = r0 + 4 * stride
; m7 = pw_1024 (presumably words of 1024, making pmulhrsw a rounded >> 5)
;
; Each pass computes sixteen 8-byte vectors and writes them, through two
; transposed 8x8 stores, as the 16 columns of an 8-row band.  The second pass
; handles the next 8 rows with the reference advanced by 8 bytes.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_4, 3,7,8
    mova        m7, [pw_1024]
    lea         r3, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]
    lea         r6, [r0 + r1 * 4]
    mov         r4d, 2

.next_band:
    ; ---- vectors 0..7 -------------------------------------------------------
    movu        m0, [r2 + 1]
    palignr     m1, m0, 1               ; m0 shifted by one byte; the top byte is
                                        ; stale m1 data and is never consumed
    punpckhbw   m2, m0, m1              ; m2:m0 = interleaved neighbour pairs
    punpcklbw   m0, m1
    palignr     m5, m2, m0, 2           ; window shared by vectors 1 and 2

    pmaddubsw   m4, m0, [r3 + 5 * 16]   ; [21]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m5, [r3 - 6 * 16]   ; [10]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    pmaddubsw   m5, [r3 + 15 * 16]      ; [31]
    pmulhrsw    m5, m7
    palignr     m6, m2, m0, 4
    pmaddubsw   m6, [r3 + 4 * 16]       ; [20]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    palignr     m1, m2, m0, 6
    pmaddubsw   m6, m1, [r3 - 7 * 16]   ; [9]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 14 * 16]      ; [30]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m1, m2, m0, 8
    pmaddubsw   m1, [r3 + 3 * 16]       ; [19]
    pmulhrsw    m1, m7
    palignr     m2, m0, 10              ; kept in m2 for vector 8 below
    pmaddubsw   m3, m2, [r3 - 8 * 16]   ; [8]
    pmulhrsw    m3, m7
    packuswb    m1, m3

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    ; ---- vectors 8..15 ------------------------------------------------------
    pmaddubsw   m4, m2, [r3 + 13 * 16]  ; [29]
    pmulhrsw    m4, m7

    movu        m0, [r2 + 6]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    palignr     m1, m2, m0, 2
    pmaddubsw   m1, [r3 + 2 * 16]       ; [18]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m5, m2, m0, 4           ; window shared by vectors 10 and 11
    pmaddubsw   m6, m5, [r3 + 12 * 16]  ; [28]
    pmulhrsw    m6, m7
    pmaddubsw   m5, [r3 - 9 * 16]       ; [7]
    pmulhrsw    m5, m7
    packuswb    m5, m6

    palignr     m6, m2, m0, 6
    pmaddubsw   m6, [r3 + 1 * 16]       ; [17]
    pmulhrsw    m6, m7

    palignr     m1, m2, m0, 8
    palignr     m2, m0, 10
    pmaddubsw   m3, m1, [r3 - 10 * 16]  ; [6]
    pmulhrsw    m3, m7
    packuswb    m6, m3

    pmaddubsw   m1, [r3 + 11 * 16]      ; [27]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3]                ; [16]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    ; step down 8 rows and forward 8 reference bytes
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         r4
    jnz         .next_band

    RET
| 2392 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_32: 16x16 angular prediction from refAbove, stored row-wise.
;
; r0 = dst, r1 = dstStride, r2 = refAbove (4th argument)
; r3 -> ang_table entry 16 ("[n]" below = entry n that is applied)
; r4 = pass counter (2), r5 = 3 * stride, r6 = original dst
; m7 = pw_1024 (presumably words of 1024, making pmulhrsw a rounded >> 5)
;
; Each pass writes sixteen rows of 8 bytes (each store call advances r0 by
; 8 rows); the second pass handles the next 8 columns (dst + 8) with the
; reference advanced by 8 bytes.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_32, 3,7,8
    mov         r2, r3mp                ; fetch the 4th argument before r3 is reused
    mov         r6, r0
    lea         r5, [r1 * 3]
    lea         r3, [ang_table + 16 * 16]
    mova        m7, [pw_1024]
    mov         r4d, 2

.next_cols:
    ; ---- rows 0..7 ----------------------------------------------------------
    movu        m0, [r2 + 1]
    palignr     m1, m0, 1               ; m0 shifted by one byte; the top byte is
                                        ; stale m1 data and is never consumed
    punpckhbw   m2, m0, m1              ; m2:m0 = interleaved neighbour pairs
    punpcklbw   m0, m1
    palignr     m5, m2, m0, 2           ; window shared by rows 1 and 2

    pmaddubsw   m4, m0, [r3 + 5 * 16]   ; [21]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m5, [r3 - 6 * 16]   ; [10]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    pmaddubsw   m5, [r3 + 15 * 16]      ; [31]
    pmulhrsw    m5, m7
    palignr     m6, m2, m0, 4
    pmaddubsw   m6, [r3 + 4 * 16]       ; [20]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    palignr     m1, m2, m0, 6
    pmaddubsw   m6, m1, [r3 - 7 * 16]   ; [9]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 14 * 16]      ; [30]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m1, m2, m0, 8
    pmaddubsw   m1, [r3 + 3 * 16]       ; [19]
    pmulhrsw    m1, m7
    palignr     m2, m0, 10              ; kept in m2 for row 8 below
    pmaddubsw   m3, m2, [r3 - 8 * 16]   ; [8]
    pmulhrsw    m3, m7
    packuswb    m1, m3

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    ; ---- rows 8..15 ---------------------------------------------------------
    pmaddubsw   m4, m2, [r3 + 13 * 16]  ; [29]
    pmulhrsw    m4, m7

    movu        m0, [r2 + 6]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    palignr     m1, m2, m0, 2
    pmaddubsw   m1, [r3 + 2 * 16]       ; [18]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m5, m2, m0, 4           ; window shared by rows 10 and 11
    pmaddubsw   m6, m5, [r3 + 12 * 16]  ; [28]
    pmulhrsw    m6, m7
    pmaddubsw   m5, [r3 - 9 * 16]       ; [7]
    pmulhrsw    m5, m7
    packuswb    m5, m6

    palignr     m6, m2, m0, 6
    pmaddubsw   m6, [r3 + 1 * 16]       ; [17]
    pmulhrsw    m6, m7

    palignr     m1, m2, m0, 8
    palignr     m2, m0, 10
    pmaddubsw   m3, m1, [r3 - 10 * 16]  ; [6]
    pmulhrsw    m3, m7
    packuswb    m6, m3

    pmaddubsw   m1, [r3 + 11 * 16]      ; [27]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3]                ; [16]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    ; back to the top row, 8 columns to the right
    lea         r0, [r6 + 8]
    add         r2, 8
    dec         r4
    jnz         .next_cols

    RET
| 2500 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_5: 16x16 angular prediction from refLeft, stored transposed.
;
; r0 = dst, r1 = dstStride, r2 = refLeft
; r3 -> ang_table entry 16 ("[n]" below = entry n that is applied)
; r4 = pass counter (2), r5 = 3 * stride, r6 = r0 + 4 * stride
; m7 = pw_1024 (presumably words of 1024, making pmulhrsw a rounded >> 5)
;
; m2:m3 hold the interleaved reference for a whole pass; all sixteen vectors
; are windows of that pair.  Each pass fills an 8-row band through two
; transposed 8x8 stores; the second pass handles the next 8 rows.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_5, 3,7,8
    mova        m7, [pw_1024]
    lea         r3, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]
    lea         r6, [r0 + r1 * 4]
    mov         r4d, 2

.next_band:
    movu        m3, [r2 + 1]            ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 2]            ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m3, m1              ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3, m1                  ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    ; ---- vectors 0..7 -------------------------------------------------------
    palignr     m5, m2, m3, 2

    pmaddubsw   m4, m3, [r3 + 1 * 16]   ; [17]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m5, [r3 - 14 * 16]  ; [2]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m3, 4

    pmaddubsw   m5, [r3 + 3 * 16]       ; [19]
    pmulhrsw    m5, m7
    pmaddubsw   m1, m6, [r3 - 12 * 16]  ; [4]
    pmulhrsw    m1, m7
    packuswb    m5, m1

    palignr     m1, m2, m3, 6

    pmaddubsw   m6, [r3 + 5 * 16]       ; [21]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 - 10 * 16]  ; [6]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m0, m2, m3, 8

    pmaddubsw   m1, [r3 + 7 * 16]       ; [23]
    pmulhrsw    m1, m7
    pmaddubsw   m0, [r3 - 8 * 16]       ; [8]
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    ; ---- vectors 8..15 ------------------------------------------------------
    palignr     m4, m2, m3, 8
    palignr     m5, m2, m3, 10

    pmaddubsw   m4, [r3 + 9 * 16]       ; [25]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m5, [r3 - 6 * 16]   ; [10]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m3, 12

    pmaddubsw   m5, [r3 + 11 * 16]      ; [27]
    pmulhrsw    m5, m7
    pmaddubsw   m1, m6, [r3 - 4 * 16]   ; [12]
    pmulhrsw    m1, m7
    packuswb    m5, m1

    palignr     m1, m2, m3, 14

    pmaddubsw   m6, [r3 + 13 * 16]      ; [29]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 - 2 * 16]   ; [14]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    pmaddubsw   m1, [r3 + 15 * 16]      ; [31]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3]                ; [16]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    ; step down 8 rows and forward 8 reference bytes
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         r4
    jnz         .next_band

    RET
| 2590 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_31: 16x16 angular prediction from refAbove, stored row-wise.
;
; r0 = dst, r1 = dstStride, r2 = refAbove (4th argument)
; r3 -> ang_table entry 16 ("[n]" below = entry n that is applied)
; r4 = pass counter (2), r5 = 3 * stride, r6 = original dst
; m7 = pw_1024 (presumably words of 1024, making pmulhrsw a rounded >> 5)
;
; m2:m3 hold the interleaved reference for a whole pass; all sixteen rows are
; windows of that pair.  Each pass writes 16 rows of 8 bytes (each store call
; advances r0 by 8 rows); the second pass handles the next 8 columns.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_31, 3,7,8
    mov         r2, r3mp                ; fetch the 4th argument before r3 is reused
    mov         r6, r0
    lea         r5, [r1 * 3]
    lea         r3, [ang_table + 16 * 16]
    mova        m7, [pw_1024]
    mov         r4d, 2

.next_cols:
    movu        m3, [r2 + 1]            ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 2]            ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m3, m1              ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3, m1                  ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    ; ---- rows 0..7 ----------------------------------------------------------
    palignr     m5, m2, m3, 2

    pmaddubsw   m4, m3, [r3 + 1 * 16]   ; [17]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m5, [r3 - 14 * 16]  ; [2]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m3, 4

    pmaddubsw   m5, [r3 + 3 * 16]       ; [19]
    pmulhrsw    m5, m7
    pmaddubsw   m1, m6, [r3 - 12 * 16]  ; [4]
    pmulhrsw    m1, m7
    packuswb    m5, m1

    palignr     m1, m2, m3, 6

    pmaddubsw   m6, [r3 + 5 * 16]       ; [21]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 - 10 * 16]  ; [6]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m0, m2, m3, 8

    pmaddubsw   m1, [r3 + 7 * 16]       ; [23]
    pmulhrsw    m1, m7
    pmaddubsw   m0, [r3 - 8 * 16]       ; [8]
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    ; ---- rows 8..15 ---------------------------------------------------------
    palignr     m4, m2, m3, 8
    palignr     m5, m2, m3, 10

    pmaddubsw   m4, [r3 + 9 * 16]       ; [25]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m5, [r3 - 6 * 16]   ; [10]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m3, 12

    pmaddubsw   m5, [r3 + 11 * 16]      ; [27]
    pmulhrsw    m5, m7
    pmaddubsw   m1, m6, [r3 - 4 * 16]   ; [12]
    pmulhrsw    m1, m7
    packuswb    m5, m1

    palignr     m1, m2, m3, 14

    pmaddubsw   m6, [r3 + 13 * 16]      ; [29]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 - 2 * 16]   ; [14]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    pmaddubsw   m1, [r3 + 15 * 16]      ; [31]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3]                ; [16]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    ; back to the top row, 8 columns to the right
    lea         r0, [r6 + 8]
    add         r2, 8
    dec         r4
    jnz         .next_cols

    RET
| 2679 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_6: 16x16 angular prediction from refLeft, stored transposed.
;
; r0 = dst, r1 = dstStride, r2 = refLeft
; r3 -> ang_table entry 16 ("[n]" below = entry n that is applied)
; r4 = pass counter (2), r5 = 3 * stride, r6 = r0 + 4 * stride
; m7 = pw_1024 (presumably words of 1024, making pmulhrsw a rounded >> 5)
;
; m2:m3 hold the interleaved reference for a whole pass ("x" marks the one
; undefined top byte, which no window below reaches).  Each pass fills an
; 8-row band through two transposed 8x8 stores; the second pass handles the
; next 8 rows.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_6, 3,7,8
    mova        m7, [pw_1024]
    lea         r3, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]
    lea         r6, [r0 + r1 * 4]
    mov         r4d, 2

.next_band:
    movu        m3, [r2 + 1]            ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1, m3, 1               ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m3, m1              ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3, m1                  ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    ; ---- vectors 0..7 -------------------------------------------------------
    pmaddubsw   m4, m3, [r3 - 3 * 16]   ; [13]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m3, [r3 + 10 * 16]  ; [26]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m3, 2

    pmaddubsw   m5, m6, [r3 - 9 * 16]   ; [7]
    pmulhrsw    m5, m7
    pmaddubsw   m6, [r3 + 4 * 16]       ; [20]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    palignr     m1, m2, m3, 4

    pmaddubsw   m6, m1, [r3 - 15 * 16]  ; [1]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 - 2 * 16]   ; [14]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m0, m2, m3, 6

    pmaddubsw   m1, [r3 + 11 * 16]      ; [27]
    pmulhrsw    m1, m7
    pmaddubsw   m0, [r3 - 8 * 16]       ; [8]
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    ; ---- vectors 8..15 ------------------------------------------------------
    palignr     m4, m2, m3, 6
    palignr     m6, m2, m3, 8

    pmaddubsw   m4, [r3 + 5 * 16]       ; [21]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m6, [r3 - 14 * 16]  ; [2]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    pmaddubsw   m5, m6, [r3 - 1 * 16]   ; [15]
    pmulhrsw    m5, m7
    pmaddubsw   m6, [r3 + 12 * 16]      ; [28]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    palignr     m0, m2, m3, 10

    pmaddubsw   m6, m0, [r3 - 7 * 16]   ; [9]
    pmulhrsw    m6, m7
    pmaddubsw   m0, [r3 + 6 * 16]       ; [22]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m2, m3, 12

    pmaddubsw   m1, m2, [r3 - 13 * 16]  ; [3]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3]                ; [16]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    ; step down 8 rows and forward 8 reference bytes
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         r4
    jnz         .next_band

    RET
| 2767 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_30: 16x16 angular prediction from refAbove, stored row-wise.
;
; r0 = dst, r1 = dstStride, r2 = refAbove (4th argument)
; r3 -> ang_table entry 16 ("[n]" below = entry n that is applied)
; r4 = pass counter (2), r5 = 3 * stride, r6 = original dst
; m7 = pw_1024 (presumably words of 1024, making pmulhrsw a rounded >> 5)
;
; m2:m3 hold the interleaved reference for a whole pass ("x" marks the one
; undefined top byte, which no window below reaches).  Each pass writes 16
; rows of 8 bytes (each store call advances r0 by 8 rows); the second pass
; handles the next 8 columns.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_30, 3,7,8
    mov         r2, r3mp                ; fetch the 4th argument before r3 is reused
    mov         r6, r0
    lea         r5, [r1 * 3]
    lea         r3, [ang_table + 16 * 16]
    mova        m7, [pw_1024]
    mov         r4d, 2

.next_cols:
    movu        m3, [r2 + 1]            ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1, m3, 1               ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m3, m1              ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3, m1                  ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    ; ---- rows 0..7 ----------------------------------------------------------
    pmaddubsw   m4, m3, [r3 - 3 * 16]   ; [13]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m3, [r3 + 10 * 16]  ; [26]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m3, 2

    pmaddubsw   m5, m6, [r3 - 9 * 16]   ; [7]
    pmulhrsw    m5, m7
    pmaddubsw   m6, [r3 + 4 * 16]       ; [20]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    palignr     m1, m2, m3, 4

    pmaddubsw   m6, m1, [r3 - 15 * 16]  ; [1]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 - 2 * 16]   ; [14]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m0, m2, m3, 6

    pmaddubsw   m1, [r3 + 11 * 16]      ; [27]
    pmulhrsw    m1, m7
    pmaddubsw   m0, [r3 - 8 * 16]       ; [8]
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    ; ---- rows 8..15 ---------------------------------------------------------
    palignr     m4, m2, m3, 6
    palignr     m6, m2, m3, 8

    pmaddubsw   m4, [r3 + 5 * 16]       ; [21]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m6, [r3 - 14 * 16]  ; [2]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    pmaddubsw   m5, m6, [r3 - 1 * 16]   ; [15]
    pmulhrsw    m5, m7
    pmaddubsw   m6, [r3 + 12 * 16]      ; [28]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    palignr     m0, m2, m3, 10

    pmaddubsw   m6, m0, [r3 - 7 * 16]   ; [9]
    pmulhrsw    m6, m7
    pmaddubsw   m0, [r3 + 6 * 16]       ; [22]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m2, m3, 12

    pmaddubsw   m1, m2, [r3 - 13 * 16]  ; [3]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3]                ; [16]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    ; back to the top row, 8 columns to the right
    lea         r0, [r6 + 8]
    add         r2, 8
    dec         r4
    jnz         .next_cols

    RET
| 2854 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_7: 16x16 angular prediction from refLeft, stored transposed.
;
; r0 = dst, r1 = dstStride, r2 = refLeft
; r3 -> ang_table entry 16 ("[n]" below = entry n that is applied)
; r4 = pass counter (2), r5 = 3 * stride, r6 = r0 + 4 * stride
; m7 = pw_1024 (presumably words of 1024, making pmulhrsw a rounded >> 5)
;
; m2:m3 hold the interleaved reference for a whole pass ("x" marks the one
; undefined top byte, which no window below reaches).  Each pass fills an
; 8-row band through two transposed 8x8 stores; the second pass handles the
; next 8 rows.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_7, 3,7,8
    mova        m7, [pw_1024]
    lea         r3, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]
    lea         r6, [r0 + r1 * 4]
    mov         r4d, 2

.next_band:
    movu        m3, [r2 + 1]            ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1, m3, 1               ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m3, m1              ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3, m1                  ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    ; ---- vectors 0..7 -------------------------------------------------------
    pmaddubsw   m4, m3, [r3 - 7 * 16]   ; [9]
    pmulhrsw    m4, m7
    pmaddubsw   m0, m3, [r3 + 2 * 16]   ; [18]
    pmulhrsw    m0, m7
    packuswb    m4, m0

    palignr     m1, m2, m3, 2

    pmaddubsw   m5, m3, [r3 + 11 * 16]  ; [27]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m1, [r3 - 12 * 16]  ; [4]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m1, [r3 - 3 * 16]   ; [13]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 + 6 * 16]   ; [22]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m0, m2, m3, 4

    pmaddubsw   m1, [r3 + 15 * 16]      ; [31]
    pmulhrsw    m1, m7
    pmaddubsw   m0, [r3 - 8 * 16]       ; [8]
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    ; ---- vectors 8..15 ------------------------------------------------------
    palignr     m1, m2, m3, 4

    pmaddubsw   m4, m1, [r3 + 1 * 16]   ; [17]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 + 10 * 16]      ; [26]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m0, m2, m3, 6

    pmaddubsw   m5, m0, [r3 - 13 * 16]  ; [3]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m0, [r3 - 4 * 16]   ; [12]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m0, [r3 + 5 * 16]   ; [21]
    pmulhrsw    m6, m7
    pmaddubsw   m0, [r3 + 14 * 16]      ; [30]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m2, m3, 8

    pmaddubsw   m1, m2, [r3 - 9 * 16]   ; [7]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3]                ; [16]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    ; step down 8 rows and forward 8 reference bytes
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         r4
    jnz         .next_band

    RET
| 2939 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_29: 16x16 angular prediction from refAbove, stored row-wise.
;
; r0 = dst, r1 = dstStride, r2 = refAbove (4th argument)
; r3 -> ang_table entry 16 ("[n]" below = entry n that is applied)
; r4 = pass counter (2), r5 = 3 * stride, r6 = original dst
; m7 = pw_1024 (presumably words of 1024, making pmulhrsw a rounded >> 5)
;
; m2:m3 hold the interleaved reference for a whole pass ("x" marks the one
; undefined top byte, which no window below reaches).  Each pass writes 16
; rows of 8 bytes (each store call advances r0 by 8 rows); the second pass
; handles the next 8 columns.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_29, 3,7,8
    mov         r2, r3mp                ; fetch the 4th argument before r3 is reused
    mov         r6, r0
    lea         r5, [r1 * 3]
    lea         r3, [ang_table + 16 * 16]
    mova        m7, [pw_1024]
    mov         r4d, 2

.next_cols:
    movu        m3, [r2 + 1]            ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1, m3, 1               ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m3, m1              ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3, m1                  ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    ; ---- rows 0..7 ----------------------------------------------------------
    pmaddubsw   m4, m3, [r3 - 7 * 16]   ; [9]
    pmulhrsw    m4, m7
    pmaddubsw   m0, m3, [r3 + 2 * 16]   ; [18]
    pmulhrsw    m0, m7
    packuswb    m4, m0

    palignr     m1, m2, m3, 2

    pmaddubsw   m5, m3, [r3 + 11 * 16]  ; [27]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m1, [r3 - 12 * 16]  ; [4]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m1, [r3 - 3 * 16]   ; [13]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 + 6 * 16]   ; [22]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m0, m2, m3, 4

    pmaddubsw   m1, [r3 + 15 * 16]      ; [31]
    pmulhrsw    m1, m7
    pmaddubsw   m0, [r3 - 8 * 16]       ; [8]
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    ; ---- rows 8..15 ---------------------------------------------------------
    palignr     m1, m2, m3, 4

    pmaddubsw   m4, m1, [r3 + 1 * 16]   ; [17]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 + 10 * 16]      ; [26]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m0, m2, m3, 6

    pmaddubsw   m5, m0, [r3 - 13 * 16]  ; [3]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m0, [r3 - 4 * 16]   ; [12]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m0, [r3 + 5 * 16]   ; [21]
    pmulhrsw    m6, m7
    pmaddubsw   m0, [r3 + 14 * 16]      ; [30]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m2, m3, 8

    pmaddubsw   m1, m2, [r3 - 9 * 16]   ; [7]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3]                ; [16]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    ; back to the top row, 8 columns to the right
    lea         r0, [r6 + 8]
    add         r2, 8
    dec         r4
    jnz         .next_cols

    RET
| 3023 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_8 (SSE4)
;
; Blends adjacent reference bytes read from [r2 + 1] with weight rows taken
; from ang_table and hands the packed results to TRANSPOSE_STORE_8x8 with its
; second argument set to 1.
; NOTE(review): the name suggests the 16x16 angular mode-8 predictor, and the
; store macro (defined outside this excerpt) presumably transposes each 8x8
; tile before writing it -- confirm against the macro.
;
; Arguments (three are loaded by cglobal):
;   r0 = destination pointer (presumably written by the store macro)
;   r1 = destination stride
;   r2 = reference bytes
; Working registers:
;   r3 = ang_table + 16 * 16, i.e. [r3 + k * 16] is table row 16 + k
;   r4 = pass counter; two passes, r2 advances by 8 bytes per pass
;   r5 = 3 * stride, r6 = r0 + 4 * stride (presumably consumed by the macro)
;   m7 = pw_1024, the pmulhrsw multiplier applied after every pmaddubsw
; The bracketed number after each pmaddubsw is the ang_table row it reads.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_8, 3,7,8

    mova        m7, [pw_1024]
    lea         r3, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]                ; 3 * stride
    lea         r6, [r0 + r1 * 4]           ; dst + 4 * stride
    mov         r4d, 2                      ; number of passes

.loop:
    movu        m1, [r2 + 1]                ; b1 .. b16
    palignr     m3, m1, 1                   ; b2 .. b16, top byte is don't-care
    punpckhbw   m0, m1, m3                  ; (b9,b10) .. (b16,x)
    punpcklbw   m1, m3                      ; (b1,b2) .. (b8,b9)

    ; pairs starting at (b1,b2)
    pmaddubsw   m4, m1, [r3 - 11 * 16]      ; [5]
    pmaddubsw   m2, m1, [r3 -  6 * 16]      ; [10]
    pmulhrsw    m4, m7
    pmulhrsw    m2, m7
    packuswb    m4, m2

    pmaddubsw   m5, m1, [r3 -  1 * 16]      ; [15]
    pmaddubsw   m6, m1, [r3 +  4 * 16]      ; [20]
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m1, [r3 +  9 * 16]      ; [25]
    pmaddubsw   m2, m1, [r3 + 14 * 16]      ; [30]
    pmulhrsw    m6, m7
    pmulhrsw    m2, m7
    packuswb    m6, m2

    ; derive the shifted pair vectors before m0/m1 are reused as scratch
    palignr     m2, m0, m1, 2               ; (b2,b3) .. (b9,b10)
    palignr     m3, m0, m1, 4               ; (b3,b4) .. (b10,b11)

    pmaddubsw   m1, m2, [r3 - 13 * 16]      ; [3]
    pmaddubsw   m0, m2, [r3 -  8 * 16]      ; [8]
    pmulhrsw    m1, m7
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    ; pairs starting at (b2,b3)
    pmaddubsw   m4, m2, [r3 -  3 * 16]      ; [13]
    pmaddubsw   m5, m2, [r3 +  2 * 16]      ; [18]
    pmulhrsw    m4, m7
    pmulhrsw    m5, m7
    packuswb    m4, m5

    pmaddubsw   m5, m2, [r3 +  7 * 16]      ; [23]
    pmaddubsw   m2, [r3 + 12 * 16]          ; [28]
    pmulhrsw    m5, m7
    pmulhrsw    m2, m7
    packuswb    m5, m2

    ; pairs starting at (b3,b4)
    pmaddubsw   m6, m3, [r3 - 15 * 16]      ; [1]
    pmaddubsw   m1, m3, [r3 - 10 * 16]      ; [6]
    pmulhrsw    m6, m7
    pmulhrsw    m1, m7
    packuswb    m6, m1

    pmaddubsw   m1, m3, [r3 -  5 * 16]      ; [11]
    pmaddubsw   m3, [r3]                    ; [16]
    pmulhrsw    m1, m7
    pmulhrsw    m3, m7
    packuswb    m1, m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    ; next pass: move both row anchors down 8 rows, reference on by 8 bytes
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         r4
    jnz         .loop

    RET
| 3101 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_28 (SSE4)
;
; Same arithmetic as intra_pred_ang16_8 (table rows 5,10,...,16 applied to
; adjacent byte pairs), but the reference bytes come from the fourth argument
; and results go through TRANSPOSE_STORE_8x8 with its second argument 0.
; NOTE(review): the macro is defined outside this excerpt; in this mode it
; presumably stores eight rows untransposed and advances r0 -- confirm.
;
; Arguments (three are loaded by cglobal, the fourth is fetched via r3mp):
;   r0 = destination pointer, r1 = destination stride
; Working registers:
;   r2 = reference bytes (fourth argument), advanced 8 bytes per pass
;   r3 = ang_table + 16 * 16, i.e. [r3 + k * 16] is table row 16 + k
;   r4 = pass counter (2)
;   r5 = 3 * stride
;   r6 = initial r0, used to restart at r6 + 8 for the second pass
;   m7 = pw_1024, the pmulhrsw multiplier applied after every pmaddubsw
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_28, 3,7,8
    mov         r2, r3mp                    ; reference = 4th argument
    mov         r6, r0                      ; remember block origin
    mova        m7, [pw_1024]
    lea         r3, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]                ; 3 * stride
    mov         r4d, 2                      ; number of passes

.loop:
    movu        m1, [r2 + 1]                ; b1 .. b16
    palignr     m3, m1, 1                   ; b2 .. b16, top byte is don't-care
    punpckhbw   m0, m1, m3                  ; (b9,b10) .. (b16,x)
    punpcklbw   m1, m3                      ; (b1,b2) .. (b8,b9)

    ; pairs starting at (b1,b2)
    pmaddubsw   m4, m1, [r3 - 11 * 16]      ; [5]
    pmaddubsw   m2, m1, [r3 -  6 * 16]      ; [10]
    pmulhrsw    m4, m7
    pmulhrsw    m2, m7
    packuswb    m4, m2

    pmaddubsw   m5, m1, [r3 -  1 * 16]      ; [15]
    pmaddubsw   m6, m1, [r3 +  4 * 16]      ; [20]
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m1, [r3 +  9 * 16]      ; [25]
    pmaddubsw   m2, m1, [r3 + 14 * 16]      ; [30]
    pmulhrsw    m6, m7
    pmulhrsw    m2, m7
    packuswb    m6, m2

    ; derive the shifted pair vectors before m0/m1 are reused as scratch
    palignr     m2, m0, m1, 2               ; (b2,b3) .. (b9,b10)
    palignr     m3, m0, m1, 4               ; (b3,b4) .. (b10,b11)

    pmaddubsw   m1, m2, [r3 - 13 * 16]      ; [3]
    pmaddubsw   m0, m2, [r3 -  8 * 16]      ; [8]
    pmulhrsw    m1, m7
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    ; pairs starting at (b2,b3)
    pmaddubsw   m4, m2, [r3 -  3 * 16]      ; [13]
    pmaddubsw   m5, m2, [r3 +  2 * 16]      ; [18]
    pmulhrsw    m4, m7
    pmulhrsw    m5, m7
    packuswb    m4, m5

    pmaddubsw   m5, m2, [r3 +  7 * 16]      ; [23]
    pmaddubsw   m2, [r3 + 12 * 16]          ; [28]
    pmulhrsw    m5, m7
    pmulhrsw    m2, m7
    packuswb    m5, m2

    ; pairs starting at (b3,b4)
    pmaddubsw   m6, m3, [r3 - 15 * 16]      ; [1]
    pmaddubsw   m1, m3, [r3 - 10 * 16]      ; [6]
    pmulhrsw    m6, m7
    pmulhrsw    m1, m7
    packuswb    m6, m1

    pmaddubsw   m1, m3, [r3 -  5 * 16]      ; [11]
    pmaddubsw   m3, [r3]                    ; [16]
    pmulhrsw    m1, m7
    pmulhrsw    m3, m7
    packuswb    m1, m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    ; second pass writes 8 bytes to the right of the block origin
    lea         r0, [r6 + 8]
    add         r2, 8
    dec         r4
    jnz         .loop

    RET
| 3178 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_9 (SSE4)
;
; Blends the byte pairs (b1,b2)..(b8,b9) read from [r2 + 1] with the even
; ang_table rows 2,4,...,30; the sixteenth result of each pass is the
; unblended byte run b2..b9. Output goes through TRANSPOSE_STORE_8x8 with
; its second argument set to 1.
; NOTE(review): the name suggests the 16x16 angular mode-9 predictor, and the
; store macro (defined outside this excerpt) presumably transposes each 8x8
; tile -- confirm against the macro.
;
; Arguments (three are loaded by cglobal):
;   r0 = destination pointer (presumably written by the store macro)
;   r1 = destination stride
;   r2 = reference bytes, advanced 8 bytes per pass
; Working registers:
;   r3 = ang_table + 16 * 16, i.e. [r3 + k * 16] is table row 16 + k
;   r4 = pass counter (2)
;   r5 = 3 * stride, r6 = r0 + 4 * stride (presumably consumed by the macro)
;   m2 = byte pairs, m3 = b2.. (kept live across the first store)
;   m7 = pw_1024, the pmulhrsw multiplier applied after every pmaddubsw
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_9, 3,7,8

    mova        m7, [pw_1024]
    lea         r3, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]                ; 3 * stride
    lea         r6, [r0 + r1 * 4]           ; dst + 4 * stride
    mov         r4d, 2                      ; number of passes

.loop:
    movu        m2, [r2 + 1]                ; b1 .. b16
    palignr     m3, m2, 1                   ; b2 .. b16, top byte is don't-care
    punpcklbw   m2, m3                      ; (b1,b2) .. (b8,b9)

    pmaddubsw   m4, m2, [r3 - 14 * 16]      ; [2]
    pmaddubsw   m0, m2, [r3 - 12 * 16]      ; [4]
    pmulhrsw    m4, m7
    pmulhrsw    m0, m7
    packuswb    m4, m0

    pmaddubsw   m5, m2, [r3 - 10 * 16]      ; [6]
    pmaddubsw   m6, m2, [r3 -  8 * 16]      ; [8]
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m2, [r3 -  6 * 16]      ; [10]
    pmaddubsw   m0, m2, [r3 -  4 * 16]      ; [12]
    pmulhrsw    m6, m7
    pmulhrsw    m0, m7
    packuswb    m6, m0

    pmaddubsw   m1, m2, [r3 -  2 * 16]      ; [14]
    pmaddubsw   m0, m2, [r3]                ; [16]
    pmulhrsw    m1, m7
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4, m2, [r3 +  2 * 16]      ; [18]
    pmaddubsw   m5, m2, [r3 +  4 * 16]      ; [20]
    pmulhrsw    m4, m7
    pmulhrsw    m5, m7
    packuswb    m4, m5

    pmaddubsw   m5, m2, [r3 +  6 * 16]      ; [22]
    pmaddubsw   m6, m2, [r3 +  8 * 16]      ; [24]
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m2, [r3 + 10 * 16]      ; [26]
    pmaddubsw   m1, m2, [r3 + 12 * 16]      ; [28]
    pmulhrsw    m6, m7
    pmulhrsw    m1, m7
    packuswb    m6, m1

    pmaddubsw   m1, m2, [r3 + 14 * 16]      ; [30]
    pmulhrsw    m1, m7
    packuswb    m1, m1
    punpcklqdq  m1, m3                      ; [0]: upper half = b2 .. b9 as-is

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    ; next pass: move both row anchors down 8 rows, reference on by 8 bytes
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         r4
    jnz         .loop

    RET
| 3252 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_27 (SSE4)
;
; Blends the byte pairs (b1,b2)..(b8,b9) of the reference passed as fourth
; argument with the even ang_table rows 2,4,...,30 and stores 8-byte rows;
; the last row of each pass is the unblended byte run b2..b9.
; The first eight rows go through TRANSPOSE_STORE_8x8 (second argument 0),
; the remaining eight are stored explicitly below.
; NOTE(review): the macro is defined outside this excerpt; the explicit
; stores that follow it assume it leaves r0 pointing at the ninth row --
; confirm against the macro.
;
; Arguments (three are loaded by cglobal, the fourth is fetched via r3mp):
;   r0 = destination pointer, r1 = destination stride
; Working registers:
;   r2 = reference bytes (fourth argument), advanced 8 bytes per pass
;   r3 = ang_table + 16 * 16, i.e. [r3 + k * 16] is table row 16 + k
;   r4 = pass counter (2)
;   r5 = 3 * stride
;   r6 = initial r0, used to restart at r6 + 8 for the second pass
;   m3 = byte pairs, m2 = b2.. (stored verbatim as the last row)
;   m7 = pw_1024, the pmulhrsw multiplier applied after every pmaddubsw
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_27, 3,7,8
    mov         r2, r3mp                    ; reference = 4th argument
    mov         r6, r0                      ; remember block origin
    mova        m7, [pw_1024]
    lea         r3, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]                ; 3 * stride
    mov         r4d, 2                      ; number of passes

.loop:
    movu        m3, [r2 + 1]                ; b1 .. b16
    palignr     m2, m3, 1                   ; b2 .. b16, top byte is don't-care
    punpcklbw   m3, m2                      ; (b1,b2) .. (b8,b9)

    pmaddubsw   m4, m3, [r3 - 14 * 16]      ; [2]
    pmaddubsw   m0, m3, [r3 - 12 * 16]      ; [4]
    pmulhrsw    m4, m7
    pmulhrsw    m0, m7
    packuswb    m4, m0

    pmaddubsw   m5, m3, [r3 - 10 * 16]      ; [6]
    pmaddubsw   m6, m3, [r3 -  8 * 16]      ; [8]
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m3, [r3 -  6 * 16]      ; [10]
    pmaddubsw   m0, m3, [r3 -  4 * 16]      ; [12]
    pmulhrsw    m6, m7
    pmulhrsw    m0, m7
    packuswb    m6, m0

    pmaddubsw   m1, m3, [r3 -  2 * 16]      ; [14]
    pmaddubsw   m0, m3, [r3]                ; [16]
    pmulhrsw    m1, m7
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4, m3, [r3 +  2 * 16]      ; [18]
    pmaddubsw   m5, m3, [r3 +  4 * 16]      ; [20]
    pmulhrsw    m4, m7
    pmulhrsw    m5, m7
    packuswb    m4, m5

    pmaddubsw   m5, m3, [r3 +  6 * 16]      ; [22]
    pmaddubsw   m6, m3, [r3 +  8 * 16]      ; [24]
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m3, [r3 + 10 * 16]      ; [26]
    pmaddubsw   m1, m3, [r3 + 12 * 16]      ; [28]
    pmulhrsw    m6, m7
    pmulhrsw    m1, m7
    packuswb    m6, m1

    pmaddubsw   m1, m3, [r3 + 14 * 16]      ; [30]
    pmulhrsw    m1, m7
    packuswb    m1, m1

    ; explicit store of the second group of eight rows
    movh        [r0],           m4
    movhps      [r0 + r1],      m4
    movh        [r0 + r1 * 2],  m5
    movhps      [r0 + r5],      m5
    lea         r0, [r0 + r1 * 4]
    movh        [r0],           m6
    movhps      [r0 + r1],      m6
    movh        [r0 + r1 * 2],  m1
    movh        [r0 + r5],      m2          ; [0]: b2 .. b9 copied as-is

    ; second pass writes 8 bytes to the right of the block origin
    lea         r0, [r6 + 8]
    add         r2, 8
    dec         r4
    jnz         .loop

    RET
| 3331 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_10 (SSE4)
;
; Writes 16 rows of 16 bytes. Row y is byte y of the 16 bytes at [r2 + 1]
; replicated across the whole row (pshufb with the all-zero mask in m7
; broadcasts byte 0 of its destination).
; If the sixth argument is non-zero, row 0 is instead
;     sat_u8(p + ((q[x] - c) >> 1)),   x = 0 .. 15
; where p = byte [r2 + 1], q[x] = byte [r3 + 1 + x] and c = byte [r3].
;
; Arguments (six are loaded by cglobal):
;   r0 = destination pointer, r1 = destination stride
;   r2 = reference supplying the per-row value (reused as row pointer later)
;   r3 = reference used only by the row-0 filter
;   r4 = fifth argument, unused here; overwritten with 3 * stride
;   r5 = filter flag
; The flag is tested as a full 32-bit value. The previous 16-bit test looked
; only at the low word and disagreed with intra_pred_ang16_26, whose x86-32
; path compares the same argument as a dword.
;
; The two-operand palignr forms leave garbage in the upper bytes of their
; destination; only byte 0 is consumed by the following broadcast.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_10, 6,6,8
    lea         r4, [r1 * 3]                ; 3 * stride
    pxor        m7, m7                      ; broadcast-byte-0 shuffle mask

    movu        m0, [r2 + 1]                ; per-row source bytes 0 .. 15

    ; rows 1 .. 6
    palignr     m1, m0, 1
    pshufb      m1, m7
    palignr     m2, m0, 2
    pshufb      m2, m7
    palignr     m3, m0, 3
    pshufb      m3, m7
    palignr     m4, m0, 4
    pshufb      m4, m7
    palignr     m5, m0, 5
    pshufb      m5, m7
    palignr     m6, m0, 6
    pshufb      m6, m7

    movu        [r0 + r1],      m1
    movu        [r0 + r1 * 2],  m2
    movu        [r0 + r4],      m3
    lea         r2, [r0 + r1 * 4]           ; r2 -> row 4
    movu        [r2],           m4
    movu        [r2 + r1],      m5
    movu        [r2 + r1 * 2],  m6

    ; rows 7 .. 12
    palignr     m1, m0, 7
    pshufb      m1, m7
    movhlps     m2, m0                      ; byte 8 moves to byte 0
    pshufb      m2, m7
    palignr     m3, m0, 9
    pshufb      m3, m7
    palignr     m4, m0, 10
    pshufb      m4, m7
    palignr     m5, m0, 11
    pshufb      m5, m7
    palignr     m6, m0, 12
    pshufb      m6, m7

    movu        [r2 + r4],      m1
    lea         r2, [r2 + r1 * 4]           ; r2 -> row 8
    movu        [r2],           m2
    movu        [r2 + r1],      m3
    movu        [r2 + r1 * 2],  m4
    movu        [r2 + r4],      m5
    lea         r2, [r2 + r1 * 4]           ; r2 -> row 12
    movu        [r2],           m6

    ; rows 13 .. 15, then turn m0 into the row-0 value
    palignr     m1, m0, 13
    pshufb      m1, m7
    palignr     m2, m0, 14
    pshufb      m2, m7
    palignr     m3, m0, 15
    pshufb      m3, m7
    pshufb      m0, m7

    movu        [r2 + r1],      m1
    movu        [r2 + r1 * 2],  m2
    movu        [r2 + r4],      m3

    ; optional row-0 filter
    test        r5d, r5d
    jz          .quit
    pmovzxbw    m0, m0                      ; p as eight words
    mova        m1, m0
    movu        m2, [r3]
    movu        m3, [r3 + 1]

    pshufb      m2, m7                      ; broadcast c
    pmovzxbw    m2, m2
    movhlps     m4, m3                      ; q[8 .. 15]
    pmovzxbw    m3, m3                      ; q[0 .. 7]
    pmovzxbw    m4, m4
    psubw       m3, m2
    psubw       m4, m2
    psraw       m3, 1
    psraw       m4, 1
    paddw       m0, m3
    paddw       m1, m4
    packuswb    m0, m1                      ; saturate back to bytes

.quit:
    movu        [r0], m0                    ; row 0 (filtered or plain)

    RET
| 3418 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_26 (SSE4)
;
; Copies the 16 bytes at [r3 + 1] into each of 16 destination rows.
; If the sixth argument is non-zero, byte 0 of every row y is then replaced by
;     sat_u8(p + ((q[y] - c) >> 1)),   y = 0 .. 15
; where p = byte [r3 + 1], q[y] = byte [r2 + 1 + y] and c = byte [r2].
;
; Arguments:
;   r0 = destination pointer, r1 = destination stride
;   r2 = reference used only by the column-0 filter
;   r3 = reference supplying the row contents (reused as row pointer later)
;   sixth argument = filter flag, kept in `bfilter` because r5 is reused:
;     x86-64: copied from r5mp into r7
;     x86-32: spilled to a 4-byte stack slot requested through cglobal's
;             stack-size argument
; The flag is compared as a 32-bit value on both architectures. The x86-64
; path previously tested only the low 16 bits (r7w) while the x86-32 path
; already compared a dword.
;
; Row pointers: r0 = rows 0-3, r3 = rows 4-7, r5 = rows 8-11, r6 = rows 12-15,
; r4 = 3 * stride.
;-----------------------------------------------------------------------------
INIT_XMM sse4
%if ARCH_X86_64 == 1
cglobal intra_pred_ang16_26, 4,8,5
    mov         r7, r5mp
    %define bfilter r7d
%else
cglobal intra_pred_ang16_26, 6,7,5,0 - 4
    %define bfilter dword[rsp]
    mov         bfilter, r5
%endif
    movu        m0, [r3 + 1]                ; the row that gets replicated

    lea         r4, [r1 * 3]                ; 3 * stride
    lea         r3, [r0 + r1 * 4]           ; row 4
    lea         r5, [r3 + r1 * 4]           ; row 8
    lea         r6, [r5 + r1 * 4]           ; row 12

    movu        [r0],           m0
    movu        [r0 + r1],      m0
    movu        [r0 + r1 * 2],  m0
    movu        [r0 + r4],      m0
    movu        [r3],           m0
    movu        [r3 + r1],      m0
    movu        [r3 + r1 * 2],  m0
    movu        [r3 + r4],      m0
    movu        [r5],           m0
    movu        [r5 + r1],      m0
    movu        [r5 + r1 * 2],  m0
    movu        [r5 + r4],      m0

    movu        [r6],           m0
    movu        [r6 + r1],      m0
    movu        [r6 + r1 * 2],  m0
    movu        [r6 + r4],      m0

    ; optional column-0 filter
    cmp         bfilter, byte 0
    jz          .quit

    pxor        m4, m4                      ; broadcast-byte-0 shuffle mask
    pshufb      m0, m4                      ; broadcast p
    pmovzxbw    m0, m0
    mova        m1, m0
    movu        m2, [r2]
    movu        m3, [r2 + 1]

    pshufb      m2, m4                      ; broadcast c
    pmovzxbw    m2, m2
    movhlps     m4, m3                      ; q[8 .. 15]
    pmovzxbw    m3, m3                      ; q[0 .. 7]
    pmovzxbw    m4, m4
    psubw       m3, m2
    psubw       m4, m2
    psraw       m3, 1
    psraw       m4, 1
    paddw       m0, m3
    paddw       m1, m4
    packuswb    m0, m1                      ; byte y = filtered value for row y

    ; overwrite the first byte of every row
    pextrb      [r0],           m0, 0
    pextrb      [r0 + r1],      m0, 1
    pextrb      [r0 + r1 * 2],  m0, 2
    pextrb      [r0 + r4],      m0, 3
    pextrb      [r3],           m0, 4
    pextrb      [r3 + r1],      m0, 5
    pextrb      [r3 + r1 * 2],  m0, 6
    pextrb      [r3 + r4],      m0, 7
    pextrb      [r5],           m0, 8
    pextrb      [r5 + r1],      m0, 9
    pextrb      [r5 + r1 * 2],  m0, 10
    pextrb      [r5 + r4],      m0, 11
    pextrb      [r6],           m0, 12
    pextrb      [r6 + r1],      m0, 13
    pextrb      [r6 + r1 * 2],  m0, 14
    pextrb      [r6 + r4],      m0, 15

.quit:
    RET
| 3497 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_11 (SSE4)
;
; Blends the byte pairs (b0,b1)..(b7,b8) read from [r2] with the even
; ang_table rows 30,28,...,2; the sixteenth result of each pass is the
; unblended byte run b0..b7. Output goes through TRANSPOSE_STORE_8x8 with
; its second argument set to 1.
; NOTE(review): the name suggests the 16x16 angular mode-11 predictor, and
; the store macro (defined outside this excerpt) presumably transposes each
; 8x8 tile -- confirm against the macro.
;
; Arguments (three are loaded by cglobal):
;   r0 = destination pointer (presumably written by the store macro)
;   r1 = destination stride
;   r2 = reference bytes, advanced 8 bytes per pass
; Working registers:
;   r3 = ang_table + 16 * 16, i.e. [r3 + k * 16] is table row 16 + k
;   r4 = pass counter (2)
;   r5 = 3 * stride, r6 = r0 + 4 * stride (presumably consumed by the macro)
;   m3 = byte pairs, m2 = raw b0.. (kept live across the first store)
;   m7 = pw_1024, the pmulhrsw multiplier applied after every pmaddubsw
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_11, 3,7,8

    mova        m7, [pw_1024]
    lea         r3, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]                ; 3 * stride
    lea         r6, [r0 + r1 * 4]           ; dst + 4 * stride
    mov         r4d, 2                      ; number of passes

.loop:
    movu        m3, [r2]                    ; b0 .. b15
    mova        m2, m3                      ; keep an unblended copy
    palignr     m1, m3, 1                   ; b1 .. b15, top byte is don't-care
    punpcklbw   m3, m1                      ; (b0,b1) .. (b7,b8)

    pmaddubsw   m4, m3, [r3 + 14 * 16]      ; [30]
    pmaddubsw   m0, m3, [r3 + 12 * 16]      ; [28]
    pmulhrsw    m4, m7
    pmulhrsw    m0, m7
    packuswb    m4, m0

    pmaddubsw   m5, m3, [r3 + 10 * 16]      ; [26]
    pmaddubsw   m6, m3, [r3 +  8 * 16]      ; [24]
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m3, [r3 +  6 * 16]      ; [22]
    pmaddubsw   m0, m3, [r3 +  4 * 16]      ; [20]
    pmulhrsw    m6, m7
    pmulhrsw    m0, m7
    packuswb    m6, m0

    pmaddubsw   m1, m3, [r3 +  2 * 16]      ; [18]
    pmaddubsw   m0, m3, [r3]                ; [16]
    pmulhrsw    m1, m7
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4, m3, [r3 -  2 * 16]      ; [14]
    pmaddubsw   m5, m3, [r3 -  4 * 16]      ; [12]
    pmulhrsw    m4, m7
    pmulhrsw    m5, m7
    packuswb    m4, m5

    pmaddubsw   m5, m3, [r3 -  6 * 16]      ; [10]
    pmaddubsw   m6, m3, [r3 -  8 * 16]      ; [8]
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m3, [r3 - 10 * 16]      ; [6]
    pmaddubsw   m1, m3, [r3 - 12 * 16]      ; [4]
    pmulhrsw    m6, m7
    pmulhrsw    m1, m7
    packuswb    m6, m1

    pmaddubsw   m1, m3, [r3 - 14 * 16]      ; [2]
    pmulhrsw    m1, m7
    packuswb    m1, m1
    punpcklqdq  m1, m2                      ; [0]: upper half = b0 .. b7 as-is

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    ; next pass: move both row anchors down 8 rows, reference on by 8 bytes
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         r4
    jnz         .loop

    RET
| 3571 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_25 (SSE4)
;
; Blends the byte pairs (b0,b1)..(b7,b8) of the reference passed as fourth
; argument with the even ang_table rows 30,28,...,2 and stores 8-byte rows;
; the last row of each pass is the unblended byte run b0..b7.
; The first eight rows go through TRANSPOSE_STORE_8x8 (second argument 0),
; the remaining eight are stored explicitly below.
; NOTE(review): the macro is defined outside this excerpt; the explicit
; stores that follow it assume it leaves r0 pointing at the ninth row --
; confirm against the macro.
;
; Arguments (three are loaded by cglobal, the fourth is fetched via r3mp):
;   r0 = destination pointer, r1 = destination stride
; Working registers:
;   r2 = reference bytes (fourth argument), advanced 8 bytes per pass
;   r3 = ang_table + 16 * 16, i.e. [r3 + k * 16] is table row 16 + k
;   r4 = pass counter (2)
;   r5 = 3 * stride
;   r6 = initial r0, used to restart at r6 + 8 for the second pass
;   m3 = byte pairs, m2 = raw b0.. (stored verbatim as the last row)
;   m7 = pw_1024, the pmulhrsw multiplier applied after every pmaddubsw
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_25, 3,7,8
    mov         r2, r3mp                    ; reference = 4th argument
    mov         r6, r0                      ; remember block origin
    mova        m7, [pw_1024]
    lea         r3, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]                ; 3 * stride
    mov         r4d, 2                      ; number of passes

.loop:
    movu        m3, [r2]                    ; b0 .. b15
    mova        m2, m3                      ; keep an unblended copy
    palignr     m1, m3, 1                   ; b1 .. b15, top byte is don't-care
    punpcklbw   m3, m1                      ; (b0,b1) .. (b7,b8)

    pmaddubsw   m4, m3, [r3 + 14 * 16]      ; [30]
    pmaddubsw   m0, m3, [r3 + 12 * 16]      ; [28]
    pmulhrsw    m4, m7
    pmulhrsw    m0, m7
    packuswb    m4, m0

    pmaddubsw   m5, m3, [r3 + 10 * 16]      ; [26]
    pmaddubsw   m6, m3, [r3 +  8 * 16]      ; [24]
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m3, [r3 +  6 * 16]      ; [22]
    pmaddubsw   m0, m3, [r3 +  4 * 16]      ; [20]
    pmulhrsw    m6, m7
    pmulhrsw    m0, m7
    packuswb    m6, m0

    pmaddubsw   m1, m3, [r3 +  2 * 16]      ; [18]
    pmaddubsw   m0, m3, [r3]                ; [16]
    pmulhrsw    m1, m7
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4, m3, [r3 -  2 * 16]      ; [14]
    pmaddubsw   m5, m3, [r3 -  4 * 16]      ; [12]
    pmulhrsw    m4, m7
    pmulhrsw    m5, m7
    packuswb    m4, m5

    pmaddubsw   m5, m3, [r3 -  6 * 16]      ; [10]
    pmaddubsw   m6, m3, [r3 -  8 * 16]      ; [8]
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m3, [r3 - 10 * 16]      ; [6]
    pmaddubsw   m1, m3, [r3 - 12 * 16]      ; [4]
    pmulhrsw    m6, m7
    pmulhrsw    m1, m7
    packuswb    m6, m1

    pmaddubsw   m1, m3, [r3 - 14 * 16]      ; [2]
    pmulhrsw    m1, m7
    packuswb    m1, m1

    ; explicit store of the second group of eight rows
    movh        [r0],           m4
    movhps      [r0 + r1],      m4
    movh        [r0 + r1 * 2],  m5
    movhps      [r0 + r5],      m5
    lea         r0, [r0 + r1 * 4]
    movh        [r0],           m6
    movhps      [r0 + r1],      m6
    movh        [r0 + r1 * 2],  m1
    movh        [r0 + r5],      m2          ; [0]: b0 .. b7 copied as-is

    ; second pass writes 8 bytes to the right of the block origin
    lea         r0, [r6 + 8]
    add         r2, 8
    dec         r4
    jnz         .loop

    RET
| 3651 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_12 (SSE4)
;
; Two unrolled halves. Each half blends adjacent reference bytes from r2 with
; ang_table rows 27,22,17,12,7,2 / 29,24,19,14,9,4 / 31,26,21,16, stepping the
; pair window back by one byte before the second and third group. Bytes that
; precede the r2 data are taken from the top of m2, which holds [r3] permuted
; by c_mode16_12. Results go through TRANSPOSE_STORE_8x8 with its second
; argument set to 1.
; NOTE(review): the name suggests the 16x16 angular mode-12 predictor;
; c_mode16_12 and the store macro are defined outside this excerpt, so the
; exact bytes selected and the store layout should be confirmed there.
;
; Arguments (four are loaded by cglobal):
;   r0 = destination pointer (presumably written by the store macro)
;   r1 = destination stride
;   r2 = main reference bytes
;   r3 = side reference, source of the permuted extension bytes
; Working registers:
;   r4 = ang_table + 16 * 16, i.e. [r4 + k * 16] is table row 16 + k
;   r5 = 3 * stride, r6 = r0 + 4 * stride (presumably consumed by the macro)
;   m7 = pw_1024, the pmulhrsw multiplier applied after every pmaddubsw
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_12, 4,7,8

    mova        m7, [pw_1024]
    lea         r4, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]                ; 3 * stride
    lea         r6, [r0 + r1 * 4]           ; dst + 4 * stride

    ; ---------------- first half ----------------
    movu        m3, [r2]                    ; b0 .. b15
    punpckhbw   m0, m3, m3                  ; b8 b8 .. b15 b15
    punpcklbw   m3, m3                      ; b0 b0 .. b7 b7
    movu        m2, [r3]
    pshufb      m2, [c_mode16_12]           ; extension bytes end up in the top of m2

    palignr     m0, m3, 1                   ; (b0,b1) .. (b7,b8)

    pmaddubsw   m4, m0, [r4 + 11 * 16]      ; [27]
    pmaddubsw   m1, m0, [r4 +  6 * 16]      ; [22]
    pmulhrsw    m4, m7
    pmulhrsw    m1, m7
    packuswb    m4, m1

    pmaddubsw   m5, m0, [r4 +  1 * 16]      ; [17]
    pmaddubsw   m6, m0, [r4 -  4 * 16]      ; [12]
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m0, [r4 -  9 * 16]      ; [7]
    pmaddubsw   m0, [r4 - 14 * 16]          ; [2]
    pmulhrsw    m6, m7
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m3, m2, 15                  ; window back one: (m2[15],b0) (b0,b1) ..

    pmaddubsw   m1, m3, [r4 + 13 * 16]      ; [29]
    pmaddubsw   m0, m3, [r4 +  8 * 16]      ; [24]
    pmulhrsw    m1, m7
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4, m3, [r4 +  3 * 16]      ; [19]
    pmaddubsw   m5, m3, [r4 -  2 * 16]      ; [14]
    pmulhrsw    m4, m7
    pmulhrsw    m5, m7
    packuswb    m4, m5

    pmaddubsw   m5, m3, [r4 -  7 * 16]      ; [9]
    pmaddubsw   m6, m3, [r4 - 12 * 16]      ; [4]
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    palignr     m3, m2, 14                  ; window back one more: (m2[14],m2[15]) ..

    pmaddubsw   m6, m3, [r4 + 15 * 16]      ; [31]
    pmaddubsw   m1, m3, [r4 + 10 * 16]      ; [26]
    pmulhrsw    m6, m7
    pmulhrsw    m1, m7
    packuswb    m6, m1

    pmaddubsw   m1, m3, [r4 +  5 * 16]      ; [21]
    pmaddubsw   m3, [r4]                    ; [16]
    pmulhrsw    m1, m7
    pmulhrsw    m3, m7
    packuswb    m1, m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    ; ---------------- second half ----------------
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]

    movu        m1, [r2 + 1]                ; b1 .. b16
    pslldq      m3, m1, 1                   ; byte k = b(k), byte 0 cleared
    punpckhbw   m3, m1                      ; (b8,b9) .. (b15,b16)
    movlhps     m2, m1                      ; top half of m2 = b1 .. b8

    pmaddubsw   m4, m3, [r4 + 11 * 16]      ; [27]
    pmaddubsw   m5, m3, [r4 +  6 * 16]      ; [22]
    pmulhrsw    m4, m7
    pmulhrsw    m5, m7
    packuswb    m4, m5

    pmaddubsw   m5, m3, [r4 +  1 * 16]      ; [17]
    pmaddubsw   m6, m3, [r4 -  4 * 16]      ; [12]
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m3, [r4 -  9 * 16]      ; [7]
    pmaddubsw   m0, m3, [r4 - 14 * 16]      ; [2]
    pmulhrsw    m6, m7
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m3, m2, 14                  ; window back one: (b7,b8) (b8,b9) ..

    pmaddubsw   m1, m3, [r4 + 13 * 16]      ; [29]
    pmaddubsw   m0, m3, [r4 +  8 * 16]      ; [24]
    pmulhrsw    m1, m7
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4, m3, [r4 +  3 * 16]      ; [19]
    pmaddubsw   m5, m3, [r4 -  2 * 16]      ; [14]
    pmulhrsw    m4, m7
    pmulhrsw    m5, m7
    packuswb    m4, m5

    pmaddubsw   m5, m3, [r4 -  7 * 16]      ; [9]
    pmaddubsw   m6, m3, [r4 - 12 * 16]      ; [4]
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pslldq      m2, 1                       ; expose the next-lower pair in m2[14..15]
    palignr     m3, m2, 14                  ; window back one more: (b6,b7) ..

    pmaddubsw   m6, m3, [r4 + 15 * 16]      ; [31]
    pmaddubsw   m1, m3, [r4 + 10 * 16]      ; [26]
    pmulhrsw    m6, m7
    pmulhrsw    m1, m7
    packuswb    m6, m1

    pmaddubsw   m1, m3, [r4 +  5 * 16]      ; [21]
    pmaddubsw   m3, [r4]                    ; [16]
    pmulhrsw    m1, m7
    pmulhrsw    m3, m7
    packuswb    m1, m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    RET
| 3790 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_24 (SSE4)
;
; Same arithmetic as intra_pred_ang16_12 with the two reference arguments
; exchanged: adjacent bytes from r3 are blended with ang_table rows
; 27,22,17,12,7,2 / 29,24,19,14,9,4 / 31,26,21,16, and bytes preceding the r3
; data come from the top of m2, which holds [r2] permuted by c_mode16_12.
; Results go through TRANSPOSE_STORE_8x8 with its second argument 0.
; NOTE(review): c_mode16_12 and the store macro are defined outside this
; excerpt; in this mode the macro presumably stores eight rows untransposed
; and advances r0 -- confirm.
;
; Arguments (four are loaded by cglobal):
;   r0 = destination pointer, r1 = destination stride
;   r2 = side reference, source of the permuted extension bytes
;   r3 = main reference bytes
; Working registers:
;   r4 = ang_table + 16 * 16, i.e. [r4 + k * 16] is table row 16 + k
;   r5 = 3 * stride
;   r6 = initial r0, used to restart at r6 + 8 for the second half
;   m7 = pw_1024, the pmulhrsw multiplier applied after every pmaddubsw
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_24, 4,7,8

    mov         r6, r0                      ; remember block origin
    mova        m7, [pw_1024]
    lea         r4, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]                ; 3 * stride

    ; ---------------- first half ----------------
    movu        m3, [r3]                    ; b0 .. b15
    punpckhbw   m0, m3, m3                  ; b8 b8 .. b15 b15
    punpcklbw   m3, m3                      ; b0 b0 .. b7 b7
    movu        m2, [r2]
    pshufb      m2, [c_mode16_12]           ; extension bytes end up in the top of m2

    palignr     m0, m3, 1                   ; (b0,b1) .. (b7,b8)

    pmaddubsw   m4, m0, [r4 + 11 * 16]      ; [27]
    pmaddubsw   m1, m0, [r4 +  6 * 16]      ; [22]
    pmulhrsw    m4, m7
    pmulhrsw    m1, m7
    packuswb    m4, m1

    pmaddubsw   m5, m0, [r4 +  1 * 16]      ; [17]
    pmaddubsw   m6, m0, [r4 -  4 * 16]      ; [12]
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m0, [r4 -  9 * 16]      ; [7]
    pmaddubsw   m0, [r4 - 14 * 16]          ; [2]
    pmulhrsw    m6, m7
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m3, m2, 15                  ; window back one: (m2[15],b0) (b0,b1) ..

    pmaddubsw   m1, m3, [r4 + 13 * 16]      ; [29]
    pmaddubsw   m0, m3, [r4 +  8 * 16]      ; [24]
    pmulhrsw    m1, m7
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4, m3, [r4 +  3 * 16]      ; [19]
    pmaddubsw   m5, m3, [r4 -  2 * 16]      ; [14]
    pmulhrsw    m4, m7
    pmulhrsw    m5, m7
    packuswb    m4, m5

    pmaddubsw   m5, m3, [r4 -  7 * 16]      ; [9]
    pmaddubsw   m6, m3, [r4 - 12 * 16]      ; [4]
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    palignr     m3, m2, 14                  ; window back one more: (m2[14],m2[15]) ..

    pmaddubsw   m6, m3, [r4 + 15 * 16]      ; [31]
    pmaddubsw   m1, m3, [r4 + 10 * 16]      ; [26]
    pmulhrsw    m6, m7
    pmulhrsw    m1, m7
    packuswb    m6, m1

    pmaddubsw   m1, m3, [r4 +  5 * 16]      ; [21]
    pmaddubsw   m3, [r4]                    ; [16]
    pmulhrsw    m1, m7
    pmulhrsw    m3, m7
    packuswb    m1, m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    ; ---------------- second half ----------------
    lea         r0, [r6 + 8]                ; 8 bytes to the right of the origin

    movu        m1, [r3 + 1]                ; b1 .. b16
    pslldq      m3, m1, 1                   ; byte k = b(k), byte 0 cleared
    punpckhbw   m3, m1                      ; (b8,b9) .. (b15,b16)
    movlhps     m2, m1                      ; top half of m2 = b1 .. b8

    pmaddubsw   m4, m3, [r4 + 11 * 16]      ; [27]
    pmaddubsw   m5, m3, [r4 +  6 * 16]      ; [22]
    pmulhrsw    m4, m7
    pmulhrsw    m5, m7
    packuswb    m4, m5

    pmaddubsw   m5, m3, [r4 +  1 * 16]      ; [17]
    pmaddubsw   m6, m3, [r4 -  4 * 16]      ; [12]
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m3, [r4 -  9 * 16]      ; [7]
    pmaddubsw   m0, m3, [r4 - 14 * 16]      ; [2]
    pmulhrsw    m6, m7
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m3, m2, 14                  ; window back one: (b7,b8) (b8,b9) ..

    pmaddubsw   m1, m3, [r4 + 13 * 16]      ; [29]
    pmaddubsw   m0, m3, [r4 +  8 * 16]      ; [24]
    pmulhrsw    m1, m7
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1

    pmaddubsw   m4, m3, [r4 +  3 * 16]      ; [19]
    pmaddubsw   m5, m3, [r4 -  2 * 16]      ; [14]
    pmulhrsw    m4, m7
    pmulhrsw    m5, m7
    packuswb    m4, m5

    pmaddubsw   m5, m3, [r4 -  7 * 16]      ; [9]
    pmaddubsw   m6, m3, [r4 - 12 * 16]      ; [4]
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pslldq      m2, 1                       ; expose the next-lower pair in m2[14..15]
    palignr     m3, m2, 14                  ; window back one more: (b6,b7) ..

    pmaddubsw   m6, m3, [r4 + 15 * 16]      ; [31]
    pmaddubsw   m1, m3, [r4 + 10 * 16]      ; [26]
    pmulhrsw    m6, m7
    pmulhrsw    m1, m7
    packuswb    m6, m1

    pmaddubsw   m1, m3, [r4 +  5 * 16]      ; [21]
    pmaddubsw   m3, [r4]                    ; [16]
    pmulhrsw    m1, m7
    pmulhrsw    m3, m7
    packuswb    m1, m3

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1

    RET
| 3928 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_13 (SSE4)
;
; Two unrolled halves. Each half blends adjacent reference bytes from r2 with
; ang_table rows 23,14,5 / 28,19,10,1 / 24,15,6 / 29,20,11,2 / 25,16, stepping
; the pair window back by one byte between groups. Bytes that precede the r2
; data are taken from the top of m2, which holds [r3] permuted by
; c_mode16_13 and is shifted up one byte each time another one is needed.
; Results go through TRANSPOSE_STORE_8x8 with its second argument set to 1.
; NOTE(review): the name suggests the 16x16 angular mode-13 predictor;
; c_mode16_13 and the store macro are defined outside this excerpt, so the
; exact bytes selected and the store layout should be confirmed there.
;
; Arguments (four are loaded by cglobal):
;   r0 = destination pointer (presumably written by the store macro)
;   r1 = destination stride
;   r2 = main reference bytes
;   r3 = side reference, source of the permuted extension bytes
; Working registers:
;   r4 = ang_table + 16 * 16, i.e. [r4 + k * 16] is table row 16 + k
;   r5 = 3 * stride, r6 = r0 + 4 * stride (presumably consumed by the macro)
;   m7 = pw_1024, the pmulhrsw multiplier applied after every pmaddubsw
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang16_13, 4,7,8

    mova        m7, [pw_1024]
    lea         r4, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]                ; 3 * stride
    lea         r6, [r0 + r1 * 4]           ; dst + 4 * stride

    ; ---------------- first half ----------------
    movu        m3, [r2]                    ; b0 .. b15
    punpckhbw   m5, m3, m3                  ; b8 b8 .. b15 b15
    punpcklbw   m3, m3                      ; b0 b0 .. b7 b7
    movu        m2, [r3]
    pshufb      m2, [c_mode16_13]           ; extension bytes end up in the top of m2

    palignr     m5, m3, 1                   ; (b0,b1) .. (b7,b8)

    pmaddubsw   m4, m5, [r4 +  7 * 16]      ; [23]
    pmaddubsw   m0, m5, [r4 -  2 * 16]      ; [14]
    pmulhrsw    m4, m7
    pmulhrsw    m0, m7
    packuswb    m4, m0

    pmaddubsw   m5, [r4 - 11 * 16]          ; [5]
    pmulhrsw    m5, m7

    palignr     m3, m2, 15                  ; window back one: (m2[15],b0) (b0,b1) ..

    pmaddubsw   m6, m3, [r4 + 12 * 16]      ; [28]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m3, [r4 +  3 * 16]      ; [19]
    pmaddubsw   m0, m3, [r4 -  6 * 16]      ; [10]
    pmulhrsw    m6, m7
    pmulhrsw    m0, m7
    packuswb    m6, m0

    pmaddubsw   m1, m3, [r4 - 15 * 16]      ; [1]
    pmulhrsw    m1, m7

    palignr     m3, m2, 14                  ; window back again: (m2[14],m2[15]) ..

    pmaddubsw   m0, m3, [r4 +  8 * 16]      ; [24]
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4, m3, [r4 - 16]           ; [15]
    pmaddubsw   m5, m3, [r4 - 10 * 16]      ; [6]
    pmulhrsw    m4, m7
    pmulhrsw    m5, m7
    packuswb    m4, m5

    pslldq      m2, 1                       ; expose the next-lower pair in m2[14..15]
    palignr     m3, m2, 14                  ; window back again

    pmaddubsw   m5, m3, [r4 + 13 * 16]      ; [29]
    pmaddubsw   m6, m3, [r4 +  4 * 16]      ; [20]
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m3, [r4 -  5 * 16]      ; [11]
    pmaddubsw   m1, m3, [r4 - 14 * 16]      ; [2]
    pmulhrsw    m6, m7
    pmulhrsw    m1, m7
    packuswb    m6, m1

    pslldq      m2, 1
    palignr     m3, m2, 14                  ; window back again

    pmaddubsw   m1, m3, [r4 +  9 * 16]      ; [25]
    pmaddubsw   m3, [r4]                    ; [16]
    pmulhrsw    m1, m7
    pmulhrsw    m3, m7
    packuswb    m1, m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    ; ---------------- second half ----------------
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]

    movu        m1, [r2 + 1]                ; b1 .. b16
    pslldq      m3, m1, 1                   ; byte k = b(k), byte 0 cleared
    punpckhbw   m3, m1                      ; (b8,b9) .. (b15,b16)
    movlhps     m2, m1                      ; top half of m2 = b1 .. b8

    pmaddubsw   m4, m3, [r4 +  7 * 16]      ; [23]
    pmaddubsw   m5, m3, [r4 -  2 * 16]      ; [14]
    pmulhrsw    m4, m7
    pmulhrsw    m5, m7
    packuswb    m4, m5

    pmaddubsw   m5, m3, [r4 - 11 * 16]      ; [5]
    pmulhrsw    m5, m7

    palignr     m3, m2, 14                  ; window back one: (b7,b8) (b8,b9) ..

    pmaddubsw   m6, m3, [r4 + 12 * 16]      ; [28]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m3, [r4 +  3 * 16]      ; [19]
    pmaddubsw   m0, m3, [r4 -  6 * 16]      ; [10]
    pmulhrsw    m6, m7
    pmulhrsw    m0, m7
    packuswb    m6, m0

    pmaddubsw   m1, m3, [r4 - 15 * 16]      ; [1]
    pmulhrsw    m1, m7

    pslldq      m2, 1                       ; expose the next-lower pair in m2[14..15]
    palignr     m3, m2, 14                  ; window back again: (b6,b7) ..

    pmaddubsw   m0, m3, [r4 +  8 * 16]      ; [24]
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    pmaddubsw   m4, m3, [r4 - 16]           ; [15]
    pmaddubsw   m5, m3, [r4 - 10 * 16]      ; [6]
    pmulhrsw    m4, m7
    pmulhrsw    m5, m7
    packuswb    m4, m5

    pslldq      m2, 1
    palignr     m3, m2, 14                  ; window back again: (b5,b6) ..

    pmaddubsw   m5, m3, [r4 + 13 * 16]      ; [29]
    pmaddubsw   m6, m3, [r4 +  4 * 16]      ; [20]
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m3, [r4 -  5 * 16]      ; [11]
    pmaddubsw   m1, m3, [r4 - 14 * 16]      ; [2]
    pmulhrsw    m6, m7
    pmulhrsw    m1, m7
    packuswb    m6, m1

    pslldq      m2, 1
    palignr     m3, m2, 14                  ; window back again: (b4,b5) ..

    pmaddubsw   m1, m3, [r4 +  9 * 16]      ; [25]
    pmaddubsw   m3, [r4]                    ; [16]
    pmulhrsw    m1, m7
    pmulhrsw    m3, m7
    packuswb    m1, m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    RET
| 4083 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_23 (SSE4, cglobal 4,7,8)
;
; Register roles, as far as this block shows:
;   r0 - current output pointer (presumably dst - TODO confirm against the C
;        prototype); its entry value is copied to r6 and the second half
;        restarts from r6 + 8
;   r1 - stride; r5 = 3 * r1
;   r2 - 16 bytes loaded and reordered through c_mode16_13 into m2, then fed
;        into m3 from the top of m2 as the source window slides
;   r3 - 16 bytes at [r3] for the first half, [r3 + 1] for the second half
;   r4 - ang_table + 16 * 16, so [r4 + k * 16] is the 16 bytes at
;        ang_table + (16 + k) * 16; the bracketed number on each pmaddubsw
;        line is that index 16 + k
;   m7 - pw_1024, the pmulhrsw multiplier (presumably words of 1024, i.e. a
;        rounded >> 5 - verify against the constant's definition)
;
; Each half runs 16 pmaddubsw/pmulhrsw steps, packs them pairwise with
; packuswb into m4, m5, m6, m1 and hands every group of 8 results to
; TRANSPOSE_STORE_8x8 (macro defined outside this block; invoked here with 0
; as its second argument).
;-----------------------------------------------------------------------------
| 4084 | INIT_XMM sse4 |
| 4085 | cglobal intra_pred_ang16_23, 4,7,8 |
| 4086 | |
| 4087 | lea r4, [ang_table + 16 * 16] |
| 4088 | lea r5, [r1 * 3] ; r5 -> 3 * stride |
| 4089 | mov r6, r0 |
| 4090 | mova m7, [pw_1024] |
| 4091 | |
; ---- first half: source window built from [r3], side bytes from [r2] ----
| 4092 | movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] |
| 4093 | punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] |
| 4094 | punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] |
| 4095 | movu m2, [r2] |
| 4096 | pshufb m2, [c_mode16_13] |
| 4097 | |
| 4098 | palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] |
| 4099 | |
| 4100 | pmaddubsw m4, m5, [r4 + 7 * 16] ; [23] |
| 4101 | pmulhrsw m4, m7 |
| 4102 | pmaddubsw m0, m5, [r4 - 2 * 16] ; [14] |
| 4103 | pmulhrsw m0, m7 |
| 4104 | packuswb m4, m0 |
| 4105 | |
| 4106 | pmaddubsw m5, [r4 - 11 * 16] ; [05] |
| 4107 | pmulhrsw m5, m7 |
| 4108 | |
; shift m3 up one byte and insert the top byte of m2 at the bottom
| 4109 | palignr m3, m2, 15 |
| 4110 | |
| 4111 | pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] |
| 4112 | pmulhrsw m6, m7 |
| 4113 | packuswb m5, m6 |
| 4114 | |
| 4115 | pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] |
| 4116 | pmulhrsw m6, m7 |
| 4117 | pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] |
| 4118 | pmulhrsw m0, m7 |
| 4119 | packuswb m6, m0 |
| 4120 | |
| 4121 | pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] |
| 4122 | pmulhrsw m1, m7 |
| 4123 | |
; shift m3 up two bytes and insert the top two bytes of m2 at the bottom
| 4124 | palignr m3, m2, 14 |
| 4125 | |
| 4126 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] |
| 4127 | pmulhrsw m0, m7 |
| 4128 | packuswb m1, m0 |
| 4129 | |
| 4130 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 |
| 4131 | |
| 4132 | pmaddubsw m4, m3, [r4 - 16] ; [15] |
| 4133 | pmulhrsw m4, m7 |
| 4134 | pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] |
| 4135 | pmulhrsw m5, m7 |
| 4136 | packuswb m4, m5 |
| 4137 | |
; move the next byte of m2 to the top, then slide m3 by two bytes again
| 4138 | pslldq m2, 1 |
| 4139 | palignr m3, m2, 14 |
| 4140 | |
| 4141 | pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] |
| 4142 | pmulhrsw m5, m7 |
| 4143 | pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] |
| 4144 | pmulhrsw m6, m7 |
| 4145 | packuswb m5, m6 |
| 4146 | |
| 4147 | pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] |
| 4148 | pmulhrsw m6, m7 |
| 4149 | pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] |
| 4150 | pmulhrsw m1, m7 |
| 4151 | packuswb m6, m1 |
| 4152 | |
| 4153 | pslldq m2, 1 |
| 4154 | palignr m3, m2, 14 |
| 4155 | |
| 4156 | pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] |
| 4157 | pmulhrsw m1, m7 |
| 4158 | pmaddubsw m3, [r4] ; [16] |
| 4159 | pmulhrsw m3, m7 |
| 4160 | packuswb m1, m3 |
| 4161 | |
| 4162 | TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 |
| 4163 | |
; ---- second half: output restarts at r6 + 8, source window from [r3 + 1] ----
| 4164 | lea r0, [r6 + 8] |
| 4165 | |
| 4166 | movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] |
| 4167 | pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] |
| 4168 | punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] |
| 4169 | movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x] |
| 4170 | |
| 4171 | pmaddubsw m4, m3, [r4 + 7 * 16] ; [23] |
| 4172 | pmulhrsw m4, m7 |
| 4173 | pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] |
| 4174 | pmulhrsw m5, m7 |
| 4175 | packuswb m4, m5 |
| 4176 | |
| 4177 | pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] |
| 4178 | pmulhrsw m5, m7 |
| 4179 | |
| 4180 | palignr m3, m2, 14 |
| 4181 | |
| 4182 | pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] |
| 4183 | pmulhrsw m6, m7 |
| 4184 | packuswb m5, m6 |
| 4185 | |
| 4186 | pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] |
| 4187 | pmulhrsw m6, m7 |
| 4188 | pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] |
| 4189 | pmulhrsw m0, m7 |
| 4190 | packuswb m6, m0 |
| 4191 | |
| 4192 | pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] |
| 4193 | pmulhrsw m1, m7 |
| 4194 | |
| 4195 | pslldq m2, 1 |
| 4196 | palignr m3, m2, 14 |
| 4197 | |
| 4198 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] |
| 4199 | pmulhrsw m0, m7 |
| 4200 | packuswb m1, m0 |
| 4201 | |
| 4202 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 |
| 4203 | |
| 4204 | pmaddubsw m4, m3, [r4 - 16] ; [15] |
| 4205 | pmulhrsw m4, m7 |
| 4206 | pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] |
| 4207 | pmulhrsw m5, m7 |
| 4208 | packuswb m4, m5 |
| 4209 | |
| 4210 | pslldq m2, 1 |
| 4211 | palignr m3, m2, 14 |
| 4212 | |
| 4213 | pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] |
| 4214 | pmulhrsw m5, m7 |
| 4215 | pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] |
| 4216 | pmulhrsw m6, m7 |
| 4217 | packuswb m5, m6 |
| 4218 | |
| 4219 | pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] |
| 4220 | pmulhrsw m6, m7 |
| 4221 | pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] |
| 4222 | pmulhrsw m1, m7 |
| 4223 | packuswb m6, m1 |
| 4224 | |
| 4225 | pslldq m2, 1 |
| 4226 | palignr m3, m2, 14 |
| 4227 | |
| 4228 | pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] |
| 4229 | pmulhrsw m1, m7 |
| 4230 | pmaddubsw m3, [r4] ; [16] |
| 4231 | pmulhrsw m3, m7 |
| 4232 | packuswb m1, m3 |
| 4233 | |
| 4234 | TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 |
| 4235 | |
| 4236 | RET |
| 4237 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_14 (SSE4, cglobal 4,7,8)
;
; Register roles, as far as this block shows:
;   r0 - current output pointer (presumably dst - TODO confirm against the C
;        prototype); reloaded from r6 + 4 * r1 before the second half
;   r1 - stride; r5 = 3 * r1
;   r2 - 16 bytes at [r2] for the first half, [r2 + 1] for the second half
;   r3 - 16 bytes loaded and reordered through c_mode16_14 into m2, then fed
;        into m3 from the top of m2 as the source window slides
;   r4 - ang_table + 16 * 16, so [r4 + k * 16] is the 16 bytes at
;        ang_table + (16 + k) * 16; the bracketed number on each pmaddubsw
;        line is that index 16 + k
;   r6 - set to r0 + 4 * r1 on entry and advanced by 8 * r1 before the second
;        half
;   m7 - pw_1024, the pmulhrsw multiplier (presumably words of 1024, i.e. a
;        rounded >> 5 - verify against the constant's definition)
;
; Each half runs 16 pmaddubsw/pmulhrsw steps, packs them pairwise with
; packuswb into m4, m5, m6, m1 and hands every group of 8 results to
; TRANSPOSE_STORE_8x8 (macro defined outside this block; invoked here with 1
; as its second argument).
;-----------------------------------------------------------------------------
| 4238 | INIT_XMM sse4 |
| 4239 | cglobal intra_pred_ang16_14, 4,7,8 |
| 4240 | |
| 4241 | lea r4, [ang_table + 16 * 16] |
| 4242 | lea r5, [r1 * 3] ; r5 -> 3 * stride |
| 4243 | lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride |
| 4244 | mova m7, [pw_1024] |
| 4245 | |
; ---- first half: source window built from [r2], side bytes from [r3] ----
| 4246 | movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] |
| 4247 | punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] |
| 4248 | punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] |
| 4249 | movu m2, [r3] |
| 4250 | pshufb m2, [c_mode16_14] |
| 4251 | |
| 4252 | palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] |
| 4253 | |
| 4254 | pmaddubsw m4, m5, [r4 + 3 * 16] ; [19] |
| 4255 | pmulhrsw m4, m7 |
| 4256 | pmaddubsw m5, [r4 - 10 * 16] ; [06] |
| 4257 | pmulhrsw m5, m7 |
| 4258 | packuswb m4, m5 |
| 4259 | |
; shift m3 up one byte and insert the top byte of m2 at the bottom
| 4260 | palignr m3, m2, 15 |
| 4261 | |
| 4262 | pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] |
| 4263 | pmulhrsw m5, m7 |
| 4264 | pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] |
| 4265 | pmulhrsw m6, m7 |
| 4266 | packuswb m5, m6 |
| 4267 | |
; shift m3 up two bytes and insert the top two bytes of m2 at the bottom
| 4268 | palignr m3, m2, 14 |
| 4269 | |
| 4270 | pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] |
| 4271 | pmulhrsw m6, m7 |
| 4272 | pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] |
| 4273 | pmulhrsw m0, m7 |
| 4274 | packuswb m6, m0 |
| 4275 | |
| 4276 | pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] |
| 4277 | pmulhrsw m1, m7 |
| 4278 | |
; move the next byte of m2 to the top, then slide m3 by two bytes again
| 4279 | pslldq m2, 1 |
| 4280 | palignr m3, m2, 14 |
| 4281 | |
| 4282 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] |
| 4283 | pmulhrsw m0, m7 |
| 4284 | packuswb m1, m0 |
| 4285 | |
| 4286 | TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 |
| 4287 | |
| 4288 | pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] |
| 4289 | pmulhrsw m4, m7 |
| 4290 | |
| 4291 | pslldq m2, 1 |
| 4292 | palignr m3, m2, 14 |
| 4293 | |
| 4294 | pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] |
| 4295 | pmulhrsw m5, m7 |
| 4296 | packuswb m4, m5 |
| 4297 | |
| 4298 | pmaddubsw m5, m3, [r4 + 16] ; [17] |
| 4299 | pmulhrsw m5, m7 |
| 4300 | pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] |
| 4301 | pmulhrsw m6, m7 |
| 4302 | packuswb m5, m6 |
| 4303 | |
| 4304 | pslldq m2, 1 |
| 4305 | palignr m3, m2, 14 |
| 4306 | |
| 4307 | pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] |
| 4308 | pmulhrsw m6, m7 |
| 4309 | pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] |
| 4310 | pmulhrsw m1, m7 |
| 4311 | packuswb m6, m1 |
| 4312 | |
| 4313 | pslldq m2, 1 |
| 4314 | palignr m3, m2, 14 |
| 4315 | |
| 4316 | pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] |
| 4317 | pmulhrsw m1, m7 |
| 4318 | pmaddubsw m3, [r4] ; [16] |
| 4319 | pmulhrsw m3, m7 |
| 4320 | packuswb m1, m3 |
| 4321 | |
| 4322 | TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 |
| 4323 | |
; ---- second half: r0 = r6 + 4 * r1, r6 += 8 * r1, source window from [r2 + 1] ----
| 4324 | lea r0, [r6 + r1 * 4] |
| 4325 | lea r6, [r6 + r1 * 8] |
| 4326 | |
| 4327 | movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] |
| 4328 | pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] |
| 4329 | punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] |
| 4330 | movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x] |
| 4331 | |
| 4332 | pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] |
| 4333 | pmulhrsw m4, m7 |
| 4334 | pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] |
| 4335 | pmulhrsw m5, m7 |
| 4336 | packuswb m4, m5 |
| 4337 | |
| 4338 | palignr m3, m2, 14 |
| 4339 | |
| 4340 | pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] |
| 4341 | pmulhrsw m5, m7 |
| 4342 | pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] |
| 4343 | pmulhrsw m6, m7 |
| 4344 | packuswb m5, m6 |
| 4345 | |
| 4346 | pslldq m2, 1 |
| 4347 | palignr m3, m2, 14 |
| 4348 | |
| 4349 | pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] |
| 4350 | pmulhrsw m6, m7 |
| 4351 | pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] |
| 4352 | pmulhrsw m0, m7 |
| 4353 | packuswb m6, m0 |
| 4354 | |
| 4355 | pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] |
| 4356 | pmulhrsw m1, m7 |
| 4357 | |
| 4358 | pslldq m2, 1 |
| 4359 | palignr m3, m2, 14 |
| 4360 | |
| 4361 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] |
| 4362 | pmulhrsw m0, m7 |
| 4363 | packuswb m1, m0 |
| 4364 | |
| 4365 | TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 |
| 4366 | |
| 4367 | pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] |
| 4368 | pmulhrsw m4, m7 |
| 4369 | |
| 4370 | pslldq m2, 1 |
| 4371 | palignr m3, m2, 14 |
| 4372 | |
| 4373 | pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] |
| 4374 | pmulhrsw m5, m7 |
| 4375 | packuswb m4, m5 |
| 4376 | |
| 4377 | pmaddubsw m5, m3, [r4 + 16] ; [17] |
| 4378 | pmulhrsw m5, m7 |
| 4379 | pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] |
| 4380 | pmulhrsw m6, m7 |
| 4381 | packuswb m5, m6 |
| 4382 | |
| 4383 | pslldq m2, 1 |
| 4384 | palignr m3, m2, 14 |
| 4385 | |
| 4386 | pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] |
| 4387 | pmulhrsw m6, m7 |
| 4388 | pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] |
| 4389 | pmulhrsw m1, m7 |
| 4390 | packuswb m6, m1 |
| 4391 | |
| 4392 | pslldq m2, 1 |
| 4393 | palignr m3, m2, 14 |
| 4394 | |
| 4395 | pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] |
| 4396 | pmulhrsw m1, m7 |
| 4397 | pmaddubsw m3, [r4] ; [16] |
| 4398 | pmulhrsw m3, m7 |
| 4399 | packuswb m1, m3 |
| 4400 | |
| 4401 | TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 |
| 4402 | |
| 4403 | RET |
| 4404 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_22 (SSE4, cglobal 4,7,8)
;
; Uses the same table indices and c_mode16_14 shuffle as intra_pred_ang16_14,
; but with r2/r3 swapped and 0 as the second TRANSPOSE_STORE_8x8 argument.
;
; Register roles, as far as this block shows:
;   r0 - current output pointer (presumably dst - TODO confirm against the C
;        prototype); its entry value is copied to r6 and the second half
;        restarts from r6 + 8
;   r1 - stride; r5 = 3 * r1
;   r2 - 16 bytes loaded and reordered through c_mode16_14 into m2, then fed
;        into m3 from the top of m2 as the source window slides
;   r3 - 16 bytes at [r3] for the first half, [r3 + 1] for the second half
;   r4 - ang_table + 16 * 16, so [r4 + k * 16] is the 16 bytes at
;        ang_table + (16 + k) * 16; the bracketed number on each pmaddubsw
;        line is that index 16 + k
;   m7 - pw_1024, the pmulhrsw multiplier (presumably words of 1024, i.e. a
;        rounded >> 5 - verify against the constant's definition)
;
; Each half runs 16 pmaddubsw/pmulhrsw steps, packs them pairwise with
; packuswb into m4, m5, m6, m1 and hands every group of 8 results to
; TRANSPOSE_STORE_8x8 (macro defined outside this block).
;-----------------------------------------------------------------------------
| 4405 | INIT_XMM sse4 |
| 4406 | cglobal intra_pred_ang16_22, 4,7,8 |
| 4407 | |
| 4408 | lea r4, [ang_table + 16 * 16] |
| 4409 | lea r5, [r1 * 3] ; r5 -> 3 * stride |
| 4410 | mov r6, r0 |
| 4411 | mova m7, [pw_1024] |
| 4412 | |
; ---- first half: source window built from [r3], side bytes from [r2] ----
| 4413 | movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] |
| 4414 | punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] |
| 4415 | punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] |
| 4416 | movu m2, [r2] |
| 4417 | pshufb m2, [c_mode16_14] |
| 4418 | |
| 4419 | palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] |
| 4420 | |
| 4421 | pmaddubsw m4, m5, [r4 + 3 * 16] ; [19] |
| 4422 | pmulhrsw m4, m7 |
| 4423 | pmaddubsw m5, [r4 - 10 * 16] ; [06] |
| 4424 | pmulhrsw m5, m7 |
| 4425 | packuswb m4, m5 |
| 4426 | |
; shift m3 up one byte and insert the top byte of m2 at the bottom
| 4427 | palignr m3, m2, 15 |
| 4428 | |
| 4429 | pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] |
| 4430 | pmulhrsw m5, m7 |
| 4431 | pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] |
| 4432 | pmulhrsw m6, m7 |
| 4433 | packuswb m5, m6 |
| 4434 | |
; shift m3 up two bytes and insert the top two bytes of m2 at the bottom
| 4435 | palignr m3, m2, 14 |
| 4436 | |
| 4437 | pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] |
| 4438 | pmulhrsw m6, m7 |
| 4439 | pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] |
| 4440 | pmulhrsw m0, m7 |
| 4441 | packuswb m6, m0 |
| 4442 | |
| 4443 | pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] |
| 4444 | pmulhrsw m1, m7 |
| 4445 | |
; move the next byte of m2 to the top, then slide m3 by two bytes again
| 4446 | pslldq m2, 1 |
| 4447 | palignr m3, m2, 14 |
| 4448 | |
| 4449 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] |
| 4450 | pmulhrsw m0, m7 |
| 4451 | packuswb m1, m0 |
| 4452 | |
| 4453 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 |
| 4454 | |
| 4455 | pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] |
| 4456 | pmulhrsw m4, m7 |
| 4457 | |
| 4458 | pslldq m2, 1 |
| 4459 | palignr m3, m2, 14 |
| 4460 | |
| 4461 | pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] |
| 4462 | pmulhrsw m5, m7 |
| 4463 | packuswb m4, m5 |
| 4464 | |
| 4465 | pmaddubsw m5, m3, [r4 + 16] ; [17] |
| 4466 | pmulhrsw m5, m7 |
| 4467 | pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] |
| 4468 | pmulhrsw m6, m7 |
| 4469 | packuswb m5, m6 |
| 4470 | |
| 4471 | pslldq m2, 1 |
| 4472 | palignr m3, m2, 14 |
| 4473 | |
| 4474 | pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] |
| 4475 | pmulhrsw m6, m7 |
| 4476 | pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] |
| 4477 | pmulhrsw m1, m7 |
| 4478 | packuswb m6, m1 |
| 4479 | |
| 4480 | pslldq m2, 1 |
| 4481 | palignr m3, m2, 14 |
| 4482 | |
| 4483 | pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] |
| 4484 | pmulhrsw m1, m7 |
| 4485 | pmaddubsw m3, [r4] ; [16] |
| 4486 | pmulhrsw m3, m7 |
| 4487 | packuswb m1, m3 |
| 4488 | |
| 4489 | TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 |
| 4490 | |
; ---- second half: output restarts at r6 + 8, source window from [r3 + 1] ----
| 4491 | lea r0, [r6 + 8] |
| 4492 | |
| 4493 | movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] |
| 4494 | pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] |
| 4495 | punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] |
| 4496 | movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x] |
| 4497 | |
| 4498 | pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] |
| 4499 | pmulhrsw m4, m7 |
| 4500 | pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] |
| 4501 | pmulhrsw m5, m7 |
| 4502 | packuswb m4, m5 |
| 4503 | |
| 4504 | palignr m3, m2, 14 |
| 4505 | |
| 4506 | pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] |
| 4507 | pmulhrsw m5, m7 |
| 4508 | pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] |
| 4509 | pmulhrsw m6, m7 |
| 4510 | packuswb m5, m6 |
| 4511 | |
| 4512 | pslldq m2, 1 |
| 4513 | palignr m3, m2, 14 |
| 4514 | |
| 4515 | pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] |
| 4516 | pmulhrsw m6, m7 |
| 4517 | pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] |
| 4518 | pmulhrsw m0, m7 |
| 4519 | packuswb m6, m0 |
| 4520 | |
| 4521 | pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] |
| 4522 | pmulhrsw m1, m7 |
| 4523 | |
| 4524 | pslldq m2, 1 |
| 4525 | palignr m3, m2, 14 |
| 4526 | |
| 4527 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] |
| 4528 | pmulhrsw m0, m7 |
| 4529 | packuswb m1, m0 |
| 4530 | |
| 4531 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 |
| 4532 | |
| 4533 | pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] |
| 4534 | pmulhrsw m4, m7 |
| 4535 | |
| 4536 | pslldq m2, 1 |
| 4537 | palignr m3, m2, 14 |
| 4538 | |
| 4539 | pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] |
| 4540 | pmulhrsw m5, m7 |
| 4541 | packuswb m4, m5 |
| 4542 | |
| 4543 | pmaddubsw m5, m3, [r4 + 16] ; [17] |
| 4544 | pmulhrsw m5, m7 |
| 4545 | pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] |
| 4546 | pmulhrsw m6, m7 |
| 4547 | packuswb m5, m6 |
| 4548 | |
| 4549 | pslldq m2, 1 |
| 4550 | palignr m3, m2, 14 |
| 4551 | |
| 4552 | pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] |
| 4553 | pmulhrsw m6, m7 |
| 4554 | pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] |
| 4555 | pmulhrsw m1, m7 |
| 4556 | packuswb m6, m1 |
| 4557 | |
| 4558 | pslldq m2, 1 |
| 4559 | palignr m3, m2, 14 |
| 4560 | |
| 4561 | pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] |
| 4562 | pmulhrsw m1, m7 |
| 4563 | pmaddubsw m3, [r4] ; [16] |
| 4564 | pmulhrsw m3, m7 |
| 4565 | packuswb m1, m3 |
| 4566 | |
| 4567 | TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 |
| 4568 | |
| 4569 | RET |
| 4570 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_15 (SSE4, cglobal 4,7,8)
;
; Register roles, as far as this block shows:
;   r0 - current output pointer (presumably dst - TODO confirm against the C
;        prototype); reloaded from r6 + 4 * r1 before the second half
;   r1 - stride; r5 = 3 * r1
;   r2 - 16 bytes at [r2] for the first half, [r2 + 1] for the second half
;   r3 - 16 bytes loaded and reordered through c_mode16_15 into m2, then fed
;        into m3 from the top of m2 as the source window slides
;   r4 - ang_table + 16 * 16, so [r4 + k * 16] is the 16 bytes at
;        ang_table + (16 + k) * 16; the bracketed number on each pmaddubsw
;        line is that index 16 + k
;   r6 - set to r0 + 4 * r1 on entry and advanced by 8 * r1 before the second
;        half
;   m7 - pw_1024, the pmulhrsw multiplier (presumably words of 1024, i.e. a
;        rounded >> 5 - verify against the constant's definition)
;
; Each half runs 16 pmaddubsw/pmulhrsw steps, packs them pairwise with
; packuswb into m4, m5, m6, m1 and hands every group of 8 results to
; TRANSPOSE_STORE_8x8 (macro defined outside this block; invoked here with 1
; as its second argument).
;-----------------------------------------------------------------------------
| 4571 | INIT_XMM sse4 |
| 4572 | cglobal intra_pred_ang16_15, 4,7,8 |
| 4573 | |
| 4574 | lea r4, [ang_table + 16 * 16] |
| 4575 | lea r5, [r1 * 3] ; r5 -> 3 * stride |
| 4576 | lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride |
| 4577 | mova m7, [pw_1024] |
| 4578 | |
; ---- first half: source window built from [r2], side bytes from [r3] ----
| 4579 | movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] |
| 4580 | punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] |
| 4581 | punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] |
| 4582 | movu m2, [r3] |
| 4583 | pshufb m2, [c_mode16_15] |
| 4584 | |
| 4585 | palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] |
| 4586 | |
| 4587 | pmaddubsw m4, [r4 - 16] ; [15] |
| 4588 | pmulhrsw m4, m7 |
| 4589 | |
; shift m3 up one byte and insert the top byte of m2 at the bottom
| 4590 | palignr m3, m2, 15 |
| 4591 | |
| 4592 | pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] |
| 4593 | pmulhrsw m5, m7 |
| 4594 | packuswb m4, m5 |
| 4595 | |
| 4596 | pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] |
| 4597 | pmulhrsw m5, m7 |
| 4598 | |
; shift m3 up two bytes and insert the top two bytes of m2 at the bottom
| 4599 | palignr m3, m2, 14 |
| 4600 | |
| 4601 | pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] |
| 4602 | pmulhrsw m6, m7 |
| 4603 | packuswb m5, m6 |
| 4604 | |
| 4605 | pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] |
| 4606 | pmulhrsw m6, m7 |
| 4607 | |
; move the next byte of m2 to the top, then slide m3 by two bytes again
| 4608 | pslldq m2, 1 |
| 4609 | palignr m3, m2, 14 |
| 4610 | |
| 4611 | pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] |
| 4612 | pmulhrsw m0, m7 |
| 4613 | packuswb m6, m0 |
| 4614 | |
| 4615 | pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] |
| 4616 | pmulhrsw m1, m7 |
| 4617 | |
| 4618 | pslldq m2, 1 |
| 4619 | palignr m3, m2, 14 |
| 4620 | |
| 4621 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] |
| 4622 | pmulhrsw m0, m7 |
| 4623 | packuswb m1, m0 |
| 4624 | |
| 4625 | TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 |
| 4626 | |
| 4627 | pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] |
| 4628 | pmulhrsw m4, m7 |
| 4629 | |
| 4630 | pslldq m2, 1 |
| 4631 | palignr m3, m2, 14 |
| 4632 | |
| 4633 | pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] |
| 4634 | pmulhrsw m5, m7 |
| 4635 | packuswb m4, m5 |
| 4636 | |
| 4637 | pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] |
| 4638 | pmulhrsw m5, m7 |
| 4639 | |
| 4640 | pslldq m2, 1 |
| 4641 | palignr m3, m2, 14 |
| 4642 | |
| 4643 | pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] |
| 4644 | pmulhrsw m6, m7 |
| 4645 | packuswb m5, m6 |
| 4646 | |
| 4647 | pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] |
| 4648 | pmulhrsw m6, m7 |
| 4649 | |
| 4650 | pslldq m2, 1 |
| 4651 | palignr m3, m2, 14 |
| 4652 | |
| 4653 | pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] |
| 4654 | pmulhrsw m1, m7 |
| 4655 | packuswb m6, m1 |
| 4656 | |
| 4657 | pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] |
| 4658 | pmulhrsw m1, m7 |
| 4659 | |
| 4660 | pslldq m2, 1 |
| 4661 | palignr m3, m2, 14 |
| 4662 | |
| 4663 | pmaddubsw m3, [r4] ; [16] |
| 4664 | pmulhrsw m3, m7 |
| 4665 | packuswb m1, m3 |
| 4666 | |
| 4667 | TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 |
| 4668 | |
; ---- second half: r0 = r6 + 4 * r1, r6 += 8 * r1, source window from [r2 + 1] ----
| 4669 | lea r0, [r6 + r1 * 4] |
| 4670 | lea r6, [r6 + r1 * 8] |
| 4671 | |
| 4672 | movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] |
| 4673 | pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] |
| 4674 | punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] |
| 4675 | movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L] |
| 4676 | |
| 4677 | pmaddubsw m4, m3, [r4 - 16] ; [15] |
| 4678 | pmulhrsw m4, m7 |
| 4679 | |
| 4680 | palignr m3, m2, 14 |
| 4681 | |
| 4682 | pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] |
| 4683 | pmulhrsw m5, m7 |
| 4684 | packuswb m4, m5 |
| 4685 | |
| 4686 | pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] |
| 4687 | pmulhrsw m5, m7 |
| 4688 | |
| 4689 | pslldq m2, 1 |
| 4690 | palignr m3, m2, 14 |
| 4691 | |
| 4692 | pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] |
| 4693 | pmulhrsw m6, m7 |
| 4694 | packuswb m5, m6 |
| 4695 | |
| 4696 | pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] |
| 4697 | pmulhrsw m6, m7 |
| 4698 | |
| 4699 | pslldq m2, 1 |
| 4700 | palignr m3, m2, 14 |
| 4701 | |
| 4702 | pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] |
| 4703 | pmulhrsw m0, m7 |
| 4704 | packuswb m6, m0 |
| 4705 | |
| 4706 | pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] |
| 4707 | pmulhrsw m1, m7 |
| 4708 | |
| 4709 | pslldq m2, 1 |
| 4710 | palignr m3, m2, 14 |
| 4711 | |
| 4712 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] |
| 4713 | pmulhrsw m0, m7 |
| 4714 | packuswb m1, m0 |
| 4715 | |
| 4716 | TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 |
| 4717 | |
| 4718 | pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] |
| 4719 | pmulhrsw m4, m7 |
| 4720 | |
| 4721 | pslldq m2, 1 |
| 4722 | palignr m3, m2, 14 |
| 4723 | |
| 4724 | pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] |
| 4725 | pmulhrsw m5, m7 |
| 4726 | packuswb m4, m5 |
| 4727 | |
| 4728 | pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] |
| 4729 | pmulhrsw m5, m7 |
| 4730 | |
| 4731 | pslldq m2, 1 |
| 4732 | palignr m3, m2, 14 |
| 4733 | |
| 4734 | pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] |
| 4735 | pmulhrsw m6, m7 |
| 4736 | packuswb m5, m6 |
| 4737 | |
| 4738 | pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] |
| 4739 | pmulhrsw m6, m7 |
| 4740 | |
| 4741 | pslldq m2, 1 |
| 4742 | palignr m3, m2, 14 |
| 4743 | |
| 4744 | pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] |
| 4745 | pmulhrsw m1, m7 |
| 4746 | packuswb m6, m1 |
| 4747 | |
| 4748 | pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] |
| 4749 | pmulhrsw m1, m7 |
| 4750 | |
| 4751 | pslldq m2, 1 |
| 4752 | palignr m3, m2, 14 |
| 4753 | |
| 4754 | pmaddubsw m3, [r4] ; [16] |
| 4755 | pmulhrsw m3, m7 |
| 4756 | packuswb m1, m3 |
| 4757 | |
| 4758 | TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 |
| 4759 | |
| 4760 | RET |
| 4761 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_21 (SSE4, cglobal 4,7,8)
;
; Uses the same table indices and c_mode16_15 shuffle as intra_pred_ang16_15,
; but with r2/r3 swapped and 0 as the second TRANSPOSE_STORE_8x8 argument.
;
; Register roles, as far as this block shows:
;   r0 - current output pointer (presumably dst - TODO confirm against the C
;        prototype); its entry value is copied to r6 and the second half
;        restarts from r6 + 8
;   r1 - stride; r5 = 3 * r1
;   r2 - 16 bytes loaded and reordered through c_mode16_15 into m2, then fed
;        into m3 from the top of m2 as the source window slides
;   r3 - 16 bytes at [r3] for the first half, [r3 + 1] for the second half
;   r4 - ang_table + 16 * 16, so [r4 + k * 16] is the 16 bytes at
;        ang_table + (16 + k) * 16; the bracketed number on each pmaddubsw
;        line is that index 16 + k
;   m7 - pw_1024, the pmulhrsw multiplier (presumably words of 1024, i.e. a
;        rounded >> 5 - verify against the constant's definition)
;
; Each half runs 16 pmaddubsw/pmulhrsw steps, packs them pairwise with
; packuswb into m4, m5, m6, m1 and hands every group of 8 results to
; TRANSPOSE_STORE_8x8 (macro defined outside this block).
;-----------------------------------------------------------------------------
| 4762 | INIT_XMM sse4 |
| 4763 | cglobal intra_pred_ang16_21, 4,7,8 |
| 4764 | |
| 4765 | lea r4, [ang_table + 16 * 16] |
| 4766 | lea r5, [r1 * 3] ; r5 -> 3 * stride |
| 4767 | mov r6, r0 |
| 4768 | mova m7, [pw_1024] |
| 4769 | |
; ---- first half: source window built from [r3], side bytes from [r2] ----
| 4770 | movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] |
| 4771 | punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] |
| 4772 | punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] |
| 4773 | movu m2, [r2] |
| 4774 | pshufb m2, [c_mode16_15] |
| 4775 | |
| 4776 | palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] |
| 4777 | |
| 4778 | pmaddubsw m4, [r4 - 16] ; [15] |
| 4779 | pmulhrsw m4, m7 |
| 4780 | |
; shift m3 up one byte and insert the top byte of m2 at the bottom
| 4781 | palignr m3, m2, 15 |
| 4782 | |
| 4783 | pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] |
| 4784 | pmulhrsw m5, m7 |
| 4785 | packuswb m4, m5 |
| 4786 | |
| 4787 | pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] |
| 4788 | pmulhrsw m5, m7 |
| 4789 | |
; shift m3 up two bytes and insert the top two bytes of m2 at the bottom
| 4790 | palignr m3, m2, 14 |
| 4791 | |
| 4792 | pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] |
| 4793 | pmulhrsw m6, m7 |
| 4794 | packuswb m5, m6 |
| 4795 | |
| 4796 | pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] |
| 4797 | pmulhrsw m6, m7 |
| 4798 | |
; move the next byte of m2 to the top, then slide m3 by two bytes again
| 4799 | pslldq m2, 1 |
| 4800 | palignr m3, m2, 14 |
| 4801 | |
| 4802 | pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] |
| 4803 | pmulhrsw m0, m7 |
| 4804 | packuswb m6, m0 |
| 4805 | |
| 4806 | pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] |
| 4807 | pmulhrsw m1, m7 |
| 4808 | |
| 4809 | pslldq m2, 1 |
| 4810 | palignr m3, m2, 14 |
| 4811 | |
| 4812 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] |
| 4813 | pmulhrsw m0, m7 |
| 4814 | packuswb m1, m0 |
| 4815 | |
| 4816 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 |
| 4817 | |
| 4818 | pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] |
| 4819 | pmulhrsw m4, m7 |
| 4820 | |
| 4821 | pslldq m2, 1 |
| 4822 | palignr m3, m2, 14 |
| 4823 | |
| 4824 | pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] |
| 4825 | pmulhrsw m5, m7 |
| 4826 | packuswb m4, m5 |
| 4827 | |
| 4828 | pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] |
| 4829 | pmulhrsw m5, m7 |
| 4830 | |
| 4831 | pslldq m2, 1 |
| 4832 | palignr m3, m2, 14 |
| 4833 | |
| 4834 | pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] |
| 4835 | pmulhrsw m6, m7 |
| 4836 | packuswb m5, m6 |
| 4837 | |
| 4838 | pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] |
| 4839 | pmulhrsw m6, m7 |
| 4840 | |
| 4841 | pslldq m2, 1 |
| 4842 | palignr m3, m2, 14 |
| 4843 | |
| 4844 | pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] |
| 4845 | pmulhrsw m1, m7 |
| 4846 | packuswb m6, m1 |
| 4847 | |
| 4848 | pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] |
| 4849 | pmulhrsw m1, m7 |
| 4850 | |
| 4851 | pslldq m2, 1 |
| 4852 | palignr m3, m2, 14 |
| 4853 | |
| 4854 | pmaddubsw m3, [r4] ; [16] |
| 4855 | pmulhrsw m3, m7 |
| 4856 | packuswb m1, m3 |
| 4857 | |
| 4858 | TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 |
| 4859 | |
; ---- second half: output restarts at r6 + 8, source window from [r3 + 1] ----
| 4860 | lea r0, [r6 + 8] |
| 4861 | |
| 4862 | movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] |
| 4863 | pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] |
| 4864 | punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] |
| 4865 | movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L] |
| 4866 | |
| 4867 | pmaddubsw m4, m3, [r4 - 16] ; [15] |
| 4868 | pmulhrsw m4, m7 |
| 4869 | |
| 4870 | palignr m3, m2, 14 |
| 4871 | |
| 4872 | pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] |
| 4873 | pmulhrsw m5, m7 |
| 4874 | packuswb m4, m5 |
| 4875 | |
| 4876 | pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] |
| 4877 | pmulhrsw m5, m7 |
| 4878 | |
| 4879 | pslldq m2, 1 |
| 4880 | palignr m3, m2, 14 |
| 4881 | |
| 4882 | pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] |
| 4883 | pmulhrsw m6, m7 |
| 4884 | packuswb m5, m6 |
| 4885 | |
| 4886 | pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] |
| 4887 | pmulhrsw m6, m7 |
| 4888 | |
| 4889 | pslldq m2, 1 |
| 4890 | palignr m3, m2, 14 |
| 4891 | |
| 4892 | pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] |
| 4893 | pmulhrsw m0, m7 |
| 4894 | packuswb m6, m0 |
| 4895 | |
| 4896 | pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] |
| 4897 | pmulhrsw m1, m7 |
| 4898 | |
| 4899 | pslldq m2, 1 |
| 4900 | palignr m3, m2, 14 |
| 4901 | |
| 4902 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] |
| 4903 | pmulhrsw m0, m7 |
| 4904 | packuswb m1, m0 |
| 4905 | |
| 4906 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 |
| 4907 | |
| 4908 | pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] |
| 4909 | pmulhrsw m4, m7 |
| 4910 | |
| 4911 | pslldq m2, 1 |
| 4912 | palignr m3, m2, 14 |
| 4913 | |
| 4914 | pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] |
| 4915 | pmulhrsw m5, m7 |
| 4916 | packuswb m4, m5 |
| 4917 | |
| 4918 | pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] |
| 4919 | pmulhrsw m5, m7 |
| 4920 | |
| 4921 | pslldq m2, 1 |
| 4922 | palignr m3, m2, 14 |
| 4923 | |
| 4924 | pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] |
| 4925 | pmulhrsw m6, m7 |
| 4926 | packuswb m5, m6 |
| 4927 | |
| 4928 | pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] |
| 4929 | pmulhrsw m6, m7 |
| 4930 | |
| 4931 | pslldq m2, 1 |
| 4932 | palignr m3, m2, 14 |
| 4933 | |
| 4934 | pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] |
| 4935 | pmulhrsw m1, m7 |
| 4936 | packuswb m6, m1 |
| 4937 | |
| 4938 | pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] |
| 4939 | pmulhrsw m1, m7 |
| 4940 | |
| 4941 | pslldq m2, 1 |
| 4942 | palignr m3, m2, 14 |
| 4943 | |
| 4944 | pmaddubsw m3, [r4] ; [16] |
| 4945 | pmulhrsw m3, m7 |
| 4946 | packuswb m1, m3 |
| 4947 | |
| 4948 | TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 |
| 4949 | |
| 4950 | RET |
| 4951 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_16 (SSE4, cglobal 4,7,8)
;
; Register roles, as far as this block shows:
;   r0 - current output pointer (presumably dst - TODO confirm against the C
;        prototype); reloaded from r6 + 4 * r1 before the second half
;   r1 - stride; r5 = 3 * r1
;   r2 - 16 bytes at [r2] for the first half, [r2 + 1] for the second half
;   r3 - 16 bytes loaded and reordered through c_mode16_16 into m2, then fed
;        into m3 from the top of m2 as the source window slides
;   r4 - ang_table + 16 * 16, so [r4 + k * 16] is the 16 bytes at
;        ang_table + (16 + k) * 16; the bracketed number on each pmaddubsw
;        line is that index 16 + k
;   r6 - set to r0 + 4 * r1 on entry and advanced by 8 * r1 before the second
;        half
;   m7 - pw_1024, the pmulhrsw multiplier (presumably words of 1024, i.e. a
;        rounded >> 5 - verify against the constant's definition)
;
; Each half runs 16 pmaddubsw/pmulhrsw steps, packs them pairwise with
; packuswb into m4, m5, m6, m1 and hands every group of 8 results to
; TRANSPOSE_STORE_8x8 (macro defined outside this block; invoked here with 1
; as its second argument). Unlike the other functions in this group, m2 is
; rotated by 6 bytes before the second half so that bytes left over from the
; first half remain available below the 8 new bytes merged in by movlhps.
;-----------------------------------------------------------------------------
| 4952 | INIT_XMM sse4 |
| 4953 | cglobal intra_pred_ang16_16, 4,7,8 |
| 4954 | |
| 4955 | lea r4, [ang_table + 16 * 16] |
| 4956 | lea r5, [r1 * 3] ; r5 -> 3 * stride |
| 4957 | lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride |
| 4958 | mova m7, [pw_1024] |
| 4959 | |
; ---- first half: source window built from [r2], side bytes from [r3] ----
| 4960 | movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] |
| 4961 | punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] |
| 4962 | punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] |
| 4963 | movu m2, [r3] |
| 4964 | pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8] |
| 4965 | palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] |
| 4966 | |
| 4967 | pmaddubsw m4, [r4 - 5 * 16] ; [11] |
| 4968 | pmulhrsw m4, m7 |
| 4969 | |
; shift m3 up one byte and insert the top byte of m2 at the bottom
| 4970 | palignr m3, m2, 15 |
| 4971 | |
| 4972 | pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] |
| 4973 | pmulhrsw m5, m7 |
| 4974 | packuswb m4, m5 |
| 4975 | |
| 4976 | pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] |
| 4977 | pmulhrsw m5, m7 |
| 4978 | |
; shift m3 up two bytes and insert the top two bytes of m2 at the bottom
| 4979 | palignr m3, m2, 14 |
| 4980 | |
| 4981 | pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] |
| 4982 | pmulhrsw m6, m7 |
| 4983 | packuswb m5, m6 |
| 4984 | |
| 4985 | pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x] |
| 4986 | palignr m3, m2, 14 |
| 4987 | |
| 4988 | pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] |
| 4989 | pmulhrsw m6, m7 |
| 4990 | pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] |
| 4991 | pmulhrsw m0, m7 |
| 4992 | packuswb m6, m0 |
| 4993 | |
| 4994 | pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x] |
| 4995 | palignr m3, m2, 14 |
| 4996 | |
| 4997 | pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] |
| 4998 | pmulhrsw m1, m7 |
| 4999 | |
| 5000 | pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x] |
| 5001 | palignr m3, m2, 14 |
| 5002 | |
| 5003 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] |
| 5004 | pmulhrsw m0, m7 |
| 5005 | packuswb m1, m0 |
| 5006 | |
| 5007 | TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 |
| 5008 | |
| 5009 | pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] |
| 5010 | pmulhrsw m4, m7 |
| 5011 | |
| 5012 | pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x] |
| 5013 | palignr m3, m2, 14 |
| 5014 | |
| 5015 | pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] |
| 5016 | pmulhrsw m5, m7 |
| 5017 | packuswb m4, m5 |
| 5018 | |
| 5019 | pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x] |
| 5020 | palignr m3, m2, 14 |
| 5021 | |
| 5022 | pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] |
| 5023 | pmulhrsw m5, m7 |
| 5024 | pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] |
| 5025 | pmulhrsw m6, m7 |
| 5026 | packuswb m5, m6 |
| 5027 | |
| 5028 | pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x] |
| 5029 | palignr m3, m2, 14 |
| 5030 | |
| 5031 | pmaddubsw m6, m3, [r4 - 16] ; [15] |
| 5032 | pmulhrsw m6, m7 |
| 5033 | |
| 5034 | pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x] |
| 5035 | palignr m3, m2, 14 |
| 5036 | |
| 5037 | pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] |
| 5038 | pmulhrsw m1, m7 |
| 5039 | packuswb m6, m1 |
| 5040 | |
| 5041 | pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] |
| 5042 | pmulhrsw m1, m7 |
| 5043 | |
| 5044 | pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x] |
| 5045 | palignr m3, m2, 14 |
| 5046 | |
| 5047 | pmaddubsw m3, [r4] ; [16] |
| 5048 | pmulhrsw m3, m7 |
| 5049 | packuswb m1, m3 |
| 5050 | |
| 5051 | TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 |
| 5052 | |
; ---- second half: r0 = r6 + 4 * r1, r6 += 8 * r1, source window from [r2 + 1] ----
| 5053 | lea r0, [r6 + r1 * 4] |
| 5054 | lea r6, [r6 + r1 * 8] |
| 5055 | |
| 5056 | movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] |
| 5057 | pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] |
| 5058 | punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] |
| 5059 | palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x] |
| 5060 | movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x] |
| 5061 | |
| 5062 | pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] |
| 5063 | pmulhrsw m4, m7 |
| 5064 | |
| 5065 | palignr m3, m2, 14 |
| 5066 | |
| 5067 | pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] |
| 5068 | pmulhrsw m5, m7 |
| 5069 | packuswb m4, m5 |
| 5070 | |
| 5071 | pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] |
| 5072 | pmulhrsw m5, m7 |
| 5073 | |
| 5074 | pslldq m2, 1 |
| 5075 | palignr m3, m2, 14 |
| 5076 | |
| 5077 | pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] |
| 5078 | pmulhrsw m6, m7 |
| 5079 | packuswb m5, m6 |
| 5080 | |
| 5081 | pslldq m2, 1 |
| 5082 | palignr m3, m2, 14 |
| 5083 | |
| 5084 | pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] |
| 5085 | pmulhrsw m6, m7 |
| 5086 | |
| 5087 | pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] |
| 5088 | pmulhrsw m0, m7 |
| 5089 | packuswb m6, m0 |
| 5090 | |
| 5091 | pslldq m2, 1 |
| 5092 | palignr m3, m2, 14 |
| 5093 | |
| 5094 | pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] |
| 5095 | pmulhrsw m1, m7 |
| 5096 | |
| 5097 | pslldq m2, 1 |
| 5098 | palignr m3, m2, 14 |
| 5099 | |
| 5100 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] |
| 5101 | pmulhrsw m0, m7 |
| 5102 | packuswb m1, m0 |
| 5103 | |
| 5104 | TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 |
| 5105 | |
| 5106 | pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] |
| 5107 | pmulhrsw m4, m7 |
| 5108 | |
| 5109 | pslldq m2, 1 |
| 5110 | palignr m3, m2, 14 |
| 5111 | |
| 5112 | pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] |
| 5113 | pmulhrsw m5, m7 |
| 5114 | packuswb m4, m5 |
| 5115 | |
| 5116 | pslldq m2, 1 |
| 5117 | palignr m3, m2, 14 |
| 5118 | |
| 5119 | pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] |
| 5120 | pmulhrsw m5, m7 |
| 5121 | pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] |
| 5122 | pmulhrsw m6, m7 |
| 5123 | packuswb m5, m6 |
| 5124 | |
| 5125 | pslldq m2, 1 |
| 5126 | palignr m3, m2, 14 |
| 5127 | |
| 5128 | pmaddubsw m6, m3, [r4 - 16] ; [15] |
| 5129 | pmulhrsw m6, m7 |
| 5130 | |
| 5131 | pslldq m2, 1 |
| 5132 | palignr m3, m2, 14 |
| 5133 | |
| 5134 | pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] |
| 5135 | pmulhrsw m1, m7 |
| 5136 | packuswb m6, m1 |
| 5137 | |
| 5138 | pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] |
| 5139 | pmulhrsw m1, m7 |
| 5140 | |
| 5141 | pslldq m2, 1 |
| 5142 | palignr m3, m2, 14 |
| 5143 | |
| 5144 | pmaddubsw m3, [r4] ; [16] |
| 5145 | pmulhrsw m3, m7 |
| 5146 | packuswb m1, m3 |
| 5147 | |
| 5148 | TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 |
| 5149 | |
| 5150 | RET |
| 5151 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_20 (SSE4 build): angular intra prediction kernel; the name
; indicates a 16x16 block and direction mode 20.
; Register roles as used in the body:
;   r0 = destination pointer (copied to r6; the second half restarts at r6 + 8)
;   r1 = destination stride, r5 = 3 * stride
;   r3 = reference read directly and duplicated into byte pairs (m3/m4)
;   r2 = reference gathered through the c_mode16_16 shuffle into m2 and fed
;        into m3 a sample at a time with pslldq/palignr
;   r4 = ang_table + 16 * 16, so [r4 + k * 16] addresses table entry 16 + k;
;        the bracketed number after each pmaddubsw is that entry index
;   m7 = pw_1024, the pmulhrsw multiplier applied after every pmaddubsw
; Each pmaddubsw/pmulhrsw pair yields 8 word results; two of them are packed
; per register, and four registers are handed to TRANSPOSE_STORE_8x8 (macro
; defined outside this chunk) at a time.
; NOTE(review): pw_1024 is presumably 1024 in every word lane, which would
; make pmulhrsw a rounded shift right by 5 - confirm at its definition.
; NOTE(review): r2/r3 presumably are refLeft/refAbove as in the
; intraPredAng32 prototype comment later in this file - confirm.
;-----------------------------------------------------------------------------
| 5152 | INIT_XMM sse4 |
| 5153 | cglobal intra_pred_ang16_20, 4,7,8 |
| 5154 | |
| 5155 | lea r4, [ang_table + 16 * 16] |
| 5156 | lea r5, [r1 * 3] ; r5 -> 3 * stride |
| 5157 | mov r6, r0 |
| 5158 | mova m7, [pw_1024] |
| 5159 | |
; ---- first half: results are stored starting at r0 = dst ----
| 5160 | movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] |
| 5161 | punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] |
| 5162 | punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] |
| 5163 | movu m2, [r2] |
| 5164 | pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8] |
| 5165 | palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] |
| 5166 | |
| 5167 | pmaddubsw m4, [r4 - 5 * 16] ; [11] |
| 5168 | pmulhrsw m4, m7 |
| 5169 | |
| 5170 | palignr m3, m2, 15 |
| 5171 | |
| 5172 | pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] |
| 5173 | pmulhrsw m5, m7 |
| 5174 | packuswb m4, m5 |
| 5175 | |
| 5176 | pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] |
| 5177 | pmulhrsw m5, m7 |
| 5178 | |
| 5179 | palignr m3, m2, 14 |
| 5180 | |
| 5181 | pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] |
| 5182 | pmulhrsw m6, m7 |
| 5183 | packuswb m5, m6 |
| 5184 | |
| 5185 | pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x] |
| 5186 | palignr m3, m2, 14 |
| 5187 | |
| 5188 | pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] |
| 5189 | pmulhrsw m6, m7 |
| 5190 | pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] |
| 5191 | pmulhrsw m0, m7 |
| 5192 | packuswb m6, m0 |
| 5193 | |
| 5194 | pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x] |
| 5195 | palignr m3, m2, 14 |
| 5196 | |
| 5197 | pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] |
| 5198 | pmulhrsw m1, m7 |
| 5199 | |
| 5200 | pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x] |
| 5201 | palignr m3, m2, 14 |
| 5202 | |
| 5203 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] |
| 5204 | pmulhrsw m0, m7 |
| 5205 | packuswb m1, m0 |
| 5206 | |
| 5207 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 |
| 5208 | |
; m3 still holds the sample pairs of the previous line; it is reused with a
; different weight entry before the next sample is shifted in.
| 5209 | pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] |
| 5210 | pmulhrsw m4, m7 |
| 5211 | |
| 5212 | pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x] |
| 5213 | palignr m3, m2, 14 |
| 5214 | |
| 5215 | pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] |
| 5216 | pmulhrsw m5, m7 |
| 5217 | packuswb m4, m5 |
| 5218 | |
| 5219 | pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x] |
| 5220 | palignr m3, m2, 14 |
| 5221 | |
| 5222 | pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] |
| 5223 | pmulhrsw m5, m7 |
| 5224 | pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] |
| 5225 | pmulhrsw m6, m7 |
| 5226 | packuswb m5, m6 |
| 5227 | |
| 5228 | pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x] |
| 5229 | palignr m3, m2, 14 |
| 5230 | |
| 5231 | pmaddubsw m6, m3, [r4 - 16] ; [15] |
| 5232 | pmulhrsw m6, m7 |
| 5233 | |
| 5234 | pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x] |
| 5235 | palignr m3, m2, 14 |
| 5236 | |
| 5237 | pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] |
| 5238 | pmulhrsw m1, m7 |
| 5239 | packuswb m6, m1 |
| 5240 | |
| 5241 | pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] |
| 5242 | pmulhrsw m1, m7 |
| 5243 | |
| 5244 | pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x] |
| 5245 | palignr m3, m2, 14 |
| 5246 | |
| 5247 | pmaddubsw m3, [r4] ; [16] |
| 5248 | pmulhrsw m3, m7 |
| 5249 | packuswb m1, m3 |
| 5250 | |
| 5251 | TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 |
| 5252 | |
; ---- second half: restart at the saved destination + 8 bytes ----
| 5253 | lea r0, [r6 + 8] |
| 5254 | |
; Rebuild the working registers for the second half: m3 gets byte pairs from
; [r3 + 1], and m2 is rotated so that the projected samples still pending sit
; directly below the freshly loaded ones.
| 5255 | movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] |
| 5256 | pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] |
| 5257 | punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] |
| 5258 | palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x] |
| 5259 | movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x] |
| 5260 | |
| 5261 | pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] |
| 5262 | pmulhrsw m4, m7 |
| 5263 | |
| 5264 | palignr m3, m2, 14 |
| 5265 | |
| 5266 | pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] |
| 5267 | pmulhrsw m5, m7 |
| 5268 | packuswb m4, m5 |
| 5269 | |
| 5270 | pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] |
| 5271 | pmulhrsw m5, m7 |
| 5272 | |
| 5273 | pslldq m2, 1 |
| 5274 | palignr m3, m2, 14 |
| 5275 | |
| 5276 | pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] |
| 5277 | pmulhrsw m6, m7 |
| 5278 | packuswb m5, m6 |
| 5279 | |
| 5280 | pslldq m2, 1 |
| 5281 | palignr m3, m2, 14 |
| 5282 | |
| 5283 | pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] |
| 5284 | pmulhrsw m6, m7 |
| 5285 | |
| 5286 | pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] |
| 5287 | pmulhrsw m0, m7 |
| 5288 | packuswb m6, m0 |
| 5289 | |
| 5290 | pslldq m2, 1 |
| 5291 | palignr m3, m2, 14 |
| 5292 | |
| 5293 | pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] |
| 5294 | pmulhrsw m1, m7 |
| 5295 | |
| 5296 | pslldq m2, 1 |
| 5297 | palignr m3, m2, 14 |
| 5298 | |
| 5299 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] |
| 5300 | pmulhrsw m0, m7 |
| 5301 | packuswb m1, m0 |
| 5302 | |
| 5303 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 |
| 5304 | |
| 5305 | pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] |
| 5306 | pmulhrsw m4, m7 |
| 5307 | |
| 5308 | pslldq m2, 1 |
| 5309 | palignr m3, m2, 14 |
| 5310 | |
| 5311 | pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] |
| 5312 | pmulhrsw m5, m7 |
| 5313 | packuswb m4, m5 |
| 5314 | |
| 5315 | pslldq m2, 1 |
| 5316 | palignr m3, m2, 14 |
| 5317 | |
| 5318 | pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] |
| 5319 | pmulhrsw m5, m7 |
| 5320 | pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] |
| 5321 | pmulhrsw m6, m7 |
| 5322 | packuswb m5, m6 |
| 5323 | |
| 5324 | pslldq m2, 1 |
| 5325 | palignr m3, m2, 14 |
| 5326 | |
| 5327 | pmaddubsw m6, m3, [r4 - 16] ; [15] |
| 5328 | pmulhrsw m6, m7 |
| 5329 | |
| 5330 | pslldq m2, 1 |
| 5331 | palignr m3, m2, 14 |
| 5332 | |
| 5333 | pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] |
| 5334 | pmulhrsw m1, m7 |
| 5335 | packuswb m6, m1 |
| 5336 | |
| 5337 | pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] |
| 5338 | pmulhrsw m1, m7 |
| 5339 | |
| 5340 | pslldq m2, 1 |
| 5341 | palignr m3, m2, 14 |
| 5342 | |
| 5343 | pmaddubsw m3, [r4] ; [16] |
| 5344 | pmulhrsw m3, m7 |
| 5345 | packuswb m1, m3 |
| 5346 | |
| 5347 | TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 |
| 5348 | |
| 5349 | RET |
| 5350 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_17 (SSE4 build): angular intra prediction kernel; the name
; indicates a 16x16 block and direction mode 17.
; Register roles as used in the body:
;   r0 = destination pointer, r6 = r0 + 4 * stride (second store base)
;   r1 = destination stride, r5 = 3 * stride
;   r2 = reference read directly and duplicated into byte pairs (m3/m4)
;   r3 = reference gathered through the c_mode16_17 shuffle into m2; one extra
;        byte, [r3 + 5], is inserted into lane 0 after the first shift
;   r4 = ang_table + 16 * 16, so [r4 + k * 16] addresses table entry 16 + k;
;        the bracketed number after each pmaddubsw is that entry index
;   m7 = pw_1024, the pmulhrsw multiplier applied after every pmaddubsw
; Every TRANSPOSE_STORE_8x8 (macro defined outside this chunk) is invoked with
; its second argument set to 1, unlike the mode 19/20 kernels in this file
; which pass 0.
; NOTE(review): the r2/r3 roles are swapped relative to intra_pred_ang16_19,
; which otherwise uses the same shuffle and weight sequence - presumably the
; horizontal/vertical mirror pair; confirm against the C reference.
;-----------------------------------------------------------------------------
| 5351 | INIT_XMM sse4 |
| 5352 | cglobal intra_pred_ang16_17, 4,7,8 |
| 5353 | |
| 5354 | lea r4, [ang_table + 16 * 16] |
| 5355 | lea r5, [r1 * 3] ; r5 -> 3 * stride |
| 5356 | lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride |
| 5357 | mova m7, [pw_1024] |
| 5358 | |
; ---- first half: store bases are r0 = dst and r6 = dst + 4 * stride ----
| 5359 | movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] |
| 5360 | punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] |
| 5361 | punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] |
| 5362 | movu m2, [r3] |
| 5363 | pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4] |
| 5364 | palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] |
| 5365 | |
| 5366 | pmaddubsw m4, [r4 - 10 * 16] ; [06] |
| 5367 | pmulhrsw m4, m7 |
| 5368 | |
| 5369 | palignr m3, m2, 15 |
| 5370 | |
| 5371 | pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] |
| 5372 | pmulhrsw m5, m7 |
| 5373 | packuswb m4, m5 |
| 5374 | |
| 5375 | palignr m3, m2, 14 |
| 5376 | |
| 5377 | pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] |
| 5378 | pmulhrsw m5, m7 |
| 5379 | |
| 5380 | pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x] |
| 5381 | pinsrb m2, [r3 + 5], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5] |
| 5382 | palignr m3, m2, 14 |
| 5383 | |
| 5384 | pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] |
| 5385 | pmulhrsw m6, m7 |
| 5386 | packuswb m5, m6 |
| 5387 | |
| 5388 | pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x] |
| 5389 | palignr m3, m2, 14 |
| 5390 | |
| 5391 | pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] |
| 5392 | pmulhrsw m6, m7 |
| 5393 | pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] |
| 5394 | pmulhrsw m0, m7 |
| 5395 | packuswb m6, m0 |
| 5396 | |
| 5397 | pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x] |
| 5398 | palignr m3, m2, 14 |
| 5399 | |
| 5400 | pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] |
| 5401 | pmulhrsw m1, m7 |
| 5402 | |
| 5403 | pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x] |
| 5404 | palignr m3, m2, 14 |
| 5405 | |
| 5406 | pmaddubsw m0, m3, [r4] ; [16] |
| 5407 | pmulhrsw m0, m7 |
| 5408 | packuswb m1, m0 |
| 5409 | |
| 5410 | TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 |
| 5411 | |
| 5412 | pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x] |
| 5413 | palignr m3, m2, 14 |
| 5414 | |
| 5415 | pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] |
| 5416 | pmulhrsw m4, m7 |
| 5417 | |
| 5418 | pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x] |
| 5419 | palignr m3, m2, 14 |
| 5420 | |
| 5421 | pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] |
| 5422 | pmulhrsw m5, m7 |
| 5423 | packuswb m4, m5 |
| 5424 | |
| 5425 | pmaddubsw m5, m3, [r4 - 14 * 16] ; [02] |
| 5426 | pmulhrsw m5, m7 |
| 5427 | |
| 5428 | pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x] |
| 5429 | palignr m3, m2, 14 |
| 5430 | |
| 5431 | pmaddubsw m6, m3, [r4 - 8 * 16] ; [08] |
| 5432 | pmulhrsw m6, m7 |
| 5433 | packuswb m5, m6 |
| 5434 | |
| 5435 | pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x] |
| 5436 | palignr m3, m2, 14 |
| 5437 | |
| 5438 | pmaddubsw m6, m3, [r4 - 2 * 16] ; [14] |
| 5439 | pmulhrsw m6, m7 |
| 5440 | |
| 5441 | pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x] |
| 5442 | palignr m3, m2, 14 |
| 5443 | |
| 5444 | pmaddubsw m1, m3, [r4 + 4 * 16] ; [20] |
| 5445 | pmulhrsw m1, m7 |
| 5446 | packuswb m6, m1 |
| 5447 | |
| 5448 | pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x] |
| 5449 | palignr m3, m2, 14 |
| 5450 | |
| 5451 | pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] |
| 5452 | pmulhrsw m1, m7 |
| 5453 | pmaddubsw m3, [r4 - 16 * 16] ; [00] |
| 5454 | pmulhrsw m3, m7 |
| 5455 | packuswb m1, m3 |
| 5456 | |
| 5457 | TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 |
| 5458 | |
; ---- second half: r0 = dst + 8 * stride, r6 = dst + 12 * stride ----
| 5459 | lea r0, [r6 + r1 * 4] |
| 5460 | lea r6, [r6 + r1 * 8] |
| 5461 | |
; Rebuild the working registers for the second half: m3 gets byte pairs from
; [r2 + 1], and m2 is rotated so that the projected samples still pending sit
; directly below the freshly loaded ones.
| 5462 | movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] |
| 5463 | pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] |
| 5464 | punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] |
| 5465 | palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x] |
| 5466 | movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x] |
| 5467 | |
| 5468 | pmaddubsw m4, m3, [r4 - 10 * 16] ; [06] |
| 5469 | pmulhrsw m4, m7 |
| 5470 | |
| 5471 | palignr m3, m2, 14 |
| 5472 | |
| 5473 | pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] |
| 5474 | pmulhrsw m5, m7 |
| 5475 | packuswb m4, m5 |
| 5476 | |
| 5477 | pslldq m2, 1 |
| 5478 | palignr m3, m2, 14 |
| 5479 | |
| 5480 | pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] |
| 5481 | pmulhrsw m5, m7 |
| 5482 | |
| 5483 | pslldq m2, 1 |
| 5484 | palignr m3, m2, 14 |
| 5485 | |
| 5486 | pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] |
| 5487 | pmulhrsw m6, m7 |
| 5488 | packuswb m5, m6 |
| 5489 | |
| 5490 | pslldq m2, 1 |
| 5491 | palignr m3, m2, 14 |
| 5492 | |
| 5493 | pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] |
| 5494 | pmulhrsw m6, m7 |
| 5495 | pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] |
| 5496 | pmulhrsw m0, m7 |
| 5497 | packuswb m6, m0 |
| 5498 | |
| 5499 | pslldq m2, 1 |
| 5500 | palignr m3, m2, 14 |
| 5501 | |
| 5502 | pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] |
| 5503 | pmulhrsw m1, m7 |
| 5504 | |
| 5505 | pslldq m2, 1 |
| 5506 | palignr m3, m2, 14 |
| 5507 | |
| 5508 | pmaddubsw m0, m3, [r4] ; [16] |
| 5509 | pmulhrsw m0, m7 |
| 5510 | packuswb m1, m0 |
| 5511 | |
| 5512 | TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 |
| 5513 | |
| 5514 | pslldq m2, 1 |
| 5515 | palignr m3, m2, 14 |
| 5516 | |
| 5517 | pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] |
| 5518 | pmulhrsw m4, m7 |
| 5519 | |
| 5520 | pslldq m2, 1 |
| 5521 | palignr m3, m2, 14 |
| 5522 | |
| 5523 | pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] |
| 5524 | pmulhrsw m5, m7 |
| 5525 | packuswb m4, m5 |
| 5526 | |
| 5527 | pmaddubsw m5, m3, [r4 - 14 * 16] ; [02] |
| 5528 | pmulhrsw m5, m7 |
| 5529 | |
| 5530 | pslldq m2, 1 |
| 5531 | palignr m3, m2, 14 |
| 5532 | |
| 5533 | pmaddubsw m6, m3, [r4 - 8 * 16] ; [08] |
| 5534 | pmulhrsw m6, m7 |
| 5535 | packuswb m5, m6 |
| 5536 | |
| 5537 | pslldq m2, 1 |
| 5538 | palignr m3, m2, 14 |
| 5539 | |
| 5540 | pmaddubsw m6, m3, [r4 - 2 * 16] ; [14] |
| 5541 | pmulhrsw m6, m7 |
| 5542 | |
| 5543 | pslldq m2, 1 |
| 5544 | palignr m3, m2, 14 |
| 5545 | |
| 5546 | pmaddubsw m1, m3, [r4 + 4 * 16] ; [20] |
| 5547 | pmulhrsw m1, m7 |
| 5548 | packuswb m6, m1 |
| 5549 | |
| 5550 | pslldq m2, 1 |
| 5551 | palignr m3, m2, 14 |
| 5552 | |
| 5553 | pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] |
| 5554 | pmulhrsw m1, m7 |
| 5555 | pmaddubsw m3, [r4 - 16 * 16] ; [00] |
| 5556 | pmulhrsw m3, m7 |
| 5557 | packuswb m1, m3 |
| 5558 | |
| 5559 | TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 |
| 5560 | |
| 5561 | RET |
| 5562 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_19 (SSE4 build): angular intra prediction kernel; the name
; indicates a 16x16 block and direction mode 19.
; Register roles as used in the body:
;   r0 = destination pointer (copied to r6; the second half restarts at r6 + 8)
;   r1 = destination stride, r5 = 3 * stride
;   r3 = reference read directly and duplicated into byte pairs (m3/m4)
;   r2 = reference gathered through the c_mode16_17 shuffle into m2; one extra
;        byte, [r2 + 5], is inserted into lane 0 after the first shift
;   r4 = ang_table + 16 * 16, so [r4 + k * 16] addresses table entry 16 + k;
;        the bracketed number after each pmaddubsw is that entry index
;   m7 = pw_1024, the pmulhrsw multiplier applied after every pmaddubsw
; The shuffle table and the weight sequence are the same as in
; intra_pred_ang16_17 with r2/r3 exchanged; here TRANSPOSE_STORE_8x8 (macro
; defined outside this chunk) is invoked with its second argument set to 0.
;-----------------------------------------------------------------------------
| 5563 | INIT_XMM sse4 |
| 5564 | cglobal intra_pred_ang16_19, 4,7,8 |
| 5565 | |
| 5566 | lea r4, [ang_table + 16 * 16] |
| 5567 | lea r5, [r1 * 3] ; r5 -> 3 * stride |
| 5568 | mov r6, r0 |
| 5569 | mova m7, [pw_1024] |
| 5570 | |
; ---- first half: results are stored starting at r0 = dst ----
| 5571 | movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] |
| 5572 | punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] |
| 5573 | punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] |
| 5574 | movu m2, [r2] |
| 5575 | pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4] |
| 5576 | palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] |
| 5577 | |
| 5578 | pmaddubsw m4, [r4 - 10 * 16] ; [06] |
| 5579 | pmulhrsw m4, m7 |
| 5580 | |
| 5581 | palignr m3, m2, 15 |
| 5582 | |
| 5583 | pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] |
| 5584 | pmulhrsw m5, m7 |
| 5585 | packuswb m4, m5 |
| 5586 | |
| 5587 | palignr m3, m2, 14 |
| 5588 | |
| 5589 | pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] |
| 5590 | pmulhrsw m5, m7 |
| 5591 | |
| 5592 | pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x] |
| 5593 | pinsrb m2, [r2 + 5], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5] |
| 5594 | palignr m3, m2, 14 |
| 5595 | |
| 5596 | pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] |
| 5597 | pmulhrsw m6, m7 |
| 5598 | packuswb m5, m6 |
| 5599 | |
| 5600 | pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x] |
| 5601 | palignr m3, m2, 14 |
| 5602 | |
| 5603 | pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] |
| 5604 | pmulhrsw m6, m7 |
| 5605 | pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] |
| 5606 | pmulhrsw m0, m7 |
| 5607 | packuswb m6, m0 |
| 5608 | |
| 5609 | pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x] |
| 5610 | palignr m3, m2, 14 |
| 5611 | |
| 5612 | pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] |
| 5613 | pmulhrsw m1, m7 |
| 5614 | |
| 5615 | pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x] |
| 5616 | palignr m3, m2, 14 |
| 5617 | |
| 5618 | pmaddubsw m0, m3, [r4] ; [16] |
| 5619 | pmulhrsw m0, m7 |
| 5620 | packuswb m1, m0 |
| 5621 | |
| 5622 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 |
| 5623 | |
| 5624 | pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x] |
| 5625 | palignr m3, m2, 14 |
| 5626 | |
| 5627 | pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] |
| 5628 | pmulhrsw m4, m7 |
| 5629 | |
| 5630 | pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x] |
| 5631 | palignr m3, m2, 14 |
| 5632 | |
| 5633 | pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] |
| 5634 | pmulhrsw m5, m7 |
| 5635 | packuswb m4, m5 |
| 5636 | |
| 5637 | pmaddubsw m5, m3, [r4 - 14 * 16] ; [02] |
| 5638 | pmulhrsw m5, m7 |
| 5639 | |
| 5640 | pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x] |
| 5641 | palignr m3, m2, 14 |
| 5642 | |
| 5643 | pmaddubsw m6, m3, [r4 - 8 * 16] ; [08] |
| 5644 | pmulhrsw m6, m7 |
| 5645 | packuswb m5, m6 |
| 5646 | |
| 5647 | pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x] |
| 5648 | palignr m3, m2, 14 |
| 5649 | |
| 5650 | pmaddubsw m6, m3, [r4 - 2 * 16] ; [14] |
| 5651 | pmulhrsw m6, m7 |
| 5652 | |
| 5653 | pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x] |
| 5654 | palignr m3, m2, 14 |
| 5655 | |
| 5656 | pmaddubsw m1, m3, [r4 + 4 * 16] ; [20] |
| 5657 | pmulhrsw m1, m7 |
| 5658 | packuswb m6, m1 |
| 5659 | |
| 5660 | pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x] |
| 5661 | palignr m3, m2, 14 |
| 5662 | |
| 5663 | pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] |
| 5664 | pmulhrsw m1, m7 |
| 5665 | pmaddubsw m3, [r4 - 16 * 16] ; [00] |
| 5666 | pmulhrsw m3, m7 |
| 5667 | packuswb m1, m3 |
| 5668 | |
| 5669 | TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 |
| 5670 | |
; ---- second half: restart at the saved destination + 8 bytes ----
| 5671 | lea r0, [r6 + 8] |
| 5672 | |
; Rebuild the working registers for the second half: m3 gets byte pairs from
; [r3 + 1], and m2 is rotated so that the projected samples still pending sit
; directly below the freshly loaded ones.
| 5673 | movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] |
| 5674 | pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] |
| 5675 | punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] |
| 5676 | palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x] |
| 5677 | movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x] |
| 5678 | |
| 5679 | pmaddubsw m4, m3, [r4 - 10 * 16] ; [06] |
| 5680 | pmulhrsw m4, m7 |
| 5681 | |
| 5682 | palignr m3, m2, 14 |
| 5683 | |
| 5684 | pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] |
| 5685 | pmulhrsw m5, m7 |
| 5686 | packuswb m4, m5 |
| 5687 | |
| 5688 | pslldq m2, 1 |
| 5689 | palignr m3, m2, 14 |
| 5690 | |
| 5691 | pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] |
| 5692 | pmulhrsw m5, m7 |
| 5693 | |
| 5694 | pslldq m2, 1 |
| 5695 | palignr m3, m2, 14 |
| 5696 | |
| 5697 | pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] |
| 5698 | pmulhrsw m6, m7 |
| 5699 | packuswb m5, m6 |
| 5700 | |
| 5701 | pslldq m2, 1 |
| 5702 | palignr m3, m2, 14 |
| 5703 | |
| 5704 | pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] |
| 5705 | pmulhrsw m6, m7 |
| 5706 | pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] |
| 5707 | pmulhrsw m0, m7 |
| 5708 | packuswb m6, m0 |
| 5709 | |
| 5710 | pslldq m2, 1 |
| 5711 | palignr m3, m2, 14 |
| 5712 | |
| 5713 | pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] |
| 5714 | pmulhrsw m1, m7 |
| 5715 | |
| 5716 | pslldq m2, 1 |
| 5717 | palignr m3, m2, 14 |
| 5718 | |
| 5719 | pmaddubsw m0, m3, [r4] ; [16] |
| 5720 | pmulhrsw m0, m7 |
| 5721 | packuswb m1, m0 |
| 5722 | |
| 5723 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 |
| 5724 | |
| 5725 | pslldq m2, 1 |
| 5726 | palignr m3, m2, 14 |
| 5727 | |
| 5728 | pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] |
| 5729 | pmulhrsw m4, m7 |
| 5730 | |
| 5731 | pslldq m2, 1 |
| 5732 | palignr m3, m2, 14 |
| 5733 | |
| 5734 | pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] |
| 5735 | pmulhrsw m5, m7 |
| 5736 | packuswb m4, m5 |
| 5737 | |
| 5738 | pmaddubsw m5, m3, [r4 - 14 * 16] ; [02] |
| 5739 | pmulhrsw m5, m7 |
| 5740 | |
| 5741 | pslldq m2, 1 |
| 5742 | palignr m3, m2, 14 |
| 5743 | |
| 5744 | pmaddubsw m6, m3, [r4 - 8 * 16] ; [08] |
| 5745 | pmulhrsw m6, m7 |
| 5746 | packuswb m5, m6 |
| 5747 | |
| 5748 | pslldq m2, 1 |
| 5749 | palignr m3, m2, 14 |
| 5750 | |
| 5751 | pmaddubsw m6, m3, [r4 - 2 * 16] ; [14] |
| 5752 | pmulhrsw m6, m7 |
| 5753 | |
| 5754 | pslldq m2, 1 |
| 5755 | palignr m3, m2, 14 |
| 5756 | |
| 5757 | pmaddubsw m1, m3, [r4 + 4 * 16] ; [20] |
| 5758 | pmulhrsw m1, m7 |
| 5759 | packuswb m6, m1 |
| 5760 | |
| 5761 | pslldq m2, 1 |
| 5762 | palignr m3, m2, 14 |
| 5763 | |
| 5764 | pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] |
| 5765 | pmulhrsw m1, m7 |
| 5766 | pmaddubsw m3, [r4 - 16 * 16] ; [00] |
| 5767 | pmulhrsw m3, m7 |
| 5768 | packuswb m1, m3 |
| 5769 | |
| 5770 | TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 |
| 5771 | |
| 5772 | RET |
| 5773 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_18 (SSE4 build): angular intra prediction kernel; the name
; indicates a 16x16 block and direction mode 18. No weighting is performed:
; every output row is a 16-byte window of the register pair m0:m1.
;   r0 = destination pointer, advanced by 4 * stride after every four rows
;   r1 = destination stride
;   m0 = 16 bytes loaded from [r3]
;   m1 = 16 bytes loaded from [r2] and reordered by the c_mode16_18 shuffle
;   r2, r3, r4 are reused as 2 *, 3 * and 4 * stride once the loads are done
; Row 0 is m0 itself; row k (k = 1..15) is palignr(m0:m1, 16 - k), i.e. each
; further row slides the window one byte deeper into m1.
;-----------------------------------------------------------------------------
| 5774 | INIT_XMM sse4 |
| 5775 | cglobal intra_pred_ang16_18, 4,5,3 |
| 5776 | |
| 5777 | movu m0, [r3] |
| 5778 | movu m1, [r2] |
| 5779 | mova m2, [c_mode16_18] |
| 5780 | pshufb m1, m2 |
| 5781 | |
; The reference pointers are no longer needed; reuse them as stride multiples.
| 5782 | lea r2, [r1 * 2] |
| 5783 | lea r3, [r1 * 3] |
| 5784 | lea r4, [r1 * 4] |
; rows 0-3
| 5785 | movu [r0], m0 |
| 5786 | palignr m2, m0, m1, 15 |
| 5787 | movu [r0 + r1], m2 |
| 5788 | palignr m2, m0, m1, 14 |
| 5789 | movu [r0 + r2], m2 |
| 5790 | palignr m2, m0, m1, 13 |
| 5791 | movu [r0 + r3], m2 |
; rows 4-7
| 5792 | lea r0, [r0 + r4] |
| 5793 | palignr m2, m0, m1, 12 |
| 5794 | movu [r0], m2 |
| 5795 | palignr m2, m0, m1, 11 |
| 5796 | movu [r0 + r1], m2 |
| 5797 | palignr m2, m0, m1, 10 |
| 5798 | movu [r0 + r2], m2 |
| 5799 | palignr m2, m0, m1, 9 |
| 5800 | movu [r0 + r3], m2 |
; rows 8-11
| 5801 | lea r0, [r0 + r4] |
| 5802 | palignr m2, m0, m1, 8 |
| 5803 | movu [r0], m2 |
| 5804 | palignr m2, m0, m1, 7 |
| 5805 | movu [r0 + r1], m2 |
| 5806 | palignr m2, m0, m1, 6 |
| 5807 | movu [r0 + r2], m2 |
| 5808 | palignr m2, m0, m1, 5 |
| 5809 | movu [r0 + r3], m2 |
; rows 12-15
| 5810 | lea r0, [r0 + r4] |
| 5811 | palignr m2, m0, m1, 4 |
| 5812 | movu [r0], m2 |
| 5813 | palignr m2, m0, m1, 3 |
| 5814 | movu [r0 + r1], m2 |
| 5815 | palignr m2, m0, m1, 2 |
| 5816 | movu [r0 + r2], m2 |
; The last row may overwrite m0 in place because it is not read again.
| 5817 | palignr m0, m1, 1 |
| 5818 | movu [r0 + r3], m0 |
| 5819 | RET |
| 5820 | |
| 5821 | ;--------------------------------------------------------------------------------------------------------------- |
| 5822 | ; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) |
| 5823 | ;--------------------------------------------------------------------------------------------------------------- |
; intra_pred_ang32_2 (SSSE3 build): 32-byte-wide, 32-row prediction with no
; weighting; each row is a plain copy of consecutive reference bytes.
;   r0 = dst (advanced by 4 * stride after every four rows), r1 = dstStride
;   r2 = reference pointer: the third argument by default, replaced by the
;        fourth argument (r3mp) when the fifth argument (r4m) equals 34
;   r3 = 3 * stride (only three arguments are loaded into registers, so r3 is
;        free to be used as scratch)
; Row k (k = 0..31) receives the 32 bytes starting at reference + 2 + k.
; Rows 0-15 are cut out of m0:m1:m3 (reference + 2, + 18, + 34); rows 16-31
; are cut out of m1:m3:m0 after m0 is reloaded from reference + 50.
| 5824 | INIT_XMM ssse3 |
| 5825 | cglobal intra_pred_ang32_2, 3,4,4 |
| 5826 | cmp r4m, byte 34 |
| 5827 | cmove r2, r3mp |
| 5828 | movu m0, [r2 + 2] |
| 5829 | movu m1, [r2 + 18] |
| 5830 | movu m3, [r2 + 34] |
| 5831 | |
| 5832 | lea r3, [r1 * 3] |
| 5833 | |
; rows 0-3
| 5834 | movu [r0], m0 |
| 5835 | movu [r0 + 16], m1 |
| 5836 | palignr m2, m1, m0, 1 |
| 5837 | movu [r0 + r1], m2 |
| 5838 | palignr m2, m3, m1, 1 |
| 5839 | movu [r0 + r1 + 16], m2 |
| 5840 | palignr m2, m1, m0, 2 |
| 5841 | movu [r0 + r1 * 2], m2 |
| 5842 | palignr m2, m3, m1, 2 |
| 5843 | movu [r0 + r1 * 2 + 16], m2 |
| 5844 | palignr m2, m1, m0, 3 |
| 5845 | movu [r0 + r3], m2 |
| 5846 | palignr m2, m3, m1, 3 |
| 5847 | movu [r0 + r3 + 16], m2 |
| 5848 | |
; rows 4-7
| 5849 | lea r0, [r0 + r1 * 4] |
| 5850 | |
| 5851 | palignr m2, m1, m0, 4 |
| 5852 | movu [r0], m2 |
| 5853 | palignr m2, m3, m1, 4 |
| 5854 | movu [r0 + 16], m2 |
| 5855 | palignr m2, m1, m0, 5 |
| 5856 | movu [r0 + r1], m2 |
| 5857 | palignr m2, m3, m1, 5 |
| 5858 | movu [r0 + r1 + 16], m2 |
| 5859 | palignr m2, m1, m0, 6 |
| 5860 | movu [r0 + r1 * 2], m2 |
| 5861 | palignr m2, m3, m1, 6 |
| 5862 | movu [r0 + r1 * 2 + 16], m2 |
| 5863 | palignr m2, m1, m0, 7 |
| 5864 | movu [r0 + r3], m2 |
| 5865 | palignr m2, m3, m1, 7 |
| 5866 | movu [r0 + r3 + 16], m2 |
| 5867 | |
; rows 8-11
| 5868 | lea r0, [r0 + r1 * 4] |
| 5869 | |
| 5870 | palignr m2, m1, m0, 8 |
| 5871 | movu [r0], m2 |
| 5872 | palignr m2, m3, m1, 8 |
| 5873 | movu [r0 + 16], m2 |
| 5874 | palignr m2, m1, m0, 9 |
| 5875 | movu [r0 + r1], m2 |
| 5876 | palignr m2, m3, m1, 9 |
| 5877 | movu [r0 + r1 + 16], m2 |
| 5878 | palignr m2, m1, m0, 10 |
| 5879 | movu [r0 + r1 * 2], m2 |
| 5880 | palignr m2, m3, m1, 10 |
| 5881 | movu [r0 + r1 * 2 + 16], m2 |
| 5882 | palignr m2, m1, m0, 11 |
| 5883 | movu [r0 + r3], m2 |
| 5884 | palignr m2, m3, m1, 11 |
| 5885 | movu [r0 + r3 + 16], m2 |
| 5886 | |
; rows 12-15
| 5887 | lea r0, [r0 + r1 * 4] |
| 5888 | |
| 5889 | palignr m2, m1, m0, 12 |
| 5890 | movu [r0], m2 |
| 5891 | palignr m2, m3, m1, 12 |
| 5892 | movu [r0 + 16], m2 |
| 5893 | palignr m2, m1, m0, 13 |
| 5894 | movu [r0 + r1], m2 |
| 5895 | palignr m2, m3, m1, 13 |
| 5896 | movu [r0 + r1 + 16], m2 |
| 5897 | palignr m2, m1, m0, 14 |
| 5898 | movu [r0 + r1 * 2], m2 |
| 5899 | palignr m2, m3, m1, 14 |
| 5900 | movu [r0 + r1 * 2 + 16], m2 |
| 5901 | palignr m2, m1, m0, 15 |
| 5902 | movu [r0 + r3], m2 |
| 5903 | palignr m2, m3, m1, 15 |
| 5904 | movu [r0 + r3 + 16], m2 |
| 5905 | |
; rows 16-19: the window now starts in m1; m0 is recycled for the next 16
; reference bytes (reference + 50).
| 5906 | lea r0, [r0 + r1 * 4] |
| 5907 | |
| 5908 | movu [r0], m1 |
| 5909 | movu m0, [r2 + 50] |
| 5910 | movu [r0 + 16], m3 |
| 5911 | palignr m2, m3, m1, 1 |
| 5912 | movu [r0 + r1], m2 |
| 5913 | palignr m2, m0, m3, 1 |
| 5914 | movu [r0 + r1 + 16], m2 |
| 5915 | palignr m2, m3, m1, 2 |
| 5916 | movu [r0 + r1 * 2], m2 |
| 5917 | palignr m2, m0, m3, 2 |
| 5918 | movu [r0 + r1 * 2 + 16], m2 |
| 5919 | palignr m2, m3, m1, 3 |
| 5920 | movu [r0 + r3], m2 |
| 5921 | palignr m2, m0, m3, 3 |
| 5922 | movu [r0 + r3 + 16], m2 |
| 5923 | |
; rows 20-23
| 5924 | lea r0, [r0 + r1 * 4] |
| 5925 | |
| 5926 | palignr m2, m3, m1, 4 |
| 5927 | movu [r0], m2 |
| 5928 | palignr m2, m0, m3, 4 |
| 5929 | movu [r0 + 16], m2 |
| 5930 | palignr m2, m3, m1, 5 |
| 5931 | movu [r0 + r1], m2 |
| 5932 | palignr m2, m0, m3, 5 |
| 5933 | movu [r0 + r1 + 16], m2 |
| 5934 | palignr m2, m3, m1, 6 |
| 5935 | movu [r0 + r1 * 2], m2 |
| 5936 | palignr m2, m0, m3, 6 |
| 5937 | movu [r0 + r1 * 2 + 16], m2 |
| 5938 | palignr m2, m3, m1, 7 |
| 5939 | movu [r0 + r3], m2 |
| 5940 | palignr m2, m0, m3, 7 |
| 5941 | movu [r0 + r3 + 16], m2 |
| 5942 | |
; rows 24-27
| 5943 | lea r0, [r0 + r1 * 4] |
| 5944 | |
| 5945 | palignr m2, m3, m1, 8 |
| 5946 | movu [r0], m2 |
| 5947 | palignr m2, m0, m3, 8 |
| 5948 | movu [r0 + 16], m2 |
| 5949 | palignr m2, m3, m1, 9 |
| 5950 | movu [r0 + r1], m2 |
| 5951 | palignr m2, m0, m3, 9 |
| 5952 | movu [r0 + r1 + 16], m2 |
| 5953 | palignr m2, m3, m1, 10 |
| 5954 | movu [r0 + r1 * 2], m2 |
| 5955 | palignr m2, m0, m3, 10 |
| 5956 | movu [r0 + r1 * 2 + 16], m2 |
| 5957 | palignr m2, m3, m1, 11 |
| 5958 | movu [r0 + r3], m2 |
| 5959 | palignr m2, m0, m3, 11 |
| 5960 | movu [r0 + r3 + 16], m2 |
| 5961 | |
; rows 28-31
| 5962 | lea r0, [r0 + r1 * 4] |
| 5963 | |
| 5964 | palignr m2, m3, m1, 12 |
| 5965 | movu [r0], m2 |
| 5966 | palignr m2, m0, m3, 12 |
| 5967 | movu [r0 + 16], m2 |
| 5968 | palignr m2, m3, m1, 13 |
| 5969 | movu [r0 + r1], m2 |
| 5970 | palignr m2, m0, m3, 13 |
| 5971 | movu [r0 + r1 + 16], m2 |
| 5972 | palignr m2, m3, m1, 14 |
| 5973 | movu [r0 + r1 * 2], m2 |
| 5974 | palignr m2, m0, m3, 14 |
| 5975 | movu [r0 + r1 * 2 + 16], m2 |
| 5976 | palignr m2, m3, m1, 15 |
| 5977 | movu [r0 + r3], m2 |
| 5978 | palignr m2, m0, m3, 15 |
| 5979 | movu [r0 + r3 + 16], m2 |
| 5980 | RET |
| 5981 | |
; PROC32_8x8 col, transpose, c0, c1, c2, c3, c4, c5, c6, c7
;   %1       column-group index; only the transposed store uses it, as a byte
;            offset of %1 * 8 from the row pointers
;   %2       1 = transpose the 8x8 result and store it through r0 / r6,
;            anything else = store the eight lines in order starting at r0
;            (r0 is advanced by 4 * r1 after the fourth line)
;   %3..%10  weight selector for m0..m7 respectively. Non-zero: the register
;            is reordered with pshufb [r3], multiplied with pmaddubsw against
;            [r4 + c * 16] and scaled with pmulhrsw by pw_1024. Zero: the
;            register is passed through unfiltered (odd registers are widened
;            with pmovzxbw so the following packuswb restores their bytes).
; Expects r1 = stride, r3 = address of the pshufb control, r4 = base of the
; weight table; r5 and r6 are used as extra row offsets/pointers by the
; stores (presumably 3 * stride and dst + 4 * stride - set up by the caller,
; not visible here). Results are packed pairwise into m0, m2, m4, m6; m1 is
; overwritten with pw_1024 after the first pair.
| 5982 | ; Process Intra32x32, input 8x8 in [m0, m1, m2, m3, m4, m5, m6, m7], output 8x8 |
| 5983 | %macro PROC32_8x8 10 ; col4, transpose[0/1] c0, c1, c2, c3, c4, c5, c6, c7 |
; lines 0/1 -> m0 (pw_1024 is read from memory here because m1 still holds data)
| 5984 | %if %3 == 0 |
| 5985 | %else |
| 5986 | pshufb m0, [r3] |
| 5987 | pmaddubsw m0, [r4 + %3 * 16] |
| 5988 | pmulhrsw m0, [pw_1024] |
| 5989 | %endif |
| 5990 | %if %4 == 0 |
| 5991 | pmovzxbw m1, m1 |
| 5992 | %else |
| 5993 | pshufb m1, [r3] |
| 5994 | pmaddubsw m1, [r4 + %4 * 16] |
| 5995 | pmulhrsw m1, [pw_1024] |
| 5996 | %endif |
; An unfiltered even line keeps its low 8 bytes; the odd line is packed
; separately and merged into the high half.
| 5997 | %if %3 == 0 |
| 5998 | packuswb m1, m1 |
| 5999 | movlhps m0, m1 |
| 6000 | %else |
| 6001 | packuswb m0, m1 |
| 6002 | %endif |
; m1 is free from here on and caches the pmulhrsw multiplier.
| 6003 | mova m1, [pw_1024] |
; lines 2/3 -> m2
| 6004 | %if %5 == 0 |
| 6005 | %else |
| 6006 | pshufb m2, [r3] |
| 6007 | pmaddubsw m2, [r4 + %5 * 16] |
| 6008 | pmulhrsw m2, m1 |
| 6009 | %endif |
| 6010 | %if %6 == 0 |
| 6011 | pmovzxbw m3, m3 |
| 6012 | %else |
| 6013 | pshufb m3, [r3] |
| 6014 | pmaddubsw m3, [r4 + %6 * 16] |
| 6015 | pmulhrsw m3, m1 |
| 6016 | %endif |
| 6017 | %if %5 == 0 |
| 6018 | packuswb m3, m3 |
| 6019 | movlhps m2, m3 |
| 6020 | %else |
| 6021 | packuswb m2, m3 |
| 6022 | %endif |
; lines 4/5 -> m4
| 6023 | %if %7 == 0 |
| 6024 | %else |
| 6025 | pshufb m4, [r3] |
| 6026 | pmaddubsw m4, [r4 + %7 * 16] |
| 6027 | pmulhrsw m4, m1 |
| 6028 | %endif |
| 6029 | %if %8 == 0 |
| 6030 | pmovzxbw m5, m5 |
| 6031 | %else |
| 6032 | pshufb m5, [r3] |
| 6033 | pmaddubsw m5, [r4 + %8 * 16] |
| 6034 | pmulhrsw m5, m1 |
| 6035 | %endif |
| 6036 | %if %7 == 0 |
| 6037 | packuswb m5, m5 |
| 6038 | movlhps m4, m5 |
| 6039 | %else |
| 6040 | packuswb m4, m5 |
| 6041 | %endif |
; lines 6/7 -> m6
| 6042 | %if %9 == 0 |
| 6043 | %else |
| 6044 | pshufb m6, [r3] |
| 6045 | pmaddubsw m6, [r4 + %9 * 16] |
| 6046 | pmulhrsw m6, m1 |
| 6047 | %endif |
| 6048 | %if %10 == 0 |
| 6049 | pmovzxbw m7, m7 |
| 6050 | %else |
| 6051 | pshufb m7, [r3] |
| 6052 | pmaddubsw m7, [r4 + %10 * 16] |
| 6053 | pmulhrsw m7, m1 |
| 6054 | %endif |
| 6055 | %if %9 == 0 |
| 6056 | packuswb m7, m7 |
| 6057 | movlhps m6, m7 |
| 6058 | %else |
| 6059 | packuswb m6, m7 |
| 6060 | %endif |
| 6061 | |
| 6062 | %if %2 == 1 |
| 6063 | ; transpose |
| 6064 | punpckhbw m1, m0, m2 |
| 6065 | punpcklbw m0, m2 |
| 6066 | punpckhbw m3, m0, m1 |
| 6067 | punpcklbw m0, m1 |
| 6068 | |
| 6069 | punpckhbw m1, m4, m6 |
| 6070 | punpcklbw m4, m6 |
| 6071 | punpckhbw m6, m4, m1 |
| 6072 | punpcklbw m4, m1 |
| 6073 | |
| 6074 | punpckhdq m2, m0, m4 |
| 6075 | punpckldq m0, m4 |
| 6076 | punpckldq m4, m3, m6 |
| 6077 | punpckhdq m3, m6 |
| 6078 | |
; Transposed rows: four via r0 and four via r6, each at byte offset %1 * 8.
| 6079 | movh [r0 + + %1 * 8], m0 |
| 6080 | movhps [r0 + r1 + %1 * 8], m0 |
| 6081 | movh [r0 + r1*2 + %1 * 8], m2 |
| 6082 | movhps [r0 + r5 + %1 * 8], m2 |
| 6083 | movh [r6 + %1 * 8], m4 |
| 6084 | movhps [r6 + r1 + %1 * 8], m4 |
| 6085 | movh [r6 + r1*2 + %1 * 8], m3 |
| 6086 | movhps [r6 + r5 + %1 * 8], m3 |
| 6087 | %else |
; Untransposed: eight consecutive rows from r0; %1 is not used on this path.
| 6088 | movh [r0 ], m0 |
| 6089 | movhps [r0 + r1 ], m0 |
| 6090 | movh [r0 + r1 * 2], m2 |
| 6091 | movhps [r0 + r5 ], m2 |
| 6092 | lea r0, [r0 + r1 * 4] |
| 6093 | movh [r0 ], m4 |
| 6094 | movhps [r0 + r1 ], m4 |
| 6095 | movh [r0 + r1 * 2], m6 |
| 6096 | movhps [r0 + r5 ], m6 |
| 6097 | %endif |
| 6098 | %endmacro |
| 6099 | |
; MODE_3_33 transpose
;   %1 is forwarded unchanged as the second argument of every
;   TRANSPOSE_STORE_8x8 invocation (macro defined outside this chunk).
; Expects:
;   r2 = reference pointer (16-byte loads at r2 + 1, + 8, + 14, + 21)
;   r3 = weight table pointer positioned on the entry annotated [16]; entries
;        from r3 - 14 * 16 to r3 + 14 * 16 are used
;   m7 = multiplier for pmulhrsw (presumably pw_1024 - set by the caller)
; Four groups of eight 8-pixel lines are produced and handed to
; TRANSPOSE_STORE_8x8 with first argument 0, 1, 2 and 3. Within a group the
; neighbouring reference bytes are interleaved into pairs (m0/m2) and palignr
; by 2 * n selects the pair window that starts n samples further on. The
; bracketed number after each pmaddubsw is the weight entry index; it steps
; by 26 modulo 32 from line to line.
; Lines annotated [00] are not filtered: 8 reference bytes are loaded directly
; into the high half with movhps.
| 6100 | %macro MODE_3_33 1 |
; ---- group 0: lines 0-7 ----
| 6101 | movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] |
| 6102 | palignr m1, m0, 1 ; [ x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] |
| 6103 | punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] |
| 6104 | punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] |
| 6105 | palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] |
| 6106 | pmaddubsw m4, m0, [r3 + 10 * 16] ; [26] |
| 6107 | pmulhrsw m4, m7 |
| 6108 | pmaddubsw m1, [r3 + 4 * 16] ; [20] |
| 6109 | pmulhrsw m1, m7 |
| 6110 | packuswb m4, m1 |
| 6111 | palignr m5, m2, m0, 4 |
| 6112 | pmaddubsw m5, [r3 - 2 * 16] ; [14] |
| 6113 | pmulhrsw m5, m7 |
| 6114 | palignr m6, m2, m0, 6 |
| 6115 | pmaddubsw m6, [r3 - 8 * 16] ; [ 8] |
| 6116 | pmulhrsw m6, m7 |
| 6117 | packuswb m5, m6 |
| 6118 | palignr m1, m2, m0, 8 |
| 6119 | pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2] |
| 6120 | pmulhrsw m6, m7 |
| 6121 | pmaddubsw m1, [r3 + 12 * 16] ; [28] |
| 6122 | pmulhrsw m1, m7 |
| 6123 | packuswb m6, m1 |
| 6124 | palignr m1, m2, m0, 10 |
| 6125 | pmaddubsw m1, [r3 + 6 * 16] ; [22] |
| 6126 | pmulhrsw m1, m7 |
| 6127 | palignr m2, m0, 12 |
| 6128 | pmaddubsw m2, [r3] ; [16] |
| 6129 | pmulhrsw m2, m7 |
| 6130 | packuswb m1, m2 |
| 6131 | |
| 6132 | TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 |
| 6133 | |
; ---- group 1: lines 8-15 ----
| 6134 | movu m0, [r2 + 8] |
| 6135 | palignr m1, m0, 1 |
| 6136 | punpckhbw m2, m0, m1 |
| 6137 | punpcklbw m0, m1 |
| 6138 | palignr m5, m2, m0, 2 |
| 6139 | pmaddubsw m4, m0, [r3 - 6 * 16] ; [10] |
| 6140 | pmulhrsw m4, m7 |
| 6141 | pmaddubsw m1, m5, [r3 - 12 * 16] ; [04] |
| 6142 | pmulhrsw m1, m7 |
| 6143 | packuswb m4, m1 |
| 6144 | pmaddubsw m5, [r3 + 14 * 16] ; [30] |
| 6145 | pmulhrsw m5, m7 |
| 6146 | palignr m6, m2, m0, 4 |
| 6147 | pmaddubsw m6, [r3 + 8 * 16] ; [24] |
| 6148 | pmulhrsw m6, m7 |
| 6149 | packuswb m5, m6 |
| 6150 | palignr m1, m2, m0, 6 |
| 6151 | pmaddubsw m6, m1, [r3 + 2 * 16] ; [18] |
| 6152 | pmulhrsw m6, m7 |
| 6153 | palignr m1, m2, m0, 8 |
| 6154 | pmaddubsw m1, [r3 - 4 * 16] ; [12] |
| 6155 | pmulhrsw m1, m7 |
| 6156 | packuswb m6, m1 |
| 6157 | palignr m1, m2, m0, 10 |
| 6158 | pmaddubsw m1, [r3 - 10 * 16] ; [06] |
| 6159 | pmulhrsw m1, m7 |
| 6160 | packuswb m1, m1 |
| 6161 | movhps m1, [r2 + 14] ; [00] |
| 6162 | |
| 6163 | TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 |
| 6164 | |
; ---- group 2: lines 16-23 (same weight sequence as group 0) ----
| 6165 | movu m0, [r2 + 14] |
| 6166 | palignr m1, m0, 1 |
| 6167 | punpckhbw m2, m0, m1 |
| 6168 | punpcklbw m0, m1 |
| 6169 | palignr m1, m2, m0, 2 |
| 6170 | pmaddubsw m4, m0, [r3 + 10 * 16] ; [26] |
| 6171 | pmulhrsw m4, m7 |
| 6172 | pmaddubsw m1, [r3 + 4 * 16] ; [20] |
| 6173 | pmulhrsw m1, m7 |
| 6174 | packuswb m4, m1 |
| 6175 | palignr m5, m2, m0, 4 |
| 6176 | pmaddubsw m5, [r3 - 2 * 16] ; [14] |
| 6177 | pmulhrsw m5, m7 |
| 6178 | palignr m6, m2, m0, 6 |
| 6179 | pmaddubsw m6, [r3 - 8 * 16] ; [ 8] |
| 6180 | pmulhrsw m6, m7 |
| 6181 | packuswb m5, m6 |
| 6182 | palignr m1, m2, m0, 8 |
| 6183 | pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2] |
| 6184 | pmulhrsw m6, m7 |
| 6185 | pmaddubsw m1, [r3 + 12 * 16] ; [28] |
| 6186 | pmulhrsw m1, m7 |
| 6187 | packuswb m6, m1 |
| 6188 | palignr m1, m2, m0, 10 |
| 6189 | pmaddubsw m1, [r3 + 6 * 16] ; [22] |
| 6190 | pmulhrsw m1, m7 |
| 6191 | palignr m2, m0, 12 |
| 6192 | pmaddubsw m2, [r3] ; [16] |
| 6193 | pmulhrsw m2, m7 |
| 6194 | packuswb m1, m2 |
| 6195 | |
| 6196 | TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 |
| 6197 | |
; ---- group 3: lines 24-31 (same weight sequence as group 1) ----
| 6198 | movu m0, [r2 + 21] |
| 6199 | palignr m1, m0, 1 |
| 6200 | punpckhbw m2, m0, m1 |
| 6201 | punpcklbw m0, m1 |
| 6202 | palignr m5, m2, m0, 2 |
| 6203 | pmaddubsw m4, m0, [r3 - 6 * 16] ; [10] |
| 6204 | pmulhrsw m4, m7 |
| 6205 | pmaddubsw m1, m5, [r3 - 12 * 16] ; [04] |
| 6206 | pmulhrsw m1, m7 |
| 6207 | packuswb m4, m1 |
| 6208 | pmaddubsw m5, [r3 + 14 * 16] ; [30] |
| 6209 | pmulhrsw m5, m7 |
| 6210 | palignr m6, m2, m0, 4 |
| 6211 | pmaddubsw m6, [r3 + 8 * 16] ; [24] |
| 6212 | pmulhrsw m6, m7 |
| 6213 | packuswb m5, m6 |
| 6214 | palignr m1, m2, m0, 6 |
| 6215 | pmaddubsw m6, m1, [r3 + 2 * 16] ; [18] |
| 6216 | pmulhrsw m6, m7 |
| 6217 | palignr m1, m2, m0, 8 |
| 6218 | pmaddubsw m1, [r3 - 4 * 16] ; [12] |
| 6219 | pmulhrsw m1, m7 |
| 6220 | packuswb m6, m1 |
| 6221 | palignr m1, m2, m0, 10 |
| 6222 | pmaddubsw m1, [r3 - 10 * 16] ; [06] |
| 6223 | pmulhrsw m1, m7 |
| 6224 | packuswb m1, m1 |
| 6225 | movhps m1, [r2 + 27] ; [00] |
| 6226 | |
| 6227 | TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 |
| 6228 | %endmacro |
| 6229 | ;------------------------------------------------------------------------------------------------------------------ |
| 6230 | ; void intraPredAng32_3(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) |
| 6231 | ;------------------------------------------------------------------------------------------------------------------ |
| 6232 | INIT_XMM sse4 |
| 6233 | cglobal intra_pred_ang32_3, 3,7,8 ; only the first three arguments are loaded (r0, r1, r2) |
| 6234 | mova m7, [pw_1024] ; m7 = multiplier used by every pmulhrsw inside MODE_3_33 |
| 6235 | lea r3, [ang_table + 16 * 16] ; r3 = ang_table + 256 bytes; the macro addresses it as [r3 +/- k * 16] |
| 6236 | lea r5, [r1 * 3] ; r5 = 3 * r1 (not referenced below; presumably consumed by TRANSPOSE_STORE_8x8) |
| 6237 | lea r6, [r0 + r1 * 4] ; r6 = r0 + 4 * r1 |
| 6238 | mov r4d, 4 ; four passes of the macro |
| 6239 | .next_pass: |
| 6240 | MODE_3_33 1 |
| 6241 | add r2, 8 ; the next pass reads the reference 8 bytes further on |
| 6242 | lea r0, [r6 + r1 * 4] ; must come before the r6 update: it uses the old r6 |
| 6243 | lea r6, [r6 + r1 * 8] |
| 6244 | dec r4 |
| 6245 | jnz .next_pass |
| 6246 | RET |
| 6247 | |
; MODE_4_32 <flag>
;   Straight-line kernel: builds four groups ("slots" 0-3) of eight 8-byte
;   result lines from the reference bytes at r2 and hands each group, packed
;   two lines per register in m4, m5, m6, m1, to TRANSPOSE_STORE_8x8.
;   <flag> (%1) is forwarded untouched to TRANSPOSE_STORE_8x8, which is defined
;   outside this chunk.
;   Inputs : r2 = reference bytes, r3 = base of a table of 16-byte pmaddubsw
;            coefficient rows, m7 = pmulhrsw multiplier.
;   Scratch: m0-m6.
;   The bracketed number after each pmaddubsw is the r3 row offset plus 16; it
;   equals the table row index when r3 = ang_table + 16 * 16, as set up by
;   intra_pred_ang32_4.
;   NOTE(review): with m7 = pw_1024 (presumably eight words of 1024) each
;   pmulhrsw computes (x + 16) >> 5 - confirm against the constant's definition.
| 6248 | %macro MODE_4_32 1 |
; ---- slot 0: 16-byte window at r2 + 1 ----
| 6249 | movu m0, [r2 + 1] |
; two-operand palignr: m1 bytes 0..14 = m0 bytes 1..15, byte 15 = previous m1 byte 0
| 6250 | palignr m1, m0, 1 |
; m2 / m0 = high / low window bytes, each interleaved with the byte that follows it
| 6251 | punpckhbw m2, m0, m1 |
| 6252 | punpcklbw m0, m1 |
; each palignr m?, m2, m0, 2*n below advances the interleaved window by n byte pairs
| 6253 | palignr m1, m2, m0, 2 |
| 6254 | mova m5, m1 |
| 6255 | pmaddubsw m4, m0, [r3 + 5 * 16] ; [21] |
| 6256 | pmulhrsw m4, m7 |
| 6257 | pmaddubsw m1, [r3 - 6 * 16] ; [10] |
| 6258 | pmulhrsw m1, m7 |
| 6259 | packuswb m4, m1 |
| 6260 | pmaddubsw m5, [r3 + 15 * 16] ; [31] |
| 6261 | pmulhrsw m5, m7 |
| 6262 | palignr m6, m2, m0, 4 |
| 6263 | pmaddubsw m6, [r3 + 4 * 16] ; [20] |
| 6264 | pmulhrsw m6, m7 |
| 6265 | packuswb m5, m6 |
| 6266 | palignr m1, m2, m0, 6 |
| 6267 | pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9] |
| 6268 | pmulhrsw m6, m7 |
| 6269 | pmaddubsw m1, [r3 + 14 * 16] ; [30] |
| 6270 | pmulhrsw m1, m7 |
| 6271 | packuswb m6, m1 |
| 6272 | palignr m1, m2, m0, 8 |
| 6273 | pmaddubsw m1, [r3 + 3 * 16] ; [19] |
| 6274 | pmulhrsw m1, m7 |
; m2 is overwritten with the 5-pair alignment; it is read again at the start of slot 1
| 6275 | palignr m2, m0, 10 |
| 6276 | pmaddubsw m3, m2, [r3 - 8 * 16] ; [8] |
| 6277 | pmulhrsw m3, m7 |
| 6278 | packuswb m1, m3 |
| 6279 | |
| 6280 | TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 |
| 6281 | |
; ---- slot 1: first line reuses m2 from slot 0 (so TRANSPOSE_STORE_8x8 must leave m2 intact),
;      then the window is reloaded from r2 + 6 ----
| 6282 | pmaddubsw m4, m2, [r3 + 13 * 16] ; [29] |
| 6283 | pmulhrsw m4, m7 |
| 6284 | movu m0, [r2 + 6] |
| 6285 | palignr m1, m0, 1 |
| 6286 | punpckhbw m2, m0, m1 |
| 6287 | punpcklbw m0, m1 |
| 6288 | palignr m1, m2, m0, 2 |
| 6289 | pmaddubsw m1, [r3 + 2 * 16] ; [18] |
| 6290 | pmulhrsw m1, m7 |
| 6291 | packuswb m4, m1 |
| 6292 | palignr m5, m2, m0, 4 |
| 6293 | mova m6, m5 |
| 6294 | pmaddubsw m5, [r3 - 9 * 16] ; [07] |
| 6295 | pmulhrsw m5, m7 |
| 6296 | pmaddubsw m6, [r3 + 12 * 16] ; [28] |
| 6297 | pmulhrsw m6, m7 |
| 6298 | packuswb m5, m6 |
| 6299 | palignr m6, m2, m0, 6 |
| 6300 | pmaddubsw m6, [r3 + 16] ; [17] |
| 6301 | pmulhrsw m6, m7 |
| 6302 | palignr m1, m2, m0, 8 |
| 6303 | pmaddubsw m3, m1, [r3 - 10 * 16] ; [06] |
| 6304 | pmulhrsw m3, m7 |
| 6305 | packuswb m6, m3 |
| 6306 | pmaddubsw m1, [r3 + 11 * 16] ; [27] |
| 6307 | pmulhrsw m1, m7 |
| 6308 | palignr m2, m0, 10 |
| 6309 | pmaddubsw m2, [r3] ; [16] |
| 6310 | pmulhrsw m2, m7 |
| 6311 | packuswb m1, m2 |
| 6312 | |
| 6313 | TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 |
| 6314 | |
; ---- slot 2: 16-byte window at r2 + 12 ----
| 6315 | movu m0, [r2 + 12] |
| 6316 | palignr m1, m0, 1 |
| 6317 | punpckhbw m2, m0, m1 |
| 6318 | punpcklbw m0, m1 |
| 6319 | mova m1, m0 |
| 6320 | pmaddubsw m4, m0, [r3 - 11 * 16] ; [5] |
| 6321 | pmulhrsw m4, m7 |
| 6322 | pmaddubsw m1, [r3 + 10 * 16] ; [26] |
| 6323 | pmulhrsw m1, m7 |
| 6324 | packuswb m4, m1 |
| 6325 | palignr m5, m2, m0, 2 |
| 6326 | pmaddubsw m5, [r3 - 16] ; [15] |
| 6327 | pmulhrsw m5, m7 |
| 6328 | palignr m6, m2, m0, 4 |
| 6329 | mova m1, m6 |
| 6330 | pmaddubsw m1, [r3 - 12 * 16] ; [4] |
| 6331 | pmulhrsw m1, m7 |
| 6332 | packuswb m5, m1 |
| 6333 | pmaddubsw m6, [r3 + 9 * 16] ; [25] |
| 6334 | pmulhrsw m6, m7 |
| 6335 | palignr m1, m2, m0, 6 |
| 6336 | pmaddubsw m1, [r3 - 2 * 16] ; [14] |
| 6337 | pmulhrsw m1, m7 |
| 6338 | packuswb m6, m1 |
| 6339 | palignr m1, m2, m0, 8 |
| 6340 | mova m2, m1 |
| 6341 | pmaddubsw m1, [r3 - 13 * 16] ; [3] |
| 6342 | pmulhrsw m1, m7 |
| 6343 | pmaddubsw m2, [r3 + 8 * 16] ; [24] |
| 6344 | pmulhrsw m2, m7 |
| 6345 | packuswb m1, m2 |
| 6346 | |
| 6347 | TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 |
| 6348 | |
; ---- slot 3: 16-byte window at r2 + 17 ----
| 6349 | movu m0, [r2 + 17] |
| 6350 | palignr m1, m0, 1 |
| 6351 | punpckhbw m2, m0, m1 |
| 6352 | punpcklbw m0, m1 |
| 6353 | pmaddubsw m4, m0, [r3 - 3 * 16] ; [13] |
| 6354 | pmulhrsw m4, m7 |
| 6355 | palignr m5, m2, m0, 2 |
| 6356 | pmaddubsw m1, m5, [r3 - 14 * 16] ; [2] |
| 6357 | pmulhrsw m1, m7 |
| 6358 | packuswb m4, m1 |
| 6359 | pmaddubsw m5, [r3 + 7 * 16] ; [23] |
| 6360 | pmulhrsw m5, m7 |
| 6361 | palignr m6, m2, m0, 4 |
| 6362 | pmaddubsw m6, [r3 - 4 * 16] ; [12] |
| 6363 | pmulhrsw m6, m7 |
| 6364 | packuswb m5, m6 |
| 6365 | palignr m6, m2, m0, 6 |
| 6366 | mova m1, m6 |
| 6367 | pmaddubsw m6, [r3 - 15 * 16] ; [1] |
| 6368 | pmulhrsw m6, m7 |
| 6369 | pmaddubsw m1, [r3 + 6 * 16] ; [22] |
| 6370 | pmulhrsw m1, m7 |
| 6371 | packuswb m6, m1 |
| 6372 | palignr m1, m2, m0, 8 |
| 6373 | pmaddubsw m1, [r3 - 5 * 16] ; [11] |
| 6374 | pmulhrsw m1, m7 |
| 6375 | packuswb m1, m1 |
; last line of the slot is unfiltered: eight raw reference bytes go into the high half of m1
| 6376 | movhps m1, [r2 + 22] ; [00] |
| 6377 | |
| 6378 | TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 |
| 6379 | %endmacro |
| 6380 | ;----------------------------------------------------------------------------------------------------------------- |
| 6381 | ; void intraPredAng32_4(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) |
| 6382 | ;----------------------------------------------------------------------------------------------------------------- |
| 6383 | INIT_XMM sse4 |
| 6384 | cglobal intra_pred_ang32_4, 3,7,8 ; only the first three arguments are loaded (r0, r1, r2) |
| 6385 | mova m7, [pw_1024] ; m7 = multiplier used by every pmulhrsw inside MODE_4_32 |
| 6386 | lea r3, [ang_table + 16 * 16] ; r3 = ang_table + 256 bytes; the macro addresses it as [r3 +/- k * 16] |
| 6387 | lea r5, [r1 * 3] ; r5 = 3 * r1 (not referenced below; presumably consumed by TRANSPOSE_STORE_8x8) |
| 6388 | lea r6, [r0 + r1 * 4] ; r6 = r0 + 4 * r1 |
| 6389 | mov r4d, 4 ; four passes of the macro |
| 6390 | .next_pass: |
| 6391 | MODE_4_32 1 |
| 6392 | add r2, 8 ; the next pass reads the reference 8 bytes further on |
| 6393 | lea r0, [r6 + r1 * 4] ; must come before the r6 update: it uses the old r6 |
| 6394 | lea r6, [r6 + r1 * 8] |
| 6395 | dec r4 |
| 6396 | jnz .next_pass |
| 6397 | RET |
| 6398 | |
; MODE_5_31 <flag>
;   Straight-line kernel: builds four groups ("slots" 0-3) of eight 8-byte
;   result lines from the reference bytes at r2 and hands each group, packed
;   two lines per register in m4, m5, m6, m1, to TRANSPOSE_STORE_8x8.
;   <flag> (%1) is forwarded untouched to TRANSPOSE_STORE_8x8, which is defined
;   outside this chunk.
;   Inputs : r2 = reference bytes, r3 = base of a table of 16-byte pmaddubsw
;            coefficient rows, m7 = pmulhrsw multiplier.
;   Scratch: m0-m6. Every slot reloads its own window; no register is carried
;   from one slot to the next.
;   The bracketed number after each pmaddubsw is the r3 row offset plus 16; it
;   equals the table row index when r3 = ang_table + 16 * 16, as set up by
;   intra_pred_ang32_5.
| 6399 | %macro MODE_5_31 1 |
; ---- slot 0: 16-byte window at r2 + 1 ----
| 6400 | movu m0, [r2 + 1] |
; two-operand palignr: m1 bytes 0..14 = m0 bytes 1..15, byte 15 = previous m1 byte 0
| 6401 | palignr m1, m0, 1 |
; m2 / m0 = high / low window bytes, each interleaved with the byte that follows it
| 6402 | punpckhbw m2, m0, m1 |
| 6403 | punpcklbw m0, m1 |
; each palignr m?, m2, m0, 2*n below advances the interleaved window by n byte pairs
| 6404 | palignr m1, m2, m0, 2 |
| 6405 | mova m5, m1 |
| 6406 | pmaddubsw m4, m0, [r3 + 16] ; [17] |
| 6407 | pmulhrsw m4, m7 |
| 6408 | pmaddubsw m1, [r3 - 14 * 16] ; [2] |
| 6409 | pmulhrsw m1, m7 |
| 6410 | packuswb m4, m1 |
| 6411 | pmaddubsw m5, [r3 + 3 * 16] ; [19] |
| 6412 | pmulhrsw m5, m7 |
| 6413 | palignr m6, m2, m0, 4 |
| 6414 | mova m1, m6 |
| 6415 | pmaddubsw m6, [r3 - 12 * 16] ; [4] |
| 6416 | pmulhrsw m6, m7 |
| 6417 | packuswb m5, m6 |
| 6418 | pmaddubsw m6, m1, [r3 + 5 * 16] ; [21] |
| 6419 | pmulhrsw m6, m7 |
| 6420 | palignr m1, m2, m0, 6 |
| 6421 | mova m3, m1 |
| 6422 | pmaddubsw m3, [r3 - 10 * 16] ; [6] |
| 6423 | pmulhrsw m3, m7 |
| 6424 | packuswb m6, m3 |
| 6425 | pmaddubsw m1, [r3 + 7 * 16] ; [23] |
| 6426 | pmulhrsw m1, m7 |
| 6427 | palignr m2, m0, 8 |
| 6428 | pmaddubsw m2, [r3 - 8 * 16] ; [8] |
| 6429 | pmulhrsw m2, m7 |
| 6430 | packuswb m1, m2 |
| 6431 | |
| 6432 | TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 |
| 6433 | |
; ---- slot 1: 16-byte window at r2 + 5 ----
| 6434 | movu m0, [r2 + 5] |
| 6435 | palignr m1, m0, 1 |
| 6436 | punpckhbw m2, m0, m1 |
| 6437 | punpcklbw m0, m1 |
| 6438 | palignr m1, m2, m0, 2 |
| 6439 | mova m5, m1 |
| 6440 | pmaddubsw m4, m0, [r3 + 9 * 16] ; [25] |
| 6441 | pmulhrsw m4, m7 |
| 6442 | pmaddubsw m1, [r3 - 6 * 16] ; [10] |
| 6443 | pmulhrsw m1, m7 |
| 6444 | packuswb m4, m1 |
| 6445 | pmaddubsw m5, [r3 + 11 * 16] ; [27] |
| 6446 | pmulhrsw m5, m7 |
| 6447 | palignr m6, m2, m0, 4 |
| 6448 | mova m1, m6 |
| 6449 | pmaddubsw m6, [r3 - 4 * 16] ; [12] |
| 6450 | pmulhrsw m6, m7 |
| 6451 | packuswb m5, m6 |
| 6452 | pmaddubsw m6, m1, [r3 + 13 * 16] ; [29] |
| 6453 | pmulhrsw m6, m7 |
| 6454 | palignr m1, m2, m0, 6 |
| 6455 | mova m3, m1 |
| 6456 | pmaddubsw m3, [r3 - 2 * 16] ; [14] |
| 6457 | pmulhrsw m3, m7 |
| 6458 | packuswb m6, m3 |
| 6459 | pmaddubsw m1, [r3 + 15 * 16] ; [31] |
| 6460 | pmulhrsw m1, m7 |
| 6461 | palignr m2, m0, 8 |
| 6462 | pmaddubsw m2, [r3] ; [16] |
| 6463 | pmulhrsw m2, m7 |
| 6464 | packuswb m1, m2 |
| 6465 | |
| 6466 | TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 |
| 6467 | |
; ---- slot 2: 16-byte window at r2 + 10 ----
| 6468 | movu m0, [r2 + 10] |
| 6469 | palignr m1, m0, 1 |
| 6470 | punpckhbw m2, m0, m1 |
| 6471 | punpcklbw m0, m1 |
| 6472 | mova m1, m0 |
| 6473 | pmaddubsw m4, m0, [r3 - 15 * 16] ; [1] |
| 6474 | pmulhrsw m4, m7 |
| 6475 | pmaddubsw m1, [r3 + 2 * 16] ; [18] |
| 6476 | pmulhrsw m1, m7 |
| 6477 | packuswb m4, m1 |
| 6478 | palignr m5, m2, m0, 2 |
| 6479 | mova m1, m5 |
| 6480 | pmaddubsw m5, [r3 - 13 * 16] ; [3] |
| 6481 | pmulhrsw m5, m7 |
| 6482 | pmaddubsw m1, [r3 + 4 * 16] ; [20] |
| 6483 | pmulhrsw m1, m7 |
| 6484 | packuswb m5, m1 |
| 6485 | palignr m1, m2, m0, 4 |
| 6486 | pmaddubsw m6, m1, [r3 - 11 * 16] ; [5] |
| 6487 | pmulhrsw m6, m7 |
| 6488 | pmaddubsw m1, [r3 + 6 * 16] ; [22] |
| 6489 | pmulhrsw m1, m7 |
| 6490 | packuswb m6, m1 |
| 6491 | palignr m2, m0, 6 |
| 6492 | pmaddubsw m1, m2, [r3 - 9 * 16] ; [7] |
| 6493 | pmulhrsw m1, m7 |
| 6494 | pmaddubsw m2, [r3 + 8 * 16] ; [24] |
| 6495 | pmulhrsw m2, m7 |
| 6496 | packuswb m1, m2 |
| 6497 | |
| 6498 | TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 |
| 6499 | |
; ---- slot 3: 16-byte window at r2 + 14 ----
| 6500 | movu m0, [r2 + 14] |
| 6501 | palignr m1, m0, 1 |
| 6502 | punpckhbw m2, m0, m1 |
| 6503 | punpcklbw m0, m1 |
| 6504 | mova m1, m0 |
| 6505 | pmaddubsw m4, m0, [r3 - 7 * 16] ; [9] |
| 6506 | pmulhrsw m4, m7 |
| 6507 | pmaddubsw m1, [r3 + 10 * 16] ; [26] |
| 6508 | pmulhrsw m1, m7 |
| 6509 | packuswb m4, m1 |
| 6510 | palignr m5, m2, m0, 2 |
| 6511 | mova m1, m5 |
| 6512 | pmaddubsw m5, [r3 - 5 * 16] ; [11] |
| 6513 | pmulhrsw m5, m7 |
| 6514 | pmaddubsw m1, [r3 + 12 * 16] ; [28] |
| 6515 | pmulhrsw m1, m7 |
| 6516 | packuswb m5, m1 |
| 6517 | palignr m1, m2, m0, 4 |
| 6518 | pmaddubsw m6, m1, [r3 - 3 * 16] ; [13] |
| 6519 | pmulhrsw m6, m7 |
| 6520 | pmaddubsw m1, [r3 + 14 * 16] ; [30] |
| 6521 | pmulhrsw m1, m7 |
| 6522 | packuswb m6, m1 |
| 6523 | palignr m2, m0, 6 |
| 6524 | pmaddubsw m1, m2, [r3 - 16] ; [15] |
| 6525 | pmulhrsw m1, m7 |
| 6526 | packuswb m1, m1 |
; last line of the slot is unfiltered: eight raw reference bytes go into the high half of m1
| 6527 | movhps m1, [r2 + 18] ; [00] |
| 6528 | |
| 6529 | TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 |
| 6530 | %endmacro |
| 6531 | ;------------------------------------------------------------------------------------------------------------------ |
| 6532 | ; void intraPredAng32_5(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) |
| 6533 | ;------------------------------------------------------------------------------------------------------------------ |
| 6534 | INIT_XMM sse4 |
| 6535 | cglobal intra_pred_ang32_5, 3,7,8 ; only the first three arguments are loaded (r0, r1, r2) |
| 6536 | mova m7, [pw_1024] ; m7 = multiplier used by every pmulhrsw inside MODE_5_31 |
| 6537 | lea r3, [ang_table + 16 * 16] ; r3 = ang_table + 256 bytes; the macro addresses it as [r3 +/- k * 16] |
| 6538 | lea r5, [r1 * 3] ; r5 = 3 * r1 (not referenced below; presumably consumed by TRANSPOSE_STORE_8x8) |
| 6539 | lea r6, [r0 + r1 * 4] ; r6 = r0 + 4 * r1 |
| 6540 | mov r4d, 4 ; four passes of the macro |
| 6541 | .next_pass: |
| 6542 | MODE_5_31 1 |
| 6543 | add r2, 8 ; the next pass reads the reference 8 bytes further on |
| 6544 | lea r0, [r6 + r1 * 4] ; must come before the r6 update: it uses the old r6 |
| 6545 | lea r6, [r6 + r1 * 8] |
| 6546 | dec r4 |
| 6547 | jnz .next_pass |
| 6548 | RET |
| 6549 | |
; MODE_6_30 <flag>
;   Straight-line kernel: builds four groups ("slots" 0-3) of eight 8-byte
;   result lines from the reference bytes at r2 and hands each group, packed
;   two lines per register in m4, m5, m6, m1, to TRANSPOSE_STORE_8x8.
;   <flag> (%1) is forwarded untouched to TRANSPOSE_STORE_8x8, which is defined
;   outside this chunk.
;   Inputs : r2 = reference bytes, r3 = base of a table of 16-byte pmaddubsw
;            coefficient rows, m7 = pmulhrsw multiplier.
;   Scratch: m0-m6. m2 is carried from slot 0 into slot 1 and from slot 1 into
;   slot 2, so TRANSPOSE_STORE_8x8 must leave m2 intact.
;   The bracketed number after each pmaddubsw is the r3 row offset plus 16; it
;   equals the table row index when r3 = ang_table + 16 * 16, as set up by
;   intra_pred_ang32_6.
| 6550 | %macro MODE_6_30 1 |
; ---- slot 0: 16-byte window at r2 + 1 ----
| 6551 | movu m0, [r2 + 1] |
; two-operand palignr: m1 bytes 0..14 = m0 bytes 1..15, byte 15 = previous m1 byte 0
| 6552 | palignr m1, m0, 1 |
; m2 / m0 = high / low window bytes, each interleaved with the byte that follows it
| 6553 | punpckhbw m2, m0, m1 |
| 6554 | punpcklbw m0, m1 |
| 6555 | mova m1, m0 |
| 6556 | pmaddubsw m4, m0, [r3 - 3 * 16] ; [13] |
| 6557 | pmulhrsw m4, m7 |
| 6558 | pmaddubsw m1, [r3 + 10 * 16] ; [26] |
| 6559 | pmulhrsw m1, m7 |
| 6560 | packuswb m4, m1 |
; each palignr m?, m2, m0, 2*n below advances the interleaved window by n byte pairs
| 6561 | palignr m6, m2, m0, 2 |
| 6562 | pmaddubsw m5, m6, [r3 - 9 * 16] ; [7] |
| 6563 | pmulhrsw m5, m7 |
| 6564 | pmaddubsw m6, [r3 + 4 * 16] ; [20] |
| 6565 | pmulhrsw m6, m7 |
| 6566 | packuswb m5, m6 |
| 6567 | palignr m1, m2, m0, 4 |
| 6568 | pmaddubsw m6, m1, [r3 - 15 * 16] ; [1] |
| 6569 | pmulhrsw m6, m7 |
| 6570 | pmaddubsw m3, m1, [r3 - 2 * 16] ; [14] |
| 6571 | pmulhrsw m3, m7 |
| 6572 | packuswb m6, m3 |
| 6573 | pmaddubsw m1, [r3 + 11 * 16] ; [27] |
| 6574 | pmulhrsw m1, m7 |
; m2 is overwritten with the 3-pair alignment; it is read again at the start of slot 1
| 6575 | palignr m2, m0, 6 |
| 6576 | pmaddubsw m3, m2, [r3 - 8 * 16] ; [8] |
| 6577 | pmulhrsw m3, m7 |
| 6578 | packuswb m1, m3 |
| 6579 | |
| 6580 | TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 |
| 6581 | |
; ---- slot 1: first line reuses m2 from slot 0, then the window is reloaded from r2 + 5 ----
| 6582 | pmaddubsw m4, m2, [r3 + 5 * 16] ; [21] |
| 6583 | pmulhrsw m4, m7 |
| 6584 | movu m0, [r2 + 5] |
| 6585 | palignr m1, m0, 1 |
| 6586 | punpckhbw m2, m0, m1 |
| 6587 | punpcklbw m0, m1 |
| 6588 | mova m6, m0 |
| 6589 | pmaddubsw m1, m6, [r3 - 14 * 16] ; [2] |
| 6590 | pmulhrsw m1, m7 |
| 6591 | packuswb m4, m1 |
| 6592 | pmaddubsw m5, m6, [r3 - 16] ; [15] |
| 6593 | pmulhrsw m5, m7 |
| 6594 | pmaddubsw m6, [r3 + 12 * 16] ; [28] |
| 6595 | pmulhrsw m6, m7 |
| 6596 | packuswb m5, m6 |
| 6597 | palignr m3, m2, m0, 2 |
| 6598 | pmaddubsw m6, m3, [r3 - 7 * 16] ; [9] |
| 6599 | pmulhrsw m6, m7 |
| 6600 | pmaddubsw m3, [r3 + 6 * 16] ; [22] |
| 6601 | pmulhrsw m3, m7 |
| 6602 | packuswb m6, m3 |
; m2 is overwritten with the 2-pair alignment; it is read again at the start of slot 2
| 6603 | palignr m2, m0, 4 |
| 6604 | pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] |
| 6605 | pmulhrsw m1, m7 |
| 6606 | pmaddubsw m3, m2, [r3] ; [16] |
| 6607 | pmulhrsw m3, m7 |
| 6608 | packuswb m1, m3 |
| 6609 | |
| 6610 | TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 |
| 6611 | |
; ---- slot 2: first line reuses m2 from slot 1, then the window is reloaded from r2 + 7 ----
| 6612 | pmaddubsw m4, m2, [r3 + 13 * 16] ; [29] |
| 6613 | pmulhrsw m4, m7 |
| 6614 | movu m0, [r2 + 7] |
| 6615 | palignr m1, m0, 1 |
| 6616 | punpckhbw m2, m0, m1 |
| 6617 | punpcklbw m0, m1 |
| 6618 | palignr m5, m2, m0, 2 |
| 6619 | pmaddubsw m1, m5, [r3 - 6 * 16] ; [10] |
| 6620 | pmulhrsw m1, m7 |
| 6621 | packuswb m4, m1 |
| 6622 | pmaddubsw m5, [r3 + 7 * 16] ; [23] |
| 6623 | pmulhrsw m5, m7 |
| 6624 | palignr m1, m2, m0, 4 |
| 6625 | pmaddubsw m6, m1, [r3 - 12 * 16] ; [4] |
| 6626 | pmulhrsw m6, m7 |
| 6627 | packuswb m5, m6 |
| 6628 | pmaddubsw m6, m1, [r3 + 16] ; [17] |
| 6629 | pmulhrsw m6, m7 |
| 6630 | pmaddubsw m1, [r3 + 14 * 16] ; [30] |
| 6631 | pmulhrsw m1, m7 |
| 6632 | packuswb m6, m1 |
| 6633 | palignr m2, m2, m0, 6 |
| 6634 | pmaddubsw m1, m2, [r3 - 5 * 16] ; [11] |
| 6635 | pmulhrsw m1, m7 |
| 6636 | pmaddubsw m2, m2, [r3 + 8 * 16] ; [24] |
| 6637 | pmulhrsw m2, m7 |
| 6638 | packuswb m1, m2 |
| 6639 | |
| 6640 | TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 |
| 6641 | |
; ---- slot 3: 16-byte window at r2 + 11 ----
| 6642 | movu m0, [r2 + 11] |
| 6643 | palignr m1, m0, 1 |
| 6644 | punpckhbw m2, m0, m1 |
| 6645 | punpcklbw m0, m1 |
| 6646 | mova m5, m0 |
| 6647 | pmaddubsw m4, m0, [r3 - 11 * 16] ; [5] |
| 6648 | pmulhrsw m4, m7 |
| 6649 | pmaddubsw m3, m5, [r3 + 2 * 16] ; [18] |
| 6650 | pmulhrsw m3, m7 |
| 6651 | packuswb m4, m3 |
| 6652 | pmaddubsw m5, [r3 + 15 * 16] ; [31] |
| 6653 | pmulhrsw m5, m7 |
| 6654 | palignr m6, m2, m0, 2 |
| 6655 | pmaddubsw m1, m6, [r3 - 4 * 16] ; [12] |
| 6656 | pmulhrsw m1, m7 |
| 6657 | packuswb m5, m1 |
| 6658 | pmaddubsw m6, [r3 + 9 * 16] ; [25] |
| 6659 | pmulhrsw m6, m7 |
| 6660 | palignr m1, m2, m0, 4 |
| 6661 | pmaddubsw m2, m1, [r3 - 10 * 16] ; [6] |
| 6662 | pmulhrsw m2, m7 |
| 6663 | packuswb m6, m2 |
| 6664 | pmaddubsw m1, [r3 + 3 * 16] ; [19] |
| 6665 | pmulhrsw m1, m7 |
| 6666 | packuswb m1, m1 |
; last line of the slot is unfiltered: eight raw reference bytes go into the high half of m1
| 6667 | movhps m1, [r2 + 14] ; [00] |
| 6668 | |
| 6669 | TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 |
| 6670 | %endmacro |
| 6671 | ;------------------------------------------------------------------------------------------------------------------ |
| 6672 | ; void intraPredAng32_6(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) |
| 6673 | ;------------------------------------------------------------------------------------------------------------------ |
| 6674 | INIT_XMM sse4 |
| 6675 | cglobal intra_pred_ang32_6, 3,7,8 ; only the first three arguments are loaded (r0, r1, r2) |
| 6676 | mova m7, [pw_1024] ; m7 = multiplier used by every pmulhrsw inside MODE_6_30 |
| 6677 | lea r3, [ang_table + 16 * 16] ; r3 = ang_table + 256 bytes; the macro addresses it as [r3 +/- k * 16] |
| 6678 | lea r5, [r1 * 3] ; r5 = 3 * r1 (not referenced below; presumably consumed by TRANSPOSE_STORE_8x8) |
| 6679 | lea r6, [r0 + r1 * 4] ; r6 = r0 + 4 * r1 |
| 6680 | mov r4d, 4 ; four passes of the macro |
| 6681 | .next_pass: |
| 6682 | MODE_6_30 1 |
| 6683 | add r2, 8 ; the next pass reads the reference 8 bytes further on |
| 6684 | lea r0, [r6 + r1 * 4] ; must come before the r6 update: it uses the old r6 |
| 6685 | lea r6, [r6 + r1 * 8] |
| 6686 | dec r4 |
| 6687 | jnz .next_pass |
| 6688 | RET |
| 6689 | |
; MODE_7_29 <flag>
;   Straight-line kernel: builds four groups ("slots" 0-3) of eight 8-byte
;   result lines from the reference bytes at r2 and hands each group, packed
;   two lines per register in m4, m5, m6, m1, to TRANSPOSE_STORE_8x8.
;   <flag> (%1) is forwarded untouched to TRANSPOSE_STORE_8x8, which is defined
;   outside this chunk.
;   Inputs : r2 = reference bytes, r3 = base of a table of 16-byte pmaddubsw
;            coefficient rows, m7 = pmulhrsw multiplier.
;   Scratch: m0-m6. m2 is carried from slot 0 into slot 1 and from slot 1 into
;   slot 2, so TRANSPOSE_STORE_8x8 must leave m2 intact.
;   The bracketed number after each pmaddubsw is the r3 row offset plus 16; it
;   equals the table row index when r3 = ang_table + 16 * 16, as set up by
;   intra_pred_ang32_7.
| 6690 | %macro MODE_7_29 1 |
; ---- slot 0: 16-byte window at r2 + 1 ----
| 6691 | movu m0, [r2 + 1] |
; two-operand palignr: m1 bytes 0..14 = m0 bytes 1..15, byte 15 = previous m1 byte 0
| 6692 | palignr m1, m0, 1 |
; m2 / m0 = high / low window bytes, each interleaved with the byte that follows it
| 6693 | punpckhbw m2, m0, m1 |
| 6694 | punpcklbw m0, m1 |
| 6695 | mova m5, m0 |
| 6696 | pmaddubsw m4, m0, [r3 - 7 * 16] ; [9] |
| 6697 | pmulhrsw m4, m7 |
| 6698 | pmaddubsw m3, m5, [r3 + 2 * 16] ; [18] |
| 6699 | pmulhrsw m3, m7 |
| 6700 | packuswb m4, m3 |
| 6701 | pmaddubsw m5, [r3 + 11 * 16] ; [27] |
| 6702 | pmulhrsw m5, m7 |
; both alignments (1 and 2 byte pairs) are taken now because m0 is reused as scratch below
| 6703 | palignr m1, m2, m0, 2 |
| 6704 | palignr m2, m0, 4 |
| 6705 | pmaddubsw m6, m1, [r3 - 12 * 16] ; [4] |
| 6706 | pmulhrsw m6, m7 |
| 6707 | packuswb m5, m6 |
| 6708 | pmaddubsw m6, m1, [r3 - 3 * 16] ; [13] |
| 6709 | pmulhrsw m6, m7 |
| 6710 | pmaddubsw m0, m1, [r3 + 6 * 16] ; [22] |
| 6711 | pmulhrsw m0, m7 |
| 6712 | packuswb m6, m0 |
| 6713 | pmaddubsw m1, [r3 + 15 * 16] ; [31] |
| 6714 | pmulhrsw m1, m7 |
| 6715 | pmaddubsw m0, m2, [r3 - 8 * 16] ; [8] |
| 6716 | pmulhrsw m0, m7 |
| 6717 | packuswb m1, m0 |
| 6718 | |
| 6719 | TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 |
| 6720 | |
; ---- slot 1: first two lines reuse m2 from slot 0, then the window is reloaded from r2 + 4 ----
| 6721 | pmaddubsw m4, m2, [r3 + 16] ; [17] |
| 6722 | pmulhrsw m4, m7 |
| 6723 | pmaddubsw m2, [r3 + 10 * 16] ; [26] |
| 6724 | pmulhrsw m2, m7 |
| 6725 | packuswb m4, m2 |
| 6726 | movu m0, [r2 + 4] |
| 6727 | palignr m1, m0, 1 |
| 6728 | punpckhbw m2, m0, m1 |
| 6729 | punpcklbw m0, m1 |
; m2 = window advanced by one byte pair; it is read again at the start of slot 2
| 6730 | palignr m2, m0, 2 |
| 6731 | pmaddubsw m5, m0, [r3 - 13 * 16] ; [03] |
| 6732 | pmulhrsw m5, m7 |
| 6733 | pmaddubsw m6, m0, [r3 - 4 * 16] ; [12] |
| 6734 | pmulhrsw m6, m7 |
| 6735 | packuswb m5, m6 |
| 6736 | pmaddubsw m6, m0, [r3 + 5 * 16] ; [21] |
| 6737 | pmulhrsw m6, m7 |
| 6738 | pmaddubsw m0, [r3 + 14 * 16] ; [30] |
| 6739 | pmulhrsw m0, m7 |
| 6740 | packuswb m6, m0 |
| 6741 | pmaddubsw m1, m2, [r3 - 9 * 16] ; [07] |
| 6742 | pmulhrsw m1, m7 |
| 6743 | pmaddubsw m3, m2, [r3] ; [16] |
| 6744 | pmulhrsw m3, m7 |
| 6745 | packuswb m1, m3 |
| 6746 | |
| 6747 | TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 |
| 6748 | |
; ---- slot 2: first line reuses m2 from slot 1, then the window is reloaded from r2 + 6 ----
| 6749 | pmaddubsw m4, m2, [r3 + 9 * 16] ; [25] |
| 6750 | pmulhrsw m4, m7 |
| 6751 | movu m0, [r2 + 6] |
| 6752 | palignr m1, m0, 1 |
| 6753 | punpckhbw m2, m0, m1 |
| 6754 | punpcklbw m0, m1 |
| 6755 | palignr m2, m0, 2 |
| 6756 | pmaddubsw m1, m0, [r3 - 14 * 16] ; [2] |
| 6757 | pmulhrsw m1, m7 |
| 6758 | packuswb m4, m1 |
| 6759 | pmaddubsw m5, m0, [r3 - 5 * 16] ; [11] |
| 6760 | pmulhrsw m5, m7 |
| 6761 | pmaddubsw m6, m0, [r3 + 4 * 16] ; [20] |
| 6762 | pmulhrsw m6, m7 |
| 6763 | packuswb m5, m6 |
| 6764 | pmaddubsw m6, m0, [r3 + 13 * 16] ; [29] |
| 6765 | pmulhrsw m6, m7 |
| 6766 | pmaddubsw m1, m2, [r3 - 10 * 16] ; [6] |
| 6767 | pmulhrsw m1, m7 |
| 6768 | packuswb m6, m1 |
| 6769 | pmaddubsw m1, m2, [r3 - 16] ; [15] |
| 6770 | pmulhrsw m1, m7 |
| 6771 | pmaddubsw m2, m2, [r3 + 8 * 16] ; [24] |
| 6772 | pmulhrsw m2, m7 |
| 6773 | packuswb m1, m2 |
| 6774 | |
| 6775 | TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 |
| 6776 | |
; ---- slot 3: 16-byte window at r2 + 8 ----
| 6777 | movu m0, [r2 + 8] |
| 6778 | palignr m1, m0, 1 |
| 6779 | punpckhbw m2, m0, m1 |
| 6780 | punpcklbw m0, m1 |
| 6781 | pmaddubsw m4, m0, [r3 - 15 * 16] ; [1] |
| 6782 | pmulhrsw m4, m7 |
| 6783 | pmaddubsw m3, m0, [r3 - 6 * 16] ; [10] |
| 6784 | pmulhrsw m3, m7 |
| 6785 | packuswb m4, m3 |
| 6786 | pmaddubsw m5, m0, [r3 + 3 * 16] ; [19] |
| 6787 | pmulhrsw m5, m7 |
| 6788 | pmaddubsw m6, m0, [r3 + 12 * 16] ; [28] |
| 6789 | pmulhrsw m6, m7 |
| 6790 | packuswb m5, m6 |
| 6791 | palignr m2, m0, 2 |
| 6792 | pmaddubsw m6, m2, [r3 - 11 * 16] ; [5] |
| 6793 | pmulhrsw m6, m7 |
| 6794 | pmaddubsw m0, m2, [r3 - 2 * 16] ; [14] |
| 6795 | pmulhrsw m0, m7 |
| 6796 | packuswb m6, m0 |
| 6797 | pmaddubsw m1, m2, [r3 + 7 * 16] ; [23] |
| 6798 | pmulhrsw m1, m7 |
| 6799 | packuswb m1, m1 |
; last line of the slot is unfiltered: eight raw reference bytes go into the high half of m1
| 6800 | movhps m1, [r2 + 10] ; [0] |
| 6801 | |
| 6802 | TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 |
| 6803 | %endmacro |
| 6804 | ;------------------------------------------------------------------------------------------------------------------ |
| 6805 | ; void intraPredAng32_7(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) |
| 6806 | ;------------------------------------------------------------------------------------------------------------------ |
| 6807 | INIT_XMM sse4 |
| 6808 | cglobal intra_pred_ang32_7, 3,7,8 ; only the first three arguments are loaded (r0, r1, r2) |
| 6809 | mova m7, [pw_1024] ; m7 = multiplier used by every pmulhrsw inside MODE_7_29 |
| 6810 | lea r3, [ang_table + 16 * 16] ; r3 = ang_table + 256 bytes; the macro addresses it as [r3 +/- k * 16] |
| 6811 | lea r5, [r1 * 3] ; r5 = 3 * r1 (not referenced below; presumably consumed by TRANSPOSE_STORE_8x8) |
| 6812 | lea r6, [r0 + r1 * 4] ; r6 = r0 + 4 * r1 |
| 6813 | mov r4d, 4 ; four passes of the macro |
| 6814 | .next_pass: |
| 6815 | MODE_7_29 1 |
| 6816 | add r2, 8 ; the next pass reads the reference 8 bytes further on |
| 6817 | lea r0, [r6 + r1 * 4] ; must come before the r6 update: it uses the old r6 |
| 6818 | lea r6, [r6 + r1 * 8] |
| 6819 | dec r4 |
| 6820 | jnz .next_pass |
| 6821 | RET |
| 6822 | |
; MODE_8_28 <flag>
;   Straight-line kernel: builds four groups ("slots" 0-3) of eight 8-byte
;   result lines from the reference bytes at r2 and hands each group, packed
;   two lines per register in m4, m5, m6, m1, to TRANSPOSE_STORE_8x8.
;   <flag> (%1) is forwarded untouched to TRANSPOSE_STORE_8x8, which is defined
;   outside this chunk.
;   Inputs : r2 = reference bytes, r3 = base of a table of 16-byte pmaddubsw
;            coefficient rows, m7 = pmulhrsw multiplier.
;   Scratch: m0-m6. m2 is carried into each following slot, so
;   TRANSPOSE_STORE_8x8 must leave m2 intact.
;   The bracketed number after each pmaddubsw is the r3 row offset plus 16; it
;   equals the table row index when r3 = ang_table + 16 * 16, as set up by
;   intra_pred_ang32_8.
;   Two "punpckhbw m2, m0, m1" instructions (after the r2 + 3 and r2 + 4
;   reloads) were removed: in both places m2 was overwritten before any read,
;   so the high-half interleave was dead.
| 6823 | %macro MODE_8_28 1 |
; ---- slot 0: 16-byte window at r2 + 1 ----
| 6824 | movu m0, [r2 + 1] |
; two-operand palignr: m1 bytes 0..14 = m0 bytes 1..15, byte 15 = previous m1 byte 0
| 6825 | palignr m1, m0, 1 |
; m2 / m0 = high / low window bytes, each interleaved with the byte that follows it
| 6826 | punpckhbw m2, m0, m1 |
| 6827 | punpcklbw m0, m1 |
; m2 = window advanced by one byte pair; used for the last two lines here and in slot 1
| 6828 | palignr m2, m0, 2 |
| 6829 | pmaddubsw m4, m0, [r3 - 11 * 16] ; [5] |
| 6830 | pmulhrsw m4, m7 |
| 6831 | pmaddubsw m3, m0, [r3 - 6 * 16] ; [10] |
| 6832 | pmulhrsw m3, m7 |
| 6833 | packuswb m4, m3 |
| 6834 | pmaddubsw m5, m0, [r3 - 1 * 16] ; [15] |
| 6835 | pmulhrsw m5, m7 |
| 6836 | pmaddubsw m6, m0, [r3 + 4 * 16] ; [20] |
| 6837 | pmulhrsw m6, m7 |
| 6838 | packuswb m5, m6 |
| 6839 | pmaddubsw m6, m0, [r3 + 9 * 16] ; [25] |
| 6840 | pmulhrsw m6, m7 |
| 6841 | pmaddubsw m0, [r3 + 14 * 16] ; [30] |
| 6842 | pmulhrsw m0, m7 |
| 6843 | packuswb m6, m0 |
| 6844 | pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] |
| 6845 | pmulhrsw m1, m7 |
| 6846 | pmaddubsw m0, m2, [r3 - 8 * 16] ; [8] |
| 6847 | pmulhrsw m0, m7 |
| 6848 | packuswb m1, m0 |
| 6849 | |
| 6850 | TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 |
| 6851 | |
; ---- slot 1: first four lines reuse m2 from slot 0, then the window is reloaded from r2 + 3 ----
| 6852 | pmaddubsw m4, m2, [r3 - 3 * 16] ; [13] |
| 6853 | pmulhrsw m4, m7 |
| 6854 | pmaddubsw m5, m2, [r3 + 2 * 16] ; [18] |
| 6855 | pmulhrsw m5, m7 |
| 6856 | packuswb m4, m5 |
| 6857 | pmaddubsw m5, m2, [r3 + 7 * 16] ; [23] |
| 6858 | pmulhrsw m5, m7 |
| 6859 | pmaddubsw m2, [r3 + 12 * 16] ; [28] |
| 6860 | pmulhrsw m2, m7 |
| 6861 | packuswb m5, m2 |
| 6862 | movu m0, [r2 + 3] |
| 6863 | palignr m1, m0, 1 |
; only the low-half interleave is needed for this window
| 6865 | punpcklbw m0, m1 |
| 6866 | pmaddubsw m6, m0, [r3 - 15 * 16] ; [01] |
| 6867 | pmulhrsw m6, m7 |
| 6868 | pmaddubsw m1, m0, [r3 - 10 * 16] ; [06] |
| 6869 | pmulhrsw m1, m7 |
| 6870 | packuswb m6, m1 |
| 6871 | pmaddubsw m1, m0, [r3 - 5 * 16] ; [11] |
| 6872 | pmulhrsw m1, m7 |
; keep an unmultiplied copy of the window in m2 for slot 2
| 6873 | mova m2, m0 |
| 6874 | pmaddubsw m0, [r3] ; [16] |
| 6875 | pmulhrsw m0, m7 |
| 6876 | packuswb m1, m0 |
| 6877 | |
| 6878 | TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 |
| 6879 | |
; ---- slot 2: first three lines reuse m2 from slot 1, then the window is reloaded from r2 + 4 ----
| 6880 | pmaddubsw m4, m2, [r3 + 5 * 16] ; [21] |
| 6881 | pmulhrsw m4, m7 |
| 6882 | pmaddubsw m5, m2, [r3 + 10 * 16] ; [26] |
| 6883 | pmulhrsw m5, m7 |
| 6884 | packuswb m4, m5 |
| 6885 | pmaddubsw m5, m2, [r3 + 15 * 16] ; [31] |
| 6886 | pmulhrsw m5, m7 |
| 6887 | movu m0, [r2 + 4] |
| 6888 | palignr m1, m0, 1 |
; only the low-half interleave is needed for this window
| 6890 | punpcklbw m0, m1 |
| 6891 | pmaddubsw m2, m0, [r3 - 12 * 16] ; [4] |
| 6892 | pmulhrsw m2, m7 |
| 6893 | packuswb m5, m2 |
| 6894 | pmaddubsw m6, m0, [r3 - 7 * 16] ; [9] |
| 6895 | pmulhrsw m6, m7 |
| 6896 | pmaddubsw m1, m0, [r3 - 2 * 16] ; [14] |
| 6897 | pmulhrsw m1, m7 |
| 6898 | packuswb m6, m1 |
| 6899 | pmaddubsw m1, m0, [r3 + 3 * 16] ; [19] |
| 6900 | pmulhrsw m1, m7 |
; keep an unmultiplied copy of the window in m2 for slot 3
| 6901 | mova m2, m0 |
| 6902 | pmaddubsw m0, [r3 + 8 * 16] ; [24] |
| 6903 | pmulhrsw m0, m7 |
| 6904 | packuswb m1, m0 |
| 6905 | |
| 6906 | TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 |
| 6907 | |
; ---- slot 3: first line reuses m2 from slot 2, then the window is reloaded from r2 + 5 ----
| 6908 | pmaddubsw m4, m2, [r3 + 13 * 16] ; [29] |
| 6909 | pmulhrsw m4, m7 |
| 6910 | movu m0, [r2 + 5] |
| 6911 | palignr m1, m0, 1 |
; NOTE(review): m2 is not read again inside this macro; this interleave looks dead too,
; but it is kept because TRANSPOSE_STORE_8x8 is not visible here - confirm before removing
| 6912 | punpckhbw m2, m0, m1 |
| 6913 | punpcklbw m0, m1 |
| 6914 | pmaddubsw m1, m0, [r3 - 14 * 16] ; [2] |
| 6915 | pmulhrsw m1, m7 |
| 6916 | packuswb m4, m1 |
| 6917 | pmaddubsw m5, m0, [r3 - 9 * 16] ; [7] |
| 6918 | pmulhrsw m5, m7 |
| 6919 | pmaddubsw m6, m0, [r3 - 4 * 16] ; [12] |
| 6920 | pmulhrsw m6, m7 |
| 6921 | packuswb m5, m6 |
| 6922 | pmaddubsw m6, m0, [r3 + 16] ; [17] |
| 6923 | pmulhrsw m6, m7 |
| 6924 | pmaddubsw m1, m0, [r3 + 6 * 16] ; [22] |
| 6925 | pmulhrsw m1, m7 |
| 6926 | packuswb m6, m1 |
| 6927 | pmaddubsw m1, m0, [r3 + 11 * 16] ; [27] |
| 6928 | pmulhrsw m1, m7 |
| 6929 | packuswb m1, m1 |
; last line of the slot is unfiltered: eight raw reference bytes go into the high half of m1
| 6930 | movhps m1, [r2 + 6] ; [00] |
| 6931 | |
| 6932 | TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 |
| 6933 | %endmacro |
| 6934 | ;------------------------------------------------------------------------------------------------------------------ |
| 6935 | ; void intraPredAng32_8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) |
| 6936 | ;------------------------------------------------------------------------------------------------------------------ |
| 6937 | INIT_XMM sse4 |
| 6938 | cglobal intra_pred_ang32_8, 3,7,8 ; only the first three arguments are loaded (r0, r1, r2) |
| 6939 | mova m7, [pw_1024] ; m7 = multiplier used by every pmulhrsw inside MODE_8_28 |
| 6940 | lea r3, [ang_table + 16 * 16] ; r3 = ang_table + 256 bytes; the macro addresses it as [r3 +/- k * 16] |
| 6941 | lea r5, [r1 * 3] ; r5 = 3 * r1 (not referenced below; presumably consumed by TRANSPOSE_STORE_8x8) |
| 6942 | lea r6, [r0 + r1 * 4] ; r6 = r0 + 4 * r1 |
| 6943 | mov r4d, 4 ; four passes of the macro |
| 6944 | .next_pass: |
| 6945 | MODE_8_28 1 |
| 6946 | add r2, 8 ; the next pass reads the reference 8 bytes further on |
| 6947 | lea r0, [r6 + r1 * 4] ; must come before the r6 update: it uses the old r6 |
| 6948 | lea r6, [r6 + r1 * 8] |
| 6949 | dec r4 |
| 6950 | jnz .next_pass |
| 6951 | RET |
| 6952 | |
; MODE_9_27 <flag>
;   Straight-line kernel: builds four groups ("slots" 0-3) of eight 8-byte
;   result lines from the reference bytes at r2 and hands each group, packed
;   two lines per register in m4, m5, m6, m1, to TRANSPOSE_STORE_8x8.
;   <flag> (%1) is forwarded untouched to TRANSPOSE_STORE_8x8, which is defined
;   outside this chunk.
;   Slots 0 and 1 share the window at r2 + 1 (coefficient rows 2..30 in steps
;   of 2); slots 2 and 3 repeat the same rows on the window at r2 + 2.
;   Inputs : r2 = reference bytes, r3 = base of a table of 16-byte pmaddubsw
;            coefficient rows, m7 = pmulhrsw multiplier.
;   Scratch: m0-m6. m2 is carried from slot 0 into slot 1, so
;   TRANSPOSE_STORE_8x8 must leave m2 intact.
;   The bracketed number after each pmaddubsw is the r3 row offset plus 16; it
;   equals the table row index when r3 = ang_table + 16 * 16, as set up by
;   intra_pred_ang32_9.
;   The former "punpckhbw m0, m2, m1" after the first load was removed: m0 was
;   overwritten by the [16] pmaddubsw before any read, so it was dead.
| 6953 | %macro MODE_9_27 1 |
; ---- slot 0: 16-byte window at r2 + 1 ----
| 6954 | movu m2, [r2 + 1] |
; two-operand palignr: m1 bytes 0..14 = m2 bytes 1..15, byte 15 = previous m1 byte 0
| 6955 | palignr m1, m2, 1 |
; m2 = low eight window bytes, each interleaved with the byte that follows it
| 6957 | punpcklbw m2, m1 |
| 6958 | pmaddubsw m4, m2, [r3 - 14 * 16] ; [2] |
| 6959 | pmulhrsw m4, m7 |
| 6960 | pmaddubsw m3, m2, [r3 - 12 * 16] ; [4] |
| 6961 | pmulhrsw m3, m7 |
| 6962 | packuswb m4, m3 |
| 6963 | pmaddubsw m5, m2, [r3 - 10 * 16] ; [6] |
| 6964 | pmulhrsw m5, m7 |
| 6965 | pmaddubsw m6, m2, [r3 - 8 * 16] ; [8] |
| 6966 | pmulhrsw m6, m7 |
| 6967 | packuswb m5, m6 |
| 6968 | pmaddubsw m6, m2, [r3 - 6 * 16] ; [10] |
| 6969 | pmulhrsw m6, m7 |
| 6970 | pmaddubsw m3, m2, [r3 - 4 * 16] ; [12] |
| 6971 | pmulhrsw m3, m7 |
| 6972 | packuswb m6, m3 |
| 6973 | pmaddubsw m1, m2, [r3 - 2 * 16] ; [14] |
| 6974 | pmulhrsw m1, m7 |
| 6975 | pmaddubsw m0, m2, [r3] ; [16] |
| 6976 | pmulhrsw m0, m7 |
| 6977 | packuswb m1, m0 |
| 6978 | |
| 6979 | TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 |
| 6980 | |
; ---- slot 1: same window, still held in m2 from slot 0 ----
| 6981 | pmaddubsw m4, m2, [r3 + 2 * 16] ; [18] |
| 6982 | pmulhrsw m4, m7 |
| 6983 | pmaddubsw m5, m2, [r3 + 4 * 16] ; [20] |
| 6984 | pmulhrsw m5, m7 |
| 6985 | packuswb m4, m5 |
| 6986 | pmaddubsw m5, m2, [r3 + 6 * 16] ; [22] |
| 6987 | pmulhrsw m5, m7 |
| 6988 | pmaddubsw m6, m2, [r3 + 8 * 16] ; [24] |
| 6989 | pmulhrsw m6, m7 |
| 6990 | packuswb m5, m6 |
| 6991 | pmaddubsw m6, m2, [r3 + 10 * 16] ; [26] |
| 6992 | pmulhrsw m6, m7 |
| 6993 | pmaddubsw m1, m2, [r3 + 12 * 16] ; [28] |
| 6994 | pmulhrsw m1, m7 |
| 6995 | packuswb m6, m1 |
| 6996 | pmaddubsw m1, m2, [r3 + 14 * 16] ; [30] |
| 6997 | pmulhrsw m1, m7 |
| 6998 | packuswb m1, m1 |
; last line of the slot is unfiltered: eight raw reference bytes go into the high half of m1
| 6999 | movhps m1, [r2 + 2] ; [00] |
| 7000 | |
| 7001 | TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 |
| 7002 | |
; ---- slot 2: 16-byte window at r2 + 2 ----
| 7003 | movu m2, [r2 + 2] |
| 7004 | palignr m1, m2, 1 |
| 7005 | punpcklbw m2, m1 |
| 7006 | pmaddubsw m4, m2, [r3 - 14 * 16] ; [2] |
| 7007 | pmulhrsw m4, m7 |
| 7008 | pmaddubsw m3, m2, [r3 - 12 * 16] ; [4] |
| 7009 | pmulhrsw m3, m7 |
| 7010 | packuswb m4, m3 |
| 7011 | pmaddubsw m5, m2, [r3 - 10 * 16] ; [6] |
| 7012 | pmulhrsw m5, m7 |
| 7013 | pmaddubsw m6, m2, [r3 - 8 * 16] ; [8] |
| 7014 | pmulhrsw m6, m7 |
| 7015 | packuswb m5, m6 |
| 7016 | pmaddubsw m6, m2, [r3 - 6 * 16] ; [10] |
| 7017 | pmulhrsw m6, m7 |
| 7018 | pmaddubsw m0, m2, [r3 - 4 * 16] ; [12] |
| 7019 | pmulhrsw m0, m7 |
| 7020 | packuswb m6, m0 |
| 7021 | pmaddubsw m1, m2, [r3 - 2 * 16] ; [14] |
| 7022 | pmulhrsw m1, m7 |
| 7023 | pmaddubsw m0, m2, [r3] ; [16] |
| 7024 | pmulhrsw m0, m7 |
| 7025 | packuswb m1, m0 |
| 7026 | |
| 7027 | TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 |
| 7028 | |
; ---- slot 3: same window as slot 2 ----
; NOTE(review): m2 already holds this interleaved window if TRANSPOSE_STORE_8x8 preserves
; m2 (slot 1 relies on that); the three-instruction reload looks redundant - confirm
| 7029 | movu m2, [r2 + 2] |
| 7030 | palignr m1, m2, 1 |
| 7031 | punpcklbw m2, m1 |
| 7032 | pmaddubsw m4, m2, [r3 + 2 * 16] ; [18] |
| 7033 | pmulhrsw m4, m7 |
| 7034 | pmaddubsw m5, m2, [r3 + 4 * 16] ; [20] |
| 7035 | pmulhrsw m5, m7 |
| 7036 | packuswb m4, m5 |
| 7037 | pmaddubsw m5, m2, [r3 + 6 * 16] ; [22] |
| 7038 | pmulhrsw m5, m7 |
| 7039 | pmaddubsw m6, m2, [r3 + 8 * 16] ; [24] |
| 7040 | pmulhrsw m6, m7 |
| 7041 | packuswb m5, m6 |
| 7042 | pmaddubsw m6, m2, [r3 + 10 * 16] ; [26] |
| 7043 | pmulhrsw m6, m7 |
| 7044 | pmaddubsw m1, m2, [r3 + 12 * 16] ; [28] |
| 7045 | pmulhrsw m1, m7 |
| 7046 | packuswb m6, m1 |
| 7047 | pmaddubsw m1, m2, [r3 + 14 * 16] ; [30] |
| 7048 | pmulhrsw m1, m7 |
| 7049 | packuswb m1, m1 |
; last line of the slot is unfiltered: eight raw reference bytes go into the high half of m1
| 7050 | movhps m1, [r2 + 3] ; [00] |
| 7051 | |
| 7052 | TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 |
| 7053 | %endmacro |
| 7054 | ;------------------------------------------------------------------------------------------------------------------ |
| 7055 | ; void intraPredAng32_9(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) |
| 7056 | ;------------------------------------------------------------------------------------------------------------------ |
| 7057 | INIT_XMM sse4 |
| 7058 | cglobal intra_pred_ang32_9, 3,7,8 ; only the first three arguments are loaded (r0, r1, r2) |
| 7059 | mova m7, [pw_1024] ; m7 = multiplier used by every pmulhrsw inside MODE_9_27 |
| 7060 | lea r3, [ang_table + 16 * 16] ; r3 = ang_table + 256 bytes; the macro addresses it as [r3 +/- k * 16] |
| 7061 | lea r5, [r1 * 3] ; r5 = 3 * r1 (not referenced below; presumably consumed by TRANSPOSE_STORE_8x8) |
| 7062 | lea r6, [r0 + r1 * 4] ; r6 = r0 + 4 * r1 |
| 7063 | mov r4d, 4 ; four passes of the macro |
| 7064 | .next_pass: |
| 7065 | MODE_9_27 1 |
| 7066 | add r2, 8 ; the next pass reads the reference 8 bytes further on |
| 7067 | lea r0, [r6 + r1 * 4] ; must come before the r6 update: it uses the old r6 |
| 7068 | lea r6, [r6 + r1 * 8] |
| 7069 | dec r4 |
| 7070 | jnz .next_pass |
| 7071 | RET |
| 7072 | |
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_10(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; 32x32 intra prediction, mode 10 (SSE4): output row y is the single byte
; refLeft[1 + y] replicated across all 32 columns.
;
; Register roles:
;   r0     dst pointer for the current 16-row pass
;   r1     dstStride                       r4  3 * dstStride
;   r2     refLeft, advanced by 16 per pass
;   r3     refAbove during setup, afterwards bFilter (copied from r5d)
;   r5     bFilter on entry, afterwards a scratch row pointer
;   r6d    pass counter (2 passes x 16 rows)
;   m7     all-zero pshufb mask: broadcasts byte 0 of the destination register
;   m8/m9  stack slots (not registers) holding refAbove[0..15] / refAbove[1..16]
;
; "palignr mX, m0, k" puts byte k of m0 into byte 0 of mX; the upper bytes come
; from stale mX contents and are discarded by the following broadcast.
;
; NOTE(review): the bFilter path below computes only 16 filtered columns and
; stores them to both halves of the row, and it is executed on both passes
; (rows 0 and 16). It looks like a carry-over from a 16-wide version; presumably
; callers never pass bFilter != 0 for 32x32 blocks - confirm before relying on it.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_10, 6,7,8,0-(2*mmsize)
%define m8 [rsp + 0 * mmsize]
%define m9 [rsp + 1 * mmsize]
    lea         r4, [r1 * 3]                ; r4 = 3 * stride
    pxor        m7, m7                      ; broadcast-byte-0 shuffle mask
    mov         r6d, 2                      ; two passes of 16 rows
    movu        m0, [r3]
    movu        m1, [r3 + 1]
    mova        m8, m0                      ; keep refAbove[0..15] for the filter
    mova        m9, m1                      ; keep refAbove[1..16] for the filter
    mov         r3d, r5d                    ; r3d = bFilter (r5 is reused as a row pointer)

.loop:
    movu        m0, [r2 + 1]                ; 16 left-reference pixels for this pass

    ; rows 1..6
    palignr     m1, m0, 1
    pshufb      m1, m7
    palignr     m2, m0, 2
    pshufb      m2, m7
    palignr     m3, m0, 3
    pshufb      m3, m7
    palignr     m4, m0, 4
    pshufb      m4, m7
    palignr     m5, m0, 5
    pshufb      m5, m7
    palignr     m6, m0, 6
    pshufb      m6, m7

    movu        [r0 + r1], m1
    movu        [r0 + r1 + 16], m1
    movu        [r0 + r1 * 2], m2
    movu        [r0 + r1 * 2 + 16], m2
    movu        [r0 + r4], m3
    movu        [r0 + r4 + 16], m3
    lea         r5, [r0 + r1 * 4]           ; r5 -> row 4
    movu        [r5], m4
    movu        [r5 + 16], m4
    movu        [r5 + r1], m5
    movu        [r5 + r1 + 16], m5
    movu        [r5 + r1 * 2], m6
    movu        [r5 + r1 * 2 + 16], m6

    ; rows 7..12 (row 8 takes byte 8 via movhlps instead of palignr)
    palignr     m1, m0, 7
    pshufb      m1, m7
    movhlps     m2, m0
    pshufb      m2, m7
    palignr     m3, m0, 9
    pshufb      m3, m7
    palignr     m4, m0, 10
    pshufb      m4, m7
    palignr     m5, m0, 11
    pshufb      m5, m7
    palignr     m6, m0, 12
    pshufb      m6, m7

    movu        [r5 + r4], m1
    movu        [r5 + r4 + 16], m1
    lea         r5, [r5 + r1 * 4]           ; r5 -> row 8
    movu        [r5], m2
    movu        [r5 + 16], m2
    movu        [r5 + r1], m3
    movu        [r5 + r1 + 16], m3
    movu        [r5 + r1 * 2], m4
    movu        [r5 + r1 * 2 + 16], m4
    movu        [r5 + r4], m5
    movu        [r5 + r4 + 16], m5
    lea         r5, [r5 + r1 * 4]           ; r5 -> row 12
    movu        [r5], m6
    movu        [r5 + 16], m6

    ; rows 13..15, then row 0 (m0 itself is broadcast last because the
    ; palignr instructions above still need its original contents)
    palignr     m1, m0, 13
    pshufb      m1, m7
    palignr     m2, m0, 14
    pshufb      m2, m7
    palignr     m3, m0, 15
    pshufb      m3, m7
    pshufb      m0, m7

    movu        [r5 + r1], m1
    movu        [r5 + r1 + 16], m1
    movu        [r5 + r1 * 2], m2
    movu        [r5 + r1 * 2 + 16], m2
    movu        [r5 + r4], m3
    movu        [r5 + r4 + 16], m3

    ; optional filter of the pass's first row:
    ;   row[x] = clip8(pixel + ((refAbove[x + 1] - refAbove[0]) >> 1)), x = 0..15
    test        r3d, r3d
    jz         .quit
    pmovzxbw    m0, m0                      ; 8 words, all equal to the row pixel
    mova        m1, m0                      ; same value for columns 8..15
    movu        m2, m8
    movu        m3, m9

    pshufb      m2, m7                      ; broadcast refAbove[0]
    pmovzxbw    m2, m2
    movhlps     m4, m3                      ; refAbove[9..16]
    pmovzxbw    m3, m3                      ; refAbove[1..8] as words
    pmovzxbw    m4, m4
    psubw       m3, m2
    psubw       m4, m2
    psraw       m3, 1
    psraw       m4, 1
    paddw       m0, m3
    paddw       m1, m4
    packuswb    m0, m1                      ; saturate back to 16 bytes

.quit:
    movu        [r0], m0
    movu        [r0 + 16], m0
    dec         r6d
    lea         r0, [r5 + r1 * 4]           ; lea keeps the flags from dec; r0 -> row 16
    lea         r2, [r2 + 16]               ; next 16 reference pixels
    jnz        .loop
    RET
| 7191 | |
;-------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_11(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; 32x32 angular intra prediction, mode 11 (SSE4).
;   r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove on entry.
;
; A 64-byte aligned scratch area is carved out of the stack by hand:
;   [rsp +  0]       refAbove[16] replicated (byte 0 survives as the pixel at r2 - 1)
;   [rsp +  1 .. 48] copy of refLeft[0..47]
;   [rsp + 63]       loop counter (4 passes)
;   [rsp + 64]       caller's rsp, restored before RET
;
; Inside the loop m0-m7 are filled with the source vectors for eight rows and
; handed to PROC32_8x8 (defined elsewhere in this file; its arguments appear to
; be column-group index, transpose flag and eight ang_table indices).
;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_11, 4,7,8
    ; align the scratch area to 64 bytes so all local data shares one cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6                ; remember the original stack pointer

    ; collect reference pixels (the stores overlap, so their order matters)
    movu        m0, [r3 + 16]
    pxor        m1, m1
    pshufb      m0, m1                      ; broadcast refAbove[16]
    mova        [rsp], m0
    movu        m0, [r2]
    movu        m1, [r2 + 16]
    movu        m2, [r2 + 32]
    movu        [rsp + 1], m0
    movu        [rsp + 1 + 16], m1
    movu        [rsp + 1 + 32], m2
    mov         byte [rsp + 63], 4          ; pass counter lives in the scratch area

    ; pointers consumed by PROC32_8x8
    lea         r2, [rsp + 1]               ; r2 -> copied refLeft[0]
    lea         r3, [c_shuf8_0]             ; r3 -> shuffle8
    lea         r4, [ang_table]             ; r4 -> ang_table
    lea         r5, [r1 * 3]                ; r5 = 3 * stride
    lea         r6, [r0 + r1 * 4]           ; r6 = dst + 4 * stride
    ; m5/m6 are not pre-loaded: every group below overwrites them from m7
    ; before PROC32_8x8 is expanded.

.loop:
    ; Row[0 - 7]
    movu        m7, [r2]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  0, 1, 30,28,26,24,22,20,18,16

    ; Row[8 - 15]
    movu        m7, [r2]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  1, 1, 14,12,10,8,6,4,2,0

    ; Row[16 - 23] (source window starts one pixel earlier)
    movu        m7, [r2 - 1]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  2, 1, 30,28,26,24,22,20,18,16

    ; Row[24 - 31]
    movu        m7, [r2 - 1]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  3, 1, 14,12,10,8,6,4,2,0

    lea         r0, [r6 + r1 * 4]           ; r0 <- r6 + 4 * stride
    lea         r6, [r6 + r1 * 8]           ; r6 <- r6 + 8 * stride
    add         r2, 8                       ; next 8 reference pixels
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]               ; undo the manual stack alignment
    RET
| 7278 | |
; MODE_12_24_ROW0 flag
;
; First 8-pixel strip of the 32x32 mode 12/24 predictor; the only strip that
; needs pixels from the side reference. %1 is passed through unchanged as the
; second argument of TRANSPOSE_STORE_8x8 (defined elsewhere in this file).
;
; Expected on entry (as set up by intra_pred_ang32_12):
;   r2    -> main reference pixels
;   r3    -> side reference pixels
;   r4    = ang_table + 16 * 16, so [r4 + k * 16] is table entry 16 + k
;   m7    = pw_1024 (multiplier for pmulhrsw)
;   above = 16-byte aligned scratch slot
; Clobbers m0-m6 and the 'above' slot.
;
; Bracketed numbers in the trailing comments are the ang_table entry used
; (presumably the interpolation fraction of that output line).
; Every packuswb joins two 8-pixel results, so each TRANSPOSE_STORE_8x8 call
; receives eight 8-pixel vectors in m4, m5, m6, m1.
%macro MODE_12_24_ROW0 1
    ; build 'above': bytes 12..15 = r3[26], r3[19], r3[13], r3[6]
    movu m0, [r3 + 6]
    pshufb m0, [c_mode32_12_0]
    pinsrb m0, [r3 + 26], 12
    mova above, m0
    ; m2 = interleaved pairs (p[k], p[k+1]) starting at [r2]
    movu m2, [r2]
    palignr m1, m2, 1
    punpcklbw m2, m1
    pmaddubsw m4, m2, [r4 + 11 * 16] ; [27]
    pmulhrsw m4, m7
    pmaddubsw m3, m2, [r4 + 6 * 16] ; [22]
    pmulhrsw m3, m7
    packuswb m4, m3
    pmaddubsw m5, m2, [r4 + 16] ; [17]
    pmulhrsw m5, m7
    pmaddubsw m6, m2, [r4 - 4 * 16] ; [12]
    pmulhrsw m6, m7
    packuswb m5, m6
    pmaddubsw m6, m2, [r4 - 9 * 16] ; [7]
    pmulhrsw m6, m7
    pmaddubsw m3, m2, [r4 - 14 * 16] ; [2]
    pmulhrsw m3, m7
    packuswb m6, m3
    ; step the window back by one pixel: byte 15 of 'above' becomes the new first pixel
    movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
    punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
    pmaddubsw m1, m2, [r4 + 13 * 16] ; [29]
    pmulhrsw m1, m7
    pmaddubsw m3, m2, [r4 + 8 * 16] ; [24]
    pmulhrsw m3, m7
    packuswb m1, m3
    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
    pmaddubsw m4, m2, [r4 + 3 * 16] ; [19]
    pmulhrsw m4, m7
    pmaddubsw m5, m2, [r4 - 2 * 16] ; [14]
    pmulhrsw m5, m7
    packuswb m4, m5
    pmaddubsw m5, m2, [r4 - 7 * 16] ; [09]
    pmulhrsw m5, m7
    pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
    pmulhrsw m6, m7
    packuswb m5, m6
    ; shift one more pair in from the top of 'above' (bytes 14, 15)
    palignr m2, above, 14 ;[6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
    pmaddubsw m6, m2, [r4 + 15 * 16] ; [31]
    pmulhrsw m6, m7
    pmaddubsw m1, m2, [r4 + 10 * 16] ; [26]
    pmulhrsw m1, m7
    packuswb m6, m1
    pmaddubsw m1, m2, [r4 + 5 * 16] ; [21]
    pmulhrsw m1, m7
    pmaddubsw m3, m2, [r4] ; [16]
    pmulhrsw m3, m7
    packuswb m1, m3
    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
    pmaddubsw m4, m2, [r4 - 5 * 16] ; [11]
    pmulhrsw m4, m7
    pmaddubsw m3, m2, [r4 - 10 * 16] ; [06]
    pmulhrsw m3, m7
    packuswb m4, m3
    pmaddubsw m5, m2, [r4 - 15 * 16] ; [1]
    pmulhrsw m5, m7
    ; next pair: 'above' shifted up by one byte, so its bytes 13, 14 are inserted
    pslldq m1, above, 1
    palignr m2, m1, 14
    pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
    pmulhrsw m6, m7
    packuswb m5, m6
    pmaddubsw m6, m2, [r4 + 7 * 16] ; [23]
    pmulhrsw m6, m7
    pmaddubsw m3, m2, [r4 + 2 * 16] ; [18]
    pmulhrsw m3, m7
    packuswb m6, m3
    pmaddubsw m1, m2, [r4 - 3 * 16] ; [13]
    pmulhrsw m1, m7
    pmaddubsw m3, m2, [r4 - 8 * 16] ; [8]
    pmulhrsw m3, m7
    packuswb m1, m3
    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
    pmaddubsw m4, m2, [r4 - 13 * 16] ; [3]
    pmulhrsw m4, m7
    ; last pair: 'above' shifted up by two bytes, so its bytes 12, 13 are inserted
    pslldq m1, above, 2
    palignr m2, m1, 14
    pmaddubsw m5, m2, [r4 + 14 * 16] ; [30]
    pmulhrsw m5, m7
    packuswb m4, m5
    pmaddubsw m5, m2, [r4 + 9 * 16] ; [25]
    pmulhrsw m5, m7
    pmaddubsw m6, m2, [r4 + 4 * 16] ; [20]
    pmulhrsw m6, m7
    packuswb m5, m6
    pmaddubsw m6, m2, [r4 - 16] ; [15]
    pmulhrsw m6, m7
    pmaddubsw m1, m2, [r4 - 6 * 16] ; [10]
    pmulhrsw m1, m7
    packuswb m6, m1
    pmaddubsw m1, m2, [r4 - 11 * 16] ; [05]
    pmulhrsw m1, m7
    ; final line needs no multiply: pb_fact0 keeps the even bytes of m2, i.e.
    ; the first pixel of every pair, which is then widened and packed like the others
    movu m0, [pb_fact0]
    pshufb m2, m0
    pmovzxbw m2, m2
    packuswb m1, m2
    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
| 7381 | |
; MODE_12_24 flag
;
; One 8-pixel strip of the 32x32 mode 12/24 predictor for strips whose source
; window lies entirely inside the main reference (all loads are [r2 - 4]..[r2 + 16]).
; %1 is passed through unchanged as the second argument of TRANSPOSE_STORE_8x8
; (defined elsewhere in this file).
;
; Expected on entry (as set up by intra_pred_ang32_12):
;   r2 -> main reference pixels for this strip
;   r4 =  ang_table + 16 * 16, so [r4 + k * 16] is table entry 16 + k
;   m7 =  pw_1024 (multiplier for pmulhrsw)
; Clobbers m0-m6.
;
; Recurring idiom: load 16 pixels, interleave them with the same pixels shifted
; by one (punpcklbw/punpckhbw), then "palignr hi, lo, 2" drops the first pair,
; yielding pairs (p[k], p[k+1]) that start one pixel after the load address.
; Bracketed numbers in the trailing comments are the ang_table entry used.
%macro MODE_12_24 1
    ; m0 = pairs starting at [r2 + 1], m2 = pairs starting at [r2]
    movu m2, [r2]
    palignr m1, m2, 1
    punpckhbw m0, m2, m1
    punpcklbw m2, m1
    palignr m0, m2, 2
    pmaddubsw m4, m0, [r4 + 11 * 16] ; [27]
    pmulhrsw m4, m7
    pmaddubsw m3, m0, [r4 + 6 * 16] ; [22]
    pmulhrsw m3, m7
    packuswb m4, m3
    pmaddubsw m5, m0, [r4 + 16] ; [17]
    pmulhrsw m5, m7
    pmaddubsw m6, m0, [r4 - 4 * 16] ; [12]
    pmulhrsw m6, m7
    packuswb m5, m6
    pmaddubsw m6, m0, [r4 - 9 * 16] ; [7]
    pmulhrsw m6, m7
    pmaddubsw m3, m0, [r4 - 14 * 16] ; [2]
    pmulhrsw m3, m7
    packuswb m6, m3
    pmaddubsw m1, m2, [r4 + 13 * 16] ; [29]
    pmulhrsw m1, m7
    pmaddubsw m3, m2, [r4 + 8 * 16] ; [24]
    pmulhrsw m3, m7
    packuswb m1, m3
    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
    pmaddubsw m4, m2, [r4 + 3 * 16] ; [19]
    pmulhrsw m4, m7
    pmaddubsw m5, m2, [r4 - 2 * 16] ; [14]
    pmulhrsw m5, m7
    packuswb m4, m5
    pmaddubsw m5, m2, [r4 - 7 * 16] ; [09]
    pmulhrsw m5, m7
    pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
    pmulhrsw m6, m7
    packuswb m5, m6
    ; m2 = pairs starting at [r2 - 1]
    movu m0, [r2 - 2]
    palignr m1, m0, 1
    punpckhbw m2, m0, m1
    punpcklbw m0, m1
    palignr m2, m0, 2
    pmaddubsw m6, m2, [r4 + 15 * 16] ; [31]
    pmulhrsw m6, m7
    pmaddubsw m1, m2, [r4 + 10 * 16] ; [26]
    pmulhrsw m1, m7
    packuswb m6, m1
    pmaddubsw m1, m2, [r4 + 5 * 16] ; [21]
    pmulhrsw m1, m7
    pmaddubsw m3, m2, [r4] ; [16]
    pmulhrsw m3, m7
    packuswb m1, m3
    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
    pmaddubsw m4, m2, [r4 - 5 * 16] ; [11]
    pmulhrsw m4, m7
    pmaddubsw m3, m2, [r4 - 10 * 16] ; [06]
    pmulhrsw m3, m7
    packuswb m4, m3
    pmaddubsw m5, m2, [r4 - 15 * 16] ; [1]
    pmulhrsw m5, m7
    ; m2 = pairs starting at [r2 - 2]
    movu m0, [r2 - 3]
    palignr m1, m0, 1
    punpckhbw m2, m0, m1
    punpcklbw m0, m1
    palignr m2, m0, 2
    pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
    pmulhrsw m6, m7
    packuswb m5, m6
    pmaddubsw m6, m2, [r4 + 7 * 16] ; [23]
    pmulhrsw m6, m7
    pmaddubsw m3, m2, [r4 + 2 * 16] ; [18]
    pmulhrsw m3, m7
    packuswb m6, m3
    pmaddubsw m1, m2, [r4 - 3 * 16] ; [13]
    pmulhrsw m1, m7
    pmaddubsw m3, m2, [r4 - 8 * 16] ; [8]
    pmulhrsw m3, m7
    packuswb m1, m3
    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
    pmaddubsw m4, m2, [r4 - 13 * 16] ; [3]
    pmulhrsw m4, m7
    ; m0 = pairs starting at [r2 - 3]
    movu m2, [r2 - 4]
    palignr m1, m2, 1
    punpckhbw m0, m2, m1
    punpcklbw m2, m1
    palignr m0, m2, 2
    pmaddubsw m5, m0, [r4 + 14 * 16] ; [30]
    pmulhrsw m5, m7
    packuswb m4, m5
    pmaddubsw m5, m0, [r4 + 9 * 16] ; [25]
    pmulhrsw m5, m7
    pmaddubsw m6, m0, [r4 + 4 * 16] ; [20]
    pmulhrsw m6, m7
    packuswb m5, m6
    pmaddubsw m6, m0, [r4 - 16] ; [15]
    pmulhrsw m6, m7
    pmaddubsw m1, m0, [r4 - 6 * 16] ; [10]
    pmulhrsw m1, m7
    packuswb m6, m1
    pmaddubsw m1, m0, [r4 - 11 * 16] ; [05]
    pmulhrsw m1, m7
    ; final line needs no multiply: pb_fact0 keeps the even bytes of m0, i.e.
    ; the first pixel of every pair, which is then widened and packed like the others
    movu m2, [pb_fact0]
    pshufb m0, m2
    pmovzxbw m0, m0
    packuswb m1, m0
    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
;-----------------------------------------------------------------------------------------------------------------
; void intraPredAng32_12(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; 32x32 angular intra prediction, mode 12 (SSE4).
;   r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove on entry.
; One mmsize stack slot ('above') is reserved for MODE_12_24_ROW0.
; The first strip is produced by MODE_12_24_ROW0 (the only one reading r3);
; the remaining three strips go through MODE_12_24 in a counted loop, with r3
; recycled as the counter. Between strips r0/r6 move down 8 rows and r2
; advances by 7 bytes after the first strip, 8 bytes afterwards.
;-----------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_12, 4,7,8,0-(1*mmsize)
%define above [rsp + 0 * mmsize]

    mova        m7, [pw_1024]               ; multiplier used with pmulhrsw in the macros
    lea         r4, [ang_table + 16 * 16]   ; table base, biased by 16 entries
    lea         r5, [r1 * 3]                ; r5 = 3 * stride
    lea         r6, [r0 + r1 * 4]           ; r6 = dst + 4 * stride

    ; strip 0
    MODE_12_24_ROW0 1
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 7

    ; strips 1..3
    mov         r3d, 3
.loop:
    MODE_12_24  1
    lea         r0, [r6 + r1 * 4]           ; r0 <- r6 + 4 * stride
    lea         r6, [r6 + r1 * 8]           ; r6 <- r6 + 8 * stride
    add         r2, 8
    dec         r3d
    jnz        .loop
    RET
| 7514 | |
; MODE_13_23_ROW0 flag
;
; First 8-pixel strip of the 32x32 mode 13/23 predictor; the only strip that
; needs pixels from the side reference. %1 is passed through unchanged as the
; second argument of TRANSPOSE_STORE_8x8 (defined elsewhere in this file).
;
; Expected on entry (as set up by intra_pred_ang32_13):
;   r2    -> main reference pixels
;   r3    -> side reference pixels
;   r4    = ang_table + 16 * 16, so [r4 + k * 16] is table entry 16 + k
;   m7    = pw_1024 (multiplier for pmulhrsw)
;   above = 16-byte aligned scratch slot
; Clobbers m0-m6 and the 'above' slot.
;
; The window of interleaved pixel pairs in m2 is stepped backwards one pixel at
; a time; each step inserts the next side-reference byte taken from the top of
; 'above' ("pslldq m0, above, n" followed by "palignr m2, m0, 14").
; Bracketed numbers in the trailing comments are the ang_table entry used.
%macro MODE_13_23_ROW0 1
    ; build 'above': bytes 8..15 = r3[28], r3[25], r3[21], r3[18], r3[14], r3[11], r3[7], r3[4]
    movu m0, [r3 + 1]
    movu m1, [r3 + 15]
    pshufb m0, [c_mode32_13_0]
    pshufb m1, [c_mode32_13_0]
    punpckldq m0, m1
    pshufb m0, [c_mode32_13_shuf]
    mova above, m0
    ; m2 = interleaved pairs (p[k], p[k+1]) starting at [r2]
    movu m2, [r2]
    palignr m1, m2, 1
    punpcklbw m2, m1
    pmaddubsw m4, m2, [r4 + 7 * 16] ; [23]
    pmulhrsw m4, m7
    pmaddubsw m3, m2, [r4 - 2 * 16] ; [14]
    pmulhrsw m3, m7
    packuswb m4, m3
    pmaddubsw m5, m2, [r4 - 11 * 16] ; [5]
    pmulhrsw m5, m7
    ; step the window back by one pixel: byte 15 of 'above' becomes the new first pixel
    movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
    punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0]
    pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
    pmulhrsw m6, m7
    packuswb m5, m6
    pmaddubsw m6, m2, [r4 + 3 * 16] ; [19]
    pmulhrsw m6, m7
    pmaddubsw m0, m2, [r4 - 6 * 16] ; [10]
    pmulhrsw m0, m7
    packuswb m6, m0
    pmaddubsw m1, m2, [r4 - 15 * 16] ; [1]
    pmulhrsw m1, m7
    ; insert bytes 14, 15 of 'above'
    palignr m2, above, 14
    pmaddubsw m3, m2, [r4 + 8 * 16] ; [24]
    pmulhrsw m3, m7
    packuswb m1, m3
    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
    pmaddubsw m4, m2, [r4 - 16] ; [15]
    pmulhrsw m4, m7
    pmaddubsw m5, m2, [r4 - 10 * 16] ; [6]
    pmulhrsw m5, m7
    packuswb m4, m5
    ; m0 = 'above' << 1 byte: inserts bytes 13, 14 of 'above'
    pslldq m0, above, 1
    palignr m2, m0, 14
    pmaddubsw m5, m2, [r4 + 13 * 16] ; [29]
    pmulhrsw m5, m7
    pmaddubsw m6, m2, [r4 + 4 * 16] ; [20]
    pmulhrsw m6, m7
    packuswb m5, m6
    pmaddubsw m6, m2, [r4 - 5 * 16] ; [11]
    pmulhrsw m6, m7
    pmaddubsw m1, m2, [r4 - 14 * 16] ; [2]
    pmulhrsw m1, m7
    packuswb m6, m1
    ; m0 still holds 'above' << 1; one more byte makes it 'above' << 2
    pslldq m0, 1
    palignr m2, m0, 14
    pmaddubsw m1, m2, [r4 + 9 * 16] ; [25]
    pmulhrsw m1, m7
    pmaddubsw m0, m2, [r4] ; [16]
    pmulhrsw m0, m7
    packuswb m1, m0
    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
    pmaddubsw m4, m2, [r4 - 9 * 16] ; [7]
    pmulhrsw m4, m7
    ; m0 was overwritten above, so the shifted copy is rebuilt: 'above' << 3
    pslldq m0, above, 3
    palignr m2, m0, 14
    pmaddubsw m3, m2, [r4 + 14 * 16] ; [30]
    pmulhrsw m3, m7
    packuswb m4, m3
    pmaddubsw m5, m2, [r4 + 5 * 16] ; [21]
    pmulhrsw m5, m7
    pmaddubsw m6, m2, [r4 - 4 * 16] ; [12]
    pmulhrsw m6, m7
    packuswb m5, m6
    pmaddubsw m6, m2, [r4 - 13 * 16] ; [3]
    pmulhrsw m6, m7
    ; 'above' << 4
    pslldq m0, 1
    palignr m2, m0, 14
    pmaddubsw m0, m2, [r4 + 10 * 16] ; [26]
    pmulhrsw m0, m7
    packuswb m6, m0
    pmaddubsw m1, m2, [r4 + 16] ; [17]
    pmulhrsw m1, m7
    pmaddubsw m0, m2, [r4 - 8 * 16] ; [8]
    pmulhrsw m0, m7
    packuswb m1, m0
    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
    ; 'above' << 5
    pslldq m0, above, 5
    palignr m2, m0, 14
    pmaddubsw m4, m2, [r4 + 15 * 16] ; [31]
    pmulhrsw m4, m7
    pmaddubsw m5, m2, [r4 + 6 * 16] ; [22]
    pmulhrsw m5, m7
    packuswb m4, m5
    pmaddubsw m5, m2, [r4 - 3 * 16] ; [13]
    pmulhrsw m5, m7
    pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
    pmulhrsw m6, m7
    packuswb m5, m6
    ; 'above' << 6
    pslldq m0, 1
    palignr m2, m0, 14
    pmaddubsw m6, m2, [r4 + 11 * 16] ; [27]
    pmulhrsw m6, m7
    pmaddubsw m1, m2, [r4 + 2 * 16] ; [18]
    pmulhrsw m1, m7
    packuswb m6, m1
    pmaddubsw m1, m2, [r4 - 7 * 16] ; [09]
    pmulhrsw m1, m7
    ; unlike the sibling macros, the last line here goes through table entry 0
    ; instead of the pb_fact0 shortcut
    pmaddubsw m3, m2, [r4 - 16 * 16] ; [00]
    pmulhrsw m3, m7
    packuswb m1, m3
    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
| 7627 | |
; MODE_13_23 flag
;
; One 8-pixel strip of the 32x32 mode 13/23 predictor for strips whose source
; window lies entirely inside the main reference (all loads are [r2 - 7]..[r2 + 16]).
; %1 is passed through unchanged as the second argument of TRANSPOSE_STORE_8x8
; (defined elsewhere in this file).
;
; Expected on entry (as set up by intra_pred_ang32_13):
;   r2 -> main reference pixels for this strip
;   r4 =  ang_table + 16 * 16, so [r4 + k * 16] is table entry 16 + k
;   m7 =  pw_1024 (multiplier for pmulhrsw)
; Clobbers m0-m6.
;
; Each reload produces two windows of interleaved pairs (p[k], p[k+1]): the low
; interleave starts at the load address, the palignr-by-2 result starts one
; pixel later. The lane diagrams in the trailing comments are relative to each
; load's own base address, not to r2. Bracketed numbers are the ang_table entry used.
%macro MODE_13_23 1
    ; m0 = pairs from [r2 + 1], m2 = pairs from [r2]
    movu m2, [r2] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
    palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
    punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
    punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
    palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
    pmaddubsw m4, m0, [r4 + 7 * 16] ; [23]
    pmulhrsw m4, m7
    pmaddubsw m3, m0, [r4 - 2 * 16] ; [14]
    pmulhrsw m3, m7
    packuswb m4, m3
    pmaddubsw m5, m0, [r4 - 11 * 16] ; [05]
    pmulhrsw m5, m7
    pmaddubsw m6, m2, [r4 + 12 * 16] ; [28]
    pmulhrsw m6, m7
    packuswb m5, m6
    pmaddubsw m6, m2, [r4 + 3 * 16] ; [19]
    pmulhrsw m6, m7
    pmaddubsw m3, m2, [r4 - 6 * 16] ; [10]
    pmulhrsw m3, m7
    packuswb m6, m3
    pmaddubsw m1, m2, [r4 - 15 * 16] ; [1]
    pmulhrsw m1, m7
    ; m0 = pairs from [r2 - 1], m2 = pairs from [r2 - 2]
    movu m2, [r2 - 2] ; [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1]
    palignr m3, m2, 1 ; [x, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
    punpckhbw m0, m2, m3
    punpcklbw m2, m3
    palignr m0, m2, 2
    pmaddubsw m3, m0, [r4 + 8 * 16] ; [24]
    pmulhrsw m3, m7
    packuswb m1, m3
    ; keep the shifted window in m3 for the lines after the store
    ; (presumably TRANSPOSE_STORE_8x8 does not preserve m0 - confirm against the macro)
    mova m3, m0
    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
    pmaddubsw m4, m3, [r4 - 16] ; [15]
    pmulhrsw m4, m7
    pmaddubsw m5, m3, [r4 - 10 * 16] ; [6]
    pmulhrsw m5, m7
    packuswb m4, m5
    pmaddubsw m5, m2, [r4 + 13 * 16] ; [29]
    pmulhrsw m5, m7
    pmaddubsw m6, m2, [r4 + 4 * 16] ; [20]
    pmulhrsw m6, m7
    packuswb m5, m6
    pmaddubsw m6, m2, [r4 - 5 * 16] ; [11]
    pmulhrsw m6, m7
    pmaddubsw m1, m2, [r4 - 14 * 16] ; [2]
    pmulhrsw m1, m7
    packuswb m6, m1
    ; m0 = pairs from [r2 - 3], m2 = pairs from [r2 - 4]
    movu m2, [r2 - 4] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
    palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
    punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
    punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
    palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
    pmaddubsw m1, m0, [r4 + 9 * 16] ; [25]
    pmulhrsw m1, m7
    pmaddubsw m3, m0, [r4] ; [16]
    pmulhrsw m3, m7
    packuswb m1, m3
    ; same as above: carry the shifted window across the store in m3
    mova m3, m0
    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1
    pmaddubsw m4, m3, [r4 - 9 * 16] ; [7]
    pmulhrsw m4, m7
    pmaddubsw m3, m2, [r4 + 14 * 16] ; [30]
    pmulhrsw m3, m7
    packuswb m4, m3
    pmaddubsw m5, m2, [r4 + 5 * 16] ; [21]
    pmulhrsw m5, m7
    pmaddubsw m6, m2, [r4 - 4 * 16] ; [12]
    pmulhrsw m6, m7
    packuswb m5, m6
    pmaddubsw m6, m2, [r4 - 13 * 16] ; [3]
    pmulhrsw m6, m7
    ; m0 = pairs from [r2 - 5], m2 = pairs from [r2 - 6]
    movu m2, [r2 - 6] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
    palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
    punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8]
    punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
    palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1]
    pmaddubsw m3, m0, [r4 + 10 * 16] ; [26]
    pmulhrsw m3, m7
    packuswb m6, m3
    pmaddubsw m1, m0, [r4 + 16] ; [17]
    pmulhrsw m1, m7
    pmaddubsw m3, m0, [r4 - 8 * 16] ; [8]
    pmulhrsw m3, m7
    packuswb m1, m3
    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1
    ; m2 (pairs from [r2 - 6]) is still live after the store
    pmaddubsw m4, m2, [r4 + 15 * 16] ; [31]
    pmulhrsw m4, m7
    pmaddubsw m5, m2, [r4 + 6 * 16] ; [22]
    pmulhrsw m5, m7
    packuswb m4, m5
    pmaddubsw m5, m2, [r4 - 3 * 16] ; [13]
    pmulhrsw m5, m7
    pmaddubsw m6, m2, [r4 - 12 * 16] ; [04]
    pmulhrsw m6, m7
    packuswb m5, m6
    ; m2 = pairs from [r2 - 7]; only the low interleave is needed here
    movu m2, [r2 - 7] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
    palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
    punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0]
    pmaddubsw m6, m2, [r4 + 11 * 16] ; [27]
    pmulhrsw m6, m7
    pmaddubsw m1, m2, [r4 + 2 * 16] ; [18]
    pmulhrsw m1, m7
    packuswb m6, m1
    pmaddubsw m1, m2, [r4 - 7 * 16] ; [09]
    pmulhrsw m1, m7
    ; final line needs no multiply: pb_fact0 keeps the even bytes of m2, i.e.
    ; the first pixel of every pair, which is then widened and packed like the others
    movu m0, [pb_fact0]
    pshufb m2, m0
    pmovzxbw m2, m2
    packuswb m1, m2
    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
;-----------------------------------------------------------------------------------------------------------------
; void intraPredAng32_13(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; 32x32 angular intra prediction, mode 13 (SSE4).
;   r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove on entry.
; One mmsize stack slot ('above') is reserved for MODE_13_23_ROW0.
; The first strip is produced by MODE_13_23_ROW0 (the only one reading r3);
; the remaining three strips go through MODE_13_23 in a counted loop, with r3
; recycled as the counter. Between strips r0/r6 move down 8 rows and r2
; advances by 7 bytes after the first strip, 8 bytes afterwards.
;-----------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_13, 4,7,8,0-(1*mmsize)
%define above [rsp + 0 * mmsize]
    mova        m7, [pw_1024]               ; multiplier used with pmulhrsw in the macros
    lea         r4, [ang_table + 16 * 16]   ; table base, biased by 16 entries
    lea         r5, [r1 * 3]                ; r5 = 3 * stride
    lea         r6, [r0 + r1 * 4]           ; r6 = dst + 4 * stride

    ; strip 0
    MODE_13_23_ROW0 1
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 7

    ; strips 1..3
    mov         r3d, 3
.loop:
    MODE_13_23  1
    lea         r0, [r6 + r1 * 4]           ; r0 <- r6 + 4 * stride
    lea         r6, [r6 + r1 * 8]           ; r6 <- r6 + 8 * stride
    add         r2, 8
    dec         r3d
    jnz        .loop
    RET
| 7764 | |
;-------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_14(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; 32x32 angular intra prediction, mode 14 (SSE4).
;   r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove on entry.
;
; A 64-byte aligned scratch area is carved out of the stack by hand:
;   [rsp +  0 .. 12] refAbove pixels selected through c_mode32_14_0
;   [rsp + 13 .. 44] copy of refLeft[1..32]
;   [rsp + 63]       loop counter (4 passes)
;   [rsp + 64]       caller's rsp, restored before RET
;
; Inside the loop m0-m7 are filled with the source vectors for eight rows
; ("palignr mX, m7, k" = the window loaded into m7, advanced by k pixels) and
; handed to PROC32_8x8 (defined elsewhere in this file; its arguments appear to
; be column-group index, transpose flag and eight ang_table indices).
;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_14, 4,7,8
    ; align the scratch area to 64 bytes so all local data shares one cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6                ; remember the original stack pointer

    ; collect reference pixels (the stores overlap, so their order matters)
    movu        m0, [r3]
    movu        m1, [r3 + 15]
    pshufb      m0, [c_mode32_14_0]         ; [x x x x x x x x x 0 2 5 7 10 12 15]
    pshufb      m1, [c_mode32_14_0]         ; [x x x x x x x x x 15 17 20 22 25 27 30]
    pslldq      m1, 10                      ; [17 20 22 25 27 30 x x x x x x x x x x x]
    palignr     m0, m1, 10                  ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30]
    mova        [rsp], m0
    movu        m0, [r2 + 1]
    movu        m1, [r2 + 1 + 16]
    movu        [rsp + 13], m0
    movu        [rsp + 13 + 16], m1
    mov         byte [rsp + 63], 4          ; pass counter lives in the scratch area

    ; pointers consumed by PROC32_8x8
    lea         r2, [rsp + 13]              ; r2 -> copied refLeft[1]
    lea         r3, [c_shuf8_0]             ; r3 -> shuffle8
    lea         r4, [ang_table]             ; r4 -> ang_table
    lea         r5, [r1 * 3]                ; r5 = 3 * stride
    lea         r6, [r0 + r1 * 4]           ; r6 = dst + 4 * stride
    ; m5/m6 are not pre-loaded: every group below rewrites them with mova
    ; before PROC32_8x8 is expanded.

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 4]
    palignr     m0, m7, 3
    mova        m1, m0
    palignr     m2, m7, 2
    mova        m3, m2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m4
    PROC32_8x8  0, 1, 19,6,25,12,31,18,5,24

    ; Row[8 - 15]
    movu        m7, [r2 - 7]
    palignr     m0, m7, 3
    palignr     m1, m7, 2
    mova        m2, m1
    mova        m3, m1
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    PROC32_8x8  1, 1, 11,30,17,4,23,10,29,16

    ; Row[16 - 23]
    movu        m7, [r2 - 10]
    palignr     m0, m7, 3
    palignr     m1, m7, 2
    mova        m2, m1
    palignr     m3, m7, 1
    mova        m4, m3
    mova        m5, m3
    mova        m6, m7
    PROC32_8x8  2, 1, 3,22,9,28,15,2,21,8

    ; Row[24 - 31]
    movu        m7, [r2 - 13]
    palignr     m0, m7, 2
    mova        m1, m0
    mova        m2, m0
    palignr     m3, m7, 1
    mova        m4, m3
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  3, 1, 27,14,1,20,7,26,13,0

    lea         r0, [r6 + r1 * 4]           ; r0 <- r6 + 4 * stride
    lea         r6, [r6 + r1 * 8]           ; r6 <- r6 + 8 * stride
    add         r2, 8                       ; next 8 reference pixels
    dec         byte [rsp + 63]
    jnz        .loop
    mov         rsp, [rsp+64]               ; undo the manual stack alignment
    RET
| 7851 | |
;-------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_15(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; 32x32 angular intra prediction, mode 15 (SSE4).
;   r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove on entry.
;
; Scratch layout (64-byte aligned, carved out of the stack by hand):
;   [rsp +  0 .. 16] refAbove pixels selected through c_mode32_15_0
;   [rsp + 17 .. 48] copy of refLeft[1..32]
;   [rsp + 63]       pass counter (4 passes)
;   [rsp + 64]       saved caller rsp
;
; Per pass, four groups of eight rows are produced. For each group m7 is loaded
; with a 16-pixel window and m0-m6 receive that window advanced by 0..4 pixels
; before PROC32_8x8 (defined elsewhere in this file) is expanded.
;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_15, 4,7,8
    ; 64-byte aligned scratch: everything local sits in a single cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6                ; original rsp, reloaded before RET

    ; gather the reference pixels; the three store regions overlap, so the
    ; store order below must be kept
    movu        m0, [r3]
    movu        m1, [r3 + 15]
    pshufb      m0, [c_mode32_15_0]         ; [x x x x x x x 0 2 4 6 8 9 11 13 15]
    pshufb      m1, [c_mode32_15_0]         ; [x x x x x x x 15 17 19 21 23 24 26 28 30]
    mova        [rsp], m1
    movu        [rsp + 8], m0
    movu        m0, [r2 + 1]
    movu        m1, [r2 + 1 + 16]
    movu        [rsp + 17], m0
    movu        [rsp + 17 + 16], m1
    mov         byte [rsp + 63], 4          ; pass counter

    ; NOTE(review): m6 is rewritten by mova in every group and m5 only
    ; contributes its low byte to byte 15 of the first "palignr m5, m7, 1";
    ; both loads look like leftovers - confirm against PROC32_8x8 before removing.
    mova        m5, [pw_1024]
    mova        m6, [c_deinterval8]

    ; addressing set-up for PROC32_8x8
    lea         r6, [r0 + r1 * 4]           ; dst + 4 * stride
    lea         r5, [r1 * 3]                ; 3 * stride
    lea         r4, [ang_table]
    lea         r3, [c_shuf8_0]
    lea         r2, [rsp + 17]              ; copied refLeft[1]

.loop:
    ; Row[0 - 7]: windows +4, +3, +3, +2, +2, +1, +1, +0
    movu        m7, [r2 - 5]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    palignr     m3, m7, 2
    palignr     m5, m7, 1
    mova        m2, m1
    mova        m4, m3
    mova        m6, m5
    PROC32_8x8  0, 1, 15,30,13,28,11,26,9,24

    ; Row[8 - 15]: same window pattern, four pixels further back
    movu        m7, [r2 - 9]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    palignr     m3, m7, 2
    palignr     m5, m7, 1
    mova        m2, m1
    mova        m4, m3
    mova        m6, m5
    PROC32_8x8  1, 1, 7,22,5,20,3,18,1,16

    ; Row[16 - 23]: windows +3, +3, +2, +2, +1, +1, +0, +0
    movu        m7, [r2 - 13]
    palignr     m0, m7, 3
    palignr     m2, m7, 2
    palignr     m4, m7, 1
    mova        m1, m0
    mova        m3, m2
    mova        m5, m4
    mova        m6, m7
    PROC32_8x8  2, 1, 31,14,29,12,27,10,25,8

    ; Row[24 - 31]
    movu        m7, [r2 - 17]
    palignr     m0, m7, 3
    palignr     m2, m7, 2
    palignr     m4, m7, 1
    mova        m1, m0
    mova        m3, m2
    mova        m5, m4
    mova        m6, m7
    PROC32_8x8  3, 1, 23,6,21,4,19,2,17,0

    ; advance to the next pass
    lea         r0, [r6 + r1 * 4]           ; r0 <- r6 + 4 * stride
    lea         r6, [r6 + r1 * 8]           ; r6 <- r6 + 8 * stride
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop

    mov         rsp, [rsp+64]               ; drop the scratch area
    RET
| 7937 | |
;-------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; 32x32 angular intra prediction, mode 16 (SSE4).
;   r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove on entry.
;
; Scratch layout (64-byte aligned, carved out of the stack by hand):
;   [rsp +  0 .. 20] refAbove pixels selected through c_mode32_16_0
;   [rsp + 21 .. 52] copy of refLeft[1..32]
;   [rsp + 63]       pass counter (4 passes)
;   [rsp + 64]       saved caller rsp
;
; Per pass, four groups of eight rows are produced. For each group m7 is loaded
; with a 16-pixel window and m0-m6 receive that window advanced by 0..5 pixels
; before PROC32_8x8 (defined elsewhere in this file) is expanded.
;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_16, 4,7,8
    ; 64-byte aligned scratch: everything local sits in a single cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6                ; original rsp, reloaded before RET

    ; gather the reference pixels; the three store regions overlap, so the
    ; store order below must be kept
    movu        m0, [r3]
    movu        m1, [r3 + 15]
    pshufb      m0, [c_mode32_16_0]         ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15]
    pshufb      m1, [c_mode32_16_0]         ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30]
    mova        [rsp], m1
    movu        [rsp + 10], m0
    movu        m0, [r2 + 1]
    movu        m1, [r2 + 1 + 16]
    movu        [rsp + 21], m0
    movu        [rsp + 21 + 16], m1
    mov         byte [rsp + 63], 4          ; pass counter

    ; NOTE(review): m5 is rewritten by mova in the first group and m6 only
    ; contributes its low byte to byte 15 of the first "palignr m6, m7, 1";
    ; both loads look like leftovers - confirm against PROC32_8x8 before removing.
    mova        m5, [pw_1024]
    mova        m6, [c_deinterval8]

    ; addressing set-up for PROC32_8x8
    lea         r6, [r0 + r1 * 4]           ; dst + 4 * stride
    lea         r5, [r1 * 3]                ; 3 * stride
    lea         r4, [ang_table]
    lea         r3, [c_shuf8_0]
    lea         r2, [rsp + 21]              ; copied refLeft[1]

.loop:
    ; Row[0 - 7]: windows +5, +4, +4, +3, +2, +2, +1, +0
    movu        m7, [r2 - 6]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    palignr     m6, m7, 1
    mova        m2, m1
    mova        m5, m4
    PROC32_8x8  0, 1, 11,22,1,12,23,2,13,24

    ; Row[8 - 15]: windows +5, +4, +3, +3, +2, +1, +1, +0
    movu        m7, [r2 - 11]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    palignr     m2, m7, 3
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m3, m2
    mova        m6, m5
    PROC32_8x8  1, 1, 3,14,25,4,15,26,5,16

    ; Row[16 - 23]: windows +4, +4, +3, +2, +2, +1, +0, +0
    movu        m7, [r2 - 16]
    palignr     m0, m7, 4
    palignr     m2, m7, 3
    palignr     m3, m7, 2
    palignr     m5, m7, 1
    mova        m1, m0
    mova        m4, m3
    mova        m6, m7
    PROC32_8x8  2, 1, 27,6,17,28,7,18,29,8

    ; Row[24 - 31]: windows +4, +3, +3, +2, +1, +1, +0, +0
    movu        m7, [r2 - 21]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    palignr     m3, m7, 2
    palignr     m4, m7, 1
    mova        m2, m1
    mova        m5, m4
    mova        m6, m7
    PROC32_8x8  3, 1, 19,30,9,20,31,10,21,0

    ; advance to the next pass
    lea         r0, [r6 + r1 * 4]           ; r0 <- r6 + 4 * stride
    lea         r6, [r6 + r1 * 8]           ; r6 <- r6 + 8 * stride
    add         r2, 8
    dec         byte [rsp + 63]
    jnz        .loop

    mov         rsp, [rsp+64]               ; drop the scratch area
    RET
| 8023 | |
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_17(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Register use on entry (x86inc numbering): r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
; Shuffled refAbove bytes and refLeft[1..32] are packed into a 64-byte aligned stack buffer.
; Four passes follow; each expands PROC32_8x8 (defined elsewhere in this file) four times and
; then advances dst by eight rows and r2 by eight bytes.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_17, 4,7,8
    ; NOTE: the scratch area is aligned to 64 bytes so that all local data shares one cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6                ; caller's rsp, reloaded just before RET

    ; collect reference pixel (the stores overlap each other, so their order matters)
    movu        m0, [r3]
    movu        m1, [r3 + 16]
    pshufb      m0, [c_mode32_17_0]
    pshufb      m1, [c_mode32_17_0]
    mova        [rsp], m1
    movu        [rsp + 13], m0
    movu        m0, [r2 + 1]
    movu        m1, [r2 + 17]
    movu        [rsp + 26], m0
    movu        [rsp + 42], m1
    mov         byte [rsp + 63], 4          ; pass counter kept in the last scratch byte

    ; constants and pointers used by the passes
    ; NOTE(review): m5 and m6 are rewritten inside the loop before the first PROC32_8x8,
    ; so these two preloads look vestigial - confirm against the macro before removing.
    mova        m5, [pw_1024]               ; m5 -> 1024
    mova        m6, [c_deinterval8]         ; m6 -> c_deinterval8
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r6, [r0 + r1 * 4]           ; r6 -> dst + 4 * stride
    lea         r4, [ang_table]             ; r4 -> ang_table
    lea         r3, [c_shuf8_0]             ; r3 -> shuffle8
    lea         r2, [rsp + 25]              ; r2 -> one byte below the refLeft copy

.next_pass:
    ; Row[0 - 7]
    movu        m7, [r2 - 6]
    palignr     m0, m7, 6
    palignr     m1, m7, 5
    palignr     m2, m7, 4
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    PROC32_8x8  0, 1, 6,12,18,24,30,4,10,16

    ; Row[8 - 15]
    movu        m7, [r2 - 12]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m7
    PROC32_8x8  1, 1, 22,28,2,8,14,20,26,0

    ; Row[16 - 23]
    movu        m7, [r2 - 19]
    palignr     m0, m7, 6
    palignr     m1, m7, 5
    palignr     m2, m7, 4
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    PROC32_8x8  2, 1, 6,12,18,24,30,4,10,16

    ; Row[24 - 31]
    movu        m7, [r2 - 25]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m7
    PROC32_8x8  3, 1, 22,28,2,8,14,20,26,0

    ; step the reference window and move both destination pointers down eight rows
    add         r2, 8
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    dec         byte [rsp + 63]
    jnz         .next_pass

    mov         rsp, [rsp+64]               ; drop the aligned scratch area
    RET
| 8110 | |
;-------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_18(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Register use on entry (x86inc numbering): r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
; No interpolation is performed here: row 0 is refAbove[0..31] and every following row is the
; previous one shifted right by one pixel, with bytes from the byte-reversed refLeft[1..32]
; entering on the left side.
;-------------------------------------------------------------------------------------------------------------------

; Emit one 32-pixel output row.
;   %1      row address expression (written without brackets)
;   %2:%3   high:low register pair feeding pixels 0-15
;   %4:%5   high:low register pair feeding pixels 16-31
;   %6      palignr byte shift applied to both pairs
; Uses m4 as scratch.
%macro ANG32_18_ROW 6
    palignr     m4, %2, %3, %6
    movu        [%1], m4
    palignr     m4, %4, %5, %6
    movu        [%1 + 16], m4
%endmacro

INIT_XMM sse4
cglobal intra_pred_ang32_18, 4,5,5
    movu        m0, [r3]                    ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    movu        m1, [r3 + 16]               ; [31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16]
    movu        m2, [r2 + 1]                ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m3, [r2 + 17]               ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]

    pshufb      m2, [c_mode32_18_0]         ; [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    pshufb      m3, [c_mode32_18_0]         ; [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]

    ; both reference pointers are consumed, so r2/r3 are recycled as stride multiples
    lea         r2, [r1 * 2]                ; r2 -> 2 * stride
    lea         r3, [r1 * 3]                ; r3 -> 3 * stride
    lea         r4, [r1 * 4]                ; r4 -> 4 * stride

    ; row 0: refAbove unchanged
    movu        [r0], m0
    movu        [r0 + 16], m1

    ; rows 1 - 3
    ANG32_18_ROW r0 + r1, m0, m2, m1, m0, 15
    ANG32_18_ROW r0 + r2, m0, m2, m1, m0, 14
    ANG32_18_ROW r0 + r3, m0, m2, m1, m0, 13

    lea         r0, [r0 + r4]

    ; rows 4 - 7
    ANG32_18_ROW r0,      m0, m2, m1, m0, 12
    ANG32_18_ROW r0 + r1, m0, m2, m1, m0, 11
    ANG32_18_ROW r0 + r2, m0, m2, m1, m0, 10
    ANG32_18_ROW r0 + r3, m0, m2, m1, m0, 9

    lea         r0, [r0 + r4]

    ; rows 8 - 11
    ANG32_18_ROW r0,      m0, m2, m1, m0, 8
    ANG32_18_ROW r0 + r1, m0, m2, m1, m0, 7
    ANG32_18_ROW r0 + r2, m0, m2, m1, m0, 6
    ANG32_18_ROW r0 + r3, m0, m2, m1, m0, 5

    lea         r0, [r0 + r4]

    ; rows 12 - 15
    ANG32_18_ROW r0,      m0, m2, m1, m0, 4
    ANG32_18_ROW r0 + r1, m0, m2, m1, m0, 3
    ANG32_18_ROW r0 + r2, m0, m2, m1, m0, 2
    ANG32_18_ROW r0 + r3, m0, m2, m1, m0, 1

    lea         r0, [r0 + r4]

    ; row 16: a whole register has been shifted in, so no palignr is needed
    movu        [r0], m2
    movu        [r0 + 16], m0

    ; rows 17 - 19
    ANG32_18_ROW r0 + r1, m2, m3, m0, m2, 15
    ANG32_18_ROW r0 + r2, m2, m3, m0, m2, 14
    ANG32_18_ROW r0 + r3, m2, m3, m0, m2, 13

    lea         r0, [r0 + r4]

    ; rows 20 - 23
    ANG32_18_ROW r0,      m2, m3, m0, m2, 12
    ANG32_18_ROW r0 + r1, m2, m3, m0, m2, 11
    ANG32_18_ROW r0 + r2, m2, m3, m0, m2, 10
    ANG32_18_ROW r0 + r3, m2, m3, m0, m2, 9

    lea         r0, [r0 + r4]

    ; rows 24 - 27
    ANG32_18_ROW r0,      m2, m3, m0, m2, 8
    ANG32_18_ROW r0 + r1, m2, m3, m0, m2, 7
    ANG32_18_ROW r0 + r2, m2, m3, m0, m2, 6
    ANG32_18_ROW r0 + r3, m2, m3, m0, m2, 5

    lea         r0, [r0 + r4]

    ; rows 28 - 31
    ANG32_18_ROW r0,      m2, m3, m0, m2, 4
    ANG32_18_ROW r0 + r1, m2, m3, m0, m2, 3
    ANG32_18_ROW r0 + r2, m2, m3, m0, m2, 2
    ANG32_18_ROW r0 + r3, m2, m3, m0, m2, 1
    RET
| 8275 | |
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_19(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Register use on entry (x86inc numbering): r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
; r2 and r3 are exchanged first; from then on the code has the same shape as intra_pred_ang32_17,
; except that PROC32_8x8 (defined elsewhere in this file) is expanded with its second argument
; set to 0 and dst advances by eight columns per pass.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_19, 4,7,8
    xchg        r2, r3                      ; r2 = refAbove, r3 = refLeft from here on

    ; NOTE: the scratch area is aligned to 64 bytes so that all local data shares one cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6                ; caller's rsp, reloaded just before RET

    ; collect reference pixel (the stores overlap each other, so their order matters)
    movu        m0, [r3]
    movu        m1, [r3 + 16]
    pshufb      m0, [c_mode32_17_0]
    pshufb      m1, [c_mode32_17_0]
    mova        [rsp], m1
    movu        [rsp + 13], m0
    movu        m0, [r2 + 1]
    movu        m1, [r2 + 17]
    movu        [rsp + 26], m0
    movu        [rsp + 42], m1
    mov         byte [rsp + 63], 4          ; pass counter kept in the last scratch byte

    ; constants and pointers used by the passes
    ; NOTE(review): m5 and m6 are rewritten inside the loop before the first PROC32_8x8,
    ; so these two preloads look vestigial - confirm against the macro before removing.
    mova        m5, [pw_1024]               ; m5 -> 1024
    mova        m6, [c_deinterval8]         ; m6 -> c_deinterval8
    mov         r6, r0                      ; r6 -> top of the current 8-column strip
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r4, [ang_table]             ; r4 -> ang_table
    lea         r3, [c_shuf8_0]             ; r3 -> shuffle8
    lea         r2, [rsp + 25]              ; r2 -> one byte below the copy of pixels 1..32

.next_strip:
    ; Row[0 - 7]
    movu        m7, [r2 - 6]
    palignr     m0, m7, 6
    palignr     m1, m7, 5
    palignr     m2, m7, 4
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    PROC32_8x8  0, 0, 6,12,18,24,30,4,10,16

    ; Row[8 - 15]
    lea         r0, [r0 + r1 * 4]
    movu        m7, [r2 - 12]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m7
    PROC32_8x8  1, 0, 22,28,2,8,14,20,26,0

    ; Row[16 - 23]
    lea         r0, [r0 + r1 * 4]
    movu        m7, [r2 - 19]
    palignr     m0, m7, 6
    palignr     m1, m7, 5
    palignr     m2, m7, 4
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    PROC32_8x8  2, 0, 6,12,18,24,30,4,10,16

    ; Row[24 - 31]
    lea         r0, [r0 + r1 * 4]
    movu        m7, [r2 - 25]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m7
    PROC32_8x8  3, 0, 22,28,2,8,14,20,26,0

    ; step the reference window and restart at the top of the next 8-column strip
    add         r2, 8
    add         r6, 8
    mov         r0, r6
    dec         byte [rsp + 63]
    jnz         .next_strip

    mov         rsp, [rsp+64]               ; drop the aligned scratch area
    RET
| 8365 | |
;-------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_20(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Register use on entry (x86inc numbering): r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
; r2 and r3 are exchanged first; from then on the code has the same shape as intra_pred_ang32_16,
; except that PROC32_8x8 (defined elsewhere in this file) is expanded with its second argument
; set to 0 and dst advances by eight columns per pass.
;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_20, 4,7,8
    xchg        r2, r3                      ; r2 = refAbove, r3 = refLeft from here on

    ; NOTE: the scratch area is aligned to 64 bytes so that all local data shares one cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6                ; caller's rsp, reloaded just before RET

    ; collect reference pixel (the stores overlap each other, so their order matters)
    movu        m0, [r3]
    movu        m1, [r3 + 15]
    pshufb      m0, [c_mode32_16_0]         ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15]
    pshufb      m1, [c_mode32_16_0]         ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30]
    mova        [rsp], m1
    movu        [rsp + 10], m0
    movu        m0, [r2 + 1]
    movu        m1, [r2 + 17]
    movu        [rsp + 21], m0
    movu        [rsp + 37], m1
    mov         byte [rsp + 63], 4          ; pass counter kept in the last scratch byte

    ; constants and pointers used by the passes
    ; NOTE(review): m5 and m6 are rewritten inside the loop before the first PROC32_8x8,
    ; so these two preloads look vestigial - confirm against the macro before removing.
    mova        m5, [pw_1024]               ; m5 -> 1024
    mova        m6, [c_deinterval8]         ; m6 -> c_deinterval8
    mov         r6, r0                      ; r6 -> top of the current 8-column strip
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r4, [ang_table]             ; r4 -> ang_table
    lea         r3, [c_shuf8_0]             ; r3 -> shuffle8
    lea         r2, [rsp + 21]              ; r2 -> start of the copy of pixels 1..32

.next_strip:
    ; Row[0 - 7]
    movu        m7, [r2 - 6]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    PROC32_8x8  0, 0, 11,22,1,12,23,2,13,24

    ; Row[8 - 15]
    lea         r0, [r0 + r1 * 4]
    movu        m7, [r2 - 11]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    palignr     m2, m7, 3
    mova        m3, m2
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m5
    PROC32_8x8  1, 0, 3,14,25,4,15,26,5,16

    ; Row[16 - 23]
    lea         r0, [r0 + r1 * 4]
    movu        m7, [r2 - 16]
    palignr     m0, m7, 4
    mova        m1, m0
    palignr     m2, m7, 3
    palignr     m3, m7, 2
    mova        m4, m3
    palignr     m5, m7, 1
    mova        m6, m7
    PROC32_8x8  2, 0, 27,6,17,28,7,18,29,8

    ; Row[24 - 31]
    lea         r0, [r0 + r1 * 4]
    movu        m7, [r2 - 21]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    mova        m2, m1
    palignr     m3, m7, 2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    PROC32_8x8  3, 0, 19,30,9,20,31,10,21,0

    ; step the reference window and restart at the top of the next 8-column strip
    add         r2, 8
    add         r6, 8
    mov         r0, r6
    dec         byte [rsp + 63]
    jnz         .next_strip

    mov         rsp, [rsp+64]               ; drop the aligned scratch area
    RET
| 8455 | |
;-------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_21(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Register use on entry (x86inc numbering): r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
; r2 and r3 are exchanged first. Bytes picked from one reference by c_mode32_15_0 are placed in
; front of pixels 1..32 of the other inside a 64-byte aligned stack buffer. Four passes follow;
; each expands PROC32_8x8 (defined elsewhere in this file) four times and then advances dst by
; eight columns and r2 by eight bytes.
;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_21, 4,7,8
    xchg        r2, r3                      ; r2 = refAbove, r3 = refLeft from here on

    ; NOTE: the scratch area is aligned to 64 bytes so that all local data shares one cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6                ; caller's rsp, reloaded just before RET

    ; collect reference pixel (the stores overlap each other, so their order matters)
    movu        m0, [r3]
    movu        m1, [r3 + 15]
    pshufb      m0, [c_mode32_15_0]         ; [x x x x x x x 0 2 4 6 8 9 11 13 15]
    pshufb      m1, [c_mode32_15_0]         ; [x x x x x x x 15 17 19 21 23 24 26 28 30]
    mova        [rsp], m1
    movu        [rsp + 8], m0
    movu        m0, [r2 + 1]
    movu        m1, [r2 + 17]
    movu        [rsp + 17], m0
    movu        [rsp + 33], m1
    mov         byte [rsp + 63], 4          ; pass counter kept in the last scratch byte

    ; constants and pointers used by the passes
    ; NOTE(review): m5 and m6 are rewritten inside the loop before the first PROC32_8x8,
    ; so these two preloads look vestigial - confirm against the macro before removing.
    mova        m5, [pw_1024]               ; m5 -> 1024
    mova        m6, [c_deinterval8]         ; m6 -> c_deinterval8
    mov         r6, r0                      ; r6 -> top of the current 8-column strip
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r4, [ang_table]             ; r4 -> ang_table
    lea         r3, [c_shuf8_0]             ; r3 -> shuffle8
    lea         r2, [rsp + 17]              ; r2 -> start of the copy of pixels 1..32

.next_strip:
    ; Row[0 - 7]
    movu        m7, [r2 - 5]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    mova        m2, m1
    palignr     m3, m7, 2
    mova        m4, m3
    palignr     m5, m7, 1
    mova        m6, m5
    PROC32_8x8  0, 0, 15,30,13,28,11,26,9,24

    ; Row[8 - 15]
    lea         r0, [r0 + r1 * 4]
    movu        m7, [r2 - 9]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    mova        m2, m1
    palignr     m3, m7, 2
    mova        m4, m3
    palignr     m5, m7, 1
    mova        m6, m5
    PROC32_8x8  1, 0, 7,22,5,20,3,18,1,16

    ; Row[16 - 23]
    lea         r0, [r0 + r1 * 4]
    movu        m7, [r2 - 13]
    palignr     m0, m7, 3
    mova        m1, m0
    palignr     m2, m7, 2
    mova        m3, m2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    PROC32_8x8  2, 0, 31,14,29,12,27,10,25,8

    ; Row[24 - 31]
    lea         r0, [r0 + r1 * 4]
    movu        m7, [r2 - 17]
    palignr     m0, m7, 3
    mova        m1, m0
    palignr     m2, m7, 2
    mova        m3, m2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    PROC32_8x8  3, 0, 23,6,21,4,19,2,17,0

    ; step the reference window and restart at the top of the next 8-column strip
    add         r2, 8
    add         r6, 8
    mov         r0, r6
    dec         byte [rsp + 63]
    jnz         .next_strip

    mov         rsp, [rsp+64]               ; drop the aligned scratch area
    RET
| 8545 | |
;-------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_22(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Register use on entry (x86inc numbering): r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
; r2 and r3 are exchanged first. Bytes picked from one reference by c_mode32_14_0 are merged
; into a single register and placed in front of pixels 1..32 of the other inside a 64-byte
; aligned stack buffer. Four passes follow; each expands PROC32_8x8 (defined elsewhere in this
; file) four times and then advances dst by eight columns and r2 by eight bytes.
;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_22, 4,7,8
    xchg        r2, r3                      ; r2 = refAbove, r3 = refLeft from here on

    ; NOTE: the scratch area is aligned to 64 bytes so that all local data shares one cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6                ; caller's rsp, reloaded just before RET

    ; collect reference pixel (the stores overlap each other, so their order matters)
    movu        m0, [r3]
    movu        m1, [r3 + 15]
    pshufb      m0, [c_mode32_14_0]         ; [x x x x x x x x x 0 2 5 7 10 12 15]
    pshufb      m1, [c_mode32_14_0]         ; [x x x x x x x x x 15 17 20 22 25 27 30]
    pslldq      m1, 10                      ; [17 20 22 25 27 30 x x x x x x x x x x x]
    palignr     m0, m1, 10                  ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30]
    mova        [rsp], m0
    movu        m0, [r2 + 1]
    movu        m1, [r2 + 17]
    movu        [rsp + 13], m0
    movu        [rsp + 29], m1
    mov         byte [rsp + 63], 4          ; pass counter kept in the last scratch byte

    ; constants and pointers used by the passes
    ; NOTE(review): m5 and m6 are rewritten inside the loop before the first PROC32_8x8,
    ; so these two preloads look vestigial - confirm against the macro before removing.
    mova        m5, [pw_1024]               ; m5 -> 1024
    mova        m6, [c_deinterval8]         ; m6 -> c_deinterval8
    mov         r6, r0                      ; r6 -> top of the current 8-column strip
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r4, [ang_table]             ; r4 -> ang_table
    lea         r3, [c_shuf8_0]             ; r3 -> shuffle8
    lea         r2, [rsp + 13]              ; r2 -> start of the copy of pixels 1..32

.next_strip:
    ; Row[0 - 7]
    movu        m7, [r2 - 4]
    palignr     m0, m7, 3
    mova        m1, m0
    palignr     m2, m7, 2
    mova        m3, m2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m4
    PROC32_8x8  0, 0, 19,6,25,12,31,18,5,24

    ; Row[8 - 15]
    lea         r0, [r0 + r1 * 4]
    movu        m7, [r2 - 7]
    palignr     m0, m7, 3
    palignr     m1, m7, 2
    mova        m2, m1
    mova        m3, m1
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    PROC32_8x8  1, 0, 11,30,17,4,23,10,29,16

    ; Row[16 - 23]
    lea         r0, [r0 + r1 * 4]
    movu        m7, [r2 - 10]
    palignr     m0, m7, 3
    palignr     m1, m7, 2
    mova        m2, m1
    palignr     m3, m7, 1
    mova        m4, m3
    mova        m5, m3
    mova        m6, m7
    PROC32_8x8  2, 0, 3,22,9,28,15,2,21,8

    ; Row[24 - 31]
    lea         r0, [r0 + r1 * 4]
    movu        m7, [r2 - 13]
    palignr     m0, m7, 2
    mova        m1, m0
    mova        m2, m0
    palignr     m3, m7, 1
    mova        m4, m3
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  3, 0, 27,14,1,20,7,26,13,0

    ; step the reference window and restart at the top of the next 8-column strip
    add         r2, 8
    add         r6, 8
    mov         r0, r6
    dec         byte [rsp + 63]
    jnz         .next_strip

    mov         rsp, [rsp+64]               ; drop the aligned scratch area
    RET
| 8637 | |
;-----------------------------------------------------------------------------------------------------------------
; void intraPredAng32_23(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Register use on entry (x86inc numbering): r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
; r2 and r3 are exchanged first. The block is written as four 8-column strips: the first one by
; MODE_13_23_ROW0 and the remaining three by MODE_13_23 (both macros are defined elsewhere in
; this file). One mmsize stack slot is reserved and exposed to the macros as `above`.
;-----------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_23, 4,7,8,0-(1*mmsize)
%define above [rsp + 0 * mmsize]
    xchg        r2, r3                      ; r2 = refAbove, r3 = refLeft from here on
    mova        m7, [pw_1024]
    mov         r6, r0                      ; r6 -> top of the current 8-column strip
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r4, [ang_table + 16 * 16]

    ; first strip
    MODE_13_23_ROW0 0
    add         r2, 7                       ; the first step is 7 bytes, later steps are 8
    add         r6, 8
    mov         r0, r6
    mov         r3d, 3                      ; r3 is free now: count the three remaining strips

.next_strip:
    MODE_13_23  0
    add         r2, 8
    add         r6, 8
    mov         r0, r6
    dec         r3
    jnz         .next_strip
    RET
| 8663 | |
;-----------------------------------------------------------------------------------------------------------------
; void intraPredAng32_24(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Register use on entry (x86inc numbering): r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
; r2 and r3 are exchanged first. The block is written as four 8-column strips: the first one by
; MODE_12_24_ROW0 and the remaining three by MODE_12_24 (both macros are defined elsewhere in
; this file). One mmsize stack slot is reserved and exposed to the macros as `above`.
;-----------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_24, 4,7,8,0-(1*mmsize)
%define above [rsp + 0 * mmsize]
    xchg        r2, r3                      ; r2 = refAbove, r3 = refLeft from here on
    mova        m7, [pw_1024]
    mov         r6, r0                      ; r6 -> top of the current 8-column strip
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r4, [ang_table + 16 * 16]

    ; first strip
    MODE_12_24_ROW0 0
    add         r2, 7                       ; the first step is 7 bytes, later steps are 8
    add         r6, 8
    mov         r0, r6
    mov         r3d, 3                      ; r3 is free now: count the three remaining strips

.next_strip:
    MODE_12_24  0
    add         r2, 8
    add         r6, 8
    mov         r0, r6
    dec         r3
    jnz         .next_strip
    RET
| 8689 | |
;-------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_25(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Register use on entry (x86inc numbering): r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove.
; r2 and r3 are exchanged first. The stack buffer holds one replicated pixel (byte 16 of the
; pointer now in r3) followed by 48 bytes copied from the pointer now in r2. Rows 0-15 read the
; window at r2 and rows 16-31 read it one byte lower. Four passes follow; each expands
; PROC32_8x8 (defined elsewhere in this file) four times and then advances dst by eight columns
; and r2 by eight bytes.
;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_25, 4,7,8
    xchg        r2, r3                      ; r2 = refAbove, r3 = refLeft from here on

    ; NOTE: the scratch area is aligned to 64 bytes so that all local data shares one cache line
    mov         r6, rsp
    sub         rsp, 64+gprsize
    and         rsp, ~63
    mov         [rsp+64], r6                ; caller's rsp, reloaded just before RET

    ; collect reference pixel (the stores overlap each other, so their order matters)
    movu        m0, [r3 + 16]
    pxor        m1, m1
    pshufb      m0, m1                      ; all-zero mask: replicate byte 0 into every lane
    mova        [rsp], m0
    movu        m0, [r2]
    movu        m1, [r2 + 16]
    movu        m2, [r2 + 32]
    movu        [rsp + 1], m0
    movu        [rsp + 17], m1
    movu        [rsp + 33], m2
    mov         byte [rsp + 63], 4          ; pass counter kept in the last scratch byte

    ; constants and pointers used by the passes
    ; NOTE(review): m5 and m6 are rewritten inside the loop before the first PROC32_8x8,
    ; so these two preloads look vestigial - confirm against the macro before removing.
    mova        m5, [pw_1024]               ; m5 -> 1024
    mova        m6, [c_deinterval8]         ; m6 -> c_deinterval8
    mov         r6, r0                      ; r6 -> top of the current 8-column strip
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r4, [ang_table]             ; r4 -> ang_table
    lea         r3, [c_shuf8_0]             ; r3 -> shuffle8
    lea         r2, [rsp + 1]               ; r2 -> start of the 48-byte copy

.next_strip:
    ; Row[0 - 7]: every row uses the same window
    movu        m7, [r2]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  0, 0, 30,28,26,24,22,20,18,16

    ; Row[8 - 15]
    lea         r0, [r0 + r1 * 4]
    movu        m7, [r2]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  1, 0, 14,12,10,8,6,4,2,0

    ; Row[16 - 23]: window moves one byte lower
    lea         r0, [r0 + r1 * 4]
    movu        m7, [r2 - 1]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  2, 0, 30,28,26,24,22,20,18,16

    ; Row[24 - 31]
    lea         r0, [r0 + r1 * 4]
    movu        m7, [r2 - 1]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  3, 0, 14,12,10,8,6,4,2,0

    ; step the reference window and restart at the top of the next 8-column strip
    add         r2, 8
    add         r6, 8
    mov         r0, r6
    dec         byte [rsp + 63]
    jnz         .next_strip

    mov         rsp, [rsp+64]               ; drop the aligned scratch area
    RET
| 8779 | |
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_26(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Register use on entry (x86inc numbering): r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove,
; r4 = dirMode (immediately overwritten), r5 = bFilter.
; The block is written as two 16-column halves. In each half the 16 bytes at refAbove + 1 are
; copied into all 32 rows. When bFilter is non-zero, 16 filtered bytes are additionally written
; into the first column of the half (rows 0-15).
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_26, 6,7,7,0-(2*mmsize)
%define m8 [rsp + 0 * mmsize]
%define m9 [rsp + 1 * mmsize]
    lea         r4, [r1 * 3]                ; r4 -> 3 * stride
    mov         r6, 2                       ; two 16-column halves
    movu        m0, [r2]
    movu        m1, [r2 + 1]
    mova        m8, m0                      ; m8 = refLeft[0..15]  (stack slot)
    mova        m9, m1                      ; m9 = refLeft[1..16]  (stack slot)
    mov         r2d, r5d                    ; keep bFilter in r2d, r5 becomes the row pointer

.loop:
    movu        m0, [r3 + 1]

    ; rows 0 - 3
    movu        [r0], m0
    movu        [r0 + r1], m0
    movu        [r0 + r1 * 2], m0
    movu        [r0 + r4], m0
    lea         r5, [r0 + r1 * 4]

    ; rows 4 - 27, four rows per repetition
%rep 6
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
    lea         r5, [r5 + r1 * 4]
%endrep

    ; rows 28 - 31
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0

    ; filter
    ; NOTE(review): on the second pass r0 has moved 16 columns to the right while m8/m9 still
    ; hold refLeft[0..16], so the filtered bytes would land in column 16 of rows 0-15 and rows
    ; 16-31 of column 0 are never filtered. Presumably callers always pass bFilter = 0 for
    ; 32x32 blocks - confirm before relying on this path.
    test        r2d, r2d
    jz         .quit

    pxor        m4, m4
    pshufb      m0, m4                      ; replicate the first pixel of the copied row
    pmovzxbw    m0, m0                      ; ... widened to words
    mova        m1, m0
    movu        m2, m8
    movu        m3, m9

    pshufb      m2, m4                      ; replicate refLeft[0]
    pmovzxbw    m2, m2
    movhlps     m4, m3                      ; m4 = refLeft[9..16]
    pmovzxbw    m3, m3                      ; m3 = refLeft[1..8] as words
    pmovzxbw    m4, m4
    psubw       m3, m2
    psubw       m4, m2
    psraw       m3, 1                       ; (refLeft[1 + y] - refLeft[0]) >> 1
    psraw       m4, 1
    paddw       m0, m3
    paddw       m1, m4
    packuswb    m0, m1                      ; back to bytes with unsigned saturation

    ; scatter byte y into the first column of row y, rows 0 - 15
    pextrb      [r0], m0, 0
    pextrb      [r0 + r1], m0, 1
    pextrb      [r0 + r1 * 2], m0, 2
    pextrb      [r0 + r4], m0, 3
    lea         r5, [r0 + r1 * 4]
    pextrb      [r5], m0, 4
    pextrb      [r5 + r1], m0, 5
    pextrb      [r5 + r1 * 2], m0, 6
    pextrb      [r5 + r4], m0, 7
    lea         r5, [r5 + r1 * 4]
    pextrb      [r5], m0, 8
    pextrb      [r5 + r1], m0, 9
    pextrb      [r5 + r1 * 2], m0, 10
    pextrb      [r5 + r4], m0, 11
    lea         r5, [r5 + r1 * 4]
    pextrb      [r5], m0, 12
    pextrb      [r5 + r1], m0, 13
    pextrb      [r5 + r1 * 2], m0, 14
    pextrb      [r5 + r4], m0, 15

.quit:
    lea         r3, [r3 + 16]               ; next 16 reference pixels
    add         r0, 16                      ; next 16 destination columns
    dec         r6d
    jnz        .loop
    RET
| 8903 | |
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_27(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Register use (x86inc numbering): r0 = dst, r1 = dstStride; r2 is reloaded with the fourth
; argument (refAbove). The block is written as four 8-column strips, each produced by one
; expansion of MODE_9_27 (macro defined elsewhere in this file).
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_27, 3,7,8
    mov         r2, r3mp                    ; r2 = refAbove
    mova        m7, [pw_1024]
    mov         r6, r0                      ; r6 -> top of the current 8-column strip
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4                      ; strips left to produce

.next_strip:
    MODE_9_27   0
    add         r2, 8
    add         r6, 8
    mov         r0, r6
    dec         r4d
    jnz         .next_strip
    RET
| 8923 | |
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_28(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Register use (x86inc numbering): r0 = dst, r1 = dstStride; r2 is reloaded with the fourth
; argument (refAbove). The block is written as four 8-column strips, each produced by one
; expansion of MODE_8_28 (macro defined elsewhere in this file).
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_28, 3,7,8
    mov         r2, r3mp                    ; r2 = refAbove
    mova        m7, [pw_1024]
    mov         r6, r0                      ; r6 -> top of the current 8-column strip
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4                      ; strips left to produce

.next_strip:
    MODE_8_28   0
    add         r2, 8
    add         r6, 8
    mov         r0, r6
    dec         r4d
    jnz         .next_strip
    RET
| 8943 | |
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_29(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Register use (x86inc numbering): r0 = dst, r1 = dstStride; r2 is reloaded with the fourth
; argument (refAbove). The block is written as four 8-column strips, each produced by one
; expansion of MODE_7_29 (macro defined elsewhere in this file).
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_29, 3,7,8
    mov         r2, r3mp                    ; r2 = refAbove
    mova        m7, [pw_1024]
    mov         r6, r0                      ; r6 -> top of the current 8-column strip
    lea         r5, [r1 * 3]                ; r5 -> 3 * stride
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4                      ; strips left to produce

.next_strip:
    MODE_7_29   0
    add         r2, 8
    add         r6, 8
    mov         r0, r6
    dec         r4d
    jnz         .next_strip
    RET
| 8963 | |
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_30(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Runs MODE_6_30 four times; after each pass the dst cursor and the refAbove
; cursor both move forward by 8 bytes.
;   r0 = dst cursor handed to the macro      r1 = dstStride
;   r2 = refAbove cursor (4th argument)      r3 = ang_table + 16 * 16
;   r4 = passes remaining                    r5 = 3 * dstStride
;   r6 = saved dst cursor (r0 is reloaded from it after every pass)
;   m7 = pw_1024
; NOTE(review): what MODE_6_30 itself reads or clobbers is not visible in this
; section; the roles above are only what this wrapper sets up.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_30, 3,7,8
    mov         r2, r3mp                    ; must precede the write to r3: r3mp can be r3 itself
    mov         r6, r0
    mov         r4d, 4
    lea         r3, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]
    mova        m7, [pw_1024]
.loop:
    MODE_6_30   0
    add         r2, 8                       ; next 8 bytes of the reference row
    lea         r0, [r6 + 8]                ; next 8 bytes of dst ...
    mov         r6, r0                      ; ... remembered for the following pass
    dec         r4
    jnz         .loop
    RET
| 8983 | |
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_31(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Runs MODE_5_31 four times; after each pass the dst cursor and the refAbove
; cursor both move forward by 8 bytes.
;   r0 = dst cursor handed to the macro      r1 = dstStride
;   r2 = refAbove cursor (4th argument)      r3 = ang_table + 16 * 16
;   r4 = passes remaining                    r5 = 3 * dstStride
;   r6 = saved dst cursor (r0 is reloaded from it after every pass)
;   m7 = pw_1024
; NOTE(review): what MODE_5_31 itself reads or clobbers is not visible in this
; section; the roles above are only what this wrapper sets up.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_31, 3,7,8
    mov         r2, r3mp                    ; must precede the write to r3: r3mp can be r3 itself
    mov         r6, r0
    mov         r4d, 4
    lea         r3, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]
    mova        m7, [pw_1024]
.loop:
    MODE_5_31   0
    add         r2, 8                       ; next 8 bytes of the reference row
    lea         r0, [r6 + 8]                ; next 8 bytes of dst ...
    mov         r6, r0                      ; ... remembered for the following pass
    dec         r4
    jnz         .loop
    RET
| 9003 | |
;-----------------------------------------------------------------------------------------------------------------
; void intraPredAng32_32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Runs MODE_4_32 four times; after each pass the dst cursor and the refAbove
; cursor both move forward by 8 bytes.
;   r0 = dst cursor handed to the macro      r1 = dstStride
;   r2 = refAbove cursor (4th argument)      r3 = ang_table + 16 * 16
;   r4 = passes remaining                    r5 = 3 * dstStride
;   r6 = saved dst cursor (r0 is reloaded from it after every pass)
;   m7 = pw_1024
; NOTE(review): what MODE_4_32 itself reads or clobbers is not visible in this
; section; the roles above are only what this wrapper sets up.
;-----------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_32, 3,7,8
    mov         r2, r3mp                    ; must precede the write to r3: r3mp can be r3 itself
    mov         r6, r0
    mov         r4d, 4
    lea         r3, [ang_table + 16 * 16]
    lea         r5, [r1 * 3]
    mova        m7, [pw_1024]
.loop:
    MODE_4_32   0
    add         r2, 8                       ; next 8 bytes of the reference row
    lea         r0, [r6 + 8]                ; next 8 bytes of dst ...
    mov         r6, r0                      ; ... remembered for the following pass
    dec         r4
    jnz         .loop
    RET
| 9023 | |
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_33(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Runs MODE_3_33 four times; after each pass the dst cursor and the refAbove
; cursor both move forward by 8 bytes.
;   r0 = dst cursor handed to the macro      r1 = dstStride
;   r2 = refAbove cursor (4th argument)      r3 = ang_table + 16 * 16
;   r4 = passes remaining                    r5 = 3 * dstStride
;   r6 = saved dst cursor (r0 is reloaded from it after every pass)
;   m7 = pw_1024
; NOTE(review): what MODE_3_33 itself reads or clobbers is not visible in this
; section; the roles above are only what this wrapper sets up.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_33, 3,7,8
    ; Plain load, as in the other ang32 wrappers in this file.  The previous
    ; `xchg r2, r3mp` also stored the old r2 back into the argument slot; when
    ; r3mp is a stack location that is an implicitly locked read-modify-write
    ; whose stored value is never read here.
    mov         r2, r3mp                    ; must precede the write to r3: r3mp can be r3 itself
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4
    lea         r5, [r1 * 3]
    mov         r6, r0
    mova        m7, [pw_1024]
.loop:
    MODE_3_33   0
    add         r6, 8                       ; next 8 bytes of dst
    mov         r0, r6
    add         r2, 8                       ; next 8 bytes of the reference row
    dec         r4
    jnz         .loop
    RET
| 9043 | |
;-----------------------------------------------------------------------------
; ALL_ANGS4_ROW dstOffset, dstReg, srcReg, coeff
;
; Emits one 4-byte output row for all_angs_pred_4x4:
;   dstReg           = pmaddubsw(srcReg, coeff)     (coeff: register or [mem])
;   dstReg           = pmulhrsw(dstReg, m0)         (m0 must hold pw_1024)
;   [r0 + dstOffset] = low dword of packuswb(dstReg, dstReg)
; When dstReg and srcReg name the same register the two-operand pmaddubsw form
; is emitted, i.e. the source is consumed in place.
;-----------------------------------------------------------------------------
%macro ALL_ANGS4_ROW 4
%ifidn %2, %3
    pmaddubsw   %2, %4
%else
    pmaddubsw   %2, %3, %4
%endif
    pmulhrsw    %2, m0
    packuswb    %2, %2
    movd        [r0 + %1], %2
%endmacro

;-----------------------------------------------------------------------------
; void all_angs_pred_4x4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
;
; Straight-line code that writes one 16-byte block per mode for modes 2..34,
; back to back: mode N starts at dest + (N - 2) * 16 and is stored as four
; 4-byte rows.
;   r0 = dest, r1 = above0, r2 = left0
;   r3 / r4 (above1 / left1) are never referenced in this function.
;   r5 (bLuma) is overwritten with the address of ang_table.
;   m0 = pw_1024 from mode 3 up to mode 33 (it is plain data in modes 2 and 34).
;   m7 = ang_table entry 20, loaded once in mode 3 and reused later.
; pmulhrsw by 1024 evaluates (x * 1024 + 0x4000) >> 15, i.e. (x + 16) >> 5.
; NOTE(review): ang_table is defined outside this section; each 16-byte entry
; is presumably the pair of interpolation weights for one fraction - confirm.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal all_angs_pred_4x4, 6, 6, 8

    ; mode 2 -- four overlapping 4-byte windows of left0, starting at left0[2]

    movh        m0, [r2 + 2]
    movd        [r0], m0

    palignr     m1, m0, 1
    movd        [r0 + 4], m1

    palignr     m1, m0, 2
    movd        [r0 + 8], m1

    psrldq      m0, 3
    movd        [r0 + 12], m0

    ; mode 3 -- m1 = byte pairs (left0[i], left0[i + 1]) for i = 1..
    ;           m2 / m3 = the same pairs advanced by one / two samples

    mova        m0, [pw_1024]

    movh        m1, [r2 + 1]

    palignr     m2, m1, 1
    punpcklbw   m1, m2

    lea         r5, [ang_table]

    ALL_ANGS4_ROW  16, m5, m1, [r5 + 26 * 16]      ; m5 is stored again by mode 6

    palignr     m2, m1, 2

    mova        m7, [r5 + 20 * 16]

    ALL_ANGS4_ROW  20, m6, m2, m7                  ; m6 is stored again by mode 6

    palignr     m3, m1, 4

    ALL_ANGS4_ROW  24, m4, m3, [r5 + 14 * 16]

    palignr     m4, m1, 6

    ALL_ANGS4_ROW  28, m4, m4, [r5 + 8 * 16]

    ; mode 4

    ALL_ANGS4_ROW  32, m4, m1, [r5 + 21 * 16]
    ALL_ANGS4_ROW  36, m4, m2, [r5 + 10 * 16]
    ALL_ANGS4_ROW  40, m4, m2, [r5 + 31 * 16]
    ALL_ANGS4_ROW  44, m4, m3, m7

    ; mode 5 (last row consumes m3)

    ALL_ANGS4_ROW  48, m4, m1, [r5 + 17 * 16]
    ALL_ANGS4_ROW  52, m4, m2, [r5 + 2 * 16]
    ALL_ANGS4_ROW  56, m4, m2, [r5 + 19 * 16]
    ALL_ANGS4_ROW  60, m3, m3, [r5 + 4 * 16]

    ; mode 6 (rows 1 and 3 reuse the packed results kept in m5 / m6)

    ALL_ANGS4_ROW  64, m3, m1, [r5 + 13 * 16]

    movd        [r0 + 68], m5

    ALL_ANGS4_ROW  72, m3, m2, [r5 + 7 * 16]

    movd        [r0 + 76], m6

    ; mode 7 (last row consumes m2)

    ALL_ANGS4_ROW  80, m3, m1, [r5 + 9 * 16]
    ALL_ANGS4_ROW  84, m3, m1, [r5 + 18 * 16]
    ALL_ANGS4_ROW  88, m3, m1, [r5 + 27 * 16]
    ALL_ANGS4_ROW  92, m2, m2, [r5 + 4 * 16]

    ; mode 8

    ALL_ANGS4_ROW  96, m2, m1, [r5 + 5 * 16]
    ALL_ANGS4_ROW 100, m2, m1, [r5 + 10 * 16]
    ALL_ANGS4_ROW 104, m2, m1, [r5 + 15 * 16]
    ALL_ANGS4_ROW 108, m2, m1, m7

    ; mode 9 (last row consumes m1)

    ALL_ANGS4_ROW 112, m2, m1, [r5 + 2 * 16]
    ALL_ANGS4_ROW 116, m2, m1, [r5 + 4 * 16]
    ALL_ANGS4_ROW 120, m2, m1, [r5 + 6 * 16]
    ALL_ANGS4_ROW 124, m1, m1, [r5 + 8 * 16]

    ; mode 10 -- all four rows are left0[1..4]; afterwards byte 0 of every row
    ;            is replaced by sat8(left0[1] + ((above0[k + 1] - above0[0]) >> 1))

    movh        m1, [r2]
    palignr     m2, m1, 1
    pshufd      m3, m2, 0
    movu        [r0 + 128], m3

    pxor        m3, m3                      ; zero: pshufb mask and unpack filler

    pshufb      m4, m2, m3                  ; broadcast left0[1]
    punpcklbw   m4, m3

    movh        m5, [r1]

    pshufb      m6, m5, m3                  ; broadcast above0[0]
    punpcklbw   m6, m3

    psrldq      m5, 1                       ; above0[1..] widened to words
    punpcklbw   m5, m3

    psubw       m5, m6
    psraw       m5, 1

    paddw       m4, m5

    packuswb    m4, m3

    pextrb      [r0 + 128], m4, 0
    pextrb      [r0 + 132], m4, 1
    pextrb      [r0 + 136], m4, 2
    pextrb      [r0 + 140], m4, 3

    ; mode 11 -- m1 = byte pairs (left0[i], left0[i + 1]) for i = 0..

    palignr     m2, m1, 1
    punpcklbw   m1, m2

    ALL_ANGS4_ROW 144, m2, m1, [r5 + 30 * 16]
    ALL_ANGS4_ROW 148, m2, m1, [r5 + 28 * 16]
    ALL_ANGS4_ROW 152, m2, m1, [r5 + 26 * 16]
    ALL_ANGS4_ROW 156, m2, m1, [r5 + 24 * 16]

    ; mode 12

    ALL_ANGS4_ROW 160, m2, m1, [r5 + 27 * 16]
    ALL_ANGS4_ROW 164, m2, m1, [r5 + 22 * 16]
    ALL_ANGS4_ROW 168, m2, m1, [r5 + 17 * 16]
    ALL_ANGS4_ROW 172, m2, m1, [r5 + 12 * 16]

    ; mode 13 -- last row prepends the pair (above0[4], above0[0]) to m1

    ALL_ANGS4_ROW 176, m2, m1, [r5 + 23 * 16]
    ALL_ANGS4_ROW 180, m2, m1, [r5 + 14 * 16]
    ALL_ANGS4_ROW 184, m2, m1, [r5 + 5 * 16]

    pslldq      m2, m1, 2
    pinsrb      m2, [r1 + 0], 1
    pinsrb      m2, [r1 + 4], 0

    ALL_ANGS4_ROW 188, m3, m2, [r5 + 28 * 16]

    ; mode 14 -- m5 (row 1) is stored again by mode 17

    ALL_ANGS4_ROW 192, m3, m1, [r5 + 19 * 16]
    ALL_ANGS4_ROW 196, m5, m1, [r5 + 6 * 16]

    pinsrb      m2, [r1 + 2], 0             ; leading pair becomes (above0[2], above0[0])

    ALL_ANGS4_ROW 200, m3, m2, [r5 + 25 * 16]
    ALL_ANGS4_ROW 204, m3, m2, [r5 + 12 * 16]

    ; mode 15 -- last row prepends the pair (above0[4], above0[2]) to m2

    ALL_ANGS4_ROW 208, m3, m1, [r5 + 15 * 16]
    ALL_ANGS4_ROW 212, m3, m2, [r5 + 30 * 16]
    ALL_ANGS4_ROW 216, m3, m2, [r5 + 13 * 16]

    pslldq      m3, m2, 2
    pinsrb      m3, [r1 + 2], 1
    pinsrb      m3, [r1 + 4], 0

    ALL_ANGS4_ROW 220, m4, m3, [r5 + 28 * 16]

    ; mode 16

    ALL_ANGS4_ROW 224, m4, m1, [r5 + 11 * 16]
    ALL_ANGS4_ROW 228, m4, m2, [r5 + 22 * 16]
    ALL_ANGS4_ROW 232, m4, m2, [r5 + 1 * 16]

    pinsrb      m3, [r1 + 3], 0             ; leading pair becomes (above0[3], above0[2])

    ALL_ANGS4_ROW 236, m3, m3, [r5 + 12 * 16]

    ; mode 17 -- row 0 is the result kept in m5; each further row shifts one
    ;            more pair of above0 bytes into the front of m1

    movd        [r0 + 240], m5

    pslldq      m1, 2
    pinsrb      m1, [r1 + 1], 0
    pinsrb      m1, [r1 + 0], 1

    ALL_ANGS4_ROW 244, m2, m1, [r5 + 12 * 16]

    pslldq      m1, 2
    pinsrb      m1, [r1 + 2], 0
    pinsrb      m1, [r1 + 1], 1

    ALL_ANGS4_ROW 248, m2, m1, [r5 + 18 * 16]

    pslldq      m1, 2
    pinsrb      m1, [r1 + 4], 0
    pinsrb      m1, [r1 + 2], 1

    ALL_ANGS4_ROW 252, m1, m1, [r5 + 24 * 16]

    ; mode 18 -- row 0 = above0[0..3]; every next row is the previous one
    ;            shifted up one byte with left0[1], [2], [3] inserted at byte 0

    movh        m1, [r1]
    movd        [r0 + 256], m1

    pslldq      m2, m1, 1
    pinsrb      m2, [r2 + 1], 0
    movd        [r0 + 260], m2

    pslldq      m3, m2, 1
    pinsrb      m3, [r2 + 2], 0
    movd        [r0 + 264], m3

    pslldq      m4, m3, 1
    pinsrb      m4, [r2 + 3], 0
    movd        [r0 + 268], m4

    ; mode 19 -- m1 = byte pairs (above0[i], above0[i + 1]) for i = 0..;
    ;            later rows shift pairs of left0 bytes into the front.
    ;            m5 (row 0) is stored again by mode 22

    palignr     m4, m1, 1
    punpcklbw   m1, m4

    ALL_ANGS4_ROW 272, m5, m1, [r5 + 6 * 16]

    pslldq      m2, m1, 2
    pinsrb      m2, [r2 + 1], 0
    pinsrb      m2, [r2], 1

    ALL_ANGS4_ROW 276, m3, m2, [r5 + 12 * 16]

    pslldq      m3, m2, 2
    pinsrb      m3, [r2 + 1], 1
    pinsrb      m3, [r2 + 2], 0

    ALL_ANGS4_ROW 280, m4, m3, [r5 + 18 * 16]

    pslldq      m3, 2
    pinsrb      m3, [r2 + 2], 1
    pinsrb      m3, [r2 + 4], 0

    ALL_ANGS4_ROW 284, m3, m3, [r5 + 24 * 16]

    ; mode 20

    ALL_ANGS4_ROW 288, m3, m1, [r5 + 11 * 16]

    pinsrb      m2, [r2 + 2], 0             ; leading pair becomes (left0[2], left0[0])

    ALL_ANGS4_ROW 292, m3, m2, [r5 + 22 * 16]
    ALL_ANGS4_ROW 296, m3, m2, [r5 + 1 * 16]

    pslldq      m3, m2, 2
    pinsrb      m3, [r2 + 2], 1
    pinsrb      m3, [r2 + 3], 0

    ALL_ANGS4_ROW 300, m4, m3, [r5 + 12 * 16]

    ; mode 21

    ALL_ANGS4_ROW 304, m4, m1, [r5 + 15 * 16]
    ALL_ANGS4_ROW 308, m4, m2, [r5 + 30 * 16]
    ALL_ANGS4_ROW 312, m4, m2, [r5 + 13 * 16]

    pinsrb      m3, [r2 + 4], 0             ; leading pair becomes (left0[4], left0[2])

    ALL_ANGS4_ROW 316, m3, m3, [r5 + 28 * 16]

    ; mode 22 (row 1 reuses the result kept in m5)

    ALL_ANGS4_ROW 320, m3, m1, [r5 + 19 * 16]

    movd        [r0 + 324], m5

    ALL_ANGS4_ROW 328, m3, m2, [r5 + 25 * 16]
    ALL_ANGS4_ROW 332, m3, m2, [r5 + 12 * 16]

    ; mode 23

    ALL_ANGS4_ROW 336, m3, m1, [r5 + 23 * 16]
    ALL_ANGS4_ROW 340, m3, m1, [r5 + 14 * 16]
    ALL_ANGS4_ROW 344, m3, m1, [r5 + 5 * 16]

    pinsrb      m2, [r2 + 4], 0             ; leading pair becomes (left0[4], left0[0])

    ALL_ANGS4_ROW 348, m2, m2, [r5 + 28 * 16]

    ; mode 24

    ALL_ANGS4_ROW 352, m2, m1, [r5 + 27 * 16]
    ALL_ANGS4_ROW 356, m2, m1, [r5 + 22 * 16]
    ALL_ANGS4_ROW 360, m2, m1, [r5 + 17 * 16]
    ALL_ANGS4_ROW 364, m2, m1, [r5 + 12 * 16]

    ; mode 25

    ALL_ANGS4_ROW 368, m2, m1, [r5 + 30 * 16]
    ALL_ANGS4_ROW 372, m2, m1, [r5 + 28 * 16]
    ALL_ANGS4_ROW 376, m2, m1, [r5 + 26 * 16]
    ALL_ANGS4_ROW 380, m2, m1, [r5 + 24 * 16]

    ; mode 26 -- all four rows are above0[1..4]; afterwards byte 0 of every row
    ;            is replaced by sat8(above0[1] + ((left0[k + 1] - left0[0]) >> 1))

    movh        m1, [r1 + 1]
    pshufd      m2, m1, 0
    movu        [r0 + 384], m2

    pxor        m2, m2                      ; zero: pshufb mask and unpack filler

    pshufb      m3, m1, m2                  ; broadcast above0[1]
    punpcklbw   m3, m2

    movh        m4, [r2]

    pshufb      m5, m4, m2                  ; broadcast left0[0]
    punpcklbw   m5, m2

    psrldq      m4, 1                       ; left0[1..] widened to words
    punpcklbw   m4, m2

    psubw       m4, m5
    psraw       m4, 1

    paddw       m3, m4

    packuswb    m3, m2

    pextrb      [r0 + 384], m3, 0
    pextrb      [r0 + 388], m3, 1
    pextrb      [r0 + 392], m3, 2
    pextrb      [r0 + 396], m3, 3

    ; mode 27 -- m1 = byte pairs (above0[i], above0[i + 1]) for i = 1..

    palignr     m2, m1, 1
    punpcklbw   m1, m2

    ALL_ANGS4_ROW 400, m2, m1, [r5 + 2 * 16]
    ALL_ANGS4_ROW 404, m2, m1, [r5 + 4 * 16]
    ALL_ANGS4_ROW 408, m2, m1, [r5 + 6 * 16]
    ALL_ANGS4_ROW 412, m2, m1, [r5 + 8 * 16]

    ; mode 28

    ALL_ANGS4_ROW 416, m2, m1, [r5 + 5 * 16]
    ALL_ANGS4_ROW 420, m2, m1, [r5 + 10 * 16]
    ALL_ANGS4_ROW 424, m2, m1, [r5 + 15 * 16]
    ALL_ANGS4_ROW 428, m2, m1, m7

    ; mode 29

    ALL_ANGS4_ROW 432, m2, m1, [r5 + 9 * 16]
    ALL_ANGS4_ROW 436, m2, m1, [r5 + 18 * 16]
    ALL_ANGS4_ROW 440, m2, m1, [r5 + 27 * 16]

    palignr     m2, m1, 2                   ; m2 = pairs advanced by one sample

    ALL_ANGS4_ROW 444, m3, m2, [r5 + 4 * 16]

    ; mode 30 -- m6 (row 1) and m5 (row 3) are stored again by mode 33

    ALL_ANGS4_ROW 448, m3, m1, [r5 + 13 * 16]
    ALL_ANGS4_ROW 452, m6, m1, [r5 + 26 * 16]
    ALL_ANGS4_ROW 456, m3, m2, [r5 + 7 * 16]
    ALL_ANGS4_ROW 460, m5, m2, m7

    ; mode 31

    ALL_ANGS4_ROW 464, m3, m1, [r5 + 17 * 16]
    ALL_ANGS4_ROW 468, m3, m2, [r5 + 2 * 16]
    ALL_ANGS4_ROW 472, m3, m2, [r5 + 19 * 16]

    palignr     m3, m2, 2                   ; m3 = pairs advanced by two samples

    ALL_ANGS4_ROW 476, m4, m3, [r5 + 4 * 16]

    ; mode 32

    ALL_ANGS4_ROW 480, m4, m1, [r5 + 21 * 16]
    ALL_ANGS4_ROW 484, m4, m2, [r5 + 10 * 16]
    ALL_ANGS4_ROW 488, m4, m2, [r5 + 31 * 16]
    ALL_ANGS4_ROW 492, m4, m3, m7

    ; mode 33 (rows 0 and 1 reuse the results kept in m6 / m5)

    movd        [r0 + 496], m6

    movd        [r0 + 500], m5

    ALL_ANGS4_ROW 504, m4, m3, [r5 + 14 * 16]

    psrldq      m3, 2                       ; pairs advanced by three samples

    ALL_ANGS4_ROW 508, m3, m3, [r5 + 8 * 16]

    ; mode 34 -- four overlapping 4-byte windows of above0, starting at above0[2]

    movh        m0, [r1 + 2]
    movd        [r0 + 512], m0

    palignr     m1, m0, 1
    movd        [r0 + 516], m1

    palignr     m1, m0, 2
    movd        [r0 + 520], m1

    palignr     m1, m0, 3
    movd        [r0 + 524], m1

    RET
| 9835 | |
| 9836 | ;----------------------------------------------------------------------------- |
| 9837 | ; void all_angs_pred_8x8(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma) |
| 9838 | ;----------------------------------------------------------------------------- |
| 9839 | INIT_XMM sse4 |
| 9840 | cglobal all_angs_pred_8x8, 6, 6, 8, dest, above0, left0, above1, left1, bLuma |
| 9841 | |
| 9842 | ; mode 2 |
| 9843 | |
| 9844 | movu m0, [r4 + 2] |
| 9845 | |
| 9846 | palignr m1, m0, 1 |
| 9847 | punpcklqdq m2, m0, m1 |
| 9848 | movu [r0], m2 |
| 9849 | |
| 9850 | palignr m1, m0, 2 |
| 9851 | palignr m2, m0, 3 |
| 9852 | punpcklqdq m1, m2 |
| 9853 | movu [r0 + 16], m1 |
| 9854 | |
| 9855 | palignr m1, m0, 4 |
| 9856 | palignr m2, m0, 5 |
| 9857 | punpcklqdq m1, m2 |
| 9858 | movu [r0 + 32], m1 |
| 9859 | |
| 9860 | palignr m1, m0, 6 |
| 9861 | palignr m2, m0, 7 |
| 9862 | punpcklqdq m1, m2 |
| 9863 | movu [r0 + 48], m1 |
| 9864 | |
| 9865 | ; mode 3 [row 0, 1] |
| 9866 | |
| 9867 | mova m7, [pw_1024] |
| 9868 | lea r5, [ang_table] |
| 9869 | |
| 9870 | movu m0, [r2 + 1] |
| 9871 | |
| 9872 | palignr m1, m0, 1 |
| 9873 | palignr m2, m0, 2 |
| 9874 | |
| 9875 | punpcklbw m3, m0, m1 |
| 9876 | pmaddubsw m4, m3, [r5 + 26 * 16] |
| 9877 | pmulhrsw m4, m7 |
| 9878 | |
| 9879 | punpcklbw m1, m2 |
| 9880 | pmaddubsw m5, m1, [r5 + 20 * 16] |
| 9881 | pmulhrsw m5, m7 |
| 9882 | |
| 9883 | packuswb m4, m5 |
| 9884 | |
| 9885 | movu [r0 + 64], m4 |
| 9886 | |
| 9887 | ; mode 6 [row 1] |
| 9888 | |
| 9889 | movh [r0 + 264], m4 |
| 9890 | |
| 9891 | ; mode 6 [row 3] |
| 9892 | |
| 9893 | movhps [r0 + 280], m4 |
| 9894 | |
| 9895 | ; mode 4 [row 0, 1] |
| 9896 | |
| 9897 | pmaddubsw m4, m3, [r5 + 21 * 16] |
| 9898 | pmulhrsw m4, m7 |
| 9899 | |
| 9900 | pmaddubsw m5, m1, [r5 + 10 * 16] |
| 9901 | pmulhrsw m5, m7 |
| 9902 | |
| 9903 | packuswb m4, m5 |
| 9904 | movu [r0 + 128], m4 |
| 9905 | |
| 9906 | ; mode 5 [row 0, 1] |
| 9907 | |
| 9908 | pmaddubsw m4, m3, [r5 + 17 * 16] |
| 9909 | pmulhrsw m4, m7 |
| 9910 | |
| 9911 | pmaddubsw m5, m1, [r5 + 2 * 16] |
| 9912 | pmulhrsw m5, m7 |
| 9913 | |
| 9914 | packuswb m4, m5 |
| 9915 | movu [r0 + 192], m4 |
| 9916 | |
| 9917 | ; mode 6 [row 0] |
| 9918 | |
| 9919 | pmaddubsw m4, m3, [r5 + 13 * 16] |
| 9920 | pmulhrsw m4, m7 |
| 9921 | |
| 9922 | pxor m5, m5 |
| 9923 | |
| 9924 | packuswb m4, m5 |
| 9925 | movh [r0 + 256], m4 |
| 9926 | |
| 9927 | ; mode 7 [row 0, 1] |
| 9928 | |
| 9929 | pmaddubsw m4, m3, [r5 + 9 * 16] |
| 9930 | pmulhrsw m4, m7 |
| 9931 | |
| 9932 | pmaddubsw m5, m3, [r5 + 18 * 16] |
| 9933 | pmulhrsw m5, m7 |
| 9934 | |
| 9935 | packuswb m4, m5 |
| 9936 | movu [r0 + 320], m4 |
| 9937 | |
| 9938 | ; mode 8 [row 0, 1] |
| 9939 | |
| 9940 | pmaddubsw m4, m3, [r5 + 5 * 16] |
| 9941 | pmulhrsw m4, m7 |
| 9942 | |
| 9943 | pmaddubsw m5, m3, [r5 + 10 * 16] |
| 9944 | pmulhrsw m5, m7 |
| 9945 | |
| 9946 | packuswb m4, m5 |
| 9947 | movu [r0 + 384], m4 |
| 9948 | |
| 9949 | ; mode 8 [row 2, 3] |
| 9950 | |
| 9951 | pmaddubsw m4, m3, [r5 + 15 * 16] |
| 9952 | pmulhrsw m4, m7 |
| 9953 | |
| 9954 | pmaddubsw m5, m3, [r5 + 20 * 16] |
| 9955 | pmulhrsw m5, m7 |
| 9956 | |
| 9957 | packuswb m4, m5 |
| 9958 | movu [r0 + 400], m4 |
| 9959 | |
| 9960 | ; mode 8 [row 4, 5] |
| 9961 | |
| 9962 | pmaddubsw m4, m3, [r5 + 25 * 16] |
| 9963 | pmulhrsw m4, m7 |
| 9964 | |
| 9965 | pmaddubsw m5, m3, [r5 + 30 * 16] |
| 9966 | pmulhrsw m5, m7 |
| 9967 | |
| 9968 | packuswb m4, m5 |
| 9969 | movu [r0 + 416], m4 |
| 9970 | |
| 9971 | ; mode 8 [row 6, 7] |
| 9972 | |
| 9973 | pmaddubsw m4, m1, [r5 + 3 * 16] |
| 9974 | pmulhrsw m4, m7 |
| 9975 | |
| 9976 | pmaddubsw m5, m1, [r5 + 8 * 16] |
| 9977 | pmulhrsw m5, m7 |
| 9978 | |
| 9979 | packuswb m4, m5 |
| 9980 | movu [r0 + 432], m4 |
| 9981 | |
| 9982 | ; mode 9 [row 0, 1] |
| 9983 | |
| 9984 | pmaddubsw m4, m3, [r5 + 2 * 16] |
| 9985 | pmulhrsw m4, m7 |
| 9986 | |
| 9987 | pmaddubsw m5, m3, [r5 + 4 * 16] |
| 9988 | pmulhrsw m5, m7 |
| 9989 | |
| 9990 | packuswb m4, m5 |
| 9991 | movu [r0 + 448], m4 |
| 9992 | |
| 9993 | ; mode 9 [row 2, 3] |
| 9994 | |
| 9995 | pmaddubsw m4, m3, [r5 + 6 * 16] |
| 9996 | pmulhrsw m4, m7 |
| 9997 | |
| 9998 | pmaddubsw m5, m3, [r5 + 8 * 16] |
| 9999 | pmulhrsw m5, m7 |
| 10000 | |
| 10001 | packuswb m4, m5 |
| 10002 | movu [r0 + 464], m4 |
| 10003 | |
| 10004 | ; mode 9 [row 4, 5] |
| 10005 | |
| 10006 | pmaddubsw m4, m3, [r5 + 10 * 16] |
| 10007 | pmulhrsw m4, m7 |
| 10008 | |
| 10009 | pmaddubsw m5, m3, [r5 + 12 * 16] |
| 10010 | pmulhrsw m5, m7 |
| 10011 | |
| 10012 | packuswb m4, m5 |
| 10013 | movu [r0 + 480], m4 |
| 10014 | |
| 10015 | ; mode 9 [row 6, 7] |
| 10016 | |
| 10017 | pmaddubsw m4, m3, [r5 + 14 * 16] |
| 10018 | pmulhrsw m4, m7 |
| 10019 | |
| 10020 | pmaddubsw m5, m3, [r5 + 16 * 16] |
| 10021 | pmulhrsw m5, m7 |
| 10022 | |
| 10023 | packuswb m4, m5 |
| 10024 | movu [r0 + 496], m4 |
| 10025 | |
| 10026 | ; mode 7 [row 2, 3] |
| 10027 | |
| 10028 | pmaddubsw m4, m3, [r5 + 27 * 16] |
| 10029 | pmulhrsw m4, m7 |
| 10030 | |
| 10031 | pmaddubsw m5, m1, [r5 + 4 * 16] |
| 10032 | pmulhrsw m5, m7 |
| 10033 | |
| 10034 | packuswb m4, m5 |
| 10035 | movu [r0 + 336], m4 |
| 10036 | |
| 10037 | ; mode 7 [row 4, 5] |
| 10038 | |
| 10039 | pmaddubsw m4, m1, [r5 + 13 * 16] |
| 10040 | pmulhrsw m4, m7 |
| 10041 | |
| 10042 | pmaddubsw m5, m1, [r5 + 22 * 16] |
| 10043 | pmulhrsw m5, m7 |
| 10044 | |
| 10045 | packuswb m4, m5 |
| 10046 | movu [r0 + 352], m4 |
| 10047 | |
| 10048 | ; mode 6 [row 2] |
| 10049 | |
| 10050 | pmaddubsw m4, m1, [r5 + 7 * 16] |
| 10051 | pmulhrsw m4, m7 |
| 10052 | |
| 10053 | pxor m5, m5 |
| 10054 | |
| 10055 | packuswb m4, m5 |
| 10056 | movh [r0 + 272], m4 |
| 10057 | |
| 10058 | ; mode 3 [row 2, 3] |
| 10059 | |
| 10060 | palignr m1, m0, 3 |
| 10061 | palignr m3, m0, 4 |
| 10062 | |
| 10063 | punpcklbw m2, m1 |
| 10064 | pmaddubsw m5, m2, [r5 + 14 * 16] |
| 10065 | pmulhrsw m5, m7 |
| 10066 | |
| 10067 | punpcklbw m1, m3 |
| 10068 | pmaddubsw m6, m1, [r5 + 8 * 16] |
| 10069 | pmulhrsw m6, m7 |
| 10070 | |
| 10071 | packuswb m5, m6 |
| 10072 | movu [r0 + 80], m5 |
| 10073 | |
| 10074 | ; mode 6 [row 7] |
| 10075 | |
| 10076 | movhps [r0 + 312], m5 |
| 10077 | |
| 10078 | ; mode 6 [row 5] |
| 10079 | |
| 10080 | movh [r0 + 296], m5 |
| 10081 | |
| 10082 | ; mode 4 [calculate and store row 4, 5] |
| 10083 | |
| 10084 | pmaddubsw m4, m1, [r5 + 9 * 16] |
| 10085 | pmulhrsw m4, m7 |
| 10086 | |
| 10087 | pmaddubsw m5, m1, [r5 + 30 * 16] |
| 10088 | pmulhrsw m5, m7 |
| 10089 | |
| 10090 | packuswb m4, m5 |
| 10091 | movu [r0 + 160], m4 |
| 10092 | |
| 10093 | ; mode 5 [row 4, 5] |
| 10094 | |
| 10095 | pmaddubsw m4, m2, [r5 + 21 * 16] |
| 10096 | pmulhrsw m4, m7 |
| 10097 | |
| 10098 | pmaddubsw m5, m1, [r5 + 6 * 16] |
| 10099 | pmulhrsw m5, m7 |
| 10100 | |
| 10101 | packuswb m4, m5 |
| 10102 | movu [r0 + 224], m4 |
| 10103 | |
| 10104 | ; mode 6 [row 4, 5] |
| 10105 | |
| 10106 | pmaddubsw m5, m2, [r5 + 1 * 16] |
| 10107 | pmulhrsw m5, m7 |
| 10108 | |
| 10109 | pxor m6, m6 |
| 10110 | |
| 10111 | packuswb m5, m6 |
| 10112 | movh [r0 + 288], m5 |
| 10113 | |
| 10114 | ; mode 6 [row 6, 7] |
| 10115 | |
| 10116 | pmaddubsw m5, m2, [r5 + 27 * 16] |
| 10117 | pmulhrsw m5, m7 |
| 10118 | |
| 10119 | pxor m6, m6 |
| 10120 | |
| 10121 | packuswb m5, m6 |
| 10122 | movh [r0 + 304], m5 |
| 10123 | |
| 10124 | ; mode 5 [calculate row 6] |
| 10125 | |
| 10126 | pmaddubsw m6, m1, [r5 + 23 * 16] |
| 10127 | pmulhrsw m6, m7 |
| 10128 | |
| 10129 | ; mode 3 [row 4, 5] |
| 10130 | |
| 10131 | palignr m1, m0, 5 |
| 10132 | |
| 10133 | punpcklbw m3, m1 |
| 10134 | pmaddubsw m4, m3, [r5 + 2 * 16] |
| 10135 | pmulhrsw m4, m7 |
| 10136 | |
| 10137 | pmaddubsw m5, m3, [r5 + 28 * 16] |
| 10138 | pmulhrsw m5, m7 |
| 10139 | |
| 10140 | packuswb m4, m5 |
| 10141 | movu [r0 + 96], m4 |
| 10142 | |
| 10143 | ; mode 4 [calculate row 7] |
| 10144 | |
| 10145 | pmaddubsw m5, m3, [r5 + 19 * 16] |
| 10146 | pmulhrsw m5, m7 |
| 10147 | |
| 10148 | ; mode 5 [calculate row 6] |
| 10149 | |
| 10150 | pmaddubsw m4, m3, [r5 + 8 * 16] |
| 10151 | pmulhrsw m4, m7 |
| 10152 | |
| 10153 | packuswb m6, m4 |
| 10154 | movu [r0 + 240], m6 |
| 10155 | |
| 10156 | ; mode 3 [row 6, 7] |
| 10157 | |
| 10158 | palignr m2, m0, 6 |
| 10159 | palignr m3, m0, 7 |
| 10160 | |
| 10161 | punpcklbw m1, m2 |
| 10162 | pmaddubsw m4, m1, [r5 + 22 * 16] |
| 10163 | pmulhrsw m4, m7 |
| 10164 | |
| 10165 | punpcklbw m2, m3 |
| 10166 | pmaddubsw m2, [r5 + 16 * 16] |
| 10167 | pmulhrsw m2, m7 |
| 10168 | |
| 10169 | packuswb m4, m2 |
| 10170 | movu [r0 + 112], m4 |
| 10171 | |
| 10172 | ; mode 4 [calculate row 7] |
| 10173 | |
| 10174 | pmaddubsw m2, m1, [r5 + 8 * 16] |
| 10175 | pmulhrsw m2, m7 |
| 10176 | |
| 10177 | ; mode 4 [store row 6 and 7] |
| 10178 | |
| 10179 | packuswb m5, m2 |
| 10180 | movu [r0 + 176], m5 |
| 10181 | |
| 10182 | ; mode 4 [row 2, 3] |
| 10183 | |
| 10184 | palignr m1, m0, 1 |
| 10185 | palignr m2, m0, 2 |
| 10186 | palignr m3, m0, 3 |
| 10187 | |
| 10188 | punpcklbw m1, m2 |
| 10189 | pmaddubsw m4, m1, [r5 + 31 * 16] |
| 10190 | pmulhrsw m4, m7 |
| 10191 | |
| 10192 | punpcklbw m2, m3 |
| 10193 | pmaddubsw m5, m2, [r5 + 20 * 16] |
| 10194 | pmulhrsw m5, m7 |
| 10195 | |
| 10196 | packuswb m4, m5 |
| 10197 | movu [r0 + 144], m4 |
| 10198 | |
| 10199 | ; mode 5 [row 2, 3] |
| 10200 | |
| 10201 | pmaddubsw m4, m1, [r5 + 19 * 16] |
| 10202 | pmulhrsw m4, m7 |
| 10203 | |
| 10204 | pmaddubsw m5, m2, [r5 + 4 * 16] |
| 10205 | pmulhrsw m5, m7 |
| 10206 | |
| 10207 | packuswb m4, m5 |
| 10208 | movu [r0 + 208], m4 |
| 10209 | |
| 10210 | ; mode 7 [row 6, 7] |
| 10211 | |
| 10212 | pmaddubsw m4, m1, [r5 + 31 * 16] |
| 10213 | pmulhrsw m4, m7 |
| 10214 | |
| 10215 | pmaddubsw m5, m2, [r5 + 8 * 16] |
| 10216 | pmulhrsw m5, m7 |
| 10217 | |
| 10218 | packuswb m4, m5 |
| 10219 | movu [r0 + 368], m4 |
| 10220 | |
| 10221 | ; mode 10 |
| 10222 | |
| 10223 | pshufb m1, m0, [tab_Si] |
| 10224 | movu [r0 + 512], m1 |
| 10225 | movu [r0 + 528], m1 |
| 10226 | movu [r0 + 544], m1 |
| 10227 | movu [r0 + 560], m1 |
| 10228 | |
| 10229 | pxor m0, m0 |
| 10230 | |
| 10231 | pshufb m1, m1, m0 |
| 10232 | punpcklbw m1, m0 |
| 10233 | |
| 10234 | movu m2, [r1] |
| 10235 | |
| 10236 | pshufb m3, m2, m0 |
| 10237 | punpcklbw m3, m0 |
| 10238 | |
| 10239 | psrldq m4, m2, 1 |
| 10240 | punpcklbw m4, m0 |
| 10241 | |
| 10242 | movu m2, [r1 + 9] |
| 10243 | punpcklbw m2, m0 |
| 10244 | |
| 10245 | psubw m4, m3 |
| 10246 | psubw m2, m3 |
| 10247 | |
| 10248 | psraw m4, 1 |
| 10249 | psraw m2, 1 |
| 10250 | |
| 10251 | paddw m4, m1 |
| 10252 | paddw m2, m1 |
| 10253 | |
| 10254 | packuswb m4, m2 |
| 10255 | |
| 10256 | pextrb [r0 + 512], m4, 0 |
| 10257 | pextrb [r0 + 520], m4, 1 |
| 10258 | pextrb [r0 + 528], m4, 2 |
| 10259 | pextrb [r0 + 536], m4, 3 |
| 10260 | pextrb [r0 + 544], m4, 4 |
| 10261 | pextrb [r0 + 552], m4, 5 |
| 10262 | pextrb [r0 + 560], m4, 6 |
| 10263 | pextrb [r0 + 568], m4, 7 |
| 10264 | |
| 10265 | ; mode 11 [row 0, 1] |
| 10266 | |
| 10267 | movu m0, [r2] |
| 10268 | palignr m1, m0, 1 |
| 10269 | punpcklbw m2, m0, m1 |
| 10270 | |
| 10271 | pmaddubsw m3, m2, [r5 + 30 * 16] |
| 10272 | pmulhrsw m3, m7 |
| 10273 | |
| 10274 | pmaddubsw m4, m2, [r5 + 28 * 16] |
| 10275 | pmulhrsw m4, m7 |
| 10276 | |
| 10277 | packuswb m3, m4 |
| 10278 | movu [r0 + 576], m3 |
| 10279 | |
| 10280 | ; mode 11 [row 2, 3] |
| 10281 | |
| 10282 | pmaddubsw m3, m2, [r5 + 26 * 16] |
| 10283 | pmulhrsw m3, m7 |
| 10284 | |
| 10285 | pmaddubsw m4, m2, [r5 + 24 * 16] |
| 10286 | pmulhrsw m4, m7 |
| 10287 | |
| 10288 | packuswb m3, m4 |
| 10289 | movu [r0 + 592], m3 |
| 10290 | |
| 10291 | ; mode 11 [row 4, 5] |
| 10292 | |
| 10293 | pmaddubsw m3, m2, [r5 + 22 * 16] |
| 10294 | pmulhrsw m3, m7 |
| 10295 | |
| 10296 | pmaddubsw m4, m2, [r5 + 20 * 16] |
| 10297 | pmulhrsw m4, m7 |
| 10298 | |
| 10299 | packuswb m5, m3, m4 |
| 10300 | movu [r0 + 608], m5 |
| 10301 | |
| 10302 | ; mode 12 [row 0, 1] |
| 10303 | |
| 10304 | pmaddubsw m4, m2, [r5 + 27 * 16] |
| 10305 | pmulhrsw m4, m7 |
| 10306 | |
| 10307 | packuswb m4, m3 |
| 10308 | movu [r0 + 640], m4 |
| 10309 | |
| 10310 | ; mode 11 [row 6, 7] |
| 10311 | |
| 10312 | pmaddubsw m3, m2, [r5 + 18 * 16] |
| 10313 | pmulhrsw m3, m7 |
| 10314 | |
| 10315 | pmaddubsw m4, m2, [r5 + 16 * 16] |
| 10316 | pmulhrsw m4, m7 |
| 10317 | |
| 10318 | packuswb m3, m4 |
| 10319 | movu [r0 + 624], m3 |
| 10320 | |
| 10321 | ; mode 12 [row 2, 3] |
| 10322 | |
| 10323 | pmaddubsw m3, m2, [r5 + 17 * 16] |
| 10324 | pmulhrsw m3, m7 |
| 10325 | |
| 10326 | pmaddubsw m4, m2, [r5 + 12 * 16] |
| 10327 | pmulhrsw m4, m7 |
| 10328 | |
| 10329 | packuswb m3, m4 |
| 10330 | movu [r0 + 656], m3 |
| 10331 | |
| 10332 | ; mode 12 [row 4, 5] |
| 10333 | |
| 10334 | pmaddubsw m3, m2, [r5 + 7 * 16] |
| 10335 | pmulhrsw m3, m7 |
| 10336 | |
| 10337 | pmaddubsw m4, m2, [r5 + 2 * 16] |
| 10338 | pmulhrsw m4, m7 |
| 10339 | |
| 10340 | packuswb m3, m4 |
| 10341 | movu [r0 + 672], m3 |
| 10342 | |
| 10343 | ; mode 12 [row 6, 7] |
| 10344 | |
| 10345 | pslldq m3, m2, 2 |
| 10346 | pinsrb m3, [r1 + 0], 1 |
| 10347 | pinsrb m3, [r1 + 6], 0 |
| 10348 | |
| 10349 | pmaddubsw m4, m3, [r5 + 29 * 16] |
| 10350 | pmulhrsw m4, m7 |
| 10351 | |
| 10352 | pmaddubsw m5, m3, [r5 + 24 * 16] |
| 10353 | pmulhrsw m5, m7 |
| 10354 | |
| 10355 | packuswb m4, m5 |
| 10356 | movu [r0 + 688], m4 |
| 10357 | |
| 10358 | ; mode 13 [row 0, 1] |
| 10359 | |
| 10360 | pmaddubsw m4, m2, [r5 + 23 * 16] |
| 10361 | pmulhrsw m4, m7 |
| 10362 | |
| 10363 | pmaddubsw m5, m2, [r5 + 14 * 16] |
| 10364 | pmulhrsw m5, m7 |
| 10365 | |
| 10366 | packuswb m4, m5 |
| 10367 | movu [r0 + 704], m4 |
| 10368 | |
| 10369 | ; mode 13 [row 2, 3] |
| 10370 | |
| 10371 | pmaddubsw m4, m2, [r5 + 5 * 16] |
| 10372 | pmulhrsw m4, m7 |
| 10373 | |
| 10374 | pinsrb m3, [r1 + 4], 0 |
| 10375 | pmaddubsw m5, m3, [r5 + 28 * 16] |
| 10376 | pmulhrsw m5, m7 |
| 10377 | |
| 10378 | packuswb m4, m5 |
| 10379 | movu [r0 + 720], m4 |
| 10380 | |
| 10381 | ; mode 13 [row 4, 5] |
| 10382 | |
| 10383 | pmaddubsw m4, m3, [r5 + 19 * 16] |
| 10384 | pmulhrsw m4, m7 |
| 10385 | |
| 10386 | pmaddubsw m5, m3, [r5 + 10 * 16] |
| 10387 | pmulhrsw m5, m7 |
| 10388 | |
| 10389 | packuswb m4, m5 |
| 10390 | movu [r0 + 736], m4 |
| 10391 | |
| 10392 | ; mode 13 [row 6, 7] |
| 10393 | |
| 10394 | pmaddubsw m4, m3, [r5 + 1 * 16] |
| 10395 | pmulhrsw m4, m7 |
| 10396 | |
| 10397 | pslldq m5, m3, 2 |
| 10398 | pinsrb m5, [r1 + 4], 1 |
| 10399 | pinsrb m5, [r1 + 7], 0 |
| 10400 | |
| 10401 | pmaddubsw m5, [r5 + 24 * 16] |
| 10402 | pmulhrsw m5, m7 |
| 10403 | |
| 10404 | packuswb m4, m5 |
| 10405 | movu [r0 + 752], m4 |
| 10406 | |
| 10407 | ; mode 14 [row 0, 1] |
| 10408 | |
| 10409 | pmaddubsw m4, m2, [r5 + 19 * 16] |
| 10410 | pmulhrsw m4, m7 |
| 10411 | |
| 10412 | pmaddubsw m5, m2, [r5 + 6 * 16] |
| 10413 | pmulhrsw m5, m7 |
| 10414 | |
| 10415 | packuswb m4, m5 |
| 10416 | movu [r0 + 768], m4 |
| 10417 | |
| 10418 | ; mode 14 [row 2, 3] |
| 10419 | |
| 10420 | pinsrb m3, [r1 + 2], 0 |
| 10421 | |
| 10422 | pmaddubsw m4, m3, [r5 + 25 * 16] |
| 10423 | pmulhrsw m4, m7 |
| 10424 | |
| 10425 | pmaddubsw m5, m3, [r5 + 12 * 16] |
| 10426 | pmulhrsw m5, m7 |
| 10427 | |
| 10428 | packuswb m4, m5 |
| 10429 | movu [r0 + 784], m4 |
| 10430 | |
| 10431 | ; mode 14 [row 4, 5] |
| 10432 | |
| 10433 | pslldq m1, m3, 2 |
| 10434 | pinsrb m1, [r1 + 2], 1 |
| 10435 | pinsrb m1, [r1 + 5], 0 |
| 10436 | |
| 10437 | pmaddubsw m4, m1, [r5 + 31 * 16] |
| 10438 | pmulhrsw m4, m7 |
| 10439 | |
| 10440 | pmaddubsw m5, m1, [r5 + 18 * 16] |
| 10441 | pmulhrsw m5, m7 |
| 10442 | |
| 10443 | packuswb m4, m5 |
| 10444 | movu [r0 + 800], m4 |
| 10445 | |
| 10446 | ; mode 14 [row 6, 7] |
| 10447 | |
| 10448 | pmaddubsw m4, m1, [r5 + 5 * 16] |
| 10449 | pmulhrsw m4, m7 |
| 10450 | |
| 10451 | pslldq m1, 2 |
| 10452 | pinsrb m1, [r1 + 5], 1 |
| 10453 | pinsrb m1, [r1 + 7], 0 |
| 10454 | |
| 10455 | pmaddubsw m5, m1, [r5 + 24 * 16] |
| 10456 | pmulhrsw m5, m7 |
| 10457 | |
| 10458 | packuswb m4, m5 |
| 10459 | movu [r0 + 816], m4 |
| 10460 | |
| 10461 | ; mode 15 [row 0, 1] |
| 10462 | |
| 10463 | pmaddubsw m4, m2, [r5 + 15 * 16] |
| 10464 | pmulhrsw m4, m7 |
| 10465 | |
| 10466 | pmaddubsw m5, m3, [r5 + 30 * 16] |
| 10467 | pmulhrsw m5, m7 |
| 10468 | |
| 10469 | packuswb m4, m5 |
| 10470 | movu [r0 + 832], m4 |
| 10471 | |
| 10472 | ; mode 15 [row 2, 3] |
| 10473 | |
| 10474 | pmaddubsw m4, m3, [r5 + 13 * 16] |
| 10475 | pmulhrsw m4, m7 |
| 10476 | |
| 10477 | pslldq m1, m3, 2 |
| 10478 | pinsrb m1, [r1 + 2], 1 |
| 10479 | pinsrb m1, [r1 + 4], 0 |
| 10480 | |
| 10481 | pmaddubsw m5, m1, [r5 + 28 * 16] |
| 10482 | pmulhrsw m5, m7 |
| 10483 | |
| 10484 | packuswb m4, m5 |
| 10485 | movu [r0 + 848], m4 |
| 10486 | |
| 10487 | ; mode 15 [row 4, 5] |
| 10488 | |
| 10489 | pmaddubsw m4, m1, [r5 + 11 * 16] |
| 10490 | pmulhrsw m4, m7 |
| 10491 | |
| 10492 | pslldq m1, 2 |
| 10493 | pinsrb m1, [r1 + 4], 1 |
| 10494 | pinsrb m1, [r1 + 6], 0 |
| 10495 | |
| 10496 | pmaddubsw m5, m1, [r5 + 26 * 16] |
| 10497 | pmulhrsw m5, m7 |
| 10498 | |
| 10499 | packuswb m4, m5 |
| 10500 | movu [r0 + 864], m4 |
| 10501 | |
| 10502 | ; mode 15 [row 6, 7] |
| 10503 | |
| 10504 | pmaddubsw m4, m1, [r5 + 9 * 16] |
| 10505 | pmulhrsw m4, m7 |
| 10506 | |
| 10507 | pslldq m1, 2 |
| 10508 | pinsrb m1, [r1 + 6], 1 |
| 10509 | pinsrb m1, [r1 + 8], 0 |
| 10510 | |
| 10511 | pmaddubsw m1, [r5 + 24 * 16] |
| 10512 | pmulhrsw m1, m7 |
| 10513 | |
| 10514 | packuswb m4, m1 |
| 10515 | movu [r0 + 880], m4 |
| 10516 | |
| 10517 | ; mode 16 [row 0, 1] |
| 10518 | |
| 10519 | pmaddubsw m4, m2, [r5 + 11 * 16] |
| 10520 | pmulhrsw m4, m7 |
| 10521 | |
| 10522 | pmaddubsw m5, m3, [r5 + 22 * 16] |
| 10523 | pmulhrsw m5, m7 |
| 10524 | |
| 10525 | packuswb m4, m5 |
| 10526 | movu [r0 + 896], m4 |
| 10527 | |
| 10528 | ; mode 16 [row 2, 3] |
| 10529 | |
| 10530 | pmaddubsw m4, m3, [r5 + 1 * 16] |
| 10531 | pmulhrsw m4, m7 |
| 10532 | |
| 10533 | pslldq m3, 2 |
| 10534 | pinsrb m3, [r1 + 2], 1 |
| 10535 | pinsrb m3, [r1 + 3], 0 |
| 10536 | |
| 10537 | pmaddubsw m5, m3, [r5 + 12 * 16] |
| 10538 | pmulhrsw m5, m7 |
| 10539 | |
| 10540 | packuswb m4, m5 |
| 10541 | movu [r0 + 912], m4 |
| 10542 | |
| 10543 | ; mode 16 [row 4, 5] |
| 10544 | |
| 10545 | pslldq m3, 2 |
| 10546 | pinsrb m3, [r1 + 3], 1 |
| 10547 | pinsrb m3, [r1 + 5], 0 |
| 10548 | |
| 10549 | pmaddubsw m4, m3, [r5 + 23 * 16] |
| 10550 | pmulhrsw m4, m7 |
| 10551 | |
| 10552 | pmaddubsw m5, m3, [r5 + 2 * 16] |
| 10553 | pmulhrsw m5, m7 |
| 10554 | |
| 10555 | packuswb m4, m5 |
| 10556 | movu [r0 + 928], m4 |
| 10557 | |
| 10558 | ; mode 16 [row 6, 7] |
| 10559 | |
| 10560 | pslldq m3, 2 |
| 10561 | pinsrb m3, [r1 + 5], 1 |
| 10562 | pinsrb m3, [r1 + 6], 0 |
| 10563 | |
| 10564 | pmaddubsw m4, m3, [r5 + 13 * 16] |
| 10565 | pmulhrsw m4, m7 |
| 10566 | |
| 10567 | pslldq m3, 2 |
| 10568 | pinsrb m3, [r1 + 6], 1 |
| 10569 | pinsrb m3, [r1 + 8], 0 |
| 10570 | |
| 10571 | pmaddubsw m3, [r5 + 24 * 16] |
| 10572 | pmulhrsw m3, m7 |
| 10573 | |
| 10574 | packuswb m4, m3 |
| 10575 | movu [r0 + 944], m4 |
| 10576 | |
| 10577 | ; mode 17 [row 0, 1] |
| 10578 | |
| 10579 | pmaddubsw m4, m2, [r5 + 6 * 16] |
| 10580 | pmulhrsw m4, m7 |
| 10581 | |
| 10582 | pslldq m2, 2 |
| 10583 | pinsrb m2, [r1 + 0], 1 |
| 10584 | pinsrb m2, [r1 + 1], 0 |
| 10585 | |
| 10586 | pmaddubsw m3, m2, [r5 + 12 * 16] |
| 10587 | pmulhrsw m3, m7 |
| 10588 | |
| 10589 | packuswb m4, m3 |
| 10590 | movu [r0 + 960], m4 |
| 10591 | |
| 10592 | ; mode 17 [row 2, 3] |
| 10593 | |
| 10594 | pslldq m2, 2 |
| 10595 | pinsrb m2, [r1 + 1], 1 |
| 10596 | pinsrb m2, [r1 + 2], 0 |
| 10597 | |
| 10598 | pmaddubsw m4, m2, [r5 + 18 * 16] |
| 10599 | pmulhrsw m4, m7 |
| 10600 | |
| 10601 | pslldq m2, 2 |
| 10602 | pinsrb m2, [r1 + 2], 1 |
| 10603 | pinsrb m2, [r1 + 4], 0 |
| 10604 | |
| 10605 | pmaddubsw m3, m2, [r5 + 24 * 16] |
| 10606 | pmulhrsw m3, m7 |
| 10607 | |
| 10608 | packuswb m4, m3 |
| 10609 | movu [r0 + 976], m4 |
| 10610 | |
| 10611 | ; mode 17 [row 4, 5] |
| 10612 | |
| 10613 | pslldq m2, 2 |
| 10614 | pinsrb m2, [r1 + 4], 1 |
| 10615 | pinsrb m2, [r1 + 5], 0 |
| 10616 | |
| 10617 | pmaddubsw m4, m2, [r5 + 30 * 16] |
| 10618 | pmulhrsw m4, m7 |
| 10619 | |
| 10620 | pmaddubsw m3, m2, [r5 + 4 * 16] |
| 10621 | pmulhrsw m3, m7 |
| 10622 | |
| 10623 | packuswb m4, m3 |
| 10624 | movu [r0 + 992], m4 |
| 10625 | |
| 10626 | ; mode 17 [row 6, 7] |
| 10627 | |
| 10628 | pslldq m2, 2 |
| 10629 | pinsrb m2, [r1 + 5], 1 |
| 10630 | pinsrb m2, [r1 + 6], 0 |
| 10631 | |
| 10632 | pmaddubsw m4, m2, [r5 + 10 * 16] |
| 10633 | pmulhrsw m4, m7 |
| 10634 | |
| 10635 | pslldq m2, 2 |
| 10636 | pinsrb m2, [r1 + 6], 1 |
| 10637 | pinsrb m2, [r1 + 7], 0 |
| 10638 | |
| 10639 | pmaddubsw m3, m2, [r5 + 16 * 16] |
| 10640 | pmulhrsw m3, m7 |
| 10641 | |
| 10642 | packuswb m4, m3 |
| 10643 | movu [r0 + 1008], m4 |
| 10644 | |
| 10645 | ; mode 18 [row 0, 1, 2, 3, 4, 5, 6, 7] |
| 10646 | |
| 10647 | movh m1, [r3] |
| 10648 | movh [r0 + 1024], m1 |
| 10649 | |
| 10650 | pslldq m2, m1, 1 |
| 10651 | pinsrb m2, [r4 + 1], 0 |
| 10652 | movh [r0 + 1032], m2 |
| 10653 | |
| 10654 | pslldq m2, 1 |
| 10655 | pinsrb m2, [r4 + 2], 0 |
| 10656 | movh [r0 + 1040], m2 |
| 10657 | |
| 10658 | pslldq m2, 1 |
| 10659 | pinsrb m2, [r4 + 3], 0 |
| 10660 | movh [r0 + 1048], m2 |
| 10661 | |
| 10662 | pslldq m2, 1 |
| 10663 | pinsrb m2, [r4 + 4], 0 |
| 10664 | movh [r0 + 1056], m2 |
| 10665 | |
| 10666 | pslldq m2, 1 |
| 10667 | pinsrb m2, [r4 + 5], 0 |
| 10668 | movh [r0 + 1064], m2 |
| 10669 | |
| 10670 | pslldq m2, 1 |
| 10671 | pinsrb m2, [r4 + 6], 0 |
| 10672 | movh [r0 + 1072], m2 |
| 10673 | |
| 10674 | pslldq m2, 1 |
| 10675 | pinsrb m2, [r4 + 7], 0 |
| 10676 | movh [r0 + 1080], m2 |
| 10677 | |
| 10678 | ; mode 19 [row 0, 1] |
| 10679 | |
| 10680 | movu m0, [r1] |
| 10681 | palignr m1, m0, 1 |
| 10682 | punpcklbw m0, m1 |
| 10683 | |
| 10684 | pmaddubsw m1, m0, [r5 + 6 * 16] |
| 10685 | pmulhrsw m1, m7 |
| 10686 | |
| 10687 | pslldq m2, m0, 2 |
| 10688 | pinsrb m2, [r2 + 0], 1 |
| 10689 | pinsrb m2, [r2 + 1], 0 |
| 10690 | |
| 10691 | pmaddubsw m3, m2, [r5 + 12 * 16] |
| 10692 | pmulhrsw m3, m7 |
| 10693 | |
| 10694 | packuswb m1, m3 |
| 10695 | movu [r0 + 1088], m1 |
| 10696 | |
| 10697 | ; mode 19 [row 2, 3] |
| 10698 | |
| 10699 | pslldq m2, 2 |
| 10700 | pinsrb m2, [r2 + 1], 1 |
| 10701 | pinsrb m2, [r2 + 2], 0 |
| 10702 | |
| 10703 | pmaddubsw m4, m2, [r5 + 18 * 16] |
| 10704 | pmulhrsw m4, m7 |
| 10705 | |
| 10706 | pslldq m2, 2 |
| 10707 | pinsrb m2, [r2 + 2], 1 |
| 10708 | pinsrb m2, [r2 + 4], 0 |
| 10709 | |
| 10710 | pmaddubsw m5, m2, [r5 + 24 * 16] |
| 10711 | pmulhrsw m5, m7 |
| 10712 | |
| 10713 | packuswb m4, m5 |
| 10714 | movu [r0 + 1104], m4 |
| 10715 | |
| 10716 | ; mode 19 [row 4, 5] |
| 10717 | |
| 10718 | pslldq m2, 2 |
| 10719 | pinsrb m2, [r2 + 4], 1 |
| 10720 | pinsrb m2, [r2 + 5], 0 |
| 10721 | |
| 10722 | pmaddubsw m4, m2, [r5 + 30 * 16] |
| 10723 | pmulhrsw m4, m7 |
| 10724 | |
| 10725 | pmaddubsw m5, m2, [r5 + 4 * 16] |
| 10726 | pmulhrsw m5, m7 |
| 10727 | |
| 10728 | packuswb m4, m5 |
| 10729 | movu [r0 + 1120], m4 |
| 10730 | |
| 10731 | ; mode 19 [row 6, 7] |
| 10732 | |
| 10733 | pslldq m2, 2 |
| 10734 | pinsrb m2, [r2 + 5], 1 |
| 10735 | pinsrb m2, [r2 + 6], 0 |
| 10736 | |
| 10737 | pmaddubsw m4, m2, [r5 + 10 * 16] |
| 10738 | pmulhrsw m4, m7 |
| 10739 | |
| 10740 | pslldq m2, 2 |
| 10741 | pinsrb m2, [r2 + 6], 1 |
| 10742 | pinsrb m2, [r2 + 7], 0 |
| 10743 | |
| 10744 | pmaddubsw m2, [r5 + 16 * 16] |
| 10745 | pmulhrsw m2, m7 |
| 10746 | |
| 10747 | packuswb m4, m2 |
| 10748 | movu [r0 + 1136], m4 |
| 10749 | |
| 10750 | ; mode 20 [row 0, 1] |
| 10751 | |
| 10752 | pmaddubsw m3, m0, [r5 + 11 * 16] |
| 10753 | pmulhrsw m3, m7 |
| 10754 | |
| 10755 | pslldq m1, m0, 2 |
| 10756 | pinsrb m1, [r2 + 0], 1 |
| 10757 | pinsrb m1, [r2 + 2], 0 |
| 10758 | |
| 10759 | pmaddubsw m4, m1, [r5 + 22 * 16] |
| 10760 | pmulhrsw m4, m7 |
| 10761 | |
| 10762 | packuswb m3, m4 |
| 10763 | movu [r0 + 1152], m3 |
| 10764 | |
| 10765 | ; mode 20 [row 2, 3] |
| 10766 | |
| 10767 | pmaddubsw m3, m1, [r5 + 1 * 16] |
| 10768 | pmulhrsw m3, m7 |
| 10769 | |
| 10770 | pslldq m2, m1, 2 |
| 10771 | pinsrb m2, [r2 + 2], 1 |
| 10772 | pinsrb m2, [r2 + 3], 0 |
| 10773 | |
| 10774 | pmaddubsw m4, m2, [r5 + 12 * 16] |
| 10775 | pmulhrsw m4, m7 |
| 10776 | |
| 10777 | packuswb m3, m4 |
| 10778 | movu [r0 + 1168], m3 |
| 10779 | |
| 10780 | ; mode 20 [row 4, 5] |
| 10781 | |
| 10782 | pslldq m2, 2 |
| 10783 | pinsrb m2, [r2 + 3], 1 |
| 10784 | pinsrb m2, [r2 + 5], 0 |
| 10785 | |
| 10786 | pmaddubsw m3, m2, [r5 + 23 * 16] |
| 10787 | pmulhrsw m3, m7 |
| 10788 | |
| 10789 | pmaddubsw m4, m2, [r5 + 2 * 16] |
| 10790 | pmulhrsw m4, m7 |
| 10791 | |
| 10792 | packuswb m3, m4 |
| 10793 | movu [r0 + 1184], m3 |
| 10794 | |
| 10795 | ; mode 20 [row 6, 7] |
| 10796 | |
| 10797 | pslldq m2, 2 |
| 10798 | pinsrb m2, [r2 + 5], 1 |
| 10799 | pinsrb m2, [r2 + 6], 0 |
| 10800 | |
| 10801 | pmaddubsw m3, m2, [r5 + 13 * 16] |
| 10802 | pmulhrsw m3, m7 |
| 10803 | |
| 10804 | pslldq m2, 2 |
| 10805 | pinsrb m2, [r2 + 6], 1 |
| 10806 | pinsrb m2, [r2 + 8], 0 |
| 10807 | |
| 10808 | pmaddubsw m4, m2, [r5 + 24 * 16] |
| 10809 | pmulhrsw m4, m7 |
| 10810 | |
| 10811 | packuswb m3, m4 |
| 10812 | movu [r0 + 1200], m3 |
| 10813 | |
| 10814 | ; mode 21 [row 0, 1] |
| 10815 | |
| 10816 | pmaddubsw m2, m0, [r5 + 15 * 16] |
| 10817 | pmulhrsw m2, m7 |
| 10818 | |
| 10819 | pmaddubsw m3, m1, [r5 + 30 * 16] |
| 10820 | pmulhrsw m3, m7 |
| 10821 | |
| 10822 | packuswb m2, m3 |
| 10823 | movu [r0 + 1216], m2 |
| 10824 | |
| 10825 | ; mode 21 [row 2, 3] |
| 10826 | |
| 10827 | pmaddubsw m2, m1, [r5 + 13 * 16] |
| 10828 | pmulhrsw m2, m7 |
| 10829 | |
| 10830 | pslldq m3, m1, 2 |
| 10831 | pinsrb m3, [r2 + 2], 1 |
| 10832 | pinsrb m3, [r2 + 4], 0 |
| 10833 | |
| 10834 | pmaddubsw m4, m3, [r5 + 28 * 16] |
| 10835 | pmulhrsw m4, m7 |
| 10836 | |
| 10837 | packuswb m2, m4 |
| 10838 | movu [r0 + 1232], m2 |
| 10839 | |
| 10840 | ; mode 21 [row 4, 5] |
| 10841 | |
| 10842 | pmaddubsw m2, m3, [r5 + 11 * 16] |
| 10843 | pmulhrsw m2, m7 |
| 10844 | |
| 10845 | pslldq m3, 2 |
| 10846 | pinsrb m3, [r2 + 4], 1 |
| 10847 | pinsrb m3, [r2 + 6], 0 |
| 10848 | |
| 10849 | pmaddubsw m4, m3, [r5 + 26 * 16] |
| 10850 | pmulhrsw m4, m7 |
| 10851 | |
| 10852 | packuswb m2, m4 |
| 10853 | movu [r0 + 1248], m2 |
| 10854 | |
| 10855 | ; mode 21 [row 6, 7] |
| 10856 | |
| 10857 | pmaddubsw m2, m3, [r5 + 9 * 16] |
| 10858 | pmulhrsw m2, m7 |
| 10859 | |
| 10860 | pslldq m3, 2 |
| 10861 | pinsrb m3, [r2 + 6], 1 |
| 10862 | pinsrb m3, [r2 + 8], 0 |
| 10863 | |
| 10864 | pmaddubsw m4, m3, [r5 + 24 * 16] |
| 10865 | pmulhrsw m4, m7 |
| 10866 | |
| 10867 | packuswb m2, m4 |
| 10868 | movu [r0 + 1264], m2 |
| 10869 | |
| 10870 | ; mode 22 [row 0, 1] |
| 10871 | |
| 10872 | pmaddubsw m2, m0, [r5 + 19 * 16] |
| 10873 | pmulhrsw m2, m7 |
| 10874 | |
| 10875 | pmaddubsw m4, m0, [r5 + 6 * 16] |
| 10876 | pmulhrsw m4, m7 |
| 10877 | |
| 10878 | packuswb m2, m4 |
| 10879 | movu [r0 + 1280], m2 |
| 10880 | |
| 10881 | ; mode 22 [row 2, 3] |
| 10882 | |
| 10883 | pmaddubsw m2, m1, [r5 + 25 * 16] |
| 10884 | pmulhrsw m2, m7 |
| 10885 | |
| 10886 | pmaddubsw m3, m1, [r5 + 12 * 16] |
| 10887 | pmulhrsw m3, m7 |
| 10888 | |
| 10889 | packuswb m2, m3 |
| 10890 | movu [r0 + 1296], m2 |
| 10891 | |
| 10892 | ; mode 22 [row 4, 5] |
| 10893 | |
| 10894 | pslldq m1, 2 |
| 10895 | pinsrb m1, [r2 + 5], 0 |
| 10896 | pinsrb m1, [r2 + 2], 1 |
| 10897 | |
| 10898 | pmaddubsw m2, m1, [r5 + 31 * 16] |
| 10899 | pmulhrsw m2, m7 |
| 10900 | |
| 10901 | pmaddubsw m3, m1, [r5 + 18 * 16] |
| 10902 | pmulhrsw m3, m7 |
| 10903 | |
| 10904 | packuswb m2, m3 |
| 10905 | movu [r0 + 1312], m2 |
| 10906 | |
| 10907 | ; mode 22 [row 6, 7] |
| 10908 | |
| 10909 | pmaddubsw m2, m1, [r5 + 5 * 16] |
| 10910 | pmulhrsw m2, m7 |
| 10911 | |
| 10912 | pslldq m1, 2 |
| 10913 | pinsrb m1, [r2 + 5], 1 |
| 10914 | pinsrb m1, [r2 + 7], 0 |
| 10915 | |
| 10916 | pmaddubsw m1, [r5 + 24 * 16] |
| 10917 | pmulhrsw m1, m7 |
| 10918 | |
| 10919 | packuswb m2, m1 |
| 10920 | movu [r0 + 1328], m2 |
| 10921 | |
| 10922 | ; mode 23 [row 0, 1] |
| 10923 | |
| 10924 | pmaddubsw m2, m0, [r5 + 23 * 16] |
| 10925 | pmulhrsw m2, m7 |
| 10926 | |
| 10927 | pmaddubsw m3, m0, [r5 + 14 * 16] |
| 10928 | pmulhrsw m3, m7 |
| 10929 | |
| 10930 | packuswb m2, m3 |
| 10931 | movu [r0 + 1344], m2 |
| 10932 | |
| 10933 | ; mode 23 [row 2, 3] |
| 10934 | |
| 10935 | pmaddubsw m2, m0, [r5 + 5 * 16] |
| 10936 | pmulhrsw m2, m7 |
| 10937 | |
| 10938 | pslldq m1, m0, 2 |
| 10939 | pinsrb m1, [r2 + 0], 1 |
| 10940 | pinsrb m1, [r2 + 4], 0 |
| 10941 | |
| 10942 | pmaddubsw m3, m1, [r5 + 28 * 16] |
| 10943 | pmulhrsw m3, m7 |
| 10944 | |
| 10945 | packuswb m2, m3 |
| 10946 | movu [r0 + 1360], m2 |
| 10947 | |
| 10948 | ; mode 23 [row 4, 5] |
| 10949 | |
| 10950 | pmaddubsw m2, m1, [r5 + 19 * 16] |
| 10951 | pmulhrsw m2, m7 |
| 10952 | |
| 10953 | pmaddubsw m3, m1, [r5 + 10 * 16] |
| 10954 | pmulhrsw m3, m7 |
| 10955 | |
| 10956 | packuswb m2, m3 |
| 10957 | movu [r0 + 1376], m2 |
| 10958 | |
| 10959 | ; mode 23 [row 6, 7] |
| 10960 | |
| 10961 | pmaddubsw m2, m1, [r5 + 1 * 16] |
| 10962 | pmulhrsw m2, m7 |
| 10963 | |
| 10964 | pslldq m3, m1, 2 |
| 10965 | pinsrb m3, [r2 + 4], 1 |
| 10966 | pinsrb m3, [r2 + 7], 0 |
| 10967 | |
| 10968 | pmaddubsw m3, [r5 + 24 * 16] |
| 10969 | pmulhrsw m3, m7 |
| 10970 | |
| 10971 | packuswb m2, m3 |
| 10972 | movu [r0 + 1392], m2 |
| 10973 | |
| 10974 | ; mode 24 [row 0, 1] |
| 10975 | |
| 10976 | pmaddubsw m2, m0, [r5 + 27 * 16] |
| 10977 | pmulhrsw m2, m7 |
| 10978 | |
| 10979 | pmaddubsw m5, m0, [r5 + 22 * 16] |
| 10980 | pmulhrsw m5, m7 |
| 10981 | |
| 10982 | packuswb m2, m5 |
| 10983 | movu [r0 + 1408], m2 |
| 10984 | |
| 10985 | ; mode 24 [row 2, 3] |
| 10986 | |
| 10987 | pmaddubsw m2, m0, [r5 + 17 * 16] |
| 10988 | pmulhrsw m2, m7 |
| 10989 | |
| 10990 | pmaddubsw m3, m0, [r5 + 12 * 16] |
| 10991 | pmulhrsw m3, m7 |
| 10992 | |
| 10993 | packuswb m2, m3 |
| 10994 | movu [r0 + 1424], m2 |
| 10995 | |
| 10996 | ; mode 24 [row 4, 5] |
| 10997 | |
| 10998 | pmaddubsw m2, m0, [r5 + 7 * 16] |
| 10999 | pmulhrsw m2, m7 |
| 11000 | |
| 11001 | pmaddubsw m3, m0, [r5 + 2 * 16] |
| 11002 | pmulhrsw m3, m7 |
| 11003 | |
| 11004 | packuswb m2, m3 |
| 11005 | movu [r0 + 1440], m2 |
| 11006 | |
| 11007 | ; mode 24 [row 6, 7] |
| 11008 | |
| 11009 | pinsrb m1, [r2 + 6], 0 |
| 11010 | |
| 11011 | pmaddubsw m2, m1, [r5 + 29 * 16] |
| 11012 | pmulhrsw m2, m7 |
| 11013 | |
| 11014 | pmaddubsw m1, [r5 + 24 * 16] |
| 11015 | pmulhrsw m1, m7 |
| 11016 | |
| 11017 | packuswb m2, m1 |
| 11018 | movu [r0 + 1456], m2 |
| 11019 | |
| 11020 | ; mode 25 [row 0, 1] |
| 11021 | |
| 11022 | pmaddubsw m2, m0, [r5 + 30 * 16] |
| 11023 | pmulhrsw m2, m7 |
| 11024 | |
| 11025 | pmaddubsw m1, m0, [r5 + 28 * 16] |
| 11026 | pmulhrsw m1, m7 |
| 11027 | |
| 11028 | packuswb m2, m1 |
| 11029 | movu [r0 + 1472], m2 |
| 11030 | |
| 11031 | ; mode 25 [row 2, 3] |
| 11032 | |
| 11033 | pmaddubsw m2, m0, [r5 + 26 * 16] |
| 11034 | pmulhrsw m2, m7 |
| 11035 | |
| 11036 | pmaddubsw m1, m0, [r5 + 24 * 16] |
| 11037 | pmulhrsw m1, m7 |
| 11038 | |
| 11039 | packuswb m2, m1 |
| 11040 | movu [r0 + 1488], m2 |
| 11041 | |
| 11042 | ; mode 25 [row 4, 5] |
| 11043 | |
| 11044 | pmaddubsw m1, m0, [r5 + 20 * 16] |
| 11045 | pmulhrsw m1, m7 |
| 11046 | |
| 11047 | packuswb m5, m1 |
| 11048 | movu [r0 + 1504], m5 |
| 11049 | |
| 11050 | ; mode 25 [row 6, 7] |
| 11051 | |
| 11052 | pmaddubsw m2, m0, [r5 + 18 * 16] |
| 11053 | pmulhrsw m2, m7 |
| 11054 | |
| 11055 | pmaddubsw m1, m0, [r5 + 16 * 16] |
| 11056 | pmulhrsw m1, m7 |
| 11057 | |
| 11058 | packuswb m2, m1 |
| 11059 | movu [r0 + 1520], m2 |
| 11060 | |
| 11061 | ; mode 26 |
| 11062 | |
| 11063 | movu m0, [r1 + 1] |
| 11064 | |
| 11065 | pshufb m1, m0, [tab_Si] |
| 11066 | movu [r0 + 1536], m1 |
| 11067 | movu [r0 + 1552], m1 |
| 11068 | movu [r0 + 1568], m1 |
| 11069 | movu [r0 + 1584], m1 |
| 11070 | |
| 11071 | pxor m5, m5 |
| 11072 | |
| 11073 | pshufb m1, m1, m5 |
| 11074 | punpcklbw m1, m5 |
| 11075 | |
| 11076 | movu m2, [r2] |
| 11077 | |
| 11078 | pshufb m3, m2, m5 |
| 11079 | punpcklbw m3, m5 |
| 11080 | |
| 11081 | psrldq m4, m2, 1 |
| 11082 | punpcklbw m4, m5 |
| 11083 | |
| 11084 | movu m2, [r2 + 9] |
| 11085 | punpcklbw m2, m5 |
| 11086 | |
| 11087 | psubw m4, m3 |
| 11088 | psubw m2, m3 |
| 11089 | |
| 11090 | psraw m4, 1 |
| 11091 | psraw m2, 1 |
| 11092 | |
| 11093 | paddw m4, m1 |
| 11094 | paddw m2, m1 |
| 11095 | |
| 11096 | packuswb m4, m2 |
| 11097 | |
| 11098 | pextrb [r0 + 1536], m4, 0 |
| 11099 | pextrb [r0 + 1544], m4, 1 |
| 11100 | pextrb [r0 + 1552], m4, 2 |
| 11101 | pextrb [r0 + 1560], m4, 3 |
| 11102 | pextrb [r0 + 1568], m4, 4 |
| 11103 | pextrb [r0 + 1576], m4, 5 |
| 11104 | pextrb [r0 + 1584], m4, 6 |
| 11105 | pextrb [r0 + 1592], m4, 7 |
| 11106 | |
| 11107 | ; mode 27 [row 0, 1] |
| 11108 | |
| 11109 | palignr m6, m0, 1 |
| 11110 | punpcklbw m4, m0, m6 |
| 11111 | |
| 11112 | pmaddubsw m1, m4, [r5 + 2 * 16] |
| 11113 | pmulhrsw m1, m7 |
| 11114 | |
| 11115 | pmaddubsw m2, m4, [r5 + 4 * 16] |
| 11116 | pmulhrsw m2, m7 |
| 11117 | |
| 11118 | packuswb m1, m2 |
| 11119 | movu [r0 + 1600], m1 |
| 11120 | |
| 11121 | ; mode 27 [row 2, 3] |
| 11122 | |
| 11123 | pmaddubsw m1, m4, [r5 + 6 * 16] |
| 11124 | pmulhrsw m1, m7 |
| 11125 | |
| 11126 | pmaddubsw m2, m4, [r5 + 8 * 16] |
| 11127 | pmulhrsw m2, m7 |
| 11128 | |
| 11129 | packuswb m1, m2 |
| 11130 | movu [r0 + 1616], m1 |
| 11131 | |
| 11132 | ; mode 27 [row 4, 5] |
| 11133 | |
| 11134 | pmaddubsw m3, m4, [r5 + 10 * 16] |
| 11135 | pmulhrsw m3, m7 |
| 11136 | |
| 11137 | pmaddubsw m2, m4, [r5 + 12 * 16] |
| 11138 | pmulhrsw m2, m7 |
| 11139 | |
| 11140 | packuswb m1, m3, m2 |
| 11141 | movu [r0 + 1632], m1 |
| 11142 | |
| 11143 | ; mode 27 [row 6, 7] |
| 11144 | |
| 11145 | pmaddubsw m1, m4, [r5 + 14 * 16] |
| 11146 | pmulhrsw m1, m7 |
| 11147 | |
| 11148 | pmaddubsw m2, m4, [r5 + 16 * 16] |
| 11149 | pmulhrsw m2, m7 |
| 11150 | |
| 11151 | packuswb m1, m2 |
| 11152 | movu [r0 + 1648], m1 |
| 11153 | |
| 11154 | ; mode 28 [row 0, 1] |
| 11155 | |
| 11156 | pmaddubsw m1, m4, [r5 + 5 * 16] |
| 11157 | pmulhrsw m1, m7 |
| 11158 | |
| 11159 | packuswb m1, m3 |
| 11160 | movu [r0 + 1664], m1 |
| 11161 | |
| 11162 | ; mode 28 [row 2, 3] |
| 11163 | |
| 11164 | pmaddubsw m1, m4, [r5 + 15 * 16] |
| 11165 | pmulhrsw m1, m7 |
| 11166 | |
| 11167 | pmaddubsw m2, m4, [r5 + 20 * 16] |
| 11168 | pmulhrsw m2, m7 |
| 11169 | |
| 11170 | packuswb m1, m2 |
| 11171 | movu [r0 + 1680], m1 |
| 11172 | |
| 11173 | ; mode 28 [row 4, 5] |
| 11174 | |
| 11175 | pmaddubsw m1, m4, [r5 + 25 * 16] |
| 11176 | pmulhrsw m1, m7 |
| 11177 | |
| 11178 | pmaddubsw m2, m4, [r5 + 30 * 16] |
| 11179 | pmulhrsw m2, m7 |
| 11180 | |
| 11181 | packuswb m1, m2 |
| 11182 | movu [r0 + 1696], m1 |
| 11183 | |
| 11184 | ; mode 28 [row 6, 7] |
| 11185 | |
| 11186 | palignr m1, m0, 2 |
| 11187 | punpcklbw m5, m6, m1 |
| 11188 | |
| 11189 | pmaddubsw m2, m5, [r5 + 3 * 16] |
| 11190 | pmulhrsw m2, m7 |
| 11191 | |
| 11192 | pmaddubsw m3, m5, [r5 + 8 * 16] |
| 11193 | pmulhrsw m3, m7 |
| 11194 | |
| 11195 | packuswb m2, m3 |
| 11196 | movu [r0 + 1712], m2 |
| 11197 | |
| 11198 | ; mode 29 [row 0, 1] |
| 11199 | |
| 11200 | pmaddubsw m2, m4, [r5 + 9 * 16] |
| 11201 | pmulhrsw m2, m7 |
| 11202 | |
| 11203 | pmaddubsw m3, m4, [r5 + 18 * 16] |
| 11204 | pmulhrsw m3, m7 |
| 11205 | |
| 11206 | packuswb m2, m3 |
| 11207 | movu [r0 + 1728], m2 |
| 11208 | |
| 11209 | ; mode 29 [row 2, 3] |
| 11210 | |
| 11211 | pmaddubsw m2, m4, [r5 + 27 * 16] |
| 11212 | pmulhrsw m2, m7 |
| 11213 | |
| 11214 | pmaddubsw m3, m5, [r5 + 4 * 16] |
| 11215 | pmulhrsw m3, m7 |
| 11216 | |
| 11217 | packuswb m2, m3 |
| 11218 | movu [r0 + 1744], m2 |
| 11219 | |
| 11220 | ; mode 29 [row 4, 5] |
| 11221 | |
| 11222 | pmaddubsw m2, m5, [r5 + 13 * 16] |
| 11223 | pmulhrsw m2, m7 |
| 11224 | |
| 11225 | pmaddubsw m3, m5, [r5 + 22 * 16] |
| 11226 | pmulhrsw m3, m7 |
| 11227 | |
| 11228 | packuswb m2, m3 |
| 11229 | movu [r0 + 1760], m2 |
| 11230 | |
| 11231 | ; mode 29 [row 6, 7] |
| 11232 | |
| 11233 | pmaddubsw m2, m5, [r5 + 31 * 16] |
| 11234 | pmulhrsw m2, m7 |
| 11235 | |
| 11236 | palignr m6, m0, 3 |
| 11237 | punpcklbw m1, m6 |
| 11238 | |
| 11239 | pmaddubsw m3, m1, [r5 + 8 * 16] |
| 11240 | pmulhrsw m3, m7 |
| 11241 | |
| 11242 | packuswb m2, m3 |
| 11243 | movu [r0 + 1776], m2 |
| 11244 | |
| 11245 | ; mode 32 [row 2] |
| 11246 | |
| 11247 | movh [r0 + 1936], m2 |
| 11248 | |
| 11249 | ; mode 30 [row 0, 1] |
| 11250 | |
| 11251 | pmaddubsw m2, m4, [r5 + 13 * 16] |
| 11252 | pmulhrsw m2, m7 |
| 11253 | |
| 11254 | pmaddubsw m3, m4, [r5 + 26 * 16] |
| 11255 | pmulhrsw m3, m7 |
| 11256 | |
| 11257 | packuswb m2, m3 |
| 11258 | movu [r0 + 1792], m2 |
| 11259 | |
| 11260 | ; mode 30 [row 2, 3] |
| 11261 | |
| 11262 | pmaddubsw m2, m5, [r5 + 7 * 16] |
| 11263 | pmulhrsw m2, m7 |
| 11264 | |
| 11265 | pmaddubsw m3, m5, [r5 + 20 * 16] |
| 11266 | pmulhrsw m3, m7 |
| 11267 | |
| 11268 | packuswb m2, m3 |
| 11269 | movu [r0 + 1808], m2 |
| 11270 | |
| 11271 | ; mode 33 [row 1] |
| 11272 | |
| 11273 | movhps [r0 + 1992], m2 |
| 11274 | |
| 11275 | ; mode 30 [row 4, 5] |
| 11276 | |
| 11277 | pmaddubsw m2, m1, [r5 + 1 * 16] |
| 11278 | pmulhrsw m2, m7 |
| 11279 | |
| 11280 | pmaddubsw m3, m1, [r5 + 14 * 16] |
| 11281 | pmulhrsw m3, m7 |
| 11282 | |
| 11283 | packuswb m2, m3 |
| 11284 | movu [r0 + 1824], m2 |
| 11285 | |
| 11286 | ; mode 33 [row 2] |
| 11287 | |
| 11288 | movhps [r0 + 2000], m2 |
| 11289 | |
| 11290 | ; mode 30 [row 6, 7] |
| 11291 | |
| 11292 | pmaddubsw m2, m1, [r5 + 27 * 16] |
| 11293 | pmulhrsw m2, m7 |
| 11294 | |
| 11295 | psrldq m0, 4 |
| 11296 | punpcklbw m6, m0 |
| 11297 | |
| 11298 | pmaddubsw m3, m6, [r5 + 8 * 16] |
| 11299 | pmulhrsw m3, m7 |
| 11300 | |
| 11301 | packuswb m2, m3 |
| 11302 | movu [r0 + 1840], m2 |
| 11303 | |
| 11304 | ; mode 33 [row 3] |
| 11305 | |
| 11306 | movhps [r0 + 2008], m2 |
| 11307 | |
| 11308 | ; mode 31 [row 0, 1] |
| 11309 | |
| 11310 | pmaddubsw m2, m4, [r5 + 17 * 16] |
| 11311 | pmulhrsw m2, m7 |
| 11312 | |
| 11313 | pmaddubsw m3, m5, [r5 + 2 * 16] |
| 11314 | pmulhrsw m3, m7 |
| 11315 | |
| 11316 | packuswb m2, m3 |
| 11317 | movu [r0 + 1856], m2 |
| 11318 | |
| 11319 | ; mode 31 [row 2, 3] |
| 11320 | |
| 11321 | pmaddubsw m2, m5, [r5 + 19 * 16] |
| 11322 | pmulhrsw m2, m7 |
| 11323 | |
| 11324 | pmaddubsw m3, m1, [r5 + 4 * 16] |
| 11325 | pmulhrsw m3, m7 |
| 11326 | |
| 11327 | packuswb m2, m3 |
| 11328 | movu [r0 + 1872], m2 |
| 11329 | |
| 11330 | ; mode 31 [row 4, 5] |
| 11331 | |
| 11332 | pmaddubsw m2, m1, [r5 + 21 * 16] |
| 11333 | pmulhrsw m2, m7 |
| 11334 | |
| 11335 | pmaddubsw m3, m6, [r5 + 6 * 16] |
| 11336 | pmulhrsw m3, m7 |
| 11337 | |
| 11338 | packuswb m2, m3 |
| 11339 | movu [r0 + 1888], m2 |
| 11340 | |
| 11341 | ; mode 31 [row 6, 7] |
| 11342 | |
| 11343 | pmaddubsw m2, m6, [r5 + 23 * 16] |
| 11344 | pmulhrsw m2, m7 |
| 11345 | |
| 11346 | movu m3, [r1 + 6] |
| 11347 | punpcklbw m0, m3 |
| 11348 | |
| 11349 | pmaddubsw m3, m0, [r5 + 8 * 16] |
| 11350 | pmulhrsw m3, m7 |
| 11351 | |
| 11352 | packuswb m2, m3 |
| 11353 | movu [r0 + 1904], m2 |
| 11354 | |
| 11355 | ; mode 32 [row 0, 1] |
| 11356 | |
| 11357 | pmaddubsw m2, m4, [r5 + 21 * 16] |
| 11358 | pmulhrsw m2, m7 |
| 11359 | |
| 11360 | pmaddubsw m3, m5, [r5 + 10 * 16] |
| 11361 | pmulhrsw m3, m7 |
| 11362 | |
| 11363 | packuswb m2, m3 |
| 11364 | movu [r0 + 1920], m2 |
| 11365 | |
| 11366 | ; mode 32 [row 3] |
| 11367 | |
| 11368 | pmaddubsw m2, m1, [r5 + 20 * 16] |
| 11369 | pmulhrsw m2, m7 |
| 11370 | |
| 11371 | pxor m3, m3 |
| 11372 | |
| 11373 | packuswb m2, m3 |
| 11374 | movh [r0 + 1944], m2 |
| 11375 | |
| 11376 | ; mode 32 [row 4, 5] |
| 11377 | |
| 11378 | pmaddubsw m2, m6, [r5 + 9 * 16] |
| 11379 | pmulhrsw m2, m7 |
| 11380 | |
| 11381 | pmaddubsw m3, m6, [r5 + 30 * 16] |
| 11382 | pmulhrsw m3, m7 |
| 11383 | |
| 11384 | packuswb m2, m3 |
| 11385 | movu [r0 + 1952], m2 |
| 11386 | |
| 11387 | ; mode 33 [row 4, 5] |
| 11388 | |
| 11389 | pmaddubsw m2, m0, [r5 + 2 * 16] |
| 11390 | pmulhrsw m2, m7 |
| 11391 | |
| 11392 | pmaddubsw m3, m0, [r5 + 28 * 16] |
| 11393 | pmulhrsw m3, m7 |
| 11394 | |
| 11395 | packuswb m2, m3 |
| 11396 | movu [r0 + 2016], m2 |
| 11397 | |
| 11398 | ; mode 32 [row 6] |
| 11399 | |
| 11400 | pmaddubsw m2, m0, [r5 + 19 * 16] |
| 11401 | pmulhrsw m2, m7 |
| 11402 | |
| 11403 | ; mode 32 [row 7] |
| 11404 | |
| 11405 | movu m0, [r1 + 6] |
| 11406 | palignr m3, m0, 1 |
| 11407 | punpcklbw m0, m3 |
| 11408 | |
| 11409 | pmaddubsw m3, m0, [r5 + 8 * 16] |
| 11410 | pmulhrsw m3, m7 |
| 11411 | |
| 11412 | packuswb m2, m3 |
| 11413 | movu [r0 + 1968], m2 |
| 11414 | |
| 11415 | ; mode 33 [row 6, 7] |
| 11416 | |
| 11417 | pmaddubsw m2, m0, [r5 + 22 * 16] |
| 11418 | pmulhrsw m2, m7 |
| 11419 | |
| 11420 | movu m0, [r1 + 7] |
| 11421 | palignr m3, m0, 1 |
| 11422 | punpcklbw m0, m3 |
| 11423 | |
| 11424 | pmaddubsw m3, m0, [r5 + 16 * 16] |
| 11425 | pmulhrsw m3, m7 |
| 11426 | |
| 11427 | packuswb m2, m3 |
| 11428 | movu [r0 + 2032], m2 |
| 11429 | |
| 11430 | ; mode 33 [row 0] |
| 11431 | |
| 11432 | pmaddubsw m2, m4, [r5 + 26 * 16] |
| 11433 | pmulhrsw m2, m7 |
| 11434 | |
| 11435 | pxor m3, m3 |
| 11436 | |
| 11437 | packuswb m2, m3 |
| 11438 | movh [r0 + 1984], m2 |
| 11439 | |
| 11440 | ; mode 34 [row 0, 1, 2, 3, 4, 5, 6, 7] |
| 11441 | |
| 11442 | movu m0, [r3 + 2] |
| 11443 | palignr m1, m0, 1 |
| 11444 | punpcklqdq m2, m0, m1 |
| 11445 | movu [r0 + 2048], m2 |
| 11446 | |
| 11447 | palignr m1, m0, 2 |
| 11448 | palignr m2, m0, 3 |
| 11449 | punpcklqdq m1, m2 |
| 11450 | movu [r0 + 2064], m1 |
| 11451 | |
| 11452 | palignr m1, m0, 4 |
| 11453 | palignr m2, m0, 5 |
| 11454 | punpcklqdq m1, m2 |
| 11455 | movu [r0 + 2080], m1 |
| 11456 | |
| 11457 | palignr m1, m0, 6 |
| 11458 | palignr m2, m0, 7 |
| 11459 | punpcklqdq m1, m2 |
| 11460 | movu [r0 + 2096], m1 |
| 11461 | |
| 11462 | RET |
| 11463 | |
| 11464 | ;----------------------------------------------------------------------------- |
| 11465 | ; void all_angs_pred_16x16(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma) |
| 11466 | ;----------------------------------------------------------------------------- |
| 11467 | INIT_XMM sse4 |
| 11468 | cglobal all_angs_pred_16x16, 6, 6, 8, dest, above0, left0, above1, left1, bLuma |
| 11469 | |
| 11470 | movu m0, [r4 + 2] |
| 11471 | movu [r0 + 0 * 16], m0 |
| 11472 | |
| 11473 | movu m1, m0 |
| 11474 | |
| 11475 | movu m6, [r4 + 18] |
| 11476 | palignr m5, m6, m0, 1 |
| 11477 | movu [r0 + 1 * 16], m5 |
| 11478 | |
| 11479 | movu m4, m5 |
| 11480 | |
| 11481 | palignr m5, m6, m0, 2 |
| 11482 | movu [r0 + 2 * 16], m5 |
| 11483 | palignr m5, m6, m0, 3 |
| 11484 | movu [r0 + 3 * 16], m5 |
| 11485 | palignr m5, m6, m0, 4 |
| 11486 | movu [r0 + 4 * 16], m5 |
| 11487 | palignr m5, m6, m0, 5 |
| 11488 | movu [r0 + 5 * 16], m5 |
| 11489 | palignr m5, m6, m0, 6 |
| 11490 | movu [r0 + 6 * 16], m5 |
| 11491 | palignr m5, m6, m0, 7 |
| 11492 | movu [r0 + 7 * 16], m5 |
| 11493 | |
| 11494 | movu m7, m5 |
| 11495 | |
| 11496 | palignr m5, m6, m0, 8 |
| 11497 | movu [r0 + 8 * 16], m5 |
| 11498 | |
| 11499 | movu m2, m5 |
| 11500 | |
| 11501 | palignr m5, m6, m0, 9 |
| 11502 | movu [r0 + 9 * 16], m5 |
| 11503 | |
| 11504 | palignr m3, m6, m0, 10 |
| 11505 | movu [r0 + 10 * 16], m3 |
| 11506 | palignr m3, m6, m0, 11 |
| 11507 | movu [r0 + 11 * 16], m3 |
| 11508 | palignr m3, m6, m0, 12 |
| 11509 | movu [r0 + 12 * 16], m3 |
| 11510 | |
| 11511 | ; mode 3 [row 15] |
| 11512 | movu [r0 + (3-2)*16*16 + 15 * 16], m3 |
| 11513 | |
| 11514 | palignr m3, m6, m0, 13 |
| 11515 | movu [r0 + 13 * 16], m3 |
| 11516 | palignr m3, m6, m0, 14 |
| 11517 | movu [r0 + 14 * 16], m3 |
| 11518 | palignr m3, m6, m0, 15 |
| 11519 | movu [r0 + 15 * 16], m3 |
| 11520 | |
| 11521 | ; mode 3 [row 0] |
| 11522 | lea r5, [ang_table] |
| 11523 | movu m3, [pw_1024] |
| 11524 | movu m0, [r4 + 1] |
| 11525 | punpcklbw m0, m1 |
| 11526 | |
| 11527 | ; mode 17 [row 8 - second half] |
| 11528 | pmaddubsw m1, m0, [r5 + 22 * 16] |
| 11529 | pmulhrsw m1, m3 |
| 11530 | packuswb m1, m1 |
| 11531 | movh [r0 + 248 * 16 + 8], m1 |
| 11532 | ; mode 17 [row 8 - second half] end |
| 11533 | |
| 11534 | pmaddubsw m1, m0, [r5 + 26 * 16] |
| 11535 | pmulhrsw m1, m3 |
| 11536 | punpcklbw m7, m2 |
| 11537 | pmaddubsw m2, m7, [r5 + 26 * 16] |
| 11538 | pmulhrsw m2, m3 |
| 11539 | packuswb m1, m2 |
| 11540 | movu [r0 + 16 * 16], m1 |
| 11541 | |
| 11542 | ; mode 6 [row 1] (same m1 as mode 3 [row 0] stored just above) |
| 11543 | movu [r0 + 65 * 16], m1 |
| 11544 | |
| 11545 | ; mode 4 [row 0] |
| 11546 | pmaddubsw m1, m0, [r5 + 21 * 16] |
| 11547 | pmulhrsw m1, m3 |
| 11548 | pmaddubsw m2, m7, [r5 + 21 * 16] |
| 11549 | pmulhrsw m2, m3 |
| 11550 | packuswb m1, m2 |
| 11551 | movu [r0 + 32 * 16], m1 |
| 11552 | |
| 11553 | ; mode 5 [row 0] |
| 11554 | pmaddubsw m1, m0, [r5 + 17 * 16] |
| 11555 | pmulhrsw m1, m3 |
| 11556 | pmaddubsw m2, m7, [r5 + 17 * 16] |
| 11557 | pmulhrsw m2, m3 |
| 11558 | packuswb m1, m2 |
| 11559 | movu [r0 + 48 * 16], m1 |
| 11560 | |
| 11561 | ; mode 6 [row 0] |
| 11562 | pmaddubsw m1, m0, [r5 + 13 * 16] |
| 11563 | pmulhrsw m1, m3 |
| 11564 | pmaddubsw m2, m7, [r5 + 13 * 16] |
| 11565 | pmulhrsw m2, m3 |
| 11566 | packuswb m1, m2 |
| 11567 | movu [r0 + 64 * 16], m1 |
| 11568 | |
| 11569 | ; mode 7 [row 0] |
| 11570 | pmaddubsw m1, m0, [r5 + 9 * 16] |
| 11571 | pmulhrsw m1, m3 |
| 11572 | pmaddubsw m2, m7, [r5 + 9 * 16] |
| 11573 | pmulhrsw m2, m3 |
| 11574 | packuswb m1, m2 |
| 11575 | movu [r0 + 80 * 16], m1 |
| 11576 | |
| 11577 | ; mode 7 [row 1] |
| 11578 | pmaddubsw m1, m0, [r5 + 18 * 16] |
| 11579 | pmulhrsw m1, m3 |
| 11580 | pmaddubsw m2, m7, [r5 + 18 * 16] |
| 11581 | pmulhrsw m2, m3 |
| 11582 | packuswb m1, m2 |
| 11583 | movu [r0 + 81 * 16], m1 |
| 11584 | |
| 11585 | ; mode 7 [row 2] |
| 11586 | pmaddubsw m1, m0, [r5 + 27 * 16] |
| 11587 | pmulhrsw m1, m3 |
| 11588 | pmaddubsw m2, m7, [r5 + 27 * 16] |
| 11589 | pmulhrsw m2, m3 |
| 11590 | packuswb m1, m2 |
| 11591 | movu [r0 + 82 * 16], m1 |
| 11592 | |
| 11593 | ; mode 8 [row 0] |
| 11594 | pmaddubsw m1, m0, [r5 + 5 * 16] |
| 11595 | pmulhrsw m1, m3 |
| 11596 | pmaddubsw m2, m7, [r5 + 5 * 16] |
| 11597 | pmulhrsw m2, m3 |
| 11598 | packuswb m1, m2 |
| 11599 | movu [r0 + 96 * 16], m1 |
| 11600 | |
| 11601 | ; mode 8 [row 1] |
| 11602 | pmaddubsw m1, m0, [r5 + 10 * 16] |
| 11603 | pmulhrsw m1, m3 |
| 11604 | pmaddubsw m2, m7, [r5 + 10 * 16] |
| 11605 | pmulhrsw m2, m3 |
| 11606 | packuswb m1, m2 |
| 11607 | movu [r0 + 97 * 16], m1 |
| 11608 | |
| 11609 | ; mode 8 [row 2] |
| 11610 | pmaddubsw m1, m0, [r5 + 15 * 16] |
| 11611 | pmulhrsw m1, m3 |
| 11612 | pmaddubsw m2, m7, [r5 + 15 * 16] |
| 11613 | pmulhrsw m2, m3 |
| 11614 | packuswb m1, m2 |
| 11615 | movu [r0 + 98 * 16], m1 |
| 11616 | |
| 11617 | ; mode 8 [row 3] |
| 11618 | pmaddubsw m1, m0, [r5 + 20 * 16] |
| 11619 | pmulhrsw m1, m3 |
| 11620 | pmaddubsw m2, m7, [r5 + 20 * 16] |
| 11621 | pmulhrsw m2, m3 |
| 11622 | packuswb m1, m2 |
| 11623 | movu [r0 + 99 * 16], m1 |
| 11624 | |
| 11625 | ; mode 8 [row 4] |
| 11626 | pmaddubsw m1, m0, [r5 + 25 * 16] |
| 11627 | pmulhrsw m1, m3 |
| 11628 | pmaddubsw m2, m7, [r5 + 25 * 16] |
| 11629 | pmulhrsw m2, m3 |
| 11630 | packuswb m1, m2 |
| 11631 | movu [r0 + 100 * 16], m1 |
| 11632 | |
| 11633 | ; mode 8 [row 5] |
| 11634 | pmaddubsw m1, m0, [r5 + 30 * 16] |
| 11635 | pmulhrsw m1, m3 |
| 11636 | pmaddubsw m2, m7, [r5 + 30 * 16] |
| 11637 | pmulhrsw m2, m3 |
| 11638 | packuswb m1, m2 |
| 11639 | movu [r0 + 101 * 16], m1 |
| 11640 | |
| 11641 | ; mode 15 [row 13 - second half] |
| 11642 | pmaddubsw m1, m0, [r5 + 18 * 16] |
| 11643 | pmulhrsw m1, m3 |
| 11644 | packuswb m1, m1 |
| 11645 | movh [r0 + 221 * 16 + 8], m1 |
| 11646 | ; mode 15 [row 13 - second half] end |
| 11647 | |
| 11648 | ; mode 15 [row 14 - second half] |
| 11649 | pmaddubsw m1, m0, [r5 + 1 * 16] |
| 11650 | pmulhrsw m1, m3 |
| 11651 | packuswb m1, m1 |
| 11652 | movh [r0 + 222 * 16 + 8], m1 |
| 11653 | ; mode 15 [row 14 - second half] end |
| 11654 | |
| 11655 | ; mode 16 [row 10 - second half] |
| 11656 | pmaddubsw m1, m0, [r5 + 25 * 16] |
| 11657 | pmulhrsw m1, m3 |
| 11658 | packuswb m1, m1 |
| 11659 | movh [r0 + 234 * 16 + 8], m1 |
| 11660 | ; mode 16 [row 10 - second half] end |
| 11661 | |
| 11662 | ; mode 16 [row 11 - second half] |
| 11663 | pmaddubsw m1, m0, [r5 + 4 * 16] |
| 11664 | pmulhrsw m1, m3 |
| 11665 | packuswb m1, m1 |
| 11666 | movh [r0 + 235 * 16 + 8], m1 |
| 11667 | ; mode 16 [row 11 - second half] end |
| 11668 | |
| 11669 | ; mode 3 [row 1] |
| 11670 | movu m6, [r5 + 20 * 16] |
| 11671 | movu m0, [r4 + 2] |
| 11672 | punpcklbw m0, m4 |
| 11673 | |
| 11674 | ; mode 17 [row 7 - second half] |
| 11675 | pmaddubsw m1, m0, [r5 + 16 * 16] |
| 11676 | pmulhrsw m1, m3 |
| 11677 | packuswb m1, m1 |
| 11678 | movh [r0 + 247 * 16 + 8], m1 |
| 11679 | |
| 11680 | ; mode 17 [row 7 - second half] end; mode 3 [row 1] continues below |
| 11681 | pmaddubsw m1, m0, m6 |
| 11682 | pmulhrsw m1, m3 |
| 11683 | movu m2, [r4 + 10] |
| 11684 | punpcklbw m2, m5 |
| 11685 | pmaddubsw m4, m2, m6 |
| 11686 | pmulhrsw m4, m3 |
| 11687 | packuswb m1, m4 |
| 11688 | movu [r0 + 17 * 16], m1 |
| 11689 | |
| 11690 | ; mode 6 [row 3] (same m1 as mode 3 [row 1] stored just above) |
| 11691 | movu [r0 + 67 * 16], m1 |
| 11692 | |
| 11693 | ; mode 4 [row 1] |
| 11694 | pmaddubsw m1, m0, [r5 + 10 * 16] |
| 11695 | pmulhrsw m1, m3 |
| 11696 | pmaddubsw m4, m2, [r5 + 10 * 16] |
| 11697 | pmulhrsw m4, m3 |
| 11698 | packuswb m1, m4 |
| 11699 | movu [r0 + 33 * 16], m1 |
| 11700 | |
| 11701 | ; mode 4 [row 2] |
| 11702 | pmaddubsw m1, m0, [r5 + 31 * 16] |
| 11703 | pmulhrsw m1, m3 |
| 11704 | pmaddubsw m4, m2, [r5 + 31 * 16] |
| 11705 | pmulhrsw m4, m3 |
| 11706 | packuswb m1, m4 |
| 11707 | movu [r0 + 34 * 16], m1 |
| 11708 | |
| 11709 | ; mode 7 [row 6] |
| 11710 | movu [r0 + 86 * 16], m1 |
| 11711 | |
| 11712 | ; mode 5 [row 1] |
| 11713 | pmaddubsw m1, m0, [r5 + 2 * 16] |
| 11714 | pmulhrsw m1, m3 |
| 11715 | pmaddubsw m4, m2, [r5 + 2 * 16] |
| 11716 | pmulhrsw m4, m3 |
| 11717 | packuswb m1, m4 |
| 11718 | movu [r0 + 49 * 16], m1 |
| 11719 | |
| 11720 | ; mode 5 [row 2] |
| 11721 | pmaddubsw m1, m0, [r5 + 19 * 16] |
| 11722 | pmulhrsw m1, m3 |
| 11723 | pmaddubsw m4, m2, [r5 + 19 * 16] |
| 11724 | pmulhrsw m4, m3 |
| 11725 | packuswb m1, m4 |
| 11726 | movu [r0 + 50 * 16], m1 |
| 11727 | |
| 11728 | ; mode 6 [row 2] |
| 11729 | pmaddubsw m1, m0, [r5 + 7 * 16] |
| 11730 | pmulhrsw m1, m3 |
| 11731 | pmaddubsw m4, m2, [r5 + 7 * 16] |
| 11732 | pmulhrsw m4, m3 |
| 11733 | packuswb m1, m4 |
| 11734 | movu [r0 + 66 * 16], m1 |
| 11735 | |
| 11736 | ; mode 7 [row 3] |
| 11737 | pmaddubsw m1, m0, [r5 + 4 * 16] |
| 11738 | pmulhrsw m1, m3 |
| 11739 | pmaddubsw m4, m2, [r5 + 4 * 16] |
| 11740 | pmulhrsw m4, m3 |
| 11741 | packuswb m1, m4 |
| 11742 | movu [r0 + 83 * 16], m1 |
| 11743 | |
| 11744 | ; mode 7 [row 4] |
| 11745 | pmaddubsw m1, m0, [r5 + 13 * 16] |
| 11746 | pmulhrsw m1, m3 |
| 11747 | pmaddubsw m4, m2, [r5 + 13 * 16] |
| 11748 | pmulhrsw m4, m3 |
| 11749 | packuswb m1, m4 |
| 11750 | movu [r0 + 84 * 16], m1 |
| 11751 | |
| 11752 | ; mode 8 [row 8] |
| 11753 | movu [r0 + 104 * 16], m1 |
| 11754 | |
| 11755 | ; mode 7 [row 5] |
| 11756 | pmaddubsw m1, m0, [r5 + 22 * 16] |
| 11757 | pmulhrsw m1, m3 |
| 11758 | pmaddubsw m4, m2, [r5 + 22 * 16] |
| 11759 | pmulhrsw m4, m3 |
| 11760 | packuswb m1, m4 |
| 11761 | movu [r0 + 85 * 16], m1 |
| 11762 | |
| 11763 | ; mode 8 [row 6] |
| 11764 | pmaddubsw m1, m0, [r5 + 3 * 16] |
| 11765 | pmulhrsw m1, m3 |
| 11766 | pmaddubsw m4, m2, [r5 + 3 * 16] |
| 11767 | pmulhrsw m4, m3 |
| 11768 | packuswb m1, m4 |
| 11769 | movu [r0 + 102 * 16], m1 |
| 11770 | |
| 11771 | ; mode 8 [row 7] |
| 11772 | pmaddubsw m1, m0, [r5 + 8 * 16] |
| 11773 | pmulhrsw m1, m3 |
| 11774 | pmaddubsw m4, m2, [r5 + 8 * 16] |
| 11775 | pmulhrsw m4, m3 |
| 11776 | packuswb m1, m4 |
| 11777 | movu [r0 + 103 * 16], m1 |
| 11778 | |
| 11779 | ; mode 8 [row 9] |
| 11780 | pmaddubsw m1, m0, [r5 + 18 * 16] |
| 11781 | pmulhrsw m1, m3 |
| 11782 | pmaddubsw m4, m2, [r5 + 18 * 16] |
| 11783 | pmulhrsw m4, m3 |
| 11784 | packuswb m1, m4 |
| 11785 | movu [r0 + 105 * 16], m1 |
| 11786 | |
| 11787 | ; mode 8 [row 10] |
| 11788 | pmaddubsw m1, m0, [r5 + 23 * 16] |
| 11789 | pmulhrsw m1, m3 |
| 11790 | pmaddubsw m4, m2, [r5 + 23 * 16] |
| 11791 | pmulhrsw m4, m3 |
| 11792 | packuswb m1, m4 |
| 11793 | movu [r0 + 106 * 16], m1 |
| 11794 | |
| 11795 | ; mode 8 [row 11] |
| 11796 | pmaddubsw m1, m0, [r5 + 28 * 16] |
| 11797 | pmulhrsw m1, m3 |
| 11798 | pmaddubsw m4, m2, [r5 + 28 * 16] |
| 11799 | pmulhrsw m4, m3 |
| 11800 | packuswb m1, m4 |
| 11801 | movu [r0 + 107 * 16], m1 |
| 11802 | |
| 11803 | ; mode 3 [row 2] |
| 11804 | movu m0, [r4 + 3] |
| 11805 | movd m1, [r4 + 19] |
| 11806 | palignr m1, m0, 1 |
| 11807 | punpcklbw m0, m1 |
| 11808 | |
| 11809 | ; mode 17 [row 6 - second half] |
| 11810 | pmaddubsw m1, m0, [r5 + 10 * 16] |
| 11811 | pmulhrsw m1, m3 |
| 11812 | packuswb m1, m1 |
| 11813 | movh [r0 + 246 * 16 + 8], m1 |
| 11814 | ; mode 17 [row 6 - second half] end |
| 11815 | |
| 11816 | pmaddubsw m1, m0, [r5 + 14 * 16] |
| 11817 | pmulhrsw m1, m3 |
| 11818 | movu m2, [r4 + 11] |
| 11819 | movd m4, [r4 + 27] |
| 11820 | palignr m4, m2, 1 |
| 11821 | punpcklbw m2, m4 |
| 11822 | pmaddubsw m4, m2, [r5 + 14 * 16] |
| 11823 | pmulhrsw m4, m3 |
| 11824 | packuswb m1, m4 |
| 11825 | movu [r0 + 18 * 16], m1 |
| 11826 | |
| 11827 | ; mode 6 [row 5] |
| 11828 | movu [r0 + 69 * 16], m1 |
| 11829 | |
| 11830 | ; mode 4 [row 3] |
| 11831 | pmaddubsw m1, m0, [r5 + 20 * 16] |
| 11832 | pmulhrsw m1, m3 |
| 11833 | pmaddubsw m4, m2, [r5 + 20 * 16] |
| 11834 | pmulhrsw m4, m3 |
| 11835 | packuswb m1, m4 |
| 11836 | movu [r0 + 35 * 16], m1 |
| 11837 | |
| 11838 | ; mode 5 [row 3] |
| 11839 | pmaddubsw m1, m0, [r5 + 4 * 16] |
| 11840 | pmulhrsw m1, m3 |
| 11841 | pmaddubsw m4, m2, [r5 + 4 * 16] |
| 11842 | pmulhrsw m4, m3 |
| 11843 | packuswb m1, m4 |
| 11844 | movu [r0 + 51 * 16], m1 |
| 11845 | |
| 11846 | ; mode 5 [row 4] |
| 11847 | pmaddubsw m1, m0, [r5 + 21 * 16] |
| 11848 | pmulhrsw m1, m3 |
| 11849 | pmaddubsw m4, m2, [r5 + 21 * 16] |
| 11850 | pmulhrsw m4, m3 |
| 11851 | packuswb m1, m4 |
| 11852 | movu [r0 + 52 * 16], m1 |
| 11853 | |
| 11854 | ; mode 6 [row 4] |
| 11855 | pmaddubsw m1, m0, [r5 + 1 * 16] |
| 11856 | pmulhrsw m1, m3 |
| 11857 | pmaddubsw m4, m2, [r5 + 1 * 16] |
| 11858 | pmulhrsw m4, m3 |
| 11859 | packuswb m1, m4 |
| 11860 | movu [r0 + 68 * 16], m1 |
| 11861 | |
| 11862 | ; mode 6 [row 6] |
| 11863 | pmaddubsw m1, m0, [r5 + 27 * 16] |
| 11864 | pmulhrsw m1, m3 |
| 11865 | pmaddubsw m4, m2, [r5 + 27 * 16] |
| 11866 | pmulhrsw m4, m3 |
| 11867 | packuswb m1, m4 |
| 11868 | movu [r0 + 70 * 16], m1 |
| 11869 | |
| 11870 | ; mode 7 [row 7] |
| 11871 | pmaddubsw m1, m0, [r5 + 8 * 16] |
| 11872 | pmulhrsw m1, m3 |
| 11873 | pmaddubsw m4, m2, [r5 + 8 * 16] |
| 11874 | pmulhrsw m4, m3 |
| 11875 | packuswb m1, m4 |
| 11876 | movu [r0 + 87 * 16], m1 |
| 11877 | |
| 11878 | ; mode 7 [row 8] |
| 11879 | pmaddubsw m1, m0, [r5 + 17 * 16] |
| 11880 | pmulhrsw m1, m3 |
| 11881 | pmaddubsw m4, m2, [r5 + 17 * 16] |
| 11882 | pmulhrsw m4, m3 |
| 11883 | packuswb m1, m4 |
| 11884 | movu [r0 + 88 * 16], m1 |
| 11885 | |
| 11886 | ; mode 7 [row 9] |
| 11887 | pmaddubsw m1, m0, [r5 + 26 * 16] |
| 11888 | pmulhrsw m1, m3 |
| 11889 | pmaddubsw m4, m2, [r5 + 26 * 16] |
| 11890 | pmulhrsw m4, m3 |
| 11891 | packuswb m1, m4 |
| 11892 | movu [r0 + 89 * 16], m1 |
| 11893 | |
| 11894 | ; mode 8 [row 12] |
| 11895 | pmaddubsw m1, m0, [r5 + 1 * 16] |
| 11896 | pmulhrsw m1, m3 |
| 11897 | pmaddubsw m4, m2, [r5 + 1 * 16] |
| 11898 | pmulhrsw m4, m3 |
| 11899 | packuswb m1, m4 |
| 11900 | movu [r0 + 108 * 16], m1 |
| 11901 | |
| 11902 | ; mode 8 [row 13] |
| 11903 | pmaddubsw m1, m0, [r5 + 6 * 16] |
| 11904 | pmulhrsw m1, m3 |
| 11905 | pmaddubsw m4, m2, [r5 + 6 * 16] |
| 11906 | pmulhrsw m4, m3 |
| 11907 | packuswb m1, m4 |
| 11908 | movu [r0 + 109 * 16], m1 |
| 11909 | |
| 11910 | ; mode 8 [row 14] |
| 11911 | pmaddubsw m1, m0, [r5 + 11 * 16] |
| 11912 | pmulhrsw m1, m3 |
| 11913 | pmaddubsw m4, m2, [r5 + 11 * 16] |
| 11914 | pmulhrsw m4, m3 |
| 11915 | packuswb m1, m4 |
| 11916 | movu [r0 + 110 * 16], m1 |
| 11917 | |
| 11918 | ; mode 8 [row 15] |
| 11919 | pmaddubsw m1, m0, [r5 + 16 * 16] |
| 11920 | pmulhrsw m1, m3 |
| 11921 | pmaddubsw m4, m2, [r5 + 16 * 16] |
| 11922 | pmulhrsw m4, m3 |
| 11923 | packuswb m1, m4 |
| 11924 | movu [r0 + 111 * 16], m1 |
| 11925 | |
| 11926 | ; mode 3 [row 3] |
| 11927 | movu m0, [r4 + 4] |
| 11928 | movd m1, [r4 + 20] |
| 11929 | palignr m1, m0, 1 |
| 11930 | punpcklbw m0, m1 |
| 11931 | |
| 11932 | ; mode 17 [row 4 - second half] |
| 11933 | pmaddubsw m1, m0, [r5 + 30 * 16] |
| 11934 | pmulhrsw m1, m3 |
| 11935 | packuswb m1, m1 |
| 11936 | movh [r0 + 244 * 16 + 8], m1 |
| 11937 | ; mode 17 [row 4 - second half] end |
| 11938 | |
| 11939 | ; mode 17 [row 5 - second half] |
| 11940 | pmaddubsw m1, m0, [r5 + 4 * 16] |
| 11941 | pmulhrsw m1, m3 |
| 11942 | packuswb m1, m1 |
| 11943 | movh [r0 + 245 * 16 + 8], m1 |
| 11944 | ; mode 17 [row 5 - second half] end |
| 11945 | |
| 11946 | pmaddubsw m1, m0, [r5 + 8 * 16] |
| 11947 | pmulhrsw m1, m3 |
| 11948 | movu m2, [r4 + 12] |
| 11949 | movd m4, [r4 + 28] |
| 11950 | palignr m4, m2, 1 |
| 11951 | punpcklbw m2, m4 |
| 11952 | pmaddubsw m4, m2, [r5 + 8 * 16] |
| 11953 | pmulhrsw m4, m3 |
| 11954 | packuswb m1, m4 |
| 11955 | movu [r0 + 19 * 16], m1 |
| 11956 | |
| 11957 | ; mode 6 [row 7] |
| 11958 | movu [r0 + 71 * 16], m1 |
| 11959 | |
| 11960 | ; mode 4 [row 4] |
| 11961 | pmaddubsw m1, m0, [r5 + 9 * 16] |
| 11962 | pmulhrsw m1, m3 |
| 11963 | pmaddubsw m4, m2, [r5 + 9 * 16] |
| 11964 | pmulhrsw m4, m3 |
| 11965 | packuswb m1, m4 |
| 11966 | movu [r0 + 36 * 16], m1 |
| 11967 | |
| 11968 | ; mode 4 [row 5] |
| 11969 | pmaddubsw m1, m0, [r5 + 30 * 16] |
| 11970 | pmulhrsw m1, m3 |
| 11971 | pmaddubsw m4, m2, [r5 + 30 * 16] |
| 11972 | pmulhrsw m4, m3 |
| 11973 | packuswb m1, m4 |
| 11974 | movu [r0 + 37 * 16], m1 |
| 11975 | |
| 11976 | ; mode 7 [row 13] (same m1 as mode 4 [row 5] stored just above) |
| 11977 | movu [r0 + 93 * 16], m1 |
| 11978 | |
| 11979 | ; mode 5 [row 5] |
| 11980 | pmaddubsw m1, m0, [r5 + 6 * 16] |
| 11981 | pmulhrsw m1, m3 |
| 11982 | pmaddubsw m4, m2, [r5 + 6 * 16] |
| 11983 | pmulhrsw m4, m3 |
| 11984 | packuswb m1, m4 |
| 11985 | movu [r0 + 53 * 16], m1 |
| 11986 | |
| 11987 | ; mode 5 [row 6] |
| 11988 | pmaddubsw m1, m0, [r5 + 23 * 16] |
| 11989 | pmulhrsw m1, m3 |
| 11990 | pmaddubsw m4, m2, [r5 + 23 * 16] |
| 11991 | pmulhrsw m4, m3 |
| 11992 | packuswb m1, m4 |
| 11993 | movu [r0 + 54 * 16], m1 |
| 11994 | |
| 11995 | ; mode 6 [row 8] |
| 11996 | pmaddubsw m1, m0, [r5 + 21 * 16] |
| 11997 | pmulhrsw m1, m3 |
| 11998 | pmaddubsw m4, m2, [r5 + 21 * 16] |
| 11999 | pmulhrsw m4, m3 |
| 12000 | packuswb m1, m4 |
| 12001 | movu [r0 + 72 * 16], m1 |
| 12002 | |
| 12003 | ; mode 7 [row 12] |
| 12004 | movu [r0 + 92 * 16], m1 |
| 12005 | |
| 12006 | ; mode 7 [row 10] |
| 12007 | pmaddubsw m1, m0, [r5 + 3 * 16] |
| 12008 | pmulhrsw m1, m3 |
| 12009 | pmaddubsw m4, m2, [r5 + 3 * 16] |
| 12010 | pmulhrsw m4, m3 |
| 12011 | packuswb m1, m4 |
| 12012 | movu [r0 + 90 * 16], m1 |
| 12013 | |
| 12014 | ; mode 7 [row 11] |
| 12015 | pmaddubsw m1, m0, [r5 + 12 * 16] |
| 12016 | pmulhrsw m1, m3 |
| 12017 | pmaddubsw m4, m2, [r5 + 12 * 16] |
| 12018 | pmulhrsw m4, m3 |
| 12019 | packuswb m1, m4 |
| 12020 | movu [r0 + 91 * 16], m1 |
| 12021 | |
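| 12022 | ; mode 3 [row 4] -- NOTE(review): the movd below reads [r4 + 20] where the sibling rows use the movu offset + 16 (21 here); that byte only reaches byte 15 after palignr and punpcklbw reads just the low 8 bytes, so results look unaffected -- confirm |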
| 12023 | movu m0, [r4 + 5] |
| 12024 | movd m1, [r4 + 20] |
| 12025 | palignr m1, m0, 1 |
| 12026 | punpcklbw m0, m1 |
| 12027 | |
| 12028 | ; mode 17 [row 3 - second half] |
| 12029 | pmaddubsw m1, m0, [r5 + 24 * 16] |
| 12030 | pmulhrsw m1, m3 |
| 12031 | packuswb m1, m1 |
| 12032 | movh [r0 + 243 * 16 + 8], m1 |
| 12033 | |
| 12034 | ; mode 17 [row 3 - second half] end; mode 3 [row 4] continues below |
| 12035 | pmaddubsw m1, m0, [r5 + 2 * 16] |
| 12036 | pmulhrsw m1, m3 |
| 12037 | movu m2, [r4 + 13] |
| 12038 | movd m4, [r4 + 29] |
| 12039 | palignr m4, m2, 1 |
| 12040 | punpcklbw m2, m4 |
| 12041 | pmaddubsw m4, m2, [r5 + 2 * 16] |
| 12042 | pmulhrsw m4, m3 |
| 12043 | packuswb m1, m4 |
| 12044 | movu [r0 + 20 * 16], m1 |
| 12045 | |
| 12046 | ; mode 6 [row 9] (same m1 as mode 3 [row 4] stored just above) |
| 12047 | movu [r0 + 73 * 16], m1 |
| 12048 | |
| 12049 | ; mode 4 [row 6] |
| 12050 | movu m6, [r5 + 19 * 16] |
| 12051 | pmaddubsw m1, m0, m6 |
| 12052 | pmulhrsw m1, m3 |
| 12053 | pmaddubsw m4, m2, m6 |
| 12054 | pmulhrsw m4, m3 |
| 12055 | packuswb m1, m4 |
| 12056 | movu [r0 + 38 * 16], m1 |
| 12057 | |
| 12058 | ; mode 3 [row 5] |
| 12059 | pmaddubsw m1, m0, [r5 + 28 * 16] |
| 12060 | pmulhrsw m1, m3 |
| 12061 | pmaddubsw m4, m2, [r5 + 28 * 16] |
| 12062 | pmulhrsw m4, m3 |
| 12063 | packuswb m1, m4 |
| 12064 | movu [r0 + 21 * 16], m1 |
| 12065 | |
| 12066 | ; mode 6 [row 11] (same m1 as mode 3 [row 5] stored just above) |
| 12067 | movu [r0 + 75 * 16], m1 |
| 12068 | |
| 12069 | ; mode 5 [row 7] |
| 12070 | pmaddubsw m1, m0, [r5 + 8 * 16] |
| 12071 | pmulhrsw m1, m3 |
| 12072 | pmaddubsw m4, m2, [r5 + 8 * 16] |
| 12073 | pmulhrsw m4, m3 |
| 12074 | packuswb m1, m4 |
| 12075 | movu [r0 + 55 * 16], m1 |
| 12076 | |
| 12077 | ; mode 5 [row 8] |
| 12078 | pmaddubsw m1, m0, [r5 + 25 * 16] |
| 12079 | pmulhrsw m1, m3 |
| 12080 | pmaddubsw m4, m2, [r5 + 25 * 16] |
| 12081 | pmulhrsw m4, m3 |
| 12082 | packuswb m1, m4 |
| 12083 | movu [r0 + 56 * 16], m1 |
| 12084 | |
| 12085 | ; mode 6 [row 10] |
| 12086 | pmaddubsw m1, m0, [r5 + 15 * 16] |
| 12087 | pmulhrsw m1, m3 |
| 12088 | pmaddubsw m4, m2, [r5 + 15 * 16] |
| 12089 | pmulhrsw m4, m3 |
| 12090 | packuswb m1, m4 |
| 12091 | movu [r0 + 74 * 16], m1 |
| 12092 | |
| 12093 | ; mode 7 [row 14] |
| 12094 | pmaddubsw m1, m0, [r5 + 7 * 16] |
| 12095 | pmulhrsw m1, m3 |
| 12096 | pmaddubsw m4, m2, [r5 + 7 * 16] |
| 12097 | pmulhrsw m4, m3 |
| 12098 | packuswb m1, m4 |
| 12099 | movu [r0 + 94 * 16], m1 |
| 12100 | |
| 12101 | ; mode 7 [row 15] |
| 12102 | pmaddubsw m1, m0, [r5 + 16 * 16] |
| 12103 | pmulhrsw m1, m3 |
| 12104 | pmaddubsw m4, m2, [r5 + 16 * 16] |
| 12105 | pmulhrsw m4, m3 |
| 12106 | packuswb m1, m4 |
| 12107 | movu [r0 + 95 * 16], m1 |
| 12108 | |
| 12109 | ; mode 3 [row 6] |
| 12110 | movu m0, [r4 + 6] |
| 12111 | movd m1, [r4 + 22] |
| 12112 | palignr m1, m0, 1 |
| 12113 | punpcklbw m0, m1 |
| 12114 | |
| 12115 | ; mode 17 [row 2 - second half] |
| 12116 | pmaddubsw m1, m0, [r5 + 18 * 16] |
| 12117 | pmulhrsw m1, m3 |
| 12118 | packuswb m1, m1 |
| 12119 | movh [r0 + 242 * 16 + 8], m1 |
| 12120 | ; mode 17 [row 2 - second half] end |
| 12121 | |
| 12122 | pmaddubsw m1, m0, [r5 + 22 * 16] |
| 12123 | pmulhrsw m1, m3 |
| 12124 | movu m2, [r4 + 14] |
| 12125 | movd m4, [r4 + 30] |
| 12126 | palignr m4, m2, 1 |
| 12127 | punpcklbw m2, m4 |
| 12128 | pmaddubsw m4, m2, [r5 + 22 * 16] |
| 12129 | pmulhrsw m4, m3 |
| 12130 | packuswb m1, m4 |
| 12131 | movu [r0 + 22 * 16], m1 |
| 12132 | |
| 12133 | ; mode 6 [row 13] |
| 12134 | movu [r0 + 77 * 16], m1 |
| 12135 | |
| 12136 | ; mode 4 [row 7] |
| 12137 | pmaddubsw m1, m0, [r5 + 8 * 16] |
| 12138 | pmulhrsw m1, m3 |
| 12139 | pmaddubsw m4, m2, [r5 + 8 * 16] |
| 12140 | pmulhrsw m4, m3 |
| 12141 | packuswb m1, m4 |
| 12142 | movu [r0 + 39 * 16], m1 |
| 12143 | |
| 12144 | ; mode 4 [row 8] |
| 12145 | pmaddubsw m1, m0, [r5 + 29 * 16] |
| 12146 | pmulhrsw m1, m3 |
| 12147 | pmaddubsw m4, m2, [r5 + 29 * 16] |
| 12148 | pmulhrsw m4, m3 |
| 12149 | packuswb m1, m4 |
| 12150 | movu [r0 + 40 * 16], m1 |
| 12151 | |
| 12152 | ; mode 5 [row 9] |
| 12153 | pmaddubsw m1, m0, [r5 + 10 * 16] |
| 12154 | pmulhrsw m1, m3 |
| 12155 | pmaddubsw m4, m2, [r5 + 10 * 16] |
| 12156 | pmulhrsw m4, m3 |
| 12157 | packuswb m1, m4 |
| 12158 | movu [r0 + 57 * 16], m1 |
| 12159 | |
| 12160 | ; mode 5 [row 10] |
| 12161 | pmaddubsw m1, m0, [r5 + 27 * 16] |
| 12162 | pmulhrsw m1, m3 |
| 12163 | pmaddubsw m4, m2, [r5 + 27 * 16] |
| 12164 | pmulhrsw m4, m3 |
| 12165 | packuswb m1, m4 |
| 12166 | movu [r0 + 58 * 16], m1 |
| 12167 | |
| 12168 | ; mode 6 [row 12] |
| 12169 | pmaddubsw m1, m0, [r5 + 9 * 16] |
| 12170 | pmulhrsw m1, m3 |
| 12171 | pmaddubsw m4, m2, [r5 + 9 * 16] |
| 12172 | pmulhrsw m4, m3 |
| 12173 | packuswb m1, m4 |
| 12174 | movu [r0 + 76 * 16], m1 |
| 12175 | |
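| 12176 | ; mode 3 [row 7] -- NOTE(review): the two movd loads in this block read [r4 + 27] and [r4 + 25] where the sibling rows use the movu offset + 16 (23 and 31 here); those bytes only reach byte 15 after palignr and punpcklbw reads just the low 8 bytes, so results look unaffected -- confirm |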
| 12177 | movu m0, [r4 + 7] |
| 12178 | movd m1, [r4 + 27] |
| 12179 | palignr m1, m0, 1 |
| 12180 | punpcklbw m0, m1 |
| 12181 | |
| 12182 | ; mode 17 [row 1 - second half] |
| 12183 | pmaddubsw m1, m0, [r5 + 12 * 16] |
| 12184 | pmulhrsw m1, m3 |
| 12185 | packuswb m1, m1 |
| 12186 | movh [r0 + 241 * 16 + 8], m1 |
| 12187 | ; mode 17 [row 1 - second half] end |
| 12188 | |
| 12189 | pmaddubsw m1, m0, [r5 + 16 * 16] |
| 12190 | pmulhrsw m1, m3 |
| 12191 | movu m2, [r4 + 15] |
| 12192 | movd m4, [r4 + 25] |
| 12193 | palignr m4, m2, 1 |
| 12194 | punpcklbw m2, m4 |
| 12195 | pmaddubsw m4, m2, [r5 + 16 * 16] |
| 12196 | pmulhrsw m4, m3 |
| 12197 | packuswb m1, m4 |
| 12198 | movu [r0 + 23 * 16], m1 |
| 12199 | |
| 12200 | ; mode 6 [row 15] |
| 12201 | movu [r0 + 79 * 16], m1 |
| 12202 | |
| 12203 | ; mode 4 [row 9] |
| 12204 | pmaddubsw m1, m0, [r5 + 18 * 16] |
| 12205 | pmulhrsw m1, m3 |
| 12206 | pmaddubsw m4, m2, [r5 + 18 * 16] |
| 12207 | pmulhrsw m4, m3 |
| 12208 | packuswb m1, m4 |
| 12209 | movu [r0 + 41 * 16], m1 |
| 12210 | |
| 12211 | ; mode 5 [row 11] |
| 12212 | pmaddubsw m1, m0, [r5 + 12 * 16] |
| 12213 | pmulhrsw m1, m3 |
| 12214 | pmaddubsw m4, m2, [r5 + 12 * 16] |
| 12215 | pmulhrsw m4, m3 |
| 12216 | packuswb m1, m4 |
| 12217 | movu [r0 + 59 * 16], m1 |
| 12218 | |
| 12219 | ; mode 5 [row 12] |
| 12220 | pmaddubsw m1, m0, [r5 + 29 * 16] |
| 12221 | pmulhrsw m1, m3 |
| 12222 | pmaddubsw m4, m2, [r5 + 29 * 16] |
| 12223 | pmulhrsw m4, m3 |
| 12224 | packuswb m1, m4 |
| 12225 | movu [r0 + 60 * 16], m1 |
| 12226 | |
| 12227 | ; mode 6 [row 14] |
| 12228 | pmaddubsw m1, m0, [r5 + 3 * 16] |
| 12229 | pmulhrsw m1, m3 |
| 12230 | pmaddubsw m4, m2, [r5 + 3 * 16] |
| 12231 | pmulhrsw m4, m3 |
| 12232 | packuswb m1, m4 |
| 12233 | movu [r0 + 78 * 16], m1 |
| 12234 | |
| 12235 | ; mode 3 [row 8] |
| 12236 | movu m0, [r4 + 8] |
| 12237 | movd m1, [r4 + 24] |
| 12238 | palignr m1, m0, 1 |
| 12239 | punpcklbw m0, m1 |
| 12240 | pmaddubsw m1, m0, [r5 + 10 * 16] |
| 12241 | pmulhrsw m1, m3 |
| 12242 | movu m2, [r4 + 16] |
| 12243 | psrldq m4, m2, 1 |
| 12244 | pinsrb m4, [r4 + 32], 15 |
| 12245 | punpcklbw m2, m4 |
| 12246 | pmaddubsw m4, m2, [r5 + 10 * 16] |
| 12247 | pmulhrsw m4, m3 |
| 12248 | packuswb m1, m4 |
| 12249 | movu [r0 + 24 * 16], m1 |
| 12250 | |
| 12251 | ; mode 4 [row 10] |
| 12252 | pmaddubsw m1, m0, [r5 + 7 * 16] |
| 12253 | pmulhrsw m1, m3 |
| 12254 | pmaddubsw m4, m2, [r5 + 7 * 16] |
| 12255 | pmulhrsw m4, m3 |
| 12256 | packuswb m1, m4 |
| 12257 | movu [r0 + 42 * 16], m1 |
| 12258 | |
| 12259 | ; mode 4 [row 11] |
| 12260 | pmaddubsw m1, m0, [r5 + 28 * 16] |
| 12261 | pmulhrsw m1, m3 |
| 12262 | pmaddubsw m4, m2, [r5 + 28 * 16] |
| 12263 | pmulhrsw m4, m3 |
| 12264 | packuswb m1, m4 |
| 12265 | movu [r0 + 43 * 16], m1 |
| 12266 | |
| 12267 | ; mode 5 [row 13] |
| 12268 | pmaddubsw m1, m0, [r5 + 14 * 16] |
| 12269 | pmulhrsw m1, m3 |
| 12270 | pmaddubsw m4, m2, [r5 + 14 * 16] |
| 12271 | pmulhrsw m4, m3 |
| 12272 | packuswb m1, m4 |
| 12273 | movu [r0 + 61 * 16], m1 |
| 12274 | |
| 12275 | ; mode 5 [row 14] |
| 12276 | pmaddubsw m1, m0, [r5 + 31 * 16] |
| 12277 | pmulhrsw m1, m3 |
| 12278 | pmaddubsw m4, m2, [r5 + 31 * 16] |
| 12279 | pmulhrsw m4, m3 |
| 12280 | packuswb m1, m4 |
| 12281 | movu [r0 + 62 * 16], m1 |
| 12282 | |
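| 12283 | ; mode 3 [row 9] -- NOTE(review): the first movd below reads [r4 + 16] where the sibling rows use the movu offset + 16 (25 here); that byte only reaches byte 15 after palignr and punpcklbw reads just the low 8 bytes, so results look unaffected -- confirm |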
| 12284 | movu m0, [r4 + 9] |
| 12285 | movd m1, [r4 + 16] |
| 12286 | palignr m1, m0, 1 |
| 12287 | punpcklbw m0, m1 |
| 12288 | pmaddubsw m1, m0, [r5 + 4 * 16] |
| 12289 | pmulhrsw m1, m3 |
| 12290 | movu m2, [r4 + 17] |
| 12291 | movd m4, [r4 + 33] |
| 12292 | palignr m4, m2, 1 |
| 12293 | punpcklbw m2, m4 |
| 12294 | pmaddubsw m4, m2, [r5 + 4 * 16] |
| 12295 | pmulhrsw m4, m3 |
| 12296 | packuswb m1, m4 |
| 12297 | movu [r0 + 25 * 16], m1 |
| 12298 | |
| 12299 | ; mode 4 [row 12] |
| 12300 | pmaddubsw m1, m0, [r5 + 17 * 16] |
| 12301 | pmulhrsw m1, m3 |
| 12302 | pmaddubsw m4, m2, [r5 + 17 * 16] |
| 12303 | pmulhrsw m4, m3 |
| 12304 | packuswb m1, m4 |
| 12305 | movu [r0 + 44 * 16], m1 |
| 12306 | |
| 12307 | ; mode 3 [row 10] |
| 12308 | pmaddubsw m1, m0, [r5 + 30 * 16] |
| 12309 | pmulhrsw m1, m3 |
| 12310 | pmaddubsw m4, m2, [r5 + 30 * 16] |
| 12311 | pmulhrsw m4, m3 |
| 12312 | packuswb m1, m4 |
| 12313 | movu [r0 + 26 * 16], m1 |
| 12314 | |
| 12315 | ; mode 5 [row 15] |
| 12316 | pmaddubsw m1, m0, [r5 + 16 * 16] |
| 12317 | pmulhrsw m1, m3 |
| 12318 | pmaddubsw m4, m2, [r5 + 16 * 16] |
| 12319 | pmulhrsw m4, m3 |
| 12320 | packuswb m1, m4 |
| 12321 | movu [r0 + 63 * 16], m1 |
| 12322 | |
| 12323 | ; mode 3 [row 11] |
| 12324 | movu m0, [r4 + 10] |
| 12325 | movd m1, [r4 + 26] |
| 12326 | palignr m1, m0, 1 |
| 12327 | punpcklbw m0, m1 |
| 12328 | pmaddubsw m1, m0, [r5 + 24 * 16] |
| 12329 | pmulhrsw m1, m3 |
| 12330 | movu m2, [r4 + 18] |
| 12331 | movd m4, [r4 + 34] |
| 12332 | palignr m4, m2, 1 |
| 12333 | punpcklbw m2, m4 |
| 12334 | pmaddubsw m4, m2, [r5 + 24 * 16] |
| 12335 | pmulhrsw m4, m3 |
| 12336 | packuswb m1, m4 |
| 12337 | movu [r0 + 27 * 16], m1 |
| 12338 | |
| 12339 | ; mode 4 [row 13] |
| 12340 | pmaddubsw m1, m0, [r5 + 6 * 16] |
| 12341 | pmulhrsw m1, m3 |
| 12342 | pmaddubsw m4, m2, [r5 + 6 * 16] |
| 12343 | pmulhrsw m4, m3 |
| 12344 | packuswb m1, m4 |
| 12345 | movu [r0 + 45 * 16], m1 |
| 12346 | |
| 12347 | ; mode 4 [row 14] |
| 12348 | pmaddubsw m1, m0, [r5 + 27 * 16] |
| 12349 | pmulhrsw m1, m3 |
| 12350 | pmaddubsw m4, m2, [r5 + 27 * 16] |
| 12351 | pmulhrsw m4, m3 |
| 12352 | packuswb m1, m4 |
| 12353 | movu [r0 + 46 * 16], m1 |
| 12354 | |
| 12355 | ; mode 3 [row 12] |
| 12356 | movu m0, [r4 + 11] |
| 12357 | movd m1, [r4 + 27] |
| 12358 | palignr m1, m0, 1 |
| 12359 | punpcklbw m0, m1 |
| 12360 | pmaddubsw m1, m0, [r5 + 18 * 16] |
| 12361 | pmulhrsw m1, m3 |
| 12362 | movu m2, [r4 + 19] |
| 12363 | movd m4, [r4 + 35] |
| 12364 | palignr m4, m2, 1 |
| 12365 | punpcklbw m2, m4 |
| 12366 | pmaddubsw m4, m2, [r5 + 18 * 16] |
| 12367 | pmulhrsw m4, m3 |
| 12368 | packuswb m1, m4 |
| 12369 | movu [r0 + 28 * 16], m1 |
| 12370 | |
| 12371 | ; mode 4 [row 15] |
| 12372 | pmaddubsw m1, m0, [r5 + 16 * 16] |
| 12373 | pmulhrsw m1, m3 |
| 12374 | pmaddubsw m4, m2, [r5 + 16 * 16] |
| 12375 | pmulhrsw m4, m3 |
| 12376 | packuswb m1, m4 |
| 12377 | movu [r0 + 47 * 16], m1 |
| 12378 | |
| 12379 | ; mode 3 [row 13] |
| 12380 | movu m0, [r4 + 12] |
| 12381 | movd m1, [r4 + 28] |
| 12382 | palignr m1, m0, 1 |
| 12383 | punpcklbw m0, m1 |
| 12384 | pmaddubsw m1, m0, [r5 + 12 * 16] |
| 12385 | pmulhrsw m1, m3 |
| 12386 | movu m2, [r4 + 20] |
| 12387 | movd m4, [r4 + 36] |
| 12388 | palignr m4, m2, 1 |
| 12389 | punpcklbw m2, m4 |
| 12390 | pmaddubsw m4, m2, [r5 + 12 * 16] |
| 12391 | pmulhrsw m4, m3 |
| 12392 | packuswb m1, m4 |
| 12393 | movu [r0 + 29 * 16], m1 |
| 12394 | |
| 12395 | ; mode 3 [row 14] |
| 12396 | movu m0, [r4 + 13] |
| 12397 | movd m1, [r4 + 29] |
| 12398 | palignr m1, m0, 1 |
| 12399 | punpcklbw m0, m1 |
| 12400 | pmaddubsw m1, m0, [r5 + 6 * 16] |
| 12401 | pmulhrsw m1, m3 |
| 12402 | movu m2, [r4 + 21] |
| 12403 | movd m4, [r4 + 37] |
| 12404 | palignr m4, m2, 1 |
| 12405 | punpcklbw m2, m4 |
| 12406 | pmaddubsw m4, m2, [r5 + 6 * 16] |
| 12407 | pmulhrsw m4, m3 |
| 12408 | packuswb m1, m4 |
| 12409 | movu [r0 + 30 * 16], m1 |
| 12410 | |
| 12411 | ; mode 9 |
| 12412 | movu m0, [r2 + 1] |
| 12413 | movd m1, [r2 + 17] |
| 12414 | palignr m1, m0, 1 |
| 12415 | |
| 12416 | ; mode 9 [row 15] |
| 12417 | movu [r0 + 127 * 16], m1 |
| 12418 | |
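| 12419 | ; mode 9 [row 0] -- NOTE(review): the movd below loads m4 from [r4 + 25], but the following palignr merges m2 (not m4) and the neighbouring loads use r2; only the low 8 bytes of the palignr result feed punpcklbw, so results look unaffected -- confirm whether movd m2, [r2 + 25] was intended |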
| 12420 | punpcklbw m0, m1 |
| 12421 | pmaddubsw m1, m0, [r5 + 2 * 16] |
| 12422 | pmulhrsw m1, m3 |
| 12423 | movu m7, [r2 + 9] |
| 12424 | movd m4, [r4 + 25] |
| 12425 | palignr m2, m7, 1 |
| 12426 | punpcklbw m7, m2 |
| 12427 | pmaddubsw m2, m7, [r5 + 2 * 16] |
| 12428 | pmulhrsw m2, m3 |
| 12429 | packuswb m1, m2 |
| 12430 | movu [r0 + 112 * 16], m1 |
| 12431 | |
| 12432 | ; mode 9 [row 1] |
| 12433 | pmaddubsw m1, m0, [r5 + 4 * 16] |
| 12434 | pmulhrsw m1, m3 |
| 12435 | pmaddubsw m2, m7, [r5 + 4 * 16] |
| 12436 | pmulhrsw m2, m3 |
| 12437 | packuswb m1, m2 |
| 12438 | movu [r0 + 113 * 16], m1 |
| 12439 | |
| 12440 | ; mode 9 [row 2] |
| 12441 | pmaddubsw m1, m0, [r5 + 6 * 16] |
| 12442 | pmulhrsw m1, m3 |
| 12443 | pmaddubsw m2, m7, [r5 + 6 * 16] |
| 12444 | pmulhrsw m2, m3 |
| 12445 | packuswb m1, m2 |
| 12446 | movu [r0 + 114 * 16], m1 |
| 12447 | |
| 12448 | ; mode 9 [row 3] |
| 12449 | pmaddubsw m1, m0, [r5 + 8 * 16] |
| 12450 | pmulhrsw m1, m3 |
| 12451 | pmaddubsw m2, m7, [r5 + 8 * 16] |
| 12452 | pmulhrsw m2, m3 |
| 12453 | packuswb m1, m2 |
| 12454 | movu [r0 + 115 * 16], m1 |
| 12455 | |
| 12456 | ; mode 9 [row 4] |
| 12457 | pmaddubsw m1, m0, [r5 + 10 * 16] |
| 12458 | pmulhrsw m1, m3 |
| 12459 | pmaddubsw m2, m7, [r5 + 10 * 16] |
| 12460 | pmulhrsw m2, m3 |
| 12461 | packuswb m1, m2 |
| 12462 | movu [r0 + 116 * 16], m1 |
| 12463 | |
| 12464 | ; mode 9 [row 5] |
| 12465 | pmaddubsw m1, m0, [r5 + 12 * 16] |
| 12466 | pmulhrsw m1, m3 |
| 12467 | pmaddubsw m2, m7, [r5 + 12 * 16] |
| 12468 | pmulhrsw m2, m3 |
| 12469 | packuswb m1, m2 |
| 12470 | movu [r0 + 117 * 16], m1 |
| 12471 | |
| 12472 | ; mode 9 [row 6] |
| 12473 | pmaddubsw m1, m0, [r5 + 14 * 16] |
| 12474 | pmulhrsw m1, m3 |
| 12475 | pmaddubsw m2, m7, [r5 + 14 * 16] |
| 12476 | pmulhrsw m2, m3 |
| 12477 | packuswb m1, m2 |
| 12478 | movu [r0 + 118 * 16], m1 |
| 12479 | |
| 12480 | ; mode 9 [row 7] |
| 12481 | pmaddubsw m1, m0, [r5 + 16 * 16] |
| 12482 | pmulhrsw m1, m3 |
| 12483 | pmaddubsw m2, m7, [r5 + 16 * 16] |
| 12484 | pmulhrsw m2, m3 |
| 12485 | packuswb m1, m2 |
| 12486 | movu [r0 + 119 * 16], m1 |
| 12487 | |
| 12488 | ; mode 9 [row 8] |
| 12489 | pmaddubsw m1, m0, [r5 + 18 * 16] |
| 12490 | pmulhrsw m1, m3 |
| 12491 | pmaddubsw m2, m7, [r5 + 18 * 16] |
| 12492 | pmulhrsw m2, m3 |
| 12493 | packuswb m1, m2 |
| 12494 | movu [r0 + 120 * 16], m1 |
| 12495 | |
| 12496 | ; mode 9 [row 9] |
| 12497 | pmaddubsw m1, m0, [r5 + 20 * 16] |
| 12498 | pmulhrsw m1, m3 |
| 12499 | pmaddubsw m2, m7, [r5 + 20 * 16] |
| 12500 | pmulhrsw m2, m3 |
| 12501 | packuswb m1, m2 |
| 12502 | movu [r0 + 121 * 16], m1 |
| 12503 | |
| 12504 | ; mode 9 [row 10] |
| 12505 | pmaddubsw m1, m0, [r5 + 22 * 16] |
| 12506 | pmulhrsw m1, m3 |
| 12507 | pmaddubsw m2, m7, [r5 + 22 * 16] |
| 12508 | pmulhrsw m2, m3 |
| 12509 | packuswb m1, m2 |
| 12510 | movu [r0 + 122 * 16], m1 |
| 12511 | |
| 12512 | ; mode 9 [row 11] |
| 12513 | pmaddubsw m1, m0, [r5 + 24 * 16] |
| 12514 | pmulhrsw m1, m3 |
| 12515 | pmaddubsw m2, m7, [r5 + 24 * 16] |
| 12516 | pmulhrsw m2, m3 |
| 12517 | packuswb m1, m2 |
| 12518 | movu [r0 + 123 * 16], m1 |
| 12519 | |
| 12520 | ; mode 9 [row 12] |
| 12521 | pmaddubsw m1, m0, [r5 + 26 * 16] |
| 12522 | pmulhrsw m1, m3 |
| 12523 | pmaddubsw m2, m7, [r5 + 26 * 16] |
| 12524 | pmulhrsw m2, m3 |
| 12525 | packuswb m1, m2 |
| 12526 | movu [r0 + 124 * 16], m1 |
| 12527 | |
| 12528 | ; mode 9 [row 13] |
| 12529 | pmaddubsw m1, m0, [r5 + 28 * 16] |
| 12530 | pmulhrsw m1, m3 |
| 12531 | pmaddubsw m2, m7, [r5 + 28 * 16] |
| 12532 | pmulhrsw m2, m3 |
| 12533 | packuswb m1, m2 |
| 12534 | movu [r0 + 125 * 16], m1 |
| 12535 | |
| 12536 | ; mode 9 [row 14] |
| 12537 | pmaddubsw m1, m0, [r5 + 30 * 16] |
| 12538 | pmulhrsw m1, m3 |
| 12539 | pmaddubsw m2, m7, [r5 + 30 * 16] |
| 12540 | pmulhrsw m2, m3 |
| 12541 | packuswb m1, m2 |
| 12542 | movu [r0 + 126 * 16], m1 |
| 12543 | |
| 12544 | ; mode 10: copy the 16 bytes at [r2 + 1] into each of the 16-byte rows 128..143, then replace byte 0 of row 128 + i (i = 0..15) with the unsigned-saturated value of [r2 + 1] + (([r1 + 1 + i] - [r1]) >> 1) |
| 12545 | movu m1, [r2 + 1] |
| 12546 | movu [r0 + 128 * 16], m1 |
| 12547 | movu [r0 + 129 * 16], m1 |
| 12548 | movu [r0 + 130 * 16], m1 |
| 12549 | movu [r0 + 131 * 16], m1 |
| 12550 | movu [r0 + 132 * 16], m1 |
| 12551 | movu [r0 + 133 * 16], m1 |
| 12552 | movu [r0 + 134 * 16], m1 |
| 12553 | movu [r0 + 135 * 16], m1 |
| 12554 | movu [r0 + 136 * 16], m1 |
| 12555 | movu [r0 + 137 * 16], m1 |
| 12556 | movu [r0 + 138 * 16], m1 |
| 12557 | movu [r0 + 139 * 16], m1 |
| 12558 | movu [r0 + 140 * 16], m1 |
| 12559 | movu [r0 + 141 * 16], m1 |
| 12560 | movu [r0 + 142 * 16], m1 |
| 12561 | movu [r0 + 143 * 16], m1 |
| 12562 | |
| 12563 | pxor m0, m0 |
| 12564 | pshufb m1, m1, m0 |
| 12565 | punpcklbw m1, m0 |
| 12566 | movu m2, [r1] |
| 12567 | pshufb m2, m2, m0 |
| 12568 | punpcklbw m2, m0 |
| 12569 | movu m4, [r1 + 1] |
| 12570 | punpcklbw m5, m4, m0 |
| 12571 | punpckhbw m4, m0 |
| 12572 | psubw m5, m2 |
| 12573 | psubw m4, m2 |
| 12574 | psraw m5, 1 |
| 12575 | psraw m4, 1 |
| 12576 | paddw m5, m1 |
| 12577 | paddw m4, m1 |
| 12578 | packuswb m5, m4 |
| 12579 | |
| 12580 | pextrb [r0 + 128 * 16], m5, 0 |
| 12581 | pextrb [r0 + 129 * 16], m5, 1 |
| 12582 | pextrb [r0 + 130 * 16], m5, 2 |
| 12583 | pextrb [r0 + 131 * 16], m5, 3 |
| 12584 | pextrb [r0 + 132 * 16], m5, 4 |
| 12585 | pextrb [r0 + 133 * 16], m5, 5 |
| 12586 | pextrb [r0 + 134 * 16], m5, 6 |
| 12587 | pextrb [r0 + 135 * 16], m5, 7 |
| 12588 | pextrb [r0 + 136 * 16], m5, 8 |
| 12589 | pextrb [r0 + 137 * 16], m5, 9 |
| 12590 | pextrb [r0 + 138 * 16], m5, 10 |
| 12591 | pextrb [r0 + 139 * 16], m5, 11 |
| 12592 | pextrb [r0 + 140 * 16], m5, 12 |
| 12593 | pextrb [r0 + 141 * 16], m5, 13 |
| 12594 | pextrb [r0 + 142 * 16], m5, 14 |
| 12595 | pextrb [r0 + 143 * 16], m5, 15 |
| 12596 | |
| 12597 | ; mode 11 |
| 12598 | movu m0, [r2] |
| 12599 | |
| 12600 | ; mode 11 [row 15] |
| 12601 | movu [r0 + 159 * 16], m0 |
| 12602 | |
| 12603 | ; mode 11 [row 0] |
| 12604 | movu m1, [r2 + 1] |
| 12605 | punpcklbw m0, m1 |
| 12606 | pmaddubsw m1, m0, [r5 + 30 * 16] |
| 12607 | pmulhrsw m1, m3 |
| 12608 | movu m7, [r2 + 8] |
| 12609 | movu m2, [r2 + 9] |
| 12610 | punpcklbw m7, m2 |
| 12611 | pmaddubsw m2, m7, [r5 + 30 * 16] |
| 12612 | pmulhrsw m2, m3 |
| 12613 | packuswb m1, m2 |
| 12614 | movu [r0 + 144 * 16], m1 |
| 12615 | |
| 12616 | ; mode 11 [row 1] |
| 12617 | pmaddubsw m1, m0, [r5 + 28 * 16] |
| 12618 | pmulhrsw m1, m3 |
| 12619 | pmaddubsw m2, m7, [r5 + 28 * 16] |
| 12620 | pmulhrsw m2, m3 |
| 12621 | packuswb m1, m2 |
| 12622 | movu [r0 + 145 * 16], m1 |
| 12623 | |
| 12624 | ; mode 11 [row 2] |
| 12625 | pmaddubsw m1, m0, [r5 + 26 * 16] |
| 12626 | pmulhrsw m1, m3 |
| 12627 | pmaddubsw m2, m7, [r5 + 26 * 16] |
| 12628 | pmulhrsw m2, m3 |
| 12629 | packuswb m1, m2 |
| 12630 | movu [r0 + 146 * 16], m1 |
| 12631 | |
| 12632 | ; mode 11 [row 3] |
| 12633 | pmaddubsw m1, m0, [r5 + 24 * 16] |
| 12634 | pmulhrsw m1, m3 |
| 12635 | pmaddubsw m2, m7, [r5 + 24 * 16] |
| 12636 | pmulhrsw m2, m3 |
| 12637 | packuswb m1, m2 |
| 12638 | movu [r0 + 147 * 16], m1 |
| 12639 | |
| 12640 | ; mode 11 [row 4] |
| 12641 | pmaddubsw m1, m0, [r5 + 22 * 16] |
| 12642 | pmulhrsw m1, m3 |
| 12643 | pmaddubsw m2, m7, [r5 + 22 * 16] |
| 12644 | pmulhrsw m2, m3 |
| 12645 | packuswb m1, m2 |
| 12646 | movu [r0 + 148 * 16], m1 |
| 12647 | |
| 12648 | ; mode 11 [row 5] |
| 12649 | pmaddubsw m1, m0, [r5 + 20 * 16] |
| 12650 | pmulhrsw m1, m3 |
| 12651 | pmaddubsw m2, m7, [r5 + 20 * 16] |
| 12652 | pmulhrsw m2, m3 |
| 12653 | packuswb m1, m2 |
| 12654 | movu [r0 + 149 * 16], m1 |
| 12655 | |
| 12656 | ; mode 11 [row 6] |
| 12657 | pmaddubsw m1, m0, [r5 + 18 * 16] |
| 12658 | pmulhrsw m1, m3 |
| 12659 | pmaddubsw m2, m7, [r5 + 18 * 16] |
| 12660 | pmulhrsw m2, m3 |
| 12661 | packuswb m1, m2 |
| 12662 | movu [r0 + 150 * 16], m1 |
| 12663 | |
| 12664 | ; mode 11 [row 7] |
| 12665 | pmaddubsw m1, m0, [r5 + 16 * 16] |
| 12666 | pmulhrsw m1, m3 |
| 12667 | pmaddubsw m2, m7, [r5 + 16 * 16] |
| 12668 | pmulhrsw m2, m3 |
| 12669 | packuswb m1, m2 |
| 12670 | movu [r0 + 151 * 16], m1 |
| 12671 | |
| 12672 | ; mode 11 [row 8] |
| 12673 | pmaddubsw m1, m0, [r5 + 14 * 16] |
| 12674 | pmulhrsw m1, m3 |
| 12675 | pmaddubsw m2, m7, [r5 + 14 * 16] |
| 12676 | pmulhrsw m2, m3 |
| 12677 | packuswb m1, m2 |
| 12678 | movu [r0 + 152 * 16], m1 |
| 12679 | |
| 12680 | ; mode 11 [row 9] |
| 12681 | pmaddubsw m1, m0, [r5 + 12 * 16] |
| 12682 | pmulhrsw m1, m3 |
| 12683 | pmaddubsw m2, m7, [r5 + 12 * 16] |
| 12684 | pmulhrsw m2, m3 |
| 12685 | packuswb m1, m2 |
| 12686 | movu [r0 + 153 * 16], m1 |
| 12687 | |
| 12688 | ; mode 11 [row 10] |
| 12689 | pmaddubsw m1, m0, [r5 + 10 * 16] |
| 12690 | pmulhrsw m1, m3 |
| 12691 | pmaddubsw m2, m7, [r5 + 10 * 16] |
| 12692 | pmulhrsw m2, m3 |
| 12693 | packuswb m1, m2 |
| 12694 | movu [r0 + 154 * 16], m1 |
| 12695 | |
| 12696 | ; mode 11 [row 11] |
| 12697 | pmaddubsw m1, m0, [r5 + 8 * 16] |
| 12698 | pmulhrsw m1, m3 |
| 12699 | pmaddubsw m2, m7, [r5 + 8 * 16] |
| 12700 | pmulhrsw m2, m3 |
| 12701 | packuswb m1, m2 |
| 12702 | movu [r0 + 155 * 16], m1 |
| 12703 | |
| 12704 | ; mode 11 [row 12] |
| 12705 | pmaddubsw m1, m0, [r5 + 6 * 16] |
| 12706 | pmulhrsw m1, m3 |
| 12707 | pmaddubsw m2, m7, [r5 + 6 * 16] |
| 12708 | pmulhrsw m2, m3 |
| 12709 | packuswb m1, m2 |
| 12710 | movu [r0 + 156 * 16], m1 |
| 12711 | |
| 12712 | ; mode 11 [row 13] |
| 12713 | pmaddubsw m1, m0, [r5 + 4 * 16] |
| 12714 | pmulhrsw m1, m3 |
| 12715 | pmaddubsw m2, m7, [r5 + 4 * 16] |
| 12716 | pmulhrsw m2, m3 |
| 12717 | packuswb m1, m2 |
| 12718 | movu [r0 + 157 * 16], m1 |
| 12719 | |
| 12720 | ; mode 11 [row 14] |
| 12721 | pmaddubsw m1, m0, [r5 + 2 * 16] |
| 12722 | pmulhrsw m1, m3 |
| 12723 | pmaddubsw m2, m7, [r5 + 2 * 16] |
| 12724 | pmulhrsw m2, m3 |
| 12725 | packuswb m1, m2 |
| 12726 | movu [r0 + 158 * 16], m1 |
| 12727 | |
| 12728 | ; mode 12 [row 0] |
| 12729 | movu m0, [r4] |
| 12730 | movu m1, [r4 + 1] |
| 12731 | punpcklbw m0, m1 |
| 12732 | pmaddubsw m1, m0, [r5 + 27 * 16] |
| 12733 | pmulhrsw m1, m3 |
| 12734 | movu m7, [r4 + 8] |
| 12735 | movd m2, [r4 + 24] |
| 12736 | palignr m2, m7, 1 |
| 12737 | punpcklbw m7, m2 |
| 12738 | pmaddubsw m2, m7, [r5 + 27 * 16] |
| 12739 | pmulhrsw m2, m3 |
| 12740 | packuswb m1, m2 |
| 12741 | movu [r0 + 160 * 16], m1 |
| 12742 | |
| 12743 | ; mode 12 [row 1] |
| 12744 | pmaddubsw m1, m0, [r5 + 22 * 16] |
| 12745 | pmulhrsw m1, m3 |
| 12746 | pmaddubsw m2, m7, [r5 + 22 * 16] |
| 12747 | pmulhrsw m2, m3 |
| 12748 | packuswb m1, m2 |
| 12749 | movu [r0 + 161 * 16], m1 |
| 12750 | |
| 12751 | ; mode 12 [row 2] |
| 12752 | pmaddubsw m1, m0, [r5 + 17 * 16] |
| 12753 | pmulhrsw m1, m3 |
| 12754 | pmaddubsw m2, m7, [r5 + 17 * 16] |
| 12755 | pmulhrsw m2, m3 |
| 12756 | packuswb m1, m2 |
| 12757 | movu [r0 + 162 * 16], m1 |
| 12758 | |
| 12759 | ; mode 12 [row 3] |
| 12760 | pmaddubsw m1, m0, [r5 + 12 * 16] |
| 12761 | pmulhrsw m1, m3 |
| 12762 | pmaddubsw m2, m7, [r5 + 12 * 16] |
| 12763 | pmulhrsw m2, m3 |
| 12764 | packuswb m1, m2 |
| 12765 | movu [r0 + 163 * 16], m1 |
| 12766 | |
| 12767 | ; mode 12 [row 4] |
| 12768 | pmaddubsw m1, m0, [r5 + 7 * 16] |
| 12769 | pmulhrsw m1, m3 |
| 12770 | pmaddubsw m2, m7, [r5 + 7 * 16] |
| 12771 | pmulhrsw m2, m3 |
| 12772 | packuswb m1, m2 |
| 12773 | movu [r0 + 164 * 16], m1 |
| 12774 | |
| 12775 | ; mode 12 [row 5] |
| 12776 | pmaddubsw m1, m0, [r5 + 2 * 16] |
| 12777 | pmulhrsw m1, m3 |
| 12778 | pmaddubsw m2, m7, [r5 + 2 * 16] |
| 12779 | pmulhrsw m2, m3 |
| 12780 | packuswb m1, m2 |
| 12781 | movu [r0 + 165 * 16], m1 |
| 12782 | |
| 12783 | ; mode 13 [row 0] |
| 12784 | pmaddubsw m1, m0, [r5 + 23 * 16] |
| 12785 | pmulhrsw m1, m3 |
| 12786 | pmaddubsw m2, m7, [r5 + 23 * 16] |
| 12787 | pmulhrsw m2, m3 |
| 12788 | packuswb m1, m2 |
| 12789 | movu [r0 + 176 * 16], m1 |
| 12790 | |
| 12791 | ; mode 13 [row 1] |
| 12792 | pmaddubsw m1, m0, [r5 + 14 * 16] |
| 12793 | pmulhrsw m1, m3 |
| 12794 | pmaddubsw m2, m7, [r5 + 14 * 16] |
| 12795 | pmulhrsw m2, m3 |
| 12796 | packuswb m1, m2 |
| 12797 | movu [r0 + 177 * 16], m1 |
| 12798 | |
| 12799 | ; mode 13 [row 2] |
| 12800 | pmaddubsw m1, m0, [r5 + 5 * 16] |
| 12801 | pmulhrsw m1, m3 |
| 12802 | pmaddubsw m2, m7, [r5 + 5 * 16] |
| 12803 | pmulhrsw m2, m3 |
| 12804 | packuswb m1, m2 |
| 12805 | movu [r0 + 178 * 16], m1 |
| 12806 | |
| 12807 | ; mode 14 [row 0] |
| 12808 | pmaddubsw m1, m0, [r5 + 19 * 16] |
| 12809 | pmulhrsw m1, m3 |
| 12810 | pmaddubsw m2, m7, [r5 + 19 * 16] |
| 12811 | pmulhrsw m2, m3 |
| 12812 | packuswb m1, m2 |
| 12813 | movu [r0 + 192 * 16], m1 |
| 12814 | |
| 12815 | ; mode 14 [row 1] |
| 12816 | pmaddubsw m1, m0, [r5 + 6 * 16] |
| 12817 | pmulhrsw m1, m3 |
| 12818 | pmaddubsw m2, m7, [r5 + 6 * 16] |
| 12819 | pmulhrsw m2, m3 |
| 12820 | packuswb m1, m2 |
| 12821 | movu [r0 + 193 * 16], m1 |
| 12822 | |
| 12823 | ; mode 17 [row 0] |
| 12824 | movu [r0 + 240 * 16], m1 |
| 12825 | |
| 12826 | ; mode 15 [row 0] |
| 12827 | pmaddubsw m1, m0, [r5 + 15 * 16] |
| 12828 | pmulhrsw m1, m3 |
| 12829 | pmaddubsw m2, m7, [r5 + 15 * 16] |
| 12830 | pmulhrsw m2, m3 |
| 12831 | packuswb m1, m2 |
| 12832 | movu [r0 + 208 * 16], m1 |
| 12833 | |
| 12834 | ; mode 15 [row 15 - second half] |
| 12835 | pmaddubsw m1, m0, [r5 + 16 * 16] |
| 12836 | pmulhrsw m1, m3 |
| 12837 | packuswb m1, m1 |
| 12838 | movh [r0 + 223 * 16 + 8], m1 |
| 12839 | ; mode 15 [row 15 - second half] end |
| 12840 | |
| 12841 | ; mode 16 [row 0] |
| 12842 | pmaddubsw m1, m0, [r5 + 11 * 16] |
| 12843 | pmulhrsw m1, m3 |
| 12844 | pmaddubsw m2, m7, [r5 + 11 * 16] |
| 12845 | pmulhrsw m2, m3 |
| 12846 | packuswb m1, m2 |
| 12847 | movu [r0 + 224 * 16], m1 |
| 12848 | |
| 12849 | ; mode 17 [row 9 - second half] |
| 12850 | pmaddubsw m1, m0, [r5 + 28 * 16] |
| 12851 | pmulhrsw m1, m3 |
| 12852 | packuswb m1, m1 |
| 12853 | movh [r0 + 249 * 16 + 8], m1 |
| 12854 | ; mode 17 [row 9 - second half] end |
| 12855 | |
| 12856 | ; mode 17 [row 10 - second half] |
| 12857 | pmaddubsw m1, m0, [r5 + 2 * 16] |
| 12858 | pmulhrsw m1, m3 |
| 12859 | packuswb m1, m1 |
| 12860 | movh [r0 + 250 * 16 + 8], m1 |
| 12861 | ; mode 17 [row 10 - second half] end |
| 12862 | |
| 12863 | ; mode 17 [row 1 - first half] |
| 12864 | pslldq m6, m0, 2 |
| 12865 | pinsrb m6, [r3 + 0], 1 |
| 12866 | pinsrb m6, [r3 + 1], 0 |
| 12867 | pmaddubsw m1, m6, [r5 + 12 * 16] |
| 12868 | pmulhrsw m1, m3 |
| 12869 | packuswb m1, m1 |
| 12870 | movh [r0 + 241 * 16], m1 |
| 12871 | |
| 12872 | ; mode 17 [row 11 - second half] |
| 12873 | pmaddubsw m1, m6, [r5 + 8 * 16] |
| 12874 | pmulhrsw m1, m3 |
| 12875 | packuswb m1, m1 |
| 12876 | movh [r0 + 251 * 16 + 8], m1 |
| 12877 | ; mode 17 [row 11 - second half] end |
| 12878 | |
| 12879 | ; mode 17 [row 2 - first half] |
| 12880 | pslldq m6, 2 |
| 12881 | pinsrb m6, [r3 + 1], 1 |
| 12882 | pinsrb m6, [r3 + 2], 0 |
| 12883 | pmaddubsw m1, m6, [r5 + 18 * 16] |
| 12884 | pmulhrsw m1, m3 |
| 12885 | packuswb m1, m1 |
| 12886 | movh [r0 + 242 * 16], m1 |
| 12887 | |
| 12888 | ; mode 17 [row 12 - second half] |
| 12889 | pmaddubsw m1, m6, [r5 + 14 * 16] |
| 12890 | pmulhrsw m1, m3 |
| 12891 | packuswb m1, m1 |
| 12892 | movh [r0 + 252 * 16 + 8], m1 |
| 12893 | ; mode 17 [row 12 - second half] end |
| 12894 | |
| 12895 | ; mode 17 [row 3 - first half] |
| 12896 | pslldq m6, 2 |
| 12897 | pinsrb m6, [r3 + 2], 1 |
| 12898 | pinsrb m6, [r3 + 4], 0 |
| 12899 | pmaddubsw m1, m6, [r5 + 24 * 16] |
| 12900 | pmulhrsw m1, m3 |
| 12901 | packuswb m1, m1 |
| 12902 | movh [r0 + 243 * 16], m1 |
| 12903 | |
| 12904 | ; mode 17 [row 13 - second half] |
| 12905 | pmaddubsw m1, m6, [r5 + 20 * 16] |
| 12906 | pmulhrsw m1, m3 |
| 12907 | packuswb m1, m1 |
| 12908 | movh [r0 + 253 * 16 + 8], m1 |
| 12909 | |
| 12910 | ; mode 17 [row 4 - first half] |
| 12911 | pslldq m6, 2 |
| 12912 | pinsrb m6, [r3 + 4], 1 |
| 12913 | pinsrb m6, [r3 + 5], 0 |
| 12914 | pmaddubsw m1, m6, [r5 + 30 * 16] |
| 12915 | pmulhrsw m1, m3 |
| 12916 | packuswb m1, m1 |
| 12917 | movh [r0 + 244 * 16], m1 |
| 12918 | |
| 12919 | ; mode 17 [row 5 - first half] |
| 12920 | pmaddubsw m1, m6, [r5 + 4 * 16] |
| 12921 | pmulhrsw m1, m3 |
| 12922 | packuswb m1, m1 |
| 12923 | movh [r0 + 245 * 16], m1 |
| 12924 | |
| 12925 | ; mode 17 [row 14 - second half] |
| 12926 | pmaddubsw m1, m6, [r5 + 26 * 16] |
| 12927 | pmulhrsw m1, m3 |
| 12928 | packuswb m1, m1 |
| 12929 | movh [r0 + 254 * 16 + 8], m1 |
| 12930 | ; mode 17 [row 14 - second half] end |
| 12931 | |
| 12932 | ; mode 17 [row 6 - first half] |
| 12933 | pslldq m6, 2 |
| 12934 | pinsrb m6, [r3 + 5], 1 |
| 12935 | pinsrb m6, [r3 + 6], 0 |
| 12936 | pmaddubsw m1, m6, [r5 + 10 * 16] |
| 12937 | pmulhrsw m1, m3 |
| 12938 | packuswb m1, m1 |
| 12939 | movh [r0 + 246 * 16], m1 |
| 12940 | |
| 12941 | ; mode 17 [row 7 - first half] |
| 12942 | pslldq m6, 2 |
| 12943 | pinsrb m6, [r3 + 6], 1 |
| 12944 | pinsrb m6, [r3 + 7], 0 |
| 12945 | pmaddubsw m1, m6, [r5 + 16 * 16] |
| 12946 | pmulhrsw m1, m3 |
| 12947 | packuswb m1, m1 |
| 12948 | movh [r0 + 247 * 16], m1 |
| 12949 | |
| 12950 | ; mode 17 [row 8 - first half] |
| 12951 | pslldq m6, 2 |
| 12952 | pinsrb m6, [r3 + 7], 1 |
| 12953 | pinsrb m6, [r3 + 9], 0 |
| 12954 | pmaddubsw m1, m6, [r5 + 22 * 16] |
| 12955 | pmulhrsw m1, m3 |
| 12956 | packuswb m1, m1 |
| 12957 | movh [r0 + 248 * 16], m1 |
| 12958 | |
| 12959 | ; mode 17 [row 9 - first half] |
| 12960 | pslldq m6, 2 |
| 12961 | pinsrb m6, [r3 + 9], 1 |
| 12962 | pinsrb m6, [r3 + 10], 0 |
| 12963 | pmaddubsw m1, m6, [r5 + 28 * 16] |
| 12964 | pmulhrsw m1, m3 |
| 12965 | packuswb m1, m1 |
| 12966 | movh [r0 + 249 * 16], m1 |
| 12967 | |
| 12968 | ; mode 17 [row 10 - first half] |
| 12969 | pmaddubsw m1, m6, [r5 + 2 * 16] |
| 12970 | pmulhrsw m1, m3 |
| 12971 | packuswb m1, m1 |
| 12972 | movh [r0 + 250 * 16], m1 |
| 12973 | |
| 12974 | ; mode 17 [row 11 - first half] |
| 12975 | pslldq m6, 2 |
| 12976 | pinsrb m6, [r3 + 10], 1 |
| 12977 | pinsrb m6, [r3 + 11], 0 |
| 12978 | pmaddubsw m1, m6, [r5 + 8 * 16] |
| 12979 | pmulhrsw m1, m3 |
| 12980 | packuswb m1, m1 |
| 12981 | movh [r0 + 251 * 16], m1 |
| 12982 | |
| 12983 | ; mode 17 [row 12 - first half] |
| 12984 | pslldq m6, 2 |
| 12985 | pinsrb m6, [r3 + 11], 1 |
| 12986 | pinsrb m6, [r3 + 12], 0 |
| 12987 | pmaddubsw m1, m6, [r5 + 14 * 16] |
| 12988 | pmulhrsw m1, m3 |
| 12989 | packuswb m1, m1 |
| 12990 | movh [r0 + 252 * 16], m1 |
| 12991 | |
| 12992 | ; mode 17 [row 13 - first half] |
| 12993 | pslldq m6, 2 |
| 12994 | pinsrb m6, [r3 + 12], 1 |
| 12995 | pinsrb m6, [r3 + 14], 0 |
| 12996 | pmaddubsw m1, m6, [r5 + 20 * 16] |
| 12997 | pmulhrsw m1, m3 |
| 12998 | packuswb m1, m1 |
| 12999 | movh [r0 + 253 * 16], m1 |
| 13000 | |
| 13001 | ; mode 17 [row 14 - first half] |
| 13002 | pslldq m6, 2 |
| 13003 | pinsrb m6, [r3 + 14], 1 |
| 13004 | pinsrb m6, [r3 + 15], 0 |
| 13005 | pmaddubsw m1, m6, [r5 + 26 * 16] |
| 13006 | pmulhrsw m1, m3 |
| 13007 | packuswb m1, m1 |
| 13008 | movh [r0 + 254 * 16], m1 |
| 13009 | |
| 13010 | ; mode 16 [row 12 - second half] |
| 13011 | pmaddubsw m1, m0, [r5 + 15 * 16] |
| 13012 | pmulhrsw m1, m3 |
| 13013 | packuswb m1, m1 |
| 13014 | movh [r0 + 236 * 16 + 8], m1 |
| 13015 | ; mode 16 [row 12 - second half] end |
| 13016 | |
| 13017 | ; mode 12 [row 6] |
| 13018 | pslldq m2, m0, 2 |
| 13019 | pinsrb m2, [r3 + 0], 1 |
| 13020 | pinsrb m2, [r3 + 6], 0 |
| 13021 | pmaddubsw m1, m2, [r5 + 29 * 16] |
| 13022 | pmulhrsw m1, m3 |
| 13023 | movu m0, [r4 + 7] |
| 13024 | psrldq m4, m0, 1 |
| 13025 | punpcklbw m0, m4 |
| 13026 | pmaddubsw m4, m0, [r5 + 29 * 16] |
| 13027 | pmulhrsw m4, m3 |
| 13028 | packuswb m1, m4 |
| 13029 | movu [r0 + 166 * 16], m1 |
| 13030 | |
| 13031 | ; mode 12 [row 7] |
| 13032 | pmaddubsw m1, m2, [r5 + 24 * 16] |
| 13033 | pmulhrsw m1, m3 |
| 13034 | pmaddubsw m4, m0, [r5 + 24 * 16] |
| 13035 | pmulhrsw m4, m3 |
| 13036 | packuswb m1, m4 |
| 13037 | movu [r0 + 167 * 16], m1 |
| 13038 | |
| 13039 | ; mode 12 [row 8] |
| 13040 | pmaddubsw m1, m2, [r5 + 19 * 16] |
| 13041 | pmulhrsw m1, m3 |
| 13042 | pmaddubsw m4, m0, [r5 + 19 * 16] |
| 13043 | pmulhrsw m4, m3 |
| 13044 | packuswb m1, m4 |
| 13045 | movu [r0 + 168 * 16], m1 |
| 13046 | |
| 13047 | ; mode 12 [row 9] |
| 13048 | pmaddubsw m1, m2, [r5 + 14 * 16] |
| 13049 | pmulhrsw m1, m3 |
| 13050 | pmaddubsw m4, m0, [r5 + 14 * 16] |
| 13051 | pmulhrsw m4, m3 |
| 13052 | packuswb m1, m4 |
| 13053 | movu [r0 + 169 * 16], m1 |
| 13054 | |
| 13055 | ; mode 12 [row 10] |
| 13056 | pmaddubsw m1, m2, [r5 + 9 * 16] |
| 13057 | pmulhrsw m1, m3 |
| 13058 | pmaddubsw m4, m0, [r5 + 9 * 16] |
| 13059 | pmulhrsw m4, m3 |
| 13060 | packuswb m1, m4 |
| 13061 | movu [r0 + 170 * 16], m1 |
| 13062 | |
| 13063 | ; mode 12 [row 11] |
| 13064 | pmaddubsw m1, m2, [r5 + 4 * 16] |
| 13065 | pmulhrsw m1, m3 |
| 13066 | pmaddubsw m4, m0, [r5 + 4 * 16] |
| 13067 | pmulhrsw m4, m3 |
| 13068 | packuswb m1, m4 |
| 13069 | movu [r0 + 171 * 16], m1 |
| 13070 | |
| 13071 | ; mode 13 [row 3] |
| 13072 | pinsrb m7, m2, [r3 + 4], 0 |
| 13073 | pmaddubsw m1, m7, [r5 + 28 * 16] |
| 13074 | pmulhrsw m1, m3 |
| 13075 | pmaddubsw m4, m0, [r5 + 28 * 16] |
| 13076 | pmulhrsw m4, m3 |
| 13077 | packuswb m1, m4 |
| 13078 | movu [r0 + 179 * 16], m1 |
| 13079 | |
| 13080 | ; mode 13 [row 4] |
| 13081 | pmaddubsw m1, m7, [r5 + 19 * 16] |
| 13082 | pmulhrsw m1, m3 |
| 13083 | pmaddubsw m4, m0, [r5 + 19 * 16] |
| 13084 | pmulhrsw m4, m3 |
| 13085 | packuswb m1, m4 |
| 13086 | movu [r0 + 180 * 16], m1 |
| 13087 | |
| 13088 | ; mode 13 [row 5] |
| 13089 | pmaddubsw m1, m7, [r5 + 10 * 16] |
| 13090 | pmulhrsw m1, m3 |
| 13091 | pmaddubsw m4, m0, [r5 + 10 * 16] |
| 13092 | pmulhrsw m4, m3 |
| 13093 | packuswb m1, m4 |
| 13094 | movu [r0 + 181 * 16], m1 |
| 13095 | |
| 13096 | ; mode 13 [row 6] |
| 13097 | pmaddubsw m1, m7, [r5 + 1 * 16] |
| 13098 | pmulhrsw m1, m3 |
| 13099 | pmaddubsw m4, m0, [r5 + 1 * 16] |
| 13100 | pmulhrsw m4, m3 |
| 13101 | packuswb m1, m4 |
| 13102 | movu [r0 + 182 * 16], m1 |
| 13103 | |
| 13104 | ; mode 14 [row 2] |
| 13105 | pinsrb m5, m7, [r3 + 2], 0 |
| 13106 | pmaddubsw m1, m5, [r5 + 25 * 16] |
| 13107 | pmulhrsw m1, m3 |
| 13108 | pmaddubsw m4, m0, [r5 + 25 * 16] |
| 13109 | pmulhrsw m4, m3 |
| 13110 | packuswb m1, m4 |
| 13111 | movu [r0 + 194 * 16], m1 |
| 13112 | |
| 13113 | ; mode 14 [row 3] |
| 13114 | pmaddubsw m1, m5, [r5 + 12 * 16] |
| 13115 | pmulhrsw m1, m3 |
| 13116 | pmaddubsw m4, m0, [r5 + 12 * 16] |
| 13117 | pmulhrsw m4, m3 |
| 13118 | packuswb m1, m4 |
| 13119 | movu [r0 + 195 * 16], m1 |
| 13120 | |
| 13121 | ; mode 15 [row 1] |
| 13122 | pmaddubsw m1, m5, [r5 + 30 * 16] |
| 13123 | pmulhrsw m1, m3 |
| 13124 | pmaddubsw m4, m0, [r5 + 30 * 16] |
| 13125 | pmulhrsw m4, m3 |
| 13126 | packuswb m1, m4 |
| 13127 | movu [r0 + 209 * 16], m1 |
| 13128 | |
| 13129 | ; mode 15 [row 2] |
| 13130 | pmaddubsw m1, m5, [r5 + 13 * 16] |
| 13131 | pmulhrsw m1, m3 |
| 13132 | pmaddubsw m4, m0, [r5 + 13 * 16] |
| 13133 | pmulhrsw m4, m3 |
| 13134 | packuswb m1, m4 |
| 13135 | movu [r0 + 210 * 16], m1 |
| 13136 | |
| 13137 | ; mode 16 [row 1] |
| 13138 | pmaddubsw m1, m5, [r5 + 22 * 16] |
| 13139 | pmulhrsw m1, m3 |
| 13140 | pmaddubsw m4, m0, [r5 + 22 * 16] |
| 13141 | pmulhrsw m4, m3 |
| 13142 | packuswb m1, m4 |
| 13143 | movu [r0 + 225 * 16], m1 |
| 13144 | |
| 13145 | ; mode 16 [row 2] |
| 13146 | pmaddubsw m1, m5, [r5 + 1 * 16] |
| 13147 | pmulhrsw m1, m3 |
| 13148 | pmaddubsw m4, m0, [r5 + 1 * 16] |
| 13149 | pmulhrsw m4, m3 |
| 13150 | packuswb m1, m4 |
| 13151 | movu [r0 + 226 * 16], m1 |
| 13152 | |
| 13153 | ; mode 16 [row 13 - second half] |
| 13154 | pmaddubsw m1, m5, [r5 + 26 * 16] |
| 13155 | pmulhrsw m1, m3 |
| 13156 | packuswb m1, m1 |
| 13157 | movh [r0 + 237 * 16 + 8], m1 |
| 13158 | ; mode 16 [row 13 - second half] end |
| 13159 | |
| 13160 | ; mode 16 [row 14 - second half] |
| 13161 | pmaddubsw m1, m5, [r5 + 5 * 16] |
| 13162 | pmulhrsw m1, m3 |
| 13163 | packuswb m1, m1 |
| 13164 | movh [r0 + 238 * 16 + 8], m1 |
| 13165 | ; mode 16 [row 14 - second half] end |
| 13166 | |
| 13167 | ; mode 16 [row 3] |
| 13168 | pslldq m6, m5, 2 |
| 13169 | pinsrb m6, [r3 + 2], 1 |
| 13170 | pinsrb m6, [r3 + 3], 0 |
| 13171 | pmaddubsw m1, m6, [r5 + 12 * 16] |
| 13172 | pmulhrsw m1, m3 |
| 13173 | packuswb m1, m1 |
| 13174 | movh [r0 + 227 * 16], m1 |
| 13175 | |
| 13176 | ; mode 16 [row 15 - second half] |
| 13177 | pmaddubsw m1, m6, [r5 + 16 * 16] |
| 13178 | pmulhrsw m1, m3 |
| 13179 | packuswb m1, m1 |
| 13180 | movh [r0 + 239 * 16 + 8], m1 |
| 13181 | ; mode 16 [row 15 - second half] end |
| 13182 | |
| 13183 | ; mode 16 [row 4- first half] |
| 13184 | pslldq m6, 2 |
| 13185 | pinsrb m6, [r3 + 3], 1 |
| 13186 | pinsrb m6, [r3 + 5], 0 |
| 13187 | pmaddubsw m1, m6, [r5 + 23 * 16] |
| 13188 | pmulhrsw m1, m3 |
| 13189 | packuswb m1, m1 |
| 13190 | movh [r0 + 228 * 16], m1 |
| 13191 | |
| 13192 | ; mode 16 [row 5- first half] |
| 13193 | pmaddubsw m1, m6, [r5 + 2 * 16] |
| 13194 | pmulhrsw m1, m3 |
| 13195 | packuswb m1, m1 |
| 13196 | movh [r0 + 229 * 16], m1 |
| 13197 | |
| 13198 | ; mode 16 [row 6- first half] |
| 13199 | pslldq m6, 2 |
| 13200 | pinsrb m6, [r3 + 5], 1 |
| 13201 | pinsrb m6, [r3 + 6], 0 |
| 13202 | pmaddubsw m1, m6, [r5 + 13 * 16] |
| 13203 | pmulhrsw m1, m3 |
| 13204 | packuswb m1, m1 |
| 13205 | movh [r0 + 230 * 16], m1 |
| 13206 | |
| 13207 | ; mode 16 [row 7- first half] |
| 13208 | pslldq m6, 2 |
| 13209 | pinsrb m6, [r3 + 6], 1 |
| 13210 | pinsrb m6, [r3 + 8], 0 |
| 13211 | pmaddubsw m1, m6, [r5 + 24 * 16] |
| 13212 | pmulhrsw m1, m3 |
| 13213 | packuswb m1, m1 |
| 13214 | movh [r0 + 231 * 16], m1 |
| 13215 | |
| 13216 | ; mode 16 [row 8- first half] |
| 13217 | pmaddubsw m1, m6, [r5 + 3 * 16] |
| 13218 | pmulhrsw m1, m3 |
| 13219 | packuswb m1, m1 |
| 13220 | movh [r0 + 232 * 16], m1 |
| 13221 | ; mode 16 [row 8- first half] end |
| 13222 | |
| 13223 | ; mode 16 [row 9- first half] |
| 13224 | pslldq m6, 2 |
| 13225 | pinsrb m6, [r3 + 8], 1 |
| 13226 | pinsrb m6, [r3 + 9], 0 |
| 13227 | pmaddubsw m1, m6, [r5 + 14 * 16] |
| 13228 | pmulhrsw m1, m3 |
| 13229 | packuswb m1, m1 |
| 13230 | movh [r0 + 233 * 16], m1 |
| 13231 | |
| 13232 | ; mode 16 [row 10 - first half] |
| 13233 | pslldq m6, 2 |
| 13234 | pinsrb m6, [r3 + 9], 1 |
| 13235 | pinsrb m6, [r3 + 11], 0 |
| 13236 | pmaddubsw m1, m6, [r5 + 25 * 16] |
| 13237 | pmulhrsw m1, m3 |
| 13238 | packuswb m1, m1 |
| 13239 | movh [r0 + 234 * 16], m1 |
| 13240 | |
| 13241 | ; mode 16 [row 11 - first half] |
| 13242 | pmaddubsw m1, m6, [r5 + 4 * 16] |
| 13243 | pmulhrsw m1, m3 |
| 13244 | packuswb m1, m1 |
| 13245 | movh [r0 + 235 * 16], m1 |
| 13246 | |
| 13247 | ; mode 16 [row 12 - first half] |
| 13248 | pslldq m6, 2 |
| 13249 | pinsrb m6, [r3 + 11], 1 |
| 13250 | pinsrb m6, [r3 + 12], 0 |
| 13251 | pmaddubsw m1, m6, [r5 + 15 * 16] |
| 13252 | pmulhrsw m1, m3 |
| 13253 | packuswb m1, m1 |
| 13254 | movh [r0 + 236 * 16], m1 |
| 13255 | |
| 13256 | ; mode 16 [row 13 - first half] |
| 13257 | pslldq m6, 2 |
| 13258 | pinsrb m6, [r3 + 12], 1 |
| 13259 | pinsrb m6, [r3 + 14], 0 |
| 13260 | pmaddubsw m1, m6, [r5 + 26 * 16] |
| 13261 | pmulhrsw m1, m3 |
| 13262 | packuswb m1, m1 |
| 13263 | movh [r0 + 237 * 16], m1 |
| 13264 | |
| 13265 | ; mode 16 [row 14 - first half] |
| 13266 | pmaddubsw m1, m6, [r5 + 5 * 16] |
| 13267 | pmulhrsw m1, m3 |
| 13268 | packuswb m1, m1 |
| 13269 | movh [r0 + 238 * 16], m1 |
| 13270 | |
| 13271 | ; mode 16 [row 15 - first half] |
| 13272 | pslldq m6, 2 |
| 13273 | pinsrb m6, [r3 + 14], 1 |
| 13274 | pinsrb m6, [r3 + 15], 0 |
| 13275 | pmaddubsw m1, m6, [r5 + 16 * 16] |
| 13276 | pmulhrsw m1, m3 |
| 13277 | packuswb m1, m1 |
| 13278 | movh [r0 + 239 * 16], m1 |
| 13279 | |
| 13280 | ; mode 14 [row 4] |
| 13281 | pslldq m5, 2 |
| 13282 | pinsrb m5, [r3 + 2], 1 |
| 13283 | pinsrb m5, [r3 + 5], 0 |
| 13284 | movu m4, [r4 + 6] |
| 13285 | psrldq m0, m4, 1 |
| 13286 | punpcklbw m4, m0 |
| 13287 | |
| 13288 | ; mode 16 [row 3 - second half] |
| 13289 | pmaddubsw m1, m4, [r5 + 12 * 16] |
| 13290 | pmulhrsw m1, m3 |
| 13291 | packuswb m1, m1 |
| 13292 | movh [r0 + 227 * 16 + 8], m1 |
| 13293 | |
| 13294 | ; mode 16 [row 3 - second half] end |
| 13295 | pmaddubsw m1, m5, [r5 + 31 * 16] |
| 13296 | pmulhrsw m1, m3 |
| 13297 | pmaddubsw m0, m4, [r5 + 31 * 16] |
| 13298 | pmulhrsw m0, m3 |
| 13299 | packuswb m1, m0 |
| 13300 | movu [r0 + 196 * 16], m1 |
| 13301 | |
| 13302 | ; mode 14 [row 5] |
| 13303 | pmaddubsw m1, m5, [r5 + 18 * 16] |
| 13304 | pmulhrsw m1, m3 |
| 13305 | pmaddubsw m0, m4, [r5 + 18 * 16] |
| 13306 | pmulhrsw m0, m3 |
| 13307 | packuswb m1, m0 |
| 13308 | movu [r0 + 197 * 16], m1 |
| 13309 | |
| 13310 | ; mode 14 [row 6] |
| 13311 | pmaddubsw m1, m5, [r5 + 5 * 16] |
| 13312 | pmulhrsw m1, m3 |
| 13313 | pmaddubsw m0, m4, [r5 + 5 * 16] |
| 13314 | pmulhrsw m0, m3 |
| 13315 | packuswb m1, m0 |
| 13316 | movu [r0 + 198 * 16], m1 |
| 13317 | |
| 13318 | ; mode 15 [row 3] |
| 13319 | movu m6, m5 |
| 13320 | pinsrb m6, [r3 + 4], 0 |
| 13321 | pmaddubsw m1, m6, [r5 + 28 * 16] |
| 13322 | pmulhrsw m1, m3 |
| 13323 | pmaddubsw m0, m4, [r5 + 28 * 16] |
| 13324 | pmulhrsw m0, m3 |
| 13325 | packuswb m1, m0 |
| 13326 | movu [r0 + 211 * 16], m1 |
| 13327 | |
| 13328 | ; mode 15 [row 4] |
| 13329 | pmaddubsw m1, m6, [r5 + 11 * 16] |
| 13330 | pmulhrsw m1, m3 |
| 13331 | pmaddubsw m0, m4, [r5 + 11 * 16] |
| 13332 | pmulhrsw m0, m3 |
| 13333 | packuswb m1, m0 |
| 13334 | movu [r0 + 212 * 16], m1 |
| 13335 | |
| 13336 | ; mode 15 [row 5 - first half] |
| 13337 | pslldq m6, 2 |
| 13338 | pinsrb m6, [r3 + 4], 1 |
| 13339 | pinsrb m6, [r3 + 6], 0 |
| 13340 | pmaddubsw m1, m6, [r5 + 26 * 16] |
| 13341 | pmulhrsw m1, m3 |
| 13342 | packuswb m1, m1 |
| 13343 | movh [r0 + 213 * 16], m1 |
| 13344 | |
| 13345 | ; mode 15 [row 6 - first half] |
| 13346 | pmaddubsw m1, m6, [r5 + 9 * 16] |
| 13347 | pmulhrsw m1, m3 |
| 13348 | packuswb m1, m1 |
| 13349 | movh [r0 + 214 * 16], m1 |
| 13350 | |
| 13351 | ; mode 15 [row 7 - first half] |
| 13352 | pslldq m6, 2 |
| 13353 | pinsrb m6, [r3 + 6], 1 |
| 13354 | pinsrb m6, [r3 + 8], 0 |
| 13355 | pmaddubsw m1, m6, [r5 + 24 * 16] |
| 13356 | pmulhrsw m1, m3 |
| 13357 | packuswb m1, m1 |
| 13358 | movh [r0 + 215 * 16], m1 |
| 13359 | |
| 13360 | ; mode 15 [row 8 - first half] |
| 13361 | pmaddubsw m1, m6, [r5 + 7 * 16] |
| 13362 | pmulhrsw m1, m3 |
| 13363 | packuswb m1, m1 |
| 13364 | movh [r0 + 216 * 16], m1 |
| 13365 | |
| 13366 | ; mode 15 [row 9 - first half] |
| 13367 | pslldq m6, 2 |
| 13368 | pinsrb m6, [r3 + 8], 1 |
| 13369 | pinsrb m6, [r3 + 9], 0 |
| 13370 | pmaddubsw m1, m6, [r5 + 22 * 16] |
| 13371 | pmulhrsw m1, m3 |
| 13372 | packuswb m1, m1 |
| 13373 | movh [r0 + 217 * 16], m1 |
| 13374 | |
| 13375 | ; mode 15 [row 10 - first half] |
| 13376 | pmaddubsw m1, m6, [r5 + 5 * 16] |
| 13377 | pmulhrsw m1, m3 |
| 13378 | packuswb m1, m1 |
| 13379 | movh [r0 + 218 * 16], m1 |
| 13380 | |
| 13381 | ; mode 15 [row 11 - first half] |
| 13382 | pslldq m6, 2 |
| 13383 | pinsrb m6, [r3 + 9], 1 |
| 13384 | pinsrb m6, [r3 + 11], 0 |
| 13385 | pmaddubsw m1, m6, [r5 + 20 * 16] |
| 13386 | pmulhrsw m1, m3 |
| 13387 | packuswb m1, m1 |
| 13388 | movh [r0 + 219 * 16], m1 |
| 13389 | |
| 13390 | ; mode 15 [row 12 - first half] |
| 13391 | pmaddubsw m1, m6, [r5 + 3 * 16] |
| 13392 | pmulhrsw m1, m3 |
| 13393 | packuswb m1, m1 |
| 13394 | movh [r0 + 220 * 16], m1 |
| 13395 | |
| 13396 | ; mode 15 [row 13 - first half] |
| 13397 | pslldq m6, 2 |
| 13398 | pinsrb m6, [r3 + 11], 1 |
| 13399 | pinsrb m6, [r3 + 13], 0 |
| 13400 | pmaddubsw m1, m6, [r5 + 18 * 16] |
| 13401 | pmulhrsw m1, m3 |
| 13402 | packuswb m1, m1 |
| 13403 | movh [r0 + 221 * 16], m1 |
| 13404 | |
| 13405 | ; mode 15 [row 14 - first half] |
| 13406 | pmaddubsw m1, m6, [r5 + 1 * 16] |
| 13407 | pmulhrsw m1, m3 |
| 13408 | packuswb m1, m1 |
| 13409 | movh [r0 + 222 * 16], m1 |
| 13410 | |
| 13411 | ; mode 15 [row 15 - first half] |
| 13412 | pslldq m6, 2 |
| 13413 | pinsrb m6, [r3 + 13], 1 |
| 13414 | pinsrb m6, [r3 + 15], 0 |
| 13415 | pmaddubsw m1, m6, [r5 + 16 * 16] |
| 13416 | pmulhrsw m1, m3 |
| 13417 | packuswb m1, m1 |
| 13418 | movh [r0 + 223 * 16], m1 |
| 13419 | |
| 13420 | ; mode 14 [row 7] |
| 13421 | pslldq m5, 2 |
| 13422 | pinsrb m5, [r3 + 5], 1 |
| 13423 | pinsrb m5, [r3 + 7], 0 |
| 13424 | movu m0, [r4 + 5] |
| 13425 | psrldq m6, m0, 1 |
| 13426 | punpcklbw m0, m6 |
| 13427 | |
| 13428 | ; mode 15 [row 5 - second half] |
| 13429 | pmaddubsw m1, m0, [r5 + 26 * 16] |
| 13430 | pmulhrsw m1, m3 |
| 13431 | packuswb m1, m1 |
| 13432 | movh [r0 + 213 * 16 + 8], m1 |
| 13433 | ; mode 15 [row 5 - second half] end |
| 13434 | |
| 13435 | ; mode 15 [row 6 - second half] |
| 13436 | pmaddubsw m1, m0, [r5 + 9 * 16] |
| 13437 | pmulhrsw m1, m3 |
| 13438 | packuswb m1, m1 |
| 13439 | movh [r0 + 214 * 16 + 8], m1 |
| 13440 | ; mode 15 [row 6 - second half] end |
| 13441 | |
| 13442 | ; mode 16 [row 4 - second half] |
| 13443 | pmaddubsw m1, m0, [r5 + 23 * 16] |
| 13444 | pmulhrsw m1, m3 |
| 13445 | packuswb m1, m1 |
| 13446 | movh [r0 + 228 * 16 + 8], m1 |
| 13447 | ; mode 16 [row 4 - second half] end |
| 13448 | |
| 13449 | ; mode 16 [row 5 - second half] |
| 13450 | pmaddubsw m1, m0, [r5 + 2 * 16] |
| 13451 | pmulhrsw m1, m3 |
| 13452 | packuswb m1, m1 |
| 13453 | movh [r0 + 229 * 16 + 8], m1 |
| 13454 | |
| 13455 | ; mode 16 [row 5 - second half] end (code below: mode 14 [row 7], both halves) |
| 13456 | pmaddubsw m1, m5, [r5 + 24 * 16] |
| 13457 | pmulhrsw m1, m3 |
| 13458 | pmaddubsw m6, m0, [r5 + 24 * 16] |
| 13459 | pmulhrsw m6, m3 |
| 13460 | packuswb m1, m6 |
| 13461 | movu [r0 + 199 * 16], m1 |
| 13462 | |
| 13463 | ; mode 14 [row 8] |
| 13464 | pmaddubsw m1, m5, [r5 + 11 * 16] |
| 13465 | pmulhrsw m1, m3 |
| 13466 | pmaddubsw m6, m0, [r5 + 11 * 16] |
| 13467 | pmulhrsw m6, m3 |
| 13468 | packuswb m1, m6 |
| 13469 | movu [r0 + 200 * 16], m1 |
| 13470 | |
| 13471 | ; mode 14 [row 9] |
| 13472 | pslldq m5, 2 |
| 13473 | pinsrb m5, [r3 + 7], 1 |
| 13474 | pinsrb m5, [r3 + 10], 0 |
| 13475 | movu m0, [r4 + 4] |
| 13476 | psrldq m6, m0, 1 |
| 13477 | punpcklbw m0, m6 |
| 13478 | |
| 13479 | ; mode 15 [row 7 - second half] |
| 13480 | pmaddubsw m1, m0, [r5 + 24 * 16] |
| 13481 | pmulhrsw m1, m3 |
| 13482 | packuswb m1, m1 |
| 13483 | movh [r0 + 215 * 16 + 8], m1 |
| 13484 | ; mode 15 [row 7 - second half] end |
| 13485 | |
| 13486 | ; mode 15 [row 8 - second half] |
| 13487 | pmaddubsw m1, m0, [r5 + 7 * 16] |
| 13488 | pmulhrsw m1, m3 |
| 13489 | packuswb m1, m1 |
| 13490 | movh [r0 + 216 * 16 + 8], m1 |
| 13491 | ; mode 15 [row 8 - second half] end |
| 13492 | |
| 13493 | ; mode 16 [row 6 - second half] |
| 13494 | pmaddubsw m1, m0, [r5 + 13 * 16] |
| 13495 | pmulhrsw m1, m3 |
| 13496 | packuswb m1, m1 |
| 13497 | movh [r0 + 230 * 16 + 8], m1 |
| 13498 | ; mode 16 [row 6 - second half] end |
| 13499 | |
| 13500 | ; mode 14 [row 9] - both halves |
| 13501 | pmaddubsw m1, m5, [r5 + 30 * 16] |
| 13502 | pmulhrsw m1, m3 |
| 13503 | pmaddubsw m6, m0, [r5 + 30 * 16] |
| 13504 | pmulhrsw m6, m3 |
| 13505 | packuswb m1, m6 |
| 13506 | movu [r0 + 201 * 16], m1 |
| 13507 | |
| 13508 | ; mode 14 [row 10] |
| 13509 | pmaddubsw m1, m5, [r5 + 17 * 16] |
| 13510 | pmulhrsw m1, m3 |
| 13511 | pmaddubsw m6, m0, [r5 + 17 * 16] |
| 13512 | pmulhrsw m6, m3 |
| 13513 | packuswb m1, m6 |
| 13514 | movu [r0 + 202 * 16], m1 |
| 13515 | |
| 13516 | ; mode 14 [row 11] |
| 13517 | pmaddubsw m1, m5, [r5 + 4 * 16] |
| 13518 | pmulhrsw m1, m3 |
| 13519 | pmaddubsw m6, m0, [r5 + 4 * 16] |
| 13520 | pmulhrsw m6, m3 |
| 13521 | packuswb m1, m6 |
| 13522 | movu [r0 + 203 * 16], m1 |
| 13523 | |
| 13524 | ; mode 14 [row 12] |
| 13525 | pslldq m5, 2 |
| 13526 | pinsrb m5, [r3 + 10], 1 |
| 13527 | pinsrb m5, [r3 + 12], 0 |
| 13528 | movu m0, [r4 + 3] |
| 13529 | psrldq m6, m0, 1 |
| 13530 | punpcklbw m0, m6 |
| 13531 | |
| 13532 | ; mode 15 [row 9 - second half] |
| 13533 | pmaddubsw m1, m0, [r5 + 22 * 16] |
| 13534 | pmulhrsw m1, m3 |
| 13535 | packuswb m1, m1 |
| 13536 | movh [r0 + 217 * 16 + 8], m1 |
| 13537 | ; mode 15 [row 9 - second half] end |
| 13538 | |
| 13539 | ; mode 15 [row 10 - second half] |
| 13540 | pmaddubsw m1, m0, [r5 + 5 * 16] |
| 13541 | pmulhrsw m1, m3 |
| 13542 | packuswb m1, m1 |
| 13543 | movh [r0 + 218 * 16 + 8], m1 |
| 13544 | ; mode 15 [row 10 - second half] end |
| 13545 | |
| 13546 | ; mode 16 [row 7 - second half] |
| 13547 | pmaddubsw m1, m0, [r5 + 24 * 16] |
| 13548 | pmulhrsw m1, m3 |
| 13549 | packuswb m1, m1 |
| 13550 | movh [r0 + 231 * 16 + 8], m1 |
| 13551 | ; mode 16 [row 7 - second half] end |
| 13552 | |
| 13553 | ; mode 16 [row 8 - second half] |
| 13554 | pmaddubsw m1, m0, [r5 + 3 * 16] |
| 13555 | pmulhrsw m1, m3 |
| 13556 | packuswb m1, m1 |
| 13557 | movh [r0 + 232 * 16 + 8], m1 |
| 13558 | ; mode 16 [row 8 - second half] end |
| 13559 | |
| 13560 | pmaddubsw m1, m5, [r5 + 23 * 16] |
| 13561 | pmulhrsw m1, m3 |
| 13562 | pmaddubsw m6, m0, [r5 + 23 * 16] |
| 13563 | pmulhrsw m6, m3 |
| 13564 | packuswb m1, m6 |
| 13565 | movu [r0 + 204 * 16], m1 |
| 13566 | |
| 13567 | ; mode 14 [row 13] |
| 13568 | pmaddubsw m1, m5, [r5 + 10 * 16] |
| 13569 | pmulhrsw m1, m3 |
| 13570 | pmaddubsw m6, m0, [r5 + 10 * 16] |
| 13571 | pmulhrsw m6, m3 |
| 13572 | packuswb m1, m6 |
| 13573 | movu [r0 + 205 * 16], m1 |
| 13574 | |
| 13575 | ; mode 14 [row 14] |
| 13576 | pslldq m5, 2 |
| 13577 | pinsrb m5, [r3 + 12], 1 |
| 13578 | pinsrb m5, [r3 + 15], 0 |
| 13579 | movu m0, [r4 + 2] |
| 13580 | psrldq m6, m0, 1 |
| 13581 | punpcklbw m0, m6 |
| 13582 | |
| 13583 | ; mode 15 [row 11 - second half] |
| 13584 | pmaddubsw m1, m0, [r5 + 20 * 16] |
| 13585 | pmulhrsw m1, m3 |
| 13586 | packuswb m1, m1 |
| 13587 | movh [r0 + 219 * 16 + 8], m1 |
| 13588 | ; mode 15 [row 11 - second half] end |
| 13589 | |
| 13590 | ; mode 15 [row 12 - second half] |
| 13591 | pmaddubsw m1, m0, [r5 + 3 * 16] |
| 13592 | pmulhrsw m1, m3 |
| 13593 | packuswb m1, m1 |
| 13594 | movh [r0 + 220 * 16 + 8], m1 |
| 13595 | ; mode 15 [row 12 - second half] end |
| 13596 | |
| 13597 | ; mode 16 [row 9 - second half] |
| 13598 | pmaddubsw m1, m0, [r5 + 14 * 16] |
| 13599 | pmulhrsw m1, m3 |
| 13600 | packuswb m1, m1 |
| 13601 | movh [r0 + 233 * 16 + 8], m1 |
| 13602 | |
| 13603 | ; mode 16 [row 9 - second half] end (code below: mode 14 [row 14], both halves) |
| 13604 | pmaddubsw m1, m5, [r5 + 29 * 16] |
| 13605 | pmulhrsw m1, m3 |
| 13606 | pmaddubsw m6, m0, [r5 + 29 * 16] |
| 13607 | pmulhrsw m6, m3 |
| 13608 | packuswb m1, m6 |
| 13609 | movu [r0 + 206 * 16], m1 |
| 13610 | |
| 13611 | ; mode 14 [row 15] |
| 13612 | pmaddubsw m1, m5, [r5 + 16 * 16] |
| 13613 | pmulhrsw m1, m3 |
| 13614 | pmaddubsw m6, m0, [r5 + 16 * 16] |
| 13615 | pmulhrsw m6, m3 |
| 13616 | packuswb m1, m6 |
| 13617 | movu [r0 + 207 * 16], m1 |
| 13618 | |
| 13619 | ; mode 12 [row 12] |
| 13620 | pslldq m0, m2, 2 |
| 13621 | pinsrb m0, [r3 + 6], 1 |
| 13622 | pinsrb m0, [r3 + 13], 0 |
| 13623 | pmaddubsw m1, m0, [r5 + 31 * 16] |
| 13624 | pmulhrsw m1, m3 |
| 13625 | pmaddubsw m5, m4, [r5 + 31 * 16] |
| 13626 | pmulhrsw m5, m3 |
| 13627 | packuswb m1, m5 |
| 13628 | movu [r0 + 172 * 16], m1 |
| 13629 | |
| 13630 | ; mode 12 [row 13] |
| 13631 | pmaddubsw m1, m0, [r5 + 26 * 16] |
| 13632 | pmulhrsw m1, m3 |
| 13633 | pmaddubsw m5, m4, [r5 + 26 * 16] |
| 13634 | pmulhrsw m5, m3 |
| 13635 | packuswb m1, m5 |
| 13636 | movu [r0 + 173 * 16], m1 |
| 13637 | |
| 13638 | ; mode 12 [row 14] |
| 13639 | pmaddubsw m1, m0, [r5 + 21 * 16] |
| 13640 | pmulhrsw m1, m3 |
| 13641 | pmaddubsw m5, m4, [r5 + 21 * 16] |
| 13642 | pmulhrsw m5, m3 |
| 13643 | packuswb m1, m5 |
| 13644 | movu [r0 + 174 * 16], m1 |
| 13645 | |
| 13646 | ; mode 12 [row 15] |
| 13647 | pmaddubsw m1, m0, [r5 + 16 * 16] |
| 13648 | pmulhrsw m1, m3 |
| 13649 | pmaddubsw m5, m4, [r5 + 16 * 16] |
| 13650 | pmulhrsw m5, m3 |
| 13651 | packuswb m1, m5 |
| 13652 | movu [r0 + 175 * 16], m1 |
| 13653 | |
| 13654 | ; mode 13 [row 7] |
| 13655 | pslldq m7, 2 |
| 13656 | pinsrb m7, [r3 + 4], 1 |
| 13657 | pinsrb m7, [r3 + 7], 0 |
| 13658 | pmaddubsw m1, m7, [r5 + 24 * 16] |
| 13659 | pmulhrsw m1, m3 |
| 13660 | pmaddubsw m5, m4, [r5 + 24 * 16] |
| 13661 | pmulhrsw m5, m3 |
| 13662 | packuswb m1, m5 |
| 13663 | movu [r0 + 183 * 16], m1 |
| 13664 | |
| 13665 | ; mode 13 [row 8] |
| 13666 | pmaddubsw m1, m7, [r5 + 15 * 16] |
| 13667 | pmulhrsw m1, m3 |
| 13668 | pmaddubsw m5, m4, [r5 + 15 * 16] |
| 13669 | pmulhrsw m5, m3 |
| 13670 | packuswb m1, m5 |
| 13671 | movu [r0 + 184 * 16], m1 |
| 13672 | |
| 13673 | ; mode 13 [row 9] |
| 13674 | pmaddubsw m1, m7, [r5 + 6 * 16] |
| 13675 | pmulhrsw m1, m3 |
| 13676 | pmaddubsw m5, m4, [r5 + 6 * 16] |
| 13677 | pmulhrsw m5, m3 |
| 13678 | packuswb m1, m5 |
| 13679 | movu [r0 + 185 * 16], m1 |
| 13680 | |
| 13681 | ; mode 13 [row 10] |
| 13682 | pslldq m7, 2 |
| 13683 | pinsrb m7, [r3 + 7], 1 |
| 13684 | pinsrb m7, [r3 + 11], 0 |
| 13685 | pmaddubsw m1, m7, [r5 + 29 * 16] |
| 13686 | pmulhrsw m1, m3 |
| 13687 | movu m4, [r4 + 5] |
| 13688 | psrldq m5, m4, 1 |
| 13689 | punpcklbw m4, m5 |
| 13690 | pmaddubsw m5, m4, [r5 + 29 * 16] |
| 13691 | pmulhrsw m5, m3 |
| 13692 | packuswb m1, m5 |
| 13693 | movu [r0 + 186 * 16], m1 |
| 13694 | |
| 13695 | ; mode 13 [row 11] |
| 13696 | pmaddubsw m1, m7, [r5 + 20 * 16] |
| 13697 | pmulhrsw m1, m3 |
| 13698 | pmaddubsw m5, m4, [r5 + 20 * 16] |
| 13699 | pmulhrsw m5, m3 |
| 13700 | packuswb m1, m5 |
| 13701 | movu [r0 + 187 * 16], m1 |
| 13702 | |
| 13703 | ; mode 13 [row 12] |
| 13704 | pmaddubsw m1, m7, [r5 + 11 * 16] |
| 13705 | pmulhrsw m1, m3 |
| 13706 | pmaddubsw m5, m4, [r5 + 11 * 16] |
| 13707 | pmulhrsw m5, m3 |
| 13708 | packuswb m1, m5 |
| 13709 | movu [r0 + 188 * 16], m1 |
| 13710 | |
| 13711 | ; mode 13 [row 13] |
| 13712 | pmaddubsw m1, m7, [r5 + 2 * 16] |
| 13713 | pmulhrsw m1, m3 |
| 13714 | pmaddubsw m5, m4, [r5 + 2 * 16] |
| 13715 | pmulhrsw m5, m3 |
| 13716 | packuswb m1, m5 |
| 13717 | movu [r0 + 189 * 16], m1 |
| 13718 | |
| 13719 | ; mode 13 [row 14] |
| 13720 | pslldq m7, 2 |
| 13721 | pinsrb m7, [r3 + 11], 1 |
| 13722 | pinsrb m7, [r3 + 14], 0 |
| 13723 | pmaddubsw m1, m7, [r5 + 25 * 16] |
| 13724 | pmulhrsw m1, m3 |
| 13725 | movu m4, [r4 + 4] |
| 13726 | psrldq m5, m4, 1 |
| 13727 | punpcklbw m4, m5 |
| 13728 | pmaddubsw m5, m4, [r5 + 25 * 16] |
| 13729 | pmulhrsw m5, m3 |
| 13730 | packuswb m1, m5 |
| 13731 | movu [r0 + 190 * 16], m1 |
| 13732 | |
| 13733 | ; mode 13 [row 15] |
| 13734 | pmaddubsw m1, m7, [r5 + 16 * 16] |
| 13735 | pmulhrsw m1, m3 |
| 13736 | pmaddubsw m5, m4, [r5 + 16 * 16] |
| 13737 | pmulhrsw m5, m3 |
| 13738 | packuswb m1, m5 |
| 13739 | movu [r0 + 191 * 16], m1 |
| 13740 | |
| 13741 | ; mode 17 [row 15] |
| 13742 | movu m0, [r3] |
| 13743 | pshufb m1, m0, [tab_S1] |
| 13744 | movu [r0 + 255 * 16], m1 |
| 13745 | movu m2, [r4] |
| 13746 | movd [r0 + 255 * 16 + 12], m2 |
| 13747 | |
| 13748 | ; mode 18 [row 0] |
| 13749 | movu [r0 + 256 * 16], m0 |
| 13750 | |
| 13751 | ; mode 18 [rows 1 - 15]: each row is the previous one shifted up one byte with [r4 + row] inserted at byte 0 |
| 13752 | pslldq m4, m0, 1 |
| 13753 | pinsrb m4, [r4 + 1], 0 |
| 13754 | movu [r0 + 257 * 16], m4 |
| 13755 | pslldq m4, 1 |
| 13756 | pinsrb m4, [r4 + 2], 0 |
| 13757 | movu [r0 + 258 * 16], m4 |
| 13758 | pslldq m4, 1 |
| 13759 | pinsrb m4, [r4 + 3], 0 |
| 13760 | movu [r0 + 259 * 16], m4 |
| 13761 | pslldq m4, 1 |
| 13762 | pinsrb m4, [r4 + 4], 0 |
| 13763 | movu [r0 + 260 * 16], m4 |
| 13764 | pslldq m4, 1 |
| 13765 | pinsrb m4, [r4 + 5], 0 |
| 13766 | movu [r0 + 261 * 16], m4 |
| 13767 | pslldq m4, 1 |
| 13768 | pinsrb m4, [r4 + 6], 0 |
| 13769 | movu [r0 + 262 * 16], m4 |
| 13770 | pslldq m4, 1 |
| 13771 | pinsrb m4, [r4 + 7], 0 |
| 13772 | movu [r0 + 263 * 16], m4 |
| 13773 | pslldq m4, 1 |
| 13774 | pinsrb m4, [r4 + 8], 0 |
| 13775 | movu [r0 + 264 * 16], m4 |
| 13776 | pslldq m4, 1 |
| 13777 | pinsrb m4, [r4 + 9], 0 |
| 13778 | movu [r0 + 265 * 16], m4 |
| 13779 | pslldq m4, 1 |
| 13780 | pinsrb m4, [r4 + 10], 0 |
| 13781 | movu [r0 + 266 * 16], m4 |
| 13782 | pslldq m4, 1 |
| 13783 | pinsrb m4, [r4 + 11], 0 |
| 13784 | movu [r0 + 267 * 16], m4 |
| 13785 | pslldq m4, 1 |
| 13786 | pinsrb m4, [r4 + 12], 0 |
| 13787 | movu [r0 + 268 * 16], m4 |
| 13788 | pslldq m4, 1 |
| 13789 | pinsrb m4, [r4 + 13], 0 |
| 13790 | movu [r0 + 269 * 16], m4 |
| 13791 | pslldq m4, 1 |
| 13792 | pinsrb m4, [r4 + 14], 0 |
| 13793 | movu [r0 + 270 * 16], m4 |
| 13794 | pslldq m4, 1 |
| 13795 | pinsrb m4, [r4 + 15], 0 |
| 13796 | movu [r0 + 271 * 16], m4 |
| 13797 | |
| 13798 | ; mode 19 [row 0] |
| 13799 | psrldq m2, m0, 1 |
| 13800 | punpcklbw m0, m2 |
| 13801 | movu m5, [r3 + 8] |
| 13802 | psrldq m6, m5, 1 |
| 13803 | punpcklbw m5, m6 |
| 13804 | pmaddubsw m4, m0, [r5 + 6 * 16] |
| 13805 | pmulhrsw m4, m3 |
| 13806 | pmaddubsw m6, m5, [r5 + 6 * 16] |
| 13807 | pmulhrsw m6, m3 |
| 13808 | packuswb m4, m6 |
| 13809 | movu [r0 + 272 * 16], m4 |
| 13810 | |
| 13811 | ; mode 20 [row 0] |
| 13812 | pmaddubsw m4, m0, [r5 + 11 * 16] |
| 13813 | pmulhrsw m4, m3 |
| 13814 | pmaddubsw m6, m5, [r5 + 11 * 16] |
| 13815 | pmulhrsw m6, m3 |
| 13816 | packuswb m4, m6 |
| 13817 | movu [r0 + 288 * 16], m4 |
| 13818 | |
| 13819 | ; mode 21 [row 0] |
| 13820 | pmaddubsw m4, m0, [r5 + 15 * 16] |
| 13821 | pmulhrsw m4, m3 |
| 13822 | pmaddubsw m6, m5, [r5 + 15 * 16] |
| 13823 | pmulhrsw m6, m3 |
| 13824 | packuswb m4, m6 |
| 13825 | movu [r0 + 304 * 16], m4 |
| 13826 | |
| 13827 | ; mode 22 [row 0] |
| 13828 | pmaddubsw m4, m0, [r5 + 19 * 16] |
| 13829 | pmulhrsw m4, m3 |
| 13830 | pmaddubsw m6, m5, [r5 + 19 * 16] |
| 13831 | pmulhrsw m6, m3 |
| 13832 | packuswb m4, m6 |
| 13833 | movu [r0 + 320 * 16], m4 |
| 13834 | |
| 13835 | ; mode 22 [row 1] |
| 13836 | pmaddubsw m4, m0, [r5 + 6 * 16] |
| 13837 | pmulhrsw m4, m3 |
| 13838 | pmaddubsw m6, m5, [r5 + 6 * 16] |
| 13839 | pmulhrsw m6, m3 |
| 13840 | packuswb m4, m6 |
| 13841 | movu [r0 + 321 * 16], m4 |
| 13842 | |
| 13843 | ; mode 23 [row 0] |
| 13844 | pmaddubsw m4, m0, [r5 + 23 * 16] |
| 13845 | pmulhrsw m4, m3 |
| 13846 | pmaddubsw m6, m5, [r5 + 23 * 16] |
| 13847 | pmulhrsw m6, m3 |
| 13848 | packuswb m4, m6 |
| 13849 | movu [r0 + 336 * 16], m4 |
| 13850 | |
| 13851 | ; mode 23 [row 1] |
| 13852 | pmaddubsw m4, m0, [r5 + 14 * 16] |
| 13853 | pmulhrsw m4, m3 |
| 13854 | pmaddubsw m6, m5, [r5 + 14 * 16] |
| 13855 | pmulhrsw m6, m3 |
| 13856 | packuswb m4, m6 |
| 13857 | movu [r0 + 337 * 16], m4 |
| 13858 | |
| 13859 | ; mode 23 [row 2] |
| 13860 | pmaddubsw m4, m0, [r5 + 5 * 16] |
| 13861 | pmulhrsw m4, m3 |
| 13862 | pmaddubsw m6, m5, [r5 + 5 * 16] |
| 13863 | pmulhrsw m6, m3 |
| 13864 | packuswb m4, m6 |
| 13865 | movu [r0 + 338 * 16], m4 |
| 13866 | |
| 13867 | ; mode 24 [row 0] |
| 13868 | pmaddubsw m4, m0, [r5 + 27 * 16] |
| 13869 | pmulhrsw m4, m3 |
| 13870 | pmaddubsw m6, m5, [r5 + 27 * 16] |
| 13871 | pmulhrsw m6, m3 |
| 13872 | packuswb m4, m6 |
| 13873 | movu [r0 + 352 * 16], m4 |
| 13874 | |
| 13875 | ; mode 24 [row 1] |
| 13876 | pmaddubsw m4, m0, [r5 + 22 * 16] |
| 13877 | pmulhrsw m4, m3 |
| 13878 | pmaddubsw m6, m5, [r5 + 22 * 16] |
| 13879 | pmulhrsw m6, m3 |
| 13880 | packuswb m4, m6 |
| 13881 | movu [r0 + 353 * 16], m4 |
| 13882 | |
| 13883 | ; mode 24 [row 2] |
| 13884 | pmaddubsw m4, m0, [r5 + 17 * 16] |
| 13885 | pmulhrsw m4, m3 |
| 13886 | pmaddubsw m6, m5, [r5 + 17 * 16] |
| 13887 | pmulhrsw m6, m3 |
| 13888 | packuswb m4, m6 |
| 13889 | movu [r0 + 354 * 16], m4 |
| 13890 | |
| 13891 | ; mode 24 [row 3] |
| 13892 | pmaddubsw m4, m0, [r5 + 12 * 16] |
| 13893 | pmulhrsw m4, m3 |
| 13894 | pmaddubsw m6, m5, [r5 + 12 * 16] |
| 13895 | pmulhrsw m6, m3 |
| 13896 | packuswb m4, m6 |
| 13897 | movu [r0 + 355 * 16], m4 |
| 13898 | |
| 13899 | ; mode 24 [row 4] |
| 13900 | pmaddubsw m4, m0, [r5 + 7 * 16] |
| 13901 | pmulhrsw m4, m3 |
| 13902 | pmaddubsw m6, m5, [r5 + 7 * 16] |
| 13903 | pmulhrsw m6, m3 |
| 13904 | packuswb m4, m6 |
| 13905 | movu [r0 + 356 * 16], m4 |
| 13906 | |
| 13907 | ; mode 24 [row 5] |
| 13908 | pmaddubsw m4, m0, [r5 + 2 * 16] |
| 13909 | pmulhrsw m4, m3 |
| 13910 | pmaddubsw m6, m5, [r5 + 2 * 16] |
| 13911 | pmulhrsw m6, m3 |
| 13912 | packuswb m4, m6 |
| 13913 | movu [r0 + 357 * 16], m4 |
| 13914 | |
| 13915 | ; mode 24 [row 6 - first half] |
| 13916 | pslldq m7, m0, 2 |
| 13917 | pinsrb m7, [r4 + 0], 1 |
| 13918 | pinsrb m7, [r4 + 6], 0 |
| 13919 | pmaddubsw m4, m7, [r5 + 29 * 16] |
| 13920 | pmulhrsw m4, m3 |
| 13921 | packuswb m4, m4 |
| 13922 | movh [r0 + 358 * 16], m4 |
| 13923 | |
| 13924 | ; mode 24 [row 7 - first half] |
| 13925 | pmaddubsw m4, m7, [r5 + 24 * 16] |
| 13926 | pmulhrsw m4, m3 |
| 13927 | packuswb m4, m4 |
| 13928 | movh [r0 + 359 * 16], m4 |
| 13929 | |
| 13930 | ; mode 24 [row 8 - first half] |
| 13931 | pmaddubsw m4, m7, [r5 + 19 * 16] |
| 13932 | pmulhrsw m4, m3 |
| 13933 | packuswb m4, m4 |
| 13934 | movh [r0 + 360 * 16], m4 |
| 13935 | |
| 13936 | ; mode 24 [row 9 - first half] |
| 13937 | pmaddubsw m4, m7, [r5 + 14 * 16] |
| 13938 | pmulhrsw m4, m3 |
| 13939 | packuswb m4, m4 |
| 13940 | movh [r0 + 361 * 16], m4 |
| 13941 | |
| 13942 | ; mode 24 [row 10 - first half] |
| 13943 | pmaddubsw m4, m7, [r5 + 9 * 16] |
| 13944 | pmulhrsw m4, m3 |
| 13945 | packuswb m4, m4 |
| 13946 | movh [r0 + 362 * 16], m4 |
| 13947 | |
| 13948 | ; mode 24 [row 11 - first half] |
| 13949 | pmaddubsw m4, m7, [r5 + 4 * 16] |
| 13950 | pmulhrsw m4, m3 |
| 13951 | packuswb m4, m4 |
| 13952 | movh [r0 + 363 * 16], m4 |
| 13953 | |
| 13954 | ; mode 24 [row 12 - first half] |
| 13955 | pslldq m7, 2 |
| 13956 | pinsrb m7, [r4 + 6], 1 |
| 13957 | pinsrb m7, [r4 + 13], 0 |
| 13958 | pmaddubsw m4, m7, [r5 + 31 * 16] |
| 13959 | pmulhrsw m4, m3 |
| 13960 | packuswb m4, m4 |
| 13961 | movh [r0 + 364 * 16], m4 |
| 13962 | |
| 13963 | ; mode 24 [row 13 - first half] |
| 13964 | pmaddubsw m4, m7, [r5 + 26 * 16] |
| 13965 | pmulhrsw m4, m3 |
| 13966 | packuswb m4, m4 |
| 13967 | movh [r0 + 365 * 16], m4 |
| 13968 | |
| 13969 | ; mode 24 [row 14 - first half] |
| 13970 | pmaddubsw m4, m7, [r5 + 21 * 16] |
| 13971 | pmulhrsw m4, m3 |
| 13972 | packuswb m4, m4 |
| 13973 | movh [r0 + 366 * 16], m4 |
| 13974 | |
| 13975 | ; mode 24 [row 15 - first half] |
| 13976 | pmaddubsw m4, m7, [r5 + 16 * 16] |
| 13977 | pmulhrsw m4, m3 |
| 13978 | packuswb m4, m4 |
| 13979 | movh [r0 + 367 * 16], m4 |
| 13980 | |
| 13981 | ; mode 23 [row 3 - first half] |
| 13982 | pslldq m7, m0, 2 |
| 13983 | pinsrb m7, [r4 + 0], 1 |
| 13984 | pinsrb m7, [r4 + 4], 0 |
| 13985 | pmaddubsw m4, m7, [r5 + 28 * 16] |
| 13986 | pmulhrsw m4, m3 |
| 13987 | packuswb m4, m4 |
| 13988 | movh [r0 + 339 * 16], m4 |
| 13989 | |
| 13990 | ; mode 23 [row 4 - first half] |
| 13991 | pmaddubsw m4, m7, [r5 + 19 * 16] |
| 13992 | pmulhrsw m4, m3 |
| 13993 | packuswb m4, m4 |
| 13994 | movh [r0 + 340 * 16], m4 |
| 13995 | |
| 13996 | ; mode 23 [row 5 - first half] |
| 13997 | pmaddubsw m4, m7, [r5 + 10 * 16] |
| 13998 | pmulhrsw m4, m3 |
| 13999 | packuswb m4, m4 |
| 14000 | movh [r0 + 341 * 16], m4 |
| 14001 | |
| 14002 | ; mode 23 [row 6 - first half] |
| 14003 | pmaddubsw m4, m7, [r5 + 1 * 16] |
| 14004 | pmulhrsw m4, m3 |
| 14005 | packuswb m4, m4 |
| 14006 | movh [r0 + 342 * 16], m4 |
| 14007 | |
| 14008 | ; mode 23 [row 7 - first half] |
| 14009 | pslldq m7, 2 |
| 14010 | pinsrb m7, [r4 + 4], 1 |
| 14011 | pinsrb m7, [r4 + 7], 0 |
| 14012 | pmaddubsw m4, m7, [r5 + 24 * 16] |
| 14013 | pmulhrsw m4, m3 |
| 14014 | packuswb m4, m4 |
| 14015 | movh [r0 + 343 * 16], m4 |
| 14016 | |
| 14017 | ; mode 23 [row 8 - first half] |
| 14018 | pmaddubsw m4, m7, [r5 + 15 * 16] |
| 14019 | pmulhrsw m4, m3 |
| 14020 | packuswb m4, m4 |
| 14021 | movh [r0 + 344 * 16], m4 |
| 14022 | |
| 14023 | ; mode 23 [row 9 - first half] |
| 14024 | pmaddubsw m4, m7, [r5 + 6 * 16] |
| 14025 | pmulhrsw m4, m3 |
| 14026 | packuswb m4, m4 |
| 14027 | movh [r0 + 345 * 16], m4 |
| 14028 | |
| 14029 | ; mode 23 [row 10 - first half] |
| 14030 | pslldq m7, 2 |
| 14031 | pinsrb m7, [r4 + 7], 1 |
| 14032 | pinsrb m7, [r4 + 11], 0 |
| 14033 | pmaddubsw m4, m7, [r5 + 29 * 16] |
| 14034 | pmulhrsw m4, m3 |
| 14035 | packuswb m4, m4 |
| 14036 | movh [r0 + 346 * 16], m4 |
| 14037 | |
| 14038 | ; mode 23 [row 11 - first half] |
| 14039 | pmaddubsw m4, m7, [r5 + 20 * 16] |
| 14040 | pmulhrsw m4, m3 |
| 14041 | packuswb m4, m4 |
| 14042 | movh [r0 + 347 * 16], m4 |
| 14043 | |
| 14044 | ; mode 23 [row 12 - first half] |
| 14045 | pmaddubsw m4, m7, [r5 + 11 * 16] |
| 14046 | pmulhrsw m4, m3 |
| 14047 | packuswb m4, m4 |
| 14048 | movh [r0 + 348 * 16], m4 |
| 14049 | |
| 14050 | ; mode 23 [row 13 - first half] |
| 14051 | pmaddubsw m4, m7, [r5 + 2 * 16] |
| 14052 | pmulhrsw m4, m3 |
| 14053 | packuswb m4, m4 |
| 14054 | movh [r0 + 349 * 16], m4 |
| 14055 | |
| 14056 | ; mode 23 [row 14 - first half] |
| 14057 | pslldq m7, 2 |
| 14058 | pinsrb m7, [r4 + 11], 1 |
| 14059 | pinsrb m7, [r4 + 14], 0 |
| 14060 | pmaddubsw m4, m7, [r5 + 25 * 16] |
| 14061 | pmulhrsw m4, m3 |
| 14062 | packuswb m4, m4 |
| 14063 | movh [r0 + 350 * 16], m4 |
| 14064 | |
| 14065 | ; mode 23 [row 15 - first half] |
| 14066 | pmaddubsw m4, m7, [r5 + 16 * 16] |
| 14067 | pmulhrsw m4, m3 |
| 14068 | packuswb m4, m4 |
| 14069 | movh [r0 + 351 * 16], m4 |
| 14070 | |
| 14071 | ; mode 21 [row 15 - second half] |
| 14072 | pmaddubsw m4, m0, [r5 + 16 * 16] |
| 14073 | pmulhrsw m4, m3 |
| 14074 | packuswb m4, m4 |
| 14075 | movh [r0 + 319 * 16 + 8], m4 |
| 14076 | ; mode 21 [row 15 - second half] end |
| 14077 | |
| 14078 | ; mode 20 [row 1 - first half] |
| 14079 | pslldq m7, m0, 2 |
| 14080 | pinsrb m7, [r4 + 0], 1 |
| 14081 | pinsrb m7, [r4 + 2], 0 |
| 14082 | pmaddubsw m4, m7, [r5 + 22 * 16] |
| 14083 | pmulhrsw m4, m3 |
| 14084 | packuswb m4, m4 |
| 14085 | movh [r0 + 289 * 16], m4 |
| 14086 | |
| 14087 | ; mode 20 [row 2 - first half] |
| 14088 | pmaddubsw m4, m7, [r5 + 1 * 16] |
| 14089 | pmulhrsw m4, m3 |
| 14090 | packuswb m4, m4 |
| 14091 | movh [r0 + 290 * 16], m4 |
| 14092 | |
| 14093 | ; mode 21 [row 1 - first half] |
| 14094 | pmaddubsw m4, m7, [r5 + 30 * 16] |
| 14095 | pmulhrsw m4, m3 |
| 14096 | packuswb m4, m4 |
| 14097 | movh [r0 + 305 * 16], m4 |
| 14098 | |
| 14099 | ; mode 21 [row 2 - first half] |
| 14100 | pmaddubsw m4, m7, [r5 + 13 * 16] |
| 14101 | pmulhrsw m4, m3 |
| 14102 | packuswb m4, m4 |
| 14103 | movh [r0 + 306 * 16], m4 |
| 14104 | |
| 14105 | ; mode 22 [row 2 - first half] |
| 14106 | pmaddubsw m4, m7, [r5 + 25 * 16] |
| 14107 | pmulhrsw m4, m3 |
| 14108 | packuswb m4, m4 |
| 14109 | movh [r0 + 322 * 16], m4 |
| 14110 | |
| 14111 | ; mode 22 [row 3 - first half] |
| 14112 | pmaddubsw m4, m7, [r5 + 12 * 16] |
| 14113 | pmulhrsw m4, m3 |
| 14114 | packuswb m4, m4 |
| 14115 | movh [r0 + 323 * 16], m4 |
| 14116 | |
| 14117 | ; mode 22 [row 4 - first half] |
| 14118 | pslldq m1, m7, 2 |
| 14119 | pinsrb m1, [r4 + 2], 1 |
| 14120 | pinsrb m1, [r4 + 5], 0 |
| 14121 | pmaddubsw m4, m1, [r5 + 31 * 16] |
| 14122 | pmulhrsw m4, m3 |
| 14123 | packuswb m4, m4 |
| 14124 | movh [r0 + 324 * 16], m4 |
| 14125 | |
| 14126 | ; mode 22 [row 5 - first half] |
| 14127 | pmaddubsw m4, m1, [r5 + 18 * 16] |
| 14128 | pmulhrsw m4, m3 |
| 14129 | packuswb m4, m4 |
| 14130 | movh [r0 + 325 * 16], m4 |
| 14131 | |
| 14132 | ; mode 22 [row 6 - first half] |
| 14133 | pmaddubsw m4, m1, [r5 + 5 * 16] |
| 14134 | pmulhrsw m4, m3 |
| 14135 | packuswb m4, m4 |
| 14136 | movh [r0 + 326 * 16], m4 |
| 14137 | |
| 14138 | ; mode 22 [row 7 - first half] |
| 14139 | pslldq m1, 2 |
| 14140 | pinsrb m1, [r4 + 5], 1 |
| 14141 | pinsrb m1, [r4 + 7], 0 |
| 14142 | pmaddubsw m4, m1, [r5 + 24 * 16] |
| 14143 | pmulhrsw m4, m3 |
| 14144 | packuswb m4, m4 |
| 14145 | movh [r0 + 327 * 16], m4 |
| 14146 | |
| 14147 | ; mode 22 [row 8 - first half] |
| 14148 | pmaddubsw m4, m1, [r5 + 11 * 16] |
| 14149 | pmulhrsw m4, m3 |
| 14150 | packuswb m4, m4 |
| 14151 | movh [r0 + 328 * 16], m4 |
| 14152 | |
| 14153 | ; mode 22 [row 9 - first half] |
| 14154 | pslldq m1, 2 |
| 14155 | pinsrb m1, [r4 + 7], 1 |
| 14156 | pinsrb m1, [r4 + 10], 0 |
| 14157 | pmaddubsw m4, m1, [r5 + 30 * 16] |
| 14158 | pmulhrsw m4, m3 |
| 14159 | packuswb m4, m4 |
| 14160 | movh [r0 + 329 * 16], m4 |
| 14161 | |
| 14162 | ; mode 22 [row 10 - first half] |
| 14163 | pmaddubsw m4, m1, [r5 + 17 * 16] |
| 14164 | pmulhrsw m4, m3 |
| 14165 | packuswb m4, m4 |
| 14166 | movh [r0 + 330 * 16], m4 |
| 14167 | |
| 14168 | ; mode 22 [row 11 - first half] |
| 14169 | pmaddubsw m4, m1, [r5 + 4 * 16] |
| 14170 | pmulhrsw m4, m3 |
| 14171 | packuswb m4, m4 |
| 14172 | movh [r0 + 331 * 16], m4 |
| 14173 | |
| 14174 | ; mode 22 [row 12 - first half] |
| 14175 | pslldq m1, 2 |
| 14176 | pinsrb m1, [r4 + 10], 1 |
| 14177 | pinsrb m1, [r4 + 12], 0 |
| 14178 | pmaddubsw m4, m1, [r5 + 23 * 16] |
| 14179 | pmulhrsw m4, m3 |
| 14180 | packuswb m4, m4 |
| 14181 | movh [r0 + 332 * 16], m4 |
| 14182 | |
| 14183 | ; mode 22 [row 13 - first half] |
| 14184 | pmaddubsw m4, m1, [r5 + 10 * 16] |
| 14185 | pmulhrsw m4, m3 |
| 14186 | packuswb m4, m4 |
| 14187 | movh [r0 + 333 * 16], m4 |
| 14188 | |
| 14189 | ; mode 22 [row 14 - first half] |
| 14190 | pslldq m1, 2 |
| 14191 | pinsrb m1, [r4 + 12], 1 |
| 14192 | pinsrb m1, [r4 + 15], 0 |
| 14193 | pmaddubsw m4, m1, [r5 + 29 * 16] |
| 14194 | pmulhrsw m4, m3 |
| 14195 | packuswb m4, m4 |
| 14196 | movh [r0 + 334 * 16], m4 |
| 14197 | |
| 14198 | ; mode 22 [row 15 - first half] |
| 14199 | pmaddubsw m4, m1, [r5 + 16 * 16] |
| 14200 | pmulhrsw m4, m3 |
| 14201 | packuswb m4, m4 |
| 14202 | movh [r0 + 335 * 16], m4 |
| 14203 | |
| 14204 | ; mode 21 [row 3 - first half] |
| 14205 | pslldq m6, m7, 2 |
| 14206 | pinsrb m6, [r4 + 2], 1 |
| 14207 | pinsrb m6, [r4 + 4], 0 |
| 14208 | pmaddubsw m4, m6, [r5 + 28 * 16] |
| 14209 | pmulhrsw m4, m3 |
| 14210 | packuswb m4, m4 |
| 14211 | movh [r0 + 307 * 16], m4 |
| 14212 | |
| 14213 | ; mode 21 [row 4 - first half] |
| 14214 | pmaddubsw m4, m6, [r5 + 11 * 16] |
| 14215 | pmulhrsw m4, m3 |
| 14216 | packuswb m4, m4 |
| 14217 | movh [r0 + 308 * 16], m4 |
| 14218 | |
| 14219 | ; mode 21 [row 5 - first half] |
| 14220 | pslldq m6, 2 |
| 14221 | pinsrb m6, [r4 + 4], 1 |
| 14222 | pinsrb m6, [r4 + 6], 0 |
| 14223 | pmaddubsw m4, m6, [r5 + 26 * 16] |
| 14224 | pmulhrsw m4, m3 |
| 14225 | packuswb m4, m4 |
| 14226 | movh [r0 + 309 * 16], m4 |
| 14227 | |
| 14228 | ; mode 21 [row 6 - first half] |
| 14229 | pmaddubsw m4, m6, [r5 + 9 * 16] |
| 14230 | pmulhrsw m4, m3 |
| 14231 | packuswb m4, m4 |
| 14232 | movh [r0 + 310 * 16], m4 |
| 14233 | |
| 14234 | ; mode 21 [row 7 - first half] |
| 14235 | pslldq m6, 2 |
| 14236 | pinsrb m6, [r4 + 6], 1 |
| 14237 | pinsrb m6, [r4 + 8], 0 |
| 14238 | pmaddubsw m4, m6, [r5 + 24 * 16] |
| 14239 | pmulhrsw m4, m3 |
| 14240 | packuswb m4, m4 |
| 14241 | movh [r0 + 311 * 16], m4 |
| 14242 | |
| 14243 | ; mode 21 [row 8 - first half] |
| 14244 | pmaddubsw m4, m6, [r5 + 7 * 16] |
| 14245 | pmulhrsw m4, m3 |
| 14246 | packuswb m4, m4 |
| 14247 | movh [r0 + 312 * 16], m4 |
| 14248 | |
| 14249 | ; mode 21 [row 9 - first half] |
| 14250 | pslldq m6, 2 |
| 14251 | pinsrb m6, [r4 + 8], 1 |
| 14252 | pinsrb m6, [r4 + 9], 0 |
| 14253 | pmaddubsw m4, m6, [r5 + 22 * 16] |
| 14254 | pmulhrsw m4, m3 |
| 14255 | packuswb m4, m4 |
| 14256 | movh [r0 + 313 * 16], m4 |
| 14257 | |
| 14258 | ; mode 21 [row 10 - first half] |
| 14259 | pmaddubsw m4, m6, [r5 + 5 * 16] |
| 14260 | pmulhrsw m4, m3 |
| 14261 | packuswb m4, m4 |
| 14262 | movh [r0 + 314 * 16], m4 |
| 14263 | |
| 14264 | ; mode 21 [row 11 - first half] |
| 14265 | pslldq m6, 2 |
| 14266 | pinsrb m6, [r4 + 9], 1 |
| 14267 | pinsrb m6, [r4 + 11], 0 |
| 14268 | pmaddubsw m4, m6, [r5 + 20 * 16] |
| 14269 | pmulhrsw m4, m3 |
| 14270 | packuswb m4, m4 |
| 14271 | movh [r0 + 315 * 16], m4 |
| 14272 | |
| 14273 | ; mode 21 [row 12 - first half] |
| 14274 | pmaddubsw m4, m6, [r5 + 3 * 16] |
| 14275 | pmulhrsw m4, m3 |
| 14276 | packuswb m4, m4 |
| 14277 | movh [r0 + 316 * 16], m4 |
| 14278 | |
| 14279 | ; mode 21 [row 13 - first half] |
| 14280 | pslldq m6, 2 |
| 14281 | pinsrb m6, [r4 + 11], 1 |
| 14282 | pinsrb m6, [r4 + 13], 0 |
| 14283 | pmaddubsw m4, m6, [r5 + 18 * 16] |
| 14284 | pmulhrsw m4, m3 |
| 14285 | packuswb m4, m4 |
| 14286 | movh [r0 + 317 * 16], m4 |
| 14287 | |
| 14288 | ; mode 21 [row 14 - first half] |
| 14289 | pmaddubsw m4, m6, [r5 + 1 * 16] |
| 14290 | pmulhrsw m4, m3 |
| 14291 | packuswb m4, m4 |
| 14292 | movh [r0 + 318 * 16], m4 |
| 14293 | |
| 14294 | ; mode 21 [row 15 - first half] |
| 14295 | pslldq m6, 2 |
| 14296 | pinsrb m6, [r4 + 13], 1 |
| 14297 | pinsrb m6, [r4 + 15], 0 |
| 14298 | pmaddubsw m4, m6, [r5 + 16 * 16] |
| 14299 | pmulhrsw m4, m3 |
| 14300 | packuswb m4, m4 |
| 14301 | movh [r0 + 319 * 16], m4 |
| 14302 | |
| 14303 | ; mode 20 [row 13 - second half] |
| 14304 | pmaddubsw m4, m7, [r5 + 26 * 16] |
| 14305 | pmulhrsw m4, m3 |
| 14306 | packuswb m4, m4 |
| 14307 | movh [r0 + 301 * 16 + 8], m4 |
| 14308 | ; mode 20 [row 13 - second half] end |
| 14309 | |
| 14310 | ; mode 20 [row 14 - second half] |
| 14311 | pmaddubsw m4, m7, [r5 + 5 * 16] |
| 14312 | pmulhrsw m4, m3 |
| 14313 | packuswb m4, m4 |
| 14314 | movh [r0 + 302 * 16 + 8], m4 |
| 14315 | ; mode 20 [row 14 - second half] end |
| 14316 | |
| 14317 | ; mode 20 [row 3 - first half] |
| 14318 | pslldq m7, 2 |
| 14319 | pinsrb m7, [r4 + 2], 1 |
| 14320 | pinsrb m7, [r4 + 3], 0 |
| 14321 | pmaddubsw m4, m7, [r5 + 12 * 16] |
| 14322 | pmulhrsw m4, m3 |
| 14323 | packuswb m4, m4 |
| 14324 | movh [r0 + 291 * 16], m4 |
| 14325 | |
| 14326 | ; mode 20 [row 15 - second half] |
| 14327 | pmaddubsw m4, m7, [r5 + 16 * 16] |
| 14328 | pmulhrsw m4, m3 |
| 14329 | packuswb m4, m4 |
| 14330 | movh [r0 + 303 * 16 + 8], m4 |
| 14331 | ; mode 20 [row 15 - second half] end |
| 14332 | |
| 14333 | ; mode 20 [row 4 - first half] |
| 14334 | pslldq m7, 2 |
| 14335 | pinsrb m7, [r4 + 3], 1 |
| 14336 | pinsrb m7, [r4 + 5], 0 |
| 14337 | pmaddubsw m4, m7, [r5 + 23 * 16] |
| 14338 | pmulhrsw m4, m3 |
| 14339 | packuswb m4, m4 |
| 14340 | movh [r0 + 292 * 16], m4 |
| 14341 | |
| 14342 | ; mode 20 [row 5 - first half] |
| 14343 | pmaddubsw m4, m7, [r5 + 2 * 16] |
| 14344 | pmulhrsw m4, m3 |
| 14345 | packuswb m4, m4 |
| 14346 | movh [r0 + 293 * 16], m4 |
| 14347 | |
| 14348 | ; mode 20 [row 6 - first half] |
| 14349 | pslldq m7, 2 |
| 14350 | pinsrb m7, [r4 + 5], 1 |
| 14351 | pinsrb m7, [r4 + 6], 0 |
| 14352 | pmaddubsw m4, m7, [r5 + 13 * 16] |
| 14353 | pmulhrsw m4, m3 |
| 14354 | packuswb m4, m4 |
| 14355 | movh [r0 + 294 * 16], m4 |
| 14356 | |
| 14357 | ; mode 20 [row 7 - first half] |
| 14358 | pslldq m7, 2 |
| 14359 | pinsrb m7, [r4 + 6], 1 |
| 14360 | pinsrb m7, [r4 + 8], 0 |
| 14361 | pmaddubsw m4, m7, [r5 + 24 * 16] |
| 14362 | pmulhrsw m4, m3 |
| 14363 | packuswb m4, m4 |
| 14364 | movh [r0 + 295 * 16], m4 |
| 14365 | |
| 14366 | ; mode 20 [row 8 - first half] |
| 14367 | pmaddubsw m4, m7, [r5 + 3 * 16] |
| 14368 | pmulhrsw m4, m3 |
| 14369 | packuswb m4, m4 |
| 14370 | movh [r0 + 296 * 16], m4 |
| 14371 | |
| 14372 | ; mode 20 [row 9 - first half] |
| 14373 | pslldq m7, 2 |
| 14374 | pinsrb m7, [r4 + 8], 1 |
| 14375 | pinsrb m7, [r4 + 9], 0 |
| 14376 | pmaddubsw m4, m7, [r5 + 14 * 16] |
| 14377 | pmulhrsw m4, m3 |
| 14378 | packuswb m4, m4 |
| 14379 | movh [r0 + 297 * 16], m4 |
| 14380 | |
| 14381 | ; mode 20 [row 10 - first half] |
| 14382 | pslldq m7, 2 |
| 14383 | pinsrb m7, [r4 + 9], 1 |
| 14384 | pinsrb m7, [r4 + 11], 0 |
| 14385 | pmaddubsw m4, m7, [r5 + 25 * 16] |
| 14386 | pmulhrsw m4, m3 |
| 14387 | packuswb m4, m4 |
| 14388 | movh [r0 + 298 * 16], m4 |
| 14389 | |
| 14390 | ; mode 20 [row 11 - first half] |
| 14391 | pmaddubsw m4, m7, [r5 + 4 * 16] |
| 14392 | pmulhrsw m4, m3 |
| 14393 | packuswb m4, m4 |
| 14394 | movh [r0 + 299 * 16], m4 |
| 14395 | |
| 14396 | ; mode 20 [row 12 - first half] |
| 14397 | movu m1, [r5 + 15 * 16] |
| 14398 | pslldq m7, 2 |
| 14399 | pinsrb m7, [r4 + 11], 1 |
| 14400 | pinsrb m7, [r4 + 12], 0 |
| 14401 | pmaddubsw m4, m7, [r5 + 15 * 16] |
| 14402 | pmulhrsw m4, m3 |
| 14403 | packuswb m4, m4 |
| 14404 | movh [r0 + 300 * 16], m4 |
| 14405 | |
| 14406 | ; mode 20 [row 13 - first half] |
| 14407 | pslldq m7, 2 |
| 14408 | pinsrb m7, [r4 + 12], 1 |
| 14409 | pinsrb m7, [r4 + 14], 0 |
| 14410 | pmaddubsw m4, m7, [r5 + 26 * 16] |
| 14411 | pmulhrsw m4, m3 |
| 14412 | packuswb m4, m4 |
| 14413 | movh [r0 + 301 * 16], m4 |
| 14414 | |
| 14415 | ; mode 20 [row 14 - first half] |
| 14416 | pmaddubsw m4, m7, [r5 + 5 * 16] |
| 14417 | pmulhrsw m4, m3 |
| 14418 | packuswb m4, m4 |
| 14419 | movh [r0 + 302 * 16], m4 |
| 14420 | |
| 14421 | ; mode 20 [row 15 - first half] |
| 14422 | pslldq m7, 2 |
| 14423 | pinsrb m7, [r4 + 14], 1 |
| 14424 | pinsrb m7, [r4 + 15], 0 |
| 14425 | pmaddubsw m4, m7, [r5 + 16 * 16] |
| 14426 | pmulhrsw m4, m3 |
| 14427 | packuswb m4, m4 |
| 14428 | movh [r0 + 303 * 16], m4 |
| 14429 | |
| 14430 | ; mode 19 [row 1] |
| 14431 | pslldq m0, 2 |
| 14432 | pinsrb m0, [r4 + 0], 1 |
| 14433 | pinsrb m0, [r4 + 1], 0 |
| 14434 | pslldq m5, 2 |
| 14435 | pinsrb m5, [r3 + 8], 1 |
| 14436 | pinsrb m5, [r3 + 7], 0 |
| 14437 | |
| 14438 | ; mode 20 [row 1 - second half] |
| 14439 | pmaddubsw m4, m5, [r5 + 22 * 16] |
| 14440 | pmulhrsw m4, m3 |
| 14441 | packuswb m4, m4 |
| 14442 | movh [r0 + 289 * 16 + 8], m4 |
| 14443 | ; mode 20 [row 1 - second half] end |
| 14444 | |
| 14445 | ; mode 20 [row 2 - second half] |
| 14446 | pmaddubsw m4, m5, [r5 + 1 * 16] |
| 14447 | pmulhrsw m4, m3 |
| 14448 | packuswb m4, m4 |
| 14449 | movh [r0 + 290 * 16 + 8], m4 |
| 14450 | ; mode 20 [row 2 - second half] end |
| 14451 | |
| 14452 | ; mode 21 [row 1 - second half] |
| 14453 | pmaddubsw m4, m5, [r5 + 30 * 16] |
| 14454 | pmulhrsw m4, m3 |
| 14455 | packuswb m4, m4 |
| 14456 | movh [r0 + 305 * 16 + 8], m4 |
| 14457 | ; mode 21 [row 1 - second half] end |
| 14458 | |
| 14459 | ; mode 21 [row 2 - second half] |
| 14460 | pmaddubsw m4, m5, [r5 + 13 * 16] |
| 14461 | pmulhrsw m4, m3 |
| 14462 | packuswb m4, m4 |
| 14463 | movh [r0 + 306 * 16 + 8], m4 |
| 14464 | ; mode 21 [row 2 - second half] end |
| 14465 | |
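| 14466 | ; mode 21 [row 4 weights, stored to the row 3 slot - second half]; NOTE(review): this slot is written again by the later mode 21 row 3 block - confirm this store is needed |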
| 14467 | pmaddubsw m4, m5, [r5 + 11 * 16] |
| 14468 | pmulhrsw m4, m3 |
| 14469 | packuswb m4, m4 |
| 14470 | movh [r0 + 307 * 16 + 8], m4 |
| 14471 | ; mode 21 [row 4 - second half] end |
| 14472 | |
| 14473 | ; mode 22 [row 2 - second half] |
| 14474 | pmaddubsw m4, m5, [r5 + 25 * 16] |
| 14475 | pmulhrsw m4, m3 |
| 14476 | packuswb m4, m4 |
| 14477 | movh [r0 + 322 * 16 + 8], m4 |
| 14478 | ; mode 22 [row 2 - second half] end |
| 14479 | |
| 14480 | ; mode 22 [row 3 - second half] |
| 14481 | pmaddubsw m4, m5, [r5 + 12 * 16] |
| 14482 | pmulhrsw m4, m3 |
| 14483 | packuswb m4, m4 |
| 14484 | movh [r0 + 323 * 16 + 8], m4 |
| 14485 | ; mode 22 [row 3 - second half] end |
| 14486 | |
| 14487 | ; mode 23 [row 3 - second half] |
| 14488 | pmaddubsw m4, m5, [r5 + 28 * 16] |
| 14489 | pmulhrsw m4, m3 |
| 14490 | packuswb m4, m4 |
| 14491 | movh [r0 + 339 * 16 + 8], m4 |
| 14492 | ; mode 23 [row 3 - second half] end |
| 14493 | |
| 14494 | ; mode 23 [row 4 - second half] |
| 14495 | pmaddubsw m4, m5, [r5 + 19 * 16] |
| 14496 | pmulhrsw m4, m3 |
| 14497 | packuswb m4, m4 |
| 14498 | movh [r0 + 340 * 16 + 8], m4 |
| 14499 | ; mode 23 [row 4 - second half] end |
| 14500 | |
| 14501 | ; mode 23 [row 5 - second half] |
| 14502 | pmaddubsw m4, m5, [r5 + 10 * 16] |
| 14503 | pmulhrsw m4, m3 |
| 14504 | packuswb m4, m4 |
| 14505 | movh [r0 + 341 * 16 + 8], m4 |
| 14506 | ; mode 23 [row 5 - second half] end |
| 14507 | |
| 14508 | ; mode 23 [row 6 - second half] |
| 14509 | pmaddubsw m4, m5, [r5 + 1 * 16] |
| 14510 | pmulhrsw m4, m3 |
| 14511 | packuswb m4, m4 |
| 14512 | movh [r0 + 342 * 16 + 8], m4 |
| 14513 | ; mode 23 [row 6 - second half] end |
| 14514 | |
| 14515 | ; mode 24 [row 6 - second half] |
| 14516 | pmaddubsw m4, m5, [r5 + 29 * 16] |
| 14517 | pmulhrsw m4, m3 |
| 14518 | packuswb m4, m4 |
| 14519 | movh [r0 + 358 * 16 + 8], m4 |
| 14520 | ; mode 24 [row 6 - second half] end |
| 14521 | |
| 14522 | ; mode 24 [row 7 - second half] |
| 14523 | pmaddubsw m4, m5, [r5 + 24 * 16] |
| 14524 | pmulhrsw m4, m3 |
| 14525 | packuswb m4, m4 |
| 14526 | movh [r0 + 359 * 16 + 8], m4 |
| 14527 | ; mode 24 [row 7 - second half] end |
| 14528 | |
| 14529 | ; mode 24 [row 8 - second half] |
| 14530 | pmaddubsw m4, m5, [r5 + 19 * 16] |
| 14531 | pmulhrsw m4, m3 |
| 14532 | packuswb m4, m4 |
| 14533 | movh [r0 + 360 * 16 + 8], m4 |
| 14534 | ; mode 24 [row 8 - second half] end |
| 14535 | |
| 14536 | ; mode 24 [row 9 - second half] |
| 14537 | pmaddubsw m4, m5, [r5 + 14 * 16] |
| 14538 | pmulhrsw m4, m3 |
| 14539 | packuswb m4, m4 |
| 14540 | movh [r0 + 361 * 16 + 8], m4 |
| 14541 | ; mode 24 [row 9 - second half] end |
| 14542 | |
| 14543 | ; mode 24 [row 10 - second half] |
| 14544 | pmaddubsw m4, m5, [r5 + 9 * 16] |
| 14545 | pmulhrsw m4, m3 |
| 14546 | packuswb m4, m4 |
| 14547 | movh [r0 + 362 * 16 + 8], m4 |
| 14548 | ; mode 24 [row 10 - second half] end |
| 14549 | |
| 14550 | ; mode 24 [row 11 - second half] |
| 14551 | pmaddubsw m4, m5, [r5 + 4 * 16] |
| 14552 | pmulhrsw m4, m3 |
| 14553 | packuswb m4, m4 |
| 14554 | movh [r0 + 363 * 16 + 8], m4 |
| 14555 | ; mode 24 [row 11 - second half] end |
| 14556 | |
| 14557 | pmaddubsw m4, m0, [r5 + 12 * 16] |
| 14558 | pmulhrsw m4, m3 |
| 14559 | pmaddubsw m6, m5, [r5 + 12 * 16] |
| 14560 | pmulhrsw m6, m3 |
| 14561 | packuswb m4, m6 |
| 14562 | movu [r0 + 273 * 16], m4 |
| 14563 | |
| 14564 | ; mode 19 [row 2] |
| 14565 | pslldq m0, 2 |
| 14566 | pinsrb m0, [r4 + 1], 1 |
| 14567 | pinsrb m0, [r4 + 2], 0 |
| 14568 | pslldq m5, 2 |
| 14569 | pinsrb m5, [r3 + 7], 1 |
| 14570 | pinsrb m5, [r3 + 6], 0 |
| 14571 | |
| 14572 | ; mode 20 [row 3 - second half] |
| 14573 | pmaddubsw m4, m5, [r5 + 12 * 16] |
| 14574 | pmulhrsw m4, m3 |
| 14575 | packuswb m4, m4 |
| 14576 | movh [r0 + 291 * 16 + 8], m4 |
| 14577 | ; mode 20 [row 3 - second half] end |
| 14578 | |
| 14579 | ; mode 21 [row 3 - second half] |
| 14580 | pmaddubsw m4, m5, [r5 + 28 * 16] |
| 14581 | pmulhrsw m4, m3 |
| 14582 | packuswb m4, m4 |
| 14583 | movh [r0 + 307 * 16 + 8], m4 |
| 14584 | ; mode 21 [row 3 - second half] end |
| 14585 | |
| 14586 | ; mode 21 [row 4 - second half] |
| 14587 | pmaddubsw m4, m5, [r5 + 11 * 16] |
| 14588 | pmulhrsw m4, m3 |
| 14589 | packuswb m4, m4 |
| 14590 | movh [r0 + 308 * 16 + 8], m4 |
| 14591 | ; mode 21 [row 4 - second half] end |
| 14592 | |
| 14593 | ; mode 22 [row 4 - second half] |
| 14594 | pmaddubsw m4, m5, [r5 + 31 * 16] |
| 14595 | pmulhrsw m4, m3 |
| 14596 | packuswb m4, m4 |
| 14597 | movh [r0 + 324 * 16 + 8], m4 |
| 14598 | ; mode 22 [row 4 - second half] end |
| 14599 | |
| 14600 | ; mode 22 [row 5 - second half] |
| 14601 | pmaddubsw m4, m5, [r5 + 18 * 16] |
| 14602 | pmulhrsw m4, m3 |
| 14603 | packuswb m4, m4 |
| 14604 | movh [r0 + 325 * 16 + 8], m4 |
| 14605 | ; mode 22 [row 5 - second half] end |
| 14606 | |
| 14607 | ; mode 22 [row 6 - second half] |
| 14608 | pmaddubsw m4, m5, [r5 + 5 * 16] |
| 14609 | pmulhrsw m4, m3 |
| 14610 | packuswb m4, m4 |
| 14611 | movh [r0 + 326 * 16 + 8], m4 |
| 14612 | ; mode 22 [row 6 - second half] end |
| 14613 | |
| 14614 | ; mode 23 [row 7 - second half] |
| 14615 | pmaddubsw m4, m5, [r5 + 24 * 16] |
| 14616 | pmulhrsw m4, m3 |
| 14617 | packuswb m4, m4 |
| 14618 | movh [r0 + 343 * 16 + 8], m4 |
| 14619 | ; mode 23 [row 7 - second half] end |
| 14620 | |
| 14621 | ; mode 23 [row 8 - second half] |
| 14622 | pmaddubsw m4, m5, [r5 + 15 * 16] |
| 14623 | pmulhrsw m4, m3 |
| 14624 | packuswb m4, m4 |
| 14625 | movh [r0 + 344 * 16 + 8], m4 |
| 14626 | ; mode 23 [row 8 - second half] end |
| 14627 | |
| 14628 | ; mode 23 [row 9 - second half] |
| 14629 | pmaddubsw m4, m5, [r5 + 6 * 16] |
| 14630 | pmulhrsw m4, m3 |
| 14631 | packuswb m4, m4 |
| 14632 | movh [r0 + 345 * 16 + 8], m4 |
| 14633 | ; mode 23 [row 9 - second half] end |
| 14634 | |
| 14635 | ; mode 24 [row 12 - second half] |
| 14636 | pmaddubsw m4, m5, [r5 + 31 * 16] |
| 14637 | pmulhrsw m4, m3 |
| 14638 | packuswb m4, m4 |
| 14639 | movh [r0 + 364 * 16 + 8], m4 |
| 14640 | ; mode 24 [row 12 - second half] end |
| 14641 | |
| 14642 | ; mode 24 [row 13 - second half] |
| 14643 | pmaddubsw m4, m5, [r5 + 26 * 16] |
| 14644 | pmulhrsw m4, m3 |
| 14645 | packuswb m4, m4 |
| 14646 | movh [r0 + 365 * 16 + 8], m4 |
| 14647 | ; mode 24 [row 13 - second half] end |
| 14648 | |
| 14649 | ; mode 24 [row 14 - second half] |
| 14650 | pmaddubsw m4, m5, [r5 + 21 * 16] |
| 14651 | pmulhrsw m4, m3 |
| 14652 | packuswb m4, m4 |
| 14653 | movh [r0 + 366 * 16 + 8], m4 |
| 14654 | ; mode 24 [row 14 - second half] end |
| 14655 | |
| 14656 | ; mode 24 [row 15 - second half] |
| 14657 | pmaddubsw m4, m5, [r5 + 16 * 16] |
| 14658 | pmulhrsw m4, m3 |
| 14659 | packuswb m4, m4 |
| 14660 | movh [r0 + 367 * 16 + 8], m4 |
| 14661 | ; mode 24 [row 15 - second half] end |
| 14662 | |
| 14663 | pmaddubsw m4, m0, [r5 + 18 * 16] |
| 14664 | pmulhrsw m4, m3 |
| 14665 | pmaddubsw m6, m5, [r5 + 18 * 16] |
| 14666 | pmulhrsw m6, m3 |
| 14667 | packuswb m4, m6 |
| 14668 | movu [r0 + 274 * 16], m4 |
| 14669 | |
| 14670 | ; mode 19 [row 3] |
| 14671 | pslldq m0, 2 |
| 14672 | pinsrb m0, [r4 + 2], 1 |
| 14673 | pinsrb m0, [r4 + 4], 0 |
| 14674 | pslldq m5, 2 |
| 14675 | pinsrb m5, [r3 + 6], 1 |
| 14676 | pinsrb m5, [r3 + 5], 0 |
| 14677 | |
| 14678 | ; mode 20 [row 4 - second half] |
| 14679 | pmaddubsw m4, m5, [r5 + 23 * 16] |
| 14680 | pmulhrsw m4, m3 |
| 14681 | packuswb m4, m4 |
| 14682 | movh [r0 + 292 * 16 + 8], m4 |
| 14683 | ; mode 20 [row 4 - second half] end |
| 14684 | |
| 14685 | ; mode 20 [row 5 - second half] |
| 14686 | pmaddubsw m4, m5, [r5 + 2 * 16] |
| 14687 | pmulhrsw m4, m3 |
| 14688 | packuswb m4, m4 |
| 14689 | movh [r0 + 293 * 16 + 8], m4 |
| 14690 | ; mode 20 [row 5 - second half] end |
| 14691 | |
| 14692 | ; mode 21 [row 5 - second half] |
| 14693 | pmaddubsw m4, m5, [r5 + 26 * 16] |
| 14694 | pmulhrsw m4, m3 |
| 14695 | packuswb m4, m4 |
| 14696 | movh [r0 + 309 * 16 + 8], m4 |
| 14697 | ; mode 21 [row 5 - second half] end |
| 14698 | |
| 14699 | ; mode 21 [row 6 - second half] |
| 14700 | pmaddubsw m4, m5, [r5 + 9 * 16] |
| 14701 | pmulhrsw m4, m3 |
| 14702 | packuswb m4, m4 |
| 14703 | movh [r0 + 310 * 16 + 8], m4 |
| 14704 | ; mode 21 [row 6 - second half] end |
| 14705 | |
| 14706 | ; mode 22 [row 7 - second half] |
| 14707 | pmaddubsw m4, m5, [r5 + 24 * 16] |
| 14708 | pmulhrsw m4, m3 |
| 14709 | packuswb m4, m4 |
| 14710 | movh [r0 + 327 * 16 + 8], m4 |
| 14711 | ; mode 22 [row 7 - second half] end |
| 14712 | |
| 14713 | ; mode 22 [row 8 - second half] |
| 14714 | pmaddubsw m4, m5, [r5 + 11 * 16] |
| 14715 | pmulhrsw m4, m3 |
| 14716 | packuswb m4, m4 |
| 14717 | movh [r0 + 328 * 16 + 8], m4 |
| 14718 | ; mode 22 [row 8 - second half] end |
| 14719 | |
| 14720 | ; mode 23 [row 10 - second half] |
| 14721 | pmaddubsw m4, m5, [r5 + 29 * 16] |
| 14722 | pmulhrsw m4, m3 |
| 14723 | packuswb m4, m4 |
| 14724 | movh [r0 + 346 * 16 + 8], m4 |
| 14725 | ; mode 23 [row 10 - second half] end |
| 14726 | |
| 14727 | ; mode 23 [row 11 - second half] |
| 14728 | pmaddubsw m4, m5, [r5 + 20 * 16] |
| 14729 | pmulhrsw m4, m3 |
| 14730 | packuswb m4, m4 |
| 14731 | movh [r0 + 347 * 16 + 8], m4 |
| 14732 | ; mode 23 [row 11 - second half] end |
| 14733 | |
| 14734 | ; mode 23 [row 12 - second half] |
| 14735 | pmaddubsw m4, m5, [r5 + 11 * 16] |
| 14736 | pmulhrsw m4, m3 |
| 14737 | packuswb m4, m4 |
| 14738 | movh [r0 + 348 * 16 + 8], m4 |
| 14739 | ; mode 23 [row 12 - second half] end |
| 14740 | |
| 14741 | ; mode 23 [row 13 - second half] |
| 14742 | pmaddubsw m4, m5, [r5 + 2 * 16] |
| 14743 | pmulhrsw m4, m3 |
| 14744 | packuswb m4, m4 |
| 14745 | movh [r0 + 349 * 16 + 8], m4 |
| 14746 | ; mode 23 [row 13 - second half] end |
| 14747 | |
| 14748 | pmaddubsw m4, m0, [r5 + 24 * 16] |
| 14749 | pmulhrsw m4, m3 |
| 14750 | pmaddubsw m6, m5, [r5 + 24 * 16] |
| 14751 | pmulhrsw m6, m3 |
| 14752 | packuswb m4, m6 |
| 14753 | movu [r0 + 275 * 16], m4 |
| 14754 | |
| 14755 | ; mode 19 [row 4] |
| 14756 | pslldq m0, 2 |
| 14757 | pinsrb m0, [r4 + 4], 1 |
| 14758 | pinsrb m0, [r4 + 5], 0 |
| 14759 | pslldq m5, 2 |
| 14760 | pinsrb m5, [r3 + 5], 1 |
| 14761 | pinsrb m5, [r3 + 4], 0 |
| 14762 | |
| 14763 | ; mode 20 [row 6 - second half] |
| 14764 | pmaddubsw m4, m5, [r5 + 13 * 16] |
| 14765 | pmulhrsw m4, m3 |
| 14766 | packuswb m4, m4 |
| 14767 | movh [r0 + 294 * 16 + 8], m4 |
| 14768 | ; mode 20 [row 6 - second half] end |
| 14769 | |
| 14770 | ; mode 21 [row 7 - second half] |
| 14771 | pmaddubsw m4, m5, [r5 + 24 * 16] |
| 14772 | pmulhrsw m4, m3 |
| 14773 | packuswb m4, m4 |
| 14774 | movh [r0 + 311 * 16 + 8], m4 |
| 14775 | ; mode 21 [row 7 - second half] end |
| 14776 | |
| 14777 | ; mode 21 [row 8 - second half] |
| 14778 | pmaddubsw m4, m5, [r5 + 7 * 16] |
| 14779 | pmulhrsw m4, m3 |
| 14780 | packuswb m4, m4 |
| 14781 | movh [r0 + 312 * 16 + 8], m4 |
| 14782 | ; mode 21 [row 8 - second half] end |
| 14783 | |
| 14784 | ; mode 22 [row 9 - second half] |
| 14785 | pmaddubsw m4, m5, [r5 + 30 * 16] |
| 14786 | pmulhrsw m4, m3 |
| 14787 | packuswb m4, m4 |
| 14788 | movh [r0 + 329 * 16 + 8], m4 |
| 14789 | ; mode 22 [row 9 - second half] end |
| 14790 | |
| 14791 | ; mode 22 [row 10 - second half] |
| 14792 | pmaddubsw m4, m5, [r5 + 17 * 16] |
| 14793 | pmulhrsw m4, m3 |
| 14794 | packuswb m4, m4 |
| 14795 | movh [r0 + 330 * 16 + 8], m4 |
| 14796 | ; mode 22 [row 10 - second half] end |
| 14797 | |
| 14798 | ; mode 22 [row 11 - second half] |
| 14799 | pmaddubsw m4, m5, [r5 + 4 * 16] |
| 14800 | pmulhrsw m4, m3 |
| 14801 | packuswb m4, m4 |
| 14802 | movh [r0 + 331 * 16 + 8], m4 |
| 14803 | ; mode 22 [row 11 - second half] end |
| 14804 | |
| 14805 | ; mode 23 [row 14 - second half] |
| 14806 | pmaddubsw m4, m5, [r5 + 25 * 16] |
| 14807 | pmulhrsw m4, m3 |
| 14808 | packuswb m4, m4 |
| 14809 | movh [r0 + 350 * 16 + 8], m4 |
| 14810 | ; mode 23 [row 14 - second half] end |
| 14811 | |
| 14812 | ; mode 23 [row 15 - second half] |
| 14813 | pmaddubsw m4, m5, [r5 + 16 * 16] |
| 14814 | pmulhrsw m4, m3 |
| 14815 | packuswb m4, m4 |
| 14816 | movh [r0 + 351 * 16 + 8], m4 |
| 14817 | |
| 14818 | ; mode 23 [row 15 - second half] end |
| 14819 | pmaddubsw m4, m0, [r5 + 30 * 16] |
| 14820 | pmulhrsw m4, m3 |
| 14821 | pmaddubsw m6, m5, [r5 + 30 * 16] |
| 14822 | pmulhrsw m6, m3 |
| 14823 | packuswb m4, m6 |
| 14824 | movu [r0 + 276 * 16], m4 |
| 14825 | |
| 14826 | ; mode 19 [row 5] |
| 14827 | pmaddubsw m4, m0, [r5 + 4 * 16] |
| 14828 | pmulhrsw m4, m3 |
| 14829 | pmaddubsw m6, m5, [r5 + 4 * 16] |
| 14830 | pmulhrsw m6, m3 |
| 14831 | packuswb m4, m6 |
| 14832 | movu [r0 + 277 * 16], m4 |
| 14833 | |
| 14834 | ; mode 19 [row 6] |
| 14835 | pslldq m0, 2 |
| 14836 | pinsrb m0, [r4 + 5], 1 |
| 14837 | pinsrb m0, [r4 + 6], 0 |
| 14838 | pslldq m5, 2 |
| 14839 | pinsrb m5, [r3 + 4], 1 |
| 14840 | pinsrb m5, [r3 + 3], 0 |
| 14841 | |
| 14842 | ; mode 20 [row 7 - second half] |
| 14843 | pmaddubsw m4, m5, [r5 + 24 * 16] |
| 14844 | pmulhrsw m4, m3 |
| 14845 | packuswb m4, m4 |
| 14846 | movh [r0 + 295 * 16 + 8], m4 |
| 14847 | ; mode 20 [row 7 - second half] end |
| 14848 | |
| 14849 | ; mode 20 [row 8 - second half] |
| 14850 | pmaddubsw m4, m5, [r5 + 3 * 16] |
| 14851 | pmulhrsw m4, m3 |
| 14852 | packuswb m4, m4 |
| 14853 | movh [r0 + 296 * 16 + 8], m4 |
| 14854 | ; mode 20 [row 8 - second half] end |
| 14855 | |
| 14856 | ; mode 21 [row 9 - second half] |
| 14857 | pmaddubsw m4, m5, [r5 + 22 * 16] |
| 14858 | pmulhrsw m4, m3 |
| 14859 | packuswb m4, m4 |
| 14860 | movh [r0 + 313 * 16 + 8], m4 |
| 14861 | ; mode 21 [row 9 - second half] end |
| 14862 | |
| 14863 | ; mode 21 [row 10 - second half] |
| 14864 | pmaddubsw m4, m5, [r5 + 5 * 16] |
| 14865 | pmulhrsw m4, m3 |
| 14866 | packuswb m4, m4 |
| 14867 | movh [r0 + 314 * 16 + 8], m4 |
| 14868 | ; mode 21 [row 10 - second half] end |
| 14869 | |
| 14870 | ; mode 22 [row 12 - second half] |
| 14871 | pmaddubsw m4, m5, [r5 + 23 * 16] |
| 14872 | pmulhrsw m4, m3 |
| 14873 | packuswb m4, m4 |
| 14874 | movh [r0 + 332 * 16 + 8], m4 |
| 14875 | ; mode 22 [row 12 - second half] end |
| 14876 | |
| 14877 | ; mode 22 [row 13 - second half] |
| 14878 | pmaddubsw m4, m5, [r5 + 10 * 16] |
| 14879 | pmulhrsw m4, m3 |
| 14880 | packuswb m4, m4 |
| 14881 | movh [r0 + 333 * 16 + 8], m4 |
| 14882 | ; mode 22 [row 13 - second half] end |
| 14883 | |
| 14884 | pmaddubsw m4, m0, [r5 + 10 * 16] |
| 14885 | pmulhrsw m4, m3 |
| 14886 | pmaddubsw m6, m5, [r5 + 10 * 16] |
| 14887 | pmulhrsw m6, m3 |
| 14888 | packuswb m4, m6 |
| 14889 | movu [r0 + 278 * 16], m4 |
| 14890 | |
| 14891 | ; mode 19 [row 7] |
| 14892 | pslldq m0, 2 |
| 14893 | pinsrb m0, [r4 + 6], 1 |
| 14894 | pinsrb m0, [r4 + 7], 0 |
| 14895 | pslldq m5, 2 |
| 14896 | pinsrb m5, [r3 + 3], 1 |
| 14897 | pinsrb m5, [r3 + 2], 0 |
| 14898 | |
| 14899 | ; mode 20 [row 9 - second half] |
| 14900 | pmaddubsw m4, m5, [r5 + 14 * 16] |
| 14901 | pmulhrsw m4, m3 |
| 14902 | packuswb m4, m4 |
| 14903 | movh [r0 + 297 * 16 + 8], m4 |
| 14904 | ; mode 20 [row 9 - second half] end |
| 14905 | |
| 14906 | ; mode 21 [row 11 - second half] |
| 14907 | pmaddubsw m4, m5, [r5 + 20 * 16] |
| 14908 | pmulhrsw m4, m3 |
| 14909 | packuswb m4, m4 |
| 14910 | movh [r0 + 315 * 16 + 8], m4 |
| 14911 | ; mode 21 [row 11 - second half] end |
| 14912 | |
| 14913 | ; mode 21 [row 12 - second half] |
| 14914 | pmaddubsw m4, m5, [r5 + 3 * 16] |
| 14915 | pmulhrsw m4, m3 |
| 14916 | packuswb m4, m4 |
| 14917 | movh [r0 + 316 * 16 + 8], m4 |
| 14918 | ; mode 21 [row 12 - second half] end |
| 14919 | |
| 14920 | ; mode 22 [row 14 - second half] |
| 14921 | pmaddubsw m4, m5, [r5 + 29 * 16] |
| 14922 | pmulhrsw m4, m3 |
| 14923 | packuswb m4, m4 |
| 14924 | movh [r0 + 334 * 16 + 8], m4 |
| 14925 | ; mode 22 [row 14 - second half] end |
| 14926 | |
| 14927 | ; mode 22 [row 15 - second half] |
| 14928 | pmaddubsw m4, m5, [r5 + 16 * 16] |
| 14929 | pmulhrsw m4, m3 |
| 14930 | packuswb m4, m4 |
| 14931 | movh [r0 + 335 * 16 + 8], m4 |
| 14932 | ; mode 22 [row 15 - second half] end |
| 14933 | |
| 14934 | pmaddubsw m4, m0, [r5 + 16 * 16] |
| 14935 | pmulhrsw m4, m3 |
| 14936 | pmaddubsw m6, m5, [r5 + 16 * 16] |
| 14937 | pmulhrsw m6, m3 |
| 14938 | packuswb m4, m6 |
| 14939 | movu [r0 + 279 * 16], m4 |
| 14940 | |
| 14941 | ; mode 19 [row 8] |
| 14942 | pslldq m0, 2 |
| 14943 | pinsrb m0, [r4 + 7], 1 |
| 14944 | pinsrb m0, [r4 + 9], 0 |
| 14945 | pslldq m5, 2 |
| 14946 | pinsrb m5, [r3 + 2], 1 |
| 14947 | pinsrb m5, [r3 + 1], 0 |
| 14948 | |
| 14949 | ; mode 20 [row 10 - second half] |
| 14950 | pmaddubsw m4, m5, [r5 + 25 * 16] |
| 14951 | pmulhrsw m4, m3 |
| 14952 | packuswb m4, m4 |
| 14953 | movh [r0 + 298 * 16 + 8], m4 |
| 14954 | ; mode 20 [row 10 - second half] end |
| 14955 | |
| 14956 | ; mode 20 [row 11 - second half] |
| 14957 | pmaddubsw m4, m5, [r5 + 4 * 16] |
| 14958 | pmulhrsw m4, m3 |
| 14959 | packuswb m4, m4 |
| 14960 | movh [r0 + 299 * 16 + 8], m4 |
| 14961 | ; mode 20 [row 11 - second half] end |
| 14962 | |
| 14963 | ; mode 21 [row 13 - second half] |
| 14964 | pmaddubsw m4, m5, [r5 + 18 * 16] |
| 14965 | pmulhrsw m4, m3 |
| 14966 | packuswb m4, m4 |
| 14967 | movh [r0 + 317 * 16 + 8], m4 |
| 14968 | ; mode 21 [row 13 - second half] end |
| 14969 | |
| 14970 | ; mode 21 [row 14 - second half] |
| 14971 | pmaddubsw m4, m5, [r5 + 1 * 16] |
| 14972 | pmulhrsw m4, m3 |
| 14973 | packuswb m4, m4 |
| 14974 | movh [r0 + 318 * 16 + 8], m4 |
| 14975 | ; mode 21 [row 14 - second half] end |
| 14976 | |
| 14977 | pmaddubsw m4, m0, [r5 + 22 * 16] |
| 14978 | pmulhrsw m4, m3 |
| 14979 | pmaddubsw m6, m5, [r5 + 22 * 16] |
| 14980 | pmulhrsw m6, m3 |
| 14981 | packuswb m4, m6 |
| 14982 | movu [r0 + 280 * 16], m4 |
| 14983 | |
| 14984 | ; mode 19 [row 9] |
| 14985 | pslldq m0, 2 |
| 14986 | pinsrb m0, [r4 + 9], 1 |
| 14987 | pinsrb m0, [r4 + 10], 0 |
| 14988 | pslldq m5, 2 |
| 14989 | pinsrb m5, [r3 + 1], 1 |
| 14990 | pinsrb m5, [r3 + 0], 0 |
| 14991 | |
| 14992 | ; mode 20 [row 12 - second half] |
| 14993 | pmaddubsw m4, m5, [r5 + 15 * 16] |
| 14994 | pmulhrsw m4, m3 |
| 14995 | packuswb m4, m4 |
| 14996 | movh [r0 + 300 * 16 + 8], m4 |
| 14997 | |
| 14998 | ; mode 20 [row 12 - second half] end |
| 14999 | pmaddubsw m4, m0, [r5 + 28 * 16] |
| 15000 | pmulhrsw m4, m3 |
| 15001 | pmaddubsw m6, m5, [r5 + 28 * 16] |
| 15002 | pmulhrsw m6, m3 |
| 15003 | packuswb m4, m6 |
| 15004 | movu [r0 + 281 * 16], m4 |
| 15005 | |
| 15006 | ; mode 19 [row 10] |
| 15007 | pmaddubsw m4, m0, [r5 + 2 * 16] |
| 15008 | pmulhrsw m4, m3 |
| 15009 | pmaddubsw m6, m5, [r5 + 2 * 16] |
| 15010 | pmulhrsw m6, m3 |
| 15011 | packuswb m4, m6 |
| 15012 | movu [r0 + 282 * 16], m4 |
| 15013 | |
| 15014 | ; mode 19 [row 11] |
| 15015 | pslldq m0, 2 |
| 15016 | pinsrb m0, [r4 + 10], 1 |
| 15017 | pinsrb m0, [r4 + 11], 0 |
| 15018 | pmaddubsw m4, m0, [r5 + 8 * 16] |
| 15019 | pmulhrsw m4, m3 |
| 15020 | pslldq m5, 2 |
| 15021 | pinsrb m5, [r4 + 0], 1 |
| 15022 | pinsrb m5, [r4 + 1], 0 |
| 15023 | pmaddubsw m6, m5, [r5 + 8 * 16] |
| 15024 | pmulhrsw m6, m3 |
| 15025 | packuswb m4, m6 |
| 15026 | movu [r0 + 283 * 16], m4 |
| 15027 | |
| 15028 | ; mode 19 [row 12] |
| 15029 | pslldq m0, 2 |
| 15030 | pinsrb m0, [r4 + 11], 1 |
| 15031 | pinsrb m0, [r4 + 12], 0 |
| 15032 | pslldq m5, 2 |
| 15033 | pinsrb m5, [r4 + 1], 1 |
| 15034 | pinsrb m5, [r4 + 2], 0 |
| 15035 | pmaddubsw m4, m0, [r5 + 14 * 16] |
| 15036 | pmulhrsw m4, m3 |
| 15037 | pmaddubsw m6, m5, [r5 + 14 * 16] |
| 15038 | pmulhrsw m6, m3 |
| 15039 | packuswb m4, m6 |
| 15040 | movu [r0 + 284 * 16], m4 |
| 15041 | |
| 15042 | ; mode 19 [row 13] |
| 15043 | pslldq m0, 2 |
| 15044 | pinsrb m0, [r4 + 12], 1 |
| 15045 | pinsrb m0, [r4 + 14], 0 |
| 15046 | pmaddubsw m4, m0, [r5 + 20 * 16] |
| 15047 | pmulhrsw m4, m3 |
| 15048 | pslldq m5, 2 |
| 15049 | pinsrb m5, [r4 + 2], 1 |
| 15050 | pinsrb m5, [r4 + 4], 0 |
| 15051 | pmaddubsw m6, m5, [r5 + 20 * 16] |
| 15052 | pmulhrsw m6, m3 |
| 15053 | packuswb m4, m6 |
| 15054 | movu [r0 + 285 * 16], m4 |
| 15055 | |
| 15056 | ; mode 19 [row 14] |
| 15057 | pslldq m0, 2 |
| 15058 | pinsrb m0, [r4 + 14], 1 |
| 15059 | pinsrb m0, [r4 + 15], 0 |
| 15060 | pmaddubsw m4, m0, [r5 + 26 * 16] |
| 15061 | pmulhrsw m4, m3 |
| 15062 | pslldq m5, 2 |
| 15063 | pinsrb m5, [r4 + 4], 1 |
| 15064 | pinsrb m5, [r4 + 5], 0 |
| 15065 | pmaddubsw m6, m5, [r5 + 26 * 16] |
| 15066 | pmulhrsw m6, m3 |
| 15067 | packuswb m4, m6 |
| 15068 | movu [r0 + 286 * 16], m4 |
| 15069 | |
| 15070 | ; mode 19 [row 15] |
| 15071 | movu m0, [r4] |
| 15072 | pshufb m0, [tab_S1] |
| 15073 | movu [r0 + 287 * 16], m0 |
| 15074 | movd m1, [r3] |
| 15075 | movd [r0 + 287 * 16 + 12], m1 |
| 15076 | |
| 15077 | ; mode 25 |
| 15078 | movu m1, [r1] |
| 15079 | |
| 15080 | ; mode 26 [all rows] |
| 15081 | psrldq m6, m1, 1 |
| 15082 | pinsrb m6, [r1 + 16], 15 |
| 15083 | movu m7, m6 |
| 15084 | movu [r0 + 384 * 16], m6 |
| 15085 | movu [r0 + 385 * 16], m6 |
| 15086 | movu [r0 + 386 * 16], m6 |
| 15087 | movu [r0 + 387 * 16], m6 |
| 15088 | movu [r0 + 388 * 16], m6 |
| 15089 | movu [r0 + 389 * 16], m6 |
| 15090 | movu [r0 + 390 * 16], m6 |
| 15091 | movu [r0 + 391 * 16], m6 |
| 15092 | movu [r0 + 392 * 16], m6 |
| 15093 | movu [r0 + 393 * 16], m6 |
| 15094 | movu [r0 + 394 * 16], m6 |
| 15095 | movu [r0 + 395 * 16], m6 |
| 15096 | movu [r0 + 396 * 16], m6 |
| 15097 | movu [r0 + 397 * 16], m6 |
| 15098 | movu [r0 + 398 * 16], m6 |
| 15099 | movu [r0 + 399 * 16], m6 |
| 15100 | |
| 15101 | pxor m0, m0 |
| 15102 | pshufb m6, m6, m0 |
| 15103 | punpcklbw m6, m0 |
| 15104 | movu m2, [r2] |
| 15105 | pshufb m2, m2, m0 |
| 15106 | punpcklbw m2, m0 |
| 15107 | movu m4, [r2 + 1] |
| 15108 | punpcklbw m5, m4, m0 |
| 15109 | punpckhbw m4, m0 |
| 15110 | psubw m5, m2 |
| 15111 | psubw m4, m2 |
| 15112 | psraw m5, 1 |
| 15113 | psraw m4, 1 |
| 15114 | paddw m5, m6 |
| 15115 | paddw m4, m6 |
| 15116 | packuswb m5, m4 |
| 15117 | |
| 15118 | pextrb [r0 + 384 * 16], m5, 0 |
| 15119 | pextrb [r0 + 385 * 16], m5, 1 |
| 15120 | pextrb [r0 + 386 * 16], m5, 2 |
| 15121 | pextrb [r0 + 387 * 16], m5, 3 |
| 15122 | pextrb [r0 + 388 * 16], m5, 4 |
| 15123 | pextrb [r0 + 389 * 16], m5, 5 |
| 15124 | pextrb [r0 + 390 * 16], m5, 6 |
| 15125 | pextrb [r0 + 391 * 16], m5, 7 |
| 15126 | pextrb [r0 + 392 * 16], m5, 8 |
| 15127 | pextrb [r0 + 393 * 16], m5, 9 |
| 15128 | pextrb [r0 + 394 * 16], m5, 10 |
| 15129 | pextrb [r0 + 395 * 16], m5, 11 |
| 15130 | pextrb [r0 + 396 * 16], m5, 12 |
| 15131 | pextrb [r0 + 397 * 16], m5, 13 |
| 15132 | pextrb [r0 + 398 * 16], m5, 14 |
| 15133 | pextrb [r0 + 399 * 16], m5, 15 |
| 15134 | |
| 15135 | ; mode 25 [row 15] |
| 15136 | movu [r0 + 383 * 16], m1 |
| 15137 | |
| 15138 | ; mode 25 [row 0] |
| 15139 | psrldq m2, m1, 1 |
| 15140 | punpcklbw m1, m2 |
| 15141 | movu m2, [r1 + 8] |
| 15142 | psrldq m4, m2, 1 |
| 15143 | punpcklbw m2, m4 |
| 15144 | pmaddubsw m4, m1, [r5 + 30 * 16] |
| 15145 | pmulhrsw m4, m3 |
| 15146 | pmaddubsw m5, m2, [r5 + 30 * 16] |
| 15147 | pmulhrsw m5, m3 |
| 15148 | packuswb m4, m5 |
| 15149 | movu [r0 + 368 * 16], m4 |
| 15150 | |
| 15151 | ; mode 25 [row 1] |
| 15152 | pmaddubsw m4, m1, [r5 + 28 * 16] |
| 15153 | pmulhrsw m4, m3 |
| 15154 | pmaddubsw m5, m2, [r5 + 28 * 16] |
| 15155 | pmulhrsw m5, m3 |
| 15156 | packuswb m4, m5 |
| 15157 | movu [r0 + 369 * 16], m4 |
| 15158 | |
| 15159 | ; mode 25 [row 2] |
| 15160 | pmaddubsw m4, m1, [r5 + 26 * 16] |
| 15161 | pmulhrsw m4, m3 |
| 15162 | pmaddubsw m5, m2, [r5 + 26 * 16] |
| 15163 | pmulhrsw m5, m3 |
| 15164 | packuswb m4, m5 |
| 15165 | movu [r0 + 370 * 16], m4 |
| 15166 | |
| 15167 | ; mode 25 [row 3] |
| 15168 | pmaddubsw m4, m1, [r5 + 24 * 16] |
| 15169 | pmulhrsw m4, m3 |
| 15170 | pmaddubsw m5, m2, [r5 + 24 * 16] |
| 15171 | pmulhrsw m5, m3 |
| 15172 | packuswb m4, m5 |
| 15173 | movu [r0 + 371 * 16], m4 |
| 15174 | |
| 15175 | ; mode 25 [row 4] |
| 15176 | pmaddubsw m4, m1, [r5 + 22 * 16] |
| 15177 | pmulhrsw m4, m3 |
| 15178 | pmaddubsw m5, m2, [r5 + 22 * 16] |
| 15179 | pmulhrsw m5, m3 |
| 15180 | packuswb m4, m5 |
| 15181 | movu [r0 + 372 * 16], m4 |
| 15182 | |
| 15183 | ; mode 25 [row 5] |
| 15184 | pmaddubsw m4, m1, [r5 + 20 * 16] |
| 15185 | pmulhrsw m4, m3 |
| 15186 | pmaddubsw m5, m2, [r5 + 20 * 16] |
| 15187 | pmulhrsw m5, m3 |
| 15188 | packuswb m4, m5 |
| 15189 | movu [r0 + 373 * 16], m4 |
| 15190 | |
| 15191 | ; mode 25 [row 6] |
| 15192 | pmaddubsw m4, m1, [r5 + 18 * 16] |
| 15193 | pmulhrsw m4, m3 |
| 15194 | pmaddubsw m5, m2, [r5 + 18 * 16] |
| 15195 | pmulhrsw m5, m3 |
| 15196 | packuswb m4, m5 |
| 15197 | movu [r0 + 374 * 16], m4 |
| 15198 | |
| 15199 | ; mode 25 [row 7] |
| 15200 | pmaddubsw m4, m1, [r5 + 16 * 16] |
| 15201 | pmulhrsw m4, m3 |
| 15202 | pmaddubsw m5, m2, [r5 + 16 * 16] |
| 15203 | pmulhrsw m5, m3 |
| 15204 | packuswb m4, m5 |
| 15205 | movu [r0 + 375 * 16], m4 |
| 15206 | |
| 15207 | ; mode 25 [row 8] |
| 15208 | pmaddubsw m4, m1, [r5 + 14 * 16] |
| 15209 | pmulhrsw m4, m3 |
| 15210 | pmaddubsw m5, m2, [r5 + 14 * 16] |
| 15211 | pmulhrsw m5, m3 |
| 15212 | packuswb m4, m5 |
| 15213 | movu [r0 + 376 * 16], m4 |
| 15214 | |
| 15215 | ; mode 25 [row 9] |
| 15216 | pmaddubsw m4, m1, [r5 + 12 * 16] |
| 15217 | pmulhrsw m4, m3 |
| 15218 | pmaddubsw m5, m2, [r5 + 12 * 16] |
| 15219 | pmulhrsw m5, m3 |
| 15220 | packuswb m4, m5 |
| 15221 | movu [r0 + 377 * 16], m4 |
| 15222 | |
| 15223 | ; mode 25 [row 10] |
| 15224 | pmaddubsw m4, m1, [r5 + 10 * 16] |
| 15225 | pmulhrsw m4, m3 |
| 15226 | pmaddubsw m5, m2, [r5 + 10 * 16] |
| 15227 | pmulhrsw m5, m3 |
| 15228 | packuswb m4, m5 |
| 15229 | movu [r0 + 378 * 16], m4 |
| 15230 | |
| 15231 | ; mode 25 [row 11] |
| 15232 | pmaddubsw m4, m1, [r5 + 8 * 16] |
| 15233 | pmulhrsw m4, m3 |
| 15234 | pmaddubsw m5, m2, [r5 + 8 * 16] |
| 15235 | pmulhrsw m5, m3 |
| 15236 | packuswb m4, m5 |
| 15237 | movu [r0 + 379 * 16], m4 |
| 15238 | |
| 15239 | ; mode 25 [row 12] |
| 15240 | pmaddubsw m4, m1, [r5 + 6 * 16] |
| 15241 | pmulhrsw m4, m3 |
| 15242 | pmaddubsw m5, m2, [r5 + 6 * 16] |
| 15243 | pmulhrsw m5, m3 |
| 15244 | packuswb m4, m5 |
| 15245 | movu [r0 + 380 * 16], m4 |
| 15246 | |
| 15247 | ; mode 25 [row 13] |
| 15248 | pmaddubsw m4, m1, [r5 + 4 * 16] |
| 15249 | pmulhrsw m4, m3 |
| 15250 | pmaddubsw m5, m2, [r5 + 4 * 16] |
| 15251 | pmulhrsw m5, m3 |
| 15252 | packuswb m4, m5 |
| 15253 | movu [r0 + 381 * 16], m4 |
| 15254 | |
| 15255 | ; mode 25 [row 14] |
| 15256 | pmaddubsw m4, m1, [r5 + 2 * 16] |
| 15257 | pmulhrsw m4, m3 |
| 15258 | pmaddubsw m5, m2, [r5 + 2 * 16] |
| 15259 | pmulhrsw m5, m3 |
| 15260 | packuswb m4, m5 |
| 15261 | movu [r0 + 382 * 16], m4 |
| 15262 | |
| 15263 | ; mode 27 [row 15] |
| 15264 | psrldq m6, m7, 1 |
| 15265 | punpcklbw m7, m6 |
| 15266 | pinsrb m6, [r1 + 17], 15 |
| 15267 | movu [r0 + 415 * 16], m6 |
| 15268 | |
| 15269 | ; mode 27 [row 0] |
| 15270 | movu m4, [r1 + 9] |
| 15271 | psrldq m5, m4, 1 |
| 15272 | punpcklbw m4, m5 |
| 15273 | pmaddubsw m6, m7, [r5 + 2 * 16] |
| 15274 | pmulhrsw m6, m3 |
| 15275 | pmaddubsw m5, m4, [r5 + 2 * 16] |
| 15276 | pmulhrsw m5, m3 |
| 15277 | packuswb m6, m5 |
| 15278 | movu [r0 + 400 * 16], m6 |
| 15279 | |
| 15280 | ; mode 27 [row 1] |
| 15281 | pmaddubsw m6, m7, [r5 + 4 * 16] |
| 15282 | pmulhrsw m6, m3 |
| 15283 | pmaddubsw m5, m4, [r5 + 4 * 16] |
| 15284 | pmulhrsw m5, m3 |
| 15285 | packuswb m6, m5 |
| 15286 | movu [r0 + 401 * 16], m6 |
| 15287 | |
| 15288 | ; mode 27 [row 2] |
| 15289 | pmaddubsw m6, m7, [r5 + 6 * 16] |
| 15290 | pmulhrsw m6, m3 |
| 15291 | pmaddubsw m5, m4, [r5 + 6 * 16] |
| 15292 | pmulhrsw m5, m3 |
| 15293 | packuswb m6, m5 |
| 15294 | movu [r0 + 402 * 16], m6 |
| 15295 | |
| 15296 | ; mode 27 [row 3] |
| 15297 | pmaddubsw m6, m7, [r5 + 8 * 16] |
| 15298 | pmulhrsw m6, m3 |
| 15299 | pmaddubsw m5, m4, [r5 + 8 * 16] |
| 15300 | pmulhrsw m5, m3 |
| 15301 | packuswb m6, m5 |
| 15302 | movu [r0 + 403 * 16], m6 |
| 15303 | |
| 15304 | ; mode 27 [row 4] |
| 15305 | pmaddubsw m6, m7, [r5 + 10 * 16] |
| 15306 | pmulhrsw m6, m3 |
| 15307 | pmaddubsw m5, m4, [r5 + 10 * 16] |
| 15308 | pmulhrsw m5, m3 |
| 15309 | packuswb m6, m5 |
| 15310 | movu [r0 + 404 * 16], m6 |
| 15311 | |
| 15312 | ; mode 27 [row 5] |
| 15313 | pmaddubsw m6, m7, [r5 + 12 * 16] |
| 15314 | pmulhrsw m6, m3 |
| 15315 | pmaddubsw m5, m4, [r5 + 12 * 16] |
| 15316 | pmulhrsw m5, m3 |
| 15317 | packuswb m6, m5 |
| 15318 | movu [r0 + 405 * 16], m6 |
| 15319 | |
| 15320 | ; mode 27 [row 6] |
| 15321 | pmaddubsw m6, m7, [r5 + 14 * 16] |
| 15322 | pmulhrsw m6, m3 |
| 15323 | pmaddubsw m5, m4, [r5 + 14 * 16] |
| 15324 | pmulhrsw m5, m3 |
| 15325 | packuswb m6, m5 |
| 15326 | movu [r0 + 406 * 16], m6 |
| 15327 | |
| 15328 | ; mode 27 [row 7] |
| 15329 | pmaddubsw m6, m7, [r5 + 16 * 16] |
| 15330 | pmulhrsw m6, m3 |
| 15331 | pmaddubsw m5, m4, [r5 + 16 * 16] |
| 15332 | pmulhrsw m5, m3 |
| 15333 | packuswb m6, m5 |
| 15334 | movu [r0 + 407 * 16], m6 |
| 15335 | |
| 15336 | ; mode 27 [row 8] |
| 15337 | pmaddubsw m6, m7, [r5 + 18 * 16] |
| 15338 | pmulhrsw m6, m3 |
| 15339 | pmaddubsw m5, m4, [r5 + 18 * 16] |
| 15340 | pmulhrsw m5, m3 |
| 15341 | packuswb m6, m5 |
| 15342 | movu [r0 + 408 * 16], m6 |
| 15343 | |
| 15344 | ; mode 27 [row 9] |
| 15345 | pmaddubsw m6, m7, [r5 + 20 * 16] |
| 15346 | pmulhrsw m6, m3 |
| 15347 | pmaddubsw m5, m4, [r5 + 20 * 16] |
| 15348 | pmulhrsw m5, m3 |
| 15349 | packuswb m6, m5 |
| 15350 | movu [r0 + 409 * 16], m6 |
| 15351 | |
| 15352 | ; mode 27 [row 10] |
| 15353 | pmaddubsw m6, m7, [r5 + 22 * 16] |
| 15354 | pmulhrsw m6, m3 |
| 15355 | pmaddubsw m5, m4, [r5 + 22 * 16] |
| 15356 | pmulhrsw m5, m3 |
| 15357 | packuswb m6, m5 |
| 15358 | movu [r0 + 410 * 16], m6 |
| 15359 | |
| 15360 | ; mode 27 [row 11] |
| 15361 | pmaddubsw m6, m7, [r5 + 24 * 16] |
| 15362 | pmulhrsw m6, m3 |
| 15363 | pmaddubsw m5, m4, [r5 + 24 * 16] |
| 15364 | pmulhrsw m5, m3 |
| 15365 | packuswb m6, m5 |
| 15366 | movu [r0 + 411 * 16], m6 |
| 15367 | |
| 15368 | ; mode 27 [row 12] |
| 15369 | pmaddubsw m6, m7, [r5 + 26 * 16] |
| 15370 | pmulhrsw m6, m3 |
| 15371 | pmaddubsw m5, m4, [r5 + 26 * 16] |
| 15372 | pmulhrsw m5, m3 |
| 15373 | packuswb m6, m5 |
| 15374 | movu [r0 + 412 * 16], m6 |
| 15375 | |
| 15376 | ; mode 27 [row 13] |
| 15377 | pmaddubsw m6, m7, [r5 + 28 * 16] |
| 15378 | pmulhrsw m6, m3 |
| 15379 | pmaddubsw m5, m4, [r5 + 28 * 16] |
| 15380 | pmulhrsw m5, m3 |
| 15381 | packuswb m6, m5 |
| 15382 | movu [r0 + 413 * 16], m6 |
| 15383 | |
| 15384 | ; mode 27 [row 14] |
| 15385 | pmaddubsw m6, m7, [r5 + 30 * 16] |
| 15386 | pmulhrsw m6, m3 |
| 15387 | pmaddubsw m5, m4, [r5 + 30 * 16] |
| 15388 | pmulhrsw m5, m3 |
| 15389 | packuswb m6, m5 |
| 15390 | movu [r0 + 414 * 16], m6 |
| 15391 | |
; mode 28 [row 0]
; Build the interleaved reference pairs for the un-shifted window:
;   m1 = { [r3+1],[r3+2], [r3+2],[r3+3], ... }   pairs for columns 0-7
;   m4 = same pairing starting at [r3 + 9]       pairs for columns 8-15
; m1/m4 stay live and are reused by the rows that follow until the window
; is advanced into m7/m6.
; Each row is: pmaddubsw with the coefficient row at [r5 + frac * 16],
; pmulhrsw by m3 (loaded before this point; presumably the rounding
; constant -- confirm against the function prologue), then pack to 16 bytes.
; The row is computed and stored exactly once; an identical second copy of
; the multiply/round/pack/store sequence was removed as redundant.
    movu        m1, [r3 + 1]
    psrldq      m2, m1, 1
    punpcklbw   m1, m2
    movu        m4, [r3 + 9]
    psrldq      m5, m4, 1
    punpcklbw   m4, m5
    pmaddubsw   m2, m1, [r5 + 5 * 16]
    pmulhrsw    m2, m3
    pmaddubsw   m5, m4, [r5 + 5 * 16]
    pmulhrsw    m5, m3
    packuswb    m2, m5
    movu        [r0 + 416 * 16], m2
| 15413 | |
| 15414 | ; mode 28 [row 1] |
| 15415 | pmaddubsw m2, m1, [r5 + 10 * 16] |
| 15416 | pmulhrsw m2, m3 |
| 15417 | pmaddubsw m5, m4, [r5 + 10 * 16] |
| 15418 | pmulhrsw m5, m3 |
| 15419 | packuswb m2, m5 |
| 15420 | movu [r0 + 417 * 16], m2 |
| 15421 | |
| 15422 | ; mode 28 [row 2] |
| 15423 | pmaddubsw m2, m1, [r5 + 15 * 16] |
| 15424 | pmulhrsw m2, m3 |
| 15425 | pmaddubsw m5, m4, [r5 + 15 * 16] |
| 15426 | pmulhrsw m5, m3 |
| 15427 | packuswb m2, m5 |
| 15428 | movu [r0 + 418 * 16], m2 |
| 15429 | |
| 15430 | ; mode 28 [row 3] |
| 15431 | pmaddubsw m2, m1, [r5 + 20 * 16] |
| 15432 | pmulhrsw m2, m3 |
| 15433 | pmaddubsw m5, m4, [r5 + 20 * 16] |
| 15434 | pmulhrsw m5, m3 |
| 15435 | packuswb m2, m5 |
| 15436 | movu [r0 + 419 * 16], m2 |
| 15437 | |
| 15438 | ; mode 28 [row 4] |
| 15439 | pmaddubsw m2, m1, [r5 + 25 * 16] |
| 15440 | pmulhrsw m2, m3 |
| 15441 | pmaddubsw m5, m4, [r5 + 25 * 16] |
| 15442 | pmulhrsw m5, m3 |
| 15443 | packuswb m2, m5 |
| 15444 | movu [r0 + 420 * 16], m2 |
| 15445 | |
| 15446 | ; mode 28 [row 5] |
| 15447 | pmaddubsw m2, m1, [r5 + 30 * 16] |
| 15448 | pmulhrsw m2, m3 |
| 15449 | pmaddubsw m5, m4, [r5 + 30 * 16] |
| 15450 | pmulhrsw m5, m3 |
| 15451 | packuswb m2, m5 |
| 15452 | movu [r0 + 421 * 16], m2 |
| 15453 | |
| 15454 | ; mode 29 [row 0] |
| 15455 | pmaddubsw m2, m1, [r5 + 9 * 16] |
| 15456 | pmulhrsw m2, m3 |
| 15457 | pmaddubsw m5, m4, [r5 + 9 * 16] |
| 15458 | pmulhrsw m5, m3 |
| 15459 | packuswb m2, m5 |
| 15460 | movu [r0 + 432 * 16], m2 |
| 15461 | |
| 15462 | ; mode 29 [row 1] |
| 15463 | pmaddubsw m2, m1, [r5 + 18 * 16] |
| 15464 | pmulhrsw m2, m3 |
| 15465 | pmaddubsw m5, m4, [r5 + 18 * 16] |
| 15466 | pmulhrsw m5, m3 |
| 15467 | packuswb m2, m5 |
| 15468 | movu [r0 + 433 * 16], m2 |
| 15469 | |
| 15470 | ; mode 29 [row 2] |
| 15471 | pmaddubsw m2, m1, [r5 + 27 * 16] |
| 15472 | pmulhrsw m2, m3 |
| 15473 | pmaddubsw m5, m4, [r5 + 27 * 16] |
| 15474 | pmulhrsw m5, m3 |
| 15475 | packuswb m2, m5 |
| 15476 | movu [r0 + 434 * 16], m2 |
| 15477 | |
| 15478 | ; mode 30 [row 0] |
| 15479 | pmaddubsw m2, m1, [r5 + 13 * 16] |
| 15480 | pmulhrsw m2, m3 |
| 15481 | pmaddubsw m5, m4, [r5 + 13 * 16] |
| 15482 | pmulhrsw m5, m3 |
| 15483 | packuswb m2, m5 |
| 15484 | movu [r0 + 448 * 16], m2 |
| 15485 | |
| 15486 | ; mode 30 [row 1] |
| 15487 | pmaddubsw m2, m1, [r5 + 26 * 16] |
| 15488 | pmulhrsw m2, m3 |
| 15489 | pmaddubsw m5, m4, [r5 + 26 * 16] |
| 15490 | pmulhrsw m5, m3 |
| 15491 | packuswb m2, m5 |
| 15492 | movu [r0 + 449 * 16], m2 |
| 15493 | |
| 15494 | ; mode 33 [row 0] |
| 15495 | movu [r0 + 496 * 16], m2 |
| 15496 | |
| 15497 | ; mode 31 [row 0] |
| 15498 | pmaddubsw m2, m1, [r5 + 17 * 16] |
| 15499 | pmulhrsw m2, m3 |
| 15500 | pmaddubsw m5, m4, [r5 + 17 * 16] |
| 15501 | pmulhrsw m5, m3 |
| 15502 | packuswb m2, m5 |
| 15503 | movu [r0 + 464 * 16], m2 |
| 15504 | |
| 15505 | ; mode 32 [row 0] |
| 15506 | pmaddubsw m2, m1, [r5 + 21 * 16] |
| 15507 | pmulhrsw m2, m3 |
| 15508 | pmaddubsw m5, m4, [r5 + 21 * 16] |
| 15509 | pmulhrsw m5, m3 |
| 15510 | packuswb m2, m5 |
| 15511 | movu [r0 + 480 * 16], m2 |
| 15512 | |
| 15513 | ; mode 28 [row 6] |
| 15514 | movd m7, [r3 + 9] |
| 15515 | palignr m7, m1, 2 |
| 15516 | pmaddubsw m2, m7, [r5 + 3 * 16] |
| 15517 | pmulhrsw m2, m3 |
| 15518 | movd m6, [r3 + 17] |
| 15519 | palignr m6, m4, 2 |
| 15520 | pmaddubsw m5, m6, [r5 + 3 * 16] |
| 15521 | pmulhrsw m5, m3 |
| 15522 | packuswb m2, m5 |
| 15523 | movu [r0 + 422 * 16], m2 |
| 15524 | |
| 15525 | ; mode 28 [row 7] |
| 15526 | pmaddubsw m2, m7, [r5 + 8 * 16] |
| 15527 | pmulhrsw m2, m3 |
| 15528 | pmaddubsw m5, m6, [r5 + 8 * 16] |
| 15529 | pmulhrsw m5, m3 |
| 15530 | packuswb m2, m5 |
| 15531 | movu [r0 + 423 * 16], m2 |
| 15532 | |
| 15533 | ; mode 28 [row 8] |
| 15534 | pmaddubsw m2, m7, [r5 + 13 * 16] |
| 15535 | pmulhrsw m2, m3 |
| 15536 | pmaddubsw m5, m6, [r5 + 13 * 16] |
| 15537 | pmulhrsw m5, m3 |
| 15538 | packuswb m2, m5 |
| 15539 | movu [r0 + 424 * 16], m2 |
| 15540 | |
| 15541 | ; mode 28 [row 9] |
| 15542 | pmaddubsw m2, m7, [r5 + 18 * 16] |
| 15543 | pmulhrsw m2, m3 |
| 15544 | pmaddubsw m5, m6, [r5 + 18 * 16] |
| 15545 | pmulhrsw m5, m3 |
| 15546 | packuswb m2, m5 |
| 15547 | movu [r0 + 425 * 16], m2 |
| 15548 | |
| 15549 | ; mode 28 [row 10] |
| 15550 | pmaddubsw m2, m7, [r5 + 23 * 16] |
| 15551 | pmulhrsw m2, m3 |
| 15552 | pmaddubsw m5, m6, [r5 + 23 * 16] |
| 15553 | pmulhrsw m5, m3 |
| 15554 | packuswb m2, m5 |
| 15555 | movu [r0 + 426 * 16], m2 |
| 15556 | |
| 15557 | ; mode 29 [row 3] |
| 15558 | pmaddubsw m2, m7, [r5 + 4 * 16] |
| 15559 | pmulhrsw m2, m3 |
| 15560 | pmaddubsw m5, m6, [r5 + 4 * 16] |
| 15561 | pmulhrsw m5, m3 |
| 15562 | packuswb m2, m5 |
| 15563 | movu [r0 + 435 * 16], m2 |
| 15564 | |
| 15565 | ; mode 29 [row 4] |
| 15566 | pmaddubsw m2, m7, [r5 + 13 * 16] |
| 15567 | pmulhrsw m2, m3 |
| 15568 | pmaddubsw m5, m6, [r5 + 13 * 16] |
| 15569 | pmulhrsw m5, m3 |
| 15570 | packuswb m2, m5 |
| 15571 | movu [r0 + 436 * 16], m2 |
| 15572 | |
| 15573 | ; mode 29 [row 5] |
| 15574 | pmaddubsw m2, m7, [r5 + 22 * 16] |
| 15575 | pmulhrsw m2, m3 |
| 15576 | pmaddubsw m5, m6, [r5 + 22 * 16] |
| 15577 | pmulhrsw m5, m3 |
| 15578 | packuswb m2, m5 |
| 15579 | movu [r0 + 437 * 16], m2 |
| 15580 | |
| 15581 | ; mode 29 [row 6] |
| 15582 | pmaddubsw m2, m7, [r5 + 31 * 16] |
| 15583 | pmulhrsw m2, m3 |
| 15584 | pmaddubsw m5, m6, [r5 + 31 * 16] |
| 15585 | pmulhrsw m5, m3 |
| 15586 | packuswb m2, m5 |
| 15587 | movu [r0 + 438 * 16], m2 |
| 15588 | |
| 15589 | ; mode 32 [row 2] |
| 15590 | movu [r0 + 482 * 16], m2 |
| 15591 | |
| 15592 | ; mode 30 [row 2] |
| 15593 | pmaddubsw m2, m7, [r5 + 7 * 16] |
| 15594 | pmulhrsw m2, m3 |
| 15595 | pmaddubsw m5, m6, [r5 + 7 * 16] |
| 15596 | pmulhrsw m5, m3 |
| 15597 | packuswb m2, m5 |
| 15598 | movu [r0 + 450 * 16], m2 |
| 15599 | |
| 15600 | ; mode 30 [row 3] |
| 15601 | pmaddubsw m2, m7, [r5 + 20 * 16] |
| 15602 | pmulhrsw m2, m3 |
| 15603 | pmaddubsw m5, m6, [r5 + 20 * 16] |
| 15604 | pmulhrsw m5, m3 |
| 15605 | packuswb m2, m5 |
| 15606 | movu [r0 + 451 * 16], m2 |
| 15607 | |
| 15608 | ; mode 33 [row 1] |
| 15609 | movu [r0 + 497 * 16], m2 |
| 15610 | |
| 15611 | ; mode 31 [row 1] |
| 15612 | pmaddubsw m2, m7, [r5 + 2 * 16] |
| 15613 | pmulhrsw m2, m3 |
| 15614 | pmaddubsw m5, m6, [r5 + 2 * 16] |
| 15615 | pmulhrsw m5, m3 |
| 15616 | packuswb m2, m5 |
| 15617 | movu [r0 + 465 * 16], m2 |
| 15618 | |
| 15619 | ; mode 31 [row 2] |
| 15620 | pmaddubsw m2, m7, [r5 + 19 * 16] |
| 15621 | pmulhrsw m2, m3 |
| 15622 | pmaddubsw m5, m6, [r5 + 19 * 16] |
| 15623 | pmulhrsw m5, m3 |
| 15624 | packuswb m2, m5 |
| 15625 | movu [r0 + 466 * 16], m2 |
| 15626 | |
| 15627 | ; mode 32 [row 1] |
| 15628 | pmaddubsw m2, m7, [r5 + 10 * 16] |
| 15629 | pmulhrsw m2, m3 |
| 15630 | pmaddubsw m5, m6, [r5 + 10 * 16] |
| 15631 | pmulhrsw m5, m3 |
| 15632 | packuswb m2, m5 |
| 15633 | movu [r0 + 481 * 16], m2 |
| 15634 | |
| 15635 | ; mode 28 [row 11] |
| 15636 | pmaddubsw m2, m7, [r5 + 28 * 16] |
| 15637 | pmulhrsw m2, m3 |
| 15638 | pmaddubsw m5, m6, [r5 + 28 * 16] |
| 15639 | pmulhrsw m5, m3 |
| 15640 | packuswb m2, m5 |
| 15641 | movu [r0 + 427 * 16], m2 |
| 15642 | |
| 15643 | ; mode 28 [row 12] |
| 15644 | movd m1, [r3 + 10] |
| 15645 | palignr m1, m7, 2 |
| 15646 | pmaddubsw m2, m1, [r5 + 1 * 16] |
| 15647 | pmulhrsw m2, m3 |
| 15648 | movd m4, [r3 + 18] |
| 15649 | palignr m4, m6, 2 |
| 15650 | pmaddubsw m5, m4, [r5 + 1 * 16] |
| 15651 | pmulhrsw m5, m3 |
| 15652 | packuswb m2, m5 |
| 15653 | movu [r0 + 428 * 16], m2 |
| 15654 | |
| 15655 | ; mode 30 [row 4] |
| 15656 | movu [r0 + 452 * 16], m2 |
| 15657 | |
| 15658 | ; mode 28 [row 13] |
| 15659 | pmaddubsw m2, m1, [r5 + 6 * 16] |
| 15660 | pmulhrsw m2, m3 |
| 15661 | pmaddubsw m5, m4, [r5 + 6 * 16] |
| 15662 | pmulhrsw m5, m3 |
| 15663 | packuswb m2, m5 |
| 15664 | movu [r0 + 429 * 16], m2 |
| 15665 | |
| 15666 | ; mode 28 [row 14] |
| 15667 | pmaddubsw m2, m1, [r5 + 11 * 16] |
| 15668 | pmulhrsw m2, m3 |
| 15669 | pmaddubsw m5, m4, [r5 + 11 * 16] |
| 15670 | pmulhrsw m5, m3 |
| 15671 | packuswb m2, m5 |
| 15672 | movu [r0 + 430 * 16], m2 |
| 15673 | |
| 15674 | ; mode 28 [row 15] |
| 15675 | pmaddubsw m2, m1, [r5 + 16 * 16] |
| 15676 | pmulhrsw m2, m3 |
| 15677 | pmaddubsw m5, m4, [r5 + 16 * 16] |
| 15678 | pmulhrsw m5, m3 |
| 15679 | packuswb m2, m5 |
| 15680 | movu [r0 + 431 * 16], m2 |
| 15681 | |
| 15682 | ; mode 29 [row 7] |
| 15683 | pmaddubsw m2, m1, [r5 + 8 * 16] |
| 15684 | pmulhrsw m2, m3 |
| 15685 | pmaddubsw m5, m4, [r5 + 8 * 16] |
| 15686 | pmulhrsw m5, m3 |
| 15687 | packuswb m2, m5 |
| 15688 | movu [r0 + 439 * 16], m2 |
| 15689 | |
| 15690 | ; mode 29 [row 8] |
| 15691 | pmaddubsw m2, m1, [r5 + 17 * 16] |
| 15692 | pmulhrsw m2, m3 |
| 15693 | pmaddubsw m5, m4, [r5 + 17 * 16] |
| 15694 | pmulhrsw m5, m3 |
| 15695 | packuswb m2, m5 |
| 15696 | movu [r0 + 440 * 16], m2 |
| 15697 | |
| 15698 | ; mode 29 [row 9] |
| 15699 | pmaddubsw m2, m1, [r5 + 26 * 16] |
| 15700 | pmulhrsw m2, m3 |
| 15701 | pmaddubsw m5, m4, [r5 + 26 * 16] |
| 15702 | pmulhrsw m5, m3 |
| 15703 | packuswb m2, m5 |
| 15704 | movu [r0 + 441 * 16], m2 |
| 15705 | |
| 15706 | ; mode 30 [row 5] |
| 15707 | pmaddubsw m2, m1, [r5 + 14 * 16] |
| 15708 | pmulhrsw m2, m3 |
| 15709 | pmaddubsw m5, m4, [r5 + 14 * 16] |
| 15710 | pmulhrsw m5, m3 |
| 15711 | packuswb m2, m5 |
| 15712 | movu [r0 + 453 * 16], m2 |
| 15713 | |
| 15714 | ; mode 33 [row 2] |
| 15715 | movu [r0 + 498 * 16], m2 |
| 15716 | |
| 15717 | ; mode 30 [row 6] |
| 15718 | pmaddubsw m2, m1, [r5 + 27 * 16] |
| 15719 | pmulhrsw m2, m3 |
| 15720 | pmaddubsw m5, m4, [r5 + 27 * 16] |
| 15721 | pmulhrsw m5, m3 |
| 15722 | packuswb m2, m5 |
| 15723 | movu [r0 + 454 * 16], m2 |
| 15724 | |
| 15725 | ; mode 31 [row 3] |
| 15726 | pmaddubsw m2, m1, [r5 + 4 * 16] |
| 15727 | pmulhrsw m2, m3 |
| 15728 | pmaddubsw m5, m4, [r5 + 4 * 16] |
| 15729 | pmulhrsw m5, m3 |
| 15730 | packuswb m2, m5 |
| 15731 | movu [r0 + 467 * 16], m2 |
| 15732 | |
| 15733 | ; mode 31 [row 4] |
| 15734 | pmaddubsw m2, m1, [r5 + 21 * 16] |
| 15735 | pmulhrsw m2, m3 |
| 15736 | pmaddubsw m5, m4, [r5 + 21 * 16] |
| 15737 | pmulhrsw m5, m3 |
| 15738 | packuswb m2, m5 |
| 15739 | movu [r0 + 468 * 16], m2 |
| 15740 | |
| 15741 | ; mode 32 [row 3] |
| 15742 | pmaddubsw m2, m1, [r5 + 20 * 16] |
| 15743 | pmulhrsw m2, m3 |
| 15744 | pmaddubsw m5, m4, [r5 + 20 * 16] |
| 15745 | pmulhrsw m5, m3 |
| 15746 | packuswb m2, m5 |
| 15747 | movu [r0 + 483 * 16], m2 |
| 15748 | |
| 15749 | ; mode 29 [row 10] |
| 15750 | movd m7, [r3 + 11] |
| 15751 | palignr m7, m1, 2 |
| 15752 | pmaddubsw m2, m7, [r5 + 3 * 16] |
| 15753 | pmulhrsw m2, m3 |
| 15754 | movd m6, [r3 + 19] |
| 15755 | palignr m6, m4, 2 |
| 15756 | pmaddubsw m5, m6, [r5 + 3 * 16] |
| 15757 | pmulhrsw m5, m3 |
| 15758 | packuswb m2, m5 |
| 15759 | movu [r0 + 442 * 16], m2 |
| 15760 | |
| 15761 | ; mode 29 [row 11] |
| 15762 | pmaddubsw m2, m7, [r5 + 12 * 16] |
| 15763 | pmulhrsw m2, m3 |
| 15764 | pmaddubsw m5, m6, [r5 + 12 * 16] |
| 15765 | pmulhrsw m5, m3 |
| 15766 | packuswb m2, m5 |
| 15767 | movu [r0 + 443 * 16], m2 |
| 15768 | |
| 15769 | ; mode 29 [row 12] |
| 15770 | pmaddubsw m2, m7, [r5 + 21 * 16] |
| 15771 | pmulhrsw m2, m3 |
| 15772 | pmaddubsw m5, m6, [r5 + 21 * 16] |
| 15773 | pmulhrsw m5, m3 |
| 15774 | packuswb m2, m5 |
| 15775 | movu [r0 + 444 * 16], m2 |
| 15776 | |
| 15777 | ; mode 30 [row 8] |
| 15778 | movu [r0 + 456 * 16], m2 |
| 15779 | |
| 15780 | ; mode 29 [row 13] |
| 15781 | pmaddubsw m2, m7, [r5 + 30 * 16] |
| 15782 | pmulhrsw m2, m3 |
| 15783 | pmaddubsw m5, m6, [r5 + 30 * 16] |
| 15784 | pmulhrsw m5, m3 |
| 15785 | packuswb m2, m5 |
| 15786 | movu [r0 + 445 * 16], m2 |
| 15787 | |
| 15788 | ; mode 32 [row 5] |
| 15789 | movu [r0 + 485 * 16], m2 |
| 15790 | |
| 15791 | ; mode 30 [row 7] |
| 15792 | pmaddubsw m2, m7, [r5 + 8 * 16] |
| 15793 | pmulhrsw m2, m3 |
| 15794 | pmaddubsw m5, m6, [r5 + 8 * 16] |
| 15795 | pmulhrsw m5, m3 |
| 15796 | packuswb m2, m5 |
| 15797 | movu [r0 + 455 * 16], m2 |
| 15798 | |
| 15799 | ; mode 33 [row 3] |
| 15800 | movu [r0 + 499 * 16], m2 |
| 15801 | |
| 15802 | ; mode 31 [row 5] |
| 15803 | pmaddubsw m2, m7, [r5 + 6 * 16] |
| 15804 | pmulhrsw m2, m3 |
| 15805 | pmaddubsw m5, m6, [r5 + 6 * 16] |
| 15806 | pmulhrsw m5, m3 |
| 15807 | packuswb m2, m5 |
| 15808 | movu [r0 + 469 * 16], m2 |
| 15809 | |
| 15810 | ; mode 31 [row 6] |
| 15811 | pmaddubsw m2, m7, [r5 + 23 * 16] |
| 15812 | pmulhrsw m2, m3 |
| 15813 | pmaddubsw m5, m6, [r5 + 23 * 16] |
| 15814 | pmulhrsw m5, m3 |
| 15815 | packuswb m2, m5 |
| 15816 | movu [r0 + 470 * 16], m2 |
| 15817 | |
| 15818 | ; mode 32 [row 4] |
| 15819 | pmaddubsw m2, m7, [r5 + 9 * 16] |
| 15820 | pmulhrsw m2, m3 |
| 15821 | pmaddubsw m5, m6, [r5 + 9 * 16] |
| 15822 | pmulhrsw m5, m3 |
| 15823 | packuswb m2, m5 |
| 15824 | movu [r0 + 484 * 16], m2 |
| 15825 | |
; mode 29 [row 14]
; Advance the reference window by one pixel: palignr drops the oldest byte
; pair of m7 / m6 and appends the two bytes at [r3 + 12] / [r3 + 20].
; Only the low two bytes of the freshly loaded register survive the
; palignr, so a movd load is sufficient (matching every other window step
; in this routine).  The former register copies of m7/m6 into m1/m4 were
; dead -- both destinations are overwritten here before being read.
    movd        m1, [r3 + 12]
    palignr     m1, m7, 2
    pmaddubsw   m2, m1, [r5 + 7 * 16]
    pmulhrsw    m2, m3
    movd        m4, [r3 + 20]
    palignr     m4, m6, 2
    pmaddubsw   m5, m4, [r5 + 7 * 16]
    pmulhrsw    m5, m3
    packuswb    m2, m5
    movu        [r0 + 446 * 16], m2
| 15840 | |
| 15841 | ; mode 29 [row 15] |
| 15842 | pmaddubsw m2, m1, [r5 + 16 * 16] |
| 15843 | pmulhrsw m2, m3 |
| 15844 | pmaddubsw m5, m4, [r5 + 16 * 16] |
| 15845 | pmulhrsw m5, m3 |
| 15846 | packuswb m2, m5 |
| 15847 | movu [r0 + 447 * 16], m2 |
| 15848 | |
| 15849 | ; mode 30 [row 9] |
| 15850 | pmaddubsw m2, m1, [r5 + 2 * 16] |
| 15851 | pmulhrsw m2, m3 |
| 15852 | pmaddubsw m5, m4, [r5 + 2 * 16] |
| 15853 | pmulhrsw m5, m3 |
| 15854 | packuswb m2, m5 |
| 15855 | movu [r0 + 457 * 16], m2 |
| 15856 | |
| 15857 | ; mode 33 [row 4] |
| 15858 | movu [r0 + 500 * 16], m2 |
| 15859 | |
| 15860 | ; mode 30 [row 10] |
| 15861 | pmaddubsw m2, m1, [r5 + 15 * 16] |
| 15862 | pmulhrsw m2, m3 |
| 15863 | pmaddubsw m5, m4, [r5 + 15 * 16] |
| 15864 | pmulhrsw m5, m3 |
| 15865 | packuswb m2, m5 |
| 15866 | movu [r0 + 458 * 16], m2 |
| 15867 | |
| 15868 | ; mode 30 [row 11] |
| 15869 | pmaddubsw m2, m1, [r5 + 28 * 16] |
| 15870 | pmulhrsw m2, m3 |
| 15871 | pmaddubsw m5, m4, [r5 + 28 * 16] |
| 15872 | pmulhrsw m5, m3 |
| 15873 | packuswb m2, m5 |
| 15874 | movu [r0 + 459 * 16], m2 |
| 15875 | |
| 15876 | ; mode 33 [row 5] |
| 15877 | movu [r0 + 501 * 16], m2 |
| 15878 | |
| 15879 | ; mode 31 [row 7] |
| 15880 | pmaddubsw m2, m1, [r5 + 8 * 16] |
| 15881 | pmulhrsw m2, m3 |
| 15882 | pmaddubsw m5, m4, [r5 + 8 * 16] |
| 15883 | pmulhrsw m5, m3 |
| 15884 | packuswb m2, m5 |
| 15885 | movu [r0 + 471 * 16], m2 |
| 15886 | |
| 15887 | ; mode 31 [row 8] |
| 15888 | pmaddubsw m2, m1, [r5 + 25 * 16] |
| 15889 | pmulhrsw m2, m3 |
| 15890 | pmaddubsw m5, m4, [r5 + 25 * 16] |
| 15891 | pmulhrsw m5, m3 |
| 15892 | packuswb m2, m5 |
| 15893 | movu [r0 + 472 * 16], m2 |
| 15894 | |
| 15895 | ; mode 32 [row 6] |
| 15896 | pmaddubsw m2, m1, [r5 + 19 * 16] |
| 15897 | pmulhrsw m2, m3 |
| 15898 | pmaddubsw m5, m4, [r5 + 19 * 16] |
| 15899 | pmulhrsw m5, m3 |
| 15900 | packuswb m2, m5 |
| 15901 | movu [r0 + 486 * 16], m2 |
| 15902 | |
| 15903 | ; mode 30 [row 12] |
| 15904 | movd m7, [r3 + 13] |
| 15905 | palignr m7, m1, 2 |
| 15906 | pmaddubsw m2, m7, [r5 + 9 * 16] |
| 15907 | pmulhrsw m2, m3 |
| 15908 | movd m6, [r3 + 21] |
| 15909 | palignr m6, m4, 2 |
| 15910 | pmaddubsw m5, m6, [r5 + 9 * 16] |
| 15911 | pmulhrsw m5, m3 |
| 15912 | packuswb m2, m5 |
| 15913 | movu [r0 + 460 * 16], m2 |
| 15914 | |
| 15915 | ; mode 30 [row 13] |
| 15916 | pmaddubsw m2, m7, [r5 + 22 * 16] |
| 15917 | pmulhrsw m2, m3 |
| 15918 | pmaddubsw m5, m6, [r5 + 22 * 16] |
| 15919 | pmulhrsw m5, m3 |
| 15920 | packuswb m2, m5 |
| 15921 | movu [r0 + 461 * 16], m2 |
| 15922 | |
| 15923 | ; mode 33 [row 6] |
| 15924 | movu [r0 + 502 * 16], m2 |
| 15925 | |
| 15926 | ; mode 31 [row 9] |
| 15927 | pmaddubsw m2, m7, [r5 + 10 * 16] |
| 15928 | pmulhrsw m2, m3 |
| 15929 | pmaddubsw m5, m6, [r5 + 10 * 16] |
| 15930 | pmulhrsw m5, m3 |
| 15931 | packuswb m2, m5 |
| 15932 | movu [r0 + 473 * 16], m2 |
| 15933 | |
| 15934 | ; mode 31 [row 10] |
| 15935 | pmaddubsw m2, m7, [r5 + 27 * 16] |
| 15936 | pmulhrsw m2, m3 |
| 15937 | pmaddubsw m5, m6, [r5 + 27 * 16] |
| 15938 | pmulhrsw m5, m3 |
| 15939 | packuswb m2, m5 |
| 15940 | movu [r0 + 474 * 16], m2 |
| 15941 | |
| 15942 | ; mode 32 [row 7] |
| 15943 | pmaddubsw m2, m7, [r5 + 8 * 16] |
| 15944 | pmulhrsw m2, m3 |
| 15945 | pmaddubsw m5, m6, [r5 + 8 * 16] |
| 15946 | pmulhrsw m5, m3 |
| 15947 | packuswb m2, m5 |
| 15948 | movu [r0 + 487 * 16], m2 |
| 15949 | |
| 15950 | ; mode 32 [row 8] |
| 15951 | pmaddubsw m2, m7, [r5 + 29 * 16] |
| 15952 | pmulhrsw m2, m3 |
| 15953 | pmaddubsw m5, m6, [r5 + 29 * 16] |
| 15954 | pmulhrsw m5, m3 |
| 15955 | packuswb m2, m5 |
| 15956 | movu [r0 + 488 * 16], m2 |
| 15957 | |
| 15958 | |
; mode 30 [row 14]
; Advance the reference window by one pixel: palignr drops the oldest byte
; pair of m7 / m6 and appends the two bytes at [r3 + 14] / [r3 + 22].
; m1/m4 are fully rewritten by the movd + palignr pairs below, so no prior
; copy of m7/m6 into them is needed.
    movd        m1, [r3 + 14]
    palignr     m1, m7, 2
    pmaddubsw   m2, m1, [r5 + 3 * 16]
    pmulhrsw    m2, m3
    movd        m4, [r3 + 22]
    palignr     m4, m6, 2
    pmaddubsw   m5, m4, [r5 + 3 * 16]
    pmulhrsw    m5, m3
    packuswb    m2, m5
    movu        [r0 + 462 * 16], m2
| 15973 | |
| 15974 | ; mode 30 [row 15] |
| 15975 | pmaddubsw m2, m1, [r5 + 16 * 16] |
| 15976 | pmulhrsw m2, m3 |
| 15977 | pmaddubsw m5, m4, [r5 + 16 * 16] |
| 15978 | pmulhrsw m5, m3 |
| 15979 | packuswb m2, m5 |
| 15980 | movu [r0 + 463 * 16], m2 |
| 15981 | |
| 15982 | ; mode 33 [row 7] |
| 15983 | movu [r0 + 503 * 16], m2 |
| 15984 | |
| 15985 | ; mode 31 [row 11] |
| 15986 | pmaddubsw m2, m1, [r5 + 12 * 16] |
| 15987 | pmulhrsw m2, m3 |
| 15988 | pmaddubsw m5, m4, [r5 + 12 * 16] |
| 15989 | pmulhrsw m5, m3 |
| 15990 | packuswb m2, m5 |
| 15991 | movu [r0 + 475 * 16], m2 |
| 15992 | |
| 15993 | ; mode 31 [row 12] |
| 15994 | pmaddubsw m2, m1, [r5 + 29 * 16] |
| 15995 | pmulhrsw m2, m3 |
| 15996 | pmaddubsw m5, m4, [r5 + 29 * 16] |
| 15997 | pmulhrsw m5, m3 |
| 15998 | packuswb m2, m5 |
| 15999 | movu [r0 + 476 * 16], m2 |
| 16000 | |
| 16001 | ; mode 32 [row 9] |
| 16002 | pmaddubsw m2, m1, [r5 + 18 * 16] |
| 16003 | pmulhrsw m2, m3 |
| 16004 | pmaddubsw m5, m4, [r5 + 18 * 16] |
| 16005 | pmulhrsw m5, m3 |
| 16006 | packuswb m2, m5 |
| 16007 | movu [r0 + 489 * 16], m2 |
| 16008 | |
| 16009 | ; mode 31 [row 13] |
| 16010 | movd m7, [r3 + 15] |
| 16011 | palignr m7, m1, 2 |
| 16012 | pmaddubsw m2, m7, [r5 + 14 * 16] |
| 16013 | pmulhrsw m2, m3 |
| 16014 | movd m6, [r3 + 23] |
| 16015 | palignr m6, m4, 2 |
| 16016 | pmaddubsw m5, m6, [r5 + 14 * 16] |
| 16017 | pmulhrsw m5, m3 |
| 16018 | packuswb m2, m5 |
| 16019 | movu [r0 + 477 * 16], m2 |
| 16020 | |
| 16021 | ; mode 31 [row 14] |
| 16022 | pmaddubsw m2, m7, [r5 + 31 * 16] |
| 16023 | pmulhrsw m2, m3 |
| 16024 | pmaddubsw m5, m6, [r5 + 31 * 16] |
| 16025 | pmulhrsw m5, m3 |
| 16026 | packuswb m2, m5 |
| 16027 | movu [r0 + 478 * 16], m2 |
| 16028 | |
| 16029 | ; mode 32 [row 10] |
| 16030 | pmaddubsw m2, m7, [r5 + 7 * 16] |
| 16031 | pmulhrsw m2, m3 |
| 16032 | pmaddubsw m5, m6, [r5 + 7 * 16] |
| 16033 | pmulhrsw m5, m3 |
| 16034 | packuswb m2, m5 |
| 16035 | movu [r0 + 490 * 16], m2 |
| 16036 | |
| 16037 | ; mode 32 [row 11] |
| 16038 | pmaddubsw m2, m7, [r5 + 28 * 16] |
| 16039 | pmulhrsw m2, m3 |
| 16040 | pmaddubsw m5, m6, [r5 + 28 * 16] |
| 16041 | pmulhrsw m5, m3 |
| 16042 | packuswb m2, m5 |
| 16043 | movu [r0 + 491 * 16], m2 |
| 16044 | |
| 16045 | ; mode 33 [row 8] |
| 16046 | pmaddubsw m2, m7, [r5 + 10 * 16] |
| 16047 | pmulhrsw m2, m3 |
| 16048 | pmaddubsw m5, m6, [r5 + 10 * 16] |
| 16049 | pmulhrsw m5, m3 |
| 16050 | packuswb m2, m5 |
| 16051 | movu [r0 + 504 * 16], m2 |
| 16052 | |
| 16053 | ; mode 31 [row 15] |
| 16054 | movd m1, [r3 + 16] |
| 16055 | palignr m1, m7, 2 |
| 16056 | pmaddubsw m2, m1, [r5 + 16 * 16] |
| 16057 | pmulhrsw m2, m3 |
| 16058 | movd m4, [r3 + 24] |
| 16059 | palignr m4, m6, 2 |
| 16060 | pmaddubsw m5, m4, [r5 + 16 * 16] |
| 16061 | pmulhrsw m5, m3 |
| 16062 | packuswb m2, m5 |
| 16063 | movu [r0 + 479 * 16], m2 |
| 16064 | |
| 16065 | ; mode 32 [row 12] |
| 16066 | pmaddubsw m2, m1, [r5 + 17 * 16] |
| 16067 | pmulhrsw m2, m3 |
| 16068 | pmaddubsw m5, m4, [r5 + 17 * 16] |
| 16069 | pmulhrsw m5, m3 |
| 16070 | packuswb m2, m5 |
| 16071 | movu [r0 + 492 * 16], m2 |
| 16072 | |
| 16073 | ; mode 33 [row 9] |
| 16074 | pmaddubsw m2, m1, [r5 + 4 * 16] |
| 16075 | pmulhrsw m2, m3 |
| 16076 | pmaddubsw m5, m4, [r5 + 4 * 16] |
| 16077 | pmulhrsw m5, m3 |
| 16078 | packuswb m2, m5 |
| 16079 | movu [r0 + 505 * 16], m2 |
| 16080 | |
; mode 33 [row 10]
; Uses the reference window currently held in m1 (columns 0-7) and
; m4 (columns 8-15) with the coefficient row at [r5 + 30 * 16]; the result
; goes to output slot 506.  Slot 505 (row 9) is written once by the
; preceding block and is not recomputed here.
    pmaddubsw   m2, m1, [r5 + 30 * 16]
    pmulhrsw    m2, m3
    pmaddubsw   m5, m4, [r5 + 30 * 16]
    pmulhrsw    m5, m3
    packuswb    m2, m5
    movu        [r0 + 506 * 16], m2
| 16096 | |
| 16097 | ; mode 32 [row 13] |
| 16098 | movd m7, [r3 + 17] |
| 16099 | palignr m7, m1, 2 |
| 16100 | pmaddubsw m2, m7, [r5 + 6 * 16] |
| 16101 | pmulhrsw m2, m3 |
| 16102 | |
| 16103 | movd m6, [r3 + 25] |
| 16104 | palignr m6, m4, 2 |
| 16105 | pmaddubsw m5, m6, [r5 + 6 * 16] |
| 16106 | pmulhrsw m5, m3 |
| 16107 | packuswb m2, m5 |
| 16108 | movu [r0 + 493 * 16], m2 |
| 16109 | |
| 16110 | ; mode 32 [row 14] |
| 16111 | pmaddubsw m2, m7, [r5 + 27 * 16] |
| 16112 | pmulhrsw m2, m3 |
| 16113 | pmaddubsw m5, m6, [r5 + 27 * 16] |
| 16114 | pmulhrsw m5, m3 |
| 16115 | packuswb m2, m5 |
| 16116 | movu [r0 + 494 * 16], m2 |
| 16117 | |
| 16118 | ; mode 33 [row 11] |
| 16119 | pmaddubsw m2, m7, [r5 + 24 * 16] |
| 16120 | pmulhrsw m2, m3 |
| 16121 | pmaddubsw m5, m6, [r5 + 24 * 16] |
| 16122 | pmulhrsw m5, m3 |
| 16123 | packuswb m2, m5 |
| 16124 | movu [r0 + 507 * 16], m2 |
| 16125 | |
; mode 32 [row 15]
; Advance the reference window by one pixel: palignr drops the oldest byte
; pair of m7 / m6 and appends the two bytes at [r3 + 18] / [r3 + 26].
; m4 is produced solely by the movd + palignr pair; the earlier
; psrldq/pinsrb sequence on m4 was overwritten before use and is omitted.
    movd        m1, [r3 + 18]
    palignr     m1, m7, 2
    pmaddubsw   m2, m1, [r5 + 16 * 16]
    pmulhrsw    m2, m3
    movd        m4, [r3 + 26]
    palignr     m4, m6, 2
    pmaddubsw   m5, m4, [r5 + 16 * 16]
    pmulhrsw    m5, m3
    packuswb    m2, m5
    movu        [r0 + 495 * 16], m2
| 16140 | |
| 16141 | ; mode 33 [row 12] |
| 16142 | pmaddubsw m2, m1, [r5 + 18 * 16] |
| 16143 | pmulhrsw m2, m3 |
| 16144 | pmaddubsw m5, m4, [r5 + 18 * 16] |
| 16145 | pmulhrsw m5, m3 |
| 16146 | packuswb m2, m5 |
| 16147 | movu [r0 + 508 * 16], m2 |
| 16148 | |
| 16149 | ; mode 33 [row 13] |
| 16150 | movd m7, [r3 + 19] |
| 16151 | palignr m7, m1, 2 |
| 16152 | pmaddubsw m2, m7, [r5 + 12 * 16] |
| 16153 | pmulhrsw m2, m3 |
| 16154 | movd m6, [r3 + 27] |
| 16155 | palignr m6, m4, 2 |
| 16156 | pmaddubsw m5, m6, [r5 + 12 * 16] |
| 16157 | pmulhrsw m5, m3 |
| 16158 | packuswb m2, m5 |
| 16159 | movu [r0 + 509 * 16], m2 |
| 16160 | |
| 16161 | ; mode 33 [row 14] |
| 16162 | movd m1, [r3 + 20] |
| 16163 | palignr m1, m7, 2 |
| 16164 | pmaddubsw m2, m1, [r5 + 6 * 16] |
| 16165 | pmulhrsw m2, m3 |
| 16166 | movd m4, [r3 + 28] |
| 16167 | palignr m4, m6, 2 |
| 16168 | pmaddubsw m5, m4, [r5 + 6 * 16] |
| 16169 | pmulhrsw m5, m3 |
| 16170 | packuswb m2, m5 |
| 16171 | movu [r0 + 510 * 16], m2 |
| 16172 | |
| 16173 | ; mode 34 [row 0] |
| 16174 | movu m1, [r3 + 2] |
| 16175 | movu [r0 + 512 * 16], m1 |
| 16176 | movu m2, [r3 + 18] |
| 16177 | palignr m3, m2, m1, 1 |
| 16178 | movu [r0 + 513 * 16], m3 |
| 16179 | palignr m3, m2, m1, 2 |
| 16180 | movu [r0 + 514 * 16], m3 |
| 16181 | palignr m3, m2, m1, 3 |
| 16182 | movu [r0 + 515 * 16], m3 |
| 16183 | palignr m3, m2, m1, 4 |
| 16184 | movu [r0 + 516 * 16], m3 |
| 16185 | palignr m3, m2, m1, 5 |
| 16186 | movu [r0 + 517 * 16], m3 |
| 16187 | palignr m3, m2, m1, 6 |
| 16188 | movu [r0 + 518 * 16], m3 |
| 16189 | palignr m3, m2, m1, 7 |
| 16190 | movu [r0 + 519 * 16], m3 |
| 16191 | palignr m3, m2, m1, 8 |
| 16192 | movu [r0 + 520 * 16], m3 |
| 16193 | palignr m3, m2, m1, 9 |
| 16194 | movu [r0 + 521 * 16], m3 |
| 16195 | palignr m3, m2, m1, 10 |
| 16196 | movu [r0 + 522 * 16], m3 |
| 16197 | palignr m3, m2, m1, 11 |
| 16198 | movu [r0 + 523 * 16], m3 |
| 16199 | palignr m3, m2, m1, 12 |
| 16200 | movu [r0 + 524 * 16], m3 |
| 16201 | |
| 16202 | ; mode 33 [row 15] |
| 16203 | movu [r0 + 511 * 16], m3 |
| 16204 | |
| 16205 | ; mode 34 |
| 16206 | palignr m3, m2, m1, 13 |
| 16207 | movu [r0 + 525 * 16], m3 |
| 16208 | palignr m3, m2, m1, 14 |
| 16209 | movu [r0 + 526 * 16], m3 |
| 16210 | palignr m3, m2, m1, 15 |
| 16211 | movu [r0 + 527 * 16], m3 |
| 16212 | |
| 16213 | RET |
| 16214 | |
| 16215 | ;----------------------------------------------------------------------------- |
| 16216 | ; void all_angs_pred_32x32(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma) |
| 16217 | ;----------------------------------------------------------------------------- |
| 16218 | INIT_XMM sse4 |
| 16219 | cglobal all_angs_pred_32x32, 6, 6, 8, dest, above0, left0, above1, left1, bLuma |
| 16220 | |
| 16221 | ;mode 2[row 0] |
| 16222 | movu m0, [r4 + 2] |
| 16223 | movu [r0 + 0 * 16], m0 |
| 16224 | movu m1, [r4 + 18] |
| 16225 | movu [r0 + 1 * 16], m1 |
| 16226 | |
| 16227 | ;mode 9 [row 15] |
| 16228 | movu [r0 + 478 * 16], m0 |
| 16229 | movu [r0 + 479 * 16], m1 |
| 16230 | |
| 16231 | ;mode 2[row 1] |
| 16232 | movu m2, [r4 + 34] |
| 16233 | palignr m3, m1, m0, 1 |
| 16234 | movu [r0 + 2 * 16], m3 |
| 16235 | palignr m4, m2, m1, 1 |
| 16236 | movu [r0 + 3 * 16], m4 |
| 16237 | |
| 16238 | ; mode 9 [row 31] |
| 16239 | movu [r0 + 510 * 16], m3 |
| 16240 | movu [r0 + 511 * 16], m4 |
| 16241 | |
| 16242 | ;mode 2[row 17] |
| 16243 | movu [r0 + 34 * 16], m4 |
| 16244 | movu m5, [r4 + 35] |
| 16245 | movu [r0 + 35 * 16], m5 |
| 16246 | |
| 16247 | ;mode 2[row 2] |
| 16248 | palignr m3, m1, m0, 2 |
| 16249 | movu [r0 + 4 * 16], m3 |
| 16250 | palignr m4, m2, m1, 2 |
| 16251 | movu [r0 + 5 * 16], m4 |
| 16252 | |
| 16253 | ;mode 2[row 18] |
| 16254 | movu [r0 + 36 * 16], m4 |
| 16255 | movu m6, [r4 + 51] |
| 16256 | palignr m7, m6, m5, 1 |
| 16257 | movu [r0 + 37 * 16], m7 |
| 16258 | |
| 16259 | ;mode 2[row 3] |
| 16260 | palignr m3, m1, m0, 3 |
| 16261 | movu [r0 + 6 * 16], m3 |
| 16262 | palignr m4, m2, m1, 3 |
| 16263 | movu [r0 + 7 * 16], m4 |
| 16264 | |
| 16265 | ;mode 2[row 19] |
| 16266 | movu [r0 + 38 * 16], m4 |
| 16267 | palignr m7, m6, m5, 2 |
| 16268 | movu [r0 + 39 * 16], m7 |
| 16269 | |
| 16270 | ;mode 2[row 4] |
| 16271 | palignr m3, m1, m0, 4 |
| 16272 | movu [r0 + 8 * 16], m3 |
| 16273 | palignr m4, m2, m1, 4 |
| 16274 | movu [r0 + 9 * 16], m4 |
| 16275 | |
| 16276 | ; mode 8 [row 31] |
| 16277 | movu [r0 + 446 * 16], m3 |
| 16278 | movu [r0 + 447 * 16], m4 |
| 16279 | |
| 16280 | ;mode 2[row 20] |
| 16281 | movu [r0 + 40 * 16], m4 |
| 16282 | palignr m7, m6, m5, 3 |
| 16283 | movu [r0 + 41 * 16], m7 |
| 16284 | |
| 16285 | ; mode 4 [row 31] |
| 16286 | movu [r0 + 190 * 16], m4 |
| 16287 | movu [r0 + 191 * 16], m7 |
| 16288 | |
| 16289 | ;mode 2[row 5] |
| 16290 | palignr m3, m1, m0, 5 |
| 16291 | movu [r0 + 10 * 16], m3 |
| 16292 | palignr m4, m2, m1, 5 |
| 16293 | movu [r0 + 11 * 16], m4 |
| 16294 | |
| 16295 | ;mode 2[row 21] |
| 16296 | movu [r0 + 42 * 16], m4 |
| 16297 | palignr m7, m6, m5, 4 |
| 16298 | movu [r0 + 43 * 16], m7 |
| 16299 | |
| 16300 | ;mode 2[row 6] |
| 16301 | palignr m3, m1, m0, 6 |
| 16302 | movu [r0 + 12 * 16], m3 |
| 16303 | palignr m4, m2, m1, 6 |
| 16304 | movu [r0 + 13 * 16], m4 |
| 16305 | |
| 16306 | ;mode 2[row 22] |
| 16307 | movu [r0 + 44 * 16], m4 |
| 16308 | palignr m7, m6, m5, 5 |
| 16309 | movu [r0 + 45 * 16], m7 |
| 16310 | |
| 16311 | ;mode 2[row 7] |
| 16312 | palignr m3, m1, m0, 7 |
| 16313 | movu [r0 + 14 * 16], m3 |
| 16314 | palignr m4, m2, m1, 7 |
| 16315 | movu [r0 + 15 * 16], m4 |
| 16316 | |
| 16317 | ;mode 2[row 23] |
| 16318 | movu [r0 + 46 * 16], m4 |
| 16319 | palignr m7, m6, m5, 6 |
| 16320 | movu [r0 + 47 * 16], m7 |
| 16321 | |
| 16322 | ;mode 2[row 8] |
| 16323 | palignr m3, m1, m0, 8 |
| 16324 | movu [r0 + 16 * 16], m3 |
| 16325 | palignr m4, m2, m1, 8 |
| 16326 | movu [r0 + 17 * 16], m4 |
| 16327 | |
| 16328 | ;mode 7[row 31] |
| 16329 | movu [r0 + 382 * 16], m3 |
| 16330 | movu [r0 + 383 * 16], m4 |
| 16331 | |
| 16332 | ;mode 2[row 24] |
| 16333 | movu [r0 + 48 * 16], m4 |
| 16334 | palignr m7, m6, m5, 7 |
| 16335 | movu [r0 + 49 * 16], m7 |
| 16336 | |
| 16337 | ;mode 2[row 9] |
| 16338 | palignr m3, m1, m0, 9 |
| 16339 | movu [r0 + 18 * 16], m3 |
| 16340 | palignr m4, m2, m1, 9 |
| 16341 | movu [r0 + 19 * 16], m4 |
| 16342 | |
| 16343 | ;mode 2[row 25] |
| 16344 | movu [r0 + 50 * 16], m4 |
| 16345 | palignr m7, m6, m5, 8 |
| 16346 | movu [r0 + 51 * 16], m7 |
| 16347 | |
| 16348 | ; mode 3 [row 31] |
| 16349 | movu [r0 + 126 * 16], m4 |
| 16350 | movu [r0 + 127 * 16], m7 |
| 16351 | |
| 16352 | ;mode 2[row 10] |
| 16353 | palignr m3, m1, m0, 10 |
| 16354 | movu [r0 + 20 * 16], m3 |
| 16355 | palignr m4, m2, m1, 10 |
| 16356 | movu [r0 + 21 * 16], m4 |
| 16357 | |
| 16358 | ;mode 2[row 26] |
| 16359 | movu [r0 + 52 * 16], m4 |
| 16360 | palignr m7, m6, m5, 9 |
| 16361 | movu [r0 + 53 * 16], m7 |
| 16362 | |
| 16363 | ;mode 2[row 11] |
| 16364 | palignr m3, m1, m0, 11 |
| 16365 | movu [r0 + 22 * 16], m3 |
| 16366 | palignr m4, m2, m1, 11 |
| 16367 | movu [r0 + 23 * 16], m4 |
| 16368 | |
| 16369 | ;mode 2[row 27] |
| 16370 | movu [r0 + 54 * 16], m4 |
| 16371 | palignr m7, m6, m5, 10 |
| 16372 | movu [r0 + 55 * 16], m7 |
| 16373 | |
| 16374 | ;mode 2[row 12] |
| 16375 | palignr m3, m1, m0, 12 |
| 16376 | movu [r0 + 24 * 16], m3 |
| 16377 | palignr m4, m2, m1, 12 |
| 16378 | movu [r0 + 25 * 16], m4 |
| 16379 | |
| 16380 | ; mode 6 [row 31] |
| 16381 | movu [r0 + 318 * 16], m3 |
| 16382 | movu [r0 + 319 * 16], m4 |
| 16383 | |
| 16384 | ; mode 3 [row 15] |
| 16385 | movu [r0 + 94 * 16], m3 |
| 16386 | movu [r0 + 95 * 16], m4 |
| 16387 | |
| 16388 | ;mode 2[row 28] |
| 16389 | movu [r0 + 56 * 16], m4 |
| 16390 | palignr m7, m6, m5, 11 |
| 16391 | movu [r0 + 57 * 16], m7 |
| 16392 | |
| 16393 | ;mode 2[row 13] |
| 16394 | palignr m3, m1, m0, 13 |
| 16395 | movu [r0 + 26 * 16], m3 |
| 16396 | palignr m4, m2, m1, 13 |
| 16397 | movu [r0 + 27 * 16], m4 |
| 16398 | |
| 16399 | ;mode 2[row 29] |
| 16400 | movu [r0 + 58 * 16], m4 |
| 16401 | palignr m7, m6, m5, 12 |
| 16402 | movu [r0 + 59 * 16], m7 |
| 16403 | |
| 16404 | ;mode 2[row 14] |
| 16405 | palignr m3, m1, m0, 14 |
| 16406 | movu [r0 + 28 * 16], m3 |
| 16407 | palignr m4, m2, m1, 14 |
| 16408 | movu [r0 + 29 * 16], m4 |
| 16409 | |
| 16410 | ;mode 2[row 30] |
| 16411 | movu [r0 + 60 * 16], m4 |
| 16412 | palignr m7, m6, m5, 13 |
| 16413 | movu [r0 + 61 * 16], m7 |
| 16414 | |
| 16415 | ;mode 2[row 15] |
| 16416 | palignr m3, m1, m0, 15 |
| 16417 | movu [r0 + 30 * 16], m3 |
| 16418 | palignr m4, m2, m1, 15 |
| 16419 | movu [r0 + 31 * 16], m4 |
| 16420 | |
| 16421 | ;mode 2[row 31] |
| 16422 | movu [r0 + 62 * 16], m4 |
| 16423 | palignr m7, m6, m5, 14 |
| 16424 | movu [r0 + 63 * 16], m7 |
| 16425 | |
| 16426 | ;mode 2[row 16] |
| 16427 | movu [r0 + 32 * 16], m1 |
| 16428 | movu [r0 + 33 * 16], m2 |
| 16429 | |
| 16430 | ; mode 5[row 31] |
| 16431 | movu [r0 + 254 * 16], m1 |
| 16432 | movu [r0 + 255 * 16], m2 |
| 16433 | |
| 16434 | ; mode 3 [row 0] |
| 16435 | lea r5, [ang_table] |
| 16436 | movu m6, [r5 + 26 * 16] |
| 16437 | movu m7, [pw_1024 ] |
| 16438 | movu m1, [r4 + 1 ] |
| 16439 | punpcklbw m1, m0 |
| 16440 | pmaddubsw m0, m1, m6 |
| 16441 | pmulhrsw m0, m7 |
| 16442 | movu m2, [r4 + 9] |
| 16443 | movd m3, [r4 + 10] |
| 16444 | palignr m3, m2, 1 |
| 16445 | punpcklbw m2, m3 |
| 16446 | pmaddubsw m3, m2, m6 |
| 16447 | pmulhrsw m3, m7 |
| 16448 | packuswb m0, m3 |
| 16449 | movu [r0 + 64 * 16], m0 |
| 16450 | |
| 16451 | ; mode 6 [row 1 - first half] |
| 16452 | movu [r0 + 258 * 16], m0 |
| 16453 | |
| 16454 | ; mode 9 [row 12 - first half] |
| 16455 | movu [r0 + 472 * 16], m0 |
| 16456 | |
| 16457 | movu m0, [r4 + 17] |
| 16458 | movd m3, [r4 + 18] |
| 16459 | palignr m3, m0, 1 |
| 16460 | punpcklbw m0, m3 |
| 16461 | pmaddubsw m3, m0, m6 |
| 16462 | pmulhrsw m3, m7 |
| 16463 | movu m4, [r4 + 25] |
| 16464 | movd m5, [r4 + 26] |
| 16465 | palignr m5, m4, 1 |
| 16466 | punpcklbw m4, m5 |
| 16467 | pmaddubsw m5, m4, m6 |
| 16468 | pmulhrsw m5, m7 |
| 16469 | packuswb m3, m5 |
| 16470 | movu [r0 + 65 * 16], m3 |
| 16471 | |
| 16472 | ; mode 6 [row 1 - second half] |
| 16473 | movu [r0 + 259 * 16], m3 |
| 16474 | |
| 16475 | ; mode 9 [row 12 - second half] |
| 16476 | movu [r0 + 473 * 16], m3 |
| 16477 | |
| 16478 | ; mode 4 [row 0] |
| 16479 | movu m6, [r5 + 21 * 16] |
| 16480 | pmaddubsw m3, m1, m6 |
| 16481 | pmulhrsw m3, m7 |
| 16482 | pmaddubsw m5, m2, m6 |
| 16483 | pmulhrsw m5, m7 |
| 16484 | packuswb m3, m5 |
| 16485 | movu [r0 + 128 * 16], m3 |
| 16486 | pmaddubsw m3, m0, m6 |
| 16487 | pmulhrsw m3, m7 |
| 16488 | pmaddubsw m5, m4, m6 |
| 16489 | pmulhrsw m5, m7 |
| 16490 | packuswb m3, m5 |
| 16491 | movu [r0 + 129 * 16], m3 |
| 16492 | |
| 16493 | ; mode 5 [row 0] |
| 16494 | movu m6, [r5 + 17 * 16] |
| 16495 | pmaddubsw m3, m1, m6 |
| 16496 | pmulhrsw m3, m7 |
| 16497 | pmaddubsw m5, m2, m6 |
| 16498 | pmulhrsw m5, m7 |
| 16499 | packuswb m3, m5 |
| 16500 | movu [r0 + 192 * 16], m3 |
| 16501 | pmaddubsw m3, m0, m6 |
| 16502 | pmulhrsw m3, m7 |
| 16503 | pmaddubsw m5, m4, m6 |
| 16504 | pmulhrsw m5, m7 |
| 16505 | packuswb m3, m5 |
| 16506 | movu [r0 + 193 * 16], m3 |
| 16507 | |
| 16508 | ; mode 6 [row 0] |
| 16509 | movu m6, [r5 + 13 * 16] |
| 16510 | pmaddubsw m3, m1, m6 |
| 16511 | pmulhrsw m3, m7 |
| 16512 | pmaddubsw m5, m2, m6 |
| 16513 | pmulhrsw m5, m7 |
| 16514 | packuswb m3, m5 |
| 16515 | movu [r0 + 256 * 16], m3 |
| 16516 | pmaddubsw m3, m0, m6 |
| 16517 | pmulhrsw m3, m7 |
| 16518 | pmaddubsw m5, m4, m6 |
| 16519 | pmulhrsw m5, m7 |
| 16520 | packuswb m3, m5 |
| 16521 | movu [r0 + 257 * 16], m3 |
| 16522 | |
| 16523 | ; mode 7 [row 0] |
| 16524 | movu m6, [r5 + 9 * 16] |
| 16525 | pmaddubsw m3, m1, m6 |
| 16526 | pmulhrsw m3, m7 |
| 16527 | pmaddubsw m5, m2, m6 |
| 16528 | pmulhrsw m5, m7 |
| 16529 | packuswb m3, m5 |
| 16530 | movu [r0 + 320 * 16], m3 |
| 16531 | pmaddubsw m3, m0, m6 |
| 16532 | pmulhrsw m3, m7 |
| 16533 | pmaddubsw m5, m4, m6 |
| 16534 | pmulhrsw m5, m7 |
| 16535 | packuswb m3, m5 |
| 16536 | movu [r0 + 321 * 16], m3 |
| 16537 | |
| 16538 | ; mode 7 [row 1] |
| 16539 | movu m6, [r5 + 18 * 16] |
| 16540 | pmaddubsw m3, m1, m6 |
| 16541 | pmulhrsw m3, m7 |
| 16542 | pmaddubsw m5, m2, m6 |
| 16543 | pmulhrsw m5, m7 |
| 16544 | packuswb m3, m5 |
| 16545 | movu [r0 + 322 * 16], m3 |
| 16546 | |
| 16547 | ; mode 9 [row 8 - first half] |
| 16548 | movu [r0 + 464 * 16], m3 |
| 16549 | |
| 16550 | pmaddubsw m3, m0, m6 |
| 16551 | pmulhrsw m3, m7 |
| 16552 | pmaddubsw m5, m4, m6 |
| 16553 | pmulhrsw m5, m7 |
| 16554 | packuswb m3, m5 |
| 16555 | movu [r0 + 323 * 16], m3 |
| 16556 | |
| 16557 | ; mode 9 [row 8 - second half] |
| 16558 | movu [r0 + 465 * 16], m3 |
| 16559 | |
| 16560 | ; mode 7 [row 2] |
| 16561 | movu m6, [r5 + 27 * 16] |
| 16562 | pmaddubsw m3, m1, m6 |
| 16563 | pmulhrsw m3, m7 |
| 16564 | pmaddubsw m5, m2, m6 |
| 16565 | pmulhrsw m5, m7 |
| 16566 | packuswb m3, m5 |
| 16567 | movu [r0 + 324 * 16], m3 |
| 16568 | pmaddubsw m3, m0, m6 |
| 16569 | pmulhrsw m3, m7 |
| 16570 | pmaddubsw m5, m4, m6 |
| 16571 | pmulhrsw m5, m7 |
| 16572 | packuswb m3, m5 |
| 16573 | movu [r0 + 325 * 16], m3 |
| 16574 | |
| 16575 | ; mode 8 [row 0] |
| 16576 | movu m6, [r5 + 5 * 16] |
| 16577 | pmaddubsw m3, m1, m6 |
| 16578 | pmulhrsw m3, m7 |
| 16579 | pmaddubsw m5, m2, m6 |
| 16580 | pmulhrsw m5, m7 |
| 16581 | packuswb m3, m5 |
| 16582 | movu [r0 + 384 * 16], m3 |
| 16583 | pmaddubsw m3, m0, m6 |
| 16584 | pmulhrsw m3, m7 |
| 16585 | pmaddubsw m5, m4, m6 |
| 16586 | pmulhrsw m5, m7 |
| 16587 | packuswb m3, m5 |
| 16588 | movu [r0 + 385 * 16], m3 |
| 16589 | |
| 16590 | ; mode 8 [row 1] |
| 16591 | movu m6, [r5 + 10 * 16] |
| 16592 | pmaddubsw m3, m1, m6 |
| 16593 | pmulhrsw m3, m7 |
| 16594 | pmaddubsw m5, m2, m6 |
| 16595 | pmulhrsw m5, m7 |
| 16596 | packuswb m3, m5 |
| 16597 | movu [r0 + 386 * 16], m3 |
| 16598 | |
| 16599 | ; mode 9 [row 4 - first half] |
| 16600 | movu [r0 + 456 * 16], m3 |
| 16601 | |
| 16602 | pmaddubsw m3, m0, m6 |
| 16603 | pmulhrsw m3, m7 |
| 16604 | pmaddubsw m5, m4, m6 |
| 16605 | pmulhrsw m5, m7 |
| 16606 | packuswb m3, m5 |
| 16607 | movu [r0 + 387 * 16], m3 |
| 16608 | |
| 16609 | ; mode 9 [row 4 - second half] |
| 16610 | movu [r0 + 457 * 16], m3 |
| 16611 | |
| 16612 | ; mode 8 [row 2] |
| 16613 | movu m6, [r5 + 15 * 16] |
| 16614 | pmaddubsw m3, m1, m6 |
| 16615 | pmulhrsw m3, m7 |
| 16616 | pmaddubsw m5, m2, m6 |
| 16617 | pmulhrsw m5, m7 |
| 16618 | packuswb m3, m5 |
| 16619 | movu [r0 + 388 * 16], m3 |
| 16620 | pmaddubsw m3, m0, m6 |
| 16621 | pmulhrsw m3, m7 |
| 16622 | pmaddubsw m5, m4, m6 |
| 16623 | pmulhrsw m5, m7 |
| 16624 | packuswb m3, m5 |
| 16625 | movu [r0 + 389 * 16], m3 |
| 16626 | |
| 16627 | ; mode 8 [row 3] |
| 16628 | movu m6, [r5 + 20 * 16] |
| 16629 | pmaddubsw m3, m1, m6 |
| 16630 | pmulhrsw m3, m7 |
| 16631 | pmaddubsw m5, m2, m6 |
| 16632 | pmulhrsw m5, m7 |
| 16633 | packuswb m3, m5 |
| 16634 | movu [r0 + 390 * 16], m3 |
| 16635 | |
| 16636 | ; mode 9 [row 9 - first half] |
| 16637 | movu [r0 + 466 * 16], m3 |
| 16638 | |
| 16639 | pmaddubsw m3, m0, m6 |
| 16640 | pmulhrsw m3, m7 |
| 16641 | pmaddubsw m5, m4, m6 |
| 16642 | pmulhrsw m5, m7 |
| 16643 | packuswb m3, m5 |
| 16644 | movu [r0 + 391 * 16], m3 |
| 16645 | |
| 16646 | ; mode 9 [row 9 - second half] |
| 16647 | movu [r0 + 467 * 16], m3 |
| 16648 | |
| 16649 | ; mode 8 [row 4] |
| 16650 | movu m6, [r5 + 25 * 16] |
| 16651 | pmaddubsw m3, m1, m6 |
| 16652 | pmulhrsw m3, m7 |
| 16653 | pmaddubsw m5, m2, m6 |
| 16654 | pmulhrsw m5, m7 |
| 16655 | packuswb m3, m5 |
| 16656 | movu [r0 + 392 * 16], m3 |
| 16657 | pmaddubsw m3, m0, m6 |
| 16658 | pmulhrsw m3, m7 |
| 16659 | pmaddubsw m5, m4, m6 |
| 16660 | pmulhrsw m5, m7 |
| 16661 | packuswb m3, m5 |
| 16662 | movu [r0 + 393 * 16], m3 |
| 16663 | |
| 16664 | ; mode 8 [row 5] |
| 16665 | movu m6, [r5 + 30 * 16] |
| 16666 | pmaddubsw m3, m1, m6 |
| 16667 | pmulhrsw m3, m7 |
| 16668 | pmaddubsw m5, m2, m6 |
| 16669 | pmulhrsw m5, m7 |
| 16670 | packuswb m3, m5 |
| 16671 | movu [r0 + 394 * 16], m3 |
| 16672 | |
| 16673 | ; mode 9 [row 14 - first half] |
| 16674 | movu [r0 + 476 * 16], m3 |
| 16675 | |
| 16676 | pmaddubsw m3, m0, m6 |
| 16677 | pmulhrsw m3, m7 |
| 16678 | pmaddubsw m5, m4, m6 |
| 16679 | pmulhrsw m5, m7 |
| 16680 | packuswb m3, m5 |
| 16681 | movu [r0 + 395 * 16], m3 |
| 16682 | |
| 16683 | ; mode 9 [row 14 - second half] |
| 16684 | movu [r0 + 477 * 16], m3 |
| 16685 | |
| 16686 | ; mode 9 [row 0] |
| 16687 | movu m6, [r5 + 2 * 16] |
| 16688 | pmaddubsw m3, m1, m6 |
| 16689 | pmulhrsw m3, m7 |
| 16690 | pmaddubsw m5, m2, m6 |
| 16691 | pmulhrsw m5, m7 |
| 16692 | packuswb m3, m5 |
| 16693 | movu [r0 + 448 * 16], m3 |
| 16694 | pmaddubsw m3, m0, m6 |
| 16695 | pmulhrsw m3, m7 |
| 16696 | pmaddubsw m5, m4, m6 |
| 16697 | pmulhrsw m5, m7 |
| 16698 | packuswb m3, m5 |
| 16699 | movu [r0 + 449 * 16], m3 |
| 16700 | |
| 16701 | ; mode 9 [row 1] |
| 16702 | movu m6, [r5 + 4 * 16] |
| 16703 | pmaddubsw m3, m1, m6 |
| 16704 | pmulhrsw m3, m7 |
| 16705 | pmaddubsw m5, m2, m6 |
| 16706 | pmulhrsw m5, m7 |
| 16707 | packuswb m3, m5 |
| 16708 | movu [r0 + 450 * 16], m3 |
| 16709 | pmaddubsw m3, m0, m6 |
| 16710 | pmulhrsw m3, m7 |
| 16711 | pmaddubsw m5, m4, m6 |
| 16712 | pmulhrsw m5, m7 |
| 16713 | packuswb m3, m5 |
| 16714 | movu [r0 + 451 * 16], m3 |
| 16715 | |
| 16716 | ; mode 9 [row 2] |
| 16717 | movu m6, [r5 + 6 * 16] |
| 16718 | pmaddubsw m3, m1, m6 |
| 16719 | pmulhrsw m3, m7 |
| 16720 | pmaddubsw m5, m2, m6 |
| 16721 | pmulhrsw m5, m7 |
| 16722 | packuswb m3, m5 |
| 16723 | movu [r0 + 452 * 16], m3 |
| 16724 | pmaddubsw m3, m0, m6 |
| 16725 | pmulhrsw m3, m7 |
| 16726 | pmaddubsw m5, m4, m6 |
| 16727 | pmulhrsw m5, m7 |
| 16728 | packuswb m3, m5 |
| 16729 | movu [r0 + 453 * 16], m3 |
| 16730 | |
| 16731 | ; mode 9 [row 3] |
| 16732 | movu m6, [r5 + 8 * 16] |
| 16733 | pmaddubsw m3, m1, m6 |
| 16734 | pmulhrsw m3, m7 |
| 16735 | pmaddubsw m5, m2, m6 |
| 16736 | pmulhrsw m5, m7 |
| 16737 | packuswb m3, m5 |
| 16738 | movu [r0 + 454 * 16], m3 |
| 16739 | pmaddubsw m3, m0, m6 |
| 16740 | pmulhrsw m3, m7 |
| 16741 | pmaddubsw m5, m4, m6 |
| 16742 | pmulhrsw m5, m7 |
| 16743 | packuswb m3, m5 |
| 16744 | movu [r0 + 455 * 16], m3 |
| 16745 | |
| 16746 | ; mode 9 [row 5] |
| 16747 | movu m6, [r5 + 12 * 16] |
| 16748 | pmaddubsw m3, m1, m6 |
| 16749 | pmulhrsw m3, m7 |
| 16750 | pmaddubsw m5, m2, m6 |
| 16751 | pmulhrsw m5, m7 |
| 16752 | packuswb m3, m5 |
| 16753 | movu [r0 + 458 * 16], m3 |
| 16754 | pmaddubsw m3, m0, m6 |
| 16755 | pmulhrsw m3, m7 |
| 16756 | pmaddubsw m5, m4, m6 |
| 16757 | pmulhrsw m5, m7 |
| 16758 | packuswb m3, m5 |
| 16759 | movu [r0 + 459 * 16], m3 |
| 16760 | |
| 16761 | ; mode 9 [row 6] |
| 16762 | movu m6, [r5 + 14 * 16] |
| 16763 | pmaddubsw m3, m1, m6 |
| 16764 | pmulhrsw m3, m7 |
| 16765 | pmaddubsw m5, m2, m6 |
| 16766 | pmulhrsw m5, m7 |
| 16767 | packuswb m3, m5 |
| 16768 | movu [r0 + 460 * 16], m3 |
| 16769 | pmaddubsw m3, m0, m6 |
| 16770 | pmulhrsw m3, m7 |
| 16771 | pmaddubsw m5, m4, m6 |
| 16772 | pmulhrsw m5, m7 |
| 16773 | packuswb m3, m5 |
| 16774 | movu [r0 + 461 * 16], m3 |
| 16775 | |
| 16776 | ; mode 9 [row 7] |
| 16777 | movu m6, [r5 + 16 * 16] |
| 16778 | pmaddubsw m3, m1, m6 |
| 16779 | pmulhrsw m3, m7 |
| 16780 | pmaddubsw m5, m2, m6 |
| 16781 | pmulhrsw m5, m7 |
| 16782 | packuswb m3, m5 |
| 16783 | movu [r0 + 462 * 16], m3 |
| 16784 | pmaddubsw m3, m0, m6 |
| 16785 | pmulhrsw m3, m7 |
| 16786 | pmaddubsw m5, m4, m6 |
| 16787 | pmulhrsw m5, m7 |
| 16788 | packuswb m3, m5 |
| 16789 | movu [r0 + 463 * 16], m3 |
| 16790 | |
| 16791 | ; mode 9 [row 10] |
| 16792 | movu m6, [r5 + 22 * 16] |
| 16793 | pmaddubsw m3, m1, m6 |
| 16794 | pmulhrsw m3, m7 |
| 16795 | pmaddubsw m5, m2, m6 |
| 16796 | pmulhrsw m5, m7 |
| 16797 | packuswb m3, m5 |
| 16798 | movu [r0 + 468 * 16], m3 |
| 16799 | pmaddubsw m3, m0, m6 |
| 16800 | pmulhrsw m3, m7 |
| 16801 | pmaddubsw m5, m4, m6 |
| 16802 | pmulhrsw m5, m7 |
| 16803 | packuswb m3, m5 |
| 16804 | movu [r0 + 469 * 16], m3 |
| 16805 | |
| 16806 | ; mode 9 [row 11] |
| 16807 | movu m6, [r5 + 24 * 16] |
| 16808 | pmaddubsw m3, m1, m6 |
| 16809 | pmulhrsw m3, m7 |
| 16810 | pmaddubsw m5, m2, m6 |
| 16811 | pmulhrsw m5, m7 |
| 16812 | packuswb m3, m5 |
| 16813 | movu [r0 + 470 * 16], m3 |
| 16814 | pmaddubsw m3, m0, m6 |
| 16815 | pmulhrsw m3, m7 |
| 16816 | pmaddubsw m5, m4, m6 |
| 16817 | pmulhrsw m5, m7 |
| 16818 | packuswb m3, m5 |
| 16819 | movu [r0 + 471 * 16], m3 |
| 16820 | |
| 16821 | ; mode 9 [row 13] |
| 16822 | movu m6, [r5 + 28 * 16] |
| 16823 | pmaddubsw m3, m1, m6 |
| 16824 | pmulhrsw m3, m7 |
| 16825 | pmaddubsw m5, m2, m6 |
| 16826 | pmulhrsw m5, m7 |
| 16827 | packuswb m3, m5 |
| 16828 | movu [r0 + 474 * 16], m3 |
| 16829 | pmaddubsw m3, m0, m6 |
| 16830 | pmulhrsw m3, m7 |
| 16831 | pmaddubsw m5, m4, m6 |
| 16832 | pmulhrsw m5, m7 |
| 16833 | packuswb m3, m5 |
| 16834 | movu [r0 + 475 * 16], m3 |
| 16835 | |
| 16836 | ; mode 3 [row 1] |
| 16837 | movu m6, [r5 + 20 * 16] |
| 16838 | movu m0, [r4 + 2] |
| 16839 | movd m1, [r4 + 3] |
| 16840 | palignr m1, m0, 1 |
| 16841 | punpcklbw m0, m1 |
| 16842 | pmaddubsw m1, m0, m6 |
| 16843 | pmulhrsw m1, m7 |
| 16844 | movu m2, [r4 + 10] |
| 16845 | movd m3, [r4 + 11] |
| 16846 | palignr m3, m2, 1 |
| 16847 | punpcklbw m2, m3 |
| 16848 | pmaddubsw m3, m2, m6 |
| 16849 | pmulhrsw m3, m7 |
| 16850 | packuswb m1, m3 |
| 16851 | movu [r0 + 66 * 16], m1 |
| 16852 | |
| 16853 | ; mode 6 [row 3 - first half] |
| 16854 | movu [r0 + 262 * 16], m1 |
| 16855 | |
| 16856 | ; mode 9 [row 25 - first half] |
| 16857 | movu [r0 + 498 * 16], m1 |
| 16858 | |
| 16859 | movu m1, [r4 + 18] |
| 16860 | movd m3, [r4 + 19] |
| 16861 | palignr m3, m1, 1 |
| 16862 | punpcklbw m1, m3 |
| 16863 | pmaddubsw m3, m1, m6 |
| 16864 | pmulhrsw m3, m7 |
| 16865 | movu m4, [r4 + 26] |
| 16866 | movd m5, [r4 + 27] |
| 16867 | palignr m5, m4, 1 |
| 16868 | punpcklbw m4, m5 |
| 16869 | pmaddubsw m5, m4, m6 |
| 16870 | pmulhrsw m5, m7 |
| 16871 | packuswb m3, m5 |
| 16872 | movu [r0 + 67 * 16], m3 |
| 16873 | |
| 16874 | ; mode 6 [row 3 - second half] |
| 16875 | movu [r0 + 263 * 16], m3 |
| 16876 | |
| 16877 | ; mode 9 [row 25 - second half] |
| 16878 | movu [r0 + 499 * 16], m3 |
| 16879 | |
| 16880 | ; mode 4 [row 1] |
| 16881 | movu m6, [r5 + 10 * 16] |
| 16882 | pmaddubsw m3, m0, m6 |
| 16883 | pmulhrsw m3, m7 |
| 16884 | pmaddubsw m5, m2, m6 |
| 16885 | pmulhrsw m5, m7 |
| 16886 | packuswb m3, m5 |
| 16887 | movu [r0 + 130 * 16], m3 |
| 16888 | |
| 16889 | ; mode 9 [row 20 - first half] |
| 16890 | movu [r0 + 488 * 16], m3 |
| 16891 | |
| 16892 | pmaddubsw m3, m1, m6 |
| 16893 | pmulhrsw m3, m7 |
| 16894 | pmaddubsw m5, m4, m6 |
| 16895 | pmulhrsw m5, m7 |
| 16896 | packuswb m3, m5 |
| 16897 | movu [r0 + 131 * 16], m3 |
| 16898 | |
| 16899 | ; mode 9 [row 20 - second half] |
| 16900 | movu [r0 + 489 * 16], m3 |
| 16901 | |
| 16902 | ; mode 4 [row 2] |
| 16903 | movu m6, [r5 + 31 * 16] |
| 16904 | pmaddubsw m3, m0, m6 |
| 16905 | pmulhrsw m3, m7 |
| 16906 | pmaddubsw m5, m2, m6 |
| 16907 | pmulhrsw m5, m7 |
| 16908 | packuswb m3, m5 |
| 16909 | movu [r0 + 132 * 16], m3 |
| 16910 | |
| 16911 | ; mode 7 [row 6 - first half] |
| 16912 | movu [r0 + 332 * 16], m3 |
| 16913 | |
| 16914 | pmaddubsw m3, m1, m6 |
| 16915 | pmulhrsw m3, m7 |
| 16916 | pmaddubsw m5, m4, m6 |
| 16917 | pmulhrsw m5, m7 |
| 16918 | packuswb m3, m5 |
| 16919 | movu [r0 + 133 * 16], m3 |
| 16920 | |
| 16921 | ; mode 7 [row 6 - second half] |
| 16922 | movu [r0 + 333 * 16], m3 |
| 16923 | |
| 16924 | ; mode 5 [row 1] |
| 16925 | movu m6, [r5 + 2 * 16] |
| 16926 | pmaddubsw m3, m0, m6 |
| 16927 | pmulhrsw m3, m7 |
| 16928 | pmaddubsw m5, m2, m6 |
| 16929 | pmulhrsw m5, m7 |
| 16930 | packuswb m3, m5 |
| 16931 | movu [r0 + 194 * 16], m3 |
| 16932 | |
| 16933 | ; mode 9 [row 16 - first half] |
| 16934 | movu [r0 + 480 * 16], m3 |
| 16935 | |
| 16936 | pmaddubsw m3, m1, m6 |
| 16937 | pmulhrsw m3, m7 |
| 16938 | pmaddubsw m5, m4, m6 |
| 16939 | pmulhrsw m5, m7 |
| 16940 | packuswb m3, m5 |
| 16941 | movu [r0 + 195 * 16], m3 |
| 16942 | |
| 16943 | ; mode 9 [row 16 - second half] |
| 16944 | movu [r0 + 481 * 16], m3 |
| 16945 | |
| 16946 | ; mode 5 [row 2] |
| 16947 | movu m6, [r5 + 19 * 16] |
| 16948 | pmaddubsw m3, m0, m6 |
| 16949 | pmulhrsw m3, m7 |
| 16950 | pmaddubsw m5, m2, m6 |
| 16951 | pmulhrsw m5, m7 |
| 16952 | packuswb m3, m5 |
| 16953 | movu [r0 + 196 * 16], m3 |
| 16954 | pmaddubsw m3, m1, m6 |
| 16955 | pmulhrsw m3, m7 |
| 16956 | pmaddubsw m5, m4, m6 |
| 16957 | pmulhrsw m5, m7 |
| 16958 | packuswb m3, m5 |
| 16959 | movu [r0 + 197 * 16], m3 |
| 16960 | |
| 16961 | ; mode 6 [row 2] |
| 16962 | movu m6, [r5 + 7 * 16] |
| 16963 | pmaddubsw m3, m0, m6 |
| 16964 | pmulhrsw m3, m7 |
| 16965 | pmaddubsw m5, m2, m6 |
| 16966 | pmulhrsw m5, m7 |
| 16967 | packuswb m3, m5 |
| 16968 | movu [r0 + 260 * 16], m3 |
| 16969 | pmaddubsw m3, m1, m6 |
| 16970 | pmulhrsw m3, m7 |
| 16971 | pmaddubsw m5, m4, m6 |
| 16972 | pmulhrsw m5, m7 |
| 16973 | packuswb m3, m5 |
| 16974 | movu [r0 + 261 * 16], m3 |
| 16975 | |
| 16976 | ; mode 7 [row 3] |
| 16977 | movu m6, [r5 + 4 * 16] |
| 16978 | pmaddubsw m3, m0, m6 |
| 16979 | pmulhrsw m3, m7 |
| 16980 | pmaddubsw m5, m2, m6 |
| 16981 | pmulhrsw m5, m7 |
| 16982 | packuswb m3, m5 |
| 16983 | movu [r0 + 326 * 16], m3 |
| 16984 | |
| 16985 | ; mode 9 [row 17 - first half] |
| 16986 | movu [r0 + 482 * 16], m3 |
| 16987 | |
| 16988 | pmaddubsw m3, m1, m6 |
| 16989 | pmulhrsw m3, m7 |
| 16990 | pmaddubsw m5, m4, m6 |
| 16991 | pmulhrsw m5, m7 |
| 16992 | packuswb m3, m5 |
| 16993 | movu [r0 + 327 * 16], m3 |
| 16994 | |
| 16995 | ; mode 9 [row 17 - second half] |
| 16996 | movu [r0 + 483 * 16], m3 |
| 16997 | |
| 16998 | ; mode 7 [row 4] |
| 16999 | movu m6, [r5 + 13 * 16] |
| 17000 | pmaddubsw m3, m0, m6 |
| 17001 | pmulhrsw m3, m7 |
| 17002 | pmaddubsw m5, m2, m6 |
| 17003 | pmulhrsw m5, m7 |
| 17004 | packuswb m3, m5 |
| 17005 | movu [r0 + 328 * 16], m3 |
| 17006 | |
| 17007 | ; mode 8 [row 8 - first half] |
| 17008 | movu [r0 + 400 * 16], m3 |
| 17009 | |
| 17010 | pmaddubsw m3, m1, m6 |
| 17011 | pmulhrsw m3, m7 |
| 17012 | pmaddubsw m5, m4, m6 |
| 17013 | pmulhrsw m5, m7 |
| 17014 | packuswb m3, m5 |
| 17015 | movu [r0 + 329 * 16], m3 |
| 17016 | |
| 17017 | ; mode 8 [row 8 - second half] |
| 17018 | movu [r0 + 401 * 16], m3 |
| 17019 | |
| 17020 | ; mode 7 [row 5] |
| 17021 | movu m6, [r5 + 22 * 16] |
| 17022 | pmaddubsw m3, m0, m6 |
| 17023 | pmulhrsw m3, m7 |
| 17024 | pmaddubsw m5, m2, m6 |
| 17025 | pmulhrsw m5, m7 |
| 17026 | packuswb m3, m5 |
| 17027 | movu [r0 + 330 * 16], m3 |
| 17028 | |
| 17029 | ; mode 9 [row 26 - first half] |
| 17030 | movu [r0 + 500 * 16], m3 |
| 17031 | |
| 17032 | pmaddubsw m3, m1, m6 |
| 17033 | pmulhrsw m3, m7 |
| 17034 | pmaddubsw m5, m4, m6 |
| 17035 | pmulhrsw m5, m7 |
| 17036 | packuswb m3, m5 |
| 17037 | movu [r0 + 331 * 16], m3 |
| 17038 | |
| 17039 | ; mode 9 [row 26 - second half] |
| 17040 | movu [r0 + 501 * 16], m3 |
| 17041 | |
| 17042 | ; mode 8 [row 6] |
| 17043 | movu m6, [r5 + 3 * 16] |
| 17044 | pmaddubsw m3, m0, m6 |
| 17045 | pmulhrsw m3, m7 |
| 17046 | pmaddubsw m5, m2, m6 |
| 17047 | pmulhrsw m5, m7 |
| 17048 | packuswb m3, m5 |
| 17049 | movu [r0 + 396 * 16], m3 |
| 17050 | pmaddubsw m3, m1, m6 |
| 17051 | pmulhrsw m3, m7 |
| 17052 | pmaddubsw m5, m4, m6 |
| 17053 | pmulhrsw m5, m7 |
| 17054 | packuswb m3, m5 |
| 17055 | movu [r0 + 397 * 16], m3 |
| 17056 | |
| 17057 | ; mode 9 [row 18] |
| 17058 | movu m6, [r5 + 6 * 16] |
| 17059 | pmaddubsw m3, m0, m6 |
| 17060 | pmulhrsw m3, m7 |
| 17061 | pmaddubsw m5, m2, m6 |
| 17062 | pmulhrsw m5, m7 |
| 17063 | packuswb m3, m5 |
| 17064 | movu [r0 + 484 * 16], m3 |
| 17065 | pmaddubsw m3, m1, m6 |
| 17066 | pmulhrsw m3, m7 |
| 17067 | pmaddubsw m5, m4, m6 |
| 17068 | pmulhrsw m5, m7 |
| 17069 | packuswb m3, m5 |
| 17070 | movu [r0 + 485 * 16], m3 |
| 17071 | |
| 17072 | ; mode 9 [row 21] |
| 17073 | movu m6, [r5 + 12 * 16] |
| 17074 | pmaddubsw m3, m0, m6 |
| 17075 | pmulhrsw m3, m7 |
| 17076 | pmaddubsw m5, m2, m6 |
| 17077 | pmulhrsw m5, m7 |
| 17078 | packuswb m3, m5 |
| 17079 | movu [r0 + 490 * 16], m3 |
| 17080 | pmaddubsw m3, m1, m6 |
| 17081 | pmulhrsw m3, m7 |
| 17082 | pmaddubsw m5, m4, m6 |
| 17083 | pmulhrsw m5, m7 |
| 17084 | packuswb m3, m5 |
| 17085 | movu [r0 + 491 * 16], m3 |
| 17086 | |
| 17087 | ; mode 9 [row 22] |
| 17088 | movu m6, [r5 + 14 * 16] |
| 17089 | pmaddubsw m3, m0, m6 |
| 17090 | pmulhrsw m3, m7 |
| 17091 | pmaddubsw m5, m2, m6 |
| 17092 | pmulhrsw m5, m7 |
| 17093 | packuswb m3, m5 |
| 17094 | movu [r0 + 492 * 16], m3 |
| 17095 | pmaddubsw m3, m1, m6 |
| 17096 | pmulhrsw m3, m7 |
| 17097 | pmaddubsw m5, m4, m6 |
| 17098 | pmulhrsw m5, m7 |
| 17099 | packuswb m3, m5 |
| 17100 | movu [r0 + 493 * 16], m3 |
| 17101 | |
| 17102 | ; mode 9 [row 23] |
| 17103 | movu m6, [r5 + 16 * 16] |
| 17104 | pmaddubsw m3, m0, m6 |
| 17105 | pmulhrsw m3, m7 |
| 17106 | pmaddubsw m5, m2, m6 |
| 17107 | pmulhrsw m5, m7 |
| 17108 | packuswb m3, m5 |
| 17109 | movu [r0 + 494 * 16], m3 |
| 17110 | pmaddubsw m3, m1, m6 |
| 17111 | pmulhrsw m3, m7 |
| 17112 | pmaddubsw m5, m4, m6 |
| 17113 | pmulhrsw m5, m7 |
| 17114 | packuswb m3, m5 |
| 17115 | movu [r0 + 495 * 16], m3 |
| 17116 | |
| 17117 | ; mode 9 [row 27] |
| 17118 | movu m6, [r5 + 24 * 16] |
| 17119 | pmaddubsw m3, m0, m6 |
| 17120 | pmulhrsw m3, m7 |
| 17121 | pmaddubsw m5, m2, m6 |
| 17122 | pmulhrsw m5, m7 |
| 17123 | packuswb m3, m5 |
| 17124 | movu [r0 + 502 * 16], m3 |
| 17125 | pmaddubsw m3, m1, m6 |
| 17126 | pmulhrsw m3, m7 |
| 17127 | pmaddubsw m5, m4, m6 |
| 17128 | pmulhrsw m5, m7 |
| 17129 | packuswb m3, m5 |
| 17130 | movu [r0 + 503 * 16], m3 |
| 17131 | |
| 17132 | ; mode 9 [row 28] |
| 17133 | movu m6, [r5 + 26 * 16] |
| 17134 | pmaddubsw m3, m0, m6 |
| 17135 | pmulhrsw m3, m7 |
| 17136 | pmaddubsw m5, m2, m6 |
| 17137 | pmulhrsw m5, m7 |
| 17138 | packuswb m3, m5 |
| 17139 | movu [r0 + 504 * 16], m3 |
| 17140 | pmaddubsw m3, m1, m6 |
| 17141 | pmulhrsw m3, m7 |
| 17142 | pmaddubsw m5, m4, m6 |
| 17143 | pmulhrsw m5, m7 |
| 17144 | packuswb m3, m5 |
| 17145 | movu [r0 + 505 * 16], m3 |
| 17146 | |
| 17147 | ; mode 9 [row 30] |
| 17148 | movu m6, [r5 + 30 * 16] |
| 17149 | pmaddubsw m3, m0, m6 |
| 17150 | pmulhrsw m3, m7 |
| 17151 | pmaddubsw m5, m2, m6 |
| 17152 | pmulhrsw m5, m7 |
| 17153 | packuswb m3, m5 |
| 17154 | movu [r0 + 508 * 16], m3 |
| 17155 | pmaddubsw m3, m1, m6 |
| 17156 | pmulhrsw m3, m7 |
| 17157 | pmaddubsw m5, m4, m6 |
| 17158 | pmulhrsw m5, m7 |
| 17159 | packuswb m3, m5 |
| 17160 | movu [r0 + 509 * 16], m3 |
| 17161 | |
| 17162 | ; mode 8 [row 7] |
| 17163 | movu m6, [r5 + 8 * 16] |
| 17164 | pmaddubsw m3, m0, m6 |
| 17165 | pmulhrsw m3, m7 |
| 17166 | pmaddubsw m5, m2, m6 |
| 17167 | pmulhrsw m5, m7 |
| 17168 | packuswb m3, m5 |
| 17169 | movu [r0 + 398 * 16], m3 |
| 17170 | |
| 17171 | ; mode 9 [row 19 - first half] |
| 17172 | movu [r0 + 486 * 16], m3 |
| 17173 | |
| 17174 | pmaddubsw m3, m1, m6 |
| 17175 | pmulhrsw m3, m7 |
| 17176 | pmaddubsw m5, m4, m6 |
| 17177 | pmulhrsw m5, m7 |
| 17178 | packuswb m3, m5 |
| 17179 | movu [r0 + 399 * 16], m3 |
| 17180 | |
| 17181 | ; mode 9 [row 19 - second half] |
| 17182 | movu [r0 + 487 * 16], m3 |
| 17183 | |
| 17184 | ; mode 8 [row 9] |
| 17185 | movu m6, [r5 + 18 * 16] |
| 17186 | pmaddubsw m3, m0, m6 |
| 17187 | pmulhrsw m3, m7 |
| 17188 | pmaddubsw m5, m2, m6 |
| 17189 | pmulhrsw m5, m7 |
| 17190 | packuswb m3, m5 |
| 17191 | movu [r0 + 402 * 16], m3 |
| 17192 | |
| 17193 | ; mode 9 [row 24 - first half] |
| 17194 | movu [r0 + 496 * 16], m3 |
| 17195 | |
| 17196 | pmaddubsw m3, m1, m6 |
| 17197 | pmulhrsw m3, m7 |
| 17198 | pmaddubsw m5, m4, m6 |
| 17199 | pmulhrsw m5, m7 |
| 17200 | packuswb m3, m5 |
| 17201 | movu [r0 + 403 * 16], m3 |
| 17202 | |
| 17203 | ; mode 9 [row 24 - second half] |
| 17204 | movu [r0 + 497 * 16], m3 |
| 17205 | |
| 17206 | ; mode 8 [row 10] |
| 17207 | movu m6, [r5 + 23 * 16] |
| 17208 | pmaddubsw m3, m0, m6 |
| 17209 | pmulhrsw m3, m7 |
| 17210 | pmaddubsw m5, m2, m6 |
| 17211 | pmulhrsw m5, m7 |
| 17212 | packuswb m3, m5 |
| 17213 | movu [r0 + 404 * 16], m3 |
| 17214 | pmaddubsw m3, m1, m6 |
| 17215 | pmulhrsw m3, m7 |
| 17216 | pmaddubsw m5, m4, m6 |
| 17217 | pmulhrsw m5, m7 |
| 17218 | packuswb m3, m5 |
| 17219 | movu [r0 + 405 * 16], m3 |
| 17220 | |
| 17221 | ; mode 8 [row 11] |
| 17222 | movu m6, [r5 + 28 * 16] |
| 17223 | pmaddubsw m3, m0, m6 |
| 17224 | pmulhrsw m3, m7 |
| 17225 | pmaddubsw m5, m2, m6 |
| 17226 | pmulhrsw m5, m7 |
| 17227 | packuswb m3, m5 |
| 17228 | movu [r0 + 406 * 16], m3 |
| 17229 | |
| 17230 | ; mode 9 [row 29 - first half] |
| 17231 | movu [r0 + 506 * 16], m3 |
| 17232 | |
| 17233 | pmaddubsw m3, m1, m6 |
| 17234 | pmulhrsw m3, m7 |
| 17235 | pmaddubsw m5, m4, m6 |
| 17236 | pmulhrsw m5, m7 |
| 17237 | packuswb m3, m5 |
| 17238 | movu [r0 + 407 * 16], m3 |
| 17239 | |
| 17240 | ; mode 9 [row 29 - second half] |
| 17241 | movu [r0 + 507 * 16], m3 |
| 17242 | |
| 17243 | ; mode 3 [row 2] |
| 17244 | movu m6, [r5 + 14 * 16] |
| 17245 | movu m0, [r4 + 3] |
| 17246 | movd m1, [r4 + 4] |
| 17247 | palignr m1, m0, 1 |
| 17248 | punpcklbw m0, m1 |
| 17249 | pmaddubsw m1, m0, m6 |
| 17250 | pmulhrsw m1, m7 |
| 17251 | movu m2, [r4 + 11] |
| 17252 | movd m3, [r4 + 12] |
| 17253 | palignr m3, m2, 1 |
| 17254 | punpcklbw m2, m3 |
| 17255 | pmaddubsw m3, m2, m6 |
| 17256 | pmulhrsw m3, m7 |
| 17257 | packuswb m1, m3 |
| 17258 | movu [r0 + 68 * 16], m1 |
| 17259 | |
| 17260 | ; mode 6 [row 5 - first half] |
| 17261 | movu [r0 + 266 * 16], m1 |
| 17262 | |
| 17263 | movu m1, [r4 + 19] |
| 17264 | movd m3, [r4 + 20] |
| 17265 | palignr m3, m1, 1 |
| 17266 | punpcklbw m1, m3 |
| 17267 | pmaddubsw m3, m1, m6 |
| 17268 | pmulhrsw m3, m7 |
| 17269 | movu m4, [r4 + 27] |
| 17270 | movd m5, [r4 + 28] |
| 17271 | palignr m5, m4, 1 |
| 17272 | punpcklbw m4, m5 |
| 17273 | pmaddubsw m5, m4, m6 |
| 17274 | pmulhrsw m5, m7 |
| 17275 | packuswb m3, m5 |
| 17276 | movu [r0 + 69 * 16], m3 |
| 17277 | |
| 17278 | ; mode 6 [row 5 - second half] |
| 17279 | movu [r0 + 267 * 16], m3 |
| 17280 | |
| 17281 | ; mode 4 [row 3] |
| 17282 | movu m6, [r5 + 20 * 16] |
| 17283 | pmaddubsw m3, m0, m6 |
| 17284 | pmulhrsw m3, m7 |
| 17285 | pmaddubsw m5, m2, m6 |
| 17286 | pmulhrsw m5, m7 |
| 17287 | packuswb m3, m5 |
| 17288 | movu [r0 + 134 * 16], m3 |
| 17289 | pmaddubsw m3, m1, m6 |
| 17290 | pmulhrsw m3, m7 |
| 17291 | pmaddubsw m5, m4, m6 |
| 17292 | pmulhrsw m5, m7 |
| 17293 | packuswb m3, m5 |
| 17294 | movu [r0 + 135 * 16], m3 |
| 17295 | |
| 17296 | ; mode 5 [row 3] |
| 17297 | movu m6, [r5 + 4 * 16] |
| 17298 | pmaddubsw m3, m0, m6 |
| 17299 | pmulhrsw m3, m7 |
| 17300 | pmaddubsw m5, m2, m6 |
| 17301 | pmulhrsw m5, m7 |
| 17302 | packuswb m3, m5 |
| 17303 | movu [r0 + 198 * 16], m3 |
| 17304 | pmaddubsw m3, m1, m6 |
| 17305 | pmulhrsw m3, m7 |
| 17306 | pmaddubsw m5, m4, m6 |
| 17307 | pmulhrsw m5, m7 |
| 17308 | packuswb m3, m5 |
| 17309 | movu [r0 + 199 * 16], m3 |
| 17310 | |
| 17311 | ; mode 5 [row 4] |
| 17312 | movu m6, [r5 + 21 * 16] |
| 17313 | pmaddubsw m3, m0, m6 |
| 17314 | pmulhrsw m3, m7 |
| 17315 | pmaddubsw m5, m2, m6 |
| 17316 | pmulhrsw m5, m7 |
| 17317 | packuswb m3, m5 |
| 17318 | movu [r0 + 200 * 16], m3 |
| 17319 | |
| 17320 | ; mode 8 [row 16 - first half] |
| 17321 | movu [r0 + 416 * 16], m3 |
| 17322 | |
| 17323 | pmaddubsw m3, m1, m6 |
| 17324 | pmulhrsw m3, m7 |
| 17325 | pmaddubsw m5, m4, m6 |
| 17326 | pmulhrsw m5, m7 |
| 17327 | packuswb m3, m5 |
| 17328 | movu [r0 + 201 * 16], m3 |
| 17329 | |
| 17330 | ; mode 8 [row 16 - second half] |
| 17331 | movu [r0 + 417 * 16], m3 |
| 17332 | |
| 17333 | ; mode 6 [row 4] |
| 17334 | movu m6, [r5 + 1 * 16] |
| 17335 | pmaddubsw m3, m0, m6 |
| 17336 | pmulhrsw m3, m7 |
| 17337 | pmaddubsw m5, m2, m6 |
| 17338 | pmulhrsw m5, m7 |
| 17339 | packuswb m3, m5 |
| 17340 | movu [r0 + 264 * 16], m3 |
| 17341 | |
| 17342 | ; mode 8 [row 12 - first half] |
| 17343 | movu [r0 + 408 * 16], m3 |
| 17344 | |
| 17345 | pmaddubsw m3, m1, m6 |
| 17346 | pmulhrsw m3, m7 |
| 17347 | pmaddubsw m5, m4, m6 |
| 17348 | pmulhrsw m5, m7 |
| 17349 | packuswb m3, m5 |
| 17350 | movu [r0 + 265 * 16], m3 |
| 17351 | |
| 17352 | ; mode 8 [row 12 - second half] |
| 17353 | movu [r0 + 409 * 16], m3 |
| 17354 | |
| 17355 | ; mode 6 [row 6] |
| 17356 | movu m6, [r5 + 27 * 16] |
| 17357 | pmaddubsw m3, m0, m6 |
| 17358 | pmulhrsw m3, m7 |
| 17359 | pmaddubsw m5, m2, m6 |
| 17360 | pmulhrsw m5, m7 |
| 17361 | packuswb m3, m5 |
| 17362 | movu [r0 + 268 * 16], m3 |
| 17363 | pmaddubsw m3, m1, m6 |
| 17364 | pmulhrsw m3, m7 |
| 17365 | pmaddubsw m5, m4, m6 |
| 17366 | pmulhrsw m5, m7 |
| 17367 | packuswb m3, m5 |
| 17368 | movu [r0 + 269 * 16], m3 |
| 17369 | |
| 17370 | ; mode 7 [row 7] |
| 17371 | movu m6, [r5 + 8 * 16] |
| 17372 | pmaddubsw m3, m0, m6 |
| 17373 | pmulhrsw m3, m7 |
| 17374 | pmaddubsw m5, m2, m6 |
| 17375 | pmulhrsw m5, m7 |
| 17376 | packuswb m3, m5 |
| 17377 | movu [r0 + 334 * 16], m3 |
| 17378 | pmaddubsw m3, m1, m6 |
| 17379 | pmulhrsw m3, m7 |
| 17380 | pmaddubsw m5, m4, m6 |
| 17381 | pmulhrsw m5, m7 |
| 17382 | packuswb m3, m5 |
| 17383 | movu [r0 + 335 * 16], m3 |
| 17384 | |
| 17385 | ; mode 7 [row 8] |
| 17386 | movu m6, [r5 + 17 * 16] |
| 17387 | pmaddubsw m3, m0, m6 |
| 17388 | pmulhrsw m3, m7 |
| 17389 | pmaddubsw m5, m2, m6 |
| 17390 | pmulhrsw m5, m7 |
| 17391 | packuswb m3, m5 |
| 17392 | movu [r0 + 336 * 16], m3 |
| 17393 | pmaddubsw m3, m1, m6 |
| 17394 | pmulhrsw m3, m7 |
| 17395 | pmaddubsw m5, m4, m6 |
| 17396 | pmulhrsw m5, m7 |
| 17397 | packuswb m3, m5 |
| 17398 | movu [r0 + 337 * 16], m3 |
| 17399 | |
| 17400 | ; mode 7 [row 9] |
| 17401 | movu m6, [r5 + 26 * 16] |
| 17402 | pmaddubsw m3, m0, m6 |
| 17403 | pmulhrsw m3, m7 |
| 17404 | pmaddubsw m5, m2, m6 |
| 17405 | pmulhrsw m5, m7 |
| 17406 | packuswb m3, m5 |
| 17407 | movu [r0 + 338 * 16], m3 |
| 17408 | |
| 17409 | ; mode 8 [row 17 - first half] |
| 17410 | movu [r0 + 418 * 16], m3 |
| 17411 | |
| 17412 | pmaddubsw m3, m1, m6 |
| 17413 | pmulhrsw m3, m7 |
| 17414 | pmaddubsw m5, m4, m6 |
| 17415 | pmulhrsw m5, m7 |
| 17416 | packuswb m3, m5 |
| 17417 | movu [r0 + 339 * 16], m3 |
| 17418 | |
| 17419 | ; mode 8 [row 17 - second half] |
| 17420 | movu [r0 + 419 * 16], m3 |
| 17421 | |
| 17422 | ; mode 8 [row 13] |
| 17423 | movu m6, [r5 + 6 * 16] |
| 17424 | pmaddubsw m3, m0, m6 |
| 17425 | pmulhrsw m3, m7 |
| 17426 | pmaddubsw m5, m2, m6 |
| 17427 | pmulhrsw m5, m7 |
| 17428 | packuswb m3, m5 |
| 17429 | movu [r0 + 410 * 16], m3 |
| 17430 | pmaddubsw m3, m1, m6 |
| 17431 | pmulhrsw m3, m7 |
| 17432 | pmaddubsw m5, m4, m6 |
| 17433 | pmulhrsw m5, m7 |
| 17434 | packuswb m3, m5 |
| 17435 | movu [r0 + 411 * 16], m3 |
| 17436 | |
| 17437 | ; mode 8 [row 14] |
| 17438 | movu m6, [r5 + 11 * 16] |
| 17439 | pmaddubsw m3, m0, m6 |
| 17440 | pmulhrsw m3, m7 |
| 17441 | pmaddubsw m5, m2, m6 |
| 17442 | pmulhrsw m5, m7 |
| 17443 | packuswb m3, m5 |
| 17444 | movu [r0 + 412 * 16], m3 |
| 17445 | pmaddubsw m3, m1, m6 |
| 17446 | pmulhrsw m3, m7 |
| 17447 | pmaddubsw m5, m4, m6 |
| 17448 | pmulhrsw m5, m7 |
| 17449 | packuswb m3, m5 |
| 17450 | movu [r0 + 413 * 16], m3 |
| 17451 | |
| 17452 | ; mode 8 [row 15] |
| 17453 | movu m6, [r5 + 16 * 16] |
| 17454 | pmaddubsw m3, m0, m6 |
| 17455 | pmulhrsw m3, m7 |
| 17456 | pmaddubsw m5, m2, m6 |
| 17457 | pmulhrsw m5, m7 |
| 17458 | packuswb m3, m5 |
| 17459 | movu [r0 + 414 * 16], m3 |
| 17460 | pmaddubsw m3, m1, m6 |
| 17461 | pmulhrsw m3, m7 |
| 17462 | pmaddubsw m5, m4, m6 |
| 17463 | pmulhrsw m5, m7 |
| 17464 | packuswb m3, m5 |
| 17465 | movu [r0 + 415 * 16], m3 |
| 17466 | |
| 17467 | ; mode 8 [row 18] |
| 17468 | movu m6, [r5 + 31 * 16] |
| 17469 | pmaddubsw m3, m0, m6 |
| 17470 | pmulhrsw m3, m7 |
| 17471 | pmaddubsw m5, m2, m6 |
| 17472 | pmulhrsw m5, m7 |
| 17473 | packuswb m3, m5 |
| 17474 | movu [r0 + 420 * 16], m3 |
| 17475 | pmaddubsw m3, m1, m6 |
| 17476 | pmulhrsw m3, m7 |
| 17477 | pmaddubsw m5, m4, m6 |
| 17478 | pmulhrsw m5, m7 |
| 17479 | packuswb m3, m5 |
| 17480 | movu [r0 + 421 * 16], m3 |
| 17481 | |
| 17482 | ; mode 3 [row 3] |
| 17483 | movu m6, [r5 + 8 * 16] |
| 17484 | movu m0, [r4 + 4] |
| 17485 | movd m1, [r4 + 5] |
| 17486 | palignr m1, m0, 1 |
| 17487 | punpcklbw m0, m1 |
| 17488 | pmaddubsw m1, m0, m6 |
| 17489 | pmulhrsw m1, m7 |
| 17490 | movu m2, [r4 + 12] |
| 17491 | movd m3, [r4 + 13] |
| 17492 | palignr m3, m2, 1 |
| 17493 | punpcklbw m2, m3 |
| 17494 | pmaddubsw m3, m2, m6 |
| 17495 | pmulhrsw m3, m7 |
| 17496 | packuswb m1, m3 |
| 17497 | movu [r0 + 70 * 16], m1 |
| 17498 | |
| 17499 | ; mode 6 [row 7 - first half] |
| 17500 | movu [r0 + 270 * 16], m1 |
| 17501 | |
| 17502 | movu m1, [r4 + 20] |
| 17503 | movd m3, [r4 + 21] |
| 17504 | palignr m3, m1, 1 |
| 17505 | punpcklbw m1, m3 |
| 17506 | pmaddubsw m3, m1, m6 |
| 17507 | pmulhrsw m3, m7 |
| 17508 | movu m4, [r4 + 28] |
| 17509 | movd m5, [r4 + 29] |
| 17510 | palignr m5, m4, 1 |
| 17511 | punpcklbw m4, m5 |
| 17512 | pmaddubsw m5, m4, m6 |
| 17513 | pmulhrsw m5, m7 |
| 17514 | packuswb m3, m5 |
| 17515 | movu [r0 + 71 * 16], m3 |
| 17516 | |
| 17517 | ; mode 6 [row 7 - second half] |
| 17518 | movu [r0 + 271 * 16], m3 |
| 17519 | |
| 17520 | ; mode 4 [row 4] |
| 17521 | movu m6, [r5 + 9 * 16] |
| 17522 | pmaddubsw m3, m0, m6 |
| 17523 | pmulhrsw m3, m7 |
| 17524 | pmaddubsw m5, m2, m6 |
| 17525 | pmulhrsw m5, m7 |
| 17526 | packuswb m3, m5 |
| 17527 | movu [r0 + 136 * 16], m3 |
| 17528 | |
| 17529 | ; mode 8 [row 20 - first half] |
| 17530 | movu [r0 + 424 * 16], m3 |
| 17531 | |
| 17532 | pmaddubsw m3, m1, m6 |
| 17533 | pmulhrsw m3, m7 |
| 17534 | pmaddubsw m5, m4, m6 |
| 17535 | pmulhrsw m5, m7 |
| 17536 | packuswb m3, m5 |
| 17537 | movu [r0 + 137 * 16], m3 |
| 17538 | |
| 17539 | ; mode 8 [row 20 - second half] |
| 17540 | movu [r0 + 425 * 16], m3 |
| 17541 | |
| 17542 | ; mode 4 [row 5] |
| 17543 | movu m6, [r5 + 30 * 16] |
| 17544 | pmaddubsw m3, m0, m6 |
| 17545 | pmulhrsw m3, m7 |
| 17546 | pmaddubsw m5, m2, m6 |
| 17547 | pmulhrsw m5, m7 |
| 17548 | packuswb m3, m5 |
| 17549 | movu [r0 + 138 * 16], m3 |
| 17550 | |
| 17551 | ; mode 7 [row 13 - first half] |
| 17552 | movu [r0 + 346 * 16], m3 |
| 17553 | |
| 17554 | pmaddubsw m3, m1, m6 |
| 17555 | pmulhrsw m3, m7 |
| 17556 | pmaddubsw m5, m4, m6 |
| 17557 | pmulhrsw m5, m7 |
| 17558 | packuswb m3, m5 |
| 17559 | movu [r0 + 139 * 16], m3 |
| 17560 | |
| 17561 | ; mode 7 [row 13 - second half] |
| 17562 | movu [r0 + 347 * 16], m3 |
| 17563 | |
| 17564 | ; mode 5 [row 5] |
| 17565 | movu m6, [r5 + 6 * 16] |
| 17566 | pmaddubsw m3, m0, m6 |
| 17567 | pmulhrsw m3, m7 |
| 17568 | pmaddubsw m5, m2, m6 |
| 17569 | pmulhrsw m5, m7 |
| 17570 | packuswb m3, m5 |
| 17571 | movu [r0 + 202 * 16], m3 |
| 17572 | pmaddubsw m3, m1, m6 |
| 17573 | pmulhrsw m3, m7 |
| 17574 | pmaddubsw m5, m4, m6 |
| 17575 | pmulhrsw m5, m7 |
| 17576 | packuswb m3, m5 |
| 17577 | movu [r0 + 203 * 16], m3 |
| 17578 | |
| 17579 | ; mode 5 [row 6] |
| 17580 | movu m6, [r5 + 23 * 16] |
| 17581 | pmaddubsw m3, m0, m6 |
| 17582 | pmulhrsw m3, m7 |
| 17583 | pmaddubsw m5, m2, m6 |
| 17584 | pmulhrsw m5, m7 |
| 17585 | packuswb m3, m5 |
| 17586 | movu [r0 + 204 * 16], m3 |
| 17587 | pmaddubsw m3, m1, m6 |
| 17588 | pmulhrsw m3, m7 |
| 17589 | pmaddubsw m5, m4, m6 |
| 17590 | pmulhrsw m5, m7 |
| 17591 | packuswb m3, m5 |
| 17592 | movu [r0 + 205 * 16], m3 |
| 17593 | |
| 17594 | ; mode 6 [row 8] |
| 17595 | movu m6, [r5 + 21 * 16] |
| 17596 | pmaddubsw m3, m0, m6 |
| 17597 | pmulhrsw m3, m7 |
| 17598 | pmaddubsw m5, m2, m6 |
| 17599 | pmulhrsw m5, m7 |
| 17600 | packuswb m3, m5 |
| 17601 | movu [r0 + 272 * 16], m3 |
| 17602 | |
| 17603 | ; mode 7 [row 12 - first half] |
| 17604 | movu [r0 + 344 * 16], m3 |
| 17605 | |
| 17606 | pmaddubsw m3, m1, m6 |
| 17607 | pmulhrsw m3, m7 |
| 17608 | pmaddubsw m5, m4, m6 |
| 17609 | pmulhrsw m5, m7 |
| 17610 | packuswb m3, m5 |
| 17611 | movu [r0 + 273 * 16], m3 |
| 17612 | |
| 17613 | ; mode 7 [row 12 - second half] |
| 17614 | movu [r0 + 345 * 16], m3 |
| 17615 | |
| 17616 | ; mode 7 [row 10] |
| 17617 | movu m6, [r5 + 3 * 16] |
| 17618 | pmaddubsw m3, m0, m6 |
| 17619 | pmulhrsw m3, m7 |
| 17620 | pmaddubsw m5, m2, m6 |
| 17621 | pmulhrsw m5, m7 |
| 17622 | packuswb m3, m5 |
| 17623 | movu [r0 + 340 * 16], m3 |
| 17624 | pmaddubsw m3, m1, m6 |
| 17625 | pmulhrsw m3, m7 |
| 17626 | pmaddubsw m5, m4, m6 |
| 17627 | pmulhrsw m5, m7 |
| 17628 | packuswb m3, m5 |
| 17629 | movu [r0 + 341 * 16], m3 |
| 17630 | |
| 17631 | ; mode 7 [row 11] |
| 17632 | movu m6, [r5 + 12 * 16] |
| 17633 | pmaddubsw m3, m0, m6 |
| 17634 | pmulhrsw m3, m7 |
| 17635 | pmaddubsw m5, m2, m6 |
| 17636 | pmulhrsw m5, m7 |
| 17637 | packuswb m3, m5 |
| 17638 | movu [r0 + 342 * 16], m3 |
| 17639 | pmaddubsw m3, m1, m6 |
| 17640 | pmulhrsw m3, m7 |
| 17641 | pmaddubsw m5, m4, m6 |
| 17642 | pmulhrsw m5, m7 |
| 17643 | packuswb m3, m5 |
| 17644 | movu [r0 + 343 * 16], m3 |
| 17645 | |
| 17646 | ; mode 8 [row 19] |
| 17647 | movu m6, [r5 + 4 * 16] |
| 17648 | pmaddubsw m3, m0, m6 |
| 17649 | pmulhrsw m3, m7 |
| 17650 | pmaddubsw m5, m2, m6 |
| 17651 | pmulhrsw m5, m7 |
| 17652 | packuswb m3, m5 |
| 17653 | movu [r0 + 422 * 16], m3 |
| 17654 | pmaddubsw m3, m1, m6 |
| 17655 | pmulhrsw m3, m7 |
| 17656 | pmaddubsw m5, m4, m6 |
| 17657 | pmulhrsw m5, m7 |
| 17658 | packuswb m3, m5 |
| 17659 | movu [r0 + 423 * 16], m3 |
| 17660 | |
| 17661 | ; mode 8 [row 21] |
| 17662 | movu m6, [r5 + 14 * 16] |
| 17663 | pmaddubsw m3, m0, m6 |
| 17664 | pmulhrsw m3, m7 |
| 17665 | pmaddubsw m5, m2, m6 |
| 17666 | pmulhrsw m5, m7 |
| 17667 | packuswb m3, m5 |
| 17668 | movu [r0 + 426 * 16], m3 |
| 17669 | pmaddubsw m3, m1, m6 |
| 17670 | pmulhrsw m3, m7 |
| 17671 | pmaddubsw m5, m4, m6 |
| 17672 | pmulhrsw m5, m7 |
| 17673 | packuswb m3, m5 |
| 17674 | movu [r0 + 427 * 16], m3 |
| 17675 | |
| 17676 | ; mode 8 [row 22] |
| 17677 | movu m6, [r5 + 19 * 16] |
| 17678 | pmaddubsw m3, m0, m6 |
| 17679 | pmulhrsw m3, m7 |
| 17680 | pmaddubsw m5, m2, m6 |
| 17681 | pmulhrsw m5, m7 |
| 17682 | packuswb m3, m5 |
| 17683 | movu [r0 + 428 * 16], m3 |
| 17684 | pmaddubsw m3, m1, m6 |
| 17685 | pmulhrsw m3, m7 |
| 17686 | pmaddubsw m5, m4, m6 |
| 17687 | pmulhrsw m5, m7 |
| 17688 | packuswb m3, m5 |
| 17689 | movu [r0 + 429 * 16], m3 |
| 17690 | |
| 17691 | ; mode 8 [row 23] |
| 17692 | movu m6, [r5 + 24 * 16] |
| 17693 | pmaddubsw m3, m0, m6 |
| 17694 | pmulhrsw m3, m7 |
| 17695 | pmaddubsw m5, m2, m6 |
| 17696 | pmulhrsw m5, m7 |
| 17697 | packuswb m3, m5 |
| 17698 | movu [r0 + 430 * 16], m3 |
| 17699 | pmaddubsw m3, m1, m6 |
| 17700 | pmulhrsw m3, m7 |
| 17701 | pmaddubsw m5, m4, m6 |
| 17702 | pmulhrsw m5, m7 |
| 17703 | packuswb m3, m5 |
| 17704 | movu [r0 + 431 * 16], m3 |
| 17705 | |
| 17706 | ; mode 8 [row 24] |
| 17707 | movu m6, [r5 + 29 * 16] |
| 17708 | pmaddubsw m3, m0, m6 |
| 17709 | pmulhrsw m3, m7 |
| 17710 | pmaddubsw m5, m2, m6 |
| 17711 | pmulhrsw m5, m7 |
| 17712 | packuswb m3, m5 |
| 17713 | movu [r0 + 432 * 16], m3 |
| 17714 | pmaddubsw m3, m1, m6 |
| 17715 | pmulhrsw m3, m7 |
| 17716 | pmaddubsw m5, m4, m6 |
| 17717 | pmulhrsw m5, m7 |
| 17718 | packuswb m3, m5 |
| 17719 | movu [r0 + 433 * 16], m3 |
| 17720 | |
| 17721 | ; mode 3 [row 4] |
| 17722 | movu m6, [r5 + 2 * 16] |
| 17723 | movu m0, [r4 + 5] |
| 17724 | movd m1, [r4 + 6] |
| 17725 | palignr m1, m0, 1 |
| 17726 | punpcklbw m0, m1 |
| 17727 | pmaddubsw m1, m0, m6 |
| 17728 | pmulhrsw m1, m7 |
| 17729 | movu m2, [r4 + 13] |
| 17730 | movd m3, [r4 + 14] |
| 17731 | palignr m3, m2, 1 |
| 17732 | punpcklbw m2, m3 |
| 17733 | pmaddubsw m3, m2, m6 |
| 17734 | pmulhrsw m3, m7 |
| 17735 | packuswb m1, m3 |
| 17736 | movu [r0 + 72 * 16], m1 |
| 17737 | |
| 17738 | ; mode 6 [row 9 - first half] |
| 17739 | movu [r0 + 274 * 16], m1 |
| 17740 | |
| 17741 | ; mode 8 [row 25 - first half] |
| 17742 | movu [r0 + 434 * 16], m1 |
| 17743 | |
| 17744 | movu m1, [r4 + 21] |
| 17745 | movd m3, [r4 + 22] |
| 17746 | palignr m3, m1, 1 |
| 17747 | punpcklbw m1, m3 |
| 17748 | pmaddubsw m3, m1, m6 |
| 17749 | pmulhrsw m3, m7 |
| 17750 | movu m4, [r4 + 29] |
| 17751 | movd m5, [r4 + 30] |
| 17752 | palignr m5, m4, 1 |
| 17753 | punpcklbw m4, m5 |
| 17754 | pmaddubsw m5, m4, m6 |
| 17755 | pmulhrsw m5, m7 |
| 17756 | packuswb m3, m5 |
| 17757 | movu [r0 + 73 * 16], m3 |
| 17758 | |
| 17759 | ; mode 6 [row 9 - second half] |
| 17760 | movu [r0 + 275 * 16], m3 |
| 17761 | |
| 17762 | ; mode 8 [row 25 - second half] |
| 17763 | movu [r0 + 435 * 16], m3 |
| 17764 | |
| 17765 | ; mode 3 [row 5] |
| 17766 | movu m6, [r5 + 28 * 16] |
| 17767 | pmaddubsw m3, m0, m6 |
| 17768 | pmulhrsw m3, m7 |
| 17769 | pmaddubsw m5, m2, m6 |
| 17770 | pmulhrsw m5, m7 |
| 17771 | packuswb m3, m5 |
| 17772 | movu [r0 + 74 * 16], m3 |
| 17773 | |
| 17774 | ; mode 6 [row 11 - first half] |
| 17775 | movu [r0 + 278 * 16], m3 |
| 17776 | |
| 17777 | pmaddubsw m3, m1, m6 |
| 17778 | pmulhrsw m3, m7 |
| 17779 | pmaddubsw m5, m4, m6 |
| 17780 | pmulhrsw m5, m7 |
| 17781 | packuswb m3, m5 |
| 17782 | movu [r0 + 75 * 16], m3 |
| 17783 | |
| 17784 | ; mode 6 [row 11 - second half] |
| 17785 | movu [r0 + 279 * 16], m3 |
| 17786 | |
| 17787 | ; mode 4 [row 6] |
| 17788 | movu m6, [r5 + 19 * 16] |
| 17789 | pmaddubsw m3, m0, m6 |
| 17790 | pmulhrsw m3, m7 |
| 17791 | pmaddubsw m5, m2, m6 |
| 17792 | pmulhrsw m5, m7 |
| 17793 | packuswb m3, m5 |
| 17794 | movu [r0 + 140 * 16], m3 |
| 17795 | pmaddubsw m3, m1, m6 |
| 17796 | pmulhrsw m3, m7 |
| 17797 | pmaddubsw m5, m4, m6 |
| 17798 | pmulhrsw m5, m7 |
| 17799 | packuswb m3, m5 |
| 17800 | movu [r0 + 141 * 16], m3 |
| 17801 | |
| 17802 | ; mode 5 [row 7] |
| 17803 | movu m6, [r5 + 8 * 16] |
| 17804 | pmaddubsw m3, m0, m6 |
| 17805 | pmulhrsw m3, m7 |
| 17806 | pmaddubsw m5, m2, m6 |
| 17807 | pmulhrsw m5, m7 |
| 17808 | packuswb m3, m5 |
| 17809 | movu [r0 + 206 * 16], m3 |
| 17810 | pmaddubsw m3, m1, m6 |
| 17811 | pmulhrsw m3, m7 |
| 17812 | pmaddubsw m5, m4, m6 |
| 17813 | pmulhrsw m5, m7 |
| 17814 | packuswb m3, m5 |
| 17815 | movu [r0 + 207 * 16], m3 |
| 17816 | |
| 17817 | ; mode 5 [row 8] |
| 17818 | movu m6, [r5 + 25 * 16] |
| 17819 | pmaddubsw m3, m0, m6 |
| 17820 | pmulhrsw m3, m7 |
| 17821 | pmaddubsw m5, m2, m6 |
| 17822 | pmulhrsw m5, m7 |
| 17823 | packuswb m3, m5 |
| 17824 | movu [r0 + 208 * 16], m3 |
| 17825 | |
| 17826 | ; mode 7 [row 16 - first half] |
| 17827 | movu [r0 + 352 * 16], m3 |
| 17828 | |
| 17829 | pmaddubsw m3, m1, m6 |
| 17830 | pmulhrsw m3, m7 |
| 17831 | pmaddubsw m5, m4, m6 |
| 17832 | pmulhrsw m5, m7 |
| 17833 | packuswb m3, m5 |
| 17834 | movu [r0 + 209 * 16], m3 |
| 17835 | |
| 17836 | ; mode 7 [row 16 - second half] |
| 17837 | movu [r0 + 353 * 16], m3 |
| 17838 | |
| 17839 | ; mode 6 [row 10] |
| 17840 | movu m6, [r5 + 15 * 16] |
| 17841 | pmaddubsw m3, m0, m6 |
| 17842 | pmulhrsw m3, m7 |
| 17843 | pmaddubsw m5, m2, m6 |
| 17844 | pmulhrsw m5, m7 |
| 17845 | packuswb m3, m5 |
| 17846 | movu [r0 + 276 * 16], m3 |
| 17847 | pmaddubsw m3, m1, m6 |
| 17848 | pmulhrsw m3, m7 |
| 17849 | pmaddubsw m5, m4, m6 |
| 17850 | pmulhrsw m5, m7 |
| 17851 | packuswb m3, m5 |
| 17852 | movu [r0 + 277 * 16], m3 |
| 17853 | |
| 17854 | ; mode 7 [row 14] |
| 17855 | movu m6, [r5 + 7 * 16] |
| 17856 | pmaddubsw m3, m0, m6 |
| 17857 | pmulhrsw m3, m7 |
| 17858 | pmaddubsw m5, m2, m6 |
| 17859 | pmulhrsw m5, m7 |
| 17860 | packuswb m3, m5 |
| 17861 | movu [r0 + 348 * 16], m3 |
| 17862 | |
| 17863 | ; mode 8 [row 26 - first half] |
| 17864 | movu [r0 + 436 * 16], m3 |
| 17865 | |
| 17866 | pmaddubsw m3, m1, m6 |
| 17867 | pmulhrsw m3, m7 |
| 17868 | pmaddubsw m5, m4, m6 |
| 17869 | pmulhrsw m5, m7 |
| 17870 | packuswb m3, m5 |
| 17871 | movu [r0 + 349 * 16], m3 |
| 17872 | |
| 17873 | ; mode 8 [row 26 - second half] |
| 17874 | movu [r0 + 437 * 16], m3 |
| 17875 | |
| 17876 | ; mode 7 [row 15] |
| 17877 | movu m6, [r5 + 16 * 16] |
| 17878 | pmaddubsw m3, m0, m6 |
| 17879 | pmulhrsw m3, m7 |
| 17880 | pmaddubsw m5, m2, m6 |
| 17881 | pmulhrsw m5, m7 |
| 17882 | packuswb m3, m5 |
| 17883 | movu [r0 + 350 * 16], m3 |
| 17884 | pmaddubsw m3, m1, m6 |
| 17885 | pmulhrsw m3, m7 |
| 17886 | pmaddubsw m5, m4, m6 |
| 17887 | pmulhrsw m5, m7 |
| 17888 | packuswb m3, m5 |
| 17889 | movu [r0 + 351 * 16], m3 |
| 17890 | |
| 17891 | ; mode 8 [row 27] |
| 17892 | movu m6, [r5 + 12 * 16] |
| 17893 | pmaddubsw m3, m0, m6 |
| 17894 | pmulhrsw m3, m7 |
| 17895 | pmaddubsw m5, m2, m6 |
| 17896 | pmulhrsw m5, m7 |
| 17897 | packuswb m3, m5 |
| 17898 | movu [r0 + 438 * 16], m3 |
| 17899 | pmaddubsw m3, m1, m6 |
| 17900 | pmulhrsw m3, m7 |
| 17901 | pmaddubsw m5, m4, m6 |
| 17902 | pmulhrsw m5, m7 |
| 17903 | packuswb m3, m5 |
| 17904 | movu [r0 + 439 * 16], m3 |
| 17905 | |
| 17906 | ; mode 8 [row 28] |
| 17907 | movu m6, [r5 + 17 * 16] |
| 17908 | pmaddubsw m3, m0, m6 |
| 17909 | pmulhrsw m3, m7 |
| 17910 | pmaddubsw m5, m2, m6 |
| 17911 | pmulhrsw m5, m7 |
| 17912 | packuswb m3, m5 |
| 17913 | movu [r0 + 440 * 16], m3 |
| 17914 | pmaddubsw m3, m1, m6 |
| 17915 | pmulhrsw m3, m7 |
| 17916 | pmaddubsw m5, m4, m6 |
| 17917 | pmulhrsw m5, m7 |
| 17918 | packuswb m3, m5 |
| 17919 | movu [r0 + 441 * 16], m3 |
| 17920 | |
| 17921 | ; mode 8 [row 29] |
| 17922 | movu m6, [r5 + 22 * 16] |
| 17923 | pmaddubsw m3, m0, m6 |
| 17924 | pmulhrsw m3, m7 |
| 17925 | pmaddubsw m5, m2, m6 |
| 17926 | pmulhrsw m5, m7 |
| 17927 | packuswb m3, m5 |
| 17928 | movu [r0 + 442 * 16], m3 |
| 17929 | pmaddubsw m3, m1, m6 |
| 17930 | pmulhrsw m3, m7 |
| 17931 | pmaddubsw m5, m4, m6 |
| 17932 | pmulhrsw m5, m7 |
| 17933 | packuswb m3, m5 |
| 17934 | movu [r0 + 443 * 16], m3 |
| 17935 | |
| 17936 | ; mode 8 [row 30] |
| 17937 | movu m6, [r5 + 27 * 16] |
| 17938 | pmaddubsw m3, m0, m6 |
| 17939 | pmulhrsw m3, m7 |
| 17940 | pmaddubsw m5, m2, m6 |
| 17941 | pmulhrsw m5, m7 |
| 17942 | packuswb m3, m5 |
| 17943 | movu [r0 + 444 * 16], m3 |
| 17944 | pmaddubsw m3, m1, m6 |
| 17945 | pmulhrsw m3, m7 |
| 17946 | pmaddubsw m5, m4, m6 |
| 17947 | pmulhrsw m5, m7 |
| 17948 | packuswb m3, m5 |
| 17949 | movu [r0 + 445 * 16], m3 |
| 17950 | |
| 17951 | ; mode 3 [row 6] |
| 17952 | movu m6, [r5 + 22 * 16] |
| 17953 | movu m0, [r4 + 6] |
| 17954 | movd m1, [r4 + 7] |
| 17955 | palignr m1, m0, 1 |
| 17956 | punpcklbw m0, m1 |
| 17957 | pmaddubsw m1, m0, m6 |
| 17958 | pmulhrsw m1, m7 |
| 17959 | movu m2, [r4 + 14] |
| 17960 | movd m3, [r4 + 15] |
| 17961 | palignr m3, m2, 1 |
| 17962 | punpcklbw m2, m3 |
| 17963 | pmaddubsw m3, m2, m6 |
| 17964 | pmulhrsw m3, m7 |
| 17965 | packuswb m1, m3 |
| 17966 | movu [r0 + 76 * 16], m1 |
| 17967 | |
| 17968 | ; mode 6 [row 13 - first half] |
| 17969 | movu [r0 + 282 * 16], m1 |
| 17970 | |
| 17971 | movu m1, [r4 + 22] |
| 17972 | movd m3, [r4 + 23] |
| 17973 | palignr m3, m1, 1 |
| 17974 | punpcklbw m1, m3 |
| 17975 | pmaddubsw m3, m1, m6 |
| 17976 | pmulhrsw m3, m7 |
| 17977 | movu m4, [r4 + 30] |
| 17978 | movd m5, [r4 + 31] |
| 17979 | palignr m5, m4, 1 |
| 17980 | punpcklbw m4, m5 |
| 17981 | pmaddubsw m5, m4, m6 |
| 17982 | pmulhrsw m5, m7 |
| 17983 | packuswb m3, m5 |
| 17984 | movu [r0 + 77 * 16], m3 |
| 17985 | |
| 17986 | ; mode 6 [row 13 - second half] |
| 17987 | movu [r0 + 283 * 16], m3 |
| 17988 | |
| 17989 | ; mode 4 [row 7] |
| 17990 | movu m6, [r5 + 8 * 16] |
| 17991 | pmaddubsw m3, m0, m6 |
| 17992 | pmulhrsw m3, m7 |
| 17993 | pmaddubsw m5, m2, m6 |
| 17994 | pmulhrsw m5, m7 |
| 17995 | packuswb m3, m5 |
| 17996 | movu [r0 + 142 * 16], m3 |
| 17997 | pmaddubsw m3, m1, m6 |
| 17998 | pmulhrsw m3, m7 |
| 17999 | pmaddubsw m5, m4, m6 |
| 18000 | pmulhrsw m5, m7 |
| 18001 | packuswb m3, m5 |
| 18002 | movu [r0 + 143 * 16], m3 |
| 18003 | |
| 18004 | ; mode 4 [row 8] |
| 18005 | movu m6, [r5 + 29 * 16] |
| 18006 | pmaddubsw m3, m0, m6 |
| 18007 | pmulhrsw m3, m7 |
| 18008 | pmaddubsw m5, m2, m6 |
| 18009 | pmulhrsw m5, m7 |
| 18010 | packuswb m3, m5 |
| 18011 | movu [r0 + 144 * 16], m3 |
| 18012 | |
| 18013 | ; mode 7 [row 20 - first half] |
| 18014 | movu [r0 + 360 * 16], m3 |
| 18015 | |
| 18016 | pmaddubsw m3, m1, m6 |
| 18017 | pmulhrsw m3, m7 |
| 18018 | pmaddubsw m5, m4, m6 |
| 18019 | pmulhrsw m5, m7 |
| 18020 | packuswb m3, m5 |
| 18021 | movu [r0 + 145 * 16], m3 |
| 18022 | |
| 18023 | ; mode 7 [row 20 - second half] |
| 18024 | movu [r0 + 361 * 16], m3 |
| 18025 | |
| 18026 | ; mode 5 [row 9] |
| 18027 | movu m6, [r5 + 10 * 16] |
| 18028 | pmaddubsw m3, m0, m6 |
| 18029 | pmulhrsw m3, m7 |
| 18030 | pmaddubsw m5, m2, m6 |
| 18031 | pmulhrsw m5, m7 |
| 18032 | packuswb m3, m5 |
| 18033 | movu [r0 + 210 * 16], m3 |
| 18034 | pmaddubsw m3, m1, m6 |
| 18035 | pmulhrsw m3, m7 |
| 18036 | pmaddubsw m5, m4, m6 |
| 18037 | pmulhrsw m5, m7 |
| 18038 | packuswb m3, m5 |
| 18039 | movu [r0 + 211 * 16], m3 |
| 18040 | |
| 18041 | ; mode 5 [row 10] |
| 18042 | movu m6, [r5 + 27 * 16] |
| 18043 | pmaddubsw m3, m0, m6 |
| 18044 | pmulhrsw m3, m7 |
| 18045 | pmaddubsw m5, m2, m6 |
| 18046 | pmulhrsw m5, m7 |
| 18047 | packuswb m3, m5 |
| 18048 | movu [r0 + 212 * 16], m3 |
| 18049 | pmaddubsw m3, m1, m6 |
| 18050 | pmulhrsw m3, m7 |
| 18051 | pmaddubsw m5, m4, m6 |
| 18052 | pmulhrsw m5, m7 |
| 18053 | packuswb m3, m5 |
| 18054 | movu [r0 + 213 * 16], m3 |
| 18055 | |
| 18056 | ; mode 7 [row 17] |
| 18057 | movu m6, [r5 + 2 * 16] |
| 18058 | pmaddubsw m3, m0, m6 |
| 18059 | pmulhrsw m3, m7 |
| 18060 | pmaddubsw m5, m2, m6 |
| 18061 | pmulhrsw m5, m7 |
| 18062 | packuswb m3, m5 |
| 18063 | movu [r0 + 354 * 16], m3 |
| 18064 | pmaddubsw m3, m1, m6 |
| 18065 | pmulhrsw m3, m7 |
| 18066 | pmaddubsw m5, m4, m6 |
| 18067 | pmulhrsw m5, m7 |
| 18068 | packuswb m3, m5 |
| 18069 | movu [r0 + 355 * 16], m3 |
| 18070 | |
| 18071 | ; mode 7 [row 18] |
| 18072 | movu m6, [r5 + 11 * 16] |
| 18073 | pmaddubsw m3, m0, m6 |
| 18074 | pmulhrsw m3, m7 |
| 18075 | pmaddubsw m5, m2, m6 |
| 18076 | pmulhrsw m5, m7 |
| 18077 | packuswb m3, m5 |
| 18078 | movu [r0 + 356 * 16], m3 |
| 18079 | pmaddubsw m3, m1, m6 |
| 18080 | pmulhrsw m3, m7 |
| 18081 | pmaddubsw m5, m4, m6 |
| 18082 | pmulhrsw m5, m7 |
| 18083 | packuswb m3, m5 |
| 18084 | movu [r0 + 357 * 16], m3 |
| 18085 | |
| 18086 | ; mode 7 [row 19] |
| 18087 | movu m6, [r5 + 20 * 16] |
| 18088 | pmaddubsw m3, m0, m6 |
| 18089 | pmulhrsw m3, m7 |
| 18090 | pmaddubsw m5, m2, m6 |
| 18091 | pmulhrsw m5, m7 |
| 18092 | packuswb m3, m5 |
| 18093 | movu [r0 + 358 * 16], m3 |
| 18094 | pmaddubsw m3, m1, m6 |
| 18095 | pmulhrsw m3, m7 |
| 18096 | pmaddubsw m5, m4, m6 |
| 18097 | pmulhrsw m5, m7 |
| 18098 | packuswb m3, m5 |
| 18099 | movu [r0 + 359 * 16], m3 |
| 18100 | |
| 18101 | ; mode 6 [row 12] |
| 18102 | movu m6, [r5 + 9 * 16] |
| 18103 | pmaddubsw m3, m0, m6 |
| 18104 | pmulhrsw m3, m7 |
| 18105 | pmaddubsw m5, m2, m6 |
| 18106 | pmulhrsw m5, m7 |
| 18107 | packuswb m3, m5 |
| 18108 | movu [r0 + 280 * 16], m3 |
| 18109 | pmaddubsw m3, m1, m6 |
| 18110 | pmulhrsw m3, m7 |
| 18111 | pmaddubsw m5, m4, m6 |
| 18112 | pmulhrsw m5, m7 |
| 18113 | packuswb m3, m5 |
| 18114 | movu [r0 + 281 * 16], m3 |
| 18115 | |
| 18116 | ; mode 3 [row 7] |
| 18117 | movu m6, [r5 + 16 * 16] |
| 18118 | movu m0, [r4 + 7] |
| 18119 | movd m1, [r4 + 8] |
| 18120 | palignr m1, m0, 1 |
| 18121 | punpcklbw m0, m1 |
| 18122 | pmaddubsw m1, m0, m6 |
| 18123 | pmulhrsw m1, m7 |
| 18124 | movu m2, [r4 + 15] |
| 18125 | movd m3, [r4 + 16] |
| 18126 | palignr m3, m2, 1 |
| 18127 | punpcklbw m2, m3 |
| 18128 | pmaddubsw m3, m2, m6 |
| 18129 | pmulhrsw m3, m7 |
| 18130 | packuswb m1, m3 |
| 18131 | movu [r0 + 78 * 16], m1 |
| 18132 | |
| 18133 | ; mode 6 [row 15 - first half] |
| 18134 | movu [r0 + 286 * 16], m1 |
| 18135 | |
| 18136 | movu m1, [r4 + 23] |
| 18137 | movd m3, [r4 + 24] |
| 18138 | palignr m3, m1, 1 |
| 18139 | punpcklbw m1, m3 |
| 18140 | pmaddubsw m3, m1, m6 |
| 18141 | pmulhrsw m3, m7 |
| 18142 | movu m4, [r4 + 31] |
| 18143 | movd m5, [r4 + 32] |
| 18144 | palignr m5, m4, 1 |
| 18145 | punpcklbw m4, m5 |
| 18146 | pmaddubsw m5, m4, m6 |
| 18147 | pmulhrsw m5, m7 |
| 18148 | packuswb m3, m5 |
| 18149 | movu [r0 + 79 * 16], m3 |
| 18150 | |
| 18151 | ; mode 6 [row 15 - second half] |
| 18152 | movu [r0 + 287 * 16], m3 |
| 18153 | |
| 18154 | ; mode 4 [row 9] |
| 18155 | movu m6, [r5 + 18 * 16] |
| 18156 | pmaddubsw m3, m0, m6 |
| 18157 | pmulhrsw m3, m7 |
| 18158 | pmaddubsw m5, m2, m6 |
| 18159 | pmulhrsw m5, m7 |
| 18160 | packuswb m3, m5 |
| 18161 | movu [r0 + 146 * 16], m3 |
| 18162 | pmaddubsw m3, m1, m6 |
| 18163 | pmulhrsw m3, m7 |
| 18164 | pmaddubsw m5, m4, m6 |
| 18165 | pmulhrsw m5, m7 |
| 18166 | packuswb m3, m5 |
| 18167 | movu [r0 + 147 * 16], m3 |
| 18168 | |
| 18169 | ; mode 5 [row 11] |
| 18170 | movu m6, [r5 + 12 * 16] |
| 18171 | pmaddubsw m3, m0, m6 |
| 18172 | pmulhrsw m3, m7 |
| 18173 | pmaddubsw m5, m2, m6 |
| 18174 | pmulhrsw m5, m7 |
| 18175 | packuswb m3, m5 |
| 18176 | movu [r0 + 214 * 16], m3 |
| 18177 | pmaddubsw m3, m1, m6 |
| 18178 | pmulhrsw m3, m7 |
| 18179 | pmaddubsw m5, m4, m6 |
| 18180 | pmulhrsw m5, m7 |
| 18181 | packuswb m3, m5 |
| 18182 | movu [r0 + 215 * 16], m3 |
| 18183 | |
| 18184 | ; mode 5 [row 12] |
| 18185 | movu m6, [r5 + 29 * 16] |
| 18186 | pmaddubsw m3, m0, m6 |
| 18187 | pmulhrsw m3, m7 |
| 18188 | pmaddubsw m5, m2, m6 |
| 18189 | pmulhrsw m5, m7 |
| 18190 | packuswb m3, m5 |
| 18191 | movu [r0 + 216 * 16], m3 |
| 18192 | |
| 18193 | ; mode 6 [row 16 - first half] |
| 18194 | movu [r0 + 288 * 16], m3 |
| 18195 | |
| 18196 | pmaddubsw m3, m1, m6 |
| 18197 | pmulhrsw m3, m7 |
| 18198 | pmaddubsw m5, m4, m6 |
| 18199 | pmulhrsw m5, m7 |
| 18200 | packuswb m3, m5 |
| 18201 | movu [r0 + 217 * 16], m3 |
| 18202 | |
| 18203 | ; mode 6 [row 16 - second half] |
| 18204 | movu [r0 + 289 * 16], m3 |
| 18205 | |
| 18206 | ; mode 6 [row 14] |
| 18207 | movu m6, [r5 + 3 * 16] |
| 18208 | pmaddubsw m3, m0, m6 |
| 18209 | pmulhrsw m3, m7 |
| 18210 | pmaddubsw m5, m2, m6 |
| 18211 | pmulhrsw m5, m7 |
| 18212 | packuswb m3, m5 |
| 18213 | movu [r0 + 284 * 16], m3 |
| 18214 | pmaddubsw m3, m1, m6 |
| 18215 | pmulhrsw m3, m7 |
| 18216 | pmaddubsw m5, m4, m6 |
| 18217 | pmulhrsw m5, m7 |
| 18218 | packuswb m3, m5 |
| 18219 | movu [r0 + 285 * 16], m3 |
| 18220 | |
| 18221 | ; mode 7 [row 21] |
| 18222 | movu m6, [r5 + 6 * 16] |
| 18223 | pmaddubsw m3, m0, m6 |
| 18224 | pmulhrsw m3, m7 |
| 18225 | pmaddubsw m5, m2, m6 |
| 18226 | pmulhrsw m5, m7 |
| 18227 | packuswb m3, m5 |
| 18228 | movu [r0 + 362 * 16], m3 |
| 18229 | pmaddubsw m3, m1, m6 |
| 18230 | pmulhrsw m3, m7 |
| 18231 | pmaddubsw m5, m4, m6 |
| 18232 | pmulhrsw m5, m7 |
| 18233 | packuswb m3, m5 |
| 18234 | movu [r0 + 363 * 16], m3 |
| 18235 | |
| 18236 | ; mode 7 [row 22] |
| 18237 | movu m6, [r5 + 15 * 16] |
| 18238 | pmaddubsw m3, m0, m6 |
| 18239 | pmulhrsw m3, m7 |
| 18240 | pmaddubsw m5, m2, m6 |
| 18241 | pmulhrsw m5, m7 |
| 18242 | packuswb m3, m5 |
| 18243 | movu [r0 + 364 * 16], m3 |
| 18244 | pmaddubsw m3, m1, m6 |
| 18245 | pmulhrsw m3, m7 |
| 18246 | pmaddubsw m5, m4, m6 |
| 18247 | pmulhrsw m5, m7 |
| 18248 | packuswb m3, m5 |
| 18249 | movu [r0 + 365 * 16], m3 |
| 18250 | |
| 18251 | ; mode 7 [row 23] |
| 18252 | movu m6, [r5 + 24 * 16] |
| 18253 | pmaddubsw m3, m0, m6 |
| 18254 | pmulhrsw m3, m7 |
| 18255 | pmaddubsw m5, m2, m6 |
| 18256 | pmulhrsw m5, m7 |
| 18257 | packuswb m3, m5 |
| 18258 | movu [r0 + 366 * 16], m3 |
| 18259 | pmaddubsw m3, m1, m6 |
| 18260 | pmulhrsw m3, m7 |
| 18261 | pmaddubsw m5, m4, m6 |
| 18262 | pmulhrsw m5, m7 |
| 18263 | packuswb m3, m5 |
| 18264 | movu [r0 + 367 * 16], m3 |
| 18265 | |
| 18266 | ; mode 3 [row 8] |
| 18267 | movu m6, [r5 + 10 * 16] |
| 18268 | movu m0, [r4 + 8] |
| 18269 | movd m1, [r4 + 9] |
| 18270 | palignr m1, m0, 1 |
| 18271 | punpcklbw m0, m1 |
| 18272 | pmaddubsw m1, m0, m6 |
| 18273 | pmulhrsw m1, m7 |
| 18274 | movu m2, [r4 + 16] |
| 18275 | movd m3, [r4 + 17] |
| 18276 | palignr m3, m2, 1 |
| 18277 | punpcklbw m2, m3 |
| 18278 | pmaddubsw m3, m2, m6 |
| 18279 | pmulhrsw m3, m7 |
| 18280 | packuswb m1, m3 |
| 18281 | movu [r0 + 80 * 16], m1 |
| 18282 | |
| 18283 | ; mode 6 [row 17 - first half] |
| 18284 | movu [r0 + 290 * 16], m1 |
| 18285 | |
| 18286 | ; mode 7 [row 25 - first half] |
| 18287 | movu [r0 + 370 * 16], m1 |
| 18288 | |
| 18289 | movu m1, [r4 + 24] |
| 18290 | movd m3, [r4 + 25] |
| 18291 | palignr m3, m1, 1 |
| 18292 | punpcklbw m1, m3 |
| 18293 | pmaddubsw m3, m1, m6 |
| 18294 | pmulhrsw m3, m7 |
| 18295 | movu m4, [r4 + 32] |
| 18296 | movd m5, [r4 + 33] |
| 18297 | palignr m5, m4, 1 |
| 18298 | punpcklbw m4, m5 |
| 18299 | pmaddubsw m5, m4, m6 |
| 18300 | pmulhrsw m5, m7 |
| 18301 | packuswb m3, m5 |
| 18302 | movu [r0 + 81 * 16], m3 |
| 18303 | |
| 18304 | ; mode 6 [row 17 - second half] |
| 18305 | movu [r0 + 291 * 16], m3 |
| 18306 | |
| 18307 | ; mode 7 [row 25 - second half] |
| 18308 | movu [r0 + 371 * 16], m3 |
| 18309 | |
| 18310 | ; mode 4 [row 10] |
| 18311 | movu m6, [r5 + 7 * 16] |
| 18312 | pmaddubsw m3, m0, m6 |
| 18313 | pmulhrsw m3, m7 |
| 18314 | pmaddubsw m5, m2, m6 |
| 18315 | pmulhrsw m5, m7 |
| 18316 | packuswb m3, m5 |
| 18317 | movu [r0 + 148 * 16], m3 |
| 18318 | pmaddubsw m3, m1, m6 |
| 18319 | pmulhrsw m3, m7 |
| 18320 | pmaddubsw m5, m4, m6 |
| 18321 | pmulhrsw m5, m7 |
| 18322 | packuswb m3, m5 |
| 18323 | movu [r0 + 149 * 16], m3 |
| 18324 | |
| 18325 | ; mode 4 [row 11] |
| 18326 | movu m6, [r5 + 28 * 16] |
| 18327 | pmaddubsw m3, m0, m6 |
| 18328 | pmulhrsw m3, m7 |
| 18329 | pmaddubsw m5, m2, m6 |
| 18330 | pmulhrsw m5, m7 |
| 18331 | packuswb m3, m5 |
| 18332 | movu [r0 + 150 * 16], m3 |
| 18333 | |
| 18334 | ; mode 7 [row 27 - first half] |
| 18335 | movu [r0 + 374 * 16], m3 |
| 18336 | |
| 18337 | pmaddubsw m3, m1, m6 |
| 18338 | pmulhrsw m3, m7 |
| 18339 | pmaddubsw m5, m4, m6 |
| 18340 | pmulhrsw m5, m7 |
| 18341 | packuswb m3, m5 |
| 18342 | movu [r0 + 151 * 16], m3 |
| 18343 | |
| 18344 | ; mode 7 [row 27 - second half] |
| 18345 | movu [r0 + 375 * 16], m3 |
| 18346 | |
| 18347 | ; mode 5 [row 13] |
| 18348 | movu m6, [r5 + 14 * 16] |
| 18349 | pmaddubsw m3, m0, m6 |
| 18350 | pmulhrsw m3, m7 |
| 18351 | pmaddubsw m5, m2, m6 |
| 18352 | pmulhrsw m5, m7 |
| 18353 | packuswb m3, m5 |
| 18354 | movu [r0 + 218 * 16], m3 |
| 18355 | pmaddubsw m3, m1, m6 |
| 18356 | pmulhrsw m3, m7 |
| 18357 | pmaddubsw m5, m4, m6 |
| 18358 | pmulhrsw m5, m7 |
| 18359 | packuswb m3, m5 |
| 18360 | movu [r0 + 219 * 16], m3 |
| 18361 | |
| 18362 | ; mode 5 [row 14] |
| 18363 | movu m6, [r5 + 31 * 16] |
| 18364 | pmaddubsw m3, m0, m6 |
| 18365 | pmulhrsw m3, m7 |
| 18366 | pmaddubsw m5, m2, m6 |
| 18367 | pmulhrsw m5, m7 |
| 18368 | packuswb m3, m5 |
| 18369 | movu [r0 + 220 * 16], m3 |
| 18370 | pmaddubsw m3, m1, m6 |
| 18371 | pmulhrsw m3, m7 |
| 18372 | pmaddubsw m5, m4, m6 |
| 18373 | pmulhrsw m5, m7 |
| 18374 | packuswb m3, m5 |
| 18375 | movu [r0 + 221 * 16], m3 |
| 18376 | |
| 18377 | ; mode 6 [row 18] |
| 18378 | movu m6, [r5 + 23 * 16] |
| 18379 | pmaddubsw m3, m0, m6 |
| 18380 | pmulhrsw m3, m7 |
| 18381 | pmaddubsw m5, m2, m6 |
| 18382 | pmulhrsw m5, m7 |
| 18383 | packuswb m3, m5 |
| 18384 | movu [r0 + 292 * 16], m3 |
| 18385 | pmaddubsw m3, m1, m6 |
| 18386 | pmulhrsw m3, m7 |
| 18387 | pmaddubsw m5, m4, m6 |
| 18388 | pmulhrsw m5, m7 |
| 18389 | packuswb m3, m5 |
| 18390 | movu [r0 + 293 * 16], m3 |
| 18391 | |
| 18392 | ; mode 7 [row 24] |
| 18393 | movu m6, [r5 + 1 * 16] |
| 18394 | pmaddubsw m3, m0, m6 |
| 18395 | pmulhrsw m3, m7 |
| 18396 | pmaddubsw m5, m2, m6 |
| 18397 | pmulhrsw m5, m7 |
| 18398 | packuswb m3, m5 |
| 18399 | movu [r0 + 368 * 16], m3 |
| 18400 | pmaddubsw m3, m1, m6 |
| 18401 | pmulhrsw m3, m7 |
| 18402 | pmaddubsw m5, m4, m6 |
| 18403 | pmulhrsw m5, m7 |
| 18404 | packuswb m3, m5 |
| 18405 | movu [r0 + 369 * 16], m3 |
| 18406 | |
| 18407 | ; mode 7 [row 26] |
| 18408 | movu m6, [r5 + 19 * 16] |
| 18409 | pmaddubsw m3, m0, m6 |
| 18410 | pmulhrsw m3, m7 |
| 18411 | pmaddubsw m5, m2, m6 |
| 18412 | pmulhrsw m5, m7 |
| 18413 | packuswb m3, m5 |
| 18414 | movu [r0 + 372 * 16], m3 |
| 18415 | pmaddubsw m3, m1, m6 |
| 18416 | pmulhrsw m3, m7 |
| 18417 | pmaddubsw m5, m4, m6 |
| 18418 | pmulhrsw m5, m7 |
| 18419 | packuswb m3, m5 |
| 18420 | movu [r0 + 373 * 16], m3 |
| 18421 | |
| 18422 | ; mode 3 [row 9] |
| 18423 | movu m6, [r5 + 4 * 16] |
| 18424 | movu m0, [r4 + 9] |
| 18425 | movd m1, [r4 + 10] |
| 18426 | palignr m1, m0, 1 |
| 18427 | punpcklbw m0, m1 |
| 18428 | pmaddubsw m1, m0, m6 |
| 18429 | pmulhrsw m1, m7 |
| 18430 | movu m2, [r4 + 17] |
| 18431 | movd m3, [r4 + 18] |
| 18432 | palignr m3, m2, 1 |
| 18433 | punpcklbw m2, m3 |
| 18434 | pmaddubsw m3, m2, m6 |
| 18435 | pmulhrsw m3, m7 |
| 18436 | packuswb m1, m3 |
| 18437 | movu [r0 + 82 * 16], m1 |
| 18438 | |
| 18439 | ; mode 6 [row 19 - first half] |
| 18440 | movu [r0 + 294 * 16], m1 |
| 18441 | |
| 18442 | movu m1, [r4 + 25] |
| 18443 | movd m3, [r4 + 26] |
| 18444 | palignr m3, m1, 1 |
| 18445 | punpcklbw m1, m3 |
| 18446 | pmaddubsw m3, m1, m6 |
| 18447 | pmulhrsw m3, m7 |
| 18448 | movu m4, [r4 + 33] |
| 18449 | movd m5, [r4 + 34] |
| 18450 | palignr m5, m4, 1 |
| 18451 | punpcklbw m4, m5 |
| 18452 | pmaddubsw m5, m4, m6 |
| 18453 | pmulhrsw m5, m7 |
| 18454 | packuswb m3, m5 |
| 18455 | movu [r0 + 83 * 16], m3 |
| 18456 | |
| 18457 | ; mode 6 [row 19 - second half] |
| 18458 | movu [r0 + 295 * 16], m3 |
| 18459 | |
| 18460 | ; mode 4 [row 12] |
| 18461 | movu m6, [r5 + 17 * 16] |
| 18462 | pmaddubsw m3, m0, m6 |
| 18463 | pmulhrsw m3, m7 |
| 18464 | pmaddubsw m5, m2, m6 |
| 18465 | pmulhrsw m5, m7 |
| 18466 | packuswb m3, m5 |
| 18467 | movu [r0 + 152 * 16], m3 |
| 18468 | |
| 18469 | ; mode 6 [row 20 - first half] |
| 18470 | movu [r0 + 296 * 16], m3 |
| 18471 | |
| 18472 | pmaddubsw m3, m1, m6 |
| 18473 | pmulhrsw m3, m7 |
| 18474 | pmaddubsw m5, m4, m6 |
| 18475 | pmulhrsw m5, m7 |
| 18476 | packuswb m3, m5 |
| 18477 | movu [r0 + 153 * 16], m3 |
| 18478 | |
| 18479 | ; mode 6 [row 20 - second half] |
| 18480 | movu [r0 + 297 * 16], m3 |
| 18481 | |
| 18482 | ; mode 3 [row 10] |
| 18483 | movu m6, [r5 + 30 * 16] |
| 18484 | pmaddubsw m3, m0, m6 |
| 18485 | pmulhrsw m3, m7 |
| 18486 | pmaddubsw m5, m2, m6 |
| 18487 | pmulhrsw m5, m7 |
| 18488 | packuswb m3, m5 |
| 18489 | movu [r0 + 84 * 16], m3 |
| 18490 | |
| 18491 | ; mode 6 [row 21 - first half] |
| 18492 | movu [r0 + 298 * 16], m3 |
| 18493 | |
| 18494 | pmaddubsw m3, m1, m6 |
| 18495 | pmulhrsw m3, m7 |
| 18496 | pmaddubsw m5, m4, m6 |
| 18497 | pmulhrsw m5, m7 |
| 18498 | packuswb m3, m5 |
| 18499 | movu [r0 + 85 * 16], m3 |
| 18500 | |
| 18501 | ; mode 6 [row 21 - second half] |
| 18502 | movu [r0 + 299 * 16], m3 |
| 18503 | |
| 18504 | ; mode 5 [row 15] |
| 18505 | movu m6, [r5 + 16 * 16] |
| 18506 | pmaddubsw m3, m0, m6 |
| 18507 | pmulhrsw m3, m7 |
| 18508 | pmaddubsw m5, m2, m6 |
| 18509 | pmulhrsw m5, m7 |
| 18510 | packuswb m3, m5 |
| 18511 | movu [r0 + 222 * 16], m3 |
| 18512 | pmaddubsw m3, m1, m6 |
| 18513 | pmulhrsw m3, m7 |
| 18514 | pmaddubsw m5, m4, m6 |
| 18515 | pmulhrsw m5, m7 |
| 18516 | packuswb m3, m5 |
| 18517 | movu [r0 + 223 * 16], m3 |
| 18518 | |
| 18519 | ; mode 7 [row 28] |
| 18520 | movu m6, [r5 + 5 * 16] |
| 18521 | pmaddubsw m3, m0, m6 |
| 18522 | pmulhrsw m3, m7 |
| 18523 | pmaddubsw m5, m2, m6 |
| 18524 | pmulhrsw m5, m7 |
| 18525 | packuswb m3, m5 |
| 18526 | movu [r0 + 376 * 16], m3 |
| 18527 | pmaddubsw m3, m1, m6 |
| 18528 | pmulhrsw m3, m7 |
| 18529 | pmaddubsw m5, m4, m6 |
| 18530 | pmulhrsw m5, m7 |
| 18531 | packuswb m3, m5 |
| 18532 | movu [r0 + 377 * 16], m3 |
| 18533 | |
| 18534 | ; mode 7 [row 29] |
| 18535 | movu m6, [r5 + 14 * 16] |
| 18536 | pmaddubsw m3, m0, m6 |
| 18537 | pmulhrsw m3, m7 |
| 18538 | pmaddubsw m5, m2, m6 |
| 18539 | pmulhrsw m5, m7 |
| 18540 | packuswb m3, m5 |
| 18541 | movu [r0 + 378 * 16], m3 |
| 18542 | pmaddubsw m3, m1, m6 |
| 18543 | pmulhrsw m3, m7 |
| 18544 | pmaddubsw m5, m4, m6 |
| 18545 | pmulhrsw m5, m7 |
| 18546 | packuswb m3, m5 |
| 18547 | movu [r0 + 379 * 16], m3 |
| 18548 | |
| 18549 | ; mode 7 [row 30] |
| 18550 | movu m6, [r5 + 23 * 16] |
| 18551 | pmaddubsw m3, m0, m6 |
| 18552 | pmulhrsw m3, m7 |
| 18553 | pmaddubsw m5, m2, m6 |
| 18554 | pmulhrsw m5, m7 |
| 18555 | packuswb m3, m5 |
| 18556 | movu [r0 + 380 * 16], m3 |
| 18557 | pmaddubsw m3, m1, m6 |
| 18558 | pmulhrsw m3, m7 |
| 18559 | pmaddubsw m5, m4, m6 |
| 18560 | pmulhrsw m5, m7 |
| 18561 | packuswb m3, m5 |
| 18562 | movu [r0 + 381 * 16], m3 |
| 18563 | |
| 18564 | ; mode 3 [row 11] |
| 18565 | movu m6, [r5 + 24 * 16] |
| 18566 | movu m0, [r4 + 10] |
| 18567 | movd m1, [r4 + 11] |
| 18568 | palignr m1, m0, 1 |
| 18569 | punpcklbw m0, m1 |
| 18570 | pmaddubsw m1, m0, m6 |
| 18571 | pmulhrsw m1, m7 |
| 18572 | movu m2, [r4 + 18] |
| 18573 | movd m3, [r4 + 19] |
| 18574 | palignr m3, m2, 1 |
| 18575 | punpcklbw m2, m3 |
| 18576 | pmaddubsw m3, m2, m6 |
| 18577 | pmulhrsw m3, m7 |
| 18578 | packuswb m1, m3 |
| 18579 | movu [r0 + 86 * 16], m1 |
| 18580 | |
| 18581 | ; mode 6 [row 23 - first half] |
| 18582 | movu [r0 + 302 * 16], m1 |
| 18583 | |
| 18584 | movu m1, [r4 + 26] |
| 18585 | movd m3, [r4 + 27] |
| 18586 | palignr m3, m1, 1 |
| 18587 | punpcklbw m1, m3 |
| 18588 | pmaddubsw m3, m1, m6 |
| 18589 | pmulhrsw m3, m7 |
| 18590 | movu m4, [r4 + 34] |
| 18591 | movd m5, [r4 + 35] |
| 18592 | palignr m5, m4, 1 |
| 18593 | punpcklbw m4, m5 |
| 18594 | pmaddubsw m5, m4, m6 |
| 18595 | pmulhrsw m5, m7 |
| 18596 | packuswb m3, m5 |
| 18597 | movu [r0 + 87 * 16], m3 |
| 18598 | |
| 18599 | ; mode 6 [row 23 - second half] |
| 18600 | movu [r0 + 303 * 16], m3 |
| 18601 | |
| 18602 | ; mode 4 [row 13] |
| 18603 | movu m6, [r5 + 6 * 16] |
| 18604 | pmaddubsw m3, m0, m6 |
| 18605 | pmulhrsw m3, m7 |
| 18606 | pmaddubsw m5, m2, m6 |
| 18607 | pmulhrsw m5, m7 |
| 18608 | packuswb m3, m5 |
| 18609 | movu [r0 + 154 * 16], m3 |
| 18610 | pmaddubsw m3, m1, m6 |
| 18611 | pmulhrsw m3, m7 |
| 18612 | pmaddubsw m5, m4, m6 |
| 18613 | pmulhrsw m5, m7 |
| 18614 | packuswb m3, m5 |
| 18615 | movu [r0 + 155 * 16], m3 |
| 18616 | |
| 18617 | ; mode 4 [row 14] |
| 18618 | movu m6, [r5 + 27 * 16] |
| 18619 | pmaddubsw m3, m0, m6 |
| 18620 | pmulhrsw m3, m7 |
| 18621 | pmaddubsw m5, m2, m6 |
| 18622 | pmulhrsw m5, m7 |
| 18623 | packuswb m3, m5 |
| 18624 | movu [r0 + 156 * 16], m3 |
| 18625 | pmaddubsw m3, m1, m6 |
| 18626 | pmulhrsw m3, m7 |
| 18627 | pmaddubsw m5, m4, m6 |
| 18628 | pmulhrsw m5, m7 |
| 18629 | packuswb m3, m5 |
| 18630 | movu [r0 + 157 * 16], m3 |
| 18631 | |
| 18632 | ; mode 5 [row 16] |
| 18633 | movu m6, [r5 + 1 * 16] |
| 18634 | pmaddubsw m3, m0, m6 |
| 18635 | pmulhrsw m3, m7 |
| 18636 | pmaddubsw m5, m2, m6 |
| 18637 | pmulhrsw m5, m7 |
| 18638 | packuswb m3, m5 |
| 18639 | movu [r0 + 224 * 16], m3 |
| 18640 | pmaddubsw m3, m1, m6 |
| 18641 | pmulhrsw m3, m7 |
| 18642 | pmaddubsw m5, m4, m6 |
| 18643 | pmulhrsw m5, m7 |
| 18644 | packuswb m3, m5 |
| 18645 | movu [r0 + 225 * 16], m3 |
| 18646 | |
| 18647 | ; mode 5 [row 17] |
| 18648 | movu m6, [r5 + 18 * 16] |
| 18649 | pmaddubsw m3, m0, m6 |
| 18650 | pmulhrsw m3, m7 |
| 18651 | pmaddubsw m5, m2, m6 |
| 18652 | pmulhrsw m5, m7 |
| 18653 | packuswb m3, m5 |
| 18654 | movu [r0 + 226 * 16], m3 |
| 18655 | pmaddubsw m3, m1, m6 |
| 18656 | pmulhrsw m3, m7 |
| 18657 | pmaddubsw m5, m4, m6 |
| 18658 | pmulhrsw m5, m7 |
| 18659 | packuswb m3, m5 |
| 18660 | movu [r0 + 227 * 16], m3 |
| 18661 | |
| 18662 | ; mode 6 [row 22] |
| 18663 | movu m6, [r5 + 11 * 16] |
| 18664 | pmaddubsw m3, m0, m6 |
| 18665 | pmulhrsw m3, m7 |
| 18666 | pmaddubsw m5, m2, m6 |
| 18667 | pmulhrsw m5, m7 |
| 18668 | packuswb m3, m5 |
| 18669 | movu [r0 + 300 * 16], m3 |
| 18670 | pmaddubsw m3, m1, m6 |
| 18671 | pmulhrsw m3, m7 |
| 18672 | pmaddubsw m5, m4, m6 |
| 18673 | pmulhrsw m5, m7 |
| 18674 | packuswb m3, m5 |
| 18675 | movu [r0 + 301 * 16], m3 |
| 18676 | |
| 18677 | ; mode 3 [row 12] |
| 18678 | movu m6, [r5 + 18 * 16] |
| 18679 | movu m0, [r4 + 11] |
| 18680 | movd m1, [r4 + 12] |
| 18681 | palignr m1, m0, 1 |
| 18682 | punpcklbw m0, m1 |
| 18683 | pmaddubsw m1, m0, m6 |
| 18684 | pmulhrsw m1, m7 |
| 18685 | movu m2, [r4 + 19] |
| 18686 | movd m3, [r4 + 20] |
| 18687 | palignr m3, m2, 1 |
| 18688 | punpcklbw m2, m3 |
| 18689 | pmaddubsw m3, m2, m6 |
| 18690 | pmulhrsw m3, m7 |
| 18691 | packuswb m1, m3 |
| 18692 | movu [r0 + 88 * 16], m1 |
| 18693 | |
| 18694 | ; mode 6 [row 25 - first half] |
| 18695 | movu [r0 + 306 * 16], m1 |
| 18696 | |
| 18697 | movu m1, [r4 + 27] |
| 18698 | movd m3, [r4 + 28] |
| 18699 | palignr m3, m1, 1 |
| 18700 | punpcklbw m1, m3 |
| 18701 | pmaddubsw m3, m1, m6 |
| 18702 | pmulhrsw m3, m7 |
| 18703 | movu m4, [r4 + 35] |
| 18704 | movd m5, [r4 + 36] |
| 18705 | palignr m5, m4, 1 |
| 18706 | punpcklbw m4, m5 |
| 18707 | pmaddubsw m5, m4, m6 |
| 18708 | pmulhrsw m5, m7 |
| 18709 | packuswb m3, m5 |
| 18710 | movu [r0 + 89 * 16], m3 |
| 18711 | |
| 18712 | ; mode 6 [row 25 - second half] |
| 18713 | movu [r0 + 307 * 16], m3 |
| 18714 | |
| 18715 | ; mode 4 [row 15] |
| 18716 | movu m6, [r5 + 16 * 16] |
| 18717 | pmaddubsw m3, m0, m6 |
| 18718 | pmulhrsw m3, m7 |
| 18719 | pmaddubsw m5, m2, m6 |
| 18720 | pmulhrsw m5, m7 |
| 18721 | packuswb m3, m5 |
| 18722 | movu [r0 + 158 * 16], m3 |
| 18723 | pmaddubsw m3, m1, m6 |
| 18724 | pmulhrsw m3, m7 |
| 18725 | pmaddubsw m5, m4, m6 |
| 18726 | pmulhrsw m5, m7 |
| 18727 | packuswb m3, m5 |
| 18728 | movu [r0 + 159 * 16], m3 |
| 18729 | |
| 18730 | ; mode 5 [row 18] |
| 18731 | movu m6, [r5 + 3 * 16] |
| 18732 | pmaddubsw m3, m0, m6 |
| 18733 | pmulhrsw m3, m7 |
| 18734 | pmaddubsw m5, m2, m6 |
| 18735 | pmulhrsw m5, m7 |
| 18736 | packuswb m3, m5 |
| 18737 | movu [r0 + 228 * 16], m3 |
| 18738 | pmaddubsw m3, m1, m6 |
| 18739 | pmulhrsw m3, m7 |
| 18740 | pmaddubsw m5, m4, m6 |
| 18741 | pmulhrsw m5, m7 |
| 18742 | packuswb m3, m5 |
| 18743 | movu [r0 + 229 * 16], m3 |
| 18744 | |
| 18745 | ; mode 5 [row 19] |
| 18746 | movu m6, [r5 + 20 * 16] |
| 18747 | pmaddubsw m3, m0, m6 |
| 18748 | pmulhrsw m3, m7 |
| 18749 | pmaddubsw m5, m2, m6 |
| 18750 | pmulhrsw m5, m7 |
| 18751 | packuswb m3, m5 |
| 18752 | movu [r0 + 230 * 16], m3 |
| 18753 | pmaddubsw m3, m1, m6 |
| 18754 | pmulhrsw m3, m7 |
| 18755 | pmaddubsw m5, m4, m6 |
| 18756 | pmulhrsw m5, m7 |
| 18757 | packuswb m3, m5 |
| 18758 | movu [r0 + 231 * 16], m3 |
| 18759 | |
| 18760 | ; mode 6 [row 24] |
| 18761 | movu m6, [r5 + 5 * 16] |
| 18762 | pmaddubsw m3, m0, m6 |
| 18763 | pmulhrsw m3, m7 |
| 18764 | pmaddubsw m5, m2, m6 |
| 18765 | pmulhrsw m5, m7 |
| 18766 | packuswb m3, m5 |
| 18767 | movu [r0 + 304 * 16], m3 |
| 18768 | pmaddubsw m3, m1, m6 |
| 18769 | pmulhrsw m3, m7 |
| 18770 | pmaddubsw m5, m4, m6 |
| 18771 | pmulhrsw m5, m7 |
| 18772 | packuswb m3, m5 |
| 18773 | movu [r0 + 305 * 16], m3 |
| 18774 | |
| 18775 | ; mode 6 [row 26] |
| 18776 | movu m6, [r5 + 31 * 16] |
| 18777 | pmaddubsw m3, m0, m6 |
| 18778 | pmulhrsw m3, m7 |
| 18779 | pmaddubsw m5, m2, m6 |
| 18780 | pmulhrsw m5, m7 |
| 18781 | packuswb m3, m5 |
| 18782 | movu [r0 + 308 * 16], m3 |
| 18783 | pmaddubsw m3, m1, m6 |
| 18784 | pmulhrsw m3, m7 |
| 18785 | pmaddubsw m5, m4, m6 |
| 18786 | pmulhrsw m5, m7 |
| 18787 | packuswb m3, m5 |
| 18788 | movu [r0 + 309 * 16], m3 |
| 18789 | |
| 18790 | ; mode 3 [row 13] |
| 18791 | movu m6, [r5 + 12 * 16] |
| 18792 | movu m0, [r4 + 12] |
| 18793 | movd m1, [r4 + 13] |
| 18794 | palignr m1, m0, 1 |
| 18795 | punpcklbw m0, m1 |
| 18796 | pmaddubsw m1, m0, m6 |
| 18797 | pmulhrsw m1, m7 |
| 18798 | movu m2, [r4 + 20] |
| 18799 | movd m3, [r4 + 21] |
| 18800 | palignr m3, m2, 1 |
| 18801 | punpcklbw m2, m3 |
| 18802 | pmaddubsw m3, m2, m6 |
| 18803 | pmulhrsw m3, m7 |
| 18804 | packuswb m1, m3 |
| 18805 | movu [r0 + 90 * 16], m1 |
| 18806 | |
| 18807 | movu m1, [r4 + 28] |
| 18808 | movd m3, [r4 + 29] |
| 18809 | palignr m3, m1, 1 |
| 18810 | punpcklbw m1, m3 |
| 18811 | pmaddubsw m3, m1, m6 |
| 18812 | pmulhrsw m3, m7 |
| 18813 | movu m4, [r4 + 36] |
| 18814 | movd m5, [r4 + 37] |
| 18815 | palignr m5, m4, 1 |
| 18816 | punpcklbw m4, m5 |
| 18817 | pmaddubsw m5, m4, m6 |
| 18818 | pmulhrsw m5, m7 |
| 18819 | packuswb m3, m5 |
| 18820 | movu [r0 + 91 * 16], m3 |
| 18821 | |
| 18822 | ; mode 4 [row 16] |
| 18823 | movu m6, [r5 + 5 * 16] |
| 18824 | pmaddubsw m3, m0, m6 |
| 18825 | pmulhrsw m3, m7 |
| 18826 | pmaddubsw m5, m2, m6 |
| 18827 | pmulhrsw m5, m7 |
| 18828 | packuswb m3, m5 |
| 18829 | movu [r0 + 160 * 16], m3 |
| 18830 | |
| 18831 | ; mode 5 [row 20 - first half] |
| 18832 | movu [r0 + 232 * 16], m3 |
| 18833 | |
| 18834 | pmaddubsw m3, m1, m6 |
| 18835 | pmulhrsw m3, m7 |
| 18836 | pmaddubsw m5, m4, m6 |
| 18837 | pmulhrsw m5, m7 |
| 18838 | packuswb m3, m5 |
| 18839 | movu [r0 + 161 * 16], m3 |
| 18840 | |
| 18841 | ; mode 5 [row 20 - second half] |
| 18842 | movu [r0 + 233 * 16], m3 |
| 18843 | |
| 18844 | ; mode 4 [row 17] |
| 18845 | movu m6, [r5 + 26 * 16] |
| 18846 | pmaddubsw m3, m0, m6 |
| 18847 | pmulhrsw m3, m7 |
| 18848 | pmaddubsw m5, m2, m6 |
| 18849 | pmulhrsw m5, m7 |
| 18850 | packuswb m3, m5 |
| 18851 | movu [r0 + 162 * 16], m3 |
| 18852 | pmaddubsw m3, m1, m6 |
| 18853 | pmulhrsw m3, m7 |
| 18854 | pmaddubsw m5, m4, m6 |
| 18855 | pmulhrsw m5, m7 |
| 18856 | packuswb m3, m5 |
| 18857 | movu [r0 + 163 * 16], m3 |
| 18858 | |
| 18859 | ; mode 5 [row 21] |
| 18860 | movu m6, [r5 + 22 * 16] |
| 18861 | pmaddubsw m3, m0, m6 |
| 18862 | pmulhrsw m3, m7 |
| 18863 | pmaddubsw m5, m2, m6 |
| 18864 | pmulhrsw m5, m7 |
| 18865 | packuswb m3, m5 |
| 18866 | movu [r0 + 234 * 16], m3 |
| 18867 | pmaddubsw m3, m1, m6 |
| 18868 | pmulhrsw m3, m7 |
| 18869 | pmaddubsw m5, m4, m6 |
| 18870 | pmulhrsw m5, m7 |
| 18871 | packuswb m3, m5 |
| 18872 | movu [r0 + 235 * 16], m3 |
| 18873 | |
| 18874 | ; mode 6 [row 27] |
| 18875 | movu m6, [r5 + 12 * 16] |
| 18876 | pmaddubsw m3, m0, m6 |
| 18877 | pmulhrsw m3, m7 |
| 18878 | pmaddubsw m5, m2, m6 |
| 18879 | pmulhrsw m5, m7 |
| 18880 | packuswb m3, m5 |
| 18881 | movu [r0 + 310 * 16], m3 |
| 18882 | pmaddubsw m3, m1, m6 |
| 18883 | pmulhrsw m3, m7 |
| 18884 | pmaddubsw m5, m4, m6 |
| 18885 | pmulhrsw m5, m7 |
| 18886 | packuswb m3, m5 |
| 18887 | movu [r0 + 311 * 16], m3 |
| 18888 | |
| 18889 | ; mode 6 [row 28] |
| 18890 | movu m6, [r5 + 25 * 16] |
| 18891 | pmaddubsw m3, m0, m6 |
| 18892 | pmulhrsw m3, m7 |
| 18893 | pmaddubsw m5, m2, m6 |
| 18894 | pmulhrsw m5, m7 |
| 18895 | packuswb m3, m5 |
| 18896 | movu [r0 + 312 * 16], m3 |
| 18897 | pmaddubsw m3, m1, m6 |
| 18898 | pmulhrsw m3, m7 |
| 18899 | pmaddubsw m5, m4, m6 |
| 18900 | pmulhrsw m5, m7 |
| 18901 | packuswb m3, m5 |
| 18902 | movu [r0 + 313 * 16], m3 |
| 18903 | |
| 18904 | ; mode 3 [row 14] |
| 18905 | movu m6, [r5 + 6 * 16] |
| 18906 | movu m0, [r4 + 13] |
| 18907 | movd m1, [r4 + 14] |
| 18908 | palignr m1, m0, 1 |
| 18909 | punpcklbw m0, m1 |
| 18910 | pmaddubsw m1, m0, m6 |
| 18911 | pmulhrsw m1, m7 |
| 18912 | movu m2, [r4 + 21] |
| 18913 | movd m3, [r4 + 22] |
| 18914 | palignr m3, m2, 1 |
| 18915 | punpcklbw m2, m3 |
| 18916 | pmaddubsw m3, m2, m6 |
| 18917 | pmulhrsw m3, m7 |
| 18918 | packuswb m1, m3 |
| 18919 | movu [r0 + 92 * 16], m1 |
| 18920 | |
| 18921 | ; mode 6 [row 29 - first half] |
| 18922 | movu [r0 + 314 * 16], m1 |
| 18923 | |
| 18924 | movu m1, [r4 + 29] |
| 18925 | movd m3, [r4 + 30] |
| 18926 | palignr m3, m1, 1 |
| 18927 | punpcklbw m1, m3 |
| 18928 | pmaddubsw m3, m1, m6 |
| 18929 | pmulhrsw m3, m7 |
| 18930 | movu m4, [r4 + 37] |
| 18931 | movd m5, [r4 + 38] |
| 18932 | palignr m5, m4, 1 |
| 18933 | punpcklbw m4, m5 |
| 18934 | pmaddubsw m5, m4, m6 |
| 18935 | pmulhrsw m5, m7 |
| 18936 | packuswb m3, m5 |
| 18937 | movu [r0 + 93 * 16], m3 |
| 18938 | |
| 18939 | ; mode 6 [row 29 - second half] |
| 18940 | movu [r0 + 315 * 16], m3 |
| 18941 | |
| 18942 | ; mode 4 [row 18] |
| 18943 | movu m6, [r5 + 15 * 16] |
| 18944 | pmaddubsw m3, m0, m6 |
| 18945 | pmulhrsw m3, m7 |
| 18946 | pmaddubsw m5, m2, m6 |
| 18947 | pmulhrsw m5, m7 |
| 18948 | packuswb m3, m5 |
| 18949 | movu [r0 + 164 * 16], m3 |
| 18950 | pmaddubsw m3, m1, m6 |
| 18951 | pmulhrsw m3, m7 |
| 18952 | pmaddubsw m5, m4, m6 |
| 18953 | pmulhrsw m5, m7 |
| 18954 | packuswb m3, m5 |
| 18955 | movu [r0 + 165 * 16], m3 |
| 18956 | |
| 18957 | ; mode 5 [row 22] |
| 18958 | movu m6, [r5 + 7 * 16] |
| 18959 | pmaddubsw m3, m0, m6 |
| 18960 | pmulhrsw m3, m7 |
| 18961 | pmaddubsw m5, m2, m6 |
| 18962 | pmulhrsw m5, m7 |
| 18963 | packuswb m3, m5 |
| 18964 | movu [r0 + 236 * 16], m3 |
| 18965 | pmaddubsw m3, m1, m6 |
| 18966 | pmulhrsw m3, m7 |
| 18967 | pmaddubsw m5, m4, m6 |
| 18968 | pmulhrsw m5, m7 |
| 18969 | packuswb m3, m5 |
| 18970 | movu [r0 + 237 * 16], m3 |
| 18971 | |
| 18972 | ; mode 5 [row 23] |
| 18973 | movu m6, [r5 + 24 * 16] |
| 18974 | pmaddubsw m3, m0, m6 |
| 18975 | pmulhrsw m3, m7 |
| 18976 | pmaddubsw m5, m2, m6 |
| 18977 | pmulhrsw m5, m7 |
| 18978 | packuswb m3, m5 |
| 18979 | movu [r0 + 238 * 16], m3 |
| 18980 | pmaddubsw m3, m1, m6 |
| 18981 | pmulhrsw m3, m7 |
| 18982 | pmaddubsw m5, m4, m6 |
| 18983 | pmulhrsw m5, m7 |
| 18984 | packuswb m3, m5 |
| 18985 | movu [r0 + 239 * 16], m3 |
| 18986 | |
| 18987 | ; mode 6 [row 30] |
| 18988 | movu m6, [r5 + 19 * 16] |
| 18989 | pmaddubsw m3, m0, m6 |
| 18990 | pmulhrsw m3, m7 |
| 18991 | pmaddubsw m5, m2, m6 |
| 18992 | pmulhrsw m5, m7 |
| 18993 | packuswb m3, m5 |
| 18994 | movu [r0 + 316 * 16], m3 |
| 18995 | pmaddubsw m3, m1, m6 |
| 18996 | pmulhrsw m3, m7 |
| 18997 | pmaddubsw m5, m4, m6 |
| 18998 | pmulhrsw m5, m7 |
| 18999 | packuswb m3, m5 |
| 19000 | movu [r0 + 317 * 16], m3 |
| 19001 | |
| 19002 | ; mode 3 [row 16] |
| 19003 | movu m6, [r5 + 26 * 16] |
| 19004 | movu m0, [r4 + 14] |
| 19005 | movd m1, [r4 + 15] |
| 19006 | palignr m1, m0, 1 |
| 19007 | punpcklbw m0, m1 |
| 19008 | pmaddubsw m1, m0, m6 |
| 19009 | pmulhrsw m1, m7 |
| 19010 | movu m2, [r4 + 22] |
| 19011 | movd m3, [r4 + 23] |
| 19012 | palignr m3, m2, 1 |
| 19013 | punpcklbw m2, m3 |
| 19014 | pmaddubsw m3, m2, m6 |
| 19015 | pmulhrsw m3, m7 |
| 19016 | packuswb m1, m3 |
| 19017 | movu [r0 + 96 * 16], m1 |
| 19018 | |
| 19019 | ; mode 5 [row 25 - first half] |
| 19020 | movu [r0 + 242 * 16], m1 |
| 19021 | |
| 19022 | movu m1, [r4 + 30] |
| 19023 | movd m3, [r4 + 31] |
| 19024 | palignr m3, m1, 1 |
| 19025 | punpcklbw m1, m3 |
| 19026 | pmaddubsw m3, m1, m6 |
| 19027 | pmulhrsw m3, m7 |
| 19028 | movu m4, [r4 + 38] |
| 19029 | movd m5, [r4 + 39] |
| 19030 | palignr m5, m4, 1 |
| 19031 | punpcklbw m4, m5 |
| 19032 | pmaddubsw m5, m4, m6 |
| 19033 | pmulhrsw m5, m7 |
| 19034 | packuswb m3, m5 |
| 19035 | movu [r0 + 97 * 16], m3 |
| 19036 | |
| 19037 | ; mode 5 [row 25 - second half] |
| 19038 | movu [r0 + 243 * 16], m3 |
| 19039 | |
| 19040 | ; mode 4 [row 19] |
| 19041 | movu m6, [r5 + 4 * 16] |
| 19042 | pmaddubsw m3, m0, m6 |
| 19043 | pmulhrsw m3, m7 |
| 19044 | pmaddubsw m5, m2, m6 |
| 19045 | pmulhrsw m5, m7 |
| 19046 | packuswb m3, m5 |
| 19047 | movu [r0 + 166 * 16], m3 |
| 19048 | pmaddubsw m3, m1, m6 |
| 19049 | pmulhrsw m3, m7 |
| 19050 | pmaddubsw m5, m4, m6 |
| 19051 | pmulhrsw m5, m7 |
| 19052 | packuswb m3, m5 |
| 19053 | movu [r0 + 167 * 16], m3 |
| 19054 | |
| 19055 | ; mode 4 [row 20] |
| 19056 | movu m6, [r5 + 25 * 16] |
| 19057 | pmaddubsw m3, m0, m6 |
| 19058 | pmulhrsw m3, m7 |
| 19059 | pmaddubsw m5, m2, m6 |
| 19060 | pmulhrsw m5, m7 |
| 19061 | packuswb m3, m5 |
| 19062 | movu [r0 + 168 * 16], m3 |
| 19063 | pmaddubsw m3, m1, m6 |
| 19064 | pmulhrsw m3, m7 |
| 19065 | pmaddubsw m5, m4, m6 |
| 19066 | pmulhrsw m5, m7 |
| 19067 | packuswb m3, m5 |
| 19068 | movu [r0 + 169 * 16], m3 |
| 19069 | |
| 19070 | ; mode 5 [row 24] |
| 19071 | movu m6, [r5 + 9 * 16] |
| 19072 | pmaddubsw m3, m0, m6 |
| 19073 | pmulhrsw m3, m7 |
| 19074 | pmaddubsw m5, m2, m6 |
| 19075 | pmulhrsw m5, m7 |
| 19076 | packuswb m3, m5 |
| 19077 | movu [r0 + 240 * 16], m3 |
| 19078 | pmaddubsw m3, m1, m6 |
| 19079 | pmulhrsw m3, m7 |
| 19080 | pmaddubsw m5, m4, m6 |
| 19081 | pmulhrsw m5, m7 |
| 19082 | packuswb m3, m5 |
| 19083 | movu [r0 + 241 * 16], m3 |
| 19084 | |
| 19085 | ; mode 3 [row 17] |
| 19086 | movu m6, [r5 + 20 * 16] |
| 19087 | movu m0, [r4 + 15] |
| 19088 | movd m1, [r4 + 16] |
| 19089 | palignr m1, m0, 1 |
| 19090 | punpcklbw m0, m1 |
| 19091 | pmaddubsw m1, m0, m6 |
| 19092 | pmulhrsw m1, m7 |
| 19093 | movu m2, [r4 + 23] |
| 19094 | movd m3, [r4 + 24] |
| 19095 | palignr m3, m2, 1 |
| 19096 | punpcklbw m2, m3 |
| 19097 | pmaddubsw m3, m2, m6 |
| 19098 | pmulhrsw m3, m7 |
| 19099 | packuswb m1, m3 |
| 19100 | movu [r0 + 98 * 16], m1 |
| 19101 | |
| 19102 | movu m1, [r4 + 31] |
| 19103 | movd m3, [r4 + 32] |
| 19104 | palignr m3, m1, 1 |
| 19105 | punpcklbw m1, m3 |
| 19106 | pmaddubsw m3, m1, m6 |
| 19107 | pmulhrsw m3, m7 |
| 19108 | movu m4, [r4 + 39] |
| 19109 | movd m5, [r4 + 40] |
| 19110 | palignr m5, m4, 1 |
| 19111 | punpcklbw m4, m5 |
| 19112 | pmaddubsw m5, m4, m6 |
| 19113 | pmulhrsw m5, m7 |
| 19114 | packuswb m3, m5 |
| 19115 | movu [r0 + 99 * 16], m3 |
| 19116 | |
| 19117 | ; mode 4 [row 21] |
| 19118 | movu m6, [r5 + 14 * 16] |
| 19119 | pmaddubsw m3, m0, m6 |
| 19120 | pmulhrsw m3, m7 |
| 19121 | pmaddubsw m5, m2, m6 |
| 19122 | pmulhrsw m5, m7 |
| 19123 | packuswb m3, m5 |
| 19124 | movu [r0 + 170 * 16], m3 |
| 19125 | pmaddubsw m3, m1, m6 |
| 19126 | pmulhrsw m3, m7 |
| 19127 | pmaddubsw m5, m4, m6 |
| 19128 | pmulhrsw m5, m7 |
| 19129 | packuswb m3, m5 |
| 19130 | movu [r0 + 171 * 16], m3 |
| 19131 | |
| 19132 | ; mode 5 [row 26] |
| 19133 | movu m6, [r5 + 11 * 16] |
| 19134 | pmaddubsw m3, m0, m6 |
| 19135 | pmulhrsw m3, m7 |
| 19136 | pmaddubsw m5, m2, m6 |
| 19137 | pmulhrsw m5, m7 |
| 19138 | packuswb m3, m5 |
| 19139 | movu [r0 + 244 * 16], m3 |
| 19140 | pmaddubsw m3, m1, m6 |
| 19141 | pmulhrsw m3, m7 |
| 19142 | pmaddubsw m5, m4, m6 |
| 19143 | pmulhrsw m5, m7 |
| 19144 | packuswb m3, m5 |
| 19145 | movu [r0 + 245 * 16], m3 |
| 19146 | |
| 19147 | ; mode 5 [row 27] |
| 19148 | movu m6, [r5 + 28 * 16] |
| 19149 | pmaddubsw m3, m0, m6 |
| 19150 | pmulhrsw m3, m7 |
| 19151 | pmaddubsw m5, m2, m6 |
| 19152 | pmulhrsw m5, m7 |
| 19153 | packuswb m3, m5 |
| 19154 | movu [r0 + 246 * 16], m3 |
| 19155 | pmaddubsw m3, m1, m6 |
| 19156 | pmulhrsw m3, m7 |
| 19157 | pmaddubsw m5, m4, m6 |
| 19158 | pmulhrsw m5, m7 |
| 19159 | packuswb m3, m5 |
| 19160 | movu [r0 + 247 * 16], m3 |
| 19161 | |
| 19162 | ; mode 3 [row 18] |
| 19163 | movu m6, [r5 + 14 * 16] |
| 19164 | movu m0, [r4 + 16] |
| 19165 | movd m1, [r4 + 17] |
| 19166 | palignr m1, m0, 1 |
| 19167 | punpcklbw m0, m1 |
| 19168 | pmaddubsw m1, m0, m6 |
| 19169 | pmulhrsw m1, m7 |
| 19170 | movu m2, [r4 + 24] |
| 19171 | movd m3, [r4 + 25] |
| 19172 | palignr m3, m2, 1 |
| 19173 | punpcklbw m2, m3 |
| 19174 | pmaddubsw m3, m2, m6 |
| 19175 | pmulhrsw m3, m7 |
| 19176 | packuswb m1, m3 |
| 19177 | movu [r0 + 100 * 16], m1 |
| 19178 | |
| 19179 | movu m1, [r4 + 32] |
| 19180 | movd m3, [r4 + 33] |
| 19181 | palignr m3, m1, 1 |
| 19182 | punpcklbw m1, m3 |
| 19183 | pmaddubsw m3, m1, m6 |
| 19184 | pmulhrsw m3, m7 |
| 19185 | movu m4, [r4 + 40] |
| 19186 | movd m5, [r4 + 41] |
| 19187 | palignr m5, m4, 1 |
| 19188 | punpcklbw m4, m5 |
| 19189 | pmaddubsw m5, m4, m6 |
| 19190 | pmulhrsw m5, m7 |
| 19191 | packuswb m3, m5 |
| 19192 | movu [r0 + 101 * 16], m3 |
| 19193 | |
| 19194 | ; mode 4 [row 22] |
| 19195 | movu m6, [r5 + 3 * 16] |
| 19196 | pmaddubsw m3, m0, m6 |
| 19197 | pmulhrsw m3, m7 |
| 19198 | pmaddubsw m5, m2, m6 |
| 19199 | pmulhrsw m5, m7 |
| 19200 | packuswb m3, m5 |
| 19201 | movu [r0 + 172 * 16], m3 |
| 19202 | pmaddubsw m3, m1, m6 |
| 19203 | pmulhrsw m3, m7 |
| 19204 | pmaddubsw m5, m4, m6 |
| 19205 | pmulhrsw m5, m7 |
| 19206 | packuswb m3, m5 |
| 19207 | movu [r0 + 173 * 16], m3 |
| 19208 | |
| 19209 | ; mode 4 [row 23] |
| 19210 | movu m6, [r5 + 24 * 16] |
| 19211 | pmaddubsw m3, m0, m6 |
| 19212 | pmulhrsw m3, m7 |
| 19213 | pmaddubsw m5, m2, m6 |
| 19214 | pmulhrsw m5, m7 |
| 19215 | packuswb m3, m5 |
| 19216 | movu [r0 + 174 * 16], m3 |
| 19217 | pmaddubsw m3, m1, m6 |
| 19218 | pmulhrsw m3, m7 |
| 19219 | pmaddubsw m5, m4, m6 |
| 19220 | pmulhrsw m5, m7 |
| 19221 | packuswb m3, m5 |
| 19222 | movu [r0 + 175 * 16], m3 |
| 19223 | |
| 19224 | ; mode 5 [row 28] |
| 19225 | movu m6, [r5 + 13 * 16] |
| 19226 | pmaddubsw m3, m0, m6 |
| 19227 | pmulhrsw m3, m7 |
| 19228 | pmaddubsw m5, m2, m6 |
| 19229 | pmulhrsw m5, m7 |
| 19230 | packuswb m3, m5 |
| 19231 | movu [r0 + 248 * 16], m3 |
| 19232 | pmaddubsw m3, m1, m6 |
| 19233 | pmulhrsw m3, m7 |
| 19234 | pmaddubsw m5, m4, m6 |
| 19235 | pmulhrsw m5, m7 |
| 19236 | packuswb m3, m5 |
| 19237 | movu [r0 + 249 * 16], m3 |
| 19238 | |
| 19239 | ; mode 5 [row 29] |
| 19240 | movu m6, [r5 + 30 * 16] |
| 19241 | pmaddubsw m3, m0, m6 |
| 19242 | pmulhrsw m3, m7 |
| 19243 | pmaddubsw m5, m2, m6 |
| 19244 | pmulhrsw m5, m7 |
| 19245 | packuswb m3, m5 |
| 19246 | movu [r0 + 250 * 16], m3 |
| 19247 | pmaddubsw m3, m1, m6 |
| 19248 | pmulhrsw m3, m7 |
| 19249 | pmaddubsw m5, m4, m6 |
| 19250 | pmulhrsw m5, m7 |
| 19251 | packuswb m3, m5 |
| 19252 | movu [r0 + 251 * 16], m3 |
| 19253 | |
| 19254 | ; mode 3 [row 19] |
| 19255 | movu m6, [r5 + 8 * 16] |
| 19256 | movu m0, [r4 + 17] |
| 19257 | movd m1, [r4 + 18] |
| 19258 | palignr m1, m0, 1 |
| 19259 | punpcklbw m0, m1 |
| 19260 | pmaddubsw m1, m0, m6 |
| 19261 | pmulhrsw m1, m7 |
| 19262 | movu m2, [r4 + 25] |
| 19263 | movd m3, [r4 + 26] |
| 19264 | palignr m3, m2, 1 |
| 19265 | punpcklbw m2, m3 |
| 19266 | pmaddubsw m3, m2, m6 |
| 19267 | pmulhrsw m3, m7 |
| 19268 | packuswb m1, m3 |
| 19269 | movu [r0 + 102 * 16], m1 |
| 19270 | |
| 19271 | movu m1, [r4 + 33] |
| 19272 | movd m3, [r4 + 34] |
| 19273 | palignr m3, m1, 1 |
| 19274 | punpcklbw m1, m3 |
| 19275 | pmaddubsw m3, m1, m6 |
| 19276 | pmulhrsw m3, m7 |
| 19277 | movu m4, [r4 + 41] |
| 19278 | movd m5, [r4 + 42] |
| 19279 | palignr m5, m4, 1 |
| 19280 | punpcklbw m4, m5 |
| 19281 | pmaddubsw m5, m4, m6 |
| 19282 | pmulhrsw m5, m7 |
| 19283 | packuswb m3, m5 |
| 19284 | movu [r0 + 103 * 16], m3 |
| 19285 | |
| 19286 | ; mode 4 [row 24] |
| 19287 | movu m6, [r5 + 13 * 16] |
| 19288 | pmaddubsw m3, m0, m6 |
| 19289 | pmulhrsw m3, m7 |
| 19290 | pmaddubsw m5, m2, m6 |
| 19291 | pmulhrsw m5, m7 |
| 19292 | packuswb m3, m5 |
| 19293 | movu [r0 + 176 * 16], m3 |
| 19294 | pmaddubsw m3, m1, m6 |
| 19295 | pmulhrsw m3, m7 |
| 19296 | pmaddubsw m5, m4, m6 |
| 19297 | pmulhrsw m5, m7 |
| 19298 | packuswb m3, m5 |
| 19299 | movu [r0 + 177 * 16], m3 |
| 19300 | |
| 19301 | ; mode 5 [row 30] |
| 19302 | movu m6, [r5 + 15 * 16] |
| 19303 | pmaddubsw m3, m0, m6 |
| 19304 | pmulhrsw m3, m7 |
| 19305 | pmaddubsw m5, m2, m6 |
| 19306 | pmulhrsw m5, m7 |
| 19307 | packuswb m3, m5 |
| 19308 | movu [r0 + 252 * 16], m3 |
| 19309 | pmaddubsw m3, m1, m6 |
| 19310 | pmulhrsw m3, m7 |
| 19311 | pmaddubsw m5, m4, m6 |
| 19312 | pmulhrsw m5, m7 |
| 19313 | packuswb m3, m5 |
| 19314 | movu [r0 + 253 * 16], m3 |
| 19315 | |
| 19316 | ; mode 3 [row 20] |
| 19317 | movu m6, [r5 + 2 * 16] |
| 19318 | movu m0, [r4 + 18] |
| 19319 | movd m1, [r4 + 19] |
| 19320 | palignr m1, m0, 1 |
| 19321 | punpcklbw m0, m1 |
| 19322 | pmaddubsw m1, m0, m6 |
| 19323 | pmulhrsw m1, m7 |
| 19324 | movu m2, [r4 + 26] |
| 19325 | movd m3, [r4 + 27] |
| 19326 | palignr m3, m2, 1 |
| 19327 | punpcklbw m2, m3 |
| 19328 | pmaddubsw m3, m2, m6 |
| 19329 | pmulhrsw m3, m7 |
| 19330 | packuswb m1, m3 |
| 19331 | movu [r0 + 104 * 16], m1 |
| 19332 | |
| 19333 | movu m1, [r4 + 34] |
| 19334 | movd m3, [r4 + 35] |
| 19335 | palignr m3, m1, 1 |
| 19336 | punpcklbw m1, m3 |
| 19337 | pmaddubsw m3, m1, m6 |
| 19338 | pmulhrsw m3, m7 |
| 19339 | movu m4, [r4 + 42] |
| 19340 | movd m5, [r4 + 43] |
| 19341 | palignr m5, m4, 1 |
| 19342 | punpcklbw m4, m5 |
| 19343 | pmaddubsw m5, m4, m6 |
| 19344 | pmulhrsw m5, m7 |
| 19345 | packuswb m3, m5 |
| 19346 | movu [r0 + 105 * 16], m3 |
| 19347 | |
| 19348 | ; mode 4 [row 25] (no reload: m6 still holds [r5 + 2 * 16] from mode 3 row 20) |
| 19349 | pmaddubsw m3, m0, m6 |
| 19350 | pmulhrsw m3, m7 |
| 19351 | pmaddubsw m5, m2, m6 |
| 19352 | pmulhrsw m5, m7 |
| 19353 | packuswb m3, m5 |
| 19354 | movu [r0 + 178 * 16], m3 |
| 19355 | pmaddubsw m3, m1, m6 |
| 19356 | pmulhrsw m3, m7 |
| 19357 | pmaddubsw m5, m4, m6 |
| 19358 | pmulhrsw m5, m7 |
| 19359 | packuswb m3, m5 |
| 19360 | movu [r0 + 179 * 16], m3 |
| 19361 | |
| 19362 | ; mode 4 [row 26] |
| 19363 | movu m6, [r5 + 23 * 16] |
| 19364 | pmaddubsw m3, m0, m6 |
| 19365 | pmulhrsw m3, m7 |
| 19366 | pmaddubsw m5, m2, m6 |
| 19367 | pmulhrsw m5, m7 |
| 19368 | packuswb m3, m5 |
| 19369 | movu [r0 + 180 * 16], m3 |
| 19370 | pmaddubsw m3, m1, m6 |
| 19371 | pmulhrsw m3, m7 |
| 19372 | pmaddubsw m5, m4, m6 |
| 19373 | pmulhrsw m5, m7 |
| 19374 | packuswb m3, m5 |
| 19375 | movu [r0 + 181 * 16], m3 |
| 19376 | |
| 19377 | ; mode 3 [row 21] |
| 19378 | movu m6, [r5 + 28 * 16] |
| 19379 | pmaddubsw m3, m0, m6 |
| 19380 | pmulhrsw m3, m7 |
| 19381 | pmaddubsw m5, m2, m6 |
| 19382 | pmulhrsw m5, m7 |
| 19383 | packuswb m3, m5 |
| 19384 | movu [r0 + 106 * 16], m3 |
| 19385 | pmaddubsw m3, m1, m6 |
| 19386 | pmulhrsw m3, m7 |
| 19387 | pmaddubsw m5, m4, m6 |
| 19388 | pmulhrsw m5, m7 |
| 19389 | packuswb m3, m5 |
| 19390 | movu [r0 + 107 * 16], m3 |
| 19391 | |
| 19392 | ; mode 3 [row 22] |
| 19393 | movu m6, [r5 + 22 * 16] |
| 19394 | movu m0, [r4 + 19] |
| 19395 | movd m1, [r4 + 20] |
| 19396 | palignr m1, m0, 1 |
| 19397 | punpcklbw m0, m1 |
| 19398 | pmaddubsw m1, m0, m6 |
| 19399 | pmulhrsw m1, m7 |
| 19400 | movu m2, [r4 + 27] |
| 19401 | movd m3, [r4 + 28] |
| 19402 | palignr m3, m2, 1 |
| 19403 | punpcklbw m2, m3 |
| 19404 | pmaddubsw m3, m2, m6 |
| 19405 | pmulhrsw m3, m7 |
| 19406 | packuswb m1, m3 |
| 19407 | movu [r0 + 108 * 16], m1 |
| 19408 | |
| 19409 | movu m1, [r4 + 35] |
| 19410 | movd m3, [r4 + 36] |
| 19411 | palignr m3, m1, 1 |
| 19412 | punpcklbw m1, m3 |
| 19413 | pmaddubsw m3, m1, m6 |
| 19414 | pmulhrsw m3, m7 |
| 19415 | movu m4, [r4 + 43] |
| 19416 | movd m5, [r4 + 44] |
| 19417 | palignr m5, m4, 1 |
| 19418 | punpcklbw m4, m5 |
| 19419 | pmaddubsw m5, m4, m6 |
| 19420 | pmulhrsw m5, m7 |
| 19421 | packuswb m3, m5 |
| 19422 | movu [r0 + 109 * 16], m3 |
| 19423 | |
| 19424 | ; mode 4 [row 27] |
| 19425 | movu m6, [r5 + 12 * 16] |
| 19426 | pmaddubsw m3, m0, m6 |
| 19427 | pmulhrsw m3, m7 |
| 19428 | pmaddubsw m5, m2, m6 |
| 19429 | pmulhrsw m5, m7 |
| 19430 | packuswb m3, m5 |
| 19431 | movu [r0 + 182 * 16], m3 |
| 19432 | pmaddubsw m3, m1, m6 |
| 19433 | pmulhrsw m3, m7 |
| 19434 | pmaddubsw m5, m4, m6 |
| 19435 | pmulhrsw m5, m7 |
| 19436 | packuswb m3, m5 |
| 19437 | movu [r0 + 183 * 16], m3 |
| 19438 | |
| 19439 | ; mode 3 [row 23] |
| 19440 | movu m6, [r5 + 16 * 16] |
| 19441 | movu m0, [r4 + 20] |
| 19442 | movd m1, [r4 + 21] |
| 19443 | palignr m1, m0, 1 |
| 19444 | punpcklbw m0, m1 |
| 19445 | pmaddubsw m1, m0, m6 |
| 19446 | pmulhrsw m1, m7 |
| 19447 | movu m2, [r4 + 28] |
| 19448 | movd m3, [r4 + 29] |
| 19449 | palignr m3, m2, 1 |
| 19450 | punpcklbw m2, m3 |
| 19451 | pmaddubsw m3, m2, m6 |
| 19452 | pmulhrsw m3, m7 |
| 19453 | packuswb m1, m3 |
| 19454 | movu [r0 + 110 * 16], m1 |
| 19455 | |
| 19456 | movu m1, [r4 + 36] |
| 19457 | movd m3, [r4 + 37] |
| 19458 | palignr m3, m1, 1 |
| 19459 | punpcklbw m1, m3 |
| 19460 | pmaddubsw m3, m1, m6 |
| 19461 | pmulhrsw m3, m7 |
| 19462 | movu m4, [r4 + 44] |
| 19463 | movd m5, [r4 + 45] |
| 19464 | palignr m5, m4, 1 |
| 19465 | punpcklbw m4, m5 |
| 19466 | pmaddubsw m5, m4, m6 |
| 19467 | pmulhrsw m5, m7 |
| 19468 | packuswb m3, m5 |
| 19469 | movu [r0 + 111 * 16], m3 |
| 19470 | |
| 19471 | ; mode 4 [row 28] |
| 19472 | movu m6, [r5 + 1 * 16] |
| 19473 | pmaddubsw m3, m0, m6 |
| 19474 | pmulhrsw m3, m7 |
| 19475 | pmaddubsw m5, m2, m6 |
| 19476 | pmulhrsw m5, m7 |
| 19477 | packuswb m3, m5 |
| 19478 | movu [r0 + 184 * 16], m3 |
| 19479 | pmaddubsw m3, m1, m6 |
| 19480 | pmulhrsw m3, m7 |
| 19481 | pmaddubsw m5, m4, m6 |
| 19482 | pmulhrsw m5, m7 |
| 19483 | packuswb m3, m5 |
| 19484 | movu [r0 + 185 * 16], m3 |
| 19485 | |
| 19486 | ; mode 4 [row 29] |
| 19487 | movu m6, [r5 + 22 * 16] |
| 19488 | pmaddubsw m3, m0, m6 |
| 19489 | pmulhrsw m3, m7 |
| 19490 | pmaddubsw m5, m2, m6 |
| 19491 | pmulhrsw m5, m7 |
| 19492 | packuswb m3, m5 |
| 19493 | movu [r0 + 186 * 16], m3 |
| 19494 | pmaddubsw m3, m1, m6 |
| 19495 | pmulhrsw m3, m7 |
| 19496 | pmaddubsw m5, m4, m6 |
| 19497 | pmulhrsw m5, m7 |
| 19498 | packuswb m3, m5 |
| 19499 | movu [r0 + 187 * 16], m3 |
| 19500 | |
| 19501 | ; mode 3 [row 24] |
| 19502 | movu m6, [r5 + 10 * 16] |
| 19503 | movu m0, [r4 + 21] |
| 19504 | movd m1, [r4 + 22] |
| 19505 | palignr m1, m0, 1 |
| 19506 | punpcklbw m0, m1 |
| 19507 | pmaddubsw m1, m0, m6 |
| 19508 | pmulhrsw m1, m7 |
| 19509 | movu m2, [r4 + 29] |
| 19510 | movd m3, [r4 + 30] |
| 19511 | palignr m3, m2, 1 |
| 19512 | punpcklbw m2, m3 |
| 19513 | pmaddubsw m3, m2, m6 |
| 19514 | pmulhrsw m3, m7 |
| 19515 | packuswb m1, m3 |
| 19516 | movu [r0 + 112 * 16], m1 |
| 19517 | |
| 19518 | movu m1, [r4 + 37] |
| 19519 | movd m3, [r4 + 38] |
| 19520 | palignr m3, m1, 1 |
| 19521 | punpcklbw m1, m3 |
| 19522 | pmaddubsw m3, m1, m6 |
| 19523 | pmulhrsw m3, m7 |
| 19524 | movu m4, [r4 + 45] |
| 19525 | movd m5, [r4 + 46] |
| 19526 | palignr m5, m4, 1 |
| 19527 | punpcklbw m4, m5 |
| 19528 | pmaddubsw m5, m4, m6 |
| 19529 | pmulhrsw m5, m7 |
| 19530 | packuswb m3, m5 |
| 19531 | movu [r0 + 113 * 16], m3 |
| 19532 | |
| 19533 | ; mode 4 [row 30] |
| 19534 | movu m6, [r5 + 11 * 16] |
| 19535 | pmaddubsw m3, m0, m6 |
| 19536 | pmulhrsw m3, m7 |
| 19537 | pmaddubsw m5, m2, m6 |
| 19538 | pmulhrsw m5, m7 |
| 19539 | packuswb m3, m5 |
| 19540 | movu [r0 + 188 * 16], m3 |
| 19541 | pmaddubsw m3, m1, m6 |
| 19542 | pmulhrsw m3, m7 |
| 19543 | pmaddubsw m5, m4, m6 |
| 19544 | pmulhrsw m5, m7 |
| 19545 | packuswb m3, m5 |
| 19546 | movu [r0 + 189 * 16], m3 |
| 19547 | |
| 19548 | ; mode 3 [row 25] |
| 19549 | movu m6, [r5 + 4 * 16] |
| 19550 | movu m0, [r4 + 22] |
| 19551 | movd m1, [r4 + 23] |
| 19552 | palignr m1, m0, 1 |
| 19553 | punpcklbw m0, m1 |
| 19554 | pmaddubsw m1, m0, m6 |
| 19555 | pmulhrsw m1, m7 |
| 19556 | movu m2, [r4 + 30] |
| 19557 | movd m3, [r4 + 31] |
| 19558 | palignr m3, m2, 1 |
| 19559 | punpcklbw m2, m3 |
| 19560 | pmaddubsw m3, m2, m6 |
| 19561 | pmulhrsw m3, m7 |
| 19562 | packuswb m1, m3 |
| 19563 | movu [r0 + 114 * 16], m1 |
| 19564 | |
| 19565 | movu m1, [r4 + 38] |
| 19566 | movd m3, [r4 + 39] |
| 19567 | palignr m3, m1, 1 |
| 19568 | punpcklbw m1, m3 |
| 19569 | pmaddubsw m3, m1, m6 |
| 19570 | pmulhrsw m3, m7 |
| 19571 | movu m4, [r4 + 46] |
| 19572 | movd m5, [r4 + 47] |
| 19573 | palignr m5, m4, 1 |
| 19574 | punpcklbw m4, m5 |
| 19575 | pmaddubsw m5, m4, m6 |
| 19576 | pmulhrsw m5, m7 |
| 19577 | packuswb m3, m5 |
| 19578 | movu [r0 + 115 * 16], m3 |
| 19579 | |
| 19580 | ; mode 3 [row 26] |
| 19581 | movu m6, [r5 + 30 * 16] |
| 19582 | pmaddubsw m3, m0, m6 |
| 19583 | pmulhrsw m3, m7 |
| 19584 | pmaddubsw m5, m2, m6 |
| 19585 | pmulhrsw m5, m7 |
| 19586 | packuswb m3, m5 |
| 19587 | movu [r0 + 116 * 16], m3 |
| 19588 | pmaddubsw m3, m1, m6 |
| 19589 | pmulhrsw m3, m7 |
| 19590 | pmaddubsw m5, m4, m6 |
| 19591 | pmulhrsw m5, m7 |
| 19592 | packuswb m3, m5 |
| 19593 | movu [r0 + 117 * 16], m3 |
| 19594 | |
| 19595 | ; mode 3 [row 27] |
| 19596 | movu m6, [r5 + 24 * 16] |
| 19597 | movu m0, [r4 + 23] |
| 19598 | movd m1, [r4 + 24] |
| 19599 | palignr m1, m0, 1 |
| 19600 | punpcklbw m0, m1 |
| 19601 | pmaddubsw m1, m0, m6 |
| 19602 | pmulhrsw m1, m7 |
| 19603 | movu m2, [r4 + 31] |
| 19604 | movd m3, [r4 + 32] |
| 19605 | palignr m3, m2, 1 |
| 19606 | punpcklbw m2, m3 |
| 19607 | pmaddubsw m3, m2, m6 |
| 19608 | pmulhrsw m3, m7 |
| 19609 | packuswb m1, m3 |
| 19610 | movu [r0 + 118 * 16], m1 |
| 19611 | |
| 19612 | movu m1, [r4 + 39] |
| 19613 | movd m3, [r4 + 40] |
| 19614 | palignr m3, m1, 1 |
| 19615 | punpcklbw m1, m3 |
| 19616 | pmaddubsw m3, m1, m6 |
| 19617 | pmulhrsw m3, m7 |
| 19618 | movu m4, [r4 + 47] |
| 19619 | movd m5, [r4 + 48] |
| 19620 | palignr m5, m4, 1 |
| 19621 | punpcklbw m4, m5 |
| 19622 | pmaddubsw m5, m4, m6 |
| 19623 | pmulhrsw m5, m7 |
| 19624 | packuswb m3, m5 |
| 19625 | movu [r0 + 119 * 16], m3 |
| 19626 | |
| 19627 | ; mode 3 [row 28] |
| 19628 | movu m6, [r5 + 18 * 16] |
| 19629 | movu m0, [r4 + 24] |
| 19630 | movd m1, [r4 + 25] |
| 19631 | palignr m1, m0, 1 |
| 19632 | punpcklbw m0, m1 |
| 19633 | pmaddubsw m1, m0, m6 |
| 19634 | pmulhrsw m1, m7 |
| 19635 | movu m2, [r4 + 32] |
| 19636 | movd m3, [r4 + 33] |
| 19637 | palignr m3, m2, 1 |
| 19638 | punpcklbw m2, m3 |
| 19639 | pmaddubsw m3, m2, m6 |
| 19640 | pmulhrsw m3, m7 |
| 19641 | packuswb m1, m3 |
| 19642 | movu [r0 + 120 * 16], m1 |
| 19643 | |
| 19644 | movu m1, [r4 + 40] |
| 19645 | movd m3, [r4 + 41] |
| 19646 | palignr m3, m1, 1 |
| 19647 | punpcklbw m1, m3 |
| 19648 | pmaddubsw m3, m1, m6 |
| 19649 | pmulhrsw m3, m7 |
| 19650 | movu m4, [r4 + 48] |
| 19651 | movd m5, [r4 + 49] |
| 19652 | palignr m5, m4, 1 |
| 19653 | punpcklbw m4, m5 |
| 19654 | pmaddubsw m5, m4, m6 |
| 19655 | pmulhrsw m5, m7 |
| 19656 | packuswb m3, m5 |
| 19657 | movu [r0 + 121 * 16], m3 |
| 19658 | |
| 19659 | ; mode 3 [row 29] |
| 19660 | movu m6, [r5 + 12 * 16] |
| 19661 | movu m0, [r4 + 25] |
| 19662 | movd m1, [r4 + 26] |
| 19663 | palignr m1, m0, 1 |
| 19664 | punpcklbw m0, m1 |
| 19665 | pmaddubsw m1, m0, m6 |
| 19666 | pmulhrsw m1, m7 |
| 19667 | movu m2, [r4 + 33] |
| 19668 | movd m3, [r4 + 34] |
| 19669 | palignr m3, m2, 1 |
| 19670 | punpcklbw m2, m3 |
| 19671 | pmaddubsw m3, m2, m6 |
| 19672 | pmulhrsw m3, m7 |
| 19673 | packuswb m1, m3 |
| 19674 | movu [r0 + 122 * 16], m1 |
| 19675 | |
| 19676 | movu m1, [r4 + 41] |
| 19677 | movd m3, [r4 + 42] |
| 19678 | palignr m3, m1, 1 |
| 19679 | punpcklbw m1, m3 |
| 19680 | pmaddubsw m3, m1, m6 |
| 19681 | pmulhrsw m3, m7 |
| 19682 | movu m4, [r4 + 49] |
| 19683 | movd m5, [r4 + 50] |
| 19684 | palignr m5, m4, 1 |
| 19685 | punpcklbw m4, m5 |
| 19686 | pmaddubsw m5, m4, m6 |
| 19687 | pmulhrsw m5, m7 |
| 19688 | packuswb m3, m5 |
| 19689 | movu [r0 + 123 * 16], m3 |
| 19690 | |
| 19691 | ; mode 3 [row 30] |
| 19692 | movu m6, [r5 + 6 * 16] |
| 19693 | movu m0, [r4 + 26] |
| 19694 | movd m1, [r4 + 27] |
| 19695 | palignr m1, m0, 1 |
| 19696 | punpcklbw m0, m1 |
| 19697 | pmaddubsw m1, m0, m6 |
| 19698 | pmulhrsw m1, m7 |
| 19699 | movu m2, [r4 + 34] |
| 19700 | movd m3, [r4 + 35] |
| 19701 | palignr m3, m2, 1 |
| 19702 | punpcklbw m2, m3 |
| 19703 | pmaddubsw m3, m2, m6 |
| 19704 | pmulhrsw m3, m7 |
| 19705 | packuswb m1, m3 |
| 19706 | movu [r0 + 124 * 16], m1 |
| 19707 | |
| 19708 | movu m1, [r4 + 42] |
| 19709 | movd m3, [r4 + 43] |
| 19710 | palignr m3, m1, 1 |
| 19711 | punpcklbw m1, m3 |
| 19712 | pmaddubsw m3, m1, m6 |
| 19713 | pmulhrsw m3, m7 |
| 19714 | movu m4, [r4 + 50] |
| 19715 | movd m5, [r4 + 51] |
| 19716 | palignr m5, m4, 1 |
| 19717 | punpcklbw m4, m5 |
| 19718 | pmaddubsw m5, m4, m6 |
| 19719 | pmulhrsw m5, m7 |
| 19720 | packuswb m3, m5 |
| 19721 | movu [r0 + 125 * 16], m3 |
| 19722 | |
| 19723 | ; mode 10 |
| 19724 | movu m1, [r2 + 1] |
| 19725 | movu m2, [r2 + 17] |
| 19726 | movu [r0 + 512 * 16], m1 |
| 19727 | movu [r0 + 513 * 16], m2 |
| 19728 | movu [r0 + 514 * 16], m1 |
| 19729 | movu [r0 + 515 * 16], m2 |
| 19730 | movu [r0 + 516 * 16], m1 |
| 19731 | movu [r0 + 517 * 16], m2 |
| 19732 | movu [r0 + 518 * 16], m1 |
| 19733 | movu [r0 + 519 * 16], m2 |
| 19734 | movu [r0 + 520 * 16], m1 |
| 19735 | movu [r0 + 521 * 16], m2 |
| 19736 | movu [r0 + 522 * 16], m1 |
| 19737 | movu [r0 + 523 * 16], m2 |
| 19738 | movu [r0 + 524 * 16], m1 |
| 19739 | movu [r0 + 525 * 16], m2 |
| 19740 | movu [r0 + 526 * 16], m1 |
| 19741 | movu [r0 + 527 * 16], m2 |
| 19742 | |
| 19743 | movu [r0 + 528 * 16], m1 |
| 19744 | movu [r0 + 529 * 16], m2 |
| 19745 | movu [r0 + 530 * 16], m1 |
| 19746 | movu [r0 + 531 * 16], m2 |
| 19747 | movu [r0 + 532 * 16], m1 |
| 19748 | movu [r0 + 533 * 16], m2 |
| 19749 | movu [r0 + 534 * 16], m1 |
| 19750 | movu [r0 + 535 * 16], m2 |
| 19751 | movu [r0 + 536 * 16], m1 |
| 19752 | movu [r0 + 537 * 16], m2 |
| 19753 | movu [r0 + 538 * 16], m1 |
| 19754 | movu [r0 + 539 * 16], m2 |
| 19755 | movu [r0 + 540 * 16], m1 |
| 19756 | movu [r0 + 541 * 16], m2 |
| 19757 | movu [r0 + 542 * 16], m1 |
| 19758 | movu [r0 + 543 * 16], m2 |
| 19759 | |
| 19760 | movu [r0 + 544 * 16], m1 |
| 19761 | movu [r0 + 545 * 16], m2 |
| 19762 | movu [r0 + 546 * 16], m1 |
| 19763 | movu [r0 + 547 * 16], m2 |
| 19764 | movu [r0 + 548 * 16], m1 |
| 19765 | movu [r0 + 549 * 16], m2 |
| 19766 | movu [r0 + 550 * 16], m1 |
| 19767 | movu [r0 + 551 * 16], m2 |
| 19768 | movu [r0 + 552 * 16], m1 |
| 19769 | movu [r0 + 553 * 16], m2 |
| 19770 | movu [r0 + 554 * 16], m1 |
| 19771 | movu [r0 + 555 * 16], m2 |
| 19772 | movu [r0 + 556 * 16], m1 |
| 19773 | movu [r0 + 557 * 16], m2 |
| 19774 | movu [r0 + 558 * 16], m1 |
| 19775 | movu [r0 + 559 * 16], m2 |
| 19776 | |
| 19777 | movu [r0 + 560 * 16], m1 |
| 19778 | movu [r0 + 561 * 16], m2 |
| 19779 | movu [r0 + 562 * 16], m1 |
| 19780 | movu [r0 + 563 * 16], m2 |
| 19781 | movu [r0 + 564 * 16], m1 |
| 19782 | movu [r0 + 565 * 16], m2 |
| 19783 | movu [r0 + 566 * 16], m1 |
| 19784 | movu [r0 + 567 * 16], m2 |
| 19785 | movu [r0 + 568 * 16], m1 |
| 19786 | movu [r0 + 569 * 16], m2 |
| 19787 | movu [r0 + 570 * 16], m1 |
| 19788 | movu [r0 + 571 * 16], m2 |
| 19789 | movu [r0 + 572 * 16], m1 |
| 19790 | movu [r0 + 573 * 16], m2 |
| 19791 | movu [r0 + 574 * 16], m1 |
| 19792 | movu [r0 + 575 * 16], m2 |
| 19793 | |
| 19794 | ; mode 11 [row 0]: load 16 reference bytes from [r4]; m0 is reused by the rows below |
| 19795 | movu m0, [r4] |
| 19796 | |
| 19797 | ; mode 11 [row 15 - first half]: the raw bytes in m0 are stored as-is (no pmaddubsw/pmulhrsw step) |
| 19798 | movu [r0 + 606 * 16], m0 |
| 19799 | |
| 19800 | |
| 19801 | |
| 19802 | ; mode 12 [row 31] |
| 19803 | pslldq m6, m0, 4 |
| 19804 | pinsrb m6, [r3 + 26], 0 |
| 19805 | pinsrb m6, [r3 + 19], 1 |
| 19806 | pinsrb m6, [r3 + 13], 2 |
| 19807 | pinsrb m6, [r3 + 6], 3 |
| 19808 | movu [r0 + 702 * 16], m6 |
| 19809 | movu m6, [r4 + 12] |
| 19810 | movu [r0 + 703 * 16], m6 |
| 19811 | |
| 19812 | ; mode 11 [row 31] |
| 19813 | pslldq m6, m0, 1 |
| 19814 | pinsrb m6, [r3 + 16], 0 |
| 19815 | movu [r0 + 638 * 16], m6 |
| 19816 | movu m6, [r4 + 15] |
| 19817 | movu [r0 + 639 * 16], m6 |
| 19818 | |
| 19819 | movd m1, [r4 + 1] |
| 19820 | palignr m1, m0, 1 |
| 19821 | punpcklbw m0, m1 |
| 19822 | pmaddubsw m1, m0, [r5 + 30 * 16] |
| 19823 | pmulhrsw m1, m7 |
| 19824 | movu m2, [r4 + 8] |
| 19825 | movd m3, [r4 + 9] |
| 19826 | palignr m3, m2, 1 |
| 19827 | punpcklbw m2, m3 |
| 19828 | pmaddubsw m3, m2, [r5 + 30 * 16] |
| 19829 | pmulhrsw m3, m7 |
| 19830 | packuswb m1, m3 |
| 19831 | movu [r0 + 576 * 16], m1 |
| 19832 | |
| 19833 | movu m1, [r4 + 16] |
| 19834 | |
| 19835 | ; mode 11 [row 15 - second half] |
| 19836 | movu [r0 + 607 * 16], m1 |
| 19837 | |
| 19838 | movd m3, [r4 + 17] |
| 19839 | palignr m3, m1, 1 |
| 19840 | punpcklbw m1, m3 |
| 19841 | pmaddubsw m3, m1, [r5 + 30 * 16] |
| 19842 | pmulhrsw m3, m7 |
| 19843 | movu m4, [r4 + 24] |
| 19844 | movd m5, [r4 + 25] |
| 19845 | palignr m5, m4, 1 |
| 19846 | punpcklbw m4, m5 |
| 19847 | pmaddubsw m5, m4, [r5 + 30 * 16] |
| 19848 | pmulhrsw m5, m7 |
| 19849 | packuswb m3, m5 |
| 19850 | movu [r0 + 577 * 16], m3 |
| 19851 | |
| 19852 | ; mode 11 [row 1] |
| 19853 | pmaddubsw m3, m0, [r5 + 28 * 16] |
| 19854 | pmulhrsw m3, m7 |
| 19855 | pmaddubsw m5, m2, [r5 + 28 * 16] |
| 19856 | pmulhrsw m5, m7 |
| 19857 | packuswb m3, m5 |
| 19858 | movu [r0 + 578 * 16], m3 |
| 19859 | pmaddubsw m3, m1, [r5 + 28 * 16] |
| 19860 | pmulhrsw m3, m7 |
| 19861 | pmaddubsw m5, m4, [r5 + 28 * 16] |
| 19862 | pmulhrsw m5, m7 |
| 19863 | packuswb m3, m5 |
| 19864 | movu [r0 + 579 * 16], m3 |
| 19865 | |
| 19866 | ; mode 11 [row 2] |
| 19867 | pmaddubsw m3, m0, [r5 + 26 * 16] |
| 19868 | pmulhrsw m3, m7 |
| 19869 | pmaddubsw m5, m2, [r5 + 26 * 16] |
| 19870 | pmulhrsw m5, m7 |
| 19871 | packuswb m3, m5 |
| 19872 | movu [r0 + 580 * 16], m3 |
| 19873 | pmaddubsw m3, m1, [r5 + 26 * 16] |
| 19874 | pmulhrsw m3, m7 |
| 19875 | pmaddubsw m5, m4, [r5 + 26 * 16] |
| 19876 | pmulhrsw m5, m7 |
| 19877 | packuswb m3, m5 |
| 19878 | movu [r0 + 581 * 16], m3 |
| 19879 | |
| 19880 | ; mode 11 [row 3] |
| 19881 | pmaddubsw m3, m0, [r5 + 24 * 16] |
| 19882 | pmulhrsw m3, m7 |
| 19883 | pmaddubsw m5, m2, [r5 + 24 * 16] |
| 19884 | pmulhrsw m5, m7 |
| 19885 | packuswb m3, m5 |
| 19886 | movu [r0 + 582 * 16], m3 |
| 19887 | pmaddubsw m3, m1, [r5 + 24 * 16] |
| 19888 | pmulhrsw m3, m7 |
| 19889 | pmaddubsw m5, m4, [r5 + 24 * 16] |
| 19890 | pmulhrsw m5, m7 |
| 19891 | packuswb m3, m5 |
| 19892 | movu [r0 + 583 * 16], m3 |
| 19893 | |
| 19894 | ; mode 11 [row 4] |
| 19895 | pmaddubsw m3, m0, [r5 + 22 * 16] |
| 19896 | pmulhrsw m3, m7 |
| 19897 | pmaddubsw m5, m2, [r5 + 22 * 16] |
| 19898 | pmulhrsw m5, m7 |
| 19899 | packuswb m3, m5 |
| 19900 | movu [r0 + 584 * 16], m3 |
| 19901 | |
| 19902 | ; mode 12 [row 1 - first half] |
| 19903 | movu [r0 + 642 * 16], m3 |
| 19904 | |
| 19905 | pmaddubsw m3, m1, [r5 + 22 * 16] |
| 19906 | pmulhrsw m3, m7 |
| 19907 | pmaddubsw m5, m4, [r5 + 22 * 16] |
| 19908 | pmulhrsw m5, m7 |
| 19909 | packuswb m3, m5 |
| 19910 | movu [r0 + 585 * 16], m3 |
| 19911 | |
| 19912 | ; mode 12 [row 1 - second half] |
| 19913 | movu [r0 + 643 * 16], m3 |
| 19914 | |
| 19915 | ; mode 11 [row 5] |
| 19916 | pmaddubsw m3, m0, [r5 + 20 * 16] |
| 19917 | pmulhrsw m3, m7 |
| 19918 | pmaddubsw m5, m2, [r5 + 20 * 16] |
| 19919 | pmulhrsw m5, m7 |
| 19920 | packuswb m3, m5 |
| 19921 | movu [r0 + 586 * 16], m3 |
| 19922 | pmaddubsw m3, m1, [r5 + 20 * 16] |
| 19923 | pmulhrsw m3, m7 |
| 19924 | pmaddubsw m5, m4, [r5 + 20 * 16] |
| 19925 | pmulhrsw m5, m7 |
| 19926 | packuswb m3, m5 |
| 19927 | movu [r0 + 587 * 16], m3 |
| 19928 | |
| 19929 | ; mode 11 [row 6] |
| 19930 | pmaddubsw m3, m0, [r5 + 18 * 16] |
| 19931 | pmulhrsw m3, m7 |
| 19932 | pmaddubsw m5, m2, [r5 + 18 * 16] |
| 19933 | pmulhrsw m5, m7 |
| 19934 | packuswb m3, m5 |
| 19935 | movu [r0 + 588 * 16], m3 |
| 19936 | pmaddubsw m3, m1, [r5 + 18 * 16] |
| 19937 | pmulhrsw m3, m7 |
| 19938 | pmaddubsw m5, m4, [r5 + 18 * 16] |
| 19939 | pmulhrsw m5, m7 |
| 19940 | packuswb m3, m5 |
| 19941 | movu [r0 + 589 * 16], m3 |
| 19942 | |
| 19943 | ; mode 11 [row 7] |
| 19944 | pmaddubsw m3, m0, [r5 + 16 * 16] |
| 19945 | pmulhrsw m3, m7 |
| 19946 | pmaddubsw m5, m2, [r5 + 16 * 16] |
| 19947 | pmulhrsw m5, m7 |
| 19948 | packuswb m3, m5 |
| 19949 | movu [r0 + 590 * 16], m3 |
| 19950 | pmaddubsw m3, m1, [r5 + 16 * 16] |
| 19951 | pmulhrsw m3, m7 |
| 19952 | pmaddubsw m5, m4, [r5 + 16 * 16] |
| 19953 | pmulhrsw m5, m7 |
| 19954 | packuswb m3, m5 |
| 19955 | movu [r0 + 591 * 16], m3 |
| 19956 | |
| 19957 | ; mode 11 [row 8] |
| 19958 | pmaddubsw m3, m0, [r5 + 14 * 16] |
| 19959 | pmulhrsw m3, m7 |
| 19960 | pmaddubsw m5, m2, [r5 + 14 * 16] |
| 19961 | pmulhrsw m5, m7 |
| 19962 | packuswb m3, m5 |
| 19963 | movu [r0 + 592 * 16], m3 |
| 19964 | |
| 19965 | ; mode 13 [row 1 - first half] |
| 19966 | movu [r0 + 706 * 16], m3 |
| 19967 | |
| 19968 | pmaddubsw m3, m1, [r5 + 14 * 16] |
| 19969 | pmulhrsw m3, m7 |
| 19970 | pmaddubsw m5, m4, [r5 + 14 * 16] |
| 19971 | pmulhrsw m5, m7 |
| 19972 | packuswb m3, m5 |
| 19973 | movu [r0 + 593 * 16], m3 |
| 19974 | |
| 19975 | ; mode 13 [row 1 - second half] |
| 19976 | movu [r0 + 707 * 16], m3 |
| 19977 | |
| 19978 | ; mode 11 [row 9] |
| 19979 | pmaddubsw m3, m0, [r5 + 12 * 16] |
| 19980 | pmulhrsw m3, m7 |
| 19981 | pmaddubsw m5, m2, [r5 + 12 * 16] |
| 19982 | pmulhrsw m5, m7 |
| 19983 | packuswb m3, m5 |
| 19984 | movu [r0 + 594 * 16], m3 |
| 19985 | |
| 19986 | ; mode 12 [row 3 - first half] |
| 19987 | movu [r0 + 646 * 16], m3 |
| 19988 | |
| 19989 | pmaddubsw m3, m1, [r5 + 12 * 16] |
| 19990 | pmulhrsw m3, m7 |
| 19991 | pmaddubsw m5, m4, [r5 + 12 * 16] |
| 19992 | pmulhrsw m5, m7 |
| 19993 | packuswb m3, m5 |
| 19994 | movu [r0 + 595 * 16], m3 |
| 19995 | |
| 19996 | ; mode 12 [row 3 - second half] |
| 19997 | movu [r0 + 647 * 16], m3 |
| 19998 | |
| 19999 | ; mode 11 [row 10] |
| 20000 | pmaddubsw m3, m0, [r5 + 10 * 16] |
| 20001 | pmulhrsw m3, m7 |
| 20002 | pmaddubsw m5, m2, [r5 + 10 * 16] |
| 20003 | pmulhrsw m5, m7 |
| 20004 | packuswb m3, m5 |
| 20005 | movu [r0 + 596 * 16], m3 |
| 20006 | pmaddubsw m3, m1, [r5 + 10 * 16] |
| 20007 | pmulhrsw m3, m7 |
| 20008 | pmaddubsw m5, m4, [r5 + 10 * 16] |
| 20009 | pmulhrsw m5, m7 |
| 20010 | packuswb m3, m5 |
| 20011 | movu [r0 + 597 * 16], m3 |
| 20012 | |
| 20013 | ; mode 11 [row 11] |
| 20014 | pmaddubsw m3, m0, [r5 + 8 * 16] |
| 20015 | pmulhrsw m3, m7 |
| 20016 | pmaddubsw m5, m2, [r5 + 8 * 16] |
| 20017 | pmulhrsw m5, m7 |
| 20018 | packuswb m3, m5 |
| 20019 | movu [r0 + 598 * 16], m3 |
| 20020 | pmaddubsw m3, m1, [r5 + 8 * 16] |
| 20021 | pmulhrsw m3, m7 |
| 20022 | pmaddubsw m5, m4, [r5 + 8 * 16] |
| 20023 | pmulhrsw m5, m7 |
| 20024 | packuswb m3, m5 |
| 20025 | movu [r0 + 599 * 16], m3 |
| 20026 | |
| 20027 | ; mode 11 [row 12] |
| 20028 | pmaddubsw m3, m0, [r5 + 6 * 16] |
| 20029 | pmulhrsw m3, m7 |
| 20030 | pmaddubsw m5, m2, [r5 + 6 * 16] |
| 20031 | pmulhrsw m5, m7 |
| 20032 | packuswb m3, m5 |
| 20033 | movu [r0 + 600 * 16], m3 |
| 20034 | |
| 20035 | ; mode 14 [row 1 - first half] |
| 20036 | movu [r0 + 770 * 16], m3 |
| 20037 | |
| 20038 | pmaddubsw m3, m1, [r5 + 6 * 16] |
| 20039 | pmulhrsw m3, m7 |
| 20040 | pmaddubsw m5, m4, [r5 + 6 * 16] |
| 20041 | pmulhrsw m5, m7 |
| 20042 | packuswb m3, m5 |
| 20043 | movu [r0 + 601 * 16], m3 |
| 20044 | |
| 20045 | ; mode 14 [row 1 - second half] |
| 20046 | movu [r0 + 771 * 16], m3 |
| 20047 | |
| 20048 | ; mode 11 [row 13] |
| 20049 | pmaddubsw m3, m0, [r5 + 4 * 16] |
| 20050 | pmulhrsw m3, m7 |
| 20051 | pmaddubsw m5, m2, [r5 + 4 * 16] |
| 20052 | pmulhrsw m5, m7 |
| 20053 | packuswb m3, m5 |
| 20054 | movu [r0 + 602 * 16], m3 |
| 20055 | pmaddubsw m3, m1, [r5 + 4 * 16] |
| 20056 | pmulhrsw m3, m7 |
| 20057 | pmaddubsw m5, m4, [r5 + 4 * 16] |
| 20058 | pmulhrsw m5, m7 |
| 20059 | packuswb m3, m5 |
| 20060 | movu [r0 + 603 * 16], m3 |
| 20061 | |
| 20062 | ; mode 11 [row 14] |
| 20063 | pmaddubsw m3, m0, [r5 + 2 * 16] |
| 20064 | pmulhrsw m3, m7 |
| 20065 | pmaddubsw m5, m2, [r5 + 2 * 16] |
| 20066 | pmulhrsw m5, m7 |
| 20067 | packuswb m3, m5 |
| 20068 | movu [r0 + 604 * 16], m3 |
| 20069 | |
| 20070 | ; mode 12 [row 5 - first half] (slot 650 = 640 + 2 * 5; reuses the vector just stored for mode 11 row 14) |
| 20071 | movu [r0 + 650 * 16], m3 |
| 20072 | |
| 20073 | pmaddubsw m3, m1, [r5 + 2 * 16] |
| 20074 | pmulhrsw m3, m7 |
| 20075 | pmaddubsw m5, m4, [r5 + 2 * 16] |
| 20076 | pmulhrsw m5, m7 |
| 20077 | packuswb m3, m5 |
| 20078 | movu [r0 + 605 * 16], m3 |
| 20079 | |
| 20080 | ; mode 12 [row 5 - second half] (slot 651; reuses the second-half vector of mode 11 row 14) |
| 20081 | movu [r0 + 651 * 16], m3 |
| 20082 | |
| 20083 | ; mode 12 [row 0] |
| 20084 | pmaddubsw m3, m0, [r5 + 27 * 16] |
| 20085 | pmulhrsw m3, m7 |
| 20086 | pmaddubsw m5, m2, [r5 + 27 * 16] |
| 20087 | pmulhrsw m5, m7 |
| 20088 | packuswb m3, m5 |
| 20089 | movu [r0 + 640 * 16], m3 |
| 20090 | pmaddubsw m3, m1, [r5 + 27 * 16] |
| 20091 | pmulhrsw m3, m7 |
| 20092 | pmaddubsw m5, m4, [r5 + 27 * 16] |
| 20093 | pmulhrsw m5, m7 |
| 20094 | packuswb m3, m5 |
| 20095 | movu [r0 + 641 * 16], m3 |
| 20096 | |
| 20097 | ; mode 12 [row 2] |
| 20098 | pmaddubsw m3, m0, [r5 + 17 * 16] |
| 20099 | pmulhrsw m3, m7 |
| 20100 | pmaddubsw m5, m2, [r5 + 17 * 16] |
| 20101 | pmulhrsw m5, m7 |
| 20102 | packuswb m3, m5 |
| 20103 | movu [r0 + 644 * 16], m3 |
| 20104 | pmaddubsw m3, m1, [r5 + 17 * 16] |
| 20105 | pmulhrsw m3, m7 |
| 20106 | pmaddubsw m5, m4, [r5 + 17 * 16] |
| 20107 | pmulhrsw m5, m7 |
| 20108 | packuswb m3, m5 |
| 20109 | movu [r0 + 645 * 16], m3 |
| 20110 | |
| 20111 | ; mode 12 [row 4] |
| 20112 | pmaddubsw m3, m0, [r5 + 7 * 16] |
| 20113 | pmulhrsw m3, m7 |
| 20114 | pmaddubsw m5, m2, [r5 + 7 * 16] |
| 20115 | pmulhrsw m5, m7 |
| 20116 | packuswb m3, m5 |
| 20117 | movu [r0 + 648 * 16], m3 |
| 20118 | pmaddubsw m3, m1, [r5 + 7 * 16] |
| 20119 | pmulhrsw m3, m7 |
| 20120 | pmaddubsw m5, m4, [r5 + 7 * 16] |
| 20121 | pmulhrsw m5, m7 |
| 20122 | packuswb m3, m5 |
| 20123 | movu [r0 + 649 * 16], m3 |
| 20124 | |
| 20125 | ; mode 13 [row 0] |
| 20126 | pmaddubsw m3, m0, [r5 + 23 * 16] |
| 20127 | pmulhrsw m3, m7 |
| 20128 | pmaddubsw m5, m2, [r5 + 23 * 16] |
| 20129 | pmulhrsw m5, m7 |
| 20130 | packuswb m3, m5 |
| 20131 | movu [r0 + 704 * 16], m3 |
| 20132 | pmaddubsw m3, m1, [r5 + 23 * 16] |
| 20133 | pmulhrsw m3, m7 |
| 20134 | pmaddubsw m5, m4, [r5 + 23 * 16] |
| 20135 | pmulhrsw m5, m7 |
| 20136 | packuswb m3, m5 |
| 20137 | movu [r0 + 705 * 16], m3 |
| 20138 | |
| 20139 | ; mode 13 [row 2] |
| 20140 | pmaddubsw m3, m0, [r5 + 5 * 16] |
| 20141 | pmulhrsw m3, m7 |
| 20142 | pmaddubsw m5, m2, [r5 + 5 * 16] |
| 20143 | pmulhrsw m5, m7 |
| 20144 | packuswb m3, m5 |
| 20145 | movu [r0 + 708 * 16], m3 |
| 20146 | pmaddubsw m3, m1, [r5 + 5 * 16] |
| 20147 | pmulhrsw m3, m7 |
| 20148 | pmaddubsw m5, m4, [r5 + 5 * 16] |
| 20149 | pmulhrsw m5, m7 |
| 20150 | packuswb m3, m5 |
| 20151 | movu [r0 + 709 * 16], m3 |
| 20152 | |
| 20153 | ; mode 14 [row 0] |
| 20154 | pmaddubsw m3, m0, [r5 + 19 * 16] |
| 20155 | pmulhrsw m3, m7 |
| 20156 | pmaddubsw m5, m2, [r5 + 19 * 16] |
| 20157 | pmulhrsw m5, m7 |
| 20158 | packuswb m3, m5 |
| 20159 | movu [r0 + 768 * 16], m3 |
| 20160 | pmaddubsw m3, m1, [r5 + 19 * 16] |
| 20161 | pmulhrsw m3, m7 |
| 20162 | pmaddubsw m5, m4, [r5 + 19 * 16] |
| 20163 | pmulhrsw m5, m7 |
| 20164 | packuswb m3, m5 |
| 20165 | movu [r0 + 769 * 16], m3 |
| 20166 | |
| 20167 | ; mode 15 [row 0] |
| 20168 | pmaddubsw m3, m0, [r5 + 15 * 16] |
| 20169 | pmulhrsw m3, m7 |
| 20170 | pmaddubsw m5, m2, [r5 + 15 * 16] |
| 20171 | pmulhrsw m5, m7 |
| 20172 | packuswb m3, m5 |
| 20173 | movu [r0 + 832 * 16], m3 |
| 20174 | pmaddubsw m3, m1, [r5 + 15 * 16] |
| 20175 | pmulhrsw m3, m7 |
| 20176 | pmaddubsw m5, m4, [r5 + 15 * 16] |
| 20177 | pmulhrsw m5, m7 |
| 20178 | packuswb m3, m5 |
| 20179 | movu [r0 + 833 * 16], m3 |
| 20180 | |
| 20181 | ; mode 11 [row 16] |
| 20182 | pslldq m0, 2 |
| 20183 | pinsrb m0, [r4 + 0], 1 |
| 20184 | pinsrb m0, [r3 + 16], 0 |
| 20185 | pmaddubsw m3, m0, [r5 + 30 * 16] |
| 20186 | pmulhrsw m3, m7 |
| 20187 | pslldq m2, 2 |
| 20188 | pinsrb m2, [r4 + 8], 1 |
| 20189 | pinsrb m2, [r4 + 7], 0 |
| 20190 | pmaddubsw m5, m2, [r5 + 30 * 16] |
| 20191 | pmulhrsw m5, m7 |
| 20192 | packuswb m3, m5 |
| 20193 | movu [r0 + 608 * 16], m3 |
| 20194 | pslldq m1, 2 |
| 20195 | pinsrb m1, [r4 + 16], 1 |
| 20196 | pinsrb m1, [r4 + 15], 0 |
| 20197 | pmaddubsw m3, m1, [r5 + 30 * 16] |
| 20198 | pmulhrsw m3, m7 |
| 20199 | pslldq m4, 2 |
| 20200 | pinsrb m4, [r4 + 24], 1 |
| 20201 | pinsrb m4, [r4 + 23], 0 |
| 20202 | pmaddubsw m5, m4, [r5 + 30 * 16] |
| 20203 | pmulhrsw m5, m7 |
| 20204 | packuswb m3, m5 |
| 20205 | movu [r0 + 609 * 16], m3 |
| 20206 | |
| 20207 | ; mode 11 [row 17] |
| 20208 | pmaddubsw m3, m0, [r5 + 28 * 16] |
| 20209 | pmulhrsw m3, m7 |
| 20210 | pmaddubsw m5, m2, [r5 + 28 * 16] |
| 20211 | pmulhrsw m5, m7 |
| 20212 | packuswb m3, m5 |
| 20213 | movu [r0 + 610 * 16], m3 |
| 20214 | pmaddubsw m3, m1, [r5 + 28 * 16] |
| 20215 | pmulhrsw m3, m7 |
| 20216 | pmaddubsw m5, m4, [r5 + 28 * 16] |
| 20217 | pmulhrsw m5, m7 |
| 20218 | packuswb m3, m5 |
| 20219 | movu [r0 + 611 * 16], m3 |
| 20220 | |
| 20221 | ; mode 11 [row 18] |
| 20222 | pmaddubsw m3, m0, [r5 + 26 * 16] |
| 20223 | pmulhrsw m3, m7 |
| 20224 | pmaddubsw m5, m2, [r5 + 26 * 16] |
| 20225 | pmulhrsw m5, m7 |
| 20226 | packuswb m3, m5 |
| 20227 | movu [r0 + 612 * 16], m3 |
| 20228 | pmaddubsw m3, m1, [r5 + 26 * 16] |
| 20229 | pmulhrsw m3, m7 |
| 20230 | pmaddubsw m5, m4, [r5 + 26 * 16] |
| 20231 | pmulhrsw m5, m7 |
| 20232 | packuswb m3, m5 |
| 20233 | movu [r0 + 613 * 16], m3 |
| 20234 | |
| 20235 | ; mode 11 [row 19] |
| 20236 | pmaddubsw m3, m0, [r5 + 24 * 16] |
| 20237 | pmulhrsw m3, m7 |
| 20238 | pmaddubsw m5, m2, [r5 + 24 * 16] |
| 20239 | pmulhrsw m5, m7 |
| 20240 | packuswb m3, m5 |
| 20241 | movu [r0 + 614 * 16], m3 |
| 20242 | pmaddubsw m3, m1, [r5 + 24 * 16] |
| 20243 | pmulhrsw m3, m7 |
| 20244 | pmaddubsw m5, m4, [r5 + 24 * 16] |
| 20245 | pmulhrsw m5, m7 |
| 20246 | packuswb m3, m5 |
| 20247 | movu [r0 + 615 * 16], m3 |
| 20248 | |
| 20249 | ; mode 11 [row 20] |
| 20250 | pmaddubsw m3, m0, [r5 + 22 * 16] |
| 20251 | pmulhrsw m3, m7 |
| 20252 | pmaddubsw m5, m2, [r5 + 22 * 16] |
| 20253 | pmulhrsw m5, m7 |
| 20254 | packuswb m3, m5 |
| 20255 | movu [r0 + 616 * 16], m3 |
| 20256 | pmaddubsw m3, m1, [r5 + 22 * 16] |
| 20257 | pmulhrsw m3, m7 |
| 20258 | pmaddubsw m5, m4, [r5 + 22 * 16] |
| 20259 | pmulhrsw m5, m7 |
| 20260 | packuswb m3, m5 |
| 20261 | movu [r0 + 617 * 16], m3 |
| 20262 | |
| 20263 | ; mode 11 [row 21] |
| 20264 | pmaddubsw m3, m0, [r5 + 20 * 16] |
| 20265 | pmulhrsw m3, m7 |
| 20266 | pmaddubsw m5, m2, [r5 + 20 * 16] |
| 20267 | pmulhrsw m5, m7 |
| 20268 | packuswb m3, m5 |
| 20269 | movu [r0 + 618 * 16], m3 |
| 20270 | pmaddubsw m3, m1, [r5 + 20 * 16] |
| 20271 | pmulhrsw m3, m7 |
| 20272 | pmaddubsw m5, m4, [r5 + 20 * 16] |
| 20273 | pmulhrsw m5, m7 |
| 20274 | packuswb m3, m5 |
| 20275 | movu [r0 + 619 * 16], m3 |
| 20276 | |
| 20277 | ; mode 11 [row 22] |
| 20278 | pmaddubsw m3, m0, [r5 + 18 * 16] |
| 20279 | pmulhrsw m3, m7 |
| 20280 | pmaddubsw m5, m2, [r5 + 18 * 16] |
| 20281 | pmulhrsw m5, m7 |
| 20282 | packuswb m3, m5 |
| 20283 | movu [r0 + 620 * 16], m3 |
| 20284 | pmaddubsw m3, m1, [r5 + 18 * 16] |
| 20285 | pmulhrsw m3, m7 |
| 20286 | pmaddubsw m5, m4, [r5 + 18 * 16] |
| 20287 | pmulhrsw m5, m7 |
| 20288 | packuswb m3, m5 |
| 20289 | movu [r0 + 621 * 16], m3 |
| 20290 | |
| 20291 | ; mode 11 [row 23] |
| 20292 | pmaddubsw m3, m0, [r5 + 16 * 16] |
| 20293 | pmulhrsw m3, m7 |
| 20294 | pmaddubsw m5, m2, [r5 + 16 * 16] |
| 20295 | pmulhrsw m5, m7 |
| 20296 | packuswb m3, m5 |
| 20297 | movu [r0 + 622 * 16], m3 |
| 20298 | pmaddubsw m3, m1, [r5 + 16 * 16] |
| 20299 | pmulhrsw m3, m7 |
| 20300 | pmaddubsw m5, m4, [r5 + 16 * 16] |
| 20301 | pmulhrsw m5, m7 |
| 20302 | packuswb m3, m5 |
| 20303 | movu [r0 + 623 * 16], m3 |
| 20304 | |
| 20305 | ; mode 11 [row 24] |
| 20306 | pmaddubsw m3, m0, [r5 + 14 * 16] |
| 20307 | pmulhrsw m3, m7 |
| 20308 | pmaddubsw m5, m2, [r5 + 14 * 16] |
| 20309 | pmulhrsw m5, m7 |
| 20310 | packuswb m3, m5 |
| 20311 | movu [r0 + 624 * 16], m3 |
| 20312 | pmaddubsw m3, m1, [r5 + 14 * 16] |
| 20313 | pmulhrsw m3, m7 |
| 20314 | pmaddubsw m5, m4, [r5 + 14 * 16] |
| 20315 | pmulhrsw m5, m7 |
| 20316 | packuswb m3, m5 |
| 20317 | movu [r0 + 625 * 16], m3 |
| 20318 | |
| 20319 | ; mode 11 [row 25] |
| 20320 | pmaddubsw m3, m0, [r5 + 12 * 16] |
| 20321 | pmulhrsw m3, m7 |
| 20322 | pmaddubsw m5, m2, [r5 + 12 * 16] |
| 20323 | pmulhrsw m5, m7 |
| 20324 | packuswb m3, m5 |
| 20325 | movu [r0 + 626 * 16], m3 |
| 20326 | pmaddubsw m3, m1, [r5 + 12 * 16] |
| 20327 | pmulhrsw m3, m7 |
| 20328 | pmaddubsw m5, m4, [r5 + 12 * 16] |
| 20329 | pmulhrsw m5, m7 |
| 20330 | packuswb m3, m5 |
| 20331 | movu [r0 + 627 * 16], m3 |
| 20332 | |
| 20333 | ; mode 11 [row 26] |
| 20334 | pmaddubsw m3, m0, [r5 + 10 * 16] |
| 20335 | pmulhrsw m3, m7 |
| 20336 | pmaddubsw m5, m2, [r5 + 10 * 16] |
| 20337 | pmulhrsw m5, m7 |
| 20338 | packuswb m3, m5 |
| 20339 | movu [r0 + 628 * 16], m3 |
| 20340 | pmaddubsw m3, m1, [r5 + 10 * 16] |
| 20341 | pmulhrsw m3, m7 |
| 20342 | pmaddubsw m5, m4, [r5 + 10 * 16] |
| 20343 | pmulhrsw m5, m7 |
| 20344 | packuswb m3, m5 |
| 20345 | movu [r0 + 629 * 16], m3 |
| 20346 | |
| 20347 | ; mode 11 [row 27] |
| 20348 | pmaddubsw m3, m0, [r5 + 8 * 16] |
| 20349 | pmulhrsw m3, m7 |
| 20350 | pmaddubsw m5, m2, [r5 + 8 * 16] |
| 20351 | pmulhrsw m5, m7 |
| 20352 | packuswb m3, m5 |
| 20353 | movu [r0 + 630 * 16], m3 |
| 20354 | pmaddubsw m3, m1, [r5 + 8 * 16] |
| 20355 | pmulhrsw m3, m7 |
| 20356 | pmaddubsw m5, m4, [r5 + 8 * 16] |
| 20357 | pmulhrsw m5, m7 |
| 20358 | packuswb m3, m5 |
| 20359 | movu [r0 + 631 * 16], m3 |
| 20360 | |
| 20361 | ; mode 11 [row 28] |
| 20362 | pmaddubsw m3, m0, [r5 + 6 * 16] |
| 20363 | pmulhrsw m3, m7 |
| 20364 | pmaddubsw m5, m2, [r5 + 6 * 16] |
| 20365 | pmulhrsw m5, m7 |
| 20366 | packuswb m3, m5 |
| 20367 | movu [r0 + 632 * 16], m3 |
| 20368 | pmaddubsw m3, m1, [r5 + 6 * 16] |
| 20369 | pmulhrsw m3, m7 |
| 20370 | pmaddubsw m5, m4, [r5 + 6 * 16] |
| 20371 | pmulhrsw m5, m7 |
| 20372 | packuswb m3, m5 |
| 20373 | movu [r0 + 633 * 16], m3 |
| 20374 | |
| 20375 | ; mode 11 [row 29] |
| 20376 | pmaddubsw m3, m0, [r5 + 4 * 16] |
| 20377 | pmulhrsw m3, m7 |
| 20378 | pmaddubsw m5, m2, [r5 + 4 * 16] |
| 20379 | pmulhrsw m5, m7 |
| 20380 | packuswb m3, m5 |
| 20381 | movu [r0 + 634 * 16], m3 |
| 20382 | pmaddubsw m3, m1, [r5 + 4 * 16] |
| 20383 | pmulhrsw m3, m7 |
| 20384 | pmaddubsw m5, m4, [r5 + 4 * 16] |
| 20385 | pmulhrsw m5, m7 |
| 20386 | packuswb m3, m5 |
| 20387 | movu [r0 + 635 * 16], m3 |
| 20388 | |
| 20389 | ; mode 11 [row 30] |
| 20390 | pmaddubsw m3, m0, [r5 + 2 * 16] |
| 20391 | pmulhrsw m3, m7 |
| 20392 | pmaddubsw m5, m2, [r5 + 2 * 16] |
| 20393 | pmulhrsw m5, m7 |
| 20394 | packuswb m3, m5 |
| 20395 | movu [r0 + 636 * 16], m3 |
| 20396 | pmaddubsw m3, m1, [r5 + 2 * 16] |
| 20397 | pmulhrsw m3, m7 |
| 20398 | pmaddubsw m5, m4, [r5 + 2 * 16] |
| 20399 | pmulhrsw m5, m7 |
| 20400 | packuswb m3, m5 |
| 20401 | movu [r0 + 637 * 16], m3 |
| 20402 | |
| 20403 | ; mode 12 [row 6] |
| 20404 | pinsrb m0, [r3 + 6], 0 |
| 20405 | pmaddubsw m3, m0, [r5 + 29 * 16] |
| 20406 | pmulhrsw m3, m7 |
| 20407 | pmaddubsw m5, m2, [r5 + 29 * 16] |
| 20408 | pmulhrsw m5, m7 |
| 20409 | packuswb m3, m5 |
| 20410 | movu [r0 + 652 * 16], m3 |
| 20411 | pmaddubsw m3, m1, [r5 + 29 * 16] |
| 20412 | pmulhrsw m3, m7 |
| 20413 | pmaddubsw m5, m4, [r5 + 29 * 16] |
| 20414 | pmulhrsw m5, m7 |
| 20415 | packuswb m3, m5 |
| 20416 | movu [r0 + 653 * 16], m3 |
| 20417 | |
| 20418 | ; mode 12 [row 7] |
| 20419 | pmaddubsw m3, m0, [r5 + 24 * 16] |
| 20420 | pmulhrsw m3, m7 |
| 20421 | pmaddubsw m5, m2, [r5 + 24 * 16] |
| 20422 | pmulhrsw m5, m7 |
| 20423 | packuswb m3, m5 |
| 20424 | movu [r0 + 654 * 16], m3 |
| 20425 | pmaddubsw m3, m1, [r5 + 24 * 16] |
| 20426 | pmulhrsw m3, m7 |
| 20427 | pmaddubsw m5, m4, [r5 + 24 * 16] |
| 20428 | pmulhrsw m5, m7 |
| 20429 | packuswb m3, m5 |
| 20430 | movu [r0 + 655 * 16], m3 |
| 20431 | |
| 20432 | ; mode 12 [row 8] |
| 20433 | pmaddubsw m3, m0, [r5 + 19 * 16] |
| 20434 | pmulhrsw m3, m7 |
| 20435 | pmaddubsw m5, m2, [r5 + 19 * 16] |
| 20436 | pmulhrsw m5, m7 |
| 20437 | packuswb m3, m5 |
| 20438 | movu [r0 + 656 * 16], m3 |
| 20439 | pmaddubsw m3, m1, [r5 + 19 * 16] |
| 20440 | pmulhrsw m3, m7 |
| 20441 | pmaddubsw m5, m4, [r5 + 19 * 16] |
| 20442 | pmulhrsw m5, m7 |
| 20443 | packuswb m3, m5 |
| 20444 | movu [r0 + 657 * 16], m3 |
| 20445 | |
| 20446 | ; mode 12 [row 9] |
| 20447 | pmaddubsw m3, m0, [r5 + 14 * 16] |
| 20448 | pmulhrsw m3, m7 |
| 20449 | pmaddubsw m5, m2, [r5 + 14 * 16] |
| 20450 | pmulhrsw m5, m7 |
| 20451 | packuswb m3, m5 |
| 20452 | movu [r0 + 658 * 16], m3 |
| 20453 | pmaddubsw m3, m1, [r5 + 14 * 16] |
| 20454 | pmulhrsw m3, m7 |
| 20455 | pmaddubsw m5, m4, [r5 + 14 * 16] |
| 20456 | pmulhrsw m5, m7 |
| 20457 | packuswb m3, m5 |
| 20458 | movu [r0 + 659 * 16], m3 |
| 20459 | |
| 20460 | ; mode 12 [row 10] |
| 20461 | pmaddubsw m3, m0, [r5 + 9 * 16] |
| 20462 | pmulhrsw m3, m7 |
| 20463 | pmaddubsw m5, m2, [r5 + 9 * 16] |
| 20464 | pmulhrsw m5, m7 |
| 20465 | packuswb m3, m5 |
| 20466 | movu [r0 + 660 * 16], m3 |
| 20467 | pmaddubsw m3, m1, [r5 + 9 * 16] |
| 20468 | pmulhrsw m3, m7 |
| 20469 | pmaddubsw m5, m4, [r5 + 9 * 16] |
| 20470 | pmulhrsw m5, m7 |
| 20471 | packuswb m3, m5 |
| 20472 | movu [r0 + 661 * 16], m3 |
| 20473 | |
| 20474 | ; mode 12 [row 11] |
| 20475 | pmaddubsw m3, m0, [r5 + 4 * 16] |
| 20476 | pmulhrsw m3, m7 |
| 20477 | pmaddubsw m5, m2, [r5 + 4 * 16] |
| 20478 | pmulhrsw m5, m7 |
| 20479 | packuswb m3, m5 |
| 20480 | movu [r0 + 662 * 16], m3 |
| 20481 | pmaddubsw m3, m1, [r5 + 4 * 16] |
| 20482 | pmulhrsw m3, m7 |
| 20483 | pmaddubsw m5, m4, [r5 + 4 * 16] |
| 20484 | pmulhrsw m5, m7 |
| 20485 | packuswb m3, m5 |
| 20486 | movu [r0 + 663 * 16], m3 |
| 20487 | |
| 20488 | ; mode 13 [row 3] |
| 20489 | movu m6, m0 |
| 20490 | pinsrb m6, [r3 + 4], 0 |
| 20491 | pmaddubsw m3, m6, [r5 + 28 * 16] |
| 20492 | pmulhrsw m3, m7 |
| 20493 | pmaddubsw m5, m2, [r5 + 28 * 16] |
| 20494 | pmulhrsw m5, m7 |
| 20495 | packuswb m3, m5 |
| 20496 | movu [r0 + 710 * 16], m3 |
| 20497 | pmaddubsw m3, m1, [r5 + 28 * 16] |
| 20498 | pmulhrsw m3, m7 |
| 20499 | pmaddubsw m5, m4, [r5 + 28 * 16] |
| 20500 | pmulhrsw m5, m7 |
| 20501 | packuswb m3, m5 |
| 20502 | movu [r0 + 711 * 16], m3 |
| 20503 | |
| 20504 | ; mode 13 [row 4] |
| 20505 | pmaddubsw m3, m6, [r5 + 19 * 16] |
| 20506 | pmulhrsw m3, m7 |
| 20507 | pmaddubsw m5, m2, [r5 + 19 * 16] |
| 20508 | pmulhrsw m5, m7 |
| 20509 | packuswb m3, m5 |
| 20510 | movu [r0 + 712 * 16], m3 |
| 20511 | pmaddubsw m3, m1, [r5 + 19 * 16] |
| 20512 | pmulhrsw m3, m7 |
| 20513 | pmaddubsw m5, m4, [r5 + 19 * 16] |
| 20514 | pmulhrsw m5, m7 |
| 20515 | packuswb m3, m5 |
| 20516 | movu [r0 + 713 * 16], m3 |
| 20517 | |
| 20518 | ; mode 13 [row 5] |
| 20519 | pmaddubsw m3, m6, [r5 + 10 * 16] |
| 20520 | pmulhrsw m3, m7 |
| 20521 | pmaddubsw m5, m2, [r5 + 10 * 16] |
| 20522 | pmulhrsw m5, m7 |
| 20523 | packuswb m3, m5 |
| 20524 | movu [r0 + 714 * 16], m3 |
| 20525 | pmaddubsw m3, m1, [r5 + 10 * 16] |
| 20526 | pmulhrsw m3, m7 |
| 20527 | pmaddubsw m5, m4, [r5 + 10 * 16] |
| 20528 | pmulhrsw m5, m7 |
| 20529 | packuswb m3, m5 |
| 20530 | movu [r0 + 715 * 16], m3 |
| 20531 | |
| 20532 | ; mode 13 [row 6] |
| 20533 | pmaddubsw m3, m6, [r5 + 1 * 16] |
| 20534 | pmulhrsw m3, m7 |
| 20535 | pmaddubsw m5, m2, [r5 + 1 * 16] |
| 20536 | pmulhrsw m5, m7 |
| 20537 | packuswb m3, m5 |
| 20538 | movu [r0 + 716 * 16], m3 |
| 20539 | pmaddubsw m3, m1, [r5 + 1 * 16] |
| 20540 | pmulhrsw m3, m7 |
| 20541 | pmaddubsw m5, m4, [r5 + 1 * 16] |
| 20542 | pmulhrsw m5, m7 |
| 20543 | packuswb m3, m5 |
| 20544 | movu [r0 + 717 * 16], m3 |
| 20545 | |
| 20546 | ; mode 14 [row 2] |
| 20547 | movu m6, m0 |
| 20548 | pinsrb m6, [r4 + 0], 1 |
| 20549 | pinsrb m6, [r3 + 2], 0 |
| 20550 | pmaddubsw m3, m6, [r5 + 25 * 16] |
| 20551 | pmulhrsw m3, m7 |
| 20552 | pmaddubsw m5, m2, [r5 + 25 * 16] |
| 20553 | pmulhrsw m5, m7 |
| 20554 | packuswb m3, m5 |
| 20555 | movu [r0 + 772 * 16], m3 |
| 20556 | pmaddubsw m3, m1, [r5 + 25 * 16] |
| 20557 | pmulhrsw m3, m7 |
| 20558 | pmaddubsw m5, m4, [r5 + 25 * 16] |
| 20559 | pmulhrsw m5, m7 |
| 20560 | packuswb m3, m5 |
| 20561 | movu [r0 + 773 * 16], m3 |
| 20562 | |
| 20563 | ; mode 14 [row 3] |
| 20564 | pmaddubsw m3, m6, [r5 + 12 * 16] |
| 20565 | pmulhrsw m3, m7 |
| 20566 | pmaddubsw m5, m2, [r5 + 12 * 16] |
| 20567 | pmulhrsw m5, m7 |
| 20568 | packuswb m3, m5 |
| 20569 | movu [r0 + 774 * 16], m3 |
| 20570 | pmaddubsw m3, m1, [r5 + 12 * 16] |
| 20571 | pmulhrsw m3, m7 |
| 20572 | pmaddubsw m5, m4, [r5 + 12 * 16] |
| 20573 | pmulhrsw m5, m7 |
| 20574 | packuswb m3, m5 |
| 20575 | movu [r0 + 775 * 16], m3 |
| 20576 | |
| 20577 | ; mode 15 [row 1] |
| 20578 | pmaddubsw m3, m6, [r5 + 30 * 16] |
| 20579 | pmulhrsw m3, m7 |
| 20580 | pmaddubsw m5, m2, [r5 + 30 * 16] |
| 20581 | pmulhrsw m5, m7 |
| 20582 | packuswb m3, m5 |
| 20583 | movu [r0 + 834 * 16], m3 |
| 20584 | pmaddubsw m3, m1, [r5 + 30 * 16] |
| 20585 | pmulhrsw m3, m7 |
| 20586 | pmaddubsw m5, m4, [r5 + 30 * 16] |
| 20587 | pmulhrsw m5, m7 |
| 20588 | packuswb m3, m5 |
| 20589 | movu [r0 + 835 * 16], m3 |
| 20590 | |
| 20591 | ; mode 15 [row 2] |
| 20592 | pmaddubsw m3, m6, [r5 + 13 * 16] |
| 20593 | pmulhrsw m3, m7 |
| 20594 | pmaddubsw m5, m2, [r5 + 13 * 16] |
| 20595 | pmulhrsw m5, m7 |
| 20596 | packuswb m3, m5 |
| 20597 | movu [r0 + 836 * 16], m3 |
| 20598 | pmaddubsw m3, m1, [r5 + 13 * 16] |
| 20599 | pmulhrsw m3, m7 |
| 20600 | pmaddubsw m5, m4, [r5 + 13 * 16] |
| 20601 | pmulhrsw m5, m7 |
| 20602 | packuswb m3, m5 |
| 20603 | movu [r0 + 837 * 16], m3 |
| 20604 | |
| 20605 | ; mode 15 [row 3] |
| 20606 | pslldq m6, 2 |
| 20607 | pinsrb m6, [r3 + 2], 1 |
| 20608 | pinsrb m6, [r3 + 4], 0 |
| 20609 | pmaddubsw m3, m6, [r5 + 28 * 16] |
| 20610 | pmulhrsw m3, m7 |
| 20611 | pslldq m2, 2 |
| 20612 | pinsrb m2, [r4 + 7], 1 |
| 20613 | pinsrb m2, [r4 + 6], 0 |
| 20614 | pmaddubsw m5, m2, [r5 + 28 * 16] |
| 20615 | pmulhrsw m5, m7 |
| 20616 | packuswb m3, m5 |
| 20617 | movu [r0 + 838 * 16], m3 |
| 20618 | pslldq m1, 2 |
| 20619 | pinsrb m1, [r4 + 15], 1 |
| 20620 | pinsrb m1, [r4 + 14], 0 |
| 20621 | pmaddubsw m3, m1, [r5 + 28 * 16] |
| 20622 | pmulhrsw m3, m7 |
| 20623 | pslldq m4, 2 |
| 20624 | pinsrb m4, [r4 + 23], 1 |
| 20625 | pinsrb m4, [r4 + 22], 0 |
| 20626 | pmaddubsw m5, m4, [r5 + 28 * 16] |
| 20627 | pmulhrsw m5, m7 |
| 20628 | packuswb m3, m5 |
| 20629 | movu [r0 + 839 * 16], m3 |
| 20630 | |
| 20631 | ; mode 15 [row 4] |
| 20632 | pmaddubsw m3, m6, [r5 + 11 * 16] |
| 20633 | pmulhrsw m3, m7 |
| 20634 | pmaddubsw m5, m2, [r5 + 11 * 16] |
| 20635 | pmulhrsw m5, m7 |
| 20636 | packuswb m3, m5 |
| 20637 | movu [r0 + 840 * 16], m3 |
| 20638 | pmaddubsw m3, m1, [r5 + 11 * 16] |
| 20639 | pmulhrsw m3, m7 |
| 20640 | pmaddubsw m5, m4, [r5 + 11 * 16] |
| 20641 | pmulhrsw m5, m7 |
| 20642 | packuswb m3, m5 |
| 20643 | movu [r0 + 841 * 16], m3 |
| 20644 | |
| 20645 | ; mode 15 [row 5, 0-7] |
| 20646 | pslldq m6, 2 |
| 20647 | pinsrb m6, [r3 + 4], 1 |
| 20648 | pinsrb m6, [r3 + 6], 0 |
| 20649 | pmaddubsw m3, m6, [r5 + 26 * 16] |
| 20650 | pmulhrsw m3, m7 |
| 20651 | packuswb m3, m3 |
| 20652 | movh [r0 + 842 * 16], m3 |
| 20653 | |
| 20654 | ; mode 15 [row 6, 0-7] |
| 20655 | pmaddubsw m3, m6, [r5 + 9 * 16] |
| 20656 | pmulhrsw m3, m7 |
| 20657 | packuswb m3, m3 |
| 20658 | movh [r0 + 844 * 16], m3 |
| 20659 | |
| 20660 | ; mode 15 [row 7, 0-7] |
| 20661 | pslldq m6, 2 |
| 20662 | pinsrb m6, [r3 + 6], 1 |
| 20663 | pinsrb m6, [r3 + 8], 0 |
| 20664 | pmaddubsw m3, m6, [r5 + 24 * 16] |
| 20665 | pmulhrsw m3, m7 |
| 20666 | packuswb m3, m3 |
| 20667 | movh [r0 + 846 * 16], m3 |
| 20668 | |
| 20669 | ; mode 15 [row 8, 0-7] |
| 20670 | pmaddubsw m3, m6, [r5 + 7 * 16] |
| 20671 | pmulhrsw m3, m7 |
| 20672 | packuswb m3, m3 |
| 20673 | movh [r0 + 848 * 16], m3 |
| 20674 | |
| 20675 | ; mode 15 [row 9, 0-7] |
| 20676 | pslldq m6, 2 |
| 20677 | pinsrb m6, [r3 + 8], 1 |
| 20678 | pinsrb m6, [r3 + 9], 0 |
| 20679 | pmaddubsw m3, m6, [r5 + 22 * 16] |
| 20680 | pmulhrsw m3, m7 |
| 20681 | packuswb m3, m3 |
| 20682 | movh [r0 + 850 * 16], m3 |
| 20683 | |
| 20684 | ; mode 15 [row 10, 0-7] |
| 20685 | pmaddubsw m3, m6, [r5 + 5 * 16] |
| 20686 | pmulhrsw m3, m7 |
| 20687 | packuswb m3, m3 |
| 20688 | movh [r0 + 852 * 16], m3 |
| 20689 | |
| 20690 | ; mode 15 [row 11, 0-7] |
| 20691 | pslldq m6, 2 |
| 20692 | pinsrb m6, [r3 + 9], 1 |
| 20693 | pinsrb m6, [r3 + 11], 0 |
| 20694 | pmaddubsw m3, m6, [r5 + 20 * 16] |
| 20695 | pmulhrsw m3, m7 |
| 20696 | packuswb m3, m3 |
| 20697 | movh [r0 + 854 * 16], m3 |
| 20698 | |
| 20699 | ; mode 15 [row 12, 0-7] |
| 20700 | pmaddubsw m3, m6, [r5 + 3 * 16] |
| 20701 | pmulhrsw m3, m7 |
| 20702 | packuswb m3, m3 |
| 20703 | movh [r0 + 856 * 16], m3 |
| 20704 | |
| 20705 | ; mode 15 [row 13, 0-7] |
| 20706 | pslldq m6, 2 |
| 20707 | pinsrb m6, [r3 + 11], 1 |
| 20708 | pinsrb m6, [r3 + 13], 0 |
| 20709 | pmaddubsw m3, m6, [r5 + 18 * 16] |
| 20710 | pmulhrsw m3, m7 |
| 20711 | packuswb m3, m3 |
| 20712 | movh [r0 + 858 * 16], m3 |
| 20713 | |
| 20714 | ; mode 15 [row 14, 0-7] |
| 20715 | pmaddubsw m3, m6, [r5 + 1 * 16] |
| 20716 | pmulhrsw m3, m7 |
| 20717 | packuswb m3, m3 |
| 20718 | movh [r0 + 860 * 16], m3 |
| 20719 | |
| 20720 | ; mode 15 [row 15, 0-7] |
| 20721 | pslldq m6, 2 |
| 20722 | pinsrb m6, [r3 + 13], 1 |
| 20723 | pinsrb m6, [r3 + 15], 0 |
| 20724 | pmaddubsw m3, m6, [r5 + 16 * 16] |
| 20725 | pmulhrsw m3, m7 |
| 20726 | packuswb m3, m3 |
| 20727 | movh [r0 + 862 * 16], m3 |
| 20728 | |
| 20729 | ; mode 15 [row 16, 0-7] |
| 20730 | pslldq m6, 2 |
| 20731 | pinsrb m6, [r3 + 15], 1 |
| 20732 | pinsrb m6, [r3 + 17], 0 |
| 20733 | pmaddubsw m3, m6, [r5 + 31 * 16] |
| 20734 | pmulhrsw m3, m7 |
| 20735 | packuswb m3, m3 |
| 20736 | movh [r0 + 864 * 16], m3 |
| 20737 | |
| 20738 | ; mode 15 [row 17, 0-7] |
| 20739 | pmaddubsw m3, m6, [r5 + 14 * 16] |
| 20740 | pmulhrsw m3, m7 |
| 20741 | packuswb m3, m3 |
| 20742 | movh [r0 + 866 * 16], m3 |
| 20743 | |
| 20744 | ; mode 15 [row 18, 0-7] |
| 20745 | pslldq m6, 2 |
| 20746 | pinsrb m6, [r3 + 17], 1 |
| 20747 | pinsrb m6, [r3 + 19], 0 |
| 20748 | pmaddubsw m3, m6, [r5 + 29 * 16] |
| 20749 | pmulhrsw m3, m7 |
| 20750 | packuswb m3, m3 |
| 20751 | movh [r0 + 868 * 16], m3 |
| 20752 | |
| 20753 | ; mode 15 [row 19, 0-7] |
| 20754 | pmaddubsw m3, m6, [r5 + 12 * 16] |
| 20755 | pmulhrsw m3, m7 |
| 20756 | packuswb m3, m3 |
| 20757 | movh [r0 + 870 * 16], m3 |
| 20758 | |
| 20759 | ; mode 15 [row 20, 0-7] |
| 20760 | pslldq m6, 2 |
| 20761 | pinsrb m6, [r3 + 19], 1 |
| 20762 | pinsrb m6, [r3 + 21], 0 |
| 20763 | pmaddubsw m3, m6, [r5 + 27 * 16] |
| 20764 | pmulhrsw m3, m7 |
| 20765 | packuswb m3, m3 |
| 20766 | movh [r0 + 872 * 16], m3 |
| 20767 | |
| 20768 | ; mode 15 [row 21, 0-7] |
| 20769 | pmaddubsw m3, m6, [r5 + 10 * 16] |
| 20770 | pmulhrsw m3, m7 |
| 20771 | packuswb m3, m3 |
| 20772 | movh [r0 + 874 * 16], m3 |
| 20773 | |
| 20774 | ; mode 15 [row 22, 0-7] |
| 20775 | pslldq m6, 2 |
| 20776 | pinsrb m6, [r3 + 21], 1 |
| 20777 | pinsrb m6, [r3 + 23], 0 |
| 20778 | pmaddubsw m3, m6, [r5 + 25 * 16] |
| 20779 | pmulhrsw m3, m7 |
| 20780 | packuswb m3, m3 |
| 20781 | movh [r0 + 876 * 16], m3 |
| 20782 | |
| 20783 | ; mode 15 [row 23, 0-7] |
| 20784 | pmaddubsw m3, m6, [r5 + 8 * 16] |
| 20785 | pmulhrsw m3, m7 |
| 20786 | packuswb m3, m3 |
| 20787 | movh [r0 + 878 * 16], m3 |
| 20788 | |
| 20789 | ; mode 15 [row 24, 0-7] |
| 20790 | pslldq m6, 2 |
| 20791 | pinsrb m6, [r3 + 23], 1 |
| 20792 | pinsrb m6, [r3 + 24], 0 |
| 20793 | pmaddubsw m3, m6, [r5 + 23 * 16] |
| 20794 | pmulhrsw m3, m7 |
| 20795 | packuswb m3, m3 |
| 20796 | movh [r0 + 880 * 16], m3 |
| 20797 | |
| 20798 | ; mode 15 [row 25, 0-7] |
| 20799 | pmaddubsw m3, m6, [r5 + 6 * 16] |
| 20800 | pmulhrsw m3, m7 |
| 20801 | packuswb m3, m3 |
| 20802 | movh [r0 + 882 * 16], m3 |
| 20803 | |
| 20804 | ; mode 15 [row 26, 0-7] |
| 20805 | pslldq m6, 2 |
| 20806 | pinsrb m6, [r3 + 24], 1 |
| 20807 | pinsrb m6, [r3 + 26], 0 |
| 20808 | pmaddubsw m3, m6, [r5 + 21 * 16] |
| 20809 | pmulhrsw m3, m7 |
| 20810 | packuswb m3, m3 |
| 20811 | movh [r0 + 884 * 16], m3 |
| 20812 | |
| 20813 | ; mode 15 [row 27, 0-7] |
| 20814 | pmaddubsw m3, m6, [r5 + 4 * 16] |
| 20815 | pmulhrsw m3, m7 |
| 20816 | packuswb m3, m3 |
| 20817 | movh [r0 + 886 * 16], m3 |
| 20818 | |
| 20819 | ; mode 15 [row 28, 0-7] |
| 20820 | pslldq m6, 2 |
| 20821 | pinsrb m6, [r3 + 26], 1 |
| 20822 | pinsrb m6, [r3 + 28], 0 |
| 20823 | pmaddubsw m3, m6, [r5 + 19 * 16] |
| 20824 | pmulhrsw m3, m7 |
| 20825 | packuswb m3, m3 |
| 20826 | movh [r0 + 888 * 16], m3 |
| 20827 | |
| 20828 | ; mode 15 [row 29, 0-7] |
| 20829 | pmaddubsw m3, m6, [r5 + 2 * 16] |
| 20830 | pmulhrsw m3, m7 |
| 20831 | packuswb m3, m3 |
| 20832 | movh [r0 + 890 * 16], m3 |
| 20833 | |
| 20834 | ; mode 15 [row 30, 0-7] |
| 20835 | pslldq m6, 2 |
| 20836 | pinsrb m6, [r3 + 28], 1 |
| 20837 | pinsrb m6, [r3 + 30], 0 |
| 20838 | pmaddubsw m3, m6, [r5 + 17 * 16] |
| 20839 | pmulhrsw m3, m7 |
| 20840 | packuswb m3, m3 |
| 20841 | movh [r0 + 892 * 16], m3 |
| 20842 | |
| 20843 | ; mode 15 [row 31, 0-7] |
| 20844 | pshufb m3, m6, [tab_S2] |
| 20845 | movh [r0 + 894 * 16], m3 |
| 20846 | |
| 20847 | ; mode 12 [row 12] |
| 20848 | pslldq m0, 2 |
| 20849 | pinsrb m0, [r3 + 6], 1 |
| 20850 | pinsrb m0, [r3 + 13], 0 |
| 20851 | pmaddubsw m3, m0, [r5 + 31 * 16] |
| 20852 | pmulhrsw m3, m7 |
| 20853 | pmaddubsw m5, m2, [r5 + 31 * 16] |
| 20854 | pmulhrsw m5, m7 |
| 20855 | packuswb m3, m5 |
| 20856 | movu [r0 + 664 * 16], m3 |
| 20857 | pmaddubsw m3, m1, [r5 + 31 * 16] |
| 20858 | pmulhrsw m3, m7 |
| 20859 | pmaddubsw m5, m4, [r5 + 31 * 16] |
| 20860 | pmulhrsw m5, m7 |
| 20861 | packuswb m3, m5 |
| 20862 | movu [r0 + 665 * 16], m3 |
| 20863 | |
| 20864 | ; mode 12 [row 13] |
| 20865 | pmaddubsw m3, m0, [r5 + 26 * 16] |
| 20866 | pmulhrsw m3, m7 |
| 20867 | pmaddubsw m5, m2, [r5 + 26 * 16] |
| 20868 | pmulhrsw m5, m7 |
| 20869 | packuswb m3, m5 |
| 20870 | movu [r0 + 666 * 16], m3 |
| 20871 | pmaddubsw m3, m1, [r5 + 26 * 16] |
| 20872 | pmulhrsw m3, m7 |
| 20873 | pmaddubsw m5, m4, [r5 + 26 * 16] |
| 20874 | pmulhrsw m5, m7 |
| 20875 | packuswb m3, m5 |
| 20876 | movu [r0 + 667 * 16], m3 |
| 20877 | |
| 20878 | ; mode 12 [row 14] |
| 20879 | pmaddubsw m3, m0, [r5 + 21 * 16] |
| 20880 | pmulhrsw m3, m7 |
| 20881 | pmaddubsw m5, m2, [r5 + 21 * 16] |
| 20882 | pmulhrsw m5, m7 |
| 20883 | packuswb m3, m5 |
| 20884 | movu [r0 + 668 * 16], m3 |
| 20885 | pmaddubsw m3, m1, [r5 + 21 * 16] |
| 20886 | pmulhrsw m3, m7 |
| 20887 | pmaddubsw m5, m4, [r5 + 21 * 16] |
| 20888 | pmulhrsw m5, m7 |
| 20889 | packuswb m3, m5 |
| 20890 | movu [r0 + 669 * 16], m3 |
| 20891 | |
| 20892 | ; mode 12 [row 15] |
| 20893 | pmaddubsw m3, m0, [r5 + 16 * 16] |
| 20894 | pmulhrsw m3, m7 |
| 20895 | pmaddubsw m5, m2, [r5 + 16 * 16] |
| 20896 | pmulhrsw m5, m7 |
| 20897 | packuswb m3, m5 |
| 20898 | movu [r0 + 670 * 16], m3 |
| 20899 | pmaddubsw m3, m1, [r5 + 16 * 16] |
| 20900 | pmulhrsw m3, m7 |
| 20901 | pmaddubsw m5, m4, [r5 + 16 * 16] |
| 20902 | pmulhrsw m5, m7 |
| 20903 | packuswb m3, m5 |
| 20904 | movu [r0 + 671 * 16], m3 |
| 20905 | |
| 20906 | ; mode 12 [row 16] |
| 20907 | pmaddubsw m3, m0, [r5 + 11 * 16] |
| 20908 | pmulhrsw m3, m7 |
| 20909 | pmaddubsw m5, m2, [r5 + 11 * 16] |
| 20910 | pmulhrsw m5, m7 |
| 20911 | packuswb m3, m5 |
| 20912 | movu [r0 + 672 * 16], m3 |
| 20913 | pmaddubsw m3, m1, [r5 + 11 * 16] |
| 20914 | pmulhrsw m3, m7 |
| 20915 | pmaddubsw m5, m4, [r5 + 11 * 16] |
| 20916 | pmulhrsw m5, m7 |
| 20917 | packuswb m3, m5 |
| 20918 | movu [r0 + 673 * 16], m3 |
| 20919 | |
| 20920 | ; mode 12 [row 17] |
| 20921 | pmaddubsw m3, m0, [r5 + 6 * 16] |
| 20922 | pmulhrsw m3, m7 |
| 20923 | pmaddubsw m5, m2, [r5 + 6 * 16] |
| 20924 | pmulhrsw m5, m7 |
| 20925 | packuswb m3, m5 |
| 20926 | movu [r0 + 674 * 16], m3 |
| 20927 | pmaddubsw m3, m1, [r5 + 6 * 16] |
| 20928 | pmulhrsw m3, m7 |
| 20929 | pmaddubsw m5, m4, [r5 + 6 * 16] |
| 20930 | pmulhrsw m5, m7 |
| 20931 | packuswb m3, m5 |
| 20932 | movu [r0 + 675 * 16], m3 |
| 20933 | |
| 20934 | ; mode 12 [row 18] |
| 20935 | pmaddubsw m3, m0, [r5 + 1 * 16] |
| 20936 | pmulhrsw m3, m7 |
| 20937 | pmaddubsw m5, m2, [r5 + 1 * 16] |
| 20938 | pmulhrsw m5, m7 |
| 20939 | packuswb m3, m5 |
| 20940 | movu [r0 + 676 * 16], m3 |
| 20941 | pmaddubsw m3, m1, [r5 + 1 * 16] |
| 20942 | pmulhrsw m3, m7 |
| 20943 | pmaddubsw m5, m4, [r5 + 1 * 16] |
| 20944 | pmulhrsw m5, m7 |
| 20945 | packuswb m3, m5 |
| 20946 | movu [r0 + 677 * 16], m3 |
| 20947 | |
| 20948 | ; mode 13 [row 7] |
| 20949 | movu m6, m0 |
| 20950 | pinsrb m6, [r3 + 4], 2 |
| 20951 | pinsrb m6, [r3 + 4], 1 |
| 20952 | pinsrb m6, [r3 + 7], 0 |
| 20953 | pmaddubsw m3, m6, [r5 + 24 * 16] |
| 20954 | pmulhrsw m3, m7 |
| 20955 | pmaddubsw m5, m2, [r5 + 24 * 16] |
| 20956 | pmulhrsw m5, m7 |
| 20957 | packuswb m3, m5 |
| 20958 | movu [r0 + 718 * 16], m3 |
| 20959 | pmaddubsw m3, m1, [r5 + 24 * 16] |
| 20960 | pmulhrsw m3, m7 |
| 20961 | pmaddubsw m5, m4, [r5 + 24 * 16] |
| 20962 | pmulhrsw m5, m7 |
| 20963 | packuswb m3, m5 |
| 20964 | movu [r0 + 719 * 16], m3 |
| 20965 | |
| 20966 | ; mode 13 [row 8] |
| 20967 | pmaddubsw m3, m6, [r5 + 15 * 16] |
| 20968 | pmulhrsw m3, m7 |
| 20969 | pmaddubsw m5, m2, [r5 + 15 * 16] |
| 20970 | pmulhrsw m5, m7 |
| 20971 | packuswb m3, m5 |
| 20972 | movu [r0 + 720 * 16], m3 |
| 20973 | pmaddubsw m3, m1, [r5 + 15 * 16] |
| 20974 | pmulhrsw m3, m7 |
| 20975 | pmaddubsw m5, m4, [r5 + 15 * 16] |
| 20976 | pmulhrsw m5, m7 |
| 20977 | packuswb m3, m5 |
| 20978 | movu [r0 + 721 * 16], m3 |
| 20979 | |
| 20980 | ; mode 13 [row 9] |
| 20981 | pmaddubsw m3, m6, [r5 + 6 * 16] |
| 20982 | pmulhrsw m3, m7 |
| 20983 | pmaddubsw m5, m2, [r5 + 6 * 16] |
| 20984 | pmulhrsw m5, m7 |
| 20985 | packuswb m3, m5 |
| 20986 | movu [r0 + 722 * 16], m3 |
| 20987 | pmaddubsw m3, m1, [r5 + 6 * 16] |
| 20988 | pmulhrsw m3, m7 |
| 20989 | pmaddubsw m5, m4, [r5 + 6 * 16] |
| 20990 | pmulhrsw m5, m7 |
| 20991 | packuswb m3, m5 |
| 20992 | movu [r0 + 723 * 16], m3 |
| 20993 | |
| 20994 | ; mode 14 [row 4] |
| 20995 | pinsrb m6, [r3 + 2], 2 |
| 20996 | pinsrb m6, [r3 + 2], 1 |
| 20997 | pinsrb m6, [r3 + 5], 0 |
| 20998 | pmaddubsw m3, m6, [r5 + 31 * 16] |
| 20999 | pmulhrsw m3, m7 |
| 21000 | pmaddubsw m5, m2, [r5 + 31 * 16] |
| 21001 | pmulhrsw m5, m7 |
| 21002 | packuswb m3, m5 |
| 21003 | movu [r0 + 776 * 16], m3 |
| 21004 | pmaddubsw m3, m1, [r5 + 31 * 16] |
| 21005 | pmulhrsw m3, m7 |
| 21006 | pmaddubsw m5, m4, [r5 + 31 * 16] |
| 21007 | pmulhrsw m5, m7 |
| 21008 | packuswb m3, m5 |
| 21009 | movu [r0 + 777 * 16], m3 |
| 21010 | |
| 21011 | ; mode 14 [row 5] |
| 21012 | pmaddubsw m3, m6, [r5 + 18 * 16] |
| 21013 | pmulhrsw m3, m7 |
| 21014 | pmaddubsw m5, m2, [r5 + 18 * 16] |
| 21015 | pmulhrsw m5, m7 |
| 21016 | packuswb m3, m5 |
| 21017 | movu [r0 + 778 * 16], m3 |
| 21018 | pmaddubsw m3, m1, [r5 + 18 * 16] |
| 21019 | pmulhrsw m3, m7 |
| 21020 | pmaddubsw m5, m4, [r5 + 18 * 16] |
| 21021 | pmulhrsw m5, m7 |
| 21022 | packuswb m3, m5 |
| 21023 | movu [r0 + 779 * 16], m3 |
| 21024 | |
| 21025 | ; mode 14 [row 6] |
| 21026 | pmaddubsw m3, m6, [r5 + 5 * 16] |
| 21027 | pmulhrsw m3, m7 |
| 21028 | pmaddubsw m5, m2, [r5 + 5 * 16] |
| 21029 | pmulhrsw m5, m7 |
| 21030 | packuswb m3, m5 |
| 21031 | movu [r0 + 780 * 16], m3 |
| 21032 | pmaddubsw m3, m1, [r5 + 5 * 16] |
| 21033 | pmulhrsw m3, m7 |
| 21034 | pmaddubsw m5, m4, [r5 + 5 * 16] |
| 21035 | pmulhrsw m5, m7 |
| 21036 | packuswb m3, m5 |
| 21037 | movu [r0 + 781 * 16], m3 |
| 21038 | |
| 21039 | ; mode 14 [row 7] |
| 21040 | pslldq m6, 2 |
| 21041 | pinsrb m6, [r3 + 5], 1 |
| 21042 | pinsrb m6, [r3 + 7], 0 |
| 21043 | pmaddubsw m3, m6, [r5 + 24 * 16] |
| 21044 | pmulhrsw m3, m7 |
| 21045 | pslldq m2, 2 |
| 21046 | pinsrw m2, [r4 + 5], 0 |
| 21047 | pmaddubsw m5, m2, [r5 + 24 * 16] |
| 21048 | pmulhrsw m5, m7 |
| 21049 | packuswb m3, m5 |
| 21050 | movu [r0 + 782 * 16], m3 |
| 21051 | pslldq m1, 2 |
| 21052 | pinsrw m1, [r4 + 13], 0 |
| 21053 | pmaddubsw m3, m1, [r5 + 24 * 16] |
| 21054 | pmulhrsw m3, m7 |
| 21055 | pslldq m4, 2 |
| 21056 | pinsrw m4, [r4 + 21], 0 |
| 21057 | pmaddubsw m5, m4, [r5 + 24 * 16] |
| 21058 | pmulhrsw m5, m7 |
| 21059 | packuswb m3, m5 |
| 21060 | movu [r0 + 783 * 16], m3 |
| 21061 | |
| 21062 | ; mode 14 [row 8] |
| 21063 | pmaddubsw m3, m6, [r5 + 11 * 16] |
| 21064 | pmulhrsw m3, m7 |
| 21065 | pmaddubsw m5, m2, [r5 + 11 * 16] |
| 21066 | pmulhrsw m5, m7 |
| 21067 | packuswb m3, m5 |
| 21068 | movu [r0 + 784 * 16], m3 |
| 21069 | pmaddubsw m3, m1, [r5 + 11 * 16] |
| 21070 | pmulhrsw m3, m7 |
| 21071 | pmaddubsw m5, m4, [r5 + 11 * 16] |
| 21072 | pmulhrsw m5, m7 |
| 21073 | packuswb m3, m5 |
| 21074 | movu [r0 + 785 * 16], m3 |
| 21075 | |
| 21076 | ; mode 15 [row 5, 8-31] |
| 21077 | pmaddubsw m5, m2, [r5 + 26 * 16] |
| 21078 | pmulhrsw m5, m7 |
| 21079 | packuswb m5, m5 |
| 21080 | movh [r0 + 842 * 16 + 8], m5 |
| 21081 | pmaddubsw m3, m1, [r5 + 26 * 16] |
| 21082 | pmulhrsw m3, m7 |
| 21083 | pmaddubsw m5, m4, [r5 + 26 * 16] |
| 21084 | pmulhrsw m5, m7 |
| 21085 | packuswb m3, m5 |
| 21086 | movu [r0 + 843 * 16], m3 |
| 21087 | |
| 21088 | ; mode 15 [row 6, 8-31] |
| 21089 | pmaddubsw m5, m2, [r5 + 9 * 16] |
| 21090 | pmulhrsw m5, m7 |
| 21091 | packuswb m5, m5 |
| 21092 | movh [r0 + 844 * 16 + 8], m5 |
| 21093 | pmaddubsw m3, m1, [r5 + 9 * 16] |
| 21094 | pmulhrsw m3, m7 |
| 21095 | pmaddubsw m5, m4, [r5 + 9 * 16] |
| 21096 | pmulhrsw m5, m7 |
| 21097 | packuswb m3, m5 |
| 21098 | movu [r0 + 845 * 16], m3 |
| 21099 | |
| 21100 | ; mode 12 [row 19] |
| 21101 | pslldq m0, 2 |
| 21102 | pinsrb m0, [r3 + 13], 1 |
| 21103 | pinsrb m0, [r3 + 19], 0 |
| 21104 | pmaddubsw m3, m0, [r5 + 28 * 16] |
| 21105 | pmulhrsw m3, m7 |
| 21106 | pmaddubsw m5, m2, [r5 + 28 * 16] |
| 21107 | pmulhrsw m5, m7 |
| 21108 | packuswb m3, m5 |
| 21109 | movu [r0 + 678 * 16], m3 |
| 21110 | pmaddubsw m3, m1, [r5 + 28 * 16] |
| 21111 | pmulhrsw m3, m7 |
| 21112 | pmaddubsw m5, m4, [r5 + 28 * 16] |
| 21113 | pmulhrsw m5, m7 |
| 21114 | packuswb m3, m5 |
| 21115 | movu [r0 + 679 * 16], m3 |
| 21116 | |
| 21117 | ; mode 12 [row 20] |
| 21118 | pmaddubsw m3, m0, [r5 + 23 * 16] |
| 21119 | pmulhrsw m3, m7 |
| 21120 | pmaddubsw m5, m2, [r5 + 23 * 16] |
| 21121 | pmulhrsw m5, m7 |
| 21122 | packuswb m3, m5 |
| 21123 | movu [r0 + 680 * 16], m3 |
| 21124 | pmaddubsw m3, m1, [r5 + 23 * 16] |
| 21125 | pmulhrsw m3, m7 |
| 21126 | pmaddubsw m5, m4, [r5 + 23 * 16] |
| 21127 | pmulhrsw m5, m7 |
| 21128 | packuswb m3, m5 |
| 21129 | movu [r0 + 681 * 16], m3 |
| 21130 | |
| 21131 | ; mode 12 [row 21] |
| 21132 | pmaddubsw m3, m0, [r5 + 18 * 16] |
| 21133 | pmulhrsw m3, m7 |
| 21134 | pmaddubsw m5, m2, [r5 + 18 * 16] |
| 21135 | pmulhrsw m5, m7 |
| 21136 | packuswb m3, m5 |
| 21137 | movu [r0 + 682 * 16], m3 |
| 21138 | pmaddubsw m3, m1, [r5 + 18 * 16] |
| 21139 | pmulhrsw m3, m7 |
| 21140 | pmaddubsw m5, m4, [r5 + 18 * 16] |
| 21141 | pmulhrsw m5, m7 |
| 21142 | packuswb m3, m5 |
| 21143 | movu [r0 + 683 * 16], m3 |
| 21144 | |
| 21145 | ; mode 12 [row 22] |
| 21146 | pmaddubsw m3, m0, [r5 + 13 * 16] |
| 21147 | pmulhrsw m3, m7 |
| 21148 | pmaddubsw m5, m2, [r5 + 13 * 16] |
| 21149 | pmulhrsw m5, m7 |
| 21150 | packuswb m3, m5 |
| 21151 | movu [r0 + 684 * 16], m3 |
| 21152 | pmaddubsw m3, m1, [r5 + 13 * 16] |
| 21153 | pmulhrsw m3, m7 |
| 21154 | pmaddubsw m5, m4, [r5 + 13 * 16] |
| 21155 | pmulhrsw m5, m7 |
| 21156 | packuswb m3, m5 |
| 21157 | movu [r0 + 685 * 16], m3 |
| 21158 | |
| 21159 | ; mode 12 [row 23] |
| 21160 | pmaddubsw m3, m0, [r5 + 8 * 16] |
| 21161 | pmulhrsw m3, m7 |
| 21162 | pmaddubsw m5, m2, [r5 + 8 * 16] |
| 21163 | pmulhrsw m5, m7 |
| 21164 | packuswb m3, m5 |
| 21165 | movu [r0 + 686 * 16], m3 |
| 21166 | pmaddubsw m3, m1, [r5 + 8 * 16] |
| 21167 | pmulhrsw m3, m7 |
| 21168 | pmaddubsw m5, m4, [r5 + 8 * 16] |
| 21169 | pmulhrsw m5, m7 |
| 21170 | packuswb m3, m5 |
| 21171 | movu [r0 + 687 * 16], m3 |
| 21172 | |
| 21173 | ; mode 12 [row 24] |
| 21174 | pmaddubsw m3, m0, [r5 + 3 * 16] |
| 21175 | pmulhrsw m3, m7 |
| 21176 | pmaddubsw m5, m2, [r5 + 3 * 16] |
| 21177 | pmulhrsw m5, m7 |
| 21178 | packuswb m3, m5 |
| 21179 | movu [r0 + 688 * 16], m3 |
| 21180 | pmaddubsw m3, m1, [r5 + 3 * 16] |
| 21181 | pmulhrsw m3, m7 |
| 21182 | pmaddubsw m5, m4, [r5 + 3 * 16] |
| 21183 | pmulhrsw m5, m7 |
| 21184 | packuswb m3, m5 |
| 21185 | movu [r0 + 689 * 16], m3 |
| 21186 | |
| 21187 | ; mode 13 [row 10] -- m7 (the pmulhrsw multiplier up to this point) takes a copy of m6 before m6 is rebuilt from m0, so the multiplier is read from [pw_1024] below |
| 21188 | movu m7, m6 |
| 21189 | movu m6, m0 |
| 21190 | pinsrb m6, [r3 + 4], 4 |
| 21191 | pinsrb m6, [r3 + 4], 3 |
| 21192 | pinsrb m6, [r3 + 7], 2 |
| 21193 | pinsrb m6, [r3 + 7], 1 |
| 21194 | pinsrb m6, [r3 + 11], 0 |
| 21195 | pmaddubsw m3, m6, [r5 + 29 * 16] |
| 21196 | pmulhrsw m3, [pw_1024] |
| 21197 | pmaddubsw m5, m2, [r5 + 29 * 16] |
| 21198 | pmulhrsw m5, [pw_1024] |
| 21199 | packuswb m3, m5 |
| 21200 | movu [r0 + 724 * 16], m3 |
| 21201 | pmaddubsw m3, m1, [r5 + 29 * 16] |
| 21202 | pmulhrsw m3, [pw_1024] |
| 21203 | pmaddubsw m5, m4, [r5 + 29 * 16] |
| 21204 | pmulhrsw m5, [pw_1024] |
| 21205 | packuswb m3, m5 |
| 21206 | movu [r0 + 725 * 16], m3 |
| 21207 | |
| 21208 | ; mode 13 [row 11] |
| 21209 | pmaddubsw m3, m6, [r5 + 20 * 16] |
| 21210 | pmulhrsw m3, [pw_1024] |
| 21211 | pmaddubsw m5, m2, [r5 + 20 * 16] |
| 21212 | pmulhrsw m5, [pw_1024] |
| 21213 | packuswb m3, m5 |
| 21214 | movu [r0 + 726 * 16], m3 |
| 21215 | pmaddubsw m3, m1, [r5 + 20 * 16] |
| 21216 | pmulhrsw m3, [pw_1024] |
| 21217 | pmaddubsw m5, m4, [r5 + 20 * 16] |
| 21218 | pmulhrsw m5, [pw_1024] |
| 21219 | packuswb m3, m5 |
| 21220 | movu [r0 + 727 * 16], m3 |
| 21221 | |
| 21222 | ; mode 13 [row 12] |
| 21223 | pmaddubsw m3, m6, [r5 + 11 * 16] |
| 21224 | pmulhrsw m3, [pw_1024] |
| 21225 | pmaddubsw m5, m2, [r5 + 11 * 16] |
| 21226 | pmulhrsw m5, [pw_1024] |
| 21227 | packuswb m3, m5 |
| 21228 | movu [r0 + 728 * 16], m3 |
| 21229 | pmaddubsw m3, m1, [r5 + 11 * 16] |
| 21230 | pmulhrsw m3, [pw_1024] |
| 21231 | pmaddubsw m5, m4, [r5 + 11 * 16] |
| 21232 | pmulhrsw m5, [pw_1024] |
| 21233 | packuswb m3, m5 |
| 21234 | movu [r0 + 729 * 16], m3 |
| 21235 | |
| 21236 | ; mode 13 [row 13] |
| 21237 | pmaddubsw m3, m6, [r5 + 2 * 16] |
| 21238 | pmulhrsw m3, [pw_1024] |
| 21239 | pmaddubsw m5, m2, [r5 + 2 * 16] |
| 21240 | pmulhrsw m5, [pw_1024] |
| 21241 | packuswb m3, m5 |
| 21242 | movu [r0 + 730 * 16], m3 |
| 21243 | pmaddubsw m3, m1, [r5 + 2 * 16] |
| 21244 | pmulhrsw m3, [pw_1024] |
| 21245 | pmaddubsw m5, m4, [r5 + 2 * 16] |
| 21246 | pmulhrsw m5, [pw_1024] |
| 21247 | packuswb m3, m5 |
| 21248 | movu [r0 + 731 * 16], m3 |
| 21249 | |
| 21250 | ; mode 14 [row 9] |
| 21251 | pslldq m7, 2 |
| 21252 | pinsrb m7, [r3 + 7], 1 |
| 21253 | pinsrb m7, [r3 + 10], 0 |
| 21254 | pmaddubsw m3, m7, [r5 + 30 * 16] |
| 21255 | pmulhrsw m3, [pw_1024] |
| 21256 | pslldq m2, 2 |
| 21257 | pinsrw m2, [r4 + 4], 0 |
| 21258 | pmaddubsw m5, m2, [r5 + 30 * 16] |
| 21259 | pmulhrsw m5, [pw_1024] |
| 21260 | packuswb m3, m5 |
| 21261 | movu [r0 + 786 * 16], m3 |
| 21262 | pslldq m1, 2 |
| 21263 | pinsrw m1, [r4 + 12], 0 |
| 21264 | pmaddubsw m3, m1, [r5 + 30 * 16] |
| 21265 | pmulhrsw m3, [pw_1024] |
| 21266 | pslldq m4, 2 |
| 21267 | pinsrb m4, [r4 + 21], 1 |
| 21268 | pinsrb m4, [r4 + 20], 0 |
| 21269 | pmaddubsw m5, m4, [r5 + 30 * 16] |
| 21270 | pmulhrsw m5, [pw_1024] |
| 21271 | packuswb m3, m5 |
| 21272 | movu [r0 + 787 * 16], m3 |
| 21273 | |
| 21274 | ; mode 14 [row 10] |
| 21275 | pmaddubsw m3, m7, [r5 + 17 * 16] |
| 21276 | pmulhrsw m3, [pw_1024] |
| 21277 | pmaddubsw m5, m2, [r5 + 17 * 16] |
| 21278 | pmulhrsw m5, [pw_1024] |
| 21279 | packuswb m3, m5 |
| 21280 | movu [r0 + 788 * 16], m3 |
| 21281 | pmaddubsw m3, m1, [r5 + 17 * 16] |
| 21282 | pmulhrsw m3, [pw_1024] |
| 21283 | pmaddubsw m5, m4, [r5 + 17 * 16] |
| 21284 | pmulhrsw m5, [pw_1024] |
| 21285 | packuswb m3, m5 |
| 21286 | movu [r0 + 789 * 16], m3 |
| 21287 | |
| 21288 | ; mode 14 [row 11] |
| 21289 | pmaddubsw m3, m7, [r5 + 4 * 16] |
| 21290 | pmulhrsw m3, [pw_1024] |
| 21291 | pmaddubsw m5, m2, [r5 + 4 * 16] |
| 21292 | pmulhrsw m5, [pw_1024] |
| 21293 | packuswb m3, m5 |
| 21294 | movu [r0 + 790 * 16], m3 |
| 21295 | pmaddubsw m3, m1, [r5 + 4 * 16] |
| 21296 | pmulhrsw m3, [pw_1024] |
| 21297 | pmaddubsw m5, m4, [r5 + 4 * 16] |
| 21298 | pmulhrsw m5, [pw_1024] |
| 21299 | packuswb m3, m5 |
| 21300 | movu [r0 + 791 * 16], m3 |
| 21301 | |
| 21302 | movu m6, [pw_1024] |
| 21303 | |
| 21304 | ; mode 15 [row 7, 8-31] -- m6 was just overwritten with [pw_1024] and is the pmulhrsw multiplier in this section |
| 21305 | pmaddubsw m5, m2, [r5 + 24 * 16] |
| 21306 | pmulhrsw m5, m6 |
| 21307 | packuswb m5, m5 |
| 21308 | movh [r0 + 846 * 16 + 8], m5 |
| 21309 | pmaddubsw m3, m1, [r5 + 24 * 16] |
| 21310 | pmulhrsw m3, m6 |
| 21311 | pmaddubsw m5, m4, [r5 + 24 * 16] |
| 21312 | pmulhrsw m5, m6 |
| 21313 | packuswb m3, m5 |
| 21314 | movu [r0 + 847 * 16], m3 |
| 21315 | |
| 21316 | ; mode 15 [row 8, 8-31] |
| 21317 | pmaddubsw m5, m2, [r5 + 7 * 16] |
| 21318 | pmulhrsw m5, m6 |
| 21319 | packuswb m5, m5 |
| 21320 | movh [r0 + 848 * 16 + 8], m5 |
| 21321 | pmaddubsw m3, m1, [r5 + 7 * 16] |
| 21322 | pmulhrsw m3, m6 |
| 21323 | pmaddubsw m5, m4, [r5 + 7 * 16] |
| 21324 | pmulhrsw m5, m6 |
| 21325 | packuswb m3, m5 |
| 21326 | movu [r0 + 849 * 16], m3 |
| 21327 | |
| 21328 | ; mode 12 [row 25] |
| 21329 | pslldq m0, 2 |
| 21330 | pinsrb m0, [r3 + 19], 1 |
| 21331 | pinsrb m0, [r3 + 26], 0 |
| 21332 | pmaddubsw m3, m0, [r5 + 30 * 16] |
| 21333 | pmulhrsw m3, [pw_1024] |
| 21334 | pmaddubsw m5, m2, [r5 + 30 * 16] |
| 21335 | pmulhrsw m5, [pw_1024] |
| 21336 | packuswb m3, m5 |
| 21337 | movu [r0 + 690 * 16], m3 |
| 21338 | pmaddubsw m3, m1, [r5 + 30 * 16] |
| 21339 | pmulhrsw m3, [pw_1024] |
| 21340 | pmaddubsw m5, m4, [r5 + 30 * 16] |
| 21341 | pmulhrsw m5, [pw_1024] |
| 21342 | packuswb m3, m5 |
| 21343 | movu [r0 + 691 * 16], m3 |
| 21344 | |
| 21345 | ; mode 12 [row 26] |
| 21346 | pmaddubsw m3, m0, [r5 + 25 * 16] |
| 21347 | pmulhrsw m3, [pw_1024] |
| 21348 | pmaddubsw m5, m2, [r5 + 25 * 16] |
| 21349 | pmulhrsw m5, [pw_1024] |
| 21350 | packuswb m3, m5 |
| 21351 | movu [r0 + 692 * 16], m3 |
| 21352 | pmaddubsw m3, m1, [r5 + 25 * 16] |
| 21353 | pmulhrsw m3, [pw_1024] |
| 21354 | pmaddubsw m5, m4, [r5 + 25 * 16] |
| 21355 | pmulhrsw m5, [pw_1024] |
| 21356 | packuswb m3, m5 |
| 21357 | movu [r0 + 693 * 16], m3 |
| 21358 | |
| 21359 | ; mode 12 [row 27] |
| 21360 | pmaddubsw m3, m0, [r5 + 20 * 16] |
| 21361 | pmulhrsw m3, [pw_1024] |
| 21362 | pmaddubsw m5, m2, [r5 + 20 * 16] |
| 21363 | pmulhrsw m5, [pw_1024] |
| 21364 | packuswb m3, m5 |
| 21365 | movu [r0 + 694 * 16], m3 |
| 21366 | pmaddubsw m3, m1, [r5 + 20 * 16] |
| 21367 | pmulhrsw m3, [pw_1024] |
| 21368 | pmaddubsw m5, m4, [r5 + 20 * 16] |
| 21369 | pmulhrsw m5, [pw_1024] |
| 21370 | packuswb m3, m5 |
| 21371 | movu [r0 + 695 * 16], m3 |
| 21372 | |
| 21373 | ; mode 12 [row 28] |
| 21374 | pmaddubsw m3, m0, [r5 + 15 * 16] |
| 21375 | pmulhrsw m3, [pw_1024] |
| 21376 | pmaddubsw m5, m2, [r5 + 15 * 16] |
| 21377 | pmulhrsw m5, [pw_1024] |
| 21378 | packuswb m3, m5 |
| 21379 | movu [r0 + 696 * 16], m3 |
| 21380 | pmaddubsw m3, m1, [r5 + 15 * 16] |
| 21381 | pmulhrsw m3, [pw_1024] |
| 21382 | pmaddubsw m5, m4, [r5 + 15 * 16] |
| 21383 | pmulhrsw m5, [pw_1024] |
| 21384 | packuswb m3, m5 |
| 21385 | movu [r0 + 697 * 16], m3 |
| 21386 | |
| 21387 | ; mode 12 [row 29] |
| 21388 | pmaddubsw m3, m0, [r5 + 10 * 16] |
| 21389 | pmulhrsw m3, [pw_1024] |
| 21390 | pmaddubsw m5, m2, [r5 + 10 * 16] |
| 21391 | pmulhrsw m5, [pw_1024] |
| 21392 | packuswb m3, m5 |
| 21393 | movu [r0 + 698 * 16], m3 |
| 21394 | pmaddubsw m3, m1, [r5 + 10 * 16] |
| 21395 | pmulhrsw m3, [pw_1024] |
| 21396 | pmaddubsw m5, m4, [r5 + 10 * 16] |
| 21397 | pmulhrsw m5, [pw_1024] |
| 21398 | packuswb m3, m5 |
| 21399 | movu [r0 + 699 * 16], m3 |
| 21400 | |
| 21401 | ; mode 12 [row 30] |
| 21402 | pmaddubsw m3, m0, [r5 + 5 * 16] |
| 21403 | pmulhrsw m3, [pw_1024] |
| 21404 | pmaddubsw m5, m2, [r5 + 5 * 16] |
| 21405 | pmulhrsw m5, [pw_1024] |
| 21406 | packuswb m3, m5 |
| 21407 | movu [r0 + 700 * 16], m3 |
| 21408 | pmaddubsw m3, m1, [r5 + 5 * 16] |
| 21409 | pmulhrsw m3, [pw_1024] |
| 21410 | pmaddubsw m5, m4, [r5 + 5 * 16] |
| 21411 | pmulhrsw m5, [pw_1024] |
| 21412 | packuswb m3, m5 |
| 21413 | movu [r0 + 701 * 16], m3 |
| 21414 | |
| 21415 | ; mode 13 [row 14] |
| 21416 | movu m6, m0 |
| 21417 | pinsrb m6, [r3 + 4], 6 |
| 21418 | pinsrb m6, [r3 + 4], 5 |
| 21419 | pinsrb m6, [r3 + 7], 4 |
| 21420 | pinsrb m6, [r3 + 7], 3 |
| 21421 | pinsrb m6, [r3 + 11], 2 |
| 21422 | pinsrb m6, [r3 + 11], 1 |
| 21423 | pinsrb m6, [r3 + 14], 0 |
| 21424 | pmaddubsw m3, m6, [r5 + 25 * 16] |
| 21425 | pmulhrsw m3, [pw_1024] |
| 21426 | pmaddubsw m5, m2, [r5 + 25 * 16] |
| 21427 | pmulhrsw m5, [pw_1024] |
| 21428 | packuswb m3, m5 |
| 21429 | movu [r0 + 732 * 16], m3 |
| 21430 | pmaddubsw m3, m1, [r5 + 25 * 16] |
| 21431 | pmulhrsw m3, [pw_1024] |
| 21432 | pmaddubsw m5, m4, [r5 + 25 * 16] |
| 21433 | pmulhrsw m5, [pw_1024] |
| 21434 | packuswb m3, m5 |
| 21435 | movu [r0 + 733 * 16], m3 |
| 21436 | |
| 21437 | ; mode 13 [row 15] |
| 21438 | pmaddubsw m3, m6, [r5 + 16 * 16] |
| 21439 | pmulhrsw m3, [pw_1024] |
| 21440 | pmaddubsw m5, m2, [r5 + 16 * 16] |
| 21441 | pmulhrsw m5, [pw_1024] |
| 21442 | packuswb m3, m5 |
| 21443 | movu [r0 + 734 * 16], m3 |
| 21444 | pmaddubsw m3, m1, [r5 + 16 * 16] |
| 21445 | pmulhrsw m3, [pw_1024] |
| 21446 | pmaddubsw m5, m4, [r5 + 16 * 16] |
| 21447 | pmulhrsw m5, [pw_1024] |
| 21448 | packuswb m3, m5 |
| 21449 | movu [r0 + 735 * 16], m3 |
| 21450 | |
| 21451 | ; mode 13 [row 16] |
| 21452 | pmaddubsw m3, m6, [r5 + 7 * 16] |
| 21453 | pmulhrsw m3, [pw_1024] |
| 21454 | pmaddubsw m5, m2, [r5 + 7 * 16] |
| 21455 | pmulhrsw m5, [pw_1024] |
| 21456 | packuswb m3, m5 |
| 21457 | movu [r0 + 736 * 16], m3 |
| 21458 | pmaddubsw m3, m1, [r5 + 7 * 16] |
| 21459 | pmulhrsw m3, [pw_1024] |
| 21460 | pmaddubsw m5, m4, [r5 + 7 * 16] |
| 21461 | pmulhrsw m5, [pw_1024] |
| 21462 | packuswb m3, m5 |
| 21463 | movu [r0 + 737 * 16], m3 |
| 21464 | |
| 21465 | ; mode 13 [row 17] |
| 21466 | pslldq m6, 2 |
| 21467 | pinsrb m6, [r3 + 14], 1 |
| 21468 | pinsrb m6, [r3 + 18], 0 |
| 21469 | pmaddubsw m3, m6, [r5 + 30 * 16] |
| 21470 | pmulhrsw m3, [pw_1024] |
| 21471 | pslldq m2, 2 |
| 21472 | pinsrw m2, [r4 + 3], 0 |
| 21473 | pmaddubsw m5, m2, [r5 + 30 * 16] |
| 21474 | pmulhrsw m5, [pw_1024] |
| 21475 | packuswb m3, m5 |
| 21476 | movu [r0 + 738 * 16], m3 |
| 21477 | pslldq m1, 2 |
| 21478 | pinsrw m1, [r4 + 11], 0 |
| 21479 | pmaddubsw m3, m1, [r5 + 30 * 16] |
| 21480 | pmulhrsw m3, [pw_1024] |
| 21481 | pslldq m4, 2 |
| 21482 | pinsrw m4, [r4 + 19], 0 |
| 21483 | pmaddubsw m5, m4, [r5 + 30 * 16] |
| 21484 | pmulhrsw m5, [pw_1024] |
| 21485 | packuswb m3, m5 |
| 21486 | movu [r0 + 739 * 16], m3 |
| 21487 | |
| 21488 | ; mode 13 [row 18] |
| 21489 | pmaddubsw m3, m6, [r5 + 21 * 16] |
| 21490 | pmulhrsw m3, [pw_1024] |
| 21491 | pmaddubsw m5, m2, [r5 + 21 * 16] |
| 21492 | pmulhrsw m5, [pw_1024] |
| 21493 | packuswb m3, m5 |
| 21494 | movu [r0 + 740 * 16], m3 |
| 21495 | pmaddubsw m3, m1, [r5 + 21 * 16] |
| 21496 | pmulhrsw m3, [pw_1024] |
| 21497 | pmaddubsw m5, m4, [r5 + 21 * 16] |
| 21498 | pmulhrsw m5, [pw_1024] |
| 21499 | packuswb m3, m5 |
| 21500 | movu [r0 + 741 * 16], m3 |
| 21501 | |
| 21502 | ; mode 13 [row 19] |
| 21503 | pmaddubsw m3, m6, [r5 + 12 * 16] |
| 21504 | pmulhrsw m3, [pw_1024] |
| 21505 | pmaddubsw m5, m2, [r5 + 12 * 16] |
| 21506 | pmulhrsw m5, [pw_1024] |
| 21507 | packuswb m3, m5 |
| 21508 | movu [r0 + 742 * 16], m3 |
| 21509 | pmaddubsw m3, m1, [r5 + 12 * 16] |
| 21510 | pmulhrsw m3, [pw_1024] |
| 21511 | pmaddubsw m5, m4, [r5 + 12 * 16] |
| 21512 | pmulhrsw m5, [pw_1024] |
| 21513 | packuswb m3, m5 |
| 21514 | movu [r0 + 743 * 16], m3 |
| 21515 | |
| 21516 | ; mode 13 [row 20] |
| 21517 | pmaddubsw m3, m6, [r5 + 3 * 16] |
| 21518 | pmulhrsw m3, [pw_1024] |
| 21519 | pmaddubsw m5, m2, [r5 + 3 * 16] |
| 21520 | pmulhrsw m5, [pw_1024] |
| 21521 | packuswb m3, m5 |
| 21522 | movu [r0 + 744 * 16], m3 |
| 21523 | pmaddubsw m3, m1, [r5 + 3 * 16] |
| 21524 | pmulhrsw m3, [pw_1024] |
| 21525 | pmaddubsw m5, m4, [r5 + 3 * 16] |
| 21526 | pmulhrsw m5, [pw_1024] |
| 21527 | packuswb m3, m5 |
| 21528 | movu [r0 + 745 * 16], m3 |
| 21529 | |
| 21530 | ; mode 14 [row 12] |
| 21531 | pslldq m7, 2 |
| 21532 | pinsrb m7, [r3 + 10], 1 |
| 21533 | pinsrb m7, [r3 + 12], 0 |
| 21534 | pmaddubsw m3, m7, [r5 + 23 * 16] |
| 21535 | pmulhrsw m3, [pw_1024] |
| 21536 | pmaddubsw m5, m2, [r5 + 23 * 16] |
| 21537 | pmulhrsw m5, [pw_1024] |
| 21538 | packuswb m3, m5 |
| 21539 | movu [r0 + 792 * 16], m3 |
| 21540 | pmaddubsw m3, m1, [r5 + 23 * 16] |
| 21541 | pmulhrsw m3, [pw_1024] |
| 21542 | pmaddubsw m5, m4, [r5 + 23 * 16] |
| 21543 | pmulhrsw m5, [pw_1024] |
| 21544 | packuswb m3, m5 |
| 21545 | movu [r0 + 793 * 16], m3 |
| 21546 | |
| 21547 | ; mode 14 [row 13] |
| 21548 | pmaddubsw m3, m7, [r5 + 10 * 16] |
| 21549 | pmulhrsw m3, [pw_1024] |
| 21550 | pmaddubsw m5, m2, [r5 + 10 * 16] |
| 21551 | pmulhrsw m5, [pw_1024] |
| 21552 | packuswb m3, m5 |
| 21553 | movu [r0 + 794 * 16], m3 |
| 21554 | pmaddubsw m3, m1, [r5 + 10 * 16] |
| 21555 | pmulhrsw m3, [pw_1024] |
| 21556 | pmaddubsw m5, m4, [r5 + 10 * 16] |
| 21557 | pmulhrsw m5, [pw_1024] |
| 21558 | packuswb m3, m5 |
| 21559 | movu [r0 + 795 * 16], m3 |
| 21560 | |
| 21561 | ; mode 15 [row 9, 8-31] -- bytes 0-7 of this row were stored earlier with movh at [r0 + 850 * 16] |
| 21562 | pmaddubsw m5, m2, [r5 + 22 * 16] |
| 21563 | pmulhrsw m5, [pw_1024] |
| 21564 | packuswb m5, m5 |
| 21565 | movh [r0 + 850 * 16 + 8], m5 ; 8-byte store: only the low half of m5 belongs here, a 16-byte movu would spill into the block at 851 * 16 |
| 21566 | pmaddubsw m3, m1, [r5 + 22 * 16] |
| 21567 | pmulhrsw m3, [pw_1024] |
| 21568 | pmaddubsw m5, m4, [r5 + 22 * 16] |
| 21569 | pmulhrsw m5, [pw_1024] |
| 21570 | packuswb m3, m5 |
| 21571 | movu [r0 + 851 * 16], m3 |
| 21572 | |
| 21573 | ; mode 15 [row 10, 8-31] -- bytes 0-7 of this row were stored earlier with movh at [r0 + 852 * 16] |
| 21574 | pmaddubsw m5, m2, [r5 + 5 * 16] |
| 21575 | pmulhrsw m5, [pw_1024] |
| 21576 | packuswb m5, m5 |
| 21577 | movh [r0 + 852 * 16 + 8], m5 ; 8-byte store: only the low half of m5 belongs here, a 16-byte movu would spill into the block at 853 * 16 |
| 21578 | pmaddubsw m3, m1, [r5 + 5 * 16] |
| 21579 | pmulhrsw m3, [pw_1024] |
| 21580 | pmaddubsw m5, m4, [r5 + 5 * 16] |
| 21581 | pmulhrsw m5, [pw_1024] |
| 21582 | packuswb m3, m5 |
| 21583 | movu [r0 + 853 * 16], m3 |
| 21584 | |
| 21585 | ; mode 13 [row 21] |
| 21586 | pslldq m6, 2 |
| 21587 | pinsrb m6, [r3 + 18], 1 |
| 21588 | pinsrb m6, [r3 + 21], 0 |
| 21589 | pmaddubsw m3, m6, [r5 + 26 * 16] |
| 21590 | pmulhrsw m3, [pw_1024] |
| 21591 | pslldq m2, 2 |
| 21592 | pinsrw m2, [r4 + 2], 0 |
| 21593 | pmaddubsw m5, m2, [r5 + 26 * 16] |
| 21594 | pmulhrsw m5, [pw_1024] |
| 21595 | packuswb m3, m5 |
| 21596 | movu [r0 + 746 * 16], m3 |
| 21597 | pslldq m1, 2 |
| 21598 | pinsrw m1, [r4 + 10], 0 |
| 21599 | pmaddubsw m3, m1, [r5 + 26 * 16] |
| 21600 | pmulhrsw m3, [pw_1024] |
| 21601 | pslldq m4, 2 |
| 21602 | pinsrw m4, [r4 + 18], 0 |
| 21603 | pmaddubsw m5, m4, [r5 + 26 * 16] |
| 21604 | pmulhrsw m5, [pw_1024] |
| 21605 | packuswb m3, m5 |
| 21606 | movu [r0 + 747 * 16], m3 |
| 21607 | |
| 21608 | ; mode 13 [row 22] |
| 21609 | pmaddubsw m3, m6, [r5 + 17 * 16] |
| 21610 | pmulhrsw m3, [pw_1024] |
| 21611 | pmaddubsw m5, m2, [r5 + 17 * 16] |
| 21612 | pmulhrsw m5, [pw_1024] |
| 21613 | packuswb m3, m5 |
| 21614 | movu [r0 + 748 * 16], m3 |
| 21615 | pmaddubsw m3, m1, [r5 + 17 * 16] |
| 21616 | pmulhrsw m3, [pw_1024] |
| 21617 | pmaddubsw m5, m4, [r5 + 17 * 16] |
| 21618 | pmulhrsw m5, [pw_1024] |
| 21619 | packuswb m3, m5 |
| 21620 | movu [r0 + 749 * 16], m3 |
| 21621 | |
| 21622 | ; mode 13 [row 23] |
| 21623 | pmaddubsw m3, m6, [r5 + 8 * 16] |
| 21624 | pmulhrsw m3, [pw_1024] |
| 21625 | pmaddubsw m5, m2, [r5 + 8 * 16] |
| 21626 | pmulhrsw m5, [pw_1024] |
| 21627 | packuswb m3, m5 |
| 21628 | movu [r0 + 750 * 16], m3 |
| 21629 | pmaddubsw m3, m1, [r5 + 8 * 16] |
| 21630 | pmulhrsw m3, [pw_1024] |
| 21631 | pmaddubsw m5, m4, [r5 + 8 * 16] |
| 21632 | pmulhrsw m5, [pw_1024] |
| 21633 | packuswb m3, m5 |
| 21634 | movu [r0 + 751 * 16], m3 |
| 21635 | |
| 21636 | ; mode 14 [row 14] |
| 21637 | pslldq m7, 2 |
| 21638 | pinsrb m7, [r3 + 12], 1 |
| 21639 | pinsrb m7, [r3 + 15], 0 |
| 21640 | pmaddubsw m3, m7, [r5 + 29 * 16] |
| 21641 | pmulhrsw m3, [pw_1024] |
| 21642 | pmaddubsw m5, m2, [r5 + 29 * 16] |
| 21643 | pmulhrsw m5, [pw_1024] |
| 21644 | packuswb m3, m5 |
| 21645 | movu [r0 + 796 * 16], m3 |
| 21646 | pmaddubsw m3, m1, [r5 + 29 * 16] |
| 21647 | pmulhrsw m3, [pw_1024] |
| 21648 | pmaddubsw m5, m4, [r5 + 29 * 16] |
| 21649 | pmulhrsw m5, [pw_1024] |
| 21650 | packuswb m3, m5 |
| 21651 | movu [r0 + 797 * 16], m3 |
| 21652 | |
| 21653 | ; mode 14 [row 15] |
| 21654 | pmaddubsw m3, m7, [r5 + 16 * 16] |
| 21655 | pmulhrsw m3, [pw_1024] |
| 21656 | pmaddubsw m5, m2, [r5 + 16 * 16] |
| 21657 | pmulhrsw m5, [pw_1024] |
| 21658 | packuswb m3, m5 |
| 21659 | movu [r0 + 798 * 16], m3 |
| 21660 | pmaddubsw m3, m1, [r5 + 16 * 16] |
| 21661 | pmulhrsw m3, [pw_1024] |
| 21662 | pmaddubsw m5, m4, [r5 + 16 * 16] |
| 21663 | pmulhrsw m5, [pw_1024] |
| 21664 | packuswb m3, m5 |
| 21665 | movu [r0 + 799 * 16], m3 |
| 21666 | |
| 21667 | ; mode 14 [row 16] |
| 21668 | pmaddubsw m3, m7, [r5 + 3 * 16] |
| 21669 | pmulhrsw m3, [pw_1024] |
| 21670 | pmaddubsw m5, m2, [r5 + 3 * 16] |
| 21671 | pmulhrsw m5, [pw_1024] |
| 21672 | packuswb m3, m5 |
| 21673 | movu [r0 + 800 * 16], m3 |
| 21674 | pmaddubsw m3, m1, [r5 + 3 * 16] |
| 21675 | pmulhrsw m3, [pw_1024] |
| 21676 | pmaddubsw m5, m4, [r5 + 3 * 16] |
| 21677 | pmulhrsw m5, [pw_1024] |
| 21678 | packuswb m3, m5 |
| 21679 | movu [r0 + 801 * 16], m3 |
| 21680 | |
| 21681 | ; mode 15 [row 11] |
| 21682 | pmaddubsw m5, m2, [r5 + 20 * 16] |
| 21683 | pmulhrsw m5, [pw_1024] |
| 21684 | packuswb m5, m5 |
| 21685 | movh [r0 + 854 * 16 + 8], m5 |
| 21686 | pmaddubsw m3, m1, [r5 + 20 * 16] |
| 21687 | pmulhrsw m3, [pw_1024] |
| 21688 | pmaddubsw m5, m4, [r5 + 20 * 16] |
| 21689 | pmulhrsw m5, [pw_1024] |
| 21690 | packuswb m3, m5 |
| 21691 | movu [r0 + 855 * 16], m3 |
| 21692 | |
| 21693 | ; mode 15 [row 12] |
| 21694 | pmaddubsw m5, m2, [r5 + 3 * 16] |
| 21695 | pmulhrsw m5, [pw_1024] |
| 21696 | packuswb m5, m5 |
| 21697 | movh [r0 + 856 * 16 + 8], m5 |
| 21698 | pmaddubsw m3, m1, [r5 + 3 * 16] |
| 21699 | pmulhrsw m3, [pw_1024] |
| 21700 | pmaddubsw m5, m4, [r5 + 3 * 16] |
| 21701 | pmulhrsw m5, [pw_1024] |
| 21702 | packuswb m3, m5 |
| 21703 | movu [r0 + 857 * 16], m3 |
| 21704 | |
| 21705 | ; mode 13 [row 24] |
| 21706 | pslldq m6, 2 |
| 21707 | pinsrb m6, [r3 + 21], 1 |
| 21708 | pinsrb m6, [r3 + 25], 0 |
| 21709 | pmaddubsw m3, m6, [r5 + 31 * 16] |
| 21710 | pmulhrsw m3, [pw_1024] |
| 21711 | pslldq m2, 2 |
| 21712 | pinsrw m2, [r4 + 1], 0 |
| 21713 | pmaddubsw m5, m2, [r5 + 31 * 16] |
| 21714 | pmulhrsw m5, [pw_1024] |
| 21715 | packuswb m3, m5 |
| 21716 | movu [r0 + 752 * 16], m3 |
| 21717 | pslldq m1, 2 |
| 21718 | pinsrw m1, [r4 + 9], 0 |
| 21719 | pmaddubsw m3, m1, [r5 + 31 * 16] |
| 21720 | pmulhrsw m3, [pw_1024] |
| 21721 | pslldq m4, 2 |
| 21722 | pinsrw m4, [r4 + 17], 0 |
| 21723 | pmaddubsw m5, m4, [r5 + 31 * 16] |
| 21724 | pmulhrsw m5, [pw_1024] |
| 21725 | packuswb m3, m5 |
| 21726 | movu [r0 + 753 * 16], m3 |
| 21727 | |
| 21728 | ; mode 13 [row 25] |
| 21729 | pmaddubsw m3, m6, [r5 + 22 * 16] |
| 21730 | pmulhrsw m3, [pw_1024] |
| 21731 | pmaddubsw m5, m2, [r5 + 22 * 16] |
| 21732 | pmulhrsw m5, [pw_1024] |
| 21733 | packuswb m3, m5 |
| 21734 | movu [r0 + 754 * 16], m3 |
| 21735 | pmaddubsw m3, m1, [r5 + 22 * 16] |
| 21736 | pmulhrsw m3, [pw_1024] |
| 21737 | pmaddubsw m5, m4, [r5 + 22 * 16] |
| 21738 | pmulhrsw m5, [pw_1024] |
| 21739 | packuswb m3, m5 |
| 21740 | movu [r0 + 755 * 16], m3 |
| 21741 | |
| 21742 | ; mode 13 [row 26] |
| 21743 | pmaddubsw m3, m6, [r5 + 13 * 16] |
| 21744 | pmulhrsw m3, [pw_1024] |
| 21745 | pmaddubsw m5, m2, [r5 + 13 * 16] |
| 21746 | pmulhrsw m5, [pw_1024] |
| 21747 | packuswb m3, m5 |
| 21748 | movu [r0 + 756 * 16], m3 |
| 21749 | pmaddubsw m3, m1, [r5 + 13 * 16] |
| 21750 | pmulhrsw m3, [pw_1024] |
| 21751 | pmaddubsw m5, m4, [r5 + 13 * 16] |
| 21752 | pmulhrsw m5, [pw_1024] |
| 21753 | packuswb m3, m5 |
| 21754 | movu [r0 + 757 * 16], m3 |
| 21755 | |
| 21756 | ; mode 13 [row 27] |
| 21757 | pmaddubsw m3, m6, [r5 + 4 * 16] |
| 21758 | pmulhrsw m3, [pw_1024] |
| 21759 | pmaddubsw m5, m2, [r5 + 4 * 16] |
| 21760 | pmulhrsw m5, [pw_1024] |
| 21761 | packuswb m3, m5 |
| 21762 | movu [r0 + 758 * 16], m3 |
| 21763 | pmaddubsw m3, m1, [r5 + 4 * 16] |
| 21764 | pmulhrsw m3, [pw_1024] |
| 21765 | pmaddubsw m5, m4, [r5 + 4 * 16] |
| 21766 | pmulhrsw m5, [pw_1024] |
| 21767 | packuswb m3, m5 |
| 21768 | movu [r0 + 759 * 16], m3 |
| 21769 | |
| 21770 | ; mode 14 [row 17] |
| 21771 | pslldq m7, 2 |
| 21772 | pinsrb m7, [r3 + 15], 1 |
| 21773 | pinsrb m7, [r3 + 17], 0 |
| 21774 | pmaddubsw m3, m7, [r5 + 22 * 16] |
| 21775 | pmulhrsw m3, [pw_1024] |
| 21776 | pmaddubsw m5, m2, [r5 + 22 * 16] |
| 21777 | pmulhrsw m5, [pw_1024] |
| 21778 | packuswb m3, m5 |
| 21779 | movu [r0 + 802 * 16], m3 |
| 21780 | pmaddubsw m3, m1, [r5 + 22 * 16] |
| 21781 | pmulhrsw m3, [pw_1024] |
| 21782 | pmaddubsw m5, m4, [r5 + 22 * 16] |
| 21783 | pmulhrsw m5, [pw_1024] |
| 21784 | packuswb m3, m5 |
| 21785 | movu [r0 + 803 * 16], m3 |
| 21786 | |
| 21787 | ; mode 14 [row 18] |
| 21788 | pmaddubsw m3, m7, [r5 + 9 * 16] |
| 21789 | pmulhrsw m3, [pw_1024] |
| 21790 | pmaddubsw m5, m2, [r5 + 9 * 16] |
| 21791 | pmulhrsw m5, [pw_1024] |
| 21792 | packuswb m3, m5 |
| 21793 | movu [r0 + 804 * 16], m3 |
| 21794 | pmaddubsw m3, m1, [r5 + 9 * 16] |
| 21795 | pmulhrsw m3, [pw_1024] |
| 21796 | pmaddubsw m5, m4, [r5 + 9 * 16] |
| 21797 | pmulhrsw m5, [pw_1024] |
| 21798 | packuswb m3, m5 |
| 21799 | movu [r0 + 805 * 16], m3 |
| 21800 | |
| 21801 | ; mode 15 [row 13] |
| 21802 | pmaddubsw m5, m2, [r5 + 18 * 16] |
| 21803 | pmulhrsw m5, [pw_1024] |
| 21804 | packuswb m5, m5 |
| 21805 | movh [r0 + 858 * 16 + 8], m5 |
| 21806 | pmaddubsw m3, m1, [r5 + 18 * 16] |
| 21807 | pmulhrsw m3, [pw_1024] |
| 21808 | pmaddubsw m5, m4, [r5 + 18 * 16] |
| 21809 | pmulhrsw m5, [pw_1024] |
| 21810 | packuswb m3, m5 |
| 21811 | movu [r0 + 859 * 16], m3 |
| 21812 | |
| 21813 | ; mode 15 [row 14] |
| 21814 | pmaddubsw m5, m2, [r5 + 1 * 16] |
| 21815 | pmulhrsw m5, [pw_1024] |
| 21816 | packuswb m5, m5 |
| 21817 | movh [r0 + 860 * 16 + 8], m5 |
| 21818 | pmaddubsw m3, m1, [r5 + 1 * 16] |
| 21819 | pmulhrsw m3, [pw_1024] |
| 21820 | pmaddubsw m5, m4, [r5 + 1 * 16] |
| 21821 | pmulhrsw m5, [pw_1024] |
| 21822 | packuswb m3, m5 |
| 21823 | movu [r0 + 861 * 16], m3 |
| 21824 | |
| 21825 | ; mode 13 [row 28] |
| 21826 | pslldq m6, 2 |
| 21827 | pinsrb m6, [r3 + 25], 1 |
| 21828 | pinsrb m6, [r3 + 28], 0 |
| 21829 | pmaddubsw m3, m6, [r5 + 27 * 16] |
| 21830 | pmulhrsw m3, [pw_1024] |
| 21831 | pslldq m2, 2 |
| 21832 | pinsrw m2, [r4 + 0], 0 |
| 21833 | pmaddubsw m5, m2, [r5 + 27 * 16] |
| 21834 | pmulhrsw m5, [pw_1024] |
| 21835 | packuswb m3, m5 |
| 21836 | movu [r0 + 760 * 16], m3 |
| 21837 | pslldq m1, 2 |
| 21838 | pinsrw m1, [r4 + 8], 0 |
| 21839 | pmaddubsw m3, m1, [r5 + 27 * 16] |
| 21840 | pmulhrsw m3, [pw_1024] |
| 21841 | pslldq m4, 2 |
| 21842 | pinsrw m4, [r4 + 16], 0 |
| 21843 | pmaddubsw m5, m4, [r5 + 27 * 16] |
| 21844 | pmulhrsw m5, [pw_1024] |
| 21845 | packuswb m3, m5 |
| 21846 | movu [r0 + 761 * 16], m3 |
| 21847 | |
| 21848 | ; mode 13 [row 29] |
| 21849 | pmaddubsw m3, m6, [r5 + 18 * 16] |
| 21850 | pmulhrsw m3, [pw_1024] |
| 21851 | pmaddubsw m5, m2, [r5 + 18 * 16] |
| 21852 | pmulhrsw m5, [pw_1024] |
| 21853 | packuswb m3, m5 |
| 21854 | movu [r0 + 762 * 16], m3 |
| 21855 | pmaddubsw m3, m1, [r5 + 18 * 16] |
| 21856 | pmulhrsw m3, [pw_1024] |
| 21857 | pmaddubsw m5, m4, [r5 + 18 * 16] |
| 21858 | pmulhrsw m5, [pw_1024] |
| 21859 | packuswb m3, m5 |
| 21860 | movu [r0 + 763 * 16], m3 |
| 21861 | |
| 21862 | ; mode 13 [row 30] |
| 21863 | pmaddubsw m3, m6, [r5 + 9 * 16] |
| 21864 | pmulhrsw m3, [pw_1024] |
| 21865 | pmaddubsw m5, m2, [r5 + 9 * 16] |
| 21866 | pmulhrsw m5, [pw_1024] |
| 21867 | packuswb m3, m5 |
| 21868 | movu [r0 + 764 * 16], m3 |
| 21869 | pmaddubsw m3, m1, [r5 + 9 * 16] |
| 21870 | pmulhrsw m3, [pw_1024] |
| 21871 | pmaddubsw m5, m4, [r5 + 9 * 16] |
| 21872 | pmulhrsw m5, [pw_1024] |
| 21873 | packuswb m3, m5 |
| 21874 | movu [r0 + 765 * 16], m3 |
| 21875 | |
| 21876 | ; mode 14 [row 19] |
| 21877 | pslldq m7, 2 |
| 21878 | pinsrb m7, [r3 + 17], 1 |
| 21879 | pinsrb m7, [r3 + 20], 0 |
| 21880 | pmaddubsw m3, m7, [r5 + 28 * 16] |
| 21881 | pmulhrsw m3, [pw_1024] |
| 21882 | pmaddubsw m5, m2, [r5 + 28 * 16] |
| 21883 | pmulhrsw m5, [pw_1024] |
| 21884 | packuswb m3, m5 |
| 21885 | movu [r0 + 806 * 16], m3 |
| 21886 | pmaddubsw m3, m1, [r5 + 28 * 16] |
| 21887 | pmulhrsw m3, [pw_1024] |
| 21888 | pmaddubsw m5, m4, [r5 + 28 * 16] |
| 21889 | pmulhrsw m5, [pw_1024] |
| 21890 | packuswb m3, m5 |
| 21891 | movu [r0 + 807 * 16], m3 |
| 21892 | |
| 21893 | ; mode 14 [row 20] |
| 21894 | pmaddubsw m3, m7, [r5 + 15 * 16] |
| 21895 | pmulhrsw m3, [pw_1024] |
| 21896 | pmaddubsw m5, m2, [r5 + 15 * 16] |
| 21897 | pmulhrsw m5, [pw_1024] |
| 21898 | packuswb m3, m5 |
| 21899 | movu [r0 + 808 * 16], m3 |
| 21900 | pmaddubsw m3, m1, [r5 + 15 * 16] |
| 21901 | pmulhrsw m3, [pw_1024] |
| 21902 | pmaddubsw m5, m4, [r5 + 15 * 16] |
| 21903 | pmulhrsw m5, [pw_1024] |
| 21904 | packuswb m3, m5 |
| 21905 | movu [r0 + 809 * 16], m3 |
| 21906 | |
| 21907 | ; mode 14 [row 21] |
| 21908 | pmaddubsw m3, m7, [r5 + 2 * 16] |
| 21909 | pmulhrsw m3, [pw_1024] |
| 21910 | pmaddubsw m5, m2, [r5 + 2 * 16] |
| 21911 | pmulhrsw m5, [pw_1024] |
| 21912 | packuswb m3, m5 |
| 21913 | movu [r0 + 810 * 16], m3 |
| 21914 | pmaddubsw m3, m1, [r5 + 2 * 16] |
| 21915 | pmulhrsw m3, [pw_1024] |
| 21916 | pmaddubsw m5, m4, [r5 + 2 * 16] |
| 21917 | pmulhrsw m5, [pw_1024] |
| 21918 | packuswb m3, m5 |
| 21919 | movu [r0 + 811 * 16], m3 |
| 21920 | |
| 21921 | ; mode 15 [row 15] |
| 21922 | pmaddubsw m5, m2, [r5 + 16 * 16] |
| 21923 | pmulhrsw m5, [pw_1024] |
| 21924 | packuswb m5, m5 |
| 21925 | movh [r0 + 862 * 16 + 8], m5 |
| 21926 | pmaddubsw m3, m1, [r5 + 16 * 16] |
| 21927 | pmulhrsw m3, [pw_1024] |
| 21928 | pmaddubsw m5, m4, [r5 + 16 * 16] |
| 21929 | pmulhrsw m5, [pw_1024] |
| 21930 | packuswb m3, m5 |
| 21931 | movu [r0 + 863 * 16], m3 |
| 21932 | |
| 21933 | ; mode 14 [row 22] |
| 21934 | pslldq m7, 2 |
| 21935 | pinsrb m7, [r3 + 20], 1 |
| 21936 | pinsrb m7, [r3 + 22], 0 |
| 21937 | pmaddubsw m3, m7, [r5 + 21 * 16] |
| 21938 | pmulhrsw m3, [pw_1024] |
| 21939 | pslldq m2, 2 |
| 21940 | pinsrb m2, [r4 + 0], 1 |
| 21941 | pinsrb m2, [r3 + 2], 0 |
| 21942 | pmaddubsw m5, m2, [r5 + 21 * 16] |
| 21943 | pmulhrsw m5, [pw_1024] |
| 21944 | packuswb m3, m5 |
| 21945 | movu [r0 + 812 * 16], m3 |
| 21946 | pslldq m1, 2 |
| 21947 | pinsrw m1, [r4 + 7], 0 |
| 21948 | pmaddubsw m3, m1, [r5 + 21 * 16] |
| 21949 | pmulhrsw m3, [pw_1024] |
| 21950 | pslldq m4, 2 |
| 21951 | pinsrw m4, [r4 + 15], 0 |
| 21952 | pmaddubsw m5, m4, [r5 + 21 * 16] |
| 21953 | pmulhrsw m5, [pw_1024] |
| 21954 | packuswb m3, m5 |
| 21955 | movu [r0 + 813 * 16], m3 |
| 21956 | |
| 21957 | ; mode 14 [row 23] |
| 21958 | pmaddubsw m3, m7, [r5 + 8 * 16] |
| 21959 | pmulhrsw m3, [pw_1024] |
| 21960 | pmaddubsw m5, m2, [r5 + 8 * 16] |
| 21961 | pmulhrsw m5, [pw_1024] |
| 21962 | packuswb m3, m5 |
| 21963 | movu [r0 + 814 * 16], m3 |
| 21964 | pmaddubsw m3, m1, [r5 + 8 * 16] |
| 21965 | pmulhrsw m3, [pw_1024] |
| 21966 | pmaddubsw m5, m4, [r5 + 8 * 16] |
| 21967 | pmulhrsw m5, [pw_1024] |
| 21968 | packuswb m3, m5 |
| 21969 | movu [r0 + 815 * 16], m3 |
| 21970 | |
| 21971 | ; mode 15 [row 16] |
| 21972 | pmaddubsw m5, m2, [r5 + 31 * 16] |
| 21973 | pmulhrsw m5, [pw_1024] |
| 21974 | packuswb m5, m5 |
| 21975 | movh [r0 + 864 * 16 + 8], m5 |
| 21976 | pmaddubsw m3, m1, [r5 + 31 * 16] |
| 21977 | pmulhrsw m3, [pw_1024] |
| 21978 | pmaddubsw m5, m4, [r5 + 31 * 16] |
| 21979 | pmulhrsw m5, [pw_1024] |
| 21980 | packuswb m3, m5 |
| 21981 | movu [r0 + 865 * 16], m3 |
| 21982 | |
| 21983 | ; mode 15 [row 17] |
| 21984 | pmaddubsw m5, m2, [r5 + 14 * 16] |
| 21985 | pmulhrsw m5, [pw_1024] |
| 21986 | packuswb m5, m5 |
| 21987 | movh [r0 + 866 * 16 + 8], m5 |
| 21988 | pmaddubsw m3, m1, [r5 + 14 * 16] |
| 21989 | pmulhrsw m3, [pw_1024] |
| 21990 | pmaddubsw m5, m4, [r5 + 14 * 16] |
| 21991 | pmulhrsw m5, [pw_1024] |
| 21992 | packuswb m3, m5 |
| 21993 | movu [r0 + 867 * 16], m3 |
| 21994 | |
| 21995 | ; mode 14 [row 24] |
| 21996 | pslldq m7, 2 |
| 21997 | pinsrb m7, [r3 + 22], 1 |
| 21998 | pinsrb m7, [r3 + 25], 0 |
| 21999 | pmaddubsw m3, m7, [r5 + 27 * 16] |
| 22000 | pmulhrsw m3, [pw_1024] |
| 22001 | pslldq m2, 2 |
| 22002 | pinsrb m2, [r3 + 2], 1 |
| 22003 | pinsrb m2, [r3 + 5], 0 |
| 22004 | pmaddubsw m5, m2, [r5 + 27 * 16] |
| 22005 | pmulhrsw m5, [pw_1024] |
| 22006 | packuswb m3, m5 |
| 22007 | movu [r0 + 816 * 16], m3 |
| 22008 | pslldq m1, 2 |
| 22009 | pinsrw m1, [r4 + 6], 0 |
| 22010 | pmaddubsw m3, m1, [r5 + 27 * 16] |
| 22011 | pmulhrsw m3, [pw_1024] |
| 22012 | pslldq m4, 2 |
| 22013 | pinsrw m4, [r4 + 14], 0 |
| 22014 | pmaddubsw m5, m4, [r5 + 27 * 16] |
| 22015 | pmulhrsw m5, [pw_1024] |
| 22016 | packuswb m3, m5 |
| 22017 | movu [r0 + 817 * 16], m3 |
| 22018 | |
| 22019 | ; mode 14 [row 25] |
| 22020 | pmaddubsw m3, m7, [r5 + 14 * 16] |
| 22021 | pmulhrsw m3, [pw_1024] |
| 22022 | pmaddubsw m5, m2, [r5 + 14 * 16] |
| 22023 | pmulhrsw m5, [pw_1024] |
| 22024 | packuswb m3, m5 |
| 22025 | movu [r0 + 818 * 16], m3 |
| 22026 | pmaddubsw m3, m1, [r5 + 14 * 16] |
| 22027 | pmulhrsw m3, [pw_1024] |
| 22028 | pmaddubsw m5, m4, [r5 + 14 * 16] |
| 22029 | pmulhrsw m5, [pw_1024] |
| 22030 | packuswb m3, m5 |
| 22031 | movu [r0 + 819 * 16], m3 |
| 22032 | |
| 22033 | ; mode 14 [row 26] |
| 22034 | pmaddubsw m3, m7, [r5 + 1 * 16] |
| 22035 | pmulhrsw m3, [pw_1024] |
| 22036 | pmaddubsw m5, m2, [r5 + 1 * 16] |
| 22037 | pmulhrsw m5, [pw_1024] |
| 22038 | packuswb m3, m5 |
| 22039 | movu [r0 + 820 * 16], m3 |
| 22040 | pmaddubsw m3, m1, [r5 + 1 * 16] |
| 22041 | pmulhrsw m3, [pw_1024] |
| 22042 | pmaddubsw m5, m4, [r5 + 1 * 16] |
| 22043 | pmulhrsw m5, [pw_1024] |
| 22044 | packuswb m3, m5 |
| 22045 | movu [r0 + 821 * 16], m3 |
| 22046 | |
| 22047 | ; mode 15 [row 18] |
| 22048 | pinsrb m2, [r3 + 4], 0 |
| 22049 | pmaddubsw m5, m2, [r5 + 29 * 16] |
| 22050 | pmulhrsw m5, [pw_1024] |
| 22051 | packuswb m5, m5 |
| 22052 | movh [r0 + 868 * 16 + 8], m5 |
| 22053 | pmaddubsw m3, m1, [r5 + 29 * 16] |
| 22054 | pmulhrsw m3, [pw_1024] |
| 22055 | pmaddubsw m5, m4, [r5 + 29 * 16] |
| 22056 | pmulhrsw m5, [pw_1024] |
| 22057 | packuswb m3, m5 |
| 22058 | movu [r0 + 869 * 16], m3 |
| 22059 | |
| 22060 | ; mode 15 [row 19] |
| 22061 | pmaddubsw m5, m2, [r5 + 12 * 16] |
| 22062 | pmulhrsw m5, [pw_1024] |
| 22063 | packuswb m5, m5 |
| 22064 | movh [r0 + 870 * 16 + 8], m5 |
| 22065 | pmaddubsw m3, m1, [r5 + 12 * 16] |
| 22066 | pmulhrsw m3, [pw_1024] |
| 22067 | pmaddubsw m5, m4, [r5 + 12 * 16] |
| 22068 | pmulhrsw m5, [pw_1024] |
| 22069 | packuswb m3, m5 |
| 22070 | movu [r0 + 871 * 16], m3 |
| 22071 | |
| 22072 | ; mode 15 [row 20 - 8 to 15] |
| 22073 | pslldq m3, m2, 2 |
| 22074 | pinsrb m3, [r3 + 4], 1 |
| 22075 | pinsrb m3, [r3 + 6], 0 |
| 22076 | pmaddubsw m5, m3, [r5 + 27 * 16] |
| 22077 | pmulhrsw m5, [pw_1024] |
| 22078 | packuswb m5, m5 |
| 22079 | movh [r0 + 872 * 16 + 8], m5 |
| 22080 | |
| 22081 | ; mode 15 [row 21 - 8 to 15] |
| 22082 | pmaddubsw m5, m3, [r5 + 10 * 16] |
| 22083 | pmulhrsw m5, [pw_1024] |
| 22084 | packuswb m5, m5 |
| 22085 | movh [r0 + 874 * 16 + 8], m5 |
| 22086 | |
| 22087 | ; mode 15 [row 22 - 8 to 15] |
| 22088 | pslldq m3, 2 |
| 22089 | pinsrb m3, [r3 + 6], 1 |
| 22090 | pinsrb m3, [r3 + 8], 0 |
| 22091 | pmaddubsw m5, m3, [r5 + 25 * 16] |
| 22092 | pmulhrsw m5, [pw_1024] |
| 22093 | packuswb m5, m5 |
| 22094 | movh [r0 + 876 * 16 + 8], m5 |
| 22095 | |
| 22096 | ; mode 15 [row 23 - 8 to 15] |
| 22097 | pmaddubsw m5, m3, [r5 + 8 * 16] |
| 22098 | pmulhrsw m5, [pw_1024] |
| 22099 | packuswb m5, m5 |
| 22100 | movh [r0 + 878 * 16 + 8], m5 |
| 22101 | |
| 22102 | ; mode 15 [row 24 - 8 to 15] |
| 22103 | pslldq m3, 2 |
| 22104 | pinsrb m3, [r3 + 8], 1 |
| 22105 | pinsrb m3, [r3 + 9], 0 |
| 22106 | pmaddubsw m5, m3, [r5 + 23 * 16] |
| 22107 | pmulhrsw m5, [pw_1024] |
| 22108 | packuswb m5, m5 |
| 22109 | movh [r0 + 880 * 16 + 8], m5 |
| 22110 | |
| 22111 | ; mode 15 [row 25 - 8 to 15] |
| 22112 | pmaddubsw m5, m3, [r5 + 6 * 16] |
| 22113 | pmulhrsw m5, [pw_1024] |
| 22114 | packuswb m5, m5 |
| 22115 | movh [r0 + 882 * 16 + 8], m5 |
| 22116 | |
| 22117 | ; mode 15 [row 26 - 8 to 15] |
| 22118 | pslldq m3, 2 |
| 22119 | pinsrb m3, [r3 + 9], 1 |
| 22120 | pinsrb m3, [r3 + 11], 0 |
| 22121 | pmaddubsw m5, m3, [r5 + 21 * 16] |
| 22122 | pmulhrsw m5, [pw_1024] |
| 22123 | packuswb m5, m5 |
| 22124 | movh [r0 + 884 * 16 + 8], m5 |
| 22125 | |
| 22126 | ; mode 15 [row 27 - 8 to 15] |
| 22127 | pmaddubsw m5, m3, [r5 + 4 * 16] |
| 22128 | pmulhrsw m5, [pw_1024] |
| 22129 | packuswb m5, m5 |
| 22130 | movh [r0 + 886 * 16 + 8], m5 |
| 22131 | |
| 22132 | ; mode 15 [row 28 - 8 to 15] |
| 22133 | pslldq m3, 2 |
| 22134 | pinsrb m3, [r3 + 11], 1 |
| 22135 | pinsrb m3, [r3 + 13], 0 |
| 22136 | pmaddubsw m5, m3, [r5 + 19 * 16] |
| 22137 | pmulhrsw m5, [pw_1024] |
| 22138 | packuswb m5, m5 |
| 22139 | movh [r0 + 888 * 16 + 8], m5 |
| 22140 | |
| 22141 | ; mode 15 [row 29 - 8 to 15] |
| 22142 | pmaddubsw m5, m3, [r5 + 2 * 16] |
| 22143 | pmulhrsw m5, [pw_1024] |
| 22144 | packuswb m5, m5 |
| 22145 | movh [r0 + 890 * 16 + 8], m5 |
| 22146 | |
| 22147 | ; mode 15 [row 30 - 8 to 15] |
| 22148 | pslldq m3, 2 |
| 22149 | pinsrb m3, [r3 + 13], 1 |
| 22150 | pinsrb m3, [r3 + 15], 0 |
| 22151 | pmaddubsw m5, m3, [r5 + 17 * 16] |
| 22152 | pmulhrsw m5, [pw_1024] |
| 22153 | packuswb m5, m5 |
| 22154 | movh [r0 + 892 * 16 + 8], m5 |
| 22155 | |
| 22156 | ; mode 15 [row 31 - 8 to 15] |
| 22157 | pshufb m5, m3, [tab_S2] |
| 22158 | movh [r0 + 894 * 16 + 8], m5 |
| 22159 | |
| 22160 | ; mode 14 [row 27] |
| 22161 | pinsrb m2, [r3 + 5], 0 |
| 22162 | pslldq m7, 2 |
| 22163 | pinsrb m7, [r3 + 25], 1 |
| 22164 | pinsrb m7, [r3 + 27], 0 |
| 22165 | pmaddubsw m3, m7, [r5 + 20 * 16] |
| 22166 | pmulhrsw m3, [pw_1024] |
| 22167 | pslldq m2, 2 |
| 22168 | pinsrb m2, [r3 + 5], 1 |
| 22169 | pinsrb m2, [r3 + 7], 0 |
| 22170 | pmaddubsw m5, m2, [r5 + 20 * 16] |
| 22171 | pmulhrsw m5, [pw_1024] |
| 22172 | packuswb m3, m5 |
| 22173 | movu [r0 + 822 * 16], m3 |
| 22174 | pslldq m1, 2 |
| 22175 | pinsrw m1, [r4 + 5], 0 |
| 22176 | pmaddubsw m3, m1, [r5 + 20 * 16] |
| 22177 | pmulhrsw m3, [pw_1024] |
| 22178 | pslldq m4, 2 |
| 22179 | pinsrw m4, [r4 + 13], 0 |
| 22180 | pmaddubsw m5, m4, [r5 + 20 * 16] |
| 22181 | pmulhrsw m5, [pw_1024] |
| 22182 | packuswb m3, m5 |
| 22183 | movu [r0 + 823 * 16], m3 |
| 22184 | |
| 22185 | ; mode 15 [row 20 - 16 to 31] |
| 22186 | pmaddubsw m3, m1, [r5 + 27 * 16] |
| 22187 | pmulhrsw m3, [pw_1024] |
| 22188 | pmaddubsw m5, m4, [r5 + 27 * 16] |
| 22189 | pmulhrsw m5, [pw_1024] |
| 22190 | packuswb m3, m5 |
| 22191 | movu [r0 + 873 * 16], m3 |
| 22192 | |
| 22193 | ; mode 15 [row 21 - 16 to 31] |
| 22194 | pmaddubsw m3, m1, [r5 + 10 * 16] |
| 22195 | pmulhrsw m3, [pw_1024] |
| 22196 | pmaddubsw m5, m4, [r5 + 10 * 16] |
| 22197 | pmulhrsw m5, [pw_1024] |
| 22198 | packuswb m3, m5 |
| 22199 | movu [r0 + 875 * 16], m3 |
| 22200 | |
| 22201 | ; mode 14 [row 28] |
| 22202 | pmaddubsw m3, m7, [r5 + 7 * 16] |
| 22203 | pmulhrsw m3, [pw_1024] |
| 22204 | pmaddubsw m5, m2, [r5 + 7 * 16] |
| 22205 | pmulhrsw m5, [pw_1024] |
| 22206 | packuswb m3, m5 |
| 22207 | movu [r0 + 824 * 16], m3 |
| 22208 | pmaddubsw m3, m1, [r5 + 7 * 16] |
| 22209 | pmulhrsw m3, [pw_1024] |
| 22210 | pmaddubsw m5, m4, [r5 + 7 * 16] |
| 22211 | pmulhrsw m5, [pw_1024] |
| 22212 | packuswb m3, m5 |
| 22213 | movu [r0 + 825 * 16], m3 |
| 22214 | |
| 22215 | ; mode 14 [row 29] |
| 22216 | pslldq m7, 2 |
| 22217 | pinsrb m7, [r3 + 27], 1 |
| 22218 | pinsrb m7, [r3 + 30], 0 |
| 22219 | pmaddubsw m3, m7, [r5 + 26 * 16] |
| 22220 | pmulhrsw m3, [pw_1024] |
| 22221 | pslldq m2, 2 |
| 22222 | pinsrb m2, [r3 + 7], 1 |
| 22223 | pinsrb m2, [r3 + 10], 0 |
| 22224 | pmaddubsw m5, m2, [r5 + 26 * 16] |
| 22225 | pmulhrsw m5, [pw_1024] |
| 22226 | packuswb m3, m5 |
| 22227 | movu [r0 + 826 * 16], m3 |
| 22228 | pslldq m1, 2 |
| 22229 | pinsrw m1, [r4 + 4], 0 |
| 22230 | pmaddubsw m3, m1, [r5 + 26 * 16] |
| 22231 | pmulhrsw m3, [pw_1024] |
| 22232 | pslldq m4, 2 |
| 22233 | pinsrw m4, [r4 + 12], 0 |
| 22234 | pmaddubsw m5, m4, [r5 + 26 * 16] |
| 22235 | pmulhrsw m5, [pw_1024] |
| 22236 | packuswb m3, m5 |
| 22237 | movu [r0 + 827 * 16], m3 |
| 22238 | |
| 22239 | ; mode 14 [row 30] |
| 22240 | pmaddubsw m3, m7, [r5 + 13 * 16] |
| 22241 | pmulhrsw m3, [pw_1024] |
| 22242 | pmaddubsw m5, m2, [r5 + 13 * 16] |
| 22243 | pmulhrsw m5, [pw_1024] |
| 22244 | packuswb m3, m5 |
| 22245 | movu [r0 + 828 * 16], m3 |
| 22246 | pmaddubsw m3, m1, [r5 + 13 * 16] |
| 22247 | pmulhrsw m3, [pw_1024] |
| 22248 | pmaddubsw m5, m4, [r5 + 13 * 16] |
| 22249 | pmulhrsw m5, [pw_1024] |
| 22250 | packuswb m3, m5 |
| 22251 | movu [r0 + 829 * 16], m3 |
| 22252 | |
| 22253 | ; mode 15 [row 22 - 16 to 31] |
| 22254 | pmaddubsw m3, m1, [r5 + 25 * 16] |
| 22255 | pmulhrsw m3, [pw_1024] |
| 22256 | pmaddubsw m5, m4, [r5 + 25 * 16] |
| 22257 | pmulhrsw m5, [pw_1024] |
| 22258 | packuswb m3, m5 |
| 22259 | movu [r0 + 877 * 16], m3 |
| 22260 | |
| 22261 | ; mode 15 [row 23 - 16 to 31] |
| 22262 | pmaddubsw m3, m1, [r5 + 8 * 16] |
| 22263 | pmulhrsw m3, [pw_1024] |
| 22264 | pmaddubsw m5, m4, [r5 + 8 * 16] |
| 22265 | pmulhrsw m5, [pw_1024] |
| 22266 | packuswb m3, m5 |
| 22267 | movu [r0 + 879 * 16], m3 |
| 22268 | |
| 22269 | ; mode 14 [row 31] |
| 22270 | pshufb m3, m7, [tab_S2] |
| 22271 | movh [r0 + 830 * 16], m3 |
| 22272 | pshufb m3, m2, [tab_S2] |
| 22273 | movh [r0 + 830 * 16 + 8], m3 |
| 22274 | pshufb m3, m1, [tab_S2] |
| 22275 | movh [r0 + 831 * 16], m3 |
| 22276 | pshufb m3, m4, [tab_S2] |
| 22277 | movh [r0 + 831 * 16 + 8], m3 |
| 22278 | |
| 22279 | ; mode 13 [row 31] |
| 22280 | pshufb m0, m6, [tab_S2] |
| 22281 | movh [r0 + 766 * 16], m0 |
| 22282 | movh m0, [r4] |
| 22283 | movh [r0 + 766 * 16 + 8], m0 |
| 22284 | movu m0, [r4 + 8] |
| 22285 | movu [r0 + 767 * 16], m0 |
| 22286 | |
| 22287 | ; mode 15 [row 24 - 16 to 31] |
| 22288 | pslldq m1, 2 |
| 22289 | pinsrw m1, [r4 + 3], 0 |
| 22290 | pmaddubsw m3, m1, [r5 + 23 * 16] |
| 22291 | pmulhrsw m3, [pw_1024] |
| 22292 | pslldq m4, 2 |
| 22293 | pinsrw m4, [r4 + 11], 0 |
| 22294 | pmaddubsw m5, m4, [r5 + 23 * 16] |
| 22295 | pmulhrsw m5, [pw_1024] |
| 22296 | packuswb m3, m5 |
| 22297 | movu [r0 + 881 * 16], m3 |
| 22298 | |
| 22299 | ; mode 15 [row 25 - 16 to 31] |
| 22300 | pmaddubsw m3, m1, [r5 + 6 * 16] |
| 22301 | pmulhrsw m3, [pw_1024] |
| 22302 | pmaddubsw m5, m4, [r5 + 6 * 16] |
| 22303 | pmulhrsw m5, [pw_1024] |
| 22304 | packuswb m3, m5 |
| 22305 | movu [r0 + 883 * 16], m3 |
| 22306 | |
| 22307 | ; mode 15 [row 26 - 16 to 31] |
| 22308 | pslldq m1, 2 |
| 22309 | pinsrw m1, [r4 + 2], 0 |
| 22310 | pmaddubsw m3, m1, [r5 + 21 * 16] |
| 22311 | pmulhrsw m3, [pw_1024] |
| 22312 | pslldq m4, 2 |
| 22313 | pinsrw m4, [r4 + 10], 0 |
| 22314 | pmaddubsw m5, m4, [r5 + 21 * 16] |
| 22315 | pmulhrsw m5, [pw_1024] |
| 22316 | packuswb m3, m5 |
| 22317 | movu [r0 + 885 * 16], m3 |
| 22318 | |
| 22319 | ; mode 15 [row 27 - 16 to 31] |
| 22320 | pmaddubsw m3, m1, [r5 + 4 * 16] |
| 22321 | pmulhrsw m3, [pw_1024] |
| 22322 | pmaddubsw m5, m4, [r5 + 4 * 16] |
| 22323 | pmulhrsw m5, [pw_1024] |
| 22324 | packuswb m3, m5 |
| 22325 | movu [r0 + 887 * 16], m3 |
| 22326 | |
| 22327 | ; mode 15 [row 28 - 16 to 31] |
| 22328 | pslldq m1, 2 |
| 22329 | pinsrw m1, [r4 + 1], 0 |
| 22330 | pmaddubsw m3, m1, [r5 + 19 * 16] |
| 22331 | pmulhrsw m3, [pw_1024] |
| 22332 | pslldq m4, 2 |
| 22333 | pinsrw m4, [r4 + 9], 0 |
| 22334 | pmaddubsw m5, m4, [r5 + 19 * 16] |
| 22335 | pmulhrsw m5, [pw_1024] |
| 22336 | packuswb m3, m5 |
| 22337 | movu [r0 + 889 * 16], m3 |
| 22338 | |
| 22339 | ; mode 15 [row 29 - 16 to 31] |
| 22340 | pmaddubsw m3, m1, [r5 + 2 * 16] |
| 22341 | pmulhrsw m3, [pw_1024] |
| 22342 | pmaddubsw m5, m4, [r5 + 2 * 16] |
| 22343 | pmulhrsw m5, [pw_1024] |
| 22344 | packuswb m3, m5 |
| 22345 | movu [r0 + 891 * 16], m3 |
| 22346 | |
| 22347 | ; mode 15 [row 30 - 16 to 31] |
| 22348 | pslldq m1, 2 |
| 22349 | pinsrw m1, [r4 + 0], 0 |
| 22350 | pmaddubsw m3, m1, [r5 + 17 * 16] |
| 22351 | pmulhrsw m3, [pw_1024] |
| 22352 | pslldq m4, 2 |
| 22353 | pinsrw m4, [r4 + 8], 0 |
| 22354 | pmaddubsw m5, m4, [r5 + 17 * 16] |
| 22355 | pmulhrsw m5, [pw_1024] |
| 22356 | packuswb m3, m5 |
| 22357 | movu [r0 + 893 * 16], m3 |
| 22358 | |
| 22359 | ; mode 15 [row 31 - 16 to 31] |
| 22360 | pshufb m5, m1, [tab_S2] |
| 22361 | movh [r0 + 895 * 16], m5 |
| 22362 | pshufb m5, m4, [tab_S2] |
| 22363 | movh [r0 + 895 * 16 + 8], m5 |
| 22364 | |
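| 22365 | ; mode 16 [row 0] - m6 = weight entry from r5, m7 = pw_1024; m0/m2/m1/m4 = interleaved byte pairs loaded from r4 + 0/8/16/24 (and +1) |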
| 22366 | movu m6, [r5 + 11 * 16] |
| 22367 | movu m7, [pw_1024] |
| 22368 | movh m0, [r4 ] |
| 22369 | movh m1, [r4 + 1 ] |
| 22370 | punpcklbw m0, m1 |
| 22371 | pmaddubsw m1, m0, m6 |
| 22372 | pmulhrsw m1, m7 |
| 22373 | movh m2, [r4 + 8] |
| 22374 | movh m3, [r4 + 9] |
| 22375 | punpcklbw m2, m3 |
| 22376 | pmaddubsw m3, m2, m6 |
| 22377 | pmulhrsw m3, m7 |
| 22378 | packuswb m1, m3 |
| 22379 | movu [r0 + 896 * 16], m1 |
| 22380 | |
| 22381 | movh m1, [r4 + 16] |
| 22382 | movh m3, [r4 + 17] |
| 22383 | punpcklbw m1, m3 |
| 22384 | pmaddubsw m3, m1, m6 |
| 22385 | pmulhrsw m3, m7 |
| 22386 | movh m4, [r4 + 24] |
| 22387 | movh m5, [r4 + 25] |
| 22388 | punpcklbw m4, m5 |
| 22389 | pmaddubsw m5, m4, m6 |
| 22390 | pmulhrsw m5, m7 |
| 22391 | packuswb m3, m5 |
| 22392 | movu [r0 + 897 * 16], m3 |
| 22393 | |
| 22394 | ; mode16 [row 1] |
| 22395 | movu m6, [r5 + 22 * 16] |
| 22396 | pslldq m0, 2 |
| 22397 | pinsrb m0, [r4], 1 |
| 22398 | pinsrb m0, [r3 + 2], 0 |
| 22399 | pmaddubsw m3, m0, m6 |
| 22400 | pmulhrsw m3, m7 |
| 22401 | pslldq m2, 2 |
| 22402 | pinsrw m2, [r4 + 7], 0 |
| 22403 | pmaddubsw m5, m2, m6 |
| 22404 | pmulhrsw m5, m7 |
| 22405 | packuswb m3, m5 |
| 22406 | movu [r0 + 898 * 16], m3 |
| 22407 | |
| 22408 | pslldq m1, 2 |
| 22409 | pinsrw m1, [r4 + 15], 0 |
| 22410 | pmaddubsw m3, m1, m6 |
| 22411 | pmulhrsw m3, m7 |
| 22412 | pslldq m4, 2 |
| 22413 | pinsrw m4, [r4 + 23], 0 |
| 22414 | pmaddubsw m5, m4, m6 |
| 22415 | pmulhrsw m5, m7 |
| 22416 | packuswb m3, m5 |
| 22417 | movu [r0 + 899 * 16], m3 |
| 22418 | |
| 22419 | ; mode16 [row 2] |
| 22420 | movu m6, [r5 + 1 * 16] |
| 22421 | pmaddubsw m3, m0, m6 |
| 22422 | pmulhrsw m3, m7 |
| 22423 | pmaddubsw m5, m2, m6 |
| 22424 | pmulhrsw m5, m7 |
| 22425 | packuswb m3, m5 |
| 22426 | movu [r0 + 900 * 16], m3 |
| 22427 | |
| 22428 | pmaddubsw m3, m1, m6 |
| 22429 | pmulhrsw m3, m7 |
| 22430 | pmaddubsw m5, m4, m6 |
| 22431 | pmulhrsw m5, m7 |
| 22432 | packuswb m3, m5 |
| 22433 | movu [r0 + 901 * 16], m3 |
| 22434 | |
| 22435 | ; mode16 [row 3] |
| 22436 | movu m6, [r5 + 12 * 16] |
| 22437 | pslldq m0, 2 |
| 22438 | pinsrb m0, [r3 + 2], 1 |
| 22439 | pinsrb m0, [r3 + 3], 0 |
| 22440 | pmaddubsw m3, m0, m6 |
| 22441 | pmulhrsw m3, m7 |
| 22442 | pslldq m2, 2 |
| 22443 | pinsrw m2, [r4 + 6], 0 |
| 22444 | pmaddubsw m5, m2, m6 |
| 22445 | pmulhrsw m5, m7 |
| 22446 | packuswb m3, m5 |
| 22447 | movu [r0 + 902 * 16], m3 |
| 22448 | |
| 22449 | pslldq m1, 2 |
| 22450 | pinsrw m1, [r4 + 14], 0 |
| 22451 | pmaddubsw m3, m1, m6 |
| 22452 | pmulhrsw m3, m7 |
| 22453 | pslldq m4, 2 |
| 22454 | pinsrw m4, [r4 + 22], 0 |
| 22455 | pmaddubsw m5, m4, m6 |
| 22456 | pmulhrsw m5, m7 |
| 22457 | packuswb m3, m5 |
| 22458 | movu [r0 + 903 * 16], m3 |
| 22459 | |
| 22460 | ; mode16 [row 4] |
| 22461 | movu m6, [r5 + 23 * 16] |
| 22462 | pslldq m0, 2 |
| 22463 | pinsrb m0, [r3 + 3], 1 |
| 22464 | pinsrb m0, [r3 + 5], 0 |
| 22465 | pmaddubsw m3, m0, m6 |
| 22466 | pmulhrsw m3, m7 |
| 22467 | pslldq m2, 2 |
| 22468 | pinsrw m2, [r4 + 5], 0 |
| 22469 | pmaddubsw m5, m2, m6 |
| 22470 | pmulhrsw m5, m7 |
| 22471 | packuswb m3, m5 |
| 22472 | movu [r0 + 904 * 16], m3 |
| 22473 | |
| 22474 | pslldq m1, 2 |
| 22475 | pinsrw m1, [r4 + 13], 0 |
| 22476 | pmaddubsw m3, m1, m6 |
| 22477 | pmulhrsw m3, m7 |
| 22478 | pslldq m4, 2 |
| 22479 | pinsrw m4, [r4 + 21], 0 |
| 22480 | pmaddubsw m5, m4, m6 |
| 22481 | pmulhrsw m5, m7 |
| 22482 | packuswb m3, m5 |
| 22483 | movu [r0 + 905 * 16], m3 |
| 22484 | |
| 22485 | ; mode16 [row 5] |
| 22486 | movu m6, [r5 + 2 * 16] |
| 22487 | pmaddubsw m3, m0, m6 |
| 22488 | pmulhrsw m3, m7 |
| 22489 | pmaddubsw m5, m2, m6 |
| 22490 | pmulhrsw m5, m7 |
| 22491 | packuswb m3, m5 |
| 22492 | movu [r0 + 906 * 16], m3 |
| 22493 | |
| 22494 | pmaddubsw m3, m1, m6 |
| 22495 | pmulhrsw m3, m7 |
| 22496 | pmaddubsw m5, m4, m6 |
| 22497 | pmulhrsw m5, m7 |
| 22498 | packuswb m3, m5 |
| 22499 | movu [r0 + 907 * 16], m3 |
| 22500 | |
| 22501 | ; mode16 [row 6] - slots 908 (bytes 0..15) and 909 (bytes 16..31) |
| 22502 | movu m6, [r5 + 13 * 16] ; m6 = weight entry 13 of the table at r5 |
| 22503 | pslldq m0, 2 ; make room for one new byte pair at the bottom of m0 |
| 22504 | pinsrb m0, [r3 + 5], 1 |
| 22505 | pinsrb m0, [r3 + 6], 0 |
| 22506 | pmaddubsw m3, m0, m6 |
| 22507 | pmulhrsw m3, m7 ; m7 was loaded from pw_1024 at row 0 |
| 22508 | pslldq m2, 2 |
| 22509 | pinsrw m2, [r4 + 4], 0 ; one word load: byte 0 = [r4 + 4], byte 1 = [r4 + 5], same form as the sibling rows |
| 22511 | pmaddubsw m5, m2, m6 |
| 22512 | pmulhrsw m5, m7 |
| 22513 | packuswb m3, m5 |
| 22514 | movu [r0 + 908 * 16], m3 |
| 22515 | pslldq m1, 2 |
| 22516 | pinsrw m1, [r4 + 12], 0 |
| 22517 | pmaddubsw m3, m1, m6 |
| 22518 | pmulhrsw m3, m7 |
| 22519 | pslldq m4, 2 |
| 22520 | pinsrw m4, [r4 + 20], 0 |
| 22521 | pmaddubsw m5, m4, m6 |
| 22522 | pmulhrsw m5, m7 |
| 22523 | packuswb m3, m5 |
| 22524 | movu [r0 + 909 * 16], m3 |
| 22525 | |
| 22526 | ; mode16 [row 7] |
| 22527 | movu m6, [r5 + 24 * 16] |
| 22528 | pslldq m0, 2 |
| 22529 | pinsrb m0, [r3 + 6], 1 |
| 22530 | pinsrb m0, [r3 + 8], 0 |
| 22531 | pmaddubsw m3, m0, m6 |
| 22532 | pmulhrsw m3, m7 |
| 22533 | pslldq m2, 2 |
| 22534 | pinsrw m2, [r4 + 3], 0 |
| 22535 | pmaddubsw m5, m2, m6 |
| 22536 | pmulhrsw m5, m7 |
| 22537 | packuswb m3, m5 |
| 22538 | movu [r0 + 910 * 16], m3 |
| 22539 | |
| 22540 | pslldq m1, 2 |
| 22541 | pinsrw m1, [r4 + 11], 0 |
| 22542 | pmaddubsw m3, m1, m6 |
| 22543 | pmulhrsw m3, m7 |
| 22544 | pslldq m4, 2 |
| 22545 | pinsrw m4, [r4 + 19], 0 |
| 22546 | pmaddubsw m5, m4, m6 |
| 22547 | pmulhrsw m5, m7 |
| 22548 | packuswb m3, m5 |
| 22549 | movu [r0 + 911 * 16], m3 |
| 22550 | |
| 22551 | ; mode16 [row 8] |
| 22552 | movu m6, [r5 + 3 * 16] |
| 22553 | pmaddubsw m3, m0, m6 |
| 22554 | pmulhrsw m3, m7 |
| 22555 | pmaddubsw m5, m2, m6 |
| 22556 | pmulhrsw m5, m7 |
| 22557 | packuswb m3, m5 |
| 22558 | movu [r0 + 912 * 16], m3 |
| 22559 | |
| 22560 | pmaddubsw m3, m1, m6 |
| 22561 | pmulhrsw m3, m7 |
| 22562 | pmaddubsw m5, m4, m6 |
| 22563 | pmulhrsw m5, m7 |
| 22564 | packuswb m3, m5 |
| 22565 | movu [r0 + 913 * 16], m3 |
| 22566 | |
| 22567 | ; mode16 [row 9] |
| 22568 | movu m6, [r5 + 14 * 16] |
| 22569 | pslldq m0, 2 |
| 22570 | pinsrb m0, [r3 + 8], 1 |
| 22571 | pinsrb m0, [r3 + 9], 0 |
| 22572 | pmaddubsw m3, m0, m6 |
| 22573 | pmulhrsw m3, m7 |
| 22574 | pslldq m2, 2 |
| 22575 | pinsrw m2, [r4 + 2], 0 |
| 22576 | pmaddubsw m5, m2, m6 |
| 22577 | pmulhrsw m5, m7 |
| 22578 | packuswb m3, m5 |
| 22579 | movu [r0 + 914 * 16], m3 |
| 22580 | |
| 22581 | pslldq m1, 2 |
| 22582 | pinsrw m1, [r4 + 10], 0 |
| 22583 | pmaddubsw m3, m1, m6 |
| 22584 | pmulhrsw m3, m7 |
| 22585 | pslldq m4, 2 |
| 22586 | pinsrw m4, [r4 + 18], 0 |
| 22587 | pmaddubsw m5, m4, m6 |
| 22588 | pmulhrsw m5, m7 |
| 22589 | packuswb m3, m5 |
| 22590 | movu [r0 + 915 * 16], m3 |
| 22591 | |
| 22592 | ; mode16 [row 10] |
| 22593 | movu m6, [r5 + 25 * 16] |
| 22594 | pslldq m0, 2 |
| 22595 | pinsrb m0, [r3 + 9], 1 |
| 22596 | pinsrb m0, [r3 + 11], 0 |
| 22597 | pmaddubsw m3, m0, m6 |
| 22598 | pmulhrsw m3, m7 |
| 22599 | pslldq m2, 2 |
| 22600 | pinsrw m2, [r4 + 1], 0 |
| 22601 | pmaddubsw m5, m2, m6 |
| 22602 | pmulhrsw m5, m7 |
| 22603 | packuswb m3, m5 |
| 22604 | movu [r0 + 916 * 16], m3 |
| 22605 | |
| 22606 | pslldq m1, 2 |
| 22607 | pinsrw m1, [r4 + 9], 0 |
| 22608 | pmaddubsw m3, m1, m6 |
| 22609 | pmulhrsw m3, m7 |
| 22610 | pslldq m4, 2 |
| 22611 | pinsrb m4, [r4 + 18], 1 |
| 22612 | pinsrb m4, [r4 + 17], 0 |
| 22613 | pmaddubsw m5, m4, m6 |
| 22614 | pmulhrsw m5, m7 |
| 22615 | packuswb m3, m5 |
| 22616 | movu [r0 + 917 * 16], m3 |
| 22617 | |
| 22618 | ; mode16 [row 11] |
| 22619 | movu m6, [r5 + 4 * 16] |
| 22620 | pmaddubsw m3, m0, m6 |
| 22621 | pmulhrsw m3, m7 |
| 22622 | pmaddubsw m5, m2, m6 |
| 22623 | pmulhrsw m5, m7 |
| 22624 | packuswb m3, m5 |
| 22625 | movu [r0 + 918 * 16], m3 |
| 22626 | |
| 22627 | pmaddubsw m3, m1, m6 |
| 22628 | pmulhrsw m3, m7 |
| 22629 | pmaddubsw m5, m4, m6 |
| 22630 | pmulhrsw m5, m7 |
| 22631 | packuswb m3, m5 |
| 22632 | movu [r0 + 919 * 16], m3 |
| 22633 | |
| 22634 | ; mode16 [row 12] |
| 22635 | movu m6, [r5 + 15 * 16] |
| 22636 | pslldq m0, 2 |
| 22637 | pinsrb m0, [r3 + 11], 1 |
| 22638 | pinsrb m0, [r3 + 12], 0 |
| 22639 | pmaddubsw m3, m0, m6 |
| 22640 | pmulhrsw m3, m7 |
| 22641 | pslldq m2, 2 |
| 22642 | pinsrw m2, [r4 + 0], 0 |
| 22643 | pmaddubsw m5, m2, m6 |
| 22644 | pmulhrsw m5, m7 |
| 22645 | packuswb m3, m5 |
| 22646 | movu [r0 + 920 * 16], m3 |
| 22647 | |
| 22648 | pslldq m1, 2 |
| 22649 | pinsrw m1, [r4 + 8], 0 |
| 22650 | pmaddubsw m3, m1, m6 |
| 22651 | pmulhrsw m3, m7 |
| 22652 | pslldq m4, 2 |
| 22653 | pinsrw m4, [r4 + 16], 0 |
| 22654 | pmaddubsw m5, m4, m6 |
| 22655 | pmulhrsw m5, m7 |
| 22656 | packuswb m3, m5 |
| 22657 | movu [r0 + 921 * 16], m3 |
| 22658 | |
| 22659 | ; mode16 [row 13] |
| 22660 | movu m6, [r5 + 26 * 16] |
| 22661 | pslldq m0, 2 |
| 22662 | pinsrb m0, [r3 + 12], 1 |
| 22663 | pinsrb m0, [r3 + 14], 0 |
| 22664 | pmaddubsw m3, m0, m6 |
| 22665 | pmulhrsw m3, m7 |
| 22666 | pslldq m2, 2 |
| 22667 | pinsrb m2, [r4 + 0], 1 |
| 22668 | pinsrb m2, [r3 + 2], 0 |
| 22669 | pmaddubsw m5, m2, m6 |
| 22670 | pmulhrsw m5, m7 |
| 22671 | packuswb m3, m5 |
| 22672 | movu [r0 + 922 * 16], m3 |
| 22673 | |
| 22674 | pslldq m1, 2 |
| 22675 | pinsrw m1, [r4 + 7], 0 |
| 22676 | pmaddubsw m3, m1, m6 |
| 22677 | pmulhrsw m3, m7 |
| 22678 | pslldq m4, 2 |
| 22679 | pinsrw m4, [r4 + 15], 0 |
| 22680 | pmaddubsw m5, m4, m6 |
| 22681 | pmulhrsw m5, m7 |
| 22682 | packuswb m3, m5 |
| 22683 | movu [r0 + 923 * 16], m3 |
| 22684 | |
| 22685 | ; mode16 [row 14] |
| 22686 | movu m6, [r5 + 5 * 16] |
| 22687 | pmaddubsw m3, m0, m6 |
| 22688 | pmulhrsw m3, m7 |
| 22689 | pmaddubsw m5, m2, m6 |
| 22690 | pmulhrsw m5, m7 |
| 22691 | packuswb m3, m5 |
| 22692 | movu [r0 + 924 * 16], m3 |
| 22693 | |
| 22694 | pmaddubsw m3, m1, m6 |
| 22695 | pmulhrsw m3, m7 |
| 22696 | pmaddubsw m5, m4, m6 |
| 22697 | pmulhrsw m5, m7 |
| 22698 | packuswb m3, m5 |
| 22699 | movu [r0 + 925 * 16], m3 |
| 22700 | |
| 22701 | ; mode16 [row 15] |
| 22702 | movu m6, [r5 + 16 * 16] |
| 22703 | pslldq m0, 2 |
| 22704 | pinsrb m0, [r3 + 14], 1 |
| 22705 | pinsrb m0, [r3 + 15], 0 |
| 22706 | pmaddubsw m3, m0, m6 |
| 22707 | pmulhrsw m3, m7 |
| 22708 | pslldq m2, 2 |
| 22709 | pinsrb m2, [r3 + 2], 1 |
| 22710 | pinsrb m2, [r3 + 3], 0 |
| 22711 | pmaddubsw m5, m2, m6 |
| 22712 | pmulhrsw m5, m7 |
| 22713 | packuswb m3, m5 |
| 22714 | movu [r0 + 926 * 16], m3 |
| 22715 | |
| 22716 | pslldq m1, 2 |
| 22717 | pinsrw m1, [r4 + 6], 0 |
| 22718 | pmaddubsw m3, m1, m6 |
| 22719 | pmulhrsw m3, m7 |
| 22720 | pslldq m4, 2 |
| 22721 | pinsrw m4, [r4 + 14], 0 |
| 22722 | pmaddubsw m5, m4, m6 |
| 22723 | pmulhrsw m5, m7 |
| 22724 | packuswb m3, m5 |
| 22725 | movu [r0 + 927 * 16], m3 |
| 22726 | |
| 22727 | ; mode16 [row 16] |
| 22728 | movu m6, [r5 + 27 * 16] |
| 22729 | pslldq m0, 2 |
| 22730 | pinsrb m0, [r3 + 15], 1 |
| 22731 | pinsrb m0, [r3 + 17], 0 |
| 22732 | pmaddubsw m3, m0, m6 |
| 22733 | pmulhrsw m3, m7 |
| 22734 | pslldq m2, 2 |
| 22735 | pinsrb m2, [r3 + 3], 1 |
| 22736 | pinsrb m2, [r3 + 5], 0 |
| 22737 | pmaddubsw m5, m2, m6 |
| 22738 | pmulhrsw m5, m7 |
| 22739 | packuswb m3, m5 |
| 22740 | movu [r0 + 928 * 16], m3 |
| 22741 | |
| 22742 | pslldq m1, 2 |
| 22743 | pinsrw m1, [r4 + 5], 0 |
| 22744 | pmaddubsw m3, m1, m6 |
| 22745 | pmulhrsw m3, m7 |
| 22746 | pslldq m4, 2 |
| 22747 | pinsrw m4, [r4 + 13], 0 |
| 22748 | pmaddubsw m5, m4, m6 |
| 22749 | pmulhrsw m5, m7 |
| 22750 | packuswb m3, m5 |
| 22751 | movu [r0 + 929 * 16], m3 |
| 22752 | |
| 22753 | ; mode16 [row 17] |
| 22754 | movu m6, [r5 + 6 * 16] |
| 22755 | pmaddubsw m3, m0, m6 |
| 22756 | pmulhrsw m3, m7 |
| 22757 | pmaddubsw m5, m2, m6 |
| 22758 | pmulhrsw m5, m7 |
| 22759 | packuswb m3, m5 |
| 22760 | movu [r0 + 930 * 16], m3 |
| 22761 | |
| 22762 | pmaddubsw m3, m1, m6 |
| 22763 | pmulhrsw m3, m7 |
| 22764 | pmaddubsw m5, m4, m6 |
| 22765 | pmulhrsw m5, m7 |
| 22766 | packuswb m3, m5 |
| 22767 | movu [r0 + 931 * 16], m3 |
| 22768 | |
| 22769 | ; mode16 [row 18] |
| 22770 | movu m6, [r5 + 17 * 16] |
| 22771 | pslldq m0, 2 |
| 22772 | pinsrb m0, [r3 + 17], 1 |
| 22773 | pinsrb m0, [r3 + 18], 0 |
| 22774 | pmaddubsw m3, m0, m6 |
| 22775 | pmulhrsw m3, m7 |
| 22776 | pslldq m2, 2 |
| 22777 | pinsrb m2, [r3 + 5], 1 |
| 22778 | pinsrb m2, [r3 + 6], 0 |
| 22779 | pmaddubsw m5, m2, m6 |
| 22780 | pmulhrsw m5, m7 |
| 22781 | packuswb m3, m5 |
| 22782 | movu [r0 + 932 * 16], m3 |
| 22783 | |
| 22784 | pslldq m1, 2 |
| 22785 | pinsrw m1, [r4 + 4], 0 |
| 22786 | pmaddubsw m3, m1, m6 |
| 22787 | pmulhrsw m3, m7 |
| 22788 | pslldq m4, 2 |
| 22789 | pinsrw m4, [r4 + 12], 0 |
| 22790 | pmaddubsw m5, m4, m6 |
| 22791 | pmulhrsw m5, m7 |
| 22792 | packuswb m3, m5 |
| 22793 | movu [r0 + 933 * 16], m3 |
| 22794 | |
| 22795 | ; mode16 [row 19] |
| 22796 | movu m6, [r5 + 28 * 16] |
| 22797 | pslldq m0, 2 |
| 22798 | pinsrb m0, [r3 + 18], 1 |
| 22799 | pinsrb m0, [r3 + 20], 0 |
| 22800 | pmaddubsw m3, m0, m6 |
| 22801 | pmulhrsw m3, m7 |
| 22802 | pslldq m2, 2 |
| 22803 | pinsrb m2, [r3 + 6], 1 |
| 22804 | pinsrb m2, [r3 + 8], 0 |
| 22805 | pmaddubsw m5, m2, m6 |
| 22806 | pmulhrsw m5, m7 |
| 22807 | packuswb m3, m5 |
| 22808 | movu [r0 + 934 * 16], m3 |
| 22809 | |
| 22810 | pslldq m1, 2 |
| 22811 | pinsrw m1, [r4 + 3], 0 |
| 22812 | pmaddubsw m3, m1, m6 |
| 22813 | pmulhrsw m3, m7 |
| 22814 | pslldq m4, 2 |
| 22815 | pinsrw m4, [r4 + 11], 0 |
| 22816 | pmaddubsw m5, m4, m6 |
| 22817 | pmulhrsw m5, m7 |
| 22818 | packuswb m3, m5 |
| 22819 | movu [r0 + 935 * 16], m3 |
| 22820 | |
| 22821 | ; mode16 [row 20] |
| 22822 | movu m6, [r5 + 7 * 16] |
| 22823 | pmaddubsw m3, m0, m6 |
| 22824 | pmulhrsw m3, m7 |
| 22825 | pmaddubsw m5, m2, m6 |
| 22826 | pmulhrsw m5, m7 |
| 22827 | packuswb m3, m5 |
| 22828 | movu [r0 + 936 * 16], m3 |
| 22829 | |
| 22830 | pmaddubsw m3, m1, m6 |
| 22831 | pmulhrsw m3, m7 |
| 22832 | pmaddubsw m5, m4, m6 |
| 22833 | pmulhrsw m5, m7 |
| 22834 | packuswb m3, m5 |
| 22835 | movu [r0 + 937 * 16], m3 |
| 22836 | |
| 22837 | ; mode16 [row 21] |
| 22838 | movu m6, [r5 + 18 * 16] |
| 22839 | pslldq m0, 2 |
| 22840 | pinsrb m0, [r3 + 20], 1 |
| 22841 | pinsrb m0, [r3 + 21], 0 |
| 22842 | pmaddubsw m3, m0, m6 |
| 22843 | pmulhrsw m3, m7 |
| 22844 | pslldq m2, 2 |
| 22845 | pinsrb m2, [r3 + 8], 1 |
| 22846 | pinsrb m2, [r3 + 9], 0 |
| 22847 | pmaddubsw m5, m2, m6 |
| 22848 | pmulhrsw m5, m7 |
| 22849 | packuswb m3, m5 |
| 22850 | movu [r0 + 938 * 16], m3 |
| 22851 | |
| 22852 | pslldq m1, 2 |
| 22853 | pinsrw m1, [r4 + 2], 0 |
| 22854 | pmaddubsw m3, m1, m6 |
| 22855 | pmulhrsw m3, m7 |
| 22856 | pslldq m4, 2 |
| 22857 | pinsrw m4, [r4 + 10], 0 |
| 22858 | pmaddubsw m5, m4, m6 |
| 22859 | pmulhrsw m5, m7 |
| 22860 | packuswb m3, m5 |
| 22861 | movu [r0 + 939 * 16], m3 |
| 22862 | |
| 22863 | ; mode16 [row 22] |
| 22864 | movu m6, [r5 + 29 * 16] |
| 22865 | pslldq m0, 2 |
| 22866 | pinsrb m0, [r3 + 21], 1 |
| 22867 | pinsrb m0, [r3 + 23], 0 |
| 22868 | pmaddubsw m3, m0, m6 |
| 22869 | pmulhrsw m3, m7 |
| 22870 | pslldq m2, 2 |
| 22871 | pinsrb m2, [r3 + 9], 1 |
| 22872 | pinsrb m2, [r3 + 11], 0 |
| 22873 | pmaddubsw m5, m2, m6 |
| 22874 | pmulhrsw m5, m7 |
| 22875 | packuswb m3, m5 |
| 22876 | movu [r0 + 940 * 16], m3 |
| 22877 | |
| 22878 | pslldq m1, 2 |
| 22879 | pinsrw m1, [r4 + 1], 0 |
| 22880 | pmaddubsw m3, m1, m6 |
| 22881 | pmulhrsw m3, m7 |
| 22882 | pslldq m4, 2 |
| 22883 | pinsrw m4, [r4 + 9], 0 |
| 22884 | pmaddubsw m5, m4, m6 |
| 22885 | pmulhrsw m5, m7 |
| 22886 | packuswb m3, m5 |
| 22887 | movu [r0 + 941 * 16], m3 |
| 22888 | |
| 22889 | ; mode16 [row 23] |
| 22890 | movu m6, [r5 + 8 * 16] |
| 22891 | pmaddubsw m3, m0, m6 |
| 22892 | pmulhrsw m3, m7 |
| 22893 | pmaddubsw m5, m2, m6 |
| 22894 | pmulhrsw m5, m7 |
| 22895 | packuswb m3, m5 |
| 22896 | movu [r0 + 942 * 16], m3 |
| 22897 | |
| 22898 | pmaddubsw m3, m1, m6 |
| 22899 | pmulhrsw m3, m7 |
| 22900 | pmaddubsw m5, m4, m6 |
| 22901 | pmulhrsw m5, m7 |
| 22902 | packuswb m3, m5 |
| 22903 | movu [r0 + 943 * 16], m3 |
| 22904 | |
| 22905 | ; mode16 [row 24] |
| 22906 | movu m6, [r5 + 19 * 16] |
| 22907 | pslldq m0, 2 |
| 22908 | pinsrb m0, [r3 + 23], 1 |
| 22909 | pinsrb m0, [r3 + 24], 0 |
| 22910 | pmaddubsw m3, m0, m6 |
| 22911 | pmulhrsw m3, m7 |
| 22912 | pslldq m2, 2 |
| 22913 | pinsrb m2, [r3 + 11], 1 |
| 22914 | pinsrb m2, [r3 + 12], 0 |
| 22915 | pmaddubsw m5, m2, m6 |
| 22916 | pmulhrsw m5, m7 |
| 22917 | packuswb m3, m5 |
| 22918 | movu [r0 + 944 * 16], m3 |
| 22919 | |
| 22920 | pslldq m1, 2 |
| 22921 | pinsrw m1, [r4 + 0], 0 |
| 22922 | pmaddubsw m3, m1, m6 |
| 22923 | pmulhrsw m3, m7 |
| 22924 | pslldq m4, 2 |
| 22925 | pinsrw m4, [r4 + 8], 0 |
| 22926 | pmaddubsw m5, m4, m6 |
| 22927 | pmulhrsw m5, m7 |
| 22928 | packuswb m3, m5 |
| 22929 | movu [r0 + 945 * 16], m3 |
| 22930 | |
| 22931 | ; mode16 [row 25] |
| 22932 | movu m6, [r5 + 30 * 16] |
| 22933 | pslldq m0, 2 |
| 22934 | pinsrb m0, [r3 + 24], 1 |
| 22935 | pinsrb m0, [r3 + 26], 0 |
| 22936 | pmaddubsw m3, m0, m6 |
| 22937 | pmulhrsw m3, m7 |
| 22938 | pslldq m2, 2 |
| 22939 | pinsrb m2, [r3 + 12], 1 |
| 22940 | pinsrb m2, [r3 + 14], 0 |
| 22941 | pmaddubsw m5, m2, m6 |
| 22942 | pmulhrsw m5, m7 |
| 22943 | packuswb m3, m5 |
| 22944 | movu [r0 + 946 * 16], m3 |
| 22945 | |
| 22946 | pslldq m1, 2 |
| 22947 | pinsrb m1, [r4 + 0], 1 |
| 22948 | pinsrb m1, [r3 + 2], 0 |
| 22949 | pmaddubsw m3, m1, m6 |
| 22950 | pmulhrsw m3, m7 |
| 22951 | pslldq m4, 2 |
| 22952 | pinsrw m4, [r4 + 7], 0 |
| 22953 | pmaddubsw m5, m4, m6 |
| 22954 | pmulhrsw m5, m7 |
| 22955 | packuswb m3, m5 |
| 22956 | movu [r0 + 947 * 16], m3 |
| 22957 | |
| 22958 | ; mode16 [row 26] |
| 22959 | movu m6, [r5 + 9 * 16] |
| 22960 | pmaddubsw m3, m0, m6 |
| 22961 | pmulhrsw m3, m7 |
| 22962 | pmaddubsw m5, m2, m6 |
| 22963 | pmulhrsw m5, m7 |
| 22964 | packuswb m3, m5 |
| 22965 | movu [r0 + 948 * 16], m3 |
| 22966 | |
| 22967 | pmaddubsw m3, m1, m6 |
| 22968 | pmulhrsw m3, m7 |
| 22969 | pmaddubsw m5, m4, m6 |
| 22970 | pmulhrsw m5, m7 |
| 22971 | packuswb m3, m5 |
| 22972 | movu [r0 + 949 * 16], m3 |
| 22973 | |
| 22974 | ; mode16 [row 27] |
| 22975 | movu m6, [r5 + 20 * 16] |
| 22976 | pslldq m0, 2 |
| 22977 | pinsrb m0, [r3 + 26], 1 |
| 22978 | pinsrb m0, [r3 + 27], 0 |
| 22979 | pmaddubsw m3, m0, m6 |
| 22980 | pmulhrsw m3, m7 |
| 22981 | pslldq m2, 2 |
| 22982 | pinsrb m2, [r3 + 14], 1 |
| 22983 | pinsrb m2, [r3 + 15], 0 |
| 22984 | pmaddubsw m5, m2, m6 |
| 22985 | pmulhrsw m5, m7 |
| 22986 | packuswb m3, m5 |
| 22987 | movu [r0 + 950 * 16], m3 |
| 22988 | |
| 22989 | pslldq m1, 2 |
| 22990 | pinsrb m1, [r3 + 2], 1 |
| 22991 | pinsrb m1, [r3 + 3], 0 |
| 22992 | pmaddubsw m3, m1, m6 |
| 22993 | pmulhrsw m3, m7 |
| 22994 | pslldq m4, 2 |
| 22995 | pinsrw m4, [r4 + 6], 0 |
| 22996 | pmaddubsw m5, m4, m6 |
| 22997 | pmulhrsw m5, m7 |
| 22998 | packuswb m3, m5 |
| 22999 | movu [r0 + 951 * 16], m3 |
| 23000 | |
| 23001 | ; mode16 [row 28] |
| 23002 | movu m6, [r5 + 31 * 16] |
| 23003 | pslldq m0, 2 |
| 23004 | pinsrb m0, [r3 + 27], 1 |
| 23005 | pinsrb m0, [r3 + 29], 0 |
| 23006 | pmaddubsw m3, m0, m6 |
| 23007 | pmulhrsw m3, m7 |
| 23008 | pslldq m2, 2 |
| 23009 | pinsrb m2, [r3 + 15], 1 |
| 23010 | pinsrb m2, [r3 + 17], 0 |
| 23011 | pmaddubsw m5, m2, m6 |
| 23012 | pmulhrsw m5, m7 |
| 23013 | packuswb m3, m5 |
| 23014 | movu [r0 + 952 * 16], m3 |
| 23015 | |
| 23016 | pslldq m1, 2 |
| 23017 | pinsrb m1, [r3 + 3], 1 |
| 23018 | pinsrb m1, [r3 + 5], 0 |
| 23019 | pmaddubsw m3, m1, m6 |
| 23020 | pmulhrsw m3, m7 |
| 23021 | pslldq m4, 2 |
| 23022 | pinsrw m4, [r4 + 5], 0 |
| 23023 | pmaddubsw m5, m4, m6 |
| 23024 | pmulhrsw m5, m7 |
| 23025 | packuswb m3, m5 |
| 23026 | movu [r0 + 953 * 16], m3 |
| 23027 | |
| 23028 | ; mode16 [row 29] |
| 23029 | movu m6, [r5 + 10 * 16] |
| 23030 | pmaddubsw m3, m0, m6 |
| 23031 | pmulhrsw m3, m7 |
| 23032 | pmaddubsw m5, m2, m6 |
| 23033 | pmulhrsw m5, m7 |
| 23034 | packuswb m3, m5 |
| 23035 | movu [r0 + 954 * 16], m3 |
| 23036 | |
| 23037 | pmaddubsw m3, m1, m6 |
| 23038 | pmulhrsw m3, m7 |
| 23039 | pmaddubsw m5, m4, m6 |
| 23040 | pmulhrsw m5, m7 |
| 23041 | packuswb m3, m5 |
| 23042 | movu [r0 + 955 * 16], m3 |
| 23043 | |
| 23044 | ; mode16 [row 30] |
| 23045 | movu m6, [r5 + 21 * 16] |
| 23046 | pslldq m0, 2 |
| 23047 | pinsrb m0, [r3 + 29], 1 |
| 23048 | pinsrb m0, [r3 + 30], 0 |
| 23049 | pmaddubsw m3, m0, m6 |
| 23050 | pmulhrsw m3, m7 |
| 23051 | pslldq m2, 2 |
| 23052 | pinsrb m2, [r3 + 17], 1 |
| 23053 | pinsrb m2, [r3 + 18], 0 |
| 23054 | pmaddubsw m5, m2, m6 |
| 23055 | pmulhrsw m5, m7 |
| 23056 | packuswb m3, m5 |
| 23057 | movu [r0 + 956 * 16], m3 |
| 23058 | |
| 23059 | pslldq m1, 2 |
| 23060 | pinsrb m1, [r3 + 5], 1 |
| 23061 | pinsrb m1, [r3 + 6], 0 |
| 23062 | pmaddubsw m3, m1, m6 |
| 23063 | pmulhrsw m3, m7 |
| 23064 | pslldq m4, 2 |
| 23065 | pinsrw m4, [r4 + 4], 0 |
| 23066 | pmaddubsw m5, m4, m6 |
| 23067 | pmulhrsw m5, m7 |
| 23068 | packuswb m3, m5 |
| 23069 | movu [r0 + 957 * 16], m3 |
| 23070 | |
| 23071 | ; mode16 [row 31] (no pmaddubsw/pmulhrsw here: each vector is shuffled through tab_S2 and its low half stored) |
| 23072 | pshufb m5, m0, [tab_S2] |
| 23073 | movh [r0 + 958 * 16], m5 |
| 23074 | pshufb m5, m2, [tab_S2] |
| 23075 | movh [r0 + 958 * 16 + 8], m5 |
| 23076 | pshufb m5, m1, [tab_S2] |
| 23077 | movh [r0 + 959 * 16], m5 |
| 23078 | pshufb m5, m4, [tab_S2] |
| 23079 | movh [r0 + 959 * 16 + 8], m5 |
| 23080 | |
| 23081 | ; mode17 [row 0] (reloads pw_1024 into m7 and rebuilds m0/m2/m1/m4 from byte pairs at r4 + 0/8/16/24) |
| 23082 | movu m6, [r5 + 6 * 16] |
| 23083 | movu m7, [pw_1024] |
| 23084 | movh m0, [r4 ] |
| 23085 | movh m1, [r4 + 1 ] |
| 23086 | punpcklbw m0, m1 |
| 23087 | pmaddubsw m1, m0, m6 |
| 23088 | pmulhrsw m1, m7 |
| 23089 | movh m2, [r4 + 8] |
| 23090 | movh m3, [r4 + 9] |
| 23091 | punpcklbw m2, m3 |
| 23092 | pmaddubsw m3, m2, m6 |
| 23093 | pmulhrsw m3, m7 |
| 23094 | packuswb m1, m3 |
| 23095 | movu [r0 + 960 * 16], m1 |
| 23096 | |
| 23097 | movh m1, [r4 + 16] |
| 23098 | movh m3, [r4 + 17] |
| 23099 | punpcklbw m1, m3 |
| 23100 | pmaddubsw m3, m1, m6 |
| 23101 | pmulhrsw m3, m7 |
| 23102 | movh m4, [r4 + 24] |
| 23103 | movh m5, [r4 + 25] |
| 23104 | punpcklbw m4, m5 |
| 23105 | pmaddubsw m5, m4, m6 |
| 23106 | pmulhrsw m5, m7 |
| 23107 | packuswb m3, m5 |
| 23108 | movu [r0 + 961 * 16], m3 |
| 23109 | |
| 23110 | ; mode17 [row 1] |
| 23111 | movu m6, [r5 + 12 * 16] |
| 23112 | pslldq m0, 2 |
| 23113 | pinsrb m0, [r3 + 0], 1 |
| 23114 | pinsrb m0, [r3 + 1], 0 |
| 23115 | pmaddubsw m3, m0, m6 |
| 23116 | pmulhrsw m3, m7 |
| 23117 | pslldq m2, 2 |
| 23118 | pinsrw m2, [r4 + 7], 0 |
| 23119 | pmaddubsw m5, m2, m6 |
| 23120 | pmulhrsw m5, m7 |
| 23121 | packuswb m3, m5 |
| 23122 | movu [r0 + 962 * 16], m3 |
| 23123 | |
| 23124 | pslldq m1, 2 |
| 23125 | pinsrw m1, [r4 + 15], 0 |
| 23126 | pmaddubsw m3, m1, m6 |
| 23127 | pmulhrsw m3, m7 |
| 23128 | pslldq m4, 2 |
| 23129 | pinsrw m4, [r4 + 23], 0 |
| 23130 | pmaddubsw m5, m4, m6 |
| 23131 | pmulhrsw m5, m7 |
| 23132 | packuswb m3, m5 |
| 23133 | movu [r0 + 963 * 16], m3 |
| 23134 | |
| 23135 | ; mode17 [row 2] |
| 23136 | movu m6, [r5 + 18 * 16] |
| 23137 | pslldq m0, 2 |
| 23138 | pinsrb m0, [r3 + 1], 1 |
| 23139 | pinsrb m0, [r3 + 2], 0 |
| 23140 | pmaddubsw m3, m0, m6 |
| 23141 | pmulhrsw m3, m7 |
| 23142 | pslldq m2, 2 |
| 23143 | pinsrw m2, [r4 + 6], 0 |
| 23144 | pmaddubsw m5, m2, m6 |
| 23145 | pmulhrsw m5, m7 |
| 23146 | packuswb m3, m5 |
| 23147 | movu [r0 + 964 * 16], m3 |
| 23148 | |
| 23149 | pslldq m1, 2 |
| 23150 | pinsrw m1, [r4 + 14], 0 |
| 23151 | pmaddubsw m3, m1, m6 |
| 23152 | pmulhrsw m3, m7 |
| 23153 | pslldq m4, 2 |
| 23154 | pinsrw m4, [r4 + 22], 0 |
| 23155 | pmaddubsw m5, m4, m6 |
| 23156 | pmulhrsw m5, m7 |
| 23157 | packuswb m3, m5 |
| 23158 | movu [r0 + 965 * 16], m3 |
| 23159 | |
| 23160 | ; mode17 [row 3] |
| 23161 | movu m6, [r5 + 24 * 16] |
| 23162 | pslldq m0, 2 |
| 23163 | pinsrb m0, [r3 + 2], 1 |
| 23164 | pinsrb m0, [r3 + 4], 0 |
| 23165 | pmaddubsw m3, m0, m6 |
| 23166 | pmulhrsw m3, m7 |
| 23167 | pslldq m2, 2 |
| 23168 | pinsrw m2, [r4 + 5], 0 |
| 23169 | pmaddubsw m5, m2, m6 |
| 23170 | pmulhrsw m5, m7 |
| 23171 | packuswb m3, m5 |
| 23172 | movu [r0 + 966 * 16], m3 |
| 23173 | |
| 23174 | pslldq m1, 2 |
| 23175 | pinsrw m1, [r4 + 13], 0 |
| 23176 | pmaddubsw m3, m1, m6 |
| 23177 | pmulhrsw m3, m7 |
| 23178 | pslldq m4, 2 |
| 23179 | pinsrw m4, [r4 + 21], 0 |
| 23180 | pmaddubsw m5, m4, m6 |
| 23181 | pmulhrsw m5, m7 |
| 23182 | packuswb m3, m5 |
| 23183 | movu [r0 + 967 * 16], m3 |
| 23184 | |
| 23185 | ; mode17 [row 4] |
| 23186 | movu m6, [r5 + 30 * 16] |
| 23187 | pslldq m0, 2 |
| 23188 | pinsrb m0, [r3 + 4], 1 |
| 23189 | pinsrb m0, [r3 + 5], 0 |
| 23190 | pmaddubsw m3, m0, m6 |
| 23191 | pmulhrsw m3, m7 |
| 23192 | pslldq m2, 2 |
| 23193 | pinsrw m2, [r4 + 4], 0 |
| 23194 | pmaddubsw m5, m2, m6 |
| 23195 | pmulhrsw m5, m7 |
| 23196 | packuswb m3, m5 |
| 23197 | movu [r0 + 968 * 16], m3 |
| 23198 | |
| 23199 | pslldq m1, 2 |
| 23200 | pinsrw m1, [r4 + 12], 0 |
| 23201 | pmaddubsw m3, m1, m6 |
| 23202 | pmulhrsw m3, m7 |
| 23203 | pslldq m4, 2 |
| 23204 | pinsrw m4, [r4 + 20], 0 |
| 23205 | pmaddubsw m5, m4, m6 |
| 23206 | pmulhrsw m5, m7 |
| 23207 | packuswb m3, m5 |
| 23208 | movu [r0 + 969 * 16], m3 |
| 23209 | |
| 23210 | ; mode17 [row 5] (only the weights in m6 change; m0/m2/m1/m4 are reused unshifted from the previous row) |
| 23211 | movu m6, [r5 + 4 * 16] |
| 23212 | pmaddubsw m3, m0, m6 |
| 23213 | pmulhrsw m3, m7 |
| 23214 | pmaddubsw m5, m2, m6 |
| 23215 | pmulhrsw m5, m7 |
| 23216 | packuswb m3, m5 |
| 23217 | movu [r0 + 970 * 16], m3 |
| 23218 | |
| 23219 | pmaddubsw m3, m1, m6 |
| 23220 | pmulhrsw m3, m7 |
| 23221 | pmaddubsw m5, m4, m6 |
| 23222 | pmulhrsw m5, m7 |
| 23223 | packuswb m3, m5 |
| 23224 | movu [r0 + 971 * 16], m3 |
| 23225 | |
| 23226 | ; mode17 [row 6] |
| 23227 | movu m6, [r5 + 10 * 16] |
| 23228 | pslldq m0, 2 |
| 23229 | pinsrb m0, [r3 + 5], 1 |
| 23230 | pinsrb m0, [r3 + 6], 0 |
| 23231 | pmaddubsw m3, m0, m6 |
| 23232 | pmulhrsw m3, m7 |
| 23233 | pslldq m2, 2 |
| 23234 | pinsrw m2, [r4 + 3], 0 |
| 23235 | pmaddubsw m5, m2, m6 |
| 23236 | pmulhrsw m5, m7 |
| 23237 | packuswb m3, m5 |
| 23238 | movu [r0 + 972 * 16], m3 |
| 23239 | |
| 23240 | pslldq m1, 2 |
| 23241 | pinsrw m1, [r4 + 11], 0 |
| 23242 | pmaddubsw m3, m1, m6 |
| 23243 | pmulhrsw m3, m7 |
| 23244 | pslldq m4, 2 |
| 23245 | pinsrw m4, [r4 + 19], 0 |
| 23246 | pmaddubsw m5, m4, m6 |
| 23247 | pmulhrsw m5, m7 |
| 23248 | packuswb m3, m5 |
| 23249 | movu [r0 + 973 * 16], m3 |
| 23250 | |
| 23251 | ; mode17 [row 7] |
| 23252 | movu m6, [r5 + 16 * 16] |
| 23253 | pslldq m0, 2 |
| 23254 | pinsrb m0, [r3 + 6], 1 |
| 23255 | pinsrb m0, [r3 + 7], 0 |
| 23256 | pmaddubsw m3, m0, m6 |
| 23257 | pmulhrsw m3, m7 |
| 23258 | pslldq m2, 2 |
| 23259 | pinsrw m2, [r4 + 2], 0 |
| 23260 | pmaddubsw m5, m2, m6 |
| 23261 | pmulhrsw m5, m7 |
| 23262 | packuswb m3, m5 |
| 23263 | movu [r0 + 974 * 16], m3 |
| 23264 | |
| 23265 | pslldq m1, 2 |
| 23266 | pinsrw m1, [r4 + 10], 0 |
| 23267 | pmaddubsw m3, m1, m6 |
| 23268 | pmulhrsw m3, m7 |
| 23269 | pslldq m4, 2 |
| 23270 | pinsrw m4, [r4 + 18], 0 |
| 23271 | pmaddubsw m5, m4, m6 |
| 23272 | pmulhrsw m5, m7 |
| 23273 | packuswb m3, m5 |
| 23274 | movu [r0 + 975 * 16], m3 |
| 23275 | |
| 23276 | ; mode17 [row 8] |
| 23277 | movu m6, [r5 + 22 * 16] |
| 23278 | pslldq m0, 2 |
| 23279 | pinsrb m0, [r3 + 7], 1 |
| 23280 | pinsrb m0, [r3 + 9], 0 |
| 23281 | pmaddubsw m3, m0, m6 |
| 23282 | pmulhrsw m3, m7 |
| 23283 | pslldq m2, 2 |
| 23284 | pinsrw m2, [r4 + 1], 0 |
| 23285 | pmaddubsw m5, m2, m6 |
| 23286 | pmulhrsw m5, m7 |
| 23287 | packuswb m3, m5 |
| 23288 | movu [r0 + 976 * 16], m3 |
| 23289 | |
| 23290 | pslldq m1, 2 |
| 23291 | pinsrw m1, [r4 + 9], 0 |
| 23292 | pmaddubsw m3, m1, m6 |
| 23293 | pmulhrsw m3, m7 |
| 23294 | pslldq m4, 2 |
| 23295 | pinsrw m4, [r4 + 17], 0 |
| 23296 | pmaddubsw m5, m4, m6 |
| 23297 | pmulhrsw m5, m7 |
| 23298 | packuswb m3, m5 |
| 23299 | movu [r0 + 977 * 16], m3 |
| 23300 | |
| 23301 | ; mode17 [row 9] |
| 23302 | movu m6, [r5 + 28 * 16] |
| 23303 | pslldq m0, 2 |
| 23304 | pinsrb m0, [r3 + 9], 1 |
| 23305 | pinsrb m0, [r3 + 10], 0 |
| 23306 | pmaddubsw m3, m0, m6 |
| 23307 | pmulhrsw m3, m7 |
| 23308 | pslldq m2, 2 |
| 23309 | pinsrw m2, [r4 + 0], 0 |
| 23310 | pmaddubsw m5, m2, m6 |
| 23311 | pmulhrsw m5, m7 |
| 23312 | packuswb m3, m5 |
| 23313 | movu [r0 + 978 * 16], m3 |
| 23314 | |
| 23315 | pslldq m1, 2 |
| 23316 | pinsrw m1, [r4 + 8], 0 |
| 23317 | pmaddubsw m3, m1, m6 |
| 23318 | pmulhrsw m3, m7 |
| 23319 | pslldq m4, 2 |
| 23320 | pinsrw m4, [r4 + 16], 0 |
| 23321 | pmaddubsw m5, m4, m6 |
| 23322 | pmulhrsw m5, m7 |
| 23323 | packuswb m3, m5 |
| 23324 | movu [r0 + 979 * 16], m3 |
| 23325 | |
| 23326 | ; mode17 [row 10] |
| 23327 | movu m6, [r5 + 2 * 16] |
| 23328 | pmaddubsw m3, m0, m6 |
| 23329 | pmulhrsw m3, m7 |
| 23330 | pmaddubsw m5, m2, m6 |
| 23331 | pmulhrsw m5, m7 |
| 23332 | packuswb m3, m5 |
| 23333 | movu [r0 + 980 * 16], m3 |
| 23334 | |
| 23335 | pmaddubsw m3, m1, m6 |
| 23336 | pmulhrsw m3, m7 |
| 23337 | pmaddubsw m5, m4, m6 |
| 23338 | pmulhrsw m5, m7 |
| 23339 | packuswb m3, m5 |
| 23340 | movu [r0 + 981 * 16], m3 |
| 23341 | |
| 23342 | ; mode17 [row 11] |
| 23343 | movu m6, [r5 + 8 * 16] |
| 23344 | pslldq m0, 2 |
| 23345 | pinsrb m0, [r3 + 10], 1 |
| 23346 | pinsrb m0, [r3 + 11], 0 |
| 23347 | pmaddubsw m3, m0, m6 |
| 23348 | pmulhrsw m3, m7 |
| 23349 | pslldq m2, 2 |
| 23350 | pinsrb m2, [r4 + 0], 1 |
| 23351 | pinsrb m2, [r3 + 1], 0 |
| 23352 | pmaddubsw m5, m2, m6 |
| 23353 | pmulhrsw m5, m7 |
| 23354 | packuswb m3, m5 |
| 23355 | movu [r0 + 982 * 16], m3 |
| 23356 | |
| 23357 | pslldq m1, 2 |
| 23358 | pinsrw m1, [r4 + 7], 0 |
| 23359 | pmaddubsw m3, m1, m6 |
| 23360 | pmulhrsw m3, m7 |
| 23361 | pslldq m4, 2 |
| 23362 | pinsrw m4, [r4 + 15], 0 |
| 23363 | pmaddubsw m5, m4, m6 |
| 23364 | pmulhrsw m5, m7 |
| 23365 | packuswb m3, m5 |
| 23366 | movu [r0 + 983 * 16], m3 |
| 23367 | |
| 23368 | ; mode17 [row 12] |
| 23369 | movu m6, [r5 + 14 * 16] |
| 23370 | pslldq m0, 2 |
| 23371 | pinsrb m0, [r3 + 11], 1 |
| 23372 | pinsrb m0, [r3 + 12], 0 |
| 23373 | pmaddubsw m3, m0, m6 |
| 23374 | pmulhrsw m3, m7 |
| 23375 | pslldq m2, 2 |
| 23376 | pinsrb m2, [r3 + 1], 1 |
| 23377 | pinsrb m2, [r3 + 2], 0 |
| 23378 | pmaddubsw m5, m2, m6 |
| 23379 | pmulhrsw m5, m7 |
| 23380 | packuswb m3, m5 |
| 23381 | movu [r0 + 984 * 16], m3 |
| 23382 | |
| 23383 | pslldq m1, 2 |
| 23384 | pinsrw m1, [r4 + 6], 0 |
| 23385 | pmaddubsw m3, m1, m6 |
| 23386 | pmulhrsw m3, m7 |
| 23387 | pslldq m4, 2 |
| 23388 | pinsrw m4, [r4 + 14], 0 |
| 23389 | pmaddubsw m5, m4, m6 |
| 23390 | pmulhrsw m5, m7 |
| 23391 | packuswb m3, m5 |
| 23392 | movu [r0 + 985 * 16], m3 |
| 23393 | |
| 23394 | ; mode17 [row 13] |
| 23395 | movu m6, [r5 + 20 * 16] |
| 23396 | pslldq m0, 2 |
| 23397 | pinsrb m0, [r3 + 12], 1 |
| 23398 | pinsrb m0, [r3 + 14], 0 |
| 23399 | pmaddubsw m3, m0, m6 |
| 23400 | pmulhrsw m3, m7 |
| 23401 | pslldq m2, 2 |
| 23402 | pinsrb m2, [r3 + 2], 1 |
| 23403 | pinsrb m2, [r3 + 4], 0 |
| 23404 | pmaddubsw m5, m2, m6 |
| 23405 | pmulhrsw m5, m7 |
| 23406 | packuswb m3, m5 |
| 23407 | movu [r0 + 986 * 16], m3 |
| 23408 | |
| 23409 | pslldq m1, 2 |
| 23410 | pinsrw m1, [r4 + 5], 0 |
| 23411 | pmaddubsw m3, m1, m6 |
| 23412 | pmulhrsw m3, m7 |
| 23413 | pslldq m4, 2 |
| 23414 | pinsrw m4, [r4 + 13], 0 |
| 23415 | pmaddubsw m5, m4, m6 |
| 23416 | pmulhrsw m5, m7 |
| 23417 | packuswb m3, m5 |
| 23418 | movu [r0 + 987 * 16], m3 |
| 23419 | |
| 23420 | ; mode17 [row 14] |
| 23421 | movu m6, [r5 + 26 * 16] |
| 23422 | pslldq m0, 2 |
| 23423 | pinsrb m0, [r3 + 14], 1 |
| 23424 | pinsrb m0, [r3 + 15], 0 |
| 23425 | pmaddubsw m3, m0, m6 |
| 23426 | pmulhrsw m3, m7 |
| 23427 | pslldq m2, 2 |
| 23428 | pinsrb m2, [r3 + 4], 1 |
| 23429 | pinsrb m2, [r3 + 5], 0 |
| 23430 | pmaddubsw m5, m2, m6 |
| 23431 | pmulhrsw m5, m7 |
| 23432 | packuswb m3, m5 |
| 23433 | movu [r0 + 988 * 16], m3 |
| 23434 | |
| 23435 | pslldq m1, 2 |
| 23436 | pinsrw m1, [r4 + 4], 0 |
| 23437 | pmaddubsw m3, m1, m6 |
| 23438 | pmulhrsw m3, m7 |
| 23439 | pslldq m4, 2 |
| 23440 | pinsrw m4, [r4 + 12], 0 |
| 23441 | pmaddubsw m5, m4, m6 |
| 23442 | pmulhrsw m5, m7 |
| 23443 | packuswb m3, m5 |
| 23444 | movu [r0 + 989 * 16], m3 |
| 23445 | |
| 23446 | ; mode17 [row 15] (no pmaddubsw/pmulhrsw here: each vector is shuffled through tab_S2 and its low half stored) |
| 23447 | pshufb m5, m0, [tab_S2] |
| 23448 | movh [r0 + 990 * 16], m5 |
| 23449 | pshufb m5, m2, [tab_S2] |
| 23450 | movh [r0 + 990 * 16 + 8], m5 |
| 23451 | pshufb m5, m1, [tab_S2] |
| 23452 | movh [r0 + 991 * 16], m5 |
| 23453 | pshufb m5, m4, [tab_S2] |
| 23454 | movh [r0 + 991 * 16 + 8], m5 |
| 23455 | |
| 23456 | ; mode17 [row 16] |
| 23457 | movu m6, [r5 + 6 * 16] |
| 23458 | pslldq m0, 2 |
| 23459 | pinsrb m0, [r3 + 15], 1 |
| 23460 | pinsrb m0, [r3 + 16], 0 |
| 23461 | pmaddubsw m3, m0, m6 |
| 23462 | pmulhrsw m3, m7 |
| 23463 | pslldq m2, 2 |
| 23464 | pinsrb m2, [r3 + 5], 1 |
| 23465 | pinsrb m2, [r3 + 6], 0 |
| 23466 | pmaddubsw m5, m2, m6 |
| 23467 | pmulhrsw m5, m7 |
| 23468 | packuswb m3, m5 |
| 23469 | movu [r0 + 992 * 16], m3 |
| 23470 | |
| 23471 | pslldq m1, 2 |
| 23472 | pinsrw m1, [r4 + 3], 0 |
| 23473 | pmaddubsw m3, m1, m6 |
| 23474 | pmulhrsw m3, m7 |
| 23475 | pslldq m4, 2 |
| 23476 | pinsrw m4, [r4 + 11], 0 |
| 23477 | pmaddubsw m5, m4, m6 |
| 23478 | pmulhrsw m5, m7 |
| 23479 | packuswb m3, m5 |
| 23480 | movu [r0 + 993 * 16], m3 |
| 23481 | |
| 23482 | ; mode17 [row 17] |
| 23483 | movu m6, [r5 + 12 * 16] |
| 23484 | pslldq m0, 2 |
| 23485 | pinsrb m0, [r3 + 16], 1 |
| 23486 | pinsrb m0, [r3 + 17], 0 |
| 23487 | pmaddubsw m3, m0, m6 |
| 23488 | pmulhrsw m3, m7 |
| 23489 | pslldq m2, 2 |
| 23490 | pinsrb m2, [r3 + 6], 1 |
| 23491 | pinsrb m2, [r3 + 7], 0 |
| 23492 | pmaddubsw m5, m2, m6 |
| 23493 | pmulhrsw m5, m7 |
| 23494 | packuswb m3, m5 |
| 23495 | movu [r0 + 994 * 16], m3 |
| 23496 | |
| 23497 | pslldq m1, 2 |
| 23498 | pinsrw m1, [r4 + 2], 0 |
| 23499 | pmaddubsw m3, m1, m6 |
| 23500 | pmulhrsw m3, m7 |
| 23501 | pslldq m4, 2 |
| 23502 | pinsrw m4, [r4 + 10], 0 |
| 23503 | pmaddubsw m5, m4, m6 |
| 23504 | pmulhrsw m5, m7 |
| 23505 | packuswb m3, m5 |
| 23506 | movu [r0 + 995 * 16], m3 |
| 23507 | |
| 23508 | ; mode17 [row 18] |
| 23509 | movu m6, [r5 + 18 * 16] |
| 23510 | pslldq m0, 2 |
| 23511 | pinsrb m0, [r3 + 17], 1 |
| 23512 | pinsrb m0, [r3 + 18], 0 |
| 23513 | pmaddubsw m3, m0, m6 |
| 23514 | pmulhrsw m3, m7 |
| 23515 | pslldq m2, 2 |
| 23516 | pinsrb m2, [r3 + 7], 1 |
| 23517 | pinsrb m2, [r3 + 9], 0 |
| 23518 | pmaddubsw m5, m2, m6 |
| 23519 | pmulhrsw m5, m7 |
| 23520 | packuswb m3, m5 |
| 23521 | movu [r0 + 996 * 16], m3 |
| 23522 | |
| 23523 | pslldq m1, 2 |
| 23524 | pinsrw m1, [r4 + 1], 0 |
| 23525 | pmaddubsw m3, m1, m6 |
| 23526 | pmulhrsw m3, m7 |
| 23527 | pslldq m4, 2 |
| 23528 | pinsrw m4, [r4 + 9], 0 |
| 23529 | pmaddubsw m5, m4, m6 |
| 23530 | pmulhrsw m5, m7 |
| 23531 | packuswb m3, m5 |
| 23532 | movu [r0 + 997 * 16], m3 |
| 23533 | |
| 23534 | ; mode17 [row 19] |
| 23535 | movu m6, [r5 + 24 * 16] |
| 23536 | pslldq m0, 2 |
| 23537 | pinsrb m0, [r3 + 18], 1 |
| 23538 | pinsrb m0, [r3 + 20], 0 |
| 23539 | pmaddubsw m3, m0, m6 |
| 23540 | pmulhrsw m3, m7 |
| 23541 | pslldq m2, 2 |
| 23542 | pinsrb m2, [r3 + 9], 1 |
| 23543 | pinsrb m2, [r3 + 10], 0 |
| 23544 | pmaddubsw m5, m2, m6 |
| 23545 | pmulhrsw m5, m7 |
| 23546 | packuswb m3, m5 |
| 23547 | movu [r0 + 998 * 16], m3 |
| 23548 | |
| 23549 | pslldq m1, 2 |
| 23550 | pinsrw m1, [r4 + 0], 0 |
| 23551 | pmaddubsw m3, m1, m6 |
| 23552 | pmulhrsw m3, m7 |
| 23553 | pslldq m4, 2 |
| 23554 | pinsrw m4, [r4 + 8], 0 |
| 23555 | pmaddubsw m5, m4, m6 |
| 23556 | pmulhrsw m5, m7 |
| 23557 | packuswb m3, m5 |
| 23558 | movu [r0 + 999 * 16], m3 |
| 23559 | |
| 23560 | ; mode17 [row 20] |
| 23561 | movu m6, [r5 + 30 * 16] |
| 23562 | pslldq m0, 2 |
| 23563 | pinsrb m0, [r3 + 20], 1 |
| 23564 | pinsrb m0, [r3 + 21], 0 |
| 23565 | pmaddubsw m3, m0, m6 |
| 23566 | pmulhrsw m3, m7 |
| 23567 | pslldq m2, 2 |
| 23568 | pinsrb m2, [r3 + 10], 1 |
| 23569 | pinsrb m2, [r3 + 11], 0 |
| 23570 | pmaddubsw m5, m2, m6 |
| 23571 | pmulhrsw m5, m7 |
| 23572 | packuswb m3, m5 |
| 23573 | movu [r0 + 1000 * 16], m3 |
| 23574 | |
| 23575 | pslldq m1, 2 |
| 23576 | pinsrb m1, [r4 + 0], 1 |
| 23577 | pinsrb m1, [r3 + 1], 0 |
| 23578 | pmaddubsw m3, m1, m6 |
| 23579 | pmulhrsw m3, m7 |
| 23580 | pslldq m4, 2 |
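| 23581 | ; NOTE: the pinsrw below is the single-instruction equivalent of the byte-wise pair |
| 23582 | ;       pinsrb m4, [r4 + 8], 1  /  pinsrb m4, [r4 + 7], 0   (word load: byte 0 = [r4 + 7], byte 1 = [r4 + 8]) |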
| 23583 | pinsrw m4, [r4 + 7], 0 |
| 23584 | pmaddubsw m5, m4, m6 |
| 23585 | pmulhrsw m5, m7 |
| 23586 | packuswb m3, m5 |
| 23587 | movu [r0 + 1001 * 16], m3 |
| 23588 | |
| 23589 | ; mode17 [row 21] |
| 23590 | movu m6, [r5 + 4 * 16] |
| 23591 | pmaddubsw m3, m0, m6 |
| 23592 | pmulhrsw m3, m7 |
| 23593 | pmaddubsw m5, m2, m6 |
| 23594 | pmulhrsw m5, m7 |
| 23595 | packuswb m3, m5 |
| 23596 | movu [r0 + 1002 * 16], m3 |
| 23597 | |
| 23598 | pmaddubsw m3, m1, m6 |
| 23599 | pmulhrsw m3, m7 |
| 23600 | pmaddubsw m5, m4, m6 |
| 23601 | pmulhrsw m5, m7 |
| 23602 | packuswb m3, m5 |
| 23603 | movu [r0 + 1003 * 16], m3 |
| 23604 | |
| 23605 | ; mode17 [row 22] |
| 23606 | movu m6, [r5 + 10 * 16] |
| 23607 | pslldq m0, 2 |
| 23608 | pinsrb m0, [r3 + 21], 1 |
| 23609 | pinsrb m0, [r3 + 22], 0 |
| 23610 | pmaddubsw m3, m0, m6 |
| 23611 | pmulhrsw m3, m7 |
| 23612 | pslldq m2, 2 |
| 23613 | pinsrb m2, [r3 + 11], 1 |
| 23614 | pinsrb m2, [r3 + 12], 0 |
| 23615 | pmaddubsw m5, m2, m6 |
| 23616 | pmulhrsw m5, m7 |
| 23617 | packuswb m3, m5 |
| 23618 | movu [r0 + 1004 * 16], m3 |
| 23619 | |
| 23620 | pslldq m1, 2 |
| 23621 | pinsrb m1, [r3 + 1], 1 |
| 23622 | pinsrb m1, [r3 + 2], 0 |
| 23623 | pmaddubsw m3, m1, m6 |
| 23624 | pmulhrsw m3, m7 |
| 23625 | pslldq m4, 2 |
| 23626 | pinsrw m4, [r4 + 6], 0 |
| 23627 | pmaddubsw m5, m4, m6 |
| 23628 | pmulhrsw m5, m7 |
| 23629 | packuswb m3, m5 |
| 23630 | movu [r0 + 1005 * 16], m3 |
| 23631 | |
| 23632 | ; mode17 [row 23] |
| 23633 | movu m6, [r5 + 16 * 16] |
| 23634 | pslldq m0, 2 |
| 23635 | pinsrb m0, [r3 + 22], 1 |
| 23636 | pinsrb m0, [r3 + 23], 0 |
| 23637 | pmaddubsw m3, m0, m6 |
| 23638 | pmulhrsw m3, m7 |
| 23639 | pslldq m2, 2 |
| 23640 | pinsrb m2, [r3 + 12], 1 |
| 23641 | pinsrb m2, [r3 + 14], 0 |
| 23642 | pmaddubsw m5, m2, m6 |
| 23643 | pmulhrsw m5, m7 |
| 23644 | packuswb m3, m5 |
| 23645 | movu [r0 + 1006 * 16], m3 |
| 23646 | |
| 23647 | pslldq m1, 2 |
| 23648 | pinsrb m1, [r3 + 2], 1 |
| 23649 | pinsrb m1, [r3 + 4], 0 |
| 23650 | pmaddubsw m3, m1, m6 |
| 23651 | pmulhrsw m3, m7 |
| 23652 | pslldq m4, 2 |
| 23653 | pinsrw m4, [r4 + 5], 0 |
| 23654 | pmaddubsw m5, m4, m6 |
| 23655 | pmulhrsw m5, m7 |
| 23656 | packuswb m3, m5 |
| 23657 | movu [r0 + 1007 * 16], m3 |
| 23658 | |
| 23659 | ; mode17 [row 24] |
| 23660 | movu m6, [r5 + 22 * 16] |
| 23661 | pslldq m0, 2 |
| 23662 | pinsrb m0, [r3 + 23], 1 |
| 23663 | pinsrb m0, [r3 + 25], 0 |
| 23664 | pmaddubsw m3, m0, m6 |
| 23665 | pmulhrsw m3, m7 |
| 23666 | pslldq m2, 2 |
| 23667 | pinsrb m2, [r3 + 14], 1 |
| 23668 | pinsrb m2, [r3 + 15], 0 |
| 23669 | pmaddubsw m5, m2, m6 |
| 23670 | pmulhrsw m5, m7 |
| 23671 | packuswb m3, m5 |
| 23672 | movu [r0 + 1008 * 16], m3 |
| 23673 | |
| 23674 | pslldq m1, 2 |
| 23675 | pinsrb m1, [r3 + 4], 1 |
| 23676 | pinsrb m1, [r3 + 5], 0 |
| 23677 | pmaddubsw m3, m1, m6 |
| 23678 | pmulhrsw m3, m7 |
| 23679 | pslldq m4, 2 |
| 23680 | pinsrw m4, [r4 + 4], 0 |
| 23681 | pmaddubsw m5, m4, m6 |
| 23682 | pmulhrsw m5, m7 |
| 23683 | packuswb m3, m5 |
| 23684 | movu [r0 + 1009 * 16], m3 |
| 23685 | |
| 23686 | ; mode17 [row 25] |
| 23687 | movu m6, [r5 + 28 * 16] |
| 23688 | pslldq m0, 2 |
| 23689 | pinsrb m0, [r3 + 25], 1 |
| 23690 | pinsrb m0, [r3 + 26], 0 |
| 23691 | pmaddubsw m3, m0, m6 |
| 23692 | pmulhrsw m3, m7 |
| 23693 | pslldq m2, 2 |
| 23694 | pinsrb m2, [r3 + 15], 1 |
| 23695 | pinsrb m2, [r3 + 16], 0 |
| 23696 | pmaddubsw m5, m2, m6 |
| 23697 | pmulhrsw m5, m7 |
| 23698 | packuswb m3, m5 |
| 23699 | movu [r0 + 1010 * 16], m3 |
| 23700 | |
| 23701 | pslldq m1, 2 |
| 23702 | pinsrb m1, [r3 + 5], 1 |
| 23703 | pinsrb m1, [r3 + 6], 0 |
| 23704 | pmaddubsw m3, m1, m6 |
| 23705 | pmulhrsw m3, m7 |
| 23706 | pslldq m4, 2 |
| 23707 | pinsrw m4, [r4 + 3], 0 |
| 23708 | pmaddubsw m5, m4, m6 |
| 23709 | pmulhrsw m5, m7 |
| 23710 | packuswb m3, m5 |
| 23711 | movu [r0 + 1011 * 16], m3 |
| 23712 | |
| 23713 | ; mode17 [row 26] |
| 23714 | movu m6, [r5 + 2 * 16] |
| 23715 | pmaddubsw m3, m0, m6 |
| 23716 | pmulhrsw m3, m7 |
| 23717 | pmaddubsw m5, m2, m6 |
| 23718 | pmulhrsw m5, m7 |
| 23719 | packuswb m3, m5 |
| 23720 | movu [r0 + 1012 * 16], m3 |
| 23721 | |
| 23722 | pmaddubsw m3, m1, m6 |
| 23723 | pmulhrsw m3, m7 |
| 23724 | pmaddubsw m5, m4, m6 |
| 23725 | pmulhrsw m5, m7 |
| 23726 | packuswb m3, m5 |
| 23727 | movu [r0 + 1013 * 16], m3 |
| 23728 | |
| 23729 | ; mode17 [row 27] |
| 23730 | movu m6, [r5 + 8 * 16] |
| 23731 | pslldq m0, 2 |
| 23732 | pinsrb m0, [r3 + 26], 1 |
| 23733 | pinsrb m0, [r3 + 27], 0 |
| 23734 | pmaddubsw m3, m0, m6 |
| 23735 | pmulhrsw m3, m7 |
| 23736 | pslldq m2, 2 |
| 23737 | pinsrb m2, [r3 + 16], 1 |
| 23738 | pinsrb m2, [r3 + 17], 0 |
| 23739 | pmaddubsw m5, m2, m6 |
| 23740 | pmulhrsw m5, m7 |
| 23741 | packuswb m3, m5 |
| 23742 | movu [r0 + 1014 * 16], m3 |
| 23743 | |
| 23744 | pslldq m1, 2 |
| 23745 | pinsrb m1, [r3 + 6], 1 |
| 23746 | pinsrb m1, [r3 + 7], 0 |
| 23747 | pmaddubsw m3, m1, m6 |
| 23748 | pmulhrsw m3, m7 |
| 23749 | pslldq m4, 2 |
| 23750 | pinsrw m4, [r4 + 2], 0 |
| 23751 | pmaddubsw m5, m4, m6 |
| 23752 | pmulhrsw m5, m7 |
| 23753 | packuswb m3, m5 |
| 23754 | movu [r0 + 1015 * 16], m3 |
| 23755 | |
| 23756 | ; mode17 [row 28] |
| 23757 | movu m6, [r5 + 14 * 16] |
| 23758 | pslldq m0, 2 |
| 23759 | pinsrb m0, [r3 + 27], 1 |
| 23760 | pinsrb m0, [r3 + 28], 0 |
| 23761 | pmaddubsw m3, m0, m6 |
| 23762 | pmulhrsw m3, m7 |
| 23763 | pslldq m2, 2 |
| 23764 | pinsrb m2, [r3 + 17], 1 |
| 23765 | pinsrb m2, [r3 + 18], 0 |
| 23766 | pmaddubsw m5, m2, m6 |
| 23767 | pmulhrsw m5, m7 |
| 23768 | packuswb m3, m5 |
| 23769 | movu [r0 + 1016 * 16], m3 |
| 23770 | |
| 23771 | pslldq m1, 2 |
| 23772 | pinsrb m1, [r3 + 7], 1 |
| 23773 | pinsrb m1, [r3 + 9], 0 |
| 23774 | pmaddubsw m3, m1, m6 |
| 23775 | pmulhrsw m3, m7 |
| 23776 | pslldq m4, 2 |
| 23777 | pinsrw m4, [r4 + 1], 0 |
| 23778 | pmaddubsw m5, m4, m6 |
| 23779 | pmulhrsw m5, m7 |
| 23780 | packuswb m3, m5 |
| 23781 | movu [r0 + 1017 * 16], m3 |
| 23782 | |
| 23783 | ; mode17 [row 29] |
| 23784 | movu m6, [r5 + 20 * 16] |
| 23785 | pslldq m0, 2 |
| 23786 | pinsrb m0, [r3 + 28], 1 |
| 23787 | pinsrb m0, [r3 + 30], 0 |
| 23788 | pmaddubsw m3, m0, m6 |
| 23789 | pmulhrsw m3, m7 |
| 23790 | pslldq m2, 2 |
| 23791 | pinsrb m2, [r3 + 18], 1 |
| 23792 | pinsrb m2, [r3 + 20], 0 |
| 23793 | pmaddubsw m5, m2, m6 |
| 23794 | pmulhrsw m5, m7 |
| 23795 | packuswb m3, m5 |
| 23796 | movu [r0 + 1018 * 16], m3 |
| 23797 | |
| 23798 | pslldq m1, 2 |
| 23799 | pinsrb m1, [r3 + 9], 1 |
| 23800 | pinsrb m1, [r3 + 10], 0 |
| 23801 | pmaddubsw m3, m1, m6 |
| 23802 | pmulhrsw m3, m7 |
| 23803 | pslldq m4, 2 |
| 23804 | pinsrw m4, [r4 + 0], 0 |
| 23805 | pmaddubsw m5, m4, m6 |
| 23806 | pmulhrsw m5, m7 |
| 23807 | packuswb m3, m5 |
| 23808 | movu [r0 + 1019 * 16], m3 |
| 23809 | |
| 23810 | ; mode17 [row 30] |
| 23811 | movu m6, [r5 + 26 * 16] |
| 23812 | pslldq m0, 2 |
| 23813 | pinsrb m0, [r3 + 30], 1 |
| 23814 | pinsrb m0, [r3 + 31], 0 |
| 23815 | pmaddubsw m3, m0, m6 |
| 23816 | pmulhrsw m3, m7 |
| 23817 | pslldq m2, 2 |
| 23818 | pinsrb m2, [r3 + 20], 1 |
| 23819 | pinsrb m2, [r3 + 21], 0 |
| 23820 | pmaddubsw m5, m2, m6 |
| 23821 | pmulhrsw m5, m7 |
| 23822 | packuswb m3, m5 |
| 23823 | movu [r0 + 1020 * 16], m3 |
| 23824 | |
| 23825 | pslldq m1, 2 |
| 23826 | pinsrb m1, [r3 + 10], 1 |
| 23827 | pinsrb m1, [r3 + 11], 0 |
| 23828 | pmaddubsw m3, m1, m6 |
| 23829 | pmulhrsw m3, m7 |
| 23830 | pslldq m4, 2 |
| 23831 | pinsrb m4, [r4 + 0], 1 |
| 23832 | pinsrb m4, [r3 + 1], 0 |
| 23833 | pmaddubsw m5, m4, m6 |
| 23834 | pmulhrsw m5, m7 |
| 23835 | packuswb m3, m5 |
| 23836 | movu [r0 + 1021 * 16], m3 |
| 23837 | |
| 23838 | ; mode17 [row 31]: no weights loaded; m0/m2/m1/m4 from row 30 are shuffled through tab_S2 and stored 8 bytes each |
| 23839 | pshufb m5, m0, [tab_S2] |
| 23840 | movh [r0 + 1022 * 16], m5 |
| 23841 | pshufb m5, m2, [tab_S2] |
| 23842 | movh [r0 + 1022 * 16 + 8], m5 |
| 23843 | pshufb m5, m1, [tab_S2] |
| 23844 | movh [r0 + 1023 * 16], m5 |
| 23845 | pshufb m5, m4, [tab_S2] |
| 23846 | movh [r0 + 1023 * 16 + 8], m5 |
| 23847 | |
| 23848 | ; mode 18 [row 0]: plain copy of [r3 .. r3 + 31]; m0 holds bytes 0-15, m1 bytes 16-31, stored at r0 + 1024*16 and r0 + 1025*16 |
| 23849 | movu m0, [r3] |
| 23850 | movu [r0 + 1024 * 16], m0 |
| 23851 | movu m1, [r3 + 16] |
| 23852 | movu [r0 + 1025 * 16], m1 |
| 23853 | |
| 23854 | ; mode 18 [row 1]: row 0 shifted up one byte; byte 0 <- [r4 + 1], byte 16 <- [r3 + 15] (the byte m0 just shifted out) |
| 23855 | pslldq m0, 1 |
| 23856 | pinsrb m0, [r4 + 1], 0 |
| 23857 | movu [r0 + 1026 * 16], m0 |
| 23858 | pslldq m1, 1 |
| 23859 | pinsrb m1, [r3 + 15], 0 |
| 23860 | movu [r0 + 1027 * 16], m1 |
| 23861 | |
| 23862 | ; mode 18 [row 2]: row 1 shifted up one byte; byte 0 <- [r4 + 2], byte 16 <- [r3 + 14] (the byte m0 just shifted out) |
| 23863 | pslldq m0, 1 |
| 23864 | pinsrb m0, [r4 + 2], 0 |
| 23865 | movu [r0 + 1028 * 16], m0 |
| 23866 | pslldq m1, 1 |
| 23867 | pinsrb m1, [r3 + 14], 0 |
| 23868 | movu [r0 + 1029 * 16], m1 |
| 23869 | |
| 23870 | ; mode 18 [row 3]: row 2 shifted up one byte; byte 0 <- [r4 + 3], byte 16 <- [r3 + 13] (the byte m0 just shifted out) |
| 23871 | pslldq m0, 1 |
| 23872 | pinsrb m0, [r4 + 3], 0 |
| 23873 | movu [r0 + 1030 * 16], m0 |
| 23874 | pslldq m1, 1 |
| 23875 | pinsrb m1, [r3 + 13], 0 |
| 23876 | movu [r0 + 1031 * 16], m1 |
| 23877 | |
| 23878 | ; mode 18 [row 4]: row 3 shifted up one byte; byte 0 <- [r4 + 4], byte 16 <- [r3 + 12] (the byte m0 just shifted out) |
| 23879 | pslldq m0, 1 |
| 23880 | pinsrb m0, [r4 + 4], 0 |
| 23881 | movu [r0 + 1032 * 16], m0 |
| 23882 | pslldq m1, 1 |
| 23883 | pinsrb m1, [r3 + 12], 0 |
| 23884 | movu [r0 + 1033 * 16], m1 |
| 23885 | |
| 23886 | ; mode 18 [row 5]: row 4 shifted up one byte; byte 0 <- [r4 + 5], byte 16 <- [r3 + 11] (the byte m0 just shifted out) |
| 23887 | pslldq m0, 1 |
| 23888 | pinsrb m0, [r4 + 5], 0 |
| 23889 | movu [r0 + 1034 * 16], m0 |
| 23890 | pslldq m1, 1 |
| 23891 | pinsrb m1, [r3 + 11], 0 |
| 23892 | movu [r0 + 1035 * 16], m1 |
| 23893 | |
| 23894 | ; mode 18 [row 6]: row 5 shifted up one byte; byte 0 <- [r4 + 6], byte 16 <- [r3 + 10] (the byte m0 just shifted out) |
| 23895 | pslldq m0, 1 |
| 23896 | pinsrb m0, [r4 + 6], 0 |
| 23897 | movu [r0 + 1036 * 16], m0 |
| 23898 | pslldq m1, 1 |
| 23899 | pinsrb m1, [r3 + 10], 0 |
| 23900 | movu [r0 + 1037 * 16], m1 |
| 23901 | |
| 23902 | ; mode 18 [row 7]: row 6 shifted up one byte; byte 0 <- [r4 + 7], byte 16 <- [r3 + 9] (the byte m0 just shifted out) |
| 23903 | pslldq m0, 1 |
| 23904 | pinsrb m0, [r4 + 7], 0 |
| 23905 | movu [r0 + 1038 * 16], m0 |
| 23906 | pslldq m1, 1 |
| 23907 | pinsrb m1, [r3 + 9], 0 |
| 23908 | movu [r0 + 1039 * 16], m1 |
| 23909 | |
| 23910 | ; mode 18 [row 8]: row 7 shifted up one byte; byte 0 <- [r4 + 8], byte 16 <- [r3 + 8] (the byte m0 just shifted out) |
| 23911 | pslldq m0, 1 |
| 23912 | pinsrb m0, [r4 + 8], 0 |
| 23913 | movu [r0 + 1040 * 16], m0 |
| 23914 | pslldq m1, 1 |
| 23915 | pinsrb m1, [r3 + 8], 0 |
| 23916 | movu [r0 + 1041 * 16], m1 |
| 23917 | |
| 23918 | ; mode 18 [row 9]: row 8 shifted up one byte; byte 0 <- [r4 + 9], byte 16 <- [r3 + 7] (the byte m0 just shifted out) |
| 23919 | pslldq m0, 1 |
| 23920 | pinsrb m0, [r4 + 9], 0 |
| 23921 | movu [r0 + 1042 * 16], m0 |
| 23922 | pslldq m1, 1 |
| 23923 | pinsrb m1, [r3 + 7], 0 |
| 23924 | movu [r0 + 1043 * 16], m1 |
| 23925 | |
| 23926 | ; mode 18 [row 10]: row 9 shifted up one byte; byte 0 <- [r4 + 10], byte 16 <- [r3 + 6] (the byte m0 just shifted out) |
| 23927 | pslldq m0, 1 |
| 23928 | pinsrb m0, [r4 + 10], 0 |
| 23929 | movu [r0 + 1044 * 16], m0 |
| 23930 | pslldq m1, 1 |
| 23931 | pinsrb m1, [r3 + 6], 0 |
| 23932 | movu [r0 + 1045 * 16], m1 |
| 23933 | |
| 23934 | ; mode 18 [row 11]: row 10 shifted up one byte; byte 0 <- [r4 + 11], byte 16 <- [r3 + 5] (the byte m0 just shifted out) |
| 23935 | pslldq m0, 1 |
| 23936 | pinsrb m0, [r4 + 11], 0 |
| 23937 | movu [r0 + 1046 * 16], m0 |
| 23938 | pslldq m1, 1 |
| 23939 | pinsrb m1, [r3 + 5], 0 |
| 23940 | movu [r0 + 1047 * 16], m1 |
| 23941 | |
| 23942 | ; mode 18 [row 12]: row 11 shifted up one byte; byte 0 <- [r4 + 12], byte 16 <- [r3 + 4] (the byte m0 just shifted out) |
| 23943 | pslldq m0, 1 |
| 23944 | pinsrb m0, [r4 + 12], 0 |
| 23945 | movu [r0 + 1048 * 16], m0 |
| 23946 | pslldq m1, 1 |
| 23947 | pinsrb m1, [r3 + 4], 0 |
| 23948 | movu [r0 + 1049 * 16], m1 |
| 23949 | |
| 23950 | ; mode 18 [row 13]: row 12 shifted up one byte; byte 0 <- [r4 + 13], byte 16 <- [r3 + 3] (the byte m0 just shifted out) |
| 23951 | pslldq m0, 1 |
| 23952 | pinsrb m0, [r4 + 13], 0 |
| 23953 | movu [r0 + 1050 * 16], m0 |
| 23954 | pslldq m1, 1 |
| 23955 | pinsrb m1, [r3 + 3], 0 |
| 23956 | movu [r0 + 1051 * 16], m1 |
| 23957 | |
| 23958 | ; mode 18 [row 14]: row 13 shifted up one byte; byte 0 <- [r4 + 14], byte 16 <- [r3 + 2] (the byte m0 just shifted out) |
| 23959 | pslldq m0, 1 |
| 23960 | pinsrb m0, [r4 + 14], 0 |
| 23961 | movu [r0 + 1052 * 16], m0 |
| 23962 | pslldq m1, 1 |
| 23963 | pinsrb m1, [r3 + 2], 0 |
| 23964 | movu [r0 + 1053 * 16], m1 |
| 23965 | |
| 23966 | ; mode 18 [row 15]: row 14 shifted up one byte; byte 0 <- [r4 + 15], byte 16 <- [r3 + 1] (the byte m0 just shifted out) |
| 23967 | pslldq m0, 1 |
| 23968 | pinsrb m0, [r4 + 15], 0 |
| 23969 | movu [r0 + 1054 * 16], m0 |
| 23970 | pslldq m1, 1 |
| 23971 | pinsrb m1, [r3 + 1], 0 |
| 23972 | movu [r0 + 1055 * 16], m1 |
| 23973 | |
| 23974 | ; mode 18 [row 16]: row 15 shifted up one byte; byte 0 <- [r4 + 16], byte 16 <- [r3 + 0] (last r3 byte left in m0) |
| 23975 | pslldq m0, 1 |
| 23976 | pinsrb m0, [r4 + 16], 0 |
| 23977 | movu [r0 + 1056 * 16], m0 |
| 23978 | pslldq m1, 1 |
| 23979 | pinsrb m1, [r3 + 0], 0 |
| 23980 | movu [r0 + 1057 * 16], m1 |
| 23981 | |
| 23982 | ; mode 18 [row 17]: row 16 shifted up one byte; byte 0 <- [r4 + 17], byte 16 <- [r4 + 1] (m0 now drops r4 bytes) |
| 23983 | pslldq m0, 1 |
| 23984 | pinsrb m0, [r4 + 17], 0 |
| 23985 | movu [r0 + 1058 * 16], m0 |
| 23986 | pslldq m1, 1 |
| 23987 | pinsrb m1, [r4 + 1], 0 |
| 23988 | movu [r0 + 1059 * 16], m1 |
| 23989 | |
| 23990 | ; mode 18 [row 18]: row 17 shifted up one byte; byte 0 <- [r4 + 18], byte 16 <- [r4 + 2] (the byte m0 just shifted out) |
| 23991 | pslldq m0, 1 |
| 23992 | pinsrb m0, [r4 + 18], 0 |
| 23993 | movu [r0 + 1060 * 16], m0 |
| 23994 | pslldq m1, 1 |
| 23995 | pinsrb m1, [r4 + 2], 0 |
| 23996 | movu [r0 + 1061 * 16], m1 |
| 23997 | |
| 23998 | ; mode 18 [row 19]: row 18 shifted up one byte; byte 0 <- [r4 + 19], byte 16 <- [r4 + 3] (the byte m0 just shifted out) |
| 23999 | pslldq m0, 1 |
| 24000 | pinsrb m0, [r4 + 19], 0 |
| 24001 | movu [r0 + 1062 * 16], m0 |
| 24002 | pslldq m1, 1 |
| 24003 | pinsrb m1, [r4 + 3], 0 |
| 24004 | movu [r0 + 1063 * 16], m1 |
| 24005 | |
| 24006 | ; mode 18 [row 20]: row 19 shifted up one byte; byte 0 <- [r4 + 20], byte 16 <- [r4 + 4] (the byte m0 just shifted out) |
| 24007 | pslldq m0, 1 |
| 24008 | pinsrb m0, [r4 + 20], 0 |
| 24009 | movu [r0 + 1064 * 16], m0 |
| 24010 | pslldq m1, 1 |
| 24011 | pinsrb m1, [r4 + 4], 0 |
| 24012 | movu [r0 + 1065 * 16], m1 |
| 24013 | |
| 24014 | ; mode 18 [row 21]: row 20 shifted up one byte; byte 0 <- [r4 + 21], byte 16 <- [r4 + 5] (the byte m0 just shifted out) |
| 24015 | pslldq m0, 1 |
| 24016 | pinsrb m0, [r4 + 21], 0 |
| 24017 | movu [r0 + 1066 * 16], m0 |
| 24018 | pslldq m1, 1 |
| 24019 | pinsrb m1, [r4 + 5], 0 |
| 24020 | movu [r0 + 1067 * 16], m1 |
| 24021 | |
| 24022 | ; mode 18 [row 22]: row 21 shifted up one byte; byte 0 <- [r4 + 22], byte 16 <- [r4 + 6] (the byte m0 just shifted out) |
| 24023 | pslldq m0, 1 |
| 24024 | pinsrb m0, [r4 + 22], 0 |
| 24025 | movu [r0 + 1068 * 16], m0 |
| 24026 | pslldq m1, 1 |
| 24027 | pinsrb m1, [r4 + 6], 0 |
| 24028 | movu [r0 + 1069 * 16], m1 |
| 24029 | |
| 24030 | ; mode 18 [row 23]: row 22 shifted up one byte; byte 0 <- [r4 + 23], byte 16 <- [r4 + 7] (the byte m0 just shifted out) |
| 24031 | pslldq m0, 1 |
| 24032 | pinsrb m0, [r4 + 23], 0 |
| 24033 | movu [r0 + 1070 * 16], m0 |
| 24034 | pslldq m1, 1 |
| 24035 | pinsrb m1, [r4 + 7], 0 |
| 24036 | movu [r0 + 1071 * 16], m1 |
| 24037 | |
| 24038 | ; mode 18 [row 24]: row 23 shifted up one byte; byte 0 <- [r4 + 24], byte 16 <- [r4 + 8] (the byte m0 just shifted out) |
| 24039 | pslldq m0, 1 |
| 24040 | pinsrb m0, [r4 + 24], 0 |
| 24041 | movu [r0 + 1072 * 16], m0 |
| 24042 | pslldq m1, 1 |
| 24043 | pinsrb m1, [r4 + 8], 0 |
| 24044 | movu [r0 + 1073 * 16], m1 |
| 24045 | |
| 24046 | ; mode 18 [row 25]: row 24 shifted up one byte; byte 0 <- [r4 + 25], byte 16 <- [r4 + 9] (the byte m0 just shifted out) |
| 24047 | pslldq m0, 1 |
| 24048 | pinsrb m0, [r4 + 25], 0 |
| 24049 | movu [r0 + 1074 * 16], m0 |
| 24050 | pslldq m1, 1 |
| 24051 | pinsrb m1, [r4 + 9], 0 |
| 24052 | movu [r0 + 1075 * 16], m1 |
| 24053 | |
| 24054 | ; mode 18 [row 26]: row 25 shifted up one byte; byte 0 <- [r4 + 26], byte 16 <- [r4 + 10] (the byte m0 just shifted out) |
| 24055 | pslldq m0, 1 |
| 24056 | pinsrb m0, [r4 + 26], 0 |
| 24057 | movu [r0 + 1076 * 16], m0 |
| 24058 | pslldq m1, 1 |
| 24059 | pinsrb m1, [r4 + 10], 0 |
| 24060 | movu [r0 + 1077 * 16], m1 |
| 24061 | |
| 24062 | ; mode 18 [row 27]: row 26 shifted up one byte; byte 0 <- [r4 + 27], byte 16 <- [r4 + 11] (the byte m0 just shifted out) |
| 24063 | pslldq m0, 1 |
| 24064 | pinsrb m0, [r4 + 27], 0 |
| 24065 | movu [r0 + 1078 * 16], m0 |
| 24066 | pslldq m1, 1 |
| 24067 | pinsrb m1, [r4 + 11], 0 |
| 24068 | movu [r0 + 1079 * 16], m1 |
| 24069 | |
| 24070 | ; mode 18 [row 28]: row 27 shifted up one byte; byte 0 <- [r4 + 28], byte 16 <- [r4 + 12] (the byte m0 just shifted out) |
| 24071 | pslldq m0, 1 |
| 24072 | pinsrb m0, [r4 + 28], 0 |
| 24073 | movu [r0 + 1080 * 16], m0 |
| 24074 | pslldq m1, 1 |
| 24075 | pinsrb m1, [r4 + 12], 0 |
| 24076 | movu [r0 + 1081 * 16], m1 |
| 24077 | |
| 24078 | ; mode 18 [row 29]: row 28 shifted up one byte; byte 0 <- [r4 + 29], byte 16 <- [r4 + 13] (the byte m0 just shifted out) |
| 24079 | pslldq m0, 1 |
| 24080 | pinsrb m0, [r4 + 29], 0 |
| 24081 | movu [r0 + 1082 * 16], m0 |
| 24082 | pslldq m1, 1 |
| 24083 | pinsrb m1, [r4 + 13], 0 |
| 24084 | movu [r0 + 1083 * 16], m1 |
| 24085 | |
| 24086 | ; mode 18 [row 30]: row 29 shifted up one byte; byte 0 <- [r4 + 30], byte 16 <- [r4 + 14] (the byte m0 just shifted out) |
| 24087 | pslldq m0, 1 |
| 24088 | pinsrb m0, [r4 + 30], 0 |
| 24089 | movu [r0 + 1084 * 16], m0 |
| 24090 | pslldq m1, 1 |
| 24091 | pinsrb m1, [r4 + 14], 0 |
| 24092 | movu [r0 + 1085 * 16], m1 |
| 24093 | |
| 24094 | ; mode 18 [row 31]: row 30 shifted up one byte; byte 0 <- [r4 + 31], byte 16 <- [r4 + 15] (the byte m0 just shifted out) |
| 24095 | pslldq m0, 1 |
| 24096 | pinsrb m0, [r4 + 31], 0 |
| 24097 | movu [r0 + 1086 * 16], m0 |
| 24098 | pslldq m1, 1 |
| 24099 | pinsrb m1, [r4 + 15], 0 |
| 24100 | movu [r0 + 1087 * 16], m1 |
| 24101 | |
| 24102 | ; mode 19 [row 0] |
| 24103 | movu m6, [r5 + 6 * 16] |
| 24104 | movu m0, [r3 ] |
| 24105 | movu m1, [r3 + 1 ] |
| 24106 | punpcklbw m0, m1 |
| 24107 | pmaddubsw m1, m0, m6 |
| 24108 | pmulhrsw m1, m7 |
| 24109 | movu m2, [r3 + 8] |
| 24110 | movu m3, [r3 + 9] |
| 24111 | punpcklbw m2, m3 |
| 24112 | pmaddubsw m3, m2, m6 |
| 24113 | pmulhrsw m3, m7 |
| 24114 | packuswb m1, m3 |
| 24115 | movu [r0 + 1088 * 16], m1 |
| 24116 | |
| 24117 | movu m1, [r3 + 16] |
| 24118 | movu m3, [r3 + 17] |
| 24119 | punpcklbw m1, m3 |
| 24120 | pmaddubsw m4, m1, m6 |
| 24121 | pmulhrsw m4, m7 |
| 24122 | movu m3, [r3 + 24] |
| 24123 | movu m5, [r3 + 25] |
| 24124 | punpcklbw m3, m5 |
| 24125 | pmaddubsw m5, m3, m6 |
| 24126 | pmulhrsw m5, m7 |
| 24127 | packuswb m4, m5 |
| 24128 | movu [r0 + 1089 * 16], m4 |
| 24129 | |
| 24130 | ; mode 19 [row 1] |
| 24131 | movu m6, [r5 + 12 * 16] |
| 24132 | pslldq m0, 2 |
| 24133 | pinsrb m0, [r4 + 0], 1 |
| 24134 | pinsrb m0, [r4 + 1], 0 |
| 24135 | pmaddubsw m4, m0, m6 |
| 24136 | pmulhrsw m4, m7 |
| 24137 | pslldq m2, 2 |
| 24138 | pinsrw m2, [r3 + 7], 0 |
| 24139 | pmaddubsw m5, m2, m6 |
| 24140 | pmulhrsw m5, m7 |
| 24141 | packuswb m4, m5 |
| 24142 | movu [r0 + 1090 * 16], m4 |
| 24143 | pslldq m1, 2 |
| 24144 | pinsrw m1, [r3 + 15], 0 |
| 24145 | pmaddubsw m4, m1, m6 |
| 24146 | pmulhrsw m4, m7 |
| 24147 | pslldq m3, 2 |
| 24148 | pinsrw m3, [r3 + 23], 0 |
| 24149 | pmaddubsw m5, m3, m6 |
| 24150 | pmulhrsw m5, m7 |
| 24151 | packuswb m4, m5 |
| 24152 | movu [r0 + 1091 * 16], m4 |
| 24153 | |
| 24154 | ; mode 19 [row 2] |
| 24155 | movu m6, [r5 + 18 * 16] |
| 24156 | pslldq m0, 2 |
| 24157 | pinsrb m0, [r4 + 1], 1 |
| 24158 | pinsrb m0, [r4 + 2], 0 |
| 24159 | pmaddubsw m4, m0, m6 |
| 24160 | pmulhrsw m4, m7 |
| 24161 | pslldq m2, 2 |
| 24162 | pinsrw m2, [r3 + 6], 0 |
| 24163 | pmaddubsw m5, m2, m6 |
| 24164 | pmulhrsw m5, m7 |
| 24165 | packuswb m4, m5 |
| 24166 | movu [r0 + 1092 * 16], m4 |
| 24167 | pslldq m1, 2 |
| 24168 | pinsrw m1, [r3 + 14], 0 |
| 24169 | pmaddubsw m4, m1, m6 |
| 24170 | pmulhrsw m4, m7 |
| 24171 | pslldq m3, 2 |
| 24172 | pinsrw m3, [r3 + 22], 0 |
| 24173 | pmaddubsw m5, m3, m6 |
| 24174 | pmulhrsw m5, m7 |
| 24175 | packuswb m4, m5 |
| 24176 | movu [r0 + 1093 * 16], m4 |
| 24177 | |
| 24178 | ; mode 19 [row 3] |
| 24179 | movu m6, [r5 + 24 * 16] |
| 24180 | pslldq m0, 2 |
| 24181 | pinsrb m0, [r4 + 2], 1 |
| 24182 | pinsrb m0, [r4 + 4], 0 |
| 24183 | pmaddubsw m4, m0, m6 |
| 24184 | pmulhrsw m4, m7 |
| 24185 | pslldq m2, 2 |
| 24186 | pinsrw m2, [r3 + 5], 0 |
| 24187 | pmaddubsw m5, m2, m6 |
| 24188 | pmulhrsw m5, m7 |
| 24189 | packuswb m4, m5 |
| 24190 | movu [r0 + 1094 * 16], m4 |
| 24191 | pslldq m1, 2 |
| 24192 | pinsrw m1, [r3 + 13], 0 |
| 24193 | pmaddubsw m4, m1, m6 |
| 24194 | pmulhrsw m4, m7 |
| 24195 | pslldq m3, 2 |
| 24196 | pinsrw m3, [r3 + 21], 0 |
| 24197 | pmaddubsw m5, m3, m6 |
| 24198 | pmulhrsw m5, m7 |
| 24199 | packuswb m4, m5 |
| 24200 | movu [r0 + 1095 * 16], m4 |
| 24201 | |
| 24202 | ; mode 19 [row 4] |
| 24203 | movu m6, [r5 + 30 * 16] |
| 24204 | pslldq m0, 2 |
| 24205 | pinsrb m0, [r4 + 4], 1 |
| 24206 | pinsrb m0, [r4 + 5], 0 |
| 24207 | pmaddubsw m4, m0, m6 |
| 24208 | pmulhrsw m4, m7 |
| 24209 | pslldq m2, 2 |
| 24210 | pinsrw m2, [r3 + 4], 0 |
| 24211 | pmaddubsw m5, m2, m6 |
| 24212 | pmulhrsw m5, m7 |
| 24213 | packuswb m4, m5 |
| 24214 | movu [r0 + 1096 * 16], m4 |
| 24215 | pslldq m1, 2 |
| 24216 | pinsrw m1, [r3 + 12], 0 |
| 24217 | pmaddubsw m4, m1, m6 |
| 24218 | pmulhrsw m4, m7 |
| 24219 | pslldq m3, 2 |
| 24220 | pinsrw m3, [r3 + 20], 0 |
| 24221 | pmaddubsw m5, m3, m6 |
| 24222 | pmulhrsw m5, m7 |
| 24223 | packuswb m4, m5 |
| 24224 | movu [r0 + 1097 * 16], m4 |
| 24225 | |
| 24226 | ; mode 19 [row 5] |
| 24227 | movu m6, [r5 + 4 * 16] |
| 24228 | pmaddubsw m4, m0, m6 |
| 24229 | pmulhrsw m4, m7 |
| 24230 | pmaddubsw m5, m2, m6 |
| 24231 | pmulhrsw m5, m7 |
| 24232 | packuswb m4, m5 |
| 24233 | movu [r0 + 1098 * 16], m4 |
| 24234 | pmaddubsw m4, m1, m6 |
| 24235 | pmulhrsw m4, m7 |
| 24236 | pmaddubsw m5, m3, m6 |
| 24237 | pmulhrsw m5, m7 |
| 24238 | packuswb m4, m5 |
| 24239 | movu [r0 + 1099 * 16], m4 |
| 24240 | |
| 24241 | ; mode 19 [row 6] |
| 24242 | movu m6, [r5 + 10 * 16] |
| 24243 | pslldq m0, 2 |
| 24244 | pinsrb m0, [r4 + 5], 1 |
| 24245 | pinsrb m0, [r4 + 6], 0 |
| 24246 | pmaddubsw m4, m0, m6 |
| 24247 | pmulhrsw m4, m7 |
| 24248 | pslldq m2, 2 |
| 24249 | pinsrw m2, [r3 + 3], 0 |
| 24250 | pmaddubsw m5, m2, m6 |
| 24251 | pmulhrsw m5, m7 |
| 24252 | packuswb m4, m5 |
| 24253 | movu [r0 + 1100 * 16], m4 |
| 24254 | pslldq m1, 2 |
| 24255 | pinsrw m1, [r3 + 11], 0 |
| 24256 | pmaddubsw m4, m1, m6 |
| 24257 | pmulhrsw m4, m7 |
| 24258 | pslldq m3, 2 |
| 24259 | pinsrw m3, [r3 + 19], 0 |
| 24260 | pmaddubsw m5, m3, m6 |
| 24261 | pmulhrsw m5, m7 |
| 24262 | packuswb m4, m5 |
| 24263 | movu [r0 + 1101 * 16], m4 |
| 24264 | |
| 24265 | ; mode 19 [row 7] |
| 24266 | movu m6, [r5 + 16 * 16] |
| 24267 | pslldq m0, 2 |
| 24268 | pinsrb m0, [r4 + 6], 1 |
| 24269 | pinsrb m0, [r4 + 7], 0 |
| 24270 | pmaddubsw m4, m0, m6 |
| 24271 | pmulhrsw m4, m7 |
| 24272 | pslldq m2, 2 |
| 24273 | pinsrw m2, [r3 + 2], 0 |
| 24274 | pmaddubsw m5, m2, m6 |
| 24275 | pmulhrsw m5, m7 |
| 24276 | packuswb m4, m5 |
| 24277 | movu [r0 + 1102 * 16], m4 |
| 24278 | pslldq m1, 2 |
| 24279 | pinsrw m1, [r3 + 10], 0 |
| 24280 | pmaddubsw m4, m1, m6 |
| 24281 | pmulhrsw m4, m7 |
| 24282 | pslldq m3, 2 |
| 24283 | pinsrw m3, [r3 + 18], 0 |
| 24284 | pmaddubsw m5, m3, m6 |
| 24285 | pmulhrsw m5, m7 |
| 24286 | packuswb m4, m5 |
| 24287 | movu [r0 + 1103 * 16], m4 |
| 24288 | |
| 24289 | ; mode 19 [row 8] |
| 24290 | movu m6, [r5 + 22 * 16] |
| 24291 | pslldq m0, 2 |
| 24292 | pinsrb m0, [r4 + 7], 1 |
| 24293 | pinsrb m0, [r4 + 9], 0 |
| 24294 | pmaddubsw m4, m0, m6 |
| 24295 | pmulhrsw m4, m7 |
| 24296 | pslldq m2, 2 |
| 24297 | pinsrw m2, [r3 + 1], 0 |
| 24298 | pmaddubsw m5, m2, m6 |
| 24299 | pmulhrsw m5, m7 |
| 24300 | packuswb m4, m5 |
| 24301 | movu [r0 + 1104 * 16], m4 |
| 24302 | pslldq m1, 2 |
| 24303 | pinsrw m1, [r3 + 9], 0 |
| 24304 | pmaddubsw m4, m1, m6 |
| 24305 | pmulhrsw m4, m7 |
| 24306 | pslldq m3, 2 |
| 24307 | pinsrw m3, [r3 + 17], 0 |
| 24308 | pmaddubsw m5, m3, m6 |
| 24309 | pmulhrsw m5, m7 |
| 24310 | packuswb m4, m5 |
| 24311 | movu [r0 + 1105 * 16], m4 |
| 24312 | |
| 24313 | ; mode 19 [row 9] |
| 24314 | movu m6, [r5 + 28 * 16] |
| 24315 | pslldq m0, 2 |
| 24316 | pinsrb m0, [r4 + 9], 1 |
| 24317 | pinsrb m0, [r4 + 10], 0 |
| 24318 | pmaddubsw m4, m0, m6 |
| 24319 | pmulhrsw m4, m7 |
| 24320 | pslldq m2, 2 |
| 24321 | pinsrw m2, [r3 + 0], 0 |
| 24322 | pmaddubsw m5, m2, m6 |
| 24323 | pmulhrsw m5, m7 |
| 24324 | packuswb m4, m5 |
| 24325 | movu [r0 + 1106 * 16], m4 |
| 24326 | pslldq m1, 2 |
| 24327 | pinsrw m1, [r3 + 8], 0 |
| 24328 | pmaddubsw m4, m1, m6 |
| 24329 | pmulhrsw m4, m7 |
| 24330 | pslldq m3, 2 |
| 24331 | pinsrw m3, [r3 + 16], 0 |
| 24332 | pmaddubsw m5, m3, m6 |
| 24333 | pmulhrsw m5, m7 |
| 24334 | packuswb m4, m5 |
| 24335 | movu [r0 + 1107 * 16], m4 |
| 24336 | |
| 24337 | ; mode 19 [row 10] |
| 24338 | movu m6, [r5 + 2 * 16] |
| 24339 | pmaddubsw m4, m0, m6 |
| 24340 | pmulhrsw m4, m7 |
| 24341 | pmaddubsw m5, m2, m6 |
| 24342 | pmulhrsw m5, m7 |
| 24343 | packuswb m4, m5 |
| 24344 | movu [r0 + 1108 * 16], m4 |
| 24345 | pmaddubsw m4, m1, m6 |
| 24346 | pmulhrsw m4, m7 |
| 24347 | pmaddubsw m5, m3, m6 |
| 24348 | pmulhrsw m5, m7 |
| 24349 | packuswb m4, m5 |
| 24350 | movu [r0 + 1109 * 16], m4 |
| 24351 | |
| 24352 | ; mode 19 [row 11] |
| 24353 | movu m6, [r5 + 8 * 16] |
| 24354 | pslldq m0, 2 |
| 24355 | pinsrb m0, [r4 + 10], 1 |
| 24356 | pinsrb m0, [r4 + 11], 0 |
| 24357 | pmaddubsw m4, m0, m6 |
| 24358 | pmulhrsw m4, m7 |
| 24359 | pslldq m2, 2 |
| 24360 | pinsrb m2, [r3 + 0], 1 |
| 24361 | pinsrb m2, [r4 + 1], 0 |
| 24362 | pmaddubsw m5, m2, m6 |
| 24363 | pmulhrsw m5, m7 |
| 24364 | packuswb m4, m5 |
| 24365 | movu [r0 + 1110 * 16], m4 |
| 24366 | pslldq m1, 2 |
| 24367 | pinsrw m1, [r3 + 7], 0 |
| 24368 | pmaddubsw m4, m1, m6 |
| 24369 | pmulhrsw m4, m7 |
| 24370 | pslldq m3, 2 |
| 24371 | pinsrw m3, [r3 + 15], 0 |
| 24372 | pmaddubsw m5, m3, m6 |
| 24373 | pmulhrsw m5, m7 |
| 24374 | packuswb m4, m5 |
| 24375 | movu [r0 + 1111 * 16], m4 |
| 24376 | |
| 24377 | ; mode 19 [row 12] |
| 24378 | movu m6, [r5 + 14 * 16] |
| 24379 | pslldq m0, 2 |
| 24380 | pinsrb m0, [r4 + 11], 1 |
| 24381 | pinsrb m0, [r4 + 12], 0 |
| 24382 | pmaddubsw m4, m0, m6 |
| 24383 | pmulhrsw m4, m7 |
| 24384 | pslldq m2, 2 |
| 24385 | pinsrb m2, [r4 + 1], 1 |
| 24386 | pinsrb m2, [r4 + 2], 0 |
| 24387 | pmaddubsw m5, m2, m6 |
| 24388 | pmulhrsw m5, m7 |
| 24389 | packuswb m4, m5 |
| 24390 | movu [r0 + 1112 * 16], m4 |
| 24391 | pslldq m1, 2 |
| 24392 | pinsrw m1, [r3 + 6], 0 |
| 24393 | pmaddubsw m4, m1, m6 |
| 24394 | pmulhrsw m4, m7 |
| 24395 | pslldq m3, 2 |
| 24396 | pinsrw m3, [r3 + 14], 0 |
| 24397 | pmaddubsw m5, m3, m6 |
| 24398 | pmulhrsw m5, m7 |
| 24399 | packuswb m4, m5 |
| 24400 | movu [r0 + 1113 * 16], m4 |
| 24401 | |
| 24402 | ; mode 19 [row 13] |
| 24403 | movu m6, [r5 + 20 * 16] |
| 24404 | pslldq m0, 2 |
| 24405 | pinsrb m0, [r4 + 12], 1 |
| 24406 | pinsrb m0, [r4 + 14], 0 |
| 24407 | pmaddubsw m4, m0, m6 |
| 24408 | pmulhrsw m4, m7 |
| 24409 | pslldq m2, 2 |
| 24410 | pinsrb m2, [r4 + 2], 1 |
| 24411 | pinsrb m2, [r4 + 4], 0 |
| 24412 | pmaddubsw m5, m2, m6 |
| 24413 | pmulhrsw m5, m7 |
| 24414 | packuswb m4, m5 |
| 24415 | movu [r0 + 1114 * 16], m4 |
| 24416 | pslldq m1, 2 |
| 24417 | pinsrw m1, [r3 + 5], 0 |
| 24418 | pmaddubsw m4, m1, m6 |
| 24419 | pmulhrsw m4, m7 |
| 24420 | pslldq m3, 2 |
| 24421 | pinsrw m3, [r3 + 13], 0 |
| 24422 | pmaddubsw m5, m3, m6 |
| 24423 | pmulhrsw m5, m7 |
| 24424 | packuswb m4, m5 |
| 24425 | movu [r0 + 1115 * 16], m4 |
| 24426 | |
| 24427 | ; mode 19 [row 14] |
| 24428 | movu m6, [r5 + 26 * 16] |
| 24429 | pslldq m0, 2 |
| 24430 | pinsrb m0, [r4 + 14], 1 |
| 24431 | pinsrb m0, [r4 + 15], 0 |
| 24432 | pmaddubsw m4, m0, m6 |
| 24433 | pmulhrsw m4, m7 |
| 24434 | pslldq m2, 2 |
| 24435 | pinsrb m2, [r4 + 4], 1 |
| 24436 | pinsrb m2, [r4 + 5], 0 |
| 24437 | pmaddubsw m5, m2, m6 |
| 24438 | pmulhrsw m5, m7 |
| 24439 | packuswb m4, m5 |
| 24440 | movu [r0 + 1116 * 16], m4 |
| 24441 | pslldq m1, 2 |
| 24442 | pinsrw m1, [r3 + 4], 0 |
| 24443 | pmaddubsw m4, m1, m6 |
| 24444 | pmulhrsw m4, m7 |
| 24445 | pslldq m3, 2 |
| 24446 | pinsrw m3, [r3 + 12], 0 |
| 24447 | pmaddubsw m5, m3, m6 |
| 24448 | pmulhrsw m5, m7 |
| 24449 | packuswb m4, m5 |
| 24450 | movu [r0 + 1117 * 16], m4 |
| 24451 | |
| 24452 | ; mode 19 [row 15]: no weights loaded; m0/m2/m1/m3 from row 14 are shuffled through tab_S2 and stored 8 bytes each |
| 24453 | pshufb m5, m0, [tab_S2] |
| 24454 | movh [r0 + 1118 * 16], m5 |
| 24455 | pshufb m5, m2, [tab_S2] |
| 24456 | movh [r0 + 1118 * 16 + 8], m5 |
| 24457 | pshufb m5, m1, [tab_S2] |
| 24458 | movh [r0 + 1119 * 16], m5 |
| 24459 | pshufb m5, m3, [tab_S2] |
| 24460 | movh [r0 + 1119 * 16 + 8], m5 |
| 24461 | |
| 24462 | ; mode 19 [row 16] |
| 24463 | movu m6, [r5 + 6 * 16] |
| 24464 | pslldq m0, 2 |
| 24465 | pinsrb m0, [r4 + 15], 1 |
| 24466 | pinsrb m0, [r4 + 16], 0 |
| 24467 | pmaddubsw m4, m0, m6 |
| 24468 | pmulhrsw m4, m7 |
| 24469 | pslldq m2, 2 |
| 24470 | pinsrb m2, [r4 + 5], 1 |
| 24471 | pinsrb m2, [r4 + 6], 0 |
| 24472 | pmaddubsw m5, m2, m6 |
| 24473 | pmulhrsw m5, m7 |
| 24474 | packuswb m4, m5 |
| 24475 | movu [r0 + 1120 * 16], m4 |
| 24476 | pslldq m1, 2 |
| 24477 | pinsrw m1, [r3 + 3], 0 |
| 24478 | pmaddubsw m4, m1, m6 |
| 24479 | pmulhrsw m4, m7 |
| 24480 | pslldq m3, 2 |
| 24481 | pinsrw m3, [r3 + 11], 0 |
| 24482 | pmaddubsw m5, m3, m6 |
| 24483 | pmulhrsw m5, m7 |
| 24484 | packuswb m4, m5 |
| 24485 | movu [r0 + 1121 * 16], m4 |
| 24486 | |
| 24487 | ; mode 19 [row 17] |
| 24488 | movu m6, [r5 + 12 * 16] |
| 24489 | pslldq m0, 2 |
| 24490 | pinsrb m0, [r4 + 16], 1 |
| 24491 | pinsrb m0, [r4 + 17], 0 |
| 24492 | pmaddubsw m4, m0, m6 |
| 24493 | pmulhrsw m4, m7 |
| 24494 | pslldq m2, 2 |
| 24495 | pinsrb m2, [r4 + 6], 1 |
| 24496 | pinsrb m2, [r4 + 7], 0 |
| 24497 | pmaddubsw m5, m2, m6 |
| 24498 | pmulhrsw m5, m7 |
| 24499 | packuswb m4, m5 |
| 24500 | movu [r0 + 1122 * 16], m4 |
| 24501 | pslldq m1, 2 |
| 24502 | pinsrw m1, [r3 + 2], 0 |
| 24503 | pmaddubsw m4, m1, m6 |
| 24504 | pmulhrsw m4, m7 |
| 24505 | pslldq m3, 2 |
| 24506 | pinsrw m3, [r3 + 10], 0 |
| 24507 | pmaddubsw m5, m3, m6 |
| 24508 | pmulhrsw m5, m7 |
| 24509 | packuswb m4, m5 |
| 24510 | movu [r0 + 1123 * 16], m4 |
| 24511 | |
| 24512 | ; mode 19 [row 18] |
| 24513 | movu m6, [r5 + 18 * 16] |
| 24514 | pslldq m0, 2 |
| 24515 | pinsrb m0, [r4 + 17], 1 |
| 24516 | pinsrb m0, [r4 + 18], 0 |
| 24517 | pmaddubsw m4, m0, m6 |
| 24518 | pmulhrsw m4, m7 |
| 24519 | pslldq m2, 2 |
| 24520 | pinsrb m2, [r4 + 7], 1 |
| 24521 | pinsrb m2, [r4 + 9], 0 |
| 24522 | pmaddubsw m5, m2, m6 |
| 24523 | pmulhrsw m5, m7 |
| 24524 | packuswb m4, m5 |
| 24525 | movu [r0 + 1124 * 16], m4 |
| 24526 | pslldq m1, 2 |
| 24527 | pinsrw m1, [r3 + 1], 0 |
| 24528 | pmaddubsw m4, m1, m6 |
| 24529 | pmulhrsw m4, m7 |
| 24530 | pslldq m3, 2 |
| 24531 | pinsrw m3, [r3 + 9], 0 |
| 24532 | pmaddubsw m5, m3, m6 |
| 24533 | pmulhrsw m5, m7 |
| 24534 | packuswb m4, m5 |
| 24535 | movu [r0 + 1125 * 16], m4 |
| 24536 | |
| 24537 | ; mode 19 [row 19] |
| 24538 | movu m6, [r5 + 24 * 16] |
| 24539 | pslldq m0, 2 |
| 24540 | pinsrb m0, [r4 + 18], 1 |
| 24541 | pinsrb m0, [r4 + 20], 0 |
| 24542 | pmaddubsw m4, m0, m6 |
| 24543 | pmulhrsw m4, m7 |
| 24544 | pslldq m2, 2 |
| 24545 | pinsrb m2, [r4 + 9], 1 |
| 24546 | pinsrb m2, [r4 + 10], 0 |
| 24547 | pmaddubsw m5, m2, m6 |
| 24548 | pmulhrsw m5, m7 |
| 24549 | packuswb m4, m5 |
| 24550 | movu [r0 + 1126 * 16], m4 |
| 24551 | pslldq m1, 2 |
| 24552 | pinsrw m1, [r3 + 0], 0 |
| 24553 | pmaddubsw m4, m1, m6 |
| 24554 | pmulhrsw m4, m7 |
| 24555 | pslldq m3, 2 |
| 24556 | pinsrw m3, [r3 + 8], 0 |
| 24557 | pmaddubsw m5, m3, m6 |
| 24558 | pmulhrsw m5, m7 |
| 24559 | packuswb m4, m5 |
| 24560 | movu [r0 + 1127 * 16], m4 |
| 24561 | |
| 24562 | ; mode 19 [row 20] |
| 24563 | movu m6, [r5 + 30 * 16] |
| 24564 | pslldq m0, 2 |
| 24565 | pinsrb m0, [r4 + 20], 1 |
| 24566 | pinsrb m0, [r4 + 21], 0 |
| 24567 | pmaddubsw m4, m0, m6 |
| 24568 | pmulhrsw m4, m7 |
| 24569 | pslldq m2, 2 |
| 24570 | pinsrb m2, [r4 + 10], 1 |
| 24571 | pinsrb m2, [r4 + 11], 0 |
| 24572 | pmaddubsw m5, m2, m6 |
| 24573 | pmulhrsw m5, m7 |
| 24574 | packuswb m4, m5 |
| 24575 | movu [r0 + 1128 * 16], m4 |
| 24576 | pslldq m1, 2 |
| 24577 | pinsrb m1, [r4 + 0], 1 |
| 24578 | pinsrb m1, [r4 + 1], 0 |
| 24579 | pmaddubsw m4, m1, m6 |
| 24580 | pmulhrsw m4, m7 |
| 24581 | pslldq m3, 2 |
| 24582 | pinsrb m3, [r3 + 8], 1 |
| 24583 | pinsrb m3, [r3 + 7], 0 |
| 24584 | pmaddubsw m5, m3, m6 |
| 24585 | pmulhrsw m5, m7 |
| 24586 | packuswb m4, m5 |
| 24587 | movu [r0 + 1129 * 16], m4 |
| 24588 | |
| 24589 | ; mode 19 [row 21] |
| 24590 | movu m6, [r5 + 4 * 16] |
| 24591 | pmaddubsw m4, m0, m6 |
| 24592 | pmulhrsw m4, m7 |
| 24593 | pmaddubsw m5, m2, m6 |
| 24594 | pmulhrsw m5, m7 |
| 24595 | packuswb m4, m5 |
| 24596 | movu [r0 + 1130 * 16], m4 |
| 24597 | pmaddubsw m4, m1, m6 |
| 24598 | pmulhrsw m4, m7 |
| 24599 | pmaddubsw m5, m3, m6 |
| 24600 | pmulhrsw m5, m7 |
| 24601 | packuswb m4, m5 |
| 24602 | movu [r0 + 1131 * 16], m4 |
| 24603 | |
| 24604 | ; mode 19 [row 22] |
| 24605 | movu m6, [r5 + 10 * 16] |
| 24606 | pslldq m0, 2 |
| 24607 | pinsrb m0, [r4 + 21], 1 |
| 24608 | pinsrb m0, [r4 + 22], 0 |
| 24609 | pmaddubsw m4, m0, m6 |
| 24610 | pmulhrsw m4, m7 |
| 24611 | pslldq m2, 2 |
| 24612 | pinsrb m2, [r4 + 11], 1 |
| 24613 | pinsrb m2, [r4 + 12], 0 |
| 24614 | pmaddubsw m5, m2, m6 |
| 24615 | pmulhrsw m5, m7 |
| 24616 | packuswb m4, m5 |
| 24617 | movu [r0 + 1132 * 16], m4 |
| 24618 | pslldq m1, 2 |
| 24619 | pinsrb m1, [r4 + 1], 1 |
| 24620 | pinsrb m1, [r4 + 2], 0 |
| 24621 | pmaddubsw m4, m1, m6 |
| 24622 | pmulhrsw m4, m7 |
| 24623 | pslldq m3, 2 |
| 24624 | pinsrw m3, [r3 + 6], 0 |
| 24625 | pmaddubsw m5, m3, m6 |
| 24626 | pmulhrsw m5, m7 |
| 24627 | packuswb m4, m5 |
| 24628 | movu [r0 + 1133 * 16], m4 |
| 24629 | |
| 24630 | ; mode 19 [row 23] |
| 24631 | movu m6, [r5 + 16 * 16] |
| 24632 | pslldq m0, 2 |
| 24633 | pinsrb m0, [r4 + 22], 1 |
| 24634 | pinsrb m0, [r4 + 23], 0 |
| 24635 | pmaddubsw m4, m0, m6 |
| 24636 | pmulhrsw m4, m7 |
| 24637 | pslldq m2, 2 |
| 24638 | pinsrb m2, [r4 + 12], 1 |
| 24639 | pinsrb m2, [r4 + 14], 0 |
| 24640 | pmaddubsw m5, m2, m6 |
| 24641 | pmulhrsw m5, m7 |
| 24642 | packuswb m4, m5 |
| 24643 | movu [r0 + 1134 * 16], m4 |
| 24644 | pslldq m1, 2 |
| 24645 | pinsrb m1, [r4 + 2], 1 |
| 24646 | pinsrb m1, [r4 + 4], 0 |
| 24647 | pmaddubsw m4, m1, m6 |
| 24648 | pmulhrsw m4, m7 |
| 24649 | pslldq m3, 2 |
| 24650 | pinsrw m3, [r3 + 5], 0 |
| 24651 | pmaddubsw m5, m3, m6 |
| 24652 | pmulhrsw m5, m7 |
| 24653 | packuswb m4, m5 |
| 24654 | movu [r0 + 1135 * 16], m4 |
| 24655 | |
| 24656 | ; mode 19 [row 24] |
| 24657 | movu m6, [r5 + 22 * 16] |
| 24658 | pslldq m0, 2 |
| 24659 | pinsrb m0, [r4 + 23], 1 |
| 24660 | pinsrb m0, [r4 + 25], 0 |
| 24661 | pmaddubsw m4, m0, m6 |
| 24662 | pmulhrsw m4, m7 |
| 24663 | pslldq m2, 2 |
| 24664 | pinsrb m2, [r4 + 14], 1 |
| 24665 | pinsrb m2, [r4 + 15], 0 |
| 24666 | pmaddubsw m5, m2, m6 |
| 24667 | pmulhrsw m5, m7 |
| 24668 | packuswb m4, m5 |
| 24669 | movu [r0 + 1136 * 16], m4 |
| 24670 | pslldq m1, 2 |
| 24671 | pinsrb m1, [r4 + 4], 1 |
| 24672 | pinsrb m1, [r4 + 5], 0 |
| 24673 | pmaddubsw m4, m1, m6 |
| 24674 | pmulhrsw m4, m7 |
| 24675 | pslldq m3, 2 |
| 24676 | pinsrw m3, [r3 + 4], 0 |
| 24677 | pmaddubsw m5, m3, m6 |
| 24678 | pmulhrsw m5, m7 |
| 24679 | packuswb m4, m5 |
| 24680 | movu [r0 + 1137 * 16], m4 |
| 24681 | |
| 24682 | ; mode 19 [row 25] |
| 24683 | movu m6, [r5 + 28 * 16] |
| 24684 | pslldq m0, 2 |
| 24685 | pinsrb m0, [r4 + 25], 1 |
| 24686 | pinsrb m0, [r4 + 26], 0 |
| 24687 | pmaddubsw m4, m0, m6 |
| 24688 | pmulhrsw m4, m7 |
| 24689 | pslldq m2, 2 |
| 24690 | pinsrb m2, [r4 + 15], 1 |
| 24691 | pinsrb m2, [r4 + 16], 0 |
| 24692 | pmaddubsw m5, m2, m6 |
| 24693 | pmulhrsw m5, m7 |
| 24694 | packuswb m4, m5 |
| 24695 | movu [r0 + 1138 * 16], m4 |
| 24696 | pslldq m1, 2 |
| 24697 | pinsrb m1, [r4 + 5], 1 |
| 24698 | pinsrb m1, [r4 + 6], 0 |
| 24699 | pmaddubsw m4, m1, m6 |
| 24700 | pmulhrsw m4, m7 |
| 24701 | pslldq m3, 2 |
| 24702 | pinsrw m3, [r3 + 3], 0 |
| 24703 | pmaddubsw m5, m3, m6 |
| 24704 | pmulhrsw m5, m7 |
| 24705 | packuswb m4, m5 |
| 24706 | movu [r0 + 1139 * 16], m4 |
| 24707 | |
| 24708 | ; mode 19 [row 26] |
| 24709 | movu m6, [r5 + 2 * 16] |
| 24710 | pmaddubsw m4, m0, m6 |
| 24711 | pmulhrsw m4, m7 |
| 24712 | pmaddubsw m5, m2, m6 |
| 24713 | pmulhrsw m5, m7 |
| 24714 | packuswb m4, m5 |
| 24715 | movu [r0 + 1140 * 16], m4 |
| 24716 | pmaddubsw m4, m1, m6 |
| 24717 | pmulhrsw m4, m7 |
| 24718 | pmaddubsw m5, m3, m6 |
| 24719 | pmulhrsw m5, m7 |
| 24720 | packuswb m4, m5 |
| 24721 | movu [r0 + 1141 * 16], m4 |
| 24722 | |
| 24723 | ; mode 19 [row 27] |
| 24724 | movu m6, [r5 + 8 * 16] |
| 24725 | pslldq m0, 2 |
| 24726 | pinsrb m0, [r4 + 26], 1 |
| 24727 | pinsrb m0, [r4 + 27], 0 |
| 24728 | pmaddubsw m4, m0, m6 |
| 24729 | pmulhrsw m4, m7 |
| 24730 | pslldq m2, 2 |
| 24731 | pinsrb m2, [r4 + 16], 1 |
| 24732 | pinsrb m2, [r4 + 17], 0 |
| 24733 | pmaddubsw m5, m2, m6 |
| 24734 | pmulhrsw m5, m7 |
| 24735 | packuswb m4, m5 |
| 24736 | movu [r0 + 1142 * 16], m4 |
| 24737 | pslldq m1, 2 |
| 24738 | pinsrb m1, [r4 + 6], 1 |
| 24739 | pinsrb m1, [r4 + 7], 0 |
| 24740 | pmaddubsw m4, m1, m6 |
| 24741 | pmulhrsw m4, m7 |
| 24742 | pslldq m3, 2 |
| 24743 | pinsrw m3, [r3 + 2], 0 |
| 24744 | pmaddubsw m5, m3, m6 |
| 24745 | pmulhrsw m5, m7 |
| 24746 | packuswb m4, m5 |
| 24747 | movu [r0 + 1143 * 16], m4 |
| 24748 | |
| 24749 | ; mode 19 [row 28] |
| 24750 | movu m6, [r5 + 14 * 16] |
| 24751 | pslldq m0, 2 |
| 24752 | pinsrb m0, [r4 + 27], 1 |
| 24753 | pinsrb m0, [r4 + 28], 0 |
| 24754 | pmaddubsw m4, m0, m6 |
| 24755 | pmulhrsw m4, m7 |
| 24756 | pslldq m2, 2 |
| 24757 | pinsrb m2, [r4 + 17], 1 |
| 24758 | pinsrb m2, [r4 + 18], 0 |
| 24759 | pmaddubsw m5, m2, m6 |
| 24760 | pmulhrsw m5, m7 |
| 24761 | packuswb m4, m5 |
| 24762 | movu [r0 + 1144 * 16], m4 |
| 24763 | pslldq m1, 2 |
| 24764 | pinsrb m1, [r4 + 7], 1 |
| 24765 | pinsrb m1, [r4 + 9], 0 |
| 24766 | pmaddubsw m4, m1, m6 |
| 24767 | pmulhrsw m4, m7 |
| 24768 | pslldq m3, 2 |
| 24769 | pinsrw m3, [r3 + 1], 0 |
| 24770 | pmaddubsw m5, m3, m6 |
| 24771 | pmulhrsw m5, m7 |
| 24772 | packuswb m4, m5 |
| 24773 | movu [r0 + 1145 * 16], m4 |
| 24774 | |
| 24775 | ; mode 19 [row 29] |
| 24776 | movu m6, [r5 + 20 * 16] |
| 24777 | pslldq m0, 2 |
| 24778 | pinsrb m0, [r4 + 28], 1 |
| 24779 | pinsrb m0, [r4 + 30], 0 |
| 24780 | pmaddubsw m4, m0, m6 |
| 24781 | pmulhrsw m4, m7 |
| 24782 | pslldq m2, 2 |
| 24783 | pinsrb m2, [r4 + 18], 1 |
| 24784 | pinsrb m2, [r4 + 20], 0 |
| 24785 | pmaddubsw m5, m2, m6 |
| 24786 | pmulhrsw m5, m7 |
| 24787 | packuswb m4, m5 |
| 24788 | movu [r0 + 1146 * 16], m4 |
| 24789 | pslldq m1, 2 |
| 24790 | pinsrb m1, [r4 + 9], 1 |
| 24791 | pinsrb m1, [r4 + 10], 0 |
| 24792 | pmaddubsw m4, m1, m6 |
| 24793 | pmulhrsw m4, m7 |
| 24794 | pslldq m3, 2 |
| 24795 | pinsrw m3, [r3 + 0], 0 |
| 24796 | pmaddubsw m5, m3, m6 |
| 24797 | pmulhrsw m5, m7 |
| 24798 | packuswb m4, m5 |
| 24799 | movu [r0 + 1147 * 16], m4 |
| 24800 | |
| 24801 | ; mode 19 [row 30]: m6 = [r5 + 26*16]; all four source registers shift up one byte pair (pslldq 2); m3 now gets its new low pair from [r4+0]/[r4+1] via pinsrb, where the preceding rows used a pinsrw from r3 |
| 24802 | movu m6, [r5 + 26 * 16] |
| 24803 | pslldq m0, 2 |
| 24804 | pinsrb m0, [r4 + 30], 1 |
| 24805 | pinsrb m0, [r4 + 31], 0 |
| 24806 | pmaddubsw m4, m0, m6 |
| 24807 | pmulhrsw m4, m7 |
| 24808 | pslldq m2, 2 |
| 24809 | pinsrb m2, [r4 + 20], 1 |
| 24810 | pinsrb m2, [r4 + 21], 0 |
| 24811 | pmaddubsw m5, m2, m6 |
| 24812 | pmulhrsw m5, m7 |
| 24813 | packuswb m4, m5 |
| 24814 | movu [r0 + 1148 * 16], m4 |
| 24815 | pslldq m1, 2 |
| 24816 | pinsrb m1, [r4 + 10], 1 |
| 24817 | pinsrb m1, [r4 + 11], 0 |
| 24818 | pmaddubsw m4, m1, m6 |
| 24819 | pmulhrsw m4, m7 |
| 24820 | pslldq m3, 2 |
| 24821 | pinsrb m3, [r4 + 0], 1 |
| 24822 | pinsrb m3, [r4 + 1], 0 |
| 24823 | pmaddubsw m5, m3, m6 |
| 24824 | pmulhrsw m5, m7 |
| 24825 | packuswb m4, m5 |
| 24826 | movu [r0 + 1149 * 16], m4 |
| 24827 | |
| 24828 | ; mode 19 [row 31]: no pmaddubsw/pmulhrsw weighting in this row - m0, m2, m1, m3 (in that order) are each shuffled through tab_S2 and stored at consecutive 8-byte offsets from [r0 + 1150*16]; tab_S2 is defined outside this excerpt (presumably it picks one byte of every interleaved pair - TODO confirm) |
| 24829 | pshufb m5, m0, [tab_S2] |
| 24830 | movh [r0 + 1150 * 16], m5 |
| 24831 | pshufb m5, m2, [tab_S2] |
| 24832 | movh [r0 + 1150 * 16 + 8], m5 |
| 24833 | pshufb m5, m1, [tab_S2] |
| 24834 | movh [r0 + 1151 * 16], m5 |
| 24835 | pshufb m5, m3, [tab_S2] |
| 24836 | movh [r0 + 1151 * 16 + 8], m5 |
| 24837 | |
| 24838 | ; mode 20 [row 0]: rebuild m0..m3 from memory - bytes at [r3+0], [r3+8], [r3+16], [r3+24] are interleaved (punpcklbw) with the bytes one position further on; m6 = 16-byte multiplier entry 11 at r5; each pmaddubsw result is scaled by pmulhrsw with m7 (set outside this excerpt), packed to bytes and stored to [r0 + 1152*16] and [r0 + 1153*16] |
| 24839 | movu m6, [r5 + 11 * 16] |
| 24840 | movu m0, [r3 ] |
| 24841 | movu m1, [r3 + 1 ] |
| 24842 | punpcklbw m0, m1 |
| 24843 | pmaddubsw m1, m0, m6 |
| 24844 | pmulhrsw m1, m7 |
| 24845 | movu m2, [r3 + 8] |
| 24846 | movu m3, [r3 + 9] |
| 24847 | punpcklbw m2, m3 |
| 24848 | pmaddubsw m3, m2, m6 |
| 24849 | pmulhrsw m3, m7 |
| 24850 | packuswb m1, m3 |
| 24851 | movu [r0 + 1152 * 16], m1 |
| 24852 | |
| 24853 | movu m1, [r3 + 16] |
| 24854 | movu m3, [r3 + 17] |
| 24855 | punpcklbw m1, m3 |
| 24856 | pmaddubsw m4, m1, m6 |
| 24857 | pmulhrsw m4, m7 |
| 24858 | movu m3, [r3 + 24] |
| 24859 | movu m5, [r3 + 25] |
| 24860 | punpcklbw m3, m5 |
| 24861 | pmaddubsw m5, m3, m6 |
| 24862 | pmulhrsw m5, m7 |
| 24863 | packuswb m4, m5 |
| 24864 | movu [r0 + 1153 * 16], m4 |
| 24865 | |
| 24866 | ; mode 20 [row 1]: m6 = [r5 + 22*16]; every source register moves up one byte pair (pslldq 2) and receives a new low pair - m0 from [r4+0] (byte 1) and [r4+2] (byte 0), m2/m1/m3 as one word each from [r3+7], [r3+15], [r3+23]; m0..m3 stay live for the following rows |
| 24867 | movu m6, [r5 + 22 * 16] |
| 24868 | pslldq m0, 2 |
| 24869 | pinsrb m0, [r4 + 0], 1 |
| 24870 | pinsrb m0, [r4 + 2], 0 |
| 24871 | pmaddubsw m4, m0, m6 |
| 24872 | pmulhrsw m4, m7 |
| 24873 | pslldq m2, 2 |
| 24874 | pinsrw m2, [r3 + 7], 0 |
| 24875 | pmaddubsw m5, m2, m6 |
| 24876 | pmulhrsw m5, m7 |
| 24877 | packuswb m4, m5 |
| 24878 | movu [r0 + 1154 * 16], m4 |
| 24879 | pslldq m1, 2 |
| 24880 | pinsrw m1, [r3 + 15], 0 |
| 24881 | pmaddubsw m4, m1, m6 |
| 24882 | pmulhrsw m4, m7 |
| 24883 | pslldq m3, 2 |
| 24884 | pinsrw m3, [r3 + 23], 0 |
| 24885 | pmaddubsw m5, m3, m6 |
| 24886 | pmulhrsw m5, m7 |
| 24887 | packuswb m4, m5 |
| 24888 | movu [r0 + 1155 * 16], m4 |
| 24889 | |
| 24890 | ; mode 20 [row 2]: m0..m3 are reused exactly as row 1 left them (no shift, no insert); only the multipliers change (m6 = [r5 + 1*16]); output goes to [r0 + 1156*16] and [r0 + 1157*16] |
| 24891 | movu m6, [r5 + 1 * 16] |
| 24892 | pmaddubsw m4, m0, m6 |
| 24893 | pmulhrsw m4, m7 |
| 24894 | pmaddubsw m5, m2, m6 |
| 24895 | pmulhrsw m5, m7 |
| 24896 | packuswb m4, m5 |
| 24897 | movu [r0 + 1156 * 16], m4 |
| 24898 | pmaddubsw m4, m1, m6 |
| 24899 | pmulhrsw m4, m7 |
| 24900 | pmaddubsw m5, m3, m6 |
| 24901 | pmulhrsw m5, m7 |
| 24902 | packuswb m4, m5 |
| 24903 | movu [r0 + 1157 * 16], m4 |
| 24904 | |
| 24905 | ; mode 20 [row 3] |
| 24906 | movu m6, [r5 + 12 * 16] |
| 24907 | pslldq m0, 2 |
| 24908 | pinsrb m0, [r4 + 2], 1 |
| 24909 | pinsrb m0, [r4 + 3], 0 |
| 24910 | pmaddubsw m4, m0, m6 |
| 24911 | pmulhrsw m4, m7 |
| 24912 | pslldq m2, 2 |
| 24913 | pinsrw m2, [r3 + 6], 0 |
| 24914 | pmaddubsw m5, m2, m6 |
| 24915 | pmulhrsw m5, m7 |
| 24916 | packuswb m4, m5 |
| 24917 | movu [r0 + 1158 * 16], m4 |
| 24918 | pslldq m1, 2 |
| 24919 | pinsrw m1, [r3 + 14], 0 |
| 24920 | pmaddubsw m4, m1, m6 |
| 24921 | pmulhrsw m4, m7 |
| 24922 | pslldq m3, 2 |
| 24923 | pinsrw m3, [r3 + 22], 0 |
| 24924 | pmaddubsw m5, m3, m6 |
| 24925 | pmulhrsw m5, m7 |
| 24926 | packuswb m4, m5 |
| 24927 | movu [r0 + 1159 * 16], m4 |
| 24928 | |
| 24929 | ; mode 20 [row 4] |
| 24930 | movu m6, [r5 + 23 * 16] |
| 24931 | pslldq m0, 2 |
| 24932 | pinsrb m0, [r4 + 3], 1 |
| 24933 | pinsrb m0, [r4 + 5], 0 |
| 24934 | pmaddubsw m4, m0, m6 |
| 24935 | pmulhrsw m4, m7 |
| 24936 | pslldq m2, 2 |
| 24937 | pinsrw m2, [r3 + 5], 0 |
| 24938 | pmaddubsw m5, m2, m6 |
| 24939 | pmulhrsw m5, m7 |
| 24940 | packuswb m4, m5 |
| 24941 | movu [r0 + 1160 * 16], m4 |
| 24942 | pslldq m1, 2 |
| 24943 | pinsrw m1, [r3 + 13], 0 |
| 24944 | pmaddubsw m4, m1, m6 |
| 24945 | pmulhrsw m4, m7 |
| 24946 | pslldq m3, 2 |
| 24947 | pinsrw m3, [r3 + 21], 0 |
| 24948 | pmaddubsw m5, m3, m6 |
| 24949 | pmulhrsw m5, m7 |
| 24950 | packuswb m4, m5 |
| 24951 | movu [r0 + 1161 * 16], m4 |
| 24952 | |
| 24953 | ; mode 20 [row 5] |
| 24954 | movu m6, [r5 + 2 * 16] |
| 24955 | pmaddubsw m4, m0, m6 |
| 24956 | pmulhrsw m4, m7 |
| 24957 | pmaddubsw m5, m2, m6 |
| 24958 | pmulhrsw m5, m7 |
| 24959 | packuswb m4, m5 |
| 24960 | movu [r0 + 1162 * 16], m4 |
| 24961 | pmaddubsw m4, m1, m6 |
| 24962 | pmulhrsw m4, m7 |
| 24963 | pmaddubsw m5, m3, m6 |
| 24964 | pmulhrsw m5, m7 |
| 24965 | packuswb m4, m5 |
| 24966 | movu [r0 + 1163 * 16], m4 |
| 24967 | |
| 24968 | ; mode 20 [row 6] |
| 24969 | movu m6, [r5 + 13 * 16] |
| 24970 | pslldq m0, 2 |
| 24971 | pinsrb m0, [r4 + 5], 1 |
| 24972 | pinsrb m0, [r4 + 6], 0 |
| 24973 | pmaddubsw m4, m0, m6 |
| 24974 | pmulhrsw m4, m7 |
| 24975 | pslldq m2, 2 |
| 24976 | pinsrw m2, [r3 + 4], 0 |
| 24977 | pmaddubsw m5, m2, m6 |
| 24978 | pmulhrsw m5, m7 |
| 24979 | packuswb m4, m5 |
| 24980 | movu [r0 + 1164 * 16], m4 |
| 24981 | pslldq m1, 2 |
| 24982 | pinsrw m1, [r3 + 12], 0 |
| 24983 | pmaddubsw m4, m1, m6 |
| 24984 | pmulhrsw m4, m7 |
| 24985 | pslldq m3, 2 |
| 24986 | pinsrw m3, [r3 + 20], 0 |
| 24987 | pmaddubsw m5, m3, m6 |
| 24988 | pmulhrsw m5, m7 |
| 24989 | packuswb m4, m5 |
| 24990 | movu [r0 + 1165 * 16], m4 |
| 24991 | |
| 24992 | ; mode 20 [row 7] |
| 24993 | movu m6, [r5 + 24 * 16] |
| 24994 | pslldq m0, 2 |
| 24995 | pinsrb m0, [r4 + 6], 1 |
| 24996 | pinsrb m0, [r4 + 8], 0 |
| 24997 | pmaddubsw m4, m0, m6 |
| 24998 | pmulhrsw m4, m7 |
| 24999 | pslldq m2, 2 |
| 25000 | pinsrw m2, [r3 + 3], 0 |
| 25001 | pmaddubsw m5, m2, m6 |
| 25002 | pmulhrsw m5, m7 |
| 25003 | packuswb m4, m5 |
| 25004 | movu [r0 + 1166 * 16], m4 |
| 25005 | pslldq m1, 2 |
| 25006 | pinsrw m1, [r3 + 11], 0 |
| 25007 | pmaddubsw m4, m1, m6 |
| 25008 | pmulhrsw m4, m7 |
| 25009 | pslldq m3, 2 |
| 25010 | pinsrw m3, [r3 + 19], 0 |
| 25011 | pmaddubsw m5, m3, m6 |
| 25012 | pmulhrsw m5, m7 |
| 25013 | packuswb m4, m5 |
| 25014 | movu [r0 + 1167 * 16], m4 |
| 25015 | |
| 25016 | ; mode 20 [row 8] |
| 25017 | movu m6, [r5 + 3 * 16] |
| 25018 | pmaddubsw m4, m0, m6 |
| 25019 | pmulhrsw m4, m7 |
| 25020 | pmaddubsw m5, m2, m6 |
| 25021 | pmulhrsw m5, m7 |
| 25022 | packuswb m4, m5 |
| 25023 | movu [r0 + 1168 * 16], m4 |
| 25024 | pmaddubsw m4, m1, m6 |
| 25025 | pmulhrsw m4, m7 |
| 25026 | pmaddubsw m5, m3, m6 |
| 25027 | pmulhrsw m5, m7 |
| 25028 | packuswb m4, m5 |
| 25029 | movu [r0 + 1169 * 16], m4 |
| 25030 | |
| 25031 | ; mode 20 [row 9]: m6 = [r5 + 14*16]; shift-and-insert row; note m2 is refilled with two pinsrb ([r3+3] -> byte 1, [r3+2] -> byte 0), which places the same two bytes in the same positions as a single pinsrw from [r3+2] would |
| 25032 | movu m6, [r5 + 14 * 16] |
| 25033 | pslldq m0, 2 |
| 25034 | pinsrb m0, [r4 + 8], 1 |
| 25035 | pinsrb m0, [r4 + 9], 0 |
| 25036 | pmaddubsw m4, m0, m6 |
| 25037 | pmulhrsw m4, m7 |
| 25038 | pslldq m2, 2 |
| 25039 | pinsrb m2, [r3 + 3], 1 |
| 25040 | pinsrb m2, [r3 + 2], 0 |
| 25041 | pmaddubsw m5, m2, m6 |
| 25042 | pmulhrsw m5, m7 |
| 25043 | packuswb m4, m5 |
| 25044 | movu [r0 + 1170 * 16], m4 |
| 25045 | pslldq m1, 2 |
| 25046 | pinsrw m1, [r3 + 10], 0 |
| 25047 | pmaddubsw m4, m1, m6 |
| 25048 | pmulhrsw m4, m7 |
| 25049 | pslldq m3, 2 |
| 25050 | pinsrw m3, [r3 + 18], 0 |
| 25051 | pmaddubsw m5, m3, m6 |
| 25052 | pmulhrsw m5, m7 |
| 25053 | packuswb m4, m5 |
| 25054 | movu [r0 + 1171 * 16], m4 |
| 25055 | |
| 25056 | ; mode 20 [row 10] |
| 25057 | movu m6, [r5 + 25 * 16] |
| 25058 | pslldq m0, 2 |
| 25059 | pinsrb m0, [r4 + 9], 1 |
| 25060 | pinsrb m0, [r4 + 11], 0 |
| 25061 | pmaddubsw m4, m0, m6 |
| 25062 | pmulhrsw m4, m7 |
| 25063 | pslldq m2, 2 |
| 25064 | pinsrw m2, [r3 + 1], 0 |
| 25065 | pmaddubsw m5, m2, m6 |
| 25066 | pmulhrsw m5, m7 |
| 25067 | packuswb m4, m5 |
| 25068 | movu [r0 + 1172 * 16], m4 |
| 25069 | pslldq m1, 2 |
| 25070 | pinsrw m1, [r3 + 9], 0 |
| 25071 | pmaddubsw m4, m1, m6 |
| 25072 | pmulhrsw m4, m7 |
| 25073 | pslldq m3, 2 |
| 25074 | pinsrw m3, [r3 + 17], 0 |
| 25075 | pmaddubsw m5, m3, m6 |
| 25076 | pmulhrsw m5, m7 |
| 25077 | packuswb m4, m5 |
| 25078 | movu [r0 + 1173 * 16], m4 |
| 25079 | |
| 25080 | ; mode 20 [row 11] |
| 25081 | movu m6, [r5 + 4 * 16] |
| 25082 | pmaddubsw m4, m0, m6 |
| 25083 | pmulhrsw m4, m7 |
| 25084 | pmaddubsw m5, m2, m6 |
| 25085 | pmulhrsw m5, m7 |
| 25086 | packuswb m4, m5 |
| 25087 | movu [r0 + 1174 * 16], m4 |
| 25088 | pmaddubsw m4, m1, m6 |
| 25089 | pmulhrsw m4, m7 |
| 25090 | pmaddubsw m5, m3, m6 |
| 25091 | pmulhrsw m5, m7 |
| 25092 | packuswb m4, m5 |
| 25093 | movu [r0 + 1175 * 16], m4 |
| 25094 | |
| 25095 | ; mode 20 [row 12]: m6 = [r5 + 15*16]; shift-and-insert row; m2 is again refilled byte-wise ([r3+1] -> byte 1, [r3+0] -> byte 0), the same bytes and positions a pinsrw from [r3+0] would give; m1 and m3 take words from [r3+8] and [r3+16] |
| 25096 | movu m6, [r5 + 15 * 16] |
| 25097 | pslldq m0, 2 |
| 25098 | pinsrb m0, [r4 + 11], 1 |
| 25099 | pinsrb m0, [r4 + 12], 0 |
| 25100 | pmaddubsw m4, m0, m6 |
| 25101 | pmulhrsw m4, m7 |
| 25102 | pslldq m2, 2 |
| 25103 | pinsrb m2, [r3 + 1], 1 |
| 25104 | pinsrb m2, [r3 + 0], 0 |
| 25105 | pmaddubsw m5, m2, m6 |
| 25106 | pmulhrsw m5, m7 |
| 25107 | packuswb m4, m5 |
| 25108 | movu [r0 + 1176 * 16], m4 |
| 25109 | pslldq m1, 2 |
| 25110 | pinsrw m1, [r3 + 8], 0 |
| 25111 | pmaddubsw m4, m1, m6 |
| 25112 | pmulhrsw m4, m7 |
| 25113 | pslldq m3, 2 |
| 25114 | pinsrw m3, [r3 + 16], 0 |
| 25115 | pmaddubsw m5, m3, m6 |
| 25116 | pmulhrsw m5, m7 |
| 25117 | packuswb m4, m5 |
| 25118 | movu [r0 + 1177 * 16], m4 |
| 25119 | |
| 25120 | ; mode 20 [row 13]: m6 = [r5 + 26*16]; from this row on m2 takes its new pair from r4 instead of r3, starting with [r4+0]/[r4+2] - the same offsets m0 used in row 1; m1 and m3 still take words from r3 ([r3+7], [r3+15]) |
| 25121 | movu m6, [r5 + 26 * 16] |
| 25122 | pslldq m0, 2 |
| 25123 | pinsrb m0, [r4 + 12], 1 |
| 25124 | pinsrb m0, [r4 + 14], 0 |
| 25125 | pmaddubsw m4, m0, m6 |
| 25126 | pmulhrsw m4, m7 |
| 25127 | pslldq m2, 2 |
| 25128 | pinsrb m2, [r4 + 0], 1 |
| 25129 | pinsrb m2, [r4 + 2], 0 |
| 25130 | pmaddubsw m5, m2, m6 |
| 25131 | pmulhrsw m5, m7 |
| 25132 | packuswb m4, m5 |
| 25133 | movu [r0 + 1178 * 16], m4 |
| 25134 | pslldq m1, 2 |
| 25135 | pinsrw m1, [r3 + 7], 0 |
| 25136 | pmaddubsw m4, m1, m6 |
| 25137 | pmulhrsw m4, m7 |
| 25138 | pslldq m3, 2 |
| 25139 | pinsrw m3, [r3 + 15], 0 |
| 25140 | pmaddubsw m5, m3, m6 |
| 25141 | pmulhrsw m5, m7 |
| 25142 | packuswb m4, m5 |
| 25143 | movu [r0 + 1179 * 16], m4 |
| 25144 | |
| 25145 | ; mode 20 [row 14] |
| 25146 | movu m6, [r5 + 5 * 16] |
| 25147 | pmaddubsw m4, m0, m6 |
| 25148 | pmulhrsw m4, m7 |
| 25149 | pmaddubsw m5, m2, m6 |
| 25150 | pmulhrsw m5, m7 |
| 25151 | packuswb m4, m5 |
| 25152 | movu [r0 + 1180 * 16], m4 |
| 25153 | pmaddubsw m4, m1, m6 |
| 25154 | pmulhrsw m4, m7 |
| 25155 | pmaddubsw m5, m3, m6 |
| 25156 | pmulhrsw m5, m7 |
| 25157 | packuswb m4, m5 |
| 25158 | movu [r0 + 1181 * 16], m4 |
| 25159 | |
| 25160 | ; mode 20 [row 15] |
| 25161 | movu m6, [r5 + 16 * 16] |
| 25162 | pslldq m0, 2 |
| 25163 | pinsrb m0, [r4 + 14], 1 |
| 25164 | pinsrb m0, [r4 + 15], 0 |
| 25165 | pmaddubsw m4, m0, m6 |
| 25166 | pmulhrsw m4, m7 |
| 25167 | pslldq m2, 2 |
| 25168 | pinsrb m2, [r4 + 2], 1 |
| 25169 | pinsrb m2, [r4 + 3], 0 |
| 25170 | pmaddubsw m5, m2, m6 |
| 25171 | pmulhrsw m5, m7 |
| 25172 | packuswb m4, m5 |
| 25173 | movu [r0 + 1182 * 16], m4 |
| 25174 | pslldq m1, 2 |
| 25175 | pinsrw m1, [r3 + 6], 0 |
| 25176 | pmaddubsw m4, m1, m6 |
| 25177 | pmulhrsw m4, m7 |
| 25178 | pslldq m3, 2 |
| 25179 | pinsrw m3, [r3 + 14], 0 |
| 25180 | pmaddubsw m5, m3, m6 |
| 25181 | pmulhrsw m5, m7 |
| 25182 | packuswb m4, m5 |
| 25183 | movu [r0 + 1183 * 16], m4 |
| 25184 | |
| 25185 | ; mode 20 [row 16] |
| 25186 | movu m6, [r5 + 27 * 16] |
| 25187 | pslldq m0, 2 |
| 25188 | pinsrb m0, [r4 + 15], 1 |
| 25189 | pinsrb m0, [r4 + 17], 0 |
| 25190 | pmaddubsw m4, m0, m6 |
| 25191 | pmulhrsw m4, m7 |
| 25192 | pslldq m2, 2 |
| 25193 | pinsrb m2, [r4 + 3], 1 |
| 25194 | pinsrb m2, [r4 + 5], 0 |
| 25195 | pmaddubsw m5, m2, m6 |
| 25196 | pmulhrsw m5, m7 |
| 25197 | packuswb m4, m5 |
| 25198 | movu [r0 + 1184 * 16], m4 |
| 25199 | pslldq m1, 2 |
| 25200 | pinsrw m1, [r3 + 5], 0 |
| 25201 | pmaddubsw m4, m1, m6 |
| 25202 | pmulhrsw m4, m7 |
| 25203 | pslldq m3, 2 |
| 25204 | pinsrw m3, [r3 + 13], 0 |
| 25205 | pmaddubsw m5, m3, m6 |
| 25206 | pmulhrsw m5, m7 |
| 25207 | packuswb m4, m5 |
| 25208 | movu [r0 + 1185 * 16], m4 |
| 25209 | |
| 25210 | ; mode 20 [row 17] |
| 25211 | movu m6, [r5 + 6 * 16] |
| 25212 | pmaddubsw m4, m0, m6 |
| 25213 | pmulhrsw m4, m7 |
| 25214 | pmaddubsw m5, m2, m6 |
| 25215 | pmulhrsw m5, m7 |
| 25216 | packuswb m4, m5 |
| 25217 | movu [r0 + 1186 * 16], m4 |
| 25218 | pmaddubsw m4, m1, m6 |
| 25219 | pmulhrsw m4, m7 |
| 25220 | pmaddubsw m5, m3, m6 |
| 25221 | pmulhrsw m5, m7 |
| 25222 | packuswb m4, m5 |
| 25223 | movu [r0 + 1187 * 16], m4 |
| 25224 | |
| 25225 | ; mode 20 [row 18] |
| 25226 | movu m6, [r5 + 17 * 16] |
| 25227 | pslldq m0, 2 |
| 25228 | pinsrb m0, [r4 + 17], 1 |
| 25229 | pinsrb m0, [r4 + 18], 0 |
| 25230 | pmaddubsw m4, m0, m6 |
| 25231 | pmulhrsw m4, m7 |
| 25232 | pslldq m2, 2 |
| 25233 | pinsrb m2, [r4 + 5], 1 |
| 25234 | pinsrb m2, [r4 + 6], 0 |
| 25235 | pmaddubsw m5, m2, m6 |
| 25236 | pmulhrsw m5, m7 |
| 25237 | packuswb m4, m5 |
| 25238 | movu [r0 + 1188 * 16], m4 |
| 25239 | pslldq m1, 2 |
| 25240 | pinsrw m1, [r3 + 4], 0 |
| 25241 | pmaddubsw m4, m1, m6 |
| 25242 | pmulhrsw m4, m7 |
| 25243 | pslldq m3, 2 |
| 25244 | pinsrw m3, [r3 + 12], 0 |
| 25245 | pmaddubsw m5, m3, m6 |
| 25246 | pmulhrsw m5, m7 |
| 25247 | packuswb m4, m5 |
| 25248 | movu [r0 + 1189 * 16], m4 |
| 25249 | |
| 25250 | ; mode 20 [row 19] |
| 25251 | movu m6, [r5 + 28 * 16] |
| 25252 | pslldq m0, 2 |
| 25253 | pinsrb m0, [r4 + 18], 1 |
| 25254 | pinsrb m0, [r4 + 20], 0 |
| 25255 | pmaddubsw m4, m0, m6 |
| 25256 | pmulhrsw m4, m7 |
| 25257 | pslldq m2, 2 |
| 25258 | pinsrb m2, [r4 + 6], 1 |
| 25259 | pinsrb m2, [r4 + 8], 0 |
| 25260 | pmaddubsw m5, m2, m6 |
| 25261 | pmulhrsw m5, m7 |
| 25262 | packuswb m4, m5 |
| 25263 | movu [r0 + 1190 * 16], m4 |
| 25264 | pslldq m1, 2 |
| 25265 | pinsrw m1, [r3 + 3], 0 |
| 25266 | pmaddubsw m4, m1, m6 |
| 25267 | pmulhrsw m4, m7 |
| 25268 | pslldq m3, 2 |
| 25269 | pinsrw m3, [r3 + 11], 0 |
| 25270 | pmaddubsw m5, m3, m6 |
| 25271 | pmulhrsw m5, m7 |
| 25272 | packuswb m4, m5 |
| 25273 | movu [r0 + 1191 * 16], m4 |
| 25274 | |
| 25275 | ; mode 20 [row 20] |
| 25276 | movu m6, [r5 + 7 * 16] |
| 25277 | pmaddubsw m4, m0, m6 |
| 25278 | pmulhrsw m4, m7 |
| 25279 | pmaddubsw m5, m2, m6 |
| 25280 | pmulhrsw m5, m7 |
| 25281 | packuswb m4, m5 |
| 25282 | movu [r0 + 1192 * 16], m4 |
| 25283 | pmaddubsw m4, m1, m6 |
| 25284 | pmulhrsw m4, m7 |
| 25285 | pmaddubsw m5, m3, m6 |
| 25286 | pmulhrsw m5, m7 |
| 25287 | packuswb m4, m5 |
| 25288 | movu [r0 + 1193 * 16], m4 |
| 25289 | |
| 25290 | ; mode 20 [row 21] |
| 25291 | movu m6, [r5 + 18 * 16] |
| 25292 | pslldq m0, 2 |
| 25293 | pinsrb m0, [r4 + 20], 1 |
| 25294 | pinsrb m0, [r4 + 21], 0 |
| 25295 | pmaddubsw m4, m0, m6 |
| 25296 | pmulhrsw m4, m7 |
| 25297 | pslldq m2, 2 |
| 25298 | pinsrb m2, [r4 + 8], 1 |
| 25299 | pinsrb m2, [r4 + 9], 0 |
| 25300 | pmaddubsw m5, m2, m6 |
| 25301 | pmulhrsw m5, m7 |
| 25302 | packuswb m4, m5 |
| 25303 | movu [r0 + 1194 * 16], m4 |
| 25304 | pslldq m1, 2 |
| 25305 | pinsrw m1, [r3 + 2], 0 |
| 25306 | pmaddubsw m4, m1, m6 |
| 25307 | pmulhrsw m4, m7 |
| 25308 | pslldq m3, 2 |
| 25309 | pinsrw m3, [r3 + 10], 0 |
| 25310 | pmaddubsw m5, m3, m6 |
| 25311 | pmulhrsw m5, m7 |
| 25312 | packuswb m4, m5 |
| 25313 | movu [r0 + 1195 * 16], m4 |
| 25314 | |
| 25315 | ; mode 20 [row 22] |
| 25316 | movu m6, [r5 + 29 * 16] |
| 25317 | pslldq m0, 2 |
| 25318 | pinsrb m0, [r4 + 21], 1 |
| 25319 | pinsrb m0, [r4 + 23], 0 |
| 25320 | pmaddubsw m4, m0, m6 |
| 25321 | pmulhrsw m4, m7 |
| 25322 | pslldq m2, 2 |
| 25323 | pinsrb m2, [r4 + 9], 1 |
| 25324 | pinsrb m2, [r4 + 11], 0 |
| 25325 | pmaddubsw m5, m2, m6 |
| 25326 | pmulhrsw m5, m7 |
| 25327 | packuswb m4, m5 |
| 25328 | movu [r0 + 1196 * 16], m4 |
| 25329 | pslldq m1, 2 |
| 25330 | pinsrw m1, [r3 + 1], 0 |
| 25331 | pmaddubsw m4, m1, m6 |
| 25332 | pmulhrsw m4, m7 |
| 25333 | pslldq m3, 2 |
| 25334 | pinsrw m3, [r3 + 9], 0 |
| 25335 | pmaddubsw m5, m3, m6 |
| 25336 | pmulhrsw m5, m7 |
| 25337 | packuswb m4, m5 |
| 25338 | movu [r0 + 1197 * 16], m4 |
| 25339 | |
| 25340 | ; mode 20 [row 23] |
| 25341 | movu m6, [r5 + 8 * 16] |
| 25342 | pmaddubsw m4, m0, m6 |
| 25343 | pmulhrsw m4, m7 |
| 25344 | pmaddubsw m5, m2, m6 |
| 25345 | pmulhrsw m5, m7 |
| 25346 | packuswb m4, m5 |
| 25347 | movu [r0 + 1198 * 16], m4 |
| 25348 | pmaddubsw m4, m1, m6 |
| 25349 | pmulhrsw m4, m7 |
| 25350 | pmaddubsw m5, m3, m6 |
| 25351 | pmulhrsw m5, m7 |
| 25352 | packuswb m4, m5 |
| 25353 | movu [r0 + 1199 * 16], m4 |
| 25354 | |
| 25355 | ; mode 20 [row 24] |
| 25356 | movu m6, [r5 + 19 * 16] |
| 25357 | pslldq m0, 2 |
| 25358 | pinsrb m0, [r4 + 23], 1 |
| 25359 | pinsrb m0, [r4 + 24], 0 |
| 25360 | pmaddubsw m4, m0, m6 |
| 25361 | pmulhrsw m4, m7 |
| 25362 | pslldq m2, 2 |
| 25363 | pinsrb m2, [r4 + 11], 1 |
| 25364 | pinsrb m2, [r4 + 12], 0 |
| 25365 | pmaddubsw m5, m2, m6 |
| 25366 | pmulhrsw m5, m7 |
| 25367 | packuswb m4, m5 |
| 25368 | movu [r0 + 1200 * 16], m4 |
| 25369 | pslldq m1, 2 |
| 25370 | pinsrw m1, [r3 + 0], 0 |
| 25371 | pmaddubsw m4, m1, m6 |
| 25372 | pmulhrsw m4, m7 |
| 25373 | pslldq m3, 2 |
| 25374 | pinsrw m3, [r3 + 8], 0 |
| 25375 | pmaddubsw m5, m3, m6 |
| 25376 | pmulhrsw m5, m7 |
| 25377 | packuswb m4, m5 |
| 25378 | movu [r0 + 1201 * 16], m4 |
| 25379 | |
| 25380 | ; mode 20 [row 25] |
| 25381 | movu m6, [r5 + 30 * 16] |
| 25382 | pslldq m0, 2 |
| 25383 | pinsrb m0, [r4 + 24], 1 |
| 25384 | pinsrb m0, [r4 + 26], 0 |
| 25385 | pmaddubsw m4, m0, m6 |
| 25386 | pmulhrsw m4, m7 |
| 25387 | pslldq m2, 2 |
| 25388 | pinsrb m2, [r4 + 12], 1 |
| 25389 | pinsrb m2, [r4 + 14], 0 |
| 25390 | pmaddubsw m5, m2, m6 |
| 25391 | pmulhrsw m5, m7 |
| 25392 | packuswb m4, m5 |
| 25393 | movu [r0 + 1202 * 16], m4 |
| 25394 | pslldq m1, 2 |
| 25395 | pinsrb m1, [r4 + 0], 1 |
| 25396 | pinsrb m1, [r4 + 2], 0 |
| 25397 | pmaddubsw m4, m1, m6 |
| 25398 | pmulhrsw m4, m7 |
| 25399 | pslldq m3, 2 |
| 25400 | pinsrw m3, [r3 + 7], 0 |
| 25401 | pmaddubsw m5, m3, m6 |
| 25402 | pmulhrsw m5, m7 |
| 25403 | packuswb m4, m5 |
| 25404 | movu [r0 + 1203 * 16], m4 |
| 25405 | |
| 25406 | ; mode 20 [row 26] |
| 25407 | movu m6, [r5 + 9 * 16] |
| 25408 | pmaddubsw m4, m0, m6 |
| 25409 | pmulhrsw m4, m7 |
| 25410 | pmaddubsw m5, m2, m6 |
| 25411 | pmulhrsw m5, m7 |
| 25412 | packuswb m4, m5 |
| 25413 | movu [r0 + 1204 * 16], m4 |
| 25414 | pmaddubsw m4, m1, m6 |
| 25415 | pmulhrsw m4, m7 |
| 25416 | pmaddubsw m5, m3, m6 |
| 25417 | pmulhrsw m5, m7 |
| 25418 | packuswb m4, m5 |
| 25419 | movu [r0 + 1205 * 16], m4 |
| 25420 | |
| 25421 | ; mode 20 [row 27] |
| 25422 | movu m6, [r5 + 20 * 16] |
| 25423 | pslldq m0, 2 |
| 25424 | pinsrb m0, [r4 + 26], 1 |
| 25425 | pinsrb m0, [r4 + 27], 0 |
| 25426 | pmaddubsw m4, m0, m6 |
| 25427 | pmulhrsw m4, m7 |
| 25428 | pslldq m2, 2 |
| 25429 | pinsrb m2, [r4 + 14], 1 |
| 25430 | pinsrb m2, [r4 + 15], 0 |
| 25431 | pmaddubsw m5, m2, m6 |
| 25432 | pmulhrsw m5, m7 |
| 25433 | packuswb m4, m5 |
| 25434 | movu [r0 + 1206 * 16], m4 |
| 25435 | pslldq m1, 2 |
| 25436 | pinsrb m1, [r4 + 2], 1 |
| 25437 | pinsrb m1, [r4 + 3], 0 |
| 25438 | pmaddubsw m4, m1, m6 |
| 25439 | pmulhrsw m4, m7 |
| 25440 | pslldq m3, 2 |
| 25441 | pinsrw m3, [r3 + 6], 0 |
| 25442 | pmaddubsw m5, m3, m6 |
| 25443 | pmulhrsw m5, m7 |
| 25444 | packuswb m4, m5 |
| 25445 | movu [r0 + 1207 * 16], m4 |
| 25446 | |
| 25447 | ; mode 20 [row 28] |
| 25448 | movu m6, [r5 + 31 * 16] |
| 25449 | pslldq m0, 2 |
| 25450 | pinsrb m0, [r4 + 27], 1 |
| 25451 | pinsrb m0, [r4 + 29], 0 |
| 25452 | pmaddubsw m4, m0, m6 |
| 25453 | pmulhrsw m4, m7 |
| 25454 | pslldq m2, 2 |
| 25455 | pinsrb m2, [r4 + 15], 1 |
| 25456 | pinsrb m2, [r4 + 17], 0 |
| 25457 | pmaddubsw m5, m2, m6 |
| 25458 | pmulhrsw m5, m7 |
| 25459 | packuswb m4, m5 |
| 25460 | movu [r0 + 1208 * 16], m4 |
| 25461 | pslldq m1, 2 |
| 25462 | pinsrb m1, [r4 + 3], 1 |
| 25463 | pinsrb m1, [r4 + 5], 0 |
| 25464 | pmaddubsw m4, m1, m6 |
| 25465 | pmulhrsw m4, m7 |
| 25466 | pslldq m3, 2 |
| 25467 | pinsrw m3, [r3 + 5], 0 |
| 25468 | pmaddubsw m5, m3, m6 |
| 25469 | pmulhrsw m5, m7 |
| 25470 | packuswb m4, m5 |
| 25471 | movu [r0 + 1209 * 16], m4 |
| 25472 | |
| 25473 | ; mode 20 [row 29] |
| 25474 | movu m6, [r5 + 10 * 16] |
| 25475 | pmaddubsw m4, m0, m6 |
| 25476 | pmulhrsw m4, m7 |
| 25477 | pmaddubsw m5, m2, m6 |
| 25478 | pmulhrsw m5, m7 |
| 25479 | packuswb m4, m5 |
| 25480 | movu [r0 + 1210 * 16], m4 |
| 25481 | pmaddubsw m4, m1, m6 |
| 25482 | pmulhrsw m4, m7 |
| 25483 | pmaddubsw m5, m3, m6 |
| 25484 | pmulhrsw m5, m7 |
| 25485 | packuswb m4, m5 |
| 25486 | movu [r0 + 1211 * 16], m4 |
| 25487 | |
| 25488 | ; mode 20 [row 30] |
| 25489 | movu m6, [r5 + 21 * 16] |
| 25490 | pslldq m0, 2 |
| 25491 | pinsrb m0, [r4 + 29], 1 |
| 25492 | pinsrb m0, [r4 + 30], 0 |
| 25493 | pmaddubsw m4, m0, m6 |
| 25494 | pmulhrsw m4, m7 |
| 25495 | pslldq m2, 2 |
| 25496 | pinsrb m2, [r4 + 17], 1 |
| 25497 | pinsrb m2, [r4 + 18], 0 |
| 25498 | pmaddubsw m5, m2, m6 |
| 25499 | pmulhrsw m5, m7 |
| 25500 | packuswb m4, m5 |
| 25501 | movu [r0 + 1212 * 16], m4 |
| 25502 | pslldq m1, 2 |
| 25503 | pinsrb m1, [r4 + 5], 1 |
| 25504 | pinsrb m1, [r4 + 6], 0 |
| 25505 | pmaddubsw m4, m1, m6 |
| 25506 | pmulhrsw m4, m7 |
| 25507 | pslldq m3, 2 |
| 25508 | pinsrw m3, [r3 + 4], 0 |
| 25509 | pmaddubsw m5, m3, m6 |
| 25510 | pmulhrsw m5, m7 |
| 25511 | packuswb m4, m5 |
| 25512 | movu [r0 + 1213 * 16], m4 |
| 25513 | |
| 25514 | ; mode 20 [row 31]: no pmaddubsw/pmulhrsw weighting in this row - m0, m2, m1, m3 (in that order) are each shuffled through tab_S2 and stored at consecutive 8-byte offsets from [r0 + 1214*16]; tab_S2 is defined outside this excerpt (presumably it picks one byte of every interleaved pair - TODO confirm) |
| 25515 | pshufb m5, m0, [tab_S2] |
| 25516 | movh [r0 + 1214 * 16], m5 |
| 25517 | pshufb m5, m2, [tab_S2] |
| 25518 | movh [r0 + 1214 * 16 + 8], m5 |
| 25519 | pshufb m5, m1, [tab_S2] |
| 25520 | movh [r0 + 1215 * 16], m5 |
| 25521 | pshufb m5, m3, [tab_S2] |
| 25522 | movh [r0 + 1215 * 16 + 8], m5 |
| 25523 | |
| 25524 | ; mode 21 [row 0]: rebuild m0..m3 from memory - bytes at [r3+0], [r3+8], [r3+16], [r3+24] are interleaved (punpcklbw) with the bytes one position further on; m6 = 16-byte multiplier entry 15 at r5; each pmaddubsw result is scaled by pmulhrsw with m7 (set outside this excerpt), packed to bytes and stored to [r0 + 1216*16] and [r0 + 1217*16] |
| 25525 | movu m6, [r5 + 15 * 16] |
| 25526 | movu m0, [r3 ] |
| 25527 | movu m1, [r3 + 1 ] |
| 25528 | punpcklbw m0, m1 |
| 25529 | pmaddubsw m1, m0, m6 |
| 25530 | pmulhrsw m1, m7 |
| 25531 | movu m2, [r3 + 8] |
| 25532 | movu m3, [r3 + 9] |
| 25533 | punpcklbw m2, m3 |
| 25534 | pmaddubsw m3, m2, m6 |
| 25535 | pmulhrsw m3, m7 |
| 25536 | packuswb m1, m3 |
| 25537 | movu [r0 + 1216 * 16], m1 |
| 25538 | |
| 25539 | movu m1, [r3 + 16] |
| 25540 | movu m3, [r3 + 17] |
| 25541 | punpcklbw m1, m3 |
| 25542 | pmaddubsw m4, m1, m6 |
| 25543 | pmulhrsw m4, m7 |
| 25544 | movu m3, [r3 + 24] |
| 25545 | movu m5, [r3 + 25] |
| 25546 | punpcklbw m3, m5 |
| 25547 | pmaddubsw m5, m3, m6 |
| 25548 | pmulhrsw m5, m7 |
| 25549 | packuswb m4, m5 |
| 25550 | movu [r0 + 1217 * 16], m4 |
| 25551 | |
| 25552 | ; mode 21 [row 1] |
| 25553 | movu m6, [r5 + 30 * 16] |
| 25554 | pslldq m0, 2 |
| 25555 | pinsrb m0, [r4 + 0], 1 |
| 25556 | pinsrb m0, [r4 + 2], 0 |
| 25557 | pmaddubsw m4, m0, m6 |
| 25558 | pmulhrsw m4, m7 |
| 25559 | pslldq m2, 2 |
| 25560 | pinsrw m2, [r3 + 7], 0 |
| 25561 | pmaddubsw m5, m2, m6 |
| 25562 | pmulhrsw m5, m7 |
| 25563 | packuswb m4, m5 |
| 25564 | movu [r0 + 1218 * 16], m4 |
| 25565 | pslldq m1, 2 |
| 25566 | pinsrw m1, [r3 + 15], 0 |
| 25567 | pmaddubsw m4, m1, m6 |
| 25568 | pmulhrsw m4, m7 |
| 25569 | pslldq m3, 2 |
| 25570 | pinsrw m3, [r3 + 23], 0 |
| 25571 | pmaddubsw m5, m3, m6 |
| 25572 | pmulhrsw m5, m7 |
| 25573 | packuswb m4, m5 |
| 25574 | movu [r0 + 1219 * 16], m4 |
| 25575 | |
| 25576 | ; mode 21 [row 2] |
| 25577 | movu m6, [r5 + 13 * 16] |
| 25578 | pmaddubsw m4, m0, m6 |
| 25579 | pmulhrsw m4, m7 |
| 25580 | pmaddubsw m5, m2, m6 |
| 25581 | pmulhrsw m5, m7 |
| 25582 | packuswb m4, m5 |
| 25583 | movu [r0 + 1220 * 16], m4 |
| 25584 | pmaddubsw m4, m1, m6 |
| 25585 | pmulhrsw m4, m7 |
| 25586 | pmaddubsw m5, m3, m6 |
| 25587 | pmulhrsw m5, m7 |
| 25588 | packuswb m4, m5 |
| 25589 | movu [r0 + 1221 * 16], m4 |
| 25590 | |
| 25591 | ; mode 21 [row 3] |
| 25592 | movu m6, [r5 + 28 * 16] |
| 25593 | pslldq m0, 2 |
| 25594 | pinsrb m0, [r4 + 2], 1 |
| 25595 | pinsrb m0, [r4 + 4], 0 |
| 25596 | pmaddubsw m4, m0, m6 |
| 25597 | pmulhrsw m4, m7 |
| 25598 | pslldq m2, 2 |
| 25599 | pinsrw m2, [r3 + 6], 0 |
| 25600 | pmaddubsw m5, m2, m6 |
| 25601 | pmulhrsw m5, m7 |
| 25602 | packuswb m4, m5 |
| 25603 | movu [r0 + 1222 * 16], m4 |
| 25604 | pslldq m1, 2 |
| 25605 | pinsrw m1, [r3 + 14], 0 |
| 25606 | pmaddubsw m4, m1, m6 |
| 25607 | pmulhrsw m4, m7 |
| 25608 | pslldq m3, 2 |
| 25609 | pinsrw m3, [r3 + 22], 0 |
| 25610 | pmaddubsw m5, m3, m6 |
| 25611 | pmulhrsw m5, m7 |
| 25612 | packuswb m4, m5 |
| 25613 | movu [r0 + 1223 * 16], m4 |
| 25614 | |
| 25615 | ; mode 21 [row 4] |
| 25616 | movu m6, [r5 + 11 * 16] |
| 25617 | pmaddubsw m4, m0, m6 |
| 25618 | pmulhrsw m4, m7 |
| 25619 | pmaddubsw m5, m2, m6 |
| 25620 | pmulhrsw m5, m7 |
| 25621 | packuswb m4, m5 |
| 25622 | movu [r0 + 1224 * 16], m4 |
| 25623 | pmaddubsw m4, m1, m6 |
| 25624 | pmulhrsw m4, m7 |
| 25625 | pmaddubsw m5, m3, m6 |
| 25626 | pmulhrsw m5, m7 |
| 25627 | packuswb m4, m5 |
| 25628 | movu [r0 + 1225 * 16], m4 |
| 25629 | |
| 25630 | ; mode 21 [row 5] |
| 25631 | movu m6, [r5 + 26 * 16] |
| 25632 | pslldq m0, 2 |
| 25633 | pinsrb m0, [r4 + 4], 1 |
| 25634 | pinsrb m0, [r4 + 6], 0 |
| 25635 | pmaddubsw m4, m0, m6 |
| 25636 | pmulhrsw m4, m7 |
| 25637 | pslldq m2, 2 |
| 25638 | pinsrw m2, [r3 + 5], 0 |
| 25639 | pmaddubsw m5, m2, m6 |
| 25640 | pmulhrsw m5, m7 |
| 25641 | packuswb m4, m5 |
| 25642 | movu [r0 + 1226 * 16], m4 |
| 25643 | pslldq m1, 2 |
| 25644 | pinsrw m1, [r3 + 13], 0 |
| 25645 | pmaddubsw m4, m1, m6 |
| 25646 | pmulhrsw m4, m7 |
| 25647 | pslldq m3, 2 |
| 25648 | pinsrw m3, [r3 + 21], 0 |
| 25649 | pmaddubsw m5, m3, m6 |
| 25650 | pmulhrsw m5, m7 |
| 25651 | packuswb m4, m5 |
| 25652 | movu [r0 + 1227 * 16], m4 |
| 25653 | |
| 25654 | ; mode 21 [row 6] |
| 25655 | movu m6, [r5 + 9 * 16] |
| 25656 | pmaddubsw m4, m0, m6 |
| 25657 | pmulhrsw m4, m7 |
| 25658 | pmaddubsw m5, m2, m6 |
| 25659 | pmulhrsw m5, m7 |
| 25660 | packuswb m4, m5 |
| 25661 | movu [r0 + 1228 * 16], m4 |
| 25662 | pmaddubsw m4, m1, m6 |
| 25663 | pmulhrsw m4, m7 |
| 25664 | pmaddubsw m5, m3, m6 |
| 25665 | pmulhrsw m5, m7 |
| 25666 | packuswb m4, m5 |
| 25667 | movu [r0 + 1229 * 16], m4 |
| 25668 | |
| 25669 | ; mode 21 [row 7] |
| 25670 | movu m6, [r5 + 24 * 16] |
| 25671 | pslldq m0, 2 |
| 25672 | pinsrb m0, [r4 + 6], 1 |
| 25673 | pinsrb m0, [r4 + 8], 0 |
| 25674 | pmaddubsw m4, m0, m6 |
| 25675 | pmulhrsw m4, m7 |
| 25676 | pslldq m2, 2 |
| 25677 | pinsrw m2, [r3 + 4], 0 |
| 25678 | pmaddubsw m5, m2, m6 |
| 25679 | pmulhrsw m5, m7 |
| 25680 | packuswb m4, m5 |
| 25681 | movu [r0 + 1230 * 16], m4 |
| 25682 | pslldq m1, 2 |
| 25683 | pinsrw m1, [r3 + 12], 0 |
| 25684 | pmaddubsw m4, m1, m6 |
| 25685 | pmulhrsw m4, m7 |
| 25686 | pslldq m3, 2 |
| 25687 | pinsrw m3, [r3 + 20], 0 |
| 25688 | pmaddubsw m5, m3, m6 |
| 25689 | pmulhrsw m5, m7 |
| 25690 | packuswb m4, m5 |
| 25691 | movu [r0 + 1231 * 16], m4 |
| 25692 | |
| 25693 | ; mode 21 [row 8] |
| 25694 | movu m6, [r5 + 7 * 16] |
| 25695 | pmaddubsw m4, m0, m6 |
| 25696 | pmulhrsw m4, m7 |
| 25697 | pmaddubsw m5, m2, m6 |
| 25698 | pmulhrsw m5, m7 |
| 25699 | packuswb m4, m5 |
| 25700 | movu [r0 + 1232 * 16], m4 |
| 25701 | pmaddubsw m4, m1, m6 |
| 25702 | pmulhrsw m4, m7 |
| 25703 | pmaddubsw m5, m3, m6 |
| 25704 | pmulhrsw m5, m7 |
| 25705 | packuswb m4, m5 |
| 25706 | movu [r0 + 1233 * 16], m4 |
| 25707 | |
| 25708 | ; mode 21 [row 9] |
| 25709 | movu m6, [r5 + 22 * 16] |
| 25710 | pslldq m0, 2 |
| 25711 | pinsrb m0, [r4 + 8], 1 |
| 25712 | pinsrb m0, [r4 + 9], 0 |
| 25713 | pmaddubsw m4, m0, m6 |
| 25714 | pmulhrsw m4, m7 |
| 25715 | pslldq m2, 2 |
| 25716 | pinsrw m2, [r3 + 3], 0 |
| 25717 | pmaddubsw m5, m2, m6 |
| 25718 | pmulhrsw m5, m7 |
| 25719 | packuswb m4, m5 |
| 25720 | movu [r0 + 1234 * 16], m4 |
| 25721 | pslldq m1, 2 |
| 25722 | pinsrw m1, [r3 + 11], 0 |
| 25723 | pmaddubsw m4, m1, m6 |
| 25724 | pmulhrsw m4, m7 |
| 25725 | pslldq m3, 2 |
| 25726 | pinsrw m3, [r3 + 19], 0 |
| 25727 | pmaddubsw m5, m3, m6 |
| 25728 | pmulhrsw m5, m7 |
| 25729 | packuswb m4, m5 |
| 25730 | movu [r0 + 1235 * 16], m4 |
| 25731 | |
| 25732 | ; mode 21 [row 10] |
| 25733 | movu m6, [r5 + 5 * 16] |
| 25734 | pmaddubsw m4, m0, m6 |
| 25735 | pmulhrsw m4, m7 |
| 25736 | pmaddubsw m5, m2, m6 |
| 25737 | pmulhrsw m5, m7 |
| 25738 | packuswb m4, m5 |
| 25739 | movu [r0 + 1236 * 16], m4 |
| 25740 | pmaddubsw m4, m1, m6 |
| 25741 | pmulhrsw m4, m7 |
| 25742 | pmaddubsw m5, m3, m6 |
| 25743 | pmulhrsw m5, m7 |
| 25744 | packuswb m4, m5 |
| 25745 | movu [r0 + 1237 * 16], m4 |
| 25746 | |
| 25747 | ; mode 21 [row 11] |
| 25748 | movu m6, [r5 + 20 * 16] |
| 25749 | pslldq m0, 2 |
| 25750 | pinsrb m0, [r4 + 9], 1 |
| 25751 | pinsrb m0, [r4 + 11], 0 |
| 25752 | pmaddubsw m4, m0, m6 |
| 25753 | pmulhrsw m4, m7 |
| 25754 | pslldq m2, 2 |
| 25755 | pinsrw m2, [r3 + 2], 0 |
| 25756 | pmaddubsw m5, m2, m6 |
| 25757 | pmulhrsw m5, m7 |
| 25758 | packuswb m4, m5 |
| 25759 | movu [r0 + 1238 * 16], m4 |
| 25760 | pslldq m1, 2 |
| 25761 | pinsrw m1, [r3 + 10], 0 |
| 25762 | pmaddubsw m4, m1, m6 |
| 25763 | pmulhrsw m4, m7 |
| 25764 | pslldq m3, 2 |
| 25765 | pinsrw m3, [r3 + 18], 0 |
| 25766 | pmaddubsw m5, m3, m6 |
| 25767 | pmulhrsw m5, m7 |
| 25768 | packuswb m4, m5 |
| 25769 | movu [r0 + 1239 * 16], m4 |
| 25770 | |
| 25771 | ; mode 21 [row 12] |
| 25772 | movu m6, [r5 + 3 * 16] |
| 25773 | pmaddubsw m4, m0, m6 |
| 25774 | pmulhrsw m4, m7 |
| 25775 | pmaddubsw m5, m2, m6 |
| 25776 | pmulhrsw m5, m7 |
| 25777 | packuswb m4, m5 |
| 25778 | movu [r0 + 1240 * 16], m4 |
| 25779 | pmaddubsw m4, m1, m6 |
| 25780 | pmulhrsw m4, m7 |
| 25781 | pmaddubsw m5, m3, m6 |
| 25782 | pmulhrsw m5, m7 |
| 25783 | packuswb m4, m5 |
| 25784 | movu [r0 + 1241 * 16], m4 |
| 25785 | |
| 25786 | ; mode 21 [row 13] |
| 25787 | movu m6, [r5 + 18 * 16] |
| 25788 | pslldq m0, 2 |
| 25789 | pinsrb m0, [r4 + 11], 1 |
| 25790 | pinsrb m0, [r4 + 13], 0 |
| 25791 | pmaddubsw m4, m0, m6 |
| 25792 | pmulhrsw m4, m7 |
| 25793 | pslldq m2, 2 |
| 25794 | pinsrw m2, [r3 + 1], 0 |
| 25795 | pmaddubsw m5, m2, m6 |
| 25796 | pmulhrsw m5, m7 |
| 25797 | packuswb m4, m5 |
| 25798 | movu [r0 + 1242 * 16], m4 |
| 25799 | pslldq m1, 2 |
| 25800 | pinsrw m1, [r3 + 9], 0 |
| 25801 | pmaddubsw m4, m1, m6 |
| 25802 | pmulhrsw m4, m7 |
| 25803 | pslldq m3, 2 |
| 25804 | pinsrw m3, [r3 + 17], 0 |
| 25805 | pmaddubsw m5, m3, m6 |
| 25806 | pmulhrsw m5, m7 |
| 25807 | packuswb m4, m5 |
| 25808 | movu [r0 + 1243 * 16], m4 |
| 25809 | |
| 25810 | ; mode 21 [row 14] |
| 25811 | movu m6, [r5 + 1 * 16] |
| 25812 | pmaddubsw m4, m0, m6 |
| 25813 | pmulhrsw m4, m7 |
| 25814 | pmaddubsw m5, m2, m6 |
| 25815 | pmulhrsw m5, m7 |
| 25816 | packuswb m4, m5 |
| 25817 | movu [r0 + 1244 * 16], m4 |
| 25818 | pmaddubsw m4, m1, m6 |
| 25819 | pmulhrsw m4, m7 |
| 25820 | pmaddubsw m5, m3, m6 |
| 25821 | pmulhrsw m5, m7 |
| 25822 | packuswb m4, m5 |
| 25823 | movu [r0 + 1245 * 16], m4 |
| 25824 | |
| 25825 | ; mode 21 [row 15] |
| 25826 | movu m6, [r5 + 16 * 16] |
| 25827 | pslldq m0, 2 |
| 25828 | pinsrb m0, [r4 + 13], 1 |
| 25829 | pinsrb m0, [r4 + 15], 0 |
| 25830 | pmaddubsw m4, m0, m6 |
| 25831 | pmulhrsw m4, m7 |
| 25832 | pslldq m2, 2 |
| 25833 | pinsrw m2, [r3 + 0], 0 |
| 25834 | pmaddubsw m5, m2, m6 |
| 25835 | pmulhrsw m5, m7 |
| 25836 | packuswb m4, m5 |
| 25837 | movu [r0 + 1246 * 16], m4 |
| 25838 | pslldq m1, 2 |
| 25839 | pinsrw m1, [r3 + 8], 0 |
| 25840 | pmaddubsw m4, m1, m6 |
| 25841 | pmulhrsw m4, m7 |
| 25842 | pslldq m3, 2 |
| 25843 | pinsrw m3, [r3 + 16], 0 |
| 25844 | pmaddubsw m5, m3, m6 |
| 25845 | pmulhrsw m5, m7 |
| 25846 | packuswb m4, m5 |
| 25847 | movu [r0 + 1247 * 16], m4 |
| 25848 | |
| 25849 | ; mode 21 [row 16] |
| 25850 | movu m6, [r5 + 31 * 16] |
| 25851 | pslldq m0, 2 |
| 25852 | pinsrb m0, [r4 + 15], 1 |
| 25853 | pinsrb m0, [r4 + 17], 0 |
| 25854 | pmaddubsw m4, m0, m6 |
| 25855 | pmulhrsw m4, m7 |
| 25856 | pslldq m2, 2 |
| 25857 | pinsrb m2, [r4 + 0], 1 |
| 25858 | pinsrb m2, [r4 + 2], 0 |
| 25859 | pmaddubsw m5, m2, m6 |
| 25860 | pmulhrsw m5, m7 |
| 25861 | packuswb m4, m5 |
| 25862 | movu [r0 + 1248 * 16], m4 |
| 25863 | pslldq m1, 2 |
| 25864 | pinsrw m1, [r3 + 7], 0 |
| 25865 | pmaddubsw m4, m1, m6 |
| 25866 | pmulhrsw m4, m7 |
| 25867 | pslldq m3, 2 |
| 25868 | pinsrw m3, [r3 + 15], 0 |
| 25869 | pmaddubsw m5, m3, m6 |
| 25870 | pmulhrsw m5, m7 |
| 25871 | packuswb m4, m5 |
| 25872 | movu [r0 + 1249 * 16], m4 |
| 25873 | |
| 25874 | ; mode 21 [row 17] |
| 25875 | movu m6, [r5 + 14 * 16] |
| 25876 | pmaddubsw m4, m0, m6 |
| 25877 | pmulhrsw m4, m7 |
| 25878 | pmaddubsw m5, m2, m6 |
| 25879 | pmulhrsw m5, m7 |
| 25880 | packuswb m4, m5 |
| 25881 | movu [r0 + 1250 * 16], m4 |
| 25882 | pmaddubsw m4, m1, m6 |
| 25883 | pmulhrsw m4, m7 |
| 25884 | pmaddubsw m5, m3, m6 |
| 25885 | pmulhrsw m5, m7 |
| 25886 | packuswb m4, m5 |
| 25887 | movu [r0 + 1251 * 16], m4 |
| 25888 | |
| 25889 | ; mode 21 [row 18] |
| 25890 | movu m6, [r5 + 29 * 16] |
| 25891 | pslldq m0, 2 |
| 25892 | pinsrb m0, [r4 + 17], 1 |
| 25893 | pinsrb m0, [r4 + 19], 0 |
| 25894 | pmaddubsw m4, m0, m6 |
| 25895 | pmulhrsw m4, m7 |
| 25896 | pslldq m2, 2 |
| 25897 | pinsrb m2, [r4 + 2], 1 |
| 25898 | pinsrb m2, [r4 + 4], 0 |
| 25899 | pmaddubsw m5, m2, m6 |
| 25900 | pmulhrsw m5, m7 |
| 25901 | packuswb m4, m5 |
| 25902 | movu [r0 + 1252 * 16], m4 |
| 25903 | pslldq m1, 2 |
| 25904 | pinsrb m1, [r3 + 7], 1 |
| 25905 | pinsrb m1, [r3 + 6], 0 |
| 25906 | pmaddubsw m4, m1, m6 |
| 25907 | pmulhrsw m4, m7 |
| 25908 | pslldq m3, 2 |
| 25909 | pinsrb m3, [r3 + 15], 1 |
| 25910 | pinsrb m3, [r3 + 14], 0 |
| 25911 | pmaddubsw m5, m3, m6 |
| 25912 | pmulhrsw m5, m7 |
| 25913 | packuswb m4, m5 |
| 25914 | movu [r0 + 1253 * 16], m4 |
| 25915 | |
| 25916 | ; mode 21 [row 19] |
| 25917 | movu m6, [r5 + 12 * 16] |
| 25918 | pmaddubsw m4, m0, m6 |
| 25919 | pmulhrsw m4, m7 |
| 25920 | pmaddubsw m5, m2, m6 |
| 25921 | pmulhrsw m5, m7 |
| 25922 | packuswb m4, m5 |
| 25923 | movu [r0 + 1254 * 16], m4 |
| 25924 | pmaddubsw m4, m1, m6 |
| 25925 | pmulhrsw m4, m7 |
| 25926 | pmaddubsw m5, m3, m6 |
| 25927 | pmulhrsw m5, m7 |
| 25928 | packuswb m4, m5 |
| 25929 | movu [r0 + 1255 * 16], m4 |
| 25930 | |
| 25931 | ; mode 21 [row 20] |
| 25932 | movu m6, [r5 + 27 * 16] |
| 25933 | pslldq m0, 2 |
| 25934 | pinsrb m0, [r4 + 19], 1 |
| 25935 | pinsrb m0, [r4 + 21], 0 |
| 25936 | pmaddubsw m4, m0, m6 |
| 25937 | pmulhrsw m4, m7 |
| 25938 | pslldq m2, 2 |
| 25939 | pinsrb m2, [r4 + 4], 1 |
| 25940 | pinsrb m2, [r4 + 6], 0 |
| 25941 | pmaddubsw m5, m2, m6 |
| 25942 | pmulhrsw m5, m7 |
| 25943 | packuswb m4, m5 |
| 25944 | movu [r0 + 1256 * 16], m4 |
| 25945 | pslldq m1, 2 |
| 25946 | pinsrw m1, [r3 + 5], 0 |
| 25947 | pmaddubsw m4, m1, m6 |
| 25948 | pmulhrsw m4, m7 |
| 25949 | pslldq m3, 2 |
| 25950 | pinsrw m3, [r3 + 13], 0 |
| 25951 | pmaddubsw m5, m3, m6 |
| 25952 | pmulhrsw m5, m7 |
| 25953 | packuswb m4, m5 |
| 25954 | movu [r0 + 1257 * 16], m4 |
| 25955 | |
| 25956 | ; mode 21 [row 21] |
| 25957 | movu m6, [r5 + 10 * 16] |
| 25958 | pmaddubsw m4, m0, m6 |
| 25959 | pmulhrsw m4, m7 |
| 25960 | pmaddubsw m5, m2, m6 |
| 25961 | pmulhrsw m5, m7 |
| 25962 | packuswb m4, m5 |
| 25963 | movu [r0 + 1258 * 16], m4 |
| 25964 | pmaddubsw m4, m1, m6 |
| 25965 | pmulhrsw m4, m7 |
| 25966 | pmaddubsw m5, m3, m6 |
| 25967 | pmulhrsw m5, m7 |
| 25968 | packuswb m4, m5 |
| 25969 | movu [r0 + 1259 * 16], m4 |
| 25970 | |
| 25971 | ; mode 21 [row 22] |
| 25972 | movu m6, [r5 + 25 * 16] |
| 25973 | pslldq m0, 2 |
| 25974 | pinsrb m0, [r4 + 21], 1 |
| 25975 | pinsrb m0, [r4 + 23], 0 |
| 25976 | pmaddubsw m4, m0, m6 |
| 25977 | pmulhrsw m4, m7 |
| 25978 | pslldq m2, 2 |
| 25979 | pinsrb m2, [r4 + 6], 1 |
| 25980 | pinsrb m2, [r4 + 8], 0 |
| 25981 | pmaddubsw m5, m2, m6 |
| 25982 | pmulhrsw m5, m7 |
| 25983 | packuswb m4, m5 |
| 25984 | movu [r0 + 1260 * 16], m4 |
| 25985 | pslldq m1, 2 |
| 25986 | pinsrw m1, [r3 + 4], 0 |
| 25987 | pmaddubsw m4, m1, m6 |
| 25988 | pmulhrsw m4, m7 |
| 25989 | pslldq m3, 2 |
| 25990 | pinsrw m3, [r3 + 12], 0 |
| 25991 | pmaddubsw m5, m3, m6 |
| 25992 | pmulhrsw m5, m7 |
| 25993 | packuswb m4, m5 |
| 25994 | movu [r0 + 1261 * 16], m4 |
| 25995 | |
| 25996 | ; mode 21 [row 23] |
| 25997 | movu m6, [r5 + 8 * 16] |
| 25998 | pmaddubsw m4, m0, m6 |
| 25999 | pmulhrsw m4, m7 |
| 26000 | pmaddubsw m5, m2, m6 |
| 26001 | pmulhrsw m5, m7 |
| 26002 | packuswb m4, m5 |
| 26003 | movu [r0 + 1262 * 16], m4 |
| 26004 | pmaddubsw m4, m1, m6 |
| 26005 | pmulhrsw m4, m7 |
| 26006 | pmaddubsw m5, m3, m6 |
| 26007 | pmulhrsw m5, m7 |
| 26008 | packuswb m4, m5 |
| 26009 | movu [r0 + 1263 * 16], m4 |
| 26010 | |
| 26011 | ; mode 21 [row 24] |
| 26012 | movu m6, [r5 + 23 * 16] |
| 26013 | pslldq m0, 2 |
| 26014 | pinsrb m0, [r4 + 23], 1 |
| 26015 | pinsrb m0, [r4 + 24], 0 |
| 26016 | pmaddubsw m4, m0, m6 |
| 26017 | pmulhrsw m4, m7 |
| 26018 | pslldq m2, 2 |
| 26019 | pinsrb m2, [r4 + 8], 1 |
| 26020 | pinsrb m2, [r4 + 9], 0 |
| 26021 | pmaddubsw m5, m2, m6 |
| 26022 | pmulhrsw m5, m7 |
| 26023 | packuswb m4, m5 |
| 26024 | movu [r0 + 1264 * 16], m4 |
| 26025 | pslldq m1, 2 |
| 26026 | pinsrw m1, [r3 + 3], 0 |
| 26027 | pmaddubsw m4, m1, m6 |
| 26028 | pmulhrsw m4, m7 |
| 26029 | pslldq m3, 2 |
| 26030 | pinsrw m3, [r3 + 11], 0 |
| 26031 | pmaddubsw m5, m3, m6 |
| 26032 | pmulhrsw m5, m7 |
| 26033 | packuswb m4, m5 |
| 26034 | movu [r0 + 1265 * 16], m4 |
| 26035 | |
| 26036 | ; mode 21 [row 25] |
| 26037 | movu m6, [r5 + 6 * 16] |
| 26038 | pmaddubsw m4, m0, m6 |
| 26039 | pmulhrsw m4, m7 |
| 26040 | pmaddubsw m5, m2, m6 |
| 26041 | pmulhrsw m5, m7 |
| 26042 | packuswb m4, m5 |
| 26043 | movu [r0 + 1266 * 16], m4 |
| 26044 | pmaddubsw m4, m1, m6 |
| 26045 | pmulhrsw m4, m7 |
| 26046 | pmaddubsw m5, m3, m6 |
| 26047 | pmulhrsw m5, m7 |
| 26048 | packuswb m4, m5 |
| 26049 | movu [r0 + 1267 * 16], m4 |
| 26050 | |
| 26051 | ; mode 21 [row 26] |
| 26052 | movu m6, [r5 + 21 * 16] |
| 26053 | pslldq m0, 2 |
| 26054 | pinsrb m0, [r4 + 24], 1 |
| 26055 | pinsrb m0, [r4 + 26], 0 |
| 26056 | pmaddubsw m4, m0, m6 |
| 26057 | pmulhrsw m4, m7 |
| 26058 | pslldq m2, 2 |
| 26059 | pinsrb m2, [r4 + 9], 1 |
| 26060 | pinsrb m2, [r4 + 11], 0 |
| 26061 | pmaddubsw m5, m2, m6 |
| 26062 | pmulhrsw m5, m7 |
| 26063 | packuswb m4, m5 |
| 26064 | movu [r0 + 1268 * 16], m4 |
| 26065 | pslldq m1, 2 |
| 26066 | pinsrw m1, [r3 + 2], 0 |
| 26067 | pmaddubsw m4, m1, m6 |
| 26068 | pmulhrsw m4, m7 |
| 26069 | pslldq m3, 2 |
| 26070 | pinsrw m3, [r3 + 10], 0 |
| 26071 | pmaddubsw m5, m3, m6 |
| 26072 | pmulhrsw m5, m7 |
| 26073 | packuswb m4, m5 |
| 26074 | movu [r0 + 1269 * 16], m4 |
| 26075 | |
| 26076 | ; mode 21 [row 27] |
| 26077 | movu m6, [r5 + 4 * 16] |
| 26078 | pmaddubsw m4, m0, m6 |
| 26079 | pmulhrsw m4, m7 |
| 26080 | pmaddubsw m5, m2, m6 |
| 26081 | pmulhrsw m5, m7 |
| 26082 | packuswb m4, m5 |
| 26083 | movu [r0 + 1270 * 16], m4 |
| 26084 | pmaddubsw m4, m1, m6 |
| 26085 | pmulhrsw m4, m7 |
| 26086 | pmaddubsw m5, m3, m6 |
| 26087 | pmulhrsw m5, m7 |
| 26088 | packuswb m4, m5 |
| 26089 | movu [r0 + 1271 * 16], m4 |
| 26090 | |
| 26091 | ; mode 21 [row 28] |
| 26092 | movu m6, [r5 + 19 * 16] |
| 26093 | pslldq m0, 2 |
| 26094 | pinsrb m0, [r4 + 26], 1 |
| 26095 | pinsrb m0, [r4 + 28], 0 |
| 26096 | pmaddubsw m4, m0, m6 |
| 26097 | pmulhrsw m4, m7 |
| 26098 | pslldq m2, 2 |
| 26099 | pinsrb m2, [r4 + 11], 1 |
| 26100 | pinsrb m2, [r4 + 13], 0 |
| 26101 | pmaddubsw m5, m2, m6 |
| 26102 | pmulhrsw m5, m7 |
| 26103 | packuswb m4, m5 |
| 26104 | movu [r0 + 1272 * 16], m4 |
| 26105 | pslldq m1, 2 |
| 26106 | pinsrw m1, [r3 + 1], 0 |
| 26107 | pmaddubsw m4, m1, m6 |
| 26108 | pmulhrsw m4, m7 |
| 26109 | pslldq m3, 2 |
| 26110 | pinsrw m3, [r3 + 9], 0 |
| 26111 | pmaddubsw m5, m3, m6 |
| 26112 | pmulhrsw m5, m7 |
| 26113 | packuswb m4, m5 |
| 26114 | movu [r0 + 1273 * 16], m4 |
| 26115 | |
| 26116 | ; mode 21 [row 29] |
| 26117 | movu m6, [r5 + 2 * 16] |
| 26118 | pmaddubsw m4, m0, m6 |
| 26119 | pmulhrsw m4, m7 |
| 26120 | pmaddubsw m5, m2, m6 |
| 26121 | pmulhrsw m5, m7 |
| 26122 | packuswb m4, m5 |
| 26123 | movu [r0 + 1274 * 16], m4 |
| 26124 | pmaddubsw m4, m1, m6 |
| 26125 | pmulhrsw m4, m7 |
| 26126 | pmaddubsw m5, m3, m6 |
| 26127 | pmulhrsw m5, m7 |
| 26128 | packuswb m4, m5 |
| 26129 | movu [r0 + 1275 * 16], m4 |
| 26130 | |
| 26131 | ; mode 21 [row 30] |
| 26132 | movu m6, [r5 + 17 * 16] |
| 26133 | pslldq m0, 2 |
| 26134 | pinsrb m0, [r4 + 28], 1 |
| 26135 | pinsrb m0, [r4 + 30], 0 |
| 26136 | pmaddubsw m4, m0, m6 |
| 26137 | pmulhrsw m4, m7 |
| 26138 | pslldq m2, 2 |
| 26139 | pinsrb m2, [r4 + 13], 1 |
| 26140 | pinsrb m2, [r4 + 15], 0 |
| 26141 | pmaddubsw m5, m2, m6 |
| 26142 | pmulhrsw m5, m7 |
| 26143 | packuswb m4, m5 |
| 26144 | movu [r0 + 1276 * 16], m4 |
| 26145 | pslldq m1, 2 |
| 26146 | pinsrw m1, [r3 + 0], 0 |
| 26147 | pmaddubsw m4, m1, m6 |
| 26148 | pmulhrsw m4, m7 |
| 26149 | pslldq m3, 2 |
| 26150 | pinsrw m3, [r3 + 8], 0 |
| 26151 | pmaddubsw m5, m3, m6 |
| 26152 | pmulhrsw m5, m7 |
| 26153 | packuswb m4, m5 |
| 26154 | movu [r0 + 1277 * 16], m4 |
| 26155 | |
| 26156 | ; mode 21 [row 31]: no weighting pass, each register is shuffled through tab_S2 and stored with movh |
| 26157 | pshufb m5, m0, [tab_S2] |
| 26158 | movh [r0 + 1278 * 16], m5 |
| 26159 | pshufb m5, m2, [tab_S2] |
| 26160 | movh [r0 + 1278 * 16 + 8], m5 |
| 26161 | pshufb m5, m1, [tab_S2] |
| 26162 | movh [r0 + 1279 * 16], m5 |
| 26163 | pshufb m5, m3, [tab_S2] |
| 26164 | movh [r0 + 1279 * 16 + 8], m5 |
| 26165 | |
| 26166 | ; mode 22 [row 0] |
| 26167 | movu m6, [r5 + 19 * 16] |
| 26168 | movu m0, [r3 ] |
| 26169 | movu m1, [r3 + 1 ] |
| 26170 | punpcklbw m0, m1 |
| 26171 | pmaddubsw m1, m0, m6 |
| 26172 | pmulhrsw m1, m7 |
| 26173 | movu m2, [r3 + 8] |
| 26174 | movu m3, [r3 + 9] |
| 26175 | punpcklbw m2, m3 |
| 26176 | pmaddubsw m3, m2, m6 |
| 26177 | pmulhrsw m3, m7 |
| 26178 | packuswb m1, m3 |
| 26179 | movu [r0 + 1280 * 16], m1 |
| 26180 | |
| 26181 | movu m1, [r3 + 16] |
| 26182 | movu m3, [r3 + 17] |
| 26183 | punpcklbw m1, m3 |
| 26184 | pmaddubsw m4, m1, m6 |
| 26185 | pmulhrsw m4, m7 |
| 26186 | movu m3, [r3 + 24] |
| 26187 | movu m5, [r3 + 25] |
| 26188 | punpcklbw m3, m5 |
| 26189 | pmaddubsw m5, m3, m6 |
| 26190 | pmulhrsw m5, m7 |
| 26191 | packuswb m4, m5 |
| 26192 | movu [r0 + 1281 * 16], m4 |
| 26193 | |
| 26194 | ; mode 22 [row 1] |
| 26195 | movu m6, [r5 + 6 * 16] |
| 26196 | pmaddubsw m4, m0, m6 |
| 26197 | pmulhrsw m4, m7 |
| 26198 | pmaddubsw m5, m2, m6 |
| 26199 | pmulhrsw m5, m7 |
| 26200 | packuswb m4, m5 |
| 26201 | movu [r0 + 1282 * 16], m4 |
| 26202 | pmaddubsw m4, m1, m6 |
| 26203 | pmulhrsw m4, m7 |
| 26204 | pmaddubsw m5, m3, m6 |
| 26205 | pmulhrsw m5, m7 |
| 26206 | packuswb m4, m5 |
| 26207 | movu [r0 + 1283 * 16], m4 |
| 26208 | |
| 26209 | ; mode 22 [row 2] |
| 26210 | movu m6, [r5 + 25 * 16] |
| 26211 | pslldq m0, 2 |
| 26212 | pinsrb m0, [r4 + 0], 1 |
| 26213 | pinsrb m0, [r4 + 2], 0 |
| 26214 | pmaddubsw m4, m0, m6 |
| 26215 | pmulhrsw m4, m7 |
| 26216 | pslldq m2, 2 |
| 26217 | pinsrw m2, [r3 + 7], 0 |
| 26218 | pmaddubsw m5, m2, m6 |
| 26219 | pmulhrsw m5, m7 |
| 26220 | packuswb m4, m5 |
| 26221 | movu [r0 + 1284 * 16], m4 |
| 26222 | pslldq m1, 2 |
| 26223 | pinsrw m1, [r3 + 15], 0 |
| 26224 | pmaddubsw m4, m1, m6 |
| 26225 | pmulhrsw m4, m7 |
| 26226 | pslldq m3, 2 |
| 26227 | pinsrw m3, [r3 + 23], 0 |
| 26228 | pmaddubsw m5, m3, m6 |
| 26229 | pmulhrsw m5, m7 |
| 26230 | packuswb m4, m5 |
| 26231 | movu [r0 + 1285 * 16], m4 |
| 26232 | |
| 26233 | ; mode 22 [row 3] |
| 26234 | movu m6, [r5 + 12 * 16] |
| 26235 | pmaddubsw m4, m0, m6 |
| 26236 | pmulhrsw m4, m7 |
| 26237 | pmaddubsw m5, m2, m6 |
| 26238 | pmulhrsw m5, m7 |
| 26239 | packuswb m4, m5 |
| 26240 | movu [r0 + 1286 * 16], m4 |
| 26241 | pmaddubsw m4, m1, m6 |
| 26242 | pmulhrsw m4, m7 |
| 26243 | pmaddubsw m5, m3, m6 |
| 26244 | pmulhrsw m5, m7 |
| 26245 | packuswb m4, m5 |
| 26246 | movu [r0 + 1287 * 16], m4 |
| 26247 | |
| 26248 | ; mode 22 [row 4] |
| 26249 | movu m6, [r5 + 31 * 16] |
| 26250 | pslldq m0, 2 |
| 26251 | pinsrb m0, [r4 + 2], 1 |
| 26252 | pinsrb m0, [r4 + 5], 0 |
| 26253 | pmaddubsw m4, m0, m6 |
| 26254 | pmulhrsw m4, m7 |
| 26255 | pslldq m2, 2 |
| 26256 | pinsrw m2, [r3 + 6], 0 |
| 26257 | pmaddubsw m5, m2, m6 |
| 26258 | pmulhrsw m5, m7 |
| 26259 | packuswb m4, m5 |
| 26260 | movu [r0 + 1288 * 16], m4 |
| 26261 | pslldq m1, 2 |
| 26262 | pinsrw m1, [r3 + 14], 0 |
| 26263 | pmaddubsw m4, m1, m6 |
| 26264 | pmulhrsw m4, m7 |
| 26265 | pslldq m3, 2 |
| 26266 | pinsrw m3, [r3 + 22], 0 |
| 26267 | pmaddubsw m5, m3, m6 |
| 26268 | pmulhrsw m5, m7 |
| 26269 | packuswb m4, m5 |
| 26270 | movu [r0 + 1289 * 16], m4 |
| 26271 | |
| 26272 | ; mode 22 [row 5] |
| 26273 | movu m6, [r5 + 18 * 16] |
| 26274 | pmaddubsw m4, m0, m6 |
| 26275 | pmulhrsw m4, m7 |
| 26276 | pmaddubsw m5, m2, m6 |
| 26277 | pmulhrsw m5, m7 |
| 26278 | packuswb m4, m5 |
| 26279 | movu [r0 + 1290 * 16], m4 |
| 26280 | pmaddubsw m4, m1, m6 |
| 26281 | pmulhrsw m4, m7 |
| 26282 | pmaddubsw m5, m3, m6 |
| 26283 | pmulhrsw m5, m7 |
| 26284 | packuswb m4, m5 |
| 26285 | movu [r0 + 1291 * 16], m4 |
| 26286 | |
| 26287 | ; mode 22 [row 6] |
| 26288 | movu m6, [r5 + 5 * 16] |
| 26289 | pmaddubsw m4, m0, m6 |
| 26290 | pmulhrsw m4, m7 |
| 26291 | pmaddubsw m5, m2, m6 |
| 26292 | pmulhrsw m5, m7 |
| 26293 | packuswb m4, m5 |
| 26294 | movu [r0 + 1292 * 16], m4 |
| 26295 | pmaddubsw m4, m1, m6 |
| 26296 | pmulhrsw m4, m7 |
| 26297 | pmaddubsw m5, m3, m6 |
| 26298 | pmulhrsw m5, m7 |
| 26299 | packuswb m4, m5 |
| 26300 | movu [r0 + 1293 * 16], m4 |
| 26301 | |
| 26302 | ; mode 22 [row 7] |
| 26303 | movu m6, [r5 + 24 * 16] |
| 26304 | pslldq m0, 2 |
| 26305 | pinsrb m0, [r4 + 5], 1 |
| 26306 | pinsrb m0, [r4 + 7], 0 |
| 26307 | pmaddubsw m4, m0, m6 |
| 26308 | pmulhrsw m4, m7 |
| 26309 | pslldq m2, 2 |
| 26310 | pinsrw m2, [r3 + 5], 0 |
| 26311 | pmaddubsw m5, m2, m6 |
| 26312 | pmulhrsw m5, m7 |
| 26313 | packuswb m4, m5 |
| 26314 | movu [r0 + 1294 * 16], m4 |
| 26315 | pslldq m1, 2 |
| 26316 | pinsrw m1, [r3 + 13], 0 |
| 26317 | pmaddubsw m4, m1, m6 |
| 26318 | pmulhrsw m4, m7 |
| 26319 | pslldq m3, 2 |
| 26320 | pinsrw m3, [r3 + 21], 0 |
| 26321 | pmaddubsw m5, m3, m6 |
| 26322 | pmulhrsw m5, m7 |
| 26323 | packuswb m4, m5 |
| 26324 | movu [r0 + 1295 * 16], m4 |
| 26325 | |
| 26326 | ; mode 22 [row 8] |
| 26327 | movu m6, [r5 + 11 * 16] |
| 26328 | pmaddubsw m4, m0, m6 |
| 26329 | pmulhrsw m4, m7 |
| 26330 | pmaddubsw m5, m2, m6 |
| 26331 | pmulhrsw m5, m7 |
| 26332 | packuswb m4, m5 |
| 26333 | movu [r0 + 1296 * 16], m4 |
| 26334 | pmaddubsw m4, m1, m6 |
| 26335 | pmulhrsw m4, m7 |
| 26336 | pmaddubsw m5, m3, m6 |
| 26337 | pmulhrsw m5, m7 |
| 26338 | packuswb m4, m5 |
| 26339 | movu [r0 + 1297 * 16], m4 |
| 26340 | |
| 26341 | ; mode 22 [row 9] |
| 26342 | movu m6, [r5 + 30 * 16] |
| 26343 | pslldq m0, 2 |
| 26344 | pinsrb m0, [r4 + 7], 1 |
| 26345 | pinsrb m0, [r4 + 10], 0 |
| 26346 | pmaddubsw m4, m0, m6 |
| 26347 | pmulhrsw m4, m7 |
| 26348 | pslldq m2, 2 |
| 26349 | pinsrw m2, [r3 + 4], 0 |
| 26350 | pmaddubsw m5, m2, m6 |
| 26351 | pmulhrsw m5, m7 |
| 26352 | packuswb m4, m5 |
| 26353 | movu [r0 + 1298 * 16], m4 |
| 26354 | pslldq m1, 2 |
| 26355 | pinsrw m1, [r3 + 12], 0 |
| 26356 | pmaddubsw m4, m1, m6 |
| 26357 | pmulhrsw m4, m7 |
| 26358 | pslldq m3, 2 |
| 26359 | pinsrw m3, [r3 + 20], 0 |
| 26360 | pmaddubsw m5, m3, m6 |
| 26361 | pmulhrsw m5, m7 |
| 26362 | packuswb m4, m5 |
| 26363 | movu [r0 + 1299 * 16], m4 |
| 26364 | |
| 26365 | ; mode 22 [row 10] |
| 26366 | movu m6, [r5 + 17 * 16] |
| 26367 | pmaddubsw m4, m0, m6 |
| 26368 | pmulhrsw m4, m7 |
| 26369 | pmaddubsw m5, m2, m6 |
| 26370 | pmulhrsw m5, m7 |
| 26371 | packuswb m4, m5 |
| 26372 | movu [r0 + 1300 * 16], m4 |
| 26373 | pmaddubsw m4, m1, m6 |
| 26374 | pmulhrsw m4, m7 |
| 26375 | pmaddubsw m5, m3, m6 |
| 26376 | pmulhrsw m5, m7 |
| 26377 | packuswb m4, m5 |
| 26378 | movu [r0 + 1301 * 16], m4 |
| 26379 | |
| 26380 | ; mode 22 [row 11] |
| 26381 | movu m6, [r5 + 4 * 16] |
| 26382 | pmaddubsw m4, m0, m6 |
| 26383 | pmulhrsw m4, m7 |
| 26384 | pmaddubsw m5, m2, m6 |
| 26385 | pmulhrsw m5, m7 |
| 26386 | packuswb m4, m5 |
| 26387 | movu [r0 + 1302 * 16], m4 |
| 26388 | pmaddubsw m4, m1, m6 |
| 26389 | pmulhrsw m4, m7 |
| 26390 | pmaddubsw m5, m3, m6 |
| 26391 | pmulhrsw m5, m7 |
| 26392 | packuswb m4, m5 |
| 26393 | movu [r0 + 1303 * 16], m4 |
| 26394 | |
| 26395 | ; mode 22 [row 12] |
| 26396 | movu m6, [r5 + 23 * 16] |
| 26397 | pslldq m0, 2 |
| 26398 | pinsrb m0, [r4 + 10], 1 |
| 26399 | pinsrb m0, [r4 + 12], 0 |
| 26400 | pmaddubsw m4, m0, m6 |
| 26401 | pmulhrsw m4, m7 |
| 26402 | pslldq m2, 2 |
| 26403 | pinsrw m2, [r3 + 3], 0 |
| 26404 | pmaddubsw m5, m2, m6 |
| 26405 | pmulhrsw m5, m7 |
| 26406 | packuswb m4, m5 |
| 26407 | movu [r0 + 1304 * 16], m4 |
| 26408 | pslldq m1, 2 |
| 26409 | pinsrw m1, [r3 + 11], 0 |
| 26410 | pmaddubsw m4, m1, m6 |
| 26411 | pmulhrsw m4, m7 |
| 26412 | pslldq m3, 2 |
| 26413 | pinsrw m3, [r3 + 19], 0 |
| 26414 | pmaddubsw m5, m3, m6 |
| 26415 | pmulhrsw m5, m7 |
| 26416 | packuswb m4, m5 |
| 26417 | movu [r0 + 1305 * 16], m4 |
| 26418 | |
| 26419 | ; mode 22 [row 13] |
| 26420 | movu m6, [r5 + 10 * 16] |
| 26421 | pmaddubsw m4, m0, m6 |
| 26422 | pmulhrsw m4, m7 |
| 26423 | pmaddubsw m5, m2, m6 |
| 26424 | pmulhrsw m5, m7 |
| 26425 | packuswb m4, m5 |
| 26426 | movu [r0 + 1306 * 16], m4 |
| 26427 | pmaddubsw m4, m1, m6 |
| 26428 | pmulhrsw m4, m7 |
| 26429 | pmaddubsw m5, m3, m6 |
| 26430 | pmulhrsw m5, m7 |
| 26431 | packuswb m4, m5 |
| 26432 | movu [r0 + 1307 * 16], m4 |
| 26433 | |
| 26434 | ; mode 22 [row 14] |
| 26435 | movu m6, [r5 + 29 * 16] |
| 26436 | pslldq m0, 2 |
| 26437 | pinsrb m0, [r4 + 12], 1 |
| 26438 | pinsrb m0, [r4 + 15], 0 |
| 26439 | pmaddubsw m4, m0, m6 |
| 26440 | pmulhrsw m4, m7 |
| 26441 | pslldq m2, 2 |
| 26442 | pinsrw m2, [r3 + 2], 0 |
| 26443 | pmaddubsw m5, m2, m6 |
| 26444 | pmulhrsw m5, m7 |
| 26445 | packuswb m4, m5 |
| 26446 | movu [r0 + 1308 * 16], m4 |
| 26447 | pslldq m1, 2 |
| 26448 | pinsrw m1, [r3 + 10], 0 |
| 26449 | pmaddubsw m4, m1, m6 |
| 26450 | pmulhrsw m4, m7 |
| 26451 | pslldq m3, 2 |
| 26452 | pinsrw m3, [r3 + 18], 0 |
| 26453 | pmaddubsw m5, m3, m6 |
| 26454 | pmulhrsw m5, m7 |
| 26455 | packuswb m4, m5 |
| 26456 | movu [r0 + 1309 * 16], m4 |
| 26457 | |
| 26458 | ; mode 22 [row 15] |
| 26459 | movu m6, [r5 + 16 * 16] |
| 26460 | pmaddubsw m4, m0, m6 |
| 26461 | pmulhrsw m4, m7 |
| 26462 | pmaddubsw m5, m2, m6 |
| 26463 | pmulhrsw m5, m7 |
| 26464 | packuswb m4, m5 |
| 26465 | movu [r0 + 1310 * 16], m4 |
| 26466 | pmaddubsw m4, m1, m6 |
| 26467 | pmulhrsw m4, m7 |
| 26468 | pmaddubsw m5, m3, m6 |
| 26469 | pmulhrsw m5, m7 |
| 26470 | packuswb m4, m5 |
| 26471 | movu [r0 + 1311 * 16], m4 |
| 26472 | |
| 26473 | ; mode 22 [row 16] |
| 26474 | movu m6, [r5 + 3 * 16] |
| 26475 | pmaddubsw m4, m0, m6 |
| 26476 | pmulhrsw m4, m7 |
| 26477 | pmaddubsw m5, m2, m6 |
| 26478 | pmulhrsw m5, m7 |
| 26479 | packuswb m4, m5 |
| 26480 | movu [r0 + 1312 * 16], m4 |
| 26481 | pmaddubsw m4, m1, m6 |
| 26482 | pmulhrsw m4, m7 |
| 26483 | pmaddubsw m5, m3, m6 |
| 26484 | pmulhrsw m5, m7 |
| 26485 | packuswb m4, m5 |
| 26486 | movu [r0 + 1313 * 16], m4 |
| 26487 | |
| 26488 | ; mode 22 [row 17] |
| 26489 | movu m6, [r5 + 22 * 16] |
| 26490 | pslldq m0, 2 |
| 26491 | pinsrb m0, [r4 + 15], 1 |
| 26492 | pinsrb m0, [r4 + 17], 0 |
| 26493 | pmaddubsw m4, m0, m6 |
| 26494 | pmulhrsw m4, m7 |
| 26495 | pslldq m2, 2 |
| 26496 | pinsrw m2, [r3 + 1], 0 |
| 26497 | pmaddubsw m5, m2, m6 |
| 26498 | pmulhrsw m5, m7 |
| 26499 | packuswb m4, m5 |
| 26500 | movu [r0 + 1314 * 16], m4 |
| 26501 | pslldq m1, 2 |
| 26502 | pinsrw m1, [r3 + 9], 0 |
| 26503 | pmaddubsw m4, m1, m6 |
| 26504 | pmulhrsw m4, m7 |
| 26505 | pslldq m3, 2 |
| 26506 | pinsrw m3, [r3 + 17], 0 |
| 26507 | pmaddubsw m5, m3, m6 |
| 26508 | pmulhrsw m5, m7 |
| 26509 | packuswb m4, m5 |
| 26510 | movu [r0 + 1315 * 16], m4 |
| 26511 | |
| 26512 | ; mode 22 [row 18] |
| 26513 | movu m6, [r5 + 9 * 16] |
| 26514 | pmaddubsw m4, m0, m6 |
| 26515 | pmulhrsw m4, m7 |
| 26516 | pmaddubsw m5, m2, m6 |
| 26517 | pmulhrsw m5, m7 |
| 26518 | packuswb m4, m5 |
| 26519 | movu [r0 + 1316 * 16], m4 |
| 26520 | pmaddubsw m4, m1, m6 |
| 26521 | pmulhrsw m4, m7 |
| 26522 | pmaddubsw m5, m3, m6 |
| 26523 | pmulhrsw m5, m7 |
| 26524 | packuswb m4, m5 |
| 26525 | movu [r0 + 1317 * 16], m4 |
| 26526 | |
| 26527 | ; mode 22 [row 19] |
| 26528 | movu m6, [r5 + 28 * 16] |
| 26529 | pslldq m0, 2 |
| 26530 | pinsrb m0, [r4 + 17], 1 |
| 26531 | pinsrb m0, [r4 + 20], 0 |
| 26532 | pmaddubsw m4, m0, m6 |
| 26533 | pmulhrsw m4, m7 |
| 26534 | pslldq m2, 2 |
| 26535 | pinsrw m2, [r3 + 0], 0 |
| 26536 | pmaddubsw m5, m2, m6 |
| 26537 | pmulhrsw m5, m7 |
| 26538 | packuswb m4, m5 |
| 26539 | movu [r0 + 1318 * 16], m4 |
| 26540 | pslldq m1, 2 |
| 26541 | pinsrw m1, [r3 + 8], 0 |
| 26542 | pmaddubsw m4, m1, m6 |
| 26543 | pmulhrsw m4, m7 |
| 26544 | pslldq m3, 2 |
| 26545 | pinsrw m3, [r3 + 16], 0 |
| 26546 | pmaddubsw m5, m3, m6 |
| 26547 | pmulhrsw m5, m7 |
| 26548 | packuswb m4, m5 |
| 26549 | movu [r0 + 1319 * 16], m4 |
| 26550 | |
| 26551 | ; mode 22 [row 20] |
| 26552 | movu m6, [r5 + 15 * 16] |
| 26553 | pmaddubsw m4, m0, m6 |
| 26554 | pmulhrsw m4, m7 |
| 26555 | pmaddubsw m5, m2, m6 |
| 26556 | pmulhrsw m5, m7 |
| 26557 | packuswb m4, m5 |
| 26558 | movu [r0 + 1320 * 16], m4 |
| 26559 | pmaddubsw m4, m1, m6 |
| 26560 | pmulhrsw m4, m7 |
| 26561 | pmaddubsw m5, m3, m6 |
| 26562 | pmulhrsw m5, m7 |
| 26563 | packuswb m4, m5 |
| 26564 | movu [r0 + 1321 * 16], m4 |
| 26565 | |
| 26566 | ; mode 22 [row 21] |
| 26567 | movu m6, [r5 + 2 * 16] |
| 26568 | pmaddubsw m4, m0, m6 |
| 26569 | pmulhrsw m4, m7 |
| 26570 | pmaddubsw m5, m2, m6 |
| 26571 | pmulhrsw m5, m7 |
| 26572 | packuswb m4, m5 |
| 26573 | movu [r0 + 1322 * 16], m4 |
| 26574 | pmaddubsw m4, m1, m6 |
| 26575 | pmulhrsw m4, m7 |
| 26576 | pmaddubsw m5, m3, m6 |
| 26577 | pmulhrsw m5, m7 |
| 26578 | packuswb m4, m5 |
| 26579 | movu [r0 + 1323 * 16], m4 |
| 26580 | |
| 26581 | ; mode 22 [row 22] |
| 26582 | movu m6, [r5 + 21 * 16] |
| 26583 | pslldq m0, 2 |
| 26584 | pinsrb m0, [r4 + 20], 1 |
| 26585 | pinsrb m0, [r4 + 22], 0 |
| 26586 | pmaddubsw m4, m0, m6 |
| 26587 | pmulhrsw m4, m7 |
| 26588 | pslldq m2, 2 |
| 26589 | pinsrb m2, [r4 + 0], 1 |
| 26590 | pinsrb m2, [r4 + 2], 0 |
| 26591 | pmaddubsw m5, m2, m6 |
| 26592 | pmulhrsw m5, m7 |
| 26593 | packuswb m4, m5 |
| 26594 | movu [r0 + 1324 * 16], m4 |
| 26595 | pslldq m1, 2 |
| 26596 | pinsrw m1, [r3 + 7], 0 |
| 26597 | pmaddubsw m4, m1, m6 |
| 26598 | pmulhrsw m4, m7 |
| 26599 | pslldq m3, 2 |
| 26600 | pinsrw m3, [r3 + 15], 0 |
| 26601 | pmaddubsw m5, m3, m6 |
| 26602 | pmulhrsw m5, m7 |
| 26603 | packuswb m4, m5 |
| 26604 | movu [r0 + 1325 * 16], m4 |
| 26605 | |
| 26606 | ; mode 22 [row 23] |
| 26607 | movu m6, [r5 + 8 * 16] |
| 26608 | pmaddubsw m4, m0, m6 |
| 26609 | pmulhrsw m4, m7 |
| 26610 | pmaddubsw m5, m2, m6 |
| 26611 | pmulhrsw m5, m7 |
| 26612 | packuswb m4, m5 |
| 26613 | movu [r0 + 1326 * 16], m4 |
| 26614 | pmaddubsw m4, m1, m6 |
| 26615 | pmulhrsw m4, m7 |
| 26616 | pmaddubsw m5, m3, m6 |
| 26617 | pmulhrsw m5, m7 |
| 26618 | packuswb m4, m5 |
| 26619 | movu [r0 + 1327 * 16], m4 |
| 26620 | |
| 26621 | ; mode 22 [row 24] |
| 26622 | movu m6, [r5 + 27 * 16] |
| 26623 | pslldq m0, 2 |
| 26624 | pinsrb m0, [r4 + 22], 1 |
| 26625 | pinsrb m0, [r4 + 25], 0 |
| 26626 | pmaddubsw m4, m0, m6 |
| 26627 | pmulhrsw m4, m7 |
| 26628 | pslldq m2, 2 |
| 26629 | pinsrb m2, [r4 + 2], 1 |
| 26630 | pinsrb m2, [r4 + 5], 0 |
| 26631 | pmaddubsw m5, m2, m6 |
| 26632 | pmulhrsw m5, m7 |
| 26633 | packuswb m4, m5 |
| 26634 | movu [r0 + 1328 * 16], m4 |
| 26635 | pslldq m1, 2 |
| 26636 | pinsrw m1, [r3 + 6], 0 |
| 26637 | pmaddubsw m4, m1, m6 |
| 26638 | pmulhrsw m4, m7 |
| 26639 | pslldq m3, 2 |
| 26640 | pinsrw m3, [r3 + 14], 0 |
| 26641 | pmaddubsw m5, m3, m6 |
| 26642 | pmulhrsw m5, m7 |
| 26643 | packuswb m4, m5 |
| 26644 | movu [r0 + 1329 * 16], m4 |
| 26645 | |
| 26646 | ; mode 22 [row 25] |
| 26647 | movu m6, [r5 + 14 * 16] |
| 26648 | pmaddubsw m4, m0, m6 |
| 26649 | pmulhrsw m4, m7 |
| 26650 | pmaddubsw m5, m2, m6 |
| 26651 | pmulhrsw m5, m7 |
| 26652 | packuswb m4, m5 |
| 26653 | movu [r0 + 1330 * 16], m4 |
| 26654 | pmaddubsw m4, m1, m6 |
| 26655 | pmulhrsw m4, m7 |
| 26656 | pmaddubsw m5, m3, m6 |
| 26657 | pmulhrsw m5, m7 |
| 26658 | packuswb m4, m5 |
| 26659 | movu [r0 + 1331 * 16], m4 |
| 26660 | |
| 26661 | ; mode 22 [row 26] |
| 26662 | movu m6, [r5 + 1 * 16] |
| 26663 | pmaddubsw m4, m0, m6 |
| 26664 | pmulhrsw m4, m7 |
| 26665 | pmaddubsw m5, m2, m6 |
| 26666 | pmulhrsw m5, m7 |
| 26667 | packuswb m4, m5 |
| 26668 | movu [r0 + 1332 * 16], m4 |
| 26669 | pmaddubsw m4, m1, m6 |
| 26670 | pmulhrsw m4, m7 |
| 26671 | pmaddubsw m5, m3, m6 |
| 26672 | pmulhrsw m5, m7 |
| 26673 | packuswb m4, m5 |
| 26674 | movu [r0 + 1333 * 16], m4 |
| 26675 | |
| 26676 | ; mode 22 [row 27] |
| 26677 | movu m6, [r5 + 20 * 16] |
| 26678 | pslldq m0, 2 |
| 26679 | pinsrb m0, [r4 + 25], 1 |
| 26680 | pinsrb m0, [r4 + 27], 0 |
| 26681 | pmaddubsw m4, m0, m6 |
| 26682 | pmulhrsw m4, m7 |
| 26683 | pslldq m2, 2 |
| 26684 | pinsrb m2, [r4 + 5], 1 |
| 26685 | pinsrb m2, [r4 + 7], 0 |
| 26686 | pmaddubsw m5, m2, m6 |
| 26687 | pmulhrsw m5, m7 |
| 26688 | packuswb m4, m5 |
| 26689 | movu [r0 + 1334 * 16], m4 |
| 26690 | pslldq m1, 2 |
| 26691 | pinsrw m1, [r3 + 5], 0 |
| 26692 | pmaddubsw m4, m1, m6 |
| 26693 | pmulhrsw m4, m7 |
| 26694 | pslldq m3, 2 |
| 26695 | pinsrw m3, [r3 + 13], 0 |
| 26696 | pmaddubsw m5, m3, m6 |
| 26697 | pmulhrsw m5, m7 |
| 26698 | packuswb m4, m5 |
| 26699 | movu [r0 + 1335 * 16], m4 |
| 26700 | |
| 26701 | ; mode 22 [row 28] |
| 26702 | movu m6, [r5 + 7 * 16] |
| 26703 | pmaddubsw m4, m0, m6 |
| 26704 | pmulhrsw m4, m7 |
| 26705 | pmaddubsw m5, m2, m6 |
| 26706 | pmulhrsw m5, m7 |
| 26707 | packuswb m4, m5 |
| 26708 | movu [r0 + 1336 * 16], m4 |
| 26709 | pmaddubsw m4, m1, m6 |
| 26710 | pmulhrsw m4, m7 |
| 26711 | pmaddubsw m5, m3, m6 |
| 26712 | pmulhrsw m5, m7 |
| 26713 | packuswb m4, m5 |
| 26714 | movu [r0 + 1337 * 16], m4 |
| 26715 | |
| 26716 | ; mode 22 [row 29] |
| 26717 | movu m6, [r5 + 26 * 16] |
| 26718 | pslldq m0, 2 |
| 26719 | pinsrb m0, [r4 + 27], 1 |
| 26720 | pinsrb m0, [r4 + 30], 0 |
| 26721 | pmaddubsw m4, m0, m6 |
| 26722 | pmulhrsw m4, m7 |
| 26723 | pslldq m2, 2 |
| 26724 | pinsrb m2, [r4 + 7], 1 |
| 26725 | pinsrb m2, [r4 + 10], 0 |
| 26726 | pmaddubsw m5, m2, m6 |
| 26727 | pmulhrsw m5, m7 |
| 26728 | packuswb m4, m5 |
| 26729 | movu [r0 + 1338 * 16], m4 |
| 26730 | pslldq m1, 2 |
| 26731 | pinsrw m1, [r3 + 4], 0 |
| 26732 | pmaddubsw m4, m1, m6 |
| 26733 | pmulhrsw m4, m7 |
| 26734 | pslldq m3, 2 |
| 26735 | pinsrw m3, [r3 + 12], 0 |
| 26736 | pmaddubsw m5, m3, m6 |
| 26737 | pmulhrsw m5, m7 |
| 26738 | packuswb m4, m5 |
| 26739 | movu [r0 + 1339 * 16], m4 |
| 26740 | |
| 26741 | ; mode 22 [row 30] |
| 26742 | movu m6, [r5 + 13 * 16] |
| 26743 | pmaddubsw m4, m0, m6 |
| 26744 | pmulhrsw m4, m7 |
| 26745 | pmaddubsw m5, m2, m6 |
| 26746 | pmulhrsw m5, m7 |
| 26747 | packuswb m4, m5 |
| 26748 | movu [r0 + 1340 * 16], m4 |
| 26749 | pmaddubsw m4, m1, m6 |
| 26750 | pmulhrsw m4, m7 |
| 26751 | pmaddubsw m5, m3, m6 |
| 26752 | pmulhrsw m5, m7 |
| 26753 | packuswb m4, m5 |
| 26754 | movu [r0 + 1341 * 16], m4 |
| 26755 | |
| 26756 | ; mode 22 [row 31]: no weighting pass, each register is shuffled through tab_S2 and stored with movh |
| 26757 | pshufb m5, m0, [tab_S2] |
| 26758 | movh [r0 + 1342 * 16], m5 |
| 26759 | pshufb m5, m2, [tab_S2] |
| 26760 | movh [r0 + 1342 * 16 + 8], m5 |
| 26761 | pshufb m5, m1, [tab_S2] |
| 26762 | movh [r0 + 1343 * 16], m5 |
| 26763 | pshufb m5, m3, [tab_S2] |
| 26764 | movh [r0 + 1343 * 16 + 8], m5 |
| 26765 | |
| 26766 | ; mode 23 [row 0] |
| 26767 | movu m6, [r5 + 23 * 16] |
| 26768 | movu m0, [r3 ] |
| 26769 | movu m1, [r3 + 1 ] |
| 26770 | punpcklbw m0, m1 |
| 26771 | pmaddubsw m1, m0, m6 |
| 26772 | pmulhrsw m1, m7 |
| 26773 | movu m2, [r3 + 8] |
| 26774 | movu m3, [r3 + 9] |
| 26775 | punpcklbw m2, m3 |
| 26776 | pmaddubsw m3, m2, m6 |
| 26777 | pmulhrsw m3, m7 |
| 26778 | packuswb m1, m3 |
| 26779 | movu [r0 + 1344 * 16], m1 |
| 26780 | |
| 26781 | movu m1, [r3 + 16] |
| 26782 | movu m3, [r3 + 17] |
| 26783 | punpcklbw m1, m3 |
| 26784 | pmaddubsw m4, m1, m6 |
| 26785 | pmulhrsw m4, m7 |
| 26786 | movu m3, [r3 + 24] |
| 26787 | movu m5, [r3 + 25] |
| 26788 | punpcklbw m3, m5 |
| 26789 | pmaddubsw m5, m3, m6 |
| 26790 | pmulhrsw m5, m7 |
| 26791 | packuswb m4, m5 |
| 26792 | movu [r0 + 1345 * 16], m4 |
| 26793 | |
| 26794 | ; mode 23 [row 1] |
| 26795 | movu m6, [r5 + 14 * 16] |
| 26796 | pmaddubsw m4, m0, m6 |
| 26797 | pmulhrsw m4, m7 |
| 26798 | pmaddubsw m5, m2, m6 |
| 26799 | pmulhrsw m5, m7 |
| 26800 | packuswb m4, m5 |
| 26801 | movu [r0 + 1346 * 16], m4 |
| 26802 | pmaddubsw m4, m1, m6 |
| 26803 | pmulhrsw m4, m7 |
| 26804 | pmaddubsw m5, m3, m6 |
| 26805 | pmulhrsw m5, m7 |
| 26806 | packuswb m4, m5 |
| 26807 | movu [r0 + 1347 * 16], m4 |
| 26808 | |
| 26809 | ; mode 23 [row 2] |
| 26810 | movu m6, [r5 + 5 * 16] |
| 26811 | pmaddubsw m4, m0, m6 |
| 26812 | pmulhrsw m4, m7 |
| 26813 | pmaddubsw m5, m2, m6 |
| 26814 | pmulhrsw m5, m7 |
| 26815 | packuswb m4, m5 |
| 26816 | movu [r0 + 1348 * 16], m4 |
| 26817 | pmaddubsw m4, m1, m6 |
| 26818 | pmulhrsw m4, m7 |
| 26819 | pmaddubsw m5, m3, m6 |
| 26820 | pmulhrsw m5, m7 |
| 26821 | packuswb m4, m5 |
| 26822 | movu [r0 + 1349 * 16], m4 |
| 26823 | |
| 26824 | ; mode 23 [row 3] |
| 26825 | movu m6, [r5 + 28 * 16] |
| 26826 | pslldq m0, 2 |
| 26827 | pinsrb m0, [r4 + 0], 1 |
| 26828 | pinsrb m0, [r4 + 4], 0 |
| 26829 | pmaddubsw m4, m0, m6 |
| 26830 | pmulhrsw m4, m7 |
| 26831 | pslldq m2, 2 |
| 26832 | pinsrw m2, [r3 + 7], 0 |
| 26833 | pmaddubsw m5, m2, m6 |
| 26834 | pmulhrsw m5, m7 |
| 26835 | packuswb m4, m5 |
| 26836 | movu [r0 + 1350 * 16], m4 |
| 26837 | pslldq m1, 2 |
| 26838 | pinsrw m1, [r3 + 15], 0 |
| 26839 | pmaddubsw m4, m1, m6 |
| 26840 | pmulhrsw m4, m7 |
| 26841 | pslldq m3, 2 |
| 26842 | pinsrw m3, [r3 + 23], 0 |
| 26843 | pmaddubsw m5, m3, m6 |
| 26844 | pmulhrsw m5, m7 |
| 26845 | packuswb m4, m5 |
| 26846 | movu [r0 + 1351 * 16], m4 |
| 26847 | |
| 26848 | ; mode 23 [row 4] |
| 26849 | movu m6, [r5 + 19 * 16] |
| 26850 | pmaddubsw m4, m0, m6 |
| 26851 | pmulhrsw m4, m7 |
| 26852 | pmaddubsw m5, m2, m6 |
| 26853 | pmulhrsw m5, m7 |
| 26854 | packuswb m4, m5 |
| 26855 | movu [r0 + 1352 * 16], m4 |
| 26856 | pmaddubsw m4, m1, m6 |
| 26857 | pmulhrsw m4, m7 |
| 26858 | pmaddubsw m5, m3, m6 |
| 26859 | pmulhrsw m5, m7 |
| 26860 | packuswb m4, m5 |
| 26861 | movu [r0 + 1353 * 16], m4 |
| 26862 | |
| 26863 | ; mode 23 [row 5] |
| 26864 | movu m6, [r5 + 10 * 16] |
| 26865 | pmaddubsw m4, m0, m6 |
| 26866 | pmulhrsw m4, m7 |
| 26867 | pmaddubsw m5, m2, m6 |
| 26868 | pmulhrsw m5, m7 |
| 26869 | packuswb m4, m5 |
| 26870 | movu [r0 + 1354 * 16], m4 |
| 26871 | pmaddubsw m4, m1, m6 |
| 26872 | pmulhrsw m4, m7 |
| 26873 | pmaddubsw m5, m3, m6 |
| 26874 | pmulhrsw m5, m7 |
| 26875 | packuswb m4, m5 |
| 26876 | movu [r0 + 1355 * 16], m4 |
| 26877 | |
| 26878 | ; mode 23 [row 6] |
| 26879 | movu m6, [r5 + 1 * 16] |
| 26880 | pmaddubsw m4, m0, m6 |
| 26881 | pmulhrsw m4, m7 |
| 26882 | pmaddubsw m5, m2, m6 |
| 26883 | pmulhrsw m5, m7 |
| 26884 | packuswb m4, m5 |
| 26885 | movu [r0 + 1356 * 16], m4 |
| 26886 | pmaddubsw m4, m1, m6 |
| 26887 | pmulhrsw m4, m7 |
| 26888 | pmaddubsw m5, m3, m6 |
| 26889 | pmulhrsw m5, m7 |
| 26890 | packuswb m4, m5 |
| 26891 | movu [r0 + 1357 * 16], m4 |
| 26892 | |
| 26893 | ; mode 23 [row 7] |
| 26894 | movu m6, [r5 + 24 * 16] |
| 26895 | pslldq m0, 2 |
| 26896 | pinsrb m0, [r4 + 4], 1 |
| 26897 | pinsrb m0, [r4 + 7], 0 |
| 26898 | pmaddubsw m4, m0, m6 |
| 26899 | pmulhrsw m4, m7 |
| 26900 | pslldq m2, 2 |
| 26901 | pinsrw m2, [r3 + 6], 0 |
| 26902 | pmaddubsw m5, m2, m6 |
| 26903 | pmulhrsw m5, m7 |
| 26904 | packuswb m4, m5 |
| 26905 | movu [r0 + 1358 * 16], m4 |
| 26906 | pslldq m1, 2 |
| 26907 | pinsrw m1, [r3 + 14], 0 |
| 26908 | pmaddubsw m4, m1, m6 |
| 26909 | pmulhrsw m4, m7 |
| 26910 | pslldq m3, 2 |
| 26911 | pinsrw m3, [r3 + 22], 0 |
| 26912 | pmaddubsw m5, m3, m6 |
| 26913 | pmulhrsw m5, m7 |
| 26914 | packuswb m4, m5 |
| 26915 | movu [r0 + 1359 * 16], m4 |
| 26916 | |
| 26917 | ; mode 23 [row 8] |
| 26918 | movu m6, [r5 + 15 * 16] |
| 26919 | pmaddubsw m4, m0, m6 |
| 26920 | pmulhrsw m4, m7 |
| 26921 | pmaddubsw m5, m2, m6 |
| 26922 | pmulhrsw m5, m7 |
| 26923 | packuswb m4, m5 |
| 26924 | movu [r0 + 1360 * 16], m4 |
| 26925 | pmaddubsw m4, m1, m6 |
| 26926 | pmulhrsw m4, m7 |
| 26927 | pmaddubsw m5, m3, m6 |
| 26928 | pmulhrsw m5, m7 |
| 26929 | packuswb m4, m5 |
| 26930 | movu [r0 + 1361 * 16], m4 |
| 26931 | |
| 26932 | ; mode 23 [row 9] |
| 26933 | movu m6, [r5 + 6 * 16] |
| 26934 | pmaddubsw m4, m0, m6 |
| 26935 | pmulhrsw m4, m7 |
| 26936 | pmaddubsw m5, m2, m6 |
| 26937 | pmulhrsw m5, m7 |
| 26938 | packuswb m4, m5 |
| 26939 | movu [r0 + 1362 * 16], m4 |
| 26940 | pmaddubsw m4, m1, m6 |
| 26941 | pmulhrsw m4, m7 |
| 26942 | pmaddubsw m5, m3, m6 |
| 26943 | pmulhrsw m5, m7 |
| 26944 | packuswb m4, m5 |
| 26945 | movu [r0 + 1363 * 16], m4 |
| 26946 | |
| 26947 | ; mode 23 [row 10] |
| 26948 | movu m6, [r5 + 29 * 16] |
| 26949 | pslldq m0, 2 |
| 26950 | pinsrb m0, [r4 + 7], 1 |
| 26951 | pinsrb m0, [r4 + 11], 0 |
| 26952 | pmaddubsw m4, m0, m6 |
| 26953 | pmulhrsw m4, m7 |
| 26954 | pslldq m2, 2 |
| 26955 | pinsrw m2, [r3 + 5], 0 |
| 26956 | pmaddubsw m5, m2, m6 |
| 26957 | pmulhrsw m5, m7 |
| 26958 | packuswb m4, m5 |
| 26959 | movu [r0 + 1364 * 16], m4 |
| 26960 | pslldq m1, 2 |
| 26961 | pinsrw m1, [r3 + 13], 0 |
| 26962 | pmaddubsw m4, m1, m6 |
| 26963 | pmulhrsw m4, m7 |
| 26964 | pslldq m3, 2 |
| 26965 | pinsrw m3, [r3 + 21], 0 |
| 26966 | pmaddubsw m5, m3, m6 |
| 26967 | pmulhrsw m5, m7 |
| 26968 | packuswb m4, m5 |
| 26969 | movu [r0 + 1365 * 16], m4 |
| 26970 | |
| 26971 | ; mode 23 [row 11] |
| 26972 | movu m6, [r5 + 20 * 16] |
| 26973 | pmaddubsw m4, m0, m6 |
| 26974 | pmulhrsw m4, m7 |
| 26975 | pmaddubsw m5, m2, m6 |
| 26976 | pmulhrsw m5, m7 |
| 26977 | packuswb m4, m5 |
| 26978 | movu [r0 + 1366 * 16], m4 |
| 26979 | pmaddubsw m4, m1, m6 |
| 26980 | pmulhrsw m4, m7 |
| 26981 | pmaddubsw m5, m3, m6 |
| 26982 | pmulhrsw m5, m7 |
| 26983 | packuswb m4, m5 |
| 26984 | movu [r0 + 1367 * 16], m4 |
| 26985 | |
| 26986 | ; mode 23 [row 12] |
| 26987 | movu m6, [r5 + 11 * 16] |
| 26988 | pmaddubsw m4, m0, m6 |
| 26989 | pmulhrsw m4, m7 |
| 26990 | pmaddubsw m5, m2, m6 |
| 26991 | pmulhrsw m5, m7 |
| 26992 | packuswb m4, m5 |
| 26993 | movu [r0 + 1368 * 16], m4 |
| 26994 | pmaddubsw m4, m1, m6 |
| 26995 | pmulhrsw m4, m7 |
| 26996 | pmaddubsw m5, m3, m6 |
| 26997 | pmulhrsw m5, m7 |
| 26998 | packuswb m4, m5 |
| 26999 | movu [r0 + 1369 * 16], m4 |
| 27000 | |
| 27001 | ; mode 23 [row 13] |
| 27002 | movu m6, [r5 + 2 * 16] |
| 27003 | pmaddubsw m4, m0, m6 |
| 27004 | pmulhrsw m4, m7 |
| 27005 | pmaddubsw m5, m2, m6 |
| 27006 | pmulhrsw m5, m7 |
| 27007 | packuswb m4, m5 |
| 27008 | movu [r0 + 1370 * 16], m4 |
| 27009 | pmaddubsw m4, m1, m6 |
| 27010 | pmulhrsw m4, m7 |
| 27011 | pmaddubsw m5, m3, m6 |
| 27012 | pmulhrsw m5, m7 |
| 27013 | packuswb m4, m5 |
| 27014 | movu [r0 + 1371 * 16], m4 |
| 27015 | |
| 27016 | ; mode 23 [row 14] |
| 27017 | movu m6, [r5 + 25 * 16] |
| 27018 | pslldq m0, 2 |
| 27019 | pinsrb m0, [r4 + 11], 1 |
| 27020 | pinsrb m0, [r4 + 14], 0 |
| 27021 | pmaddubsw m4, m0, m6 |
| 27022 | pmulhrsw m4, m7 |
| 27023 | pslldq m2, 2 |
| 27024 | pinsrw m2, [r3 + 4], 0 |
| 27025 | pmaddubsw m5, m2, m6 |
| 27026 | pmulhrsw m5, m7 |
| 27027 | packuswb m4, m5 |
| 27028 | movu [r0 + 1372 * 16], m4 |
| 27029 | pslldq m1, 2 |
| 27030 | pinsrw m1, [r3 + 12], 0 |
| 27031 | pmaddubsw m4, m1, m6 |
| 27032 | pmulhrsw m4, m7 |
| 27033 | pslldq m3, 2 |
| 27034 | pinsrw m3, [r3 + 20], 0 |
| 27035 | pmaddubsw m5, m3, m6 |
| 27036 | pmulhrsw m5, m7 |
| 27037 | packuswb m4, m5 |
| 27038 | movu [r0 + 1373 * 16], m4 |
| 27039 | |
| 27040 | ; mode 23 [row 15] |
| 27041 | movu m6, [r5 + 16 * 16] |
| 27042 | pmaddubsw m4, m0, m6 |
| 27043 | pmulhrsw m4, m7 |
| 27044 | pmaddubsw m5, m2, m6 |
| 27045 | pmulhrsw m5, m7 |
| 27046 | packuswb m4, m5 |
| 27047 | movu [r0 + 1374 * 16], m4 |
| 27048 | pmaddubsw m4, m1, m6 |
| 27049 | pmulhrsw m4, m7 |
| 27050 | pmaddubsw m5, m3, m6 |
| 27051 | pmulhrsw m5, m7 |
| 27052 | packuswb m4, m5 |
| 27053 | movu [r0 + 1375 * 16], m4 |
| 27054 | |
| 27055 | ; mode 23 [row 16] |
| 27056 | movu m6, [r5 + 7 * 16] |
| 27057 | pmaddubsw m4, m0, m6 |
| 27058 | pmulhrsw m4, m7 |
| 27059 | pmaddubsw m5, m2, m6 |
| 27060 | pmulhrsw m5, m7 |
| 27061 | packuswb m4, m5 |
| 27062 | movu [r0 + 1376 * 16], m4 |
| 27063 | pmaddubsw m4, m1, m6 |
| 27064 | pmulhrsw m4, m7 |
| 27065 | pmaddubsw m5, m3, m6 |
| 27066 | pmulhrsw m5, m7 |
| 27067 | packuswb m4, m5 |
| 27068 | movu [r0 + 1377 * 16], m4 |
| 27069 | |
| 27070 | ; mode 23 [row 17] |
| 27071 | movu m6, [r5 + 30 * 16] |
| 27072 | pslldq m0, 2 |
| 27073 | pinsrb m0, [r4 + 14], 1 |
| 27074 | pinsrb m0, [r4 + 18], 0 |
| 27075 | pmaddubsw m4, m0, m6 |
| 27076 | pmulhrsw m4, m7 |
| 27077 | pslldq m2, 2 |
| 27078 | pinsrw m2, [r3 + 3], 0 |
| 27079 | pmaddubsw m5, m2, m6 |
| 27080 | pmulhrsw m5, m7 |
| 27081 | packuswb m4, m5 |
| 27082 | movu [r0 + 1378 * 16], m4 |
| 27083 | pslldq m1, 2 |
| 27084 | pinsrw m1, [r3 + 11], 0 |
| 27085 | pmaddubsw m4, m1, m6 |
| 27086 | pmulhrsw m4, m7 |
| 27087 | pslldq m3, 2 |
| 27088 | pinsrw m3, [r3 + 19], 0 |
| 27089 | pmaddubsw m5, m3, m6 |
| 27090 | pmulhrsw m5, m7 |
| 27091 | packuswb m4, m5 |
| 27092 | movu [r0 + 1379 * 16], m4 |
| 27093 | |
| 27094 | ; mode 23 [row 18] |
| 27095 | movu m6, [r5 + 21 * 16] |
| 27096 | pmaddubsw m4, m0, m6 |
| 27097 | pmulhrsw m4, m7 |
| 27098 | pmaddubsw m5, m2, m6 |
| 27099 | pmulhrsw m5, m7 |
| 27100 | packuswb m4, m5 |
| 27101 | movu [r0 + 1380 * 16], m4 |
| 27102 | pmaddubsw m4, m1, m6 |
| 27103 | pmulhrsw m4, m7 |
| 27104 | pmaddubsw m5, m3, m6 |
| 27105 | pmulhrsw m5, m7 |
| 27106 | packuswb m4, m5 |
| 27107 | movu [r0 + 1381 * 16], m4 |
| 27108 | |
| 27109 | ; mode 23 [row 19] |
| 27110 | movu m6, [r5 + 12 * 16] |
| 27111 | pmaddubsw m4, m0, m6 |
| 27112 | pmulhrsw m4, m7 |
| 27113 | pmaddubsw m5, m2, m6 |
| 27114 | pmulhrsw m5, m7 |
| 27115 | packuswb m4, m5 |
| 27116 | movu [r0 + 1382 * 16], m4 |
| 27117 | pmaddubsw m4, m1, m6 |
| 27118 | pmulhrsw m4, m7 |
| 27119 | pmaddubsw m5, m3, m6 |
| 27120 | pmulhrsw m5, m7 |
| 27121 | packuswb m4, m5 |
| 27122 | movu [r0 + 1383 * 16], m4 |
| 27123 | |
| 27124 | ; mode 23 [row 20] |
| 27125 | movu m6, [r5 + 3 * 16] |
| 27126 | pmaddubsw m4, m0, m6 |
| 27127 | pmulhrsw m4, m7 |
| 27128 | pmaddubsw m5, m2, m6 |
| 27129 | pmulhrsw m5, m7 |
| 27130 | packuswb m4, m5 |
| 27131 | movu [r0 + 1384 * 16], m4 |
| 27132 | pmaddubsw m4, m1, m6 |
| 27133 | pmulhrsw m4, m7 |
| 27134 | pmaddubsw m5, m3, m6 |
| 27135 | pmulhrsw m5, m7 |
| 27136 | packuswb m4, m5 |
| 27137 | movu [r0 + 1385 * 16], m4 |
| 27138 | |
| 27139 | ; mode 23 [row 21] |
| 27140 | movu m6, [r5 + 26 * 16] |
| 27141 | pslldq m0, 2 |
| 27142 | pinsrb m0, [r4 + 18], 1 |
| 27143 | pinsrb m0, [r4 + 21], 0 |
| 27144 | pmaddubsw m4, m0, m6 |
| 27145 | pmulhrsw m4, m7 |
| 27146 | pslldq m2, 2 |
| 27147 | pinsrw m2, [r3 + 2], 0 |
| 27148 | pmaddubsw m5, m2, m6 |
| 27149 | pmulhrsw m5, m7 |
| 27150 | packuswb m4, m5 |
| 27151 | movu [r0 + 1386 * 16], m4 |
| 27152 | pslldq m1, 2 |
| 27153 | pinsrw m1, [r3 + 10], 0 |
| 27154 | pmaddubsw m4, m1, m6 |
| 27155 | pmulhrsw m4, m7 |
| 27156 | pslldq m3, 2 |
| 27157 | pinsrw m3, [r3 + 18], 0 |
| 27158 | pmaddubsw m5, m3, m6 |
| 27159 | pmulhrsw m5, m7 |
| 27160 | packuswb m4, m5 |
| 27161 | movu [r0 + 1387 * 16], m4 |
| 27162 | |
| 27163 | ; mode 23 [row 22] |
| 27164 | movu m6, [r5 + 17 * 16] |
| 27165 | pmaddubsw m4, m0, m6 |
| 27166 | pmulhrsw m4, m7 |
| 27167 | pmaddubsw m5, m2, m6 |
| 27168 | pmulhrsw m5, m7 |
| 27169 | packuswb m4, m5 |
| 27170 | movu [r0 + 1388 * 16], m4 |
| 27171 | pmaddubsw m4, m1, m6 |
| 27172 | pmulhrsw m4, m7 |
| 27173 | pmaddubsw m5, m3, m6 |
| 27174 | pmulhrsw m5, m7 |
| 27175 | packuswb m4, m5 |
| 27176 | movu [r0 + 1389 * 16], m4 |
| 27177 | |
| 27178 | ; mode 23 [row 23] |
| 27179 | movu m6, [r5 + 8 * 16] |
| 27180 | pmaddubsw m4, m0, m6 |
| 27181 | pmulhrsw m4, m7 |
| 27182 | pmaddubsw m5, m2, m6 |
| 27183 | pmulhrsw m5, m7 |
| 27184 | packuswb m4, m5 |
| 27185 | movu [r0 + 1390 * 16], m4 |
| 27186 | pmaddubsw m4, m1, m6 |
| 27187 | pmulhrsw m4, m7 |
| 27188 | pmaddubsw m5, m3, m6 |
| 27189 | pmulhrsw m5, m7 |
| 27190 | packuswb m4, m5 |
| 27191 | movu [r0 + 1391 * 16], m4 |
| 27192 | |
| 27193 | ; mode 23 [row 24] |
| 27194 | movu m6, [r5 + 31 * 16] |
| 27195 | pslldq m0, 2 |
| 27196 | pinsrb m0, [r4 + 21], 1 |
| 27197 | pinsrb m0, [r4 + 25], 0 |
| 27198 | pmaddubsw m4, m0, m6 |
| 27199 | pmulhrsw m4, m7 |
| 27200 | pslldq m2, 2 |
| 27201 | pinsrw m2, [r3 + 1], 0 |
| 27202 | pmaddubsw m5, m2, m6 |
| 27203 | pmulhrsw m5, m7 |
| 27204 | packuswb m4, m5 |
| 27205 | movu [r0 + 1392 * 16], m4 |
| 27206 | pslldq m1, 2 |
| 27207 | pinsrw m1, [r3 + 9], 0 |
| 27208 | pmaddubsw m4, m1, m6 |
| 27209 | pmulhrsw m4, m7 |
| 27210 | pslldq m3, 2 |
| 27211 | pinsrw m3, [r3 + 17], 0 |
| 27212 | pmaddubsw m5, m3, m6 |
| 27213 | pmulhrsw m5, m7 |
| 27214 | packuswb m4, m5 |
| 27215 | movu [r0 + 1393 * 16], m4 |
| 27216 | |
| 27217 | ; mode 23 [row 25] |
| 27218 | movu m6, [r5 + 22 * 16] |
| 27219 | pmaddubsw m4, m0, m6 |
| 27220 | pmulhrsw m4, m7 |
| 27221 | pmaddubsw m5, m2, m6 |
| 27222 | pmulhrsw m5, m7 |
| 27223 | packuswb m4, m5 |
| 27224 | movu [r0 + 1394 * 16], m4 |
| 27225 | pmaddubsw m4, m1, m6 |
| 27226 | pmulhrsw m4, m7 |
| 27227 | pmaddubsw m5, m3, m6 |
| 27228 | pmulhrsw m5, m7 |
| 27229 | packuswb m4, m5 |
| 27230 | movu [r0 + 1395 * 16], m4 |
| 27231 | |
| 27232 | ; mode 23 [row 26] |
| 27233 | movu m6, [r5 + 13 * 16] |
| 27234 | pmaddubsw m4, m0, m6 |
| 27235 | pmulhrsw m4, m7 |
| 27236 | pmaddubsw m5, m2, m6 |
| 27237 | pmulhrsw m5, m7 |
| 27238 | packuswb m4, m5 |
| 27239 | movu [r0 + 1396 * 16], m4 |
| 27240 | pmaddubsw m4, m1, m6 |
| 27241 | pmulhrsw m4, m7 |
| 27242 | pmaddubsw m5, m3, m6 |
| 27243 | pmulhrsw m5, m7 |
| 27244 | packuswb m4, m5 |
| 27245 | movu [r0 + 1397 * 16], m4 |
| 27246 | |
| 27247 | ; mode 23 [row 27] |
| 27248 | movu m6, [r5 + 4 * 16] |
| 27249 | pmaddubsw m4, m0, m6 |
| 27250 | pmulhrsw m4, m7 |
| 27251 | pmaddubsw m5, m2, m6 |
| 27252 | pmulhrsw m5, m7 |
| 27253 | packuswb m4, m5 |
| 27254 | movu [r0 + 1398 * 16], m4 |
| 27255 | pmaddubsw m4, m1, m6 |
| 27256 | pmulhrsw m4, m7 |
| 27257 | pmaddubsw m5, m3, m6 |
| 27258 | pmulhrsw m5, m7 |
| 27259 | packuswb m4, m5 |
| 27260 | movu [r0 + 1399 * 16], m4 |
| 27261 | |
| 27262 | ; mode 23 [row 28] |
| 27263 | movu m6, [r5 + 27 * 16] |
| 27264 | pslldq m0, 2 |
| 27265 | pinsrb m0, [r4 + 25], 1 |
| 27266 | pinsrb m0, [r4 + 28], 0 |
| 27267 | pmaddubsw m4, m0, m6 |
| 27268 | pmulhrsw m4, m7 |
| 27269 | pslldq m2, 2 |
| 27270 | pinsrw m2, [r3 + 0], 0 |
| 27271 | pmaddubsw m5, m2, m6 |
| 27272 | pmulhrsw m5, m7 |
| 27273 | packuswb m4, m5 |
| 27274 | movu [r0 + 1400 * 16], m4 |
| 27275 | pslldq m1, 2 |
| 27276 | pinsrw m1, [r3 + 8], 0 |
| 27277 | pmaddubsw m4, m1, m6 |
| 27278 | pmulhrsw m4, m7 |
| 27279 | pslldq m3, 2 |
| 27280 | pinsrw m3, [r3 + 16], 0 |
| 27281 | pmaddubsw m5, m3, m6 |
| 27282 | pmulhrsw m5, m7 |
| 27283 | packuswb m4, m5 |
| 27284 | movu [r0 + 1401 * 16], m4 |
| 27285 | |
| 27286 | ; mode 23 [row 29] |
| 27287 | movu m6, [r5 + 18 * 16] |
| 27288 | pmaddubsw m4, m0, m6 |
| 27289 | pmulhrsw m4, m7 |
| 27290 | pmaddubsw m5, m2, m6 |
| 27291 | pmulhrsw m5, m7 |
| 27292 | packuswb m4, m5 |
| 27293 | movu [r0 + 1402 * 16], m4 |
| 27294 | pmaddubsw m4, m1, m6 |
| 27295 | pmulhrsw m4, m7 |
| 27296 | pmaddubsw m5, m3, m6 |
| 27297 | pmulhrsw m5, m7 |
| 27298 | packuswb m4, m5 |
| 27299 | movu [r0 + 1403 * 16], m4 |
| 27300 | |
| 27301 | ; mode 23 [row 30] |
| 27302 | movu m6, [r5 + 9 * 16] |
| 27303 | pmaddubsw m4, m0, m6 |
| 27304 | pmulhrsw m4, m7 |
| 27305 | pmaddubsw m5, m2, m6 |
| 27306 | pmulhrsw m5, m7 |
| 27307 | packuswb m4, m5 |
| 27308 | movu [r0 + 1404 * 16], m4 |
| 27309 | pmaddubsw m4, m1, m6 |
| 27310 | pmulhrsw m4, m7 |
| 27311 | pmaddubsw m5, m3, m6 |
| 27312 | pmulhrsw m5, m7 |
| 27313 | packuswb m4, m5 |
| 27314 | movu [r0 + 1405 * 16], m4 |
| 27315 | |
| 27316 | ; mode 23 [row 31] |
| 27317 | pshufb m5, m0, [tab_S2] |
| 27318 | movh [r0 + 1406 * 16], m5 |
| 27319 | pshufb m5, m2, [tab_S2] |
| 27320 | movh [r0 + 1406 * 16 + 8], m5 |
| 27321 | pshufb m5, m1, [tab_S2] |
| 27322 | movh [r0 + 1407 * 16], m5 |
| 27323 | pshufb m5, m3, [tab_S2] |
| 27324 | movh [r0 + 1407 * 16 + 8], m5 |
| 27325 | |
| 27326 | ; mode 24 [row 0] |
| 27327 | movu m6, [r5 + 27 * 16] |
| 27328 | movu m0, [r3 ] |
| 27329 | movu m1, [r3 + 1 ] |
| 27330 | punpcklbw m0, m1 |
| 27331 | pmaddubsw m4, m0, m6 |
| 27332 | pmulhrsw m4, m7 |
| 27333 | movu m2, [r3 + 8] |
| 27334 | movu m3, [r3 + 9] |
| 27335 | punpcklbw m2, m3 |
| 27336 | pmaddubsw m5, m2, m6 |
| 27337 | pmulhrsw m5, m7 |
| 27338 | packuswb m4, m5 |
| 27339 | movu [r0 + 1408 * 16], m4 |
| 27340 | |
| 27341 | movu m1, [r3 + 16] |
| 27342 | movu m3, [r3 + 17] |
| 27343 | punpcklbw m1, m3 |
| 27344 | pmaddubsw m4, m1, m6 |
| 27345 | pmulhrsw m4, m7 |
| 27346 | movu m3, [r3 + 24] |
| 27347 | movu m5, [r3 + 25] |
| 27348 | punpcklbw m3, m5 |
| 27349 | pmaddubsw m5, m3, m6 |
| 27350 | pmulhrsw m5, m7 |
| 27351 | packuswb m4, m5 |
| 27352 | movu [r0 + 1409 * 16], m4 |
| 27353 | |
| 27354 | ; mode 24 [row 1] |
| 27355 | movu m6, [r5 + 22 * 16] |
| 27356 | pmaddubsw m4, m0, m6 |
| 27357 | pmulhrsw m4, m7 |
| 27358 | pmaddubsw m5, m2, m6 |
| 27359 | pmulhrsw m5, m7 |
| 27360 | packuswb m4, m5 |
| 27361 | movu [r0 + 1410 * 16], m4 |
| 27362 | pmaddubsw m4, m1, m6 |
| 27363 | pmulhrsw m4, m7 |
| 27364 | pmaddubsw m5, m3, m6 |
| 27365 | pmulhrsw m5, m7 |
| 27366 | packuswb m4, m5 |
| 27367 | movu [r0 + 1411 * 16], m4 |
| 27368 | |
| 27369 | ; mode 24 [row 2] |
| 27370 | movu m6, [r5 + 17 * 16] |
| 27371 | pmaddubsw m4, m0, m6 |
| 27372 | pmulhrsw m4, m7 |
| 27373 | pmaddubsw m5, m2, m6 |
| 27374 | pmulhrsw m5, m7 |
| 27375 | packuswb m4, m5 |
| 27376 | movu [r0 + 1412 * 16], m4 |
| 27377 | pmaddubsw m4, m1, m6 |
| 27378 | pmulhrsw m4, m7 |
| 27379 | pmaddubsw m5, m3, m6 |
| 27380 | pmulhrsw m5, m7 |
| 27381 | packuswb m4, m5 |
| 27382 | movu [r0 + 1413 * 16], m4 |
| 27383 | |
| 27384 | ; mode 24 [row 3] |
| 27385 | movu m6, [r5 + 12 * 16] |
| 27386 | pmaddubsw m4, m0, m6 |
| 27387 | pmulhrsw m4, m7 |
| 27388 | pmaddubsw m5, m2, m6 |
| 27389 | pmulhrsw m5, m7 |
| 27390 | packuswb m4, m5 |
| 27391 | movu [r0 + 1414 * 16], m4 |
| 27392 | pmaddubsw m4, m1, m6 |
| 27393 | pmulhrsw m4, m7 |
| 27394 | pmaddubsw m5, m3, m6 |
| 27395 | pmulhrsw m5, m7 |
| 27396 | packuswb m4, m5 |
| 27397 | movu [r0 + 1415 * 16], m4 |
| 27398 | |
| 27399 | ; mode 24 [row 4] |
| 27400 | movu m6, [r5 + 7 * 16] |
| 27401 | pmaddubsw m4, m0, m6 |
| 27402 | pmulhrsw m4, m7 |
| 27403 | pmaddubsw m5, m2, m6 |
| 27404 | pmulhrsw m5, m7 |
| 27405 | packuswb m4, m5 |
| 27406 | movu [r0 + 1416 * 16], m4 |
| 27407 | pmaddubsw m4, m1, m6 |
| 27408 | pmulhrsw m4, m7 |
| 27409 | pmaddubsw m5, m3, m6 |
| 27410 | pmulhrsw m5, m7 |
| 27411 | packuswb m4, m5 |
| 27412 | movu [r0 + 1417 * 16], m4 |
| 27413 | |
| 27414 | ; mode 24 [row 5] |
| 27415 | movu m6, [r5 + 2 * 16] |
| 27416 | pmaddubsw m4, m0, m6 |
| 27417 | pmulhrsw m4, m7 |
| 27418 | pmaddubsw m5, m2, m6 |
| 27419 | pmulhrsw m5, m7 |
| 27420 | packuswb m4, m5 |
| 27421 | movu [r0 + 1418 * 16], m4 |
| 27422 | pmaddubsw m4, m1, m6 |
| 27423 | pmulhrsw m4, m7 |
| 27424 | pmaddubsw m5, m3, m6 |
| 27425 | pmulhrsw m5, m7 |
| 27426 | packuswb m4, m5 |
| 27427 | movu [r0 + 1419 * 16], m4 |
| 27428 | |
| 27429 | ; mode 24 [row 6] |
| 27430 | movu m6, [r5 + 29 * 16] |
| 27431 | pslldq m0, 2 |
| 27432 | pinsrb m0, [r4 + 0], 1 |
| 27433 | pinsrb m0, [r4 + 6], 0 |
| 27434 | pmaddubsw m4, m0, m6 |
| 27435 | pmulhrsw m4, m7 |
| 27436 | pslldq m2, 2 |
| 27437 | pinsrw m2, [r3 + 7], 0 |
| 27438 | pmaddubsw m5, m2, m6 |
| 27439 | pmulhrsw m5, m7 |
| 27440 | packuswb m4, m5 |
| 27441 | movu [r0 + 1420 * 16], m4 |
| 27442 | pslldq m1, 2 |
| 27443 | pinsrw m1, [r3 + 15], 0 |
| 27444 | pmaddubsw m4, m1, m6 |
| 27445 | pmulhrsw m4, m7 |
| 27446 | pslldq m3, 2 |
| 27447 | pinsrw m3, [r3 + 23], 0 |
| 27448 | pmaddubsw m5, m3, m6 |
| 27449 | pmulhrsw m5, m7 |
| 27450 | packuswb m4, m5 |
| 27451 | movu [r0 + 1421 * 16], m4 |
| 27452 | |
| 27453 | ; mode 24 [row 7] |
| 27454 | movu m6, [r5 + 24 * 16] |
| 27455 | pmaddubsw m4, m0, m6 |
| 27456 | pmulhrsw m4, m7 |
| 27457 | pmaddubsw m5, m2, m6 |
| 27458 | pmulhrsw m5, m7 |
| 27459 | packuswb m4, m5 |
| 27460 | movu [r0 + 1422 * 16], m4 |
| 27461 | pmaddubsw m4, m1, m6 |
| 27462 | pmulhrsw m4, m7 |
| 27463 | pmaddubsw m5, m3, m6 |
| 27464 | pmulhrsw m5, m7 |
| 27465 | packuswb m4, m5 |
| 27466 | movu [r0 + 1423 * 16], m4 |
| 27467 | |
| 27468 | ; mode 24 [row 8] |
| 27469 | movu m6, [r5 + 19 * 16] |
| 27470 | pmaddubsw m4, m0, m6 |
| 27471 | pmulhrsw m4, m7 |
| 27472 | pmaddubsw m5, m2, m6 |
| 27473 | pmulhrsw m5, m7 |
| 27474 | packuswb m4, m5 |
| 27475 | movu [r0 + 1424 * 16], m4 |
| 27476 | pmaddubsw m4, m1, m6 |
| 27477 | pmulhrsw m4, m7 |
| 27478 | pmaddubsw m5, m3, m6 |
| 27479 | pmulhrsw m5, m7 |
| 27480 | packuswb m4, m5 |
| 27481 | movu [r0 + 1425 * 16], m4 |
| 27482 | |
| 27483 | ; mode 24 [row 9] |
| 27484 | movu m6, [r5 + 14 * 16] |
| 27485 | pmaddubsw m4, m0, m6 |
| 27486 | pmulhrsw m4, m7 |
| 27487 | pmaddubsw m5, m2, m6 |
| 27488 | pmulhrsw m5, m7 |
| 27489 | packuswb m4, m5 |
| 27490 | movu [r0 + 1426 * 16], m4 |
| 27491 | pmaddubsw m4, m1, m6 |
| 27492 | pmulhrsw m4, m7 |
| 27493 | pmaddubsw m5, m3, m6 |
| 27494 | pmulhrsw m5, m7 |
| 27495 | packuswb m4, m5 |
| 27496 | movu [r0 + 1427 * 16], m4 |
| 27497 | |
| 27498 | ; mode 24 [row 10] |
| 27499 | movu m6, [r5 + 9 * 16] |
| 27500 | pmaddubsw m4, m0, m6 |
| 27501 | pmulhrsw m4, m7 |
| 27502 | pmaddubsw m5, m2, m6 |
| 27503 | pmulhrsw m5, m7 |
| 27504 | packuswb m4, m5 |
| 27505 | movu [r0 + 1428 * 16], m4 |
| 27506 | pmaddubsw m4, m1, m6 |
| 27507 | pmulhrsw m4, m7 |
| 27508 | pmaddubsw m5, m3, m6 |
| 27509 | pmulhrsw m5, m7 |
| 27510 | packuswb m4, m5 |
| 27511 | movu [r0 + 1429 * 16], m4 |
| 27512 | |
| 27513 | ; mode 24 [row 11] |
| 27514 | movu m6, [r5 + 4 * 16] |
| 27515 | pmaddubsw m4, m0, m6 |
| 27516 | pmulhrsw m4, m7 |
| 27517 | pmaddubsw m5, m2, m6 |
| 27518 | pmulhrsw m5, m7 |
| 27519 | packuswb m4, m5 |
| 27520 | movu [r0 + 1430 * 16], m4 |
| 27521 | pmaddubsw m4, m1, m6 |
| 27522 | pmulhrsw m4, m7 |
| 27523 | pmaddubsw m5, m3, m6 |
| 27524 | pmulhrsw m5, m7 |
| 27525 | packuswb m4, m5 |
| 27526 | movu [r0 + 1431 * 16], m4 |
| 27527 | |
| 27528 | ; mode 24 [row 12] |
| 27529 | movu m6, [r5 + 31 * 16] |
| 27530 | pslldq m0, 2 |
| 27531 | pinsrb m0, [r4 + 6], 1 |
| 27532 | pinsrb m0, [r4 + 13], 0 |
| 27533 | pmaddubsw m4, m0, m6 |
| 27534 | pmulhrsw m4, m7 |
| 27535 | pslldq m2, 2 |
| 27536 | pinsrw m2, [r3 + 6], 0 |
| 27537 | pmaddubsw m5, m2, m6 |
| 27538 | pmulhrsw m5, m7 |
| 27539 | packuswb m4, m5 |
| 27540 | movu [r0 + 1432 * 16], m4 |
| 27541 | pslldq m1, 2 |
| 27542 | pinsrw m1, [r3 + 14], 0 |
| 27543 | pmaddubsw m4, m1, m6 |
| 27544 | pmulhrsw m4, m7 |
| 27545 | pslldq m3, 2 |
| 27546 | pinsrw m3, [r3 + 22], 0 |
| 27547 | pmaddubsw m5, m3, m6 |
| 27548 | pmulhrsw m5, m7 |
| 27549 | packuswb m4, m5 |
| 27550 | movu [r0 + 1433 * 16], m4 |
| 27551 | |
| 27552 | ; mode 24 [row 13] |
| 27553 | movu m6, [r5 + 26 * 16] |
| 27554 | pmaddubsw m4, m0, m6 |
| 27555 | pmulhrsw m4, m7 |
| 27556 | pmaddubsw m5, m2, m6 |
| 27557 | pmulhrsw m5, m7 |
| 27558 | packuswb m4, m5 |
| 27559 | movu [r0 + 1434 * 16], m4 |
| 27560 | pmaddubsw m4, m1, m6 |
| 27561 | pmulhrsw m4, m7 |
| 27562 | pmaddubsw m5, m3, m6 |
| 27563 | pmulhrsw m5, m7 |
| 27564 | packuswb m4, m5 |
| 27565 | movu [r0 + 1435 * 16], m4 |
| 27566 | |
| 27567 | ; mode 24 [row 14] |
| 27568 | movu m6, [r5 + 21 * 16] |
| 27569 | pmaddubsw m4, m0, m6 |
| 27570 | pmulhrsw m4, m7 |
| 27571 | pmaddubsw m5, m2, m6 |
| 27572 | pmulhrsw m5, m7 |
| 27573 | packuswb m4, m5 |
| 27574 | movu [r0 + 1436 * 16], m4 |
| 27575 | pmaddubsw m4, m1, m6 |
| 27576 | pmulhrsw m4, m7 |
| 27577 | pmaddubsw m5, m3, m6 |
| 27578 | pmulhrsw m5, m7 |
| 27579 | packuswb m4, m5 |
| 27580 | movu [r0 + 1437 * 16], m4 |
| 27581 | |
| 27582 | ; mode 24 [row 15] |
| 27583 | movu m6, [r5 + 16 * 16] |
| 27584 | pmaddubsw m4, m0, m6 |
| 27585 | pmulhrsw m4, m7 |
| 27586 | pmaddubsw m5, m2, m6 |
| 27587 | pmulhrsw m5, m7 |
| 27588 | packuswb m4, m5 |
| 27589 | movu [r0 + 1438 * 16], m4 |
| 27590 | pmaddubsw m4, m1, m6 |
| 27591 | pmulhrsw m4, m7 |
| 27592 | pmaddubsw m5, m3, m6 |
| 27593 | pmulhrsw m5, m7 |
| 27594 | packuswb m4, m5 |
| 27595 | movu [r0 + 1439 * 16], m4 |
| 27596 | |
| 27597 | ; mode 24 [row 16] |
| 27598 | movu m6, [r5 + 11 * 16] |
| 27599 | pmaddubsw m4, m0, m6 |
| 27600 | pmulhrsw m4, m7 |
| 27601 | pmaddubsw m5, m2, m6 |
| 27602 | pmulhrsw m5, m7 |
| 27603 | packuswb m4, m5 |
| 27604 | movu [r0 + 1440 * 16], m4 |
| 27605 | pmaddubsw m4, m1, m6 |
| 27606 | pmulhrsw m4, m7 |
| 27607 | pmaddubsw m5, m3, m6 |
| 27608 | pmulhrsw m5, m7 |
| 27609 | packuswb m4, m5 |
| 27610 | movu [r0 + 1441 * 16], m4 |
| 27611 | |
| 27612 | ; mode 24 [row 17] |
| 27613 | movu m6, [r5 + 6 * 16] |
| 27614 | pmaddubsw m4, m0, m6 |
| 27615 | pmulhrsw m4, m7 |
| 27616 | pmaddubsw m5, m2, m6 |
| 27617 | pmulhrsw m5, m7 |
| 27618 | packuswb m4, m5 |
| 27619 | movu [r0 + 1442 * 16], m4 |
| 27620 | pmaddubsw m4, m1, m6 |
| 27621 | pmulhrsw m4, m7 |
| 27622 | pmaddubsw m5, m3, m6 |
| 27623 | pmulhrsw m5, m7 |
| 27624 | packuswb m4, m5 |
| 27625 | movu [r0 + 1443 * 16], m4 |
| 27626 | |
| 27627 | ; mode 24 [row 18] |
| 27628 | movu m6, [r5 + 1 * 16] |
| 27629 | pmaddubsw m4, m0, m6 |
| 27630 | pmulhrsw m4, m7 |
| 27631 | pmaddubsw m5, m2, m6 |
| 27632 | pmulhrsw m5, m7 |
| 27633 | packuswb m4, m5 |
| 27634 | movu [r0 + 1444 * 16], m4 |
| 27635 | pmaddubsw m4, m1, m6 |
| 27636 | pmulhrsw m4, m7 |
| 27637 | pmaddubsw m5, m3, m6 |
| 27638 | pmulhrsw m5, m7 |
| 27639 | packuswb m4, m5 |
| 27640 | movu [r0 + 1445 * 16], m4 |
| 27641 | |
| 27642 | ; mode 24 [row 19] |
| 27643 | movu m6, [r5 + 28 * 16] |
| 27644 | pslldq m0, 2 |
| 27645 | pinsrb m0, [r4 + 13], 1 |
| 27646 | pinsrb m0, [r4 + 19], 0 |
| 27647 | pmaddubsw m4, m0, m6 |
| 27648 | pmulhrsw m4, m7 |
| 27649 | pslldq m2, 2 |
| 27650 | pinsrw m2, [r3 + 5], 0 |
| 27651 | pmaddubsw m5, m2, m6 |
| 27652 | pmulhrsw m5, m7 |
| 27653 | packuswb m4, m5 |
| 27654 | movu [r0 + 1446 * 16], m4 |
| 27655 | pslldq m1, 2 |
| 27656 | pinsrw m1, [r3 + 13], 0 |
| 27657 | pmaddubsw m4, m1, m6 |
| 27658 | pmulhrsw m4, m7 |
| 27659 | pslldq m3, 2 |
| 27660 | pinsrw m3, [r3 + 21], 0 |
| 27661 | pmaddubsw m5, m3, m6 |
| 27662 | pmulhrsw m5, m7 |
| 27663 | packuswb m4, m5 |
| 27664 | movu [r0 + 1447 * 16], m4 |
| 27665 | |
| 27666 | ; mode 24 [row 20] |
| 27667 | movu m6, [r5 + 23 * 16] |
| 27668 | pmaddubsw m4, m0, m6 |
| 27669 | pmulhrsw m4, m7 |
| 27670 | pmaddubsw m5, m2, m6 |
| 27671 | pmulhrsw m5, m7 |
| 27672 | packuswb m4, m5 |
| 27673 | movu [r0 + 1448 * 16], m4 |
| 27674 | pmaddubsw m4, m1, m6 |
| 27675 | pmulhrsw m4, m7 |
| 27676 | pmaddubsw m5, m3, m6 |
| 27677 | pmulhrsw m5, m7 |
| 27678 | packuswb m4, m5 |
| 27679 | movu [r0 + 1449 * 16], m4 |
| 27680 | |
| 27681 | ; mode 24 [row 21] |
| 27682 | movu m6, [r5 + 18 * 16] |
| 27683 | pmaddubsw m4, m0, m6 |
| 27684 | pmulhrsw m4, m7 |
| 27685 | pmaddubsw m5, m2, m6 |
| 27686 | pmulhrsw m5, m7 |
| 27687 | packuswb m4, m5 |
| 27688 | movu [r0 + 1450 * 16], m4 |
| 27689 | pmaddubsw m4, m1, m6 |
| 27690 | pmulhrsw m4, m7 |
| 27691 | pmaddubsw m5, m3, m6 |
| 27692 | pmulhrsw m5, m7 |
| 27693 | packuswb m4, m5 |
| 27694 | movu [r0 + 1451 * 16], m4 |
| 27695 | |
| 27696 | ; mode 24 [row 22] |
| 27697 | movu m6, [r5 + 13 * 16] |
| 27698 | pmaddubsw m4, m0, m6 |
| 27699 | pmulhrsw m4, m7 |
| 27700 | pmaddubsw m5, m2, m6 |
| 27701 | pmulhrsw m5, m7 |
| 27702 | packuswb m4, m5 |
| 27703 | movu [r0 + 1452 * 16], m4 |
| 27704 | pmaddubsw m4, m1, m6 |
| 27705 | pmulhrsw m4, m7 |
| 27706 | pmaddubsw m5, m3, m6 |
| 27707 | pmulhrsw m5, m7 |
| 27708 | packuswb m4, m5 |
| 27709 | movu [r0 + 1453 * 16], m4 |
| 27710 | |
| 27711 | ; mode 24 [row 23] |
| 27712 | movu m6, [r5 + 8 * 16] |
| 27713 | pmaddubsw m4, m0, m6 |
| 27714 | pmulhrsw m4, m7 |
| 27715 | pmaddubsw m5, m2, m6 |
| 27716 | pmulhrsw m5, m7 |
| 27717 | packuswb m4, m5 |
| 27718 | movu [r0 + 1454 * 16], m4 |
| 27719 | pmaddubsw m4, m1, m6 |
| 27720 | pmulhrsw m4, m7 |
| 27721 | pmaddubsw m5, m3, m6 |
| 27722 | pmulhrsw m5, m7 |
| 27723 | packuswb m4, m5 |
| 27724 | movu [r0 + 1455 * 16], m4 |
| 27725 | |
| 27726 | ; mode 24 [row 24] |
| 27727 | movu m6, [r5 + 3 * 16] |
| 27728 | pmaddubsw m4, m0, m6 |
| 27729 | pmulhrsw m4, m7 |
| 27730 | pmaddubsw m5, m2, m6 |
| 27731 | pmulhrsw m5, m7 |
| 27732 | packuswb m4, m5 |
| 27733 | movu [r0 + 1456 * 16], m4 |
| 27734 | pmaddubsw m4, m1, m6 |
| 27735 | pmulhrsw m4, m7 |
| 27736 | pmaddubsw m5, m3, m6 |
| 27737 | pmulhrsw m5, m7 |
| 27738 | packuswb m4, m5 |
| 27739 | movu [r0 + 1457 * 16], m4 |
| 27740 | |
| 27741 | ; mode 24 [row 25] |
| 27742 | movu m6, [r5 + 30 * 16] |
| 27743 | pslldq m0, 2 |
| 27744 | pinsrb m0, [r4 + 19], 1 |
| 27745 | pinsrb m0, [r4 + 26], 0 |
| 27746 | pmaddubsw m4, m0, m6 |
| 27747 | pmulhrsw m4, m7 |
| 27748 | pslldq m2, 2 |
| 27749 | pinsrw m2, [r3 + 4], 0 |
| 27750 | pmaddubsw m5, m2, m6 |
| 27751 | pmulhrsw m5, m7 |
| 27752 | packuswb m4, m5 |
| 27753 | movu [r0 + 1458 * 16], m4 |
| 27754 | pslldq m1, 2 |
| 27755 | pinsrw m1, [r3 + 12], 0 |
| 27756 | pmaddubsw m4, m1, m6 |
| 27757 | pmulhrsw m4, m7 |
| 27758 | pslldq m3, 2 |
| 27759 | pinsrw m3, [r3 + 20], 0 |
| 27760 | pmaddubsw m5, m3, m6 |
| 27761 | pmulhrsw m5, m7 |
| 27762 | packuswb m4, m5 |
| 27763 | movu [r0 + 1459 * 16], m4 |
| 27764 | |
| 27765 | ; mode 24 [row 26] |
| 27766 | movu m6, [r5 + 25 * 16] |
| 27767 | pmaddubsw m4, m0, m6 |
| 27768 | pmulhrsw m4, m7 |
| 27769 | pmaddubsw m5, m2, m6 |
| 27770 | pmulhrsw m5, m7 |
| 27771 | packuswb m4, m5 |
| 27772 | movu [r0 + 1460 * 16], m4 |
| 27773 | pmaddubsw m4, m1, m6 |
| 27774 | pmulhrsw m4, m7 |
| 27775 | pmaddubsw m5, m3, m6 |
| 27776 | pmulhrsw m5, m7 |
| 27777 | packuswb m4, m5 |
| 27778 | movu [r0 + 1461 * 16], m4 |
| 27779 | |
| 27780 | ; mode 24 [row 27] |
| 27781 | movu m6, [r5 + 20 * 16] |
| 27782 | pmaddubsw m4, m0, m6 |
| 27783 | pmulhrsw m4, m7 |
| 27784 | pmaddubsw m5, m2, m6 |
| 27785 | pmulhrsw m5, m7 |
| 27786 | packuswb m4, m5 |
| 27787 | movu [r0 + 1462 * 16], m4 |
| 27788 | pmaddubsw m4, m1, m6 |
| 27789 | pmulhrsw m4, m7 |
| 27790 | pmaddubsw m5, m3, m6 |
| 27791 | pmulhrsw m5, m7 |
| 27792 | packuswb m4, m5 |
| 27793 | movu [r0 + 1463 * 16], m4 |
| 27794 | |
| 27795 | ; mode 24 [row 28] |
| 27796 | movu m6, [r5 + 15 * 16] |
| 27797 | pmaddubsw m4, m0, m6 |
| 27798 | pmulhrsw m4, m7 |
| 27799 | pmaddubsw m5, m2, m6 |
| 27800 | pmulhrsw m5, m7 |
| 27801 | packuswb m4, m5 |
| 27802 | movu [r0 + 1464 * 16], m4 |
| 27803 | pmaddubsw m4, m1, m6 |
| 27804 | pmulhrsw m4, m7 |
| 27805 | pmaddubsw m5, m3, m6 |
| 27806 | pmulhrsw m5, m7 |
| 27807 | packuswb m4, m5 |
| 27808 | movu [r0 + 1465 * 16], m4 |
| 27809 | |
| 27810 | ; mode 24 [row 29] |
| 27811 | movu m6, [r5 + 10 * 16] |
| 27812 | pmaddubsw m4, m0, m6 |
| 27813 | pmulhrsw m4, m7 |
| 27814 | pmaddubsw m5, m2, m6 |
| 27815 | pmulhrsw m5, m7 |
| 27816 | packuswb m4, m5 |
| 27817 | movu [r0 + 1466 * 16], m4 |
| 27818 | pmaddubsw m4, m1, m6 |
| 27819 | pmulhrsw m4, m7 |
| 27820 | pmaddubsw m5, m3, m6 |
| 27821 | pmulhrsw m5, m7 |
| 27822 | packuswb m4, m5 |
| 27823 | movu [r0 + 1467 * 16], m4 |
| 27824 | |
| 27825 | ; mode 24 [row 30] |
| 27826 | movu m6, [r5 + 5 * 16] |
| 27827 | pmaddubsw m4, m0, m6 |
| 27828 | pmulhrsw m4, m7 |
| 27829 | pmaddubsw m5, m2, m6 |
| 27830 | pmulhrsw m5, m7 |
| 27831 | packuswb m4, m5 |
| 27832 | movu [r0 + 1468 * 16], m4 |
| 27833 | pmaddubsw m4, m1, m6 |
| 27834 | pmulhrsw m4, m7 |
| 27835 | pmaddubsw m5, m3, m6 |
| 27836 | pmulhrsw m5, m7 |
| 27837 | packuswb m4, m5 |
| 27838 | movu [r0 + 1469 * 16], m4 |
| 27839 | |
| 27840 | ; mode 24 [row 31] |
| 27841 | pshufb m5, m0, [tab_S2] |
| 27842 | movh [r0 + 1470 * 16], m5 |
| 27843 | pshufb m5, m2, [tab_S2] |
| 27844 | movh [r0 + 1470 * 16 + 8], m5 |
| 27845 | pshufb m5, m1, [tab_S2] |
| 27846 | movh [r0 + 1471 * 16], m5 |
| 27847 | pshufb m5, m3, [tab_S2] |
| 27848 | movh [r0 + 1471 * 16 + 8], m5 |
| 27849 | |
| 27850 | ; mode 25 [row 0] |
| 27851 | movu m6, [r5 + 30 * 16] |
| 27852 | movu m0, [r3 ] |
| 27853 | movu m1, [r3 + 1 ] |
| 27854 | punpcklbw m0, m1 |
| 27855 | pmaddubsw m4, m0, m6 |
| 27856 | pmulhrsw m4, m7 |
| 27857 | movu m2, [r3 + 8] |
| 27858 | movu m3, [r3 + 9] |
| 27859 | punpcklbw m2, m3 |
| 27860 | pmaddubsw m5, m2, m6 |
| 27861 | pmulhrsw m5, m7 |
| 27862 | packuswb m4, m5 |
| 27863 | movu [r0 + 1472 * 16], m4 |
| 27864 | |
| 27865 | movu m1, [r3 + 16] |
| 27866 | movu m3, [r3 + 17] |
| 27867 | punpcklbw m1, m3 |
| 27868 | pmaddubsw m4, m1, m6 |
| 27869 | pmulhrsw m4, m7 |
| 27870 | movu m3, [r3 + 24] |
| 27871 | movu m5, [r3 + 25] |
| 27872 | punpcklbw m3, m5 |
| 27873 | pmaddubsw m5, m3, m6 |
| 27874 | pmulhrsw m5, m7 |
| 27875 | packuswb m4, m5 |
| 27876 | movu [r0 + 1473 * 16], m4 |
| 27877 | |
| 27878 | ; mode 25 [row 1] |
| 27879 | movu m6, [r5 + 28 * 16] |
| 27880 | pmaddubsw m4, m0, m6 |
| 27881 | pmulhrsw m4, m7 |
| 27882 | pmaddubsw m5, m2, m6 |
| 27883 | pmulhrsw m5, m7 |
| 27884 | packuswb m4, m5 |
| 27885 | movu [r0 + 1474 * 16], m4 |
| 27886 | pmaddubsw m4, m1, m6 |
| 27887 | pmulhrsw m4, m7 |
| 27888 | pmaddubsw m5, m3, m6 |
| 27889 | pmulhrsw m5, m7 |
| 27890 | packuswb m4, m5 |
| 27891 | movu [r0 + 1475 * 16], m4 |
| 27892 | |
| 27893 | ; mode 25 [row 2] |
| 27894 | movu m6, [r5 + 26 * 16] |
| 27895 | pmaddubsw m4, m0, m6 |
| 27896 | pmulhrsw m4, m7 |
| 27897 | pmaddubsw m5, m2, m6 |
| 27898 | pmulhrsw m5, m7 |
| 27899 | packuswb m4, m5 |
| 27900 | movu [r0 + 1476 * 16], m4 |
| 27901 | pmaddubsw m4, m1, m6 |
| 27902 | pmulhrsw m4, m7 |
| 27903 | pmaddubsw m5, m3, m6 |
| 27904 | pmulhrsw m5, m7 |
| 27905 | packuswb m4, m5 |
| 27906 | movu [r0 + 1477 * 16], m4 |
| 27907 | |
| 27908 | ; mode 25 [row 3] |
| 27909 | movu m6, [r5 + 24 * 16] |
| 27910 | pmaddubsw m4, m0, m6 |
| 27911 | pmulhrsw m4, m7 |
| 27912 | pmaddubsw m5, m2, m6 |
| 27913 | pmulhrsw m5, m7 |
| 27914 | packuswb m4, m5 |
| 27915 | movu [r0 + 1478 * 16], m4 |
| 27916 | pmaddubsw m4, m1, m6 |
| 27917 | pmulhrsw m4, m7 |
| 27918 | pmaddubsw m5, m3, m6 |
| 27919 | pmulhrsw m5, m7 |
| 27920 | packuswb m4, m5 |
| 27921 | movu [r0 + 1479 * 16], m4 |
| 27922 | |
| 27923 | ; mode 25 [row 4] |
| 27924 | movu m6, [r5 + 22 * 16] |
| 27925 | pmaddubsw m4, m0, m6 |
| 27926 | pmulhrsw m4, m7 |
| 27927 | pmaddubsw m5, m2, m6 |
| 27928 | pmulhrsw m5, m7 |
| 27929 | packuswb m4, m5 |
| 27930 | movu [r0 + 1480 * 16], m4 |
| 27931 | pmaddubsw m4, m1, m6 |
| 27932 | pmulhrsw m4, m7 |
| 27933 | pmaddubsw m5, m3, m6 |
| 27934 | pmulhrsw m5, m7 |
| 27935 | packuswb m4, m5 |
| 27936 | movu [r0 + 1481 * 16], m4 |
| 27937 | |
| 27938 | ; mode 25 [row 5] |
| 27939 | movu m6, [r5 + 20 * 16] |
| 27940 | pmaddubsw m4, m0, m6 |
| 27941 | pmulhrsw m4, m7 |
| 27942 | pmaddubsw m5, m2, m6 |
| 27943 | pmulhrsw m5, m7 |
| 27944 | packuswb m4, m5 |
| 27945 | movu [r0 + 1482 * 16], m4 |
| 27946 | pmaddubsw m4, m1, m6 |
| 27947 | pmulhrsw m4, m7 |
| 27948 | pmaddubsw m5, m3, m6 |
| 27949 | pmulhrsw m5, m7 |
| 27950 | packuswb m4, m5 |
| 27951 | movu [r0 + 1483 * 16], m4 |
| 27952 | |
| 27953 | ; mode 25 [row 6] |
| 27954 | movu m6, [r5 + 18 * 16] |
| 27955 | pmaddubsw m4, m0, m6 |
| 27956 | pmulhrsw m4, m7 |
| 27957 | pmaddubsw m5, m2, m6 |
| 27958 | pmulhrsw m5, m7 |
| 27959 | packuswb m4, m5 |
| 27960 | movu [r0 + 1484 * 16], m4 |
| 27961 | pmaddubsw m4, m1, m6 |
| 27962 | pmulhrsw m4, m7 |
| 27963 | pmaddubsw m5, m3, m6 |
| 27964 | pmulhrsw m5, m7 |
| 27965 | packuswb m4, m5 |
| 27966 | movu [r0 + 1485 * 16], m4 |
| 27967 | |
| 27968 | ; mode 25 [row 7] |
| 27969 | movu m6, [r5 + 16 * 16] |
| 27970 | pmaddubsw m4, m0, m6 |
| 27971 | pmulhrsw m4, m7 |
| 27972 | pmaddubsw m5, m2, m6 |
| 27973 | pmulhrsw m5, m7 |
| 27974 | packuswb m4, m5 |
| 27975 | movu [r0 + 1486 * 16], m4 |
| 27976 | pmaddubsw m4, m1, m6 |
| 27977 | pmulhrsw m4, m7 |
| 27978 | pmaddubsw m5, m3, m6 |
| 27979 | pmulhrsw m5, m7 |
| 27980 | packuswb m4, m5 |
| 27981 | movu [r0 + 1487 * 16], m4 |
| 27982 | |
| 27983 | ; mode 25 [row 8] |
| 27984 | movu m6, [r5 + 14 * 16] |
| 27985 | pmaddubsw m4, m0, m6 |
| 27986 | pmulhrsw m4, m7 |
| 27987 | pmaddubsw m5, m2, m6 |
| 27988 | pmulhrsw m5, m7 |
| 27989 | packuswb m4, m5 |
| 27990 | movu [r0 + 1488 * 16], m4 |
| 27991 | pmaddubsw m4, m1, m6 |
| 27992 | pmulhrsw m4, m7 |
| 27993 | pmaddubsw m5, m3, m6 |
| 27994 | pmulhrsw m5, m7 |
| 27995 | packuswb m4, m5 |
| 27996 | movu [r0 + 1489 * 16], m4 |
| 27997 | |
| 27998 | ; mode 25 [row 9] |
| 27999 | movu m6, [r5 + 12 * 16] |
| 28000 | pmaddubsw m4, m0, m6 |
| 28001 | pmulhrsw m4, m7 |
| 28002 | pmaddubsw m5, m2, m6 |
| 28003 | pmulhrsw m5, m7 |
| 28004 | packuswb m4, m5 |
| 28005 | movu [r0 + 1490 * 16], m4 |
| 28006 | pmaddubsw m4, m1, m6 |
| 28007 | pmulhrsw m4, m7 |
| 28008 | pmaddubsw m5, m3, m6 |
| 28009 | pmulhrsw m5, m7 |
| 28010 | packuswb m4, m5 |
| 28011 | movu [r0 + 1491 * 16], m4 |
| 28012 | |
| 28013 | ; mode 25 [row 10] |
| 28014 | movu m6, [r5 + 10 * 16] |
| 28015 | pmaddubsw m4, m0, m6 |
| 28016 | pmulhrsw m4, m7 |
| 28017 | pmaddubsw m5, m2, m6 |
| 28018 | pmulhrsw m5, m7 |
| 28019 | packuswb m4, m5 |
| 28020 | movu [r0 + 1492 * 16], m4 |
| 28021 | pmaddubsw m4, m1, m6 |
| 28022 | pmulhrsw m4, m7 |
| 28023 | pmaddubsw m5, m3, m6 |
| 28024 | pmulhrsw m5, m7 |
| 28025 | packuswb m4, m5 |
| 28026 | movu [r0 + 1493 * 16], m4 |
| 28027 | |
| 28028 | ; mode 25 [row 11] |
| 28029 | movu m6, [r5 + 8 * 16] |
| 28030 | pmaddubsw m4, m0, m6 |
| 28031 | pmulhrsw m4, m7 |
| 28032 | pmaddubsw m5, m2, m6 |
| 28033 | pmulhrsw m5, m7 |
| 28034 | packuswb m4, m5 |
| 28035 | movu [r0 + 1494 * 16], m4 |
| 28036 | pmaddubsw m4, m1, m6 |
| 28037 | pmulhrsw m4, m7 |
| 28038 | pmaddubsw m5, m3, m6 |
| 28039 | pmulhrsw m5, m7 |
| 28040 | packuswb m4, m5 |
| 28041 | movu [r0 + 1495 * 16], m4 |
| 28042 | |
| 28043 | ; mode 25 [row 12] |
| 28044 | movu m6, [r5 + 6 * 16] |
| 28045 | pmaddubsw m4, m0, m6 |
| 28046 | pmulhrsw m4, m7 |
| 28047 | pmaddubsw m5, m2, m6 |
| 28048 | pmulhrsw m5, m7 |
| 28049 | packuswb m4, m5 |
| 28050 | movu [r0 + 1496 * 16], m4 |
| 28051 | pmaddubsw m4, m1, m6 |
| 28052 | pmulhrsw m4, m7 |
| 28053 | pmaddubsw m5, m3, m6 |
| 28054 | pmulhrsw m5, m7 |
| 28055 | packuswb m4, m5 |
| 28056 | movu [r0 + 1497 * 16], m4 |
| 28057 | |
| 28058 | ; mode 25 [row 13] |
| 28059 | movu m6, [r5 + 4 * 16] |
| 28060 | pmaddubsw m4, m0, m6 |
| 28061 | pmulhrsw m4, m7 |
| 28062 | pmaddubsw m5, m2, m6 |
| 28063 | pmulhrsw m5, m7 |
| 28064 | packuswb m4, m5 |
| 28065 | movu [r0 + 1498 * 16], m4 |
| 28066 | pmaddubsw m4, m1, m6 |
| 28067 | pmulhrsw m4, m7 |
| 28068 | pmaddubsw m5, m3, m6 |
| 28069 | pmulhrsw m5, m7 |
| 28070 | packuswb m4, m5 |
| 28071 | movu [r0 + 1499 * 16], m4 |
| 28072 | |
| 28073 | ; mode 25 [row 14] |
| 28074 | movu m6, [r5 + 2 * 16] |
| 28075 | pmaddubsw m4, m0, m6 |
| 28076 | pmulhrsw m4, m7 |
| 28077 | pmaddubsw m5, m2, m6 |
| 28078 | pmulhrsw m5, m7 |
| 28079 | packuswb m4, m5 |
| 28080 | movu [r0 + 1500 * 16], m4 |
| 28081 | pmaddubsw m4, m1, m6 |
| 28082 | pmulhrsw m4, m7 |
| 28083 | pmaddubsw m5, m3, m6 |
| 28084 | pmulhrsw m5, m7 |
| 28085 | packuswb m4, m5 |
| 28086 | movu [r0 + 1501 * 16], m4 |
| 28087 | |
| 28088 | ; mode 25 [row 15] |
| 28089 | pshufb m5, m0, [tab_S2] |
| 28090 | movh [r0 + 1502 * 16], m5 |
| 28091 | pshufb m5, m2, [tab_S2] |
| 28092 | movh [r0 + 1502 * 16 + 8], m5 |
| 28093 | pshufb m5, m1, [tab_S2] |
| 28094 | movh [r0 + 1503 * 16], m5 |
| 28095 | pshufb m5, m3, [tab_S2] |
| 28096 | movh [r0 + 1503 * 16 + 8], m5 |
| 28097 | |
| 28098 | ; mode 25 [row 16] |
| 28099 | movu m6, [r5 + 30 * 16] |
| 28100 | pslldq m0, 2 |
| 28101 | pinsrb m0, [r4 + 0], 1 |
| 28102 | pinsrb m0, [r4 + 16], 0 |
| 28103 | pmaddubsw m4, m0, m6 |
| 28104 | pmulhrsw m4, m7 |
| 28105 | pslldq m2, 2 |
| 28106 | pinsrw m2, [r3 + 7], 0 |
| 28107 | pmaddubsw m5, m2, m6 |
| 28108 | pmulhrsw m5, m7 |
| 28109 | packuswb m4, m5 |
| 28110 | movu [r0 + 1504 * 16], m4 |
| 28111 | pslldq m1, 2 |
| 28112 | pinsrw m1, [r3 + 15], 0 |
| 28113 | pmaddubsw m4, m1, m6 |
| 28114 | pmulhrsw m4, m7 |
| 28115 | pslldq m3, 2 |
| 28116 | pinsrw m3, [r3 + 23], 0 |
| 28117 | pmaddubsw m5, m3, m6 |
| 28118 | pmulhrsw m5, m7 |
| 28119 | packuswb m4, m5 |
| 28120 | movu [r0 + 1505 * 16], m4 |
| 28121 | |
| 28122 | ; mode 25 [row 17] |
| 28123 | movu m6, [r5 + 28 * 16] |
| 28124 | pmaddubsw m4, m0, m6 |
| 28125 | pmulhrsw m4, m7 |
| 28126 | pmaddubsw m5, m2, m6 |
| 28127 | pmulhrsw m5, m7 |
| 28128 | packuswb m4, m5 |
| 28129 | movu [r0 + 1506 * 16], m4 |
| 28130 | pmaddubsw m4, m1, m6 |
| 28131 | pmulhrsw m4, m7 |
| 28132 | pmaddubsw m5, m3, m6 |
| 28133 | pmulhrsw m5, m7 |
| 28134 | packuswb m4, m5 |
| 28135 | movu [r0 + 1507 * 16], m4 |
| 28136 | |
| 28137 | ; mode 25 [row 18] |
| 28138 | movu m6, [r5 + 26 * 16] |
| 28139 | pmaddubsw m4, m0, m6 |
| 28140 | pmulhrsw m4, m7 |
| 28141 | pmaddubsw m5, m2, m6 |
| 28142 | pmulhrsw m5, m7 |
| 28143 | packuswb m4, m5 |
| 28144 | movu [r0 + 1508 * 16], m4 |
| 28145 | pmaddubsw m4, m1, m6 |
| 28146 | pmulhrsw m4, m7 |
| 28147 | pmaddubsw m5, m3, m6 |
| 28148 | pmulhrsw m5, m7 |
| 28149 | packuswb m4, m5 |
| 28150 | movu [r0 + 1509 * 16], m4 |
| 28151 | |
| 28152 | ; mode 25 [row 19] |
| 28153 | movu m6, [r5 + 24 * 16] |
| 28154 | pmaddubsw m4, m0, m6 |
| 28155 | pmulhrsw m4, m7 |
| 28156 | pmaddubsw m5, m2, m6 |
| 28157 | pmulhrsw m5, m7 |
| 28158 | packuswb m4, m5 |
| 28159 | movu [r0 + 1510 * 16], m4 |
| 28160 | pmaddubsw m4, m1, m6 |
| 28161 | pmulhrsw m4, m7 |
| 28162 | pmaddubsw m5, m3, m6 |
| 28163 | pmulhrsw m5, m7 |
| 28164 | packuswb m4, m5 |
| 28165 | movu [r0 + 1511 * 16], m4 |
| 28166 | |
| 28167 | ; mode 25 [row 20] |
| 28168 | movu m6, [r5 + 22 * 16] |
| 28169 | pmaddubsw m4, m0, m6 |
| 28170 | pmulhrsw m4, m7 |
| 28171 | pmaddubsw m5, m2, m6 |
| 28172 | pmulhrsw m5, m7 |
| 28173 | packuswb m4, m5 |
| 28174 | movu [r0 + 1512 * 16], m4 |
| 28175 | pmaddubsw m4, m1, m6 |
| 28176 | pmulhrsw m4, m7 |
| 28177 | pmaddubsw m5, m3, m6 |
| 28178 | pmulhrsw m5, m7 |
| 28179 | packuswb m4, m5 |
| 28180 | movu [r0 + 1513 * 16], m4 |
| 28181 | |
| 28182 | ; mode 25 [row 21] |
| 28183 | movu m6, [r5 + 20 * 16] |
| 28184 | pmaddubsw m4, m0, m6 |
| 28185 | pmulhrsw m4, m7 |
| 28186 | pmaddubsw m5, m2, m6 |
| 28187 | pmulhrsw m5, m7 |
| 28188 | packuswb m4, m5 |
| 28189 | movu [r0 + 1514 * 16], m4 |
| 28190 | pmaddubsw m4, m1, m6 |
| 28191 | pmulhrsw m4, m7 |
| 28192 | pmaddubsw m5, m3, m6 |
| 28193 | pmulhrsw m5, m7 |
| 28194 | packuswb m4, m5 |
| 28195 | movu [r0 + 1515 * 16], m4 |
| 28196 | |
| 28197 | ; mode 25 [row 22] |
| 28198 | movu m6, [r5 + 18 * 16] |
| 28199 | pmaddubsw m4, m0, m6 |
| 28200 | pmulhrsw m4, m7 |
| 28201 | pmaddubsw m5, m2, m6 |
| 28202 | pmulhrsw m5, m7 |
| 28203 | packuswb m4, m5 |
| 28204 | movu [r0 + 1516 * 16], m4 |
| 28205 | pmaddubsw m4, m1, m6 |
| 28206 | pmulhrsw m4, m7 |
| 28207 | pmaddubsw m5, m3, m6 |
| 28208 | pmulhrsw m5, m7 |
| 28209 | packuswb m4, m5 |
| 28210 | movu [r0 + 1517 * 16], m4 |
| 28211 | |
| 28212 | ; mode 25 [row 23] |
| 28213 | movu m6, [r5 + 16 * 16] |
| 28214 | pmaddubsw m4, m0, m6 |
| 28215 | pmulhrsw m4, m7 |
| 28216 | pmaddubsw m5, m2, m6 |
| 28217 | pmulhrsw m5, m7 |
| 28218 | packuswb m4, m5 |
| 28219 | movu [r0 + 1518 * 16], m4 |
| 28220 | pmaddubsw m4, m1, m6 |
| 28221 | pmulhrsw m4, m7 |
| 28222 | pmaddubsw m5, m3, m6 |
| 28223 | pmulhrsw m5, m7 |
| 28224 | packuswb m4, m5 |
| 28225 | movu [r0 + 1519 * 16], m4 |
| 28226 | |
| 28227 | ; mode 25 [row 24] |
| 28228 | movu m6, [r5 + 14 * 16] |
| 28229 | pmaddubsw m4, m0, m6 |
| 28230 | pmulhrsw m4, m7 |
| 28231 | pmaddubsw m5, m2, m6 |
| 28232 | pmulhrsw m5, m7 |
| 28233 | packuswb m4, m5 |
| 28234 | movu [r0 + 1520 * 16], m4 |
| 28235 | pmaddubsw m4, m1, m6 |
| 28236 | pmulhrsw m4, m7 |
| 28237 | pmaddubsw m5, m3, m6 |
| 28238 | pmulhrsw m5, m7 |
| 28239 | packuswb m4, m5 |
| 28240 | movu [r0 + 1521 * 16], m4 |
| 28241 | |
| 28242 | ; mode 25 [row 25] |
| 28243 | movu m6, [r5 + 12 * 16] |
| 28244 | pmaddubsw m4, m0, m6 |
| 28245 | pmulhrsw m4, m7 |
| 28246 | pmaddubsw m5, m2, m6 |
| 28247 | pmulhrsw m5, m7 |
| 28248 | packuswb m4, m5 |
| 28249 | movu [r0 + 1522 * 16], m4 |
| 28250 | pmaddubsw m4, m1, m6 |
| 28251 | pmulhrsw m4, m7 |
| 28252 | pmaddubsw m5, m3, m6 |
| 28253 | pmulhrsw m5, m7 |
| 28254 | packuswb m4, m5 |
| 28255 | movu [r0 + 1523 * 16], m4 |
| 28256 | |
| 28257 | ; mode 25 [row 26] |
| 28258 | movu m6, [r5 + 10 * 16] |
| 28259 | pmaddubsw m4, m0, m6 |
| 28260 | pmulhrsw m4, m7 |
| 28261 | pmaddubsw m5, m2, m6 |
| 28262 | pmulhrsw m5, m7 |
| 28263 | packuswb m4, m5 |
| 28264 | movu [r0 + 1524 * 16], m4 |
| 28265 | pmaddubsw m4, m1, m6 |
| 28266 | pmulhrsw m4, m7 |
| 28267 | pmaddubsw m5, m3, m6 |
| 28268 | pmulhrsw m5, m7 |
| 28269 | packuswb m4, m5 |
| 28270 | movu [r0 + 1525 * 16], m4 |
| 28271 | |
| 28272 | ; mode 25 [row 27] |
| 28273 | movu m6, [r5 + 8 * 16] |
| 28274 | pmaddubsw m4, m0, m6 |
| 28275 | pmulhrsw m4, m7 |
| 28276 | pmaddubsw m5, m2, m6 |
| 28277 | pmulhrsw m5, m7 |
| 28278 | packuswb m4, m5 |
| 28279 | movu [r0 + 1526 * 16], m4 |
| 28280 | pmaddubsw m4, m1, m6 |
| 28281 | pmulhrsw m4, m7 |
| 28282 | pmaddubsw m5, m3, m6 |
| 28283 | pmulhrsw m5, m7 |
| 28284 | packuswb m4, m5 |
| 28285 | movu [r0 + 1527 * 16], m4 |
| 28286 | |
| 28287 | ; mode 25 [row 28] |
| 28288 | movu m6, [r5 + 6 * 16] |
| 28289 | pmaddubsw m4, m0, m6 |
| 28290 | pmulhrsw m4, m7 |
| 28291 | pmaddubsw m5, m2, m6 |
| 28292 | pmulhrsw m5, m7 |
| 28293 | packuswb m4, m5 |
| 28294 | movu [r0 + 1528 * 16], m4 |
| 28295 | pmaddubsw m4, m1, m6 |
| 28296 | pmulhrsw m4, m7 |
| 28297 | pmaddubsw m5, m3, m6 |
| 28298 | pmulhrsw m5, m7 |
| 28299 | packuswb m4, m5 |
| 28300 | movu [r0 + 1529 * 16], m4 |
| 28301 | |
| 28302 | ; mode 25 [row 29] |
| 28303 | movu m6, [r5 + 4 * 16] |
| 28304 | pmaddubsw m4, m0, m6 |
| 28305 | pmulhrsw m4, m7 |
| 28306 | pmaddubsw m5, m2, m6 |
| 28307 | pmulhrsw m5, m7 |
| 28308 | packuswb m4, m5 |
| 28309 | movu [r0 + 1530 * 16], m4 |
| 28310 | pmaddubsw m4, m1, m6 |
| 28311 | pmulhrsw m4, m7 |
| 28312 | pmaddubsw m5, m3, m6 |
| 28313 | pmulhrsw m5, m7 |
| 28314 | packuswb m4, m5 |
| 28315 | movu [r0 + 1531 * 16], m4 |
| 28316 | |
| 28317 | ; mode 25 [row 30] |
| 28318 | movu m6, [r5 + 2 * 16] |
| 28319 | pmaddubsw m4, m0, m6 |
| 28320 | pmulhrsw m4, m7 |
| 28321 | pmaddubsw m5, m2, m6 |
| 28322 | pmulhrsw m5, m7 |
| 28323 | packuswb m4, m5 |
| 28324 | movu [r0 + 1532 * 16], m4 |
| 28325 | pmaddubsw m4, m1, m6 |
| 28326 | pmulhrsw m4, m7 |
| 28327 | pmaddubsw m5, m3, m6 |
| 28328 | pmulhrsw m5, m7 |
| 28329 | packuswb m4, m5 |
| 28330 | movu [r0 + 1533 * 16], m4 |
| 28331 | |
| 28332 | ; mode 25 [row 31] |
| 28333 | pshufb m5, m0, [tab_S2] |
| 28334 | movh [r0 + 1534 * 16], m5 |
| 28335 | pshufb m5, m2, [tab_S2] |
| 28336 | movh [r0 + 1534 * 16 + 8], m5 |
| 28337 | pshufb m5, m1, [tab_S2] |
| 28338 | movh [r0 + 1535 * 16], m5 |
| 28339 | pshufb m5, m3, [tab_S2] |
| 28340 | movh [r0 + 1535 * 16 + 8], m5 |
| 28341 | |
| 28342 | ; mode 26 [rows 0 - 31] - every row is the same 32 bytes: m1 = [r1 + 1], m2 = [r1 + 17] |
| 28343 | movu m1, [r1 + 1] |
| 28344 | movu m2, [r1 + 17] |
| 28345 | movu [r0 + 1536 * 16], m1 |
| 28346 | movu [r0 + 1537 * 16], m2 |
| 28347 | movu [r0 + 1538 * 16], m1 |
| 28348 | movu [r0 + 1539 * 16], m2 |
| 28349 | movu [r0 + 1540 * 16], m1 |
| 28350 | movu [r0 + 1541 * 16], m2 |
| 28351 | movu [r0 + 1542 * 16], m1 |
| 28352 | movu [r0 + 1543 * 16], m2 |
| 28353 | movu [r0 + 1544 * 16], m1 |
| 28354 | movu [r0 + 1545 * 16], m2 |
| 28355 | movu [r0 + 1546 * 16], m1 |
| 28356 | movu [r0 + 1547 * 16], m2 |
| 28357 | movu [r0 + 1548 * 16], m1 |
| 28358 | movu [r0 + 1549 * 16], m2 |
| 28359 | movu [r0 + 1550 * 16], m1 |
| 28360 | movu [r0 + 1551 * 16], m2 |
| 28361 | |
| 28362 | movu [r0 + 1552 * 16], m1 |
| 28363 | movu [r0 + 1553 * 16], m2 |
| 28364 | movu [r0 + 1554 * 16], m1 |
| 28365 | movu [r0 + 1555 * 16], m2 |
| 28366 | movu [r0 + 1556 * 16], m1 |
| 28367 | movu [r0 + 1557 * 16], m2 |
| 28368 | movu [r0 + 1558 * 16], m1 |
| 28369 | movu [r0 + 1559 * 16], m2 |
| 28370 | movu [r0 + 1560 * 16], m1 |
| 28371 | movu [r0 + 1561 * 16], m2 |
| 28372 | movu [r0 + 1562 * 16], m1 |
| 28373 | movu [r0 + 1563 * 16], m2 |
| 28374 | movu [r0 + 1564 * 16], m1 |
| 28375 | movu [r0 + 1565 * 16], m2 |
| 28376 | movu [r0 + 1566 * 16], m1 |
| 28377 | movu [r0 + 1567 * 16], m2 |
| 28378 | |
| 28379 | movu [r0 + 1568 * 16], m1 |
| 28380 | movu [r0 + 1569 * 16], m2 |
| 28381 | movu [r0 + 1570 * 16], m1 |
| 28382 | movu [r0 + 1571 * 16], m2 |
| 28383 | movu [r0 + 1572 * 16], m1 |
| 28384 | movu [r0 + 1573 * 16], m2 |
| 28385 | movu [r0 + 1574 * 16], m1 |
| 28386 | movu [r0 + 1575 * 16], m2 |
| 28387 | movu [r0 + 1576 * 16], m1 |
| 28388 | movu [r0 + 1577 * 16], m2 |
| 28389 | movu [r0 + 1578 * 16], m1 |
| 28390 | movu [r0 + 1579 * 16], m2 |
| 28391 | movu [r0 + 1580 * 16], m1 |
| 28392 | movu [r0 + 1581 * 16], m2 |
| 28393 | movu [r0 + 1582 * 16], m1 |
| 28394 | movu [r0 + 1583 * 16], m2 |
| 28395 | |
| 28396 | movu [r0 + 1584 * 16], m1 |
| 28397 | movu [r0 + 1585 * 16], m2 |
| 28398 | movu [r0 + 1586 * 16], m1 |
| 28399 | movu [r0 + 1587 * 16], m2 |
| 28400 | movu [r0 + 1588 * 16], m1 |
| 28401 | movu [r0 + 1589 * 16], m2 |
| 28402 | movu [r0 + 1590 * 16], m1 |
| 28403 | movu [r0 + 1591 * 16], m2 |
| 28404 | movu [r0 + 1592 * 16], m1 |
| 28405 | movu [r0 + 1593 * 16], m2 |
| 28406 | movu [r0 + 1594 * 16], m1 |
| 28407 | movu [r0 + 1595 * 16], m2 |
| 28408 | movu [r0 + 1596 * 16], m1 |
| 28409 | movu [r0 + 1597 * 16], m2 |
| 28410 | movu [r0 + 1598 * 16], m1 |
| 28411 | movu [r0 + 1599 * 16], m2 |
| 28412 | |
| 28413 | ; mode 27 [row 0] |
| 28414 | movu m6, [r5 + 2 * 16] |
| 28415 | movu m0, [r3 + 1 ] |
| 28416 | movu m1, [r3 + 2 ] |
| 28417 | punpcklbw m0, m1 |
| 28418 | pmaddubsw m4, m0, m6 |
| 28419 | pmulhrsw m4, m7 |
| 28420 | movu m2, [r3 + 9] |
| 28421 | movu m3, [r3 + 10] |
| 28422 | punpcklbw m2, m3 |
| 28423 | pmaddubsw m5, m2, m6 |
| 28424 | pmulhrsw m5, m7 |
| 28425 | packuswb m4, m5 |
| 28426 | movu [r0 + 1600 * 16], m4 |
| 28427 | |
| 28428 | movu m1, [r3 + 17] |
| 28429 | movu m3, [r3 + 18] |
| 28430 | punpcklbw m1, m3 |
| 28431 | pmaddubsw m4, m1, m6 |
| 28432 | pmulhrsw m4, m7 |
| 28433 | movu m3, [r3 + 25] |
| 28434 | movu m5, [r3 + 26] |
| 28435 | punpcklbw m3, m5 |
| 28436 | pmaddubsw m5, m3, m6 |
| 28437 | pmulhrsw m5, m7 |
| 28438 | packuswb m4, m5 |
| 28439 | movu [r0 + 1601 * 16], m4 |
| 28440 | |
| 28441 | ; mode 27 [row 1] |
| 28442 | movu m6, [r5 + 4 * 16] |
| 28443 | pmaddubsw m4, m0, m6 |
| 28444 | pmulhrsw m4, m7 |
| 28445 | pmaddubsw m5, m2, m6 |
| 28446 | pmulhrsw m5, m7 |
| 28447 | packuswb m4, m5 |
| 28448 | movu [r0 + 1602 * 16], m4 |
| 28449 | pmaddubsw m4, m1, m6 |
| 28450 | pmulhrsw m4, m7 |
| 28451 | pmaddubsw m5, m3, m6 |
| 28452 | pmulhrsw m5, m7 |
| 28453 | packuswb m4, m5 |
| 28454 | movu [r0 + 1603 * 16], m4 |
| 28455 | |
| 28456 | ; mode 27 [row 2] |
| 28457 | movu m6, [r5 + 6 * 16] |
| 28458 | pmaddubsw m4, m0, m6 |
| 28459 | pmulhrsw m4, m7 |
| 28460 | pmaddubsw m5, m2, m6 |
| 28461 | pmulhrsw m5, m7 |
| 28462 | packuswb m4, m5 |
| 28463 | movu [r0 + 1604 * 16], m4 |
| 28464 | pmaddubsw m4, m1, m6 |
| 28465 | pmulhrsw m4, m7 |
| 28466 | pmaddubsw m5, m3, m6 |
| 28467 | pmulhrsw m5, m7 |
| 28468 | packuswb m4, m5 |
| 28469 | movu [r0 + 1605 * 16], m4 |
| 28470 | |
| 28471 | ; mode 27 [row 3] |
| 28472 | movu m6, [r5 + 8 * 16] |
| 28473 | pmaddubsw m4, m0, m6 |
| 28474 | pmulhrsw m4, m7 |
| 28475 | pmaddubsw m5, m2, m6 |
| 28476 | pmulhrsw m5, m7 |
| 28477 | packuswb m4, m5 |
| 28478 | movu [r0 + 1606 * 16], m4 |
| 28479 | pmaddubsw m4, m1, m6 |
| 28480 | pmulhrsw m4, m7 |
| 28481 | pmaddubsw m5, m3, m6 |
| 28482 | pmulhrsw m5, m7 |
| 28483 | packuswb m4, m5 |
| 28484 | movu [r0 + 1607 * 16], m4 |
| 28485 | |
| 28486 | ; mode 27 [row 4] |
| 28487 | movu m6, [r5 + 10 * 16] |
| 28488 | pmaddubsw m4, m0, m6 |
| 28489 | pmulhrsw m4, m7 |
| 28490 | pmaddubsw m5, m2, m6 |
| 28491 | pmulhrsw m5, m7 |
| 28492 | packuswb m4, m5 |
| 28493 | movu [r0 + 1608 * 16], m4 |
| 28494 | |
| 28495 | ; mode 28 [row 1 - first half] |
| 28496 | movu [r0 + 1666 * 16], m4 |
| 28497 | |
| 28498 | pmaddubsw m4, m1, m6 |
| 28499 | pmulhrsw m4, m7 |
| 28500 | pmaddubsw m5, m3, m6 |
| 28501 | pmulhrsw m5, m7 |
| 28502 | packuswb m4, m5 |
| 28503 | movu [r0 + 1609 * 16], m4 |
| 28504 | |
| 28505 | ; mode 28 [row 1 - second half] |
| 28506 | movu [r0 + 1667 * 16], m4 |
| 28507 | |
| 28508 | ; mode 27 [row 5] |
| 28509 | movu m6, [r5 + 12 * 16] |
| 28510 | pmaddubsw m4, m0, m6 |
| 28511 | pmulhrsw m4, m7 |
| 28512 | pmaddubsw m5, m2, m6 |
| 28513 | pmulhrsw m5, m7 |
| 28514 | packuswb m4, m5 |
| 28515 | movu [r0 + 1610 * 16], m4 |
| 28516 | |
| 28517 | pmaddubsw m4, m1, m6 |
| 28518 | pmulhrsw m4, m7 |
| 28519 | pmaddubsw m5, m3, m6 |
| 28520 | pmulhrsw m5, m7 |
| 28521 | packuswb m4, m5 |
| 28522 | movu [r0 + 1611 * 16], m4 |
| 28523 | |
| 28524 | ; mode 27 [row 6] |
| 28525 | movu m6, [r5 + 14 * 16] |
| 28526 | pmaddubsw m4, m0, m6 |
| 28527 | pmulhrsw m4, m7 |
| 28528 | pmaddubsw m5, m2, m6 |
| 28529 | pmulhrsw m5, m7 |
| 28530 | packuswb m4, m5 |
| 28531 | movu [r0 + 1612 * 16], m4 |
| 28532 | pmaddubsw m4, m1, m6 |
| 28533 | pmulhrsw m4, m7 |
| 28534 | pmaddubsw m5, m3, m6 |
| 28535 | pmulhrsw m5, m7 |
| 28536 | packuswb m4, m5 |
| 28537 | movu [r0 + 1613 * 16], m4 |
| 28538 | |
| 28539 | ; mode 27 [row 7] |
| 28540 | movu m6, [r5 + 16 * 16] |
| 28541 | pmaddubsw m4, m0, m6 |
| 28542 | pmulhrsw m4, m7 |
| 28543 | pmaddubsw m5, m2, m6 |
| 28544 | pmulhrsw m5, m7 |
| 28545 | packuswb m4, m5 |
| 28546 | movu [r0 + 1614 * 16], m4 |
| 28547 | pmaddubsw m4, m1, m6 |
| 28548 | pmulhrsw m4, m7 |
| 28549 | pmaddubsw m5, m3, m6 |
| 28550 | pmulhrsw m5, m7 |
| 28551 | packuswb m4, m5 |
| 28552 | movu [r0 + 1615 * 16], m4 |
| 28553 | |
| 28554 | ; mode 27 [row 8] |
| 28555 | movu m6, [r5 + 18 * 16] |
| 28556 | pmaddubsw m4, m0, m6 |
| 28557 | pmulhrsw m4, m7 |
| 28558 | pmaddubsw m5, m2, m6 |
| 28559 | pmulhrsw m5, m7 |
| 28560 | packuswb m4, m5 |
| 28561 | movu [r0 + 1616 * 16], m4 |
| 28562 | |
| 28563 | ; mode 29 [row 1 - first half] |
| 28564 | movu [r0 + 1730 * 16], m4 |
| 28565 | |
| 28566 | pmaddubsw m4, m1, m6 |
| 28567 | pmulhrsw m4, m7 |
| 28568 | pmaddubsw m5, m3, m6 |
| 28569 | pmulhrsw m5, m7 |
| 28570 | packuswb m4, m5 |
| 28571 | movu [r0 + 1617 * 16], m4 |
| 28572 | |
| 28573 | ; mode 29 [row 1 - second half] |
| 28574 | movu [r0 + 1731 * 16], m4 |
| 28575 | |
| 28576 | ; mode 27 [row 9] |
| 28577 | movu m6, [r5 + 20 * 16] |
| 28578 | pmaddubsw m4, m0, m6 |
| 28579 | pmulhrsw m4, m7 |
| 28580 | pmaddubsw m5, m2, m6 |
| 28581 | pmulhrsw m5, m7 |
| 28582 | packuswb m4, m5 |
| 28583 | movu [r0 + 1618 * 16], m4 |
| 28584 | |
| 28585 | ; mode 28 [row 3 - first half] |
| 28586 | movu [r0 + 1670 * 16], m4 |
| 28587 | |
| 28588 | pmaddubsw m4, m1, m6 |
| 28589 | pmulhrsw m4, m7 |
| 28590 | pmaddubsw m5, m3, m6 |
| 28591 | pmulhrsw m5, m7 |
| 28592 | packuswb m4, m5 |
| 28593 | movu [r0 + 1619 * 16], m4 |
| 28594 | |
| 28595 | ; mode 28 [row 3 - second half] |
| 28596 | movu [r0 + 1671 * 16], m4 |
| 28597 | |
| 28598 | ; mode 27 [row 10] |
| 28599 | movu m6, [r5 + 22 * 16] |
| 28600 | pmaddubsw m4, m0, m6 |
| 28601 | pmulhrsw m4, m7 |
| 28602 | pmaddubsw m5, m2, m6 |
| 28603 | pmulhrsw m5, m7 |
| 28604 | packuswb m4, m5 |
| 28605 | movu [r0 + 1620 * 16], m4 |
| 28606 | pmaddubsw m4, m1, m6 |
| 28607 | pmulhrsw m4, m7 |
| 28608 | pmaddubsw m5, m3, m6 |
| 28609 | pmulhrsw m5, m7 |
| 28610 | packuswb m4, m5 |
| 28611 | movu [r0 + 1621 * 16], m4 |
| 28612 | |
| 28613 | ; mode 27 [row 11] |
| 28614 | movu m6, [r5 + 24 * 16] |
| 28615 | pmaddubsw m4, m0, m6 |
| 28616 | pmulhrsw m4, m7 |
| 28617 | pmaddubsw m5, m2, m6 |
| 28618 | pmulhrsw m5, m7 |
| 28619 | packuswb m4, m5 |
| 28620 | movu [r0 + 1622 * 16], m4 |
| 28621 | pmaddubsw m4, m1, m6 |
| 28622 | pmulhrsw m4, m7 |
| 28623 | pmaddubsw m5, m3, m6 |
| 28624 | pmulhrsw m5, m7 |
| 28625 | packuswb m4, m5 |
| 28626 | movu [r0 + 1623 * 16], m4 |
| 28627 | |
| 28628 | ; mode 27 [row 12] |
| 28629 | movu m6, [r5 + 26 * 16] |
| 28630 | pmaddubsw m4, m0, m6 |
| 28631 | pmulhrsw m4, m7 |
| 28632 | pmaddubsw m5, m2, m6 |
| 28633 | pmulhrsw m5, m7 |
| 28634 | packuswb m4, m5 |
| 28635 | movu [r0 + 1624 * 16], m4 |
| 28636 | |
| 28637 | ; mode 30 [row 1 - first half] |
| 28638 | movu [r0 + 1794 * 16], m4 |
| 28639 | |
| 28640 | ; mode 33 [row 0 - first half] |
| 28641 | movu [r0 + 1984 * 16], m4 |
| 28642 | |
| 28643 | pmaddubsw m4, m1, m6 |
| 28644 | pmulhrsw m4, m7 |
| 28645 | pmaddubsw m5, m3, m6 |
| 28646 | pmulhrsw m5, m7 |
| 28647 | packuswb m4, m5 |
| 28648 | movu [r0 + 1625 * 16], m4 |
| 28649 | |
| 28650 | ; mode 30 [row 1 - second half] |
| 28651 | movu [r0 + 1795 * 16], m4 |
| 28652 | |
| 28653 | ; mode 33 [row 0 - second half] |
| 28654 | movu [r0 + 1985 * 16], m4 |
| 28655 | |
| 28656 | ; mode 27 [row 13] |
| 28657 | movu m6, [r5 + 28 * 16] |
| 28658 | pmaddubsw m4, m0, m6 |
| 28659 | pmulhrsw m4, m7 |
| 28660 | pmaddubsw m5, m2, m6 |
| 28661 | pmulhrsw m5, m7 |
| 28662 | packuswb m4, m5 |
| 28663 | movu [r0 + 1626 * 16], m4 |
| 28664 | pmaddubsw m4, m1, m6 |
| 28665 | pmulhrsw m4, m7 |
| 28666 | pmaddubsw m5, m3, m6 |
| 28667 | pmulhrsw m5, m7 |
| 28668 | packuswb m4, m5 |
| 28669 | movu [r0 + 1627 * 16], m4 |
| 28670 | |
| 28671 | ; mode 27 [row 14] |
| 28672 | movu m6, [r5 + 30 * 16] |
| 28673 | pmaddubsw m4, m0, m6 |
| 28674 | pmulhrsw m4, m7 |
| 28675 | pmaddubsw m5, m2, m6 |
| 28676 | pmulhrsw m5, m7 |
| 28677 | packuswb m4, m5 |
| 28678 | movu [r0 + 1628 * 16], m4 |
| 28679 | |
| 28680 | ; mode 28 [row 5 - first half] |
| 28681 | movu [r0 + 1674 * 16], m4 |
| 28682 | |
| 28683 | pmaddubsw m4, m1, m6 |
| 28684 | pmulhrsw m4, m7 |
| 28685 | pmaddubsw m5, m3, m6 |
| 28686 | pmulhrsw m5, m7 |
| 28687 | packuswb m4, m5 |
| 28688 | movu [r0 + 1629 * 16], m4 |
| 28689 | |
| 28690 | ; mode 28 [row 5 - second half] |
| 28691 | movu [r0 + 1675 * 16], m4 |
| 28692 | |
| 28693 | ; mode 28 [row 0] |
| 28694 | movu m6, [r5 + 5 * 16] |
| 28695 | pmaddubsw m4, m0, m6 |
| 28696 | pmulhrsw m4, m7 |
| 28697 | pmaddubsw m5, m2, m6 |
| 28698 | pmulhrsw m5, m7 |
| 28699 | packuswb m4, m5 |
| 28700 | movu [r0 + 1664 * 16], m4 |
| 28701 | pmaddubsw m4, m1, m6 |
| 28702 | pmulhrsw m4, m7 |
| 28703 | pmaddubsw m5, m3, m6 |
| 28704 | pmulhrsw m5, m7 |
| 28705 | packuswb m4, m5 |
| 28706 | movu [r0 + 1665 * 16], m4 |
| 28707 | |
| 28708 | ; mode 28 [row 2] |
| 28709 | movu m6, [r5 + 15 * 16] |
| 28710 | pmaddubsw m4, m0, m6 |
| 28711 | pmulhrsw m4, m7 |
| 28712 | pmaddubsw m5, m2, m6 |
| 28713 | pmulhrsw m5, m7 |
| 28714 | packuswb m4, m5 |
| 28715 | movu [r0 + 1668 * 16], m4 |
| 28716 | pmaddubsw m4, m1, m6 |
| 28717 | pmulhrsw m4, m7 |
| 28718 | pmaddubsw m5, m3, m6 |
| 28719 | pmulhrsw m5, m7 |
| 28720 | packuswb m4, m5 |
| 28721 | movu [r0 + 1669 * 16], m4 |
| 28722 | |
| 28723 | ; mode 28 [row 4] |
| 28724 | movu m6, [r5 + 25 * 16] |
| 28725 | pmaddubsw m4, m0, m6 |
| 28726 | pmulhrsw m4, m7 |
| 28727 | pmaddubsw m5, m2, m6 |
| 28728 | pmulhrsw m5, m7 |
| 28729 | packuswb m4, m5 |
| 28730 | movu [r0 + 1672 * 16], m4 |
| 28731 | pmaddubsw m4, m1, m6 |
| 28732 | pmulhrsw m4, m7 |
| 28733 | pmaddubsw m5, m3, m6 |
| 28734 | pmulhrsw m5, m7 |
| 28735 | packuswb m4, m5 |
| 28736 | movu [r0 + 1673 * 16], m4 |
| 28737 | |
| 28738 | ; mode 30 [row 0] |
| 28739 | movu m6, [r5 + 13 * 16] |
| 28740 | pmaddubsw m4, m0, m6 |
| 28741 | pmulhrsw m4, m7 |
| 28742 | pmaddubsw m5, m2, m6 |
| 28743 | pmulhrsw m5, m7 |
| 28744 | packuswb m4, m5 |
| 28745 | movu [r0 + 1792 * 16], m4 |
| 28746 | pmaddubsw m4, m1, m6 |
| 28747 | pmulhrsw m4, m7 |
| 28748 | pmaddubsw m5, m3, m6 |
| 28749 | pmulhrsw m5, m7 |
| 28750 | packuswb m4, m5 |
| 28751 | movu [r0 + 1793 * 16], m4 |
| 28752 | |
| 28753 | ; mode 29 [row 0] |
| 28754 | movu m6, [r5 + 9 * 16] |
| 28755 | pmaddubsw m4, m0, m6 |
| 28756 | pmulhrsw m4, m7 |
| 28757 | pmaddubsw m5, m2, m6 |
| 28758 | pmulhrsw m5, m7 |
| 28759 | packuswb m4, m5 |
| 28760 | movu [r0 + 1728 * 16], m4 |
| 28761 | pmaddubsw m4, m1, m6 |
| 28762 | pmulhrsw m4, m7 |
| 28763 | pmaddubsw m5, m3, m6 |
| 28764 | pmulhrsw m5, m7 |
| 28765 | packuswb m4, m5 |
| 28766 | movu [r0 + 1729 * 16], m4 |
| 28767 | |
| 28768 | ; mode 29 [row 2] |
| 28769 | movu m6, [r5 + 27 * 16] |
| 28770 | pmaddubsw m4, m0, m6 |
| 28771 | pmulhrsw m4, m7 |
| 28772 | pmaddubsw m5, m2, m6 |
| 28773 | pmulhrsw m5, m7 |
| 28774 | packuswb m4, m5 |
| 28775 | movu [r0 + 1732 * 16], m4 |
| 28776 | pmaddubsw m4, m1, m6 |
| 28777 | pmulhrsw m4, m7 |
| 28778 | pmaddubsw m5, m3, m6 |
| 28779 | pmulhrsw m5, m7 |
| 28780 | packuswb m4, m5 |
| 28781 | movu [r0 + 1733 * 16], m4 |
| 28782 | |
| 28783 | ; mode 31 [row 0] |
| 28784 | movu m6, [r5 + 17 * 16] |
| 28785 | pmaddubsw m4, m0, m6 |
| 28786 | pmulhrsw m4, m7 |
| 28787 | pmaddubsw m5, m2, m6 |
| 28788 | pmulhrsw m5, m7 |
| 28789 | packuswb m4, m5 |
| 28790 | movu [r0 + 1856 * 16], m4 |
| 28791 | pmaddubsw m4, m1, m6 |
| 28792 | pmulhrsw m4, m7 |
| 28793 | pmaddubsw m5, m3, m6 |
| 28794 | pmulhrsw m5, m7 |
| 28795 | packuswb m4, m5 |
| 28796 | movu [r0 + 1857 * 16], m4 |
| 28797 | |
| 28798 | ; mode 32 [row 0] |
| 28799 | movu m6, [r5 + 21 * 16] |
| 28800 | pmaddubsw m4, m0, m6 |
| 28801 | pmulhrsw m4, m7 |
| 28802 | pmaddubsw m5, m2, m6 |
| 28803 | pmulhrsw m5, m7 |
| 28804 | packuswb m4, m5 |
| 28805 | movu [r0 + 1920 * 16], m4 |
| 28806 | pmaddubsw m4, m1, m6 |
| 28807 | pmulhrsw m4, m7 |
| 28808 | pmaddubsw m5, m3, m6 |
| 28809 | pmulhrsw m5, m7 |
| 28810 | packuswb m4, m5 |
| 28811 | movu [r0 + 1921 * 16], m4 |
| 28812 | |
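| 28813 | ; mode 27 [row 15] - sources rebuilt starting at [r3 + 2]; from here on m4 (not m3) holds the 4th source and m3 is scratch |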
| 28814 | movu m0, [r3 + 2] |
| 28815 | movd m1, [r3 + 3] |
| 28816 | palignr m1, m0, 1 |
| 28817 | punpcklbw m0, m1 |
| 28818 | movu m2, [r3 + 10] |
| 28819 | movd m3, [r3 + 11] |
| 28820 | palignr m3, m2, 1 |
| 28821 | punpcklbw m2, m3 |
| 28822 | movu m1, [r3 + 18] |
| 28823 | movd m3, [r3 + 19] |
| 28824 | palignr m3, m1, 1 |
| 28825 | punpcklbw m1, m3 |
| 28826 | movu m4, [r3 + 26] |
| 28827 | movd m5, [r3 + 27] |
| 28828 | palignr m5, m4, 1 |
| 28829 | punpcklbw m4, m5 |
| 28830 | |
| 28831 | pshufb m5, m0, [tab_S2] |
| 28832 | movh [r0 + 1630 * 16], m5 |
| 28833 | pshufb m5, m2, [tab_S2] |
| 28834 | movh [r0 + 1630 * 16 + 8], m5 |
| 28835 | pshufb m5, m1, [tab_S2] |
| 28836 | movh [r0 + 1631 * 16], m5 |
| 28837 | pshufb m5, m4, [tab_S2] |
| 28838 | movh [r0 + 1631 * 16 + 8], m5 |
| 28839 | |
| 28840 | ; mode 27 [row 16] |
| 28841 | movu m6, [r5 + 2 * 16] |
| 28842 | pmaddubsw m3, m0, m6 |
| 28843 | pmulhrsw m3, m7 |
| 28844 | pmaddubsw m5, m2, m6 |
| 28845 | pmulhrsw m5, m7 |
| 28846 | packuswb m3, m5 |
| 28847 | movu [r0 + 1632 * 16], m3 |
| 28848 | |
| 28849 | ; mode 31 [row 1 - first half] |
| 28850 | movu [r0 + 1858 * 16], m3 |
| 28851 | |
| 28852 | pmaddubsw m3, m1, m6 |
| 28853 | pmulhrsw m3, m7 |
| 28854 | pmaddubsw m5, m4, m6 |
| 28855 | pmulhrsw m5, m7 |
| 28856 | packuswb m3, m5 |
| 28857 | movu [r0 + 1633 * 16], m3 |
| 28858 | |
| 28859 | ; mode 31 [row 1 - second half] |
| 28860 | movu [r0 + 1859 * 16], m3 |
| 28861 | |
| 28862 | ; mode 27 [row 17] |
| 28863 | movu m6, [r5 + 4 * 16] |
| 28864 | pmaddubsw m3, m0, m6 |
| 28865 | pmulhrsw m3, m7 |
| 28866 | pmaddubsw m5, m2, m6 |
| 28867 | pmulhrsw m5, m7 |
| 28868 | packuswb m3, m5 |
| 28869 | movu [r0 + 1634 * 16], m3 |
| 28870 | |
| 28871 | ; mode 29 [row 3 - first half] |
| 28872 | movu [r0 + 1734 * 16], m3 |
| 28873 | |
| 28874 | pmaddubsw m3, m1, m6 |
| 28875 | pmulhrsw m3, m7 |
| 28876 | pmaddubsw m5, m4, m6 |
| 28877 | pmulhrsw m5, m7 |
| 28878 | packuswb m3, m5 |
| 28879 | movu [r0 + 1635 * 16], m3 |
| 28880 | |
| 28881 | ; mode 29 [row 3 - second half] |
| 28882 | movu [r0 + 1735 * 16], m3 |
| 28883 | |
| 28884 | ; mode 27 [row 18] |
| 28885 | movu m6, [r5 + 6 * 16] |
| 28886 | pmaddubsw m3, m0, m6 |
| 28887 | pmulhrsw m3, m7 |
| 28888 | pmaddubsw m5, m2, m6 |
| 28889 | pmulhrsw m5, m7 |
| 28890 | packuswb m3, m5 |
| 28891 | movu [r0 + 1636 * 16], m3 |
| 28892 | pmaddubsw m3, m1, m6 |
| 28893 | pmulhrsw m3, m7 |
| 28894 | pmaddubsw m5, m4, m6 |
| 28895 | pmulhrsw m5, m7 |
| 28896 | packuswb m3, m5 |
| 28897 | movu [r0 + 1637 * 16], m3 |
| 28898 | |
| 28899 | ; mode 27 [row 19] |
| 28900 | movu m6, [r5 + 8 * 16] |
| 28901 | pmaddubsw m3, m0, m6 |
| 28902 | pmulhrsw m3, m7 |
| 28903 | pmaddubsw m5, m2, m6 |
| 28904 | pmulhrsw m5, m7 |
| 28905 | packuswb m3, m5 |
| 28906 | movu [r0 + 1638 * 16], m3 |
| 28907 | |
| 28908 | ; mode 28 [row 7 - first half] |
| 28909 | movu [r0 + 1678 * 16], m3 |
| 28910 | |
| 28911 | pmaddubsw m3, m1, m6 |
| 28912 | pmulhrsw m3, m7 |
| 28913 | pmaddubsw m5, m4, m6 |
| 28914 | pmulhrsw m5, m7 |
| 28915 | packuswb m3, m5 |
| 28916 | movu [r0 + 1639 * 16], m3 |
| 28917 | |
| 28918 | ; mode 28 [row 7 - second half] |
| 28919 | movu [r0 + 1679 * 16], m3 |
| 28920 | |
| 28921 | ; mode 27 [row 20] |
| 28922 | movu m6, [r5 + 10 * 16] |
| 28923 | pmaddubsw m3, m0, m6 |
| 28924 | pmulhrsw m3, m7 |
| 28925 | pmaddubsw m5, m2, m6 |
| 28926 | pmulhrsw m5, m7 |
| 28927 | packuswb m3, m5 |
| 28928 | movu [r0 + 1640 * 16], m3 |
| 28929 | |
| 28930 | ; mode 32 [row 1 - first half] |
| 28931 | movu [r0 + 1922 * 16], m3 |
| 28932 | |
| 28933 | pmaddubsw m3, m1, m6 |
| 28934 | pmulhrsw m3, m7 |
| 28935 | pmaddubsw m5, m4, m6 |
| 28936 | pmulhrsw m5, m7 |
| 28937 | packuswb m3, m5 |
| 28938 | movu [r0 + 1641 * 16], m3 |
| 28939 | |
| 28940 | ; mode 32 [row 1 - second half] |
| 28941 | movu [r0 + 1923 * 16], m3 |
| 28942 | |
| 28943 | ; mode 27 [row 21] |
| 28944 | movu m6, [r5 + 12 * 16] |
| 28945 | pmaddubsw m3, m0, m6 |
| 28946 | pmulhrsw m3, m7 |
| 28947 | pmaddubsw m5, m2, m6 |
| 28948 | pmulhrsw m5, m7 |
| 28949 | packuswb m3, m5 |
| 28950 | movu [r0 + 1642 * 16], m3 |
| 28951 | pmaddubsw m3, m1, m6 |
| 28952 | pmulhrsw m3, m7 |
| 28953 | pmaddubsw m5, m4, m6 |
| 28954 | pmulhrsw m5, m7 |
| 28955 | packuswb m3, m5 |
| 28956 | movu [r0 + 1643 * 16], m3 |
| 28957 | |
| 28958 | ; mode 27 [row 22] |
| 28959 | movu m6, [r5 + 14 * 16] |
| 28960 | pmaddubsw m3, m0, m6 |
| 28961 | pmulhrsw m3, m7 |
| 28962 | pmaddubsw m5, m2, m6 |
| 28963 | pmulhrsw m5, m7 |
| 28964 | packuswb m3, m5 |
| 28965 | movu [r0 + 1644 * 16], m3 |
| 28966 | pmaddubsw m3, m1, m6 |
| 28967 | pmulhrsw m3, m7 |
| 28968 | pmaddubsw m5, m4, m6 |
| 28969 | pmulhrsw m5, m7 |
| 28970 | packuswb m3, m5 |
| 28971 | movu [r0 + 1645 * 16], m3 |
| 28972 | |
| 28973 | ; mode 27 [row 23] |
| 28974 | movu m6, [r5 + 16 * 16] |
| 28975 | pmaddubsw m3, m0, m6 |
| 28976 | pmulhrsw m3, m7 |
| 28977 | pmaddubsw m5, m2, m6 |
| 28978 | pmulhrsw m5, m7 |
| 28979 | packuswb m3, m5 |
| 28980 | movu [r0 + 1646 * 16], m3 |
| 28981 | pmaddubsw m3, m1, m6 |
| 28982 | pmulhrsw m3, m7 |
| 28983 | pmaddubsw m5, m4, m6 |
| 28984 | pmulhrsw m5, m7 |
| 28985 | packuswb m3, m5 |
| 28986 | movu [r0 + 1647 * 16], m3 |
| 28987 | |
| 28988 | ; mode 27 [row 24] |
| 28989 | movu m6, [r5 + 18 * 16] |
| 28990 | pmaddubsw m3, m0, m6 |
| 28991 | pmulhrsw m3, m7 |
| 28992 | pmaddubsw m5, m2, m6 |
| 28993 | pmulhrsw m5, m7 |
| 28994 | packuswb m3, m5 |
| 28995 | movu [r0 + 1648 * 16], m3 |
| 28996 | |
| 28997 | ; mode 28 [row 9 - first half] |
| 28998 | movu [r0 + 1682 * 16], m3 |
| 28999 | |
| 29000 | pmaddubsw m3, m1, m6 |
| 29001 | pmulhrsw m3, m7 |
| 29002 | pmaddubsw m5, m4, m6 |
| 29003 | pmulhrsw m5, m7 |
| 29004 | packuswb m3, m5 |
| 29005 | movu [r0 + 1649 * 16], m3 |
| 29006 | |
| 29007 | ; mode 28 [row 9 - second half] |
| 29008 | movu [r0 + 1683 * 16], m3 |
| 29009 | |
| 29010 | ; mode 27 [row 25] |
| 29011 | movu m6, [r5 + 20 * 16] |
| 29012 | pmaddubsw m3, m0, m6 |
| 29013 | pmulhrsw m3, m7 |
| 29014 | pmaddubsw m5, m2, m6 |
| 29015 | pmulhrsw m5, m7 |
| 29016 | packuswb m3, m5 |
| 29017 | movu [r0 + 1650 * 16], m3 |
| 29018 | |
| 29019 | ; mode 30 [row 3 - first half] |
| 29020 | movu [r0 + 1798 * 16], m3 |
| 29021 | |
| 29022 | ; mode 33 [row 1 - first half] |
| 29023 | movu [r0 + 1986 * 16], m3 |
| 29024 | |
| 29025 | pmaddubsw m3, m1, m6 |
| 29026 | pmulhrsw m3, m7 |
| 29027 | pmaddubsw m5, m4, m6 |
| 29028 | pmulhrsw m5, m7 |
| 29029 | packuswb m3, m5 |
| 29030 | movu [r0 + 1651 * 16], m3 |
| 29031 | |
| 29032 | ; mode 30 [row 3 - second half] |
| 29033 | movu [r0 + 1799 * 16], m3 |
| 29034 | |
| 29035 | ; mode 33 [row 1 - second half] |
| 29036 | movu [r0 + 1987 * 16], m3 |
| 29037 | |
| 29038 | ; mode 27 [row 26] |
| 29039 | movu m6, [r5 + 22 * 16] |
| 29040 | pmaddubsw m3, m0, m6 |
| 29041 | pmulhrsw m3, m7 |
| 29042 | pmaddubsw m5, m2, m6 |
| 29043 | pmulhrsw m5, m7 |
| 29044 | packuswb m3, m5 |
| 29045 | movu [r0 + 1652 * 16], m3 |
| 29046 | |
| 29047 | ; mode 29 [row 5 - first half] |
| 29048 | movu [r0 + 1738 * 16], m3 |
| 29049 | |
| 29050 | pmaddubsw m3, m1, m6 |
| 29051 | pmulhrsw m3, m7 |
| 29052 | pmaddubsw m5, m4, m6 |
| 29053 | pmulhrsw m5, m7 |
| 29054 | packuswb m3, m5 |
| 29055 | movu [r0 + 1653 * 16], m3 |
| 29056 | |
| 29057 | ; mode 29 [row 5 - second half] |
| 29058 | movu [r0 + 1739 * 16], m3 |
| 29059 | |
| 29060 | ; mode 27 [row 27] |
| 29061 | movu m6, [r5 + 24 * 16] |
| 29062 | pmaddubsw m3, m0, m6 |
| 29063 | pmulhrsw m3, m7 |
| 29064 | pmaddubsw m5, m2, m6 |
| 29065 | pmulhrsw m5, m7 |
| 29066 | packuswb m3, m5 |
| 29067 | movu [r0 + 1654 * 16], m3 |
| 29068 | pmaddubsw m3, m1, m6 |
| 29069 | pmulhrsw m3, m7 |
| 29070 | pmaddubsw m5, m4, m6 |
| 29071 | pmulhrsw m5, m7 |
| 29072 | packuswb m3, m5 |
| 29073 | movu [r0 + 1655 * 16], m3 |
| 29074 | |
| 29075 | ; mode 27 [row 28] |
| 29076 | movu m6, [r5 + 26 * 16] |
| 29077 | pmaddubsw m3, m0, m6 |
| 29078 | pmulhrsw m3, m7 |
| 29079 | pmaddubsw m5, m2, m6 |
| 29080 | pmulhrsw m5, m7 |
| 29081 | packuswb m3, m5 |
| 29082 | movu [r0 + 1656 * 16], m3 |
| 29083 | pmaddubsw m3, m1, m6 |
| 29084 | pmulhrsw m3, m7 |
| 29085 | pmaddubsw m5, m4, m6 |
| 29086 | pmulhrsw m5, m7 |
| 29087 | packuswb m3, m5 |
| 29088 | movu [r0 + 1657 * 16], m3 |
| 29089 | |
| 29090 | ; mode 27 [row 29] |
| 29091 | movu m6, [r5 + 28 * 16] |
| 29092 | pmaddubsw m3, m0, m6 |
| 29093 | pmulhrsw m3, m7 |
| 29094 | pmaddubsw m5, m2, m6 |
| 29095 | pmulhrsw m5, m7 |
| 29096 | packuswb m3, m5 |
| 29097 | movu [r0 + 1658 * 16], m3 |
| 29098 | |
| 29099 | ; mode 28 [row 11 - first half] |
| 29100 | movu [r0 + 1686 * 16], m3 |
| 29101 | |
| 29102 | pmaddubsw m3, m1, m6 |
| 29103 | pmulhrsw m3, m7 |
| 29104 | pmaddubsw m5, m4, m6 |
| 29105 | pmulhrsw m5, m7 |
| 29106 | packuswb m3, m5 |
| 29107 | movu [r0 + 1659 * 16], m3 |
| 29108 | |
| 29109 | ; mode 28 [row 11 - second half] |
| 29110 | movu [r0 + 1687 * 16], m3 |
| 29111 | |
| 29112 | ; mode 27 [row 30] |
| 29113 | movu m6, [r5 + 30 * 16] |
| 29114 | pmaddubsw m3, m0, m6 |
| 29115 | pmulhrsw m3, m7 |
| 29116 | pmaddubsw m5, m2, m6 |
| 29117 | pmulhrsw m5, m7 |
| 29118 | packuswb m3, m5 |
| 29119 | movu [r0 + 1660 * 16], m3 |
| 29120 | pmaddubsw m3, m1, m6 |
| 29121 | pmulhrsw m3, m7 |
| 29122 | pmaddubsw m5, m4, m6 |
| 29123 | pmulhrsw m5, m7 |
| 29124 | packuswb m3, m5 |
| 29125 | movu [r0 + 1661 * 16], m3 |
| 29126 | |
| 29127 | ; mode 28 [row 6] |
| 29128 | movu m6, [r5 + 3 * 16] |
| 29129 | pmaddubsw m3, m0, m6 |
| 29130 | pmulhrsw m3, m7 |
| 29131 | pmaddubsw m5, m2, m6 |
| 29132 | pmulhrsw m5, m7 |
| 29133 | packuswb m3, m5 |
| 29134 | movu [r0 + 1676 * 16], m3 |
| 29135 | pmaddubsw m3, m1, m6 |
| 29136 | pmulhrsw m3, m7 |
| 29137 | pmaddubsw m5, m4, m6 |
| 29138 | pmulhrsw m5, m7 |
| 29139 | packuswb m3, m5 |
| 29140 | movu [r0 + 1677 * 16], m3 |
| 29141 | |
| 29142 | ; mode 28 [row 8] |
| 29143 | movu m6, [r5 + 13 * 16] |
| 29144 | pmaddubsw m3, m0, m6 |
| 29145 | pmulhrsw m3, m7 |
| 29146 | pmaddubsw m5, m2, m6 |
| 29147 | pmulhrsw m5, m7 |
| 29148 | packuswb m3, m5 |
| 29149 | movu [r0 + 1680 * 16], m3 |
| 29150 | |
| 29151 | ; mode 29 [row 4 - first half] |
| 29152 | movu [r0 + 1736 * 16], m3 |
| 29153 | |
| 29154 | pmaddubsw m3, m1, m6 |
| 29155 | pmulhrsw m3, m7 |
| 29156 | pmaddubsw m5, m4, m6 |
| 29157 | pmulhrsw m5, m7 |
| 29158 | packuswb m3, m5 |
| 29159 | movu [r0 + 1681 * 16], m3 |
| 29160 | |
| 29161 | ; mode 29 [row 4 - second half] |
| 29162 | movu [r0 + 1737 * 16], m3 |
| 29163 | |
| 29164 | ; mode 28 [row 10] |
| 29165 | movu m6, [r5 + 23 * 16] |
| 29166 | pmaddubsw m3, m0, m6 |
| 29167 | pmulhrsw m3, m7 |
| 29168 | pmaddubsw m5, m2, m6 |
| 29169 | pmulhrsw m5, m7 |
| 29170 | packuswb m3, m5 |
| 29171 | movu [r0 + 1684 * 16], m3 |
| 29172 | pmaddubsw m3, m1, m6 |
| 29173 | pmulhrsw m3, m7 |
| 29174 | pmaddubsw m5, m4, m6 |
| 29175 | pmulhrsw m5, m7 |
| 29176 | packuswb m3, m5 |
| 29177 | movu [r0 + 1685 * 16], m3 |
| 29178 | |
| 29179 | ; mode 29 [row 6] |
| 29180 | movu m6, [r5 + 31 * 16] |
| 29181 | pmaddubsw m3, m0, m6 |
| 29182 | pmulhrsw m3, m7 |
| 29183 | pmaddubsw m5, m2, m6 |
| 29184 | pmulhrsw m5, m7 |
| 29185 | packuswb m3, m5 |
| 29186 | movu [r0 + 1740 * 16], m3 |
| 29187 | |
| 29188 | ; mode 32 [row 2 - first half] |
| 29189 | movu [r0 + 1924 * 16], m3 |
| 29190 | |
| 29191 | pmaddubsw m3, m1, m6 |
| 29192 | pmulhrsw m3, m7 |
| 29193 | pmaddubsw m5, m4, m6 |
| 29194 | pmulhrsw m5, m7 |
| 29195 | packuswb m3, m5 |
| 29196 | movu [r0 + 1741 * 16], m3 |
| 29197 | |
| 29198 | ; mode 32 [row 2 - second half] |
| 29199 | movu [r0 + 1925 * 16], m3 |
| 29200 | |
| 29201 | ; mode 30 [row 2] |
| 29202 | movu m6, [r5 + 7 * 16] |
| 29203 | pmaddubsw m3, m0, m6 |
| 29204 | pmulhrsw m3, m7 |
| 29205 | pmaddubsw m5, m2, m6 |
| 29206 | pmulhrsw m5, m7 |
| 29207 | packuswb m3, m5 |
| 29208 | movu [r0 + 1796 * 16], m3 |
| 29209 | pmaddubsw m3, m1, m6 |
| 29210 | pmulhrsw m3, m7 |
| 29211 | pmaddubsw m5, m4, m6 |
| 29212 | pmulhrsw m5, m7 |
| 29213 | packuswb m3, m5 |
| 29214 | movu [r0 + 1797 * 16], m3 |
| 29215 | |
| 29216 | ; mode 31 [row 2] |
| 29217 | movu m6, [r5 + 19 * 16] |
| 29218 | pmaddubsw m3, m0, m6 |
| 29219 | pmulhrsw m3, m7 |
| 29220 | pmaddubsw m5, m2, m6 |
| 29221 | pmulhrsw m5, m7 |
| 29222 | packuswb m3, m5 |
| 29223 | movu [r0 + 1860 * 16], m3 |
| 29224 | pmaddubsw m3, m1, m6 |
| 29225 | pmulhrsw m3, m7 |
| 29226 | pmaddubsw m5, m4, m6 |
| 29227 | pmulhrsw m5, m7 |
| 29228 | packuswb m3, m5 |
| 29229 | movu [r0 + 1861 * 16], m3 |
| 29230 | |
| 29231 | ; mode 27 [row 31] |
| 29232 | movu m0, [r3 + 3] |
| 29233 | movd m1, [r3 + 4] |
| 29234 | palignr m1, m0, 1 |
| 29235 | punpcklbw m0, m1 |
| 29236 | movu m2, [r3 + 11] |
| 29237 | movd m3, [r3 + 12] |
| 29238 | palignr m3, m2, 1 |
| 29239 | punpcklbw m2, m3 |
| 29240 | movu m1, [r3 + 19] |
| 29241 | movd m3, [r3 + 20] |
| 29242 | palignr m3, m1, 1 |
| 29243 | punpcklbw m1, m3 |
| 29244 | movu m4, [r3 + 27] |
| 29245 | movd m5, [r3 + 28] |
| 29246 | palignr m5, m4, 1 |
| 29247 | punpcklbw m4, m5 |
| 29248 | |
| 29249 | pshufb m5, m0, [tab_S2] |
| 29250 | movh [r0 + 1662 * 16], m5 |
| 29251 | pshufb m5, m2, [tab_S2] |
| 29252 | movh [r0 + 1662 * 16 + 8], m5 |
| 29253 | pshufb m5, m1, [tab_S2] |
| 29254 | movh [r0 + 1663 * 16], m5 |
| 29255 | pshufb m5, m4, [tab_S2] |
| 29256 | movh [r0 + 1663 * 16 + 8], m5 |
| 29257 | |
| 29258 | ; mode 28 [row 12] |
| 29259 | movu m6, [r5 + 1 * 16] |
| 29260 | pmaddubsw m3, m0, m6 |
| 29261 | pmulhrsw m3, m7 |
| 29262 | pmaddubsw m5, m2, m6 |
| 29263 | pmulhrsw m5, m7 |
| 29264 | packuswb m3, m5 |
| 29265 | movu [r0 + 1688 * 16], m3 |
| 29266 | |
| 29267 | ; mode 30 [row 4 - first half] |
| 29268 | movu [r0 + 1800 * 16], m3 |
| 29269 | |
| 29270 | pmaddubsw m3, m1, m6 |
| 29271 | pmulhrsw m3, m7 |
| 29272 | pmaddubsw m5, m4, m6 |
| 29273 | pmulhrsw m5, m7 |
| 29274 | packuswb m3, m5 |
| 29275 | movu [r0 + 1689 * 16], m3 |
| 29276 | |
| 29277 | ; mode 30 [row 4 - second half] |
| 29278 | movu [r0 + 1801 * 16], m3 |
| 29279 | |
| 29280 | ; mode 28 [row 13] |
| 29281 | movu m6, [r5 + 6 * 16] |
| 29282 | pmaddubsw m3, m0, m6 |
| 29283 | pmulhrsw m3, m7 |
| 29284 | pmaddubsw m5, m2, m6 |
| 29285 | pmulhrsw m5, m7 |
| 29286 | packuswb m3, m5 |
| 29287 | movu [r0 + 1690 * 16], m3 |
| 29288 | pmaddubsw m3, m1, m6 |
| 29289 | pmulhrsw m3, m7 |
| 29290 | pmaddubsw m5, m4, m6 |
| 29291 | pmulhrsw m5, m7 |
| 29292 | packuswb m3, m5 |
| 29293 | movu [r0 + 1691 * 16], m3 |
| 29294 | |
| 29295 | ; mode 28 [row 14] |
| 29296 | movu m6, [r5 + 11 * 16] |
| 29297 | pmaddubsw m3, m0, m6 |
| 29298 | pmulhrsw m3, m7 |
| 29299 | pmaddubsw m5, m2, m6 |
| 29300 | pmulhrsw m5, m7 |
| 29301 | packuswb m3, m5 |
| 29302 | movu [r0 + 1692 * 16], m3 |
| 29303 | pmaddubsw m3, m1, m6 |
| 29304 | pmulhrsw m3, m7 |
| 29305 | pmaddubsw m5, m4, m6 |
| 29306 | pmulhrsw m5, m7 |
| 29307 | packuswb m3, m5 |
| 29308 | movu [r0 + 1693 * 16], m3 |
| 29309 | |
| 29310 | ; mode 28 [row 15] |
| 29311 | movu m6, [r5 + 16 * 16] |
| 29312 | pmaddubsw m3, m0, m6 |
| 29313 | pmulhrsw m3, m7 |
| 29314 | pmaddubsw m5, m2, m6 |
| 29315 | pmulhrsw m5, m7 |
| 29316 | packuswb m3, m5 |
| 29317 | movu [r0 + 1694 * 16], m3 |
| 29318 | pmaddubsw m3, m1, m6 |
| 29319 | pmulhrsw m3, m7 |
| 29320 | pmaddubsw m5, m4, m6 |
| 29321 | pmulhrsw m5, m7 |
| 29322 | packuswb m3, m5 |
| 29323 | movu [r0 + 1695 * 16], m3 |
| 29324 | |
| 29325 | ; mode 28 [row 16] |
| 29326 | movu m6, [r5 + 21 * 16] |
| 29327 | pmaddubsw m3, m0, m6 |
| 29328 | pmulhrsw m3, m7 |
| 29329 | pmaddubsw m5, m2, m6 |
| 29330 | pmulhrsw m5, m7 |
| 29331 | packuswb m3, m5 |
| 29332 | movu [r0 + 1696 * 16], m3 |
| 29333 | |
| 29334 | ; mode 31 [row 4 - first half] |
| 29335 | movu [r0 + 1864 * 16], m3 |
| 29336 | |
| 29337 | pmaddubsw m3, m1, m6 |
| 29338 | pmulhrsw m3, m7 |
| 29339 | pmaddubsw m5, m4, m6 |
| 29340 | pmulhrsw m5, m7 |
| 29341 | packuswb m3, m5 |
| 29342 | movu [r0 + 1697 * 16], m3 |
| 29343 | |
| 29344 | ; mode 31 [row 4 - second half] |
| 29345 | movu [r0 + 1865 * 16], m3 |
| 29346 | |
| 29347 | ; mode 28 [row 17] |
| 29348 | movu m6, [r5 + 26 * 16] |
| 29349 | pmaddubsw m3, m0, m6 |
| 29350 | pmulhrsw m3, m7 |
| 29351 | pmaddubsw m5, m2, m6 |
| 29352 | pmulhrsw m5, m7 |
| 29353 | packuswb m3, m5 |
| 29354 | movu [r0 + 1698 * 16], m3 |
| 29355 | |
| 29356 | ; mode 29 [row 9 - first half] |
| 29357 | movu [r0 + 1746 * 16], m3 |
| 29358 | |
| 29359 | pmaddubsw m3, m1, m6 |
| 29360 | pmulhrsw m3, m7 |
| 29361 | pmaddubsw m5, m4, m6 |
| 29362 | pmulhrsw m5, m7 |
| 29363 | packuswb m3, m5 |
| 29364 | movu [r0 + 1699 * 16], m3 |
| 29365 | |
| 29366 | ; mode 29 [row 9 - second half] |
| 29367 | movu [r0 + 1747 * 16], m3 |
| 29368 | |
| 29369 | ; mode 28 [row 18] |
| 29370 | movu m6, [r5 + 31 * 16] |
| 29371 | pmaddubsw m3, m0, m6 |
| 29372 | pmulhrsw m3, m7 |
| 29373 | pmaddubsw m5, m2, m6 |
| 29374 | pmulhrsw m5, m7 |
| 29375 | packuswb m3, m5 |
| 29376 | movu [r0 + 1700 * 16], m3 |
| 29377 | pmaddubsw m3, m1, m6 |
| 29378 | pmulhrsw m3, m7 |
| 29379 | pmaddubsw m5, m4, m6 |
| 29380 | pmulhrsw m5, m7 |
| 29381 | packuswb m3, m5 |
| 29382 | movu [r0 + 1701 * 16], m3 |
| 29383 | |
| 29384 | ; mode 29 [row 7] |
| 29385 | movu m6, [r5 + 8 * 16] |
| 29386 | pmaddubsw m3, m0, m6 |
| 29387 | pmulhrsw m3, m7 |
| 29388 | pmaddubsw m5, m2, m6 |
| 29389 | pmulhrsw m5, m7 |
| 29390 | packuswb m3, m5 |
| 29391 | movu [r0 + 1742 * 16], m3 |
| 29392 | pmaddubsw m3, m1, m6 |
| 29393 | pmulhrsw m3, m7 |
| 29394 | pmaddubsw m5, m4, m6 |
| 29395 | pmulhrsw m5, m7 |
| 29396 | packuswb m3, m5 |
| 29397 | movu [r0 + 1743 * 16], m3 |
| 29398 | |
| 29399 | ; mode 29 [row 8] |
| 29400 | movu m6, [r5 + 17 * 16] |
| 29401 | pmaddubsw m3, m0, m6 |
| 29402 | pmulhrsw m3, m7 |
| 29403 | pmaddubsw m5, m2, m6 |
| 29404 | pmulhrsw m5, m7 |
| 29405 | packuswb m3, m5 |
| 29406 | movu [r0 + 1744 * 16], m3 |
| 29407 | pmaddubsw m3, m1, m6 |
| 29408 | pmulhrsw m3, m7 |
| 29409 | pmaddubsw m5, m4, m6 |
| 29410 | pmulhrsw m5, m7 |
| 29411 | packuswb m3, m5 |
| 29412 | movu [r0 + 1745 * 16], m3 |
| 29413 | |
| 29414 | ; mode 30 [row 5] |
| 29415 | movu m6, [r5 + 14 * 16] |
| 29416 | pmaddubsw m3, m0, m6 |
| 29417 | pmulhrsw m3, m7 |
| 29418 | pmaddubsw m5, m2, m6 |
| 29419 | pmulhrsw m5, m7 |
| 29420 | packuswb m3, m5 |
| 29421 | movu [r0 + 1802 * 16], m3 |
| 29422 | |
| 29423 | ; mode 33 [row 2 - first half] |
| 29424 | movu [r0 + 1988 * 16], m3 |
| 29425 | |
| 29426 | pmaddubsw m3, m1, m6 |
| 29427 | pmulhrsw m3, m7 |
| 29428 | pmaddubsw m5, m4, m6 |
| 29429 | pmulhrsw m5, m7 |
| 29430 | packuswb m3, m5 |
| 29431 | movu [r0 + 1803 * 16], m3 |
| 29432 | |
| 29433 | ; mode 33 [row 2 - second half] |
| 29434 | movu [r0 + 1989 * 16], m3 |
| 29435 | |
| 29436 | ; mode 30 [row 6] |
| 29437 | movu m6, [r5 + 27 * 16] |
| 29438 | pmaddubsw m3, m0, m6 |
| 29439 | pmulhrsw m3, m7 |
| 29440 | pmaddubsw m5, m2, m6 |
| 29441 | pmulhrsw m5, m7 |
| 29442 | packuswb m3, m5 |
| 29443 | movu [r0 + 1804 * 16], m3 |
| 29444 | pmaddubsw m3, m1, m6 |
| 29445 | pmulhrsw m3, m7 |
| 29446 | pmaddubsw m5, m4, m6 |
| 29447 | pmulhrsw m5, m7 |
| 29448 | packuswb m3, m5 |
| 29449 | movu [r0 + 1805 * 16], m3 |
| 29450 | |
| 29451 | ; mode 31 [row 3] |
| 29452 | movu m6, [r5 + 4 * 16] |
| 29453 | pmaddubsw m3, m0, m6 |
| 29454 | pmulhrsw m3, m7 |
| 29455 | pmaddubsw m5, m2, m6 |
| 29456 | pmulhrsw m5, m7 |
| 29457 | packuswb m3, m5 |
| 29458 | movu [r0 + 1862 * 16], m3 |
| 29459 | pmaddubsw m3, m1, m6 |
| 29460 | pmulhrsw m3, m7 |
| 29461 | pmaddubsw m5, m4, m6 |
| 29462 | pmulhrsw m5, m7 |
| 29463 | packuswb m3, m5 |
| 29464 | movu [r0 + 1863 * 16], m3 |
| 29465 | |
| 29466 | ; mode 32 [row 3] |
| 29467 | movu m6, [r5 + 20 * 16] |
| 29468 | pmaddubsw m3, m0, m6 |
| 29469 | pmulhrsw m3, m7 |
| 29470 | pmaddubsw m5, m2, m6 |
| 29471 | pmulhrsw m5, m7 |
| 29472 | packuswb m3, m5 |
| 29473 | movu [r0 + 1926 * 16], m3 |
| 29474 | pmaddubsw m3, m1, m6 |
| 29475 | pmulhrsw m3, m7 |
| 29476 | pmaddubsw m5, m4, m6 |
| 29477 | pmulhrsw m5, m7 |
| 29478 | packuswb m3, m5 |
| 29479 | movu [r0 + 1927 * 16], m3 |
| 29480 | |
| 29481 | ; mode 28 [row 19] |
| 29482 | movu m6, [r5 + 4 * 16] |
| 29483 | movu m0, [r3 + 4] |
| 29484 | movd m1, [r3 + 5] |
| 29485 | palignr m1, m0, 1 |
| 29486 | punpcklbw m0, m1 |
| 29487 | pmaddubsw m3, m0, m6 |
| 29488 | pmulhrsw m3, m7 |
| 29489 | movu m2, [r3 + 12] |
| 29490 | movd m4, [r3 + 13] |
| 29491 | palignr m4, m2, 1 |
| 29492 | punpcklbw m2, m4 |
| 29493 | pmaddubsw m5, m2, m6 |
| 29494 | pmulhrsw m5, m7 |
| 29495 | packuswb m3, m5 |
| 29496 | movu [r0 + 1702 * 16], m3 |
| 29497 | |
| 29498 | movu m1, [r3 + 20] |
| 29499 | movd m3, [r3 + 21] |
| 29500 | palignr m3, m1, 1 |
| 29501 | punpcklbw m1, m3 |
| 29502 | pmaddubsw m3, m1, m6 |
| 29503 | pmulhrsw m3, m7 |
| 29504 | movu m4, [r3 + 28] |
| 29505 | movd m5, [r3 + 29] |
| 29506 | palignr m5, m4, 1 |
| 29507 | punpcklbw m4, m5 |
| 29508 | pmaddubsw m5, m4, m6 |
| 29509 | pmulhrsw m5, m7 |
| 29510 | packuswb m3, m5 |
| 29511 | movu [r0 + 1703 * 16], m3 |
| 29512 | |
| 29513 | ; mode 28 [row 20] |
| 29514 | movu m6, [r5 + 9 * 16] |
| 29515 | pmaddubsw m3, m0, m6 |
| 29516 | pmulhrsw m3, m7 |
| 29517 | pmaddubsw m5, m2, m6 |
| 29518 | pmulhrsw m5, m7 |
| 29519 | packuswb m3, m5 |
| 29520 | movu [r0 + 1704 * 16], m3 |
| 29521 | |
| 29522 | ; mode 32 [row 4 - first half] |
| 29523 | movu [r0 + 1928 * 16], m3 |
| 29524 | |
| 29525 | pmaddubsw m3, m1, m6 |
| 29526 | pmulhrsw m3, m7 |
| 29527 | pmaddubsw m5, m4, m6 |
| 29528 | pmulhrsw m5, m7 |
| 29529 | packuswb m3, m5 |
| 29530 | movu [r0 + 1705 * 16], m3 |
| 29531 | |
| 29532 | ; mode 32 [row 4 - second half] |
| 29533 | movu [r0 + 1929 * 16], m3 |
| 29534 | |
| 29535 | ; mode 28 [row 21] |
| 29536 | movu m6, [r5 + 14 * 16] |
| 29537 | pmaddubsw m3, m0, m6 |
| 29538 | pmulhrsw m3, m7 |
| 29539 | pmaddubsw m5, m2, m6 |
| 29540 | pmulhrsw m5, m7 |
| 29541 | packuswb m3, m5 |
| 29542 | movu [r0 + 1706 * 16], m3 |
| 29543 | pmaddubsw m3, m1, m6 |
| 29544 | pmulhrsw m3, m7 |
| 29545 | pmaddubsw m5, m4, m6 |
| 29546 | pmulhrsw m5, m7 |
| 29547 | packuswb m3, m5 |
| 29548 | movu [r0 + 1707 * 16], m3 |
| 29549 | |
| 29550 | ; mode 28 [row 22] |
| 29551 | movu m6, [r5 + 19 * 16] |
| 29552 | pmaddubsw m3, m0, m6 |
| 29553 | pmulhrsw m3, m7 |
| 29554 | pmaddubsw m5, m2, m6 |
| 29555 | pmulhrsw m5, m7 |
| 29556 | packuswb m3, m5 |
| 29557 | movu [r0 + 1708 * 16], m3 |
| 29558 | pmaddubsw m3, m1, m6 |
| 29559 | pmulhrsw m3, m7 |
| 29560 | pmaddubsw m5, m4, m6 |
| 29561 | pmulhrsw m5, m7 |
| 29562 | packuswb m3, m5 |
| 29563 | movu [r0 + 1709 * 16], m3 |
| 29564 | |
| 29565 | ; mode 28 [row 23] |
| 29566 | movu m6, [r5 + 24 * 16] |
| 29567 | pmaddubsw m3, m0, m6 |
| 29568 | pmulhrsw m3, m7 |
| 29569 | pmaddubsw m5, m2, m6 |
| 29570 | pmulhrsw m5, m7 |
| 29571 | packuswb m3, m5 |
| 29572 | movu [r0 + 1710 * 16], m3 |
| 29573 | pmaddubsw m3, m1, m6 |
| 29574 | pmulhrsw m3, m7 |
| 29575 | pmaddubsw m5, m4, m6 |
| 29576 | pmulhrsw m5, m7 |
| 29577 | packuswb m3, m5 |
| 29578 | movu [r0 + 1711 * 16], m3 |
| 29579 | |
| 29580 | ; mode 28 [row 24] |
| 29581 | movu m6, [r5 + 29 * 16] |
| 29582 | pmaddubsw m3, m0, m6 |
| 29583 | pmulhrsw m3, m7 |
| 29584 | pmaddubsw m5, m2, m6 |
| 29585 | pmulhrsw m5, m7 |
| 29586 | packuswb m3, m5 |
| 29587 | movu [r0 + 1712 * 16], m3 |
| 29588 | pmaddubsw m3, m1, m6 |
| 29589 | pmulhrsw m3, m7 |
| 29590 | pmaddubsw m5, m4, m6 |
| 29591 | pmulhrsw m5, m7 |
| 29592 | packuswb m3, m5 |
| 29593 | movu [r0 + 1713 * 16], m3 |
| 29594 | |
| 29595 | ; mode 29 [row 10] |
| 29596 | movu m6, [r5 + 3 * 16] |
| 29597 | pmaddubsw m3, m0, m6 |
| 29598 | pmulhrsw m3, m7 |
| 29599 | pmaddubsw m5, m2, m6 |
| 29600 | pmulhrsw m5, m7 |
| 29601 | packuswb m3, m5 |
| 29602 | movu [r0 + 1748 * 16], m3 |
| 29603 | pmaddubsw m3, m1, m6 |
| 29604 | pmulhrsw m3, m7 |
| 29605 | pmaddubsw m5, m4, m6 |
| 29606 | pmulhrsw m5, m7 |
| 29607 | packuswb m3, m5 |
| 29608 | movu [r0 + 1749 * 16], m3 |
| 29609 | |
| 29610 | ; mode 29 [row 11] |
| 29611 | movu m6, [r5 + 12 * 16] |
| 29612 | pmaddubsw m3, m0, m6 |
| 29613 | pmulhrsw m3, m7 |
| 29614 | pmaddubsw m5, m2, m6 |
| 29615 | pmulhrsw m5, m7 |
| 29616 | packuswb m3, m5 |
| 29617 | movu [r0 + 1750 * 16], m3 |
| 29618 | pmaddubsw m3, m1, m6 |
| 29619 | pmulhrsw m3, m7 |
| 29620 | pmaddubsw m5, m4, m6 |
| 29621 | pmulhrsw m5, m7 |
| 29622 | packuswb m3, m5 |
| 29623 | movu [r0 + 1751 * 16], m3 |
| 29624 | |
| 29625 | ; mode 29 [row 12] |
| 29626 | movu m6, [r5 + 21 * 16] |
| 29627 | pmaddubsw m3, m0, m6 |
| 29628 | pmulhrsw m3, m7 |
| 29629 | pmaddubsw m5, m2, m6 |
| 29630 | pmulhrsw m5, m7 |
| 29631 | packuswb m3, m5 |
| 29632 | movu [r0 + 1752 * 16], m3 |
| 29633 | |
| 29634 | ; mode 30 [row 8 - first half] |
| 29635 | movu [r0 + 1808 * 16], m3 |
| 29636 | |
| 29637 | pmaddubsw m3, m1, m6 |
| 29638 | pmulhrsw m3, m7 |
| 29639 | pmaddubsw m5, m4, m6 |
| 29640 | pmulhrsw m5, m7 |
| 29641 | packuswb m3, m5 |
| 29642 | movu [r0 + 1753 * 16], m3 |
| 29643 | |
| 29644 | ; mode 30 [row 8 - second half] |
| 29645 | movu [r0 + 1809 * 16], m3 |
| 29646 | |
| 29647 | ; mode 29 [row 13] |
| 29648 | movu m6, [r5 + 30 * 16] |
| 29649 | pmaddubsw m3, m0, m6 |
| 29650 | pmulhrsw m3, m7 |
| 29651 | pmaddubsw m5, m2, m6 |
| 29652 | pmulhrsw m5, m7 |
| 29653 | packuswb m3, m5 |
| 29654 | movu [r0 + 1754 * 16], m3 |
| 29655 | |
| 29656 | ; mode 32 [row 5 - first half] |
| 29657 | movu [r0 + 1930 * 16], m3 |
| 29658 | |
| 29659 | pmaddubsw m3, m1, m6 |
| 29660 | pmulhrsw m3, m7 |
| 29661 | pmaddubsw m5, m4, m6 |
| 29662 | pmulhrsw m5, m7 |
| 29663 | packuswb m3, m5 |
| 29664 | movu [r0 + 1755 * 16], m3 |
| 29665 | |
| 29666 | ; mode 32 [row 5 - second half] |
| 29667 | movu [r0 + 1931 * 16], m3 |
| 29668 | |
| 29669 | ; mode 30 [row 7] |
| 29670 | movu m6, [r5 + 8 * 16] |
| 29671 | pmaddubsw m3, m0, m6 |
| 29672 | pmulhrsw m3, m7 |
| 29673 | pmaddubsw m5, m2, m6 |
| 29674 | pmulhrsw m5, m7 |
| 29675 | packuswb m3, m5 |
| 29676 | movu [r0 + 1806 * 16], m3 |
| 29677 | |
| 29678 | ; mode 33 [row 3 - first half] |
| 29679 | movu [r0 + 1990 * 16], m3 |
| 29680 | |
| 29681 | pmaddubsw m3, m1, m6 |
| 29682 | pmulhrsw m3, m7 |
| 29683 | pmaddubsw m5, m4, m6 |
| 29684 | pmulhrsw m5, m7 |
| 29685 | packuswb m3, m5 |
| 29686 | movu [r0 + 1807 * 16], m3 |
| 29687 | |
| 29688 | ; mode 33 [row 3 - second half] |
| 29689 | movu [r0 + 1991 * 16], m3 |
| 29690 | |
| 29691 | ; mode 31 [row 5] |
| 29692 | movu m6, [r5 + 6 * 16] |
| 29693 | pmaddubsw m3, m0, m6 |
| 29694 | pmulhrsw m3, m7 |
| 29695 | pmaddubsw m5, m2, m6 |
| 29696 | pmulhrsw m5, m7 |
| 29697 | packuswb m3, m5 |
| 29698 | movu [r0 + 1866 * 16], m3 |
| 29699 | pmaddubsw m3, m1, m6 |
| 29700 | pmulhrsw m3, m7 |
| 29701 | pmaddubsw m5, m4, m6 |
| 29702 | pmulhrsw m5, m7 |
| 29703 | packuswb m3, m5 |
| 29704 | movu [r0 + 1867 * 16], m3 |
| 29705 | |
| 29706 | ; mode 31 [row 6] |
| 29707 | movu m6, [r5 + 23 * 16] |
| 29708 | pmaddubsw m3, m0, m6 |
| 29709 | pmulhrsw m3, m7 |
| 29710 | pmaddubsw m5, m2, m6 |
| 29711 | pmulhrsw m5, m7 |
| 29712 | packuswb m3, m5 |
| 29713 | movu [r0 + 1868 * 16], m3 |
| 29714 | pmaddubsw m3, m1, m6 |
| 29715 | pmulhrsw m3, m7 |
| 29716 | pmaddubsw m5, m4, m6 |
| 29717 | pmulhrsw m5, m7 |
| 29718 | packuswb m3, m5 |
| 29719 | movu [r0 + 1869 * 16], m3 |
| 29720 | |
| 29721 | ; mode 28 [row 25] |
| 29722 | movu m6, [r5 + 2 * 16] |
| 29723 | movu m0, [r3 + 5] |
| 29724 | movd m1, [r3 + 6] |
| 29725 | palignr m1, m0, 1 |
| 29726 | punpcklbw m0, m1 |
| 29727 | pmaddubsw m3, m0, m6 |
| 29728 | pmulhrsw m3, m7 |
| 29729 | movu m2, [r3 + 13] |
| 29730 | movd m4, [r3 + 14] |
| 29731 | palignr m4, m2, 1 |
| 29732 | punpcklbw m2, m4 |
| 29733 | pmaddubsw m5, m2, m6 |
| 29734 | pmulhrsw m5, m7 |
| 29735 | packuswb m3, m5 |
| 29736 | movu [r0 + 1714 * 16], m3 |
| 29737 | |
| 29738 | movu m1, [r3 + 21] |
| 29739 | movd m3, [r3 + 22] |
| 29740 | palignr m3, m1, 1 |
| 29741 | punpcklbw m1, m3 |
| 29742 | pmaddubsw m3, m1, m6 |
| 29743 | pmulhrsw m3, m7 |
| 29744 | movu m4, [r3 + 29] |
| 29745 | movd m5, [r3 + 30] |
| 29746 | palignr m5, m4, 1 |
| 29747 | punpcklbw m4, m5 |
| 29748 | pmaddubsw m5, m4, m6 |
| 29749 | pmulhrsw m5, m7 |
| 29750 | packuswb m3, m5 |
| 29751 | movu [r0 + 1715 * 16], m3 |
| 29752 | |
| 29753 | ; mode 28 [row 26] |
| 29754 | movu m6, [r5 + 7 * 16] |
| 29755 | pmaddubsw m3, m0, m6 |
| 29756 | pmulhrsw m3, m7 |
| 29757 | pmaddubsw m5, m2, m6 |
| 29758 | pmulhrsw m5, m7 |
| 29759 | packuswb m3, m5 |
| 29760 | movu [r0 + 1716 * 16], m3 |
| 29761 | |
| 29762 | ; mode 29 [row 14 - first half] |
| 29763 | movu [r0 + 1756 * 16], m3 |
| 29764 | |
| 29765 | pmaddubsw m3, m1, m6 |
| 29766 | pmulhrsw m3, m7 |
| 29767 | pmaddubsw m5, m4, m6 |
| 29768 | pmulhrsw m5, m7 |
| 29769 | packuswb m3, m5 |
| 29770 | movu [r0 + 1717 * 16], m3 |
| 29771 | |
| 29772 | ; mode 29 [row 14 - second half] |
| 29773 | movu [r0 + 1757 * 16], m3 |
| 29774 | |
| 29775 | ; mode 28 [row 27] |
| 29776 | movu m6, [r5 + 12 * 16] |
| 29777 | pmaddubsw m3, m0, m6 |
| 29778 | pmulhrsw m3, m7 |
| 29779 | pmaddubsw m5, m2, m6 |
| 29780 | pmulhrsw m5, m7 |
| 29781 | packuswb m3, m5 |
| 29782 | movu [r0 + 1718 * 16], m3 |
| 29783 | pmaddubsw m3, m1, m6 |
| 29784 | pmulhrsw m3, m7 |
| 29785 | pmaddubsw m5, m4, m6 |
| 29786 | pmulhrsw m5, m7 |
| 29787 | packuswb m3, m5 |
| 29788 | movu [r0 + 1719 * 16], m3 |
| 29789 | |
| 29790 | ; mode 28 [row 28] |
| 29791 | movu m6, [r5 + 17 * 16] |
| 29792 | pmaddubsw m3, m0, m6 |
| 29793 | pmulhrsw m3, m7 |
| 29794 | pmaddubsw m5, m2, m6 |
| 29795 | pmulhrsw m5, m7 |
| 29796 | packuswb m3, m5 |
| 29797 | movu [r0 + 1720 * 16], m3 |
| 29798 | pmaddubsw m3, m1, m6 |
| 29799 | pmulhrsw m3, m7 |
| 29800 | pmaddubsw m5, m4, m6 |
| 29801 | pmulhrsw m5, m7 |
| 29802 | packuswb m3, m5 |
| 29803 | movu [r0 + 1721 * 16], m3 |
| 29804 | |
| 29805 | ; mode 28 [row 29] |
| 29806 | movu m6, [r5 + 22 * 16] |
| 29807 | pmaddubsw m3, m0, m6 |
| 29808 | pmulhrsw m3, m7 |
| 29809 | pmaddubsw m5, m2, m6 |
| 29810 | pmulhrsw m5, m7 |
| 29811 | packuswb m3, m5 |
| 29812 | movu [r0 + 1722 * 16], m3 |
| 29813 | pmaddubsw m3, m1, m6 |
| 29814 | pmulhrsw m3, m7 |
| 29815 | pmaddubsw m5, m4, m6 |
| 29816 | pmulhrsw m5, m7 |
| 29817 | packuswb m3, m5 |
| 29818 | movu [r0 + 1723 * 16], m3 |
| 29819 | |
| 29820 | ; mode 28 [row 30] |
| 29821 | movu m6, [r5 + 27 * 16] |
| 29822 | pmaddubsw m3, m0, m6 |
| 29823 | pmulhrsw m3, m7 |
| 29824 | pmaddubsw m5, m2, m6 |
| 29825 | pmulhrsw m5, m7 |
| 29826 | packuswb m3, m5 |
| 29827 | movu [r0 + 1724 * 16], m3 |
| 29828 | pmaddubsw m3, m1, m6 |
| 29829 | pmulhrsw m3, m7 |
| 29830 | pmaddubsw m5, m4, m6 |
| 29831 | pmulhrsw m5, m7 |
| 29832 | packuswb m3, m5 |
| 29833 | movu [r0 + 1725 * 16], m3 |
| 29834 | |
| 29835 | ; mode 29 [row 15] |
| 29836 | movu m6, [r5 + 16 * 16] |
| 29837 | pmaddubsw m3, m0, m6 |
| 29838 | pmulhrsw m3, m7 |
| 29839 | pmaddubsw m5, m2, m6 |
| 29840 | pmulhrsw m5, m7 |
| 29841 | packuswb m3, m5 |
| 29842 | movu [r0 + 1758 * 16], m3 |
| 29843 | pmaddubsw m3, m1, m6 |
| 29844 | pmulhrsw m3, m7 |
| 29845 | pmaddubsw m5, m4, m6 |
| 29846 | pmulhrsw m5, m7 |
| 29847 | packuswb m3, m5 |
| 29848 | movu [r0 + 1759 * 16], m3 |
| 29849 | |
| 29850 | ; mode 29 [row 16] |
| 29851 | movu m6, [r5 + 25 * 16] |
| 29852 | pmaddubsw m3, m0, m6 |
| 29853 | pmulhrsw m3, m7 |
| 29854 | pmaddubsw m5, m2, m6 |
| 29855 | pmulhrsw m5, m7 |
| 29856 | packuswb m3, m5 |
| 29857 | movu [r0 + 1760 * 16], m3 |
| 29858 | pmaddubsw m3, m1, m6 |
| 29859 | pmulhrsw m3, m7 |
| 29860 | pmaddubsw m5, m4, m6 |
| 29861 | pmulhrsw m5, m7 |
| 29862 | packuswb m3, m5 |
| 29863 | movu [r0 + 1761 * 16], m3 |
| 29864 | |
| 29865 | ; mode 30 [row 9] |
| 29866 | movu m6, [r5 + 2 * 16] |
| 29867 | pmaddubsw m3, m0, m6 |
| 29868 | pmulhrsw m3, m7 |
| 29869 | pmaddubsw m5, m2, m6 |
| 29870 | pmulhrsw m5, m7 |
| 29871 | packuswb m3, m5 |
| 29872 | movu [r0 + 1810 * 16], m3 |
| 29873 | |
| 29874 | ; mode 33 [row 4 - first half] |
| 29875 | movu [r0 + 1992 * 16], m3 |
| 29876 | |
| 29877 | pmaddubsw m3, m1, m6 |
| 29878 | pmulhrsw m3, m7 |
| 29879 | pmaddubsw m5, m4, m6 |
| 29880 | pmulhrsw m5, m7 |
| 29881 | packuswb m3, m5 |
| 29882 | movu [r0 + 1811 * 16], m3 |
| 29883 | |
| 29884 | ; mode 33 [row 4 - second half] |
| 29885 | movu [r0 + 1993 * 16], m3 |
| 29886 | |
| 29887 | ; mode 30 [row 10] |
| 29888 | movu m6, [r5 + 15 * 16] |
| 29889 | pmaddubsw m3, m0, m6 |
| 29890 | pmulhrsw m3, m7 |
| 29891 | pmaddubsw m5, m2, m6 |
| 29892 | pmulhrsw m5, m7 |
| 29893 | packuswb m3, m5 |
| 29894 | movu [r0 + 1812 * 16], m3 |
| 29895 | pmaddubsw m3, m1, m6 |
| 29896 | pmulhrsw m3, m7 |
| 29897 | pmaddubsw m5, m4, m6 |
| 29898 | pmulhrsw m5, m7 |
| 29899 | packuswb m3, m5 |
| 29900 | movu [r0 + 1813 * 16], m3 |
| 29901 | |
| 29902 | ; mode 31 [row 7] |
| 29903 | movu m6, [r5 + 8 * 16] |
| 29904 | pmaddubsw m3, m0, m6 |
| 29905 | pmulhrsw m3, m7 |
| 29906 | pmaddubsw m5, m2, m6 |
| 29907 | pmulhrsw m5, m7 |
| 29908 | packuswb m3, m5 |
| 29909 | movu [r0 + 1870 * 16], m3 |
| 29910 | pmaddubsw m3, m1, m6 |
| 29911 | pmulhrsw m3, m7 |
| 29912 | pmaddubsw m5, m4, m6 |
| 29913 | pmulhrsw m5, m7 |
| 29914 | packuswb m3, m5 |
| 29915 | movu [r0 + 1871 * 16], m3 |
| 29916 | |
| 29917 | ; mode 31 [row 8] |
| 29918 | movu m6, [r5 + 25 * 16] |
| 29919 | pmaddubsw m3, m0, m6 |
| 29920 | pmulhrsw m3, m7 |
| 29921 | pmaddubsw m5, m2, m6 |
| 29922 | pmulhrsw m5, m7 |
| 29923 | packuswb m3, m5 |
| 29924 | movu [r0 + 1872 * 16], m3 |
| 29925 | pmaddubsw m3, m1, m6 |
| 29926 | pmulhrsw m3, m7 |
| 29927 | pmaddubsw m5, m4, m6 |
| 29928 | pmulhrsw m5, m7 |
| 29929 | packuswb m3, m5 |
| 29930 | movu [r0 + 1873 * 16], m3 |
| 29931 | |
| 29932 | ; mode 32 [row 6] |
| 29933 | movu m6, [r5 + 19 * 16] |
| 29934 | pmaddubsw m3, m0, m6 |
| 29935 | pmulhrsw m3, m7 |
| 29936 | pmaddubsw m5, m2, m6 |
| 29937 | pmulhrsw m5, m7 |
| 29938 | packuswb m3, m5 |
| 29939 | movu [r0 + 1932 * 16], m3 |
| 29940 | pmaddubsw m3, m1, m6 |
| 29941 | pmulhrsw m3, m7 |
| 29942 | pmaddubsw m5, m4, m6 |
| 29943 | pmulhrsw m5, m7 |
| 29944 | packuswb m3, m5 |
| 29945 | movu [r0 + 1933 * 16], m3 |
| 29946 | |
| 29947 | ; mode 30 [row 11] |
| 29948 | movu m6, [r5 + 28 * 16] |
| 29949 | pmaddubsw m3, m0, m6 |
| 29950 | pmulhrsw m3, m7 |
| 29951 | pmaddubsw m5, m2, m6 |
| 29952 | pmulhrsw m5, m7 |
| 29953 | packuswb m3, m5 |
| 29954 | movu [r0 + 1814 * 16], m3 |
| 29955 | |
| 29956 | ; mode 33 [row 5 - first half] |
| 29957 | movu [r0 + 1994 * 16], m3 |
| 29958 | |
| 29959 | pmaddubsw m3, m1, m6 |
| 29960 | pmulhrsw m3, m7 |
| 29961 | pmaddubsw m5, m4, m6 |
| 29962 | pmulhrsw m5, m7 |
| 29963 | packuswb m3, m5 |
| 29964 | movu [r0 + 1815 * 16], m3 |
| 29965 | |
| 29966 | ; mode 33 [row 5 - second half] |
| 29967 | movu [r0 + 1995 * 16], m3 |
| 29968 | |
| 29969 | ; mode 28 [row 31] |
| 29970 | movu m0, [r3 + 6] |
| 29971 | movd m1, [r3 + 7] |
| 29972 | palignr m1, m0, 1 |
| 29973 | punpcklbw m0, m1 |
| 29974 | movu m2, [r3 + 14] |
| 29975 | movd m3, [r3 + 15] |
| 29976 | palignr m3, m2, 1 |
| 29977 | punpcklbw m2, m3 |
| 29978 | movu m1, [r3 + 22] |
| 29979 | movd m3, [r3 + 23] |
| 29980 | palignr m3, m1, 1 |
| 29981 | punpcklbw m1, m3 |
| 29982 | movu m4, [r3 + 30] |
| 29983 | movd m5, [r3 + 31] |
| 29984 | palignr m5, m4, 1 |
| 29985 | punpcklbw m4, m5 |
| 29986 | |
| 29987 | pshufb m5, m0, [tab_S2] |
| 29988 | movh [r0 + 1726 * 16], m5 |
| 29989 | pshufb m5, m2, [tab_S2] |
| 29990 | movh [r0 + 1726 * 16 + 8], m5 |
| 29991 | pshufb m5, m1, [tab_S2] |
| 29992 | movh [r0 + 1727 * 16], m5 |
| 29993 | pshufb m5, m4, [tab_S2] |
| 29994 | movh [r0 + 1727 * 16 + 8], m5 |
| 29995 | |
| 29996 | ; mode 29 [row 17] |
| 29997 | movu m6, [r5 + 2 * 16] |
| 29998 | pmaddubsw m3, m0, m6 |
| 29999 | pmulhrsw m3, m7 |
| 30000 | pmaddubsw m5, m2, m6 |
| 30001 | pmulhrsw m5, m7 |
| 30002 | packuswb m3, m5 |
| 30003 | movu [r0 + 1762 * 16], m3 |
| 30004 | pmaddubsw m3, m1, m6 |
| 30005 | pmulhrsw m3, m7 |
| 30006 | pmaddubsw m5, m4, m6 |
| 30007 | pmulhrsw m5, m7 |
| 30008 | packuswb m3, m5 |
| 30009 | movu [r0 + 1763 * 16], m3 |
| 30010 | |
| 30011 | ; mode 29 [row 18] |
| 30012 | movu m6, [r5 + 11 * 16] |
| 30013 | pmaddubsw m3, m0, m6 |
| 30014 | pmulhrsw m3, m7 |
| 30015 | pmaddubsw m5, m2, m6 |
| 30016 | pmulhrsw m5, m7 |
| 30017 | packuswb m3, m5 |
| 30018 | movu [r0 + 1764 * 16], m3 |
| 30019 | pmaddubsw m3, m1, m6 |
| 30020 | pmulhrsw m3, m7 |
| 30021 | pmaddubsw m5, m4, m6 |
| 30022 | pmulhrsw m5, m7 |
| 30023 | packuswb m3, m5 |
| 30024 | movu [r0 + 1765 * 16], m3 |
| 30025 | |
| 30026 | ; mode 29 [row 19] |
| 30027 | movu m6, [r5 + 20 * 16] |
| 30028 | pmaddubsw m3, m0, m6 |
| 30029 | pmulhrsw m3, m7 |
| 30030 | pmaddubsw m5, m2, m6 |
| 30031 | pmulhrsw m5, m7 |
| 30032 | packuswb m3, m5 |
| 30033 | movu [r0 + 1766 * 16], m3 |
| 30034 | pmaddubsw m3, m1, m6 |
| 30035 | pmulhrsw m3, m7 |
| 30036 | pmaddubsw m5, m4, m6 |
| 30037 | pmulhrsw m5, m7 |
| 30038 | packuswb m3, m5 |
| 30039 | movu [r0 + 1767 * 16], m3 |
| 30040 | |
| 30041 | ; mode 29 [row 20] |
| 30042 | movu m6, [r5 + 29 * 16] |
| 30043 | pmaddubsw m3, m0, m6 |
| 30044 | pmulhrsw m3, m7 |
| 30045 | pmaddubsw m5, m2, m6 |
| 30046 | pmulhrsw m5, m7 |
| 30047 | packuswb m3, m5 |
| 30048 | movu [r0 + 1768 * 16], m3 |
| 30049 | |
| 30050 | ; mode 32 [row 8 - first half] |
| 30051 | movu [r0 + 1936 * 16], m3 |
| 30052 | |
| 30053 | pmaddubsw m3, m1, m6 |
| 30054 | pmulhrsw m3, m7 |
| 30055 | pmaddubsw m5, m4, m6 |
| 30056 | pmulhrsw m5, m7 |
| 30057 | packuswb m3, m5 |
| 30058 | movu [r0 + 1769 * 16], m3 |
| 30059 | |
| 30060 | ; mode 32 [row 8 - second half] |
| 30061 | movu [r0 + 1937 * 16], m3 |
| 30062 | |
| 30063 | ; mode 30 [row 12] |
| 30064 | movu m6, [r5 + 9 * 16] |
| 30065 | pmaddubsw m3, m0, m6 |
| 30066 | pmulhrsw m3, m7 |
| 30067 | pmaddubsw m5, m2, m6 |
| 30068 | pmulhrsw m5, m7 |
| 30069 | packuswb m3, m5 |
| 30070 | movu [r0 + 1816 * 16], m3 |
| 30071 | pmaddubsw m3, m1, m6 |
| 30072 | pmulhrsw m3, m7 |
| 30073 | pmaddubsw m5, m4, m6 |
| 30074 | pmulhrsw m5, m7 |
| 30075 | packuswb m3, m5 |
| 30076 | movu [r0 + 1817 * 16], m3 |
| 30077 | |
| 30078 | ; mode 30 [row 13] |
| 30079 | movu m6, [r5 + 22 * 16] |
| 30080 | pmaddubsw m3, m0, m6 |
| 30081 | pmulhrsw m3, m7 |
| 30082 | pmaddubsw m5, m2, m6 |
| 30083 | pmulhrsw m5, m7 |
| 30084 | packuswb m3, m5 |
| 30085 | movu [r0 + 1818 * 16], m3 |
| 30086 | |
| 30087 | ; mode 33 [row 6 - first half] |
| 30088 | movu [r0 + 1996 * 16], m3 |
| 30089 | |
| 30090 | pmaddubsw m3, m1, m6 |
| 30091 | pmulhrsw m3, m7 |
| 30092 | pmaddubsw m5, m4, m6 |
| 30093 | pmulhrsw m5, m7 |
| 30094 | packuswb m3, m5 |
| 30095 | movu [r0 + 1819 * 16], m3 |
| 30096 | |
| 30097 | ; mode 33 [row 6 - second half] |
| 30098 | movu [r0 + 1997 * 16], m3 |
| 30099 | |
| 30100 | ; mode 31 [row 9] |
| 30101 | movu m6, [r5 + 10 * 16] |
| 30102 | pmaddubsw m3, m0, m6 |
| 30103 | pmulhrsw m3, m7 |
| 30104 | pmaddubsw m5, m2, m6 |
| 30105 | pmulhrsw m5, m7 |
| 30106 | packuswb m3, m5 |
| 30107 | movu [r0 + 1874 * 16], m3 |
| 30108 | pmaddubsw m3, m1, m6 |
| 30109 | pmulhrsw m3, m7 |
| 30110 | pmaddubsw m5, m4, m6 |
| 30111 | pmulhrsw m5, m7 |
| 30112 | packuswb m3, m5 |
| 30113 | movu [r0 + 1875 * 16], m3 |
| 30114 | |
| 30115 | ; mode 31 [row 10] |
| 30116 | movu m6, [r5 + 27 * 16] |
| 30117 | pmaddubsw m3, m0, m6 |
| 30118 | pmulhrsw m3, m7 |
| 30119 | pmaddubsw m5, m2, m6 |
| 30120 | pmulhrsw m5, m7 |
| 30121 | packuswb m3, m5 |
| 30122 | movu [r0 + 1876 * 16], m3 |
| 30123 | pmaddubsw m3, m1, m6 |
| 30124 | pmulhrsw m3, m7 |
| 30125 | pmaddubsw m5, m4, m6 |
| 30126 | pmulhrsw m5, m7 |
| 30127 | packuswb m3, m5 |
| 30128 | movu [r0 + 1877 * 16], m3 |
| 30129 | |
| 30130 | ; mode 32 [row 7] |
| 30131 | movu m6, [r5 + 8 * 16] |
| 30132 | pmaddubsw m3, m0, m6 |
| 30133 | pmulhrsw m3, m7 |
| 30134 | pmaddubsw m5, m2, m6 |
| 30135 | pmulhrsw m5, m7 |
| 30136 | packuswb m3, m5 |
| 30137 | movu [r0 + 1934 * 16], m3 |
| 30138 | pmaddubsw m3, m1, m6 |
| 30139 | pmulhrsw m3, m7 |
| 30140 | pmaddubsw m5, m4, m6 |
| 30141 | pmulhrsw m5, m7 |
| 30142 | packuswb m3, m5 |
| 30143 | movu [r0 + 1935 * 16], m3 |
| 30144 | |
| 30145 | ; mode 29 [row 21] |
| 30146 | movu m6, [r5 + 6 * 16] |
| 30147 | movu m0, [r3 + 7] |
| 30148 | movd m1, [r3 + 8] |
| 30149 | palignr m1, m0, 1 |
| 30150 | punpcklbw m0, m1 |
| 30151 | pmaddubsw m3, m0, m6 |
| 30152 | pmulhrsw m3, m7 |
| 30153 | movu m2, [r3 + 15] |
| 30154 | movd m4, [r3 + 16] |
| 30155 | palignr m4, m2, 1 |
| 30156 | punpcklbw m2, m4 |
| 30157 | pmaddubsw m5, m2, m6 |
| 30158 | pmulhrsw m5, m7 |
| 30159 | packuswb m3, m5 |
| 30160 | movu [r0 + 1770 * 16], m3 |
| 30161 | |
| 30162 | movu m1, [r3 + 23] |
| 30163 | movd m3, [r3 + 24] |
| 30164 | palignr m3, m1, 1 |
| 30165 | punpcklbw m1, m3 |
| 30166 | pmaddubsw m3, m1, m6 |
| 30167 | pmulhrsw m3, m7 |
| 30168 | movu m4, [r3 + 31] |
| 30169 | movd m5, [r3 + 32] |
| 30170 | palignr m5, m4, 1 |
| 30171 | punpcklbw m4, m5 |
| 30172 | pmaddubsw m5, m4, m6 |
| 30173 | pmulhrsw m5, m7 |
| 30174 | packuswb m3, m5 |
| 30175 | movu [r0 + 1771 * 16], m3 |
| 30176 | |
| 30177 | ; mode 29 [row 22] |
| 30178 | movu m6, [r5 + 15 * 16] |
| 30179 | pmaddubsw m3, m0, m6 |
| 30180 | pmulhrsw m3, m7 |
| 30181 | pmaddubsw m5, m2, m6 |
| 30182 | pmulhrsw m5, m7 |
| 30183 | packuswb m3, m5 |
| 30184 | movu [r0 + 1772 * 16], m3 |
| 30185 | pmaddubsw m3, m1, m6 |
| 30186 | pmulhrsw m3, m7 |
| 30187 | pmaddubsw m5, m4, m6 |
| 30188 | pmulhrsw m5, m7 |
| 30189 | packuswb m3, m5 |
| 30190 | movu [r0 + 1773 * 16], m3 |
| 30191 | |
| 30192 | ; mode 29 [row 23] |
| 30193 | movu m6, [r5 + 24 * 16] |
| 30194 | pmaddubsw m3, m0, m6 |
| 30195 | pmulhrsw m3, m7 |
| 30196 | pmaddubsw m5, m2, m6 |
| 30197 | pmulhrsw m5, m7 |
| 30198 | packuswb m3, m5 |
| 30199 | movu [r0 + 1774 * 16], m3 |
| 30200 | pmaddubsw m3, m1, m6 |
| 30201 | pmulhrsw m3, m7 |
| 30202 | pmaddubsw m5, m4, m6 |
| 30203 | pmulhrsw m5, m7 |
| 30204 | packuswb m3, m5 |
| 30205 | movu [r0 + 1775 * 16], m3 |
| 30206 | |
| 30207 | ; mode 30 [row 14] |
| 30208 | movu m6, [r5 + 3 * 16] |
| 30209 | pmaddubsw m3, m0, m6 |
| 30210 | pmulhrsw m3, m7 |
| 30211 | pmaddubsw m5, m2, m6 |
| 30212 | pmulhrsw m5, m7 |
| 30213 | packuswb m3, m5 |
| 30214 | movu [r0 + 1820 * 16], m3 |
| 30215 | pmaddubsw m3, m1, m6 |
| 30216 | pmulhrsw m3, m7 |
| 30217 | pmaddubsw m5, m4, m6 |
| 30218 | pmulhrsw m5, m7 |
| 30219 | packuswb m3, m5 |
| 30220 | movu [r0 + 1821 * 16], m3 |
| 30221 | |
| 30222 | ; mode 30 [row 15] |
| 30223 | movu m6, [r5 + 16 * 16] |
| 30224 | pmaddubsw m3, m0, m6 |
| 30225 | pmulhrsw m3, m7 |
| 30226 | pmaddubsw m5, m2, m6 |
| 30227 | pmulhrsw m5, m7 |
| 30228 | packuswb m3, m5 |
| 30229 | movu [r0 + 1822 * 16], m3 |
| 30230 | |
| 30231 | ; mode 33 [row 7 - first half] |
| 30232 | movu [r0 + 1998 * 16], m3 |
| 30233 | |
| 30234 | pmaddubsw m3, m1, m6 |
| 30235 | pmulhrsw m3, m7 |
| 30236 | pmaddubsw m5, m4, m6 |
| 30237 | pmulhrsw m5, m7 |
| 30238 | packuswb m3, m5 |
| 30239 | movu [r0 + 1823 * 16], m3 |
| 30240 | |
| 30241 | ; mode 33 [row 7 - second half] |
| 30242 | movu [r0 + 1999 * 16], m3 |
| 30243 | |
| 30244 | ; mode 30 [row 16] |
| 30245 | movu m6, [r5 + 29 * 16] |
| 30246 | pmaddubsw m3, m0, m6 |
| 30247 | pmulhrsw m3, m7 |
| 30248 | pmaddubsw m5, m2, m6 |
| 30249 | pmulhrsw m5, m7 |
| 30250 | packuswb m3, m5 |
| 30251 | movu [r0 + 1824 * 16], m3 |
| 30252 | |
| 30253 | ; mode 31 [row 12 - first half] |
| 30254 | movu [r0 + 1880 * 16], m3 |
| 30255 | |
| 30256 | pmaddubsw m3, m1, m6 |
| 30257 | pmulhrsw m3, m7 |
| 30258 | pmaddubsw m5, m4, m6 |
| 30259 | pmulhrsw m5, m7 |
| 30260 | packuswb m3, m5 |
| 30261 | movu [r0 + 1825 * 16], m3 |
| 30262 | |
| 30263 | ; mode 31 [row 12 - second half] |
| 30264 | movu [r0 + 1881 * 16], m3 |
| 30265 | |
| 30266 | ; mode 31 [row 11] |
| 30267 | movu m6, [r5 + 12 * 16] |
| 30268 | pmaddubsw m3, m0, m6 |
| 30269 | pmulhrsw m3, m7 |
| 30270 | pmaddubsw m5, m2, m6 |
| 30271 | pmulhrsw m5, m7 |
| 30272 | packuswb m3, m5 |
| 30273 | movu [r0 + 1878 * 16], m3 |
| 30274 | pmaddubsw m3, m1, m6 |
| 30275 | pmulhrsw m3, m7 |
| 30276 | pmaddubsw m5, m4, m6 |
| 30277 | pmulhrsw m5, m7 |
| 30278 | packuswb m3, m5 |
| 30279 | movu [r0 + 1879 * 16], m3 |
| 30280 | |
| 30281 | ; mode 32 [row 9] |
| 30282 | movu m6, [r5 + 18 * 16] |
| 30283 | pmaddubsw m3, m0, m6 |
| 30284 | pmulhrsw m3, m7 |
| 30285 | pmaddubsw m5, m2, m6 |
| 30286 | pmulhrsw m5, m7 |
| 30287 | packuswb m3, m5 |
| 30288 | movu [r0 + 1938 * 16], m3 |
| 30289 | pmaddubsw m3, m1, m6 |
| 30290 | pmulhrsw m3, m7 |
| 30291 | pmaddubsw m5, m4, m6 |
| 30292 | pmulhrsw m5, m7 |
| 30293 | packuswb m3, m5 |
| 30294 | movu [r0 + 1939 * 16], m3 |
| 30295 | |
| 30296 | ; mode 29 [row 24] |
| 30297 | movu m6, [r5 + 1 * 16] |
| 30298 | movu m0, [r3 + 8] |
| 30299 | movd m1, [r3 + 9] |
| 30300 | palignr m1, m0, 1 |
| 30301 | punpcklbw m0, m1 |
| 30302 | pmaddubsw m3, m0, m6 |
| 30303 | pmulhrsw m3, m7 |
| 30304 | movu m2, [r3 + 16] |
| 30305 | movd m4, [r3 + 17] |
| 30306 | palignr m4, m2, 1 |
| 30307 | punpcklbw m2, m4 |
| 30308 | pmaddubsw m5, m2, m6 |
| 30309 | pmulhrsw m5, m7 |
| 30310 | packuswb m3, m5 |
| 30311 | movu [r0 + 1776 * 16], m3 |
| 30312 | |
| 30313 | movu m1, [r3 + 24] |
| 30314 | movd m3, [r3 + 25] |
| 30315 | palignr m3, m1, 1 |
| 30316 | punpcklbw m1, m3 |
| 30317 | pmaddubsw m3, m1, m6 |
| 30318 | pmulhrsw m3, m7 |
| 30319 | movu m4, [r3 + 32] |
| 30320 | movd m5, [r3 + 33] |
| 30321 | palignr m5, m4, 1 |
| 30322 | punpcklbw m4, m5 |
| 30323 | pmaddubsw m5, m4, m6 |
| 30324 | pmulhrsw m5, m7 |
| 30325 | packuswb m3, m5 |
| 30326 | movu [r0 + 1777 * 16], m3 |
| 30327 | |
| 30328 | ; mode 29 [row 25] |
| 30329 | movu m6, [r5 + 10 * 16] |
| 30330 | pmaddubsw m3, m0, m6 |
| 30331 | pmulhrsw m3, m7 |
| 30332 | pmaddubsw m5, m2, m6 |
| 30333 | pmulhrsw m5, m7 |
| 30334 | packuswb m3, m5 |
| 30335 | movu [r0 + 1778 * 16], m3 |
| 30336 | |
| 30337 | ; mode 30 [row 17 - first half] |
| 30338 | movu [r0 + 1826 * 16], m3 |
| 30339 | |
| 30340 | ; mode 33 [row 8 - first half] |
| 30341 | movu [r0 + 2000 * 16], m3 |
| 30342 | |
| 30343 | pmaddubsw m3, m1, m6 |
| 30344 | pmulhrsw m3, m7 |
| 30345 | pmaddubsw m5, m4, m6 |
| 30346 | pmulhrsw m5, m7 |
| 30347 | packuswb m3, m5 |
| 30348 | movu [r0 + 1779 * 16], m3 |
| 30349 | |
| 30350 | ; mode 30 [row 17 - second half] |
| 30351 | movu [r0 + 1827 * 16], m3 |
| 30352 | |
| 30353 | ; mode 33 [row 8 - second half] |
| 30354 | movu [r0 + 2001 * 16], m3 |
| 30355 | |
| 30356 | ; mode 29 [row 26] |
| 30357 | movu m6, [r5 + 19 * 16] |
| 30358 | pmaddubsw m3, m0, m6 |
| 30359 | pmulhrsw m3, m7 |
| 30360 | pmaddubsw m5, m2, m6 |
| 30361 | pmulhrsw m5, m7 |
| 30362 | packuswb m3, m5 |
| 30363 | movu [r0 + 1780 * 16], m3 |
| 30364 | pmaddubsw m3, m1, m6 |
| 30365 | pmulhrsw m3, m7 |
| 30366 | pmaddubsw m5, m4, m6 |
| 30367 | pmulhrsw m5, m7 |
| 30368 | packuswb m3, m5 |
| 30369 | movu [r0 + 1781 * 16], m3 |
| 30370 | |
| 30371 | ; mode 29 [row 27] |
| 30372 | movu m6, [r5 + 28 * 16] |
| 30373 | pmaddubsw m3, m0, m6 |
| 30374 | pmulhrsw m3, m7 |
| 30375 | pmaddubsw m5, m2, m6 |
| 30376 | pmulhrsw m5, m7 |
| 30377 | packuswb m3, m5 |
| 30378 | movu [r0 + 1782 * 16], m3 |
| 30379 | |
| 30380 | ; mode 32 [row 11 - first half] |
| 30381 | movu [r0 + 1942 * 16], m3 |
| 30382 | |
| 30383 | pmaddubsw m3, m1, m6 |
| 30384 | pmulhrsw m3, m7 |
| 30385 | pmaddubsw m5, m4, m6 |
| 30386 | pmulhrsw m5, m7 |
| 30387 | packuswb m3, m5 |
| 30388 | movu [r0 + 1783 * 16], m3 |
| 30389 | |
| 30390 | ; mode 32 [row 11 - second half] |
| 30391 | movu [r0 + 1943 * 16], m3 |
| 30392 | |
| 30393 | ; mode 30 [row 18] |
| 30394 | movu m6, [r5 + 23 * 16] |
| 30395 | pmaddubsw m3, m0, m6 |
| 30396 | pmulhrsw m3, m7 |
| 30397 | pmaddubsw m5, m2, m6 |
| 30398 | pmulhrsw m5, m7 |
| 30399 | packuswb m3, m5 |
| 30400 | movu [r0 + 1828 * 16], m3 |
| 30401 | pmaddubsw m3, m1, m6 |
| 30402 | pmulhrsw m3, m7 |
| 30403 | pmaddubsw m5, m4, m6 |
| 30404 | pmulhrsw m5, m7 |
| 30405 | packuswb m3, m5 |
| 30406 | movu [r0 + 1829 * 16], m3 |
| 30407 | |
| 30408 | ; mode 31 [row 13] |
| 30409 | movu m6, [r5 + 14 * 16] |
| 30410 | pmaddubsw m3, m0, m6 |
| 30411 | pmulhrsw m3, m7 |
| 30412 | pmaddubsw m5, m2, m6 |
| 30413 | pmulhrsw m5, m7 |
| 30414 | packuswb m3, m5 |
| 30415 | movu [r0 + 1882 * 16], m3 |
| 30416 | pmaddubsw m3, m1, m6 |
| 30417 | pmulhrsw m3, m7 |
| 30418 | pmaddubsw m5, m4, m6 |
| 30419 | pmulhrsw m5, m7 |
| 30420 | packuswb m3, m5 |
| 30421 | movu [r0 + 1883 * 16], m3 |
| 30422 | |
| 30423 | ; mode 31 [row 14] |
| 30424 | movu m6, [r5 + 31 * 16] |
| 30425 | pmaddubsw m3, m0, m6 |
| 30426 | pmulhrsw m3, m7 |
| 30427 | pmaddubsw m5, m2, m6 |
| 30428 | pmulhrsw m5, m7 |
| 30429 | packuswb m3, m5 |
| 30430 | movu [r0 + 1884 * 16], m3 |
| 30431 | pmaddubsw m3, m1, m6 |
| 30432 | pmulhrsw m3, m7 |
| 30433 | pmaddubsw m5, m4, m6 |
| 30434 | pmulhrsw m5, m7 |
| 30435 | packuswb m3, m5 |
| 30436 | movu [r0 + 1885 * 16], m3 |
| 30437 | |
| 30438 | ; mode 32 [row 10] |
| 30439 | movu m6, [r5 + 7 * 16] |
| 30440 | pmaddubsw m3, m0, m6 |
| 30441 | pmulhrsw m3, m7 |
| 30442 | pmaddubsw m5, m2, m6 |
| 30443 | pmulhrsw m5, m7 |
| 30444 | packuswb m3, m5 |
| 30445 | movu [r0 + 1940 * 16], m3 |
| 30446 | pmaddubsw m3, m1, m6 |
| 30447 | pmulhrsw m3, m7 |
| 30448 | pmaddubsw m5, m4, m6 |
| 30449 | pmulhrsw m5, m7 |
| 30450 | packuswb m3, m5 |
| 30451 | movu [r0 + 1941 * 16], m3 |
| 30452 | |
| 30453 | ; mode 29 [row 28] |
| 30454 | movu m6, [r5 + 5 * 16] |
| 30455 | movu m0, [r3 + 9] |
| 30456 | movd m1, [r3 + 10] |
| 30457 | palignr m1, m0, 1 |
| 30458 | punpcklbw m0, m1 |
| 30459 | pmaddubsw m3, m0, m6 |
| 30460 | pmulhrsw m3, m7 |
| 30461 | movu m2, [r3 + 17] |
| 30462 | movd m4, [r3 + 18] |
| 30463 | palignr m4, m2, 1 |
| 30464 | punpcklbw m2, m4 |
| 30465 | pmaddubsw m5, m2, m6 |
| 30466 | pmulhrsw m5, m7 |
| 30467 | packuswb m3, m5 |
| 30468 | movu [r0 + 1784 * 16], m3 |
| 30469 | |
| 30470 | movu m1, [r3 + 25] |
| 30471 | movd m3, [r3 + 26] |
| 30472 | palignr m3, m1, 1 |
| 30473 | punpcklbw m1, m3 |
| 30474 | pmaddubsw m3, m1, m6 |
| 30475 | pmulhrsw m3, m7 |
| 30476 | movu m4, [r3 + 33] |
| 30477 | movd m5, [r3 + 34] |
| 30478 | palignr m5, m4, 1 |
| 30479 | punpcklbw m4, m5 |
| 30480 | pmaddubsw m5, m4, m6 |
| 30481 | pmulhrsw m5, m7 |
| 30482 | packuswb m3, m5 |
| 30483 | movu [r0 + 1785 * 16], m3 |
| 30484 | |
| 30485 | ; mode 29 [row 29] |
| 30486 | movu m6, [r5 + 14 * 16] |
| 30487 | pmaddubsw m3, m0, m6 |
| 30488 | pmulhrsw m3, m7 |
| 30489 | pmaddubsw m5, m2, m6 |
| 30490 | pmulhrsw m5, m7 |
| 30491 | packuswb m3, m5 |
| 30492 | movu [r0 + 1786 * 16], m3 |
| 30493 | pmaddubsw m3, m1, m6 |
| 30494 | pmulhrsw m3, m7 |
| 30495 | pmaddubsw m5, m4, m6 |
| 30496 | pmulhrsw m5, m7 |
| 30497 | packuswb m3, m5 |
| 30498 | movu [r0 + 1787 * 16], m3 |
| 30499 | |
| 30500 | ; mode 29 [row 30] |
| 30501 | movu m6, [r5 + 23 * 16] |
| 30502 | pmaddubsw m3, m0, m6 |
| 30503 | pmulhrsw m3, m7 |
| 30504 | pmaddubsw m5, m2, m6 |
| 30505 | pmulhrsw m5, m7 |
| 30506 | packuswb m3, m5 |
| 30507 | movu [r0 + 1788 * 16], m3 |
| 30508 | pmaddubsw m3, m1, m6 |
| 30509 | pmulhrsw m3, m7 |
| 30510 | pmaddubsw m5, m4, m6 |
| 30511 | pmulhrsw m5, m7 |
| 30512 | packuswb m3, m5 |
| 30513 | movu [r0 + 1789 * 16], m3 |
| 30514 | |
| 30515 | ; mode 30 [row 19] |
| 30516 | movu m6, [r5 + 4 * 16] |
| 30517 | pmaddubsw m3, m0, m6 |
| 30518 | pmulhrsw m3, m7 |
| 30519 | pmaddubsw m5, m2, m6 |
| 30520 | pmulhrsw m5, m7 |
| 30521 | packuswb m3, m5 |
| 30522 | movu [r0 + 1830 * 16], m3 |
| 30523 | |
| 30524 | ; mode 33 [row 9 - first half] |
| 30525 | movu [r0 + 2002 * 16], m3 |
| 30526 | |
| 30527 | pmaddubsw m3, m1, m6 |
| 30528 | pmulhrsw m3, m7 |
| 30529 | pmaddubsw m5, m4, m6 |
| 30530 | pmulhrsw m5, m7 |
| 30531 | packuswb m3, m5 |
| 30532 | movu [r0 + 1831 * 16], m3 |
| 30533 | |
| 30534 | ; mode 33 [row 9 - second half] |
| 30535 | movu [r0 + 2003 * 16], m3 |
| 30536 | |
| 30537 | ; mode 30 [row 20] |
| 30538 | movu m6, [r5 + 17 * 16] |
| 30539 | pmaddubsw m3, m0, m6 |
| 30540 | pmulhrsw m3, m7 |
| 30541 | pmaddubsw m5, m2, m6 |
| 30542 | pmulhrsw m5, m7 |
| 30543 | packuswb m3, m5 |
| 30544 | movu [r0 + 1832 * 16], m3 |
| 30545 | |
| 30546 | ; mode 32 [row 12 - first half] |
| 30547 | movu [r0 + 1944 * 16], m3 |
| 30548 | |
| 30549 | pmaddubsw m3, m1, m6 |
| 30550 | pmulhrsw m3, m7 |
| 30551 | pmaddubsw m5, m4, m6 |
| 30552 | pmulhrsw m5, m7 |
| 30553 | packuswb m3, m5 |
| 30554 | movu [r0 + 1833 * 16], m3 |
| 30555 | |
| 30556 | ; mode 32 [row 12 - second half] |
| 30557 | movu [r0 + 1945 * 16], m3 |
| 30558 | |
| 30559 | ; mode 30 [row 21] |
| 30560 | movu m6, [r5 + 30 * 16] |
| 30561 | pmaddubsw m3, m0, m6 |
| 30562 | pmulhrsw m3, m7 |
| 30563 | pmaddubsw m5, m2, m6 |
| 30564 | pmulhrsw m5, m7 |
| 30565 | packuswb m3, m5 |
| 30566 | movu [r0 + 1834 * 16], m3 |
| 30567 | |
| 30568 | ; mode 33 [row 10 - first half] |
| 30569 | movu [r0 + 2004 * 16], m3 |
| 30570 | |
| 30571 | pmaddubsw m3, m1, m6 |
| 30572 | pmulhrsw m3, m7 |
| 30573 | pmaddubsw m5, m4, m6 |
| 30574 | pmulhrsw m5, m7 |
| 30575 | packuswb m3, m5 |
| 30576 | movu [r0 + 1835 * 16], m3 |
| 30577 | |
| 30578 | ; mode 33 [row 10 - second half] |
| 30579 | movu [r0 + 2005 * 16], m3 |
| 30580 | |
| 30581 | ; mode 31 [row 15] |
| 30582 | movu m6, [r5 + 16 * 16] |
| 30583 | pmaddubsw m3, m0, m6 |
| 30584 | pmulhrsw m3, m7 |
| 30585 | pmaddubsw m5, m2, m6 |
| 30586 | pmulhrsw m5, m7 |
| 30587 | packuswb m3, m5 |
| 30588 | movu [r0 + 1886 * 16], m3 |
| 30589 | pmaddubsw m3, m1, m6 |
| 30590 | pmulhrsw m3, m7 |
| 30591 | pmaddubsw m5, m4, m6 |
| 30592 | pmulhrsw m5, m7 |
| 30593 | packuswb m3, m5 |
| 30594 | movu [r0 + 1887 * 16], m3 |
| 30595 | |
| 30596 | ; mode 29 [row 31] |
| 30597 | movu m0, [r3 + 10] |
| 30598 | movd m1, [r3 + 11] |
| 30599 | palignr m1, m0, 1 |
| 30600 | punpcklbw m0, m1 |
| 30601 | movu m2, [r3 + 18] |
| 30602 | movd m3, [r3 + 19] |
| 30603 | palignr m3, m2, 1 |
| 30604 | punpcklbw m2, m3 |
| 30605 | movu m1, [r3 + 26] |
| 30606 | movd m3, [r3 + 27] |
| 30607 | palignr m3, m1, 1 |
| 30608 | punpcklbw m1, m3 |
| 30609 | movu m4, [r3 + 34] |
| 30610 | movd m5, [r3 + 35] |
| 30611 | palignr m5, m4, 1 |
| 30612 | punpcklbw m4, m5 |
| 30613 | |
| 30614 | pshufb m5, m0, [tab_S2] |
| 30615 | movh [r0 + 1790 * 16], m5 |
| 30616 | pshufb m5, m2, [tab_S2] |
| 30617 | movh [r0 + 1790 * 16 + 8], m5 |
| 30618 | pshufb m5, m1, [tab_S2] |
| 30619 | movh [r0 + 1791 * 16], m5 |
| 30620 | pshufb m5, m4, [tab_S2] |
| 30621 | movh [r0 + 1791 * 16 + 8], m5 |
| 30622 | |
| 30623 | ; mode 30 [row 22] |
| 30624 | movu m6, [r5 + 11 * 16] |
| 30625 | pmaddubsw m3, m0, m6 |
| 30626 | pmulhrsw m3, m7 |
| 30627 | pmaddubsw m5, m2, m6 |
| 30628 | pmulhrsw m5, m7 |
| 30629 | packuswb m3, m5 |
| 30630 | movu [r0 + 1836 * 16], m3 |
| 30631 | pmaddubsw m3, m1, m6 |
| 30632 | pmulhrsw m3, m7 |
| 30633 | pmaddubsw m5, m4, m6 |
| 30634 | pmulhrsw m5, m7 |
| 30635 | packuswb m3, m5 |
| 30636 | movu [r0 + 1837 * 16], m3 |
| 30637 | |
| 30638 | ; mode 30 [row 23] |
| 30639 | movu m6, [r5 + 24 * 16] |
| 30640 | pmaddubsw m3, m0, m6 |
| 30641 | pmulhrsw m3, m7 |
| 30642 | pmaddubsw m5, m2, m6 |
| 30643 | pmulhrsw m5, m7 |
| 30644 | packuswb m3, m5 |
| 30645 | movu [r0 + 1838 * 16], m3 |
| 30646 | |
| 30647 | ; mode 33 [row 11 - first half] |
| 30648 | movu [r0 + 2006 * 16], m3 |
| 30649 | |
| 30650 | pmaddubsw m3, m1, m6 |
| 30651 | pmulhrsw m3, m7 |
| 30652 | pmaddubsw m5, m4, m6 |
| 30653 | pmulhrsw m5, m7 |
| 30654 | packuswb m3, m5 |
| 30655 | movu [r0 + 1839 * 16], m3 |
| 30656 | |
| 30657 | ; mode 33 [row 11 - second half] |
| 30658 | movu [r0 + 2007 * 16], m3 |
| 30659 | |
| 30660 | ; mode 31 [row 16] |
| 30661 | movu m6, [r5 + 1 * 16] |
| 30662 | pmaddubsw m3, m0, m6 |
| 30663 | pmulhrsw m3, m7 |
| 30664 | pmaddubsw m5, m2, m6 |
| 30665 | pmulhrsw m5, m7 |
| 30666 | packuswb m3, m5 |
| 30667 | movu [r0 + 1888 * 16], m3 |
| 30668 | pmaddubsw m3, m1, m6 |
| 30669 | pmulhrsw m3, m7 |
| 30670 | pmaddubsw m5, m4, m6 |
| 30671 | pmulhrsw m5, m7 |
| 30672 | packuswb m3, m5 |
| 30673 | movu [r0 + 1889 * 16], m3 |
| 30674 | |
| 30675 | ; mode 31 [row 17] |
| 30676 | movu m6, [r5 + 18 * 16] |
| 30677 | pmaddubsw m3, m0, m6 |
| 30678 | pmulhrsw m3, m7 |
| 30679 | pmaddubsw m5, m2, m6 |
| 30680 | pmulhrsw m5, m7 |
| 30681 | packuswb m3, m5 |
| 30682 | movu [r0 + 1890 * 16], m3 |
| 30683 | pmaddubsw m3, m1, m6 |
| 30684 | pmulhrsw m3, m7 |
| 30685 | pmaddubsw m5, m4, m6 |
| 30686 | pmulhrsw m5, m7 |
| 30687 | packuswb m3, m5 |
| 30688 | movu [r0 + 1891 * 16], m3 |
| 30689 | |
| 30690 | ; mode 32 [row 13] |
| 30691 | movu m6, [r5 + 6 * 16] |
| 30692 | pmaddubsw m3, m0, m6 |
| 30693 | pmulhrsw m3, m7 |
| 30694 | pmaddubsw m5, m2, m6 |
| 30695 | pmulhrsw m5, m7 |
| 30696 | packuswb m3, m5 |
| 30697 | movu [r0 + 1946 * 16], m3 |
| 30698 | pmaddubsw m3, m1, m6 |
| 30699 | pmulhrsw m3, m7 |
| 30700 | pmaddubsw m5, m4, m6 |
| 30701 | pmulhrsw m5, m7 |
| 30702 | packuswb m3, m5 |
| 30703 | movu [r0 + 1947 * 16], m3 |
| 30704 | |
| 30705 | ; mode 32 [row 14] |
| 30706 | movu m6, [r5 + 27 * 16] |
| 30707 | pmaddubsw m3, m0, m6 |
| 30708 | pmulhrsw m3, m7 |
| 30709 | pmaddubsw m5, m2, m6 |
| 30710 | pmulhrsw m5, m7 |
| 30711 | packuswb m3, m5 |
| 30712 | movu [r0 + 1948 * 16], m3 |
| 30713 | pmaddubsw m3, m1, m6 |
| 30714 | pmulhrsw m3, m7 |
| 30715 | pmaddubsw m5, m4, m6 |
| 30716 | pmulhrsw m5, m7 |
| 30717 | packuswb m3, m5 |
| 30718 | movu [r0 + 1949 * 16], m3 |
| 30719 | |
| 30720 | ; mode 30 [row 24] |
| 30721 | movu m6, [r5 + 5 * 16] |
| 30722 | movu m0, [r3 + 11] |
| 30723 | movd m1, [r3 + 12] |
| 30724 | palignr m1, m0, 1 |
| 30725 | punpcklbw m0, m1 |
| 30726 | pmaddubsw m3, m0, m6 |
| 30727 | pmulhrsw m3, m7 |
| 30728 | movu m2, [r3 + 19] |
| 30729 | movd m4, [r3 + 20] |
| 30730 | palignr m4, m2, 1 |
| 30731 | punpcklbw m2, m4 |
| 30732 | pmaddubsw m5, m2, m6 |
| 30733 | pmulhrsw m5, m7 |
| 30734 | packuswb m3, m5 |
| 30735 | movu [r0 + 1840 * 16], m3 |
| 30736 | |
| 30737 | movu m1, [r3 + 27] |
| 30738 | movd m3, [r3 + 28] |
| 30739 | palignr m3, m1, 1 |
| 30740 | punpcklbw m1, m3 |
| 30741 | pmaddubsw m3, m1, m6 |
| 30742 | pmulhrsw m3, m7 |
| 30743 | movu m4, [r3 + 35] |
| 30744 | movd m5, [r3 + 36] |
| 30745 | palignr m5, m4, 1 |
| 30746 | punpcklbw m4, m5 |
| 30747 | pmaddubsw m5, m4, m6 |
| 30748 | pmulhrsw m5, m7 |
| 30749 | packuswb m3, m5 |
| 30750 | movu [r0 + 1841 * 16], m3 |
| 30751 | |
| 30752 | ; mode 30 [row 25] |
| 30753 | movu m6, [r5 + 18 * 16] |
| 30754 | pmaddubsw m3, m0, m6 |
| 30755 | pmulhrsw m3, m7 |
| 30756 | pmaddubsw m5, m2, m6 |
| 30757 | pmulhrsw m5, m7 |
| 30758 | packuswb m3, m5 |
| 30759 | movu [r0 + 1842 * 16], m3 |
| 30760 | |
| 30761 | ; mode 33 [row 12 - first half] |
| 30762 | movu [r0 + 2008 * 16], m3 |
| 30763 | |
| 30764 | pmaddubsw m3, m1, m6 |
| 30765 | pmulhrsw m3, m7 |
| 30766 | pmaddubsw m5, m4, m6 |
| 30767 | pmulhrsw m5, m7 |
| 30768 | packuswb m3, m5 |
| 30769 | movu [r0 + 1843 * 16], m3 |
| 30770 | |
| 30771 | ; mode 33 [row 12 - second half] |
| 30772 | movu [r0 + 2009 * 16], m3 |
| 30773 | |
| 30774 | ; mode 30 [row 26] |
| 30775 | movu m6, [r5 + 31 * 16] |
| 30776 | pmaddubsw m3, m0, m6 |
| 30777 | pmulhrsw m3, m7 |
| 30778 | pmaddubsw m5, m2, m6 |
| 30779 | pmulhrsw m5, m7 |
| 30780 | packuswb m3, m5 |
| 30781 | movu [r0 + 1844 * 16], m3 |
| 30782 | pmaddubsw m3, m1, m6 |
| 30783 | pmulhrsw m3, m7 |
| 30784 | pmaddubsw m5, m4, m6 |
| 30785 | pmulhrsw m5, m7 |
| 30786 | packuswb m3, m5 |
| 30787 | movu [r0 + 1845 * 16], m3 |
| 30788 | |
| 30789 | ; mode 31 [row 18] |
| 30790 | movu m6, [r5 + 3 * 16] |
| 30791 | pmaddubsw m3, m0, m6 |
| 30792 | pmulhrsw m3, m7 |
| 30793 | pmaddubsw m5, m2, m6 |
| 30794 | pmulhrsw m5, m7 |
| 30795 | packuswb m3, m5 |
| 30796 | movu [r0 + 1892 * 16], m3 |
| 30797 | pmaddubsw m3, m1, m6 |
| 30798 | pmulhrsw m3, m7 |
| 30799 | pmaddubsw m5, m4, m6 |
| 30800 | pmulhrsw m5, m7 |
| 30801 | packuswb m3, m5 |
| 30802 | movu [r0 + 1893 * 16], m3 |
| 30803 | |
| 30804 | ; mode 31 [row 19] |
| 30805 | movu m6, [r5 + 20 * 16] |
| 30806 | pmaddubsw m3, m0, m6 |
| 30807 | pmulhrsw m3, m7 |
| 30808 | pmaddubsw m5, m2, m6 |
| 30809 | pmulhrsw m5, m7 |
| 30810 | packuswb m3, m5 |
| 30811 | movu [r0 + 1894 * 16], m3 |
| 30812 | pmaddubsw m3, m1, m6 |
| 30813 | pmulhrsw m3, m7 |
| 30814 | pmaddubsw m5, m4, m6 |
| 30815 | pmulhrsw m5, m7 |
| 30816 | packuswb m3, m5 |
| 30817 | movu [r0 + 1895 * 16], m3 |
| 30818 | |
| 30819 | ; mode 32 [row 15] |
| 30820 | movu m6, [r5 + 16 * 16] |
| 30821 | pmaddubsw m3, m0, m6 |
| 30822 | pmulhrsw m3, m7 |
| 30823 | pmaddubsw m5, m2, m6 |
| 30824 | pmulhrsw m5, m7 |
| 30825 | packuswb m3, m5 |
| 30826 | movu [r0 + 1950 * 16], m3 |
| 30827 | pmaddubsw m3, m1, m6 |
| 30828 | pmulhrsw m3, m7 |
| 30829 | pmaddubsw m5, m4, m6 |
| 30830 | pmulhrsw m5, m7 |
| 30831 | packuswb m3, m5 |
| 30832 | movu [r0 + 1951 * 16], m3 |
| 30833 | |
| 30834 | ; mode 30 [row 27] |
| 30835 | movu m6, [r5 + 12 * 16] |
| 30836 | movu m0, [r3 + 12] |
| 30837 | movd m1, [r3 + 13] |
| 30838 | palignr m1, m0, 1 |
| 30839 | punpcklbw m0, m1 |
| 30840 | pmaddubsw m3, m0, m6 |
| 30841 | pmulhrsw m3, m7 |
| 30842 | movu m2, [r3 + 20] |
| 30843 | movd m4, [r3 + 21] |
| 30844 | palignr m4, m2, 1 |
| 30845 | punpcklbw m2, m4 |
| 30846 | pmaddubsw m5, m2, m6 |
| 30847 | pmulhrsw m5, m7 |
| 30848 | packuswb m3, m5 |
| 30849 | movu [r0 + 1846 * 16], m3 |
| 30850 | |
| 30851 | ; mode 33 [row 13 - first half] |
| 30852 | movu [r0 + 2010 * 16], m3 |
| 30853 | |
| 30854 | movu m1, [r3 + 28] |
| 30855 | movd m3, [r3 + 29] |
| 30856 | palignr m3, m1, 1 |
| 30857 | punpcklbw m1, m3 |
| 30858 | pmaddubsw m3, m1, m6 |
| 30859 | pmulhrsw m3, m7 |
| 30860 | movu m4, [r3 + 36] |
| 30861 | movd m5, [r3 + 37] |
| 30862 | palignr m5, m4, 1 |
| 30863 | punpcklbw m4, m5 |
| 30864 | pmaddubsw m5, m4, m6 |
| 30865 | pmulhrsw m5, m7 |
| 30866 | packuswb m3, m5 |
| 30867 | movu [r0 + 1847 * 16], m3 |
| 30868 | |
| 30869 | ; mode 33 [row 13 - second half] |
| 30870 | movu [r0 + 2011 * 16], m3 |
| 30871 | |
| 30872 | ; mode 30 [row 28] |
| 30873 | movu m6, [r5 + 25 * 16] |
| 30874 | pmaddubsw m3, m0, m6 |
| 30875 | pmulhrsw m3, m7 |
| 30876 | pmaddubsw m5, m2, m6 |
| 30877 | pmulhrsw m5, m7 |
| 30878 | packuswb m3, m5 |
| 30879 | movu [r0 + 1848 * 16], m3 |
| 30880 | pmaddubsw m3, m1, m6 |
| 30881 | pmulhrsw m3, m7 |
| 30882 | pmaddubsw m5, m4, m6 |
| 30883 | pmulhrsw m5, m7 |
| 30884 | packuswb m3, m5 |
| 30885 | movu [r0 + 1849 * 16], m3 |
| 30886 | |
| 30887 | ; mode 31 [row 20] |
| 30888 | movu m6, [r5 + 5 * 16] |
| 30889 | pmaddubsw m3, m0, m6 |
| 30890 | pmulhrsw m3, m7 |
| 30891 | pmaddubsw m5, m2, m6 |
| 30892 | pmulhrsw m5, m7 |
| 30893 | packuswb m3, m5 |
| 30894 | movu [r0 + 1896 * 16], m3 |
| 30895 | |
| 30896 | ; mode 32 [row 16 - first half] |
| 30897 | movu [r0 + 1952 * 16], m3 |
| 30898 | |
| 30899 | pmaddubsw m3, m1, m6 |
| 30900 | pmulhrsw m3, m7 |
| 30901 | pmaddubsw m5, m4, m6 |
| 30902 | pmulhrsw m5, m7 |
| 30903 | packuswb m3, m5 |
| 30904 | movu [r0 + 1897 * 16], m3 |
| 30905 | |
| 30906 | ; mode 32 [row 16 - second half] |
| 30907 | movu [r0 + 1953 * 16], m3 |
| 30908 | |
| 30909 | ; mode 31 [row 21] |
| 30910 | movu m6, [r5 + 22 * 16] |
| 30911 | pmaddubsw m3, m0, m6 |
| 30912 | pmulhrsw m3, m7 |
| 30913 | pmaddubsw m5, m2, m6 |
| 30914 | pmulhrsw m5, m7 |
| 30915 | packuswb m3, m5 |
| 30916 | movu [r0 + 1898 * 16], m3 |
| 30917 | pmaddubsw m3, m1, m6 |
| 30918 | pmulhrsw m3, m7 |
| 30919 | pmaddubsw m5, m4, m6 |
| 30920 | pmulhrsw m5, m7 |
| 30921 | packuswb m3, m5 |
| 30922 | movu [r0 + 1899 * 16], m3 |
| 30923 | |
| 30924 | ; mode 32 [row 17] |
| 30925 | movu m6, [r5 + 26 * 16] |
| 30926 | pmaddubsw m3, m0, m6 |
| 30927 | pmulhrsw m3, m7 |
| 30928 | pmaddubsw m5, m2, m6 |
| 30929 | pmulhrsw m5, m7 |
| 30930 | packuswb m3, m5 |
| 30931 | movu [r0 + 1954 * 16], m3 |
| 30932 | pmaddubsw m3, m1, m6 |
| 30933 | pmulhrsw m3, m7 |
| 30934 | pmaddubsw m5, m4, m6 |
| 30935 | pmulhrsw m5, m7 |
| 30936 | packuswb m3, m5 |
| 30937 | movu [r0 + 1955 * 16], m3 |
| 30938 | |
| 30939 | ; mode 30 [row 29] |
| 30940 | movu m6, [r5 + 6 * 16] |
| 30941 | movu m0, [r3 + 13] |
| 30942 | movd m1, [r3 + 14] |
| 30943 | palignr m1, m0, 1 |
| 30944 | punpcklbw m0, m1 |
| 30945 | pmaddubsw m3, m0, m6 |
| 30946 | pmulhrsw m3, m7 |
| 30947 | movu m2, [r3 + 21] |
| 30948 | movd m4, [r3 + 22] |
| 30949 | palignr m4, m2, 1 |
| 30950 | punpcklbw m2, m4 |
| 30951 | pmaddubsw m5, m2, m6 |
| 30952 | pmulhrsw m5, m7 |
| 30953 | packuswb m3, m5 |
| 30954 | movu [r0 + 1850 * 16], m3 |
| 30955 | |
| 30956 | ; mode 33 [row 14 - first half] |
| 30957 | movu [r0 + 2012 * 16], m3 |
| 30958 | |
| 30959 | movu m1, [r3 + 29] |
| 30960 | movd m3, [r3 + 30] |
| 30961 | palignr m3, m1, 1 |
| 30962 | punpcklbw m1, m3 |
| 30963 | pmaddubsw m3, m1, m6 |
| 30964 | pmulhrsw m3, m7 |
| 30965 | movu m4, [r3 + 37] |
| 30966 | movd m5, [r3 + 38] |
| 30967 | palignr m5, m4, 1 |
| 30968 | punpcklbw m4, m5 |
| 30969 | pmaddubsw m5, m4, m6 |
| 30970 | pmulhrsw m5, m7 |
| 30971 | packuswb m3, m5 |
| 30972 | movu [r0 + 1851 * 16], m3 |
| 30973 | |
| 30974 | ; mode 33 [row 14 - second half] |
| 30975 | movu [r0 + 2013 * 16], m3 |
| 30976 | |
| 30977 | ; mode 30 [row 30] |
| 30978 | movu m6, [r5 + 19 * 16] |
| 30979 | pmaddubsw m3, m0, m6 |
| 30980 | pmulhrsw m3, m7 |
| 30981 | pmaddubsw m5, m2, m6 |
| 30982 | pmulhrsw m5, m7 |
| 30983 | packuswb m3, m5 |
| 30984 | movu [r0 + 1852 * 16], m3 |
| 30985 | pmaddubsw m3, m1, m6 |
| 30986 | pmulhrsw m3, m7 |
| 30987 | pmaddubsw m5, m4, m6 |
| 30988 | pmulhrsw m5, m7 |
| 30989 | packuswb m3, m5 |
| 30990 | movu [r0 + 1853 * 16], m3 |
| 30991 | |
| 30992 | ; mode 31 [row 22] |
| 30993 | movu m6, [r5 + 7 * 16] |
| 30994 | pmaddubsw m3, m0, m6 |
| 30995 | pmulhrsw m3, m7 |
| 30996 | pmaddubsw m5, m2, m6 |
| 30997 | pmulhrsw m5, m7 |
| 30998 | packuswb m3, m5 |
| 30999 | movu [r0 + 1900 * 16], m3 |
| 31000 | pmaddubsw m3, m1, m6 |
| 31001 | pmulhrsw m3, m7 |
| 31002 | pmaddubsw m5, m4, m6 |
| 31003 | pmulhrsw m5, m7 |
| 31004 | packuswb m3, m5 |
| 31005 | movu [r0 + 1901 * 16], m3 |
| 31006 | |
| 31007 | ; mode 31 [row 23] |
| 31008 | movu m6, [r5 + 24 * 16] |
| 31009 | pmaddubsw m3, m0, m6 |
| 31010 | pmulhrsw m3, m7 |
| 31011 | pmaddubsw m5, m2, m6 |
| 31012 | pmulhrsw m5, m7 |
| 31013 | packuswb m3, m5 |
| 31014 | movu [r0 + 1902 * 16], m3 |
| 31015 | pmaddubsw m3, m1, m6 |
| 31016 | pmulhrsw m3, m7 |
| 31017 | pmaddubsw m5, m4, m6 |
| 31018 | pmulhrsw m5, m7 |
| 31019 | packuswb m3, m5 |
| 31020 | movu [r0 + 1903 * 16], m3 |
| 31021 | |
| 31022 | ; mode 32 [row 18] |
| 31023 | movu m6, [r5 + 15 * 16] |
| 31024 | pmaddubsw m3, m0, m6 |
| 31025 | pmulhrsw m3, m7 |
| 31026 | pmaddubsw m5, m2, m6 |
| 31027 | pmulhrsw m5, m7 |
| 31028 | packuswb m3, m5 |
| 31029 | movu [r0 + 1956 * 16], m3 |
| 31030 | pmaddubsw m3, m1, m6 |
| 31031 | pmulhrsw m3, m7 |
| 31032 | pmaddubsw m5, m4, m6 |
| 31033 | pmulhrsw m5, m7 |
| 31034 | packuswb m3, m5 |
| 31035 | movu [r0 + 1957 * 16], m3 |
| 31036 | |
| 31037 | ; mode 30 [row 31] |
| 31038 | movu m0, [r3 + 14] |
| 31039 | movd m1, [r3 + 15] |
| 31040 | palignr m1, m0, 1 |
| 31041 | punpcklbw m0, m1 |
| 31042 | movu m2, [r3 + 22] |
| 31043 | movd m3, [r3 + 23] |
| 31044 | palignr m3, m2, 1 |
| 31045 | punpcklbw m2, m3 |
| 31046 | movu m1, [r3 + 30] |
| 31047 | movd m3, [r3 + 31] |
| 31048 | palignr m3, m1, 1 |
| 31049 | punpcklbw m1, m3 |
| 31050 | movu m4, [r3 + 38] |
| 31051 | movd m5, [r3 + 39] |
| 31052 | palignr m5, m4, 1 |
| 31053 | punpcklbw m4, m5 |
| 31054 | |
| 31055 | pshufb m5, m0, [tab_S2] |
| 31056 | movh [r0 + 1854 * 16], m5 |
| 31057 | |
| 31058 | ; mode 33 [row 15 - first eight] |
| 31059 | movh [r0 + 2014 * 16], m5 |
| 31060 | |
| 31061 | pshufb m5, m2, [tab_S2] |
| 31062 | movh [r0 + 1854 * 16 + 8], m5 |
| 31063 | |
| 31064 | ; mode 33 [row 15 - second eight] |
| 31065 | movh [r0 + 2014 * 16 + 8], m5 |
| 31066 | |
| 31067 | pshufb m5, m1, [tab_S2] |
| 31068 | movh [r0 + 1855 * 16], m5 |
| 31069 | |
| 31070 | ; mode 33 [row 15 - third eight] |
| 31071 | movh [r0 + 2015 * 16], m5 |
| 31072 | |
| 31073 | pshufb m5, m4, [tab_S2] |
| 31074 | movh [r0 + 1855 * 16 + 8], m5 |
| 31075 | |
| 31076 | ; mode 33 [row 15 - fourth eight] |
| 31077 | movh [r0 + 2015 * 16 + 8], m5 |
| 31078 | |
| 31079 | ; mode 31 [row 24] |
| 31080 | movu m6, [r5 + 9 * 16] |
| 31081 | pmaddubsw m3, m0, m6 |
| 31082 | pmulhrsw m3, m7 |
| 31083 | pmaddubsw m5, m2, m6 |
| 31084 | pmulhrsw m5, m7 |
| 31085 | packuswb m3, m5 |
| 31086 | movu [r0 + 1904 * 16], m3 |
| 31087 | pmaddubsw m3, m1, m6 |
| 31088 | pmulhrsw m3, m7 |
| 31089 | pmaddubsw m5, m4, m6 |
| 31090 | pmulhrsw m5, m7 |
| 31091 | packuswb m3, m5 |
| 31092 | movu [r0 + 1905 * 16], m3 |
| 31093 | |
| 31094 | ; mode 31 [row 25] |
| 31095 | movu m6, [r5 + 26 * 16] |
| 31096 | pmaddubsw m3, m0, m6 |
| 31097 | pmulhrsw m3, m7 |
| 31098 | pmaddubsw m5, m2, m6 |
| 31099 | pmulhrsw m5, m7 |
| 31100 | packuswb m3, m5 |
| 31101 | movu [r0 + 1906 * 16], m3 |
| 31102 | |
| 31103 | ; mode 33 [row 16 - first half] |
| 31104 | movu [r0 + 2016 * 16], m3 |
| 31105 | |
| 31106 | pmaddubsw m3, m1, m6 |
| 31107 | pmulhrsw m3, m7 |
| 31108 | pmaddubsw m5, m4, m6 |
| 31109 | pmulhrsw m5, m7 |
| 31110 | packuswb m3, m5 |
| 31111 | movu [r0 + 1907 * 16], m3 |
| 31112 | |
| 31113 | ; mode 33 [row 16 - second half] |
| 31114 | movu [r0 + 2017 * 16], m3 |
| 31115 | |
| 31116 | ; mode 32 [row 19] |
| 31117 | movu m6, [r5 + 4 * 16] |
| 31118 | pmaddubsw m3, m0, m6 |
| 31119 | pmulhrsw m3, m7 |
| 31120 | pmaddubsw m5, m2, m6 |
| 31121 | pmulhrsw m5, m7 |
| 31122 | packuswb m3, m5 |
| 31123 | movu [r0 + 1958 * 16], m3 |
| 31124 | pmaddubsw m3, m1, m6 |
| 31125 | pmulhrsw m3, m7 |
| 31126 | pmaddubsw m5, m4, m6 |
| 31127 | pmulhrsw m5, m7 |
| 31128 | packuswb m3, m5 |
| 31129 | movu [r0 + 1959 * 16], m3 |
| 31130 | |
| 31131 | ; mode 32 [row 20] |
| 31132 | movu m6, [r5 + 25 * 16] |
| 31133 | pmaddubsw m3, m0, m6 |
| 31134 | pmulhrsw m3, m7 |
| 31135 | pmaddubsw m5, m2, m6 |
| 31136 | pmulhrsw m5, m7 |
| 31137 | packuswb m3, m5 |
| 31138 | movu [r0 + 1960 * 16], m3 |
| 31139 | pmaddubsw m3, m1, m6 |
| 31140 | pmulhrsw m3, m7 |
| 31141 | pmaddubsw m5, m4, m6 |
| 31142 | pmulhrsw m5, m7 |
| 31143 | packuswb m3, m5 |
| 31144 | movu [r0 + 1961 * 16], m3 |
| 31145 | |
| 31146 | ; mode 31 [row 26] |
| 31147 | movu m6, [r5 + 11 * 16] |
| 31148 | movu m0, [r3 + 15] |
| 31149 | movd m1, [r3 + 16] |
| 31150 | palignr m1, m0, 1 |
| 31151 | punpcklbw m0, m1 |
| 31152 | pmaddubsw m3, m0, m6 |
| 31153 | pmulhrsw m3, m7 |
| 31154 | movu m2, [r3 + 23] |
| 31155 | movd m4, [r3 + 24] |
| 31156 | palignr m4, m2, 1 |
| 31157 | punpcklbw m2, m4 |
| 31158 | pmaddubsw m5, m2, m6 |
| 31159 | pmulhrsw m5, m7 |
| 31160 | packuswb m3, m5 |
| 31161 | movu [r0 + 1908 * 16], m3 |
| 31162 | |
| 31163 | movu m1, [r3 + 31] |
| 31164 | movd m3, [r3 + 32] |
| 31165 | palignr m3, m1, 1 |
| 31166 | punpcklbw m1, m3 |
| 31167 | pmaddubsw m3, m1, m6 |
| 31168 | pmulhrsw m3, m7 |
| 31169 | movu m4, [r3 + 39] |
| 31170 | movd m5, [r3 + 40] |
| 31171 | palignr m5, m4, 1 |
| 31172 | punpcklbw m4, m5 |
| 31173 | pmaddubsw m5, m4, m6 |
| 31174 | pmulhrsw m5, m7 |
| 31175 | packuswb m3, m5 |
| 31176 | movu [r0 + 1909 * 16], m3 |
| 31177 | |
| 31178 | ; mode 31 [row 27] |
| 31179 | movu m6, [r5 + 28 * 16] |
| 31180 | pmaddubsw m3, m0, m6 |
| 31181 | pmulhrsw m3, m7 |
| 31182 | pmaddubsw m5, m2, m6 |
| 31183 | pmulhrsw m5, m7 |
| 31184 | packuswb m3, m5 |
| 31185 | movu [r0 + 1910 * 16], m3 |
| 31186 | pmaddubsw m3, m1, m6 |
| 31187 | pmulhrsw m3, m7 |
| 31188 | pmaddubsw m5, m4, m6 |
| 31189 | pmulhrsw m5, m7 |
| 31190 | packuswb m3, m5 |
| 31191 | movu [r0 + 1911 * 16], m3 |
| 31192 | |
| 31193 | ; mode 32 [row 21] |
| 31194 | movu m6, [r5 + 14 * 16] |
| 31195 | pmaddubsw m3, m0, m6 |
| 31196 | pmulhrsw m3, m7 |
| 31197 | pmaddubsw m5, m2, m6 |
| 31198 | pmulhrsw m5, m7 |
| 31199 | packuswb m3, m5 |
| 31200 | movu [r0 + 1962 * 16], m3 |
| 31201 | pmaddubsw m3, m1, m6 |
| 31202 | pmulhrsw m3, m7 |
| 31203 | pmaddubsw m5, m4, m6 |
| 31204 | pmulhrsw m5, m7 |
| 31205 | packuswb m3, m5 |
| 31206 | movu [r0 + 1963 * 16], m3 |
| 31207 | |
| 31208 | ; mode 33 [row 17] |
| 31209 | movu m6, [r5 + 20 * 16] |
| 31210 | pmaddubsw m3, m0, m6 |
| 31211 | pmulhrsw m3, m7 |
| 31212 | pmaddubsw m5, m2, m6 |
| 31213 | pmulhrsw m5, m7 |
| 31214 | packuswb m3, m5 |
| 31215 | movu [r0 + 2018 * 16], m3 |
| 31216 | pmaddubsw m3, m1, m6 |
| 31217 | pmulhrsw m3, m7 |
| 31218 | pmaddubsw m5, m4, m6 |
| 31219 | pmulhrsw m5, m7 |
| 31220 | packuswb m3, m5 |
| 31221 | movu [r0 + 2019 * 16], m3 |
| 31222 | |
| 31223 | ; mode 31 [row 28] |
| 31224 | movu m6, [r5 + 13 * 16] |
| 31225 | movu m0, [r3 + 16] |
| 31226 | movd m1, [r3 + 17] |
| 31227 | palignr m1, m0, 1 |
| 31228 | punpcklbw m0, m1 |
| 31229 | pmaddubsw m3, m0, m6 |
| 31230 | pmulhrsw m3, m7 |
| 31231 | movu m2, [r3 + 24] |
| 31232 | movd m4, [r3 + 25] |
| 31233 | palignr m4, m2, 1 |
| 31234 | punpcklbw m2, m4 |
| 31235 | pmaddubsw m5, m2, m6 |
| 31236 | pmulhrsw m5, m7 |
| 31237 | packuswb m3, m5 |
| 31238 | movu [r0 + 1912 * 16], m3 |
| 31239 | |
| 31240 | movu m1, [r3 + 32] |
| 31241 | movd m3, [r3 + 33] |
| 31242 | palignr m3, m1, 1 |
| 31243 | punpcklbw m1, m3 |
| 31244 | pmaddubsw m3, m1, m6 |
| 31245 | pmulhrsw m3, m7 |
| 31246 | movu m4, [r3 + 40] |
| 31247 | movd m5, [r3 + 41] |
| 31248 | palignr m5, m4, 1 |
| 31249 | punpcklbw m4, m5 |
| 31250 | pmaddubsw m5, m4, m6 |
| 31251 | pmulhrsw m5, m7 |
| 31252 | packuswb m3, m5 |
| 31253 | movu [r0 + 1913 * 16], m3 |
| 31254 | |
| 31255 | ; mode 31 [row 29] |
| 31256 | movu m6, [r5 + 30 * 16] |
| 31257 | pmaddubsw m3, m0, m6 |
| 31258 | pmulhrsw m3, m7 |
| 31259 | pmaddubsw m5, m2, m6 |
| 31260 | pmulhrsw m5, m7 |
| 31261 | packuswb m3, m5 |
| 31262 | movu [r0 + 1914 * 16], m3 |
| 31263 | pmaddubsw m3, m1, m6 |
| 31264 | pmulhrsw m3, m7 |
| 31265 | pmaddubsw m5, m4, m6 |
| 31266 | pmulhrsw m5, m7 |
| 31267 | packuswb m3, m5 |
| 31268 | movu [r0 + 1915 * 16], m3 |
| 31269 | |
| 31270 | ; mode 32 [row 22] |
| 31271 | movu m6, [r5 + 3 * 16] |
| 31272 | pmaddubsw m3, m0, m6 |
| 31273 | pmulhrsw m3, m7 |
| 31274 | pmaddubsw m5, m2, m6 |
| 31275 | pmulhrsw m5, m7 |
| 31276 | packuswb m3, m5 |
| 31277 | movu [r0 + 1964 * 16], m3 |
| 31278 | pmaddubsw m3, m1, m6 |
| 31279 | pmulhrsw m3, m7 |
| 31280 | pmaddubsw m5, m4, m6 |
| 31281 | pmulhrsw m5, m7 |
| 31282 | packuswb m3, m5 |
| 31283 | movu [r0 + 1965 * 16], m3 |
| 31284 | |
| 31285 | ; mode 32 [row 23] |
| 31286 | movu m6, [r5 + 24 * 16] |
| 31287 | pmaddubsw m3, m0, m6 |
| 31288 | pmulhrsw m3, m7 |
| 31289 | pmaddubsw m5, m2, m6 |
| 31290 | pmulhrsw m5, m7 |
| 31291 | packuswb m3, m5 |
| 31292 | movu [r0 + 1966 * 16], m3 |
| 31293 | pmaddubsw m3, m1, m6 |
| 31294 | pmulhrsw m3, m7 |
| 31295 | pmaddubsw m5, m4, m6 |
| 31296 | pmulhrsw m5, m7 |
| 31297 | packuswb m3, m5 |
| 31298 | movu [r0 + 1967 * 16], m3 |
| 31299 | |
| 31300 | ; mode 33 [row 18] |
| 31301 | movu m6, [r5 + 14 * 16] |
| 31302 | pmaddubsw m3, m0, m6 |
| 31303 | pmulhrsw m3, m7 |
| 31304 | pmaddubsw m5, m2, m6 |
| 31305 | pmulhrsw m5, m7 |
| 31306 | packuswb m3, m5 |
| 31307 | movu [r0 + 2020 * 16], m3 |
| 31308 | pmaddubsw m3, m1, m6 |
| 31309 | pmulhrsw m3, m7 |
| 31310 | pmaddubsw m5, m4, m6 |
| 31311 | pmulhrsw m5, m7 |
| 31312 | packuswb m3, m5 |
| 31313 | movu [r0 + 2021 * 16], m3 |
| 31314 | |
| 31315 | ; mode 31 [row 30] |
| 31316 | movu m6, [r5 + 15 * 16] |
| 31317 | movu m0, [r3 + 17] |
| 31318 | movd m1, [r3 + 18] |
| 31319 | palignr m1, m0, 1 |
| 31320 | punpcklbw m0, m1 |
| 31321 | pmaddubsw m3, m0, m6 |
| 31322 | pmulhrsw m3, m7 |
| 31323 | movu m2, [r3 + 25] |
| 31324 | movd m4, [r3 + 26] |
| 31325 | palignr m4, m2, 1 |
| 31326 | punpcklbw m2, m4 |
| 31327 | pmaddubsw m5, m2, m6 |
| 31328 | pmulhrsw m5, m7 |
| 31329 | packuswb m3, m5 |
| 31330 | movu [r0 + 1916 * 16], m3 |
| 31331 | |
| 31332 | movu m1, [r3 + 33] |
| 31333 | movd m3, [r3 + 34] |
| 31334 | palignr m3, m1, 1 |
| 31335 | punpcklbw m1, m3 |
| 31336 | pmaddubsw m3, m1, m6 |
| 31337 | pmulhrsw m3, m7 |
| 31338 | movu m4, [r3 + 41] |
| 31339 | movd m5, [r3 + 42] |
| 31340 | palignr m5, m4, 1 |
| 31341 | punpcklbw m4, m5 |
| 31342 | pmaddubsw m5, m4, m6 |
| 31343 | pmulhrsw m5, m7 |
| 31344 | packuswb m3, m5 |
| 31345 | movu [r0 + 1917 * 16], m3 |
| 31346 | |
| 31347 | ; mode 32 [row 24] |
| 31348 | movu m6, [r5 + 13 * 16] |
| 31349 | pmaddubsw m3, m0, m6 |
| 31350 | pmulhrsw m3, m7 |
| 31351 | pmaddubsw m5, m2, m6 |
| 31352 | pmulhrsw m5, m7 |
| 31353 | packuswb m3, m5 |
| 31354 | movu [r0 + 1968 * 16], m3 |
| 31355 | pmaddubsw m3, m1, m6 |
| 31356 | pmulhrsw m3, m7 |
| 31357 | pmaddubsw m5, m4, m6 |
| 31358 | pmulhrsw m5, m7 |
| 31359 | packuswb m3, m5 |
| 31360 | movu [r0 + 1969 * 16], m3 |
| 31361 | |
| 31362 | ; mode 33 [row 19] |
| 31363 | movu m6, [r5 + 8 * 16] |
| 31364 | pmaddubsw m3, m0, m6 |
| 31365 | pmulhrsw m3, m7 |
| 31366 | pmaddubsw m5, m2, m6 |
| 31367 | pmulhrsw m5, m7 |
| 31368 | packuswb m3, m5 |
| 31369 | movu [r0 + 2022 * 16], m3 |
| 31370 | pmaddubsw m3, m1, m6 |
| 31371 | pmulhrsw m3, m7 |
| 31372 | pmaddubsw m5, m4, m6 |
| 31373 | pmulhrsw m5, m7 |
| 31374 | packuswb m3, m5 |
| 31375 | movu [r0 + 2023 * 16], m3 |
| 31376 | |
| 31377 | ; mode 31 [row 31] |
| 31378 | movu m0, [r3 + 18] |
| 31379 | movd m1, [r3 + 19] |
| 31380 | palignr m1, m0, 1 |
| 31381 | punpcklbw m0, m1 |
| 31382 | movu m2, [r3 + 26] |
| 31383 | movd m3, [r3 + 27] |
| 31384 | palignr m3, m2, 1 |
| 31385 | punpcklbw m2, m3 |
| 31386 | movu m1, [r3 + 34] |
| 31387 | movd m3, [r3 + 35] |
| 31388 | palignr m3, m1, 1 |
| 31389 | punpcklbw m1, m3 |
| 31390 | movu m4, [r3 + 42] |
| 31391 | movd m5, [r3 + 43] |
| 31392 | palignr m5, m4, 1 |
| 31393 | punpcklbw m4, m5 |
| 31394 | |
| 31395 | pshufb m5, m0, [tab_S2] |
| 31396 | movh [r0 + 1918 * 16], m5 |
| 31397 | pshufb m5, m2, [tab_S2] |
| 31398 | movh [r0 + 1918 * 16 + 8], m5 |
| 31399 | pshufb m5, m1, [tab_S2] |
| 31400 | movh [r0 + 1919 * 16], m5 |
| 31401 | pshufb m5, m4, [tab_S2] |
| 31402 | movh [r0 + 1919 * 16 + 8], m5 |
| 31403 | |
| 31404 | ; mode 32 [row 25] |
| 31405 | movu m6, [r5 + 2 * 16] |
| 31406 | pmaddubsw m3, m0, m6 |
| 31407 | pmulhrsw m3, m7 |
| 31408 | pmaddubsw m5, m2, m6 |
| 31409 | pmulhrsw m5, m7 |
| 31410 | packuswb m3, m5 |
| 31411 | movu [r0 + 1970 * 16], m3 |
| 31412 | |
| 31413 | ; mode 33 [row 20 - first half] |
| 31414 | movu [r0 + 2024 * 16], m3 |
| 31415 | |
| 31416 | pmaddubsw m3, m1, m6 |
| 31417 | pmulhrsw m3, m7 |
| 31418 | pmaddubsw m5, m4, m6 |
| 31419 | pmulhrsw m5, m7 |
| 31420 | packuswb m3, m5 |
| 31421 | movu [r0 + 1971 * 16], m3 |
| 31422 | |
| 31423 | ; mode 33 [row 20 - second half] |
| 31424 | movu [r0 + 2025 * 16], m3 |
| 31425 | |
| 31426 | ; mode 32 [row 26] |
| 31427 | movu m6, [r5 + 23 * 16] |
| 31428 | pmaddubsw m3, m0, m6 |
| 31429 | pmulhrsw m3, m7 |
| 31430 | pmaddubsw m5, m2, m6 |
| 31431 | pmulhrsw m5, m7 |
| 31432 | packuswb m3, m5 |
| 31433 | movu [r0 + 1972 * 16], m3 |
| 31434 | pmaddubsw m3, m1, m6 |
| 31435 | pmulhrsw m3, m7 |
| 31436 | pmaddubsw m5, m4, m6 |
| 31437 | pmulhrsw m5, m7 |
| 31438 | packuswb m3, m5 |
| 31439 | movu [r0 + 1973 * 16], m3 |
| 31440 | |
| 31441 | ; mode 33 [row 21] |
| 31442 | movu m6, [r5 + 28 * 16] |
| 31443 | pmaddubsw m3, m0, m6 |
| 31444 | pmulhrsw m3, m7 |
| 31445 | pmaddubsw m5, m2, m6 |
| 31446 | pmulhrsw m5, m7 |
| 31447 | packuswb m3, m5 |
| 31448 | movu [r0 + 2026 * 16], m3 |
| 31449 | pmaddubsw m3, m1, m6 |
| 31450 | pmulhrsw m3, m7 |
| 31451 | pmaddubsw m5, m4, m6 |
| 31452 | pmulhrsw m5, m7 |
| 31453 | packuswb m3, m5 |
| 31454 | movu [r0 + 2027 * 16], m3 |
| 31455 | |
| 31456 | ; mode 32 [row 27] |
| 31457 | movu m6, [r5 + 12 * 16] |
| 31458 | movu m0, [r3 + 19] |
| 31459 | movd m1, [r3 + 20] |
| 31460 | palignr m1, m0, 1 |
| 31461 | punpcklbw m0, m1 |
| 31462 | pmaddubsw m3, m0, m6 |
| 31463 | pmulhrsw m3, m7 |
| 31464 | movu m2, [r3 + 27] |
| 31465 | movd m4, [r3 + 28] |
| 31466 | palignr m4, m2, 1 |
| 31467 | punpcklbw m2, m4 |
| 31468 | pmaddubsw m5, m2, m6 |
| 31469 | pmulhrsw m5, m7 |
| 31470 | packuswb m3, m5 |
| 31471 | movu [r0 + 1974 * 16], m3 |
| 31472 | |
| 31473 | movu m1, [r3 + 35] |
| 31474 | movd m3, [r3 + 36] |
| 31475 | palignr m3, m1, 1 |
| 31476 | punpcklbw m1, m3 |
| 31477 | pmaddubsw m3, m1, m6 |
| 31478 | pmulhrsw m3, m7 |
| 31479 | movu m4, [r3 + 43] |
| 31480 | movd m5, [r3 + 44] |
| 31481 | palignr m5, m4, 1 |
| 31482 | punpcklbw m4, m5 |
| 31483 | pmaddubsw m5, m4, m6 |
| 31484 | pmulhrsw m5, m7 |
| 31485 | packuswb m3, m5 |
| 31486 | movu [r0 + 1975 * 16], m3 |
| 31487 | |
| 31488 | ; mode 33 [row 22] |
| 31489 | movu m6, [r5 + 22 * 16] |
| 31490 | pmaddubsw m3, m0, m6 |
| 31491 | pmulhrsw m3, m7 |
| 31492 | pmaddubsw m5, m2, m6 |
| 31493 | pmulhrsw m5, m7 |
| 31494 | packuswb m3, m5 |
| 31495 | movu [r0 + 2028 * 16], m3 |
| 31496 | pmaddubsw m3, m1, m6 |
| 31497 | pmulhrsw m3, m7 |
| 31498 | pmaddubsw m5, m4, m6 |
| 31499 | pmulhrsw m5, m7 |
| 31500 | packuswb m3, m5 |
| 31501 | movu [r0 + 2029 * 16], m3 |
| 31502 | |
| 31503 | ; mode 32 [row 28] |
| 31504 | movu m6, [r5 + 1 * 16] |
| 31505 | movu m0, [r3 + 20] |
| 31506 | movd m1, [r3 + 21] |
| 31507 | palignr m1, m0, 1 |
| 31508 | punpcklbw m0, m1 |
| 31509 | pmaddubsw m3, m0, m6 |
| 31510 | pmulhrsw m3, m7 |
| 31511 | movu m2, [r3 + 28] |
| 31512 | movd m4, [r3 + 29] |
| 31513 | palignr m4, m2, 1 |
| 31514 | punpcklbw m2, m4 |
| 31515 | pmaddubsw m5, m2, m6 |
| 31516 | pmulhrsw m5, m7 |
| 31517 | packuswb m3, m5 |
| 31518 | movu [r0 + 1976 * 16], m3 |
| 31519 | |
| 31520 | movu m1, [r3 + 36] |
| 31521 | movd m3, [r3 + 37] |
| 31522 | palignr m3, m1, 1 |
| 31523 | punpcklbw m1, m3 |
| 31524 | pmaddubsw m3, m1, m6 |
| 31525 | pmulhrsw m3, m7 |
| 31526 | movu m4, [r3 + 44] |
| 31527 | movd m5, [r3 + 45] |
| 31528 | palignr m5, m4, 1 |
| 31529 | punpcklbw m4, m5 |
| 31530 | pmaddubsw m5, m4, m6 |
| 31531 | pmulhrsw m5, m7 |
| 31532 | packuswb m3, m5 |
| 31533 | movu [r0 + 1977 * 16], m3 |
| 31534 | |
| 31535 | ; mode 32 [row 29] |
| 31536 | movu m6, [r5 + 22 * 16] |
| 31537 | pmaddubsw m3, m0, m6 |
| 31538 | pmulhrsw m3, m7 |
| 31539 | pmaddubsw m5, m2, m6 |
| 31540 | pmulhrsw m5, m7 |
| 31541 | packuswb m3, m5 |
| 31542 | movu [r0 + 1978 * 16], m3 |
| 31543 | pmaddubsw m3, m1, m6 |
| 31544 | pmulhrsw m3, m7 |
| 31545 | pmaddubsw m5, m4, m6 |
| 31546 | pmulhrsw m5, m7 |
| 31547 | packuswb m3, m5 |
| 31548 | movu [r0 + 1979 * 16], m3 |
| 31549 | |
| 31550 | ; mode 33 [row 23] |
| 31551 | movu m6, [r5 + 16 * 16] |
| 31552 | pmaddubsw m3, m0, m6 |
| 31553 | pmulhrsw m3, m7 |
| 31554 | pmaddubsw m5, m2, m6 |
| 31555 | pmulhrsw m5, m7 |
| 31556 | packuswb m3, m5 |
| 31557 | movu [r0 + 2030 * 16], m3 |
| 31558 | pmaddubsw m3, m1, m6 |
| 31559 | pmulhrsw m3, m7 |
| 31560 | pmaddubsw m5, m4, m6 |
| 31561 | pmulhrsw m5, m7 |
| 31562 | packuswb m3, m5 |
| 31563 | movu [r0 + 2031 * 16], m3 |
| 31564 | |
| 31565 | ; mode 32 [row 30] |
| 31566 | movu m6, [r5 + 11 * 16] |
| 31567 | movu m0, [r3 + 21] |
| 31568 | movd m1, [r3 + 22] |
| 31569 | palignr m1, m0, 1 |
| 31570 | punpcklbw m0, m1 |
| 31571 | pmaddubsw m3, m0, m6 |
| 31572 | pmulhrsw m3, m7 |
| 31573 | movu m2, [r3 + 29] |
| 31574 | movd m4, [r3 + 30] |
| 31575 | palignr m4, m2, 1 |
| 31576 | punpcklbw m2, m4 |
| 31577 | pmaddubsw m5, m2, m6 |
| 31578 | pmulhrsw m5, m7 |
| 31579 | packuswb m3, m5 |
| 31580 | movu [r0 + 1980 * 16], m3 |
| 31581 | |
| 31582 | movu m1, [r3 + 37] |
| 31583 | movd m3, [r3 + 38] |
| 31584 | palignr m3, m1, 1 |
| 31585 | punpcklbw m1, m3 |
| 31586 | pmaddubsw m3, m1, m6 |
| 31587 | pmulhrsw m3, m7 |
| 31588 | movu m4, [r3 + 45] |
| 31589 | movd m5, [r3 + 46] |
| 31590 | palignr m5, m4, 1 |
| 31591 | punpcklbw m4, m5 |
| 31592 | pmaddubsw m5, m4, m6 |
| 31593 | pmulhrsw m5, m7 |
| 31594 | packuswb m3, m5 |
| 31595 | movu [r0 + 1981 * 16], m3 |
| 31596 | |
| 31597 | ; mode 33 [row 24] |
| 31598 | movu m6, [r5 + 10 * 16] |
| 31599 | pmaddubsw m3, m0, m6 |
| 31600 | pmulhrsw m3, m7 |
| 31601 | pmaddubsw m5, m2, m6 |
| 31602 | pmulhrsw m5, m7 |
| 31603 | packuswb m3, m5 |
| 31604 | movu [r0 + 2032 * 16], m3 |
| 31605 | pmaddubsw m3, m1, m6 |
| 31606 | pmulhrsw m3, m7 |
| 31607 | pmaddubsw m5, m4, m6 |
| 31608 | pmulhrsw m5, m7 |
| 31609 | packuswb m3, m5 |
| 31610 | movu [r0 + 2033 * 16], m3 |
| 31611 | |
| 31612 | ; mode 32 [row 31] |
| 31613 | movu m0, [r3 + 22] |
| 31614 | movd m1, [r3 + 23] |
| 31615 | palignr m1, m0, 1 |
| 31616 | punpcklbw m0, m1 |
| 31617 | movu m2, [r3 + 30] |
| 31618 | movd m3, [r3 + 31] |
| 31619 | palignr m3, m2, 1 |
| 31620 | punpcklbw m2, m3 |
| 31621 | movu m1, [r3 + 38] |
| 31622 | movd m3, [r3 + 39] |
| 31623 | palignr m3, m1, 1 |
| 31624 | punpcklbw m1, m3 |
| 31625 | movu m4, [r3 + 46] |
| 31626 | movd m5, [r3 + 47] |
| 31627 | palignr m5, m4, 1 |
| 31628 | punpcklbw m4, m5 |
| 31629 | |
| 31630 | pshufb m5, m0, [tab_S2] |
| 31631 | movh [r0 + 1982 * 16], m5 |
| 31632 | pshufb m5, m2, [tab_S2] |
| 31633 | movh [r0 + 1982 * 16 + 8], m5 |
| 31634 | pshufb m5, m1, [tab_S2] |
| 31635 | movh [r0 + 1983 * 16], m5 |
| 31636 | pshufb m5, m4, [tab_S2] |
| 31637 | movh [r0 + 1983 * 16 + 8], m5 |
| 31638 | |
| 31639 | ; mode 33 [row 25] |
| 31640 | movu m6, [r5 + 4 * 16] |
| 31641 | pmaddubsw m3, m0, m6 |
| 31642 | pmulhrsw m3, m7 |
| 31643 | pmaddubsw m5, m2, m6 |
| 31644 | pmulhrsw m5, m7 |
| 31645 | packuswb m3, m5 |
| 31646 | movu [r0 + 2034 * 16], m3 |
| 31647 | pmaddubsw m3, m1, m6 |
| 31648 | pmulhrsw m3, m7 |
| 31649 | pmaddubsw m5, m4, m6 |
| 31650 | pmulhrsw m5, m7 |
| 31651 | packuswb m3, m5 |
| 31652 | movu [r0 + 2035 * 16], m3 |
| 31653 | |
| 31654 | ; mode 33 [row 26] |
| 31655 | movu m6, [r5 + 30 * 16] |
| 31656 | pmaddubsw m3, m0, m6 |
| 31657 | pmulhrsw m3, m7 |
| 31658 | pmaddubsw m5, m2, m6 |
| 31659 | pmulhrsw m5, m7 |
| 31660 | packuswb m3, m5 |
| 31661 | movu [r0 + 2036 * 16], m3 |
| 31662 | pmaddubsw m3, m1, m6 |
| 31663 | pmulhrsw m3, m7 |
| 31664 | pmaddubsw m5, m4, m6 |
| 31665 | pmulhrsw m5, m7 |
| 31666 | packuswb m3, m5 |
| 31667 | movu [r0 + 2037 * 16], m3 |
| 31668 | |
| 31669 | ; mode 33 [row 27] |
| 31670 | movu m6, [r5 + 24 * 16] |
| 31671 | movu m0, [r3 + 23] |
| 31672 | movd m1, [r3 + 24] |
| 31673 | palignr m1, m0, 1 |
| 31674 | punpcklbw m0, m1 |
| 31675 | pmaddubsw m3, m0, m6 |
| 31676 | pmulhrsw m3, m7 |
| 31677 | movu m2, [r3 + 31] |
| 31678 | movd m4, [r3 + 32] |
| 31679 | palignr m4, m2, 1 |
| 31680 | punpcklbw m2, m4 |
| 31681 | pmaddubsw m5, m2, m6 |
| 31682 | pmulhrsw m5, m7 |
| 31683 | packuswb m3, m5 |
| 31684 | movu [r0 + 2038 * 16], m3 |
| 31685 | |
| 31686 | movu m1, [r3 + 39] |
| 31687 | movd m3, [r3 + 40] |
| 31688 | palignr m3, m1, 1 |
| 31689 | punpcklbw m1, m3 |
| 31690 | pmaddubsw m3, m1, m6 |
| 31691 | pmulhrsw m3, m7 |
| 31692 | movu m4, [r3 + 47] |
| 31693 | movd m5, [r3 + 48] |
| 31694 | palignr m5, m4, 1 |
| 31695 | punpcklbw m4, m5 |
| 31696 | pmaddubsw m5, m4, m6 |
| 31697 | pmulhrsw m5, m7 |
| 31698 | packuswb m3, m5 |
| 31699 | movu [r0 + 2039 * 16], m3 |
| 31700 | |
| 31701 | ; mode 33 [row 28] |
| 31702 | movu m6, [r5 + 18 * 16] |
| 31703 | movu m0, [r3 + 24] |
| 31704 | movd m1, [r3 + 25] |
| 31705 | palignr m1, m0, 1 |
| 31706 | punpcklbw m0, m1 |
| 31707 | pmaddubsw m3, m0, m6 |
| 31708 | pmulhrsw m3, m7 |
| 31709 | movu m2, [r3 + 32] |
| 31710 | movd m4, [r3 + 33] |
| 31711 | palignr m4, m2, 1 |
| 31712 | punpcklbw m2, m4 |
| 31713 | pmaddubsw m5, m2, m6 |
| 31714 | pmulhrsw m5, m7 |
| 31715 | packuswb m3, m5 |
| 31716 | movu [r0 + 2040 * 16], m3 |
| 31717 | |
| 31718 | movu m1, [r3 + 40] |
| 31719 | movd m3, [r3 + 41] |
| 31720 | palignr m3, m1, 1 |
| 31721 | punpcklbw m1, m3 |
| 31722 | pmaddubsw m3, m1, m6 |
| 31723 | pmulhrsw m3, m7 |
| 31724 | movu m4, [r3 + 48] |
| 31725 | movd m5, [r3 + 49] |
| 31726 | palignr m5, m4, 1 |
| 31727 | punpcklbw m4, m5 |
| 31728 | pmaddubsw m5, m4, m6 |
| 31729 | pmulhrsw m5, m7 |
| 31730 | packuswb m3, m5 |
| 31731 | movu [r0 + 2041 * 16], m3 |
| 31732 | |
| 31733 | ; mode 33 [row 29] |
| 31734 | movu m6, [r5 + 12 * 16] |
| 31735 | movu m0, [r3 + 25] |
| 31736 | movd m1, [r3 + 26] |
| 31737 | palignr m1, m0, 1 |
| 31738 | punpcklbw m0, m1 |
| 31739 | pmaddubsw m3, m0, m6 |
| 31740 | pmulhrsw m3, m7 |
| 31741 | movu m2, [r3 + 33] |
| 31742 | movd m4, [r3 + 34] |
| 31743 | palignr m4, m2, 1 |
| 31744 | punpcklbw m2, m4 |
| 31745 | pmaddubsw m5, m2, m6 |
| 31746 | pmulhrsw m5, m7 |
| 31747 | packuswb m3, m5 |
| 31748 | movu [r0 + 2042 * 16], m3 |
| 31749 | |
| 31750 | movu m1, [r3 + 41] |
| 31751 | movd m3, [r3 + 42] |
| 31752 | palignr m3, m1, 1 |
| 31753 | punpcklbw m1, m3 |
| 31754 | pmaddubsw m3, m1, m6 |
| 31755 | pmulhrsw m3, m7 |
| 31756 | movu m4, [r3 + 49] |
| 31757 | movd m5, [r3 + 50] |
| 31758 | palignr m5, m4, 1 |
| 31759 | punpcklbw m4, m5 |
| 31760 | pmaddubsw m5, m4, m6 |
| 31761 | pmulhrsw m5, m7 |
| 31762 | packuswb m3, m5 |
| 31763 | movu [r0 + 2043 * 16], m3 |
| 31764 | |
| 31765 | ; mode 33 [row 30] |
| 31766 | movu m6, [r5 + 6 * 16] |
| 31767 | movu m0, [r3 + 26] |
| 31768 | movd m1, [r3 + 27] |
| 31769 | palignr m1, m0, 1 |
| 31770 | punpcklbw m0, m1 |
| 31771 | pmaddubsw m3, m0, m6 |
| 31772 | pmulhrsw m3, m7 |
| 31773 | movu m2, [r3 + 34] |
| 31774 | movd m4, [r3 + 35] |
| 31775 | palignr m4, m2, 1 |
| 31776 | punpcklbw m2, m4 |
| 31777 | pmaddubsw m5, m2, m6 |
| 31778 | pmulhrsw m5, m7 |
| 31779 | packuswb m3, m5 |
| 31780 | movu [r0 + 2044 * 16], m3 |
| 31781 | |
| 31782 | movu m1, [r3 + 42] |
| 31783 | movd m3, [r3 + 43] |
| 31784 | palignr m3, m1, 1 |
| 31785 | punpcklbw m1, m3 |
| 31786 | pmaddubsw m3, m1, m6 |
| 31787 | pmulhrsw m3, m7 |
| 31788 | movu m4, [r3 + 50] |
| 31789 | movd m5, [r3 + 51] |
| 31790 | palignr m5, m4, 1 |
| 31791 | punpcklbw m4, m5 |
| 31792 | pmaddubsw m5, m4, m6 |
| 31793 | pmulhrsw m5, m7 |
| 31794 | packuswb m3, m5 |
| 31795 | movu [r0 + 2045 * 16], m3 |
| 31796 | |
| 31797 | ; mode 33 [row 31] |
| 31798 | movu m5, [r3 + 27] |
| 31799 | movu [r0 + 2046 * 16], m5 |
| 31800 | movu m5, [r3 + 43] |
| 31801 | movu [r0 + 2047 * 16], m5 |
| 31802 | |
| 31803 | ;mode 34 [row 0] |
| 31804 | movu m0, [r3 + 2] |
| 31805 | movu [r0 + 2048 * 16], m0 |
| 31806 | movu m1, [r3 + 18] |
| 31807 | movu [r0 + 2049 * 16], m1 |
| 31808 | |
| 31809 | ;mode 34 [row 1] |
| 31810 | movu m2, [r3 + 34] |
| 31811 | palignr m3, m1, m0, 1 |
| 31812 | movu [r0 + 2050 * 16], m3 |
| 31813 | palignr m4, m2, m1, 1 |
| 31814 | movu [r0 + 2051 * 16], m4 |
| 31815 | |
| 31816 | ;mode 34 [row 2] |
| 31817 | palignr m3, m1, m0, 2 |
| 31818 | movu [r0 + 2052 * 16], m3 |
| 31819 | palignr m4, m2, m1, 2 |
| 31820 | movu [r0 + 2053 * 16], m4 |
| 31821 | |
| 31822 | ;mode 34 [row 3] |
| 31823 | palignr m3, m1, m0, 3 |
| 31824 | movu [r0 + 2054 * 16], m3 |
| 31825 | palignr m4, m2, m1, 3 |
| 31826 | movu [r0 + 2055 * 16], m4 |
| 31827 | |
| 31828 | ;mode 34 [row 4] |
| 31829 | palignr m3, m1, m0, 4 |
| 31830 | movu [r0 + 2056 * 16], m3 |
| 31831 | palignr m4, m2, m1, 4 |
| 31832 | movu [r0 + 2057 * 16], m4 |
| 31833 | |
| 31834 | ;mode 34 [row 5] |
| 31835 | palignr m3, m1, m0, 5 |
| 31836 | movu [r0 + 2058 * 16], m3 |
| 31837 | palignr m4, m2, m1, 5 |
| 31838 | movu [r0 + 2059 * 16], m4 |
| 31839 | |
| 31840 | ;mode 34 [row 6] |
| 31841 | palignr m3, m1, m0, 6 |
| 31842 | movu [r0 + 2060 * 16], m3 |
| 31843 | palignr m4, m2, m1, 6 |
| 31844 | movu [r0 + 2061 * 16], m4 |
| 31845 | |
| 31846 | ;mode 34 [row 7] |
| 31847 | palignr m3, m1, m0, 7 |
| 31848 | movu [r0 + 2062 * 16], m3 |
| 31849 | palignr m4, m2, m1, 7 |
| 31850 | movu [r0 + 2063 * 16], m4 |
| 31851 | |
| 31852 | ;mode 34 [row 8] |
| 31853 | palignr m3, m1, m0, 8 |
| 31854 | movu [r0 + 2064 * 16], m3 |
| 31855 | palignr m4, m2, m1, 8 |
| 31856 | movu [r0 + 2065 * 16], m4 |
| 31857 | |
| 31858 | ;mode 34 [row 9] |
| 31859 | palignr m3, m1, m0, 9 |
| 31860 | movu [r0 + 2066 * 16], m3 |
| 31861 | palignr m4, m2, m1, 9 |
| 31862 | movu [r0 + 2067 * 16], m4 |
| 31863 | |
| 31864 | ;mode 34 [row 10] |
| 31865 | palignr m3, m1, m0, 10 |
| 31866 | movu [r0 + 2068 * 16], m3 |
| 31867 | palignr m4, m2, m1, 10 |
| 31868 | movu [r0 + 2069 * 16], m4 |
| 31869 | |
| 31870 | ;mode 34 [row 11] |
| 31871 | palignr m3, m1, m0, 11 |
| 31872 | movu [r0 + 2070 * 16], m3 |
| 31873 | palignr m4, m2, m1, 11 |
| 31874 | movu [r0 + 2071 * 16], m4 |
| 31875 | |
| 31876 | ;mode 34 [row 12] |
| 31877 | palignr m3, m1, m0, 12 |
| 31878 | movu [r0 + 2072 * 16], m3 |
| 31879 | palignr m4, m2, m1, 12 |
| 31880 | movu [r0 + 2073 * 16], m4 |
| 31881 | |
| 31882 | ;mode 34 [row 13] |
| 31883 | palignr m3, m1, m0, 13 |
| 31884 | movu [r0 + 2074 * 16], m3 |
| 31885 | palignr m4, m2, m1, 13 |
| 31886 | movu [r0 + 2075 * 16], m4 |
| 31887 | |
| 31888 | ;mode 34 [row 14] |
| 31889 | palignr m3, m1, m0, 14 |
| 31890 | movu [r0 + 2076 * 16], m3 |
| 31891 | palignr m4, m2, m1, 14 |
| 31892 | movu [r0 + 2077 * 16], m4 |
| 31893 | |
| 31894 | ;mode 34 [row 15] |
| 31895 | palignr m3, m1, m0, 15 |
| 31896 | movu [r0 + 2078 * 16], m3 |
| 31897 | palignr m4, m2, m1, 15 |
| 31898 | movu [r0 + 2079 * 16], m4 |
| 31899 | |
| 31900 | ;mode 34 [row 16] |
| 31901 | palignr m3, m1, m0, 16 |
| 31902 | movu [r0 + 2080 * 16], m3 |
| 31903 | palignr m4, m2, m1, 16 |
| 31904 | movu [r0 + 2081 * 16], m4 |
| 31905 | |
| 31906 | ;mode 34 [row 17] |
| 31907 | movu m0, [r3 + 19] |
| 31908 | movu [r0 + 2082 * 16], m0 |
| 31909 | movu m1, [r3 + 35] |
| 31910 | movu [r0 + 2083 * 16], m1 |
| 31911 | |
| 31912 | ;mode 34 [row 18] |
| 31913 | movu m2, [r3 + 51] |
| 31914 | palignr m3, m1, m0, 1 |
| 31915 | movu [r0 + 2084 * 16], m3 |
| 31916 | palignr m4, m2, m1, 1 |
| 31917 | movu [r0 + 2085 * 16], m4 |
| 31918 | |
| 31919 | ;mode 34 [row 19] |
| 31920 | palignr m3, m1, m0, 2 |
| 31921 | movu [r0 + 2086 * 16], m3 |
| 31922 | palignr m4, m2, m1, 2 |
| 31923 | movu [r0 + 2087 * 16], m4 |
| 31924 | |
| 31925 | ;mode 34 [row 20] |
| 31926 | palignr m3, m1, m0, 3 |
| 31927 | movu [r0 + 2088 * 16], m3 |
| 31928 | palignr m4, m2, m1, 3 |
| 31929 | movu [r0 + 2089 * 16], m4 |
| 31930 | |
| 31931 | ;mode 34 [row 21] |
| 31932 | palignr m3, m1, m0, 4 |
| 31933 | movu [r0 + 2090 * 16], m3 |
| 31934 | palignr m4, m2, m1, 4 |
| 31935 | movu [r0 + 2091 * 16], m4 |
| 31936 | |
| 31937 | ;mode 34 [row 22] |
| 31938 | palignr m3, m1, m0, 5 |
| 31939 | movu [r0 + 2092 * 16], m3 |
| 31940 | palignr m4, m2, m1, 5 |
| 31941 | movu [r0 + 2093 * 16], m4 |
| 31942 | |
| 31943 | ;mode 34 [row 23] |
| 31944 | palignr m3, m1, m0, 6 |
| 31945 | movu [r0 + 2094 * 16], m3 |
| 31946 | palignr m4, m2, m1, 6 |
| 31947 | movu [r0 + 2095 * 16], m4 |
| 31948 | |
| 31949 | ;mode 34 [row 24] |
| 31950 | palignr m3, m1, m0, 7 |
| 31951 | movu [r0 + 2096 * 16], m3 |
| 31952 | palignr m4, m2, m1, 7 |
| 31953 | movu [r0 + 2097 * 16], m4 |
| 31954 | |
| 31955 | ;mode 34 [row 25] |
| 31956 | palignr m3, m1, m0, 8 |
| 31957 | movu [r0 + 2098 * 16], m3 |
| 31958 | palignr m4, m2, m1, 8 |
| 31959 | movu [r0 + 2099 * 16], m4 |
| 31960 | |
| 31961 | ;mode 34 [row 26] |
| 31962 | palignr m3, m1, m0, 9 |
| 31963 | movu [r0 + 2100 * 16], m3 |
| 31964 | palignr m4, m2, m1, 9 |
| 31965 | movu [r0 + 2101 * 16], m4 |
| 31966 | |
| 31967 | ;mode 34 [row 27] |
| 31968 | palignr m3, m1, m0, 10 |
| 31969 | movu [r0 + 2102 * 16], m3 |
| 31970 | palignr m4, m2, m1, 10 |
| 31971 | movu [r0 + 2103 * 16], m4 |
| 31972 | |
| 31973 | ;mode 34 [row 28] |
| 31974 | palignr m3, m1, m0, 11 |
| 31975 | movu [r0 + 2104 * 16], m3 |
| 31976 | palignr m4, m2, m1, 11 |
| 31977 | movu [r0 + 2105 * 16], m4 |
| 31978 | |
| 31979 | ;mode 34 [row 29] |
| 31980 | palignr m3, m1, m0, 12 |
| 31981 | movu [r0 + 2106 * 16], m3 |
| 31982 | palignr m4, m2, m1, 12 |
| 31983 | movu [r0 + 2107 * 16], m4 |
| 31984 | |
| 31985 | ;mode 34 [row 30] |
| 31986 | palignr m3, m1, m0, 13 |
| 31987 | movu [r0 + 2108 * 16], m3 |
| 31988 | palignr m4, m2, m1, 13 |
| 31989 | movu [r0 + 2109 * 16], m4 |
| 31990 | |
| 31991 | ;mode 34 [row 31] |
| 31992 | palignr m3, m1, m0, 14 |
| 31993 | movu [r0 + 2110 * 16], m3 |
| 31994 | palignr m4, m2, m1, 14 |
| 31995 | movu [r0 + 2111 * 16], m4 |
| 31996 | |
| 31997 | RET |